Fix to multinode code

2026-06-28 22:43:30 +01:00 · 2017-04-26 14:46:52 -04:00
720 changed files with 17745 additions and 94166 deletions
@@ -83,7 +83,6 @@ ltmain.sh
 .Trashes
 ehthumbs.db
 Thumbs.db
-.dirstamp

 # build directory #
 ###################
@@ -93,24 +92,28 @@ build*/*
 #####################
 *.xcodeproj/*
 build.sh
-.vscode
-*.code-workspace

 # Eigen source #
 ################
-Grid/Eigen
-Eigen/*
+lib/Eigen/*
+
+# FFTW source #
+################
+lib/fftw/*

 # libtool macros #
 ##################
 m4/lt*
 m4/libtool.m4

-# github pages #
-################
-gh-pages/
+# Buck files #
+##############
+.buck*
+buck-out
+BUCK
+make-bin-BUCK.sh

 # generated sources #
 #####################
-Grid/qcd/spin/gamma-gen/*.h
-Grid/qcd/spin/gamma-gen/*.cc
+lib/qcd/spin/gamma-gen/*.h
+lib/qcd/spin/gamma-gen/*.cc
@@ -9,11 +9,62 @@ matrix:
    - os:        osx
      osx_image: xcode8.3
      compiler: clang
-      env: PREC=single
-    - os:        osx
-      osx_image: xcode8.3
-      compiler: clang
-      env: PREC=double
+    - compiler: gcc
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - g++-4.9
+            - libmpfr-dev
+            - libgmp-dev
+            - libmpc-dev
+            - libopenmpi-dev
+            - openmpi-bin
+            - binutils-dev
+      env: VERSION=-4.9
+    - compiler: gcc
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - g++-5
+            - libmpfr-dev
+            - libgmp-dev
+            - libmpc-dev
+            - libopenmpi-dev
+            - openmpi-bin
+            - binutils-dev
+      env: VERSION=-5
+    - compiler: clang
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - g++-4.8
+            - libmpfr-dev
+            - libgmp-dev
+            - libmpc-dev
+            - libopenmpi-dev
+            - openmpi-bin
+            - binutils-dev
+      env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
+    - compiler: clang
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - g++-4.8
+            - libmpfr-dev
+            - libgmp-dev
+            - libmpc-dev
+            - libopenmpi-dev
+            - openmpi-bin
+            - binutils-dev
+      env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
      
 before_install:
    - export GRIDDIR=`pwd`
@@ -21,41 +72,32 @@ before_install:
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
-    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc openssl; fi
+    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
    
 install:
-    - export CWD=`pwd`
-    - echo $CWD
    - export CC=$CC$VERSION
    - export CXX=$CXX$VERSION
    - echo $PATH
-    - which autoconf
-    - autoconf  --version
-    - which automake
-    - automake  --version
    - which $CC
    - $CC  --version
    - which $CXX
    - $CXX --version
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
-    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export EXTRACONF='--with-openssl=/usr/local/opt/openssl'; fi
    
 script:
    - ./bootstrap.sh
    - mkdir build
    - cd build
-    - mkdir lime
-    - cd lime
-    - mkdir build
-    - cd build
-    - wget http://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz
-    - tar xf lime-1.3.2.tar.gz
-    - cd lime-1.3.2
-    - ./configure --prefix=$CWD/build/lime/install
-    - make -j4
-    - make install
-    - cd $CWD/build
-    - ../configure --enable-precision=$PREC --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
+    - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none
    - make -j4 
    - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
-    - make check
+    - echo make clean
+    - ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none
+    - make -j4
+    - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
+    - echo make clean
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; fi
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make -j4; fi
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
+
+
@@ -1,37 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/DisableWarnings.h
-
-Copyright (C) 2016
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#ifndef DISABLE_WARNINGS_H
-#define DISABLE_WARNINGS_H
-
- //disables an Intel-compiler-specific warning (in json.hpp)
-#pragma warning disable 488  
-
-
-#endif
@@ -1,29 +0,0 @@
-#ifndef GRID_STD_H
-#define GRID_STD_H
-
-///////////////////
-// Std C++ dependencies
-///////////////////
-#include <cassert>
-#include <complex>
-#include <vector>
-#include <string>
-#include <iostream>
-#include <iomanip>
-#include <random>
-#include <functional>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <signal.h>
-#include <ctime>
-#include <sys/time.h>
-#include <chrono>
-#include <zlib.h>
-
-///////////////////
-// Grid config
-///////////////////
-#include "Config.h"
-
-#endif /* GRID_STD_H */
@@ -1,14 +0,0 @@
-#pragma once
-// Force Eigen to use MKL if Grid has been configured with --enable-mkl
-#ifdef USE_MKL
-#define EIGEN_USE_MKL_ALL
-#endif
-
-#if defined __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
-#include <Grid/Eigen/Dense>
-#if defined __GNUC__
-#pragma GCC diagnostic pop
-#endif
@@ -1,63 +0,0 @@
-extra_sources=
-extra_headers=
-
-if BUILD_COMMS_MPI3
-  extra_sources+=communicator/Communicator_mpi3.cc
-  extra_sources+=communicator/Communicator_base.cc
-  extra_sources+=communicator/SharedMemoryMPI.cc
-  extra_sources+=communicator/SharedMemory.cc
-endif
-
-if BUILD_COMMS_NONE
-  extra_sources+=communicator/Communicator_none.cc
-  extra_sources+=communicator/Communicator_base.cc
-  extra_sources+=communicator/SharedMemoryNone.cc
-  extra_sources+=communicator/SharedMemory.cc
-endif
-
-if BUILD_HDF5
-  extra_sources+=serialisation/Hdf5IO.cc 
-  extra_headers+=serialisation/Hdf5IO.h
-  extra_headers+=serialisation/Hdf5Type.h
-endif
-
-all: version-cache
-
-version-cache: # always run (phony): rewrite the cached "define GITHASH" line and delete Version.h whenever that line changed or no cache existed
-	@if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\
-		a="uncommited changes";\
-	else\
-		a="clean";\
-	fi;\
-	echo "`git log -n 1 --format=format:"#define GITHASH \\"%H:%d $$a\\"%n" HEAD`" > vertmp;\
-	if [ -e version-cache ]; then\
-		d=`diff vertmp version-cache`;\
-		if [ "$${d}" != "" ]; then\
-			mv vertmp version-cache;\
-			rm -f Version.h;\
-		fi;\
-	else\
-		mv vertmp version-cache;\
-		rm -f Version.h;\
-	fi;\
-	rm -f vertmp
-
-Version.h: # no prerequisites, so this only runs when Version.h is missing (e.g. after the rule above removed it)
-	cp version-cache Version.h
-
-.PHONY: version-cache
-
-#
-# Libraries
-#
-include Make.inc
-include Eigen.inc
-
-lib_LIBRARIES = libGrid.a
-
-CCFILES += $(extra_sources)
-HFILES  += $(extra_headers) Config.h Version.h
-
-libGrid_a_SOURCES              = $(CCFILES)
-libGrid_adir                   = $(includedir)/Grid
-nobase_dist_pkginclude_HEADERS = $(HFILES) $(eigen_files) $(eigen_unsupp_files)
@@ -1,152 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/algorithms/approx/Forecast.h
-
-Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#ifndef INCLUDED_FORECAST_H
-#define INCLUDED_FORECAST_H
-
-namespace Grid {
-
-  // Abstract base class for solution forecasting.
-  // operator() takes a matrix (Mat), a source (phi), and a vector of Fields (chi)
-  // and returns a forecasted solution (psi) to the system D*psi = phi.
-  template<class Matrix, class Field>
-  class Forecast
-  {
-    public:
-      virtual Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& chi) = 0;
-  };
-
-  // Implementation of Brower et al.'s chronological inverter (arXiv:hep-lat/9509012),
-  // used to forecast solutions across poles of the EOFA heatbath.
-  // The guess is built in the span of the previous solutions, using Mat.M followed by Mat.Mdag.
-  // Modified from CPS (cps_pp/src/util/dirac_op/d_op_base/comsrc/minresext.C)
-  template<class Matrix, class Field>
-  class ChronoForecast : public Forecast<Matrix,Field>
-  {
-    public:
-      Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& prev_solns)
-      {
-        int degree = prev_solns.size();
-        Field chi(phi); // forecasted solution (copy-constructed from phi, reset to zero before use)
-
-        // Trivial cases: no history -> zero guess; a single previous solution -> return it unchanged
-        if(degree == 0){ chi = zero; return chi; }
-        else if(degree == 1){ return prev_solns[0]; }
-
-        RealD dot; // NOTE(review): never referenced in this function
-        ComplexD xp; // scratch for row swaps and elimination factors
-        Field r(phi); // residual
-        Field Mv(phi); // scratch: receives Mat.M(v[i])
-        std::vector<Field> v(prev_solns); // orthonormalized previous solutions
-        std::vector<Field> MdagMv(degree,phi); // MdagMv[i] = Mdag(M(v[i])), filled below
-
-        // Array to hold the matrix elements G[i][j] = <v[i], MdagMv[j]>
-        std::vector<std::vector<ComplexD>> G(degree, std::vector<ComplexD>(degree));
-
-        // Solution (a) and source (b) vectors of the small degree x degree system
-        std::vector<ComplexD> a(degree);
-        std::vector<ComplexD> b(degree);
-
-        // Orthonormalize the vector basis: normalise v[i], then subtract its component from every later v[j]
-        for(int i=0; i<degree; i++){
-          v[i] *= 1.0/std::sqrt(norm2(v[i]));
-          for(int j=i+1; j<degree; j++){ v[j] -= innerProduct(v[i],v[j]) * v[i]; }
-        }
-
-        // Apply M then Mdag to each basis vector; build the rhs b[i] = <v[i],phi> and the diagonal of G
-        for(int i=0; i<degree; i++){
-          b[i] = innerProduct(v[i],phi);
-          Mat.M(v[i],Mv);
-          Mat.Mdag(Mv,MdagMv[i]);
-          G[i][i] = innerProduct(v[i],MdagMv[i]);
-        }
-
-        // Fill the off-diagonal elements: upper triangle from inner products, lower triangle by conjugation
-        for(int j=0; j<degree; j++){
-        for(int k=j+1; k<degree; k++){
-          G[j][k] = innerProduct(v[j],MdagMv[k]);
-          G[k][j] = std::conj(G[j][k]);
-        }}
-
-        // Forward elimination of G (and b) with row pivoting
-        for(int i=0; i<degree; i++){
-
-          // Pivot selection. NOTE(review): rows are compared by their diagonal entries |G[j][j]|, not by column i, and only rows are swapped - confirm intended
-          int k = i;
-          for(int j=i+1; j<degree; j++){ if(std::abs(G[j][j]) > std::abs(G[k][k])){ k = j; } }
-          if(k != i){
-            xp = b[k];
-            b[k] = b[i];
-            b[i] = xp;
-            for(int j=0; j<degree; j++){
-              xp = G[k][j];
-              G[k][j] = G[i][j];
-              G[i][j] = xp;
-            }
-          }
-
-          // Convert matrix to upper triangular form
-          for(int j=i+1; j<degree; j++){
-            xp = G[j][i]/G[i][i];
-            b[j] -= xp * b[i];
-            for(int k=0; k<degree; k++){ G[j][k] -= xp*G[i][k]; } // this k shadows the pivot index k declared above
-          }
-        }
-
-        // Back-substitution for the coefficients a[i]; accumulate chi = sum a[i]*v[i] and r = phi - sum a[i]*MdagMv[i]
-        chi = zero;
-        r = phi;
-        for(int i=degree-1; i>=0; i--){
-          a[i] = 0.0;
-          for(int j=i+1; j<degree; j++){ a[i] += G[i][j] * a[j]; }
-          a[i] = (b[i]-a[i])/G[i][i];
-          chi += a[i]*v[i];
-          r -= a[i]*MdagMv[i];
-        }
-
-        RealD true_r(0.0); // NOTE(review): accumulated in the loop below but never read afterwards
-        ComplexD tmp;
-        for(int i=0; i<degree; i++){
-          tmp = -b[i];
-          for(int j=0; j<degree; j++){ tmp += G[i][j]*a[j]; }
-          tmp = std::conj(tmp)*tmp;
-          true_r += std::sqrt(tmp.real());
-        }
-
-        RealD error = std::sqrt(norm2(r)/norm2(phi)); // reported only; the forecast is returned regardless of its size
-        std::cout << GridLogMessage << "ChronoForecast: |res|/|src| = " << error << std::endl;
-
-        return chi;
-      };
-  };
-
-}
-
-#endif
@@ -1,606 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/algorithms/iterative/BlockConjugateGradient.h
-
-Copyright (C) 2017
-
-Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H
-#define GRID_BLOCK_CONJUGATE_GRADIENT_H
-
-
-namespace Grid {
-
-enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS }; // selects the solver that BlockConjugateGradient::operator() dispatches to
-
-//////////////////////////////////////////////////////////////////////////
-// Block conjugate gradient solvers. The block direction is the dimension index passed to the constructor (blockDim)
-//////////////////////////////////////////////////////////////////////////
-template <class Field>
-class BlockConjugateGradient : public OperatorFunction<Field> {
- public:
-
-
-  typedef typename Field::scalar_type scomplex;
-
-  int blockDim ; // dimension index of the block direction (constructor argument _Orthog)
-  int Nblock; // set at the start of every solve from _grid->_fdimensions[blockDim]; not set by the constructor
-
-  BlockCGtype CGtype; // which of the three solvers operator() calls
-  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
-                           // Defaults true.
-  RealD Tolerance; // solvers stop when the largest per-block ratio rr/ssq drops below Tolerance*Tolerance
-  Integer MaxIterations; // upper bound on the iteration counter k in each solver loop
-  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
-  
-  BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
-    : Tolerance(tol), CGtype(cgtype),   blockDim(_Orthog),  MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv)
-  {}; // NOTE(review): initialiser list order differs from the member declaration order; Nblock and IterationsToComplete are left unset here
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Thin QR factorisation R = Q C: outputs m_rr = R^dag R, C (adjoint of its Cholesky factor), Cinv and Q = R Cinv
-////////////////////////////////////////////////////////////////////////////////////////////////////
-void ThinQRfact (Eigen::MatrixXcd &m_rr,
-		 Eigen::MatrixXcd &C,
-		 Eigen::MatrixXcd &Cinv,
-		 Field & Q,
-		 const Field & R)
-{
-  int Orthog = blockDim; // block direction, as supplied to the constructor
-  ////////////////////////////////////////////////////////////////////////////////////////////////////
-  //Dimensions
-  // R_{ferm x Nblock} =  Q_{ferm x Nblock} x  C_{Nblock x Nblock} -> ferm x Nblock
-  //
-  // Rdag R = m_rr = Herm = L L^dag        <-- Cholesky decomposition (LLT routine in Eigen)
-  //
-  //   Q  C = R => Q = R C^{-1}
-  //
-  // Want  Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock} 
-  //
-  // Set C = L^{dag}, and then Q^dag Q = ident 
-  //
-  // Checks:
-  // Cdag C = Rdag R ; passes.
-  // QdagQ  = 1      ; passes
-  ////////////////////////////////////////////////////////////////////////////////////////////////////
-  sliceInnerProductMatrix(m_rr,R,R,Orthog);
-
-  // Force m_rr to be manifestly Hermitian, removing rounding-related asymmetry before the Cholesky step
-  m_rr = 0.5*(m_rr+m_rr.adjoint());
-
-#if 0
-  std::cout << " Calling Cholesky  ldlt on m_rr "  << m_rr <<std::endl;
-  Eigen::MatrixXcd L_ldlt = m_rr.ldlt().matrixL(); 
-  std::cout << " Called Cholesky  ldlt on m_rr "  << L_ldlt <<std::endl;
-  auto  D_ldlt = m_rr.ldlt().vectorD(); 
-  std::cout << " Called Cholesky  ldlt on m_rr "  << D_ldlt <<std::endl;
-#endif
-
-  //  std::cout << " Calling Cholesky  llt on m_rr "  <<std::endl;
-  Eigen::MatrixXcd L    = m_rr.llt().matrixL(); 
-  //  std::cout << " Called Cholesky  llt on m_rr "  << L <<std::endl;
-  C    = L.adjoint();
-  Cinv = C.inverse();
-  ////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Q = R C^{-1}
-  //
-  // Q_j  = R_i Cinv(i,j) 
-  //
-  // NB maddMatrix conventions are Right multiplication X[j] a[j,i] already
-  ////////////////////////////////////////////////////////////////////////////////////////////////////
-  sliceMulMatrix(Q,Cinv,R,Orthog);
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Dispatch to the solver selected by CGtype; any other value trips assert(0)
-////////////////////////////////////////////////////////////////////////////////////////////////////
-void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
-{
-  if ( CGtype == BlockCGrQ ) {
-    BlockCGrQsolve(Linop,Src,Psi);
-  } else if (CGtype == BlockCG ) {
-    BlockCGsolve(Linop,Src,Psi);
-  } else if (CGtype == CGmultiRHS ) {
-    CGmultiRHSsolve(Linop,Src,Psi);
-  } else {
-    assert(0);
-  }
-}
-
-////////////////////////////////////////////////////////////////////////////
-// BlockCGrQ implementation:
-//--------------------------
-// X is guess/Solution
-// B is RHS
-// Solve A X_i = B_i    ;        i refers to Nblock index
-////////////////////////////////////////////////////////////////////////////
-void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X) 
-{
-  int Orthog = blockDim; // block direction, as supplied to the constructor
-  Nblock = B._grid->_fdimensions[Orthog];
-
-  std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
-
-  X.checkerboard = B.checkerboard;
-  conformable(X, B);
-
-  Field tmp(B);
-  Field Q(B);
-  Field D(B);
-  Field Z(B);
-  Field AD(B);
-
-  Eigen::MatrixXcd m_DZ     = Eigen::MatrixXcd::Identity(Nblock,Nblock);
-  Eigen::MatrixXcd m_M      = Eigen::MatrixXcd::Identity(Nblock,Nblock);
-  Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-
-  Eigen::MatrixXcd m_C      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-  Eigen::MatrixXcd m_Cinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-  Eigen::MatrixXcd m_S      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-  Eigen::MatrixXcd m_Sinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-
-  Eigen::MatrixXcd m_tmp    = Eigen::MatrixXcd::Identity(Nblock,Nblock);
-  Eigen::MatrixXcd m_tmp1   = Eigen::MatrixXcd::Identity(Nblock,Nblock);
-
-  // Per-block source norms (ssq) and NaN checks on the source and the initial guess
-  std::vector<RealD> residuals(Nblock);
-  std::vector<RealD> ssq(Nblock);
-
-  sliceNorm(ssq,B,Orthog);
-  RealD sssum=0;
-  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
-
-  sliceNorm(residuals,B,Orthog);
-  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
-
-  sliceNorm(residuals,X,Orthog);
-  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
-
-  /************************************************************************
-   * Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
-   ************************************************************************
-   * Dimensions:
-   *
-   *   X,B==(Nferm x Nblock)
-   *   A==(Nferm x Nferm)
-   *  
-   * Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
-   * 
-   * QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (see ThinQRfact)
-   * for k: 
-   *   Z  = AD
-   *   M  = [D^dag Z]^{-1}
-   *   X  = X + D MC
-   *   QS = Q - ZM
-   *   D  = Q + D S^dag
-   *   C  = S C
-   */
-  ///////////////////////////////////////
-  // Initial block: residual of the initial guess, QR-factorised to give the first search direction
-  ///////////////////////////////////////
-  std::cout << GridLogMessage<<"BlockCGrQ algorithm initialisation " <<std::endl;
-
-  //1.  QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (see ThinQRfact)
-
-  Linop.HermOp(X, AD);
-  tmp = B - AD;  
-  //std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl;
-  ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
-  //std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl;
-  //std::cout << GridLogMessage << " m_rr " << m_rr<<std::endl;
-  //std::cout << GridLogMessage << " m_C " << m_C<<std::endl;
-  //std::cout << GridLogMessage << " m_Cinv " << m_Cinv<<std::endl;
-  D=Q;
-
-  std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
-
-  ///////////////////////////////////////
-  // Timers (reported in the "Time Breakdown" printed on convergence)
-  ///////////////////////////////////////
-  GridStopWatch sliceInnerTimer;
-  GridStopWatch sliceMaddTimer;
-  GridStopWatch QRTimer;
-  GridStopWatch MatrixTimer;
-  GridStopWatch SolverTimer;
-  SolverTimer.Start();
-
-  int k;
-  for (k = 1; k <= MaxIterations; k++) {
-
-    //3. Z  = AD
-    MatrixTimer.Start();
-    Linop.HermOp(D, Z);      
-    MatrixTimer.Stop();
-    //std::cout << GridLogMessage << " norm2 Z " <<norm2(Z)<<std::endl;
-
-    //4. M  = [D^dag Z]^{-1}
-    sliceInnerTimer.Start();
-    sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
-    sliceInnerTimer.Stop();
-    m_M       = m_DZ.inverse();
-    //std::cout << GridLogMessage << " m_DZ " <<m_DZ<<std::endl;
-    
-    //5. X  = X + D MC
-    m_tmp     = m_M * m_C;
-    sliceMaddTimer.Start();
-    sliceMaddMatrix(X,m_tmp, D,X,Orthog);     
-    sliceMaddTimer.Stop();
-
-    //6. QS = Q - ZM
-    sliceMaddTimer.Start();
-    sliceMaddMatrix(tmp,m_M,Z,Q,Orthog,-1.0);
-    sliceMaddTimer.Stop();
-    QRTimer.Start();
-    ThinQRfact (m_rr, m_S, m_Sinv, Q, tmp);
-    QRTimer.Stop();
-    
-    //7. D  = Q + D S^dag
-    m_tmp = m_S.adjoint();
-    sliceMaddTimer.Start();
-    sliceMaddMatrix(D,m_tmp,D,Q,Orthog);
-    sliceMaddTimer.Stop();
-
-    //8. C  = S C
-    m_C = m_S*m_C;
-    
-    /*********************
-     * convergence monitor: m_rr is rebuilt as C^dag C; its diagonal is compared per block with ssq
-     *********************
-     */
-    m_rr = m_C.adjoint() * m_C;
-
-    RealD max_resid=0;
-    RealD rrsum=0;
-    RealD rr;
-
-    for(int b=0;b<Nblock;b++) {
-      rrsum+=real(m_rr(b,b));
-      rr = real(m_rr(b,b))/ssq[b];
-      if ( rr > max_resid ) max_resid = rr;
-    }
-
-    std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
-	      <<" ave "<<std::sqrt(rrsum/sssum) << " max "<< max_resid <<std::endl;
-
-    if ( max_resid < Tolerance*Tolerance ) { 
-
-      SolverTimer.Stop();
-
-      std::cout << GridLogMessage<<"BlockCGrQ converged in "<<k<<" iterations"<<std::endl;
-
-      for(int b=0;b<Nblock;b++){
-	std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
-		  << std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
-      }
-      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
-
-      Linop.HermOp(X, AD);
-      AD = AD-B;
-      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AD)/norm2(B)) <<std::endl;
-
-      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
-      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
-      std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
-      std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
-      std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
-      std::cout << GridLogMessage << "\tThinQRfact " << QRTimer.Elapsed()  <<std::endl;
-	    
-      IterationsToComplete = k;
-      return;
-    }
-
-  }
-  std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
-
-  if (ErrorOnNoConverge) assert(0);
-  IterationsToComplete = k; // the loop ran to completion, so k == MaxIterations+1 here
-}
-//////////////////////////////////////////////////////////////////////////
-// Block conjugate gradient, original O'Leary formulation; the block direction is blockDim
-//////////////////////////////////////////////////////////////////////////
-void BlockCGsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
-{
-  int Orthog = blockDim; // block direction, as supplied to the constructor
-  Nblock = Src._grid->_fdimensions[Orthog];
-
-  std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
-
-  Psi.checkerboard = Src.checkerboard;
-  conformable(Psi, Src);
-
-  Field P(Src);
-  Field AP(Src);
-  Field R(Src);
-  
-  Eigen::MatrixXcd m_pAp    = Eigen::MatrixXcd::Identity(Nblock,Nblock);
-  Eigen::MatrixXcd m_pAp_inv= Eigen::MatrixXcd::Identity(Nblock,Nblock);
-  Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-  Eigen::MatrixXcd m_rr_inv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-
-  Eigen::MatrixXcd m_alpha      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-  Eigen::MatrixXcd m_beta   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-
-  // Per-block source norms (ssq) and NaN checks on the source and the initial guess
-  std::vector<RealD> residuals(Nblock);
-  std::vector<RealD> ssq(Nblock);
-
-  sliceNorm(ssq,Src,Orthog);
-  RealD sssum=0;
-  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
-
-  sliceNorm(residuals,Src,Orthog);
-  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
-
-  sliceNorm(residuals,Psi,Orthog);
-  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
-
-  // Apply the operator to the initial guess; the initial residual R = Src - AP is formed below
-  Linop.HermOp(Psi, AP);
-  
-
-  /************************************************************************
-   * Block conjugate gradient (Stephen Pickles, thesis 1995, pp 71, O Leary 1980)
-   ************************************************************************
-   * O'Leary : R = B - A X
-   * O'Leary : P = M R ; preconditioner M = 1
-   * O'Leary : alpha = PAP^{-1} RMR
-   * O'Leary : beta  = RMR^{-1}_old RMR_new
-   * O'Leary : X=X+Palpha
-   * O'Leary : R_new=R_old-AP alpha
-   * O'Leary : P=MR_new+P beta
-   */
-
-  R = Src - AP;  
-  P = R;
-  sliceInnerProductMatrix(m_rr,R,R,Orthog);
-
-  GridStopWatch sliceInnerTimer;
-  GridStopWatch sliceMaddTimer;
-  GridStopWatch MatrixTimer;
-  GridStopWatch SolverTimer;
-  SolverTimer.Start();
-
-  int k;
-  for (k = 1; k <= MaxIterations; k++) {
-
-    RealD rrsum=0;
-    for(int b=0;b<Nblock;b++) rrsum+=real(m_rr(b,b));
-
-    std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
-	      <<" / "<<std::sqrt(rrsum/sssum) <<std::endl;
-
-    MatrixTimer.Start();
-    Linop.HermOp(P, AP);
-    MatrixTimer.Stop();
-
-    // Alpha = (P^dag A P)^{-1} (R^dag R)
-    sliceInnerTimer.Start();
-    sliceInnerProductMatrix(m_pAp,P,AP,Orthog);
-    sliceInnerTimer.Stop();
-    m_pAp_inv = m_pAp.inverse();
-    m_alpha   = m_pAp_inv * m_rr ;
-
-    // Psi, R update
-    sliceMaddTimer.Start();
-    sliceMaddMatrix(Psi,m_alpha, P,Psi,Orthog);     // add alpha *  P to psi
-    sliceMaddMatrix(R  ,m_alpha,AP,  R,Orthog,-1.0);// sub alpha * AP to resid
-    sliceMaddTimer.Stop();
-
-    // Beta = (old R^dag R)^{-1} (new R^dag R); the inverse is taken before m_rr is overwritten
-    m_rr_inv = m_rr.inverse();
-    sliceInnerTimer.Start();
-    sliceInnerProductMatrix(m_rr,R,R,Orthog);
-    sliceInnerTimer.Stop();
-    m_beta = m_rr_inv *m_rr;
-
-    // Search update: AP is reused as the output buffer for the new search direction, then copied into P
-    sliceMaddTimer.Start();
-    sliceMaddMatrix(AP,m_beta,P,R,Orthog);
-    sliceMaddTimer.Stop();
-    P= AP;
-
-    /*********************
-     * convergence monitor: largest per-block ratio of the m_rr diagonal to ssq
-     *********************
-     */
-    RealD max_resid=0;
-    RealD rr;
-    for(int b=0;b<Nblock;b++){
-      rr = real(m_rr(b,b))/ssq[b];
-      if ( rr > max_resid ) max_resid = rr;
-    }
-    
-    if ( max_resid < Tolerance*Tolerance ) { 
-
-      SolverTimer.Stop();
-
-      std::cout << GridLogMessage<<"BlockCG converged in "<<k<<" iterations"<<std::endl;
-      for(int b=0;b<Nblock;b++){
-	std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
-		  << std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
-      }
-      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
-
-      Linop.HermOp(Psi, AP);
-      AP = AP-Src;
-      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
-
-      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
-      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
-      std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
-      std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
-      std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
-	    
-      IterationsToComplete = k;
-      return;
-    }
-
-  }
-  std::cout << GridLogMessage << "BlockConjugateGradient did NOT converge" << std::endl;
-
-  if (ErrorOnNoConverge) assert(0);
-  IterationsToComplete = k; // the loop ran to completion, so k == MaxIterations+1 here
-}
-//////////////////////////////////////////////////////////////////////////
-// multiRHS conjugate gradient: each block gets its own scalar alpha and beta. The block direction is blockDim
-// Use this for right-hand sides spread out across nodes
-//////////////////////////////////////////////////////////////////////////
-void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
-{
-  int Orthog = blockDim; // block direction, as supplied to the constructor
-  Nblock = Src._grid->_fdimensions[Orthog];
-
-  std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
-
-  Psi.checkerboard = Src.checkerboard;
-  conformable(Psi, Src);
-
-  Field P(Src);
-  Field AP(Src);
-  Field R(Src);
-  
-  std::vector<ComplexD> v_pAp(Nblock);
-  std::vector<RealD> v_rr (Nblock);
-  std::vector<RealD> v_rr_inv(Nblock);
-  std::vector<RealD> v_alpha(Nblock);
-  std::vector<RealD> v_beta(Nblock);
-
-  // Per-block source norms (ssq) and NaN checks on the source and the initial guess
-  std::vector<RealD> residuals(Nblock);
-  std::vector<RealD> ssq(Nblock);
-
-  sliceNorm(ssq,Src,Orthog);
-  RealD sssum=0;
-  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
-
-  sliceNorm(residuals,Src,Orthog);
-  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
-
-  sliceNorm(residuals,Psi,Orthog);
-  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
-
-  // Apply the operator to the initial guess; the initial residual R = Src - AP is formed below
-  Linop.HermOp(Psi, AP);
-
-  R = Src - AP;  
-  P = R;
-  sliceNorm(v_rr,R,Orthog);
-
-  GridStopWatch sliceInnerTimer;
-  GridStopWatch sliceMaddTimer;
-  GridStopWatch sliceNormTimer;
-  GridStopWatch MatrixTimer;
-  GridStopWatch SolverTimer;
-
-  SolverTimer.Start();
-  int k;
-  for (k = 1; k <= MaxIterations; k++) {
-
-    RealD rrsum=0;
-    for(int b=0;b<Nblock;b++) rrsum+=real(v_rr[b]);
-
-    std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
-	      <<" / "<<std::sqrt(rrsum/sssum) <<std::endl;
-
-    MatrixTimer.Start();
-    Linop.HermOp(P, AP);
-    MatrixTimer.Stop();
-
-    // Alpha: per block, v_rr divided by the real part of the P/AP inner product
-    sliceInnerTimer.Start();
-    sliceInnerProductVector(v_pAp,P,AP,Orthog);
-    sliceInnerTimer.Stop();
-    for(int b=0;b<Nblock;b++){
-      v_alpha[b] = v_rr[b]/real(v_pAp[b]);
-    }
-
-    // Psi, R update
-    sliceMaddTimer.Start();
-    sliceMaddVector(Psi,v_alpha, P,Psi,Orthog);     // add alpha *  P to psi
-    sliceMaddVector(R  ,v_alpha,AP,  R,Orthog,-1.0);// sub alpha * AP to resid
-    sliceMaddTimer.Stop();
-
-    // Beta: per block, new v_rr divided by old v_rr (the reciprocal is stored before v_rr is overwritten)
-    for(int b=0;b<Nblock;b++){
-      v_rr_inv[b] = 1.0/v_rr[b];
-    }
-    sliceNormTimer.Start();
-    sliceNorm(v_rr,R,Orthog);
-    sliceNormTimer.Stop();
-    for(int b=0;b<Nblock;b++){
-      v_beta[b] = v_rr_inv[b] *v_rr[b];
-    }
-
-    // Search update: P is both an input and the output of this call
-    sliceMaddTimer.Start();
-    sliceMaddVector(P,v_beta,P,R,Orthog);
-    sliceMaddTimer.Stop();
-
-    /*********************
-     * convergence monitor: largest per-block ratio v_rr/ssq
-     *********************
-     */
-    RealD max_resid=0;
-    for(int b=0;b<Nblock;b++){
-      RealD rr = v_rr[b]/ssq[b];
-      if ( rr > max_resid ) max_resid = rr;
-    }
-    
-    if ( max_resid < Tolerance*Tolerance ) { 
-
-      SolverTimer.Stop();
-
-      std::cout << GridLogMessage<<"MultiRHS solver converged in " <<k<<" iterations"<<std::endl;
-      for(int b=0;b<Nblock;b++){
-	std::cout << GridLogMessage<< "\t\tBlock "<<b<<" computed resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl;
-      }
-      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
-
-      Linop.HermOp(Psi, AP);
-      AP = AP-Src;
-      std::cout <<GridLogMessage << "\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
-
-      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
-      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
-      std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
-      std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
-      std::cout << GridLogMessage << "\tNorm       " << sliceNormTimer.Elapsed() <<std::endl;
-      std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
-
-
-      IterationsToComplete = k;
-      return;
-    }
-
-  }
-  std::cout << GridLogMessage << "MultiRHSConjugateGradient did NOT converge" << std::endl;
-
-  if (ErrorOnNoConverge) assert(0);
-  IterationsToComplete = k; // the loop ran to completion, so k == MaxIterations+1 here
-}
-
-};
-
-}
-#endif
@@ -1,256 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/iterative/ConjugateGradientReliableUpdate.h
-
-    Copyright (C) 2015
-
-Author: Christopher Kelly <ckelly@phys.columbia.edu>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
-#define GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
-
-namespace Grid {
-
-  // Mixed-precision conjugate gradient with reliable updates.
-  //
-  // The CG recurrence runs on FieldF fields (r_f, p_f, psi_f, mmp_f) using Linop_f, or the optional
-  // fallback operator once it has been switched in.  Whenever the iterated residual norm falls below
-  // Delta times the largest residual seen since the previous update, the FieldF solution is added into
-  // the FieldD solution psi, the residual is recomputed with Linop_d, and the FieldF solution is reset.
-  //
-  // The enable_if constraints require getPrecision<FieldD>::value == 2 and
-  // getPrecision<FieldF>::value == 1.
-  template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
-  class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> {
-  public:
-    bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
-    // Defaults true.
-    RealD Tolerance;
-    Integer MaxIterations;
-    Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
-    Integer ReliableUpdatesPerformed; //Number of reliable updates done by the last solve
-
-    bool DoFinalCleanup; //Final DP cleanup, defaults to true
-    Integer IterationsToCleanup; //Final DP cleanup step iterations (0 when no cleanup was run)
-    
-    LinearOperatorBase<FieldF> &Linop_f;
-    LinearOperatorBase<FieldD> &Linop_d;
-    GridBase* SinglePrecGrid; //grid on which the FieldF work fields are allocated
-    RealD Delta; //reliable update parameter
-
-    //Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
-    LinearOperatorBase<FieldF> *Linop_fallback;
-    RealD fallback_transition_tol; //compared against the squared residual norm cp; only read when Linop_fallback != NULL
-
-    // Initialisers are listed in declaration order (the order in which they actually run), and the
-    // result counters start from zero so they are never read uninitialised.
-    ConjugateGradientReliableUpdate(RealD tol, Integer maxit, RealD _delta, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d, bool err_on_no_conv = true)
-      : ErrorOnNoConverge(err_on_no_conv),
-        Tolerance(tol),
-        MaxIterations(maxit),
-        IterationsToComplete(0),
-        ReliableUpdatesPerformed(0),
-        DoFinalCleanup(true),
-        IterationsToCleanup(0),
-        Linop_f(_Linop_f),
-        Linop_d(_Linop_d),
-        SinglePrecGrid(_sp_grid),
-        Delta(_delta),
-        Linop_fallback(NULL),
-        fallback_transition_tol(0.0)
-    {}
-
-    // Registers an alternative FieldF operator; operator() switches to it (once) after the squared
-    // residual norm drops below _fallback_transition_tol.
-    void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
-      Linop_fallback = &_Linop_fallback;
-      fallback_transition_tol = _fallback_transition_tol;      
-    }
-    
-    // Solves for psi given src, using psi on entry as the initial guess.
-    // On return IterationsToComplete, ReliableUpdatesPerformed and IterationsToCleanup describe this solve.
-    void operator()(const FieldD &src, FieldD &psi) {
-      LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
-      bool using_fallback = false;
-
-      // Reset the per-solve counters so that every exit path (including the early return below)
-      // leaves them well defined rather than stale or uninitialised.
-      IterationsToComplete     = 0;
-      ReliableUpdatesPerformed = 0;
-      IterationsToCleanup      = 0;
-      
-      psi.checkerboard = src.checkerboard;
-      conformable(psi, src);
-
-      RealD cp, c, a, d, b, ssq, qq, b_pred;
-
-      FieldD p(src);
-      FieldD mmp(src);
-      FieldD r(src);
-
-      // Initial residual computation & set up
-      RealD guess = norm2(psi);
-      assert(std::isnan(guess) == 0);
-    
-      Linop_d.HermOpAndNorm(psi, mmp, d, b);
-    
-      r = src - mmp;
-      p = r;
-
-      a = norm2(p);
-      cp = a;
-      ssq = norm2(src);
-
-      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: guess " << guess << std::endl;
-      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:   src " << ssq << std::endl;
-      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:    mp " << d << std::endl;
-      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:   mmp " << b << std::endl;
-      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:  cp,r " << cp << std::endl;
-      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:     p " << a << std::endl;
-
-      // Target for the squared residual norm
-      RealD rsq = Tolerance * Tolerance * ssq;
-
-      // Check if guess is really REALLY good :)
-      if (cp <= rsq) {
-        std::cout << GridLogMessage << "ConjugateGradientReliableUpdate guess was REALLY good\n";
-        std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
-        return;
-      }
-
-      //Single prec initialization
-      FieldF r_f(SinglePrecGrid);
-      r_f.checkerboard = r.checkerboard;
-      precisionChange(r_f, r);
-
-      FieldF psi_f(r_f);
-      psi_f = zero;
-
-      // p_f is not assigned again before its first use in the loop, so the initial search
-      // direction is whatever this construction from r_f produces.
-      FieldF p_f(r_f);
-      FieldF mmp_f(r_f);
-
-      RealD MaxResidSinceLastRelUp = cp; //initial residual    
-    
-      std::cout << GridLogIterative << std::setprecision(4)
-                << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
-
-      GridStopWatch LinalgTimer;
-      GridStopWatch MatrixTimer;
-      GridStopWatch SolverTimer;
-
-      SolverTimer.Start();
-      int k = 0;
-      int l = 0; // number of reliable updates performed so far
-    
-      for (k = 1; k <= MaxIterations; k++) {
-        c = cp;
-
-        MatrixTimer.Start();
-        Linop_f_use->HermOpAndNorm(p_f, mmp_f, d, qq);
-        MatrixTimer.Stop();
-
-        LinalgTimer.Start();
-
-        a = c / d;
-        b_pred = a * (a * qq - d) / c;
-
-        cp = axpy_norm(r_f, -a, mmp_f, r_f);
-        b = cp / c;
-
-        // Fuse these loops ; should be really easy
-        psi_f = a * p_f + psi_f;
-        //p_f = p_f * b + r_f;   (deferred until after the reliable-update check below)
-
-        LinalgTimer.Stop();
-
-        std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: Iteration " << k
-                  << " residual " << cp << " target " << rsq << std::endl;
-        std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << "  b = "<< b << std::endl;
-        std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << "  c = "<< c << std::endl;
-
-        if(cp > MaxResidSinceLastRelUp){
-          std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: updating MaxResidSinceLastRelUp : " << MaxResidSinceLastRelUp << " -> " << cp << std::endl;
-          MaxResidSinceLastRelUp = cp;
-        }
-          
-        // Stopping condition
-        if (cp <= rsq) {
-          //Although not written in the paper, I assume that I have to add on the final solution
-          precisionChange(mmp, psi_f);
-          psi = psi + mmp;
-        
-        
-          SolverTimer.Stop();
-          Linop_d.HermOpAndNorm(psi, mmp, d, qq);
-          p = mmp - src;
-
-          RealD srcnorm = sqrt(norm2(src));
-          RealD resnorm = sqrt(norm2(p));
-          RealD true_residual = resnorm / srcnorm;
-
-          std::cout << GridLogMessage << "ConjugateGradientReliableUpdate Converged on iteration " << k << " after " << l << " reliable updates" << std::endl;
-          std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
-          std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
-          std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
-
-          std::cout << GridLogMessage << "Time breakdown "<<std::endl;
-          std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
-          std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
-          std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
-
-          IterationsToComplete = k;     
-          ReliableUpdatesPerformed = l;
-          
-          if(DoFinalCleanup){
-            //Do a final CG to cleanup
-            std::cout << GridLogMessage << "ConjugateGradientReliableUpdate performing final cleanup.\n";
-            ConjugateGradient<FieldD> CG(Tolerance,MaxIterations);
-            CG.ErrorOnNoConverge = ErrorOnNoConverge;
-            CG(Linop_d,src,psi);
-            IterationsToCleanup = CG.IterationsToComplete;
-          }
-          else if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
-
-          std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n";
-          return;
-        }
-        else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
-          std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
-                    << cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
-          // Fold the FieldF partial solution into psi and recompute the residual with Linop_d
-          precisionChange(mmp, psi_f);
-          psi = psi + mmp;
-
-          Linop_d.HermOpAndNorm(psi, mmp, d, qq);
-          r = src - mmp;
-
-          psi_f = zero;
-          precisionChange(r_f, r);
-          cp = norm2(r);
-          MaxResidSinceLastRelUp = cp;
-
-          // b is recomputed from the refreshed residual norm
-          b = cp/c;
-          
-          std::cout << GridLogMessage << "ConjugateGradientReliableUpdate new residual " << cp << std::endl;
-          
-          l = l+1;
-        }
-
-        p_f = p_f * b + r_f; //update search vector after reliable update appears to help convergence
-
-        if(!using_fallback && Linop_fallback != NULL && cp < fallback_transition_tol){
-          std::cout << GridLogMessage << "ConjugateGradientReliableUpdate switching to fallback linear operator on iteration " << k << " at residual " << cp << std::endl;
-          Linop_f_use = Linop_fallback;
-          using_fallback = true;
-        }
-
-        
-      }
-      std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge"
-                << std::endl;
-      
-      // NOTE(review): on this path psi_f accumulated since the last reliable update is not added
-      // into psi -- confirm whether that is intended when ErrorOnNoConverge is false.
-      if (ErrorOnNoConverge) assert(0);
-      IterationsToComplete = k;
-      ReliableUpdatesPerformed = l;      
-    }    
-  };
-
-
-};
-
-
-
-#endif
@@ -1,104 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/iterative/Deflation.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef GRID_DEFLATION_H
-#define GRID_DEFLATION_H
-
-namespace Grid { 
-
-// Initial-guess functor that ignores the source and sets the guess to zero.
-template<class Field>
-class ZeroGuesser: public LinearFunction<Field> {
-public:
-  virtual void operator()(const Field &src, Field &guess)
-  {
-    guess = zero;
-  }
-};
-
-// Initial-guess functor that uses the source itself as the guess.
-template<class Field>
-class SourceGuesser: public LinearFunction<Field> {
-public:
-  virtual void operator()(const Field &src, Field &guess)
-  {
-    guess = src;
-  }
-};
-
-////////////////////////////////
-// Fine grid deflation
-////////////////////////////////
-// Initial-guess functor built from a set of eigenpairs:
-//   guess = sum_i  evec[i] * innerProduct(evec[i],src) / eval[i]
-// Both vectors are held by reference, so they must outlive this object.
-template<class Field>
-class DeflatedGuesser: public LinearFunction<Field> {
-private:
-  const std::vector<Field> &evec;
-  const std::vector<RealD> &eval;
-
-public:
-
-  DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
-
-  // Overwrites guess with the deflated guess for src; guess takes src's checkerboard.
-  virtual void operator()(const Field &src,Field &guess) {
-    guess = zero;
-    assert(evec.size()==eval.size());
-    // Use an unsigned index so the loop bound comparison is not a signed/unsigned mix.
-    const std::size_t N = evec.size();
-    for (std::size_t i=0;i<N;i++) {
-      const Field& tmp = evec[i];
-      axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
-    }
-    guess.checkerboard = src.checkerboard;
-  }
-};
-
-// Initial-guess functor that deflates on a coarse grid:
-//   1. blockProject src onto the coarse grid using `subspace`,
-//   2. accumulate sum_i evec_coarse[i] * innerProduct(evec_coarse[i],src_coarse) / eval_coarse[i],
-//   3. blockPromote the coarse result back into `guess`.
-// All three vectors are held by reference, so they must outlive this object.
-template<class FineField, class CoarseField>
-class LocalCoherenceDeflatedGuesser: public LinearFunction<FineField> {
-private:
-  const std::vector<FineField>   &subspace;
-  const std::vector<CoarseField> &evec_coarse;
-  const std::vector<RealD>       &eval_coarse;
-public:
-  
-  LocalCoherenceDeflatedGuesser(const std::vector<FineField>   &_subspace,
-				const std::vector<CoarseField> &_evec_coarse,
-				const std::vector<RealD>       &_eval_coarse)
-    : subspace(_subspace), 
-      evec_coarse(_evec_coarse), 
-      eval_coarse(_eval_coarse)  
-  {
-  }
-  
-  // Overwrites guess with the coarse-deflated guess for src; guess takes src's checkerboard.
-  void operator()(const FineField &src,FineField &guess) { 
-    int N = (int)evec_coarse.size();
-    // Every coarse eigenvector used below needs a matching eigenvalue.
-    assert(eval_coarse.size() >= evec_coarse.size());
-    if ( N == 0 ) {
-      // No eigenvectors: the sum is empty, and evec_coarse[0] must not be touched.
-      guess = zero;
-      guess.checkerboard = src.checkerboard;
-      return;
-    }
-    CoarseField src_coarse(evec_coarse[0]._grid);
-    CoarseField guess_coarse(evec_coarse[0]._grid);    guess_coarse = zero;
-    blockProject(src_coarse,src,subspace);    
-    for (int i=0;i<N;i++) {
-      const CoarseField & tmp = evec_coarse[i];
-      axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse);
-    }
-    blockPromote(guess_coarse,guess,subspace);
-    guess.checkerboard = src.checkerboard;
-  };
-};
-
-
-
-}
-#endif
@@ -1,842 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: Chulwoo Jung <chulwoo@bnl.gov>
-Author: Christoph Lehner <clehner@bnl.gov>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef GRID_BIRL_H
-#define GRID_BIRL_H
-
-#include <string.h> //memset
-//#include <zlib.h>
-#include <sys/stat.h>
-
-namespace Grid { 
-
-  ////////////////////////////////////////////////////////
-  // Move following 100 LOC to lattice/Lattice_basis.h
-  ////////////////////////////////////////////////////////
-// Removes from w its components along basis[0] .. basis[k-1], one vector at a time.
-// w is not normalised here.
-template<class Field>
-void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k) 
-{
-  int idx = 0;
-  while ( idx < k ) {
-    const Field &b = basis[idx];
-    auto overlap = innerProduct(b,w);
-    w = w - overlap*b;
-    idx++;
-  }
-}
-
-// In-place rotation of a set of basis vectors by the matrix Qt:
-//   basis[j][ss] <- sum_{k=k0}^{k1-1} Qt(j,k) * basis[k][ss]    for j in [j0,j1)
-// applied independently at every outer site ss of basis[0]'s grid.
-// Nm is only used to size the per-thread scratch buffer B, which is indexed by j,
-// so j1 must not exceed Nm.
-template<class Field>
-void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm) 
-{
-  typedef typename Field::vector_object vobj;
-  GridBase* grid = basis[0]._grid;
-      
-  parallel_region
-  {
-
-    std::vector < vobj , commAllocator<vobj> > B(Nm); // Thread private
-       
-    parallel_for_internal(int ss=0;ss < grid->oSites();ss++){
-      // Clear the accumulators for the rows being produced
-      for(int j=j0; j<j1; ++j) B[j]=0.;
-      
-      // Accumulate all rotated rows into B first: basis[k] is still being read here,
-      // so the results cannot be written straight back into basis[j].
-      for(int j=j0; j<j1; ++j){
-	for(int k=k0; k<k1; ++k){
-	  B[j] +=Qt(j,k) * basis[k]._odata[ss];
-	}
-      }
-      // Only now overwrite the basis at this site
-      for(int j=j0; j<j1; ++j){
-	  basis[j]._odata[ss] = B[j];
-      }
-    }
-  }
-}
-
-// Extract a single rotated vector
-// Computes one rotated vector without modifying the basis:
-//   result[site] = sum_{col=k0}^{k1-1} Qt(j,col) * basis[col][site]
-// result takes the checkerboard of basis[0].  Nm is not used in the body.
-template<class Field>
-void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) 
-{
-  typedef typename Field::vector_object vobj;
-  GridBase* grid = basis[0]._grid;
-
-  result.checkerboard = basis[0].checkerboard;
-  parallel_for(int site=0;site < grid->oSites();site++){
-    vobj acc = zero;
-    for(int col=k0; col<k1; ++col){
-      acc +=Qt(j,col) * basis[col]._odata[site];
-    }
-    result._odata[site] = acc;
-  }
-}
-
-// Applies the permutation described by idx to _v and sort_vals in place, using swaps.
-//   idx[i] names the current position of the element that should end up at position i.
-// idx is consumed: each entry is rewritten to i once position i has been fixed.
-// Only the first idx.size() entries of _v and sort_vals are touched.
-template<class Field>
-void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx) 
-{
-  int vlen = idx.size();
-
-  assert(vlen>=1);
-  assert(vlen<=sort_vals.size());
-  assert(vlen<=_v.size());
-
-  for (size_t i=0;i<vlen;i++) {
-
-    if (idx[i] != i) {
-
-      //////////////////////////////////////
-      // idx[i] is a table of desired sources giving a permutation.
-      // Swap v[i] with v[idx[i]].
-      // Find  j>i for which _vnew[j] = _vold[i],
-      // track the move idx[j] => idx[i]
-      // track the move idx[i] => i
-      //////////////////////////////////////
-      size_t j;
-      for (j=i;j<idx.size();j++)
-	if (idx[j]==i)
-	  break;
-
-      // Positions below i are already final, so the wanted source must lie above i,
-      // and some later slot j must be the one that wants the element currently at i.
-      assert(idx[i] > i);     assert(j!=idx.size());      assert(idx[j]==i);
-
-      // Only the _odata members are exchanged; other Field members are left as they are.
-      std::swap(_v[i]._odata,_v[idx[i]]._odata); // should use vector move constructor, no data copy
-      std::swap(sort_vals[i],sort_vals[idx[i]]);
-
-      // The element that used to sit at i now lives at idx[i]; slot j must look there.
-      idx[j] = idx[i];
-      idx[i] = i;
-    }
-  }
-}
-
-// Returns the indices 0..sort_vals.size()-1 ordered by increasing |sort_vals[index]|.
-// sort_vals itself is not modified.
-inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals) 
-{
-  std::vector<int> order(sort_vals.size());
-
-  // Start from the identity permutation
-  int next = 0;
-  for (auto &slot : order) {
-    slot = next++;
-  }
-
-  // Order the indices by the magnitude of the value each one refers to
-  auto smaller_magnitude = [&sort_vals](int lhs, int rhs) {
-    return ::fabs(sort_vals[lhs]) < ::fabs(sort_vals[rhs]);
-  };
-  std::sort(order.begin(), order.end(), smaller_magnitude);
-
-  return order;
-}
-
-// Sorts _v and sort_vals together by |sort_vals|: increasing order by default,
-// or the reverse of that order when `reverse` is true.
-template<class Field>
-void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse) 
-{
-  std::vector<int> order = basisSortGetIndex(sort_vals);
-
-  if (reverse) {
-    std::reverse(order.begin(), order.end());
-  }
-
-  basisReorderInPlace(_v,sort_vals,order);
-}
-
-/////////////////////////////////////////////////////////////
-// Implicitly restarted lanczos
-/////////////////////////////////////////////////////////////
-// Abstract interface used by ImplicitlyRestartedLanczos to decide whether a candidate
-// eigenpair is acceptable.  Both calls receive the vector index j, a residual threshold,
-// the candidate vector, its eigenvalue (by reference, so it may be updated) and an
-// approximate largest eigenvalue; a non-zero return is treated as "converged" by the caller.
-template<class Field> class ImplicitlyRestartedLanczosTester 
-{
- public:
-  // Virtual destructor: this is a polymorphic base, so destruction through a base
-  // pointer or reference must reach the derived class.
-  virtual ~ImplicitlyRestartedLanczosTester() {}
-  virtual int TestConvergence(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox)=0;
-  virtual int ReconstructEval(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox)=0;
-};
-
-// Selects which routine ImplicitlyRestartedLanczos uses to diagonalise the
-// tridiagonal matrix (reported in the log as "DSTEGR", "QR" or "Eigen").
-enum IRLdiagonalisation { 
-  IRLdiagonaliseWithDSTEGR,
-  IRLdiagonaliseWithQR,
-  IRLdiagonaliseWithEigen
-};
-
-// Convergence tester that applies _HermOp directly to the candidate vector.
-// The operator is held by reference, so it must outlive this object.
-template<class Field> class ImplicitlyRestartedLanczosHermOpTester  : public ImplicitlyRestartedLanczosTester<Field>
-{
- public:
-
-  LinearFunction<Field>       &_HermOp;
-  ImplicitlyRestartedLanczosHermOpTester(LinearFunction<Field> &HermOp) : _HermOp(HermOp)  {  };
-
-  // For this tester, reconstructing the eigenvalue is the same operation as testing convergence.
-  int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox)
-  {
-    return TestConvergence(j,resid,B,eval,evalMaxApprox);
-  }
-
-  // Replaces eval by the Rayleigh quotient <B|H B>/<B|B> and returns 1 when
-  //   |H B - eval B|^2 / evalMaxApprox^2  <  eresid^2
-  // and 0 otherwise.  The incoming eval is only used for the log line.
-  // Side effect: sets std::cout's precision to 13.
-  int TestConvergence(int j,RealD eresid,Field &B, RealD &eval,RealD evalMaxApprox)
-  {
-    Field v(B);
-    RealD eval_poly = eval;
-    // Apply operator
-    _HermOp(B,v);
-
-    // The previously computed-but-unused norm2(v) (vv0) has been dropped.
-    RealD vnum = real(innerProduct(B,v)); // HermOp.
-    RealD vden = norm2(B);
-    eval   = vnum/vden;
-    v -= eval*B;
-
-    RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);
-
-    std::cout.precision(13);
-    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
-	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
-	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
-	     <<std::endl;
-
-    int conv=0;
-    if( (vv<eresid*eresid) ) conv = 1;
-
-    return conv;
-  }
-};
-
-template<class Field> 
-class ImplicitlyRestartedLanczos {
- private:
-  const RealD small = 1.0e-8;
-  int MaxIter;
-  int MinRestart; // Minimum number of restarts; only check for convergence after
-  int Nstop;   // Number of evecs checked for convergence
-  int Nk;      // Number of converged sought
-  //  int Np;      // Np -- Number of spare vecs in krylov space //  == Nm - Nk
-  int Nm;      // Nm -- total number of vectors
-  IRLdiagonalisation diagonalisation;
-  int orth_period;
-    
-  RealD OrthoTime;
-  RealD eresid, betastp;
-  ////////////////////////////////
-  // Embedded objects
-  ////////////////////////////////
-  LinearFunction<Field>       &_PolyOp;
-  LinearFunction<Field>       &_HermOp;
-  ImplicitlyRestartedLanczosTester<Field> &_Tester;
-  // Default tester provided (we need a ref to something in default case)
-  ImplicitlyRestartedLanczosHermOpTester<Field> SimpleTester;
-  /////////////////////////
-  // Constructor
-  /////////////////////////
-  
-public:       
-
-  //////////////////////////////////////////////////////////////////
-  // PAB:
-  //////////////////////////////////////////////////////////////////
-  // Too many options  & knobs. 
-  // Eliminate:
-  //   orth_period
-  //   betastp
-  //   MinRestart
-  //
-  // Do we really need orth_period
-  // What is the theoretical basis & guarantees of betastp ?
-  // Nstop=Nk viable?
-  // MinRestart avoidable with new convergence test?
-  // Could cut to PolyOp, HermOp, Tester, Nk, Nm, resid, maxiter (+diagonalisation)
-  // HermOp could be eliminated if we dropped the Power method for max eval.
-  // -- also: The eval, eval2, eval2_copy stuff is still unnecessarily unclear
-  //////////////////////////////////////////////////////////////////
- // Constructor taking a caller-supplied convergence tester.
- // PolyOp is the operator applied in each Lanczos step; HermOp is used for the
- // largest-eigenvalue estimate in calc() and by the embedded SimpleTester.
- // All three references are stored, so the referenced objects must outlive this one.
- ImplicitlyRestartedLanczos(LinearFunction<Field> & PolyOp,
-			    LinearFunction<Field> & HermOp,
-			    ImplicitlyRestartedLanczosTester<Field> & Tester,
-			    int _Nstop, // number of evecs checked for convergence
-			    int _Nk, // number of converged vectors sought
-			    int _Nm, // total number of vectors
-			    RealD _eresid, // residual threshold handed to the tester
-			    int _MaxIter, // Max iterations
-			    RealD _betastp=0.0, // stored only: the beta(k) < betastp test in calc() is commented out
-			    int _MinRestart=1, int _orth_period = 1,
-			    IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
-    SimpleTester(HermOp), _PolyOp(PolyOp),      _HermOp(HermOp), _Tester(Tester),
-    Nstop(_Nstop)  ,      Nk(_Nk),      Nm(_Nm),
-    eresid(_eresid),      betastp(_betastp),
-    MaxIter(_MaxIter)  ,      MinRestart(_MinRestart),
-    orth_period(_orth_period), diagonalisation(_diagonalisation)  { };
-
-    // Constructor without an explicit tester: _Tester is bound to the embedded
-    // SimpleTester member, which tests convergence by applying HermOp.
-    // PolyOp and HermOp are stored by reference and must outlive this object.
-    ImplicitlyRestartedLanczos(LinearFunction<Field> & PolyOp,
-			       LinearFunction<Field> & HermOp,
-			       int _Nstop, // number of evecs checked for convergence
-			       int _Nk, // number of converged vectors sought
-			       int _Nm, // total number of vectors
-			       RealD _eresid, // residual threshold handed to the tester
-			       int _MaxIter, // Max iterations
-			       RealD _betastp=0.0, // stored only: the beta(k) < betastp test in calc() is commented out
-			       int _MinRestart=1, int _orth_period = 1,
-			       IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
-    SimpleTester(HermOp),  _PolyOp(PolyOp),      _HermOp(HermOp), _Tester(SimpleTester),
-    Nstop(_Nstop)  ,      Nk(_Nk),      Nm(_Nm),
-    eresid(_eresid),      betastp(_betastp),
-    MaxIter(_MaxIter)  ,      MinRestart(_MinRestart),
-    orth_period(_orth_period), diagonalisation(_diagonalisation)  { };
-
-  ////////////////////////////////
-  // Helpers
-  ////////////////////////////////
-  // Scales v to unit 2-norm in place and returns the norm it had before scaling.
-  // No guard against a zero norm.
-  template<typename T>  static RealD normalise(T& v) 
-  {
-    RealD length = sqrt(norm2(v));
-    v = v * (1.0/length);
-    return length;
-  }
-
-  // Projects evec[0..k-1] out of w, renormalises w, and adds the elapsed
-  // wall time (in seconds) to OrthoTime.
-  void orthogonalize(Field& w, std::vector<Field>& evec,int k)
-  {
-    OrthoTime = OrthoTime - usecond()/1e6;
-
-    basisOrthogonalize(evec,w,k);
-    normalise(w);
-
-    OrthoTime = OrthoTime + usecond()/1e6;
-  }
-
-/* Rudy Arthur's thesis pp.137
------------------------
-Require: M > K, P = M − K
-Compute the factorization A V_M = V_M H_M + f_M e_M^†
-repeat
-  Q = I
-  for i = 1,...,P do
-    Q_i R_i = H_M − θ_i I ;  Q = Q Q_i
-    H_M = Q_i^† H_M Q_i
-  end for
-  β_K = H_M(K+1,K) ;  σ_K = Q(M,K)
-  r = v_{K+1} β_K + r σ_K
-  V_K = V_M(1:M) Q(1:M,1:K)
-  H_K = H_M(1:K,1:K)
-  → A V_K = V_K H_K + f_K e_K^†
-  Extend to an M = K + P step factorization A V_M = V_M H_M + f_M e_M^†
-until convergence
-*/
-  // Runs the implicitly restarted Lanczos iteration.
-  //   eval    : in/out; needs at least Nm entries on entry, resized to Nconv on return.
-  //   evec    : in/out; needs at least Nm fields living on src's grid, resized to Nconv on return.
-  //   src     : starting vector; copied into evec[0] and normalised.
-  //   Nconv   : out; number of vectors accepted by _Tester.ReconstructEval in the final test.
-  //   reverse : forwarded to basisSortInPlace when ordering the returned pairs.
-  // Calls abort() if MaxIter restarts complete without the convergence test passing.
-  void calc(std::vector<RealD>& eval, std::vector<Field>& evec,  const Field& src, int& Nconv, bool reverse=false)
-  {
-    GridBase *grid = src._grid;
-    assert(grid == evec[0]._grid);
-    
-    GridLogIRL.TimingMode(1);
-    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
-    std::cout << GridLogIRL <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 /  "<< MaxIter<< std::endl;
-    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
-    std::cout << GridLogIRL <<" -- seek   Nk    = " << Nk    <<" vectors"<< std::endl;
-    std::cout << GridLogIRL <<" -- accept Nstop = " << Nstop <<" vectors"<< std::endl;
-    std::cout << GridLogIRL <<" -- total  Nm    = " << Nm    <<" vectors"<< std::endl;
-    std::cout << GridLogIRL <<" -- size of eval = " << eval.size() << std::endl;
-    std::cout << GridLogIRL <<" -- size of evec = " << evec.size() << std::endl;
-    if ( diagonalisation == IRLdiagonaliseWithDSTEGR ) {
-      std::cout << GridLogIRL << "Diagonalisation is DSTEGR "<<std::endl;
-    } else if ( diagonalisation == IRLdiagonaliseWithQR ) { 
-      std::cout << GridLogIRL << "Diagonalisation is QR "<<std::endl;
-    }  else if ( diagonalisation == IRLdiagonaliseWithEigen ) { 
-      std::cout << GridLogIRL << "Diagonalisation is Eigen "<<std::endl;
-    }
-    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
-	
-    assert(Nm <= evec.size() && Nm <= eval.size());
-    
-    // quickly get an idea of the largest eigenvalue to more properly normalize the residuum
-    // (repeated application of _HermOp; stops early once successive estimates agree to 5%)
-    RealD evalMaxApprox = 0.0;
-    {
-      auto src_n = src;
-      auto tmp = src;
-      const int _MAX_ITER_IRL_MEVAPP_ = 50;
-      for (int i=0;i<_MAX_ITER_IRL_MEVAPP_;i++) {
-	normalise(src_n);
-	_HermOp(src_n,tmp);
-	RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
-	RealD vden = norm2(src_n);
-	RealD na = vnum/vden;
-	// Setting i to the limit ends the loop after this pass completes
-	if (fabs(evalMaxApprox/na - 1.0) < 0.05)
-	  i=_MAX_ITER_IRL_MEVAPP_;
-	evalMaxApprox = na;
-	std::cout << GridLogIRL << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
-	src_n = tmp;
-      }
-    }
-	
-    std::vector<RealD> lme(Nm);  
-    std::vector<RealD> lme2(Nm);
-    std::vector<RealD> eval2(Nm);
-    std::vector<RealD> eval2_copy(Nm);
-    Eigen::MatrixXd Qt = Eigen::MatrixXd::Zero(Nm,Nm);
-
-    Field f(grid);
-    Field v(grid);
-    int k1 = 1;
-    int k2 = Nk;
-    RealD beta_k;
-
-    Nconv = 0;
-  
-    // Set initial vector
-    evec[0] = src;
-    normalise(evec[0]);
-	
-    // Initial Nk steps
-    OrthoTime=0.;
-    for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
-    std::cout<<GridLogIRL <<"Initial "<< Nk <<"steps done "<<std::endl;
-    std::cout<<GridLogIRL <<"Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl;
-
-    //////////////////////////////////
-    // Restarting loop begins
-    //////////////////////////////////
-    int iter;
-    for(iter = 0; iter<MaxIter; ++iter){
-      
-      OrthoTime=0.;
-
-      std::cout<< GridLogMessage <<" **********************"<< std::endl;
-      std::cout<< GridLogMessage <<" Restart iteration = "<< iter << std::endl;
-      std::cout<< GridLogMessage <<" **********************"<< std::endl;
-
-      // Extend the factorisation from Nk to Nm vectors
-      std::cout<<GridLogIRL <<" running "<<Nm-Nk <<" steps: "<<std::endl;
-      for(int k=Nk; k<Nm; ++k) step(eval,lme,evec,f,Nm,k);
-      f *= lme[Nm-1];
-
-      std::cout<<GridLogIRL <<" "<<Nm-Nk <<" steps done "<<std::endl;
-      // NOTE(review): this message reuses the "Initial steps" label although it reports
-      // the orthogonalisation time of the restart steps just performed.
-      std::cout<<GridLogIRL <<"Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl;
-	  
-      //////////////////////////////////
-      // getting eigenvalues
-      //////////////////////////////////
-      for(int k=0; k<Nm; ++k){
-	eval2[k] = eval[k+k1-1];
-	lme2[k] = lme[k+k1-1];
-      }
-      Qt = Eigen::MatrixXd::Identity(Nm,Nm);
-      diagonalize(eval2,lme2,Nm,Nm,Qt,grid);
-      std::cout<<GridLogIRL <<" diagonalized "<<std::endl;
-
-      //////////////////////////////////
-      // sorting
-      //////////////////////////////////
-      eval2_copy = eval2;
-      // eval2 has Nm entries, so this partial_sort over begin()+Nm sorts the whole vector (descending)
-      std::partial_sort(eval2.begin(),eval2.begin()+Nm,eval2.end(),std::greater<RealD>());
-      std::cout<<GridLogIRL <<" evals sorted "<<std::endl;
-      const int chunk=8;
-      for(int io=0; io<k2;io+=chunk){
-	std::cout<<GridLogIRL << "eval "<< std::setw(3) << io ;
-	for(int ii=0;ii<chunk;ii++){
-	  if ( (io+ii)<k2 )
-	    std::cout<< " "<< std::setw(12)<< eval2[io+ii];
-	}
-	std::cout << std::endl;
-      }
-
-      //////////////////////////////////
-      // Implicitly shifted QR transformations
-      //////////////////////////////////
-      // The shifts are the sorted values eval2[k2..Nm-1]
-      Qt = Eigen::MatrixXd::Identity(Nm,Nm);
-      for(int ip=k2; ip<Nm; ++ip){ 
-	QR_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
-      }
-      std::cout<<GridLogIRL <<"QR decomposed "<<std::endl;
-
-      // (k2<Nm is asserted twice; the duplicate is harmless)
-      assert(k2<Nm);      assert(k2<Nm);      assert(k1>0);
-
-      basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis
-      std::cout<<GridLogIRL <<"basisRotated  by Qt"<<std::endl;
-      
-      ////////////////////////////////////////////////////
-      // Compressed vector f and beta(k2)
-      ////////////////////////////////////////////////////
-      f *= Qt(k2-1,Nm-1);
-      f += lme[k2-1] * evec[k2];
-      beta_k = norm2(f);
-      beta_k = sqrt(beta_k);
-      std::cout<<GridLogIRL<<" beta(k) = "<<beta_k<<std::endl;
-	  
-      RealD betar = 1.0/beta_k;
-      evec[k2] = betar * f;
-      lme[k2-1] = beta_k;
-	  
-      ////////////////////////////////////////////////////
-      // Convergence test
-      ////////////////////////////////////////////////////
-      for(int k=0; k<Nm; ++k){    
-	eval2[k] = eval[k];
-	lme2[k] = lme[k];
-      }
-      Qt = Eigen::MatrixXd::Identity(Nm,Nm);
-      diagonalize(eval2,lme2,Nk,Nm,Qt,grid);
-      std::cout<<GridLogIRL <<" Diagonalized "<<std::endl;
-	  
-      Nconv = 0;
-      if (iter >= MinRestart) {
-
-	std::cout << GridLogIRL << "Test convergence: rotate subset of vectors to test convergence " << std::endl;
-
-	Field B(grid); B.checkerboard = evec[0].checkerboard;
-
-	//  power of two search pattern;  not every evalue in eval2 is assessed.
-	//  Tested indices are Nstop-1, Nstop-2, Nstop-4, ... ; all of them must pass.
-	int allconv =1;
-	for(int jj = 1; jj<=Nstop; jj*=2){
-	  int j = Nstop-jj;
-	  RealD e = eval2_copy[j]; // Discard the evalue
-	  basisRotateJ(B,evec,Qt,j,0,Nk,Nm);	    
-	  if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) {
-	    allconv=0;
-	  }
-	}
-	// Do evec[0] for good measure
-	{ 
-	  int j=0;
-	  RealD e = eval2_copy[0]; 
-	  basisRotateJ(B,evec,Qt,j,0,Nk,Nm);	    
-	  if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) allconv=0;
-	}
-	if ( allconv ) Nconv = Nstop;
-
-	// test if we converged, if so, terminate
-	std::cout<<GridLogIRL<<" #modes converged: >= "<<Nconv<<"/"<<Nstop<<std::endl;
-	//	if( Nconv>=Nstop || beta_k < betastp){
-	if( Nconv>=Nstop){
-	  goto converged;
-	}
-	  
-      } else {
-	std::cout << GridLogIRL << "iter < MinRestart: do not yet test for convergence\n";
-      } // end of MinRestart check
-    } // end of iter loop
-
-    std::cout<<GridLogError<<"\n NOT converged.\n";
-    abort();
-	
-  converged:
-    {
-      Field B(grid); B.checkerboard = evec[0].checkerboard;
-      // Rotate the first Nk basis vectors by the Qt from the last convergence-test diagonalisation
-      basisRotate(evec,Qt,0,Nk,0,Nk,Nm);	    
-      std::cout << GridLogIRL << " Rotated basis"<<std::endl;
-      Nconv=0;
-      //////////////////////////////////////////////////////////////////////
-      // Full final convergence test; unconditionally applied
-      //////////////////////////////////////////////////////////////////////
-      // NOTE(review): the bound is j<=Nk, so evec[Nk] is also tested although the rotation
-      // above only covers indices 0..Nk-1 -- confirm whether j<Nk was intended.
-      for(int j = 0; j<=Nk; j++){
-	B=evec[j];
-	if( _Tester.ReconstructEval(j,eresid,B,eval2[j],evalMaxApprox) ) {
-	  Nconv++;
-	}
-      }
-
-      if ( Nconv < Nstop )
-	std::cout << GridLogIRL << "Nconv ("<<Nconv<<") < Nstop ("<<Nstop<<")"<<std::endl;
-
-      eval=eval2;
-      
-      //Keep only converged
-      // NOTE(review): resize keeps the first Nconv entries, which presumes the accepted
-      // vectors are exactly the leading ones -- verify.
-      eval.resize(Nconv);// Nstop?
-      evec.resize(Nconv,grid);// Nstop?
-      basisSortInPlace(evec,eval,reverse);
-      
-    }
-       
-    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
-    std::cout << GridLogIRL << "ImplicitlyRestartedLanczos CONVERGED ; Summary :\n";
-    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
-    std::cout << GridLogIRL << " -- Iterations  = "<< iter   << "\n";
-    std::cout << GridLogIRL << " -- beta(k)     = "<< beta_k << "\n";
-    std::cout << GridLogIRL << " -- Nconv       = "<< Nconv  << "\n";
-    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
-  }
-
- private:
-/* Saad PP. 195
-1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0
-2. For k = 1,2,...,m Do:
-3. wk:=Avk−βkv_{k−1}      
-4. αk:=(wk,vk)       // 
-5. wk:=wk−αkvk       // wk orthog vk 
-6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
-7. vk+1 := wk/βk+1
-8. EndDo
- */
-  // One step of the Lanczos three-term recurrence (steps 3-7 of the Saad
-  // listing above) for basis index k.
-  //   lmd[k] receives alpha_k, lme[k] receives beta_{k+1};
-  //   evec[k+1] receives the new normalised vector when k+1 < Nm;
-  //   w is workspace and holds that new vector on exit.
-  void step(std::vector<RealD>& lmd,
-            std::vector<RealD>& lme,
-            std::vector<Field>& evec,
-            Field& w,int Nm,int k)
-  {
-    assert( k< Nm );
-    const RealD tiny = 1.0e-20;
-
-    GridStopWatch gsw_op,gsw_o;
-
-    Field& v_k = evec[k];
-
-    // 3. w := A v_k - beta_k v_{k-1}
-    _PolyOp(v_k,w);
-    std::cout<<GridLogIRL << "PolyOp" <<std::endl;
-    if(k>0) w -= lme[k-1] * evec[k-1];
-
-    // 4. alpha_k := (v_k,w); only the real part is kept
-    const ComplexD zalph = innerProduct(v_k,w);
-    const RealD     alph = real(zalph);
-
-    // 5. w := w - alpha_k v_k
-    w = w - alph * v_k;
-
-    // 6./7. beta_{k+1} := |w| and w := w/beta_{k+1}, both done by normalise()
-    const RealD beta = normalise(w);
-
-    lmd[k] = alph;
-    lme[k] = beta;
-
-    // Re-orthogonalise against the existing basis every orth_period steps
-    const bool reorthogonalise = (k>0) && (k % orth_period == 0);
-    if (reorthogonalise) {
-      orthogonalize(w,evec,k);
-      std::cout<<GridLogIRL << "Orthogonalised " <<std::endl;
-    }
-
-    if(k+1 < Nm) evec[k+1] = w;
-
-    std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
-    if ( beta < tiny ) {
-      std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl;
-    }
-  }
-
-  // Diagonalise the Nk x Nk symmetric tridiagonal matrix (diagonal lmd,
-  // off-diagonal lme) with Eigen's SelfAdjointEigenSolver.
-  // On exit lmd[0..Nk-1] holds the eigenvalues in the reverse of the order
-  // the solver reports them, and row r of Qt holds the matching eigenvector.
-  // Nm and grid are not used by this variant.
-  void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
-                         int Nk, int Nm,  
-                         Eigen::MatrixXd & Qt, // Nm x Nm
-                         GridBase *grid)
-  {
-    // Assemble the tridiagonal matrix one diagonal entry at a time
-    Eigen::MatrixXd T = Eigen::MatrixXd::Zero(Nk,Nk);
-    for(int d=0; d<Nk; d++) {
-      T(d,d) = lmd[d];
-      if ( d+1 < Nk ) {
-        T(d,d+1) = lme[d];
-        T(d+1,d) = lme[d];
-      }
-    }
-
-    Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> solver(T);
-
-    // Solver index n is stored at row Nk-1-n, i.e. in reversed order
-    for (int n = 0; n < Nk; n++) {
-      const int row = Nk-1-n;
-      lmd[row] = solver.eigenvalues()(n);
-      for (int col = 0; col < Nk; col++) {
-        Qt(row,col) = solver.eigenvectors()(col,n);
-      }
-    }
-  }
-
-  ///////////////////////////////////////////////////////////////////////////
-  // File could end here if settle on Eigen ??? !!!
-  ///////////////////////////////////////////////////////////////////////////
-  // One shifted sweep of plane rotations over rows kmin-1 .. kmax-1 of the
-  // tridiagonal matrix (diagonal lmd, off-diagonal lme), with shift Dsh.
-  // Each rotation is also applied to the matching pair of rows of Qt
-  // (first Nk columns). Nm is not used.
-  void QR_decomp(std::vector<RealD>& lmd,   // Nm 
-                 std::vector<RealD>& lme,   // Nm 
-                 int Nk, int Nm,            // Nk, Nm
-                 Eigen::MatrixXd& Qt,       // Nm x Nm matrix
-                 RealD Dsh, int kmin, int kmax)
-  {
-    RealD carry; // value handed from one rotation to the next
-
-    // First rotation: acts on rows (kmin-1,kmin) and is built from the
-    // shifted diagonal entry and its off-diagonal neighbour.
-    {
-      const int r = kmin-1;
-
-      const RealD inv = 1.0/hypot(lmd[r]-Dsh,lme[r]);
-      const RealD c = ( lmd[r] -Dsh) *inv;
-      const RealD s = -lme[r] *inv;
-
-      const RealD a0 = lmd[r];
-      const RealD a1 = lmd[r+1];
-      const RealD b  = lme[r];
-
-      lmd[r]   = c*c*a0 +s*s*a1 -2.0*c*s*b;
-      lmd[r+1] = s*s*a0 +c*c*a1 +2.0*c*s*b;
-      lme[r]   = c*s*(a0-a1) +(c*c-s*s)*b;
-      carry    =-s*lme[r+1];
-      lme[r+1] = c*lme[r+1];
-
-      for(int col=0; col<Nk; ++col){
-        const RealD up = Qt(r,col);
-        const RealD lo = Qt(r+1,col);
-        Qt(r,col)  = c*up - s*lo;
-        Qt(r+1,col)= s*up + c*lo; 
-      }
-    }
-
-    // Givens transformations: each is built from the carried value and the
-    // off-diagonal entry just above the current row pair.
-    for(int r = kmin; r < kmax-1; ++r){
-
-      const RealD inv = 1.0/hypot(carry,lme[r-1]);
-      const RealD c = lme[r-1]*inv;
-      const RealD s = - carry*inv;
-
-      const RealD a0 = lmd[r];
-      const RealD a1 = lmd[r+1];
-      const RealD b  = lme[r];
-
-      lmd[r]   = c*c*a0 +s*s*a1 -2.0*c*s*b;
-      lmd[r+1] = s*s*a0 +c*c*a1 +2.0*c*s*b;
-      lme[r]   = c*s*(a0-a1) +(c*c-s*s)*b;
-      lme[r-1] = c*lme[r-1] -s*carry;
-
-      // Nothing is carried out of the final row pair
-      const bool lastPair = (r == kmax-2);
-      if( !lastPair ){
-        carry = -s*lme[r+1];
-        lme[r+1] = c*lme[r+1];
-      }
-
-      for(int col=0; col<Nk; ++col){
-        const RealD up = Qt(r,col);
-        const RealD lo = Qt(r+1,col);
-        Qt(r,col)     = c*up -s*lo;
-        Qt(r+1,col)   = s*up +c*lo;
-      }
-    }
-  }
-
-  // Reset Qt to the Nm x Nm identity, then diagonalise the tridiagonal matrix
-  // (lmd, lme) with the backend selected by the `diagonalisation` member.
-  // An unrecognised selector trips assert(0).
-  void diagonalize(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
-                   int Nk, int Nm,   
-                   Eigen::MatrixXd & Qt,
-                   GridBase *grid)
-  {
-    Qt = Eigen::MatrixXd::Identity(Nm,Nm);
-
-    if ( diagonalisation == IRLdiagonaliseWithDSTEGR ) {
-      diagonalize_lapack(lmd,lme,Nk,Nm,Qt,grid);
-      return;
-    }
-    if ( diagonalisation == IRLdiagonaliseWithQR ) {
-      diagonalize_QR(lmd,lme,Nk,Nm,Qt,grid);
-      return;
-    }
-    if ( diagonalisation == IRLdiagonaliseWithEigen ) {
-      diagonalize_Eigen(lmd,lme,Nk,Nm,Qt,grid);
-      return;
-    }
-
-    // No backend matched
-    assert(0);
-  }
-
-#ifdef USE_LAPACK
-void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e,
-                   double *vl, double *vu, int *il, int *iu, double *abstol,
-                   int *m, double *w, double *z, int *ldz, int *isuppz,
-                   double *work, int *lwork, int *iwork, int *liwork,
-                   int *info);
-#endif
-
-// Diagonalise the Nk x Nk symmetric tridiagonal matrix (diagonal lmd,
-// off-diagonal lme) with LAPACK dstegr. The index range of eigenpairs is
-// divided between the processors of `grid`; each processor computes its own
-// share and the pieces are combined with GlobalSumVector.
-// On exit lmd[0..Nk-1] holds the eigenvalues in the reverse of LAPACK's
-// order and row r of Qt holds the matching eigenvector. Nm is not used.
-// Built without USE_LAPACK this function trips assert(0).
-// A non-zero LAPACK `info` is reported on GridLogError and aborts the run.
-void diagonalize_lapack(std::vector<RealD>& lmd,
-                        std::vector<RealD>& lme, 
-                        int Nk, int Nm,  
-                        Eigen::MatrixXd& Qt,
-                        GridBase *grid)
-{
-#ifdef USE_LAPACK
-  int NN = Nk;
-
-  // Heap storage: variable-length stack arrays are not standard C++ and the
-  // NN*NN eigenvector buffer can be too large for the stack.
-  std::vector<double> evals_tmp(NN,0.0);
-  std::vector<double> evec_tmp(NN*NN,0.0); // row-major: entry (i,j) at i*NN+j
-  std::vector<double> DD(NN,0.0);
-  std::vector<double> EE(NN,0.0);
-  for (int i = 0; i< NN; i++) {
-    DD[i] = lmd[i];
-    if ( i > 0 ) EE[i-1] = lme[i-1];
-  }
-
-  int evals_found;
-  int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ;
-  int liwork =  3+NN*10 ;
-  std::vector<int>    iwork(liwork);
-  std::vector<double> work(lwork);
-  std::vector<int>    isuppz(2*NN);
-  char jobz = 'V';  // calculate evals & evecs
-  char range = 'I'; // calculate the il-th through iu-th evals
-  int info = 0;
-
-  // This processor handles the 1-based index interval [il,iu]
-  int total = grid->_Nprocessors;
-  int node  = grid->_processor;
-  int interval = (NN/total)+1;
-  double vl = 0.0, vu = 0.0;
-  int il = interval*node+1 , iu = interval*(node+1);
-  if (iu > NN)  iu=NN;
-  double tol = 0.0;
-
-  if ( il <= NN){
-    LAPACK_dstegr(&jobz, &range, &NN,
-                  DD.data(), EE.data(),
-                  &vl, &vu, &il, &iu, // vl,vu are ignored for range 'I'
-                  &tol, // tolerance
-                  &evals_found, evals_tmp.data(), evec_tmp.data(), &NN,
-                  isuppz.data(),
-                  work.data(), &lwork, iwork.data(), &liwork,
-                  &info);
-    if ( info != 0 ) {
-      std::cout << GridLogError << "diagonalize_lapack: dstegr returned info = "<<info<<"\n";
-      abort();
-    }
-    // LAPACK packs the results at the front of the buffers; move them to
-    // their global positions il-1 .. iu-1 and zero the vacated slots so the
-    // global sum below reassembles the full set.
-    for (int i = iu-1; i>= il-1; i--){
-      const int from = i - (il-1);
-      evals_tmp[i] = evals_tmp[from];
-      if (il>1) evals_tmp[from]=0.;
-      for (int j = 0; j< NN; j++){
-        evec_tmp[i*NN+j] = evec_tmp[from*NN+j];
-        if (il>1) evec_tmp[from*NN+j]=0.;
-      }
-    }
-  }
-  grid->GlobalSumVector(evals_tmp.data(),NN);
-  grid->GlobalSumVector(evec_tmp.data(),NN*NN);
-
-  // Safer to sort instead of just reversing it, 
-  // but the document of the routine says evals are sorted in increasing order. 
-  // qr gives evals in decreasing order.
-  for(int i=0;i<NN;i++){
-    lmd [NN-1-i]=evals_tmp[i];
-    for(int j=0;j<NN;j++){
-      Qt((NN-1-i),j)=evec_tmp[i*NN+j];
-    }
-  }
-#else 
-  assert(0);
-#endif
-}
-
-// Diagonalise the Nk x Nk symmetric tridiagonal matrix (diagonal lmd,
-// off-diagonal lme) by repeated shifted sweeps of QR_decomp, shrinking the
-// active window [kmin,kmax] as off-diagonal entries become negligible.
-// Rotations are accumulated into Qt by QR_decomp. grid is not used.
-// Aborts if the window has not collapsed after 100*Nm sweeps.
-void diagonalize_QR(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
-                    int Nk, int Nm,   
-                    Eigen::MatrixXd & Qt,
-                    GridBase *grid)
-{
-  // A 1x1 (or empty) matrix is already diagonal. Returning here also avoids
-  // reading lmd[kmax-2] out of bounds below.
-  if ( Nk < 2 ) return;
-
-  int QRiter = 100*Nm;
-  int kmin = 1;
-  int kmax = Nk;
-  
-  // (this should be more sophisticated)
-  for(int iter=0; iter<QRiter; ++iter){
-    
-    // determination of 2x2 leading submatrix
-    RealD dsub = lmd[kmax-1]-lmd[kmax-2];
-    RealD dd = sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]);
-    // Sign of dsub, taken as +1 when dsub == 0: dsub/fabs(dsub) would be
-    // 0/0 = NaN there and poison every later sweep.
-    RealD sgn = (dsub < 0.0) ? -1.0 : 1.0;
-    RealD Dsh = 0.5*(lmd[kmax-2]+lmd[kmax-1] +dd*sgn);
-    // (Dsh: shift)
-    
-    // transformation
-    QR_decomp(lmd,lme,Nk,Nm,Qt,Dsh,kmin,kmax); // Nk, Nm
-    
-    // Convergence criterion (redefinition of kmin and kmax):
-    // scan upwards from the bottom for the first off-diagonal entry that is
-    // not negligible next to its diagonal neighbours.
-    bool allNegligible = true;
-    for(int j=kmax-1; j>= kmin; --j){
-      RealD dds = fabs(lmd[j-1])+fabs(lmd[j]);
-      if(fabs(lme[j-1])+dds > dds){
-        kmax = j+1;
-        allNegligible = false;
-        break;
-      }
-    }
-    if ( allNegligible ) return; // fully diagonal: done
-    
-    // Move kmin down to the first non-negligible off-diagonal entry
-    for(int j=0; j<kmax-1; ++j){
-      RealD dds = fabs(lmd[j])+fabs(lmd[j+1]);
-      if(fabs(lme[j])+dds > dds){
-        kmin = j+1;
-        break;
-      }
-    }
-  }
-  std::cout << GridLogError << "[QL method] Error - Too many iteration: "<<QRiter<<"\n";
-  abort();
-}
-};
-}
-#endif
@@ -1,406 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/iterative/LocalCoherenceLanczos.h
-
-    Copyright (C) 2015
-
-Author: Christoph Lehner <clehner@bnl.gov>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef GRID_LOCAL_COHERENCE_IRL_H
-#define GRID_LOCAL_COHERENCE_IRL_H
-
-namespace Grid { 
-
-
-// Serializable bundle of parameters for one Lanczos solve. Members are
-// declared through GRID_SERIALIZABLE_CLASS_MEMBERS as (type, name) pairs.
-struct LanczosParams : Serializable {
- public:
-  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams,
-				  ChebyParams, Cheby,/*Chebyshev*/
-				  int, Nstop,    /*Vecs in Lanczos must converge Nstop < Nk < Nm*/
-				  int, Nk,       /*Vecs in Lanczos seek converge*/
-				  int, Nm,       /*Total vecs in Lanczos include restart*/
-				  RealD, resid,  /*residual*/
- 				  int, MaxIt, /* presumably the iteration cap - TODO confirm */
-				  RealD, betastp,  /* NOTE(review): meaning not documented here */
-				  int, MinRes);    // Must restart
-};
-
-// Serializable bundle of run parameters for a local-coherence Lanczos job:
-// boolean stage switches, one LanczosParams each for the fine and coarse
-// solves, smoother parameters, and further physics/configuration inputs.
-// Members are declared as (type, name) pairs.
-// NOTE(review): the meaning of individual fields is not established in this
-// header; confirm against the code that consumes them.
-struct LocalCoherenceLanczosParams : Serializable {
- public:
-  GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
-				  bool, saveEvecs,
-				  bool, doFine,
-				  bool, doFineRead,
-				  bool, doCoarse,
-	       			  bool, doCoarseRead,
-				  LanczosParams, FineParams,
-				  LanczosParams, CoarseParams,
-				  ChebyParams,   Smoother,
-				  RealD        , coarse_relax_tol,
-				  std::vector<int>, blockSize,
-				  std::string, config,
-				  std::vector < std::complex<double>  >, omega,
-				  RealD, mass,
-				  RealD, M5);
-};
-
-// Coarse-space linear function: promote the coarse vector to the fine grid
-// through `subspace`, apply the fine operator's HermOp, project back.
-// Duplicate functionality; ProjectedFunctionHermOp could be used with the
-// trivial function instead.
-template<class Fobj,class CComplex,int nbasis>
-class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
-public:
-  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
-  typedef Lattice<CoarseSiteVector>           CoarseField;
-  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
-  typedef Lattice<Fobj>          FineField;
-
-  LinearOperatorBase<FineField> &_Linop;
-  std::vector<FineField>        &subspace;
-
-  // Both arguments are held by reference; the subspace must be non-empty.
-  ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace)
-    : _Linop(linop),
-      subspace(_subspace)
-  {
-    assert(subspace.size() >0);
-  }
-
-  // out = Project( HermOp( Promote(in) ) )
-  void operator()(const CoarseField& in, CoarseField& out) {
-    // Fine temporaries take their grid and checkerboard from the first basis vector
-    GridBase *fineGrid = subspace[0]._grid;
-    int       cb       = subspace[0].checkerboard;
-
-    FineField fineIn (fineGrid);
-    fineIn.checkerboard = cb;
-    FineField fineOut(fineGrid);
-    fineOut.checkerboard = cb;
-
-    blockPromote(in,fineIn,subspace);
-    std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
-
-    _Linop.HermOp(fineIn,fineOut);
-    std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
-
-    blockProject(out,fineOut,subspace);
-    std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
-  }
-};
-
-// Coarse-space linear function: promote the coarse vector to the fine grid
-// through `subspace`, apply the operator function `_poly` of the fine
-// operator, project back.
-template<class Fobj,class CComplex,int nbasis>
-class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
-public:
-  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
-  typedef Lattice<CoarseSiteVector>           CoarseField;
-  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
-  typedef Lattice<Fobj>          FineField;
-
-
-  OperatorFunction<FineField>   & _poly;
-  LinearOperatorBase<FineField> &_Linop;
-  std::vector<FineField>        &subspace;
-
-  // All three arguments are held by reference. The subspace may still be
-  // empty here; it only has to be filled before operator() is called.
-  ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,
-                          LinearOperatorBase<FineField>& linop, 
-                          std::vector<FineField> & _subspace) :
-    _poly(poly),
-    _Linop(linop),
-    subspace(_subspace)
-  {  };
-
-  // out = Project( poly(Linop)( Promote(in) ) )
-  void operator()(const CoarseField& in, CoarseField& out) {
-
-    // subspace[0] supplies the fine grid and checkerboard, so it must exist
-    assert(subspace.size() >0);
-
-    GridBase *FineGrid = subspace[0]._grid;    
-    int   checkerboard = subspace[0].checkerboard;
-
-    FineField fin (FineGrid); fin.checkerboard =checkerboard;
-    FineField fout(FineGrid);fout.checkerboard =checkerboard;
-    
-    blockPromote(in,fin,subspace);             std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
-    _poly(_Linop,fin,fout);                    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl;
-    blockProject(out,fout,subspace);           std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
-  }
-};
-
-// Convergence tester for a coarse-grid Lanczos.
-//   TestConvergence : cheap test done entirely in the coarse space with _Poly.
-//   ReconstructEval : promotes the coarse vector to the fine grid, applies
-//                     the smoother, and measures the residual of the fine
-//                     operator's HermOp.
-// All operators and the subspace are held by reference.
-template<class Fobj,class CComplex,int nbasis>
-class ImplicitlyRestartedLanczosSmoothedTester  : public ImplicitlyRestartedLanczosTester<Lattice<iVector<CComplex,nbasis > > >
-{
- public:
-  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
-  typedef Lattice<CoarseSiteVector>           CoarseField;
-  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
-  typedef Lattice<Fobj>          FineField;
-
-  LinearFunction<CoarseField> & _Poly;
-  OperatorFunction<FineField>   & _smoother;
-  LinearOperatorBase<FineField> &_Linop;
-  RealD                          _coarse_relax_tol;
-  std::vector<FineField>        &_subspace;
-  
-  // Initialisers are listed in member declaration order, which is the order
-  // in which they actually run.
-  ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField>   &Poly,
-                                           OperatorFunction<FineField>   &smoother,
-                                           LinearOperatorBase<FineField> &Linop,
-                                           std::vector<FineField>        &subspace,
-                                           RealD coarse_relax_tol=5.0e3) 
-    : _Poly(Poly), _smoother(smoother), _Linop(Linop),
-      _coarse_relax_tol(coarse_relax_tol), _subspace(subspace)
-  {    };
-
-  // Coarse-space test of (B,eval).
-  // eval is overwritten with the Rayleigh quotient (B,Poly B)/(B,B).
-  // Returns 1 when |Poly B - eval B|^2 / evalMaxApprox^2 < eresid^2, else 0.
-  int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
-  {
-    CoarseField v(B);
-    RealD eval_poly = eval; // incoming value, kept only for the log line
-
-    // Apply operator
-    _Poly(B,v);
-
-    RealD vnum = real(innerProduct(B,v)); // HermOp.
-    RealD vden = norm2(B);
-    eval   = vnum/vden;
-    v -= eval*B;
-
-    RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);
-
-    std::cout.precision(13);
-    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
-             <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
-             <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
-             <<std::endl;
-
-    return (vv<eresid*eresid) ? 1 : 0;
-  }
-
-  // Fine-grid test of (B,eval): promote B, smooth it, then form the Rayleigh
-  // quotient and residual with the fine HermOp. eval is overwritten with the
-  // fine-grid Rayleigh quotient. For j > nbasis the tolerance eresid is
-  // multiplied by _coarse_relax_tol. Returns 1 on convergence, else 0.
-  int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
-  {
-    GridBase *FineGrid = _subspace[0]._grid;    
-    int checkerboard   = _subspace[0].checkerboard;
-    FineField fB(FineGrid);fB.checkerboard =checkerboard;
-    FineField fv(FineGrid);fv.checkerboard =checkerboard;
-
-    blockPromote(B,fv,_subspace);  
-    
-    _smoother(_Linop,fv,fB); 
-
-    RealD eval_poly = eval; // incoming value, kept only for the log line
-    _Linop.HermOp(fB,fv);
-
-    RealD vnum = real(innerProduct(fB,fv)); // HermOp.
-    RealD vden = norm2(fB);
-    eval   = vnum/vden;
-    fv -= eval*fB;
-    RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0);
-
-    std::cout.precision(13);
-    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
-             <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
-             <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
-             <<std::endl;
-    // NOTE(review): j == nbasis is NOT relaxed here, whereas
-    // LocalCoherenceLanczos::testCoarse already passes a relaxed tolerance
-    // for every k >= nbasis - confirm which boundary is intended.
-    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
-    if( (vv<eresid*eresid) ) return 1;
-    return 0;
-  }
-};
-
-////////////////////////////////////////////
-// Make serializable Lanczos params
-////////////////////////////////////////////
-// Driver for a two-level Lanczos: a fine-grid solve (calcFine) produces
-// `subspace`, and a coarse-grid solve (calcCoarse) works in the space
-// projected through that subspace. Results live either in the object's own
-// storage (first constructor) or in caller-supplied vectors (second one).
-template<class Fobj,class CComplex,int nbasis>
-class LocalCoherenceLanczos 
-{
-public:
-  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
-  typedef Lattice<CComplex>                   CoarseScalar; // used for inner products on fine field
-  typedef Lattice<CoarseSiteVector>           CoarseField;
-  typedef Lattice<Fobj>                       FineField;
-
-protected:
-  GridBase *_CoarseGrid;
-  GridBase *_FineGrid;
-  int _checkerboard;
-  LinearOperatorBase<FineField>                 & _FineOp;
-  
-  // References to whichever storage is in use (internal or external)
-  std::vector<RealD>                              &evals_fine;
-  std::vector<RealD>                              &evals_coarse; 
-  std::vector<FineField>                          &subspace;
-  std::vector<CoarseField>                        &evec_coarse;
-
-private:
-  // Internal storage, referenced only when the first constructor is used
-  std::vector<RealD>                              _evals_fine;
-  std::vector<RealD>                              _evals_coarse; 
-  std::vector<FineField>                          _subspace;
-  std::vector<CoarseField>                        _evec_coarse;
-
-public:
-
-  // Internal-storage constructor. Initialisers are listed in member
-  // declaration order, which is the order in which they actually run.
-  LocalCoherenceLanczos(GridBase *FineGrid,
-                        GridBase *CoarseGrid,
-                        LinearOperatorBase<FineField> &FineOp,
-                        int checkerboard) :
-    _CoarseGrid(CoarseGrid),
-    _FineGrid(FineGrid),
-    _checkerboard(checkerboard),
-    _FineOp(FineOp),
-    evals_fine  (_evals_fine),
-    evals_coarse(_evals_coarse),
-    subspace    (_subspace),
-    evec_coarse(_evec_coarse)
-  {
-    evals_fine.resize(0);
-    evals_coarse.resize(0);
-  };
-  //////////////////////////////////////////////////////////////////////////
-  // Alternate constructor, external storage for use by Hadrons module
-  //////////////////////////////////////////////////////////////////////////
-  LocalCoherenceLanczos(GridBase *FineGrid,
-                        GridBase *CoarseGrid,
-                        LinearOperatorBase<FineField> &FineOp,
-                        int checkerboard,
-                        std::vector<FineField>   &ext_subspace,
-                        std::vector<CoarseField> &ext_coarse,
-                        std::vector<RealD>       &ext_eval_fine,
-                        std::vector<RealD>       &ext_eval_coarse
-                        ) :
-    _CoarseGrid(CoarseGrid),
-    _FineGrid(FineGrid),
-    _checkerboard(checkerboard),
-    _FineOp(FineOp),
-    evals_fine  (ext_eval_fine), 
-    evals_coarse(ext_eval_coarse),
-    subspace    (ext_subspace),
-    evec_coarse (ext_coarse)
-  {
-    evals_fine.resize(0);
-    evals_coarse.resize(0);
-  };
-
-  // Block-orthogonalise the subspace; the pass is applied twice.
-  void Orthogonalise(void ) {
-    CoarseScalar InnerProd(_CoarseGrid);
-    std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
-    blockOrthogonalise(InnerProd,subspace);
-    std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
-    blockOrthogonalise(InnerProd,subspace);
-  };
-
-  // Scale v to unit 2-norm in place and return the norm it had.
-  template<typename T>  static RealD normalise(T& v) 
-  {
-    RealD nn = norm2(v);
-    nn = ::sqrt(nn);
-    v = v * (1.0/nn);
-    return nn;
-  }
-  /*
-  void fakeFine(void)
-  {
-    int Nk = nbasis;
-    subspace.resize(Nk,_FineGrid);
-    subspace[0]=1.0;
-    subspace[0].checkerboard=_checkerboard;
-    normalise(subspace[0]);
-    PlainHermOp<FineField>    Op(_FineOp);
-    for(int k=1;k<Nk;k++){
-      subspace[k].checkerboard=_checkerboard;
-      Op(subspace[k-1],subspace[k]);
-      normalise(subspace[k]);
-    }
-  }
-  */
-
-  // Re-test every fine eigenpair against `resid`; each must pass.
-  // ReconstructEval also overwrites evals_fine[k].
-  void testFine(RealD resid) 
-  {
-    assert(evals_fine.size() == nbasis);
-    assert(subspace.size() == nbasis);
-    PlainHermOp<FineField>    Op(_FineOp);
-    ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op);
-    for(int k=0;k<nbasis;k++){
-      // Call outside assert(): with NDEBUG the assert expression is not
-      // evaluated, which would silently skip the test and the eval update.
-      int ok = SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0);
-      assert(ok==1);
-      (void)ok; // unused when asserts are compiled out
-    }
-  }
-
-  // Re-test every coarse eigenpair with the smoothed tester; each must pass.
-  // Vectors with k >= nbasis are tested against resid*relax.
-  // ReconstructEval also overwrites evals_coarse[k].
-  void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax) 
-  {
-    assert(evals_fine.size() == nbasis);
-    assert(subspace.size() == nbasis);
-    //////////////////////////////////////////////////////////////////////////////////////////////////
-    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
-    //////////////////////////////////////////////////////////////////////////////////////////////////
-    Chebyshev<FineField>                          ChebySmooth(cheby_smooth);
-    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,subspace);
-    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
-
-    const int Ncoarse = (int)evec_coarse.size();
-    for(int k=0;k<Ncoarse;k++){
-      const RealD tol = ( k < nbasis ) ? resid : resid*relax;
-      // Call outside assert() so the test still runs under NDEBUG
-      int ok = ChebySmoothTester.ReconstructEval(k,tol,evec_coarse[k],evals_coarse[k],1.0);
-      assert(ok==1);
-      (void)ok; // unused when asserts are compiled out
-    }
-  }
-
-  // Fine-grid Lanczos with a Chebyshev-filtered operator, started from a
-  // constant source on the stored checkerboard. On exit evals_fine and
-  // subspace are truncated to the first nbasis entries.
-  void calcFine(ChebyParams cheby_parms,int Nstop,int Nk,int Nm,RealD resid, 
-                RealD MaxIt, RealD betastp, int MinRes)
-  {
-    assert(nbasis<=Nm);
-    Chebyshev<FineField>      Cheby(cheby_parms);
-    FunctionHermOp<FineField> ChebyOp(Cheby,_FineOp);
-    PlainHermOp<FineField>    Op(_FineOp);
-
-    evals_fine.resize(Nm);
-    subspace.resize(Nm,_FineGrid);
-
-    ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
-
-    FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard;
-
-    int Nconv;
-    IRL.calc(evals_fine,subspace,src,Nconv,false);
-    
-    // Shrink down to number saved
-    assert(Nstop>=nbasis);
-    assert(Nconv>=nbasis);
-    evals_fine.resize(nbasis);
-    subspace.resize(nbasis,_FineGrid);
-  }
-
-  // Coarse-grid Lanczos on the projected, Chebyshev-filtered operator, with
-  // convergence judged by the smoothed tester. On exit evals_coarse and
-  // evec_coarse are truncated to Nstop entries and the eigenvalues printed.
-  void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
-                  int Nstop, int Nk, int Nm,RealD resid, 
-                  RealD MaxIt, RealD betastp, int MinRes)
-  {
-    Chebyshev<FineField>                          Cheby(cheby_op);
-    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace);
-    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
-    //////////////////////////////////////////////////////////////////////////////////////////////////
-    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
-    //////////////////////////////////////////////////////////////////////////////////////////////////
-
-    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth);
-    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
-
-    evals_coarse.resize(Nm);
-    evec_coarse.resize(Nm,_CoarseGrid);
-
-    CoarseField src(_CoarseGrid);     src=1.0; 
-
-    ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
-    int Nconv=0;
-    IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
-    assert(Nconv>=Nstop);
-    evals_coarse.resize(Nstop);
-    evec_coarse.resize (Nstop,_CoarseGrid);
-    for (int i=0;i<Nstop;i++){
-      std::cout << i << " Coarse eval = " << evals_coarse[i]  << std::endl;
-    }
-  }
-};
-
-}
-#endif
@@ -1,186 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/iterative/SchurRedBlack.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#pragma once
-
-namespace Grid {
-namespace QCD {
-
-
-// Unpreconditioned solve with the matrix temporarily set to mass 1.0:
-// forms Mdag*src and hands it to CG with an MdagMLinearOperator wrapped
-// around the matrix. The original mass is restored before returning.
-template<class Field>
-class PauliVillarsSolverUnprec
-{
- public:
-  ConjugateGradient<Field> & CG;
-
-  PauliVillarsSolverUnprec(  ConjugateGradient<Field> &_CG)
-    : CG(_CG)
-  {
-  }
-
-  // sol receives the CG result for right-hand side Mdag(mass=1) * src.
-  template<class Matrix>
-  void operator() (Matrix &_Matrix,const Field &src,Field &sol)
-  {
-    const RealD savedMass = _Matrix.Mass();
-
-    Field rhs(_Matrix.FermionGrid());
-    MdagMLinearOperator<Matrix,Field> NormalOp(_Matrix);
-
-    _Matrix.SetMass(1.0);       // switch to the mass-1.0 operator
-    _Matrix.Mdag(src,rhs);
-    CG(NormalOp,rhs,sol);
-    _Matrix.SetMass(savedMass); // put the caller's mass back
-  }
-};
-
-// Red-black preconditioned solve with the matrix temporarily set to mass
-// 1.0, delegated to SchurRedBlackDiagMooeeSolve built on the stored CG.
-// The original mass is restored before returning.
-template<class Field>
-class PauliVillarsSolverRBprec
-{
- public:
-  ConjugateGradient<Field> & CG;
-  PauliVillarsSolverRBprec(  ConjugateGradient<Field> &_CG) : CG(_CG){};
-
-  // sol receives the Schur solver's result for source src at mass 1.0.
-  template<class Matrix>
-  void operator() (Matrix &_Matrix,const Field &src,Field &sol)
-  {
-    RealD m = _Matrix.Mass();
-    // No scratch field is needed here: the Schur solver is handed src and
-    // sol directly.
-
-    _Matrix.SetMass(1.0);
-    SchurRedBlackDiagMooeeSolve<Field> SchurSolver(CG);
-    SchurSolver(_Matrix,src,sol);
-    _Matrix.SetMass(m);
-  };
-};
-
-// Builds a 5d field sol5 from a 4d solution sol4 and its 4d source src4,
-// using the supplied mass-1.0 (Pauli-Villars) inverter.
-template<class Field,class PVinverter> class Reconstruct5DfromPhysical {
- private:
-  PVinverter & PauliVillarsSolver;
- public:
-
- /////////////////////////////////////////////////////
- // First cut works, 10 Oct 2018.
- //
- // Must form a plan to get this into production for Zmobius acceleration
- // of the Mobius exact AMA corrections.
- //
- // TODO : understand absence of contact term in eqns in Hantao's thesis
- //        sol4 is contact term subtracted.
- //
- // Options
- // a) Defect correction approach:
- //    1) Compute defect from current soln (initially guess).
- //       This is ...... outerToInner check !!!!
- //    2) Deflated Zmobius solve to get 4d soln
- //       Ensure deflation is working
- //    3) Refine 5d Outer using the inner 4d delta soln
- // 
- // Step 1: localise PV inverse in a routine. [DONE]
- // Step 2: Schur based PV inverse            [DONE]
- // Step 3: Fourier accelerated PV inverse
- // Step 4: 
- /////////////////////////////////////////////////////
- 
-  // The inverter is held by reference.
-  Reconstruct5DfromPhysical(PVinverter &_PauliVillarsSolver) 
-    : PauliVillarsSolver(_PauliVillarsSolver) 
-  {
-  }
-
-  // sol = M(src) with the mass temporarily set to 1.0
-  template<class Matrix>
-  void PV(Matrix &_Matrix,const Field &src,Field &sol)
-  {
-    const RealD savedMass = _Matrix.Mass();
-    _Matrix.SetMass(1.0);
-    _Matrix.M(src,sol);
-    _Matrix.SetMass(savedMass);
-  }
-
-  // sol = Mdag(src) with the mass temporarily set to 1.0
-  template<class Matrix>
-  void PVdag(Matrix &_Matrix,const Field &src,Field &sol)
-  {
-    const RealD savedMass = _Matrix.Mass();
-    _Matrix.SetMass(1.0);
-    _Matrix.Mdag(src,sol);
-    _Matrix.SetMass(savedMass);
-  }
-
-  // Reconstruct sol5 from (sol4, src4); see the step comments below.
-  template<class Matrix>
-  void operator() (Matrix & _Matrix,const Field &sol4,const Field &src4, Field &sol5){
-
-    int Ls =  _Matrix.Ls;
-
-    Field slice4d(_Matrix.GaugeGrid());
-    Field y      (_Matrix.FermionGrid());
-    Field tmpA   (_Matrix.FermionGrid());
-    Field tmpB   (_Matrix.FermionGrid());
-    Field c      (_Matrix.FermionGrid());
-
-    typedef typename Matrix::Coeff_t Coeff_t;
-
-    std::cout << GridLogMessage<< " ************************************************" << std::endl;
-    std::cout << GridLogMessage<< " Reconstruct5Dprop: c.f. MADWF algorithm         " << std::endl;
-    std::cout << GridLogMessage<< " ************************************************" << std::endl;
-
-    ///////////////////////////////////////
-    // Import source, include Dminus factors
-    ///////////////////////////////////////
-    _Matrix.ImportPhysicalFermionSource(src4,tmpB); 
-
-    ///////////////////////////////////////
-    // Set up c from src4:  c = Pdag PV^-1 (imported source)
-    ///////////////////////////////////////
-    PauliVillarsSolver(_Matrix,tmpB,tmpA);
-    _Matrix.Pdag(tmpA,c);
-
-    //////////////////////////////////////
-    // Build Pdag PV^-1 Dm P [-sol4,c2,c3... cL]
-    // slice 0 takes -sol4, slices 1..Ls-1 are copied from c
-    //////////////////////////////////////
-    slice4d = - sol4;
-    InsertSlice(slice4d, y, 0   , 0);
-    for (int s=1;s<Ls;s++) {
-      ExtractSlice(slice4d,c,s,0);
-      InsertSlice(slice4d,y,s,0);
-    }
-
-    /////////////////////////////
-    // Pdag PV^-1 Dm P 
-    /////////////////////////////
-    _Matrix.P(y,tmpB);
-    _Matrix.M(tmpB,tmpA);
-    PauliVillarsSolver(_Matrix,tmpA,tmpB);
-    _Matrix.Pdag(tmpB,tmpA);
-
-    //////////////////////////////
-    // Reinsert surface prop
-    //////////////////////////////
-    InsertSlice(sol4,tmpA,0,0);
-
-    //////////////////////////////
-    // Convert from y back to x 
-    //////////////////////////////
-    _Matrix.P(tmpA,sol5);
-
-  }
-};
-
-}
-}
@@ -1,503 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/iterative/SchurRedBlack.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef GRID_SCHUR_RED_BLACK_H
-#define GRID_SCHUR_RED_BLACK_H
-
-
-  /*
-   * Red black Schur decomposition
-   *
-   *  M = (Mee Meo) =  (1             0 )   (Mee   0               )  (1 Mee^{-1} Meo)
-   *      (Moe Moo)    (Moe Mee^-1    1 )   (0   Moo-Moe Mee^-1 Meo)  (0   1         )
-   *                =         L                     D                     U
-   *
-   * L^-1 = (1              0 )
-   *        (-MoeMee^{-1}   1 )   
-   * L^{dag} = ( 1       Mee^{-dag} Moe^{dag} )
-   *           ( 0       1                    )
-   * L^{-d}  = ( 1      -Mee^{-dag} Moe^{dag} )
-   *           ( 0       1                    )
-   *
-   * U^-1 = (1   -Mee^{-1} Meo)
-   *        (0    1           )
-   * U^{dag} = ( 1                 0)
-   *           (Meo^dag Mee^{-dag} 1)
-   * U^{-dag} = (  1                 0)
-   *            (-Meo^dag Mee^{-dag} 1)
-   ***********************
-   *     M psi = eta
-   ***********************
-   *Odd
-   * i)                 D_oo psi_o =  L^{-1}  eta_o
-   *                        eta_o' = (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
-   *
-   * Wilson:
-   *      (D_oo)^{\dag} D_oo psi_o = (D_oo)^dag L^{-1}  eta_o
-   * Stag:
-   *      D_oo psi_o = L^{-1}  eta =    (eta_o - Moe Mee^{-1} eta_e)
-   *
-   * L^-1 eta = (1              0 ) (eta_e)  =  (eta_e                      )
-   *            (-MoeMee^{-1}   1 ) (eta_o)     (eta_o - Moe Mee^{-1} eta_e )
-   *
-   *Even
-   * ii)  Mee psi_e + Meo psi_o = src_e
-   *
-   *   => sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
-   *
-   * 
-   * TODO: Other options:
-   * 
-   * a) change checkerboards for Schur e<->o
-   *
-   * Left precon by Moo^-1
-   * b) Doo^{dag} M_oo^-dag Moo^-1 Doo psi_o =  (D_oo)^dag M_oo^-dag Moo^-1 L^{-1}  eta_o
-   *                              eta_o'     = (D_oo)^dag  M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
-   *
-   * Right precon by Moo^-1
-   * c) M_oo^-dag Doo^{dag} Doo Moo^-1 phi_o = M_oo^-dag (D_oo)^dag L^{-1}  eta_o
-   *                              eta_o'     = M_oo^-dag (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
-   *                              psi_o = M_oo^-1 phi_o
-   * TODO: Deflation 
-   */
-namespace Grid {
-
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Take a matrix and form a Red Black solver calling a Herm solver
-  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Now make the norm reflect extra factor of Mee
-  // Red-black Schur solver wrapper using SchurStaggeredOperator.
-  // Splits source and solution into Even/Odd checkerboards, solves the odd
-  // system with the wrapped _HermitianRBSolver, then reconstructs the even
-  // checkerboard and writes both halves back into the full-grid output.
-  template<class Field> class SchurRedBlackStaggeredSolve {
-  private:
-    OperatorFunction<Field> & _HermitianRBSolver; // solver applied to the odd-checkerboard system
-    int CBfactorise;                              // stored by the constructor; not read in this class (see FIXME below)
-    bool subGuess;                                // when true, subtract the initial guess from the odd solution after the solve
-  public:
-
-    /////////////////////////////////////////////////////
-    // Wrap the usual normal equations Schur trick
-    /////////////////////////////////////////////////////
-  SchurRedBlackStaggeredSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  :
-     _HermitianRBSolver(HermitianRBSolver) 
-    { 
-      CBfactorise=0;
-      subtractGuess(initSubGuess);
-    };
-    // Enable/disable subtraction of the initial guess after the solve.
-    void subtractGuess(const bool initSubGuess)
-    {
-      subGuess = initSubGuess;
-    }
-    // Current value of the guess-subtraction flag.
-    bool isSubtractGuess(void)
-    {
-      return subGuess;
-    }
-
-    // Solve using a ZeroGuesser as the initial-guess provider.
-    template<class Matrix>
-    void operator() (Matrix & _Matrix,const Field &in, Field &out){
-      ZeroGuesser<Field> guess;
-      (*this)(_Matrix,in,out,guess);
-    }
-    // Solve with a caller-supplied guesser.
-    //   _Matrix : provides RedBlackGrid(), Grid(), M, Mooee, MooeeInv, Meooe
-    //   in      : full-grid source
-    //   out     : full-grid solution (its checkerboards are read first, then overwritten)
-    //   guess   : callable as guess(src_o, sol_o) to fill the odd initial guess
-    template<class Matrix, class Guesser>
-    void operator() (Matrix & _Matrix,const Field &in, Field &out, Guesser &guess){
-
-      // FIXME CGdiagonalMee not implemented virtual function
-      // FIXME use CBfactorise to control schur decomp
-      GridBase *grid = _Matrix.RedBlackGrid();
-      GridBase *fgrid= _Matrix.Grid();
-
-      SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
- 
-      // Half-grid work fields; resid lives on the full grid.
-      Field src_e(grid);
-      Field src_o(grid);
-      Field sol_e(grid);
-      Field sol_o(grid);
-      Field   tmp(grid);
-      Field  Mtmp(grid);
-      Field resid(fgrid);
-      
-      std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve " <<std::endl;
-      pickCheckerboard(Even,src_e,in);
-      pickCheckerboard(Odd ,src_o,in);
-      pickCheckerboard(Even,sol_e,out);
-      pickCheckerboard(Odd ,sol_o,out);
-      std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve checkerboards picked" <<std::endl;
-    
-      /////////////////////////////////////////////////////
-      // src_o = (source_o - Moe MeeInv source_e)
-      /////////////////////////////////////////////////////
-      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
-      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);     
-      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);     
-
-      //src_o = tmp;     assert(src_o.checkerboard ==Odd);
-      _Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
-
-      //////////////////////////////////////////////////////////////
-      // Call the red-black solver
-      //////////////////////////////////////////////////////////////
-      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl;
-      guess(src_o, sol_o);
-      // Keep a copy of the guess so it can be subtracted after the solve.
-      Mtmp = sol_o;
-      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
-      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver called  the Mpc solver" <<std::endl;
-      // Fionn A2A boolean behavioural control
-      if (subGuess)        sol_o = sol_o-Mtmp;
-
-      ///////////////////////////////////////////////////
-      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
-      ///////////////////////////////////////////////////
-      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
-      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
-      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
-     
-      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver reconstructed other CB" <<std::endl;
-      setCheckerboard(out,sol_e); assert(  sol_e.checkerboard ==Even);
-      setCheckerboard(out,sol_o); assert(  sol_o.checkerboard ==Odd );
-      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver inserted solution" <<std::endl;
-
-      // Verify the unprec residual
-      // (skipped when the guess was subtracted: out is then not the full solution)
-      if ( ! subGuess ) {
-        _Matrix.M(out,resid); 
-        resid = resid-in;
-        RealD ns = norm2(in);
-        RealD nr = norm2(resid);
-        std::cout<<GridLogMessage << "SchurRedBlackStaggered solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
-      } else {
-        std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
-      }
-    }     
-  };
-  // Shorter alias for the same class template.
-  template<class Field> using SchurRedBlackStagSolve = SchurRedBlackStaggeredSolve<Field>;
-
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Take a matrix and form a Red Black solver calling a Herm solver
-  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Red-black Schur solver wrapper using SchurDiagMooeeOperator.
-  // The odd-checkerboard source is multiplied by the operator's MpcDag before
-  // the wrapped solver is called; the even checkerboard is reconstructed
-  // afterwards and both halves are written into the full-grid output.
-  template<class Field> class SchurRedBlackDiagMooeeSolve {
-  private:
-    OperatorFunction<Field> & _HermitianRBSolver; // solver applied to the odd-checkerboard system
-    int CBfactorise;                              // stored from the constructor's cb argument; not read in this class
-    bool subGuess;                                // when true, subtract the initial guess from the odd solution after the solve
-  public:
-
-    /////////////////////////////////////////////////////
-    // Wrap the usual normal equations Schur trick
-    /////////////////////////////////////////////////////
-  SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver,int cb=0, const bool initSubGuess = false)  :  _HermitianRBSolver(HermitianRBSolver) 
-  { 
-    CBfactorise=cb;
-    subtractGuess(initSubGuess);
-  };
-    // Enable/disable subtraction of the initial guess after the solve.
-    void subtractGuess(const bool initSubGuess)
-    {
-      subGuess = initSubGuess;
-    }
-    // Current value of the guess-subtraction flag.
-    bool isSubtractGuess(void)
-    {
-      return subGuess;
-    }
-    // Solve using a ZeroGuesser as the initial-guess provider.
-    template<class Matrix>
-    void operator() (Matrix & _Matrix,const Field &in, Field &out){
-      ZeroGuesser<Field> guess;
-      (*this)(_Matrix,in,out,guess);
-    }
-    // Solve with a caller-supplied guesser.
-    //   _Matrix : provides RedBlackGrid(), Grid(), M, MooeeInv, Meooe
-    //   in      : full-grid source
-    //   out     : full-grid solution (its checkerboards are read first, then overwritten)
-    //   guess   : callable as guess(src_o, sol_o) to fill the odd initial guess
-    template<class Matrix, class Guesser>
-    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
-
-      // FIXME CGdiagonalMee not implemented virtual function
-      // FIXME use CBfactorise to control schur decomp
-      GridBase *grid = _Matrix.RedBlackGrid();
-      GridBase *fgrid= _Matrix.Grid();
-
-      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
- 
-      // Half-grid work fields; resid lives on the full grid.
-      Field src_e(grid);
-      Field src_o(grid);
-      Field sol_e(grid);
-      Field sol_o(grid);
-      Field   tmp(grid);
-      Field  Mtmp(grid);
-      Field resid(fgrid);
-
-      pickCheckerboard(Even,src_e,in);
-      pickCheckerboard(Odd ,src_o,in);
-      pickCheckerboard(Even,sol_e,out);
-      pickCheckerboard(Odd ,sol_o,out);
-    
-      /////////////////////////////////////////////////////
-      // src_o = Mdag * (source_o - Moe MeeInv source_e)
-      /////////////////////////////////////////////////////
-      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
-      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);     
-      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);     
-
-      // get the right MpcDag
-      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);       
-
-      //////////////////////////////////////////////////////////////
-      // Call the red-black solver
-      //////////////////////////////////////////////////////////////
-      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
-      guess(src_o,sol_o);
-      // Keep a copy of the guess so it can be subtracted after the solve.
-      Mtmp = sol_o;
-      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
-      // Fionn A2A boolean behavioural control
-      if (subGuess)        sol_o = sol_o-Mtmp;
-
-      ///////////////////////////////////////////////////
-      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
-      ///////////////////////////////////////////////////
-      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
-      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
-      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
-     
-      setCheckerboard(out,sol_e); assert(  sol_e.checkerboard ==Even);
-      setCheckerboard(out,sol_o); assert(  sol_o.checkerboard ==Odd );
-
-      // Verify the unprec residual
-      // (skipped when the guess was subtracted: out is then not the full solution)
-      if ( ! subGuess ) {
-        _Matrix.M(out,resid); 
-        resid = resid-in;
-        RealD ns = norm2(in);
-        RealD nr = norm2(resid);
-
-        std::cout<<GridLogMessage << "SchurRedBlackDiagMooee solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
-      } else {
-        std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
-      }
-    }     
-  };
-
-
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Take a matrix and form a Red Black solver calling a Herm solver
-  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Red-black Schur solver wrapper using SchurDiagTwoOperator.
-  // Differs from the DiagMooee variant in that the wrapped solver works on an
-  // intermediate field (tmp) and the odd solution is obtained from it by an
-  // extra MooeeInv application.
-  template<class Field> class SchurRedBlackDiagTwoSolve {
-  private:
-    OperatorFunction<Field> & _HermitianRBSolver; // solver applied to the odd-checkerboard system
-    int CBfactorise;                              // stored by the constructor; not read in this class (see FIXME below)
-    bool subGuess;                                // when true, subtract the initial guess from the solver result
-  public:
-
-    /////////////////////////////////////////////////////
-    // Wrap the usual normal equations Schur trick
-    /////////////////////////////////////////////////////
-  SchurRedBlackDiagTwoSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  :
-     _HermitianRBSolver(HermitianRBSolver) 
-    { 
-      CBfactorise = 0;
-      subtractGuess(initSubGuess);
-    };
-    // Enable/disable subtraction of the initial guess after the solve.
-    void subtractGuess(const bool initSubGuess)
-    {
-      subGuess = initSubGuess;
-    }
-    // Current value of the guess-subtraction flag.
-    bool isSubtractGuess(void)
-    {
-      return subGuess;
-    }
-
-    // Solve using a ZeroGuesser as the initial-guess provider.
-    template<class Matrix>
-    void operator() (Matrix & _Matrix,const Field &in, Field &out){
-      ZeroGuesser<Field> guess;
-      (*this)(_Matrix,in,out,guess);
-    }
-    // Solve with a caller-supplied guesser.
-    //   _Matrix : provides RedBlackGrid(), Grid(), M, MooeeInv, Meooe
-    //   in      : full-grid source
-    //   out     : full-grid solution (its checkerboards are read first, then overwritten)
-    //   guess   : callable as guess(src_o, tmp) to fill the initial guess for the solver
-    template<class Matrix,class Guesser>
-    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
-
-      // FIXME CGdiagonalMee not implemented virtual function
-      // FIXME use CBfactorise to control schur decomp
-      GridBase *grid = _Matrix.RedBlackGrid();
-      GridBase *fgrid= _Matrix.Grid();
-
-      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
- 
-      // Half-grid work fields; resid lives on the full grid.
-      Field src_e(grid);
-      Field src_o(grid);
-      Field sol_e(grid);
-      Field sol_o(grid);
-      Field   tmp(grid);
-      Field  Mtmp(grid);
-      Field resid(fgrid);
-
-      pickCheckerboard(Even,src_e,in);
-      pickCheckerboard(Odd ,src_o,in);
-      pickCheckerboard(Even,sol_e,out);
-      pickCheckerboard(Odd ,sol_o,out);
-    
-      /////////////////////////////////////////////////////
-      // src_o = Mdag * (source_o - Moe MeeInv source_e)
-      /////////////////////////////////////////////////////
-      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
-      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);     
-      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);     
-
-      // get the right MpcDag
-      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);       
-
-      //////////////////////////////////////////////////////////////
-      // Call the red-black solver
-      //////////////////////////////////////////////////////////////
-      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
-//      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
-      guess(src_o,tmp);
-      // Keep a copy of the guess so it can be subtracted after the solve.
-      Mtmp = tmp;
-      _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd);
-      // Fionn A2A boolean behavioural control
-      if (subGuess)      tmp = tmp-Mtmp;
-      // The odd solution is MooeeInv applied to the solver result.
-      _Matrix.MooeeInv(tmp,sol_o);       assert(  sol_o.checkerboard   ==Odd);
-
-      ///////////////////////////////////////////////////
-      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
-      ///////////////////////////////////////////////////
-      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
-      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
-      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
-     
-      setCheckerboard(out,sol_e); assert(  sol_e.checkerboard ==Even);
-      setCheckerboard(out,sol_o); assert(  sol_o.checkerboard ==Odd );
-
-      // Verify the unprec residual
-      // (skipped when the guess was subtracted: out is then not the full solution)
-      if ( ! subGuess ) {
-        _Matrix.M(out,resid); 
-        resid = resid-in;
-        RealD ns = norm2(in);
-        RealD nr = norm2(resid);
-
-        std::cout<<GridLogMessage << "SchurRedBlackDiagTwo solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
-      } else {
-        std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
-      }
-    }     
-  };
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Take a matrix and form a Red Black solver calling a Herm solver
-  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Red-black Schur solver wrapper using SchurDiagTwoOperator, holding a
-  // LinearFunction<Field> as the wrapped solver instead of an
-  // OperatorFunction<Field>. The solve sequence otherwise follows the same
-  // steps: MpcDag on the odd source, wrapped solve into tmp, MooeeInv to get
-  // the odd solution, then reconstruction of the even checkerboard.
-  // NOTE(review): the residual log line below is labelled
-  // "SchurRedBlackDiagTwo", not "...DiagTwoMixed" -- confirm whether intended.
-  template<class Field> class SchurRedBlackDiagTwoMixed {
-  private:
-    LinearFunction<Field> & _HermitianRBSolver; // wrapped solver, invoked as (_HermOpEO, src_o, tmp)
-    int CBfactorise;                            // stored by the constructor; not read in this class (see FIXME below)
-    bool subGuess;                              // when true, subtract the initial guess from the solver result
-  public:
-
-    /////////////////////////////////////////////////////
-    // Wrap the usual normal equations Schur trick
-    /////////////////////////////////////////////////////
-  SchurRedBlackDiagTwoMixed(LinearFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  :
-     _HermitianRBSolver(HermitianRBSolver) 
-    { 
-      CBfactorise=0;
-      subtractGuess(initSubGuess);
-    };
-    // Enable/disable subtraction of the initial guess after the solve.
-    void subtractGuess(const bool initSubGuess)
-    {
-      subGuess = initSubGuess;
-    }
-    // Current value of the guess-subtraction flag.
-    bool isSubtractGuess(void)
-    {
-      return subGuess;
-    }
-
-    // Solve using a ZeroGuesser as the initial-guess provider.
-    template<class Matrix>
-    void operator() (Matrix & _Matrix,const Field &in, Field &out){
-      ZeroGuesser<Field> guess;
-      (*this)(_Matrix,in,out,guess);
-    }
-    // Solve with a caller-supplied guesser.
-    //   _Matrix : provides RedBlackGrid(), Grid(), M, MooeeInv, Meooe
-    //   in      : full-grid source
-    //   out     : full-grid solution (its checkerboards are read first, then overwritten)
-    //   guess   : callable as guess(src_o, tmp) to fill the initial guess for the solver
-    template<class Matrix, class Guesser>
-    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
-
-      // FIXME CGdiagonalMee not implemented virtual function
-      // FIXME use CBfactorise to control schur decomp
-      GridBase *grid = _Matrix.RedBlackGrid();
-      GridBase *fgrid= _Matrix.Grid();
-
-      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
- 
-      // Half-grid work fields; resid lives on the full grid.
-      Field src_e(grid);
-      Field src_o(grid);
-      Field sol_e(grid);
-      Field sol_o(grid);
-      Field   tmp(grid);
-      Field  Mtmp(grid);
-      Field resid(fgrid);
-
-      pickCheckerboard(Even,src_e,in);
-      pickCheckerboard(Odd ,src_o,in);
-      pickCheckerboard(Even,sol_e,out);
-      pickCheckerboard(Odd ,sol_o,out);
-    
-      /////////////////////////////////////////////////////
-      // src_o = Mdag * (source_o - Moe MeeInv source_e)
-      /////////////////////////////////////////////////////
-      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
-      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);     
-      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);     
-
-      // get the right MpcDag
-      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);       
-
-      //////////////////////////////////////////////////////////////
-      // Call the red-black solver
-      //////////////////////////////////////////////////////////////
-      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
-//      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
-//      _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd);
-      guess(src_o,tmp);
-      // Keep a copy of the guess so it can be subtracted after the solve.
-      Mtmp = tmp;
-      _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd);
-      // Fionn A2A boolean behavioural control
-      if (subGuess)      tmp = tmp-Mtmp;
-      // The odd solution is MooeeInv applied to the solver result.
-      _Matrix.MooeeInv(tmp,sol_o);        assert(  sol_o.checkerboard   ==Odd);
-
-      ///////////////////////////////////////////////////
-      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
-      ///////////////////////////////////////////////////
-      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
-      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
-      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
-     
-      setCheckerboard(out,sol_e); assert(  sol_e.checkerboard ==Even);
-      setCheckerboard(out,sol_o); assert(  sol_o.checkerboard ==Odd );
-
-      // Verify the unprec residual
-      // (skipped when the guess was subtracted: out is then not the full solution)
-      if ( ! subGuess ) {
-        _Matrix.M(out,resid); 
-        resid = resid-in;
-        RealD ns = norm2(in);
-        RealD nr = norm2(resid);
-
-        std::cout << GridLogMessage << "SchurRedBlackDiagTwo solver true unprec resid " << std::sqrt(nr / ns) << " nr " << nr << " ns " << ns << std::endl;
-      } else {
-        std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
-      }
-    }     
-  };
-
-}
-#endif
@@ -1,125 +0,0 @@
-#include <Grid/GridCore.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <vector>
-
-namespace Grid {
-
-// Out-of-class definitions of the static data members of MemoryProfiler
-// and PointerCache.
-MemoryStats *MemoryProfiler::stats = nullptr;
-bool         MemoryProfiler::debug = false;
-
-// Index of the entry PointerCache::Insert recycles next when every entry is valid.
-int PointerCache::victim;
-
-// Fixed-size table (Ncache entries) of cached allocations.
-PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
-
-// Offer an allocation (ptr, bytes) to the cache.
-// Returns a pointer the caller is left holding: ptr itself when the allocation
-// is below the 4096-byte caching threshold, the previously cached address when
-// a valid entry had to be displaced, or NULL when ptr went into a free entry.
-void *PointerCache::Insert(void *ptr,size_t bytes) {
-
-  // Small allocations bypass the cache entirely.
-  if (bytes < 4096 ) return ptr;
-
-#ifdef GRID_OMP
-  assert(omp_in_parallel()==0);
-#endif 
-
-  // Prefer the first entry that is not in use.
-  int slot = 0;
-  while ( (slot < Ncache) && Entries[slot].valid ) {
-    slot++;
-  }
-
-  // Table full: take the round-robin victim and advance it for next time.
-  if ( slot == Ncache ) {
-    slot   = victim;
-    victim = (victim+1)%Ncache;
-  }
-
-  // A displaced occupant is handed back to the caller.
-  void *evicted = NULL;
-  if ( Entries[slot].valid ) {
-    evicted = Entries[slot].address;
-  }
-
-  Entries[slot].address = ptr;
-  Entries[slot].bytes   = bytes;
-  Entries[slot].valid   = 1;
-
-  return evicted;
-}
-
-// Fetch a cached allocation of exactly `bytes` bytes.
-// Returns the cached address and marks its entry unused, or NULL when the
-// request is below the 4096-byte caching threshold or no entry matches.
-void *PointerCache::Lookup(size_t bytes) {
-
- if (bytes < 4096 ) return NULL;
-
-#ifdef _OPENMP
-  assert(omp_in_parallel()==0);
-#endif 
-
-  for(int slot=0;slot<Ncache;slot++){
-    // Skip entries that are unused or hold a different size.
-    if ( !Entries[slot].valid )          continue;
-    if ( Entries[slot].bytes != bytes )  continue;
-
-    // First exact-size match wins; release the entry to the caller.
-    Entries[slot].valid = 0;
-    return Entries[slot].address;
-  }
-  return NULL;
-}
-
-
-// Diagnostic, active on Linux only (compiles to an empty body elsewhere).
-// Reads the /proc/self/pagemap entries covering the BYTES bytes starting at
-// Buf and prints, tagged with the world rank, how many 4k pages were examined
-// and how many of them are not contiguous with the first page of their
-// 512-page group.
-//   Buf   : start address of the buffer to inspect
-//   BYTES : length of the buffer in bytes
-// NOTE(review): kernels that hide page frame numbers from unprivileged
-// processes report them as zero, which this check would count as "not huge"
-// -- confirm against the pagemap documentation for the target kernel.
-void check_huge_pages(void *Buf,uint64_t BYTES)
-{
-#ifdef __linux__
-  int fd = open("/proc/self/pagemap", O_RDONLY);
-  assert(fd >= 0);
-  const int page_size = 4096;
-  uint64_t virt_pfn = (uint64_t)Buf / page_size;
-  off_t offset = sizeof(uint64_t) * virt_pfn;
-  uint64_t npages = (BYTES + page_size-1) / page_size;
-  // One 64-bit entry per page: the count scales with BYTES, so keep the table
-  // on the heap rather than in a variable-length stack array.
-  std::vector<uint64_t> pagedata(npages);
-  // Keep the signed syscall results in signed types so failures (-1) compare correctly.
-  off_t pos = lseek(fd, offset, SEEK_SET);
-  assert(pos == offset);
-  ssize_t got = ::read(fd, pagedata.data(), sizeof(uint64_t)*npages);
-  assert(got == (ssize_t)(sizeof(uint64_t) * npages));
-  (void)pos; (void)got; // only inspected by assert; silence NDEBUG builds
-  // The descriptor is only needed for the single read above; release it.
-  ::close(fd);
-  int nhugepages = npages / 512;
-  int n4ktotal, nnothuge;
-  n4ktotal = 0;
-  nnothuge = 0;
-  for (int i = 0; i < nhugepages; ++i) {
-    // Low 55 bits of each entry are masked out and scaled by the page size.
-    uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
-    for (int j = 0; j < 512; ++j) {
-      uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
-      ++n4ktotal;
-      // Page j of the group must sit exactly j pages after the group's first page.
-      if (pageaddr != baseaddr + j * page_size) {
-        ++nnothuge;
-      }
-    }
-  }
-  int rank = CartesianCommunicator::RankWorld();
-  printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
-#endif
-}
-
-// Format a byte count as a short human-readable string.
-// The value is divided by 1024 until it drops below 1024 or the largest
-// suffix is reached; suffixes are "", K, M, G, T, P, E followed by "B".
-// Whole values print without decimals ("4 KB"), others with one ("1.5 KB").
-//   bytes   : size to format
-//   returns : the formatted string
-std::string sizeString(const size_t bytes)
-{
-  constexpr unsigned int bufSize = 256;
-  constexpr size_t       nSuffix = 7;
-  const char             *suffixes[nSuffix] = {"", "K", "M", "G", "T", "P", "E"};
-  char                   buf[bufSize];
-  size_t                 s     = 0;
-  double                 count = bytes;
-  
-  // Stop on the last suffix so the index can never run past the table.
-  while (count >= 1024 && s + 1 < nSuffix)
-  {
-      s++;
-      count /= 1024;
-  }
-  if (count - floor(count) == 0.0)
-  {
-      snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
-  }
-  else
-  {
-      snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
-  }
-  
-  return std::string(buf);
-}
-
-}
@@ -1,174 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/cartesian/Cartesian_full.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef GRID_CARTESIAN_FULL_H
-#define GRID_CARTESIAN_FULL_H
-
-namespace Grid{
-    
-/////////////////////////////////////////////////////////////////////////////////////////
-// Grid Support.
-/////////////////////////////////////////////////////////////////////////////////////////
-
-
-// Cartesian grid without checkerboarding (Init sets _isCheckerBoarded = false).
-// All checkerboard queries return 0 and the checkerboard-shift helpers return
-// the shift unchanged. The constructors forward the processor grid (and an
-// optional parent grid) to GridBase and then call Init to fill in the
-// dimension, stride and slice tables.
-class GridCartesian: public GridBase {
-
-public:
-    // Receives the split-rank output of GridBase in the parent-grid constructor
-    // that does not expose it to the caller.
-    int dummy;
-    virtual int  CheckerBoardFromOindexTable (int Oindex) {
-      return 0;
-    }
-    virtual int  CheckerBoardFromOindex (int Oindex)
-    {
-      return 0;
-    }
-    virtual int CheckerBoarded(int dim){
-      return 0;
-    }
-    virtual int CheckerBoard(const std::vector<int> &site){
-        return 0;
-    }
-    virtual int CheckerBoardDestination(int cb,int shift,int dim){
-        return 0;
-    }
-    virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift, int ocb){
-      return shift;
-    }
-    virtual int CheckerBoardShift(int source_cb,int dim,int shift, int osite){
-      return shift;
-    }
-    /////////////////////////////////////////////////////////////////////////
-    // Constructor takes a parent grid and possibly subdivides communicator.
-    /////////////////////////////////////////////////////////////////////////
-    GridCartesian(const std::vector<int> &dimensions,
-		  const std::vector<int> &simd_layout,
-		  const std::vector<int> &processor_grid,
-		  const GridCartesian &parent) : GridBase(processor_grid,parent,dummy)
-    {
-      Init(dimensions,simd_layout,processor_grid);
-    }
-    // As above, but the split rank is passed through to the caller's variable.
-    GridCartesian(const std::vector<int> &dimensions,
-		  const std::vector<int> &simd_layout,
-		  const std::vector<int> &processor_grid,
-		  const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank)
-    {
-      Init(dimensions,simd_layout,processor_grid);
-    }
-    /////////////////////////////////////////////////////////////////////////
-    // Construct from comm world
-    /////////////////////////////////////////////////////////////////////////
-    GridCartesian(const std::vector<int> &dimensions,
-		  const std::vector<int> &simd_layout,
-		  const std::vector<int> &processor_grid) : GridBase(processor_grid)
-    {
-      Init(dimensions,simd_layout,processor_grid);
-    }
-
-    virtual ~GridCartesian() = default;
-
-    // Fill in the per-dimension tables from the requested global dimensions and
-    // SIMD layout. Asserts that each global dimension divides evenly by the
-    // processor count and each local dimension by the SIMD layout.
-    // processor_grid is accepted but not read here; _processors and
-    // _processor_coor are used instead.
-    void Init(const std::vector<int> &dimensions,
-	      const std::vector<int> &simd_layout,
-	      const std::vector<int> &processor_grid)
-    {
-      ///////////////////////
-      // Grid information
-      ///////////////////////
-      _isCheckerBoarded = false;
-      _ndimension = dimensions.size();
-
-      _fdimensions.resize(_ndimension);
-      _gdimensions.resize(_ndimension);
-      _ldimensions.resize(_ndimension);
-      _rdimensions.resize(_ndimension);
-      _simd_layout.resize(_ndimension);
-      _lstart.resize(_ndimension);
-      _lend.resize(_ndimension);
-
-      _ostride.resize(_ndimension);
-      _istride.resize(_ndimension);
-
-      // Site counts are accumulated as running products in the loop below.
-      _fsites = _gsites = _osites = _isites = 1;
-
-      for (int d = 0; d < _ndimension; d++)
-      {
-        _fdimensions[d] = dimensions[d];   // Global dimensions
-        _gdimensions[d] = _fdimensions[d]; // Global dimensions
-        _simd_layout[d] = simd_layout[d];
-        _fsites = _fsites * _fdimensions[d];
-        _gsites = _gsites * _gdimensions[d];
-
-        // Use a reduced simd grid
-        _ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
-        //std::cout << _ldimensions[d] << "  " << _gdimensions[d] << "  " << _processors[d] << std::endl;
-        assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
-
-        _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
-        assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
-
-        // First and last global coordinate held by this processor in dimension d.
-        _lstart[d] = _processor_coor[d] * _ldimensions[d];
-        _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
-        _osites *= _rdimensions[d];
-        _isites *= _simd_layout[d];
-
-        // Addressing support
-        // Dimension 0 has unit stride; each later stride is the previous
-        // stride times the previous dimension's extent.
-        if (d == 0)
-        {
-          _ostride[d] = 1;
-          _istride[d] = 1;
-        }
-        else
-        {
-          _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
-          _istride[d] = _istride[d - 1] * _simd_layout[d - 1];
-        }
-      }
-
-      ///////////////////////
-      // subplane information
-      ///////////////////////
-      _slice_block.resize(_ndimension);
-      _slice_stride.resize(_ndimension);
-      _slice_nblock.resize(_ndimension);
-
-      // block grows as the product of the reduced dimensions below d, while
-      // nblock shrinks from the full product to the product of those above d.
-      int block = 1;
-      int nblock = 1;
-      for (int d = 0; d < _ndimension; d++)
-        nblock *= _rdimensions[d];
-
-      for (int d = 0; d < _ndimension; d++)
-      {
-        nblock /= _rdimensions[d];
-        _slice_block[d] = block;
-        _slice_stride[d] = _ostride[d] * _rdimensions[d];
-        _slice_nblock[d] = nblock;
-        block = block * _rdimensions[d];
-      }
-    };
-
-};
-}
-#endif
@@ -1,514 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/communicator/Communicator_mpi.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid/GridCore.h>
-#include <Grid/communicator/SharedMemory.h>
-
-namespace Grid {
-
-Grid_MPI_Comm       CartesianCommunicator::communicator_world;
-
-////////////////////////////////////////////
-// One-off start-up of the comms system:
-// bring MPI up if nobody else has, duplicate MPI_COMM_WORLD into
-// communicator_world, then initialise and allocate the global shared memory.
-////////////////////////////////////////////
-void CartesianCommunicator::Init(int *argc, char ***argv) 
-{
-  int already_up = 0;
-  MPI_Initialized(&already_up); // needed to coexist with other libs that start MPI first
-
-  if ( already_up == 0 ) {
-    int granted;
-    MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&granted);
-
-    // One comms thread: anything except THREAD_SINGLE will do.
-    // Several comms threads: only THREAD_MULTIPLE is acceptable.
-    const bool single_thread_ok = (nCommThreads != 1) || (granted != MPI_THREAD_SINGLE);
-    const bool multi_thread_ok  = (nCommThreads <= 1) || (granted == MPI_THREAD_MULTIPLE);
-    assert(single_thread_ok && multi_thread_ok);
-  }
-
-  Grid_quiesce_nodes();
-
-  // Duplicated exactly once and deliberately never freed.
-  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
-
-  GlobalSharedMemory::Init(communicator_world);
-
-  const uint64_t shm_bytes = GlobalSharedMemory::MAX_MPI_SHM_BYTES;
-  const int      shm_flags = GlobalSharedMemory::Hugepages;
-  GlobalSharedMemory::SharedMemoryAllocate(shm_bytes,shm_flags);
-}
-
-///////////////////////////////////////////////////////////////////////////
-// Use cartesian communicators now even in MPI3
-///////////////////////////////////////////////////////////////////////////
-// Look up the neighbour ranks for a displacement of `shift` along `dim`:
-// `source` is the rank we receive from, `dest` the rank we send to.
-void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
-{
-  const int status = MPI_Cart_shift(communicator,dim,shift,&source,&dest);
-  assert(status==MPI_SUCCESS);
-}
-// Translate a processor-grid coordinate into its rank in `communicator`.
-int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
-{
-  int found = 0;
-  const int status = MPI_Cart_rank(communicator, &coor[0], &found);
-  assert(status==MPI_SUCCESS);
-  return found;
-}
-// Translate a rank in `communicator` into its processor-grid coordinate;
-// `coor` is resized to _ndimension before being filled.
-void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
-{
-  coor.resize(_ndimension);
-  const int status = MPI_Cart_coords(communicator, rank, _ndimension, &coor[0]);
-  assert(status==MPI_SUCCESS);
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Initialises from communicator_world
-////////////////////////////////////////////////////////////////////////////////////////////////////////
-CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) 
-{
-  // Ask the shared-memory layer for a remapped communicator. The remap
-  // hands back a communicator that we own and must free before returning.
-  MPI_Comm remapped;
-  GlobalSharedMemory::OptimalCommunicator(processors,remapped);
-
-  // Build the Cartesian state on top of it, then bind the shared-memory
-  // buffers to the same communicator.
-  InitFromMPICommunicator(processors,remapped);
-  SetCommunicator(remapped);
-
-  // The temporary is no longer needed.
-  MPI_Comm_free(&remapped);
-}
-
-//////////////////////////////////
-// Try to subdivide communicator: build a child Cartesian communicator of shape `processors` out of `parent`; `srank` is set to the index of the split this rank belongs to
-//////////////////////////////////
-CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)    
-{
-  _ndimension = processors.size();
-
-  int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension); // child may have more dimensions than the parent, never fewer
-  std::vector<int> parent_processor_coor(_ndimension,0); // parent coordinate, padded with 0 in the extra leading dimensions
-  std::vector<int> parent_processors    (_ndimension,1); // parent grid shape, padded with 1 in the extra leading dimensions
-
-  // Can make 5d grid from 4d etc...
-  int pad = _ndimension-parent_ndimension;
-  for(int d=0;d<parent_ndimension;d++){
-    parent_processor_coor[pad+d]=parent._processor_coor[d];
-    parent_processors    [pad+d]=parent._processors[d];
-  }
-
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  // split the communicator
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  //  int Nparent = parent._processors ; 
-  //  std::cout << " splitting from communicator "<<parent.communicator <<std::endl;
-  int Nparent;
-  MPI_Comm_size(parent.communicator,&Nparent);
-  //  std::cout << " Parent size  "<<Nparent <<std::endl;
-
-  int childsize=1; // number of ranks in one child communicator
-  for(int d=0;d<processors.size();d++) {
-    childsize *= processors[d];
-  }
-  int Nchild = Nparent/childsize; // number of child communicators the parent is divided into
-  assert (childsize * Nchild == Nparent); // parent size must be an exact multiple of the child size
-
-  //  std::cout << " child size  "<<childsize <<std::endl;
-
-  std::vector<int> ccoor(_ndimension); // coor within subcommunicator
-  std::vector<int> scoor(_ndimension); // coor of split within parent
-  std::vector<int> ssize(_ndimension); // number of splits along each dimension of the parent
-
-  for(int d=0;d<_ndimension;d++){
-    ccoor[d] = parent_processor_coor[d] % processors[d];
-    scoor[d] = parent_processor_coor[d] / processors[d];
-    ssize[d] = parent_processors[d]     / processors[d];
-  }
-
-  // rank within subcomm ; srank is rank of subcomm within blocks of subcomms
-  int crank;  
-  // Mpi uses the reverse Lexico convention to us; so reversed routines called
-  Lexicographic::IndexFromCoorReversed(ccoor,crank,processors); // processors is the split grid dimensions
-  Lexicographic::IndexFromCoorReversed(scoor,srank,ssize);      // ssize is the number of split grids
-
-  MPI_Comm comm_split; // temporary: assigned by MPI_Comm_split / MPI_Comm_dup below and freed before returning
-  if ( Nchild > 1 ) { 
-
-    if(0){ // debug printout, compiled in but disabled
-      std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
-      std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"]    ";
-      for(int d=0;d<parent._ndimension;d++)  std::cout << parent._processors[d] << " ";
-      std::cout<<std::endl;
-      
-      std::cout << GridLogMessage<<" child grid["<< _ndimension <<"]    ";
-      for(int d=0;d<processors.size();d++)  std::cout << processors[d] << " ";
-      std::cout<<std::endl;
-      
-      std::cout << GridLogMessage<<" old rank "<< parent._processor<<" coor ["<< parent._ndimension <<"]    ";
-      for(int d=0;d<parent._ndimension;d++)  std::cout << parent._processor_coor[d] << " ";
-      std::cout<<std::endl;
-      
-      std::cout << GridLogMessage<<" new split "<< srank<<" scoor ["<< _ndimension <<"]    ";
-      for(int d=0;d<processors.size();d++)  std::cout << scoor[d] << " ";
-      std::cout<<std::endl;
-      
-      std::cout << GridLogMessage<<" new rank "<< crank<<" coor ["<< _ndimension <<"]    ";
-      for(int d=0;d<processors.size();d++)  std::cout << ccoor[d] << " ";
-      std::cout<<std::endl;
-
-      //////////////////////////////////////////////////////////////////////////////////////////////////////
-      // Declare victory
-      //////////////////////////////////////////////////////////////////////////////////////////////////////
-      std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into "
-		<< Nchild <<" communicators with " << childsize << " ranks"<<std::endl;
-      std::cout << " Split communicator " <<comm_split <<std::endl; // NOTE(review): comm_split has not been assigned yet at this point; fix before enabling this block
-    }
-
-    ////////////////////////////////////////////////////////////////
-    // Split the communicator
-    ////////////////////////////////////////////////////////////////
-    int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split); // colour = srank selects the child, key = crank orders the ranks inside it
-    assert(ierr==0);
-
-  } else {
-    srank = 0; // a single child: it covers every rank of the parent, so just duplicate
-    int ierr = MPI_Comm_dup (parent.communicator,&comm_split);
-    assert(ierr==0);
-  }
-
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Set up from the new split communicator
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  InitFromMPICommunicator(processors,comm_split);
-
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Take the right SHM buffers
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  SetCommunicator(comm_split);
-  
-  ///////////////////////////////////////////////
-  // Free the temp communicator 
-  ///////////////////////////////////////////////
-  MPI_Comm_free(&comm_split);
-
-  if(0){ // debug printout, compiled in but disabled
-    std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
-    for(int d=0;d<processors.size();d++){
-      std::cout << d<< " " << _processor_coor[d] <<" " <<  ccoor[d]<<std::endl;
-    }
-  }
-  for(int d=0;d<processors.size();d++){
-    assert(_processor_coor[d] == ccoor[d] ); // the coordinate MPI assigned must agree with the one computed above
-  }
-}
-
-// Build the Cartesian state of this object on top of `communicator_base`.
-//
-//  processors        : requested processor-grid shape, one entry per dimension
-//  communicator_base : MPI communicator the Cartesian communicator is created from;
-//                      it is not freed here and stays owned by the caller
-//
-// On return `communicator` is a periodic Cartesian communicator, `_processor` /
-// `_processor_coor` hold this rank and its grid coordinate, and
-// `communicator_halo` holds 2*_ndimension duplicates of `communicator`.
-// Every MPI call is checked with assert, matching the rest of this file.
-void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base)
-{
-  int ierr;
-
-  ////////////////////////////////////////////////////
-  // Creates communicator, and the communicator_halo
-  ////////////////////////////////////////////////////
-  _ndimension = processors.size();
-  _processor_coor.resize(_ndimension);
-
-  /////////////////////////////////
-  // Count the requested nodes
-  /////////////////////////////////
-  _Nprocessors=1;
-  _processors = processors;
-  for(int i=0;i<_ndimension;i++){
-    _Nprocessors*=_processors[i];
-  }
-
-  std::vector<int> periodic(_ndimension,1); // periodic in every dimension
-  ierr=MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],0,&communicator);
-  assert(ierr==0);
-  // MPI_Cart_create hands MPI_COMM_NULL to ranks that do not fit in the grid;
-  // catch that here rather than inside the calls below.
-  assert(communicator != MPI_COMM_NULL);
-
-  ierr=MPI_Comm_rank(communicator,&_processor);
-  assert(ierr==0);
-  ierr=MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
-  assert(ierr==0);
-
-  if ( 0 && (communicator_base != communicator_world) ) { // debug printout, disabled
-    std::cout << "InitFromMPICommunicator Cartesian communicator created with a non-world communicator"<<std::endl;
-    std::cout << " new communicator rank "<<_processor<< " coor ["<<_ndimension<<"] ";
-    for(int d=0;d<_ndimension;d++){
-      std::cout << _processor_coor[d]<<" ";
-    }
-    std::cout << std::endl;
-  }
-
-  // The grid must account for every rank; check before creating more communicators.
-  int Size;
-  ierr=MPI_Comm_size(communicator,&Size);
-  assert(ierr==0);
-  assert(Size==_Nprocessors);
-
-  // One duplicate per direction (2 per dimension) for the stencil halo traffic.
-  communicator_halo.resize (2*_ndimension);
-  for(int i=0;i<_ndimension*2;i++){
-    ierr=MPI_Comm_dup(communicator,&communicator_halo[i]);
-    assert(ierr==0);
-  }
-}
-
-// Release the Cartesian communicator and its halo duplicates, unless MPI has
-// already been finalised or no communicator is held.
-CartesianCommunicator::~CartesianCommunicator()
-{
-  int finalised;
-  MPI_Finalized(&finalised);
-
-  if ( finalised || !communicator ) return;
-
-  MPI_Comm_free(&communicator);
-  for(int h=0;h<communicator_halo.size();h++){
-    MPI_Comm_free(&communicator_halo[h]);
-  }
-}
-// In-place sum of a 32-bit unsigned integer over all ranks of `communicator`.
-void CartesianCommunicator::GlobalSum(uint32_t &u)
-{
-  const int status = MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
-  assert(status==MPI_SUCCESS);
-}
-// In-place sum of a 64-bit unsigned integer over all ranks of `communicator`.
-void CartesianCommunicator::GlobalSum(uint64_t &u)
-{
-  const int status = MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
-  assert(status==MPI_SUCCESS);
-}
-// In-place bitwise XOR of a 32-bit unsigned integer over all ranks of `communicator`.
-void CartesianCommunicator::GlobalXOR(uint32_t &u)
-{
-  const int status = MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
-  assert(status==MPI_SUCCESS);
-}
-// In-place bitwise XOR of a 64-bit unsigned integer over all ranks of `communicator`.
-void CartesianCommunicator::GlobalXOR(uint64_t &u)
-{
-  const int status = MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
-  assert(status==MPI_SUCCESS);
-}
-// In-place sum of a single float over all ranks of `communicator`.
-void CartesianCommunicator::GlobalSum(float &f)
-{
-  const int status = MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
-  assert(status==MPI_SUCCESS);
-}
-// In-place element-wise sum of N floats starting at `f` over all ranks of `communicator`.
-void CartesianCommunicator::GlobalSumVector(float *f,int N)
-{
-  const int status = MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
-  assert(status==MPI_SUCCESS);
-}
-// In-place sum of a single double over all ranks of `communicator`.
-void CartesianCommunicator::GlobalSum(double &d)
-{
-  const int status = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
-  assert(status==MPI_SUCCESS);
-}
-// In-place element-wise sum of N doubles starting at `d` over all ranks of `communicator`.
-void CartesianCommunicator::GlobalSumVector(double *d,int N)
-{
-  const int status = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
-  assert(status==MPI_SUCCESS);
-}
-// Basic Halo comms primitive (blocking form):
-// send `bytes` from `xmit` to rank `dest` while receiving `bytes` into `recv`
-// from rank `from`; returns once the exchange has completed.
-void CartesianCommunicator::SendToRecvFrom(void *xmit,
-					   int dest,
-					   void *recv,
-					   int from,
-					   int bytes)
-{
-  std::vector<CommsRequest_t> pending;
-  SendToRecvFromBegin(pending,xmit,dest,recv,from,bytes);
-  SendToRecvFromComplete(pending);
-}
-// Blocking point-to-point transfer of `bytes` from rank `sender` to rank
-// `receiver`. Every rank may call this; only the two named ranks do any work.
-// The sender's rank is used as the message tag.
-void CartesianCommunicator::SendRecvPacket(void *xmit,
-					   void *recv,
-					   int sender,
-					   int receiver,
-					   int bytes)
-{
-  assert(sender != receiver);
-
-  const bool i_send = (_processor == sender);
-  const bool i_recv = (_processor == receiver);
-
-  if ( i_send ) {
-    MPI_Send(xmit, bytes, MPI_CHAR,receiver,sender,communicator);
-  }
-  if ( i_recv ) {
-    MPI_Status stat;
-    MPI_Recv(recv, bytes, MPI_CHAR,sender,sender,communicator,&stat);
-  }
-}
-// Basic Halo comms primitive (start):
-// under the concurrent policy, posts a non-blocking receive and send and
-// appends both requests to `list` for a later SendToRecvFromComplete;
-// under any other policy the exchange is done immediately with MPI_Sendrecv
-// and `list` is left untouched. Messages are tagged with the sender's rank.
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-						void *xmit,
-						int dest,
-						void *recv,
-						int from,
-						int bytes)
-{
-  const int me = _processor;
-
-  if ( CommunicatorPolicy != CommunicatorPolicyConcurrent ) {
-    // Give the CPU to MPI immediately; can use threads to overlap optionally
-    const int status = MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,me,
-				    recv,bytes,MPI_CHAR,from,from,
-				    communicator,MPI_STATUS_IGNORE);
-    assert(status==0);
-    return;
-  }
-
-  MPI_Request send_req;
-  MPI_Request recv_req;
-
-  int status = MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&recv_req);
-  status    |= MPI_Isend(xmit, bytes, MPI_CHAR,dest,me  ,communicator,&send_req);
-  assert(status==0);
-
-  list.push_back(send_req);
-  list.push_back(recv_req);
-}
-
-// Blocking stencil exchange: begin and complete in one call.
-// Returns the byte count reported by StencilSendToRecvFromBegin.
-double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
-						     int dest,
-						     void *recv,
-						     int from,
-						     int bytes,int dir)
-{
-  std::vector<CommsRequest_t> pending;
-  const double moved = StencilSendToRecvFromBegin(pending,xmit,dest,recv,from,bytes,dir);
-  StencilSendToRecvFromComplete(pending,dir);
-  return moved;
-}
-
-// Start a stencil halo exchange in direction `dir`.
-// A receive is posted only when ShmRanks[from] is MPI_UNDEFINED, and a send
-// only when ShmRanks[dest] is MPI_UNDEFINED; each posted request is appended
-// to `list`. Under the sequential policy the requests are completed before
-// returning. The return value is the number of bytes handed to MPI.
-double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-							 void *xmit,
-							 int dest,
-							 void *recv,
-							 int from,
-							 int bytes,int dir)
-{
-  // Pick the halo communicator for this direction.
-  const int n_halo   = communicator_halo.size();
-  const int halo_idx = dir%n_halo;
-
-  const int shm_dest = ShmRanks[dest];
-  const int shm_from = ShmRanks[from];
-  const int shm_self = ShmRanks[_processor];
-
-  assert(dest != _processor);
-  assert(from != _processor);
-  assert(shm_self == ShmRank);
-
-  double mpi_bytes = 0.0;
-
-  if ( shm_from == MPI_UNDEFINED ) {
-    MPI_Request recv_req;
-    const int status = MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[halo_idx],&recv_req);
-    assert(status==0);
-    list.push_back(recv_req);
-    mpi_bytes += bytes;
-  }
-
-  if ( shm_dest == MPI_UNDEFINED ) {
-    MPI_Request send_req;
-    const int status = MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[halo_idx],&send_req);
-    assert(status==0);
-    list.push_back(send_req);
-    mpi_bytes += bytes;
-  }
-
-  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
-    this->StencilSendToRecvFromComplete(list,dir);
-  }
-
-  return mpi_bytes;
-}
-// Complete the requests started by StencilSendToRecvFromBegin.
-// `dir` is accepted for interface symmetry and is not used here.
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
-{
-  this->SendToRecvFromComplete(waitall);
-}
-// Barrier over the shared-memory communicator ShmComm.
-void CartesianCommunicator::StencilBarrier(void)
-{
-  MPI_Barrier(ShmComm);
-}
-// Wait for every request in `list` to finish, then empty the list.
-// An empty list is a no-op.
-void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
-{
-  const int pending = list.size();
-
-  if ( pending != 0 ) {
-    std::vector<MPI_Status> outcome(pending);
-    const int status = MPI_Waitall(pending,&list[0],&outcome[0]);
-    assert(status==0);
-    list.clear();
-  }
-}
-// Barrier over all ranks of `communicator`.
-void CartesianCommunicator::Barrier(void)
-{
-  const int status = MPI_Barrier(communicator);
-  assert(status==MPI_SUCCESS);
-}
-// Broadcast `bytes` bytes at `data` from rank `root` to every rank of `communicator`.
-void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
-{
-  const int status = MPI_Bcast(data,bytes,MPI_BYTE,root,communicator);
-  assert(status==MPI_SUCCESS);
-}
-// Rank of the calling process in communicator_world.
-int CartesianCommunicator::RankWorld(void)
-{
-  int world_rank;
-  MPI_Comm_rank(communicator_world,&world_rank);
-  return world_rank;
-}
-// Broadcast `bytes` bytes at `data` from rank `root` to every rank of communicator_world.
-void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
-{
-  const int status = MPI_Bcast(data,bytes,MPI_BYTE,root,communicator_world);
-  assert(status==MPI_SUCCESS);
-}
-
-// All-to-all restricted to one grid dimension: splits this communicator into
-// sub-communicators of shape 1 x ... x _processors[dim] x ... x 1 and runs
-// the plain AllToAll on the one this rank belongs to.
-void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,uint64_t bytes)
-{
-  std::vector<int> line_shape(_ndimension,1);
-  assert(dim>=0 && dim<_ndimension);
-
-  // Keep the full extent only along the requested dimension.
-  line_shape[dim] = _processors[dim];
-
-  int split_index; // filled in by the split constructor, not needed afterwards
-  CartesianCommunicator line_comm(line_shape,*this,split_index);
-  line_comm.AllToAll(in,out,words,bytes);
-}
-// All-to-all over `communicator`: every rank sends `words` objects of `bytes`
-// bytes each from `in` to every other rank and receives the same amount into `out`.
-//
-// MPI takes its counts as plain "int", so both `words` and `bytes` must fit in
-// a non-negative int. A temporary contiguous datatype of `bytes` bytes is used
-// for one object, which keeps the per-rank count at `words` and so below the
-// 2^31 limit on large volumes (this turned up on 32^3 x 64 G-parity runs).
-void CartesianCommunicator::AllToAll(void  *in,void *out,uint64_t words,uint64_t bytes)
-{
-  int iwords = (int)words;
-  int ibytes = (int)bytes;
-  // Safe to narrow to int? The sign test matters: a round-trip compare alone
-  // would accept a huge value that truncates to a negative int and sign-extends back.
-  assert(iwords >= 0 && words == (uint64_t)iwords);
-  assert(ibytes >= 0 && bytes == (uint64_t)ibytes);
-
-  int ierr;
-  MPI_Datatype object;
-  ierr=MPI_Type_contiguous(ibytes,MPI_BYTE,&object);
-  assert(ierr==0);
-  ierr=MPI_Type_commit(&object);
-  assert(ierr==0);
-  ierr=MPI_Alltoall(in,iwords,object,out,iwords,object,communicator);
-  assert(ierr==0);
-  ierr=MPI_Type_free(&object);
-  assert(ierr==0);
-}
-
-
-
-}
-
@@ -1,92 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/communicator/SharedMemory.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/GridCore.h>
-
-namespace Grid { 
-
-// static data: out-of-class definitions of the GlobalSharedMemory static members
-
-uint64_t            GlobalSharedMemory::MAX_MPI_SHM_BYTES   = 1024LL*1024LL*1024LL; // default 1 GiB
-int                 GlobalSharedMemory::Hugepages = 0;
-int                 GlobalSharedMemory::_ShmSetup;      // no initialiser: static storage, so starts at zero
-int                 GlobalSharedMemory::_ShmAlloc;      // no initialiser: static storage, so starts at zero
-uint64_t            GlobalSharedMemory::_ShmAllocBytes; // no initialiser: static storage, so starts at zero
-
-std::vector<void *> GlobalSharedMemory::WorldShmCommBufs; // SharedMemoryFree unmaps entries [0,WorldShmSize)
-
-Grid_MPI_Comm       GlobalSharedMemory::WorldShmComm;
-int                 GlobalSharedMemory::WorldShmRank;
-int                 GlobalSharedMemory::WorldShmSize;
-std::vector<int>    GlobalSharedMemory::WorldShmRanks;
-
-Grid_MPI_Comm       GlobalSharedMemory::WorldComm;
-int                 GlobalSharedMemory::WorldSize;
-int                 GlobalSharedMemory::WorldRank;
-
-int                 GlobalSharedMemory::WorldNodes;
-int                 GlobalSharedMemory::WorldNode;
-
-// Unmap every buffer in WorldShmCommBufs[0..WorldShmSize) and mark the
-// shared memory as no longer allocated. Must only be called while an
-// allocation of non-zero size is held.
-void GlobalSharedMemory::SharedMemoryFree(void)
-{
-  assert(_ShmAlloc);
-  assert(_ShmAllocBytes>0);
-
-  int rank = 0;
-  while ( rank < WorldShmSize ) {
-    munmap(WorldShmCommBufs[rank],_ShmAllocBytes);
-    ++rank;
-  }
-
-  _ShmAllocBytes = 0;
-  _ShmAlloc      = 0;
-}
-/////////////////////////////////
-// Alloc, free shmem region
-/////////////////////////////////
-// Hands out `bytes` bytes from the shared heap by advancing heap_top; the
-// returned pointer is the previous heap_top. Reaching or exceeding heap_size
-// prints a hint and trips an assert.
-void *SharedMemory::ShmBufferMalloc(size_t bytes){
-  void *block = (void *)heap_top;
-
-  heap_top   = heap_top   + bytes;
-  heap_bytes = heap_bytes + bytes;
-
-  const bool exhausted = (heap_bytes >= heap_size);
-  if ( exhausted ) {
-    std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
-    std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
-    std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl;
-    assert(heap_bytes<heap_size);
-  }
-  return block;
-}
-// Release everything handed out by ShmBufferMalloc: rewind heap_top to the
-// start of this rank's own buffer and reset the byte counter.
-void SharedMemory::ShmBufferFreeAll(void)
-{
-  heap_bytes = 0;
-  heap_top   = (size_t)ShmBufferSelf();
-}
-// Base address of this rank's own shared-memory buffer.
-void *SharedMemory::ShmBufferSelf(void)
-{
-  void *mine = ShmCommBufs[ShmRank];
-  return mine;
-}
-
-
-
-}
@@ -1,165 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/communicator/SharedMemory.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-
-// TODO
-// 1) move includes into SharedMemory.cc
-//
-// 2) split shared memory into a) optimal communicator creation from comm world
-// 
-//                             b) shared memory buffers container
-//                                -- static globally shared; init once
-//                                -- per instance set of buffers.
-//                                   
-
-#pragma once 
-
-#include <Grid/GridCore.h>
-
-#if defined (GRID_COMMS_MPI3) 
-#include <mpi.h>
-#endif 
-#include <semaphore.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <limits.h>
-#include <sys/types.h>
-#include <sys/ipc.h>
-#include <sys/shm.h>
-#include <sys/mman.h>
-#include <zlib.h>
-#ifdef HAVE_NUMAIF_H
-#include <numaif.h>
-#endif
-
-namespace Grid {
-
-#if defined (GRID_COMMS_MPI3) 
-  typedef MPI_Comm    Grid_MPI_Comm;
-  typedef MPI_Request CommsRequest_t;
-#else 
-  typedef int CommsRequest_t;
-  typedef int Grid_MPI_Comm;
-#endif
-
-class GlobalSharedMemory { // all-static holder for the world communicator and the shared-memory region built on it
- private:
-  static const int     MAXLOG2RANKSPERNODE = 16;            // NOTE(review): presumably log2 of the max ranks per node; not used in the visible code
-
-  // Init once lock on the buffer allocation
-  static int      _ShmSetup;      // Init() asserts this is 0 on entry
-  static int      _ShmAlloc;      // non-zero while an allocation is held; cleared by SharedMemoryFree
-  static uint64_t _ShmAllocBytes; // size passed to munmap for each buffer; cleared by SharedMemoryFree
-
- public:
-  static int      ShmSetup(void)      { return _ShmSetup; }
-  static int      ShmAlloc(void)      { return _ShmAlloc; }
-  static uint64_t ShmAllocBytes(void) { return _ShmAllocBytes; }
-  static uint64_t      MAX_MPI_SHM_BYTES; // byte count CartesianCommunicator::Init passes to SharedMemoryAllocate
-  static int           Hugepages;         // passed as the `flags` argument of SharedMemoryAllocate by CartesianCommunicator::Init
-
-  static std::vector<void *> WorldShmCommBufs; // SharedMemoryFree unmaps entries [0,WorldShmSize)
-
-  static Grid_MPI_Comm WorldComm; // the communicator handed to Init()
-  static int           WorldRank; // this process's rank in WorldComm
-  static int           WorldSize; // number of ranks in WorldComm
-
-  static Grid_MPI_Comm WorldShmComm; // WorldComm split by MPI_COMM_TYPE_SHARED: ranks that can share memory
-  static int           WorldShmRank; // this process's rank in WorldShmComm
-  static int           WorldShmSize; // number of ranks in WorldShmComm
-
-  static int           WorldNodes;
-  static int           WorldNode;
-
-  static std::vector<int>  WorldShmRanks;
-
-  //////////////////////////////////////////////////////////////////////////////////////
-  // Create an optimal reordered communicator that makes MPI_Cart_create get it right
-  //////////////////////////////////////////////////////////////////////////////////////
-  static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
-  static void OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
-  ///////////////////////////////////////////////////
-  // Provide shared memory facilities off comm world
-  ///////////////////////////////////////////////////
-  static void SharedMemoryAllocate(uint64_t bytes, int flags);
-  static void SharedMemoryFree(void);
-
-};
-
-//////////////////////////////
-// one per communicator: per-instance view of the shared-memory buffers plus a simple bump allocator over this rank's buffer
-//////////////////////////////
-class SharedMemory 
-{
- private:
-  static const int     MAXLOG2RANKSPERNODE = 16;            // NOTE(review): duplicates GlobalSharedMemory::MAXLOG2RANKSPERNODE; not used in the visible code
-
-  size_t heap_top;   // address of the next free byte; advanced by ShmBufferMalloc, rewound by ShmBufferFreeAll
-  size_t heap_bytes; // bytes handed out since the last ShmBufferFreeAll
-  size_t heap_size;  // limit that ShmBufferMalloc checks heap_bytes against
-
- protected:
-
-  Grid_MPI_Comm    ShmComm; // for barriers
-  int    ShmRank; 
-  int    ShmSize;
-  std::vector<void *> ShmCommBufs; // ShmCommBufs[ShmRank] is this rank's own buffer (see ShmBufferSelf)
-  std::vector<int>    ShmRanks;// Mapping comm ranks to Shm ranks
-
- public:
-  SharedMemory() {}; // NOTE(review): leaves heap_top / heap_bytes / heap_size uninitialised; presumably SetCommunicator sets them
-  ~SharedMemory();
-  ///////////////////////////////////////////////////////////////////////////////////////
-  // set the buffers & sizes
-  ///////////////////////////////////////////////////////////////////////////////////////
-  void SetCommunicator(Grid_MPI_Comm comm);
-
-  ////////////////////////////////////////////////////////////////////////
-  // For this instance ; disjoint buffer sets between splits if split grid
-  ////////////////////////////////////////////////////////////////////////
-  void ShmBarrier(void); 
-
-  ///////////////////////////////////////////////////
-  // Call on any instance
-  ///////////////////////////////////////////////////
-  void SharedMemoryTest(void);
-  void *ShmBufferSelf(void); // returns ShmCommBufs[ShmRank]
-  void *ShmBuffer    (int rank);
-  void *ShmBufferTranslate(int rank,void * local_p);
-  void *ShmBufferMalloc(size_t bytes); // bump-allocates from the shared heap; asserts when heap_size is reached
-  void  ShmBufferFreeAll(void) ; // rewinds the heap to ShmBufferSelf() and zeroes heap_bytes
-  
-  //////////////////////////////////////////////////////////////////////////
-  // Make info on Nodes & ranks and Shared memory available
-  //////////////////////////////////////////////////////////////////////////
-  int NodeCount(void) { return GlobalSharedMemory::WorldNodes;};
-  int RankCount(void) { return GlobalSharedMemory::WorldSize;};
-
-};
-
-}
@@ -1,651 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/communicator/SharedMemory.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/GridCore.h>
-#include <pwd.h>
-
-namespace Grid { 
-
-/*
- * Construct the global (world) shared-memory description from an MPI communicator.
- *
- * Records the world communicator, splits it into per-node shared-memory groups,
- * builds the world-rank -> shm-rank table (MPI_UNDEFINED for ranks on other
- * nodes), elects the lowest world rank of each group as its leader and derives
- * this rank's node index from the ordered list of leaders.
- *
- * Must be called exactly once (asserts _ShmSetup==0) and sets _ShmSetup=1.
- * Asserts that WorldSize is an exact multiple of WorldShmSize.
- */
-void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
-{
-  assert(_ShmSetup==0);
-  WorldComm = comm;
-  MPI_Comm_rank(WorldComm,&WorldRank);
-  MPI_Comm_size(WorldComm,&WorldSize);
-  // WorldComm, WorldSize, WorldRank
-
-  /////////////////////////////////////////////////////////////////////
-  // Split into groups that can share memory
-  /////////////////////////////////////////////////////////////////////
-  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
-  MPI_Comm_rank(WorldShmComm     ,&WorldShmRank);
-  MPI_Comm_size(WorldShmComm     ,&WorldShmSize);
-  // WorldShmComm, WorldShmSize, WorldShmRank
-
-  // WorldNodes
-  WorldNodes = WorldSize/WorldShmSize;
-  assert( (WorldNodes * WorldShmSize) == WorldSize );
-
-  // FIXME: Check all WorldShmSize are the same ?
-
-  /////////////////////////////////////////////////////////////////////
-  // find world ranks in our SHM group (i.e. which ranks are on our node)
-  /////////////////////////////////////////////////////////////////////
-  MPI_Group WorldGroup, ShmGroup;
-  MPI_Comm_group (WorldComm, &WorldGroup); 
-  MPI_Comm_group (WorldShmComm, &ShmGroup);
-
-  std::vector<int> world_ranks(WorldSize);   for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
-
-  WorldShmRanks.resize(WorldSize); 
-  MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &WorldShmRanks[0]); 
-
-  // The group handles are only needed for the translation above; release
-  // them so that they are not leaked.
-  MPI_Group_free(&WorldGroup);
-  MPI_Group_free(&ShmGroup);
-
-  ///////////////////////////////////////////////////////////////////
-  // Identify who is in my group and nominate the leader
-  ///////////////////////////////////////////////////////////////////
-  int g=0;
-  std::vector<int> MyGroup;
-  MyGroup.resize(WorldShmSize);
-  for(int rank=0;rank<WorldSize;rank++){
-    if(WorldShmRanks[rank]!=MPI_UNDEFINED){
-      assert(g<WorldShmSize);
-      MyGroup[g++] = rank;
-    }
-  }
-  
-  std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>());
-  int myleader = MyGroup[0];
-  
-  std::vector<int> leaders_1hot(WorldSize,0);
-  std::vector<int> leaders_group(WorldNodes,0);
-  leaders_1hot [ myleader ] = 1;
-    
-  ///////////////////////////////////////////////////////////////////
-  // global sum leaders over comm world
-  ///////////////////////////////////////////////////////////////////
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,WorldComm);
-  assert(ierr==0);
-
-  ///////////////////////////////////////////////////////////////////
-  // find the group leaders world rank
-  ///////////////////////////////////////////////////////////////////
-  int group=0;
-  for(int l=0;l<WorldSize;l++){
-    if(leaders_1hot[l]){
-      // More leaders than WorldNodes means the nodes do not all host the same
-      // number of ranks; stop here instead of writing past leaders_group.
-      assert(group<WorldNodes);
-      leaders_group[group++] = l;
-    }
-  }
-
-  ///////////////////////////////////////////////////////////////////
-  // Identify the node of the group in which I (and my leader) live
-  ///////////////////////////////////////////////////////////////////
-  WorldNode=-1;
-  for(int node=0;node<WorldNodes;node++){
-    if (myleader == leaders_group[node]){
-      WorldNode=node;
-    }
-  }
-  assert(WorldNode!=-1);
-  _ShmSetup=1;
-}
-// Gray encode support 
-// Convert a binary-coded integer to its reflected Gray code.
-int BinaryToGray (int  binary) {
-  return binary ^ (binary>>1);
-}
-// Return the exponent e in [0,MAXLOG2] with 2^e == TwoToPower,
-// or -1 when TwoToPower is not such a power of two.
-int Log2Size(int TwoToPower,int MAXLOG2)
-{
-  int exponent = 0;
-  while ( exponent <= MAXLOG2 ) {
-    if ( TwoToPower == (0x1<<exponent) ) return exponent;
-    exponent++;
-  }
-  return -1;
-}
-// Build a communicator (optimal_comm) over all of WorldComm whose rank order
-// follows the processor grid in `processors`: the ranks that share a node
-// form a compact sub-block (ShmDims) and the nodes form the outer grid
-// (NodeDims).
-//
-//   processors   : ranks per dimension; the product must equal WorldSize (asserted)
-//   optimal_comm : output, created with MPI_Comm_split(WorldComm,0,rank,...)
-//
-// Two compile-time variants exist. With HYPERCUBE defined, the node coordinate
-// is parsed from a hostname of the form r<R>i<I>n<N>; otherwise it is derived
-// from WorldNode. Both variants assert that WorldShmSize is a power of two
-// no larger than 2^MAXLOG2RANKSPERNODE.
-void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
-{
-#ifdef HYPERCUBE
-  ////////////////////////////////////////////////////////////////
-  // Assert power of two shm_size.
-  ////////////////////////////////////////////////////////////////
-  int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
-  assert(log2size != -1);
-
-  ////////////////////////////////////////////////////////////////
-  // Identify the hypercube coordinate of this node using hostname
-  ////////////////////////////////////////////////////////////////
-  // n runs 0...7 9...16 18...25 27...34     (8*4)  5 bits
-  // i runs 0..7                                    3 bits
-  // r runs 0..3                                    2 bits
-  // 2^10 = 1024 nodes
-  const int maxhdim = 10; 
-  // HyperCubeCoords is filled below but never read in this function;
-  // RootHyperCubeCoords is never used.
-  std::vector<int> HyperCubeCoords(maxhdim,0);
-  std::vector<int> RootHyperCubeCoords(maxhdim,0);
-  int R;
-  int I;
-  int N;
-  const int namelen = _POSIX_HOST_NAME_MAX;
-  char name[namelen];
-
-  // Parse ICE-XA hostname to get hypercube location
-  gethostname(name,namelen);
-  int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
-  assert(nscan==3);
-
-  // Pack R, I and the base-9 split of N into one coordinate word.
-  int nlo = N%9;
-  int nhi = N/9;
-  uint32_t hypercoor = (R<<8)|(I<<5)|(nhi<<3)|nlo ;
-  uint32_t rootcoor  = hypercoor;
-
-  //////////////////////////////////////////////////////////////////
-  // Print debug info
-  //////////////////////////////////////////////////////////////////
-  for(int d=0;d<maxhdim;d++){
-    HyperCubeCoords[d] = (hypercoor>>d)&0x1;
-  }
-
-  std::string hname(name);
-  std::cout << "hostname "<<hname<<std::endl;
-  std::cout << "R " << R << " I " << I << " N "<< N
-            << " hypercoor 0x"<<std::hex<<hypercoor<<std::dec<<std::endl;
-
-  //////////////////////////////////////////////////////////////////
-  // broadcast node 0's base coordinate for this partition.
-  //////////////////////////////////////////////////////////////////
-  MPI_Bcast(&rootcoor, sizeof(rootcoor), MPI_BYTE, 0, WorldComm); 
-  hypercoor=hypercoor-rootcoor;
-  assert(hypercoor<WorldSize);
-  // NOTE(review): hypercoor is uint32_t, so the next assert is always true;
-  // a coordinate below rootcoor wraps around and is caught by the assert above.
-  assert(hypercoor>=0);
-
-  //////////////////////////////////////
-  // Printing
-  //////////////////////////////////////
-  for(int d=0;d<maxhdim;d++){
-    HyperCubeCoords[d] = (hypercoor>>d)&0x1;
-  }
-
-  ////////////////////////////////////////////////////////////////
-  // Identify subblock of ranks on node spreading across dims
-  // in a maximally symmetrical way
-  ////////////////////////////////////////////////////////////////
-  int ndimension              = processors.size();
-  std::vector<int> processor_coor(ndimension);
-  std::vector<int> WorldDims = processors;   std::vector<int> ShmDims  (ndimension,1);  std::vector<int> NodeDims (ndimension);
-  std::vector<int> ShmCoor  (ndimension);    std::vector<int> NodeCoor (ndimension);    std::vector<int> WorldCoor(ndimension);
-  std::vector<int> HyperCoor(ndimension);
-  int dim = 0;
-  // Hand out the log2size factors of two round-robin over the dimensions,
-  // skipping dimensions that cannot be subdivided further.
-  // NOTE(review): if no dimension can absorb another factor of two, the inner
-  // while loop never terminates; the processor-count assert only runs later.
-  for(int l2=0;l2<log2size;l2++){
-    while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
-    ShmDims[dim]*=2;
-    dim=(dim+1)%ndimension;
-  }
-
-  ////////////////////////////////////////////////////////////////
-  // Establish torus of processes and nodes with sub-blockings
-  ////////////////////////////////////////////////////////////////
-  for(int d=0;d<ndimension;d++){
-    NodeDims[d] = WorldDims[d]/ShmDims[d];
-  }
-  ////////////////////////////////////////////////////////////////
-  // Map Hcube according to physical lattice 
-  // must partition. Loop over dims and find out who would join.
-  ////////////////////////////////////////////////////////////////
-  int hcoor = hypercoor;
-  for(int d=0;d<ndimension;d++){
-     // NOTE(review): Log2Size returns -1 when NodeDims[d] is not a power of
-     // two, which would make the shifts below undefined -- confirm callers
-     // only pass power-of-two node grids.
-     int bits = Log2Size(NodeDims[d],MAXLOG2RANKSPERNODE);
-     int msk  = (0x1<<bits)-1;
-     HyperCoor[d]=hcoor & msk;  
-     HyperCoor[d]=BinaryToGray(HyperCoor[d]); // Space filling curve magic
-     hcoor = hcoor >> bits;
-  } 
-  ////////////////////////////////////////////////////////////////
-  // Check processor counts match
-  ////////////////////////////////////////////////////////////////
-  int Nprocessors=1;
-  for(int i=0;i<ndimension;i++){
-    Nprocessors*=processors[i];
-  }
-  assert(WorldSize==Nprocessors);
-
-  ////////////////////////////////////////////////////////////////
-  // Establish mapping between lexico physics coord and WorldRank
-  ////////////////////////////////////////////////////////////////
-  int rank;
-
-  Lexicographic::CoorFromIndexReversed(NodeCoor,WorldNode   ,NodeDims);
-
-  // The hostname-derived coordinate replaces the WorldNode-derived one
-  // computed on the previous line.
-  for(int d=0;d<ndimension;d++) NodeCoor[d]=HyperCoor[d];
-
-  Lexicographic::CoorFromIndexReversed(ShmCoor ,WorldShmRank,ShmDims);
-  for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d];
-  Lexicographic::IndexFromCoorReversed(WorldCoor,rank,WorldDims);
-
-  /////////////////////////////////////////////////////////////////
-  // Build the new communicator
-  /////////////////////////////////////////////////////////////////
-  // A single colour (0) keeps every rank in one communicator; `rank` is the
-  // ordering key.
-  int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
-  assert(ierr==0);
-#else 
-  ////////////////////////////////////////////////////////////////
-  // Assert power of two shm_size.
-  ////////////////////////////////////////////////////////////////
-  int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
-  assert(log2size != -1);
-
-  ////////////////////////////////////////////////////////////////
-  // Identify subblock of ranks on node spreading across dims
-  // in a maximally symmetrical way
-  ////////////////////////////////////////////////////////////////
-  int ndimension              = processors.size();
-  std::vector<int> processor_coor(ndimension);
-  std::vector<int> WorldDims = processors;   std::vector<int> ShmDims  (ndimension,1);  std::vector<int> NodeDims (ndimension);
-  std::vector<int> ShmCoor  (ndimension);    std::vector<int> NodeCoor (ndimension);    std::vector<int> WorldCoor(ndimension);
-  int dim = 0;
-  // Hand out the log2size factors of two round-robin over the dimensions,
-  // skipping dimensions that cannot be subdivided further.
-  // NOTE(review): if no dimension can absorb another factor of two, the inner
-  // while loop never terminates; the processor-count assert only runs later.
-  for(int l2=0;l2<log2size;l2++){
-    while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
-    ShmDims[dim]*=2;
-    dim=(dim+1)%ndimension;
-  }
-
-  ////////////////////////////////////////////////////////////////
-  // Establish torus of processes and nodes with sub-blockings
-  ////////////////////////////////////////////////////////////////
-  for(int d=0;d<ndimension;d++){
-    NodeDims[d] = WorldDims[d]/ShmDims[d];
-  }
-
-  ////////////////////////////////////////////////////////////////
-  // Check processor counts match
-  ////////////////////////////////////////////////////////////////
-  int Nprocessors=1;
-  for(int i=0;i<ndimension;i++){
-    Nprocessors*=processors[i];
-  }
-  assert(WorldSize==Nprocessors);
-
-  ////////////////////////////////////////////////////////////////
-  // Establish mapping between lexico physics coord and WorldRank
-  ////////////////////////////////////////////////////////////////
-  int rank;
-
-  // World coordinate = node coordinate * per-node block + coordinate in block.
-  Lexicographic::CoorFromIndexReversed(NodeCoor,WorldNode   ,NodeDims);
-  Lexicographic::CoorFromIndexReversed(ShmCoor ,WorldShmRank,ShmDims);
-  for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d];
-  Lexicographic::IndexFromCoorReversed(WorldCoor,rank,WorldDims);
-
-  /////////////////////////////////////////////////////////////////
-  // Build the new communicator
-  /////////////////////////////////////////////////////////////////
-  // A single colour (0) keeps every rank in one communicator; `rank` is the
-  // ordering key.
-  int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
-  assert(ierr==0);
-#endif
-}
-////////////////////////////////////////////////////////////////////////////////////////////
-// SHMGET
-////////////////////////////////////////////////////////////////////////////////////////////
-#ifdef GRID_MPI3_SHMGET
-// System V (shmget/shmat) implementation.
-//
-// Shm rank 0 creates one private segment of `bytes` bytes per rank on the node,
-// the segment ids are broadcast over WorldShmComm, and every rank attaches all
-// of them into WorldShmCommBufs. The segments are then marked IPC_RMID.
-//
-//   bytes : size of each per-rank segment
-//   flags : not consulted by this implementation; huge pages are selected by
-//           the Hugepages setting (as before -- the parameter used to be
-//           hidden by a local variable of the same name)
-//
-// Requires Init() to have run (_ShmSetup==1) and no earlier allocation
-// (_ShmAlloc==0). Exits the process if shmget or shmat fails.
-void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
-{
-  (void)flags; // see header comment
-  std::cout << "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
-  assert(_ShmSetup==1);
-  assert(_ShmAlloc==0);
-
-  //////////////////////////////////////////////////////////////////////////////////////////////////////////
-  // allocate the shared windows for our group
-  //////////////////////////////////////////////////////////////////////////////////////////////////////////
-  MPI_Barrier(WorldShmComm);
-  WorldShmCommBufs.resize(WorldShmSize);
-  std::vector<int> shmids(WorldShmSize);
-
-  if ( WorldShmRank == 0 ) {
-    for(int r=0;r<WorldShmSize;r++){
-      size_t size = bytes;
-      key_t key   = IPC_PRIVATE;
-      int shm_flags = IPC_CREAT | SHM_R | SHM_W;
-#ifdef SHM_HUGETLB
-      if (Hugepages) shm_flags|=SHM_HUGETLB;
-#endif
-      if ((shmids[r]= shmget(key,size, shm_flags)) ==-1) {
-        int errsv = errno;
-        printf("Errno %d\n",errsv);
-        printf("key   %d\n",(int)key);
-        printf("size  %zu\n",size);       // size_t needs %zu, not %lld
-        printf("flags %d\n",shm_flags);
-        errno = errsv;                    // printf may have changed errno
-        perror("shmget");
-        exit(1);
-      }
-    }
-  }
-  MPI_Barrier(WorldShmComm);
-  MPI_Bcast(&shmids[0],WorldShmSize*sizeof(int),MPI_BYTE,0,WorldShmComm);
-  MPI_Barrier(WorldShmComm);
-
-  for(int r=0;r<WorldShmSize;r++){
-    WorldShmCommBufs[r] = (uint64_t *)shmat(shmids[r], NULL,0);
-    if (WorldShmCommBufs[r] == (uint64_t *)-1) {
-      perror("Shared memory attach failure");
-      shmctl(shmids[r], IPC_RMID, NULL);
-      exit(2);
-    }
-  }
-  MPI_Barrier(WorldShmComm);
-  ///////////////////////////////////
-  // Mark for clean up
-  ///////////////////////////////////
-  for(int r=0;r<WorldShmSize;r++){
-    shmctl(shmids[r], IPC_RMID,(struct shmid_ds *)NULL);
-  }
-  MPI_Barrier(WorldShmComm);
-
-  _ShmAlloc=1;
-  _ShmAllocBytes  = bytes;
-}
-#endif
- 
-////////////////////////////////////////////////////////////////////////////////////////////
-// Hugetlbfs mapping intended
-////////////////////////////////////////////////////////////////////////////////////////////
-#ifdef GRID_MPI3_SHMMMAP
-// File-backed mmap implementation (files under GRID_SHM_PATH).
-//
-// Every rank opens (creating if necessary) one file per rank on the node,
-// named GRID_SHM_PATH/Grid_mpi3_shm_<node>_<rank>, and maps `bytes` bytes of
-// each with MAP_SHARED into WorldShmCommBufs.
-//
-//   bytes : size of each mapping
-//   flags : non-zero requests MAP_HUGETLB where the platform defines it
-//
-// Requires Init() to have run (_ShmSetup==1) and no earlier allocation
-// (_ShmAlloc==0). Exits with a failure status if a file cannot be opened.
-void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
-{
-  std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
-  assert(_ShmSetup==1);
-  assert(_ShmAlloc==0);
-  //////////////////////////////////////////////////////////////////////////////////////////////////////////
-  // allocate the shared windows for our group
-  //////////////////////////////////////////////////////////////////////////////////////////////////////////
-  MPI_Barrier(WorldShmComm);
-  WorldShmCommBufs.resize(WorldShmSize);
-  
-  ////////////////////////////////////////////////////////////////////////////////////////////
-  // Hugetlbfs and others map filesystems as mappable huge pages
-  ////////////////////////////////////////////////////////////////////////////////////////////
-  char shm_name [NAME_MAX];
-  for(int r=0;r<WorldShmSize;r++){
-    
-    // Bounded formatting: GRID_SHM_PATH is configurable and must not be able
-    // to overrun shm_name.
-    int len = snprintf(shm_name,sizeof(shm_name),GRID_SHM_PATH "/Grid_mpi3_shm_%d_%d",WorldNode,r);
-    if ( len < 0 || len >= (int)sizeof(shm_name) ) {
-      printf("shared memory file name too long under %s\n",GRID_SHM_PATH);
-      exit(EXIT_FAILURE);
-    }
-    int fd=open(shm_name,O_RDWR|O_CREAT,0666);
-    if ( fd == -1) { 
-      printf("open %s failed\n",shm_name);
-      perror("open hugetlbfs");
-      exit(EXIT_FAILURE); // was exit(0), which reported success on failure
-    }
-    int mmap_flag = MAP_SHARED ;
-#ifdef MAP_POPULATE    
-    mmap_flag|=MAP_POPULATE;
-#endif
-#ifdef MAP_HUGETLB
-    if ( flags ) mmap_flag |= MAP_HUGETLB;
-#endif
-    void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0); 
-    if ( ptr == (void *)MAP_FAILED ) {    
-      printf("mmap %s failed\n",shm_name);
-      perror("failed mmap");      assert(0);    
-    }
-    assert(((uint64_t)ptr&0x3F)==0); // mapping must be 64-byte aligned
-    close(fd);                       // the mapping stays valid after close
-    WorldShmCommBufs[r] =ptr;
-    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
-  }
-  _ShmAlloc=1;
-  _ShmAllocBytes  = bytes;
-}
-#endif // MMAP
-
-#ifdef GRID_MPI3_SHM_NONE
-// Anonymous mmap implementation, usable only with one rank per node
-// (asserts WorldShmSize == 1).
-//
-//   bytes : size of the mapping
-//   flags : non-zero requests MAP_HUGETLB where the platform defines it
-//
-// Requires Init() to have run (_ShmSetup==1) and no earlier allocation
-// (_ShmAlloc==0).
-void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
-{
-  std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
-  assert(_ShmSetup==1);
-  assert(_ShmAlloc==0);
-  //////////////////////////////////////////////////////////////////////////////////////////////////////////
-  // allocate the shared windows for our group
-  //////////////////////////////////////////////////////////////////////////////////////////////////////////
-  MPI_Barrier(WorldShmComm);
-  WorldShmCommBufs.resize(WorldShmSize);
-  
-  ////////////////////////////////////////////////////////////////////////////////////////////
-  // Anonymous mapping: no backing file, hence no file name and no descriptor
-  ////////////////////////////////////////////////////////////////////////////////////////////
-  assert(WorldShmSize == 1);
-  for(int r=0;r<WorldShmSize;r++){
-    
-    int fd=-1; // anonymous mappings take no file descriptor
-    int mmap_flag = MAP_SHARED |MAP_ANONYMOUS ;
-#ifdef MAP_POPULATE    
-    mmap_flag|=MAP_POPULATE;
-#endif
-#ifdef MAP_HUGETLB
-    if ( flags ) mmap_flag |= MAP_HUGETLB;
-#endif
-    void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0); 
-    if ( ptr == (void *)MAP_FAILED ) {    
-      // The message used to print an uninitialised file-name buffer.
-      printf("anonymous mmap failed\n");
-      perror("failed mmap");      assert(0);    
-    }
-    assert(((uint64_t)ptr&0x3F)==0); // mapping must be 64-byte aligned
-    // No close(): fd is -1 and there is nothing to release.
-    WorldShmCommBufs[r] =ptr;
-    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
-  }
-  _ShmAlloc=1;
-  _ShmAllocBytes  = bytes;
-}
-#endif // GRID_MPI3_SHM_NONE
-
-#ifdef GRID_MPI3_SHMOPEN
-////////////////////////////////////////////////////////////////////////////////////////////
-// POSIX SHMOPEN ; as far as I know Linux does not allow EXPLICIT HugePages with this case
-// tmpfs (Larry Meadows says) does not support explicit huge page, and this is used for 
-// the posix shm virtual file system
-////////////////////////////////////////////////////////////////////////////////////////////
-// POSIX shm_open implementation.
-//
-// Shm rank 0 creates, sizes and maps one object per rank on the node, named
-// /Grid_<user>_mpi3_shm_<node>_<rank>; after a barrier the other ranks open
-// and map the same objects. All mappings land in WorldShmCommBufs.
-//
-//   bytes : size of each object / mapping
-//   flags : non-zero requests MAP_HUGETLB on the creating rank where defined
-//
-// Requires Init() to have run (_ShmSetup==1) and no earlier allocation
-// (_ShmAlloc==0).
-void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
-{ 
-  std::cout << "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
-  assert(_ShmSetup==1);
-  assert(_ShmAlloc==0); 
-  MPI_Barrier(WorldShmComm);
-  WorldShmCommBufs.resize(WorldShmSize);
-
-  // The user name is part of every object name. getpwuid can fail (for
-  // example when the uid has no passwd entry), so check before dereferencing.
-  struct passwd *pw = getpwuid (getuid());
-  if ( pw == NULL ) {
-    perror("failed getpwuid");
-    exit(EXIT_FAILURE);
-  }
-
-  char shm_name [NAME_MAX];
-  if ( WorldShmRank == 0 ) {
-    for(int r=0;r<WorldShmSize;r++){
-	
-      size_t size = bytes;
-      
-      int len = snprintf(shm_name,sizeof(shm_name),"/Grid_%s_mpi3_shm_%d_%d",pw->pw_name,WorldNode,r);
-      assert(len > 0 && len < (int)sizeof(shm_name));
-      
-      shm_unlink(shm_name); // drop any stale object left by an earlier run
-      int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
-      if ( fd < 0 ) {	perror("failed shm_open");	assert(0);      }
-      // A failed resize would otherwise only show up later as a fault when
-      // the mapping is touched.
-      if ( ftruncate(fd, size) != 0 ) {
-	perror("failed ftruncate");
-	exit(EXIT_FAILURE);
-      }
-	
-      int mmap_flag = MAP_SHARED;
-#ifdef MAP_POPULATE 
-      mmap_flag |= MAP_POPULATE;
-#endif
-#ifdef MAP_HUGETLB
-      if (flags) mmap_flag |= MAP_HUGETLB;
-#endif
-      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
-      
-      std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
-      if ( ptr == (void * )MAP_FAILED ) {       
-	perror("failed mmap");     
-	assert(0);    
-      }
-      assert(((uint64_t)ptr&0x3F)==0); // mapping must be 64-byte aligned
-      
-      WorldShmCommBufs[r] =ptr;
-      close(fd);
-    }
-  }
-
-  // The objects must exist before the remaining ranks try to open them.
-  MPI_Barrier(WorldShmComm);
-  
-  if ( WorldShmRank != 0 ) { 
-    for(int r=0;r<WorldShmSize;r++){
-
-      size_t size = bytes ;
-      
-      int len = snprintf(shm_name,sizeof(shm_name),"/Grid_%s_mpi3_shm_%d_%d",pw->pw_name,WorldNode,r);
-      assert(len > 0 && len < (int)sizeof(shm_name));
-      
-      int fd=shm_open(shm_name,O_RDWR,0666);
-      if ( fd<0 ) {	perror("failed shm_open");	assert(0);      }
-      
-      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-      if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
-      assert(((uint64_t)ptr&0x3F)==0);
-      WorldShmCommBufs[r] =ptr;
-
-      close(fd);
-    }
-  }
-  _ShmAlloc=1;
-  _ShmAllocBytes = bytes;
-}
-#endif
-
-
-
-
-  ////////////////////////////////////////////////////////
-  // Global shared functionality finished
-  // Now move to per communicator functionality
-  ////////////////////////////////////////////////////////
-// Bind this instance to `comm`.
-//
-// Splits `comm` into its on-node shared-memory group (ShmComm/ShmRank/ShmSize),
-// selects for every shm rank the matching buffer from
-// GlobalSharedMemory::WorldShmCommBufs, resets the buffer heap and fills
-// ShmRanks with the comm-rank -> shm-rank translation (MPI_UNDEFINED for ranks
-// on other nodes).
-//
-// Requires GlobalSharedMemory::SharedMemoryAllocate to have run (asserted).
-// Collective over `comm`.
-void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
-{
-  int rank, size;
-  MPI_Comm_rank(comm,&rank);
-  MPI_Comm_size(comm,&size);
-  ShmRanks.resize(size);
-
-  /////////////////////////////////////////////////////////////////////
-  // Split into groups that can share memory
-  /////////////////////////////////////////////////////////////////////
-  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
-  MPI_Comm_rank(ShmComm     ,&ShmRank);
-  MPI_Comm_size(ShmComm     ,&ShmSize);
-  ShmCommBufs.resize(ShmSize);
-
-  //////////////////////////////////////////////////////////////////////
-  // Map ShmRank to WorldShmRank and use the right buffer
-  //////////////////////////////////////////////////////////////////////
-  assert (GlobalSharedMemory::ShmAlloc()==1);
-  heap_size = GlobalSharedMemory::ShmAllocBytes();
-  for(int r=0;r<ShmSize;r++){
-
-    // Only shm rank r contributes a non-zero term, so the sum delivers that
-    // rank's WorldShmRank to everyone in the group.
-    uint32_t wsr = (r==ShmRank) ? GlobalSharedMemory::WorldShmRank : 0 ;
-
-    MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
-
-    ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
-    //    std::cout << "SetCommunicator ShmCommBufs ["<< r<< "] = "<< ShmCommBufs[r]<< "  wsr = "<<wsr<<std::endl;
-  }
-  ShmBufferFreeAll();
-
-  /////////////////////////////////////////////////////////////////////
-  // find comm ranks in our SHM group (i.e. which ranks are on our node)
-  /////////////////////////////////////////////////////////////////////
-  MPI_Group FullGroup, ShmGroup;
-  MPI_Comm_group (comm   , &FullGroup); 
-  MPI_Comm_group (ShmComm, &ShmGroup);
-
-  std::vector<int> ranks(size);   for(int r=0;r<size;r++) ranks[r]=r;
-  MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]); 
-
-  // The group handles are only needed for the translation above; release
-  // them so that each call does not leak two MPI groups.
-  MPI_Group_free(&FullGroup);
-  MPI_Group_free(&ShmGroup);
-}
-//////////////////////////////////////////////////////////////////
-// On node barrier
-//////////////////////////////////////////////////////////////////
-// Block until every rank of this instance's on-node group (ShmComm) arrives.
-void SharedMemory::ShmBarrier(void)
-{
-  MPI_Barrier(ShmComm);
-}
-//////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Test the shared memory is working
-//////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Shm rank 0 stamps the first three 64-bit words of every buffer with
-// (node, shm rank, marker); after a barrier every rank asserts that it reads
-// the same values back through its own mappings.
-void SharedMemory::SharedMemoryTest(void)
-{
-  ShmBarrier();
-  if ( ShmRank == 0 ) {
-    for(int peer=0;peer<ShmSize;peer++){
-      uint64_t * words = (uint64_t *) ShmCommBufs[peer];
-      words[0] = GlobalSharedMemory::WorldNode;
-      words[1] = peer;
-      words[2] = 0x5A5A5A;
-    }
-  }
-  ShmBarrier(); // writes must be complete before anyone reads
-  for(int peer=0;peer<ShmSize;peer++){
-    uint64_t * words = (uint64_t *) ShmCommBufs[peer];
-    assert(words[0]==GlobalSharedMemory::WorldNode);
-    assert(words[1]==peer);
-    assert(words[2]==0x5A5A5A);
-  }
-  ShmBarrier(); // keep the buffers untouched until every rank has checked
-}
-
-// Return the shared buffer of communicator rank `rank`, or NULL when that
-// rank is not in this rank's on-node group.
-void *SharedMemory::ShmBuffer(int rank)
-{
-  const int shm_peer = ShmRanks[rank];
-  if (shm_peer != MPI_UNDEFINED) return ShmCommBufs[shm_peer];
-  return NULL;
-}
-// Translate a pointer into this rank's own shared buffer to the address of
-// the same offset inside the buffer of communicator rank `rank`.
-//
-//   rank    : rank in the communicator given to SetCommunicator()
-//   local_p : address inside ShmCommBufs[ShmRank]
-//
-// Returns NULL when `rank` is not in this rank's on-node group. Asserts that
-// `rank` is not this rank itself.
-void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
-{
-  int gpeer = ShmRanks[rank];
-  assert(gpeer!=ShmRank); // never send to self
-  if (gpeer == MPI_UNDEFINED){
-    return NULL;
-  } else { 
-    // Same byte offset, different base address.
-    uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
-    uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
-    return (void *) remote;
-  }
-}
-// Release the on-node communicator, unless MPI has already been finalised
-// (freeing a communicator after MPI_Finalize is not permitted).
-SharedMemory::~SharedMemory()
-{
-  int finalised;
-  MPI_Finalized(&finalised);
-  if ( finalised ) return;
-  MPI_Comm_free(&ShmComm);
-}
-
-}
@@ -1,128 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/communicator/SharedMemory.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/GridCore.h>
-
-namespace Grid { 
-
-/*
- * Set up the single-process description: one world rank, one node and one
- * shared-memory rank. The communicator argument is not consulted.
- * Must be called exactly once (asserts _ShmSetup==0).
- */
-void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
-{
-  assert(_ShmSetup==0);
-
-  // World view: a single rank.
-  WorldComm = 0;
-  WorldRank = 0;
-  WorldSize = 1;
-
-  // On-node view: the same single rank.
-  WorldShmComm = 0;
-  WorldShmRank = 0;
-  WorldShmSize = 1;
-
-  // Node view: one node, index zero.
-  WorldNodes = 1;
-  WorldNode  = 0;
-
-  WorldShmRanks.resize(WorldSize);
-  WorldShmRanks[0] = 0;
-  WorldShmCommBufs.resize(1);
-
-  _ShmSetup=1;
-}
-
-// With a single process there is nothing to reorder: hand back WorldComm.
-// `processors` is not consulted.
-void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
-{
-  optimal_comm = WorldComm;
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////
-// Hugetlbfs mapping intended, use anonymous mmap
-////////////////////////////////////////////////////////////////////////////////////////////
-// Allocate the single communication buffer with an anonymous shared mmap,
-// zero it and publish it as WorldShmCommBufs[0].
-//
-//   bytes : size of the buffer
-//   flags : non-zero requests MAP_HUGETLB where the platform defines it
-//
-// Requires Init() to have run (_ShmSetup==1) and no earlier allocation
-// (_ShmAlloc==0). Exits with a failure status if mmap fails.
-void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
-{
-  assert(_ShmSetup==1);
-  assert(_ShmAlloc==0);
-
-  // Platforms spell the anonymous-mapping flag differently; accept either.
-  int mmap_flag = 0;
-#ifdef MAP_ANONYMOUS
-  mmap_flag |= MAP_SHARED | MAP_ANONYMOUS;
-#endif
-#ifdef MAP_ANON
-  mmap_flag |= MAP_SHARED | MAP_ANON;
-#endif
-#ifdef MAP_HUGETLB
-  if ( flags ) {
-    mmap_flag |= MAP_HUGETLB;
-  }
-#endif
-
-  void *region = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag, -1, 0);
-  if (region == (void *)MAP_FAILED) {
-    perror("mmap failed ");
-    exit(EXIT_FAILURE);
-  }
-#ifdef MADV_HUGEPAGE
-  // The advice is issued only when Hugepages is not set.
-  if ( !Hugepages ) {
-    madvise(region,bytes,MADV_HUGEPAGE);
-  }
-#endif
-  bzero(region,bytes);
-
-  WorldShmCommBufs[0] = region;
-  _ShmAllocBytes=bytes;
-  _ShmAlloc=1;
-}
-
-  ////////////////////////////////////////////////////////
-  // Global shared functionality finished
-  // Now move to per communicator functionality
-  ////////////////////////////////////////////////////////
-// Bind this instance to the single-process layout: one shm rank whose buffer
-// is the one global buffer. The communicator argument is not consulted.
-// Requires GlobalSharedMemory::SharedMemoryAllocate to have run (asserted).
-void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
-{
-  assert(GlobalSharedMemory::ShmAlloc()==1);
-
-  ShmRank = 0;
-  ShmSize = 1;
-  ShmRanks.assign(1,0);
-
-  //////////////////////////////////////////////////////////////////////
-  // Only one buffer exists; expose it and size the heap to match
-  //////////////////////////////////////////////////////////////////////
-  ShmCommBufs.assign(1,GlobalSharedMemory::WorldShmCommBufs[0]);
-  heap_size = GlobalSharedMemory::ShmAllocBytes();
-
-  ShmBufferFreeAll();
-}
-//////////////////////////////////////////////////////////////////
-// On node barrier
-//////////////////////////////////////////////////////////////////
-// Nothing to synchronise with a single process.
-void SharedMemory::ShmBarrier(void)
-{
-}
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Test the shared memory is working
-//////////////////////////////////////////////////////////////////////////////////////////////////////////
-// No cross-rank visibility to verify with a single process.
-void SharedMemory::SharedMemoryTest(void)
-{
-}
-
-// This implementation never reports a peer buffer: always returns NULL.
-void *SharedMemory::ShmBuffer(int rank)
-{
-  void *none = NULL;
-  return none;
-}
-// This implementation never translates to a peer address: always returns NULL.
-void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
-{
-  void *none = NULL;
-  return none;
-}
-// Nothing to release in this implementation.
-SharedMemory::~SharedMemory()
-{
-}
-
-}
@@ -1,729 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/parallelIO/BinaryIO.h
-
-    Copyright (C) 2015
-
-    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-    Author: Guido Cossu<guido.cossu@ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef GRID_BINARY_IO_H
-#define GRID_BINARY_IO_H
-
-#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT) 
-#define USE_MPI_IO
-#else
-#undef  USE_MPI_IO
-#endif
-
-#ifdef HAVE_ENDIAN_H
-#include <endian.h>
-#endif
-
-#include <arpa/inet.h>
-#include <algorithm>
-
-namespace Grid { 
-
-
-/////////////////////////////////////////////////////////////////////////////////
-// Byte reversal garbage
-/////////////////////////////////////////////////////////////////////////////////
-// Reverse the byte order of a 32-bit word: 0xAABBCCDD -> 0xDDCCBBAA.
-inline uint32_t byte_reverse32(uint32_t f) {
-  const uint32_t byte0 = (f & 0x000000FFu) << 24;  // lowest byte  -> highest
-  const uint32_t byte1 = (f & 0x0000FF00u) << 8;
-  const uint32_t byte2 = (f & 0x00FF0000u) >> 8;
-  const uint32_t byte3 = (f & 0xFF000000u) >> 24;  // highest byte -> lowest
-  return byte0 | byte1 | byte2 | byte3;
-}
-// Reverse the byte order of a 64-bit word:
-// 0x0102030405060708 -> 0x0807060504030201.
-inline uint64_t byte_reverse64(uint64_t f) {
-  uint64_t reversed = 0;
-  // Peel bytes off the bottom of f and push them onto the bottom of the
-  // result, so the first byte taken ends up at the top.
-  for (int b = 0; b < 8; b++) {
-    reversed = (reversed << 8) | (f & 0xFF);
-    f >>= 8;
-  }
-  return reversed;
-}
-
-// Convert a 64-bit word from network (big-endian) order to host order.
-//
-// Host endianness is probed with htonl() instead of the preprocessor test
-// `#if BYTE_ORDER == BIG_ENDIAN`.  If neither macro happens to be defined,
-// that test evaluates as `0 == 0`, silently selects the big-endian (identity)
-// path and produces wrong results on a little-endian host.
-// htonl() comes from <arpa/inet.h>, which this header already includes.
-inline uint64_t Grid_ntohll(uint64_t A) {
-  // htonl(1) leaves the value unchanged only on a big-endian host.
-  const bool host_is_big_endian = (htonl(1u) == 1u);
-  return host_is_big_endian ? A : byte_reverse64(A);
-}
-
-// Remove every character that isspace() classifies as whitespace from key,
-// in place.
-inline void removeWhitespace(std::string &key)
-{
-  // isspace() has undefined behaviour for negative arguments other than EOF,
-  // which is what a plain char holding a non-ASCII byte becomes on platforms
-  // where char is signed.  Go through unsigned char first.
-  key.erase(std::remove_if(key.begin(), key.end(),
-                           [](char c) { return ::isspace(static_cast<unsigned char>(c)) != 0; }),
-            key.end());
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// Static class holding the parallel IO code
-// Could just use a namespace
-///////////////////////////////////////////////////////////////////////////////////////////////////
-class BinaryIO {
- public:
-
-  /////////////////////////////////////////////////////////////////////////////
-  // more byte manipulation helpers
-  /////////////////////////////////////////////////////////////////////////////
-
-  // Add the NERSC checksum of this rank's sites of lat into nersc_csum.
-  // The lattice is first unpacked into one scalar object per local site;
-  // nersc_csum is accumulated into, not reset.
-  template<class vobj> static inline void Uint32Checksum(Lattice<vobj> &lat,uint32_t &nersc_csum)
-  {
-    typedef typename vobj::scalar_object sobj;
-
-    GridBase *grid = lat._grid;
-    const uint64_t nsite = grid->lSites();
-
-    std::vector<sobj> sites(nsite);
-    unvectorizeToLexOrdArray(sites,lat);
-
-    NerscChecksum(grid,sites,nersc_csum);
-  }
-
-  // Add the NERSC checksum of fbuf into nersc_csum.
-  //
-  // Each element of fbuf is viewed as sizeof(fobj)/sizeof(uint32_t) 32-bit
-  // words and all words are summed with 32-bit wrap-around.  The result is
-  // ADDED to nersc_csum; the caller is responsible for zeroing it first.
-  //
-  //   grid       - supplies the local site count (grid->lSites())
-  //   fbuf       - data to checksum; a buffer of exactly one element is
-  //                treated as a single record regardless of the site count
-  //   nersc_csum - running checksum, updated in place
-  template <class fobj>
-  static inline void NerscChecksum(GridBase *grid, std::vector<fobj> &fbuf, uint32_t &nersc_csum)
-  {
-    // Number of whole 32-bit words per record (any remainder bytes are ignored).
-    const uint64_t size32 = sizeof(fobj) / sizeof(uint32_t);
-
-    uint64_t lsites = grid->lSites();
-    if (fbuf.size() == 1)
-    {
-      // Single-record buffer: only fbuf[0] exists, so only visit that.
-      lsites = 1;
-    }
-
-PARALLEL_REGION
-    {
-      // Partial sum declared inside the region, one per executing thread.
-      // NOTE(review): assumes PARALLEL_REGION opens a threaded region - confirm against the macro definition.
-      uint32_t nersc_csum_thr = 0;
-
-PARALLEL_FOR_LOOP_INTERN
-      for (uint64_t local_site = 0; local_site < lsites; local_site++)
-      {
-        uint32_t *site_buf = (uint32_t *)&fbuf[local_site];
-        for (uint64_t j = 0; j < size32; j++)
-        {
-          nersc_csum_thr = nersc_csum_thr + site_buf[j];
-        }
-      }
-
-      // Fold the partial sum into the caller's checksum.
-PARALLEL_CRITICAL
-      {
-        nersc_csum += nersc_csum_thr;
-      }
-    }
-  }
-
-  // XOR the two SciDAC checksums of fbuf into scidac_csuma / scidac_csumb.
-  //
-  // For every local site the CRC32 of the raw record is rotated left by
-  // (global site index % 29) for checksum A and by (global site index % 31)
-  // for checksum B, and the rotated values are XORed together.  The results
-  // are XORed INTO the two output arguments; the caller zeroes them first.
-  //
-  //   grid         - supplies dimensions, local/global extents and local offsets
-  //   fbuf         - records to checksum; a buffer of exactly one element is
-  //                  treated as a single record regardless of the site count
-  //   scidac_csuma - running checksum A, updated in place
-  //   scidac_csumb - running checksum B, updated in place
-  template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb)
-  {
-    int nd = grid->_ndimension;
-
-    uint64_t lsites              =grid->lSites();
-    if (fbuf.size()==1) {
-      lsites=1;
-    }
-    std::vector<int> local_vol   =grid->LocalDimensions();
-    std::vector<int> local_start =grid->LocalStarts();
-    std::vector<int> global_vol  =grid->FullDimensions();
-
-PARALLEL_REGION
-    {
-      // Scratch state declared inside the region, one copy per executing thread.
-      std::vector<int> coor(nd);
-      uint32_t scidac_csuma_thr=0;
-      uint32_t scidac_csumb_thr=0;
-
-PARALLEL_FOR_LOOP_INTERN
-      for(uint64_t local_site=0;local_site<lsites;local_site++){
-
-        uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
-
-        /*
-         * Scidac csum  is rather more heavyweight
-         * FIXME -- 128^3 x 256 x 16 will overflow.
-         */
-
-        int global_site;
-
-        // Local lexicographic index -> local coordinate -> global coordinate
-        // -> global lexicographic index.
-        Lexicographic::CoorFromIndex(coor,local_site,local_vol);
-
-        for(int d=0;d<nd;d++) {
-          coor[d] = coor[d]+local_start[d];
-        }
-
-        Lexicographic::IndexFromCoor(coor,global_site,global_vol);
-
-        const uint32_t gsite29 = global_site%29;
-        const uint32_t gsite31 = global_site%31;
-
-        const uint32_t site_crc = crc32(0,(unsigned char *)site_buf,sizeof(fobj));
-
-        // Rotate left by gsiteNN.  A rotation by 0 must be special-cased:
-        // `x >> (32-0)` shifts a 32-bit value by its full width, which is
-        // undefined behaviour.
-        const uint32_t rot29 = gsite29 ? ((site_crc<<gsite29) | (site_crc>>(32-gsite29))) : site_crc;
-        const uint32_t rot31 = gsite31 ? ((site_crc<<gsite31) | (site_crc>>(32-gsite31))) : site_crc;
-
-        scidac_csuma_thr ^= rot29;
-        scidac_csumb_thr ^= rot31;
-      }
-
-      // Fold the partial results into the caller's checksums.
-PARALLEL_CRITICAL
-      {
-        scidac_csuma^= scidac_csuma_thr;
-        scidac_csumb^= scidac_csumb_thr;
-      }
-    }
-  }
-
-  // Host-to-file conversions (network order is big endian).
-  // Each forwards to the matching file-to-host routine, which performs the
-  // same byte swap in place.
-  // The byte count is 64-bit, like the routines forwarded to: callers pass
-  // sizeof(fobj)*iodata.size(), and a 32-bit parameter would silently
-  // truncate buffers of 4 GiB or more.
-  static inline void htobe32_v(void *file_object,uint64_t bytes){ be32toh_v(file_object,bytes);} 
-  static inline void htobe64_v(void *file_object,uint64_t bytes){ be64toh_v(file_object,bytes);} 
-  static inline void htole32_v(void *file_object,uint64_t bytes){ le32toh_v(file_object,bytes);} 
-  static inline void htole64_v(void *file_object,uint64_t bytes){ le64toh_v(file_object,bytes);} 
-
-  // Convert a buffer of big-endian (network order) 32-bit words to host
-  // order, in place.  bytes is the buffer length; only whole 32-bit words
-  // are converted.
-  static inline void be32toh_v(void *file_object,uint64_t bytes)
-  {
-    uint32_t * const words = (uint32_t *)file_object;
-    const uint64_t nword = bytes/sizeof(uint32_t);
-    parallel_for(uint64_t w=0;w<nword;w++){
-      words[w] = ntohl(words[w]);
-    }
-  }
-  // Convert a buffer of little-endian 32-bit words to host order, in place.
-  // Each word is byte-swapped into network (big-endian) order and then
-  // passed through ntohl().  bytes is the buffer length; only whole 32-bit
-  // words are converted.
-  //
-  // No scratch variable is declared outside the loop: a temporary declared
-  // before parallel_for would be shared by every thread if that macro expands
-  // to a threaded loop, which would be a data race.
-  static inline void le32toh_v(void *file_object,uint64_t bytes)
-  {
-    uint32_t *fp = (uint32_t *)file_object;
-
-    uint64_t count = bytes/sizeof(uint32_t);
-    parallel_for(uint64_t i=0;i<count;i++){
-      // little endian -> network order, then network -> host
-      fp[i] = ntohl(byte_reverse32(fp[i]));
-    }
-  }
-
-  // Convert a buffer of big-endian (network order) 64-bit words to host
-  // order, in place.  bytes is the buffer length; only whole 64-bit words
-  // are converted.
-  static inline void be64toh_v(void *file_object,uint64_t bytes)
-  {
-    uint64_t * const words = (uint64_t *)file_object;
-    const uint64_t nword = bytes/sizeof(uint64_t);
-    parallel_for(uint64_t w=0;w<nword;w++){
-      words[w] = Grid_ntohll(words[w]);
-    }
-  }
-  
-  // Convert a buffer of little-endian 64-bit words to host order, in place.
-  // Each word is byte-swapped into network (big-endian) order and then
-  // passed through Grid_ntohll().  bytes is the buffer length; only whole
-  // 64-bit words are converted.
-  //
-  // No scratch variable is declared outside the loop: temporaries declared
-  // before parallel_for would be shared by every thread if that macro expands
-  // to a threaded loop, which would be a data race.
-  static inline void le64toh_v(void *file_object,uint64_t bytes)
-  {
-    uint64_t *fp = (uint64_t *)file_object;
-
-    uint64_t count = bytes/sizeof(uint64_t);
-    parallel_for(uint64_t i=0;i<count;i++){
-      // little endian -> network order, then network -> host
-      fp[i] = Grid_ntohll(byte_reverse64(fp[i]));
-    }
-  }
-  /////////////////////////////////////////////////////////////////////////////
-  // Real action:
-  // Read or Write distributed lexico array of ANY object to a specific location in file 
-  //////////////////////////////////////////////////////////////////////////////////////
-
-  // Bit flags for the `control` argument of IOobject; combine with `|`.
-  // MASTER_APPEND: iodata must hold exactly one record; a write seeks to the
-  //                end of the file, a read seeks to the last sizeof(fobj) bytes.
-  static const int BINARYIO_MASTER_APPEND = 0x10;
-  // UNORDERED: not referenced by the code visible in this header section.
-  static const int BINARYIO_UNORDERED     = 0x08;
-  // LEXICOGRAPHIC: with more than one rank, IOobject uses the MPI-IO path.
-  static const int BINARYIO_LEXICOGRAPHIC = 0x04;
-  // READ / WRITE: direction of the transfer.
-  static const int BINARYIO_READ          = 0x02;
-  static const int BINARYIO_WRITE         = 0x01;
-
-  // Read or write a block of fixed-size records between `iodata` and `file`,
-  // computing the NERSC and SciDAC checksums along the way.
-  //
-  //   w            - value unused; only its type matters (sizeof(word) selects
-  //                  MPI_FLOAT vs MPI_DOUBLE for the MPI datatype)
-  //   grid         - supplies dimensions, processor layout, rank and the
-  //                  barrier / global reduction operations
-  //   iodata       - record buffer.  Must hold grid->lSites() records, or exactly
-  //                  one when BINARYIO_MASTER_APPEND is set.  It is byte-swapped
-  //                  IN PLACE: after a write it is left in file byte order.
-  //   file         - path of the file
-  //   offset       - byte offset of the data in the file.  Updated after a
-  //                  write to the position following the written data; left
-  //                  unchanged by a read.
-  //   format       - "IEEE32BIG", "IEEE32", "IEEE64BIG" or "IEEE64"; any other
-  //                  string performs no byte swapping
-  //   control      - bitwise OR of the BINARYIO_* flags
-  //   nersc_csum, scidac_csuma, scidac_csumb
-  //                - zeroed on entry, then set to the checksums; reduced over
-  //                  all ranks unless iodata holds a single record
-  //
-  // With BINARYIO_LEXICOGRAPHIC and more than one rank the transfer uses
-  // MPI-IO (asserts if built without it); otherwise it uses C++ streams,
-  // with each rank addressing offset + rank*lsites*sizeof(fobj).
-  //
-  // NOTE(review): on the stream write path every rank performs the write,
-  // including for BINARYIO_MASTER_APPEND - confirm this is intended when
-  // more than one rank is running.
-  template<class word,class fobj>
-  static inline void IOobject(word w,
-                              GridBase *grid,
-                              std::vector<fobj> &iodata,
-                              std::string file,
-                              uint64_t& offset,
-                              const std::string &format, int control,
-                              uint32_t &nersc_csum,
-                              uint32_t &scidac_csuma,
-                              uint32_t &scidac_csumb)
-  {
-    grid->Barrier();
-    GridStopWatch timer;
-    GridStopWatch bstimer;
-
-    nersc_csum=0;
-    scidac_csuma=0;
-    scidac_csumb=0;
-
-    int ndim                 = grid->Dimensions();
-    int nrank                = grid->ProcessorCount();
-    int myrank               = grid->ThisRank();
-
-    std::vector<int>  psizes = grid->ProcessorGrid();
-    std::vector<int>  pcoor  = grid->ThisProcessorCoor();
-    std::vector<int> gLattice= grid->GlobalDimensions();
-    std::vector<int> lLattice= grid->LocalDimensions();
-
-    std::vector<int> lStart(ndim);
-    std::vector<int> gStart(ndim);
-
-    // Flatten the file
-    uint64_t lsites = grid->lSites();
-    if ( control & BINARYIO_MASTER_APPEND )  {
-      assert(iodata.size()==1);
-    } else {
-      assert(lsites==iodata.size());
-    }
-    for(int d=0;d<ndim;d++){
-      gStart[d] = lLattice[d]*pcoor[d];
-      lStart[d] = 0;
-    }
-
-#ifdef USE_MPI_IO
-    std::vector<int> distribs(ndim,MPI_DISTRIBUTE_BLOCK);
-    std::vector<int> dargs   (ndim,MPI_DISTRIBUTE_DFLT_DARG);
-    MPI_Datatype mpiObject;
-    MPI_Datatype fileArray;
-    MPI_Datatype localArray;
-    MPI_Datatype mpiword;
-    MPI_Offset disp = offset;
-    MPI_File fh ;
-    MPI_Status status;
-    int numword;
-
-    if ( sizeof( word ) == sizeof(float ) ) {
-      numword = sizeof(fobj)/sizeof(float);
-      mpiword = MPI_FLOAT;
-    } else {
-      numword = sizeof(fobj)/sizeof(double);
-      mpiword = MPI_DOUBLE;
-    }
-
-    //////////////////////////////////////////////////////////////////////////////
-    // Sobj in MPI phrasing
-    //////////////////////////////////////////////////////////////////////////////
-    int ierr;
-    ierr = MPI_Type_contiguous(numword,mpiword,&mpiObject);    assert(ierr==0);
-    ierr = MPI_Type_commit(&mpiObject);    assert(ierr==0);
-
-    //////////////////////////////////////////////////////////////////////////////
-    // File global array data type
-    //////////////////////////////////////////////////////////////////////////////
-    ierr=MPI_Type_create_subarray(ndim,&gLattice[0],&lLattice[0],&gStart[0],MPI_ORDER_FORTRAN, mpiObject,&fileArray);    assert(ierr==0);
-    ierr=MPI_Type_commit(&fileArray);    assert(ierr==0);
-
-    //////////////////////////////////////////////////////////////////////////////
-    // local lattice array
-    //////////////////////////////////////////////////////////////////////////////
-    ierr=MPI_Type_create_subarray(ndim,&lLattice[0],&lLattice[0],&lStart[0],MPI_ORDER_FORTRAN, mpiObject,&localArray);    assert(ierr==0);
-    ierr=MPI_Type_commit(&localArray);    assert(ierr==0);
-#endif
-
-    //////////////////////////////////////////////////////////////////////////////
-    // Byte order
-    //////////////////////////////////////////////////////////////////////////////
-    int ieee32big = (format == std::string("IEEE32BIG"));
-    int ieee32    = (format == std::string("IEEE32"));
-    int ieee64big = (format == std::string("IEEE64BIG"));
-    int ieee64    = (format == std::string("IEEE64"));
-
-    //////////////////////////////////////////////////////////////////////////////
-    // Do the I/O
-    //////////////////////////////////////////////////////////////////////////////
-    if ( control & BINARYIO_READ ) {
-
-      timer.Start();
-
-      if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
-#ifdef USE_MPI_IO
-        std::cout<< GridLogMessage<<"IOobject: MPI read I/O "<< file<< std::endl;
-        ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);    assert(ierr==0);
-        ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);    assert(ierr==0);
-        ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status);    assert(ierr==0);
-        MPI_File_close(&fh);
-#else
-        assert(0);
-#endif
-      } else {
-        std::cout << GridLogMessage <<"IOobject: C++ read I/O " << file << " : "
-                  << iodata.size() * sizeof(fobj) << " bytes" << std::endl;
-        std::ifstream fin;
-        fin.open(file, std::ios::binary | std::ios::in);
-        if (control & BINARYIO_MASTER_APPEND)
-        {
-          // The single record is the last sizeof(fobj) bytes of the file.
-          // Convert to a signed offset before negating: negating the unsigned
-          // sizeof result directly yields a huge positive value.
-          fin.seekg(-static_cast<std::streamoff>(sizeof(fobj)), fin.end);
-        }
-        else
-        {
-          fin.seekg(offset + myrank * lsites * sizeof(fobj));
-        }
-        fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj));
-        assert(fin.fail() == 0);
-        fin.close();
-      }
-      timer.Stop();
-
-      grid->Barrier();
-
-      // SciDAC checksum is taken on the data as stored in the file (before
-      // the byte swap); the NERSC checksum on host-order data (after it).
-      bstimer.Start();
-      ScidacChecksum(grid,iodata,scidac_csuma,scidac_csumb);
-      if (ieee32big) be32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
-      if (ieee32)    le32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
-      if (ieee64big) be64toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
-      if (ieee64)    le64toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
-      NerscChecksum(grid,iodata,nersc_csum);
-      bstimer.Stop();
-    }
-
-    if ( control & BINARYIO_WRITE ) {
-
-      // Mirror image of the read: NERSC checksum on host-order data, then
-      // byte swap, then SciDAC checksum on the data as it will be stored.
-      bstimer.Start();
-      NerscChecksum(grid,iodata,nersc_csum);
-      if (ieee32big) htobe32_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
-      if (ieee32)    htole32_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
-      if (ieee64big) htobe64_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
-      if (ieee64)    htole64_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
-      ScidacChecksum(grid,iodata,scidac_csuma,scidac_csumb);
-      bstimer.Stop();
-
-      grid->Barrier();
-
-      timer.Start();
-      if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
-#ifdef USE_MPI_IO
-        std::cout << GridLogMessage <<"IOobject: MPI write I/O " << file << std::endl;
-        ierr = MPI_File_open(grid->communicator, (char *)file.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
-        if (ierr != MPI_SUCCESS)
-        {
-          // Report both the error class and the specific error, then abort.
-          char error_string[BUFSIZ];
-          int length_of_error_string, error_class;
-
-          MPI_Error_class(ierr, &error_class);
-          MPI_Error_string(error_class, error_string, &length_of_error_string);
-          fprintf(stderr, "%3d: %s\n", myrank, error_string);
-          MPI_Error_string(ierr, error_string, &length_of_error_string);
-          fprintf(stderr, "%3d: %s\n", myrank, error_string);
-          MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0);
-        }
-
-        std::cout << GridLogDebug << "MPI write I/O set view " << file << std::endl;
-        ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
-        assert(ierr == 0);
-
-        std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl;
-        ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
-        assert(ierr == 0);
-
-        // Hand the absolute byte position after the write back to the caller.
-        MPI_Offset os;
-        MPI_File_get_position(fh, &os);
-        MPI_File_get_byte_offset(fh, os, &disp);
-        offset = disp;
-
-        MPI_File_close(&fh);
-#else
-        assert(0);
-#endif
-      } else {
-
-        std::cout << GridLogMessage << "IOobject: C++ write I/O " << file << " : "
-                  << iodata.size() * sizeof(fobj) << " bytes and offset " << offset << std::endl;
-
-        std::ofstream fout;
-        fout.exceptions ( std::fstream::failbit | std::fstream::badbit );
-        try {
-          if (offset) { // Must already exist and contain data
-            fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
-          } else {     // Allow create
-            fout.open(file,std::ios::binary|std::ios::out);
-          }
-        } catch (const std::fstream::failure& exc) {
-          std::cout << GridLogError << "Error in opening the file " << file << " for output" <<std::endl;
-          std::cout << GridLogError << "Exception description: " << exc.what() << std::endl;
-#ifdef USE_MPI_IO
-          MPI_Abort(MPI_COMM_WORLD,1);
-#else
-          exit(1);
-#endif
-        }
-
-        if ( control & BINARYIO_MASTER_APPEND )  {
-          try {
-            fout.seekp(0,fout.end);
-          } catch (const std::fstream::failure& exc) {
-            std::cout << "Exception in seeking file end " << file << std::endl;
-          }
-        } else {
-          try {
-            fout.seekp(offset+myrank*lsites*sizeof(fobj));
-          } catch (const std::fstream::failure& exc) {
-            std::cout << "Exception in seeking file " << file <<" offset "<< offset << std::endl;
-          }
-        }
-
-        try {
-          fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));//assert( fout.fail()==0);
-        }
-        catch (const std::fstream::failure& exc) {
-          std::cout << "Exception in writing file " << file << std::endl;
-          std::cout << GridLogError << "Exception description: "<< exc.what() << std::endl;
-#ifdef USE_MPI_IO
-          MPI_Abort(MPI_COMM_WORLD,1);
-#else
-          exit(1);
-#endif
-        }
-        offset  = fout.tellp();
-        fout.close();
-      }
-      timer.Stop();
-    }
-
-#ifdef USE_MPI_IO
-    // Release the derived datatypes on every path.  They are created
-    // unconditionally above, so freeing them only inside the MPI-IO branches
-    // leaked fileArray/localArray on each stream-I/O call and leaked
-    // mpiObject on every call.
-    MPI_Type_free(&fileArray);
-    MPI_Type_free(&localArray);
-    MPI_Type_free(&mpiObject);
-#endif
-
-    std::cout<<GridLogMessage<<"IOobject: ";
-    if ( control & BINARYIO_READ) std::cout << " read  ";
-    else                          std::cout << " write ";
-    uint64_t bytes = sizeof(fobj)*iodata.size()*nrank;
-    // bytes per microsecond is numerically MB/s
-    std::cout<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-             << (double)bytes/ (double)timer.useconds() <<" MB/s "<<std::endl;
-
-    std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed()  <<std::endl;
-
-    //////////////////////////////////////////////////////////////////////////////
-    // Safety check
-    //////////////////////////////////////////////////////////////////////////////
-    // if the data size is 1 we do not want to sum over the MPI ranks
-    if (iodata.size() != 1){
-      grid->Barrier();
-      grid->GlobalSum(nersc_csum);
-      grid->GlobalXOR(scidac_csuma);
-      grid->GlobalXOR(scidac_csumb);
-      grid->Barrier();
-    }
-  }
-
-  /////////////////////////////////////////////////////////////////////////////
-  // Read a Lattice of object
-  //////////////////////////////////////////////////////////////////////////////////////
-  // Read a lattice field from `file` starting at byte `offset`.
-  // One fobj record per local site is read through IOobject (lexicographic
-  // layout, byte order given by `format`), each record is converted to a
-  // scalar object with munge(file_record, scalar_object), and the result is
-  // packed into Umu.  The three checksum arguments receive the values
-  // computed by IOobject.
-  template<class vobj,class fobj,class munger>
-  static inline void readLatticeObject(Lattice<vobj> &Umu,
-                                       std::string file,
-                                       munger munge,
-                                       uint64_t offset,
-                                       const std::string &format,
-                                       uint32_t &nersc_csum,
-                                       uint32_t &scidac_csuma,
-                                       uint32_t &scidac_csumb)
-  {
-    typedef typename vobj::scalar_object sobj;
-    typedef typename vobj::Realified::scalar_type word;
-    word w=0;   // only the type is used by IOobject
-
-    GridBase *grid = Umu._grid;
-    const uint64_t nsite = grid->lSites();
-
-    std::vector<sobj> sitedata(nsite);
-    std::vector<fobj> filedata(nsite); // checksummed and byte-swapped in place by IOobject
-
-    const int control = BINARYIO_READ|BINARYIO_LEXICOGRAPHIC;
-    IOobject(w,grid,filedata,file,offset,format,control,
-             nersc_csum,scidac_csuma,scidac_csumb);
-
-    GridStopWatch timer;
-    timer.Start();
-
-    // File representation -> in-memory scalar objects, site by site.
-    parallel_for(uint64_t s=0;s<nsite;s++) munge(filedata[s], sitedata[s]);
-
-    vectorizeFromLexOrdArray(sitedata,Umu);
-    grid->Barrier();
-
-    timer.Stop();
-    std::cout<<GridLogMessage<<"readLatticeObject: vectorize overhead "<<timer.Elapsed()  <<std::endl;
-  }
-
-  /////////////////////////////////////////////////////////////////////////////
-  // Write a Lattice of object
-  //////////////////////////////////////////////////////////////////////////////////////
-  // Write a lattice field to `file` starting at byte `offset`.
-  // Umu is unpacked into one scalar object per local site, each is converted
-  // to its file representation with munge(scalar_object, file_record), and
-  // the records are written through IOobject (lexicographic layout, byte
-  // order given by `format`).  The three checksum arguments receive the
-  // values computed by IOobject.  `offset` is taken by value, so the updated
-  // file position is not passed back to the caller.
-  template<class vobj,class fobj,class munger>
-    static inline void writeLatticeObject(Lattice<vobj> &Umu,
-                                          std::string file,
-                                          munger munge,
-                                          uint64_t offset,
-                                          const std::string &format,
-                                          uint32_t &nersc_csum,
-                                          uint32_t &scidac_csuma,
-                                          uint32_t &scidac_csumb)
-  {
-    typedef typename vobj::scalar_object sobj;
-    typedef typename vobj::Realified::scalar_type word;
-    word w=0;   // only the type is used by IOobject
-
-    GridBase *grid = Umu._grid;
-    const uint64_t nsite = grid->lSites();
-
-    std::vector<sobj> sitedata(nsite);
-    std::vector<fobj> filedata(nsite); // checksummed and byte-swapped in place by IOobject
-
-    // Unpack the lattice and munge each site (e.g. 3rd row reconstruction)
-    // into its file representation; this is the part that is timed.
-    GridStopWatch timer;
-    timer.Start();
-    unvectorizeToLexOrdArray(sitedata,Umu);
-
-    parallel_for(uint64_t s=0;s<nsite;s++) munge(sitedata[s],filedata[s]);
-
-    grid->Barrier();
-    timer.Stop();
-
-    const int control = BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC;
-    IOobject(w,grid,filedata,file,offset,format,control,
-             nersc_csum,scidac_csuma,scidac_csumb);
-
-    std::cout<<GridLogMessage<<"writeLatticeObject: unvectorize overhead "<<timer.Elapsed()  <<std::endl;
-  }
-  
-  /////////////////////////////////////////////////////////////////////////////
-  // Read a RNG;  use IOobject and lexico map to an array of state 
-  //////////////////////////////////////////////////////////////////////////////////////
-  // Restore the parallel and serial RNG states from `file`.
-  // The parallel generator's per-site states are read first (lexicographic
-  // layout starting at `offset`); the serial generator's state is then read
-  // as the single record at the end of the file.  Both reads use the
-  // "IEEE32BIG" format.  On return the checksum arguments hold the lattice
-  // checksums combined with those of the serial record (sum for NERSC,
-  // XOR for SciDAC).
-  static inline void readRNG(GridSerialRNG &serial,
-                             GridParallelRNG &parallel,
-                             std::string file,
-                             uint64_t offset,
-                             uint32_t &nersc_csum,
-                             uint32_t &scidac_csuma,
-                             uint32_t &scidac_csumb)
-  {
-    typedef typename GridSerialRNG::RngStateType RngStateType;
-    const int RngStateCount = GridSerialRNG::RngStateCount;
-    typedef std::array<RngStateType,RngStateCount> RNGstate;
-    typedef RngStateType word;
-    word w=0;   // only the type is used by IOobject
-
-    const std::string format("IEEE32BIG");
-
-    GridBase *grid = parallel._grid;
-    uint64_t gsites = grid->gSites();   // not used below
-    const uint64_t nsite = grid->lSites();
-
-    // Checksums of the trailing serial-RNG record.
-    uint32_t serial_nersc   = 0;
-    uint32_t serial_scidaca = 0;
-    uint32_t serial_scidacb = 0;
-
-    GridStopWatch timer;
-
-    std::cout << GridLogMessage << "RNG read I/O on file " << file << std::endl;
-
-    // Parallel generator: one state record per local site.
-    std::vector<RNGstate> iodata(nsite);
-    IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
-             nersc_csum,scidac_csuma,scidac_csumb);
-
-    timer.Start();
-    parallel_for(uint64_t site=0;site<nsite;site++){
-      std::vector<RngStateType> state(iodata[site].begin(),iodata[site].end());
-      parallel.SetState(state,site);
-    }
-    timer.Stop();
-
-    // Serial generator: the single record at the end of the file.
-    iodata.resize(1);
-    IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_MASTER_APPEND,
-             serial_nersc,serial_scidaca,serial_scidacb);
-
-    {
-      std::vector<RngStateType> state(iodata[0].begin(),iodata[0].end());
-      serial.SetState(state,0);
-    }
-
-    nersc_csum   += serial_nersc;
-    scidac_csuma ^= serial_scidaca;
-    scidac_csumb ^= serial_scidacb;
-
-    std::cout << GridLogMessage << "RNG file nersc_checksum   " << std::hex << nersc_csum << std::dec << std::endl;
-    std::cout << GridLogMessage << "RNG file scidac_checksuma " << std::hex << scidac_csuma << std::dec << std::endl;
-    std::cout << GridLogMessage << "RNG file scidac_checksumb " << std::hex << scidac_csumb << std::dec << std::endl;
-
-    std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl;
-  }
-  /////////////////////////////////////////////////////////////////////////////
-  // Write a RNG; lexico map to an array of state and use IOobject
-  //////////////////////////////////////////////////////////////////////////////////////
-  // Save the parallel and serial RNG states to `file`.
-  // The parallel generator's per-site states are written first (lexicographic
-  // layout starting at `offset`); the serial generator's state is then
-  // written as a single record at the end of the file.  Both writes use the
-  // "IEEE32BIG" format.  On return the checksum arguments hold the lattice
-  // checksums combined with those of the serial record (sum for NERSC,
-  // XOR for SciDAC).
-  static inline void writeRNG(GridSerialRNG &serial,
-                              GridParallelRNG &parallel,
-                              std::string file,
-                              uint64_t offset,
-                              uint32_t &nersc_csum,
-                              uint32_t &scidac_csuma,
-                              uint32_t &scidac_csumb)
-  {
-    typedef typename GridSerialRNG::RngStateType RngStateType;
-    typedef RngStateType word;
-    word w=0;   // only the type is used by IOobject
-    const int RngStateCount = GridSerialRNG::RngStateCount;
-    typedef std::array<RngStateType,RngStateCount> RNGstate;
-
-    GridBase *grid = parallel._grid;
-    uint64_t gsites = grid->gSites();   // not used below
-    const uint64_t nsite = grid->lSites();
-
-    // Checksums of the trailing serial-RNG record; IOobject zeroes them
-    // before use.
-    uint32_t serial_nersc;
-    uint32_t serial_scidaca;
-    uint32_t serial_scidacb;
-
-    GridStopWatch timer;
-    const std::string format("IEEE32BIG");
-
-    std::cout << GridLogMessage << "RNG write I/O on file " << file << std::endl;
-
-    // Parallel generator: gather one state record per local site.
-    timer.Start();
-    std::vector<RNGstate> iodata(nsite);
-    parallel_for(uint64_t site=0;site<nsite;site++){
-      std::vector<RngStateType> state(RngStateCount);
-      parallel.GetState(state,site);
-      std::copy(state.begin(),state.end(),iodata[site].begin());
-    }
-    timer.Stop();
-
-    IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
-             nersc_csum,scidac_csuma,scidac_csumb);
-
-    // Serial generator: a single record appended at the end of the file.
-    iodata.resize(1);
-    {
-      std::vector<RngStateType> state(RngStateCount);
-      serial.GetState(state,0);
-      std::copy(state.begin(),state.end(),iodata[0].begin());
-    }
-    IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_MASTER_APPEND,
-             serial_nersc,serial_scidaca,serial_scidacb);
-
-    nersc_csum   += serial_nersc;
-    scidac_csuma ^= serial_scidaca;
-    scidac_csumb ^= serial_scidacb;
-
-    std::cout << GridLogMessage << "RNG file checksum " << std::hex << nersc_csum    << std::dec << std::endl;
-    std::cout << GridLogMessage << "RNG file checksuma " << std::hex << scidac_csuma << std::dec << std::endl;
-    std::cout << GridLogMessage << "RNG file checksumb " << std::hex << scidac_csumb << std::dec << std::endl;
-    std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl;
-  }
-};
-}
-#endif
@@ -1,875 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/parallelIO/IldgIO.h
-
-Copyright (C) 2015
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef GRID_ILDG_IO_H
-#define GRID_ILDG_IO_H
-
-#ifdef HAVE_LIME
-#include <algorithm>
-#include <fstream>
-#include <iomanip>
-#include <iostream>
-#include <map>
-
-#include <pwd.h>
-#include <sys/utsname.h>
-#include <unistd.h>
-
-//C-Lime is a must have for this functionality
-extern "C" {  
-#include "lime.h"
-}
-
-namespace Grid {
-namespace QCD {
-
-  /////////////////////////////////
-  // Encode word types as strings
-  /////////////////////////////////
- // Map a scalar word type onto the short mnemonic embedded in SciDAC record
- // metadata. Types without a specialisation below yield "unknown".
- template<class word> inline std::string ScidacWordMnemonic(void)
- {
-   return "unknown";
- }
- // Floating point words
- template<> inline std::string ScidacWordMnemonic<double>(void)   { return "D"; }
- template<> inline std::string ScidacWordMnemonic<float>(void)    { return "F"; }
- // 32 bit integer words
- template<> inline std::string ScidacWordMnemonic<int32_t>(void)  { return "I32_t"; }
- template<> inline std::string ScidacWordMnemonic<uint32_t>(void) { return "U32_t"; }
- // 64 bit integer words
- template<> inline std::string ScidacWordMnemonic<int64_t>(void)  { return "I64_t"; }
- template<> inline std::string ScidacWordMnemonic<uint64_t>(void) { return "U64_t"; }
-
-  /////////////////////////////////////////
-  // Encode a generic tensor as a string
-  /////////////////////////////////////////
- // Build the "GRID_<word>..." datatype name for tensor type vobj.
- //
- // Outputs:
- //   colors    - indexRank of the colour index
- //   spins     - indexRank of the spin index
- //   typesize  - sizeof(vobj::scalar_type) scaled by the colour and spin extents
- //               (squared for a matrix index); the Lorentz extent is NOT folded in
- //   datacount - indexRank of the Lorentz index
- // Returns the assembled datatype string.
- template<class vobj> std::string ScidacRecordTypeString(int &colors, int &spins, int & typesize,int &datacount) {
-
-   typedef typename getPrecision<vobj>::real_scalar_type stype;
-
-   // Rank of each tensor index
-   const int nLorentz = indexRank<LorentzIndex,vobj>();
-   const int nSpin    = indexRank<SpinIndex,vobj>();
-   const int nColour  = indexRank<ColourIndex,vobj>();
-
-   // Tensor character (scalar / vector / matrix) of each index
-   const int lorentzIsScalar = isScalar<LorentzIndex,vobj>();
-   const int lorentzIsVector = isVector<LorentzIndex,vobj>();
-   const int lorentzIsMatrix = isMatrix<LorentzIndex,vobj>();
-
-   const int spinIsScalar = isScalar<SpinIndex,vobj>();
-   const int spinIsVector = isVector<SpinIndex,vobj>();
-   const int spinIsMatrix = isMatrix<SpinIndex,vobj>();
-
-   const int colourIsScalar = isScalar<ColourIndex,vobj>();
-   const int colourIsVector = isVector<ColourIndex,vobj>();
-   const int colourIsMatrix = isMatrix<ColourIndex,vobj>();
-
-   // Name is assembled Lorentz first, then spin, then colour
-   std::stringstream name;
-   name << "GRID_" << ScidacWordMnemonic<stype>();
-
-   if ( lorentzIsVector ) name << "_LorentzVector" << nLorentz;
-   if ( lorentzIsMatrix ) name << "_LorentzMatrix" << nLorentz;
-
-   if ( spinIsVector )    name << "_SpinVector" << nSpin;
-   if ( spinIsMatrix )    name << "_SpinMatrix" << nSpin;
-
-   if ( colourIsVector )  name << "_ColourVector" << nColour;
-   if ( colourIsMatrix )  name << "_ColourMatrix" << nColour;
-
-   // A pure scalar in every index is tagged as a plain complex number
-   if ( colourIsScalar && lorentzIsScalar && spinIsScalar ) name << "_Complex";
-
-   // Byte size: colour and spin extents only
-   typesize  = sizeof(typename vobj::scalar_type);
-   typesize *= ( colourIsMatrix ? nColour*nColour : nColour );
-   typesize *= ( spinIsMatrix   ? nSpin*nSpin     : nSpin   );
-
-   colors    = nColour;
-   spins     = nSpin;
-   datacount = nLorentz;
-
-   return name.str();
- }
- 
- // Convenience overload: the lattice argument is used only to deduce vobj;
- // its contents are not read. Forwards to the type-only overload.
- template<class vobj> std::string ScidacRecordTypeString(Lattice<vobj> & lat,int &colors, int &spins, int & typesize,int &datacount) { 
-   return ScidacRecordTypeString<vobj>(colors,spins,typesize,datacount);
- };
-
-
- ////////////////////////////////////////////////////////////
- // Helper to fill out metadata
- ////////////////////////////////////////////////////////////
- // Populate the three metadata objects that accompany a lattice field:
- //   header        - filled by PrepareMetaData(field,header)
- //   _scidacFile   - rebuilt from the field's grid
- //   _scidacRecord - overwritten with a freshly built record describing the
- //                   field's datatype, precision and creation date
- template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
-					  FieldMetaData &header,
-					  scidacRecord & _scidacRecord,
-					  scidacFile   & _scidacFile) 
- {
-   typedef typename getPrecision<vobj>::real_scalar_type stype;
-
-   // Grid's own metadata first: the record date below is taken from it
-   PrepareMetaData(field,header);
-
-   // SciDAC private file structure
-   _scidacFile = scidacFile(field._grid);
-
-   // SciDAC private record structure, built in a local and then copied out
-   // so that every member of the caller's record is replaced
-   scidacRecord record;
-   record.recordtype = GRID_IO_FIELD;
-   record.precision  = ScidacWordMnemonic<stype>();
-   record.date       = header.creation_date;
-   record.datatype   = ScidacRecordTypeString(field,record.colors,record.spins,record.typesize,record.datacount);
-
-   _scidacRecord = record;
- }
- 
- ///////////////////////////////////////////////////////
- // Scidac checksum
- ///////////////////////////////////////////////////////
- // Compare the two hexadecimal checksum strings held in scidacChecksum_ with
- // the freshly computed values. Returns 1 when both agree, 0 otherwise.
- static int scidacChecksumVerify(scidacChecksum &scidacChecksum_,uint32_t scidac_csuma,uint32_t scidac_csumb)
- {
-   // Both strings are parsed (base 16) before any comparison is made
-   const uint32_t stored_suma = stoull(scidacChecksum_.suma,0,16);
-   const uint32_t stored_sumb = stoull(scidacChecksum_.sumb,0,16);
-
-   const bool both_match = (scidac_csuma == stored_suma) && (scidac_csumb == stored_sumb);
-   return both_match ? 1 : 0;
- }
-
-////////////////////////////////////////////////////////////////////////////////////
-// Lime, ILDG and Scidac I/O classes
-////////////////////////////////////////////////////////////////////////////////////
-class GridLimeReader : public BinaryIO {
- public:
-   ///////////////////////////////////////////////////
-   // FIXME: format for RNG? Now just binary out instead
-   ///////////////////////////////////////////////////
-
-   FILE       *File;
-   LimeReader *LimeR;
-   std::string filename;
-
-   /////////////////////////////////////////////
-   // Open the file
-   /////////////////////////////////////////////
-   // Remember the file name, open it for reading and attach a Lime reader.
-   // Prints a diagnostic and aborts if the file cannot be opened.
-   void open(const std::string &_filename) 
-   {
-     filename = _filename;
-     File     = fopen(filename.c_str(), "r");
-
-     if (File != nullptr) {
-       LimeR = limeCreateReader(File);
-       return;
-     }
-
-     // Open failed: nothing sensible can be done without the file
-     std::cerr << "cannot open file '" << filename << "'" << std::endl;
-     abort();
-   }
-   /////////////////////////////////////////////
-   // Close the file
-   /////////////////////////////////////////////
-   // Closes the underlying FILE handle only.
-   // NOTE(review): the LimeReader created in open() is not destroyed here (the
-   // call is commented out) -- presumably deliberate; confirm before enabling.
-   void close(void){
-     fclose(File);
-     //     limeDestroyReader(LimeR);
-   }
-
-  ////////////////////////////////////////////
-  // Read a generic lattice field and verify checksum
-  ////////////////////////////////////////////
-  // Advance through the Lime records until one whose type begins with
-  // record_name is found, read its payload into field via BinaryIO, then read
-  // the following scidac checksum record and assert that it matches the
-  // checksums computed during the read.
-  //
-  // The record payload size must equal sizeof(sobj) * global sites (asserted).
-  // NOTE(review): if no record matches, the loop ends and the function returns
-  // without touching field and without any diagnostic.
-  template<class vobj>
-  void readLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
-  {
-    typedef typename vobj::scalar_object sobj;
-    scidacChecksum scidacChecksum_;
-    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
-
-    std::string format = getFormatString<vobj>();
-
-    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
-
-      uint64_t file_bytes =limeReaderBytes(LimeR);
-
-      //      std::cout << GridLogMessage << limeReaderType(LimeR) << " "<< file_bytes <<" bytes "<<std::endl;
-      //      std::cout << GridLogMessage<< " readLimeObject seeking "<<  record_name <<" found record :" <<limeReaderType(LimeR) <<std::endl;
-
-      // Prefix comparison: only the first strlen(record_name) characters are checked
-      if ( !strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) )  ) {
-
-	//	std::cout << GridLogMessage<< " readLimeLatticeBinaryObject matches ! " <<std::endl;
-
-	uint64_t PayloadSize = sizeof(sobj) * field._grid->_gsites;
-
-	//	std::cout << "R sizeof(sobj)= " <<sizeof(sobj)<<std::endl;
-	//	std::cout << "R Gsites " <<field._grid->_gsites<<std::endl;
-	//	std::cout << "R Payload expected " <<PayloadSize<<std::endl;
-	//	std::cout << "R file size " <<file_bytes <<std::endl;
-
-	assert(PayloadSize == file_bytes);// Must match or user error
-
-	// Current position of the FILE handle is passed to BinaryIO as the
-	// offset at which to read from the same file by name
-	uint64_t offset= ftello(File);
-	//	std::cout << " ReadLatticeObject from offset "<<offset << std::endl;
-	BinarySimpleMunger<sobj,sobj> munge;
-	BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
-
-	/////////////////////////////////////////////
-	// Insist checksum is next record
-	/////////////////////////////////////////////
-	readLimeObject(scidacChecksum_,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
-
-	/////////////////////////////////////////////
-	// Verify checksums
-	/////////////////////////////////////////////
-	assert(scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb)==1);
-	return;
-      }
-    }
-  }
-  ////////////////////////////////////////////
-  // Read a generic serialisable object
-  ////////////////////////////////////////////
-  // Advance through the Lime records until one whose type begins with
-  // record_name is found and copy its payload (up to the first NUL byte) into
-  // xmlstring. Asserts if the records run out without a match.
-  void readLimeObject(std::string &xmlstring,std::string record_name)
-  {
-    const char  *wanted     = record_name.c_str();
-    const size_t wanted_len = strlen(wanted);
-
-    // should this be a do while; can we miss a first record??
-    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
-
-      uint64_t nbytes = limeReaderBytes(LimeR);// size of this record
-
-      // Not the record we are after: keep scanning
-      if ( strncmp(limeReaderType(LimeR), wanted, wanted_len) != 0 ) continue;
-
-      // One extra zero byte guarantees the buffer is NUL terminated
-      std::vector<char> payload(nbytes+1,'\0');
-      limeReaderReadData((void *)payload.data(), &nbytes, LimeR);
-
-      xmlstring = std::string(payload.data());
-      return;
-    }
-    assert(0);
-  }
-
-  // Read the next record matching record_name as text, then deserialise the
-  // element called object_name from it into object using an XmlReader.
-  template<class serialisable_object>
-  void readLimeObject(serialisable_object &object,std::string object_name,std::string record_name)
-  {
-    std::string xmlstring;
-
-    readLimeObject(xmlstring, record_name);
-	  XmlReader RD(xmlstring, true, "");
-	  read(RD,object_name,object);
-  }
-};
-
-class GridLimeWriter : public BinaryIO 
-{
- public:
-
-   ///////////////////////////////////////////////////
-   // FIXME: format for RNG? Now just binary out instead
-   // FIXME: collective calls or not ?
-   //      : must know if I am the I/O boss
-   ///////////////////////////////////////////////////
-   FILE       *File;
-   LimeWriter *LimeW;
-   std::string filename;
-   bool        boss_node;
-   GridLimeWriter( bool isboss = true) {
-     boss_node = isboss;
-   }
-   /////////////////////////////////////////////
-   // Open the file
-   // Only the boss node opens the file and creates the Lime writer; on other
-   // nodes only the file name is recorded.
-   // Prints a diagnostic and aborts if the file cannot be opened, matching
-   // GridLimeReader::open, instead of handing a null FILE* to c-lime.
-   /////////////////////////////////////////////
-   void open(const std::string &_filename) { 
-     filename= _filename;
-     if ( boss_node ) {
-       File = fopen(filename.c_str(), "w");
-       if (File == nullptr)
-       {
-         std::cerr << "cannot open file '" << filename << "'" << std::endl;
-         abort();
-       }
-       LimeW = limeCreateWriter(File); assert(LimeW != NULL );
-     }
-   }
-   /////////////////////////////////////////////
-   // Close the file
-   /////////////////////////////////////////////
-   // Closes the FILE handle on the boss node; other nodes never opened it.
-   // NOTE(review): the LimeWriter is not destroyed here (call is commented out).
-   void close(void) {
-     if ( boss_node ) {
-       fclose(File);
-     }
-     //  limeDestroyWriter(LimeW);
-   }
-  ///////////////////////////////////////////////////////
-  // Lime utility functions
-  ///////////////////////////////////////////////////////
-  // Write a Lime record header of type `message` announcing PayloadSize bytes.
-  // MB / ME are passed straight through to limeCreateHeader.
-  // Only the boss node writes; every node returns LIME_SUCCESS.
-  int createLimeRecordHeader(std::string message, int MB, int ME, size_t PayloadSize)
-  {
-    if ( boss_node ) {
-      LimeRecordHeader *h = limeCreateHeader(MB, ME, const_cast<char *>(message.c_str()), PayloadSize);
-      assert(h != NULL);
-      // The write must happen outside assert(): with NDEBUG defined the
-      // assert expression is not evaluated and the header would be dropped.
-      int err = limeWriteRecordHeader(h, LimeW); assert(err>=0);
-      limeDestroyHeader(h);
-    }
-    return LIME_SUCCESS;
-  }
-  ////////////////////////////////////////////
-  // Write a generic serialisable object
-  ////////////////////////////////////////////
-  // Emit the XML document held by writer as one complete Lime record of type
-  // record_name (header, data, close). Does nothing on non-boss nodes.
-  // object_name is not used by this overload.
-  void writeLimeObject(int MB,int ME,XmlWriter &writer,std::string object_name,std::string record_name)
-  {
-    if ( !boss_node ) return;
-
-    std::string xml     = writer.docString();
-    uint64_t    xml_len = xml.size();
-
-    LimeRecordHeader *header = limeCreateHeader(MB, ME,const_cast<char *>(record_name.c_str()), xml_len);
-    assert(header!= NULL);
-
-    int status;
-    status = limeWriteRecordHeader(header, LimeW);
-    assert(status>=0);
-    status = limeWriteRecordData(&xml[0], &xml_len, LimeW);
-    assert(status>=0);
-    status = limeWriterCloseRecord(LimeW);
-    assert(status>=0);
-
-    limeDestroyHeader(header);
-  }
-
-  // Serialise object under the element name object_name into an XML document
-  // and write it as a Lime record of type record_name.
-  // A non-zero scientificPrec switches the writer to scientific format with
-  // that precision; zero leaves the writer's defaults untouched.
-  template<class serialisable_object>
-  void writeLimeObject(int MB,int ME,serialisable_object &object,std::string object_name,std::string record_name, const unsigned int scientificPrec = 0)
-  {
-    XmlWriter xml("","");
-
-    const bool wantScientific = (scientificPrec != 0);
-    if (wantScientific) {
-      xml.scientificFormat(true);
-      xml.setPrecision(scientificPrec);
-    }
-
-    write(xml,object_name,object);
-    writeLimeObject(MB, ME, xml, object_name, record_name);
-  }
-  ////////////////////////////////////////////////////
-  // Write a generic lattice field and csum
-  // This routine is Collectively called by all nodes
-  // in communicator used by the field._grid
-  ////////////////////////////////////////////////////
-  // Write field as a binary Lime record of type record_name, followed by a
-  // scidac checksum record carrying the two checksums returned by BinaryIO.
-  // Asserts that this writer's boss_node flag agrees with the grid's IsBoss().
-  template<class vobj>
-  void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
-  {
-    ////////////////////////////////////////////////////////////////////
-    // NB: FILE and iostream are jointly writing disjoint sequences in the
-    // the same file through different file handles (integer units).
-    // 
-    // These are both buffered, so why I think this code is right is as follows.
-    //
-    // i)  write record header to FILE *File, telegraphing the size; flush
-    // ii) ftello reads the offset from FILE *File . 
-    // iii) iostream / MPI Open independently seek this offset. Write sequence direct to disk.
-    //      Closes iostream and flushes.
-    // iv) fseek on FILE * to end of this disjoint section.
-    //  v) Continue writing scidac record.
-    ////////////////////////////////////////////////////////////////////
-    
-    GridBase *grid = field._grid;
-    assert(boss_node == field._grid->IsBoss() );
-
-    ////////////////////////////////////////////
-    // Create record header
-    ////////////////////////////////////////////
-    typedef typename vobj::scalar_object sobj;
-    int err;
-    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
-    uint64_t PayloadSize = sizeof(sobj) * grid->_gsites;
-    if ( boss_node ) {
-      createLimeRecordHeader(record_name, 0, 0, PayloadSize);
-      fflush(File);
-    }
-    
-    //    std::cout << "W sizeof(sobj)"      <<sizeof(sobj)<<std::endl;
-    //    std::cout << "W Gsites "           <<field._grid->_gsites<<std::endl;
-    //    std::cout << "W Payload expected " <<PayloadSize<<std::endl;
-
-    ////////////////////////////////////////////////
-    // Check all nodes agree on file position
-    ////////////////////////////////////////////////
-    // offset1 is only set on the boss; the broadcast below distributes it
-    uint64_t offset1;
-    if ( boss_node ) {
-      offset1 = ftello(File);    
-    }
-    grid->Broadcast(0,(void *)&offset1,sizeof(offset1));
-
-    ///////////////////////////////////////////
-    // The above is collective. Write by other means into the binary record
-    ///////////////////////////////////////////
-    std::string format = getFormatString<vobj>();
-    BinarySimpleMunger<sobj,sobj> munge;
-    BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb);
-
-    ///////////////////////////////////////////
-    // Wind forward and close the record
-    ///////////////////////////////////////////
-    if ( boss_node ) {
-      fseek(File,0,SEEK_END);             
-      uint64_t offset2 = ftello(File);     //    std::cout << " now at offset "<<offset2 << std::endl;
-      // The file must have grown by exactly the announced payload
-      assert( (offset2-offset1) == PayloadSize);
-    }
-
-    /////////////////////////////////////////////////////////////
-    // Check MPI-2 I/O did what we expect to file
-    /////////////////////////////////////////////////////////////
-
-    if ( boss_node ) { 
-      err=limeWriterCloseRecord(LimeW);  assert(err>=0);
-    }
-    ////////////////////////////////////////
-    // Write checksum element, propagating forward from the BinaryIO
-    // Always pair a checksum with a binary object, and close message
-    ////////////////////////////////////////
-    scidacChecksum checksum;
-    // Checksums are stored as hexadecimal text
-    std::stringstream streama; streama << std::hex << scidac_csuma;
-    std::stringstream streamb; streamb << std::hex << scidac_csumb;
-    checksum.suma= streama.str();
-    checksum.sumb= streamb.str();
-    if ( boss_node ) { 
-      writeLimeObject(0,1,checksum,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
-    }
-  }
-};
-
-// Writer for SciDAC-style Lime files, layered on GridLimeWriter.
-class ScidacWriter : public GridLimeWriter {
- public:
-
-  ScidacWriter(bool isboss =true ) : GridLimeWriter(isboss)  { };
-
-  // Write the file-level records: a scidacFile built from grid, followed by
-  // the caller's own file record. Records are written on the boss node only.
-  template<class SerialisableUserFile>
-  void writeScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile)
-  {
-    scidacFile    _scidacFile(grid);
-    if ( this->boss_node ) {
-      writeLimeObject(1,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
-      writeLimeObject(0,1,_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
-    }
-  }
-  ////////////////////////////////////////////////
-  // Write generic lattice field in scidac format
-  // Writes Grid metadata, the user record and the scidac private record on
-  // the boss node, then the binary payload plus checksum collectively.
-  // recordScientificPrec is forwarded only to the user record.
-  ////////////////////////////////////////////////
-  template <class vobj, class userRecord>
-  void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord,
-                              const unsigned int recordScientificPrec = 0) 
-  {
-    GridBase * grid = field._grid;
-
-    ////////////////////////////////////////
-    // fill the Grid header
-    ////////////////////////////////////////
-    FieldMetaData header;
-    scidacRecord  _scidacRecord;
-    scidacFile    _scidacFile;
-
-    ScidacMetaData(field,header,_scidacRecord,_scidacFile);
-
-    //////////////////////////////////////////////
-    // Fill the Lime file record by record
-    //////////////////////////////////////////////
-    if ( this->boss_node ) {
-      writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message 
-      writeLimeObject(0,0,_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML), recordScientificPrec);
-      writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
-    }
-    // Collective call
-    writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));      // Closes message with checksum
-  }
-};
-
-
-// Reader for SciDAC-style Lime files, layered on GridLimeReader.
-class ScidacReader : public GridLimeReader {
- public:
-
-   // Read the file-level records: the scidac private file record (into a
-   // local that is then discarded) and the caller's file record.
-   template<class SerialisableUserFile>
-   void readScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile)
-   {
-     scidacFile    _scidacFile(grid);
-     readLimeObject(_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
-     readLimeObject(_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
-   }
-  ////////////////////////////////////////////////
-  // Read generic lattice field in scidac format
-  // Reads, in order: Grid metadata, user record, scidac private record and
-  // the binary payload (whose checksum is verified by the binary reader).
-  // Only field and _userRecord are returned to the caller.
-  ////////////////////////////////////////////////
-  template <class vobj, class userRecord>
-  void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord) 
-  {
-    typedef typename vobj::scalar_object sobj;
-    GridBase * grid = field._grid;
-
-    ////////////////////////////////////////
-    // fill the Grid header
-    ////////////////////////////////////////
-    FieldMetaData header;
-    scidacRecord  _scidacRecord;
-    scidacFile    _scidacFile;
-
-    //////////////////////////////////////////////
-    // Fill the Lime file record by record
-    //////////////////////////////////////////////
-    readLimeObject(header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message 
-    readLimeObject(_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
-    readLimeObject(_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
-    readLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));
-  }
-  // Advance to the next binary data record and then on to the checksum
-  // record after it. Returns silently if no binary record is found.
-  void skipPastBinaryRecord(void) {
-    std::string rec_name(ILDG_BINARY_DATA);
-    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
-      if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) )  ) {
-	skipPastObjectRecord(std::string(SCIDAC_CHECKSUM));
-	return;
-      }
-    }    
-  }
-  // Advance until a record whose type begins with rec_name has been reached.
-  // Returns silently if the records run out first.
-  void skipPastObjectRecord(std::string rec_name) {
-    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
-      if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) )  ) {
-	return;
-      }
-    }
-  }
-  // Skip one complete field record, in the same record order that
-  // readScidacFieldRecord reads.
-  void skipScidacFieldRecord() {
-    skipPastObjectRecord(std::string(GRID_FORMAT));
-    skipPastObjectRecord(std::string(SCIDAC_RECORD_XML));
-    skipPastObjectRecord(std::string(SCIDAC_PRIVATE_RECORD_XML));
-    skipPastBinaryRecord();
-  }
-};
-
-
-class IldgWriter : public ScidacWriter {
- public:
-  
-  IldgWriter(bool isboss) : ScidacWriter(isboss) {};
-
-  ///////////////////////////////////
-  // A little helper
-  ///////////////////////////////////
-  // Write the logical file name LFN as one complete ILDG_DATA_LFN record
-  // (header, data, close).
-  // Only the boss node owns an open LimeWriter (see GridLimeWriter::open), so
-  // every use of LimeW is confined to the boss; other nodes do nothing.
-  void writeLimeIldgLFN(std::string &LFN)
-  {
-    if ( boss_node ) {
-      uint64_t PayloadSize = LFN.size();
-      int err;
-      createLimeRecordHeader(ILDG_DATA_LFN, 0 , 0, PayloadSize);
-      err=limeWriteRecordData(const_cast<char*>(LFN.c_str()), &PayloadSize,LimeW); assert(err>=0);
-      err=limeWriterCloseRecord(LimeW); assert(err>=0);
-    }
-  }
-
-  ////////////////////////////////////////////////////////////////
-  // Special ILDG operations ; gauge configs only.
-  // Don't require scidac records EXCEPT checksum
-  // Use Grid MetaData object if present.
-  ////////////////////////////////////////////////////////////////
-  // Write the gauge field Umu as an ILDG configuration.
-  //
-  //   sequence    - stored in the Grid header as sequence_number
-  //   LFN         - logical file name, stored in the header and as its own record
-  //   description - used for both ensemble_id and ensemble_label
-  //
-  // Records written, in order: Grid metadata, scidac private file, usqcd info
-  // (file xml), scidac private record, usqcd info (record xml), ildg format,
-  // LFN, then the binary data with its checksum.
-  // Requires a 4 dimensional field in IEEE32BIG or IEEE64BIG format (asserted).
-  template <class vsimd>
-  void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,int sequence,std::string LFN,std::string description) 
-  {
-    ////////////////////////////////////////
-    // fill the Grid header
-    ////////////////////////////////////////
-    FieldMetaData header;
-    scidacRecord  _scidacRecord;
-    scidacFile    _scidacFile;
-
-    ScidacMetaData(Umu,header,_scidacRecord,_scidacFile);
-
-    std::string format = header.floating_point;
-    header.ensemble_id    = description;
-    header.ensemble_label = description;
-    header.sequence_number = sequence;
-    header.ildg_lfn = LFN;
-
-    assert ( (format == std::string("IEEE32BIG"))  
-           ||(format == std::string("IEEE64BIG")) );
-
-    // Validate the dimensionality before dimension[0..3] is indexed below
-    assert(header.nd==4);
-    assert(header.nd==header.dimension.size());
-
-    //////////////////////////////////////////////////////
-    // Fill ILDG header data struct
-    //////////////////////////////////////////////////////
-    ildgFormat ildgfmt ;
-    ildgfmt.field     = std::string("su3gauge");
-
-    if ( format == std::string("IEEE32BIG") ) { 
-      ildgfmt.precision = 32;
-    } else { 
-      ildgfmt.precision = 64;
-    }
-    ildgfmt.version = 1.0;
-    ildgfmt.lx = header.dimension[0];
-    ildgfmt.ly = header.dimension[1];
-    ildgfmt.lz = header.dimension[2];
-    ildgfmt.lt = header.dimension[3];
-
-    //////////////////////////////////////////////////////////////////////////////
-    // Fill the USQCD info field
-    //////////////////////////////////////////////////////////////////////////////
-    usqcdInfo info;
-    info.version=1.0;
-    info.plaq   = header.plaquette;
-    info.linktr = header.link_trace;
-
-    std::cout << GridLogMessage << " Writing config; IldgIO "<<std::endl;
-    //////////////////////////////////////////////
-    // Fill the Lime file record by record
-    // writeLimeObject is a no-op on non-boss nodes
-    //////////////////////////////////////////////
-    writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message 
-    writeLimeObject(0,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
-    writeLimeObject(0,1,info,info.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
-    writeLimeObject(1,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
-    writeLimeObject(0,0,info,info.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
-    writeLimeObject(0,0,ildgfmt,std::string("ildgFormat")   ,std::string(ILDG_FORMAT)); // rec
-    // The LFN record touches the Lime writer, which exists on the boss only
-    if ( this->boss_node ) {
-      writeLimeIldgLFN(header.ildg_lfn);                                               // rec
-    }
-    // Collective call
-    writeLimeLatticeBinaryObject(Umu,std::string(ILDG_BINARY_DATA));      // Closes message with checksum
-    //    limeDestroyWriter(LimeW);
-  }
-};
-
-class IldgReader : public GridLimeReader {
- public:
-
-  ////////////////////////////////////////////////////////////////
-  // Read either Grid/SciDAC/ILDG configuration
-  // Don't require scidac records EXCEPT checksum
-  // Use Grid MetaData object if present.
-  // Else use ILDG MetaData object if present.
-  // Else use SciDAC MetaData object if present.
-  ////////////////////////////////////////////////////////////////
-  // Read a gauge configuration into Umu and fill FieldMetaData_.
-  //
-  // Every Lime record in the file is visited once. Text records are parsed
-  // according to their type (ILDG format, LFN, Grid metadata, USQCD info,
-  // scidac checksum); the binary record is read into Umu using whichever
-  // floating point format the preceding records announced.
-  //
-  // After the scan the routine asserts that the binary record, the ILDG
-  // format record and the scidac checksum record were all present, that the
-  // checksums match, and (when plaquette data is available) that the
-  // plaquette and link trace recomputed from Umu agree with the file to 1e-5.
-  template <class vsimd>
-  void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu, FieldMetaData &FieldMetaData_) {
-
-    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
-    typedef typename GaugeField::vector_object  vobj;
-    typedef typename vobj::scalar_object sobj;
-
-    typedef LorentzColourMatrixF fobj;
-    typedef LorentzColourMatrixD dobj;
-
-    std::vector<int> dims = Umu._grid->FullDimensions();
-
-    assert(dims.size()==4);
-
-    // Metadata holders
-    ildgFormat     ildgFormat_    ;
-    std::string    ildgLFN_       ;
-    scidacChecksum scidacChecksum_; 
-    usqcdInfo      usqcdInfo_     ;
-
-    // track what we read from file
-    int found_ildgFormat    =0;
-    int found_ildgLFN       =0;
-    int found_scidacChecksum=0;
-    int found_usqcdInfo     =0;
-    int found_ildgBinary =0;
-    int found_FieldMetaData =0;
-
-    uint32_t nersc_csum;
-    uint32_t scidac_csuma;
-    uint32_t scidac_csumb;
-
-    // Binary format
-    std::string format;
-
-    //////////////////////////////////////////////////////////////////////////
-    // Loop over all records
-    // -- Order is poorly guaranteed except ILDG header precedes binary section.
-    // -- Run like an event loop.
-    // -- Impose trust hierarchy. Grid takes precedence & look for ILDG, and failing
-    //    that Scidac. 
-    // -- Insist on Scidac checksum record.
-    //////////////////////////////////////////////////////////////////////////
-
-    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
-
-      uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration)
-      
-      //////////////////////////////////////////////////////////////////
-      // If not BINARY_DATA read a string and parse
-      //////////////////////////////////////////////////////////////////
-      if ( strncmp(limeReaderType(LimeR), ILDG_BINARY_DATA,strlen(ILDG_BINARY_DATA) )  ) {
-	
-	// Copy out the string; the extra zero byte keeps it NUL terminated
-	std::vector<char> xmlc(nbytes+1,'\0');
-	limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);    
-	std::string xmlstring(&xmlc[0]);
-
-	//////////////////////////////////
-	// ILDG format record
-	//////////////////////////////////
-	if ( !strncmp(limeReaderType(LimeR), ILDG_FORMAT,strlen(ILDG_FORMAT)) ) { 
-
-	  XmlReader RD(xmlstring, true, "");
-	  read(RD,"ildgFormat",ildgFormat_);
-
-	  if ( ildgFormat_.precision == 64 ) format = std::string("IEEE64BIG");
-	  if ( ildgFormat_.precision == 32 ) format = std::string("IEEE32BIG");
-
-	  assert( ildgFormat_.lx == dims[0]);
-	  assert( ildgFormat_.ly == dims[1]);
-	  assert( ildgFormat_.lz == dims[2]);
-	  assert( ildgFormat_.lt == dims[3]);
-
-	  found_ildgFormat = 1;
-	}
-
-	//////////////////////////////////
-	// Logical file name record: payload is the name itself
-	//////////////////////////////////
-	if ( !strncmp(limeReaderType(LimeR), ILDG_DATA_LFN,strlen(ILDG_DATA_LFN)) ) {
-	  FieldMetaData_.ildg_lfn = xmlstring;
-	  found_ildgLFN = 1;
-	}
-
-	//////////////////////////////////
-	// Grid metadata record
-	// Compare over the length of GRID_FORMAT itself, not of another tag
-	//////////////////////////////////
-	if ( !strncmp(limeReaderType(LimeR), GRID_FORMAT,strlen(GRID_FORMAT)) ) { 
-
-	  XmlReader RD(xmlstring, true, "");
-	  read(RD,"FieldMetaData",FieldMetaData_);
-
-	  format = FieldMetaData_.floating_point;
-
-	  assert(FieldMetaData_.dimension[0] == dims[0]);
-	  assert(FieldMetaData_.dimension[1] == dims[1]);
-	  assert(FieldMetaData_.dimension[2] == dims[2]);
-	  assert(FieldMetaData_.dimension[3] == dims[3]);
-
-	  found_FieldMetaData = 1;
-	}
-
-	//////////////////////////////////
-	// SciDAC record xml: only of interest if it carries a usqcdInfo element
-	//////////////////////////////////
-	if ( !strncmp(limeReaderType(LimeR), SCIDAC_RECORD_XML,strlen(SCIDAC_RECORD_XML)) ) { 
-	  if ( xmlstring.find(std::string("usqcdInfo")) != std::string::npos ) { 
-	    XmlReader RD(xmlstring, true, "");
-	    read(RD,"usqcdInfo",usqcdInfo_);
-	    found_usqcdInfo = 1;
-	  }
-	}
-
-	//////////////////////////////////
-	// SciDAC checksum record
-	//////////////////////////////////
-	if ( !strncmp(limeReaderType(LimeR), SCIDAC_CHECKSUM,strlen(SCIDAC_CHECKSUM)) ) { 
-	  XmlReader RD(xmlstring, true, "");
-	  read(RD,"scidacChecksum",scidacChecksum_);
-	  found_scidacChecksum = 1;
-	}
-
-      } else {  
-	/////////////////////////////////
-	// Binary data
-	// Anything other than IEEE64BIG is read as single precision
-	/////////////////////////////////
-	std::cout << GridLogMessage << "ILDG Binary record found : "  ILDG_BINARY_DATA << std::endl;
-	uint64_t offset= ftello(File);
-	if ( format == std::string("IEEE64BIG") ) {
-	  GaugeSimpleMunger<dobj, sobj> munge;
-	  BinaryIO::readLatticeObject< vobj, dobj >(Umu, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
-	} else { 
-	  GaugeSimpleMunger<fobj, sobj> munge;
-	  BinaryIO::readLatticeObject< vobj, fobj >(Umu, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
-	}
-
-	found_ildgBinary = 1;
-      }
-
-    }
-
-    //////////////////////////////////////////////////////
-    // Minimally must find binary segment and checksum
-    // Since this is an ILDG reader require ILDG format
-    //////////////////////////////////////////////////////
-    assert(found_ildgBinary);
-    assert(found_ildgFormat);
-    assert(found_scidacChecksum);
-
-    // Must find something with the lattice dimensions
-    assert(found_FieldMetaData||found_ildgFormat);
-
-    if ( found_FieldMetaData ) {
-
-      std::cout << GridLogMessage<<"Grid MetaData was record found: configuration was probably written by Grid ! Yay ! "<<std::endl;
-
-    } else { 
-
-      assert(found_ildgFormat);
-      assert ( ildgFormat_.field == std::string("su3gauge") );
-
-      ///////////////////////////////////////////////////////////////////////////////////////
-      // Populate our Grid metadata as best we can
-      ///////////////////////////////////////////////////////////////////////////////////////
-
-      std::ostringstream vers; vers << ildgFormat_.version;
-      FieldMetaData_.hdr_version = vers.str();
-      FieldMetaData_.data_type = std::string("4D_SU3_GAUGE_3X3");
-
-      FieldMetaData_.nd=4;
-      FieldMetaData_.dimension.resize(4);
-
-      FieldMetaData_.dimension[0] = ildgFormat_.lx ;
-      FieldMetaData_.dimension[1] = ildgFormat_.ly ;
-      FieldMetaData_.dimension[2] = ildgFormat_.lz ;
-      FieldMetaData_.dimension[3] = ildgFormat_.lt ;
-
-      if ( found_usqcdInfo ) { 
-	FieldMetaData_.plaquette = usqcdInfo_.plaq;
-	FieldMetaData_.link_trace= usqcdInfo_.linktr;
-	std::cout << GridLogMessage <<"This configuration was probably written by USQCD "<<std::endl;
-	std::cout << GridLogMessage <<"USQCD xml record Plaquette : "<<FieldMetaData_.plaquette<<std::endl;
-	std::cout << GridLogMessage <<"USQCD xml record LinkTrace : "<<FieldMetaData_.link_trace<<std::endl;
-      } else { 
-	FieldMetaData_.plaquette = 0.0;
-	FieldMetaData_.link_trace= 0.0;
-	std::cout << GridLogWarning << "This configuration is unsafe with no plaquette records that can verify it !!! "<<std::endl;
-      }
-    }
-
-    ////////////////////////////////////////////////////////////
-    // Really really want to mandate a scidac checksum
-    ////////////////////////////////////////////////////////////
-    if ( found_scidacChecksum ) {
-      FieldMetaData_.scidac_checksuma = stoull(scidacChecksum_.suma,0,16);
-      FieldMetaData_.scidac_checksumb = stoull(scidacChecksum_.sumb,0,16);
-      // Use the verifier's verdict rather than discarding it
-      assert(scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb)==1);
-      assert( scidac_csuma ==FieldMetaData_.scidac_checksuma);
-      assert( scidac_csumb ==FieldMetaData_.scidac_checksumb);
-      std::cout << GridLogMessage<<"SciDAC checksums match " << std::endl;
-    } else { 
-      std::cout << GridLogWarning<<"SciDAC checksums not found. This is unsafe. " << std::endl;
-      assert(0); // Can I insist always checksum ?
-    }
-
-    // Cross-check the field against the plaquette / link trace from the file
-    if ( found_FieldMetaData || found_usqcdInfo ) {
-      FieldMetaData checker;
-      GaugeStatistics(Umu,checker);
-      assert(fabs(checker.plaquette  - FieldMetaData_.plaquette )<1.0e-5);
-      assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
-      std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl;
-    }
-  }
- };
-
-}}
-
-//HAVE_LIME
-#endif
-
-#endif
@@ -1,237 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/parallelIO/IldgIO.h
-
-Copyright (C) 2015
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef GRID_ILDGTYPES_IO_H
-#define GRID_ILDGTYPES_IO_H
-
-#ifdef HAVE_LIME
-extern "C" { // for linkage
-#include "lime.h"
-}
-
-namespace Grid {
-
-/////////////////////////////////////////////////////////////////////////////////
-// Data representation of records that enter ILDG and SciDac formats
-/////////////////////////////////////////////////////////////////////////////////
-
-// Record-name strings, one per kind of record.
-// NOTE(review): presumably these are the LIME record types used by the reader/writer -- confirm there.
-// Grid's own record
-#define GRID_FORMAT      "grid-format"
-// ILDG records
-#define ILDG_FORMAT      "ildg-format"
-#define ILDG_BINARY_DATA "ildg-binary-data"
-#define ILDG_DATA_LFN    "ildg-data-lfn"
-// SciDAC records
-#define SCIDAC_CHECKSUM           "scidac-checksum"
-#define SCIDAC_PRIVATE_FILE_XML   "scidac-private-file-xml"
-#define SCIDAC_FILE_XML           "scidac-file-xml"
-#define SCIDAC_PRIVATE_RECORD_XML "scidac-private-record-xml"
-#define SCIDAC_RECORD_XML         "scidac-record-xml"
-#define SCIDAC_BINARY_DATA        "scidac-binary-data"
-// Unused SCIDAC records names; could move to support this functionality
-#define SCIDAC_SITELIST           "scidac-sitelist"
-
-  ////////////////////////////////////////////////////////////
-  // Numeric codes copied from QIO so that files stay compatible with it.
-  // GRID_IO_SINGLEFILE is the value the scidacFile(GridBase*) constructor stores in "volfmt".
-  const int GRID_IO_SINGLEFILE = 0; // hardcode lift from QIO compat
-  const int GRID_IO_MULTIFILE  = 1; // hardcode lift from QIO compat
-  const int GRID_IO_FIELD      = 0; // hardcode lift from QIO compat
-  const int GRID_IO_GLOBAL     = 1; // hardcode lift from QIO compat
-  ////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////////
-// QIO uses mandatory "private" records fixed format
-// The private records are in principle "opaque"; however, their layout cannot be changed now because
-// that would break compatibility with existing files, so it should be safe to assume the undocumented
-// but de facto file structure.
-/////////////////////////////////////////////////////////////////////////////////
-
-// Placeholder user record: a single serialisable integer that always starts at zero.
-struct emptyUserRecord : Serializable {
-  GRID_SERIALIZABLE_CLASS_MEMBERS(emptyUserRecord,int,dummy);
-  emptyUserRecord() : dummy(0) {}
-};
-
-////////////////////////
-// Scidac private file xml
-// <?xml version="1.0" encoding="UTF-8"?><scidacFile><version>1.1</version><spacetime>4</spacetime><dims>16 16 16 32 </dims><volfmt>0</volfmt></scidacFile>
-////////////////////////
-struct scidacFile : Serializable {
- public:
-  GRID_SERIALIZABLE_CLASS_MEMBERS(scidacFile,
-                                  double, version,
-                                  int, spacetime,
-				  std::string, dims, // must convert to int
-                                  int, volfmt);
-
-  // Parse the whitespace-separated "dims" string into a list of integers.
-  // Parsing stops at the first token that is not an integer.
-  std::vector<int> getDimensions(void) { 
-    std::stringstream stream(dims);
-    std::vector<int> dimensions;
-    int n;
-    while(stream >> n){
-      dimensions.push_back(n);
-    }
-    return dimensions;
-  }
-
-  // Store the extents in "dims" as blank-separated integers on a single line,
-  // e.g. "16 16 16 32".  No line breaks are emitted between the entries, so the
-  // record matches the one-line <dims> form shown in the example above.
-  // An empty list yields an empty string.
-  void setDimensions(std::vector<int> dimensions) { 
-    std::stringstream stream;
-    for(size_t i=0;i<dimensions.size();i++){ 
-      if ( i != 0 ) {
-	stream << ' ';
-      }
-      stream << dimensions[i];
-    }
-    dims = stream.str();
-  }
-
-  // Default constructor: every scalar member gets a defined value so that a
-  // record which is only partially filled never serialises indeterminate data.
-  scidacFile() : version(1.0), spacetime(0), volfmt(GRID_IO_SINGLEFILE) {}
-
-  // Constructor provides Grid: dimension count and full extents are taken from
-  // the grid, and the volume format is single-file.
-  scidacFile(GridBase * grid){
-    version      = 1.0;
-    spacetime    = grid->_ndimension;
-    setDimensions(grid->FullDimensions()); 
-    volfmt       = GRID_IO_SINGLEFILE;
-  }
-
-};
-
-///////////////////////////////////////////////////////////////////////
-// scidac-private-record-xml : example
-// <scidacRecord>
-// <version>1.1</version><date>Tue Jul 26 21:14:44 2011 UTC</date><recordtype>0</recordtype>
-// <datatype>QDP_D3_ColorMatrix</datatype><precision>D</precision><colors>3</colors><spins>4</spins>
-// <typesize>144</typesize><datacount>4</datacount>
-// </scidacRecord>
-///////////////////////////////////////////////////////////////////////
-
-// Serialisable image of the scidac-private-record-xml record.
-struct scidacRecord : Serializable {
- public:
-  GRID_SERIALIZABLE_CLASS_MEMBERS(scidacRecord,
-                                  double, version,
-                                  std::string, date,
-				  int, recordtype,
-				  std::string, datatype,
-				  std::string, precision,
-				  int, colors,
-				  int, spins,
-				  int, typesize,
-				  int, datacount);
-
-  // Version starts at 1.0, every integer field at zero; the strings start empty.
-  scidacRecord()
-  {
-    version    = 1.0;
-    recordtype = 0;
-    colors     = 0;
-    spins      = 0;
-    typesize   = 0;
-    datacount  = 0;
-  }
-};
-
-////////////////////////
-// ILDG format
-////////////////////////
-// Serialisable image of the ildg-format record: format version, field name,
-// precision and the four lattice extents lx, ly, lz, lt.
-struct ildgFormat : Serializable {
-public:
-  GRID_SERIALIZABLE_CLASS_MEMBERS(ildgFormat,
-				  double, version,
-				  std::string, field,
-				  int, precision,
-				  int, lx,
-				  int, ly,
-				  int, lz,
-				  int, lt);
-  // All scalar members are given defined values (previously only "version" was
-  // set), so a default-constructed record never carries indeterminate integers.
-  ildgFormat() : version(1.0), precision(0), lx(0), ly(0), lz(0), lt(0) {}
-};
-////////////////////////
-// USQCD info
-////////////////////////
-// Serialisable image of the USQCD info record: version, plaquette, link trace
-// and a free-form info string.
-struct usqcdInfo : Serializable { 
- public:
-  GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdInfo,
-				  double, version,
-				  double, plaq,
-				  double, linktr,
-				  std::string, info);
-  // plaq and linktr start at zero (previously left indeterminate) so that a
-  // default-constructed record is safe to read and to serialise.
-  usqcdInfo() : version(1.0), plaq(0.0), linktr(0.0) {}
-};
-////////////////////////
-// Scidac Checksum
-////////////////////////
-// Serialisable image of the scidac-checksum record; the two sums are kept as strings.
-struct scidacChecksum : Serializable { 
- public:
-  GRID_SERIALIZABLE_CLASS_MEMBERS(scidacChecksum,
-				  double, version,
-				  std::string, suma,
-				  std::string, sumb);
-  // Version starts at 1.0; both sum strings start empty.
-  scidacChecksum() : version(1.0) {}
-};
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Type:           scidac-file-xml         <title>MILC ILDG archival gauge configuration</title>
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Type:           
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////
-// Scidac private file xml 
-// <?xml version="1.0" encoding="UTF-8"?><scidacFile><version>1.1</version><spacetime>4</spacetime><dims>16 16 16 32 </dims><volfmt>0</volfmt></scidacFile> 
-////////////////////////                                                                                                                                                                              
-
-#if 0
-// Disabled code: the three record types below are excluded from compilation by
-// the surrounding "#if 0".  Each one only sets version to 1.0 on construction.
-////////////////////////////////////////////////////////////////////////////////////////
-// From http://www.physics.utah.edu/~detar/scidac/qio_2p3.pdf
-////////////////////////////////////////////////////////////////////////////////////////
-struct usqcdPropFile : Serializable { 
- public:
-  GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdPropFile,
-				  double, version,
-				  std::string, type,
-				  std::string, info);
-  usqcdPropFile() { 
-    version=1.0; 
-  };
-};
-struct usqcdSourceInfo : Serializable { 
- public:
-  GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdSourceInfo,
-				  double, version,
-				  std::string, info);
-  usqcdSourceInfo() { 
-    version=1.0; 
-  };
-};
-struct usqcdPropInfo : Serializable { 
- public:
-  GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdPropInfo,
-				  double, version,
-				  int, spin,
-				  int, color,
-				  std::string, info);
-  usqcdPropInfo() { 
-    version=1.0; 
-  };
-};
-#endif
-
-}
-#endif
-#endif
@@ -1,327 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/parallelIO/NerscIO.h
-
-    Copyright (C) 2015
-
-
-    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <algorithm>
-#include <ctime>
-#include <fstream>
-#include <iomanip>
-#include <iostream>
-#include <map>
-#include <unistd.h>
-#include <sys/utsname.h>
-#include <pwd.h>
-
-namespace Grid {
-
-  ///////////////////////////////////////////////////////
-  // Precision mapping
-  ///////////////////////////////////////////////////////
-  // Name of the floating-point format for vobj, chosen by the byte width of its
-  // real scalar type: "IEEE32BIG" for float width, "IEEE64BIG" for double width,
-  // and an empty string for any other width.
-  template<class vobj> static std::string getFormatString (void)
-  {
-    typedef typename getPrecision<vobj>::real_scalar_type stype;
-    const size_t width = sizeof(stype);
-
-    std::string format;
-    if ( width == sizeof(float)  ) format = std::string("IEEE32BIG");
-    if ( width == sizeof(double) ) format = std::string("IEEE64BIG");
-    return format;
-  }
-  ////////////////////////////////////////////////////////////////////////////////
-  // header specification/interpretation
-  ////////////////////////////////////////////////////////////////////////////////
-    // Header record describing a stored field: lattice geometry, storage format,
-    // physical statistics (link trace, plaquette), checksums and provenance strings.
-    // The members are declared through GRID_SERIALIZABLE_CLASS_MEMBERS.
-    class FieldMetaData : Serializable {
-    public:
-
-      GRID_SERIALIZABLE_CLASS_MEMBERS(FieldMetaData,
-				      int, nd,
-				      std::vector<int>, dimension,
-				      std::vector<std::string>, boundary,
-				      int, data_start,
-				      std::string, hdr_version,
-				      std::string, storage_format,
-				      double, link_trace,
-				      double, plaquette,
-				      uint32_t, checksum,
-				      uint32_t, scidac_checksuma,
-				      uint32_t, scidac_checksumb,
-				      unsigned int, sequence_number,
-				      std::string, data_type,
-				      std::string, ensemble_id,
-				      std::string, ensemble_label,
-				      std::string, ildg_lfn,
-				      std::string, creator,
-				      std::string, creator_hardware,
-				      std::string, creation_date,
-				      std::string, archive_date,
-				      std::string, floating_point);
-      // WARNING: uninitialised values might lead to subtle parallel IO
-      // issues; std::string members are fine because they are initialised
-      // to size 0 as per the C++ standard.
-      // Defaults: nd = 4, four zero extents, four empty boundary strings,
-      // and every other numeric member zero.
-      FieldMetaData(void) 
-      : nd(4), dimension(4,0), boundary(4, ""), data_start(0),
-      link_trace(0.), plaquette(0.), checksum(0),
-      scidac_checksuma(0), scidac_checksumb(0), sequence_number(0)
-      {}
-    };
-
-  namespace QCD {
-
-    using namespace Grid;
-
-
-    //////////////////////////////////////////////////////////////////////
-    // Bit and Physical Checksumming and QA of data
-    //////////////////////////////////////////////////////////////////////
-    // Fill the geometry part of the header from the grid: dimension count, the
-    // full extent of every dimension, and "PERIODIC" as the boundary label of
-    // every dimension.  data_start is reset to zero.
-    inline void GridMetaData(GridBase *grid,FieldMetaData &header)
-    {
-      const int ndim = grid->_ndimension;
-
-      header.nd         = ndim;
-      header.data_start = 0;
-      header.dimension.resize(ndim);
-      header.boundary.resize(ndim);
-
-      for(int mu=0;mu<ndim;mu++) {
-	header.dimension[mu] = grid->_fdimensions[mu];
-	header.boundary[mu]  = std::string("PERIODIC");
-      }
-    }
-
-    // Record who wrote the data, when, and on what machine.
-    //   creator          : login name of the calling user, left untouched when
-    //                      the password database has no entry for the uid
-    //   creation_date    : local time formatted as "%c %Z"; empty when the time
-    //                      cannot be obtained or formatted
-    //   archive_date     : copy of creation_date
-    //   creator_hardware : "nodename-machine-sysname-release" from uname(),
-    //                      left untouched when uname() fails
-    inline void MachineCharacteristics(FieldMetaData &header)
-    {
-      // Who
-      struct passwd *pw = getpwuid (getuid());
-      if (pw) header.creator = std::string(pw->pw_name); 
-
-      // When.  std::strftime is used rather than std::put_time (which was
-      // commented out here, leaving the date permanently empty).
-      std::time_t t = std::time(nullptr);
-      std::tm *tm_ = std::localtime(&t);   // may be null on failure
-      char stamp[128];
-      header.creation_date.clear();
-      if ( tm_ != nullptr && std::strftime(stamp,sizeof(stamp),"%c %Z",tm_) > 0 ) {
-	header.creation_date = std::string(stamp);
-      }
-      header.archive_date  = header.creation_date;
-
-      // What.  The utsname fields are only meaningful when uname() succeeds
-      // (non-negative return value).
-      struct utsname name;
-      if ( uname(&name) >= 0 ) {
-	header.creator_hardware = std::string(name.nodename)+"-";
-	header.creator_hardware+= std::string(name.machine)+"-";
-	header.creator_hardware+= std::string(name.sysname)+"-";
-	header.creator_hardware+= std::string(name.release);
-      }
-    }
-
-// dump_meta_data(field, s): stream the text header for "field" to the output
-// stream "s", from BEGIN_HEADER to END_HEADER, one "KEY = value" line per entry.
-// Exactly four DIMENSION_n and four BOUNDARY_n lines are written, so "field"
-// must hold at least four dimensions and boundaries.  The three checksums are
-// printed in hexadecimal with a field width of 10.
-// The expansion is a plain sequence of statements (not wrapped in a block) and
-// the arguments are not parenthesised: use it only as a full statement and pass
-// simple names.
-#define dump_meta_data(field, s)					\
-      s << "BEGIN_HEADER"      << std::endl;				\
-      s << "HDR_VERSION = "    << field.hdr_version    << std::endl;	\
-      s << "DATATYPE = "       << field.data_type      << std::endl;	\
-      s << "STORAGE_FORMAT = " << field.storage_format << std::endl;	\
-      for(int i=0;i<4;i++){						\
-	s << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ; \
-      }									\
-      s << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl; \
-      s << "PLAQUETTE  = " << std::setprecision(10) << field.plaquette  << std::endl; \
-      for(int i=0;i<4;i++){						\
-	s << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl;	\
-      }									\
-									\
-      s << "CHECKSUM = "<< std::hex << std::setw(10) << field.checksum << std::dec<<std::endl; \
-      s << "SCIDAC_CHECKSUMA = "<< std::hex << std::setw(10) << field.scidac_checksuma << std::dec<<std::endl; \
-      s << "SCIDAC_CHECKSUMB = "<< std::hex << std::setw(10) << field.scidac_checksumb << std::dec<<std::endl; \
-      s << "ENSEMBLE_ID = "     << field.ensemble_id      << std::endl;	\
-      s << "ENSEMBLE_LABEL = "  << field.ensemble_label   << std::endl;	\
-      s << "SEQUENCE_NUMBER = " << field.sequence_number  << std::endl;	\
-      s << "CREATOR = "         << field.creator          << std::endl;	\
-      s << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl;	\
-      s << "CREATION_DATE = "   << field.creation_date    << std::endl;	\
-      s << "ARCHIVE_DATE = "    << field.archive_date     << std::endl;	\
-      s << "FLOATING_POINT = "  << field.floating_point   << std::endl;	\
-      s << "END_HEADER"         << std::endl;
-
-// Generic header preparation for any lattice field: sets the floating-point
-// format name, zeroes the NERSC checksum, then fills in the grid geometry and
-// the machine information.  No gauge statistics are computed here.
-template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMetaData &header)
-{
-  header.floating_point = getFormatString<vobj>();
-  header.checksum       = 0x0; // Nersc checksum unused in ILDG, Scidac
-  GridMetaData(field._grid,header);
-  MachineCharacteristics(header);
-}
- // Single-precision gauge field: store its link trace and average plaquette in the header.
- inline void GaugeStatistics(Lattice<vLorentzColourMatrixF> & data,FieldMetaData &header)
- {
-   typedef Grid::QCD::WilsonLoops<PeriodicGimplF> Loops;
-   // How to convert data precision etc...
-   header.link_trace = Loops::linkTrace(data);
-   header.plaquette  = Loops::avgPlaquette(data);
- }
- // Double-precision gauge field: store its link trace and average plaquette in the header.
- inline void GaugeStatistics(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
- {
-   typedef Grid::QCD::WilsonLoops<PeriodicGimplD> Loops;
-   // How to convert data precision etc...
-   header.link_trace = Loops::linkTrace(data);
-   header.plaquette  = Loops::avgPlaquette(data);
- }
- // Single-precision gauge-field specialisation: same as the generic version,
- // plus the link trace and plaquette computed from the field.
- template<> inline void PrepareMetaData<vLorentzColourMatrixF>(Lattice<vLorentzColourMatrixF> & field, FieldMetaData &header)
- {
-   header.floating_point = getFormatString<vLorentzColourMatrixF>();
-   header.checksum       = 0x0; // Nersc checksum unused in ILDG, Scidac
-   GridMetaData(field._grid,header);
-   GaugeStatistics(field,header);
-   MachineCharacteristics(header);
- }
- // Double-precision gauge-field specialisation: same as the generic version,
- // plus the link trace and plaquette computed from the field.
- template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header)
- {
-   header.floating_point = getFormatString<vLorentzColourMatrixD>();
-   header.checksum       = 0x0; // Nersc checksum unused in ILDG, Scidac
-   GridMetaData(field._grid,header);
-   GaugeStatistics(field,header);
-   MachineCharacteristics(header);
- }
-
-    //////////////////////////////////////////////////////////////////////
-    // Utilities ; these are QCD aware
-    //////////////////////////////////////////////////////////////////////
-    // Rebuild row 2 of the colour matrix in every direction mu from rows 0 and 1.
-    // Each entry of row 2 is adj() applied to one component of the cross product
-    // of row 0 with row 1; rows 0 and 1 are read only.
-    // NOTE(review): adj() on a single entry is presumably complex conjugation, which
-    // would make this the usual third-row reconstruction for SU(3) -- confirm.
-    inline void reconstruct3(LorentzColourMatrix & cm)
-    {
-      // Column indices, named so the cross-product pattern is readable
-      const int x=0;
-      const int y=1;
-      const int z=2;
-      for(int mu=0;mu<Nd;mu++){
-	cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
-	cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
-	cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
-      }
-    }
-
-    ////////////////////////////////////////////////////////////////////////////////
-    // Some data types for intermediate storage
-    ////////////////////////////////////////////////////////////////////////////////
-    // Two-row storage type: for each of the Nd directions, 2 vectors of Nc entries.
-    // Used by the Gauge3x2 mungers as the on-disk shape of a matrix whose last row is omitted.
-    template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, Nd >;
-
-    // Default-, single- and double-precision complex instances
-    typedef iLorentzColour2x3<Complex>  LorentzColour2x3;
-    typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
-    typedef iLorentzColour2x3<ComplexD> LorentzColour2x3D;
-
-/////////////////////////////////////////////////////////////////////////////////
-// Simple classes for precision conversion
-/////////////////////////////////////////////////////////////////////////////////
-// Converts a scalar object (sobj) to a file object (fobj) by copying it one real
-// word at a time; the assignment performs the precision conversion.
-template <class fobj, class sobj>
-struct BinarySimpleUnmunger {
-  typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
-  typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;
-
-  void operator()(sobj &in, fobj &out) {
-    // Both objects must hold the same number of real words
-    const size_t n_dst = sizeof(fobj) / sizeof(fobj_stype);
-    const size_t n_src = sizeof(sobj) / sizeof(sobj_stype);
-    assert(n_dst == n_src);
-
-    // View each object as a flat array of its real scalar type
-    sobj_stype *src = reinterpret_cast<sobj_stype *>(&in);
-    fobj_stype *dst = reinterpret_cast<fobj_stype *>(&out);
-    for (size_t w = 0; w < n_src; ++w) {
-      dst[w] = src[w];  // type conversion on the fly
-    }
-  }
-};
-
-// Converts a file object (fobj) to a scalar object (sobj) by copying it one real
-// word at a time; the assignment performs the precision conversion.
-template <class fobj, class sobj>
-struct BinarySimpleMunger {
-  typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
-  typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;
-
-  void operator()(fobj &in, sobj &out) {
-    // Both objects must hold the same number of real words
-    const size_t n_src = sizeof(fobj) / sizeof(fobj_stype);
-    const size_t n_dst = sizeof(sobj) / sizeof(sobj_stype);
-    assert(n_src == n_dst);
-
-    // View each object as a flat array of its real scalar type
-    fobj_stype *src = reinterpret_cast<fobj_stype *>(&in);
-    sobj_stype *dst = reinterpret_cast<sobj_stype *>(&out);
-    for (size_t w = 0; w < n_dst; ++w) {
-      dst[w] = src[w];  // type conversion on the fly
-    }
-  }
-};
-
-
-    // File object -> scalar object for full Nc x Nc gauge links: copies every
-    // matrix entry of every direction.
-    template<class fobj,class sobj>
-    struct GaugeSimpleMunger{
-      void operator()(fobj &in, sobj &out) {
-        for (int mu = 0; mu < Nd; mu++) {
-          for (int row = 0; row < Nc; row++) {
-            for (int col = 0; col < Nc; col++) {
-              out(mu)()(row, col) = in(mu)()(row, col);
-            }
-          }
-        }
-      }
-    };
-
-    // Scalar object -> file object for full Nc x Nc gauge links: copies every
-    // matrix entry of every direction.
-    template <class fobj, class sobj>
-    struct GaugeSimpleUnmunger {
-      void operator()(sobj &in, fobj &out) {
-        for (int mu = 0; mu < Nd; mu++) {
-          for (int row = 0; row < Nc; row++) {
-            for (int col = 0; col < Nc; col++) {
-              out(mu)()(row, col) = in(mu)()(row, col);
-            }
-          }
-        }
-      }
-    };
-
-    // Two-row file object -> full matrix: copies the two stored rows of every
-    // direction, then calls reconstruct3 to fill in the remaining row.
-    template<class fobj,class sobj>
-    struct Gauge3x2munger{
-      void operator() (fobj &in,sobj &out){
-	for(int mu=0;mu<Nd;mu++){
-	  for(int row=0;row<2;row++){
-	    for(int col=0;col<3;col++){
-	      out(mu)()(row,col) = in(mu)(row)(col);
-	    }
-	  }
-	}
-	reconstruct3(out);
-      }
-    };
-
-    // Full matrix -> two-row file object: copies only the first two rows of
-    // every direction; the last row is dropped.
-    template<class fobj,class sobj>
-    struct Gauge3x2unmunger{
-      void operator() (sobj &in,fobj &out){
-	for(int mu=0;mu<Nd;mu++){
-	  for(int row=0;row<2;row++){
-	    for(int col=0;col<3;col++){
-	      out(mu)(row)(col) = in(mu)()(row,col);
-	    }
-	  }
-	}
-      }
-    };
-  }
-
-
-}
@@ -1,363 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/parallelIO/NerscIO.h
-
-    Copyright (C) 2015
-
-    Author: Matt Spraggs <matthew.spraggs@gmail.com>
-    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-    Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef GRID_NERSC_IO_H
-#define GRID_NERSC_IO_H
-
-namespace Grid {
-  namespace QCD {
-
-    using namespace Grid;
-
-    ////////////////////////////////////////////////////////////////////////////////
-    // Write and read from fstream; compute header offset for payload
-    ////////////////////////////////////////////////////////////////////////////////
-    class NerscIO : public BinaryIO { 
-    public:
-
-      // Create "file" if needed and discard any existing contents.
-      static inline void truncate(std::string file){
-	std::ofstream fout(file,std::ios::out|std::ios::trunc);
-	fout.close();
-      }
-  
-      // Write the text header for "field" at the start of "file" and return the
-      // stream position just after it, which is also stored in field.data_start.
-      // The file is opened with in|out so existing contents are not truncated.
-      static inline unsigned int writeHeader(FieldMetaData &field,std::string file)
-      {
-      std::ofstream fout(file,std::ios::out|std::ios::in);
-      fout.seekp(0,std::ios::beg);
-      dump_meta_data(field, fout);
-      field.data_start = fout.tellp();
-      return field.data_start;
-    }
-
-      // for the header-reader
-      // Parse the text header of "file" into "field" and return the offset of
-      // the first payload byte (also stored in field.data_start).
-      //
-      // The first line must be BEGIN_HEADER.  Following lines of the form
-      // "KEY = value" are collected until a line containing END_HEADER is seen.
-      // Reading also stops at end of file or on a stream error, so a truncated
-      // header trips the assertion instead of looping forever.
-      // The grid must be four dimensional and its full extents must match the
-      // DIMENSION_n entries.  Numeric keys that are missing make std::stol /
-      // std::stod / std::stoul throw std::invalid_argument.
-      static inline int readHeader(std::string file,GridBase *grid,  FieldMetaData &field)
-      {
-	std::map<std::string,std::string> header;
-	std::string line;
-
-	//////////////////////////////////////////////////
-	// read the header
-	//////////////////////////////////////////////////
-	std::ifstream fin(file);
-
-	getline(fin,line); // read one line and insist it is BEGIN_HEADER
-
-	removeWhitespace(line);
-	std::cout << GridLogMessage << "* " << line << std::endl;
-
-	assert(line==std::string("BEGIN_HEADER"));
-
-	bool found_end = false;
-	while ( !found_end && getline(fin,line) ) {
-	  std::cout << GridLogMessage << "* "<<line<< std::endl;
-	  // A key needs at least one character before the '='
-	  std::string::size_type eq = line.find("=");
-	  if ( eq != std::string::npos && eq > 0 ) {
-	    std::string key=line.substr(0,eq);
-	    std::string val=line.substr(eq+1);
-	    removeWhitespace(key);
-	    removeWhitespace(val);
-
-	    header[key] = val;
-	  }
-	  found_end = ( line.find("END_HEADER") != std::string::npos );
-	}
-	assert(found_end); // header ended without END_HEADER
-
-	field.data_start = fin.tellg();
-
-	//////////////////////////////////////////////////
-	// chomp the values
-	//////////////////////////////////////////////////
-	field.hdr_version    = header["HDR_VERSION"];
-	field.data_type      = header["DATATYPE"];
-	field.storage_format = header["STORAGE_FORMAT"];
-
-	field.dimension[0] = std::stol(header["DIMENSION_1"]);
-	field.dimension[1] = std::stol(header["DIMENSION_2"]);
-	field.dimension[2] = std::stol(header["DIMENSION_3"]);
-	field.dimension[3] = std::stol(header["DIMENSION_4"]);
-
-	assert(grid->_ndimension == 4);
-	for(int d=0;d<4;d++){
-	  assert(grid->_fdimensions[d]==field.dimension[d]);
-	}
-
-	field.link_trace = std::stod(header["LINK_TRACE"]);
-	field.plaquette  = std::stod(header["PLAQUETTE"]);
-
-	field.boundary[0] = header["BOUNDARY_1"];
-	field.boundary[1] = header["BOUNDARY_2"];
-	field.boundary[2] = header["BOUNDARY_3"];
-	field.boundary[3] = header["BOUNDARY_4"];
-
-	field.checksum = std::stoul(header["CHECKSUM"],0,16);
-	field.ensemble_id      = header["ENSEMBLE_ID"];
-	field.ensemble_label   = header["ENSEMBLE_LABEL"];
-	field.sequence_number  = std::stol(header["SEQUENCE_NUMBER"]);
-	field.creator          = header["CREATOR"];
-	field.creator_hardware = header["CREATOR_HARDWARE"];
-	field.creation_date    = header["CREATION_DATE"];
-	field.archive_date     = header["ARCHIVE_DATE"];
-	field.floating_point   = header["FLOATING_POINT"];
-
-	return field.data_start;
-      }
-
-    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    // Now the meat: the object readers
-    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-    // Read a NERSC gauge configuration from "file" into Umu and verify it.
-    //
-    // "header" receives the parsed file header.  The payload reader is chosen
-    // from the header's DATATYPE (4D_SU3_GAUGE: two stored rows, third row
-    // reconstructed; 4D_SU3_GAUGE_3x3: full matrices) and FLOATING_POINT
-    // (IEEE32[BIG] or IEEE64[BIG]).  After reading, plaquette and link trace are
-    // recomputed from Umu and compared, together with the NERSC checksum, against
-    // the header values.  A checksum mismatch terminates the program with a
-    // non-zero exit status; the remaining checks are assertions.
-    template<class vsimd>
-    static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
-					 FieldMetaData& header,
-					 std::string file)
-    {
-      uint64_t offset = readHeader(file,Umu._grid,header);
-
-      // Copy of the header that receives the statistics recomputed from the data
-      FieldMetaData clone(header);
-
-      std::string format(header.floating_point);
-
-      int ieee32big = (format == std::string("IEEE32BIG"));
-      int ieee32    = (format == std::string("IEEE32"));
-      int ieee64big = (format == std::string("IEEE64BIG"));
-      int ieee64    = (format == std::string("IEEE64"));
-
-      // An unrecognised format would skip every reader below and leave the
-      // field and the checksums unset.
-      assert( ieee32 || ieee32big || ieee64 || ieee64big );
-
-      uint32_t nersc_csum=0,scidac_csuma=0,scidac_csumb=0;
-      // depending on datatype, set up munger;
-      // munger is a function of <floating point, Real, data_type>
-      if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
-	if ( ieee32 || ieee32big ) {
-	  BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3F> 
-	    (Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
-	     nersc_csum,scidac_csuma,scidac_csumb);
-	}
-	if ( ieee64 || ieee64big ) {
-	  BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3D> 
-	    (Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
-	     nersc_csum,scidac_csuma,scidac_csumb);
-	}
-      } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
-	if ( ieee32 || ieee32big ) {
-	  BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
-	    (Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
-	     nersc_csum,scidac_csuma,scidac_csumb);
-	}
-	if ( ieee64 || ieee64big ) {
-	  BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
-	    (Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format,
-	     nersc_csum,scidac_csuma,scidac_csumb);
-	}
-      } else {
-	assert(0);
-      }
-
-      GaugeStatistics(Umu,clone);
-
-      std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec
-	       <<" header   "<<std::hex<<header.checksum<<std::dec <<std::endl;
-      std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" plaquette "<<clone.plaquette
-	       <<" header    "<<header.plaquette<<std::endl;
-      std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" link_trace "<<clone.link_trace
-	       <<" header    "<<header.link_trace<<std::endl;
-
-      if ( fabs(clone.plaquette -header.plaquette ) >=  1.0e-5 ) { 
-	std::cout << " Plaquette mismatch "<<std::endl;
-	std::cout << Umu[0]<<std::endl;
-	std::cout << Umu[1]<<std::endl;
-      }
-      if ( nersc_csum != header.checksum ) { 
-	std::cerr << " checksum mismatch " << std::endl;
-	std::cerr << " plaqs " << clone.plaquette << " " << header.plaquette << std::endl;
-	std::cerr << " trace " << clone.link_trace<< " " << header.link_trace<< std::endl;
-	std::cerr << " nersc_csum  " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
-	exit(1); // corrupt data is a failure: never report success to the caller
-      }
-      assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
-      assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
-      assert(nersc_csum == header.checksum );
-      
-      std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
-    }
-
-      // Write the gauge field Umu to "file" in NERSC format.
-      // The payload is always written as IEEE64BIG, 4D_SU3_GAUGE_3x3; the
-      // two_row and bits32 arguments are accepted but not used by this body.
-      // The header is written twice by the boss rank: once to find the payload
-      // offset, and again after the payload so it carries the final checksum.
-      template<class vsimd>
-      static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
-					    std::string file, 
-					    int two_row,
-					    int bits32)
-      {
-	typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
-
-	typedef iLorentzColourMatrix<vsimd> vobj;
-	typedef typename vobj::scalar_object sobj;
-
-	FieldMetaData header;
-	///////////////////////////////////////////
-	// Following should become arguments
-	///////////////////////////////////////////
-	header.sequence_number = 1;
-	header.ensemble_id     = "UKQCD";
-	header.ensemble_label  = "DWF";
-
-	typedef LorentzColourMatrixD fobj3D;
-	typedef LorentzColour2x3D    fobj2D;
-  
-	GridBase *grid = Umu._grid;
-
-	// Geometry, plaquette / link trace, and machine information
-	GridMetaData(grid,header);
-	assert(header.nd==4);
-	GaugeStatistics(Umu,header);
-	MachineCharacteristics(header);
-
-	uint64_t offset;
-
-	// Sod it -- always write 3x3 double
-	header.floating_point = std::string("IEEE64BIG");
-	header.data_type      = std::string("4D_SU3_GAUGE_3x3");
-	GaugeSimpleUnmunger<fobj3D,sobj> munge;
-	// Only the boss rank touches the header; the offset is then broadcast from rank 0
-	if ( grid->IsBoss() ) { 
-	  truncate(file);
-	  offset = writeHeader(header,file);
-	}
-	grid->Broadcast(0,(void *)&offset,sizeof(offset));
-
-	uint32_t nersc_csum,scidac_csuma,scidac_csumb;
-	BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
-								  nersc_csum,scidac_csuma,scidac_csumb);
-	// Rewrite the header now that the checksum of the payload is known
-	header.checksum = nersc_csum;
-	if ( grid->IsBoss() ) { 
-	  writeHeader(header,file);
-	}
-
-	std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum "
-		 <<std::hex<<header.checksum
-		 <<std::dec<<" plaq "<< header.plaquette <<std::endl;
-
-      }
-      ///////////////////////////////
-      // RNG state
-      ///////////////////////////////
-      // Write the serial and parallel RNG states to "file" behind a NERSC-style header.
-      // floating_point / data_type are set according to whichever of RNG_RANLUX,
-      // RNG_MT19937 or RNG_SITMO is defined.  Link trace and plaquette are zero.
-      // The header is written twice by the boss rank: once to find the payload
-      // offset, and again after the payload so it carries the final checksum.
-      static inline void writeRNGState(GridSerialRNG &serial,GridParallelRNG &parallel,std::string file)
-      {
-	typedef typename GridParallelRNG::RngStateType RngStateType;
-
-	// Following should become arguments
-	FieldMetaData header;
-	header.sequence_number = 1;
-	header.ensemble_id     = "UKQCD";
-	header.ensemble_label  = "DWF";
-
-	GridBase *grid = parallel._grid;
-
-	GridMetaData(grid,header);
-	assert(header.nd==4);
-	header.link_trace=0.0;
-	header.plaquette=0.0;
-	MachineCharacteristics(header);
-
-	uint64_t offset;
-  
-#ifdef RNG_RANLUX
-	header.floating_point = std::string("UINT64");
-	header.data_type      = std::string("RANLUX48");
-#endif
-#ifdef RNG_MT19937
-	header.floating_point = std::string("UINT32");
-	header.data_type      = std::string("MT19937");
-#endif
-#ifdef RNG_SITMO
-	header.floating_point = std::string("UINT64");
-	header.data_type      = std::string("SITMO");
-#endif
-
-	// Only the boss rank touches the header; the offset is then broadcast from rank 0
-	if ( grid->IsBoss() ) { 
-	  truncate(file);
-	  offset = writeHeader(header,file);
-	}
-	grid->Broadcast(0,(void *)&offset,sizeof(offset));
-	
-	uint32_t nersc_csum,scidac_csuma,scidac_csumb;
-	BinaryIO::writeRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb);
-	// Rewrite the header now that the checksum of the payload is known
-	header.checksum = nersc_csum;
-	if ( grid->IsBoss() ) { 
-	  offset = writeHeader(header,file);
-	}
-
-	std::cout<<GridLogMessage 
-		 <<"Written NERSC RNG STATE "<<file<< " checksum "
-		 <<std::hex<<header.checksum
-		 <<std::dec<<std::endl;
-
-      }
-    
-      // Read the serial and parallel RNG states from "file".
-      // "header" receives the parsed file header.  Its FLOATING_POINT and
-      // DATATYPE entries must match whichever of RNG_RANLUX, RNG_MT19937 or
-      // RNG_SITMO is defined.  A checksum mismatch between payload and header
-      // terminates the program with a non-zero exit status.
-      static inline void readRNGState(GridSerialRNG &serial,GridParallelRNG & parallel,FieldMetaData& header,std::string file)
-      {
-	GridBase *grid = parallel._grid;
-
-	uint64_t offset = readHeader(file,grid,header);
-
-	std::string format(header.floating_point);
-	std::string data_type(header.data_type);
-
-#ifdef RNG_RANLUX
-	assert(format == std::string("UINT64"));
-	assert(data_type == std::string("RANLUX48"));
-#endif
-#ifdef RNG_MT19937
-	assert(format == std::string("UINT32"));
-	assert(data_type == std::string("MT19937"));
-#endif
-#ifdef RNG_SITMO
-	assert(format == std::string("UINT64"));
-	assert(data_type == std::string("SITMO"));
-#endif
-
-	// depending on datatype, set up munger;
-	// munger is a function of <floating point, Real, data_type>
-	uint32_t nersc_csum,scidac_csuma,scidac_csumb;
-	BinaryIO::readRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb);
-
-	if ( nersc_csum != header.checksum ) { 
-	  std::cerr << "checksum mismatch "<<std::hex<< nersc_csum <<" "<<header.checksum<<std::dec<<std::endl;
-	  exit(1); // corrupt data is a failure: never report success to the caller
-	}
-	assert(nersc_csum == header.checksum );
-
-	std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl;
-      }
-
-    };
-
-  }}
-#endif
@@ -1,124 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/QCD.h
-
-Copyright (C) 2015
-
-Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: neo <cossu@post.kek.jp>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef GRID_LT_H
-#define GRID_LT_H
-namespace Grid{
-
-// First steps in the complete generalization of the Physics part
-// Design not final
-namespace LatticeTheories {
-
-// Compile-time description of a lattice theory in `Dimensions` dimensions.
-template <int Dimensions>
-struct LatticeTheory {
-  // Dimension count, and twice that count for double-stored fields.
-  static const int Nd  = Dimensions;
-  static const int Nds = Dimensions * 2;
-
-  // Tensor type built from three nested iScalar levels around vtype.
-  template <typename vtype>
-  using iSinglet = iScalar<iScalar<iScalar<vtype> > >;
-};
-
-// Lattice theory extended with a gauge group of `Colours` colours.
-template <int Dimensions, int Colours>
-struct LatticeGaugeTheory : public LatticeTheory<Dimensions> {
-  // Sizes used by the aliases below.
-  static const int Nd  = Dimensions;
-  static const int Nds = Dimensions * 2;
-  static const int Nc  = Colours;
-
-  // Nc x Nc matrix at the innermost tensor level.
-  template <typename vtype>
-  using iColourMatrix = iScalar<iScalar<iMatrix<vtype, Nc> > >;
-
-  // Nc-component vector at the innermost tensor level.
-  template <typename vtype>
-  using iColourVector = iScalar<iScalar<iVector<vtype, Nc> > >;
-
-  // Outer vector of Nd colour matrices.
-  template <typename vtype>
-  using iLorentzColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nd>;
-
-  // Outer vector of Nds (= 2*Nd) colour matrices.
-  template <typename vtype>
-  using iDoubleStoredColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nds>;
-};
-
-// Gauge theory extended with fermions carrying `Spin` spin components.
-template <int Dimensions, int Colours, int Spin>
-struct FermionicLatticeGaugeTheory
-    : public LatticeGaugeTheory<Dimensions, Colours> {
-  // Sizes used by the aliases below.
-  static const int Nd  = Dimensions;
-  static const int Nds = Dimensions * 2;
-  static const int Nc  = Colours;
-  static const int Ns  = Spin;
-  // Half the spin count; only meaningful when Spin is a multiple of 2.
-  static const int Nhs = Spin / 2;
-
-  // Matrix types: Ns x Ns at the middle tensor level.
-  template <typename vtype>
-  using iSpinMatrix = iScalar<iMatrix<iScalar<vtype>, Ns> >;
-  template <typename vtype>
-  using iSpinColourMatrix = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
-
-  // Vector types: Ns components at the middle tensor level.
-  template <typename vtype>
-  using iSpinVector = iScalar<iVector<iScalar<vtype>, Ns> >;
-  template <typename vtype>
-  using iSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Ns> >;
-
-  // Half-spin vector types (these two only if Spin is a multiple of 2).
-  template <typename vtype>
-  using iHalfSpinVector = iScalar<iVector<iScalar<vtype>, Nhs> >;
-  template <typename vtype>
-  using iHalfSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
-
-  // tests
-  typedef iColourMatrix<Complex>  ColourMatrix;
-  typedef iColourMatrix<ComplexF> ColourMatrixF;
-  typedef iColourMatrix<ComplexD> ColourMatrixD;
-};
-
-// Examples, not complete now.
-// QCD: FermionicLatticeGaugeTheory<4, 3, 4>.
-struct QCD : public FermionicLatticeGaugeTheory<4, 3, 4> {
-    // Direction indices: the "p" directions take 0..3, the "m" directions 4..7.
-    static const int Xp = 0;
-    static const int Yp = 1;
-    static const int Zp = 2;
-    static const int Tp = 3;
-    static const int Xm = 4;
-    static const int Ym = 5;
-    static const int Zm = 6;
-    static const int Tm = 7;
-
-    // Shorthand for the base class.
-    using FLGT = FermionicLatticeGaugeTheory;
-
-    // Spin matrices over the default, single and double precision complex types.
-    using SpinMatrix  = FLGT::iSpinMatrix<Complex>;
-    using SpinMatrixF = FLGT::iSpinMatrix<ComplexF>;
-    using SpinMatrixD = FLGT::iSpinMatrix<ComplexD>;
-};
-// QED: FermionicLatticeGaugeTheory<4, 1, 4>; body still to be filled in.
-struct QED : public FermionicLatticeGaugeTheory<4, 1, 4> {
-  // fill
-};
-
-// Scalar theory: adds nothing to LatticeTheory<Dimensions>.
-template <int Dimensions>
-struct Scalar : public LatticeTheory<Dimensions> {
-};
-
-};  // LatticeTheories
-
-} // Grid
-
-#endif
@@ -1,56 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/ActionBase.h
-
-Copyright (C) 2015-2016
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: neo <cossu@post.kek.jp>
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#ifndef ACTION_BASE_H
-#define ACTION_BASE_H
-
-namespace Grid {
-namespace QCD {
-
-// Abstract interface implemented by every action acting on a GaugeField.
-template <class GaugeField>
-class Action {
- public:
-  bool is_smeared = false;
-
-  // Heatbath?
-  // Refresh pseudofermions.
-  virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG) = 0;
-
-  // Evaluate the action.
-  virtual RealD S(const GaugeField& U) = 0;
-
-  // Evaluate the action derivative into dSdU.
-  virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0;
-
-  // Return the action name.
-  virtual std::string action_name() = 0;
-
-  // Prints action parameters.
-  virtual std::string LogParameters() = 0;
-
-  virtual ~Action() {}
-};
-
-}
-}
-
-#endif // ACTION_BASE_H
@@ -1,92 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/ActionParams.h
-
-Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#ifndef GRID_QCD_ACTION_PARAMS_H
-#define GRID_QCD_ACTION_PARAMS_H
-
-namespace Grid {
-namespace QCD {
-
-  // These can move into a params header and be given MacroMagic serialisation
-  // Parameters for the G-parity Wilson implementation: a comms/compute
-  // overlap flag (default false) and a vector of Nd twists (default all 0).
-  struct GparityWilsonImplParams {
-    bool overlapCommsCompute;
-    std::vector<int> twists;
-    // Initialisers are listed in declaration order, which is the order in
-    // which the members are actually constructed.
-    GparityWilsonImplParams() : overlapCommsCompute(false), twists(Nd, 0) {}
-  };
-  
-  // Parameters for the Wilson implementation: a comms/compute overlap flag
-  // (default false) and one boundary phase per dimension.
-  struct WilsonImplParams {
-    bool overlapCommsCompute;
-    std::vector<Complex> boundary_phases;
-
-    // Default: Nd boundary phases, each equal to 1.0.
-    WilsonImplParams() : overlapCommsCompute(false) {
-      boundary_phases.resize(Nd, 1.0);
-    }
-
-    // Use the caller-supplied boundary phases `phi`.
-    // Taken by const reference to avoid an extra copy; initialisers follow
-    // the member declaration order.
-    WilsonImplParams(const std::vector<Complex>& phi)
-      : overlapCommsCompute(false), boundary_phases(phi) {}
-  };
-
-  // Parameter block for the staggered implementation; it carries no data.
-  struct StaggeredImplParams {
-    StaggeredImplParams() {}
-  };
-  
-  // Serializable parameter set for a one-flavour rational action.
-  // Members are declared through GRID_SERIALIZABLE_CLASS_MEMBERS.
-  struct OneFlavourRationalParams : Serializable {
-    GRID_SERIALIZABLE_CLASS_MEMBERS(OneFlavourRationalParams,
-                                    RealD, lo,
-                                    RealD, hi,
-                                    int,   MaxIter,
-                                    RealD, tolerance,
-                                    int,   degree,
-                                    int,   precision);
-
-    // MaxIter and tolerance, vectors??
-
-    // Constructor; every argument has a default:
-    //   lo = 0.0, hi = 1.0, MaxIter = 1000, tolerance = 1.0e-8,
-    //   degree = 10, precision = 64.
-    OneFlavourRationalParams(RealD _lo        = 0.0,
-                             RealD _hi        = 1.0,
-                             int   _maxit     = 1000,
-                             RealD tol        = 1.0e-8,
-                             int   _degree    = 10,
-                             int   _precision = 64)
-      : lo(_lo), hi(_hi),
-        MaxIter(_maxit), tolerance(tol),
-        degree(_degree), precision(_precision)
-    {}
-  };
-  
-  
-}
-}
-
-
-
-
-#endif
@@ -1,100 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/AbstractEOFAFermion.h
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef  GRID_QCD_ABSTRACT_EOFA_FERMION_H
-#define  GRID_QCD_ABSTRACT_EOFA_FERMION_H
-
-#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
-
-namespace Grid {
-namespace QCD {
-
-  // DJM: Abstract base class for EOFA fermion types.
-  // Defines layout of additional EOFA-specific parameters and operators.
-  // Use to construct EOFA pseudofermion actions that are agnostic to
-  // Shamir / Mobius / etc., and ensure that no one can construct EOFA
-  // pseudofermion action with non-EOFA fermion type.
-  template<class Impl>
-  class AbstractEOFAFermion : public CayleyFermion5D<Impl> {
-    public:
-      INHERIT_IMPL_TYPES(Impl);
-
-    public:
-      // Parameters of the fermion operator
-      //   D(mq1) + shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm}
-      RealD mq1;
-      RealD mq2;
-      RealD mq3;
-      RealD shift;
-      int pm;
-
-      RealD alpha; // Mobius scale
-      RealD k;     // EOFA normalization constant
-
-      virtual void Instantiatable(void) = 0;
-
-      // EOFA-specific operations; derived classes must implement these.
-      virtual void Omega(const FermionField& in, FermionField& out, int sign, int dag) = 0;
-      virtual void Dtilde(const FermionField& in, FermionField& out) = 0;
-      virtual void DtildeInv(const FermionField& in, FermionField& out) = 0;
-
-      // Derivatives are implemented here in the base class:
-      // for EOFA both DWF and Mobius just need d(Dw)/dU.
-      virtual void MDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag)
-      {
-        this->DhopDeriv(mat, U, V, dag);
-      }
-      virtual void MoeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag)
-      {
-        this->DhopDerivOE(mat, U, V, dag);
-      }
-      virtual void MeoDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag)
-      {
-        this->DhopDerivEO(mat, U, V, dag);
-      }
-
-      // Recompute 5D coefficients for a different value of the shift constant
-      // (needed for heatbath loop over poles).
-      virtual void RefreshShiftCoefficients(RealD new_shift) = 0;
-
-      // Constructor: forwards the grids, _mq1, _M5 and p to CayleyFermion5D,
-      // stores the EOFA parameters, then sets alpha = _b + _c and the
-      // normalization constant k.
-      AbstractEOFAFermion(GaugeField& _Umu,
-                          GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
-                          GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
-                          RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int _pm,
-                          RealD _M5, RealD _b, RealD _c, const ImplParams& p=ImplParams())
-        : CayleyFermion5D<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
-                                FourDimGrid, FourDimRedBlackGrid, _mq1, _M5, p),
-          mq1(_mq1), mq2(_mq2), mq3(_mq3), shift(_shift), pm(_pm)
-      {
-        const int Ls = this->Ls;
-        this->alpha = _b + _c;
-
-        // k = alpha*(mq3-mq2)*(alpha+1)^(2Ls)
-        //       / [ (alpha+1)^Ls + mq2*(alpha-1)^Ls ]
-        //       / [ (alpha+1)^Ls + mq3*(alpha-1)^Ls ]
-        const RealD numerator = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls);
-        const RealD denom_mq2 = std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls);
-        const RealD denom_mq3 = std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls);
-        this->k = numerator / denom_mq2 / denom_mq3;
-      }
-  };
-}}
-
-#endif
@@ -1,438 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/Grid_Eigen_Dense.h>
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-    // Constructor: forwards to AbstractEOFAFermion with b = 1.0, c = 0.0, then
-    // sets the coefficients from the data returned by Approx::higham(1.0, Ls).
-    template<class Impl>
-    DomainWallEOFAFermion<Impl>::DomainWallEOFAFermion(
-      GaugeField&            _Umu,
-      GridCartesian&         FiveDimGrid,
-      GridRedBlackCartesian& FiveDimRedBlackGrid,
-      GridCartesian&         FourDimGrid,
-      GridRedBlackCartesian& FourDimRedBlackGrid,
-      RealD _mq1, RealD _mq2, RealD _mq3,
-      RealD _shift, int _pm, RealD _M5, const ImplParams& p)
-      : AbstractEOFAFermion<Impl>(_Umu,
-                                  FiveDimGrid, FiveDimRedBlackGrid,
-                                  FourDimGrid, FourDimRedBlackGrid,
-                                  _mq1, _mq2, _mq3, _shift, _pm, _M5, 1.0, 0.0, p)
-    {
-        const RealD eps = 1.0;
-
-        Approx::zolotarev_data* zdata = Approx::higham(eps, this->Ls);
-        assert(zdata->n == this->Ls);
-
-        std::cout << GridLogMessage << "DomainWallEOFAFermion with Ls=" << this->Ls << std::endl;
-
-        this->SetCoefficientsTanh(zdata, 1.0, 0.0);
-
-        // zdata is only needed while the coefficients are being set.
-        Approx::zolotarev_free(zdata);
-    }
-
-    /***************************************************************
-     * Additional EOFA operators only called outside the inverter.
-     * Since speed is not essential, simple axpby-style
-     * implementations should be fine.
-     ***************************************************************/
-    // Zero Din, then issue one axpby_ssp call chosen by (sign, dag):
-    //   sign=+1, dag=0 : slice arguments (Ls-1, 0)
-    //   sign=-1, dag=0 : slice arguments (0, 0)
-    //   sign=+1, dag=1 : slice arguments (0, Ls-1)
-    //   sign=-1, dag=1 : slice arguments (0, 0)
-    // Any other (sign, dag) combination leaves Din equal to zero.
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
-    {
-        const int last = this->Ls - 1;
-
-        Din = zero;
-
-        if(dag == 0){
-            if(sign == 1)       { axpby_ssp(Din, 0.0, psi, 1.0, psi, last, 0); }
-            else if(sign == -1) { axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
-        } else if(dag == 1){
-            if(sign == 1)       { axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, last); }
-            else if(sign == -1) { axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
-        }
-    }
-
-    // This is just the identity for DWF
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi)
-    {
-        // Identity: the output is a copy of the input.
-        chi = psi;
-    }
-
-    // This is just the identity for DWF
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi)
-    {
-        // Inverse of the identity: the output is a copy of the input.
-        chi = psi;
-    }
-
-    /*****************************************************************************************************/
-
-    // Apply the operator to psi, writing the result to chi, and return
-    // norm2(chi).
-    // Steps: Din = Meooe5D(psi); chi = DW(Din); chi = chi + psi (via axpby);
-    // then the two-argument M5D(psi, chi) is applied, which reads and updates chi.
-    template<class Impl>
-    RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
-    {
-        FermionField Din(psi._grid);
-
-        this->Meooe5D(psi, Din);
-        this->DW(Din, chi, DaggerNo);
-        axpby(chi, 1.0, 1.0, chi, psi);
-        this->M5D(psi, chi);
-        return norm2(chi);
-    }
-
-    // Apply the daggered operator to psi, writing the result to chi, and
-    // return norm2(chi).
-    // Steps: Din = DW(psi) with DaggerYes; chi = MeooeDag5D(Din); the
-    // two-argument M5Ddag(psi, chi) is applied, which reads and updates chi;
-    // finally chi = chi + psi (via axpby).
-    template<class Impl>
-    RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
-    {
-        FermionField Din(psi._grid);
-
-        this->DW(psi, Din, DaggerYes);
-        this->MeooeDag5D(Din, chi);
-        this->M5Ddag(psi, chi);
-        axpby(chi, 1.0, 1.0, chi, psi);
-        return norm2(chi);
-    }
-
-    /********************************************************************
-     * Performance critical fermion operators called inside the inverter
-     ********************************************************************/
-
-    // Two-argument M5D: build the lower/diag/upper coefficient vectors and
-    // forward to the six-argument M5D with chi passed as both phi and output.
-    //   diag  = 1 everywhere
-    //   upper = -1 everywhere except upper[Ls-1] = mq1 + shiftm
-    //   lower = -1 everywhere except lower[0]    = mq1 + shiftp
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
-    {
-        const int   Ls    = this->Ls;
-        const int   pm    = this->pm;
-        const RealD shift = this->shift;
-        const RealD mq1   = this->mq1;
-        const RealD mq2   = this->mq2;
-        const RealD mq3   = this->mq3;
-
-        // coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
-        // Only one of the two is non-zero, selected by pm; both stay 0 when shift == 0.
-        Coeff_t shiftp(0.0);
-        Coeff_t shiftm(0.0);
-        if(shift != 0.0){
-          if(pm == 1){
-            shiftp = shift*(mq3-mq2);
-          } else {
-            shiftm = -shift*(mq3-mq2);
-          }
-        }
-
-        std::vector<Coeff_t> diag(Ls,1.0);
-        std::vector<Coeff_t> upper(Ls,-1.0);
-        std::vector<Coeff_t> lower(Ls,-1.0);
-        upper[Ls-1] = mq1 + shiftm;
-        lower[0]    = mq1 + shiftp;
-
-        // Disabled debug dump of the three coefficient vectors.
-        #if(0)
-            std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
-            for(int i=0; i<diag.size(); ++i){
-                std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
-            }
-            for(int i=0; i<upper.size(); ++i){
-                std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
-            }
-            for(int i=0; i<lower.size(); ++i){
-                std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
-            }
-        #endif
-
-        this->M5D(psi, chi, chi, lower, diag, upper);
-    }
-
-    // Two-argument M5Ddag: build the lower/diag/upper coefficient vectors and
-    // forward to the six-argument M5Ddag with chi passed as both phi and output.
-    // Compared with M5D the roles of shiftp and shiftm are exchanged:
-    //   upper[Ls-1] = mq1 + shiftp,  lower[0] = mq1 + shiftm.
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
-    {
-        const int   Ls    = this->Ls;
-        const int   pm    = this->pm;
-        const RealD shift = this->shift;
-        const RealD mq1   = this->mq1;
-        const RealD mq2   = this->mq2;
-        const RealD mq3   = this->mq3;
-
-        // coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
-        // Only one of the two is non-zero, selected by pm; both stay 0 when shift == 0.
-        Coeff_t shiftp(0.0);
-        Coeff_t shiftm(0.0);
-        if(shift != 0.0){
-          if(pm == 1){
-            shiftp = shift*(mq3-mq2);
-          } else {
-            shiftm = -shift*(mq3-mq2);
-          }
-        }
-
-        std::vector<Coeff_t> diag(Ls,1.0);
-        std::vector<Coeff_t> upper(Ls,-1.0);
-        std::vector<Coeff_t> lower(Ls,-1.0);
-        upper[Ls-1] = mq1 + shiftp;
-        lower[0]    = mq1 + shiftm;
-
-        // Disabled debug dump of the three coefficient vectors.
-        #if(0)
-            std::cout << GridLogMessage << "DomainWallEOFAFermion::M5Ddag(FF&,FF&):" << std::endl;
-            for(int i=0; i<diag.size(); ++i){
-                std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
-            }
-            for(int i=0; i<upper.size(); ++i){
-                std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
-            }
-            for(int i=0; i<lower.size(); ++i){
-                std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
-            }
-        #endif
-
-        this->M5Ddag(psi, chi, chi, lower, diag, upper);
-    }
-
-    // half checkerboard operations
-    // Mooee: call the six-argument M5D with psi as both input fields and
-    //   diag  = bee
-    //   upper = -cee, except upper[Ls-1] = dm
-    //   lower = -cee, except lower[0]    = dp
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
-    {
-        const int Ls = this->Ls;
-
-        std::vector<Coeff_t> diag(this->bee);
-        std::vector<Coeff_t> upper(Ls);
-        std::vector<Coeff_t> lower(Ls);
-
-        for(int s=0; s<Ls; ++s){
-          lower[s] = -this->cee[s];
-          upper[s] = -this->cee[s];
-        }
-
-        // Corner entries carry the (possibly shifted) mass terms.
-        lower[0]    = this->dp;
-        upper[Ls-1] = this->dm;
-
-        this->M5D(psi, psi, chi, lower, diag, upper);
-    }
-
-    // MooeeDag: call the six-argument M5Ddag with psi as both input fields and
-    //   diag  = bee
-    //   upper = -cee, except upper[Ls-1] = dp
-    //   lower = -cee, except lower[0]    = dm
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
-    {
-        const int Ls = this->Ls;
-
-        std::vector<Coeff_t> diag(this->bee);
-        std::vector<Coeff_t> upper(Ls);
-        std::vector<Coeff_t> lower(Ls);
-
-        for(int s=0; s<Ls; ++s){
-          lower[s] = -this->cee[s];
-          upper[s] = -this->cee[s];
-        }
-
-        // Corner entries: dp and dm are exchanged relative to Mooee.
-        lower[0]    = this->dm;
-        upper[Ls-1] = this->dp;
-
-        this->M5Ddag(psi, psi, chi, lower, diag, upper);
-    }
-
-    /****************************************************************************************/
-
-    //Zolo
-    // Fill the coefficient arrays (bs, cs, aee, aeo, bee, beo, cee, ceo), the
-    // EOFA corner terms dp / dm, and the LDU factors (dee, lee, leem, uee, ueem)
-    // from the current M5, mq1, mq2, mq3, shift and pm, then precompute the
-    // inverse matrices through MooeeInternalCompute.
-    // NOTE(review): the arguments zolo_hi, gamma, b and c are not referenced
-    // anywhere in this body.
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c)
-    {
-        int   Ls    = this->Ls;
-        int   pm    = this->pm;
-        RealD mq1   = this->mq1;
-        RealD mq2   = this->mq2;
-        RealD mq3   = this->mq3;
-        RealD shift = this->shift;
-
-        ////////////////////////////////////////////////////////
-        // Constants for the preconditioned matrix Cayley form
-        ////////////////////////////////////////////////////////
-        this->bs.resize(Ls);
-        this->cs.resize(Ls);
-        this->aee.resize(Ls);
-        this->aeo.resize(Ls);
-        this->bee.resize(Ls);
-        this->beo.resize(Ls);
-        this->cee.resize(Ls);
-        this->ceo.resize(Ls);
-
-        // Every slice gets the same diagonal (5 - M5) and off-diagonal (1) value.
-        for(int i=0; i<Ls; ++i){
-          this->bee[i] = 4.0 - this->M5 + 1.0;
-          this->cee[i] = 1.0;
-        }
-
-        // aee mirrors cee; bs/beo are 1 and cs/ceo are 0. aeo is resized above
-        // but not assigned in this function.
-        for(int i=0; i<Ls; ++i){
-          this->aee[i] = this->cee[i];
-          this->bs[i] = this->beo[i] = 1.0;
-          this->cs[i] = this->ceo[i] = 0.0;
-        }
-
-        //////////////////////////////////////////
-        // EOFA shift terms
-        //////////////////////////////////////////
-        // pm == +1 adds the shift term to dp, pm == -1 subtracts it from dm,
-        // any other pm leaves both corner terms unshifted.
-        if(pm == 1){
-          this->dp = mq1*this->cee[0] + shift*(mq3-mq2);
-          this->dm = mq1*this->cee[Ls-1];
-        } else if(this->pm == -1) {
-          this->dp = mq1*this->cee[0];
-          this->dm = mq1*this->cee[Ls-1] - shift*(mq3-mq2);
-        } else {
-          this->dp = mq1*this->cee[0];
-          this->dm = mq1*this->cee[Ls-1];
-        }
-
-        //////////////////////////////////////////
-        // LDU decomposition of eeoo
-        //////////////////////////////////////////
-        // dee holds Ls+1 entries: indices 0..Ls-2 are set in the loop below,
-        // indices Ls-1 and Ls in the block that follows it.
-        this->dee.resize(Ls+1);
-        this->lee.resize(Ls);
-        this->leem.resize(Ls);
-        this->uee.resize(Ls);
-        this->ueem.resize(Ls);
-
-        for(int i=0; i<Ls; ++i){
-
-          if(i < Ls-1){
-
-            this->lee[i] = -this->cee[i+1]/this->bee[i]; // sub-diag entry on the ith column
-
-            // leem[i] = dm/bee[i] * prod_{j<i} aee[j]/bee[j]
-            this->leem[i] = this->dm/this->bee[i];
-            for(int j=0; j<i; j++){ this->leem[i] *= this->aee[j]/this->bee[j]; }
-
-            this->dee[i] = this->bee[i];
-
-            this->uee[i] = -this->aee[i]/this->bee[i];   // up-diag entry on the ith row
-
-            // ueem[i] = dp/bee[0] * prod_{1<=j<=i} cee[j]/bee[j]
-            this->ueem[i] = this->dp / this->bee[0];
-            for(int j=1; j<=i; j++){ this->ueem[i] *= this->cee[j]/this->bee[j]; }
-
-          } else {
-
-            // Last slice: all four factor entries are zero.
-            this->lee[i]  = 0.0;
-            this->leem[i] = 0.0;
-            this->uee[i]  = 0.0;
-            this->ueem[i] = 0.0;
-
-          }
-        }
-
-        {
-          // delta_d = 1/bee[0] * prod_{1<=j<=Ls-2} cee[j]/bee[j]
-          Coeff_t delta_d = 1.0 / this->bee[0];
-          for(int j=1; j<Ls-1; j++){ delta_d *= this->cee[j] / this->bee[j]; }
-          this->dee[Ls-1] = this->bee[Ls-1] + this->cee[0] * this->dm * delta_d;
-          this->dee[Ls] = this->bee[Ls-1] + this->cee[Ls-1] * this->dp * delta_d;
-        }
-
-        // Precompute the inverse matrices (inv = 1) for dag = 0 and dag = 1.
-        int inv = 1;
-        this->MooeeInternalCompute(0, inv, this->MatpInv, this->MatmInv);
-        this->MooeeInternalCompute(1, inv, this->MatpInvDag, this->MatmInvDag);
-    }
-
-    // Recompute Cayley-form coefficients for different shift
-    // Store new_shift and rebuild the coefficients from freshly generated
-    // Approx::higham(1.0, Ls) data.
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
-    {
-        this->shift = new_shift;
-        Approx::zolotarev_data *zdata = Approx::higham(1.0, this->Ls);
-        this->SetCoefficientsTanh(zdata, 1.0, 0.0);
-        // Release the approximation data once the coefficients are set, exactly
-        // as the constructor does; otherwise every refresh leaks one allocation.
-        Approx::zolotarev_free(zdata);
-    }
-
-    // Build the Ls x Ls matrices Pplus / Pminus from bee, cee, dp and dm,
-    // optionally invert them (inv != 0) and/or take their adjoint (dag != 0),
-    // and pack the result into the SIMD vectors Matp / Matm (Ls*LLs entries
-    // each). Returns without touching Matp / Matm when LLs == Ls.
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
-        Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-    {
-        int Ls = this->Ls;
-
-        GridBase* grid = this->FermionRedBlackGrid();
-        int LLs = grid->_rdimensions[0];
-
-        if(LLs == Ls){ return; } // Not vectorised in 5th direction
-
-        Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
-        Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
-
-        // Both matrices share the diagonal bee.
-        for(int s=0; s<Ls; s++){
-            Pplus(s,s)  = this->bee[s];
-            Pminus(s,s) = this->bee[s];
-        }
-
-        // Pminus: super-diagonal entries -cee[s].
-        for(int s=0; s<Ls-1; s++){
-            Pminus(s,s+1) = -this->cee[s];
-        }
-
-        // Pplus: sub-diagonal entries -cee[s+1].
-        for(int s=0; s<Ls-1; s++){
-            Pplus(s+1,s) = -this->cee[s+1];
-        }
-
-        // Corner entries carry dp and dm.
-        Pplus (0,Ls-1) = this->dp;
-        Pminus(Ls-1,0) = this->dm;
-
-        Eigen::MatrixXcd PplusMat ;
-        Eigen::MatrixXcd PminusMat;
-
-        // Disabled debug dump of Pplus and Pminus.
-        #if(0)
-            std::cout << GridLogMessage << "Pplus:" << std::endl;
-            for(int s=0; s<Ls; ++s){
-                for(int ss=0; ss<Ls; ++ss){
-                    std::cout << Pplus(s,ss) << "\t";
-                }
-                std::cout << std::endl;
-            }
-            std::cout << GridLogMessage << "Pminus:" << std::endl;
-            for(int s=0; s<Ls; ++s){
-                for(int ss=0; ss<Ls; ++ss){
-                    std::cout << Pminus(s,ss) << "\t";
-                }
-                std::cout << std::endl;
-            }
-        #endif
-
-        if(inv) {
-            PplusMat  = Pplus.inverse();
-            PminusMat = Pminus.inverse();
-        } else {
-            PplusMat  = Pplus;
-            PminusMat = Pminus;
-        }
-
-        if(dag){
-            PplusMat.adjointInPlace();
-            PminusMat.adjointInPlace();
-        }
-
-        typedef typename SiteHalfSpinor::scalar_type scalar_type;
-        const int Nsimd = Simd::Nsimd();
-        Matp.resize(Ls*LLs);
-        Matm.resize(Ls*LLs);
-
-        // Entry (s2, s1) of the packed arrays: SIMD lane l holds matrix element
-        // (row l*LLs + s1, column s2).
-        for(int s2=0; s2<Ls; s2++){
-        for(int s1=0; s1<LLs; s1++){
-            int istride = LLs;
-            int ostride = 1;
-            Simd Vp;
-            Simd Vm;
-            // View the SIMD words as arrays of Nsimd scalars to fill them lane by lane.
-            scalar_type *sp = (scalar_type*) &Vp;
-            scalar_type *sm = (scalar_type*) &Vm;
-            for(int l=0; l<Nsimd; l++){
-                if(switcheroo<Coeff_t>::iscomplex()) {
-                    sp[l] = PplusMat (l*istride+s1*ostride,s2);
-                    sm[l] = PminusMat(l*istride+s1*ostride,s2);
-                } else {
-                    // if real
-                    // Both components of the stored scalar are set to the real part.
-                    scalar_type tmp;
-                    tmp = PplusMat (l*istride+s1*ostride,s2);
-                    sp[l] = scalar_type(tmp.real(),tmp.real());
-                    tmp = PminusMat(l*istride+s1*ostride,s2);
-                    sm[l] = scalar_type(tmp.real(),tmp.real());
-                }
-            }
-            Matp[LLs*s2+s1] = Vp;
-            Matm[LLs*s2+s1] = Vm;
-        }}
-    }
-
-    // Template instantiations of DomainWallEOFAFermion, produced by macros
-    // defined outside this file (presumably one instantiation per supported
-    // Impl type; confirm against the macro definitions).
-    FermOpTemplateInstantiate(DomainWallEOFAFermion);
-    GparityFermOpTemplateInstantiate(DomainWallEOFAFermion);
-
-}}
@@ -1,115 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.h
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef  GRID_QCD_DOMAIN_WALL_EOFA_FERMION_H
-#define  GRID_QCD_DOMAIN_WALL_EOFA_FERMION_H
-
-#include <Grid/qcd/action/fermion/AbstractEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-  // Domain-wall fermion operator built on the AbstractEOFAFermion interface.
-  template<class Impl>
-  class DomainWallEOFAFermion : public AbstractEOFAFermion<Impl>
-  {
-    public:
-      INHERIT_IMPL_TYPES(Impl);
-
-    public:
-      // Modified (0,Ls-1) and (Ls-1,0) elements of Mooee
-      // for red-black preconditioned Shamir EOFA
-      Coeff_t dm;
-      Coeff_t dp;
-
-      virtual void Instantiatable(void) {}
-
-      // EOFA-specific operations
-      virtual void Omega(const FermionField& in, FermionField& out, int sign, int dag);
-      virtual void Dtilde(const FermionField& in, FermionField& out);
-      virtual void DtildeInv(const FermionField& in, FermionField& out);
-
-      // override multiply
-      virtual RealD M(const FermionField& in, FermionField& out);
-      virtual RealD Mdag(const FermionField& in, FermionField& out);
-
-      // half checkerboard operations
-      virtual void Mooee(const FermionField& in, FermionField& out);
-      virtual void MooeeDag(const FermionField& in, FermionField& out);
-      virtual void MooeeInv(const FermionField& in, FermionField& out);
-      virtual void MooeeInvDag(const FermionField& in, FermionField& out);
-
-      // Two-argument forms: build the coefficient vectors internally.
-      virtual void M5D(const FermionField& psi, FermionField& chi);
-      virtual void M5Ddag(const FermionField& psi, FermionField& chi);
-
-      /////////////////////////////////////////////////////
-      // Instantiate different versions depending on Impl
-      /////////////////////////////////////////////////////
-      void M5D(const FermionField& psi,
-               const FermionField& phi,
-               FermionField& chi,
-               std::vector<Coeff_t>& lower,
-               std::vector<Coeff_t>& diag,
-               std::vector<Coeff_t>& upper);
-
-      void M5Ddag(const FermionField& psi,
-                  const FermionField& phi,
-                  FermionField& chi,
-                  std::vector<Coeff_t>& lower,
-                  std::vector<Coeff_t>& diag,
-                  std::vector<Coeff_t>& upper);
-
-      void MooeeInternal(const FermionField& in, FermionField& out, int dag, int inv);
-
-      void MooeeInternalCompute(int dag, int inv,
-                                Vector<iSinglet<Simd>>& Matp,
-                                Vector<iSinglet<Simd>>& Matm);
-
-      void MooeeInternalAsm(const FermionField& in, FermionField& out,
-                            int LLs, int site,
-                            Vector<iSinglet<Simd>>& Matp,
-                            Vector<iSinglet<Simd>>& Matm);
-
-      void MooeeInternalZAsm(const FermionField& in, FermionField& out,
-                             int LLs, int site,
-                             Vector<iSinglet<Simd>>& Matp,
-                             Vector<iSinglet<Simd>>& Matm);
-
-      virtual void RefreshShiftCoefficients(RealD new_shift);
-
-      // Constructors
-      DomainWallEOFAFermion(GaugeField& _Umu,
-                            GridCartesian& FiveDimGrid,
-                            GridRedBlackCartesian& FiveDimRedBlackGrid,
-                            GridCartesian& FourDimGrid,
-                            GridRedBlackCartesian& FourDimRedBlackGrid,
-                            RealD _mq1, RealD _mq2, RealD _mq3,
-                            RealD _shift, int pm,
-                            RealD _M5,
-                            const ImplParams& p=ImplParams());
-
-    protected:
-      void SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c);
-  };
-}}
-
-// Explicitly instantiate, for implementation type A, the four member
-// functions listed below: the six-argument M5D and M5Ddag, plus MooeeInv
-// and MooeeInvDag.
-#define INSTANTIATE_DPERP_DWF_EOFA(A)\
-template void DomainWallEOFAFermion<A>::M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, \
-  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
-template void DomainWallEOFAFermion<A>::M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, \
-  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
-template void DomainWallEOFAFermion<A>::MooeeInv(const FermionField& psi, FermionField& chi); \
-template void DomainWallEOFAFermion<A>::MooeeInvDag(const FermionField& psi, FermionField& chi);
-
-// Compile-time switches: the CACHE and VEC variants are defined, the DENSE
-// and LINALG variants are undefined.
-// NOTE(review): presumably each switch enables one implementation source file
-// for the functions named in INSTANTIATE_DPERP_DWF_EOFA; confirm against those files.
-#undef  DOMAIN_WALL_EOFA_DPERP_DENSE
-#define DOMAIN_WALL_EOFA_DPERP_CACHE
-#undef  DOMAIN_WALL_EOFA_DPERP_LINALG
-#define DOMAIN_WALL_EOFA_DPERP_VEC
-
-#endif
@@ -1,248 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermioncache.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-    // FIXME -- make a version of these routines with site loop outermost for cache reuse.
-
-    // M5D (cache backend). Pminus couples forwards (s+1), Pplus backwards (s-1), wrapping at the ends of each Ls-block:
-    //   chi[s] = diag[s]*phi[s] + upper[s]*P_- psi[s+1] + lower[s]*P_+ psi[s-1]
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
-        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-    {
-        int Ls = this->Ls;
-        GridBase* grid = psi._grid;
-
-        assert(phi.checkerboard == psi.checkerboard);
-        chi.checkerboard = psi.checkerboard;
-        // Flops = 6.0*(Nc*Ns) *Ls*vol
-        this->M5Dcalls++;
-        this->M5Dtime -= usecond();
-
-        parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
-            auto tmp = psi._odata[0]; // scratch spinor: created once per block (as in M5Ddag), overwritten before every use
-            for(int s=0; s<Ls; s++){
-                if(s==0) {
-                    spProj5m(tmp, psi._odata[ss+s+1]);
-                    chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-                    spProj5p(tmp, psi._odata[ss+Ls-1]);
-                    chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-                } else if(s==(Ls-1)) {
-                    spProj5m(tmp, psi._odata[ss+0]);
-                    chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-                    spProj5p(tmp, psi._odata[ss+s-1]);
-                    chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-                } else {
-                    spProj5m(tmp, psi._odata[ss+s+1]);
-                    chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-                    spProj5p(tmp, psi._odata[ss+s-1]);
-                    chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-                }
-            }
-        }
-
-        this->M5Dtime += usecond();
-    }
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
-        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-    {
-        // Same update as M5D with the two chiral projectors exchanged. Within each block
-        // of Ls consecutive sites:
-        //   chi[s] = diag[s]*phi[s] + upper[s]*P_+ psi[s_fwd] + lower[s]*P_- psi[s_bwd]
-        // s_fwd / s_bwd are the s+1 / s-1 slices of the same block, wrapped at its ends.
-        // The result is built in two steps (upper term first, then lower), so the
-        // floating-point evaluation order is fixed.
-        const int Ls5 = this->Ls;
-        GridBase* const fgrid = psi._grid;
-
-        assert(phi.checkerboard == psi.checkerboard);
-        chi.checkerboard = psi.checkerboard;
-
-        // Flops = 6.0*(Nc*Ns) *Ls*vol
-        this->M5Dcalls++;
-        this->M5Dtime -= usecond();
-
-        parallel_for(int base=0; base<fgrid->oSites(); base+=Ls5){ // one Ls-block per iteration
-            auto proj = psi._odata[0]; // scratch spinor, overwritten by every projection
-            for(int s=0; s<Ls5; s++){
-                // Slice 0 always looks forward to s+1; only a last slice that is not slice 0 wraps to 0.
-                const int s_fwd = ((s != 0) && (s == Ls5-1)) ? 0     : s+1;
-                const int s_bwd = (s == 0)                   ? Ls5-1 : s-1;
-                const int here  = base + s;
-
-                spProj5p(proj, psi._odata[base+s_fwd]);
-                chi[here] = diag[s]*phi[here] + upper[s]*proj;
-                spProj5m(proj, psi._odata[base+s_bwd]);
-                chi[here] = chi[here] + lower[s]*proj;
-            }
-        }
-
-        this->M5Dtime += usecond();
-    }
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-    {
-        // Cache backend of MooeeInv: for each block of Ls sites, chi is built from psi by the solve stages
-        // labelled below, using the coefficient tables lee, leem, dee (Ls+1 entries are read), ueem and uee.
-        GridBase* grid = psi._grid;
-        int Ls = this->Ls;
-        chi.checkerboard = psi.checkerboard;
-
-        this->MooeeInvCalls++;
-        this->MooeeInvTime -= usecond();
-
-        parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
-            auto tmp1 = psi._odata[0];
-            auto tmp2 = psi._odata[0];
-            // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
-            // Apply (L^{\prime})^{-1}
-            chi[ss] = psi[ss]; // chi[0]=psi[0]
-            for(int s=1; s<Ls; s++){
-                spProj5p(tmp1, chi[ss+s-1]);
-                chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
-            }
-
-            // L_m^{-1}
-            for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-                spProj5m(tmp1, chi[ss+s]);
-                chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
-            }
-
-            // U_m^{-1} D^{-1}
-            // The loop never writes slice Ls-1, so project it once; this also defines tmp1 for the last-slice update when Ls==1.
-            spProj5p(tmp1, chi[ss+Ls-1]);
-            for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
-                chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls])*tmp1;
-            }
-            spProj5m(tmp2, chi[ss+Ls-1]);
-            chi[ss+Ls-1] = (1.0/this->dee[Ls])*tmp1 + (1.0/this->dee[Ls-1])*tmp2;
-
-            // Apply U^{-1}
-            for(int s=Ls-2; s>=0; s--){
-                spProj5m(tmp1, chi[ss+s+1]);
-                chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
-            }
-        }
-
-        this->MooeeInvTime += usecond();
-    }
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-    {
-        // Dagger variant of MooeeInv: the stages below mirror it with P_+ / P_- exchanged and conjugated coefficients.
-        // chi is purely an output, so there is no checkerboard precondition (the old psi==psi assert could never fire).
-        GridBase* grid = psi._grid;
-        int Ls = this->Ls;
-        chi.checkerboard = psi.checkerboard;
-
-        std::vector<Coeff_t> ueec(Ls);
-        std::vector<Coeff_t> deec(Ls+1);
-        std::vector<Coeff_t> leec(Ls);
-        std::vector<Coeff_t> ueemc(Ls);
-        std::vector<Coeff_t> leemc(Ls);
-
-        for(int s=0; s<Ls; s++){ // conjugate the Ls-sized tables; deec has one extra entry, filled after the loop
-            ueec[s]  = conjugate(this->uee[s]);
-            deec[s]  = conjugate(this->dee[s]);
-            leec[s]  = conjugate(this->lee[s]);
-            ueemc[s] = conjugate(this->ueem[s]);
-            leemc[s] = conjugate(this->leem[s]);
-        }
-        deec[Ls] = conjugate(this->dee[Ls]);
-
-        this->MooeeInvCalls++;
-        this->MooeeInvTime -= usecond();
-
-        parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
-            auto tmp1 = psi._odata[0];
-            auto tmp2 = psi._odata[0];
-
-            // Apply (U^{\prime})^{-dagger}
-            chi[ss] = psi[ss];
-            for(int s=1; s<Ls; s++){
-                spProj5m(tmp1, chi[ss+s-1]);
-                chi[ss+s] = psi[ss+s] - ueec[s-1]*tmp1;
-            }
-
-            // U_m^{-\dagger}
-            for(int s=0; s<Ls-1; s++){
-                spProj5p(tmp1, chi[ss+s]);
-                chi[ss+Ls-1] = chi[ss+Ls-1] - ueemc[s]*tmp1;
-            }
-
-            // L_m^{-\dagger} D^{-dagger}
-            // The loop never writes slice Ls-1, so project it once; this also defines tmp1 for the last-slice update when Ls==1.
-            spProj5m(tmp1, chi[ss+Ls-1]);
-            for(int s=0; s<Ls-1; s++){
-                chi[ss+s] = (1.0/deec[s])*chi[ss+s] - (leemc[s]/deec[Ls-1])*tmp1;
-            }
-            spProj5p(tmp2, chi[ss+Ls-1]);
-            chi[ss+Ls-1] = (1.0/deec[Ls-1])*tmp1 + (1.0/deec[Ls])*tmp2;
-
-            // Apply L^{-dagger}
-            for(int s=Ls-2; s>=0; s--){
-                spProj5p(tmp1, chi[ss+s+1]);
-                chi[ss+s] = chi[ss+s] - leec[s]*tmp1;
-            }
-        }
-
-        this->MooeeInvTime += usecond();
-    }
-    // Explicit instantiations of the four routines defined in this file, emitted only when the CACHE backend macro is defined.
-    #ifdef DOMAIN_WALL_EOFA_DPERP_CACHE
-
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
-
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
-
-    #endif // DOMAIN_WALL_EOFA_DPERP_CACHE
-
-}}
@@ -1,159 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermiondense.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/Grid_Eigen_Dense.h>
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-    /*
-    * Dense matrix versions of routines: MooeeInv and MooeeInvDag both forward to MooeeInternal with the dag/inv flags.
-    */
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-    {
-        this->MooeeInternal(psi, chi, DaggerYes, InverseYes); // inverse requested, then its adjoint (dag = DaggerYes)
-    }
-    // MooeeInv: dense path via MooeeInternal with the inverse requested and no adjoint (dag = DaggerNo).
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-    {
-        this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
-    }
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
-    {
-        // Dense implementation: assembles the Ls x Ls coefficient matrices acting on the
-        // P_+ and P_- chiral halves, optionally inverts (inv) and/or takes the adjoint (dag) of them,
-        // and then applies the result to psi block by block, storing the outcome in chi.
-        const int Ls      = this->Ls;
-        const int LLs     = psi._grid->_rdimensions[0];
-        const int nblocks = psi._grid->oSites()/LLs;   // number of Ls-blocks to process
-
-        chi.checkerboard = psi.checkerboard;
-
-        // The grid's reduced extent in dimension 0 must equal Ls (cf. the "non-vectorised" note below).
-        assert(Ls==LLs);
-
-        Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
-        Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
-
-        // bee on both diagonals; -cee on the super-diagonal of Pminus and the sub-diagonal of Pplus.
-        for(int s=0; s<Ls; s++){
-            Pplus (s,s) = this->bee[s];
-            Pminus(s,s) = this->bee[s];
-            if(s < Ls-1){
-                Pminus(s,s+1) = -this->cee[s];
-                Pplus (s+1,s) = -this->cee[s+1];
-            }
-        }
-
-        // Corner entries are written last, after the diagonals.
-        Pplus (0,Ls-1) = this->dp;
-        Pminus(Ls-1,0) = this->dm;
-
-        // Start from the plain matrices and replace them by their inverses on request.
-        Eigen::MatrixXd PplusMat  = Pplus;
-        Eigen::MatrixXd PminusMat = Pminus;
-        if(inv){
-            PplusMat  = Pplus.inverse();
-            PminusMat = Pminus.inverse();
-        }
-
-        if(dag){
-            PplusMat.adjointInPlace();
-            PminusMat.adjointInPlace();
-        }
-
-        // For the non-vectorised s-direction this is simple
-        for(int blk=0; blk<nblocks; blk++){
-
-            SiteSpinor     acc;
-            SiteHalfSpinor hplus;
-            SiteHalfSpinor hminus;
-
-            for(int row=0; row<Ls; row++){
-                acc = zero;
-                for(int col=0; col<Ls; col++){
-                    const int src = col + Ls*blk;
-                    // Exactly-zero matrix entries are skipped.
-                    if(PplusMat(row,col) != 0.0){
-                        spProj5p(hplus, psi[src]);
-                        accumRecon5p(acc, PplusMat(row,col)*hplus);
-                    }
-                    if(PminusMat(row,col) != 0.0){
-                        spProj5m(hminus, psi[src]);
-                        accumRecon5m(acc, PminusMat(row,col)*hminus);
-                    }
-                }
-                // The factor 0.5 is applied once to the accumulated sum.
-                chi[row+Ls*blk] = acc*0.5;
-            }
-        }
-    }
-    // Explicit instantiations for the dense backend, emitted only when DOMAIN_WALL_EOFA_DPERP_DENSE is defined.
-    #ifdef DOMAIN_WALL_EOFA_DPERP_DENSE
-        // NOTE(review): the macro also instantiates M5D/M5Ddag, which this file does not define -- confirm where they come from.
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
-        // MooeeInternal is not covered by the macro, so it is instantiated explicitly for the same types.
-        template void DomainWallEOFAFermion<GparityWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<GparityWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<WilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<WilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<ZWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<ZWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
-
-        template void DomainWallEOFAFermion<GparityWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<GparityWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<WilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<WilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<ZWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<ZWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-    #endif // DOMAIN_WALL_EOFA_DPERP_DENSE
-
-}}
@@ -1,168 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionssp.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-    // FIXME -- make a version of these routines with site loop outermost for cache reuse.
-    // Pminus forwards
-    // Pplus  backwards
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
-        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-    {
-        // Single-slice linear-algebra form of M5D. For every s two slice updates are issued:
-        //   first  chi[s] from diag[s]*phi[s] and upper[s]*P_- psi[fwd],
-        //   then   chi[s] from chi[s]         and lower[s]*P_+ psi[bwd]
-        // (reading axpby_ssp_pminus/pplus as z[s] = a*x[s] + b*P y[s'], as the cache backend computes).
-        const int Ls5 = this->Ls;
-        Coeff_t unit(1.0);
-        for(int s=0; s<Ls5; s++){
-            // Slice 0 always looks forward to s+1; only a last slice that is not slice 0 wraps to 0.
-            const int fwd = ((s != 0) && (s == Ls5-1)) ? 0     : s+1;
-            const int bwd = (s == 0)                   ? Ls5-1 : s-1;
-
-            axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, fwd);
-            axpby_ssp_pplus (chi, unit, chi, lower[s], psi, s, bwd);
-        }
-    }
-
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
-        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-    {
-        // Single-slice linear-algebra form of M5Ddag: the projectors of M5D are exchanged, so
-        // the diag/upper update uses P_+ on the forward slice and the lower update uses P_- on
-        // the backward slice
-        // (reading axpby_ssp_pplus/pminus as z[s] = a*x[s] + b*P y[s'], as the cache backend computes).
-        const int Ls5 = this->Ls;
-        Coeff_t unit(1.0);
-        for(int s=0; s<Ls5; s++){
-            // Slice 0 always looks forward to s+1; only a last slice that is not slice 0 wraps to 0.
-            const int fwd = ((s != 0) && (s == Ls5-1)) ? 0     : s+1;
-            const int bwd = (s == 0)                   ? Ls5-1 : s-1;
-
-            axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, fwd);
-            axpby_ssp_pminus(chi, unit, chi, lower[s], psi, s, bwd);
-        }
-    }
-    // MooeeInv, single-slice linear-algebra backend: the solve stages labelled below are applied slice by slice to whole fields.
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-    {
-        Coeff_t one(1.0);
-        Coeff_t czero(0.0);
-        chi.checkerboard = psi.checkerboard;
-        int Ls = this->Ls;
-
-        FermionField tmp(psi._grid); // scratch field; only its slice Ls-1 is written and read below
-
-        // Apply (L^{\prime})^{-1}
-        axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
-        for(int s=1; s<Ls; s++){
-            axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// recursion Psi[s] -lee P_+ chi[s-1]
-        }
-
-        // L_m^{-1}
-        for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-            axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
-        }
-
-        // U_m^{-1} D^{-1}
-        for(int s=0; s<Ls-1; s++){
-            axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls], chi, s, Ls-1);
-        }
-        axpby_ssp_pminus(tmp, czero, chi, one/this->dee[Ls-1], chi, Ls-1, Ls-1); // tmp[Ls-1] = 0*chi[Ls-1] + (1/dee[Ls-1]) P_- chi[Ls-1]
-        axpby_ssp_pplus(chi, one, tmp, one/this->dee[Ls], chi, Ls-1, Ls-1); // chi[Ls-1] = tmp[Ls-1] + (1/dee[Ls]) P_+ chi[Ls-1]
-
-        // Apply U^{-1}
-        for(int s=Ls-2; s>=0; s--){
-            axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1);  // chi[s] = chi[s] - uee[s] P_- chi[s+1]
-        }
-    }
-    // MooeeInvDag, single-slice linear-algebra backend: same stage layout as MooeeInv with P_+ / P_- exchanged and conjugated coefficients.
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-    {
-        Coeff_t one(1.0);
-        Coeff_t czero(0.0);
-        chi.checkerboard = psi.checkerboard;
-        int Ls = this->Ls;
-
-        FermionField tmp(psi._grid); // scratch field; only its slice Ls-1 is written and read below
-
-        // Apply (U^{\prime})^{-dagger}
-        axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
-        for(int s=1; s<Ls; s++){
-            axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
-        }
-
-        // U_m^{-\dagger}
-        for(int s=0; s<Ls-1; s++){
-            axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
-        }
-
-        // L_m^{-\dagger} D^{-dagger}
-        for(int s=0; s<Ls-1; s++){
-            axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1);
-        }
-        axpby_ssp_pminus(tmp, czero, chi, one/conjugate(this->dee[Ls-1]), chi, Ls-1, Ls-1); // tmp[Ls-1] = 0*chi[Ls-1] + (1/conj(dee[Ls-1])) P_- chi[Ls-1]
-        axpby_ssp_pplus(chi, one, tmp, one/conjugate(this->dee[Ls]), chi, Ls-1, Ls-1); // chi[Ls-1] = tmp[Ls-1] + (1/conj(dee[Ls])) P_+ chi[Ls-1]
-
-        // Apply L^{-dagger}
-        for(int s=Ls-2; s>=0; s--){
-            axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1);  // chi[s] = chi[s] - conj(lee[s]) P_+ chi[s+1]
-        }
-    }
-    // Explicit instantiations of the four routines defined in this file, emitted only when the LINALG backend macro is defined.
-    #ifdef DOMAIN_WALL_EOFA_DPERP_LINALG
-
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
-
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
-
-    #endif // DOMAIN_WALL_EOFA_DPERP_LINALG
-
-}}
@@ -1,605 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-    /*
-    * Dense matrix versions of routines: MooeeInv and MooeeInvDag both forward to MooeeInternal (not defined in the lines shown here).
-    */
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-    {
-        this->MooeeInternal(psi, chi, DaggerYes, InverseYes); // passes dag = DaggerYes, inv = InverseYes
-    }
-    // MooeeInv: forwards to MooeeInternal with dag = DaggerNo, inv = InverseYes.
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-    {
-        this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
-    }
-    // M5D, SIMD-in-s backend: requires Ls == LLs*nsimd (asserted below); the compiled path is the hand-unrolled #else branch.
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
-        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-    {
-        GridBase* grid = psi._grid;
-        int Ls  = this->Ls;
-        int LLs = grid->_rdimensions[0];
-        const int nsimd = Simd::Nsimd();
-
-        Vector<iSinglet<Simd> > u(LLs);
-        Vector<iSinglet<Simd> > l(LLs);
-        Vector<iSinglet<Simd> > d(LLs);
-
-        assert(Ls/LLs == nsimd);
-        assert(phi.checkerboard == psi.checkerboard);
-
-        chi.checkerboard = psi.checkerboard;
-
-        // just directly address via type pun
-        typedef typename Simd::scalar_type scalar_type;
-        scalar_type* u_p = (scalar_type*) &u[0];
-        scalar_type* l_p = (scalar_type*) &l[0];
-        scalar_type* d_p = (scalar_type*) &d[0];
-        // Scatter the per-s coefficients: scalar slot o*nsimd+i (lane i of vector o under the pun above) gets the value for s = o + i*LLs.
-        for(int o=0;o<LLs;o++){ // outer
-        for(int i=0;i<nsimd;i++){ //inner
-            int s  = o + i*LLs;
-            int ss = o*nsimd + i;
-            u_p[ss] = upper[s];
-            l_p[ss] = lower[s];
-            d_p[ss] = diag[s];
-        }}
-
-        this->M5Dcalls++;
-        this->M5Dtime -= usecond();
-
-        assert(Nc == 3); // the compiled kernel below addresses colour indices 0..2 explicitly
-
-        parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
-
-            #if 0
-                // Compiled-out generic formulation built from the spProj5*/rotate/spRecon5* helpers.
-                alignas(64) SiteHalfSpinor hp;
-                alignas(64) SiteHalfSpinor hm;
-                alignas(64) SiteSpinor fp;
-                alignas(64) SiteSpinor fm;
-
-                for(int v=0; v<LLs; v++){
-
-                    int vp = (v+1)%LLs;
-                    int vm = (v+LLs-1)%LLs;
-
-                    spProj5m(hp, psi[ss+vp]);
-                    spProj5p(hm, psi[ss+vm]);
-
-                    if (vp <= v){ rotate(hp, hp, 1); }
-                    if (vm >= v){ rotate(hm, hm, nsimd-1); }
-
-                    hp = 0.5*hp;
-                    hm = 0.5*hm;
-
-                    spRecon5m(fp, hp);
-                    spRecon5p(fm, hm);
-
-                    chi[ss+v] = d[v]*phi[ss+v];
-                    chi[ss+v] = chi[ss+v] + u[v]*fp;
-                    chi[ss+v] = chi[ss+v] + l[v]*fm;
-
-                }
-
-            #else
-                // Compiled path: hand-unrolled over the spin (0,1 | 2,3) and colour (0..2) components.
-                for(int v=0; v<LLs; v++){
-
-                    vprefetch(psi[ss+v+LLs]); // same slot of the next block; NOTE(review): index is >= oSites() on the last block -- confirm this is harmless
-
-                    int vp = (v==LLs-1) ? 0     : v+1;
-                    int vm = (v==0)     ? LLs-1 : v-1;
-
-                    Simd hp_00 = psi[ss+vp]()(2)(0);
-                    Simd hp_01 = psi[ss+vp]()(2)(1);
-                    Simd hp_02 = psi[ss+vp]()(2)(2);
-                    Simd hp_10 = psi[ss+vp]()(3)(0);
-                    Simd hp_11 = psi[ss+vp]()(3)(1);
-                    Simd hp_12 = psi[ss+vp]()(3)(2);
-
-                    Simd hm_00 = psi[ss+vm]()(0)(0);
-                    Simd hm_01 = psi[ss+vm]()(0)(1);
-                    Simd hm_02 = psi[ss+vm]()(0)(2);
-                    Simd hm_10 = psi[ss+vm]()(1)(0);
-                    Simd hm_11 = psi[ss+vm]()(1)(1);
-                    Simd hm_12 = psi[ss+vm]()(1)(2);
-
-                    if(vp <= v){ // holds only when vp wrapped round to 0: the forward neighbour's lanes are rotated
-                        hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-                        hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-                        hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-                        hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-                        hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-                        hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-                    }
-
-                    if(vm >= v){ // holds only when vm wrapped round to LLs-1: the backward neighbour's lanes are rotated the other way
-                        hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-                        hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-                        hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-                        hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-                        hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-                        hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-                    }
-
-                    // Can force these to real arithmetic and save 2x.
-                    Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-                    Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-                    Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-                    Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-                    Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-                    Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-                    Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-                    Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-                    Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-                    Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-                    Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-                    Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-
-                    vstream(chi[ss+v]()(0)(0), p_00);
-                    vstream(chi[ss+v]()(0)(1), p_01);
-                    vstream(chi[ss+v]()(0)(2), p_02);
-                    vstream(chi[ss+v]()(1)(0), p_10);
-                    vstream(chi[ss+v]()(1)(1), p_11);
-                    vstream(chi[ss+v]()(1)(2), p_12);
-                    vstream(chi[ss+v]()(2)(0), p_20);
-                    vstream(chi[ss+v]()(2)(1), p_21);
-                    vstream(chi[ss+v]()(2)(2), p_22);
-                    vstream(chi[ss+v]()(3)(0), p_30);
-                    vstream(chi[ss+v]()(3)(1), p_31);
-                    vstream(chi[ss+v]()(3)(2), p_32);
-                }
-
-            #endif
-        }
-
-        this->M5Dtime += usecond();
-    }
-    /*
-     * M5Ddag for the layout in which the fifth dimension is spread over SIMD lanes
-     * (asserted below: Ls/LLs == Simd::Nsimd()).
-     *
-     * For each group of LLs consecutive outer sites starting at ss, and each v in [0,LLs),
-     * the active (#else) branch computes
-     *   spin rows 0,1 of chi[ss+v] = diag * phi[ss+v] + upper * psi[ss+vp]   (vp = v+1, wrapping to 0)
-     *   spin rows 2,3 of chi[ss+v] = diag * phi[ss+v] + lower * psi[ss+vm]   (vm = v-1, wrapping to LLs-1)
-     * When the neighbour index wraps, the neighbour's SIMD lanes are rotated before use.
-     *
-     * lower/diag/upper are indexed by s in [0,Ls); they are first repacked so that
-     * lane i of vector o holds the coefficient for s = o + i*LLs.
-     * The call count and elapsed time are accumulated in M5Dcalls / M5Dtime.
-     */
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
-        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-    {
-        GridBase* grid = psi._grid;
-        int Ls  = this->Ls;
-        int LLs = grid->_rdimensions[0];
-        int nsimd = Simd::Nsimd();
-
-        Vector<iSinglet<Simd> > u(LLs);
-        Vector<iSinglet<Simd> > l(LLs);
-        Vector<iSinglet<Simd> > d(LLs);
-
-        assert(Ls/LLs == nsimd); // the packing loop below assumes s = o + i*LLs covers [0,Ls)
-        assert(phi.checkerboard == psi.checkerboard);
-
-        chi.checkerboard = psi.checkerboard;
-
-        // just directly address via type pun
-        typedef typename Simd::scalar_type scalar_type;
-        scalar_type* u_p = (scalar_type*) &u[0];
-        scalar_type* l_p = (scalar_type*) &l[0];
-        scalar_type* d_p = (scalar_type*) &d[0];
-
-        for(int o=0; o<LLs; o++){ // outer
-        for(int i=0; i<nsimd; i++){ //inner
-            int s  = o + i*LLs;
-            int ss = o*nsimd + i;
-            u_p[ss] = upper[s];
-            l_p[ss] = lower[s];
-            d_p[ss] = diag[s];
-        }}
-
-        this->M5Dcalls++;
-        this->M5Dtime -= usecond();
-
-        parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
-
-        #if 0 // compiled out: same loop written with the spProj5/spRecon5 helpers instead of per-component arithmetic
-
-            alignas(64) SiteHalfSpinor hp;
-            alignas(64) SiteHalfSpinor hm;
-            alignas(64) SiteSpinor fp;
-            alignas(64) SiteSpinor fm;
-
-            for(int v=0; v<LLs; v++){
-
-                int vp = (v+1)%LLs;
-                int vm = (v+LLs-1)%LLs;
-
-                spProj5p(hp, psi[ss+vp]);
-                spProj5m(hm, psi[ss+vm]);
-
-                if(vp <= v){ rotate(hp, hp, 1); }
-                if(vm >= v){ rotate(hm, hm, nsimd-1); }
-
-                hp = hp*0.5;
-                hm = hm*0.5;
-                spRecon5p(fp, hp);
-                spRecon5m(fm, hm);
-
-                chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
-                chi[ss+v] = chi[ss+v]     +l[v]*fm;
-            }
-
-        #else
-
-            for(int v=0; v<LLs; v++){
-
-                vprefetch(psi[ss+v+LLs]); // NOTE(review): for the last group ss+v+LLs >= oSites(); confirm this indexing is harmless for a prefetch
-
-                int vp = (v == LLs-1) ? 0     : v+1;
-                int vm = (v == 0    ) ? LLs-1 : v-1;
-
-                Simd hp_00 = psi[ss+vp]()(0)(0);
-                Simd hp_01 = psi[ss+vp]()(0)(1);
-                Simd hp_02 = psi[ss+vp]()(0)(2);
-                Simd hp_10 = psi[ss+vp]()(1)(0);
-                Simd hp_11 = psi[ss+vp]()(1)(1);
-                Simd hp_12 = psi[ss+vp]()(1)(2);
-
-                Simd hm_00 = psi[ss+vm]()(2)(0);
-                Simd hm_01 = psi[ss+vm]()(2)(1);
-                Simd hm_02 = psi[ss+vm]()(2)(2);
-                Simd hm_10 = psi[ss+vm]()(3)(0);
-                Simd hm_11 = psi[ss+vm]()(3)(1);
-                Simd hm_12 = psi[ss+vm]()(3)(2);
-
-                if (vp <= v){ // only true when vp wrapped to 0 (v == LLs-1)
-                    hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-                    hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-                    hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-                    hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-                    hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-                    hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-                }
-
-                if(vm >= v){ // only true when vm wrapped to LLs-1 (v == 0)
-                    hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-                    hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-                    hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-                    hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-                    hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-                    hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-                }
-
-                Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-                Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-                Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-                Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-                Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-                Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-                Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-                Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-                Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-                Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-                Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-                Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-
-                vstream(chi[ss+v]()(0)(0), p_00);
-                vstream(chi[ss+v]()(0)(1), p_01);
-                vstream(chi[ss+v]()(0)(2), p_02);
-                vstream(chi[ss+v]()(1)(0), p_10);
-                vstream(chi[ss+v]()(1)(1), p_11);
-                vstream(chi[ss+v]()(1)(2), p_12);
-                vstream(chi[ss+v]()(2)(0), p_20);
-                vstream(chi[ss+v]()(2)(1), p_21);
-                vstream(chi[ss+v]()(2)(2), p_22);
-                vstream(chi[ss+v]()(3)(0), p_30);
-                vstream(chi[ss+v]()(3)(1), p_31);
-                vstream(chi[ss+v]()(3)(2), p_32);
-            }
-        #endif
-
-        }
-
-        this->M5Dtime += usecond();
-    }
-
-    #ifdef AVX512
-        #include<simd/Intel512common.h>
-        #include<simd/Intel512avx.h>
-        #include<simd/Intel512single.h>
-    #endif
-    /*
-     * Applies the packed matrices Matp / Matm along the fifth dimension for one
-     * block of LLs outer sites (the block selected by `site`):
-     *   spin rows 0,1 of chi are built from spin rows 0,1 of psi using Matp,
-     *   spin rows 2,3 of chi are built from spin rows 2,3 of psi using Matm.
-     * For every output index s1 the result is a sum over all (s2, lane l) pairs of
-     * Mat[LLs*(s2 + l*LLs) + s1] times lane l of psi[s2 + LLs*site] broadcast to all lanes.
-     *
-     * Two implementations are selected at compile time:
-     *   - without AVX512: portable code using vbroadcast / real_madd / vstream;
-     *   - with AVX512: inline assembly that keeps the 12 accumulators in zmm1-zmm12
-     *     and the broadcast values in zmm13-zmm24 (see the #defines below).
-     * NOTE(review): the asm statements declare no register clobbers although the
-     * accumulators must survive from one asm statement to the next; confirm this is
-     * safe with the compilers in use.
-     */
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi, FermionField& chi,
-        int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-    {
-        #ifndef AVX512
-        {
-            SiteHalfSpinor BcastP;
-            SiteHalfSpinor BcastM;
-            SiteHalfSpinor SiteChiP;
-            SiteHalfSpinor SiteChiM;
-
-            // Ls*Ls * 2 * 12 * vol flops
-            for(int s1=0; s1<LLs; s1++){
-
-                for(int s2=0; s2<LLs; s2++){
-                for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
-
-                    int s = s2 + l*LLs; // fifth-dimension index carried by lane l of vector s2
-                    int lex = s2 + LLs*site;
-
-                    if( s2==0 && l==0 ){ // first term for this s1: reset both accumulators
-                        SiteChiP=zero;
-                        SiteChiM=zero;
-                    }
-
-                    for(int sp=0; sp<2;  sp++){
-                    for(int co=0; co<Nc; co++){
-                        vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
-                    }}
-
-                    for(int sp=0; sp<2;  sp++){
-                    for(int co=0; co<Nc; co++){
-                        vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
-                    }}
-
-                    for(int sp=0; sp<2;  sp++){
-                    for(int co=0; co<Nc; co++){
-                        SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
-                        SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
-                    }}
-                }}
-
-                {
-                    int lex = s1 + LLs*site;
-                    for(int sp=0; sp<2;  sp++){
-                    for(int co=0; co<Nc; co++){
-                        vstream(chi[lex]()(sp)(co),   SiteChiP()(sp)(co));
-                        vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
-                    }}
-                }
-            }
-
-        }
-        #else
-        {
-            // pointers
-            //  MASK_REGS;
-            #define Chi_00 %%zmm1
-            #define Chi_01 %%zmm2
-            #define Chi_02 %%zmm3
-            #define Chi_10 %%zmm4
-            #define Chi_11 %%zmm5
-            #define Chi_12 %%zmm6
-            #define Chi_20 %%zmm7
-            #define Chi_21 %%zmm8
-            #define Chi_22 %%zmm9
-            #define Chi_30 %%zmm10
-            #define Chi_31 %%zmm11
-            #define Chi_32 %%zmm12
-
-            #define BCAST0  %%zmm13
-            #define BCAST1  %%zmm14
-            #define BCAST2  %%zmm15
-            #define BCAST3  %%zmm16
-            #define BCAST4  %%zmm17
-            #define BCAST5  %%zmm18
-            #define BCAST6  %%zmm19
-            #define BCAST7  %%zmm20
-            #define BCAST8  %%zmm21
-            #define BCAST9  %%zmm22
-            #define BCAST10 %%zmm23
-            #define BCAST11 %%zmm24
-
-            int incr = LLs*LLs*sizeof(iSinglet<Simd>); // byte distance between the matrix entries used for consecutive lanes (index grows by LLs*LLs)
-            for(int s1=0; s1<LLs; s1++){
-
-                for(int s2=0; s2<LLs; s2++){
-
-                    int lex = s2 + LLs*site;
-                    uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
-                    uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
-                    uint64_t a2 = (uint64_t) &psi[lex];
-
-                    for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
-                        if((s2+l)==0) { // first term for this s1: multiply (initialises the accumulators) instead of multiply-add
-                            asm(
-                                    VPREFETCH1(0,%2)              VPREFETCH1(0,%1)
-                                    VPREFETCH1(12,%2)  	          VPREFETCH1(13,%2)
-                                    VPREFETCH1(14,%2)  	          VPREFETCH1(15,%2)
-                                    VBCASTCDUP(0,%2,BCAST0)
-                                    VBCASTCDUP(1,%2,BCAST1)
-                                    VBCASTCDUP(2,%2,BCAST2)
-                                    VBCASTCDUP(3,%2,BCAST3)
-                                    VBCASTCDUP(4,%2,BCAST4)       VMULMEM(0,%0,BCAST0,Chi_00)
-                                    VBCASTCDUP(5,%2,BCAST5)       VMULMEM(0,%0,BCAST1,Chi_01)
-                                    VBCASTCDUP(6,%2,BCAST6)       VMULMEM(0,%0,BCAST2,Chi_02)
-                                    VBCASTCDUP(7,%2,BCAST7)       VMULMEM(0,%0,BCAST3,Chi_10)
-                                    VBCASTCDUP(8,%2,BCAST8)       VMULMEM(0,%0,BCAST4,Chi_11)
-                                    VBCASTCDUP(9,%2,BCAST9)       VMULMEM(0,%0,BCAST5,Chi_12)
-                                    VBCASTCDUP(10,%2,BCAST10)     VMULMEM(0,%1,BCAST6,Chi_20)
-                                    VBCASTCDUP(11,%2,BCAST11)     VMULMEM(0,%1,BCAST7,Chi_21)
-                                    VMULMEM(0,%1,BCAST8,Chi_22)
-                                    VMULMEM(0,%1,BCAST9,Chi_30)
-                                    VMULMEM(0,%1,BCAST10,Chi_31)
-                                    VMULMEM(0,%1,BCAST11,Chi_32)
-                                    : : "r" (a0), "r" (a1), "r" (a2)                            );
-                        } else {
-                            asm(
-                                    VBCASTCDUP(0,%2,BCAST0)   VMADDMEM(0,%0,BCAST0,Chi_00)
-                                    VBCASTCDUP(1,%2,BCAST1)   VMADDMEM(0,%0,BCAST1,Chi_01)
-                                    VBCASTCDUP(2,%2,BCAST2)   VMADDMEM(0,%0,BCAST2,Chi_02)
-                                    VBCASTCDUP(3,%2,BCAST3)   VMADDMEM(0,%0,BCAST3,Chi_10)
-                                    VBCASTCDUP(4,%2,BCAST4)   VMADDMEM(0,%0,BCAST4,Chi_11)
-                                    VBCASTCDUP(5,%2,BCAST5)   VMADDMEM(0,%0,BCAST5,Chi_12)
-                                    VBCASTCDUP(6,%2,BCAST6)   VMADDMEM(0,%1,BCAST6,Chi_20)
-                                    VBCASTCDUP(7,%2,BCAST7)   VMADDMEM(0,%1,BCAST7,Chi_21)
-                                    VBCASTCDUP(8,%2,BCAST8)   VMADDMEM(0,%1,BCAST8,Chi_22)
-                                    VBCASTCDUP(9,%2,BCAST9)   VMADDMEM(0,%1,BCAST9,Chi_30)
-                                    VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
-                                    VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
-                                    : : "r" (a0), "r" (a1), "r" (a2)                            );
-                        }
-                        a0 = a0 + incr;
-                        a1 = a1 + incr;
-                        a2 = a2 + sizeof(typename Simd::scalar_type); // advance the psi address by one scalar for the next lane
-                    }
-                }
-
-                {
-                  int lexa = s1+LLs*site;
-                  asm (
-                     VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)
-                     VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)
-                     VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)
-                     VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)
-                     : : "r" ((uint64_t)&chi[lexa]) : "memory" );
-
-                }
-            }
-        }
-
-        #undef Chi_00
-        #undef Chi_01
-        #undef Chi_02
-        #undef Chi_10
-        #undef Chi_11
-        #undef Chi_12
-        #undef Chi_20
-        #undef Chi_21
-        #undef Chi_22
-        #undef Chi_30
-        #undef Chi_31
-        #undef Chi_32
-
-        #undef BCAST0
-        #undef BCAST1
-        #undef BCAST2
-        #undef BCAST3
-        #undef BCAST4
-        #undef BCAST5
-        #undef BCAST6
-        #undef BCAST7
-        #undef BCAST8
-        #undef BCAST9
-        #undef BCAST10
-        #undef BCAST11
-        #endif
-    };
-
-    // Z-mobius (complex coefficient) counterpart of MooeeInternalAsm.
-    // Not supported for EOFA: prints an error on stdout and terminates the
-    // process with exit status -1. None of the arguments are used.
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
-        int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-    {
-        std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
-        exit(-1);
-    }
-
-    // Applies Mooee, or its inverse / adjoint as selected by (dag, inv), to psi
-    // and stores the result in chi, one block of LLs outer sites at a time.
-    //   inv != 0 : use the cached matrices (MatpInv/MatmInv, or the InvDag pair when dag != 0)
-    //   inv == 0 : build the matrices on the fly with MooeeInternalCompute
-    // Call count and elapsed time are accumulated in MooeeInvCalls / MooeeInvTime.
-    template<class Impl>
-    void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
-    {
-        int Ls  = this->Ls;
-        int LLs = psi._grid->_rdimensions[0];
-        int vol = psi._grid->oSites()/LLs;
-
-        chi.checkerboard = psi.checkerboard;
-
-        // Scratch matrices; only filled on the inv == 0 path.
-        Vector<iSinglet<Simd> > Matp;
-        Vector<iSinglet<Simd> > Matm;
-
-        // Matrices actually applied below.
-        Vector<iSinglet<Simd> > *pMatp;
-        Vector<iSinglet<Simd> > *pMatm;
-
-        if(!inv){
-            MooeeInternalCompute(dag, inv, Matp, Matm);
-            pMatp = &Matp;
-            pMatm = &Matm;
-        } else if(dag){
-            pMatp = &this->MatpInvDag;
-            pMatm = &this->MatmInvDag;
-        } else {
-            pMatp = &this->MatpInv;
-            pMatm = &this->MatmInv;
-        }
-
-        assert(pMatp->size() == Ls*LLs);
-
-        this->MooeeInvCalls++;
-        this->MooeeInvTime -= usecond();
-
-        if(switcheroo<Coeff_t>::iscomplex()){
-            parallel_for(auto site=0; site<vol; site++){
-                MooeeInternalZAsm(psi, chi, LLs, site, *pMatp, *pMatm);
-            }
-        } else {
-            parallel_for(auto site=0; site<vol; site++){
-                MooeeInternalAsm(psi, chi, LLs, site, *pMatp, *pMatm);
-            }
-        }
-
-        this->MooeeInvTime += usecond();
-    }
-    /*
-     * Explicit instantiations, compiled only when DOMAIN_WALL_EOFA_DPERP_VEC is defined.
-     * INSTANTIATE_DPERP_DWF_EOFA is defined outside this file section; MooeeInternal is
-     * instantiated by hand below for the same eight implementation types.
-     */
-    #ifdef DOMAIN_WALL_EOFA_DPERP_VEC
-
-        INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplD);
-        INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplF);
-        INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplD);
-        INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplF);
-
-        INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplDF);
-        INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplFH);
-        INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplDF);
-        INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplFH);
-
-        template void DomainWallEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-        template void DomainWallEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-        template void DomainWallEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-    #endif
-
-}}
@@ -1,502 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/Grid_Eigen_Dense.h>
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-  // Constructor. All arguments are forwarded to AbstractEOFAFermion; afterwards the
-  // tanh coefficients are set from Approx::higham data and the shift-operator
-  // coefficient vectors are either computed (_shift != 0) or zero-filled.
-  template<class Impl>
-    MobiusEOFAFermion<Impl>::MobiusEOFAFermion(
-      GaugeField            &_Umu,
-      GridCartesian         &FiveDimGrid,
-      GridRedBlackCartesian &FiveDimRedBlackGrid,
-      GridCartesian         &FourDimGrid,
-      GridRedBlackCartesian &FourDimRedBlackGrid,
-      RealD _mq1, RealD _mq2, RealD _mq3,
-      RealD _shift, int _pm, RealD _M5,
-      RealD _b, RealD _c, const ImplParams &p) :
-    AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
-        FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
-        _shift, _pm, _M5, _b, _c, p)
-    {
-      int Ls = this->Ls;
-
-      // Approximation data sized for Ls; released right after the coefficients are set.
-      RealD eps = 1.0;
-      Approx::zolotarev_data *zdata = Approx::higham(eps, this->Ls);
-      assert(zdata->n == this->Ls);
-
-      std::cout << GridLogMessage << "MobiusEOFAFermion (b=" << _b <<
-        ",c=" << _c << ") with Ls=" << Ls << std::endl;
-
-      this->SetCoefficientsTanh(zdata, _b, _c);
-
-      std::cout << GridLogMessage << "EOFA parameters: (mq1=" << _mq1 <<
-        ",mq2=" << _mq2 << ",mq3=" << _mq3 << ",shift=" << _shift <<
-        ",pm=" << _pm << ")" << std::endl;
-
-      Approx::zolotarev_free(zdata);
-
-      if(_shift == 0.0){
-        // No shift term: the (freshly constructed, empty) vectors become Ls zeros.
-        Mooee_shift.resize(Ls, 0.0);
-        MooeeInv_shift_lc.resize(Ls, 0.0);
-        MooeeInv_shift_norm.resize(Ls, 0.0);
-        MooeeInvDag_shift_lc.resize(Ls, 0.0);
-        MooeeInvDag_shift_norm.resize(Ls, 0.0);
-      } else {
-        SetCoefficientsPrecondShiftOps();
-      }
-    }
-
-    /****************************************************************
-     * Additional EOFA operators only called outside the inverter.  
-     * Since speed is not essential, simple axpby-style
-     * implementations should be fine.
-     ***************************************************************/
-    // Omega(psi, Din, sign, dag): fills Din from psi with weights that are powers of
-    // (1-alpha)/(1+alpha). Assuming axpby_ssp(z,a,x,b,y,s,sp) sets z[s] = a*x[s] + b*y[sp]:
-    //   dag == 0 : slice s of Din is written from slice 0 of psi, for every s;
-    //   dag == 1 : slice 0 of Din accumulates a weighted sum over all slices sp of psi.
-    // sign selects the weight sequence (+1 or -1). Any other (sign, dag) combination
-    // leaves Din equal to zero.
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
-    {
-      const int   Ls    = this->Ls;
-      const RealD alpha = this->alpha;
-
-      Din = zero;
-
-      if(dag == 0){
-        if(sign == 1){            // \Omega_{+}
-          for(int s=0; s<Ls; ++s){
-            const auto w = 2.0*std::pow(1.0-alpha,Ls-s-1)/std::pow(1.0+alpha,Ls-s);
-            axpby_ssp(Din, 0.0, psi, w, psi, s, 0);
-          }
-        } else if(sign == -1){    // \Omega_{-}
-          for(int s=0; s<Ls; ++s){
-            const auto w = 2.0*std::pow(1.0-alpha,s)/std::pow(1.0+alpha,s+1);
-            axpby_ssp(Din, 0.0, psi, w, psi, s, 0);
-          }
-        }
-      } else if(dag == 1){
-        if(sign == 1){            // \Omega_{+}^{\dagger}
-          for(int sp=0; sp<Ls; ++sp){
-            const auto w = 2.0*std::pow(1.0-alpha,Ls-sp-1)/std::pow(1.0+alpha,Ls-sp);
-            axpby_ssp(Din, 1.0, Din, w, psi, 0, sp);
-          }
-        } else if(sign == -1){    // \Omega_{-}^{\dagger}
-          for(int sp=0; sp<Ls; ++sp){
-            const auto w = 2.0*std::pow(1.0-alpha,sp)/std::pow(1.0+alpha,sp+1);
-            axpby_ssp(Din, 1.0, Din, w, psi, 0, sp);
-          }
-        }
-      }
-    }
-
-    // Operator relating the usual Ddwf to TWQCD's EOFA Dirac operator (arXiv:1706.05843, Eqn. 6);
-    // it also connects the preconditioned and unpreconditioned systems of Appendix B.2.
-    // Per slice s, chi[s] gets b*psi[s] plus a P_- term from the slice above and a P_+ term
-    // from the slice below, both weighted by -c; the two wrap-around terms use mq1*c instead.
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi)
-    {
-      const int   Ls  = this->Ls;
-      const RealD b   = 0.5 * ( 1.0 + this->alpha );
-      const RealD c   = 0.5 * ( 1.0 - this->alpha );
-      const RealD mq1 = this->mq1;
-
-      for(int s=0; s<Ls; ++s){
-        // s == 0 takes precedence over s == Ls-1, exactly as in an if / else-if chain.
-        const bool first = (s == 0);
-        const bool last  = (!first) && (s == (Ls-1));
-
-        // neighbour above (P_-): wraps to slice 0 on the last slice
-        const int   s_up = last ? 0     : s+1;
-        const RealD w_up = last ? mq1*c : -c;
-
-        // neighbour below (P_+): wraps to slice Ls-1 on the first slice
-        const int   s_dn = first ? Ls-1  : s-1;
-        const RealD w_dn = first ? mq1*c : -c;
-
-        axpby_ssp_pminus(chi, b,   psi, w_up, psi, s, s_up);
-        axpby_ssp_pplus (chi, 1.0, chi, w_dn, psi, s, s_dn);
-      }
-    }
-
-    // Applies the inverse of Dtilde:
-    //   chi[s] = sum_{sp} ( DtInv_p(s,sp) P_+ + DtInv_m(s,sp) P_- ) psi[sp]
-    // with the closed-form coefficients computed in the loop below.
-    //
-    // psi : input field (not modified); must not alias chi, because chi[s] is
-    //       overwritten while other slices of psi are still being read.
-    // chi : output field; every slice s in [0,Ls) is overwritten.
-    //
-    // Relies on axpby_ssp_pplus/pminus(z,a,x,b,y,s,sp) computing
-    // z[s] = a*x[s] + b*P_{+/-} y[sp], the same convention Dtilde uses.
-    //
-    // Fixes relative to the earlier version:
-    //   * the result used to be accumulated in a local temporary that was never
-    //     copied to chi, so the call had no effect;
-    //   * for sp == 0 the P_- term was written with a = 0, discarding the P_+
-    //     term stored just before it;
-    //   * the first write read the uninitialised temporary (0 * garbage); it now
-    //     uses psi as the (zero-weighted) x operand, as Omega does.
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi)
-    {
-      int Ls = this->Ls;
-      RealD m = this->mq1;
-      RealD c = 0.5 * this->alpha;
-      RealD d = 0.5;
-
-      RealD DtInv_p(0.0), DtInv_m(0.0);
-      RealD N = std::pow(c+d,Ls) + m*std::pow(c-d,Ls);
-
-      for(int s=0; s<Ls; ++s){
-      for(int sp=0; sp<Ls; ++sp){
-
-        DtInv_p = m * std::pow(-1.0,s-sp+1) * std::pow(c-d,Ls+s-sp) / std::pow(c+d,s-sp+1) / N;
-        DtInv_p += (s < sp) ? 0.0 : std::pow(-1.0,s-sp) * std::pow(c-d,s-sp) / std::pow(c+d,s-sp+1);
-        DtInv_m = m * std::pow(-1.0,sp-s+1) * std::pow(c-d,Ls+sp-s) / std::pow(c+d,sp-s+1) / N;
-        DtInv_m += (s > sp) ? 0.0 : std::pow(-1.0,sp-s) * std::pow(c-d,sp-s) / std::pow(c+d,sp-s+1);
-
-        if(sp == 0){
-          // First term of row s: start chi[s] from the P_+ part, then add the P_- part.
-          axpby_ssp_pplus (chi, 0.0, psi, DtInv_p, psi, s, sp);
-          axpby_ssp_pminus(chi, 1.0, chi, DtInv_m, psi, s, sp);
-        } else {
-          axpby_ssp_pplus (chi, 1.0, chi, DtInv_p, psi, s, sp);
-          axpby_ssp_pminus(chi, 1.0, chi, DtInv_m, psi, s, sp);
-        }
-
-      }}
-    }
-
-    /*****************************************************************************************************/
-
-    // Full (unpreconditioned) operator: chi = M psi. Returns norm2(chi).
-    // Steps: Meooe5D on psi, DW on that result, add psi, then the M5D update of chi.
-    template<class Impl>
-    RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
-    {
-      FermionField Din(psi._grid);   // scratch field holding Meooe5D(psi)
-
-      this->Meooe5D(psi, Din);
-      this->DW(Din, chi, DaggerNo);
-      axpby(chi, 1.0, 1.0, chi, psi);
-      this->M5D(psi, chi);
-
-      return norm2(chi);
-    }
-
-    // Adjoint of the full operator: chi = M^dag psi. Returns norm2(chi).
-    // Steps: DW (daggered) on psi, MeooeDag5D on that result, the M5Ddag update of chi, add psi.
-    template<class Impl>
-    RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
-    {
-      FermionField Din(psi._grid);   // scratch field holding DW^dag(psi)
-
-      this->DW(psi, Din, DaggerYes);
-      this->MeooeDag5D(Din, chi);
-      this->M5Ddag(psi, chi);
-      axpby(chi, 1.0, 1.0, chi, psi);
-
-      return norm2(chi);
-    }
-
-    /********************************************************************
-     * Performance critical fermion operators called inside the inverter
-     ********************************************************************/
-
-    // Two-argument M5D: sets up the fifth-dimension hopping coefficients
-    // (diag = 1, off-diagonals = -1, wrap-around entries = mq1) and forwards to
-    // the coefficient-taking overload with chi as both the phi input and the output.
-    // When shift is non-zero the fused M5D_shift variant is used instead.
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
-    {
-      const int Ls = this->Ls;
-
-      std::vector<Coeff_t> diag (Ls,  1.0);
-      std::vector<Coeff_t> upper(Ls, -1.0);
-      std::vector<Coeff_t> lower(Ls, -1.0);
-      upper[Ls-1] = this->mq1;
-      lower[0]    = this->mq1;
-
-      if(this->shift != 0.0){
-        // fused M + shift operation
-        this->M5D_shift(psi, chi, chi, lower, diag, upper, Mooee_shift);
-        return;
-      }
-
-      // no shift term
-      this->M5D(psi, chi, chi, lower, diag, upper);
-    }
-
-    // Two-argument M5Ddag: same coefficient set-up as the two-argument M5D
-    // (diag = 1, off-diagonals = -1, wrap-around entries = mq1), forwarded to the
-    // coefficient-taking M5Ddag overload with chi as both the phi input and the output.
-    // When shift is non-zero the fused M5Ddag_shift variant is used instead.
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
-    {
-      const int Ls = this->Ls;
-
-      std::vector<Coeff_t> diag (Ls,  1.0);
-      std::vector<Coeff_t> upper(Ls, -1.0);
-      std::vector<Coeff_t> lower(Ls, -1.0);
-      upper[Ls-1] = this->mq1;
-      lower[0]    = this->mq1;
-
-      if(this->shift != 0.0){
-        // fused M + shift operation
-        this->M5Ddag_shift(psi, chi, chi, lower, diag, upper, Mooee_shift);
-        return;
-      }
-
-      // no shift term
-      this->M5Ddag(psi, chi, chi, lower, diag, upper);
-    }
-
-    // half checkerboard operations
-    // chi = Mooee psi. The coefficients are bee on the diagonal and -cee on both
-    // off-diagonals, with the two wrap-around entries multiplied by -mq1.
-    // When shift is non-zero the fused M5D_shift variant adds the shift term.
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
-    {
-      const int Ls = this->Ls;
-
-      std::vector<Coeff_t> diag = this->bee;
-      std::vector<Coeff_t> lower(Ls);
-      std::vector<Coeff_t> upper(Ls);
-
-      for(int s=0; s<Ls; s++){
-        lower[s] = -this->cee[s];
-        upper[s] = -this->cee[s];
-      }
-
-      // wrap-around entries
-      upper[Ls-1] *= -this->mq1;
-      lower[0]    *= -this->mq1;
-
-      if(this->shift != 0.0){
-        // fused M + shift operation
-        this->M5D_shift(psi, psi, chi, lower, diag, upper, Mooee_shift);
-      } else {
-        // no shift term
-        this->M5D(psi, psi, chi, lower, diag, upper);
-      }
-    }
-
-    // chi = Mooee^dag psi. Compared with Mooee the off-diagonal coefficients are
-    // taken from the neighbouring slice: upper[s] = -cee[s+1], lower[s] = -cee[s-1],
-    // with wrap-around entries lower[0] = mq1*cee[Ls-1] and upper[Ls-1] = mq1*cee[0].
-    // When shift is non-zero the fused M5Ddag_shift variant adds the shift term.
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
-    {
-      const int Ls = this->Ls;
-
-      std::vector<Coeff_t> diag = this->bee;
-      std::vector<Coeff_t> upper(Ls);
-      std::vector<Coeff_t> lower(Ls);
-
-      for(int s=0; s<Ls; s++){
-        // s == 0 takes precedence over s == Ls-1, exactly as in an if / else-if chain.
-        const bool first = (s == 0);
-        const bool last  = (!first) && (s == (Ls-1));
-
-        if(last){ upper[s] = this->mq1*this->cee[0]; }
-        else    { upper[s] = -this->cee[s+1]; }
-
-        if(first){ lower[s] = this->mq1*this->cee[Ls-1]; }
-        else     { lower[s] = -this->cee[s-1]; }
-      }
-
-      if(this->shift != 0.0){
-        // fused M + shift operation
-        this->M5Ddag_shift(psi, psi, chi, lower, diag, upper, Mooee_shift);
-      } else {
-        // no shift term
-        this->M5Ddag(psi, psi, chi, lower, diag, upper);
-      }
-    }
-
-    /****************************************************************************************/
-    /*
-     * Fills the five member vectors Mooee_shift, MooeeInv_shift_lc, MooeeInv_shift_norm,
-     * MooeeInvDag_shift_lc and MooeeInvDag_shift_norm (each resized to Ls) from
-     * this->Ls, pm, alpha, k, mq1, shift, bee and cee. Takes no arguments and returns nothing.
-     */
-    // Computes coefficients for applying Cayley preconditioned shift operators
-    //  (Mooee + \Delta) --> Mooee_shift
-    //  (Mooee + \Delta)^{-1} --> MooeeInv_shift_lc, MooeeInv_shift_norm
-    //  (Mooee + \Delta)^{-dag} --> MooeeInvDag_shift_lc, MooeeInvDag_shift_norm
-    // For the latter two cases, the operation takes the form
-    //  [ (Mooee + \Delta)^{-1} \psi ]_{i} = Mooee_{ij} \psi_{j} +
-    //      ( MooeeInv_shift_norm )_{i} ( \sum_{j} [ MooeeInv_shift_lc ]_{j} P_{pm} \psi_{j} )
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
-    {
-      int   Ls    = this->Ls;
-      int   pm    = this->pm;
-      RealD alpha = this->alpha;
-      RealD k     = this->k;
-      RealD mq1   = this->mq1;
-      RealD shift = this->shift;
-
-      // Initialize
-      Mooee_shift.resize(Ls);
-      MooeeInv_shift_lc.resize(Ls);
-      MooeeInv_shift_norm.resize(Ls);
-      MooeeInvDag_shift_lc.resize(Ls);
-      MooeeInvDag_shift_norm.resize(Ls);
-
-      // Construct Mooee_shift
-      int idx(0);
-      Coeff_t N = ( (pm == 1) ? 1.0 : -1.0 ) * (2.0*shift*k) *
-                  ( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) ); // N is reassigned further down as a different normalisation
-      for(int s=0; s<Ls; ++s){
-        idx = (pm == 1) ? (s) : (Ls-1-s); // pm != 1 fills the vector in reverse order
-        Mooee_shift[idx] = N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1);
-      }
-
-      // Tridiagonal solve for MooeeInvDag_shift_lc
-      {
-        Coeff_t m(0.0);
-        std::vector<Coeff_t> d = Mooee_shift;
-        std::vector<Coeff_t> u(Ls,0.0);
-        std::vector<Coeff_t> y(Ls,0.0);
-        std::vector<Coeff_t> q(Ls,0.0);
-        if(pm == 1){ u[0] = 1.0; }
-        else{ u[Ls-1] = 1.0; }
-
-        // Tridiagonal matrix algorithm + Sherman-Morrison formula
-        //
-        // We solve
-        //  ( Mooee' + u \otimes v ) MooeeInvDag_shift_lc = Mooee_shift
-        // where Mooee' is the tridiagonal part of Mooee_{+}, and
-        // u = (1,0,...,0) and v = (0,...,0,mq1*cee[0]) are chosen
-        // so that the outer-product u \otimes v gives the (0,Ls-1)
-        // entry of Mooee_{+}.
-        //
-        // We do this as two solves: Mooee'*y = d and Mooee'*q = u,
-        // and then construct the solution to the original system
-        //  MooeeInvDag_shift_lc = y - <v,y> / ( 1 + <v,q> ) q
-        if(pm == 1){ // forward elimination; only performed for pm == 1
-          for(int s=1; s<Ls; ++s){
-            m = -this->cee[s] / this->bee[s-1];
-            d[s] -= m*d[s-1];
-            u[s] -= m*u[s-1];
-          }
-        }
-        y[Ls-1] = d[Ls-1] / this->bee[Ls-1];
-        q[Ls-1] = u[Ls-1] / this->bee[Ls-1];
-        for(int s=Ls-2; s>=0; --s){
-          if(pm == 1){
-            y[s] = d[s] / this->bee[s];
-            q[s] = u[s] / this->bee[s];
-          } else { // back substitution coupling slice s to slice s+1
-            y[s] = ( d[s] + this->cee[s]*y[s+1] ) / this->bee[s];
-            q[s] = ( u[s] + this->cee[s]*q[s+1] ) / this->bee[s];
-          }
-        }
-
-        // Construct MooeeInvDag_shift_lc
-        for(int s=0; s<Ls; ++s){
-          if(pm == 1){
-            MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[0]*y[Ls-1] /
-              (1.0+mq1*this->cee[0]*q[Ls-1]) * q[s];
-          } else {
-            MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[Ls-1]*y[0] /
-              (1.0+mq1*this->cee[Ls-1]*q[0]) * q[s];
-          }
-        }
-
-        // Compute remaining coefficients
-        N = (pm == 1) ? (1.0 + MooeeInvDag_shift_lc[Ls-1]) : (1.0 + MooeeInvDag_shift_lc[0]);
-        for(int s=0; s<Ls; ++s){
-
-          // MooeeInv_shift_lc  -- NOTE(review): powers of bee[s]/cee[s] at a single s are used; presumably bee and cee are s-independent here, confirm
-          if(pm == 1){ MooeeInv_shift_lc[s] = std::pow(this->bee[s],s) * std::pow(this->cee[s],Ls-1-s); }
-          else{ MooeeInv_shift_lc[s] = std::pow(this->bee[s],Ls-1-s) * std::pow(this->cee[s],s); }
-
-          // MooeeInv_shift_norm
-          MooeeInv_shift_norm[s] = -MooeeInvDag_shift_lc[s] /
-            ( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N;
-
-          // MooeeInvDag_shift_norm
-          if(pm == 1){ MooeeInvDag_shift_norm[s] = -std::pow(this->bee[s],s) * std::pow(this->cee[s],Ls-1-s) /
-            ( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N; }
-          else{ MooeeInvDag_shift_norm[s] = -std::pow(this->bee[s],Ls-1-s) * std::pow(this->cee[s],s) /
-            ( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N; }
-        }
-      }
-    }
-
-    // Recompute coefficients for a different value of shift constant.
-    //
-    // new_shift : the new shift; stored in this->shift.
-    //   new_shift != 0 : all shift coefficient vectors are recomputed by
-    //                    SetCoefficientsPrecondShiftOps().
-    //   new_shift == 0 : all shift coefficient vectors are reset to Ls zeros.
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
-    {
-      this->shift = new_shift;
-      if(new_shift != 0.0){
-        SetCoefficientsPrecondShiftOps();
-      } else {
-        int Ls = this->Ls;
-        // assign(), not resize(): resize(n, value) only initialises elements it
-        // appends, so vectors that already hold Ls entries from a previous
-        // non-zero shift would keep their stale coefficients.
-        Mooee_shift.assign(Ls,0.0);
-        MooeeInv_shift_lc.assign(Ls,0.0);
-        MooeeInv_shift_norm.assign(Ls,0.0);
-        MooeeInvDag_shift_lc.assign(Ls,0.0);
-        MooeeInvDag_shift_norm.assign(Ls,0.0);
-      }
-    }
-    /*
-     * Builds the Ls x Ls matrices Pplus / Pminus (diagonal bee, one off-diagonal of -cee,
-     * one corner entry mq1*cee, plus a shift-dependent column when shift != 0),
-     * optionally inverts them (inv) and takes the adjoint (dag), and packs the result
-     * into Matp / Matm, each resized to Ls*LLs SIMD vectors:
-     *   lane l of Mat[LLs*s2 + s1] holds matrix element (l*LLs + s1, s2).
-     * For real Coeff_t only the real part of each element is kept and it is written to
-     * both components of the complex scalar.
-     * Returns immediately, leaving Matp / Matm untouched, when LLs == Ls.
-     */
-    template<class Impl>
-    void MobiusEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
-      Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-    {
-      int Ls = this->Ls;
-
-      GridBase* grid = this->FermionRedBlackGrid();
-      int LLs = grid->_rdimensions[0];
-
-      if(LLs == Ls){ return; } // Not vectorised in 5th direction
-
-      Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
-      Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
-
-      for(int s=0; s<Ls; s++){
-        Pplus(s,s)  = this->bee[s];
-        Pminus(s,s) = this->bee[s];
-      }
-
-      for(int s=0; s<Ls-1; s++){
-        Pminus(s,s+1) = -this->cee[s];
-        Pplus(s+1,s) = -this->cee[s+1];
-      }
-
-      Pplus (0,Ls-1) = this->mq1*this->cee[0];
-      Pminus(Ls-1,0) = this->mq1*this->cee[Ls-1];
-
-      if(this->shift != 0.0){ // shift term: added to the last column of Pplus (pm == 1) or the first column of Pminus
-        RealD c = 0.5 * this->alpha;
-        RealD d = 0.5;
-        RealD N = this->shift * this->k * ( std::pow(c+d,Ls) + this->mq1*std::pow(c-d,Ls) );
-        if(this->pm == 1) {
-          for(int s=0; s<Ls; ++s){
-            Pplus(s,Ls-1) += N * std::pow(-1.0,s) * std::pow(c-d,s) / std::pow(c+d,Ls+s+1);
-          }
-        } else {
-          for(int s=0; s<Ls; ++s){
-            Pminus(s,0) += N * std::pow(-1.0,s+1) * std::pow(c-d,Ls-1-s) / std::pow(c+d,2*Ls-s);
-          }
-        }
-      }
-
-      Eigen::MatrixXcd PplusMat ;
-      Eigen::MatrixXcd PminusMat;
-
-      if(inv) {
-        PplusMat  = Pplus.inverse();
-        PminusMat = Pminus.inverse();
-      } else {
-        PplusMat  = Pplus;
-        PminusMat = Pminus;
-      }
-
-      if(dag){
-        PplusMat.adjointInPlace();
-        PminusMat.adjointInPlace();
-      }
-
-      typedef typename SiteHalfSpinor::scalar_type scalar_type;
-      const int Nsimd = Simd::Nsimd();
-      Matp.resize(Ls*LLs);
-      Matm.resize(Ls*LLs);
-
-      for(int s2=0; s2<Ls; s2++){
-      for(int s1=0; s1<LLs; s1++){
-        int istride = LLs;
-        int ostride = 1;
-        Simd Vp;
-        Simd Vm;
-        scalar_type *sp = (scalar_type*) &Vp; // lanes of Vp / Vm are filled through scalar pointers
-        scalar_type *sm = (scalar_type*) &Vm;
-        for(int l=0; l<Nsimd; l++){
-          if(switcheroo<Coeff_t>::iscomplex()) {
-            sp[l] = PplusMat (l*istride+s1*ostride,s2);
-            sm[l] = PminusMat(l*istride+s1*ostride,s2);
-          } else {
-            // if real
-            scalar_type tmp;
-            tmp = PplusMat (l*istride+s1*ostride,s2);
-            sp[l] = scalar_type(tmp.real(),tmp.real()); // real part duplicated into both components
-            tmp = PminusMat(l*istride+s1*ostride,s2);
-            sm[l] = scalar_type(tmp.real(),tmp.real());
-          }
-        }
-        Matp[LLs*s2+s1] = Vp;
-        Matm[LLs*s2+s1] = Vm;
-      }}
-  }
-  /* Instantiation macros, defined outside this file; presumably they emit the explicit instantiations of MobiusEOFAFermion for the supported Impl types (TODO confirm). */
-  FermOpTemplateInstantiate(MobiusEOFAFermion);
-  GparityFermOpTemplateInstantiate(MobiusEOFAFermion);
-
-}}
@@ -1,133 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.h
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef  GRID_QCD_MOBIUS_EOFA_FERMION_H
-#define  GRID_QCD_MOBIUS_EOFA_FERMION_H
-
-#include <Grid/qcd/action/fermion/AbstractEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-  template<class Impl>
-  class MobiusEOFAFermion : public AbstractEOFAFermion<Impl>
-  {
-    public:
-      INHERIT_IMPL_TYPES(Impl);
-
-    public:
-      // Shift operator coefficients for red-black preconditioned Mobius EOFA
-      std::vector<Coeff_t> Mooee_shift;
-      std::vector<Coeff_t> MooeeInv_shift_lc;
-      std::vector<Coeff_t> MooeeInv_shift_norm;
-      std::vector<Coeff_t> MooeeInvDag_shift_lc;
-      std::vector<Coeff_t> MooeeInvDag_shift_norm;
-
-      virtual void Instantiatable(void) {};
-
-      // EOFA-specific operations
-      virtual void  Omega            (const FermionField& in, FermionField& out, int sign, int dag);
-      virtual void  Dtilde           (const FermionField& in, FermionField& out);
-      virtual void  DtildeInv        (const FermionField& in, FermionField& out);
-
-      // override multiply
-      virtual RealD M                (const FermionField& in, FermionField& out);
-      virtual RealD Mdag             (const FermionField& in, FermionField& out);
-
-      // half checkerboard operations
-      virtual void  Mooee            (const FermionField& in, FermionField& out);
-      virtual void  MooeeDag         (const FermionField& in, FermionField& out);
-      virtual void  MooeeInv         (const FermionField& in, FermionField& out);
-      virtual void  MooeeInv_shift   (const FermionField& in, FermionField& out);
-      virtual void  MooeeInvDag      (const FermionField& in, FermionField& out);
-      virtual void  MooeeInvDag_shift(const FermionField& in, FermionField& out);
-
-      virtual void   M5D             (const FermionField& psi, FermionField& chi);
-      virtual void   M5Ddag          (const FermionField& psi, FermionField& chi);
-
-      /////////////////////////////////////////////////////
-      // Instantiate different versions depending on Impl
-      /////////////////////////////////////////////////////
-      void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
-        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
-
-      void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
-        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
-        std::vector<Coeff_t>& shift_coeffs);
-
-      void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
-        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
-
-      void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
-        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
-        std::vector<Coeff_t>& shift_coeffs);
-
-      void MooeeInternal(const FermionField& in, FermionField& out, int dag, int inv);
-
-      void MooeeInternalCompute(int dag, int inv, Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
-
-      void MooeeInternalAsm(const FermionField& in, FermionField& out, int LLs, int site,
-        Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
-
-      void MooeeInternalZAsm(const FermionField& in, FermionField& out, int LLs, int site,
-        Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
-
-      virtual void RefreshShiftCoefficients(RealD new_shift);
-
-      // Constructors
-      MobiusEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
-        GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
-        RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int pm,
-        RealD _M5, RealD _b, RealD _c, const ImplParams& p=ImplParams());
-
-    protected:
-      void SetCoefficientsPrecondShiftOps(void);
-  };
-}}
-
-#define INSTANTIATE_DPERP_MOBIUS_EOFA(A)\
-template void MobiusEOFAFermion<A>::M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, \
-  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
-template void MobiusEOFAFermion<A>::M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, \
-  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper, std::vector<Coeff_t>& shift_coeffs); \
-template void MobiusEOFAFermion<A>::M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, \
-  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
-template void MobiusEOFAFermion<A>::M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, \
-  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper, std::vector<Coeff_t>& shift_coeffs); \
-template void MobiusEOFAFermion<A>::MooeeInv(const FermionField& psi, FermionField& chi); \
-template void MobiusEOFAFermion<A>::MooeeInv_shift(const FermionField& psi, FermionField& chi); \
-template void MobiusEOFAFermion<A>::MooeeInvDag(const FermionField& psi, FermionField& chi); \
-template void MobiusEOFAFermion<A>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi);
-
-#undef  MOBIUS_EOFA_DPERP_DENSE
-#define MOBIUS_EOFA_DPERP_CACHE
-#undef  MOBIUS_EOFA_DPERP_LINALG
-#define MOBIUS_EOFA_DPERP_VEC
-
-#endif
@@ -1,429 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermioncache.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-  // FIXME -- make a version of these routines with site loop outermost for cache reuse.
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi, const FermionField &phi, FermionField &chi,
-    std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
-  {
-    int Ls = this->Ls;
-    GridBase *grid = psi._grid;
-
-    assert(phi.checkerboard == psi.checkerboard);
-    chi.checkerboard = psi.checkerboard;
-
-    // Flops = 6.0*(Nc*Ns) *Ls*vol
-    this->M5Dcalls++;
-    this->M5Dtime -= usecond();
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
-      for(int s=0; s<Ls; s++){
-        auto tmp = psi._odata[0];
-        if(s==0){
-          spProj5m(tmp, psi._odata[ss+s+1]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5p(tmp, psi._odata[ss+Ls-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        } else if(s==(Ls-1)) {
-          spProj5m(tmp, psi._odata[ss+0]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5p(tmp, psi._odata[ss+s-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        } else {
-          spProj5m(tmp, psi._odata[ss+s+1]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5p(tmp, psi._odata[ss+s-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        }
-      }
-    }
-
-    this->M5Dtime += usecond();
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi, const FermionField &phi, FermionField &chi,
-    std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
-    std::vector<Coeff_t> &shift_coeffs)
-  {
-    int Ls = this->Ls;
-    int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
-    GridBase *grid = psi._grid;
-
-    assert(phi.checkerboard == psi.checkerboard);
-    chi.checkerboard = psi.checkerboard;
-
-    // Flops = 6.0*(Nc*Ns) *Ls*vol
-    this->M5Dcalls++;
-    this->M5Dtime -= usecond();
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
-      for(int s=0; s<Ls; s++){
-        auto tmp = psi._odata[0];
-        if(s==0){
-          spProj5m(tmp, psi._odata[ss+s+1]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5p(tmp, psi._odata[ss+Ls-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        } else if(s==(Ls-1)) {
-          spProj5m(tmp, psi._odata[ss+0]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5p(tmp, psi._odata[ss+s-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        } else {
-          spProj5m(tmp, psi._odata[ss+s+1]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5p(tmp, psi._odata[ss+s-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        }
-        if(this->pm == 1){ spProj5p(tmp, psi._odata[ss+shift_s]); }
-        else{ spProj5m(tmp, psi._odata[ss+shift_s]); }
-        chi[ss+s] = chi[ss+s] + shift_coeffs[s]*tmp;
-      }
-    }
-
-    this->M5Dtime += usecond();
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi, const FermionField &phi, FermionField &chi,
-    std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
-  {
-    int Ls = this->Ls;
-    GridBase *grid = psi._grid;
-
-    assert(phi.checkerboard == psi.checkerboard);
-    chi.checkerboard = psi.checkerboard;
-
-    // Flops = 6.0*(Nc*Ns) *Ls*vol
-    this->M5Dcalls++;
-    this->M5Dtime -= usecond();
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
-      auto tmp = psi._odata[0];
-      for(int s=0; s<Ls; s++){
-        if(s==0) {
-          spProj5p(tmp, psi._odata[ss+s+1]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5m(tmp, psi._odata[ss+Ls-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        } else if(s==(Ls-1)) {
-          spProj5p(tmp, psi._odata[ss+0]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5m(tmp, psi._odata[ss+s-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        } else {
-          spProj5p(tmp, psi._odata[ss+s+1]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5m(tmp, psi._odata[ss+s-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        }
-      }
-    }
-
-    this->M5Dtime += usecond();
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi, const FermionField &phi, FermionField &chi,
-    std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
-    std::vector<Coeff_t> &shift_coeffs)
-  {
-    int Ls = this->Ls;
-    int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
-    GridBase *grid = psi._grid;
-
-    assert(phi.checkerboard == psi.checkerboard);
-    chi.checkerboard = psi.checkerboard;
-
-    // Flops = 6.0*(Nc*Ns) *Ls*vol
-    this->M5Dcalls++;
-    this->M5Dtime -= usecond();
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
-      chi[ss+Ls-1] = zero;
-      auto tmp = psi._odata[0];
-      for(int s=0; s<Ls; s++){
-        if(s==0) {
-          spProj5p(tmp, psi._odata[ss+s+1]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5m(tmp, psi._odata[ss+Ls-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        } else if(s==(Ls-1)) {
-          spProj5p(tmp, psi._odata[ss+0]);
-          chi[ss+s] = chi[ss+s] + diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5m(tmp, psi._odata[ss+s-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        } else {
-          spProj5p(tmp, psi._odata[ss+s+1]);
-          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
-          spProj5m(tmp, psi._odata[ss+s-1]);
-          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
-        }
-        if(this->pm == 1){ spProj5p(tmp, psi._odata[ss+s]); }
-        else{ spProj5m(tmp, psi._odata[ss+s]); }
-        chi[ss+shift_s] = chi[ss+shift_s] + shift_coeffs[s]*tmp;
-      }
-    }
-
-    this->M5Dtime += usecond();
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
-  {
-    if(this->shift != 0.0){ MooeeInv_shift(psi,chi); return; }
-
-    GridBase *grid = psi._grid;
-    int Ls = this->Ls;
-
-    chi.checkerboard = psi.checkerboard;
-
-    this->MooeeInvCalls++;
-    this->MooeeInvTime -= usecond();
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
-
-      auto tmp = psi._odata[0];
-
-      // Apply (L^{\prime})^{-1}
-      chi[ss] = psi[ss]; // chi[0]=psi[0]
-      for(int s=1; s<Ls; s++){
-        spProj5p(tmp, chi[ss+s-1]);
-        chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp;
-      }
-
-      // L_m^{-1}
-      for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-        spProj5m(tmp, chi[ss+s]);
-        chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp;
-      }
-
-      // U_m^{-1} D^{-1}
-      for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
-        spProj5p(tmp, chi[ss+Ls-1]);
-        chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls-1])*tmp;
-      }
-      chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
-
-      // Apply U^{-1}
-      for(int s=Ls-2; s>=0; s--){
-        spProj5m(tmp, chi[ss+s+1]);
-        chi[ss+s] = chi[ss+s] - this->uee[s]*tmp;
-      }
-    }
-
-    this->MooeeInvTime += usecond();
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi, FermionField &chi)
-  {
-    GridBase *grid = psi._grid;
-    int Ls = this->Ls;
-
-    chi.checkerboard = psi.checkerboard;
-
-    this->MooeeInvCalls++;
-    this->MooeeInvTime -= usecond();
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
-
-      auto tmp1        = psi._odata[0];
-      auto tmp2        = psi._odata[0];
-      auto tmp2_spProj = psi._odata[0];
-
-      // Apply (L^{\prime})^{-1} and accumulate MooeeInv_shift_lc[j]*psi[j] in tmp2
-      chi[ss] = psi[ss]; // chi[0]=psi[0]
-      tmp2 = MooeeInv_shift_lc[0]*psi[ss];
-      for(int s=1; s<Ls; s++){
-        spProj5p(tmp1, chi[ss+s-1]);
-        chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
-        tmp2 = tmp2 + MooeeInv_shift_lc[s]*psi[ss+s];
-      }
-      if(this->pm == 1){ spProj5p(tmp2_spProj, tmp2);}
-      else{ spProj5m(tmp2_spProj, tmp2); }
-
-      // L_m^{-1}
-      for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-        spProj5m(tmp1, chi[ss+s]);
-        chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
-      }
-
-      // U_m^{-1} D^{-1}
-      for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
-        spProj5p(tmp1, chi[ss+Ls-1]);
-        chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls-1])*tmp1;
-      }
-      // chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;
-      chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
-      spProj5m(tmp1, chi[ss+Ls-1]);
-      chi[ss+Ls-1] = chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;
-
-      // Apply U^{-1} and add shift term
-      for(int s=Ls-2; s>=0; s--){
-        chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
-        spProj5m(tmp1, chi[ss+s]);
-        chi[ss+s] = chi[ss+s] + MooeeInv_shift_norm[s]*tmp2_spProj;
-      }
-    }
-
-    this->MooeeInvTime += usecond();
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi, FermionField &chi)
-  {
-    if(this->shift != 0.0){ MooeeInvDag_shift(psi,chi); return; }
-
-    GridBase *grid = psi._grid;
-    int Ls = this->Ls;
-
-    chi.checkerboard = psi.checkerboard;
-
-    this->MooeeInvCalls++;
-    this->MooeeInvTime -= usecond();
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
-
-      auto tmp = psi._odata[0];
-
-      // Apply (U^{\prime})^{-dag}
-      chi[ss] = psi[ss];
-      for(int s=1; s<Ls; s++){
-        spProj5m(tmp, chi[ss+s-1]);
-        chi[ss+s] = psi[ss+s] - this->uee[s-1]*tmp;
-      }
-
-      // U_m^{-\dag}
-      for(int s=0; s<Ls-1; s++){
-        spProj5p(tmp, chi[ss+s]);
-        chi[ss+Ls-1] = chi[ss+Ls-1] - this->ueem[s]*tmp;
-      }
-
-      // L_m^{-\dag} D^{-dag}
-      for(int s=0; s<Ls-1; s++){
-        spProj5m(tmp, chi[ss+Ls-1]);
-        chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->leem[s]/this->dee[Ls-1])*tmp;
-      }
-      chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
-
-      // Apply L^{-dag}
-      for(int s=Ls-2; s>=0; s--){
-        spProj5p(tmp, chi[ss+s+1]);
-        chi[ss+s] = chi[ss+s] - this->lee[s]*tmp;
-      }
-    }
-
-    this->MooeeInvTime += usecond();
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi, FermionField &chi)
-  {
-    GridBase *grid = psi._grid;
-    int Ls = this->Ls;
-
-    chi.checkerboard = psi.checkerboard;
-
-    this->MooeeInvCalls++;
-    this->MooeeInvTime -= usecond();
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
-
-      auto tmp1        = psi._odata[0];
-      auto tmp2        = psi._odata[0];
-      auto tmp2_spProj = psi._odata[0];
-
-      // Apply (U^{\prime})^{-dag} and accumulate MooeeInvDag_shift_lc[j]*psi[j] in tmp2
-      chi[ss] = psi[ss];
-      tmp2 = MooeeInvDag_shift_lc[0]*psi[ss];
-      for(int s=1; s<Ls; s++){
-        spProj5m(tmp1, chi[ss+s-1]);
-        chi[ss+s] = psi[ss+s] - this->uee[s-1]*tmp1;
-        tmp2 = tmp2 + MooeeInvDag_shift_lc[s]*psi[ss+s];
-      }
-      if(this->pm == 1){ spProj5p(tmp2_spProj, tmp2);}
-      else{ spProj5m(tmp2_spProj, tmp2); }
-
-      // U_m^{-\dag}
-      for(int s=0; s<Ls-1; s++){
-        spProj5p(tmp1, chi[ss+s]);
-        chi[ss+Ls-1] = chi[ss+Ls-1] - this->ueem[s]*tmp1;
-      }
-
-      // L_m^{-\dag} D^{-dag}
-      for(int s=0; s<Ls-1; s++){
-        spProj5m(tmp1, chi[ss+Ls-1]);
-        chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->leem[s]/this->dee[Ls-1])*tmp1;
-      }
-      chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
-      spProj5p(tmp1, chi[ss+Ls-1]);
-      chi[ss+Ls-1] = chi[ss+Ls-1] + MooeeInvDag_shift_norm[Ls-1]*tmp2_spProj;
-
-      // Apply L^{-dag}
-      for(int s=Ls-2; s>=0; s--){
-        chi[ss+s] = chi[ss+s] - this->lee[s]*tmp1;
-        spProj5p(tmp1, chi[ss+s]);
-        chi[ss+s] = chi[ss+s] + MooeeInvDag_shift_norm[s]*tmp2_spProj;
-      }
-    }
-
-    this->MooeeInvTime += usecond();
-  }
-
-  #ifdef MOBIUS_EOFA_DPERP_CACHE
-
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
-
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
-
-  #endif
-
-}}
@@ -1,184 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermiondense.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/Grid_Eigen_Dense.h>
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-  /*
-  * Dense matrix versions of routines
-  */
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-  {
-    this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
-  {
-    this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-  {
-    this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
-  {
-    this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
-  }
-
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
-  {
-    int Ls = this->Ls;
-    int LLs = psi._grid->_rdimensions[0];
-    int vol = psi._grid->oSites()/LLs;
-
-    int pm      = this->pm;
-    RealD shift = this->shift;
-    RealD alpha = this->alpha;
-    RealD k     = this->k;
-    RealD mq1   = this->mq1;
-
-    chi.checkerboard = psi.checkerboard;
-
-    assert(Ls==LLs);
-
-    Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
-    Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
-
-    for(int s=0;s<Ls;s++){
-        Pplus(s,s)  = this->bee[s];
-        Pminus(s,s) = this->bee[s];
-    }
-
-    for(int s=0; s<Ls-1; s++){
-        Pminus(s,s+1) = -this->cee[s];
-    }
-
-    for(int s=0; s<Ls-1; s++){
-        Pplus(s+1,s) = -this->cee[s+1];
-    }
-    Pplus (0,Ls-1) = mq1*this->cee[0];
-    Pminus(Ls-1,0) = mq1*this->cee[Ls-1];
-
-    if(shift != 0.0){
-      Coeff_t N = 2.0 * ( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) );
-      for(int s=0; s<Ls; ++s){
-        if(pm == 1){ Pplus(s,Ls-1) += shift * k * N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1); }
-        else{ Pminus(Ls-1-s,Ls-1) -= shift * k * N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1); }
-      }
-    }
-
-    Eigen::MatrixXd PplusMat ;
-    Eigen::MatrixXd PminusMat;
-
-    if(inv){
-      PplusMat  = Pplus.inverse();
-      PminusMat = Pminus.inverse();
-    } else {
-      PplusMat  = Pplus;
-      PminusMat = Pminus;
-    }
-
-    if(dag){
-      PplusMat.adjointInPlace();
-      PminusMat.adjointInPlace();
-    }
-
-    // For the non-vectorised s-direction this is simple
-
-    for(auto site=0; site<vol; site++){
-
-        SiteSpinor     SiteChi;
-        SiteHalfSpinor SitePplus;
-        SiteHalfSpinor SitePminus;
-
-        for(int s1=0; s1<Ls; s1++){
-            SiteChi = zero;
-            for(int s2=0; s2<Ls; s2++){
-                int lex2 = s2 + Ls*site;
-                if(PplusMat(s1,s2) != 0.0){
-                    spProj5p(SitePplus,psi[lex2]);
-                    accumRecon5p(SiteChi, PplusMat(s1,s2)*SitePplus);
-                }
-                if(PminusMat(s1,s2) != 0.0){
-                    spProj5m(SitePminus, psi[lex2]);
-                    accumRecon5m(SiteChi, PminusMat(s1,s2)*SitePminus);
-                }
-            }
-            chi[s1+Ls*site] = SiteChi*0.5;
-        }
-    }
-  }
-
-  #ifdef MOBIUS_EOFA_DPERP_DENSE
-
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
-
-    template void MobiusEOFAFermion<GparityWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<GparityWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<WilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<WilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<ZWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<ZWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
-
-    template void MobiusEOFAFermion<GparityWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<GparityWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<WilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<WilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<ZWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<ZWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-  #endif
-
-}}
@@ -1,290 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermionssp.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-  // FIXME -- make a version of these routines with site loop outermost for cache reuse.
-  // Pminus forwards
-  // Pplus  backwards
-  // Slice-by-slice application of the s-direction stencil. For every s:
-  //   chi[s]  = diag[s]*phi[s] + upper[s] * (axpby_ssp_pminus term taken from psi[s+1])
-  //   chi[s] += lower[s] * (axpby_ssp_pplus term taken from psi[s-1])
-  // with the neighbour index wrapping around at both ends of the s-axis.
-  // The axpby_ssp_* helpers are defined elsewhere; the formula above follows their argument order.
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
-    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-  {
-    Coeff_t unity(1.0);
-    const int Ls   = this->Ls;
-    const int last = Ls - 1;
-
-    for(int s = 0; s < Ls; ++s){
-      // s == 0 is tested first so the Ls == 1 corner keeps the forward index s+1.
-      const int below = (s == 0) ? last : s - 1;
-      const int above = (s != 0 && s == last) ? 0 : s + 1;
-
-      axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, above);
-      axpby_ssp_pplus (chi, unity,   chi, lower[s], psi, s, below);
-    }
-  }
-
-  // Same slice stencil as M5D, with the shift contribution added to each slice right after it is built:
-  //   pm == 1 : chi[s] += shift_coeffs[s] * (axpby_ssp_pplus  term taken from psi[Ls-1])
-  //   else    : chi[s] += shift_coeffs[s] * (axpby_ssp_pminus term taken from psi[0])
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField& psi, const FermionField& phi,
-    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
-    std::vector<Coeff_t>& shift_coeffs)
-  {
-    Coeff_t unity(1.0);
-    const int  Ls         = this->Ls;
-    const int  last       = Ls - 1;
-    const bool plus_shift = (this->pm == 1);
-
-    for(int s = 0; s < Ls; ++s){
-      // s == 0 is tested first so the Ls == 1 corner keeps the forward index s+1.
-      const int below = (s == 0) ? last : s - 1;
-      const int above = (s != 0 && s == last) ? 0 : s + 1;
-
-      axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, above);
-      axpby_ssp_pplus (chi, unity,   chi, lower[s], psi, s, below);
-
-      if(plus_shift){
-        axpby_ssp_pplus (chi, unity, chi, shift_coeffs[s], psi, s, last);
-      } else {
-        axpby_ssp_pminus(chi, unity, chi, shift_coeffs[s], psi, s, 0);
-      }
-    }
-  }
-
-  // Counterpart of M5D with the two projector helpers exchanged. For every s:
-  //   chi[s]  = diag[s]*phi[s] + upper[s] * (axpby_ssp_pplus term taken from psi[s+1])
-  //   chi[s] += lower[s] * (axpby_ssp_pminus term taken from psi[s-1])
-  // with the neighbour index wrapping around at both ends of the s-axis.
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
-    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-  {
-    Coeff_t unity(1.0);
-    const int Ls   = this->Ls;
-    const int last = Ls - 1;
-
-    for(int s = 0; s < Ls; ++s){
-      // s == 0 is tested first so the Ls == 1 corner keeps the forward index s+1.
-      const int below = (s == 0) ? last : s - 1;
-      const int above = (s != 0 && s == last) ? 0 : s + 1;
-
-      axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, above);
-      axpby_ssp_pminus(chi, unity,   chi, lower[s], psi, s, below);
-    }
-  }
-
-  // Counterpart of M5D_shift with the projector helpers exchanged and the shift term transposed:
-  // instead of spreading one slice of psi over every chi[s], every psi[s] is gathered into a
-  // single slice of chi (slice Ls-1 when pm == 1, slice 0 otherwise).
-  //
-  //   psi, phi      : inputs (phi feeds the diagonal term, psi the off-diagonal and shift terms)
-  //   chi           : output
-  //   lower/diag/upper, shift_coeffs : per-slice coefficients, indexed by s
-  //
-  // The gather runs in its own pass AFTER the stencil pass. Assuming the axpby_ssp_* helpers
-  // compute z[s] = a*x[s] + b*P(y[sp]) (they are defined elsewhere), the first call for each
-  // slice rewrites chi[s] from phi[s]; interleaving the gather with the stencil, as before,
-  // meant that for pm == 1 the contributions added to chi[Ls-1] while s < Ls-1 were lost
-  // (or rescaled by diag[Ls-1] when phi aliases chi) once the s == Ls-1 iteration rebuilt
-  // that slice. For pm != 1 the target slice 0 is built first, so results are unchanged.
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField& psi, const FermionField& phi,
-    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
-    std::vector<Coeff_t>& shift_coeffs)
-  {
-    Coeff_t one(1.0);
-    int Ls = this->Ls;
-
-    // Pass 1: tridiagonal stencil, builds every chi[s].
-    for(int s=0; s<Ls; s++){
-      if(s==0) {
-        axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
-        axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, Ls-1);
-      } else if (s==(Ls-1)) {
-        axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, 0);
-        axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
-      } else {
-        axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
-        axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
-      }
-    }
-
-    // Pass 2: gather the shift term into the (now final) boundary slice.
-    for(int s=0; s<Ls; s++){
-      if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); }
-      else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); }
-    }
-  }
-
-  // Applies the factorised inverse of the s-direction operator to psi, writing chi.
-  // A non-zero shift is handed off to MooeeInv_shift. Otherwise the factors are applied
-  // in place on chi in the order (L')^{-1}, L_m^{-1}, U_m^{-1} D^{-1}, U^{-1}, using the
-  // precomputed lee / leem / dee / ueem / uee coefficient tables.
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-  {
-    if(this->shift != 0.0){
-      MooeeInv_shift(psi,chi);
-      return;
-    }
-
-    Coeff_t unity(1.0);
-    Coeff_t nil(0.0);
-    chi.checkerboard = psi.checkerboard;
-    const int Ls   = this->Ls;
-    const int last = Ls - 1;
-
-    // (L')^{-1}: copy slice 0, then forward recursion chi[s] = psi[s] - lee[s-1] P_+ chi[s-1]
-    axpby_ssp(chi, unity, psi, nil, psi, 0, 0);
-    for(int s = 1; s < Ls; ++s){
-      const int prev = s - 1;
-      axpby_ssp_pplus(chi, unity, psi, -this->lee[prev], chi, s, prev);
-    }
-
-    // L_m^{-1}: fold every earlier slice into the last one
-    for(int s = 0; s < last; ++s){
-      axpby_ssp_pminus(chi, unity, chi, -this->leem[s], chi, last, s);
-    }
-
-    // U_m^{-1} D^{-1}: slices below the last first, the last slice afterwards
-    for(int s = 0; s < last; ++s){
-      axpby_ssp_pplus(chi, unity/this->dee[s], chi, -this->ueem[s]/this->dee[last], chi, s, last);
-    }
-    axpby_ssp(chi, unity/this->dee[last], chi, nil, chi, last, last);
-
-    // U^{-1}: backward recursion from the second-to-last slice down to slice 0
-    for(int s = last - 1; s >= 0; --s){
-      axpby_ssp_pminus(chi, unity, chi, -this->uee[s], chi, s, s + 1);
-    }
-  }
-
-  // Shifted variant of MooeeInv: same in-place factor sequence on chi, plus a correction
-  // built from tmp[0] = sum_s MooeeInv_shift_lc[s] * psi[s], which is added to every slice
-  // with weight MooeeInv_shift_norm[s] through axpby_ssp_pplus (pm == 1) or axpby_ssp_pminus.
-  //
-  //   psi : input field
-  //   chi : output field (its checkerboard is set from psi)
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
-  {
-    Coeff_t one(1.0);
-    Coeff_t czero(0.0);
-    chi.checkerboard = psi.checkerboard;
-    int Ls = this->Ls;
-
-    FermionField tmp(psi._grid);
-
-    // Apply (L^{\prime})^{-1}
-    axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
-    // Seed the accumulator from psi rather than from tmp itself: tmp has just been
-    // constructed and never written, so using it as the x operand (even scaled by zero)
-    // would read indeterminate data and could inject NaN/Inf into the sum.
-    axpby_ssp(tmp, czero, psi, this->MooeeInv_shift_lc[0], psi, 0, 0);
-    for(int s=1; s<Ls; s++){
-      axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// recursion Psi[s] -lee P_+ chi[s-1]
-      axpby_ssp(tmp, one, tmp, this->MooeeInv_shift_lc[s], psi, 0, s); // tmp[0] += lc[s]*psi[s]
-    }
-
-    // L_m^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-      axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
-    }
-
-    // U_m^{-1} D^{-1}
-    for(int s=0; s<Ls-1; s++){
-      axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls-1], chi, s, Ls-1);
-    }
-    axpby_ssp(chi, one/this->dee[Ls-1], chi, czero, chi, Ls-1, Ls-1);
-
-    // Apply U^{-1} and add shift term
-    if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInv_shift_norm[Ls-1], tmp, Ls-1, 0); }
-    else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInv_shift_norm[Ls-1], tmp, Ls-1, 0); }
-    for(int s=Ls-2; s>=0; s--){
-      axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1);  // chi[Ls]
-      if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInv_shift_norm[s], tmp, s, 0); }
-      else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInv_shift_norm[s], tmp, s, 0); }
-    }
-  }
-
-  // Applies the factorised inverse of the daggered s-direction operator to psi, writing chi.
-  // A non-zero shift is handed off to MooeeInvDag_shift. Otherwise the conjugated factors are
-  // applied in place on chi in the order (U')^{-dag}, U_m^{-dag}, L_m^{-dag} D^{-dag}, L^{-dag}.
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-  {
-    if(this->shift != 0.0){
-      MooeeInvDag_shift(psi,chi);
-      return;
-    }
-
-    Coeff_t unity(1.0);
-    Coeff_t nil(0.0);
-    chi.checkerboard = psi.checkerboard;
-    const int Ls   = this->Ls;
-    const int last = Ls - 1;
-
-    // (U')^{-dag}: copy slice 0, then forward recursion on the following slices
-    axpby_ssp(chi, unity, psi, nil, psi, 0, 0);
-    for(int s = 1; s < Ls; ++s){
-      const int prev = s - 1;
-      axpby_ssp_pminus(chi, unity, psi, -conjugate(this->uee[prev]), chi, s, prev);
-    }
-
-    // U_m^{-dag}: fold every earlier slice into the last one
-    for(int s = 0; s < last; ++s){
-      axpby_ssp_pplus(chi, unity, chi, -conjugate(this->ueem[s]), chi, last, s);
-    }
-
-    // L_m^{-dag} D^{-dag}: slices below the last first, the last slice afterwards
-    for(int s = 0; s < last; ++s){
-      axpby_ssp_pminus(chi, unity/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[last]), chi, s, last);
-    }
-    axpby_ssp(chi, unity/conjugate(this->dee[last]), chi, nil, chi, last, last);
-
-    // L^{-dag}: backward recursion from the second-to-last slice down to slice 0
-    for(int s = last - 1; s >= 0; --s){
-      axpby_ssp_pplus(chi, unity, chi, -conjugate(this->lee[s]), chi, s, s + 1);
-    }
-  }
-
-  // Shifted variant of MooeeInvDag: same in-place conjugated factor sequence on chi, plus a
-  // correction built from tmp[0] = sum_s MooeeInvDag_shift_lc[s] * psi[s], which is added to
-  // every slice with weight MooeeInvDag_shift_norm[s] through axpby_ssp_pplus (pm == 1) or
-  // axpby_ssp_pminus.
-  //
-  //   psi : input field
-  //   chi : output field (its checkerboard is set from psi)
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
-  {
-    Coeff_t one(1.0);
-    Coeff_t czero(0.0);
-    chi.checkerboard = psi.checkerboard;
-    int Ls = this->Ls;
-
-    FermionField tmp(psi._grid);
-
-    // Apply (U^{\prime})^{-dagger} and accumulate (MooeeInvDag_shift_lc)_{j} \psi_{j} in tmp[0]
-    axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
-    // Seed the accumulator from psi rather than from tmp itself: tmp has just been
-    // constructed and never written, so using it as the x operand (even scaled by zero)
-    // would read indeterminate data and could inject NaN/Inf into the sum.
-    axpby_ssp(tmp, czero, psi, this->MooeeInvDag_shift_lc[0], psi, 0, 0);
-    for(int s=1; s<Ls; s++){
-      axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
-      axpby_ssp(tmp, one, tmp, this->MooeeInvDag_shift_lc[s], psi, 0, s); // tmp[0] += lc[s]*psi[s]
-    }
-
-    // U_m^{-\dagger}
-    for(int s=0; s<Ls-1; s++){
-      axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
-    }
-
-    // L_m^{-\dagger} D^{-dagger}
-    for(int s=0; s<Ls-1; s++){
-      axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1);
-    }
-    axpby_ssp(chi, one/conjugate(this->dee[Ls-1]), chi, czero, chi, Ls-1, Ls-1);
-
-    // Apply L^{-dagger} and add shift
-    if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInvDag_shift_norm[Ls-1], tmp, Ls-1, 0); }
-    else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInvDag_shift_norm[Ls-1], tmp, Ls-1, 0); }
-    for(int s=Ls-2; s>=0; s--){
-      axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1);  // chi[Ls]
-      if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInvDag_shift_norm[s], tmp, s, 0); }
-      else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInvDag_shift_norm[s], tmp, s, 0); }
-    }
-  }
-
-  #ifdef MOBIUS_EOFA_DPERP_LINALG // emitted only when this macro is defined (set elsewhere; presumably selects this file's variant)
-    // INSTANTIATE_DPERP_MOBIUS_EOFA (defined elsewhere) applied to the F/D-suffixed implementation types.
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
-    // Same macro applied to the FH/DF-suffixed implementation types.
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
-
-  #endif
-
-}}
@@ -1,983 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermionvec.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-
-namespace Grid {
-namespace QCD {
-
-  /*
-  * Dense matrix versions of routines
-  */
-  // Dense-matrix path: forwards to MooeeInternal with the (DaggerNo, InverseYes) flag pair.
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
-  {
-    const int dag = DaggerNo;
-    const int inv = InverseYes;
-    this->MooeeInternal(psi, chi, dag, inv);
-  }
-
-  // Dense-matrix path: forwards to MooeeInternal with the same (DaggerNo, InverseYes)
-  // flag pair as MooeeInv; no shift-specific argument is passed here.
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
-  {
-    const int dag = DaggerNo;
-    const int inv = InverseYes;
-    this->MooeeInternal(psi, chi, dag, inv);
-  }
-
-  // Dense-matrix path: forwards to MooeeInternal with the (DaggerYes, InverseYes) flag pair.
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
-  {
-    const int dag = DaggerYes;
-    const int inv = InverseYes;
-    this->MooeeInternal(psi, chi, dag, inv);
-  }
-
-  // Dense-matrix path: forwards to MooeeInternal with the same (DaggerYes, InverseYes)
-  // flag pair as MooeeInvDag; no shift-specific argument is passed here.
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
-  {
-    const int dag = DaggerYes;
-    const int inv = InverseYes;
-    this->MooeeInternal(psi, chi, dag, inv);
-  }
-
-  template<class Impl> // M5D for the layout in which the s-axis is split over SIMD lanes (Ls = nsimd * LLs, asserted below)
-  void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi, // psi: neighbour-term input, phi: diagonal-term input
-    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper) // chi: output; coefficients indexed by s
-  {
-    GridBase* grid  = psi._grid;
-    int Ls          = this->Ls;
-    int LLs         = grid->_rdimensions[0]; // outer slices per block; assumes grid dimension 0 is the s-axis -- TODO confirm
-    const int nsimd = Simd::Nsimd();
-    // SIMD-packed copies of the coefficient tables, one vector word per outer slice.
-    Vector<iSinglet<Simd>> u(LLs);
-    Vector<iSinglet<Simd>> l(LLs);
-    Vector<iSinglet<Simd>> d(LLs);
-
-    assert(Ls/LLs == nsimd); // the s-axis must be split nsimd ways
-    assert(phi.checkerboard == psi.checkerboard);
-
-    chi.checkerboard = psi.checkerboard;
-
-    // Address the packed SIMD words as flat scalar arrays (type pun) so single lanes can be filled.
-    typedef typename Simd::scalar_type scalar_type;
-    scalar_type* u_p = (scalar_type*) &u[0];
-    scalar_type* l_p = (scalar_type*) &l[0];
-    scalar_type* d_p = (scalar_type*) &d[0];
-    // Lane i of outer slice o receives the coefficient for s = o + i*LLs.
-    for(int o=0; o<LLs; o++){ // outer
-    for(int i=0; i<nsimd; i++){ //inner
-      int s   = o + i*LLs;
-      int ss  = o*nsimd + i;
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-    this->M5Dcalls++; // call counter for profiling
-    this->M5Dtime -= usecond(); // paired with the += at the end: accumulates elapsed microseconds
-
-    assert(Nc == 3); // the colour indices 0..2 are hard-coded in the unrolled code below
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // one iteration per block of LLs consecutive outer sites
-
-      #if 0 // disabled reference version written with the spProj5/spRecon5 helpers
-
-        alignas(64) SiteHalfSpinor hp;
-        alignas(64) SiteHalfSpinor hm;
-        alignas(64) SiteSpinor fp;
-        alignas(64) SiteSpinor fm;
-
-        for(int v=0; v<LLs; v++){
-
-          int vp = (v+1)%LLs;
-          int vm = (v+LLs-1)%LLs;
-
-          spProj5m(hp, psi[ss+vp]);
-          spProj5p(hm, psi[ss+vm]);
-
-          if (vp <= v){ rotate(hp, hp, 1); }
-          if (vm >= v){ rotate(hm, hm, nsimd-1); }
-
-          hp = 0.5*hp;
-          hm = 0.5*hm;
-
-          spRecon5m(fp, hp);
-          spRecon5p(fm, hm);
-
-          chi[ss+v] = d[v]*phi[ss+v];
-          chi[ss+v] = chi[ss+v] + u[v]*fp;
-          chi[ss+v] = chi[ss+v] + l[v]*fm;
-
-        }
-
-      #else // active version: unrolled over the 4 spin x 3 colour components
-
-        for(int v=0; v<LLs; v++){
-
-          vprefetch(psi[ss+v+LLs]); // NOTE(review): on the final block ss+v+LLs >= oSites() -- confirm vprefetch tolerates that index
-          // Neighbouring outer slices, wrapping around inside the block.
-          int vp = (v == LLs-1) ? 0     : v+1;
-          int vm = (v == 0)     ? LLs-1 : v-1;
-          // hp: spin rows 2,3 of the vp neighbour (multiplied by u[v] below).
-          Simd hp_00 = psi[ss+vp]()(2)(0);
-          Simd hp_01 = psi[ss+vp]()(2)(1);
-          Simd hp_02 = psi[ss+vp]()(2)(2);
-          Simd hp_10 = psi[ss+vp]()(3)(0);
-          Simd hp_11 = psi[ss+vp]()(3)(1);
-          Simd hp_12 = psi[ss+vp]()(3)(2);
-          // hm: spin rows 0,1 of the vm neighbour (multiplied by l[v] below).
-          Simd hm_00 = psi[ss+vm]()(0)(0);
-          Simd hm_01 = psi[ss+vm]()(0)(1);
-          Simd hm_02 = psi[ss+vm]()(0)(2);
-          Simd hm_10 = psi[ss+vm]()(1)(0);
-          Simd hm_11 = psi[ss+vm]()(1)(1);
-          Simd hm_12 = psi[ss+vm]()(1)(2);
-          // Taken only when vp wrapped to 0 (v == LLs-1): rotate the lanes of the wrapped neighbour.
-          if(vp <= v){
-            hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-            hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-            hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-            hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-            hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-            hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-          }
-          // Taken only when vm wrapped to LLs-1 (v == 0): rotate the lanes of the wrapped neighbour the other way.
-          if(vm >= v){
-            hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-            hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-            hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-            hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-            hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-            hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-          }
-
-          // Can force these to real arithmetic and save 2x.
-          Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-          Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-          Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-          Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-          Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-          Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-          Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-          Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-          Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-          Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-          Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-          Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-          // Store the twelve spin-colour components of chi for this slice.
-          vstream(chi[ss+v]()(0)(0), p_00);
-          vstream(chi[ss+v]()(0)(1), p_01);
-          vstream(chi[ss+v]()(0)(2), p_02);
-          vstream(chi[ss+v]()(1)(0), p_10);
-          vstream(chi[ss+v]()(1)(1), p_11);
-          vstream(chi[ss+v]()(1)(2), p_12);
-          vstream(chi[ss+v]()(2)(0), p_20);
-          vstream(chi[ss+v]()(2)(1), p_21);
-          vstream(chi[ss+v]()(2)(2), p_22);
-          vstream(chi[ss+v]()(3)(0), p_30);
-          vstream(chi[ss+v]()(3)(1), p_31);
-          vstream(chi[ss+v]()(3)(2), p_32);
-        }
-
-      #endif
-    }
-
-    this->M5Dtime += usecond();
-  }
-
-  template<class Impl> // M5D plus the per-slice shift term, for the layout in which the s-axis is split over SIMD lanes
-  void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField& psi, const FermionField& phi, // psi: neighbour/shift input, phi: diagonal-term input
-    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper, // chi: output; coefficients indexed by s
-    std::vector<Coeff_t>& shift_coeffs) // per-s weight of the shift term
-  {
-    #if 0 // disabled reference version: plain M5D followed by a per-slice axpby_ssp shift
-
-      this->M5D(psi, phi, chi, lower, diag, upper);
-
-      // FIXME: possible gain from vectorizing shift operation as well?
-      Coeff_t one(1.0);
-      int Ls = this->Ls;
-      for(int s=0; s<Ls; s++){
-        if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, s, Ls-1); }
-        else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, s, 0); }
-      }
-
-    #else // active version: stencil and shift fused, unrolled over spin/colour components
-
-      GridBase* grid  = psi._grid;
-      int Ls          = this->Ls;
-      int LLs         = grid->_rdimensions[0]; // outer slices per block; assumes grid dimension 0 is the s-axis -- TODO confirm
-      const int nsimd = Simd::Nsimd();
-      // SIMD-packed copies of the coefficient tables, one vector word per outer slice.
-      Vector<iSinglet<Simd>> u(LLs);
-      Vector<iSinglet<Simd>> l(LLs);
-      Vector<iSinglet<Simd>> d(LLs);
-      Vector<iSinglet<Simd>> s(LLs); // packed shift_coeffs; note the packing loop below declares a local 'int s' that shadows this
-
-      assert(Ls/LLs == nsimd); // the s-axis must be split nsimd ways
-      assert(phi.checkerboard == psi.checkerboard);
-
-      chi.checkerboard = psi.checkerboard;
-
-      // Address the packed SIMD words as flat scalar arrays (type pun) so single lanes can be filled.
-      typedef typename Simd::scalar_type scalar_type;
-      scalar_type* u_p = (scalar_type*) &u[0];
-      scalar_type* l_p = (scalar_type*) &l[0];
-      scalar_type* d_p = (scalar_type*) &d[0];
-      scalar_type* s_p = (scalar_type*) &s[0];
-      // Lane i of outer slice o receives the coefficient for s = o + i*LLs; s_p was taken before 'int s' shadows the vector.
-      for(int o=0; o<LLs; o++){ // outer
-      for(int i=0; i<nsimd; i++){ //inner
-        int s   = o + i*LLs;
-        int ss  = o*nsimd + i;
-        u_p[ss] = upper[s];
-        l_p[ss] = lower[s];
-        d_p[ss] = diag[s];
-        s_p[ss] = shift_coeffs[s];
-      }}
-
-      this->M5Dcalls++; // call counter for profiling
-      this->M5Dtime -= usecond(); // paired with the += at the end: accumulates elapsed microseconds
-
-      assert(Nc == 3); // the colour indices 0..2 are hard-coded in the unrolled code below
-
-      parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // one iteration per block of LLs consecutive outer sites
-        // hs: source of the shift term -- spin rows 2,3 of slice LLs-1 when pm == 1, spin rows 0,1 of slice 0 otherwise.
-        int vs     = (this->pm == 1) ? LLs-1 : 0;
-        Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(2)(0) : psi[ss+vs]()(0)(0);
-        Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(2)(1) : psi[ss+vs]()(0)(1);
-        Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(2)(2) : psi[ss+vs]()(0)(2);
-        Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(3)(0) : psi[ss+vs]()(1)(0);
-        Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(3)(1) : psi[ss+vs]()(1)(1);
-        Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(3)(2) : psi[ss+vs]()(1)(2);
-        // NOTE(review): the disabled branch uses axpby_ssp_pplus for pm == 1, yet rows 2,3 are what M5D pairs with spProj5m -- confirm the intended projector.
-        for(int v=0; v<LLs; v++){
-
-          vprefetch(psi[ss+v+LLs]); // NOTE(review): on the final block ss+v+LLs >= oSites() -- confirm vprefetch tolerates that index
-          // Neighbouring outer slices, wrapping around inside the block.
-          int vp = (v == LLs-1) ? 0     : v+1;
-          int vm = (v == 0)     ? LLs-1 : v-1;
-          // hp: spin rows 2,3 of the vp neighbour (multiplied by u[v] below).
-          Simd hp_00 = psi[ss+vp]()(2)(0);
-          Simd hp_01 = psi[ss+vp]()(2)(1);
-          Simd hp_02 = psi[ss+vp]()(2)(2);
-          Simd hp_10 = psi[ss+vp]()(3)(0);
-          Simd hp_11 = psi[ss+vp]()(3)(1);
-          Simd hp_12 = psi[ss+vp]()(3)(2);
-          // hm: spin rows 0,1 of the vm neighbour (multiplied by l[v] below).
-          Simd hm_00 = psi[ss+vm]()(0)(0);
-          Simd hm_01 = psi[ss+vm]()(0)(1);
-          Simd hm_02 = psi[ss+vm]()(0)(2);
-          Simd hm_10 = psi[ss+vm]()(1)(0);
-          Simd hm_11 = psi[ss+vm]()(1)(1);
-          Simd hm_12 = psi[ss+vm]()(1)(2);
-          // Taken only when vp wrapped to 0 (v == LLs-1): rotate the lanes of the wrapped neighbour.
-          if(vp <= v){
-            hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-            hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-            hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-            hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-            hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-            hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-          }
-          // With vs == LLs-1 this is taken only at v == LLs-1, the last iteration; hs is rotated in place.
-          if(this->pm == 1 && vs <= v){
-            hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
-            hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
-            hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
-            hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
-            hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
-            hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
-          }
-          // Taken only when vm wrapped to LLs-1 (v == 0): rotate the lanes of the wrapped neighbour the other way.
-          if(vm >= v){
-            hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-            hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-            hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-            hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-            hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-            hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-          }
-          // With vs == 0 this is taken only at v == 0; hs is declared outside the v loop, so it stays rotated for v > 0 -- NOTE(review): confirm intended.
-          if(this->pm == -1 && vs >= v){
-            hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
-            hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
-            hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
-            hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
-            hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
-            hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
-          }
-          // Rows 0,1 take the l[v]*hm term and rows 2,3 the u[v]*hp term; s[v]*hs joins rows 2,3 when pm == 1 and rows 0,1 otherwise.
-          // Can force these to real arithmetic and save 2x.
-          Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
-          Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
-          Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
-          Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
-          Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
-          Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
-          Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-          Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-          Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-          Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-          Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-          Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-          // Store the twelve spin-colour components of chi for this slice.
-          vstream(chi[ss+v]()(0)(0), p_00);
-          vstream(chi[ss+v]()(0)(1), p_01);
-          vstream(chi[ss+v]()(0)(2), p_02);
-          vstream(chi[ss+v]()(1)(0), p_10);
-          vstream(chi[ss+v]()(1)(1), p_11);
-          vstream(chi[ss+v]()(1)(2), p_12);
-          vstream(chi[ss+v]()(2)(0), p_20);
-          vstream(chi[ss+v]()(2)(1), p_21);
-          vstream(chi[ss+v]()(2)(2), p_22);
-          vstream(chi[ss+v]()(3)(0), p_30);
-          vstream(chi[ss+v]()(3)(1), p_31);
-          vstream(chi[ss+v]()(3)(2), p_32);
-        }
-      }
-
-      this->M5Dtime += usecond();
-
-    #endif
-  }
-  /* M5Ddag: for every block of LLs consecutive outer sites (ss .. ss+LLs-1) and every v in the block
-   *   chi[ss+v] components (0..1)(c) = d[v]*phi[ss+v] + u[v]*psi[ss+vp],  vp = v+1 wrapping to 0
-   *   chi[ss+v] components (2..3)(c) = d[v]*phi[ss+v] + l[v]*psi[ss+vm],  vm = v-1 wrapping to LLs-1
-   * with the products formed by switcheroo<Coeff_t>::mult. u, l, d are repacked copies of
-   * upper, lower, diag (see the fill loop below). When vp or vm wraps, the neighbour's
-   * components are first passed through Optimization::Rotate::tRotate.
-   * Asserts Ls/LLs == Simd::Nsimd() and phi.checkerboard == psi.checkerboard; sets
-   * chi.checkerboard from psi; increments M5Dcalls and brackets the site loop with usecond()
-   * to accumulate into M5Dtime.
-   */
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
-    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
-  {
-    GridBase* grid = psi._grid;
-    int Ls  = this->Ls;
-    int LLs = grid->_rdimensions[0];
-    int nsimd = Simd::Nsimd();
-
-    Vector<iSinglet<Simd>> u(LLs);
-    Vector<iSinglet<Simd>> l(LLs);
-    Vector<iSinglet<Simd>> d(LLs);
-
-    assert(Ls/LLs == nsimd);
-    assert(phi.checkerboard == psi.checkerboard);
-
-    chi.checkerboard = psi.checkerboard;
-
-    // just directly address via type pun: the Simd-typed vectors are filled through scalar_type pointers
-    typedef typename Simd::scalar_type scalar_type;
-    scalar_type* u_p = (scalar_type*) &u[0];
-    scalar_type* l_p = (scalar_type*) &l[0];
-    scalar_type* d_p = (scalar_type*) &d[0];
-
-    for(int o=0; o<LLs; o++){ // outer
-    for(int i=0; i<nsimd; i++){ //inner
-      int s  = o + i*LLs; // index into the caller's upper/lower/diag vectors
-      int ss = o*nsimd + i; // scalar slot written through the punned pointers
-      u_p[ss] = upper[s];
-      l_p[ss] = lower[s];
-      d_p[ss] = diag[s];
-    }}
-
-    this->M5Dcalls++;
-    this->M5Dtime -= usecond();
-
-    parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
-
-      #if 0 // disabled branch: same update written with spProj5p/spProj5m, rotate and spRecon5p/spRecon5m
-
-        alignas(64) SiteHalfSpinor hp;
-        alignas(64) SiteHalfSpinor hm;
-        alignas(64) SiteSpinor fp;
-        alignas(64) SiteSpinor fm;
-
-        for(int v=0; v<LLs; v++){
-
-          int vp = (v+1)%LLs;
-          int vm = (v+LLs-1)%LLs;
-
-          spProj5p(hp, psi[ss+vp]);
-          spProj5m(hm, psi[ss+vm]);
-
-          if(vp <= v){ rotate(hp, hp, 1); }
-          if(vm >= v){ rotate(hm, hm, nsimd-1); }
-
-          hp = hp*0.5;
-          hm = hm*0.5;
-          spRecon5p(fp, hp);
-          spRecon5m(fm, hm);
-
-          chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
-          chi[ss+v] = chi[ss+v]     +l[v]*fm;
-
-        }
-
-      #else // active branch: hand-unrolled over the twelve ()(0..3)(0..2) components
-
-        for(int v=0; v<LLs; v++){
-
-          vprefetch(psi[ss+v+LLs]); // NOTE(review): in the last block ss+v+LLs reaches past oSites(); presumably harmless for a prefetch -- confirm
-
-          int vp = (v == LLs-1) ? 0     : v+1;
-          int vm = (v == 0    ) ? LLs-1 : v-1;
-          // hp_*: components (0..1)(0..2) of the vp neighbour; hm_*: components (2..3)(0..2) of the vm neighbour
-          Simd hp_00 = psi[ss+vp]()(0)(0);
-          Simd hp_01 = psi[ss+vp]()(0)(1);
-          Simd hp_02 = psi[ss+vp]()(0)(2);
-          Simd hp_10 = psi[ss+vp]()(1)(0);
-          Simd hp_11 = psi[ss+vp]()(1)(1);
-          Simd hp_12 = psi[ss+vp]()(1)(2);
-
-          Simd hm_00 = psi[ss+vm]()(2)(0);
-          Simd hm_01 = psi[ss+vm]()(2)(1);
-          Simd hm_02 = psi[ss+vm]()(2)(2);
-          Simd hm_10 = psi[ss+vm]()(3)(0);
-          Simd hm_11 = psi[ss+vm]()(3)(1);
-          Simd hm_12 = psi[ss+vm]()(3)(2);
-          // vp <= v holds only when vp wrapped to 0: the loaded values then go through tRotate<2>
-          if (vp <= v){
-            hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-            hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-            hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-            hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-            hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-            hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-          }
-          // vm >= v holds only when vm wrapped to LLs-1: the loaded values go through tRotate<2*Nsimd-2>
-          if(vm >= v){
-            hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-            hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-            hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-            hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-            hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-            hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-          }
-          // components (0..1)(c): d*phi + u*hp ; components (2..3)(c): d*phi + l*hm
-          Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-          Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-          Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-          Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-          Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-          Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-          Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
-          Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
-          Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
-          Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
-          Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
-          Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
-          // write the twelve results to chi[ss+v] with vstream
-          vstream(chi[ss+v]()(0)(0), p_00);
-          vstream(chi[ss+v]()(0)(1), p_01);
-          vstream(chi[ss+v]()(0)(2), p_02);
-          vstream(chi[ss+v]()(1)(0), p_10);
-          vstream(chi[ss+v]()(1)(1), p_11);
-          vstream(chi[ss+v]()(1)(2), p_12);
-          vstream(chi[ss+v]()(2)(0), p_20);
-          vstream(chi[ss+v]()(2)(1), p_21);
-          vstream(chi[ss+v]()(2)(2), p_22);
-          vstream(chi[ss+v]()(3)(0), p_30);
-          vstream(chi[ss+v]()(3)(1), p_31);
-          vstream(chi[ss+v]()(3)(2), p_32);
-
-        }
-
-      #endif
-
-    }
-
-    this->M5Dtime += usecond();
-  }
-  /* M5Ddag_shift: for every block of LLs consecutive outer sites and every v in the block
-   *   chi[ss+v] components (0..1)(c) = d[v]*phi[ss+v] + u[v]*psi[ss+vp]   (vp = v+1 wrapping to 0)
-   *   chi[ss+v] components (2..3)(c) = d[v]*phi[ss+v] + l[v]*psi[ss+vm]   (vm = v-1 wrapping to LLs-1)
-   * plus one extra term s[v]*hs, where s is the repacked shift_coeffs and hs_* is read once
-   * per block, before the v loop:
-   *   pm == 1 : hs = psi[ss+LLs-1] components (0..1)(c), added to chi components (0..1)(c)
-   *   else    : hs = psi[ss+0]     components (2..3)(c), added to chi components (2..3)(c)
-   * The first preprocessor branch (#if 0) is disabled; it calls M5Ddag and then applies
-   * axpby_ssp_pplus / axpby_ssp_pminus once per s.
-   * The active branch asserts Ls/LLs == Simd::Nsimd() and phi.checkerboard == psi.checkerboard,
-   * sets chi.checkerboard from psi, increments M5Dcalls and accumulates into M5Dtime via usecond().
-   */
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField& psi, const FermionField& phi,
-    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
-    std::vector<Coeff_t>& shift_coeffs)
-  {
-    #if 0
-
-      this->M5Ddag(psi, phi, chi, lower, diag, upper);
-
-      // FIXME: possible gain from vectorizing shift operation as well?
-      Coeff_t one(1.0);
-      int Ls = this->Ls;
-      for(int s=0; s<Ls; s++){
-        if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); }
-        else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); }
-      }
-
-    #else
-
-      GridBase* grid = psi._grid;
-      int Ls  = this->Ls;
-      int LLs = grid->_rdimensions[0];
-      int nsimd = Simd::Nsimd();
-
-      Vector<iSinglet<Simd>> u(LLs);
-      Vector<iSinglet<Simd>> l(LLs);
-      Vector<iSinglet<Simd>> d(LLs);
-      Vector<iSinglet<Simd>> s(LLs);
-
-      assert(Ls/LLs == nsimd);
-      assert(phi.checkerboard == psi.checkerboard);
-
-      chi.checkerboard = psi.checkerboard;
-
-      // just directly address via type pun: the Simd-typed vectors are filled through scalar_type pointers
-      typedef typename Simd::scalar_type scalar_type;
-      scalar_type* u_p = (scalar_type*) &u[0];
-      scalar_type* l_p = (scalar_type*) &l[0];
-      scalar_type* d_p = (scalar_type*) &d[0];
-      scalar_type* s_p = (scalar_type*) &s[0];
-
-      for(int o=0; o<LLs; o++){ // outer
-      for(int i=0; i<nsimd; i++){ //inner
-        int s  = o + i*LLs; // NOTE: this int shadows the Vector s above inside the loop body; s_p was taken before the loop
-        int ss = o*nsimd + i; // scalar slot written through the punned pointers
-        u_p[ss] = upper[s];
-        l_p[ss] = lower[s];
-        d_p[ss] = diag[s];
-        s_p[ss] = shift_coeffs[s];
-      }}
-
-      this->M5Dcalls++;
-      this->M5Dtime -= usecond();
-
-      parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
-        // hs_*: source of the shift term, loaded once per block and reused (and rotated in place) inside the v loop
-        int vs     = (this->pm == 1) ? LLs-1 : 0;
-        Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(0)(0) : psi[ss+vs]()(2)(0);
-        Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(0)(1) : psi[ss+vs]()(2)(1);
-        Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(0)(2) : psi[ss+vs]()(2)(2);
-        Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(1)(0) : psi[ss+vs]()(3)(0);
-        Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(1)(1) : psi[ss+vs]()(3)(1);
-        Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(1)(2) : psi[ss+vs]()(3)(2);
-
-        for(int v=0; v<LLs; v++){
-
-          vprefetch(psi[ss+v+LLs]); // NOTE(review): in the last block ss+v+LLs reaches past oSites(); presumably harmless for a prefetch -- confirm
-
-          int vp = (v == LLs-1) ? 0     : v+1;
-          int vm = (v == 0    ) ? LLs-1 : v-1;
-
-          Simd hp_00 = psi[ss+vp]()(0)(0);
-          Simd hp_01 = psi[ss+vp]()(0)(1);
-          Simd hp_02 = psi[ss+vp]()(0)(2);
-          Simd hp_10 = psi[ss+vp]()(1)(0);
-          Simd hp_11 = psi[ss+vp]()(1)(1);
-          Simd hp_12 = psi[ss+vp]()(1)(2);
-
-          Simd hm_00 = psi[ss+vm]()(2)(0);
-          Simd hm_01 = psi[ss+vm]()(2)(1);
-          Simd hm_02 = psi[ss+vm]()(2)(2);
-          Simd hm_10 = psi[ss+vm]()(3)(0);
-          Simd hm_11 = psi[ss+vm]()(3)(1);
-          Simd hm_12 = psi[ss+vm]()(3)(2);
-
-          if (vp <= v){
-            hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-            hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-            hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-            hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-            hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-            hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-          }
-          // pm == 1: vs == LLs-1, so this fires only on the last v of the block; hs_* is overwritten with its rotated value
-          if(this->pm == 1 && vs <= v){
-            hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
-            hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
-            hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
-            hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
-            hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
-            hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
-          }
-
-          if(vm >= v){
-            hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-            hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-            hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-            hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-            hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-            hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-          }
-          // pm == -1: vs == 0, so this fires only at v == 0 and the rotated hs_* is then used for every v of the block -- NOTE(review): asymmetric with the pm == 1 case, confirm intended
-          if(this->pm == -1 && vs >= v){
-            hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
-            hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
-            hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
-            hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
-            hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
-            hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
-          }
-          // components (0..1)(c): d*phi + u*hp (+ s*hs when pm == 1); components (2..3)(c): d*phi + l*hm (+ s*hs when pm != 1)
-          Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
-          Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
-          Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
-          Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
-          Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
-          Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
-          Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
-          Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
-          Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
-          Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
-          Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
-          Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
-                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
-
-          vstream(chi[ss+v]()(0)(0), p_00);
-          vstream(chi[ss+v]()(0)(1), p_01);
-          vstream(chi[ss+v]()(0)(2), p_02);
-          vstream(chi[ss+v]()(1)(0), p_10);
-          vstream(chi[ss+v]()(1)(1), p_11);
-          vstream(chi[ss+v]()(1)(2), p_12);
-          vstream(chi[ss+v]()(2)(0), p_20);
-          vstream(chi[ss+v]()(2)(1), p_21);
-          vstream(chi[ss+v]()(2)(2), p_22);
-          vstream(chi[ss+v]()(3)(0), p_30);
-          vstream(chi[ss+v]()(3)(1), p_31);
-          vstream(chi[ss+v]()(3)(2), p_32);
-
-        }
-
-      }
-
-      this->M5Dtime += usecond();
-
-    #endif
-  }
-
-  #ifdef AVX512
-    #include<simd/Intel512common.h>
-    #include<simd/Intel512avx.h>
-    #include<simd/Intel512single.h>
-  #endif
-  /* MooeeInternalAsm: kernel for one block of LLs elements (block index `site`).
-   * For each output element s1 it accumulates, over every s2 in the block and every SIMD lane l,
-   *   components (0..1)(co) += Matp[LLs*s+s1] * (lane l of psi[s2+LLs*site] components (0..1)(co))
-   *   components (2..3)(co) += Matm[LLs*s+s1] * (lane l of psi[s2+LLs*site] components (2..3)(co))
-   * with s = s2 + l*LLs, and writes the result to chi[s1+LLs*site].
-   * Without AVX512 this is done with vbroadcast / real_madd / vstream; with AVX512 defined the
-   * same loop nest is expressed as inline asm on fixed zmm registers, using macros that are
-   * not defined in this chunk (presumably from the Intel512 headers included just above).
-   * NOTE(review): the asm statements list no register clobbers and keep the accumulators live in
-   * zmm1-zmm12 across separate asm statements -- confirm this is safe with the compilers in use.
-   */
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi, FermionField& chi,
-    int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-  {
-    #ifndef AVX512
-      {
-        SiteHalfSpinor BcastP;
-        SiteHalfSpinor BcastM;
-        SiteHalfSpinor SiteChiP;
-        SiteHalfSpinor SiteChiM;
-
-        // Ls*Ls * 2 * 12 * vol flops
-        for(int s1=0; s1<LLs; s1++){
-
-          for(int s2=0; s2<LLs; s2++){
-          for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
-
-            int s = s2 + l*LLs; // combined (s2, lane) index selecting the Matp/Matm entry
-            int lex = s2 + LLs*site; // input element inside this block
-
-            if( s2==0 && l==0 ){ // first term for this s1: reset both accumulators
-              SiteChiP=zero;
-              SiteChiM=zero;
-            }
-
-            for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
-            }}
-
-            for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
-            }}
-
-            for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
-              SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
-            }}
-          }}
-
-          {
-            int lex = s1 + LLs*site; // output element inside this block
-            for(int sp=0; sp<2;  sp++){
-            for(int co=0; co<Nc; co++){
-              vstream(chi[lex]()(sp)(co),   SiteChiP()(sp)(co));
-              vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
-            }}
-          }
-        }
-      }
-    #else
-      {
-        // pointers
-        //  MASK_REGS;
-        #define Chi_00 %%zmm1
-        #define Chi_01 %%zmm2
-        #define Chi_02 %%zmm3
-        #define Chi_10 %%zmm4
-        #define Chi_11 %%zmm5
-        #define Chi_12 %%zmm6
-        #define Chi_20 %%zmm7
-        #define Chi_21 %%zmm8
-        #define Chi_22 %%zmm9
-        #define Chi_30 %%zmm10
-        #define Chi_31 %%zmm11
-        #define Chi_32 %%zmm12
-
-        #define BCAST0  %%zmm13
-        #define BCAST1  %%zmm14
-        #define BCAST2  %%zmm15
-        #define BCAST3  %%zmm16
-        #define BCAST4  %%zmm17
-        #define BCAST5  %%zmm18
-        #define BCAST6  %%zmm19
-        #define BCAST7  %%zmm20
-        #define BCAST8  %%zmm21
-        #define BCAST9  %%zmm22
-        #define BCAST10 %%zmm23
-        #define BCAST11 %%zmm24
-
-        int incr = LLs*LLs*sizeof(iSinglet<Simd>); // byte distance of LLs*LLs matrix entries: from LLs*s2+s1, l steps reach Matp[LLs*(s2+l*LLs)+s1]
-
-        for(int s1=0; s1<LLs; s1++){
-
-          for(int s2=0; s2<LLs; s2++){
-
-            int lex = s2 + LLs*site;
-            uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
-            uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
-            uint64_t a2 = (uint64_t) &psi[lex];
-
-            for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
-
-              if((s2+l)==0) { // first (s2,l) pair uses the VMULMEM form, later pairs the VMADDMEM form (presumably initialise vs accumulate -- macros defined outside this chunk)
-                asm(
-                      VPREFETCH1(0,%2)              VPREFETCH1(0,%1)
-                      VPREFETCH1(12,%2)  	          VPREFETCH1(13,%2)
-                      VPREFETCH1(14,%2)  	          VPREFETCH1(15,%2)
-                      VBCASTCDUP(0,%2,BCAST0)
-                      VBCASTCDUP(1,%2,BCAST1)
-                      VBCASTCDUP(2,%2,BCAST2)
-                      VBCASTCDUP(3,%2,BCAST3)
-                      VBCASTCDUP(4,%2,BCAST4)       VMULMEM(0,%0,BCAST0,Chi_00)
-                      VBCASTCDUP(5,%2,BCAST5)       VMULMEM(0,%0,BCAST1,Chi_01)
-                      VBCASTCDUP(6,%2,BCAST6)       VMULMEM(0,%0,BCAST2,Chi_02)
-                      VBCASTCDUP(7,%2,BCAST7)       VMULMEM(0,%0,BCAST3,Chi_10)
-                      VBCASTCDUP(8,%2,BCAST8)       VMULMEM(0,%0,BCAST4,Chi_11)
-                      VBCASTCDUP(9,%2,BCAST9)       VMULMEM(0,%0,BCAST5,Chi_12)
-                      VBCASTCDUP(10,%2,BCAST10)     VMULMEM(0,%1,BCAST6,Chi_20)
-                      VBCASTCDUP(11,%2,BCAST11)     VMULMEM(0,%1,BCAST7,Chi_21)
-                      VMULMEM(0,%1,BCAST8,Chi_22)
-                      VMULMEM(0,%1,BCAST9,Chi_30)
-                      VMULMEM(0,%1,BCAST10,Chi_31)
-                      VMULMEM(0,%1,BCAST11,Chi_32)
-                      : : "r" (a0), "r" (a1), "r" (a2)                            );
-              } else {
-                asm(
-                      VBCASTCDUP(0,%2,BCAST0)   VMADDMEM(0,%0,BCAST0,Chi_00)
-                      VBCASTCDUP(1,%2,BCAST1)   VMADDMEM(0,%0,BCAST1,Chi_01)
-                      VBCASTCDUP(2,%2,BCAST2)   VMADDMEM(0,%0,BCAST2,Chi_02)
-                      VBCASTCDUP(3,%2,BCAST3)   VMADDMEM(0,%0,BCAST3,Chi_10)
-                      VBCASTCDUP(4,%2,BCAST4)   VMADDMEM(0,%0,BCAST4,Chi_11)
-                      VBCASTCDUP(5,%2,BCAST5)   VMADDMEM(0,%0,BCAST5,Chi_12)
-                      VBCASTCDUP(6,%2,BCAST6)   VMADDMEM(0,%1,BCAST6,Chi_20)
-                      VBCASTCDUP(7,%2,BCAST7)   VMADDMEM(0,%1,BCAST7,Chi_21)
-                      VBCASTCDUP(8,%2,BCAST8)   VMADDMEM(0,%1,BCAST8,Chi_22)
-                      VBCASTCDUP(9,%2,BCAST9)   VMADDMEM(0,%1,BCAST9,Chi_30)
-                      VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
-                      VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
-                      : : "r" (a0), "r" (a1), "r" (a2)                            );
-              }
-              // next lane: one LLs*LLs-entry step in Matp/Matm, one scalar_type step in the psi address
-              a0 = a0 + incr;
-              a1 = a1 + incr;
-              a2 = a2 + sizeof(typename Simd::scalar_type);
-            }
-          }
-
-          {
-            int lexa = s1+LLs*site; // output element inside this block
-            asm (
-               VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)
-               VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)
-               VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)
-               VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)
-               : : "r" ((uint64_t)&chi[lexa]) : "memory" );
-          }
-        }
-      }
-
-      #undef Chi_00
-      #undef Chi_01
-      #undef Chi_02
-      #undef Chi_10
-      #undef Chi_11
-      #undef Chi_12
-      #undef Chi_20
-      #undef Chi_21
-      #undef Chi_22
-      #undef Chi_30
-      #undef Chi_31
-      #undef Chi_32
-
-      #undef BCAST0
-      #undef BCAST1
-      #undef BCAST2
-      #undef BCAST3
-      #undef BCAST4
-      #undef BCAST5
-      #undef BCAST6
-      #undef BCAST7
-      #undef BCAST8
-      #undef BCAST9
-      #undef BCAST10
-      #undef BCAST11
-
-    #endif
-  };
-
-  // Z-mobius version: not implemented; every call prints an error to std::cout and ends the process with exit(-1). None of the arguments is used.
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
-    int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
-  {
-    std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
-    exit(-1);
-  };
-  /* MooeeInternal: runs the per-block kernel on psi for each of the oSites()/LLs blocks,
-   * writing chi (chi.checkerboard is copied from psi).
-   * Matrix choice:  inv && dag   uses this->MatpInvDag / this->MatmInvDag
-   *                 inv && !dag  uses this->MatpInv    / this->MatmInv
-   *                 !inv         fills the locals Matp / Matm through MooeeInternalCompute
-   * The three cases are exhaustive, so _Matp and _Matm are always set before use.
-   * When switcheroo<Coeff_t>::iscomplex() is true the blocks go to MooeeInternalZAsm, otherwise to
-   * MooeeInternalAsm, one call per block inside parallel_for.
-   * MooeeInvCalls / MooeeInvTime are updated on every call, including the !inv case.
-   */
-  template<class Impl>
-  void MobiusEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
-  {
-    int Ls  = this->Ls;
-    int LLs = psi._grid->_rdimensions[0];
-    int vol = psi._grid->oSites()/LLs; // number of LLs-sized blocks handed to the kernels
-
-    chi.checkerboard = psi.checkerboard;
-
-    Vector<iSinglet<Simd>>   Matp;
-    Vector<iSinglet<Simd>>   Matm;
-    Vector<iSinglet<Simd>>* _Matp;
-    Vector<iSinglet<Simd>>* _Matm;
-
-    //  MooeeInternalCompute(dag,inv,Matp,Matm);
-    if(inv && dag){
-      _Matp = &this->MatpInvDag;
-      _Matm = &this->MatmInvDag;
-    }
-
-    if(inv && (!dag)){
-      _Matp = &this->MatpInv;
-      _Matm = &this->MatmInv;
-    }
-
-    if(!inv){
-      MooeeInternalCompute(dag, inv, Matp, Matm);
-      _Matp = &Matp;
-      _Matm = &Matm;
-    }
-
-    assert(_Matp->size() == Ls*LLs);
-
-    this->MooeeInvCalls++;
-    this->MooeeInvTime -= usecond();
-
-    if(switcheroo<Coeff_t>::iscomplex()){
-      parallel_for(auto site=0; site<vol; site++){
-        MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
-      }
-    } else {
-      parallel_for(auto site=0; site<vol; site++){
-        MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
-      }
-    }
-
-    this->MooeeInvTime += usecond();
-  }
-  /* Explicit instantiations, compiled only when MOBIUS_EOFA_DPERP_VEC is defined: the
-   * INSTANTIATE_DPERP_MOBIUS_EOFA macro (defined outside this chunk) and MooeeInternal,
-   * each for the eight DomainWallVec5dImpl / ZDomainWallVec5dImpl variants D, F, DF, FH.
-   */
-  #ifdef MOBIUS_EOFA_DPERP_VEC
-
-    INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplD);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplD);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplF);
-
-    INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplDF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplFH);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplDF);
-    INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplFH);
-
-    template void MobiusEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-    template void MobiusEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-    template void MobiusEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
-
-  #endif
-
-}}
@@ -1,294 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/StaggeredKernels.cc
-
-Copyright (C) 2015
-
-Author: Azusa Yamaguchi, Peter Boyle
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-
-namespace Grid {
-namespace QCD {
-// Definitions of the StaggeredKernelsStatic selectors with their initial values (OptGeneric, CommsAndCompute).
-int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric;
-int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute;
-/* GENERIC_STENCIL_LEG(U,Dir,skew,multLink): one stencil leg, no local/remote filtering.
- * Looks up stencil entry Dir+skew for site sF and picks the operand:
- *   local and permuted : permute() of in._odata[SE->_offset] into the scratch spinor chi
- *   local, no permute  : in._odata[SE->_offset] directly
- *   not local          : buf[SE->_offset]
- * then calls multLink(Uchi, U._odata[sU], operand, Dir).
- * Uses SE, st, ptype, sF, sU, in, buf, chi, chi_p and Uchi from the expansion site.
- */
-#define GENERIC_STENCIL_LEG(U,Dir,skew,multLink)		\
-  SE = st.GetEntry(ptype, Dir+skew, sF);			\
-  if (SE->_is_local ) {						\
-    if (SE->_permute) {						\
-      chi_p = &chi;						\
-      permute(chi,  in._odata[SE->_offset], ptype);		\
-    } else {							\
-      chi_p = &in._odata[SE->_offset];				\
-    }								\
-  } else {							\
-    chi_p = &buf[SE->_offset];					\
-  }								\
-  multLink(Uchi, U._odata[sU], *chi_p, Dir);			
-/* GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink): one stencil leg, interior part only.
- * Operand selection for local entries is the same permute / direct choice on in._odata;
- * a non-local entry is read from buf[SE->_offset] only when st.same_node[Dir] is set.
- * multLink is called only if the entry is local or st.same_node[Dir] is set; otherwise
- * the leg contributes nothing.
- * Uses SE, st, ptype, sF, sU, in, buf, chi, chi_p and Uchi from the expansion site.
- */
-#define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink)		\
-  SE = st.GetEntry(ptype, Dir+skew, sF);			\
-  if (SE->_is_local ) {						\
-    if (SE->_permute) {						\
-      chi_p = &chi;						\
-      permute(chi,  in._odata[SE->_offset], ptype);		\
-    } else {							\
-      chi_p = &in._odata[SE->_offset];				\
-    }								\
-  } else if ( st.same_node[Dir] ) {				\
-    chi_p = &buf[SE->_offset];					\
-  }								\
-  if (SE->_is_local || st.same_node[Dir] ) {			\
-    multLink(Uchi, U._odata[sU], *chi_p, Dir);			\
-  }
-/* GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink): one stencil leg, exterior part only.
- * Acts only when the entry is not local and st.same_node[Dir] is not set: it increments
- * nmu, reads the operand from buf[SE->_offset] and calls multLink.
- * Uses SE, st, ptype, sF, sU, buf, chi_p, Uchi and nmu from the expansion site.
- */
-#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink)		\
-  SE = st.GetEntry(ptype, Dir+skew, sF);			\
-  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
-    nmu++;							\
-    chi_p = &buf[SE->_offset];					\
-    multLink(Uchi, U._odata[sU], *chi_p, Dir);			\
-  }
-// Constructor: passes the implementation parameters p straight to Base; no other state is set here.
-template <class Impl>
-StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
-/* DhopSiteGeneric: for each s in [0,LLs) evaluates all sixteen stencil legs at sF = LLs*sU+s
- * (eight with links from U and stencil offset 0, eight with links from UUU and stencil
- * offset 8), negates the sum when dag is non-zero and streams it to out._odata[sF].
- * Local and non-local legs are handled in the same pass. The lo argument is not used here.
- */
-////////////////////////////////////////////////////////////////////////////////////
-// Generic implementation; move to different file?
-// Int, Ext, Int+Ext cases for comms overlap
-////////////////////////////////////////////////////////////////////////////////////
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, 
-					     DoubledGaugeField &U, DoubledGaugeField &UUU,
-					     SiteSpinor *buf, int LLs, int sU, 
-					     const FermionField &in, FermionField &out, int dag) {
-  const SiteSpinor *chi_p; // operand of the current leg, set inside the macro
-  SiteSpinor chi; // scratch for permuted operands
-  SiteSpinor Uchi; // running result for the current sF
-  StencilEntry *SE;
-  int ptype;
-  int skew;
-
-  for(int s=0;s<LLs;s++){
-    int sF=LLs*sU+s;
-    skew = 0; // first eight legs: stencil entry Dir+0, links from U
-    GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink); // only leg using multLink and Uchi is not zeroed first; presumably multLink overwrites Uchi -- confirm in Impl
-    GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(U,Zp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(U,Tp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(U,Xm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
-    skew=8; // next eight legs: stencil entry Dir+8, links from UUU
-    GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(UUU,Zp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(UUU,Tp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(UUU,Xm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
-    if ( dag ) { 
-      Uchi = - Uchi; // dag != 0 flips the sign of the accumulated result
-    } 
-    vstream(out._odata[sF], Uchi); // out is overwritten, not accumulated into
-  }
-};
-/* DhopSiteGenericInt: for each s in [0,LLs) starts Uchi from zero and accumulates only the
- * legs that GENERIC_STENCIL_LEG_INT accepts (entry local, or st.same_node[Dir] set), eight
- * with links from U (stencil offset 0) and eight from UUU (stencil offset 8). The sum is
- * negated when dag is non-zero and streamed to out._odata[sF], overwriting it.
- * The lo argument is not used here.
- */
-  ///////////////////////////////////////////////////
-  // Only contributions from interior of our node
-  ///////////////////////////////////////////////////
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, 
-						DoubledGaugeField &U, DoubledGaugeField &UUU,
-						SiteSpinor *buf, int LLs, int sU, 
-						const FermionField &in, FermionField &out,int dag) {
-  const SiteSpinor *chi_p; // operand of the current leg, set inside the macro
-  SiteSpinor chi; // scratch for permuted operands
-  SiteSpinor Uchi; // running result for the current sF
-  StencilEntry *SE;
-  int ptype;
-  int skew ;
-
-  for(int s=0;s<LLs;s++){
-    int sF=LLs*sU+s;
-    skew = 0; // first eight legs: stencil entry Dir+0, links from U
-    Uchi=zero; // every leg below uses multLinkAdd, so the sum starts from zero
-    GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(U,Yp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(U,Zp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(U,Tp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(U,Xm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
-    skew=8; // next eight legs: stencil entry Dir+8, links from UUU
-    GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(UUU,Zp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(UUU,Tp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(UUU,Xm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
-    if ( dag ) {
-      Uchi = - Uchi; // dag != 0 flips the sign of the accumulated result
-    }
-    vstream(out._odata[sF], Uchi);
-  }
-};
-
-/* DhopSiteGenericExt: for each s in [0,LLs) starts Uchi from zero and accumulates only the
- * legs that GENERIC_STENCIL_LEG_EXT accepts (entry not local and st.same_node[Dir] not set).
- * If nmu is non-zero the sum is then added to out._odata[sF] (subtracted when dag is
- * non-zero); out is read-modify-written, so it presumably already holds the interior
- * result -- confirm against the caller. The lo and in arguments are not used here.
- */
-  ///////////////////////////////////////////////////
-  // Only contributions from exterior of our node
-  ///////////////////////////////////////////////////
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, 
-						DoubledGaugeField &U, DoubledGaugeField &UUU,
-						SiteSpinor *buf, int LLs, int sU,
-						const FermionField &in, FermionField &out,int dag) {
-  const SiteSpinor *chi_p; // operand of the current leg, set inside the macro
-  SiteSpinor chi;
-  SiteSpinor Uchi; // running result for the current sF
-  StencilEntry *SE;
-  int ptype;
-  int nmu=0; // counts accepted legs; NOTE: never reset per s, so once non-zero every later s also updates out (with a Uchi that was zeroed for that s)
-  int skew ;
-
-  for(int s=0;s<LLs;s++){
-    int sF=LLs*sU+s;
-    skew = 0; // first eight legs: stencil entry Dir+0, links from U
-    Uchi=zero;
-    GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(U,Yp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(U,Zp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(U,Tp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(U,Xm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
-    skew=8; // next eight legs: stencil entry Dir+8, links from UUU
-    GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(UUU,Zp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(UUU,Tp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(UUU,Xm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
-    // fold the exterior sum into what out already holds; the sign follows dag
-    if ( nmu ) { 
-      if ( dag ) { 
-	out._odata[sF] = out._odata[sF] - Uchi;
-      } else { 
-	out._odata[sF] = out._odata[sF] + Uchi;
-      }
-    }
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////
-// Driving / wrapping routine to select right kernel
-////////////////////////////////////////////////////////////////////////////////////
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo,
-					 DoubledGaugeField &U, DoubledGaugeField &UUU,
-					 SiteSpinor *buf, int LLs, int sU,
-					 const FermionField &in, FermionField &out,
-					 int interior, int exterior)
-{
-  // Daggered entry point: forwards every argument to the general dispatcher
-  // with the dag flag set to 1.
-  const int daggered = 1;
-  DhopSite(st, lo, U, UUU, buf, LLs, sU, in, out, daggered, interior, exterior);
-}
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo,
-				      DoubledGaugeField &U, DoubledGaugeField &UUU,
-				      SiteSpinor *buf, int LLs, int sU,
-				      const FermionField &in, FermionField &out,
-				      int interior, int exterior)
-{
-  // Non-daggered entry point: forwards every argument to the general
-  // dispatcher with the dag flag set to 0.
-  const int notDaggered = 0;
-  DhopSite(st, lo, U, UUU, buf, LLs, sU, in, out, notDaggered, interior, exterior);
-}
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo,
-				      DoubledGaugeField &U, DoubledGaugeField &UUU,
-				      SiteSpinor *buf, int LLs, int sU,
-				      const FermionField &in, FermionField &out,
-				      int dag, int interior, int exterior)
-{
-  // General dispatcher.  The static Opt value picks the kernel family; the
-  // interior/exterior flags pick the variant inside that family:
-  //   interior && exterior -> full kernel
-  //   interior only        -> *Int kernel
-  //   exterior only        -> *Ext kernel
-  //   neither              -> nothing is called
-  // The assembly kernel exists only as a full kernel, so any other
-  // combination is reported as an error.
-  switch (Opt) {
-#ifdef AVX512
-  case OptInlineAsm:
-    if ( !(interior && exterior) ) {
-      std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<<std::endl;
-      assert(0);
-    } else {
-      DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    }
-    break;
-#endif
-  case OptHandUnroll:
-    if ( interior ) {
-      if ( exterior ) {
-	DhopSiteHand   (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-      } else {
-	DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-      }
-    } else if ( exterior ) {
-      DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    }
-    break;
-  case OptGeneric:
-    if ( interior ) {
-      if ( exterior ) {
-	DhopSiteGeneric   (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-      } else {
-	DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-      }
-    } else if ( exterior ) {
-      DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    }
-    break;
-  default:
-    // Unknown Opt value: report it and abort in debug builds.
-    std::cout<<"Oops Opt = "<<Opt<<std::endl;
-    assert(0);
-    break;
-  }
-}
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopDir(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU,
-				     SiteSpinor *buf, int sF, int sU,
-				     const FermionField &in, FermionField &out,
-				     int dir, int disp)
-{
-  // Not implemented: the body is an unconditional assert.
-  // Notes carried over from the original author:
-  //   - disp should be one of +1, -1, +3, -3;
-  //   - how "dag" should be handled is an open question, because the
-  //     intended use is working out pU . dS/dU.
-  assert(0);
-}
-
-FermOpStaggeredTemplateInstantiate(StaggeredKernels);
-FermOpStaggeredVec5dTemplateInstantiate(StaggeredKernels);
-
-}}
-
@@ -1,122 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/StaggeredKernels.h
-
-Copyright (C) 2015
-
-Author: Azusa Yamaguchi, Peter Boyle
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef GRID_QCD_STAGGERED_KERNELS_H
-#define GRID_QCD_STAGGERED_KERNELS_H
-
-namespace Grid {
-namespace QCD {
-
-  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Helper routines that implement Staggered stencil for a single site.
-  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Non-template holder for settings shared by every StaggeredKernels<Impl>
-// instantiation (StaggeredKernels derives from this class).
-class StaggeredKernelsStatic { 
- public:
-  // Kernel family selected by StaggeredKernels::DhopSite via its switch on Opt.
-  enum { OptGeneric, OptHandUnroll, OptInlineAsm };
-  // Communication strategy labels; their consumer is not visible in this file.
-  enum { CommsAndCompute, CommsThenCompute };
-  static int Opt;    // one of the Opt* values above
-  static int Comms;  // presumably one of the Comms* values above - confirm at the use site
-};
- 
-// Single-site staggered stencil kernels.  DhopSite / DhopSiteDag are the
-// entry points; they fan out to the generic, hand-unrolled or assembly
-// kernels declared below.
-template<class Impl>
-class StaggeredKernels : public FermionOperator<Impl>, public StaggeredKernelsStatic {
-public:
-
-  INHERIT_IMPL_TYPES(Impl);
-  typedef FermionOperator<Impl> Base;
-
-  StaggeredKernels(const ImplParams &p = ImplParams());
-
-  void DhopDir(StencilImpl &st,
-               DoubledGaugeField &U, DoubledGaugeField &UUU,
-               SiteSpinor *buf, int sF, int sU,
-               const FermionField &in, FermionField &out,
-               int dir, int disp);
-
-  // ---------------------------------------------------------------------
-  // Generic Nc kernels: full, interior-only and exterior-only variants
-  // ---------------------------------------------------------------------
-  void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
-                       DoubledGaugeField &U, DoubledGaugeField &UUU,
-                       SiteSpinor *buf, int LLs, int sU,
-                       const FermionField &in, FermionField &out, int dag);
-
-  void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
-                          DoubledGaugeField &U, DoubledGaugeField &UUU,
-                          SiteSpinor *buf, int LLs, int sU,
-                          const FermionField &in, FermionField &out, int dag);
-
-  void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
-                          DoubledGaugeField &U, DoubledGaugeField &UUU,
-                          SiteSpinor *buf, int LLs, int sU,
-                          const FermionField &in, FermionField &out, int dag);
-
-  // ---------------------------------------------------------------------
-  // Nc=3 specific (hand-unrolled) kernels
-  // ---------------------------------------------------------------------
-  void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
-                    DoubledGaugeField &U, DoubledGaugeField &UUU,
-                    SiteSpinor *buf, int LLs, int sU,
-                    const FermionField &in, FermionField &out, int dag);
-
-  void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
-                       DoubledGaugeField &U, DoubledGaugeField &UUU,
-                       SiteSpinor *buf, int LLs, int sU,
-                       const FermionField &in, FermionField &out, int dag);
-
-  void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
-                       DoubledGaugeField &U, DoubledGaugeField &UUU,
-                       SiteSpinor *buf, int LLs, int sU,
-                       const FermionField &in, FermionField &out, int dag);
-
-  // ---------------------------------------------------------------------
-  // Asm Nc=3 specific kernel
-  // ---------------------------------------------------------------------
-  void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
-                   DoubledGaugeField &U, DoubledGaugeField &UUU,
-                   SiteSpinor *buf, int LLs, int sU,
-                   const FermionField &in, FermionField &out, int dag);
-
-  // ---------------------------------------------------------------------
-  // Generic interface; fans out to the right routine.  The first two
-  // overloads fix dag to 0 / 1 and forward to the third.
-  // ---------------------------------------------------------------------
-  void DhopSite(StencilImpl &st, LebesgueOrder &lo,
-                DoubledGaugeField &U, DoubledGaugeField &UUU,
-                SiteSpinor *buf, int LLs, int sU,
-                const FermionField &in, FermionField &out,
-                int interior=1, int exterior=1);
-
-  void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo,
-                   DoubledGaugeField &U, DoubledGaugeField &UUU,
-                   SiteSpinor *buf, int LLs, int sU,
-                   const FermionField &in, FermionField &out,
-                   int interior=1, int exterior=1);
-
-  void DhopSite(StencilImpl &st, LebesgueOrder &lo,
-                DoubledGaugeField &U, DoubledGaugeField &UUU,
-                SiteSpinor *buf, int LLs, int sU,
-                const FermionField &in, FermionField &out,
-                int dag, int interior, int exterior);
-
-};
-    
-}}
-
-#endif
@@ -1,399 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/StaggeredKernelsHand.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid.h>
-
-
-// Load the three components of the site b[offset] into Chi_0, Chi_1, Chi_2.
-// Relies on `offset` and Chi_0..Chi_2 existing in the enclosing scope, and
-// declares a local reference named `ref`, so every expansion needs its own
-// brace scope.
-#define LOAD_CHI(b)		\
-  const SiteSpinor & ref (b[offset]);	\
-    Chi_0=ref()()(0);\
-    Chi_1=ref()()(1);\
-    Chi_2=ref()()(2);
-
-
-// To splat or not to splat depends on the implementation
-//
-// MULT(A,UChi): load the nine elements of the link for direction A at site
-// sU into U_00..U_22 through Impl::loadLinkElement, then OVERWRITE
-// UChi_0..UChi_2 with the 3x3 matrix times the vector (Chi_0,Chi_1,Chi_2).
-// The gauge field is the identifier `U` taken from the enclosing scope (it is
-// not a macro parameter here).  Declares a local named `ref`, so each
-// expansion needs its own brace scope.
-#define MULT(A,UChi)				\
-  auto & ref(U._odata[sU](A));			\
-   Impl::loadLinkElement(U_00,ref()(0,0));      \
-   Impl::loadLinkElement(U_10,ref()(1,0));      \
-   Impl::loadLinkElement(U_20,ref()(2,0));      \
-   Impl::loadLinkElement(U_01,ref()(0,1));      \
-   Impl::loadLinkElement(U_11,ref()(1,1));      \
-   Impl::loadLinkElement(U_21,ref()(2,1));      \
-   Impl::loadLinkElement(U_02,ref()(0,2));     \
-   Impl::loadLinkElement(U_12,ref()(1,2));     \
-   Impl::loadLinkElement(U_22,ref()(2,2));     \
-    UChi ## _0  = U_00*Chi_0;	       \
-    UChi ## _1  = U_10*Chi_0;\
-    UChi ## _2  = U_20*Chi_0;\
-    UChi ## _0 += U_01*Chi_1;\
-    UChi ## _1 += U_11*Chi_1;\
-    UChi ## _2 += U_21*Chi_1;\
-    UChi ## _0 += U_02*Chi_2;\
-    UChi ## _1 += U_12*Chi_2;\
-    UChi ## _2 += U_22*Chi_2;
-
-// MULT_ADD(U,A,UChi): like MULT, but the gauge field is the macro parameter U
-// and the matrix-vector product is ACCUMULATED into UChi_0..UChi_2 (every
-// update is +=), so the accumulator must already hold a valid value.
-// Declares a local named `ref`, so each expansion needs its own brace scope.
-#define MULT_ADD(U,A,UChi)			\
-  auto & ref(U._odata[sU](A));			\
-   Impl::loadLinkElement(U_00,ref()(0,0));      \
-   Impl::loadLinkElement(U_10,ref()(1,0));      \
-   Impl::loadLinkElement(U_20,ref()(2,0));      \
-   Impl::loadLinkElement(U_01,ref()(0,1));      \
-   Impl::loadLinkElement(U_11,ref()(1,1));      \
-   Impl::loadLinkElement(U_21,ref()(2,1));      \
-   Impl::loadLinkElement(U_02,ref()(0,2));     \
-   Impl::loadLinkElement(U_12,ref()(1,2));     \
-   Impl::loadLinkElement(U_22,ref()(2,2));     \
-    UChi ## _0 += U_00*Chi_0;	       \
-    UChi ## _1 += U_10*Chi_0;\
-    UChi ## _2 += U_20*Chi_0;\
-    UChi ## _0 += U_01*Chi_1;\
-    UChi ## _1 += U_11*Chi_1;\
-    UChi ## _2 += U_21*Chi_1;\
-    UChi ## _0 += U_02*Chi_2;\
-    UChi ## _1 += U_12*Chi_2;\
-    UChi ## _2 += U_22*Chi_2;
-
-
-// Apply permute<dir> in place to each of Chi_0, Chi_1, Chi_2 (the function
-// name is formed by token-pasting `permute` with the dir argument).
-#define PERMUTE_DIR(dir)			\
-  permute##dir(Chi_0,Chi_0);			\
-  permute##dir(Chi_1,Chi_1);			\
-  permute##dir(Chi_2,Chi_2);
-
-
-// Common front half of a stencil leg: look up stencil entry Dir+skew for site
-// sF, cache its offset / is-local / permute fields, and load Chi either from
-// in._odata (local entry, permuted with PERMUTE_DIR(Perm) when the entry asks
-// for it) or from buf (non-local entry).  Uses SE, ptype, offset, local and
-// perm from the enclosing scope.
-#define HAND_STENCIL_LEG_BASE(Dir,Perm,skew)	\
-  SE=st.GetEntry(ptype,Dir+skew,sF);	\
-  offset = SE->_offset;			\
-  local  = SE->_is_local;		\
-  perm   = SE->_permute;		\
-  if ( local ) {						\
-    LOAD_CHI(in._odata);					\
-    if ( perm) {						\
-      PERMUTE_DIR(Perm);					\
-    }								\
-  } else {							\
-    LOAD_CHI(buf);						\
-  }								
-
-// First leg for an accumulator: load Chi, then MULT, which overwrites
-// even_* / odd_* rather than adding to it.  MULT reads the gauge field named
-// `U` from the enclosing scope.
-#define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even)		\
-  HAND_STENCIL_LEG_BASE(Dir,Perm,skew)				\
-  {								\
-    MULT(Dir,even);						\
-  }
-
-// Subsequent leg: load Chi, then MULT_ADD with gauge field U, accumulating
-// into the chosen even_* / odd_* accumulator.
-#define HAND_STENCIL_LEG(U,Dir,Perm,skew,even)			\
-  HAND_STENCIL_LEG_BASE(Dir,Perm,skew)				\
-  {								\
-    MULT_ADD(U,Dir,even);					\
-  }
-
-
-
-// Interior-only leg.  Chi is loaded from in._odata when the stencil entry is
-// local (permuted if the entry asks for it), or from buf when
-// st.same_node[Dir] is set; only in those two cases is the product
-// accumulated with MULT_ADD.  A leg that is neither local nor same-node
-// contributes nothing here.  Note that same_node is indexed by Dir, not by
-// Dir+skew.
-#define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even)	\
-  SE=st.GetEntry(ptype,Dir+skew,sF);			\
-  offset = SE->_offset;					\
-  local  = SE->_is_local;				\
-  perm   = SE->_permute;				\
-  if ( local ) {					\
-    LOAD_CHI(in._odata);				\
-    if ( perm) {					\
-      PERMUTE_DIR(Perm);				\
-    }							\
-  } else if ( st.same_node[Dir] ) {			\
-    LOAD_CHI(buf);					\
-  }							\
-  if (SE->_is_local || st.same_node[Dir] ) {		\
-    MULT_ADD(U,Dir,even);				\
-  }
-
-// Exterior-only leg, the complement of HAND_STENCIL_LEG_INT.  When the
-// stencil entry is not local and st.same_node[Dir] is not set, nmu is
-// incremented, Chi is loaded from buf and the product is accumulated with
-// MULT_ADD.  `local` and `perm` are assigned but not read by this macro.
-#define HAND_STENCIL_LEG_EXT(U,Dir,Perm,skew,even)	\
-  SE=st.GetEntry(ptype,Dir+skew,sF);			\
-  offset = SE->_offset;					\
-  local  = SE->_is_local;				\
-  perm   = SE->_permute;				\
-  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
-    nmu++;							\
-    { LOAD_CHI(buf);	  }					\
-    { MULT_ADD(U,Dir,even); }					\
-  }								
-
-namespace Grid {
-namespace QCD {
-
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, 
-					  DoubledGaugeField &U,DoubledGaugeField &UUU,
-					  SiteSpinor *buf, int LLs, int sU, 
-					  const FermionField &in, FermionField &out,int dag) 
-{
-  // Hand-unrolled full kernel (interior and exterior legs together).
-  //
-  // For every s in [0,LLs) the site index is sF = s + LLs*sU.  Eight legs
-  // with skew 0 use U and eight legs with skew 8 use UUU.  Legs alternate
-  // between the `even` and `odd` accumulators; the first leg of each
-  // accumulator uses HAND_STENCIL_LEG_BEGIN, which overwrites it, so no
-  // explicit zeroing is needed.  The sum (negated when dag is set) is
-  // streamed to out._odata[sF].
-  //
-  // Every local below is referenced by name from the LOAD_CHI / MULT /
-  // MULT_ADD / HAND_STENCIL_LEG* macros; do not rename them.
-
-  // Two independent accumulators, three components each.
-  Simd even_0, even_1, even_2;
-  Simd odd_0,  odd_1,  odd_2;
-
-  // Three components of the neighbour site currently being processed.
-  Simd Chi_0, Chi_1, Chi_2;
-
-  // The nine elements of the current 3x3 link, U_rc = element (r,c).
-  Simd U_00, U_10, U_20;
-  Simd U_01, U_11, U_21;
-  Simd U_02, U_12, U_22;
-
-  SiteSpinor result;
-  int offset,local,perm, ptype;
-
-  StencilEntry *SE;
-  int skew;
-
-  for(int s=0;s<LLs;s++){
-    int sF=s+LLs*sU;
-
-    skew = 0;
-    HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);
-    HAND_STENCIL_LEG_BEGIN(Yp,2,skew,odd);
-    HAND_STENCIL_LEG      (U,Zp,1,skew,even);
-    HAND_STENCIL_LEG      (U,Tp,0,skew,odd);
-    HAND_STENCIL_LEG      (U,Xm,3,skew,even);
-    HAND_STENCIL_LEG      (U,Ym,2,skew,odd);
-    HAND_STENCIL_LEG      (U,Zm,1,skew,even);
-    HAND_STENCIL_LEG      (U,Tm,0,skew,odd);
-    skew = 8;
-    HAND_STENCIL_LEG(UUU,Xp,3,skew,even);
-    HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);
-    HAND_STENCIL_LEG(UUU,Zp,1,skew,even);
-    HAND_STENCIL_LEG(UUU,Tp,0,skew,odd);
-    HAND_STENCIL_LEG(UUU,Xm,3,skew,even);
-    HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);
-    HAND_STENCIL_LEG(UUU,Zm,1,skew,even);
-    HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);
-
-    if ( dag ) {
-      result()()(0) = - even_0 - odd_0;
-      result()()(1) = - even_1 - odd_1;
-      result()()(2) = - even_2 - odd_2;
-    } else { 
-      result()()(0) = even_0 + odd_0;
-      result()()(1) = even_1 + odd_1;
-      result()()(2) = even_2 + odd_2;
-    }
-    vstream(out._odata[sF],result);
-  }
-}
-
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, 
-					     DoubledGaugeField &U, DoubledGaugeField &UUU,
-					     SiteSpinor *buf, int LLs, int sU, 
-					     const FermionField &in, FermionField &out,int dag) 
-{
-  // Hand-unrolled interior-only kernel.
-  //
-  // For every s in [0,LLs) the site index is sF = s + LLs*sU.  The
-  // accumulators are zeroed, then sixteen HAND_STENCIL_LEG_INT legs (eight
-  // with skew 0 on U, eight with skew 8 on UUU) add the contributions that
-  // are local or on the same node.  The sum (negated when dag is set) is
-  // streamed to out._odata[sF], overwriting whatever was there.
-  //
-  // Every local below is referenced by name from the LOAD_CHI / MULT_ADD /
-  // HAND_STENCIL_LEG_INT macros; do not rename them.
-
-  // Two independent accumulators, three components each.
-  Simd even_0, even_1, even_2;
-  Simd odd_0,  odd_1,  odd_2;
-
-  // Three components of the neighbour site currently being processed.
-  Simd Chi_0, Chi_1, Chi_2;
-
-  // The nine elements of the current 3x3 link, U_rc = element (r,c).
-  Simd U_00, U_10, U_20;
-  Simd U_01, U_11, U_21;
-  Simd U_02, U_12, U_22;
-
-  SiteSpinor result;
-  int offset,local,perm, ptype;
-
-  StencilEntry *SE;
-  int skew;
-
-  for(int s=0;s<LLs;s++){
-    int sF=s+LLs*sU;
-
-    // Every leg accumulates (+=), so start from zero.
-    even_0 = zero;    even_1 = zero;    even_2 = zero;
-     odd_0 = zero;     odd_1 = zero;     odd_2 = zero;
-
-    skew = 0;
-    HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);
-    HAND_STENCIL_LEG_INT(U,Yp,2,skew,odd);
-    HAND_STENCIL_LEG_INT(U,Zp,1,skew,even);
-    HAND_STENCIL_LEG_INT(U,Tp,0,skew,odd);
-    HAND_STENCIL_LEG_INT(U,Xm,3,skew,even);
-    HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);
-    HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);
-    HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);
-    skew = 8;
-    HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);
-    HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);
-    HAND_STENCIL_LEG_INT(UUU,Zp,1,skew,even);
-    HAND_STENCIL_LEG_INT(UUU,Tp,0,skew,odd);
-    HAND_STENCIL_LEG_INT(UUU,Xm,3,skew,even);
-    HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);
-    HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);
-    HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);
-
-    // Assume every site must be connected to at least one interior point. No 1^4 subvols.
-    if ( dag ) {
-      result()()(0) = - even_0 - odd_0;
-      result()()(1) = - even_1 - odd_1;
-      result()()(2) = - even_2 - odd_2;
-    } else { 
-      result()()(0) = even_0 + odd_0;
-      result()()(1) = even_1 + odd_1;
-      result()()(2) = even_2 + odd_2;
-    }
-    vstream(out._odata[sF],result);
-  }
-}
-
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, 
-					     DoubledGaugeField &U, DoubledGaugeField &UUU,
-					     SiteSpinor *buf, int LLs, int sU, 
-					     const FermionField &in, FermionField &out,int dag) 
-{
-  // Hand-unrolled exterior-only kernel.
-  //
-  // For every s in [0,LLs) the site index is sF = s + LLs*sU.  The
-  // accumulators and the leg counter nmu are reset, then sixteen
-  // HAND_STENCIL_LEG_EXT legs (eight with skew 0 on U, eight with skew 8 on
-  // UUU) add the contributions that are neither local nor on the same node.
-  // If at least one leg contributed, the sum (negated when dag is set) is
-  // ADDED to the value already in out._odata[sF].
-  //
-  // Every local below is referenced by name from the LOAD_CHI / MULT_ADD /
-  // HAND_STENCIL_LEG_EXT macros; do not rename them.
-
-  // Two independent accumulators, three components each.
-  Simd even_0, even_1, even_2;
-  Simd odd_0,  odd_1,  odd_2;
-
-  // Three components of the neighbour site currently being processed.
-  Simd Chi_0, Chi_1, Chi_2;
-
-  // The nine elements of the current 3x3 link, U_rc = element (r,c).
-  Simd U_00, U_10, U_20;
-  Simd U_01, U_11, U_21;
-  Simd U_02, U_12, U_22;
-
-  SiteSpinor result;
-  int offset,local,perm, ptype;
-
-  StencilEntry *SE;
-  int skew;
-
-  for(int s=0;s<LLs;s++){
-    int sF=s+LLs*sU;
-
-    // Every leg accumulates (+=), so start from zero.
-    even_0 = zero;    even_1 = zero;    even_2 = zero;
-     odd_0 = zero;     odd_1 = zero;     odd_2 = zero;
-    int nmu=0;  // number of exterior legs that contributed at this site
-    skew = 0;
-    HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);
-    HAND_STENCIL_LEG_EXT(U,Yp,2,skew,odd);
-    HAND_STENCIL_LEG_EXT(U,Zp,1,skew,even);
-    HAND_STENCIL_LEG_EXT(U,Tp,0,skew,odd);
-    HAND_STENCIL_LEG_EXT(U,Xm,3,skew,even);
-    HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);
-    HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);
-    HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);
-    skew = 8;
-    HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);
-    HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);
-    HAND_STENCIL_LEG_EXT(UUU,Zp,1,skew,even);
-    HAND_STENCIL_LEG_EXT(UUU,Tp,0,skew,odd);
-    HAND_STENCIL_LEG_EXT(UUU,Xm,3,skew,even);
-    HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);
-    HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);
-    HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);
-
-    // Add sum of all exterior connected stencil legs
-    if ( nmu ) { 
-      if ( dag ) {
-	result()()(0) = - even_0 - odd_0;
-	result()()(1) = - even_1 - odd_1;
-	result()()(2) = - even_2 - odd_2;
-      } else { 
-	result()()(0) = even_0 + odd_0;
-	result()()(1) = even_1 + odd_1;
-	result()()(2) = even_2 + odd_2;
-      }
-      out._odata[sF] = out._odata[sF] + result;
-    }
-  }
-}
-
-
-// Explicitly instantiate the three hand-unrolled kernels (full, interior-only,
-// exterior-only) of StaggeredKernels for one implementation type IMPL.
-#define DHOP_SITE_HAND_INSTANTIATE(IMPL)					\
-  template void StaggeredKernels<IMPL>::DhopSiteHand(				\
-      StencilImpl &st, LebesgueOrder &lo,					\
-      DoubledGaugeField &U, DoubledGaugeField &UUU,				\
-      SiteSpinor *buf, int LLs, int sU,						\
-      const FermionField &in, FermionField &out, int dag);			\
-										\
-  template void StaggeredKernels<IMPL>::DhopSiteHandInt(			\
-      StencilImpl &st, LebesgueOrder &lo,					\
-      DoubledGaugeField &U, DoubledGaugeField &UUU,				\
-      SiteSpinor *buf, int LLs, int sU,						\
-      const FermionField &in, FermionField &out, int dag);			\
-										\
-  template void StaggeredKernels<IMPL>::DhopSiteHandExt(			\
-      StencilImpl &st, LebesgueOrder &lo,					\
-      DoubledGaugeField &U, DoubledGaugeField &UUU,				\
-      SiteSpinor *buf, int LLs, int sU,						\
-      const FermionField &in, FermionField &out, int dag);
-
-DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD);
-DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF);
-DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD);
-DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF);
-
-
-}
-}
-
@@ -1,243 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
-
-    Copyright (C) 2017
-
-    Author: paboyle <paboyle@ph.ed.ac.uk>
-    Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-/*  END LEGAL */
-#include <Grid/Grid.h>
-//#include <Grid/Eigen/Dense>
-#include <Grid/qcd/spin/Dirac.h>
-
-namespace Grid
-{
-namespace QCD
-{
-
-// *NOT* EO
-template <class Impl>
-RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
-{
-  // out = Dhop(in) + Mooee(in); returns norm2 of the result.
-
-  // Hopping (Wilson) term, written straight into out.
-  out.checkerboard = in.checkerboard;
-  this->Dhop(in, out, DaggerNo);
-
-  // Clover term, computed into a scratch field on out's grid and added on.
-  FermionField clover(out._grid);
-  Mooee(in, clover);
-  out += clover;
-
-  return norm2(out);
-}
-
-template <class Impl>
-RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
-{
-  // out = Dhop^dag(in) + MooeeDag(in); returns norm2 of the result.
-
-  // Daggered hopping (Wilson) term, written straight into out.
-  out.checkerboard = in.checkerboard;
-  this->Dhop(in, out, DaggerYes);
-
-  // Daggered clover term, computed into a scratch field and added on.
-  FermionField clover(out._grid);
-  MooeeDag(in, clover);
-  out += clover;
-
-  return norm2(out);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
-{
-  // Import the gauge field into the Wilson base class, then build the clover
-  // term, its site-by-site inverse, and the even/odd (and adjoint) copies
-  // used by MooeeInternal.
-  WilsonFermion<Impl>::ImportGauge(_Umu);
-  GridBase *grid = _Umu._grid;
-  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
-
-  // Compute the field strength terms mu>nu
-  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
-  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
-  WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
-  WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
-  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
-  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
-
-  // Compute the Clover Operator acting on Colour and Spin
-  // multiply here by the clover coefficients for the anisotropy:
-  // csw_r on the three B components, csw_t on the three E components.
-  CloverTerm  = fillCloverYZ(Bx) * csw_r;
-  CloverTerm += fillCloverXZ(By) * csw_r;
-  CloverTerm += fillCloverXY(Bz) * csw_r;
-  CloverTerm += fillCloverXT(Ex) * csw_t;
-  CloverTerm += fillCloverYT(Ey) * csw_t;
-  CloverTerm += fillCloverZT(Ez) * csw_t;
-  CloverTerm += diag_mass;
-
-  int lvol = _Umu._grid->lSites();
-  int DimRep = Impl::Dimension;
-  const int dim = Ns * DimRep; // side of the flattened (spin x colour) matrix
-
-  // Scratch matrices reused for every site.  The copy-in loop below writes
-  // every entry of EigenCloverOp before it is read, and EigenInvCloverOp is
-  // assigned as a whole, so neither needs to be zero-filled.
-  Eigen::MatrixXcd EigenCloverOp(dim, dim);
-  Eigen::MatrixXcd EigenInvCloverOp(dim, dim);
-
-  std::vector<int> lcoor;
-  typename SiteCloverType::scalar_object Qx = zero, Qxinv = zero;
-
-  // Invert the clover term one local site at a time via a dense Eigen inverse.
-  for (int site = 0; site < lvol; site++)
-  {
-    grid->LocalIndexToLocalCoor(site, lcoor);
-    peekLocalSite(Qx, CloverTerm, lcoor);
-    Qxinv = zero;
-
-    // Flatten (spin j,k ; colour a,b) into row a + j*DimRep, column b + k*DimRep.
-    for (int j = 0; j < Ns; j++)
-      for (int k = 0; k < Ns; k++)
-        for (int a = 0; a < DimRep; a++)
-          for (int b = 0; b < DimRep; b++)
-            EigenCloverOp(a + j * DimRep, b + k * DimRep) = Qx()(j, k)(a, b);
-
-    EigenInvCloverOp = EigenCloverOp.inverse();
-
-    // Unflatten the inverse back into the site object, same index mapping.
-    for (int j = 0; j < Ns; j++)
-      for (int k = 0; k < Ns; k++)
-        for (int a = 0; a < DimRep; a++)
-          for (int b = 0; b < DimRep; b++)
-            Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
-
-    pokeLocalSite(Qxinv, CloverTermInv, lcoor);
-  }
-
-  // Separate the even and odd parts
-  pickCheckerboard(Even, CloverTermEven, CloverTerm);
-  pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
-
-  pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
-  pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
-
-  pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
-  pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
-
-  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
-  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
-{
-  // Clover term: no dagger, no inverse.
-  const int dagger  = DaggerNo;
-  const int inverse = InverseNo;
-  this->MooeeInternal(in, out, dagger, inverse);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
-{
-  // Clover term: dagger, no inverse.
-  const int dagger  = DaggerYes;
-  const int inverse = InverseNo;
-  this->MooeeInternal(in, out, dagger, inverse);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
-{
-  // Clover term: no dagger, inverse.
-  const int dagger  = DaggerNo;
-  const int inverse = InverseYes;
-  this->MooeeInternal(in, out, dagger, inverse);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
-{
-  // Clover term: dagger and inverse.
-  const int dagger  = DaggerYes;
-  const int inverse = InverseYes;
-  this->MooeeInternal(in, out, dagger, inverse);
-}
-
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
-{
-  // Apply the clover term (inv == 0) or its inverse (inv != 0) to `in`,
-  // optionally daggered (dag != 0), writing the result to `out`.
-  //  - Full (non-checkerboarded) grid: the full-volume field is used and the
-  //    adjoint is taken on the fly when dag is set.
-  //  - Checkerboarded grid: one of the eight half-volume fields built by
-  //    ImportGauge is picked from (dag, inv, parity of in).
-  out.checkerboard = in.checkerboard;
-  assert(in.checkerboard == Odd || in.checkerboard == Even);
-
-  if (!in._grid->_isCheckerBoarded)
-  {
-    CloverFieldType *full = (inv) ? &CloverTermInv : &CloverTerm;
-    if (dag)
-      out = adj(*full) * in;
-    else
-      out = *full * in;
-    return;
-  }
-
-  const bool onOdd = (in.checkerboard == Odd);
-  CloverFieldType *half;
-  if (dag)
-  {
-    if (onOdd)
-      half = (inv) ? &CloverTermInvDagOdd : &CloverTermDagOdd;
-    else
-      half = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
-  }
-  else
-  {
-    if (onOdd)
-      half = (inv) ? &CloverTermInvOdd : &CloverTermOdd;
-    else
-      half = (inv) ? &CloverTermInvEven : &CloverTermEven;
-  }
-  out = *half * in;
-} // MooeeInternal
-
-
-// Derivative of the clover (Moo) term: not implemented, asserts unconditionally.
-template <class Impl>
-void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat,
-                                         const FermionField &X,
-                                         const FermionField &Y,
-                                         int dag)
-{
-  assert(0);
-}
-
-// Derivative of the clover (Mee) term: not implemented yet, asserts unconditionally.
-template <class Impl>
-void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat,
-                                         const FermionField &U,
-                                         const FermionField &V,
-                                         int dag)
-{
-  assert(0); // not implemented yet
-}
-
-FermOpTemplateInstantiate(WilsonCloverFermion);
-AdjointFermOpTemplateInstantiate(WilsonCloverFermion);
-TwoIndexFermOpTemplateInstantiate(WilsonCloverFermion);
-//GparityFermOpTemplateInstantiate(WilsonCloverFermion);
-}
-}
@@ -1,366 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.h
-
-    Copyright (C) 2017
-
-    Author: Guido Cossu <guido.cossu@ed.ac.uk>
-    Author: David Preti <>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-/*  END LEGAL */
-
-#ifndef GRID_QCD_WILSON_CLOVER_FERMION_H
-#define GRID_QCD_WILSON_CLOVER_FERMION_H
-
-#include <Grid/Grid.h>
-
-namespace Grid
-{
-namespace QCD
-{
-
-///////////////////////////////////////////////////////////////////
-// Wilson Clover
-//
-// Operator ( with anisotropy coefficients):
-//
-// Q =   1 + (Nd-1)/xi_0 + m
-//     + W_t + (nu/xi_0) * W_s
-//     - 1/2*[ csw_t * sum_s (sigma_ts F_ts) + (csw_r/xi_0) * sum_ss (sigma_ss F_ss)  ]
-//
-// s spatial, t temporal directions.
-// where W_t and W_s are the temporal and spatial components of the
-// Wilson Dirac operator
-//
-// csw_r = csw_t to recover the isotropic version
-//////////////////////////////////////////////////////////////////
-
-template <class Impl>
-class WilsonCloverFermion : public WilsonFermion<Impl>
-{
-public:
-  // Types definitions
-  INHERIT_IMPL_TYPES(Impl);
-  template <typename vtype>
-  using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
-  typedef iImplClover<Simd> SiteCloverType;
-  typedef Lattice<SiteCloverType> CloverFieldType;
-
-public:
-  typedef WilsonFermion<Impl> WilsonBase;
-
-  virtual void Instantiatable(void){};
-  // Constructor: forwards gauge field, grids, mass and anisotropy to WilsonFermion; CloverTerm/CloverTermInv live on Fgrid, all Even/Odd variants on Hgrid
-  WilsonCloverFermion(GaugeField &_Umu, GridCartesian &Fgrid,
-                      GridRedBlackCartesian &Hgrid,
-                      const RealD _mass,
-                      const RealD _csw_r = 0.0,
-                      const RealD _csw_t = 0.0,
-                      const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(),
-                      const ImplParams &impl_p = ImplParams()) : WilsonFermion<Impl>(_Umu,
-                                                                                     Fgrid,
-                                                                                     Hgrid,
-                                                                                     _mass, impl_p, clover_anisotropy),
-                                                                 CloverTerm(&Fgrid),
-                                                                 CloverTermInv(&Fgrid),
-                                                                 CloverTermEven(&Hgrid),
-                                                                 CloverTermOdd(&Hgrid),
-                                                                 CloverTermInvEven(&Hgrid),
-                                                                 CloverTermInvOdd(&Hgrid),
-                                                                 CloverTermDagEven(&Hgrid),
-                                                                 CloverTermDagOdd(&Hgrid),
-                                                                 CloverTermInvDagEven(&Hgrid),
-                                                                 CloverTermInvDagOdd(&Hgrid)
-  {
-    assert(Nd == 4); // require 4 dimensions
-    // csw_r and csw_t are stored pre-multiplied by 0.5; the anisotropic case also divides csw_r by xi_0.
-    if (clover_anisotropy.isAnisotropic)
-    {
-      csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0;
-      diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
-    }
-    else
-    {
-      csw_r = _csw_r * 0.5;
-      diag_mass = 4.0 + _mass;
-    }
-    csw_t = _csw_t * 0.5;
-    // A zero coefficient is accepted but reported through GridLogWarning.
-    if (csw_r == 0)
-      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
-    if (csw_t == 0)
-      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
-    // Hand the gauge field to ImportGauge (declared below; its body is not in this header).
-    ImportGauge(_Umu);
-  }
-
-  virtual RealD M(const FermionField &in, FermionField &out);
-  virtual RealD Mdag(const FermionField &in, FermionField &out);
-
-  virtual void Mooee(const FermionField &in, FermionField &out);
-  virtual void MooeeDag(const FermionField &in, FermionField &out);
-  virtual void MooeeInv(const FermionField &in, FermionField &out);
-  virtual void MooeeInvDag(const FermionField &in, FermionField &out);
-  virtual void MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv);
-
-  //virtual void MDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
-  virtual void MooDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
-  virtual void MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
-
-  void ImportGauge(const GaugeField &_Umu);
-
-  // Derivative parts unpreconditioned pseudofermions.
-  // Sets `force` to the Wilson hopping-term derivative (DhopDeriv) plus the clover-term
-  // derivative assembled from X and Y below; `dag` is only forwarded to DhopDeriv.
-  void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
-  {
-    conformable(X._grid, Y._grid);
-    conformable(X._grid, force._grid);
-    GaugeLinkField force_mu(force._grid), lambda(force._grid);
-    GaugeField clover_force(force._grid);
-    PropagatorField Lambda(force._grid);
-
-    // Guido: Here we are hitting some performance issues:
-    // need to extract the components of the DoubledGaugeField
-    // for each call
-    // Possible solution
-    // Create a vector object to store them? (cons: wasting space)
-    std::vector<GaugeLinkField> U(Nd, this->Umu._grid);
-
-    Impl::extractLinkField(U, this->Umu);
-
-    force = zero;
-    // Derivative of the Wilson hopping term
-    this->DhopDeriv(force, X, Y, dag);
-
-    ///////////////////////////////////////////////////////////
-    // Clover term derivative
-    ///////////////////////////////////////////////////////////
-    // Lambda is filled by Impl::outerProductImpl from X and Y.
-    Impl::outerProductImpl(Lambda, X, Y);
-
-    Gamma::Algebra sigma[] = {
-        Gamma::Algebra::SigmaXY,
-        Gamma::Algebra::SigmaXZ,
-        Gamma::Algebra::SigmaXT,
-        Gamma::Algebra::MinusSigmaXY,
-        Gamma::Algebra::SigmaYZ,
-        Gamma::Algebra::SigmaYT,
-        Gamma::Algebra::MinusSigmaXZ,
-        Gamma::Algebra::MinusSigmaYZ,
-        Gamma::Algebra::SigmaZT,
-        Gamma::Algebra::MinusSigmaXT,
-        Gamma::Algebra::MinusSigmaYT,
-        Gamma::Algebra::MinusSigmaZT};
-
-    /*
-      sigma_{\mu \nu}=
-      | 0         sigma[0]  sigma[1]  sigma[2] |
-      | sigma[3]    0       sigma[4]  sigma[5] |
-      | sigma[6]  sigma[7]     0      sigma[8] |
-      | sigma[9]  sigma[10] sigma[11]   0      |
-    */
-
-    // `count` walks sigma[] in the row-major (mu, nu != mu) order of the table above.
-    int count = 0;
-    clover_force = zero;
-    for (int mu = 0; mu < Nd; mu++) // Nd == 4 is asserted in the constructor
-    {
-      force_mu = zero;
-      for (int nu = 0; nu < Nd; nu++)
-      {
-        if (mu == nu)
-          continue;
-
-        // Planes that contain the temporal direction (index Nd - 1, the T column/row of
-        // the table above) take csw_t; purely spatial planes take csw_r. Direction
-        // indices run 0..Nd-1, so the temporal index must be Nd - 1, never Nd.
-        const RealD factor = 2.0 * ((mu == Nd - 1 || nu == Nd - 1) ? csw_t : csw_r);
-
-        PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
-        Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
-        force_mu -= factor*Cmunu(U, lambda, mu, nu);                   // checked
-        count++;
-      }
-
-      // Multiply the accumulated mu-component by the link U[mu] and store it.
-      pokeLorentz(clover_force, U[mu] * force_mu, mu);
-    }
-    // csw_r / csw_t already entered per plane through `factor`; no global rescale here.
-    force += clover_force;
-  }
-
-  // Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
-  GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
-  {
-    conformable(lambda._grid, U[0]._grid);
-    GaugeLinkField out(lambda._grid), tmp(lambda._grid);
-    // insertion in upper staple
-    // please check redundancy of shift operations
-    // `out` sums four upper-staple terms (C1+..C4+) and then subtracts four lower-staple terms (C1-..C4-); lambda or adj(lambda) enters each term at a different place.
-    // C1+
-    tmp = lambda * U[nu];
-    out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
-
-    // C2+
-    tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
-    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
-
-    // C3+
-    tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
-    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
-
-    // C4+
-    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
-
-    // insertion in lower staple
-    // C1-
-    out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
-
-    // C2-
-    tmp = adj(lambda) * U[nu];
-    out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
-
-    // C3-
-    tmp = lambda * U[nu];
-    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
-
-    // C4-
-    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
-
-    return out;
-  }
-
-private:
-  // here fixing the 4 dimensions, make it more general?
-
-  RealD csw_r;                                               // Clover coefficient - spatial
-  RealD csw_t;                                               // Clover coefficient - temporal
-  RealD diag_mass;                                           // Mass term
-  CloverFieldType CloverTerm, CloverTermInv;                 // Clover term
-  CloverFieldType CloverTermEven, CloverTermOdd;             // Clover term EO
-  CloverFieldType CloverTermInvEven, CloverTermInvOdd;       // Clover term Inv EO
-  CloverFieldType CloverTermDagEven, CloverTermDagOdd;       // Clover term Dag EO
-  CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
-
-  // eventually these can be compressed into 6x6 blocks instead of the 12x12
-  // using the DeGrand-Rossi basis for the gamma matrices
-  // fillCloverYZ: returns a field on F's grid, zero except spin entries (0,1), (1,0), (2,3), (3,2), each set to timesMinusI(F).
-  CloverFieldType fillCloverYZ(const GaugeLinkField &F)
-  {
-    CloverFieldType T(F._grid);
-    T = zero;
-    PARALLEL_FOR_LOOP
-    for (int i = 0; i < T._grid->oSites(); i++) // bound comes from the field being filled, not from CloverTerm
-    {
-      T._odata[i]()(0, 1) = timesMinusI(F._odata[i]()());
-      T._odata[i]()(1, 0) = timesMinusI(F._odata[i]()());
-      T._odata[i]()(2, 3) = timesMinusI(F._odata[i]()());
-      T._odata[i]()(3, 2) = timesMinusI(F._odata[i]()());
-    }
-    return T;
-  }
-
-  // fillCloverXZ: returns a field on F's grid, zero except spin entries (0,1), (2,3) = -F and (1,0), (3,2) = F.
-  CloverFieldType fillCloverXZ(const GaugeLinkField &F)
-  {
-    CloverFieldType T(F._grid);
-    T = zero;
-    PARALLEL_FOR_LOOP
-    for (int i = 0; i < T._grid->oSites(); i++) // bound comes from the field being filled, not from CloverTerm
-    {
-      T._odata[i]()(0, 1) = -F._odata[i]()();
-      T._odata[i]()(1, 0) = F._odata[i]()();
-      T._odata[i]()(2, 3) = -F._odata[i]()();
-      T._odata[i]()(3, 2) = F._odata[i]()();
-    }
-    return T;
-  }
-
-  // fillCloverXY: returns a field on F's grid, zero except the spin diagonal: (0,0), (2,2) = timesMinusI(F) and (1,1), (3,3) = timesI(F).
-  CloverFieldType fillCloverXY(const GaugeLinkField &F)
-  {
-    CloverFieldType T(F._grid);
-    T = zero;
-    PARALLEL_FOR_LOOP
-    for (int i = 0; i < T._grid->oSites(); i++) // bound comes from the field being filled, not from CloverTerm
-    {
-      T._odata[i]()(0, 0) = timesMinusI(F._odata[i]()());
-      T._odata[i]()(1, 1) = timesI(F._odata[i]()());
-      T._odata[i]()(2, 2) = timesMinusI(F._odata[i]()());
-      T._odata[i]()(3, 3) = timesI(F._odata[i]()());
-    }
-
-    return T;
-  }
-
-  // fillCloverXT: returns a field on F's grid, zero except spin entries (0,1), (1,0) = timesI(F) and (2,3), (3,2) = timesMinusI(F).
-  CloverFieldType fillCloverXT(const GaugeLinkField &F)
-  {
-    CloverFieldType T(F._grid);
-    T = zero;
-    PARALLEL_FOR_LOOP
-    for (int i = 0; i < T._grid->oSites(); i++) // bound comes from the field being filled, not from CloverTerm
-    {
-      T._odata[i]()(0, 1) = timesI(F._odata[i]()());
-      T._odata[i]()(1, 0) = timesI(F._odata[i]()());
-      T._odata[i]()(2, 3) = timesMinusI(F._odata[i]()());
-      T._odata[i]()(3, 2) = timesMinusI(F._odata[i]()());
-    }
-    return T;
-  }
-
-  // fillCloverYT: returns a field on F's grid, zero except spin entries (0,1), (3,2) = -F and (1,0), (2,3) = F.
-  CloverFieldType fillCloverYT(const GaugeLinkField &F)
-  {
-    CloverFieldType T(F._grid);
-    T = zero;
-    PARALLEL_FOR_LOOP
-    for (int i = 0; i < T._grid->oSites(); i++) // bound comes from the field being filled, not from CloverTerm
-    {
-      T._odata[i]()(0, 1) = -(F._odata[i]()());
-      T._odata[i]()(1, 0) = (F._odata[i]()());
-      T._odata[i]()(2, 3) = (F._odata[i]()());
-      T._odata[i]()(3, 2) = -(F._odata[i]()());
-    }
-    return T;
-  }
-
-  // fillCloverZT: returns a field on F's grid, zero except the spin diagonal: (0,0), (3,3) = timesI(F) and (1,1), (2,2) = timesMinusI(F).
-  CloverFieldType fillCloverZT(const GaugeLinkField &F)
-  {
-    CloverFieldType T(F._grid);
-    T = zero;
-    PARALLEL_FOR_LOOP
-    for (int i = 0; i < T._grid->oSites(); i++) // bound comes from the field being filled, not from CloverTerm
-    {
-      T._odata[i]()(0, 0) = timesI(F._odata[i]()());
-      T._odata[i]()(1, 1) = timesMinusI(F._odata[i]()());
-      T._odata[i]()(2, 2) = timesMinusI(F._odata[i]()());
-      T._odata[i]()(3, 3) = timesI(F._odata[i]()());
-    }
-    return T;
-  }
-};
-}
-}
-
-#endif // GRID_QCD_WILSON_CLOVER_FERMION_H
@@ -1,878 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/WilsonKernelsHand.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-
-#define REGISTER
-
-#define LOAD_CHIMU_BODY(F)			\
-  Chimu_00=ref(F)(0)(0);			\
-  Chimu_01=ref(F)(0)(1);			\
-  Chimu_02=ref(F)(0)(2);			\
-  Chimu_10=ref(F)(1)(0);			\
-  Chimu_11=ref(F)(1)(1);			\
-  Chimu_12=ref(F)(1)(2);			\
-  Chimu_20=ref(F)(2)(0);			\
-  Chimu_21=ref(F)(2)(1);			\
-  Chimu_22=ref(F)(2)(2);			\
-  Chimu_30=ref(F)(3)(0);			\
-  Chimu_31=ref(F)(3)(1);			\
-  Chimu_32=ref(F)(3)(2)
-
-#define LOAD_CHIMU(DIR,F,PERM)						\
-  { const SiteSpinor & ref (in._odata[offset]); LOAD_CHIMU_BODY(F); }
-
-#define LOAD_CHI_BODY(F)				\
-    Chi_00 = ref(F)(0)(0);\
-    Chi_01 = ref(F)(0)(1);\
-    Chi_02 = ref(F)(0)(2);\
-    Chi_10 = ref(F)(1)(0);\
-    Chi_11 = ref(F)(1)(1);\
-    Chi_12 = ref(F)(1)(2)
-
-#define LOAD_CHI(DIR,F,PERM)					\
-  {const SiteHalfSpinor &ref(buf[offset]); LOAD_CHI_BODY(F); }
-
-
-//G-parity implementations using in-place intrinsic ops
-
-//1l 1h -> 1h 1l
-//0l 0h , 1h 1l -> 0l 1h 0h,1l
-//0h,1l -> 1l,0h
-//if( (distance == 1 && !perm_will_occur) || (distance == -1 && perm_will_occur) )
-//Pulled fermion through forwards face, GPBC on upper component
-//Need 0= 0l 1h   1= 1l 0h
-//else if( (distance == -1 && !perm) || (distance == 1 && perm) )
-//Pulled fermion through backwards face, GPBC on lower component
-//Need 0= 1l 0h   1= 0l 1h
-
-//1l 1h -> 1h 1l
-//0l 0h , 1h 1l -> 0l 1h 0h,1l
-#define DO_TWIST_0L_1H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3)			\
-  permute##PERM(tmp1, ref(1)(S)(C));				\
-  exchange##PERM(tmp2,tmp3, ref(0)(S)(C), tmp1);		\
-  INTO = tmp2;
-
-//0l 0h -> 0h 0l
-//1l 1h, 0h 0l -> 1l 0h, 1h 0l
-#define DO_TWIST_1L_0H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3)			\
-  permute##PERM(tmp1, ref(0)(S)(C));				\
-  exchange##PERM(tmp2,tmp3, ref(1)(S)(C), tmp1);		\
-  INTO = tmp2;
-
-
-
-
-#define LOAD_CHI_SETUP(DIR,F)						\
-  g = F;								\
-  direction = st._directions[DIR];				\
-  distance = st._distances[DIR];				\
-  sl = st._grid->_simd_layout[direction];			\
-  inplace_twist = 0;						\
-  if(SE->_around_the_world && this->Params.twists[DIR % 4]){		\
-    if(sl == 1){							\
-      g = (F+1) % 2;							\
-    }else{								\
-      inplace_twist = 1;						\
-    }									\
-  }  
-
-#define LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)			\
-  { const SiteSpinor &ref(in._odata[offset]);				\
-    LOAD_CHI_SETUP(DIR,F);						\
-    if(!inplace_twist){							\
-      LOAD_CHIMU_BODY(g);						\
-    }else{								\
-      if(  ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
-	   ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
-	DO_TWIST_0L_1H(Chimu_00,0,0,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_01,0,1,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_0L_1H(Chimu_02,0,2,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_10,1,0,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_0L_1H(Chimu_11,1,1,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_12,1,2,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_0L_1H(Chimu_20,2,0,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_21,2,1,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_0L_1H(Chimu_22,2,2,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_30,3,0,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_0L_1H(Chimu_31,3,1,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_32,3,2,F,PERM,  U_11,U_20,U_21);		\
-      }else{								\
-	DO_TWIST_1L_0H(Chimu_00,0,0,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_01,0,1,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_1L_0H(Chimu_02,0,2,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_10,1,0,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_1L_0H(Chimu_11,1,1,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_12,1,2,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_1L_0H(Chimu_20,2,0,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_21,2,1,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_1L_0H(Chimu_22,2,2,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_30,3,0,F,PERM,  U_11,U_20,U_21);		\
-	DO_TWIST_1L_0H(Chimu_31,3,1,F,PERM,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_32,3,2,F,PERM,  U_11,U_20,U_21);		\
-      } \
-    } \
-  }
-
-
-#define LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)				\
-  { const SiteHalfSpinor &ref(buf[offset]);				\
-    LOAD_CHI_SETUP(DIR,F);						\
-    if(!inplace_twist){							\
-      LOAD_CHI_BODY(g);							\
-    }else{								\
-      if(  ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
-	   ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
-	DO_TWIST_0L_1H(Chi_00,0,0,F,PERM,  U_00,U_01,U_10);			\
-	DO_TWIST_0L_1H(Chi_01,0,1,F,PERM,  U_11,U_20,U_21);			\
-	DO_TWIST_0L_1H(Chi_02,0,2,F,PERM,  UChi_00,UChi_01,UChi_02);		\
-	DO_TWIST_0L_1H(Chi_10,1,0,F,PERM,  UChi_10,UChi_11,UChi_12);		\
-	DO_TWIST_0L_1H(Chi_11,1,1,F,PERM,  U_00,U_01,U_10);			\
-	DO_TWIST_0L_1H(Chi_12,1,2,F,PERM,  U_11,U_20,U_21);			\
-      }else{								\
-	DO_TWIST_1L_0H(Chi_00,0,0,F,PERM,  U_00,U_01,U_10);			\
-	DO_TWIST_1L_0H(Chi_01,0,1,F,PERM,  U_11,U_20,U_21);			\
-	DO_TWIST_1L_0H(Chi_02,0,2,F,PERM,  UChi_00,UChi_01,UChi_02);		\
-	DO_TWIST_1L_0H(Chi_10,1,0,F,PERM,  UChi_10,UChi_11,UChi_12);		\
-	DO_TWIST_1L_0H(Chi_11,1,1,F,PERM,  U_00,U_01,U_10);			\
-	DO_TWIST_1L_0H(Chi_12,1,2,F,PERM,  U_11,U_20,U_21);			\
-      }									\
-    }									\
-  }
-
-
-#define LOAD_CHI_GPARITY(DIR,F,PERM) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)
-#define LOAD_CHIMU_GPARITY(DIR,F,PERM) LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)
-
-// To splat or not to splat depends on the implementation
-#define MULT_2SPIN_BODY \
-  Impl::loadLinkElement(U_00,ref()(0,0));	\
-  Impl::loadLinkElement(U_10,ref()(1,0));	\
-  Impl::loadLinkElement(U_20,ref()(2,0));	\
-  Impl::loadLinkElement(U_01,ref()(0,1));	\
-  Impl::loadLinkElement(U_11,ref()(1,1));	\
-  Impl::loadLinkElement(U_21,ref()(2,1));	\
-  UChi_00 = U_00*Chi_00;			\
-  UChi_10 = U_00*Chi_10;			\
-  UChi_01 = U_10*Chi_00;			\
-  UChi_11 = U_10*Chi_10;			\
-  UChi_02 = U_20*Chi_00;			\
-  UChi_12 = U_20*Chi_10;			\
-  UChi_00+= U_01*Chi_01;			\
-  UChi_10+= U_01*Chi_11;			\
-  UChi_01+= U_11*Chi_01;			\
-  UChi_11+= U_11*Chi_11;			\
-  UChi_02+= U_21*Chi_01;			\
-  UChi_12+= U_21*Chi_11;			\
-  Impl::loadLinkElement(U_00,ref()(0,2));	\
-  Impl::loadLinkElement(U_10,ref()(1,2));	\
-  Impl::loadLinkElement(U_20,ref()(2,2));	\
-  UChi_00+= U_00*Chi_02;			\
-  UChi_10+= U_00*Chi_12;			\
-  UChi_01+= U_10*Chi_02;			\
-  UChi_11+= U_10*Chi_12;			\
-  UChi_02+= U_20*Chi_02;			\
-  UChi_12+= U_20*Chi_12
-
-
-#define MULT_2SPIN(A,F)					\
-  {auto & ref(U._odata[sU](A)); MULT_2SPIN_BODY; }
-
-#define MULT_2SPIN_GPARITY(A,F)				\
-  {auto & ref(U._odata[sU](F)(A)); MULT_2SPIN_BODY; }
-
-
-#define PERMUTE_DIR(dir)			\
-      permute##dir(Chi_00,Chi_00);\
-      permute##dir(Chi_01,Chi_01);\
-      permute##dir(Chi_02,Chi_02);\
-      permute##dir(Chi_10,Chi_10);\
-      permute##dir(Chi_11,Chi_11);\
-      permute##dir(Chi_12,Chi_12);
-
-//      hspin(0)=fspin(0)+timesI(fspin(3));
-//      hspin(1)=fspin(1)+timesI(fspin(2));
-#define XP_PROJ \
-    Chi_00 = Chimu_00+timesI(Chimu_30);\
-    Chi_01 = Chimu_01+timesI(Chimu_31);\
-    Chi_02 = Chimu_02+timesI(Chimu_32);\
-    Chi_10 = Chimu_10+timesI(Chimu_20);\
-    Chi_11 = Chimu_11+timesI(Chimu_21);\
-    Chi_12 = Chimu_12+timesI(Chimu_22);
-
-#define YP_PROJ \
-    Chi_00 = Chimu_00-Chimu_30;\
-    Chi_01 = Chimu_01-Chimu_31;\
-    Chi_02 = Chimu_02-Chimu_32;\
-    Chi_10 = Chimu_10+Chimu_20;\
-    Chi_11 = Chimu_11+Chimu_21;\
-    Chi_12 = Chimu_12+Chimu_22;
-
-#define ZP_PROJ \
-  Chi_00 = Chimu_00+timesI(Chimu_20);		\
-  Chi_01 = Chimu_01+timesI(Chimu_21);		\
-  Chi_02 = Chimu_02+timesI(Chimu_22);		\
-  Chi_10 = Chimu_10-timesI(Chimu_30);		\
-  Chi_11 = Chimu_11-timesI(Chimu_31);		\
-  Chi_12 = Chimu_12-timesI(Chimu_32);
-
-#define TP_PROJ \
-  Chi_00 = Chimu_00+Chimu_20;		\
-  Chi_01 = Chimu_01+Chimu_21;		\
-  Chi_02 = Chimu_02+Chimu_22;		\
-  Chi_10 = Chimu_10+Chimu_30;		\
-  Chi_11 = Chimu_11+Chimu_31;		\
-  Chi_12 = Chimu_12+Chimu_32;
-
-
-//      hspin(0)=fspin(0)-timesI(fspin(3));
-//      hspin(1)=fspin(1)-timesI(fspin(2));
-#define XM_PROJ \
-    Chi_00 = Chimu_00-timesI(Chimu_30);\
-    Chi_01 = Chimu_01-timesI(Chimu_31);\
-    Chi_02 = Chimu_02-timesI(Chimu_32);\
-    Chi_10 = Chimu_10-timesI(Chimu_20);\
-    Chi_11 = Chimu_11-timesI(Chimu_21);\
-    Chi_12 = Chimu_12-timesI(Chimu_22);
-
-#define YM_PROJ \
-    Chi_00 = Chimu_00+Chimu_30;\
-    Chi_01 = Chimu_01+Chimu_31;\
-    Chi_02 = Chimu_02+Chimu_32;\
-    Chi_10 = Chimu_10-Chimu_20;\
-    Chi_11 = Chimu_11-Chimu_21;\
-    Chi_12 = Chimu_12-Chimu_22;
-
-#define ZM_PROJ \
-  Chi_00 = Chimu_00-timesI(Chimu_20);		\
-  Chi_01 = Chimu_01-timesI(Chimu_21);		\
-  Chi_02 = Chimu_02-timesI(Chimu_22);		\
-  Chi_10 = Chimu_10+timesI(Chimu_30);		\
-  Chi_11 = Chimu_11+timesI(Chimu_31);		\
-  Chi_12 = Chimu_12+timesI(Chimu_32);
-
-#define TM_PROJ \
-  Chi_00 = Chimu_00-Chimu_20;		\
-  Chi_01 = Chimu_01-Chimu_21;		\
-  Chi_02 = Chimu_02-Chimu_22;		\
-  Chi_10 = Chimu_10-Chimu_30;		\
-  Chi_11 = Chimu_11-Chimu_31;		\
-  Chi_12 = Chimu_12-Chimu_32;
-
-//      fspin(0)=hspin(0);
-//      fspin(1)=hspin(1);
-//      fspin(2)=timesMinusI(hspin(1));
-//      fspin(3)=timesMinusI(hspin(0));
-#define XP_RECON\
-  result_00 = UChi_00;\
-  result_01 = UChi_01;\
-  result_02 = UChi_02;\
-  result_10 = UChi_10;\
-  result_11 = UChi_11;\
-  result_12 = UChi_12;\
-  result_20 = timesMinusI(UChi_10);\
-  result_21 = timesMinusI(UChi_11);\
-  result_22 = timesMinusI(UChi_12);\
-  result_30 = timesMinusI(UChi_00);\
-  result_31 = timesMinusI(UChi_01);\
-  result_32 = timesMinusI(UChi_02);
-
-#define XP_RECON_ACCUM\
-  result_00+=UChi_00;\
-  result_01+=UChi_01;\
-  result_02+=UChi_02;\
-  result_10+=UChi_10;\
-  result_11+=UChi_11;\
-  result_12+=UChi_12;\
-  result_20-=timesI(UChi_10);\
-  result_21-=timesI(UChi_11);\
-  result_22-=timesI(UChi_12);\
-  result_30-=timesI(UChi_00);\
-  result_31-=timesI(UChi_01);\
-  result_32-=timesI(UChi_02);
-
-#define XM_RECON\
-  result_00 = UChi_00;\
-  result_01 = UChi_01;\
-  result_02 = UChi_02;\
-  result_10 = UChi_10;\
-  result_11 = UChi_11;\
-  result_12 = UChi_12;\
-  result_20 = timesI(UChi_10);\
-  result_21 = timesI(UChi_11);\
-  result_22 = timesI(UChi_12);\
-  result_30 = timesI(UChi_00);\
-  result_31 = timesI(UChi_01);\
-  result_32 = timesI(UChi_02);
-
-#define XM_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20+= timesI(UChi_10);\
-  result_21+= timesI(UChi_11);\
-  result_22+= timesI(UChi_12);\
-  result_30+= timesI(UChi_00);\
-  result_31+= timesI(UChi_01);\
-  result_32+= timesI(UChi_02);
-
-#define YP_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20+= UChi_10;\
-  result_21+= UChi_11;\
-  result_22+= UChi_12;\
-  result_30-= UChi_00;\
-  result_31-= UChi_01;\
-  result_32-= UChi_02;
-
-#define YM_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20-= UChi_10;\
-  result_21-= UChi_11;\
-  result_22-= UChi_12;\
-  result_30+= UChi_00;\
-  result_31+= UChi_01;\
-  result_32+= UChi_02;
-
-#define ZP_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20-= timesI(UChi_00);			\
-  result_21-= timesI(UChi_01);			\
-  result_22-= timesI(UChi_02);			\
-  result_30+= timesI(UChi_10);			\
-  result_31+= timesI(UChi_11);			\
-  result_32+= timesI(UChi_12);
-
-#define ZM_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20+= timesI(UChi_00);			\
-  result_21+= timesI(UChi_01);			\
-  result_22+= timesI(UChi_02);			\
-  result_30-= timesI(UChi_10);			\
-  result_31-= timesI(UChi_11);			\
-  result_32-= timesI(UChi_12);
-
-#define TP_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20+= UChi_00;			\
-  result_21+= UChi_01;			\
-  result_22+= UChi_02;			\
-  result_30+= UChi_10;			\
-  result_31+= UChi_11;			\
-  result_32+= UChi_12;
-
-#define TM_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20-= UChi_00;	\
-  result_21-= UChi_01;	\
-  result_22-= UChi_02;	\
-  result_30-= UChi_10;	\
-  result_31-= UChi_11;	\
-  result_32-= UChi_12;
-
-#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
-  SE=st.GetEntry(ptype,DIR,ss);			\
-  offset = SE->_offset;				\
-  local  = SE->_is_local;			\
-  perm   = SE->_permute;			\
-  if ( local ) {				\
-    LOAD_CHIMU_IMPL(DIR,F,PERM);			\
-    PROJ;					\
-    if ( perm) {				\
-      PERMUTE_DIR(PERM);			\
-    }						\
-  } else {					\
-    LOAD_CHI_IMPL(DIR,F,PERM);			\
-  }						\
-  MULT_2SPIN_IMPL(DIR,F);			\
-  RECON;					
-
-
-#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
-  offset = SE->_offset;				\
-  local  = SE->_is_local;			\
-  perm   = SE->_permute;			\
-  if ( local ) {				\
-    LOAD_CHIMU_IMPL(DIR,F,PERM);			\
-    PROJ;					\
-    if ( perm) {				\
-      PERMUTE_DIR(PERM);			\
-    }						\
-  } else if ( st.same_node[DIR] ) {		\
-    LOAD_CHI_IMPL(DIR,F,PERM);			\
-  }						\
-  if (local || st.same_node[DIR] ) {		\
-    MULT_2SPIN_IMPL(DIR,F);			\
-    RECON;					\
-  }
-
-#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
-  offset = SE->_offset;				\
-  local  = SE->_is_local;			\
-  perm   = SE->_permute;			\
-  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
-    LOAD_CHI_IMPL(DIR,F,PERM);			\
-    MULT_2SPIN_IMPL(DIR,F);			\
-    RECON;					\
-    nmu++;					\
-  }
-
-#define HAND_RESULT(ss,F)			\
-  {						\
-    SiteSpinor & ref (out._odata[ss]);		\
-    vstream(ref(F)(0)(0),result_00);		\
-    vstream(ref(F)(0)(1),result_01);		\
-    vstream(ref(F)(0)(2),result_02);		\
-    vstream(ref(F)(1)(0),result_10);		\
-    vstream(ref(F)(1)(1),result_11);		\
-    vstream(ref(F)(1)(2),result_12);		\
-    vstream(ref(F)(2)(0),result_20);		\
-    vstream(ref(F)(2)(1),result_21);		\
-    vstream(ref(F)(2)(2),result_22);		\
-    vstream(ref(F)(3)(0),result_30);		\
-    vstream(ref(F)(3)(1),result_31);		\
-    vstream(ref(F)(3)(2),result_32);		\
-  }
-
-#define HAND_RESULT_EXT(ss,F)			\
-  if (nmu){					\
-    SiteSpinor & ref (out._odata[ss]);		\
-    ref(F)(0)(0)+=result_00;		\
-    ref(F)(0)(1)+=result_01;		\
-    ref(F)(0)(2)+=result_02;		\
-    ref(F)(1)(0)+=result_10;		\
-    ref(F)(1)(1)+=result_11;		\
-    ref(F)(1)(2)+=result_12;		\
-    ref(F)(2)(0)+=result_20;		\
-    ref(F)(2)(1)+=result_21;		\
-    ref(F)(2)(2)+=result_22;		\
-    ref(F)(3)(0)+=result_30;		\
-    ref(F)(3)(1)+=result_31;		\
-    ref(F)(3)(2)+=result_32;		\
-  }
-
-
-#define HAND_DECLARATIONS(a)			\
-  Simd result_00;				\
-  Simd result_01;				\
-  Simd result_02;				\
-  Simd result_10;				\
-  Simd result_11;				\
-  Simd result_12;				\
-  Simd result_20;				\
-  Simd result_21;				\
-  Simd result_22;				\
-  Simd result_30;				\
-  Simd result_31;				\
-  Simd result_32;				\
-  Simd Chi_00;					\
-  Simd Chi_01;					\
-  Simd Chi_02;					\
-  Simd Chi_10;					\
-  Simd Chi_11;					\
-  Simd Chi_12;					\
-  Simd UChi_00;					\
-  Simd UChi_01;					\
-  Simd UChi_02;					\
-  Simd UChi_10;					\
-  Simd UChi_11;					\
-  Simd UChi_12;					\
-  Simd U_00;					\
-  Simd U_10;					\
-  Simd U_20;					\
-  Simd U_01;					\
-  Simd U_11;					\
-  Simd U_21;
-
-#define ZERO_RESULT				\
-  result_00=zero;				\
-  result_01=zero;				\
-  result_02=zero;				\
-  result_10=zero;				\
-  result_11=zero;				\
-  result_12=zero;				\
-  result_20=zero;				\
-  result_21=zero;				\
-  result_22=zero;				\
-  result_30=zero;				\
-  result_31=zero;				\
-  result_32=zero;			
-
-#define Chimu_00 Chi_00
-#define Chimu_01 Chi_01
-#define Chimu_02 Chi_02
-#define Chimu_10 Chi_10
-#define Chimu_11 Chi_11
-#define Chimu_12 Chi_12
-#define Chimu_20 UChi_00
-#define Chimu_21 UChi_01
-#define Chimu_22 UChi_02
-#define Chimu_30 UChi_10
-#define Chimu_31 UChi_11
-#define Chimu_32 UChi_12
-
-namespace Grid {
-namespace QCD {
-
-template<class Impl> void 
-WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionField &in, FermionField &out)
-{
-// Permute index per direction, as used by the legs below: T==0, Z==1, Y==2, X==3; expect 1,2,2,2 simd layout etc...
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-  // HAND_DECLARATIONS declares the result_*, Chi_*, UChi_* and U_* Simd temporaries the leg macros write to.
-  HAND_DECLARATIONS(ignore);
-
-  int offset,local,perm, ptype;
-  StencilEntry *SE;
-
-#define HAND_DOP_SITE(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
-  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);	\
-  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_RESULT(ss,F)
-  // Expand the eight legs with an empty flavour argument F and the plain (non-GPARITY) load/multiply macros.
-  HAND_DOP_SITE(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-}
-
-template<class Impl>
-void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionField &in, FermionField &out)
-{
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-// Declare the result_*, Chi_*, UChi_* and U_* Simd temporaries (the macro argument is unused).
-  HAND_DECLARATIONS(ignore);
-// Scratch variables, not referenced directly here -- presumably consumed inside HAND_STENCIL_LEG (confirm at its definition).
-  StencilEntry *SE;
-  int offset,local,perm, ptype;
-// Same leg order as HAND_DOP_SITE but with the projector/recon pairs swapped: *P with Xp..Tp first, then *M with Xm..Tm.
-#define HAND_DOP_SITE_DAG(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
-  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_RESULT(ss,F)
-// Generic template: empty F argument and the plain LOAD_CHI / LOAD_CHIMU / MULT_2SPIN macros.
-  HAND_DOP_SITE_DAG(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-}
-
-template<class Impl> void 
-WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionField &in, FermionField &out)
-{
-// Index passed as second HAND_STENCIL_LEG_INT argument below: T==0, Z==1, Y==2, X==3; expect 1,2,2,2 simd layout etc...
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-// Declare the result_*, Chi_*, UChi_* and U_* Simd temporaries (the macro argument is unused).
-  HAND_DECLARATIONS(ignore);
-// Scratch variables, not referenced directly here -- presumably consumed inside HAND_STENCIL_LEG_INT (confirm at its definition).
-  int offset,local,perm, ptype;
-  StencilEntry *SE;
-// ZERO_RESULT clears the accumulators first, so all eight legs (including the first) use *_RECON_ACCUM.
-#define HAND_DOP_SITE_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
-  ZERO_RESULT; \
-  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_RESULT(ss,F)
-// Generic template: empty F argument and the plain LOAD_CHI / LOAD_CHIMU / MULT_2SPIN macros.
-  HAND_DOP_SITE_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-}
-
-template<class Impl>
-void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionField &in, FermionField &out)
-{
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-// Declare the result_*, Chi_*, UChi_* and U_* Simd temporaries (the macro argument is unused).
-  HAND_DECLARATIONS(ignore);
-// Scratch variables, not referenced directly here -- presumably consumed inside HAND_STENCIL_LEG_INT (confirm at its definition).
-  StencilEntry *SE;
-  int offset,local,perm, ptype;
-// Daggered leg order (*P with Xp..Tp, then *M with Xm..Tm); accumulators are zeroed first so every leg uses *_RECON_ACCUM.
-#define HAND_DOP_SITE_DAG_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)				\
-  ZERO_RESULT;							\
-  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
-  HAND_RESULT(ss,F)
-  // Generic template: empty F argument and the plain LOAD_CHI / LOAD_CHIMU / MULT_2SPIN macros.
-  HAND_DOP_SITE_DAG_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-}
-
-template<class Impl> void 
-WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionField &in, FermionField &out)
-{
-// Index passed as second HAND_STENCIL_LEG_EXT argument below: T==0, Z==1, Y==2, X==3; expect 1,2,2,2 simd layout etc...
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-// Declare the result_*, Chi_*, UChi_* and U_* Simd temporaries (the macro argument is unused).
-  HAND_DECLARATIONS(ignore);
-// Scratch variables; nmu starts at 0 and is not touched directly here -- presumably updated by HAND_STENCIL_LEG_EXT / read by HAND_RESULT_EXT (confirm).
-  int offset,local,perm, ptype;
-  StencilEntry *SE;
-  int nmu=0;
-// Accumulators are zeroed, the eight legs run through the _EXT leg macro, and the site is finished with HAND_RESULT_EXT instead of HAND_RESULT.
-#define HAND_DOP_SITE_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
-  ZERO_RESULT; \
-  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_RESULT_EXT(ss,F)
-// Generic template: empty F argument and the plain LOAD_CHI / LOAD_CHIMU / MULT_2SPIN macros.
-  HAND_DOP_SITE_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-}
-
-template<class Impl>
-void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionField &in, FermionField &out)
-{
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-// Declare the result_*, Chi_*, UChi_* and U_* Simd temporaries (the macro argument is unused).
-  HAND_DECLARATIONS(ignore);
-// Scratch variables; nmu starts at 0 and is not touched directly here -- presumably updated by HAND_STENCIL_LEG_EXT / read by HAND_RESULT_EXT (confirm).
-  StencilEntry *SE;
-  int offset,local,perm, ptype;
-  int nmu=0;
-// Daggered leg order (*P with Xp..Tp, then *M with Xm..Tm) through the _EXT leg macro, finished with HAND_RESULT_EXT.
-#define HAND_DOP_SITE_DAG_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
-  ZERO_RESULT; \
-  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
-  HAND_RESULT_EXT(ss,F)
-// Generic template: empty F argument and the plain LOAD_CHI / LOAD_CHIMU / MULT_2SPIN macros.
-  HAND_DOP_SITE_DAG_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
-}
-
-#define HAND_SPECIALISE_GPARITY(IMPL)	/* emits explicit specialisations of the six HandDhopSite* members for IMPL */	\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
-				    int ss,int sU,const FermionField &in, FermionField &out) \
-  {									\
-    typedef IMPL Impl;							\
-    typedef typename Simd::scalar_type S;				\
-    typedef typename Simd::vector_type V;				\
-    /* relies on the HAND_DOP_SITE* macros #defined inside the generic member bodies earlier in the file */	\
-    HAND_DECLARATIONS(ignore);						\
-    /* extra locals (g, direction, distance, sl, inplace_twist) are presumably used by the *_GPARITY macros -- confirm at their definitions */	\
-    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
-    StencilEntry *SE;							\
-    HAND_DOP_SITE(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-    HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-  }									\
-  /* every specialisation expands its site macro twice: first with F=0, then with F=1 */	\
-  template<>								\
-  void WilsonKernels<IMPL>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
-					    int ss,int sU,const FermionField &in, FermionField &out) \
-  {									\
-    typedef IMPL Impl;							\
-    typedef typename Simd::scalar_type S;				\
-    typedef typename Simd::vector_type V;				\
-									\
-    HAND_DECLARATIONS(ignore);						\
-									\
-    StencilEntry *SE;							\
-    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist;					\
-    HAND_DOP_SITE_DAG(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-    HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-  }									\
-									\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
-						     int ss,int sU,const FermionField &in, FermionField &out) \
-  {									\
-    typedef IMPL Impl;							\
-    typedef typename Simd::scalar_type S;				\
-    typedef typename Simd::vector_type V;				\
-									\
-    HAND_DECLARATIONS(ignore);						\
-									\
-    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist;					\
-    StencilEntry *SE;							\
-    HAND_DOP_SITE_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-    HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-  }									\
-									\
-  template<>								\
-  void WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
-							     int ss,int sU,const FermionField &in, FermionField &out) \
-  {									\
-    typedef IMPL Impl;							\
-    typedef typename Simd::scalar_type S;				\
-    typedef typename Simd::vector_type V;				\
-									\
-    HAND_DECLARATIONS(ignore);						\
-									\
-    StencilEntry *SE;							\
-    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
-    HAND_DOP_SITE_DAG_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-    HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-  }									\
-									\
-  template<> void							\
-  WilsonKernels<IMPL>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
-						     int ss,int sU,const FermionField &in, FermionField &out) \
-  {									\
-    typedef IMPL Impl;							\
-    typedef typename Simd::scalar_type S;				\
-    typedef typename Simd::vector_type V;				\
-									\
-    HAND_DECLARATIONS(ignore);						\
-    /* in the two Ext variants nmu is reset to 0 between the F=0 and F=1 expansions */	\
-    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
-    StencilEntry *SE;							\
-    int nmu=0;								\
-    HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-    nmu = 0;								\
-    HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-  }									\
-  template<>								\
-  void WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
-							     int ss,int sU,const FermionField &in, FermionField &out) \
-  {									\
-    typedef IMPL Impl;							\
-    typedef typename Simd::scalar_type S;				\
-    typedef typename Simd::vector_type V;				\
-									\
-    HAND_DECLARATIONS(ignore);						\
-									\
-    StencilEntry *SE;							\
-    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
-    int nmu=0;								\
-    HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-    nmu = 0;								\
-    HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
-  }
-
-
-HAND_SPECIALISE_GPARITY(GparityWilsonImplF); // one set of six specialised kernels per G-parity implementation type
-HAND_SPECIALISE_GPARITY(GparityWilsonImplD);
-HAND_SPECIALISE_GPARITY(GparityWilsonImplFH);
-HAND_SPECIALISE_GPARITY(GparityWilsonImplDF);
-
-
-
-
-
-
-
-
-
-
-  
-////////////// Wilson ; uses this implementation /////////////////////
-
-#define INSTANTIATE_THEM(A) /* explicit instantiation of the six HandDhopSite* members of WilsonKernels<A> */ \
-template void WilsonKernels<A>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
-					     int ss,int sU,const FermionField &in, FermionField &out); \
-template void WilsonKernels<A>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
-						int ss,int sU,const FermionField &in, FermionField &out);\
-template void WilsonKernels<A>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
-						int ss,int sU,const FermionField &in, FermionField &out); \
-template void WilsonKernels<A>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
-						   int ss,int sU,const FermionField &in, FermionField &out); \
-template void WilsonKernels<A>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
-						int ss,int sU,const FermionField &in, FermionField &out); \
-template void WilsonKernels<A>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
-						   int ss,int sU,const FermionField &in, FermionField &out); 
-// Instantiated here only for the four G-parity Wilson implementation types.
-INSTANTIATE_THEM(GparityWilsonImplF);
-INSTANTIATE_THEM(GparityWilsonImplD);
-INSTANTIATE_THEM(GparityWilsonImplFH);
-INSTANTIATE_THEM(GparityWilsonImplDF);
-}}
@@ -1,153 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/gauge/GaugeImpl.h
-
-Copyright (C) 2015
-
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef GRID_GAUGE_IMPL_TYPES_H
-#define GRID_GAUGE_IMPL_TYPES_H
-
-namespace Grid {
-namespace QCD {
-
-////////////////////////////////////////////////////////////////////////
-// Implementation dependent gauge types
-////////////////////////////////////////////////////////////////////////
-
-#define INHERIT_GIMPL_TYPES(GImpl)  /* re-exports GImpl's nested types under Gauge*-prefixed local names */ \
-  typedef typename GImpl::Simd Simd;                \
-  typedef typename GImpl::LinkField GaugeLinkField; \
-  typedef typename GImpl::Field GaugeField;         \
-  typedef typename GImpl::ComplexField ComplexField;\
-  typedef typename GImpl::SiteField SiteGaugeField; \
-  typedef typename GImpl::SiteComplex SiteComplex;  \
-  typedef typename GImpl::SiteLink SiteGaugeLink;
-
-#define INHERIT_FIELD_TYPES(Impl)  /* re-exports Impl's Simd, ComplexField, SiteField and Field under the same names */ \
-  typedef typename Impl::Simd Simd;		    \
-  typedef typename Impl::ComplexField ComplexField; \
-  typedef typename Impl::SiteField SiteField;	    \
-  typedef typename Impl::Field Field;
-
-// hardcodes the exponential approximation in the template
-template <class S, int Nrepresentation = Nc, int Nexp = 12 > class GaugeImplTypes {
-public:
-  typedef S Simd;
-  // Per-site layouts: a scalar, one Nrepresentation x Nrepresentation matrix (a link), and a vector of Nd such matrices.
-  template <typename vtype> using iImplScalar     = iScalar<iScalar<iScalar<vtype> > >;
-  template <typename vtype> using iImplGaugeLink  = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
-  template <typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd>;
-  // The same layouts instantiated with this implementation's Simd type.
-  typedef iImplScalar<Simd>     SiteComplex;
-  typedef iImplGaugeLink<Simd>  SiteLink;
-  typedef iImplGaugeField<Simd> SiteField;
-  // Lattice-wide fields built from the site types above.
-  typedef Lattice<SiteComplex> ComplexField;
-  typedef Lattice<SiteLink>    LinkField; 
-  typedef Lattice<SiteField>   Field;
-
-  // Guido: we can probably separate the types from the HMC functions
-  // this will create 2 kind of implementations
-  // probably confusing the users
-  // Now keeping only one class
-
-  // AddLink: adds W to Lorentz component mu of U at every outer site (U[mu] += W).
-  // Move this elsewhere? FIXME
-  static inline void AddLink(Field &U, LinkField &W,
-                                  int mu) { // U[mu] += W
-    PARALLEL_FOR_LOOP
-    for (auto ss = 0; ss < U._grid->oSites(); ss++) {
-      U._odata[ss]._internal[mu] =
-          U._odata[ss]._internal[mu] + W._odata[ss]._internal;
-    }
-  }
-
-  ///////////////////////////////////////////////////////////
-  // Move these to another class
-  // HMC auxiliary functions
-  static inline void generate_momenta(Field &P, GridParallelRNG &pRNG) {
-    // specific for SU gauge fields: one GaussianFundamentalLieAlgebraMatrix draw per mu, poked into component mu of P
-    LinkField Pmu(P._grid);
-    Pmu = zero;
-    for (int mu = 0; mu < Nd; mu++) {
-      SU<Nrepresentation>::GaussianFundamentalLieAlgebraMatrix(pRNG, Pmu);
-      PokeIndex<LorentzIndex>(P, Pmu, mu);
-    }
-  }
-  // projectForce: returns Ta(P).
-  static inline Field projectForce(Field &P) { return Ta(P); }
-  // update_field: for every outer site and mu, U_mu <- ProjectOnGroup(Exponentiate(P_mu, ep, Nexp) * U_mu).
-  static inline void update_field(Field& P, Field& U, double ep){
-    //static std::chrono::duration<double> diff;
-
-    //auto start = std::chrono::high_resolution_clock::now();
-    parallel_for(int ss=0;ss<P._grid->oSites();ss++){
-      for (int mu = 0; mu < Nd; mu++) 
-        U[ss]._internal[mu] = ProjectOnGroup(Exponentiate(P[ss]._internal[mu], ep, Nexp) * U[ss]._internal[mu]);
-    }
-    
-    //auto end = std::chrono::high_resolution_clock::now();
-   // diff += end - start;
-   // std::cout << "Time to exponentiate matrix " << diff.count() << " s\n";
-  }
-  // FieldSquareNorm: real part of the lattice sum, over all mu, of trace(U_mu * U_mu).
-  static inline RealD FieldSquareNorm(Field& U){
-    LatticeComplex Hloc(U._grid);
-    Hloc = zero;
-    for (int mu = 0; mu < Nd; mu++) {
-      auto Umu = PeekIndex<LorentzIndex>(U, mu);
-      Hloc += trace(Umu * Umu);
-    }
-    Complex Hsum = sum(Hloc);
-    return Hsum.real();
-  }
-  // NOTE(review): the three *Configuration helpers call SU<Nc>, whereas generate_momenta uses SU<Nrepresentation>; confirm this is intended when Nrepresentation != Nc.
-  static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
-    SU<Nc>::HotConfiguration(pRNG, U);
-  }
-
-  static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
-    SU<Nc>::TepidConfiguration(pRNG, U);
-  }
-
-  static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
-    SU<Nc>::ColdConfiguration(pRNG, U);
-  }
-};
-
-
-typedef GaugeImplTypes<vComplex, Nc> GimplTypesR; // Nrepresentation = Nc, for vComplex / vComplexF / vComplexD
-typedef GaugeImplTypes<vComplexF, Nc> GimplTypesF;
-typedef GaugeImplTypes<vComplexD, Nc> GimplTypesD;
-// Same three Simd types with Nrepresentation = SU<Nc>::AdjointDimension.
-typedef GaugeImplTypes<vComplex, SU<Nc>::AdjointDimension> GimplAdjointTypesR;
-typedef GaugeImplTypes<vComplexF, SU<Nc>::AdjointDimension> GimplAdjointTypesF;
-typedef GaugeImplTypes<vComplexD, SU<Nc>::AdjointDimension> GimplAdjointTypesD;
-
-
-} // QCD
-} // Grid
-
-#endif // GRID_GAUGE_IMPL_TYPES_H
@@ -1,417 +0,0 @@
-/*************************************************************************************
- 
- Grid physics library, www.github.com/paboyle/Grid
- 
- Source file: ./lib/qcd/action/gauge/Photon.h
- 
- Copyright (C) 2015
- 
- Author: Peter Boyle <paboyle@ph.ed.ac.uk>
- 
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
- 
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- GNU General Public License for more details.
- 
- You should have received a copy of the GNU General Public License along
- with this program; if not, write to the Free Software Foundation, Inc.,
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- 
- See the full license in the file "LICENSE" in the top level distribution directory
- *************************************************************************************/
-/*  END LEGAL */
-#ifndef QCD_PHOTON_ACTION_H
-#define QCD_PHOTON_ACTION_H
-
-namespace Grid{
-namespace QCD{
-  template <class S>
-  class QedGimpl
-  {
-  public:
-    typedef S Simd;
-    // A link is a single scalar per site; a gauge field is a vector of Nd such scalars.
-    template <typename vtype>
-    using iImplGaugeLink  = iScalar<iScalar<iScalar<vtype>>>;
-    template <typename vtype>
-    using iImplGaugeField = iVector<iScalar<iScalar<vtype>>, Nd>;
-    // Note: SiteComplex is an alias of SiteField (the Nd-vector), not of SiteLink.
-    typedef iImplGaugeLink<Simd>  SiteLink;
-    typedef iImplGaugeField<Simd> SiteField;
-    typedef SiteField             SiteComplex;
-    // Likewise ComplexField is an alias of Field.
-    typedef Lattice<SiteLink>  LinkField;
-    typedef Lattice<SiteField> Field;
-    typedef Field              ComplexField;
-  };
-  
-  typedef QedGimpl<vComplex> QedGimplR; // QedGimpl instantiated with the vComplex Simd type
-  
-  template<class Gimpl>
-  class Photon
-  {
-  public:
-    INHERIT_GIMPL_TYPES(Gimpl);
-    GRID_SERIALIZABLE_ENUM(Gauge, undef, feynman, 1, coulomb, 2, landau, 3); // stored in gauge_; no member function in this header reads it
-    GRID_SERIALIZABLE_ENUM(ZmScheme, undef, qedL, 1, qedTL, 2, qedInf, 3);   // selects the branch taken in zmSub / MomentumSpacePropagator / Stochastic*
-  public:
-    Photon(Gauge gauge, ZmScheme zmScheme);                                   // empty improvement list, built-in default G0
-    Photon(Gauge gauge, ZmScheme zmScheme, std::vector<Real> improvements);   // built-in default G0
-    Photon(Gauge gauge, ZmScheme zmScheme, Real G0);                          // empty improvement list
-    Photon(Gauge gauge, ZmScheme zmScheme, std::vector<Real> improvements, Real G0);
-    virtual ~Photon(void) = default;
-    void FreePropagator(const GaugeField &in, GaugeField &out);               // FFT forward, MomentumSpacePropagator, FFT backward
-    void MomentumSpacePropagator(const GaugeField &in, GaugeField &out);      // argument order is (in, out)
-    void StochasticWeight(GaugeLinkField &weight);                            // fills weight according to zmScheme_
-    void StochasticField(GaugeField &out, GridParallelRNG &rng);              // computes the weight itself, then calls the overload below
-    void StochasticField(GaugeField &out, GridParallelRNG &rng,
-                         const GaugeLinkField &weight);
-    void UnitField(GaugeField &out);                                          // every Lorentz component set to Complex(1,0)
-  private:
-    void infVolPropagator(GaugeLinkField &out);                               // used for ZmScheme::qedInf
-    void invKHatSquared(GaugeLinkField &out);                                 // used for ZmScheme::qedL and qedTL
-    void zmSub(GaugeLinkField &out);                                          // zeroes selected sites of out depending on zmScheme_
-  private:
-    Gauge    gauge_;
-    ZmScheme zmScheme_;
-    std::vector<Real>  improvement_;                                          // per-shell factors applied in zmSub (qedL branch)
-    Real     G0_;                                                             // value poked at the origin in infVolPropagator
-  };
-
-  typedef Photon<QedGimplR>  PhotonR; // Photon over the vComplex QED gauge implementation
-  
-  template<class Gimpl> // two-argument form: empty improvement list and the built-in default G0
-  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme)
-  : Photon(gauge, zmScheme, std::vector<Real>(),
-           0.15493339023106021408483720810737508876916113364521)
-  {}
-
-  template<class Gimpl> // improvement coefficients supplied by the caller; built-in default G0
-  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme,
-                        std::vector<Real> improvements)
-  : Photon(gauge, zmScheme, improvements,
-           0.15493339023106021408483720810737508876916113364521)
-  {}
-
-  template<class Gimpl> // G0 supplied by the caller; empty improvement list
-  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme, Real G0)
-  : Photon(gauge, zmScheme, std::vector<Real>(), G0)
-  {}
-
-  template<class Gimpl> // fully explicit form: stores all four arguments in the matching members
-  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme,
-                        std::vector<Real> improvements, Real G0)
-  : gauge_(gauge), zmScheme_(zmScheme), improvement_(improvements), G0_(G0)
-  {}
-
-  template<class Gimpl>
-  void Photon<Gimpl>::FreePropagator (const GaugeField &in,GaugeField &out)
-  {
-    FFT theFFT(in._grid);
-    // Applies the momentum-space propagator to `in` and writes the position-space result to `out`.
-    GaugeField in_k(in._grid);
-    GaugeField prop_k(in._grid);
-    // forward FFT of the input -> multiply in momentum space -> backward FFT into out
-    theFFT.FFT_all_dim(in_k,in,FFT::forward);
-    MomentumSpacePropagator(in_k,prop_k); // declared as (in, out): in_k is the source, prop_k receives the product
-    theFFT.FFT_all_dim(out,prop_k,FFT::backward);
-  }
-
-  template<class Gimpl>
-  void Photon<Gimpl>::infVolPropagator(GaugeLinkField &out)
-  {
-    auto               *grid = dynamic_cast<GridCartesian *>(out._grid); // result is not null-checked
-    LatticeReal        xmu(grid);
-    GaugeLinkField     one(grid);
-    const unsigned int nd    = grid->_ndimension;
-    std::vector<int>   &l    = grid->_fdimensions;
-    std::vector<int>   x0(nd,0);                    // coordinates of the origin site
-    TComplex           Tone  = Complex(1.0,0.0);
-    TComplex           Tzero = Complex(G0_,0.0);    // despite the name this holds G0_, the value stored at the origin
-    FFT                fft(grid);
-    // Build out(x) = sum_mu 4*pi^2*x_mu^2 with each coordinate >= L/2 shifted down by L.
-    one = Complex(1.0,0.0);
-    out = zero;
-    for(int mu = 0; mu < nd; mu++)
-    {
-      LatticeCoordinate(xmu,mu);
-      Real lo2 = l[mu]/2.0;
-      xmu = where(xmu < lo2, xmu, xmu-double(l[mu]));
-      out = out + toComplex(4*M_PI*M_PI*xmu*xmu);
-    }
-    pokeSite(Tone, out, x0);      // the sum is 0 at the origin; set it to 1 so the inversion below is defined there
-    out = one/out;
-    pokeSite(Tzero, out, x0);     // then overwrite the origin with G0_
-    fft.FFT_all_dim(out, out, FFT::forward); // in-place forward transform
-  }
-  
-  template<class Gimpl>
-  void Photon<Gimpl>::invKHatSquared(GaugeLinkField &out)
-  {
-    GridBase           *grid = out._grid;
-    GaugeLinkField     kmu(grid), one(grid);
-    const unsigned int nd    = grid->_ndimension;
-    std::vector<int>   &l    = grid->_fdimensions;
-    std::vector<int>   zm(nd,0);                  // coordinates of the all-zero site
-    TComplex           Tone = Complex(1.0,0.0);
-    TComplex           Tzero= Complex(0.0,0.0);
-    // Build out = sum_mu (2*sin(pi*n_mu/L_mu))^2 from the lattice coordinate n_mu, then invert it site by site.
-    one = Complex(1.0,0.0);
-    out = zero;
-    for(int mu = 0; mu < nd; mu++)
-    {
-      Real twoPiL = M_PI*2./l[mu];
-      // kmu starts as the coordinate in direction mu and is replaced by 2*sin(0.5*twoPiL*coordinate)
-      LatticeCoordinate(kmu,mu);
-      kmu = 2.*sin(.5*twoPiL*kmu);
-      out = out + kmu*kmu;
-    }
-    pokeSite(Tone, out, zm);   // the sum is 0 at the zero site; set it to 1 so the inversion is defined there
-    out = one/out;
-    pokeSite(Tzero, out, zm);  // and set that site of the inverse to 0
-  }
-  
-  template<class Gimpl>
-  void Photon<Gimpl>::zmSub(GaugeLinkField &out)
-  {
-    GridBase           *grid = out._grid;
-    const unsigned int nd    = grid->_ndimension;
-    std::vector<int>   &l    = grid->_fdimensions;
-    // Zeroes sites of `out` according to zmScheme_; schemes other than qedTL / qedL leave `out` untouched.
-    switch (zmScheme_)
-    {
-      case ZmScheme::qedTL:
-      {
-        std::vector<int> zm(nd,0);
-        TComplex         Tzero = Complex(0.0,0.0);
-        // qedTL: only the single all-zero site is set to 0
-        pokeSite(Tzero, out, zm);
-        
-        break;
-      }
-      case ZmScheme::qedL:
-      {
-        LatticeInteger spNrm(grid), coor(grid);
-        // spNrm = sum over the first (ndimension-1) directions of the squared coordinate,
-        // with each coordinate >= L/2 shifted down by L
-        spNrm = zero;
-        for(int d = 0; d < grid->_ndimension - 1; d++)
-        {
-          LatticeCoordinate(coor,d);
-          coor = where(coor < Integer(l[d]/2), coor, coor-Integer(l[d]));
-          spNrm = spNrm + coor*coor;
-        }
-        out = where(spNrm == Integer(0), 0.*out, out); // zero every site whose spNrm is 0
-        // IR improvement: sites with spNrm == i+1 are scaled by sqrt(improvement_[i]+1)
-        for(unsigned int i = 0; i < improvement_.size(); i++)
-        {
-          Real f = sqrt(improvement_[i]+1);
-          out = where(spNrm == Integer(i+1), f*out, out);
-        }
-        break; // explicit break: previously fell through into `default`
-      }
-      default:
-        break;
-    }
-  }
-
-  template<class Gimpl>
-  void Photon<Gimpl>::MomentumSpacePropagator(const GaugeField &in,
-                                               GaugeField &out)
-  {
-  GridBase           *grid = out._grid;
-    LatticeComplex     momProp(grid);
-    // Fill momProp according to the zero-mode scheme, then multiply the input by it.
-    switch (zmScheme_)
-    {
-      case ZmScheme::qedTL:
-      case ZmScheme::qedL:
-      {
-        invKHatSquared(momProp);
-        zmSub(momProp);
-        break;
-      }
-      case ZmScheme::qedInf:
-      {
-        infVolPropagator(momProp);
-        break;
-      }
-      default:
-        break;
-    }
-    // NOTE(review): in the default case nothing has assigned momProp before it is used here -- confirm that case cannot occur.
-    out = in*momProp;
-  }
-  
-  template<class Gimpl>
-  void Photon<Gimpl>::StochasticWeight(GaugeLinkField &weight)
-  {
-    auto               *grid     = dynamic_cast<GridCartesian *>(weight._grid); // result is not null-checked
-    const unsigned int nd        = grid->_ndimension;
-    std::vector<int>   latt_size = grid->_fdimensions;
-    // Fills `weight` according to zmScheme_; any other scheme leaves it unchanged.
-    switch (zmScheme_)
-    {
-      case ZmScheme::qedTL:
-      case ZmScheme::qedL:
-      {
-        Integer vol = 1;                 // product of the full lattice dimensions
-        for(int d = 0; d < nd; d++)
-        {
-          vol = vol * latt_size[d];
-        }
-        invKHatSquared(weight);
-        weight = sqrt(vol)*sqrt(weight); // sqrt(volume * invKHatSquared), taken before the zero-mode subtraction
-        zmSub(weight);
-        break;
-      }
-      case ZmScheme::qedInf:
-      {
-        infVolPropagator(weight);
-        weight = sqrt(real(weight));     // square root of the real part only
-        break;
-      }
-      default:
-        break;
-    }
-  }
-  
-  template<class Gimpl>
-  void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng)
-  {
-    // Convenience overload: compute the stochastic weight on out's grid, then defer to the three-argument form.
-    GaugeLinkField w(dynamic_cast<GridCartesian *>(out._grid));
-    
-    StochasticWeight(w);
-    StochasticField(out, rng, w);
-  }
-  
-  template<class Gimpl>
-  void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng,
-                                      const GaugeLinkField &weight)
-  {
-    auto               *grid = dynamic_cast<GridCartesian *>(out._grid); // result is not null-checked
-    const unsigned int nd = grid->_ndimension;
-    GaugeLinkField     r(grid);
-    GaugeField         aTilde(grid);
-    FFT                fft(grid);
-    // Fill each Lorentz component of aTilde with weighted random numbers; the distribution depends on zmScheme_.
-    switch (zmScheme_)
-    {
-      case ZmScheme::qedTL:
-      case ZmScheme::qedL:
-      {
-        for(int mu = 0; mu < nd; mu++)
-        {
-          gaussian(rng, r);
-          r = weight*r;
-          pokeLorentz(aTilde, r, mu);
-        }
-        break;
-      }
-      case ZmScheme::qedInf:
-      {
-        Complex                    shift(1., 1.); // This needs to be a GaugeLink element?
-        for(int mu = 0; mu < nd; mu++)
-        {
-          bernoulli(rng, r);
-          r = weight*(2.*r - shift);
-          pokeLorentz(aTilde, r, mu);
-        }
-        break;
-      }
-      default:
-        break;
-    }
-    // NOTE(review): in the default case aTilde has not been assigned before this transform -- confirm that case cannot occur.
-    fft.FFT_all_dim(out, aTilde, FFT::backward);
-    // keep only the real part of the back-transformed field
-    out = real(out);
-  }
-
-  template<class Gimpl>
-  void Photon<Gimpl>::UnitField(GaugeField &out)
-  {
-    // Write a constant link of value Complex(1,0) into every Lorentz component of out, then keep the real part.
-    auto *cartesian = dynamic_cast<GridCartesian *>(out._grid);
-    const unsigned int ndim = cartesian->_ndimension;
-    GaugeLinkField unity(cartesian);
-    int mu = 0;
-    unity = Complex(1.0,0.0);
-    while (mu < ndim)
-    {
-      pokeLorentz(out, unity, mu);
-      ++mu;
-    }
-    out = real(out);
-  }
-//  template<class Gimpl>
-//  void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_L(GaugeField &out,
-//                                                            const GaugeField &in)
-//  {
-//    
-//    FeynmanGaugeMomentumSpacePropagator_TL(out,in);
-//    
-//    GridBase *grid = out._grid;
-//    LatticeInteger     coor(grid);
-//    GaugeField zz(grid); zz=zero;
-//    
-//    // xyzt
-//    for(int d = 0; d < grid->_ndimension-1;d++){
-//      LatticeCoordinate(coor,d);
-//      out = where(coor==Integer(0),zz,out);
-//    }
-//  }
-//  
-//  template<class Gimpl>
-//  void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_TL(GaugeField &out,
-//                                                             const GaugeField &in)
-//  {
-//    
-//    // what type LatticeComplex
-//    GridBase *grid = out._grid;
-//    int nd = grid->_ndimension;
-//    
-//    typedef typename GaugeField::vector_type vector_type;
-//    typedef typename GaugeField::scalar_type ScalComplex;
-//    typedef Lattice<iSinglet<vector_type> > LatComplex;
-//    
-//    std::vector<int> latt_size   = grid->_fdimensions;
-//    
-//    LatComplex denom(grid); denom= zero;
-//    LatComplex   one(grid); one = ScalComplex(1.0,0.0);
-//    LatComplex   kmu(grid);
-//    
-//    ScalComplex ci(0.0,1.0);
-//    // momphase = n * 2pi / L
-//    for(int mu=0;mu<Nd;mu++) {
-//      
-//      LatticeCoordinate(kmu,mu);
-//      
-//      RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
-//      
-//      kmu = TwoPiL * kmu ;
-//      
-//      denom = denom + 4.0*sin(kmu*0.5)*sin(kmu*0.5); // Wilson term
-//    }
-//    std::vector<int> zero_mode(nd,0);
-//    TComplexD Tone = ComplexD(1.0,0.0);
-//    TComplexD Tzero= ComplexD(0.0,0.0);
-//    
-//    pokeSite(Tone,denom,zero_mode);
-//    
-//    denom= one/denom;
-//    
-//    pokeSite(Tzero,denom,zero_mode);
-//    
-//    out = zero;
-//    out = in*denom;
-//  };
-  
-}}
-#endif
@@ -1,95 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/gauge/WilsonGaugeAction.h
-
-Copyright (C) 2015
-
-Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: neo <cossu@post.kek.jp>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef QCD_WILSON_GAUGE_ACTION_H
-#define QCD_WILSON_GAUGE_ACTION_H
-
-namespace Grid {
-namespace QCD {
-
-////////////////////////////////////////////////////////////////////////
-// Wilson Gauge Action .. should I template the Nc etc..
-////////////////////////////////////////////////////////////////////////
-template <class Gimpl>
-class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> {
- public:  
-  INHERIT_GIMPL_TYPES(Gimpl);
-  // Gauge action with one coupling, beta: S() is built from the average plaquette, deriv() from staples.
-  /////////////////////////// constructors
-  explicit WilsonGaugeAction(RealD beta_):beta(beta_){};
-  // Name string identifying this action.
-  virtual std::string action_name() {return "WilsonGaugeAction";}
-  // Returns one GridLogMessage-prefixed line reporting beta.
-  virtual std::string LogParameters(){
-    std::stringstream sstream;
-    sstream << GridLogMessage << "[WilsonGaugeAction] Beta: " << beta << std::endl;
-    return sstream.str();
-  }
-  // Empty body: neither U nor pRNG is used.
-  virtual void refresh(const GaugeField &U,
-                       GridParallelRNG &pRNG){};  // noop as no pseudoferms
-  // Action value: S = beta * (1 - plaq) * Nd*(Nd-1)/2 * vol.
-  virtual RealD S(const GaugeField &U) {
-    RealD plaq = WilsonLoops<Gimpl>::avgPlaquette(U);
-    RealD vol = U._grid->gSites();  // presumably the global site count -- TODO confirm
-    RealD action = beta * (1.0 - plaq) * (Nd * (Nd - 1.0)) * vol * 0.5;
-    return action;
-  };
-  // Force: for each mu, writes Ta(U_mu * Staple_mu) * beta/(2*Nc) into component mu of dSdU.
-  virtual void deriv(const GaugeField &U, GaugeField &dSdU) {
-    // not optimal implementation FIXME
-    // extend Ta to include Lorentz indexes
-
-    RealD factor = 0.5 * beta / RealD(Nc);
-
-    GaugeLinkField Umu(U._grid);
-    GaugeLinkField dSdU_mu(U._grid);
-    for (int mu = 0; mu < Nd; mu++) {
-      Umu = PeekIndex<LorentzIndex>(U, mu);
-
-      // Staple in direction mu
-      WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
-      dSdU_mu = Ta(Umu * dSdU_mu) * factor;
-
-      PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
-    }
-  }
-private:
-  RealD beta;  // coupling supplied to the constructor
-};
-
-
-
-}
-}
-
-#endif
@@ -1,145 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef QCD_EVEN_ODD_SCHUR_DIFFERENTIABLE_H
-#define QCD_EVEN_ODD_SCHUR_DIFFERENTIABLE_H
-
-namespace Grid{
-  namespace QCD{
-
-    // Base even odd HMC on the normal Mee based schur decomposition.
-    //
-    //     M = (Mee Meo) =  (1             0 )   (Mee   0               )  (1 Mee^{-1} Meo)
-    //         (Moe Moo)    (Moe Mee^-1    1 )   (0   Moo-Moe Mee^-1 Meo)  (0   1         )
-    //
-    // Determinant is det of middle factor
-    // This assumes Mee is independent of U.
-    //
-    template<class Impl>
-    class SchurDifferentiableOperator :  public SchurDiagMooeeOperator<FermionOperator<Impl>,typename Impl::FermionField>
-      {
-      public:
-      INHERIT_IMPL_TYPES(Impl);
-        // Adds the gauge-force terms MpcDeriv / MpcDagDeriv on top of SchurDiagMooeeOperator.
-        typedef FermionOperator<Impl> Matrix;
-
-        SchurDifferentiableOperator (Matrix &Mat) : SchurDiagMooeeOperator<Matrix,FermionField>(Mat) {};
-
-        void MpcDeriv(GaugeField &Force,const FermionField &U,const FermionField &V) {
-          // Force is assembled from an even (ForceE) and an odd (ForceO) piece, then negated.
-          GridBase *fgrid   = this->_Mat.FermionGrid();          // not used below
-          GridBase *fcbgrid = this->_Mat.FermionRedBlackGrid();
-
-          FermionField tmp1(fcbgrid);
-          FermionField tmp2(fcbgrid);
-
-          conformable(fcbgrid,U._grid);
-          conformable(fcbgrid,V._grid);
-
-          // Both inputs must live on the Odd checkerboard
-          assert(U.checkerboard==Odd);
-          assert(V.checkerboard==U.checkerboard);
-
-          // NOTE Guido: WE DO NOT WANT TO USE THE ucbgrid GRID FOR THE FORCE
-          // it is not conformable with the HMC force field
-          // Case: Ls vectorised fields
-          // INHERIT FROM THE Force field instead
-          GridRedBlackCartesian* forcecb = new GridRedBlackCartesian(Force._grid);
-          GaugeField ForceO(forcecb);
-          GaugeField ForceE(forcecb);
-
-
-          //  X^dag Der_oe MeeInv Meo Y
-          // Use Mooee as nontrivial but gauge field independent
-          this->_Mat.Meooe   (V,tmp1);      // odd->even -- implicit -0.5 factor to be applied
-          this->_Mat.MooeeInv(tmp1,tmp2);   // even->even
-          this->_Mat.MoeDeriv(ForceO,U,tmp2,DaggerNo);
-          //  Accumulate X^dag M_oe MeeInv Der_eo Y
-          this->_Mat.MeooeDag   (U,tmp1);    // odd->even (U is asserted Odd) -- implicit -0.5 factor to be applied
-          this->_Mat.MooeeInvDag(tmp1,tmp2); // even->even
-          this->_Mat.MeoDeriv(ForceE,tmp2,V,DaggerNo);
-
-          assert(ForceE.checkerboard==Even);
-          assert(ForceO.checkerboard==Odd);
-
-          setCheckerboard(Force,ForceE);
-          setCheckerboard(Force,ForceO);
-          Force=-Force;
-
-          delete forcecb;
-        }
-
-
-        void MpcDagDeriv(GaugeField &Force,const FermionField &U,const FermionField &V) {
-          // Daggered counterpart of MpcDeriv: same assembly of Force, using DaggerYes derivatives.
-          GridBase *fgrid   = this->_Mat.FermionGrid();          // not used below
-          GridBase *fcbgrid = this->_Mat.FermionRedBlackGrid();
-
-          FermionField tmp1(fcbgrid);
-          FermionField tmp2(fcbgrid);
-
-          conformable(fcbgrid,U._grid);
-          conformable(fcbgrid,V._grid);
-
-          // Both inputs must live on the Odd checkerboard (was a V==V tautology that never checked U)
-          assert(V.checkerboard==Odd);
-          assert(U.checkerboard==V.checkerboard);
-
-          // NOTE Guido: WE DO NOT WANT TO USE THE ucbgrid GRID FOR THE FORCE
-          // it is not conformable with the HMC force field
-          // INHERIT FROM THE Force field instead
-          GridRedBlackCartesian* forcecb = new GridRedBlackCartesian(Force._grid);
-          GaugeField ForceO(forcecb);
-          GaugeField ForceE(forcecb);
-
-          //  X^dag Der_oe MeeInv Meo Y
-          // Use Mooee as nontrivial but gauge field independent
-          this->_Mat.MeooeDag   (V,tmp1);      // odd->even -- implicit -0.5 factor to be applied
-          this->_Mat.MooeeInvDag(tmp1,tmp2);   // even->even
-          this->_Mat.MoeDeriv(ForceO,U,tmp2,DaggerYes);
-
-          //  Accumulate X^dag M_oe MeeInv Der_eo Y
-          this->_Mat.Meooe   (U,tmp1);    // odd->even (U is asserted Odd) -- implicit -0.5 factor to be applied
-          this->_Mat.MooeeInv(tmp1,tmp2); // even->even
-          this->_Mat.MeoDeriv(ForceE,tmp2,V,DaggerYes);
-
-          assert(ForceE.checkerboard==Even);
-          assert(ForceO.checkerboard==Odd);
-
-          setCheckerboard(Force,ForceE);
-          setCheckerboard(Force,ForceO);
-          Force=-Force;
-
-          delete forcecb;
-        }
-
-    };
-
-  }
-}
-#endif
@@ -1,264 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/pseudofermion/ExactOneFlavourRatio.h
-
-Copyright (C) 2017
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-/////////////////////////////////////////////////////////////////
-// Implementation of exact one flavour algorithm (EOFA)         //
-// using fermion classes defined in:                           //
-//    Grid/qcd/action/fermion/DomainWallEOFAFermion.h (Shamir) //
-//    Grid/qcd/action/fermion/MobiusEOFAFermion.h (Mobius)     //
-// arXiv: 1403.1683, 1706.05843                                //
-/////////////////////////////////////////////////////////////////
-
-#ifndef QCD_PSEUDOFERMION_EXACT_ONE_FLAVOUR_RATIO_H
-#define QCD_PSEUDOFERMION_EXACT_ONE_FLAVOUR_RATIO_H
-
-namespace Grid{
-namespace QCD{
-
-  ///////////////////////////////////////////////////////////////
-  // Exact one flavour implementation of DWF determinant ratio //
-  ///////////////////////////////////////////////////////////////
-
-  template<class Impl>
-  class ExactOneFlavourRatioPseudoFermionAction : public Action<typename Impl::GaugeField>
-  {
-    public:
-      INHERIT_IMPL_TYPES(Impl);
-      typedef OneFlavourRationalParams Params;
-      Params param;                    // copy of the rational-approximation parameters passed to the constructor
-      MultiShiftFunction PowerNegHalf; // rational approximation to x^(-1/2), filled in by the constructor
-
-    private:
-      bool use_heatbath_forecasting;
-      AbstractEOFAFermion<Impl>& Lop; // the basic LH operator
-      AbstractEOFAFermion<Impl>& Rop; // the basic RH operator
-      SchurRedBlackDiagMooeeSolve<FermionField> Solver;
-      FermionField Phi; // the pseudofermion field for this trajectory
-
-    public:
-      ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion<Impl>& _Lop, AbstractEOFAFermion<Impl>& _Rop,
-        OperatorFunction<FermionField>& S, Params& p, bool use_fc=false) : param(p), use_heatbath_forecasting(use_fc),
-        Lop(_Lop), Rop(_Rop), Solver(S), Phi(_Lop.FermionGrid()) // initializers listed in member declaration order
-      {
-        AlgRemez remez(param.lo, param.hi, param.precision);
-
-        // Rational approximation to x^(-1/2)
-        std::cout << GridLogMessage << "Generating degree " << param.degree << " for x^(-1/2)" << std::endl;
-        remez.generateApprox(param.degree, 1, 2);
-        PowerNegHalf.Init(remez, param.tolerance, true);
-      };
-
-      virtual std::string action_name() { return "ExactOneFlavourRatioPseudoFermionAction"; }
-
-      virtual std::string LogParameters() {
-        std::stringstream sstream;
-        sstream << GridLogMessage << "[" << action_name() << "] Low            :" << param.lo << std::endl;
-        sstream << GridLogMessage << "[" << action_name() << "] High           :" << param.hi << std::endl;
-        sstream << GridLogMessage << "[" << action_name() << "] Max iterations :" << param.MaxIter << std::endl;
-        sstream << GridLogMessage << "[" << action_name() << "] Tolerance      :" << param.tolerance << std::endl;
-        sstream << GridLogMessage << "[" << action_name() << "] Degree         :" << param.degree << std::endl;
-        sstream << GridLogMessage << "[" << action_name() << "] Precision      :" << param.precision << std::endl;
-        return sstream.str();
-      }
-
-      // Spin projection: sign == 1 applies the "pplus" projector on every s-slice, any other sign the "pminus" one
-      void spProj(const FermionField& in, FermionField& out, int sign, int Ls)
-      {
-        if(sign == 1){ for(int s=0; s<Ls; ++s){ axpby_ssp_pplus(out, 0.0, in, 1.0, in, s, s); } }
-        else{ for(int s=0; s<Ls; ++s){ axpby_ssp_pminus(out, 0.0, in, 1.0, in, s, s); } }
-      }
-
-      // EOFA heatbath: see Eqn. (29) of arXiv:1706.05843
-      // We generate a Gaussian noise vector \eta, and then compute
-      //  \Phi = M_{\rm EOFA}^{-1/2} * \eta
-      // using a rational approximation to the inverse square root
-      virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG)
-      {
-        Lop.ImportGauge(U);
-        Rop.ImportGauge(U);
-
-        FermionField eta         (Lop.FermionGrid());
-        FermionField CG_src      (Lop.FermionGrid());
-        FermionField CG_soln     (Lop.FermionGrid());
-        FermionField Forecast_src(Lop.FermionGrid());
-        std::vector<FermionField> tmp(2, Lop.FermionGrid());
-
-        // Use chronological inverter to forecast solutions across poles
-        std::vector<FermionField> prev_solns;
-        if(use_heatbath_forecasting){ prev_solns.reserve(param.degree); }
-        ChronoForecast<AbstractEOFAFermion<Impl>, FermionField> Forecast;
-
-        // Seed with Gaussian noise vector (var = 0.5)
-        RealD scale = std::sqrt(0.5);
-        gaussian(pRNG,eta);
-        eta = eta * scale;
-        printf("Heatbath source vector: <\\eta|\\eta> = %1.15e\n", norm2(eta)); // NOTE(review): raw printf, unlike the GridLogMessage stream used elsewhere in this class
-
-        // \Phi = ( \alpha_{0} + \sum_{l=1}^{N_{p}} \alpha_{l} * \gamma_{l} ) * \eta
-        // with \alpha_{0} = norm, \alpha_{l} = residues[l], \gamma_{l} = 1 / ( 1 + poles[l] )
-        RealD N(PowerNegHalf.norm);
-        for(int k=0; k<param.degree; ++k){ N += PowerNegHalf.residues[k] / ( 1.0 + PowerNegHalf.poles[k] ); }
-        Phi = eta * N;
-
-        // LH terms:
-        // \Phi = \Phi + k \sum_{l=1}^{N_{p}} P_{-} \Omega_{-}^{\dagger} ( H(mf)
-        //          - \gamma_{l} \Delta_{-}(mf,mb) P_{-} )^{-1} \Omega_{-} P_{-} \eta
-        RealD gamma_l(0.0);
-        spProj(eta, tmp[0], -1, Lop.Ls);
-        Lop.Omega(tmp[0], tmp[1], -1, 0);
-        G5R5(CG_src, tmp[1]);
-        tmp[1] = zero;
-        for(int k=0; k<param.degree; ++k){
-          gamma_l = 1.0 / ( 1.0 + PowerNegHalf.poles[k] );
-          Lop.RefreshShiftCoefficients(-gamma_l);
-          if(use_heatbath_forecasting){ // Forecast CG guess using solutions from previous poles
-            Lop.Mdag(CG_src, Forecast_src);
-            CG_soln = Forecast(Lop, Forecast_src, prev_solns);
-            Solver(Lop, CG_src, CG_soln);
-            prev_solns.push_back(CG_soln);
-          } else {
-            CG_soln = zero; // Just use zero as the initial guess
-            Solver(Lop, CG_src, CG_soln);
-          }
-          Lop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
-          tmp[1] = tmp[1] + ( PowerNegHalf.residues[k]*gamma_l*gamma_l*Lop.k ) * tmp[0];
-        }
-        Lop.Omega(tmp[1], tmp[0], -1, 1);
-        spProj(tmp[0], tmp[1], -1, Lop.Ls);
-        Phi = Phi + tmp[1];
-
-        // RH terms:
-        // \Phi = \Phi - k \sum_{l=1}^{N_{p}} P_{+} \Omega_{+}^{\dagger} ( H(mb)
-        //          + \gamma_{l} \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} \eta
-        spProj(eta, tmp[0], 1, Rop.Ls);
-        Rop.Omega(tmp[0], tmp[1], 1, 0);
-        G5R5(CG_src, tmp[1]);
-        tmp[1] = zero;
-        if(use_heatbath_forecasting){ prev_solns.clear(); } // empirically, LH solns don't help for RH solves
-        for(int k=0; k<param.degree; ++k){
-          gamma_l = 1.0 / ( 1.0 + PowerNegHalf.poles[k] );
-          Rop.RefreshShiftCoefficients(-gamma_l*PowerNegHalf.poles[k]);
-          if(use_heatbath_forecasting){
-            Rop.Mdag(CG_src, Forecast_src);
-            CG_soln = Forecast(Rop, Forecast_src, prev_solns);
-            Solver(Rop, CG_src, CG_soln);
-            prev_solns.push_back(CG_soln);
-          } else {
-            CG_soln = zero;
-            Solver(Rop, CG_src, CG_soln);
-          }
-          Rop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
-          tmp[1] = tmp[1] - ( PowerNegHalf.residues[k]*gamma_l*gamma_l*Rop.k ) * tmp[0];
-        }
-        Rop.Omega(tmp[1], tmp[0], 1, 1);
-        spProj(tmp[0], tmp[1], 1, Rop.Ls);
-        Phi = Phi + tmp[1];
-
-        // Reset shift coefficients for energy and force evals
-        Lop.RefreshShiftCoefficients(0.0);
-        Rop.RefreshShiftCoefficients(-1.0);
-      };
-
-      // EOFA action: see Eqn. (10) of arXiv:1706.05843
-      virtual RealD S(const GaugeField& U)
-      {
-        Lop.ImportGauge(U);
-        Rop.ImportGauge(U);
-
-        FermionField spProj_Phi(Lop.FermionGrid());
-        std::vector<FermionField> tmp(2, Lop.FermionGrid());
-
-        // S = <\Phi|\Phi>
-        RealD action(norm2(Phi));
-
-        // LH term: S = S - k <\Phi| P_{-} \Omega_{-}^{\dagger} H(mf)^{-1} \Omega_{-} P_{-} |\Phi>
-        spProj(Phi, spProj_Phi, -1, Lop.Ls);
-        Lop.Omega(spProj_Phi, tmp[0], -1, 0);
-        G5R5(tmp[1], tmp[0]);
-        tmp[0] = zero;
-        Solver(Lop, tmp[1], tmp[0]);
-        Lop.Dtilde(tmp[0], tmp[1]); // We actually solved Cayley preconditioned system: transform back
-        Lop.Omega(tmp[1], tmp[0], -1, 1);
-        action -= Lop.k * innerProduct(spProj_Phi, tmp[0]).real();
-
-        // RH term: S = S + k <\Phi| P_{+} \Omega_{+}^{\dagger} ( H(mb)
-        //               - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} |\Phi>
-        spProj(Phi, spProj_Phi, 1, Rop.Ls);
-        Rop.Omega(spProj_Phi, tmp[0], 1, 0);
-        G5R5(tmp[1], tmp[0]);
-        tmp[0] = zero;
-        Solver(Rop, tmp[1], tmp[0]);
-        Rop.Dtilde(tmp[0], tmp[1]); // transform back, as in the LH term
-        Rop.Omega(tmp[1], tmp[0], 1, 1);
-        action += Rop.k * innerProduct(spProj_Phi, tmp[0]).real();
-
-        return action;
-      };
-
-      // EOFA pseudofermion force: see Eqns. (34)-(36) of arXiv:1706.05843
-      virtual void deriv(const GaugeField& U, GaugeField& dSdU)
-      {
-        Lop.ImportGauge(U);
-        Rop.ImportGauge(U);
-
-        FermionField spProj_Phi      (Lop.FermionGrid());
-        FermionField Omega_spProj_Phi(Lop.FermionGrid());
-        FermionField CG_src          (Lop.FermionGrid());
-        FermionField Chi             (Lop.FermionGrid());
-        FermionField g5_R5_Chi       (Lop.FermionGrid());
-
-        GaugeField force(Lop.GaugeGrid());
-
-        // LH: dSdU = k \chi_{L}^{\dagger} \gamma_{5} R_{5} ( \partial_{x,\mu} D_{w} ) \chi_{L}
-        //     \chi_{L} = H(mf)^{-1} \Omega_{-} P_{-} \Phi
-        spProj(Phi, spProj_Phi, -1, Lop.Ls);
-        Lop.Omega(spProj_Phi, Omega_spProj_Phi, -1, 0);
-        G5R5(CG_src, Omega_spProj_Phi);
-        spProj_Phi = zero; // spProj_Phi is reused as the solver's initial guess / solution
-        Solver(Lop, CG_src, spProj_Phi);
-        Lop.Dtilde(spProj_Phi, Chi);
-        G5R5(g5_R5_Chi, Chi);
-        Lop.MDeriv(force, g5_R5_Chi, Chi, DaggerNo);
-        dSdU = Lop.k * force;
-
-        // RH: dSdU = dSdU - k \chi_{R}^{\dagger} \gamma_{5} R_{5} ( \partial_{x,\mu} D_{w} ) \chi_{R}
-        //     \chi_{R} = ( H(mb) - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} \Phi
-        spProj(Phi, spProj_Phi, 1, Rop.Ls);
-        Rop.Omega(spProj_Phi, Omega_spProj_Phi, 1, 0);
-        G5R5(CG_src, Omega_spProj_Phi);
-        spProj_Phi = zero;
-        Solver(Rop, CG_src, spProj_Phi);
-        Rop.Dtilde(spProj_Phi, Chi);
-        G5R5(g5_R5_Chi, Chi);
-        Lop.MDeriv(force, g5_R5_Chi, Chi, DaggerNo); // NOTE(review): RH force goes through Lop, not Rop -- confirm intended
-        dSdU = dSdU - Rop.k * force;
-      };
-  };
-}}
-
-#endif
@@ -1,209 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef QCD_PSEUDOFERMION_TWO_FLAVOUR_EVEN_ODD_RATIO_H
-#define QCD_PSEUDOFERMION_TWO_FLAVOUR_EVEN_ODD_RATIO_H
-
-namespace Grid{
-  namespace QCD{
-
-    ///////////////////////////////////////
-    // Two flavour ratio
-    ///////////////////////////////////////
-    template<class Impl>
-    class TwoFlavourEvenOddRatioPseudoFermionAction : public Action<typename Impl::GaugeField> {
-    public:
-      INHERIT_IMPL_TYPES(Impl);
-
-    private:
-      FermionOperator<Impl> & NumOp;// numerator operator (V in the comments below)
-      FermionOperator<Impl> & DenOp;// denominator operator (M in the comments below)
-
-      OperatorFunction<FermionField> &DerivativeSolver; // solver used in deriv()
-      OperatorFunction<FermionField> &ActionSolver;     // solver used in refresh() and S()
-
-      FermionField PhiOdd;   // the pseudo fermion field for this trajectory
-      FermionField PhiEven;  // the pseudo fermion field for this trajectory
-
-    public:
-      TwoFlavourEvenOddRatioPseudoFermionAction(FermionOperator<Impl>  &_NumOp,
-                                                FermionOperator<Impl>  &_DenOp,
-                                                OperatorFunction<FermionField> & DS,
-                                                OperatorFunction<FermionField> & AS) :
-      NumOp(_NumOp),
-      DenOp(_DenOp),
-      DerivativeSolver(DS),
-      ActionSolver(AS),
-      PhiOdd(_NumOp.FermionRedBlackGrid()),   // initializers listed in member declaration order
-      PhiEven(_NumOp.FermionRedBlackGrid())
-        {
-          conformable(_NumOp.FermionGrid(), _DenOp.FermionGrid());
-          conformable(_NumOp.FermionRedBlackGrid(), _DenOp.FermionRedBlackGrid());
-          conformable(_NumOp.GaugeGrid(), _DenOp.GaugeGrid());
-          conformable(_NumOp.GaugeRedBlackGrid(), _DenOp.GaugeRedBlackGrid());
-        };
-
-      virtual std::string action_name(){return "TwoFlavourEvenOddRatioPseudoFermionAction";}
-
-      virtual std::string LogParameters(){
-        std::stringstream sstream;
-        sstream << GridLogMessage << "["<<action_name()<<"] has no parameters" << std::endl;
-        return sstream.str();
-      }
-
-      // Draws Gaussian noise eta and builds PhiOdd / PhiEven from it, each scaled by sqrt(0.5).
-      virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
-
-        // P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi}
-        //
-        // NumOp == V
-        // DenOp == M
-        //
-        // Take phi_o = Vpcdag^{-1} Mpcdag eta_o  ; eta_o = Mpcdag^{-1} Vpcdag Phi
-        //
-        // P(eta_o) = e^{- eta_o^dag eta_o}
-        //
-        // e^{-x^2/(2 sig^2)} => sig^2 = 0.5.
-        //
-        RealD scale = std::sqrt(0.5);
-
-        FermionField eta    (NumOp.FermionGrid());
-        FermionField etaOdd (NumOp.FermionRedBlackGrid());
-        FermionField etaEven(NumOp.FermionRedBlackGrid());
-        FermionField tmp    (NumOp.FermionRedBlackGrid());
-
-        gaussian(pRNG,eta);
-
-        pickCheckerboard(Even,etaEven,eta);
-        pickCheckerboard(Odd,etaOdd,eta);
-
-        NumOp.ImportGauge(U);
-        DenOp.ImportGauge(U);
-
-        SchurDifferentiableOperator<Impl> Mpc(DenOp);
-        SchurDifferentiableOperator<Impl> Vpc(NumOp);
-
-        // Odd det factors
-        Mpc.MpcDag(etaOdd,PhiOdd);
-        tmp=zero;
-        ActionSolver(Vpc,PhiOdd,tmp);
-        Vpc.Mpc(tmp,PhiOdd);
-
-        // Even det factors
-        DenOp.MooeeDag(etaEven,tmp);
-        NumOp.MooeeInvDag(tmp,PhiEven);
-
-        PhiOdd =PhiOdd*scale;
-        PhiEven=PhiEven*scale;
-
-      };
-
-      //////////////////////////////////////////////////////
-      // S = phi^dag V (Mdag M)^-1 Vdag phi
-      //////////////////////////////////////////////////////
-      virtual RealD S(const GaugeField &U) {
-
-        NumOp.ImportGauge(U);
-        DenOp.ImportGauge(U);
-
-        SchurDifferentiableOperator<Impl> Mpc(DenOp);
-        SchurDifferentiableOperator<Impl> Vpc(NumOp);
-
-        FermionField X(NumOp.FermionRedBlackGrid());
-        FermionField Y(NumOp.FermionRedBlackGrid());
-
-        Vpc.MpcDag(PhiOdd,Y);           // Y= Vdag phi
-        X=zero;
-        ActionSolver(Mpc,Y,X);          // X= (MdagM)^-1 Vdag phi
-        //Mpc.Mpc(X,Y);                   // Y=  Mdag^-1 Vdag phi
-        // Multiply by Ydag
-        RealD action = real(innerProduct(Y,X));
-
-        //RealD action = norm2(Y);
-
-        // The EE factorised block; normally can replace with zero if det is constant (gauge field independent)
-        // Only really clover term that creates this. Leave the EE portion as a future to do to make most
-        // rapid progress on DWF for now.
-        //
-        NumOp.MooeeDag(PhiEven,X);
-        DenOp.MooeeInvDag(X,Y);
-        action = action + norm2(Y);
-
-        return action;
-      };
-
-      //////////////////////////////////////////////////////
-      // dS/du = phi^dag dV (Mdag M)^-1 V^dag  phi
-      //       - phi^dag V (Mdag M)^-1 [ Mdag dM + dMdag M ]  (Mdag M)^-1 V^dag  phi
-      //       + phi^dag V (Mdag M)^-1 dV^dag  phi
-      //////////////////////////////////////////////////////
-      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
-
-        NumOp.ImportGauge(U);
-        DenOp.ImportGauge(U);
-
-        SchurDifferentiableOperator<Impl> Mpc(DenOp);
-        SchurDifferentiableOperator<Impl> Vpc(NumOp);
-
-        FermionField  X(NumOp.FermionRedBlackGrid());
-        FermionField  Y(NumOp.FermionRedBlackGrid());
-
-        // This assignment is necessary to be compliant with the HMC grids
-        GaugeField force(dSdU._grid);
-
-        //Y=Vdag phi
-        //X = (Mdag M)^-1 V^dag phi
-        //Y = (Mdag)^-1 V^dag  phi
-        Vpc.MpcDag(PhiOdd,Y);          // Y= Vdag phi
-        X=zero;
-        DerivativeSolver(Mpc,Y,X);     // X= (MdagM)^-1 Vdag phi
-        Mpc.Mpc(X,Y);                  // Y=  Mdag^-1 Vdag phi
-
-        // phi^dag V (Mdag M)^-1 dV^dag  phi
-        Vpc.MpcDagDeriv(force , X, PhiOdd );   dSdU = force;
-
-        // phi^dag dV (Mdag M)^-1 V^dag  phi
-        Vpc.MpcDeriv(force , PhiOdd, X );      dSdU = dSdU+force;
-
-        //    -    phi^dag V (Mdag M)^-1 Mdag dM   (Mdag M)^-1 V^dag  phi
-        //    -    phi^dag V (Mdag M)^-1 dMdag M   (Mdag M)^-1 V^dag  phi
-        Mpc.MpcDeriv(force,Y,X);              dSdU = dSdU-force;
-        Mpc.MpcDagDeriv(force,X,Y);           dSdU = dSdU-force;
-
-        // FIXME No force contribution from EvenEven assumed here
-        // Needs a fix for clover.
-        assert(NumOp.ConstEE() == 1);
-        assert(DenOp.ConstEE() == 1);
-
-        dSdU = -dSdU;
-
-      };
-    };
-  }
-}
-#endif
@@ -1,50 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/gauge/Scalar.h
-
-Copyright (C) 2017
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef GRID_QCD_SCALAR_H
-#define GRID_QCD_SCALAR_H
-
-#include <Grid/qcd/action/scalar/ScalarImpl.h>
-#include <Grid/qcd/action/scalar/ScalarAction.h>
-#include <Grid/qcd/action/scalar/ScalarInteractionAction.h>
-
-namespace Grid {
-namespace QCD {
-  // Short names for ScalarAction over the R/F/D scalar implementations (presumably default/single/double precision -- confirm).
-  typedef ScalarAction<ScalarImplR>                 ScalarActionR;
-  typedef ScalarAction<ScalarImplF>                 ScalarActionF;
-  typedef ScalarAction<ScalarImplD>                 ScalarActionD;
-  // Alias templates: ScalarInteractionAction over the NxN adjoint implementations, parameterised by Colours and Dimensions.
-  template <int Colours, int Dimensions> using ScalarAdjActionR = ScalarInteractionAction<ScalarNxNAdjImplR<Colours>, Dimensions>;
-  template <int Colours, int Dimensions> using ScalarAdjActionF = ScalarInteractionAction<ScalarNxNAdjImplF<Colours>, Dimensions>;
-  template <int Colours, int Dimensions> using ScalarAdjActionD = ScalarInteractionAction<ScalarNxNAdjImplD<Colours>, Dimensions>;
-  
-}
-}
-
-#endif  // GRID_QCD_SCALAR_H
@@ -1,83 +0,0 @@
-/*************************************************************************************
-
-  Grid physics library, www.github.com/paboyle/Grid
-
-  Source file: ./lib/qcd/action/scalar/ScalarAction.h
-
-  Copyright (C) 2015
-
-  Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-  Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-  Author: neo <cossu@post.kek.jp>
-  Author: paboyle <paboyle@ph.ed.ac.uk>
-
-  This program is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation; either version 2 of the License, or
-  (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License along
-  with this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-  See the full license in the file "LICENSE" in the top level distribution
-directory
-  *************************************************************************************/
-/*  END LEGAL */
-
-#ifndef SCALAR_ACTION_H
-#define SCALAR_ACTION_H
-
-namespace Grid {
-  // FIXME drop the QCD namespace everywhere here
-
-template <class Impl>
-class ScalarAction : public QCD::Action<typename Impl::Field> {
- public:
-    INHERIT_FIELD_TYPES(Impl);
-    // Scalar-field action with a quadratic (mass_square) and a quartic (lambda) coupling.
- private:
-    RealD mass_square;
-    RealD lambda;
-
- public:
-    ScalarAction(RealD ms, RealD l) : mass_square(ms), lambda(l) {}
-    // Returns two GridLogMessage-prefixed lines reporting lambda and mass_square.
-    virtual std::string LogParameters() {
-      std::stringstream sstream;
-      sstream << GridLogMessage << "[ScalarAction] lambda      : " << lambda      << std::endl;
-      sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
-      return sstream.str();
-    }
-    virtual std::string action_name() {return "ScalarAction";}
-
-    virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}  // noop as no pseudoferms
-    // S = (mass_square/2 + Nd) * sumphisquared(p) + (lambda/24) * sumphifourth(p) + sumphider(p)
-    virtual RealD S(const Field &p) {
-      return (mass_square * 0.5 + QCD::Nd) * ScalarObs<Impl>::sumphisquared(p) +
-    (lambda / 24.) * ScalarObs<Impl>::sumphifourth(p) +
-    ScalarObs<Impl>::sumphider(p);
-    };
-    // force = (mass_square + 2*Nd) * p + (lambda/6) * p2 * p - sum over mu of the two Cshift neighbours of p
-    virtual void deriv(const Field &p,
-                       Field &force) {
-      Field tmp(p._grid);
-      Field p2(p._grid);
-      ScalarObs<Impl>::phisquared(p2, p);  // fills p2 from p; presumably the site-wise square -- confirm
-      tmp = -(Cshift(p, 0, -1) + Cshift(p, 0, 1));
-      for (int mu = 1; mu < QCD::Nd; mu++) tmp -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
-
-      force =+(mass_square + 2. * QCD::Nd) * p + (lambda / 6.) * p2 * p + tmp;  // "=+" is assignment of a unary-plus expression, not "+="
-    }
-};
-
-
-
-}  // namespace Grid
-
-#endif // SCALAR_ACTION_H
@@ -1,263 +0,0 @@
-#ifndef SCALAR_IMPL
-#define SCALAR_IMPL
-
-
-namespace Grid {
-  //namespace QCD {
-
-// Implementation policy for a single-component scalar field: the site object
-// is one Simd value wrapped in three iScalar tensor levels, and every field
-// alias (complex, fermion, propagator) maps onto the same lattice type.
-template <class S>
-class ScalarImplTypes {
- public:
-    using Simd = S;
-
-    template <typename vtype>
-    using iImplField = iScalar<iScalar<iScalar<vtype> > >;
-
-    using SiteField      = iImplField<Simd>;
-    using SitePropagator = SiteField;
-    using SiteComplex    = SiteField;
-
-    using Field           = Lattice<SiteField>;
-    using ComplexField    = Field;
-    using FermionField    = Field;
-    using PropagatorField = Field;
-
-    // Conjugate momenta are drawn with gaussian().
-    static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
-      gaussian(pRNG, P);
-    }
-
-    // No projection is applied: the force is returned as is.
-    static inline Field projectForce(Field& P){
-      return P;
-    }
-
-    // Molecular-dynamics field update: U <- U + ep * P.
-    static inline void update_field(Field& P, Field& U, double ep) {
-      U += P*ep;
-    }
-
-    // Returns -sum(trace(U*U))/2.
-    static inline RealD FieldSquareNorm(Field& U) {
-      return (- sum(trace(U*U))/2.0);
-    }
-
-    // Hot and tepid starts both fill U with gaussian noise.
-    static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
-      gaussian(pRNG, U);
-    }
-
-    static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
-      gaussian(pRNG, U);
-    }
-
-    // Cold start: U is set to 1.0 everywhere; the RNG is not used.
-    static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
-      U = 1.0;
-    }
-
-    // Fills `out` with 1 / (m^2 + sum_mu k_mu^2), where
-    // k_mu = 2 sin(pi * c_mu / L_mu), L_mu is the mu-th entry of the grid's
-    // _fdimensions and c_mu is whatever LatticeCoordinate writes for
-    // direction mu (presumably the site coordinate -- confirm).
-    static void MomentumSpacePropagator(Field &out, RealD m)
-    {
-      GridBase *grid = out._grid;
-      Field khat(grid);
-      Field unity(grid);
-      const unsigned int nd = grid->_ndimension;
-      std::vector<int> &extent = grid->_fdimensions;
-
-      unity = Complex(1.0,0.0);
-      out   = m*m;
-      for(int mu = 0; mu < nd; mu++)
-      {
-        Real twoPiL = M_PI*2./extent[mu];
-
-        LatticeCoordinate(khat,mu);
-        khat = 2.*sin(.5*twoPiL*khat);
-        out  = out + khat*khat;
-      }
-      out = unity/out;
-    }
-
-    // out = backward FFT of ( forward FFT of `in`, multiplied by momKernel ).
-    static void FreePropagator(const Field &in, Field &out,
-                               const Field &momKernel)
-    {
-      FFT   transform((GridCartesian *)in._grid);
-      Field inMomentum(in._grid);
-
-      transform.FFT_all_dim(inMomentum, in, FFT::forward);
-      inMomentum = inMomentum*momKernel;
-      transform.FFT_all_dim(out, inMomentum, FFT::backward);
-    }
-
-    // Convenience overload: builds the kernel for mass m, then applies it.
-    static void FreePropagator(const Field &in, Field &out, RealD m)
-    {
-      Field kernel(in._grid);
-
-      MomentumSpacePropagator(kernel, m);
-      FreePropagator(in, out, kernel);
-    }
-
-  };
-
-  // Fourier acceleration needs the mass parameter FFT_MASS at compile time.
-  #if defined(USE_FFT_ACCELERATION) && !defined(FFT_MASS)
-  #error  "USE_FFT_ACCELERATION is defined but not FFT_MASS"
-  #endif
-  
-  template <class S, unsigned int N>  // Implementation policy for scalar fields whose site object is an N x N matrix (see iImplField)
-  class ScalarAdjMatrixImplTypes {
-  public:
-    typedef S Simd;
-    typedef QCD::SU<N> Group;  // supplies the Lie-algebra gaussian generators used below
-
-    template <typename vtype>
-    using iImplField   = iScalar<iScalar<iMatrix<vtype, N>>>;  // matrix-valued site object
-    template <typename vtype>
-    using iImplComplex = iScalar<iScalar<iScalar<vtype>>>;  // scalar-valued site object (momentum-space kernels)
-
-    typedef iImplField<Simd>   SiteField;
-    typedef SiteField          SitePropagator;
-    typedef iImplComplex<Simd> SiteComplex;
-
-    typedef Lattice<SiteField>   Field;
-    typedef Lattice<SiteComplex> ComplexField;
-    typedef Field                FermionField;
-    typedef Field                PropagatorField;
-
-    static void MomentaSquare(ComplexField &out)  // ADDS sum_mu (2 sin(pi c_mu / L_mu))^2 to `out`; the caller must initialise `out` first
-    {
-      GridBase *grid = out._grid;
-      const std::vector<int> &l = grid->FullDimensions();
-      ComplexField kmu(grid);
-
-      for (int mu = 0; mu < grid->Nd(); mu++)
-      {
-        Real twoPiL = M_PI * 2.0 / l[mu];
-        LatticeCoordinate(kmu, mu);  // presumably fills kmu with the site coordinate along mu -- confirm
-        kmu = 2.0 * sin(0.5 * twoPiL * kmu);
-        out += kmu * kmu;  // accumulates, does not overwrite
-      }
-    }
-
-    static void MomentumSpacePropagator(ComplexField &out, RealD m)  // out = 1 / (m^2 + MomentaSquare)
-    {
-      GridBase *grid = out._grid;
-      ComplexField one(grid);
-      one = Complex(1.0, 0.0);
-      out = m * m;  // seeds the accumulator that MomentaSquare adds to
-      MomentaSquare(out);
-      out = one / out;
-    }
-
-    static inline void generate_momenta(Field &P, GridParallelRNG &pRNG)  // draws the conjugate momenta P
-    {
-#ifndef USE_FFT_ACCELERATION
-      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
-#else
-
-      Field Pgaussian(P._grid), Pp(P._grid);
-      ComplexField p2(P._grid); p2 = zero;  // zeroed because MomentaSquare accumulates into it
-      RealD M = FFT_MASS;
-      
-      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pgaussian);
-
-      FFT theFFT((GridCartesian*)P._grid);
-      theFFT.FFT_all_dim(Pp, Pgaussian, FFT::forward);
-      MomentaSquare(p2);
-      p2 += M * M;
-      p2 = sqrt(p2);  // p2 now holds sqrt(M^2 + MomentaSquare)
-      Pp *= p2;  // rescale each momentum mode of the gaussian draw
-      theFFT.FFT_all_dim(P, Pp, FFT::backward);
-
-#endif //USE_FFT_ACCELERATION
-    }
-
-    static inline Field projectForce(Field& P) {return P;}  // no projection: returns P unchanged
-
-    static inline void update_field(Field &P, Field &U, double ep)  // U += ep * P, with P filtered in momentum space when USE_FFT_ACCELERATION is defined
-    {
-#ifndef USE_FFT_ACCELERATION
-      double t0=usecond(); 
-      U += P * ep;
-      double t1=usecond();
-      double total_time = (t1-t0)/1e6;  // microseconds -> seconds
-      std::cout << GridLogIntegrator << "Total time for updating field (s)       : " << total_time << std::endl; 
-#else
-      // FFT transform P(x) -> P(p)
-      // multiply by 1/(M^2+p^2), with M taken from the FFT_MASS macro
-      // P'(p) = P(p)/(M^2+p^2)
-      // Transform back -> P'(x)
-      // U += P'(x)*ep
-
-      Field Pp(U._grid), P_FFT(U._grid);     
-      static ComplexField p2(U._grid);  // NOTE(review): function-local static, constructed once on the grid of the first call and reused afterwards
-      RealD M = FFT_MASS;
-      
-      FFT theFFT((GridCartesian*)U._grid);
-      theFFT.FFT_all_dim(Pp, P, FFT::forward);
-
-      static bool first_call = true;
-      if (first_call)
-      {
-        // compute the kernel only on the first call; later calls reuse the cached p2
-        MomentumSpacePropagator(p2, M);
-        first_call = false;
-      }
-      Pp *= p2;
-      theFFT.FFT_all_dim(P_FFT, Pp, FFT::backward);
-      U += P_FFT * ep;
-
-#endif //USE_FFT_ACCELERATION
-    }
-
-    static inline RealD FieldSquareNorm(Field &U)  // real part of sum(trace(U*U)), or its momentum-space weighted analogue under USE_FFT_ACCELERATION
-    {
-#ifndef USE_FFT_ACCELERATION
-      return (TensorRemove(sum(trace(U * U))).real());
-#else
-      // In case of Fourier acceleration we have to:
-      // compute adj(U(p))*U(p)/(M^2+p^2)   (Parseval theorem)
-      // 1 FFT needed U(x) -> U(p)
-      // M is taken from the FFT_MASS macro
-
-      FFT theFFT((GridCartesian*)U._grid);
-      Field Up(U._grid);
-
-      theFFT.FFT_all_dim(Up, U, FFT::forward);
-      RealD M = FFT_MASS;
-      ComplexField p2(U._grid);
-      MomentumSpacePropagator(p2, M);  // recomputed on every call (not cached, unlike update_field)
-      Field Up2 = Up * p2;
-      // from the definition of the DFT we need to divide by the volume
-      return (-TensorRemove(sum(trace(adj(Up) * Up2))).real() / U._grid->gSites());
-#endif //USE_FFT_ACCELERATION
-    }
-
-    static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {  // random start from the Lie-algebra gaussian generator
-      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U);
-    }
-
-    static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {  // same generator with an extra argument of 0.01 (presumably a scale factor -- confirm)
-      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U, 0.01);
-    }
-
-    static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {  // U = zero; the RNG is not used
-      U = zero;
-    }
-
-  };
-
-
-
-
-  // Single-component scalar implementations, one per Simd vector type.
-  using ScalarImplR  = ScalarImplTypes<vReal>;
-  using ScalarImplF  = ScalarImplTypes<vRealF>;
-  using ScalarImplD  = ScalarImplTypes<vRealD>;
-  using ScalarImplCR = ScalarImplTypes<vComplex>;
-  using ScalarImplCF = ScalarImplTypes<vComplexF>;
-  using ScalarImplCD = ScalarImplTypes<vComplexD>;
-
-  // Matrix-valued implementations with the matrix size fixed to QCD::Nc.
-  using ScalarAdjImplR = ScalarAdjMatrixImplTypes<vComplex,  QCD::Nc>;
-  using ScalarAdjImplF = ScalarAdjMatrixImplTypes<vComplexF, QCD::Nc>;
-  using ScalarAdjImplD = ScalarAdjMatrixImplTypes<vComplexD, QCD::Nc>;
-
-  // Matrix-valued implementations with the matrix size chosen by the caller.
-  template <int Colours>
-  using ScalarNxNAdjImplR = ScalarAdjMatrixImplTypes<vComplex, Colours>;
-  template <int Colours>
-  using ScalarNxNAdjImplF = ScalarAdjMatrixImplTypes<vComplexF, Colours>;
-  template <int Colours>
-  using ScalarNxNAdjImplD = ScalarAdjMatrixImplTypes<vComplexD, Colours>;
-
-  //}
-}
-
-#endif
@@ -1,208 +0,0 @@
-/*************************************************************************************
-
-  Grid physics library, www.github.com/paboyle/Grid
-
-  Source file: ./lib/qcd/action/gauge/WilsonGaugeAction.h
-
-  Copyright (C) 2015
-
-  Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-  This program is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation; either version 2 of the License, or
-  (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License along
-  with this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-  See the full license in the file "LICENSE" in the top level distribution
-directory
-  *************************************************************************************/
-/*  END LEGAL */
-
-#ifndef SCALAR_INT_ACTION_H
-#define SCALAR_INT_ACTION_H
-
-// Note: this action can completely absorb the ScalarAction for real float fields
-// use the scalarObjs to generalise the structure
-
-namespace Grid
-{
-// FIXME drop the QCD namespace everywhere here
-
-// Action for a matrix-valued scalar field on an Ndim-dimensional lattice.
-//
-// Per site the action density is
-//   (2*Ndim + mass_square) * p^2 - lambda * p^4
-//     - sum over the first Ndim stencil points of (n*p + p*n),  n = neighbour value
-// and S() returns  -Re(sum(trace(density))) * N / g,  with N = Impl::Group::Dimension.
-// Neighbours are fetched through a CartesianStencil (halo exchange) rather
-// than through Cshift.
-template <class Impl, int Ndim>
-class ScalarInteractionAction : public QCD::Action<typename Impl::Field>
-{
-public:
-  INHERIT_FIELD_TYPES(Impl);
-
-private:
-  RealD mass_square;
-  RealD lambda;
-  RealD g;
-  const unsigned int N = Impl::Group::Dimension;
-
-  typedef typename Field::vector_object vobj;
-  typedef CartesianStencil<vobj, vobj> Stencil;
-
-  SimpleCompressor<vobj> compressor;
-  int npoint = 2 * Ndim;
-  std::vector<int> directions;    // stencil point -> lattice direction
-  std::vector<int> displacements; // stencil point -> displacement (+1 or -1)
-
-public:
-  // ms: mass squared, l: quartic coupling, gval: overall coupling g.
-  // Stencil points [0, Ndim) carry displacement +1 and points [Ndim, 2*Ndim)
-  // carry displacement -1, both along direction mu = point % Ndim.
-  // The initialiser list follows the member declaration order.
-  ScalarInteractionAction(RealD ms, RealD l, RealD gval)
-      : mass_square(ms), lambda(l), g(gval), directions(2 * Ndim, 0), displacements(2 * Ndim, 0)
-  {
-    for (int mu = 0; mu < Ndim; mu++)
-    {
-      directions[mu] = mu;
-      directions[mu + Ndim] = mu;
-      displacements[mu] = 1;
-      displacements[mu + Ndim] = -1;
-    }
-  }
-
-  // Returns the three couplings formatted as log lines.
-  virtual std::string LogParameters()
-  {
-    std::stringstream sstream;
-    sstream << GridLogMessage << "[ScalarAction] lambda      : " << lambda << std::endl;
-    sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
-    sstream << GridLogMessage << "[ScalarAction] g           : " << g << std::endl;
-    return sstream.str();
-  }
-
-  virtual std::string action_name() { return "ScalarAction"; }
-
-  // Intentionally empty: nothing to refresh for this action.
-  virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
-
-  // Evaluates the action on field p (see the class comment for the formula).
-  virtual RealD S(const Field &p)
-  {
-    assert(p._grid->Nd() == Ndim);
-    // NOTE(review): function-local static, so the stencil is built once from the
-    // grid and point tables of the first call and reused by every later call.
-    static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
-    phiStencil.HaloExchange(p, compressor);
-    Field action(p._grid), pshift(p._grid), phisquared(p._grid);
-    phisquared = p * p;
-    action = (2.0 * Ndim + mass_square) * phisquared - lambda * phisquared * phisquared;
-    // Only stencil points [0, Ndim) are visited: one neighbour per direction.
-    for (int mu = 0; mu < Ndim; mu++)
-    {
-      //  pshift = Cshift(p, mu, +1);  // not efficient, implement with stencils
-      parallel_for(int i = 0; i < p._grid->oSites(); i++)
-      {
-        int permute_type;
-        StencilEntry *SE;
-        vobj temp2;
-        const vobj *temp, *t_p;
-
-        SE = phiStencil.GetEntry(permute_type, mu, i);
-        t_p = &p._odata[i];
-        if (SE->_is_local)
-        {
-          // Neighbour is held locally; it may still need a permute.
-          temp = &p._odata[SE->_offset];
-          if (SE->_permute)
-          {
-            permute(temp2, *temp, permute_type);
-            action._odata[i] -= temp2 * (*t_p) + (*t_p) * temp2;
-          }
-          else
-          {
-            action._odata[i] -= (*temp) * (*t_p) + (*t_p) * (*temp);
-          }
-        }
-        else
-        {
-          // Neighbour was received by the halo exchange.
-          action._odata[i] -= phiStencil.CommBuf()[SE->_offset] * (*t_p) + (*t_p) * phiStencil.CommBuf()[SE->_offset];
-        }
-      }
-      //  action -= pshift*p + p*pshift;
-    }
-    // NB the trace in the algebra is normalised to 1/2
-    // minus sign coming from the antihermitian fields
-    return -(TensorRemove(sum(trace(action)))).real() * N / g;
-  }
-
-  // Computes force = [ (2*Ndim + mass_square) * p - 2*lambda * p^3
-  //                    - sum of all 2*Ndim stencil neighbours of p ] * N / g
-  // and logs timing and flop-rate estimates through GridLogIntegrator.
-  virtual void deriv(const Field &p, Field &force)
-  {
-    double t0 = usecond();
-    assert(p._grid->Nd() == Ndim);
-    force = (2. * Ndim + mass_square) * p - 2. * lambda * p * p * p;
-    double interm_t = usecond();
-
-    // move this outside
-    // NOTE(review): function-local static, built once from the first call's grid.
-    static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
-
-    phiStencil.HaloExchange(p, compressor);
-    double halo_t = usecond();
-    int chunk = 128;
-    //for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
-
-    // inverting the order of the loops slows down the code(! g++ 7)
-    // cannot try to reduce the number of  force writes by factor npoint...
-    // use cache blocking
-    for (int point = 0; point < npoint; point++)
-    {
-
-#pragma omp parallel
-      {
-        // Per-thread scratch, declared inside the parallel region.
-        int permute_type;
-        StencilEntry *SE;
-        const vobj *temp;
-
-#pragma omp for schedule(static, chunk)
-        for (int i = 0; i < p._grid->oSites(); i++)
-        {
-          SE = phiStencil.GetEntry(permute_type, point, i);
-          // prefetch next p?
-
-          if (SE->_is_local)
-          {
-            temp = &p._odata[SE->_offset];
-
-            if (SE->_permute)
-            {
-              vobj temp2;
-              permute(temp2, *temp, permute_type);
-              force._odata[i] -= temp2;
-            }
-            else
-            {
-              force._odata[i] -= *temp; // slow part. Dominated by this read/write (BW)
-            }
-          }
-          else
-          {
-            force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
-          }
-        }
-      }
-    }
-    force *= N / g;
-
-    double t1 = usecond();
-    // usecond() differences are converted from microseconds to seconds.
-    double total_time = (t1 - t0) / 1e6;
-    double interm_time = (interm_t - t0) / 1e6;
-    double halo_time = (halo_t - interm_t) / 1e6;
-    double stencil_time = (t1 - halo_t) / 1e6;
-    std::cout << GridLogIntegrator << "Total time for force computation (s)       : " << total_time << std::endl;
-    std::cout << GridLogIntegrator << "Intermediate time for force computation (s): " << interm_time << std::endl;
-    std::cout << GridLogIntegrator << "Halo time in force computation (s)         : " << halo_time << std::endl;
-    std::cout << GridLogIntegrator << "Stencil time in force computation (s)      : " << stencil_time << std::endl;
-    // Do the flop count in double precision from the start: the product of the
-    // site count and the per-site cost can exceed 32 bits on large lattices.
-    const double sites = static_cast<double>(p._grid->gSites());
-    const double dim = static_cast<double>(N);
-    double flops = sites * (14. * dim * dim * dim + 18. * dim * dim + 2.);
-    double flops_no_stencil = sites * (14. * dim * dim * dim + 6. * dim * dim + 2.);
-    double Gflops = flops / (total_time * 1e9);
-    double Gflops_no_stencil = flops_no_stencil / (interm_time * 1e9);
-    std::cout << GridLogIntegrator << "Flops: " << flops << "  - Gflop/s : " << Gflops << std::endl;
-    std::cout << GridLogIntegrator << "Flops NS: " << flops_no_stencil << "  - Gflop/s NS: " << Gflops_no_stencil << std::endl;
-  }
-};
-
-} // namespace Grid
-
-#endif // SCALAR_INT_ACTION_H
@@ -1,219 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/hmc/GenericHmcRunner.h
-
-Copyright (C) 2015
-
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-  See the full license in the file "LICENSE" in the top level distribution
-  directory
-  *************************************************************************************/
-/*  END LEGAL */
-#ifndef GRID_GENERIC_HMC_RUNNER
-#define GRID_GENERIC_HMC_RUNNER
-
-#include <unordered_map>
-
-namespace Grid {
-namespace QCD {
-
-
-// very ugly here but possibly resolved if we had a base Reader class
-// Abstract interface of an HMC runner that is configured from a ReaderClass.
-template < class ReaderClass >
-class HMCRunnerBase {
-public:
-  // Virtual so that a runner can be destroyed safely through a base pointer.
-  virtual ~HMCRunnerBase() = default;
-
-  // Runs the HMC.
-  virtual void Run() = 0;
-  // Sets the runner up from the given reader.
-  virtual void initialize(ReaderClass& ) = 0;
-};
-
-
-// Generic HMC driver.
-//
-// Template parameters:
-//   Implementation        - field implementation policy (start configurations, field types)
-//   Integrator            - molecular-dynamics integrator template
-//   RepresentationsPolicy - passed to the integrator and the action set (default NoHirep)
-//   ReaderClass           - reader type accepted by initialize() (default XmlReader)
-//
-// Usage: fill Parameters / TheAction directly or through initialize() and
-// ReadCommandLine(), then call Run().
-template <class Implementation,
-          template <typename, typename, typename> class Integrator,
-          class RepresentationsPolicy = NoHirep, class ReaderClass = XmlReader>
-class HMCWrapperTemplate: public HMCRunnerBase<ReaderClass> {
- public:
-  INHERIT_FIELD_TYPES(Implementation);
-  typedef Implementation ImplPolicy;  // visible from outside
-  template <typename S = NoSmearing<Implementation> >
-  using IntegratorType = Integrator<Implementation, S, RepresentationsPolicy>;
-
-  HMCparameters Parameters;
-  std::string ParameterFile;
-  HMCResourceManager<Implementation> Resources;
-
-  // The set of actions (keep here for lower level users, for now)
-  ActionSet<Field, RepresentationsPolicy> TheAction;
-
-  HMCWrapperTemplate() = default;
-
-  HMCWrapperTemplate(HMCparameters Par){
-    Parameters = Par;
-  }
-
-  // Initialises the resources from the reader and fetches the action set.
-  void initialize(ReaderClass & TheReader){
-    std::cout  << "Initialization of the HMC" << std::endl;
-    Resources.initialize(TheReader);
-
-    // eventually add smearing
-
-    Resources.GetActionSet(TheAction);    
-  }
-
-
-  // Overrides Parameters / ParameterFile from the command line. Recognised
-  // options: --StartingType, --StartingTrajectory, --Trajectories,
-  // --Thermalizations, --ParameterFile. Options that are absent leave the
-  // corresponding value untouched; an invalid value terminates the program.
-  void ReadCommandLine(int argc, char **argv) {
-    std::string arg;
-
-    if (GridCmdOptionExists(argv, argv + argc, "--StartingType")) {
-      arg = GridCmdOptionPayload(argv, argv + argc, "--StartingType");
-
-      if (arg != "HotStart" && arg != "ColdStart" && arg != "TepidStart" &&
-          arg != "CheckpointStart") {
-        std::cout << GridLogError << "Unrecognized option in --StartingType\n";
-        std::cout
-            << GridLogError
-            << "Valid [HotStart, ColdStart, TepidStart, CheckpointStart]\n";
-        exit(1);
-      }
-      Parameters.StartingType = arg;
-    }
-
-    int value = 0;
-    if (ReadIntOption(argc, argv, "--StartingTrajectory", value)) {
-      Parameters.StartTrajectory = value;
-    }
-    if (ReadIntOption(argc, argv, "--Trajectories", value)) {
-      Parameters.Trajectories = value;
-    }
-    if (ReadIntOption(argc, argv, "--Thermalizations", value)) {
-      Parameters.NoMetropolisUntil = value;
-    }
-
-    if (GridCmdOptionExists(argv, argv + argc, "--ParameterFile")) {
-      arg = GridCmdOptionPayload(argv, argv + argc, "--ParameterFile");
-      ParameterFile = arg;
-    }
-  }
-
-
-  // Runs the HMC with a caller-supplied smearing policy.
-  template <class SmearingPolicy>
-  void Run(SmearingPolicy &S) {
-    Runner(S);
-  }
-
-  // Runs the HMC without smearing.
-  void Run(){
-    NoSmearing<Implementation> S;
-    Runner(S);
-  }
-
-  //////////////////////////////////////////////////////////////////
-
- private:
-  // Reads the integer payload of command-line option `flag`.
-  // Returns false (value untouched) when the option is absent, true with the
-  // first parsed integer stored in `value` otherwise. Terminates the program
-  // if the payload yields no integer, instead of indexing an empty vector.
-  static bool ReadIntOption(int argc, char **argv, const char *flag, int &value) {
-    if (!GridCmdOptionExists(argv, argv + argc, flag)) {
-      return false;
-    }
-    std::string payload = GridCmdOptionPayload(argv, argv + argc, flag);
-    std::vector<int> ivec(0);
-    GridCmdOptionIntVector(payload, ivec);
-    if (ivec.empty()) {
-      std::cout << GridLogError << "Missing or invalid integer value for " << flag << "\n";
-      exit(1);
-    }
-    value = ivec[0];
-    return true;
-  }
-
-  // Builds the integrator, prepares the starting configuration selected by
-  // Parameters.StartingType, and evolves the HMC.
-  template <class SmearingPolicy>
-  void Runner(SmearingPolicy &Smearing) {
-    auto UGrid = Resources.GetCartesian();
-    Resources.AddRNGs();
-    Field U(UGrid);
-
-    // Can move this outside?
-    typedef IntegratorType<SmearingPolicy> TheIntegrator;
-    TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing);
-
-    // NOTE(review): a StartingType outside these four values falls through
-    // every branch and U is used without any of these initialisations.
-    if (Parameters.StartingType == "HotStart") {
-      // Hot start
-      Resources.SeedFixedIntegers();
-      Implementation::HotConfiguration(Resources.GetParallelRNG(), U);
-    } else if (Parameters.StartingType == "ColdStart") {
-      // Cold start
-      Resources.SeedFixedIntegers();
-      Implementation::ColdConfiguration(Resources.GetParallelRNG(), U);
-    } else if (Parameters.StartingType == "TepidStart") {
-      // Tepid start
-      Resources.SeedFixedIntegers();
-      Implementation::TepidConfiguration(Resources.GetParallelRNG(), U);
-    } else if (Parameters.StartingType == "CheckpointStart") {
-      // CheckpointRestart
-      Resources.GetCheckPointer()->CheckpointRestore(Parameters.StartTrajectory, U,
-                                   Resources.GetSerialRNG(),
-                                   Resources.GetParallelRNG());
-    }
-
-    Smearing.set_Field(U);
-
-    HybridMonteCarlo<TheIntegrator> HMC(Parameters, MDynamics,
-                                        Resources.GetSerialRNG(),
-                                        Resources.GetParallelRNG(), 
-                                        Resources.GetObservables(), U);
-
-    // Run it
-    HMC.evolve();
-  }
-};
-
-// Runners on the PeriodicGimpl{R,F,D} implementations; the integrator is the
-// template argument.
-template <template <typename, typename, typename> class Integrator>
-using GenericHMCRunner  = HMCWrapperTemplate<PeriodicGimplR, Integrator>;
-
-template <template <typename, typename, typename> class Integrator>
-using GenericHMCRunnerF = HMCWrapperTemplate<PeriodicGimplF, Integrator>;
-
-template <template <typename, typename, typename> class Integrator>
-using GenericHMCRunnerD = HMCWrapperTemplate<PeriodicGimplD, Integrator>;
-
-
-// Runners on the ConjugateGimpl{R,F,D} implementations; the integrator is the
-// template argument.
-template <template <typename, typename, typename> class Integrator>
-using ConjugateHMCRunner  = HMCWrapperTemplate<ConjugateGimplR, Integrator>;
-
-template <template <typename, typename, typename> class Integrator>
-using ConjugateHMCRunnerF = HMCWrapperTemplate<ConjugateGimplF, Integrator>;
-
-template <template <typename, typename, typename> class Integrator>
-using ConjugateHMCRunnerD = HMCWrapperTemplate<ConjugateGimplD, Integrator>;
-
-
-// PeriodicGimplR runner with a caller-chosen representations policy.
-template <class RepresentationsPolicy,
-          template <typename, typename, typename> class Integrator>
-using GenericHMCRunnerHirep =
-    HMCWrapperTemplate<PeriodicGimplR, Integrator, RepresentationsPolicy>;
-
-// Fully generic runner: implementation, representations policy and integrator
-// are all chosen by the caller.
-template <class Implementation, class RepresentationsPolicy,
-          template <typename, typename, typename> class Integrator>
-using GenericHMCRunnerTemplate =
-    HMCWrapperTemplate<Implementation, Integrator, RepresentationsPolicy>;
-
-// Scalar-field runners with the integrator fixed.
-using ScalarGenericHMCRunner =
-    HMCWrapperTemplate<ScalarImplR, MinimumNorm2, ScalarFields>;
-
-using ScalarAdjGenericHMCRunner =
-    HMCWrapperTemplate<ScalarAdjImplR, MinimumNorm2, ScalarMatrixFields>;
-
-template <int Colours>
-using ScalarNxNAdjGenericHMCRunner =
-    HMCWrapperTemplate<ScalarNxNAdjImplR<Colours>, ForceGradient, ScalarNxNMatrixFields<Colours> >;
-
-}  // namespace QCD
-}  // namespace Grid
-
-#endif  // GRID_GENERIC_HMC_RUNNER
@@ -1,111 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/hmc/GenericHmcRunner.h
-
-Copyright (C) 2015
-Copyright (C) 2016
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef GRID_HMC_MODULES
-#define GRID_HMC_MODULES
-
-
-#include "HMC_GridModules.h"
-
-namespace Grid {
-namespace QCD {
-
-////////////////////////////////////////////////////////////////////
-// Serializable seed lists for the serial and parallel random number
-// generators. Both lists are stored as strings and converted with strToVec.
-struct RNGModuleParameters: Serializable {
-  GRID_SERIALIZABLE_CLASS_MEMBERS(RNGModuleParameters,
-                                  std::string, serial_seeds,
-                                  std::string, parallel_seeds,);
-
-  // Seed strings converted to integer vectors.
-  std::vector<int> getSerialSeeds() {
-    return strToVec<int>(serial_seeds);
-  }
-  std::vector<int> getParallelSeeds() {
-    return strToVec<int>(parallel_seeds);
-  }
-
-  // Default: both seed strings are "1".
-  RNGModuleParameters(): serial_seeds("1"), parallel_seeds("1") {}
-
-  // Reads the members from the "RandomNumberGenerator" entry of the reader.
-  template <class ReaderClass >
-  RNGModuleParameters(Reader<ReaderClass>& source) {
-    read(source, "RandomNumberGenerator", *this);
-  }
-
-};
-
-// Random number generators module
-// Random number generators module: holds the serial RNG by value, owns the
-// parallel RNG through a unique_ptr, and seeds both from RNGModuleParameters.
-class RNGModule{
-   GridSerialRNG sRNG_;
-   std::unique_ptr<GridParallelRNG> pRNG_;
-   RNGModuleParameters Params_;
-
-public:
-
-  RNGModule(){}
-
-  // Takes ownership of pRNG; any previously held generator is destroyed.
-  void set_pRNG(GridParallelRNG* pRNG){
-    pRNG_.reset(pRNG);
-  }
-
-  // Stores a copy of the seed parameters for a later call to seed().
-  void set_RNGSeeds(RNGModuleParameters& Params) {
-    Params_ = Params;
-  }
-
-  GridSerialRNG& get_sRNG() { return sRNG_; }
-  // Precondition: set_pRNG() has been called with a valid generator.
-  GridParallelRNG& get_pRNG() { return *pRNG_; }
-
-  // Seeds both generators from the stored parameters. Terminates the program
-  // if both seed lists are empty or if no parallel RNG has been set.
-  void seed() {
-    auto SerialSeeds   = Params_.getSerialSeeds();
-    auto ParallelSeeds = Params_.getParallelSeeds();
-    // NOTE(review): only the case where BOTH lists are empty is rejected; a
-    // single empty list is passed on to SeedFixedIntegers -- confirm intended.
-    if (SerialSeeds.size() == 0 && ParallelSeeds.size() == 0) {
-      std::cout << GridLogError << "Seeds not initialized" << std::endl;
-      exit(1);
-    }
-    // Guard against dereferencing an empty unique_ptr when set_pRNG() was
-    // never called.
-    if (!pRNG_) {
-      std::cout << GridLogError << "Parallel RNG not set, cannot seed" << std::endl;
-      exit(1);
-    }
-    sRNG_.SeedFixedIntegers(SerialSeeds);
-    pRNG_->SeedFixedIntegers(ParallelSeeds);
-  }
-};
-
-
-/*
-///////////////////////////////////////////////////////////////////
-/// Smearing module
-template <class ImplementationPolicy>
-class SmearingModule{
-   virtual void get_smearing();
-};
-
-template <class ImplementationPolicy>
-class StoutSmearingModule: public SmearingModule<ImplementationPolicy>{
-   SmearedConfiguration<ImplementationPolicy> SmearingPolicy;
-};
-
-*/
-
-
-
-}  // namespace QCD
-}  // namespace Grid
-
-#endif  // GRID_HMC_MODULES
@@ -1,328 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/hmc/GenericHmcRunner.h
-
-Copyright (C) 2015
-Copyright (C) 2016
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-  See the full license in the file "LICENSE" in the top level distribution
-  directory
-  *************************************************************************************/
-/*  END LEGAL */
-#ifndef HMC_RESOURCE_MANAGER_H
-#define HMC_RESOURCE_MANAGER_H
-
-#include <unordered_map>
-
-/* One function per Checkpointer, use a macro to simplify.
- * Expands to a member function Load<NAME>Checkpointer(Params_). If no
- * checkpointer has been loaded yet, it stores a new
- * <NAME>CPModule<ImplementationPolicy>(Params_) in CP and sets
- * have_CheckPointer; otherwise it logs an error and calls exit(1).
- * ImplementationPolicy, CP, CheckpointerBaseModule and have_CheckPointer are
- * resolved in the scope where the macro is expanded. */
-#define RegisterLoadCheckPointerFunction(NAME)                           \
-  void Load##NAME##Checkpointer(const CheckpointerParameters& Params_) { \
-    if (!have_CheckPointer) {                                            \
-      std::cout << GridLogDebug << "Loading Checkpointer " << #NAME      \
-                << std::endl;                                            \
-      CP = std::unique_ptr<CheckpointerBaseModule>(                      \
-        new NAME##CPModule<ImplementationPolicy>(Params_));              \
-      have_CheckPointer = true;                                          \
-    } else {                                                             \
-      std::cout << GridLogError << "Checkpointer already loaded "        \
-                << std::endl;                                            \
-      exit(1);                                                           \
-    }                                                                    \
-  }
-/* Metadata variant of the checkpointer loader macro.
- * Expands to a member template Load<NAME>Checkpointer(Params_, M_) that, if
- * no checkpointer has been loaded yet, stores a new
- * <NAME>CPModule<ImplementationPolicy, Metadata>(Params_, M_) in CP and sets
- * have_CheckPointer; otherwise it logs an error and calls exit(1). */
-#define RegisterLoadCheckPointerMetadataFunction(NAME)                   \
-  template < class Metadata >                                            \
-  void Load##NAME##Checkpointer(const CheckpointerParameters& Params_, const Metadata& M_) { \
-    if (!have_CheckPointer) {                                            \
-      std::cout << GridLogDebug << "Loading Metadata Checkpointer " << #NAME      \
-                << std::endl;                                            \
-      CP = std::unique_ptr<CheckpointerBaseModule>(                      \
-        new NAME##CPModule<ImplementationPolicy, Metadata >(Params_, M_));   \
-      have_CheckPointer = true;                                          \
-    } else {                                                             \
-      std::cout << GridLogError << "Checkpointer already loaded "        \
-                << std::endl;                                            \
-      exit(1);                                                           \
-    }                                                                    \
-  }
-
-namespace Grid {
-namespace QCD {
-
-// HMC Resource manager
-template <class ImplementationPolicy>
-class HMCResourceManager {
-  typedef HMCModuleBase< QCD::BaseHmcCheckpointer<ImplementationPolicy> > CheckpointerBaseModule;
-  typedef HMCModuleBase< QCD::HmcObservable<typename ImplementationPolicy::Field> > ObservableBaseModule;
-  typedef ActionModuleBase< QCD::Action<typename ImplementationPolicy::Field>, GridModule > ActionBaseModule;
-
-  // Named storage for grid pairs (std + red-black)
-  std::unordered_map<std::string, GridModule> Grids;
-  RNGModule RNGs;
-
-  // SmearingModule<ImplementationPolicy> Smearing;
-  std::unique_ptr<CheckpointerBaseModule> CP;
-
-  // A vector of HmcObservable modules
-  std::vector<std::unique_ptr<ObservableBaseModule> > ObservablesList;
-
-
-  // Action modules in a multimap keyed by an integer; several actions may share one key
-  std::multimap<int, std::unique_ptr<ActionBaseModule> > ActionsList;
-  std::vector<int> multipliers;
-
-  bool have_RNG;
-  bool have_CheckPointer;
-
-  // operator<< is not overloaded for std::vector<std::string>, so the entries
-  // are printed by hand: each one followed by a space, then a newline.
-  void output_vector_string(const std::vector<std::string> &vs){
-    for (auto it = vs.begin(); it != vs.end(); ++it) {
-      std::cout << *it << " ";
-    }
-    std::cout << std::endl;
-  }
-
-
- public:
-  HMCResourceManager() : have_RNG(false), have_CheckPointer(false) {}
-
-  // Build all resources from a serialised description, in this order:
-  // "gauge" grid, checkpointer, RNG seeds, observables, action levels.
-  // The process exits with status 1 when the Checkpointer, observables,
-  // Actions or Level section cannot be entered.
-  template <class ReaderClass, class vector_type = vComplex >
-  void initialize(ReaderClass &Read){
-    // assumes we are starting from the main node
-
-    // Geometry
-    GridModuleParameters GridPar(Read);
-    GridFourDimModule<vector_type> GridMod( GridPar) ;
-    AddGrid("gauge", GridMod);
-
-    // Checkpointer
-    auto &CPfactory = HMC_CPModuleFactory<cp_string, ImplementationPolicy, ReaderClass >::getInstance();
-    // Same existence check as for "Actions" and "Level" below: without it a
-    // missing section would be read from whatever node is current.
-    if(!Read.push("Checkpointer")){
-      std::cout << "Checkpointer not found" << std::endl;
-      exit(1);
-    }
-    std::string cp_type;
-    read(Read,"name", cp_type);
-    std::cout << "Registered types " << std::endl;
-    output_vector_string(CPfactory.getBuilderList());
-
-    CP = CPfactory.create(cp_type, Read);
-    CP->print_parameters();
-    Read.pop();
-    have_CheckPointer = true;
-
-    RNGModuleParameters RNGpar(Read);
-    SetRNGSeeds(RNGpar);
-
-    // Observables
-    auto &ObsFactory = HMC_ObservablesModuleFactory<observable_string, typename ImplementationPolicy::Field, ReaderClass>::getInstance();
-    if(!Read.push(observable_string)){
-      std::cout << observable_string << " not found" << std::endl;
-      exit(1);
-    }
-    do {
-      std::string obs_type;
-      read(Read,"name", obs_type);
-      std::cout << "Registered types " << std::endl;
-      output_vector_string(ObsFactory.getBuilderList() );
-
-      ObservablesList.emplace_back(ObsFactory.create(obs_type, Read));
-      ObservablesList.back()->print_parameters();
-    } while (Read.nextElement(observable_string));
-    Read.pop();
-
-    // Loop on levels
-    if(!Read.push("Actions")){
-      std::cout << "Actions not found" << std::endl;
-      exit(1);
-    }
-
-    if(!Read.push("Level")){// push must check if the node exist
-      std::cout << "Level not found" << std::endl;
-      exit(1);
-    }
-    // NOTE(review): further levels are entered with push("Level") and only a
-    // single pop() follows; confirm this matches the reader's nesting rules.
-    do
-    {
-      fill_ActionsLevel(Read);
-    }
-    while(Read.push("Level"));
-
-    Read.pop();
-  }
-
-
- 
-  // Resize Aset to multipliers.size(), then for every entry of ActionsList
-  // hand the "gauge" grid module to the action module and append its action
-  // pointer to Aset[key - 1], key being the entry's ActionsList key.
-  template <class RepresentationPolicy>
-  void GetActionSet(ActionSet<typename ImplementationPolicy::Field, RepresentationPolicy>& Aset){
-    Aset.resize(multipliers.size());
-
-    for(auto &entry : ActionsList){
-      auto &actionMod = entry.second;
-      actionMod->acquireResource(Grids["gauge"]);
-      Aset[entry.first - 1].push_back(actionMod->getPtr());
-    }
-  }
-
-
-
-  //////////////////////////////////////////////////////////////
-  // Grids
-  //////////////////////////////////////////////////////////////
-
-  // Register the grid module M under the name s (M is moved from) and print
-  // its full decomposition. A name that is already in use is a fatal error.
-  void AddGrid(const std::string s, GridModule& M) {
-    // Check for name clashes
-    if (Grids.count(s) != 0) {
-      std::cout << GridLogError << "Grid with name \"" << s
-                << "\" already present. Terminating\n";
-      exit(1);
-    }
-
-    GridModule &stored = Grids[s];
-    stored = std::move(M);
-
-    const char *separator = "::::::::::::::::::::::::::::::::::::::::";
-    std::cout << GridLogMessage << separator <<std::endl;
-    std::cout << GridLogMessage << "HMCResourceManager:" << std::endl;
-    std::cout << GridLogMessage << "Created grid set with name '" << s << "' and decomposition for the full cartesian " << std::endl;
-    stored.show_full_decomposition();
-    std::cout << GridLogMessage << separator <<std::endl;
-  }
-
-  // 4d shortcut: register a default-constructed GridFourDimModule<vComplex>
-  // under the name s.
-  void AddFourDimGrid(const std::string s) {
-    GridFourDimModule<vComplex> fourDim;
-    AddGrid(s, fourDim);
-  }
-
-  // 4d shortcut with the SIMD lane layout given by simd_decomposition.
-  void AddFourDimGrid(const std::string s, const std::vector<int> simd_decomposition) {
-    GridFourDimModule<vComplex> fourDim(simd_decomposition);
-    AddGrid(s, fourDim);
-  }
-
-
-  // Full cartesian grid of the module named s. With an empty name the grid
-  // at Grids.begin() is used; Grids is an unordered_map, so which grid that
-  // is is unspecified once several are registered. Requesting a grid when
-  // none is registered, or under an unknown name, is a fatal error (the
-  // latter used to insert an empty module and return a null pointer).
-  GridCartesian* GetCartesian(std::string s = "") {
-    if (Grids.empty()) {
-      std::cout << GridLogError << "No grid registered" << std::endl;
-      exit(1);
-    }
-    if (s.empty()) s = Grids.begin()->first;
-    std::cout << GridLogDebug << "Getting cartesian grid from: " << s
-              << std::endl;
-    auto found = Grids.find(s);
-    if (found == Grids.end()) {
-      std::cout << GridLogError << "Grid with name \"" << s
-                << "\" not found" << std::endl;
-      exit(1);
-    }
-    return found->second.get_full();
-  }
-
-  // Red-black cartesian grid of the module named s. With an empty name the
-  // grid at Grids.begin() is used; Grids is an unordered_map, so which grid
-  // that is is unspecified once several are registered. Requesting a grid
-  // when none is registered, or under an unknown name, is a fatal error (the
-  // latter used to insert an empty module and return a null pointer).
-  GridRedBlackCartesian* GetRBCartesian(std::string s = "") {
-    if (Grids.empty()) {
-      std::cout << GridLogError << "No grid registered" << std::endl;
-      exit(1);
-    }
-    if (s.empty()) s = Grids.begin()->first;
-    std::cout << GridLogDebug << "Getting rb-cartesian grid from: " << s
-              << std::endl;
-    auto found = Grids.find(s);
-    if (found == Grids.end()) {
-      std::cout << GridLogError << "Grid with name \"" << s
-                << "\" not found" << std::endl;
-      exit(1);
-    }
-    return found->second.get_rb();
-  }
-
-  //////////////////////////////////////////////////////
-  // Random number generators
-  //////////////////////////////////////////////////////
-
-  void AddRNGs(std::string s = "") {
-    // Couple the RNGs to the GridModule tagged by s. With an empty tag the
-    // grid at Grids.begin() is used; Grids is an unordered_map, so with more
-    // than one grid registered this need not be the first one added.
-    // May only be called once, and only after a grid has been added.
-    assert(Grids.size() > 0 && !have_RNG);
-    if (s.empty()) {
-      s = Grids.begin()->first;
-    }
-    std::cout << GridLogDebug << "Adding RNG to grid: " << s << std::endl;
-    GridCartesian *full = GetCartesian(s);
-    RNGs.set_pRNG(new GridParallelRNG(full));
-    have_RNG = true;
-  }
-
-  // Forward the seed parameters to the RNG module.
-  void SetRNGSeeds(RNGModuleParameters& Params) {
-    RNGs.set_RNGSeeds(Params);
-  }
-
-  // Serial RNG of the RNG module.
-  GridSerialRNG& GetSerialRNG() {
-    return RNGs.get_sRNG();
-  }
-
-  // Parallel RNG; asserts that AddRNGs() has been called.
-  GridParallelRNG& GetParallelRNG() {
-    assert(have_RNG);
-    return RNGs.get_pRNG();
-  }
-
-  // Seed the generators from the stored parameters; asserts that AddRNGs()
-  // has been called.
-  void SeedFixedIntegers() {
-    assert(have_RNG);
-    RNGs.seed();
-  }
-
-  //////////////////////////////////////////////////////
-  // Checkpointers
-  //////////////////////////////////////////////////////
-
-  BaseHmcCheckpointer<ImplementationPolicy>* GetCheckPointer() {
-    if (have_CheckPointer)
-      return CP->getPtr();
-    else {
-      std::cout << GridLogError << "Error: no checkpointer defined"
-                << std::endl;
-      exit(1);
-    }
-  }
-
-  RegisterLoadCheckPointerFunction(Binary);
-  RegisterLoadCheckPointerFunction(Nersc);
-  #ifdef HAVE_LIME
-  RegisterLoadCheckPointerFunction(ILDG);
-  RegisterLoadCheckPointerMetadataFunction(Scidac);
-  #endif
-
-  ////////////////////////////////////////////////////////
-  // Observables
-  ////////////////////////////////////////////////////////
-
-  // Construct an observable module of type T from Args, append it to
-  // ObservablesList and print its parameters.
-  template<class T, class... Types>
-  void AddObservable(Types&&... Args){
-    std::unique_ptr<T> created(new T(std::forward<Types>(Args)...));
-    ObservablesList.push_back(std::move(created));
-    ObservablesList.back()->print_parameters();
-  }
-
-  // Raw pointers to every registered observable, in registration order,
-  // followed by the checkpointer (GetCheckPointer exits if there is none).
-  std::vector<HmcObservable<typename ImplementationPolicy::Field>* > GetObservables(){
-    typedef HmcObservable<typename ImplementationPolicy::Field>* ObservablePtr;
-    std::vector<ObservablePtr> out;
-    for (auto it = ObservablesList.begin(); it != ObservablesList.end(); ++it){
-      out.push_back((*it)->getPtr());
-    }
-
-    // Add the checkpointer to the observables
-    out.push_back(GetCheckPointer());
-    return out;
-  }
-
-
-
-private:
-  // Read one "Level" entry: its multiplier and every "Action" listed in it.
-  // The actions are stored in ActionsList under the 1-based level index.
-  template <class ReaderClass >
-  void fill_ActionsLevel(ReaderClass &Read){
-    // Actions set
-    int m = 1;  // keeps m defined should readDefault leave it untouched
-    Read.readDefault("multiplier",m);
-    multipliers.push_back(m);
-    // GetActionSet sizes Aset to multipliers.size() and writes to
-    // Aset[key - 1], so the key has to be the level index. Keying by the
-    // multiplier indexes out of range as soon as the multiplier exceeds the
-    // number of levels, and merges levels that share a multiplier.
-    const int level = static_cast<int>(multipliers.size());
-    std::cout << "Level : " << level  << " with multiplier : " << m << std::endl;
-    // here gauge
-    Read.push("Action");
-    do{
-      auto &ActionFactory = HMC_ActionModuleFactory<gauge_string, typename ImplementationPolicy::Field, ReaderClass>::getInstance();
-      std::string action_type;
-      Read.readDefault("name", action_type);
-      output_vector_string(ActionFactory.getBuilderList() );
-      ActionsList.emplace(level, ActionFactory.create(action_type, Read));
-    } while (Read.nextElement("Action"));
-    // Prints the parameters of the first action stored for this level.
-    ActionsList.find(level)->second->print_parameters();
-    Read.pop();
-
-  }
-
-
-
-};
-}
-}
-
-#endif  // HMC_RESOURCE_MANAGER_H
@@ -1,137 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/hmc/GenericHmcRunner.h
-
-Copyright (C) 2015
-Copyright (C) 2016
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef HMC_RUNNER_MODULE
-#define HMC_RUNNER_MODULE
-
-namespace Grid {
-
-// The reader class is needed here for the automatic initialization of the
-// resources; with a virtual reader it would have been unnecessary.
-template <class HMCType, class ReaderClass >
-class HMCModule
-    : public Parametrized< QCD::HMCparameters >,
-      public HMCModuleBase< QCD::HMCRunnerBase<ReaderClass> > {
- public:
-  typedef HMCModuleBase< QCD::HMCRunnerBase<ReaderClass> > Base;
-  typedef typename Base::Product Product;
-
-  std::unique_ptr<HMCType> HMCPtr;
-
-  // Construct from an explicit parameter set.
-  HMCModule(QCD::HMCparameters Par)
-    : Parametrized<QCD::HMCparameters>(Par) {}
-
-  // Construct from the "HMC" entry of a reader.
-  template <class ReaderCl>
-  HMCModule(Reader<ReaderCl>& R)
-    : Parametrized<QCD::HMCparameters>(R, "HMC") {}
-
-  // Runner pointer; initialize() is called first when HMCPtr is still empty.
-  Product* getPtr() {
-    if (HMCPtr == nullptr) {
-      initialize();
-    }
-    return HMCPtr.get();
-  }
-
- private:
-  // Implemented by the concrete modules.
-  virtual void initialize() = 0;
-};
-
-// Factory for HMC runner modules. Only one instance exists: the constructor
-// is private, copying is deleted and access goes through getInstance().
-template <char const *str, class ReaderClass >
-class HMCRunnerModuleFactory
-    : public Factory < HMCModuleBase< QCD::HMCRunnerBase<ReaderClass> > , Reader<ReaderClass> > {
- public:
-  typedef Reader<ReaderClass> TheReader;
-
-  // use SINGLETON FUNCTOR MACRO HERE
-  HMCRunnerModuleFactory(const HMCRunnerModuleFactory& e) = delete;
-  void operator=(const HMCRunnerModuleFactory& e) = delete;
-
-  // Function-local static instance, created on first call.
-  static HMCRunnerModuleFactory& getInstance(void) {
-    static HMCRunnerModuleFactory instance;
-    return instance;
-  }
-
- private:
-  HMCRunnerModuleFactory(void) = default;
-
-  // The template argument str as a std::string.
-  std::string obj_type() const {
-    std::string type_name(str);
-    return type_name;
-  }
-};
-
-
-
-
-
-///////////////
-// macro for these
-
-template < class ImplementationPolicy, class RepresentationPolicy, class ReaderClass >
-class HMCLeapFrog: public HMCModule< QCD::GenericHMCRunnerTemplate<ImplementationPolicy, RepresentationPolicy, QCD::LeapFrog>, ReaderClass >{
-  // Runner type created by this module
-  typedef QCD::GenericHMCRunnerTemplate<ImplementationPolicy, RepresentationPolicy, QCD::LeapFrog> RunnerType;
-  typedef HMCModule< RunnerType, ReaderClass > HMCBaseMod;
-  using HMCBaseMod::HMCBaseMod;
-
-  // acquire resource: build the runner from this->Par_
-  virtual void initialize(){
-    RunnerType *runner = new RunnerType(this->Par_);
-    this->HMCPtr.reset(runner);
-  }
-};
-
-template < class ImplementationPolicy, class RepresentationPolicy, class ReaderClass >
-class HMCMinimumNorm2: public HMCModule< QCD::GenericHMCRunnerTemplate<ImplementationPolicy, RepresentationPolicy, QCD::MinimumNorm2>, ReaderClass  >{
-  // Runner type created by this module
-  typedef QCD::GenericHMCRunnerTemplate<ImplementationPolicy, RepresentationPolicy, QCD::MinimumNorm2> RunnerType;
-  typedef HMCModule< RunnerType, ReaderClass > HMCBaseMod;
-  using HMCBaseMod::HMCBaseMod;
-
-  // acquire resource: build the runner from this->Par_
-  virtual void initialize(){
-    RunnerType *runner = new RunnerType(this->Par_);
-    this->HMCPtr.reset(runner);
-  }
-};
-
-
-template < class ImplementationPolicy, class RepresentationPolicy, class ReaderClass >
-class HMCForceGradient: public HMCModule< QCD::GenericHMCRunnerTemplate<ImplementationPolicy, RepresentationPolicy, QCD::ForceGradient>, ReaderClass  >{
-  // Runner type created by this module
-  typedef QCD::GenericHMCRunnerTemplate<ImplementationPolicy, RepresentationPolicy, QCD::ForceGradient> RunnerType;
-  typedef HMCModule< RunnerType, ReaderClass > HMCBaseMod;
-  using HMCBaseMod::HMCBaseMod;
-
-  // acquire resource: build the runner from this->Par_
-  virtual void initialize(){
-    RunnerType *runner = new RunnerType(this->Par_);
-    this->HMCPtr.reset(runner);
-  }
-};
-
-extern char hmc_string[];
-
-
-
-
-
-
-//////////////////////////////////////////////////////////////
-
-
-
-}
-
-#endif
@@ -1,177 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/hmc/HMC_GridModules.h
-
-Copyright (C) 2015
-Copyright (C) 2016
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef HMC_GRID_MODULES
-#define HMC_GRID_MODULES
-
-namespace Grid {
-
-// Resources
-// Modules for grids
-
-// Introduce another namespace HMCModules?
-
-// Lattice and MPI layout, stored as strings and converted to integer
-// vectors with strToVec<int>. `name` is the tag used when reading/writing.
-class GridModuleParameters: Serializable{
-public:
-  GRID_SERIALIZABLE_CLASS_MEMBERS(GridModuleParameters,
-                                  std::string, lattice,
-                                  std::string, mpi);
-
-  // Read the members from the entry called n, then validate them.
-  template <class ReaderClass>
-  GridModuleParameters(Reader<ReaderClass>& Rd, std::string n = "LatticeGrid"):name(n) {
-    read(Rd, name, *this);
-    check();
-  }
-
-  std::vector<int> getLattice() const {
-    return strToVec<int>(lattice);
-  }
-
-  std::vector<int> getMpi() const {
-    return strToVec<int>(mpi);
-  }
-
-  // Exit with status 1 unless lattice and mpi have the same number of entries.
-  void check() const {
-    const bool same_rank = (getLattice().size() == getMpi().size());
-    if (same_rank) return;
-
-    std::cout << GridLogError
-              << "Error in GridModuleParameters: lattice and mpi dimensions "
-                 "do not match"
-              << std::endl;
-    exit(1);
-  }
-
-  // Save on file
-  template< class WriterClass>
-  void save(Writer<WriterClass>& Wr){
-    check();
-    write(Wr, name, *this);
-  }
-private:
-    std::string name;
-};
-
-// Lower level class: owns a full cartesian grid and a red-black grid.
-class GridModule {
- public:
-  // Non-owning pointer to the full grid (null while none has been set).
-  GridCartesian* get_full() {
-    std::cout << GridLogDebug << "Getting cartesian in module"<< std::endl;
-    GridCartesian* full = grid_.get();
-    return full;
-  }
-
-  // Non-owning pointer to the red-black grid (null while none has been set).
-  GridRedBlackCartesian* get_rb() {
-    std::cout << GridLogDebug << "Getting rb-cartesian in module"<< std::endl;
-    GridRedBlackCartesian* rb = rbgrid_.get();
-    return rb;
-  }
-
-  // The setters take ownership of the pointer they are given.
-  void set_full(GridCartesian* grid) {
-    grid_.reset(grid);
-  }
-  void set_rb(GridRedBlackCartesian* rbgrid) {
-    rbgrid_.reset(rbgrid);
-  }
-
-  void show_full_decomposition() {
-    grid_->show_decomposition();
-  }
-  void show_rb_decomposition() {
-    rbgrid_->show_decomposition();
-  }
-
- protected:
-  std::unique_ptr<GridCartesian> grid_;
-  std::unique_ptr<GridRedBlackCartesian> rbgrid_;
-
-};
-
-////////////////////////////////////
-// Classes for the user
-////////////////////////////////////
-// Note: the space time grid should be out of the QCD namespace
-// Four-dimensional grid module: every constructor creates the full grid and
-// the matching red-black grid, or exits with status 1 on invalid input.
-template <class vector_type>
-class GridFourDimModule : public GridModule
-{
-public:
-  // Default lattice, SIMD and MPI layouts.
-  GridFourDimModule()
-  {
-    using namespace QCD;
-    set_full(SpaceTimeGrid::makeFourDimGrid(
-        GridDefaultLatt(),
-        GridDefaultSimd(4, vector_type::Nsimd()),
-        GridDefaultMpi()));
-    set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
-  }
-
-  // Default lattice and MPI layouts with a user supplied SIMD layout.
-  // tweak_simd must have 4 entries whose product is vector_type::Nsimd().
-  GridFourDimModule(const std::vector<int> tweak_simd)
-  {
-    using namespace QCD;
-    if (tweak_simd.size() != 4)
-    {
-      std::cout << GridLogError
-                << "Error in GridFourDimModule: SIMD size different from 4"
-                << std::endl;
-      exit(1);
-    }
-
-    // Checks that the product agrees with the expectation
-    int simd_product = 1;
-    for (auto &n : tweak_simd)
-      simd_product *= n;
-    std::cout << GridLogDebug << "TweakSIMD: " << tweak_simd << "  Product: " << simd_product << std::endl;
-
-    if (simd_product != vector_type::Nsimd())
-    {
-      std::cout << GridLogError
-                << "Error in GridFourDimModule: SIMD lanes must multiply to "
-                << vector_type::Nsimd()
-                << std::endl;
-      // Stop here like the other error paths: continuing would leave both
-      // grid pointers null.
-      exit(1);
-    }
-
-    set_full(SpaceTimeGrid::makeFourDimGrid(
-        GridDefaultLatt(),
-        tweak_simd,
-        GridDefaultMpi()));
-    set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
-  }
-
-  // Lattice and MPI layouts taken from Params, default SIMD layout.
-  // The lattice must have 4 dimensions.
-  GridFourDimModule(const GridModuleParameters Params)
-  {
-    using namespace QCD;
-    std::vector<int> lattice_v = Params.getLattice();
-    std::vector<int> mpi_v = Params.getMpi();
-    if (lattice_v.size() != 4)
-    {
-      std::cout << GridLogError
-                << "Error in GridFourDimModule: lattice dimension different from 4"
-                << std::endl;
-      exit(1);
-    }
-
-    set_full(SpaceTimeGrid::makeFourDimGrid(
-        lattice_v,
-        GridDefaultSimd(4, vector_type::Nsimd()),
-        mpi_v));
-    set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
-  }
-};
-
-typedef GridFourDimModule<vComplex> GridDefaultFourDimModule;
-
-
-}  // namespace Grid
-
-#endif  // HMC_GRID_MODULES
@@ -1,107 +0,0 @@
-Using HMC in Grid version 0.5.1
-
-These are the instructions to use the Generalised HMC on Grid version 0.5.1.
-Disclaimer: GRID is still under active development so any information here can be changed in future releases.
-
-
-Command line options
-===================
-(relevant file GenericHMCrunner.h)
-The initial configuration can be changed at the command line using
-  --StartType <your choice>
-where the valid choices are
-  HotStart, ColdStart, TepidStart, CheckpointStart
-default: HotStart
-
-example
-./My_hmc_exec  --StartType HotStart
-
-The CheckpointStart option uses the prefixes for the configuration and RNG seed files defined in your executable; the initial configuration is specified by
-  --StartTrajectory <integer>
-default: 0
-
-The number of trajectories for a specific run is specified at the command line by
-  --Trajectories <integer>
-default: 1
-
-The number of thermalization steps (i.e. steps during which the Metropolis acceptance check is turned off) is specified by
-  --Thermalizations <integer>
-default: 10
-
-
-Any other parameter is defined in the source for the executable.
-
-HMC controls
-===========
-
-The lines 
-
-  std::vector<int> SerSeed({1, 2, 3, 4, 5});
-  std::vector<int> ParSeed({6, 7, 8, 9, 10});
-
-define the seeds for the serial and the parallel RNG.
-
-The line 
-
-  TheHMC.MDparameters.set(20, 1.0);// MDsteps, traj length
-
-declares the number of molecular dynamics steps and the total trajectory length.
-
-
-Actions
-======
-
-Action names are defined in the file
-lib/qcd/Actions.h
-
-Gauge actions list:
-
-WilsonGaugeActionR;
-WilsonGaugeActionF;
-WilsonGaugeActionD;
-PlaqPlusRectangleActionR;
-PlaqPlusRectangleActionF;
-PlaqPlusRectangleActionD;
-IwasakiGaugeActionR;
-IwasakiGaugeActionF;
-IwasakiGaugeActionD;
-SymanzikGaugeActionR;
-SymanzikGaugeActionF;
-SymanzikGaugeActionD;
-
-
-ConjugateWilsonGaugeActionR;
-ConjugateWilsonGaugeActionF;
-ConjugateWilsonGaugeActionD;
-ConjugatePlaqPlusRectangleActionR;
-ConjugatePlaqPlusRectangleActionF;
-ConjugatePlaqPlusRectangleActionD;
-ConjugateIwasakiGaugeActionR;
-ConjugateIwasakiGaugeActionF;
-ConjugateIwasakiGaugeActionD;
-ConjugateSymanzikGaugeActionR;
-ConjugateSymanzikGaugeActionF;
-ConjugateSymanzikGaugeActionD;
-
-
-ScalarActionR;
-ScalarActionF;
-ScalarActionD;
-
-
-Each of these actions accepts a single parameter at creation time (beta).
-Example for creating a Symanzik action with beta=4.0
-
-	SymanzikGaugeActionR(4.0)
-
-The suffixes R, F and D in the action names stand for Real, Float and Double.
-Real uses the precision chosen at compile time with the --enable-precision flag of configure,
-while Float and Double force the precision of the action to 32 and 64 bit respectively.
-
-
-
-
-
-
-
-
@@ -1,97 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/hmc/BaseCheckpointer.h
-
-Copyright (C) 2015
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef BASE_CHECKPOINTER
-#define BASE_CHECKPOINTER
-
-namespace Grid {
-namespace QCD {
-
-// Serializable checkpointer settings: file prefixes for configurations and
-// RNG states, the save interval and the format string.
-class CheckpointerParameters : Serializable {
- public:
-  GRID_SERIALIZABLE_CLASS_MEMBERS(CheckpointerParameters,
-                                  std::string, config_prefix,
-                                  std::string, rng_prefix,
-                                  int, saveInterval,
-                                  std::string, format, );
-
-  // Defaults: prefixes "cfg" and "rng", interval 1, format "IEEE64BIG".
-  CheckpointerParameters(std::string cf = "cfg", std::string rn = "rng",
-                         int savemodulo = 1, const std::string &f = "IEEE64BIG")
-      : config_prefix(cf),
-        rng_prefix(rn),
-        saveInterval(savemodulo),
-        format(f) {}
-
-  // Fill the members from the "Checkpointer" entry of the reader.
-  template <class ReaderClass >
-  CheckpointerParameters(Reader<ReaderClass> &Rd) {
-    read(Rd, "Checkpointer", *this);
-  }
-};
-
-//////////////////////////////////////////////////////////////////////////////
-// Base class for checkpointers: file-name helpers plus the interface that
-// the concrete checkpointers implement.
-template <class Impl>
-class BaseHmcCheckpointer : public HmcObservable<typename Impl::Field> {
- public:
-  // Set rng_file to "<rng_prefix>.<traj>" and conf_file to
-  // "<config_prefix>.<traj>", in that order.
-  void build_filenames(int traj, CheckpointerParameters &Params,
-                       std::string &conf_file, std::string &rng_file) {
-    std::ostringstream rng_name;
-    rng_name << Params.rng_prefix << "." << traj;
-    rng_file = rng_name.str();
-
-    std::ostringstream conf_name;
-    conf_name << Params.config_prefix << "." << traj;
-    conf_file = conf_name.str();
-  }
-
-  // Abort the program when the file cannot be opened for reading.
-  void check_filename(const std::string &filename){
-    std::ifstream probe(filename.c_str());
-    if (probe.good()) return;
-
-    std::cout << GridLogError << "Filename " << filename << " not found. Aborting. " << std::endl;
-    abort();
-  }
-
-  virtual void initialize(const CheckpointerParameters &Params) = 0;
-
-  virtual void CheckpointRestore(int traj, typename Impl::Field &U,
-                                 GridSerialRNG &sRNG,
-                                 GridParallelRNG &pRNG) = 0;
-
-};  // class BaseHmcCheckpointer
-///////////////////////////////////////////////////////////////////////////////
-}
-}
-#endif
@@ -1,116 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/hmc/BinaryCheckpointer.h
-
-Copyright (C) 2016
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef BINARY_CHECKPOINTER
-#define BINARY_CHECKPOINTER
-
-#include <iostream>
-#include <sstream>
-#include <string>
-
-namespace Grid {
-namespace QCD {
-
-// Simple checkpointer that reads and writes plain binary files through
-// BinaryIO.
-template <class Impl>
-class BinaryHmcCheckpointer : public BaseHmcCheckpointer<Impl> {
- private:
-  CheckpointerParameters Params;
-
- public:
-  INHERIT_FIELD_TYPES(Impl);  // Gets the Field type, a Lattice object
-
-  // Extract types from the Field
-  typedef typename Field::vector_object vobj;
-  typedef typename vobj::scalar_object sobj;
-  typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;
-  typedef typename sobj::DoublePrecision sobj_double;
-
-  BinaryHmcCheckpointer(const CheckpointerParameters &Params_) {
-    initialize(Params_);
-  }
-
-  // Store a copy of the parameters.
-  void initialize(const CheckpointerParameters &Params_) {
-    Params = Params_;
-  }
-
-  // Open the file for writing and close it again.
-  void truncate(std::string file) {
-    std::ofstream out(file, std::ios::out);
-    out.close();
-  }
-
-  // Write the RNG state and then the configuration whenever traj is a
-  // multiple of Params.saveInterval.
-  // NOTE(review): saveInterval == 0 makes the modulo below a division by zero.
-  void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
-    if ((traj % Params.saveInterval) != 0) return;
-
-    std::string config, rng;
-    this->build_filenames(traj, Params, config, rng);
-
-    uint32_t nersc_csum;
-    uint32_t scidac_csuma;
-    uint32_t scidac_csumb;
-
-    BinarySimpleUnmunger<sobj_double, sobj> munge;
-
-    // RNG file first, then the configuration; each file is truncated
-    // immediately before it is written.
-    truncate(rng);
-    BinaryIO::writeRNG(sRNG, pRNG, rng, 0,
-                       nersc_csum, scidac_csuma, scidac_csumb);
-
-    truncate(config);
-    BinaryIO::writeLatticeObject<vobj, sobj_double>(U, config, munge, 0, Params.format,
-                                                    nersc_csum, scidac_csuma, scidac_csumb);
-
-    std::cout << GridLogMessage << "Written Binary Configuration " << config
-              << " checksum " << std::hex
-              << nersc_csum << "/"
-              << scidac_csuma << "/"
-              << scidac_csumb
-              << std::dec << std::endl;
-  }
-
-  // Read back the RNG state and the configuration of trajectory traj.
-  // check_filename aborts the program if either file cannot be opened.
-  void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
-    std::string config, rng;
-    this->build_filenames(traj, Params, config, rng);
-    this->check_filename(rng);
-    this->check_filename(config);
-
-    BinarySimpleMunger<sobj_double, sobj> munge;
-
-    uint32_t nersc_csum;
-    uint32_t scidac_csuma;
-    uint32_t scidac_csumb;
-
-    BinaryIO::readRNG(sRNG, pRNG, rng, 0,
-                      nersc_csum, scidac_csuma, scidac_csumb);
-    BinaryIO::readLatticeObject<vobj, sobj_double>(U, config, munge, 0, Params.format,
-                                                   nersc_csum, scidac_csuma, scidac_csumb);
-
-    std::cout << GridLogMessage << "Read Binary Configuration " << config
-              << " checksums " << std::hex
-              << nersc_csum << "/" << scidac_csuma << "/" << scidac_csumb
-              << std::dec << std::endl;
-  }
-};
-}
-}
-#endif
@@ -1,172 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/gauge/WilsonGaugeAction.h
-
-Copyright (C) 2016
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#ifndef CP_MODULES_H
-#define CP_MODULES_H
-
-
-// FIXME  Reorganize QCD namespace 
-
-
-namespace Grid {
-
-////////////////////////////////////////////////////////////////////////
-// Checkpoint module, owns the Checkpointer
-////////////////////////////////////////////////////////////////////////
-
-template <class ImplementationPolicy>
-class CheckPointerModule: public Parametrized<QCD::CheckpointerParameters>, public HMCModuleBase< QCD::BaseHmcCheckpointer<ImplementationPolicy> >  {
- public:
- 	std::unique_ptr<QCD::BaseHmcCheckpointer<ImplementationPolicy> > CheckPointPtr;
- 	typedef QCD::CheckpointerParameters APar;
-  typedef HMCModuleBase< QCD::BaseHmcCheckpointer<ImplementationPolicy> > Base;
-  typedef typename Base::Product Product;
-
-  CheckPointerModule(APar Par): Parametrized<APar>(Par) {}
-  template <class ReaderClass>
-  CheckPointerModule(Reader<ReaderClass>& Reader) : Parametrized<APar>(Reader){};
-
-  virtual void print_parameters(){
-  	std::cout << this->Par_ << std::endl;
-  }
-
-  Product* getPtr() {
-    if (!CheckPointPtr) initialize();
-
-    return CheckPointPtr.get();
-  }
-
- private:
-  virtual void initialize() = 0;
-
-};
-
-
-
-template <char const *str, class ImplementationPolicy, class ReaderClass >
-class HMC_CPModuleFactory
-    : public Factory < HMCModuleBase< QCD::BaseHmcCheckpointer<ImplementationPolicy> > ,	Reader<ReaderClass> > {
- public:
- 	typedef Reader<ReaderClass> TheReader; 
- 	// use SINGLETON FUNCTOR MACRO HERE
-  HMC_CPModuleFactory(const HMC_CPModuleFactory& e) = delete;
-  void operator=(const HMC_CPModuleFactory& e) = delete;
-  static HMC_CPModuleFactory& getInstance(void) {
-    static HMC_CPModuleFactory e;
-    return e;
-  }
-
- private:
-  HMC_CPModuleFactory(void) = default;
-  std::string obj_type() const {
-  	return std::string(str);
-  }
-};
-
-
-
-/////////////////////////////////////////////////////////////////////
-// Concrete classes
-/////////////////////////////////////////////////////////////////////
-namespace QCD{
-
-template<class ImplementationPolicy>
-class BinaryCPModule: public CheckPointerModule< ImplementationPolicy> {
-  typedef CheckPointerModule< ImplementationPolicy> CPBase;
-  using CPBase::CPBase; // for constructors
-
-  // acquire resource
-  virtual void initialize(){
-    this->CheckPointPtr.reset(new BinaryHmcCheckpointer<ImplementationPolicy>(this->Par_));
-  }
-
-};
-
-
-template<class ImplementationPolicy>
-class NerscCPModule: public CheckPointerModule< ImplementationPolicy> {
-  typedef CheckPointerModule< ImplementationPolicy> CPBase;
-  using CPBase::CPBase; // for constructors inheritance
-
-  // acquire resource
-  virtual void initialize(){
-     this->CheckPointPtr.reset(new NerscHmcCheckpointer<ImplementationPolicy>(this->Par_));
-  }
-
-};
-
-
-#ifdef HAVE_LIME
-  
-template<class ImplementationPolicy>
-class ILDGCPModule: public CheckPointerModule< ImplementationPolicy> {
-  typedef CheckPointerModule< ImplementationPolicy> CPBase;
-  using CPBase::CPBase; // for constructors
-
-  // acquire resource
-  virtual void initialize(){
-     this->CheckPointPtr.reset(new ILDGHmcCheckpointer<ImplementationPolicy>(this->Par_));
-  }
-
-};
-
-template<class ImplementationPolicy, class Metadata>
-class ScidacCPModule: public CheckPointerModule< ImplementationPolicy> {
-  typedef CheckPointerModule< ImplementationPolicy> CPBase;
-  Metadata M;
-
-  // acquire resource
-  virtual void initialize(){
-     this->CheckPointPtr.reset(new ScidacHmcCheckpointer<ImplementationPolicy, Metadata>(this->Par_, M));
-  }
-public:
-  ScidacCPModule(typename CPBase::APar Par, Metadata M_):M(M_), CPBase(Par) {}
-  template <class ReaderClass>
-  ScidacCPModule(Reader<ReaderClass>& Reader) : Parametrized<typename CPBase::APar>(Reader), M(Reader){};
-};
-#endif
-
-
-}// QCD temporarily here
-
-
-extern char cp_string[];
-
-/*
-// use macros?
-static Registrar<QCD::BinaryCPModule<QCD::PeriodicGimplR>, HMC_CPModuleFactory<cp_string, QCD::PeriodicGimplR, XmlReader> > __CPBinarymodXMLInit("Binary");
-static Registrar<QCD::NerscCPModule<QCD::PeriodicGimplR> , HMC_CPModuleFactory<cp_string, QCD::PeriodicGimplR, XmlReader> > __CPNerscmodXMLInit("Nersc");
-
-#ifdef HAVE_LIME
-static Registrar<QCD::ILDGCPModule<QCD::PeriodicGimplR>  , HMC_CPModuleFactory<cp_string, QCD::PeriodicGimplR, XmlReader> > __CPILDGmodXMLInit("ILDG");
-#endif
-*/
-
-}// Grid
-#endif //CP_MODULES_H
@@ -1,41 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/gauge/WilsonGaugeAction.h
-
-Copyright (C) 2016
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#ifndef CHECKPOINTERS_H
-#define CHECKPOINTERS_H
-
-#include <Grid/qcd/hmc/checkpointers/BaseCheckpointer.h>
-#include <Grid/qcd/hmc/checkpointers/NerscCheckpointer.h>
-#include <Grid/qcd/hmc/checkpointers/BinaryCheckpointer.h>
-#include <Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h>
-#include <Grid/qcd/hmc/checkpointers/ScidacCheckpointer.h>
-//#include <Grid/qcd/hmc/checkpointers/CheckPointerModules.h>
-
-
-#endif // CHECKPOINTERS_H
@@ -1,124 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/hmc/ILDGCheckpointer.h
-
-Copyright (C) 2016
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef ILDG_CHECKPOINTER
-#define ILDG_CHECKPOINTER
-
-#ifdef HAVE_LIME
-
-#include <iostream>
-#include <sstream>
-#include <string>
-
-namespace Grid {
-namespace QCD {
-
-// Only for Gauge fields
-template <class Implementation>
-class ILDGHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
- private:
-  CheckpointerParameters Params;
-
- public:
-  INHERIT_GIMPL_TYPES(Implementation);
-
-  ILDGHmcCheckpointer(const CheckpointerParameters &Params_) { initialize(Params_); }
-
-  void initialize(const CheckpointerParameters &Params_) {
-    Params = Params_;
-
-    // check here that the format is valid
-    int ieee32big = (Params.format == std::string("IEEE32BIG"));
-    int ieee32    = (Params.format == std::string("IEEE32"));
-    int ieee64big = (Params.format == std::string("IEEE64BIG"));
-    int ieee64    = (Params.format == std::string("IEEE64"));
-
-    if (!(ieee64big || ieee32 || ieee32big || ieee64)) {
-      std::cout << GridLogError << "Unrecognized file format " << Params.format
-                << std::endl;
-      std::cout << GridLogError
-                << "Allowed: IEEE32BIG | IEEE32 | IEEE64BIG | IEEE64"
-                << std::endl;
-
-      exit(1);
-    }
-  }
-
-  void TrajectoryComplete(int traj, GaugeField &U, GridSerialRNG &sRNG,
-                          GridParallelRNG &pRNG) {
-    if ((traj % Params.saveInterval) == 0) {
-      std::string config, rng;
-      this->build_filenames(traj, Params, config, rng);
-      GridBase *grid = U._grid;
-      uint32_t nersc_csum,scidac_csuma,scidac_csumb;
-      BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
-      IldgWriter _IldgWriter(grid->IsBoss());
-      _IldgWriter.open(config);
-      _IldgWriter.writeConfiguration(U, traj, config, config);
-      _IldgWriter.close();
-
-      std::cout << GridLogMessage << "Written ILDG Configuration on " << config
-                << " checksum " << std::hex 
-		<< nersc_csum<<"/"
-		<< scidac_csuma<<"/"
-		<< scidac_csumb
-		<< std::dec << std::endl;
-    }
-  };
-
-  void CheckpointRestore(int traj, GaugeField &U, GridSerialRNG &sRNG,
-                         GridParallelRNG &pRNG) {
-    std::string config, rng;
-    this->build_filenames(traj, Params, config, rng);
-    this->check_filename(rng);
-    this->check_filename(config);
-
-    
-
-    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
-    BinaryIO::readRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
-
-    FieldMetaData header;
-    IldgReader _IldgReader;
-    _IldgReader.open(config);
-    _IldgReader.readConfiguration(U,header);  // format from the header
-    _IldgReader.close();
-
-    std::cout << GridLogMessage << "Read ILDG Configuration from " << config
-              << " checksum " << std::hex 
-	      << nersc_csum<<"/"
-	      << scidac_csuma<<"/"
-	      << scidac_csumb
-	      << std::dec << std::endl;
-  };
-};
-}
-}
-
-#endif  // HAVE_LIME
-#endif  // ILDG_CHECKPOINTER
@@ -1,83 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/hmc/NerscCheckpointer.h
-
-Copyright (C) 2015
-
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef NERSC_CHECKPOINTER
-#define NERSC_CHECKPOINTER
-
-#include <iostream>
-#include <sstream>
-#include <string>
-
-namespace Grid {
-namespace QCD {
-
-// Only for Gauge fields
-template <class Gimpl>
-class NerscHmcCheckpointer : public BaseHmcCheckpointer<Gimpl> {
- private:
-  CheckpointerParameters Params;
-
- public:
-  INHERIT_GIMPL_TYPES(Gimpl);  // only for gauge configurations
-
-  NerscHmcCheckpointer(const CheckpointerParameters &Params_) { initialize(Params_); }
-
-  void initialize(const CheckpointerParameters &Params_) {
-    Params = Params_;
-    Params.format = "IEEE64BIG";  // fixed, overwrite any other choice
-  }
-
-  void TrajectoryComplete(int traj, GaugeField &U, GridSerialRNG &sRNG,
-                          GridParallelRNG &pRNG) {
-    if ((traj % Params.saveInterval) == 0) {
-      std::string config, rng;
-      this->build_filenames(traj, Params, config, rng);
-
-      int precision32 = 1;
-      int tworow = 0;
-      NerscIO::writeRNGState(sRNG, pRNG, rng);
-      NerscIO::writeConfiguration(U, config, tworow, precision32);
-    }
-  };
-
-  void CheckpointRestore(int traj, GaugeField &U, GridSerialRNG &sRNG,
-                         GridParallelRNG &pRNG) {
-    std::string config, rng;
-    this->build_filenames(traj, Params, config, rng);
-    this->check_filename(rng);
-    this->check_filename(config);
-
-
-    FieldMetaData header;
-    NerscIO::readRNGState(sRNG, pRNG, header, rng);
-    NerscIO::readConfiguration(U, header, config);
-  };
-};
-}
-}
-#endif
@@ -1,122 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/hmc/ScidacCheckpointer.h
-
-Copyright (C) 2018
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef SCIDAC_CHECKPOINTER
-#define SCIDAC_CHECKPOINTER
-
-#ifdef HAVE_LIME
-
-#include <iostream>
-#include <sstream>
-#include <string>
-
-namespace Grid {
-namespace QCD {
-
-// For generic fields
-template <class Implementation, class Metadata>
-class ScidacHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
- private:
-  CheckpointerParameters Params;
-  Metadata MData;
-
-  typedef typename Implementation::Field Field;
-
- public:
-  //INHERIT_GIMPL_TYPES(Implementation);
-
-  ScidacHmcCheckpointer(const CheckpointerParameters &Params_) { initialize(Params_); }
-  ScidacHmcCheckpointer(const CheckpointerParameters &Params_, const Metadata& M_):MData(M_) { initialize(Params_); }
-
-  void initialize(const CheckpointerParameters &Params_) {
-    Params = Params_;
-
-    // check here that the format is valid
-    int ieee32big = (Params.format == std::string("IEEE32BIG"));
-    int ieee32    = (Params.format == std::string("IEEE32"));
-    int ieee64big = (Params.format == std::string("IEEE64BIG"));
-    int ieee64    = (Params.format == std::string("IEEE64"));
-
-    if (!(ieee64big || ieee32 || ieee32big || ieee64)) {
-      std::cout << GridLogError << "Unrecognized file format " << Params.format
-                << std::endl;
-      std::cout << GridLogError
-                << "Allowed: IEEE32BIG | IEEE32 | IEEE64BIG | IEEE64"
-                << std::endl;
-
-      exit(1);
-    }
-  }
-
-  void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG,
-                          GridParallelRNG &pRNG) {
-    if ((traj % Params.saveInterval) == 0) {
-      std::string config, rng;
-      this->build_filenames(traj, Params, config, rng);
-      GridBase *grid = U._grid;
-      uint32_t nersc_csum,scidac_csuma,scidac_csumb;
-      BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
-      ScidacWriter _ScidacWriter(grid->IsBoss());
-      _ScidacWriter.open(config);
-      _ScidacWriter.writeScidacFieldRecord(U, MData);
-      _ScidacWriter.close();
-
-      std::cout << GridLogMessage << "Written Scidac Configuration on " << config << std::endl;
-    }
-  };
-
-  void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG,
-                         GridParallelRNG &pRNG) {
-    std::string config, rng;
-    this->build_filenames(traj, Params, config, rng);
-    this->check_filename(rng);
-    this->check_filename(config);
-
-
-    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
-    BinaryIO::readRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
-
-    Metadata md_content;
-    ScidacReader _ScidacReader;
-    _ScidacReader.open(config);
-    _ScidacReader.readScidacFieldRecord(U,md_content);  // format from the header
-    _ScidacReader.close();
-
-    std::cout << GridLogMessage << "Read Scidac Configuration from " << config
-              << " checksum " << std::hex 
-	      << nersc_csum<<"/"
-	      << scidac_csuma<<"/"
-	      << scidac_csumb
-	      << std::dec << std::endl;
-  };
-};
-}
-}
-
-#endif  // HAVE_LIME
-#endif  // ILDG_CHECKPOINTER
@@ -1,295 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/hmc/integrators/Integrator_algorithm.h
-
-Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: neo <cossu@post.kek.jp>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-//--------------------------------------------------------------------
-
-
-/*! @file Integrator_algorithm.h
- * @brief Declaration of classes for the Molecular Dynamics algorithms
- *
- */
-//--------------------------------------------------------------------
-
-#ifndef INTEGRATOR_ALG_INCLUDED
-#define INTEGRATOR_ALG_INCLUDED
-
-namespace Grid {
-namespace QCD {
-
-/* PAB:
- *
- * Recursive leapfrog; explanation of nested stepping
- *
- * Nested 1:4; units in dt for top level integrator
- *
- * CHROMA                           IroIro
- *   0        1                      0
- *  P 1/2                           P 1/2
- *          P 1/16                                  P1/16
- *                 U 1/8                                   U1/8
- *          P 1/8                                   P1/8
- *                 U 1/8                                   U1/8
- *          P 1/8                                   P1/8
- *                 U 1/8                                 U1/8
- *          P 1/8                                   P1/8
- *                 U 1/8                                 U1/8
- *          P 1/16                                  P1/8
- *  P 1                             P 1
- *          P 1/16                    * skipped --- avoids revaluating force
- *                 U 1/8                                 U1/8
- *          P 1/8                                   P1/8
- *                 U 1/8                                 U1/8
- *          P 1/8                                   P1/8
- *                 U 1/8                                 U1/8
- *          P 1/8                                   P1/8
- *                 U 1/8                                 U1/8
- *          P 1/16                                  P1/8
- *  P 1                             P 1
- *          P 1/16                    * skipped
- *                 U 1/8                                 U1/8
- *          P 1/8                                   P1/8
- *                 U 1/8                                 U1/8
- *          P 1/8                                   P1/8
- *                 U 1/8                                 U1/8
- *          P 1/8                                   P1/8
- *                 U 1/8                                 U1/8
- *          P 1/16                    * skipped
- *  P 1                             P 1
- *          P 1/16                                  P1/8
- *                 U 1/8                                 U1/8
- *          P 1/8                                   P1/8
- *                 U 1/8                                 U1/8
- *          P 1/8                                   P1/8
- *                 U 1/8                                 U1/8
- *          P 1/8                                   P1/8
- *                 U 1/8                                 U1/8
- *          P 1/16                                  P1/16
- *  P 1/2                            P 1/2
- */
-
-template <class FieldImplementation, class SmearingPolicy,
-          class RepresentationPolicy =
-              Representations<FundamentalRepresentation> >
-class LeapFrog : public Integrator<FieldImplementation, SmearingPolicy,
-                                   RepresentationPolicy> {
- public:
-  typedef LeapFrog<FieldImplementation, SmearingPolicy, RepresentationPolicy>
-      Algorithm;
-  INHERIT_FIELD_TYPES(FieldImplementation);
-
-  std::string integrator_name(){return "LeapFrog";}
-
-  LeapFrog(GridBase* grid, IntegratorParameters Par,
-           ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm)
-      : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(
-            grid, Par, Aset, Sm){};
-
-  void step(Field& U, int level, int _first, int _last) {
-    int fl = this->as.size() - 1;
-    // level  : current level
-    // fl     : final level
-    // eps    : current step size
-
-    // Get current level step size
-    RealD eps = this->Params.trajL/this->Params.MDsteps;
-    for (int l = 0; l <= level; ++l) eps /= this->as[l].multiplier;
-
-    int multiplier = this->as[level].multiplier;
-    for (int e = 0; e < multiplier; ++e) {
-      int first_step = _first && (e == 0);
-      int last_step = _last && (e == multiplier - 1);
-
-      if (first_step) {  // initial half step
-        this->update_P(U, level, eps / 2.0);
-      }
-
-      if (level == fl) {  // lowest level
-        this->update_U(U, eps);
-      } else {  // recursive function call
-        this->step(U, level + 1, first_step, last_step);
-      }
-
-      int mm = last_step ? 1 : 2;
-      this->update_P(U, level, mm * eps / 2.0);
-    }
-  }
-};
-
-template <class FieldImplementation, class SmearingPolicy,
-          class RepresentationPolicy =
-              Representations<FundamentalRepresentation> >
-class MinimumNorm2 : public Integrator<FieldImplementation, SmearingPolicy,
-                                       RepresentationPolicy> {
- private:
-  const RealD lambda = 0.1931833275037836;
-
- public:
-  INHERIT_FIELD_TYPES(FieldImplementation);
-
-  MinimumNorm2(GridBase* grid, IntegratorParameters Par,
-               ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm)
-      : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(
-            grid, Par, Aset, Sm){};
-
-  std::string integrator_name(){return "MininumNorm2";}
-
-  void step(Field& U, int level, int _first, int _last) {
-    // level  : current level
-    // fl     : final level
-    // eps    : current step size
-
-    int fl = this->as.size() - 1;
-
-    RealD eps = this->Params.trajL/this->Params.MDsteps * 2.0;
-    for (int l = 0; l <= level; ++l) eps /= 2.0 * this->as[l].multiplier;
-
-    // Nesting:  2xupdate_U of size eps/2
-    // Next level is eps/2/multiplier
-
-    int multiplier = this->as[level].multiplier;
-    for (int e = 0; e < multiplier; ++e) {  // steps per step
-
-      int first_step = _first && (e == 0);
-      int last_step = _last && (e == multiplier - 1);
-
-      if (first_step) {  // initial half step
-        this->update_P(U, level, lambda * eps);
-      }
-
-      if (level == fl) {  // lowest level
-        this->update_U(U, 0.5 * eps);
-      } else {  // recursive function call
-        this->step(U, level + 1, first_step, 0);
-      }
-
-      this->update_P(U, level, (1.0 - 2.0 * lambda) * eps);
-
-      if (level == fl) {  // lowest level
-        this->update_U(U, 0.5 * eps);
-      } else {  // recursive function call
-        this->step(U, level + 1, 0, last_step);
-      }
-
-      int mm = (last_step) ? 1 : 2;
-      this->update_P(U, level, lambda * eps * mm);
-    }
-  }
-};
-
-template <class FieldImplementation, class SmearingPolicy,
-          class RepresentationPolicy =
-              Representations<FundamentalRepresentation> >
-class ForceGradient : public Integrator<FieldImplementation, SmearingPolicy,
-                                        RepresentationPolicy> {
- private:
-  const RealD lambda = 1.0 / 6.0;
-  ;
-  const RealD chi = 1.0 / 72.0;
-  const RealD xi = 0.0;
-  const RealD theta = 0.0;
-
- public:
-  INHERIT_FIELD_TYPES(FieldImplementation);
-
-  // Looks like dH scales as dt^4. tested wilson/wilson 2 level.
-  ForceGradient(GridBase* grid, IntegratorParameters Par,
-                ActionSet<Field, RepresentationPolicy>& Aset,
-                SmearingPolicy& Sm)
-      : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(
-            grid, Par, Aset, Sm){};
-
-  std::string integrator_name(){return "ForceGradient";}
-  
-  void FG_update_P(Field& U, int level, double fg_dt, double ep) {
-    Field Ufg(U._grid);
-    Field Pfg(U._grid);
-    Ufg = U;
-    Pfg = zero;
-    std::cout << GridLogIntegrator << "FG update " << fg_dt << " " << ep
-              << std::endl;
-    // prepare_fg; no prediction/result cache for now
-    // could relax CG stopping conditions for the
-    // derivatives in the small step since the force gets multiplied by
-    // a tiny dt^2 term relative to main force.
-    //
-    // Presently 4 force evals, and should have 3, so 1.33x too expensive.
-    // could reduce this with sloppy CG to perhaps 1.15x too expensive
-    // even without prediction.
-    this->update_P(Pfg, Ufg, level, 1.0);
-    this->update_U(Pfg, Ufg, fg_dt);
-    this->update_P(Ufg, level, ep);
-  }
-
-  void step(Field& U, int level, int _first, int _last) {
-    RealD eps = this->Params.trajL/this->Params.MDsteps * 2.0;
-    for (int l = 0; l <= level; ++l) eps /= 2.0 * this->as[l].multiplier;
-
-    RealD Chi = chi * eps * eps * eps;
-
-    int fl = this->as.size() - 1;
-
-    int multiplier = this->as[level].multiplier;
-
-    for (int e = 0; e < multiplier; ++e) {  // steps per step
-
-      int first_step = _first && (e == 0);
-      int last_step = _last && (e == multiplier - 1);
-
-      if (first_step) {  // initial half step
-        this->update_P(U, level, lambda * eps);
-      }
-
-      if (level == fl) {  // lowest level
-        this->update_U(U, 0.5 * eps);
-      } else {  // recursive function call
-        this->step(U, level + 1, first_step, 0);
-      }
-
-      this->FG_update_P(U, level, 2 * Chi / ((1.0 - 2.0 * lambda) * eps),
-                        (1.0 - 2.0 * lambda) * eps);
-
-      if (level == fl) {  // lowest level
-        this->update_U(U, 0.5 * eps);
-      } else {  // recursive function call
-        this->step(U, level + 1, 0, last_step);
-      }
-
-      int mm = (last_step) ? 1 : 2;
-      this->update_P(U, level, lambda * eps * mm);
-    }
-  }
-};
-
-
-
-
-}
-}
-
-#endif  // INTEGRATOR_INCLUDED
@@ -1,517 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/modules/ActionModules.h
-
-Copyright (C) 2016
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef ACTION_MODULES_H
-#define ACTION_MODULES_H
-
-/*
-Define loadable, serializable modules
-for the HMC execution
-*/
-
-namespace Grid {
-
-//////////////////////////////////////////////
-//              Actions
-//////////////////////////////////////////////
-
-template <class Product, class R>
-class ActionModuleBase: public HMCModuleBase<Product>{
-public:
-  typedef R Resource;
-  virtual void acquireResource(R& ){};
-
-};
-
-
-template <class ActionType, class APar>
-class ActionModule
-    : public Parametrized<APar>,
-      public ActionModuleBase< QCD::Action<typename ActionType::GaugeField> , QCD::GridModule > {
- public:
-  typedef ActionModuleBase< QCD::Action<typename ActionType::GaugeField>, QCD::GridModule > Base;
-  typedef typename Base::Product Product;
-  typedef APar Parameters;
-
-  std::unique_ptr<ActionType> ActionPtr;
-
-  ActionModule(APar Par) : Parametrized<APar>(Par) {}
-
-  template <class ReaderClass>
-  ActionModule(Reader<ReaderClass>& Reader) : Parametrized<APar>(Reader){};
-
-
-  virtual void print_parameters(){
-    Parametrized<APar>::print_parameters();
-  }
-
-  Product* getPtr() {
-    if (!ActionPtr) initialize();
-
-    return ActionPtr.get();
-  }
-
- private:
-  virtual void initialize() = 0;
-
-};
-
-//////////////////////////
-// Modules
-//////////////////////////
-
-namespace QCD{
-
-class PlaqPlusRectangleGaugeActionParameters : Serializable {
- public:
-  GRID_SERIALIZABLE_CLASS_MEMBERS(PlaqPlusRectangleGaugeActionParameters, 
-    RealD, c_plaq,
-    RealD, c_rect);
-
-};
-
-class RBCGaugeActionParameters : Serializable {
- public:
-  GRID_SERIALIZABLE_CLASS_MEMBERS(RBCGaugeActionParameters, 
-    RealD, beta,
-    RealD, c1);
-
-};
-
-class BetaGaugeActionParameters : Serializable {
- public:
-  GRID_SERIALIZABLE_CLASS_MEMBERS(BetaGaugeActionParameters, 
-    RealD, beta);
-};
-
-
-
-
-template <class Impl >
-class WilsonGModule: public ActionModule<WilsonGaugeAction<Impl>, BetaGaugeActionParameters> {
-  typedef ActionModule<WilsonGaugeAction<Impl>, BetaGaugeActionParameters> ActionBase;
-  using ActionBase::ActionBase; // for constructors
-
-  // acquire resource
-  virtual void initialize(){
-    this->ActionPtr.reset(new WilsonGaugeAction<Impl>(this->Par_.beta));
-  }
-
-};
-
-template <class Impl >
-class PlaqPlusRectangleGModule: public ActionModule<PlaqPlusRectangleAction<Impl>, PlaqPlusRectangleGaugeActionParameters> {
-  typedef ActionModule<PlaqPlusRectangleAction<Impl>, PlaqPlusRectangleGaugeActionParameters> ActionBase;
-  using ActionBase::ActionBase; // for constructors
-
-  // acquire resource
-  virtual void initialize(){
-    this->ActionPtr.reset(new PlaqPlusRectangleAction<Impl>(this->Par_.c_plaq, this->Par_.c_rect));
-  }
-
-};
-
-template <class Impl >
-class RBCGModule: public ActionModule<RBCGaugeAction<Impl>, RBCGaugeActionParameters> {
-  typedef ActionModule<RBCGaugeAction<Impl>, RBCGaugeActionParameters> ActionBase;
-  using ActionBase::ActionBase; // for constructors
-
-  // acquire resource
-  virtual void initialize(){
-    this->ActionPtr.reset(new RBCGaugeAction<Impl>(this->Par_.beta, this->Par_.c1));
-  }
-
-};
-
-
-
-
-template <class Impl >
-class SymanzikGModule: public ActionModule<SymanzikGaugeAction<Impl>, BetaGaugeActionParameters> {
-  typedef ActionModule<SymanzikGaugeAction<Impl>, BetaGaugeActionParameters> ActionBase;
-  using ActionBase::ActionBase; // for constructors
-
-  // acquire resource
-  virtual void initialize(){
-    this->ActionPtr.reset(new SymanzikGaugeAction<Impl>(this->Par_.beta));
-  }
-
-};
-
-template <class Impl >
-class IwasakiGModule: public ActionModule<IwasakiGaugeAction<Impl>, BetaGaugeActionParameters> {
-  typedef ActionModule<IwasakiGaugeAction<Impl>, BetaGaugeActionParameters> ActionBase;
-  using ActionBase::ActionBase; // for constructors
-
-  // acquire resource
-  virtual void initialize(){
-    this->ActionPtr.reset(new IwasakiGaugeAction<Impl>(this->Par_.beta));
-  }
-
-};
-
-
-template <class Impl >
-class DBW2GModule: public ActionModule<DBW2GaugeAction<Impl>, BetaGaugeActionParameters> {
-  typedef ActionModule<DBW2GaugeAction<Impl>, BetaGaugeActionParameters> ActionBase;
-  using ActionBase::ActionBase; // for constructors
-
-  // acquire resource
-  virtual void initialize(){
-    this->ActionPtr.reset(new DBW2GaugeAction<Impl>(this->Par_.beta));
-  }
-
-};
-
-/////////////////////////////////////////
-// Fermion Actions
-/////////////////////////////////////////
-
-
-template <class Impl, template <typename> class FermionA, class Params = NoParameters >
-class PseudoFermionModuleBase: public ActionModule<FermionA<Impl>, Params> {
-protected:
-  typedef ActionModule<FermionA<Impl>, Params> ActionBase;
-  using ActionBase::ActionBase; // for constructors
-
-  typedef std::unique_ptr<FermionOperatorModuleBase<FermionOperator<Impl>> > operator_type;
-  typedef std::unique_ptr<HMCModuleBase<OperatorFunction<typename Impl::FermionField> > > solver_type;
-
-  template <class ReaderClass>
-  void getFermionOperator(Reader<ReaderClass>& Reader, operator_type &fo, std::string section_name){
-    auto &FOFactory = HMC_FermionOperatorModuleFactory<fermionop_string, Impl, ReaderClass>::getInstance();
-    Reader.push(section_name);
-    std::string op_name;
-    read(Reader,"name", op_name);
-    fo = FOFactory.create(op_name, Reader);
-    Reader.pop();  
-  }
-
-  template <class ReaderClass>
-  void getSolverOperator(Reader<ReaderClass>& Reader, solver_type &so, std::string section_name){
-    auto& SolverFactory = HMC_SolverModuleFactory<solver_string, typename Impl::FermionField, ReaderClass>::getInstance();
-    Reader.push(section_name);
-    std::string solv_name;
-    read(Reader,"name", solv_name);
-    so = SolverFactory.create(solv_name, Reader);
-    Reader.pop();    
-  }
-};
-
-
-template <class Impl >
-class TwoFlavourFModule: public PseudoFermionModuleBase<Impl, TwoFlavourPseudoFermionAction>{
-  typedef PseudoFermionModuleBase<Impl, TwoFlavourPseudoFermionAction> Base;
-  using Base::Base;
-
-  typename Base::operator_type fop_mod;
-  typename Base::solver_type   solver_mod;
-
- public:
-  virtual void acquireResource(typename Base::Resource& GridMod){
-    fop_mod->AddGridPair(GridMod);
-  }
-
-   // constructor
-   template <class ReaderClass>
-   TwoFlavourFModule(Reader<ReaderClass>& R): Base(R) {
-    this->getSolverOperator(R, solver_mod, "Solver");
-    this->getFermionOperator(R, fop_mod, "Operator");
-   } 
-
-  // acquire resource
-  virtual void initialize() {
-    // here temporarily assuming that the force and action solver are the same
-    this->ActionPtr.reset(new TwoFlavourPseudoFermionAction<Impl>(*(this->fop_mod->getPtr()), *(this->solver_mod->getPtr()), *(this->solver_mod->getPtr())));
-  }
-
-};
-
-// very similar, I could have templated this but it is overkilling
-template <class Impl >
-class TwoFlavourEOFModule: public PseudoFermionModuleBase<Impl, TwoFlavourEvenOddPseudoFermionAction>{
-  typedef PseudoFermionModuleBase<Impl, TwoFlavourEvenOddPseudoFermionAction> Base;
-  using Base::Base;
-
-  typename Base::operator_type fop_mod;
-  typename Base::solver_type   solver_mod;
-
- public:
-  virtual void acquireResource(typename Base::Resource& GridMod){
-    fop_mod->AddGridPair(GridMod);
-  }
-
-   // constructor
-   template <class ReaderClass>
-   TwoFlavourEOFModule(Reader<ReaderClass>& R): PseudoFermionModuleBase<Impl, TwoFlavourEvenOddPseudoFermionAction>(R) {
-    this->getSolverOperator(R, solver_mod, "Solver");
-    this->getFermionOperator(R, fop_mod, "Operator");
-   } 
-
-  // acquire resource
-  virtual void initialize() {
-    // here temporarily assuming that the force and action solver are the same
-    this->ActionPtr.reset(new TwoFlavourEvenOddPseudoFermionAction<Impl>(*(this->fop_mod->getPtr()), *(this->solver_mod->getPtr()), *(this->solver_mod->getPtr())));
-  }
-
-};
-
-
-template <class Impl >
-class TwoFlavourRatioFModule: public PseudoFermionModuleBase<Impl, TwoFlavourRatioPseudoFermionAction>{
-  typedef PseudoFermionModuleBase<Impl, TwoFlavourRatioPseudoFermionAction> Base;
-  using Base::Base;
-
-  typename Base::operator_type fop_numerator_mod;
-  typename Base::operator_type fop_denominator_mod;
-  typename Base::solver_type   solver_mod;
-
- public:
-  virtual void acquireResource(typename Base::Resource& GridMod){
-    fop_numerator_mod->AddGridPair(GridMod);
-    fop_denominator_mod->AddGridPair(GridMod);
-  }
-
-   // constructor
-   template <class ReaderClass>
-   TwoFlavourRatioFModule(Reader<ReaderClass>& R): PseudoFermionModuleBase<Impl, TwoFlavourRatioPseudoFermionAction>(R) {
-    this->getSolverOperator(R, solver_mod, "Solver");
-    this->getFermionOperator(R, fop_numerator_mod, "Numerator");
-    this->getFermionOperator(R, fop_denominator_mod, "Denominator");
-   } 
-
-  // acquire resource
-  virtual void initialize() {
-    // here temporarily assuming that the force and action solver are the same
-    this->ActionPtr.reset(new TwoFlavourRatioPseudoFermionAction<Impl>(*(this->fop_numerator_mod->getPtr()), 
-      *(this->fop_denominator_mod->getPtr()), *(this->solver_mod->getPtr()), *(this->solver_mod->getPtr())));
-  }
-
-};
-
-template <class Impl >
-class TwoFlavourRatioEOFModule: public PseudoFermionModuleBase<Impl, TwoFlavourEvenOddRatioPseudoFermionAction>{
-  typedef PseudoFermionModuleBase<Impl, TwoFlavourEvenOddRatioPseudoFermionAction> Base;
-  using Base::Base;
-
-  typename Base::operator_type fop_numerator_mod;
-  typename Base::operator_type fop_denominator_mod;
-  typename Base::solver_type   solver_mod;
-
- public:
-  virtual void acquireResource(typename Base::Resource& GridMod){
-    fop_numerator_mod->AddGridPair(GridMod);
-    fop_denominator_mod->AddGridPair(GridMod);
-  }
-
-   // constructor
-   template <class ReaderClass>
-   TwoFlavourRatioEOFModule(Reader<ReaderClass>& R): Base(R) {
-    this->getSolverOperator(R, solver_mod, "Solver");
-    this->getFermionOperator(R, fop_numerator_mod, "Numerator");
-    this->getFermionOperator(R, fop_denominator_mod, "Denominator");
-   } 
-
-  // acquire resource
-  virtual void initialize() {
-    // here temporarily assuming that the force and action solver are the same
-    this->ActionPtr.reset(new TwoFlavourEvenOddRatioPseudoFermionAction<Impl>(*(this->fop_numerator_mod->getPtr()), 
-      *(this->fop_denominator_mod->getPtr()), *(this->solver_mod->getPtr()), *(this->solver_mod->getPtr())));
-  }
-
-};
-
-
-template <class Impl >
-class OneFlavourFModule: public PseudoFermionModuleBase<Impl, OneFlavourRationalPseudoFermionAction, OneFlavourRationalParams>{
-  typedef PseudoFermionModuleBase<Impl, OneFlavourRationalPseudoFermionAction, OneFlavourRationalParams> Base;
-  using Base::Base;
-
-  typename Base::operator_type fop_mod;
-
- public:
-  virtual void acquireResource(typename Base::Resource& GridMod){
-    fop_mod->AddGridPair(GridMod);
-  }
-
-   // constructor
-   template <class ReaderClass>
-   OneFlavourFModule(Reader<ReaderClass>& R): Base(R) {
-    this->getFermionOperator(R, fop_mod, "Operator");
-   } 
-
-  // acquire resource
-  virtual void initialize() {
-    this->ActionPtr.reset(new OneFlavourRationalPseudoFermionAction<Impl>(*(this->fop_mod->getPtr()), this->Par_ ));
-  }
-
-};
-
-template <class Impl >
-class OneFlavourEOFModule: 
-  public PseudoFermionModuleBase<Impl, OneFlavourEvenOddRationalPseudoFermionAction, OneFlavourRationalParams>
-  {
-  typedef PseudoFermionModuleBase<Impl, OneFlavourEvenOddRationalPseudoFermionAction, OneFlavourRationalParams> Base;
-  using Base::Base;
-
-  typename Base::operator_type fop_mod;
-
- public:
-  virtual void acquireResource(typename Base::Resource& GridMod){
-    fop_mod->AddGridPair(GridMod);
-  }
-
-   // constructor
-   template <class ReaderClass>
-   OneFlavourEOFModule(Reader<ReaderClass>& R): Base(R) {
-    this->getFermionOperator(R, fop_mod, "Operator");
-   } 
-
-  // acquire resource
-  virtual void initialize() {
-    this->ActionPtr.reset(new OneFlavourEvenOddRationalPseudoFermionAction<Impl>(*(this->fop_mod->getPtr()), this->Par_ ));
-  }
-
-};
-
-
-template <class Impl >
-class OneFlavourRatioFModule: 
-  public PseudoFermionModuleBase<Impl, OneFlavourRatioRationalPseudoFermionAction, OneFlavourRationalParams>
-  {
-
-  typedef PseudoFermionModuleBase<Impl, OneFlavourRatioRationalPseudoFermionAction, OneFlavourRationalParams> Base;
-  using Base::Base;
-
-  typename Base::operator_type fop_numerator_mod;
-  typename Base::operator_type fop_denominator_mod;
-
- public:
-  virtual void acquireResource(typename Base::Resource& GridMod){
-    fop_numerator_mod->AddGridPair(GridMod);
-    fop_denominator_mod->AddGridPair(GridMod);
-  }
-
-   // constructor
-   template <class ReaderClass>
-   OneFlavourRatioFModule(Reader<ReaderClass>& R): Base(R) {
-    this->getFermionOperator(R, fop_numerator_mod, "Numerator");
-    this->getFermionOperator(R, fop_denominator_mod, "Denominator");
-   } 
-
-  // acquire resource
-  virtual void initialize() {
-    this->ActionPtr.reset(new OneFlavourRatioRationalPseudoFermionAction<Impl>( *(this->fop_numerator_mod->getPtr()), 
-                                                                                *(this->fop_denominator_mod->getPtr()), 
-                                                                                this->Par_ ));
-  }
-
-};
-
-
-template <class Impl >
-class OneFlavourRatioEOFModule: 
-  public PseudoFermionModuleBase<Impl, OneFlavourEvenOddRatioRationalPseudoFermionAction, OneFlavourRationalParams>
-  {
-
-  typedef PseudoFermionModuleBase<Impl, OneFlavourEvenOddRatioRationalPseudoFermionAction, OneFlavourRationalParams> Base;
-  using Base::Base;
-
-  typename Base::operator_type fop_numerator_mod;
-  typename Base::operator_type fop_denominator_mod;
-
- public:
-  virtual void acquireResource(typename Base::Resource& GridMod){
-    fop_numerator_mod->AddGridPair(GridMod);
-    fop_denominator_mod->AddGridPair(GridMod);
-  }
-
-   // constructor
-   template <class ReaderClass>
-   OneFlavourRatioEOFModule(Reader<ReaderClass>& R): Base(R) {
-    this->getFermionOperator(R, fop_numerator_mod, "Numerator");
-    this->getFermionOperator(R, fop_denominator_mod, "Denominator");
-   } 
-
-  // acquire resource
-  virtual void initialize() {
-    this->ActionPtr.reset(new OneFlavourEvenOddRatioRationalPseudoFermionAction<Impl>(*(this->fop_numerator_mod->getPtr()), 
-                                                                                      *(this->fop_denominator_mod->getPtr()), 
-                                                                                      this->Par_ ));
-  }
-
-};
-
-}// QCD temporarily here
-
-
-
-
-
-
-
-////////////////////////////////////////
-// Factories specialisations
-////////////////////////////////////////
-
-
-
-// use the same classed defined by Antonin, does not make sense to rewrite
-// Factory is perfectly fine
-// Registar must be changed because I do not want to use the ModuleFactory
-
-// explicit ref to LatticeGaugeField must be changed or put in the factory
-//typedef ActionModuleBase< QCD::Action< QCD::LatticeGaugeField >, QCD::GridModule > HMC_LGTActionModBase;
-//typedef ActionModuleBase< QCD::Action< QCD::LatticeReal >, QCD::GridModule > HMC_ScalarActionModBase;
-
-template <char const *str, class Field, class ReaderClass >
-class HMC_ActionModuleFactory
-    : public Factory < ActionModuleBase< QCD::Action< Field >, QCD::GridModule > , Reader<ReaderClass> > {
- public:
-  typedef Reader<ReaderClass> TheReader; 
-  // use SINGLETON FUNCTOR MACRO HERE
-  HMC_ActionModuleFactory(const HMC_ActionModuleFactory& e) = delete;
-  void operator=(const HMC_ActionModuleFactory& e) = delete;
-  static HMC_ActionModuleFactory& getInstance(void) {
-    static HMC_ActionModuleFactory e;
-    return e;
-  }
-
- private:
-  HMC_ActionModuleFactory(void) = default;
-    std::string obj_type() const {
-        return std::string(str);
-  }
-};
-
-
-extern char gauge_string[];
-} // Grid
-
-
-#endif //HMC_MODULES_H
@@ -1,109 +0,0 @@
-/*************************************************************************************
-Grid physics library, www.github.com/paboyle/Grid 
-Source file: Factory.h
-
-Copyright (C) 2015
-Copyright (C) 2016
-
-Author: Antonin Portelli <antonin.portelli@me.com>
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#ifndef Factory_hpp_
-#define Factory_hpp_
-
-
-namespace Grid{
-
-
-
-
-/******************************************************************************
- *                        abstract factory class                              *
- ******************************************************************************/
-template <typename T, typename CreatorInput>
-class Factory
-{
-public:
-    typedef std::function< std::unique_ptr<T> (const CreatorInput&) > Func;
-
-    // constructor
-    Factory(void) = default;
-    // destructor
-    virtual ~Factory(void) = default;
-    // registration
-    void registerBuilder(const std::string type, const Func &f);
-    // get builder list
-    std::vector<std::string> getBuilderList(void) const;
-    // factory
-    std::unique_ptr<T> create(const std::string type,
-              								const CreatorInput& input) const;
-private:
-    std::map<std::string, Func> builder_;
-    virtual std::string obj_type() const = 0;
-};
-
-/******************************************************************************
- *                         template implementation                            *
- ******************************************************************************/
-// registration ////////////////////////////////////////////////////////////////
-template <typename T, typename CreatorInput>
-void Factory<T, CreatorInput>::registerBuilder(const std::string type, const Func &f)
-{
-    builder_[type] = f;
-}
-
-// get module list /////////////////////////////////////////////////////////////
-template <typename T, typename CreatorInput>
-std::vector<std::string> Factory<T, CreatorInput>::getBuilderList(void) const
-{
-    std::vector<std::string> list;
-    
-    for (auto &b: builder_)
-    {
-        list.push_back(b.first);
-    }
-    
-    return list;
-}
-
-// factory /////////////////////////////////////////////////////////////////////
-template <typename T, typename CreatorInput>
-std::unique_ptr<T> Factory<T, CreatorInput>::create(const std::string type,
-                                      							const CreatorInput& input) const
-{
-    Func func;
-    
-    std::cout << GridLogDebug << "Creating object of type "<< type << std::endl;
-    try
-    {
-        func = builder_.at(type);
-    }
-    catch (std::out_of_range &)
-    {
-      //HADRONS_ERROR("object of type '" + type + "' unknown");
-    	std::cout << GridLogError << "Error" << std::endl;
-    	std::cout << GridLogError << obj_type() << " object of name [" << type << "] unknown" << std::endl;
-    	exit(1);
-    }
-    
-    return func(input);
-}
-
-}
-
-#endif // Factory_hpp_
@@ -1,243 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/modules/FermionOperatorModules.h
-
-Copyright (C) 2016
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef FERMIONOPERATOR_MODULES_H
-#define FERMIONOPERATOR_MODULES_H
-
-namespace Grid {
-
-////////////////////////////////////
-//  Fermion operators
-/////////////////////////////////////
-template < class Product>
-class FermionOperatorModuleBase : public HMCModuleBase<Product>{
-public:
-  virtual void AddGridPair(QCD::GridModule&) = 0;
-};
-
-template <template <typename> class FOType, class FermionImpl, class FOPar>
-class FermionOperatorModule
-    : public Parametrized<FOPar>,
-      public FermionOperatorModuleBase<QCD::FermionOperator<FermionImpl> > {
-
-protected:
-  std::unique_ptr< FOType<FermionImpl> > FOPtr;
-  std::vector< QCD::GridModule* >    GridRefs;
- public:
-  typedef HMCModuleBase< QCD::FermionOperator<FermionImpl> > Base;
-  typedef typename Base::Product Product;
-
-  FermionOperatorModule(FOPar Par) : Parametrized<FOPar>(Par) {}
-
-  template <class ReaderClass>
-  FermionOperatorModule(Reader<ReaderClass>& Reader) : Parametrized<FOPar>(Reader){};
-
-  void AddGridPair(QCD::GridModule &Mod){
-    if (GridRefs.size()>1){
-      std::cout << GridLogError << "Adding too many Grids to the FermionOperatorModule" << std::endl;
-      exit(1);
-    }
-    GridRefs.push_back(&Mod);
-
-    if (Ls()){
-      GridRefs.push_back(new QCD::GridModule());
-      GridRefs[1]->set_full(QCD::SpaceTimeGrid::makeFiveDimGrid(Ls(),GridRefs[0]->get_full()));
-      GridRefs[1]->set_rb(QCD::SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls(),GridRefs[0]->get_full()));
-    }
-  }
-
-  virtual unsigned int Ls(){
-    return 0;
-  }
-
-  virtual void print_parameters(){
-    std::cout << this->Par_ << std::endl;
-  }
-
-  Product* getPtr() {
-    if (!FOPtr) initialize();
-
-    return FOPtr.get();
-  }
-
- private:
-  virtual void initialize() = 0;
-};
-
-
-
-// Factory
-template <char const *str, class FermionImpl, class ReaderClass >
-class HMC_FermionOperatorModuleFactory
-    : public Factory < FermionOperatorModuleBase<QCD::FermionOperator<FermionImpl> > ,  Reader<ReaderClass> > {
- public:
-  // use SINGLETON FUNCTOR MACRO HERE
-  typedef Reader<ReaderClass> TheReader;
-
-  HMC_FermionOperatorModuleFactory(const HMC_FermionOperatorModuleFactory& e) = delete;
-  void operator=(const HMC_FermionOperatorModuleFactory& e) = delete;
-  static HMC_FermionOperatorModuleFactory& getInstance(void) {
-    static HMC_FermionOperatorModuleFactory e;
-    return e;
-  }
-
- private:
-  HMC_FermionOperatorModuleFactory(void) = default;
-    std::string obj_type() const {
-        return std::string(str);
-  }
-};
-
-
-
-
-extern char fermionop_string[];
-namespace QCD{
-
-// Modules
-class WilsonFermionParameters : Serializable {
- public:
-  GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonFermionParameters,
-    RealD, mass);
-};
-
-
-template <class FermionImpl >
-class WilsonFermionModule: public FermionOperatorModule<WilsonFermion, FermionImpl, WilsonFermionParameters> {
-  typedef FermionOperatorModule<WilsonFermion, FermionImpl, WilsonFermionParameters> FermBase;
-  using FermBase::FermBase; // for constructors
-
-  // acquire resource
-  virtual void initialize(){
-    auto GridMod = this->GridRefs[0];
-    typename FermionImpl::GaugeField U(GridMod->get_full());
-    this->FOPtr.reset(new WilsonFermion<FermionImpl>(U, *(GridMod->get_full()), *(GridMod->get_rb()), this->Par_.mass));
-  }
-};
-
-
-
-class MobiusFermionParameters : Serializable {
- public:
-  GRID_SERIALIZABLE_CLASS_MEMBERS(MobiusFermionParameters,
-    RealD, mass,
-    RealD, M5,
-    RealD, b,
-    RealD, c,
-    unsigned int, Ls);
-};
-
-template <class FermionImpl >
-class MobiusFermionModule: public FermionOperatorModule<MobiusFermion, FermionImpl, MobiusFermionParameters> {
-  typedef FermionOperatorModule<MobiusFermion, FermionImpl, MobiusFermionParameters> FermBase;
-  using FermBase::FermBase; // for constructors
-
-  virtual unsigned int Ls(){
-    return this->Par_.Ls;
-  }
-
-  // acquire resource
-  virtual void initialize(){
-    auto GridMod = this->GridRefs[0];
-    auto GridMod5d = this->GridRefs[1];
-    typename FermionImpl::GaugeField U(GridMod->get_full());
-    this->FOPtr.reset(new MobiusFermion<FermionImpl>( U, *(GridMod->get_full()), *(GridMod->get_rb()),
-                                                      *(GridMod5d->get_full()), *(GridMod5d->get_rb()),
-                                                      this->Par_.mass, this->Par_.M5, this->Par_.b, this->Par_.c));
-  }
-};
-
-
-class DomainWallFermionParameters : Serializable {
- public:
-  GRID_SERIALIZABLE_CLASS_MEMBERS(DomainWallFermionParameters,
-    RealD, mass,
-    RealD, M5,
-    unsigned int, Ls);
-};
-
-template <class FermionImpl >
-class DomainWallFermionModule: public FermionOperatorModule<DomainWallFermion, FermionImpl, DomainWallFermionParameters> {
-  typedef FermionOperatorModule<DomainWallFermion, FermionImpl, DomainWallFermionParameters> FermBase;
-  using FermBase::FermBase; // for constructors
-
-  virtual unsigned int Ls(){
-    return this->Par_.Ls;
-  }
-
-  // acquire resource
-  virtual void initialize(){
-    auto GridMod = this->GridRefs[0];
-    auto GridMod5d = this->GridRefs[1];
-    typename FermionImpl::GaugeField U(GridMod->get_full());
-    this->FOPtr.reset(new DomainWallFermion<FermionImpl>( U, *(GridMod->get_full()), *(GridMod->get_rb()),
-                                                      *(GridMod5d->get_full()), *(GridMod5d->get_rb()),
-                                                      this->Par_.mass, this->Par_.M5));
-  }
-};
-
-
-class DomainWallEOFAFermionParameters : Serializable {
- public:
-  GRID_SERIALIZABLE_CLASS_MEMBERS(DomainWallEOFAFermionParameters,
-    RealD, mq1,
-    RealD, mq2,
-    RealD, mq3,
-    RealD, shift,
-    int, pm,
-    RealD, M5,
-    unsigned int, Ls);
-};
-
-template <class FermionImpl >
-class DomainWallEOFAFermionModule: public FermionOperatorModule<DomainWallEOFAFermion, FermionImpl, DomainWallEOFAFermionParameters> {
-  typedef FermionOperatorModule<DomainWallEOFAFermion, FermionImpl, DomainWallEOFAFermionParameters> FermBase;
-  using FermBase::FermBase; // for constructors
-
-  virtual unsigned int Ls(){
-    return this->Par_.Ls;
-  }
-
-  // acquire resource
-  virtual void initialize(){
-    auto GridMod = this->GridRefs[0];
-    auto GridMod5d = this->GridRefs[1];
-    typename FermionImpl::GaugeField U(GridMod->get_full());
-    this->FOPtr.reset(new DomainWallEOFAFermion<FermionImpl>( U, *(GridMod->get_full()), *(GridMod->get_rb()),
-                                                      *(GridMod5d->get_full()), *(GridMod5d->get_rb()),
-                                                      this->Par_.mq1, this->Par_.mq2, this->Par_.mq3,
-                                                      this->Par_.shift, this->Par_.pm, this->Par_.M5));
-  }
-};
-
-
-} // QCD
-} // Grid
-
-
-#endif //FERMIONOPERATOR_MODULES_H
@@ -1,39 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/modules/Modules.cc
-
-Copyright (C) 2016
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-
-namespace Grid{
-
-char gauge_string[]      = "gauge";
-char cp_string[]         = "CheckPointer";
-char hmc_string[]        = "HMC";
-char observable_string[] = "Observable";
-char solver_string[]     = "Solver";
-char fermionop_string[]  = "FermionOperator";
-
-}
@@ -1,130 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/modules/Modules.h
-
-Copyright (C) 2016
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef HMC_MODULES_H
-#define HMC_MODULES_H
-
-/*
-Define loadable, serializable modules
-for the HMC execution
-*/
-
-namespace Grid {
-
-// Empty tag class for modules without parameters (selects Parametrized<NoParameters>)
-class NoParameters{};
-
-
-/*
-Mixin that stores a parameter object of type P for a module
-*/
-template < class P >
-class Parametrized {
- public:
-  typedef P Parameters;
-
-  // Construct from an already-built parameter object
-  Parametrized(Parameters Par) : Par_(Par) {}
-
-  // Construct by reading the parameters from section_name of the reader
-  template <class ReaderClass>
-  Parametrized(Reader<ReaderClass> & R, std::string section_name = "parameters") {
-    read(R, section_name, Par_);
-  }
-
-  void set_parameters(Parameters Par) { this->Par_ = Par; }
-
-  void print_parameters() { std::cout << this->Par_ << std::endl; }
-
- protected:
-  Parameters Par_;
-
- private:
-  // never assigned: the constructor argument of the same name shadows it
-  std::string section_name;
-};
-
-
-// Specialisation for parameterless modules: stores nothing, every member is a no-op
-template <>
-class Parametrized<NoParameters> {
- public:
-  typedef NoParameters Parameters;
-
-  Parametrized(Parameters Par) {}
-
-  template <class ReaderClass>
-  Parametrized(Reader<ReaderClass> & Reader) {}
-
-  void set_parameters(Parameters Par) {}
-
-  void print_parameters() {}
-};
-
-
-
-////////////////////////////////////////
-// Lowest level abstract module class
-////////////////////////////////////////
-template <class Prod>
-class HMCModuleBase {
- public:
-  typedef Prod Product;
-
-  // virtual: modules are polymorphic and may be destroyed through a base pointer
-  virtual ~HMCModuleBase() {}
-
-  virtual Prod* getPtr() = 0;  // pointer to the wrapped product; add a getReference?
-  virtual void print_parameters(){};  // default to nothing
-};
-
-
-/////////////////////////////////////////////
-// Registration class
-/////////////////////////////////////////////
-// Constructing a Registrar<T, TheFactory> registers, under className, a builder
-// that creates a T from a TheFactory::TheReader.
-template <class T, class TheFactory>
-class Registrar {
- public:
-  Registrar(std::string className) {
-    // The builder is handed to the factory and may be called after this
-    // constructor returns, so it must not capture locals by reference; it needs none.
-    TheFactory::getInstance().registerBuilder(
-        className, [](typename TheFactory::TheReader Reader) {
-          return std::unique_ptr<T>(new T(Reader));
-        });
-  }
-};
-
-
-
-}
-
-
-#endif //HMC_MODULES_H
@@ -1,158 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/modules/ObservableModules.h
-
-Copyright (C) 2016
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef HMC_OBSERVABLE_MODULES_H
-#define HMC_OBSERVABLE_MODULES_H
-
-namespace Grid {
-
-/////////////////////////////
-// Observables
-/////////////////////////////
-// Module that builds an ObservableType on demand and exposes it through the
-// HMCModuleBase interface as an HmcObservable product
-template <class ObservableType, class OPar>
-class ObservableModule
-    : public Parametrized<OPar>,
-      public HMCModuleBase< QCD::HmcObservable<typename ObservableType::Field> > {
- public:
-  typedef HMCModuleBase< QCD::HmcObservable< typename ObservableType::Field> > Base;
-  typedef typename Base::Product Product;
-  typedef OPar Parameters;
-
-  std::unique_ptr<ObservableType> ObservablePtr;
-
-  ObservableModule(OPar Par) : Parametrized<OPar>(Par) {}
-
-  template <class ReaderClass>
-  ObservableModule(Reader<ReaderClass>& Reader) : Parametrized<OPar>(Reader) {}
-
-  virtual void print_parameters() { Parametrized<OPar>::print_parameters(); }
-
-  // Calls initialize() while ObservablePtr is still empty, then returns the raw pointer
-  Product* getPtr() {
-    if (ObservablePtr == nullptr) initialize();
-    return ObservablePtr.get();
-  }
-
- private:
-  virtual void initialize() = 0;  // implemented by subclasses to create ObservablePtr
-};
-
-
-
-////////////////
-// Modules
-////////////////
-
-namespace QCD{
-
-//// Observables module: serializable parameter class with one string member, output_prefix
-class PlaquetteObsParameters : Serializable {
- public:
-  GRID_SERIALIZABLE_CLASS_MEMBERS(PlaquetteObsParameters, 
-    std::string, output_prefix);
-};
-
-// Module producing a PlaquetteLogger<Impl>; it takes no parameters
-template < class Impl >
-class PlaquetteMod : public ObservableModule<PlaquetteLogger<Impl>, NoParameters> {
-  typedef ObservableModule<PlaquetteLogger<Impl>, NoParameters> ObsBase;
-  using ObsBase::ObsBase;  // inherit the base constructors
-
-  // build the logger when the base class asks for it
-  virtual void initialize() { this->ObservablePtr.reset(new PlaquetteLogger<Impl>()); }
-
- public:
-  PlaquetteMod() : ObsBase(NoParameters()) {}
-};
-
-// Module producing a PolyakovLogger<Impl>; it takes no parameters
-template < class Impl >
-class PolyakovMod : public ObservableModule<PolyakovLogger<Impl>, NoParameters> {
-  typedef ObservableModule<PolyakovLogger<Impl>, NoParameters> ObsBase;
-  using ObsBase::ObsBase;  // inherit the base constructors
-
-  // build the logger when the base class asks for it
-  virtual void initialize() { this->ObservablePtr.reset(new PolyakovLogger<Impl>()); }
-
- public:
-  PolyakovMod() : ObsBase(NoParameters()) {}
-};
-
-
-// Module producing a TopologicalCharge<Impl> configured by TopologyObsParameters
-template < class Impl >
-class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters>{
-  typedef ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters> ObsBase;
-  using ObsBase::ObsBase; // for constructors
-
-  // acquire resource: build the observable from the stored parameters
-  virtual void initialize(){ this->ObservablePtr.reset(new TopologicalCharge<Impl>(this->Par_)); }
-  public:
-  TopologicalChargeMod(TopologyObsParameters Par): ObsBase(Par){}
-  // ObsBase has no default constructor, so hand it default-constructed parameters
-  TopologicalChargeMod(): ObsBase(TopologyObsParameters()){}
-};
-
-
-}// QCD temporarily here
-
-
-////////////////////////////////////////
-// Factories specialisations
-////////////////////////////////////////
-// explicit ref to LatticeGaugeField must be changed or put in the factory
-//typedef HMCModuleBase< QCD::HmcObservable<QCD::LatticeGaugeField> > HMC_ObsModBase;
-
-// Singleton factory of observable modules; obj_type() reports the tag string str
-template <char const *str, class Field, class ReaderClass >
-class HMC_ObservablesModuleFactory
-    : public Factory < HMCModuleBase< QCD::HmcObservable<Field> >, Reader<ReaderClass> > {
- public:
-  typedef Reader<ReaderClass> TheReader;
-  // use SINGLETON FUNCTOR MACRO HERE
-  HMC_ObservablesModuleFactory(const HMC_ObservablesModuleFactory&) = delete;
-  void operator=(const HMC_ObservablesModuleFactory&) = delete;
-
-  static HMC_ObservablesModuleFactory& getInstance() {
-    static HMC_ObservablesModuleFactory instance;
-    return instance;
-  }
-
- private:
-  HMC_ObservablesModuleFactory() = default;
-  std::string obj_type() const { return std::string(str); }
-};
-
-extern char observable_string[];
-
-}
-
-
-#endif //HMC_OBSERVABLE_MODULES_H
@@ -1,129 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/modules/Registration.h
-
-Copyright (C) 2016
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#ifndef MODULES_REGISTRATION_H
-#define MODULES_REGISTRATION_H
-
-// simplify with macros
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Actions
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-using WilsonGMod            = QCD::WilsonGModule<ImplementationPolicy>;
-using SymanzikGMod          = QCD::SymanzikGModule<ImplementationPolicy>;
-using IwasakiGMod           = QCD::IwasakiGModule<ImplementationPolicy>;
-using DBW2GMod              = QCD::DBW2GModule<ImplementationPolicy>;
-using RBCGMod               = QCD::RBCGModule<ImplementationPolicy>;
-using PlaqPlusRectangleGMod = QCD::PlaqPlusRectangleGModule<ImplementationPolicy>;
-// each static Registrar below registers its module type under the quoted name
-static Registrar<QCD::WilsonGMod, HMC_ActionModuleFactory<gauge_string, typename ImplementationPolicy::Field, Serialiser> > __WGmodXMLInit("Wilson");
-static Registrar<QCD::SymanzikGMod, HMC_ActionModuleFactory<gauge_string, typename ImplementationPolicy::Field, Serialiser> > __SymGmodXMLInit("Symanzik");
-static Registrar<QCD::IwasakiGMod, HMC_ActionModuleFactory<gauge_string, typename ImplementationPolicy::Field, Serialiser> > __IwGmodXMLInit("Iwasaki");
-static Registrar<QCD::DBW2GMod, HMC_ActionModuleFactory<gauge_string, typename ImplementationPolicy::Field, Serialiser> > __DBW2GmodXMLInit("DBW2");
-static Registrar<QCD::RBCGMod, HMC_ActionModuleFactory<gauge_string, typename ImplementationPolicy::Field, Serialiser> > __RBCGmodXMLInit("RBC");
-static Registrar<QCD::PlaqPlusRectangleGMod, HMC_ActionModuleFactory<gauge_string, typename ImplementationPolicy::Field, Serialiser> > __PPRectGmodXMLInit("PlaqPlusRect");
-
-
-// FIXME: fermion actions are registered in the gauge-action factory; needs a more general implementation
-static Registrar<QCD::TwoFlavourFModule<FermionImplementationPolicy>,
-    HMC_ActionModuleFactory<gauge_string, typename ImplementationPolicy::Field, Serialiser> > __TwoFlavourFmodXMLInit("TwoFlavours");
-static Registrar<QCD::TwoFlavourRatioFModule<FermionImplementationPolicy>,
-    HMC_ActionModuleFactory<gauge_string, typename ImplementationPolicy::Field, Serialiser> > __TwoFlavourRatioFmodXMLInit("TwoFlavoursRatio");
-static Registrar<QCD::TwoFlavourEOFModule<FermionImplementationPolicy>,
-    HMC_ActionModuleFactory<gauge_string, typename ImplementationPolicy::Field, Serialiser> > __TwoFlavourEOFmodXMLInit("TwoFlavoursEvenOdd");
-static Registrar<QCD::TwoFlavourRatioEOFModule<FermionImplementationPolicy>,
-    HMC_ActionModuleFactory<gauge_string, typename ImplementationPolicy::Field, Serialiser> > __TwoFlavourRatioEOFmodXMLInit("TwoFlavoursEvenOddRatio");
-static Registrar<QCD::OneFlavourFModule<FermionImplementationPolicy>,
-    HMC_ActionModuleFactory<gauge_string, typename ImplementationPolicy::Field, Serialiser> > __OneFlavourFmodXMLInit("OneFlavour");
-static Registrar<QCD::OneFlavourEOFModule<FermionImplementationPolicy>,
-    HMC_ActionModuleFactory<gauge_string, typename ImplementationPolicy::Field, Serialiser> > __OneFlavourEOFmodXMLInit("OneFlavourEvenOdd");
-static Registrar<QCD::OneFlavourRatioFModule<FermionImplementationPolicy>,
-    HMC_ActionModuleFactory<gauge_string, typename ImplementationPolicy::Field, Serialiser> > __OneFlavourRatioFmodXMLInit("OneFlavourRatio");
-static Registrar<QCD::OneFlavourRatioEOFModule<FermionImplementationPolicy>,
-    HMC_ActionModuleFactory<gauge_string, typename ImplementationPolicy::Field, Serialiser> > __OneFlavourRatioEOFmodXMLInit("OneFlavourEvenOddRatio");
-
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Solvers
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-// Solvers are registered per fermion field type: CG and CR must be
-// instantiated again for every new fermion field type (candidate for a macro)
-
-static Registrar<ConjugateGradientModule<QCD::WilsonFermionR::FermionField>,
-    HMC_SolverModuleFactory<solver_string, QCD::WilsonFermionR::FermionField, Serialiser> > __CGWFmodXMLInit("ConjugateGradient");
-static Registrar<ConjugateResidualModule<QCD::WilsonFermionR::FermionField>,
-    HMC_SolverModuleFactory<solver_string, QCD::WilsonFermionR::FermionField, Serialiser> > __CRWFmodXMLInit("ConjugateResidual");
-
-// TODO: add the staggered and scalar versions here
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Fermion operators
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-static Registrar<QCD::WilsonFermionModule<FermionImplementationPolicy>,
-    HMC_FermionOperatorModuleFactory<fermionop_string, FermionImplementationPolicy, Serialiser> > __WilsonFOPmodXMLInit("Wilson");
-static Registrar<QCD::MobiusFermionModule<FermionImplementationPolicy>,
-    HMC_FermionOperatorModuleFactory<fermionop_string, FermionImplementationPolicy, Serialiser> > __MobiusFOPmodXMLInit("Mobius");
-static Registrar<QCD::DomainWallFermionModule<FermionImplementationPolicy>,
-    HMC_FermionOperatorModuleFactory<fermionop_string, FermionImplementationPolicy, Serialiser> > __DWFOPmodXMLInit("DomainWall");
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Observables
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-static Registrar<QCD::PlaquetteMod<ImplementationPolicy>, HMC_ObservablesModuleFactory<observable_string, typename ImplementationPolicy::Field, Serialiser> > __OBSPLmodXMLInit("Plaquette");
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Checkpointers
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-static Registrar<QCD::BinaryCPModule<ImplementationPolicy>, HMC_CPModuleFactory<cp_string, ImplementationPolicy, Serialiser> > __CPBinarymodXMLInit("Binary");
-static Registrar<QCD::NerscCPModule<ImplementationPolicy>, HMC_CPModuleFactory<cp_string, ImplementationPolicy, Serialiser> > __CPNerscmodXMLInit("Nersc");
-
-#ifdef HAVE_LIME
-static Registrar<QCD::ILDGCPModule<ImplementationPolicy>, HMC_CPModuleFactory<cp_string, ImplementationPolicy, Serialiser> > __CPILDGmodXMLInit("ILDG");
-#endif
-
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Integrators
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-static Registrar<HMCLeapFrog<ImplementationPolicy, RepresentationPolicy, Serialiser>, HMCRunnerModuleFactory<hmc_string, Serialiser> > __HMCLFmodXMLInit("LeapFrog");
-static Registrar<HMCMinimumNorm2<ImplementationPolicy, RepresentationPolicy, Serialiser>, HMCRunnerModuleFactory<hmc_string, Serialiser> > __HMCMN2modXMLInit("MinimumNorm2");
-static Registrar<HMCForceGradient<ImplementationPolicy, RepresentationPolicy, Serialiser>, HMCRunnerModuleFactory<hmc_string, Serialiser> > __HMCFGmodXMLInit("ForceGradient");
-
-using HMCModuleFactory = HMCRunnerModuleFactory<hmc_string, Serialiser>;
-
-#endif
@@ -1,138 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/modules/SolverModules.h
-
-Copyright (C) 2016
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef SOLVER_MODULES_H
-#define SOLVER_MODULES_H
-
-namespace Grid {
-
-//////////////////////////////////////////////
-//       Operator Functions (Solvers)
-//////////////////////////////////////////////
-// Module that builds a SolverType<Field> on demand and exposes it as an OperatorFunction<Field>
-template <template <typename> class SolverType, class Field, class SPar>
-class SolverModule
-    : public Parametrized<SPar>,
-      public HMCModuleBase<OperatorFunction<Field> > {
- public:
-  typedef HMCModuleBase< OperatorFunction<Field> > Base;
-  typedef typename Base::Product Product;
-
-  std::unique_ptr< SolverType<Field> > SolverPtr;
-
-  SolverModule(SPar Par) : Parametrized<SPar>(Par) {}
-
-  template <class ReaderClass>
-  SolverModule(Reader<ReaderClass>& Reader) : Parametrized<SPar>(Reader){};
-
-  // Delegate to Parametrized so this also compiles when SPar is NoParameters
-  virtual void print_parameters(){
-    Parametrized<SPar>::print_parameters();
-  }
-
-  Product* getPtr() {
-    if (!SolverPtr) initialize();  // built on the first request
-    return SolverPtr.get();
-  }
-
- private:
-  virtual void initialize() = 0;  // implemented by subclasses to create SolverPtr
-};
-
-
-// Singleton factory of solver modules; obj_type() reports the tag string str
-template <char const *str, class Field, class ReaderClass >
-class HMC_SolverModuleFactory
-    : public Factory < HMCModuleBase<OperatorFunction<Field> > ,  Reader<ReaderClass> > {
- public:
-  // use SINGLETON FUNCTOR MACRO HERE
-  typedef Reader<ReaderClass> TheReader;
-
-  HMC_SolverModuleFactory(const HMC_SolverModuleFactory&) = delete;
-  void operator=(const HMC_SolverModuleFactory&) = delete;
-
-  static HMC_SolverModuleFactory& getInstance() {
-    static HMC_SolverModuleFactory instance;
-    return instance;
-  }
-
- private:
-  HMC_SolverModuleFactory() = default;
-
-  std::string obj_type() const { return std::string(str); }
-};
-
-
-
-class SolverParameters : Serializable {
- public:
-  GRID_SERIALIZABLE_CLASS_MEMBERS(SolverParameters,
-    RealD, tolerance,
-    RealD, max_iterations);
-  // NOTE(review): max_iterations is declared RealD although it names a count - confirm; add error on no convergence?
-};
-
-
-class SolverObjName: Serializable {
-public:
-  GRID_SERIALIZABLE_CLASS_MEMBERS(SolverObjName, 
-  std::string, name,
-  SolverParameters, parameters);
-  // pairs a name string with a SolverParameters object
-};
-
-
-
-// Solver module that builds a ConjugateGradient<Field> from the stored tolerance and max_iterations
-template <class Field >
-class ConjugateGradientModule : public SolverModule<ConjugateGradient, Field, SolverParameters> {
-  typedef SolverModule<ConjugateGradient, Field, SolverParameters> SolverBase;
-  using SolverBase::SolverBase;  // inherit the base constructors
-  virtual void initialize() {
-    const SolverParameters &p = this->Par_;
-    this->SolverPtr.reset(new ConjugateGradient<Field>(p.tolerance, p.max_iterations, true));
-  }
-};
-
-// Solver module that builds a ConjugateResidual<Field> from the stored tolerance and max_iterations
-template <class Field >
-class ConjugateResidualModule : public SolverModule<ConjugateResidual, Field, SolverParameters> {
-  typedef SolverModule<ConjugateResidual, Field, SolverParameters> SolverBase;
-  using SolverBase::SolverBase;  // inherit the base constructors
-
-  virtual void initialize() {
-    const SolverParameters &p = this->Par_;
-    this->SolverPtr.reset(new ConjugateResidual<Field>(p.tolerance, p.max_iterations));
-  }
-};
-
-extern char solver_string[];
-} // Grid
-
-
-#endif //SOLVER_MODULES_H
@@ -1,46 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/gauge/WilsonGaugeAction.h
-
-Copyright (C) 2016
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef MODS_H
-#define MODS_H
-
-// Modules files
-
-#include <Grid/qcd/modules/Factory.h>
-#include <Grid/qcd/modules/Modules.h>
-
-
-#include <Grid/qcd/hmc/checkpointers/CheckPointerModules.h>
-#include <Grid/qcd/modules/SolverModules.h>
-#include <Grid/qcd/modules/FermionOperatorModules.h>
-#include <Grid/qcd/modules/ActionModules.h>
-#include <Grid/qcd/modules/ObservableModules.h>
-
-
-
-#endif //MODS_H
@@ -1,51 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/observables/hmc_observable.h
-
-Copyright (C) 2017
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#ifndef HMC_OBSERVABLE_H
-#define HMC_OBSERVABLE_H
-
-namespace Grid{
-
-// Abstract observable interface with a single hook, TrajectoryComplete
-template <class Field>
-class HmcObservable {
- public:
-  virtual ~HmcObservable() {}  // polymorphic base: allow deletion through a base pointer
-  virtual void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG,
-                                  GridParallelRNG &pRNG) = 0;
-};
-
-}  // namespace Grid
-
-#include "plaquette.h"
-#include "topological_charge.h"
-#include "polyakov_loop.h"
-
-
-#endif  //  HMC_OBSERVABLE_H
@@ -1,68 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/observables/plaquette.h
-
-Copyright (C) 2017
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#ifndef HMC_PLAQUETTE_H
-#define HMC_PLAQUETTE_H
-
-namespace Grid {
-namespace QCD {
-
-// Logs the average plaquette; this is only defined for a gauge theory
-template <class Impl>
-class PlaquetteLogger : public HmcObservable<typename Impl::Field> {
- public:
-  // here forces the Impl to be of gauge fields
-  // if not the compiler will complain
-  INHERIT_GIMPL_TYPES(Impl);
-
-  // necessary for HmcObservable compatibility
-  typedef typename Impl::Field Field;
-
-  // Prints "Plaquette: [ traj ] value" through GridLogMessage; the RNGs are unused.
-  void TrajectoryComplete(int traj,
-                          Field &U,
-                          GridSerialRNG &sRNG,
-                          GridParallelRNG &pRNG) {
-    RealD plaq = WilsonLoops<Impl>::avgPlaquette(U);
-    // precision() returns std::streamsize; saved so it can be restored below
-    std::streamsize def_prec = std::cout.precision();
-
-    // plaq is a RealD, so size the precision from RealD rather than Real
-    std::cout << GridLogMessage
-        << std::setprecision(std::numeric_limits<RealD>::digits10 + 1)
-        << "Plaquette: [ " << traj << " ] "<< plaq << std::endl;
-
-    std::cout.precision(def_prec);
-  }
-};
-
-}  // namespace QCD
-}  // namespace Grid
-
-#endif  // HMC_PLAQUETTE_H
@@ -1,68 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/observables/polyakov_loop.h
-
-Copyright (C) 2017
-
-Author: David Preti <david.preti@csic.es>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#ifndef HMC_POLYAKOV_H
-#define HMC_POLYAKOV_H
-
-namespace Grid {
-namespace QCD {
-
-// Logs the average Polyakov loop; this is only defined for a gauge theory
-template <class Impl>
-class PolyakovLogger : public HmcObservable<typename Impl::Field> {
- public:
-  // here forces the Impl to be of gauge fields
-  // if not the compiler will complain
-  INHERIT_GIMPL_TYPES(Impl);
-
-  // necessary for HmcObservable compatibility
-  typedef typename Impl::Field Field;
-
-  // Prints "Polyakov Loop: [ traj ] value" through GridLogMessage; the RNGs are unused.
-  void TrajectoryComplete(int traj,
-                          Field &U,
-                          GridSerialRNG &sRNG,
-                          GridParallelRNG &pRNG) {
-    ComplexD polyakov = WilsonLoops<Impl>::avgPolyakovLoop(U);
-    // precision() returns std::streamsize; saved so it can be restored below
-    std::streamsize def_prec = std::cout.precision();
-
-    // polyakov is double precision (ComplexD), so size the precision from RealD rather than Real
-    std::cout << GridLogMessage
-        << std::setprecision(std::numeric_limits<RealD>::digits10 + 1)
-        << "Polyakov Loop: [ " << traj << " ] "<< polyakov << std::endl;
-
-    std::cout.precision(def_prec);
-  }
-};
-
-}  // namespace QCD
-}  // namespace Grid
-
-#endif  // HMC_POLYAKOV_H
@@ -1,122 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/observables/topological_charge.h
-
-Copyright (C) 2017
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#ifndef HMC_TOP_CHARGE_H
-#define HMC_TOP_CHARGE_H
-
-namespace Grid {
-namespace QCD {
-
-// Serializable smearing settings (steps, step_size, meas_interval, maxTau); all default to zero
-struct TopologySmearingParameters : Serializable {
-    GRID_SERIALIZABLE_CLASS_MEMBERS(TopologySmearingParameters,
-    int, steps,
-    float, step_size,
-    int, meas_interval,
-    float, maxTau);
-
-    TopologySmearingParameters(int s = 0, float ss = 0.0f, int mi = 0, float mT = 0.0f)
-        : steps(s), step_size(ss), meas_interval(mi), maxTau(mT) {}
-
-    // fill the members from the "Smearing" section of the reader
-    template < class ReaderClass >
-    TopologySmearingParameters(Reader<ReaderClass>& Reader) { read(Reader, "Smearing", *this); }
-};
-
-
-
-// Serializable settings of the topological charge measurement
-struct TopologyObsParameters : Serializable {
-    GRID_SERIALIZABLE_CLASS_MEMBERS(TopologyObsParameters,
-      int, interval,
-      bool, do_smearing,
-      TopologySmearingParameters, Smearing);
-
-    // `smearing` sets the do_smearing flag; Smearing keeps its default-constructed values
-    TopologyObsParameters(int interval = 1, bool smearing = false):
-        interval(interval), do_smearing(smearing){}
-
-    template <class ReaderClass >
-    TopologyObsParameters(Reader<ReaderClass>& Reader){ read(Reader, "TopologyMeasurement", *this); }
-};
-
-
-// Measures the topological charge every Pars.interval trajectories, optionally
-// after smearing the field; this is only defined for a gauge theory
-template <class Impl>
-class TopologicalCharge : public HmcObservable<typename Impl::Field> {
-    TopologyObsParameters Pars;
-
- public:
-    // here forces the Impl to be of gauge fields
-    // if not the compiler will complain
-    INHERIT_GIMPL_TYPES(Impl);
-
-    // necessary for HmcObservable compatibility
-    typedef typename Impl::Field Field;
-
-    TopologicalCharge(int interval = 1, bool do_smearing = false):
-        Pars(interval, do_smearing){}
-
-    TopologicalCharge(TopologyObsParameters P):Pars(P){
-        std::cout << GridLogDebug << "Creating TopologicalCharge " << std::endl;
-    }
-
-    // Logs the charge of U (of the smeared copy when Pars.do_smearing is set)
-    // through GridLogMessage; the RNG arguments are unused.
-    void TrajectoryComplete(int traj,
-                            Field &U,
-                            GridSerialRNG &sRNG,
-                            GridParallelRNG &pRNG) {
-        // interval == 0 would make the modulo divide by zero: measure nothing in that case
-        if (Pars.interval == 0 || traj % Pars.interval != 0) return;
-        Field Usmear = U;  // working copy, replaced by the smeared field below
-        std::streamsize def_prec = std::cout.precision();  // restored at the end
-
-        if (Pars.do_smearing){
-            // using wilson flow by default here
-            WilsonFlow<PeriodicGimplR> WF(Pars.Smearing.steps, Pars.Smearing.step_size, Pars.Smearing.meas_interval);
-            WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau);
-            Real T0   = WF.energyDensityPlaquette(Usmear);
-            std::cout << GridLogMessage << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
-                      << "T0                : [ " << traj << " ] "<< T0 << std::endl;
-        }
-
-        Real q    = WilsonLoops<Impl>::TopologicalCharge(Usmear);
-        std::cout << GridLogMessage
-            << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
-            << "Topological Charge: [ " << traj << " ] "<< q << std::endl;
-
-        std::cout.precision(def_prec);
-    }
-};
-}
-}
-
-#endif  //  HMC_TOP_CHARGE_H
@@ -1,44 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/modules/plaquette.h
-
-Copyright (C) 2017
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-/*
-  @brief Declares base smearing class Smear
- */
-#ifndef BASE_SMEAR_
-#define BASE_SMEAR_
-
-// Abstract interface implemented by every gauge-field smearing scheme.
-template <class Gimpl>
-class Smear{
-public:
-  // Brings GaugeField and the related typedefs of Gimpl into scope.
-  INHERIT_GIMPL_TYPES(Gimpl)
-
-  virtual ~Smear(){}
-
-  // Write the smeared version of `in` into `out`.
-  virtual void smear(GaugeField& out, const GaugeField& in) const = 0;
-
-  // Derivative hook of the smearing; the meaning of the three fields is
-  // defined by the implementing class.
-  virtual void derivative(GaugeField&,
-                          const GaugeField&,
-                          const GaugeField&) const = 0;
-};
-#endif
@@ -1,213 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/modules/plaquette.h
-
-Copyright (C) 2017
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#ifndef WILSONFLOW_H
-#define WILSONFLOW_H
-
-namespace Grid {
-namespace QCD {
-
-// Wilson-flow smearing: integrates the gauge field along the gradient of a
-// Wilson gauge action, with a fixed-step and an adaptive-step variant.
-template <class Gimpl>
-class WilsonFlow: public Smear<Gimpl>{
-    unsigned int Nstep;             // number of fixed-size steps taken by smear()
-    unsigned int measure_interval;  // log the topological charge every this many steps
-    // epsilon: integration step size (rewritten by the adaptive integrator)
-    // taus   : flow time accumulated by the adaptive integrator
-    mutable RealD epsilon, taus;
-
-
-    mutable WilsonGaugeAction<Gimpl> SG;  // action whose derivative drives the flow
-
-    void evolve_step(typename Gimpl::GaugeField&) const;
-    void evolve_step_adaptive(typename Gimpl::GaugeField&, RealD);
-    // Flow time associated with the zero-based step index t.
-    RealD tau(unsigned int t)const {return epsilon*(t+1.0); }
-
- public:
-    INHERIT_GIMPL_TYPES(Gimpl)
-
-    // Nstep    : number of steps for the fixed-step smear()
-    // epsilon  : step size, must be strictly positive
-    // interval : topological-charge logging interval, in steps
-    // The initializers follow the member declaration order, and taus starts
-    // from a defined value so energyDensityPlaquette(U) never reads garbage.
-    explicit WilsonFlow(unsigned int Nstep, RealD epsilon, unsigned int interval = 1):
-        Nstep(Nstep),
-        measure_interval(interval),
-        epsilon(epsilon),
-        taus(0.0),
-        SG(WilsonGaugeAction<Gimpl>(3.0)) {
-            // WilsonGaugeAction with beta 3.0
-            assert(epsilon > 0.0);
-            LogMessage();
-    }
-
-    // Print the integration parameters to the Grid message log.
-    void LogMessage() {
-        std::cout << GridLogMessage
-            << "[WilsonFlow] Nstep   : " << Nstep << std::endl;
-        std::cout << GridLogMessage
-            << "[WilsonFlow] epsilon : " << epsilon << std::endl;
-        std::cout << GridLogMessage
-            << "[WilsonFlow] full trajectory : " << Nstep * epsilon << std::endl;
-    }
-
-    // Fixed-step flow: Nstep steps of size epsilon.
-    virtual void smear(GaugeField&, const GaugeField&) const;
-
-    virtual void derivative(GaugeField&, const GaugeField&, const GaugeField&) const {
-        assert(0);
-        // undefined for WilsonFlow
-    }
-
-    // Adaptive-step flow, integrating until the flow time reaches maxTau.
-    void smear_adaptive(GaugeField&, const GaugeField&, RealD maxTau);
-    // Energy density at the flow time tau(step).
-    RealD energyDensityPlaquette(unsigned int step, const GaugeField& U) const;
-    // Energy density at the flow time stored in taus.
-    RealD energyDensityPlaquette(const GaugeField& U) const;
-};
-
-
-////////////////////////////////////////////////////////////////////////////////
-// Implementations
-////////////////////////////////////////////////////////////////////////////////
-// Advance U in place by one integration step of size epsilon.
-// The step has three stages; each stage evaluates the derivative of the gauge
-// action on the current U, combines it with the previous stage's generator Z
-// using fixed coefficients, and applies Gimpl::update_field.
-// In the trailing comments Z0, Z1, Z2 are the derivatives at stages 0, 1, 2
-// and W0, W1, W2 the field before each stage.
-template <class Gimpl>
-void WilsonFlow<Gimpl>::evolve_step(typename Gimpl::GaugeField &U) const{
-    GaugeField Z(U._grid);
-    GaugeField tmp(U._grid);
-    // Stage 1
-    SG.deriv(U, Z);
-    Z *= 0.25;                                  // Z0 = 1/4 * F(U)
-    Gimpl::update_field(Z, U, -2.0*epsilon);    // U = W1 = exp(ep*Z0)*W0
-
-    // Stage 2: reuse the scaled stage-1 generator before adding the new derivative
-    Z *= -17.0/8.0;
-    SG.deriv(U, tmp); Z += tmp;                 // -17/32*Z0 +Z1
-    Z *= 8.0/9.0;                               // Z = -17/36*Z0 +8/9*Z1
-    Gimpl::update_field(Z, U, -2.0*epsilon);    // U_= W2 = exp(ep*Z)*W1
-
-    // Stage 3
-    Z *= -4.0/3.0;
-    SG.deriv(U, tmp); Z += tmp;                 // 4/3*(17/36*Z0 -8/9*Z1) +Z2
-    Z *= 3.0/4.0;                               // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2
-    Gimpl::update_field(Z, U, -2.0*epsilon);    // V(t+e) = exp(ep*Z)*W2
-}
-
-// Advance U in place by one adaptive step and choose the next step size.
-// Side effects on the object: taus is increased by the step actually taken
-// and epsilon is overwritten with the proposed size of the next step.
-// The same three-stage update as evolve_step produces U; a second estimate
-// Uprime is built from the starting field with the generator -Z0 + 2*Z1, and
-// the squared norm of U - Uprime drives the step-size adjustment.
-template <class Gimpl>
-void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, RealD maxTau) {
-    // Shorten the step so that taus does not go past maxTau.
-    if (maxTau - taus < epsilon){
-        epsilon = maxTau-taus;
-    }
-    //std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
-    GaugeField Z(U._grid);
-    GaugeField Zprime(U._grid);
-    GaugeField tmp(U._grid), Uprime(U._grid);
-    Uprime = U;                                 // keep the starting field W0
-    SG.deriv(U, Z);
-    Zprime = -Z;                                // Z' starts as -Z0 (unscaled derivative)
-    Z *= 0.25;                                  // Z0 = 1/4 * F(U)
-    Gimpl::update_field(Z, U, -2.0*epsilon);    // U = W1 = exp(ep*Z0)*W0
-
-    Z *= -17.0/8.0;
-    SG.deriv(U, tmp); Z += tmp;                 // -17/32*Z0 +Z1
-    Zprime += 2.0*tmp;                          // Z' = -Z0 + 2*Z1
-    Z *= 8.0/9.0;                               // Z = -17/36*Z0 +8/9*Z1
-    Gimpl::update_field(Z, U, -2.0*epsilon);    // U_= W2 = exp(ep*Z)*W1
-    
-
-    Z *= -4.0/3.0;
-    SG.deriv(U, tmp); Z += tmp;                 // 4/3*(17/36*Z0 -8/9*Z1) +Z2
-    Z *= 3.0/4.0;                               // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2
-    Gimpl::update_field(Z, U, -2.0*epsilon);    // V(t+e) = exp(ep*Z)*W2
-
-    // Ramos 
-    // NOTE(review): presumably refers to the adaptive step-size scheme of Fritzsch and Ramos - confirm.
-    Gimpl::update_field(Zprime, Uprime, -2.0*epsilon); // V'(t+e) = exp(ep*Z')*W0
-    // Compute distance as norm^2 of the difference
-    GaugeField diffU = U - Uprime;
-    RealD diff = norm2(diffU);
-    // adjust integration step
-    
-    // Account for the step just taken before epsilon is changed.
-    taus += epsilon;
-    //std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
-    
-    // Rescale the step by 0.95*(1e-4/diff)^(1/3); diff == 0 gives an infinite
-    // proposal, which the clipping at the top of the next call reduces again.
-    epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.);
-    //std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
-
-}
-
-// Returns 2 * t^2 * S(U) / (global site count) with t = tau(step).
-template <class Gimpl>
-RealD WilsonFlow<Gimpl>::energyDensityPlaquette(unsigned int step, const GaugeField& U) const {
-    const RealD flow_time = tau(step);
-    const auto  action    = SG.S(U);
-    return 2.0 * flow_time * flow_time * action / U._grid->gSites();
-}
-
-// Returns 2 * taus^2 * S(U) / (global site count), using the flow time
-// currently stored in the member taus.
-template <class Gimpl>
-RealD WilsonFlow<Gimpl>::energyDensityPlaquette(const GaugeField& U) const {
-    const auto action = SG.S(U);
-    return 2.0 * taus * taus * action / U._grid->gSites();
-}
-
-
-//#define WF_TIMING 
-
-
-
-// Fixed-step Wilson flow: copies `in` to `out` and applies Nstep steps of
-// size epsilon, logging the energy density after every step and the
-// topological charge every measure_interval steps.
-//
-// Changes with respect to the previous version:
-//  * the logged flow time is step*epsilon, the time actually reached after
-//    `step` steps (tau(step) is (step+1)*epsilon, one step too far);
-//  * the topological charge is measured with WilsonLoops<Gimpl>, not a
-//    hard-coded PeriodicGimplR;
-//  * the clock is read only when WF_TIMING is defined;
-//  * measure_interval == 0 disables the charge log instead of evaluating
-//    step % 0.
-template <class Gimpl>
-void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const {
-    out = in;
-    for (unsigned int step = 1; step <= Nstep; step++) {
-#ifdef WF_TIMING
-        auto start = std::chrono::high_resolution_clock::now();
-#endif
-        evolve_step(out);
-#ifdef WF_TIMING
-        auto end = std::chrono::high_resolution_clock::now();
-        std::chrono::duration<double> diff = end - start;
-        std::cout << "Time to evolve " << diff.count() << " s\n";
-#endif
-        // Flow time reached after `step` steps of size epsilon.
-        const RealD flow_time = epsilon * step;
-        // Same formula as energyDensityPlaquette, evaluated at flow_time.
-        const RealD t2E = 2.0 * flow_time * flow_time * SG.S(out) / out._grid->gSites();
-        std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
-                  << step << "  " << flow_time << "  "
-                  << t2E << std::endl;
-        if (measure_interval != 0 && step % measure_interval == 0) {
-            std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : "
-                      << step << "  "
-                      << WilsonLoops<Gimpl>::TopologicalCharge(out) << std::endl;
-        }
-    }
-}
-
-// Adaptive-step Wilson flow: copies `in` to `out` and integrates until the
-// accumulated flow time taus reaches maxTau, logging the energy density
-// after every step and the topological charge every measure_interval steps.
-// On return taus holds the flow time reached, so energyDensityPlaquette(out)
-// can be called directly afterwards.
-//
-// Changes with respect to the previous version:
-//  * taus starts from zero; evolve_step_adaptive adds every step it takes,
-//    so starting from epsilon over-counted the flow time by one step and
-//    stopped the integration early;
-//  * the configured epsilon is restored on exit, so the adapted step size
-//    no longer leaks into later smear()/smear_adaptive() calls;
-//  * nothing is integrated when maxTau is not positive;
-//  * the topological charge is measured with WilsonLoops<Gimpl>, not a
-//    hard-coded PeriodicGimplR;
-//  * measure_interval == 0 disables the charge log instead of evaluating
-//    step % 0.
-template <class Gimpl>
-void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, RealD maxTau){
-    out = in;
-    const RealD initial_epsilon = epsilon;  // evolve_step_adaptive rewrites epsilon
-    taus = 0.0;
-    unsigned int step = 0;
-    while (taus < maxTau) {
-        step++;
-        evolve_step_adaptive(out, maxTau);
-        std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
-                  << step << "  " << taus << "  "
-                  << energyDensityPlaquette(out) << std::endl;
-        if (measure_interval != 0 && step % measure_interval == 0) {
-            std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : "
-                      << step << "  "
-                      << WilsonLoops<Gimpl>::TopologicalCharge(out) << std::endl;
-        }
-    }
-    epsilon = initial_epsilon;
-}
-
-
-}  // namespace QCD
-}  // namespace Grid
-
-#endif   // WILSONFLOW_H
@@ -1,197 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/scalar/CovariantLaplacian.h
-
-Copyright (C) 2016
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#ifndef COVARIANT_LAPLACIAN_H
-#define COVARIANT_LAPLACIAN_H
-
-namespace Grid {
-namespace QCD {
-
-// Serializable bundle of settings consumed by LaplacianAdjointField.
-struct LaplacianParams : Serializable {
-  GRID_SERIALIZABLE_CLASS_MEMBERS(LaplacianParams,
-                                  RealD, lo,
-                                  RealD, hi,
-                                  int,   MaxIter,
-                                  RealD, tolerance,
-                                  int,   degree,
-                                  int,   precision);
-
-  // Every argument has a default, so this also serves as default constructor.
-  LaplacianParams(RealD lo = 0.0, RealD hi = 1.0,
-                  int maxit = 1000, RealD tol = 1.0e-8,
-                  int degree = 10, int precision = 64)
-    : lo(lo), hi(hi), MaxIter(maxit), tolerance(tol), degree(degree), precision(precision)
-  {}
-};
-
-
-
-////////////////////////////////////////////////////////////
-// Laplacian operator L on adjoint fields
-//
-// phi: adjoint field
-// L: D_mu^dag D_mu
-//
-// L phi(x) = Sum_mu [ U_mu(x)phi(x+mu)U_mu(x)^dag + 
-//                     U_mu(x-mu)^dag phi(x-mu)U_mu(x-mu)
-//                     -2phi(x)]
-//
-// Operator designed to be encapsulated by
-// an HermitianLinearOperator<.. , ..>
-////////////////////////////////////////////////////////////
-
-// Metric built from the covariant Laplacian acting on adjoint fields.
-// Holds one gauge link field per direction (set via ImportGauge), a linear
-// solver used for the inverse, and two multi-shift rational approximations
-// used for the square root and the inverse square root.
-template <class Impl>
-class LaplacianAdjointField: public Metric<typename Impl::Field> {
-  OperatorFunction<typename Impl::Field> &Solver;  // solver used by Minv
-  LaplacianParams param;
-  MultiShiftFunction PowerHalf;      // used by MSquareRoot
-  MultiShiftFunction PowerInvHalf;   // used by MInvSquareRoot
-
- public:
-  INHERIT_GIMPL_TYPES(Impl);
-
-  // grid : lattice on which the link fields are allocated
-  // S    : solver, kept by reference (must outlive this object)
-  // p    : approximation and solver parameters (copied)
-  // k    : kappa, the weight of the Laplacian term in M
-  // The initializers are listed in member declaration order (kappa and U are
-  // declared at the end of the class), which is the order they run in anyway.
-  LaplacianAdjointField(GridBase* grid, OperatorFunction<GaugeField>& S, LaplacianParams& p, const RealD k = 1.0)
-      : Solver(S), param(p), kappa(k), U(Nd, grid) {
-        AlgRemez remez(param.lo,param.hi,param.precision);
-        std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
-        remez.generateApprox(param.degree,1,2);
-        // The same Remez result initialises both functions; only the last flag differs.
-        PowerHalf.Init(remez,param.tolerance,false);
-        PowerInvHalf.Init(remez,param.tolerance,true);
-      }
-
-  // Not implemented for this operator.
-  void Mdir(const GaugeField&, GaugeField&, int, int){ assert(0);}
-  void Mdiag(const GaugeField&, GaugeField&){ assert(0);}
-
-  // Split the Lorentz-indexed gauge field into one link field per direction.
-  void ImportGauge(const GaugeField& _U) {
-    for (int mu = 0; mu < Nd; mu++) {
-      U[mu] = PeekIndex<LorentzIndex>(_U, mu);
-    }
-  }
-
-  // Apply the operator component by component:
-  //   out_nu = (1 - kappa) in_nu - kappa/(4 Nd) * Sum_mu [ U_mu in_nu(x+mu) U_mu^dag
-  //            + shift_{-mu}( U_mu^dag in_nu U_mu ) - 2 in_nu ]
-  void M(const GaugeField& in, GaugeField& out) {
-    // in is an antihermitian matrix
-    // test
-    //GaugeField herm = in + adj(in);
-    //std::cout << "AHermiticity: " << norm2(herm) << std::endl;
-
-    GaugeLinkField tmp(in._grid);
-    GaugeLinkField tmp2(in._grid);
-    GaugeLinkField sum(in._grid);
-
-    for (int nu = 0; nu < Nd; nu++) {
-      sum = zero;
-      GaugeLinkField in_nu = PeekIndex<LorentzIndex>(in, nu);
-      GaugeLinkField out_nu(out._grid);
-      for (int mu = 0; mu < Nd; mu++) {
-        tmp = U[mu] * Cshift(in_nu, mu, +1) * adj(U[mu]);
-        tmp2 = adj(U[mu]) * in_nu * U[mu];
-        sum += tmp + Cshift(tmp2, mu, -1) - 2.0 * in_nu;
-      }
-      out_nu = (1.0 - kappa) * in_nu - kappa / (double(4 * Nd)) * sum;
-      PokeIndex<LorentzIndex>(out, out_nu, nu);
-    }
-  }
-
-  // Derivative term with the same field on both sides; writes one link
-  // component of `der` per direction mu.
-  void MDeriv(const GaugeField& in, GaugeField& der) {
-    // in is anti-hermitian
-    RealD factor = -kappa / (double(4 * Nd));
-
-    for (int mu = 0; mu < Nd; mu++){
-      GaugeLinkField der_mu(der._grid);
-      der_mu = zero;
-      for (int nu = 0; nu < Nd; nu++){
-        GaugeLinkField in_nu = PeekIndex<LorentzIndex>(in, nu);
-        der_mu += U[mu] * Cshift(in_nu, mu, 1) * adj(U[mu]) * in_nu;
-      }
-      // the minus sign comes by using the in_nu instead of the
-      // adjoint in the last multiplication
-      PokeIndex<LorentzIndex>(der,  -2.0 * factor * der_mu, mu);
-    }
-  }
-
-  // separating this temporarily
-  // Two-field variant: symmetrised in `left` and `right`.
-  void MDeriv(const GaugeField& left, const GaugeField& right,
-              GaugeField& der) {
-    // in is anti-hermitian
-    RealD factor = -kappa / (double(4 * Nd));
-
-    for (int mu = 0; mu < Nd; mu++) {
-      GaugeLinkField der_mu(der._grid);
-      der_mu = zero;
-      for (int nu = 0; nu < Nd; nu++) {
-        GaugeLinkField left_nu = PeekIndex<LorentzIndex>(left, nu);
-        GaugeLinkField right_nu = PeekIndex<LorentzIndex>(right, nu);
-        der_mu += U[mu] * Cshift(left_nu, mu, 1) * adj(U[mu]) * right_nu;
-        der_mu += U[mu] * Cshift(right_nu, mu, 1) * adj(U[mu]) * left_nu;
-      }
-      PokeIndex<LorentzIndex>(der, -factor * der_mu, mu);
-    }
-  }
-
-  // Solve M * inverted = in with the solver supplied at construction.
-  void Minv(const GaugeField& in, GaugeField& inverted){
-    HermitianLinearOperator<LaplacianAdjointField<Impl>,GaugeField> HermOp(*this);
-    Solver(HermOp, in, inverted);
-  }
-
-  // Replace P by the multi-shift CG result built from PowerHalf.
-  void MSquareRoot(GaugeField& P){
-    GaugeField Gp(P._grid);
-    HermitianLinearOperator<LaplacianAdjointField<Impl>,GaugeField> HermOp(*this);
-    ConjugateGradientMultiShift<GaugeField> msCG(param.MaxIter,PowerHalf);
-    msCG(HermOp,P,Gp);
-    P = Gp;
-  }
-
-  // Replace P by the multi-shift CG result built from PowerInvHalf.
-  void MInvSquareRoot(GaugeField& P){
-    GaugeField Gp(P._grid);
-    HermitianLinearOperator<LaplacianAdjointField<Impl>,GaugeField> HermOp(*this);
-    ConjugateGradientMultiShift<GaugeField> msCG(param.MaxIter,PowerInvHalf);
-    msCG(HermOp,P,Gp);
-    P = Gp;
-  }
-
-
-
- private:
-  RealD kappa;                    // weight of the Laplacian term
-  std::vector<GaugeLinkField> U;  // gauge links, one field per direction
-};
-
-}
-}
-
-#endif
@@ -1,193 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Copyright (C) 2015
-
-Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-//#include <Grid/Grid.h>
-
-#ifndef GRID_QCD_GAUGE_FIX_H
-#define GRID_QCD_GAUGE_FIX_H
-namespace Grid {
-namespace QCD {
-
-// Steepest-descent gauge fixing, with an optional Fourier-accelerated step.
-// All members are static; the class only groups the routines and inherits
-// the gauge typedefs from Gimpl.
-template <class Gimpl> 
-class FourierAcceleratedGaugeFixer  : public Gimpl {
- public:
-  INHERIT_GIMPL_TYPES(Gimpl);
-
-  typedef typename Gimpl::GaugeLinkField GaugeMat;
-  typedef typename Gimpl::GaugeField GaugeLorentz;
-
-  // For each direction mu, set A[mu] = Ta(U[mu]) * (-i).
-  // A must already hold Nd fields.
-  static void GaugeLinkToLieAlgebraField(const std::vector<GaugeMat> &U,std::vector<GaugeMat> &A) {
-    for(int mu=0;mu<Nd;mu++){
-      Complex cmi(0.0,-1.0);
-      A[mu] = Ta(U[mu]) * cmi;
-    }
-  }
-  // dmuAmu = Sum_mu ( A[mu] - Cshift(A[mu],mu,-1) ): a backward lattice difference summed over directions.
-  static void DmuAmu(const std::vector<GaugeMat> &A,GaugeMat &dmuAmu) {
-    dmuAmu=zero;
-    for(int mu=0;mu<Nd;mu++){
-      dmuAmu = dmuAmu + A[mu] - Cshift(A[mu],mu,-1);
-    }
-  }  
-  // Iterate gauge-fixing steps on Umu in place, for at most maxiter iterations.
-  //   alpha     : step size handed to each step
-  //   Omega_tol : tolerance on Omega = 1 - trG
-  //   Phi_tol   : tolerance on |Phi|, Phi = 1 - (previous link trace)/(current link trace)
-  //   Fourier   : choose the Fourier-accelerated step instead of the plain one
-  // Convergence is tested only every 20th iteration; the routine returns
-  // silently if maxiter is exhausted without convergence.
-  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false) {
-    GridBase *grid = Umu._grid;
-
-    // org_plaq is computed but not used afterwards.
-    Real org_plaq      =WilsonLoops<Gimpl>::avgPlaquette(Umu);
-    Real org_link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); 
-    Real old_trace = org_link_trace;
-    Real trG;
-
-    std::vector<GaugeMat> U(Nd,grid);
-                 GaugeMat dmuAmu(grid);
-
-    for(int i=0;i<maxiter;i++){
-      // Unpack the links, take one step, and write the transformed links back.
-      for(int mu=0;mu<Nd;mu++) U[mu]= PeekIndex<LorentzIndex>(Umu,mu);
-      if ( Fourier==false ) { 
-	trG = SteepestDescentStep(U,alpha,dmuAmu);
-      } else { 
-	trG = FourierAccelSteepestDescentStep(U,alpha,dmuAmu);
-      }
-      for(int mu=0;mu<Nd;mu++) PokeIndex<LorentzIndex>(Umu,U[mu],mu);
-      // Monitor progress and convergence test 
-      // infrequently to minimise cost overhead
-      if ( i %20 == 0 ) { 
-	Real plaq      =WilsonLoops<Gimpl>::avgPlaquette(Umu);
-	Real link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); 
-
-	if (Fourier) 
-	  std::cout << GridLogMessage << "Fourier Iteration "<<i<< " plaq= "<<plaq<< " dmuAmu " << norm2(dmuAmu)<< std::endl;
-	else 
-	  std::cout << GridLogMessage << " Iteration "<<i<< " plaq= "<<plaq<< " dmuAmu " << norm2(dmuAmu)<< std::endl;
-	
-	// Phi: relative change of the link trace since the previous check.
-	// Omega: distance of the normalised trace of the step's transformation g from 1.
-	Real Phi  = 1.0 - old_trace / link_trace ;
-	Real Omega= 1.0 - trG;
-
-
-	std::cout << GridLogMessage << " Iteration "<<i<< " Phi= "<<Phi<< " Omega= " << Omega<< " trG " << trG <<std::endl;
-	if ( (Omega < Omega_tol) && ( ::fabs(Phi) < Phi_tol) ) {
-	  std::cout << GridLogMessage << "Converged ! "<<std::endl;
-	  return;
-	}
-
-	old_trace = link_trace;
-
-      }
-    }
-  };
-  // One plain steepest-descent step: builds g via ExpiAlphaDmuAmu, applies
-  // SU<Nc>::GaugeTransform(U,g), and returns Re(sum trace g)/(volume*Nc).
-  // dmuAmu is overwritten with the divergence used for the step.
-  static Real SteepestDescentStep(std::vector<GaugeMat> &U,Real & alpha, GaugeMat & dmuAmu) {
-    GridBase *grid = U[0]._grid;
-
-    std::vector<GaugeMat> A(Nd,grid);
-    GaugeMat g(grid);
-
-    GaugeLinkToLieAlgebraField(U,A);
-    ExpiAlphaDmuAmu(A,g,alpha,dmuAmu);
-
-
-    Real vol = grid->gSites();
-    Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
-
-    SU<Nc>::GaugeTransform(U,g);
-
-    return trG;
-  }
-
-  // One Fourier-accelerated step: the divergence is transformed with the FFT,
-  // multiplied by Fp = 16/psq, transformed back, and then exponentiated as in
-  // the plain step. Returns the same normalised trace of g. On return dmuAmu
-  // holds the back-transformed (rescaled) field, not the raw divergence.
-  static Real FourierAccelSteepestDescentStep(std::vector<GaugeMat> &U,Real & alpha, GaugeMat & dmuAmu) {
-
-    GridBase *grid = U[0]._grid;
-
-    Real vol = grid->gSites();
-
-    FFT theFFT((GridCartesian *)grid);
-
-    LatticeComplex  Fp(grid);
-    LatticeComplex  psq(grid); psq=zero;
-    LatticeComplex  pmu(grid); 
-    LatticeComplex   one(grid); one = Complex(1.0,0.0);
-
-    GaugeMat g(grid);
-    GaugeMat dmuAmu_p(grid);
-    std::vector<GaugeMat> A(Nd,grid);
-
-    GaugeLinkToLieAlgebraField(U,A);
-
-    DmuAmu(A,dmuAmu);
-
-    theFFT.FFT_all_dim(dmuAmu_p,dmuAmu,FFT::forward);
-
-    //////////////////////////////////
-    // Work out Fp = psq_max/ psq...
-    //////////////////////////////////
-    std::vector<int> latt_size = grid->GlobalDimensions();
-    std::vector<int> coor(grid->_ndimension,0);
-    for(int mu=0;mu<Nd;mu++) {
-
-      // psq accumulates 4*sin^2(p_mu/2) with p_mu = 2*pi*x_mu/L_mu.
-      Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
-      LatticeCoordinate(pmu,mu);
-      pmu = TwoPiL * pmu ;
-      psq = psq + 4.0*sin(pmu*0.5)*sin(pmu*0.5); 
-    }
-
-    Complex psqMax(16.0);
-    Fp =  psqMax*one/psq;
-
-    /*
-    static int once;
-    if ( once == 0 ) { 
-      std::cout << " Fp " << Fp <<std::endl;
-      once ++;
-      }*/
-
-    // psq vanishes at the origin (coor is all zeros), so the division above
-    // is singular there; overwrite that site of Fp with 1.
-    pokeSite(TComplex(1.0),Fp,coor);
-
-    dmuAmu_p  = dmuAmu_p * Fp; 
-
-    theFFT.FFT_all_dim(dmuAmu,dmuAmu_p,FFT::backward);
-
-    GaugeMat ciadmam(grid);
-    Complex cialpha(0.0,-alpha);
-    ciadmam = dmuAmu*cialpha;
-    SU<Nc>::taExp(ciadmam,g);
-
-    Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
-
-    SU<Nc>::GaugeTransform(U,g);
-
-    return trG;
-  }
-
-  // Compute dmuAmu from A, then g = SU<Nc>::taExp( -i * alpha * dmuAmu ).
-  static void ExpiAlphaDmuAmu(const std::vector<GaugeMat> &A,GaugeMat &g,Real & alpha, GaugeMat &dmuAmu) {
-    GridBase *grid = g._grid;
-    Complex cialpha(0.0,-alpha);
-    GaugeMat ciadmam(grid);
-    DmuAmu(A,dmuAmu);
-    ciadmam = dmuAmu*cialpha;
-    SU<Nc>::taExp(ciadmam,g);
-  }  
-};
-
-}
-}
-#endif
@@ -1,226 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/hmc/integrators/Integrator.h
-
-Copyright (C) 2015
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-//--------------------------------------------------------------------
-#ifndef METRIC_H
-#define METRIC_H
-
-namespace Grid{
-namespace QCD{
-
-// Abstract interface of a metric acting on fields of type Field.
-// Implementations supply the operator, its inverse, its (inverse) square
-// root and its derivative terms.
-template <typename Field> 
-class Metric{
-public:
-  // Virtual destructor: implementations are handled through Metric&
-  // (for instance by GeneralisedMomenta), so deleting through a base
-  // pointer must be well defined.
-  virtual ~Metric() {}
-
-  virtual void ImportGauge(const Field&)   = 0;
-  virtual void M(const Field&, Field&)     = 0;
-  virtual void Minv(const Field&, Field&)  = 0;
-  virtual void MSquareRoot(Field&) = 0;
-  virtual void MInvSquareRoot(Field&) = 0;
-  virtual void MDeriv(const Field&, Field&) = 0;
-  virtual void MDeriv(const Field&, const Field&, Field&) = 0;
-};
-
-
-// Need a trivial operator
-// Identity metric: M and Minv copy their input, the square roots leave the
-// field untouched, and both derivative terms are zero.
-template <typename Field>
-class TrivialMetric : public Metric<Field>{
-public:
-  // Nothing to import: the identity does not depend on the gauge field.
-  virtual void ImportGauge(const Field&) {}
-
-  virtual void M(const Field& in, Field& out) { out = in; }
-
-  virtual void Minv(const Field& in, Field& out) { out = in; }
-
-  // The square root of the identity is the identity: P is left as is.
-  virtual void MSquareRoot(Field&) {}
-
-  virtual void MInvSquareRoot(Field&) {}
-
-  virtual void MDeriv(const Field&, Field& out) { out = zero; }
-
-  virtual void MDeriv(const Field&, const Field&, Field& out) { out = zero; }
-};
-
-///////////////////////////////
-// Generalised momenta
-///////////////////////////////
-
-// Momenta with a non-trivial kinetic term defined by a Metric, together with
-// a pair of auxiliary fields (AuxMom, AuxField).
-// Unused lattice temporaries have been removed; each one allocated a full
-// field on every call without ever being read.
-template <typename Implementation>
-class GeneralisedMomenta{
-public:
-  typedef typename Implementation::Field MomentaField;  //for readability
-  typedef typename Implementation::GaugeLinkField MomentaLinkField;  //for readability
-  Metric<MomentaField>& M;  // metric, held by reference (must outlive this object)
-  MomentaField Mom;         // the conjugate momenta
-
-  // Auxiliary fields
-  // not hard coded but inherit the type from the metric
-  // created Nd new fields
-  // hide these in the metric?
-  //typedef Lattice<iVector<iScalar<iMatrix<vComplex, Nc> >, Nd/2 > > AuxiliaryMomentaType;
-  MomentaField AuxMom;
-  MomentaField AuxField;
-
-  GeneralisedMomenta(GridBase* grid, Metric<MomentaField>& M): M(M), Mom(grid), AuxMom(grid), AuxField(grid){}
-
-  // Draw new momenta and auxiliary fields.
-  // Mom is generated by the implementation and passed through M.MSquareRoot;
-  // AuxMom is generated and passed through M.MInvSquareRoot; AuxField is
-  // generated and left as drawn. The order of the three generate_momenta
-  // calls is kept, so the random stream is consumed exactly as before.
-  void MomentaDistribution(GridParallelRNG& pRNG){
-    // Generate a distribution for
-    // P^dag G P
-    // where G = M^-1
-
-    // Generate gaussian momenta
-    Implementation::generate_momenta(Mom, pRNG);
-    // Modify the distribution with the metric
-    M.MSquareRoot(Mom);
-
-    // Auxiliary momenta
-    // do nothing if trivial, so hide in the metric
-    Implementation::generate_momenta(AuxMom, pRNG);
-    Implementation::generate_momenta(AuxField, pRNG);
-    // Modify the distribution with the metric
-    // Aux^dag M Aux
-    M.MInvSquareRoot(AuxMom);  // applied in place to AuxMom
-  }
-
-  // Real part of Sum_x Sum_mu tr[ Mom M^-1 Mom + AuxMom M AuxMom + AuxField^2 ].
-  RealD MomentaAction(){
-    MomentaField inv(Mom._grid);
-    inv = zero;
-    M.Minv(Mom, inv);
-    LatticeComplex Hloc(Mom._grid);
-    Hloc = zero;
-    for (int mu = 0; mu < Nd; mu++) {
-      // This is not very general
-      // hide in the metric
-      auto Mom_mu = PeekIndex<LorentzIndex>(Mom, mu);
-      auto inv_mu = PeekIndex<LorentzIndex>(inv, mu);
-      Hloc += trace(Mom_mu * inv_mu);
-    }
-
-    // Auxiliary Fields
-    // hide in the metric
-    M.M(AuxMom, inv);  // inv is reused as scratch for M * AuxMom
-    for (int mu = 0; mu < Nd; mu++) {
-      // This is not very general
-      // hide in the operators
-      auto inv_mu = PeekIndex<LorentzIndex>(inv, mu);
-      auto am_mu = PeekIndex<LorentzIndex>(AuxMom, mu);
-      auto af_mu = PeekIndex<LorentzIndex>(AuxField, mu);
-      Hloc += trace(am_mu * inv_mu);// p M p
-      Hloc += trace(af_mu * af_mu);
-    }
-
-    Complex Hsum = sum(Hloc);
-    return Hsum.real();
-  }
-
-  // Derivative of the kinetic term with respect to the gauge field:
-  // der = projectForce( MDeriv( M^-1 in ) ).
-  void DerivativeU(MomentaField& in, MomentaField& der){
-    MomentaField MDer(in._grid);
-    MomentaField X(in._grid);
-    X = zero;
-    M.Minv(in, X);  // X = G in
-    M.MDeriv(X, MDer);  // MDer = U * dS/dU
-    der = Implementation::projectForce(MDer);  // Ta if gauge fields
-  }
-
-  // Contribution of the auxiliary momenta: der = -projectForce( MDeriv(AuxMom) ).
-  void AuxiliaryFieldsDerivative(MomentaField& der){
-    der = zero;
-    // Two derivative terms exist in principle (MDeriv with separate left and
-    // right fields); only the single-field form is evaluated here.
-    M.MDeriv(AuxMom, der);
-    der = -1.0*Implementation::projectForce(der);
-  }
-
-  // Derivative with respect to the momenta: der = projectForce( M^-1 Mom ).
-  void DerivativeP(MomentaField& der){
-    der = zero;
-    M.Minv(Mom, der);
-    // is the projection necessary here?
-    // no for fields in the algebra
-    der = Implementation::projectForce(der); 
-  }
-
-  // AuxMom <- AuxMom - ep * AuxField
-  void update_auxiliary_momenta(RealD ep){
-    AuxMom -= ep * AuxField;
-  }
-
-  // AuxField <- AuxField + ep * M AuxMom   (M is applied once)
-  void update_auxiliary_fields(RealD ep){
-    MomentaField tmp(AuxMom._grid);
-    M.M(AuxMom, tmp);
-    AuxField += ep * tmp;
-    // factor of 2?
-  }
-
-};
-
-
-
-
-
-
-
-
-}
-}
-
-
-#endif //METRIC_H
@@ -1,96 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/qcd/utils/WilsonLoops.h
-
-    Copyright (C) 2015
-
-Author: neo <cossu@post.kek.jp>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef SCALAR_OBJS_H
-#define SCALAR_OBJS_H
-namespace Grid {
-
-  // FIXME drop the QCD namespace in Nd
-  
-
-// Scalar field obs
-// Static helpers that build simple observables of a scalar field.
-template <class Impl>
-class ScalarObs {
-  typedef typename Impl::Field FieldType;
-
- public:
-  // fsq = f * f
-  static void phisquared(typename Impl::Field &fsq,
-                         const typename Impl::Field &f) {
-    fsq = f * f;
-  }
-
-  // fsq = f * f * f * f  (phi^4 interaction term)
-  static void phifourth(typename Impl::Field &fsq,
-                        const typename Impl::Field &f) {
-    fsq = f * f * f * f;
-  }
-
-  // fsq = Sum_mu Cshift(f, mu, -1) * f, the product of the field with its
-  // shifted neighbour summed over all directions.
-  static void phider(typename Impl::Field &fsq,
-                     const typename Impl::Field &f) {
-    fsq = Cshift(f, 0, -1) * f;
-    for (int dir = 1; dir < QCD::Nd; ++dir) {
-      fsq += Cshift(f, dir, -1) * f;
-    }
-  }
-
-  // Minus the volume sum of the trace of phider.
-  static RealD sumphider(const typename Impl::Field &f) {
-    FieldType hop(f._grid);
-    phider(hop, f);
-    return -sum(trace(hop));
-  }
-
-  // Volume sum of the trace of phisquared.
-  static RealD sumphisquared(const typename Impl::Field &f) {
-    FieldType sq(f._grid);
-    phisquared(sq, f);
-    return sum(trace(sq));
-  }
-
-  // Volume sum of the trace of phifourth.
-  static RealD sumphifourth(const typename Impl::Field &f) {
-    FieldType quartic(f._grid);
-    phifourth(quartic, f);
-    return sum(trace(quartic));
-  }
-};
-
-
-}
-
-#endif
@@ -1,173 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/serialisation/JSON_IO.cc
-
-    Copyright (C) 2016
-
-    Author: Guido Cossu<guido.cossu@ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid/Grid.h>
-
-using namespace Grid;
-
-// Writer implementation ///////////////////////////////////////////////////////
-// Remember the output file name and seed the text buffer with the opening
-// brace of the top-level object; the buffer is opened in 'ate' mode.
-JSONWriter::JSONWriter(const std::string &fileName)
-  : fileName_(fileName)
-  , ss_("{ ", std::ostringstream::ate)
-{
-}
-
-JSONWriter::~JSONWriter(void)
-{
-  // Terminate the top-level object: drop the trailing separator, close the brace.
-  delete_comma();
-  ss_ << "}";
-
-  // Re-parse the accumulated text and write it, prettified with an indent
-  // of 2, to the file named at construction.
-  std::ofstream out(fileName_);
-  const std::string raw = ss_.str();
-  out << std::setw(2) << json::parse(raw) << std::endl;
-}
-
-// Open a nested object. A non-empty name s emits it as the member "s";
-// an empty name opens an anonymous object.
-void JSONWriter::push(const std::string &s)
-{
-  if (s.empty())
-  {
-    ss_ << " {";
-    return;
-  }
-  ss_ << " \""<<s<<"\" : {";
-}
-
-// Close the object opened by the matching push(): remove the separator left
-// by the last member, then emit the closing brace followed by a separator.
-void JSONWriter::pop(void)
-{
-  delete_comma();
-  ss_ << "},";
-}
-
-// Remove the separator left behind by the last write, if there is one.
-//
-// Every write in this class ends its output with ','. Only a trailing ','
-// is stripped: removing the last character unconditionally would eat the
-// opening '{' or '[' of a container that has no members and produce text
-// that cannot be parsed.
-void JSONWriter::delete_comma()
-{
-  std::string text = ss_.str();
-  if (!text.empty() && text.back() == ',')
-  {
-    text.pop_back();
-    ss_.str(text);
-  }
-}
-
-
-// here we are hitting a g++ bug (Bug 56480)
-// compiles fine with clang
-// have to wrap in the Grid namespace
-// annoying, but necessary for TravisCI
-namespace Grid
-{
-  // Write a string value, always enclosed in double quotes.
-  //   s : member name; when empty the bare value is written (array element)
-  //   x : the string to write
-  // The unnamed form used to be written without quotes, which is not valid
-  // JSON and disagreed with the char[N] overload.
-  // NOTE(review): x is not escaped; embedded '"' or '\' will still yield
-  // invalid JSON.
-  void JSONWriter::writeDefault(const std::string &s,	const std::string &x)
-  {
-    if (s.size())
-      ss_ << "\""<< s << "\" : \"" << x << "\" ," ;
-    else
-      ss_ << "\"" << x << "\" ," ;
-  }
-}// namespace Grid 
-
-
-// Reader implementation ///////////////////////////////////////////////////////
-// Parse the whole file into jobject_ and start reading at its root.
-JSONReader::JSONReader(const std::string &fileName)
-  : fileName_(fileName)
-{
-  std::ifstream input(fileName_);
-  input >> jobject_;
-
-  // the read cursor starts as a copy of the root object
-  jcur_ = jobject_;
-}
-
-// Descend into the member named s.
-// An empty name leaves the cursor where it is and only records that the
-// matching pop() has nothing to restore. Returns false when the lookup
-// throws std::out_of_range, true otherwise.
-// NOTE(review): on the failure path the entries already pushed onto jold_
-// and do_pop stay in place - confirm what callers do after a failed push.
-bool JSONReader::push(const std::string &s)
-{
-  if (s.empty())
-  {
-    do_pop.push_back(false);
-    return true;
-  }
-
-  jold_.push_back(jcur_);
-  do_pop.push_back(true);
-  try
-  {
-    jcur_ = jcur_[s];
-  }
-  catch (std::out_of_range& e)
-  {
-    std::cout << "out of range: " << e.what() << '\n';
-    return false;
-  }
-  return true;
-}
-
-// Undo the most recent push(): restore the saved cursor when that push
-// actually moved it, and discard its bookkeeping entry either way.
-void JSONReader::pop(void)
-{
-  const bool restore = do_pop.back();
-  do_pop.pop_back();
-  if (restore)
-  {
-    jcur_ = jold_.back();
-    jold_.pop_back();
-  }
-}
-
-// Advance to the next sibling element named s.
-//
-// Work in progress: JSON dictionaries do not support multiple members with
-// the same name, so same-name objects must be packed in arrays and sibling
-// iteration is not implemented. The function therefore always reports that
-// there is no next element.
-//
-// The previous body incremented and dereferenced it_ before returning
-// false, but it_ is never assigned anywhere in this class, so that touched
-// a default-constructed iterator. The cursor is now left untouched.
-bool JSONReader::nextElement(const std::string &s)
-{
-  (void)s; // unused until sibling iteration is implemented
-  return false;
-}
-
-// String specialisation: read member s of the current object, or the
-// current value itself when s is empty.
-template <>
-void JSONReader::readDefault(const std::string &s, std::string &output)
-{
-  if (s.empty())
-  {
-    output = jcur_;    
-    return;
-  }
-  output = jcur_[s];
-}
@@ -1,262 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/serialisation/JSON_IO.h
-
-    Copyright (C) 2015
-
-		Author: Guido Cossu<guido.cossu@ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef GRID_SERIALISATION_JSON_IO_H
-#define GRID_SERIALISATION_JSON_IO_H
-
-#include <iostream>
-#include <iomanip>
-#include <iostream>
-#include <fstream>
-#include <sstream>
-#include <vector>
-#include <cassert>
-
-#include <Grid/json/json.hpp>
-
-// for convenience
-using json = nlohmann::json;
-
-namespace Grid
-{
-
-  // Serialisation writer that accumulates JSON text in memory; the
-  // destructor parses that text and writes it, prettified, to fileName_.
-  class JSONWriter: public Writer<JSONWriter>
-  {
-
-  public:
-    JSONWriter(const std::string &fileName);
-    virtual ~JSONWriter(void);
-    // open / close a nested object (an empty name opens an anonymous one)
-    void push(const std::string &s);
-    void pop(void);
-    // writeDefault overloads: s is the member name; when s is empty only
-    // the value is written
-    template <typename U>
-    void writeDefault(const std::string &s, const U &x);
-    template <typename U>
-    void writeDefault(const std::string &s, const std::complex<U> &x);
-    template <typename U>
-    void writeDefault(const std::string &s, const std::vector<U> &x);
-    template <typename U, typename P>
-    void writeDefault(const std::string &s, const std::pair<U,P> &x);
-
-    // string literals
-    template<std::size_t N>
-    void writeDefault(const std::string &s, const char(&x)[N]);
-
-    void writeDefault(const std::string &s, const std::string &x);
-
-
-  private:
-    // strips the separator left by the previous write
-    void delete_comma();
-    std::string         fileName_; // output file, written by the destructor
-    std::ostringstream  ss_;       // JSON text accumulated so far
-  };
-
-  // Serialisation reader: the constructor parses the whole file, then
-  // push()/pop() move a cursor (jcur_) through the parsed document.
-  class JSONReader: public Reader<JSONReader>
-  {
-  public:
-    JSONReader(const std::string &fileName);
-    virtual ~JSONReader(void) = default;
-    // descend into member s / return to the previous cursor
-    bool push(const std::string &s);
-    void pop(void);
-    // work in progress, see the definition
-    bool nextElement(const std::string &s);
-    // readDefault overloads: s is the member name; when s is empty the
-    // current value itself is read
-    template <typename U>
-    void readDefault(const std::string &s, U &output);
-    template <typename U>
-    void readDefault(const std::string &s, std::complex<U> &output);
-    template <typename U>
-    void readDefault(const std::string &s, std::vector<U> &output);
-    template <typename U, typename P>
-    void readDefault(const std::string &s, std::pair<U,P> &output);
-  private:
-    json                jobject_; // main object (root of the parsed file)
-    json                jcur_;  // current json object (the read cursor)
-    std::vector<json>   jold_;  // cursors saved by push(), restored by pop()
-    std::string         fileName_;
-    // one flag per push(): true when that push saved a cursor on jold_
-    std::vector<bool>   do_pop;
-    // NOTE(review): no code in this class assigns the two iterators below
-    json::iterator      it_;
-    json::iterator      it_end_;
-  };
-
-  // Trait specialisation marking JSONReader as a reader type.
-  template <>
-  struct isReader< JSONReader > {
-    static const bool value = true;
-  };
-
-  // Trait specialisation marking JSONWriter as a writer type.
-  template <>
-  struct isWriter< JSONWriter > {
-    static const bool value = true;
-  };
-
-  // Writer template implementation ////////////////////////////////////////////
-  // Generic value: formatted with operator<< (bools as true/false) and
-  // written unquoted, preceded by "s" : when a name is given.
-  template <typename U>
-  void JSONWriter::writeDefault(const std::string &s, const U &x)
-  {
-    std::ostringstream value;
-    value << std::boolalpha << x;
-
-    if (!s.empty())
-    {
-      ss_ << "\""<< s << "\" : ";
-    }
-    ss_ << value.str() << " ," ;
-  }
-
-  // Complex value: written as the two-element array [real, imag].
-  template <typename U>
-  void JSONWriter::writeDefault(const std::string &s, const std::complex<U> &x)
-  {
-    std::ostringstream value;
-    value << "["<< std::boolalpha << x.real() << ", " << x.imag() << "]";
-
-    if (!s.empty())
-    {
-      ss_ << "\""<< s << "\" : ";
-    }
-    ss_ << value.str() << " ," ;
-  }
-
-  // Pair: written as a two-element array whose entries are both quoted.
-  template <typename U, typename P>
-  void JSONWriter::writeDefault(const std::string &s, const std::pair<U,P> &x)
-  {
-    std::ostringstream value;
-    value << "["<< std::boolalpha << "\""<< x.first << "\" , \"" << x.second << "\" ]";
-
-    if (!s.empty())
-    {
-      ss_ << "\""<< s << "\" : ";
-    }
-    ss_ << value.str() << " ," ;
-  }
-
-  // Vector: written as a JSON array, each element through write("", ...).
-  // The separator left by the last element is removed before the array is
-  // closed. Nothing is removed for an empty vector: no element has been
-  // written, so the last character is the opening '[' and deleting it
-  // would corrupt the output.
-  template <typename U>
-  void JSONWriter::writeDefault(const std::string &s, const std::vector<U> &x)
-  {
-    if (s.size())
-      ss_ << " \""<<s<<"\" : [";
-    else
-      ss_ << " [";
-    for (auto &x_i: x)
-    {
-      write("", x_i);
-    }
-    if (!x.empty())
-    {
-      delete_comma();
-    }
-    ss_<< "],";
-  }
-
-  // String literal: written quoted, with or without a member name.
-  template<std::size_t N>
-  void JSONWriter::writeDefault(const std::string &s, const char(&x)[N]){
-    if (s.empty())
-    {
-      ss_ << "\"" << x << "\" ," ;
-    }
-    else
-    {
-      ss_ << "\""<< s << "\" : \"" << x << "\" ," ;
-    }
-  }
-
-  // Reader template implementation ////////////////////////////////////////////
-  // Generic value: read member s of the current object, or the current
-  // value itself when s is empty, converting it to U by assignment.
-  template <typename U>
-  void JSONReader::readDefault(const std::string &s, U &output)
-  {
-    if (s.empty())
-    {
-      output = jcur_;
-      return;
-    }
-    output = jcur_[s];
-  }
-
-  // Reader template implementation ////////////////////////////////////////////
-  // Pair: read the first two elements of member s (or of the current value
-  // when s is empty) into output.first and output.second.
-  //
-  // The elements are read by temporarily pointing the cursor at them, so
-  // the cursor is saved first and restored afterwards. Without the restore
-  // every read after a pair would start from the pair's second element.
-  template <typename U, typename P>
-  void JSONReader::readDefault(const std::string &s, std::pair<U,P> &output)
-  {
-    U first;
-    P second;
-    json j;
-    if (s.size()){
-      j = jcur_[s];
-    } else {
-      j = jcur_;
-    }
-    const json saved = jcur_;
-
-    json::iterator it = j.begin();
-    jcur_ = *it;
-    read("", first);
-    it++;
-    jcur_ = *it;
-    read("", second);
-
-    jcur_ = saved;
-    output = std::pair<U,P>(first,second);
-  }
-
-
-
-  // Complex value: read the two-element array [real, imag] stored in member
-  // s, or in the current value when s is empty.
-  //
-  // Two defects are fixed relative to the previous version: the name s was
-  // ignored (the array was always taken from the current value), and the
-  // cursor was left on the imaginary part instead of being restored.
-  template <typename U>
-  void JSONReader::readDefault(const std::string &s, std::complex<U> &output)
-  {
-    U tmp1, tmp2;
-    json j;
-    if (s.size()){
-      j = jcur_[s];
-    } else {
-      j = jcur_;
-    }
-    const json saved = jcur_;
-
-    json::iterator it = j.begin();
-    jcur_ = *it;
-    read("", tmp1);
-    it++;
-    jcur_ = *it;
-    read("", tmp2);
-
-    jcur_ = saved;
-    output = std::complex<U>(tmp1,tmp2);
-  }
-
-
-  // Declaration of the std::string specialisation; no body is given here.
-  template <>
-  void JSONReader::readDefault(const std::string &s, std::string &output);
-
-  // Vector: read every element of the array stored in member s (or in the
-  // current value when s is empty) through read("", ...).
-  //
-  // output is sized to the array up front, so an empty array now yields an
-  // empty vector instead of leaving the caller's previous contents in place.
-  template <typename U>
-  void JSONReader::readDefault(const std::string &s, std::vector<U> &output)
-  {
-    if (s.size())
-      push(s);
-
-    json j = jcur_;
-    output.resize(j.size());
-
-    std::size_t i = 0;
-    for (json::iterator it = j.begin(); it != j.end(); ++it) {
-      // point the cursor at the element so that read("") picks it up
-      jcur_ = *it;
-      read("", output[i++]);
-    }
-
-    // put the cursor back on the array before undoing the push
-    jcur_ = j;
-    if (s.size())
-      pop();
-  }
-
-}
-#endif
@@ -1,439 +0,0 @@
-#ifndef GRID_SERIALISATION_VECTORUTILS_H
-#define GRID_SERIALISATION_VECTORUTILS_H
-
-#include <type_traits>
-#include <Grid/tensors/Tensors.h>
-
-namespace Grid {
-  // Pair IO utilities /////////////////////////////////////////////////////////
-  // helper function to parse input in the format "(obj1 obj2)"
-  //
-  // Skips input up to the next '(', then extracts the two objects from the
-  // text between that '(' and the following ')'. buf is left untouched when
-  // no complete "(...)" group is found. The stream ends up positioned just
-  // after the ')'.
-  template <typename T1, typename T2>
-  inline std::istream & operator>>(std::istream &is, std::pair<T1, T2> &buf)
-  {
-    T1 buf1;
-    T2 buf2;
-    // Initialised so that a stream which is already exhausted or failed
-    // cannot make the tests below read an indeterminate value.
-    char c = '\0';
-
-    // Search for "pair" delimiters. The loops stop on any read failure, not
-    // only on eof, so a stream in a failed state cannot spin forever.
-    while (is.get(c) && c != '(') {}
-    if (c == '(')
-    {
-      const std::streampos start = is.tellg();
-      while (is.get(c) && c != ')') {}
-      if (c == ')')
-      {
-        const std::streampos end   = is.tellg();
-        const std::streamoff psize = end - start - 1;
-
-        // Only read data between pair limiters.
-        is.seekg(start);
-        std::string tmpstr(static_cast<std::size_t>(psize), ' ');
-        is.read(&tmpstr[0], psize);
-        std::istringstream temp(tmpstr);
-        temp >> buf1 >> buf2;
-        buf = std::make_pair(buf1, buf2);
-        is.seekg(end);
-      }
-    }
-    // sets eofbit when nothing is left after the pair
-    is.peek();
-    return is;
-  }
-  
-  // output to streams for pairs
-  // Writes a pair as "(first second)".
-  template <class T1, class T2>
-  inline std::ostream & operator<<(std::ostream &os, const std::pair<T1, T2> &p)
-  {
-    os << "(";
-    os << p.first << " " << p.second;
-    os << ")";
-    return os;
-  }
-  
-  // Grid scalar tensors to nested std::vectors //////////////////////////////////
-  // Maps a Grid tensor type to the nested std::vector type that stores it:
-  // iScalar is transparent, iVector adds one vector level, iMatrix adds two,
-  // and any other type maps to itself.
-  template <typename T>
-  struct TensorToVec
-  {
-    using type = T;
-  };
-
-  template <typename T>
-  struct TensorToVec<iScalar<T>>
-  {
-    using type = typename TensorToVec<T>::type;
-  };
-
-  template <typename T, int N>
-  struct TensorToVec<iVector<T, N>>
-  {
-    using type = std::vector<typename TensorToVec<T>::type>;
-  };
-
-  template <typename T, int N>
-  struct TensorToVec<iMatrix<T, N>>
-  {
-    using type = std::vector<std::vector<typename TensorToVec<T>::type>>;
-  };
-
-  // Collects the index extents of a tensor type into dim, outermost level
-  // first: iScalar adds nothing, iVector adds N, iMatrix adds N twice. The
-  // recursion stops at the first non-tensor type. wipe controls whether dim
-  // is emptied first; the recursive calls pass false so extents accumulate.
-  template <typename T>
-  void tensorDim(std::vector<size_t> &dim, const T &t, const bool wipe = true)
-  {
-    if (wipe) dim.clear();
-  }
-
-  template <typename T>
-  void tensorDim(std::vector<size_t> &dim, const iScalar<T> &t, const bool wipe = true)
-  {
-    if (wipe) dim.clear();
-    tensorDim(dim, t._internal, false);
-  }
-
-  template <typename T, int N>
-  void tensorDim(std::vector<size_t> &dim, const iVector<T, N> &t, const bool wipe = true)
-  {
-    if (wipe) dim.clear();
-    dim.push_back(N);
-    tensorDim(dim, t._internal[0], false);
-  }
-
-  template <typename T, int N>
-  void tensorDim(std::vector<size_t> &dim, const iMatrix<T, N> &t, const bool wipe = true)
-  {
-    if (wipe) dim.clear();
-    // a matrix contributes one extent per index
-    dim.push_back(N);
-    dim.push_back(N);
-    tensorDim(dim, t._internal[0][0], false);
-  }
-
-  // Converts a tensor into the nested std::vector described by TensorToVec,
-  // recursing through the _internal members; non-tensor values are returned
-  // as they are.
-  template <typename T>
-  typename TensorToVec<T>::type tensorToVec(const T &t)
-  {
-    return t;
-  }
-
-  template <typename T>
-  typename TensorToVec<iScalar<T>>::type tensorToVec(const iScalar<T>& t)
-  {
-    return tensorToVec(t._internal);
-  }
-
-  template <typename T, int N>
-  typename TensorToVec<iVector<T, N>>::type tensorToVec(const iVector<T, N>& t)
-  {
-    typename TensorToVec<iVector<T, N>>::type out(N);
-
-    for (unsigned int k = 0; k < N; ++k)
-    {
-      out[k] = tensorToVec(t._internal[k]);
-    }
-    return out;
-  }
-
-  template <typename T, int N>
-  typename TensorToVec<iMatrix<T, N>>::type tensorToVec(const iMatrix<T, N>& t)
-  {
-    typename TensorToVec<iMatrix<T, N>>::type out(N);
-
-    for (unsigned int row = 0; row < N; ++row)
-    {
-      out[row].resize(N);
-      for (unsigned int col = 0; col < N; ++col)
-      {
-        out[row][col] = tensorToVec(t._internal[row][col]);
-      }
-    }
-    return out;
-  }
-
-  // Inverse of tensorToVec: copies a nested std::vector back into a tensor.
-  // The vector is indexed up to N at every level without a size check.
-  template <typename T>
-  void vecToTensor(T &t, const typename TensorToVec<T>::type &v)
-  {
-    t = v;
-  }
-
-  template <typename T>
-  void vecToTensor(iScalar<T> &t, const typename TensorToVec<iScalar<T>>::type &v)
-  {
-    vecToTensor(t._internal, v);
-  }
-
-  template <typename T, int N>
-  void vecToTensor(iVector<T, N> &t, const typename TensorToVec<iVector<T, N>>::type &v)
-  {
-    for (unsigned int k = 0; k < N; ++k)
-    {
-      vecToTensor(t._internal[k], v[k]);
-    }
-  }
-
-  template <typename T, int N>
-  void vecToTensor(iMatrix<T, N> &t, const typename TensorToVec<iMatrix<T, N>>::type &v)
-  {
-    for (unsigned int row = 0; row < N; ++row)
-    {
-      for (unsigned int col = 0; col < N; ++col)
-      {
-        vecToTensor(t._internal[row][col], v[row][col]);
-      }
-    }
-  }
-
-  // Vector element trait //////////////////////////////////////////////////////  
-  // element<V>::type is the innermost element type of a (possibly nested)
-  // std::vector; is_number is true when the type directly inside some
-  // vector level is arithmetic or complex.
-  template <typename T>
-  struct element
-  {
-    using type = T;
-    static constexpr bool is_number = false;
-  };
-  
-  template <typename T>
-  struct element<std::vector<T>>
-  {
-    using type = typename element<T>::type;
-    static constexpr bool is_number = std::is_arithmetic<T>::value
-                                      || is_complex<T>::value
-                                      || element<T>::is_number;
-  };
-  
-  // Vector flattening utility class ////////////////////////////////////////////
-  // Class to flatten a multidimensional std::vector
-  // The constructor walks the nested vector once to collect its elements
-  // and once to record the size at each nesting level.
-  template <typename V>
-  class Flatten
-  {
-  public:
-    typedef typename element<V>::type Element;
-  public:
-    explicit                     Flatten(const V &vector);
-    const V &                    getVector(void);
-    const std::vector<Element> & getFlatVector(void);
-    const std::vector<size_t>  & getDim(void);
-  private:
-    // append one element / recurse into a nested level
-    void accumulate(const Element &e);
-    template <typename W>
-    void accumulate(const W &v);
-    // record the size of each nesting level, following the first entry
-    void accumulateDim(const Element &e);
-    template <typename W>
-    void accumulateDim(const W &v);
-  private:
-    // held by reference: the source vector must outlive this object
-    const V              &vector_;
-    std::vector<Element> flatVector_;
-    std::vector<size_t>  dim_;
-  };
-  
-  // Class to reconstruct a multidimensional std::vector
-  // The constructor sizes the nested vector from dim and then fills it
-  // from flatVector in order.
-  template <typename V>
-  class Reconstruct
-  {
-  public:
-    typedef typename element<V>::type Element;
-  public:
-    Reconstruct(const std::vector<Element> &flatVector,
-                const std::vector<size_t> &dim);
-    const V &                    getVector(void);
-    const std::vector<Element> & getFlatVector(void);
-    const std::vector<size_t>  & getDim(void);
-  private:
-    // copy consecutive flat elements into the innermost vectors
-    void fill(std::vector<Element> &v);
-    template <typename W>
-    void fill(W &v);
-    // size nesting level 'dim' and, recursively, the levels below it
-    void resize(std::vector<Element> &v, const unsigned int dim);
-    template <typename W>
-    void resize(W &v, const unsigned int dim);
-  private:
-    V                          vector_;
-    // held by reference: the flat vector must outlive this object
-    const std::vector<Element> &flatVector_;
-    std::vector<size_t>        dim_;
-    size_t                     ind_{0};    // next flat element used by fill()
-    // NOTE(review): not used by any member function defined in this file
-    unsigned int               dimInd_{0};
-  };
-
-  // Flatten class template implementation
-
-  // Innermost level: append one element to the flat vector.
-  template <typename V>
-  void Flatten<V>::accumulate(const Element &e)
-  {
-    flatVector_.push_back(e);
-  }
-  
-  // Nested level: recurse into every entry, in order.
-  template <typename V>
-  template <typename W>
-  void Flatten<V>::accumulate(const W &v)
-  {
-    for (auto &e: v)
-    {
-      accumulate(e);
-    }
-  }
-  
-  // Innermost level: an element contributes no dimension.
-  template <typename V>
-  void Flatten<V>::accumulateDim(const Element &e) {}
-  
-  // Nested level: record this level's size, then descend through the first
-  // entry. An empty level has no first entry, so the descent stops there
-  // instead of reading v[0] out of bounds; dim_ then ends with a 0.
-  template <typename V>
-  template <typename W>
-  void Flatten<V>::accumulateDim(const W &v)
-  {
-    dim_.push_back(v.size());
-    if (!v.empty())
-    {
-      accumulateDim(v[0]);
-    }
-  }
-  
-  // Keeps a reference to the source vector and flattens it immediately.
-  template <typename V>
-  Flatten<V>::Flatten(const V &vector)
-  : vector_(vector)
-  {
-    accumulate(vector_);
-    accumulateDim(vector_);
-  }
-  
-  template <typename V>
-  const V & Flatten<V>::getVector(void)
-  {
-    return vector_;
-  }
-  
-  template <typename V>
-  const std::vector<typename Flatten<V>::Element> &
-  Flatten<V>::getFlatVector(void)
-  {
-    return flatVector_;
-  }
-  
-  template <typename V>
-  const std::vector<size_t> & Flatten<V>::getDim(void)
-  {
-    return dim_;
-  }
-  
-  // Reconstruct class template implementation
-
-  // Innermost level: take the next v.size() entries of the flat vector.
-  template <typename V>
-  void Reconstruct<V>::fill(std::vector<Element> &v)
-  {
-    for (size_t k = 0; k < v.size(); ++k)
-    {
-      v[k] = flatVector_[ind_++];
-    }
-  }
-  
-  // Nested level: fill every entry in order.
-  template <typename V>
-  template <typename W>
-  void Reconstruct<V>::fill(W &v)
-  {
-    for (auto &inner: v)
-    {
-      fill(inner);
-    }
-  }
-  
-  // Innermost level: size the vector from dim_[dim].
-  template <typename V>
-  void Reconstruct<V>::resize(std::vector<Element> &v, const unsigned int dim)
-  {
-    v.resize(dim_[dim]);
-  }
-  
-  // Nested level: size this level, then every entry with the next extent.
-  template <typename V>
-  template <typename W>
-  void Reconstruct<V>::resize(W &v, const unsigned int dim)
-  {
-    v.resize(dim_[dim]);
-    const unsigned int below = dim + 1;
-    for (auto &inner: v)
-    {
-      resize(inner, below);
-    }
-  }
-  
-  // Keeps a reference to the flat data, copies the extents, then builds
-  // the nested vector: shape first, contents second.
-  template <typename V>
-  Reconstruct<V>::Reconstruct(const std::vector<Element> &flatVector,
-                              const std::vector<size_t> &dim)
-  : flatVector_(flatVector)
-  , dim_(dim)
-  {
-    resize(vector_, 0);
-    fill(vector_);
-  }
-  
-  template <typename V>
-  const V & Reconstruct<V>::getVector(void)
-  {
-    return vector_;
-  }
-  
-  template <typename V>
-  const std::vector<typename Reconstruct<V>::Element> &
-  Reconstruct<V>::getFlatVector(void)
-  {
-    return flatVector_;
-  }
-  
-  template <typename V>
-  const std::vector<size_t> & Reconstruct<V>::getDim(void)
-  {
-    return dim_;
-  }
-
-  // Vector IO utilities ///////////////////////////////////////////////////////
-  // helper function to read whitespace-separated values
-  //
-  // Returns the values of type T that can be extracted from s, in order.
-  // Extraction stops at the end of the string or at the first token that
-  // does not parse as a T. The loop tests the extraction itself: testing
-  // eof() before extracting appended a spurious element for empty input or
-  // trailing whitespace, and never terminated on a malformed token.
-  template <typename T>
-  std::vector<T> strToVec(const std::string s)
-  {
-    std::istringstream sstr(s);
-    T                  buf;
-    std::vector<T>     v;
-    
-    while (sstr >> buf)
-    {
-      v.push_back(buf);
-    }
-    
-    return v;
-  }
-  
-  // output to streams for vectors: "[e0 e1 ... eN]", "[]" when empty
-  template < class T >
-  inline std::ostream & operator<<(std::ostream &os, const std::vector<T> &v)
-  {
-    os << "[";
-    if (!v.empty())
-    {
-      os << v[0];
-      for (unsigned int k = 1; k < v.size(); ++k)
-      {
-        os << " ";
-        os << v[k];
-      }
-    }
-    os << "]";
-    
-    return os;
-  }
-}
-
-// helper function to format a vector as a string, using Grid's stream
-// output operator for vectors
-template <typename T>
-std::string vecToStr(const std::vector<T> &v)
-{
-  using Grid::operator<<;
-  
-  std::ostringstream formatted;
-  formatted << v;
-
-  return formatted.str();
-}
-
-#endif
@@ -1,188 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/serialisation/XmlIO.cc
-
-    Copyright (C) 2015
-
-Author: Antonin Portelli <antonin.portelli@me.com>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid/GridCore.h>
-
-using namespace Grid;
-
-// Abort with a diagnostic on std::cerr when an XML parse did not succeed;
-// 'name' identifies what was being parsed. Returns normally on success.
-void Grid::xmlCheckParse(const pugi::xml_parse_result &result, const std::string name)
-{
-  if (result)
-  {
-    return;
-  }
-
-  std::cerr << "XML parsing error for " << name << std::endl;
-  std::cerr << "XML error description: " << result.description() << std::endl;
-  std::cerr << "XML error offset     : " << result.offset << std::endl;
-  abort();
-}
-
-// Writer implementation ///////////////////////////////////////////////////////
-// With a non-empty toplev, a root element of that name is created and
-// becomes the current node; otherwise the document itself is the current node.
-XmlWriter::XmlWriter(const std::string &fileName, std::string toplev) : fileName_(fileName)
-{
-  if ( !toplev.empty() ) {
-    node_=doc_.append_child();
-    node_.set_name(toplev.c_str());
-  } else {
-    node_=doc_;
-  }
-}
-
-// Save the document on destruction; a writer with an empty file name
-// writes nothing.
-XmlWriter::~XmlWriter(void)
-{
-  if ( fileName_.empty() ) {
-    return;
-  }
-  doc_.save_file(fileName_.c_str(), indent_.c_str());
-}
-
-// Append a child element named s and make it the current node.
-void XmlWriter::push(const std::string &s)
-{
-  const char *name = s.c_str();
-  node_ = node_.append_child(name);
-}
-
-// Parse the XML fragment s and append a copy of each of its top-level
-// nodes under the current node. The current node is left unchanged.
-//
-// The previous version moved the current node into every copy and popped
-// once at the end. That is equivalent for a fragment with a single root,
-// but it nested each further root inside the previous one and, for a
-// fragment without nodes, moved the current node up to its parent.
-// Aborts through xmlCheckParse when s is not well-formed.
-void XmlWriter::pushXmlString(const std::string &s)
-{
-  pugi::xml_document doc;
-  auto               result = doc.load_buffer(s.c_str(), s.size());
-
-  xmlCheckParse(result, "fragment\n'" + s +"'");
-  for (pugi::xml_node child = doc.first_child(); child; child = child.next_sibling())
-  {
-    node_.append_copy(child);
-  }
-}
-
-// Make the parent of the current node the current node.
-void XmlWriter::pop(void)
-{
-  pugi::xml_node up = node_.parent();
-  node_ = up;
-}
-
-// The whole document as text, saved with pugixml's default format flags.
-std::string XmlWriter::docString(void)
-{
-  std::ostringstream text;
-
-  doc_.save(text, indent_.c_str());
-  return text.str();
-}
-
-// The whole document as text, without the leading XML declaration.
-std::string XmlWriter::string(void)
-{
-  std::ostringstream text;
-
-  doc_.save(text, indent_.c_str(), pugi::format_default | pugi::format_no_declaration);
-  return text.str();
-}
-
-// Reader implementation ///////////////////////////////////////////////////////
-// Load XML from the file named s or, when isBuffer is set, from the text
-// s itself; parse failures abort through xmlCheckParse. The current node
-// is the child named toplev, or the document itself when toplev is empty.
-XmlReader::XmlReader(const std::string &s,  const bool isBuffer,
-                     std::string toplev)
-{
-  pugi::xml_parse_result result;
-
-  if (!isBuffer)
-  {
-    fileName_ = s;
-    result    = doc_.load_file(s.c_str());
-    xmlCheckParse(result, "file '" + fileName_ + "'");
-  }
-  else
-  {
-    fileName_ = "<string>";
-    result    = doc_.load_string(s.c_str());
-    xmlCheckParse(result, "string\n'" + s + "'");
-  }
-
-  if ( toplev.empty() ) {
-    node_ = doc_;
-  } else {
-    node_ = doc_.child(toplev.c_str());
-  }
-}
-
-// XML_SAFE_NODE(expr): when expr yields a non-null node, make it the current
-// node and return true from the enclosing function; otherwise return false
-// and leave the current node unchanged. Every path returns, so the macro
-// must be the last statement of its branch. Note that expr is evaluated
-// twice: once in the test and once in the assignment.
-#define XML_SAFE_NODE(expr)\
-if (expr)\
-{\
-  node_ = expr;\
-  return true;\
-}\
-else\
-{\
-  return false;\
-}
-
-// Descend into the child named s, or into the first child when s is
-// empty. Returns false, leaving the current node unchanged, when there is
-// no such child.
-bool XmlReader::push(const std::string &s)
-{
-  if (!s.empty())
-  {
-    XML_SAFE_NODE(node_.child(s.c_str()));
-  }
-  else
-  {
-    XML_SAFE_NODE(node_.first_child());
-  }
-}
-
-// Make the parent of the current node the current node.
-void XmlReader::pop(void)
-{
-  pugi::xml_node up = node_.parent();
-  node_ = up;
-}
-
-// Move to the next sibling named s, or to the next sibling of any name
-// when s is empty. Returns false, leaving the current node unchanged,
-// when there is none.
-bool XmlReader::nextElement(const std::string &s)
-{
-  if (!s.empty())
-  {
-    XML_SAFE_NODE(node_.next_sibling(s.c_str()));
-  }
-  else
-  {
-    XML_SAFE_NODE(node_.next_sibling());
-  }
-}
-
-// Store in s the text of the current node and everything below it,
-// without an XML declaration.
-void XmlReader::readCurrentSubtree(std::string &s)
-{
-  // copy the subtree into a scratch document so it can be saved on its own
-  pugi::xml_document subtree;
-  subtree.append_copy(node_);
-
-  std::ostringstream text;
-  subtree.save(text, indent_.c_str(), pugi::format_default | pugi::format_no_declaration);
-  s = text.str();
-}
-
-// String specialisation: output receives the value of the first child of
-// the child named s. When that child does not exist a warning is logged
-// and output is set to the empty string.
-template <>
-void XmlReader::readDefault(const std::string &s, std::string &output)
-{
-  pugi::xml_node found = node_.child(s.c_str());
-
-  if (!found)
-  {
-    std::cout << GridLogWarning << "XML: cannot open node '" << s << "'";
-    std::cout << std::endl;
-
-    output = ""; 
-    return;
-  }
-  output = found.first_child().value();
-}
@@ -1,599 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/simd/Grid_neon.h
-
-    Copyright (C) 2015
-
-    Author: Nils Meyer <nils.meyer@ur.de>
-    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-    Author: neo <cossu@post.kek.jp>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-
-/*
-
-  ARMv8 NEON intrinsics layer by
-
-  Nils Meyer <nils.meyer@ur.de>,
-  University of Regensburg, Germany
-  SFB/TRR55
-
-*/
-
-#ifndef GEN_SIMD_WIDTH
-#define GEN_SIMD_WIDTH 16u
-#endif
-
-#include "Grid_generic_types.h"
-#include <arm_neon.h>
-
-namespace Grid {
-namespace Optimization {
-
-  template<class vtype>
-  union uconv {
-    float32x4_t f;
-    vtype v;
-  };
-  union u128f {
-    float32x4_t v;
-    float f[4];
-  };
-  union u128d {
-    float64x2_t v;
-    double f[2];
-  };
-  // half precision
-  union u128h {
-    float16x8_t v;
-    uint16_t f[8];
-  };
-
-  struct Vsplat{
-    //Complex float
-    inline float32x4_t operator()(float a, float b){
-      float tmp[4]={a,b,a,b};
-      return vld1q_f32(tmp);
-    }
-    // Real float
-    inline float32x4_t operator()(float a){
-      return vdupq_n_f32(a);
-    }
-    //Complex double
-    inline float64x2_t operator()(double a, double b){
-      double tmp[2]={a,b};
-      return vld1q_f64(tmp);
-    }
-    //Real double
-    inline float64x2_t operator()(double a){
-      return vdupq_n_f64(a);
-    }
-    //Integer
-    inline uint32x4_t operator()(Integer a){
-      return vdupq_n_u32(a);
-    }
-  };
-
-  struct Vstore{
-    //Float
-    inline void operator()(float32x4_t a, float* F){
-      vst1q_f32(F, a);
-    }
-    //Double
-    inline void operator()(float64x2_t a, double* D){
-      vst1q_f64(D, a);
-    }
-    //Integer
-    inline void operator()(uint32x4_t a, Integer* I){
-      vst1q_u32(I, a);
-    }
-
-  };
-
-  struct Vstream{ // N:equivalents to _mm_stream_p* in NEON?
-    //Float // N:generic
-    inline void operator()(float * a, float32x4_t b){
-      memcpy(a,&b,4*sizeof(float));
-    }
-    //Double // N:generic
-    inline void operator()(double * a, float64x2_t b){
-      memcpy(a,&b,2*sizeof(double));
-    }
-
-
-  };
-
-  // Nils: Vset untested; not used currently in Grid at all;
-  // git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
-  struct Vset{
-    // Complex float
-    inline float32x4_t operator()(Grid::ComplexF *a){
-      float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
-      return vld1q_f32(tmp);
-    }
-    // Complex double
-    inline float64x2_t operator()(Grid::ComplexD *a){
-      double tmp[2]={a[0].imag(),a[0].real()};
-      return vld1q_f64(tmp);
-    }
-    // Real float
-    inline float32x4_t operator()(float *a){
-      float tmp[4]={a[3],a[2],a[1],a[0]};
-      return vld1q_f32(tmp);
-    }
-    // Real double
-    inline float64x2_t operator()(double *a){
-      double tmp[2]={a[1],a[0]};
-      return vld1q_f64(tmp);
-    }
-    // Integer
-    inline uint32x4_t operator()(Integer *a){
-      return vld1q_dup_u32(a);
-    }
-  };
-
-  template <typename Out_type, typename In_type>
-  struct Reduce{
-    //Need templated class to overload output type
-    //General form must generate error if compiled
-      inline Out_type operator()(In_type in){
-      printf("Error, using wrong Reduce function\n");
-      exit(1);
-      return 0;
-    }
-  };
-
-  /////////////////////////////////////////////////////
-  // Arithmetic operations
-  /////////////////////////////////////////////////////
-  struct Sum{
-    //Complex/Real float
-    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-      return vaddq_f32(a,b);
-    }
-    //Complex/Real double
-    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
-      return vaddq_f64(a,b);
-    }
-    //Integer
-    inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
-      return vaddq_u32(a,b);
-    }
-  };
-
-  struct Sub{
-    //Complex/Real float
-    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-      return vsubq_f32(a,b);
-    }
-    //Complex/Real double
-    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
-      return vsubq_f64(a,b);
-    }
-    //Integer
-    inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
-      return vsubq_u32(a,b);
-    }
-  };
-
-  struct MultRealPart{
-    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-      float32x4_t re = vtrn1q_f32(a, a);
-      return vmulq_f32(re, b);
-    }
-    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
-      float64x2_t re = vzip1q_f64(a, a);
-      return vmulq_f64(re, b);
-    }
-  };
-
-  struct MaddRealPart{
-    inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){
-      float32x4_t re = vtrn1q_f32(a, a);
-      return vfmaq_f32(c, re, b);
-    }
-    inline float64x2_t operator()(float64x2_t a, float64x2_t b, float64x2_t c){
-      float64x2_t re = vzip1q_f64(a, a);
-      return vfmaq_f64(c, re, b);
-    }
-  };
-
-  struct Div{
-    // Real float
-    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-      return vdivq_f32(a, b);
-    }
-    // Real double
-    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
-      return vdivq_f64(a, b);
-    }
-  };
-
-  struct MultComplex{
-    // Complex float
-    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-
-      float32x4_t r0, r1, r2, r3, r4;
-
-      // a = ar ai Ar Ai
-      // b = br bi Br Bi
-      // collect real/imag part, negate bi and Bi
-      r0 = vtrn1q_f32(b, b);       //  br  br  Br  Br
-      r1 = vnegq_f32(b);           // -br -bi -Br -Bi
-      r2 = vtrn2q_f32(b, r1);      //  bi -bi  Bi -Bi
-
-      // the fun part
-      r3 = vmulq_f32(r2, a);       //  bi*ar -bi*ai ...
-      r4 = vrev64q_f32(r3);        // -bi*ai  bi*ar ...
-
-      // fma(a,b,c) = a+b*c
-      return vfmaq_f32(r4, r0, a); //  ar*br-ai*bi ai*br+ar*bi ...
-
-      // no fma, use mul and add
-      // float32x4_t r5;
-      // r5 = vmulq_f32(r0, a);
-      // return vaddq_f32(r4, r5);
-    }
-    // Complex double
-    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
-
-      float64x2_t r0, r1, r2, r3, r4;
-
-      // b = br bi
-      // collect real/imag part, negate bi
-      r0 = vtrn1q_f64(b, b);       //  br  br
-      r1 = vnegq_f64(b);           // -br -bi
-      r2 = vtrn2q_f64(b, r1);      //  bi -bi
-
-      // the fun part
-      r3 = vmulq_f64(r2, a);       //  bi*ar -bi*ai
-      r4 = vextq_f64(r3,r3,1);     // -bi*ai  bi*ar
-
-      // fma(a,b,c) = a+b*c
-      return vfmaq_f64(r4, r0, a); //  ar*br-ai*bi ai*br+ar*bi
-
-      // no fma, use mul and add
-      // float64x2_t r5;
-      // r5 = vmulq_f64(r0, a);
-      // return vaddq_f64(r4, r5);
-    }
-  };
-
-  struct Mult{
-    // Real float
-    inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
-      //return vaddq_f32(vmulq_f32(b,c),a);
-      return vfmaq_f32(a, b, c);
-    }
-    inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){
-      //return vaddq_f64(vmulq_f64(b,c),a);
-      return vfmaq_f64(a, b, c);
-    }
-    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-      return vmulq_f32(a,b);
-    }
-    // Real double
-    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
-      return vmulq_f64(a,b);
-    }
-    // Integer
-    inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
-      return vmulq_u32(a,b);
-    }
-  };
-
-  struct Conj{
-    // Complex single
-    inline float32x4_t operator()(float32x4_t in){
-      // ar ai br bi -> ar -ai br -bi
-      float32x4_t r0, r1;
-      r0 = vnegq_f32(in);        // -ar -ai -br -bi
-      r1 = vrev64q_f32(r0);      // -ai -ar -bi -br
-      return vtrn1q_f32(in, r1); //  ar -ai  br -bi
-    }
-    // Complex double
-    inline float64x2_t operator()(float64x2_t in){
-
-      float64x2_t r0, r1;
-      r0 = vextq_f64(in, in, 1);    //  ai  ar
-      r1 = vnegq_f64(r0);           // -ai -ar
-      return vextq_f64(r0, r1, 1);  //  ar -ai
-    }
-    // do not define for integer input
-  };
-
-  struct TimesMinusI{
-    //Complex single
-    inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
-      // ar ai br bi -> ai -ar ai -br
-      float32x4_t r0, r1;
-      r0 = vnegq_f32(in);        // -ar -ai -br -bi
-      r1 = vrev64q_f32(in);      //  ai  ar  bi  br
-      return vtrn1q_f32(r1, r0); //  ar -ai  br -bi
-    }
-    //Complex double
-    inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
-      // a ib -> b -ia
-      float64x2_t tmp;
-      tmp = vnegq_f64(in);
-      return vextq_f64(in, tmp, 1);
-    }
-  };
-
-  struct TimesI{
-    //Complex single
-    inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
-      // ar ai br bi -> -ai ar -bi br
-      float32x4_t r0, r1;
-      r0 = vnegq_f32(in);        // -ar -ai -br -bi
-      r1 = vrev64q_f32(r0);      // -ai -ar -bi -br
-      return vtrn1q_f32(r1, in); // -ai  ar -bi  br
-    }
-    //Complex double
-    inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
-      // a ib -> -b ia
-      float64x2_t tmp;
-      tmp = vnegq_f64(in);
-      return vextq_f64(tmp, in, 1);
-    }
-  };
-
-  struct Permute{
-
-    static inline float32x4_t Permute0(float32x4_t in){ // N:ok
-      // AB CD -> CD AB
-      return vextq_f32(in, in, 2);
-    };
-    static inline float32x4_t Permute1(float32x4_t in){ // N:ok
-      // AB CD -> BA DC
-      return vrev64q_f32(in);
-    };
-    static inline float32x4_t Permute2(float32x4_t in){ // N:not used by Boyle
-      return in;
-    };
-    static inline float32x4_t Permute3(float32x4_t in){ // N:not used by Boyle
-      return in;
-    };
-
-    static inline float64x2_t Permute0(float64x2_t in){ // N:ok
-      // AB -> BA
-      return vextq_f64(in, in, 1);
-    };
-    static inline float64x2_t Permute1(float64x2_t in){ // N:not used by Boyle
-      return in;
-    };
-    static inline float64x2_t Permute2(float64x2_t in){ // N:not used by Boyle
-      return in;
-    };
-    static inline float64x2_t Permute3(float64x2_t in){ // N:not used by Boyle
-      return in;
-    };
-
-  };
-
-  struct Rotate{
-
-    static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok
-      switch(n){
-      case 0: // AB CD -> AB CD
-        return tRotate<0>(in);
-        break;
-      case 1: // AB CD -> BC DA
-        return tRotate<1>(in);
-        break;
-      case 2: // AB CD -> CD AB
-        return tRotate<2>(in);
-        break;
-      case 3: // AB CD -> DA BC
-        return tRotate<3>(in);
-        break;
-      default: assert(0);
-      }
-    }
-    static inline float64x2_t rotate(float64x2_t in,int n){ // N:ok
-      switch(n){
-      case 0: // AB -> AB
-        return tRotate<0>(in);
-        break;
-      case 1: // AB -> BA
-        return tRotate<1>(in);
-        break;
-      default: assert(0);
-      }
-    }
-
-    template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); };
-    template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); };
-
-  };
-
-  struct PrecisionChange {
-
-    static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) {
-      float16x4_t h = vcvt_f16_f32(a);
-      return vcvt_high_f16_f32(h, b);
-    }
-    static inline void  HtoS (float16x8_t h,float32x4_t &sa,float32x4_t &sb) {
-      sb = vcvt_high_f32_f16(h);
-      // there is no direct conversion from lower float32x4_t to float64x2_t
-      // vextq_f16 not supported by clang 3.8 / 4.0 / arm clang
-      // float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
-      // workaround for clang
-      uint32x4_t h1u = reinterpret_cast<uint32x4_t>(h);
-      float16x8_t h1 = reinterpret_cast<float16x8_t>(vextq_u32(h1u, h1u, 2));
-      sa = vcvt_high_f32_f16(h1);
-    }
-    static inline float32x4_t DtoS (float64x2_t a,float64x2_t b) {
-      float32x2_t s = vcvt_f32_f64(a);
-      return vcvt_high_f32_f64(s, b);
-
-    }
-    static inline void StoD (float32x4_t s,float64x2_t &a,float64x2_t &b) {
-      b = vcvt_high_f64_f32(s);
-      // there is no direct conversion from lower float32x4_t to float64x2_t
-      float32x4_t s1 = vextq_f32(s, s, 2);
-      a = vcvt_high_f64_f32(s1);
-
-    }
-    static inline float16x8_t DtoH (float64x2_t a,float64x2_t b,float64x2_t c,float64x2_t d) {
-      float32x4_t s1 = DtoS(a, b);
-      float32x4_t s2 = DtoS(c, d);
-      return StoH(s1, s2);
-    }
-    static inline void HtoD (float16x8_t h,float64x2_t &a,float64x2_t &b,float64x2_t &c,float64x2_t &d) {
-      float32x4_t s1, s2;
-      HtoS(h, s1, s2);
-      StoD(s1, a, b);
-      StoD(s2, c, d);
-    }
-  };
-
-  //////////////////////////////////////////////
-  // Exchange support
-
-  struct Exchange{
-    static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
-      // in1: ABCD -> out1: ABEF
-      // in2: EFGH -> out2: CDGH
-
-      // z: CDAB
-      float32x4_t z = vextq_f32(in1, in1, 2);
-      // out1: ABEF
-      out1 = vextq_f32(z, in2, 2);
-
-      // z: GHEF
-      z = vextq_f32(in2, in2, 2);
-      // out2: CDGH
-      out2 = vextq_f32(in1, z, 2);
-    };
-
-    static inline void Exchange1(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
-      // in1: ABCD -> out1: AECG
-      // in2: EFGH -> out2: BFDH
-      out1 = vtrn1q_f32(in1, in2);
-      out2 = vtrn2q_f32(in1, in2);
-    };
-    static inline void Exchange2(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
-      assert(0);
-      return;
-    };
-    static inline void Exchange3(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
-      assert(0);
-      return;
-    };
-    // double precision
-    static inline void Exchange0(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
-      // in1: AB -> out1: AC
-      // in2: CD -> out2: BD
-      out1 = vzip1q_f64(in1, in2);
-      out2 = vzip2q_f64(in1, in2);
-    };
-    static inline void Exchange1(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
-      assert(0);
-      return;
-    };
-    static inline void Exchange2(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
-      assert(0);
-      return;
-    };
-    static inline void Exchange3(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
-      assert(0);
-      return;
-    };
-  };
-
-  //////////////////////////////////////////////
-  // Some Template specialization
-
-
-  //Complex float Reduce
-  template<>
-  inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){
-    float32x4_t v1; // two complex
-    v1 = Optimization::Permute::Permute0(in);
-    v1 = vaddq_f32(v1,in);
-    u128f conv;    conv.v=v1;
-    return Grid::ComplexF(conv.f[0],conv.f[1]);
-  }
-  //Real float Reduce
-  template<>
-  inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
-    return vaddvq_f32(in);
-  }
-
-
-  //Complex double Reduce
-  template<>
-  inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
-    u128d conv; conv.v = in;
-    return Grid::ComplexD(conv.f[0],conv.f[1]);
-  }
-
-  //Real double Reduce
-  template<>
-  inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
-    return vaddvq_f64(in);
-  }
-
-  //Integer Reduce
-  template<>
-  inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
-    return vaddvq_u32(in);
-  }
-}
-
-//////////////////////////////////////////////////////////////////////////////////////
-// Here assign types
-
-// typedef Optimization::vech SIMD_Htype; // Reduced precision type
-  typedef float16x8_t  SIMD_Htype; // Half precision type
-  typedef float32x4_t  SIMD_Ftype; // Single precision type
-  typedef float64x2_t  SIMD_Dtype; // Double precision type
-  typedef uint32x4_t   SIMD_Itype; // Integer type
-
-  inline void v_prefetch0(int size, const char *ptr){};  // prefetch utilities
-  inline void prefetch_HINT_T0(const char *ptr){};
-
-
-  // Function name aliases
-  typedef Optimization::Vsplat   VsplatSIMD;
-  typedef Optimization::Vstore   VstoreSIMD;
-  typedef Optimization::Vset     VsetSIMD;
-  typedef Optimization::Vstream  VstreamSIMD;
-  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
-
-
-
-
-  // Arithmetic operations
-  typedef Optimization::Sum         SumSIMD;
-  typedef Optimization::Sub         SubSIMD;
-  typedef Optimization::Div         DivSIMD;
-  typedef Optimization::Mult        MultSIMD;
-  typedef Optimization::MultComplex MultComplexSIMD;
-  typedef Optimization::MultRealPart MultRealPartSIMD;
-  typedef Optimization::MaddRealPart MaddRealPartSIMD;
-  typedef Optimization::Conj        ConjSIMD;
-  typedef Optimization::TimesMinusI TimesMinusISIMD;
-  typedef Optimization::TimesI      TimesISIMD;
-
-}
-
@@ -1,2 +0,0 @@
-downloaded file from : http://www.sitmo.com/wp-content/uploads/2016/03/prng_engine.hpp
-Unmodified beyond filename
@@ -1,144 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/tensors/Tensor_exp.h
-
-    Copyright (C) 2015
-
-Author: neo <cossu@post.kek.jp>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef GRID_MATH_EXP_H
-#define GRID_MATH_EXP_H
-
-#define DEFAULT_MAT_EXP 12
-
-namespace Grid {
-
-  /////////////////////////////////////////////// 
-  // Exponentiate function for scalar, vector, matrix
-  /////////////////////////////////////////////// 
-
-
-  template<class vtype> inline iScalar<vtype> Exponentiate(const iScalar<vtype>&r, RealD alpha ,  Integer Nexp = DEFAULT_MAT_EXP)
-    {
-      iScalar<vtype> ret;
-      ret._internal = Exponentiate(r._internal, alpha, Nexp);
-      return ret;
-    }
-
-template<class vtype, int N> inline iVector<vtype, N> Exponentiate(const iVector<vtype,N>&r, RealD alpha ,  Integer Nexp = DEFAULT_MAT_EXP)
-    {
-      iVector<vtype, N> ret;
-      for (int i = 0; i < N; i++)
-        ret._internal[i] = Exponentiate(r._internal[i], alpha, Nexp);
-      return ret;
-    }
-
-
-
-    // Specialisation: Cayley-Hamilton exponential for SU(3)
-    template<class vtype, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0>::type * =nullptr> 
-    inline iMatrix<vtype,3> Exponentiate(const iMatrix<vtype,3> &arg, RealD alpha  , Integer Nexp = DEFAULT_MAT_EXP )
-    {
-    // for SU(3) 2x faster than the std implementation using Nexp=12
-    // notice that it actually computes
-    // exp ( input matrix )
-    // the i sign is coming from outside
-    // input matrix is anti-hermitian NOT hermitian
-      typedef iMatrix<vtype,3> mat;
-      typedef iScalar<vtype> scalar;
-      mat unit(1.0);
-      mat temp(unit);
-      const Complex one_over_three = 1.0 / 3.0;
-      const Complex one_over_two = 1.0 / 2.0;
-
-      scalar c0, c1, tmp, c0max, theta, u, w;
-      scalar xi0, u2, w2, cosw;
-      scalar fden, h0, h1, h2;
-      scalar e2iu, emiu, ixi0, qt;
-      scalar f0, f1, f2;
-      scalar unity(1.0);
-      
-      mat iQ2 = arg*arg*alpha*alpha;
-      mat iQ3 = arg*iQ2*alpha;   
-      // sign in c0 from the conventions on the Ta
-      scalar imQ3, reQ2;
-      imQ3 = imag( trace(iQ3) );
-      reQ2 = real( trace(iQ2) );
-      c0 = -imQ3 * one_over_three;  
-      c1 = -reQ2 * one_over_two;
-
-      // Cayley Hamilton checks to machine precision, tested
-      tmp = c1 * one_over_three;
-      c0max = 2.0 * pow(tmp, 1.5);
-
-      theta = acos(c0 / c0max) * one_over_three;
-      u = sqrt(tmp) * cos(theta);
-      w = sqrt(c1) * sin(theta);
-
-      xi0 = sin(w) / w;
-      u2 = u * u;
-      w2 = w * w;
-      cosw = cos(w);
-
-      ixi0 = timesI(xi0);
-      emiu = cos(u) - timesI(sin(u));
-      e2iu = cos(2.0 * u) + timesI(sin(2.0 * u));
-
-      h0 = e2iu * (u2 - w2) +
-           emiu * ((8.0 * u2 * cosw) + (2.0 * u * (3.0 * u2 + w2) * ixi0));
-      h1 = e2iu * (2.0 * u) - emiu * ((2.0 * u * cosw) - (3.0 * u2 - w2) * ixi0);
-      h2 = e2iu - emiu * (cosw + (3.0 * u) * ixi0);
-
-      fden = unity / (9.0 * u2 - w2);  // reals
-      f0 = h0 * fden;
-      f1 = h1 * fden;
-      f2 = h2 * fden;
-
-      return (f0 * unit + timesMinusI(f1) * arg*alpha - f2 * iQ2);
-    }
-
-
-
-// General exponential
-template<class vtype,int N, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0 >::type * =nullptr> 
-    inline iMatrix<vtype,N> Exponentiate(const iMatrix<vtype,N> &arg, RealD alpha  , Integer Nexp = DEFAULT_MAT_EXP )
-    {
-    // notice that it actually computes
-    // exp ( input matrix )
-    // the i sign is coming from outside
-    // input matrix is anti-hermitian NOT hermitian
-      typedef iMatrix<vtype,N> mat;
-      mat unit(1.0);
-      mat temp(unit);
-      for(int i=Nexp; i>=1;--i){
-	      temp *= alpha/RealD(i);
-	      temp = unit + temp*arg;
-      }
-      return temp;
-
-    }
-
-
-
-
-}
-#endif
--- a/Show More
+++ b/Show More