Reorganise a little to let the PV inverter be defined outside

the Reconstruct class. This lets the multiple choices for PV inversion be composed without changing the routine and without any if/else case enumeration. Implemented SchurDiagMooee PV inversion (red-black) and Unprec PV inversion. Red-black cuts from 190 iterations to 90 iterations at 10^-12 on the 8^4 test system. Will revisit multiple Schur options and add a Fourier-based multishift PV inverse, similar to the one Rudy Arthur did in BFM.
4d 5d reconstruction code & test
2026-06-17 09:23:43 +01:00 · 2018-10-10 13:22:01 +01:00 · 2018-10-09 18:37:20 +01:00 · 2018-10-09 17:41:56 +01:00 · 2018-10-05 11:29:40 +01:00 · 2018-10-04 18:57:41 +01:00
726 changed files with 97070 additions and 20614 deletions
@@ -83,6 +83,7 @@ ltmain.sh
 .Trashes
 ehthumbs.db
 Thumbs.db
 .dirstamp
 # build directory #
 ###################
@@ -92,28 +93,24 @@ build*/*
 #####################
 *.xcodeproj/*
 build.sh
 .vscode
 *.code-workspace
 # Eigen source #
 ################
-lib/Eigen/*
+Grid/Eigen
-
+Eigen/*
 # FFTW source #
 ################
 lib/fftw/*
 # libtool macros #
 ##################
 m4/lt*
 m4/libtool.m4
-# Buck files #
+# github pages #
-##############
+################
-.buck*
+gh-pages/
 buck-out
 BUCK
 make-bin-BUCK.sh
 # generated sources #
 #####################
-lib/qcd/spin/gamma-gen/*.h
+Grid/qcd/spin/gamma-gen/*.h
-lib/qcd/spin/gamma-gen/*.cc
+Grid/qcd/spin/gamma-gen/*.cc
@@ -9,62 +9,11 @@ matrix:
    - os:        osx
      osx_image: xcode8.3
      compiler: clang
-    - compiler: gcc
+      env: PREC=single
-      addons:
+    - os:        osx
-        apt:
+      osx_image: xcode8.3
-          sources:
+      compiler: clang
-            - ubuntu-toolchain-r-test
+      env: PREC=double
          packages:
            - g++-4.9
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - libopenmpi-dev
            - openmpi-bin
            - binutils-dev
      env: VERSION=-4.9
    - compiler: gcc
      addons:
        apt:
          sources:
            - ubuntu-toolchain-r-test
          packages:
            - g++-5
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - libopenmpi-dev
            - openmpi-bin
            - binutils-dev
      env: VERSION=-5
    - compiler: clang
      addons:
        apt:
          sources:
            - ubuntu-toolchain-r-test
          packages:
            - g++-4.8
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - libopenmpi-dev
            - openmpi-bin
            - binutils-dev
      env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
    - compiler: clang
      addons:
        apt:
          sources:
            - ubuntu-toolchain-r-test
          packages:
            - g++-4.8
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - libopenmpi-dev
            - openmpi-bin
            - binutils-dev
      env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
 before_install:
    - export GRIDDIR=`pwd`
@@ -72,32 +21,41 @@ before_install:
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
-    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
+    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc openssl; fi
 install:
    - export CWD=`pwd`
    - echo $CWD
    - export CC=$CC$VERSION
    - export CXX=$CXX$VERSION
    - echo $PATH
    - which autoconf
    - autoconf  --version
    - which automake
    - automake  --version
    - which $CC
    - $CC  --version
    - which $CXX
    - $CXX --version
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export EXTRACONF='--with-openssl=/usr/local/opt/openssl'; fi
 script:
    - ./bootstrap.sh
    - mkdir build
    - cd build
-    - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none
+    - mkdir lime
    - cd lime
    - mkdir build
    - cd build
    - wget http://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz
    - tar xf lime-1.3.2.tar.gz
    - cd lime-1.3.2
    - ./configure --prefix=$CWD/build/lime/install
    - make -j4
    - make install
    - cd $CWD/build
    - ../configure --enable-precision=$PREC --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
    - make -j4 
    - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
-    - echo make clean
+    - make check
    - ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none
    - make -j4
    - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
    - echo make clean
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; fi
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make -j4; fi
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
@@ -0,0 +1,37 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/DisableWarnings.h
 Copyright (C) 2016
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef DISABLE_WARNINGS_H
 #define DISABLE_WARNINGS_H
 //disables an Intel-compiler-specific warning (in json.hpp)
 #pragma warning disable 488  
 #endif
@@ -41,7 +41,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridCore.h>
 #include <Grid/GridQCDcore.h>
 #include <Grid/qcd/action/Action.h>
 #include <Grid/qcd/utils/GaugeFix.h>
 #include <Grid/qcd/smearing/Smearing.h>
 #include <Grid/parallelIO/MetaData.h>
 #include <Grid/qcd/hmc/HMC_aggregate.h>
 #endif
@@ -38,28 +38,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_BASE_H
 #define GRID_BASE_H
-///////////////////
+#include <Grid/GridStd.h>
 // Std C++ dependencies
 ///////////////////
 #include <cassert>
 #include <complex>
 #include <vector>
 #include <iostream>
 #include <iomanip>
 #include <random>
 #include <functional>
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <signal.h>
 #include <ctime>
 #include <sys/time.h>
 #include <chrono>
 ///////////////////
 // Grid headers
 ///////////////////
 #include "Config.h"
 #include <Grid/perfmon/Timer.h>
 #include <Grid/perfmon/PerfCount.h>
@@ -69,6 +48,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/serialisation/Serialisation.h>
 #include <Grid/threads/Threads.h>
 #include <Grid/util/Util.h>
 #include <Grid/util/Sha.h>
 #include <Grid/communicator/Communicator.h> 
 #include <Grid/cartesian/Cartesian.h>    
 #include <Grid/tensors/Tensors.h>      
@@ -0,0 +1,29 @@
 #ifndef GRID_STD_H
 #define GRID_STD_H
 ///////////////////
 // Std C++ dependencies
 ///////////////////
 #include <cassert>
 #include <complex>
 #include <vector>
 #include <string>
 #include <iostream>
 #include <iomanip>
 #include <random>
 #include <functional>
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <signal.h>
 #include <ctime>
 #include <sys/time.h>
 #include <chrono>
 #include <zlib.h>
 ///////////////////
 // Grid config
 ///////////////////
 #include "Config.h"
 #endif /* GRID_STD_H */
@@ -0,0 +1,14 @@
 #pragma once
 // Force Eigen to use MKL if Grid has been configured with --enable-mkl
 #ifdef USE_MKL
 #define EIGEN_USE_MKL_ALL
 #endif
 #if defined __GNUC__
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif
 #include <Grid/Eigen/Dense>
 #if defined __GNUC__
 #pragma GCC diagnostic pop
 #endif
@@ -0,0 +1,63 @@
 extra_sources=
 extra_headers=
 if BUILD_COMMS_MPI3
  extra_sources+=communicator/Communicator_mpi3.cc
  extra_sources+=communicator/Communicator_base.cc
  extra_sources+=communicator/SharedMemoryMPI.cc
  extra_sources+=communicator/SharedMemory.cc
 endif
 if BUILD_COMMS_NONE
  extra_sources+=communicator/Communicator_none.cc
  extra_sources+=communicator/Communicator_base.cc
  extra_sources+=communicator/SharedMemoryNone.cc
  extra_sources+=communicator/SharedMemory.cc
 endif
 if BUILD_HDF5
  extra_sources+=serialisation/Hdf5IO.cc 
  extra_headers+=serialisation/Hdf5IO.h
  extra_headers+=serialisation/Hdf5Type.h
 endif
 all: version-cache
 version-cache:
 	@if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\
 		a="uncommited changes";\
 	else\
 		a="clean";\
 	fi;\
 	echo "`git log -n 1 --format=format:"#define GITHASH \\"%H:%d $$a\\"%n" HEAD`" > vertmp;\
 	if [ -e version-cache ]; then\
 		d=`diff vertmp version-cache`;\
 		if [ "$${d}" != "" ]; then\
 			mv vertmp version-cache;\
 			rm -f Version.h;\
 		fi;\
 	else\
 		mv vertmp version-cache;\
 		rm -f Version.h;\
 	fi;\
 	rm -f vertmp
 Version.h:
 	cp version-cache Version.h
 .PHONY: version-cache
 #
 # Libraries
 #
 include Make.inc
 include Eigen.inc
 lib_LIBRARIES = libGrid.a
 CCFILES += $(extra_sources)
 HFILES  += $(extra_headers) Config.h Version.h
 libGrid_a_SOURCES              = $(CCFILES)
 libGrid_adir                   = $(includedir)/Grid
 nobase_dist_pkginclude_HEADERS = $(HFILES) $(eigen_files) $(eigen_unsupp_files)
@@ -1,6 +1,6 @@
    /*************************************************************************************
-    Grid physics library, www.github.com/paboyle/Grid 
+    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/Algorithms.h
@@ -37,37 +37,26 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/algorithms/approx/Chebyshev.h>
 #include <Grid/algorithms/approx/Remez.h>
 #include <Grid/algorithms/approx/MultiShiftFunction.h>
 #include <Grid/algorithms/approx/Forecast.h>
 #include <Grid/algorithms/iterative/Deflation.h>
 #include <Grid/algorithms/iterative/ConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateResidual.h>
 #include <Grid/algorithms/iterative/NormalEquations.h>
 #include <Grid/algorithms/iterative/SchurRedBlack.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
-
+#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
-// Lanczos support
+#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
 //#include <Grid/algorithms/iterative/MatrixUtils.h>
 #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
 #include <Grid/algorithms/CoarsenedMatrix.h>
 #include <Grid/algorithms/FFT.h>
-// Eigen/lanczos
+
 // EigCg
 // MCR
 // Pcg
 // Multishift CG
 // Hdcg
 // GCR
 // etc..
 // integrator/Leapfrog
 // integrator/Omelyan
 // integrator/ForceGradient
 // montecarlo/hmc
 // montecarlo/rhmc
 // montecarlo/metropolis
 // etc...
 #endif
@@ -103,29 +103,32 @@ namespace Grid {
    GridBase *CoarseGrid;
    GridBase *FineGrid;
    std::vector<Lattice<Fobj> > subspace;
    int checkerboard;
-    Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid) : 
+  Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) : 
-      CoarseGrid(_CoarseGrid),
+    CoarseGrid(_CoarseGrid),
      FineGrid(_FineGrid),
-      subspace(nbasis,_FineGrid)
+      subspace(nbasis,_FineGrid),
      checkerboard(_checkerboard)
 	{
 	};
    void Orthogonalise(void){
      CoarseScalar InnerProd(CoarseGrid); 
      std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
      blockOrthogonalise(InnerProd,subspace);
      std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
      blockOrthogonalise(InnerProd,subspace);
      //      std::cout << GridLogMessage <<" Gramm-Schmidt checking orthogonality"<<std::endl;
      //      CheckOrthogonal();
    } 
    void CheckOrthogonal(void){
      CoarseVector iProj(CoarseGrid); 
      CoarseVector eProj(CoarseGrid); 
      Lattice<CComplex> pokey(CoarseGrid);
      for(int i=0;i<nbasis;i++){
 	blockProject(iProj,subspace[i],subspace);
 	eProj=zero; 
-	for(int ss=0;ss<CoarseGrid->oSites();ss++){
+	parallel_for(int ss=0;ss<CoarseGrid->oSites();ss++){
 	  eProj._odata[ss](i)=CComplex(1.0);
 	}
 	eProj=eProj - iProj;
@@ -137,6 +140,7 @@ namespace Grid {
      blockProject(CoarseVec,FineVec,subspace);
    }
    // Expands a coarse vector into the fine field using the stored subspace.
    // The fine field first takes the checkerboard tag of the first basis vector.
    void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec)
    {
      const auto basisCheckerboard = subspace[0].checkerboard;
      FineVec.checkerboard = basisCheckerboard;
      blockPromote(CoarseVec,FineVec,subspace);
    }
    void CreateSubspaceRandom(GridParallelRNG &RNG){
@@ -147,6 +151,7 @@ namespace Grid {
      Orthogonalise();
    }
    /*
    virtual void CreateSubspaceLanczos(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) 
    {
      // Run a Lanczos with sloppy convergence
@@ -195,7 +200,7 @@ namespace Grid {
 	  std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
 	}
    }
-
+    */
    virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
      RealD scale;
@@ -230,6 +230,7 @@ namespace Grid {
      // Barrel shift and collect global pencil
      std::vector<int> lcoor(Nd), gcoor(Nd);
      result = source;
      int pc = processor_coor[dim];
      for(int p=0;p<processors[dim];p++) {
        PARALLEL_REGION
        {
@@ -240,7 +241,8 @@ namespace Grid {
          for(int idx=0;idx<sgrid->lSites();idx++) {
            sgrid->LocalIndexToLocalCoor(idx,cbuf);
            peekLocalSite(s,result,cbuf);
-            cbuf[dim]+=p*L;
+	    cbuf[dim]+=((pc+p) % processors[dim])*L;
 	    //            cbuf[dim]+=p*L;
            pokeLocalSite(s,pgbuf,cbuf);
          }
        }
@@ -278,7 +280,6 @@ namespace Grid {
      flops+= flops_call*NN;
      // writing out result
      int pc = processor_coor[dim];
      PARALLEL_REGION
      {
        std::vector<int> clbuf(Nd), cgbuf(Nd);
@@ -51,7 +51,7 @@ namespace Grid {
      virtual void Op     (const Field &in, Field &out) = 0; // Abstract base
      virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base
-      virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
+      virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) = 0;
      virtual void HermOp(const Field &in, Field &out)=0;
    };
@@ -162,15 +162,10 @@ namespace Grid {
 	_Mat.M(in,out);
      }
      void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
 	ComplexD dot;
 	_Mat.M(in,out);
-	dot= innerProduct(in,out);
+	ComplexD dot= innerProduct(in,out); n1=real(dot);
-	n1=real(dot);
+	n2=norm2(out);
 	dot = innerProduct(out,out);
 	n2=real(dot);
      }
      void HermOp(const Field &in, Field &out){
 	_Mat.M(in,out);
@@ -188,14 +183,16 @@ namespace Grid {
      virtual  RealD Mpc      (const Field &in, Field &out) =0;
      virtual  RealD MpcDag   (const Field &in, Field &out) =0;
      virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
-	Field tmp(in._grid);
+      Field tmp(in._grid);
      tmp.checkerboard = in.checkerboard;
 	ni=Mpc(in,tmp);
 	no=MpcDag(tmp,out);
      }
-      void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+      virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
      out.checkerboard = in.checkerboard;
 	MpcDagMpc(in,out,n1,n2);
      }
-      void HermOp(const Field &in, Field &out){
+      virtual void HermOp(const Field &in, Field &out){
 	RealD n1,n2;
 	HermOpAndNorm(in,out,n1,n2);
      }
@@ -212,7 +209,6 @@ namespace Grid {
      // Directional operator application is not provided here: the body is an
      // unconditional assert(0) and none of the arguments are read or written.
      // With NDEBUG defined the assert compiles away and the call does nothing.
      void OpDir  (const Field &in, Field &out,int dir,int disp) {
 	assert(0);
      }
    };
    template<class Matrix,class Field>
      class SchurDiagMooeeOperator :  public SchurOperatorBase<Field> {
@@ -221,13 +217,15 @@ namespace Grid {
    public:
      SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
      virtual  RealD Mpc      (const Field &in, Field &out) {
-	Field tmp(in._grid);
+      Field tmp(in._grid);
-//	std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << "  _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
+      tmp.checkerboard = !in.checkerboard;
 	//std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << "  _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
 	_Mat.Meooe(in,tmp);
 	_Mat.MooeeInv(tmp,out);
 	_Mat.Meooe(out,tmp);
      //std::cout << "cb in " << in.checkerboard << "  cb out " << out.checkerboard << std::endl;
 	_Mat.Mooee(in,out);
 	return axpy_norm(out,-1.0,tmp,out);
      }
@@ -235,7 +233,7 @@ namespace Grid {
 	Field tmp(in._grid);
 	_Mat.MeooeDag(in,tmp);
-	_Mat.MooeeInvDag(tmp,out);
+        _Mat.MooeeInvDag(tmp,out);
 	_Mat.MeooeDag(out,tmp);
 	_Mat.MooeeDag(in,out);
@@ -270,7 +268,6 @@ namespace Grid {
 	return axpy_norm(out,-1.0,tmp,in);
      }
    };
    template<class Matrix,class Field>
      class SchurDiagTwoOperator :  public SchurOperatorBase<Field> {
    protected:
@@ -299,6 +296,82 @@ namespace Grid {
 	return axpy_norm(out,-1.0,tmp,in);
      }
    };
    ///////////////////////////////////////////////////////////////////////////////////////////////////
    // Left  handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta  -->  ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta
    // Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta  -->  ( 1 - Moe Mee^-1 Meo ) Moo^-1 phi=eta ; psi = Moo^-1 phi
    ///////////////////////////////////////////////////////////////////////////////////////////////////
    template<class Matrix,class Field> using SchurDiagOneRH = SchurDiagTwoOperator<Matrix,Field> ;
    template<class Matrix,class Field> using SchurDiagOneLH = SchurDiagOneOperator<Matrix,Field> ;
    ///////////////////////////////////////////////////////////////////////////////////////////////////
    //  Staggered use
    ///////////////////////////////////////////////////////////////////////////////////////////////////
    // Schur operator for staggered use.  Both HermOp and Mpc evaluate
    //   out = mass*mass * in - Meooe(Meooe(in))
    // and accumulate usecond() timings that Report() prints per call.
    template<class Matrix,class Field>
      class SchurStaggeredOperator :  public SchurOperatorBase<Field> {
    protected:
      Matrix &_Mat;
      Field tmp;            // scratch field on the matrix's red-black grid
      RealD mass;           // cached _Mat.Mass()
      double tMpc;          // time spent in Mpc (HermOpAndNorm) / the hops (HermOp)
      double tIP;           // time spent in the inner product of HermOpAndNorm
      double tMeo;          // time spent in the two Meooe calls inside Mpc
      double taxpby_norm;   // time spent in axpby / axpby_norm
      uint64_t ncall;       // incremented by HermOpAndNorm and by HermOp

      // Two successive Meooe applications: in -> out -> tmp.
      void DoubleHop(const Field &in, Field &out) {
        _Mat.Meooe(in,out);
        _Mat.Meooe(out,tmp);
      }
    public:
      // Log every accumulated timer divided by the number of recorded calls.
      void Report(void)
      {
        const double perCallMpc   = tMpc/ncall;
        const double perCallIP    = tIP /ncall;
        const double perCallMeo   = tMeo/ncall;
        const double perCallAxpby = taxpby_norm/ncall;
        std::cout << GridLogMessage << " HermOpAndNorm.Mpc "<< perCallMpc<<" usec "<<std::endl;
        std::cout << GridLogMessage << " HermOpAndNorm.IP  "<< perCallIP<<" usec "<<std::endl;
        std::cout << GridLogMessage << " Mpc.MeoMoe        "<< perCallMeo<<" usec "<<std::endl;
        std::cout << GridLogMessage << " Mpc.axpby_norm    "<< perCallAxpby<<" usec "<<std::endl;
      }
      // Requires a matrix whose isTrivialEE() holds; the mass is read after that check.
      SchurStaggeredOperator (Matrix &Mat)
        : _Mat(Mat), tmp(_Mat.RedBlackGrid()),
          tMpc(0), tIP(0), tMeo(0), taxpby_norm(0), ncall(0)
      {
        assert( _Mat.isTrivialEE() );
        mass = _Mat.Mass();
      }
      // n2 receives the value returned by Mpc, n1 the real part of <in,out>.
      virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
        ncall++;

        tMpc-=usecond();
        n2 = Mpc(in,out);
        tMpc+=usecond();

        tIP-=usecond();
        ComplexD inDotOut= innerProduct(in,out);
        tIP+=usecond();

        n1 = real(inDotOut);
      }
      // Same operator as Mpc but without computing a norm.
      virtual void HermOp(const Field &in, Field &out){
        ncall++;

        tMpc-=usecond();
        DoubleHop(in,out);
        tMpc+=usecond();

        taxpby_norm-=usecond();
        axpby(out,-1.0,mass*mass,tmp,in);
        taxpby_norm+=usecond();
      }
      // Applies the operator and returns the value produced by axpby_norm.
      virtual  RealD Mpc      (const Field &in, Field &out) {
        tMeo-=usecond();
        DoubleHop(in,out);
        tMeo+=usecond();

        taxpby_norm-=usecond();
        RealD normOut=axpby_norm(out,-1.0,mass*mass,tmp,in);
        taxpby_norm+=usecond();

        return normOut;
      }
      // The adjoint simply forwards to Mpc.
      virtual  RealD MpcDag   (const Field &in, Field &out){
        return Mpc(in,out);
      }
      virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
        assert(0);// Never need with staggered
      }
    };
    template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>;
    /////////////////////////////////////////////////////////////
@@ -314,6 +387,14 @@ namespace Grid {
      virtual void operator() (const Field &in, Field &out) = 0;
    };
    // LinearFunction that copies its input to its output unchanged.
    template<class Field>
      class IdentityLinearFunction : public LinearFunction<Field> {
    public:
      void operator() (const Field &in, Field &out)
      {
        out = in;
      }
    };
    /////////////////////////////////////////////////////////////
    // Base classes for Multishift solvers for operators
    /////////////////////////////////////////////////////////////
@@ -336,6 +417,64 @@ namespace Grid {
     };
    */
  ////////////////////////////////////////////////////////////////////////////////////////////
  // Hermitian operator Linear function and operator function
  ////////////////////////////////////////////////////////////////////////////////////////////
    // OperatorFunction that applies the operator's HermOp exactly once:
    // out is whatever Linop.HermOp(in,out) writes.
    template<class Field>
      class HermOpOperatorFunction : public OperatorFunction<Field> {
    public:
      // Declared public so the functor can be invoked directly on a
      // HermOpOperatorFunction object, like the other functors in this header
      // (PlainHermOp, FunctionHermOp, Polynomial).
      void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
        Linop.HermOp(in,out);
      };
    };
    // Wraps a LinearOperatorBase as a LinearFunction whose call is one HermOp.
    // Only a reference to the operator is held.
    template<typename Field>
      class PlainHermOp : public LinearFunction<Field> {
    public:
      LinearOperatorBase<Field> &_Linop;

      PlainHermOp(LinearOperatorBase<Field>& op) : _Linop(op) {}

      void operator()(const Field& in, Field& out)
      {
        _Linop.HermOp(in,out);
      }
    };
    // Binds an OperatorFunction to a specific linear operator so the pair can
    // be called as a plain LinearFunction.  Both are held by reference.
    template<typename Field>
    class FunctionHermOp : public LinearFunction<Field> {
    public:
      OperatorFunction<Field>   & _poly;
      LinearOperatorBase<Field> &_Linop;

      FunctionHermOp(OperatorFunction<Field> & fn,LinearOperatorBase<Field>& op)
        : _poly(fn), _Linop(op)
      {}

      // Forwards to _poly(_Linop,in,out).
      void operator()(const Field& in, Field& out)
      {
        _poly(_Linop,in,out);
      }
    };
  // Operator polynomial with user-supplied coefficients:
  //   out = sum_n Coeffs[n] * A^n in
  // where each power of A is obtained by one further call to Linop.HermOp.
  template<class Field>
  class Polynomial : public OperatorFunction<Field> {
  private:
    std::vector<RealD> Coeffs; // Coeffs[n] multiplies the n-th power of the operator
  public:
    // Copies the coefficients.  Taking a const reference also accepts const
    // vectors and temporaries; existing callers passing an lvalue are unaffected.
    Polynomial(const std::vector<RealD> &_Coeffs) : Coeffs(_Coeffs) { };
    // Implement the required interface
    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
      // Coeffs[0] is read unconditionally below, so an empty polynomial is a usage error.
      assert(!Coeffs.empty());
      Field AtoN(in._grid); // running power A^n applied to in
      Field Mtmp(in._grid); // copy of the previous power, so HermOp input and output differ
      AtoN = in;
      out = AtoN*Coeffs[0];
      for(size_t n=1;n<Coeffs.size();n++){
        Mtmp = AtoN;
        Linop.HermOp(Mtmp,AtoN);
        out=out+AtoN*Coeffs[n];
      }
    };
  };
 }
@@ -8,6 +8,7 @@
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Christoph Lehner <clehner@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -33,41 +34,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 namespace Grid {
-  ////////////////////////////////////////////////////////////////////////////////////////////
+struct ChebyParams : Serializable {
-  // Simple general polynomial with user supplied coefficients
+  GRID_SERIALIZABLE_CLASS_MEMBERS(ChebyParams,
-  ////////////////////////////////////////////////////////////////////////////////////////////
+				  RealD, alpha,  
-  template<class Field>
+				  RealD, beta,   
-  class HermOpOperatorFunction : public OperatorFunction<Field> {
+				  int, Npoly);
-    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
+};
      Linop.HermOp(in,out);
    };
  };
  // Operator polynomial with user-supplied coefficients:
  //   out = sum_n Coeffs[n] * A^n in
  // where each power of A is obtained by one further call to Linop.HermOp.
  template<class Field>
  class Polynomial : public OperatorFunction<Field> {
  private:
    std::vector<RealD> Coeffs; // Coeffs[n] multiplies the n-th power of the operator
  public:
    // Stores a copy of the supplied coefficient vector.
    Polynomial(std::vector<RealD> &_Coeffs) : Coeffs(_Coeffs) { };
    // Implement the required interface
    // Coeffs[0] is read unconditionally, so the coefficient vector must not be empty.
    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
      Field AtoN(in._grid); // running power A^n applied to in
      Field Mtmp(in._grid); // copy of the previous power, used as the HermOp input
      AtoN = in;
      out = AtoN*Coeffs[0];
 //            std::cout <<"Poly in " <<norm2(in)<<" size "<< Coeffs.size()<<std::endl;
 //            std::cout <<"Coeffs[0]= "<<Coeffs[0]<< " 0 " <<norm2(out)<<std::endl;
      for(int n=1;n<Coeffs.size();n++){
 	Mtmp = AtoN;
 	Linop.HermOp(Mtmp,AtoN);
 	out=out+AtoN*Coeffs[n];
 //            std::cout <<"Coeffs "<<n<<"= "<< Coeffs[n]<< " 0 " <<std::endl;
 //		std::cout << n<<" " <<norm2(out)<<std::endl;
      }
    };
  };
  ////////////////////////////////////////////////////////////////////////////////////////////
  // Generic Chebyshev approximations
@@ -82,8 +54,10 @@ namespace Grid {
  public:
    void csv(std::ostream &out){
-	RealD diff = hi-lo;
+      RealD diff = hi-lo;
-      for (RealD x=lo-0.2*diff; x<hi+0.2*diff; x+=(hi-lo)/1000) {
+      RealD delta = (hi-lo)*1.0e-9;
      for (RealD x=lo; x<hi; x+=delta) {
 	delta*=1.1;
 	RealD f = approx(x);
 	out<< x<<" "<<f<<std::endl;
      }
@@ -99,6 +73,7 @@ namespace Grid {
    };
    Chebyshev(){};
    Chebyshev(ChebyParams p){ Init(p.alpha,p.beta,p.Npoly);};
    Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);};
    Chebyshev(RealD _lo,RealD _hi,int _order) {Init(_lo,_hi,_order);};
@@ -193,12 +168,54 @@ namespace Grid {
      return sum;
    };
    // Evaluates  sum_{n>=1} n * Coeffs[n] * U_{n-1}(y) / (0.5*(hi-lo))
    // with y = (x - 0.5*(hi+lo)) / (0.5*(hi-lo)), where U_k follows the
    // recurrence U_{k+1} = 2 y U_k - U_{k-1}, U_0 = 1, U_1 = 2y.
    // Coeffs[1] and Coeffs[2] are read unconditionally.
    RealD approxD(RealD x)
    {
      const RealD halfWidth = 0.5*(hi-lo);
      const RealD y = ( x-0.5*(hi+lo))/halfWidth;

      RealD Uprev = 1;    // U_0
      RealD Ucurr = 2*y;  // U_1

      RealD deriv = Coeffs[1]*Uprev;
      deriv += Coeffs[2]*Ucurr*2.0;

      for(int n=2;n<order-1;n++){
        const RealD Unext = 2*y*Ucurr-Uprev;
        Uprev = Ucurr;
        Ucurr = Unext;
        deriv += Ucurr*Coeffs[n+1]*(n+1.0);
      }
      return deriv/halfWidth;
    };
    // Newton iteration for approx(x) == z, started from x0.
    // Returns the current x as soon as |(approx(x)-z)/z| < resid; if that never
    // happens within maxiter steps the result is a quiet NaN.
    RealD approxInv(RealD z, RealD x0, int maxiter, RealD resid) {
      RealD guess = x0;
      for (int step=0; step<maxiter; ++step) {
        const RealD miss = approx(guess) - z;
        if (fabs(miss / z) < resid) {
          return guess;
        }
        guess = guess - miss / approxD(guess);
      }
      return std::numeric_limits<double>::quiet_NaN();
    }
    // Implement the required interface
    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
      GridBase *grid=in._grid;
-//std::cout << "Chevyshef(): in._grid="<<in._grid<<std::endl;
+
-//<<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
+      // std::cout << "Chevyshef(): in._grid="<<in._grid<<std::endl;
      //std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
      int vol=grid->gSites();
@@ -0,0 +1,152 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/algorithms/approx/Forecast.h
 Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: David Murphy <dmurphy@phys.columbia.edu>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef INCLUDED_FORECAST_H
 #define INCLUDED_FORECAST_H
 namespace Grid {
  // Abstract base class.
  // Takes a matrix (Mat), a source (phi), and a vector of Fields (chi)
  // and returns a forecasted solution to the system D*psi = phi (psi).
  // Interface for solution forecasters: given a matrix, a source phi and a
  // list of fields chi, an implementation returns a Field by value.
  template<class Matrix, class Field>
  class Forecast
  {
    public:
      // Virtual destructor so that an implementation owned through a
      // Forecast pointer is destroyed correctly.
      virtual ~Forecast() = default;
      virtual Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& chi) = 0;
  };
  // Implementation of Brower et al.'s chronological inverter (arXiv:hep-lat/9509012),
  // used to forecast solutions across poles of the EOFA heatbath.
  //
  // Modified from CPS (cps_pp/src/util/dirac_op/d_op_base/comsrc/minresext.C)
  // Forecasts a solution as a linear combination of previous solutions.
  // The previous solutions are orthonormalised into v[], the small system
  //   G a = b,  G[j][k] = <v_j, Mdag M v_k>,  b[i] = <v_i, phi>
  // is solved by elimination, and chi = sum_i a[i] v[i] is returned.
  template<class Matrix, class Field>
  class ChronoForecast : public Forecast<Matrix,Field>
  {
    public:
      // Mat        : must provide M(in,out) and Mdag(in,out)
      // phi        : source; also used as the template for all temporaries
      // prev_solns : earlier solutions; empty -> zero field, one entry -> that entry
      // Logs sqrt(|phi - sum_i a[i] Mdag M v[i]|^2 / |phi|^2) before returning.
      Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& prev_solns)
      {
        int degree = prev_solns.size();
        Field chi(phi); // forecasted solution
        // Trivial cases
        if(degree == 0){ chi = zero; return chi; }
        else if(degree == 1){ return prev_solns[0]; }
        RealD dot; // NOTE(review): never used in this function
        ComplexD xp; // scratch for row swaps and elimination factors
        Field r(phi); // residual
        Field Mv(phi); // holds M v[i] before Mdag is applied
        std::vector<Field> v(prev_solns); // orthonormalized previous solutions
        std::vector<Field> MdagMv(degree,phi); // MdagMv[i] = Mdag M v[i]
        // Array to hold the matrix elements
        std::vector<std::vector<ComplexD>> G(degree, std::vector<ComplexD>(degree));
        // Solution and source vectors
        std::vector<ComplexD> a(degree);
        std::vector<ComplexD> b(degree);
        // Orthonormalize the vector basis
        // (normalise v[i], then remove its component from every later v[j];
        //  a v[i] of zero norm would make the scale factor non-finite)
        for(int i=0; i<degree; i++){
          v[i] *= 1.0/std::sqrt(norm2(v[i]));
          for(int j=i+1; j<degree; j++){ v[j] -= innerProduct(v[i],v[j]) * v[i]; }
        }
        // Perform sparse matrix multiplication and construct rhs
        for(int i=0; i<degree; i++){
          b[i] = innerProduct(v[i],phi);
          Mat.M(v[i],Mv);
          Mat.Mdag(Mv,MdagMv[i]);
          G[i][i] = innerProduct(v[i],MdagMv[i]);
        }
        // Construct the matrix
        // (upper triangle computed, lower triangle filled by conjugation)
        for(int j=0; j<degree; j++){
        for(int k=j+1; k<degree; k++){
          G[j][k] = innerProduct(v[j],MdagMv[k]);
          G[k][j] = std::conj(G[j][k]);
        }}
        // Gauss-Jordan elimination with partial pivoting
        for(int i=0; i<degree; i++){
          // Perform partial pivoting
          // NOTE(review): the pivot row is chosen by comparing diagonal entries
          // G[j][j], but only rows (not columns) are swapped, so the new G[i][i]
          // is the old G[k][i]. Conventional partial pivoting compares
          // |G[j][i]| -- confirm this is intended.
          int k = i;
          for(int j=i+1; j<degree; j++){ if(std::abs(G[j][j]) > std::abs(G[k][k])){ k = j; } }
          if(k != i){
            xp = b[k];
            b[k] = b[i];
            b[i] = xp;
            for(int j=0; j<degree; j++){
              xp = G[k][j];
              G[k][j] = G[i][j];
              G[i][j] = xp;
            }
          }
          // Convert matrix to upper triangular form
          // (the inner loop index k shadows the pivot index k declared above)
          for(int j=i+1; j<degree; j++){
            xp = G[j][i]/G[i][i];
            b[j] -= xp * b[i];
            for(int k=0; k<degree; k++){ G[j][k] -= xp*G[i][k]; }
          }
        }
        // Use Gaussian elimination to solve equations and calculate initial guess
        // (back substitution from the last row; chi and r are accumulated as each a[i] is found)
        chi = zero;
        r = phi;
        for(int i=degree-1; i>=0; i--){
          a[i] = 0.0;
          for(int j=i+1; j<degree; j++){ a[i] += G[i][j] * a[j]; }
          a[i] = (b[i]-a[i])/G[i][i];
          chi += a[i]*v[i];
          r -= a[i]*MdagMv[i];
        }
        // NOTE(review): true_r is accumulated from the reduced system below but
        // is never read afterwards; only `error` is reported.
        RealD true_r(0.0);
        ComplexD tmp;
        for(int i=0; i<degree; i++){
          tmp = -b[i];
          for(int j=0; j<degree; j++){ tmp += G[i][j]*a[j]; }
          tmp = std::conj(tmp)*tmp;
          true_r += std::sqrt(tmp.real());
        }
        RealD error = std::sqrt(norm2(r)/norm2(phi));
        std::cout << GridLogMessage << "ChronoForecast: |res|/|src| = " << error << std::endl;
        return chi;
      };
  };
 }
 #endif
@@ -16,7 +16,7 @@
 #define INCLUDED_ALG_REMEZ_H
 #include <stddef.h>
-#include <Config.h>
+#include <Grid/GridStd.h>
 #ifdef HAVE_LIBGMP
 #include "bigfloat.h"
@@ -0,0 +1,606 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/algorithms/iterative/BlockConjugateGradient.h
 Copyright (C) 2017
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H
 #define GRID_BLOCK_CONJUGATE_GRADIENT_H
 namespace Grid {
 // Selects which solver BlockConjugateGradient::operator() dispatches to.
 enum BlockCGtype {
   BlockCG,     // routed to BlockCGsolve
   BlockCGrQ,   // routed to BlockCGrQsolve
   CGmultiRHS   // routed to CGmultiRHSsolve
 };
 //////////////////////////////////////////////////////////////////////////
 // Block conjugate gradient. Dimension zero should be the block direction
 //////////////////////////////////////////////////////////////////////////
 template <class Field>
 class BlockConjugateGradient : public OperatorFunction<Field> {
 public:
  typedef typename Field::scalar_type scomplex;
  int blockDim ;
  int Nblock;
  BlockCGtype CGtype;
  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
                           // Defaults true.
  RealD Tolerance;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  // Construct a block solver.
  //   cgtype          : which algorithm operator() dispatches to
  //   _Orthog         : lattice dimension used as the block direction
  //   tol             : relative residual target (compared as tol*tol against the per-block ratio)
  //   maxit           : maximum number of iterations
  //   err_on_no_conv  : if true, assert(0) when the iteration limit is reached
  // Members are initialised in declaration order; Nblock and IterationsToComplete
  // start at zero and are filled in by the solve routines.
  BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
    : blockDim(_Orthog),
      Nblock(0),
      CGtype(cgtype),
      ErrorOnNoConverge(err_on_no_conv),
      Tolerance(tol),
      MaxIterations(maxit),
      IterationsToComplete(0)
  {}
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Thin QR factorisation (google it)
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 void ThinQRfact (Eigen::MatrixXcd &m_rr,
 		 Eigen::MatrixXcd &C,
 		 Eigen::MatrixXcd &Cinv,
 		 Field & Q,
 		 const Field & R)
 {
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  // Thin QR factorisation of the block vector R, built from a Cholesky factorisation.
  //
  // Inputs : R      block vector to factorise
  // Outputs: m_rr   Rdag R, symmetrised
  //          C      upper triangular factor, C = L^dag where m_rr = L L^dag
  //          Cinv   inverse of C
  //          Q      R C^{-1}
  //
  // Dimensions
  // R_{ferm x Nblock} =  Q_{ferm x Nblock} x  C_{Nblock x Nblock} -> ferm x Nblock
  //
  // Rdag R = m_rr = Herm = L L^dag        <-- Cholesky decomposition (LLT routine in Eigen)
  //
  //   Q  C = R => Q = R C^{-1}
  //
  // Want  Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock}
  //
  // Set C = L^{dag}, and then Q^dag Q = ident
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  int Orthog = blockDim; // First dimension is block dim; this is an assumption

  sliceInnerProductMatrix(m_rr,R,R,Orthog);

  // Force m_rr to be manifestly Hermitian to remove rounding-level asymmetry.
  // The right-hand side reads entry (j,i) while entry (i,j) is being written, so it
  // must be evaluated into a temporary first; assigning the lazy expression directly
  // aliases m_rr and leaves one triangle with a skewed average.
  m_rr = (0.5*(m_rr+m_rr.adjoint())).eval();

  // Cholesky factor L with m_rr = L L^dag
  Eigen::MatrixXcd L    = m_rr.llt().matrixL();

  C    = L.adjoint();
  Cinv = C.inverse();

  ////////////////////////////////////////////////////////////////////////////////////////////////////
  // Q = R C^{-1}
  //
  // Q_j  = R_i Cinv(i,j)
  //
  // NB maddMatrix conventions are Right multiplication X[j] a[j,i] already
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  sliceMulMatrix(Q,Cinv,R,Orthog);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Call one of several implementations
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
 {
  // Dispatch on the algorithm chosen at construction time; an unknown
  // selector is a programming error and trips the assert.
  switch ( CGtype ) {
  case BlockCGrQ:
    BlockCGrQsolve(Linop,Src,Psi);
    break;
  case BlockCG:
    BlockCGsolve(Linop,Src,Psi);
    break;
  case CGmultiRHS:
    CGmultiRHSsolve(Linop,Src,Psi);
    break;
  default:
    assert(0);
  }
 }
 ////////////////////////////////////////////////////////////////////////////
 // BlockCGrQ implementation:
 //--------------------------
 // X is guess/Solution
 // B is RHS
 // Solve A X_i = B_i    ;        i refers to Nblock index
 ////////////////////////////////////////////////////////////////////////////
 // Block CG in the "rQ" formulation: solves A X = B for all Nblock right-hand sides at once,
 // applying A through Linop.HermOp.
 //   Linop : operator applied via HermOp
 //   B     : right-hand sides (read only)
 //   X     : initial guess on entry, updated in place towards the solution
 // Side effects: sets Nblock from the grid, sets X.checkerboard, writes IterationsToComplete,
 // logs progress and timings. On hitting MaxIterations it logs a failure and, if
 // ErrorOnNoConverge is set, asserts.
 void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X) 
 {
  int Orthog = blockDim; // First dimension is block dim; this is an assumption
  // Number of right-hand sides is the full lattice extent along the block direction
  Nblock = B._grid->_fdimensions[Orthog];
  std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
  X.checkerboard = B.checkerboard;
  conformable(X, B);
  // Work fields, copy-constructed from B so they share its grid
  Field tmp(B);
  Field Q(B);
  Field D(B);
  Field Z(B);
  Field AD(B);
  // Small Nblock x Nblock matrices used by the recurrence
  Eigen::MatrixXcd m_DZ     = Eigen::MatrixXcd::Identity(Nblock,Nblock);
  Eigen::MatrixXcd m_M      = Eigen::MatrixXcd::Identity(Nblock,Nblock);
  Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd m_C      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd m_Cinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd m_S      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd m_Sinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd m_tmp    = Eigen::MatrixXcd::Identity(Nblock,Nblock);
  // NOTE(review): m_tmp1 is never referenced below
  Eigen::MatrixXcd m_tmp1   = Eigen::MatrixXcd::Identity(Nblock,Nblock);
  // Initial residual computation & set up
  std::vector<RealD> residuals(Nblock);
  // ssq[b]: per-block sliceNorm of the source, the denominator of the relative residual.
  // Presumably a squared norm, since the ratio is compared against Tolerance*Tolerance - confirm against sliceNorm.
  std::vector<RealD> ssq(Nblock);
  sliceNorm(ssq,B,Orthog);
  RealD sssum=0;
  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
  // Sanity checks: neither the source nor the guess may contain NaN.
  // NOTE(review): this repeats the sliceNorm of B already held in ssq.
  sliceNorm(residuals,B,Orthog);
  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
  sliceNorm(residuals,X,Orthog);
  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
  /************************************************************************
   * Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
   ************************************************************************
   * Dimensions:
   *
   *   X,B==(Nferm x Nblock)
   *   A==(Nferm x Nferm)
   *  
   * Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
   * 
   * QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
   * for k: 
   *   Z  = AD
   *   M  = [D^dag Z]^{-1}
   *   X  = X + D MC
   *   QS = Q - ZM
   *   D  = Q + D S^dag
   *   C  = S C
   */
  ///////////////////////////////////////
  // Initial block: initial search dir is guess
  ///////////////////////////////////////
  std::cout << GridLogMessage<<"BlockCGrQ algorithm initialisation " <<std::endl;
  //1.  QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
  Linop.HermOp(X, AD);
  tmp = B - AD;  
  //std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl;
  ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
  //std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl;
  //std::cout << GridLogMessage << " m_rr " << m_rr<<std::endl;
  //std::cout << GridLogMessage << " m_C " << m_C<<std::endl;
  //std::cout << GridLogMessage << " m_Cinv " << m_Cinv<<std::endl;
  D=Q;
  std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
  ///////////////////////////////////////
  // Timers
  ///////////////////////////////////////
  GridStopWatch sliceInnerTimer;
  GridStopWatch sliceMaddTimer;
  GridStopWatch QRTimer;
  GridStopWatch MatrixTimer;
  GridStopWatch SolverTimer;
  SolverTimer.Start();
  int k;
  for (k = 1; k <= MaxIterations; k++) {
    //3. Z  = AD
    MatrixTimer.Start();
    Linop.HermOp(D, Z);      
    MatrixTimer.Stop();
    //std::cout << GridLogMessage << " norm2 Z " <<norm2(Z)<<std::endl;
    //4. M  = [D^dag Z]^{-1}
    sliceInnerTimer.Start();
    sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
    sliceInnerTimer.Stop();
    m_M       = m_DZ.inverse();
    //std::cout << GridLogMessage << " m_DZ " <<m_DZ<<std::endl;
    //5. X  = X + D MC
    m_tmp     = m_M * m_C;
    sliceMaddTimer.Start();
    sliceMaddMatrix(X,m_tmp, D,X,Orthog);     
    sliceMaddTimer.Stop();
    //6. QS = Q - ZM
    // tmp receives Q - Z M; the thin QR of tmp then overwrites Q and yields S
    sliceMaddTimer.Start();
    sliceMaddMatrix(tmp,m_M,Z,Q,Orthog,-1.0);
    sliceMaddTimer.Stop();
    QRTimer.Start();
    ThinQRfact (m_rr, m_S, m_Sinv, Q, tmp);
    QRTimer.Stop();
    //7. D  = Q + D S^dag
    m_tmp = m_S.adjoint();
    sliceMaddTimer.Start();
    sliceMaddMatrix(D,m_tmp,D,Q,Orthog);
    sliceMaddTimer.Stop();
    //8. C  = S C
    m_C = m_S*m_C;
    /*********************
     * convergence monitor
     *********************
     */
    // The diagonal of C^dag C is used as the per-block residual estimate
    m_rr = m_C.adjoint() * m_C;
    RealD max_resid=0;
    RealD rrsum=0;
    RealD rr;
    for(int b=0;b<Nblock;b++) {
      rrsum+=real(m_rr(b,b));
      rr = real(m_rr(b,b))/ssq[b];
      if ( rr > max_resid ) max_resid = rr;
    }
    std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
 	      <<" ave "<<std::sqrt(rrsum/sssum) << " max "<< max_resid <<std::endl;
    // Converged once the worst block ratio drops below Tolerance squared
    if ( max_resid < Tolerance*Tolerance ) { 
      SolverTimer.Stop();
      std::cout << GridLogMessage<<"BlockCGrQ converged in "<<k<<" iterations"<<std::endl;
      for(int b=0;b<Nblock;b++){
 	std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
 		  << std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
      }
      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
      // Recompute A X - B explicitly to report the true residual (AD is reused as scratch)
      Linop.HermOp(X, AD);
      AD = AD-B;
      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AD)/norm2(B)) <<std::endl;
      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
      std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
      std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
      std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
      std::cout << GridLogMessage << "\tThinQRfact " << QRTimer.Elapsed()  <<std::endl;
      IterationsToComplete = k;
      return;
    }
  }
  // Iteration limit reached without meeting the tolerance
  std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
  if (ErrorOnNoConverge) assert(0);
  // k is the loop counter after exit (MaxIterations+1 for a non-negative MaxIterations)
  IterationsToComplete = k;
 }
 //////////////////////////////////////////////////////////////////////////
 // Block conjugate gradient; Original O'Leary Dimension zero should be the block direction
 //////////////////////////////////////////////////////////////////////////
 // Block CG following the O'Leary recurrence listed below, with the preconditioner M = 1.
 //   Linop : operator applied via HermOp
 //   Src   : right-hand sides (read only)
 //   Psi   : initial guess on entry, updated in place towards the solution
 // Side effects: sets Nblock from the grid, sets Psi.checkerboard, writes IterationsToComplete,
 // logs progress and timings. On hitting MaxIterations it logs a failure and, if
 // ErrorOnNoConverge is set, asserts.
 void BlockCGsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
 {
  int Orthog = blockDim; // First dimension is block dim; this is an assumption
  // Number of right-hand sides is the full lattice extent along the block direction
  Nblock = Src._grid->_fdimensions[Orthog];
  std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
  Psi.checkerboard = Src.checkerboard;
  conformable(Psi, Src);
  // Work fields, copy-constructed from Src so they share its grid
  Field P(Src);
  Field AP(Src);
  Field R(Src);
  // Small Nblock x Nblock matrices used by the recurrence
  Eigen::MatrixXcd m_pAp    = Eigen::MatrixXcd::Identity(Nblock,Nblock);
  Eigen::MatrixXcd m_pAp_inv= Eigen::MatrixXcd::Identity(Nblock,Nblock);
  Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd m_rr_inv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd m_alpha      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd m_beta   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  // Initial residual computation & set up
  std::vector<RealD> residuals(Nblock);
  // ssq[b]: per-block sliceNorm of the source, the denominator of the relative residual.
  // Presumably a squared norm, since the ratio is compared against Tolerance*Tolerance - confirm against sliceNorm.
  std::vector<RealD> ssq(Nblock);
  sliceNorm(ssq,Src,Orthog);
  RealD sssum=0;
  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
  // Sanity checks: neither the source nor the guess may contain NaN.
  // NOTE(review): this repeats the sliceNorm of Src already held in ssq.
  sliceNorm(residuals,Src,Orthog);
  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
  sliceNorm(residuals,Psi,Orthog);
  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
  // Apply the operator to the initial guess so the starting residual can be formed below
  Linop.HermOp(Psi, AP);
  /************************************************************************
   * Block conjugate gradient (Stephen Pickles, thesis 1995, pp 71, O Leary 1980)
   ************************************************************************
   * O'Leary : R = B - A X
   * O'Leary : P = M R ; preconditioner M = 1
   * O'Leary : alpha = PAP^{-1} RMR
   * O'Leary : beta  = RMR^{-1}_old RMR_new
   * O'Leary : X=X+Palpha
   * O'Leary : R_new=R_old-AP alpha
   * O'Leary : P=MR_new+P beta
   */
  R = Src - AP;  
  P = R;
  sliceInnerProductMatrix(m_rr,R,R,Orthog);
  GridStopWatch sliceInnerTimer;
  GridStopWatch sliceMaddTimer;
  GridStopWatch MatrixTimer;
  GridStopWatch SolverTimer;
  SolverTimer.Start();
  int k;
  for (k = 1; k <= MaxIterations; k++) {
    // Progress line uses the residual matrix left by the previous iteration (or the initial one)
    RealD rrsum=0;
    for(int b=0;b<Nblock;b++) rrsum+=real(m_rr(b,b));
    std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
 	      <<" / "<<std::sqrt(rrsum/sssum) <<std::endl;
    MatrixTimer.Start();
    Linop.HermOp(P, AP);
    MatrixTimer.Stop();
    // Alpha
    sliceInnerTimer.Start();
    sliceInnerProductMatrix(m_pAp,P,AP,Orthog);
    sliceInnerTimer.Stop();
    m_pAp_inv = m_pAp.inverse();
    m_alpha   = m_pAp_inv * m_rr ;
    // Psi, R update
    sliceMaddTimer.Start();
    sliceMaddMatrix(Psi,m_alpha, P,Psi,Orthog);     // add alpha *  P to psi
    sliceMaddMatrix(R  ,m_alpha,AP,  R,Orthog,-1.0);// sub alpha * AP to resid
    sliceMaddTimer.Stop();
    // Beta
    // The inverse of the OLD residual matrix must be taken before m_rr is overwritten below
    m_rr_inv = m_rr.inverse();
    sliceInnerTimer.Start();
    sliceInnerProductMatrix(m_rr,R,R,Orthog);
    sliceInnerTimer.Stop();
    m_beta = m_rr_inv *m_rr;
    // Search update
    // AP is reused as scratch for the new search direction before being copied into P
    sliceMaddTimer.Start();
    sliceMaddMatrix(AP,m_beta,P,R,Orthog);
    sliceMaddTimer.Stop();
    P= AP;
    /*********************
     * convergence monitor
     *********************
     */
    RealD max_resid=0;
    RealD rr;
    for(int b=0;b<Nblock;b++){
      rr = real(m_rr(b,b))/ssq[b];
      if ( rr > max_resid ) max_resid = rr;
    }
    // Converged once the worst block ratio drops below Tolerance squared
    if ( max_resid < Tolerance*Tolerance ) { 
      SolverTimer.Stop();
      std::cout << GridLogMessage<<"BlockCG converged in "<<k<<" iterations"<<std::endl;
      for(int b=0;b<Nblock;b++){
 	std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
 		  << std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
      }
      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
      // Recompute A Psi - Src explicitly to report the true residual (AP is reused as scratch)
      Linop.HermOp(Psi, AP);
      AP = AP-Src;
      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
      std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
      std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
      std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
      IterationsToComplete = k;
      return;
    }
  }
  // Iteration limit reached without meeting the tolerance
  std::cout << GridLogMessage << "BlockConjugateGradient did NOT converge" << std::endl;
  if (ErrorOnNoConverge) assert(0);
  // k is the loop counter after exit (MaxIterations+1 for a non-negative MaxIterations)
  IterationsToComplete = k;
 }
 //////////////////////////////////////////////////////////////////////////
 // multiRHS conjugate gradient. Dimension zero should be the block direction
 // Use this for spread out across nodes
 //////////////////////////////////////////////////////////////////////////
 // Multiple right-hand-side CG: every block gets its own scalar alpha and beta,
 // while the operator is applied to all blocks together through one HermOp call.
 //   Linop : operator applied via HermOp
 //   Src   : right-hand sides (read only)
 //   Psi   : initial guess on entry, updated in place towards the solution
 // Side effects: sets Nblock from the grid, sets Psi.checkerboard, writes IterationsToComplete,
 // logs progress and timings. On hitting MaxIterations it logs a failure and, if
 // ErrorOnNoConverge is set, asserts.
 void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
 {
  int Orthog = blockDim; // First dimension is block dim
  // Number of right-hand sides is the full lattice extent along the block direction
  Nblock = Src._grid->_fdimensions[Orthog];
  std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
  Psi.checkerboard = Src.checkerboard;
  conformable(Psi, Src);
  // Work fields, copy-constructed from Src so they share its grid
  Field P(Src);
  Field AP(Src);
  Field R(Src);
  // Per-block scalars of the recurrence
  std::vector<ComplexD> v_pAp(Nblock);
  std::vector<RealD> v_rr (Nblock);
  std::vector<RealD> v_rr_inv(Nblock);
  std::vector<RealD> v_alpha(Nblock);
  std::vector<RealD> v_beta(Nblock);
  // Initial residual computation & set up
  std::vector<RealD> residuals(Nblock);
  // ssq[b]: per-block sliceNorm of the source, the denominator of the relative residual.
  // Presumably a squared norm, since the ratio is compared against Tolerance*Tolerance - confirm against sliceNorm.
  std::vector<RealD> ssq(Nblock);
  sliceNorm(ssq,Src,Orthog);
  RealD sssum=0;
  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
  // Sanity checks: neither the source nor the guess may contain NaN.
  // NOTE(review): this repeats the sliceNorm of Src already held in ssq.
  sliceNorm(residuals,Src,Orthog);
  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
  sliceNorm(residuals,Psi,Orthog);
  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
  // Apply the operator to the initial guess, then form the starting residual and search direction
  Linop.HermOp(Psi, AP);
  R = Src - AP;  
  P = R;
  sliceNorm(v_rr,R,Orthog);
  GridStopWatch sliceInnerTimer;
  GridStopWatch sliceMaddTimer;
  GridStopWatch sliceNormTimer;
  GridStopWatch MatrixTimer;
  GridStopWatch SolverTimer;
  SolverTimer.Start();
  int k;
  for (k = 1; k <= MaxIterations; k++) {
    // Progress line uses the residual norms left by the previous iteration (or the initial ones)
    RealD rrsum=0;
    for(int b=0;b<Nblock;b++) rrsum+=real(v_rr[b]);
    std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
 	      <<" / "<<std::sqrt(rrsum/sssum) <<std::endl;
    MatrixTimer.Start();
    Linop.HermOp(P, AP);
    MatrixTimer.Stop();
    // Alpha
    sliceInnerTimer.Start();
    sliceInnerProductVector(v_pAp,P,AP,Orthog);
    sliceInnerTimer.Stop();
    // Only the real part of each <P,AP> is used
    for(int b=0;b<Nblock;b++){
      v_alpha[b] = v_rr[b]/real(v_pAp[b]);
    }
    // Psi, R update
    sliceMaddTimer.Start();
    sliceMaddVector(Psi,v_alpha, P,Psi,Orthog);     // add alpha *  P to psi
    sliceMaddVector(R  ,v_alpha,AP,  R,Orthog,-1.0);// sub alpha * AP to resid
    sliceMaddTimer.Stop();
    // Beta
    // Capture the reciprocal of the OLD residual norms before v_rr is overwritten below
    for(int b=0;b<Nblock;b++){
      v_rr_inv[b] = 1.0/v_rr[b];
    }
    sliceNormTimer.Start();
    sliceNorm(v_rr,R,Orthog);
    sliceNormTimer.Stop();
    for(int b=0;b<Nblock;b++){
      v_beta[b] = v_rr_inv[b] *v_rr[b];
    }
    // Search update
    sliceMaddTimer.Start();
    sliceMaddVector(P,v_beta,P,R,Orthog);
    sliceMaddTimer.Stop();
    /*********************
     * convergence monitor
     *********************
     */
    RealD max_resid=0;
    for(int b=0;b<Nblock;b++){
      RealD rr = v_rr[b]/ssq[b];
      if ( rr > max_resid ) max_resid = rr;
    }
    // Converged once the worst block ratio drops below Tolerance squared
    if ( max_resid < Tolerance*Tolerance ) { 
      SolverTimer.Stop();
      std::cout << GridLogMessage<<"MultiRHS solver converged in " <<k<<" iterations"<<std::endl;
      for(int b=0;b<Nblock;b++){
 	std::cout << GridLogMessage<< "\t\tBlock "<<b<<" computed resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl;
      }
      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
      // Recompute A Psi - Src explicitly to report the true residual (AP is reused as scratch)
      Linop.HermOp(Psi, AP);
      AP = AP-Src;
      std::cout <<GridLogMessage << "\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
      std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
      std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
      std::cout << GridLogMessage << "\tNorm       " << sliceNormTimer.Elapsed() <<std::endl;
      // The "MaddMatrix" label reports the time spent in the sliceMaddVector calls above
      std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
      IterationsToComplete = k;
      return;
    }
  }
  // Iteration limit reached without meeting the tolerance
  std::cout << GridLogMessage << "MultiRHSConjugateGradient did NOT converge" << std::endl;
  if (ErrorOnNoConverge) assert(0);
  // k is the loop counter after exit (MaxIterations+1 for a non-negative MaxIterations)
  IterationsToComplete = k;
 }
 };
 }
 #endif
@@ -52,8 +52,9 @@ class ConjugateGradient : public OperatorFunction<Field> {
        MaxIterations(maxit),
        ErrorOnNoConverge(err_on_no_conv){};
-  void operator()(LinearOperatorBase<Field> &Linop, const Field &src,
+  void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
-                  Field &psi) {
+
    psi.checkerboard = src.checkerboard;
    conformable(psi, src);
@@ -69,7 +70,6 @@ class ConjugateGradient : public OperatorFunction<Field> {
    Linop.HermOpAndNorm(psi, mmp, d, b);
    r = src - mmp;
    p = r;
@@ -78,12 +78,12 @@ class ConjugateGradient : public OperatorFunction<Field> {
    cp = a;
    ssq = norm2(src);
-    std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: guess " << guess << std::endl;
+    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: guess " << guess << std::endl;
-    std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient:   src " << ssq << std::endl;
+    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:   src " << ssq << std::endl;
-    std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient:    mp " << d << std::endl;
+    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:    mp " << d << std::endl;
-    std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient:   mmp " << b << std::endl;
+    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:   mmp " << b << std::endl;
-    std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient:  cp,r " << cp << std::endl;
+    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:  cp,r " << cp << std::endl;
-    std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient:     p " << a << std::endl;
+    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:     p " << a << std::endl;
    RealD rsq = Tolerance * Tolerance * ssq;
@@ -92,37 +92,46 @@ class ConjugateGradient : public OperatorFunction<Field> {
      return;
    }
-    std::cout << GridLogIterative << std::setprecision(4)
+    std::cout << GridLogIterative << std::setprecision(8)
              << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
    GridStopWatch LinalgTimer;
    GridStopWatch InnerTimer;
    GridStopWatch AxpyNormTimer;
    GridStopWatch LinearCombTimer;
    GridStopWatch MatrixTimer;
    GridStopWatch SolverTimer;
    SolverTimer.Start();
    int k;
-    for (k = 1; k <= MaxIterations; k++) {
+    for (k = 1; k <= MaxIterations*1000; k++) {
      c = cp;
      MatrixTimer.Start();
-      Linop.HermOpAndNorm(p, mmp, d, qq);
+      Linop.HermOp(p, mmp);
      MatrixTimer.Stop();
      LinalgTimer.Start();
      //  RealD    qqck = norm2(mmp);
      //  ComplexD dck  = innerProduct(p,mmp);
      InnerTimer.Start();
      ComplexD dc  = innerProduct(p,mmp);
      InnerTimer.Stop();
      d = dc.real();
      a = c / d;
      b_pred = a * (a * qq - d) / c;
      AxpyNormTimer.Start();
      cp = axpy_norm(r, -a, mmp, r);
      AxpyNormTimer.Stop();
      b = cp / c;
-      // Fuse these loops ; should be really easy
+      LinearCombTimer.Start();
-      psi = a * p + psi;
+      parallel_for(int ss=0;ss<src._grid->oSites();ss++){
-      p = p * b + r;
+	vstream(psi[ss], a      *  p[ss] + psi[ss]);
-
+	vstream(p  [ss], b      *  p[ss] + r[ss]);
      }
      LinearCombTimer.Stop();
      LinalgTimer.Stop();
      std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
                << " residual " << cp << " target " << rsq << std::endl;
@@ -132,8 +141,6 @@ class ConjugateGradient : public OperatorFunction<Field> {
        Linop.HermOpAndNorm(psi, mmp, d, qq);
        p = mmp - src;
        RealD mmpnorm = sqrt(norm2(mmp));
        RealD psinorm = sqrt(norm2(psi));
        RealD srcnorm = sqrt(norm2(src));
        RealD resnorm = sqrt(norm2(p));
        RealD true_residual = resnorm / srcnorm;
@@ -147,6 +154,9 @@ class ConjugateGradient : public OperatorFunction<Field> {
 	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
 	std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
 	std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
 	std::cout << GridLogMessage << "\tInner      " << InnerTimer.Elapsed() <<std::endl;
 	std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
 	std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
@@ -157,8 +167,10 @@ class ConjugateGradient : public OperatorFunction<Field> {
    }
    std::cout << GridLogMessage << "ConjugateGradient did NOT converge"
              << std::endl;
    if (ErrorOnNoConverge) assert(0);
    IterationsToComplete = k;
  }
 };
 }
@@ -43,6 +43,7 @@ namespace Grid {
 public:                                                
    RealD   Tolerance;
    Integer MaxIterations;
    Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
    int verbose;
    MultiShiftFunction shifts;
@@ -163,7 +164,16 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
  for(int s=0;s<nshift;s++) {
    axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
  }
-  
+ 
  ///////////////////////////////////////
  // Timers
  ///////////////////////////////////////
  GridStopWatch AXPYTimer;
  GridStopWatch ShiftTimer;
  GridStopWatch QRTimer;
  GridStopWatch MatrixTimer;
  GridStopWatch SolverTimer;
  SolverTimer.Start();
  // Iteration loop
  int k;
@@ -171,7 +181,9 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
  for (k=1;k<=MaxIterations;k++){
    a = c /cp;
    AXPYTimer.Start();
    axpy(p,a,p,r);
    AXPYTimer.Stop();
    // Note to self - direction ps is iterated separately
    // for each shift. Does not appear to have any scope
@@ -180,6 +192,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
    // However SAME r is used. Could load "r" and update
    // ALL ps[s]. 2/3 Bandwidth saving
    // New Kernel: Load r, vector of coeffs, vector of pointers ps
    AXPYTimer.Start();
    for(int s=0;s<nshift;s++){
      if ( ! converged[s] ) { 
 	if (s==0){
@@ -190,22 +203,34 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
 	}
      }
    }
    AXPYTimer.Stop();
    cp=c;
    MatrixTimer.Start();  
    //Linop.HermOpAndNorm(p,mmp,d,qq); // d is used
    // The below is faster on KNL
    Linop.HermOp(p,mmp); 
    d=real(innerProduct(p,mmp));
-    Linop.HermOpAndNorm(p,mmp,d,qq);
+    MatrixTimer.Stop();  
    AXPYTimer.Start();
    axpy(mmp,mass[0],p,mmp);
    AXPYTimer.Stop();
    RealD rn = norm2(p);
    d += rn*mass[0];
    bp=b;
    b=-cp/d;
    AXPYTimer.Start();
    c=axpy_norm(r,b,mmp,r);
    AXPYTimer.Stop();
    // Toggle the recurrence history
    bs[0] = b;
    iz = 1-iz;
    ShiftTimer.Start();
    for(int s=1;s<nshift;s++){
      if((!converged[s])){
 	RealD z0 = z[s][1-iz];
@@ -215,6 +240,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
 	bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike
      }
    }
    ShiftTimer.Stop();
    for(int s=0;s<nshift;s++){
      int ss = s;
@@ -257,6 +283,9 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
    if ( all_converged ){
    SolverTimer.Stop();
      std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
      std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl;
@@ -269,8 +298,19 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
 	RealD cn = norm2(src);
 	std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
      }
      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
      std::cout << GridLogMessage << "\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl;
      std::cout << GridLogMessage << "\tMarix    " << MatrixTimer.Elapsed()     <<std::endl;
      std::cout << GridLogMessage << "\tShift    " << ShiftTimer.Elapsed()     <<std::endl;
      IterationsToComplete = k;	
      return;
    }
  }
  // ugly hack
  std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
@@ -0,0 +1,256 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ConjugateGradientReliableUpdate.h
    Copyright (C) 2015
 Author: Christopher Kelly <ckelly@phys.columbia.edu>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
 #define GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
 namespace Grid {
  // Mixed-precision conjugate gradient with "reliable updates".
  //
  // The inner CG recurrence runs on FieldF fields using Linop_f (or, once
  // switched, the optional fallback operator).  Whenever the iterated residual
  // falls below Delta * (largest residual seen since the previous update), the
  // accumulated FieldF solution is added into the FieldD solution psi and the
  // residual is recomputed from scratch with Linop_d.
  //
  // The two enable_if guards restrict the template to
  // getPrecision<FieldD>::value == 2 and getPrecision<FieldF>::value == 1.
  template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
  class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> {
  public:
    bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
    // Defaults true.
    RealD Tolerance;       // relative tolerance: converged when |r|^2 <= Tolerance^2 * |src|^2
    Integer MaxIterations; // upper bound on inner iterations (also passed to the cleanup CG)
    Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
    Integer ReliableUpdatesPerformed; // number of reliable updates done; filled in upon completion
    // NOTE(review): neither counter above is written on the early "guess was
    // REALLY good" return, and neither is initialised by the constructor.
    bool DoFinalCleanup; //Final DP cleanup, defaults to true
    Integer IterationsToCleanup; //Final DP cleanup step iterations
    LinearOperatorBase<FieldF> &Linop_f; // operator used for the FieldF inner iteration
    LinearOperatorBase<FieldD> &Linop_d; // operator used for FieldD residuals and the final cleanup
    GridBase* SinglePrecGrid;            // grid on which the FieldF work fields are constructed
    RealD Delta; //reliable update parameter
    //Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
    LinearOperatorBase<FieldF> *Linop_fallback; // NULL unless setFallbackLinop() was called
    RealD fallback_transition_tol;              // only read when Linop_fallback != NULL; compared against the squared residual cp
    // NOTE(review): the mem-initialiser list below is not in member declaration
    // order; C++ initialises members in declaration order regardless.
    ConjugateGradientReliableUpdate(RealD tol, Integer maxit, RealD _delta, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d, bool err_on_no_conv = true)
      : Tolerance(tol),
        MaxIterations(maxit),
 	Delta(_delta),
 	Linop_f(_Linop_f),
 	Linop_d(_Linop_d),
 	SinglePrecGrid(_sp_grid),
        ErrorOnNoConverge(err_on_no_conv),
 	DoFinalCleanup(true),
 	Linop_fallback(NULL)
    {};
    // Registers an alternative FieldF operator.  operator() switches to it (once,
    // and permanently for that solve) when cp < _fallback_transition_tol.
    void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
      Linop_fallback = &_Linop_fallback;
      fallback_transition_tol = _fallback_transition_tol;      
    }
    // Solve for psi given src.  On entry psi holds the initial guess; on
    // successful return it holds the solution.
    void operator()(const FieldD &src, FieldD &psi) {
      LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f; // operator currently used by the inner iteration
      bool using_fallback = false;
      psi.checkerboard = src.checkerboard;
      conformable(psi, src);
      RealD cp, c, a, d, b, ssq, qq, b_pred;
      // Work fields are copy-constructed from src; their contents are overwritten below.
      FieldD p(src);
      FieldD mmp(src);
      FieldD r(src);
      // Initial residual computation & set up
      RealD guess = norm2(psi);
      assert(std::isnan(guess) == 0);
      Linop_d.HermOpAndNorm(psi, mmp, d, b);
      r = src - mmp;
      p = r;
      a = norm2(p);
      cp = a;             // cp holds the current squared residual norm
      ssq = norm2(src);
      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: guess " << guess << std::endl;
      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:   src " << ssq << std::endl;
      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:    mp " << d << std::endl;
      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:   mmp " << b << std::endl;
      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:  cp,r " << cp << std::endl;
      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:     p " << a << std::endl;
      RealD rsq = Tolerance * Tolerance * ssq; // absolute target for the squared residual
      // Check if guess is really REALLY good :)
      if (cp <= rsq) {
 	std::cout << GridLogMessage << "ConjugateGradientReliableUpdate guess was REALLY good\n";
 	std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
 	return;
      }
      //Single prec initialization
      FieldF r_f(SinglePrecGrid);
      r_f.checkerboard = r.checkerboard;
      precisionChange(r_f, r);
      FieldF psi_f(r_f);
      psi_f = zero;          // FieldF solution accumulated since the last reliable update
      FieldF p_f(r_f);       // initial search direction is the residual
      FieldF mmp_f(r_f);
      RealD MaxResidSinceLastRelUp = cp; //initial residual    
      std::cout << GridLogIterative << std::setprecision(4)
 		<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
      GridStopWatch LinalgTimer;
      GridStopWatch MatrixTimer;
      GridStopWatch SolverTimer;
      SolverTimer.Start();
      int k = 0; // iteration counter
      int l = 0; // reliable-update counter
      for (k = 1; k <= MaxIterations; k++) {
 	c = cp; // squared residual from the previous iteration
 	MatrixTimer.Start();
 	Linop_f_use->HermOpAndNorm(p_f, mmp_f, d, qq);
 	MatrixTimer.Stop();
 	LinalgTimer.Start();
 	a = c / d;
 	b_pred = a * (a * qq - d) / c; // only used in the debug output below
 	cp = axpy_norm(r_f, -a, mmp_f, r_f); // r_f -= a*mmp_f ; cp receives the returned norm
 	b = cp / c;
 	// Fuse these loops ; should be really easy
 	psi_f = a * p_f + psi_f;
 	//p_f = p_f * b + r_f;
 	LinalgTimer.Stop();
 	std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: Iteration " << k
 		  << " residual " << cp << " target " << rsq << std::endl;
 	std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << "  b = "<< b << std::endl;
 	std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << "  c = "<< c << std::endl;
 	// Track the largest residual seen since the last reliable update.
 	if(cp > MaxResidSinceLastRelUp){
 	  std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: updating MaxResidSinceLastRelUp : " << MaxResidSinceLastRelUp << " -> " << cp << std::endl;
 	  MaxResidSinceLastRelUp = cp;
 	}
 	// Stopping condition
 	if (cp <= rsq) {
 	  //Although not written in the paper, I assume that I have to add on the final solution
 	  precisionChange(mmp, psi_f);
 	  psi = psi + mmp;
 	  SolverTimer.Stop();
 	  // Recompute the true residual with the FieldD operator for reporting.
 	  Linop_d.HermOpAndNorm(psi, mmp, d, qq);
 	  p = mmp - src;
 	  RealD srcnorm = sqrt(norm2(src));
 	  RealD resnorm = sqrt(norm2(p));
 	  RealD true_residual = resnorm / srcnorm;
 	  std::cout << GridLogMessage << "ConjugateGradientReliableUpdate Converged on iteration " << k << " after " << l << " reliable updates" << std::endl;
 	  std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
 	  std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
 	  std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
 	  std::cout << GridLogMessage << "Time breakdown "<<std::endl;
 	  std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
 	  std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
 	  std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
 	  IterationsToComplete = k;	
 	  ReliableUpdatesPerformed = l;
 	  if(DoFinalCleanup){
 	    //Do a final CG to cleanup
 	    std::cout << GridLogMessage << "ConjugateGradientReliableUpdate performing final cleanup.\n";
 	    ConjugateGradient<FieldD> CG(Tolerance,MaxIterations);
 	    CG.ErrorOnNoConverge = ErrorOnNoConverge;
 	    CG(Linop_d,src,psi);
 	    IterationsToCleanup = CG.IterationsToComplete;
 	  }
 	  // Without cleanup, only a loose check on the true residual is applied.
 	  else if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
 	  std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n";
 	  return;
 	}
 	else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
 	  std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
 		    << cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
 	  // Fold the FieldF partial solution into psi, then rebuild the residual
 	  // with the FieldD operator and restart the FieldF accumulation from zero.
 	  precisionChange(mmp, psi_f);
 	  psi = psi + mmp;
 	  Linop_d.HermOpAndNorm(psi, mmp, d, qq);
 	  r = src - mmp;
 	  psi_f = zero;
 	  precisionChange(r_f, r);
 	  cp = norm2(r);
 	  MaxResidSinceLastRelUp = cp;
 	  b = cp/c; // recompute b from the freshly computed residual
 	  std::cout << GridLogMessage << "ConjugateGradientReliableUpdate new residual " << cp << std::endl;
 	  l = l+1;
 	}
 	p_f = p_f * b + r_f; //update search vector after reliable update appears to help convergence
 	// One-way switch to the fallback operator once the residual is small enough.
 	if(!using_fallback && Linop_fallback != NULL && cp < fallback_transition_tol){
 	  std::cout << GridLogMessage << "ConjugateGradientReliableUpdate switching to fallback linear operator on iteration " << k << " at residual " << cp << std::endl;
 	  Linop_f_use = Linop_fallback;
 	  using_fallback = true;
 	}
      }
      // Reached only when the loop ran out of iterations.
      // NOTE(review): on this path psi_f is not added into psi, so psi only
      // reflects the solution as of the last reliable update - confirm intended.
      std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge"
 		<< std::endl;
      if (ErrorOnNoConverge) assert(0);
      IterationsToComplete = k; // k == MaxIterations+1 here (loop exit value)
      ReliableUpdatesPerformed = l;      
    }    
  };
 };
 #endif
@@ -0,0 +1,104 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_DEFLATION_H
 #define GRID_DEFLATION_H
 namespace Grid { 
 // Guesser that ignores the source and always produces a zero initial guess.
 template<class Field>
 class ZeroGuesser : public LinearFunction<Field>
 {
 public:
   // src is not read; guess is overwritten with zero.
   virtual void operator()(const Field &src, Field &guess)
   {
     guess = zero;
   }
 };
 // Guesser that uses the source itself as the initial guess.
 template<class Field>
 class SourceGuesser : public LinearFunction<Field>
 {
 public:
   // guess is overwritten with a copy of src.
   virtual void operator()(const Field &src, Field &guess)
   {
     guess = src;
   }
 };
 ////////////////////////////////
 // Fine grid deflation
 ////////////////////////////////
 // Builds an initial guess from a set of eigenvectors and eigenvalues:
 //   guess = sum_i  innerProduct(evec[i],src) / eval[i] * evec[i]
 // Only references to the two vectors are stored, so both must outlive this object.
 template<class Field>
 class DeflatedGuesser : public LinearFunction<Field>
 {
 private:
   const std::vector<Field> &evec;
   const std::vector<RealD> &eval;

 public:
   DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval)
     : evec(_evec),
       eval(_eval)
   {
   }

   virtual void operator()(const Field &src,Field &guess)
   {
     guess = zero;
     assert(evec.size()==eval.size());
     auto nvec = evec.size();
     for (int n=0; n<nvec; n++) {
       const Field &mode = evec[n];
       // Coefficient of this mode: overlap with the source divided by its eigenvalue.
       auto coeff = TensorRemove(innerProduct(mode,src)) / eval[n];
       axpy(guess,coeff,mode,guess);
     }
     guess.checkerboard = src.checkerboard;
   }
 };
 // Deflated guess computed on a coarse grid: the fine source is projected onto
 // the subspace with blockProject, deflated there with the coarse eigenpairs,
 // and the result is promoted back to the fine grid with blockPromote.
 // Only references are stored, so all three vectors must outlive this object.
 template<class FineField, class CoarseField>
 class LocalCoherenceDeflatedGuesser : public LinearFunction<FineField>
 {
 private:
   const std::vector<FineField>   &subspace;
   const std::vector<CoarseField> &evec_coarse;
   const std::vector<RealD>       &eval_coarse;

 public:
   LocalCoherenceDeflatedGuesser(const std::vector<FineField>   &_subspace,
                                 const std::vector<CoarseField> &_evec_coarse,
                                 const std::vector<RealD>       &_eval_coarse)
     : subspace(_subspace),
       evec_coarse(_evec_coarse),
       eval_coarse(_eval_coarse)
   {
   }

   void operator()(const FineField &src,FineField &guess)
   {
     int nmodes = (int)evec_coarse.size();

     // Coarse work fields live on the grid of the first coarse eigenvector.
     CoarseField src_coarse(evec_coarse[0]._grid);
     CoarseField guess_coarse(evec_coarse[0]._grid);
     guess_coarse = zero;

     blockProject(src_coarse,src,subspace);

     for (int n=0; n<nmodes; n++) {
       const CoarseField &mode = evec_coarse[n];
       auto coeff = TensorRemove(innerProduct(mode,src_coarse)) / eval_coarse[n];
       axpy(guess_coarse,coeff,mode,guess_coarse);
     }

     blockPromote(guess_coarse,guess,subspace);
     guess.checkerboard = src.checkerboard;
   }
 };
 }
 #endif
@@ -0,0 +1,842 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Chulwoo Jung <chulwoo@bnl.gov>
 Author: Christoph Lehner <clehner@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_BIRL_H
 #define GRID_BIRL_H
 #include <string.h> //memset
 //#include <zlib.h>
 #include <sys/stat.h>
 namespace Grid { 
  ////////////////////////////////////////////////////////
  // Move following 100 LOC to lattice/Lattice_basis.h
  ////////////////////////////////////////////////////////
 // For j = 0 .. k-1 in turn, subtracts innerProduct(basis[j],w) * basis[j] from w.
 // Each step uses the w already updated by the previous steps.
 template<class Field>
 void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
 {
   int col = 0;
   while ( col < k ) {
     Field &b = basis[col];
     auto overlap = innerProduct(b,w);
     w = w - overlap*b;
     ++col;
   }
 }
 // In-place rotation of the basis by Qt.  At every outer site ss:
 //   basis[j](ss) <- sum_{k=k0}^{k1-1} Qt(j,k) * basis[k](ss)    for j in [j0,j1)
 // The new values for a site are first gathered in a scratch buffer of Nm
 // entries (indexed directly by j) and written back only afterwards, because
 // the outputs overlap the inputs.
 template<class Field>
 void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm)
 {
   typedef typename Field::vector_object vobj;
   GridBase* grid = basis[0]._grid;

   parallel_region
   {
     // Declared inside the parallel region, so each thread has its own buffer.
     std::vector < vobj , commAllocator<vobj> > scratch(Nm);

     parallel_for_internal(int ss=0;ss < grid->oSites();ss++){

       // Accumulate every rotated row for this site.
       for(int row=j0; row<j1; ++row){
         scratch[row]=0.;
         for(int col=k0; col<k1; ++col){
           scratch[row] +=Qt(row,col) * basis[col]._odata[ss];
         }
       }

       // Only now overwrite the basis at this site.
       for(int row=j0; row<j1; ++row){
         basis[row]._odata[ss] = scratch[row];
       }
     }
   }
 }
 // Extract a single rotated vector without modifying the basis:
 //   result(ss) = sum_{k=k0}^{k1-1} Qt(j,k) * basis[k](ss)
 // result takes its checkerboard from basis[0].  Nm is not used here.
 template<class Field>
 void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
 {
   typedef typename Field::vector_object vobj;

   GridBase* grid = basis[0]._grid;
   result.checkerboard = basis[0].checkerboard;

   parallel_for(int ss=0;ss < grid->oSites();ss++){
     vobj acc = zero;
     int col = k0;
     while ( col < k1 ) {
       acc +=Qt(j,col) * basis[col]._odata[ss];
       ++col;
     }
     result._odata[ss] = acc;
   }
 }
 // Applies the permutation idx to _v and sort_vals in place, by pairwise swaps.
 // idx[i] names the element that should end up at position i.  idx is consumed:
 // on return the first idx.size() entries are the identity.
 template<class Field>
 void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx)
 {
   int vlen = idx.size();

   assert(vlen>=1);
   assert(vlen<=sort_vals.size());
   assert(vlen<=_v.size());

   for (size_t dst=0;dst<vlen;dst++) {

     if (idx[dst] == dst) continue;   // already in its final position

     //////////////////////////////////////
     // Swap v[dst] with v[idx[dst]].  The element previously at dst moves to
     // slot idx[dst]; find the later entry "holder" that wanted the old v[dst]
     // and redirect it there, then mark dst as done.
     //////////////////////////////////////
     size_t holder = dst;
     while ( holder<idx.size() ) {
       if (idx[holder]==dst)
         break;
       holder++;
     }

     assert(idx[dst] > dst);     assert(holder!=idx.size());      assert(idx[holder]==dst);

     std::swap(_v[dst]._odata,_v[idx[dst]]._odata); // should use vector move constructor, no data copy
     std::swap(sort_vals[dst],sort_vals[idx[dst]]);

     idx[holder] = idx[dst];
     idx[dst] = dst;
   }
 }
 // Returns the index permutation that orders sort_vals by increasing absolute
 // value.  sort_vals itself is not modified.
 inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals)
 {
   std::vector<int> order(sort_vals.size());
   std::iota(order.begin(), order.end(), 0);

   // Compare two positions by the magnitude of the values they refer to.
   auto smallerMagnitude = [&sort_vals](int lhs, int rhs) {
     return ::fabs(sort_vals[lhs]) < ::fabs(sort_vals[rhs]);
   };

   std::sort(order.begin(), order.end(), smallerMagnitude);
   return order;
 }
 // Sorts _v and sort_vals together by |sort_vals|: increasing order by default,
 // decreasing when reverse is true.
 template<class Field>
 void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse)
 {
   std::vector<int> permutation = basisSortGetIndex(sort_vals);

   if (reverse) {
     std::reverse(permutation.begin(), permutation.end());
   }

   basisReorderInPlace(_v,sort_vals,permutation);
 }
 /////////////////////////////////////////////////////////////
 // Implicitly restarted lanczos
 /////////////////////////////////////////////////////////////
 // Abstract interface used by ImplicitlyRestartedLanczos to judge candidate
 // eigenpairs.  Both methods return non-zero when the candidate is accepted and
 // may overwrite eval through the reference.
 //   j             : index of the candidate
 //   resid         : residual tolerance
 //   evec          : candidate eigenvector
 //   eval          : in/out eigenvalue estimate
 //   evalMaxApprox : approximate largest eigenvalue, available for normalisation
 template<class Field> class ImplicitlyRestartedLanczosTester 
 {
 public:
  // Polymorphic base: a virtual destructor makes destruction through a base
  // pointer well defined.
  virtual ~ImplicitlyRestartedLanczosTester() = default;
  virtual int TestConvergence(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox)=0;
  virtual int ReconstructEval(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox)=0;
 };
 // Selects how the tridiagonal Lanczos matrix is diagonalised.
 enum IRLdiagonalisation {
   IRLdiagonaliseWithDSTEGR = 0,
   IRLdiagonaliseWithQR     = 1,
   IRLdiagonaliseWithEigen  = 2
 };
 // Default convergence tester: applies the supplied operator to a candidate
 // eigenvector B, replaces eval by the Rayleigh quotient <B|H|B>/<B|B> and
 // accepts the candidate when
 //   |H B - eval B|^2 / evalMaxApprox^2  <  eresid^2 .
 // Only a reference to the operator is stored; it must outlive this object.
 template<class Field> class ImplicitlyRestartedLanczosHermOpTester  : public ImplicitlyRestartedLanczosTester<Field>
 {
 public:
  LinearFunction<Field>       &_HermOp;
  ImplicitlyRestartedLanczosHermOpTester(LinearFunction<Field> &HermOp) : _HermOp(HermOp)  {  };
  // Same check as TestConvergence; eval is overwritten with the Rayleigh quotient.
  int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox) override
  {
    return TestConvergence(j,resid,B,eval,evalMaxApprox);
  }
  // Returns 1 if B passes the residual test, 0 otherwise.
  // Side effects: overwrites eval, sets std::cout precision to 13 and logs one line.
  int TestConvergence(int j,RealD eresid,Field &B, RealD &eval,RealD evalMaxApprox) override
  {
    Field v(B);
    RealD eval_poly = eval; // incoming estimate, kept only for the log line
    // Apply operator
    _HermOp(B,v);
    RealD vnum = real(innerProduct(B,v)); // HermOp.
    RealD vden = norm2(B);
    // (The former unused local vv0 = norm2(v) was dropped: it cost an extra
    //  lattice reduction and its value was never read.)
    eval   = vnum/vden;
    v -= eval*B;               // residual vector H B - eval B
    RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);
    std::cout.precision(13);
    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
 	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
 	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
 	     <<std::endl;
    int conv=0;
    if( (vv<eresid*eresid) ) conv = 1;
    return conv;
  }
 };
 template<class Field> 
 class ImplicitlyRestartedLanczos {
 private:
  const RealD small = 1.0e-8;
  int MaxIter;
  int MinRestart; // Minimum number of restarts; only check for convergence after
  int Nstop;   // Number of evecs checked for convergence
  int Nk;      // Number of converged sought
  //  int Np;      // Np -- Number of spare vecs in krylov space //  == Nm - Nk
  int Nm;      // Nm -- total number of vectors
  IRLdiagonalisation diagonalisation;
  int orth_period;
  RealD OrthoTime;
  RealD eresid, betastp;
  ////////////////////////////////
  // Embedded objects
  ////////////////////////////////
  LinearFunction<Field>       &_PolyOp;
  LinearFunction<Field>       &_HermOp;
  ImplicitlyRestartedLanczosTester<Field> &_Tester;
  // Default tester provided (we need a ref to something in default case)
  ImplicitlyRestartedLanczosHermOpTester<Field> SimpleTester;
  /////////////////////////
  // Constructor
  /////////////////////////
 public:       
  //////////////////////////////////////////////////////////////////
  // PAB:
  //////////////////////////////////////////////////////////////////
  // Too many options  & knobs. 
  // Eliminate:
  //   orth_period
  //   betastp
  //   MinRestart
  //
  // Do we really need orth_period
  // What is the theoretical basis & guarantees of betastp ?
  // Nstop=Nk viable?
  // MinRestart avoidable with new convergence test?
  // Could cut to PolyOp, HermOp, Tester, Nk, Nm, resid, maxiter (+diagonalisation)
  // HermOp could be eliminated if we dropped the Power method for max eval.
  // -- also: The eval, eval2, eval2_copy stuff is still unnecessarily unclear
  //////////////////////////////////////////////////////////////////
 // Constructor taking a user-supplied convergence tester.
 // The mem-initialisers are listed in member declaration order, which is the
 // order in which C++ runs them anyway.  SimpleTester is still constructed
 // (from HermOp) even though _Tester refers to the caller's Tester.
 ImplicitlyRestartedLanczos(LinearFunction<Field> & PolyOp,
                            LinearFunction<Field> & HermOp,
                            ImplicitlyRestartedLanczosTester<Field> & Tester,
                            int _Nstop,             // number of eigenvectors checked for convergence
                            int _Nk,                // number of converged vectors sought
                            int _Nm,                // total number of vectors
                            RealD _eresid,          // residual tolerance passed to the tester
                            int _MaxIter,           // maximum number of restart iterations
                            RealD _betastp=0.0,     // beta threshold; its only visible use in calc() is commented out
                            int _MinRestart=1,      // convergence is tested only once iter >= MinRestart
                            int _orth_period = 1,   // step() re-orthogonalises when k % orth_period == 0
                            IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen)
   : MaxIter(_MaxIter),
     MinRestart(_MinRestart),
     Nstop(_Nstop),
     Nk(_Nk),
     Nm(_Nm),
     diagonalisation(_diagonalisation),
     orth_period(_orth_period),
     eresid(_eresid),
     betastp(_betastp),
     _PolyOp(PolyOp),
     _HermOp(HermOp),
     _Tester(Tester),
     SimpleTester(HermOp)
 {
 }
    // Constructor without an explicit tester: _Tester is bound to the embedded
    // SimpleTester, which is built from HermOp.
    // The mem-initialisers are listed in member declaration order, which is the
    // order in which C++ runs them anyway.
    ImplicitlyRestartedLanczos(LinearFunction<Field> & PolyOp,
                               LinearFunction<Field> & HermOp,
                               int _Nstop,             // number of eigenvectors checked for convergence
                               int _Nk,                // number of converged vectors sought
                               int _Nm,                // total number of vectors
                               RealD _eresid,          // residual tolerance passed to the tester
                               int _MaxIter,           // maximum number of restart iterations
                               RealD _betastp=0.0,     // beta threshold; its only visible use in calc() is commented out
                               int _MinRestart=1,      // convergence is tested only once iter >= MinRestart
                               int _orth_period = 1,   // step() re-orthogonalises when k % orth_period == 0
                               IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen)
      : MaxIter(_MaxIter),
        MinRestart(_MinRestart),
        Nstop(_Nstop),
        Nk(_Nk),
        Nm(_Nm),
        diagonalisation(_diagonalisation),
        orth_period(_orth_period),
        eresid(_eresid),
        betastp(_betastp),
        _PolyOp(PolyOp),
        _HermOp(HermOp),
        _Tester(SimpleTester),
        SimpleTester(HermOp)
    {
    }
  ////////////////////////////////
  // Helpers
  ////////////////////////////////
  // Scales v in place by 1/sqrt(norm2(v)) and returns sqrt(norm2(v)), i.e. the
  // length v had before scaling.  No guard against a zero-length input.
  template<typename T>  static RealD normalise(T& v)
  {
    const RealD norm_sq = norm2(v);
    const RealD length  = sqrt(norm_sq);
    v = v * (1.0/length);
    return length;
  }
  // Orthogonalises w against evec[0..k-1] via basisOrthogonalize, normalises it,
  // and adds the elapsed wall time (in seconds) to OrthoTime.
  void orthogonalize(Field& w, std::vector<Field>& evec,int k)
  {
    OrthoTime -= usecond()/1e6;   // subtract start time ...

    basisOrthogonalize(evec,w,k);
    normalise(w);

    OrthoTime += usecond()/1e6;   // ... add end time: net effect is += elapsed
  }
 /* Rudy Arthur's thesis pp.137
 ------------------------
 Require: M > K,  P = M − K
 Compute the factorization  A V_M = V_M H_M + f_M e_M†
 repeat
   Q = I
   for i = 1,...,P do
     Q_i R_i = H_M − θ_i I ;   Q = Q Q_i
     H_M = Q_i† H_M Q_i
   end for
   β_K = H_M(K+1,K) ;   σ_K = Q(M,K)
   r = v_{K+1} β_K + r σ_K
   V_K = V_M(1:M) Q(1:M,1:K)
   H_K = H_M(1:K,1:K)
   → A V_K = V_K H_K + f_K e_K†
   Extend to an M = K + P step factorization  A V_M = V_M H_M + f_M e_M†
 until convergence
 */
  // Runs the implicitly restarted Lanczos iteration.
  //
  //   eval, evec : work space and output.  Both must hold at least Nm entries
  //                on entry (asserted).  On successful return they are resized
  //                to Nconv entries and sorted with basisSortInPlace.
  //   src        : starting vector; it is copied into evec[0] and normalised.
  //   Nconv      : output, number of vectors accepted by the final test.
  //   reverse    : forwarded to basisSortInPlace.
  //
  // If no restart iteration passes the convergence test within MaxIter
  // iterations, an error is logged and abort() is called.
  void calc(std::vector<RealD>& eval, std::vector<Field>& evec,  const Field& src, int& Nconv, bool reverse=false)
  {
    GridBase *grid = src._grid;
    assert(grid == evec[0]._grid);
    GridLogIRL.TimingMode(1);
    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
    std::cout << GridLogIRL <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 /  "<< MaxIter<< std::endl;
    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
    std::cout << GridLogIRL <<" -- seek   Nk    = " << Nk    <<" vectors"<< std::endl;
    std::cout << GridLogIRL <<" -- accept Nstop = " << Nstop <<" vectors"<< std::endl;
    std::cout << GridLogIRL <<" -- total  Nm    = " << Nm    <<" vectors"<< std::endl;
    std::cout << GridLogIRL <<" -- size of eval = " << eval.size() << std::endl;
    std::cout << GridLogIRL <<" -- size of evec = " << evec.size() << std::endl;
    if ( diagonalisation == IRLdiagonaliseWithDSTEGR ) {
      std::cout << GridLogIRL << "Diagonalisation is DSTEGR "<<std::endl;
    } else if ( diagonalisation == IRLdiagonaliseWithQR ) { 
      std::cout << GridLogIRL << "Diagonalisation is QR "<<std::endl;
    }  else if ( diagonalisation == IRLdiagonaliseWithEigen ) { 
      std::cout << GridLogIRL << "Diagonalisation is Eigen "<<std::endl;
    }
    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
    assert(Nm <= evec.size() && Nm <= eval.size());
    // quickly get an idea of the largest eigenvalue to more properly normalize the residuum
    // Power iteration with _HermOp: at most 50 applications, stopping after the
    // first iteration whose Rayleigh quotient is within 5% of the previous one.
    RealD evalMaxApprox = 0.0;
    {
      auto src_n = src;
      auto tmp = src;
      const int _MAX_ITER_IRL_MEVAPP_ = 50;
      for (int i=0;i<_MAX_ITER_IRL_MEVAPP_;i++) {
 	normalise(src_n);
 	_HermOp(src_n,tmp);
 	RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
 	RealD vden = norm2(src_n);
 	RealD na = vnum/vden;
 	// Setting i to the limit ends the loop after this pass completes.
 	if (fabs(evalMaxApprox/na - 1.0) < 0.05)
 	  i=_MAX_ITER_IRL_MEVAPP_;
 	evalMaxApprox = na;
 	std::cout << GridLogIRL << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
 	src_n = tmp;
      }
    }
    std::vector<RealD> lme(Nm);         // off-diagonal (beta) entries filled by step()
    std::vector<RealD> lme2(Nm);        // scratch copy handed to diagonalize()
    std::vector<RealD> eval2(Nm);       // scratch copy of the diagonal handed to diagonalize()
    std::vector<RealD> eval2_copy(Nm);  // snapshot of eval2 taken before it is sorted
    Eigen::MatrixXd Qt = Eigen::MatrixXd::Zero(Nm,Nm);
    Field f(grid);
    Field v(grid);
    int k1 = 1;
    int k2 = Nk;
    RealD beta_k;
    Nconv = 0;
    // Set initial vector
    evec[0] = src;
    normalise(evec[0]);
    // Initial Nk steps
    OrthoTime=0.;
    for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
    std::cout<<GridLogIRL <<"Initial "<< Nk <<"steps done "<<std::endl;
    std::cout<<GridLogIRL <<"Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl;
    //////////////////////////////////
    // Restarting loop begins
    //////////////////////////////////
    int iter;
    for(iter = 0; iter<MaxIter; ++iter){
      OrthoTime=0.;
      std::cout<< GridLogMessage <<" **********************"<< std::endl;
      std::cout<< GridLogMessage <<" Restart iteration = "<< iter << std::endl;
      std::cout<< GridLogMessage <<" **********************"<< std::endl;
      std::cout<<GridLogIRL <<" running "<<Nm-Nk <<" steps: "<<std::endl;
      // Extend the factorisation from Nk to Nm vectors.
      for(int k=Nk; k<Nm; ++k) step(eval,lme,evec,f,Nm,k);
      f *= lme[Nm-1];
      std::cout<<GridLogIRL <<" "<<Nm-Nk <<" steps done "<<std::endl;
      std::cout<<GridLogIRL <<"Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl;
      //////////////////////////////////
      // getting eigenvalues
      //////////////////////////////////
      for(int k=0; k<Nm; ++k){
 	eval2[k] = eval[k+k1-1];
 	lme2[k] = lme[k+k1-1];
      }
      Qt = Eigen::MatrixXd::Identity(Nm,Nm);
      diagonalize(eval2,lme2,Nm,Nm,Qt,grid);
      std::cout<<GridLogIRL <<" diagonalized "<<std::endl;
      //////////////////////////////////
      // sorting
      //////////////////////////////////
      eval2_copy = eval2;
      // Sort in decreasing order; eval2[k2..Nm-1] are then used as the shifts below.
      std::partial_sort(eval2.begin(),eval2.begin()+Nm,eval2.end(),std::greater<RealD>());
      std::cout<<GridLogIRL <<" evals sorted "<<std::endl;
      // Log the first k2 sorted values, eight per line.
      const int chunk=8;
      for(int io=0; io<k2;io+=chunk){
 	std::cout<<GridLogIRL << "eval "<< std::setw(3) << io ;
 	for(int ii=0;ii<chunk;ii++){
 	  if ( (io+ii)<k2 )
 	    std::cout<< " "<< std::setw(12)<< eval2[io+ii];
 	}
 	std::cout << std::endl;
      }
      //////////////////////////////////
      // Implicitly shifted QR transformations
      //////////////////////////////////
      Qt = Eigen::MatrixXd::Identity(Nm,Nm);
      for(int ip=k2; ip<Nm; ++ip){ 
 	QR_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
      }
      std::cout<<GridLogIRL <<"QR decomposed "<<std::endl;
      // NOTE(review): assert(k2<Nm) appears twice on the next line.
      assert(k2<Nm);      assert(k2<Nm);      assert(k1>0);
      basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis
      std::cout<<GridLogIRL <<"basisRotated  by Qt"<<std::endl;
      ////////////////////////////////////////////////////
      // Compressed vector f and beta(k2)
      ////////////////////////////////////////////////////
      f *= Qt(k2-1,Nm-1);
      f += lme[k2-1] * evec[k2];
      beta_k = norm2(f);
      beta_k = sqrt(beta_k);
      std::cout<<GridLogIRL<<" beta(k) = "<<beta_k<<std::endl;
      RealD betar = 1.0/beta_k;
      evec[k2] = betar * f;   // normalised f becomes the next Lanczos vector
      lme[k2-1] = beta_k;
      ////////////////////////////////////////////////////
      // Convergence test
      ////////////////////////////////////////////////////
      for(int k=0; k<Nm; ++k){    
 	eval2[k] = eval[k];
 	lme2[k] = lme[k];
      }
      Qt = Eigen::MatrixXd::Identity(Nm,Nm);
      diagonalize(eval2,lme2,Nk,Nm,Qt,grid);
      std::cout<<GridLogIRL <<" Diagonalized "<<std::endl;
      Nconv = 0;
      if (iter >= MinRestart) {
 	std::cout << GridLogIRL << "Test convergence: rotate subset of vectors to test convergence " << std::endl;
 	Field B(grid); B.checkerboard = evec[0].checkerboard;
 	//  power of two search pattern;  not every evalue in eval2 is assessed.
 	// Indices tested: Nstop-1, Nstop-2, Nstop-4, ... down to at least 0.
 	int allconv =1;
 	for(int jj = 1; jj<=Nstop; jj*=2){
 	  int j = Nstop-jj;
 	  RealD e = eval2_copy[j]; // Discard the evalue
 	  basisRotateJ(B,evec,Qt,j,0,Nk,Nm);	    
 	  if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) {
 	    allconv=0;
 	  }
 	}
 	// Do evec[0] for good measure
 	{ 
 	  int j=0;
 	  RealD e = eval2_copy[0]; 
 	  basisRotateJ(B,evec,Qt,j,0,Nk,Nm);	    
 	  if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) allconv=0;
 	}
 	// All-or-nothing: Nconv is either 0 or Nstop at this point.
 	if ( allconv ) Nconv = Nstop;
 	// test if we converged, if so, terminate
 	std::cout<<GridLogIRL<<" #modes converged: >= "<<Nconv<<"/"<<Nstop<<std::endl;
 	//	if( Nconv>=Nstop || beta_k < betastp){
 	if( Nconv>=Nstop){
 	  goto converged;
 	}
      } else {
 	std::cout << GridLogIRL << "iter < MinRestart: do not yet test for convergence\n";
      } // end of MinRestart test
    } // end of restart (iter) loop
    // Only reached when MaxIter restarts were exhausted without convergence.
    std::cout<<GridLogError<<"\n NOT converged.\n";
    abort();
  converged:
    {
      Field B(grid); B.checkerboard = evec[0].checkerboard;
      // Rotate the first Nk basis vectors in place using Qt from the last diagonalize() call.
      basisRotate(evec,Qt,0,Nk,0,Nk,Nm);	    
      std::cout << GridLogIRL << " Rotated basis"<<std::endl;
      Nconv=0;
      //////////////////////////////////////////////////////////////////////
      // Full final convergence test; unconditionally applied
      //////////////////////////////////////////////////////////////////////
      // NOTE(review): the bound is inclusive (j<=Nk), so evec[Nk] - which the
      // rotation above did not touch - is also tested; confirm this is intended.
      for(int j = 0; j<=Nk; j++){
 	B=evec[j];
 	if( _Tester.ReconstructEval(j,eresid,B,eval2[j],evalMaxApprox) ) {
 	  Nconv++;
 	}
      }
      if ( Nconv < Nstop )
 	std::cout << GridLogIRL << "Nconv ("<<Nconv<<") < Nstop ("<<Nstop<<")"<<std::endl;
      eval=eval2;
      //Keep only converged
      // The first Nconv entries are kept, whichever candidates passed the test.
      eval.resize(Nconv);// Nstop?
      evec.resize(Nconv,grid);// Nstop?
      basisSortInPlace(evec,eval,reverse);
    }
    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
    std::cout << GridLogIRL << "ImplicitlyRestartedLanczos CONVERGED ; Summary :\n";
    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
    std::cout << GridLogIRL << " -- Iterations  = "<< iter   << "\n";
    std::cout << GridLogIRL << " -- beta(k)     = "<< beta_k << "\n";
    std::cout << GridLogIRL << " -- Nconv       = "<< Nconv  << "\n";
    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
  }
 private:
 /* Saad PP. 195
 1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0
 2. For k = 1,2,...,m Do:
 3. wk:=Avk−βkv_{k−1}      
 4. αk:=(wk,vk)       // 
 5. wk:=wk−αkvk       // wk orthog vk 
 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
 7. vk+1 := wk/βk+1
 8. EndDo
 */
  // Perform Lanczos iteration k of the three-term recurrence (Saad, steps 3-7).
  //   lmd  : receives alpha_k (diagonal entry k of the tridiagonal matrix)
  //   lme  : receives beta_{k+1} (off-diagonal entry k); lme[k-1] is read when k>0
  //   evec : Lanczos basis; evec[k] is the input vector, evec[k+1] is written when k < Nm-1
  //   w    : work vector; on exit holds the new (normalised) Lanczos vector
  //   Nm   : basis size bound, k must satisfy k < Nm
  void step(std::vector<RealD>& lmd,
	    std::vector<RealD>& lme, 
	    std::vector<Field>& evec,
	    Field& w,int Nm,int k)
  {
    const RealD tiny = 1.0e-20;
    assert( k< Nm );

    Field& evec_k = evec[k];

    // 3. wk := A vk - beta_k v_{k-1}   (A is whatever _PolyOp applies)
    _PolyOp(evec_k,w);    std::cout<<GridLogIRL << "PolyOp" <<std::endl;
    if(k>0) w -= lme[k-1] * evec[k-1];

    // 4. alpha_k := (wk,vk); only the real part is kept as the diagonal entry
    ComplexD zalph = innerProduct(evec_k,w);
    RealD     alph = real(zalph);

    // 5. wk := wk - alpha_k vk
    w = w - alph * evec_k;

    // 6./7. beta_{k+1} := |wk| and wk := wk / beta_{k+1}
    // (presumably normalise() returns the norm and rescales w in place -- confirm against its definition)
    RealD beta = normalise(w);

    lmd[k] = alph;
    lme[k] = beta;

    // Periodic re-orthogonalisation of w against the basis built so far
    if (k>0 && k % orth_period == 0) {
      orthogonalize(w,evec,k); // orthonormalise
      std::cout<<GridLogIRL << "Orthogonalised " <<std::endl;
    }

    if(k < Nm-1) evec[k+1] = w;

    std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
    // A tiny beta is only reported, the iteration is not stopped here
    if ( beta < tiny ) 
      std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl;
  }
  void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
 			 int Nk, int Nm,  
 			 Eigen::MatrixXd & Qt, // Nm x Nm
 			 GridBase *grid)
  {
    Eigen::MatrixXd TriDiag = Eigen::MatrixXd::Zero(Nk,Nk);
    for(int i=0;i<Nk;i++)   TriDiag(i,i)   = lmd[i];
    for(int i=0;i<Nk-1;i++) TriDiag(i,i+1) = lme[i];
    for(int i=0;i<Nk-1;i++) TriDiag(i+1,i) = lme[i];
    Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> eigensolver(TriDiag);
    for (int i = 0; i < Nk; i++) {
      lmd[Nk-1-i] = eigensolver.eigenvalues()(i);
    }
    for (int i = 0; i < Nk; i++) {
      for (int j = 0; j < Nk; j++) {
 	Qt(Nk-1-i,j) = eigensolver.eigenvectors()(j,i);
      }
    }
  }
  ///////////////////////////////////////////////////////////////////////////
  // File could end here if settle on Eigen ??? !!!
  ///////////////////////////////////////////////////////////////////////////
  // One shifted QR sweep on the symmetric tridiagonal matrix stored as lmd (diagonal)
  // and lme (off-diagonal), carried out as a chain of 2x2 plane rotations.
  //   Dsh        : shift used to build the first rotation
  //   kmin, kmax : 1-based bounds of the active block; rotations act on index pairs
  //                (k,k+1) for k = kmin-1 .. kmax-2
  //   Qt         : each rotation is also applied to rows k and k+1 of Qt, columns 0..Nk-1
  //   Nm         : not used in this routine
  // Statement order matters throughout: each update reads values saved just before it.
  void QR_decomp(std::vector<RealD>& lmd,   // Nm 
 		 std::vector<RealD>& lme,   // Nm 
 		 int Nk, int Nm,            // Nk, Nm
 		 Eigen::MatrixXd& Qt,       // Nm x Nm matrix
 		 RealD Dsh, int kmin, int kmax)
  {
    int k = kmin-1;
    RealD x;
    // First rotation: (c,s) is the normalised pair built from (lmd[k]-Dsh, lme[k]), so c*c+s*s = 1
    RealD Fden = 1.0/hypot(lmd[k]-Dsh,lme[k]);
    RealD c = ( lmd[k] -Dsh) *Fden;
    RealD s = -lme[k] *Fden;
    // Save the 2x2 block before overwriting it
    RealD tmpa1 = lmd[k];
    RealD tmpa2 = lmd[k+1];
    RealD tmpb  = lme[k];
    // Similarity transform of the 2x2 block by the rotation
    lmd[k]   = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
    lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
    lme[k]   = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
    // x is the entry created outside the tridiagonal band by this rotation;
    // the loop below builds each following rotation from it
    x        =-s*lme[k+1];
    lme[k+1] = c*lme[k+1];
    // Accumulate the rotation into rows k, k+1 of Qt
    for(int i=0; i<Nk; ++i){
      RealD Qtmp1 = Qt(k,i);
      RealD Qtmp2 = Qt(k+1,i);
      Qt(k,i)  = c*Qtmp1 - s*Qtmp2;
      Qt(k+1,i)= s*Qtmp1 + c*Qtmp2; 
    }
    // Givens transformations
    // Each step forms (c,s) from (lme[k-1], x), folds x into lme[k-1], and
    // (except on the last step) produces the next out-of-band entry x.
    for(int k = kmin; k < kmax-1; ++k){
      RealD Fden = 1.0/hypot(x,lme[k-1]);
      RealD c = lme[k-1]*Fden;
      RealD s = - x*Fden;
      RealD tmpa1 = lmd[k];
      RealD tmpa2 = lmd[k+1];
      RealD tmpb  = lme[k];
      lmd[k]   = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
      lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
      lme[k]   = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
      lme[k-1] = c*lme[k-1] -s*x;
      // On the final step (k == kmax-2) no new out-of-band entry is generated
      if(k != kmax-2){
 	x = -s*lme[k+1];
 	lme[k+1] = c*lme[k+1];
      }
      // Accumulate the rotation into rows k, k+1 of Qt
      for(int i=0; i<Nk; ++i){
 	RealD Qtmp1 = Qt(k,i);
 	RealD Qtmp2 = Qt(k+1,i);
 	Qt(k,i)     = c*Qtmp1 -s*Qtmp2;
 	Qt(k+1,i)   = s*Qtmp1 +c*Qtmp2;
      }
    }
  }
  // Front end for the tridiagonal eigen-solve: resets Qt to the Nm x Nm identity and
  // forwards to the backend selected by the `diagonalisation` member.
  // An unrecognised selection trips assert(0).
  void diagonalize(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
		   int Nk, int Nm,   
		   Eigen::MatrixXd & Qt,
		   GridBase *grid)
  {
    Qt = Eigen::MatrixXd::Identity(Nm,Nm);

    if ( diagonalisation == IRLdiagonaliseWithDSTEGR ) {
      diagonalize_lapack(lmd,lme,Nk,Nm,Qt,grid);
      return;
    }
    if ( diagonalisation == IRLdiagonaliseWithQR ) {
      diagonalize_QR(lmd,lme,Nk,Nm,Qt,grid);
      return;
    }
    if ( diagonalisation == IRLdiagonaliseWithEigen ) {
      diagonalize_Eigen(lmd,lme,Nk,Nm,Qt,grid);
      return;
    }
    assert(0);
  }
 // Prototype of the tridiagonal eigen-solver called by diagonalize_lapack below.
 // It is only declared when USE_LAPACK is defined; no definition is visible here.
 // NOTE(review): the argument list looks like LAPACK's DSTEGR -- confirm how the
 // LAPACK_dstegr name is mapped to the library symbol in this build.
 #ifdef USE_LAPACK
 void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e,
                    double *vl, double *vu, int *il, int *iu, double *abstol,
                    int *m, double *w, double *z, int *ldz, int *isuppz,
                    double *work, int *lwork, int *iwork, int *liwork,
                    int *info);
 #endif
 // Diagonalise the Nk x Nk symmetric tridiagonal matrix (diagonal lmd, off-diagonal lme)
 // with LAPACK_dstegr, sharing the work between ranks.
 //   * Each rank asks for the index range [il,iu] of eigenpairs (range = 'I'), derived
 //     from grid->_Nprocessors and grid->_processor.
 //   * The rank's results are moved to their global positions in otherwise zero
 //     buffers, and the buffers are summed over ranks with GlobalSumVector.
 //   * Eigenvalues are written back to lmd in reversed order and the matching
 //     eigenvectors to the rows of Qt (see the ordering note near the end).
 // Nm is not used.  Without USE_LAPACK the routine asserts.
 // All scratch storage lives in std::vector (heap) rather than stack arrays, and a
 // non-zero LAPACK `info` is treated as fatal.
 void diagonalize_lapack(std::vector<RealD>& lmd,
			std::vector<RealD>& lme, 
			int Nk, int Nm,  
			Eigen::MatrixXd& Qt,
			GridBase *grid)
 {
 #ifdef USE_LAPACK
  int NN = Nk; // non-const: passed by pointer to LAPACK

  // Zero initialised so that entries not owned by this rank contribute nothing to the global sum
  std::vector<double> evals_tmp(NN,0.0);
  std::vector<double> evec_tmp((size_t)NN*(size_t)NN,0.0); // eigenvector i occupies [i*NN, (i+1)*NN)
  std::vector<double> DD(NN,0.0);
  std::vector<double> EE(NN,0.0);
  for (int i = 0; i< NN;   i++) DD[i] = lmd[i];
  for (int i = 0; i< NN-1; i++) EE[i] = lme[i];

  int evals_found;
  int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ;
  int liwork =  3+NN*10 ;
  std::vector<int>    iwork(liwork);
  std::vector<double> work(lwork);
  std::vector<int>    isuppz(2*NN);
  char jobz  = 'V'; // calculate evals & evecs
  char range = 'I'; // calculate only the il-th through iu-th evals
  int info = 0;

  // Contiguous slice of eigenpair indices (1-based, inclusive) owned by this rank
  int total = grid->_Nprocessors;
  int node  = grid->_processor;
  int interval = (NN/total)+1;
  double vl = 0.0, vu = 0.0;
  int il = interval*node+1 , iu = interval*(node+1);
  if (iu > NN)  iu=NN;
  double tol = 0.0;

  if ( il <= NN){
    LAPACK_dstegr(&jobz, &range, &NN,
		  DD.data(), EE.data(),
		  &vl, &vu, &il, &iu, // vl,vu are not referenced for range 'I'
		  &tol, // tolerance
		  &evals_found, evals_tmp.data(), evec_tmp.data(), &NN,
		  isuppz.data(),
		  work.data(), &lwork, iwork.data(), &liwork,
		  &info);
    if ( info != 0 ) {
      std::cout << GridLogError << "diagonalize_lapack: LAPACK_dstegr failed, info = "<<info<<"\n";
      abort();
    }
    // Results arrive packed at the front; move them to global slots il-1 .. iu-1.
    // Walking downwards means a source slot is always read before it is overwritten or cleared.
    for (int i = iu-1; i>= il-1; i--){
      const int from = i - (il-1);
      evals_tmp[i] = evals_tmp[from];
      if (il>1) evals_tmp[from]=0.;
      for (int j = 0; j< NN; j++){
	evec_tmp[(size_t)i*NN+j] = evec_tmp[(size_t)from*NN+j];
	if (il>1) evec_tmp[(size_t)from*NN+j]=0.;
      }
    }
  }

  // Combine the per-rank slices
  grid->GlobalSumVector(evals_tmp.data(),NN);
  grid->GlobalSumVector(evec_tmp.data(),NN*NN);

  // Safer to sort instead of just reversing it, 
  // but the document of the routine says evals are sorted in increasing order. 
  // qr gives evals in decreasing order.
  for(int i=0;i<NN;i++){
    lmd [NN-1-i]=evals_tmp[i];
    for(int j=0;j<NN;j++){
      Qt((NN-1-i),j)=evec_tmp[(size_t)i*NN+j];
    }
  }
 #else 
  assert(0);
 #endif
 }
 void diagonalize_QR(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
 		    int Nk, int Nm,   
 		    Eigen::MatrixXd & Qt,
 		    GridBase *grid)
 {
  int QRiter = 100*Nm;
  int kmin = 1;
  int kmax = Nk;
  // (this should be more sophisticated)
  for(int iter=0; iter<QRiter; ++iter){
    // determination of 2x2 leading submatrix
    RealD dsub = lmd[kmax-1]-lmd[kmax-2];
    RealD dd = sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]);
    RealD Dsh = 0.5*(lmd[kmax-2]+lmd[kmax-1] +dd*(dsub/fabs(dsub)));
    // (Dsh: shift)
    // transformation
    QR_decomp(lmd,lme,Nk,Nm,Qt,Dsh,kmin,kmax); // Nk, Nm
    // Convergence criterion (redef of kmin and kamx)
    for(int j=kmax-1; j>= kmin; --j){
      RealD dds = fabs(lmd[j-1])+fabs(lmd[j]);
      if(fabs(lme[j-1])+dds > dds){
 	kmax = j+1;
 	goto continued;
      }
    }
    QRiter = iter;
    return;
  continued:
    for(int j=0; j<kmax-1; ++j){
      RealD dds = fabs(lmd[j])+fabs(lmd[j+1]);
      if(fabs(lme[j])+dds > dds){
 	kmin = j+1;
 	break;
      }
    }
  }
  std::cout << GridLogError << "[QL method] Error - Too many iteration: "<<QRiter<<"\n";
  abort();
 }
 };
 }
 #endif
@@ -0,0 +1,406 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/LocalCoherenceLanczos.h
    Copyright (C) 2015
 Author: Christoph Lehner <clehner@bnl.gov>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_LOCAL_COHERENCE_IRL_H
 #define GRID_LOCAL_COHERENCE_IRL_H
 namespace Grid { 
 // Serialisable parameter bundle for one Lanczos run.  The members are declared through
 // GRID_SERIALIZABLE_CLASS_MEMBERS as (type, name) pairs; the per-member notes below are
 // the only description available in this file.
 // NOTE(review): the names mirror the arguments of LocalCoherenceLanczos::calcFine /
 // calcCoarse (Nstop, Nk, Nm, resid, MaxIt, betastp, MinRes) -- presumably they are
 // forwarded there by the caller; confirm at the call site.
 struct LanczosParams : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams,
 				  ChebyParams, Cheby,/*Chebyshev*/
 				  int, Nstop,    /*Vecs in Lanczos must converge Nstop < Nk < Nm*/
 				  int, Nk,       /*Vecs in Lanczos seek converge*/
 				  int, Nm,       /*Total vecs in Lanczos include restart*/
 				  RealD, resid,  /*residual*/
 				  int, MaxIt, 
 				  RealD, betastp,  /* ? */
 				  int, MinRes);    // Must restart
 };
 // Serialisable top-level parameter bundle for a local-coherence Lanczos job: a set of
 // boolean switches, one LanczosParams each for the fine and coarse stages, Chebyshev
 // smoother parameters, a coarse relaxation tolerance, a block size vector, and a few
 // further scalars.
 // NOTE(review): this file does not show how the switches (saveEvecs, doFine, doFineRead,
 // doCoarse, doCoarseRead) or config/omega/mass/M5 are consumed -- their meaning must be
 // confirmed against the driver that reads this struct.
 struct LocalCoherenceLanczosParams : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
 				  bool, saveEvecs,
 				  bool, doFine,
 				  bool, doFineRead,
 				  bool, doCoarse,
 	       			  bool, doCoarseRead,
 				  LanczosParams, FineParams,
 				  LanczosParams, CoarseParams,
 				  ChebyParams,   Smoother,
 				  RealD        , coarse_relax_tol,
 				  std::vector<int>, blockSize,
 				  std::string, config,
 				  std::vector < std::complex<double>  >, omega,
 				  RealD, mass,
 				  RealD, M5);
 };
 // Duplicate functionality; ProjectedFunctionHermOp could be used with the trivial function
 // Coarse-space linear function: promote the coarse vector to the fine grid through
 // `subspace`, apply the fine operator's HermOp, and project the result back.
 // Holds references only; the operator and the subspace must outlive this object.
 template<class Fobj,class CComplex,int nbasis>
 class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
  typedef Lattice<Fobj>          FineField;

  LinearOperatorBase<FineField> &_Linop;
  std::vector<FineField>        &subspace;

  // The subspace must already contain at least one vector: operator() reads
  // the fine grid and checkerboard from subspace[0].
  ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace) : 
    _Linop(linop), subspace(_subspace)
  {  
    assert(subspace.size() >0);
  }

  // out = Project( HermOp( Promote(in) ) )
  void operator()(const CoarseField& in, CoarseField& out) {
    GridBase *fgrid = subspace[0]._grid;
    int       cb    = subspace[0].checkerboard;

    FineField promoted(fgrid);
    promoted.checkerboard = cb;
    FineField applied(fgrid);
    applied.checkerboard = cb;

    blockPromote(in,promoted,subspace);
    std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;

    _Linop.HermOp(promoted,applied);
    std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;

    blockProject(out,applied,subspace);
    std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
  }
 };
 // Coarse-space linear function: promote the coarse vector to the fine grid through
 // `subspace`, apply the operator function `_poly` of the fine operator, and project
 // the result back.  Holds references only; all three referents must outlive this object.
 template<class Fobj,class CComplex,int nbasis>
 class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
  typedef Lattice<Fobj>          FineField;

  OperatorFunction<FineField>   & _poly;
  LinearOperatorBase<FineField> &_Linop;
  std::vector<FineField>        &subspace;

  // No size check is made here; operator() reads subspace[0], so the subspace
  // must be populated before the first application.
  ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,
			  LinearOperatorBase<FineField>& linop, 
			  std::vector<FineField> & _subspace) :
    _poly(poly),
    _Linop(linop),
    subspace(_subspace)
  {  }

  // out = Project( poly(Linop)( Promote(in) ) )
  void operator()(const CoarseField& in, CoarseField& out) {
    GridBase *fgrid = subspace[0]._grid;
    int       cb    = subspace[0].checkerboard;

    FineField promoted(fgrid);
    promoted.checkerboard = cb;
    FineField applied(fgrid);
    applied.checkerboard = cb;

    blockPromote(in,promoted,subspace);
    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;

    _poly(_Linop,promoted,applied);
    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl;

    blockProject(out,applied,subspace);
    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
  }
 };
 // Convergence tester for a coarse-grid Lanczos.
 //   TestConvergence : cheap test entirely on the coarse grid, using _Poly.
 //   ReconstructEval : promotes the coarse vector to the fine grid, applies _smoother,
 //                     then measures eigenvalue and residual with the fine HermOp.
 // Both return 1 when the squared residual, divided by evalMaxApprox^2, is below the
 // squared tolerance, and 0 otherwise; both overwrite `eval` with the Rayleigh quotient.
 // Holds references only; the referents must outlive this object.
 template<class Fobj,class CComplex,int nbasis>
 class ImplicitlyRestartedLanczosSmoothedTester  : public ImplicitlyRestartedLanczosTester<Lattice<iVector<CComplex,nbasis > > >
 {
 public:
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
  typedef Lattice<Fobj>          FineField;

  LinearFunction<CoarseField> & _Poly;
  OperatorFunction<FineField>   & _smoother;
  LinearOperatorBase<FineField> &_Linop;
  RealD                          _coarse_relax_tol;
  std::vector<FineField>        &_subspace;

  // Initialisers are listed in member declaration order, which is the order
  // in which they actually run.
  ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField>   &Poly,
					   OperatorFunction<FineField>   &smoother,
					   LinearOperatorBase<FineField> &Linop,
					   std::vector<FineField>        &subspace,
					   RealD coarse_relax_tol=5.0e3) 
    : _Poly(Poly), _smoother(smoother), _Linop(Linop),
      _coarse_relax_tol(coarse_relax_tol), _subspace(subspace)
  {    };

  // Coarse-grid test of vector B (index j is used for logging only).
  // eval is replaced by (B, Poly B)/(B,B).
  int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
  {
    CoarseField v(B);
    RealD eval_poly = eval; // incoming estimate, kept for the log line

    // Apply operator
    _Poly(B,v);

    RealD vnum = real(innerProduct(B,v)); // HermOp.
    RealD vden = norm2(B);
    eval   = vnum/vden;
    v -= eval*B; // residual vector

    RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);

    std::cout.precision(13);
    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
	     <<std::endl;

    int conv=0;
    if( (vv<eresid*eresid) ) conv = 1;
    return conv;
  }

  // Fine-grid test of coarse vector B: promote, smooth, then apply the fine HermOp.
  // eval is replaced by the fine-grid Rayleigh quotient of the smoothed vector.
  int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
  {
    GridBase *FineGrid = _subspace[0]._grid;    
    int checkerboard   = _subspace[0].checkerboard;
    FineField fB(FineGrid);fB.checkerboard =checkerboard;
    FineField fv(FineGrid);fv.checkerboard =checkerboard;

    blockPromote(B,fv,_subspace);  
    _smoother(_Linop,fv,fB); 

    RealD eval_poly = eval; // incoming estimate, kept for the log line
    _Linop.HermOp(fB,fv);

    RealD vnum = real(innerProduct(fB,fv)); // HermOp.
    RealD vden = norm2(fB);
    eval   = vnum/vden;
    fv -= eval*fB; // residual vector

    RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0);

    std::cout.precision(13);
    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
	     <<std::endl;

    // Vectors beyond the basis size are tested against a relaxed tolerance.
    // NOTE(review): the boundary here is j > nbasis, so index j == nbasis is NOT relaxed;
    // confirm whether j >= nbasis was intended.
    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
    if( (vv<eresid*eresid) ) return 1;
    return 0;
  }
 };
 ////////////////////////////////////////////
 // Make serializable Lanczos params
 ////////////////////////////////////////////
 // Two-level Lanczos driver: calcFine() builds a fine-grid eigen-basis (`subspace`,
 // nbasis vectors) and calcCoarse() then runs a Lanczos on the coarse grid using the
 // fine operator projected through that subspace.  testFine()/testCoarse() re-check
 // stored eigenpairs.
 //
 // Storage is accessed through the reference members evals_fine / evals_coarse /
 // subspace / evec_coarse.  The first constructor binds them to the object's own
 // private vectors, the second to caller-supplied vectors.
 // NOTE(review): with the first constructor a copy of this object would keep
 // references into the original's private storage -- avoid copying.
 template<class Fobj,class CComplex,int nbasis>
 class LocalCoherenceLanczos 
 {
 public:
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CComplex>                   CoarseScalar; // used for inner products on fine field
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<Fobj>                       FineField;
 protected:
  GridBase *_CoarseGrid;
  GridBase *_FineGrid;
  int _checkerboard;
  LinearOperatorBase<FineField>                 & _FineOp;
  std::vector<RealD>                              &evals_fine;
  std::vector<RealD>                              &evals_coarse; 
  std::vector<FineField>                          &subspace;
  std::vector<CoarseField>                        &evec_coarse;
 private:
  std::vector<RealD>                              _evals_fine;
  std::vector<RealD>                              _evals_coarse; 
  std::vector<FineField>                          _subspace;
  std::vector<CoarseField>                        _evec_coarse;
 public:
  //////////////////////////////////////////////////////////////////////////
  // Constructor using internal storage.
  // Initialisers are listed in member declaration order.  The references are bound
  // to private members declared later; they are only used in the body, after
  // all members have been constructed.
  //////////////////////////////////////////////////////////////////////////
  LocalCoherenceLanczos(GridBase *FineGrid,
			GridBase *CoarseGrid,
			LinearOperatorBase<FineField> &FineOp,
			int checkerboard) :
    _CoarseGrid(CoarseGrid),
    _FineGrid(FineGrid),
    _checkerboard(checkerboard),
    _FineOp(FineOp),
    evals_fine  (_evals_fine),
    evals_coarse(_evals_coarse),
    subspace    (_subspace),
    evec_coarse(_evec_coarse)
  {
    evals_fine.resize(0);
    evals_coarse.resize(0);
  };
  //////////////////////////////////////////////////////////////////////////
  // Alternate constructor, external storage for use by Hadrons module
  //////////////////////////////////////////////////////////////////////////
  LocalCoherenceLanczos(GridBase *FineGrid,
			GridBase *CoarseGrid,
			LinearOperatorBase<FineField> &FineOp,
			int checkerboard,
			std::vector<FineField>   &ext_subspace,
			std::vector<CoarseField> &ext_coarse,
			std::vector<RealD>       &ext_eval_fine,
			std::vector<RealD>       &ext_eval_coarse
			) :
    _CoarseGrid(CoarseGrid),
    _FineGrid(FineGrid),
    _checkerboard(checkerboard),
    _FineOp(FineOp),
    evals_fine  (ext_eval_fine), 
    evals_coarse(ext_eval_coarse),
    subspace    (ext_subspace),
    evec_coarse (ext_coarse)
  {
    evals_fine.resize(0);
    evals_coarse.resize(0);
  };

  // Block-orthogonalise the fine subspace; two passes are applied.
  void Orthogonalise(void ) {
    CoarseScalar InnerProd(_CoarseGrid);
    std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
    blockOrthogonalise(InnerProd,subspace);
    std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
    blockOrthogonalise(InnerProd,subspace);
  };

  // Scale v to unit 2-norm in place and return the norm it had.
  template<typename T>  static RealD normalise(T& v) 
  {
    RealD nn = norm2(v);
    nn = ::sqrt(nn);
    v = v * (1.0/nn);
    return nn;
  }
  /*
  void fakeFine(void)
  {
    int Nk = nbasis;
    subspace.resize(Nk,_FineGrid);
    subspace[0]=1.0;
    subspace[0].checkerboard=_checkerboard;
    normalise(subspace[0]);
    PlainHermOp<FineField>    Op(_FineOp);
    for(int k=1;k<Nk;k++){
      subspace[k].checkerboard=_checkerboard;
      Op(subspace[k-1],subspace[k]);
      normalise(subspace[k]);
    }
  }
  */

  // Re-test every stored fine eigenpair against `resid`; each must pass.
  // ReconstructEval also refreshes evals_fine[k], so the call is made outside
  // assert() to keep that side effect when NDEBUG is defined.
  void testFine(RealD resid) 
  {
    assert(evals_fine.size() == nbasis);
    assert(subspace.size() == nbasis);
    PlainHermOp<FineField>    Op(_FineOp);
    ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op);
    for(int k=0;k<nbasis;k++){
      int ok = SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0);
      assert(ok==1); (void)ok;
    }
  }

  // Re-test every stored coarse eigenpair with the smoothed tester; each must pass.
  // Vectors with index >= nbasis are passed resid*relax (the tester may relax further).
  // As in testFine, the call is kept outside assert() because it updates evals_coarse[k].
  void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax) 
  {
    assert(evals_fine.size() == nbasis);
    assert(subspace.size() == nbasis);
    //////////////////////////////////////////////////////////////////////////////////////////////////
    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
    //////////////////////////////////////////////////////////////////////////////////////////////////
    Chebyshev<FineField>                          ChebySmooth(cheby_smooth);
    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,subspace);
    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);

    for(int k=0;k<(int)evec_coarse.size();k++){
      RealD tol = ( k < nbasis ) ? resid : resid*relax;
      int ok = ChebySmoothTester.ReconstructEval(k,tol,evec_coarse[k],evals_coarse[k],1.0);
      assert(ok==1); (void)ok;
    }
  }

  // Fine-grid Lanczos: Chebyshev-accelerated IRL on _FineOp starting from a constant
  // source; afterwards only the first nbasis eigenpairs are kept.
  // Requires nbasis <= Nm, Nstop >= nbasis, and at least nbasis converged vectors.
  void calcFine(ChebyParams cheby_parms,int Nstop,int Nk,int Nm,RealD resid, 
		RealD MaxIt, RealD betastp, int MinRes)
  {
    assert(nbasis<=Nm);
    Chebyshev<FineField>      Cheby(cheby_parms);
    FunctionHermOp<FineField> ChebyOp(Cheby,_FineOp);
    PlainHermOp<FineField>    Op(_FineOp);

    evals_fine.resize(Nm);
    subspace.resize(Nm,_FineGrid);

    ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);

    FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard;

    int Nconv=0;
    IRL.calc(evals_fine,subspace,src,Nconv,false);
    
    // Shrink down to number saved
    assert(Nstop>=nbasis);
    assert(Nconv>=nbasis);
    evals_fine.resize(nbasis);
    subspace.resize(nbasis,_FineGrid);
  }

  // Coarse-grid Lanczos on the Chebyshev-filtered projected operator, with the
  // smoothed tester deciding convergence; afterwards the first Nstop eigenpairs are
  // kept and their eigenvalues printed.  Requires at least Nstop converged vectors.
  void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
		  int Nstop, int Nk, int Nm,RealD resid, 
		  RealD MaxIt, RealD betastp, int MinRes)
  {
    Chebyshev<FineField>                          Cheby(cheby_op);
    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace);
    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
    //////////////////////////////////////////////////////////////////////////////////////////////////
    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
    //////////////////////////////////////////////////////////////////////////////////////////////////
    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth);
    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);

    evals_coarse.resize(Nm);
    evec_coarse.resize(Nm,_CoarseGrid);

    CoarseField src(_CoarseGrid);     src=1.0; 

    ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
    int Nconv=0;
    IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
    assert(Nconv>=Nstop);
    evals_coarse.resize(Nstop);
    evec_coarse.resize (Nstop,_CoarseGrid);
    for (int i=0;i<Nstop;i++){
      std::cout << i << " Coarse eval = " << evals_coarse[i]  << std::endl;
    }
  }
 };
 }
 #endif
@@ -0,0 +1,186 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/SchurRedBlack.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #pragma once
 namespace Grid {
 namespace QCD {
 template<class Field>
 class PauliVillarsSolverUnprec
 {
 public:
  ConjugateGradient<Field> & CG;
  PauliVillarsSolverUnprec(  ConjugateGradient<Field> &_CG) : CG(_CG){};
  template<class Matrix>
  void operator() (Matrix &_Matrix,const Field &src,Field &sol)
  {
    RealD m = _Matrix.Mass();
    Field A  (_Matrix.FermionGrid());
    MdagMLinearOperator<Matrix,Field> HermOp(_Matrix);
    _Matrix.SetMass(1.0);
    _Matrix.Mdag(src,A);
    CG(HermOp,A,sol);
    _Matrix.SetMass(m);
  };
 };
 // Pauli-Villars inverse using the red-black preconditioned solver: with the matrix
 // mass temporarily set to 1.0, a SchurRedBlackDiagMooeeSolve built on CG solves for
 // sol given src.  The caller's mass is restored before returning.
 template<class Field>
 class PauliVillarsSolverRBprec
 {
 public:
  ConjugateGradient<Field> & CG;
  PauliVillarsSolverRBprec(  ConjugateGradient<Field> &_CG) : CG(_CG){};

  template<class Matrix>
  void operator() (Matrix &_Matrix,const Field &src,Field &sol)
  {
    // Remember the mass so it can be put back afterwards
    RealD m = _Matrix.Mass();

    // No temporary field is needed here: the Schur solver takes src and sol directly.
    _Matrix.SetMass(1.0);
    SchurRedBlackDiagMooeeSolve<Field> SchurSolver(CG);
    SchurSolver(_Matrix,src,sol);
    _Matrix.SetMass(m);
  };
 };
 template<class Field,class PVinverter> class Reconstruct5DfromPhysical {
 private:
  PVinverter & PauliVillarsSolver;
 public:
 /////////////////////////////////////////////////////
 // First cut works, 10 Oct 2018.
 //
 // Must form a plan to get this into production for Zmobius acceleration
 // of the Mobius exact AMA corrections.
 //
 // TODO : understand absence of contact term in eqns in Hantao's thesis
 //        sol4 is contact term subtracted.
 //
 // Options
 // a) Defect correction approach:
 //    1) Compute defect from current soln (initially guess).
 //       This is ...... outerToInner check !!!!
 //    2) Deflated Zmobius solve to get 4d soln
 //       Ensure deflation is working
 //    3) Refine 5d Outer using the inner 4d delta soln
 // 
 // Step 1: localise PV inverse in a routine. [DONE]
 // Step 2: Schur based PV inverse            [DONE]
 // Step 3: Fourier accelerated PV inverse
 // Step 4: 
 /////////////////////////////////////////////////////
  Reconstruct5DfromPhysical(PVinverter &_PauliVillarsSolver) 
    : PauliVillarsSolver(_PauliVillarsSolver) 
  { 
  };
   template<class Matrix>
   void PV(Matrix &_Matrix,const Field &src,Field &sol)
   {
     RealD m = _Matrix.Mass();
     _Matrix.SetMass(1.0);
     _Matrix.M(src,sol);
     _Matrix.SetMass(m);
   }
   template<class Matrix>
   void PVdag(Matrix &_Matrix,const Field &src,Field &sol)
   {
     RealD m = _Matrix.Mass();
     _Matrix.SetMass(1.0);
     _Matrix.Mdag(src,sol);
     _Matrix.SetMass(m);
   }
  template<class Matrix>
  void operator() (Matrix & _Matrix,const Field &sol4,const Field &src4, Field &sol5){
    int Ls =  _Matrix.Ls;
    Field psi4(_Matrix.GaugeGrid());
    Field psi(_Matrix.FermionGrid());
    Field A  (_Matrix.FermionGrid());
    Field B  (_Matrix.FermionGrid());
    Field c  (_Matrix.FermionGrid());
    typedef typename Matrix::Coeff_t Coeff_t;
    std::cout << GridLogMessage<< " ************************************************" << std::endl;
    std::cout << GridLogMessage<< " Reconstruct5Dprop: c.f. MADWF algorithm         " << std::endl;
    std::cout << GridLogMessage<< " ************************************************" << std::endl;
    ///////////////////////////////////////
    //Import source, include Dminus factors
    ///////////////////////////////////////
    _Matrix.ImportPhysicalFermionSource(src4,B); 
    ///////////////////////////////////////
    // Set up c from src4
    ///////////////////////////////////////
    PauliVillarsSolver(_Matrix,B,A);
    _Matrix.Pdag(A,c);
    //////////////////////////////////////
    // Build Pdag PV^-1 Dm P [-sol4,c2,c3... cL]
    //////////////////////////////////////
    psi4 = - sol4;
    InsertSlice(psi4, psi, 0   , 0);
    for (int s=1;s<Ls;s++) {
      ExtractSlice(psi4,c,s,0);
       InsertSlice(psi4,psi,s,0);
    }
    /////////////////////////////
    // Pdag PV^-1 Dm P 
    /////////////////////////////
    _Matrix.P(psi,B);
    _Matrix.M(B,A);
    PauliVillarsSolver(_Matrix,A,B);
    _Matrix.Pdag(B,A);
    //////////////////////////////
    // Reinsert surface prop
    //////////////////////////////
    InsertSlice(sol4,A,0,0);
    //////////////////////////////
    // Convert from y back to x 
    //////////////////////////////
    _Matrix.P(A,sol5);
  }
 };
 }
 }
@@ -0,0 +1,503 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/SchurRedBlack.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_SCHUR_RED_BLACK_H
 #define GRID_SCHUR_RED_BLACK_H
  /*
   * Red black Schur decomposition
   *
   *  M = (Mee Meo) =  (1             0 )   (Mee   0               )  (1 Mee^{-1} Meo)
   *      (Moe Moo)    (Moe Mee^-1    1 )   (0   Moo-Moe Mee^-1 Meo)  (0   1         )
   *                =         L                     D                     U
   *
   * L^-1 = (1              0 )
   *        (-MoeMee^{-1}   1 )   
   * L^{dag} = ( 1       Mee^{-dag} Moe^{dag} )
   *           ( 0       1                    )
   * L^{-d}  = ( 1      -Mee^{-dag} Moe^{dag} )
   *           ( 0       1                    )
   *
   * U^-1 = (1   -Mee^{-1} Meo)
   *        (0    1           )
   * U^{dag} = ( 1                 0)
   *           (Meo^dag Mee^{-dag} 1)
   * U^{-dag} = (  1                 0)
   *            (-Meo^dag Mee^{-dag} 1)
   ***********************
   *     M psi = eta
   ***********************
   *Odd
   * i)                 D_oo psi_o =  L^{-1}  eta_o
   *                        eta_o' = (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
   *
   * Wilson:
   *      (D_oo)^{\dag} D_oo psi_o = (D_oo)^dag L^{-1}  eta_o
   * Stag:
   *      D_oo psi_o = L^{-1}  eta =    (eta_o - Moe Mee^{-1} eta_e)
   *
   * L^-1 eta  = (1              0 ) (eta_e)  =  (eta_e                     )
   *             (-MoeMee^{-1}   1 ) (eta_o)     (eta_o - Moe Mee^{-1} eta_e)
   *
   *Even
   * ii)  Mee psi_e + Meo psi_o = src_e
   *
   *   => sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
   *
   * 
   * TODO: Other options:
   * 
   * a) change checkerboards for Schur e<->o
   *
   * Left precon by Moo^-1
   * b) Doo^{dag} M_oo^-dag Moo^-1 Doo psi_0 =  (D_oo)^dag M_oo^-dag Moo^-1 L^{-1}  eta_o
   *                              eta_o'     = (D_oo)^dag  M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
   *
   * Right precon by Moo^-1
   * c) M_oo^-dag Doo^{dag} Doo Moo^-1 phi_0 = M_oo^-dag (D_oo)^dag L^{-1}  eta_o
   *                              eta_o'     = M_oo^-dag (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
   *                              psi_o = M_oo^-1 phi_o
   * TODO: Deflation 
   */
 namespace Grid {
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Take a matrix and form a Red Black solver calling a Herm solver
  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Now make the norm reflect extra factor of Mee
  // Red-black (even/odd) Schur solver wrapper using SchurStaggeredOperator.
  //
  // operator() splits "in" into even/odd checkerboards and then
  //   1. forms the odd source  Mooee (src_o - Meooe MooeeInv src_e),
  //   2. hands it to the wrapped solver together with SchurStaggeredOperator(_Matrix),
  //   3. rebuilds the even solution as  MooeeInv (src_e - Meooe sol_o),
  //   4. writes both checkerboards into "out" and, unless the guess is being
  //      subtracted, prints the true unpreconditioned residual |M out - in|/|in|.
  template<class Field> class SchurRedBlackStaggeredSolve {
  private:
    OperatorFunction<Field> & _HermitianRBSolver; // wrapped solver, held by reference
    int CBfactorise;                              // stored only; not yet used (see FIXME below)
    bool subGuess;                                // subtract the initial guess from the odd solution?
  public:
    /////////////////////////////////////////////////////
    // Wrap the usual normal equations Schur trick
    /////////////////////////////////////////////////////
    SchurRedBlackStaggeredSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  :
      _HermitianRBSolver(HermitianRBSolver),
      CBfactorise(0),
      subGuess(initSubGuess)
    {
    }
    // Switch subtraction of the initial guess on or off.
    void subtractGuess(const bool initSubGuess)
    {
      subGuess = initSubGuess;
    }
    // Current guess-subtraction setting; const so it can be queried on a const solver.
    bool isSubtractGuess(void) const
    {
      return subGuess;
    }
    // Convenience overload: forwards to the guessed overload with a ZeroGuesser<Field>.
    template<class Matrix>
    void operator() (Matrix & _Matrix,const Field &in, Field &out){
      ZeroGuesser<Field> guess;
      (*this)(_Matrix,in,out,guess);
    }
    // Solve for "out" given source "in"; guess(src_o,sol_o) supplies the
    // starting vector for the odd-checkerboard solve.
    template<class Matrix, class Guesser>
    void operator() (Matrix & _Matrix,const Field &in, Field &out, Guesser &guess){
      // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
      GridBase *grid = _Matrix.RedBlackGrid();
      GridBase *fgrid= _Matrix.Grid();
      SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
      Field src_e(grid);
      Field src_o(grid);
      Field sol_e(grid);
      Field sol_o(grid);
      Field   tmp(grid);
      Field  Mtmp(grid);
      Field resid(fgrid);
      std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve " <<std::endl;
      pickCheckerboard(Even,src_e,in);
      pickCheckerboard(Odd ,src_o,in);
      pickCheckerboard(Even,sol_e,out);
      pickCheckerboard(Odd ,sol_o,out);
      std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve checkerboards picked" <<std::endl;
      /////////////////////////////////////////////////////
      // src_o = (source_o - Moe MeeInv source_e)
      /////////////////////////////////////////////////////
      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);
      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);
      _Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
      //////////////////////////////////////////////////////////////
      // Call the red-black solver
      //////////////////////////////////////////////////////////////
      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl;
      guess(src_o, sol_o);
      // The guess is only needed again if it is subtracted below; skip the field copy otherwise.
      if (subGuess)        Mtmp = sol_o;
      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver called  the Mpc solver" <<std::endl;
      // Fionn A2A boolean behavioural control
      if (subGuess)        sol_o = sol_o-Mtmp;
      ///////////////////////////////////////////////////
      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
      ///////////////////////////////////////////////////
      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver reconstructed other CB" <<std::endl;
      setCheckerboard(out,sol_e); assert(  sol_e.checkerboard ==Even);
      setCheckerboard(out,sol_o); assert(  sol_o.checkerboard ==Odd );
      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver inserted solution" <<std::endl;
      // Verify the unprec residual
      if ( ! subGuess ) {
        _Matrix.M(out,resid);
        resid = resid-in;
        RealD ns = norm2(in);
        RealD nr = norm2(resid);
        std::cout<<GridLogMessage << "SchurRedBlackStaggered solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
      } else {
        // "out" no longer solves M out = in once the guess has been removed, so no residual check.
        std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
      }
    }
  };
  // Shorter alias for SchurRedBlackStaggeredSolve<Field>.
  template<class Field> using SchurRedBlackStagSolve = SchurRedBlackStaggeredSolve<Field>;
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Take a matrix and form a Red Black solver calling a Herm solver
  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Red-black (even/odd) Schur solver wrapper using SchurDiagMooeeOperator.
  //
  // operator() splits "in" into even/odd checkerboards and then
  //   1. forms the odd source  MpcDag (src_o - Meooe MooeeInv src_e),
  //   2. hands it to the wrapped solver together with SchurDiagMooeeOperator(_Matrix),
  //   3. rebuilds the even solution as  MooeeInv (src_e - Meooe sol_o),
  //   4. writes both checkerboards into "out" and, unless the guess is being
  //      subtracted, prints the true unpreconditioned residual |M out - in|/|in|.
  template<class Field> class SchurRedBlackDiagMooeeSolve {
  private:
    OperatorFunction<Field> & _HermitianRBSolver; // wrapped solver, held by reference
    int CBfactorise;                              // stored only; not yet used (see FIXME below)
    bool subGuess;                                // subtract the initial guess from the odd solution?
  public:
    /////////////////////////////////////////////////////
    // Wrap the usual normal equations Schur trick
    /////////////////////////////////////////////////////
    SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver,int cb=0, const bool initSubGuess = false)  :
      _HermitianRBSolver(HermitianRBSolver),
      CBfactorise(cb),
      subGuess(initSubGuess)
    {
    }
    // Switch subtraction of the initial guess on or off.
    void subtractGuess(const bool initSubGuess)
    {
      subGuess = initSubGuess;
    }
    // Current guess-subtraction setting; const so it can be queried on a const solver.
    bool isSubtractGuess(void) const
    {
      return subGuess;
    }
    // Convenience overload: forwards to the guessed overload with a ZeroGuesser<Field>.
    template<class Matrix>
    void operator() (Matrix & _Matrix,const Field &in, Field &out){
      ZeroGuesser<Field> guess;
      (*this)(_Matrix,in,out,guess);
    }
    // Solve for "out" given source "in"; guess(src_o,sol_o) supplies the
    // starting vector for the odd-checkerboard solve.
    template<class Matrix, class Guesser>
    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
      // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
      GridBase *grid = _Matrix.RedBlackGrid();
      GridBase *fgrid= _Matrix.Grid();
      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
      Field src_e(grid);
      Field src_o(grid);
      Field sol_e(grid);
      Field sol_o(grid);
      Field   tmp(grid);
      Field  Mtmp(grid);
      Field resid(fgrid);
      pickCheckerboard(Even,src_e,in);
      pickCheckerboard(Odd ,src_o,in);
      pickCheckerboard(Even,sol_e,out);
      pickCheckerboard(Odd ,sol_o,out);
      /////////////////////////////////////////////////////
      // src_o = Mdag * (source_o - Moe MeeInv source_e)
      /////////////////////////////////////////////////////
      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);
      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);
      // get the right MpcDag
      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);
      //////////////////////////////////////////////////////////////
      // Call the red-black solver
      //////////////////////////////////////////////////////////////
      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
      guess(src_o,sol_o);
      // The guess is only needed again if it is subtracted below; skip the field copy otherwise.
      if (subGuess)        Mtmp = sol_o;
      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
      // Fionn A2A boolean behavioural control
      if (subGuess)        sol_o = sol_o-Mtmp;
      ///////////////////////////////////////////////////
      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
      ///////////////////////////////////////////////////
      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
      setCheckerboard(out,sol_e); assert(  sol_e.checkerboard ==Even);
      setCheckerboard(out,sol_o); assert(  sol_o.checkerboard ==Odd );
      // Verify the unprec residual
      if ( ! subGuess ) {
        _Matrix.M(out,resid);
        resid = resid-in;
        RealD ns = norm2(in);
        RealD nr = norm2(resid);
        std::cout<<GridLogMessage << "SchurRedBlackDiagMooee solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
      } else {
        // "out" no longer solves M out = in once the guess has been removed, so no residual check.
        std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
      }
    }
  };
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Take a matrix and form a Red Black solver calling a Herm solver
  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Red-black (even/odd) Schur solver wrapper using SchurDiagTwoOperator.
  //
  // operator() splits "in" into even/odd checkerboards and then
  //   1. forms the odd source  MpcDag (src_o - Meooe MooeeInv src_e),
  //   2. hands it to the wrapped solver together with SchurDiagTwoOperator(_Matrix);
  //      the solver works on a scratch field, and the odd solution is obtained
  //      from its result by a further MooeeInv,
  //   3. rebuilds the even solution as  MooeeInv (src_e - Meooe sol_o),
  //   4. writes both checkerboards into "out" and, unless the guess is being
  //      subtracted, prints the true unpreconditioned residual |M out - in|/|in|.
  template<class Field> class SchurRedBlackDiagTwoSolve {
  private:
    OperatorFunction<Field> & _HermitianRBSolver; // wrapped solver, held by reference
    int CBfactorise;                              // stored only; not yet used (see FIXME below)
    bool subGuess;                                // subtract the initial guess from the solver result?
  public:
    /////////////////////////////////////////////////////
    // Wrap the usual normal equations Schur trick
    /////////////////////////////////////////////////////
    SchurRedBlackDiagTwoSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  :
      _HermitianRBSolver(HermitianRBSolver),
      CBfactorise(0),
      subGuess(initSubGuess)
    {
    }
    // Switch subtraction of the initial guess on or off.
    void subtractGuess(const bool initSubGuess)
    {
      subGuess = initSubGuess;
    }
    // Current guess-subtraction setting; const so it can be queried on a const solver.
    bool isSubtractGuess(void) const
    {
      return subGuess;
    }
    // Convenience overload: forwards to the guessed overload with a ZeroGuesser<Field>.
    template<class Matrix>
    void operator() (Matrix & _Matrix,const Field &in, Field &out){
      ZeroGuesser<Field> guess;
      (*this)(_Matrix,in,out,guess);
    }
    // Solve for "out" given source "in"; guess(src_o,tmp) supplies the
    // starting vector for the odd-checkerboard solve.
    template<class Matrix,class Guesser>
    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
      // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
      GridBase *grid = _Matrix.RedBlackGrid();
      GridBase *fgrid= _Matrix.Grid();
      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
      Field src_e(grid);
      Field src_o(grid);
      Field sol_e(grid);
      Field sol_o(grid);
      Field   tmp(grid);
      Field  Mtmp(grid);
      Field resid(fgrid);
      pickCheckerboard(Even,src_e,in);
      pickCheckerboard(Odd ,src_o,in);
      pickCheckerboard(Even,sol_e,out);
      pickCheckerboard(Odd ,sol_o,out);
      /////////////////////////////////////////////////////
      // src_o = Mdag * (source_o - Moe MeeInv source_e)
      /////////////////////////////////////////////////////
      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);
      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);
      // get the right MpcDag
      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);
      //////////////////////////////////////////////////////////////
      // Call the red-black solver
      //////////////////////////////////////////////////////////////
      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
      guess(src_o,tmp);
      // The guess is only needed again if it is subtracted below; skip the field copy otherwise.
      if (subGuess)      Mtmp = tmp;
      _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd);
      // Fionn A2A boolean behavioural control
      if (subGuess)      tmp = tmp-Mtmp;
      // The solver result lives in tmp; the odd solution is MooeeInv applied to it.
      _Matrix.MooeeInv(tmp,sol_o);       assert(  sol_o.checkerboard   ==Odd);
      ///////////////////////////////////////////////////
      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
      ///////////////////////////////////////////////////
      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
      setCheckerboard(out,sol_e); assert(  sol_e.checkerboard ==Even);
      setCheckerboard(out,sol_o); assert(  sol_o.checkerboard ==Odd );
      // Verify the unprec residual
      if ( ! subGuess ) {
        _Matrix.M(out,resid);
        resid = resid-in;
        RealD ns = norm2(in);
        RealD nr = norm2(resid);
        std::cout<<GridLogMessage << "SchurRedBlackDiagTwo solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
      } else {
        // "out" no longer solves M out = in once the guess has been removed, so no residual check.
        std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
      }
    }
  };
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Take a matrix and form a Red Black solver calling a Herm solver
  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Red-black (even/odd) Schur solver wrapper using SchurDiagTwoOperator,
  // driven by a LinearFunction<Field> rather than an OperatorFunction<Field>.
  //
  // operator() splits "in" into even/odd checkerboards and then
  //   1. forms the odd source  MpcDag (src_o - Meooe MooeeInv src_e),
  //   2. hands it to the wrapped solver together with SchurDiagTwoOperator(_Matrix);
  //      the solver works on a scratch field, and the odd solution is obtained
  //      from its result by a further MooeeInv,
  //   3. rebuilds the even solution as  MooeeInv (src_e - Meooe sol_o),
  //   4. writes both checkerboards into "out" and, unless the guess is being
  //      subtracted, prints the true unpreconditioned residual |M out - in|/|in|.
  template<class Field> class SchurRedBlackDiagTwoMixed {
  private:
    LinearFunction<Field> & _HermitianRBSolver; // wrapped solver, held by reference
    int CBfactorise;                            // stored only; not yet used (see FIXME below)
    bool subGuess;                              // subtract the initial guess from the solver result?
  public:
    /////////////////////////////////////////////////////
    // Wrap the usual normal equations Schur trick
    /////////////////////////////////////////////////////
    SchurRedBlackDiagTwoMixed(LinearFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  :
      _HermitianRBSolver(HermitianRBSolver),
      CBfactorise(0),
      subGuess(initSubGuess)
    {
    }
    // Switch subtraction of the initial guess on or off.
    void subtractGuess(const bool initSubGuess)
    {
      subGuess = initSubGuess;
    }
    // Current guess-subtraction setting; const so it can be queried on a const solver.
    bool isSubtractGuess(void) const
    {
      return subGuess;
    }
    // Convenience overload: forwards to the guessed overload with a ZeroGuesser<Field>.
    template<class Matrix>
    void operator() (Matrix & _Matrix,const Field &in, Field &out){
      ZeroGuesser<Field> guess;
      (*this)(_Matrix,in,out,guess);
    }
    // Solve for "out" given source "in"; guess(src_o,tmp) supplies the
    // starting vector for the odd-checkerboard solve.
    template<class Matrix, class Guesser>
    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
      // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
      GridBase *grid = _Matrix.RedBlackGrid();
      GridBase *fgrid= _Matrix.Grid();
      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
      Field src_e(grid);
      Field src_o(grid);
      Field sol_e(grid);
      Field sol_o(grid);
      Field   tmp(grid);
      Field  Mtmp(grid);
      Field resid(fgrid);
      pickCheckerboard(Even,src_e,in);
      pickCheckerboard(Odd ,src_o,in);
      pickCheckerboard(Even,sol_e,out);
      pickCheckerboard(Odd ,sol_o,out);
      /////////////////////////////////////////////////////
      // src_o = Mdag * (source_o - Moe MeeInv source_e)
      /////////////////////////////////////////////////////
      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);
      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);
      // get the right MpcDag
      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);
      //////////////////////////////////////////////////////////////
      // Call the red-black solver
      //////////////////////////////////////////////////////////////
      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
      guess(src_o,tmp);
      // The guess is only needed again if it is subtracted below; skip the field copy otherwise.
      if (subGuess)      Mtmp = tmp;
      _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd);
      // Fionn A2A boolean behavioural control
      if (subGuess)      tmp = tmp-Mtmp;
      // The solver result lives in tmp; the odd solution is MooeeInv applied to it.
      _Matrix.MooeeInv(tmp,sol_o);        assert(  sol_o.checkerboard   ==Odd);
      ///////////////////////////////////////////////////
      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
      ///////////////////////////////////////////////////
      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
      setCheckerboard(out,sol_e); assert(  sol_e.checkerboard ==Even);
      setCheckerboard(out,sol_o); assert(  sol_o.checkerboard ==Odd );
      // Verify the unprec residual
      if ( ! subGuess ) {
        _Matrix.M(out,resid);
        resid = resid-in;
        RealD ns = norm2(in);
        RealD nr = norm2(resid);
        // Label names this class so the line can be told apart from SchurRedBlackDiagTwoSolve output.
        std::cout << GridLogMessage << "SchurRedBlackDiagTwoMixed solver true unprec resid " << std::sqrt(nr / ns) << " nr " << nr << " ns " << ns << std::endl;
      } else {
        // "out" no longer solves M out = in once the guess has been removed, so no residual check.
        std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
      }
    }
  };
 }
 #endif
@@ -0,0 +1,125 @@
 #include <Grid/GridCore.h>
 #include <fcntl.h>
 #include <vector>
 namespace Grid {
 // Definitions of the MemoryProfiler static members: no statistics object
 // attached and debug output off at start-up.
 MemoryStats *MemoryProfiler::stats = nullptr;
 bool         MemoryProfiler::debug = false;
 // Definitions of the PointerCache static members. "victim" is the index of
 // the next slot Insert() evicts when every slot is occupied; "Entries" is the
 // fixed-size table of cached allocations.
 int PointerCache::victim;
 PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
 void *PointerCache::Insert(void *ptr,size_t bytes) {
  if (bytes < 4096 ) return ptr;
 #ifdef GRID_OMP
  assert(omp_in_parallel()==0);
 #endif 
  void * ret = NULL;
  int v = -1;
  for(int e=0;e<Ncache;e++) {
    if ( Entries[e].valid==0 ) {
      v=e; 
      break;
    }
  }
  if ( v==-1 ) {
    v=victim;
    victim = (victim+1)%Ncache;
  }
  if ( Entries[v].valid ) {
    ret = Entries[v].address;
    Entries[v].valid = 0;
    Entries[v].address = NULL;
    Entries[v].bytes = 0;
  }
  Entries[v].address=ptr;
  Entries[v].bytes  =bytes;
  Entries[v].valid  =1;
  return ret;
 }
 // Try to satisfy a request for exactly "bytes" bytes from the cache.
 // Returns the cached pointer (and marks its slot free) on an exact size
 // match, or NULL when nothing matches or the request is below 4096 bytes.
 // Must not be called from inside an OpenMP parallel region (asserted under _OPENMP).
 void *PointerCache::Lookup(size_t bytes) {
  if (bytes < 4096 ) return NULL;
 #ifdef _OPENMP
  assert(omp_in_parallel()==0);
 #endif
  for(int slot=0;slot<Ncache;slot++){
    if ( !Entries[slot].valid )         continue;
    if ( Entries[slot].bytes != bytes ) continue;
    Entries[slot].valid = 0;  // hand the allocation over to the caller
    return Entries[slot].address;
  }
  return NULL;
 }
 void check_huge_pages(void *Buf,uint64_t BYTES)
 {
 #ifdef __linux__
  int fd = open("/proc/self/pagemap", O_RDONLY);
  assert(fd >= 0);
  const int page_size = 4096;
  uint64_t virt_pfn = (uint64_t)Buf / page_size;
  off_t offset = sizeof(uint64_t) * virt_pfn;
  uint64_t npages = (BYTES + page_size-1) / page_size;
  uint64_t pagedata[npages];
  uint64_t ret = lseek(fd, offset, SEEK_SET);
  assert(ret == offset);
  ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
  assert(ret == sizeof(uint64_t) * npages);
  int nhugepages = npages / 512;
  int n4ktotal, nnothuge;
  n4ktotal = 0;
  nnothuge = 0;
  for (int i = 0; i < nhugepages; ++i) {
    uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
    for (int j = 0; j < 512; ++j) {
      uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
      ++n4ktotal;
      if (pageaddr != baseaddr + j * page_size)
 	++nnothuge;
      }
  }
  int rank = CartesianCommunicator::RankWorld();
  printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
 #endif
 }
 // Format a byte count as a short human-readable string using 1024-based
 // prefixes: "<n> B", "<n> KB", ... up to "EB". Whole values are printed as
 // integers ("1 KB"), fractional values with one decimal ("1.5 KB").
 //
 //   bytes : size to format
 //   returns the formatted string
 std::string sizeString(const size_t bytes)
 {
  constexpr unsigned int bufSize = 256;
  constexpr size_t       nSuffix = 7;
  const char             *suffixes[nSuffix] = {"", "K", "M", "G", "T", "P", "E"};
  char                   buf[bufSize];
  size_t                 s     = 0;
  double                 count = bytes;
  // Never step past the last suffix: the index must stay <= nSuffix-1
  // (a bound of "s < 7" would allow suffixes[7], one past the end).
  while (count >= 1024 && s + 1 < nSuffix)
  {
      s++;
      count /= 1024;
  }
  if (count - floor(count) == 0.0)
  {
      snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
  }
  else
  {
      snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
  }
  return std::string(buf);
 }
 }
@@ -63,6 +63,66 @@ namespace Grid {
    static void *Lookup(size_t bytes) ;
  };
  std::string sizeString(size_t bytes);
  // Byte counters maintained by the profilerAllocate / profilerFree macros.
  // All counters start at zero.
  struct MemoryStats
  {
    size_t totalAllocated{0};     // sum of all allocation sizes
    size_t maxAllocated{0};       // high-water mark of currentlyAllocated
    size_t currentlyAllocated{0}; // allocated minus freed
    size_t totalFreed{0};         // sum of all freed sizes
  };
  // Global switches for the memory profiling macros below.
  class MemoryProfiler
  {
  public:
    // Statistics sink; the profiler macros only accumulate while this is non-null.
    static MemoryStats *stats;
    // When true, the profiler macros also print each allocation / free.
    static bool        debug;
  };
  // memString(bytes): "<bytes> (<human readable size>)" as a std::string expression.
  // The argument is expanded twice and the expansion is not parenthesised.
  #define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
  // profilerDebugPrint: dump the four MemoryStats counters to GridLogDebug
  // when a statistics object is attached; otherwise does nothing.
  #define profilerDebugPrint \
  if (MemoryProfiler::stats)\
  {\
    auto s = MemoryProfiler::stats;\
    std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl;\
    std::cout << GridLogDebug << "[Memory debug] total  : " << memString(s->totalAllocated) \
              << std::endl;\
    std::cout << GridLogDebug << "[Memory debug] max    : " << memString(s->maxAllocated) \
              << std::endl;\
    std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
              << std::endl;\
    std::cout << GridLogDebug << "[Memory debug] freed  : " << memString(s->totalFreed) \
              << std::endl;\
  }
  // profilerAllocate(bytes): record an allocation of "bytes" bytes (total,
  // current and high-water mark) and, in debug mode, log it.
  // Expands to two consecutive if-statements, so use it as a statement of its
  // own inside braces, not as the unbraced body of an if/else.
  #define profilerAllocate(bytes)\
  if (MemoryProfiler::stats)\
  {\
    auto s = MemoryProfiler::stats;\
    s->totalAllocated     += (bytes);\
    s->currentlyAllocated += (bytes);\
    s->maxAllocated        = std::max(s->maxAllocated, s->currentlyAllocated);\
  }\
  if (MemoryProfiler::debug)\
  {\
    std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl;\
    profilerDebugPrint;\
  }
  // profilerFree(bytes): record a release of "bytes" bytes and, in debug mode,
  // log it. Same usage caveat as profilerAllocate.
  #define profilerFree(bytes)\
  if (MemoryProfiler::stats)\
  {\
    auto s = MemoryProfiler::stats;\
    s->totalFreed         += (bytes);\
    s->currentlyAllocated -= (bytes);\
  }\
  if (MemoryProfiler::debug)\
  {\
    std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl;\
    profilerDebugPrint;\
  }
  // Print how many 4k pages of [Buf, Buf+BYTES) are not in huge pages (Linux only).
  void check_huge_pages(void *Buf,uint64_t BYTES);
 ////////////////////////////////////////////////////////////////////
 // A lattice of something, but assume the something is SIMDized.
@@ -90,20 +150,39 @@ public:
  pointer allocate(size_type __n, const void* _p= 0)
  { 
    size_type bytes = __n*sizeof(_Tp);
    profilerAllocate(bytes);
    _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
-    
+    //    if ( ptr != NULL ) 
-#ifdef HAVE_MM_MALLOC_H
+    //      std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl;
    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,128);
 #else
    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes);
 #endif
    //////////////////
    // Hack 2MB align; could make option probably doesn't need configurability
    //////////////////
 //define GRID_ALLOC_ALIGN (128)
 #define GRID_ALLOC_ALIGN (2*1024*1024)
 #ifdef HAVE_MM_MALLOC_H
    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
 #else
    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
 #endif
    //    std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl;
    // First touch optimise in threaded loop
    uint8_t *cp = (uint8_t *)ptr;
 #ifdef GRID_OMP
 #pragma omp parallel for
 #endif
    for(size_type n=0;n<bytes;n+=4096){
      cp[n]=0;
    }
    return ptr;
  }
  void deallocate(pointer __p, size_type __n) { 
    size_type bytes = __n * sizeof(_Tp);
    profilerFree(bytes);
    pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
 #ifdef HAVE_MM_MALLOC_H
@@ -154,10 +233,13 @@ public:
 #ifdef GRID_COMMS_SHMEM
  pointer allocate(size_type __n, const void* _p= 0)
  {
    size_type bytes = __n*sizeof(_Tp);
    profilerAllocate(bytes);
 #ifdef CRAY
-    _Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
+    _Tp *ptr = (_Tp *) shmem_align(bytes,64);
 #else
-    _Tp *ptr = (_Tp *) shmem_align(64,__n*sizeof(_Tp));
+    _Tp *ptr = (_Tp *) shmem_align(64,bytes);
 #endif
 #ifdef PARANOID_SYMMETRIC_HEAP
    static void * bcast;
@@ -175,20 +257,39 @@ public:
 #endif 
    return ptr;
  }
-  void deallocate(pointer __p, size_type) { 
+  void deallocate(pointer __p, size_type __n) { 
    size_type bytes = __n*sizeof(_Tp);
    profilerFree(bytes);
    shmem_free((void *)__p);
  }
 #else
  pointer allocate(size_type __n, const void* _p= 0) 
  {
    size_type bytes = __n*sizeof(_Tp);
    profilerAllocate(bytes);
 #ifdef HAVE_MM_MALLOC_H
-    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
+    _Tp * ptr = (_Tp *) _mm_malloc(bytes, GRID_ALLOC_ALIGN);
 #else
-    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
+    _Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN, bytes);
 #endif
    uint8_t *cp = (uint8_t *)ptr;
    if ( ptr ) { 
    // One touch per 4k page, static OMP loop to catch same loop order
 #ifdef GRID_OMP
 #pragma omp parallel for schedule(static)
 #endif
      for(size_type n=0;n<bytes;n+=4096){
 	cp[n]=0;
      }
    }
    return ptr;
  }
-  void deallocate(pointer __p, size_type) { 
+  void deallocate(pointer __p, size_type __n) {
    size_type bytes = __n*sizeof(_Tp);
    profilerFree(bytes);
 #ifdef HAVE_MM_MALLOC_H
    _mm_free((void *)__p); 
 #else
@@ -6,8 +6,9 @@
    Copyright (C) 2015
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
+    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -43,11 +44,20 @@ namespace Grid{
  class GridBase : public CartesianCommunicator , public GridThread {
 public:
-
+    int dummy;
    // Give Lattice access
    template<class object> friend class Lattice;
    GridBase(const std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
    GridBase(const std::vector<int> & processor_grid,
 	     const CartesianCommunicator &parent,
 	     int &split_rank) 
      : CartesianCommunicator(processor_grid,parent,split_rank) {};
    GridBase(const std::vector<int> & processor_grid,
 	     const CartesianCommunicator &parent) 
      : CartesianCommunicator(processor_grid,parent,dummy) {};
    virtual ~GridBase() = default;
    // Physics Grid information.
@@ -62,13 +72,14 @@ public:
    int _isites;
    int _fsites;                  // _isites*_osites = product(dimensions).
    int _gsites;
-    std::vector<int> _slice_block;   // subslice information
+    std::vector<int> _slice_block;// subslice information
    std::vector<int> _slice_stride;
    std::vector<int> _slice_nblock;
-    // Might need these at some point
+    std::vector<int> _lstart;     // local start of array in gcoors _processor_coor[d]*_ldimensions[d]
-    //    std::vector<int> _lstart;     // local start of array in gcoors. _processor_coor[d]*_ldimensions[d]
+    std::vector<int> _lend  ;     // local end of array in gcoors   _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
-    //    std::vector<int> _lend;       // local end of array in gcoors    _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
+
    bool _isCheckerBoarded; 
 public:
@@ -99,7 +110,7 @@ public:
    virtual int oIndex(std::vector<int> &coor)
    {
        int idx=0;
-	// Works with either global or local coordinates
+        // Works with either global or local coordinates
        for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
        return idx;
    }
@@ -121,6 +132,12 @@ public:
      Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions);
    }
    // Combine an outer coordinate and an inner (SIMD lane) coordinate into a
    // local coordinate: lcoor[d] = ocoor[d] + _rdimensions[d]*icoor[d].
    // lcoor is resized to _ndimension entries.
    inline void InOutCoorToLocalCoor (std::vector<int> &ocoor, std::vector<int> &icoor, std::vector<int> &lcoor) {
      lcoor.resize(_ndimension);
      for (int mu = 0; mu < _ndimension; mu++) {
        const int innerOffset = _rdimensions[mu] * icoor[mu];
        lcoor[mu] = ocoor[mu] + innerOffset;
      }
    }
    //////////////////////////////////////////////////////////
    // SIMD lane addressing
    //////////////////////////////////////////////////////////
@@ -128,6 +145,7 @@ public:
    {
      Lexicographic::CoorFromIndex(coor,lane,_simd_layout);
    }
    // Returns 1 when the SIMD layout has more than one lane along
    // "dimension", 0 otherwise.
    inline int PermuteDim(int dimension){
      const bool multiLane = (_simd_layout[dimension] > 1);
      return multiLane ? 1 : 0;
    }
@@ -145,15 +163,15 @@ public:
      // Distance should be either 0,1,2..
      //
      if ( _simd_layout[dimension] > 2 ) { 
-	for(int d=0;d<_ndimension;d++){
+        for(int d=0;d<_ndimension;d++){
-	  if ( d != dimension ) assert ( (_simd_layout[d]==1)  );
+          if ( d != dimension ) assert ( (_simd_layout[d]==1)  );
-	}
+        }
-	permute_type = RotateBit; // How to specify distance; this is not just direction.
+        permute_type = RotateBit; // How to specify distance; this is not just direction.
-	return permute_type;
+        return permute_type;
      }
      for(int d=_ndimension-1;d>dimension;d--){
-	if (_simd_layout[d]>1 ) permute_type++;
+        if (_simd_layout[d]>1 ) permute_type++;
      }
      return permute_type;
    }
@@ -168,11 +186,31 @@ public:
    // Total site count: inner (SIMD) sites x outer sites x number of processors.
    // NOTE(review): the product is formed in int; very large global volumes could overflow -- confirm.
    inline int gSites(void) const { return _isites*_osites*_Nprocessors; };
    // Number of dimensions of this grid.
    inline int Nd    (void) const { return _ndimension;};
    // Per-dimension start of this rank's local region (_lstart).
    // Returned by const reference, like the other dimension accessors below, so that a call
    // no longer copies the whole vector; callers that assign the result to a
    // std::vector<int> still get their own copy exactly as before.
    inline const std::vector<int> &LocalStarts(void)            { return _lstart;    };
    // Read-only views of the dimension tables filled in by Init().
    inline const std::vector<int> &FullDimensions(void)         { return _fdimensions;};
    inline const std::vector<int> &GlobalDimensions(void)       { return _gdimensions;};
    inline const std::vector<int> &LocalDimensions(void)        { return _ldimensions;};
    // Currently returns the same table as LocalDimensions().
    inline const std::vector<int> &VirtualLocalDimensions(void) { return _ldimensions;};
    ////////////////////////////////////////////////////////////////
    // Utility to print the full decomposition details 
    ////////////////////////////////////////////////////////////////
    // Print one GridLogMessage line per geometry field of this grid: the dimension tables,
    // the SIMD layout, the outer/inner strides and the site counts.
    // lSites() and gSites() are evaluated at print time; the other values are read
    // straight from the member tables.
    void show_decomposition(){
      std::cout << GridLogMessage << "\tFull Dimensions    : " << _fdimensions << std::endl;
      std::cout << GridLogMessage << "\tSIMD layout        : " << _simd_layout << std::endl;
      std::cout << GridLogMessage << "\tGlobal Dimensions  : " << _gdimensions << std::endl;
      std::cout << GridLogMessage << "\tLocal Dimensions   : " << _ldimensions << std::endl;
      std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl;
      std::cout << GridLogMessage << "\tOuter strides      : " << _ostride << std::endl;
      std::cout << GridLogMessage << "\tInner strides      : " << _istride << std::endl;
      std::cout << GridLogMessage << "\tiSites             : " << _isites << std::endl;
      std::cout << GridLogMessage << "\toSites             : " << _osites << std::endl;
      std::cout << GridLogMessage << "\tlSites             : " << lSites() << std::endl;        
      std::cout << GridLogMessage << "\tgSites             : " << gSites() << std::endl;
      std::cout << GridLogMessage << "\tNd                 : " << _ndimension << std::endl;             
    } 
    ////////////////////////////////////////////////////////////////
    // Global addressing
    ////////////////////////////////////////////////////////////////
@@ -188,8 +226,8 @@ public:
      gidx=0;
      int mult=1;
      for(int mu=0;mu<_ndimension;mu++) {
-	gidx+=mult*gcoor[mu];
+        gidx+=mult*gcoor[mu];
-	mult*=_gdimensions[mu];
+        mult*=_gdimensions[mu];
      }
    }
    void GlobalCoorToProcessorCoorLocalCoor(std::vector<int> &pcoor,std::vector<int> &lcoor,const std::vector<int> &gcoor)
@@ -197,9 +235,9 @@ public:
      pcoor.resize(_ndimension);
      lcoor.resize(_ndimension);
      for(int mu=0;mu<_ndimension;mu++){
-	int _fld  = _fdimensions[mu]/_processors[mu];
+        int _fld  = _fdimensions[mu]/_processors[mu];
-	pcoor[mu] = gcoor[mu]/_fld;
+        pcoor[mu] = gcoor[mu]/_fld;
-	lcoor[mu] = gcoor[mu]%_fld;
+        lcoor[mu] = gcoor[mu]%_fld;
      }
    }
    void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const std::vector<int> &gcoor)
@@ -211,9 +249,9 @@ public:
      /*
      std::vector<int> cblcoor(lcoor);
      for(int d=0;d<cblcoor.size();d++){
-	if( this->CheckerBoarded(d) ) {
+        if( this->CheckerBoarded(d) ) {
-	  cblcoor[d] = lcoor[d]/2;
+          cblcoor[d] = lcoor[d]/2;
-	}
+        }
      }
      */
      i_idx= iIndex(lcoor);
@@ -239,7 +277,7 @@ public:
    {
      RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor);
      if(CheckerBoarded(0)){
-	fcoor[0] = fcoor[0]*2+cb;
+        fcoor[0] = fcoor[0]*2+cb;
      }
    }
    void ProcessorCoorLocalCoorToGlobalCoor(std::vector<int> &Pcoor,std::vector<int> &Lcoor,std::vector<int> &gcoor)
@@ -0,0 +1,174 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/cartesian/Cartesian_full.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_CARTESIAN_FULL_H
 #define GRID_CARTESIAN_FULL_H
 namespace Grid{
 /////////////////////////////////////////////////////////////////////////////////////////
 // Grid Support.
 /////////////////////////////////////////////////////////////////////////////////////////
 // GridCartesian: the GridBase specialisation with no checkerboarding.
 // Every checkerboard query reports 0 and the shift helpers return the shift unchanged.
 // Init() fills in the inherited dimension, stride and slice tables.
 class GridCartesian: public GridBase {
 public:
    // Placeholder passed to the GridBase constructor by the overload that takes a parent
    // grid but no caller-supplied split_rank.
    int dummy;

    // --- Checkerboard interface: this grid is not checkerboarded. ---
    virtual int  CheckerBoardFromOindexTable (int Oindex) {
      return 0;
    }
    virtual int  CheckerBoardFromOindex (int Oindex)
    {
      return 0;
    }
    virtual int CheckerBoarded(int dim){
      return 0;
    }
    virtual int CheckerBoard(const std::vector<int> &site){
        return 0;
    }
    virtual int CheckerBoardDestination(int cb,int shift,int dim){
        return 0;
    }
    // The two shift helpers hand back `shift` untouched.
    virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift, int ocb){
      return shift;
    }
    virtual int CheckerBoardShift(int source_cb,int dim,int shift, int osite){
      return shift;
    }
    /////////////////////////////////////////////////////////////////////////
    // Constructor takes a parent grid and possibly subdivides communicator.
    /////////////////////////////////////////////////////////////////////////
    GridCartesian(const std::vector<int> &dimensions,
                  const std::vector<int> &simd_layout,
                  const std::vector<int> &processor_grid,
                  const GridCartesian &parent) : GridBase(processor_grid,parent,dummy)
    {
      Init(dimensions,simd_layout,processor_grid);
    }
    // As above, but the caller's split_rank is forwarded to GridBase instead of `dummy`.
    GridCartesian(const std::vector<int> &dimensions,
                  const std::vector<int> &simd_layout,
                  const std::vector<int> &processor_grid,
                  const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank)
    {
      Init(dimensions,simd_layout,processor_grid);
    }
    /////////////////////////////////////////////////////////////////////////
    // Construct from comm world
    /////////////////////////////////////////////////////////////////////////
    GridCartesian(const std::vector<int> &dimensions,
                  const std::vector<int> &simd_layout,
                  const std::vector<int> &processor_grid) : GridBase(processor_grid)
    {
      Init(dimensions,simd_layout,processor_grid);
    }

    virtual ~GridCartesian() = default;

    // Fill in the geometry tables for a grid of the given full `dimensions`.
    //   dimensions     : full lattice extent per dimension
    //   simd_layout    : SIMD lanes per dimension
    //   processor_grid : processor-grid description, one entry per dimension
    // Asserts that the three descriptions have the same length, that each global
    // dimension divides evenly over _processors[d], and that each local dimension
    // divides evenly over simd_layout[d].
    void Init(const std::vector<int> &dimensions,
              const std::vector<int> &simd_layout,
              const std::vector<int> &processor_grid)
    {
      ///////////////////////
      // Grid information
      ///////////////////////
      _isCheckerBoarded = false;
      _ndimension = dimensions.size();

      // The loop below indexes simd_layout and the processor tables with d < _ndimension,
      // so a shorter argument would be read out of bounds. These are the same size checks
      // GridRedBlackCartesian::Init performs.
      assert(simd_layout.size() == _ndimension);
      assert(processor_grid.size() == _ndimension);

      _fdimensions.resize(_ndimension);
      _gdimensions.resize(_ndimension);
      _ldimensions.resize(_ndimension);
      _rdimensions.resize(_ndimension);
      _simd_layout.resize(_ndimension);
      _lstart.resize(_ndimension);
      _lend.resize(_ndimension);

      _ostride.resize(_ndimension);
      _istride.resize(_ndimension);

      _fsites = _gsites = _osites = _isites = 1;

      for (int d = 0; d < _ndimension; d++)
      {
        _fdimensions[d] = dimensions[d];   // Full dimensions
        _gdimensions[d] = _fdimensions[d]; // Global dimensions (same as full: no checkerboard removed)
        _simd_layout[d] = simd_layout[d];
        _fsites = _fsites * _fdimensions[d];
        _gsites = _gsites * _gdimensions[d];

        // Local dimensions: the global extent shared out over the processor grid; must divide exactly
        _ldimensions[d] = _gdimensions[d] / _processors[d];
        assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);

        // Reduced dimensions: the local extent shared out over the SIMD lanes; must divide exactly
        _rdimensions[d] = _ldimensions[d] / _simd_layout[d];
        assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);

        // First and last global coordinate of this rank's local region
        _lstart[d] = _processor_coor[d] * _ldimensions[d];
        _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
        _osites *= _rdimensions[d];
        _isites *= _simd_layout[d];

        // Addressing support: dimension 0 runs fastest for both outer and inner strides
        if (d == 0)
        {
          _ostride[d] = 1;
          _istride[d] = 1;
        }
        else
        {
          _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
          _istride[d] = _istride[d - 1] * _simd_layout[d - 1];
        }
      }

      ///////////////////////
      // subplane information
      ///////////////////////
      _slice_block.resize(_ndimension);
      _slice_stride.resize(_ndimension);
      _slice_nblock.resize(_ndimension);

      // For each d:
      //   _slice_block[d]  = product of the reduced dimensions below d
      //   _slice_nblock[d] = product of the reduced dimensions above d
      //   _slice_stride[d] = _ostride[d] * _rdimensions[d]
      int block = 1;
      int nblock = 1;
      for (int d = 0; d < _ndimension; d++)
        nblock *= _rdimensions[d];

      for (int d = 0; d < _ndimension; d++)
      {
        nblock /= _rdimensions[d];
        _slice_block[d] = block;
        _slice_stride[d] = _ostride[d] * _rdimensions[d];
        _slice_nblock[d] = nblock;
        block = block * _rdimensions[d];
      }
    };
 };
 }
 #endif
@@ -112,151 +112,209 @@ public:
      }
    };
-    GridRedBlackCartesian(const GridBase *base) : GridRedBlackCartesian(base->_fdimensions,base->_simd_layout,base->_processors)  {};
+    ////////////////////////////////////////////////////////////
    // Create Redblack from original grid; require full grid pointer ?
    ////////////////////////////////////////////////////////////
    GridRedBlackCartesian(const GridBase *base) : GridBase(base->_processors,*base)
    {
      // Default checkerboarding: every dimension is flagged in the mask and
      // dimension 0 is passed to Init as the checker dimension.
      const int ndim        = base->_ndimension;
      const int checker_dim = 0;
      const std::vector<int> checker_dim_mask(ndim,1);
      Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim);
    };
-    GridRedBlackCartesian(const std::vector<int> &dimensions,
+    ////////////////////////////////////////////////////////////
    // Create redblack from original grid, with non-trivial checker dim mask
    ////////////////////////////////////////////////////////////
    GridRedBlackCartesian(const GridBase *base,
                          const std::vector<int> &checker_dim_mask,
                          int checker_dim)
      : GridBase(base->_processors,*base)
    {
      // Geometry comes from the base grid; only the checkerboard description is caller supplied.
      const std::vector<int> &full_dims = base->_fdimensions;
      const std::vector<int> &lanes     = base->_simd_layout;
      const std::vector<int> &proc_grid = base->_processors;
      Init(full_dims,lanes,proc_grid,checker_dim_mask,checker_dim);
    }
    virtual ~GridRedBlackCartesian() = default;
 #if 0
    ////////////////////////////////////////////////////////////
    // Create redblack grid ;; deprecate these. Should not
    // need direct creation of redblack without a full grid to base on
    ////////////////////////////////////////////////////////////
    GridRedBlackCartesian(const GridBase *base,
 			  const std::vector<int> &dimensions,
 			  const std::vector<int> &simd_layout,
 			  const std::vector<int> &processor_grid,
 			  const std::vector<int> &checker_dim_mask,
 			  int checker_dim
-			  ) :  GridBase(processor_grid) 
+			  ) :  GridBase(processor_grid,*base) 
    {
      Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim);
    }
-    GridRedBlackCartesian(const std::vector<int> &dimensions,
+
    ////////////////////////////////////////////////////////////
    // Create redblack grid
    ////////////////////////////////////////////////////////////
    GridRedBlackCartesian(const GridBase *base,
 			  const std::vector<int> &dimensions,
 			  const std::vector<int> &simd_layout,
-			  const std::vector<int> &processor_grid) : GridBase(processor_grid) 
+			  const std::vector<int> &processor_grid) : GridBase(processor_grid,*base) 
    {
      std::vector<int> checker_dim_mask(dimensions.size(),1);
-      Init(dimensions,simd_layout,processor_grid,checker_dim_mask,0);
+      int checker_dim = 0;
      Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim);
    }
 #endif
    void Init(const std::vector<int> &dimensions,
-	      const std::vector<int> &simd_layout,
+              const std::vector<int> &simd_layout,
-	      const std::vector<int> &processor_grid,
+              const std::vector<int> &processor_grid,
-	      const std::vector<int> &checker_dim_mask,
+              const std::vector<int> &checker_dim_mask,
-	      int checker_dim)
+              int checker_dim)
    {
-    ///////////////////////
+
-    // Grid information
+      _isCheckerBoarded = true;
    ///////////////////////
      _checker_dim = checker_dim;
-      assert(checker_dim_mask[checker_dim]==1);
+      assert(checker_dim_mask[checker_dim] == 1);
      _ndimension = dimensions.size();
-      assert(checker_dim_mask.size()==_ndimension);
+      assert(checker_dim_mask.size() == _ndimension);
-      assert(processor_grid.size()==_ndimension);
+      assert(processor_grid.size() == _ndimension);
-      assert(simd_layout.size()==_ndimension);
+      assert(simd_layout.size() == _ndimension);
-      
+
      _fdimensions.resize(_ndimension);
      _gdimensions.resize(_ndimension);
      _ldimensions.resize(_ndimension);
      _rdimensions.resize(_ndimension);
      _simd_layout.resize(_ndimension);
-      
+      _lstart.resize(_ndimension);
      _lend.resize(_ndimension);
      _ostride.resize(_ndimension);
      _istride.resize(_ndimension);
-      
+
      _fsites = _gsites = _osites = _isites = 1;
      _checker_dim_mask=checker_dim_mask;
-      for(int d=0;d<_ndimension;d++){
+      _checker_dim_mask = checker_dim_mask;
 	_fdimensions[d] = dimensions[d];
 	_gdimensions[d] = _fdimensions[d];
 	_fsites = _fsites * _fdimensions[d];
 	_gsites = _gsites * _gdimensions[d];
 	if (d==_checker_dim) {
 	  _gdimensions[d] = _gdimensions[d]/2; // Remove a checkerboard
 	}
 	_ldimensions[d] = _gdimensions[d]/_processors[d];
-	// Use a reduced simd grid
+      for (int d = 0; d < _ndimension; d++)
-	_simd_layout[d] = simd_layout[d];
+      {
-	_rdimensions[d]= _ldimensions[d]/_simd_layout[d];
+        _fdimensions[d] = dimensions[d];
-	assert(_rdimensions[d]>0);
+        _gdimensions[d] = _fdimensions[d];
        _fsites = _fsites * _fdimensions[d];
        _gsites = _gsites * _gdimensions[d];
-	// all elements of a simd vector must have same checkerboard.
+        if (d == _checker_dim)
-	// If Ls vectorised, this must still be the case; e.g. dwf rb5d
+        {
-	if ( _simd_layout[d]>1 ) {
+          assert((_gdimensions[d] & 0x1) == 0);
-	  if ( checker_dim_mask[d] ) { 
+          _gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
-	    assert( (_rdimensions[d]&0x1) == 0 );
+	  _gsites /= 2;
-	  }
+        }
-	}
+        _ldimensions[d] = _gdimensions[d] / _processors[d];
        assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
        _lstart[d] = _processor_coor[d] * _ldimensions[d];
        _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
-	_osites *= _rdimensions[d];
+        // Use a reduced simd grid
-	_isites *= _simd_layout[d];
+        _simd_layout[d] = simd_layout[d];
-        
+        _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; // this is not checking if this is integer
-	// Addressing support
+        assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
-	if ( d==0 ) {
+        assert(_rdimensions[d] > 0);
 	  _ostride[d] = 1;
 	  _istride[d] = 1;
 	} else {
 	  _ostride[d] = _ostride[d-1]*_rdimensions[d-1];
 	  _istride[d] = _istride[d-1]*_simd_layout[d-1];
 	}
        // all elements of a simd vector must have same checkerboard.
        // If Ls vectorised, this must still be the case; e.g. dwf rb5d
        if (_simd_layout[d] > 1)
        {
          if (checker_dim_mask[d])
          {
            assert((_rdimensions[d] & 0x1) == 0);
          }
        }
        _osites *= _rdimensions[d];
        _isites *= _simd_layout[d];
        // Addressing support
        if (d == 0)
        {
          _ostride[d] = 1;
          _istride[d] = 1;
        }
        else
        {
          _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
          _istride[d] = _istride[d - 1] * _simd_layout[d - 1];
        }
      }
-            
+
      ////////////////////////////////////////////////////////////////////////////////////////////
      // subplane information
      ////////////////////////////////////////////////////////////////////////////////////////////
      _slice_block.resize(_ndimension);
      _slice_stride.resize(_ndimension);
      _slice_nblock.resize(_ndimension);
-        
+
-      int block =1;
+      int block = 1;
-      int nblock=1;
+      int nblock = 1;
-      for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];
+      for (int d = 0; d < _ndimension; d++)
-      
+        nblock *= _rdimensions[d];
-      for(int d=0;d<_ndimension;d++){
+
-	nblock/=_rdimensions[d];
+      for (int d = 0; d < _ndimension; d++)
-	_slice_block[d] =block;
+      {
-	_slice_stride[d]=_ostride[d]*_rdimensions[d];
+        nblock /= _rdimensions[d];
-	_slice_nblock[d]=nblock;
+        _slice_block[d] = block;
-	block = block*_rdimensions[d];
+        _slice_stride[d] = _ostride[d] * _rdimensions[d];
        _slice_nblock[d] = nblock;
        block = block * _rdimensions[d];
      }
      ////////////////////////////////////////////////
      // Create a checkerboard lookup table
      ////////////////////////////////////////////////
      int rvol = 1;
-      for(int d=0;d<_ndimension;d++){
+      for (int d = 0; d < _ndimension; d++)
-	rvol=rvol * _rdimensions[d];
+      {
        rvol = rvol * _rdimensions[d];
      }
      _checker_board.resize(rvol);
-      for(int osite=0;osite<_osites;osite++){
+      for (int osite = 0; osite < _osites; osite++)
-	_checker_board[osite] = CheckerBoardFromOindex (osite);
+      {
        _checker_board[osite] = CheckerBoardFromOindex(osite);
      }
    };
-protected:
+
  protected:
    // Outer-site index for a coordinate on the checkerboarded grid.
    // Each dimension contributes _ostride[d] * (c mod _rdimensions[d]), where c is coor[d]
    // in ordinary dimensions and coor[d]/2 in the checker dimension.
    virtual int oIndex(std::vector<int> &coor)
    {
-      int idx=0;
+      int idx = 0;
-      for(int d=0;d<_ndimension;d++) {
+      for (int d = 0; d < _ndimension; d++)
-	if( d==_checker_dim ) {
+      {
-	  idx+=_ostride[d]*((coor[d]/2)%_rdimensions[d]);
+        if (d == _checker_dim)
-	} else {
+        {
-	  idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
+          idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]);
-	}
+        }
        else
        {
          idx += _ostride[d] * (coor[d] % _rdimensions[d]);
        }
      }
      return idx;
    };
-        
+
    // Inner (SIMD lane) index for a local coordinate on the checkerboarded grid.
    // Each dimension contributes _istride[d] * (lcoor[d] / w), where w is _rdimensions[d]
    // in ordinary dimensions and 2*_rdimensions[d] in the checker dimension.
    virtual int iIndex(std::vector<int> &lcoor)
    {
-        int idx=0;
+      int idx = 0;
-        for(int d=0;d<_ndimension;d++) {
+      for (int d = 0; d < _ndimension; d++)
-	  if( d==_checker_dim ) {
+      {
-	    idx+=_istride[d]*(lcoor[d]/(2*_rdimensions[d]));
+        if (d == _checker_dim)
-	  } else { 
+        {
-	    idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
+          idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d]));
-	  }
+        }
-	}
+        else
-        return idx;
+        {
          idx += _istride[d] * (lcoor[d] / _rdimensions[d]);
        }
      }
      return idx;
    }
 };
 }
 #endif
@@ -28,6 +28,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_COMMUNICATOR_H
 #define GRID_COMMUNICATOR_H
 #include <Grid/communicator/SharedMemory.h>
 #include <Grid/communicator/Communicator_base.h>
 #endif
@@ -26,40 +26,24 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/GridCore.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <limits.h>
 #include <sys/mman.h>
 namespace Grid {
 ///////////////////////////////////////////////////////////////
 // Info that is setup once and indept of cartesian layout
 ///////////////////////////////////////////////////////////////
-void *              CartesianCommunicator::ShmCommBuf;
+CartesianCommunicator::CommunicatorPolicy_t  
-uint64_t            CartesianCommunicator::MAX_MPI_SHM_BYTES   = 128*1024*1024; 
+CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
-CartesianCommunicator::CommunicatorPolicy_t  CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
+int CartesianCommunicator::nCommThreads = -1;
 /////////////////////////////////
 // Alloc, free shmem region
 /////////////////////////////////
 void *CartesianCommunicator::ShmBufferMalloc(size_t bytes){
   // Bump allocation: hand out the current heap top, then advance the top and the
   // running byte count by the request. The request size is used as given (no rounding).
   void *allocation = (void *)heap_top;
   heap_top   = heap_top   + bytes;
   heap_bytes = heap_bytes + bytes;

   // Report and assert once the running total reaches the configured shared heap size.
   const bool exhausted = !(heap_bytes < MAX_MPI_SHM_BYTES);
   if (exhausted) {
     std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
     std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
     std::cout<< " Current value is " << (MAX_MPI_SHM_BYTES/(1024*1024)) <<std::endl;
     assert(heap_bytes<MAX_MPI_SHM_BYTES);
   }
   return allocation;
 }
 void CartesianCommunicator::ShmBufferFreeAll(void) {
   // Release everything at once: rewind the heap top to this rank's buffer
   // and zero the running byte count.
   void *buffer_base = ShmBufferSelf();
   heap_top   = (size_t)buffer_base;
   heap_bytes = 0;
 }
 /////////////////////////////////
 // Grid information queries
 /////////////////////////////////
 // Number of dimensions of the processor grid.
 int CartesianCommunicator::Dimensions(void)
 {
   return _ndimension;
 }
 // 1 when this rank's linear index is 0, otherwise 0.
 int CartesianCommunicator::IsBoss(void)
 {
   return (_processor == 0) ? 1 : 0;
 }
 // The boss is always linear rank 0.
 int CartesianCommunicator::BossRank(void)
 {
   return 0;
 }
 // Linear rank of this processor.
 int CartesianCommunicator::ThisRank(void)
 {
   return _processor;
 }
@@ -87,43 +71,6 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
 {
  GlobalSumVector((double *)c,2*N);
 }
 #if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L)
 int                      CartesianCommunicator::NodeCount(void)    { return ProcessorCount();};
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                          void *xmit,
                                                          int xmit_to_rank,
                                                          void *recv,
                                                          int recv_from_rank,
                                                          int bytes)
 {
   // In this (non MPI3 / MPI3L) build the stencil variant simply starts an ordinary
   // send/receive pair and reports twice the message size as the traffic figure.
   SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
   const double traffic = 2.0*bytes;
   return traffic;
 }
 // Stencil completion in this build is a plain forward to SendToRecvFromComplete
 // with the same request list.
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
 {
   SendToRecvFromComplete(waitall);
 }
 void CartesianCommunicator::StencilBarrier(void){};
 commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;
 void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; }
 // This build always returns a null pointer, whatever `rank` is given.
 void *CartesianCommunicator::ShmBuffer(int rank)
 {
   return nullptr;
 }
 // This build performs no translation: the result is always a null pointer,
 // independent of `rank` and `local_p`.
 void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p)
 {
   return nullptr;
 }
 void CartesianCommunicator::ShmInitGeneric(void)
 {
   // Size the backing storage vector to MAX_MPI_SHM_BYTES and point the comms buffer at its start.
   ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
   auto *storage_base = &ShmBufStorageVector[0];
   ShmCommBuf = (void *)storage_base;
 }
 #endif
 }
@@ -32,114 +32,57 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 ///////////////////////////////////
 // Processor layout information
 ///////////////////////////////////
-#ifdef GRID_COMMS_MPI
+#include <Grid/communicator/SharedMemory.h>
 #include <mpi.h>
 #endif
 #ifdef GRID_COMMS_MPI3
 #include <mpi.h>
 #endif
 #ifdef GRID_COMMS_MPI3L
 #include <mpi.h>
 #endif
 #ifdef GRID_COMMS_SHMEM
 #include <mpp/shmem.h>
 #endif
 namespace Grid {
-class CartesianCommunicator {
+class CartesianCommunicator : public SharedMemory {
  public:    
-  // 65536 ranks per node adequate for now
+public:    
  // 128MB shared memory for comms is enough for 48^4 local vol comms
  // Give external control (command line override?) of this
-  static const int      MAXLOG2RANKSPERNODE = 16;            
+  ////////////////////////////////////////////
-  static uint64_t MAX_MPI_SHM_BYTES;
+  // Policies
  ////////////////////////////////////////////
  enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
  static CommunicatorPolicy_t CommunicatorPolicy;
  static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
  static int       nCommThreads;
  ////////////////////////////////////////////
  // Communicator should know nothing of the physics grid, only processor grid.
  ////////////////////////////////////////////
  int              _Nprocessors;     // How many in all
  std::vector<int> _processors;      // Which dimensions get relayed out over processors lanes.
  int              _processor;       // linear processor rank
  std::vector<int> _processor_coor;  // linear processor coordinate
-  unsigned long _ndimension;
+  unsigned long    _ndimension;
-
+  static Grid_MPI_Comm      communicator_world;
-#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L)
+  Grid_MPI_Comm             communicator;
-  static MPI_Comm communicator_world;
+  std::vector<Grid_MPI_Comm> communicator_halo;
         MPI_Comm communicator;
  typedef MPI_Request CommsRequest_t;
 #else 
  typedef int CommsRequest_t;
 #endif
  ////////////////////////////////////////////////////////////////////
  // Helper functionality for SHM Windows common to all other impls
  ////////////////////////////////////////////////////////////////////
  // Longer term; drop this in favour of a master / slave model with 
  // cartesian communicator on a subset of ranks, slave ranks controlled
  // by group leader with data xfer via shared memory
  ////////////////////////////////////////////////////////////////////
 #ifdef GRID_COMMS_MPI3
  static int ShmRank;
  static int ShmSize;
  static int GroupRank;
  static int GroupSize;
  static int WorldRank;
  static int WorldSize;
  std::vector<int>  WorldDims;
  std::vector<int>  GroupDims;
  std::vector<int>  ShmDims;
  std::vector<int> GroupCoor;
  std::vector<int> ShmCoor;
  std::vector<int> WorldCoor;
  static std::vector<int> GroupRanks; 
  static std::vector<int> MyGroup;
  static int ShmSetup;
  static MPI_Win ShmWindow; 
  static MPI_Comm ShmComm;
  std::vector<int>  LexicographicToWorldRank;
  static std::vector<void *> ShmCommBufs;
 #else 
  static void ShmInitGeneric(void);
  static commVector<uint8_t> ShmBufStorageVector;
 #endif 
  /////////////////////////////////
  // Grid information and queries
  // Implemented in Communicator_base.C
  /////////////////////////////////
  static void * ShmCommBuf;
  // Isend/Irecv/Wait, or Sendrecv blocking
  enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
  static CommunicatorPolicy_t CommunicatorPolicy;
  static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
  size_t heap_top;
  size_t heap_bytes;
  void *ShmBufferSelf(void);
  void *ShmBuffer(int rank);
  void *ShmBufferTranslate(int rank,void * local_p);
  void *ShmBufferMalloc(size_t bytes);
  void ShmBufferFreeAll(void) ;
  ////////////////////////////////////////////////
  // Must call in Grid startup
  ////////////////////////////////////////////////
  static void Init(int *argc, char ***argv);
-  
+
  ////////////////////////////////////////////////
-  // Constructor of any given grid
+  // Constructors to sub-divide a parent communicator
  // and default to comm world
  ////////////////////////////////////////////////
  CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank);
  CartesianCommunicator(const std::vector<int> &pdimensions_in);
  virtual ~CartesianCommunicator();
 private:
  ////////////////////////////////////////////////
  // Private initialise from an MPI communicator
  // Can use after an MPI_Comm_split, but hidden from user so private
  ////////////////////////////////////////////////
  void InitFromMPICommunicator(const std::vector<int> &processors, Grid_MPI_Comm communicator_base);
 public:
  ////////////////////////////////////////////////////////////////////////////////////////
  // Wraps MPI_Cart routines, or implements equivalent on other impls
@@ -148,13 +91,13 @@ class CartesianCommunicator {
  int  RankFromProcessorCoor(std::vector<int> &coor);
  void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
  int                      Dimensions(void)        ;
  int                      IsBoss(void)            ;
  int                      BossRank(void)          ;
  int                      ThisRank(void)          ;
  const std::vector<int> & ThisProcessorCoor(void) ;
  const std::vector<int> & ProcessorGrid(void)     ;
  int                      ProcessorCount(void)    ;
  int                      NodeCount(void)    ;
  ////////////////////////////////////////////////////////////////////////////////
  // very VERY rarely (Log, serial RNG) we need world without a grid
@@ -175,6 +118,8 @@ class CartesianCommunicator {
  void GlobalSumVector(ComplexF *c,int N);
  void GlobalSum(ComplexD &c);
  void GlobalSumVector(ComplexD *c,int N);
  void GlobalXOR(uint32_t &);
  void GlobalXOR(uint64_t &);
  template<class obj> void GlobalSum(obj &o){
    typedef typename obj::scalar_type scalar_type;
@@ -207,14 +152,21 @@ class CartesianCommunicator {
  void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
  double StencilSendToRecvFrom(void *xmit,
 			       int xmit_to_rank,
 			       void *recv,
 			       int recv_from_rank,
 			       int bytes,int dir);
  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-				  void *xmit,
+				    void *xmit,
-				  int xmit_to_rank,
+				    int xmit_to_rank,
-				  void *recv,
+				    void *recv,
-				  int recv_from_rank,
+				    int recv_from_rank,
-				  int bytes);
+				    int bytes,int dir);
-  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
+  
  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
  void StencilBarrier(void);
  ////////////////////////////////////////////////////////////
@@ -226,6 +178,23 @@ class CartesianCommunicator {
  // Broadcast a buffer and composite larger
  ////////////////////////////////////////////////////////////
  void Broadcast(int root,void* data, int bytes);
  ////////////////////////////////////////////////////////////
  // All2All down one dimension
  ////////////////////////////////////////////////////////////
  template<class T> void AllToAll(int dim,std::vector<T> &in, std::vector<T> &out){
    assert(dim>=0);
    assert(dim<_ndimension);
    assert(in.size()==out.size());
    int numnode = _processors[dim];
    uint64_t bytes=sizeof(T);
    uint64_t words=in.size()/numnode;
    assert(numnode * words == in.size());
    assert(words < (1ULL<<31));
    AllToAll(dim,(void *)&in[0],(void *)&out[0],words,bytes);
  }
  void AllToAll(int dim  ,void *in,void *out,uint64_t words,uint64_t bytes);
  void AllToAll(void  *in,void *out,uint64_t words         ,uint64_t bytes);
  template<class obj> void Broadcast(int root,obj &data)
    {
@@ -0,0 +1,514 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/Communicator_mpi.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/GridCore.h>
 #include <Grid/communicator/SharedMemory.h>
 namespace Grid {
 Grid_MPI_Comm       CartesianCommunicator::communicator_world;
 ////////////////////////////////////////////
 // First initialise of comms system
 ////////////////////////////////////////////
 // One-time start-up of the comms layer: initialise MPI if nobody else has,
 // duplicate MPI_COMM_WORLD into communicator_world and set up the global
 // shared-memory region.
 void CartesianCommunicator::Init(int *argc, char ***argv) 
 {
  int already_initialised;
  int thread_level;
  // Another library may have initialised MPI before us; only do it if needed.
  MPI_Initialized(&already_initialised);
  if ( !already_initialised ) {
    MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&thread_level);
    // One comms thread tolerates anything except THREAD_SINGLE;
    // several comms threads require THREAD_MULTIPLE.
    const bool one_thread_unsupported  = (nCommThreads == 1) && (thread_level == MPI_THREAD_SINGLE);
    const bool many_threads_unsupported = (nCommThreads > 1) && (thread_level != MPI_THREAD_MULTIPLE);
    assert( !(one_thread_unsupported || many_threads_unsupported) );
  }

  Grid_quiesce_nodes();

  // The duplicate is created once and never released.
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);

  GlobalSharedMemory::Init(communicator_world);
  GlobalSharedMemory::SharedMemoryAllocate(GlobalSharedMemory::MAX_MPI_SHM_BYTES,
                                           GlobalSharedMemory::Hugepages);
 }
 ///////////////////////////////////////////////////////////////////////////
 // Use cartesian communicators now even in MPI3
 ///////////////////////////////////////////////////////////////////////////
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 {
  int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
  assert(ierr==0);
 }
 int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
 {
  int rank;
  int ierr=MPI_Cart_rank  (communicator, &coor[0], &rank);
  assert(ierr==0);
  return rank;
 }
 void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
 {
  coor.resize(_ndimension);
  int ierr=MPI_Cart_coords  (communicator, rank, _ndimension,&coor[0]);
  assert(ierr==0);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Initialises from communicator_world
 ////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Construct a communicator for the processor grid `processors`, starting from
 // the communicator produced by GlobalSharedMemory::OptimalCommunicator.
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) 
 {
  // The remap hands back a communicator that this constructor owns and must free.
  MPI_Comm remapped_comm;
  GlobalSharedMemory::OptimalCommunicator(processors,remapped_comm);

  InitFromMPICommunicator(processors,remapped_comm);
  SetCommunicator(remapped_comm);

  // Temporary no longer needed once initialisation has used it.
  MPI_Comm_free(&remapped_comm);
 }
 //////////////////////////////////
 // Try to subdivide communicator
 //////////////////////////////////
 // Build a communicator for the (smaller) processor grid `processors` by
 // splitting the communicator of `parent`.
 //   processors : processor grid of each child; may have more dimensions than
 //                the parent, in which case the parent is padded with leading 1s.
 //   parent     : communicator to split.
 //   srank      : output; index of the child this rank belongs to
 //                (forced to 0 when there is only one child).
 // The parent size must be an exact multiple of the child size (asserted).
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)    
 {
  _ndimension = processors.size();
  int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
  // Parent coordinate / grid expressed in the child's dimensionality;
  // the extra leading dimensions keep coordinate 0 and extent 1.
  std::vector<int> parent_processor_coor(_ndimension,0);
  std::vector<int> parent_processors    (_ndimension,1);
  // Can make 5d grid from 4d etc...
  int pad = _ndimension-parent_ndimension;
  for(int d=0;d<parent_ndimension;d++){
    parent_processor_coor[pad+d]=parent._processor_coor[d];
    parent_processors    [pad+d]=parent._processors[d];
  }
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // split the communicator
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  //  int Nparent = parent._processors ; 
  //  std::cout << " splitting from communicator "<<parent.communicator <<std::endl;
  int Nparent;
  MPI_Comm_size(parent.communicator,&Nparent);
  //  std::cout << " Parent size  "<<Nparent <<std::endl;
  // Number of ranks in one child communicator
  int childsize=1;
  for(int d=0;d<processors.size();d++) {
    childsize *= processors[d];
  }
  // Number of child communicators; the division must be exact
  int Nchild = Nparent/childsize;
  assert (childsize * Nchild == Nparent);
  //  std::cout << " child size  "<<childsize <<std::endl;
  std::vector<int> ccoor(_ndimension); // coor within subcommunicator
  std::vector<int> scoor(_ndimension); // coor of split within parent
  std::vector<int> ssize(_ndimension); // number of splits along each dimension
  for(int d=0;d<_ndimension;d++){
    ccoor[d] = parent_processor_coor[d] % processors[d];
    scoor[d] = parent_processor_coor[d] / processors[d];
    ssize[d] = parent_processors[d]     / processors[d];
  }
  // rank within subcomm ; srank is rank of subcomm within blocks of subcomms
  int crank;  
  // Mpi uses the reverse Lexico convention to us; so reversed routines called
  Lexicographic::IndexFromCoorReversed(ccoor,crank,processors); // processors is the split grid dimensions
  Lexicographic::IndexFromCoorReversed(scoor,srank,ssize);      // ssize is the number of split grids
  MPI_Comm comm_split;
  if ( Nchild > 1 ) { 
    // Debug dump, compiled in but disabled.
    // NOTE(review): it prints comm_split before MPI_Comm_split has assigned it.
    if(0){
      std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
      std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"]    ";
      for(int d=0;d<parent._ndimension;d++)  std::cout << parent._processors[d] << " ";
      std::cout<<std::endl;
      std::cout << GridLogMessage<<" child grid["<< _ndimension <<"]    ";
      for(int d=0;d<processors.size();d++)  std::cout << processors[d] << " ";
      std::cout<<std::endl;
      std::cout << GridLogMessage<<" old rank "<< parent._processor<<" coor ["<< parent._ndimension <<"]    ";
      for(int d=0;d<parent._ndimension;d++)  std::cout << parent._processor_coor[d] << " ";
      std::cout<<std::endl;
      std::cout << GridLogMessage<<" new split "<< srank<<" scoor ["<< _ndimension <<"]    ";
      for(int d=0;d<processors.size();d++)  std::cout << scoor[d] << " ";
      std::cout<<std::endl;
      std::cout << GridLogMessage<<" new rank "<< crank<<" coor ["<< _ndimension <<"]    ";
      for(int d=0;d<processors.size();d++)  std::cout << ccoor[d] << " ";
      std::cout<<std::endl;
      //////////////////////////////////////////////////////////////////////////////////////////////////////
      // Declare victory
      //////////////////////////////////////////////////////////////////////////////////////////////////////
      std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into "
 		<< Nchild <<" communicators with " << childsize << " ranks"<<std::endl;
      std::cout << " Split communicator " <<comm_split <<std::endl;
    }
    ////////////////////////////////////////////////////////////////
    // Split the communicator: colour = srank (which child), key = crank (rank order inside it)
    ////////////////////////////////////////////////////////////////
    int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split);
    assert(ierr==0);
  } else {
    // Only one child: it spans the whole parent, so just duplicate it
    srank = 0;
    int ierr = MPI_Comm_dup (parent.communicator,&comm_split);
    assert(ierr==0);
  }
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Set up from the new split communicator
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  InitFromMPICommunicator(processors,comm_split);
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Take the right SHM buffers
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  SetCommunicator(comm_split);
  ///////////////////////////////////////////////
  // Free the temp communicator 
  ///////////////////////////////////////////////
  MPI_Comm_free(&comm_split);
  // Debug dump, disabled
  if(0){ 
    std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
    for(int d=0;d<processors.size();d++){
      std::cout << d<< " " << _processor_coor[d] <<" " <<  ccoor[d]<<std::endl;
    }
  }
  // The coordinate assigned by the new Cartesian communicator must agree with
  // the coordinate computed above from the parent layout.
  for(int d=0;d<processors.size();d++){
    assert(_processor_coor[d] == ccoor[d] );
  }
 }
 void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base)
 {
  ////////////////////////////////////////////////////
  // Creates communicator, and the communicator_halo
  ////////////////////////////////////////////////////
  _ndimension = processors.size();
  _processor_coor.resize(_ndimension);
  /////////////////////////////////
  // Count the requested nodes
  /////////////////////////////////
  _Nprocessors=1;
  _processors = processors;
  for(int i=0;i<_ndimension;i++){
    _Nprocessors*=_processors[i];
  }
  std::vector<int> periodic(_ndimension,1);
  MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],0,&communicator);
  MPI_Comm_rank(communicator,&_processor);
  MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
  if ( 0 && (communicator_base != communicator_world) ) {
    std::cout << "InitFromMPICommunicator Cartesian communicator created with a non-world communicator"<<std::endl;
    std::cout << " new communicator rank "<<_processor<< " coor ["<<_ndimension<<"] ";
    for(int d=0;d<_processors.size();d++){
      std::cout << _processor_coor[d]<<" ";
    }
    std::cout << std::endl;
  }
  int Size;
  MPI_Comm_size(communicator,&Size);
  communicator_halo.resize (2*_ndimension);
  for(int i=0;i<_ndimension*2;i++){
    MPI_Comm_dup(communicator,&communicator_halo[i]);
  }
  assert(Size==_Nprocessors);
 }
 // Release the Cartesian communicator and its halo duplicates, unless MPI has
 // already been finalised (freeing a communicator after MPI_Finalize is an error).
 CartesianCommunicator::~CartesianCommunicator()
 {
  int MPI_is_finalised;
  MPI_Finalized(&MPI_is_finalised);
  // A plain truthiness test is not a valid null check for an MPI handle
  // (MPI_COMM_NULL need not be zero), so compare against MPI_COMM_NULL as well.
  if (communicator && (communicator != MPI_COMM_NULL) && !MPI_is_finalised) {
    MPI_Comm_free(&communicator);
    for(size_t i=0;i<communicator_halo.size();i++){
      MPI_Comm_free(&communicator_halo[i]);
    }
  }  
 }
 void CartesianCommunicator::GlobalSum(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(uint64_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalXOR(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalXOR(uint64_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(float &f){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSumVector(float *f,int N)
 {
  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(double &d)
 {
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSumVector(double *d,int N)
 {
  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
 }
 // Basic Halo comms primitive
 // Blocking exchange: send `bytes` from xmit to rank `dest` and receive `bytes`
 // into recv from rank `from`, by starting the transfer and waiting for it.
 // (The original carried commented-out crc32 checks of both buffers for debugging.)
 void CartesianCommunicator::SendToRecvFrom(void *xmit,
 					   int dest,
 					   void *recv,
 					   int from,
 					   int bytes)
 {
  std::vector<CommsRequest_t> pending;
  SendToRecvFromBegin(pending,xmit,dest,recv,from,bytes);
  SendToRecvFromComplete(pending);
 }
 // One-way blocking transfer of `bytes` from rank `sender` (buffer xmit) to rank
 // `receiver` (buffer recv). Ranks that are neither do nothing. The sender's rank
 // is used as the message tag. MPI results are now checked, matching the
 // assert(ierr==0) convention used by the other primitives in this file.
 void CartesianCommunicator::SendRecvPacket(void *xmit,
 					   void *recv,
 					   int sender,
 					   int receiver,
 					   int bytes)
 {
  MPI_Status stat;
  assert(sender != receiver);
  int tag = sender;
  if ( _processor == sender ) {
    int ierr = MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
    assert(ierr==0);
  }
  if ( _processor == receiver ) { 
    int ierr = MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
    assert(ierr==0);
  }
 }
 // Basic Halo comms primitive
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
 						void *recv,
 						int from,
 						int bytes)
 {
  int myrank = _processor;
  int ierr;
  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { 
    MPI_Request xrq;
    MPI_Request rrq;
    ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
    ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
    assert(ierr==0);
    list.push_back(xrq);
    list.push_back(rrq);
  } else { 
    // Give the CPU to MPI immediately; can use threads to overlap optionally
    ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
 		      recv,bytes,MPI_CHAR,from, from,
 		      communicator,MPI_STATUS_IGNORE);
    assert(ierr==0);
  }
 }
 // Blocking stencil exchange for direction `dir`: begin, then complete.
 // Returns the byte count reported by StencilSendToRecvFromBegin.
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 						     int dest,
 						     void *recv,
 						     int from,
 						     int bytes,int dir)
 {
  std::vector<CommsRequest_t> pending;
  const double off_node_bytes = StencilSendToRecvFromBegin(pending,xmit,dest,recv,from,bytes,dir);
  StencilSendToRecvFromComplete(pending,dir);
  return off_node_bytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int dest,
 							 void *recv,
 							 int from,
 							 int bytes,int dir)
 {
  int ncomm  =communicator_halo.size(); 
  int commdir=dir%ncomm;
  MPI_Request xrq;
  MPI_Request rrq;
  int ierr;
  int gdest = ShmRanks[dest];
  int gfrom = ShmRanks[from];
  int gme   = ShmRanks[_processor];
  assert(dest != _processor);
  assert(from != _processor);
  assert(gme  == ShmRank);
  double off_node_bytes=0.0;
  if ( gfrom ==MPI_UNDEFINED) {
    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq);
    assert(ierr==0);
    list.push_back(rrq);
    off_node_bytes+=bytes;
  }
  if ( gdest == MPI_UNDEFINED ) {
    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[commdir],&xrq);
    assert(ierr==0);
    list.push_back(xrq);
    off_node_bytes+=bytes;
  }
  if ( CommunicatorPolicy == CommunicatorPolicySequential ) { 
    this->StencilSendToRecvFromComplete(list,dir);
  }
  return off_node_bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
  SendToRecvFromComplete(waitall);
 }
 // Barrier over the shared-memory communicator ShmComm. The MPI result is
 // checked in the same way as in CartesianCommunicator::Barrier.
 void CartesianCommunicator::StencilBarrier(void)
 {
  int ierr = MPI_Barrier  (ShmComm);
  assert(ierr==0);
 }
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  int nreq=list.size();
  if (nreq==0) return;
  std::vector<MPI_Status> status(nreq);
  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
  assert(ierr==0);
  list.resize(0);
 }
 void CartesianCommunicator::Barrier(void)
 {
  int ierr = MPI_Barrier(communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 {
  int ierr=MPI_Bcast(data,
 		     bytes,
 		     MPI_BYTE,
 		     root,
 		     communicator);
  assert(ierr==0);
 }
 // Rank of the calling process within communicator_world.
 int CartesianCommunicator::RankWorld(void){ 
  int world_rank; 
  MPI_Comm_rank(communicator_world,&world_rank);
  return world_rank;
 }
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
  int ierr= MPI_Bcast(data,
 		      bytes,
 		      MPI_BYTE,
 		      root,
 		      communicator_world);
  assert(ierr==0);
 }
 void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,uint64_t bytes)
 {
  std::vector<int> row(_ndimension,1);
  assert(dim>=0 && dim<_ndimension);
  //  Split the communicator
  row[dim] = _processors[dim];
  int me;
  CartesianCommunicator Comm(row,*this,me);
  Comm.AllToAll(in,out,words,bytes);
 }
 // All-to-all over this communicator: every rank sends `words` elements of
 // `bytes` bytes each to every rank, and receives the same amount into `out`.
 //
 // MPI counts are plain "int", so byte counts overflow 32 bits easily
 // (64*64*64*128*16 is about 500 million elements; times 24*4 bytes that is
 // about 50x10^9, far beyond 2x10^9; this also turns up on 32^3 x 64 Gparity).
 // To stay within int range one element is described by a contiguous MPI
 // datatype of `bytes` bytes and `words` is passed as the count.
 void CartesianCommunicator::AllToAll(void  *in,void *out,uint64_t words,uint64_t bytes)
 {
  MPI_Datatype object;
  const int iwords = static_cast<int>(words);
  const int ibytes = static_cast<int>(bytes);
  // Safe to narrow to int? Require a non-negative value that round-trips,
  // so a wrapped negative count can never reach MPI.
  assert(iwords >= 0 && words == static_cast<uint64_t>(iwords));
  assert(ibytes >= 0 && bytes == static_cast<uint64_t>(ibytes));
  int ierr;
  ierr = MPI_Type_contiguous(ibytes,MPI_BYTE,&object);
  assert(ierr==0);
  ierr = MPI_Type_commit(&object);
  assert(ierr==0);
  ierr = MPI_Alltoall(in,iwords,object,out,iwords,object,communicator);
  assert(ierr==0);
  ierr = MPI_Type_free(&object);
  assert(ierr==0);
 }
 }
@@ -32,10 +32,21 @@ namespace Grid {
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Info that is setup once and indept of cartesian layout
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 Grid_MPI_Comm       CartesianCommunicator::communicator_world;
 // Start-up for this comms implementation: neither argument is used; the routine
 // only initialises GlobalSharedMemory from communicator_world and allocates
 // the shared-memory region with the configured size and hugepage flag.
 void CartesianCommunicator::Init(int *argc, char *** arv)
 {
-  ShmInitGeneric();
+  GlobalSharedMemory::Init(communicator_world);
   GlobalSharedMemory::SharedMemoryAllocate(
 		   GlobalSharedMemory::MAX_MPI_SHM_BYTES,
 		   GlobalSharedMemory::Hugepages);
 }
 // "Split" constructor for this comms implementation: delegates to the plain
 // constructor, ignores `parent`, and always reports split index 0 in `srank`.
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank) 
  : CartesianCommunicator(processors) 
 {
  srank=0;
  SetCommunicator(communicator_world);
 }
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
@@ -51,14 +62,19 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
    assert(_processors[d]==1);
    _processor_coor[d] = 0;
  }
  SetCommunicator(communicator_world);
 }
 // In this comms implementation the destructor and all global reductions are
 // deliberate no-ops: each reduction leaves its argument untouched.
 CartesianCommunicator::~CartesianCommunicator(){}
 void CartesianCommunicator::GlobalSum(float &){}
 void CartesianCommunicator::GlobalSumVector(float *,int N){}
 void CartesianCommunicator::GlobalSum(double &){}
 void CartesianCommunicator::GlobalSum(uint32_t &){}
 void CartesianCommunicator::GlobalSum(uint64_t &){}
 void CartesianCommunicator::GlobalSumVector(double *,int N){}
 void CartesianCommunicator::GlobalXOR(uint32_t &){}
 void CartesianCommunicator::GlobalXOR(uint64_t &){}
 void CartesianCommunicator::SendRecvPacket(void *xmit,
 					   void *recv,
@@ -93,6 +109,14 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
 {
  assert(0);
 }
 // All-to-all along `dim` in this comms implementation: a plain copy of
 // words*bytes bytes from `in` to `out` (the dimension is not used).
 void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,uint64_t bytes)
 {
  const uint64_t total_bytes = bytes*words;
  bcopy(in,out,total_bytes);
 }
 // All-to-all in this comms implementation: a plain copy of words*bytes bytes
 // from `in` to `out`.
 void CartesianCommunicator::AllToAll(void  *in,void *out,uint64_t words,uint64_t bytes)
 {
  const uint64_t total_bytes = bytes*words;
  bcopy(in,out,total_bytes);
 }
 // In this comms implementation the world rank is always 0 and Barrier does nothing.
 int  CartesianCommunicator::RankWorld(void){return 0;}
 void CartesianCommunicator::Barrier(void){}
@@ -106,6 +130,36 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
  dest=0;
 }
 // Stencil exchange in this comms implementation: forwards to the generic
 // begin/complete pair, dropping `dir`, and reports 2*bytes as the byte count.
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 						     int xmit_to_rank,
 						     void *recv,
 						     int recv_from_rank,
 						     int bytes, int dir)
 {
  std::vector<CommsRequest_t> pending;
  SendToRecvFromBegin(pending,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
  SendToRecvFromComplete(pending);
  const double reported_bytes = 2.0*bytes;
  return reported_bytes;
 }
 // Begin half of the stencil exchange in this comms implementation: forwards to
 // SendToRecvFromBegin, dropping `dir`, and reports 2*bytes as the byte count.
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int xmit_to_rank,
 							 void *recv,
 							 int recv_from_rank,
 							 int bytes, int dir)
 {
  SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
  const double reported_bytes = 2.0*bytes;
  return reported_bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
  SendToRecvFromComplete(waitall);
 }
 // Nothing to synchronise in this comms implementation.
 void CartesianCommunicator::StencilBarrier(void)
 {
 }
 }
@@ -0,0 +1,92 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/SharedMemory.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/GridCore.h>
 namespace Grid { 
 // static data
 // Storage for the static members of GlobalSharedMemory.
 // Only the first two carry explicit initial values; the others have static
 // storage duration and therefore start out zero / empty.
 uint64_t            GlobalSharedMemory::MAX_MPI_SHM_BYTES   = 1024LL*1024LL*1024LL; // 1 GiB default
 int                 GlobalSharedMemory::Hugepages = 0;
 int                 GlobalSharedMemory::_ShmSetup;
 int                 GlobalSharedMemory::_ShmAlloc;
 uint64_t            GlobalSharedMemory::_ShmAllocBytes;
 std::vector<void *> GlobalSharedMemory::WorldShmCommBufs;
 Grid_MPI_Comm       GlobalSharedMemory::WorldShmComm;
 int                 GlobalSharedMemory::WorldShmRank;
 int                 GlobalSharedMemory::WorldShmSize;
 std::vector<int>    GlobalSharedMemory::WorldShmRanks;
 Grid_MPI_Comm       GlobalSharedMemory::WorldComm;
 int                 GlobalSharedMemory::WorldSize;
 int                 GlobalSharedMemory::WorldRank;
 int                 GlobalSharedMemory::WorldNodes;
 int                 GlobalSharedMemory::WorldNode;
 // Unmap every buffer in WorldShmCommBufs (one per shared-memory rank) and
 // reset the allocation bookkeeping. Requires a prior, non-empty allocation.
 void GlobalSharedMemory::SharedMemoryFree(void)
 {
  assert(_ShmAlloc);
  assert(_ShmAllocBytes>0);

  int shm_rank = 0;
  while ( shm_rank < WorldShmSize ) {
    munmap(WorldShmCommBufs[shm_rank],_ShmAllocBytes);
    ++shm_rank;
  }

  _ShmAlloc      = 0;
  _ShmAllocBytes = 0;
 }
 /////////////////////////////////
 // Alloc, free shmem region
 /////////////////////////////////
 // Hand out the next `bytes` bytes of this instance's shared heap: returns the
 // current heap_top and advances heap_top / heap_bytes by `bytes`.
 // Once the running total reaches heap_size a diagnostic is printed and the
 // assert fires.
 void *SharedMemory::ShmBufferMalloc(size_t bytes){
  //  bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
  void *allocation = (void *)heap_top;

  heap_top   += bytes;
  heap_bytes += bytes;

  const bool exhausted = (heap_bytes >= heap_size);
  if (exhausted) {
    std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
    std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
    std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl;
    assert(heap_bytes<heap_size);
  }
  return allocation;
 }
 void SharedMemory::ShmBufferFreeAll(void) { 
  heap_top  =(size_t)ShmBufferSelf();
  heap_bytes=0;
 }
 // Base address of the shared buffer belonging to this rank (index ShmRank).
 void *SharedMemory::ShmBufferSelf(void)
 {
  void *own_buffer = ShmCommBufs[ShmRank];
  return own_buffer;
 }
 }
@@ -0,0 +1,165 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/SharedMemory.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 // TODO
 // 1) move includes into SharedMemory.cc
 //
 // 2) split shared memory into a) optimal communicator creation from comm world
 // 
 //                             b) shared memory buffers container
 //                                -- static globally shared; init once
 //                                -- per instance set of buffers.
 //                                   
 #pragma once 
 #include <Grid/GridCore.h>
 #if defined (GRID_COMMS_MPI3) 
 #include <mpi.h>
 #endif 
 #include <semaphore.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <limits.h>
 #include <sys/types.h>
 #include <sys/ipc.h>
 #include <sys/shm.h>
 #include <sys/mman.h>
 #include <zlib.h>
 #ifdef HAVE_NUMAIF_H
 #include <numaif.h>
 #endif
 namespace Grid {
 // Communicator and request handle types: the real MPI types when built with
 // GRID_COMMS_MPI3, plain int placeholders otherwise.
 #if defined (GRID_COMMS_MPI3) 
  typedef MPI_Comm    Grid_MPI_Comm;
  typedef MPI_Request CommsRequest_t;
 #else 
  typedef int CommsRequest_t;
  typedef int Grid_MPI_Comm;
 #endif
 // Process-wide (all-static) state describing the world communicator, the
 // shared-memory group this rank belongs to, and the shared-memory buffers
 // allocated on top of it.
 class GlobalSharedMemory {
 private:
  static const int     MAXLOG2RANKSPERNODE = 16;            
  // Init once lock on the buffer allocation
  static int      _ShmSetup;       // asserted to be 0 on entry to Init()
  static int      _ShmAlloc;       // non-zero while buffers are allocated
  static uint64_t _ShmAllocBytes;  // size used when unmapping each buffer
 public:
  // Read-only accessors for the bookkeeping flags above
  static int      ShmSetup(void)      { return _ShmSetup; }
  static int      ShmAlloc(void)      { return _ShmAlloc; }
  static uint64_t ShmAllocBytes(void) { return _ShmAllocBytes; }
  // Requested size and hugepage flag passed to SharedMemoryAllocate by Init code
  static uint64_t      MAX_MPI_SHM_BYTES;
  static int           Hugepages;
  // One buffer pointer per rank of the shared-memory group
  static std::vector<void *> WorldShmCommBufs;
  // World communicator with this process's rank and the total size
  static Grid_MPI_Comm WorldComm;
  static int           WorldRank;
  static int           WorldSize;
  // Shared-memory group communicator with this process's rank and the group size
  static Grid_MPI_Comm WorldShmComm;
  static int           WorldShmRank;
  static int           WorldShmSize;
  // WorldNodes = WorldSize / WorldShmSize
  static int           WorldNodes;
  static int           WorldNode;
  static std::vector<int>  WorldShmRanks;
  //////////////////////////////////////////////////////////////////////////////////////
  // Create an optimal reordered communicator that makes MPI_Cart_create get it right
  //////////////////////////////////////////////////////////////////////////////////////
  static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
  static void OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
  ///////////////////////////////////////////////////
  // Provide shared memory facilities off comm world
  ///////////////////////////////////////////////////
  static void SharedMemoryAllocate(uint64_t bytes, int flags);
  static void SharedMemoryFree(void);
 };
 //////////////////////////////
 // one per communicator
 //////////////////////////////
 // Per-communicator view of the shared-memory buffers together with a simple
 // linear heap carved out of this rank's own buffer.
 // The scalar members now have in-class initialisers so that a freshly
 // constructed object has well-defined state before SetCommunicator() is called.
 class SharedMemory 
 {
 private:
  static const int     MAXLOG2RANKSPERNODE = 16;            
  size_t heap_top   = 0; // address of the next free byte, stored as an integer
  size_t heap_bytes = 0; // bytes handed out since the last ShmBufferFreeAll
  size_t heap_size  = 0; // limit checked by ShmBufferMalloc
 protected:
  Grid_MPI_Comm    ShmComm; // for barriers
  int    ShmRank = 0; 
  int    ShmSize = 0;
  std::vector<void *> ShmCommBufs;
  std::vector<int>    ShmRanks;// Mapping comm ranks to Shm ranks
 public:
  SharedMemory() {};
  ~SharedMemory();
  ///////////////////////////////////////////////////////////////////////////////////////
  // set the buffers & sizes
  ///////////////////////////////////////////////////////////////////////////////////////
  void SetCommunicator(Grid_MPI_Comm comm);
  ////////////////////////////////////////////////////////////////////////
  // For this instance ; disjoint buffer sets between splits if split grid
  ////////////////////////////////////////////////////////////////////////
  void ShmBarrier(void); 
  ///////////////////////////////////////////////////
  // Call on any instance
  ///////////////////////////////////////////////////
  void SharedMemoryTest(void);
  void *ShmBufferSelf(void);
  void *ShmBuffer    (int rank);
  void *ShmBufferTranslate(int rank,void * local_p);
  void *ShmBufferMalloc(size_t bytes);
  void  ShmBufferFreeAll(void) ;
  //////////////////////////////////////////////////////////////////////////
  // Make info on Nodes & ranks and Shared memory available
  //////////////////////////////////////////////////////////////////////////
  int NodeCount(void) { return GlobalSharedMemory::WorldNodes;};
  int RankCount(void) { return GlobalSharedMemory::WorldSize;};
 };
 }
@@ -0,0 +1,651 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/SharedMemory.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/GridCore.h>
 #include <pwd.h>
 namespace Grid { 
 /*
  * Construct from an MPI communicator.
  *
  * Records the world communicator, splits it into groups of ranks that can
  * share memory (MPI_COMM_TYPE_SHARED), and works out
  *   - WorldShmRanks : for every world rank, its rank inside our shared-memory
  *                     group, or MPI_UNDEFINED if it is not in our group
  *   - WorldNodes    : number of shared-memory groups
  *   - WorldNode     : index of our group, ordered by the world rank of each
  *                     group's leader (its lowest world rank)
  * May only be called once (asserts _ShmSetup==0, then sets it to 1).
  */
 void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
 {
  assert(_ShmSetup==0);
  WorldComm = comm;
  MPI_Comm_rank(WorldComm,&WorldRank);
  MPI_Comm_size(WorldComm,&WorldSize);
  // WorldComm, WorldSize, WorldRank
  /////////////////////////////////////////////////////////////////////
  // Split into groups that can share memory
  /////////////////////////////////////////////////////////////////////
  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
  MPI_Comm_rank(WorldShmComm     ,&WorldShmRank);
  MPI_Comm_size(WorldShmComm     ,&WorldShmSize);
  // WorldShmComm, WorldShmSize, WorldShmRank
  // WorldNodes
  WorldNodes = WorldSize/WorldShmSize;
  assert( (WorldNodes * WorldShmSize) == WorldSize );
  /////////////////////////////////////////////////////////////////////
  // find world ranks in our SHM group (i.e. which ranks are on our node)
  /////////////////////////////////////////////////////////////////////
  MPI_Group WorldGroup, ShmGroup;
  MPI_Comm_group (WorldComm, &WorldGroup); 
  MPI_Comm_group (WorldShmComm, &ShmGroup);
  std::vector<int> world_ranks(WorldSize);   for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
  WorldShmRanks.resize(WorldSize); 
  MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &WorldShmRanks[0]); 
  // The group handles were only needed for the translation; release them.
  MPI_Group_free(&WorldGroup);
  MPI_Group_free(&ShmGroup);
  ///////////////////////////////////////////////////////////////////
  // Identify who is in my group and nominate the leader
  ///////////////////////////////////////////////////////////////////
  int g=0;
  std::vector<int> MyGroup;
  MyGroup.resize(WorldShmSize);
  for(int rank=0;rank<WorldSize;rank++){
    if(WorldShmRanks[rank]!=MPI_UNDEFINED){
      assert(g<WorldShmSize);
      MyGroup[g++] = rank;
    }
  }
  std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>());
  int myleader = MyGroup[0];
  std::vector<int> leaders_1hot(WorldSize,0);
  std::vector<int> leaders_group(WorldNodes,0);
  leaders_1hot [ myleader ] = 1;
  ///////////////////////////////////////////////////////////////////
  // global sum leaders over comm world
  ///////////////////////////////////////////////////////////////////
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,WorldComm);
  assert(ierr==0);
  ///////////////////////////////////////////////////////////////////
  // find the group leaders world rank
  ///////////////////////////////////////////////////////////////////
  int group=0;
  for(int l=0;l<WorldSize;l++){
    if(leaders_1hot[l]){
      // More leaders than WorldNodes means the groups are not all the same size;
      // stop here rather than write past the end of leaders_group.
      assert(group<WorldNodes);
      leaders_group[group++] = l;
    }
  }
  // Every shared-memory group must have the same size, otherwise the per-rank
  // WorldNodes values above would disagree with the real number of groups.
  assert(group==WorldNodes);
  ///////////////////////////////////////////////////////////////////
  // Identify the node of the group in which I (and my leader) live
  ///////////////////////////////////////////////////////////////////
  WorldNode=-1;
  for(int node=0;node<WorldNodes;node++){
    if (myleader == leaders_group[node]){
      WorldNode=node;
    }
  }
  assert(WorldNode!=-1);
  _ShmSetup=1;
 }
 // Gray encode support: returns the reflected Gray code of `binary`,
 // i.e. the value xor'ed with itself shifted right by one bit.
 int BinaryToGray (int  binary) {
  return binary ^ (binary>>1);
 }
 // Returns i such that 2^i == TwoToPower, searching i = 0 .. MAXLOG2 inclusive.
 // Returns -1 when TwoToPower is not a power of two, is not positive, or
 // exceeds 2^MAXLOG2.
 int Log2Size(int TwoToPower,int MAXLOG2)
 {
  // Walk the powers of two in a wider type and stop as soon as they pass the
  // target: no shift can exceed the width of int, whatever MAXLOG2 is.
  long long power = 1;
  for(int i=0;i<=MAXLOG2;i++){
    if ( power == TwoToPower ) return i;
    if ( power >  TwoToPower ) break;   // overshoot; also rejects TwoToPower <= 0
    power *= 2;
  }
  return -1;
 }
 void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
 {
 #ifdef HYPERCUBE
  ////////////////////////////////////////////////////////////////
  // Assert power of two shm_size.
  ////////////////////////////////////////////////////////////////
  int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
  assert(log2size != -1);
  ////////////////////////////////////////////////////////////////
  // Identify the hypercube coordinate of this node using hostname
  ////////////////////////////////////////////////////////////////
  // n runs 0...7 9...16 18...25 27...34     (8*4)  5 bits
  // i runs 0..7                                    3 bits
  // r runs 0..3                                    2 bits
  // 2^10 = 1024 nodes
  const int maxhdim = 10; 
  std::vector<int> HyperCubeCoords(maxhdim,0);
  std::vector<int> RootHyperCubeCoords(maxhdim,0);
  int R;
  int I;
  int N;
  const int namelen = _POSIX_HOST_NAME_MAX;
  char name[namelen];
  // Parse ICE-XA hostname to get hypercube location
  gethostname(name,namelen);
  int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
  assert(nscan==3);
  int nlo = N%9;
  int nhi = N/9;
  uint32_t hypercoor = (R<<8)|(I<<5)|(nhi<<3)|nlo ;
  uint32_t rootcoor  = hypercoor;
  //////////////////////////////////////////////////////////////////
  // Print debug info
  //////////////////////////////////////////////////////////////////
  for(int d=0;d<maxhdim;d++){
    HyperCubeCoords[d] = (hypercoor>>d)&0x1;
  }
  std::string hname(name);
  std::cout << "hostname "<<hname<<std::endl;
  std::cout << "R " << R << " I " << I << " N "<< N
            << " hypercoor 0x"<<std::hex<<hypercoor<<std::dec<<std::endl;
  //////////////////////////////////////////////////////////////////
  // broadcast node 0's base coordinate for this partition.
  //////////////////////////////////////////////////////////////////
  MPI_Bcast(&rootcoor, sizeof(rootcoor), MPI_BYTE, 0, WorldComm); 
  hypercoor=hypercoor-rootcoor;
  assert(hypercoor<WorldSize);
  assert(hypercoor>=0);
  //////////////////////////////////////
  // Printing
  //////////////////////////////////////
  for(int d=0;d<maxhdim;d++){
    HyperCubeCoords[d] = (hypercoor>>d)&0x1;
  }
  ////////////////////////////////////////////////////////////////
  // Identify subblock of ranks on node spreading across dims
  // in a maximally symmetrical way
  ////////////////////////////////////////////////////////////////
  int ndimension              = processors.size();
  std::vector<int> processor_coor(ndimension);
  std::vector<int> WorldDims = processors;   std::vector<int> ShmDims  (ndimension,1);  std::vector<int> NodeDims (ndimension);
  std::vector<int> ShmCoor  (ndimension);    std::vector<int> NodeCoor (ndimension);    std::vector<int> WorldCoor(ndimension);
  std::vector<int> HyperCoor(ndimension);
  int dim = 0;
  for(int l2=0;l2<log2size;l2++){
    while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
    ShmDims[dim]*=2;
    dim=(dim+1)%ndimension;
  }
  ////////////////////////////////////////////////////////////////
  // Establish torus of processes and nodes with sub-blockings
  ////////////////////////////////////////////////////////////////
  for(int d=0;d<ndimension;d++){
    NodeDims[d] = WorldDims[d]/ShmDims[d];
  }
  ////////////////////////////////////////////////////////////////
  // Map Hcube according to physical lattice 
  // must partition. Loop over dims and find out who would join.
  ////////////////////////////////////////////////////////////////
  int hcoor = hypercoor;
  for(int d=0;d<ndimension;d++){
     int bits = Log2Size(NodeDims[d],MAXLOG2RANKSPERNODE);
     int msk  = (0x1<<bits)-1;
     HyperCoor[d]=hcoor & msk;  
     HyperCoor[d]=BinaryToGray(HyperCoor[d]); // Space filling curve magic
     hcoor = hcoor >> bits;
  } 
  ////////////////////////////////////////////////////////////////
  // Check processor counts match
  ////////////////////////////////////////////////////////////////
  int Nprocessors=1;
  for(int i=0;i<ndimension;i++){
    Nprocessors*=processors[i];
  }
  assert(WorldSize==Nprocessors);
  ////////////////////////////////////////////////////////////////
  // Establish mapping between lexico physics coord and WorldRank
  ////////////////////////////////////////////////////////////////
  int rank;
  Lexicographic::CoorFromIndexReversed(NodeCoor,WorldNode   ,NodeDims);
  for(int d=0;d<ndimension;d++) NodeCoor[d]=HyperCoor[d];
  Lexicographic::CoorFromIndexReversed(ShmCoor ,WorldShmRank,ShmDims);
  for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d];
  Lexicographic::IndexFromCoorReversed(WorldCoor,rank,WorldDims);
  /////////////////////////////////////////////////////////////////
  // Build the new communicator
  /////////////////////////////////////////////////////////////////
  int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
  assert(ierr==0);
 #else 
  ////////////////////////////////////////////////////////////////
  // Assert power of two shm_size.
  ////////////////////////////////////////////////////////////////
  int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
  assert(log2size != -1);
  ////////////////////////////////////////////////////////////////
  // Identify subblock of ranks on node spreading across dims
  // in a maximally symmetrical way
  ////////////////////////////////////////////////////////////////
  int ndimension              = processors.size();
  std::vector<int> processor_coor(ndimension);
  std::vector<int> WorldDims = processors;   std::vector<int> ShmDims  (ndimension,1);  std::vector<int> NodeDims (ndimension);
  std::vector<int> ShmCoor  (ndimension);    std::vector<int> NodeCoor (ndimension);    std::vector<int> WorldCoor(ndimension);
  int dim = 0;
  for(int l2=0;l2<log2size;l2++){
    while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
    ShmDims[dim]*=2;
    dim=(dim+1)%ndimension;
  }
  ////////////////////////////////////////////////////////////////
  // Establish torus of processes and nodes with sub-blockings
  ////////////////////////////////////////////////////////////////
  for(int d=0;d<ndimension;d++){
    NodeDims[d] = WorldDims[d]/ShmDims[d];
  }
  ////////////////////////////////////////////////////////////////
  // Check processor counts match
  ////////////////////////////////////////////////////////////////
  int Nprocessors=1;
  for(int i=0;i<ndimension;i++){
    Nprocessors*=processors[i];
  }
  assert(WorldSize==Nprocessors);
  ////////////////////////////////////////////////////////////////
  // Establish mapping between lexico physics coord and WorldRank
  ////////////////////////////////////////////////////////////////
  int rank;
  Lexicographic::CoorFromIndexReversed(NodeCoor,WorldNode   ,NodeDims);
  Lexicographic::CoorFromIndexReversed(ShmCoor ,WorldShmRank,ShmDims);
  for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d];
  Lexicographic::IndexFromCoorReversed(WorldCoor,rank,WorldDims);
  /////////////////////////////////////////////////////////////////
  // Build the new communicator
  /////////////////////////////////////////////////////////////////
  int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
  assert(ierr==0);
 #endif
 }
 ////////////////////////////////////////////////////////////////////////////////////////////
 // SHMGET
 ////////////////////////////////////////////////////////////////////////////////////////////
 #ifdef GRID_MPI3_SHMGET
 /*
  * SysV shmget implementation: allocate one shared segment of `bytes` per rank
  * of the node-local communicator.
  *
  * Local rank 0 creates every segment, the ids are broadcast over WorldShmComm,
  * every rank attaches all of them into WorldShmCommBufs, and the segments are
  * then marked IPC_RMID so that they disappear once the last process detaches.
  * Huge pages are selected by the Hugepages setting, as before; the `flags`
  * argument is not consulted by this implementation.
  */
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
  std::cout << "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  // allocate the shared windows for our group
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  MPI_Barrier(WorldShmComm);
  WorldShmCommBufs.resize(WorldShmSize);
  std::vector<int> shmids(WorldShmSize);
  if ( WorldShmRank == 0 ) {
    for(int r=0;r<WorldShmSize;r++){
      size_t size = bytes;
      key_t key   = IPC_PRIVATE;
      // named shm_flags so that it no longer hides the `flags` parameter
      int shm_flags = IPC_CREAT | SHM_R | SHM_W;
 #ifdef SHM_HUGETLB
      if (Hugepages) shm_flags|=SHM_HUGETLB;
 #endif
      if ((shmids[r]= shmget(key,size, shm_flags)) ==-1) {
        int errsv = errno;
        printf("Errno %d\n",errsv);
        printf("key   %d\n",(int)key);
        printf("size  %zu\n",size);   // size_t needs %zu
        printf("flags %d\n",shm_flags);
        perror("shmget");
        exit(1);
      }
    }
  }
  MPI_Barrier(WorldShmComm);
  MPI_Bcast(&shmids[0],WorldShmSize*sizeof(int),MPI_BYTE,0,WorldShmComm);
  MPI_Barrier(WorldShmComm);
  for(int r=0;r<WorldShmSize;r++){
    WorldShmCommBufs[r] = (uint64_t *)shmat(shmids[r], NULL,0);
    if (WorldShmCommBufs[r] == (uint64_t *)-1) {
      perror("Shared memory attach failure");
      // SysV segments outlive the process unless removed: mark all of them,
      // not only the one that failed to attach.
      for(int rr=0;rr<WorldShmSize;rr++){
        shmctl(shmids[rr], IPC_RMID, NULL);
      }
      exit(2);
    }
  }
  MPI_Barrier(WorldShmComm);
  ///////////////////////////////////
  // Mark for clean up
  ///////////////////////////////////
  for(int r=0;r<WorldShmSize;r++){
    shmctl(shmids[r], IPC_RMID,(struct shmid_ds *)NULL);
  }
  MPI_Barrier(WorldShmComm);
  _ShmAlloc=1;
  _ShmAllocBytes  = bytes;
 }
 #endif
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Hugetlbfs mapping intended
 ////////////////////////////////////////////////////////////////////////////////////////////
 #ifdef GRID_MPI3_SHMMMAP
 /*
  * File-backed mmap implementation: every rank maps one file per node-local
  * rank, GRID_SHM_PATH "/Grid_mpi3_shm_<WorldNode>_<r>", MAP_SHARED into
  * WorldShmCommBufs[r].  A non-zero `flags` requests MAP_HUGETLB where the
  * platform defines it.  Any failure terminates the program.
  */
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
  std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  // allocate the shared windows for our group
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  MPI_Barrier(WorldShmComm);
  WorldShmCommBufs.resize(WorldShmSize);
  ////////////////////////////////////////////////////////////////////////////////////////////
  // Hugetlbfs and others map filesystems as mappable huge pages
  ////////////////////////////////////////////////////////////////////////////////////////////
  char shm_name [NAME_MAX];
  for(int r=0;r<WorldShmSize;r++){
    // bounded formatting: GRID_SHM_PATH is a build-time string of unknown length
    int len = snprintf(shm_name,sizeof(shm_name),GRID_SHM_PATH "/Grid_mpi3_shm_%d_%d",WorldNode,r);
    if ( len < 0 || len >= (int)sizeof(shm_name) ) {
      printf("shared memory file name under %s does not fit in %d bytes\n",GRID_SHM_PATH,(int)sizeof(shm_name));
      exit(1);
    }
    int fd=open(shm_name,O_RDWR|O_CREAT,0666);
    if ( fd == -1) { 
      printf("open %s failed\n",shm_name);
      perror("open hugetlbfs");
      exit(1); // was exit(0): a failure must not report success to the launcher
    }
    int mmap_flag = MAP_SHARED ;
 #ifdef MAP_POPULATE    
    mmap_flag|=MAP_POPULATE;
 #endif
 #ifdef MAP_HUGETLB
    if ( flags ) mmap_flag |= MAP_HUGETLB;
 #endif
    void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0); 
    if ( ptr == (void *)MAP_FAILED ) {    
      printf("mmap %s failed\n",shm_name);
      perror("failed mmap");
      assert(0);
      exit(1); // still stop when asserts are compiled out
    }
    assert(((uint64_t)ptr&0x3F)==0);
    close(fd); // the mapping stays valid after the descriptor is closed
    WorldShmCommBufs[r] =ptr;
    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
  }
  _ShmAlloc=1;
  _ShmAllocBytes  = bytes;
 }
 #endif // MMAP
 #ifdef GRID_MPI3_SHM_NONE
 /*
  * Anonymous mmap implementation for the case of exactly one rank per
  * shared-memory group (asserted): maps `bytes` of MAP_SHARED|MAP_ANONYMOUS
  * memory into WorldShmCommBufs[0].  A non-zero `flags` requests MAP_HUGETLB
  * where the platform defines it.
  */
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
  std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  // allocate the shared windows for our group
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  MPI_Barrier(WorldShmComm);
  WorldShmCommBufs.resize(WorldShmSize);
  ////////////////////////////////////////////////////////////////////////////////////////////
  // Anonymous mapping: there is no backing file and hence no file name
  ////////////////////////////////////////////////////////////////////////////////////////////
  assert(WorldShmSize == 1);
  for(int r=0;r<WorldShmSize;r++){
    int fd=-1; // anonymous mappings take no descriptor
    int mmap_flag = MAP_SHARED |MAP_ANONYMOUS ;
 #ifdef MAP_POPULATE    
    mmap_flag|=MAP_POPULATE;
 #endif
 #ifdef MAP_HUGETLB
    if ( flags ) mmap_flag |= MAP_HUGETLB;
 #endif
    void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0); 
    if ( ptr == (void *)MAP_FAILED ) {    
      // The message used to format a name buffer that was never written.
      printf("mmap anonymous failed\n");
      perror("failed mmap");
      assert(0);
      exit(1); // still stop when asserts are compiled out
    }
    assert(((uint64_t)ptr&0x3F)==0);
    // nothing to close: no descriptor was ever opened
    WorldShmCommBufs[r] =ptr;
    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
  }
  _ShmAlloc=1;
  _ShmAllocBytes  = bytes;
 }
 #endif // GRID_MPI3_SHM_NONE
 #ifdef GRID_MPI3_SHMOPEN
 ////////////////////////////////////////////////////////////////////////////////////////////
 // POSIX SHMOPEN ; as far as I know Linux does not allow EXPLICIT HugePages with this case
 // tmpfs (Larry Meadows says) does not support explicit huge page, and this is used for 
 // the posix shm virtual file system
 ////////////////////////////////////////////////////////////////////////////////////////////
 /*
  * POSIX shm_open implementation.
  *
  * Local rank 0 creates, sizes and maps one object per node-local rank, named
  * "/Grid_<user>_mpi3_shm_<WorldNode>_<r>"; after a barrier the other ranks
  * open and map the same objects.  All mappings go into WorldShmCommBufs.
  * A non-zero `flags` requests MAP_HUGETLB on the creating rank where the
  * platform defines it.
  */
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 { 
  std::cout << "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0); 
  MPI_Barrier(WorldShmComm);
  WorldShmCommBufs.resize(WorldShmSize);
  // Look the user name up once.  getpwuid returns NULL when the uid has no
  // passwd entry, and its result lives in storage later calls may overwrite,
  // so take a copy straight away.
  struct passwd *pw = getpwuid (getuid());
  if ( pw == NULL ) {
    printf("getpwuid failed: no passwd entry for uid %d\n",(int)getuid());
    exit(1);
  }
  std::string user(pw->pw_name);
  char shm_name [NAME_MAX];
  size_t size = bytes;
  if ( WorldShmRank == 0 ) {
    for(int r=0;r<WorldShmSize;r++){
      int len = snprintf(shm_name,sizeof(shm_name),"/Grid_%s_mpi3_shm_%d_%d",user.c_str(),WorldNode,r);
      assert(len > 0 && len < (int)sizeof(shm_name));
      shm_unlink(shm_name); // drop any stale object left by an earlier run
      int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
      if ( fd < 0 ) {	perror("failed shm_open");	assert(0);      }
      // An object that was not grown to `size` would fault on first touch.
      if ( ftruncate(fd, size) != 0 ) {	perror("failed ftruncate");	assert(0);	exit(1);      }
      int mmap_flag = MAP_SHARED;
 #ifdef MAP_POPULATE 
      mmap_flag |= MAP_POPULATE;
 #endif
 #ifdef MAP_HUGETLB
      if (flags) mmap_flag |= MAP_HUGETLB;
 #endif
      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
      std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
      if ( ptr == (void * )MAP_FAILED ) {       
 	perror("failed mmap");     
 	assert(0);    
      }
      assert(((uint64_t)ptr&0x3F)==0);
      WorldShmCommBufs[r] =ptr;
      close(fd);
    }
  }
  // the objects must exist before the other ranks try to open them
  MPI_Barrier(WorldShmComm);
  if ( WorldShmRank != 0 ) { 
    for(int r=0;r<WorldShmSize;r++){
      int len = snprintf(shm_name,sizeof(shm_name),"/Grid_%s_mpi3_shm_%d_%d",user.c_str(),WorldNode,r);
      assert(len > 0 && len < (int)sizeof(shm_name));
      int fd=shm_open(shm_name,O_RDWR,0666);
      if ( fd<0 ) {	perror("failed shm_open");	assert(0);      }
      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
      if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
      assert(((uint64_t)ptr&0x3F)==0);
      WorldShmCommBufs[r] =ptr;
      close(fd);
    }
  }
  _ShmAlloc=1;
  _ShmAllocBytes = bytes;
 }
 #endif
  ////////////////////////////////////////////////////////
  // Global shared functionality finished
  // Now move to per communicator functionality
  ////////////////////////////////////////////////////////
 /*
  * Bind this instance to `comm`.
  *
  * Splits `comm` into its shared-memory group (ShmComm / ShmRank / ShmSize),
  * selects for every member of that group the buffer GlobalSharedMemory
  * allocated for it, resets the local buffer heap, and fills ShmRanks with
  * the translation from ranks of `comm` to ranks of ShmComm (MPI_UNDEFINED for
  * ranks outside our group).  Requires that
  * GlobalSharedMemory::SharedMemoryAllocate has already run.
  */
 void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
 {
  int rank, size;
  MPI_Comm_rank(comm,&rank);
  MPI_Comm_size(comm,&size);
  ShmRanks.resize(size);
  /////////////////////////////////////////////////////////////////////
  // Split into groups that can share memory
  /////////////////////////////////////////////////////////////////////
  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
  MPI_Comm_rank(ShmComm     ,&ShmRank);
  MPI_Comm_size(ShmComm     ,&ShmSize);
  ShmCommBufs.resize(ShmSize);
  //////////////////////////////////////////////////////////////////////
  // Map ShmRank to WorldShmRank and use the right buffer
  //////////////////////////////////////////////////////////////////////
  assert (GlobalSharedMemory::ShmAlloc()==1);
  heap_size = GlobalSharedMemory::ShmAllocBytes();
  for(int r=0;r<ShmSize;r++){
    // Only rank r contributes a non-zero term, so the sum delivers r's
    // WorldShmRank to everybody in the group.
    uint32_t wsr = (r==ShmRank) ? GlobalSharedMemory::WorldShmRank : 0 ;
    MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
    ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
    //    std::cout << "SetCommunicator ShmCommBufs ["<< r<< "] = "<< ShmCommBufs[r]<< "  wsr = "<<wsr<<std::endl;
  }
  ShmBufferFreeAll();
  /////////////////////////////////////////////////////////////////////
  // find comm ranks in our SHM group (i.e. which ranks are on our node)
  /////////////////////////////////////////////////////////////////////
  MPI_Group FullGroup, ShmGroup;
  MPI_Comm_group (comm   , &FullGroup); 
  MPI_Comm_group (ShmComm, &ShmGroup);
  std::vector<int> ranks(size);   for(int r=0;r<size;r++) ranks[r]=r;
  MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]); 
  // The group handles were only needed for the translation; release them.
  MPI_Group_free(&FullGroup);
  MPI_Group_free(&ShmGroup);
 }
 //////////////////////////////////////////////////////////////////
 // On node barrier: MPI_Barrier over ShmComm, the shared-memory
 // communicator that SetCommunicator obtains from MPI_Comm_split_type.
 //////////////////////////////////////////////////////////////////
 void SharedMemory::ShmBarrier(void)
 {
  MPI_Barrier  (ShmComm);
 }
 //////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Self test of the shared buffers.
 // Local rank 0 stamps the first three 64 bit words of every buffer with
 // (WorldNode, buffer index, 0x5A5A5A); after a barrier every rank asserts
 // that it reads the same stamp back through its own mappings.
 //////////////////////////////////////////////////////////////////////////////////////////////////////////
 void SharedMemory::SharedMemoryTest(void)
 {
  ShmBarrier();
  const bool writer = ( ShmRank == 0 );
  for(int peer=0; writer && peer<ShmSize; peer++){
    uint64_t * stamp = (uint64_t *) ShmCommBufs[peer];
    stamp[0] = GlobalSharedMemory::WorldNode;
    stamp[1] = peer;
    stamp[2] = 0x5A5A5A;
  }
  // stamps must be complete before anybody reads them
  ShmBarrier();
  for(int r=0;r<ShmSize;r++){
    uint64_t * check = (uint64_t *) ShmCommBufs[r];
    assert(check[0]==GlobalSharedMemory::WorldNode);
    assert(check[1]==r);
    assert(check[2]==0x5A5A5A);
  }
  ShmBarrier();
 }
 // Returns the buffer belonging to communicator rank `rank`, or NULL when
 // ShmRanks records that rank as MPI_UNDEFINED (not in our shared-memory group).
 void *SharedMemory::ShmBuffer(int rank)
 {
  const int peer = ShmRanks[rank];
  if (peer == MPI_UNDEFINED) return NULL;
  return ShmCommBufs[peer];
 }
 // Translates `local_p`, an address inside this rank's own buffer, into the
 // address at the same offset inside the buffer of communicator rank `rank`.
 // Returns NULL when ShmRanks records that rank as MPI_UNDEFINED (not in our
 // shared-memory group).  Translating to oneself is asserted against.
 void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
 {
  int gpeer = ShmRanks[rank];
  assert(gpeer!=ShmRank); // never send to self
  if (gpeer == MPI_UNDEFINED){
    return NULL;
  } else { 
    // same byte offset, measured from the start of the peer's buffer
    uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
    uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
    return (void *) remote;
  }
 }
 // Releases ShmComm, but only while MPI is usable: objects destroyed before
 // MPI_Init or after MPI_Finalize must not make MPI calls.
 // NOTE(review): ShmComm is only assigned in SetCommunicator; an instance that
 // never had SetCommunicator called still reaches MPI_Comm_free with an unset
 // handle -- confirm that every instance is bound before destruction.
 SharedMemory::~SharedMemory()
 {
  int MPI_is_initialised=0;  MPI_Initialized(&MPI_is_initialised);
  int MPI_is_finalised=0;    MPI_Finalized(&MPI_is_finalised);
  if ( MPI_is_initialised && !MPI_is_finalised ) { 
    MPI_Comm_free(&ShmComm);
  }
 };
 }
@@ -0,0 +1,128 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/SharedMemory.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/GridCore.h>
 namespace Grid { 
 /*
  * Construct from an MPI communicator -- implementation without MPI.
  * `comm` is ignored: the world is a single rank on a single node, with one
  * shared buffer slot.  May only be called once.
  */
 void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
 {
  assert(_ShmSetup==0);
  // the whole world: one rank
  WorldComm = 0;
  WorldSize = 1;
  WorldRank = 0;
  // its only shared-memory group: the same rank
  WorldShmComm = 0 ;
  WorldShmSize = 1 ;
  WorldShmRank = 0 ;
  // one node, and we are it
  WorldNodes   = 1 ;
  WorldNode    = 0 ;
  WorldShmRanks.resize(WorldSize);
  WorldShmRanks[0] = 0;
  WorldShmCommBufs.resize(1);
  _ShmSetup=1;
 }
 // In this implementation there is nothing to reorder: the communicator handed
 // back is WorldComm itself, and `processors` is not consulted.
 void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
 {
  optimal_comm = WorldComm;
 }
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Hugetlbfs mapping intended, use anonymous mmap
 ////////////////////////////////////////////////////////////////////////////////////////////
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
  void * ShmCommBuf ; 
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
  int mmap_flag =0;
 #ifdef MAP_ANONYMOUS
  mmap_flag = mmap_flag| MAP_SHARED | MAP_ANONYMOUS;
 #endif
 #ifdef MAP_ANON
  mmap_flag = mmap_flag| MAP_SHARED | MAP_ANON;
 #endif
 #ifdef MAP_HUGETLB
  if ( flags ) mmap_flag |= MAP_HUGETLB;
 #endif
  ShmCommBuf =(void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag, -1, 0); 
  if (ShmCommBuf == (void *)MAP_FAILED) {
    perror("mmap failed ");
    exit(EXIT_FAILURE);  
  }
 #ifdef MADV_HUGEPAGE
  if (!Hugepages ) madvise(ShmCommBuf,bytes,MADV_HUGEPAGE);
 #endif
  bzero(ShmCommBuf,bytes);
  WorldShmCommBufs[0] = ShmCommBuf;
  _ShmAllocBytes=bytes;
  _ShmAlloc=1;
 };
  ////////////////////////////////////////////////////////
  // Global shared functionality finished
  // Now move to per communicator functionality
  ////////////////////////////////////////////////////////
 // Bind this instance to the single-rank world: `comm` is not used.
 // The instance becomes local rank 0 of 1, adopts the one global buffer
 // and its size, and resets the local buffer heap.
 void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
 {
  assert(GlobalSharedMemory::ShmAlloc()==1);
  ShmRank = 0;
  ShmSize = 1;
  ShmRanks.resize(1);
  ShmRanks[0] = 0;
  //////////////////////////////////////////////////////////////////////
  // Only one buffer exists; it is ours
  //////////////////////////////////////////////////////////////////////
  ShmCommBufs.resize(1);
  ShmCommBufs[0] = GlobalSharedMemory::WorldShmCommBufs[0];
  heap_size      = GlobalSharedMemory::ShmAllocBytes();
  ShmBufferFreeAll();
 }
 //////////////////////////////////////////////////////////////////
 // On node barrier: a no-op in this implementation, which has a
 // single rank (ShmSize is set to 1 in SetCommunicator).
 //////////////////////////////////////////////////////////////////
 void SharedMemory::ShmBarrier(void){ return ; }
 //////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Test the shared memory is working: nothing is checked in this implementation,
 // the function returns immediately.
 //////////////////////////////////////////////////////////////////////////////////////////////////////////
 void SharedMemory::SharedMemoryTest(void) { return; }
 // Always returns NULL in this implementation, whatever `rank` is.
 void *SharedMemory::ShmBuffer(int rank)
 {
  return NULL;
 }
 // Always returns NULL in this implementation; neither argument is used.
 void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
 {
  return NULL;
 }
 // Nothing to release in this implementation: the destructor body is empty.
 SharedMemory::~SharedMemory()
 {};
 }
@@ -42,7 +42,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/cshift/Cshift_mpi.h>
 #endif 
-#ifdef GRID_COMMS_MPI3L
+#ifdef GRID_COMMS_MPIT
 #include <Grid/cshift/Cshift_mpi.h>
 #endif 
@@ -30,21 +30,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 namespace Grid {
 // Identity "compressor": Point() accepts and ignores its argument, and the
 // call operator hands back a copy of the object it is given.
 template<class vobj>
 class SimpleCompressor {
 public:
  void Point(int /*unused*/) {}
  vobj operator() (const vobj &arg) { return arg; }
 };
 ///////////////////////////////////////////////////////////////////
-// Gather for when there is no need to SIMD split with compression
+// Gather for when there is no need to SIMD split 
 ///////////////////////////////////////////////////////////////////
-template<class vobj,class cobj,class compressor> void 
+template<class vobj> void 
-Gather_plane_simple (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimension,int plane,int cbmask,compressor &compress, int off=0)
+Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
 {
  int rd = rhs._grid->_rdimensions[dimension];
@@ -55,40 +45,41 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimen
  int so=plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
  int ent = 0;
  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
  int stride=rhs._grid->_slice_stride[dimension];
  if ( cbmask == 0x3 ) { 
-    parallel_for_nest2(int n=0;n<e1;n++){
+    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int o  = n*stride;
 	int bo = n*e2;
-	buffer[off+bo+b]=compress(rhs._odata[so+o+b]);
+	table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
      }
    }
  } else { 
     int bo=0;
     std::vector<std::pair<int,int> > table;
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
 	 int o  = n*stride;
 	 int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
 	 if ( ocb &cbmask ) {
-	   table.push_back(std::pair<int,int> (bo++,o+b));
+	   table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
 	 }
       }
     }
-     parallel_for(int i=0;i<table.size();i++){
+  }
-       buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
+  parallel_for(int i=0;i<ent;i++){
-     }
+    buffer[table[i].first]=rhs._odata[table[i].second];
  }
 }
 ///////////////////////////////////////////////////////////////////
-// Gather for when there *is* need to SIMD split with compression
+// Gather for when there *is* need to SIMD split 
 ///////////////////////////////////////////////////////////////////
-template<class cobj,class vobj,class compressor> void 
+template<class vobj> void 
-Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_object *> pointers,int dimension,int plane,int cbmask,compressor &compress)
+Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask)
 {
  int rd = rhs._grid->_rdimensions[dimension];
@@ -109,8 +100,8 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
 	int o      =   n*n1;
 	int offset = b+n*e2;
-	cobj temp =compress(rhs._odata[so+o+b]);
+	vobj temp =rhs._odata[so+o+b];
-	extract<cobj>(temp,pointers,offset);
+	extract<vobj>(temp,pointers,offset);
      }
    }
@@ -127,32 +118,14 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
 	int offset = b+n*e2;
 	if ( ocb & cbmask ) {
-	  cobj temp =compress(rhs._odata[so+o+b]);
+	  vobj temp =rhs._odata[so+o+b];
-	  extract<cobj>(temp,pointers,offset);
+	  extract<vobj>(temp,pointers,offset);
 	}
      }
    }
  }
 }
 //////////////////////////////////////////////////////
 // Gather for when there is no need to SIMD split
 //////////////////////////////////////////////////////
 template<class vobj> void Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
 {
  SimpleCompressor<vobj> dontcompress;
  Gather_plane_simple (rhs,buffer,dimension,plane,cbmask,dontcompress);
 }
 //////////////////////////////////////////////////////
 // Gather for when there *is* need to SIMD split
 //////////////////////////////////////////////////////
 template<class vobj> void Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask)
 {
  SimpleCompressor<vobj> dontcompress;
  Gather_plane_extract<vobj,vobj,decltype(dontcompress)>(rhs,pointers,dimension,plane,cbmask,dontcompress);
 }
 //////////////////////////////////////////////////////
 // Scatter for when there is no need to SIMD split
 //////////////////////////////////////////////////////
@@ -169,38 +142,42 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
  int stride=rhs._grid->_slice_stride[dimension];
-  
+
  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
  int ent    =0;
  if ( cbmask ==0x3 ) {
-    parallel_for_nest2(int n=0;n<e1;n++){
+
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int o   =n*rhs._grid->_slice_stride[dimension];
 	int bo  =n*rhs._grid->_slice_block[dimension];
-	rhs._odata[so+o+b]=buffer[bo+b];
+	table[ent++] = std::pair<int,int>(so+o+b,bo+b);
      }
    }
  } else { 
    std::vector<std::pair<int,int> > table;
    int bo=0;
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int o   =n*rhs._grid->_slice_stride[dimension];
 	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 	if ( ocb & cbmask ) {
-	  table.push_back(std::pair<int,int> (so+o+b,bo++));
+	  table[ent++]=std::pair<int,int> (so+o+b,bo++);
 	}
      }
    }
-    parallel_for(int i=0;i<table.size();i++){
+  }
-       //       std::cout << "Rcv"<< table[i].first << " " << table[i].second << " " <<buffer[table[i].second]<<std::endl;
+
-       rhs._odata[table[i].first]=buffer[table[i].second];
+  parallel_for(int i=0;i<ent;i++){
-     }
+    rhs._odata[table[i].first]=buffer[table[i].second];
  }
 }
 //////////////////////////////////////////////////////
 // Scatter for when there *is* need to SIMD split
 //////////////////////////////////////////////////////
- template<class vobj,class cobj> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<cobj *> pointers,int dimension,int plane,int cbmask)
+template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask)
 {
  int rd = rhs._grid->_rdimensions[dimension];
@@ -257,29 +234,32 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
  int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
  int e2=rhs._grid->_slice_block[dimension];
  int stride = rhs._grid->_slice_stride[dimension];
  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
  int ent=0;
  if(cbmask == 0x3 ){
-    parallel_for_nest2(int n=0;n<e1;n++){
+    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int o =n*stride+b;
-  	//lhs._odata[lo+o]=rhs._odata[ro+o];
+	table[ent++] = std::pair<int,int>(lo+o,ro+o);
 	vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
      }
    }
  } else { 
-    parallel_for_nest2(int n=0;n<e1;n++){
+    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int o =n*stride+b;
        int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
        if ( ocb&cbmask ) {
-  	//lhs._odata[lo+o]=rhs._odata[ro+o];
+	  table[ent++] = std::pair<int,int>(lo+o,ro+o);
 	  vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
 	}
      }
    }
  }
-  
+
  parallel_for(int i=0;i<ent;i++){
    lhs._odata[table[i].first]=rhs._odata[table[i].second];
  }
 }
 template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
@@ -298,16 +278,28 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
  int e2=rhs._grid->_slice_block [dimension];
  int stride = rhs._grid->_slice_stride[dimension];
-  parallel_for_nest2(int n=0;n<e1;n++){
+  static std::vector<std::pair<int,int> > table;  table.resize(e1*e2);
-  for(int b=0;b<e2;b++){
+  int ent=0;
  double t_tab,t_perm;
  if ( cbmask == 0x3 ) {
    for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){
      int o  =n*stride;
      table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
    }}
  } else {
    for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){
      int o  =n*stride;
      int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
-      if ( ocb&cbmask ) {
+      if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
-	permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
+    }}
-      }
+  }
-  }}
+  parallel_for(int i=0;i<ent;i++){
    permute(lhs._odata[table[i].first],rhs._odata[table[i].second],permute_type);
  }
 }
 //////////////////////////////////////////////////////
@@ -320,6 +312,8 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r
  sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
  sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);
  double t_local;
  if ( sshift[0] == sshift[1] ) {
    Cshift_local(ret,rhs,dimension,shift,0x3);
  } else {
@@ -328,7 +322,7 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r
  }
 }
-template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
+template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
 {
  GridBase *grid = rhs._grid;
  int fd = grid->_fdimensions[dimension];
@@ -354,11 +348,7 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
    int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
    int sx     = (x+sshift)%rd;
    // FIXME : This must change where we have a 
    // Rotate slice.
    // Document how this works ; why didn't I do this when I first wrote it...
    // wrap is whether sshift > rd.
    //  num is sshift mod rd.
    // 
@@ -394,10 +384,8 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
    if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist);
    else                 Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
  }
  return ret;
 }
 }
 #endif
@@ -54,13 +54,13 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
  if ( !comm_dim ) {
-    //    std::cout << "Cshift_local" <<std::endl;
+    //std::cout << "CSHIFT: Cshift_local" <<std::endl;
    Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
  } else if ( splice_dim ) {
-    //    std::cout << "Cshift_comms_simd" <<std::endl;
+    //std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
    Cshift_comms_simd(ret,rhs,dimension,shift);
  } else {
-    //    std::cout << "Cshift_comms" <<std::endl;
+    //std::cout << "CSHIFT: Cshift_comms" <<std::endl;
    Cshift_comms(ret,rhs,dimension,shift);
  }
  return ret;
@@ -91,9 +91,12 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
  sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
  sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);
  //std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
  if ( sshift[0] == sshift[1] ) {
    //std::cout << "Single pass Cshift_comms" <<std::endl;
    Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
  } else {
    //std::cout << "Two pass Cshift_comms" <<std::endl;
    Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
    Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
  }
@@ -154,13 +157,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
 			   recv_from_rank,
 			   bytes);
      grid->Barrier();
-      /*
+
      for(int i=0;i<send_buf.size();i++){
 	assert(recv_buf.size()==buffer_size);
 	assert(send_buf.size()==buffer_size);
 	std::cout << "SendRecv_Cshift_comms ["<<i<<" "<< dimension<<"] snd "<<send_buf[i]<<" rcv " << recv_buf[i] << "  0x" << cbmask<<std::endl;
      }
      */
      Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
    }
  }
@@ -181,6 +178,10 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  int simd_layout     = grid->_simd_layout[dimension];
  int comm_dim        = grid->_processors[dimension] >1 ;
  //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
  //    << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout 
  //    << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
  assert(comm_dim==1);
  assert(simd_layout==2);
  assert(shift>=0);
@@ -246,13 +247,6 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 			     (void *)&recv_buf_extract[i][0],
 			     recv_from_rank,
 			     bytes);
 	/*
 	for(int w=0;w<recv_buf_extract[i].size();w++){
 	  assert(recv_buf_extract[i].size()==buffer_size);
 	  assert(send_buf_extract[i].size()==buffer_size);
 	  std::cout << "SendRecv_Cshift_comms ["<<w<<" "<< dimension<<"] recv "<<recv_buf_extract[i][w]<<" send " << send_buf_extract[nbr_lane][w]  << cbmask<<std::endl;
 	}
 	*/	
 	grid->Barrier();
 	rpointers[i] = &recv_buf_extract[i][0];
      } else { 
@@ -244,19 +244,11 @@ namespace Grid {
  template<class sobj,class vobj> strong_inline
  RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
-    ret.checkerboard = x.checkerboard;
+    return axpy_norm_fast(ret,a,x,y);
    conformable(ret,x);
    conformable(x,y);
    axpy(ret,a,x,y);
    return norm2(ret);
  }
  template<class sobj,class vobj> strong_inline
  RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
-    ret.checkerboard = x.checkerboard;
+    return axpby_norm_fast(ret,a,b,x,y);
    conformable(ret,x);
    conformable(x,y);
    axpby(ret,a,b,x,y);
    return norm2(ret); // FIXME implement parallel norm in ss loop
  }
 }
@@ -235,64 +235,98 @@ public:
    }
  };
-    //////////////////////////////////////////////////////////////////
+  //////////////////////////////////////////////////////////////////
-    // Constructor requires "grid" passed.
+  // Constructor requires "grid" passed.
-    // what about a default grid?
+  // what about a default grid?
-    //////////////////////////////////////////////////////////////////
+  //////////////////////////////////////////////////////////////////
-    Lattice(GridBase *grid) : _odata(grid->oSites()) {
+  Lattice(GridBase *grid) : _odata(grid->oSites()) {
-        _grid = grid;
+    _grid = grid;
    //        _odata.reserve(_grid->oSites());
    //        _odata.resize(_grid->oSites());
    //      std::cout << "Constructing lattice object with Grid pointer "<<_grid<<std::endl;
-        assert((((uint64_t)&_odata[0])&0xF) ==0);
+    assert((((uint64_t)&_odata[0])&0xF) ==0);
-        checkerboard=0;
+    checkerboard=0;
-    }
+  }
  Lattice(const Lattice& r){ // copy constructor
    _grid = r._grid;
    checkerboard = r.checkerboard;
    _odata.resize(_grid->oSites());// essential
    parallel_for(int ss=0;ss<_grid->oSites();ss++){
      _odata[ss]=r._odata[ss];
    }  	
  }
-    Lattice(const Lattice& r){ // copy constructor
+  Lattice(Lattice&& r){ // move constructor
-    	_grid = r._grid;
+    _grid = r._grid;
-    	checkerboard = r.checkerboard;
+    checkerboard = r.checkerboard;
-    	_odata.resize(_grid->oSites());// essential
+    _odata=std::move(r._odata);
-	parallel_for(int ss=0;ss<_grid->oSites();ss++){
+  }
-            _odata[ss]=r._odata[ss];
+  
-        }  	
+  inline Lattice<vobj> & operator = (Lattice<vobj> && r)
-    }
+  {
    _grid        = r._grid;
    checkerboard = r.checkerboard;
    _odata       =std::move(r._odata);
    return *this;
  }
-
+  inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
-
+    _grid        = r._grid;
-    virtual ~Lattice(void) = default;
+    checkerboard = r.checkerboard;
    _odata.resize(_grid->oSites());// essential
-    template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
-      parallel_for(int ss=0;ss<_grid->oSites();ss++){
+      _odata[ss]=r._odata[ss];
-            this->_odata[ss]=r;
+    }  	
-        }
+    return *this;
-        return *this;
+  }
    }
    template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
      this->checkerboard = r.checkerboard;
      conformable(*this,r);
      parallel_for(int ss=0;ss<_grid->oSites();ss++){
            this->_odata[ss]=r._odata[ss];
        }
        return *this;
    }
-    // *=,+=,-= operators inherit behvour from correspond */+/- operation
+  template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
-    template<class T> strong_inline Lattice<vobj> &operator *=(const T &r) {
+    this->checkerboard = r.checkerboard;
-        *this = (*this)*r;
+    conformable(*this,r);
-        return *this;
+    
    parallel_for(int ss=0;ss<_grid->oSites();ss++){
      this->_odata[ss]=r._odata[ss];
    }
    return *this;
  }
-    template<class T> strong_inline Lattice<vobj> &operator -=(const T &r) {
+  virtual ~Lattice(void) = default;
-        *this = (*this)-r;
+    
-        return *this;
+  void reset(GridBase* grid) {
    if (_grid != grid) {
      _grid = grid;
      _odata.resize(grid->oSites());
      checkerboard = 0;
    }
-    template<class T> strong_inline Lattice<vobj> &operator +=(const T &r) {
+  }
-        *this = (*this)+r;
+  
        return *this;
    }
 }; // class Lattice
  template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
    parallel_for(int ss=0;ss<_grid->oSites();ss++){
      this->_odata[ss]=r;
    }
    return *this;
  }
   // *=,+=,-= operators inherit behaviour from the corresponding */+/- operation
  template<class T> strong_inline Lattice<vobj> &operator *=(const T &r) {
    *this = (*this)*r;
    return *this;
  }
  template<class T> strong_inline Lattice<vobj> &operator -=(const T &r) {
    *this = (*this)-r;
    return *this;
  }
  template<class T> strong_inline Lattice<vobj> &operator +=(const T &r) {
    *this = (*this)+r;
    return *this;
  }
 }; // class Lattice
  template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
    std::vector<int> gcoor;
    typedef typename vobj::scalar_object sobj;
@@ -310,7 +344,7 @@ public:
    }
    return stream;
  }
-
+  
 }
@@ -179,7 +179,7 @@ namespace Grid {
      return ret;
    }
-#define DECLARE_RELATIONAL(op,functor) \
+#define DECLARE_RELATIONAL_EQ(op,functor) \
  template<class vsimd,IfSimd<vsimd> = 0>\
    inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
    {\
@@ -198,11 +198,6 @@ namespace Grid {
      typedef typename vsimd::scalar_type scalar;\
      return Comparison(functor<scalar,scalar>(),lhs,rhs);\
    }\
  template<class vsimd>\
    inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
    {									\
      return lhs._internal op rhs._internal;				\
    }									\
  template<class vsimd>\
    inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
    {									\
@@ -212,14 +207,21 @@ namespace Grid {
    inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
    {									\
      return lhs op rhs._internal;					\
-    }									
+    }									\
 #define DECLARE_RELATIONAL(op,functor) \
  DECLARE_RELATIONAL_EQ(op,functor)    \
  template<class vsimd>\
    inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
    {									\
      return lhs._internal op rhs._internal;				\
    }									
 DECLARE_RELATIONAL(<,slt);
 DECLARE_RELATIONAL(<=,sle);
 DECLARE_RELATIONAL(>,sgt);
 DECLARE_RELATIONAL(>=,sge);
-DECLARE_RELATIONAL(==,seq);
+DECLARE_RELATIONAL_EQ(==,seq);
 DECLARE_RELATIONAL(!=,sne);
 #undef DECLARE_RELATIONAL
@@ -52,23 +52,5 @@ namespace Grid {
      }
    };
    // LatticeCoordinate();
     // FIXME for debug; deprecate this; made obsolete by 
    template<class vobj> void lex_sites(Lattice<vobj> &l){
      Real *v_ptr = (Real *)&l._odata[0];
      size_t o_len = l._grid->oSites();
      size_t v_len = sizeof(vobj)/sizeof(vRealF);
      size_t vec_len = vRealF::Nsimd();
      for(int i=0;i<o_len;i++){
 	for(int j=0;j<v_len;j++){
          for(int vv=0;vv<vec_len;vv+=2){
 	    v_ptr[i*v_len*vec_len+j*vec_len+vv  ]= i+vv*500;
 	    v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
 	  }
 	}}
    }
 }
 #endif
@@ -1,81 +1,126 @@
-    /*************************************************************************************
+/*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_reduction.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_LATTICE_REDUCTION_H
 #define GRID_LATTICE_REDUCTION_H
-#include <Grid/Eigen/Dense>
+#include <Grid/Grid_Eigen_Dense.h>
 namespace Grid {
 #ifdef GRID_WARN_SUBOPTIMAL
 #warning "Optimisation alert all these reduction loops are NOT threaded "
 #endif     
-    ////////////////////////////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
-    // Deterministic Reduction operations
+  // Deterministic Reduction operations
-    ////////////////////////////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
 template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
-  ComplexD nrm = innerProduct(arg,arg);
+  auto nrm = innerProduct(arg,arg);
  return std::real(nrm); 
 }
 // Double inner product
 template<class vobj>
-inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) 
+inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
 {
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_typeD vector_type;
  scalar_type  nrm;
  GridBase *grid = left._grid;
-  
+  const int pad = 8;
-  std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize());
+
-  
+  ComplexD  inner;
  Vector<ComplexD> sumarray(grid->SumArraySize()*pad);
  parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
    int nwork, mywork, myoff;
    GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
-    decltype(innerProductD(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation
+    decltype(innerProductD(left._odata[0],right._odata[0])) vinner=zero; // private to thread; sub summation
    for(int ss=myoff;ss<mywork+myoff; ss++){
-      vnrm = vnrm + innerProductD(left._odata[ss],right._odata[ss]);
+      vinner = vinner + innerProductD(left._odata[ss],right._odata[ss]);
    }
-    sumarray[thr]=TensorRemove(vnrm) ;
+    // All threads sum across SIMD; reduce serial work at end
    // one write per cacheline with streaming store
    ComplexD tmp = Reduce(TensorRemove(vinner)) ;
    vstream(sumarray[thr*pad],tmp);
  }
-  vector_type vvnrm; vvnrm=zero;  // sum across threads
+  inner=0.0;
  for(int i=0;i<grid->SumArraySize();i++){
-    vvnrm = vvnrm+sumarray[i];
+    inner = inner+sumarray[i*pad];
  } 
-  nrm = Reduce(vvnrm);// sum across simd
+  right._grid->GlobalSum(inner);
-  right._grid->GlobalSum(nrm);
+  return inner;
  return nrm;
 }
 /////////////////////////
 // Fast axpby_norm
 // z = a x + b y
 // return norm z
 /////////////////////////
 // z = a*x + y, returning the norm computed by axpby_norm_fast.
 // Implemented as the b == 1 special case of axpby_norm_fast.
 template<class sobj,class vobj> strong_inline RealD 
 axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y) 
 {
   return axpby_norm_fast(z,a,sobj(1.0),x,y);
 }
 // Fused update and norm: writes z = a*x + b*y site by site and, in the same
 // sweep, accumulates innerProductD(z,z). Returns the real part of that sum
 // after GlobalSum has been applied on z's grid.
 template<class sobj,class vobj> strong_inline RealD 
 axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y) 
 {
   typedef typename vobj::scalar_type scalar_type;
   typedef typename vobj::vector_typeD vector_type;
   // Each thread's partial sum is stored `pad` entries apart
   // (presumably to keep the slots on separate cache lines -- confirm).
   const int pad = 8;
   z.checkerboard = x.checkerboard;
   conformable(z,x);
   conformable(x,y);
   GridBase *grid = x._grid;
   Vector<RealD> partial(grid->SumArraySize()*pad);
   parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
     int mywork, myoff;
     GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
     // thread-private running sum over this thread's range of sites
     decltype(innerProductD(z._odata[0],z._odata[0])) local_norm=zero; 
     const int stop = myoff+mywork;
     for(int site=myoff;site<stop; site++){
       vobj value = a*x._odata[site]+b*y._odata[site];
       local_norm = local_norm + innerProductD(value,value);
       vstream(z._odata[site],value);
     }
     vstream(partial[thr*pad],real(Reduce(TensorRemove(local_norm)))) ;
   }
   // serial reduction over the per-thread slots
   RealD total = 0.0;
   for(int slot=0;slot<grid->SumArraySize();slot++){
     total += partial[slot*pad];
   } 
   z._grid->GlobalSum(total);
   return total; 
 }
 template<class Op,class T1>
 inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
@@ -229,6 +274,115 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
  }
 }
 // Slice-wise inner product of lhs and rhs along `orthogdim`, done in two
 // stages: localSliceInnerProductVector fills the per-node slice sums, then
 // globalSliceInnerProductVector combines them into `result`.
 template<class vobj>
 static void mySliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
 {
   std::vector<typename vobj::scalar_type> local_slice_sums; // filled by the local stage
   localSliceInnerProductVector(result, lhs, rhs, local_slice_sums, orthogdim);
   globalSliceInnerProductVector(result, lhs, local_slice_sums, orthogdim);
 }
 // Node-local stage of a slice-wise inner product along `orthogdim`.
 //
 //  result    : resized to the full (global) extent of `orthogdim`; not filled here.
 //  lhs, rhs  : lattices whose grids must be conformable.
 //  lsSum     : resized to the local extent; on return lsSum[t] holds the sum of
 //              innerProduct(lhs,rhs) over local slice t. resize() only
 //              zero-fills newly created entries, so callers are expected to
 //              pass an empty vector.
 //  orthogdim : dimension index, asserted to lie in [0, Nd).
 template <class vobj>
 static void localSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, const Lattice<vobj> &rhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
 {
   typedef typename vobj::vector_type   vector_type;
   typedef typename vobj::scalar_type   scalar_type;
   GridBase  *grid = lhs._grid;
   assert(grid!=NULL);
   conformable(grid,rhs._grid);
   const int    Nd = grid->_ndimension;
   const int Nsimd = grid->Nsimd();
   assert(orthogdim >= 0);
   assert(orthogdim < Nd);
   int fd=grid->_fdimensions[orthogdim];
   int ld=grid->_ldimensions[orthogdim];
   int rd=grid->_rdimensions[orthogdim];
   std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd); // will locally sum vectors first
   lsSum.resize(ld,scalar_type(0.0));                    // sum across these down to scalars
   std::vector<iScalar<scalar_type>> extracted(Nsimd);   // splitting the SIMD  
   result.resize(fd); // And then global sum to return the same vector to every node for IO to file
   for(int r=0;r<rd;r++){
     lvSum[r]=zero;
   }
   int e1=    grid->_slice_nblock[orthogdim];
   int e2=    grid->_slice_block [orthogdim];
   int stride=grid->_slice_stride[orthogdim];
   // One iteration per reduced-dimension plane; iteration r only writes lvSum[r].
   parallel_for(int r=0;r<rd;r++)
   {
     int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
         int ss = so + n * stride + b;
         // The temporary must live inside the loop body: a single variable
         // declared outside parallel_for would be shared and written by
         // concurrently running iterations (a data race).
         vector_type vv;
         vv = TensorRemove(innerProduct(lhs._odata[ss], rhs._odata[ss]));
         lvSum[r] = lvSum[r] + vv;
       }
     }
   }
   // Sum across simd lanes in the plane, breaking out orthog dir.
   std::vector<int> icoor(Nd);
   for(int rt=0;rt<rd;rt++){
     iScalar<vector_type> temp; 
     temp._internal = lvSum[rt];
     extract(temp,extracted);
     for(int idx=0;idx<Nsimd;idx++){
       grid->iCoorFromIindex(icoor,idx);
       // local slice index: reduced index plus the lane's offset along orthogdim
       int ldx =rt+icoor[orthogdim]*rd;
       lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal;
     }
   }
 }
 // Global stage of the slice-wise inner product. Builds a vector of
 // result.size() entries that holds this node's local sums (lsSum) at the
 // positions whose processor plane equals this node's coordinate along
 // `orthogdim` and zero elsewhere, applies GlobalSumVector to it, and assigns
 // the outcome to `result`.
 template <class vobj>
 static void globalSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
 {
   typedef typename vobj::scalar_type scalar_type;
   GridBase *grid = lhs._grid;
   const int global_len = result.size();
   const int local_len  = lsSum.size();
   // contributions to be summed over nodes; zero where this node owns nothing
   std::vector<scalar_type> contrib(global_len, scalar_type(0.0));
   for(int t=0;t<global_len;t++){
     const int plane = t/local_len; // processor plane holding global slice t
     if ( plane == grid->_processor_coor[orthogdim] ) {
       contrib[t]=lsSum[t%local_len];
     }
   }
   grid->GlobalSumVector(&contrib[0], global_len);
   result = contrib;
 }
 template<class vobj>
 static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) 
 {
@@ -336,6 +490,8 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::tensor_reduced tensor_reduced;
  scalar_type zscale(scale);
  GridBase *grid  = X._grid;
  int Nsimd  =grid->Nsimd();
@@ -361,7 +517,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
      grid->iCoorFromIindex(icoor,l);
      int ldx =r+icoor[orthogdim]*rd;
      scalar_type *as =(scalar_type *)&av;
-      as[l] = scalar_type(a[ldx])*scale;
+      as[l] = scalar_type(a[ldx])*zscale;
    }
    tensor_reduced at; at=av;
@@ -375,74 +531,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
  }
 };
 /*
 template<class vobj>
 static void sliceMaddVectorSlow (Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
 			     int Orthog,RealD scale=1.0) 
 {    
  // FIXME: Implementation is slow
  // Best base the linear combination by constructing a 
  // set of vectors of size grid->_rdimensions[Orthog].
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nblock = X._grid->GlobalDimensions()[Orthog];
  GridBase *FullGrid  = X._grid;
  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
  Lattice<vobj> Xslice(SliceGrid);
  Lattice<vobj> Rslice(SliceGrid);
  // If we based this on Cshift it would work for spread out
  // but it would be even slower
  for(int i=0;i<Nblock;i++){
    ExtractSlice(Rslice,Y,i,Orthog);
    ExtractSlice(Xslice,X,i,Orthog);
    Rslice = Rslice + Xslice*(scale*a[i]);
    InsertSlice(Rslice,R,i,Orthog);
  }
 };
 template<class vobj>
 static void sliceInnerProductVectorSlow( std::vector<ComplexD> & vec, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
  {
    // FIXME: Implementation is slow
    // Look at localInnerProduct implementation,
    // and do inside a site loop with block strided iterators
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::scalar_type scalar_type;
    typedef typename vobj::vector_type vector_type;
    typedef typename vobj::tensor_reduced scalar;
    typedef typename scalar::scalar_object  scomplex;
    int Nblock = lhs._grid->GlobalDimensions()[Orthog];
    vec.resize(Nblock);
    std::vector<scomplex> sip(Nblock);
    Lattice<scalar> IP(lhs._grid); 
    IP=localInnerProduct(lhs,rhs);
    sliceSum(IP,sip,Orthog);
    for(int ss=0;ss<Nblock;ss++){
      vec[ss] = TensorRemove(sip[ss]);
    }
  }
 */
 //////////////////////////////////////////////////////////////////////////////////////////
 // FIXME: Implementation is slow
 // If we based this on Cshift it would work for spread out
 // but it would be even slower
 //
 // Repeated extract slice is inefficient
 //
 // Best base the linear combination by constructing a 
 // set of vectors of size grid->_rdimensions[Orthog].
 //////////////////////////////////////////////////////////////////////////////////////////
 inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
 {
  int NN    = BlockSolverGrid->_ndimension;
@@ -461,7 +550,7 @@ inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Or
  }
  return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); 
 }
-
+*/
 template<class vobj>
 static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
@@ -471,60 +560,174 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
  typedef typename vobj::vector_type vector_type;
  int Nblock = X._grid->GlobalDimensions()[Orthog];
-  
+
  GridBase *FullGrid  = X._grid;
-  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
-  
+
-  Lattice<vobj> Xslice(SliceGrid);
+  //  Lattice<vobj> Xslice(SliceGrid);
-  Lattice<vobj> Rslice(SliceGrid);
+  //  Lattice<vobj> Rslice(SliceGrid);
-  
+
-  for(int i=0;i<Nblock;i++){
+  assert( FullGrid->_simd_layout[Orthog]==1);
-    ExtractSlice(Rslice,Y,i,Orthog);
+  int nh =  FullGrid->_ndimension;
-    for(int j=0;j<Nblock;j++){
+  //  int nl = SliceGrid->_ndimension;
-      ExtractSlice(Xslice,X,j,Orthog);
+  int nl = nh-1;
-      Rslice = Rslice + Xslice*(scale*aa(j,i));
+
-    }
+  //FIXME package in a convenient iterator
-    InsertSlice(Rslice,R,i,Orthog);
+  //Should loop over a plane orthogonal to direction "Orthog"
  int stride=FullGrid->_slice_stride[Orthog];
  int block =FullGrid->_slice_block [Orthog];
  int nblock=FullGrid->_slice_nblock[Orthog];
  int ostride=FullGrid->_ostride[Orthog];
 #pragma omp parallel 
  {
    std::vector<vobj> s_x(Nblock);
 #pragma omp for collapse(2)
    for(int n=0;n<nblock;n++){
    for(int b=0;b<block;b++){
      int o  = n*stride + b;
      for(int i=0;i<Nblock;i++){
 	s_x[i] = X[o+i*ostride];
      }
      vobj dot;
      for(int i=0;i<Nblock;i++){
 	dot = Y[o+i*ostride];
 	for(int j=0;j<Nblock;j++){
 	  dot = dot + s_x[j]*(scale*aa(j,i));
 	}
 	R[o+i*ostride]=dot;
      }
    }}
  }
 };
 template<class vobj>
 static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0) 
 {
   // For every site of the plane orthogonal to direction "Orthog", writes
   //     R[slice i] = sum_j X[slice j] * (scale * aa(j,i))
   // where i and j run over the global extent of the lattice in direction Orthog.
   GridBase *grid = X._grid;
   const int nslices = grid->GlobalDimensions()[Orthog];

   // Slices are addressed below through the outer-site index alone, so the
   // Orthog direction must not be spread across SIMD lanes.
   assert( grid->_simd_layout[Orthog]==1);

   const int plane_stride = grid->_slice_stride[Orthog];
   const int plane_block  = grid->_slice_block [Orthog];
   const int plane_count  = grid->_slice_nblock[Orthog];
   const int slice_step   = grid->_ostride[Orthog];

 #pragma omp parallel
   {
     // Per-thread scratch: the X values of all slices at one plane site.
     std::vector<vobj> column(nslices);

 #pragma omp for collapse(2)
     for(int p=0;p<plane_count;p++){
     for(int q=0;q<plane_block;q++){

       const int base = p*plane_stride + q;

       // Take a copy of X first; R is only written after the copy is complete.
       for(int k=0;k<nslices;k++){
         column[k] = X[base+k*slice_step];
       }

       vobj acc;
       for(int col=0;col<nslices;col++){
         // The first term initialises the accumulator, the rest are added in order.
         acc = column[0]*(scale*aa(0,col));
         for(int row=1;row<nslices;row++){
           acc = acc + column[row]*(scale*aa(row,col));
         }
         R[base+col*slice_step]=acc;
       }
     }}
   }
 }
 template<class vobj>
 static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
 {
  // FIXME: Implementation is slow
  // Not sure of best solution.. think about it
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  GridBase *FullGrid  = lhs._grid;
-  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
  int Nblock = FullGrid->GlobalDimensions()[Orthog];
-  Lattice<vobj> Lslice(SliceGrid);
+  //  Lattice<vobj> Lslice(SliceGrid);
-  Lattice<vobj> Rslice(SliceGrid);
+  //  Lattice<vobj> Rslice(SliceGrid);
  mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-  
+
-  for(int i=0;i<Nblock;i++){
+  assert( FullGrid->_simd_layout[Orthog]==1);
-    ExtractSlice(Lslice,lhs,i,Orthog);
+  int nh =  FullGrid->_ndimension;
-    for(int j=0;j<Nblock;j++){
+  //  int nl = SliceGrid->_ndimension;
-      ExtractSlice(Rslice,rhs,j,Orthog);
+  int nl = nh-1;
-      mat(i,j) = innerProduct(Lslice,Rslice);
+
-    }
+  //FIXME package in a convenient iterator
  //Should loop over a plane orthogonal to direction "Orthog"
  int stride=FullGrid->_slice_stride[Orthog];
  int block =FullGrid->_slice_block [Orthog];
  int nblock=FullGrid->_slice_nblock[Orthog];
  int ostride=FullGrid->_ostride[Orthog];
  typedef typename vobj::vector_typeD vector_typeD;
 #pragma omp parallel 
  {
    std::vector<vobj> Left(Nblock);
    std::vector<vobj> Right(Nblock);
    Eigen::MatrixXcd  mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
 #pragma omp for collapse(2)
    for(int n=0;n<nblock;n++){
    for(int b=0;b<block;b++){
      int o  = n*stride + b;
      for(int i=0;i<Nblock;i++){
 	Left [i] = lhs[o+i*ostride];
 	Right[i] = rhs[o+i*ostride];
      }
      for(int i=0;i<Nblock;i++){
      for(int j=0;j<Nblock;j++){
 	auto tmp = innerProduct(Left[i],Right[j]);
 	auto rtmp = TensorRemove(tmp);
 	mat_thread(i,j) += Reduce(rtmp);
      }}
    }}
 #pragma omp critical
    {
      mat += mat_thread;
    }  
  }
-#undef FORCE_DIAG
+
 #ifdef FORCE_DIAG
  for(int i=0;i<Nblock;i++){
-    for(int j=0;j<Nblock;j++){
+  for(int j=0;j<Nblock;j++){
-      if ( i != j ) mat(i,j)=0.0;
+    ComplexD sum = mat(i,j);
-    }
+    FullGrid->GlobalSum(sum);
-  }
+    mat(i,j)=sum;
-#endif
+  }}
  return;
 }
 } /*END NAMESPACE GRID*/
 #endif
@@ -6,8 +6,8 @@
    Copyright (C) 2015
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -75,6 +75,30 @@ namespace Grid {
    return multiplicity;
  }
 // merge of April 11 2017
  // this function is necessary for the LS vectorised field
  inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
  {
    // Returns how many local sites of "fine" correspond to each local site of
    // "coarse", after asserting that the two grids are laid out compatibly.
    const int coarse_nd = coarse->_ndimension;
    const int fine_nd   = fine->_ndimension;

    // The fine grid may carry extra leading dimensions, never fewer.
    const int extra_nd = fine_nd - coarse_nd;
    assert(extra_nd >= 0);

    // The extra leading dimensions must not be divided across processors ...
    for(int d=0;d<extra_nd;d++){
      assert(fine->_processors[d]==1);
    }
    // ... and the remaining dimensions must use the same processor layout.
    for(int d=extra_nd;d<fine_nd;d++){
      assert(coarse->_processors[d-extra_nd] == fine->_processors[d]);
    }

    // Both grids must have the same number of SIMD lanes.
    assert(fine->Nsimd() == coarse->Nsimd());

    // The local volumes must divide without remainder.
    assert( (fine->lSites() / coarse->lSites() ) * coarse->lSites() == fine->lSites() );

    return fine->lSites() / coarse->lSites();
  }
  // real scalars are one component
  template<class scalar,class distribution,class generator> 
  void fillScalar(scalar &s,distribution &dist,generator & gen)
@@ -109,7 +133,7 @@ namespace Grid {
 #ifdef RNG_SITMO
    typedef sitmo::prng_engine 	RngEngine;
    typedef uint64_t    	RngStateType;
-    static const int    	RngStateCount = 4;
+    static const int    	RngStateCount = 13;
 #endif
    std::vector<RngEngine>                             _generators;
@@ -122,7 +146,7 @@ namespace Grid {
    // support for parallel init
    ///////////////////////
 #ifdef RNG_FAST_DISCARD
-    static void Skip(RngEngine &eng)
+    static void Skip(RngEngine &eng,uint64_t site)
    {
      /////////////////////////////////////////////////////////////////////////////////////
      // Skip by 2^40 elements between successive lattice sites
@@ -134,9 +158,21 @@ namespace Grid {
      // tens of seconds per trajectory so this is clean in all reasonable cases,
      // and margin of safety is orders of magnitude.
      // We could hack Sitmo to skip in the higher order words of state if necessary
      //
      // Replace with 2^30 ; avoid problem on large volumes
      //
      /////////////////////////////////////////////////////////////////////////////////////
-      uint64_t skip = 0x1; skip = skip<<40;
+      //      uint64_t skip = site+1;  //   Old init Skipped then drew.  Checked compat with faster init
      const int shift = 30;
      uint64_t skip = site;
      skip = skip<<shift;
      assert((skip >> shift)==site); // check for overflow
      eng.discard(skip);
      //      std::cout << " Engine  " <<site << " state " <<eng<<std::endl;
    } 
 #endif
    static RngEngine Reseed(RngEngine &eng)
@@ -164,7 +200,7 @@ namespace Grid {
      ss<<eng;
      ss.seekg(0,ss.beg);
      for(int i=0;i<RngStateCount;i++){
-	ss>>saved[i];
+        ss>>saved[i];
      }
    }
    void GetState(std::vector<RngStateType> & saved,int gen) {
@@ -174,7 +210,7 @@ namespace Grid {
      assert(saved.size()==RngStateCount);
      std::stringstream ss;
      for(int i=0;i<RngStateCount;i++){
-	ss<< saved[i]<<" ";
+        ss<< saved[i]<<" ";
      }
      ss.seekg(0,ss.beg);
      ss>>eng;
@@ -275,27 +311,42 @@ namespace Grid {
      }
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
    }
-
+    
    // Seed this generator from a caller-supplied list of integers.
    //
    // The seed list is first passed through CartesianCommunicator::BroadcastWorld
    // (first argument 0), then turned into a std::seed_seq and handed to
    // Seed(src,0).
    //
    // seeds.data() is used instead of &seeds[0]: operator[] on an empty vector
    // is undefined behaviour, whereas data() is always valid to call (the byte
    // count is then zero).
    //
    // NOTE(review): the broadcast takes a non-const buffer, so it may overwrite
    // the caller's vector through the const reference -- confirm callers do not
    // pass a truly const object.
    void SeedFixedIntegers(const std::vector<int> &seeds){
      void *buffer = static_cast<void *>(const_cast<int *>(seeds.data()));
      CartesianCommunicator::BroadcastWorld(0,buffer,sizeof(int)*seeds.size());
      std::seed_seq src(seeds.begin(),seeds.end());
      Seed(src,0);
    }
    // Derive integer seeds from the SHA256-based helper applied to "s", log the
    // string and the hex form of the seeds, then seed via SeedFixedIntegers.
    void SeedUniqueString(const std::string &s){
      const std::vector<int> seeds = GridChecksum::sha256_seeds(s);

      // Concatenate the seed words in hexadecimal for the log line.
      std::stringstream sha;
      sha << std::hex;
      for(int word : seeds) {
        sha << word;
      }
      const std::string digest = sha.str();

      std::cout << GridLogMessage << "Intialising serial RNG with unique string '" 
                << s << "'" << std::endl;
      std::cout << GridLogMessage << "Seed SHA256: " << digest << std::endl;

      SeedFixedIntegers(seeds);
    }
  };
  class GridParallelRNG : public GridRNGbase {
    double _time_counter;
  public:
    GridBase *_grid;
-    int _vol;
+    unsigned int _vol;
  public:
-    int generator_idx(int os,int is){
+    int generator_idx(int os,int is) {
      return is*_grid->oSites()+os;
    }
    GridParallelRNG(GridBase *grid) : GridRNGbase() {
-      _grid=grid;
+      _grid = grid;
-      _vol =_grid->iSites()*_grid->oSites();
+      _vol  =_grid->iSites()*_grid->oSites();
      _generators.resize(_vol);
      _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
@@ -309,35 +360,48 @@ namespace Grid {
      typedef typename vobj::scalar_object scalar_object;
      typedef typename vobj::scalar_type scalar_type;
      typedef typename vobj::vector_type vector_type;
      int multiplicity = RNGfillable(_grid,l._grid);
-      int     Nsimd =_grid->Nsimd();
+      double inner_time_counter = usecond();
-      int     osites=_grid->oSites();
+
-      int words=sizeof(scalar_object)/sizeof(scalar_type);
+      int multiplicity = RNGfillable_general(_grid, l._grid); // l has finer or same grid
      int Nsimd  = _grid->Nsimd();  // guaranteed to be the same for l._grid too
      int osites = _grid->oSites();  // guaranteed to be <= l._grid->oSites() by a factor multiplicity
      int words  = sizeof(scalar_object) / sizeof(scalar_type);
      parallel_for(int ss=0;ss<osites;ss++){
        std::vector<scalar_object> buf(Nsimd);
        for (int m = 0; m < multiplicity; m++) {  // Draw from same generator multiplicity times
-	std::vector<scalar_object> buf(Nsimd);
+          int sm = multiplicity * ss + m;  // Maps the generator site to the fine site
 	for(int m=0;m<multiplicity;m++) {// Draw from same generator multiplicity times
-	  int sm=multiplicity*ss+m;      // Maps the generator site to the fine site
+          for (int si = 0; si < Nsimd; si++) {
-
+            
-	  for(int si=0;si<Nsimd;si++){
+            int gdx = generator_idx(ss, si);  // index of generator state
-	    int gdx = generator_idx(ss,si); // index of generator state
+            scalar_type *pointer = (scalar_type *)&buf[si];
-	    scalar_type *pointer = (scalar_type *)&buf[si];
+            dist[gdx].reset();
-	    dist[gdx].reset();
+            for (int idx = 0; idx < words; idx++) 
-	    for(int idx=0;idx<words;idx++){
+              fillScalar(pointer[idx], dist[gdx], _generators[gdx]);
-	      fillScalar(pointer[idx],dist[gdx],_generators[gdx]);
+          }
-	    }
+          // merge into SIMD lanes, FIXME suboptimal implementation
-	  }
+          merge(l._odata[sm], buf);
-
+        }
 	  // merge into SIMD lanes
 	  merge(l._odata[sm],buf);
 	}
      }
      _time_counter += usecond()- inner_time_counter;
    };
    // Derive integer seeds from the SHA256-based helper applied to "s", log the
    // string and the hex form of the seeds, then seed via SeedFixedIntegers.
    void SeedUniqueString(const std::string &s){
      const std::vector<int> seeds = GridChecksum::sha256_seeds(s);

      // Concatenate the seed words in hexadecimal for the log line.
      std::stringstream sha;
      sha << std::hex;
      for(int word : seeds) {
        sha << word;
      }
      const std::string digest = sha.str();

      std::cout << GridLogMessage << "Intialising parallel RNG with unique string '" 
                << s << "'" << std::endl;
      std::cout << GridLogMessage << "Seed SHA256: " << digest << std::endl;

      SeedFixedIntegers(seeds);
    }
    void SeedFixedIntegers(const std::vector<int> &seeds){
      // Everyone generates the same seed_seq based on input seeds
@@ -355,15 +419,14 @@ namespace Grid {
      // MT implementation does not implement fast discard even though
      // in principle this is possible
      ////////////////////////////////////////////////
      std::vector<int> gcoor;
      int rank,o_idx,i_idx;
      // Everybody loops over global volume.
-      for(int gidx=0;gidx<_grid->_gsites;gidx++){
+      parallel_for(int gidx=0;gidx<_grid->_gsites;gidx++){
 	Skip(master_engine); // Skip to next RNG sequence
 	// Where is it?
 	int rank,o_idx,i_idx;
 	std::vector<int> gcoor;
 	_grid->GlobalIndexToGlobalCoor(gidx,gcoor);
 	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
@@ -371,6 +434,7 @@ namespace Grid {
 	if( rank == _grid->ThisRank() ){
 	  int l_idx=generator_idx(o_idx,i_idx);
 	  _generators[l_idx] = master_engine;
 	  Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
 	}
      }
@@ -412,6 +476,12 @@ namespace Grid {
      }
 #endif
    }
    // Log the accumulated _time_counter, divided by 1e3, as the time spent in fill().
    void Report(){
      const double elapsed_ms = _time_counter/1e3;
      std::cout << GridLogMessage
                << "Time spent in the fill() routine by GridParallelRNG: "
                << elapsed_ms << " ms" << std::endl;
    }
    ////////////////////////////////////////////////////////////////////////
    // Support for rigorous test of RNG's
    // Return uniform random uint32_t from requested site generator
@@ -419,7 +489,6 @@ namespace Grid {
    uint32_t GlobalU01(int gsite){
      uint32_t the_number;
      // who
      std::vector<int> gcoor;
      int rank,o_idx,i_idx;
@@ -50,26 +50,22 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
  ////////////////////////////////////////////////////////////////////////////////////////////
  template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
    half.checkerboard = cb;
-    int ssh=0;
+
-    //parallel_for
+    parallel_for(int ss=0;ss<full._grid->oSites();ss++){
    for(int ss=0;ss<full._grid->oSites();ss++){
      std::vector<int> coor;
      int cbos;
-      
+      std::vector<int> coor;
      full._grid->oCoorFromOindex(coor,ss);
      cbos=half._grid->CheckerBoard(coor);
      if (cbos==cb) {
 	int ssh=half._grid->oIndex(coor);
 	half._odata[ssh] = full._odata[ss];
 	ssh++;
      }
    }
  }
  template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
    int cb = half.checkerboard;
-    int ssh=0;
+    parallel_for(int ss=0;ss<full._grid->oSites();ss++){
    //parallel_for
    for(int ss=0;ss<full._grid->oSites();ss++){
      std::vector<int> coor;
      int cbos;
@@ -77,8 +73,8 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
      cbos=half._grid->CheckerBoard(coor);
      if (cbos==cb) {
 	int ssh=half._grid->oIndex(coor);
 	full._odata[ss]=half._odata[ssh];
 	ssh++;
      }
    }
  }
@@ -109,8 +105,8 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
  coarseData=zero;
-  // Loop with a cache friendly loop ordering
+  // Loop over coarse parallel, and then loop over fine associated with coarse.
-  for(int sf=0;sf<fine->oSites();sf++){
+  parallel_for(int sf=0;sf<fine->oSites();sf++){
    int sc;
    std::vector<int> coor_c(_ndimension);
@@ -119,8 +115,9 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
 PARALLEL_CRITICAL
    for(int i=0;i<nbasis;i++) {
-      
+
      coarseData._odata[sc](i)=coarseData._odata[sc](i)
 	+ innerProduct(Basis[i]._odata[sf],fineData._odata[sf]);
@@ -139,6 +136,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
  GridBase * coarse= coarseA._grid;
  fineZ.checkerboard=fineX.checkerboard;
  assert(fineX.checkerboard==fineY.checkerboard);
  subdivides(coarse,fine); // require they map
  conformable(fineX,fineY);
  conformable(fineX,fineZ);
@@ -180,9 +178,10 @@ template<class vobj,class CComplex>
  GridBase *coarse(CoarseInner._grid);
  GridBase *fine  (fineX._grid);
-  Lattice<dotp> fine_inner(fine);
+  Lattice<dotp> fine_inner(fine); fine_inner.checkerboard = fineX.checkerboard;
  Lattice<dotp> coarse_inner(coarse);
  // Precision promotion?
  fine_inner = localInnerProduct(fineX,fineY);
  blockSum(coarse_inner,fine_inner);
  parallel_for(int ss=0;ss<coarse->oSites();ss++){
@@ -193,7 +192,7 @@ template<class vobj,class CComplex>
 inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
 {
  GridBase *coarse = ip._grid;
-  Lattice<vobj> zz(fineX._grid); zz=zero;
+  Lattice<vobj> zz(fineX._grid); zz=zero; zz.checkerboard=fineX.checkerboard;
  blockInnerProduct(ip,fineX,fineX);
  ip = pow(ip,-0.5);
  blockZAXPY(fineX,ip,fineX,zz);
@@ -216,19 +215,25 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
    block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
  }
  // Turn this around to loop threaded over sc and interior loop 
  // over sf would thread better
  coarseData=zero;
-  for(int sf=0;sf<fine->oSites();sf++){
+  parallel_region {
-    
+
    int sc;
    std::vector<int> coor_c(_ndimension);
    std::vector<int> coor_f(_ndimension);
-    Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
+    parallel_for_internal(int sf=0;sf<fine->oSites();sf++){
-    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
+    
-    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
+      Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
-
+      for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
-    coarseData._odata[sc]=coarseData._odata[sc]+fineData._odata[sf];
+      Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
 PARALLEL_CRITICAL
      coarseData._odata[sc]=coarseData._odata[sc]+fineData._odata[sf];
    }
  }
  return;
 }
@@ -238,7 +243,7 @@ inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vob
 {
  GridBase * fine = unpicked._grid;
-  Lattice<vobj> zz(fine);
+  Lattice<vobj> zz(fine); zz.checkerboard = unpicked.checkerboard;
  Lattice<iScalar<vInteger> > fcoor(fine);
  zz = zero;
@@ -303,20 +308,21 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
  }
  // Loop with a cache friendly loop ordering
-  for(int sf=0;sf<fine->oSites();sf++){
+  parallel_region {
    int sc;
    std::vector<int> coor_c(_ndimension);
    std::vector<int> coor_f(_ndimension);
-    Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
+    parallel_for_internal(int sf=0;sf<fine->oSites();sf++){
    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
    for(int i=0;i<nbasis;i++) {
      if(i==0) fineData._odata[sf]=coarseData._odata[sc](i) * Basis[i]._odata[sf];
      else     fineData._odata[sf]=fineData._odata[sf]+coarseData._odata[sc](i)*Basis[i]._odata[sf];
      Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
      for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
      Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
      for(int i=0;i<nbasis;i++) {
 	if(i==0) fineData._odata[sf]=coarseData._odata[sc](i) * Basis[i]._odata[sf];
 	else     fineData._odata[sf]=fineData._odata[sf]+coarseData._odata[sc](i)*Basis[i]._odata[sf];
      }
    }
  }
  return;
@@ -551,7 +557,10 @@ void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
 //Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order
 template<typename vobj, typename sobj>
-typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>::value, void>::type unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in){
+typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>::value, void>::type 
 unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
 {
  typedef typename vobj::vector_type vtype;
  GridBase* in_grid = in._grid;
@@ -591,10 +600,152 @@ typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>
  }
 }
 template<typename vobj, typename sobj>
 typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>::value, void>::type 
 unvectorizeToRevLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
 {
   // Unpack a SIMD-vectorised lattice into an array of scalar objects, one per
   // local site, indexed with Lexicographic::IndexFromCoorReversed.
   typedef typename vobj::vector_type simd_type;

   GridBase *source = in._grid;
   out.resize(source->lSites());

   const int dims  = source->Nd();
   const int lanes = simd_type::Nsimd();

   // Inner (SIMD-lane) coordinate of every lane, computed once up front.
   std::vector<std::vector<int> > lane_coor(lanes, std::vector<int>(dims));
   for(int l=0;l<lanes;l++){
     source->iCoorFromIindex(lane_coor[l], l);
   }

   parallel_for(int osite = 0; osite < source->oSites(); osite++){ // one outer site per iteration
     std::vector<int> outer(dims);
     source->oCoorFromOindex(outer, osite);

     // For each lane, find where its scalar value lives in the output array.
     std::vector<sobj*> dest(lanes);
     std::vector<int> local(dims);
     for(int l=0;l<lanes;l++){
       for(int mu=0;mu<dims;mu++){
         local[mu] = outer[mu] + source->_rdimensions[mu]*lane_coor[l][mu];
       }
       int lex;
       Lexicographic::IndexFromCoorReversed(local, lex, source->_ldimensions);
       dest[l] = &out[lex];
     }

     // Scatter the lanes of this outer site through the pointers.
     extract1(in._odata[osite], dest, 0);
   }
 }
 //Copy array of scalar objects in lexicographic order into a SIMD-vectorized lattice
 template<typename vobj, typename sobj>
 typename std::enable_if<isSIMDvectorized<vobj>::value 
                    && !isSIMDvectorized<sobj>::value, void>::type 
 vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
 {
   // Pack an array of scalar objects (one per local site, indexed with
   // Lexicographic::IndexFromCoor) into a SIMD-vectorised lattice.
   typedef typename vobj::vector_type simd_type;

   GridBase *target = out._grid;
   assert(in.size()==target->lSites());

   const int dims  = target->Nd();
   const int lanes = simd_type::Nsimd();

   // Inner (SIMD-lane) coordinate of every lane, computed once up front.
   std::vector<std::vector<int> > lane_coor(lanes, std::vector<int>(dims));
   for(int l=0;l<lanes;l++){
     target->iCoorFromIindex(lane_coor[l],l);
   }

   parallel_for(uint64_t osite = 0; osite < target->oSites(); osite++){ // one outer site per iteration
     std::vector<int> outer(dims);
     target->oCoorFromOindex(outer, osite);

     // For each lane, find the scalar input element that feeds it.
     std::vector<sobj*> src(lanes);
     std::vector<int> local(dims);
     for(int l=0;l<lanes;l++){
       for(int mu=0;mu<dims;mu++){
         local[mu] = outer[mu] + target->_rdimensions[mu]*lane_coor[l][mu];
       }
       int lex;
       Lexicographic::IndexFromCoor(local, lex, target->_ldimensions);
       src[l] = &in[lex];
     }

     // Gather the lanes into one vector object and store it.
     vobj packed;
     merge1(packed, src, 0);
     out._odata[osite] = packed;
   }
 }
 template<typename vobj, typename sobj>
 typename std::enable_if<isSIMDvectorized<vobj>::value 
                    && !isSIMDvectorized<sobj>::value, void>::type 
 vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
 {
   // Pack an array of scalar objects (one per local site, indexed with
   // Lexicographic::IndexFromCoorReversed) into a SIMD-vectorised lattice.
   typedef typename vobj::vector_type simd_type;

   GridBase *target = out._grid;
   assert(in.size()==target->lSites());

   const int dims  = target->Nd();
   const int lanes = simd_type::Nsimd();

   // Inner (SIMD-lane) coordinate of every lane, computed once up front.
   std::vector<std::vector<int> > lane_coor(lanes, std::vector<int>(dims));
   for(int l=0;l<lanes;l++){
     target->iCoorFromIindex(lane_coor[l],l);
   }

   parallel_for(uint64_t osite = 0; osite < target->oSites(); osite++){ // one outer site per iteration
     std::vector<int> outer(dims);
     target->oCoorFromOindex(outer, osite);

     // For each lane, find the scalar input element that feeds it.
     std::vector<sobj*> src(lanes);
     std::vector<int> local(dims);
     for(int l=0;l<lanes;l++){
       for(int mu=0;mu<dims;mu++){
         local[mu] = outer[mu] + target->_rdimensions[mu]*lane_coor[l][mu];
       }
       int lex;
       Lexicographic::IndexFromCoorReversed(local, lex, target->_ldimensions);
       src[l] = &in[lex];
     }

     // Gather the lanes into one vector object and store it.
     vobj packed;
     merge1(packed, src, 0);
     out._odata[osite] = packed;
   }
 }
 //Convert a Lattice from one precision to another
 template<class VobjOut, class VobjIn>
 void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
  assert(out._grid->Nd() == in._grid->Nd());
  assert(out._grid->FullDimensions() == in._grid->FullDimensions());
  out.checkerboard = in.checkerboard;
  GridBase *in_grid=in._grid;
  GridBase *out_grid = out._grid;
@@ -615,7 +766,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
  std::vector<SobjOut> in_slex_conv(in_grid->lSites());
  unvectorizeToLexOrdArray(in_slex_conv, in);
-  parallel_for(int out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){
+  parallel_for(uint64_t out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){
    std::vector<int> out_ocoor(ndim);
    out_grid->oCoorFromOindex(out_ocoor, out_oidx);
@@ -633,6 +784,302 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
    merge(out._odata[out_oidx], ptrs, 0);
  }
 }
- 
+
 ////////////////////////////////////////////////////////////////////////////////
 // Communicate between grids
 ////////////////////////////////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 // SIMPLE CASE:
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 //
 // Mesh of nodes (2x2) ; subdivide to  1x1 subdivisions
 //
 // Lex ord:   
 //          N0 va0 vb0 vc0 vd0       N1 va1 vb1 vc1 vd1  
 //          N2 va2 vb2 vc2 vd2       N3 va3 vb3 vc3 vd3 
 //
 // Ratio = full[dim] / split[dim]
 //
 // For each dimension do an all to all; get Nvec -> Nvec / ratio
 //                                          Ldim -> Ldim * ratio
 //                                          LocalVol -> LocalVol * ratio
 // full AllToAll(0)
 //          N0 va0 vb0 va1 vb1       N1 vc0 vd0 vc1 vd1   
 //          N2 va2 vb2 va3 vb3       N3 vc2 vd2 vc3 vd3 
 //
 // REARRANGE
 //          N0 va01 vb01      N1 vc01 vd01
 //          N2 va23 vb23      N3 vc23 vd23
 //
 // full AllToAll(1)           // Not what is wanted. FIXME
 //          N0 va01 va23      N1 vc01 vc23 
 //          N2 vb01 vb23      N3 vd01 vd23
 // 
 // REARRANGE
 //          N0 va0123      N1 vc0123
 //          N2 vb0123      N3 vd0123
 //
 // Must also rearrange data to get into the NEW lex order of grid at each stage. Some kind of "insert/extract".
 // NB: Easiest to programme if keep in lex order.
 /*
 *  Let chunk = (fvol*nvec)/sP be size of a chunk.         ( Divide lexico vol * nvec into fP/sP = M chunks )
 *  
 *  2nd A2A (over sP nodes; subdivide the fP into sP chunks of M)
 * 
 *     node 0     1st chunk of node 0M..(1M-1); 2nd chunk of node 0M..(1M-1)..   data chunk x M x sP = fL / sP * M * sP = fL * M growth
 *     node 1     1st chunk of node 1M..(2M-1); 2nd chunk of node 1M..(2M-1)..
 *     node 2     1st chunk of node 2M..(3M-1); 2nd chunk of node 2M..(3M-1)..
 *     node 3     1st chunk of node 3M..(4M-1); 2nd chunk of node 3M..(4M-1)..
 *  etc...
 */
 // Gather a set of lattices living on full_grid into one lattice on split_grid.
 //
 //   full  : input lattices, all on the same grid; their number must equal
 //           full_grid->_Nprocessors / split_grid->_Nprocessors (asserted).
 //   split : output lattice; its checkerboard is set from full[0].
 //
 // The data is unpacked to scalar objects, redistributed one dimension at a
 // time with AllToAll, re-ordered into lexicographic order of the enlarged
 // local volume after each step, and finally re-vectorised into "split".
 template<class Vobj>
 void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj>   & split)
 {
  typedef typename Vobj::scalar_object Sobj;
  int full_vecs   = full.size();
  assert(full_vecs>=1);
  GridBase * full_grid = full[0]._grid;
  GridBase *split_grid = split._grid;
  int       ndim  = full_grid->_ndimension;
  int  full_nproc = full_grid->_Nprocessors;
  int split_nproc =split_grid->_Nprocessors;
  ////////////////////////////////
  // Checkerboard management
  ////////////////////////////////
  int cb = full[0].checkerboard;
  split.checkerboard = cb;
  //////////////////////////////
  // Checks
  //////////////////////////////
  // Same dimensionality, same checkerboard on every input, and identical
  // _gdimensions / _fdimensions on the two grids.
  assert(full_grid->_ndimension==split_grid->_ndimension);
  for(int n=0;n<full_vecs;n++){
    assert(full[n].checkerboard == cb);
    for(int d=0;d<ndim;d++){
      assert(full[n]._grid->_gdimensions[d]==split._grid->_gdimensions[d]);
      assert(full[n]._grid->_fdimensions[d]==split._grid->_fdimensions[d]);
    }
  }
  // The processor-count ratio must be exact and must match the number of inputs.
  int   nvector   =full_nproc/split_nproc; 
  assert(nvector*split_nproc==full_nproc);
  assert(nvector == full_vecs);
  // Per-dimension ratio of processor counts (full / split).
  std::vector<int> ratio(ndim);
  for(int d=0;d<ndim;d++){
    ratio[d] = full_grid->_processors[d]/ split_grid->_processors[d];
  }
  // Scalar staging buffers: lsites entries per vector, nvector vectors.
  uint64_t lsites = full_grid->lSites();
  uint64_t     sz = lsites * nvector;
  std::vector<Sobj> tmpdata(sz);
  std::vector<Sobj> alldata(sz);
  std::vector<Sobj> scalardata(lsites); 
  // Unpack each input lattice and store it at offset v*lsites in alldata.
  for(int v=0;v<nvector;v++){
    unvectorizeToLexOrdArray(scalardata,full[v]);    
    parallel_for(int site=0;site<lsites;site++){
      alldata[v*lsites+site] = scalardata[site];
    }
  }
  int nvec = nvector; // Counts down to 1 as we collapse dims
  std::vector<int> ldims = full_grid->_ldimensions;
  // Work from the last dimension to the first; dimensions whose processor
  // count is unchanged are skipped.
  for(int d=ndim-1;d>=0;d--){
    if ( ratio[d] != 1 ) {
      // Exchange along dimension d on the full grid; if the split grid is still
      // divided in this dimension, exchange again on the split grid.
      full_grid ->AllToAll(d,alldata,tmpdata);
      if ( split_grid->_processors[d] > 1 ) {
 	alldata=tmpdata;
 	split_grid->AllToAll(d,alldata,tmpdata);
      }
      auto rdims = ldims; 
      auto     M = ratio[d];
      auto rsites= lsites*M;// increases rsites by M
      nvec      /= M;       // Reduce nvec by subdivision factor
      rdims[d]  *= M;       // increase local dim by same factor
      int sP =   split_grid->_processors[d];
      int fP =    full_grid->_processors[d]; // not referenced below
      int fvol   = lsites;
      int chunk  = (nvec*fvol)/sP;          assert(chunk*sP == nvec*fvol);
      // Loop over reordered data post A2A
      parallel_for(int c=0;c<chunk;c++){
 	std::vector<int> coor(ndim);
 	for(int m=0;m<M;m++){
 	  for(int s=0;s<sP;s++){
 	    // addressing; use lexico
 	    // lex_c        : position of the element in tmpdata (source)
 	    // lex_fvol_vec : combined (site,vector) index; split below into the
 	    //                site within the old local volume and the vector number
 	    int lex_r;
 	    uint64_t lex_c        = c+chunk*m+chunk*M*s;
 	    uint64_t lex_fvol_vec = c+chunk*s;
 	    uint64_t lex_fvol     = lex_fvol_vec%fvol;
 	    uint64_t lex_vec      = lex_fvol_vec/fvol;
 	    // which node sets an adder to the coordinate
 	    Lexicographic::CoorFromIndex(coor, lex_fvol, ldims);	  
 	    coor[d] += m*ldims[d];
 	    Lexicographic::IndexFromCoor(coor, lex_r, rdims);	  
 	    lex_r += lex_vec * rsites;
 	    // LexicoFind coordinate & vector number within split lattice
 	    alldata[lex_r] = tmpdata[lex_c];
 	  }
 	}
      }
      // The local volume has grown by ratio[d] in dimension d.
      ldims[d]*= ratio[d];
      lsites  *= ratio[d];
    }
  }
  // Pack the re-ordered scalar data into the output lattice.
  vectorizeFromLexOrdArray(alldata,split);    
 }
 // Single-lattice convenience overload: replicate "full" once per unit of the
 // processor-count ratio (full grid / split grid) and forward to the
 // vector-of-lattices overload.
 template<class Vobj>
 void Grid_split(Lattice<Vobj> &full,Lattice<Vobj>   & split)
 {
   const int replicas = full._grid->_Nprocessors / split._grid->_Nprocessors;

   std::vector<Lattice<Vobj> > copies(replicas,full._grid);
   for(auto &copy : copies){
     copy = full;
   }

   Grid_split(copies,split);
 }
 // Scatter one lattice on split_grid back into a set of lattices on full_grid
 // (the reverse data movement of Grid_split).
 //
 //   full  : output lattices, all on the same grid; their number must equal
 //           full_grid->_Nprocessors / split_grid->_Nprocessors (asserted).
 //   split : input lattice.
 //
 // The split lattice is unpacked to scalar objects, then for each dimension
 // the data is re-ordered and redistributed with AllToAll, and finally each
 // vector's portion is re-vectorised into full[v].
 template<class Vobj>
 void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj>   & split)
 {
  typedef typename Vobj::scalar_object Sobj;
  int full_vecs   = full.size();
  assert(full_vecs>=1);
  GridBase * full_grid = full[0]._grid;
  GridBase *split_grid = split._grid;
  int       ndim  = full_grid->_ndimension;
  int  full_nproc = full_grid->_Nprocessors;
  int split_nproc =split_grid->_Nprocessors;
  ////////////////////////////////
  // Checkerboard management
  ////////////////////////////////
  // NOTE(review): data flows split -> full in this routine, yet the checkerboard
  // is copied full[0] -> split here -- confirm this direction is intended.
  int cb = full[0].checkerboard;
  split.checkerboard = cb;
  //////////////////////////////
  // Checks
  //////////////////////////////
  // Same dimensionality, same checkerboard on every output, and identical
  // _gdimensions / _fdimensions on the two grids.
  assert(full_grid->_ndimension==split_grid->_ndimension);
  for(int n=0;n<full_vecs;n++){
    assert(full[n].checkerboard == cb);
    for(int d=0;d<ndim;d++){
      assert(full[n]._grid->_gdimensions[d]==split._grid->_gdimensions[d]);
      assert(full[n]._grid->_fdimensions[d]==split._grid->_fdimensions[d]);
    }
  }
  // The processor-count ratio must be exact and must match the number of outputs.
  int   nvector   =full_nproc/split_nproc; 
  assert(nvector*split_nproc==full_nproc);
  assert(nvector == full_vecs);
  // Per-dimension ratio of processor counts (full / split).
  std::vector<int> ratio(ndim);
  for(int d=0;d<ndim;d++){
    ratio[d] = full_grid->_processors[d]/ split_grid->_processors[d];
  }
  // Scalar staging buffers: lsites entries per vector, nvector vectors.
  uint64_t lsites = full_grid->lSites();
  uint64_t     sz = lsites * nvector;
  std::vector<Sobj> tmpdata(sz);
  std::vector<Sobj> alldata(sz);
  std::vector<Sobj> scalardata(lsites); 
  // Unpack the split lattice into alldata.
  unvectorizeToLexOrdArray(alldata,split);    
  /////////////////////////////////////////////////////////////////
  // Start from split grid and work towards full grid
  /////////////////////////////////////////////////////////////////
  int nvec = 1;
  uint64_t rsites        = split_grid->lSites();
  std::vector<int> rdims = split_grid->_ldimensions;
  // Dimensions are visited first to last, the opposite order to Grid_split;
  // dimensions whose processor count is unchanged are skipped.
  for(int d=0;d<ndim;d++){
    if ( ratio[d] != 1 ) {
      auto     M = ratio[d];
      int sP =   split_grid->_processors[d];
      int fP =    full_grid->_processors[d]; // not referenced below
      auto ldims = rdims;  ldims[d]  /= M;  // Decrease local dims by same factor
      auto lsites= rsites/M;                // Decreases rsites by M
      int fvol   = lsites;
      int chunk  = (nvec*fvol)/sP;          assert(chunk*sP == nvec*fvol);
      {
 	// Re-order the data ahead of the A2A calls below: same index mapping as the
 	// post-A2A loop in Grid_split, with source and destination swapped.
 	parallel_for(int c=0;c<chunk;c++){
 	  std::vector<int> coor(ndim);
 	  for(int m=0;m<M;m++){
 	    for(int s=0;s<sP;s++){
 	      // addressing; use lexico
 	      // lex_c        : position of the element in tmpdata (destination)
 	      // lex_fvol_vec : combined (site,vector) index; split below into the
 	      //                site within the reduced local volume and the vector number
 	      int lex_r;
 	      uint64_t lex_c = c+chunk*m+chunk*M*s;
 	      uint64_t lex_fvol_vec = c+chunk*s;
 	      uint64_t lex_fvol     = lex_fvol_vec%fvol;
 	      uint64_t lex_vec      = lex_fvol_vec/fvol;
 	      // which node sets an adder to the coordinate
 	      Lexicographic::CoorFromIndex(coor, lex_fvol, ldims);	  
 	      coor[d] += m*ldims[d];
 	      Lexicographic::IndexFromCoor(coor, lex_r, rdims);	  
 	      lex_r += lex_vec * rsites;
 	      // LexicoFind coordinate & vector number within split lattice
 	      tmpdata[lex_c] = alldata[lex_r];
 	    }
 	  }
 	}
      }
      // Exchange on the split grid first (only if it is divided in this
      // dimension), then on the full grid.
      if ( split_grid->_processors[d] > 1 ) {
 	split_grid->AllToAll(d,tmpdata,alldata);
 	tmpdata=alldata;
      }
      full_grid ->AllToAll(d,tmpdata,alldata);
      // The local volume has shrunk by M in dimension d.
      rdims[d]/= M;
      rsites  /= M;
      nvec    *= M;       // Increase nvec by subdivision factor
    }
  }
  // Copy each vector's lsites-sized portion out of alldata and pack it into full[v].
  lsites = full_grid->lSites();
  for(int v=0;v<nvector;v++){
    //    assert(v<full.size());
    parallel_for(int site=0;site<lsites;site++){
      //      assert(v*lsites+site < alldata.size());
      scalardata[site] = alldata[v*lsites+site];
    }
    vectorizeFromLexOrdArray(scalardata,full[v]);    
  }
 }
 }
 #endif
@@ -62,14 +62,20 @@ namespace Grid {
    return ret;
  }
-  template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs, ComplexD alpha, Integer Nexp = DEFAULT_MAT_EXP){
+  template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
    Lattice<obj> ret(rhs._grid);
    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
      ret._odata[ss]=Exponentiate(rhs._odata[ss],alpha, Nexp);
    }
    return ret;
  }
@@ -30,6 +30,7 @@ directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/GridCore.h>
 #include <Grid/util/CompilerCompatible.h>
 #include <cxxabi.h>
 #include <memory>
@@ -49,7 +50,7 @@ namespace Grid {
    return (status==0) ? res.get() : name ;
  }
-GridStopWatch Logger::StopWatch;
+GridStopWatch Logger::GlobalStopWatch;
 int Logger::timestamp;
 std::ostream Logger::devnull(0);
@@ -58,13 +59,15 @@ void GridLogTimestamp(int on){
 }
 Colours GridLogColours(0);
-GridLogger GridLogError(1, "Error", GridLogColours, "RED");
+GridLogger GridLogIRL    (1, "IRL"   , GridLogColours, "NORMAL");
 GridLogger GridLogSolver (1, "Solver", GridLogColours, "NORMAL");
 GridLogger GridLogError  (1, "Error" , GridLogColours, "RED");
 GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
 GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL");
-GridLogger GridLogDebug(1, "Debug", GridLogColours, "PURPLE");
+GridLogger GridLogDebug  (1, "Debug", GridLogColours, "PURPLE");
 GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
-GridLogger GridLogIterative(1, "Iterative", GridLogColours, "BLUE");
+GridLogger GridLogIterative  (1, "Iterative", GridLogColours, "BLUE");
-GridLogger GridLogIntegrator(1, "Integrator", GridLogColours, "BLUE");
+GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
 void GridLogConfigure(std::vector<std::string> &logstreams) {
  GridLogError.Active(0);
@@ -94,7 +97,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
 ////////////////////////////////////////////////////////////
 void Grid_quiesce_nodes(void) {
  int me = 0;
-#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L)
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
 #endif
 #ifdef GRID_COMMS_SHMEM
@@ -85,12 +85,16 @@ class Logger {
 protected:
  Colours &Painter;
  int active;
  int timing_mode;
  int topWidth{-1}, chanWidth{-1};
  static int timestamp;
  std::string name, topName;
  std::string COLOUR;
 public:
-  static GridStopWatch StopWatch;
+  static GridStopWatch GlobalStopWatch;
  GridStopWatch         LocalStopWatch;
  GridStopWatch *StopWatch;
  static std::ostream devnull;
  std::string background() {return Painter.colour["NORMAL"];}
@@ -101,22 +105,50 @@ public:
    name(nm),
    topName(topNm),
    Painter(col_class),
-    COLOUR(col) {} ;
+    timing_mode(0),
    COLOUR(col) 
    {
      StopWatch = & GlobalStopWatch;
    };
  void Active(int on) {active = on;};
  int  isActive(void) {return active;};
  static void Timestamp(int on) {timestamp = on;};
-  
+  void Reset(void) { 
    StopWatch->Reset(); 
    StopWatch->Start(); 
  }
  void TimingMode(int on) { 
    timing_mode = on; 
    if(on) { 
      StopWatch = &LocalStopWatch;
      Reset(); 
    }
  }
  void setTopWidth(const int w) {topWidth = w;}
  void setChanWidth(const int w) {chanWidth = w;}
  friend std::ostream& operator<< (std::ostream& stream, Logger& log){
    if ( log.active ) {
-      stream << log.background()<< std::setw(8) << std::left << log.topName << log.background()<< " : ";
+      stream << log.background()<<  std::left;
-      stream << log.colour() << std::setw(10) << std::left << log.name << log.background() << " : ";
+      if (log.topWidth > 0)
      {
        stream << std::setw(log.topWidth);
      }
      stream << log.topName << log.background()<< " : ";
      stream << log.colour() <<  std::left;
      if (log.chanWidth > 0)
      {
        stream << std::setw(log.chanWidth);
      }
      stream << log.name << log.background() << " : ";
      if ( log.timestamp ) {
-	StopWatch.Stop();
+	log.StopWatch->Stop();
-	GridTime now = StopWatch.Elapsed();
+	GridTime now = log.StopWatch->Elapsed();
-	StopWatch.Start();
+	if ( log.timing_mode==1 ) log.StopWatch->Reset();
-	stream << log.evidence()<< now << log.background() << " : " ;
+	log.StopWatch->Start();
 	stream << log.evidence()<< std::setw(6)<<now << log.background() << " : " ;
      }
      stream << log.colour();
      return stream;
@@ -135,6 +167,8 @@ public:
 void GridLogConfigure(std::vector<std::string> &logstreams);
 extern GridLogger GridLogIRL;
 extern GridLogger GridLogSolver;
 extern GridLogger GridLogError;
 extern GridLogger GridLogWarning;
 extern GridLogger GridLogMessage;
@@ -0,0 +1,729 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/parallelIO/BinaryIO.h
    Copyright (C) 2015
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu<guido.cossu@ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_BINARY_IO_H
 #define GRID_BINARY_IO_H
 #if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT) 
 #define USE_MPI_IO
 #else
 #undef  USE_MPI_IO
 #endif
 #ifdef HAVE_ENDIAN_H
 #include <endian.h>
 #endif
 #include <arpa/inet.h>
 #include <algorithm>
 namespace Grid { 
 /////////////////////////////////////////////////////////////////////////////////
 // Byte reversal garbage
 /////////////////////////////////////////////////////////////////////////////////
 // Reverse the byte order of a 32-bit word (0xAABBCCDD -> 0xDDCCBBAA).
 inline uint32_t byte_reverse32(uint32_t f) {
   const uint32_t to_top    = f << 24;                 // lowest byte becomes the highest
   const uint32_t to_upper  = (f << 8) & 0x00FF0000u;
   const uint32_t to_lower  = (f >> 8) & 0x0000FF00u;
   const uint32_t to_bottom = f >> 24;                 // highest byte becomes the lowest
   return to_top | to_upper | to_lower | to_bottom;
 }
 // Reverse the byte order of a 64-bit word
 // (0x0102030405060708 -> 0x0807060504030201).
 inline uint64_t byte_reverse64(uint64_t f) {
   uint64_t reversed = 0;
   // Peel bytes off the bottom of f and push them onto the bottom of the
   // result; after eight steps the original lowest byte sits at the top.
   for (int byte = 0; byte < 8; byte++) {
     reversed = (reversed << 8) | (f & 0xFF);
     f >>= 8;
   }
   return reversed;
 }
 // 64-bit counterpart of ntohl: network (big-endian) order to host order.
 #if BYTE_ORDER != BIG_ENDIAN
 // BYTE_ORDER differs from BIG_ENDIAN: the eight bytes must be reversed.
 inline uint64_t Grid_ntohll(uint64_t A) {
   return byte_reverse64(A);
 }
 #else
 // BYTE_ORDER equals BIG_ENDIAN: the value is returned unchanged.
 inline uint64_t Grid_ntohll(uint64_t A) { return A; }
 #endif
 // A little helper: erase every whitespace character (as classified by
 // ::isspace) from `key`, in place.
 inline void removeWhitespace(std::string &key)
 {
   // ::isspace has undefined behaviour when handed a negative value other than
   // EOF, which is what a plain (signed) char above 0x7F becomes. Taking the
   // character as unsigned char keeps the argument in the valid range.
   key.erase(std::remove_if(key.begin(), key.end(),
                            [](unsigned char c) { return ::isspace(c) != 0; }),
             key.end());
 }
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Static class holding the parallel IO code
 // Could just use a namespace
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 class BinaryIO {
 public:
  /////////////////////////////////////////////////////////////////////////////
  // more byte manipulation helpers
  /////////////////////////////////////////////////////////////////////////////
  // Accumulate the 32-bit word-sum checksum of a lattice into nersc_csum.
  // The lattice is unpacked into a per-site scalar array with
  // unvectorizeToLexOrdArray and that array is handed to NerscChecksum.
  // NOTE(review): nersc_csum is not reset here - presumably callers zero it first; confirm.
  template<class vobj> static inline void Uint32Checksum(Lattice<vobj> &lat,uint32_t &nersc_csum)
  {
    using site_type = typename vobj::scalar_object;
    GridBase *lattice_grid = lat._grid;
    const uint64_t local_sites = lattice_grid->lSites();
    std::vector<site_type> site_array(local_sites);
    unvectorizeToLexOrdArray(site_array,lat);
    NerscChecksum(lattice_grid,site_array,nersc_csum);
  }
  // Add the 32-bit word sum of a site buffer to nersc_csum.
  // Each element of fbuf is viewed as sizeof(fobj)/sizeof(uint32_t) unsigned
  // 32-bit words. The words of the first grid->lSites() elements (or of the
  // single element when fbuf.size()==1) are summed with wrap-around arithmetic.
  // A partial sum is kept per execution of the PARALLEL_REGION body and added
  // to nersc_csum inside PARALLEL_CRITICAL (presumably OpenMP constructs; the
  // macros are defined elsewhere). nersc_csum is added to, never reset, here.
  template <class fobj>
  static inline void NerscChecksum(GridBase *grid, std::vector<fobj> &fbuf, uint32_t &nersc_csum)
  {
    const uint64_t words_per_site = sizeof(fobj) / sizeof(uint32_t);
    const uint64_t grid_sites     = grid->lSites();
    // A one-element buffer is checksummed on its own, whatever the grid size.
    const uint64_t nsites = (fbuf.size() == 1) ? 1 : grid_sites;
 PARALLEL_REGION
    {
      uint32_t partial_sum = 0;
 PARALLEL_FOR_LOOP_INTERN
      for (uint64_t site = 0; site < nsites; site++)
      {
        const uint32_t *words = (uint32_t *)&fbuf[site];
        for (uint64_t w = 0; w < words_per_site; w++)
        {
          partial_sum += words[w];
        }
      }
 PARALLEL_CRITICAL
      {
        nersc_csum = nersc_csum + partial_sum;
      }
    }
  }
  // Fold per-site CRC values into the two SciDAC checksums.
  // For every local site the crc32 of the site's sizeof(fobj) bytes is
  // computed, rotated left by (global lexicographic site index % 29) and by
  // (index % 31), and XORed into scidac_csuma and scidac_csumb respectively.
  // The global index is built from the local coordinate plus
  // grid->LocalStarts() inside grid->FullDimensions().
  // When fbuf holds exactly one element only that element is processed.
  // Both outputs are XOR-accumulated; they are not reset here.
  template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb)
  {
    int nd = grid->_ndimension;
    uint64_t lsites              =grid->lSites();
    if (fbuf.size()==1) {
      lsites=1;
    }
    std::vector<int> local_vol   =grid->LocalDimensions();
    std::vector<int> local_start =grid->LocalStarts();
    std::vector<int> global_vol  =grid->FullDimensions();
 PARALLEL_REGION
    { 
      // Scratch state declared inside the region, one copy per execution of it.
      std::vector<int> coor(nd);
      uint32_t scidac_csuma_thr=0;
      uint32_t scidac_csumb_thr=0;
      uint32_t site_crc=0;
 PARALLEL_FOR_LOOP_INTERN
      for(uint64_t local_site=0;local_site<lsites;local_site++){
        uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
        /* 
         * Scidac csum  is rather more heavyweight
         * FIXME -- 128^3 x 256 x 16 will overflow.
         */
        int global_site;
        Lexicographic::CoorFromIndex(coor,local_site,local_vol);
        for(int d=0;d<nd;d++) {
          coor[d] = coor[d]+local_start[d];
        }
        Lexicographic::IndexFromCoor(coor,global_site,global_vol);
        uint32_t gsite29   = global_site%29;
        uint32_t gsite31   = global_site%31;
        site_crc = crc32(0,(unsigned char *)site_buf,sizeof(fobj));
        //	std::cout << "Site "<<local_site << " crc "<<std::hex<<site_crc<<std::dec<<std::endl;
        //	std::cout << "Site "<<local_site << std::hex<<site_buf[0] <<site_buf[1]<<std::dec <<std::endl;
        // Rotate left by gsiteNN. The right-shift count is masked with 31 so
        // that a rotation by zero shifts by 0 rather than by 32: shifting a
        // 32-bit value by its full width is undefined behaviour. For every
        // non-zero count the mask changes nothing.
        scidac_csuma_thr ^= (site_crc<<gsite29) | (site_crc>>((32-gsite29)&31));
        scidac_csumb_thr ^= (site_crc<<gsite31) | (site_crc>>((32-gsite31)&31));
      }
 PARALLEL_CRITICAL
      {
        scidac_csuma^= scidac_csuma_thr;
        scidac_csumb^= scidac_csumb_thr;
      }
    }
  }
  // Network is big endian
  // Host -> file conversions. Each forwards to the matching file -> host
  // routine, which applies the same byte permutation in both directions.
  // The byte count is 64 bit, like the routines forwarded to, so a buffer of
  // 4GB or more is not silently truncated on the way through.
  static inline void htobe32_v(void *file_object,uint64_t bytes){ be32toh_v(file_object,bytes);} 
  static inline void htobe64_v(void *file_object,uint64_t bytes){ be64toh_v(file_object,bytes);} 
  static inline void htole32_v(void *file_object,uint64_t bytes){ le32toh_v(file_object,bytes);} 
  static inline void htole64_v(void *file_object,uint64_t bytes){ le64toh_v(file_object,bytes);} 
  static inline void be32toh_v(void *file_object,uint64_t bytes)
  {
    uint32_t * f = (uint32_t *)file_object;
    uint64_t count = bytes/sizeof(uint32_t);
    parallel_for(uint64_t i=0;i<count;i++){  
      f[i] = ntohl(f[i]);
    }
  }
  // LE must swap and switch to host
  // Convert a buffer of little-endian 32-bit words to host byte order, in
  // place. Each word is byte-reversed into network (big-endian) order and then
  // passed through ntohl. `bytes` is the buffer length in bytes.
  static inline void le32toh_v(void *file_object,uint64_t bytes)
  {
    uint32_t *fp = (uint32_t *)file_object;
    uint64_t count = bytes/sizeof(uint32_t);
    parallel_for(uint64_t i=0;i<count;i++){  
      // The temporary is local to the loop body. It used to be declared once
      // outside parallel_for, where (presumably a threaded loop - the macro is
      // defined elsewhere) all threads would have written the same variable.
      const uint32_t network_order = byte_reverse32(fp[i]);
      fp[i] = ntohl(network_order);
    }
  }
  // BE is same as network
  static inline void be64toh_v(void *file_object,uint64_t bytes)
  {
    uint64_t * f = (uint64_t *)file_object;
    uint64_t count = bytes/sizeof(uint64_t);
    parallel_for(uint64_t i=0;i<count;i++){  
      f[i] = Grid_ntohll(f[i]);
    }
  }
  // LE must swap and switch;
  // Convert a buffer of little-endian 64-bit words to host byte order, in
  // place. Each word is byte-reversed into network (big-endian) order and then
  // passed through Grid_ntohll. `bytes` is the buffer length in bytes.
  static inline void le64toh_v(void *file_object,uint64_t bytes)
  {
    uint64_t *fp = (uint64_t *)file_object;
    uint64_t count = bytes/sizeof(uint64_t);
    parallel_for(uint64_t i=0;i<count;i++){  
      // The temporary is local to the loop body. The two scratch words used
      // to be declared once outside parallel_for, where (presumably a threaded
      // loop - the macro is defined elsewhere) all threads would have shared
      // them.
      const uint64_t network_order = byte_reverse64(fp[i]);
      fp[i] = Grid_ntohll(network_order);
    }
  }
  /////////////////////////////////////////////////////////////////////////////
  // Real action:
  // Read or Write distributed lexico array of ANY object to a specific location in file 
  //////////////////////////////////////////////////////////////////////////////////////
  static const int BINARYIO_MASTER_APPEND = 0x10;
  static const int BINARYIO_UNORDERED     = 0x08;
  static const int BINARYIO_LEXICOGRAPHIC = 0x04;
  static const int BINARYIO_READ          = 0x02;
  static const int BINARYIO_WRITE         = 0x01;
  // Read or write this rank's part of a distributed lexicographic array.
  //
  //   w            only its type is used: sizeof(word) selects MPI_FLOAT or MPI_DOUBLE
  //   grid         supplies rank, processor layout and global/local dimensions
  //   iodata       lSites() elements, or exactly one element with BINARYIO_MASTER_APPEND
  //   file         path of the file
  //   offset       byte offset of the data; on write it is updated to the file
  //                position after the data, on read it is left unchanged
  //   format       "IEEE32BIG", "IEEE32", "IEEE64BIG" or "IEEE64"; any other
  //                string performs no byte reordering
  //   control      bitwise OR of the BINARYIO_* flags
  //   nersc_csum, scidac_csuma, scidac_csumb
  //                reset to zero on entry, then filled with the checksums
  //
  // With BINARYIO_LEXICOGRAPHIC and more than one rank the transfer uses MPI
  // collective file I/O (assert(0) if built without USE_MPI_IO); otherwise it
  // uses C++ streams. The SciDAC checksum is taken over the bytes in file
  // order and the NERSC checksum over the data in host order. Unless iodata
  // holds a single element the checksums are combined across ranks at the end.
  template<class word,class fobj>
  static inline void IOobject(word w,
                              GridBase *grid,
                              std::vector<fobj> &iodata,
                              std::string file,
                              uint64_t& offset,
                              const std::string &format, int control,
                              uint32_t &nersc_csum,
                              uint32_t &scidac_csuma,
                              uint32_t &scidac_csumb)
  {
    grid->Barrier();
    GridStopWatch timer; 
    GridStopWatch bstimer;
    nersc_csum=0;
    scidac_csuma=0;
    scidac_csumb=0;
    int ndim                 = grid->Dimensions();
    int nrank                = grid->ProcessorCount();
    int myrank               = grid->ThisRank();
    std::vector<int>  psizes = grid->ProcessorGrid(); 
    std::vector<int>  pcoor  = grid->ThisProcessorCoor();
    std::vector<int> gLattice= grid->GlobalDimensions();
    std::vector<int> lLattice= grid->LocalDimensions();
    std::vector<int> lStart(ndim);
    std::vector<int> gStart(ndim);
    // Flatten the file
    uint64_t lsites = grid->lSites();
    if ( control & BINARYIO_MASTER_APPEND )  {
      assert(iodata.size()==1);
    } else {
      assert(lsites==iodata.size());
    }
    // Origin of this rank's sub-block inside the global array, and inside its
    // own local array (always zero).
    for(int d=0;d<ndim;d++){
      gStart[d] = lLattice[d]*pcoor[d];
      lStart[d] = 0;
    }
 #ifdef USE_MPI_IO
    std::vector<int> distribs(ndim,MPI_DISTRIBUTE_BLOCK);
    std::vector<int> dargs   (ndim,MPI_DISTRIBUTE_DFLT_DARG);
    MPI_Datatype mpiObject;
    MPI_Datatype fileArray;
    MPI_Datatype localArray;
    MPI_Datatype mpiword;
    MPI_Offset disp = offset;
    MPI_File fh ;
    MPI_Status status;
    int numword;
    if ( sizeof( word ) == sizeof(float ) ) {
      numword = sizeof(fobj)/sizeof(float);
      mpiword = MPI_FLOAT;
    } else {
      numword = sizeof(fobj)/sizeof(double);
      mpiword = MPI_DOUBLE;
    }
    //////////////////////////////////////////////////////////////////////////////
    // Sobj in MPI phrasing
    //////////////////////////////////////////////////////////////////////////////
    int ierr;
    ierr = MPI_Type_contiguous(numword,mpiword,&mpiObject);    assert(ierr==0);
    ierr = MPI_Type_commit(&mpiObject);    assert(ierr==0);
    //////////////////////////////////////////////////////////////////////////////
    // File global array data type
    //////////////////////////////////////////////////////////////////////////////
    ierr=MPI_Type_create_subarray(ndim,&gLattice[0],&lLattice[0],&gStart[0],MPI_ORDER_FORTRAN, mpiObject,&fileArray);    assert(ierr==0);
    ierr=MPI_Type_commit(&fileArray);    assert(ierr==0);
    //////////////////////////////////////////////////////////////////////////////
    // local lattice array
    //////////////////////////////////////////////////////////////////////////////
    ierr=MPI_Type_create_subarray(ndim,&lLattice[0],&lLattice[0],&lStart[0],MPI_ORDER_FORTRAN, mpiObject,&localArray);    assert(ierr==0);
    ierr=MPI_Type_commit(&localArray);    assert(ierr==0);
 #endif
    //////////////////////////////////////////////////////////////////////////////
    // Byte order
    //////////////////////////////////////////////////////////////////////////////
    int ieee32big = (format == std::string("IEEE32BIG"));
    int ieee32    = (format == std::string("IEEE32"));
    int ieee64big = (format == std::string("IEEE64BIG"));
    int ieee64    = (format == std::string("IEEE64"));
    //////////////////////////////////////////////////////////////////////////////
    // Do the I/O
    //////////////////////////////////////////////////////////////////////////////
    if ( control & BINARYIO_READ ) { 
      timer.Start();
      if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
 #ifdef USE_MPI_IO
        std::cout<< GridLogMessage<<"IOobject: MPI read I/O "<< file<< std::endl;
        ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);    assert(ierr==0);
        ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);    assert(ierr==0);
        ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status);    assert(ierr==0);
        MPI_File_close(&fh);
 #else 
        assert(0);
 #endif
      } else {
        std::cout << GridLogMessage <<"IOobject: C++ read I/O " << file << " : "
                  << iodata.size() * sizeof(fobj) << " bytes" << std::endl;
        std::ifstream fin;
        fin.open(file, std::ios::binary | std::ios::in);
        if (control & BINARYIO_MASTER_APPEND)
        {
          // The single appended object is the last sizeof(fobj) bytes of the
          // file. The size is converted to a signed offset before negation;
          // negating the unsigned sizeof result directly wraps around.
          fin.seekg(-static_cast<std::streamoff>(sizeof(fobj)), fin.end);
        }
        else
        {
          fin.seekg(offset + myrank * lsites * sizeof(fobj));
        }
        fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj));
        assert(fin.fail() == 0);
        fin.close();
      }
      timer.Stop();
      grid->Barrier();
      bstimer.Start();
      // SciDAC checksum on the bytes as stored in the file, NERSC checksum
      // after conversion to host order.
      ScidacChecksum(grid,iodata,scidac_csuma,scidac_csumb);
      if (ieee32big) be32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
      if (ieee32)    le32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
      if (ieee64big) be64toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
      if (ieee64)    le64toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
      NerscChecksum(grid,iodata,nersc_csum);
      bstimer.Stop();
    }
    if ( control & BINARYIO_WRITE ) { 
      bstimer.Start();
      // Mirror image of the read path: NERSC checksum in host order, then
      // convert to file order (iodata is modified in place), then SciDAC.
      NerscChecksum(grid,iodata,nersc_csum);
      if (ieee32big) htobe32_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
      if (ieee32)    htole32_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
      if (ieee64big) htobe64_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
      if (ieee64)    htole64_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
      ScidacChecksum(grid,iodata,scidac_csuma,scidac_csumb);
      bstimer.Stop();
      grid->Barrier();
      timer.Start();
      if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
 #ifdef USE_MPI_IO
        std::cout << GridLogMessage <<"IOobject: MPI write I/O " << file << std::endl;
        ierr = MPI_File_open(grid->communicator, (char *)file.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
        //        std::cout << GridLogMessage << "Checking for errors" << std::endl;
        if (ierr != MPI_SUCCESS)
        {
          // Report both the error class and the specific error, then abort.
          char error_string[BUFSIZ];
          int length_of_error_string, error_class;
          MPI_Error_class(ierr, &error_class);
          MPI_Error_string(error_class, error_string, &length_of_error_string);
          fprintf(stderr, "%3d: %s\n", myrank, error_string);
          MPI_Error_string(ierr, error_string, &length_of_error_string);
          fprintf(stderr, "%3d: %s\n", myrank, error_string);
          MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0);
        }
        std::cout << GridLogDebug << "MPI write I/O set view " << file << std::endl;
        ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
        assert(ierr == 0);
        std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl;
        ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
        assert(ierr == 0);
        // Hand the absolute byte position after the write back to the caller.
        MPI_Offset os;
        MPI_File_get_position(fh, &os);
        MPI_File_get_byte_offset(fh, os, &disp);
        offset = disp;
        MPI_File_close(&fh);
 #else 
        assert(0);
 #endif
      } else { 
        std::cout << GridLogMessage << "IOobject: C++ write I/O " << file << " : "
                  << iodata.size() * sizeof(fobj) << " bytes and offset " << offset << std::endl;
        std::ofstream fout; 
        fout.exceptions ( std::fstream::failbit | std::fstream::badbit );
        try {
          if (offset) { // Must already exist and contain data
            fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
          } else {     // Allow create
            fout.open(file,std::ios::binary|std::ios::out);
          }
        } catch (const std::fstream::failure& exc) {
          std::cout << GridLogError << "Error in opening the file " << file << " for output" <<std::endl;
          std::cout << GridLogError << "Exception description: " << exc.what() << std::endl;
          //	  std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl;
 #ifdef USE_MPI_IO
          MPI_Abort(MPI_COMM_WORLD,1);
 #else
          exit(1);
 #endif
        }
        if ( control & BINARYIO_MASTER_APPEND )  {
          try {
            fout.seekp(0,fout.end);
          } catch (const std::fstream::failure& exc) {
            std::cout << "Exception in seeking file end " << file << std::endl;
          }
        } else {
          try { 
            fout.seekp(offset+myrank*lsites*sizeof(fobj));
          } catch (const std::fstream::failure& exc) {
            std::cout << "Exception in seeking file " << file <<" offset "<< offset << std::endl;
          }
        }
        try {
          fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));//assert( fout.fail()==0);
        }
        catch (const std::fstream::failure& exc) {
          std::cout << "Exception in writing file " << file << std::endl;
          std::cout << GridLogError << "Exception description: "<< exc.what() << std::endl;
 #ifdef USE_MPI_IO
          MPI_Abort(MPI_COMM_WORLD,1);
 #else
          exit(1);
 #endif
        }
        offset  = fout.tellp();
        fout.close();
      }
      timer.Stop();
    }
 #ifdef USE_MPI_IO
    // The three datatypes are committed on every call, whichever I/O path is
    // taken, so they are released here exactly once. Previously mpiObject was
    // never freed and the two subarray types were freed only on the MPI paths.
    MPI_Type_free(&fileArray);
    MPI_Type_free(&localArray);
    MPI_Type_free(&mpiObject);
 #endif
    std::cout<<GridLogMessage<<"IOobject: ";
    if ( control & BINARYIO_READ) std::cout << " read  ";
    else                          std::cout << " write ";
    uint64_t bytes = sizeof(fobj)*iodata.size()*nrank;
    // bytes per microsecond is numerically the MB/s figure printed below.
    std::cout<< bytes <<" bytes in "<<timer.Elapsed() <<" "
             << (double)bytes/ (double)timer.useconds() <<" MB/s "<<std::endl;
    std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed()  <<std::endl;
    //////////////////////////////////////////////////////////////////////////////
    // Safety check
    //////////////////////////////////////////////////////////////////////////////
    // if the data size is 1 we do not want to sum over the MPI ranks
    if (iodata.size() != 1){
      grid->Barrier();
      grid->GlobalSum(nersc_csum);
      grid->GlobalXOR(scidac_csuma);
      grid->GlobalXOR(scidac_csumb);
      grid->Barrier();
    }
  }
  /////////////////////////////////////////////////////////////////////////////
  // Read a Lattice of object
  //////////////////////////////////////////////////////////////////////////////////////
  // Read a lattice from `file` starting at byte `offset`.
  // IOobject fills a buffer of file objects (fobj) and produces the three
  // checksums; `munge` then converts each file object into a scalar site
  // object, and the scalar array is packed into Umu with
  // vectorizeFromLexOrdArray. The time spent in that conversion is logged.
  template<class vobj,class fobj,class munger>
  static inline void readLatticeObject(Lattice<vobj> &Umu,
                                       std::string file,
                                       munger munge,
                                       uint64_t offset,
                                       const std::string &format,
                                       uint32_t &nersc_csum,
                                       uint32_t &scidac_csuma,
                                       uint32_t &scidac_csumb)
  {
    using site_type = typename vobj::scalar_object;
    using word      = typename vobj::Realified::scalar_type;
    const word word_tag = 0;   // passed to IOobject, which receives it by value
    GridBase *lattice_grid     = Umu._grid;
    const uint64_t local_sites = lattice_grid->lSites();
    std::vector<site_type> site_array(local_sites);
    std::vector<fobj>      file_array(local_sites); // Munge, checksum, byte order in here
    const int read_flags = BINARYIO_READ|BINARYIO_LEXICOGRAPHIC;
    IOobject(word_tag,lattice_grid,file_array,file,offset,format,read_flags,
             nersc_csum,scidac_csuma,scidac_csumb);
    GridStopWatch unpack_timer;
    unpack_timer.Start();
    parallel_for(uint64_t site=0;site<local_sites;site++){
      munge(file_array[site], site_array[site]);
    }
    vectorizeFromLexOrdArray(site_array,Umu);
    lattice_grid->Barrier();
    unpack_timer.Stop();
    std::cout<<GridLogMessage<<"readLatticeObject: vectorize overhead "<<unpack_timer.Elapsed()  <<std::endl;
  }
  /////////////////////////////////////////////////////////////////////////////
  // Write a Lattice of object
  //////////////////////////////////////////////////////////////////////////////////////
  // Write a lattice to `file` starting at byte `offset`.
  // Umu is unpacked into a scalar site array with unvectorizeToLexOrdArray,
  // `munge` converts each site into a file object (fobj), and IOobject writes
  // the buffer and produces the three checksums. The time spent unpacking and
  // munging is logged after the write.
  template<class vobj,class fobj,class munger>
    static inline void writeLatticeObject(Lattice<vobj> &Umu,
                                          std::string file,
                                          munger munge,
                                          uint64_t offset,
                                          const std::string &format,
                                          uint32_t &nersc_csum,
                                          uint32_t &scidac_csuma,
                                          uint32_t &scidac_csumb)
  {
    using site_type = typename vobj::scalar_object;
    using word      = typename vobj::Realified::scalar_type;
    const word word_tag = 0;   // passed to IOobject, which receives it by value
    GridBase *lattice_grid     = Umu._grid;
    const uint64_t local_sites = lattice_grid->lSites();
    std::vector<site_type> site_array(local_sites);
    std::vector<fobj>      file_array(local_sites); // Munge, checksum, byte order in here
    //////////////////////////////////////////////////////////////////////////////
    // Munge [ e.g. 3rd row recon ]
    //////////////////////////////////////////////////////////////////////////////
    GridStopWatch pack_timer;
    pack_timer.Start();
    unvectorizeToLexOrdArray(site_array,Umu);
    parallel_for(uint64_t site=0;site<local_sites;site++){
      munge(site_array[site],file_array[site]);
    }
    lattice_grid->Barrier();
    pack_timer.Stop();
    const int write_flags = BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC;
    IOobject(word_tag,lattice_grid,file_array,file,offset,format,write_flags,
             nersc_csum,scidac_csuma,scidac_csumb);
    std::cout<<GridLogMessage<<"writeLatticeObject: unvectorize overhead "<<pack_timer.Elapsed()  <<std::endl;
  }
  /////////////////////////////////////////////////////////////////////////////
  // Read a RNG;  use IOobject and lexico map to an array of state 
  //////////////////////////////////////////////////////////////////////////////////////
  // Restore RNG state from `file`.
  // First pass: one state record per local site is read through IOobject
  // (lexicographic layout, format "IEEE32BIG") and installed in `parallel`.
  // Second pass: a single record is read with BINARYIO_MASTER_APPEND and
  // installed in `serial`. The checksums of the two passes are combined (sum
  // for nersc, XOR for the two scidac values) and logged.
  static inline void readRNG(GridSerialRNG &serial,
                             GridParallelRNG &parallel,
                             std::string file,
                             uint64_t offset,
                             uint32_t &nersc_csum,
                             uint32_t &scidac_csuma,
                             uint32_t &scidac_csumb)
  {
    using RngStateType = GridSerialRNG::RngStateType;
    const int RngStateCount = GridSerialRNG::RngStateCount;
    using RNGstate = std::array<RngStateType,RngStateCount>;
    using word = RngStateType;
    word w=0;
    std::string format = "IEEE32BIG";
    GridBase *grid = parallel._grid;
    uint64_t gsites = grid->gSites();
    uint64_t lsites = grid->lSites();
    // Checksums of the serial-generator record, merged in further down.
    uint32_t serial_nersc   = 0;
    uint32_t serial_scidaca = 0;
    uint32_t serial_scidacb = 0;
    GridStopWatch state_timer;
    std::cout << GridLogMessage << "RNG read I/O on file " << file << std::endl;
    std::vector<RNGstate> records(lsites);
    IOobject(w,grid,records,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
             nersc_csum,scidac_csuma,scidac_csumb);
    state_timer.Start();
    parallel_for(uint64_t site=0;site<lsites;site++){
      std::vector<RngStateType> state(records[site].begin(),records[site].end());
      parallel.SetState(state,site);
    }
    state_timer.Stop();
    records.resize(1);
    IOobject(w,grid,records,file,offset,format,BINARYIO_READ|BINARYIO_MASTER_APPEND,
             serial_nersc,serial_scidaca,serial_scidacb);
    {
      std::vector<RngStateType> state(records[0].begin(),records[0].end());
      serial.SetState(state,0);
    }
    nersc_csum   += serial_nersc;
    scidac_csuma ^= serial_scidaca;
    scidac_csumb ^= serial_scidacb;
    std::cout << GridLogMessage << "RNG file nersc_checksum   " << std::hex << nersc_csum << std::dec << std::endl;
    std::cout << GridLogMessage << "RNG file scidac_checksuma " << std::hex << scidac_csuma << std::dec << std::endl;
    std::cout << GridLogMessage << "RNG file scidac_checksumb " << std::hex << scidac_csumb << std::dec << std::endl;
    std::cout << GridLogMessage << "RNG state overhead " << state_timer.Elapsed() << std::endl;
  }
  /////////////////////////////////////////////////////////////////////////////
  // Write a RNG; lexico map to an array of state and use IOobject
  //////////////////////////////////////////////////////////////////////////////////////
  // Save RNG state to `file`.
  // First pass: the state of `parallel` for every local site is gathered into
  // one record per site and written through IOobject (lexicographic layout,
  // format "IEEE32BIG"). Second pass: the state of `serial` is written as a
  // single record with BINARYIO_MASTER_APPEND. The checksums of the two passes
  // are combined (sum for nersc, XOR for the two scidac values) and logged.
  static inline void writeRNG(GridSerialRNG &serial,
                              GridParallelRNG &parallel,
                              std::string file,
                              uint64_t offset,
                              uint32_t &nersc_csum,
                              uint32_t &scidac_csuma,
                              uint32_t &scidac_csumb)
  {
    using RngStateType = GridSerialRNG::RngStateType;
    using word = RngStateType;
    word w=0;
    const int RngStateCount = GridSerialRNG::RngStateCount;
    using RNGstate = std::array<RngStateType,RngStateCount>;
    GridBase *grid = parallel._grid;
    uint64_t gsites = grid->gSites();
    uint64_t lsites = grid->lSites();
    // Checksums of the serial-generator record; IOobject zeroes them on entry.
    uint32_t serial_nersc;
    uint32_t serial_scidaca;
    uint32_t serial_scidacb;
    GridStopWatch state_timer;
    std::string format = "IEEE32BIG";
    std::cout << GridLogMessage << "RNG write I/O on file " << file << std::endl;
    state_timer.Start();
    std::vector<RNGstate> records(lsites);
    parallel_for(uint64_t site=0;site<lsites;site++){
      std::vector<RngStateType> state(RngStateCount);
      parallel.GetState(state,site);
      std::copy(state.begin(),state.end(),records[site].begin());
    }
    state_timer.Stop();
    IOobject(w,grid,records,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
             nersc_csum,scidac_csuma,scidac_csumb);
    records.resize(1);
    {
      std::vector<RngStateType> state(RngStateCount);
      serial.GetState(state,0);
      std::copy(state.begin(),state.end(),records[0].begin());
    }
    IOobject(w,grid,records,file,offset,format,BINARYIO_WRITE|BINARYIO_MASTER_APPEND,
             serial_nersc,serial_scidaca,serial_scidacb);
    nersc_csum   += serial_nersc;
    scidac_csuma ^= serial_scidaca;
    scidac_csumb ^= serial_scidacb;
    std::cout << GridLogMessage << "RNG file checksum " << std::hex << nersc_csum    << std::dec << std::endl;
    std::cout << GridLogMessage << "RNG file checksuma " << std::hex << scidac_csuma << std::dec << std::endl;
    std::cout << GridLogMessage << "RNG file checksumb " << std::hex << scidac_csumb << std::dec << std::endl;
    std::cout << GridLogMessage << "RNG state overhead " << state_timer.Elapsed() << std::endl;
  }
 };
 }
 #endif
@@ -0,0 +1,875 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/parallelIO/IldgIO.h
 Copyright (C) 2015
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_ILDG_IO_H
 #define GRID_ILDG_IO_H
 #ifdef HAVE_LIME
 #include <algorithm>
 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <map>
 #include <pwd.h>
 #include <sys/utsname.h>
 #include <unistd.h>
 //C-Lime is a must have for this functionality
 extern "C" {  
 #include "lime.h"
 }
 namespace Grid {
 namespace QCD {
  /////////////////////////////////
  // Encode word types as strings
  /////////////////////////////////
 // Generic fallback: a word type without a dedicated specialisation is tagged "unknown".
 template<class word>
 inline std::string ScidacWordMnemonic(void)
 {
   return "unknown";
 }

 // Floating point words
 template<>
 inline std::string ScidacWordMnemonic<double>(void)
 {
   return "D";
 }
 template<>
 inline std::string ScidacWordMnemonic<float>(void)
 {
   return "F";
 }

 // 32 bit integer words
 template<>
 inline std::string ScidacWordMnemonic<int32_t>(void)
 {
   return "I32_t";
 }
 template<>
 inline std::string ScidacWordMnemonic<uint32_t>(void)
 {
   return "U32_t";
 }

 // 64 bit integer words
 template<>
 inline std::string ScidacWordMnemonic<int64_t>(void)
 {
   return "I64_t";
 }
 template<>
 inline std::string ScidacWordMnemonic<uint64_t>(void)
 {
   return "U64_t";
 }
  /////////////////////////////////////////
  // Encode a generic tensor as a string
  /////////////////////////////////////////
 template<class vobj> std::string ScidacRecordTypeString(int &colors, int &spins, int & typesize,int &datacount) { 
   typedef typename getPrecision<vobj>::real_scalar_type stype;
   int _ColourN       = indexRank<ColourIndex,vobj>();
   int _ColourScalar  =  isScalar<ColourIndex,vobj>();
   int _ColourVector  =  isVector<ColourIndex,vobj>();
   int _ColourMatrix  =  isMatrix<ColourIndex,vobj>();
   int _SpinN       = indexRank<SpinIndex,vobj>();
   int _SpinScalar  =  isScalar<SpinIndex,vobj>();
   int _SpinVector  =  isVector<SpinIndex,vobj>();
   int _SpinMatrix  =  isMatrix<SpinIndex,vobj>();
   int _LorentzN       = indexRank<LorentzIndex,vobj>();
   int _LorentzScalar  =  isScalar<LorentzIndex,vobj>();
   int _LorentzVector  =  isVector<LorentzIndex,vobj>();
   int _LorentzMatrix  =  isMatrix<LorentzIndex,vobj>();
   std::stringstream stream;
   stream << "GRID_";
   stream << ScidacWordMnemonic<stype>();
   if ( _LorentzVector )   stream << "_LorentzVector"<<_LorentzN;
   if ( _LorentzMatrix )   stream << "_LorentzMatrix"<<_LorentzN;
   if ( _SpinVector )   stream << "_SpinVector"<<_SpinN;
   if ( _SpinMatrix )   stream << "_SpinMatrix"<<_SpinN;
   if ( _ColourVector )   stream << "_ColourVector"<<_ColourN;
   if ( _ColourMatrix )   stream << "_ColourMatrix"<<_ColourN;
   if ( _ColourScalar && _LorentzScalar && _SpinScalar )   stream << "_Complex";
   typesize = sizeof(typename vobj::scalar_type);
   if ( _ColourMatrix ) typesize*= _ColourN*_ColourN;
   else                 typesize*= _ColourN;
   if ( _SpinMatrix )   typesize*= _SpinN*_SpinN;
   else                 typesize*= _SpinN;
   colors    = _ColourN;
   spins     = _SpinN;
   datacount = _LorentzN;
   return stream.str();
 }
 // Convenience overload: the lattice argument only serves to deduce vobj; it is not inspected.
 template<class vobj>
 std::string ScidacRecordTypeString(Lattice<vobj> & lat,int &colors, int &spins, int & typesize,int &datacount)
 {
   return ScidacRecordTypeString<vobj>(colors,spins,typesize,datacount);
 }
 ////////////////////////////////////////////////////////////
 // Helper to fill out metadata
 ////////////////////////////////////////////////////////////
 // Fill the three metadata objects that accompany a lattice field:
 //   header        <- filled by PrepareMetaData(field,header)
 //   _scidacFile   <- scidacFile constructed from the field's grid
 //   _scidacRecord <- datatype string, layout counts, date, precision and record type
 template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
                                          FieldMetaData &header,
                                          scidacRecord & _scidacRecord,
                                          scidacFile   & _scidacFile) 
 {
   typedef typename getPrecision<vobj>::real_scalar_type real_word;

   // Grid's own metadata comes first; the record date below is taken from it
   PrepareMetaData(field,header);

   // SciDAC private file structure
   _scidacFile = scidacFile(field._grid);

   // SciDAC private record structure, built in a fresh object and then copied out
   scidacRecord record;
   record.datatype   = ScidacRecordTypeString(field,record.colors,record.spins,record.typesize,record.datacount);
   record.date       = header.creation_date;
   record.precision  = ScidacWordMnemonic<real_word>();
   record.recordtype = GRID_IO_FIELD;

   _scidacRecord = record;
 }
 ///////////////////////////////////////////////////////
 // Scidac checksum
 ///////////////////////////////////////////////////////
 // Compare the two hexadecimal sums stored in scidacChecksum_ against the computed values.
 // Returns 1 when both agree, 0 otherwise.
 static int scidacChecksumVerify(scidacChecksum &scidacChecksum_,uint32_t scidac_csuma,uint32_t scidac_csumb)
 {
   // Both stored sums are parsed (base 16) before any comparison is made
   const uint32_t stored_a = stoull(scidacChecksum_.suma,0,16);
   const uint32_t stored_b = stoull(scidacChecksum_.sumb,0,16);

   const bool both_match = (scidac_csuma == stored_a) && (scidac_csumb == stored_b);
   return both_match ? 1 : 0;
 }
 ////////////////////////////////////////////////////////////////////////////////////
 // Lime, ILDG and Scidac I/O classes
 ////////////////////////////////////////////////////////////////////////////////////
 class GridLimeReader : public BinaryIO {
 public:
   ///////////////////////////////////////////////////
   // FIXME: format for RNG? Now just binary out instead
   ///////////////////////////////////////////////////
   FILE       *File;
   LimeReader *LimeR;
   std::string filename;
   /////////////////////////////////////////////
   // Open the file
   /////////////////////////////////////////////
   void open(const std::string &_filename) 
   {
     filename= _filename;
     File = fopen(filename.c_str(), "r");
     if (File == nullptr)
     {
       std::cerr << "cannot open file '" << filename << "'" << std::endl;
       abort();
     }
     LimeR = limeCreateReader(File);
   }
   /////////////////////////////////////////////
   // Close the file
   /////////////////////////////////////////////
   void close(void){
     fclose(File);
     //     limeDestroyReader(LimeR);
   }
  ////////////////////////////////////////////
  // Read a generic lattice field and verify checksum
  ////////////////////////////////////////////
  template<class vobj>
  void readLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
  {
    typedef typename vobj::scalar_object sobj;
    scidacChecksum scidacChecksum_;
    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    std::string format = getFormatString<vobj>();
    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
      uint64_t file_bytes =limeReaderBytes(LimeR);
      //      std::cout << GridLogMessage << limeReaderType(LimeR) << " "<< file_bytes <<" bytes "<<std::endl;
      //      std::cout << GridLogMessage<< " readLimeObject seeking "<<  record_name <<" found record :" <<limeReaderType(LimeR) <<std::endl;
      if ( !strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) )  ) {
 	//	std::cout << GridLogMessage<< " readLimeLatticeBinaryObject matches ! " <<std::endl;
 	uint64_t PayloadSize = sizeof(sobj) * field._grid->_gsites;
 	//	std::cout << "R sizeof(sobj)= " <<sizeof(sobj)<<std::endl;
 	//	std::cout << "R Gsites " <<field._grid->_gsites<<std::endl;
 	//	std::cout << "R Payload expected " <<PayloadSize<<std::endl;
 	//	std::cout << "R file size " <<file_bytes <<std::endl;
 	assert(PayloadSize == file_bytes);// Must match or user error
 	uint64_t offset= ftello(File);
 	//	std::cout << " ReadLatticeObject from offset "<<offset << std::endl;
 	BinarySimpleMunger<sobj,sobj> munge;
 	BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
 	/////////////////////////////////////////////
 	// Insist checksum is next record
 	/////////////////////////////////////////////
 	readLimeObject(scidacChecksum_,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
 	/////////////////////////////////////////////
 	// Verify checksums
 	/////////////////////////////////////////////
 	assert(scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb)==1);
 	return;
      }
    }
  }
  ////////////////////////////////////////////
  // Read a generic serialisable object
  ////////////////////////////////////////////
  void readLimeObject(std::string &xmlstring,std::string record_name)
  {
    // should this be a do while; can we miss a first record??
    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
      //      std::cout << GridLogMessage<< " readLimeObject seeking "<< record_name <<" found record :" <<limeReaderType(LimeR) <<std::endl;
      uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration)
      if ( !strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) )  ) {
 	//	std::cout << GridLogMessage<< " readLimeObject matches ! " << record_name <<std::endl;
 	std::vector<char> xmlc(nbytes+1,'\0');
 	limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);    
 	//	std::cout << GridLogMessage<< " readLimeObject matches XML " << &xmlc[0] <<std::endl;
   xmlstring = std::string(&xmlc[0]);
 	return;
      }
    }  
    assert(0);
  }
  template<class serialisable_object>
  void readLimeObject(serialisable_object &object,std::string object_name,std::string record_name)
  {
    std::string xmlstring;
    readLimeObject(xmlstring, record_name);
 	  XmlReader RD(xmlstring, true, "");
 	  read(RD,object_name,object);
  }
 };
 class GridLimeWriter : public BinaryIO 
 {
 public:
   ///////////////////////////////////////////////////
   // FIXME: format for RNG? Now just binary out instead
   // FIXME: collective calls or not ?
   //      : must know if I am the I/O boss
   ///////////////////////////////////////////////////
   FILE       *File;
   LimeWriter *LimeW;
   std::string filename;
   bool        boss_node;
   GridLimeWriter( bool isboss = true) {
     boss_node = isboss;
   }
   void open(const std::string &_filename) { 
     filename= _filename;
     if ( boss_node ) {
       File = fopen(filename.c_str(), "w");
       LimeW = limeCreateWriter(File); assert(LimeW != NULL );
     }
   }
   /////////////////////////////////////////////
   // Close the file
   /////////////////////////////////////////////
   void close(void) {
     if ( boss_node ) {
       fclose(File);
     }
     //  limeDestroyWriter(LimeW);
   }
  ///////////////////////////////////////////////////////
  // Lime utility functions
  ///////////////////////////////////////////////////////
  int createLimeRecordHeader(std::string message, int MB, int ME, size_t PayloadSize)
  {
    if ( boss_node ) {
      LimeRecordHeader *h;
      h = limeCreateHeader(MB, ME, const_cast<char *>(message.c_str()), PayloadSize);
      assert(limeWriteRecordHeader(h, LimeW) >= 0);
      limeDestroyHeader(h);
    }
    return LIME_SUCCESS;
  }
  ////////////////////////////////////////////
  // Write a generic serialisable object
  ////////////////////////////////////////////
  void writeLimeObject(int MB,int ME,XmlWriter &writer,std::string object_name,std::string record_name)
  {
    if ( boss_node ) {
      std::string xmlstring = writer.docString();
      //    std::cout << "WriteLimeObject" << record_name <<std::endl;
      uint64_t nbytes = xmlstring.size();
      //    std::cout << " xmlstring "<< nbytes<< " " << xmlstring <<std::endl;
      int err;
      LimeRecordHeader *h = limeCreateHeader(MB, ME,const_cast<char *>(record_name.c_str()), nbytes); 
      assert(h!= NULL);
      err=limeWriteRecordHeader(h, LimeW);                    assert(err>=0);
      err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0);
      err=limeWriterCloseRecord(LimeW);                       assert(err>=0);
      limeDestroyHeader(h);
    }
  }
  template<class serialisable_object>
  void writeLimeObject(int MB,int ME,serialisable_object &object,std::string object_name,std::string record_name, const unsigned int scientificPrec = 0)
  {
    XmlWriter WR("","");
    if (scientificPrec)
    {
      WR.scientificFormat(true);
      WR.setPrecision(scientificPrec);
    }
    write(WR,object_name,object);
    writeLimeObject(MB, ME, WR, object_name, record_name);
  }
  ////////////////////////////////////////////////////
  // Write a generic lattice field and csum
  // This routine is Collectively called by all nodes
  // in communicator used by the field._grid
  ////////////////////////////////////////////////////
  template<class vobj>
  void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
  {
    ////////////////////////////////////////////////////////////////////
    // NB: FILE and iostream are jointly writing disjoint sequences in the
    // the same file through different file handles (integer units).
    // 
    // These are both buffered, so why I think this code is right is as follows.
    //
    // i)  write record header to FILE *File, telegraphing the size; flush
    // ii) ftello reads the offset from FILE *File . 
    // iii) iostream / MPI Open independently seek this offset. Write sequence direct to disk.
    //      Closes iostream and flushes.
    // iv) fseek on FILE * to end of this disjoint section.
    //  v) Continue writing scidac record.
    ////////////////////////////////////////////////////////////////////
    GridBase *grid = field._grid;
    assert(boss_node == field._grid->IsBoss() );
    ////////////////////////////////////////////
    // Create record header
    ////////////////////////////////////////////
    typedef typename vobj::scalar_object sobj;
    int err;
    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    uint64_t PayloadSize = sizeof(sobj) * grid->_gsites;
    if ( boss_node ) {
      createLimeRecordHeader(record_name, 0, 0, PayloadSize);
      fflush(File);
    }
    //    std::cout << "W sizeof(sobj)"      <<sizeof(sobj)<<std::endl;
    //    std::cout << "W Gsites "           <<field._grid->_gsites<<std::endl;
    //    std::cout << "W Payload expected " <<PayloadSize<<std::endl;
    ////////////////////////////////////////////////
    // Check all nodes agree on file position
    ////////////////////////////////////////////////
    uint64_t offset1;
    if ( boss_node ) {
      offset1 = ftello(File);    
    }
    grid->Broadcast(0,(void *)&offset1,sizeof(offset1));
    ///////////////////////////////////////////
    // The above is collective. Write by other means into the binary record
    ///////////////////////////////////////////
    std::string format = getFormatString<vobj>();
    BinarySimpleMunger<sobj,sobj> munge;
    BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb);
    ///////////////////////////////////////////
    // Wind forward and close the record
    ///////////////////////////////////////////
    if ( boss_node ) {
      fseek(File,0,SEEK_END);             
      uint64_t offset2 = ftello(File);     //    std::cout << " now at offset "<<offset2 << std::endl;
      assert( (offset2-offset1) == PayloadSize);
    }
    /////////////////////////////////////////////////////////////
    // Check MPI-2 I/O did what we expect to file
    /////////////////////////////////////////////////////////////
    if ( boss_node ) { 
      err=limeWriterCloseRecord(LimeW);  assert(err>=0);
    }
    ////////////////////////////////////////
    // Write checksum element, propagaing forward from the BinaryIO
    // Always pair a checksum with a binary object, and close message
    ////////////////////////////////////////
    scidacChecksum checksum;
    std::stringstream streama; streama << std::hex << scidac_csuma;
    std::stringstream streamb; streamb << std::hex << scidac_csumb;
    checksum.suma= streama.str();
    checksum.sumb= streamb.str();
    if ( boss_node ) { 
      writeLimeObject(0,1,checksum,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
    }
  }
 };
 class ScidacWriter : public GridLimeWriter {
 public:

  // The boss flag is passed straight through to the LIME writer base
  ScidacWriter(bool isboss =true ) : GridLimeWriter(isboss)  { };

  ////////////////////////////////////////////////
  // Write the file-level message: SciDAC private file record (opens the message)
  // followed by the user's file record (closes it). Only the boss node writes.
  ////////////////////////////////////////////////
  template<class SerialisableUserFile>
  void writeScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile)
  {
    scidacFile    _scidacFile(grid);

    if ( !this->boss_node ) return;

    writeLimeObject(1,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
    writeLimeObject(0,1,_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
  }

  ////////////////////////////////////////////////
  // Write generic lattice field in scidac format:
  // Grid header, user record, SciDAC private record, then binary data + checksum.
  // recordScientificPrec is forwarded to the user record's XML writer only.
  ////////////////////////////////////////////////
  template <class vobj, class userRecord>
  void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord,
                              const unsigned int recordScientificPrec = 0) 
  {
    GridBase * grid = field._grid;

    // Gather the metadata describing this field
    FieldMetaData header;
    scidacRecord  _scidacRecord;
    scidacFile    _scidacFile;
    ScidacMetaData(field,header,_scidacRecord,_scidacFile);

    // XML records are written by the boss node alone
    if ( this->boss_node ) {
      // Opens the message
      writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT));
      writeLimeObject(0,0,_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML), recordScientificPrec);
      writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
    }

    // Collective call on all nodes; closes the message with the checksum record
    writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));
  }
 };
 class ScidacReader : public GridLimeReader {
 public:
   template<class SerialisableUserFile>
   void readScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile)
   {
     scidacFile    _scidacFile(grid);
     readLimeObject(_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
     readLimeObject(_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
   }
  ////////////////////////////////////////////////
  // Write generic lattice field in scidac format
  ////////////////////////////////////////////////
  template <class vobj, class userRecord>
  void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord) 
  {
    typedef typename vobj::scalar_object sobj;
    GridBase * grid = field._grid;
    ////////////////////////////////////////
    // fill the Grid header
    ////////////////////////////////////////
    FieldMetaData header;
    scidacRecord  _scidacRecord;
    scidacFile    _scidacFile;
    //////////////////////////////////////////////
    // Fill the Lime file record by record
    //////////////////////////////////////////////
    readLimeObject(header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message 
    readLimeObject(_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
    readLimeObject(_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
    readLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));
  }
  void skipPastBinaryRecord(void) {
    std::string rec_name(ILDG_BINARY_DATA);
    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
      if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) )  ) {
 	skipPastObjectRecord(std::string(SCIDAC_CHECKSUM));
 	return;
      }
    }    
  }
  void skipPastObjectRecord(std::string rec_name) {
    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
      if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) )  ) {
 	return;
      }
    }
  }
  void skipScidacFieldRecord() {
    skipPastObjectRecord(std::string(GRID_FORMAT));
    skipPastObjectRecord(std::string(SCIDAC_RECORD_XML));
    skipPastObjectRecord(std::string(SCIDAC_PRIVATE_RECORD_XML));
    skipPastBinaryRecord();
  }
 };
 class IldgWriter : public ScidacWriter {
 public:
  IldgWriter(bool isboss) : ScidacWriter(isboss) {};
  ///////////////////////////////////
  // A little helper
  ///////////////////////////////////
  void writeLimeIldgLFN(std::string &LFN)
  {
    uint64_t PayloadSize = LFN.size();
    int err;
    createLimeRecordHeader(ILDG_DATA_LFN, 0 , 0, PayloadSize);
    err=limeWriteRecordData(const_cast<char*>(LFN.c_str()), &PayloadSize,LimeW); assert(err>=0);
    err=limeWriterCloseRecord(LimeW); assert(err>=0);
  }
  ////////////////////////////////////////////////////////////////
  // Special ILDG operations ; gauge configs only.
  // Don't require scidac records EXCEPT checksum
  // Use Grid MetaData object if present.
  ////////////////////////////////////////////////////////////////
  template <class vsimd>
  void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,int sequence,std::string LFN,std::string description) 
  {
    GridBase * grid = Umu._grid;
    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
    typedef iLorentzColourMatrix<vsimd> vobj;
    typedef typename vobj::scalar_object sobj;
    ////////////////////////////////////////
    // fill the Grid header
    ////////////////////////////////////////
    FieldMetaData header;
    scidacRecord  _scidacRecord;
    scidacFile    _scidacFile;
    ScidacMetaData(Umu,header,_scidacRecord,_scidacFile);
    std::string format = header.floating_point;
    header.ensemble_id    = description;
    header.ensemble_label = description;
    header.sequence_number = sequence;
    header.ildg_lfn = LFN;
    assert ( (format == std::string("IEEE32BIG"))  
           ||(format == std::string("IEEE64BIG")) );
    //////////////////////////////////////////////////////
    // Fill ILDG header data struct
    //////////////////////////////////////////////////////
    ildgFormat ildgfmt ;
    ildgfmt.field     = std::string("su3gauge");
    if ( format == std::string("IEEE32BIG") ) { 
      ildgfmt.precision = 32;
    } else { 
      ildgfmt.precision = 64;
    }
    ildgfmt.version = 1.0;
    ildgfmt.lx = header.dimension[0];
    ildgfmt.ly = header.dimension[1];
    ildgfmt.lz = header.dimension[2];
    ildgfmt.lt = header.dimension[3];
    assert(header.nd==4);
    assert(header.nd==header.dimension.size());
    //////////////////////////////////////////////////////////////////////////////
    // Fill the USQCD info field
    //////////////////////////////////////////////////////////////////////////////
    usqcdInfo info;
    info.version=1.0;
    info.plaq   = header.plaquette;
    info.linktr = header.link_trace;
    std::cout << GridLogMessage << " Writing config; IldgIO "<<std::endl;
    //////////////////////////////////////////////
    // Fill the Lime file record by record
    //////////////////////////////////////////////
    writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message 
    writeLimeObject(0,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
    writeLimeObject(0,1,info,info.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
    writeLimeObject(1,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
    writeLimeObject(0,0,info,info.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
    writeLimeObject(0,0,ildgfmt,std::string("ildgFormat")   ,std::string(ILDG_FORMAT)); // rec
    writeLimeIldgLFN(header.ildg_lfn);                                                 // rec
    writeLimeLatticeBinaryObject(Umu,std::string(ILDG_BINARY_DATA));      // Closes message with checksum
    //    limeDestroyWriter(LimeW);
  }
 };
 class IldgReader : public GridLimeReader {
 public:
  ////////////////////////////////////////////////////////////////
  // Read either Grid/SciDAC/ILDG configuration
  // Don't require scidac records EXCEPT checksum
  // Use Grid MetaData object if present.
  // Else use ILDG MetaData object if present.
  // Else use SciDAC MetaData object if present.
  ////////////////////////////////////////////////////////////////
  template <class vsimd>
  void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu, FieldMetaData &FieldMetaData_) {
    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
    typedef typename GaugeField::vector_object  vobj;
    typedef typename vobj::scalar_object sobj;
    typedef LorentzColourMatrixF fobj;
    typedef LorentzColourMatrixD dobj;
    GridBase *grid = Umu._grid;
    std::vector<int> dims = Umu._grid->FullDimensions();
    assert(dims.size()==4);
    // Metadata holders
    ildgFormat     ildgFormat_    ;
    std::string    ildgLFN_       ;
    scidacChecksum scidacChecksum_; 
    usqcdInfo      usqcdInfo_     ;
    // track what we read from file
    int found_ildgFormat    =0;
    int found_ildgLFN       =0;
    int found_scidacChecksum=0;
    int found_usqcdInfo     =0;
    int found_ildgBinary =0;
    int found_FieldMetaData =0;
    uint32_t nersc_csum;
    uint32_t scidac_csuma;
    uint32_t scidac_csumb;
    // Binary format
    std::string format;
    //////////////////////////////////////////////////////////////////////////
    // Loop over all records
    // -- Order is poorly guaranteed except ILDG header preceeds binary section.
    // -- Run like an event loop.
    // -- Impose trust hierarchy. Grid takes precedence & look for ILDG, and failing
    //    that Scidac. 
    // -- Insist on Scidac checksum record.
    //////////////////////////////////////////////////////////////////////////
    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
      uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration)
      //////////////////////////////////////////////////////////////////
      // If not BINARY_DATA read a string and parse
      //////////////////////////////////////////////////////////////////
      if ( strncmp(limeReaderType(LimeR), ILDG_BINARY_DATA,strlen(ILDG_BINARY_DATA) )  ) {
 	// Copy out the string
 	std::vector<char> xmlc(nbytes+1,'\0');
 	limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);    
 	//	std::cout << GridLogMessage<< "Non binary record :" <<limeReaderType(LimeR) <<std::endl; //<<"\n"<<(&xmlc[0])<<std::endl;
 	//////////////////////////////////
 	// ILDG format record
  std::string xmlstring(&xmlc[0]);
 	if ( !strncmp(limeReaderType(LimeR), ILDG_FORMAT,strlen(ILDG_FORMAT)) ) { 
 	  XmlReader RD(xmlstring, true, "");
 	  read(RD,"ildgFormat",ildgFormat_);
 	  if ( ildgFormat_.precision == 64 ) format = std::string("IEEE64BIG");
 	  if ( ildgFormat_.precision == 32 ) format = std::string("IEEE32BIG");
 	  assert( ildgFormat_.lx == dims[0]);
 	  assert( ildgFormat_.ly == dims[1]);
 	  assert( ildgFormat_.lz == dims[2]);
 	  assert( ildgFormat_.lt == dims[3]);
 	  found_ildgFormat = 1;
 	}
 	if ( !strncmp(limeReaderType(LimeR), ILDG_DATA_LFN,strlen(ILDG_DATA_LFN)) ) {
 	  FieldMetaData_.ildg_lfn = xmlstring;
 	  found_ildgLFN = 1;
 	}
 	if ( !strncmp(limeReaderType(LimeR), GRID_FORMAT,strlen(ILDG_FORMAT)) ) { 
 	  XmlReader RD(xmlstring, true, "");
 	  read(RD,"FieldMetaData",FieldMetaData_);
 	  format = FieldMetaData_.floating_point;
 	  assert(FieldMetaData_.dimension[0] == dims[0]);
 	  assert(FieldMetaData_.dimension[1] == dims[1]);
 	  assert(FieldMetaData_.dimension[2] == dims[2]);
 	  assert(FieldMetaData_.dimension[3] == dims[3]);
 	  found_FieldMetaData = 1;
 	}
 	if ( !strncmp(limeReaderType(LimeR), SCIDAC_RECORD_XML,strlen(SCIDAC_RECORD_XML)) ) { 
 	  // is it a USQCD info field
 	  if ( xmlstring.find(std::string("usqcdInfo")) != std::string::npos ) { 
 	    //	    std::cout << GridLogMessage<<"...found a usqcdInfo field"<<std::endl;
 	    XmlReader RD(xmlstring, true, "");
 	    read(RD,"usqcdInfo",usqcdInfo_);
 	    found_usqcdInfo = 1;
 	  }
 	}
 	if ( !strncmp(limeReaderType(LimeR), SCIDAC_CHECKSUM,strlen(SCIDAC_CHECKSUM)) ) { 
 	  XmlReader RD(xmlstring, true, "");
 	  read(RD,"scidacChecksum",scidacChecksum_);
 	  found_scidacChecksum = 1;
 	}
      } else {  
 	/////////////////////////////////
 	// Binary data
 	/////////////////////////////////
 	std::cout << GridLogMessage << "ILDG Binary record found : "  ILDG_BINARY_DATA << std::endl;
 	uint64_t offset= ftello(File);
 	if ( format == std::string("IEEE64BIG") ) {
 	  GaugeSimpleMunger<dobj, sobj> munge;
 	  BinaryIO::readLatticeObject< vobj, dobj >(Umu, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
 	} else { 
 	  GaugeSimpleMunger<fobj, sobj> munge;
 	  BinaryIO::readLatticeObject< vobj, fobj >(Umu, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
 	}
 	found_ildgBinary = 1;
      }
    }
    //////////////////////////////////////////////////////
    // Minimally must find binary segment and checksum
    // Since this is an ILDG reader require ILDG format
    //////////////////////////////////////////////////////
    assert(found_ildgBinary);
    assert(found_ildgFormat);
    assert(found_scidacChecksum);
    // Must find something with the lattice dimensions
    assert(found_FieldMetaData||found_ildgFormat);
    if ( found_FieldMetaData ) {
      std::cout << GridLogMessage<<"Grid MetaData was record found: configuration was probably written by Grid ! Yay ! "<<std::endl;
    } else { 
      assert(found_ildgFormat);
      assert ( ildgFormat_.field == std::string("su3gauge") );
      ///////////////////////////////////////////////////////////////////////////////////////
      // Populate our Grid metadata as best we can
      ///////////////////////////////////////////////////////////////////////////////////////
      std::ostringstream vers; vers << ildgFormat_.version;
      FieldMetaData_.hdr_version = vers.str();
      FieldMetaData_.data_type = std::string("4D_SU3_GAUGE_3X3");
      FieldMetaData_.nd=4;
      FieldMetaData_.dimension.resize(4);
      FieldMetaData_.dimension[0] = ildgFormat_.lx ;
      FieldMetaData_.dimension[1] = ildgFormat_.ly ;
      FieldMetaData_.dimension[2] = ildgFormat_.lz ;
      FieldMetaData_.dimension[3] = ildgFormat_.lt ;
      if ( found_usqcdInfo ) { 
 	FieldMetaData_.plaquette = usqcdInfo_.plaq;
 	FieldMetaData_.link_trace= usqcdInfo_.linktr;
 	std::cout << GridLogMessage <<"This configuration was probably written by USQCD "<<std::endl;
 	std::cout << GridLogMessage <<"USQCD xml record Plaquette : "<<FieldMetaData_.plaquette<<std::endl;
 	std::cout << GridLogMessage <<"USQCD xml record LinkTrace : "<<FieldMetaData_.link_trace<<std::endl;
      } else { 
 	FieldMetaData_.plaquette = 0.0;
 	FieldMetaData_.link_trace= 0.0;
 	std::cout << GridLogWarning << "This configuration is unsafe with no plaquette records that can verify it !!! "<<std::endl;
      }
    }
    ////////////////////////////////////////////////////////////
    // Really really want to mandate a scidac checksum
    ////////////////////////////////////////////////////////////
    if ( found_scidacChecksum ) {
      FieldMetaData_.scidac_checksuma = stoull(scidacChecksum_.suma,0,16);
      FieldMetaData_.scidac_checksumb = stoull(scidacChecksum_.sumb,0,16);
      scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb);
      assert( scidac_csuma ==FieldMetaData_.scidac_checksuma);
      assert( scidac_csumb ==FieldMetaData_.scidac_checksumb);
      std::cout << GridLogMessage<<"SciDAC checksums match " << std::endl;
    } else { 
      std::cout << GridLogWarning<<"SciDAC checksums not found. This is unsafe. " << std::endl;
      assert(0); // Can I insist always checksum ?
    }
    if ( found_FieldMetaData || found_usqcdInfo ) {
      FieldMetaData checker;
      GaugeStatistics(Umu,checker);
      assert(fabs(checker.plaquette  - FieldMetaData_.plaquette )<1.0e-5);
      assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
      std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl;
    }
  }
 };
 }}
 //HAVE_LIME
 #endif
 #endif
@@ -0,0 +1,237 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/parallelIO/IldgIO.h
 Copyright (C) 2015
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_ILDGTYPES_IO_H
 #define GRID_ILDGTYPES_IO_H
 #ifdef HAVE_LIME
 extern "C" { // for linkage
 #include "lime.h"
 }
 namespace Grid {
 /////////////////////////////////////////////////////////////////////////////////
 // Data representation of records that enter ILDG and SciDac formats
 /////////////////////////////////////////////////////////////////////////////////
 #define GRID_FORMAT      "grid-format"
 #define ILDG_FORMAT      "ildg-format"
 #define ILDG_BINARY_DATA "ildg-binary-data"
 #define ILDG_DATA_LFN    "ildg-data-lfn"
 #define SCIDAC_CHECKSUM           "scidac-checksum"
 #define SCIDAC_PRIVATE_FILE_XML   "scidac-private-file-xml"
 #define SCIDAC_FILE_XML           "scidac-file-xml"
 #define SCIDAC_PRIVATE_RECORD_XML "scidac-private-record-xml"
 #define SCIDAC_RECORD_XML         "scidac-record-xml"
 #define SCIDAC_BINARY_DATA        "scidac-binary-data"
 // Unused SCIDAC records names; could move to support this functionality
 #define SCIDAC_SITELIST           "scidac-sitelist"
  ////////////////////////////////////////////////////////////
  // Integer codes copied from QIO so that the records Grid writes stay
  // compatible with it.  GRID_IO_SINGLEFILE is the value stored in
  // scidacFile::volfmt by the scidacFile(GridBase*) constructor.
  // NOTE(review): GRID_IO_FIELD / GRID_IO_GLOBAL presumably select the
  // scidacRecord::recordtype -- confirm against the QIO headers.
  const int GRID_IO_SINGLEFILE = 0; // hardcode lift from QIO compat
  const int GRID_IO_MULTIFILE  = 1; // hardcode lift from QIO compat
  const int GRID_IO_FIELD      = 0; // hardcode lift from QIO compat
  const int GRID_IO_GLOBAL     = 1; // hardcode lift from QIO compat
  ////////////////////////////////////////////////////////////
 /////////////////////////////////////////////////////////////////////////////////
 // QIO uses mandatory "private" records fixed format
 // Private is in principle "opaque"; however, it can't be changed now because that would break existing
 // file compatibility, so it should be correct to assume the undocumented but de facto file structure.
 /////////////////////////////////////////////////////////////////////////////////
 struct emptyUserRecord : Serializable { 
  GRID_SERIALIZABLE_CLASS_MEMBERS(emptyUserRecord,int,dummy);
  emptyUserRecord() { dummy=0; };
 };
 ////////////////////////
 // Scidac private file xml
 // <?xml version="1.0" encoding="UTF-8"?><scidacFile><version>1.1</version><spacetime>4</spacetime><dims>16 16 16 32 </dims><volfmt>0</volfmt></scidacFile>
 ////////////////////////
 // Serializable image of the SciDAC private file XML record: format version,
 // number of space-time dimensions, the lattice extents and the volume format.
 // The extents are stored as one whitespace separated string (`dims`), e.g.
 // "16 16 16 32"; use getDimensions()/setDimensions() to convert.
 struct scidacFile : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(scidacFile,
                                  double, version,
                                  int, spacetime,
                                  std::string, dims, // must convert to int
                                  int, volfmt);

  // Parse `dims` into integers.  Extraction stops at the first token that
  // is not an integer; any whitespace (spaces or newlines) separates entries.
  std::vector<int> getDimensions(void) {
    std::stringstream stream(dims);
    std::vector<int> dimensions;
    int n;
    while(stream >> n){
      dimensions.push_back(n);
    }
    return dimensions;
  }

  // Store `dimensions` in `dims` as a single line, entries separated by one
  // space.  An empty vector yields an empty string.
  void setDimensions(std::vector<int> dimensions) {
    const char delimiter = ' ';
    std::stringstream stream;
    for(size_t i=0;i<dimensions.size();i++){
      // Separator goes *between* entries only; no newline is emitted so the
      // record matches the single-line form shown in the example XML.
      if ( i != 0 ) {
        stream << delimiter;
      }
      stream << dimensions[i];
    }
    dims = stream.str();
  }

  // Default constructor: members are left for a reader to fill in.
  scidacFile() =default;

  // Describe the full lattice of `grid` as a single-file volume.
  scidacFile(GridBase * grid){
    version      = 1.0;
    spacetime    = grid->_ndimension;
    setDimensions(grid->FullDimensions());
    volfmt       = GRID_IO_SINGLEFILE;
  }
 };
 ///////////////////////////////////////////////////////////////////////
 // scidac-private-record-xml : example
 // <scidacRecord>
 // <version>1.1</version><date>Tue Jul 26 21:14:44 2011 UTC</date><recordtype>0</recordtype>
 // <datatype>QDP_D3_ColorMatrix</datatype><precision>D</precision><colors>3</colors><spins>4</spins>
 // <typesize>144</typesize><datacount>4</datacount>
 // </scidacRecord>
 ///////////////////////////////////////////////////////////////////////
 struct scidacRecord : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(scidacRecord,
                                  double, version,
                                  std::string, date,
 				  int, recordtype,
 				  std::string, datatype,
 				  std::string, precision,
 				  int, colors,
 				  int, spins,
 				  int, typesize,
 				  int, datacount);
  scidacRecord()
  : version(1.0), recordtype(0), colors(0), spins(0), typesize(0), datacount(0)
  {}
 };
 ////////////////////////
 // ILDG format
 ////////////////////////
 struct ildgFormat : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(ildgFormat,
 				  double, version,
 				  std::string, field,
 				  int, precision,
 				  int, lx,
 				  int, ly,
 				  int, lz,
 				  int, lt);
  ildgFormat() { version=1.0; };
 };
 ////////////////////////
 // USQCD info
 ////////////////////////
 struct usqcdInfo : Serializable { 
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdInfo,
 				  double, version,
 				  double, plaq,
 				  double, linktr,
 				  std::string, info);
  usqcdInfo() { 
    version=1.0; 
  };
 };
 ////////////////////////
 // Scidac Checksum
 ////////////////////////
 struct scidacChecksum : Serializable { 
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(scidacChecksum,
 				  double, version,
 				  std::string, suma,
 				  std::string, sumb);
  scidacChecksum() { 
    version=1.0; 
  };
 };
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Type:           scidac-file-xml         <title>MILC ILDG archival gauge configuration</title>
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Type:           
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////
 // Scidac private file xml 
 // <?xml version="1.0" encoding="UTF-8"?><scidacFile><version>1.1</version><spacetime>4</spacetime><dims>16 16 16 32 </dims><volfmt>0</volfmt></scidacFile> 
 ////////////////////////                                                                                                                                                                              
 // Everything between this #if 0 and the matching #endif is compiled out.
 // It sketches three further Serializable records (propagator file, source
 // info, propagator info) following the same pattern as the records above:
 // a member list plus a constructor that sets version to 1.0.
 #if 0
 ////////////////////////////////////////////////////////////////////////////////////////
 // From http://www.physics.utah.edu/~detar/scidac/qio_2p3.pdf
 ////////////////////////////////////////////////////////////////////////////////////////
 // Propagator file record: version, type string and free-form info.
 struct usqcdPropFile : Serializable { 
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdPropFile,
 				  double, version,
 				  std::string, type,
 				  std::string, info);
  usqcdPropFile() { 
    version=1.0; 
  };
 };
 // Source record: version and free-form info.
 struct usqcdSourceInfo : Serializable { 
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdSourceInfo,
 				  double, version,
 				  std::string, info);
  usqcdSourceInfo() { 
    version=1.0; 
  };
 };
 // Propagator record: version, spin and color integers, free-form info.
 struct usqcdPropInfo : Serializable { 
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdPropInfo,
 				  double, version,
 				  int, spin,
 				  int, color,
 				  std::string, info);
  usqcdPropInfo() { 
    version=1.0; 
  };
 };
 #endif
 }
 #endif
 #endif
@@ -0,0 +1,327 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/parallelIO/NerscIO.h
    Copyright (C) 2015
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <algorithm>
 #include <ctime>
 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <map>
 #include <unistd.h>
 #include <sys/utsname.h>
 #include <pwd.h>
 namespace Grid {
  ///////////////////////////////////////////////////////
  // Precision mapping
  ///////////////////////////////////////////////////////
  // Map the real scalar type underlying `vobj` onto a floating point format
  // name: "IEEE32BIG" when it has the size of float, "IEEE64BIG" when it has
  // the size of double, and the empty string when it matches neither.
  template<class vobj> static std::string getFormatString (void)
  {
    typedef typename getPrecision<vobj>::real_scalar_type stype;
    const char *name = "";
    if ( sizeof(stype) == sizeof(float) )  name = "IEEE32BIG";
    if ( sizeof(stype) == sizeof(double) ) name = "IEEE64BIG";
    return std::string(name);
  }
  ////////////////////////////////////////////////////////////////////////////////
  // header specification/interpretation
  ////////////////////////////////////////////////////////////////////////////////
    class FieldMetaData : Serializable {
    public:
      GRID_SERIALIZABLE_CLASS_MEMBERS(FieldMetaData,
 				      int, nd,
 				      std::vector<int>, dimension,
 				      std::vector<std::string>, boundary,
 				      int, data_start,
 				      std::string, hdr_version,
 				      std::string, storage_format,
 				      double, link_trace,
 				      double, plaquette,
 				      uint32_t, checksum,
 				      uint32_t, scidac_checksuma,
 				      uint32_t, scidac_checksumb,
 				      unsigned int, sequence_number,
 				      std::string, data_type,
 				      std::string, ensemble_id,
 				      std::string, ensemble_label,
 				      std::string, ildg_lfn,
 				      std::string, creator,
 				      std::string, creator_hardware,
 				      std::string, creation_date,
 				      std::string, archive_date,
 				      std::string, floating_point);
      // WARNING: non-initialised values might lead to twisted parallel IO
      // issues, std::string are fine because they initliase to size 0
      // as per C++ standard.
      FieldMetaData(void) 
      : nd(4), dimension(4,0), boundary(4, ""), data_start(0),
      link_trace(0.), plaquette(0.), checksum(0),
      scidac_checksuma(0), scidac_checksumb(0), sequence_number(0)
      {}
    };
  namespace QCD {
    using namespace Grid;
    //////////////////////////////////////////////////////////////////////
    // Bit and Physical Checksumming and QA of data
    //////////////////////////////////////////////////////////////////////
    // Fill the geometry part of `header` from `grid`: number of dimensions,
    // the extent of each dimension (grid->_fdimensions) and a "PERIODIC"
    // boundary label per dimension.  data_start is reset to zero.
    inline void GridMetaData(GridBase *grid,FieldMetaData &header)
    {
      const int ndim = grid->_ndimension;

      header.nd = ndim;
      header.dimension.resize(ndim);
      header.boundary.resize(ndim);
      header.data_start = 0;

      for(int mu=0;mu<ndim;mu++) {
        header.dimension[mu] = grid->_fdimensions[mu];
        header.boundary[mu]  = std::string("PERIODIC");
      }
    }
    // Record provenance in `header`:
    //   creator          - login name of the calling user, if it can be looked up
    //   creation_date    - current local time formatted as "%c %Z"
    //   archive_date     - copy of creation_date
    //   creator_hardware - "<nodename>-<machine>-<sysname>-<release>" from uname()
    // Fields whose underlying system call fails are left unchanged (creator,
    // creator_hardware) or empty (the dates).
    inline void MachineCharacteristics(FieldMetaData &header)
    {
      // Who
      struct passwd *pw = getpwuid (getuid());
      if (pw) header.creator = std::string(pw->pw_name);

      // When
      // strftime is used rather than std::put_time so that the date is
      // actually filled in; localtime() may return null, and strftime()
      // returns 0 (buffer contents unspecified) when the result does not fit.
      std::time_t t = std::time(nullptr);
      const std::tm *tm_ = std::localtime(&t);
      char date[128];
      if ( tm_ == nullptr || std::strftime(date, sizeof(date), "%c %Z", tm_) == 0 ) {
        date[0] = '\0';
      }
      header.creation_date = std::string(date);
      header.archive_date  = header.creation_date;

      // What
      struct utsname name;
      if ( uname(&name) != -1 ) { // buffers are only valid when uname succeeds
        header.creator_hardware = std::string(name.nodename)+"-";
        header.creator_hardware+= std::string(name.machine)+"-";
        header.creator_hardware+= std::string(name.sysname)+"-";
        header.creator_hardware+= std::string(name.release);
      }
    }
 // dump_meta_data(field, s): write `field` to the stream `s` as a text header,
 // one "KEY = value" line per entry, bracketed by BEGIN_HEADER / END_HEADER.
 //  - Exactly four DIMENSION_n and four BOUNDARY_n lines are written, so
 //    field.dimension and field.boundary must each hold at least 4 entries.
 //  - The three checksums are written in hexadecimal (width 10); the stream
 //    is switched back to decimal after each one.
 //  - The macro expands to a plain sequence of statements (it is NOT wrapped
 //    in do { } while(0)), so it must not be used as the body of an unbraced
 //    if/else/for, and both arguments are evaluated many times.
 #define dump_meta_data(field, s)					\
      s << "BEGIN_HEADER"      << std::endl;				\
      s << "HDR_VERSION = "    << field.hdr_version    << std::endl;	\
      s << "DATATYPE = "       << field.data_type      << std::endl;	\
      s << "STORAGE_FORMAT = " << field.storage_format << std::endl;	\
      for(int i=0;i<4;i++){						\
 	s << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ; \
      }									\
      s << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl; \
      s << "PLAQUETTE  = " << std::setprecision(10) << field.plaquette  << std::endl; \
      for(int i=0;i<4;i++){						\
 	s << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl;	\
      }									\
 									\
      s << "CHECKSUM = "<< std::hex << std::setw(10) << field.checksum << std::dec<<std::endl; \
      s << "SCIDAC_CHECKSUMA = "<< std::hex << std::setw(10) << field.scidac_checksuma << std::dec<<std::endl; \
      s << "SCIDAC_CHECKSUMB = "<< std::hex << std::setw(10) << field.scidac_checksumb << std::dec<<std::endl; \
      s << "ENSEMBLE_ID = "     << field.ensemble_id      << std::endl;	\
      s << "ENSEMBLE_LABEL = "  << field.ensemble_label   << std::endl;	\
      s << "SEQUENCE_NUMBER = " << field.sequence_number  << std::endl;	\
      s << "CREATOR = "         << field.creator          << std::endl;	\
      s << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl;	\
      s << "CREATION_DATE = "   << field.creation_date    << std::endl;	\
      s << "ARCHIVE_DATE = "    << field.archive_date     << std::endl;	\
      s << "FLOATING_POINT = "  << field.floating_point   << std::endl;	\
      s << "END_HEADER"         << std::endl;
 // Generic metadata preparation for a lattice field: records the floating
 // point format implied by vobj, zeroes the NERSC checksum, then fills in
 // the grid geometry and the machine/user provenance.
 template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMetaData &header)
 {
   header.floating_point = getFormatString<vobj>();
   header.checksum       = 0x0; // Nersc checksum unused in ILDG, Scidac
   GridMetaData(field._grid,header);
   MachineCharacteristics(header);
 }
 // Single precision gauge field: store the link trace and the average
 // plaquette of `data`, computed with periodic WilsonLoops, in `header`.
 inline void GaugeStatistics(Lattice<vLorentzColourMatrixF> & data,FieldMetaData &header)
 {
   typedef Grid::QCD::WilsonLoops<PeriodicGimplF> Loops;
   header.link_trace = Loops::linkTrace(data);
   header.plaquette  = Loops::avgPlaquette(data);
 }
 // Double precision gauge field: store the link trace and the average
 // plaquette of `data`, computed with periodic WilsonLoops, in `header`.
 inline void GaugeStatistics(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
 {
   typedef Grid::QCD::WilsonLoops<PeriodicGimplD> Loops;
   header.link_trace = Loops::linkTrace(data);
   header.plaquette  = Loops::avgPlaquette(data);
 }
 // Single precision gauge field specialisation: as the generic version, but
 // additionally records link trace and plaquette before the provenance.
 template<> inline void PrepareMetaData<vLorentzColourMatrixF>(Lattice<vLorentzColourMatrixF> & field, FieldMetaData &header)
 {
   header.floating_point = getFormatString<vLorentzColourMatrixF>();
   header.checksum       = 0x0; // Nersc checksum unused in ILDG, Scidac
   GridMetaData(field._grid,header);
   GaugeStatistics(field,header);
   MachineCharacteristics(header);
 }
 // Double precision gauge field specialisation: as the generic version, but
 // additionally records link trace and plaquette before the provenance.
 template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header)
 {
   header.floating_point = getFormatString<vLorentzColourMatrixD>();
   header.checksum       = 0x0; // Nersc checksum unused in ILDG, Scidac
   GridMetaData(field._grid,header);
   GaugeStatistics(field,header);
   MachineCharacteristics(header);
 }
    //////////////////////////////////////////////////////////////////////
    // Utilities ; these are QCD aware
    //////////////////////////////////////////////////////////////////////
    // For every direction mu, overwrite row 2 of the colour matrix with the
    // adjoint of the cross product of rows 0 and 1:
    //   row2[c] = adj( row0[c+1]*row1[c+2] - row0[c+2]*row1[c+1] ), indices mod 3.
    // Rows 0 and 1 are only read.
    inline void reconstruct3(LorentzColourMatrix & cm)
    {
      for(int mu=0;mu<Nd;mu++){
        for(int c=0;c<3;c++){
          const int c1 = (c+1)%3; // next column, cyclically
          const int c2 = (c+2)%3; // the one after
          cm(mu)()(2,c) = adj(cm(mu)()(0,c1)*cm(mu)()(1,c2)-cm(mu)()(0,c2)*cm(mu)()(1,c1));
        }
      }
    }
    ////////////////////////////////////////////////////////////////////////////////
    // Some data types for intermediate storage
    ////////////////////////////////////////////////////////////////////////////////
    // Intermediate storage for a gauge link with only two rows kept: for each
    // of the Nd directions, 2 rows of Nc entries.  The Gauge3x2 mungers in
    // this file convert between this layout and the full colour matrix.
    template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, Nd >;
    // Scalar instantiations in default, single and double precision.
    typedef iLorentzColour2x3<Complex>  LorentzColour2x3;
    typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
    typedef iLorentzColour2x3<ComplexD> LorentzColour2x3D;
 /////////////////////////////////////////////////////////////////////////////////
 // Simple classes for precision conversion
 /////////////////////////////////////////////////////////////////////////////////
 template <class fobj, class sobj>
 struct BinarySimpleUnmunger {
  typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
  typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;
  void operator()(sobj &in, fobj &out) {
    // take word by word and transform accoding to the status
    fobj_stype *out_buffer = (fobj_stype *)&out;
    sobj_stype *in_buffer = (sobj_stype *)&in;
    size_t fobj_words = sizeof(out) / sizeof(fobj_stype);
    size_t sobj_words = sizeof(in) / sizeof(sobj_stype);
    assert(fobj_words == sobj_words);
    for (unsigned int word = 0; word < sobj_words; word++)
      out_buffer[word] = in_buffer[word];  // type conversion on the fly
  }
 };
 template <class fobj, class sobj>
 struct BinarySimpleMunger {
  typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
  typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;
  void operator()(fobj &in, sobj &out) {
    // take word by word and transform accoding to the status
    fobj_stype *in_buffer = (fobj_stype *)&in;
    sobj_stype *out_buffer = (sobj_stype *)&out;
    size_t fobj_words = sizeof(in) / sizeof(fobj_stype);
    size_t sobj_words = sizeof(out) / sizeof(sobj_stype);
    assert(fobj_words == sobj_words);
    for (unsigned int word = 0; word < sobj_words; word++)
      out_buffer[word] = in_buffer[word];  // type conversion on the fly
  }
 };
    // Copies a full Nc x Nc colour matrix per direction from the file object
    // `in` to the scalar object `out`, element by element.
    template<class fobj,class sobj>
    struct GaugeSimpleMunger{
      void operator()(fobj &in, sobj &out) {
        for (int mu = 0; mu < Nd; mu++) {
          // Row-major walk over the matrix: row = idx / Nc, col = idx % Nc
          for (int idx = 0; idx < Nc*Nc; idx++) {
            const int row = idx / Nc;
            const int col = idx % Nc;
            out(mu)()(row, col) = in(mu)()(row, col);
          }
        }
      }
    };
    // Copies a full Nc x Nc colour matrix per direction from the scalar object
    // `in` to the file object `out`, element by element.
    template <class fobj, class sobj>
    struct GaugeSimpleUnmunger {
      void operator()(sobj &in, fobj &out) {
        for (int mu = 0; mu < Nd; mu++) {
          // Row-major walk over the matrix: row = idx / Nc, col = idx % Nc
          for (int idx = 0; idx < Nc*Nc; idx++) {
            const int row = idx / Nc;
            const int col = idx % Nc;
            out(mu)()(row, col) = in(mu)()(row, col);
          }
        }
      }
    };
    // Expands a two-row file object into a full colour matrix: rows 0 and 1
    // are copied for every direction, then reconstruct3() fills in row 2.
    template<class fobj,class sobj>
    struct Gauge3x2munger{
      void operator() (fobj &in,sobj &out){
        for(int mu=0;mu<Nd;mu++){
          for(int row=0;row<2;row++){
            for(int col=0;col<3;col++){
              out(mu)()(row,col) = in(mu)(row)(col);
            }
          }
        }
        reconstruct3(out);
      }
    };
    // Compresses a full colour matrix into a two-row file object: only rows
    // 0 and 1 of every direction are copied; row 2 is dropped.
    template<class fobj,class sobj>
    struct Gauge3x2unmunger{
      void operator() (sobj &in,fobj &out){
        for(int mu=0;mu<Nd;mu++){
          for(int row=0;row<2;row++){
            for(int col=0;col<3;col++){
              out(mu)(row)(col) = in(mu)()(row,col);
            }
          }
        }
      }
    };
  }
 }
@@ -0,0 +1,363 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/parallelIO/NerscIO.h
    Copyright (C) 2015
    Author: Matt Spraggs <matthew.spraggs@gmail.com>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_NERSC_IO_H
 #define GRID_NERSC_IO_H
 namespace Grid {
  namespace QCD {
    using namespace Grid;
    ////////////////////////////////////////////////////////////////////////////////
    // Write and read from fstream; compute header offset for payload
    ////////////////////////////////////////////////////////////////////////////////
    class NerscIO : public BinaryIO { 
    public:
      // Create `file`, or empty it if it already exists: an output-only
      // stream discards any previous contents when it is opened.
      static inline void truncate(std::string file)
      {
        std::ofstream discard(file,std::ios::out);
        discard.close();
      }
      // Write the text header for `field` at the start of `file` and return
      // the stream position just past it, which is also stored in
      // field.data_start.  The file is opened in|out, i.e. it must already
      // exist and its contents beyond the header are left in place.
      static inline unsigned int writeHeader(FieldMetaData &field,std::string file)
      {
        std::ofstream hdr(file,std::ios::out|std::ios::in);
        hdr.seekp(0,std::ios::beg);

        dump_meta_data(field, hdr);

        field.data_start = hdr.tellp();
        return field.data_start;
      }
      // for the header-reader
      // Parse the text header at the start of `file` into `field`.
      //
      // The header must start with a BEGIN_HEADER line and is read line by
      // line, collecting "KEY = value" pairs (whitespace stripped from both)
      // until a line containing END_HEADER.  Every line read is echoed to the
      // log.  The lattice described by the header must be four dimensional
      // and match grid->_fdimensions exactly.
      //
      // Returns the stream offset of the first byte after the header, which
      // is also stored in field.data_start.
      //
      // Reading stops at end of file as well, so a truncated header trips
      // the END_HEADER assertion instead of looping forever.
      static inline int readHeader(std::string file,GridBase *grid,  FieldMetaData &field)
      {
        std::map<std::string,std::string> header;
        std::string line;

        //////////////////////////////////////////////////
        // read the header
        //////////////////////////////////////////////////
        std::ifstream fin(file);

        getline(fin,line); // read one line and insist it is BEGIN_HEADER
        removeWhitespace(line);
        std::cout << GridLogMessage << "* " << line << std::endl;
        assert(line==std::string("BEGIN_HEADER"));

        bool found_end = false;
        while ( !found_end && getline(fin,line) ) {
          std::cout << GridLogMessage << "* "<<line<< std::endl;

          // A key must be non-empty, so '=' in column 0 is ignored
          const std::string::size_type eq = line.find("=");
          if ( eq != std::string::npos && eq > 0 ) {
            std::string key=line.substr(0,eq);
            std::string val=line.substr(eq+1);
            removeWhitespace(key);
            removeWhitespace(val);
            header[key] = val;
          }

          found_end = ( line.find("END_HEADER") != std::string::npos );
        }
        assert(found_end); // ran out of file before END_HEADER

        field.data_start = fin.tellg();

        //////////////////////////////////////////////////
        // chomp the values
        //////////////////////////////////////////////////
        field.hdr_version    = header["HDR_VERSION"];
        field.data_type      = header["DATATYPE"];
        field.storage_format = header["STORAGE_FORMAT"];

        field.dimension[0] = std::stol(header["DIMENSION_1"]);
        field.dimension[1] = std::stol(header["DIMENSION_2"]);
        field.dimension[2] = std::stol(header["DIMENSION_3"]);
        field.dimension[3] = std::stol(header["DIMENSION_4"]);

        assert(grid->_ndimension == 4);
        for(int d=0;d<4;d++){
          assert(grid->_fdimensions[d]==field.dimension[d]);
        }

        field.link_trace = std::stod(header["LINK_TRACE"]);
        field.plaquette  = std::stod(header["PLAQUETTE"]);

        field.boundary[0] = header["BOUNDARY_1"];
        field.boundary[1] = header["BOUNDARY_2"];
        field.boundary[2] = header["BOUNDARY_3"];
        field.boundary[3] = header["BOUNDARY_4"];

        field.checksum = std::stoul(header["CHECKSUM"],0,16); // stored in hex

        field.ensemble_id      = header["ENSEMBLE_ID"];
        field.ensemble_label   = header["ENSEMBLE_LABEL"];
        field.sequence_number  = std::stol(header["SEQUENCE_NUMBER"]);
        field.creator          = header["CREATOR"];
        field.creator_hardware = header["CREATOR_HARDWARE"];
        field.creation_date    = header["CREATION_DATE"];
        field.archive_date     = header["ARCHIVE_DATE"];
        field.floating_point   = header["FLOATING_POINT"];

        return field.data_start;
      }
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Now the meat: the object readers
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Read a NERSC gauge configuration from `file` into `Umu`.
    //
    // The header is parsed into `header`; its DATATYPE selects the on-disk
    // layout ("4D_SU3_GAUGE": two rows stored, third reconstructed;
    // "4D_SU3_GAUGE_3x3": full matrices) and FLOATING_POINT selects 32 or 64
    // bit reals.  Any other DATATYPE asserts.
    //
    // After reading, plaquette and link trace are recomputed from `Umu` and
    // compared with the header (tolerances 1e-5 and 1e-6), and the NERSC
    // checksum of the data is compared with the header checksum.  A checksum
    // mismatch is reported on stderr and terminates the program with a
    // non-zero exit status; the remaining checks are assertions.
    template<class vsimd>
    static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
					 FieldMetaData& header,
					 std::string file)
    {
      uint64_t offset = readHeader(file,Umu._grid,header);

      // Scratch copy that receives the observables measured on the data read
      FieldMetaData clone(header);

      std::string format(header.floating_point);

      int ieee32big = (format == std::string("IEEE32BIG"));
      int ieee32    = (format == std::string("IEEE32"));
      int ieee64big = (format == std::string("IEEE64BIG"));
      int ieee64    = (format == std::string("IEEE64"));

      // Zero initialised so that an unrecognised FLOATING_POINT string (no
      // reader runs) yields a defined value in the checksum test below.
      uint32_t nersc_csum=0,scidac_csuma=0,scidac_csumb=0;

      // depending on datatype, set up munger;
      // munger is a function of <floating point, Real, data_type>
      if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
	if ( ieee32 || ieee32big ) {
	  BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3F> 
	    (Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
	     nersc_csum,scidac_csuma,scidac_csumb);
	}
	if ( ieee64 || ieee64big ) {
	  BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3D> 
	    (Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
	     nersc_csum,scidac_csuma,scidac_csumb);
	}
      } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
	if ( ieee32 || ieee32big ) {
	  BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
	    (Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
	     nersc_csum,scidac_csuma,scidac_csumb);
	}
	if ( ieee64 || ieee64big ) {
	  BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
	    (Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format,
	     nersc_csum,scidac_csuma,scidac_csumb);
	}
      } else {
	assert(0);
      }

      GaugeStatistics(Umu,clone);

      std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec
	       <<" header   "<<std::hex<<header.checksum<<std::dec <<std::endl;
      std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" plaquette "<<clone.plaquette
	       <<" header    "<<header.plaquette<<std::endl;
      std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" link_trace "<<clone.link_trace
	       <<" header    "<<header.link_trace<<std::endl;

      if ( fabs(clone.plaquette -header.plaquette ) >=  1.0e-5 ) { 
	// Dump the first two sites to help diagnose the mismatch
	std::cout << " Plaquette mismatch "<<std::endl;
	std::cout << Umu[0]<<std::endl;
	std::cout << Umu[1]<<std::endl;
      }
      if ( nersc_csum != header.checksum ) { 
	std::cerr << " checksum mismatch " << std::endl;
	std::cerr << " plaqs " << clone.plaquette << " " << header.plaquette << std::endl;
	std::cerr << " trace " << clone.link_trace<< " " << header.link_trace<< std::endl;
	std::cerr << " nersc_csum  " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
	exit(1); // corrupt data is a failure: do not report success to the caller's shell
      }
      assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
      assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
      assert(nersc_csum == header.checksum );

      std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
    }
      // Write the gauge field `Umu` to `file` in NERSC format.
      //
      // The data is always written as full 3x3 matrices in 64 bit big endian
      // reals ("4D_SU3_GAUGE_3x3" / "IEEE64BIG"); the `two_row` and `bits32`
      // arguments are accepted but not used by this implementation.
      //
      // Sequence: the boss rank truncates the file and writes a first header
      // (checksum still zero) to learn the payload offset, the offset is
      // broadcast from rank 0, all ranks write the payload, and finally the
      // boss rank rewrites the header with the NERSC checksum filled in.
      template<class vsimd>
      static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
 					    std::string file, 
 					    int two_row,
 					    int bits32)
      {
 	typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
 	typedef iLorentzColourMatrix<vsimd> vobj;
 	typedef typename vobj::scalar_object sobj;
 	FieldMetaData header;
 	///////////////////////////////////////////
 	// Following should become arguments
 	///////////////////////////////////////////
 	header.sequence_number = 1;
 	header.ensemble_id     = "UKQCD";
 	header.ensemble_label  = "DWF";
 	typedef LorentzColourMatrixD fobj3D;
 	typedef LorentzColour2x3D    fobj2D;
 	GridBase *grid = Umu._grid;
 	GridMetaData(grid,header);
 	assert(header.nd==4);
 	GaugeStatistics(Umu,header);
 	MachineCharacteristics(header);
 	// Only assigned on the boss rank below; other ranks receive it via Broadcast
 	uint64_t offset;
 	// Sod it -- always write 3x3 double
 	header.floating_point = std::string("IEEE64BIG");
 	header.data_type      = std::string("4D_SU3_GAUGE_3x3");
 	GaugeSimpleUnmunger<fobj3D,sobj> munge;
 	if ( grid->IsBoss() ) { 
 	  truncate(file);
 	  offset = writeHeader(header,file);
 	}
 	grid->Broadcast(0,(void *)&offset,sizeof(offset));
 	uint32_t nersc_csum,scidac_csuma,scidac_csumb;
 	BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
 								  nersc_csum,scidac_csuma,scidac_csumb);
 	header.checksum = nersc_csum;
 	// Second header pass: same text layout, now carrying the real checksum
 	if ( grid->IsBoss() ) { 
 	  writeHeader(header,file);
 	}
 	std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum "
 		 <<std::hex<<header.checksum
 		 <<std::dec<<" plaq "<< header.plaquette <<std::endl;
      }
      ///////////////////////////////
      // RNG state
      ///////////////////////////////
      // Write the state of the serial and parallel random number generators
      // to `file` behind a NERSC style header.
      //
      // The header's FLOATING_POINT / DATATYPE strings name the generator
      // selected at compile time (RNG_RANLUX, RNG_MT19937 or RNG_SITMO);
      // link trace and plaquette are written as zero.  As for gauge
      // configurations, the boss rank writes the header once to obtain the
      // payload offset, the offset is broadcast from rank 0, the state is
      // written, and the boss rank rewrites the header with the checksum.
      static inline void writeRNGState(GridSerialRNG &serial,GridParallelRNG &parallel,std::string file)
      {
 	typedef typename GridParallelRNG::RngStateType RngStateType;
 	// Following should become arguments
 	FieldMetaData header;
 	header.sequence_number = 1;
 	header.ensemble_id     = "UKQCD";
 	header.ensemble_label  = "DWF";
 	GridBase *grid = parallel._grid;
 	GridMetaData(grid,header);
 	assert(header.nd==4);
 	header.link_trace=0.0;
 	header.plaquette=0.0;
 	MachineCharacteristics(header);
 	// Only assigned on the boss rank below; other ranks receive it via Broadcast
 	uint64_t offset;
 #ifdef RNG_RANLUX
 	header.floating_point = std::string("UINT64");
 	header.data_type      = std::string("RANLUX48");
 #endif
 #ifdef RNG_MT19937
 	header.floating_point = std::string("UINT32");
 	header.data_type      = std::string("MT19937");
 #endif
 #ifdef RNG_SITMO
 	header.floating_point = std::string("UINT64");
 	header.data_type      = std::string("SITMO");
 #endif
 	if ( grid->IsBoss() ) { 
 	  truncate(file);
 	  offset = writeHeader(header,file);
 	}
 	grid->Broadcast(0,(void *)&offset,sizeof(offset));
 	uint32_t nersc_csum,scidac_csuma,scidac_csumb;
 	BinaryIO::writeRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb);
 	header.checksum = nersc_csum;
 	// Second header pass: same text layout, now carrying the real checksum
 	if ( grid->IsBoss() ) { 
 	  offset = writeHeader(header,file);
 	}
 	std::cout<<GridLogMessage 
 		 <<"Written NERSC RNG STATE "<<file<< " checksum "
 		 <<std::hex<<header.checksum
 		 <<std::dec<<std::endl;
      }
      // Restore the serial and parallel random number generators from `file`.
      //
      // The header is parsed into `header`; its FLOATING_POINT and DATATYPE
      // strings must name the generator selected at compile time
      // (RNG_RANLUX, RNG_MT19937 or RNG_SITMO).  After the state is read, the
      // NERSC checksum of the data is compared with the header checksum; a
      // mismatch is reported on stderr and terminates the program with a
      // non-zero exit status.
      static inline void readRNGState(GridSerialRNG &serial,GridParallelRNG & parallel,FieldMetaData& header,std::string file)
      {
	GridBase *grid = parallel._grid;

	uint64_t offset = readHeader(file,grid,header);

	std::string format(header.floating_point);
	std::string data_type(header.data_type);

#ifdef RNG_RANLUX
	assert(format == std::string("UINT64"));
	assert(data_type == std::string("RANLUX48"));
#endif
#ifdef RNG_MT19937
	assert(format == std::string("UINT32"));
	assert(data_type == std::string("MT19937"));
#endif
#ifdef RNG_SITMO
	assert(format == std::string("UINT64"));
	assert(data_type == std::string("SITMO"));
#endif

	// Zero initialised so the comparison below never reads an
	// indeterminate value.
	uint32_t nersc_csum=0,scidac_csuma=0,scidac_csumb=0;
	BinaryIO::readRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb);

	if ( nersc_csum != header.checksum ) { 
	  std::cerr << "checksum mismatch "<<std::hex<< nersc_csum <<" "<<header.checksum<<std::dec<<std::endl;
	  exit(1); // corrupt data is a failure: do not report success to the caller's shell
	}
	assert(nersc_csum == header.checksum );

	std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl;
      }
    };
  }}
 #endif
@@ -40,7 +40,7 @@ const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::Performan
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES          ,  "CPUCYCLES.........." , INSTRUCTIONS},
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS        ,  "INSTRUCTIONS......." , CPUCYCLES   },
    // 4
-#ifdef AVX512
+#ifdef KNL
    { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES    },
    { PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS  },
    { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS    },
@@ -205,13 +205,14 @@ public:
  void Stop(void) {
    count=0;
    cycles=0;
    size_t ign;
 #ifdef __linux__
    ssize_t ign;
    if ( fd!= -1) {
      ::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
      ::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0);
      ign=::read(fd, &count, sizeof(long long));
-      ign=::read(cyclefd, &cycles, sizeof(long long));
+      ign+=::read(cyclefd, &cycles, sizeof(long long));
      assert(ign=2*sizeof(long long));
    }
    elapsed = cyclecount() - begin;
 #else
@@ -49,7 +49,8 @@ inline double usecond(void) {
 typedef  std::chrono::system_clock          GridClock;
 typedef  std::chrono::time_point<GridClock> GridTimePoint;
-typedef  std::chrono::milliseconds          GridTime;
+typedef  std::chrono::milliseconds          GridMillisecs;
 typedef  std::chrono::microseconds          GridTime;
 typedef  std::chrono::microseconds          GridUsecs;
 inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)
@@ -57,6 +58,11 @@ inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milli
  stream << time.count()<<" ms";
  return stream;
 }
 // Stream a microsecond duration as "<tick count> usec" and hand the stream back for chaining.
 inline std::ostream& operator<< (std::ostream & stream, const std::chrono::microseconds & time)
 {
  return stream << time.count() << " usec";
 }
 class GridStopWatch {
 private:
@@ -96,6 +102,9 @@ public:
    assert(running == false);
    return (uint64_t) accumulator.count();
  }
  // Returns the current value of the stopwatch's `running` flag.
  bool isRunning(void){
    return running;
  }
 };
 }
@@ -1,7 +1,7 @@
 /**
- * pugixml parser - version 1.6
+ * pugixml parser - version 1.9
 * --------------------------------------------------------
- * Copyright (C) 2006-2015, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Copyright (C) 2006-2018, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
 * Report bugs and download new versions at http://pugixml.org/
 *
 * This library is distributed under the MIT License. See notice at the end
@@ -17,6 +17,9 @@
 // Uncomment this to enable wchar_t mode
 // #define PUGIXML_WCHAR_MODE
 // Uncomment this to enable compact mode
 // #define PUGIXML_COMPACT
 // Uncomment this to disable XPath
 // #define PUGIXML_NO_XPATH
@@ -46,7 +49,7 @@
 #endif
 /**
- * Copyright (c) 2006-2015 Arseny Kapoulkine
+ * Copyright (c) 2006-2018 Arseny Kapoulkine
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
@@ -59,7 +62,7 @@
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
- * 
+ *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
@@ -1,7 +1,7 @@
 /**
- * pugixml parser - version 1.6
+ * pugixml parser - version 1.9
 * --------------------------------------------------------
- * Copyright (C) 2006-2015, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Copyright (C) 2006-2018, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
 * Report bugs and download new versions at http://pugixml.org/
 *
 * This library is distributed under the MIT License. See notice at the end
@@ -13,7 +13,7 @@
 #ifndef PUGIXML_VERSION
 // Define version macro; evaluates to major * 100 + minor so that it's safe to use in less-than comparisons
-#	define PUGIXML_VERSION 160
+#	define PUGIXML_VERSION 190
 #endif
 // Include user configuration file (this can define various configuration macros)
@@ -72,6 +72,44 @@
 #	endif
 #endif
 // If the platform is known to have move semantics support, compile move ctor/operator implementation
 #ifndef PUGIXML_HAS_MOVE
 #	if __cplusplus >= 201103
 #		define PUGIXML_HAS_MOVE
 #	elif defined(_MSC_VER) && _MSC_VER >= 1600
 #		define PUGIXML_HAS_MOVE
 #	endif
 #endif
 // If C++ is 2011 or higher, add 'noexcept' specifiers
 #ifndef PUGIXML_NOEXCEPT
 #	if __cplusplus >= 201103
 #		define PUGIXML_NOEXCEPT noexcept
 #	elif defined(_MSC_VER) && _MSC_VER >= 1900
 #		define PUGIXML_NOEXCEPT noexcept
 #	else
 #		define PUGIXML_NOEXCEPT
 #	endif
 #endif
 // Some functions can not be noexcept in compact mode
 #ifdef PUGIXML_COMPACT
 #	define PUGIXML_NOEXCEPT_IF_NOT_COMPACT
 #else
 #	define PUGIXML_NOEXCEPT_IF_NOT_COMPACT PUGIXML_NOEXCEPT
 #endif
 // If C++ is 2011 or higher, add 'override' qualifiers
 #ifndef PUGIXML_OVERRIDE
 #	if __cplusplus >= 201103
 #		define PUGIXML_OVERRIDE override
 #	elif defined(_MSC_VER) && _MSC_VER >= 1700
 #		define PUGIXML_OVERRIDE override
 #	else
 #		define PUGIXML_OVERRIDE
 #	endif
 #endif
 // Character interface macros
 #ifdef PUGIXML_WCHAR_MODE
 #	define PUGIXML_TEXT(t) L ## t
@@ -133,13 +171,13 @@ namespace pugi
 	// This flag determines if EOL characters are normalized (converted to #xA) during parsing. This flag is on by default.
 	const unsigned int parse_eol = 0x0020;
-	
+
 	// This flag determines if attribute values are normalized using CDATA normalization rules during parsing. This flag is on by default.
 	const unsigned int parse_wconv_attribute = 0x0040;
 	// This flag determines if attribute values are normalized using NMTOKENS normalization rules during parsing. This flag is off by default.
 	const unsigned int parse_wnorm_attribute = 0x0080;
-	
+
 	// This flag determines if document declaration (node_declaration) is added to the DOM tree. This flag is off by default.
 	const unsigned int parse_declaration = 0x0100;
@@ -158,6 +196,11 @@ namespace pugi
 	// is a valid document. This flag is off by default.
 	const unsigned int parse_fragment = 0x1000;
 	// This flag determines if plain character data is stored in the parent element's value. This significantly changes the structure of
 	// the document; this flag is only recommended for parsing documents with many PCDATA nodes in memory-constrained environments.
 	// This flag is off by default.
 	const unsigned int parse_embed_pcdata = 0x2000;
 	// The default parsing mode.
 	// Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded,
 	// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
@@ -184,16 +227,16 @@ namespace pugi
 	};
 	// Formatting flags
-	
+
 	// Indent the nodes that are written to output stream with as many indentation strings as deep the node is in DOM tree. This flag is on by default.
 	const unsigned int format_indent = 0x01;
-	
+
 	// Write encoding-specific BOM to the output stream. This flag is off by default.
 	const unsigned int format_write_bom = 0x02;
 	// Use raw output mode (no indentation and no line breaks are written). This flag is off by default.
 	const unsigned int format_raw = 0x04;
-	
+
 	// Omit default XML declaration even if there is no declaration in the document. This flag is off by default.
 	const unsigned int format_no_declaration = 0x08;
@@ -206,6 +249,9 @@ namespace pugi
 	// Write every attribute on a new line with appropriate indentation. This flag is off by default.
 	const unsigned int format_indent_attributes = 0x40;
 	// Don't output empty element tags, instead writing an explicit start and end tag even if there are no children. This flag is off by default.
 	const unsigned int format_no_empty_element_tags = 0x80;
 	// The default set of formatting flags.
 	// Nodes are indented depending on their depth in DOM tree, a default declaration is output if document has none.
 	const unsigned int format_default = format_indent;
@@ -225,7 +271,7 @@ namespace pugi
 	class xml_node;
 	class xml_text;
-	
+
 	#ifndef PUGIXML_NO_XPATH
 	class xpath_node;
 	class xpath_node_set;
@@ -268,7 +314,7 @@ namespace pugi
 		// Construct writer from a FILE* object; void* is used to avoid header dependencies on stdio
 		xml_writer_file(void* file);
-		virtual void write(const void* data, size_t size);
+		virtual void write(const void* data, size_t size) PUGIXML_OVERRIDE;
 	private:
 		void* file;
@@ -283,7 +329,7 @@ namespace pugi
 		xml_writer_stream(std::basic_ostream<char, std::char_traits<char> >& stream);
 		xml_writer_stream(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream);
-		virtual void write(const void* data, size_t size);
+		virtual void write(const void* data, size_t size) PUGIXML_OVERRIDE;
 	private:
 		std::basic_ostream<char, std::char_traits<char> >* narrow_stream;
@@ -299,13 +345,13 @@ namespace pugi
 	private:
 		xml_attribute_struct* _attr;
-	
+
 		typedef void (*unspecified_bool_type)(xml_attribute***);
 	public:
 		// Default constructor. Constructs an empty attribute.
 		xml_attribute();
-		
+
 		// Constructs attribute from internal pointer
 		explicit xml_attribute(xml_attribute_struct* attr);
@@ -354,6 +400,8 @@ namespace pugi
 		// Set attribute value with type conversion (numbers are converted to strings, boolean is converted to "true"/"false")
 		bool set_value(int rhs);
 		bool set_value(unsigned int rhs);
 		bool set_value(long rhs);
 		bool set_value(unsigned long rhs);
 		bool set_value(double rhs);
 		bool set_value(float rhs);
 		bool set_value(bool rhs);
@@ -367,6 +415,8 @@ namespace pugi
 		xml_attribute& operator=(const char_t* rhs);
 		xml_attribute& operator=(int rhs);
 		xml_attribute& operator=(unsigned int rhs);
 		xml_attribute& operator=(long rhs);
 		xml_attribute& operator=(unsigned long rhs);
 		xml_attribute& operator=(double rhs);
 		xml_attribute& operator=(float rhs);
 		xml_attribute& operator=(bool rhs);
@@ -417,7 +467,7 @@ namespace pugi
 		// Borland C++ workaround
 		bool operator!() const;
-	
+
 		// Comparison operators (compares wrapped node pointers)
 		bool operator==(const xml_node& r) const;
 		bool operator!=(const xml_node& r) const;
@@ -438,7 +488,7 @@ namespace pugi
 		// Get node value, or "" if node is empty or it has no value
 		// Note: For <node>text</node> node.value() does not return "text"! Use child_value() or text() methods to access text inside nodes.
 		const char_t* value() const;
-	
+
 		// Get attribute list
 		xml_attribute first_attribute() const;
 		xml_attribute last_attribute() const;
@@ -450,7 +500,7 @@ namespace pugi
 		// Get next/previous sibling in the children list of the parent node
 		xml_node next_sibling() const;
 		xml_node previous_sibling() const;
-		
+
 		// Get parent node
 		xml_node parent() const;
@@ -478,7 +528,7 @@ namespace pugi
 		// Set node name/value (returns false if node is empty, there is not enough memory, or node can not have name/value)
 		bool set_name(const char_t* rhs);
 		bool set_value(const char_t* rhs);
-		
+
 		// Add attribute with specified name. Returns added attribute, or empty attribute on errors.
 		xml_attribute append_attribute(const char_t* name);
 		xml_attribute prepend_attribute(const char_t* name);
@@ -532,11 +582,11 @@ namespace pugi
 		template <typename Predicate> xml_attribute find_attribute(Predicate pred) const
 		{
 			if (!_root) return xml_attribute();
-			
+
 			for (xml_attribute attrib = first_attribute(); attrib; attrib = attrib.next_attribute())
 				if (pred(attrib))
 					return attrib;
-		
+
 			return xml_attribute();
 		}
@@ -544,11 +594,11 @@ namespace pugi
 		template <typename Predicate> xml_node find_child(Predicate pred) const
 		{
 			if (!_root) return xml_node();
-	
+
 			for (xml_node node = first_child(); node; node = node.next_sibling())
 				if (pred(node))
 					return node;
-		
+
 			return xml_node();
 		}
@@ -558,7 +608,7 @@ namespace pugi
 			if (!_root) return xml_node();
 			xml_node cur = first_child();
-			
+
 			while (cur._root && cur._root != _root)
 			{
 				if (pred(cur)) return cur;
@@ -590,7 +640,7 @@ namespace pugi
 		// Recursively traverse subtree with xml_tree_walker
 		bool traverse(xml_tree_walker& walker);
-	
+
 	#ifndef PUGIXML_NO_XPATH
 		// Select single node by evaluating XPath query. Returns first node from the resulting node set.
 		xpath_node select_node(const char_t* query, xpath_variable_set* variables = 0) const;
@@ -601,11 +651,11 @@ namespace pugi
 		xpath_node_set select_nodes(const xpath_query& query) const;
 		// (deprecated: use select_node instead) Select single node by evaluating XPath query.
-		xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = 0) const;
+		PUGIXML_DEPRECATED xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = 0) const;
-		xpath_node select_single_node(const xpath_query& query) const;
+		PUGIXML_DEPRECATED xpath_node select_single_node(const xpath_query& query) const;
 	#endif
-		
+
 		// Print subtree using a writer object
 		void print(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const;
@@ -701,6 +751,8 @@ namespace pugi
 		// Set text with type conversion (numbers are converted to strings, boolean is converted to "true"/"false")
 		bool set(int rhs);
 		bool set(unsigned int rhs);
 		bool set(long rhs);
 		bool set(unsigned long rhs);
 		bool set(double rhs);
 		bool set(float rhs);
 		bool set(bool rhs);
@@ -714,6 +766,8 @@ namespace pugi
 		xml_text& operator=(const char_t* rhs);
 		xml_text& operator=(int rhs);
 		xml_text& operator=(unsigned int rhs);
 		xml_text& operator=(long rhs);
 		xml_text& operator=(unsigned long rhs);
 		xml_text& operator=(double rhs);
 		xml_text& operator=(float rhs);
 		xml_text& operator=(bool rhs);
@@ -867,11 +921,11 @@ namespace pugi
 	private:
 		int _depth;
-	
+
 	protected:
 		// Get current traversal depth
 		int depth() const;
-	
+
 	public:
 		xml_tree_walker();
 		virtual ~xml_tree_walker();
@@ -942,13 +996,14 @@ namespace pugi
 		char_t* _buffer;
 		char _memory[192];
-		
+
 		// Non-copyable semantics
 		xml_document(const xml_document&);
-		const xml_document& operator=(const xml_document&);
+		xml_document& operator=(const xml_document&);
-		void create();
+		void _create();
-		void destroy();
+		void _destroy();
 		void _move(xml_document& rhs) PUGIXML_NOEXCEPT_IF_NOT_COMPACT;
 	public:
 		// Default constructor, makes empty document
@@ -957,6 +1012,12 @@ namespace pugi
 		// Destructor, invalidates all node/attribute handles to this document
 		~xml_document();
 	#ifdef PUGIXML_HAS_MOVE
 		// Move semantics support
 		xml_document(xml_document&& rhs) PUGIXML_NOEXCEPT_IF_NOT_COMPACT;
 		xml_document& operator=(xml_document&& rhs) PUGIXML_NOEXCEPT_IF_NOT_COMPACT;
 	#endif
 		// Removes all nodes, leaving the empty document
 		void reset();
@@ -970,7 +1031,7 @@ namespace pugi
 	#endif
 		// (deprecated: use load_string instead) Load document from zero-terminated string. No encoding conversions are applied.
-		xml_parse_result load(const char_t* contents, unsigned int options = parse_default);
+		PUGIXML_DEPRECATED xml_parse_result load(const char_t* contents, unsigned int options = parse_default);
 		// Load document from zero-terminated string. No encoding conversions are applied.
 		xml_parse_result load_string(const char_t* contents, unsigned int options = parse_default);
@@ -1051,7 +1112,7 @@ namespace pugi
 		// Non-copyable semantics
 		xpath_variable(const xpath_variable&);
 		xpath_variable& operator=(const xpath_variable&);
-		
+
 	public:
 		// Get variable name
 		const char_t* name() const;
@@ -1095,10 +1156,10 @@ namespace pugi
 		xpath_variable_set(const xpath_variable_set& rhs);
 		xpath_variable_set& operator=(const xpath_variable_set& rhs);
-	#if __cplusplus >= 201103
+	#ifdef PUGIXML_HAS_MOVE
 		// Move semantics support
-		xpath_variable_set(xpath_variable_set&& rhs);
+		xpath_variable_set(xpath_variable_set&& rhs) PUGIXML_NOEXCEPT;
-		xpath_variable_set& operator=(xpath_variable_set&& rhs);
+		xpath_variable_set& operator=(xpath_variable_set&& rhs) PUGIXML_NOEXCEPT;
 	#endif
 		// Add a new variable or get the existing one, if the types match
@@ -1139,29 +1200,29 @@ namespace pugi
 		// Destructor
 		~xpath_query();
-	#if __cplusplus >= 201103
+	#ifdef PUGIXML_HAS_MOVE
 		// Move semantics support
-		xpath_query(xpath_query&& rhs);
+		xpath_query(xpath_query&& rhs) PUGIXML_NOEXCEPT;
-		xpath_query& operator=(xpath_query&& rhs);
+		xpath_query& operator=(xpath_query&& rhs) PUGIXML_NOEXCEPT;
 	#endif
 		// Get query expression return type
 		xpath_value_type return_type() const;
-		
+
 		// Evaluate expression as boolean value in the specified context; performs type conversion if necessary.
 		// If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
 		bool evaluate_boolean(const xpath_node& n) const;
-		
+
 		// Evaluate expression as double value in the specified context; performs type conversion if necessary.
 		// If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
 		double evaluate_number(const xpath_node& n) const;
-		
+
 	#ifndef PUGIXML_NO_STL
 		// Evaluate expression as string value in the specified context; performs type conversion if necessary.
 		// If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
 		string_t evaluate_string(const xpath_node& n) const;
 	#endif
-		
+
 		// Evaluate expression as string value in the specified context; performs type conversion if necessary.
 		// At most capacity characters are written to the destination buffer, full result size is returned (includes terminating zero).
 		// If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
@@ -1188,7 +1249,7 @@ namespace pugi
 		// Borland C++ workaround
 		bool operator!() const;
 	};
-	
+
 	#ifndef PUGIXML_NO_EXCEPTIONS
 	// XPath exception class
 	class PUGIXML_CLASS xpath_exception: public std::exception
@@ -1201,26 +1262,26 @@ namespace pugi
 		explicit xpath_exception(const xpath_parse_result& result);
 		// Get error message
-		virtual const char* what() const throw();
+		virtual const char* what() const throw() PUGIXML_OVERRIDE;
 		// Get parse result
 		const xpath_parse_result& result() const;
 	};
 	#endif
-	
+
 	// XPath node class (either xml_node or xml_attribute)
 	class PUGIXML_CLASS xpath_node
 	{
 	private:
 		xml_node _node;
 		xml_attribute _attribute;
-	
+
 		typedef void (*unspecified_bool_type)(xpath_node***);
 	public:
 		// Default constructor; constructs empty XPath node
 		xpath_node();
-		
+
 		// Construct XPath node from XML node/attribute
 		xpath_node(const xml_node& node);
 		xpath_node(const xml_attribute& attribute, const xml_node& parent);
@@ -1228,13 +1289,13 @@ namespace pugi
 		// Get node/attribute, if any
 		xml_node node() const;
 		xml_attribute attribute() const;
-		
+
 		// Get parent of contained node/attribute
 		xml_node parent() const;
 		// Safe bool conversion operator
 		operator unspecified_bool_type() const;
-		
+
 		// Borland C++ workaround
 		bool operator!() const;
@@ -1260,13 +1321,13 @@ namespace pugi
 			type_sorted,			// Sorted by document order (ascending)
 			type_sorted_reverse		// Sorted by document order (descending)
 		};
-		
+
 		// Constant iterator type
 		typedef const xpath_node* const_iterator;
 		// We define non-constant iterator to be the same as constant iterator so that various generic algorithms (i.e. boost foreach) work
 		typedef const xpath_node* iterator;
-	
+
 		// Default constructor. Constructs empty set.
 		xpath_node_set();
@@ -1275,49 +1336,49 @@ namespace pugi
 		// Destructor
 		~xpath_node_set();
-		
+
 		// Copy constructor/assignment operator
 		xpath_node_set(const xpath_node_set& ns);
 		xpath_node_set& operator=(const xpath_node_set& ns);
-	#if __cplusplus >= 201103
+	#ifdef PUGIXML_HAS_MOVE
 		// Move semantics support
-		xpath_node_set(xpath_node_set&& rhs);
+		xpath_node_set(xpath_node_set&& rhs) PUGIXML_NOEXCEPT;
-		xpath_node_set& operator=(xpath_node_set&& rhs);
+		xpath_node_set& operator=(xpath_node_set&& rhs) PUGIXML_NOEXCEPT;
 	#endif
 		// Get collection type
 		type_t type() const;
-		
+
 		// Get collection size
 		size_t size() const;
 		// Indexing operator
 		const xpath_node& operator[](size_t index) const;
-		
+
 		// Collection iterators
 		const_iterator begin() const;
 		const_iterator end() const;
 		// Sort the collection in ascending/descending order by document order
 		void sort(bool reverse = false);
-		
+
 		// Get first node in the collection by document order
 		xpath_node first() const;
-		
+
 		// Check if collection is empty
 		bool empty() const;
-	
+
 	private:
 		type_t _type;
-		
+
 		xpath_node _storage;
-		
+
 		xpath_node* _begin;
 		xpath_node* _end;
 		void _assign(const_iterator begin, const_iterator end, type_t type);
-		void _move(xpath_node_set& rhs);
+		void _move(xpath_node_set& rhs) PUGIXML_NOEXCEPT;
 	};
 #endif
@@ -1325,7 +1386,7 @@ namespace pugi
 	// Convert wide string to UTF8
 	std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const wchar_t* str);
 	std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >& str);
-	
+
 	// Convert UTF8 to wide string
 	std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const char* str);
 	std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const std::basic_string<char, std::char_traits<char>, std::allocator<char> >& str);
@@ -1333,13 +1394,13 @@ namespace pugi
 	// Memory allocation function interface; returns pointer to allocated memory or NULL on failure
 	typedef void* (*allocation_function)(size_t size);
-	
+
 	// Memory deallocation function interface
 	typedef void (*deallocation_function)(void* ptr);
 	// Override default memory management functions. All subsequent allocations/deallocations will be performed via supplied functions.
 	void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate);
-	
+
 	// Get current memory management functions
 	allocation_function PUGIXML_FUNCTION get_memory_allocation_function();
 	deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function();
@@ -1375,7 +1436,7 @@ namespace std
 #endif
 /**
- * Copyright (c) 2006-2015 Arseny Kapoulkine
+ * Copyright (c) 2006-2018 Arseny Kapoulkine
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
@@ -1388,7 +1449,7 @@ namespace std
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
- * 
+ *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
@@ -1,6 +1,6 @@
-pugixml 1.6 - an XML processing library
+pugixml 1.9 - an XML processing library
-Copyright (C) 2006-2015, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+Copyright (C) 2006-2018, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
 Report bugs and download new versions at http://pugixml.org/
 This is the distribution of pugixml, which is a C++ XML processing library,
@@ -28,7 +28,7 @@ The distribution contains the following folders:
 This library is distributed under the MIT License:
-Copyright (c) 2006-2015 Arseny Kapoulkine
+Copyright (c) 2006-2018 Arseny Kapoulkine
 Permission is hereby granted, free of charge, to any person
 obtaining a copy of this software and associated documentation
@@ -0,0 +1,124 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/QCD.h
 Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: neo <cossu@post.kek.jp>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_LT_H
 #define GRID_LT_H
 namespace Grid{
 // First steps in the complete generalization of the Physics part
 // Design not final
 namespace LatticeTheories {
 // Base traits of a lattice theory, parameterised on the number of dimensions.
 template <int Dimensions>
 struct LatticeTheory {
  // Dimension count, and twice that count for a double stored field.
  static const int Nd  = Dimensions;
  static const int Nds = 2 * Dimensions;

  // Tensor that is an iScalar on each of its three nesting levels.
  template <typename vtype>
  using iSinglet = iScalar< iScalar< iScalar<vtype> > >;
 };
 // Lattice theory with a gauge group of `Colours` colours; adds the colour
 // tensor aliases on top of LatticeTheory<Dimensions>.
 template <int Dimensions, int Colours>
 struct LatticeGaugeTheory : public LatticeTheory<Dimensions> {
  // Sizes, redeclared locally alongside the new colour count.
  static const int Nd  = Dimensions;
  static const int Nds = 2 * Dimensions;
  static const int Nc  = Colours;

  // Nc x Nc matrix on the innermost level, iScalar on the two outer levels.
  template <typename vtype>
  using iColourMatrix = iScalar< iScalar< iMatrix<vtype, Nc> > >;

  // Nd-component outer vector of Nc x Nc matrices.
  template <typename vtype>
  using iLorentzColourMatrix = iVector< iScalar< iMatrix<vtype, Nc> >, Nd>;

  // Same layout with Nds (= 2 * Nd) outer components.
  template <typename vtype>
  using iDoubleStoredColourMatrix = iVector< iScalar< iMatrix<vtype, Nc> >, Nds>;

  // Nc-component vector on the innermost level.
  template <typename vtype>
  using iColourVector = iScalar< iScalar< iVector<vtype, Nc> > >;
 };
 // Gauge theory with fermions: extends LatticeGaugeTheory with a spin
 // component count and the spin / spin-colour tensor aliases built from it.
 template <int Dimensions, int Colours, int Spin>
 struct FermionicLatticeGaugeTheory
    : public LatticeGaugeTheory<Dimensions, Colours> {
  // Sizes, redeclared locally alongside the new spin count.
  static const int Nd  = Dimensions;
  static const int Nds = 2 * Dimensions;
  static const int Nc  = Colours;
  static const int Ns  = Spin;
  // Half the spin components; only meaningful if Spin is a multiple of 2.
  static const int Nhs = Spin / 2;

  // Full-spin tensors.
  template <typename vtype>
  using iSpinMatrix = iScalar< iMatrix< iScalar<vtype>, Ns> >;

  template <typename vtype>
  using iSpinColourMatrix = iScalar< iMatrix< iMatrix<vtype, Nc>, Ns> >;

  template <typename vtype>
  using iSpinVector = iScalar< iVector< iScalar<vtype>, Ns> >;

  template <typename vtype>
  using iSpinColourVector = iScalar< iVector< iVector<vtype, Nc>, Ns> >;

  // Half-spin tensors (these rely on Nhs, see above).
  template <typename vtype>
  using iHalfSpinVector = iScalar< iVector< iScalar<vtype>, Nhs> >;

  template <typename vtype>
  using iHalfSpinColourVector = iScalar< iVector< iVector<vtype, Nc>, Nhs> >;

  //tests
  using ColourMatrix  = iColourMatrix<Complex>;
  using ColourMatrixF = iColourMatrix<ComplexF>;
  using ColourMatrixD = iColourMatrix<ComplexD>;
 };
 // Examples, not complete now.
 // Example theory: 4 dimensions, 3 colours, 4 spin components.
 struct QCD : public FermionicLatticeGaugeTheory<4, 3, 4> {
    // Direction indices 0..3 carry the "p" suffix.
    static const int Xp = 0;
    static const int Yp = 1;
    static const int Zp = 2;
    static const int Tp = 3;

    // Direction indices 4..7 carry the "m" suffix.
    static const int Xm = 4;
    static const int Ym = 5;
    static const int Zm = 6;
    static const int Tm = 7;

    // Short name for the base class, then the concrete spin-matrix types.
    using FLGT = FermionicLatticeGaugeTheory;

    using SpinMatrix  = FLGT::iSpinMatrix<Complex  >;
    using SpinMatrixF = FLGT::iSpinMatrix<ComplexF >;
    using SpinMatrixD = FLGT::iSpinMatrix<ComplexD >;
 };
 // Placeholder theory: FermionicLatticeGaugeTheory with 4 dimensions, 1 colour and
 // 4 spin components. It adds no members of its own yet.
 struct QED : public FermionicLatticeGaugeTheory<4, 1, 4> {//fill
 };
 // Theory type deriving from LatticeTheory<Dimensions> without adding any members.
 template <int Dimensions>
 struct Scalar : public LatticeTheory<Dimensions> {};
 };  // LatticeTheories
 } // Grid
 #endif
@@ -32,10 +32,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_QCD_BASE_H
 #define GRID_QCD_BASE_H
 namespace Grid{
 namespace QCD {
    static const int Xdir = 0;
    static const int Ydir = 1;
    static const int Zdir = 2;
    static const int Tdir = 3;
    static const int Xp = 0;
    static const int Yp = 1;
    static const int Zp = 2;
@@ -354,36 +358,36 @@ namespace QCD {
    //////////////////////////////////////////////
    template<class vobj> 
      void pokeColour(Lattice<vobj> &lhs,
-		      const Lattice<decltype(peekIndex<ColourIndex>(lhs._odata[0],0))> & rhs,
+              const Lattice<decltype(peekIndex<ColourIndex>(lhs._odata[0],0))> & rhs,
-		      int i)
+              int i)
    {
      PokeIndex<ColourIndex>(lhs,rhs,i);
    }
    template<class vobj> 
      void pokeColour(Lattice<vobj> &lhs,
-		      const Lattice<decltype(peekIndex<ColourIndex>(lhs._odata[0],0,0))> & rhs,
+              const Lattice<decltype(peekIndex<ColourIndex>(lhs._odata[0],0,0))> & rhs,
-		      int i,int j)
+              int i,int j)
    {
      PokeIndex<ColourIndex>(lhs,rhs,i,j);
    }
    template<class vobj> 
      void pokeSpin(Lattice<vobj> &lhs,
-		      const Lattice<decltype(peekIndex<SpinIndex>(lhs._odata[0],0))> & rhs,
+              const Lattice<decltype(peekIndex<SpinIndex>(lhs._odata[0],0))> & rhs,
-		      int i)
+              int i)
    {
      PokeIndex<SpinIndex>(lhs,rhs,i);
    }
    template<class vobj> 
      void pokeSpin(Lattice<vobj> &lhs,
-		      const Lattice<decltype(peekIndex<SpinIndex>(lhs._odata[0],0,0))> & rhs,
+              const Lattice<decltype(peekIndex<SpinIndex>(lhs._odata[0],0,0))> & rhs,
-		      int i,int j)
+              int i,int j)
    {
      PokeIndex<SpinIndex>(lhs,rhs,i,j);
    }
    template<class vobj> 
      void pokeLorentz(Lattice<vobj> &lhs,
-		      const Lattice<decltype(peekIndex<LorentzIndex>(lhs._odata[0],0))> & rhs,
+              const Lattice<decltype(peekIndex<LorentzIndex>(lhs._odata[0],0))> & rhs,
-		      int i)
+              int i)
    {
      PokeIndex<LorentzIndex>(lhs,rhs,i);
    }
@@ -417,15 +421,16 @@ namespace QCD {
    //////////////////////////////////////////////
    // Fermion <-> propagator assignments
    //////////////////////////////////////////////
-    template <class Prop, class Ferm>
+    //template <class Prop, class Ferm>
-    void FermToProp(Prop &p, const Ferm &f, const int s, const int c)
+    template <class Fimpl>
      void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
    {
-        for(int j = 0; j < Ns; ++j)
+      for(int j = 0; j < Ns; ++j)
        {
            auto pjs = peekSpin(p, j, s);
            auto fj  = peekSpin(f, j);
-            for(int i = 0; i < Nc; ++i)
+            for(int i = 0; i < Fimpl::Dimension; ++i)
            {
                pokeColour(pjs, peekColour(fj, i), i, c);
            }
@@ -433,15 +438,16 @@ namespace QCD {
        }
    }
-    template <class Prop, class Ferm>
+    //template <class Prop, class Ferm>
-    void PropToFerm(Ferm &f, const Prop &p, const int s, const int c)
+    template <class Fimpl>
      void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
    {
        for(int j = 0; j < Ns; ++j)
        {
            auto pjs = peekSpin(p, j, s);
            auto fj  = peekSpin(f, j);
-            for(int i = 0; i < Nc; ++i)
+            for(int i = 0; i < Fimpl::Dimension; ++i)
            {
                pokeColour(fj, peekColour(pjs, i, c), i);
            }
@@ -489,6 +495,14 @@ namespace QCD {
      return traceIndex<ColourIndex>(lhs);
    }
    //////////////////////////////////////////
    // Current types
    //////////////////////////////////////////
    GRID_SERIALIZABLE_ENUM(Current, undef,
                           Vector,  0,
                           Axial,   1,
                           Tadpole, 2);
 }   //namespace QCD
 } // Grid
--- a/Show More
+++ b/Show More