From a7772c827b948a7c75fe20080f77ed02b764b96a Mon Sep 17 00:00:00 2001
From: Michael Marshall <43034299+mmphys@users.noreply.github.com>
Date: Thu, 12 Dec 2019 16:05:22 +0000
Subject: [PATCH 01/14] Documentation tweak

---
 documentation/GridXcode/readme.md | 49 +++++++++++++++----------------
 1 file changed, 24 insertions(+), 25 deletions(-)
diff --git a/documentation/GridXcode/readme.md b/documentation/GridXcode/readme.md
index 8cbf9112..e5aab405 100644
--- a/documentation/GridXcode/readme.md
+++ b/documentation/GridXcode/readme.md
@@ -39,8 +39,8 @@ These are the environment variables we will define for Grid:
 
 Variable | Typical Value | Use
 --- | --- | ---
-`Grid` | `/Users/user_id/src/Grid` | Path to grid source
-`GridPre` | `/Users/user_id/bin` | Path to install directory containing grid pre-requisites built from source
+`Grid` | `$HOME/src/Grid` | Path to grid source
+`GridPre` | `$HOME/.local` | Path to install directory containing grid pre-requisites built from source
 `GridPkg` | **MacPorts**=`/opt/local`, **Homebrew**=`/usr/local` | Path to package manager install directory
 
 Choose either of the following ways to do this, and when you're done, log out and in again. To check these have been set:
@@ -54,7 +54,7 @@ Choose either of the following ways to do this, and when you're done, log out an
 
 ```apple script
 do shell script "launchctl setenv Grid $HOME/src/Grid
-launchctl setenv GridPre $HOME/bin
+launchctl setenv GridPre $HOME/.local
 launchctl setenv GridPkg /opt/local"
 ```
 
@@ -82,7 +82,7 @@ Make the file `environment.plist` in `~/Library/LaunchAgents` with the following
 <string>sh</string>
 <string>-c</string>
 <string>launchctl setenv Grid $HOME/src/Grid
-launchctl setenv GridPre $HOME/bin
+launchctl setenv GridPre $HOME/.local
 launchctl setenv GridPkg /opt/local</string>
 
 </array>
@@ -94,16 +94,16 @@ launchctl setenv GridPkg /opt/local</string>
 
 ## 3. Install and build Open MPI -- ***optional***
 
-Download the latest version of [Open MPI][OMPI] version 3.1 (I used 3.1.3).
-
-NB: Grid does not have any dependencies on fortran, however many standard scientific packages do, so you may as well download GNU fortran (e.g. MacPorts ``gfortran`` package) and build Open MPI like so:
+Download the latest version of [Open MPI][OMPI] version 3.1 (I used 3.1.5) and build it like so:
 
 [OMPI]: https://www.open-mpi.org/software/ompi/v3.1/
 
-    ../configure CC=clang CXX=clang++ F77=gfortran FC=gfortran CXXFLAGS=-g --prefix=$GridPre/openmpi
+    ../configure CC=clang CXX=clang++ CXXFLAGS=-g --prefix=$GridPre
     make -j 4 all install
 
-(If you don't want to bother with fortran bindings, just don't include the F77 and FC flags)
+NB: Grid does not have any dependencies on fortran, however many standard scientific packages do, so you may wish to download GNU fortran (e.g. MacPorts ``gfortran`` package) and add the following to your configure invocation:
+
+    F77=gfortran FC=gfortran
 
 ## 4. Install and build Grid pre-requisites
 
@@ -121,17 +121,17 @@ These are the `portname`s for mandatory Grid libraries:
 
 * git-flow-avh
 * gmp
+* hdf5
 * mpfr
 
 and these are the `portname`s for optional Grid libraries:
 
 * fftw-3-single
-* hdf5
 * lapack
 * doxygen
 * OpenBLAS
 
-***Please update this list with any packages I've missed! ... and double-check whether OpenBLAS is really for Grid***
+***Please update this list with any packages I've missed! ... and double-check whether OpenBLAS is really for Grid. NB: lapack doesn't seem to work. Should it be scalapack?***
 
 ### 2. [Homebrew][Homebrew]
 
@@ -149,7 +149,7 @@ There isn't currently a port for [C-LIME][C-LIME], so download the source and th
 
 [C-LIME]: https://usqcd-software.github.io/c-lime/ "C-language API for Lattice QCD Interchange Message Encapsulation / Large Internet Message Encapsulation"
 
-    ../configure --prefix=$GridPre/lime CC=clang
+    ../configure CC=clang --prefix=$GridPre
     make -j 4 all install
 
 ## 5. Install, Configure and Build Grid
@@ -182,19 +182,19 @@ Below are shown the `configure` script invocations for three recommended configu
 
 This is the build for every day developing and debugging with Xcode. It uses the Xcode clang c++ compiler, without MPI, and defaults to double-precision. Xcode builds the `Debug` configuration with debug symbols for full debugging:
 
-    ../configure --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre/lime --enable-simd=GEN --enable-precision=double CXX=clang++ --prefix=$GridPre/GridDebug --enable-comms=none --enable-doxygen-doc
+    ../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=double --prefix=$GridPre/GridDebug --enable-comms=none
 
 #### 2. `Release`
 
 Since Grid itself doesn't really have debug configurations, the release build is recommended to be the same as `Debug`, except using single-precision (handy for validation):
 
-    ../configure --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre/lime --enable-simd=GEN --enable-precision=single CXX=clang++ --prefix=$GridPre/GridRelease --enable-comms=none --enable-doxygen-doc
+    ../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=single --prefix=$GridPre/GridRelease --enable-comms=none
 
 #### 3. `MPIDebug`
 
 Debug configuration with MPI:
 
-    ../configure --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre/lime --enable-simd=GEN --enable-precision=double CXX=clang++ --prefix=$GridPre/GridMPIDebug --enable-comms=mpi-auto MPICXX=$GridPre/openmpi/bin/mpicxx --enable-doxygen-doc
+    ../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=double --prefix=$GridPre/GridMPIDebug --enable-comms=mpi-auto MPICXX=$GridPre/mpicxx
 
 ### 5.3 Build Grid
 
@@ -250,9 +250,9 @@ Obtain a list of header locations required by Grid by running the following from
 
     ./grid-config --cxxflags
 
-Output should look similar to:
+Output should look similar to (but will likely include duplicates):
 
-    -I$GridPre/openmpi/include -I$GridPkg/include -I$GridPre/lime/include -I$GridPkg/include -I$GridPkg/include -I$GridPkg/include -O3 -g -std=c++11
+    -I$GridPre/include -I$GridPkg/include -O3 -g -std=c++11
 
 The header locations follow the `-I` switches. You can ignore the other switches, and you can ignore duplicate entries, which just mean that your package manager has installed multiple packages in the same location.
 
@@ -265,13 +265,12 @@ Set HEADER_SEARCH_PATHS to:
 
 followed by (***the order is important***) the locations reported by `grid-config --cxxflags`, ignoring duplicates, e.g.:
 
-    $GridPre/openmpi/include
+    $GridPre/include
     $GridPkg/include
-    $GridPre/lime/include
-
+    
 **Note: the easiest way to set this value is to put it all on one line, space separated, and edit the text to the right of `HEADER_SEARCH_PATHS`**, i.e.:
 
-    $Grid/build$(CONFIGURATION)/Grid $Grid $GridPre/openmpi/include $GridPkg/include $GridPre/lime/include
+    $Grid/build$(CONFIGURATION)/Grid $Grid $GridPre/include $GridPkg/include
 
 #### LIBRARY_SEARCH_PATHS
 
@@ -279,13 +278,13 @@ Obtain a list of library locations required by Grid by running the following fro
 
     ./grid-config --ldflags
 
-Output should look similar to:
+Output should look similar to (but will likely include duplicates):
 
-    -L$GridPre/openmpi/lib -L$GridPkg/lib -L$GridPre/lime/lib -L$GridPkg/lib -L$GridPkg/lib -L$GridPkg/lib
+    -L$GridPre/lib -L$GridPkg/lib
 
 Paste the output ***with `$Grid/build$(CONFIGURATION)/Grid $Grid/build$(CONFIGURATION)/Hadrons ` prepended*** into `LIBRARY_SEARCH_PATHS`:
 
-    $Grid/build$(CONFIGURATION)/Grid $Grid/build$(CONFIGURATION)/Hadrons $GridPre/openmpi/lib $GridPkg/lib $GridPre/lime/lib
+    $Grid/build$(CONFIGURATION)/Grid $Grid/build$(CONFIGURATION)/Hadrons $GridPre/lib $GridPkg/lib
 
 ### 2. Linking
 
@@ -369,7 +368,7 @@ Instead:
 
 3. From a terminal session, locate and run your executable using `mpirun` (*the mangled name of the project build products will not be exactly the same as this example*):
 
-    `$GridPre/openmpi/bin/mpirun -np 2 ~/Library/Developer/Xcode/DerivedData/HelloGrid-fiyyuveptaqelbbvllomcgjyvghr/Build/Products/Debug/HelloGrid --grid 4.4.4.8 --mpi 1.1.1.2`
+    `$GridPre/bin/mpirun -np 2 ~/Library/Developer/Xcode/DerivedData/HelloGrid-fiyyuveptaqelbbvllomcgjyvghr/Build/Products/Debug/HelloGrid --grid 4.4.4.8 --mpi 1.1.1.2`
 
     The Xcode debugger will attach to the first process.
 

From c7637a84adabe10217a5fa21cee0fcad40895d16 Mon Sep 17 00:00:00 2001
From: Michael Marshall <43034299+mmphys@users.noreply.github.com>
Date: Thu, 12 Dec 2019 17:00:03 +0000
Subject: [PATCH 02/14] Documentation tweak for peculiarities of OpenMPI
 --prefix

---
 documentation/GridXcode/readme.md | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/documentation/GridXcode/readme.md b/documentation/GridXcode/readme.md
index e5aab405..11deb6fa 100644
--- a/documentation/GridXcode/readme.md
+++ b/documentation/GridXcode/readme.md
@@ -98,10 +98,12 @@ Download the latest version of [Open MPI][OMPI] version 3.1 (I used 3.1.5) and b
 
 [OMPI]: https://www.open-mpi.org/software/ompi/v3.1/
 
-    ../configure CC=clang CXX=clang++ CXXFLAGS=-g --prefix=$GridPre
+    ../configure CC=clang CXX=clang++ CXXFLAGS=-g --prefix=$GridPre/bin
     make -j 4 all install
 
-NB: Grid does not have any dependencies on fortran, however many standard scientific packages do, so you may wish to download GNU fortran (e.g. MacPorts ``gfortran`` package) and add the following to your configure invocation:
+***Note the `/bin` at the end of the prefix - this is required. As a quirk of the OpenMPI installer, `--prefix` must point to the `bin` subdirectory, with other files installed in `$GridPre/include`, `$GridPre/lib`, `$GridPre/share`, etc.***
+
+Grid does not have any dependencies on fortran, however many standard scientific packages do, so you may wish to download GNU fortran (e.g. MacPorts ``gfortran`` package) and add the following to your configure invocation:
 
     F77=gfortran FC=gfortran
 
@@ -194,7 +196,7 @@ Since Grid itself doesn't really have debug configurations, the release build is
 
 Debug configuration with MPI:
 
-    ../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=double --prefix=$GridPre/GridMPIDebug --enable-comms=mpi-auto MPICXX=$GridPre/mpicxx
+    ../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=double --prefix=$GridPre/GridMPIDebug --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx
 
 ### 5.3 Build Grid
 

From 0ca1992151abd6410c0d5f603879ab0efab71430 Mon Sep 17 00:00:00 2001
From: Michael Marshall <43034299+mmphys@users.noreply.github.com>
Date: Fri, 20 Dec 2019 13:53:27 +0000
Subject: [PATCH 03/14] Remove warning in tensor layout comparison. Make
 default names and index names visible for PerambTensor and NoiseTensor

---
 Grid/serialisation/BaseIO.h | 2 +-
 Hadrons/NamedTensor.hpp     | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Grid/serialisation/BaseIO.h b/Grid/serialisation/BaseIO.h
index a9147c49..b858c835 100644
--- a/Grid/serialisation/BaseIO.h
+++ b/Grid/serialisation/BaseIO.h
@@ -318,7 +318,7 @@ namespace Grid {
       TotalDims[TensorRank + i] = Traits::Dimension(i);
 
     // If the Tensor isn't in Row-Major order, then we'll need to copy it's data
-    const bool CopyData{NumElements > 1 && ETensor::Layout != Eigen::StorageOptions::RowMajor};
+    const bool CopyData{NumElements > 1 && static_cast<int>( ETensor::Layout ) != static_cast<int>( Eigen::StorageOptions::RowMajor )};
     const Scalar * pWriteBuffer;
     std::vector<Scalar> CopyBuffer;
     const Index TotalNumElements = NumElements * Traits::count;
diff --git a/Hadrons/NamedTensor.hpp b/Hadrons/NamedTensor.hpp
index 954de2d8..003a92fc 100644
--- a/Hadrons/NamedTensor.hpp
+++ b/Hadrons/NamedTensor.hpp
@@ -159,9 +159,9 @@ using LapEvecs = Grid::Hadrons::EigenPack<LatticeColourVector>;
 
 class NoiseTensor : public NamedTensor<Complex, 4>
 {
+    public:
     static const std::string                Name__;
     static const std::array<std::string, 4> DefaultIndexNames__;
-    public:
     // Default constructor (assumes tensor will be loaded from file)
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NoiseTensor() : NamedTensor{Name__, DefaultIndexNames__} {}
 
@@ -173,9 +173,9 @@ class NoiseTensor : public NamedTensor<Complex, 4>
 
 class PerambTensor : public NamedTensor<SpinVector, 6>
 {
+    public:
     static const std::string                Name__;
     static const std::array<std::string, 6> DefaultIndexNames__;
-    public:
     // Default constructor (assumes tensor will be loaded from file)
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PerambTensor() : NamedTensor{Name__, DefaultIndexNames__} {}
 

From 2ed39ebb7a52f9c8e3703f1452fd32cd060bd760 Mon Sep 17 00:00:00 2001
From: Michael Marshall <43034299+mmphys@users.noreply.github.com>
Date: Fri, 24 Jan 2020 13:01:06 +0000
Subject: [PATCH 04/14] Perambulator won't even allocate memory for unsmeared
 sinks unless the filename is specified. Prior to this update, memory is
 allocated regardless of whether these are requested.

---
 Hadrons/Modules/MDistil/Perambulator.hpp | 42 +++++++++++++++++++-----
 1 file changed, 33 insertions(+), 9 deletions(-)

diff --git a/Hadrons/Modules/MDistil/Perambulator.hpp b/Hadrons/Modules/MDistil/Perambulator.hpp
index fb9d16dc..54db5e50 100644
--- a/Hadrons/Modules/MDistil/Perambulator.hpp
+++ b/Hadrons/Modules/MDistil/Perambulator.hpp
@@ -89,10 +89,22 @@ std::vector<std::string> TPerambulator<FImpl>::getInput(void)
     return {par().lapevec, par().solver, par().noise, par().DistilParams};
 }
 
+static const std::string UnsmearedSink{ "_unsmeared_sink" };
+
 template <typename FImpl>
 std::vector<std::string> TPerambulator<FImpl>::getOutput(void)
 {
-    return {getName(), getName() + "_unsmeared_sink"};
+    // Always return perambulator with name of module
+    std::string objName{ getName() };
+    std::vector<std::string> output{ objName };
+    // If unsmeared sink is specified, then output that as well
+    const std::string UnsmearedSinkFileName{ par().UnsmearedSinkFileName };
+    if( !UnsmearedSinkFileName.empty() )
+    {
+        objName.append( UnsmearedSink );
+        output.emplace_back( objName );
+    }
+    return output;
 }
 
 // setup ///////////////////////////////////////////////////////////////////////
@@ -105,9 +117,15 @@ void TPerambulator<FImpl>::setup(void)
     const bool full_tdil{ dp.TI == Nt };
     const int  Nt_inv{ full_tdil ? 1 : dp.TI };
 
-    envCreate(PerambTensor, getName(), 1, Nt, dp.nvec, dp.LI, dp.nnoise, Nt_inv, dp.SI);
-    envCreate(std::vector<FermionField>, getName() + "_unsmeared_sink", 1,
-              dp.nnoise*dp.LI*Ns*Nt_inv, envGetGrid(FermionField));
+    std::string objName{ getName() };
+    envCreate(PerambTensor, objName, 1, Nt, dp.nvec, dp.LI, dp.nnoise, Nt_inv, dp.SI);
+    const std::string UnsmearedSinkFileName{ par().UnsmearedSinkFileName };
+    if( !UnsmearedSinkFileName.empty() )
+    {
+        objName.append( UnsmearedSink );
+        envCreate(std::vector<FermionField>, objName, 1, dp.nnoise*dp.LI*Ns*Nt_inv,
+                  envGetGrid(FermionField));
+    }
     
     envTmpLat(LatticeSpinColourVector,   "dist_source");
     envTmpLat(LatticeSpinColourVector,   "source4d");
@@ -139,9 +157,12 @@ void TPerambulator<FImpl>::execute(void)
     envGetTmp(FermionField, v5dtmp);
     envGetTmp(FermionField, v5dtmp_sol);
     auto &noise = envGet(NoiseTensor, par().noise);
-    auto &perambulator = envGet(PerambTensor, getName());
+    std::string objName{ getName() };
+    auto &perambulator = envGet(PerambTensor, objName);
     auto &epack = envGet(LapEvecs, par().lapevec);
-    auto &unsmeared_sink = envGet(std::vector<FermionField>, getName() + "_unsmeared_sink");
+    objName.append( UnsmearedSink );
+    const std::string UnsmearedSinkFileName{ par().UnsmearedSinkFileName };
+    const bool bSaveUnsmearedSink( !UnsmearedSinkFileName.empty() );
     envGetTmp(LatticeSpinColourVector, dist_source);
     envGetTmp(LatticeSpinColourVector, source4d);
     envGetTmp(LatticeSpinColourVector, source3d);
@@ -153,7 +174,6 @@ void TPerambulator<FImpl>::execute(void)
     GridCartesian * const grid4d{ env().getGrid() }; // Owned by environment (so I won't delete it)
     const int Ntlocal{grid4d->LocalDimensions()[3]};
     const int Ntfirst{grid4d->LocalStarts()[3]};
-    const std::string UnsmearedSinkFileName{ par().UnsmearedSinkFileName };
 
     for (int inoise = 0; inoise < dp.nnoise; inoise++)
     {
@@ -197,8 +217,11 @@ void TPerambulator<FImpl>::execute(void)
                         mat.ExportPhysicalFermionSolution(v5dtmp_sol, v4dtmp);
                         result4d = v4dtmp;
                     }
-                    if (!UnsmearedSinkFileName.empty())
+                    if( bSaveUnsmearedSink )
+                    {
+                        auto &unsmeared_sink = envGet(std::vector<FermionField>, objName);
                         unsmeared_sink[inoise+dp.nnoise*(dk+dp.LI*(dt+Nt_inv*ds))] = result4d;
+                    }
                     for (int is = 0; is < Ns; is++)
                     {
                         result4d_nospin = peekSpin(result4d,is);
@@ -250,9 +273,10 @@ void TPerambulator<FImpl>::execute(void)
     }
     
     //Save the unsmeared sinks if filename specified
-    if (!UnsmearedSinkFileName.empty())
+    if (bSaveUnsmearedSink)
     {
         LOG(Message) << "Writing unsmeared sink to " << UnsmearedSinkFileName << std::endl;
+        auto &unsmeared_sink = envGet(std::vector<FermionField>, objName);
         A2AVectorsIo::write(UnsmearedSinkFileName, unsmeared_sink, false, vm().getTrajectory());
     }
 }

From c69a3b6ef6cedb173e7a56908d48c5850146c263 Mon Sep 17 00:00:00 2001
From: Michael Marshall <43034299+mmphys@users.noreply.github.com>
Date: Wed, 29 Jan 2020 21:20:20 +0000
Subject: [PATCH 05/14] When saving eigenvectors, LapEvec now saves eigenvalues
 for every timeslice as well. I.e. nT x nVec eigenvalues are saved in
 FileName.evals.conf.h5. A new named tensor, "TimesliceEvals" can be used to
 simplify restoring these from disk. NB: The changes in BaseIO add support so
 that Eigen tensors can be easily used in MPI operations, e.g. GlobalSum. See
 LapEvec.hpp for an example of how this is done.

---
 Grid/serialisation/BaseIO.h             | 36 ++++++++++++-------------
 Hadrons/Modules/MDistil/LapEvec.hpp     | 23 ++++++++++++++--
 Hadrons/Modules/MDistil/Perambulator.cc |  3 +++
 Hadrons/NamedTensor.hpp                 | 14 ++++++++++
 4 files changed, 56 insertions(+), 20 deletions(-)

diff --git a/Grid/serialisation/BaseIO.h b/Grid/serialisation/BaseIO.h
index b858c835..bf424fc7 100644
--- a/Grid/serialisation/BaseIO.h
+++ b/Grid/serialisation/BaseIO.h
@@ -97,6 +97,23 @@ namespace Grid {
     template<typename T, typename V = void> struct is_tensor_variable : public std::false_type {};
     template<typename T> struct is_tensor_variable<T, typename std::enable_if<is_tensor<T>::value
         && !is_tensor_fixed<T>::value>::type> : public std::true_type {};
+
+    // Helper functions to get the ultimate scalar inside a tensor, and corresponding size
+    template <typename ET>
+    inline typename std::enable_if<is_tensor<ET>::value, const typename ET::Index>::type
+    getScalarCount(const ET &eigenTensor) { return eigenTensor.size() * Traits<ET>::count; }
+    template <typename ET>
+    inline typename std::enable_if<is_tensor_of_scalar<ET>::value, const typename ET::Scalar *>::type
+    getFirstScalar(const ET &eigenTensor) { return eigenTensor.data(); }
+    template <typename ET>
+    inline typename std::enable_if<is_tensor_of_scalar<ET>::value, typename ET::Scalar *>::type
+    getFirstScalar(ET &eigenTensor) { return eigenTensor.data(); }
+    template <typename ET>
+    inline typename std::enable_if<is_tensor_of_container<ET>::value, const typename Traits<ET>::scalar_type *>::type
+    getFirstScalar(const ET &eigenTensor) { return eigenTensor.data()->begin(); }
+    template <typename ET>
+    inline typename std::enable_if<is_tensor_of_container<ET>::value, typename Traits<ET>::scalar_type *>::type
+    getFirstScalar(ET &eigenTensor) { return eigenTensor.data()->begin(); }
   }
 
   // Abstract writer/reader classes ////////////////////////////////////////////
@@ -128,23 +145,6 @@ namespace Grid {
     typename std::enable_if<EigenIO::is_tensor<ETensor>::value>::type
     write(const std::string &s, const ETensor &output);
 
-    // Helper functions for Scalar vs Container specialisations
-    template <typename ETensor>
-    inline typename std::enable_if<EigenIO::is_tensor_of_scalar<ETensor>::value,
-    const typename ETensor::Scalar *>::type
-    getFirstScalar(const ETensor &output)
-    {
-      return output.data();
-    }
-    
-    template <typename ETensor>
-    inline typename std::enable_if<EigenIO::is_tensor_of_container<ETensor>::value,
-    const typename EigenIO::Traits<ETensor>::scalar_type *>::type
-    getFirstScalar(const ETensor &output)
-    {
-      return output.data()->begin();
-    }
-    
     template <typename S>
     inline typename std::enable_if<EigenIO::is_scalar<S>::value, void>::type
     copyScalars(S * &pCopy, const S &Source)
@@ -323,7 +323,7 @@ namespace Grid {
     std::vector<Scalar> CopyBuffer;
     const Index TotalNumElements = NumElements * Traits::count;
     if( !CopyData ) {
-      pWriteBuffer = getFirstScalar( output );
+      pWriteBuffer = EigenIO::getFirstScalar( output );
     } else {
       // Regardless of the Eigen::Tensor storage order, the copy will be Row Major
       CopyBuffer.resize( TotalNumElements );
diff --git a/Hadrons/Modules/MDistil/LapEvec.hpp b/Hadrons/Modules/MDistil/LapEvec.hpp
index 9b5197c2..3c1122ca 100644
--- a/Hadrons/Modules/MDistil/LapEvec.hpp
+++ b/Hadrons/Modules/MDistil/LapEvec.hpp
@@ -263,6 +263,11 @@ void TLapEvec<GImpl>::execute(void)
     const int Ntlocal{gridHD->LocalDimensions()[Tdir]};
     const int Ntfirst{gridHD->LocalStarts()[Tdir]};
     uint32_t ConvergenceErrors{0};
+    const int NtFull{env().getDim(Tdir)};
+    TimesliceEvals Evals{ NtFull, LPar.Nvec };
+    for (int t = 0; t < NtFull; t++)
+        for (int v = 0; v < LPar.Nvec; v++)
+            Evals.tensor( t, v ) = 0;
     for (int t = 0; t < Ntlocal; t++ )
     {
         LOG(Message) << "------------------------------------------------------------" << std::endl;
@@ -302,6 +307,8 @@ void TLapEvec<GImpl>::execute(void)
             InsertSliceLocal(eig[t].evec[i],eig4d.evec[i],0,t,Tdir);
             if(t==0 && Ntfirst==0)
                 eig4d.eval[i] = eig[t].eval[i]; // TODO: Discuss: is this needed? Is there a better way?
+            if(gridLD->IsBoss()) // Only do this on one node per timeslice, so a global sum will work
+                Evals.tensor(t + Ntfirst,i) = eig[t].eval[i];
         }
     }
     GridLogIRL.Active( PreviousIRLLogState );
@@ -322,9 +329,21 @@ void TLapEvec<GImpl>::execute(void)
         sOperatorXml.append( b->parString() );
         sOperatorXml.append( "</options></module>" );
         eig4d.record.operatorXml = sOperatorXml;
-        sEigenPackName.append(".");
-        sEigenPackName.append(std::to_string(vm().getTrajectory()));
+        sEigenPackName.append(1, '.');
+        std::size_t NameLen{ sEigenPackName.length() };
+        const std::string sTrajNum{std::to_string(vm().getTrajectory())};
+        sEigenPackName.append(sTrajNum);
         eig4d.write(sEigenPackName,false);
+        // Communicate eig[t].evec to boss-node, save into new object evecs
+        gridHD->GlobalSumVector(EigenIO::getFirstScalar(Evals.tensor),
+                                static_cast<int>(EigenIO::getScalarCount(Evals.tensor)));
+        if(gridHD->IsBoss())
+        {
+            sEigenPackName.resize(NameLen);
+            sEigenPackName.append("evals.");
+            sEigenPackName.append(sTrajNum);
+            Evals.write( sEigenPackName );
+        }
     }
 }
 
diff --git a/Hadrons/Modules/MDistil/Perambulator.cc b/Hadrons/Modules/MDistil/Perambulator.cc
index e710698e..a1b24a2d 100644
--- a/Hadrons/Modules/MDistil/Perambulator.cc
+++ b/Hadrons/Modules/MDistil/Perambulator.cc
@@ -53,5 +53,8 @@ const std::array<std::string, 4> NoiseTensor::DefaultIndexNames__{"nNoise", "nT"
 const std::string                PerambTensor::Name__{"Perambulator"};
 const std::array<std::string, 6> PerambTensor::DefaultIndexNames__{"nT", "nVec", "LI", "nNoise", "nT_inv", "SI"};
 
+const std::string                TimesliceEvals::Name__{"TimesliceEigenValues"};
+const std::array<std::string, 2> TimesliceEvals::DefaultIndexNames__{"nT", "nVec"};
+
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
diff --git a/Hadrons/NamedTensor.hpp b/Hadrons/NamedTensor.hpp
index 003a92fc..53e10e04 100644
--- a/Hadrons/NamedTensor.hpp
+++ b/Hadrons/NamedTensor.hpp
@@ -185,6 +185,20 @@ class PerambTensor : public NamedTensor<SpinVector, 6>
     : NamedTensor{Name__, DefaultIndexNames__, nT, nVec, LI, nNoise, nT_inv, SI} {}
 };
 
+class TimesliceEvals : public NamedTensor<RealD, 2>
+{
+    public:
+    static const std::string                Name__;
+    static const std::array<std::string, 2> DefaultIndexNames__;
+    // Default constructor (assumes tensor will be loaded from file)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TimesliceEvals() : NamedTensor{Name__, DefaultIndexNames__} {}
+
+    // Construct a named tensor explicitly specifying size of each dimension
+    template<typename... IndexTypes>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TimesliceEvals(Eigen::Index nT, Eigen::Index nVec)
+    : NamedTensor{Name__, DefaultIndexNames__, nT, nVec} {}
+};
+
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_NamedTensor_hpp_

From 10192dfc71b3dc24e714d78914cd913ff05b12b4 Mon Sep 17 00:00:00 2001
From: Michael Marshall <43034299+mmphys@users.noreply.github.com>
Date: Fri, 31 Jan 2020 15:02:03 +0000
Subject: [PATCH 06/14] Wall source momenta must be specified for spatial
 components only. So we don't break existing scripts, allow momentum in time
 direction as well, but only if zero. Fail early, so do the check in setup()

---
 Hadrons/Modules/MSource/Wall.hpp | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/Hadrons/Modules/MSource/Wall.hpp b/Hadrons/Modules/MSource/Wall.hpp
index ea26d6c9..3c974dee 100644
--- a/Hadrons/Modules/MSource/Wall.hpp
+++ b/Hadrons/Modules/MSource/Wall.hpp
@@ -122,6 +122,15 @@ void TWall<FImpl>::setup(void)
     envCache(Lattice<iScalar<vInteger>>, tName_, 1, envGetGrid(LatticeComplex));
     envCacheLat(LatticeComplex, momphName_);
     envTmpLat(LatticeComplex, "coor");
+    auto  &src = envGet(PropagatorField, getName());
+    const auto NumDims = src.Grid()->Dimensions();
+    // Don't break existing scripts as long as they specified zero momentum in time dimension
+    std::vector<Real> p = strToVec<Real>(par().mom);
+    if (!(p.size() == NumDims - 1 || (p.size() == NumDims && p[NumDims - 1] == 0)))
+    {
+        HADRONS_ERROR(Size, "momentum has " + std::to_string(p.size())
+                      + " components (must have " + std::to_string(NumDims - 1) + ")");
+    }
 }
 
 // execution ///////////////////////////////////////////////////////////////////
@@ -143,7 +152,8 @@ void TWall<FImpl>::execute(void)
         envGetTmp(LatticeComplex, coor);
         p  = strToVec<Real>(par().mom);
         ph = Zero();
-        for(unsigned int mu = 0; mu < env().getNd(); mu++)
+        const auto NumDims = src.Grid()->Dimensions();
+        for(unsigned int mu = 0; mu < NumDims - 1; mu++)
         {
             LatticeCoordinate(coor, mu);
             ph = ph + (p[mu]/env().getDim(mu))*coor;

From b32b1ca642d153f5146577631fb9c7d0978963f7 Mon Sep 17 00:00:00 2001
From: ferben <ferben@debian.felix.com>
Date: Wed, 26 Feb 2020 12:06:45 +0000
Subject: [PATCH 07/14] bugfix in perambulator module

---
 Hadrons/Modules/MDistil/Perambulator.hpp | 42 +++++++++++++++++++-----
 1 file changed, 33 insertions(+), 9 deletions(-)

diff --git a/Hadrons/Modules/MDistil/Perambulator.hpp b/Hadrons/Modules/MDistil/Perambulator.hpp
index fb9d16dc..54db5e50 100644
--- a/Hadrons/Modules/MDistil/Perambulator.hpp
+++ b/Hadrons/Modules/MDistil/Perambulator.hpp
@@ -89,10 +89,22 @@ std::vector<std::string> TPerambulator<FImpl>::getInput(void)
     return {par().lapevec, par().solver, par().noise, par().DistilParams};
 }
 
+static const std::string UnsmearedSink{ "_unsmeared_sink" };
+
 template <typename FImpl>
 std::vector<std::string> TPerambulator<FImpl>::getOutput(void)
 {
-    return {getName(), getName() + "_unsmeared_sink"};
+    // Always return perambulator with name of module
+    std::string objName{ getName() };
+    std::vector<std::string> output{ objName };
+    // If unsmeared sink is specified, then output that as well
+    const std::string UnsmearedSinkFileName{ par().UnsmearedSinkFileName };
+    if( !UnsmearedSinkFileName.empty() )
+    {
+        objName.append( UnsmearedSink );
+        output.emplace_back( objName );
+    }
+    return output;
 }
 
 // setup ///////////////////////////////////////////////////////////////////////
@@ -105,9 +117,15 @@ void TPerambulator<FImpl>::setup(void)
     const bool full_tdil{ dp.TI == Nt };
     const int  Nt_inv{ full_tdil ? 1 : dp.TI };
 
-    envCreate(PerambTensor, getName(), 1, Nt, dp.nvec, dp.LI, dp.nnoise, Nt_inv, dp.SI);
-    envCreate(std::vector<FermionField>, getName() + "_unsmeared_sink", 1,
-              dp.nnoise*dp.LI*Ns*Nt_inv, envGetGrid(FermionField));
+    std::string objName{ getName() };
+    envCreate(PerambTensor, objName, 1, Nt, dp.nvec, dp.LI, dp.nnoise, Nt_inv, dp.SI);
+    const std::string UnsmearedSinkFileName{ par().UnsmearedSinkFileName };
+    if( !UnsmearedSinkFileName.empty() )
+    {
+        objName.append( UnsmearedSink );
+        envCreate(std::vector<FermionField>, objName, 1, dp.nnoise*dp.LI*Ns*Nt_inv,
+                  envGetGrid(FermionField));
+    }
     
     envTmpLat(LatticeSpinColourVector,   "dist_source");
     envTmpLat(LatticeSpinColourVector,   "source4d");
@@ -139,9 +157,12 @@ void TPerambulator<FImpl>::execute(void)
     envGetTmp(FermionField, v5dtmp);
     envGetTmp(FermionField, v5dtmp_sol);
     auto &noise = envGet(NoiseTensor, par().noise);
-    auto &perambulator = envGet(PerambTensor, getName());
+    std::string objName{ getName() };
+    auto &perambulator = envGet(PerambTensor, objName);
     auto &epack = envGet(LapEvecs, par().lapevec);
-    auto &unsmeared_sink = envGet(std::vector<FermionField>, getName() + "_unsmeared_sink");
+    objName.append( UnsmearedSink );
+    const std::string UnsmearedSinkFileName{ par().UnsmearedSinkFileName };
+    const bool bSaveUnsmearedSink( !UnsmearedSinkFileName.empty() );
     envGetTmp(LatticeSpinColourVector, dist_source);
     envGetTmp(LatticeSpinColourVector, source4d);
     envGetTmp(LatticeSpinColourVector, source3d);
@@ -153,7 +174,6 @@ void TPerambulator<FImpl>::execute(void)
     GridCartesian * const grid4d{ env().getGrid() }; // Owned by environment (so I won't delete it)
     const int Ntlocal{grid4d->LocalDimensions()[3]};
     const int Ntfirst{grid4d->LocalStarts()[3]};
-    const std::string UnsmearedSinkFileName{ par().UnsmearedSinkFileName };
 
     for (int inoise = 0; inoise < dp.nnoise; inoise++)
     {
@@ -197,8 +217,11 @@ void TPerambulator<FImpl>::execute(void)
                         mat.ExportPhysicalFermionSolution(v5dtmp_sol, v4dtmp);
                         result4d = v4dtmp;
                     }
-                    if (!UnsmearedSinkFileName.empty())
+                    if( bSaveUnsmearedSink )
+                    {
+                        auto &unsmeared_sink = envGet(std::vector<FermionField>, objName);
                         unsmeared_sink[inoise+dp.nnoise*(dk+dp.LI*(dt+Nt_inv*ds))] = result4d;
+                    }
                     for (int is = 0; is < Ns; is++)
                     {
                         result4d_nospin = peekSpin(result4d,is);
@@ -250,9 +273,10 @@ void TPerambulator<FImpl>::execute(void)
     }
     
     //Save the unsmeared sinks if filename specified
-    if (!UnsmearedSinkFileName.empty())
+    if (bSaveUnsmearedSink)
     {
         LOG(Message) << "Writing unsmeared sink to " << UnsmearedSinkFileName << std::endl;
+        auto &unsmeared_sink = envGet(std::vector<FermionField>, objName);
         A2AVectorsIo::write(UnsmearedSinkFileName, unsmeared_sink, false, vm().getTrajectory());
     }
 }

From 0a827aa7bf993ca6ac0bb608183fc695a2b49481 Mon Sep 17 00:00:00 2001
From: ferben <ferben@debian.felix.com>
Date: Wed, 11 Mar 2020 08:52:52 +0000
Subject: [PATCH 08/14] Added Hadrons_Error in case blockSize is set too large

---
 Hadrons/Modules/MContraction/A2AMesonField.hpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/Hadrons/Modules/MContraction/A2AMesonField.hpp b/Hadrons/Modules/MContraction/A2AMesonField.hpp
index d3f90959..cf1b9415 100644
--- a/Hadrons/Modules/MContraction/A2AMesonField.hpp
+++ b/Hadrons/Modules/MContraction/A2AMesonField.hpp
@@ -231,6 +231,11 @@ void TA2AMesonField<FImpl>::execute(void)
     int block      = par().block;
     int cacheBlock = par().cacheBlock;
 
+    if (N_i < block || N_j < block)
+    {
+        HADRONS_ERROR(Range, "blockSize must not exceed size of input vector.");
+    }
+
     LOG(Message) << "Computing all-to-all meson fields" << std::endl;
     LOG(Message) << "Left: '" << par().left << "' Right: '" << par().right << "'" << std::endl;
     LOG(Message) << "Momenta:" << std::endl;

From 0fa93383b7a0fb35725fae2316ef445ee3a9e5ce Mon Sep 17 00:00:00 2001
From: ferben <ferben@debian.felix.com>
Date: Wed, 11 Mar 2020 09:05:01 +0000
Subject: [PATCH 09/14] changed to push_back according to request

---
 Hadrons/Modules/MDistil/Perambulator.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Hadrons/Modules/MDistil/Perambulator.hpp b/Hadrons/Modules/MDistil/Perambulator.hpp
index 54db5e50..e436c208 100644
--- a/Hadrons/Modules/MDistil/Perambulator.hpp
+++ b/Hadrons/Modules/MDistil/Perambulator.hpp
@@ -102,7 +102,7 @@ std::vector<std::string> TPerambulator<FImpl>::getOutput(void)
     if( !UnsmearedSinkFileName.empty() )
     {
         objName.append( UnsmearedSink );
-        output.emplace_back( objName );
+        output.push_back( objName );
     }
     return output;
 }

From 516ac1d4d5b7ca419d60983dc02b391b2dd7f7e2 Mon Sep 17 00:00:00 2001
From: ferben <ferben@debian.felix.com>
Date: Thu, 12 Mar 2020 10:52:27 +0000
Subject: [PATCH 10/14] registered module supporting ZMobius action

---
 Hadrons/Modules/MDistil/Perambulator.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Hadrons/Modules/MDistil/Perambulator.hpp b/Hadrons/Modules/MDistil/Perambulator.hpp
index fb9d16dc..b3d40509 100644
--- a/Hadrons/Modules/MDistil/Perambulator.hpp
+++ b/Hadrons/Modules/MDistil/Perambulator.hpp
@@ -74,6 +74,7 @@ protected:
 };
 
 MODULE_REGISTER_TMP(Perambulator, TPerambulator<FIMPL>, MDistil);
+MODULE_REGISTER_TMP(Perambulator, TPerambulator<ZFIMPL>, MDistil);
 
 /******************************************************************************
  *                 TPerambulator implementation                             *

From 373cf61abb29157ab6d60b654d4133651f821842 Mon Sep 17 00:00:00 2001
From: Felix Erben <dc-erbe1@tesseract-login1.ib0.sgi.cluster.dirac.ed.ac.uk>
Date: Thu, 12 Mar 2020 11:44:43 +0000
Subject: [PATCH 11/14] bugfix ZPerambulator

---
 Hadrons/Modules/MDistil/Perambulator.cc  | 1 +
 Hadrons/Modules/MDistil/Perambulator.hpp | 1 +
 2 files changed, 2 insertions(+)

diff --git a/Hadrons/Modules/MDistil/Perambulator.cc b/Hadrons/Modules/MDistil/Perambulator.cc
index e710698e..691940ac 100644
--- a/Hadrons/Modules/MDistil/Perambulator.cc
+++ b/Hadrons/Modules/MDistil/Perambulator.cc
@@ -34,6 +34,7 @@ using namespace Hadrons;
 using namespace MDistil;
 
 template class Grid::Hadrons::MDistil::TPerambulator<FIMPL>;
+template class Grid::Hadrons::MDistil::TPerambulator<ZFIMPL>;
 
 BEGIN_HADRONS_NAMESPACE
 
diff --git a/Hadrons/Modules/MDistil/Perambulator.hpp b/Hadrons/Modules/MDistil/Perambulator.hpp
index fb9d16dc..7b507c7a 100644
--- a/Hadrons/Modules/MDistil/Perambulator.hpp
+++ b/Hadrons/Modules/MDistil/Perambulator.hpp
@@ -74,6 +74,7 @@ protected:
 };
 
 MODULE_REGISTER_TMP(Perambulator, TPerambulator<FIMPL>, MDistil);
+MODULE_REGISTER_TMP(ZPerambulator, TPerambulator<ZFIMPL>, MDistil);
 
 /******************************************************************************
  *                 TPerambulator implementation                             *

From 37535089570a59793becd44a0289f83831029aa3 Mon Sep 17 00:00:00 2001
From: Michael Marshall <43034299+mmphys@users.noreply.github.com>
Date: Thu, 12 Mar 2020 13:47:51 +0000
Subject: [PATCH 12/14] Making change 1) as simple as possible 2) as much like
 MSink/Point.hpp as possible

---
 Hadrons/Modules/MSource/Wall.hpp | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/Hadrons/Modules/MSource/Wall.hpp b/Hadrons/Modules/MSource/Wall.hpp
index 3c974dee..d6a223cf 100644
--- a/Hadrons/Modules/MSource/Wall.hpp
+++ b/Hadrons/Modules/MSource/Wall.hpp
@@ -122,15 +122,6 @@ void TWall<FImpl>::setup(void)
     envCache(Lattice<iScalar<vInteger>>, tName_, 1, envGetGrid(LatticeComplex));
     envCacheLat(LatticeComplex, momphName_);
     envTmpLat(LatticeComplex, "coor");
-    auto  &src = envGet(PropagatorField, getName());
-    const auto NumDims = src.Grid()->Dimensions();
-    // Don't break existing scripts as long as they specified zero momentum in time dimension
-    std::vector<Real> p = strToVec<Real>(par().mom);
-    if (!(p.size() == NumDims - 1 || (p.size() == NumDims && p[NumDims - 1] == 0)))
-    {
-        HADRONS_ERROR(Size, "momentum has " + std::to_string(p.size())
-                      + " components (must have " + std::to_string(NumDims - 1) + ")");
-    }
 }
 
 // execution ///////////////////////////////////////////////////////////////////
@@ -152,8 +143,7 @@ void TWall<FImpl>::execute(void)
         envGetTmp(LatticeComplex, coor);
         p  = strToVec<Real>(par().mom);
         ph = Zero();
-        const auto NumDims = src.Grid()->Dimensions();
-        for(unsigned int mu = 0; mu < NumDims - 1; mu++)
+        for(unsigned int mu = 0; mu < p.size(); mu++)
         {
             LatticeCoordinate(coor, mu);
             ph = ph + (p[mu]/env().getDim(mu))*coor;

From 037bb6ea7397fcedddcd94515786849351e4d507 Mon Sep 17 00:00:00 2001
From: Daniel Richtmann <daniel.richtmann@gmail.com>
Date: Mon, 16 Mar 2020 14:07:52 +0100
Subject: [PATCH 13/14] Check in reader for openqcd configs

This reader is suboptimal in the sense that it opens the entire config on every MPI rank.
---
 Grid/parallelIO/OpenQcdIO.h  | 153 +++++++++++++++++++++++++++++++++++
 Grid/qcd/hmc/HMC_aggregate.h |   1 +
 tests/IO/Test_openqcd_io.cc  |  55 +++++++++++++
 3 files changed, 209 insertions(+)
 create mode 100644 Grid/parallelIO/OpenQcdIO.h
 create mode 100644 tests/IO/Test_openqcd_io.cc

diff --git a/Grid/parallelIO/OpenQcdIO.h b/Grid/parallelIO/OpenQcdIO.h
new file mode 100644
index 00000000..f340e8fc
--- /dev/null
+++ b/Grid/parallelIO/OpenQcdIO.h
@@ -0,0 +1,153 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/parallelIO/OpenQcdIO.h
+
+Copyright (C) 2015 - 2020
+
+Author: Daniel Richtmann <daniel.richtmann@ur.de>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+struct OpenQcdHeader : Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(OpenQcdHeader,
+                                  int,    Nt,
+                                  int,    Nx,
+                                  int,    Ny,
+                                  int,    Nz,
+                                  double, plaq);
+};
+
+class OpenQcdIO : public BinaryIO {
+public:
+  static constexpr double normalisationFactor = Nc; // normalisation difference: grid 18, openqcd 6
+
+  static inline int readHeader(std::string file, GridBase* grid, FieldMetaData& field) {
+    OpenQcdHeader header;
+
+    {
+      std::ifstream fin(file, std::ios::in | std::ios::binary);
+      fin.read(reinterpret_cast<char*>(&header), sizeof(OpenQcdHeader));
+      assert(!fin.fail());
+      field.data_start = fin.tellg();
+      fin.close();
+    }
+
+    header.plaq /= normalisationFactor;
+
+    // sanity check (should trigger on endian issues)
+    assert(0 < header.Nt && header.Nt <= 1024);
+    assert(0 < header.Nx && header.Nx <= 1024);
+    assert(0 < header.Ny && header.Ny <= 1024);
+    assert(0 < header.Nz && header.Nz <= 1024);
+
+    field.dimension[0] = header.Nx;
+    field.dimension[1] = header.Ny;
+    field.dimension[2] = header.Nz;
+    field.dimension[3] = header.Nt;
+
+    assert(grid->_ndimension == Nd);
+    for(int d = 0; d < Nd; d++)
+      assert(grid->_fdimensions[d] == field.dimension[d]);
+
+    field.plaquette = header.plaq;
+
+    return field.data_start;
+  }
+
+  template<class vsimd>
+  static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
+                                       FieldMetaData&                        header,
+                                       std::string                           file) {
+    auto grid = dynamic_cast<GridCartesian*>(Umu.Grid());
+    assert(grid != nullptr);
+    assert((grid->_ndimension == Nd) && (Nd == 4));
+
+    uint64_t offset = readHeader(file, Umu.Grid(), header);
+    FieldMetaData clone(header);
+
+    // NOTE: This version is suboptimal because we read in the full file on every rank
+    std::vector<ColourMatrix> data(grid->gSites() * 4);
+    {
+      auto fin = std::fstream(file, std::ios::in | std::ios::binary);
+      fin.seekg(offset);
+      fin.read((char *)data.data(), data.size() * sizeof(ColourMatrix));
+      fin.close();
+    }
+
+    // global lattice size
+    Coordinate fdim = grid->FullDimensions();
+
+    // coordinate of this process
+    Coordinate pcoor;
+    grid->ProcessorCoorFromRank(CartesianCommunicator::RankWorld(), pcoor);
+
+    // loop over local indices
+    thread_for(idx, grid->lSites(), {
+      // convert local index to global coordinate
+      Coordinate lcoor, gcoor;
+      grid->LocalIndexToLocalCoor(idx, lcoor);
+      grid->ProcessorCoorLocalCoorToGlobalCoor(pcoor, lcoor, gcoor);
+
+      // openQCD stores links attached to odd sites
+      bool neg = (gcoor[Xdir] + gcoor[Ydir] + gcoor[Zdir] + gcoor[Tdir]) % 2 != 1;
+
+      LorentzColourMatrix site_data;
+      for (int mu = 0; mu < 4; ++mu) {
+        // determine the site at which it is stored
+        Coordinate c = gcoor;
+        if (neg)
+          c[mu] = (c[mu] + 1) % grid->FullDimensions()[mu];
+
+        // site-index in the OpenQCD format (which uses t,x,y,z order)
+        int openqcd_idx = (c[Tdir] * fdim[Xdir] * fdim[Ydir] * fdim[Zdir]
+                        +  c[Xdir] * fdim[Ydir] * fdim[Zdir]
+                        +  c[Ydir] * fdim[Zdir]
+                        +  c[Zdir])/2;
+        int openqcd_mu = (mu + 1) % 4;
+
+        // pick the colour-matrix out
+        site_data(mu) =
+            data[8 * openqcd_idx + 2 * openqcd_mu + (neg ? 1 : 0)]();
+      }
+
+      pokeLocalSite(site_data, Umu, lcoor);
+    });
+
+    GaugeStatistics(Umu, clone);
+
+    std::cout << GridLogMessage << "OpenQcd Configuration " << file << " plaquette "
+              << std::setprecision(15)
+              << clone.plaquette << " header " << header.plaquette
+              << " difference " << fabs(clone.plaquette - header.plaquette)
+              << std::endl;
+
+    if(fabs(clone.plaquette - header.plaquette) >= 1.0e-5) std::cout << " Plaquette mismatch " << std::endl;
+    assert(fabs(clone.plaquette - header.plaquette) < 1.0e-5);
+
+    std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl;
+  }
+};
+
+NAMESPACE_END(Grid);
diff --git a/Grid/qcd/hmc/HMC_aggregate.h b/Grid/qcd/hmc/HMC_aggregate.h
index e4d2ce83..94c745e1 100644
--- a/Grid/qcd/hmc/HMC_aggregate.h
+++ b/Grid/qcd/hmc/HMC_aggregate.h
@@ -39,6 +39,7 @@ directory
 #include <Grid/parallelIO/IldgIOtypes.h>
 #include <Grid/parallelIO/IldgIO.h>
 #include <Grid/parallelIO/NerscIO.h>
+#include <Grid/parallelIO/OpenQcdIO.h>
 NAMESPACE_CHECK(Ildg);
 
 #include <Grid/qcd/hmc/checkpointers/CheckPointers.h>
diff --git a/tests/IO/Test_openqcd_io.cc b/tests/IO/Test_openqcd_io.cc
new file mode 100644
index 00000000..2a5769bd
--- /dev/null
+++ b/tests/IO/Test_openqcd_io.cc
@@ -0,0 +1,55 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./tests/io/Test_openqcd_io.cc
+
+Copyright (C) 2015 - 2020
+
+Author: Daniel Richtmann <daniel.richtmann@ur.de>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/Grid.h>
+
+using namespace Grid;
+
+int main(int argc, char** argv) {
+  Grid_init(&argc, &argv);
+
+  GridCartesian* grid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
+                                                       GridDefaultSimd(Nd, vComplexD::Nsimd()),
+                                                       GridDefaultMpi());
+
+  LatticeGaugeField Umu(grid);
+
+  FieldMetaData header;
+
+  if(!Grid::GridCmdOptionExists(argv, argv + argc, "--config")) {
+    std::cout << GridLogError << "You need to use --config /path/to/openqcd_config" << std::endl;
+    abort();
+  }
+
+  std::string file = Grid::GridCmdOptionPayload(argv, argv + argc, "--config");
+  assert(!file.empty());
+
+  OpenQcdIO::readConfiguration(Umu, header, file);
+
+  Grid_finalize();
+}

From 989af658071f5d9fc92adc0d6e0ab9775b3e0e51 Mon Sep 17 00:00:00 2001
From: Daniel Richtmann <daniel.richtmann@gmail.com>
Date: Mon, 23 Mar 2020 17:33:18 +0100
Subject: [PATCH 14/14] Check in parallel reader for openqcd configs

---
 Grid/parallelIO/MetaData.h                 |  24 ++
 Grid/parallelIO/OpenQcdIO.h                | 165 ++++++++----
 Grid/parallelIO/OpenQcdIOChromaReference.h | 281 +++++++++++++++++++++
 Grid/qcd/hmc/HMC_aggregate.h               |   3 +
 tests/IO/Test_openqcd_io.cc                |  51 +++-
 5 files changed, 466 insertions(+), 58 deletions(-)
 create mode 100644 Grid/parallelIO/OpenQcdIOChromaReference.h

diff --git a/Grid/parallelIO/MetaData.h b/Grid/parallelIO/MetaData.h
index 2e211838..4c1cfbdb 100644
--- a/Grid/parallelIO/MetaData.h
+++ b/Grid/parallelIO/MetaData.h
@@ -301,6 +301,30 @@ struct GaugeSimpleUnmunger {
   };
 };
 
+template<class fobj,class sobj>
+struct GaugeDoubleStoredMunger{
+  void operator()(fobj &in, sobj &out) {
+    for (int mu = 0; mu < Nds; mu++) {
+      for (int i = 0; i < Nc; i++) {
+        for (int j = 0; j < Nc; j++) {
+          out(mu)()(i, j) = in(mu)()(i, j);
+        }}
+    }
+  };
+};
+
+template <class fobj, class sobj>
+struct GaugeDoubleStoredUnmunger {
+  void operator()(sobj &in, fobj &out) {
+    for (int mu = 0; mu < Nds; mu++) {
+      for (int i = 0; i < Nc; i++) {
+        for (int j = 0; j < Nc; j++) {
+          out(mu)()(i, j) = in(mu)()(i, j);
+        }}
+    }
+  };
+};
+
 template<class fobj,class sobj>
 struct Gauge3x2munger{
   void operator() (fobj &in,sobj &out){
diff --git a/Grid/parallelIO/OpenQcdIO.h b/Grid/parallelIO/OpenQcdIO.h
index f340e8fc..00911595 100644
--- a/Grid/parallelIO/OpenQcdIO.h
+++ b/Grid/parallelIO/OpenQcdIO.h
@@ -67,6 +67,10 @@ public:
     field.dimension[2] = header.Nz;
     field.dimension[3] = header.Nt;
 
+    std::cout << GridLogDebug << "header: " << header << std::endl;
+    std::cout << GridLogDebug << "grid dimensions: " << grid->_fdimensions << std::endl;
+    std::cout << GridLogDebug << "file dimensions: " << field.dimension << std::endl;
+
     assert(grid->_ndimension == Nd);
     for(int d = 0; d < Nd; d++)
       assert(grid->_fdimensions[d] == field.dimension[d]);
@@ -80,74 +84,141 @@ public:
   static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
                                        FieldMetaData&                        header,
                                        std::string                           file) {
+    typedef Lattice<iDoubleStoredColourMatrix<vsimd>> DoubleStoredGaugeField;
+
+    assert(Ns == 4 and Nd == 4 and Nc == 3);
+
     auto grid = dynamic_cast<GridCartesian*>(Umu.Grid());
-    assert(grid != nullptr);
-    assert((grid->_ndimension == Nd) && (Nd == 4));
+    assert(grid != nullptr); assert(grid->_ndimension == Nd);
 
     uint64_t offset = readHeader(file, Umu.Grid(), header);
+
     FieldMetaData clone(header);
 
-    // NOTE: This version is suboptimal because we read in the full file on every rank
-    std::vector<ColourMatrix> data(grid->gSites() * 4);
-    {
-      auto fin = std::fstream(file, std::ios::in | std::ios::binary);
-      fin.seekg(offset);
-      fin.read((char *)data.data(), data.size() * sizeof(ColourMatrix));
-      fin.close();
-    }
+    std::string format("IEEE64"); // they always store little endian double precsision
+    uint32_t    nersc_csum, scidac_csuma, scidac_csumb;
 
-    // global lattice size
-    Coordinate fdim = grid->FullDimensions();
+    GridCartesian*         grid_openqcd = createOpenQcdGrid(grid);
+    GridRedBlackCartesian* grid_rb      = SpaceTimeGrid::makeFourDimRedBlackGrid(grid);
 
-    // coordinate of this process
-    Coordinate pcoor;
-    grid->ProcessorCoorFromRank(CartesianCommunicator::RankWorld(), pcoor);
+    typedef DoubleStoredColourMatrixD                                              fobj;
+    typedef typename DoubleStoredGaugeField::vector_object::scalar_object          sobj;
+    typedef typename DoubleStoredGaugeField::vector_object::Realified::scalar_type word;
 
-    // loop over local indices
-    thread_for(idx, grid->lSites(), {
-      // convert local index to global coordinate
-      Coordinate lcoor, gcoor;
-      grid->LocalIndexToLocalCoor(idx, lcoor);
-      grid->ProcessorCoorLocalCoorToGlobalCoor(pcoor, lcoor, gcoor);
+    word w = 0;
 
-      // openQCD stores links attached to odd sites
-      bool neg = (gcoor[Xdir] + gcoor[Ydir] + gcoor[Zdir] + gcoor[Tdir]) % 2 != 1;
+    std::vector<fobj> iodata(grid_openqcd->lSites()); // Munge, checksum, byte order in here
+    std::vector<sobj> scalardata(grid->lSites());
 
-      LorentzColourMatrix site_data;
-      for (int mu = 0; mu < 4; ++mu) {
-        // determine the site at which it is stored
-        Coordinate c = gcoor;
-        if (neg)
-          c[mu] = (c[mu] + 1) % grid->FullDimensions()[mu];
+    IOobject(w, grid_openqcd, iodata, file, offset, format, BINARYIO_READ | BINARYIO_LEXICOGRAPHIC,
+             nersc_csum, scidac_csuma, scidac_csumb);
 
-        // site-index in the OpenQCD format (which uses t,x,y,z order)
-        int openqcd_idx = (c[Tdir] * fdim[Xdir] * fdim[Ydir] * fdim[Zdir]
-                        +  c[Xdir] * fdim[Ydir] * fdim[Zdir]
-                        +  c[Ydir] * fdim[Zdir]
-                        +  c[Zdir])/2;
-        int openqcd_mu = (mu + 1) % 4;
+    GridStopWatch timer;
+    timer.Start();
 
-        // pick the colour-matrix out
-        site_data(mu) =
-            data[8 * openqcd_idx + 2 * openqcd_mu + (neg ? 1 : 0)]();
-      }
+    DoubleStoredGaugeField Umu_ds(grid);
 
-      pokeLocalSite(site_data, Umu, lcoor);
+    auto munge = GaugeDoubleStoredMunger<DoubleStoredColourMatrixD, DoubleStoredColourMatrix>();
+
+    Coordinate ldim = grid->LocalDimensions();
+    thread_for(idx_g, grid->lSites(), {
+        Coordinate coor;
+        grid->LocalIndexToLocalCoor(idx_g, coor);
+
+        bool isOdd = grid_rb->CheckerBoard(coor) == Odd;
+
+        if(!isOdd) continue;
+
+        int idx_o = (coor[Tdir] * ldim[Xdir] * ldim[Ydir] * ldim[Zdir]
+                  +  coor[Xdir] * ldim[Ydir] * ldim[Zdir]
+                  +  coor[Ydir] * ldim[Zdir]
+                  +  coor[Zdir])/2;
+
+        munge(iodata[idx_o], scalardata[idx_g]);
     });
 
+    grid->Barrier(); timer.Stop();
+    std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: munge overhead " << timer.Elapsed() << std::endl;
+
+    timer.Reset(); timer.Start();
+
+    vectorizeFromLexOrdArray(scalardata, Umu_ds);
+
+    grid->Barrier(); timer.Stop();
+    std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: vectorize overhead " << timer.Elapsed() << std::endl;
+
+    timer.Reset(); timer.Start();
+
+    undoDoubleStore(Umu, Umu_ds);
+
+    grid->Barrier(); timer.Stop();
+    std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: redistribute overhead " << timer.Elapsed() << std::endl;
+
     GaugeStatistics(Umu, clone);
 
-    std::cout << GridLogMessage << "OpenQcd Configuration " << file << " plaquette "
-              << std::setprecision(15)
-              << clone.plaquette << " header " << header.plaquette
-              << " difference " << fabs(clone.plaquette - header.plaquette)
-              << std::endl;
+    RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
 
-    if(fabs(clone.plaquette - header.plaquette) >= 1.0e-5) std::cout << " Plaquette mismatch " << std::endl;
-    assert(fabs(clone.plaquette - header.plaquette) < 1.0e-5);
+    // clang-format off
+    std::cout << GridLogMessage << "OpenQcd Configuration " << file
+              << " plaquette " << clone.plaquette
+              << " header " << header.plaquette
+              << " difference " << plaq_diff
+              << std::endl;
+    // clang-format on
+
+    RealD precTol = (getPrecision<vsimd>::value == 1) ? 2e-7 : 2e-15;
+    RealD tol     = precTol * std::sqrt(grid->_Nprocessors); // taken from RQCD chroma code
+
+    if(plaq_diff >= tol)
+      std::cout << " Plaquette mismatch (diff = " << plaq_diff << ", tol = " << tol << ")" << std::endl;
+    assert(plaq_diff < tol);
 
     std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl;
   }
+
+  template<class vsimd>
+  static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
+                                        std::string                           file) {
+    std::cout << GridLogError << "Writing to openQCD file format is not implemented" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+
+private:
+  static inline GridCartesian* createOpenQcdGrid(GridCartesian* grid) {
+    // exploit GridCartesian to be able to still use IOobject
+    Coordinate gdim  = grid->GlobalDimensions();
+    Coordinate ldim  = grid->LocalDimensions();
+    Coordinate pcoor = grid->ThisProcessorCoor();
+
+    // openqcd does rb on the z direction
+    gdim[Zdir] /= 2;
+    ldim[Zdir] /= 2;
+
+    // and has the order T X Y Z (from slowest to fastest)
+    std::swap(gdim[Xdir], gdim[Zdir]);
+    std::swap(ldim[Xdir], ldim[Zdir]);
+    std::swap(pcoor[Xdir], pcoor[Zdir]);
+
+    GridCartesian* ret   = SpaceTimeGrid::makeFourDimGrid(gdim, grid->_simd_layout, grid->ProcessorGrid());
+    ret->_ldimensions    = ldim;
+    ret->_processor_coor = pcoor;
+    return ret;
+  }
+
+  template<class vsimd>
+  static inline void undoDoubleStore(Lattice<iLorentzColourMatrix<vsimd>>&            Umu,
+                                     Lattice<iDoubleStoredColourMatrix<vsimd>> const& Umu_ds) {
+    conformable(Umu.Grid(), Umu_ds.Grid());
+    Lattice<iColourMatrix<vsimd>> U(Umu.Grid());
+
+    // they store T+, T-, X+, X-, Y+, Y-, Z+, Z-
+    for(int mu_g = 0; mu_g < Nd; ++mu_g) {
+      int mu_o = (mu_g + 1) % Nd;
+      U        = PeekIndex<LorentzIndex>(Umu_ds, 2 * mu_o)
+               + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 2 * mu_o + 1), mu_g, +1);
+      PokeIndex<LorentzIndex>(Umu, U, mu_g);
+    }
+  }
 };
 
 NAMESPACE_END(Grid);
diff --git a/Grid/parallelIO/OpenQcdIOChromaReference.h b/Grid/parallelIO/OpenQcdIOChromaReference.h
new file mode 100644
index 00000000..bab54fe8
--- /dev/null
+++ b/Grid/parallelIO/OpenQcdIOChromaReference.h
@@ -0,0 +1,281 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/parallelIO/OpenQcdIOChromaReference.h
+
+Copyright (C) 2015 - 2020
+
+Author: Daniel Richtmann <daniel.richtmann@ur.de>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+#include <ios>
+#include <iostream>
+#include <limits>
+#include <iomanip>
+#include <mpi.h>
+#include <ostream>
+#include <string>
+
+#define CHECK {std::cerr << __FILE__ << " @l " << __LINE__ << ": CHECK" << grid->ThisRank() << std::endl;}
+#define CHECK_VAR(a)   { std::cerr << __FILE__ << "@l" << __LINE__ << " on "<< grid->ThisRank() << ": " << __func__ << " " << #a << "=" << (a) << std::endl; }
+// #undef CHECK
+// #define CHECK
+
+NAMESPACE_BEGIN(Grid);
+
+class ParRdr {
+private:
+  bool const swap;
+
+  MPI_Status status;
+  MPI_File   fp;
+
+  int err;
+
+  MPI_Datatype oddSiteType;
+  MPI_Datatype fileViewType;
+
+  GridBase* grid;
+
+public:
+  ParRdr(MPI_Comm comm, std::string const& filename, GridBase* gridPtr)
+    : swap(false)
+    , grid(gridPtr) {
+    err = MPI_File_open(comm, const_cast<char*>(filename.c_str()), MPI_MODE_RDONLY, MPI_INFO_NULL, &fp);
+    assert(err == MPI_SUCCESS);
+  }
+
+  virtual ~ParRdr() { MPI_File_close(&fp); }
+
+  inline void errInfo(int const err, std::string const& func) {
+    static char estring[MPI_MAX_ERROR_STRING];
+    int         eclass = -1, len = 0;
+    MPI_Error_class(err, &eclass);
+    MPI_Error_string(err, estring, &len);
+    std::cerr << func << " - Error " << eclass << ": " << estring << std::endl;
+  }
+
+  int readHeader(FieldMetaData& field) {
+    assert((grid->_ndimension == Nd) && (Nd == 4));
+    assert(Nc == 3);
+
+    OpenQcdHeader header;
+
+    readBlock(reinterpret_cast<char*>(&header), 0, sizeof(OpenQcdHeader), MPI_CHAR);
+
+    header.plaq /= 3.; // TODO change this into normalizationfactor
+
+    // sanity check (should trigger on endian issues) TODO remove?
+    assert(0 < header.Nt && header.Nt <= 1024);
+    assert(0 < header.Nx && header.Nx <= 1024);
+    assert(0 < header.Ny && header.Ny <= 1024);
+    assert(0 < header.Nz && header.Nz <= 1024);
+
+    field.dimension[0] = header.Nx;
+    field.dimension[1] = header.Ny;
+    field.dimension[2] = header.Nz;
+    field.dimension[3] = header.Nt;
+
+    for(int d = 0; d < Nd; d++)
+      assert(grid->FullDimensions()[d] == field.dimension[d]);
+
+    field.plaquette = header.plaq;
+
+    field.data_start = sizeof(OpenQcdHeader);
+
+    return field.data_start;
+  }
+
+  void readBlock(void* const dest, uint64_t const pos, uint64_t const nbytes, MPI_Datatype const datatype) {
+    err = MPI_File_read_at_all(fp, pos, dest, nbytes, datatype, &status);
+    errInfo(err, "MPI_File_read_at_all");
+    // CHECK_VAR(err)
+
+    int read = -1;
+    MPI_Get_count(&status, datatype, &read);
+    // CHECK_VAR(read)
+    assert(nbytes == (uint64_t)read);
+    assert(err == MPI_SUCCESS);
+  }
+
+  void createTypes() {
+    constexpr int elem_size = Nd * 2 * 2 * Nc * Nc * sizeof(double); // 2_complex 2_fwdbwd
+
+    err = MPI_Type_contiguous(elem_size, MPI_BYTE, &oddSiteType); assert(err == MPI_SUCCESS);
+    err = MPI_Type_commit(&oddSiteType); assert(err == MPI_SUCCESS);
+
+    Coordinate const L = grid->GlobalDimensions();
+    Coordinate const l = grid->LocalDimensions();
+    Coordinate const i = grid->ThisProcessorCoor();
+
+    Coordinate sizes({L[2] / 2, L[1], L[0], L[3]});
+    Coordinate subsizes({l[2] / 2, l[1], l[0], l[3]});
+    Coordinate starts({i[2] * l[2] / 2, i[1] * l[1], i[0] * l[0], i[3] * l[3]});
+
+    err = MPI_Type_create_subarray(grid->_ndimension, &sizes[0], &subsizes[0], &starts[0], MPI_ORDER_FORTRAN, oddSiteType, &fileViewType); assert(err == MPI_SUCCESS);
+    err = MPI_Type_commit(&fileViewType); assert(err == MPI_SUCCESS);
+  }
+
+  void freeTypes() {
+    err = MPI_Type_free(&fileViewType); assert(err == MPI_SUCCESS);
+    err = MPI_Type_free(&oddSiteType); assert(err == MPI_SUCCESS);
+  }
+
+  bool readGauge(std::vector<ColourMatrixD>& domain_buff, FieldMetaData& meta) {
+    auto hdr_offset = readHeader(meta);
+    CHECK
+    createTypes();
+    err = MPI_File_set_view(fp, hdr_offset, oddSiteType, fileViewType, "native", MPI_INFO_NULL); errInfo(err, "MPI_File_set_view0"); assert(err == MPI_SUCCESS);
+    CHECK
+    int const domainSites = grid->lSites();
+    domain_buff.resize(Nd * domainSites); // 2_fwdbwd * 4_Nd * domainSites / 2_onlyodd
+
+    // the actual READ
+    constexpr uint64_t cm_size   = 2 * Nc * Nc * sizeof(double);    // 2_complex
+    constexpr uint64_t os_size   = Nd * 2 * cm_size;                // 2_fwdbwd
+    constexpr uint64_t max_elems = std::numeric_limits<int>::max(); // int adressable elems: floor is fine
+    uint64_t const     n_os      = domainSites / 2;
+
+    for(uint64_t os_idx = 0; os_idx < n_os;) {
+      uint64_t const read_os = os_idx + max_elems <= n_os ? max_elems : n_os - os_idx;
+      uint64_t const cm      = os_idx * Nd * 2;
+      readBlock(&(domain_buff[cm]), os_idx, read_os, oddSiteType);
+      os_idx += read_os;
+    }
+
+    CHECK
+    err = MPI_File_set_view(fp, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);
+  errInfo(err, "MPI_File_set_view1");
+    assert(err == MPI_SUCCESS);
+    freeTypes();
+
+    std::cout << GridLogMessage << "read sum: " << n_os * os_size << " bytes" << std::endl;
+    return true;
+  }
+};
+
+class OpenQcdIOChromaReference : public BinaryIO {
+public:
+  template<class vsimd>
+  static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
+                                       Grid::FieldMetaData&                  header,
+                                       std::string                           file) {
+    typedef Lattice<iDoubleStoredColourMatrix<vsimd>> DoubledGaugeField;
+
+    assert(Ns == 4 and Nd == 4 and Nc == 3);
+
+    auto grid = Umu.Grid();
+
+    typedef ColourMatrixD fobj;
+
+    std::vector<fobj> iodata(
+      Nd * grid->lSites()); // actual size = 2*Nd*lsites but have only lsites/2 sites in file
+
+    {
+      ParRdr rdr(MPI_COMM_WORLD, file, grid);
+      rdr.readGauge(iodata, header);
+    } // equivalent to using binaryio
+
+    std::vector<iDoubleStoredColourMatrix<typename vsimd::scalar_type>> Umu_ds_scalar(grid->lSites());
+
+    copyToLatticeObject(Umu_ds_scalar, iodata, grid); // equivalent to munging
+
+    DoubledGaugeField Umu_ds(grid);
+
+    vectorizeFromLexOrdArray(Umu_ds_scalar, Umu_ds);
+
+    redistribute(Umu, Umu_ds); // equivalent to undoDoublestore
+
+    FieldMetaData clone(header);
+
+    GaugeStatistics(Umu, clone);
+
+    RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
+
+    // clang-format off
+    std::cout << GridLogMessage << "OpenQcd Configuration " << file
+              << " plaquette " << clone.plaquette
+              << " header " << header.plaquette
+              << " difference " << plaq_diff
+              << std::endl;
+    // clang-format on
+
+    RealD precTol = (getPrecision<vsimd>::value == 1) ? 2e-7 : 2e-15;
+    RealD tol     = precTol * std::sqrt(grid->_Nprocessors); // taken from RQCD chroma code
+
+    if(plaq_diff >= tol)
+      std::cout << " Plaquette mismatch (diff = " << plaq_diff << ", tol = " << tol << ")" << std::endl;
+    assert(plaq_diff < tol);
+
+    std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl;
+  }
+
+private:
+  template<class vsimd>
+  static inline void redistribute(Lattice<iLorentzColourMatrix<vsimd>>&            Umu,
+                                  Lattice<iDoubleStoredColourMatrix<vsimd>> const& Umu_ds) {
+    Grid::conformable(Umu.Grid(), Umu_ds.Grid());
+    Lattice<iColourMatrix<vsimd>> U(Umu.Grid());
+
+    U = PeekIndex<LorentzIndex>(Umu_ds, 2) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 3), 0, +1); PokeIndex<LorentzIndex>(Umu, U, 0);
+    U = PeekIndex<LorentzIndex>(Umu_ds, 4) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 5), 1, +1); PokeIndex<LorentzIndex>(Umu, U, 1);
+    U = PeekIndex<LorentzIndex>(Umu_ds, 6) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 7), 2, +1); PokeIndex<LorentzIndex>(Umu, U, 2);
+    U = PeekIndex<LorentzIndex>(Umu_ds, 0) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 1), 3, +1); PokeIndex<LorentzIndex>(Umu, U, 3);
+  }
+
+  static inline void copyToLatticeObject(std::vector<DoubleStoredColourMatrix>& u_fb,
+                                         std::vector<ColourMatrixD> const&      node_buff,
+                                         GridBase*                              grid) {
+    assert(node_buff.size() == Nd * grid->lSites());
+
+    Coordinate const& l = grid->LocalDimensions();
+
+    Coordinate coord(Nd);
+    int&       x = coord[0];
+    int&       y = coord[1];
+    int&       z = coord[2];
+    int&       t = coord[3];
+
+    int buff_idx = 0;
+    for(t = 0; t < l[3]; ++t) // IMPORTANT: openQCD file ordering
+      for(x = 0; x < l[0]; ++x)
+        for(y = 0; y < l[1]; ++y)
+          for(z = 0; z < l[2]; ++z) {
+            if((t + z + y + x) % 2 == 0) continue;
+
+            int local_idx;
+            Lexicographic::IndexFromCoor(coord, local_idx, grid->LocalDimensions());
+            for(int mu = 0; mu < 2 * Nd; ++mu)
+              for(int c1 = 0; c1 < Nc; ++c1) {
+                for(int c2 = 0; c2 < Nc; ++c2) {
+                  u_fb[local_idx](mu)()(c1,c2) = node_buff[mu+buff_idx]()()(c1,c2);
+                }
+              }
+            buff_idx += 2 * Nd;
+          }
+
+    assert(node_buff.size() == buff_idx);
+  }
+};
+
+NAMESPACE_END(Grid);
diff --git a/Grid/qcd/hmc/HMC_aggregate.h b/Grid/qcd/hmc/HMC_aggregate.h
index 94c745e1..cb510953 100644
--- a/Grid/qcd/hmc/HMC_aggregate.h
+++ b/Grid/qcd/hmc/HMC_aggregate.h
@@ -40,6 +40,9 @@ directory
 #include <Grid/parallelIO/IldgIO.h>
 #include <Grid/parallelIO/NerscIO.h>
 #include <Grid/parallelIO/OpenQcdIO.h>
+#if !defined(GRID_COMMS_NONE)
+#include <Grid/parallelIO/OpenQcdIOChromaReference.h>
+#endif
 NAMESPACE_CHECK(Ildg);
 
 #include <Grid/qcd/hmc/checkpointers/CheckPointers.h>
diff --git a/tests/IO/Test_openqcd_io.cc b/tests/IO/Test_openqcd_io.cc
index 2a5769bd..83b498c2 100644
--- a/tests/IO/Test_openqcd_io.cc
+++ b/tests/IO/Test_openqcd_io.cc
@@ -28,28 +28,57 @@ See the full license in the file "LICENSE" in the top level distribution directo
 
 #include <Grid/Grid.h>
 
+#if defined(GRID_COMMS_NONE)
+#error This test requires Grid compiled with MPI
+#endif
+
 using namespace Grid;
 
 int main(int argc, char** argv) {
   Grid_init(&argc, &argv);
 
-  GridCartesian* grid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
-                                                       GridDefaultSimd(Nd, vComplexD::Nsimd()),
-                                                       GridDefaultMpi());
+  auto simd_layout = GridDefaultSimd(Nd, vComplex::Nsimd());
+  auto mpi_layout  = GridDefaultMpi();
+  auto latt_size   = GridDefaultLatt();
 
-  LatticeGaugeField Umu(grid);
+  GridCartesian grid(latt_size, simd_layout, mpi_layout);
 
-  FieldMetaData header;
+  GridParallelRNG pRNG(&grid);
 
-  if(!Grid::GridCmdOptionExists(argv, argv + argc, "--config")) {
-    std::cout << GridLogError << "You need to use --config /path/to/openqcd_config" << std::endl;
-    abort();
+  pRNG.SeedFixedIntegers(std::vector<int>({45, 12, 81, 9}));
+
+  LatticeGaugeField Umu_ref(&grid);
+  LatticeGaugeField Umu_me(&grid);
+  LatticeGaugeField Umu_diff(&grid);
+
+  FieldMetaData header_ref;
+  FieldMetaData header_me;
+
+  Umu_ref = Zero();
+  Umu_me  = Zero();
+
+  std::string file("/home/daniel/configs/openqcd/test_16x8_pbcn6");
+
+  if(GridCmdOptionExists(argv, argv + argc, "--config")) {
+    file = GridCmdOptionPayload(argv, argv + argc, "--config");
+    std::cout << "file: " << file << std::endl;
+    assert(!file.empty());
   }
 
-  std::string file = Grid::GridCmdOptionPayload(argv, argv + argc, "--config");
-  assert(!file.empty());
+  OpenQcdIOChromaReference::readConfiguration(Umu_ref, header_ref, file);
+  OpenQcdIO::readConfiguration(Umu_me, header_me, file);
 
-  OpenQcdIO::readConfiguration(Umu, header, file);
+  std::cout << GridLogMessage << header_ref << std::endl;
+  std::cout << GridLogMessage << header_me << std::endl;
+
+  Umu_diff = Umu_ref - Umu_me;
+
+  // clang-format off
+  std::cout << GridLogMessage
+            << "norm2(Umu_ref) = " << norm2(Umu_ref)
+            << " norm2(Umu_me) = " << norm2(Umu_me)
+            << " norm2(Umu_diff) = " << norm2(Umu_diff) << std::endl;
+  // clang-format on
 
   Grid_finalize();
 }