From 6e4a06e180f7500df13ceea362b71294b8da74ff Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Thu, 20 Oct 2016 15:04:00 +0100
Subject: [PATCH 001/177] qed-fvol: initial commit

---
 Makefile.am                   |  2 +-
 configure.ac                  |  2 ++
 programs/Makefile.am          |  1 +
 programs/qed-fvol/Global.cc   | 11 +++++++++
 programs/qed-fvol/Global.hpp  | 42 +++++++++++++++++++++++++++++++++++
 programs/qed-fvol/Makefile.am |  9 ++++++++
 programs/qed-fvol/qed-fvol.cc | 36 ++++++++++++++++++++++++++++++
 7 files changed, 102 insertions(+), 1 deletion(-)
 create mode 100644 programs/Makefile.am
 create mode 100644 programs/qed-fvol/Global.cc
 create mode 100644 programs/qed-fvol/Global.hpp
 create mode 100644 programs/qed-fvol/Makefile.am
 create mode 100644 programs/qed-fvol/qed-fvol.cc

diff --git a/Makefile.am b/Makefile.am
index 90c5cd71..8cc860a9 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,5 +1,5 @@
 # additional include paths necessary to compile the C++ library
-SUBDIRS = lib benchmarks tests
+SUBDIRS = lib benchmarks tests programs
 
 AM_CXXFLAGS += -I$(top_builddir)/include
 ACLOCAL_AMFLAGS = -I m4
diff --git a/configure.ac b/configure.ac
index 7bcdc49f..81ced467 100644
--- a/configure.ac
+++ b/configure.ac
@@ -326,6 +326,8 @@ AC_CONFIG_FILES(tests/hmc/Makefile)
 AC_CONFIG_FILES(tests/solver/Makefile)
 AC_CONFIG_FILES(tests/qdpxx/Makefile)
 AC_CONFIG_FILES(benchmarks/Makefile)
+AC_CONFIG_FILES(programs/Makefile)
+AC_CONFIG_FILES(programs/qed-fvol/Makefile)
 AC_OUTPUT
 
 echo "
diff --git a/programs/Makefile.am b/programs/Makefile.am
new file mode 100644
index 00000000..ff7f6584
--- /dev/null
+++ b/programs/Makefile.am
@@ -0,0 +1 @@
+SUBDIRS = qed-fvol
diff --git a/programs/qed-fvol/Global.cc b/programs/qed-fvol/Global.cc
new file mode 100644
index 00000000..57ed97cc
--- /dev/null
+++ b/programs/qed-fvol/Global.cc
@@ -0,0 +1,11 @@
+#include <qed-fvol/Global.hpp>
+
+using namespace Grid;
+using namespace QCD;
+using namespace QedFVol;
+
+QedFVolLogger QedFVol::QedFVolLogError(1,"Error");
+QedFVolLogger QedFVol::QedFVolLogWarning(1,"Warning");
+QedFVolLogger QedFVol::QedFVolLogMessage(1,"Message");
+QedFVolLogger QedFVol::QedFVolLogIterative(1,"Iterative");
+QedFVolLogger QedFVol::QedFVolLogDebug(1,"Debug");
diff --git a/programs/qed-fvol/Global.hpp b/programs/qed-fvol/Global.hpp
new file mode 100644
index 00000000..7f07200d
--- /dev/null
+++ b/programs/qed-fvol/Global.hpp
@@ -0,0 +1,42 @@
+#ifndef QedFVol_Global_hpp_
+#define QedFVol_Global_hpp_
+
+#include <Grid/Grid.h>
+
+#define BEGIN_QEDFVOL_NAMESPACE \
+namespace Grid {\
+using namespace QCD;\
+namespace QedFVol {\
+using Grid::operator<<;
+#define END_QEDFVOL_NAMESPACE }}
+
+/* the 'using Grid::operator<<;' statement prevents a very nasty compilation
+ * error with GCC (clang compiles fine without it).
+ */
+
+BEGIN_QEDFVOL_NAMESPACE
+
+class QedFVolLogger: public Logger
+{
+public:
+    QedFVolLogger(int on, std::string nm): Logger("QedFVol", on, nm,
+                                                  GridLogColours, "BLACK"){};
+};
+
+#define LOG(channel) std::cout << QedFVolLog##channel
+#define QEDFVOL_ERROR(msg)\
+LOG(Error) << msg << " (" << __FUNCTION__ << " at " << __FILE__ << ":"\
+           << __LINE__ << ")" << std::endl;\
+abort();
+
+#define DEBUG_VAR(var) LOG(Debug) << #var << "= " << (var) << std::endl;
+
+extern QedFVolLogger QedFVolLogError;
+extern QedFVolLogger QedFVolLogWarning;
+extern QedFVolLogger QedFVolLogMessage;
+extern QedFVolLogger QedFVolLogIterative;
+extern QedFVolLogger QedFVolLogDebug;
+
+END_QEDFVOL_NAMESPACE
+
+#endif // QedFVol_Global_hpp_
diff --git a/programs/qed-fvol/Makefile.am b/programs/qed-fvol/Makefile.am
new file mode 100644
index 00000000..cd762e94
--- /dev/null
+++ b/programs/qed-fvol/Makefile.am
@@ -0,0 +1,9 @@
+AM_CXXFLAGS += -I$(top_srcdir)/programs -I../$(top_srcdir)/programs
+
+bin_PROGRAMS = qed-fvol
+
+qed_fvol_SOURCES =   \
+    qed-fvol.cc      \
+    Global.cc
+
+qed_fvol_LDADD   = -lGrid
diff --git a/programs/qed-fvol/qed-fvol.cc b/programs/qed-fvol/qed-fvol.cc
new file mode 100644
index 00000000..bb3204c6
--- /dev/null
+++ b/programs/qed-fvol/qed-fvol.cc
@@ -0,0 +1,36 @@
+#include <Global.hpp>
+
+using namespace Grid;
+using namespace QCD;
+using namespace QedFVol;
+
+int main(int argc, char *argv[])
+{
+    // parse command line
+    std::string parameterFileName;
+    
+    if (argc < 2)
+    {
+        std::cerr << "usage: " << argv[0] << " <parameter file> [Grid options]";
+        std::cerr << std::endl;
+        std::exit(EXIT_FAILURE);
+    }
+    parameterFileName = argv[1];
+    
+    // initialization
+    Grid_init(&argc, &argv);
+    QedFVolLogError.Active(GridLogError.isActive());
+    QedFVolLogWarning.Active(GridLogWarning.isActive());
+    QedFVolLogMessage.Active(GridLogMessage.isActive());
+    QedFVolLogIterative.Active(GridLogIterative.isActive());
+    QedFVolLogDebug.Active(GridLogDebug.isActive());
+    LOG(Message) << "Grid initialized" << std::endl;
+    
+
+    
+    // epilogue
+    LOG(Message) << "Grid is finalizing now" << std::endl;
+    Grid_finalize();
+    
+    return EXIT_SUCCESS;
+}

From 0d889b70410bfdaf70b5cbffe2fb92157943cc03 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Fri, 21 Oct 2016 15:21:32 +0100
Subject: [PATCH 002/177] QedFVol: first attempt at generating a QED field

---
 programs/qed-fvol/qed-fvol.cc | 38 ++++++++++++++++++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/programs/qed-fvol/qed-fvol.cc b/programs/qed-fvol/qed-fvol.cc
index bb3204c6..53e01de9 100644
--- a/programs/qed-fvol/qed-fvol.cc
+++ b/programs/qed-fvol/qed-fvol.cc
@@ -4,6 +4,31 @@ using namespace Grid;
 using namespace QCD;
 using namespace QedFVol;
 
+template <class S> 
+class QedGimpl 
+{
+public:
+  typedef S Simd;
+
+  template <typename vtype>
+  using iImplGaugeLink  = iScalar<iScalar<iScalar<vtype>>>;
+  template <typename vtype>
+  using iImplGaugeField = iVector<iScalar<iScalar<vtype>>, Nd>;
+
+  typedef iImplGaugeLink<Simd> SiteGaugeLink;
+  typedef iImplGaugeField<Simd> SiteGaugeField;
+
+  typedef Lattice<SiteGaugeLink> GaugeLinkField; // bit ugly naming; polarised
+                                                 // gauge field, lorentz... all
+                                                 // ugly
+  typedef Lattice<SiteGaugeField> GaugeField;
+};
+
+typedef QedGimpl<vComplex>      QedGimplR;
+typedef Photon<QedGimplR>       PhotonR;
+typedef PhotonR::GaugeField     EmField;
+typedef PhotonR::GaugeLinkField EmComp;
+
 int main(int argc, char *argv[])
 {
     // parse command line
@@ -26,8 +51,19 @@ int main(int argc, char *argv[])
     QedFVolLogDebug.Active(GridLogDebug.isActive());
     LOG(Message) << "Grid initialized" << std::endl;
     
+    // QED stuff
+    std::vector<int> latt_size   = GridDefaultLatt();
+    std::vector<int> simd_layout = GridDefaultSimd(4, vComplex::Nsimd());
+    std::vector<int> mpi_layout  = GridDefaultMpi();
+    GridCartesian    grid(latt_size,simd_layout,mpi_layout);
+    GridParallelRNG  pRNG(&grid);
+    PhotonR          photon(PhotonR::Gauge::Feynman,
+                            PhotonR::ZmScheme::QedL);
+    EmField          a(&grid);
+
+    pRNG.SeedRandomDevice();
+    photon.StochasticField(a, pRNG);
 
-    
     // epilogue
     LOG(Message) << "Grid is finalizing now" << std::endl;
     Grid_finalize();

From 3ab4c8c0bbde6a572d41074405c2baa8e9a0119c Mon Sep 17 00:00:00 2001
From: James Harrison <jch1g10@soton.ac.uk>
Date: Tue, 25 Oct 2016 13:32:02 +0100
Subject: [PATCH 003/177] QedFVol: calculate plaquette and 2x2 Wilson loop of
 stochastic QED field

---
 programs/qed-fvol/WilsonLoops.h | 167 ++++++++++++++++++++++++++++++++
 programs/qed-fvol/qed-fvol.cc   |  45 +++++++++
 2 files changed, 212 insertions(+)
 create mode 100644 programs/qed-fvol/WilsonLoops.h

diff --git a/programs/qed-fvol/WilsonLoops.h b/programs/qed-fvol/WilsonLoops.h
new file mode 100644
index 00000000..610fdc7b
--- /dev/null
+++ b/programs/qed-fvol/WilsonLoops.h
@@ -0,0 +1,167 @@
+#ifndef QEDFVOL_WILSONLOOPS_H
+#define QEDFVOL_WILSONLOOPS_H
+
+#include <Global.hpp>
+
+BEGIN_QEDFVOL_NAMESPACE
+
+template <class Gimpl> class WilsonLoops : public Gimpl {
+public:
+  INHERIT_GIMPL_TYPES(Gimpl);
+
+  typedef typename Gimpl::GaugeLinkField GaugeMat;
+  typedef typename Gimpl::GaugeField GaugeLorentz;
+
+  //////////////////////////////////////////////////
+  // directed plaquette oriented in mu,nu plane
+  //////////////////////////////////////////////////
+  static void dirPlaquette(GaugeMat &plaq, const std::vector<GaugeMat> &U,
+                           const int mu, const int nu) {
+    // Annoyingly, must use either scope resolution to find dependent base
+    // class,
+    // or this-> ; there is no "this" in a static method. This forces explicit
+    // Gimpl scope
+    // resolution throughout the usage in this file, and rather defeats the
+    // purpose of deriving
+    // from Gimpl.
+    plaq = Gimpl::CovShiftBackward(
+        U[mu], mu, Gimpl::CovShiftBackward(
+                       U[nu], nu, Gimpl::CovShiftForward(U[mu], mu, U[nu])));
+  }
+  //////////////////////////////////////////////////
+  // trace of directed plaquette oriented in mu,nu plane
+  //////////////////////////////////////////////////
+  static void traceDirPlaquette(LatticeComplex &plaq,
+                                const std::vector<GaugeMat> &U, const int mu,
+                                const int nu) {
+    GaugeMat sp(U[0]._grid);
+    dirPlaquette(sp, U, mu, nu);
+    plaq = trace(sp);
+  }
+  //////////////////////////////////////////////////
+  // sum over all planes of plaquette
+  //////////////////////////////////////////////////
+  static void sitePlaquette(LatticeComplex &Plaq,
+                            const std::vector<GaugeMat> &U) {
+    LatticeComplex sitePlaq(U[0]._grid);
+    Plaq = zero;
+    for (int mu = 1; mu < Nd; mu++) {
+      for (int nu = 0; nu < mu; nu++) {
+        traceDirPlaquette(sitePlaq, U, mu, nu);
+        Plaq = Plaq + sitePlaq;
+      }
+    }
+  }
+  //////////////////////////////////////////////////
+  // sum over all x,y,z,t and over all planes of plaquette
+  //////////////////////////////////////////////////
+  static RealD sumPlaquette(const GaugeLorentz &Umu) {
+    std::vector<GaugeMat> U(4, Umu._grid);
+
+    for (int mu = 0; mu < Nd; mu++) {
+      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
+    }
+
+    LatticeComplex Plaq(Umu._grid);
+
+    sitePlaquette(Plaq, U);
+
+    TComplex Tp = sum(Plaq);
+    Complex p = TensorRemove(Tp);
+    return p.real();
+  }
+  //////////////////////////////////////////////////
+  // average over all x,y,z,t and over all planes of plaquette
+  //////////////////////////////////////////////////
+  static RealD avgPlaquette(const GaugeLorentz &Umu) {
+    RealD sumplaq = sumPlaquette(Umu);
+    double vol = Umu._grid->gSites();
+    double faces = (1.0 * Nd * (Nd - 1)) / 2.0;
+    return sumplaq / vol / faces / Nc; // Nd , Nc dependent... FIXME
+  }
+
+  //////////////////////////////////////////////////
+  // Wilson loop of size (R1, R2), oriented in mu,nu plane
+  //////////////////////////////////////////////////
+  static void wilsonLoop(GaugeMat &wl, const std::vector<GaugeMat> &U,
+                           const int Rmu, const int Rnu,
+                           const int mu, const int nu) {
+    wl = U[nu];
+
+    for(int i = 0; i < Rnu-1; i++){
+      wl = Gimpl::CovShiftForward(U[nu], nu, wl);
+    }
+
+    for(int i = 0; i < Rmu; i++){
+      wl = Gimpl::CovShiftForward(U[mu], mu, wl);
+    }
+
+    for(int i = 0; i < Rnu; i++){
+      wl = Gimpl::CovShiftBackward(U[nu], nu, wl);
+    }
+
+    for(int i = 0; i < Rmu; i++){
+      wl = Gimpl::CovShiftBackward(U[mu], mu, wl);
+    }
+  }
+  //////////////////////////////////////////////////
+  // trace of Wilson Loop oriented in mu,nu plane
+  //////////////////////////////////////////////////
+  static void traceWilsonLoop(LatticeComplex &wl,
+                                const std::vector<GaugeMat> &U,
+                                const int Rmu, const int Rnu,
+                                const int mu, const int nu) {
+    GaugeMat sp(U[0]._grid);
+    WilsonLoop(sp, U, Rmu, Rnu, mu, nu);
+    wl = trace(sp);
+  }
+  //////////////////////////////////////////////////
+  // sum over all planes of Wilson loop
+  //////////////////////////////////////////////////
+  static void siteWilsonLoop(LatticeComplex &Wl,
+                            const std::vector<GaugeMat> &U
+                            const int R1, const int R2) {
+    LatticeComplex siteWl(U[0]._grid);
+    Wl = zero;
+    for (int mu = 1; mu < Nd; mu++) {
+      for (int nu = 0; nu < mu; nu++) {
+        traceWilsonLoop(siteWl, U, R1, R2, mu, nu);
+        Wl = Wl + siteWl;
+        traceWilsonLoop(siteWl, U, R2, R1, mu, nu);
+        Wl = Wl + siteWl;
+      }
+    }
+  }
+  //////////////////////////////////////////////////
+  // sum over all x,y,z,t and over all planes of Wilson loop
+  //////////////////////////////////////////////////
+  static RealD sumWilsonLoop(const GaugeLorentz &Umu,
+                            const int R1, const int R2) {
+    std::vector<GaugeMat> U(4, Umu._grid);
+
+    for (int mu = 0; mu < Nd; mu++) {
+      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
+    }
+
+    LatticeComplex Wl(Umu._grid);
+
+    siteWilsonLoop(Wl, U, R1, R2);
+
+    TComplex Tp = sum(Wl);
+    Complex p = TensorRemove(Tp);
+    return p.real();
+  }
+  //////////////////////////////////////////////////
+  // average over all x,y,z,t and over all planes of Wilson loop
+  //////////////////////////////////////////////////
+  static RealD avgPlaquette(const GaugeLorentz &Umu,
+                            const int R1, const int R2) {
+    RealD sumWl = sumWilsonLoop(Umu);
+    double vol = Umu._grid->gSites();
+    double faces = 1.0 * Nd * (Nd - 1);
+    return sumWl / vol / faces / Nc; // Nd , Nc dependent... FIXME
+  }
+
+END_QEDFVOL_NAMESPACE
+
+#endif // QEDFVOL_WILSONLOOPS_H
\ No newline at end of file
diff --git a/programs/qed-fvol/qed-fvol.cc b/programs/qed-fvol/qed-fvol.cc
index 53e01de9..02c36a67 100644
--- a/programs/qed-fvol/qed-fvol.cc
+++ b/programs/qed-fvol/qed-fvol.cc
@@ -64,6 +64,51 @@ int main(int argc, char *argv[])
     pRNG.SeedRandomDevice();
     photon.StochasticField(a, pRNG);
 
+    // Calculate log of plaquette
+    EmComp              plaqA(&grid);
+    EmComp              wlA(&grid);
+    EmComp              tmp(&grid);
+    std::vector<EmComp> a_comp(4, &grid);
+
+    for (int dir = 0; dir < Nd; dir++) {
+      a_comp[dir] = PeekIndex<LorentzIndex>(a, dir);
+    }
+
+    plaqA = zero;
+    wlA = zero;
+
+    for(int mu = 1; mu < Nd; mu++) {
+        for(int nu = 0; nu < mu; nu++) {
+            tmp = a_comp[mu] + Cshift(a_comp[nu], mu, 1) - Cshift(a_comp[mu], nu, 1) - a_comp[nu];
+            plaqA = plaqA + cos(tmp);
+
+            tmp = a_comp[mu] + Cshift(a_comp[mu], mu, 1)
+                  + Cshift(a_comp[nu], mu, 2) + Cshift(Cshift(a_comp[nu], mu, 2), nu, 1)
+                  - Cshift(Cshift(a_comp[mu], nu, 2), mu, 1) - Cshift(a_comp[mu], nu, 2)
+                  - Cshift(a_comp[nu], nu, 1) - a_comp[nu];
+            wlA = wlA + cos(tmp);
+        }
+    }
+
+    double vol = grid.gSites();
+    double faces = (1.0 * Nd * (Nd - 1)) / 2.0;
+
+    Complex avgPlaqA = sum(trace(plaqA));
+    avgPlaqA = avgPlaqA / vol / faces;
+
+    Complex avgWlA = sum(trace(wlA));
+    avgWlA = avgWlA / vol / faces;
+
+    TComplex tplaqsite;
+    LatticeComplex plaqtrace = trace(plaqA);
+    std::vector<int> site0 = {0,0,0,0};
+    peekSite(tplaqsite, plaqtrace, site0);
+    Complex plaqsite = TensorRemove(tplaqsite);
+
+    LOG(Message) << "Plaquette average: " << avgPlaqA << std::endl;
+    LOG(Message) << "2x2 Wilson Loop average: " << avgWlA << std::endl;
+    LOG(Message) << "Plaquette (one site): " << plaqsite / faces << std::endl;
+
     // epilogue
     LOG(Message) << "Grid is finalizing now" << std::endl;
     Grid_finalize();

From 78c7bcee36f7d937c8f5c6afe0f2088f85ebda51 Mon Sep 17 00:00:00 2001
From: James Harrison <jch1g10@soton.ac.uk>
Date: Tue, 1 Nov 2016 13:30:11 +0000
Subject: [PATCH 004/177] QedFVol: Change variables of type "double" to type
 "Real".

---
 programs/qed-fvol/qed-fvol.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/programs/qed-fvol/qed-fvol.cc b/programs/qed-fvol/qed-fvol.cc
index 02c36a67..fd780edf 100644
--- a/programs/qed-fvol/qed-fvol.cc
+++ b/programs/qed-fvol/qed-fvol.cc
@@ -90,8 +90,8 @@ int main(int argc, char *argv[])
         }
     }
 
-    double vol = grid.gSites();
-    double faces = (1.0 * Nd * (Nd - 1)) / 2.0;
+    Real vol = grid.gSites();
+    Real faces = (1.0 * Nd * (Nd - 1)) / 2.0;
 
     Complex avgPlaqA = sum(trace(plaqA));
     avgPlaqA = avgPlaqA / vol / faces;

From c30d96ea5097ab1760a3f0a6ea1aed8ac1e6142b Mon Sep 17 00:00:00 2001
From: James Harrison <jch1g10@soton.ac.uk>
Date: Wed, 9 Nov 2016 11:06:20 +0000
Subject: [PATCH 005/177] QedFVol: x86intrin.h namespace fix

---
 lib/PerfCount.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/PerfCount.h b/lib/PerfCount.h
index 9ac58883..5ab07c02 100644
--- a/lib/PerfCount.h
+++ b/lib/PerfCount.h
@@ -43,6 +43,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #else
 #include <sys/syscall.h>
 #endif
+#ifdef __x86_64__
+#include <x86intrin.h>
+#endif
 
 namespace Grid {
 
@@ -86,7 +89,6 @@ inline uint64_t cyclecount(void){
    return tmp;
 }
 #elif defined __x86_64__
-#include <x86intrin.h>
 inline uint64_t cyclecount(void){ 
   return __rdtsc();
   //  unsigned int dummy;

From cf167d0cd1c561bed3557eaf89350b8d8eb8d9b1 Mon Sep 17 00:00:00 2001
From: James Harrison <jch1g10@soton.ac.uk>
Date: Mon, 14 Nov 2016 17:02:29 +0000
Subject: [PATCH 006/177] QedFVol: implement exponentiation of photon field

---
 programs/qed-fvol/WilsonLoops.h | 19 ++++++++++---------
 programs/qed-fvol/qed-fvol.cc   | 32 +++++++++++++++++++++++++-------
 2 files changed, 35 insertions(+), 16 deletions(-)

diff --git a/programs/qed-fvol/WilsonLoops.h b/programs/qed-fvol/WilsonLoops.h
index 610fdc7b..c40fbaf3 100644
--- a/programs/qed-fvol/WilsonLoops.h
+++ b/programs/qed-fvol/WilsonLoops.h
@@ -5,7 +5,7 @@
 
 BEGIN_QEDFVOL_NAMESPACE
 
-template <class Gimpl> class WilsonLoops : public Gimpl {
+template <class Gimpl> class NewWilsonLoops : public Gimpl {
 public:
   INHERIT_GIMPL_TYPES(Gimpl);
 
@@ -55,7 +55,7 @@ public:
   //////////////////////////////////////////////////
   // sum over all x,y,z,t and over all planes of plaquette
   //////////////////////////////////////////////////
-  static RealD sumPlaquette(const GaugeLorentz &Umu) {
+  static Real sumPlaquette(const GaugeLorentz &Umu) {
     std::vector<GaugeMat> U(4, Umu._grid);
 
     for (int mu = 0; mu < Nd; mu++) {
@@ -73,8 +73,8 @@ public:
   //////////////////////////////////////////////////
   // average over all x,y,z,t and over all planes of plaquette
   //////////////////////////////////////////////////
-  static RealD avgPlaquette(const GaugeLorentz &Umu) {
-    RealD sumplaq = sumPlaquette(Umu);
+  static Real avgPlaquette(const GaugeLorentz &Umu) {
+    Real sumplaq = sumPlaquette(Umu);
     double vol = Umu._grid->gSites();
     double faces = (1.0 * Nd * (Nd - 1)) / 2.0;
     return sumplaq / vol / faces / Nc; // Nd , Nc dependent... FIXME
@@ -112,14 +112,14 @@ public:
                                 const int Rmu, const int Rnu,
                                 const int mu, const int nu) {
     GaugeMat sp(U[0]._grid);
-    WilsonLoop(sp, U, Rmu, Rnu, mu, nu);
+    wilsonLoop(sp, U, Rmu, Rnu, mu, nu);
     wl = trace(sp);
   }
   //////////////////////////////////////////////////
   // sum over all planes of Wilson loop
   //////////////////////////////////////////////////
   static void siteWilsonLoop(LatticeComplex &Wl,
-                            const std::vector<GaugeMat> &U
+                            const std::vector<GaugeMat> &U,
                             const int R1, const int R2) {
     LatticeComplex siteWl(U[0]._grid);
     Wl = zero;
@@ -135,7 +135,7 @@ public:
   //////////////////////////////////////////////////
   // sum over all x,y,z,t and over all planes of Wilson loop
   //////////////////////////////////////////////////
-  static RealD sumWilsonLoop(const GaugeLorentz &Umu,
+  static Real sumWilsonLoop(const GaugeLorentz &Umu,
                             const int R1, const int R2) {
     std::vector<GaugeMat> U(4, Umu._grid);
 
@@ -154,13 +154,14 @@ public:
   //////////////////////////////////////////////////
   // average over all x,y,z,t and over all planes of Wilson loop
   //////////////////////////////////////////////////
-  static RealD avgPlaquette(const GaugeLorentz &Umu,
+  static Real avgWilsonLoop(const GaugeLorentz &Umu,
                             const int R1, const int R2) {
-    RealD sumWl = sumWilsonLoop(Umu);
+    Real sumWl = sumWilsonLoop(Umu, R1, R2);
     double vol = Umu._grid->gSites();
     double faces = 1.0 * Nd * (Nd - 1);
     return sumWl / vol / faces / Nc; // Nd , Nc dependent... FIXME
   }
+};
 
 END_QEDFVOL_NAMESPACE
 
diff --git a/programs/qed-fvol/qed-fvol.cc b/programs/qed-fvol/qed-fvol.cc
index fd780edf..68705b8f 100644
--- a/programs/qed-fvol/qed-fvol.cc
+++ b/programs/qed-fvol/qed-fvol.cc
@@ -1,4 +1,5 @@
 #include <Global.hpp>
+#include <WilsonLoops.h>
 
 using namespace Grid;
 using namespace QCD;
@@ -24,10 +25,11 @@ public:
   typedef Lattice<SiteGaugeField> GaugeField;
 };
 
-typedef QedGimpl<vComplex>      QedGimplR;
-typedef Photon<QedGimplR>       PhotonR;
-typedef PhotonR::GaugeField     EmField;
-typedef PhotonR::GaugeLinkField EmComp;
+typedef QedGimpl<vComplex>              QedGimplR;
+typedef PeriodicGaugeImpl<QedGimplR>    QedPeriodicGimplR;
+typedef Photon<QedGimplR>               PhotonR;
+typedef PhotonR::GaugeField             EmField;
+typedef PhotonR::GaugeLinkField         EmComp;
 
 int main(int argc, char *argv[])
 {
@@ -60,11 +62,18 @@ int main(int argc, char *argv[])
     PhotonR          photon(PhotonR::Gauge::Feynman,
                             PhotonR::ZmScheme::QedL);
     EmField          a(&grid);
+    EmField          expA(&grid);
+
+    Real avgPlaqAexp, avgWl2x2Aexp;
 
     pRNG.SeedRandomDevice();
     photon.StochasticField(a, pRNG);
 
-    // Calculate log of plaquette
+    // Exponentiate photon field
+    Complex imag_unit(0, 1);
+    expA = exp(imag_unit*0.5*(a+conjugate(a)));
+
+    // Calculate plaquette from photon field
     EmComp              plaqA(&grid);
     EmComp              wlA(&grid);
     EmComp              tmp(&grid);
@@ -105,8 +114,17 @@ int main(int argc, char *argv[])
     peekSite(tplaqsite, plaqtrace, site0);
     Complex plaqsite = TensorRemove(tplaqsite);
 
-    LOG(Message) << "Plaquette average: " << avgPlaqA << std::endl;
-    LOG(Message) << "2x2 Wilson Loop average: " << avgWlA << std::endl;
+    // Calculate plaquette from exponentiated photon field
+    avgPlaqAexp = NewWilsonLoops<QedPeriodicGimplR>::avgPlaquette(expA);
+    avgWl2x2Aexp = NewWilsonLoops<QedPeriodicGimplR>::avgWilsonLoop(expA, 2, 2);
+
+    avgPlaqAexp = avgPlaqAexp*3;
+    avgWl2x2Aexp = avgWl2x2Aexp*3;
+
+    LOG(Message) << "Plaquette average (from A): " << avgPlaqA << std::endl;
+    LOG(Message) << "Plaquette average (from exp(A)): " << avgPlaqAexp << std::endl;
+    LOG(Message) << "2x2 Wilson Loop average (from A): " << avgWlA << std::endl;
+    LOG(Message) << "2x2 Wilson Loop average (from exp(A)): " << avgWl2x2Aexp << std::endl;
     LOG(Message) << "Plaquette (one site): " << plaqsite / faces << std::endl;
 
     // epilogue

From f4ebea3381046026276864f3f908cb10b114d6a5 Mon Sep 17 00:00:00 2001
From: James Harrison <jch1g10@soton.ac.uk>
Date: Mon, 14 Nov 2016 17:51:53 +0000
Subject: [PATCH 007/177] QedFVol: add functions for computing spatial and
 timelike Wilson loops

---
 programs/qed-fvol/WilsonLoops.h | 117 +++++++++++++++++++++++++++++---
 programs/qed-fvol/qed-fvol.cc   |   8 ++-
 2 files changed, 114 insertions(+), 11 deletions(-)

diff --git a/programs/qed-fvol/WilsonLoops.h b/programs/qed-fvol/WilsonLoops.h
index c40fbaf3..98db6b7a 100644
--- a/programs/qed-fvol/WilsonLoops.h
+++ b/programs/qed-fvol/WilsonLoops.h
@@ -45,7 +45,7 @@ public:
                             const std::vector<GaugeMat> &U) {
     LatticeComplex sitePlaq(U[0]._grid);
     Plaq = zero;
-    for (int mu = 1; mu < Nd; mu++) {
+    for (int mu = 1; mu < U[0]._grid->_ndimension; mu++) {
       for (int nu = 0; nu < mu; nu++) {
         traceDirPlaquette(sitePlaq, U, mu, nu);
         Plaq = Plaq + sitePlaq;
@@ -58,7 +58,7 @@ public:
   static Real sumPlaquette(const GaugeLorentz &Umu) {
     std::vector<GaugeMat> U(4, Umu._grid);
 
-    for (int mu = 0; mu < Nd; mu++) {
+    for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
       U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
     }
 
@@ -74,10 +74,11 @@ public:
   // average over all x,y,z,t and over all planes of plaquette
   //////////////////////////////////////////////////
   static Real avgPlaquette(const GaugeLorentz &Umu) {
+    int ndim = Umu._grid->_ndimension;
     Real sumplaq = sumPlaquette(Umu);
-    double vol = Umu._grid->gSites();
-    double faces = (1.0 * Nd * (Nd - 1)) / 2.0;
-    return sumplaq / vol / faces / Nc; // Nd , Nc dependent... FIXME
+    Real vol = Umu._grid->gSites();
+    Real faces = (1.0 * ndim * (ndim - 1)) / 2.0;
+    return sumplaq / vol / faces / Nc; // Nc dependent... FIXME
   }
 
   //////////////////////////////////////////////////
@@ -123,7 +124,42 @@ public:
                             const int R1, const int R2) {
     LatticeComplex siteWl(U[0]._grid);
     Wl = zero;
-    for (int mu = 1; mu < Nd; mu++) {
+    for (int mu = 1; mu < U[0]._grid->_ndimension; mu++) {
+      for (int nu = 0; nu < mu; nu++) {
+        traceWilsonLoop(siteWl, U, R1, R2, mu, nu);
+        Wl = Wl + siteWl;
+        traceWilsonLoop(siteWl, U, R2, R1, mu, nu);
+        Wl = Wl + siteWl;
+      }
+    }
+  }
+  //////////////////////////////////////////////////
+  // sum over planes of Wilson loop with length R1
+  // in the time direction
+  //////////////////////////////////////////////////
+  static void siteTimelikeWilsonLoop(LatticeComplex &Wl,
+                            const std::vector<GaugeMat> &U,
+                            const int R1, const int R2) {
+    LatticeComplex siteWl(U[0]._grid);
+
+    int ndim = U[0]._grid->_ndimension;
+
+    Wl = zero;
+    for (int nu = 0; nu < ndim - 1; nu++) {
+      traceWilsonLoop(siteWl, U, R1, R2, ndim-1, nu);
+      Wl = Wl + siteWl;
+    }
+  }
+  //////////////////////////////////////////////////
+  // sum Wilson loop over all planes orthogonal to the time direction
+  //////////////////////////////////////////////////
+  static void siteSpatialWilsonLoop(LatticeComplex &Wl,
+                            const std::vector<GaugeMat> &U,
+                            const int R1, const int R2) {
+    LatticeComplex siteWl(U[0]._grid);
+
+    Wl = zero;
+    for (int mu = 1; mu < U[0]._grid->_ndimension - 1; mu++) {
       for (int nu = 0; nu < mu; nu++) {
         traceWilsonLoop(siteWl, U, R1, R2, mu, nu);
         Wl = Wl + siteWl;
@@ -139,7 +175,7 @@ public:
                             const int R1, const int R2) {
     std::vector<GaugeMat> U(4, Umu._grid);
 
-    for (int mu = 0; mu < Nd; mu++) {
+    for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
       U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
     }
 
@@ -152,14 +188,75 @@ public:
     return p.real();
   }
   //////////////////////////////////////////////////
+  // sum over all x,y,z,t and over all planes of timelike Wilson loop
+  //////////////////////////////////////////////////
+  static Real sumTimelikeWilsonLoop(const GaugeLorentz &Umu,
+                            const int R1, const int R2) {
+    std::vector<GaugeMat> U(4, Umu._grid);
+
+    for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
+      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
+    }
+
+    LatticeComplex Wl(Umu._grid);
+
+    siteTimelikeWilsonLoop(Wl, U, R1, R2);
+
+    TComplex Tp = sum(Wl);
+    Complex p = TensorRemove(Tp);
+    return p.real();
+  }
+  //////////////////////////////////////////////////
+  // sum over all x,y,z,t and over all planes of spatial Wilson loop
+  //////////////////////////////////////////////////
+  static Real sumSpatialWilsonLoop(const GaugeLorentz &Umu,
+                            const int R1, const int R2) {
+    std::vector<GaugeMat> U(4, Umu._grid);
+
+    for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
+      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
+    }
+
+    LatticeComplex Wl(Umu._grid);
+
+    siteSpatialWilsonLoop(Wl, U, R1, R2);
+
+    TComplex Tp = sum(Wl);
+    Complex p = TensorRemove(Tp);
+    return p.real();
+  }
+  //////////////////////////////////////////////////
   // average over all x,y,z,t and over all planes of Wilson loop
   //////////////////////////////////////////////////
   static Real avgWilsonLoop(const GaugeLorentz &Umu,
                             const int R1, const int R2) {
+    int ndim = Umu._grid->_ndimension;
     Real sumWl = sumWilsonLoop(Umu, R1, R2);
-    double vol = Umu._grid->gSites();
-    double faces = 1.0 * Nd * (Nd - 1);
-    return sumWl / vol / faces / Nc; // Nd , Nc dependent... FIXME
+    Real vol = Umu._grid->gSites();
+    Real faces = 1.0 * ndim * (ndim - 1);
+    return sumWl / vol / faces / Nc; // Nc dependent... FIXME
+  }
+  //////////////////////////////////////////////////
+  // average over all x,y,z,t and over all planes of timelike Wilson loop
+  //////////////////////////////////////////////////
+  static Real avgTimelikeWilsonLoop(const GaugeLorentz &Umu,
+                            const int R1, const int R2) {
+    int ndim = Umu._grid->_ndimension;
+    Real sumWl = sumTimelikeWilsonLoop(Umu, R1, R2);
+    Real vol = Umu._grid->gSites();
+    Real faces = 1.0 * (ndim - 1);
+    return sumWl / vol / faces / Nc; // Nc dependent... FIXME
+  }
+  //////////////////////////////////////////////////
+  // average over all x,y,z,t and over all planes of spatial Wilson loop
+  //////////////////////////////////////////////////
+  static Real avgSpatialWilsonLoop(const GaugeLorentz &Umu,
+                            const int R1, const int R2) {
+    int ndim = Umu._grid->_ndimension;
+    Real sumWl = sumSpatialWilsonLoop(Umu, R1, R2);
+    Real vol = Umu._grid->gSites();
+    Real faces = 1.0 * (ndim - 1) * (ndim - 2);
+    return sumWl / vol / faces / Nc; // Nc dependent... FIXME
   }
 };
 
diff --git a/programs/qed-fvol/qed-fvol.cc b/programs/qed-fvol/qed-fvol.cc
index 68705b8f..d026057e 100644
--- a/programs/qed-fvol/qed-fvol.cc
+++ b/programs/qed-fvol/qed-fvol.cc
@@ -64,7 +64,7 @@ int main(int argc, char *argv[])
     EmField          a(&grid);
     EmField          expA(&grid);
 
-    Real avgPlaqAexp, avgWl2x2Aexp;
+    Real avgPlaqAexp, avgWl2x2Aexp, avgWl2x2Aexp_time, avgWl2x2Aexp_space;
 
     pRNG.SeedRandomDevice();
     photon.StochasticField(a, pRNG);
@@ -117,14 +117,20 @@ int main(int argc, char *argv[])
     // Calculate plaquette from exponentiated photon field
     avgPlaqAexp = NewWilsonLoops<QedPeriodicGimplR>::avgPlaquette(expA);
     avgWl2x2Aexp = NewWilsonLoops<QedPeriodicGimplR>::avgWilsonLoop(expA, 2, 2);
+    avgWl2x2Aexp_time = NewWilsonLoops<QedPeriodicGimplR>::avgTimelikeWilsonLoop(expA, 2, 2);
+    avgWl2x2Aexp_space = NewWilsonLoops<QedPeriodicGimplR>::avgSpatialWilsonLoop(expA, 2, 2);
 
     avgPlaqAexp = avgPlaqAexp*3;
     avgWl2x2Aexp = avgWl2x2Aexp*3;
+    avgWl2x2Aexp_time = avgWl2x2Aexp_time*3;
+    avgWl2x2Aexp_space = avgWl2x2Aexp_space*3;
 
     LOG(Message) << "Plaquette average (from A): " << avgPlaqA << std::endl;
     LOG(Message) << "Plaquette average (from exp(A)): " << avgPlaqAexp << std::endl;
     LOG(Message) << "2x2 Wilson Loop average (from A): " << avgWlA << std::endl;
     LOG(Message) << "2x2 Wilson Loop average (from exp(A)): " << avgWl2x2Aexp << std::endl;
+    LOG(Message) << "2x2 Wilson Loop timelike average (from exp(A)): " << avgWl2x2Aexp_time << std::endl;
+    LOG(Message) << "2x2 Wilson Loop spatial average (from exp(A)): " << avgWl2x2Aexp_space << std::endl;
     LOG(Message) << "Plaquette (one site): " << plaqsite / faces << std::endl;
 
     // epilogue

From 92ec3404f8a404e7d6420ebfa0f113af5eb6ec6d Mon Sep 17 00:00:00 2001
From: James Harrison <jch1g10@soton.ac.uk>
Date: Mon, 14 Nov 2016 17:59:02 +0000
Subject: [PATCH 008/177] Set imaginary part of stochastic QED field to zero
 after FFT into position space

---
 lib/qcd/action/gauge/Photon.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/qcd/action/gauge/Photon.h b/lib/qcd/action/gauge/Photon.h
index 852ecb3e..ca0a8d40 100644
--- a/lib/qcd/action/gauge/Photon.h
+++ b/lib/qcd/action/gauge/Photon.h
@@ -172,6 +172,8 @@ namespace QCD{
       pokeLorentz(aTilde, r, mu);
     }
     fft.FFT_all_dim(out, aTilde, FFT::backward);
+
+    out = 0.5*(out + conjugate(out));
   }
 //  template<class Gimpl>
 //  void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_L(GaugeField &out,

From a71b69389b6fc7360dbebbf5ed8d4fa3a6952016 Mon Sep 17 00:00:00 2001
From: James Harrison <jch1g10@soton.ac.uk>
Date: Mon, 14 Nov 2016 18:23:04 +0000
Subject: [PATCH 009/177] QedFVol: calculate square Wilson loops up to 10x10

---
 programs/qed-fvol/qed-fvol.cc | 74 +++++++----------------------------
 1 file changed, 14 insertions(+), 60 deletions(-)

diff --git a/programs/qed-fvol/qed-fvol.cc b/programs/qed-fvol/qed-fvol.cc
index d026057e..31312b1e 100644
--- a/programs/qed-fvol/qed-fvol.cc
+++ b/programs/qed-fvol/qed-fvol.cc
@@ -64,75 +64,29 @@ int main(int argc, char *argv[])
     EmField          a(&grid);
     EmField          expA(&grid);
 
-    Real avgPlaqAexp, avgWl2x2Aexp, avgWl2x2Aexp_time, avgWl2x2Aexp_space;
+    Real wlA, logWlA;
 
     pRNG.SeedRandomDevice();
     photon.StochasticField(a, pRNG);
 
     // Exponentiate photon field
     Complex imag_unit(0, 1);
-    expA = exp(imag_unit*0.5*(a+conjugate(a)));
+    expA = exp(imag_unit*a);
 
-    // Calculate plaquette from photon field
-    EmComp              plaqA(&grid);
-    EmComp              wlA(&grid);
-    EmComp              tmp(&grid);
-    std::vector<EmComp> a_comp(4, &grid);
-
-    for (int dir = 0; dir < Nd; dir++) {
-      a_comp[dir] = PeekIndex<LorentzIndex>(a, dir);
+    // Calculate Wilson loops
+    for(int i=1; i<=10; i++){
+        LOG(Message) << i << 'x' << i << " Wilson loop" << std::endl;
+        wlA = NewWilsonLoops<QedPeriodicGimplR>::avgWilsonLoop(expA, i, i) * 3;
+        logWlA = -2*log(wlA);
+        LOG(Message) << "-2log(W) average: " << logWlA << std::endl;
+        wlA = NewWilsonLoops<QedPeriodicGimplR>::avgTimelikeWilsonLoop(expA, i, i) * 3;
+        logWlA = -2*log(wlA);
+        LOG(Message) << "-2log(W) timelike: " << logWlA << std::endl;
+        wlA = NewWilsonLoops<QedPeriodicGimplR>::avgSpatialWilsonLoop(expA, i, i) * 3;
+        logWlA = -2*log(wlA);
+        LOG(Message) << "-2log(W) spatial: " << logWlA << std::endl;
     }
 
-    plaqA = zero;
-    wlA = zero;
-
-    for(int mu = 1; mu < Nd; mu++) {
-        for(int nu = 0; nu < mu; nu++) {
-            tmp = a_comp[mu] + Cshift(a_comp[nu], mu, 1) - Cshift(a_comp[mu], nu, 1) - a_comp[nu];
-            plaqA = plaqA + cos(tmp);
-
-            tmp = a_comp[mu] + Cshift(a_comp[mu], mu, 1)
-                  + Cshift(a_comp[nu], mu, 2) + Cshift(Cshift(a_comp[nu], mu, 2), nu, 1)
-                  - Cshift(Cshift(a_comp[mu], nu, 2), mu, 1) - Cshift(a_comp[mu], nu, 2)
-                  - Cshift(a_comp[nu], nu, 1) - a_comp[nu];
-            wlA = wlA + cos(tmp);
-        }
-    }
-
-    Real vol = grid.gSites();
-    Real faces = (1.0 * Nd * (Nd - 1)) / 2.0;
-
-    Complex avgPlaqA = sum(trace(plaqA));
-    avgPlaqA = avgPlaqA / vol / faces;
-
-    Complex avgWlA = sum(trace(wlA));
-    avgWlA = avgWlA / vol / faces;
-
-    TComplex tplaqsite;
-    LatticeComplex plaqtrace = trace(plaqA);
-    std::vector<int> site0 = {0,0,0,0};
-    peekSite(tplaqsite, plaqtrace, site0);
-    Complex plaqsite = TensorRemove(tplaqsite);
-
-    // Calculate plaquette from exponentiated photon field
-    avgPlaqAexp = NewWilsonLoops<QedPeriodicGimplR>::avgPlaquette(expA);
-    avgWl2x2Aexp = NewWilsonLoops<QedPeriodicGimplR>::avgWilsonLoop(expA, 2, 2);
-    avgWl2x2Aexp_time = NewWilsonLoops<QedPeriodicGimplR>::avgTimelikeWilsonLoop(expA, 2, 2);
-    avgWl2x2Aexp_space = NewWilsonLoops<QedPeriodicGimplR>::avgSpatialWilsonLoop(expA, 2, 2);
-
-    avgPlaqAexp = avgPlaqAexp*3;
-    avgWl2x2Aexp = avgWl2x2Aexp*3;
-    avgWl2x2Aexp_time = avgWl2x2Aexp_time*3;
-    avgWl2x2Aexp_space = avgWl2x2Aexp_space*3;
-
-    LOG(Message) << "Plaquette average (from A): " << avgPlaqA << std::endl;
-    LOG(Message) << "Plaquette average (from exp(A)): " << avgPlaqAexp << std::endl;
-    LOG(Message) << "2x2 Wilson Loop average (from A): " << avgWlA << std::endl;
-    LOG(Message) << "2x2 Wilson Loop average (from exp(A)): " << avgWl2x2Aexp << std::endl;
-    LOG(Message) << "2x2 Wilson Loop timelike average (from exp(A)): " << avgWl2x2Aexp_time << std::endl;
-    LOG(Message) << "2x2 Wilson Loop spatial average (from exp(A)): " << avgWl2x2Aexp_space << std::endl;
-    LOG(Message) << "Plaquette (one site): " << plaqsite / faces << std::endl;
-
     // epilogue
     LOG(Message) << "Grid is finalizing now" << std::endl;
     Grid_finalize();

From 739c2308b5ce9a9464dbbd9057dbe49f6b04cf59 Mon Sep 17 00:00:00 2001
From: James Harrison <jch1g10@soton.ac.uk>
Date: Tue, 15 Nov 2016 13:07:52 +0000
Subject: [PATCH 010/177] Set imaginary part of stochastic QED field to zero
 using real() instead of conjugate().

---
 lib/qcd/action/gauge/Photon.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/qcd/action/gauge/Photon.h b/lib/qcd/action/gauge/Photon.h
index ca0a8d40..b6c1b76f 100644
--- a/lib/qcd/action/gauge/Photon.h
+++ b/lib/qcd/action/gauge/Photon.h
@@ -173,7 +173,7 @@ namespace QCD{
     }
     fft.FFT_all_dim(out, aTilde, FFT::backward);
 
-    out = 0.5*(out + conjugate(out));
+    out = real(out);
   }
 //  template<class Gimpl>
 //  void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_L(GaugeField &out,

From 6ad73145bc9754a5f26093eee5a34473ba0cff82 Mon Sep 17 00:00:00 2001
From: James Harrison <jch1g10@soton.ac.uk>
Date: Wed, 30 Nov 2016 15:17:22 +0000
Subject: [PATCH 011/177] Calculate Wilson loop average over multiple
 configurations.

---
 programs/qed-fvol/qed-fvol.cc | 47 +++++++++++++++++++++++------------
 1 file changed, 31 insertions(+), 16 deletions(-)

diff --git a/programs/qed-fvol/qed-fvol.cc b/programs/qed-fvol/qed-fvol.cc
index 31312b1e..f0f5079f 100644
--- a/programs/qed-fvol/qed-fvol.cc
+++ b/programs/qed-fvol/qed-fvol.cc
@@ -31,6 +31,9 @@ typedef Photon<QedGimplR>               PhotonR;
 typedef PhotonR::GaugeField             EmField;
 typedef PhotonR::GaugeLinkField         EmComp;
 
+const int NCONFIGS = 10;
+const int NWILSON = 10;
+
 int main(int argc, char *argv[])
 {
     // parse command line
@@ -64,27 +67,39 @@ int main(int argc, char *argv[])
     EmField          a(&grid);
     EmField          expA(&grid);
 
-    Real wlA, logWlA;
+    Complex imag_unit(0, 1);
+
+    Real wlA;
+    std::vector<Real> logWlAvg(NWILSON, 0.0), logWlTime(NWILSON, 0.0), logWlSpace(NWILSON, 0.0);
 
     pRNG.SeedRandomDevice();
-    photon.StochasticField(a, pRNG);
 
-    // Exponentiate photon field
-    Complex imag_unit(0, 1);
-    expA = exp(imag_unit*a);
+    LOG(Message) << "Wilson loop calculation beginning" << std::endl;
+    for(int ic = 0; ic < NCONFIGS; ic++){
+        LOG(Message) << "Configuration " << ic <<std::endl;
+        photon.StochasticField(a, pRNG);
 
+        // Exponentiate photon field
+        expA = exp(imag_unit*a);
+
+        // Calculate Wilson loops
+        for(int iw=1; iw<=NWILSON; iw++){
+            wlA = NewWilsonLoops<QedPeriodicGimplR>::avgWilsonLoop(expA, iw, iw) * 3;
+            logWlAvg[iw-1] -= 2*log(wlA);
+            wlA = NewWilsonLoops<QedPeriodicGimplR>::avgTimelikeWilsonLoop(expA, iw, iw) * 3;
+            logWlTime[iw-1] -= 2*log(wlA);
+            wlA = NewWilsonLoops<QedPeriodicGimplR>::avgSpatialWilsonLoop(expA, iw, iw) * 3;
+            logWlSpace[iw-1] -= 2*log(wlA);
+        }
+    }
+    LOG(Message) << "Wilson loop calculation completed" << std::endl;
+    
     // Calculate Wilson loops
-    for(int i=1; i<=10; i++){
-        LOG(Message) << i << 'x' << i << " Wilson loop" << std::endl;
-        wlA = NewWilsonLoops<QedPeriodicGimplR>::avgWilsonLoop(expA, i, i) * 3;
-        logWlA = -2*log(wlA);
-        LOG(Message) << "-2log(W) average: " << logWlA << std::endl;
-        wlA = NewWilsonLoops<QedPeriodicGimplR>::avgTimelikeWilsonLoop(expA, i, i) * 3;
-        logWlA = -2*log(wlA);
-        LOG(Message) << "-2log(W) timelike: " << logWlA << std::endl;
-        wlA = NewWilsonLoops<QedPeriodicGimplR>::avgSpatialWilsonLoop(expA, i, i) * 3;
-        logWlA = -2*log(wlA);
-        LOG(Message) << "-2log(W) spatial: " << logWlA << std::endl;
+    for(int iw=1; iw<=NWILSON; iw++){ // bound follows the NWILSON-sized accumulators, not a literal 10
+        LOG(Message) << iw << 'x' << iw << " Wilson loop" << std::endl;
+        LOG(Message) << "-2log(W) average: " << logWlAvg[iw-1]/NCONFIGS << std::endl;   // mean over configurations
+        LOG(Message) << "-2log(W) timelike: " << logWlTime[iw-1]/NCONFIGS << std::endl;
+        LOG(Message) << "-2log(W) spatial: " << logWlSpace[iw-1]/NCONFIGS << std::endl;
     }
 
     // epilogue

From 2e3c5890b6035a4c9d661102c2117c53f93f00fd Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Thu, 15 Dec 2016 20:06:46 +0000
Subject: [PATCH 012/177] qed-fvol: build fix

---
 extras/Makefile.am          | 2 +-
 extras/qed-fvol/Makefile.am | 2 +-
 lib/qcd/action/Actions.h    | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/extras/Makefile.am b/extras/Makefile.am
index d8c2b675..416a9fc8 100644
--- a/extras/Makefile.am
+++ b/extras/Makefile.am
@@ -1 +1 @@
-SUBDIRS = Hadrons
\ No newline at end of file
+SUBDIRS = Hadrons qed-fvol
\ No newline at end of file
diff --git a/extras/qed-fvol/Makefile.am b/extras/qed-fvol/Makefile.am
index cd762e94..0a9030c7 100644
--- a/extras/qed-fvol/Makefile.am
+++ b/extras/qed-fvol/Makefile.am
@@ -1,4 +1,4 @@
-AM_CXXFLAGS += -I$(top_srcdir)/programs -I../$(top_srcdir)/programs
+AM_CXXFLAGS += -I$(top_srcdir)/extras
 
 bin_PROGRAMS = qed-fvol
 
diff --git a/lib/qcd/action/Actions.h b/lib/qcd/action/Actions.h
index 4a30f8c3..fea75f8a 100644
--- a/lib/qcd/action/Actions.h
+++ b/lib/qcd/action/Actions.h
@@ -57,6 +57,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 ////////////////////////////////////////////
 // Gauge Actions
 ////////////////////////////////////////////
+#include <Grid/qcd/action/gauge/Photon.h>
 #include <Grid/qcd/action/gauge/WilsonGaugeAction.h>
 #include <Grid/qcd/action/gauge/PlaqPlusRectangleAction.h>
 

From 2af9ab903445291377bb323ee349ddf9c7e94abf Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Tue, 20 Dec 2016 12:40:26 +0100
Subject: [PATCH 013/177] old Makefile cleaning

---
 programs/Makefile.am | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 programs/Makefile.am

diff --git a/programs/Makefile.am b/programs/Makefile.am
deleted file mode 100644
index ff7f6584..00000000
--- a/programs/Makefile.am
+++ /dev/null
@@ -1 +0,0 @@
-SUBDIRS = qed-fvol

From 9ac3ac41df095e3208c126f4b52bdf9f1b58937a Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Tue, 20 Dec 2016 12:41:01 +0100
Subject: [PATCH 014/177] serialisable Photon parameters

---
 lib/qcd/action/gauge/Photon.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/lib/qcd/action/gauge/Photon.h b/lib/qcd/action/gauge/Photon.h
index b6c1b76f..bbe3ebf7 100644
--- a/lib/qcd/action/gauge/Photon.h
+++ b/lib/qcd/action/gauge/Photon.h
@@ -28,6 +28,7 @@
 #ifndef QCD_PHOTON_ACTION_H
 #define QCD_PHOTON_ACTION_H
 
+
 namespace Grid{
 namespace QCD{
   
@@ -36,8 +37,8 @@ namespace QCD{
   {
   public:
     INHERIT_GIMPL_TYPES(Gimpl);
-    enum class Gauge    {Feynman, Coulomb, Landau};
-    enum class ZmScheme {QedL, QedTL};
+    GRID_SERIALIZABLE_ENUM(Gauge, undef, feynman, 1, coulomb, 2, landau, 3);
+    GRID_SERIALIZABLE_ENUM(ZmScheme, undef, qedL, 1, qedTL, 2);
   public:
     Photon(Gauge gauge, ZmScheme zmScheme);
     virtual ~Photon(void) = default;
@@ -104,7 +105,7 @@ namespace QCD{
     
     switch (zmScheme_)
     {
-      case ZmScheme::QedTL:
+      case ZmScheme::qedTL:
       {
         std::vector<int> zm(nd,0);
         TComplex         Tzero = Complex(0.0,0.0);
@@ -113,7 +114,7 @@ namespace QCD{
         
         break;
       }
-      case ZmScheme::QedL:
+      case ZmScheme::qedL:
       {
         LatticeInteger spNrm(grid), coor(grid);
         GaugeLinkField z(grid);

From db9c28a773c5d93d3c757ea4f8af75876106b948 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Tue, 20 Dec 2016 12:41:39 +0100
Subject: [PATCH 015/177] qed-fvol: Photon parameter name fix

---
 extras/qed-fvol/qed-fvol.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/extras/qed-fvol/qed-fvol.cc b/extras/qed-fvol/qed-fvol.cc
index f0f5079f..951c36ad 100644
--- a/extras/qed-fvol/qed-fvol.cc
+++ b/extras/qed-fvol/qed-fvol.cc
@@ -62,8 +62,8 @@ int main(int argc, char *argv[])
     std::vector<int> mpi_layout  = GridDefaultMpi();
     GridCartesian    grid(latt_size,simd_layout,mpi_layout);
     GridParallelRNG  pRNG(&grid);
-    PhotonR          photon(PhotonR::Gauge::Feynman,
-                            PhotonR::ZmScheme::QedL);
+    PhotonR          photon(PhotonR::Gauge::feynman,
+                            PhotonR::ZmScheme::qedL);
     EmField          a(&grid);
     EmField          expA(&grid);
 

From 17b3a10d46e46823f0a380647c4953c2ffd74ea4 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Thu, 22 Dec 2016 00:29:19 +0100
Subject: [PATCH 016/177] stochastic QED: function to cache 1/sqrt(khat^2)

---
 lib/qcd/action/gauge/Photon.h | 47 +++++++++++++++++++++++++----------
 1 file changed, 34 insertions(+), 13 deletions(-)

diff --git a/lib/qcd/action/gauge/Photon.h b/lib/qcd/action/gauge/Photon.h
index bbe3ebf7..faa63b42 100644
--- a/lib/qcd/action/gauge/Photon.h
+++ b/lib/qcd/action/gauge/Photon.h
@@ -44,7 +44,10 @@ namespace QCD{
     virtual ~Photon(void) = default;
     void FreePropagator(const GaugeField &in, GaugeField &out);
     void MomentumSpacePropagator(const GaugeField &in, GaugeField &out);
+    void StochasticWeight(GaugeLinkField &weight);
     void StochasticField(GaugeField &out, GridParallelRNG &rng);
+    void StochasticField(GaugeField &out, GridParallelRNG &rng,
+                         const GaugeLinkField &weight);
   private:
     void invKHatSquared(GaugeLinkField &out);
     void zmSub(GaugeLinkField &out);
@@ -148,32 +151,50 @@ namespace QCD{
   }
   
   template<class Gimpl>
-  void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng)
+  void Photon<Gimpl>::StochasticWeight(GaugeLinkField &weight)
   {
-    auto               *grid = dynamic_cast<GridCartesian *>(out._grid);
-    const unsigned int nd = grid->_ndimension;
-    std::vector<int> latt_size   = grid->_fdimensions;
-    GaugeLinkField     sqrtK2Inv(grid), r(grid);
-    GaugeField         aTilde(grid);
-    FFT                fft(grid);
+    auto               *grid     = dynamic_cast<GridCartesian *>(weight._grid);
+    const unsigned int nd        = grid->_ndimension;
+    std::vector<int>   latt_size = grid->_fdimensions;
     
     Integer vol = 1;
     for(int d = 0; d < nd; d++)
     {
       vol = vol * latt_size[d];
     }
-
-    invKHatSquared(sqrtK2Inv);
-    sqrtK2Inv = sqrt(vol*real(sqrtK2Inv));
-    zmSub(sqrtK2Inv);
+    invKHatSquared(weight);
+    weight = sqrt(vol*real(weight));
+    zmSub(weight);
+  }
+  
+  template<class Gimpl>
+  void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng)
+  {
+    auto           *grid = dynamic_cast<GridCartesian *>(out._grid);
+    GaugeLinkField weight(grid);
+    
+    StochasticWeight(weight);
+    StochasticField(out, rng, weight);
+  }
+  
+  template<class Gimpl>
+  void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng,
+                                      const GaugeLinkField &weight)
+  {
+    auto               *grid = dynamic_cast<GridCartesian *>(out._grid);
+    const unsigned int nd = grid->_ndimension;
+    GaugeLinkField     r(grid);
+    GaugeField         aTilde(grid);
+    FFT                fft(grid);
+    
     for(int mu = 0; mu < nd; mu++)
     {
       gaussian(rng, r);
-      r = sqrtK2Inv*r;
+      r = weight*r;
       pokeLorentz(aTilde, r, mu);
     }
     fft.FFT_all_dim(out, aTilde, FFT::backward);
-
+    
     out = real(out);
   }
 //  template<class Gimpl>

From 4c3fd9fa3f6976c1297715d1e2239797bb0dd45b Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Thu, 22 Dec 2016 00:29:41 +0100
Subject: [PATCH 017/177] stochastic QED field module in Hadrons

---
 extras/Hadrons/Modules.hpp                |  1 +
 extras/Hadrons/Modules/MGauge/StochEm.cc  | 88 +++++++++++++++++++++
 extras/Hadrons/Modules/MGauge/StochEm.hpp | 96 +++++++++++++++++++++++
 extras/Hadrons/modules.inc                |  2 +
 4 files changed, 187 insertions(+)
 create mode 100644 extras/Hadrons/Modules/MGauge/StochEm.cc
 create mode 100644 extras/Hadrons/Modules/MGauge/StochEm.hpp

diff --git a/extras/Hadrons/Modules.hpp b/extras/Hadrons/Modules.hpp
index 77ae08b7..5d1a456c 100644
--- a/extras/Hadrons/Modules.hpp
+++ b/extras/Hadrons/Modules.hpp
@@ -32,6 +32,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #include <Grid/Hadrons/Modules/MContraction/Meson.hpp>
 #include <Grid/Hadrons/Modules/MGauge/Load.hpp>
 #include <Grid/Hadrons/Modules/MGauge/Random.hpp>
+#include <Grid/Hadrons/Modules/MGauge/StochEm.hpp>
 #include <Grid/Hadrons/Modules/MGauge/Unit.hpp>
 #include <Grid/Hadrons/Modules/MSolver/RBPrecCG.hpp>
 #include <Grid/Hadrons/Modules/MSource/Point.hpp>
diff --git a/extras/Hadrons/Modules/MGauge/StochEm.cc b/extras/Hadrons/Modules/MGauge/StochEm.cc
new file mode 100644
index 00000000..c7a9fc4f
--- /dev/null
+++ b/extras/Hadrons/Modules/MGauge/StochEm.cc
@@ -0,0 +1,88 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: extras/Hadrons/Modules/MGauge/StochEm.cc
+
+Copyright (C) 2015
+Copyright (C) 2016
+
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Hadrons/Modules/MGauge/StochEm.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MGauge;
+
+/******************************************************************************
+*                  TStochEm implementation                             *
+******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+TStochEm::TStochEm(const std::string name)
+: Module<StochEmPar>(name) // the module name is passed straight to the Module base
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+std::vector<std::string> TStochEm::getInput(void)
+{
+    std::vector<std::string> in; // stays empty: this module declares no input names
+    
+    return in;
+}
+
+std::vector<std::string> TStochEm::getOutput(void)
+{
+    std::vector<std::string> out = {getName()}; // single output, named after the module itself
+    
+    return out;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+void TStochEm::setup(void)
+{
+    if (!env().hasRegisteredObject("_" + getName() + "_weight")) // register the weight lattice only once
+    {
+        env().registerLattice<EmComp>("_" + getName() + "_weight"); // same name is looked up in execute()
+    }
+    env().registerLattice<EmField>(getName()); // the lattice listed by getOutput()
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+void TStochEm::execute(void)
+{
+    PhotonR photon(par().gauge, par().zmScheme);
+    EmField &a = *env().createLattice<EmField>(getName());
+    EmComp  *w = nullptr; // weight lattice: created if the environment lacks it, fetched otherwise
+    const std::string weightName = "_" + getName() + "_weight"; // must match the name used in setup()
+    if (!env().hasCreatedObject(weightName))
+    {
+        LOG(Message) << "Caching stochastic EM potential weight (gauge: "
+                     << par().gauge << ", zero-mode scheme: "
+                     << par().zmScheme << ")..." << std::endl;
+        w = env().createLattice<EmComp>(weightName);
+        photon.StochasticWeight(*w);
+    }
+    else
+    {
+        w = env().getObject<EmComp>(weightName);
+    }
+    LOG(Message) << "Generating stochastic EM potential..." << std::endl;
+    photon.StochasticField(a, *env().get4dRng(), *w);
+}
diff --git a/extras/Hadrons/Modules/MGauge/StochEm.hpp b/extras/Hadrons/Modules/MGauge/StochEm.hpp
new file mode 100644
index 00000000..04a7c48c
--- /dev/null
+++ b/extras/Hadrons/Modules/MGauge/StochEm.hpp
@@ -0,0 +1,96 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: extras/Hadrons/Modules/MGauge/StochEm.hpp
+
+Copyright (C) 2015
+Copyright (C) 2016
+
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_StochEm_hpp_
+#define Hadrons_StochEm_hpp_
+
+#include <Grid/Hadrons/Global.hpp>
+#include <Grid/Hadrons/Module.hpp>
+#include <Grid/Hadrons/ModuleFactory.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                         StochEm                                 *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MGauge)
+
+template <class S>
+class QedGimpl
+{
+public:
+    typedef S Simd;
+    
+    template <typename vtype>
+    using iImplGaugeLink  = iScalar<iScalar<iScalar<vtype>>>;
+    template <typename vtype>
+    using iImplGaugeField = iVector<iScalar<iScalar<vtype>>, Nd>;
+    
+    typedef iImplGaugeLink<Simd> SiteGaugeLink;
+    typedef iImplGaugeField<Simd> SiteGaugeField;
+    
+    typedef Lattice<SiteGaugeLink> GaugeLinkField;
+    typedef Lattice<SiteGaugeField> GaugeField;
+};
+
+typedef QedGimpl<vComplex> QedGimplR;
+typedef Photon<QedGimplR>  PhotonR;
+
+class StochEmPar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(StochEmPar,
+                                    PhotonR::Gauge,    gauge,
+                                    PhotonR::ZmScheme, zmScheme);
+};
+
+class TStochEm: public Module<StochEmPar>
+{
+public:
+    typedef PhotonR::GaugeField     EmField;
+    typedef PhotonR::GaugeLinkField EmComp;
+public:
+    // constructor
+    TStochEm(const std::string name);
+    // destructor
+    virtual ~TStochEm(void) = default;
+    // dependency relation
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_NS(StochEm, TStochEm, MGauge);
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_StochEm_hpp_
diff --git a/extras/Hadrons/modules.inc b/extras/Hadrons/modules.inc
index 4251ffa3..8b559024 100644
--- a/extras/Hadrons/modules.inc
+++ b/extras/Hadrons/modules.inc
@@ -1,6 +1,7 @@
 modules_cc =\
   Modules/MGauge/Load.cc \
   Modules/MGauge/Random.cc \
+  Modules/MGauge/StochEm.cc \
   Modules/MGauge/Unit.cc
 
 modules_hpp =\
@@ -10,6 +11,7 @@ modules_hpp =\
   Modules/MContraction/Meson.hpp \
   Modules/MGauge/Load.hpp \
   Modules/MGauge/Random.hpp \
+  Modules/MGauge/StochEm.hpp \
   Modules/MGauge/Unit.hpp \
   Modules/MSolver/RBPrecCG.hpp \
   Modules/MSource/Point.hpp \

From 8c3cc3236447b4b8eef95a29da1b48166b5eb03d Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Thu, 29 Dec 2016 22:42:58 +0100
Subject: [PATCH 018/177] Scalar action

---
 lib/qcd/action/Actions.h       |   5 +
 lib/qcd/action/scalar/Scalar.h | 211 +++++++++++++++++++++++++++++++++
 2 files changed, 216 insertions(+)
 create mode 100644 lib/qcd/action/scalar/Scalar.h

diff --git a/lib/qcd/action/Actions.h b/lib/qcd/action/Actions.h
index fea75f8a..efd6a5bc 100644
--- a/lib/qcd/action/Actions.h
+++ b/lib/qcd/action/Actions.h
@@ -292,4 +292,9 @@ typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
 #include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
 
+////////////////////
+// Scalar actions
+////////////////////
+#include <Grid/qcd/action/scalar/Scalar.h>
+
 #endif
diff --git a/lib/qcd/action/scalar/Scalar.h b/lib/qcd/action/scalar/Scalar.h
new file mode 100644
index 00000000..194f6767
--- /dev/null
+++ b/lib/qcd/action/scalar/Scalar.h
@@ -0,0 +1,211 @@
+#ifndef QCD_SCALAR_ACTION_H
+#define QCD_SCALAR_ACTION_H
+
+#define INHERIT_SIMPL_TYPES(Impl)\
+typedef typename Impl::SiteScalar      SiteScalar;	 \
+typedef typename Impl::SiteSpinor      SiteSpinor;	 \
+typedef typename Impl::SitePropagator  SitePropagator;   \
+typedef typename Impl::ScalarField     ScalarField;	 \
+typedef typename Impl::FermionField    FermionField;	 \
+typedef typename Impl::PropagatorField PropagatorField;  \
+typedef typename Impl::StencilImpl     StencilImpl;
+
+namespace Grid{
+namespace QCD{
+  // Scalar implementation class ///////////////////////////////////////////////
+  // FIXME: it is not very nice to have the FImpl aliases
+  template <class S,
+            class Representation = FundamentalRep<1>,
+            class _Coeff_t = RealD> // NOTE(review): _Coeff_t is not referenced inside this class
+  class ScalarImpl:
+    public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension>>
+  {
+  public:
+    static constexpr unsigned int rDim = Representation::Dimension; // selects the site types below
+  public:
+    // gauge types
+    typedef PeriodicGaugeImpl<GaugeImplTypes<S, rDim>> Gimpl; // same type as the base class
+    INHERIT_GIMPL_TYPES(Gimpl);
+    // site types
+    // (using classes instead of aliases to allow for partial specialisation)
+    template <typename vtype, unsigned int d>
+    class iImplScalar // general case: innermost level is a d-component vector
+    {
+    public:
+      typedef iScalar<iScalar<iVector<vtype, d>>> type;
+    };
+    template <typename vtype>
+    class iImplScalar<vtype, 1> // d == 1: innermost level is a plain scalar
+    {
+    public:
+      typedef iScalar<iScalar<iScalar<vtype>>> type;
+    };
+    template <typename vtype, unsigned int d>
+    class iImplPropagator // general case: innermost level is a d x d matrix
+    {
+    public:
+      typedef iScalar<iScalar<iMatrix<vtype, d>>> type;
+    };
+    template <typename vtype>
+    class iImplPropagator<vtype, 1> // d == 1: same site type as the scalar field
+    {
+    public:
+      typedef iScalar<iScalar<iScalar<vtype>>> type;
+    };
+    // type aliases (SiteSpinor and FermionField are plain aliases of the scalar types)
+    typedef typename iImplScalar<S, rDim>::type      SiteScalar;
+    typedef SiteScalar                               SiteSpinor;
+    typedef typename iImplPropagator<S, rDim>::type  SitePropagator;
+    typedef Lattice<SiteScalar>                      ScalarField;
+    typedef ScalarField                              FermionField;
+    typedef Lattice<SitePropagator>                  PropagatorField;
+    typedef CartesianStencil<SiteScalar, SiteScalar> StencilImpl;
+  };
+  
+  // single scalar implementation
+  typedef ScalarImpl<vComplex> ScalarImplR;
+  
+  // Scalar action /////////////////////////////////////////////////////////////
+  template <typename SImpl>
+  class Scalar:
+    public CheckerBoardedSparseMatrixBase<typename SImpl::ScalarField>,
+    public SImpl
+  {
+  public:
+    INHERIT_GIMPL_TYPES(SImpl);
+    INHERIT_SIMPL_TYPES(SImpl);
+  public:
+    // constructor: stores a copy of _Umu and its even/odd checkerboard halves
+    Scalar(GaugeField &_Umu, GridCartesian &Sgrid, GridRedBlackCartesian &Hgrid,
+           RealD _mass)
+    : mass(_mass)               // initialisers follow the member declaration order
+    , _grid(&Sgrid)
+    , _cbgrid(&Hgrid)
+    , Umu(&Sgrid)
+    , UmuEven(&Hgrid)
+    , UmuOdd(&Hgrid)
+    , Lebesgue(_grid)
+    , LebesgueEvenOdd(_cbgrid)
+    {
+      Umu = _Umu;
+      pickCheckerboard(Even, UmuEven, Umu);
+      pickCheckerboard(Odd, UmuOdd, Umu);
+    }
+    // grid access: the red-black grid is the checkerboarded grid, not the full one
+    virtual GridBase *RedBlackGrid(void) {return _cbgrid;}
+    // half checkerboard operations
+    // FIXME: do implementation (every operation below currently aborts via assert)
+    virtual void Meooe(const ScalarField &in, ScalarField &out)
+    {
+      assert(0);
+    }
+    virtual void Mooee(const ScalarField &in, ScalarField &out)
+    {
+      assert(0);
+    }
+    virtual void MooeeInv(const ScalarField &in, ScalarField &out)
+    {
+      assert(0);
+    }
+    virtual void MeooeDag(const ScalarField &in, ScalarField &out)
+    {
+      assert(0);
+    }
+    virtual void MooeeDag(const ScalarField &in, ScalarField &out)
+    {
+      assert(0);
+    }
+    virtual void MooeeInvDag(const ScalarField &in, ScalarField &out)
+    {
+      assert(0);
+    }
+    // free propagators (static: they only use the grid carried by their arguments)
+    static void MomentumSpacePropagator(ScalarField &out, RealD m);
+    static void FreePropagator(const ScalarField &in, ScalarField &out,
+                               const ScalarField &FTKernel);
+    static void FreePropagator(const ScalarField &in, ScalarField &out, RealD m);
+  public:
+    RealD mass;
+    
+    GridBase *_grid;   // full grid (Sgrid)
+    GridBase *_cbgrid; // checkerboarded grid (Hgrid)
+    
+    // Defines the stencils for even and odd
+    StencilImpl Stencil;
+    StencilImpl StencilEven;
+    StencilImpl StencilOdd;
+    
+    // Copy of the gauge field, with even and odd subsets
+    GaugeField Umu;
+    GaugeField UmuEven;
+    GaugeField UmuOdd;
+    
+    LebesgueOrder Lebesgue;
+    LebesgueOrder LebesgueEvenOdd;
+  };
+  
+  template <typename SImpl>
+  void Scalar<SImpl>::MomentumSpacePropagator(ScalarField &out, RealD m)
+  {
+    GridBase           *grid = out._grid;
+    ScalarField        kmu(grid), one(grid);
+    const unsigned int nd    = grid->_ndimension;
+    std::vector<int>   &l    = grid->_fdimensions;
+    out = m*m; // denominator: m^2 + sum_mu khat_mu^2, khat_mu = 2 sin(pi n_mu / L_mu)
+    for(int mu = 0; mu < nd; mu++)
+    {
+      Real twoPiL = M_PI*2./l[mu];
+      LatticeCoordinate(kmu,mu); // kmu holds the integer coordinate n_mu
+      kmu = 2.*sin(.5*twoPiL*kmu);
+      out = out + kmu*kmu;
+    }
+    one = Complex(1.0,0.0);
+    out = one/out; // propagator = 1/(m^2 + khat^2); needs m != 0, the zero mode is not regulated here
+  }
+  
+  template <typename SImpl>
+  void Scalar<SImpl>::FreePropagator(const ScalarField &in, ScalarField &out,
+                                     const ScalarField &FTKernel)
+  {
+    // forward FFT of in, site-wise product with the kernel, backward FFT into out
+    ScalarField momField(in._grid);
+    FFT         transform(static_cast<GridCartesian *>(in._grid));
+    transform.FFT_all_dim(momField, in, FFT::forward);
+    momField = momField*FTKernel;
+    transform.FFT_all_dim(out, momField, FFT::backward);
+  }
+  
+  template <typename SImpl>
+  void Scalar<SImpl>::FreePropagator(const ScalarField &in, ScalarField &out,
+                                     RealD m)
+  {
+    // fill a kernel for mass m, then hand over to the kernel overload
+    ScalarField massKernel(in._grid);
+    MomentumSpacePropagator(massKernel, m);
+    FreePropagator(in, out, massKernel);
+  }
+  
+  template <class SImpl>
+  void ScalarToProp(typename SImpl::PropagatorField &p,
+                    const typename SImpl::ScalarField &s,
+                    const int c) // NOTE(review): c is never read below - presumably the target column of p; confirm
+  {
+    for(int i = 0; i < SImpl::rDim; ++i) // one pass per colour index, rDim in total
+    {
+      pokeColour(p, peekColour(s, i), i); // colour entry i of s is written to colour index i of p
+    }
+  }
+  
+  template <class SImpl>
+  void PropToScalar(typename SImpl::ScalarField &s,
+                    const typename SImpl::PropagatorField &p,
+                    const int c) // NOTE(review): c is never read below - presumably the source column of p; confirm
+  {
+    for(int i = 0; i < SImpl::rDim; ++i) // one pass per colour index, rDim in total
+    {
+      pokeColour(s, peekColour(p, i), i); // colour index i of p is written to colour entry i of s
+    }
+  }
+}}
+
+#endif

From afbf7d4c37df8f134aa6bb191d1fd29d1709b16e Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Thu, 29 Dec 2016 22:43:38 +0100
Subject: [PATCH 019/177] QED Gimpl moved in Photon.h

---
 extras/Hadrons/Modules/MGauge/StochEm.hpp | 21 ---------------------
 extras/qed-fvol/qed-fvol.cc               | 22 ----------------------
 lib/qcd/action/gauge/Photon.h             | 22 +++++++++++++++++++++-
 3 files changed, 21 insertions(+), 44 deletions(-)

diff --git a/extras/Hadrons/Modules/MGauge/StochEm.hpp b/extras/Hadrons/Modules/MGauge/StochEm.hpp
index 04a7c48c..50a77435 100644
--- a/extras/Hadrons/Modules/MGauge/StochEm.hpp
+++ b/extras/Hadrons/Modules/MGauge/StochEm.hpp
@@ -39,27 +39,6 @@ BEGIN_HADRONS_NAMESPACE
  ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MGauge)
 
-template <class S>
-class QedGimpl
-{
-public:
-    typedef S Simd;
-    
-    template <typename vtype>
-    using iImplGaugeLink  = iScalar<iScalar<iScalar<vtype>>>;
-    template <typename vtype>
-    using iImplGaugeField = iVector<iScalar<iScalar<vtype>>, Nd>;
-    
-    typedef iImplGaugeLink<Simd> SiteGaugeLink;
-    typedef iImplGaugeField<Simd> SiteGaugeField;
-    
-    typedef Lattice<SiteGaugeLink> GaugeLinkField;
-    typedef Lattice<SiteGaugeField> GaugeField;
-};
-
-typedef QedGimpl<vComplex> QedGimplR;
-typedef Photon<QedGimplR>  PhotonR;
-
 class StochEmPar: Serializable
 {
 public:
diff --git a/extras/qed-fvol/qed-fvol.cc b/extras/qed-fvol/qed-fvol.cc
index 951c36ad..3ecac2fc 100644
--- a/extras/qed-fvol/qed-fvol.cc
+++ b/extras/qed-fvol/qed-fvol.cc
@@ -5,29 +5,7 @@ using namespace Grid;
 using namespace QCD;
 using namespace QedFVol;
 
-template <class S> 
-class QedGimpl 
-{
-public:
-  typedef S Simd;
-
-  template <typename vtype>
-  using iImplGaugeLink  = iScalar<iScalar<iScalar<vtype>>>;
-  template <typename vtype>
-  using iImplGaugeField = iVector<iScalar<iScalar<vtype>>, Nd>;
-
-  typedef iImplGaugeLink<Simd> SiteGaugeLink;
-  typedef iImplGaugeField<Simd> SiteGaugeField;
-
-  typedef Lattice<SiteGaugeLink> GaugeLinkField; // bit ugly naming; polarised
-                                                 // gauge field, lorentz... all
-                                                 // ugly
-  typedef Lattice<SiteGaugeField> GaugeField;
-};
-
-typedef QedGimpl<vComplex>              QedGimplR;
 typedef PeriodicGaugeImpl<QedGimplR>    QedPeriodicGimplR;
-typedef Photon<QedGimplR>               PhotonR;
 typedef PhotonR::GaugeField             EmField;
 typedef PhotonR::GaugeLinkField         EmComp;
 
diff --git a/lib/qcd/action/gauge/Photon.h b/lib/qcd/action/gauge/Photon.h
index faa63b42..73405297 100644
--- a/lib/qcd/action/gauge/Photon.h
+++ b/lib/qcd/action/gauge/Photon.h
@@ -28,9 +28,27 @@
 #ifndef QCD_PHOTON_ACTION_H
 #define QCD_PHOTON_ACTION_H
 
-
 namespace Grid{
 namespace QCD{
+  template <class S>
+  class QedGimpl
+  {
+  public:
+    typedef S Simd;
+    
+    template <typename vtype>
+    using iImplGaugeLink  = iScalar<iScalar<iScalar<vtype>>>;
+    template <typename vtype>
+    using iImplGaugeField = iVector<iScalar<iScalar<vtype>>, Nd>;
+    
+    typedef iImplGaugeLink<Simd> SiteGaugeLink;
+    typedef iImplGaugeField<Simd> SiteGaugeField;
+    
+    typedef Lattice<SiteGaugeLink> GaugeLinkField;
+    typedef Lattice<SiteGaugeField> GaugeField;
+  };
+  
+  typedef QedGimpl<vComplex> QedGimplR;
   
   template<class Gimpl>
   class Photon
@@ -56,6 +74,8 @@ namespace QCD{
     ZmScheme zmScheme_;
   };
 
+  typedef Photon<QedGimplR>  PhotonR;
+  
   template<class Gimpl>
   Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme)
   : gauge_(gauge), zmScheme_(zmScheme)

From 4c60e31070f3d05ba7e52c15c4bf6e59644a046a Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Thu, 29 Dec 2016 22:44:08 +0100
Subject: [PATCH 020/177] Hadrons: code cleaning

---
 extras/Hadrons/Modules/Quark.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extras/Hadrons/Modules/Quark.hpp b/extras/Hadrons/Modules/Quark.hpp
index e441a096..0cf7314b 100644
--- a/extras/Hadrons/Modules/Quark.hpp
+++ b/extras/Hadrons/Modules/Quark.hpp
@@ -133,7 +133,7 @@ void TQuark<FImpl>::execute(void)
     for (unsigned int c = 0; c < Nc; ++c)
     {
         LOG(Message) << "Inversion for spin= " << s << ", color= " << c
-        << std::endl;
+                     << std::endl;
         // source conversion for 4D sources
         if (!env().isObject5d(par().source))
         {

From bbc0eff078cfd331ca31f2dd0c95b3030ee8a261 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Thu, 29 Dec 2016 22:44:22 +0100
Subject: [PATCH 021/177] Hadrons: scalar sources

---
 extras/Hadrons/Modules/MSource/Point.hpp | 5 +++--
 extras/Hadrons/Modules/MSource/Z2.hpp    | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/extras/Hadrons/Modules/MSource/Point.hpp b/extras/Hadrons/Modules/MSource/Point.hpp
index a0ecbc2a..8d0b4de8 100644
--- a/extras/Hadrons/Modules/MSource/Point.hpp
+++ b/extras/Hadrons/Modules/MSource/Point.hpp
@@ -63,7 +63,7 @@ template <typename FImpl>
 class TPoint: public Module<PointPar>
 {
 public:
-    TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
 public:
     // constructor
     TPoint(const std::string name);
@@ -78,7 +78,8 @@ public:
     virtual void execute(void);
 };
 
-MODULE_REGISTER_NS(Point, TPoint<FIMPL>, MSource);
+MODULE_REGISTER_NS(Point,       TPoint<FIMPL>,       MSource);
+MODULE_REGISTER_NS(ScalarPoint, TPoint<ScalarImplR>, MSource);
 
 /******************************************************************************
  *                       TPoint template implementation                       *
diff --git a/extras/Hadrons/Modules/MSource/Z2.hpp b/extras/Hadrons/Modules/MSource/Z2.hpp
index cd5727be..6fa49cfe 100644
--- a/extras/Hadrons/Modules/MSource/Z2.hpp
+++ b/extras/Hadrons/Modules/MSource/Z2.hpp
@@ -67,7 +67,7 @@ template <typename FImpl>
 class TZ2: public Module<Z2Par>
 {
 public:
-    TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
 public:
     // constructor
     TZ2(const std::string name);
@@ -82,7 +82,8 @@ public:
     virtual void execute(void);
 };
 
-MODULE_REGISTER_NS(Z2, TZ2<FIMPL>, MSource);
+MODULE_REGISTER_NS(Z2,       TZ2<FIMPL>,       MSource);
+MODULE_REGISTER_NS(ScalarZ2, TZ2<ScalarImplR>, MSource);
 
 /******************************************************************************
  *                       TZ2 template implementation                          *

From 673994b281e6c464b4021c62c80a9976e0035176 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Thu, 29 Dec 2016 22:44:58 +0100
Subject: [PATCH 022/177] Hadrons: modules for scalar propagators

---
 extras/Hadrons/Global.hpp                     | 25 ++++++--
 extras/Hadrons/Modules.hpp                    | 30 +---------
 extras/Hadrons/Modules/MScalar/ChargedProp.cc | 40 +++++++++++++
 .../Hadrons/Modules/MScalar/ChargedProp.hpp   | 44 ++++++++++++++
 extras/Hadrons/Modules/MScalar/FreeProp.cc    | 57 +++++++++++++++++++
 extras/Hadrons/Modules/MScalar/FreeProp.hpp   | 47 +++++++++++++++
 extras/Hadrons/modules.inc                    |  6 +-
 7 files changed, 215 insertions(+), 34 deletions(-)
 create mode 100644 extras/Hadrons/Modules/MScalar/ChargedProp.cc
 create mode 100644 extras/Hadrons/Modules/MScalar/ChargedProp.hpp
 create mode 100644 extras/Hadrons/Modules/MScalar/FreeProp.cc
 create mode 100644 extras/Hadrons/Modules/MScalar/FreeProp.hpp

diff --git a/extras/Hadrons/Global.hpp b/extras/Hadrons/Global.hpp
index 81afab13..bcb282fc 100644
--- a/extras/Hadrons/Global.hpp
+++ b/extras/Hadrons/Global.hpp
@@ -51,23 +51,38 @@ using Grid::operator<<;
  * error with GCC 5 (clang & GCC 6 compile fine without it).
  */
 
-// FIXME: find a way to do that in a more general fashion
 #ifndef FIMPL
 #define FIMPL WilsonImplR
 #endif
+#ifndef SIMPL
+#define SIMPL ScalarImplR
+#endif
 
 BEGIN_HADRONS_NAMESPACE
 
 // type aliases
-#define TYPE_ALIASES(FImpl, suffix)\
+#define FERM_TYPE_ALIASES(FImpl, suffix)\
 typedef FermionOperator<FImpl>                       FMat##suffix;             \
 typedef typename FImpl::FermionField                 FermionField##suffix;     \
 typedef typename FImpl::PropagatorField              PropagatorField##suffix;  \
-typedef typename FImpl::SitePropagator               SitePropagator##suffix;   \
-typedef typename FImpl::DoubledGaugeField            DoubledGaugeField##suffix;\
-typedef std::function<void(FermionField##suffix &,                             \
+typedef typename FImpl::SitePropagator               SitePropagator##suffix;
+
+#define GAUGE_TYPE_ALIASES(FImpl, suffix)\
+typedef typename FImpl::DoubledGaugeField DoubledGaugeField##suffix;
+
+#define SCALAR_TYPE_ALIASES(SImpl, suffix)\
+typedef typename SImpl::ScalarField     ScalarField##suffix;\
+typedef typename SImpl::PropagatorField PropagatorField##suffix;
+
+#define SOLVER_TYPE_ALIASES(FImpl, suffix)\
+typedef std::function<void(FermionField##suffix &,\
                       const FermionField##suffix &)> SolverFn##suffix;
 
+#define TYPE_ALIASES(FImpl, suffix)\
+FERM_TYPE_ALIASES(FImpl, suffix)\
+GAUGE_TYPE_ALIASES(FImpl, suffix)\
+SOLVER_TYPE_ALIASES(FImpl, suffix)
+
 // logger
 class HadronsLogger: public Logger
 {
diff --git a/extras/Hadrons/Modules.hpp b/extras/Hadrons/Modules.hpp
index 5d1a456c..ad31d2a7 100644
--- a/extras/Hadrons/Modules.hpp
+++ b/extras/Hadrons/Modules.hpp
@@ -1,31 +1,3 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid 
-
-Source file: extras/Hadrons/Modules.hpp
-
-Copyright (C) 2015
-Copyright (C) 2016
-
-Author: Antonin Portelli <antonin.portelli@me.com>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
 #include <Grid/Hadrons/Modules/MAction/DWF.hpp>
 #include <Grid/Hadrons/Modules/MAction/Wilson.hpp>
 #include <Grid/Hadrons/Modules/MContraction/Baryon.hpp>
@@ -34,6 +6,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #include <Grid/Hadrons/Modules/MGauge/Random.hpp>
 #include <Grid/Hadrons/Modules/MGauge/StochEm.hpp>
 #include <Grid/Hadrons/Modules/MGauge/Unit.hpp>
+#include <Grid/Hadrons/Modules/MScalar/ChargedProp.hpp>
+#include <Grid/Hadrons/Modules/MScalar/FreeProp.hpp>
 #include <Grid/Hadrons/Modules/MSolver/RBPrecCG.hpp>
 #include <Grid/Hadrons/Modules/MSource/Point.hpp>
 #include <Grid/Hadrons/Modules/MSource/SeqGamma.hpp>
diff --git a/extras/Hadrons/Modules/MScalar/ChargedProp.cc b/extras/Hadrons/Modules/MScalar/ChargedProp.cc
new file mode 100644
index 00000000..1137c6f0
--- /dev/null
+++ b/extras/Hadrons/Modules/MScalar/ChargedProp.cc
@@ -0,0 +1,40 @@
+#include <Grid/Hadrons/Modules/MScalar/ChargedProp.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MScalar;
+
+/******************************************************************************
+*                  TChargedProp implementation                             *
+******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+TChargedProp::TChargedProp(const std::string name)
+: Module<ChargedPropPar>(name)
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+std::vector<std::string> TChargedProp::getInput(void)
+{
+    std::vector<std::string> in;
+    
+    return in;
+}
+
+std::vector<std::string> TChargedProp::getOutput(void)
+{
+    std::vector<std::string> out = {getName()};
+    
+    return out;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+void TChargedProp::setup(void)
+{
+
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+void TChargedProp::execute(void)
+{
+
+}
diff --git a/extras/Hadrons/Modules/MScalar/ChargedProp.hpp b/extras/Hadrons/Modules/MScalar/ChargedProp.hpp
new file mode 100644
index 00000000..7a60c2ad
--- /dev/null
+++ b/extras/Hadrons/Modules/MScalar/ChargedProp.hpp
@@ -0,0 +1,44 @@
+#ifndef Hadrons_ChargedProp_hpp_
+#define Hadrons_ChargedProp_hpp_
+
+#include <Grid/Hadrons/Global.hpp>
+#include <Grid/Hadrons/Module.hpp>
+#include <Grid/Hadrons/ModuleFactory.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                         ChargedProp                                 *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MScalar)
+
+class ChargedPropPar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(ChargedPropPar,
+                                    unsigned int, i);
+};
+
+class TChargedProp: public Module<ChargedPropPar>
+{
+public:
+    // constructor
+    TChargedProp(const std::string name);
+    // destructor
+    virtual ~TChargedProp(void) = default;
+    // dependency relation
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_NS(ChargedProp, TChargedProp, MScalar);
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_ChargedProp_hpp_
diff --git a/extras/Hadrons/Modules/MScalar/FreeProp.cc b/extras/Hadrons/Modules/MScalar/FreeProp.cc
new file mode 100644
index 00000000..7419a954
--- /dev/null
+++ b/extras/Hadrons/Modules/MScalar/FreeProp.cc
@@ -0,0 +1,57 @@
+#include <Grid/Hadrons/Modules/MScalar/FreeProp.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MScalar;
+
+/******************************************************************************
+*                  TFreeProp implementation                             *
+******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+TFreeProp::TFreeProp(const std::string name)
+: Module<FreePropPar>(name)
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+std::vector<std::string> TFreeProp::getInput(void)
+{
+    std::vector<std::string> in = {par().source};
+    
+    return in;
+}
+
+std::vector<std::string> TFreeProp::getOutput(void)
+{
+    std::vector<std::string> out = {getName()};
+    
+    return out;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+void TFreeProp::setup(void)
+{
+    env().registerLattice<ScalarField>(getName());
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+void TFreeProp::execute(void)
+{
+    ScalarField &prop   = *env().createLattice<ScalarField>(getName());
+    ScalarField &source = *env().getObject<ScalarField>(par().source);
+    ScalarField *momKernel;
+    std::string kerName = "_" + getName() + "_momKernel";
+    
+    if (!env().hasCreatedObject(kerName))
+    {
+        LOG(Message) << "Caching momentum space free scalar propagator"
+                     << "(mass= " << par().mass << ")..." << std::endl;
+        momKernel = env().template createLattice<ScalarField>(kerName);
+        Scalar<SIMPL>::MomentumSpacePropagator(*momKernel, par().mass);
+    }
+    else
+    {
+        momKernel = env().getObject<ScalarField>(kerName);
+    }
+    LOG(Message) << "Computing free scalar propagator..." << std::endl;
+    Scalar<SIMPL>::FreePropagator(source, prop, *momKernel);
+}
diff --git a/extras/Hadrons/Modules/MScalar/FreeProp.hpp b/extras/Hadrons/Modules/MScalar/FreeProp.hpp
new file mode 100644
index 00000000..6a0cd930
--- /dev/null
+++ b/extras/Hadrons/Modules/MScalar/FreeProp.hpp
@@ -0,0 +1,47 @@
+#ifndef Hadrons_FreeProp_hpp_
+#define Hadrons_FreeProp_hpp_
+
+#include <Grid/Hadrons/Global.hpp>
+#include <Grid/Hadrons/Module.hpp>
+#include <Grid/Hadrons/ModuleFactory.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                         FreeProp                                 *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MScalar)
+
+class FreePropPar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(FreePropPar,
+                                    std::string, source,
+                                    double,      mass);
+};
+
+class TFreeProp: public Module<FreePropPar>
+{
+public:
+    SCALAR_TYPE_ALIASES(SIMPL,);
+public:
+    // constructor
+    TFreeProp(const std::string name);
+    // destructor
+    virtual ~TFreeProp(void) = default;
+    // dependency relation
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_NS(FreeProp, TFreeProp, MScalar);
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_FreeProp_hpp_
diff --git a/extras/Hadrons/modules.inc b/extras/Hadrons/modules.inc
index 8b559024..b091c38b 100644
--- a/extras/Hadrons/modules.inc
+++ b/extras/Hadrons/modules.inc
@@ -2,7 +2,9 @@ modules_cc =\
   Modules/MGauge/Load.cc \
   Modules/MGauge/Random.cc \
   Modules/MGauge/StochEm.cc \
-  Modules/MGauge/Unit.cc
+  Modules/MGauge/Unit.cc \
+  Modules/MScalar/ChargedProp.cc \
+  Modules/MScalar/FreeProp.cc
 
 modules_hpp =\
   Modules/MAction/DWF.hpp \
@@ -13,6 +15,8 @@ modules_hpp =\
   Modules/MGauge/Random.hpp \
   Modules/MGauge/StochEm.hpp \
   Modules/MGauge/Unit.hpp \
+  Modules/MScalar/ChargedProp.hpp \
+  Modules/MScalar/FreeProp.hpp \
   Modules/MSolver/RBPrecCG.hpp \
   Modules/MSource/Point.hpp \
   Modules/MSource/SeqGamma.hpp \

From 82b3f546970fde47b1e1220679f3c6c772d5eff0 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Thu, 5 Jan 2017 14:58:07 +0000
Subject: [PATCH 023/177] scalar free propagator fix

---
 lib/qcd/action/scalar/Scalar.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/qcd/action/scalar/Scalar.h b/lib/qcd/action/scalar/Scalar.h
index 194f6767..c053e15e 100644
--- a/lib/qcd/action/scalar/Scalar.h
+++ b/lib/qcd/action/scalar/Scalar.h
@@ -148,10 +148,11 @@ namespace QCD{
   void Scalar<SImpl>::MomentumSpacePropagator(ScalarField &out, RealD m)
   {
     GridBase           *grid = out._grid;
-    ScalarField        kmu(grid);
+    ScalarField        kmu(grid), one(grid);
     const unsigned int nd    = grid->_ndimension;
     std::vector<int>   &l    = grid->_fdimensions;
     
+    one = Complex(1.0,0.0);
     out = m*m;
     for(int mu = 0; mu < nd; mu++)
     {
@@ -161,6 +162,7 @@ namespace QCD{
       kmu = 2.*sin(.5*twoPiL*kmu);
       out = out + kmu*kmu;
     }
+    out = one/out;
   }
   
   template <typename SImpl>

From 97843e2b5818667ab5f6802003bb4c04d6076503 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Thu, 5 Jan 2017 14:58:55 +0000
Subject: [PATCH 024/177] Hadrons: free scalar buffer fix and output

---
 extras/Hadrons/Modules/MScalar/FreeProp.cc  | 34 ++++++++++++++++++---
 extras/Hadrons/Modules/MScalar/FreeProp.hpp |  5 +--
 2 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/extras/Hadrons/Modules/MScalar/FreeProp.cc b/extras/Hadrons/Modules/MScalar/FreeProp.cc
index 7419a954..ba85e041 100644
--- a/extras/Hadrons/Modules/MScalar/FreeProp.cc
+++ b/extras/Hadrons/Modules/MScalar/FreeProp.cc
@@ -1,11 +1,13 @@
 #include <Grid/Hadrons/Modules/MScalar/FreeProp.hpp>
 
+#define KERNAME "_" + getName() + "_momKernel"
+
 using namespace Grid;
 using namespace Hadrons;
 using namespace MScalar;
 
 /******************************************************************************
-*                  TFreeProp implementation                             *
+*                        TFreeProp implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 TFreeProp::TFreeProp(const std::string name)
@@ -30,6 +32,12 @@ std::vector<std::string> TFreeProp::getOutput(void)
 // setup ///////////////////////////////////////////////////////////////////////
 void TFreeProp::setup(void)
 {
+    std::string kerName = KERNAME;
+    
+    if (!env().hasRegisteredObject(kerName))
+    {
+        env().registerLattice<ScalarField>(kerName);
+    }
     env().registerLattice<ScalarField>(getName());
 }
 
@@ -39,13 +47,13 @@ void TFreeProp::execute(void)
     ScalarField &prop   = *env().createLattice<ScalarField>(getName());
     ScalarField &source = *env().getObject<ScalarField>(par().source);
     ScalarField *momKernel;
-    std::string kerName = "_" + getName() + "_momKernel";
-    
+    std::string kerName = KERNAME;
+
     if (!env().hasCreatedObject(kerName))
     {
         LOG(Message) << "Caching momentum space free scalar propagator"
-                     << "(mass= " << par().mass << ")..." << std::endl;
-        momKernel = env().template createLattice<ScalarField>(kerName);
+                     << " (mass= " << par().mass << ")..." << std::endl;
+        momKernel = env().createLattice<ScalarField>(kerName);
         Scalar<SIMPL>::MomentumSpacePropagator(*momKernel, par().mass);
     }
     else
@@ -54,4 +62,20 @@ void TFreeProp::execute(void)
     }
     LOG(Message) << "Computing free scalar propagator..." << std::endl;
     Scalar<SIMPL>::FreePropagator(source, prop, *momKernel);
+    
+    if (!par().output.empty())
+    {
+        TextWriter            writer(par().output + "." +
+                                     std::to_string(env().getTrajectory()));
+        std::vector<TComplex> buf;
+        std::vector<Complex>  result;
+        
+        sliceSum(prop, buf, Tp);
+        result.resize(buf.size());
+        for (unsigned int t = 0; t < buf.size(); ++t)
+        {
+            result[t] = TensorRemove(buf[t]);
+        }
+        write(writer, "prop", result);
+    }
 }
diff --git a/extras/Hadrons/Modules/MScalar/FreeProp.hpp b/extras/Hadrons/Modules/MScalar/FreeProp.hpp
index 6a0cd930..81bb8121 100644
--- a/extras/Hadrons/Modules/MScalar/FreeProp.hpp
+++ b/extras/Hadrons/Modules/MScalar/FreeProp.hpp
@@ -8,7 +8,7 @@
 BEGIN_HADRONS_NAMESPACE
 
 /******************************************************************************
- *                         FreeProp                                 *
+ *                               FreeProp                                     *
  ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MScalar)
 
@@ -17,7 +17,8 @@ class FreePropPar: Serializable
 public:
     GRID_SERIALIZABLE_CLASS_MEMBERS(FreePropPar,
                                     std::string, source,
-                                    double,      mass);
+                                    double,      mass,
+                                    std::string, output);
 };
 
 class TFreeProp: public Module<FreePropPar>

From fc760016b3e12b5fea25c8ca288525d4e2dad7c7 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Wed, 11 Jan 2017 18:39:58 +0000
Subject: [PATCH 025/177] More uniform cache name for scalar momentum
 propagators

---
 extras/Hadrons/Modules.hpp                  |  1 +
 extras/Hadrons/Modules/MScalar/FreeProp.cc  | 22 ++++++++++-----------
 extras/Hadrons/Modules/MScalar/FreeProp.hpp |  2 ++
 extras/Hadrons/Modules/MScalar/Scalar.hpp   |  6 ++++++
 extras/Hadrons/modules.inc                  |  1 +
 5 files changed, 20 insertions(+), 12 deletions(-)
 create mode 100644 extras/Hadrons/Modules/MScalar/Scalar.hpp

diff --git a/extras/Hadrons/Modules.hpp b/extras/Hadrons/Modules.hpp
index ad31d2a7..a25419c5 100644
--- a/extras/Hadrons/Modules.hpp
+++ b/extras/Hadrons/Modules.hpp
@@ -8,6 +8,7 @@
 #include <Grid/Hadrons/Modules/MGauge/Unit.hpp>
 #include <Grid/Hadrons/Modules/MScalar/ChargedProp.hpp>
 #include <Grid/Hadrons/Modules/MScalar/FreeProp.hpp>
+#include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>
 #include <Grid/Hadrons/Modules/MSolver/RBPrecCG.hpp>
 #include <Grid/Hadrons/Modules/MSource/Point.hpp>
 #include <Grid/Hadrons/Modules/MSource/SeqGamma.hpp>
diff --git a/extras/Hadrons/Modules/MScalar/FreeProp.cc b/extras/Hadrons/Modules/MScalar/FreeProp.cc
index ba85e041..f0a503ff 100644
--- a/extras/Hadrons/Modules/MScalar/FreeProp.cc
+++ b/extras/Hadrons/Modules/MScalar/FreeProp.cc
@@ -1,6 +1,5 @@
 #include <Grid/Hadrons/Modules/MScalar/FreeProp.hpp>
-
-#define KERNAME "_" + getName() + "_momKernel"
+#include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>
 
 using namespace Grid;
 using namespace Hadrons;
@@ -32,11 +31,11 @@ std::vector<std::string> TFreeProp::getOutput(void)
 // setup ///////////////////////////////////////////////////////////////////////
 void TFreeProp::setup(void)
 {
-    std::string kerName = KERNAME;
+    freeMomPropName_ = FREEMOMPROP(par().mass);
     
-    if (!env().hasRegisteredObject(kerName))
+    if (!env().hasRegisteredObject(freeMomPropName_))
     {
-        env().registerLattice<ScalarField>(kerName);
+        env().registerLattice<ScalarField>(freeMomPropName_);
     }
     env().registerLattice<ScalarField>(getName());
 }
@@ -46,22 +45,21 @@ void TFreeProp::execute(void)
 {
     ScalarField &prop   = *env().createLattice<ScalarField>(getName());
     ScalarField &source = *env().getObject<ScalarField>(par().source);
-    ScalarField *momKernel;
-    std::string kerName = KERNAME;
+    ScalarField *freeMomProp;
 
-    if (!env().hasCreatedObject(kerName))
+    if (!env().hasCreatedObject(freeMomPropName_))
     {
         LOG(Message) << "Caching momentum space free scalar propagator"
                      << " (mass= " << par().mass << ")..." << std::endl;
-        momKernel = env().createLattice<ScalarField>(kerName);
-        Scalar<SIMPL>::MomentumSpacePropagator(*momKernel, par().mass);
+        freeMomProp = env().createLattice<ScalarField>(freeMomPropName_);
+        Scalar<SIMPL>::MomentumSpacePropagator(*freeMomProp, par().mass);
     }
     else
     {
-        momKernel = env().getObject<ScalarField>(kerName);
+        freeMomProp = env().getObject<ScalarField>(freeMomPropName_);
     }
     LOG(Message) << "Computing free scalar propagator..." << std::endl;
-    Scalar<SIMPL>::FreePropagator(source, prop, *momKernel);
+    Scalar<SIMPL>::FreePropagator(source, prop, *freeMomProp);
     
     if (!par().output.empty())
     {
diff --git a/extras/Hadrons/Modules/MScalar/FreeProp.hpp b/extras/Hadrons/Modules/MScalar/FreeProp.hpp
index 81bb8121..29f15eda 100644
--- a/extras/Hadrons/Modules/MScalar/FreeProp.hpp
+++ b/extras/Hadrons/Modules/MScalar/FreeProp.hpp
@@ -37,6 +37,8 @@ public:
     virtual void setup(void);
     // execution
     virtual void execute(void);
+private:
+    std::string freeMomPropName_;
 };
 
 MODULE_REGISTER_NS(FreeProp, TFreeProp, MScalar);
diff --git a/extras/Hadrons/Modules/MScalar/Scalar.hpp b/extras/Hadrons/Modules/MScalar/Scalar.hpp
new file mode 100644
index 00000000..db702ff2
--- /dev/null
+++ b/extras/Hadrons/Modules/MScalar/Scalar.hpp
@@ -0,0 +1,6 @@
+#ifndef Hadrons_Scalar_hpp_
+#define Hadrons_Scalar_hpp_
+
+#define FREEMOMPROP(m) "_scalar_mom_prop_" + std::to_string(m)
+
+#endif // Hadrons_Scalar_hpp_
diff --git a/extras/Hadrons/modules.inc b/extras/Hadrons/modules.inc
index b091c38b..dfbe85ff 100644
--- a/extras/Hadrons/modules.inc
+++ b/extras/Hadrons/modules.inc
@@ -17,6 +17,7 @@ modules_hpp =\
   Modules/MGauge/Unit.hpp \
   Modules/MScalar/ChargedProp.hpp \
   Modules/MScalar/FreeProp.hpp \
+  Modules/MScalar/Scalar.hpp \
   Modules/MSolver/RBPrecCG.hpp \
   Modules/MSource/Point.hpp \
   Modules/MSource/SeqGamma.hpp \

From ad98b6193d4b08ad42c9da79370b4ccd7382b4cb Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Wed, 11 Jan 2017 18:40:43 +0000
Subject: [PATCH 026/177] creating the necessary caches for the FFT EM scalar
 propagator

---
 extras/Hadrons/Modules/MScalar/ChargedProp.cc | 68 ++++++++++++++++++-
 .../Hadrons/Modules/MScalar/ChargedProp.hpp   | 10 ++-
 2 files changed, 74 insertions(+), 4 deletions(-)

diff --git a/extras/Hadrons/Modules/MScalar/ChargedProp.cc b/extras/Hadrons/Modules/MScalar/ChargedProp.cc
index 1137c6f0..1cd0cae6 100644
--- a/extras/Hadrons/Modules/MScalar/ChargedProp.cc
+++ b/extras/Hadrons/Modules/MScalar/ChargedProp.cc
@@ -1,4 +1,5 @@
 #include <Grid/Hadrons/Modules/MScalar/ChargedProp.hpp>
+#include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>
 
 using namespace Grid;
 using namespace Hadrons;
@@ -15,7 +16,7 @@ TChargedProp::TChargedProp(const std::string name)
 // dependencies/products ///////////////////////////////////////////////////////
 std::vector<std::string> TChargedProp::getInput(void)
 {
-    std::vector<std::string> in;
+    std::vector<std::string> in = {par().source, par().emField};
     
     return in;
 }
@@ -30,11 +31,72 @@ std::vector<std::string> TChargedProp::getOutput(void)
 // setup ///////////////////////////////////////////////////////////////////////
 void TChargedProp::setup(void)
 {
-
+    freeMomPropName_ = FREEMOMPROP(par().mass);
+    shiftedMomPropName_.clear();
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    {
+        shiftedMomPropName_.push_back(freeMomPropName_ + "_"
+                                      + std::to_string(mu));
+    }
+    if (!env().hasRegisteredObject(freeMomPropName_))
+    {
+        env().registerLattice<ScalarField>(freeMomPropName_);
+    }
+    if (!env().hasRegisteredObject(shiftedMomPropName_[0]))
+    {
+        for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+        {
+            env().registerLattice<ScalarField>(shiftedMomPropName_[mu]);
+        }
+    }
+    env().registerLattice<ScalarField>(getName());
+    
 }
 
 // execution ///////////////////////////////////////////////////////////////////
 void TChargedProp::execute(void)
 {
-
+    ScalarField &prop   = *env().createLattice<ScalarField>(getName());
+    ScalarField &source = *env().getObject<ScalarField>(par().source);
+    ScalarField *freeMomProp;
+    std::vector<ScalarField *> shiftedMomProp;
+    Complex                    ci(0.0,1.0);
+    
+    if (!env().hasCreatedObject(freeMomPropName_))
+    {
+        LOG(Message) << "Caching momentum space free scalar propagator"
+                     << " (mass= " << par().mass << ")..." << std::endl;
+        freeMomProp = env().createLattice<ScalarField>(freeMomPropName_);
+        Scalar<SIMPL>::MomentumSpacePropagator(*freeMomProp, par().mass);
+    }
+    else
+    {
+        freeMomProp = env().getObject<ScalarField>(freeMomPropName_);
+    }
+    if (!env().hasCreatedObject(shiftedMomPropName_[0]))
+    {
+        std::vector<int> &l = env().getGrid()->_fdimensions;
+        
+        LOG(Message) << "Caching shifted momentum space free scalar propagator"
+                     << " (mass= " << par().mass << ")..." << std::endl;
+        for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+        {
+            Real    twoPiL = M_PI*2./l[mu];
+            
+            shiftedMomProp.push_back(
+                env().createLattice<ScalarField>(shiftedMomPropName_[mu]));
+            LatticeCoordinate(*(shiftedMomProp[mu]), mu);
+            *(shiftedMomProp[mu]) = exp(ci*twoPiL*(*(shiftedMomProp[mu])))
+                                    *(*freeMomProp);
+        }
+    }
+    else
+    {
+        for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+        {
+            shiftedMomProp.push_back(
+                env().getObject<ScalarField>(shiftedMomPropName_[mu]));
+        }
+    }
+    
 }
diff --git a/extras/Hadrons/Modules/MScalar/ChargedProp.hpp b/extras/Hadrons/Modules/MScalar/ChargedProp.hpp
index 7a60c2ad..91ea2355 100644
--- a/extras/Hadrons/Modules/MScalar/ChargedProp.hpp
+++ b/extras/Hadrons/Modules/MScalar/ChargedProp.hpp
@@ -16,11 +16,16 @@ class ChargedPropPar: Serializable
 {
 public:
     GRID_SERIALIZABLE_CLASS_MEMBERS(ChargedPropPar,
-                                    unsigned int, i);
+                                    std::string, emField,
+                                    std::string, source,
+                                    double,      mass,
+                                    std::string, output);
 };
 
 class TChargedProp: public Module<ChargedPropPar>
 {
+public:
+    SCALAR_TYPE_ALIASES(SIMPL,);
 public:
     // constructor
     TChargedProp(const std::string name);
@@ -33,6 +38,9 @@ public:
     virtual void setup(void);
     // execution
     virtual void execute(void);
+private:
+    std::string              freeMomPropName_;
+    std::vector<std::string> shiftedMomPropName_;
 };
 
 MODULE_REGISTER_NS(ChargedProp, TChargedProp, MScalar);

From 889d828bc289d2ee4ea5939af29ff56fe7466db5 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Thu, 12 Jan 2017 18:17:44 +0000
Subject: [PATCH 027/177] Code cleaning

---
 extras/Hadrons/Modules/MScalar/ChargedProp.cc  | 2 +-
 extras/Hadrons/Modules/MScalar/ChargedProp.hpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/extras/Hadrons/Modules/MScalar/ChargedProp.cc b/extras/Hadrons/Modules/MScalar/ChargedProp.cc
index 1cd0cae6..ff53fa0b 100644
--- a/extras/Hadrons/Modules/MScalar/ChargedProp.cc
+++ b/extras/Hadrons/Modules/MScalar/ChargedProp.cc
@@ -6,7 +6,7 @@ using namespace Hadrons;
 using namespace MScalar;
 
 /******************************************************************************
-*                  TChargedProp implementation                             *
+*                     TChargedProp implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 TChargedProp::TChargedProp(const std::string name)
diff --git a/extras/Hadrons/Modules/MScalar/ChargedProp.hpp b/extras/Hadrons/Modules/MScalar/ChargedProp.hpp
index 91ea2355..001f6494 100644
--- a/extras/Hadrons/Modules/MScalar/ChargedProp.hpp
+++ b/extras/Hadrons/Modules/MScalar/ChargedProp.hpp
@@ -8,7 +8,7 @@
 BEGIN_HADRONS_NAMESPACE
 
 /******************************************************************************
- *                         ChargedProp                                 *
+ *                       Charged scalar propagator                            *
  ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MScalar)
 

From 65987a8a5810434b3f7ee54e8b6a4cf400108c74 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Thu, 12 Jan 2017 20:44:23 +0000
Subject: [PATCH 028/177] First implementation of the scalar QED propagator,
 runs but absolutely not checked

---
 extras/Hadrons/Modules/MScalar/ChargedProp.cc | 122 +++++++++++++++---
 .../Hadrons/Modules/MScalar/ChargedProp.hpp   |  13 +-
 2 files changed, 115 insertions(+), 20 deletions(-)

diff --git a/extras/Hadrons/Modules/MScalar/ChargedProp.cc b/extras/Hadrons/Modules/MScalar/ChargedProp.cc
index ff53fa0b..dd260798 100644
--- a/extras/Hadrons/Modules/MScalar/ChargedProp.cc
+++ b/extras/Hadrons/Modules/MScalar/ChargedProp.cc
@@ -32,23 +32,28 @@ std::vector<std::string> TChargedProp::getOutput(void)
 void TChargedProp::setup(void)
 {
     freeMomPropName_ = FREEMOMPROP(par().mass);
-    shiftedMomPropName_.clear();
+    phaseName_.clear();
     for (unsigned int mu = 0; mu < env().getNd(); ++mu)
     {
-        shiftedMomPropName_.push_back(freeMomPropName_ + "_"
+        phaseName_.push_back(freeMomPropName_ + "_"
                                       + std::to_string(mu));
     }
+    GFSrcName_ = "_" + getName() + "_DinvSrc";
     if (!env().hasRegisteredObject(freeMomPropName_))
     {
         env().registerLattice<ScalarField>(freeMomPropName_);
     }
-    if (!env().hasRegisteredObject(shiftedMomPropName_[0]))
+    if (!env().hasRegisteredObject(phaseName_[0]))
     {
         for (unsigned int mu = 0; mu < env().getNd(); ++mu)
         {
-            env().registerLattice<ScalarField>(shiftedMomPropName_[mu]);
+            env().registerLattice<ScalarField>(phaseName_[mu]);
         }
     }
+    if (!env().hasRegisteredObject(GFSrcName_))
+    {
+        env().registerLattice<ScalarField>(GFSrcName_);
+    }
     env().registerLattice<ScalarField>(getName());
     
 }
@@ -56,24 +61,26 @@ void TChargedProp::setup(void)
 // execution ///////////////////////////////////////////////////////////////////
 void TChargedProp::execute(void)
 {
+    // CACHING ANALYTIC EXPRESSIONS
     ScalarField &prop   = *env().createLattice<ScalarField>(getName());
     ScalarField &source = *env().getObject<ScalarField>(par().source);
-    ScalarField *freeMomProp;
-    std::vector<ScalarField *> shiftedMomProp;
-    Complex                    ci(0.0,1.0);
+    Complex     ci(0.0,1.0);
+    FFT         fft(env().getGrid());
     
+    // cache free scalar propagator
     if (!env().hasCreatedObject(freeMomPropName_))
     {
         LOG(Message) << "Caching momentum space free scalar propagator"
                      << " (mass= " << par().mass << ")..." << std::endl;
-        freeMomProp = env().createLattice<ScalarField>(freeMomPropName_);
-        Scalar<SIMPL>::MomentumSpacePropagator(*freeMomProp, par().mass);
+        freeMomProp_ = env().createLattice<ScalarField>(freeMomPropName_);
+        Scalar<SIMPL>::MomentumSpacePropagator(*freeMomProp_, par().mass);
     }
     else
     {
-        freeMomProp = env().getObject<ScalarField>(freeMomPropName_);
+        freeMomProp_ = env().getObject<ScalarField>(freeMomPropName_);
     }
-    if (!env().hasCreatedObject(shiftedMomPropName_[0]))
+    // cache phases
+    if (!env().hasCreatedObject(phaseName_[0]))
     {
         std::vector<int> &l = env().getGrid()->_fdimensions;
         
@@ -83,20 +90,99 @@ void TChargedProp::execute(void)
         {
             Real    twoPiL = M_PI*2./l[mu];
             
-            shiftedMomProp.push_back(
-                env().createLattice<ScalarField>(shiftedMomPropName_[mu]));
-            LatticeCoordinate(*(shiftedMomProp[mu]), mu);
-            *(shiftedMomProp[mu]) = exp(ci*twoPiL*(*(shiftedMomProp[mu])))
-                                    *(*freeMomProp);
+            phase_.push_back(env().createLattice<ScalarField>(phaseName_[mu]));
+            LatticeCoordinate(*(phase_[mu]), mu);
+            *(phase_[mu]) = exp(ci*twoPiL*(*(phase_[mu])));
         }
     }
     else
     {
         for (unsigned int mu = 0; mu < env().getNd(); ++mu)
         {
-            shiftedMomProp.push_back(
-                env().getObject<ScalarField>(shiftedMomPropName_[mu]));
+            phase_.push_back(env().getObject<ScalarField>(phaseName_[mu]));
         }
     }
+    // cache G*F*src
+    if (!env().hasCreatedObject(GFSrcName_))
+        
+    {
+        GFSrc_ = env().createLattice<ScalarField>(GFSrcName_);
+        fft.FFT_all_dim(*GFSrc_, source, FFT::forward);
+        *GFSrc_ = (*freeMomProp_)*(*GFSrc_);
+    }
+    else
+    {
+        GFSrc_ = env().getObject<ScalarField>(GFSrcName_);
+    }
     
+    // PROPAGATOR CALCULATION
+    ScalarField buf(env().getGrid());
+    ScalarField &GFSrc = *GFSrc_, &G = *freeMomProp_;
+    double      q = par().charge;
+    
+    // G*F*Src
+    prop = GFSrc;
+    // - q*G*momD1*G*F*Src (momD1 = F*D1*Finv)
+    buf = GFSrc;
+    momD1(buf, fft);
+    buf = G*buf;
+    prop = prop - q*buf;
+    // + q^2*G*momD1*G*momD1*G*F*Src (here buf = G*momD1*G*F*Src)
+    momD1(buf, fft);
+    prop = prop + q*q*G*buf;
+    // + q^2*G*momD2*G*F*Src (momD1 = F*D2*Finv)
+    buf = GFSrc;
+    momD2(buf, fft);
+    prop = prop + q*q*G*buf;
+    // final FT
+    fft.FFT_all_dim(prop, prop, FFT::backward);
+}
+
+void TChargedProp::momD1(ScalarField &s, FFT &fft)
+{
+    EmField     &A = *env().getObject<EmField>(par().emField);
+    ScalarField buf(env().getGrid()), Amu(env().getGrid());
+    Complex     ci(0.0,1.0);
+    
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    {
+        Amu = peekLorentz(A, mu);
+        fft.FFT_all_dim(buf, s, FFT::backward);
+        buf = Amu*buf;
+        fft.FFT_all_dim(buf, buf, FFT::forward);
+        s = s + ci*adj(*phase_[mu])*buf;
+    }
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    {
+        Amu = peekLorentz(A, mu);
+        buf = (*phase_[mu])*s;
+        fft.FFT_all_dim(buf, buf, FFT::backward);
+        buf = Amu*buf;
+        fft.FFT_all_dim(buf, buf, FFT::forward);
+        s = s - ci*buf;
+    }
+}
+
+void TChargedProp::momD2(ScalarField &s, FFT &fft)
+{
+    EmField     &A = *env().getObject<EmField>(par().emField);
+    ScalarField buf(env().getGrid()), Amu(env().getGrid());
+    
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    {
+        Amu = peekLorentz(A, mu);
+        fft.FFT_all_dim(buf, s, FFT::backward);
+        buf = Amu*Amu*buf;
+        fft.FFT_all_dim(buf, buf, FFT::forward);
+        s = s + .5*adj(*phase_[mu])*buf;
+    }
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    {
+        Amu = peekLorentz(A, mu);
+        buf = (*phase_[mu])*s;
+        fft.FFT_all_dim(buf, buf, FFT::backward);
+        buf = Amu*Amu*buf;
+        fft.FFT_all_dim(buf, buf, FFT::forward);
+        s = s + .5*buf;
+    }
 }
diff --git a/extras/Hadrons/Modules/MScalar/ChargedProp.hpp b/extras/Hadrons/Modules/MScalar/ChargedProp.hpp
index 001f6494..8bb5faa0 100644
--- a/extras/Hadrons/Modules/MScalar/ChargedProp.hpp
+++ b/extras/Hadrons/Modules/MScalar/ChargedProp.hpp
@@ -19,6 +19,7 @@ public:
                                     std::string, emField,
                                     std::string, source,
                                     double,      mass,
+                                    double,      charge,
                                     std::string, output);
 };
 
@@ -26,6 +27,8 @@ class TChargedProp: public Module<ChargedPropPar>
 {
 public:
     SCALAR_TYPE_ALIASES(SIMPL,);
+    typedef PhotonR::GaugeField     EmField;
+    typedef PhotonR::GaugeLinkField EmComp;
 public:
     // constructor
     TChargedProp(const std::string name);
@@ -39,8 +42,14 @@ public:
     // execution
     virtual void execute(void);
 private:
-    std::string              freeMomPropName_;
-    std::vector<std::string> shiftedMomPropName_;
+    void momD1(ScalarField &s, FFT &fft);
+    void momD2(ScalarField &s, FFT &fft);
+private:
+    std::string                freeMomPropName_, GFSrcName_;
+    std::vector<std::string>   phaseName_;
+    ScalarField                *freeMomProp_, *GFSrc_;
+    std::vector<ScalarField *> phase_;
+    EmField                    *A;
 };
 
 MODULE_REGISTER_NS(ChargedProp, TChargedProp, MScalar);

From 92f8950a5658f75fa4e184fdffda492d5e45b200 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Fri, 13 Jan 2017 13:30:56 +0000
Subject: [PATCH 029/177] Charged scalar prop: cleaning and output

---
 extras/Hadrons/Modules/MScalar/ChargedProp.cc | 61 +++++++++++++------
 1 file changed, 42 insertions(+), 19 deletions(-)

diff --git a/extras/Hadrons/Modules/MScalar/ChargedProp.cc b/extras/Hadrons/Modules/MScalar/ChargedProp.cc
index dd260798..f8323705 100644
--- a/extras/Hadrons/Modules/MScalar/ChargedProp.cc
+++ b/extras/Hadrons/Modules/MScalar/ChargedProp.cc
@@ -35,8 +35,7 @@ void TChargedProp::setup(void)
     phaseName_.clear();
     for (unsigned int mu = 0; mu < env().getNd(); ++mu)
     {
-        phaseName_.push_back(freeMomPropName_ + "_"
-                                      + std::to_string(mu));
+        phaseName_.push_back("_shiftphase_" + std::to_string(mu));
     }
     GFSrcName_ = "_" + getName() + "_DinvSrc";
     if (!env().hasRegisteredObject(freeMomPropName_))
@@ -55,14 +54,12 @@ void TChargedProp::setup(void)
         env().registerLattice<ScalarField>(GFSrcName_);
     }
     env().registerLattice<ScalarField>(getName());
-    
 }
 
 // execution ///////////////////////////////////////////////////////////////////
 void TChargedProp::execute(void)
 {
     // CACHING ANALYTIC EXPRESSIONS
-    ScalarField &prop   = *env().createLattice<ScalarField>(getName());
     ScalarField &source = *env().getObject<ScalarField>(par().source);
     Complex     ci(0.0,1.0);
     FFT         fft(env().getGrid());
@@ -79,13 +76,24 @@ void TChargedProp::execute(void)
     {
         freeMomProp_ = env().getObject<ScalarField>(freeMomPropName_);
     }
+    // cache G*F*src
+    if (!env().hasCreatedObject(GFSrcName_))
+        
+    {
+        GFSrc_ = env().createLattice<ScalarField>(GFSrcName_);
+        fft.FFT_all_dim(*GFSrc_, source, FFT::forward);
+        *GFSrc_ = (*freeMomProp_)*(*GFSrc_);
+    }
+    else
+    {
+        GFSrc_ = env().getObject<ScalarField>(GFSrcName_);
+    }
     // cache phases
     if (!env().hasCreatedObject(phaseName_[0]))
     {
         std::vector<int> &l = env().getGrid()->_fdimensions;
         
-        LOG(Message) << "Caching shifted momentum space free scalar propagator"
-                     << " (mass= " << par().mass << ")..." << std::endl;
+        LOG(Message) << "Caching shift phases..." << std::endl;
         for (unsigned int mu = 0; mu < env().getNd(); ++mu)
         {
             Real    twoPiL = M_PI*2./l[mu];
@@ -102,20 +110,13 @@ void TChargedProp::execute(void)
             phase_.push_back(env().getObject<ScalarField>(phaseName_[mu]));
         }
     }
-    // cache G*F*src
-    if (!env().hasCreatedObject(GFSrcName_))
-        
-    {
-        GFSrc_ = env().createLattice<ScalarField>(GFSrcName_);
-        fft.FFT_all_dim(*GFSrc_, source, FFT::forward);
-        *GFSrc_ = (*freeMomProp_)*(*GFSrc_);
-    }
-    else
-    {
-        GFSrc_ = env().getObject<ScalarField>(GFSrcName_);
-    }
-    
+
     // PROPAGATOR CALCULATION
+    LOG(Message) << "Computing charged scalar propagator"
+                 << " (mass= " << par().mass
+                 << ", charge= " << par().charge << ")..." << std::endl;
+    
+    ScalarField &prop   = *env().createLattice<ScalarField>(getName());
     ScalarField buf(env().getGrid());
     ScalarField &GFSrc = *GFSrc_, &G = *freeMomProp_;
     double      q = par().charge;
@@ -136,6 +137,28 @@ void TChargedProp::execute(void)
     prop = prop + q*q*G*buf;
     // final FT
     fft.FFT_all_dim(prop, prop, FFT::backward);
+    
+    // OUTPUT IF NECESSARY
+    if (!par().output.empty())
+    {
+        std::string           filename = par().output + "." +
+                                         std::to_string(env().getTrajectory());
+        
+        LOG(Message) << "Saving zero-momentum projection to '"
+                     << filename << "'..." << std::endl;
+        
+        TextWriter            writer(filename);
+        std::vector<TComplex> vecBuf;
+        std::vector<Complex>  result;
+        
+        sliceSum(prop, vecBuf, Tp);
+        result.resize(vecBuf.size());
+        for (unsigned int t = 0; t < vecBuf.size(); ++t)
+        {
+            result[t] = TensorRemove(vecBuf[t]);
+        }
+        write(writer, "prop", result);
+    }
 }
 
 void TChargedProp::momD1(ScalarField &s, FFT &fft)

From ae99e99da235ebf5d9a149ef4a7ad0b93d1f7474 Mon Sep 17 00:00:00 2001
From: James Harrison <jch1g10@soton.ac.uk>
Date: Mon, 23 Jan 2017 17:27:50 +0000
Subject: [PATCH 030/177] Fixed bug in ChargedProp

---
 extras/Hadrons/Modules/MScalar/ChargedProp.cc | 40 ++++++++++++-------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/extras/Hadrons/Modules/MScalar/ChargedProp.cc b/extras/Hadrons/Modules/MScalar/ChargedProp.cc
index f8323705..d88fdc45 100644
--- a/extras/Hadrons/Modules/MScalar/ChargedProp.cc
+++ b/extras/Hadrons/Modules/MScalar/ChargedProp.cc
@@ -123,18 +123,22 @@ void TChargedProp::execute(void)
     
     // G*F*Src
     prop = GFSrc;
+
     // - q*G*momD1*G*F*Src (momD1 = F*D1*Finv)
     buf = GFSrc;
     momD1(buf, fft);
     buf = G*buf;
     prop = prop - q*buf;
+
     // + q^2*G*momD1*G*momD1*G*F*Src (here buf = G*momD1*G*F*Src)
     momD1(buf, fft);
     prop = prop + q*q*G*buf;
-    // + q^2*G*momD2*G*F*Src (momD1 = F*D2*Finv)
+
+    // - q^2*G*momD2*G*F*Src (momD2 = F*D2*Finv)
     buf = GFSrc;
     momD2(buf, fft);
-    prop = prop + q*q*G*buf;
+    prop = prop - q*q*G*buf;
+
     // final FT
     fft.FFT_all_dim(prop, prop, FFT::backward);
     
@@ -164,16 +168,18 @@ void TChargedProp::execute(void)
 void TChargedProp::momD1(ScalarField &s, FFT &fft)
 {
     EmField     &A = *env().getObject<EmField>(par().emField);
-    ScalarField buf(env().getGrid()), Amu(env().getGrid());
+    ScalarField buf(env().getGrid()), fs(env().getGrid()), result(env().getGrid()), Amu(env().getGrid());
     Complex     ci(0.0,1.0);
-    
+
+    result = zero;
+
+    fft.FFT_all_dim(fs, s, FFT::backward);
     for (unsigned int mu = 0; mu < env().getNd(); ++mu)
     {
         Amu = peekLorentz(A, mu);
-        fft.FFT_all_dim(buf, s, FFT::backward);
-        buf = Amu*buf;
+        buf = Amu*fs;
         fft.FFT_all_dim(buf, buf, FFT::forward);
-        s = s + ci*adj(*phase_[mu])*buf;
+        result = result + ci*adj(*phase_[mu])*buf;
     }
     for (unsigned int mu = 0; mu < env().getNd(); ++mu)
     {
@@ -182,22 +188,26 @@ void TChargedProp::momD1(ScalarField &s, FFT &fft)
         fft.FFT_all_dim(buf, buf, FFT::backward);
         buf = Amu*buf;
         fft.FFT_all_dim(buf, buf, FFT::forward);
-        s = s - ci*buf;
+        result = result - ci*buf;
     }
+
+    s = result;
 }
 
 void TChargedProp::momD2(ScalarField &s, FFT &fft)
 {
     EmField     &A = *env().getObject<EmField>(par().emField);
-    ScalarField buf(env().getGrid()), Amu(env().getGrid());
+    ScalarField buf(env().getGrid()), fs(env().getGrid()), result(env().getGrid()), Amu(env().getGrid());
+
+    result = zero;
     
+    fft.FFT_all_dim(fs, s, FFT::backward);
     for (unsigned int mu = 0; mu < env().getNd(); ++mu)
     {
-        Amu = peekLorentz(A, mu);
-        fft.FFT_all_dim(buf, s, FFT::backward);
-        buf = Amu*Amu*buf;
+        Amu = peekLorentz(A, mu);        
+        buf = Amu*Amu*fs;
         fft.FFT_all_dim(buf, buf, FFT::forward);
-        s = s + .5*adj(*phase_[mu])*buf;
+        result = result + .5*adj(*phase_[mu])*buf;
     }
     for (unsigned int mu = 0; mu < env().getNd(); ++mu)
     {
@@ -206,6 +216,8 @@ void TChargedProp::momD2(ScalarField &s, FFT &fft)
         fft.FFT_all_dim(buf, buf, FFT::backward);
         buf = Amu*Amu*buf;
         fft.FFT_all_dim(buf, buf, FFT::forward);
-        s = s + .5*buf;
+        result = result + .5*buf;
     }
+
+    s = result;
 }

From f65a585236f420d7ed966b8f9e0b7cbfb0857d8c Mon Sep 17 00:00:00 2001
From: James Harrison <jch1g10@soton.ac.uk>
Date: Thu, 26 Jan 2017 15:02:30 +0000
Subject: [PATCH 031/177] ChargedProp: Switch to HDF5 output

---
 extras/Hadrons/Modules/MScalar/ChargedProp.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/extras/Hadrons/Modules/MScalar/ChargedProp.cc b/extras/Hadrons/Modules/MScalar/ChargedProp.cc
index d88fdc45..f2890b2a 100644
--- a/extras/Hadrons/Modules/MScalar/ChargedProp.cc
+++ b/extras/Hadrons/Modules/MScalar/ChargedProp.cc
@@ -151,7 +151,7 @@ void TChargedProp::execute(void)
         LOG(Message) << "Saving zero-momentum projection to '"
                      << filename << "'..." << std::endl;
         
-        TextWriter            writer(filename);
+        Hdf5Writer            writer(filename);
         std::vector<TComplex> vecBuf;
         std::vector<Complex>  result;
         
@@ -161,6 +161,7 @@ void TChargedProp::execute(void)
         {
             result[t] = TensorRemove(vecBuf[t]);
         }
+        write(writer, "charge", q);
         write(writer, "prop", result);
     }
 }

From ee93f0218bebd84d98211d0ed87cff48951d76d7 Mon Sep 17 00:00:00 2001
From: James Harrison <jch1g10@soton.ac.uk>
Date: Fri, 27 Jan 2017 12:22:48 +0000
Subject: [PATCH 032/177] ChargedProp: remove ScalarField fs

---
 extras/Hadrons/Modules/MScalar/ChargedProp.cc | 38 ++++++++++---------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/extras/Hadrons/Modules/MScalar/ChargedProp.cc b/extras/Hadrons/Modules/MScalar/ChargedProp.cc
index f2890b2a..40d4504c 100644
--- a/extras/Hadrons/Modules/MScalar/ChargedProp.cc
+++ b/extras/Hadrons/Modules/MScalar/ChargedProp.cc
@@ -169,19 +169,12 @@ void TChargedProp::execute(void)
 void TChargedProp::momD1(ScalarField &s, FFT &fft)
 {
     EmField     &A = *env().getObject<EmField>(par().emField);
-    ScalarField buf(env().getGrid()), fs(env().getGrid()), result(env().getGrid()), Amu(env().getGrid());
+    ScalarField buf(env().getGrid()), result(env().getGrid()),
+                Amu(env().getGrid());
     Complex     ci(0.0,1.0);
 
     result = zero;
 
-    fft.FFT_all_dim(fs, s, FFT::backward);
-    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
-    {
-        Amu = peekLorentz(A, mu);
-        buf = Amu*fs;
-        fft.FFT_all_dim(buf, buf, FFT::forward);
-        result = result + ci*adj(*phase_[mu])*buf;
-    }
     for (unsigned int mu = 0; mu < env().getNd(); ++mu)
     {
         Amu = peekLorentz(A, mu);
@@ -191,6 +184,14 @@ void TChargedProp::momD1(ScalarField &s, FFT &fft)
         fft.FFT_all_dim(buf, buf, FFT::forward);
         result = result - ci*buf;
     }
+    fft.FFT_all_dim(s, s, FFT::backward);
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    {
+        Amu = peekLorentz(A, mu);
+        buf = Amu*s;
+        fft.FFT_all_dim(buf, buf, FFT::forward);
+        result = result + ci*adj(*phase_[mu])*buf;
+    }
 
     s = result;
 }
@@ -198,18 +199,11 @@ void TChargedProp::momD1(ScalarField &s, FFT &fft)
 void TChargedProp::momD2(ScalarField &s, FFT &fft)
 {
     EmField     &A = *env().getObject<EmField>(par().emField);
-    ScalarField buf(env().getGrid()), fs(env().getGrid()), result(env().getGrid()), Amu(env().getGrid());
+    ScalarField buf(env().getGrid()), result(env().getGrid()),
+                Amu(env().getGrid());
 
     result = zero;
     
-    fft.FFT_all_dim(fs, s, FFT::backward);
-    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
-    {
-        Amu = peekLorentz(A, mu);        
-        buf = Amu*Amu*fs;
-        fft.FFT_all_dim(buf, buf, FFT::forward);
-        result = result + .5*adj(*phase_[mu])*buf;
-    }
     for (unsigned int mu = 0; mu < env().getNd(); ++mu)
     {
         Amu = peekLorentz(A, mu);
@@ -219,6 +213,14 @@ void TChargedProp::momD2(ScalarField &s, FFT &fft)
         fft.FFT_all_dim(buf, buf, FFT::forward);
         result = result + .5*buf;
     }
+    fft.FFT_all_dim(s, s, FFT::backward);
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    {
+        Amu = peekLorentz(A, mu);        
+        buf = Amu*Amu*s;
+        fft.FFT_all_dim(buf, buf, FFT::forward);
+        result = result + .5*adj(*phase_[mu])*buf;
+    }
 
     s = result;
 }

From b39f0d1fb675f453e710ed953583bb68bfe2b18f Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Fri, 27 Jan 2017 18:12:35 -0800
Subject: [PATCH 033/177] Hadrons: default I/O to HDF5 if possible, XML
 otherwise

---
 extras/Hadrons/Global.hpp                     | 9 +++++++++
 extras/Hadrons/Modules/MScalar/ChargedProp.cc | 2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/extras/Hadrons/Global.hpp b/extras/Hadrons/Global.hpp
index bcb282fc..8dbb08ca 100644
--- a/extras/Hadrons/Global.hpp
+++ b/extras/Hadrons/Global.hpp
@@ -160,6 +160,15 @@ std::string typeName(void)
     return typeName(typeIdPt<T>());
 }
 
+// default writers/readers
+#ifdef HAVE_HDF5
+typedef Hdf5Reader CorrReader;
+typedef Hdf5Writer CorrWriter;
+#else
+typedef XmlReader CorrReader;
+typedef XmlWriter CorrWriter;
+#endif
+
 END_HADRONS_NAMESPACE
 
 #endif // Hadrons_Global_hpp_
diff --git a/extras/Hadrons/Modules/MScalar/ChargedProp.cc b/extras/Hadrons/Modules/MScalar/ChargedProp.cc
index 40d4504c..dc6481f3 100644
--- a/extras/Hadrons/Modules/MScalar/ChargedProp.cc
+++ b/extras/Hadrons/Modules/MScalar/ChargedProp.cc
@@ -151,7 +151,7 @@ void TChargedProp::execute(void)
         LOG(Message) << "Saving zero-momentum projection to '"
                      << filename << "'..." << std::endl;
         
-        Hdf5Writer            writer(filename);
+        CorrWriter            writer(filename);
         std::vector<TComplex> vecBuf;
         std::vector<Complex>  result;
         

From 831ca4e3bf8e0b4231f395b2af9308e007b73186 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Tue, 14 Mar 2017 14:55:18 +0900
Subject: [PATCH 034/177] Added Scalar action for fields in the adjoint
 representation

---
 lib/qcd/action/Actions.h                      |   5 +
 lib/qcd/action/scalar/ScalarAction.h          |  61 ++++++-----
 lib/qcd/action/scalar/ScalarImpl.h            |  93 ++++++++--------
 .../action/scalar/ScalarInteractionAction.h   |  84 +++++++--------
 lib/qcd/hmc/GenericHMCrunner.h                |   3 +
 lib/qcd/representations/hmc_types.h           |   2 +-
 tests/hmc/Test_hmc_ScalarActionNxN.cc         | 100 ++++++++++++++++++
 7 files changed, 227 insertions(+), 121 deletions(-)
 create mode 100644 tests/hmc/Test_hmc_ScalarActionNxN.cc

diff --git a/lib/qcd/action/Actions.h b/lib/qcd/action/Actions.h
index daf64f3d..0214b8f4 100644
--- a/lib/qcd/action/Actions.h
+++ b/lib/qcd/action/Actions.h
@@ -69,6 +69,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 ////////////////////////////////////////////
 #include <Grid/qcd/action/scalar/ScalarImpl.h>
 #include <Grid/qcd/action/scalar/ScalarAction.h>
+#include <Grid/qcd/action/scalar/ScalarInteractionAction.h>
 
 namespace Grid {
 namespace QCD {
@@ -106,6 +107,10 @@ typedef ScalarAction<ScalarImplR>                 ScalarActionR;
 typedef ScalarAction<ScalarImplF>                 ScalarActionF;
 typedef ScalarAction<ScalarImplD>                 ScalarActionD;
 
+typedef ScalarInteractionAction<ScalarAdjImplR>                 ScalarAdjActionR;
+typedef ScalarInteractionAction<ScalarAdjImplF>                 ScalarAdjActionF;
+typedef ScalarInteractionAction<ScalarAdjImplD>                 ScalarAdjActionD;
+
 }}
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lib/qcd/action/scalar/ScalarAction.h b/lib/qcd/action/scalar/ScalarAction.h
index f10ec9a6..2c82d2e3 100644
--- a/lib/qcd/action/scalar/ScalarAction.h
+++ b/lib/qcd/action/scalar/ScalarAction.h
@@ -6,10 +6,10 @@
 
   Copyright (C) 2015
 
-Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: neo <cossu@post.kek.jp>
-Author: paboyle <paboyle@ph.ed.ac.uk>
+  Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+  Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+  Author: neo <cossu@post.kek.jp>
+  Author: paboyle <paboyle@ph.ed.ac.uk>
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -35,50 +35,49 @@ directory
 
 namespace Grid {
   // FIXME drop the QCD namespace everywhere here
-  
-  template <class Impl>
-  class ScalarAction : public QCD::Action<typename Impl::Field> {
-  public:
+
+template <class Impl>
+class ScalarAction : public QCD::Action<typename Impl::Field> {
+ public:
     INHERIT_FIELD_TYPES(Impl);
-    
-  private:
+
+ private:
     RealD mass_square;
     RealD lambda;
-    
-  public:
-    ScalarAction(RealD ms, RealD l) : mass_square(ms), lambda(l){};
 
-    virtual std::string LogParameters(){
+ public:
+    ScalarAction(RealD ms, RealD l) : mass_square(ms), lambda(l) {}
+
+    virtual std::string LogParameters() {
       std::stringstream sstream;
       sstream << GridLogMessage << "[ScalarAction] lambda      : " << lambda      << std::endl;
       sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
       return sstream.str();
-      
     }
-    
-    virtual std::string action_name(){return "ScalarAction";}
-    
-    virtual void refresh(const Field &U,
-			 GridParallelRNG &pRNG){};  // noop as no pseudoferms
-    
+    virtual std::string action_name() {return "ScalarAction";}
+
+    virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}  // noop as no pseudoferms
+
     virtual RealD S(const Field &p) {
       return (mass_square * 0.5 + QCD::Nd) * ScalarObs<Impl>::sumphisquared(p) +
-	(lambda / 24.) * ScalarObs<Impl>::sumphifourth(p) +
-	ScalarObs<Impl>::sumphider(p);
+    (lambda / 24.) * ScalarObs<Impl>::sumphifourth(p) +
+    ScalarObs<Impl>::sumphider(p);
     };
-    
+
     virtual void deriv(const Field &p,
-		       Field &force) {
+                       Field &force) {
       Field tmp(p._grid);
       Field p2(p._grid);
       ScalarObs<Impl>::phisquared(p2, p);
       tmp = -(Cshift(p, 0, -1) + Cshift(p, 0, 1));
       for (int mu = 1; mu < QCD::Nd; mu++) tmp -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
-      
-      force=+(mass_square + 2. * QCD::Nd) * p + (lambda / 6.) * p2 * p + tmp;
-    };
-  };
-  
-} // Grid
+
+      force =+(mass_square + 2. * QCD::Nd) * p + (lambda / 6.) * p2 * p + tmp;
+    }
+};
+
+
+
+}  // namespace Grid
 
 #endif // SCALAR_ACTION_H
diff --git a/lib/qcd/action/scalar/ScalarImpl.h b/lib/qcd/action/scalar/ScalarImpl.h
index ee2d2fb8..6d14b61a 100644
--- a/lib/qcd/action/scalar/ScalarImpl.h
+++ b/lib/qcd/action/scalar/ScalarImpl.h
@@ -5,96 +5,99 @@
 namespace Grid {
   //namespace QCD {
 
-  template <class S>
-  class ScalarImplTypes {
-  public:
+template <class S>
+class ScalarImplTypes {
+ public:
     typedef S Simd;
-    
+
     template <typename vtype>
     using iImplField = iScalar<iScalar<iScalar<vtype> > >;
-    
+
     typedef iImplField<Simd> SiteField;
-    
-    
+
     typedef Lattice<SiteField> Field;
-    
-    static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
+
+    static inline void generate_momenta(Field& P, GridParallelRNG& pRNG) {
       gaussian(pRNG, P);
     }
-    
+
     static inline Field projectForce(Field& P){return P;}
-    
-    static inline void update_field(Field& P, Field& U, double ep){
+
+    static inline void update_field(Field& P, Field& U, double ep) {
       U += P*ep;
     }
-    
-    static inline RealD FieldSquareNorm(Field& U){
+
+    static inline RealD FieldSquareNorm(Field& U) {
       return (- sum(trace(U*U))/2.0);
     }
-    
+
     static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
       gaussian(pRNG, U);
     }
-    
+
     static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
       gaussian(pRNG, U);
     }
-    
+
     static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
       U = 1.0;
     }
-    
+
   };
 
   template <class S, unsigned int N>
-  class ScalarMatrixImplTypes {
+  class ScalarAdjMatrixImplTypes {
   public:
     typedef S Simd;
-    
     template <typename vtype>
     using iImplField = iScalar<iScalar<iMatrix<vtype, N> > >;
-    
+
     typedef iImplField<Simd> SiteField;
-    
-    
+
     typedef Lattice<SiteField> Field;
-    
-    static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
-      gaussian(pRNG, P);
+
+    static inline void generate_momenta(Field& P, GridParallelRNG& pRNG) {
+      QCD::SU<N>::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
     }
-    
-    static inline Field projectForce(Field& P){return P;}
-    
-    static inline void update_field(Field& P, Field& U, double ep){
+
+    static inline Field projectForce(Field& P) {return P;}
+
+    static inline void update_field(Field& P, Field& U, double ep) {
       U += P*ep;
     }
-    
-    static inline RealD FieldSquareNorm(Field& U){
-      return (TensorRemove(- sum(trace(U*U))*0.5).real());
+
+    static inline RealD FieldSquareNorm(Field& U) {
+      return (TensorRemove(sum(trace(U*U))).real());
     }
-    
+
     static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
-      gaussian(pRNG, U);
+      QCD::SU<N>::LieRandomize(pRNG, U);
     }
-    
+
     static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
-      gaussian(pRNG, U);
+      QCD::SU<N>::LieRandomize(pRNG, U, 0.01);
     }
-    
+
     static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
-      U = 1.0;
+      U = zero;
     }
-    
+
   };
 
 
-  
-  
+
+
   typedef ScalarImplTypes<vReal> ScalarImplR;
   typedef ScalarImplTypes<vRealF> ScalarImplF;
   typedef ScalarImplTypes<vRealD> ScalarImplD;
-  
-  //} 
-} 
+
+  // Hardcoding here the size of the matrices
+  typedef ScalarAdjMatrixImplTypes<vComplex,  QCD::Nc> ScalarAdjImplR;
+  typedef ScalarAdjMatrixImplTypes<vComplexF, QCD::Nc> ScalarAdjImplF;
+  typedef ScalarAdjMatrixImplTypes<vComplexD, QCD::Nc> ScalarAdjImplD;
+
+
+  //}
+}
 
 #endif
diff --git a/lib/qcd/action/scalar/ScalarInteractionAction.h b/lib/qcd/action/scalar/ScalarInteractionAction.h
index bd54a010..2607b041 100644
--- a/lib/qcd/action/scalar/ScalarInteractionAction.h
+++ b/lib/qcd/action/scalar/ScalarInteractionAction.h
@@ -6,10 +6,7 @@
 
   Copyright (C) 2015
 
-Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: neo <cossu@post.kek.jp>
-Author: paboyle <paboyle@ph.ed.ac.uk>
+  Author: Guido Cossu <guido.cossu@ed.ac.uk>
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -30,55 +27,54 @@ directory
   *************************************************************************************/
 /*  END LEGAL */
 
-#ifndef SCALAR_ACTION_H
-#define SCALAR_ACTION_H
+#ifndef SCALAR_INT_ACTION_H
+#define SCALAR_INT_ACTION_H
 
 namespace Grid {
   // FIXME drop the QCD namespace everywhere here
-  
-  template <class Impl>
-  class ScalarInteractionAction : public QCD::Action<typename Impl::Field> {
-  public:
-    INHERIT_FIELD_TYPES(Impl);
-    
-  private:
+
+template <class Impl>
+class ScalarInteractionAction : public QCD::Action<typename Impl::Field> {
     RealD mass_square;
     RealD lambda;
-    
-  public:
-    ScalarAction(RealD ms, RealD l) : mass_square(ms), lambda(l){};
 
-    virtual std::string LogParameters(){
+ public:
+    INHERIT_FIELD_TYPES(Impl);
+    ScalarInteractionAction(RealD ms, RealD l) : mass_square(ms), lambda(l) {}
+
+    virtual std::string LogParameters() {
       std::stringstream sstream;
       sstream << GridLogMessage << "[ScalarAction] lambda      : " << lambda      << std::endl;
       sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
       return sstream.str();
-      
     }
-    
-    virtual std::string action_name(){return "ScalarAction";}
-    
-    virtual void refresh(const Field &U,
-			 GridParallelRNG &pRNG){};  // noop as no pseudoferms
-    
-    virtual RealD S(const Field &p) {
-      return (mass_square * 0.5 + QCD::Nd) * ScalarObs<Impl>::sumphisquared(p) +
-	(lambda / 24.) * ScalarObs<Impl>::sumphifourth(p) +
-	ScalarObs<Impl>::sumphider(p);
-    };
-    
-    virtual void deriv(const Field &p,
-		       Field &force) {
-      Field tmp(p._grid);
-      Field p2(p._grid);
-      ScalarObs<Impl>::phisquared(p2, p);
-      tmp = -(Cshift(p, 0, -1) + Cshift(p, 0, 1));
-      for (int mu = 1; mu < QCD::Nd; mu++) tmp -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
-      
-      force=+(mass_square + 2. * QCD::Nd) * p + (lambda / 6.) * p2 * p + tmp;
-    };
-  };
-  
-} // Grid
 
-#endif // SCALAR_ACTION_H
+    virtual std::string action_name() {return "ScalarAction";}
+
+    virtual void refresh(const Field &U,
+                         GridParallelRNG &pRNG) {}  // noop as no pseudoferms
+
+    virtual RealD S(const Field &p) {
+        Field action(p._grid);
+        Field pshift(p._grid);
+        Field phisquared(p._grid);
+        phisquared = p*p;
+        action = (2.0*QCD::Nd + mass_square)*phisquared + lambda*phisquared*phisquared;
+        for (int mu = 0; mu < QCD::Nd; mu++) {
+            pshift = Cshift(p, mu, +1);  // not efficient implement with stencils
+            action -= pshift*p + p*pshift;
+        }
+        return -(TensorRemove(sum(trace(action)))).real();
+    };
+
+    virtual void deriv(const Field &p,
+                       Field &force) {
+        force = (2.0*QCD::Nd + mass_square)*p + 2.0*lambda*p*p*p;
+        // following is inefficient
+        for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
+    }
+};
+
+}  // namespace Grid
+
+#endif  // SCALAR_INT_ACTION_H
diff --git a/lib/qcd/hmc/GenericHMCrunner.h b/lib/qcd/hmc/GenericHMCrunner.h
index 66b16435..a97fb4e4 100644
--- a/lib/qcd/hmc/GenericHMCrunner.h
+++ b/lib/qcd/hmc/GenericHMCrunner.h
@@ -202,6 +202,9 @@ using GenericHMCRunnerTemplate = HMCWrapperTemplate<Implementation, Integrator,
 typedef HMCWrapperTemplate<ScalarImplR, MinimumNorm2, ScalarFields>
     ScalarGenericHMCRunner;
 
+typedef HMCWrapperTemplate<ScalarAdjImplR, MinimumNorm2, ScalarMatrixFields>
+    ScalarAdjGenericHMCRunner;
+
 }  // namespace QCD
 }  // namespace Grid
 
diff --git a/lib/qcd/representations/hmc_types.h b/lib/qcd/representations/hmc_types.h
index 3701c9b2..b4991941 100644
--- a/lib/qcd/representations/hmc_types.h
+++ b/lib/qcd/representations/hmc_types.h
@@ -62,7 +62,7 @@ class Representations {
 
 typedef Representations<FundamentalRepresentation> NoHirep;
 typedef Representations<EmptyRep<typename ScalarImplR::Field> > ScalarFields;
-  //typedef Representations<EmptyRep<typename ScalarMatrixImplR::Field> > ScalarMatrixFields;
+typedef Representations<EmptyRep<typename ScalarAdjImplR::Field> > ScalarMatrixFields;
 
 // Helper classes to access the elements
 // Strips the first N parameters from the tuple
diff --git a/tests/hmc/Test_hmc_ScalarActionNxN.cc b/tests/hmc/Test_hmc_ScalarActionNxN.cc
new file mode 100644
index 00000000..8b93efde
--- /dev/null
+++ b/tests/hmc/Test_hmc_ScalarActionNxN.cc
@@ -0,0 +1,100 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./tests/hmc/Test_hmc_ScalarActionNxN.cc
+
+Copyright (C) 2016
+
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+namespace Grid{
+class ScalarActionParameters : Serializable {
+ public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(ScalarActionParameters,
+    double, mass_squared,
+    double, lambda);
+};
+
+}
+int main(int argc, char **argv) {
+  using namespace Grid;
+  using namespace Grid::QCD;
+
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+  // here make a routine to print all the relevant information on the run
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+
+   // Typedefs to simplify notation
+  typedef ScalarAdjGenericHMCRunner HMCWrapper;  // Uses the default minimum norm, real scalar fields
+
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  HMCWrapper TheHMC;
+
+  // Grid from the command line
+  GridModule ScalarGrid;
+  ScalarGrid.set_full( SpaceTimeGrid::makeFourDimGrid(
+        GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
+        GridDefaultMpi()));
+  ScalarGrid.set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(ScalarGrid.get_full()));
+  TheHMC.Resources.AddGrid("scalar", ScalarGrid);
+  // Possibile to create the module by hand
+  // hardcoding parameters or using a Reader
+
+  // Checkpointer definition
+  CheckpointerParameters CPparams;
+  CPparams.config_prefix = "ckpoint_scalar_lat";
+  CPparams.rng_prefix = "ckpoint_scalar_rng";
+  CPparams.saveInterval = 50;
+  CPparams.format = "IEEE64BIG";
+
+  TheHMC.Resources.LoadBinaryCheckpointer(CPparams);
+
+  RNGModuleParameters RNGpar;
+  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+  /////////////////////////////////////////////////////////////
+  // Collect actions, here use more encapsulation
+
+  // Scalar action in adjoint representation
+  ScalarActionParameters SPar;
+  SPar.mass_squared = 0.5;
+  SPar.lambda       = 0.1;
+  ScalarAdjActionR Saction(SPar.mass_squared, SPar.lambda);
+
+  // Collect actions
+  ActionLevel<ScalarAdjActionR::Field, ScalarMatrixFields> Level1(1);
+  Level1.push_back(&Saction);
+  TheHMC.TheAction.push_back(Level1);
+  /////////////////////////////////////////////////////////////
+
+  // HMC parameters are serialisable
+  TheHMC.Parameters.MD.MDsteps = 10;
+  TheHMC.Parameters.MD.trajL   = 1.0;
+
+  TheHMC.ReadCommandLine(argc, argv);
+  TheHMC.Run();
+
+  Grid_finalize();
+
+} // main

From 38806343a873ea10264c79103db31182d6770947 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Wed, 15 Mar 2017 15:16:16 +0900
Subject: [PATCH 035/177] Improving efficiency of the force term

---
 .../action/scalar/ScalarInteractionAction.h   | 91 ++++++++++++++++---
 tests/Test_stencil.cc                         | 43 +++++----
 tests/hmc/Test_hmc_ScalarActionNxN.cc         | 11 +--
 3 files changed, 104 insertions(+), 41 deletions(-)

diff --git a/lib/qcd/action/scalar/ScalarInteractionAction.h b/lib/qcd/action/scalar/ScalarInteractionAction.h
index 2607b041..5a322a5e 100644
--- a/lib/qcd/action/scalar/ScalarInteractionAction.h
+++ b/lib/qcd/action/scalar/ScalarInteractionAction.h
@@ -30,17 +30,34 @@ directory
 #ifndef SCALAR_INT_ACTION_H
 #define SCALAR_INT_ACTION_H
 
+
+// Note: this action can completely absorb the ScalarAction for real float fields
+// use the scalarObjs to generalise the structure
+
 namespace Grid {
   // FIXME drop the QCD namespace everywhere here
 
 template <class Impl>
 class ScalarInteractionAction : public QCD::Action<typename Impl::Field> {
+public:
+    INHERIT_FIELD_TYPES(Impl);
+private:
     RealD mass_square;
     RealD lambda;
 
+
+    typedef typename Field::vector_object vobj;
+    typedef CartesianStencil<vobj,vobj> Stencil;
+
+    SimpleCompressor<vobj> compressor;
+    int npoint = 8;
+    std::vector<int> directions    = {0,1,2,3,0,1,2,3};  // forcing 4 dimensions
+    std::vector<int> displacements = {1,1,1,1, -1,-1,-1,-1};
+
+
  public:
-    INHERIT_FIELD_TYPES(Impl);
-    ScalarInteractionAction(RealD ms, RealD l) : mass_square(ms), lambda(l) {}
+
+    ScalarInteractionAction(RealD ms, RealD l) : mass_square(ms), lambda(l){}
 
     virtual std::string LogParameters() {
       std::stringstream sstream;
@@ -51,27 +68,75 @@ class ScalarInteractionAction : public QCD::Action<typename Impl::Field> {
 
     virtual std::string action_name() {return "ScalarAction";}
 
-    virtual void refresh(const Field &U,
-                         GridParallelRNG &pRNG) {}  // noop as no pseudoferms
+    virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
 
     virtual RealD S(const Field &p) {
-        Field action(p._grid);
-        Field pshift(p._grid);
-        Field phisquared(p._grid);
+        static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
+        phiStencil.HaloExchange(p, compressor);
+
+        Field action(p._grid), pshift(p._grid), phisquared(p._grid);
         phisquared = p*p;
         action = (2.0*QCD::Nd + mass_square)*phisquared + lambda*phisquared*phisquared;
         for (int mu = 0; mu < QCD::Nd; mu++) {
-            pshift = Cshift(p, mu, +1);  // not efficient implement with stencils
-            action -= pshift*p + p*pshift;
+            //  pshift = Cshift(p, mu, +1);  // not efficient, implement with stencils
+            PARALLEL_FOR_LOOP
+            for (int i = 0; i < p._grid->oSites(); i++) {
+                int permute_type;
+                StencilEntry *SE;
+                vobj temp2;
+                vobj *temp;
+                vobj *t_p;
+
+                SE = phiStencil.GetEntry(permute_type, mu, i);
+                t_p  = &p._odata[i];
+                if ( SE->_is_local ) {
+                    temp = &p._odata[SE->_offset];
+                    if ( SE->_permute ) {
+                        permute(temp2, *temp, permute_type);
+                        action._odata[i] -= temp2*(*t_p) + (*t_p)*temp2;
+                    } else {
+                  action._odata[i] -= *temp*(*t_p) + (*t_p)*(*temp);
+                    }
+                } else {
+                  action._odata[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset];
+                }
+            }
+            //  action -= pshift*p + p*pshift;
         }
+        // NB the trace in the algebra is normalised to 1/2
+        // minus sign coming from the antihermitian fields
         return -(TensorRemove(sum(trace(action)))).real();
     };
 
-    virtual void deriv(const Field &p,
-                       Field &force) {
+    virtual void deriv(const Field &p, Field &force) {
         force = (2.0*QCD::Nd + mass_square)*p + 2.0*lambda*p*p*p;
-        // following is inefficient
-        for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
+        // move this outside
+        static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
+        phiStencil.HaloExchange(p, compressor);
+
+        //for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
+        for (int point = 0; point < npoint; point++) {
+            PARALLEL_FOR_LOOP
+            for (int i = 0; i < p._grid->oSites(); i++) {
+                vobj *temp;
+                vobj temp2;
+                int permute_type;
+                StencilEntry *SE;
+                SE = phiStencil.GetEntry(permute_type, point, i);
+
+                if ( SE->_is_local ) {
+                    temp = &p._odata[SE->_offset];
+                    if ( SE->_permute ) {
+                        permute(temp2, *temp, permute_type);
+                        force._odata[i] -= temp2;
+                    } else {
+                        force._odata[i] -= *temp;
+                    }
+                } else {
+                    force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
+                }
+            }
+        }
     }
 };
 
diff --git a/tests/Test_stencil.cc b/tests/Test_stencil.cc
index 1b71b8a5..1d35e1bb 100644
--- a/tests/Test_stencil.cc
+++ b/tests/Test_stencil.cc
@@ -1,6 +1,6 @@
     /*************************************************************************************
 
-    Grid physics library, www.github.com/paboyle/Grid 
+    Grid physics library, www.github.com/paboyle/Grid
 
     Source file: ./tests/Test_stencil.cc
 
@@ -33,9 +33,8 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
-int main (int argc, char ** argv)
-{
-  Grid_init(&argc,&argv);
+int main(int argc, char ** argv) {
+  Grid_init(&argc, &argv);
 
   //  typedef LatticeColourMatrix Field;
   typedef LatticeComplex Field;
@@ -47,7 +46,7 @@ int main (int argc, char ** argv)
   std::vector<int> mpi_layout  = GridDefaultMpi();
 
   double volume = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
-    
+
   GridCartesian Fine(latt_size,simd_layout,mpi_layout);
   GridRedBlackCartesian rbFine(latt_size,simd_layout,mpi_layout);
   GridParallelRNG       fRNG(&Fine);
@@ -55,14 +54,14 @@ int main (int argc, char ** argv)
   //  fRNG.SeedRandomDevice();
   std::vector<int> seeds({1,2,3,4});
   fRNG.SeedFixedIntegers(seeds);
-  
+
   Field Foo(&Fine);
   Field Bar(&Fine);
   Field Check(&Fine);
   Field Diff(&Fine);
   LatticeComplex lex(&Fine);
 
-  lex = zero;  
+  lex = zero;
   random(fRNG,Foo);
   gaussian(fRNG,Bar);
 
@@ -98,7 +97,7 @@ int main (int argc, char ** argv)
 	  Fine.oCoorFromOindex(ocoor,o);
 	  ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
 	}
-	
+
 	SimpleCompressor<vobj> compress;
 	myStencil.HaloExchange(Foo,compress);
 
@@ -106,16 +105,16 @@ int main (int argc, char ** argv)
 
 	// Implement a stencil code that should agree with cshift!
 	for(int i=0;i<Check._grid->oSites();i++){
-	  
+
 	  int permute_type;
 	  StencilEntry *SE;
 	  SE = myStencil.GetEntry(permute_type,0,i);
-	  
+
 	  if ( SE->_is_local && SE->_permute )
 	    permute(Check._odata[i],Foo._odata[SE->_offset],permute_type);
 	  else if (SE->_is_local)
 	    Check._odata[i] = Foo._odata[SE->_offset];
-	  else 
+	  else
 	    Check._odata[i] = myStencil.CommBuf()[SE->_offset];
 	}
 
@@ -144,7 +143,7 @@ int main (int argc, char ** argv)
 		      <<") " <<check<<" vs "<<bar<<std::endl;
 	  }
 
-	 
+
 	}}}}
 
 
@@ -179,18 +178,18 @@ int main (int argc, char ** argv)
 	  Fine.oCoorFromOindex(ocoor,o);
 	  ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
 	}
-	
+
 	SimpleCompressor<vobj> compress;
 
 	EStencil.HaloExchange(EFoo,compress);
 	OStencil.HaloExchange(OFoo,compress);
-	
+
 	Bar = Cshift(Foo,dir,disp);
 
 	if ( disp & 0x1 ) {
 	  ECheck.checkerboard = Even;
 	  OCheck.checkerboard = Odd;
-	} else { 
+	} else {
 	  ECheck.checkerboard = Odd;
 	  OCheck.checkerboard = Even;
 	}
@@ -206,7 +205,7 @@ int main (int argc, char ** argv)
 	    permute(OCheck._odata[i],EFoo._odata[SE->_offset],permute_type);
 	  else if (SE->_is_local)
 	    OCheck._odata[i] = EFoo._odata[SE->_offset];
-	  else 
+	  else
 	    OCheck._odata[i] = EStencil.CommBuf()[SE->_offset];
 	}
 	for(int i=0;i<ECheck._grid->oSites();i++){
@@ -214,18 +213,18 @@ int main (int argc, char ** argv)
 	  StencilEntry *SE;
 	  SE = OStencil.GetEntry(permute_type,0,i);
 	  //	  std::cout << "ODD source "<< i<<" -> " <<SE->_offset << " "<< SE->_is_local<<std::endl;
-	  
+
 	  if ( SE->_is_local && SE->_permute )
 	    permute(ECheck._odata[i],OFoo._odata[SE->_offset],permute_type);
 	  else if (SE->_is_local)
 	    ECheck._odata[i] = OFoo._odata[SE->_offset];
-	  else 
+	  else
 	    ECheck._odata[i] = OStencil.CommBuf()[SE->_offset];
 	}
-	
+
 	setCheckerboard(Check,ECheck);
 	setCheckerboard(Check,OCheck);
-	
+
 	Real nrmC = norm2(Check);
 	Real nrmB = norm2(Bar);
 	Diff = Check-Bar;
@@ -248,10 +247,10 @@ int main (int argc, char ** argv)
 	  diff =norm2(ddiff);
 	  if ( diff > 0){
 	    std::cout <<"Coor (" << coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3] <<") "
-		      <<"shift "<<disp<<" dir "<< dir 
+		      <<"shift "<<disp<<" dir "<< dir
 		      << "  stencil impl " <<check<<" vs cshift impl "<<bar<<std::endl;
 	  }
-	 
+
 	}}}}
 
 
diff --git a/tests/hmc/Test_hmc_ScalarActionNxN.cc b/tests/hmc/Test_hmc_ScalarActionNxN.cc
index 8b93efde..f63936b5 100644
--- a/tests/hmc/Test_hmc_ScalarActionNxN.cc
+++ b/tests/hmc/Test_hmc_ScalarActionNxN.cc
@@ -26,7 +26,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
-namespace Grid{
+namespace Grid {
 class ScalarActionParameters : Serializable {
  public:
   GRID_SERIALIZABLE_CLASS_MEMBERS(ScalarActionParameters,
@@ -44,7 +44,7 @@ int main(int argc, char **argv) {
   // here make a routine to print all the relevant information on the run
   std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
 
-   // Typedefs to simplify notation
+  // Typedefs to simplify notation
   typedef ScalarAdjGenericHMCRunner HMCWrapper;  // Uses the default minimum norm, real scalar fields
 
   //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
@@ -52,7 +52,7 @@ int main(int argc, char **argv) {
 
   // Grid from the command line
   GridModule ScalarGrid;
-  ScalarGrid.set_full( SpaceTimeGrid::makeFourDimGrid(
+  ScalarGrid.set_full(SpaceTimeGrid::makeFourDimGrid(
         GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
         GridDefaultMpi()));
   ScalarGrid.set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(ScalarGrid.get_full()));
@@ -89,12 +89,11 @@ int main(int argc, char **argv) {
   /////////////////////////////////////////////////////////////
 
   // HMC parameters are serialisable
-  TheHMC.Parameters.MD.MDsteps = 10;
+  TheHMC.Parameters.MD.MDsteps = 20;
   TheHMC.Parameters.MD.trajL   = 1.0;
 
   TheHMC.ReadCommandLine(argc, argv);
   TheHMC.Run();
 
   Grid_finalize();
-
-} // main
+}  // main

From 038b6ee9cdfc5902b27a8645b1f1758c9db3656f Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Thu, 16 Mar 2017 01:09:24 +0900
Subject: [PATCH 036/177] Fixing JSON compilation error

---
 lib/json/json.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/json/json.hpp b/lib/json/json.hpp
index 97214f0b..bfb38c3e 100644
--- a/lib/json/json.hpp
+++ b/lib/json/json.hpp
@@ -64,7 +64,7 @@ SOFTWARE.
     #endif
 #elif defined(__GNUC__)
     #define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
-    #if GCC_VERSION < 40900
+    #if GCC_VERSION < 40800
         #error "unsupported GCC version - see https://github.com/nlohmann/json#supported-compilers"
     #endif
 #endif

From 7b03d8d0879d7f7922b8867eefa9346cb0e5c425 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Wed, 5 Apr 2017 16:17:46 +0100
Subject: [PATCH 037/177] Fixing the remaining merge conflicts

---
 lib/qcd/action/scalar/Scalar.h | 5 +++++
 tests/Test_stencil.cc          | 7 -------
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/lib/qcd/action/scalar/Scalar.h b/lib/qcd/action/scalar/Scalar.h
index e5bea275..cae38360 100644
--- a/lib/qcd/action/scalar/Scalar.h
+++ b/lib/qcd/action/scalar/Scalar.h
@@ -31,6 +31,7 @@ directory
 
 #include <Grid/qcd/action/scalar/ScalarImpl.h>
 #include <Grid/qcd/action/scalar/ScalarAction.h>
+#include <Grid/qcd/action/scalar/ScalarInteractionAction.h>
 
 namespace Grid {
 namespace QCD {
@@ -39,6 +40,10 @@ namespace QCD {
   typedef ScalarAction<ScalarImplF>                 ScalarActionF;
   typedef ScalarAction<ScalarImplD>                 ScalarActionD;
 
+  typedef ScalarInteractionAction<ScalarAdjImplR>   ScalarAdjActionR;
+  typedef ScalarInteractionAction<ScalarAdjImplF>   ScalarAdjActionF;
+  typedef ScalarInteractionAction<ScalarAdjImplD>   ScalarAdjActionD;
+  
 }
 }
 
diff --git a/tests/Test_stencil.cc b/tests/Test_stencil.cc
index 2a4744f3..fa4b0b57 100644
--- a/tests/Test_stencil.cc
+++ b/tests/Test_stencil.cc
@@ -189,13 +189,6 @@ int main(int argc, char ** argv) {
 
 	SimpleCompressor<vobj> compress;
 
-<<<<<<< HEAD
-	EStencil.HaloExchange(EFoo,compress);
-	OStencil.HaloExchange(OFoo,compress);
-
-=======
-	
->>>>>>> feature/hmc_generalise
 	Bar = Cshift(Foo,dir,disp);
 
 	if ( disp & 0x1 ) {

From 140741875555fb9c788e78bb8b6080e480776c0f Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Thu, 13 Apr 2017 15:32:30 +0100
Subject: [PATCH 038/177] Old qed-fvol program build disabled

---
 extras/Makefile.am | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extras/Makefile.am b/extras/Makefile.am
index 416a9fc8..d8c2b675 100644
--- a/extras/Makefile.am
+++ b/extras/Makefile.am
@@ -1 +1 @@
-SUBDIRS = Hadrons qed-fvol
\ No newline at end of file
+SUBDIRS = Hadrons
\ No newline at end of file

From 741bc836f69d37623cba76cf4aee06dee3f6c84e Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Fri, 5 May 2017 17:36:43 +0100
Subject: [PATCH 039/177] Exposing support for Ncolours and Ndimensions and
 JSON input file for the ScalarAction

---
 lib/qcd/action/scalar/Scalar.h                |   6 +-
 lib/qcd/action/scalar/ScalarImpl.h            |   5 +-
 .../action/scalar/ScalarInteractionAction.h   | 152 +++++++++---------
 lib/qcd/hmc/GenericHMCrunner.h                |   3 +
 lib/qcd/representations/hmc_types.h           |   3 +
 lib/stencil/Stencil.h                         |   2 +-
 tests/hmc/Test_hmc_ScalarActionNxN.cc         | 104 ++++++++----
 7 files changed, 168 insertions(+), 107 deletions(-)

diff --git a/lib/qcd/action/scalar/Scalar.h b/lib/qcd/action/scalar/Scalar.h
index cae38360..485a6765 100644
--- a/lib/qcd/action/scalar/Scalar.h
+++ b/lib/qcd/action/scalar/Scalar.h
@@ -40,9 +40,9 @@ namespace QCD {
   typedef ScalarAction<ScalarImplF>                 ScalarActionF;
   typedef ScalarAction<ScalarImplD>                 ScalarActionD;
 
-  typedef ScalarInteractionAction<ScalarAdjImplR>   ScalarAdjActionR;
-  typedef ScalarInteractionAction<ScalarAdjImplF>   ScalarAdjActionF;
-  typedef ScalarInteractionAction<ScalarAdjImplD>   ScalarAdjActionD;
+  template <int Colours, int Dimensions> using ScalarAdjActionR = ScalarInteractionAction<ScalarNxNAdjImplR<Colours>, Dimensions>;
+  template <int Colours, int Dimensions> using ScalarAdjActionF = ScalarInteractionAction<ScalarNxNAdjImplF<Colours>, Dimensions>;
+  template <int Colours, int Dimensions> using ScalarAdjActionD = ScalarInteractionAction<ScalarNxNAdjImplD<Colours>, Dimensions>;
   
 }
 }
diff --git a/lib/qcd/action/scalar/ScalarImpl.h b/lib/qcd/action/scalar/ScalarImpl.h
index 6d14b61a..8b5e3aa2 100644
--- a/lib/qcd/action/scalar/ScalarImpl.h
+++ b/lib/qcd/action/scalar/ScalarImpl.h
@@ -96,7 +96,10 @@ class ScalarImplTypes {
   typedef ScalarAdjMatrixImplTypes<vComplexF, QCD::Nc> ScalarAdjImplF;
   typedef ScalarAdjMatrixImplTypes<vComplexD, QCD::Nc> ScalarAdjImplD;
 
-
+  template <int Colours > using ScalarNxNAdjImplR = ScalarAdjMatrixImplTypes<vComplex,   Colours >;
+  template <int Colours > using ScalarNxNAdjImplF = ScalarAdjMatrixImplTypes<vComplexF,  Colours >;
+  template <int Colours > using ScalarNxNAdjImplD = ScalarAdjMatrixImplTypes<vComplexD,  Colours >;
+  
   //}
 }
 
diff --git a/lib/qcd/action/scalar/ScalarInteractionAction.h b/lib/qcd/action/scalar/ScalarInteractionAction.h
index 5a322a5e..ca8207bd 100644
--- a/lib/qcd/action/scalar/ScalarInteractionAction.h
+++ b/lib/qcd/action/scalar/ScalarInteractionAction.h
@@ -37,11 +37,11 @@ directory
 namespace Grid {
   // FIXME drop the QCD namespace everywhere here
 
-template <class Impl>
-class ScalarInteractionAction : public QCD::Action<typename Impl::Field> {
-public:
+  template <class Impl, int Ndim >
+  class ScalarInteractionAction : public QCD::Action<typename Impl::Field> {
+  public:
     INHERIT_FIELD_TYPES(Impl);
-private:
+  private:
     RealD mass_square;
     RealD lambda;
 
@@ -50,14 +50,19 @@ private:
     typedef CartesianStencil<vobj,vobj> Stencil;
 
     SimpleCompressor<vobj> compressor;
-    int npoint = 8;
-    std::vector<int> directions    = {0,1,2,3,0,1,2,3};  // forcing 4 dimensions
-    std::vector<int> displacements = {1,1,1,1, -1,-1,-1,-1};
+    int npoint = 2*Ndim;
+    std::vector<int> directions;//    = {0,1,2,3,0,1,2,3};  // forcing 4 dimensions
+    std::vector<int> displacements;//  = {1,1,1,1, -1,-1,-1,-1};
 
 
- public:
+  public:
 
-    ScalarInteractionAction(RealD ms, RealD l) : mass_square(ms), lambda(l){}
+    ScalarInteractionAction(RealD ms, RealD l) : mass_square(ms), lambda(l), displacements(2*Ndim,0), directions(2*Ndim,0){
+      for (int mu = 0 ; mu < Ndim; mu++){
+		directions[mu]         = mu; directions[mu+Ndim]    = mu;
+		displacements[mu]      =  1; displacements[mu+Ndim] = -1;
+      }
+    }
 
     virtual std::string LogParameters() {
       std::stringstream sstream;
@@ -71,75 +76,74 @@ private:
     virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
 
     virtual RealD S(const Field &p) {
-        static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
-        phiStencil.HaloExchange(p, compressor);
-
-        Field action(p._grid), pshift(p._grid), phisquared(p._grid);
-        phisquared = p*p;
-        action = (2.0*QCD::Nd + mass_square)*phisquared + lambda*phisquared*phisquared;
-        for (int mu = 0; mu < QCD::Nd; mu++) {
-            //  pshift = Cshift(p, mu, +1);  // not efficient, implement with stencils
-            PARALLEL_FOR_LOOP
-            for (int i = 0; i < p._grid->oSites(); i++) {
-                int permute_type;
-                StencilEntry *SE;
-                vobj temp2;
-                vobj *temp;
-                vobj *t_p;
-
-                SE = phiStencil.GetEntry(permute_type, mu, i);
-                t_p  = &p._odata[i];
-                if ( SE->_is_local ) {
-                    temp = &p._odata[SE->_offset];
-                    if ( SE->_permute ) {
-                        permute(temp2, *temp, permute_type);
-                        action._odata[i] -= temp2*(*t_p) + (*t_p)*temp2;
-                    } else {
-                  action._odata[i] -= *temp*(*t_p) + (*t_p)*(*temp);
-                    }
-                } else {
-                  action._odata[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset];
-                }
-            }
-            //  action -= pshift*p + p*pshift;
-        }
-        // NB the trace in the algebra is normalised to 1/2
-        // minus sign coming from the antihermitian fields
-        return -(TensorRemove(sum(trace(action)))).real();
+      assert(p._grid->Nd() == Ndim);
+      static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
+      phiStencil.HaloExchange(p, compressor);
+      Field action(p._grid), pshift(p._grid), phisquared(p._grid);
+      phisquared = p*p;
+      action = (2.0*Ndim + mass_square)*phisquared + lambda*phisquared*phisquared;
+      for (int mu = 0; mu < Ndim; mu++) {
+	//  pshift = Cshift(p, mu, +1);  // not efficient, implement with stencils
+	parallel_for (int i = 0; i < p._grid->oSites(); i++) {
+	  int permute_type;
+	  StencilEntry *SE;
+	  vobj temp2;
+	  vobj *temp;
+	  vobj *t_p;
+	    
+	  SE = phiStencil.GetEntry(permute_type, mu, i);
+	  t_p  = &p._odata[i];
+	  if ( SE->_is_local ) {
+	    temp = &p._odata[SE->_offset];
+	    if ( SE->_permute ) {
+	      permute(temp2, *temp, permute_type);
+	      action._odata[i] -= temp2*(*t_p) + (*t_p)*temp2;
+	    } else {
+	      action._odata[i] -= *temp*(*t_p) + (*t_p)*(*temp);
+	    }
+	  } else {
+	    action._odata[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset];
+	  }
+	}
+	//  action -= pshift*p + p*pshift;
+      }
+      // NB the trace in the algebra is normalised to 1/2
+      // minus sign coming from the antihermitian fields
+      return -(TensorRemove(sum(trace(action)))).real();
     };
 
     virtual void deriv(const Field &p, Field &force) {
-        force = (2.0*QCD::Nd + mass_square)*p + 2.0*lambda*p*p*p;
-        // move this outside
-        static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
-        phiStencil.HaloExchange(p, compressor);
-
-        //for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
-        for (int point = 0; point < npoint; point++) {
-            PARALLEL_FOR_LOOP
-            for (int i = 0; i < p._grid->oSites(); i++) {
-                vobj *temp;
-                vobj temp2;
-                int permute_type;
-                StencilEntry *SE;
-                SE = phiStencil.GetEntry(permute_type, point, i);
-
-                if ( SE->_is_local ) {
-                    temp = &p._odata[SE->_offset];
-                    if ( SE->_permute ) {
-                        permute(temp2, *temp, permute_type);
-                        force._odata[i] -= temp2;
-                    } else {
-                        force._odata[i] -= *temp;
-                    }
-                } else {
-                    force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
-                }
-            }
-        }
+      assert(p._grid->Nd() == Ndim);
+      force = (2.0*Ndim + mass_square)*p + 2.0*lambda*p*p*p;
+      // move this outside
+      static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
+      phiStencil.HaloExchange(p, compressor);
+      
+      //for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
+      for (int point = 0; point < npoint; point++) {
+	parallel_for (int i = 0; i < p._grid->oSites(); i++) {
+	  vobj *temp;
+	  vobj temp2;
+	  int permute_type;
+	  StencilEntry *SE;
+	  SE = phiStencil.GetEntry(permute_type, point, i);
+	  
+	  if ( SE->_is_local ) {
+	    temp = &p._odata[SE->_offset];
+	    if ( SE->_permute ) {
+	      permute(temp2, *temp, permute_type);
+	      force._odata[i] -= temp2;
+	    } else {
+	      force._odata[i] -= *temp;
+	    }
+	  } else {
+	    force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
+	  }
+	}
+      }
     }
-};
-
+  };
+  
 }  // namespace Grid
 
 #endif  // SCALAR_INT_ACTION_H
diff --git a/lib/qcd/hmc/GenericHMCrunner.h b/lib/qcd/hmc/GenericHMCrunner.h
index 353b4905..4f6c1af0 100644
--- a/lib/qcd/hmc/GenericHMCrunner.h
+++ b/lib/qcd/hmc/GenericHMCrunner.h
@@ -210,6 +210,9 @@ typedef HMCWrapperTemplate<ScalarImplR, MinimumNorm2, ScalarFields>
 typedef HMCWrapperTemplate<ScalarAdjImplR, MinimumNorm2, ScalarMatrixFields>
     ScalarAdjGenericHMCRunner;
 
+template <int Colours> 
+using ScalarNxNAdjGenericHMCRunner = HMCWrapperTemplate < ScalarNxNAdjImplR<Colours>, MinimumNorm2, ScalarNxNMatrixFields<Colours> >;
+
 }  // namespace QCD
 }  // namespace Grid
 
diff --git a/lib/qcd/representations/hmc_types.h b/lib/qcd/representations/hmc_types.h
index b4991941..3fee377e 100644
--- a/lib/qcd/representations/hmc_types.h
+++ b/lib/qcd/representations/hmc_types.h
@@ -64,6 +64,9 @@ typedef Representations<FundamentalRepresentation> NoHirep;
 typedef Representations<EmptyRep<typename ScalarImplR::Field> > ScalarFields;
 typedef Representations<EmptyRep<typename ScalarAdjImplR::Field> > ScalarMatrixFields;
 
+template < int Colours> 
+using ScalarNxNMatrixFields = Representations<EmptyRep<typename ScalarNxNAdjImplR<Colours>::Field> >;
+
 // Helper classes to access the elements
 // Strips the first N parameters from the tuple
 // sequence of classes to obtain the S sequence
diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h
index d1c28e78..887142c4 100644
--- a/lib/stencil/Stencil.h
+++ b/lib/stencil/Stencil.h
@@ -286,7 +286,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
   {
     int dimension    = _directions[point];
     int displacement = _distances[point];
-
+    
     int fd = _grid->_fdimensions[dimension];
     int rd = _grid->_rdimensions[dimension];
     
diff --git a/tests/hmc/Test_hmc_ScalarActionNxN.cc b/tests/hmc/Test_hmc_ScalarActionNxN.cc
index f63936b5..b3ce6840 100644
--- a/tests/hmc/Test_hmc_ScalarActionNxN.cc
+++ b/tests/hmc/Test_hmc_ScalarActionNxN.cc
@@ -32,68 +32,116 @@ class ScalarActionParameters : Serializable {
   GRID_SERIALIZABLE_CLASS_MEMBERS(ScalarActionParameters,
     double, mass_squared,
     double, lambda);
+
+    template <class ReaderClass >
+  ScalarActionParameters(Reader<ReaderClass>& Reader){
+    read(Reader, "ScalarAction", *this);
+  }
+
 };
 
 }
 int main(int argc, char **argv) {
   using namespace Grid;
   using namespace Grid::QCD;
-
+  typedef Grid::JSONReader       Serialiser;
+  
   Grid_init(&argc, &argv);
   int threads = GridThread::GetThreads();
   // here make a routine to print all the relevant information on the run
   std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
 
   // Typedefs to simplify notation
-  typedef ScalarAdjGenericHMCRunner HMCWrapper;  // Uses the default minimum norm, real scalar fields
-
+  constexpr int Ncolours    = 4;
+  constexpr int Ndimensions = 3;
+  typedef ScalarNxNAdjGenericHMCRunner<Ncolours> HMCWrapper;  // Uses the default minimum norm, real scalar fields
+  typedef ScalarAdjActionR<Ncolours, Ndimensions> ScalarAction;
   //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
   HMCWrapper TheHMC;
+  TheHMC.ReadCommandLine(argc, argv);
+
+  if (TheHMC.ParameterFile.empty()){
+    std::cout << "Input file not specified."
+              << "Use --ParameterFile option in the command line.\nAborting" 
+              << std::endl;
+    exit(1);
+  }
+  Serialiser Reader(TheHMC.ParameterFile);
 
   // Grid from the command line
   GridModule ScalarGrid;
-  ScalarGrid.set_full(SpaceTimeGrid::makeFourDimGrid(
-        GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
-        GridDefaultMpi()));
-  ScalarGrid.set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(ScalarGrid.get_full()));
+  if (GridDefaultLatt().size() != Ndimensions){
+    std::cout << "Incorrect dimension of the grid\n. Expected dim="<< Ndimensions << std::endl;
+    exit(1);
+  }
+  if (GridDefaultMpi().size() != Ndimensions){
+    std::cout << "Incorrect dimension of the mpi grid\n. Expected dim="<< Ndimensions << std::endl;
+    exit(1);
+  }
+  ScalarGrid.set_full(new GridCartesian(GridDefaultLatt(),GridDefaultSimd(Ndimensions, vComplex::Nsimd()),GridDefaultMpi()));
+  ScalarGrid.set_rb(new GridRedBlackCartesian(ScalarGrid.get_full()));
   TheHMC.Resources.AddGrid("scalar", ScalarGrid);
-  // Possibile to create the module by hand
-  // hardcoding parameters or using a Reader
+  std::cout << "Lattice size : " << GridDefaultLatt() << std::endl;
 
   // Checkpointer definition
-  CheckpointerParameters CPparams;
-  CPparams.config_prefix = "ckpoint_scalar_lat";
-  CPparams.rng_prefix = "ckpoint_scalar_rng";
-  CPparams.saveInterval = 50;
-  CPparams.format = "IEEE64BIG";
-
+  CheckpointerParameters CPparams(Reader);
   TheHMC.Resources.LoadBinaryCheckpointer(CPparams);
 
-  RNGModuleParameters RNGpar;
-  RNGpar.serial_seeds = "1 2 3 4 5";
-  RNGpar.parallel_seeds = "6 7 8 9 10";
+  RNGModuleParameters RNGpar(Reader);
   TheHMC.Resources.SetRNGSeeds(RNGpar);
   /////////////////////////////////////////////////////////////
   // Collect actions, here use more encapsulation
 
   // Scalar action in adjoint representation
-  ScalarActionParameters SPar;
-  SPar.mass_squared = 0.5;
-  SPar.lambda       = 0.1;
-  ScalarAdjActionR Saction(SPar.mass_squared, SPar.lambda);
+  ScalarActionParameters SPar(Reader);
+  ScalarAction Saction(SPar.mass_squared, SPar.lambda);
 
   // Collect actions
-  ActionLevel<ScalarAdjActionR::Field, ScalarMatrixFields> Level1(1);
+  ActionLevel<ScalarAction::Field, ScalarNxNMatrixFields<Ncolours>> Level1(1);
   Level1.push_back(&Saction);
   TheHMC.TheAction.push_back(Level1);
   /////////////////////////////////////////////////////////////
+  TheHMC.Parameters.initialize(Reader);
 
-  // HMC parameters are serialisable
-  TheHMC.Parameters.MD.MDsteps = 20;
-  TheHMC.Parameters.MD.trajL   = 1.0;
-
-  TheHMC.ReadCommandLine(argc, argv);
   TheHMC.Run();
 
   Grid_finalize();
 }  // main
+
+/* Examples for input files
+
+JSON
+
+{
+    "Checkpointer": {
+    "config_prefix": "ckpoint_scalar_lat",
+    "rng_prefix": "ckpoint_scalar_rng",
+    "saveInterval": 1,
+    "format": "IEEE64BIG"
+    },
+    "RandomNumberGenerator": {
+    "serial_seeds": "1 2 3 4 6",
+    "parallel_seeds": "6 7 8 9 11"
+    },
+    "ScalarAction":{
+      "mass_squared": 0.5,
+      "lambda": 0.1
+    },
+    "HMC":{
+    "StartTrajectory": 0,
+    "Trajectories": 100,
+    "MetropolisTest": true,
+    "NoMetropolisUntil": 10,
+    "StartingType": "HotStart",
+    "MD":{
+        "name": "MinimumNorm2",
+	      "MDsteps": 15,
+	      "trajL": 2.0
+	    }
+    }
+}
+
+
+XML example not provided yet
+
+*/
\ No newline at end of file

From 43c817cc67c6447bbf69bfc7d7772fba4e7ff9eb Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Thu, 11 May 2017 00:07:17 +0100
Subject: [PATCH 040/177] Scalar action: const fix

---
 lib/qcd/action/scalar/ScalarInteractionAction.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/lib/qcd/action/scalar/ScalarInteractionAction.h b/lib/qcd/action/scalar/ScalarInteractionAction.h
index ca8207bd..5f4c630c 100644
--- a/lib/qcd/action/scalar/ScalarInteractionAction.h
+++ b/lib/qcd/action/scalar/ScalarInteractionAction.h
@@ -88,8 +88,7 @@ namespace Grid {
 	  int permute_type;
 	  StencilEntry *SE;
 	  vobj temp2;
-	  vobj *temp;
-	  vobj *t_p;
+	  const vobj *temp, *t_p;
 	    
 	  SE = phiStencil.GetEntry(permute_type, mu, i);
 	  t_p  = &p._odata[i];
@@ -122,7 +121,7 @@ namespace Grid {
       //for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
       for (int point = 0; point < npoint; point++) {
 	parallel_for (int i = 0; i < p._grid->oSites(); i++) {
-	  vobj *temp;
+	  const vobj *temp;
 	  vobj temp2;
 	  int permute_type;
 	  StencilEntry *SE;

From d1ece741370d1b829f5946afc7c21c585a158d31 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Thu, 11 May 2017 11:40:44 +0100
Subject: [PATCH 041/177] HMC scalar test: magnetisation measurement

---
 tests/hmc/Test_hmc_ScalarActionNxN.cc | 54 ++++++++++++++++++++++++---
 1 file changed, 49 insertions(+), 5 deletions(-)

diff --git a/tests/hmc/Test_hmc_ScalarActionNxN.cc b/tests/hmc/Test_hmc_ScalarActionNxN.cc
index b3ce6840..bcaee31d 100644
--- a/tests/hmc/Test_hmc_ScalarActionNxN.cc
+++ b/tests/hmc/Test_hmc_ScalarActionNxN.cc
@@ -39,11 +39,50 @@ class ScalarActionParameters : Serializable {
   }
 
 };
-
 }
+
+using namespace Grid;
+using namespace Grid::QCD;
+
+template <class Impl>
+class MagLogger : public HmcObservable<typename Impl::Field> {
+public:
+  typedef typename Impl::Field Field;
+  typedef typename Impl::Simd::scalar_type Trace;
+  
+  void TrajectoryComplete(int traj,
+                          Field &U,
+                          GridSerialRNG &sRNG,
+                          GridParallelRNG &pRNG) {
+    
+    int def_prec = std::cout.precision();
+    
+    std::cout << std::setprecision(std::numeric_limits<Real>::digits10 + 1);
+    std::cout << GridLogMessage
+              << "m= " << TensorRemove(trace(sum(U))) << std::endl;
+    std::cout << GridLogMessage
+              << "m^2= " << TensorRemove(trace(sum(U)*sum(U))) << std::endl;
+    std::cout.precision(def_prec);
+    
+  }
+private:
+  
+};
+
+template <class Impl>
+class MagMod: public ObservableModule<MagLogger<Impl>, NoParameters>{
+  typedef ObservableModule<MagLogger<Impl>, NoParameters> ObsBase;
+  using ObsBase::ObsBase; // for constructors
+  
+  // acquire resource
+  virtual void initialize(){
+    this->ObservablePtr.reset(new MagLogger<Impl>());
+  }
+public:
+  MagMod(): ObsBase(NoParameters()){}
+};
+
 int main(int argc, char **argv) {
-  using namespace Grid;
-  using namespace Grid::QCD;
   typedef Grid::JSONReader       Serialiser;
   
   Grid_init(&argc, &argv);
@@ -52,7 +91,7 @@ int main(int argc, char **argv) {
   std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
 
   // Typedefs to simplify notation
-  constexpr int Ncolours    = 4;
+  constexpr int Ncolours    = 2;
   constexpr int Ndimensions = 3;
   typedef ScalarNxNAdjGenericHMCRunner<Ncolours> HMCWrapper;  // Uses the default minimum norm, real scalar fields
   typedef ScalarAdjActionR<Ncolours, Ndimensions> ScalarAction;
@@ -89,6 +128,11 @@ int main(int argc, char **argv) {
 
   RNGModuleParameters RNGpar(Reader);
   TheHMC.Resources.SetRNGSeeds(RNGpar);
+  
+  // Construct observables
+  typedef MagMod<HMCWrapper::ImplPolicy> MagObs;
+  TheHMC.Resources.AddObservable<MagObs>();
+  
   /////////////////////////////////////////////////////////////
   // Collect actions, here use more encapsulation
 
@@ -144,4 +188,4 @@ JSON
 
 XML example not provided yet
 
-*/
\ No newline at end of file
+*/

From 3f858d675557536feb6bac6312e4205c987857d9 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Wed, 17 May 2017 13:25:14 +0200
Subject: [PATCH 042/177] Scalar: phi^2 observable

---
 tests/hmc/Test_hmc_ScalarActionNxN.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/hmc/Test_hmc_ScalarActionNxN.cc b/tests/hmc/Test_hmc_ScalarActionNxN.cc
index bcaee31d..a7490f51 100644
--- a/tests/hmc/Test_hmc_ScalarActionNxN.cc
+++ b/tests/hmc/Test_hmc_ScalarActionNxN.cc
@@ -62,6 +62,8 @@ public:
               << "m= " << TensorRemove(trace(sum(U))) << std::endl;
     std::cout << GridLogMessage
               << "m^2= " << TensorRemove(trace(sum(U)*sum(U))) << std::endl;
+    std::cout << GridLogMessage
+    << "phi^2= " << TensorRemove(sum(trace(U*U))) << std::endl;
     std::cout.precision(def_prec);
     
   }

From a8c10b1933948d491371da0d4df32cb3059c3b97 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 25 May 2017 11:43:33 +0100
Subject: [PATCH 043/177] Use a global-X x Local-Y chunksize for parallel
 binary I/O.

Gives O(32 x 8 x 18*8*8) chunk size on configuration I/O.

At 150KB this should be getting close to packet sizes and to the 4MB
filesystem block sizes that are reasonably (!?) performant. We shall see
once I move this off my laptop and over to BNL and time it.
---
 lib/parallelIO/BinaryIO.h | 196 +++++++++++++++++++++-----------------
 lib/parallelIO/NerscIO.h  |   6 +-
 2 files changed, 113 insertions(+), 89 deletions(-)

diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h
index afa7eb2e..ab449f92 100644
--- a/lib/parallelIO/BinaryIO.h
+++ b/lib/parallelIO/BinaryIO.h
@@ -217,32 +217,34 @@ class BinaryIO {
     Umu = zero;
     uint32_t csum=0;
     uint64_t bytes=0;
-    fobj file_object;
-    sobj munged;
-    
+
+    int lx = grid->_fdimensions[0];
+    std::vector<fobj> file_object(lx);
+    std::vector<sobj> munged(lx);
     for(int t=0;t<grid->_fdimensions[3];t++){
     for(int z=0;z<grid->_fdimensions[2];z++){
     for(int y=0;y<grid->_fdimensions[1];y++){
-    for(int x=0;x<grid->_fdimensions[0];x++){
-
-      std::vector<int> site({x,y,z,t});
-
+    {
+      bytes += sizeof(fobj)*lx;
       if (grid->IsBoss()) {
-        fin.read((char *)&file_object, sizeof(file_object));assert( fin.fail()==0);
-        bytes += sizeof(file_object);
-        if (ieee32big) be32toh_v((void *)&file_object, sizeof(file_object));
-        if (ieee32)    le32toh_v((void *)&file_object, sizeof(file_object));
-        if (ieee64big) be64toh_v((void *)&file_object, sizeof(file_object));
-        if (ieee64)    le64toh_v((void *)&file_object, sizeof(file_object));
-
-        munge(file_object, munged, csum);
+        fin.read((char *)&file_object[0], sizeof(fobj)*lx); assert( fin.fail()==0);
+	for(int x=0;x<lx;x++){
+	  if (ieee32big) be32toh_v((void *)&file_object[x], sizeof(fobj));
+	  if (ieee32)    le32toh_v((void *)&file_object[x], sizeof(fobj));
+	  if (ieee64big) be64toh_v((void *)&file_object[x], sizeof(fobj));
+	  if (ieee64)    le64toh_v((void *)&file_object[x], sizeof(fobj));
+	  munge(file_object[x], munged[x], csum);
+	}
+      }
+      for(int x=0;x<lx;x++){
+	std::vector<int> site({x,y,z,t});
+	// The boss who read the file has their value poked
+	pokeSite(munged[x],Umu,site);
       }
-      // The boss who read the file has their value poked
-      pokeSite(munged,Umu,site);
     }}}}
     timer.Stop();
     std::cout<<GridLogPerformance<<"readObjectSerial: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-       << (double)bytes/ (double)timer.useconds() <<" MB/s "  <<std::endl;
+	     << (double)bytes/ (double)timer.useconds() <<" MB/s "  <<std::endl;
 
     grid->Broadcast(0,(void *)&csum,sizeof(csum));
     return csum;
@@ -274,31 +276,34 @@ class BinaryIO {
     }
     uint64_t bytes=0;
     uint32_t csum=0;
-    fobj file_object;
-    sobj unmunged;
+    int lx = grid->_fdimensions[0];
+    std::vector<fobj> file_object(lx);
+    std::vector<sobj> unmunged(lx);
     for(int t=0;t<grid->_fdimensions[3];t++){
     for(int z=0;z<grid->_fdimensions[2];z++){
     for(int y=0;y<grid->_fdimensions[1];y++){
-    for(int x=0;x<grid->_fdimensions[0];x++){
+    {
 
-      std::vector<int> site({x,y,z,t});
+      std::vector<int> site({0,y,z,t});
       // peek & write
-      peekSite(unmunged,Umu,site);
-
-      munge(unmunged,file_object,csum);
-
+      for(int x=0;x<lx;x++){
+	site[0]=x;
+	peekSite(unmunged[x],Umu,site);
+      }
       
       if ( grid->IsBoss() ) {
-	if(ieee32big) htobe32_v((void *)&file_object,sizeof(file_object));
-	if(ieee32)    htole32_v((void *)&file_object,sizeof(file_object));
-	if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
-	if(ieee64)    htole64_v((void *)&file_object,sizeof(file_object));
-
-	// NB could gather an xstrip as an optimisation.
-	fout.write((char *)&file_object,sizeof(file_object));assert( fout.fail()==0);
-	bytes+=sizeof(file_object);
+	for(int x=0;x<lx;x++){
+	  munge(unmunged[x],file_object[x],csum);
+	  if(ieee32big) htobe32_v((void *)&file_object[x],sizeof(fobj));
+	  if(ieee32)    htole32_v((void *)&file_object[x],sizeof(fobj));
+	  if(ieee64big) htobe64_v((void *)&file_object[x],sizeof(fobj));
+	  if(ieee64)    htole64_v((void *)&file_object[x],sizeof(fobj));
+	}
+	fout.write((char *)&file_object[0],sizeof(fobj)*lx);assert( fout.fail()==0);
+	bytes+=sizeof(fobj)*lx;
       }
     }}}}
+
     timer.Stop();
     std::cout<<GridLogPerformance<<"writeObjectSerial: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
 	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
@@ -370,7 +375,7 @@ class BinaryIO {
 
     timer.Stop();
 
-    std::cout << GridLogMessage   << "RNG file checksum " << std::hex << csum << std::dec << std::endl;
+    std::cout << GridLogMessage << "RNG file checksum " << std::hex << csum << std::dec << std::endl;
     std::cout << GridLogMessage << "RNG state saved in " << timer.Elapsed() << std::endl;
     return csum;
   }
@@ -414,8 +419,6 @@ class BinaryIO {
       grid->GlobalIndexToGlobalCoor(gidx,gcoor);
       grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
       int l_idx=parallel.generator_idx(o_idx,i_idx);
-      //std::cout << GridLogDebug << "l_idx " << l_idx << " o_idx " << o_idx
-      //          << " i_idx " << i_idx << " rank " << rank << std::endl;
 
       if ( grid->IsBoss() ) {
 	fin.read((char *)&saved[0],bytes);assert( fin.fail()==0);
@@ -460,14 +463,12 @@ class BinaryIO {
     int ieee64    = (format == std::string("IEEE64"));
 
 
-    // Take into account block size of parallel file systems want about
-    // 4-16MB chunks.
     // Ideally one reader/writer per xy plane and read these contiguously
     // with comms from nominated I/O nodes.
     std::ifstream fin;
 
     int nd = grid->_ndimension;
-    std::vector<int> parallel(nd,1);
+    std::vector<int> parallel(nd,1); parallel[0] = 0;
     std::vector<int> ioproc  (nd);
     std::vector<int> start(nd);
     std::vector<int> range(nd);
@@ -479,9 +480,15 @@ class BinaryIO {
     uint64_t slice_vol = 1;
 
     int IOnode = 1;
-    for(int d=0;d<grid->_ndimension;d++) {
+    int gstrip = grid->_gdimensions[0];
+    int lstrip = grid->_ldimensions[0];
 
-      if ( d == 0 ) parallel[d] = 0;
+    int chunk ;
+    if ( nd==1) chunk = gstrip;
+    else        chunk = gstrip*grid->_ldimensions[1];
+
+    for(int d=0;d<grid->_ndimension;d++) {
+      
       if (parallel[d]) {
 	range[d] = grid->_ldimensions[d];
 	start[d] = grid->_processor_coor[d]*range[d];
@@ -500,13 +507,16 @@ class BinaryIO {
       uint32_t tmp = IOnode;
       grid->GlobalSum(tmp);
       std::cout<< std::dec ;
-      std::cout<< GridLogMessage<< "Parallel read I/O to "<< file << " with " <<tmp<< " IOnodes for subslice ";
+      std::cout<< GridLogMessage<< "Parallel read I/O from "<< file << " with " <<tmp<< " IOnodes for subslice ";
       for(int d=0;d<grid->_ndimension;d++){
 	std::cout<< range[d];
 	if( d< grid->_ndimension-1 ) 
 	  std::cout<< " x ";
       }
       std::cout << std::endl;
+      std::cout<< GridLogMessage<< "Parallel I/O local  strip size is "<< lstrip <<std::endl;
+      std::cout<< GridLogMessage<< "Parallel I/O global strip size is "<< gstrip <<std::endl;
+      std::cout<< GridLogMessage<< "Parallel I/O chunk size is "<< chunk  <<std::endl;
     }
 
     GridStopWatch timer; timer.Start();
@@ -515,10 +525,11 @@ class BinaryIO {
     int myrank = grid->ThisRank();
     int iorank = grid->RankFromProcessorCoor(ioproc);
 
-    if (!ILDG.is_ILDG)
-    	if ( IOnode ) { 
-    		fin.open(file,std::ios::binary|std::ios::in);
-    	}
+    if (!ILDG.is_ILDG) {
+      if ( IOnode ) { 
+	fin.open(file,std::ios::binary|std::ios::in);
+      }
+    }
 
     //////////////////////////////////////////////////////////
     // Find the location of each site and send to primary node
@@ -528,16 +539,15 @@ class BinaryIO {
     Umu = zero;
     static uint32_t csum; csum=0;//static for SHMEM
 
-    fobj fileObj;
-    static sobj siteObj; // Static to place in symmetric region for SHMEM
+    std::vector<fobj> fileObj(chunk); // FIXME
+    std::vector<sobj> siteObj(chunk); // Use comm allocator to place in symmetric region for SHMEM
 
-      // need to implement these loops in Nd independent way with a lexico conversion
-    for(int tlex=0;tlex<slice_vol;tlex++){
+    // need to implement these loops in Nd independent way with a lexico conversion
+    for(int tlex=0;tlex<slice_vol;tlex+=chunk){
 
       std::vector<int> tsite(nd); // temporary mixed up site
       std::vector<int> gsite(nd);
       std::vector<int> lsite(nd);
-      std::vector<int> iosite(nd);
 
       Lexicographic::CoorFromIndex(tsite,tlex,range);
 
@@ -546,53 +556,68 @@ class BinaryIO {
 	gsite[d] = tsite[d]+start[d];               // global site
       }
 
-
-      /////////////////////////
-      // Get the rank of owner of data
-      /////////////////////////
+      ///////////////////////////////////////////
+      // Get the global lexico base of the chunk
+      ///////////////////////////////////////////
       int rank, o_idx,i_idx, g_idx;
       grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gsite);
       grid->GlobalCoorToGlobalIndex(gsite,g_idx);
-      
+
       ////////////////////////////////
       // iorank reads from the seek
       ////////////////////////////////
       if (myrank == iorank) {
 
-
       	if (ILDG.is_ILDG){
-      		// use C-LIME to populate the record
-          #ifdef HAVE_LIME
-          uint64_t sizeFO = sizeof(fileObj);
+#ifdef HAVE_LIME
+	  // use C-LIME to populate the record
+          uint64_t sizeFO = sizeof(fobj)*chunk;
           limeReaderSeek(ILDG.LR, g_idx*sizeFO, SEEK_SET);
-          int status = limeReaderReadData((void *)&fileObj, &sizeFO, ILDG.LR);
-          #endif
+          int status = limeReaderReadData((void *)&fileObj[0], &sizeFO, ILDG.LR);
+#endif
         } else{
-          fin.seekg(offset+g_idx*sizeof(fileObj));
-          fin.read((char *)&fileObj,sizeof(fileObj));
+          fin.seekg(offset+g_idx*sizeof(fobj));
+          fin.read((char *)&fileObj[0],sizeof(fobj)*chunk);
         }
-        bytes+=sizeof(fileObj);
+        bytes+=sizeof(fobj)*chunk;
 
-        if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
-        if(ieee32)    le32toh_v((void *)&fileObj,sizeof(fileObj));
-        if(ieee64big) be64toh_v((void *)&fileObj,sizeof(fileObj));
-        if(ieee64)    le64toh_v((void *)&fileObj,sizeof(fileObj));
+        if(ieee32big) be32toh_v((void *)&fileObj[0],sizeof(fobj)*chunk);
+        if(ieee32)    le32toh_v((void *)&fileObj[0],sizeof(fobj)*chunk);
+        if(ieee64big) be64toh_v((void *)&fileObj[0],sizeof(fobj)*chunk);
+        if(ieee64)    le64toh_v((void *)&fileObj[0],sizeof(fobj)*chunk);
 
-        munge(fileObj,siteObj,csum);
+	for(int c=0;c<chunk;c++) munge(fileObj[c],siteObj[c],csum);
 
       } 
-      
+     
       // Possibly do transport through pt2pt 
-      if ( rank != iorank ) { 
-	if ( (myrank == rank) || (myrank==iorank) ) {
-	  grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj));
+      for(int cc=0;cc<chunk;cc+=lstrip){
+
+	/////////////////////////////////
+	// Get the rank of owner of strip
+	/////////////////////////////////
+	Lexicographic::CoorFromIndex(tsite,tlex+cc,range);
+
+	for(int d=0;d<nd;d++){
+	  lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
+	  gsite[d] = tsite[d]+start[d];               // global site
 	}
+	grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gsite);
+
+	if ( rank != iorank ) { 
+	  if ( (myrank == rank) || (myrank==iorank) ) {
+	    grid->SendRecvPacket((void *)&siteObj[cc],(void *)&siteObj[cc],iorank,rank,sizeof(sobj)*lstrip);
+	  }
+	}
+	// Poke at destination
+	if ( myrank == rank ) {
+	  for(int x=0;x<lstrip;x++){
+	    lsite[0]=x;
+	    pokeLocalSite(siteObj[cc+x],Umu,lsite);
+	  }
+	}
+	grid->Barrier(); // necessary?
       }
-      // Poke at destination
-      if ( myrank == rank ) {
-	pokeLocalSite(siteObj,Umu,lsite);
-      }
-      grid->Barrier(); // necessary?
     }
 
     grid->GlobalSum(csum);
@@ -601,7 +626,7 @@ class BinaryIO {
 
     timer.Stop();
     std::cout<<GridLogPerformance<<"readObjectParallel: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
+	     <<(double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
     return csum;
   }
 
@@ -623,11 +648,8 @@ class BinaryIO {
     int ieee64 = (format == std::string("IEEE64"));
 
     if (!(ieee32big || ieee32 || ieee64big || ieee64)) {
-      std::cout << GridLogError << "Unrecognized file format " << format
-                << std::endl;
-      std::cout << GridLogError
-                << "Allowed: IEEE32BIG | IEEE32 | IEEE64BIG | IEEE64"
-                << std::endl;
+      std::cout << GridLogError << "Unrecognized file format " << format << std::endl;
+      std::cout << GridLogError << "Allowed: IEEE32BIG | IEEE32 | IEEE64BIG | IEEE64" << std::endl;
       exit(0);
     }
 
@@ -715,10 +737,10 @@ class BinaryIO {
     // need to implement these loops in Nd independent way with a lexico
     // conversion
     for (int tlex = 0; tlex < slice_vol; tlex++) {
+
       std::vector<int> tsite(nd);  // temporary mixed up site
       std::vector<int> gsite(nd);
       std::vector<int> lsite(nd);
-      std::vector<int> iosite(nd);
 
       Lexicographic::CoorFromIndex(tsite, tlex, range);
 
diff --git a/lib/parallelIO/NerscIO.h b/lib/parallelIO/NerscIO.h
index f0159d41..cd20c841 100644
--- a/lib/parallelIO/NerscIO.h
+++ b/lib/parallelIO/NerscIO.h
@@ -30,6 +30,9 @@
 #ifndef GRID_NERSC_IO_H
 #define GRID_NERSC_IO_H
 
+#define PARALLEL_READ
+#undef PARALLEL_WRITE
+
 #include <algorithm>
 #include <iostream>
 #include <iomanip>
@@ -326,8 +329,6 @@ namespace Grid {
       /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       // Now the meat: the object readers
       /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-#define PARALLEL_READ
-#define PARALLEL_WRITE
 
       template<class vsimd>
       static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,NerscField& header,std::string file)
@@ -399,6 +400,7 @@ namespace Grid {
 	       <<" header    "<<header.plaquette<<std::endl;
       std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" link_trace "<<clone.link_trace
 	       <<" header    "<<header.link_trace<<std::endl;
+
       assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
       assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
       assert(csum == header.checksum );

From b8b5934193da2855cb34edb2e569b9cd501ff5ef Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 25 May 2017 13:32:24 +0100
Subject: [PATCH 044/177] Attempts to speed up the parallel IO

---
 lib/parallelIO/BinaryIO.h | 204 +++++++++++++++++++++-----------------
 lib/parallelIO/NerscIO.h  |  18 +++-
 tests/IO/Test_nersc_io.cc |   2 +-
 3 files changed, 133 insertions(+), 91 deletions(-)

diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h
index ab449f92..c1fca348 100644
--- a/lib/parallelIO/BinaryIO.h
+++ b/lib/parallelIO/BinaryIO.h
@@ -228,11 +228,11 @@ class BinaryIO {
       bytes += sizeof(fobj)*lx;
       if (grid->IsBoss()) {
         fin.read((char *)&file_object[0], sizeof(fobj)*lx); assert( fin.fail()==0);
+	if (ieee32big) be32toh_v((void *)&file_object[0], sizeof(fobj)*lx);
+	if (ieee32)    le32toh_v((void *)&file_object[0], sizeof(fobj)*lx);
+	if (ieee64big) be64toh_v((void *)&file_object[0], sizeof(fobj)*lx);
+	if (ieee64)    le64toh_v((void *)&file_object[0], sizeof(fobj)*lx);
 	for(int x=0;x<lx;x++){
-	  if (ieee32big) be32toh_v((void *)&file_object[x], sizeof(fobj));
-	  if (ieee32)    le32toh_v((void *)&file_object[x], sizeof(fobj));
-	  if (ieee64big) be64toh_v((void *)&file_object[x], sizeof(fobj));
-	  if (ieee64)    le64toh_v((void *)&file_object[x], sizeof(fobj));
 	  munge(file_object[x], munged[x], csum);
 	}
       }
@@ -294,11 +294,12 @@ class BinaryIO {
       if ( grid->IsBoss() ) {
 	for(int x=0;x<lx;x++){
 	  munge(unmunged[x],file_object[x],csum);
-	  if(ieee32big) htobe32_v((void *)&file_object[x],sizeof(fobj));
-	  if(ieee32)    htole32_v((void *)&file_object[x],sizeof(fobj));
-	  if(ieee64big) htobe64_v((void *)&file_object[x],sizeof(fobj));
-	  if(ieee64)    htole64_v((void *)&file_object[x],sizeof(fobj));
 	}
+	if(ieee32big) htobe32_v((void *)&file_object[0],sizeof(fobj)*lx);
+	if(ieee32)    htole32_v((void *)&file_object[0],sizeof(fobj)*lx);
+	if(ieee64big) htobe64_v((void *)&file_object[0],sizeof(fobj)*lx);
+	if(ieee64)    htole64_v((void *)&file_object[0],sizeof(fobj)*lx);
+
 	fout.write((char *)&file_object[0],sizeof(fobj)*lx);assert( fout.fail()==0);
 	bytes+=sizeof(fobj)*lx;
       }
@@ -350,10 +351,14 @@ class BinaryIO {
       int l_idx=parallel.generator_idx(o_idx,i_idx);
 
       if( rank == grid->ThisRank() ){
-	//  std::cout << "rank" << rank<<" Getting state for index "<<l_idx<<std::endl;
 	parallel.GetState(saved,l_idx);
       }
-      grid->Broadcast(rank, (void *)&saved[0], bytes);
+
+      if ( rank != 0 ) {
+	grid->Broadcast(rank, (void *)&saved[0], bytes);
+      }
+
+      grid->Barrier();
 
       if ( grid->IsBoss() ) {
 	Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
@@ -370,8 +375,9 @@ class BinaryIO {
 
     grid->Broadcast(0, (void *)&csum, sizeof(csum));
 
-    if (grid->IsBoss()) 
+    if (grid->IsBoss()) {
       fout.close();
+    }
 
     timer.Stop();
 
@@ -426,6 +432,7 @@ class BinaryIO {
       }
       
       grid->Broadcast(0,(void *)&saved[0],bytes);
+      grid->Barrier();
 
       if( rank == grid->ThisRank() ){
         parallel.SetState(saved,l_idx);
@@ -434,8 +441,8 @@ class BinaryIO {
 
     if ( grid->IsBoss() ) {
       fin.read((char *)&saved[0],bytes);assert( fin.fail()==0);
-      serial.SetState(saved,0);
       Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
+      serial.SetState(saved,0);
     }
 
     std::cout << GridLogMessage << "RNG file checksum " << std::hex << csum << std::dec << std::endl;
@@ -445,7 +452,6 @@ class BinaryIO {
     return csum;
   }
 
-
   template <class vobj, class fobj, class munger>
   static inline uint32_t readObjectParallel(Lattice<vobj> &Umu,
                                             std::string file, 
@@ -528,6 +534,10 @@ class BinaryIO {
     if (!ILDG.is_ILDG) {
       if ( IOnode ) { 
 	fin.open(file,std::ios::binary|std::ios::in);
+	if ( !fin.is_open() ) { 
+	  std::cout << GridLogMessage << "readObjectParallel: Error opening file " << file << std::endl;
+          exit(0);
+	}
       }
     }
 
@@ -540,7 +550,7 @@ class BinaryIO {
     static uint32_t csum; csum=0;//static for SHMEM
 
     std::vector<fobj> fileObj(chunk); // FIXME
-    std::vector<sobj> siteObj(chunk); // Use comm allocator to place in symmetric region for SHMEM
+    std::vector<sobj> siteObj(chunk); // Use alignedAllocator to place in symmetric region for SHMEM
 
     // need to implement these loops in Nd independent way with a lexico conversion
     for(int tlex=0;tlex<slice_vol;tlex+=chunk){
@@ -549,17 +559,13 @@ class BinaryIO {
       std::vector<int> gsite(nd);
       std::vector<int> lsite(nd);
 
-      Lexicographic::CoorFromIndex(tsite,tlex,range);
-
-      for(int d=0;d<nd;d++){
-	lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
-	gsite[d] = tsite[d]+start[d];               // global site
-      }
+      int rank, o_idx,i_idx, g_idx;
 
       ///////////////////////////////////////////
       // Get the global lexico base of the chunk
       ///////////////////////////////////////////
-      int rank, o_idx,i_idx, g_idx;
+      Lexicographic::CoorFromIndex(tsite,tlex,range);
+      for(int d=0;d<nd;d++) gsite[d] = tsite[d]+start[d];
       grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gsite);
       grid->GlobalCoorToGlobalIndex(gsite,g_idx);
 
@@ -571,11 +577,14 @@ class BinaryIO {
       	if (ILDG.is_ILDG){
 #ifdef HAVE_LIME
 	  // use C-LIME to populate the record
-          uint64_t sizeFO = sizeof(fobj)*chunk;
+          uint64_t sizeFO   = sizeof(fobj);
+          uint64_t sizeChunk= sizeFO*chunk;
           limeReaderSeek(ILDG.LR, g_idx*sizeFO, SEEK_SET);
-          int status = limeReaderReadData((void *)&fileObj[0], &sizeFO, ILDG.LR);
+          int status = limeReaderReadData((void *)&fileObj[0], &sizeChunk, ILDG.LR);
+#else 
+	  assert(0);
 #endif
-        } else{
+        } else {
           fin.seekg(offset+g_idx*sizeof(fobj));
           fin.read((char *)&fileObj[0],sizeof(fobj)*chunk);
         }
@@ -630,6 +639,7 @@ class BinaryIO {
     return csum;
   }
 
+
   //////////////////////////////////////////////////////////
   // Parallel writer
   //////////////////////////////////////////////////////////
@@ -643,9 +653,9 @@ class BinaryIO {
     GridBase *grid = Umu._grid;
 
     int ieee32big = (format == std::string("IEEE32BIG"));
-    int ieee32 = (format == std::string("IEEE32"));
+    int ieee32    = (format == std::string("IEEE32"));
     int ieee64big = (format == std::string("IEEE64BIG"));
-    int ieee64 = (format == std::string("IEEE64"));
+    int ieee64    = (format == std::string("IEEE64"));
 
     if (!(ieee32big || ieee32 || ieee64big || ieee64)) {
       std::cout << GridLogError << "Unrecognized file format " << format << std::endl;
@@ -658,7 +668,9 @@ class BinaryIO {
       assert(grid->CheckerBoarded(d) == 0);
     }
 
-    std::vector<int> parallel(nd, 1);
+    // Parallel in yzt, serial funnelled in "x".
+    // gx x ly chunk size
+    std::vector<int> parallel(nd, 1); parallel[0] = 0;
     std::vector<int> ioproc(nd);
     std::vector<int> start(nd);
     std::vector<int> range(nd);
@@ -666,9 +678,13 @@ class BinaryIO {
     uint64_t slice_vol = 1;
 
     int IOnode = 1;
+    int gstrip = grid->_gdimensions[0];
+    int lstrip = grid->_ldimensions[0];
+    int chunk;
+    if ( nd==1) chunk = gstrip;
+    else        chunk = gstrip*grid->_ldimensions[1];
 
     for (int d = 0; d < grid->_ndimension; d++) {
-      if (d != grid->_ndimension - 1) parallel[d] = 0;
 
       if (parallel[d]) {
 	range[d] = grid->_ldimensions[d];
@@ -688,14 +704,16 @@ class BinaryIO {
     {
       uint32_t tmp = IOnode;
       grid->GlobalSum(tmp);
-      std::cout<< GridLogMessage<< "Parallel write I/O from "<< file
-	       << " with " <<tmp<< " IOnodes for subslice ";
+      std::cout<< GridLogMessage<< "Parallel write I/O from "<< file << " with " <<tmp<< " IOnodes for subslice ";
       for(int d=0;d<grid->_ndimension;d++){
 	std::cout<< range[d];
 	if( d< grid->_ndimension-1 ) 
 	  std::cout<< " x ";
       }
       std::cout << std::endl;
+      std::cout<< GridLogMessage<< "Parallel I/O local  strip size is "<< lstrip <<std::endl;
+      std::cout<< GridLogMessage<< "Parallel I/O global strip size is "<< gstrip <<std::endl;
+      std::cout<< GridLogMessage<< "Parallel I/O chunk size is "<< chunk  <<std::endl;
     }
     
     GridStopWatch timer;
@@ -706,21 +724,19 @@ class BinaryIO {
     int iorank = grid->RankFromProcessorCoor(ioproc);
 
     // Take into account block size of parallel file systems want about
-    // 4-16MB chunks.
     // Ideally one reader/writer per xy plane and read these contiguously
     // with comms from nominated I/O nodes.
     std::ofstream fout;
-    if (!ILDG.is_ILDG)
-    	if (IOnode){
-    		fout.open(file, std::ios::binary | std::ios::in | std::ios::out);
-    		if (!fout.is_open()) {
-    			std::cout << GridLogMessage << "writeObjectParallel: Error opening file " << file
-    			<< std::endl;
-    			exit(0);
-    		}
-    	}
-
-
+    if (!ILDG.is_ILDG) {
+      if (IOnode){
+	fout.open(file, std::ios::binary | std::ios::in | std::ios::out);
+	if (!fout.is_open()) {
+	  std::cout << GridLogMessage << "writeObjectParallel: Error opening file " << file << std::endl;
+	  exit(0);
+	}
+      }
+    }
+    
     //////////////////////////////////////////////////////////
     // Find the location of each site and send to primary node
     // Take loop order from Chroma; defines loop order now that NERSC doc no
@@ -729,72 +745,82 @@ class BinaryIO {
     //////////////////////////////////////////////////////////
 
     uint32_t csum = 0;
-    fobj fileObj;
-    static sobj siteObj;  // static for SHMEM target; otherwise dynamic allocate
-                          // with AlignedAllocator
+    std::vector<fobj> fileObj(chunk);
+    std::vector<sobj> siteObj(chunk);
 
     // should aggregate a whole chunk and then write.
     // need to implement these loops in Nd independent way with a lexico
     // conversion
-    for (int tlex = 0; tlex < slice_vol; tlex++) {
+    for (int tlex = 0; tlex < slice_vol; tlex+=chunk) {
 
       std::vector<int> tsite(nd);  // temporary mixed up site
       std::vector<int> gsite(nd);
       std::vector<int> lsite(nd);
 
-      Lexicographic::CoorFromIndex(tsite, tlex, range);
-
-      for(int d = 0;d < nd; d++){
-	lsite[d] = tsite[d] % grid->_ldimensions[d];  // local site
-	gsite[d] = tsite[d] + start[d];               // global site
-      }
-
-      /////////////////////////
-      // Get the rank of owner of data
-      /////////////////////////
       int rank, o_idx, i_idx, g_idx;
-      grid->GlobalCoorToRankIndex(rank, o_idx, i_idx, gsite);
-      grid->GlobalCoorToGlobalIndex(gsite, g_idx);
 
-      ////////////////////////////////
-      // iorank writes from the seek
-      ////////////////////////////////
+      // Possibly do transport through pt2pt 
+      for(int cc=0;cc<chunk;cc+=lstrip){
 
-      // Owner of data peeks it
-      peekLocalSite(siteObj, Umu, lsite);
+	// Get the rank of owner of strip
+	Lexicographic::CoorFromIndex(tsite,tlex+cc,range);
 
-      // Pair of nodes may need to do pt2pt send
-      if ( rank != iorank ) { // comms is necessary
-	if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it
-	  // Send to IOrank 
-	  grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj));
+	for(int d=0;d<nd;d++){
+	  lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
+	  gsite[d] = tsite[d]+start[d];               // global site
+	}
+	grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gsite);
+
+	// Owner of data peeks it over lstrip
+	if ( myrank == rank ) {
+	  for(int x=0;x<lstrip;x++){
+	    lsite[0]=x;
+	    peekLocalSite(siteObj[cc+x],Umu,lsite);
+	  }
+	}
+
+	// Pair of nodes may need to do pt2pt send
+	if ( rank != iorank ) { // comms is necessary
+	  if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it
+	    // Send to IOrank 
+	    grid->SendRecvPacket((void *)&siteObj[cc],(void *)&siteObj[cc],rank,iorank,sizeof(sobj)*lstrip);
+	  }
 	}
       }
 
       grid->Barrier();  // necessary?
 
+      /////////////////////////
+      // Get the global lexico base of the chunk
+      /////////////////////////
+      Lexicographic::CoorFromIndex(tsite, tlex, range);
+      for(int d = 0;d < nd; d++){ gsite[d] = tsite[d] + start[d];}
+      grid->GlobalCoorToRankIndex(rank, o_idx, i_idx, gsite);
+      grid->GlobalCoorToGlobalIndex(gsite, g_idx);
+
       if (myrank == iorank) {
-        munge(siteObj, fileObj, csum);
 
-        if (ieee32big) htobe32_v((void *)&fileObj, sizeof(fileObj));
-        if (ieee32) htole32_v((void *)&fileObj, sizeof(fileObj));
-        if (ieee64big) htobe64_v((void *)&fileObj, sizeof(fileObj));
-        if (ieee64) htole64_v((void *)&fileObj, sizeof(fileObj));
+	for(int c=0;c<chunk;c++) munge(siteObj[c],fileObj[c],csum);
 
+        if (ieee32big) htobe32_v((void *)&fileObj[0], sizeof(fobj)*chunk);
+        if (ieee32   ) htole32_v((void *)&fileObj[0], sizeof(fobj)*chunk);
+        if (ieee64big) htobe64_v((void *)&fileObj[0], sizeof(fobj)*chunk);
+        if (ieee64   ) htole64_v((void *)&fileObj[0], sizeof(fobj)*chunk);
 
         if (ILDG.is_ILDG) {
-          #ifdef HAVE_LIME
-          uint64_t sizeFO = sizeof(fileObj);
- 					limeWriterSeek(ILDG.LW, g_idx*sizeFO, SEEK_SET);
-          int status = limeWriteRecordData((void *)&fileObj, &sizeFO, ILDG.LW);
-          #endif
-        } 
-
-        else {
-          fout.seekp(offset + g_idx * sizeof(fileObj));
-          fout.write((char *)&fileObj, sizeof(fileObj));assert( fout.fail()==0);
+#ifdef HAVE_LIME
+          uint64_t sizeFO   = sizeof(fobj);
+          uint64_t sizeChunk= sizeof(fobj)*chunk;
+	  limeWriterSeek(ILDG.LW, g_idx*sizeFO, SEEK_SET);
+          int status = limeWriteRecordData((void *)&fileObj[0], &sizeChunk, ILDG.LW);
+#else 
+	  assert(0);
+#endif
+        } else {
+          fout.seekp(offset + g_idx * sizeof(fobj));
+          fout.write((char *)&fileObj[0], sizeof(fobj)*chunk);assert( fout.fail()==0);
         }
-        bytes += sizeof(fileObj);
+        bytes += sizeof(fobj)*chunk;
       }
     }
     
@@ -806,12 +832,12 @@ class BinaryIO {
               << " bytes in " << timer.Elapsed() << " "
               << (double)bytes / timer.useconds() << " MB/s " << std::endl;
 
-
-
      grid->Barrier();  // necessary?
-     if (IOnode) 
-      fout.close();
-
+     if (!ILDG.is_ILDG) {
+       if (IOnode) {
+	 fout.close();
+       }
+     }
 
     return csum;
   }
diff --git a/lib/parallelIO/NerscIO.h b/lib/parallelIO/NerscIO.h
index cd20c841..cf3e41e4 100644
--- a/lib/parallelIO/NerscIO.h
+++ b/lib/parallelIO/NerscIO.h
@@ -31,7 +31,7 @@
 #define GRID_NERSC_IO_H
 
 #define PARALLEL_READ
-#undef PARALLEL_WRITE
+#define PARALLEL_WRITE
 
 #include <algorithm>
 #include <iostream>
@@ -401,6 +401,18 @@ namespace Grid {
       std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" link_trace "<<clone.link_trace
 	       <<" header    "<<header.link_trace<<std::endl;
 
+      if ( fabs(clone.plaquette -header.plaquette ) >=  1.0e-5 ) { 
+	std::cout << " Plaquette mismatch "<<std::endl;
+	std::cout << Umu[0]<<std::endl;
+	std::cout << Umu[1]<<std::endl;
+      }
+      if ( csum != header.checksum ) { 
+	std::cerr << " checksum mismatch " << std::endl;
+	std::cerr << " plaqs " << clone.plaquette << " " << header.plaquette << std::endl;
+	std::cerr << " trace " << clone.link_trace<< " " << header.link_trace<< std::endl;
+	std::cerr << " csum  " <<std::hex<< csum << " " << header.checksum<< std::dec<< std::endl;
+	exit(0);
+      }
       assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
       assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
       assert(csum == header.checksum );
@@ -542,6 +554,10 @@ namespace Grid {
 	// munger is a function of <floating point, Real, data_type>
 	uint32_t csum=BinaryIO::readRNGSerial(serial,parallel,file,offset);
 
+	if ( csum != header.checksum ) { 
+	  std::cerr << "checksum mismatch "<<std::hex<< csum <<" "<<header.checksum<<std::dec<<std::endl;
+	  exit(0);
+	}
 	assert(csum == header.checksum );
 
 	std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl;
diff --git a/tests/IO/Test_nersc_io.cc b/tests/IO/Test_nersc_io.cc
index cb33cfab..cf919a7d 100644
--- a/tests/IO/Test_nersc_io.cc
+++ b/tests/IO/Test_nersc_io.cc
@@ -80,7 +80,7 @@ int main (int argc, char ** argv)
 
   std::vector<LatticeColourMatrix> U(4,&Fine);
   
-  SU3::ColdConfiguration(pRNGa,Umu);
+  SU3::HotConfiguration(pRNGa,Umu);
 
   NerscField header;
   std::string file("./ckpoint_lat.4000");

From 69470ccc10e688908b9d17ea94a6e18759a8dc1a Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 25 May 2017 13:41:26 +0100
Subject: [PATCH 045/177] Update to do list

---
 TODO | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/TODO b/TODO
index 672879cd..a5d4cabd 100644
--- a/TODO
+++ b/TODO
@@ -2,9 +2,9 @@ TODO:
 ---------------
 
 Peter's work list:
-2)- Precision conversion and sort out localConvert      <-- 
-3)- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- started 
-4)- Binary I/O speed up & x-strips
+1)- Precision conversion and sort out localConvert      <-- 
+2)- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- 
+
 -- Profile CG, BlockCG, etc... Flop count/rate -- PARTIAL, time but no flop/s yet
 -- Physical propagator interface
 -- Conserved currents
@@ -13,6 +13,7 @@ Peter's work list:
 -- HDCR resume
 
 Recent DONE 
+-- Binary I/O speed up & x-strips                      <-- DONE
 -- Cut down the exterior overhead                      <-- DONE
 -- Interior legs from SHM comms                        <-- DONE
 -- Half-precision comms                                <-- DONE

From 725c513d9421732e212fe693120de64020299275 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 29 May 2017 16:47:32 -0400
Subject: [PATCH 046/177] Better MPI3 benchmarking

---
 benchmarks/Benchmark_comms.cc | 127 ++++++++++++++++++++--------------
 1 file changed, 75 insertions(+), 52 deletions(-)

diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc
index ce881ef6..532532f8 100644
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -66,7 +66,7 @@ int main (int argc, char ** argv)
   int threads = GridThread::GetThreads();
   std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 
-  int Nloop=500;
+  int Nloop=100;
   int nmu=0;
   int maxlat=24;
   for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
@@ -88,6 +88,9 @@ int main (int argc, char ** argv)
       				    lat*mpi_layout[3]});
 
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+      RealD Nrank = Grid._Nprocessors;
+      RealD Nnode = Grid.NodeCount();
+      RealD ppn = Nrank/Nnode;
 
       std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
       std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
@@ -132,13 +135,13 @@ int main (int argc, char ** argv)
 	}
 	Grid.SendToRecvFromComplete(requests);
 	Grid.Barrier();
-  double stop=usecond();
-  t_time[i] = stop-start; // microseconds
+	double stop=usecond();
+	t_time[i] = stop-start; // microseconds
       }
 
       timestat.statistics(t_time);
 
-      double dbytes    = bytes;
+      double dbytes    = bytes*ppn;
       double xbytes    = dbytes*2.0*ncomm;
       double rbytes    = xbytes;
       double bidibytes = xbytes+rbytes;
@@ -165,6 +168,9 @@ int main (int argc, char ** argv)
       std::vector<int> latt_size  ({lat,lat,lat,lat});
 
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+      RealD Nrank = Grid._Nprocessors;
+      RealD Nnode = Grid.NodeCount();
+      RealD ppn = Nrank/Nnode;
 
       std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
       std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
@@ -213,14 +219,14 @@ int main (int argc, char ** argv)
 	  }
 	}
 	Grid.Barrier();
-      double stop=usecond();
-    t_time[i] = stop-start; // microseconds
+	double stop=usecond();
+	t_time[i] = stop-start; // microseconds
 
       }
 
       timestat.statistics(t_time);
       
-      double dbytes    = bytes;
+      double dbytes    = bytes*ppn;
       double xbytes    = dbytes*2.0*ncomm;
       double rbytes    = xbytes;
       double bidibytes = xbytes+rbytes;
@@ -251,6 +257,9 @@ int main (int argc, char ** argv)
       				    lat*mpi_layout[3]});
 
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+      RealD Nrank = Grid._Nprocessors;
+      RealD Nnode = Grid.NodeCount();
+      RealD ppn = Nrank/Nnode;
 
       std::vector<HalfSpinColourVectorD *> xbuf(8);
       std::vector<HalfSpinColourVectorD *> rbuf(8);
@@ -258,59 +267,66 @@ int main (int argc, char ** argv)
       for(int d=0;d<8;d++){
 	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
       }
 
       int ncomm;
       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
 
+      double dbytes;
       for(int i=0;i<Nloop;i++){
-      double start=usecond();
+	double start=usecond();
+
+	dbytes=0;
+	ncomm=0;
 
 	std::vector<CartesianCommunicator::CommsRequest_t> requests;
 
-	ncomm=0;
 	for(int mu=0;mu<4;mu++){
 	
+
 	  if (mpi_layout[mu]>1 ) {
 	  
 	    ncomm++;
 	    int comm_proc=1;
 	    int xmit_to_rank;
 	    int recv_from_rank;
-	    
 	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	    Grid.StencilSendToRecvFromBegin(requests,
-					    (void *)&xbuf[mu][0],
-					    xmit_to_rank,
-					    (void *)&rbuf[mu][0],
-					    recv_from_rank,
-					    bytes);
+	    dbytes+=
+	      Grid.StencilSendToRecvFromBegin(requests,
+					      (void *)&xbuf[mu][0],
+					      xmit_to_rank,
+					      (void *)&rbuf[mu][0],
+					      recv_from_rank,
+					      bytes);
 	
 	    comm_proc = mpi_layout[mu]-1;
 	  
 	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	    Grid.StencilSendToRecvFromBegin(requests,
-					    (void *)&xbuf[mu+4][0],
-					    xmit_to_rank,
-					    (void *)&rbuf[mu+4][0],
-					    recv_from_rank,
-					    bytes);
+	    dbytes+=
+	      Grid.StencilSendToRecvFromBegin(requests,
+					      (void *)&xbuf[mu+4][0],
+					      xmit_to_rank,
+					      (void *)&rbuf[mu+4][0],
+					      recv_from_rank,
+					      bytes);
 	  
 	  }
 	}
 	Grid.StencilSendToRecvFromComplete(requests);
 	Grid.Barrier();
-      double stop=usecond();
-    t_time[i] = stop-start; // microseconds
-
+	double stop=usecond();
+	t_time[i] = stop-start; // microseconds
+	
       }
 
       timestat.statistics(t_time);
 
-      double dbytes    = bytes;
-      double xbytes    = dbytes*2.0*ncomm;
-      double rbytes    = xbytes;
-      double bidibytes = xbytes+rbytes;
+      dbytes=dbytes*ppn;
+      double xbytes    = dbytes*0.5;
+      double rbytes    = dbytes*0.5;
+      double bidibytes = dbytes;
 
       std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
                <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
@@ -338,6 +354,9 @@ int main (int argc, char ** argv)
       				    lat*mpi_layout[3]});
 
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+      RealD Nrank = Grid._Nprocessors;
+      RealD Nnode = Grid.NodeCount();
+      RealD ppn = Nrank/Nnode;
 
       std::vector<HalfSpinColourVectorD *> xbuf(8);
       std::vector<HalfSpinColourVectorD *> rbuf(8);
@@ -345,16 +364,18 @@ int main (int argc, char ** argv)
       for(int d=0;d<8;d++){
 	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
       }
 
       int ncomm;
       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
-
+      double dbytes;
       for(int i=0;i<Nloop;i++){
-      double start=usecond();
+	double start=usecond();
 
 	std::vector<CartesianCommunicator::CommsRequest_t> requests;
-
+	dbytes=0;
 	ncomm=0;
 	for(int mu=0;mu<4;mu++){
 	
@@ -366,41 +387,43 @@ int main (int argc, char ** argv)
 	    int recv_from_rank;
 	    
 	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	    Grid.StencilSendToRecvFromBegin(requests,
-					    (void *)&xbuf[mu][0],
-					    xmit_to_rank,
-					    (void *)&rbuf[mu][0],
-					    recv_from_rank,
-					    bytes);
+	    dbytes+=
+	      Grid.StencilSendToRecvFromBegin(requests,
+					      (void *)&xbuf[mu][0],
+					      xmit_to_rank,
+					      (void *)&rbuf[mu][0],
+					      recv_from_rank,
+					      bytes);
 	    Grid.StencilSendToRecvFromComplete(requests);
 	    requests.resize(0);
 
 	    comm_proc = mpi_layout[mu]-1;
 	  
 	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	    Grid.StencilSendToRecvFromBegin(requests,
-					    (void *)&xbuf[mu+4][0],
-					    xmit_to_rank,
-					    (void *)&rbuf[mu+4][0],
-					    recv_from_rank,
-					    bytes);
+	    dbytes+=
+	      Grid.StencilSendToRecvFromBegin(requests,
+					      (void *)&xbuf[mu+4][0],
+					      xmit_to_rank,
+					      (void *)&rbuf[mu+4][0],
+					      recv_from_rank,
+					      bytes);
 	    Grid.StencilSendToRecvFromComplete(requests);
 	    requests.resize(0);
 	  
 	  }
 	}
-	    Grid.Barrier();
-      double stop=usecond();
-      t_time[i] = stop-start; // microseconds
-
+	Grid.Barrier();
+	double stop=usecond();
+	t_time[i] = stop-start; // microseconds
+	
       }
 
       timestat.statistics(t_time);
 
-      double dbytes    = bytes;
-      double xbytes    = dbytes*2.0*ncomm;
-      double rbytes    = xbytes;
-      double bidibytes = xbytes+rbytes;
+      dbytes=dbytes*ppn;
+      double xbytes    = dbytes*0.5;
+      double rbytes    = dbytes*0.5;
+      double bidibytes = dbytes;
 
 
       std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"

From 0fb458879d066708de9fbe46712758f95983a53e Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Tue, 30 May 2017 23:37:02 +0100
Subject: [PATCH 047/177] Precision safe compile

---
 tests/forces/Test_contfrac_force.cc  | 2 +-
 tests/forces/Test_dwf_force.cc       | 2 +-
 tests/forces/Test_dwf_gpforce.cc     | 6 +++---
 tests/forces/Test_gp_rect_force.cc   | 2 +-
 tests/forces/Test_gpdwf_force.cc     | 2 +-
 tests/forces/Test_gpwilson_force.cc  | 2 +-
 tests/forces/Test_laplacian_force.cc | 2 +-
 tests/forces/Test_mobius_force.cc    | 2 +-
 tests/forces/Test_partfrac_force.cc  | 2 +-
 tests/forces/Test_rect_force.cc      | 2 +-
 tests/forces/Test_wilson_force.cc    | 6 +++---
 tests/forces/Test_zmobius_force.cc   | 2 +-
 12 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/tests/forces/Test_contfrac_force.cc b/tests/forces/Test_contfrac_force.cc
index 227ad5a0..2afb4dde 100644
--- a/tests/forces/Test_contfrac_force.cc
+++ b/tests/forces/Test_contfrac_force.cc
@@ -139,7 +139,7 @@ int main (int argc, char ** argv)
 
   }
 
-  Complex dSpred    = sum(dS);
+  ComplexD dSpred    = sum(dS);
 
   std::cout << GridLogMessage << " S      "<<S<<std::endl;
   std::cout << GridLogMessage << " Sprime "<<Sprime<<std::endl;
diff --git a/tests/forces/Test_dwf_force.cc b/tests/forces/Test_dwf_force.cc
index 1f26caa7..59aee7b7 100644
--- a/tests/forces/Test_dwf_force.cc
+++ b/tests/forces/Test_dwf_force.cc
@@ -150,7 +150,7 @@ int main (int argc, char ** argv)
 
   }
 
-  Complex dSpred    = sum(dS);
+  ComplexD dSpred    = sum(dS);
 
   std::cout << GridLogMessage << " S      "<<S<<std::endl;
   std::cout << GridLogMessage << " Sprime "<<Sprime<<std::endl;
diff --git a/tests/forces/Test_dwf_gpforce.cc b/tests/forces/Test_dwf_gpforce.cc
index 492be75f..98453aab 100644
--- a/tests/forces/Test_dwf_gpforce.cc
+++ b/tests/forces/Test_dwf_gpforce.cc
@@ -194,9 +194,9 @@ int main (int argc, char ** argv)
 
   }
 
-  Complex dSpred    = sum(dS);
-  Complex dSm       = sum(dSmom);
-  Complex dSm2      = sum(dSmom2);
+  ComplexD dSpred    = sum(dS);
+  ComplexD dSm       = sum(dSmom);
+  ComplexD dSm2      = sum(dSmom2);
 
 
   std::cout << GridLogMessage <<"Initial mom hamiltonian is "<< Hmom <<std::endl;
diff --git a/tests/forces/Test_gp_rect_force.cc b/tests/forces/Test_gp_rect_force.cc
index c74288e1..bf308749 100644
--- a/tests/forces/Test_gp_rect_force.cc
+++ b/tests/forces/Test_gp_rect_force.cc
@@ -113,7 +113,7 @@ int main (int argc, char ** argv)
     dS = dS - trace(mommu*UdSdUmu)*dt*2.0;
 
   }
-  Complex dSpred    = sum(dS);
+  ComplexD dSpred    = sum(dS);
 
   std::cout << GridLogMessage << " S      "<<S<<std::endl;
   std::cout << GridLogMessage << " Sprime "<<Sprime<<std::endl;
diff --git a/tests/forces/Test_gpdwf_force.cc b/tests/forces/Test_gpdwf_force.cc
index 4b970374..ca9b0a2c 100644
--- a/tests/forces/Test_gpdwf_force.cc
+++ b/tests/forces/Test_gpdwf_force.cc
@@ -143,7 +143,7 @@ int main (int argc, char ** argv)
     dS = dS+trace(mommu*forcemu)*dt;
   }
 
-  Complex dSpred    = sum(dS);
+  ComplexD dSpred    = sum(dS);
 
   // From TwoFlavourPseudoFermion:
   //////////////////////////////////////////////////////
diff --git a/tests/forces/Test_gpwilson_force.cc b/tests/forces/Test_gpwilson_force.cc
index f0ccd2fd..ebde61a5 100644
--- a/tests/forces/Test_gpwilson_force.cc
+++ b/tests/forces/Test_gpwilson_force.cc
@@ -143,7 +143,7 @@ int main (int argc, char ** argv)
     dS = dS+trace(mommu*forcemu)*dt;
   }
 
-  Complex dSpred    = sum(dS);
+  ComplexD dSpred    = sum(dS);
 
   std::cout << GridLogMessage << " S      "<<S<<std::endl;
   std::cout << GridLogMessage << " Sprime "<<Sprime<<std::endl;
diff --git a/tests/forces/Test_laplacian_force.cc b/tests/forces/Test_laplacian_force.cc
index 9f69fbc4..c5de1bbc 100644
--- a/tests/forces/Test_laplacian_force.cc
+++ b/tests/forces/Test_laplacian_force.cc
@@ -128,7 +128,7 @@ int main (int argc, char ** argv)
     dS = dS + trace(mommu*UdSdUmu)*dt*2.0;
   }
 
-  Complex dSpred    = sum(dS);
+  ComplexD dSpred    = sum(dS);
 
   std::cout << GridLogMessage << " S      "<<S<<std::endl;
   std::cout << GridLogMessage << " Sprime "<<Sprime<<std::endl;
diff --git a/tests/forces/Test_mobius_force.cc b/tests/forces/Test_mobius_force.cc
index 3bb1502c..06a17ffe 100644
--- a/tests/forces/Test_mobius_force.cc
+++ b/tests/forces/Test_mobius_force.cc
@@ -141,7 +141,7 @@ int main (int argc, char ** argv)
 
   }
 
-  Complex dSpred    = sum(dS);
+  ComplexD dSpred    = sum(dS);
 
   std::cout << GridLogMessage << " -- S         "<<S<<std::endl;
   std::cout << GridLogMessage << " -- Sprime    "<<Sprime<<std::endl;
diff --git a/tests/forces/Test_partfrac_force.cc b/tests/forces/Test_partfrac_force.cc
index 2ee4c931..81f9aaa5 100644
--- a/tests/forces/Test_partfrac_force.cc
+++ b/tests/forces/Test_partfrac_force.cc
@@ -141,7 +141,7 @@ int main (int argc, char ** argv)
 
   }
 
-  Complex dSpred    = sum(dS);
+  ComplexD dSpred    = sum(dS);
 
   std::cout << GridLogMessage << " S      "<<S<<std::endl;
   std::cout << GridLogMessage << " Sprime "<<Sprime<<std::endl;
diff --git a/tests/forces/Test_rect_force.cc b/tests/forces/Test_rect_force.cc
index b6bfd8a1..c312abeb 100644
--- a/tests/forces/Test_rect_force.cc
+++ b/tests/forces/Test_rect_force.cc
@@ -112,7 +112,7 @@ int main (int argc, char ** argv)
     dS = dS - trace(mommu*UdSdUmu)*dt*2.0;
 
   }
-  Complex dSpred    = sum(dS);
+  ComplexD dSpred    = sum(dS);
 
   std::cout << GridLogMessage << " S      "<<S<<std::endl;
   std::cout << GridLogMessage << " Sprime "<<Sprime<<std::endl;
diff --git a/tests/forces/Test_wilson_force.cc b/tests/forces/Test_wilson_force.cc
index 51c67ed4..11bac1ae 100644
--- a/tests/forces/Test_wilson_force.cc
+++ b/tests/forces/Test_wilson_force.cc
@@ -178,9 +178,9 @@ int main (int argc, char ** argv)
 
   }
 
-  Complex dSpred    = sum(dS);
-  Complex dSm       = sum(dSmom);
-  Complex dSm2      = sum(dSmom2);
+  ComplexD dSpred    = sum(dS);
+  ComplexD dSm       = sum(dSmom);
+  ComplexD dSm2      = sum(dSmom2);
 
 
   std::cout << GridLogMessage <<"Initial mom hamiltonian is "<< Hmom <<std::endl;
diff --git a/tests/forces/Test_zmobius_force.cc b/tests/forces/Test_zmobius_force.cc
index 1d909792..329d77e2 100644
--- a/tests/forces/Test_zmobius_force.cc
+++ b/tests/forces/Test_zmobius_force.cc
@@ -155,7 +155,7 @@ int main (int argc, char ** argv)
 
   }
 
-  Complex dSpred    = sum(dS);
+  ComplexD dSpred    = sum(dS);
 
   std::cout << GridLogMessage << " S      "<<S<<std::endl;
   std::cout << GridLogMessage << " Sprime "<<Sprime<<std::endl;

From 62cf9cf6389de853874892ec7ee3437d974fe449 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Tue, 30 May 2017 23:38:02 +0100
Subject: [PATCH 048/177] Cleaner code

---
 lib/simd/Grid_vector_types.h | 31 +++++++------------------------
 lib/simd/Grid_vector_unops.h |  7 -------
 2 files changed, 7 insertions(+), 31 deletions(-)

diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index 0048382f..1ebe7379 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -327,10 +327,6 @@ class Grid_simd {
   // provides support
   ///////////////////////////////////////
 
-  //#if (__GNUC__ == 5 ) || ( ( __GNUC__ == 6 ) && __GNUC_MINOR__ < 3 )
-  //#pragma GCC push_options 
-  //#pragma GCC optimize ("O0") 
-  //#endif
   template <class functor>
   friend inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) {
     Grid_simd ret;
@@ -364,9 +360,6 @@ class Grid_simd {
     ret.v = cx.v;
     return ret;
   }
-  //#if (__GNUC__ == 5 ) || ( ( __GNUC__ == 6 ) && __GNUC_MINOR__ < 3 )
-  //#pragma GCC pop_options
-  //#endif
   ///////////////////////
   // Exchange 
   // Al Ah , Bl Bh -> Al Bl Ah,Bh
@@ -428,7 +421,6 @@ class Grid_simd {
   
 };  // end of Grid_simd class definition
 
-
 inline void permute(ComplexD &y,ComplexD b, int perm) {  y=b; }
 inline void permute(ComplexF &y,ComplexF b, int perm) {  y=b; }
 inline void permute(RealD &y,RealD b, int perm) {  y=b; }
@@ -838,8 +830,6 @@ inline void precisionChange(vComplexD *out,vComplexF *in,int nvec){ precisionCha
 inline void precisionChange(vComplexD *out,vComplexH *in,int nvec){ precisionChange((vRealD *)out,(vRealH *)in,nvec);}
 inline void precisionChange(vComplexF *out,vComplexH *in,int nvec){ precisionChange((vRealF *)out,(vRealH *)in,nvec);}
 
-
-
 // Check our vector types are of an appropriate size.
 #if defined QPX
 static_assert(2*sizeof(SIMD_Ftype) == sizeof(SIMD_Dtype), "SIMD vector lengths incorrect");
@@ -854,21 +844,14 @@ static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths inc
 /////////////////////////////////////////
 template <typename T>
 struct is_simd : public std::false_type {};
-template <>
-struct is_simd<vRealF> : public std::true_type {};
-template <>
-struct is_simd<vRealD> : public std::true_type {};
-template <>
-struct is_simd<vComplexF> : public std::true_type {};
-template <>
-struct is_simd<vComplexD> : public std::true_type {};
-template <>
-struct is_simd<vInteger> : public std::true_type {};
+template <> struct is_simd<vRealF>     : public std::true_type {};
+template <> struct is_simd<vRealD>     : public std::true_type {};
+template <> struct is_simd<vComplexF>  : public std::true_type {};
+template <> struct is_simd<vComplexD>  : public std::true_type {};
+template <> struct is_simd<vInteger>   : public std::true_type {};
 
-template <typename T>
-using IfSimd = Invoke<std::enable_if<is_simd<T>::value, int> >;
-template <typename T>
-using IfNotSimd = Invoke<std::enable_if<!is_simd<T>::value, unsigned> >;
+template <typename T> using IfSimd    = Invoke<std::enable_if<is_simd<T>::value, int> >;
+template <typename T> using IfNotSimd = Invoke<std::enable_if<!is_simd<T>::value, unsigned> >;
 }
 
 #endif
diff --git a/lib/simd/Grid_vector_unops.h b/lib/simd/Grid_vector_unops.h
index 2afac190..2244566f 100644
--- a/lib/simd/Grid_vector_unops.h
+++ b/lib/simd/Grid_vector_unops.h
@@ -179,13 +179,6 @@ inline Grid_simd<S, V> div(const Grid_simd<S, V> &r, Integer y) {
 ////////////////////////////////////////////////////////////////////////////
 // Allows us to assign into **conformable** real vectors from complex
 ////////////////////////////////////////////////////////////////////////////
-//  template < class S, class V >
-//  inline auto ComplexRemove(const Grid_simd<S,V> &c) ->
-//  Grid_simd<Grid_simd<S,V>::Real,V> {
-//    Grid_simd<Grid_simd<S,V>::Real,V> ret;
-//    ret.v = c.v;
-//    return ret;
-//  }
 template <class scalar>
 struct AndFunctor {
   scalar operator()(const scalar &x, const scalar &y) const { return x & y; }

From 58e8d0a10d69c794c2839e6b9093bee3c2b32da2 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Tue, 30 May 2017 23:38:30 +0100
Subject: [PATCH 049/177] Add reverse-direction lexicographic mapping (vectorizeFromLexOrdArray)

---
 lib/lattice/Lattice_transfer.h | 50 +++++++++++++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/lib/lattice/Lattice_transfer.h b/lib/lattice/Lattice_transfer.h
index 68de52d0..c8ba0928 100644
--- a/lib/lattice/Lattice_transfer.h
+++ b/lib/lattice/Lattice_transfer.h
@@ -551,7 +551,10 @@ void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
 
 //Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order
 template<typename vobj, typename sobj>
-typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>::value, void>::type unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in){
+typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>::value, void>::type 
+unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
+{
+
   typedef typename vobj::vector_type vtype;
   
   GridBase* in_grid = in._grid;
@@ -590,6 +593,51 @@ typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>
     extract1(in_vobj, out_ptrs, 0);
   }
 }
+//Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order
+template<typename vobj, typename sobj>
+typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>::value, void>::type 
+vectorizeFromLexOrdArray(std::vector<sobj> &in, const Lattice<vobj> &out)
+{
+
+  typedef typename vobj::vector_type vtype;
+  
+  GridBase* grid = out._grid;
+  assert(in.size()==grid->lSites());
+  
+  int ndim     = grid->Nd();
+  int nsimd    = vtype::Nsimd();
+
+  std::vector<std::vector<int> > icoor(nsimd);
+      
+  for(int lane=0; lane < nsimd; lane++){
+    icoor[lane].resize(ndim);
+    grid->iCoorFromIindex(icoor[lane],lane);
+  }
+  
+  parallel_for(int oidx = 0; oidx < grid->oSites(); oidx++){ //loop over outer index
+    //Assemble vector of pointers to output elements
+    std::vector<sobj*> ptrs(nsimd);
+
+    std::vector<int> ocoor(ndim);
+    grid->oCoorFromOindex(ocoor, oidx);
+
+    std::vector<int> lcoor(grid->Nd());
+      
+    for(int lane=0; lane < nsimd; lane++){
+      for(int mu=0;mu<ndim;mu++)
+	lcoor[mu] = ocoor[mu] + grid->_rdimensions[mu]*icoor[lane][mu];
+
+      int lex;
+      Lexicographic::IndexFromCoor(lcoor, lex, grid->_ldimensions);
+      ptrs[lane] = &in[lex];
+    }
+    
+    //pack from those ptrs
+    vobj vecobj;
+    merge1(vecobj, ptrs, 0);
+    out._odata[oidx] = vecobj; 
+  }
+}
 
 //Convert a Lattice from one precision to another
 template<class VobjOut, class VobjIn>

From e30fa9f4b8fcce40211e69d598617992899b03d4 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Tue, 30 May 2017 23:39:16 +0100
Subject: [PATCH 050/177] RankCount; need to clean up ambiguous ProcessCount

---
 lib/communicator/Communicator_base.cc | 2 ++
 lib/communicator/Communicator_base.h  | 2 ++
 lib/communicator/Communicator_mpi3.cc | 1 +
 3 files changed, 5 insertions(+)

diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc
index 98d2abf4..557fef48 100644
--- a/lib/communicator/Communicator_base.cc
+++ b/lib/communicator/Communicator_base.cc
@@ -60,6 +60,7 @@ void CartesianCommunicator::ShmBufferFreeAll(void) {
 /////////////////////////////////
 // Grid information queries
 /////////////////////////////////
+int                      CartesianCommunicator::Dimensions(void)         { return _ndimension; };
 int                      CartesianCommunicator::IsBoss(void)            { return _processor==0; };
 int                      CartesianCommunicator::BossRank(void)          { return 0; };
 int                      CartesianCommunicator::ThisRank(void)          { return _processor; };
@@ -91,6 +92,7 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
 #if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L)
 
 int                      CartesianCommunicator::NodeCount(void)    { return ProcessorCount();};
+int                      CartesianCommunicator::RankCount(void)    { return ProcessorCount();};
 
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						       void *xmit,
diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h
index e0b9f2c3..23d4f647 100644
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@@ -148,6 +148,7 @@ class CartesianCommunicator {
   int  RankFromProcessorCoor(std::vector<int> &coor);
   void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
   
+  int                      Dimensions(void)        ;
   int                      IsBoss(void)            ;
   int                      BossRank(void)          ;
   int                      ThisRank(void)          ;
@@ -155,6 +156,7 @@ class CartesianCommunicator {
   const std::vector<int> & ProcessorGrid(void)     ;
   int                      ProcessorCount(void)    ;
   int                      NodeCount(void)    ;
+  int                      RankCount(void)    ;
 
   ////////////////////////////////////////////////////////////////////////////////
   // very VERY rarely (Log, serial RNG) we need world without a grid
diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc
index a8bffc14..54a0f9b5 100644
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ -65,6 +65,7 @@ std::vector<int> CartesianCommunicator::MyGroup;
 std::vector<void *> CartesianCommunicator::ShmCommBufs;
 
 int CartesianCommunicator::NodeCount(void)    { return GroupSize;};
+int CartesianCommunicator::RankCount(void)    { return WorldSize;};
 
 
 #undef FORCE_COMMS

From 53a9aeb9653a312ffed057eccf65f7de0e193742 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Tue, 30 May 2017 23:39:53 +0100
Subject: [PATCH 051/177] Cosmetic only

---
 lib/tensors/Tensor_traits.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/tensors/Tensor_traits.h b/lib/tensors/Tensor_traits.h
index ab20b807..c1ef397a 100644
--- a/lib/tensors/Tensor_traits.h
+++ b/lib/tensors/Tensor_traits.h
@@ -281,8 +281,8 @@ namespace Grid {
   template<typename T>
   class getPrecision{
   public:
-    typedef typename getVectorType<T>::type vector_obj; //get the vector_obj (i.e. a grid Tensor) if its a Lattice<vobj>, do nothing otherwise (i.e. if fundamental or grid Tensor)
-  
+    //get the vector_obj (i.e. a grid Tensor) if its a Lattice<vobj>, do nothing otherwise (i.e. if fundamental or grid Tensor)
+    typedef typename getVectorType<T>::type vector_obj; 
     typedef typename GridTypeMapper<vector_obj>::scalar_type scalar_type; //get the associated scalar type. Works on fundamental and tensor types
     typedef typename GridTypeMapper<scalar_type>::Realified real_scalar_type; //remove any std::complex wrapper, should get us to the fundamental type
 

From ef1b7db374ede8eee0011b1db3fc6cd076d9bfb8 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Tue, 30 May 2017 23:40:11 +0100
Subject: [PATCH 052/177] Diff comparison check

---
 tests/IO/Test_nersc_io.cc | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/IO/Test_nersc_io.cc b/tests/IO/Test_nersc_io.cc
index cf919a7d..8507df13 100644
--- a/tests/IO/Test_nersc_io.cc
+++ b/tests/IO/Test_nersc_io.cc
@@ -88,7 +88,12 @@ int main (int argc, char ** argv)
   int precision32 = 0;
   int tworow      = 0;
   NerscIO::writeConfiguration(Umu,file,tworow,precision32);
+  Umu_saved = Umu;
   NerscIO::readConfiguration(Umu,header,file);
+  Umu_diff = Umu - Umu_saved;
+  //std::cout << "Umu_save "<<Umu_saved[0]<<std::endl;
+  //std::cout << "Umu_read "<<Umu[0]<<std::endl;
+  std::cout << "norm2 Gauge Diff = "<<norm2(Umu_diff)<<std::endl;
 
   for(int mu=0;mu<Nd;mu++){
     U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
@@ -139,7 +144,6 @@ int main (int argc, char ** argv)
   Complex p  = TensorRemove(Tp);
   std::cout<<GridLogMessage << "calculated plaquettes " <<p*PlaqScale<<std::endl;
 
-
   Complex LinkTraceScale(1.0/vol/4.0/3.0);
   TComplex Tl = sum(LinkTrace);
   Complex l  = TensorRemove(Tl);

From d38a4de36ce4ba5c71a9c17c25701f4518978787 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Tue, 30 May 2017 23:40:39 +0100
Subject: [PATCH 053/177] Beginning move to MPI IO

---
 lib/parallelIO/NerscIO.h | 43 ++++++++++++++++++++++++++++++----------
 1 file changed, 33 insertions(+), 10 deletions(-)

diff --git a/lib/parallelIO/NerscIO.h b/lib/parallelIO/NerscIO.h
index cf3e41e4..ab535dac 100644
--- a/lib/parallelIO/NerscIO.h
+++ b/lib/parallelIO/NerscIO.h
@@ -30,7 +30,10 @@
 #ifndef GRID_NERSC_IO_H
 #define GRID_NERSC_IO_H
 
-#define PARALLEL_READ
+#undef PARALLEL_READ
+#undef SERIAL_READ
+#define MPI_READ
+
 #define PARALLEL_WRITE
 
 #include <algorithm>
@@ -355,7 +358,12 @@ namespace Grid {
 #ifdef PARALLEL_READ
 	csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>, LorentzColour2x3F> 
 	  (Umu,file,Nersc3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format);
-#else
+#endif
+#ifdef MPI_READ
+	csum=BinaryIO::readObjectMPI<iLorentzColourMatrix<vsimd>, LorentzColour2x3F> 
+	  (Umu,file,Nersc3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format);
+#endif
+#ifdef SERIAL_READ
 	csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>, LorentzColour2x3F> 
 	  (Umu,file,Nersc3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format);
 #endif
@@ -364,7 +372,12 @@ namespace Grid {
 #ifdef PARALLEL_READ
 	csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>, LorentzColour2x3D> 
 	  (Umu,file,Nersc3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format);
-#else 
+#endif
+#ifdef MPI_READ
+	csum=BinaryIO::readObjectMPI<iLorentzColourMatrix<vsimd>, LorentzColour2x3D> 
+	  (Umu,file,Nersc3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format);
+#endif
+#ifdef SERIAL_READ
 	csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>, LorentzColour2x3D> 
 	  (Umu,file,Nersc3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format);
 #endif
@@ -374,7 +387,12 @@ namespace Grid {
 #ifdef PARALLEL_READ
 	  csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
 	    (Umu,file,NerscSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format);
-#else
+#endif
+#ifdef MPI_READ
+	  csum=BinaryIO::readObjectMPI<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
+	    (Umu,file,NerscSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format);
+#endif
+#ifdef SERIAL_READ
 	  csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
 	    (Umu,file,NerscSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format);
 #endif
@@ -383,7 +401,12 @@ namespace Grid {
 #ifdef PARALLEL_READ
 	  csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
 	    (Umu,file,NerscSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format);
-#else
+#endif
+#ifdef MPI_READ
+	  csum=BinaryIO::readObjectMPI<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
+	    (Umu,file,NerscSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format);
+#endif
+#ifdef SERIAL_READ
 	  csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
 	    (Umu,file,NerscSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format);
 #endif
@@ -411,13 +434,13 @@ namespace Grid {
 	std::cerr << " plaqs " << clone.plaquette << " " << header.plaquette << std::endl;
 	std::cerr << " trace " << clone.link_trace<< " " << header.link_trace<< std::endl;
 	std::cerr << " csum  " <<std::hex<< csum << " " << header.checksum<< std::dec<< std::endl;
-	exit(0);
+	//	exit(0);
       }
-      assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
-      assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
-      assert(csum == header.checksum );
+      //      assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
+      //      assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
+      //      assert(csum == header.checksum );
 
-      std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
+      //      std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
       }
 
       template<class vsimd>

From 1e429a0d57aa4c5efaa458a198cd7d7a49cb2f34 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Tue, 30 May 2017 23:41:07 +0100
Subject: [PATCH 054/177] Added MPI version

---
 lib/parallelIO/BinaryIO.h | 145 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 144 insertions(+), 1 deletion(-)

diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h
index c1fca348..cbc619ef 100644
--- a/lib/parallelIO/BinaryIO.h
+++ b/lib/parallelIO/BinaryIO.h
@@ -250,6 +250,149 @@ class BinaryIO {
     return csum;
   }
 
+  template<class vobj,class fobj,class munger>
+  static inline uint32_t readObjectMPI(Lattice<vobj> &Umu,std::string file,munger munge,int offset,const std::string &format)
+  {
+    typedef typename vobj::scalar_object sobj;
+
+    GridBase *grid = Umu._grid;
+
+    std::cout<< GridLogMessage<< "MPI read I/O "<< file<< std::endl;
+    GridStopWatch timer; timer.Start();
+
+    Umu = zero;
+    uint32_t csum=0;
+    uint64_t bytes=0;
+
+    int ndim                 = grid->Dimensions();
+    int nrank                = grid->ProcessorCount();
+    int myrank               = grid->ThisRank();
+
+    std::vector<int>  psizes = grid->ProcessorGrid(); 
+    std::vector<int>  pcoor  = grid->ThisProcessorCoor();
+    std::vector<int> gLattice= grid->GlobalDimensions();
+    std::vector<int> lLattice= grid->LocalDimensions();
+
+    std::vector<int> distribs(ndim,MPI_DISTRIBUTE_BLOCK);
+    std::vector<int> dargs   (ndim,MPI_DISTRIBUTE_DFLT_DARG);
+
+    std::vector<int> lStart(ndim);
+    std::vector<int> gStart(ndim);
+
+    // Flatten the file
+    int lsites = grid->lSites();
+    std::vector<sobj> scalardata(lsites); 
+    std::vector<fobj>     iodata(lsites); // Munge, checksum, byte order in here
+
+    for(int d=0;d<ndim;d++){
+      gStart[d] = lLattice[d]*pcoor[d];
+      lStart[d] = 0;
+    }
+
+    MPI_Datatype mpiObject;
+    MPI_Datatype fileArray;
+    MPI_Datatype localArray;
+    MPI_Datatype mpiword;
+    MPI_Offset disp = offset;
+    MPI_File fh ;
+    MPI_Status status;
+    int numword;
+
+    if ( sizeof( typename vobj::Realified::scalar_type ) == sizeof(float ) ) {
+      numword = sizeof(fobj)/sizeof(float);
+      mpiword = MPI_FLOAT;
+    } else {
+      numword = sizeof(fobj)/sizeof(double);
+      mpiword = MPI_DOUBLE;
+    }
+    
+    bytes = sizeof(fobj)*lsites*nrank;
+
+    //////////////////////////////////////////////////////////////////////////////
+    // Sobj in MPI phrasing
+    //////////////////////////////////////////////////////////////////////////////
+    int ierr;
+    ierr = MPI_Type_contiguous(numword,mpiword,&mpiObject);
+    assert(ierr==0);
+    ierr=MPI_Type_commit(&mpiObject);
+
+    //////////////////////////////////////////////////////////////////////////////
+    // File global array data type
+    //////////////////////////////////////////////////////////////////////////////
+    ierr=MPI_Type_create_subarray(ndim,&gLattice[0],&lLattice[0],&gStart[0],MPI_ORDER_FORTRAN, mpiObject,&fileArray);
+    assert(ierr==0);
+    ierr=MPI_Type_commit(&fileArray);
+    assert(ierr==0);
+
+    //////////////////////////////////////////////////////////////////////////////
+    // local lattice array
+    //////////////////////////////////////////////////////////////////////////////
+    ierr=MPI_Type_create_subarray(ndim,&lLattice[0],&lLattice[0],&lStart[0],MPI_ORDER_FORTRAN, mpiObject,&localArray);
+    assert(ierr==0);
+    ierr=MPI_Type_commit(&localArray);
+    assert(ierr==0);
+
+    //////////////////////////////////////////////////////////////////////////////
+    // Do the MPI I/O read
+    //////////////////////////////////////////////////////////////////////////////
+    //    std::cout << "MPI IO read from " <<file<<std::endl;
+    ierr=MPI_File_open(grid->communicator, file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);
+    assert(ierr==0);
+    ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
+    //    std::cout<< "MPI File set view returned " <<ierr<<std::endl;
+    /*
+    if ( ierr ) { 
+      char buf[MPI_MAX_ERROR_STRING];
+      int blen;
+      MPI_Error_string(ierr,buf,&blen);
+      std::cout << " Error string " <<buf<<std::endl;
+    }
+    */
+    assert(ierr==0);
+
+    ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status);
+    assert(ierr==0);
+
+    //////////////////////////////////////////////////////////////////////////////
+    // Finish up MPI I/O
+    //////////////////////////////////////////////////////////////////////////////
+    MPI_File_close(&fh);
+    MPI_Type_free(&fileArray);
+    MPI_Type_free(&localArray);
+
+    //////////////////////////////////////////////////////////////////////////////
+    // Byte order
+    //////////////////////////////////////////////////////////////////////////////
+    int ieee32big = (format == std::string("IEEE32BIG"));
+    int ieee32    = (format == std::string("IEEE32"));
+    int ieee64big = (format == std::string("IEEE64BIG"));
+    int ieee64    = (format == std::string("IEEE64"));
+
+    if (ieee32big) be32toh_v((void *)&iodata[0], sizeof(fobj)*lsites);
+    if (ieee32)    le32toh_v((void *)&iodata[0], sizeof(fobj)*lsites);
+    if (ieee64big) be64toh_v((void *)&iodata[0], sizeof(fobj)*lsites);
+    if (ieee64)    le64toh_v((void *)&iodata[0], sizeof(fobj)*lsites);
+
+    //////////////////////////////////////////////////////////////////////////////
+    // Munge [ .e.g 3rd row recon ]
+    //////////////////////////////////////////////////////////////////////////////
+    for(int x=0;x<lsites;x++) munge(iodata[x], scalardata[x], csum);
+
+    //////////////////////////////////////////////////////////////////////////////
+    // Safety check
+    //////////////////////////////////////////////////////////////////////////////
+    grid->GlobalSum(csum);
+    grid->Barrier();
+
+    vectorizeFromLexOrdArray(scalardata,Umu);    
+
+    timer.Stop();
+    std::cout<<GridLogMessage<<"readObjectMPI: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
+	     << (double)bytes/ (double)timer.useconds() <<" MB/s "  <<std::endl;
+
+    return csum;
+  }
+
   template<class vobj,class fobj,class munger> 
   static inline uint32_t writeObjectSerial(Lattice<vobj> &Umu,std::string file,munger munge,int offset,
 					   const std::string & format)
@@ -597,7 +740,7 @@ class BinaryIO {
 
 	for(int c=0;c<chunk;c++) munge(fileObj[c],siteObj[c],csum);
 
-      } 
+      }
      
       // Possibly do transport through pt2pt 
       for(int cc=0;cc<chunk;cc+=lstrip){

From 6f687a67cd2897d4cfa8a54ace553fe1b49db29f Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 1 Jun 2017 17:36:18 -0400
Subject: [PATCH 055/177] As local vols increase, use 64 bits for safety

---
 lib/lattice/Lattice_transfer.h | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/lib/lattice/Lattice_transfer.h b/lib/lattice/Lattice_transfer.h
index c8ba0928..cbf31f86 100644
--- a/lib/lattice/Lattice_transfer.h
+++ b/lib/lattice/Lattice_transfer.h
@@ -595,8 +595,9 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
 }
 //Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order
 template<typename vobj, typename sobj>
-typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>::value, void>::type 
-vectorizeFromLexOrdArray(std::vector<sobj> &in, const Lattice<vobj> &out)
+typename std::enable_if<isSIMDvectorized<vobj>::value 
+                    && !isSIMDvectorized<sobj>::value, void>::type 
+vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
 {
 
   typedef typename vobj::vector_type vtype;
@@ -614,7 +615,7 @@ vectorizeFromLexOrdArray(std::vector<sobj> &in, const Lattice<vobj> &out)
     grid->iCoorFromIindex(icoor[lane],lane);
   }
   
-  parallel_for(int oidx = 0; oidx < grid->oSites(); oidx++){ //loop over outer index
+  parallel_for(uint64_t oidx = 0; oidx < grid->oSites(); oidx++){ //loop over outer index
     //Assemble vector of pointers to output elements
     std::vector<sobj*> ptrs(nsimd);
 
@@ -624,8 +625,10 @@ vectorizeFromLexOrdArray(std::vector<sobj> &in, const Lattice<vobj> &out)
     std::vector<int> lcoor(grid->Nd());
       
     for(int lane=0; lane < nsimd; lane++){
-      for(int mu=0;mu<ndim;mu++)
+
+      for(int mu=0;mu<ndim;mu++){
 	lcoor[mu] = ocoor[mu] + grid->_rdimensions[mu]*icoor[lane][mu];
+      }
 
       int lex;
       Lexicographic::IndexFromCoor(lcoor, lex, grid->_ldimensions);
@@ -663,7 +666,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
   std::vector<SobjOut> in_slex_conv(in_grid->lSites());
   unvectorizeToLexOrdArray(in_slex_conv, in);
     
-  parallel_for(int out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){
+  parallel_for(uint64_t out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){
     std::vector<int> out_ocoor(ndim);
     out_grid->oCoorFromOindex(out_ocoor, out_oidx);
 

From 21421656abb44bc872ca85c3364eed638fff8a5f Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 1 Jun 2017 17:36:53 -0400
Subject: [PATCH 056/177] Big changes improving the code to use MPI IO

---
 lib/parallelIO/BinaryIO.h | 1065 +++++++++++--------------------------
 1 file changed, 297 insertions(+), 768 deletions(-)

diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h
index cbc619ef..13341927 100644
--- a/lib/parallelIO/BinaryIO.h
+++ b/lib/parallelIO/BinaryIO.h
@@ -38,7 +38,12 @@
 #include <arpa/inet.h>
 #include <algorithm>
 
+namespace Grid { 
 
+
+/////////////////////////////////////////////////////////////////////////////////
+// Byte reversal garbage
+/////////////////////////////////////////////////////////////////////////////////
 inline uint32_t byte_reverse32(uint32_t f) { 
       f = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ; 
       return f;
@@ -60,63 +65,155 @@ inline uint64_t Grid_ntohll(uint64_t A) {
 }
 #endif
 
-namespace Grid { 
-
-  // A little helper
-  inline void removeWhitespace(std::string &key)
-  {
-    key.erase(std::remove_if(key.begin(), key.end(), ::isspace),key.end());
+/////////////////////////////////////////////////////////////////////////////////
+// Simple classes for precision conversion
+/////////////////////////////////////////////////////////////////////////////////
+template <class fobj, class sobj>
+struct BinarySimpleUnmunger {
+  typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
+  typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;
+  
+  void operator()(sobj &in, fobj &out) {
+    // take word by word and transform accoding to the status
+    fobj_stype *out_buffer = (fobj_stype *)&out;
+    sobj_stype *in_buffer = (sobj_stype *)&in;
+    size_t fobj_words = sizeof(out) / sizeof(fobj_stype);
+    size_t sobj_words = sizeof(in) / sizeof(sobj_stype);
+    assert(fobj_words == sobj_words);
+    
+    for (unsigned int word = 0; word < sobj_words; word++)
+      out_buffer[word] = in_buffer[word];  // type conversion on the fly
+    
   }
+};
 
+template <class fobj, class sobj>
+struct BinarySimpleMunger {
+  typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
+  typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;
+
+  void operator()(fobj &in, sobj &out) {
+    // take word by word and transform accoding to the status
+    fobj_stype *in_buffer = (fobj_stype *)&in;
+    sobj_stype *out_buffer = (sobj_stype *)&out;
+    size_t fobj_words = sizeof(in) / sizeof(fobj_stype);
+    size_t sobj_words = sizeof(out) / sizeof(sobj_stype);
+    assert(fobj_words == sobj_words);
+    
+    for (unsigned int word = 0; word < sobj_words; word++)
+      out_buffer[word] = in_buffer[word];  // type conversion on the fly
+    
+  }
+};
+// A little helper
+inline void removeWhitespace(std::string &key)
+{
+  key.erase(std::remove_if(key.begin(), key.end(), ::isspace),key.end());
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Static class holding the parallel IO code
+// Could just use a namespace
+///////////////////////////////////////////////////////////////////////////////////////////////////
 class BinaryIO {
-
  public:
 
+  /////////////////////////////////////////////////////////////////////////////
+  // more byte manipulation helpers
+  /////////////////////////////////////////////////////////////////////////////
+  static inline void Uint32Checksum(uint32_t *buf,uint64_t buf_size_bytes,uint32_t &csum)
+  {
+#pragma omp parallel
+    { 
+      uint32_t csum_thr=0;
+      uint64_t count = buf_size_bytes/sizeof(uint32_t);
+#pragma omp for
+      for(uint64_t i=0;i<count;i++){
+	csum_thr=csum_thr+buf[i];
+      }
+#pragma omp critical
+      csum = csum + csum_thr;
 
+    }
+  }
   // Network is big endian
-  static inline void htobe32_v(void *file_object,uint32_t bytes){ be32toh_v(file_object,bytes);} 
-  static inline void htobe64_v(void *file_object,uint32_t bytes){ be64toh_v(file_object,bytes);} 
-  static inline void htole32_v(void *file_object,uint32_t bytes){ le32toh_v(file_object,bytes);} 
-  static inline void htole64_v(void *file_object,uint32_t bytes){ le64toh_v(file_object,bytes);} 
+  static inline void htobe32_v(void *file_object,uint64_t bytes,uint32_t &csum){ 
+    Uint32Checksum((uint32_t *)file_object,bytes,csum); 
+    htobe32_v(file_object,bytes); 
+  }
+  static inline void htobe64_v(void *file_object,uint64_t bytes,uint32_t &csum){
+    Uint32Checksum((uint32_t *)file_object,bytes,csum); 
+    htobe64_v(file_object,bytes);
+  }
+  static inline void htole32_v(void *file_object,uint64_t bytes,uint32_t &csum){ 
+    Uint32Checksum((uint32_t *)file_object,bytes,csum);
+    htole32_v(file_object,bytes);
+  }
+  static inline void htole64_v(void *file_object,uint64_t bytes,uint32_t &csum){ 
+    Uint32Checksum((uint32_t *)file_object,bytes,csum);
+    htole64_v(file_object,bytes);
+  }
+  static inline void be32toh_v(void *file_object,uint64_t bytes,uint32_t &csum){ 
+    be32toh_v(file_object,bytes); 
+    Uint32Checksum((uint32_t *)file_object,bytes,csum); 
+  }
+  static inline void be64toh_v(void *file_object,uint64_t bytes,uint32_t &csum){
+    be64toh_v(file_object,bytes);
+    Uint32Checksum((uint32_t *)file_object,bytes,csum); 
+  }
+  static inline void le32toh_v(void *file_object,uint64_t bytes,uint32_t &csum){ 
+    le32toh_v(file_object,bytes);
+    Uint32Checksum((uint32_t *)file_object,bytes,csum);
+  }
+  static inline void le64toh_v(void *file_object,uint64_t bytes,uint32_t &csum){ 
+    le64toh_v(file_object,bytes);
+    Uint32Checksum((uint32_t *)file_object,bytes,csum);
+  }
+  static inline void htobe32_v(void *file_object,uint64_t bytes){ be32toh_v(file_object,bytes);} 
+  static inline void htobe64_v(void *file_object,uint64_t bytes){ be64toh_v(file_object,bytes);} 
+  static inline void htole32_v(void *file_object,uint64_t bytes){ le32toh_v(file_object,bytes);} 
+  static inline void htole64_v(void *file_object,uint64_t bytes){ le64toh_v(file_object,bytes);} 
 
-  static inline void be32toh_v(void *file_object,uint32_t bytes)
+  static inline void be32toh_v(void *file_object,uint64_t bytes)
   {
     uint32_t * f = (uint32_t *)file_object;
-    for(int i=0;i*sizeof(uint32_t)<bytes;i++){  
+    uint64_t count = bytes/sizeof(uint32_t);
+    parallel_for(uint64_t i=0;i<count;i++){  
       f[i] = ntohl(f[i]);
     }
   }
-
   // LE must Swap and switch to host
-  static inline void le32toh_v(void *file_object,uint32_t bytes)
+  static inline void le32toh_v(void *file_object,uint64_t bytes)
   {
     uint32_t *fp = (uint32_t *)file_object;
     uint32_t f;
 
-    for(int i=0;i*sizeof(uint32_t)<bytes;i++){  
+    uint64_t count = bytes/sizeof(uint32_t);
+    parallel_for(uint64_t i=0;i<count;i++){  
       f = fp[i];
       // got network order and the network to host
       f = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ; 
       fp[i] = ntohl(f);
     }
   }
-
   // BE is same as network
-  static inline void be64toh_v(void *file_object,uint32_t bytes)
+  static inline void be64toh_v(void *file_object,uint64_t bytes)
   {
     uint64_t * f = (uint64_t *)file_object;
-    for(int i=0;i*sizeof(uint64_t)<bytes;i++){  
+    uint64_t count = bytes/sizeof(uint64_t);
+    parallel_for(uint64_t i=0;i<count;i++){  
       f[i] = Grid_ntohll(f[i]);
     }
   }
   
   // LE must swap and switch;
-  static inline void le64toh_v(void *file_object,uint32_t bytes)
+  static inline void le64toh_v(void *file_object,uint64_t bytes)
   {
     uint64_t *fp = (uint64_t *)file_object;
     uint64_t f,g;
     
-    for(int i=0;i*sizeof(uint64_t)<bytes;i++){  
+    uint64_t count = bytes/sizeof(uint64_t);
+    parallel_for(uint64_t i=0;i<count;i++){  
       f = fp[i];
       // got network order and the network to host
       g = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ; 
@@ -126,143 +223,23 @@ class BinaryIO {
       fp[i] = Grid_ntohll(g);
     }
   }
-
-  template<class vobj,class fobj,class munger> static inline void Uint32Checksum(Lattice<vobj> &lat,munger munge,uint32_t &csum)
+  /////////////////////////////////////////////////////////////////////////////
+  // Real action:
+  // Read or Write distributed lexico array of ANY object to a specific location in file 
+  //////////////////////////////////////////////////////////////////////////////////////
+  template<class word,class fobj>
+    static inline uint32_t IOobject(word w,
+				    GridBase *grid,
+				    std::vector<fobj> &iodata,
+				    std::string file,
+				    int offset,
+				    const std::string &format, int doread)
   {
-    typedef typename vobj::scalar_object sobj;
-    GridBase *grid = lat._grid ;
-    std::cout <<GridLogMessage<< "Uint32Checksum "<<norm2(lat)<<std::endl;
-    sobj siteObj;
-    fobj fileObj;
+    grid->Barrier();
+    GridStopWatch timer; 
+    GridStopWatch bstimer;
 
-    csum = 0;
-    std::vector<int> lcoor;
-    for(int l=0;l<grid->lSites();l++){
-      Lexicographic::CoorFromIndex(lcoor,l,grid->_ldimensions);
-      peekLocalSite(siteObj,lat,lcoor);
-      munge(siteObj,fileObj,csum);
-    }
-    grid->GlobalSum(csum);
-  }
-    
-  static inline void Uint32Checksum(uint32_t *buf,uint32_t buf_size_bytes,uint32_t &csum)
-  {
-    for(int i=0;i*sizeof(uint32_t)<buf_size_bytes;i++){
-      csum=csum+buf[i];
-    }
-  }
-
-  // Simple classes for precision conversion
-  template <class fobj, class sobj>
-  struct BinarySimpleUnmunger {
-    typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
-    typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;
-
-    void operator()(sobj &in, fobj &out, uint32_t &csum) {
-      // take word by word and transform accoding to the status
-      fobj_stype *out_buffer = (fobj_stype *)&out;
-      sobj_stype *in_buffer = (sobj_stype *)&in;
-      size_t fobj_words = sizeof(out) / sizeof(fobj_stype);
-      size_t sobj_words = sizeof(in) / sizeof(sobj_stype);
-      assert(fobj_words == sobj_words);
-
-      for (unsigned int word = 0; word < sobj_words; word++)
-        out_buffer[word] = in_buffer[word];  // type conversion on the fly
-
-      BinaryIO::Uint32Checksum((uint32_t *)&out, sizeof(out), csum);
-    }
-  };
-
-  template <class fobj, class sobj>
-  struct BinarySimpleMunger {
-    typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
-    typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;
-
-    void operator()(fobj &in, sobj &out, uint32_t &csum) {
-      // take word by word and transform accoding to the status
-      fobj_stype *in_buffer = (fobj_stype *)&in;
-      sobj_stype *out_buffer = (sobj_stype *)&out;
-      size_t fobj_words = sizeof(in) / sizeof(fobj_stype);
-      size_t sobj_words = sizeof(out) / sizeof(sobj_stype);
-      assert(fobj_words == sobj_words);
-
-      for (unsigned int word = 0; word < sobj_words; word++)
-        out_buffer[word] = in_buffer[word];  // type conversion on the fly
-
-      BinaryIO::Uint32Checksum((uint32_t *)&in, sizeof(in), csum);
-    }
-  };
-
-  template<class vobj,class fobj,class munger>
-  static inline uint32_t readObjectSerial(Lattice<vobj> &Umu,std::string file,munger munge,int offset,const std::string &format)
-  {
-    typedef typename vobj::scalar_object sobj;
-
-    GridBase *grid = Umu._grid;
-
-    std::cout<< GridLogMessage<< "Serial read I/O "<< file<< std::endl;
-    GridStopWatch timer; timer.Start();
-
-    int ieee32big = (format == std::string("IEEE32BIG"));
-    int ieee32    = (format == std::string("IEEE32"));
-    int ieee64big = (format == std::string("IEEE64BIG"));
-    int ieee64    = (format == std::string("IEEE64"));
-
-    // Find the location of each site and send to primary node
-    // Take loop order from Chroma; defines loop order now that NERSC doc no longer
-    // available (how short sighted is that?)
-    std::ifstream fin(file,std::ios::binary|std::ios::in);
-    fin.seekg(offset);
-
-    Umu = zero;
     uint32_t csum=0;
-    uint64_t bytes=0;
-
-    int lx = grid->_fdimensions[0];
-    std::vector<fobj> file_object(lx);
-    std::vector<sobj> munged(lx);
-    for(int t=0;t<grid->_fdimensions[3];t++){
-    for(int z=0;z<grid->_fdimensions[2];z++){
-    for(int y=0;y<grid->_fdimensions[1];y++){
-    {
-      bytes += sizeof(fobj)*lx;
-      if (grid->IsBoss()) {
-        fin.read((char *)&file_object[0], sizeof(fobj)*lx); assert( fin.fail()==0);
-	if (ieee32big) be32toh_v((void *)&file_object[0], sizeof(fobj)*lx);
-	if (ieee32)    le32toh_v((void *)&file_object[0], sizeof(fobj)*lx);
-	if (ieee64big) be64toh_v((void *)&file_object[0], sizeof(fobj)*lx);
-	if (ieee64)    le64toh_v((void *)&file_object[0], sizeof(fobj)*lx);
-	for(int x=0;x<lx;x++){
-	  munge(file_object[x], munged[x], csum);
-	}
-      }
-      for(int x=0;x<lx;x++){
-	std::vector<int> site({x,y,z,t});
-	// The boss who read the file has their value poked
-	pokeSite(munged[x],Umu,site);
-      }
-    }}}}
-    timer.Stop();
-    std::cout<<GridLogPerformance<<"readObjectSerial: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/ (double)timer.useconds() <<" MB/s "  <<std::endl;
-
-    grid->Broadcast(0,(void *)&csum,sizeof(csum));
-    return csum;
-  }
-
-  template<class vobj,class fobj,class munger>
-  static inline uint32_t readObjectMPI(Lattice<vobj> &Umu,std::string file,munger munge,int offset,const std::string &format)
-  {
-    typedef typename vobj::scalar_object sobj;
-
-    GridBase *grid = Umu._grid;
-
-    std::cout<< GridLogMessage<< "MPI read I/O "<< file<< std::endl;
-    GridStopWatch timer; timer.Start();
-
-    Umu = zero;
-    uint32_t csum=0;
-    uint64_t bytes=0;
 
     int ndim                 = grid->Dimensions();
     int nrank                = grid->ProcessorCount();
@@ -280,9 +257,8 @@ class BinaryIO {
     std::vector<int> gStart(ndim);
 
     // Flatten the file
-    int lsites = grid->lSites();
-    std::vector<sobj> scalardata(lsites); 
-    std::vector<fobj>     iodata(lsites); // Munge, checksum, byte order in here
+    uint64_t lsites = grid->lSites();
+    iodata.resize(lsites);
 
     for(int d=0;d<ndim;d++){
       gStart[d] = lLattice[d]*pcoor[d];
@@ -298,7 +274,7 @@ class BinaryIO {
     MPI_Status status;
     int numword;
 
-    if ( sizeof( typename vobj::Realified::scalar_type ) == sizeof(float ) ) {
+    if ( sizeof( word ) == sizeof(float ) ) {
       numword = sizeof(fobj)/sizeof(float);
       mpiword = MPI_FLOAT;
     } else {
@@ -306,59 +282,25 @@ class BinaryIO {
       mpiword = MPI_DOUBLE;
     }
     
-    bytes = sizeof(fobj)*lsites*nrank;
 
     //////////////////////////////////////////////////////////////////////////////
     // Sobj in MPI phrasing
     //////////////////////////////////////////////////////////////////////////////
     int ierr;
-    ierr = MPI_Type_contiguous(numword,mpiword,&mpiObject);
-    assert(ierr==0);
-    ierr=MPI_Type_commit(&mpiObject);
+    ierr = MPI_Type_contiguous(numword,mpiword,&mpiObject);    assert(ierr==0);
+    ierr = MPI_Type_commit(&mpiObject);
 
     //////////////////////////////////////////////////////////////////////////////
     // File global array data type
     //////////////////////////////////////////////////////////////////////////////
-    ierr=MPI_Type_create_subarray(ndim,&gLattice[0],&lLattice[0],&gStart[0],MPI_ORDER_FORTRAN, mpiObject,&fileArray);
-    assert(ierr==0);
-    ierr=MPI_Type_commit(&fileArray);
-    assert(ierr==0);
+    ierr=MPI_Type_create_subarray(ndim,&gLattice[0],&lLattice[0],&gStart[0],MPI_ORDER_FORTRAN, mpiObject,&fileArray);    assert(ierr==0);
+    ierr=MPI_Type_commit(&fileArray);    assert(ierr==0);
 
     //////////////////////////////////////////////////////////////////////////////
     // local lattice array
     //////////////////////////////////////////////////////////////////////////////
-    ierr=MPI_Type_create_subarray(ndim,&lLattice[0],&lLattice[0],&lStart[0],MPI_ORDER_FORTRAN, mpiObject,&localArray);
-    assert(ierr==0);
-    ierr=MPI_Type_commit(&localArray);
-    assert(ierr==0);
-
-    //////////////////////////////////////////////////////////////////////////////
-    // Do the MPI I/O read
-    //////////////////////////////////////////////////////////////////////////////
-    //    std::cout << "MPI IO read from " <<file<<std::endl;
-    ierr=MPI_File_open(grid->communicator, file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);
-    assert(ierr==0);
-    ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
-    //    std::cout<< "MPI File set view returned " <<ierr<<std::endl;
-    /*
-    if ( ierr ) { 
-      char buf[MPI_MAX_ERROR_STRING];
-      int blen;
-      MPI_Error_string(ierr,buf,&blen);
-      std::cout << " Error string " <<buf<<std::endl;
-    }
-    */
-    assert(ierr==0);
-
-    ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status);
-    assert(ierr==0);
-
-    //////////////////////////////////////////////////////////////////////////////
-    // Finish up MPI I/O
-    //////////////////////////////////////////////////////////////////////////////
-    MPI_File_close(&fh);
-    MPI_Type_free(&fileArray);
-    MPI_Type_free(&localArray);
+    ierr=MPI_Type_create_subarray(ndim,&lLattice[0],&lLattice[0],&lStart[0],MPI_ORDER_FORTRAN, mpiObject,&localArray);    assert(ierr==0);
+    ierr=MPI_Type_commit(&localArray);    assert(ierr==0);
 
     //////////////////////////////////////////////////////////////////////////////
     // Byte order
@@ -368,623 +310,210 @@ class BinaryIO {
     int ieee64big = (format == std::string("IEEE64BIG"));
     int ieee64    = (format == std::string("IEEE64"));
 
-    if (ieee32big) be32toh_v((void *)&iodata[0], sizeof(fobj)*lsites);
-    if (ieee32)    le32toh_v((void *)&iodata[0], sizeof(fobj)*lsites);
-    if (ieee64big) be64toh_v((void *)&iodata[0], sizeof(fobj)*lsites);
-    if (ieee64)    le64toh_v((void *)&iodata[0], sizeof(fobj)*lsites);
+    //////////////////////////////////////////////////////////////////////////////
+    // Do the MPI I/O read
+    //////////////////////////////////////////////////////////////////////////////
+    if ( doread ) { 
+      std::cout<< GridLogMessage<< "MPI read I/O "<< file<< std::endl;
+      timer.Start();
+      ierr=MPI_File_open(grid->communicator, file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);    assert(ierr==0);
+      ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);    assert(ierr==0);
+      ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status);    assert(ierr==0);
+      timer.Stop();
 
+      grid->Barrier();
+
+      bstimer.Start();
+      if (ieee32big) be32toh_v((void *)&iodata[0], sizeof(fobj)*lsites,csum);
+      if (ieee32)    le32toh_v((void *)&iodata[0], sizeof(fobj)*lsites,csum);
+      if (ieee64big) be64toh_v((void *)&iodata[0], sizeof(fobj)*lsites,csum);
+      if (ieee64)    le64toh_v((void *)&iodata[0], sizeof(fobj)*lsites,csum);
+      bstimer.Stop();
+
+    } else { 
+      std::cout<< GridLogMessage<< "MPI write I/O "<< file<< std::endl;
+      bstimer.Start();
+      if (ieee32big) htobe32_v((void *)&iodata[0], sizeof(fobj)*lsites,csum);
+      if (ieee32)    htole32_v((void *)&iodata[0], sizeof(fobj)*lsites,csum);
+      if (ieee64big) htobe64_v((void *)&iodata[0], sizeof(fobj)*lsites,csum);
+      if (ieee64)    htole64_v((void *)&iodata[0], sizeof(fobj)*lsites,csum);
+      bstimer.Stop();
+
+      grid->Barrier();
+
+      timer.Start();
+      ierr=MPI_File_open(grid->communicator, file.c_str(), MPI_MODE_RDWR|MPI_MODE_CREATE,MPI_INFO_NULL, &fh); assert(ierr==0);
+      ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);                        assert(ierr==0);
+      ierr=MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);                                        assert(ierr==0);
+      timer.Stop();
+    
+    }
+   
     //////////////////////////////////////////////////////////////////////////////
-    // Munge [ .e.g 3rd row recon ]
+    // Finish up MPI I/O
     //////////////////////////////////////////////////////////////////////////////
-    for(int x=0;x<lsites;x++) munge(iodata[x], scalardata[x], csum);
+    MPI_File_close(&fh);
+    MPI_Type_free(&fileArray);
+    MPI_Type_free(&localArray);
+
+    std::cout<<GridLogMessage<<"IOobject: ";
+    if ( doread) std::cout << " read  ";
+    else         std::cout << " write ";
+    uint64_t bytes = sizeof(fobj)*lsites*nrank;
+    std::cout<< bytes <<" bytes in "<<timer.Elapsed() <<" "
+	     << (double)bytes/ (double)timer.useconds() <<" MB/s "<<std::endl;
+
+    std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed()  <<std::endl;
 
     //////////////////////////////////////////////////////////////////////////////
     // Safety check
     //////////////////////////////////////////////////////////////////////////////
+    grid->Barrier();
     grid->GlobalSum(csum);
     grid->Barrier();
 
-    vectorizeFromLexOrdArray(scalardata,Umu);    
-
-    timer.Stop();
-    std::cout<<GridLogMessage<<"readObjectMPI: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/ (double)timer.useconds() <<" MB/s "  <<std::endl;
-
     return csum;
   }
 
-  template<class vobj,class fobj,class munger> 
-  static inline uint32_t writeObjectSerial(Lattice<vobj> &Umu,std::string file,munger munge,int offset,
-					   const std::string & format)
+  /////////////////////////////////////////////////////////////////////////////
+  // Read a Lattice of object
+  //////////////////////////////////////////////////////////////////////////////////////
+  template<class vobj,class fobj,class munger>
+  static inline uint32_t readLatticeObject(Lattice<vobj> &Umu,std::string file,munger munge,int offset,const std::string &format)
   {
-        typedef typename vobj::scalar_object sobj;
+    typedef typename vobj::scalar_object sobj;
+    typedef typename vobj::Realified::scalar_type word;    word w=0;
 
     GridBase *grid = Umu._grid;
+    int lsites = grid->lSites();
 
-    int ieee32big = (format == std::string("IEEE32BIG"));
-    int ieee32    = (format == std::string("IEEE32"));
-    int ieee64big = (format == std::string("IEEE64BIG"));
-    int ieee64    = (format == std::string("IEEE64"));
-
-    //////////////////////////////////////////////////
-    // Serialise through node zero
-    //////////////////////////////////////////////////
-    std::cout<< GridLogMessage<< "Serial write I/O "<< file<<std::endl;
-    GridStopWatch timer; timer.Start();
+    std::vector<sobj> scalardata(lsites); 
+    std::vector<fobj>     iodata(lsites); // Munge, checksum, byte order in here
     
-    std::ofstream fout;
-    if ( grid->IsBoss() ) {
-      fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
-      fout.seekp(offset);
-    }
-    uint64_t bytes=0;
-    uint32_t csum=0;
-    int lx = grid->_fdimensions[0];
-    std::vector<fobj> file_object(lx);
-    std::vector<sobj> unmunged(lx);
-    for(int t=0;t<grid->_fdimensions[3];t++){
-    for(int z=0;z<grid->_fdimensions[2];z++){
-    for(int y=0;y<grid->_fdimensions[1];y++){
-    {
+    int doread=1;
+    uint32_t csum= IOobject(w,grid,iodata,file,offset,format,doread);
 
-      std::vector<int> site({0,y,z,t});
-      // peek & write
-      for(int x=0;x<lx;x++){
-	site[0]=x;
-	peekSite(unmunged[x],Umu,site);
-      }
-      
-      if ( grid->IsBoss() ) {
-	for(int x=0;x<lx;x++){
-	  munge(unmunged[x],file_object[x],csum);
-	}
-	if(ieee32big) htobe32_v((void *)&file_object[0],sizeof(fobj)*lx);
-	if(ieee32)    htole32_v((void *)&file_object[0],sizeof(fobj)*lx);
-	if(ieee64big) htobe64_v((void *)&file_object[0],sizeof(fobj)*lx);
-	if(ieee64)    htole64_v((void *)&file_object[0],sizeof(fobj)*lx);
+    GridStopWatch timer; 
+    timer.Start();
 
-	fout.write((char *)&file_object[0],sizeof(fobj)*lx);assert( fout.fail()==0);
-	bytes+=sizeof(fobj)*lx;
-      }
-    }}}}
+    parallel_for(int x=0;x<lsites;x++) munge(iodata[x], scalardata[x]);
+
+    vectorizeFromLexOrdArray(scalardata,Umu);    
+    grid->Barrier();
 
     timer.Stop();
-    std::cout<<GridLogPerformance<<"writeObjectSerial: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
-    
-    grid->Broadcast(0,(void *)&csum,sizeof(csum));
+    std::cout<<GridLogMessage<<"readLatticeObject: vectorize overhead "<<timer.Elapsed()  <<std::endl;
+
+    return csum;
+  }
+
+  /////////////////////////////////////////////////////////////////////////////
+  // Write a Lattice of object
+  //////////////////////////////////////////////////////////////////////////////////////
+  template<class vobj,class fobj,class munger>
+  static inline uint32_t writeLatticeObject(Lattice<vobj> &Umu,std::string file,munger munge,int offset,const std::string &format)
+  {
+    typedef typename vobj::scalar_object sobj;
+    typedef typename vobj::Realified::scalar_type word;    word w=0;
+    GridBase *grid = Umu._grid;
+    int lsites = grid->lSites();
+
+    std::vector<sobj> scalardata(lsites); 
+    std::vector<fobj>     iodata(lsites); // Munge, checksum, byte order in here
+
+    //////////////////////////////////////////////////////////////////////////////
+    // Munge [ e.g. 3rd row recon ]
+    //////////////////////////////////////////////////////////////////////////////
+    GridStopWatch timer; timer.Start();
+    unvectorizeToLexOrdArray(scalardata,Umu);    
+
+    parallel_for(int x=0;x<lsites;x++) munge(scalardata[x],iodata[x]);
+
+    grid->Barrier();
+    timer.Stop();
+
+    int dowrite=0;
+    uint32_t csum= IOobject(w,grid,iodata,file,offset,format,dowrite);
+
+    std::cout<<GridLogMessage<<"writeLatticeObject: unvectorize overhead "<<timer.Elapsed()  <<std::endl;
+
     return csum;
   }
   
-  static inline uint32_t writeRNGSerial(GridSerialRNG &serial,GridParallelRNG &parallel,std::string file,int offset)
+  /////////////////////////////////////////////////////////////////////////////
+  // Read an RNG; use IOobject and a lexicographic map to an array of RNG states
+  //////////////////////////////////////////////////////////////////////////////////////
+  static inline uint32_t readRNG(GridSerialRNG &serial,GridParallelRNG &parallel,std::string file,int offset)
   {
     typedef typename GridSerialRNG::RngStateType RngStateType;
     const int RngStateCount = GridSerialRNG::RngStateCount;
+    typedef std::array<RngStateType,RngStateCount> RNGstate;
+    typedef RngStateType word;    word w=0;
 
-    GridBase *grid = parallel._grid;
-    int gsites = grid->_gsites;
-
-    GridStopWatch timer; timer.Start();
-    //////////////////////////////////////////////////
-    // Serialise through node zero
-    //////////////////////////////////////////////////
-    std::ofstream fout;
-    if (grid->IsBoss()) {
-      fout.open(file, std::ios::binary | std::ios::out);
-      if (!fout.is_open()) {
-        std::cout << GridLogMessage << "writeRNGSerial: Error opening file " << file << std::endl;
-        exit(0);// write better error handling
-      }
-      fout.seekp(offset);
-    }
-
-    std::cout << GridLogMessage << "Serial RNG write I/O on file " << file << std::endl;
     uint32_t csum = 0;
-    std::vector<RngStateType> saved(RngStateCount);
-    int bytes = sizeof(RngStateType) * saved.size();
-    std::cout << GridLogDebug << "RngStateCount: " << RngStateCount << std::endl;
-    std::cout << GridLogDebug << "Type has " << bytes << " bytes" << std::endl;
-    std::vector<int> gcoor;
-
-    for(int gidx=0;gidx<gsites;gidx++){
-
-      int rank,o_idx,i_idx;
-      grid->GlobalIndexToGlobalCoor(gidx,gcoor);
-      grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
-      int l_idx=parallel.generator_idx(o_idx,i_idx);
-
-      if( rank == grid->ThisRank() ){
-	parallel.GetState(saved,l_idx);
-      }
-
-      if ( rank != 0 ) {
-	grid->Broadcast(rank, (void *)&saved[0], bytes);
-      }
-
-      grid->Barrier();
-
-      if ( grid->IsBoss() ) {
-	Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
-	fout.write((char *)&saved[0],bytes);assert( fout.fail()==0);
-      }
-      
-    }
-
-    if ( grid->IsBoss() ) {
-      serial.GetState(saved,0);
-      Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
-      fout.write((char *)&saved[0],bytes);assert( fout.fail()==0);
-    }
-
-    grid->Broadcast(0, (void *)&csum, sizeof(csum));
-
-    if (grid->IsBoss()) {
-      fout.close();
-    }
-
-    timer.Stop();
-
-    std::cout << GridLogMessage << "RNG file checksum " << std::hex << csum << std::dec << std::endl;
-    std::cout << GridLogMessage << "RNG state saved in " << timer.Elapsed() << std::endl;
-    return csum;
-  }
-
-
-  static inline uint32_t readRNGSerial(GridSerialRNG &serial,GridParallelRNG &parallel,std::string file,int offset)
-  {
-    typedef typename GridSerialRNG::RngStateType RngStateType;
-    const int RngStateCount = GridSerialRNG::RngStateCount;
+    std::string format = "IEEE32BIG";
 
     GridBase *grid = parallel._grid;
-    int gsites = grid->_gsites;
+    int gsites = grid->gSites();
+    int lsites = grid->lSites();
 
-    //////////////////////////////////////////////////
-    // Serialise through node zero
-    //////////////////////////////////////////////////
-    std::cout<< GridLogMessage<< "Serial RNG read I/O of file "<<file<<std::endl;
-
-    std::ifstream fin;
-    if (grid->IsBoss()) {
-      fin.open(file, std::ios::binary | std::ios::in);
-      if (!fin.is_open()) {
-        std::cout << GridLogMessage << "readRNGSerial: Error opening file " << file << std::endl;
-        exit(0);// write better error handling
-      }
-      fin.seekg(offset);
-    }
-
-    
-    uint32_t csum=0;
-    std::vector<RngStateType> saved(RngStateCount);
-    int bytes = sizeof(RngStateType)*saved.size();
-    std::cout << GridLogDebug << "RngStateCount: " << RngStateCount << std::endl;
-    std::cout << GridLogDebug << "Type has " << bytes << " bytes" << std::endl;
-    std::vector<int> gcoor;
-
-    std::cout << GridLogDebug << "gsites: " << gsites << " loop" << std::endl;
-    for(int gidx=0;gidx<gsites;gidx++){
-
-      int rank,o_idx,i_idx;
-      grid->GlobalIndexToGlobalCoor(gidx,gcoor);
-      grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
-      int l_idx=parallel.generator_idx(o_idx,i_idx);
-
-      if ( grid->IsBoss() ) {
-	fin.read((char *)&saved[0],bytes);assert( fin.fail()==0);
-	Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
-      }
-      
-      grid->Broadcast(0,(void *)&saved[0],bytes);
-      grid->Barrier();
-
-      if( rank == grid->ThisRank() ){
-        parallel.SetState(saved,l_idx);
-      }
-    }
-
-    if ( grid->IsBoss() ) {
-      fin.read((char *)&saved[0],bytes);assert( fin.fail()==0);
-      Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
-      serial.SetState(saved,0);
-    }
-
-    std::cout << GridLogMessage << "RNG file checksum " << std::hex << csum << std::dec << std::endl;
-
-    grid->Broadcast(0,(void *)&csum,sizeof(csum));
-
-    return csum;
-  }
-
-  template <class vobj, class fobj, class munger>
-  static inline uint32_t readObjectParallel(Lattice<vobj> &Umu,
-                                            std::string file, 
-                                            munger munge,
-                                            int offset,
-                                            const std::string &format,
-                                            ILDGtype ILDG = ILDGtype()) {
-    typedef typename vobj::scalar_object sobj;
-
-    GridBase *grid = Umu._grid;
-
-    int ieee32big = (format == std::string("IEEE32BIG"));
-    int ieee32    = (format == std::string("IEEE32"));
-    int ieee64big = (format == std::string("IEEE64BIG"));
-    int ieee64    = (format == std::string("IEEE64"));
-
-
-    // Ideally one reader/writer per xy plane and read these contiguously
-    // with comms from nominated I/O nodes.
-    std::ifstream fin;
-
-    int nd = grid->_ndimension;
-    std::vector<int> parallel(nd,1); parallel[0] = 0;
-    std::vector<int> ioproc  (nd);
-    std::vector<int> start(nd);
-    std::vector<int> range(nd);
-
-    for(int d=0;d<nd;d++){
-      assert(grid->CheckerBoarded(d) == 0);
-    }
-
-    uint64_t slice_vol = 1;
-
-    int IOnode = 1;
-    int gstrip = grid->_gdimensions[0];
-    int lstrip = grid->_ldimensions[0];
-
-    int chunk ;
-    if ( nd==1) chunk = gstrip;
-    else        chunk = gstrip*grid->_ldimensions[1];
-
-    for(int d=0;d<grid->_ndimension;d++) {
-      
-      if (parallel[d]) {
-	range[d] = grid->_ldimensions[d];
-	start[d] = grid->_processor_coor[d]*range[d];
-	ioproc[d]= grid->_processor_coor[d];
-      } else {
-	range[d] = grid->_gdimensions[d];
-	start[d] = 0;
-	ioproc[d]= 0;
-	
-	if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
-      }
-      slice_vol = slice_vol * range[d];
-    }
-
-    {
-      uint32_t tmp = IOnode;
-      grid->GlobalSum(tmp);
-      std::cout<< std::dec ;
-      std::cout<< GridLogMessage<< "Parallel read I/O from "<< file << " with " <<tmp<< " IOnodes for subslice ";
-      for(int d=0;d<grid->_ndimension;d++){
-	std::cout<< range[d];
-	if( d< grid->_ndimension-1 ) 
-	  std::cout<< " x ";
-      }
-      std::cout << std::endl;
-      std::cout<< GridLogMessage<< "Parallel I/O local  strip size is "<< lstrip <<std::endl;
-      std::cout<< GridLogMessage<< "Parallel I/O global strip size is "<< gstrip <<std::endl;
-      std::cout<< GridLogMessage<< "Parallel I/O chunk size is "<< chunk  <<std::endl;
-    }
-
-    GridStopWatch timer; timer.Start();
-    uint64_t bytes=0;
-
-    int myrank = grid->ThisRank();
-    int iorank = grid->RankFromProcessorCoor(ioproc);
-
-    if (!ILDG.is_ILDG) {
-      if ( IOnode ) { 
-	fin.open(file,std::ios::binary|std::ios::in);
-	if ( !fin.is_open() ) { 
-	  std::cout << GridLogMessage << "readObjectParallel: Error opening file " << file << std::endl;
-          exit(0);
-	}
-      }
-    }
-
-    //////////////////////////////////////////////////////////
-    // Find the location of each site and send to primary node
-    // Take loop order from Chroma; defines loop order now that NERSC doc no longer
-    // available (how short sighted is that?)
-    //////////////////////////////////////////////////////////
-    Umu = zero;
-    static uint32_t csum; csum=0;//static for SHMEM
-
-    std::vector<fobj> fileObj(chunk); // FIXME
-    std::vector<sobj> siteObj(chunk); // Use alignedAllocator to place in symmetric region for SHMEM
-
-    // need to implement these loops in Nd independent way with a lexico conversion
-    for(int tlex=0;tlex<slice_vol;tlex+=chunk){
-
-      std::vector<int> tsite(nd); // temporary mixed up site
-      std::vector<int> gsite(nd);
-      std::vector<int> lsite(nd);
-
-      int rank, o_idx,i_idx, g_idx;
-
-      ///////////////////////////////////////////
-      // Get the global lexico base of the chunk
-      ///////////////////////////////////////////
-      Lexicographic::CoorFromIndex(tsite,tlex,range);
-      for(int d=0;d<nd;d++) gsite[d] = tsite[d]+start[d];
-      grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gsite);
-      grid->GlobalCoorToGlobalIndex(gsite,g_idx);
-
-      ////////////////////////////////
-      // iorank reads from the seek
-      ////////////////////////////////
-      if (myrank == iorank) {
-
-      	if (ILDG.is_ILDG){
-#ifdef HAVE_LIME
-	  // use C-LIME to populate the record
-          uint64_t sizeFO   = sizeof(fobj);
-          uint64_t sizeChunk= sizeFO*chunk;
-          limeReaderSeek(ILDG.LR, g_idx*sizeFO, SEEK_SET);
-          int status = limeReaderReadData((void *)&fileObj[0], &sizeChunk, ILDG.LR);
-#else 
-	  assert(0);
-#endif
-        } else {
-          fin.seekg(offset+g_idx*sizeof(fobj));
-          fin.read((char *)&fileObj[0],sizeof(fobj)*chunk);
-        }
-        bytes+=sizeof(fobj)*chunk;
-
-        if(ieee32big) be32toh_v((void *)&fileObj[0],sizeof(fobj)*chunk);
-        if(ieee32)    le32toh_v((void *)&fileObj[0],sizeof(fobj)*chunk);
-        if(ieee64big) be64toh_v((void *)&fileObj[0],sizeof(fobj)*chunk);
-        if(ieee64)    le64toh_v((void *)&fileObj[0],sizeof(fobj)*chunk);
-
-	for(int c=0;c<chunk;c++) munge(fileObj[c],siteObj[c],csum);
-
-      }
-     
-      // Possibly do transport through pt2pt 
-      for(int cc=0;cc<chunk;cc+=lstrip){
-
-	/////////////////////////////////
-	// Get the rank of owner of strip
-	/////////////////////////////////
-	Lexicographic::CoorFromIndex(tsite,tlex+cc,range);
-
-	for(int d=0;d<nd;d++){
-	  lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
-	  gsite[d] = tsite[d]+start[d];               // global site
-	}
-	grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gsite);
-
-	if ( rank != iorank ) { 
-	  if ( (myrank == rank) || (myrank==iorank) ) {
-	    grid->SendRecvPacket((void *)&siteObj[cc],(void *)&siteObj[cc],iorank,rank,sizeof(sobj)*lstrip);
-	  }
-	}
-	// Poke at destination
-	if ( myrank == rank ) {
-	  for(int x=0;x<lstrip;x++){
-	    lsite[0]=x;
-	    pokeLocalSite(siteObj[cc+x],Umu,lsite);
-	  }
-	}
-	grid->Barrier(); // necessary?
-      }
-    }
-
-    grid->GlobalSum(csum);
-    grid->GlobalSum(bytes);
-    grid->Barrier();
-
-    timer.Stop();
-    std::cout<<GridLogPerformance<<"readObjectParallel: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     <<(double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
-    return csum;
-  }
-
-
-  //////////////////////////////////////////////////////////
-  // Parallel writer
-  //////////////////////////////////////////////////////////
-  template <class vobj, class fobj, class munger>
-  static inline uint32_t writeObjectParallel(Lattice<vobj> &Umu,
-                                             std::string file, munger munge,
-                                             int offset,
-                                             const std::string &format,
-                                             ILDGtype ILDG = ILDGtype()) {
-    typedef typename vobj::scalar_object sobj;
-    GridBase *grid = Umu._grid;
-
-    int ieee32big = (format == std::string("IEEE32BIG"));
-    int ieee32    = (format == std::string("IEEE32"));
-    int ieee64big = (format == std::string("IEEE64BIG"));
-    int ieee64    = (format == std::string("IEEE64"));
-
-    if (!(ieee32big || ieee32 || ieee64big || ieee64)) {
-      std::cout << GridLogError << "Unrecognized file format " << format << std::endl;
-      std::cout << GridLogError << "Allowed: IEEE32BIG | IEEE32 | IEEE64BIG | IEEE64" << std::endl;
-      exit(0);
-    }
-
-    int nd = grid->_ndimension;
-    for (int d = 0; d < nd; d++) {
-      assert(grid->CheckerBoarded(d) == 0);
-    }
-
-    // Parallel in yzt, serial funnelled in "x".
-    // gx x ly chunk size
-    std::vector<int> parallel(nd, 1); parallel[0] = 0;
-    std::vector<int> ioproc(nd);
-    std::vector<int> start(nd);
-    std::vector<int> range(nd);
-
-    uint64_t slice_vol = 1;
-
-    int IOnode = 1;
-    int gstrip = grid->_gdimensions[0];
-    int lstrip = grid->_ldimensions[0];
-    int chunk;
-    if ( nd==1) chunk = gstrip;
-    else        chunk = gstrip*grid->_ldimensions[1];
-
-    for (int d = 0; d < grid->_ndimension; d++) {
-
-      if (parallel[d]) {
-	range[d] = grid->_ldimensions[d];
-	start[d] = grid->_processor_coor[d]*range[d];
-	ioproc[d]= grid->_processor_coor[d];
-      } else {
-	range[d] = grid->_gdimensions[d];
-	start[d] = 0;
-	ioproc[d]= 0;
-
-	if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
-      }
-
-      slice_vol = slice_vol * range[d];
-    }
-
-    {
-      uint32_t tmp = IOnode;
-      grid->GlobalSum(tmp);
-      std::cout<< GridLogMessage<< "Parallel write I/O from "<< file << " with " <<tmp<< " IOnodes for subslice ";
-      for(int d=0;d<grid->_ndimension;d++){
-	std::cout<< range[d];
-	if( d< grid->_ndimension-1 ) 
-	  std::cout<< " x ";
-      }
-      std::cout << std::endl;
-      std::cout<< GridLogMessage<< "Parallel I/O local  strip size is "<< lstrip <<std::endl;
-      std::cout<< GridLogMessage<< "Parallel I/O global strip size is "<< gstrip <<std::endl;
-      std::cout<< GridLogMessage<< "Parallel I/O chunk size is "<< chunk  <<std::endl;
-    }
-    
     GridStopWatch timer;
+
+    std::cout << GridLogMessage << "RNG read I/O on file " << file << std::endl;
+
+    int doread=1;
+    std::vector<RNGstate> iodata(lsites);
+    csum= IOobject(w,grid,iodata,file,offset,format,doread);
+
     timer.Start();
-    uint64_t bytes=0;
-
-    int myrank = grid->ThisRank();
-    int iorank = grid->RankFromProcessorCoor(ioproc);
-
-    // Take into account block size of parallel file systems want about
-    // Ideally one reader/writer per xy plane and read these contiguously
-    // with comms from nominated I/O nodes.
-    std::ofstream fout;
-    if (!ILDG.is_ILDG) {
-      if (IOnode){
-	fout.open(file, std::ios::binary | std::ios::in | std::ios::out);
-	if (!fout.is_open()) {
-	  std::cout << GridLogMessage << "writeObjectParallel: Error opening file " << file << std::endl;
-	  exit(0);
-	}
-      }
+    parallel_for(int lidx=0;lidx<lsites;lidx++){
+      std::vector<RngStateType> tmp(RngStateCount);
+      std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
+      parallel.SetState(tmp,lidx);
     }
-    
-    //////////////////////////////////////////////////////////
-    // Find the location of each site and send to primary node
-    // Take loop order from Chroma; defines loop order now that NERSC doc no
-    // longer
-    // available (how short sighted is that?)
-    //////////////////////////////////////////////////////////
+    timer.Stop();
+
+    std::cout << GridLogMessage << "RNG file checksum " << std::hex << csum << std::dec << std::endl;
+    std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl;
+    return csum;
+  }
+  /////////////////////////////////////////////////////////////////////////////
+  // Write a RNG; lexico map to an array of state and use IOobject
+  //////////////////////////////////////////////////////////////////////////////////////
+  static inline uint32_t writeRNG(GridSerialRNG &serial,GridParallelRNG &parallel,std::string file,int offset)
+  {
+    typedef typename GridSerialRNG::RngStateType RngStateType;
+    typedef RngStateType word; word w=0;
+    const int RngStateCount = GridSerialRNG::RngStateCount;
+    typedef std::array<RngStateType,RngStateCount> RNGstate;
 
     uint32_t csum = 0;
-    std::vector<fobj> fileObj(chunk);
-    std::vector<sobj> siteObj(chunk);
 
-    // should aggregate a whole chunk and then write.
-    // need to implement these loops in Nd independent way with a lexico
-    // conversion
-    for (int tlex = 0; tlex < slice_vol; tlex+=chunk) {
+    GridBase *grid = parallel._grid;
+    int gsites = grid->gSites();
+    int lsites = grid->lSites();
 
-      std::vector<int> tsite(nd);  // temporary mixed up site
-      std::vector<int> gsite(nd);
-      std::vector<int> lsite(nd);
+    GridStopWatch timer;
+    std::string format = "IEEE32BIG";
 
-      int rank, o_idx, i_idx, g_idx;
+    std::cout << GridLogMessage << "RNG write I/O on file " << file << std::endl;
 
-      // Possibly do transport through pt2pt 
-      for(int cc=0;cc<chunk;cc+=lstrip){
-
-	// Get the rank of owner of strip
-	Lexicographic::CoorFromIndex(tsite,tlex+cc,range);
-
-	for(int d=0;d<nd;d++){
-	  lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
-	  gsite[d] = tsite[d]+start[d];               // global site
-	}
-	grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gsite);
-
-	// Owner of data peeks it over lstrip
-	if ( myrank == rank ) {
-	  for(int x=0;x<lstrip;x++){
-	    lsite[0]=x;
-	    peekLocalSite(siteObj[cc+x],Umu,lsite);
-	  }
-	}
-
-	// Pair of nodes may need to do pt2pt send
-	if ( rank != iorank ) { // comms is necessary
-	  if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it
-	    // Send to IOrank 
-	    grid->SendRecvPacket((void *)&siteObj[cc],(void *)&siteObj[cc],rank,iorank,sizeof(sobj)*lstrip);
-	  }
-	}
-      }
-
-      grid->Barrier();  // necessary?
-
-      /////////////////////////
-      // Get the global lexico base of the chunk
-      /////////////////////////
-      Lexicographic::CoorFromIndex(tsite, tlex, range);
-      for(int d = 0;d < nd; d++){ gsite[d] = tsite[d] + start[d];}
-      grid->GlobalCoorToRankIndex(rank, o_idx, i_idx, gsite);
-      grid->GlobalCoorToGlobalIndex(gsite, g_idx);
-
-      if (myrank == iorank) {
-
-	for(int c=0;c<chunk;c++) munge(siteObj[c],fileObj[c],csum);
-
-        if (ieee32big) htobe32_v((void *)&fileObj[0], sizeof(fobj)*chunk);
-        if (ieee32   ) htole32_v((void *)&fileObj[0], sizeof(fobj)*chunk);
-        if (ieee64big) htobe64_v((void *)&fileObj[0], sizeof(fobj)*chunk);
-        if (ieee64   ) htole64_v((void *)&fileObj[0], sizeof(fobj)*chunk);
-
-        if (ILDG.is_ILDG) {
-#ifdef HAVE_LIME
-          uint64_t sizeFO   = sizeof(fobj);
-          uint64_t sizeChunk= sizeof(fobj)*chunk;
-	  limeWriterSeek(ILDG.LW, g_idx*sizeFO, SEEK_SET);
-          int status = limeWriteRecordData((void *)&fileObj[0], &sizeChunk, ILDG.LW);
-#else 
-	  assert(0);
-#endif
-        } else {
-          fout.seekp(offset + g_idx * sizeof(fobj));
-          fout.write((char *)&fileObj[0], sizeof(fobj)*chunk);assert( fout.fail()==0);
-        }
-        bytes += sizeof(fobj)*chunk;
-      }
+    timer.Start();
+    std::vector<RNGstate> iodata(lsites);
+    parallel_for(int lidx=0;lidx<lsites;lidx++){
+      std::vector<RngStateType> tmp(RngStateCount);
+      parallel.GetState(tmp,lidx);
+      std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
     }
-    
-    grid->GlobalSum(csum);
-    grid->GlobalSum(bytes);
-    
     timer.Stop();
-    std::cout << GridLogPerformance << "writeObjectParallel: wrote " << bytes
-              << " bytes in " << timer.Elapsed() << " "
-              << (double)bytes / timer.useconds() << " MB/s " << std::endl;
 
-     grid->Barrier();  // necessary?
-     if (!ILDG.is_ILDG) {
-       if (IOnode) {
-	 fout.close();
-       }
-     }
+    int dowrite=0;
+    csum= IOobject(w,grid,iodata,file,offset,format,dowrite);
 
+    std::cout << GridLogMessage << "RNG file checksum " << std::hex << csum << std::dec << std::endl;
+    std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl;
     return csum;
   }
 };
 }
-
 #endif

From 1a1f6d55f9ac7c94b7ddd1f129d26ddf87d29c9c Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 1 Jun 2017 17:37:26 -0400
Subject: [PATCH 057/177] Roll over to MPI IO for parallel IO

---
 lib/parallelIO/NerscIO.h | 175 ++++++++++++---------------------------
 1 file changed, 52 insertions(+), 123 deletions(-)

diff --git a/lib/parallelIO/NerscIO.h b/lib/parallelIO/NerscIO.h
index ab535dac..ba9d23de 100644
--- a/lib/parallelIO/NerscIO.h
+++ b/lib/parallelIO/NerscIO.h
@@ -30,12 +30,6 @@
 #ifndef GRID_NERSC_IO_H
 #define GRID_NERSC_IO_H
 
-#undef PARALLEL_READ
-#undef SERIAL_READ
-#define MPI_READ
-
-#define PARALLEL_WRITE
-
 #include <algorithm>
 #include <iostream>
 #include <iomanip>
@@ -133,10 +127,6 @@ namespace Grid {
     //////////////////////////////////////////////////////////////////////
     // Utilities ; these are QCD aware
     //////////////////////////////////////////////////////////////////////
-    inline void NerscChecksum(uint32_t *buf,uint32_t buf_size_bytes,uint32_t &csum)
-    {
-      BinaryIO::Uint32Checksum(buf,buf_size_bytes,csum);
-    }
     inline void reconstruct3(LorentzColourMatrix & cm)
     {
       const int x=0;
@@ -151,43 +141,38 @@ namespace Grid {
 
     template<class fobj,class sobj>
     struct NerscSimpleMunger{
-      void operator()(fobj &in, sobj &out, uint32_t &csum) {
+      void operator()(fobj &in, sobj &out) {
         for (int mu = 0; mu < Nd; mu++) {
           for (int i = 0; i < Nc; i++) {
-            for (int j = 0; j < Nc; j++) {
-              out(mu)()(i, j) = in(mu)()(i, j);
-            }
-          }
+          for (int j = 0; j < Nc; j++) {
+	    out(mu)()(i, j) = in(mu)()(i, j);
+	  }}
         }
-        NerscChecksum((uint32_t *)&in, sizeof(in), csum);
       };
     };
 
     template <class fobj, class sobj>
     struct NerscSimpleUnmunger {
-      void operator()(sobj &in, fobj &out, uint32_t &csum) {
+
+      void operator()(sobj &in, fobj &out) {
         for (int mu = 0; mu < Nd; mu++) {
           for (int i = 0; i < Nc; i++) {
-            for (int j = 0; j < Nc; j++) {
-              out(mu)()(i, j) = in(mu)()(i, j);
-            }
-          }
+          for (int j = 0; j < Nc; j++) {
+	    out(mu)()(i, j) = in(mu)()(i, j);
+	  }}
         }
-        NerscChecksum((uint32_t *)&out, sizeof(out), csum);
       };
     };
 
     template<class fobj,class sobj>
     struct Nersc3x2munger{
-      void operator() (fobj &in,sobj &out,uint32_t &csum){
-     
-	NerscChecksum((uint32_t *)&in,sizeof(in),csum); 
 
+      void operator() (fobj &in,sobj &out){
 	for(int mu=0;mu<4;mu++){
 	  for(int i=0;i<2;i++){
-	    for(int j=0;j<3;j++){
-	      out(mu)()(i,j) = in(mu)(i)(j);
-	    }}
+	  for(int j=0;j<3;j++){
+	    out(mu)()(i,j) = in(mu)(i)(j);
+	  }}
 	}
 	reconstruct3(out);
       }
@@ -196,18 +181,13 @@ namespace Grid {
     template<class fobj,class sobj>
     struct Nersc3x2unmunger{
 
-      void operator() (sobj &in,fobj &out,uint32_t &csum){
-
-
+      void operator() (sobj &in,fobj &out){
 	for(int mu=0;mu<4;mu++){
 	  for(int i=0;i<2;i++){
-	    for(int j=0;j<3;j++){
-	      out(mu)(i)(j) = in(mu)()(i,j);
-	    }}
+	  for(int j=0;j<3;j++){
+	    out(mu)(i)(j) = in(mu)()(i,j);
+	  }}
 	}
-
-	NerscChecksum((uint32_t *)&out,sizeof(out),csum); 
-
       }
     };
 
@@ -333,9 +313,9 @@ namespace Grid {
       // Now the meat: the object readers
       /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-      template<class vsimd>
-      static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,NerscField& header,std::string file)
-      {
+    template<class vsimd>
+    static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,NerscField& header,std::string file)
+    {
       typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
 
       GridBase *grid = Umu._grid;
@@ -354,62 +334,22 @@ namespace Grid {
       // depending on datatype, set up munger;
       // munger is a function of <floating point, Real, data_type>
       if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
-      if ( ieee32 || ieee32big ) {
-#ifdef PARALLEL_READ
-	csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>, LorentzColour2x3F> 
-	  (Umu,file,Nersc3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format);
-#endif
-#ifdef MPI_READ
-	csum=BinaryIO::readObjectMPI<iLorentzColourMatrix<vsimd>, LorentzColour2x3F> 
-	  (Umu,file,Nersc3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format);
-#endif
-#ifdef SERIAL_READ
-	csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>, LorentzColour2x3F> 
-	  (Umu,file,Nersc3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format);
-#endif
-      }
-      if ( ieee64 || ieee64big ) {
-#ifdef PARALLEL_READ
-	csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>, LorentzColour2x3D> 
-	  (Umu,file,Nersc3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format);
-#endif
-#ifdef MPI_READ
-	csum=BinaryIO::readObjectMPI<iLorentzColourMatrix<vsimd>, LorentzColour2x3D> 
-	  (Umu,file,Nersc3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format);
-#endif
-#ifdef SERIAL_READ
-	csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>, LorentzColour2x3D> 
-	  (Umu,file,Nersc3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format);
-#endif
-      }
-      } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
 	if ( ieee32 || ieee32big ) {
-#ifdef PARALLEL_READ
-	  csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
-	    (Umu,file,NerscSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format);
-#endif
-#ifdef MPI_READ
-	  csum=BinaryIO::readObjectMPI<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
-	    (Umu,file,NerscSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format);
-#endif
-#ifdef SERIAL_READ
-	  csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
-	    (Umu,file,NerscSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format);
-#endif
+	  csum=BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3F> 
+	    (Umu,file,Nersc3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format);
 	}
 	if ( ieee64 || ieee64big ) {
-#ifdef PARALLEL_READ
-	  csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
+	  csum=BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3D> 
+	    (Umu,file,Nersc3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format);
+	}
+      } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
+	if ( ieee32 || ieee32big ) {
+	  csum=BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
+	    (Umu,file,NerscSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format);
+	}
+	if ( ieee64 || ieee64big ) {
+	  csum=BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
 	    (Umu,file,NerscSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format);
-#endif
-#ifdef MPI_READ
-	  csum=BinaryIO::readObjectMPI<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
-	    (Umu,file,NerscSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format);
-#endif
-#ifdef SERIAL_READ
-	  csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
-	    (Umu,file,NerscSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format);
-#endif
 	}
       } else {
 	assert(0);
@@ -434,14 +374,14 @@ namespace Grid {
 	std::cerr << " plaqs " << clone.plaquette << " " << header.plaquette << std::endl;
 	std::cerr << " trace " << clone.link_trace<< " " << header.link_trace<< std::endl;
 	std::cerr << " csum  " <<std::hex<< csum << " " << header.checksum<< std::dec<< std::endl;
-	//	exit(0);
-      }
-      //      assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
-      //      assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
-      //      assert(csum == header.checksum );
-
-      //      std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
+	exit(0);
       }
+      assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
+      assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
+      assert(csum == header.checksum );
+      
+      std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
+    }
 
       template<class vsimd>
       static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,std::string file, int two_row,int bits32)
@@ -466,41 +406,29 @@ namespace Grid {
 	NerscStatistics<GaugeField>(Umu,header);
 	NerscMachineCharacteristics(header);
 
-	uint32_t csum;
 	int offset;
   
 	truncate(file);
 
 	if ( two_row ) { 
-
 	  header.floating_point = std::string("IEEE64BIG");
 	  header.data_type      = std::string("4D_SU3_GAUGE");
 	  Nersc3x2unmunger<fobj2D,sobj> munge;
-	  BinaryIO::Uint32Checksum<vobj,fobj2D>(Umu, munge,header.checksum);
 	  offset = writeHeader(header,file);
-#ifdef PARALLEL_WRITE
-	  csum=BinaryIO::writeObjectParallel<vobj,fobj2D>(Umu,file,munge,offset,header.floating_point);
-#else
-	  csum=BinaryIO::writeObjectSerial<vobj,fobj2D>(Umu,file,munge,offset,header.floating_point);
-#endif
+	  header.checksum=BinaryIO::writeLatticeObject<vobj,fobj2D>(Umu,file,munge,offset,header.floating_point);
+	  writeHeader(header,file);
 	} else { 
 	  header.floating_point = std::string("IEEE64BIG");
 	  header.data_type      = std::string("4D_SU3_GAUGE_3x3");
 	  NerscSimpleUnmunger<fobj3D,sobj> munge;
-	  BinaryIO::Uint32Checksum<vobj,fobj3D>(Umu, munge,header.checksum);
 	  offset = writeHeader(header,file);
-#ifdef PARALLEL_WRITE
-	  csum=BinaryIO::writeObjectParallel<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point);
-#else
-	  csum=BinaryIO::writeObjectSerial<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point);
-#endif
+	  header.checksum=BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point);
+	  writeHeader(header,file);
 	}
-
-	std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum "<<std::hex<<csum<< std::dec<<" plaq "<< header.plaquette <<std::endl;
-
+	std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum "
+		 <<std::hex<<header.checksum
+		 <<std::dec<<" plaq "<< header.plaquette <<std::endl;
       }
-
-
       ///////////////////////////////
       // RNG state
       ///////////////////////////////
@@ -521,7 +449,6 @@ namespace Grid {
 	header.plaquette=0.0;
 	NerscMachineCharacteristics(header);
 
-	uint32_t csum;
 	int offset;
   
 #ifdef RNG_RANLUX
@@ -539,11 +466,13 @@ namespace Grid {
 
 	truncate(file);
 	offset = writeHeader(header,file);
-	csum=BinaryIO::writeRNGSerial(serial,parallel,file,offset);
-	header.checksum = csum;
+	header.checksum = BinaryIO::writeRNG(serial,parallel,file,offset);
 	offset = writeHeader(header,file);
 
-	std::cout<<GridLogMessage <<"Written NERSC RNG STATE "<<file<< " checksum "<<std::hex<<csum<<std::dec<<std::endl;
+	std::cout<<GridLogMessage 
+		 <<"Written NERSC RNG STATE "<<file<< " checksum "
+		 <<std::hex<<header.checksum
+		 <<std::dec<<std::endl;
 
       }
     
@@ -575,7 +504,7 @@ namespace Grid {
 
 	// depending on datatype, set up munger;
 	// munger is a function of <floating point, Real, data_type>
-	uint32_t csum=BinaryIO::readRNGSerial(serial,parallel,file,offset);
+	uint32_t csum=BinaryIO::readRNG(serial,parallel,file,offset);
 
 	if ( csum != header.checksum ) { 
 	  std::cerr << "checksum mismatch "<<std::hex<< csum <<" "<<header.checksum<<std::dec<<std::endl;

From 4b98e524a01e153c5b2b5ae6389e135dd6705504 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 1 Jun 2017 17:38:18 -0400
Subject: [PATCH 058/177] Roll over to MPI version of I/O

---
 tests/IO/Test_nersc_io.cc | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/tests/IO/Test_nersc_io.cc b/tests/IO/Test_nersc_io.cc
index 8507df13..0a0f8977 100644
--- a/tests/IO/Test_nersc_io.cc
+++ b/tests/IO/Test_nersc_io.cc
@@ -38,10 +38,13 @@ int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
 
+  std::cout <<GridLogMessage<< " main "<<std::endl;
 
   std::vector<int> simd_layout = GridDefaultSimd(4,vComplex::Nsimd());
   std::vector<int> mpi_layout  = GridDefaultMpi();
-  std::vector<int> latt_size  ({16,16,16,16});
+  std::vector<int> latt_size  ({48,48,48,96});
+  //std::vector<int> latt_size  ({32,32,32,32});
+  //std::vector<int> latt_size  ({16,16,16,32});
   std::vector<int> clatt_size  ({4,4,4,8});
   int orthodir=3;
   int orthosz =latt_size[orthodir];
@@ -49,14 +52,17 @@ int main (int argc, char ** argv)
   GridCartesian     Fine(latt_size,simd_layout,mpi_layout);
   GridCartesian     Coarse(clatt_size,simd_layout,mpi_layout);
 
+
   GridParallelRNG   pRNGa(&Fine);
   GridParallelRNG   pRNGb(&Fine);
   GridSerialRNG     sRNGa;
   GridSerialRNG     sRNGb;
 
+  std::cout <<GridLogMessage<< " seeding... "<<std::endl;
   pRNGa.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
   sRNGa.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-  
+  std::cout <<GridLogMessage<< " ...done "<<std::endl;
+
   std::string rfile("./ckpoint_rng.4000");
   NerscIO::writeRNGState(sRNGa,pRNGa,rfile);
   NerscField rngheader;
@@ -65,14 +71,13 @@ int main (int argc, char ** argv)
   LatticeComplex tmpa(&Fine); random(pRNGa,tmpa);
   LatticeComplex tmpb(&Fine); random(pRNGb,tmpb);
   tmpa = tmpa - tmpb;
-  std::cout << " difference between restored randoms and orig "<<norm2( tmpa ) <<" / "<< norm2(tmpb)<<std::endl;
+  std::cout <<GridLogMessage<< " difference between restored randoms and orig "<<norm2( tmpa ) <<" / "<< norm2(tmpb)<<std::endl;
 
   ComplexD a,b;
 
   random(sRNGa,a);
   random(sRNGb,b);
-  std::cout << " serial RNG numbers "<<a<<" "<<b<<std::endl;
-
+  std::cout <<GridLogMessage<< " serial RNG numbers "<<a<<" "<<b<<std::endl;
 
   LatticeGaugeField Umu(&Fine);
   LatticeGaugeField Umu_diff(&Fine);
@@ -93,7 +98,7 @@ int main (int argc, char ** argv)
   Umu_diff = Umu - Umu_saved;
   //std::cout << "Umu_save "<<Umu_saved[0]<<std::endl;
   //std::cout << "Umu_read "<<Umu[0]<<std::endl;
-  std::cout << "norm2 Gauge Diff = "<<norm2(Umu_diff)<<std::endl;
+  std::cout <<GridLogMessage<< "norm2 Gauge Diff = "<<norm2(Umu_diff)<<std::endl;
 
   for(int mu=0;mu<Nd;mu++){
     U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
@@ -120,7 +125,6 @@ int main (int argc, char ** argv)
 #endif
   double vol = Fine.gSites();
   Complex PlaqScale(1.0/vol/6.0/3.0);
-  std::cout<<GridLogMessage <<"PlaqScale" << PlaqScale<<std::endl;
 
   std::vector<TComplex> Plaq_T(orthosz);
   sliceSum(Plaq,Plaq_T,Nd-1);

From 094c3d091afb3f29e7e370562cb0def29b3b26f0 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Fri, 2 Jun 2017 00:38:58 +0100
Subject: [PATCH 059/177] Improved and RNGs now survive checkpoint

---
 lib/parallelIO/BinaryIO.h                     | 175 ++++++++++++------
 .../hmc/checkpointers/BinaryCheckpointer.h    |  12 +-
 tests/IO/Test_nersc_io.cc                     |   4 +-
 3 files changed, 124 insertions(+), 67 deletions(-)

diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h
index 13341927..e427a25b 100644
--- a/lib/parallelIO/BinaryIO.h
+++ b/lib/parallelIO/BinaryIO.h
@@ -133,7 +133,6 @@ class BinaryIO {
       }
 #pragma omp critical
       csum = csum + csum_thr;
-
     }
   }
   // Network is big endian
@@ -227,13 +226,20 @@ class BinaryIO {
   // Real action:
   // Read or Write distributed lexico array of ANY object to a specific location in file 
   //////////////////////////////////////////////////////////////////////////////////////
+
+  static const int BINARYIO_MASTER_APPEND = 0x10;
+  static const int BINARYIO_UNORDERED     = 0x08;
+  static const int BINARYIO_LEXICOGRAPHIC = 0x04;
+  static const int BINARYIO_READ          = 0x02;
+  static const int BINARYIO_WRITE         = 0x01;
+
   template<class word,class fobj>
-    static inline uint32_t IOobject(word w,
-				    GridBase *grid,
-				    std::vector<fobj> &iodata,
-				    std::string file,
-				    int offset,
-				    const std::string &format, int doread)
+  static inline uint32_t IOobject(word w,
+				  GridBase *grid,
+				  std::vector<fobj> &iodata,
+				  std::string file,
+				  int offset,
+				  const std::string &format, int control)
   {
     grid->Barrier();
     GridStopWatch timer; 
@@ -250,21 +256,24 @@ class BinaryIO {
     std::vector<int> gLattice= grid->GlobalDimensions();
     std::vector<int> lLattice= grid->LocalDimensions();
 
-    std::vector<int> distribs(ndim,MPI_DISTRIBUTE_BLOCK);
-    std::vector<int> dargs   (ndim,MPI_DISTRIBUTE_DFLT_DARG);
-
     std::vector<int> lStart(ndim);
     std::vector<int> gStart(ndim);
 
     // Flatten the file
     uint64_t lsites = grid->lSites();
-    iodata.resize(lsites);
-
+    if ( control & BINARYIO_MASTER_APPEND )  {
+      assert(iodata.size()==1);
+    } else {
+      assert(lsites==iodata.size());
+    }
     for(int d=0;d<ndim;d++){
       gStart[d] = lLattice[d]*pcoor[d];
       lStart[d] = 0;
     }
 
+#ifdef USE_MPI_IO
+    std::vector<int> distribs(ndim,MPI_DISTRIBUTE_BLOCK);
+    std::vector<int> dargs   (ndim,MPI_DISTRIBUTE_DFLT_DARG);
     MPI_Datatype mpiObject;
     MPI_Datatype fileArray;
     MPI_Datatype localArray;
@@ -281,7 +290,6 @@ class BinaryIO {
       numword = sizeof(fobj)/sizeof(double);
       mpiword = MPI_DOUBLE;
     }
-    
 
     //////////////////////////////////////////////////////////////////////////////
     // Sobj in MPI phrasing
@@ -301,6 +309,7 @@ class BinaryIO {
     //////////////////////////////////////////////////////////////////////////////
     ierr=MPI_Type_create_subarray(ndim,&lLattice[0],&lLattice[0],&lStart[0],MPI_ORDER_FORTRAN, mpiObject,&localArray);    assert(ierr==0);
     ierr=MPI_Type_commit(&localArray);    assert(ierr==0);
+#endif
 
     //////////////////////////////////////////////////////////////////////////////
     // Byte order
@@ -311,55 +320,91 @@ class BinaryIO {
     int ieee64    = (format == std::string("IEEE64"));
 
     //////////////////////////////////////////////////////////////////////////////
-    // Do the MPI I/O read
+    // Do the I/O
     //////////////////////////////////////////////////////////////////////////////
-    if ( doread ) { 
-      std::cout<< GridLogMessage<< "MPI read I/O "<< file<< std::endl;
+    if ( control & BINARYIO_READ ) { 
+
       timer.Start();
-      ierr=MPI_File_open(grid->communicator, file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);    assert(ierr==0);
-      ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);    assert(ierr==0);
-      ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status);    assert(ierr==0);
+
+      if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
+#ifdef USE_MPI_IO
+	std::cout<< GridLogMessage<< "MPI read I/O "<< file<< std::endl;
+	ierr=MPI_File_open(grid->communicator, file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);    assert(ierr==0);
+	ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);    assert(ierr==0);
+	ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status);    assert(ierr==0);
+	MPI_File_close(&fh);
+	MPI_Type_free(&fileArray);
+	MPI_Type_free(&localArray);
+#else 
+	assert(0);
+#endif
+      } else { 
+	std::cout<< GridLogMessage<< "C++ read I/O "<< file<< std::endl;
+	std::ifstream fin;
+	fin.open(file,std::ios::binary|std::ios::in);
+	if ( control & BINARYIO_MASTER_APPEND )  {
+	  fin.seekg(-sizeof(fobj),fin.end);
+	} else { 
+	  fin.seekg(offset+myrank*lsites*sizeof(fobj));
+	}
+	fin.read((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fin.fail()==0);
+	fin.close();
+      }
       timer.Stop();
 
       grid->Barrier();
 
       bstimer.Start();
-      if (ieee32big) be32toh_v((void *)&iodata[0], sizeof(fobj)*lsites,csum);
-      if (ieee32)    le32toh_v((void *)&iodata[0], sizeof(fobj)*lsites,csum);
-      if (ieee64big) be64toh_v((void *)&iodata[0], sizeof(fobj)*lsites,csum);
-      if (ieee64)    le64toh_v((void *)&iodata[0], sizeof(fobj)*lsites,csum);
+      if (ieee32big) be32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum);
+      if (ieee32)    le32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum);
+      if (ieee64big) be64toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum);
+      if (ieee64)    le64toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum);
       bstimer.Stop();
-
-    } else { 
-      std::cout<< GridLogMessage<< "MPI write I/O "<< file<< std::endl;
-      bstimer.Start();
-      if (ieee32big) htobe32_v((void *)&iodata[0], sizeof(fobj)*lsites,csum);
-      if (ieee32)    htole32_v((void *)&iodata[0], sizeof(fobj)*lsites,csum);
-      if (ieee64big) htobe64_v((void *)&iodata[0], sizeof(fobj)*lsites,csum);
-      if (ieee64)    htole64_v((void *)&iodata[0], sizeof(fobj)*lsites,csum);
-      bstimer.Stop();
-
-      grid->Barrier();
-
-      timer.Start();
-      ierr=MPI_File_open(grid->communicator, file.c_str(), MPI_MODE_RDWR|MPI_MODE_CREATE,MPI_INFO_NULL, &fh); assert(ierr==0);
-      ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);                        assert(ierr==0);
-      ierr=MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);                                        assert(ierr==0);
-      timer.Stop();
-    
     }
-   
-    //////////////////////////////////////////////////////////////////////////////
-    // Finish up MPI I/O
-    //////////////////////////////////////////////////////////////////////////////
-    MPI_File_close(&fh);
-    MPI_Type_free(&fileArray);
-    MPI_Type_free(&localArray);
+    
+    if ( control & BINARYIO_WRITE ) { 
+
+      bstimer.Start();
+      if (ieee32big) htobe32_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum);
+      if (ieee32)    htole32_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum);
+      if (ieee64big) htobe64_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum);
+      if (ieee64)    htole64_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum);
+      bstimer.Stop();
+
+      grid->Barrier();
+
+      timer.Start();
+      if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
+#ifdef USE_MPI_IO
+	std::cout<< GridLogMessage<< "MPI write I/O "<< file<< std::endl;
+	ierr=MPI_File_open(grid->communicator, file.c_str(), MPI_MODE_RDWR|MPI_MODE_CREATE,MPI_INFO_NULL, &fh); assert(ierr==0);
+	ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);                        assert(ierr==0);
+	ierr=MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);                                        assert(ierr==0);
+	MPI_File_close(&fh);
+	MPI_Type_free(&fileArray);
+	MPI_Type_free(&localArray);
+#else 
+	assert(0);
+#endif
+      } else { 
+	std::cout<< GridLogMessage<< "C++ write I/O "<< file<< std::endl;
+	std::ofstream fout;
+	fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
+	if ( control & BINARYIO_MASTER_APPEND )  {
+	  fout.seekp(0,fout.end);
+	} else {
+	  fout.seekp(offset+myrank*lsites*sizeof(fobj));
+	}
+	fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fout.fail()==0);
+	fout.close();
+      }
+      timer.Stop();
+    }
 
     std::cout<<GridLogMessage<<"IOobject: ";
-    if ( doread) std::cout << " read  ";
-    else         std::cout << " write ";
-    uint64_t bytes = sizeof(fobj)*lsites*nrank;
+    if ( control & BINARYIO_READ) std::cout << " read  ";
+    else                          std::cout << " write ";
+    uint64_t bytes = sizeof(fobj)*iodata.size()*nrank;
     std::cout<< bytes <<" bytes in "<<timer.Elapsed() <<" "
 	     << (double)bytes/ (double)timer.useconds() <<" MB/s "<<std::endl;
 
@@ -390,8 +435,7 @@ class BinaryIO {
     std::vector<sobj> scalardata(lsites); 
     std::vector<fobj>     iodata(lsites); // Munge, checksum, byte order in here
     
-    int doread=1;
-    uint32_t csum= IOobject(w,grid,iodata,file,offset,format,doread);
+    uint32_t csum= IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC);
 
     GridStopWatch timer; 
     timer.Start();
@@ -432,8 +476,7 @@ class BinaryIO {
     grid->Barrier();
     timer.Stop();
 
-    int dowrite=0;
-    uint32_t csum= IOobject(w,grid,iodata,file,offset,format,dowrite);
+    uint32_t csum= IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC);
 
     std::cout<<GridLogMessage<<"writeLatticeObject: unvectorize overhead "<<timer.Elapsed()  <<std::endl;
 
@@ -461,9 +504,8 @@ class BinaryIO {
 
     std::cout << GridLogMessage << "RNG read I/O on file " << file << std::endl;
 
-    int doread=1;
     std::vector<RNGstate> iodata(lsites);
-    csum= IOobject(w,grid,iodata,file,offset,format,doread);
+    csum= IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC);
 
     timer.Start();
     parallel_for(int lidx=0;lidx<lsites;lidx++){
@@ -473,6 +515,14 @@ class BinaryIO {
     }
     timer.Stop();
 
+    iodata.resize(1);
+    csum+= IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_MASTER_APPEND);
+    {
+      std::vector<RngStateType> tmp(RngStateCount);
+      std::copy(iodata[0].begin(),iodata[0].end(),tmp.begin());
+      serial.SetState(tmp,0);
+    }
+
     std::cout << GridLogMessage << "RNG file checksum " << std::hex << csum << std::dec << std::endl;
     std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl;
     return csum;
@@ -507,9 +557,16 @@ class BinaryIO {
     }
     timer.Stop();
 
-    int dowrite=0;
-    csum= IOobject(w,grid,iodata,file,offset,format,dowrite);
+    csum= IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC);
 
+    iodata.resize(1);
+    {
+      std::vector<RngStateType> tmp(RngStateCount);
+      serial.GetState(tmp,0);
+      std::copy(tmp.begin(),tmp.end(),iodata[0].begin());
+    }
+    csum+= IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_MASTER_APPEND);
+    
     std::cout << GridLogMessage << "RNG file checksum " << std::hex << csum << std::dec << std::endl;
     std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl;
     return csum;
diff --git a/lib/qcd/hmc/checkpointers/BinaryCheckpointer.h b/lib/qcd/hmc/checkpointers/BinaryCheckpointer.h
index 251ed042..6116a46c 100644
--- a/lib/qcd/hmc/checkpointers/BinaryCheckpointer.h
+++ b/lib/qcd/hmc/checkpointers/BinaryCheckpointer.h
@@ -68,11 +68,11 @@ class BinaryHmcCheckpointer : public BaseHmcCheckpointer<Impl> {
       std::string config, rng;
       this->build_filenames(traj, Params, config, rng);
 
-      BinaryIO::BinarySimpleUnmunger<sobj_double, sobj> munge;
+      BinarySimpleUnmunger<sobj_double, sobj> munge;
       truncate(rng);
-      BinaryIO::writeRNGSerial(sRNG, pRNG, rng, 0);
+      BinaryIO::writeRNG(sRNG, pRNG, rng, 0);
       truncate(config);
-      uint32_t csum = BinaryIO::writeObjectParallel<vobj, sobj_double>(
+      uint32_t csum = BinaryIO::writeLatticeObject<vobj, sobj_double>(
           U, config, munge, 0, Params.format);
 
       std::cout << GridLogMessage << "Written Binary Configuration " << config
@@ -85,9 +85,9 @@ class BinaryHmcCheckpointer : public BaseHmcCheckpointer<Impl> {
     std::string config, rng;
     this->build_filenames(traj, Params, config, rng);
 
-    BinaryIO::BinarySimpleMunger<sobj_double, sobj> munge;
-    BinaryIO::readRNGSerial(sRNG, pRNG, rng, 0);
-    uint32_t csum = BinaryIO::readObjectParallel<vobj, sobj_double>(
+    BinarySimpleMunger<sobj_double, sobj> munge;
+    BinaryIO::readRNG(sRNG, pRNG, rng, 0);
+    uint32_t csum = BinaryIO::readLatticeObject<vobj, sobj_double>(
         U, config, munge, 0, Params.format);
 
     std::cout << GridLogMessage << "Read Binary Configuration " << config
diff --git a/tests/IO/Test_nersc_io.cc b/tests/IO/Test_nersc_io.cc
index 0a0f8977..14c6080d 100644
--- a/tests/IO/Test_nersc_io.cc
+++ b/tests/IO/Test_nersc_io.cc
@@ -42,9 +42,9 @@ int main (int argc, char ** argv)
 
   std::vector<int> simd_layout = GridDefaultSimd(4,vComplex::Nsimd());
   std::vector<int> mpi_layout  = GridDefaultMpi();
-  std::vector<int> latt_size  ({48,48,48,96});
+  //std::vector<int> latt_size  ({48,48,48,96});
   //std::vector<int> latt_size  ({32,32,32,32});
-  //std::vector<int> latt_size  ({16,16,16,32});
+  std::vector<int> latt_size  ({16,16,16,32});
   std::vector<int> clatt_size  ({4,4,4,8});
   int orthodir=3;
   int orthosz =latt_size[orthodir];

From 092dcd4e04c1e069fe63984cfc7d9f1a0da9e703 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Fri, 2 Jun 2017 22:50:25 +0100
Subject: [PATCH 060/177] MPI I/O only if MPI compiled

---
 lib/parallelIO/BinaryIO.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h
index e427a25b..8b8d4165 100644
--- a/lib/parallelIO/BinaryIO.h
+++ b/lib/parallelIO/BinaryIO.h
@@ -29,12 +29,16 @@
 #ifndef GRID_BINARY_IO_H
 #define GRID_BINARY_IO_H
 
-
-#include "IldgIOtypes.h"
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) // either MPI comms macro is defined
+#define USE_MPI_IO // switch on the MPI I/O code paths
+#else // neither MPI comms macro is defined
+#undef  USE_MPI_IO // drop any definition of the flag made elsewhere
+#endif
 
 #ifdef HAVE_ENDIAN_H
 #include <endian.h>
 #endif
+
 #include <arpa/inet.h>
 #include <algorithm>
 

From 22749699a30da633f58a4d47642721c639048f31 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Tue, 6 Jun 2017 11:45:30 -0500
Subject: [PATCH 061/177] Fixes after merge and point sink module

---
 extras/Hadrons/Environment.cc                 |  56 ++++++++-
 extras/Hadrons/Environment.hpp                |  44 ++++++-
 extras/Hadrons/Global.hpp                     |   9 +-
 extras/Hadrons/Modules.hpp                    |   1 +
 extras/Hadrons/Modules/MAction/DWF.hpp        |   8 +-
 extras/Hadrons/Modules/MAction/Wilson.hpp     |   6 +-
 .../Hadrons/Modules/MContraction/Baryon.hpp   |  14 +--
 .../Hadrons/Modules/MContraction/DiscLoop.hpp |   8 +-
 .../Hadrons/Modules/MContraction/Gamma3pt.hpp |  12 +-
 extras/Hadrons/Modules/MContraction/Meson.hpp | 102 ++++++++++------
 .../Modules/MContraction/WeakHamiltonian.hpp  |   8 +-
 .../MContraction/WeakHamiltonianEye.hpp       |   6 +-
 .../MContraction/WeakHamiltonianNonEye.hpp    |   6 +-
 .../MContraction/WeakNeutral4ptDisc.hpp       |   6 +-
 extras/Hadrons/Modules/MGauge/Load.hpp        |   6 +-
 extras/Hadrons/Modules/MGauge/Random.hpp      |   6 +-
 extras/Hadrons/Modules/MGauge/StochEm.hpp     |   6 +-
 extras/Hadrons/Modules/MGauge/Unit.hpp        |   6 +-
 extras/Hadrons/Modules/MLoop/NoiseLoop.hpp    |   8 +-
 .../Hadrons/Modules/MScalar/ChargedProp.hpp   |   6 +-
 extras/Hadrons/Modules/MScalar/FreeProp.hpp   |   6 +-
 extras/Hadrons/Modules/MSink/Point.hpp        | 114 ++++++++++++++++++
 extras/Hadrons/Modules/MSolver/RBPrecCG.hpp   |   8 +-
 extras/Hadrons/Modules/MSource/Point.hpp      |   6 +-
 extras/Hadrons/Modules/MSource/SeqGamma.hpp   |   8 +-
 extras/Hadrons/Modules/MSource/Wall.hpp       |   8 +-
 extras/Hadrons/Modules/MSource/Z2.hpp         |   6 +-
 extras/Hadrons/Modules/Quark.hpp              |   2 +-
 .../templates/Module_in_NS.hpp.template       |   6 +-
 .../templates/Module_tmp_in_NS.hpp.template   |   6 +-
 extras/Hadrons/modules.inc                    |   1 +
 tests/hadrons/Test_hadrons_spectrum.cc        |  24 ++--
 32 files changed, 385 insertions(+), 134 deletions(-)
 create mode 100644 extras/Hadrons/Modules/MSink/Point.hpp

diff --git a/extras/Hadrons/Environment.cc b/extras/Hadrons/Environment.cc
index 37f2a3d7..0e7a4326 100644
--- a/extras/Hadrons/Environment.cc
+++ b/extras/Hadrons/Environment.cc
@@ -41,9 +41,10 @@ using namespace Hadrons;
 // constructor /////////////////////////////////////////////////////////////////
 Environment::Environment(void)
 {
-    nd_ = GridDefaultLatt().size();
+    dim_ = GridDefaultLatt();
+    nd_  = dim_.size();
     grid4d_.reset(SpaceTimeGrid::makeFourDimGrid(
-        GridDefaultLatt(), GridDefaultSimd(nd_, vComplex::Nsimd()),
+        dim_, GridDefaultSimd(nd_, vComplex::Nsimd()),
         GridDefaultMpi()));
     gridRb4d_.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(grid4d_.get()));
     auto loc = getGrid()->LocalDimensions();
@@ -132,6 +133,16 @@ unsigned int Environment::getNd(void) const
     return nd_;
 }
 
+std::vector<int> Environment::getDim(void) const
+{
+    return dim_; // by-value return: the caller gets its own copy of dim_
+}
+
+int Environment::getDim(const unsigned int mu) const
+{
+    return dim_[mu]; // plain operator[]: no range check on mu is done here
+}
+
 // random number generator /////////////////////////////////////////////////////
 void Environment::setSeed(const std::vector<int> &seed)
 {
@@ -271,6 +282,21 @@ std::string Environment::getModuleType(const std::string name) const
     return getModuleType(getModuleAddress(name));
 }
 
+std::string Environment::getModuleNamespace(const unsigned int address) const
+{
+    // namespace = text between the last two "::" ("" if the type is unqualified)
+    const std::string type = getModuleType(address);
+    const auto        npos = std::string::npos, last = type.rfind("::");
+    if (last == npos) return "";
+    const auto prev = (last < 2) ? npos : type.rfind("::", last - 2);
+    return (prev == npos) ? type.substr(0, last) : type.substr(prev + 2, last - prev - 2);
+}
+
+std::string Environment::getModuleNamespace(const std::string name) const
+{
+    return getModuleNamespace(getModuleAddress(name)); // name -> address, then the address overload
+}
+
 bool Environment::hasModule(const unsigned int address) const
 {
     return (address < module_.size());
@@ -492,7 +518,14 @@ std::string Environment::getObjectType(const unsigned int address) const
 {
     if (hasRegisteredObject(address))
     {
-        return typeName(object_[address].type);
+        if (object_[address].type)
+        {
+            return typeName(object_[address].type);
+        }
+        else
+        {
+            return "<no type>";
+        }
     }
     else if (hasObject(address))
     {
@@ -532,6 +565,23 @@ Environment::Size Environment::getObjectSize(const std::string name) const
     return getObjectSize(getObjectAddress(name));
 }
 
+unsigned int Environment::getObjectModule(const unsigned int address) const
+{
+    // guard: without an object at this address there is nothing to look up
+    if (!hasObject(address))
+    {
+        HADRON_ERROR("no object with address " + std::to_string(address));
+    }
+
+    // value of the 'module' field stored in the object's table entry
+    return object_[address].module;
+}
+
+unsigned int Environment::getObjectModule(const std::string name) const
+{
+    return getObjectModule(getObjectAddress(name)); // name -> address, then the address overload
+}
+
 unsigned int Environment::getObjectLs(const unsigned int address) const
 {
     if (hasRegisteredObject(address))
diff --git a/extras/Hadrons/Environment.hpp b/extras/Hadrons/Environment.hpp
index 2628e5a0..13264bd5 100644
--- a/extras/Hadrons/Environment.hpp
+++ b/extras/Hadrons/Environment.hpp
@@ -106,6 +106,8 @@ public:
     void                    createGrid(const unsigned int Ls);
     GridCartesian *         getGrid(const unsigned int Ls = 1) const;
     GridRedBlackCartesian * getRbGrid(const unsigned int Ls = 1) const;
+    std::vector<int>        getDim(void) const;
+    int                     getDim(const unsigned int mu) const;
     unsigned int            getNd(void) const;
     // random number generator
     void                    setSeed(const std::vector<int> &seed);
@@ -131,6 +133,8 @@ public:
     std::string             getModuleName(const unsigned int address) const;
     std::string             getModuleType(const unsigned int address) const;
     std::string             getModuleType(const std::string name) const;
+    std::string             getModuleNamespace(const unsigned int address) const;
+    std::string             getModuleNamespace(const std::string name) const;
     bool                    hasModule(const unsigned int address) const;
     bool                    hasModule(const std::string name) const;
     Graph<unsigned int>     makeModuleGraph(void) const;
@@ -171,6 +175,8 @@ public:
     std::string             getObjectType(const std::string name) const;
     Size                    getObjectSize(const unsigned int address) const;
     Size                    getObjectSize(const std::string name) const;
+    unsigned int            getObjectModule(const unsigned int address) const;
+    unsigned int            getObjectModule(const std::string name) const;
     unsigned int            getObjectLs(const unsigned int address) const;
     unsigned int            getObjectLs(const std::string name) const;
     bool                    hasObject(const unsigned int address) const;
@@ -181,6 +187,10 @@ public:
     bool                    hasCreatedObject(const std::string name) const;
     bool                    isObject5d(const unsigned int address) const;
     bool                    isObject5d(const std::string name) const;
+    template <typename T>
+    bool                    isObjectOfType(const unsigned int address) const;
+    template <typename T>
+    bool                    isObjectOfType(const std::string name) const;
     Environment::Size       getTotalSize(void) const;
     void                    addOwnership(const unsigned int owner,
                                          const unsigned int property);
@@ -197,6 +207,7 @@ private:
     bool                                   dryRun_{false};
     unsigned int                           traj_, locVol_;
     // grids
+    std::vector<int>                       dim_;
     GridPt                                 grid4d_;
     std::map<unsigned int, GridPt>         grid5d_;
     GridRbPt                               gridRb4d_;
@@ -343,7 +354,7 @@ T * Environment::getObject(const unsigned int address) const
         else
         {
             HADRON_ERROR("object with address " + std::to_string(address) +
-                         " does not have type '" + typeid(T).name() +
+                         " does not have type '" + typeName(&typeid(T)) +
                          "' (has type '" + getObjectType(address) + "')");
         }
     }
@@ -380,6 +391,37 @@ T * Environment::createLattice(const std::string name)
     return createLattice<T>(getObjectAddress(name));
 }
 
+template <typename T>
+bool Environment::isObjectOfType(const unsigned int address) const
+{
+    // An address that is not registered cannot be type-checked: report which
+    // of the two failure cases applies.
+    if (!hasRegisteredObject(address))
+    {
+        if (hasObject(address))
+        {
+            HADRON_ERROR("object with address " + std::to_string(address) +
+                         " exists but is not registered");
+        }
+        else
+        {
+            HADRON_ERROR("no object with address " + std::to_string(address));
+        }
+    }
+
+    // The answer is whether the stored data can be downcast to Holder<T>.
+    const auto *holder =
+        dynamic_cast<Holder<T> *>(object_[address].data.get());
+
+    return (holder != nullptr);
+}
+
+template <typename T>
+bool Environment::isObjectOfType(const std::string name) const
+{
+    return isObjectOfType<T>(getObjectAddress(name)); // name -> address, then the address overload
+}
+
 END_HADRONS_NAMESPACE
 
 #endif // Hadrons_Environment_hpp_
diff --git a/extras/Hadrons/Global.hpp b/extras/Hadrons/Global.hpp
index 3ff79ea3..9de01623 100644
--- a/extras/Hadrons/Global.hpp
+++ b/extras/Hadrons/Global.hpp
@@ -65,7 +65,9 @@ BEGIN_HADRONS_NAMESPACE
 typedef FermionOperator<FImpl>                       FMat##suffix;             \
 typedef typename FImpl::FermionField                 FermionField##suffix;     \
 typedef typename FImpl::PropagatorField              PropagatorField##suffix;  \
-typedef typename FImpl::SitePropagator               SitePropagator##suffix;
+typedef typename FImpl::SitePropagator               SitePropagator##suffix;   \
+typedef std::vector<typename FImpl::SitePropagator::scalar_object>             \
+                                                     SlicedPropagator##suffix;
 
 #define GAUGE_TYPE_ALIASES(FImpl, suffix)\
 typedef typename FImpl::DoubledGaugeField DoubledGaugeField##suffix;
@@ -78,7 +80,10 @@ typedef typename SImpl::Field PropagatorField##suffix;
 typedef std::function<void(FermionField##suffix &,\
                       const FermionField##suffix &)> SolverFn##suffix;
 
-#define TYPE_ALIASES(FImpl, suffix)\
+#define SINK_TYPE_ALIASES(suffix)\
+typedef std::function<SlicedPropagator##suffix(const PropagatorField##suffix &)> SinkFn##suffix; // needs SlicedPropagator##suffix and PropagatorField##suffix declared first
+
+#define FGS_TYPE_ALIASES(FImpl, suffix)\
 FERM_TYPE_ALIASES(FImpl, suffix)\
 GAUGE_TYPE_ALIASES(FImpl, suffix)\
 SOLVER_TYPE_ALIASES(FImpl, suffix)
diff --git a/extras/Hadrons/Modules.hpp b/extras/Hadrons/Modules.hpp
index 7155c02a..42a1f651 100644
--- a/extras/Hadrons/Modules.hpp
+++ b/extras/Hadrons/Modules.hpp
@@ -16,6 +16,7 @@
 #include <Grid/Hadrons/Modules/MScalar/ChargedProp.hpp>
 #include <Grid/Hadrons/Modules/MScalar/FreeProp.hpp>
 #include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>
+#include <Grid/Hadrons/Modules/MSink/Point.hpp>
 #include <Grid/Hadrons/Modules/MSolver/RBPrecCG.hpp>
 #include <Grid/Hadrons/Modules/MSource/Point.hpp>
 #include <Grid/Hadrons/Modules/MSource/SeqGamma.hpp>
diff --git a/extras/Hadrons/Modules/MAction/DWF.hpp b/extras/Hadrons/Modules/MAction/DWF.hpp
index 880fe7b9..78e0916c 100644
--- a/extras/Hadrons/Modules/MAction/DWF.hpp
+++ b/extras/Hadrons/Modules/MAction/DWF.hpp
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#ifndef Hadrons_DWF_hpp_
-#define Hadrons_DWF_hpp_
+#ifndef Hadrons_MAction_DWF_hpp_
+#define Hadrons_MAction_DWF_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -56,7 +56,7 @@ template <typename FImpl>
 class TDWF: public Module<DWFPar>
 {
 public:
-    TYPE_ALIASES(FImpl,);
+    FGS_TYPE_ALIASES(FImpl,);
 public:
     // constructor
     TDWF(const std::string name);
@@ -137,4 +137,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_DWF_hpp_
+#endif // Hadrons_MAction_DWF_hpp_
diff --git a/extras/Hadrons/Modules/MAction/Wilson.hpp b/extras/Hadrons/Modules/MAction/Wilson.hpp
index 4b84bda5..aab54245 100644
--- a/extras/Hadrons/Modules/MAction/Wilson.hpp
+++ b/extras/Hadrons/Modules/MAction/Wilson.hpp
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#ifndef Hadrons_Wilson_hpp_
-#define Hadrons_Wilson_hpp_
+#ifndef Hadrons_MAction_Wilson_hpp_
+#define Hadrons_MAction_Wilson_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -54,7 +54,7 @@ template <typename FImpl>
 class TWilson: public Module<WilsonPar>
 {
 public:
-    TYPE_ALIASES(FImpl,);
+    FGS_TYPE_ALIASES(FImpl,);
 public:
     // constructor
     TWilson(const std::string name);
diff --git a/extras/Hadrons/Modules/MContraction/Baryon.hpp b/extras/Hadrons/Modules/MContraction/Baryon.hpp
index be7d919c..78bde5a2 100644
--- a/extras/Hadrons/Modules/MContraction/Baryon.hpp
+++ b/extras/Hadrons/Modules/MContraction/Baryon.hpp
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#ifndef Hadrons_Baryon_hpp_
-#define Hadrons_Baryon_hpp_
+#ifndef Hadrons_MContraction_Baryon_hpp_
+#define Hadrons_MContraction_Baryon_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -55,9 +55,9 @@ template <typename FImpl1, typename FImpl2, typename FImpl3>
 class TBaryon: public Module<BaryonPar>
 {
 public:
-    TYPE_ALIASES(FImpl1, 1);
-    TYPE_ALIASES(FImpl2, 2);
-    TYPE_ALIASES(FImpl3, 3);
+    FERM_TYPE_ALIASES(FImpl1, 1);
+    FERM_TYPE_ALIASES(FImpl2, 2);
+    FERM_TYPE_ALIASES(FImpl3, 3);
     class Result: Serializable
     {
     public:
@@ -121,11 +121,11 @@ void TBaryon<FImpl1, FImpl2, FImpl3>::execute(void)
     
     // FIXME: do contractions
     
-    write(writer, "meson", result);
+    // write(writer, "meson", result);
 }
 
 END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_Baryon_hpp_
+#endif // Hadrons_MContraction_Baryon_hpp_
diff --git a/extras/Hadrons/Modules/MContraction/DiscLoop.hpp b/extras/Hadrons/Modules/MContraction/DiscLoop.hpp
index 4ad12e90..4f782cd3 100644
--- a/extras/Hadrons/Modules/MContraction/DiscLoop.hpp
+++ b/extras/Hadrons/Modules/MContraction/DiscLoop.hpp
@@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#ifndef Hadrons_DiscLoop_hpp_
-#define Hadrons_DiscLoop_hpp_
+#ifndef Hadrons_MContraction_DiscLoop_hpp_
+#define Hadrons_MContraction_DiscLoop_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -52,7 +52,7 @@ public:
 template <typename FImpl>
 class TDiscLoop: public Module<DiscLoopPar>
 {
-    TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
     class Result: Serializable
     {
     public:
@@ -141,4 +141,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_DiscLoop_hpp_
+#endif // Hadrons_MContraction_DiscLoop_hpp_
diff --git a/extras/Hadrons/Modules/MContraction/Gamma3pt.hpp b/extras/Hadrons/Modules/MContraction/Gamma3pt.hpp
index e5e73fa6..7f643d49 100644
--- a/extras/Hadrons/Modules/MContraction/Gamma3pt.hpp
+++ b/extras/Hadrons/Modules/MContraction/Gamma3pt.hpp
@@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#ifndef Hadrons_Gamma3pt_hpp_
-#define Hadrons_Gamma3pt_hpp_
+#ifndef Hadrons_MContraction_Gamma3pt_hpp_
+#define Hadrons_MContraction_Gamma3pt_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -72,9 +72,9 @@ public:
 template <typename FImpl1, typename FImpl2, typename FImpl3>
 class TGamma3pt: public Module<Gamma3ptPar>
 {
-    TYPE_ALIASES(FImpl1, 1);
-    TYPE_ALIASES(FImpl2, 2);
-    TYPE_ALIASES(FImpl3, 3);
+    FERM_TYPE_ALIASES(FImpl1, 1);
+    FERM_TYPE_ALIASES(FImpl2, 2);
+    FERM_TYPE_ALIASES(FImpl3, 3);
     class Result: Serializable
     {
     public:
@@ -167,4 +167,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_Gamma3pt_hpp_
+#endif // Hadrons_MContraction_Gamma3pt_hpp_
diff --git a/extras/Hadrons/Modules/MContraction/Meson.hpp b/extras/Hadrons/Modules/MContraction/Meson.hpp
index 09c2a6e1..7810326a 100644
--- a/extras/Hadrons/Modules/MContraction/Meson.hpp
+++ b/extras/Hadrons/Modules/MContraction/Meson.hpp
@@ -29,8 +29,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#ifndef Hadrons_Meson_hpp_
-#define Hadrons_Meson_hpp_
+#ifndef Hadrons_MContraction_Meson_hpp_
+#define Hadrons_MContraction_Meson_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -69,7 +69,7 @@ public:
                                     std::string, q1,
                                     std::string, q2,
                                     std::string, gammas,
-                                    std::string, mom,
+                                    std::string, sink,
                                     std::string, output);
 };
 
@@ -77,8 +77,10 @@ template <typename FImpl1, typename FImpl2>
 class TMeson: public Module<MesonPar>
 {
 public:
-    TYPE_ALIASES(FImpl1, 1);
-    TYPE_ALIASES(FImpl2, 2);
+    FERM_TYPE_ALIASES(FImpl1, 1);
+    FERM_TYPE_ALIASES(FImpl2, 2);
+    FERM_TYPE_ALIASES(ScalarImplCR, Scalar);
+    SINK_TYPE_ALIASES(Scalar);
     class Result: Serializable
     {
     public:
@@ -115,7 +117,7 @@ TMeson<FImpl1, FImpl2>::TMeson(const std::string name)
 template <typename FImpl1, typename FImpl2>
 std::vector<std::string> TMeson<FImpl1, FImpl2>::getInput(void)
 {
-    std::vector<std::string> input = {par().q1, par().q2};
+    std::vector<std::string> input = {par().q1, par().q2, par().sink};
     
     return input;
 }
@@ -154,6 +156,9 @@ void TMeson<FImpl1, FImpl2>::parseGammaString(std::vector<GammaPair> &gammaList)
 
 
 // execution ///////////////////////////////////////////////////////////////////
+#define mesonConnected(q1, q2, gSnk, gSrc) \
+((g5*(gSnk))*(q1)*(adj(gSrc)*g5)*adj(q2)) // outer parentheses keep the product atomic; uses `g5` from the expanding scope
+
 template <typename FImpl1, typename FImpl2>
 void TMeson<FImpl1, FImpl2>::execute(void)
 {
@@ -161,43 +166,72 @@ void TMeson<FImpl1, FImpl2>::execute(void)
                  << " quarks '" << par().q1 << "' and '" << par().q2 << "'"
                  << std::endl;
     
-    CorrWriter              writer(par().output);
-    PropagatorField1       &q1 = *env().template getObject<PropagatorField1>(par().q1);
-    PropagatorField2       &q2 = *env().template getObject<PropagatorField2>(par().q2);
-    LatticeComplex         c(env().getGrid());
-    Gamma                  g5(Gamma::Algebra::Gamma5);
-    std::vector<GammaPair> gammaList;
+    CorrWriter             writer(par().output);
     std::vector<TComplex>  buf;
     std::vector<Result>    result;
-    std::vector<Real>      p;
-
-    p  = strToVec<Real>(par().mom);
-    LatticeComplex         ph(env().getGrid()), coor(env().getGrid());
-    Complex                i(0.0,1.0);
-    ph = zero;
-    for(unsigned int mu = 0; mu < env().getNd(); mu++)
-    {
-        LatticeCoordinate(coor, mu);
-        ph = ph + p[mu]*coor*((1./(env().getGrid()->_fdimensions[mu])));
-    }
-    ph = exp((Real)(2*M_PI)*i*ph);
+    Gamma                  g5(Gamma::Algebra::Gamma5);
+    std::vector<GammaPair> gammaList;
+    int                    nt = env().getDim(Tp);
     
     parseGammaString(gammaList);
-
     result.resize(gammaList.size());
     for (unsigned int i = 0; i < result.size(); ++i)
     {
-        Gamma gSnk(gammaList[i].first);
-        Gamma gSrc(gammaList[i].second);
-        c = trace((g5*gSnk)*q1*(adj(gSrc)*g5)*adj(q2))*ph;
-        sliceSum(c, buf, Tp);
-
         result[i].gamma_snk = gammaList[i].first;
         result[i].gamma_src = gammaList[i].second;
-        result[i].corr.resize(buf.size());
-        for (unsigned int t = 0; t < buf.size(); ++t)
+        result[i].corr.resize(nt);
+    }
+    if (env().template isObjectOfType<SlicedPropagator1>(par().q1) and
+        env().template isObjectOfType<SlicedPropagator2>(par().q2))
+    {
+        SlicedPropagator1 &q1 = *env().template getObject<SlicedPropagator1>(par().q1);
+        SlicedPropagator2 &q2 = *env().template getObject<SlicedPropagator2>(par().q2);
+        // q1 and q2 are SlicedPropagator vectors, indexed directly by t below
+        LOG(Message) << "(propagator already sinked)" << std::endl;
+        for (unsigned int i = 0; i < result.size(); ++i)
         {
-            result[i].corr[t] = TensorRemove(buf[t]);
+            Gamma gSnk(gammaList[i].first);
+            Gamma gSrc(gammaList[i].second);
+            // bound by corr (resized to nt above): buf is never filled in this branch
+            for (unsigned int t = 0; t < result[i].corr.size(); ++t)
+            {
+                result[i].corr[t] = TensorRemove(trace(mesonConnected(q1[t], q2[t], gSnk, gSrc)));
+            }
+        }
+    }
+    else
+    {
+        PropagatorField1 &q1   = *env().template getObject<PropagatorField1>(par().q1);
+        PropagatorField2 &q2   = *env().template getObject<PropagatorField2>(par().q2);
+        LatticeComplex   c(env().getGrid());
+        
+        LOG(Message) << "(using sink '" << par().sink << "')" << std::endl;
+        for (unsigned int i = 0; i < result.size(); ++i)
+        {
+            Gamma       gSnk(gammaList[i].first);
+            Gamma       gSrc(gammaList[i].second);
+            std::string ns;
+                
+            ns = env().getModuleNamespace(env().getObjectModule(par().sink));
+            if (ns == "MSource")
+            {
+                PropagatorField1 &sink =
+                    *env().template getObject<PropagatorField1>(par().sink);
+                
+                c = trace(mesonConnected(q1, q2, gSnk, gSrc)*sink);
+                sliceSum(c, buf, Tp);
+            }
+            else if (ns == "MSink")
+            {
+                SinkFnScalar &sink = *env().template getObject<SinkFnScalar>(par().sink);
+                
+                c   = trace(mesonConnected(q1, q2, gSnk, gSrc));
+                buf = sink(c);
+            }
+            for (unsigned int t = 0; t < buf.size(); ++t)
+            {
+                result[i].corr[t] = TensorRemove(buf[t]);
+            }
         }
     }
     write(writer, "meson", result);
@@ -207,4 +241,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_Meson_hpp_
+#endif // Hadrons_MContraction_Meson_hpp_
diff --git a/extras/Hadrons/Modules/MContraction/WeakHamiltonian.hpp b/extras/Hadrons/Modules/MContraction/WeakHamiltonian.hpp
index 23482feb..0a3c2e31 100644
--- a/extras/Hadrons/Modules/MContraction/WeakHamiltonian.hpp
+++ b/extras/Hadrons/Modules/MContraction/WeakHamiltonian.hpp
@@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#ifndef Hadrons_WeakHamiltonian_hpp_
-#define Hadrons_WeakHamiltonian_hpp_
+#ifndef Hadrons_MContraction_WeakHamiltonian_hpp_
+#define Hadrons_MContraction_WeakHamiltonian_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -83,7 +83,7 @@ public:
 class T##modname: public Module<WeakHamiltonianPar>\
 {\
 public:\
-    TYPE_ALIASES(FIMPL,)\
+    FERM_TYPE_ALIASES(FIMPL,)\
     class Result: Serializable\
     {\
     public:\
@@ -111,4 +111,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_WeakHamiltonian_hpp_
+#endif // Hadrons_MContraction_WeakHamiltonian_hpp_
diff --git a/extras/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp b/extras/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp
index 2ee87895..3a2b9309 100644
--- a/extras/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp
+++ b/extras/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp
@@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#ifndef Hadrons_WeakHamiltonianEye_hpp_
-#define Hadrons_WeakHamiltonianEye_hpp_
+#ifndef Hadrons_MContraction_WeakHamiltonianEye_hpp_
+#define Hadrons_MContraction_WeakHamiltonianEye_hpp_
 
 #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
 
@@ -55,4 +55,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_WeakHamiltonianEye_hpp_
+#endif // Hadrons_MContraction_WeakHamiltonianEye_hpp_
diff --git a/extras/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp b/extras/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp
index 69bb8005..eb5abe3c 100644
--- a/extras/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp
+++ b/extras/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp
@@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#ifndef Hadrons_WeakHamiltonianNonEye_hpp_
-#define Hadrons_WeakHamiltonianNonEye_hpp_
+#ifndef Hadrons_MContraction_WeakHamiltonianNonEye_hpp_
+#define Hadrons_MContraction_WeakHamiltonianNonEye_hpp_
 
 #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
 
@@ -54,4 +54,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_WeakHamiltonianNonEye_hpp_
+#endif // Hadrons_MContraction_WeakHamiltonianNonEye_hpp_
diff --git a/extras/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp b/extras/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp
index c0d8f829..f26d4636 100644
--- a/extras/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp
+++ b/extras/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp
@@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#ifndef Hadrons_WeakNeutral4ptDisc_hpp_
-#define Hadrons_WeakNeutral4ptDisc_hpp_
+#ifndef Hadrons_MContraction_WeakNeutral4ptDisc_hpp_
+#define Hadrons_MContraction_WeakNeutral4ptDisc_hpp_
 
 #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
 
@@ -56,4 +56,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_WeakNeutral4ptDisc_hpp_
+#endif // Hadrons_MContraction_WeakNeutral4ptDisc_hpp_
diff --git a/extras/Hadrons/Modules/MGauge/Load.hpp b/extras/Hadrons/Modules/MGauge/Load.hpp
index c41f9b8c..5ff6da0f 100644
--- a/extras/Hadrons/Modules/MGauge/Load.hpp
+++ b/extras/Hadrons/Modules/MGauge/Load.hpp
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#ifndef Hadrons_Load_hpp_
-#define Hadrons_Load_hpp_
+#ifndef Hadrons_MGauge_Load_hpp_
+#define Hadrons_MGauge_Load_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -70,4 +70,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_Load_hpp_
+#endif // Hadrons_MGauge_Load_hpp_
diff --git a/extras/Hadrons/Modules/MGauge/Random.hpp b/extras/Hadrons/Modules/MGauge/Random.hpp
index e3fbcf1a..a97d25cf 100644
--- a/extras/Hadrons/Modules/MGauge/Random.hpp
+++ b/extras/Hadrons/Modules/MGauge/Random.hpp
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#ifndef Hadrons_Random_hpp_
-#define Hadrons_Random_hpp_
+#ifndef Hadrons_MGauge_Random_hpp_
+#define Hadrons_MGauge_Random_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -63,4 +63,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_Random_hpp_
+#endif // Hadrons_MGauge_Random_hpp_
diff --git a/extras/Hadrons/Modules/MGauge/StochEm.hpp b/extras/Hadrons/Modules/MGauge/StochEm.hpp
index 50a77435..12ce9fdc 100644
--- a/extras/Hadrons/Modules/MGauge/StochEm.hpp
+++ b/extras/Hadrons/Modules/MGauge/StochEm.hpp
@@ -25,8 +25,8 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef Hadrons_StochEm_hpp_
-#define Hadrons_StochEm_hpp_
+#ifndef Hadrons_MGauge_StochEm_hpp_
+#define Hadrons_MGauge_StochEm_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -72,4 +72,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_StochEm_hpp_
+#endif // Hadrons_MGauge_StochEm_hpp_
diff --git a/extras/Hadrons/Modules/MGauge/Unit.hpp b/extras/Hadrons/Modules/MGauge/Unit.hpp
index 2ff10bfd..7cd15ef7 100644
--- a/extras/Hadrons/Modules/MGauge/Unit.hpp
+++ b/extras/Hadrons/Modules/MGauge/Unit.hpp
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#ifndef Hadrons_Unit_hpp_
-#define Hadrons_Unit_hpp_
+#ifndef Hadrons_MGauge_Unit_hpp_
+#define Hadrons_MGauge_Unit_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -63,4 +63,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_Unit_hpp_
+#endif // Hadrons_MGauge_Unit_hpp_
diff --git a/extras/Hadrons/Modules/MLoop/NoiseLoop.hpp b/extras/Hadrons/Modules/MLoop/NoiseLoop.hpp
index 3d2850d1..5d2c4a13 100644
--- a/extras/Hadrons/Modules/MLoop/NoiseLoop.hpp
+++ b/extras/Hadrons/Modules/MLoop/NoiseLoop.hpp
@@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#ifndef Hadrons_NoiseLoop_hpp_
-#define Hadrons_NoiseLoop_hpp_
+#ifndef Hadrons_MLoop_NoiseLoop_hpp_
+#define Hadrons_MLoop_NoiseLoop_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -65,7 +65,7 @@ template <typename FImpl>
 class TNoiseLoop: public Module<NoiseLoopPar>
 {
 public:
-    TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
 public:
     // constructor
     TNoiseLoop(const std::string name);
@@ -129,4 +129,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_NoiseLoop_hpp_
+#endif // Hadrons_MLoop_NoiseLoop_hpp_
diff --git a/extras/Hadrons/Modules/MScalar/ChargedProp.hpp b/extras/Hadrons/Modules/MScalar/ChargedProp.hpp
index 8bb5faa0..fbe75c05 100644
--- a/extras/Hadrons/Modules/MScalar/ChargedProp.hpp
+++ b/extras/Hadrons/Modules/MScalar/ChargedProp.hpp
@@ -1,5 +1,5 @@
-#ifndef Hadrons_ChargedProp_hpp_
-#define Hadrons_ChargedProp_hpp_
+#ifndef Hadrons_MScalar_ChargedProp_hpp_
+#define Hadrons_MScalar_ChargedProp_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -58,4 +58,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_ChargedProp_hpp_
+#endif // Hadrons_MScalar_ChargedProp_hpp_
diff --git a/extras/Hadrons/Modules/MScalar/FreeProp.hpp b/extras/Hadrons/Modules/MScalar/FreeProp.hpp
index 29f15eda..97cf288a 100644
--- a/extras/Hadrons/Modules/MScalar/FreeProp.hpp
+++ b/extras/Hadrons/Modules/MScalar/FreeProp.hpp
@@ -1,5 +1,5 @@
-#ifndef Hadrons_FreeProp_hpp_
-#define Hadrons_FreeProp_hpp_
+#ifndef Hadrons_MScalar_FreeProp_hpp_
+#define Hadrons_MScalar_FreeProp_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -47,4 +47,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_FreeProp_hpp_
+#endif // Hadrons_MScalar_FreeProp_hpp_
diff --git a/extras/Hadrons/Modules/MSink/Point.hpp b/extras/Hadrons/Modules/MSink/Point.hpp
new file mode 100644
index 00000000..7b3aa9de
--- /dev/null
+++ b/extras/Hadrons/Modules/MSink/Point.hpp
@@ -0,0 +1,130 @@
+#ifndef Hadrons_MSink_Point_hpp_
+#define Hadrons_MSink_Point_hpp_
+
+#include <Grid/Hadrons/Global.hpp>
+#include <Grid/Hadrons/Module.hpp>
+#include <Grid/Hadrons/ModuleFactory.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                                   Point                                    *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MSink)
+
+// Parameters of the point sink module.
+class PointPar: Serializable
+{
+public:
+    // mom: momentum components as a string (e.g. "0 0 0"); component mu is
+    // used in units of 2*pi/L[mu], L[mu] being the full lattice extent in
+    // direction mu.
+    GRID_SERIALIZABLE_CLASS_MEMBERS(PointPar,
+                                    std::string, mom);
+};
+
+// Module producing a sink function (SinkFn) that multiplies a propagator
+// field by the phase exp(2*pi*i*sum_mu p[mu]*x[mu]/L[mu]) and slice-sums the
+// result along direction Tp.
+template <typename FImpl>
+class TPoint: public Module<PointPar>
+{
+public:
+    FERM_TYPE_ALIASES(FImpl,);
+    SINK_TYPE_ALIASES();
+public:
+    // constructor
+    TPoint(const std::string name);
+    // destructor
+    virtual ~TPoint(void) = default;
+    // dependency relation
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_NS(Point,       TPoint<FIMPL>,        MSink);
+MODULE_REGISTER_NS(ScalarPoint, TPoint<ScalarImplCR>, MSink);
+
+/******************************************************************************
+ *                          TPoint implementation                             *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename FImpl>
+TPoint<FImpl>::TPoint(const std::string name)
+: Module<PointPar>(name)
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+// No input: the sink is built from the module parameters only.
+template <typename FImpl>
+std::vector<std::string> TPoint<FImpl>::getInput(void)
+{
+    std::vector<std::string> in;
+    
+    return in;
+}
+
+// Single output: the sink function, stored under the module name.
+template <typename FImpl>
+std::vector<std::string> TPoint<FImpl>::getOutput(void)
+{
+    std::vector<std::string> out = {getName()};
+    
+    return out;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+// Registers the output object with the size of one 4D LatticeComplex
+// (presumably accounting for the phase field held by the sink function).
+template <typename FImpl>
+void TPoint<FImpl>::setup(void)
+{
+    const unsigned int size = env().template lattice4dSize<LatticeComplex>();
+    
+    env().registerObject(getName(), size);
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+// Builds the momentum phase and stores the resulting sink function in the
+// environment. Components missing from 'mom' (e.g. the last component of a
+// 3-component momentum such as "0 0 0") are treated as zero.
+template <typename FImpl>
+void TPoint<FImpl>::execute(void)
+{
+    std::vector<Real> p = strToVec<Real>(par().mom);
+    LatticeComplex    ph(env().getGrid()), coor(env().getGrid());
+    Complex           i(0.0,1.0);
+    
+    LOG(Message) << "Setting up point sink function for momentum ["
+                 << par().mom << "]" << std::endl;
+    ph = zero;
+    // bound the loop by p.size() as well as Nd: 'mom' may hold fewer than Nd
+    // components, and p[mu] must never be read past the end of the vector
+    for(unsigned int mu = 0; (mu < p.size()) && (mu < env().getNd()); mu++)
+    {
+        LatticeCoordinate(coor, mu);
+        ph = ph + (p[mu]/env().getGrid()->_fdimensions[mu])*coor;
+    }
+    ph = exp((Real)(2*M_PI)*i*ph);
+    // ph is captured by value so the sink function owns its copy of the phase
+    auto sink = [ph](const PropagatorField &field)
+    {
+        SlicedPropagator res;
+        PropagatorField  tmp = ph*field;
+        
+        sliceSum(tmp, res, Tp);
+        
+        return res;
+    };
+    env().setObject(getName(), new SinkFn(sink));
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MSink_Point_hpp_
diff --git a/extras/Hadrons/Modules/MSolver/RBPrecCG.hpp b/extras/Hadrons/Modules/MSolver/RBPrecCG.hpp
index d7220271..b1f63a5d 100644
--- a/extras/Hadrons/Modules/MSolver/RBPrecCG.hpp
+++ b/extras/Hadrons/Modules/MSolver/RBPrecCG.hpp
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#ifndef Hadrons_RBPrecCG_hpp_
-#define Hadrons_RBPrecCG_hpp_
+#ifndef Hadrons_MSolver_RBPrecCG_hpp_
+#define Hadrons_MSolver_RBPrecCG_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -53,7 +53,7 @@ template <typename FImpl>
 class TRBPrecCG: public Module<RBPrecCGPar>
 {
 public:
-    TYPE_ALIASES(FImpl,);
+    FGS_TYPE_ALIASES(FImpl,);
 public:
     // constructor
     TRBPrecCG(const std::string name);
@@ -129,4 +129,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_RBPrecCG_hpp_
+#endif // Hadrons_MSolver_RBPrecCG_hpp_
diff --git a/extras/Hadrons/Modules/MSource/Point.hpp b/extras/Hadrons/Modules/MSource/Point.hpp
index 3c0fc9a1..0c415807 100644
--- a/extras/Hadrons/Modules/MSource/Point.hpp
+++ b/extras/Hadrons/Modules/MSource/Point.hpp
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#ifndef Hadrons_Point_hpp_
-#define Hadrons_Point_hpp_
+#ifndef Hadrons_MSource_Point_hpp_
+#define Hadrons_MSource_Point_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -133,4 +133,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_Point_hpp_
+#endif // Hadrons_MSource_Point_hpp_
diff --git a/extras/Hadrons/Modules/MSource/SeqGamma.hpp b/extras/Hadrons/Modules/MSource/SeqGamma.hpp
index 366ebee7..e2129a46 100644
--- a/extras/Hadrons/Modules/MSource/SeqGamma.hpp
+++ b/extras/Hadrons/Modules/MSource/SeqGamma.hpp
@@ -28,8 +28,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#ifndef Hadrons_SeqGamma_hpp_
-#define Hadrons_SeqGamma_hpp_
+#ifndef Hadrons_MSource_SeqGamma_hpp_
+#define Hadrons_MSource_SeqGamma_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -72,7 +72,7 @@ template <typename FImpl>
 class TSeqGamma: public Module<SeqGammaPar>
 {
 public:
-    TYPE_ALIASES(FImpl,);
+    FGS_TYPE_ALIASES(FImpl,);
 public:
     // constructor
     TSeqGamma(const std::string name);
@@ -161,4 +161,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_SeqGamma_hpp_
+#endif // Hadrons_MSource_SeqGamma_hpp_
diff --git a/extras/Hadrons/Modules/MSource/Wall.hpp b/extras/Hadrons/Modules/MSource/Wall.hpp
index 8722876f..4de37e4d 100644
--- a/extras/Hadrons/Modules/MSource/Wall.hpp
+++ b/extras/Hadrons/Modules/MSource/Wall.hpp
@@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#ifndef Hadrons_WallSource_hpp_
-#define Hadrons_WallSource_hpp_
+#ifndef Hadrons_MSource_WallSource_hpp_
+#define Hadrons_MSource_WallSource_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -64,7 +64,7 @@ template <typename FImpl>
 class TWall: public Module<WallPar>
 {
 public:
-    TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
 public:
     // constructor
     TWall(const std::string name);
@@ -144,4 +144,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_WallSource_hpp_
+#endif // Hadrons_MSource_WallSource_hpp_
diff --git a/extras/Hadrons/Modules/MSource/Z2.hpp b/extras/Hadrons/Modules/MSource/Z2.hpp
index 761ae139..a7f7a3e6 100644
--- a/extras/Hadrons/Modules/MSource/Z2.hpp
+++ b/extras/Hadrons/Modules/MSource/Z2.hpp
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#ifndef Hadrons_Z2_hpp_
-#define Hadrons_Z2_hpp_
+#ifndef Hadrons_MSource_Z2_hpp_
+#define Hadrons_MSource_Z2_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -149,4 +149,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_Z2_hpp_
+#endif // Hadrons_MSource_Z2_hpp_
diff --git a/extras/Hadrons/Modules/Quark.hpp b/extras/Hadrons/Modules/Quark.hpp
index c0d1f65a..cf7d4c28 100644
--- a/extras/Hadrons/Modules/Quark.hpp
+++ b/extras/Hadrons/Modules/Quark.hpp
@@ -51,7 +51,7 @@ template <typename FImpl>
 class TQuark: public Module<QuarkPar>
 {
 public:
-    TYPE_ALIASES(FImpl,);
+    FGS_TYPE_ALIASES(FImpl,);
 public:
     // constructor
     TQuark(const std::string name);
diff --git a/extras/Hadrons/Modules/templates/Module_in_NS.hpp.template b/extras/Hadrons/Modules/templates/Module_in_NS.hpp.template
index ece2bb58..ea77b12a 100644
--- a/extras/Hadrons/Modules/templates/Module_in_NS.hpp.template
+++ b/extras/Hadrons/Modules/templates/Module_in_NS.hpp.template
@@ -1,5 +1,5 @@
-#ifndef Hadrons____FILEBASENAME____hpp_
-#define Hadrons____FILEBASENAME____hpp_
+#ifndef Hadrons____NAMESPACE_______FILEBASENAME____hpp_
+#define Hadrons____NAMESPACE_______FILEBASENAME____hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -41,4 +41,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons____FILEBASENAME____hpp_
+#endif // Hadrons____NAMESPACE_______FILEBASENAME____hpp_
diff --git a/extras/Hadrons/Modules/templates/Module_tmp_in_NS.hpp.template b/extras/Hadrons/Modules/templates/Module_tmp_in_NS.hpp.template
index a330652d..b79c0ad3 100644
--- a/extras/Hadrons/Modules/templates/Module_tmp_in_NS.hpp.template
+++ b/extras/Hadrons/Modules/templates/Module_tmp_in_NS.hpp.template
@@ -1,5 +1,5 @@
-#ifndef Hadrons____FILEBASENAME____hpp_
-#define Hadrons____FILEBASENAME____hpp_
+#ifndef Hadrons____NAMESPACE_______FILEBASENAME____hpp_
+#define Hadrons____NAMESPACE_______FILEBASENAME____hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -82,4 +82,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons____FILEBASENAME____hpp_
+#endif // Hadrons____NAMESPACE_______FILEBASENAME____hpp_
diff --git a/extras/Hadrons/modules.inc b/extras/Hadrons/modules.inc
index 3cf69144..f51ede5a 100644
--- a/extras/Hadrons/modules.inc
+++ b/extras/Hadrons/modules.inc
@@ -28,6 +28,7 @@ modules_hpp =\
   Modules/MScalar/ChargedProp.hpp \
   Modules/MScalar/FreeProp.hpp \
   Modules/MScalar/Scalar.hpp \
+  Modules/MSink/Point.hpp \
   Modules/MSolver/RBPrecCG.hpp \
   Modules/MSource/Point.hpp \
   Modules/MSource/SeqGamma.hpp \
diff --git a/tests/hadrons/Test_hadrons_spectrum.cc b/tests/hadrons/Test_hadrons_spectrum.cc
index 55f3346e..8f7b30c8 100644
--- a/tests/hadrons/Test_hadrons_spectrum.cc
+++ b/tests/hadrons/Test_hadrons_spectrum.cc
@@ -63,6 +63,10 @@ int main(int argc, char *argv[])
     MSource::Point::Par ptPar;
     ptPar.position = "0 0 0 0";
     application.createModule<MSource::Point>("pt", ptPar);
+    // sink
+    MSink::Point::Par sinkPar;
+    sinkPar.mom = "0 0 0";
+    application.createModule<MSink::ScalarPoint>("sink", sinkPar);
     
     // set fermion boundary conditions to be periodic space, antiperiodic time.
     std::string boundary = "1 1 1 -1";
@@ -98,19 +102,19 @@ int main(int argc, char *argv[])
     {
         MContraction::Meson::Par mesPar;
         
-        mesPar.output = "mesons/pt_" + flavour[i] + flavour[j];
-        mesPar.q1     = "Qpt_" + flavour[i];
-        mesPar.q2     = "Qpt_" + flavour[j];
-        mesPar.gammas = "all";
-        mesPar.mom    = "0. 0. 0. 0.";
+        mesPar.output  = "mesons/pt_" + flavour[i] + flavour[j];
+        mesPar.q1      = "Qpt_" + flavour[i];
+        mesPar.q2      = "Qpt_" + flavour[j];
+        mesPar.gammas  = "all";
+        mesPar.sink    = "sink";
         application.createModule<MContraction::Meson>("meson_pt_"
                                                       + flavour[i] + flavour[j],
                                                       mesPar);
-        mesPar.output = "mesons/Z2_" + flavour[i] + flavour[j];
-        mesPar.q1     = "QZ2_" + flavour[i];
-        mesPar.q2     = "QZ2_" + flavour[j];
-        mesPar.gammas = "all";
-        mesPar.mom    = "0. 0. 0. 0.";
+        mesPar.output  = "mesons/Z2_" + flavour[i] + flavour[j];
+        mesPar.q1      = "QZ2_" + flavour[i];
+        mesPar.q2      = "QZ2_" + flavour[j];
+        mesPar.gammas  = "all";
+        mesPar.sink    = "sink";
         application.createModule<MContraction::Meson>("meson_Z2_"
                                                       + flavour[i] + flavour[j],
                                                       mesPar);

From 5f55bca378f0e379b8595a82d096e79e8a7ed92d Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Wed, 7 Jun 2017 20:10:48 -0500
Subject: [PATCH 062/177] Hadrons: Quark module renamed MFermion::GaugeProp

---
 extras/Hadrons/Modules.hpp                    |  2 +-
 .../{Quark.hpp => MFermion/GaugeProp.hpp}     | 79 +++++++------------
 extras/Hadrons/modules.inc                    |  4 +-
 tests/hadrons/Test_hadrons_meson_3pt.cc       | 14 ++--
 tests/hadrons/Test_hadrons_spectrum.cc        |  6 +-
 5 files changed, 42 insertions(+), 63 deletions(-)
 rename extras/Hadrons/Modules/{Quark.hpp => MFermion/GaugeProp.hpp} (65%)

diff --git a/extras/Hadrons/Modules.hpp b/extras/Hadrons/Modules.hpp
index 42a1f651..c27254aa 100644
--- a/extras/Hadrons/Modules.hpp
+++ b/extras/Hadrons/Modules.hpp
@@ -8,6 +8,7 @@
 #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp>
 #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp>
 #include <Grid/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp>
+#include <Grid/Hadrons/Modules/MFermion/GaugeProp.hpp>
 #include <Grid/Hadrons/Modules/MGauge/Load.hpp>
 #include <Grid/Hadrons/Modules/MGauge/Random.hpp>
 #include <Grid/Hadrons/Modules/MGauge/StochEm.hpp>
@@ -22,4 +23,3 @@
 #include <Grid/Hadrons/Modules/MSource/SeqGamma.hpp>
 #include <Grid/Hadrons/Modules/MSource/Wall.hpp>
 #include <Grid/Hadrons/Modules/MSource/Z2.hpp>
-#include <Grid/Hadrons/Modules/Quark.hpp>
diff --git a/extras/Hadrons/Modules/Quark.hpp b/extras/Hadrons/Modules/MFermion/GaugeProp.hpp
similarity index 65%
rename from extras/Hadrons/Modules/Quark.hpp
rename to extras/Hadrons/Modules/MFermion/GaugeProp.hpp
index cf7d4c28..b4f9edcc 100644
--- a/extras/Hadrons/Modules/Quark.hpp
+++ b/extras/Hadrons/Modules/MFermion/GaugeProp.hpp
@@ -1,34 +1,5 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid 
-
-Source file: extras/Hadrons/Modules/Quark.hpp
-
-Copyright (C) 2015
-Copyright (C) 2016
-
-Author: Antonin Portelli <antonin.portelli@me.com>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#ifndef Hadrons_Quark_hpp_
-#define Hadrons_Quark_hpp_
+#ifndef Hadrons_MFermion_GaugeProp_hpp_
+#define Hadrons_MFermion_GaugeProp_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -37,27 +8,29 @@ See the full license in the file "LICENSE" in the top level distribution directo
 BEGIN_HADRONS_NAMESPACE
 
 /******************************************************************************
- *                               TQuark                                       *
+ *                                GaugeProp                                   *
  ******************************************************************************/
-class QuarkPar: Serializable
+BEGIN_MODULE_NAMESPACE(MFermion)
+
+class GaugePropPar: Serializable
 {
 public:
-    GRID_SERIALIZABLE_CLASS_MEMBERS(QuarkPar,
+    GRID_SERIALIZABLE_CLASS_MEMBERS(GaugePropPar,
                                     std::string, source,
                                     std::string, solver);
 };
 
 template <typename FImpl>
-class TQuark: public Module<QuarkPar>
+class TGaugeProp: public Module<GaugePropPar>
 {
 public:
     FGS_TYPE_ALIASES(FImpl,);
 public:
     // constructor
-    TQuark(const std::string name);
+    TGaugeProp(const std::string name);
     // destructor
-    virtual ~TQuark(void) = default;
-    // dependencies/products
+    virtual ~TGaugeProp(void) = default;
+    // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
     // setup
@@ -69,20 +42,20 @@ private:
     SolverFn     *solver_{nullptr};
 };
 
-MODULE_REGISTER(Quark, TQuark<FIMPL>);
+MODULE_REGISTER_NS(GaugeProp, TGaugeProp<FIMPL>, MFermion);
 
 /******************************************************************************
- *                          TQuark implementation                             *
+ *                      TGaugeProp implementation                             *
  ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 template <typename FImpl>
-TQuark<FImpl>::TQuark(const std::string name)
-: Module(name)
+TGaugeProp<FImpl>::TGaugeProp(const std::string name)
+: Module<GaugePropPar>(name)
 {}
 
 // dependencies/products ///////////////////////////////////////////////////////
 template <typename FImpl>
-std::vector<std::string> TQuark<FImpl>::getInput(void)
+std::vector<std::string> TGaugeProp<FImpl>::getInput(void)
 {
     std::vector<std::string> in = {par().source, par().solver};
     
@@ -90,7 +63,7 @@ std::vector<std::string> TQuark<FImpl>::getInput(void)
 }
 
 template <typename FImpl>
-std::vector<std::string> TQuark<FImpl>::getOutput(void)
+std::vector<std::string> TGaugeProp<FImpl>::getOutput(void)
 {
     std::vector<std::string> out = {getName(), getName() + "_5d"};
     
@@ -99,7 +72,7 @@ std::vector<std::string> TQuark<FImpl>::getOutput(void)
 
 // setup ///////////////////////////////////////////////////////////////////////
 template <typename FImpl>
-void TQuark<FImpl>::setup(void)
+void TGaugeProp<FImpl>::setup(void)
 {
     Ls_ = env().getObjectLs(par().solver);
     env().template registerLattice<PropagatorField>(getName());
@@ -111,13 +84,13 @@ void TQuark<FImpl>::setup(void)
 
 // execution ///////////////////////////////////////////////////////////////////
 template <typename FImpl>
-void TQuark<FImpl>::execute(void)
+void TGaugeProp<FImpl>::execute(void)
 {
     LOG(Message) << "Computing quark propagator '" << getName() << "'"
-                 << std::endl;
+    << std::endl;
     
     FermionField    source(env().getGrid(Ls_)), sol(env().getGrid(Ls_)),
-                    tmp(env().getGrid());
+    tmp(env().getGrid());
     std::string     propName = (Ls_ == 1) ? getName() : (getName() + "_5d");
     PropagatorField &prop    = *env().template createLattice<PropagatorField>(propName);
     PropagatorField &fullSrc = *env().template getObject<PropagatorField>(par().source);
@@ -128,12 +101,12 @@ void TQuark<FImpl>::execute(void)
     }
     
     LOG(Message) << "Inverting using solver '" << par().solver
-                 << "' on source '" << par().source << "'" << std::endl;
+    << "' on source '" << par().source << "'" << std::endl;
     for (unsigned int s = 0; s < Ns; ++s)
     for (unsigned int c = 0; c < Nc; ++c)
     {
         LOG(Message) << "Inversion for spin= " << s << ", color= " << c
-                     << std::endl;
+        << std::endl;
         // source conversion for 4D sources
         if (!env().isObject5d(par().source))
         {
@@ -170,7 +143,7 @@ void TQuark<FImpl>::execute(void)
         if (Ls_ > 1)
         {
             PropagatorField &p4d =
-                *env().template getObject<PropagatorField>(getName());
+            *env().template getObject<PropagatorField>(getName());
             
             axpby_ssp_pminus(sol, 0., sol, 1., sol, 0, 0);
             axpby_ssp_pplus(sol, 1., sol, 1., sol, 0, Ls_-1);
@@ -180,6 +153,8 @@ void TQuark<FImpl>::execute(void)
     }
 }
 
+END_MODULE_NAMESPACE
+
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_Quark_hpp_
+#endif // Hadrons_MFermion_GaugeProp_hpp_
diff --git a/extras/Hadrons/modules.inc b/extras/Hadrons/modules.inc
index f51ede5a..669b08ba 100644
--- a/extras/Hadrons/modules.inc
+++ b/extras/Hadrons/modules.inc
@@ -20,6 +20,7 @@ modules_hpp =\
   Modules/MContraction/WeakHamiltonianEye.hpp \
   Modules/MContraction/WeakHamiltonianNonEye.hpp \
   Modules/MContraction/WeakNeutral4ptDisc.hpp \
+  Modules/MFermion/GaugeProp.hpp \
   Modules/MGauge/Load.hpp \
   Modules/MGauge/Random.hpp \
   Modules/MGauge/StochEm.hpp \
@@ -33,6 +34,5 @@ modules_hpp =\
   Modules/MSource/Point.hpp \
   Modules/MSource/SeqGamma.hpp \
   Modules/MSource/Wall.hpp \
-  Modules/MSource/Z2.hpp \
-  Modules/Quark.hpp
+  Modules/MSource/Z2.hpp
 
diff --git a/tests/hadrons/Test_hadrons_meson_3pt.cc b/tests/hadrons/Test_hadrons_meson_3pt.cc
index 7e487153..382c39d4 100644
--- a/tests/hadrons/Test_hadrons_meson_3pt.cc
+++ b/tests/hadrons/Test_hadrons_meson_3pt.cc
@@ -65,6 +65,10 @@ int main(int argc, char *argv[])
     // set fermion boundary conditions to be periodic space, antiperiodic time.
     std::string boundary = "1 1 1 -1";
 
+    // sink
+    MSink::Point::Par sinkPar;
+    sinkPar.mom = "0 0 0";
+    application.createModule<MSink::ScalarPoint>("sink", sinkPar);
     for (unsigned int i = 0; i < flavour.size(); ++i)
     {
         // actions
@@ -115,15 +119,15 @@ int main(int argc, char *argv[])
             }
             
             // propagators
-            Quark::Par quarkPar;
+            MFermion::GaugeProp::Par quarkPar;
             quarkPar.solver = "CG_" + flavour[i];
             quarkPar.source = srcName;
-            application.createModule<Quark>(qName[i], quarkPar);
+            application.createModule<MFermion::GaugeProp>(qName[i], quarkPar);
             for (unsigned int mu = 0; mu < Nd; ++mu)
             {
                 quarkPar.source = seqName[i][mu];
                 seqName[i][mu]  = "Q_" + flavour[i] + "-" + seqName[i][mu];
-                application.createModule<Quark>(seqName[i][mu], quarkPar);
+                application.createModule<MFermion::GaugeProp>(seqName[i][mu], quarkPar);
             }
         }
         
@@ -136,7 +140,7 @@ int main(int argc, char *argv[])
             mesPar.q1     = qName[i];
             mesPar.q2     = qName[j];
             mesPar.gammas = "all";
-            mesPar.mom    = "0. 0. 0. 0.";
+            mesPar.sink   = "sink";
             application.createModule<MContraction::Meson>("meson_Z2_"
                                                           + std::to_string(t)
                                                           + "_"
@@ -155,7 +159,7 @@ int main(int argc, char *argv[])
             mesPar.q1     = qName[i];
             mesPar.q2     = seqName[j][mu];
             mesPar.gammas = "all";
-            mesPar.mom    = "0. 0. 0. 0.";
+            mesPar.sink   = "sink";
             application.createModule<MContraction::Meson>("3pt_Z2_"
                                                           + std::to_string(t)
                                                           + "_"
diff --git a/tests/hadrons/Test_hadrons_spectrum.cc b/tests/hadrons/Test_hadrons_spectrum.cc
index 8f7b30c8..801674f7 100644
--- a/tests/hadrons/Test_hadrons_spectrum.cc
+++ b/tests/hadrons/Test_hadrons_spectrum.cc
@@ -90,12 +90,12 @@ int main(int argc, char *argv[])
                                                     solverPar);
         
         // propagators
-        Quark::Par quarkPar;
+        MFermion::GaugeProp::Par quarkPar;
         quarkPar.solver = "CG_" + flavour[i];
         quarkPar.source = "pt";
-        application.createModule<Quark>("Qpt_" + flavour[i], quarkPar);
+        application.createModule<MFermion::GaugeProp>("Qpt_" + flavour[i], quarkPar);
         quarkPar.source = "z2";
-        application.createModule<Quark>("QZ2_" + flavour[i], quarkPar);
+        application.createModule<MFermion::GaugeProp>("QZ2_" + flavour[i], quarkPar);
     }
     for (unsigned int i = 0; i < flavour.size(); ++i)
     for (unsigned int j = i; j < flavour.size(); ++j)

From 24908162970faae02a878ce3298d3ebc79a47fb9 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Wed, 7 Jun 2017 20:11:02 -0500
Subject: [PATCH 063/177] Hadrons: rare kaon program removed

---
 tests/hadrons/Test_hadrons.hpp         | 368 -------------------------
 tests/hadrons/Test_hadrons_rarekaon.cc | 342 -----------------------
 2 files changed, 710 deletions(-)
 delete mode 100644 tests/hadrons/Test_hadrons.hpp
 delete mode 100644 tests/hadrons/Test_hadrons_rarekaon.cc

diff --git a/tests/hadrons/Test_hadrons.hpp b/tests/hadrons/Test_hadrons.hpp
deleted file mode 100644
index 26d02a5c..00000000
--- a/tests/hadrons/Test_hadrons.hpp
+++ /dev/null
@@ -1,368 +0,0 @@
-/*******************************************************************************
- Grid physics library, www.github.com/paboyle/Grid
-
- Source file: tests/hadrons/Test_hadrons.hpp
-
- Copyright (C) 2017
-
- Author: Andrew Lawson <andrew.lawson1991@gmail.com>
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License along
- with this program; if not, write to the Free Software Foundation, Inc.,
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
- See the full license in the file "LICENSE" in the top level distribution
- directory.
- *******************************************************************************/
-
-#include <Grid/Hadrons/Application.hpp>
-
-using namespace Grid;
-using namespace Hadrons;
-
-/*******************************************************************************
- * Macros to reduce code duplication.
- ******************************************************************************/
-// Useful definitions
-#define ZERO_MOM "0. 0. 0. 0."
-#define INIT_INDEX(s, n) (std::string(s) + "_" + std::to_string(n))
-#define ADD_INDEX(s, n) (s + "_" + std::to_string(n))
-#define LABEL_3PT(s, t1, t2) ADD_INDEX(INIT_INDEX(s, t1), t2)
-#define LABEL_4PT(s, t1, t2, t3) ADD_INDEX(ADD_INDEX(INIT_INDEX(s, t1), t2), t3)
-#define LABEL_4PT_NOISE(s, t1, t2, t3, nn) ADD_INDEX(ADD_INDEX(ADD_INDEX(INIT_INDEX(s, t1), t2), t3), nn)
-
-// Wall source/sink macros
-#define NAME_3MOM_WALL_SOURCE(t, mom) ("wall_" + std::to_string(t) + "_" + mom)
-#define NAME_WALL_SOURCE(t) NAME_3MOM_WALL_SOURCE(t, ZERO_MOM)
-#define NAME_POINT_SOURCE(pos) ("point_" + pos)
-
-#define MAKE_3MOM_WALL_PROP(tW, mom, propName, solver)\
-{\
-    std::string srcName = NAME_3MOM_WALL_SOURCE(tW, mom);\
-    makeWallSource(application, srcName, tW, mom);\
-    makePropagator(application, propName, srcName, solver);\
-}
-
-#define MAKE_WALL_PROP(tW, propName, solver)\
-        MAKE_3MOM_WALL_PROP(tW, ZERO_MOM, propName, solver)
-
-// Sequential source macros
-#define MAKE_SEQUENTIAL_PROP(tS, qSrc, mom, propName, solver)\
-{\
-    std::string srcName = ADD_INDEX(qSrc + "_seq", tS);\
-    makeSequentialSource(application, srcName, qSrc, tS, mom);\
-    makePropagator(application, propName, srcName, solver);\
-}
-
-// Point source macros
-#define MAKE_POINT_PROP(pos, propName, solver)\
-{\
-    std::string srcName = NAME_POINT_SOURCE(pos);\
-    makePointSource(application, srcName, pos);\
-    makePropagator(application, propName, srcName, solver);\
-}
-
-/*******************************************************************************
- * Functions for propagator construction.
- ******************************************************************************/
- 
-/*******************************************************************************
- * Name: makePointSource
- * Purpose: Construct point source and add to application module.
- * Parameters: application - main application that stores modules.
- *             srcName     - name of source module to create.
- *             pos         - Position of point source.
- * Returns: None.
- ******************************************************************************/
-inline void makePointSource(Application &application, std::string srcName,
-                            std::string pos)
-{
-    // If the source already exists, don't make the module again.
-    if (!(Environment::getInstance().hasModule(srcName)))
-    {
-        MSource::Point::Par pointPar;
-        pointPar.position = pos;
-        application.createModule<MSource::Point>(srcName, pointPar);
-    }
-}
-
-/*******************************************************************************
- * Name: makeSequentialSource
- * Purpose: Construct sequential source and add to application module.
- * Parameters: application - main application that stores modules.
- *             srcName     - name of source module to create.
- *             qSrc        - Input quark for sequential inversion.
- *             tS          - sequential source timeslice.
- *             mom         - momentum insertion (default is zero).
- * Returns: None.
- ******************************************************************************/
-inline void makeSequentialSource(Application &application, std::string srcName,
-                                 std::string qSrc, unsigned int tS,
-                                 std::string mom = ZERO_MOM)
-{
-    // If the source already exists, don't make the module again.
-    if (!(Environment::getInstance().hasModule(srcName)))
-    {
-        MSource::SeqGamma::Par seqPar;
-        seqPar.q   = qSrc;
-        seqPar.tA  = tS;
-        seqPar.tB  = tS;
-        seqPar.mom = mom;
-        seqPar.gamma = Gamma::Algebra::GammaT;
-        application.createModule<MSource::SeqGamma>(srcName, seqPar);
-    }
-}
-
-/*******************************************************************************
- * Name: makeWallSource
- * Purpose: Construct wall source and add to application module.
- * Parameters: application - main application that stores modules.
- *             srcName     - name of source module to create.
- *             tW          - wall source timeslice.
- *             mom         - momentum insertion (default is zero).
- * Returns: None.
- ******************************************************************************/
-inline void makeWallSource(Application &application, std::string srcName,
-                           unsigned int tW, std::string mom = ZERO_MOM)
-{
-    // If the source already exists, don't make the module again.
-    if (!(Environment::getInstance().hasModule(srcName)))
-    {
-        MSource::Wall::Par wallPar;
-        wallPar.tW  = tW;
-        wallPar.mom = mom;
-        application.createModule<MSource::Wall>(srcName, wallPar);
-    }
-}
-
-/*******************************************************************************
- * Name: makeWallSink
- * Purpose: Wall sink smearing of a propagator.
- * Parameters: application - main application that stores modules.
- *             propName    - name of input propagator.
- *             wallName    - name of smeared propagator.
- *             mom         - momentum insertion (default is zero).
- * Returns: None.
- ******************************************************************************/
-inline void makeWallSink(Application &application, std::string propName,
-                         std::string wallName, std::string mom = ZERO_MOM)
-{
-    // If the propagator has already been smeared, don't smear it again.
-    // Temporarily removed, strategy for sink smearing likely to change.
-    /*if (!(Environment::getInstance().hasModule(wallName)))
-    {
-        MSink::Wall::Par wallPar;
-        wallPar.q   = propName;
-        wallPar.mom = mom;
-        application.createModule<MSink::Wall>(wallName, wallPar);
-    }*/
-}
-
-/*******************************************************************************
- * Name: makePropagator
- * Purpose: Construct source and propagator then add to application module.
- * Parameters: application - main application that stores modules.
- *             propName    - name of propagator module to create.
- *             srcName     - name of source module to use.
- *             solver      - solver to use (default is CG).
- * Returns: None.
- ******************************************************************************/
-inline void makePropagator(Application &application, std::string &propName,
-                           std::string &srcName, std::string &solver)
-{
-    // If the propagator already exists, don't make the module again.
-    if (!(Environment::getInstance().hasModule(propName)))
-    {
-        Quark::Par         quarkPar;
-        quarkPar.source = srcName;
-        quarkPar.solver = solver;
-        application.createModule<Quark>(propName, quarkPar);
-    }
-}
-
-/*******************************************************************************
- * Name: makeLoop
- * Purpose: Use noise source and inversion result to make loop propagator, then 
- *          add to application module.
- * Parameters: application - main application that stores modules.
- *             propName    - name of propagator module to create.
- *             srcName     - name of noise source module to use.
- *             resName     - name of inversion result on given noise source.
- * Returns: None.
- ******************************************************************************/
-inline void makeLoop(Application &application, std::string &propName,
-                     std::string &srcName, std::string &resName)
-{
-    // If the loop propagator already exists, don't make the module again.
-    if (!(Environment::getInstance().hasModule(propName)))
-    {
-        MLoop::NoiseLoop::Par loopPar;
-        loopPar.q   = resName;
-        loopPar.eta = srcName;
-        application.createModule<MLoop::NoiseLoop>(propName, loopPar);
-    }
-}
-
-/*******************************************************************************
- * Contraction module creation.
- ******************************************************************************/
-
-/*******************************************************************************
- * Name: mesonContraction
- * Purpose: Create meson contraction module and add to application module.
- * Parameters: application - main application that stores modules.
- *             npt         - specify n-point correlator (for labelling).
- *             q1          - quark propagator 1.
- *             q2          - quark propagator 2.
- *             label       - unique label to construct module name.
- *             mom         - momentum to project (default is zero)
- *             gammas      - gamma insertions at source and sink.
- * Returns: None.
- ******************************************************************************/
-inline void mesonContraction(Application &application, unsigned int npt, 
-                             std::string &q1, std::string &q2,
-                             std::string &label, 
-                             std::string mom = ZERO_MOM,
-                             std::string gammas = "<Gamma5 Gamma5>")
-{
-    std::string modName = std::to_string(npt) + "pt_" + label;
-    if (!(Environment::getInstance().hasModule(modName)))
-    {
-        MContraction::Meson::Par mesPar;
-        mesPar.output = std::to_string(npt) + "pt/" + label;
-        mesPar.q1 = q1;
-        mesPar.q2 = q2;
-        mesPar.mom = mom;
-        mesPar.gammas = gammas;
-        application.createModule<MContraction::Meson>(modName, mesPar);
-    }
- }
-
-/*******************************************************************************
- * Name: gamma3ptContraction
- * Purpose: Create gamma3pt contraction module and add to application module.
- * Parameters: application - main application that stores modules.
- *             npt         - specify n-point correlator (for labelling).
- *             q1          - quark propagator 1.
- *             q2          - quark propagator 2.
- *             q3          - quark propagator 3.
- *             label       - unique label to construct module name.
- *             gamma       - gamma insertions between q2 and q3.
- * Returns: None.
- ******************************************************************************/
-inline void gamma3ptContraction(Application &application, unsigned int npt, 
-                                std::string &q1, std::string &q2,
-                                std::string &q3, std::string &label, 
-                                Gamma::Algebra gamma = Gamma::Algebra::Identity)
-{
-    std::string modName = std::to_string(npt) + "pt_" + label;
-    if (!(Environment::getInstance().hasModule(modName)))
-    {
-        MContraction::Gamma3pt::Par gamma3ptPar;
-        gamma3ptPar.output = std::to_string(npt) + "pt/" + label;
-        gamma3ptPar.q1 = q1;
-        gamma3ptPar.q2 = q2;
-        gamma3ptPar.q3 = q3;
-        gamma3ptPar.gamma = gamma;
-        application.createModule<MContraction::Gamma3pt>(modName, gamma3ptPar);
-    }
- }
-
-/*******************************************************************************
- * Name: weakContraction[Eye,NonEye]
- * Purpose: Create Weak Hamiltonian contraction module for Eye/NonEye topology
- *          and add to application module.
- * Parameters: application - main application that stores modules.
- *             npt         - specify n-point correlator (for labelling).
- *             q1          - quark propagator 1.
- *             q2          - quark propagator 2.
- *             q3          - quark propagator 3.
- *             q4          - quark propagator 4.
- *             label       - unique label to construct module name.
- * Returns: None.
- ******************************************************************************/
-#define HW_CONTRACTION(top) \
-inline void weakContraction##top(Application &application, unsigned int npt,\
-                                 std::string &q1, std::string &q2, \
-                                 std::string &q3, std::string &q4, \
-                                 std::string &label)\
-{\
-    std::string modName = std::to_string(npt) + "pt_" + label;\
-    if (!(Environment::getInstance().hasModule(modName)))\
-    {\
-        MContraction::WeakHamiltonian##top::Par weakPar;\
-        weakPar.output = std::to_string(npt) + "pt/" + label;\
-        weakPar.q1 = q1;\
-        weakPar.q2 = q2;\
-        weakPar.q3 = q3;\
-        weakPar.q4 = q4;\
-        application.createModule<MContraction::WeakHamiltonian##top>(modName, weakPar);\
-    }\
-}
-HW_CONTRACTION(Eye)    // weakContractionEye
-HW_CONTRACTION(NonEye) // weakContractionNonEye
-
-/*******************************************************************************
- * Name: disc0Contraction
- * Purpose: Create contraction module for 4pt Weak Hamiltonian + current
- *          disconnected topology for neutral mesons and add to application 
- *          module.
- * Parameters: application - main application that stores modules.
- *             q1          - quark propagator 1.
- *             q2          - quark propagator 2.
- *             q3          - quark propagator 3.
- *             q4          - quark propagator 4.
- *             label       - unique label to construct module name.
- * Returns: None.
- ******************************************************************************/
-inline void disc0Contraction(Application &application, 
-                             std::string &q1, std::string &q2,
-                             std::string &q3, std::string &q4,
-                             std::string &label)
-{
-    std::string modName = "4pt_" + label;
-    if (!(Environment::getInstance().hasModule(modName)))
-    {
-        MContraction::WeakNeutral4ptDisc::Par disc0Par;
-        disc0Par.output = "4pt/" + label;
-        disc0Par.q1 = q1;
-        disc0Par.q2 = q2;
-        disc0Par.q3 = q3;
-        disc0Par.q4 = q4;
-        application.createModule<MContraction::WeakNeutral4ptDisc>(modName, disc0Par);
-    }
- }
-
-/*******************************************************************************
- * Name: discLoopContraction
- * Purpose: Create contraction module for disconnected loop and add to
- *          application module.
- * Parameters: application - main application that stores modules.
- *             q_loop      - loop quark propagator.
- *             modName     - unique module name.
- *             gamma       - gamma matrix to use in contraction.
- * Returns: None.
- ******************************************************************************/
-inline void discLoopContraction(Application &application,
-                                std::string &q_loop, std::string &modName,
-                                Gamma::Algebra gamma = Gamma::Algebra::Identity)
-{
-    if (!(Environment::getInstance().hasModule(modName)))
-    {
-        MContraction::DiscLoop::Par discPar;
-        discPar.output = "disc/" + modName;
-        discPar.q_loop = q_loop;
-        discPar.gamma  = gamma;
-        application.createModule<MContraction::DiscLoop>(modName, discPar);
-    }
- }
diff --git a/tests/hadrons/Test_hadrons_rarekaon.cc b/tests/hadrons/Test_hadrons_rarekaon.cc
deleted file mode 100644
index ab4d3ef1..00000000
--- a/tests/hadrons/Test_hadrons_rarekaon.cc
+++ /dev/null
@@ -1,342 +0,0 @@
-/*******************************************************************************
- Grid physics library, www.github.com/paboyle/Grid
-
- Source file: tests/hadrons/Test_hadrons_rarekaon.cc
-
- Copyright (C) 2017
-
- Author: Andrew Lawson <andrew.lawson1991@gmail.com>
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License along
- with this program; if not, write to the Free Software Foundation, Inc.,
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
- See the full license in the file "LICENSE" in the top level distribution
- directory.
- *******************************************************************************/
-
-#include "Test_hadrons.hpp"
-
-using namespace Grid;
-using namespace Hadrons;
-
-enum quarks
-{
-   light   = 0,
-   strange = 1,
-   charm   = 2  
-};
-
-int main(int argc, char *argv[])
-{
-    // parse command line //////////////////////////////////////////////////////
-    std::string configStem;
-    
-    if (argc < 2)
-    {
-        std::cerr << "usage: " << argv[0] << " <configuration filestem> [Grid options]";
-        std::cerr << std::endl;
-        std::exit(EXIT_FAILURE);
-    }
-    configStem = argv[1];
-    
-    // initialization //////////////////////////////////////////////////////////
-    Grid_init(&argc, &argv);
-    HadronsLogError.Active(GridLogError.isActive());
-    HadronsLogWarning.Active(GridLogWarning.isActive());
-    HadronsLogMessage.Active(GridLogMessage.isActive());
-    HadronsLogIterative.Active(GridLogIterative.isActive());
-    HadronsLogDebug.Active(GridLogDebug.isActive());
-    LOG(Message) << "Grid initialized" << std::endl;
-
-    // run setup ///////////////////////////////////////////////////////////////
-    Application              application;
-    std::vector<double>       mass    = {.01, .04, .2};
-    std::vector<std::string>  flavour = {"l", "s", "c"};
-    std::vector<std::string>  solvers = {"CG_l", "CG_s", "CG_c"};
-    std::string               kmom    = "0. 0. 0. 0.";
-    std::string               pmom    = "1. 0. 0. 0.";
-    std::string               qmom    = "-1. 0. 0. 0.";
-    std::string               mqmom   = "1. 0. 0. 0.";
-    std::vector<unsigned int> tKs     = {0};
-    unsigned int              dt_pi   = 16;
-    std::vector<unsigned int> tJs     = {8};
-    unsigned int              n_noise = 1;
-    unsigned int              nt      = 32;
-    bool                      do_disconnected(false);
-
-    // Global parameters.
-    Application::GlobalPar globalPar;
-    globalPar.trajCounter.start    = 1500;
-    globalPar.trajCounter.end      = 1520;
-    globalPar.trajCounter.step     = 20;
-    globalPar.seed                 = "1 2 3 4";
-    globalPar.genetic.maxGen       = 1000;
-    globalPar.genetic.maxCstGen    = 200;
-    globalPar.genetic.popSize      = 20;
-    globalPar.genetic.mutationRate = .1;
-    application.setPar(globalPar);
-
-    // gauge field
-    if (configStem == "None")
-    {
-        application.createModule<MGauge::Unit>("gauge");
-    }
-    else
-    {
-        MGauge::Load::Par gaugePar;
-        gaugePar.file = configStem;
-        application.createModule<MGauge::Load>("gauge", gaugePar);
-    }
-    
-    // set fermion boundary conditions to be periodic space, antiperiodic time.
-    std::string boundary = "1 1 1 -1";
-
-    for (unsigned int i = 0; i < flavour.size(); ++i)
-    {
-        // actions
-        MAction::DWF::Par actionPar;
-        actionPar.gauge = "gauge";
-        actionPar.Ls    = 16;
-        actionPar.M5    = 1.8;
-        actionPar.mass  = mass[i];
-        actionPar.boundary = boundary;
-        application.createModule<MAction::DWF>("DWF_" + flavour[i], actionPar);
-
-        // solvers
-        // RBPrecCG -> CG
-        MSolver::RBPrecCG::Par solverPar;
-        solverPar.action   = "DWF_" + flavour[i];
-        solverPar.residual = 1.0e-8;
-        application.createModule<MSolver::RBPrecCG>(solvers[i],
-                                                    solverPar);
-    }
-
-    // Create noise propagators for loops.
-    std::vector<std::string> noiseSrcs;
-    std::vector<std::vector<std::string>> noiseRes;
-    std::vector<std::vector<std::string>> noiseProps;
-    if (n_noise > 0)
-    {
-        MSource::Z2::Par noisePar;
-        noisePar.tA = 0;
-        noisePar.tB = nt - 1;
-        std::string loop_stem = "loop_";
-
-        noiseRes.resize(flavour.size());
-        noiseProps.resize(flavour.size());
-        for (unsigned int nn = 0; nn < n_noise; ++nn)
-        {
-            std::string eta = INIT_INDEX("noise", nn);
-            application.createModule<MSource::Z2>(eta, noisePar);
-            noiseSrcs.push_back(eta);
-
-            for (unsigned int f = 0; f < flavour.size(); ++f)
-            {
-                std::string loop_prop = INIT_INDEX(loop_stem + flavour[f], nn);
-                std::string loop_res  = loop_prop + "_res";
-                makePropagator(application, loop_res, eta, solvers[f]);
-                makeLoop(application, loop_prop, eta, loop_res);
-                noiseRes[f].push_back(loop_res);
-                noiseProps[f].push_back(loop_prop);
-            }
-        }
-    }
-
-    // Translate rare kaon decay across specified timeslices.
-    for (unsigned int i = 0; i < tKs.size(); ++i)
-    {
-        // Zero-momentum wall source propagators for kaon and pion.
-        unsigned int tK     = tKs[i];
-        unsigned int tpi    = (tK + dt_pi) % nt;
-        std::string q_Kl_0  = INIT_INDEX("Q_l_0", tK);
-        std::string q_pil_0 = INIT_INDEX("Q_l_0", tpi);
-        MAKE_WALL_PROP(tK, q_Kl_0, solvers[light]);
-        MAKE_WALL_PROP(tpi, q_pil_0, solvers[light]);
-
-        // Wall sources for kaon and pion with momentum insertion. If either
-        // p or k are zero, or p = k, re-use the existing name to avoid 
-        // duplicating a propagator.
-        std::string q_Ks_k  = INIT_INDEX("Q_Ks_k", tK);
-        std::string q_Ks_p  = INIT_INDEX((kmom == pmom) ? "Q_Ks_k" : "Q_Ks_p", tK);
-        std::string q_pil_k = INIT_INDEX((kmom == ZERO_MOM) ? "Q_l_0" : "Q_l_k", tpi);
-        std::string q_pil_p = INIT_INDEX((pmom == kmom) ? q_pil_k : ((pmom == ZERO_MOM) ? "Q_l_0" : "Q_l_p"), tpi);
-        MAKE_3MOM_WALL_PROP(tK, kmom, q_Ks_k, solvers[strange]);
-        MAKE_3MOM_WALL_PROP(tK, pmom, q_Ks_p, solvers[strange]);
-        MAKE_3MOM_WALL_PROP(tpi, kmom, q_pil_k, solvers[light]);
-        MAKE_3MOM_WALL_PROP(tpi, pmom, q_pil_p, solvers[light]);
-
-        /***********************************************************************
-         * CONTRACTIONS: pi and K 2pt contractions with mom = p, k.
-         **********************************************************************/
-        // Wall-Point
-        std::string PW_K_k = INIT_INDEX("PW_K_k", tK);
-        std::string PW_K_p = INIT_INDEX("PW_K_p", tK);
-        std::string PW_pi_k = INIT_INDEX("PW_pi_k", tpi);
-        std::string PW_pi_p = INIT_INDEX("PW_pi_p", tpi);
-        mesonContraction(application, 2, q_Kl_0, q_Ks_k, PW_K_k, kmom);
-        mesonContraction(application, 2, q_Kl_0, q_Ks_p, PW_K_p, pmom);
-        mesonContraction(application, 2, q_pil_k, q_pil_0, PW_pi_k, kmom);
-        mesonContraction(application, 2, q_pil_p, q_pil_0, PW_pi_p, pmom);
-        // Wall-Wall, to be done - requires modification of meson module.
-
-        /***********************************************************************
-         * CONTRACTIONS: 3pt Weak Hamiltonian, C & W (non-Eye type) classes.
-         **********************************************************************/
-        std::string HW_CW_k = LABEL_3PT("HW_CW_k", tK, tpi);
-        std::string HW_CW_p = LABEL_3PT("HW_CW_p", tK, tpi);
-        weakContractionNonEye(application, 3, q_Kl_0, q_Ks_k, q_pil_k, q_pil_0, HW_CW_k);
-        weakContractionNonEye(application, 3, q_Kl_0, q_Ks_p, q_pil_p, q_pil_0, HW_CW_p);
-
-        /***********************************************************************
-         * CONTRACTIONS: 3pt sd insertion.
-         **********************************************************************/
-        // Note: eventually will use wall sink smeared q_Kl_0 instead.
-        std::string sd_k = LABEL_3PT("sd_k", tK, tpi);
-        std::string sd_p = LABEL_3PT("sd_p", tK, tpi);
-        gamma3ptContraction(application, 3, q_Kl_0, q_Ks_k, q_pil_k, sd_k);
-        gamma3ptContraction(application, 3, q_Kl_0, q_Ks_p, q_pil_p, sd_p);
-
-        for (unsigned int nn = 0; nn < n_noise; ++nn)
-        {
-            /*******************************************************************
-             * CONTRACTIONS: 3pt Weak Hamiltonian, S and E (Eye type) classes.
-             ******************************************************************/
-            // Note: eventually will use wall sink smeared q_Kl_0 instead.
-            for (unsigned int f = 0; f < flavour.size(); ++f)
-            {
-                if ((f != strange) || do_disconnected)
-                {
-                    std::string HW_SE_k = LABEL_3PT("HW_SE_k_" + flavour[f], tK, tpi);
-                    std::string HW_SE_p = LABEL_3PT("HW_SE_p_" + flavour[f], tK, tpi);
-                    std::string loop_q  = noiseProps[f][nn];
-                    weakContractionEye(application, 3, q_Kl_0, q_Ks_k, q_pil_k, loop_q, HW_CW_k);
-                    weakContractionEye(application, 3, q_Kl_0, q_Ks_p, q_pil_p, loop_q, HW_CW_p);
-                }
-            }
-        }
-
-        // Perform separate contractions for each t_J position.
-        for (unsigned int j = 0; j < tJs.size(); ++j)
-        {
-            // Sequential sources for current insertions. Local for now,
-            // gamma_0 only.
-            unsigned int tJ = (tJs[j] + tK) % nt;
-            MSource::SeqGamma::Par seqPar;
-            std::string q_KlCl_q   = LABEL_3PT("Q_KlCl_q", tK, tJ);
-            std::string q_KsCs_mq  = LABEL_3PT("Q_KsCs_mq", tK, tJ);
-            std::string q_pilCl_q  = LABEL_3PT("Q_pilCl_q", tpi, tJ);
-            std::string q_pilCl_mq = LABEL_3PT("Q_pilCl_mq", tpi, tJ);
-            MAKE_SEQUENTIAL_PROP(tJ, q_Kl_0, qmom, q_KlCl_q, solvers[light]);
-            MAKE_SEQUENTIAL_PROP(tJ, q_Ks_k, mqmom, q_KsCs_mq, solvers[strange]);
-            MAKE_SEQUENTIAL_PROP(tJ, q_pil_p, qmom, q_pilCl_q, solvers[light]);
-            MAKE_SEQUENTIAL_PROP(tJ, q_pil_0, mqmom, q_pilCl_mq, solvers[light]);
-
-            /*******************************************************************
-             * CONTRACTIONS: pi and K 3pt contractions with current insertion.
-             ******************************************************************/
-            // Wall-Point
-            std::string C_PW_Kl   = LABEL_3PT("C_PW_Kl", tK, tJ);
-            std::string C_PW_Ksb  = LABEL_3PT("C_PW_Ksb", tK, tJ);
-            std::string C_PW_pilb = LABEL_3PT("C_PW_pilb", tK, tJ);
-            std::string C_PW_pil  = LABEL_3PT("C_PW_pil", tK, tJ);
-            mesonContraction(application, 3, q_KlCl_q, q_Ks_k, C_PW_Kl, pmom);
-            mesonContraction(application, 3, q_Kl_0, q_KsCs_mq, C_PW_Ksb, pmom);
-            mesonContraction(application, 3, q_pil_0, q_pilCl_q, C_PW_pilb, kmom);
-            mesonContraction(application, 3, q_pilCl_mq, q_pil_p, C_PW_pil, kmom);
-            // Wall-Wall, to be done.
-
-            /*******************************************************************
-             * CONTRACTIONS: 4pt contractions, C & W classes.
-             ******************************************************************/
-            std::string CW_Kl   = LABEL_4PT("CW_Kl", tK, tJ, tpi);
-            std::string CW_Ksb  = LABEL_4PT("CW_Ksb", tK, tJ, tpi);
-            std::string CW_pilb = LABEL_4PT("CW_pilb", tK, tJ, tpi);
-            std::string CW_pil  = LABEL_4PT("CW_pil", tK, tJ, tpi);
-            weakContractionNonEye(application, 4, q_KlCl_q, q_Ks_k, q_pil_p, q_pil_0, CW_Kl);
-            weakContractionNonEye(application, 4, q_Kl_0, q_KsCs_mq, q_pil_p, q_pil_0, CW_Ksb);
-            weakContractionNonEye(application, 4, q_Kl_0, q_Ks_k, q_pilCl_q, q_pil_0, CW_pilb);
-            weakContractionNonEye(application, 4, q_Kl_0, q_Ks_k, q_pil_p, q_pilCl_mq, CW_pil);
-
-            /*******************************************************************
-             * CONTRACTIONS: 4pt contractions, sd insertions.
-             ******************************************************************/
-            // Note: eventually will use wall sink smeared q_Kl_0/q_KlCl_q instead.
-            std::string sd_Kl   = LABEL_4PT("sd_Kl", tK, tJ, tpi);
-            std::string sd_Ksb  = LABEL_4PT("sd_Ksb", tK, tJ, tpi);
-            std::string sd_pilb = LABEL_4PT("sd_pilb", tK, tJ, tpi);
-            gamma3ptContraction(application, 4, q_KlCl_q, q_Ks_k, q_pil_p, sd_Kl);
-            gamma3ptContraction(application, 4, q_Kl_0, q_KsCs_mq, q_pil_p, sd_Ksb);
-            gamma3ptContraction(application, 4, q_Kl_0, q_Ks_k, q_pilCl_q, sd_pilb);
-
-            // Sequential sources for each noise propagator.
-            for (unsigned int nn = 0; nn < n_noise; ++nn)
-            {
-                std::string loop_stem = "loop_";
-
-                // Contraction required for each quark flavour - alternatively
-                // drop the strange loop if not performing disconnected
-                // contractions or neglecting H_W operators Q_3 -> Q_10.
-                for (unsigned int f = 0; f < flavour.size(); ++f)
-                {
-                    if ((f != strange) || do_disconnected)
-                    {
-                        std::string eta      = noiseSrcs[nn];
-                        std::string loop_q   = noiseProps[f][nn];
-                        std::string loop_qCq = LABEL_3PT(loop_stem + flavour[f], tJ, nn);
-                        std::string loop_qCq_res = loop_qCq + "_res";
-                        MAKE_SEQUENTIAL_PROP(tJ, noiseRes[f][nn], qmom, 
-                                             loop_qCq_res, solvers[f]);
-                        makeLoop(application, loop_qCq, eta, loop_qCq_res);
-
-                        /*******************************************************
-                         * CONTRACTIONS: 4pt contractions, S & E classes.
-                         ******************************************************/
-                        // Note: eventually will use wall sink smeared q_Kl_0/q_KlCl_q instead.
-                        std::string SE_Kl   = LABEL_4PT_NOISE("SE_Kl", tK, tJ, tpi, nn);
-                        std::string SE_Ksb  = LABEL_4PT_NOISE("SE_Ksb", tK, tJ, tpi, nn);
-                        std::string SE_pilb = LABEL_4PT_NOISE("SE_pilb", tK, tJ, tpi, nn);
-                        std::string SE_loop = LABEL_4PT_NOISE("SE_loop", tK, tJ, tpi, nn);
-                        weakContractionEye(application, 4, q_KlCl_q, q_Ks_k, q_pil_p, loop_q, SE_Kl);
-                        weakContractionEye(application, 4, q_Kl_0, q_KsCs_mq, q_pil_p, loop_q, SE_Ksb);
-                        weakContractionEye(application, 4, q_Kl_0, q_Ks_k, q_pilCl_q, loop_q, SE_pilb);
-                        weakContractionEye(application, 4, q_Kl_0, q_Ks_k, q_pil_p, loop_qCq, SE_loop);
-
-                        /*******************************************************
-                         * CONTRACTIONS: 4pt contractions, pi0 disconnected 
-                         * loop.
-                         ******************************************************/
-                        std::string disc0 = LABEL_4PT_NOISE("disc0", tK, tJ, tpi, nn);
-                        disc0Contraction(application, q_Kl_0, q_Ks_k, q_pilCl_q, loop_q, disc0);
-
-                        /*******************************************************
-                         * CONTRACTIONS: Disconnected loop.
-                         ******************************************************/
-                        std::string discLoop = "disc_" + loop_qCq;
-                        discLoopContraction(application, loop_qCq, discLoop);
-                    }
-                }
-            }
-        }
-    }
-    // execution
-    std::string par_file_name = "rarekaon_000_100_tK0_tpi16_tJ8_noloop_mc0.2.xml";
-    application.saveParameterFile(par_file_name);
-    application.run();
-
-    // epilogue
-    LOG(Message) << "Grid is finalizing now" << std::endl;
-    Grid_finalize();
-
-    return EXIT_SUCCESS;
-}

From 2bc4d0a20ec038786f6544783b368fed3bbfb804 Mon Sep 17 00:00:00 2001
From: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Date: Thu, 8 Jun 2017 22:21:25 +0100
Subject: [PATCH 064/177] Move code into utils

---
 tests/core/Test_fft_gfix.cc | 242 ++++--------------------------------
 1 file changed, 26 insertions(+), 216 deletions(-)

diff --git a/tests/core/Test_fft_gfix.cc b/tests/core/Test_fft_gfix.cc
index 7938241e..9732eb85 100644
--- a/tests/core/Test_fft_gfix.cc
+++ b/tests/core/Test_fft_gfix.cc
@@ -28,212 +28,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
     /*  END LEGAL */
 #include <Grid/Grid.h>
 
-using namespace Grid;
-using namespace Grid::QCD;
-
-template <class Gimpl> 
-class FourierAcceleratedGaugeFixer  : public Gimpl {
-  public:
-  INHERIT_GIMPL_TYPES(Gimpl);
-
-  typedef typename Gimpl::GaugeLinkField GaugeMat;
-  typedef typename Gimpl::GaugeField GaugeLorentz;
-
-  static void GaugeLinkToLieAlgebraField(const std::vector<GaugeMat> &U,std::vector<GaugeMat> &A) {
-    for(int mu=0;mu<Nd;mu++){
-//      ImplComplex cmi(0.0,-1.0);
-      Complex cmi(0.0,-1.0);
-      A[mu] = Ta(U[mu]) * cmi;
-    }
-  }
-  static void DmuAmu(const std::vector<GaugeMat> &A,GaugeMat &dmuAmu) {
-    dmuAmu=zero;
-    for(int mu=0;mu<Nd;mu++){
-      dmuAmu = dmuAmu + A[mu] - Cshift(A[mu],mu,-1);
-    }
-  }  
-  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol) {
-    GridBase *grid = Umu._grid;
-
-    Real org_plaq      =WilsonLoops<Gimpl>::avgPlaquette(Umu);
-    Real org_link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); 
-    Real old_trace = org_link_trace;
-    Real trG;
-
-    std::vector<GaugeMat> U(Nd,grid);
-                 GaugeMat dmuAmu(grid);
-
-    for(int i=0;i<maxiter;i++){
-      for(int mu=0;mu<Nd;mu++) U[mu]= PeekIndex<LorentzIndex>(Umu,mu);
-      //trG = SteepestDescentStep(U,alpha,dmuAmu);
-      trG = FourierAccelSteepestDescentStep(U,alpha,dmuAmu);
-      for(int mu=0;mu<Nd;mu++) PokeIndex<LorentzIndex>(Umu,U[mu],mu);
-      // Monitor progress and convergence test 
-      // infrequently to minimise cost overhead
-      if ( i %20 == 0 ) { 
-	Real plaq      =WilsonLoops<Gimpl>::avgPlaquette(Umu);
-	Real link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); 
-
-	std::cout << GridLogMessage << " Iteration "<<i<< " plaq= "<<plaq<< " dmuAmu " << norm2(dmuAmu)<< std::endl;
-	
-	Real Phi  = 1.0 - old_trace / link_trace ;
-	Real Omega= 1.0 - trG;
-
-
-	std::cout << GridLogMessage << " Iteration "<<i<< " Phi= "<<Phi<< " Omega= " << Omega<< " trG " << trG <<std::endl;
-	if ( (Omega < Omega_tol) && ( ::fabs(Phi) < Phi_tol) ) {
-	  std::cout << GridLogMessage << "Converged ! "<<std::endl;
-	  return;
-	}
-
-	old_trace = link_trace;
-
-      }
-    }
-  };
-  static Real SteepestDescentStep(std::vector<GaugeMat> &U,Real & alpha, GaugeMat & dmuAmu) {
-    GridBase *grid = U[0]._grid;
-
-    std::vector<GaugeMat> A(Nd,grid);
-    GaugeMat g(grid);
-
-    GaugeLinkToLieAlgebraField(U,A);
-    ExpiAlphaDmuAmu(A,g,alpha,dmuAmu);
-
-
-    Real vol = grid->gSites();
-    Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
-
-    SU<Nc>::GaugeTransform(U,g);
-
-    return trG;
-  }
-
-  static Real FourierAccelSteepestDescentStep(std::vector<GaugeMat> &U,Real & alpha, GaugeMat & dmuAmu) {
-
-    GridBase *grid = U[0]._grid;
-
-    Real vol = grid->gSites();
-
-    FFT theFFT((GridCartesian *)grid);
-
-    LatticeComplex  Fp(grid);
-    LatticeComplex  psq(grid); psq=zero;
-    LatticeComplex  pmu(grid); 
-    LatticeComplex   one(grid); one = Complex(1.0,0.0);
-
-    GaugeMat g(grid);
-    GaugeMat dmuAmu_p(grid);
-    std::vector<GaugeMat> A(Nd,grid);
-
-    GaugeLinkToLieAlgebraField(U,A);
-
-    DmuAmu(A,dmuAmu);
-
-    theFFT.FFT_all_dim(dmuAmu_p,dmuAmu,FFT::forward);
-
-    //////////////////////////////////
-    // Work out Fp = psq_max/ psq...
-    //////////////////////////////////
-    std::vector<int> latt_size = grid->GlobalDimensions();
-    std::vector<int> coor(grid->_ndimension,0);
-    for(int mu=0;mu<Nd;mu++) {
-
-      Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
-      LatticeCoordinate(pmu,mu);
-      pmu = TwoPiL * pmu ;
-      psq = psq + 4.0*sin(pmu*0.5)*sin(pmu*0.5); 
-    }
-
-    Complex psqMax(16.0);
-    Fp =  psqMax*one/psq;
-
-    /*
-    static int once;
-    if ( once == 0 ) { 
-      std::cout << " Fp " << Fp <<std::endl;
-      once ++;
-      }*/
-
-    pokeSite(TComplex(1.0),Fp,coor);
-
-    dmuAmu_p  = dmuAmu_p * Fp; 
-
-    theFFT.FFT_all_dim(dmuAmu,dmuAmu_p,FFT::backward);
-
-    GaugeMat ciadmam(grid);
-    Complex cialpha(0.0,-alpha);
-    ciadmam = dmuAmu*cialpha;
-    SU<Nc>::taExp(ciadmam,g);
-
-    Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
-
-    SU<Nc>::GaugeTransform(U,g);
-
-    return trG;
-  }
-
-  static void ExpiAlphaDmuAmu(const std::vector<GaugeMat> &A,GaugeMat &g,Real & alpha, GaugeMat &dmuAmu) {
-    GridBase *grid = g._grid;
-    Complex cialpha(0.0,-alpha);
-    GaugeMat ciadmam(grid);
-    DmuAmu(A,dmuAmu);
-    ciadmam = dmuAmu*cialpha;
-    SU<Nc>::taExp(ciadmam,g);
-  }  
-/*
-  ////////////////////////////////////////////////////////////////
-  // NB The FT for fields living on links has an extra phase in it
-  // Could add these to the FFT class as a later task since this code
-  // might be reused elsewhere ????
-  ////////////////////////////////////////////////////////////////
-  static void InverseFourierTransformAmu(FFT &theFFT,const std::vector<GaugeMat> &Ap,std::vector<GaugeMat> &Ax) {
-    GridBase * grid = theFFT.Grid();
-    std::vector<int> latt_size = grid->GlobalDimensions();
-
-    ComplexField  pmu(grid);
-    ComplexField  pha(grid);
-    GaugeMat      Apha(grid);
-
-    Complex ci(0.0,1.0);
-
-    for(int mu=0;mu<Nd;mu++){
-
-      Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
-      LatticeCoordinate(pmu,mu);
-      pmu = TwoPiL * pmu ;
-      pha = exp(pmu *  (0.5 *ci)); // e(ipmu/2) since Amu(x+mu/2)
-
-      Apha = Ap[mu] * pha;
-
-      theFFT.FFT_all_dim(Apha,Ax[mu],FFT::backward);
-    }
-  }
-  static void FourierTransformAmu(FFT & theFFT,const std::vector<GaugeMat> &Ax,std::vector<GaugeMat> &Ap) {
-    GridBase * grid = theFFT.Grid();
-    std::vector<int> latt_size = grid->GlobalDimensions();
-
-    ComplexField  pmu(grid);
-    ComplexField  pha(grid);
-    Complex ci(0.0,1.0);
-    
-    // Sign convention for FFTW calls:
-    // A(x)= Sum_p e^ipx A(p) / V
-    // A(p)= Sum_p e^-ipx A(x)
-
-    for(int mu=0;mu<Nd;mu++){
-      Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
-      LatticeCoordinate(pmu,mu);
-      pmu = TwoPiL * pmu ;
-      pha = exp(-pmu *  (0.5 *ci)); // e(+ipmu/2) since Amu(x+mu/2)
-
-      theFFT.FFT_all_dim(Ax[mu],Ap[mu],FFT::backward);
-      Ap[mu] = Ap[mu] * pha;
-    }
-  }
-*/
-};
-
 int main (int argc, char ** argv)
 {
   std::vector<int> seeds({1,2,3,4});
@@ -264,22 +58,24 @@ int main (int argc, char ** argv)
   std::cout<< "*****************************************************************" <<std::endl;
 
   LatticeGaugeField   Umu(&GRID);
+  LatticeGaugeField   Urnd(&GRID);
   LatticeGaugeField   Uorg(&GRID);
   LatticeColourMatrix   g(&GRID); // Gauge xform
 
   
   SU3::ColdConfiguration(pRNG,Umu); // Unit gauge
   Uorg=Umu;
+  Urnd=Umu;
+
+  SU3::RandomGaugeTransform(pRNG,Urnd,g); // Random gauge transform of the unit-gauge copy
 
-  SU3::RandomGaugeTransform(pRNG,Umu,g); // Unit gauge
   Real plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
   std::cout << " Initial plaquette "<<plaq << std::endl;
 
-
-
   Real alpha=0.1;
-  FourierAcceleratedGaugeFixer<PeriodicGimplR>::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-10, 1.0e-10);
 
+  Umu = Urnd;
+  FourierAcceleratedGaugeFixer<PeriodicGimplR>::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-12, 1.0e-12,false);
 
   plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
   std::cout << " Final plaquette "<<plaq << std::endl;
@@ -288,14 +84,28 @@ int main (int argc, char ** argv)
   std::cout << " Norm Difference "<< norm2(Uorg) << std::endl;
 
 
-  //  std::cout<< "*****************************************************************" <<std::endl;
-  //  std::cout<< "* Testing Fourier accelerated fixing                            *" <<std::endl;
-  //  std::cout<< "*****************************************************************" <<std::endl;
+  std::cout<< "*****************************************************************" <<std::endl;
+  std::cout<< "* Testing Fourier accelerated fixing                            *" <<std::endl;
+  std::cout<< "*****************************************************************" <<std::endl;
+  Umu=Urnd;
+  FourierAcceleratedGaugeFixer<PeriodicGimplR>::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-12, 1.0e-12,true);
 
-  //  std::cout<< "*****************************************************************" <<std::endl;
-  //  std::cout<< "* Testing non-unit configuration                                *" <<std::endl;
-  //  std::cout<< "*****************************************************************" <<std::endl;
+  plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
+  std::cout << " Final plaquette "<<plaq << std::endl;
 
+  std::cout<< "*****************************************************************" <<std::endl;
+  std::cout<< "* Testing non-unit configuration                                *" <<std::endl;
+  std::cout<< "*****************************************************************" <<std::endl;
+
+  SU3::HotConfiguration(pRNG,Umu); // Non-unit (hot) gauge configuration
+
+  plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
+  std::cout << " Initial plaquette "<<plaq << std::endl;
+
+  FourierAcceleratedGaugeFixer<PeriodicGimplR>::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-12, 1.0e-12,true);
+
+  plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
+  std::cout << " Final plaquette "<<plaq << std::endl;
 
 
   Grid_finalize();

From 1d0ca65e28e987746d1f5561774d51c9b77385cc Mon Sep 17 00:00:00 2001
From: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Date: Thu, 8 Jun 2017 22:21:50 +0100
Subject: [PATCH 065/177] Move Gfix into utils

---
 lib/Grid.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/Grid.h b/lib/Grid.h
index 543b0330..bf548211 100644
--- a/lib/Grid.h
+++ b/lib/Grid.h
@@ -41,6 +41,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridCore.h>
 #include <Grid/GridQCDcore.h>
 #include <Grid/qcd/action/Action.h>
+#include <Grid/qcd/utils/GaugeFix.h>
 #include <Grid/qcd/smearing/Smearing.h>
 #include <Grid/qcd/hmc/HMC_aggregate.h>
 

From 70ab598c96401761996b78f6d0343f16267c6e73 Mon Sep 17 00:00:00 2001
From: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Date: Thu, 8 Jun 2017 22:22:23 +0100
Subject: [PATCH 066/177] Move gfix into utils

---
 lib/qcd/utils/GaugeFix.h | 188 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 188 insertions(+)
 create mode 100644 lib/qcd/utils/GaugeFix.h

diff --git a/lib/qcd/utils/GaugeFix.h b/lib/qcd/utils/GaugeFix.h
new file mode 100644
index 00000000..4ff216e4
--- /dev/null
+++ b/lib/qcd/utils/GaugeFix.h
@@ -0,0 +1,188 @@
+    /*************************************************************************************
+
+    grid` physics library, www.github.com/paboyle/Grid 
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+//#include <Grid/Grid.h>
+
+using namespace Grid;
+using namespace Grid::QCD;
+
+template <class Gimpl> 
+class FourierAcceleratedGaugeFixer  : public Gimpl {
+  public:
+  INHERIT_GIMPL_TYPES(Gimpl);
+
+  typedef typename Gimpl::GaugeLinkField GaugeMat;
+  typedef typename Gimpl::GaugeField GaugeLorentz;
+
+  static void GaugeLinkToLieAlgebraField(const std::vector<GaugeMat> &U,std::vector<GaugeMat> &A) {
+    for(int mu=0;mu<Nd;mu++){
+      Complex cmi(0.0,-1.0);
+      A[mu] = Ta(U[mu]) * cmi;
+    }
+  }
+  static void DmuAmu(const std::vector<GaugeMat> &A,GaugeMat &dmuAmu) {
+    dmuAmu=zero;
+    for(int mu=0;mu<Nd;mu++){
+      dmuAmu = dmuAmu + A[mu] - Cshift(A[mu],mu,-1);
+    }
+  }  
+  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false) {
+    GridBase *grid = Umu._grid;
+
+    Real org_plaq      =WilsonLoops<Gimpl>::avgPlaquette(Umu);
+    Real org_link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); 
+    Real old_trace = org_link_trace;
+    Real trG;
+
+    std::vector<GaugeMat> U(Nd,grid);
+                 GaugeMat dmuAmu(grid);
+
+    for(int i=0;i<maxiter;i++){
+      for(int mu=0;mu<Nd;mu++) U[mu]= PeekIndex<LorentzIndex>(Umu,mu);
+      if ( Fourier==false ) { 
+	trG = SteepestDescentStep(U,alpha,dmuAmu);
+      } else { 
+	trG = FourierAccelSteepestDescentStep(U,alpha,dmuAmu);
+      }
+      for(int mu=0;mu<Nd;mu++) PokeIndex<LorentzIndex>(Umu,U[mu],mu);
+      // Monitor progress and convergence test 
+      // infrequently to minimise cost overhead
+      if ( i %20 == 0 ) { 
+	Real plaq      =WilsonLoops<Gimpl>::avgPlaquette(Umu);
+	Real link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); 
+
+	if (Fourier) 
+	  std::cout << GridLogMessage << "Fourier Iteration "<<i<< " plaq= "<<plaq<< " dmuAmu " << norm2(dmuAmu)<< std::endl;
+	else 
+	  std::cout << GridLogMessage << " Iteration "<<i<< " plaq= "<<plaq<< " dmuAmu " << norm2(dmuAmu)<< std::endl;
+	
+	Real Phi  = 1.0 - old_trace / link_trace ;
+	Real Omega= 1.0 - trG;
+
+
+	std::cout << GridLogMessage << " Iteration "<<i<< " Phi= "<<Phi<< " Omega= " << Omega<< " trG " << trG <<std::endl;
+	if ( (Omega < Omega_tol) && ( ::fabs(Phi) < Phi_tol) ) {
+	  std::cout << GridLogMessage << "Converged ! "<<std::endl;
+	  return;
+	}
+
+	old_trace = link_trace;
+
+      }
+    }
+  };
+  static Real SteepestDescentStep(std::vector<GaugeMat> &U,Real & alpha, GaugeMat & dmuAmu) {
+    GridBase *grid = U[0]._grid;
+
+    std::vector<GaugeMat> A(Nd,grid);
+    GaugeMat g(grid);
+
+    GaugeLinkToLieAlgebraField(U,A);
+    ExpiAlphaDmuAmu(A,g,alpha,dmuAmu);
+
+
+    Real vol = grid->gSites();
+    Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
+
+    SU<Nc>::GaugeTransform(U,g);
+
+    return trG;
+  }
+
+  static Real FourierAccelSteepestDescentStep(std::vector<GaugeMat> &U,Real & alpha, GaugeMat & dmuAmu) {
+
+    GridBase *grid = U[0]._grid;
+
+    Real vol = grid->gSites();
+
+    FFT theFFT((GridCartesian *)grid);
+
+    LatticeComplex  Fp(grid);
+    LatticeComplex  psq(grid); psq=zero;
+    LatticeComplex  pmu(grid); 
+    LatticeComplex   one(grid); one = Complex(1.0,0.0);
+
+    GaugeMat g(grid);
+    GaugeMat dmuAmu_p(grid);
+    std::vector<GaugeMat> A(Nd,grid);
+
+    GaugeLinkToLieAlgebraField(U,A);
+
+    DmuAmu(A,dmuAmu);
+
+    theFFT.FFT_all_dim(dmuAmu_p,dmuAmu,FFT::forward);
+
+    //////////////////////////////////
+    // Work out Fp = psq_max/ psq...
+    //////////////////////////////////
+    std::vector<int> latt_size = grid->GlobalDimensions();
+    std::vector<int> coor(grid->_ndimension,0);
+    for(int mu=0;mu<Nd;mu++) {
+
+      Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
+      LatticeCoordinate(pmu,mu);
+      pmu = TwoPiL * pmu ;
+      psq = psq + 4.0*sin(pmu*0.5)*sin(pmu*0.5); 
+    }
+
+    Complex psqMax(16.0);
+    Fp =  psqMax*one/psq;
+
+    /*
+    static int once;
+    if ( once == 0 ) { 
+      std::cout << " Fp " << Fp <<std::endl;
+      once ++;
+      }*/
+
+    pokeSite(TComplex(1.0),Fp,coor);
+
+    dmuAmu_p  = dmuAmu_p * Fp; 
+
+    theFFT.FFT_all_dim(dmuAmu,dmuAmu_p,FFT::backward);
+
+    GaugeMat ciadmam(grid);
+    Complex cialpha(0.0,-alpha);
+    ciadmam = dmuAmu*cialpha;
+    SU<Nc>::taExp(ciadmam,g);
+
+    Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
+
+    SU<Nc>::GaugeTransform(U,g);
+
+    return trG;
+  }
+
+  static void ExpiAlphaDmuAmu(const std::vector<GaugeMat> &A,GaugeMat &g,Real & alpha, GaugeMat &dmuAmu) {
+    GridBase *grid = g._grid;
+    Complex cialpha(0.0,-alpha);
+    GaugeMat ciadmam(grid);
+    DmuAmu(A,dmuAmu);
+    ciadmam = dmuAmu*cialpha;
+    SU<Nc>::taExp(ciadmam,g);
+  }  
+};
+

From 3bfd1f13e67735d2273127eefabafb779c00996d Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 11 Jun 2017 23:14:10 +0100
Subject: [PATCH 067/177] I/O improvements

---
 benchmarks/Benchmark_memory_bandwidth.cc      |   4 +-
 benchmarks/Benchmark_su3.cc                   |   4 +-
 configure.ac                                  |   2 +-
 extras/Hadrons/Modules/MGauge/Load.cc         |   4 +-
 lib/Grid.h                                    |   1 +
 lib/GridStd.h                                 |   1 +
 lib/cartesian/Cartesian_base.h                |   9 +-
 lib/cartesian/Cartesian_full.h                |   8 +-
 lib/cartesian/Cartesian_red_black.h           |   4 +
 lib/communicator/Communicator_base.h          |   2 +
 lib/communicator/Communicator_mpi.cc          |   8 +
 lib/communicator/Communicator_mpi3.cc         |   8 +
 lib/communicator/Communicator_none.cc         |   2 +
 lib/parallelIO/BinaryIO.h                     | 249 ++++++---
 lib/parallelIO/IldgIO.h                       | 472 ++++++++++++------
 lib/parallelIO/IldgIOtypes.h                  | 110 ++--
 lib/parallelIO/NerscIO.h                      | 301 +++--------
 .../hmc/checkpointers/BinaryCheckpointer.h    |  38 +-
 lib/qcd/hmc/checkpointers/ILDGCheckpointer.h  |  32 +-
 lib/qcd/hmc/checkpointers/NerscCheckpointer.h |   2 +-
 lib/qcd/utils/Utils.h                         |   3 -
 lib/serialisation/XmlIO.cc                    |  58 ++-
 lib/serialisation/XmlIO.h                     |  11 +-
 tests/IO/Test_nersc_io.cc                     |   4 +-
 tests/IO/Test_nersc_read.cc                   |   2 +-
 tests/IO/Test_serialisation.cc                |  19 +-
 26 files changed, 779 insertions(+), 579 deletions(-)

diff --git a/benchmarks/Benchmark_memory_bandwidth.cc b/benchmarks/Benchmark_memory_bandwidth.cc
index d57c4df5..1aa088f8 100644
--- a/benchmarks/Benchmark_memory_bandwidth.cc
+++ b/benchmarks/Benchmark_memory_bandwidth.cc
@@ -55,8 +55,8 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-  uint64_t lmax=44;
-#define NLOOP (1*lmax*lmax*lmax*lmax/vol)
+  uint64_t lmax=64;
+#define NLOOP (100*lmax*lmax*lmax*lmax/vol)
   for(int lat=4;lat<=lmax;lat+=4){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc
index 1321715a..3d7f9bc9 100644
--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@@ -35,9 +35,9 @@ using namespace Grid::QCD;
 int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
-#define LMAX (32)
+#define LMAX (64)
 
-  int Nloop=200;
+  int Nloop=20;
 
   std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
   std::vector<int> mpi_layout  = GridDefaultMpi();
diff --git a/configure.ac b/configure.ac
index 62b7545b..2fc9dfec 100644
--- a/configure.ac
+++ b/configure.ac
@@ -27,7 +27,7 @@ AX_GXX_VERSION
 AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"],
       [version of g++ that will compile the code])
 
-CXXFLAGS="-O3 $CXXFLAGS"
+CXXFLAGS="-g $CXXFLAGS"
 
 
 ############### Checks for typedefs, structures, and compiler characteristics
diff --git a/extras/Hadrons/Modules/MGauge/Load.cc b/extras/Hadrons/Modules/MGauge/Load.cc
index e5ee8abb..062e7e98 100644
--- a/extras/Hadrons/Modules/MGauge/Load.cc
+++ b/extras/Hadrons/Modules/MGauge/Load.cc
@@ -65,7 +65,7 @@ void TLoad::setup(void)
 // execution ///////////////////////////////////////////////////////////////////
 void TLoad::execute(void)
 {
-    NerscField  header;
+    FieldMetaData  header;
     std::string fileName = par().file + "."
                            + std::to_string(env().getTrajectory());
     
@@ -74,5 +74,5 @@ void TLoad::execute(void)
     LatticeGaugeField &U = *env().createLattice<LatticeGaugeField>(getName());
     NerscIO::readConfiguration(U, header, fileName);
     LOG(Message) << "NERSC header:" << std::endl;
-    dump_nersc_header(header, LOG(Message));
+    dump_meta_data(header, LOG(Message));
 }
diff --git a/lib/Grid.h b/lib/Grid.h
index 543b0330..ce16894f 100644
--- a/lib/Grid.h
+++ b/lib/Grid.h
@@ -42,6 +42,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridQCDcore.h>
 #include <Grid/qcd/action/Action.h>
 #include <Grid/qcd/smearing/Smearing.h>
+#include <Grid/parallelIO/MetaData.h>
 #include <Grid/qcd/hmc/HMC_aggregate.h>
 
 #endif
diff --git a/lib/GridStd.h b/lib/GridStd.h
index fb5e5b21..959ba9ac 100644
--- a/lib/GridStd.h
+++ b/lib/GridStd.h
@@ -18,6 +18,7 @@
 #include <ctime>
 #include <sys/time.h>
 #include <chrono>
+#include <zlib.h>
 
 ///////////////////
 // Grid config
diff --git a/lib/cartesian/Cartesian_base.h b/lib/cartesian/Cartesian_base.h
index b31b3b5f..0db6ce0d 100644
--- a/lib/cartesian/Cartesian_base.h
+++ b/lib/cartesian/Cartesian_base.h
@@ -50,7 +50,6 @@ public:
 
     GridBase(const std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
 
-
     // Physics Grid information.
     std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes.
     std::vector<int> _fdimensions;// (full) Global dimensions of array prior to cb removal
@@ -63,13 +62,12 @@ public:
     int _isites;
     int _fsites;                  // _isites*_osites = product(dimensions).
     int _gsites;
-    std::vector<int> _slice_block;   // subslice information
+    std::vector<int> _slice_block;// subslice information
     std::vector<int> _slice_stride;
     std::vector<int> _slice_nblock;
 
-    // Might need these at some point
-    //    std::vector<int> _lstart;     // local start of array in gcoors. _processor_coor[d]*_ldimensions[d]
-    //    std::vector<int> _lend;       // local end of array in gcoors    _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
+    std::vector<int> _lstart;     // local start of array in gcoors _processor_coor[d]*_ldimensions[d]
+    std::vector<int> _lend  ;     // local end of array in gcoors   _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
 
 public:
 
@@ -176,6 +174,7 @@ public:
     inline int gSites(void) const { return _isites*_osites*_Nprocessors; }; 
     inline int Nd    (void) const { return _ndimension;};
 
+    inline const std::vector<int> &LocalStarts(void)            { return _lstart;    };
     inline const std::vector<int> &FullDimensions(void)         { return _fdimensions;};
     inline const std::vector<int> &GlobalDimensions(void)       { return _gdimensions;};
     inline const std::vector<int> &LocalDimensions(void)        { return _ldimensions;};
diff --git a/lib/cartesian/Cartesian_full.h b/lib/cartesian/Cartesian_full.h
index 7e29d311..b0e47fa4 100644
--- a/lib/cartesian/Cartesian_full.h
+++ b/lib/cartesian/Cartesian_full.h
@@ -76,6 +76,8 @@ public:
         _ldimensions.resize(_ndimension);
         _rdimensions.resize(_ndimension);
         _simd_layout.resize(_ndimension);
+	_lstart.resize(_ndimension);
+	_lend.resize(_ndimension);
             
         _ostride.resize(_ndimension);
         _istride.resize(_ndimension);
@@ -94,8 +96,10 @@ public:
 	  // Use a reduced simd grid
 	  _ldimensions[d]= _gdimensions[d]/_processors[d];  //local dimensions
 	  _rdimensions[d]= _ldimensions[d]/_simd_layout[d]; //overdecomposition
-	  _osites *= _rdimensions[d];
-	  _isites *= _simd_layout[d];
+	  _lstart[d]     = _processor_coor[d]*_ldimensions[d];
+	  _lend[d]       = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1;
+	  _osites  *= _rdimensions[d];
+	  _isites  *= _simd_layout[d];
                 
 	  // Addressing support
 	  if ( d==0 ) {
diff --git a/lib/cartesian/Cartesian_red_black.h b/lib/cartesian/Cartesian_red_black.h
index 2f132c19..3037de00 100644
--- a/lib/cartesian/Cartesian_red_black.h
+++ b/lib/cartesian/Cartesian_red_black.h
@@ -151,6 +151,8 @@ public:
       _ldimensions.resize(_ndimension);
       _rdimensions.resize(_ndimension);
       _simd_layout.resize(_ndimension);
+      _lstart.resize(_ndimension);
+      _lend.resize(_ndimension);
       
       _ostride.resize(_ndimension);
       _istride.resize(_ndimension);
@@ -169,6 +171,8 @@ public:
 	  _gdimensions[d] = _gdimensions[d]/2; // Remove a checkerboard
 	}
 	_ldimensions[d] = _gdimensions[d]/_processors[d];
+	_lstart[d]     = _processor_coor[d]*_ldimensions[d];
+	_lend[d]       = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1;
 
 	// Use a reduced simd grid
 	_simd_layout[d] = simd_layout[d];
diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h
index 23d4f647..12a8429f 100644
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@@ -177,6 +177,8 @@ class CartesianCommunicator {
   void GlobalSumVector(ComplexF *c,int N);
   void GlobalSum(ComplexD &c);
   void GlobalSumVector(ComplexD *c,int N);
+  void GlobalXOR(uint32_t &);
+  void GlobalXOR(uint64_t &);
   
   template<class obj> void GlobalSum(obj &o){
     typedef typename obj::scalar_type scalar_type;
diff --git a/lib/communicator/Communicator_mpi.cc b/lib/communicator/Communicator_mpi.cc
index 470a06c7..bd2a62fb 100644
--- a/lib/communicator/Communicator_mpi.cc
+++ b/lib/communicator/Communicator_mpi.cc
@@ -83,6 +83,14 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
   assert(ierr==0);
 }
+void CartesianCommunicator::GlobalXOR(uint32_t &u){
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
+  assert(ierr==0);
+}
+void CartesianCommunicator::GlobalXOR(uint64_t &u){
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
+  assert(ierr==0);
+}
 void CartesianCommunicator::GlobalSum(float &f){
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
   assert(ierr==0);
diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc
index 54a0f9b5..632eb991 100644
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ -510,6 +510,14 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
   assert(ierr==0);
 }
+void CartesianCommunicator::GlobalXOR(uint32_t &u){
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
+  assert(ierr==0);
+}
+void CartesianCommunicator::GlobalXOR(uint64_t &u){
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
+  assert(ierr==0);
+}
 void CartesianCommunicator::GlobalSum(float &f){
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
   assert(ierr==0);
diff --git a/lib/communicator/Communicator_none.cc b/lib/communicator/Communicator_none.cc
index ace2868b..5319ab93 100644
--- a/lib/communicator/Communicator_none.cc
+++ b/lib/communicator/Communicator_none.cc
@@ -59,6 +59,8 @@ void CartesianCommunicator::GlobalSum(double &){}
 void CartesianCommunicator::GlobalSum(uint32_t &){}
 void CartesianCommunicator::GlobalSum(uint64_t &){}
 void CartesianCommunicator::GlobalSumVector(double *,int N){}
+void CartesianCommunicator::GlobalXOR(uint32_t &){}
+void CartesianCommunicator::GlobalXOR(uint64_t &){}
 
 void CartesianCommunicator::SendRecvPacket(void *xmit,
 					   void *recv,
diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h
index 8b8d4165..bc3da38b 100644
--- a/lib/parallelIO/BinaryIO.h
+++ b/lib/parallelIO/BinaryIO.h
@@ -125,57 +125,94 @@ class BinaryIO {
   /////////////////////////////////////////////////////////////////////////////
   // more byte manipulation helpers
   /////////////////////////////////////////////////////////////////////////////
-  static inline void Uint32Checksum(uint32_t *buf,uint64_t buf_size_bytes,uint32_t &csum)
+
+  template<class vobj> static inline void Uint32Checksum(Lattice<vobj> &lat,				      
+							 uint32_t &nersc_csum,
+							 uint32_t &scidac_csuma,
+							 uint32_t &scidac_csumb)
+
   {
+    typedef typename vobj::scalar_object sobj;
+
+    GridBase *grid = lat._grid;
+    int lsites = grid->lSites();
+
+    std::vector<sobj> scalardata(lsites); 
+    unvectorizeToLexOrdArray(scalardata,lat);    
+
+    Uint32Checksum(grid,scalardata,nersc_csum,scidac_csuma,scidac_csumb);
+  }
+  
+  template<class fobj>
+    static inline void Uint32Checksum(GridBase *grid,
+				      std::vector<fobj> &fbuf,
+				      uint32_t &nersc_csum,
+				      uint32_t &scidac_csuma,
+				      uint32_t &scidac_csumb)
+  {
+    const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
+
+
+    int nd = grid->_ndimension;
+
+    uint64_t lsites              =grid->lSites();
+    std::vector<int> local_vol   =grid->LocalDimensions();
+    std::vector<int> local_start =grid->LocalStarts();
+    std::vector<int> global_vol  =grid->FullDimensions();
+
 #pragma omp parallel
     { 
-      uint32_t csum_thr=0;
-      uint64_t count = buf_size_bytes/sizeof(uint32_t);
+      std::vector<int> coor(nd);
+      uint32_t nersc_csum_thr=0;
+      uint32_t scidac_csuma_thr=0;
+      uint32_t scidac_csumb_thr=0;
+      uint32_t site_crc=0;
+      uint32_t zcrc = crc32(0L, Z_NULL, 0);
+
 #pragma omp for
-      for(uint64_t i=0;i<count;i++){
-	csum_thr=csum_thr+buf[i];
+      for(uint64_t local_site=0;local_site<lsites;local_site++){
+
+	uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
+
+	for(uint64_t j=0;j<size32;j++){
+	  nersc_csum_thr=nersc_csum_thr+site_buf[j];
+	}
+
+	/* 
+	 * Scidac csum  is rather more heavyweight
+	 */
+	int global_site;
+
+	Lexicographic::CoorFromIndex(coor,local_site,local_vol);
+
+	for(int d=0;d<nd;d++) 
+	  coor[d] = coor[d]+local_start[d];
+
+	Lexicographic::IndexFromCoor(coor,global_site,global_vol);
+
+	uint32_t gsite29   = global_site%29;
+	uint32_t gsite31   = global_site%31;
+
+	site_crc = crc32(zcrc,(unsigned char *)site_buf,sizeof(fobj));
+
+	scidac_csuma_thr ^= site_crc<<gsite29 | site_crc>>((32-gsite29)&31); // &31: no UB shift by 32 when gsite29==0
+	scidac_csumb_thr ^= site_crc<<gsite31 | site_crc>>((32-gsite31)&31); // &31: no UB shift by 32 when gsite31==0
       }
+
 #pragma omp critical
-      csum = csum + csum_thr;
+      {
+	nersc_csum  += nersc_csum_thr;
+	scidac_csuma^= scidac_csuma_thr;
+	scidac_csumb^= scidac_csumb_thr;
+      }
     }
   }
+
   // Network is big endian
-  static inline void htobe32_v(void *file_object,uint64_t bytes,uint32_t &csum){ 
-    Uint32Checksum((uint32_t *)file_object,bytes,csum); 
-    htobe32_v(file_object,bytes); 
-  }
-  static inline void htobe64_v(void *file_object,uint64_t bytes,uint32_t &csum){
-    Uint32Checksum((uint32_t *)file_object,bytes,csum); 
-    htobe64_v(file_object,bytes);
-  }
-  static inline void htole32_v(void *file_object,uint64_t bytes,uint32_t &csum){ 
-    Uint32Checksum((uint32_t *)file_object,bytes,csum);
-    htole32_v(file_object,bytes);
-  }
-  static inline void htole64_v(void *file_object,uint64_t bytes,uint32_t &csum){ 
-    Uint32Checksum((uint32_t *)file_object,bytes,csum);
-    htole64_v(file_object,bytes);
-  }
-  static inline void be32toh_v(void *file_object,uint64_t bytes,uint32_t &csum){ 
-    be32toh_v(file_object,bytes); 
-    Uint32Checksum((uint32_t *)file_object,bytes,csum); 
-  }
-  static inline void be64toh_v(void *file_object,uint64_t bytes,uint32_t &csum){
-    be64toh_v(file_object,bytes);
-    Uint32Checksum((uint32_t *)file_object,bytes,csum); 
-  }
-  static inline void le32toh_v(void *file_object,uint64_t bytes,uint32_t &csum){ 
-    le32toh_v(file_object,bytes);
-    Uint32Checksum((uint32_t *)file_object,bytes,csum);
-  }
-  static inline void le64toh_v(void *file_object,uint64_t bytes,uint32_t &csum){ 
-    le64toh_v(file_object,bytes);
-    Uint32Checksum((uint32_t *)file_object,bytes,csum);
-  }
-  static inline void htobe32_v(void *file_object,uint64_t bytes){ be32toh_v(file_object,bytes);} 
-  static inline void htobe64_v(void *file_object,uint64_t bytes){ be64toh_v(file_object,bytes);} 
-  static inline void htole32_v(void *file_object,uint64_t bytes){ le32toh_v(file_object,bytes);} 
-  static inline void htole64_v(void *file_object,uint64_t bytes){ le64toh_v(file_object,bytes);} 
+  static inline void htobe32_v(void *file_object,uint64_t bytes){ be32toh_v(file_object,bytes);} 
+  static inline void htobe64_v(void *file_object,uint64_t bytes){ be64toh_v(file_object,bytes);} 
+  static inline void htole32_v(void *file_object,uint64_t bytes){ le32toh_v(file_object,bytes);} 
+  static inline void htole64_v(void *file_object,uint64_t bytes){ le64toh_v(file_object,bytes);} 
 
   static inline void be32toh_v(void *file_object,uint64_t bytes)
   {
@@ -199,6 +236,7 @@ class BinaryIO {
       fp[i] = ntohl(f);
     }
   }
+
   // BE is same as network
   static inline void be64toh_v(void *file_object,uint64_t bytes)
   {
@@ -238,18 +276,23 @@ class BinaryIO {
   static const int BINARYIO_WRITE         = 0x01;
 
   template<class word,class fobj>
-  static inline uint32_t IOobject(word w,
-				  GridBase *grid,
-				  std::vector<fobj> &iodata,
-				  std::string file,
-				  int offset,
-				  const std::string &format, int control)
+  static inline void IOobject(word w,
+			      GridBase *grid,
+			      std::vector<fobj> &iodata,
+			      std::string file,
+			      int offset,
+			      const std::string &format, int control,
+			      uint32_t &nersc_csum,
+			      uint32_t &scidac_csuma,
+			      uint32_t &scidac_csumb)
   {
     grid->Barrier();
     GridStopWatch timer; 
     GridStopWatch bstimer;
 
-    uint32_t csum=0;
+    nersc_csum=0;
+    scidac_csuma=0;
+    scidac_csumb=0;
 
     int ndim                 = grid->Dimensions();
     int nrank                = grid->ProcessorCount();
@@ -359,20 +402,22 @@ class BinaryIO {
       grid->Barrier();
 
       bstimer.Start();
-      if (ieee32big) be32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum);
-      if (ieee32)    le32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum);
-      if (ieee64big) be64toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum);
-      if (ieee64)    le64toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum);
+      if (ieee32big) be32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
+      if (ieee32)    le32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
+      if (ieee64big) be64toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
+      if (ieee64)    le64toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
+      Uint32Checksum(grid,iodata,nersc_csum,scidac_csuma,scidac_csumb);
       bstimer.Stop();
     }
     
     if ( control & BINARYIO_WRITE ) { 
 
       bstimer.Start();
-      if (ieee32big) htobe32_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum);
-      if (ieee32)    htole32_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum);
-      if (ieee64big) htobe64_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum);
-      if (ieee64)    htole64_v((void *)&iodata[0], sizeof(fobj)*iodata.size(),csum);
+      Uint32Checksum(grid,iodata,nersc_csum,scidac_csuma,scidac_csumb);
+      if (ieee32big) htobe32_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
+      if (ieee32)    htole32_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
+      if (ieee64big) htobe64_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
+      if (ieee64)    htole64_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
       bstimer.Stop();
 
       grid->Barrier();
@@ -418,17 +463,27 @@ class BinaryIO {
     // Safety check
     //////////////////////////////////////////////////////////////////////////////
     grid->Barrier();
-    grid->GlobalSum(csum);
+    grid->GlobalSum(nersc_csum);
+    grid->GlobalXOR(scidac_csuma);
+    grid->GlobalXOR(scidac_csumb);
     grid->Barrier();
-
-    return csum;
+    //    std::cout << "Binary IO NERSC  checksum  0x"<<std::hex<<nersc_csum  <<std::dec<<std::endl;
+    //    std::cout << "Binary IO SCIDAC checksuma 0x"<<std::hex<<scidac_csuma<<std::dec<<std::endl;
+    //    std::cout << "Binary IO SCIDAC checksumb 0x"<<std::hex<<scidac_csumb<<std::dec<<std::endl;
   }
 
   /////////////////////////////////////////////////////////////////////////////
   // Read a Lattice of object
   //////////////////////////////////////////////////////////////////////////////////////
   template<class vobj,class fobj,class munger>
-  static inline uint32_t readLatticeObject(Lattice<vobj> &Umu,std::string file,munger munge,int offset,const std::string &format)
+  static inline void readLatticeObject(Lattice<vobj> &Umu,
+				       std::string file,
+				       munger munge,
+				       int offset,
+				       const std::string &format,
+				       uint32_t &nersc_csum,
+				       uint32_t &scidac_csuma,
+				       uint32_t &scidac_csumb)
   {
     typedef typename vobj::scalar_object sobj;
     typedef typename vobj::Realified::scalar_type word;    word w=0;
@@ -439,7 +494,8 @@ class BinaryIO {
     std::vector<sobj> scalardata(lsites); 
     std::vector<fobj>     iodata(lsites); // Munge, checksum, byte order in here
     
-    uint32_t csum= IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC);
+    IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
+	     nersc_csum,scidac_csuma,scidac_csumb);
 
     GridStopWatch timer; 
     timer.Start();
@@ -451,15 +507,20 @@ class BinaryIO {
 
     timer.Stop();
     std::cout<<GridLogMessage<<"readLatticeObject: vectorize overhead "<<timer.Elapsed()  <<std::endl;
-
-    return csum;
   }
 
   /////////////////////////////////////////////////////////////////////////////
   // Write a Lattice of object
   //////////////////////////////////////////////////////////////////////////////////////
   template<class vobj,class fobj,class munger>
-  static inline uint32_t writeLatticeObject(Lattice<vobj> &Umu,std::string file,munger munge,int offset,const std::string &format)
+    static inline void writeLatticeObject(Lattice<vobj> &Umu,
+					  std::string file,
+					  munger munge,
+					  int offset,
+					  const std::string &format,
+					  uint32_t &nersc_csum,
+					  uint32_t &scidac_csuma,
+					  uint32_t &scidac_csumb)
   {
     typedef typename vobj::scalar_object sobj;
     typedef typename vobj::Realified::scalar_type word;    word w=0;
@@ -480,36 +541,45 @@ class BinaryIO {
     grid->Barrier();
     timer.Stop();
 
-    uint32_t csum= IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC);
+    IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
+	     nersc_csum,scidac_csuma,scidac_csumb);
 
     std::cout<<GridLogMessage<<"writeLatticeObject: unvectorize overhead "<<timer.Elapsed()  <<std::endl;
-
-    return csum;
   }
   
   /////////////////////////////////////////////////////////////////////////////
   // Read a RNG;  use IOobject and lexico map to an array of state 
   //////////////////////////////////////////////////////////////////////////////////////
-  static inline uint32_t readRNG(GridSerialRNG &serial,GridParallelRNG &parallel,std::string file,int offset)
+  static inline void readRNG(GridSerialRNG &serial,
+			     GridParallelRNG &parallel,
+			     std::string file,
+			     int offset,
+			     uint32_t &nersc_csum,
+			     uint32_t &scidac_csuma,
+			     uint32_t &scidac_csumb)
   {
     typedef typename GridSerialRNG::RngStateType RngStateType;
     const int RngStateCount = GridSerialRNG::RngStateCount;
     typedef std::array<RngStateType,RngStateCount> RNGstate;
     typedef RngStateType word;    word w=0;
 
-    uint32_t csum = 0;
     std::string format = "IEEE32BIG";
 
     GridBase *grid = parallel._grid;
     int gsites = grid->gSites();
     int lsites = grid->lSites();
 
+    uint32_t nersc_csum_tmp;
+    uint32_t scidac_csuma_tmp;
+    uint32_t scidac_csumb_tmp;
+
     GridStopWatch timer;
 
     std::cout << GridLogMessage << "RNG read I/O on file " << file << std::endl;
 
     std::vector<RNGstate> iodata(lsites);
-    csum= IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC);
+    IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
+	     nersc_csum,scidac_csuma,scidac_csumb);
 
     timer.Start();
     parallel_for(int lidx=0;lidx<lsites;lidx++){
@@ -520,33 +590,49 @@ class BinaryIO {
     timer.Stop();
 
     iodata.resize(1);
-    csum+= IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_MASTER_APPEND);
+    IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_MASTER_APPEND,
+	     nersc_csum_tmp,scidac_csuma_tmp,scidac_csumb_tmp);
+
     {
       std::vector<RngStateType> tmp(RngStateCount);
       std::copy(iodata[0].begin(),iodata[0].end(),tmp.begin());
       serial.SetState(tmp,0);
     }
 
-    std::cout << GridLogMessage << "RNG file checksum " << std::hex << csum << std::dec << std::endl;
+    nersc_csum   = nersc_csum   + nersc_csum_tmp;
+    scidac_csuma = scidac_csuma ^ scidac_csuma_tmp;
+    scidac_csumb = scidac_csumb ^ scidac_csumb_tmp;
+
+    //    std::cout << GridLogMessage << "RNG file nersc_checksum   " << std::hex << nersc_csum << std::dec << std::endl;
+    //    std::cout << GridLogMessage << "RNG file scidac_checksuma " << std::hex << scidac_csuma << std::dec << std::endl;
+    //    std::cout << GridLogMessage << "RNG file scidac_checksumb " << std::hex << scidac_csumb << std::dec << std::endl;
+
     std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl;
-    return csum;
   }
   /////////////////////////////////////////////////////////////////////////////
   // Write a RNG; lexico map to an array of state and use IOobject
   //////////////////////////////////////////////////////////////////////////////////////
-  static inline uint32_t writeRNG(GridSerialRNG &serial,GridParallelRNG &parallel,std::string file,int offset)
+  static inline void writeRNG(GridSerialRNG &serial,
+			      GridParallelRNG &parallel,
+			      std::string file,
+			      int offset,
+			      uint32_t &nersc_csum,
+			      uint32_t &scidac_csuma,
+			      uint32_t &scidac_csumb)
   {
     typedef typename GridSerialRNG::RngStateType RngStateType;
     typedef RngStateType word; word w=0;
     const int RngStateCount = GridSerialRNG::RngStateCount;
     typedef std::array<RngStateType,RngStateCount> RNGstate;
 
-    uint32_t csum = 0;
-
     GridBase *grid = parallel._grid;
     int gsites = grid->gSites();
     int lsites = grid->lSites();
 
+    uint32_t nersc_csum_tmp;
+    uint32_t scidac_csuma_tmp;
+    uint32_t scidac_csumb_tmp;
+
     GridStopWatch timer;
     std::string format = "IEEE32BIG";
 
@@ -561,7 +647,8 @@ class BinaryIO {
     }
     timer.Stop();
 
-    csum= IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC);
+    IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
+	     nersc_csum,scidac_csuma,scidac_csumb);
 
     iodata.resize(1);
     {
@@ -569,11 +656,11 @@ class BinaryIO {
       serial.GetState(tmp,0);
       std::copy(tmp.begin(),tmp.end(),iodata[0].begin());
     }
-    csum+= IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_MASTER_APPEND);
+    IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_MASTER_APPEND,
+	     nersc_csum_tmp,scidac_csuma_tmp,scidac_csumb_tmp);
     
-    std::cout << GridLogMessage << "RNG file checksum " << std::hex << csum << std::dec << std::endl;
+    //    std::cout << GridLogMessage << "RNG file checksum " << std::hex << csum << std::dec << std::endl;
     std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl;
-    return csum;
   }
 };
 }
diff --git a/lib/parallelIO/IldgIO.h b/lib/parallelIO/IldgIO.h
index 0912e2f6..237edf43 100644
--- a/lib/parallelIO/IldgIO.h
+++ b/lib/parallelIO/IldgIO.h
@@ -43,201 +43,351 @@ extern "C" {  // for linkage
 #include "lime.h"
 }
 
+
+// Unused SCIDAC record names
+// SCIDAC_PRIVATE_FILE_XML   "scidac-private-file-xml"
+// SCIDAC_SITELIST           "scidac-sitelist"
+// SCIDAC_FILE_XML           "scidac-file-xml"
+// SCIDAC_PRIVATE_RECORD_XML "scidac-private-record-xml"
+// SCIDAC_RECORD_XML         "scidac-record-xml"
+// SCIDAC_BINARY_DATA        "scidac-binary-data"
+//
+// Scidac checksum: CRC32 every site, xor reduce some hash of this.
+// https://github.com/usqcd-software/qio/blob/master/lib/dml/DML_utils.c
+
 namespace Grid {
 namespace QCD {
 
-inline void ILDGGrid(GridBase *grid, ILDGField &header) {
-  assert(grid->_ndimension == 4);  // emit error if not
-  header.dimension.resize(4);
-  header.boundary.resize(4);
-  for (int d = 0; d < 4; d++) {
-    header.dimension[d] = grid->_fdimensions[d];
-    // Read boundary conditions from ... ?
-    header.boundary[d] = std::string("periodic");
-  }
-}
-
-inline void ILDGChecksum(uint32_t *buf, uint32_t buf_size_bytes,
-                         uint32_t &csum) {
-  BinaryIO::Uint32Checksum(buf, buf_size_bytes, csum);
-}
-
-//////////////////////////////////////////////////////////////////////
-// Utilities ; these are QCD aware
-//////////////////////////////////////////////////////////////////////
-template <class GaugeField>
-inline void ILDGStatistics(GaugeField &data, ILDGField &header) {
-  // How to convert data precision etc...
-  header.link_trace = Grid::QCD::WilsonLoops<PeriodicGimplR>::linkTrace(data);
-  header.plaquette = Grid::QCD::WilsonLoops<PeriodicGimplR>::avgPlaquette(data);
-  // header.polyakov =
-}
-
-// Forcing QCD here
-template <class fobj, class sobj>
-struct ILDGMunger {
-  void operator()(fobj &in, sobj &out, uint32_t &csum) {
-    for (int mu = 0; mu < 4; mu++) {
-      for (int i = 0; i < 3; i++) {
-        for (int j = 0; j < 3; j++) {
-          out(mu)()(i, j) = in(mu)()(i, j);
-        }
-      }
-    }
-    ILDGChecksum((uint32_t *)&in, sizeof(in), csum);
-  };
-};
-
-template <class fobj, class sobj>
-struct ILDGUnmunger {
-  void operator()(sobj &in, fobj &out, uint32_t &csum) {
-    for (int mu = 0; mu < 4; mu++) {
-      for (int i = 0; i < 3; i++) {
-        for (int j = 0; j < 3; j++) {
-          out(mu)()(i, j) = in(mu)()(i, j);
-        }
-      }
-    }
-    ILDGChecksum((uint32_t *)&out, sizeof(out), csum);
-  };
-};
-
-////////////////////////////////////////////////////////////////////////////////
-// Write and read from fstream; compute header offset for payload
-////////////////////////////////////////////////////////////////////////////////
-enum ILDGstate {ILDGread, ILDGwrite};
-
-class ILDGIO : public BinaryIO {
-  FILE *File;
-  LimeWriter *LimeW;
-  LimeRecordHeader *LimeHeader;
-  LimeReader *LimeR;
-  std::string filename;
-
-
+class IldgIO : public BinaryIO {
  public:
-  ILDGIO(std::string file, ILDGstate RW) {
-      filename = file;
-    if (RW == ILDGwrite){
-      File = fopen(file.c_str(), "w");
-      // check if opened correctly
 
-      LimeW = limeCreateWriter(File);
-    } else {
-      File = fopen(file.c_str(), "r");
-      // check if opened correctly
-
-      LimeR = limeCreateReader(File);
-    }
-  }
-
-  ~ILDGIO() { fclose(File); }
-
-  int createHeader(std::string message, int MB, int ME, size_t PayloadSize, LimeWriter* L){
+  // Write a LIME record header of type `message`; the write is done outside assert() so it still happens under -DNDEBUG.
+  static int createHeader(std::string message, int MB, int ME, size_t PayloadSize, LimeWriter* L) {
     LimeRecordHeader *h;
     h = limeCreateHeader(MB, ME, const_cast<char *>(message.c_str()), PayloadSize);
-    int status = limeWriteRecordHeader(h, L);
-    if (status < 0) {
-      std::cerr << "ILDG Header error\n";
-      return status;
-    }
+    int status = limeWriteRecordHeader(h, L); assert(status >= 0); (void)status;
     limeDestroyHeader(h);
     return LIME_SUCCESS;
   }
 
-  unsigned int writeHeader(ILDGField &header) {
-    // write header in LIME
-    n_uint64_t nbytes;
-    int MB_flag = 1, ME_flag = 0;
-
-    char message[] = "ildg-format";
-    nbytes = strlen(message);
-    LimeHeader = limeCreateHeader(MB_flag, ME_flag, message, nbytes);
-    limeWriteRecordHeader(LimeHeader, LimeW);
-    limeDestroyHeader(LimeHeader);
-    // save the xml header here
-    // use the xml_writer to c++ streams in pugixml
-    // and convert to char message
-    limeWriteRecordData(message, &nbytes, LimeW);
+  // Serialise `object` to XML and write it as one LIME record of type `record_name`; lime calls sit outside assert() so they still run under -DNDEBUG.
+  template<class serialisable_object>
+  static void writeLimeObject(int MB,int ME,serialisable_object &object,std::string object_name,std::string record_name, LimeWriter *LimeW) {
+    std::string xmlstring;
+    {
+      XmlWriter WR("","");
+      write(WR,object_name,object);
+      xmlstring = WR.XmlString();
+    }
+    uint64_t nbytes = xmlstring.size();
+    LimeRecordHeader *h = limeCreateHeader(MB, ME,(char *)record_name.c_str(), nbytes);
+    int hdr_status  = limeWriteRecordHeader(h, LimeW);                    assert(hdr_status >=0); (void)hdr_status;
+    int data_status = limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(data_status>=0); (void)data_status;
     limeWriterCloseRecord(LimeW);
-
-    return 0;
+    limeDestroyHeader(h);
   }
 
-  unsigned int readHeader(ILDGField &header) {
+  static unsigned int writeHeader(FieldMetaData &header, LimeWriter *LimeW) {
+
+    uint64_t nbytes;
+
+    ildgFormat ildgfmt ;
+    usqcdInfo info;
+
+    //////////////////////////////////////////////////////
+    // Fill ILDG header data struct
+    //////////////////////////////////////////////////////
+    ildgfmt.field     = std::string("su3gauge");
+    ildgfmt.precision = 64;
+    ildgfmt.version = 1.0;
+    ildgfmt.lx = header.dimension[0];
+    ildgfmt.ly = header.dimension[1];
+    ildgfmt.lz = header.dimension[2];
+    ildgfmt.lt = header.dimension[3];
+    assert(header.nd==4);
+    assert(header.nd==header.dimension.size());
+
+    info.version=1.0;
+    info.plaq   = header.plaquette;
+    info.linktr = header.link_trace;
+
+    // Following scidac file downloaded from NERSC under MILC
+    // Begin message, keep open on successive records
+    //Message 1
+    // Type:           scidac-private-file-xml <scidacFile><version>1.1</version><spacetime>4</spacetime><dims>16 16 16 48 </dims><volfmt>0</volfmt></scidacFile>
+    // Type:           scidac-file-xml         <title>MILC ILDG archival gauge configuration</title>
+    //Message 2
+    // Type:           scidac-private-record-xml <scidacRecord><version>1.0</version><date>Thu May 11 00:11:33 2006 UTC</date><globaldata>0</globaldata>
+    //                    <datatype>QDP_F3_ColorMatrix</datatype><precision>F</precision><colors>3</colors><typesize>72</typesize><datacount>4</datacount></scidacRecord>
+    // Type:           scidac-record-xml 
+    // Type:           ildg-format
+    // Type:           ildg-data-lfn
+    // Type:           ildg-binary-data
+    // Type:           scidac-checksum
+
+    writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT),LimeW);
+    writeLimeObject(0,0,info   ,std::string("usqcdInfo"    ),std::string(USQCD_INFO ),LimeW);
+    writeLimeObject(0,0,ildgfmt,std::string("ildgFormat")   ,std::string(ILDG_FORMAT),LimeW);
+    // LFN is not a serializable object
+    {
+      std::string LFN = header.ildg_lfn; 
+      uint64_t PayloadSize = LFN.size();
+      createHeader(ILDG_DATA_LFN, 0 , 0, PayloadSize, LimeW);
+      limeWriteRecordData(const_cast<char*>(LFN.c_str()), &PayloadSize, LimeW);
+      limeWriterCloseRecord(LimeW);
+    }
     return 0;
   }
 
   template <class vsimd>
-  uint32_t readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu) {
-    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
-    typedef LorentzColourMatrixD sobjd;
-    typedef LorentzColourMatrixF sobjf;
-    typedef iLorentzColourMatrix<vsimd> itype;
-    typedef LorentzColourMatrix sobj;
-    GridBase *grid = Umu._grid;
+  static void writeConfiguration(std::string filename,Lattice<iLorentzColourMatrix<vsimd> > &Umu, std::string format) {
 
-    ILDGField header;
-    readHeader(header);
+    FILE *File = fopen(filename.c_str(), "w");
+    LimeWriter *LimeW = limeCreateWriter(File);
 
-    // now just the conf, ignore the header
-    std::string format = std::string("IEEE64BIG");
-    do {limeReaderNextRecord(LimeR);}
-    while (strncmp(limeReaderType(LimeR), "ildg-binary-data",16));
-
-    n_uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration)
-
-
-    ILDGtype ILDGt(true, LimeR);
-    // this is special for double prec data, just for the moment
-    uint32_t csum = BinaryIO::readObjectParallel< itype, sobjd >(
-       Umu, filename, ILDGMunger<sobjd, sobj>(), 0, format, ILDGt);
-
-    // Check configuration 
-    // todo
-
-    return csum;
-  }
-
-  template <class vsimd>
-  uint32_t writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu, std::string format) {
     typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
     typedef iLorentzColourMatrix<vsimd> vobj;
     typedef typename vobj::scalar_object sobj;
     typedef LorentzColourMatrixD fobj;
 
-    ILDGField header;
-    // fill the header
+    GridBase * grid = Umu._grid;
+
+    ////////////////////////////////////////
+    // fill the headers
+    ////////////////////////////////////////
+    FieldMetaData header;
+
+    GridMetaData(grid,header); 
+    GaugeStatistics<GaugeField>(Umu,header);
+    MachineCharacteristics(header);
+
+    assert( (format=="IEEE64BIG") || (format=="IEEE32BIG"));
     header.floating_point = format;
+    header.checksum = 0x0; // unused in ILDG
+    writeHeader(header,LimeW);
 
-    ILDGUnmunger<fobj, sobj> munge;
-    unsigned int offset = writeHeader(header);
-
-    BinaryIO::Uint32Checksum<vobj, fobj>(Umu, munge, header.checksum);
-
+    ////////////////////////////////////////
     // Write data record header
-    n_uint64_t PayloadSize = sizeof(fobj) * Umu._grid->_gsites;
-    createHeader("ildg-binary-data", 0, 1, PayloadSize, LimeW);
-
-    ILDGtype ILDGt(true, LimeW);
-    uint32_t csum = BinaryIO::writeObjectParallel<vobj, fobj>(
-       Umu, filename, munge, 0, header.floating_point, ILDGt);
-
+    ////////////////////////////////////////
+    uint64_t PayloadSize = sizeof(fobj) * Umu._grid->_gsites;
+    createHeader(ILDG_BINARY_DATA, 0, 0, PayloadSize, LimeW);
+    
+    off_t offset = ftell(File);
+    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
+    GaugeSimpleMunger<sobj, fobj> munge;
+    BinaryIO::writeLatticeObject<vobj, fobj >(Umu, filename, munge, offset, header.floating_point,
+					      nersc_csum,scidac_csuma,scidac_csumb);
     limeWriterCloseRecord(LimeW);
 
-    // Last record
-    // the logical file name LNF
-    // look into documentation on how to generate this string
-    std::string LNF = "empty"; 
+    ////////////////////////////////////////
+    // Write checksum element, propagating forward from the BinaryIO
+    ////////////////////////////////////////
+    scidacChecksum checksum;
+    checksum.suma= scidac_csuma;
+    checksum.sumb= scidac_csumb;
+    //    std::cout << " writing scidac checksums "<<std::hex<<scidac_csuma<<"/"<<scidac_csumb<<std::dec<<std::endl;
+    writeLimeObject(0,1,checksum,std::string("scidacChecksum"    ),std::string(SCIDAC_CHECKSUM),LimeW);
+
+    fclose(File);
+  }
+
+  template <class vsimd>
+  static void readConfiguration(std::string filename,Lattice<iLorentzColourMatrix<vsimd> > &Umu, FieldMetaData &FieldMetaData_) {
+
+    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
+    typedef LorentzColourMatrixD sobjd;
+    typedef LorentzColourMatrixF sobjf;
+    typedef iLorentzColourMatrix<vsimd> itype;
+    typedef LorentzColourMatrix sobj;
+
+    GridBase *grid = Umu._grid;
+
+    std::vector<int> dims = Umu._grid->FullDimensions();
+    assert(dims.size()==4);
+
+    FILE *File = fopen(filename.c_str(), "r");
+    LimeReader *LimeR = limeCreateReader(File);
 
 
-    PayloadSize = sizeof(LNF);
-    createHeader("ildg-binary-lfn", 1 , 1, PayloadSize, LimeW);
-    limeWriteRecordData(const_cast<char*>(LNF.c_str()), &PayloadSize, LimeW);
+    // Metadata holders
+    ildgFormat     ildgFormat_    ;
+    std::string    ildgLFN_       ;
+    scidacChecksum scidacChecksum_; 
+    usqcdInfo      usqcdInfo_     ;
 
-    limeWriterCloseRecord(LimeW);
+    // track what we read from file
+    int found_ildgFormat    =0;
+    int found_ildgLFN       =0;
+    int found_scidacChecksum=0;
+    int found_usqcdInfo     =0;
+    int found_ildgBinary =0;
+    int found_FieldMetaData =0;
 
-    return csum;
+    uint32_t nersc_csum;
+    uint32_t scidac_csuma;
+    uint32_t scidac_csumb;
+
+    // Binary format
+    std::string format;
+
+    //////////////////////////////////////////////////////////////////////////
+    // Loop over all records
+    // -- Order is poorly guaranteed except ILDG header precedes binary section.
+    // -- Run like an event loop.
+    // -- Impose trust hierarchy. Grid takes precedence & look for ILDG, and failing
+    //    that Scidac. 
+    // -- Insist on Scidac checksum record.
+    //////////////////////////////////////////////////////////////////////////
+
+    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
+
+      uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration)
+
+      //////////////////////////////////////////////////////////////////
+      // If not BINARY_DATA read a string and parse
+      //////////////////////////////////////////////////////////////////
+      if ( strncmp(limeReaderType(LimeR), ILDG_BINARY_DATA,strlen(ILDG_BINARY_DATA) )  ) {
+	
+	// Copy out the string
+	std::vector<char> xmlc(nbytes+1,'\0');
+	limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);    
+	std::cout << GridLogMessage<< "Non binary record :" <<limeReaderType(LimeR) <<std::endl; //<<"\n"<<(&xmlc[0])<<std::endl;
+
+	//////////////////////////////////
+	// ILDG format record
+	if ( !strncmp(limeReaderType(LimeR), ILDG_FORMAT,strlen(ILDG_FORMAT)) ) { 
+
+	  XmlReader RD(&xmlc[0],"");
+	  read(RD,"ildgFormat",ildgFormat_);
+
+	  if ( ildgFormat_.precision == 64 ) format = std::string("IEEE64BIG");
+	  if ( ildgFormat_.precision == 32 ) format = std::string("IEEE32BIG");
+
+	  //	  std::cout << "This is an ILDG format record : "<<format<<std::endl;
+
+	  assert( ildgFormat_.lx == dims[0]);
+	  assert( ildgFormat_.ly == dims[1]);
+	  assert( ildgFormat_.lz == dims[2]);
+	  assert( ildgFormat_.lt == dims[3]);
+
+	  found_ildgFormat = 1;
+	}
+
+	if ( !strncmp(limeReaderType(LimeR), ILDG_DATA_LFN,strlen(ILDG_DATA_LFN)) ) {
+	  FieldMetaData_.ildg_lfn = std::string(&xmlc[0]);
+	  //	  std::cout << "ILDG logical file name "<< FieldMetaData_.ildg_lfn << std::endl;
+	  found_ildgLFN = 1;
+	}
+
+	if ( !strncmp(limeReaderType(LimeR), GRID_FORMAT,strlen(ILDG_FORMAT)) ) { 
+
+	  XmlReader RD(&xmlc[0],"");
+	  read(RD,"FieldMetaData",FieldMetaData_);
+
+	  //	  std::cout << "Grid header found : format is "<<FieldMetaData_.floating_point<<std::endl;
+
+	  format = FieldMetaData_.floating_point;
+
+	  assert(FieldMetaData_.dimension[0] == dims[0]);
+	  assert(FieldMetaData_.dimension[1] == dims[1]);
+	  assert(FieldMetaData_.dimension[2] == dims[2]);
+	  assert(FieldMetaData_.dimension[3] == dims[3]);
+
+	  found_FieldMetaData = 1;
+	}
+
+	if ( !strncmp(limeReaderType(LimeR), USQCD_INFO,strlen(USQCD_INFO)) ) { 
+	  XmlReader RD(&xmlc[0],"");
+	  read(RD,USQCD_INFO,usqcdInfo_);
+	  //	  std::cout << "USQCD info record found " <<std::endl;
+	  found_usqcdInfo = 1;
+	}
+
+	if ( !strncmp(limeReaderType(LimeR), SCIDAC_CHECKSUM,strlen(SCIDAC_CHECKSUM)) ) { 
+	  XmlReader RD(&xmlc[0],"");
+	  read(RD,"scidacChecksum",scidacChecksum_);
+	  FieldMetaData_.scidac_checksuma = scidacChecksum_.suma;
+	  FieldMetaData_.scidac_checksumb = scidacChecksum_.sumb;
+	  //std::cout << " Read Out "<<scidacChecksum_.version<<"/"<< scidacChecksum_.suma<<"/"<<scidacChecksum_.sumb<<std::endl;
+	  found_scidacChecksum = 1;
+	}
+
+      } else {  
+	/////////////////////////////////
+	// Binary data
+	/////////////////////////////////
+	std::cout << GridLogMessage << ILDG_BINARY_DATA << std::endl;
+	off_t offset= ftell(File);
+	GaugeSimpleMunger<sobjd, sobj> munge;
+	BinaryIO::readLatticeObject< itype, sobjd >(Umu, filename, munge, offset, format,
+						    nersc_csum,scidac_csuma,scidac_csumb);
+	found_ildgBinary = 1;
+      }
+
+    }
+
+    //////////////////////////////////////////////////////
+    // Minimally must find binary segment and checksum
+    //////////////////////////////////////////////////////
+    assert(found_ildgBinary);
+    assert(found_scidacChecksum);
+
+    // Must find something with the lattice dimensions
+    assert(found_FieldMetaData||found_ildgFormat);
+
+    if ( found_FieldMetaData ) {
+
+      std::cout << GridLogMessage<<"a Grid MetaData was record found: configuration was probably written by Grid ! Yay ! "<<std::endl;
+      //      std::cout << "Read Grid Plaqette  "<<FieldMetaData_.plaquette<<std::endl;
+      //      std::cout << "Read Grid LinkTrace "<<FieldMetaData_.link_trace<<std::endl;
+
+    } else { 
+
+      assert(found_ildgFormat);
+      assert ( ildgFormat_.field == std::string("su3gauge") );
+
+      ///////////////////////////////////////////////////////////////////////////////////////
+      // Populate our Grid metadata as best we can
+      ///////////////////////////////////////////////////////////////////////////////////////
+
+      std::ostringstream vers; vers << ildgFormat_.version;
+      FieldMetaData_.hdr_version = vers.str();
+      FieldMetaData_.data_type = std::string("4D_SU3_GAUGE_3X3");
+
+      assert(FieldMetaData_.nd==4);
+      assert(FieldMetaData_.dimension.size()==4);
+
+      FieldMetaData_.dimension[0] = ildgFormat_.lx ;
+      FieldMetaData_.dimension[1] = ildgFormat_.ly ;
+      FieldMetaData_.dimension[2] = ildgFormat_.lz ;
+      FieldMetaData_.dimension[3] = ildgFormat_.lt ;
+
+      if ( found_usqcdInfo ) { 
+	FieldMetaData_.plaquette = usqcdInfo_.plaq;
+	FieldMetaData_.link_trace= usqcdInfo_.linktr;
+	//	std::cout << "This configuration was probably written by USQCD and not Grid "<<std::endl;
+	//	std::cout << "Read USQCD Plaquette  "<<FieldMetaData_.plaquette<<std::endl;
+	//	std::cout << "Read USQCD LinkTrace  "<<FieldMetaData_.link_trace<<std::endl;
+      } else { 
+	FieldMetaData_.plaquette = 0.0;
+	FieldMetaData_.link_trace= 0.0;
+	std::cout << "Uhoh... This configuration is unsafe and contains no recognised checksum or physics records that can verify it !!! "<<std::endl;
+      }
+    }
+
+    if ( found_scidacChecksum ) {
+      assert( scidac_csuma ==FieldMetaData_.scidac_checksuma);
+      assert( scidac_csumb ==FieldMetaData_.scidac_checksumb);
+      std::cout << GridLogMessage<<"SciDAC checksums match " << std::endl;
+    }
+
+    if ( found_FieldMetaData || found_usqcdInfo ) {
+      FieldMetaData checker;
+      GaugeStatistics<GaugeField>(Umu,checker);
+      assert(fabs(checker.plaquette  - FieldMetaData_.plaquette )<1.0e-5);
+      assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
+      std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl;
+    }
   }
 
   // format for RNG? Now just binary out
diff --git a/lib/parallelIO/IldgIOtypes.h b/lib/parallelIO/IldgIOtypes.h
index 4c7a1edd..8e1316eb 100644
--- a/lib/parallelIO/IldgIOtypes.h
+++ b/lib/parallelIO/IldgIOtypes.h
@@ -34,47 +34,83 @@ extern "C" { // for linkage
 
 namespace Grid {
 
-struct ILDGtype {
-  bool is_ILDG;
-  LimeWriter* LW;
-  LimeReader* LR;
+#define GRID_FORMAT      "grid-format"
+#define ILDG_FORMAT      "ildg-format"
+#define ILDG_BINARY_DATA "ildg-binary-data"
+#define ILDG_DATA_LFN    "ildg-data-lfn"
+#define USQCD_INFO       "usqcdInfo"
+#define SCIDAC_CHECKSUM  "scidac-checksum"
 
-  ILDGtype(bool is, LimeWriter* L) : is_ILDG(is), LW(L), LR(NULL) {}
-  ILDGtype(bool is, LimeReader* L) : is_ILDG(is), LW(NULL), LR(L) {}
-  ILDGtype() : is_ILDG(false), LW(NULL), LR(NULL) {}
+/////////////////////////////////////////////////////////////////////////////////
+// Data representation of records that enter ILDG and SciDac formats
+/////////////////////////////////////////////////////////////////////////////////
+struct ildgFormat : Serializable { // serialisable record: version, field name, precision and the four extents lx..lt
+public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(ildgFormat,
+				  double, version,
+				  std::string, field,
+				  int, precision,
+				  int, lx,
+				  int, ly,
+				  int, lz,
+				  int, lt);
+  ildgFormat() { // default record: version 1.0, every integer member zeroed
+    version=1.0; precision=0; lx=ly=lz=lt=0; // zeroed so a default-constructed record never serialises indeterminate ints
+  };
 };
-
-class ILDGField {
+struct usqcdInfo : Serializable { // serialisable record: version, plaquette, link trace and a free-form info string
  public:
-  // header strings (not in order)
-  std::vector<int> dimension;
-  std::vector<std::string> boundary;
-  int data_start;
-  std::string hdr_version;
-  std::string storage_format;
-  // Checks on data
-  double link_trace;
-  double plaquette;
-  uint32_t checksum;
-  unsigned int sequence_number;
-  std::string data_type;
-  std::string ensemble_id;
-  std::string ensemble_label;
-  std::string creator;
-  std::string creator_hardware;
-  std::string creation_date;
-  std::string archive_date;
-  std::string floating_point;
+  GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdInfo,
+				  double, version,
+				  double, plaq,
+				  double, linktr,
+				  std::string, info);
+  usqcdInfo() { // default record: version 1.0, both physics values zeroed
+    version=1.0; plaq=0.0; linktr=0.0; // zeroed so a default-constructed record never exposes indeterminate doubles
+  };
+};
+
+struct usqcdPropFile : Serializable { // serialisable record: version plus two strings (type, info)
+ public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdPropFile,
+				  double, version,
+				  std::string, type,
+				  std::string, info);
+  usqcdPropFile() { // only version is set explicitly; the strings start empty
+    version=1.0; 
+  };
+};
+struct usqcdSourceInfo : Serializable { // serialisable record: version plus a single info string
+ public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdSourceInfo,
+				  double, version,
+				  std::string, info);
+  usqcdSourceInfo() { // only version is set explicitly; info starts empty
+    version=1.0; 
+  };
+};
+struct usqcdPropInfo : Serializable { // serialisable record: version, spin and color integers, and an info string
+ public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdPropInfo,
+				  double, version,
+				  int, spin,
+				  int, color,
+				  std::string, info);
+  usqcdPropInfo() { // default record: version 1.0, spin and color zeroed
+    version=1.0; spin=0; color=0; // zeroed so a default-constructed record never serialises indeterminate ints
+  };
+};
+struct scidacChecksum : Serializable { // serialisable record: version and two 32-bit sums (suma, sumb)
+ public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(scidacChecksum,
+				  double, version,
+				  uint32_t, suma,
+				  uint32_t, sumb);
+  scidacChecksum() { // default record: version 1.0 with both sums cleared
+    version=1.0; 
+    suma=sumb=0;
+  };
 };
 }
-#else
-namespace Grid {
-
-struct ILDGtype {
-  bool is_ILDG;
-  ILDGtype() : is_ILDG(false) {}
-};
-}
-
 #endif
 #endif
diff --git a/lib/parallelIO/NerscIO.h b/lib/parallelIO/NerscIO.h
index ba9d23de..cc37b537 100644
--- a/lib/parallelIO/NerscIO.h
+++ b/lib/parallelIO/NerscIO.h
@@ -30,168 +30,11 @@
 #ifndef GRID_NERSC_IO_H
 #define GRID_NERSC_IO_H
 
-#include <algorithm>
-#include <iostream>
-#include <iomanip>
-#include <fstream>
-#include <map>
-
-#include <unistd.h>
-#include <sys/utsname.h>
-#include <pwd.h>
-
 namespace Grid {
   namespace QCD {
 
     using namespace Grid;
 
-    ////////////////////////////////////////////////////////////////////////////////
-    // Some data types for intermediate storage
-    ////////////////////////////////////////////////////////////////////////////////
-    template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, 4 >;
-
-    typedef iLorentzColour2x3<Complex>  LorentzColour2x3;
-    typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
-    typedef iLorentzColour2x3<ComplexD> LorentzColour2x3D;
-
-    ////////////////////////////////////////////////////////////////////////////////
-    // header specification/interpretation
-    ////////////////////////////////////////////////////////////////////////////////
-    class NerscField {
-    public:
-      // header strings (not in order)
-      int dimension[4];
-      std::string boundary[4]; 
-      int data_start;
-      std::string hdr_version;
-      std::string storage_format;
-      // Checks on data
-      double link_trace;
-      double plaquette;
-      uint32_t checksum;
-      unsigned int sequence_number;
-      std::string data_type;
-      std::string ensemble_id ;
-      std::string ensemble_label ;
-      std::string creator ;
-      std::string creator_hardware ;
-      std::string creation_date ;
-      std::string archive_date ;
-      std::string floating_point;
-    };
-
-    //////////////////////////////////////////////////////////////////////
-    // Bit and Physical Checksumming and QA of data
-    //////////////////////////////////////////////////////////////////////
-
-    inline void NerscGrid(GridBase *grid,NerscField &header)
-    {
-      assert(grid->_ndimension==4);
-      for(int d=0;d<4;d++) {
-	header.dimension[d] = grid->_fdimensions[d];
-      }
-      for(int d=0;d<4;d++) {
-	header.boundary[d] = std::string("PERIODIC");
-      }
-    }
-    template<class GaugeField>
-    inline void NerscStatistics(GaugeField & data,NerscField &header)
-    {
-      // How to convert data precision etc...
-      header.link_trace=Grid::QCD::WilsonLoops<PeriodicGimplR>::linkTrace(data);
-      header.plaquette =Grid::QCD::WilsonLoops<PeriodicGimplR>::avgPlaquette(data);
-    }
-
-    inline void NerscMachineCharacteristics(NerscField &header)
-    {
-      // Who
-      struct passwd *pw = getpwuid (getuid());
-      if (pw) header.creator = std::string(pw->pw_name); 
-
-      // When
-      std::time_t t = std::time(nullptr);
-      std::tm tm = *std::localtime(&t);
-      std::ostringstream oss; 
-      //  oss << std::put_time(&tm, "%c %Z");
-      header.creation_date = oss.str();
-      header.archive_date  = header.creation_date;
-
-      // What
-      struct utsname name;  uname(&name);
-      header.creator_hardware = std::string(name.nodename)+"-";
-      header.creator_hardware+= std::string(name.machine)+"-";
-      header.creator_hardware+= std::string(name.sysname)+"-";
-      header.creator_hardware+= std::string(name.release);
-
-    }
-    //////////////////////////////////////////////////////////////////////
-    // Utilities ; these are QCD aware
-    //////////////////////////////////////////////////////////////////////
-    inline void reconstruct3(LorentzColourMatrix & cm)
-    {
-      const int x=0;
-      const int y=1;
-      const int z=2;
-      for(int mu=0;mu<4;mu++){
-	cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
-	cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
-	cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
-      }
-    }
-
-    template<class fobj,class sobj>
-    struct NerscSimpleMunger{
-      void operator()(fobj &in, sobj &out) {
-        for (int mu = 0; mu < Nd; mu++) {
-          for (int i = 0; i < Nc; i++) {
-          for (int j = 0; j < Nc; j++) {
-	    out(mu)()(i, j) = in(mu)()(i, j);
-	  }}
-        }
-      };
-    };
-
-    template <class fobj, class sobj>
-    struct NerscSimpleUnmunger {
-
-      void operator()(sobj &in, fobj &out) {
-        for (int mu = 0; mu < Nd; mu++) {
-          for (int i = 0; i < Nc; i++) {
-          for (int j = 0; j < Nc; j++) {
-	    out(mu)()(i, j) = in(mu)()(i, j);
-	  }}
-        }
-      };
-    };
-
-    template<class fobj,class sobj>
-    struct Nersc3x2munger{
-
-      void operator() (fobj &in,sobj &out){
-	for(int mu=0;mu<4;mu++){
-	  for(int i=0;i<2;i++){
-	  for(int j=0;j<3;j++){
-	    out(mu)()(i,j) = in(mu)(i)(j);
-	  }}
-	}
-	reconstruct3(out);
-      }
-    };
-
-    template<class fobj,class sobj>
-    struct Nersc3x2unmunger{
-
-      void operator() (sobj &in,fobj &out){
-	for(int mu=0;mu<4;mu++){
-	  for(int i=0;i<2;i++){
-	  for(int j=0;j<3;j++){
-	    out(mu)(i)(j) = in(mu)()(i,j);
-	  }}
-	}
-      }
-    };
-
-
     ////////////////////////////////////////////////////////////////////////////////
     // Write and read from fstream; comput header offset for payload
     ////////////////////////////////////////////////////////////////////////////////
@@ -202,42 +45,17 @@ namespace Grid {
 	std::ofstream fout(file,std::ios::out);
       }
   
-#define dump_nersc_header(field, s)					\
-      s << "BEGIN_HEADER"      << std::endl;				\
-      s << "HDR_VERSION = "    << field.hdr_version    << std::endl;	\
-      s << "DATATYPE = "       << field.data_type      << std::endl;	\
-      s << "STORAGE_FORMAT = " << field.storage_format << std::endl;	\
-      for(int i=0;i<4;i++){						\
-	s << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ; \
-      }									\
-      s << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl; \
-      s << "PLAQUETTE  = " << std::setprecision(10) << field.plaquette  << std::endl; \
-      for(int i=0;i<4;i++){						\
-	s << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl;	\
-      }									\
-									\
-      s << "CHECKSUM = "<< std::hex << std::setw(10) << field.checksum << std::dec<<std::endl; \
-      s << "ENSEMBLE_ID = "     << field.ensemble_id      << std::endl;	\
-      s << "ENSEMBLE_LABEL = "  << field.ensemble_label   << std::endl;	\
-      s << "SEQUENCE_NUMBER = " << field.sequence_number  << std::endl;	\
-      s << "CREATOR = "         << field.creator          << std::endl;	\
-      s << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl;	\
-      s << "CREATION_DATE = "   << field.creation_date    << std::endl;	\
-      s << "ARCHIVE_DATE = "    << field.archive_date     << std::endl;	\
-      s << "FLOATING_POINT = "  << field.floating_point   << std::endl;	\
-      s << "END_HEADER"         << std::endl;
-  
-      static inline unsigned int writeHeader(NerscField &field,std::string file)
+      static inline unsigned int writeHeader(FieldMetaData &field,std::string file) // writes the header at offset 0; stores and returns the stream position after it as field.data_start
       {
       std::ofstream fout(file,std::ios::out|std::ios::in);
       fout.seekp(0,std::ios::beg);
-      dump_nersc_header(field, fout);
+      dump_meta_data(field, fout); // header text comes from dump_meta_data, which is defined outside this hunk
       field.data_start = fout.tellp();
       return field.data_start;
     }
 
       // for the header-reader
-      static inline int readHeader(std::string file,GridBase *grid,  NerscField &field)
+      static inline int readHeader(std::string file,GridBase *grid,  FieldMetaData &field)
       {
       int offset=0;
       std::map<std::string,std::string> header;
@@ -309,19 +127,21 @@ namespace Grid {
       return field.data_start;
     }
 
-      /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-      // Now the meat: the object readers
-      /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Now the meat: the object readers
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
     template<class vsimd>
-    static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,NerscField& header,std::string file)
+    static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
+					 FieldMetaData& header,
+					 std::string file)
     {
       typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
 
       GridBase *grid = Umu._grid;
       int offset = readHeader(file,Umu._grid,header);
 
-      NerscField clone(header);
+      FieldMetaData clone(header);
 
       std::string format(header.floating_point);
 
@@ -330,34 +150,38 @@ namespace Grid {
       int ieee64big = (format == std::string("IEEE64BIG"));
       int ieee64    = (format == std::string("IEEE64"));
 
-      uint32_t csum;
+      uint32_t nersc_csum,scidac_csuma,scidac_csumb;
       // depending on datatype, set up munger;
       // munger is a function of <floating point, Real, data_type>
       if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
 	if ( ieee32 || ieee32big ) {
-	  csum=BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3F> 
-	    (Umu,file,Nersc3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format);
+	  BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3F> 
+	    (Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
+	     nersc_csum,scidac_csuma,scidac_csumb);
 	}
 	if ( ieee64 || ieee64big ) {
-	  csum=BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3D> 
-	    (Umu,file,Nersc3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format);
+	  BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3D> 
+	    (Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
+	     nersc_csum,scidac_csuma,scidac_csumb);
 	}
       } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
 	if ( ieee32 || ieee32big ) {
-	  csum=BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
-	    (Umu,file,NerscSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format);
+	  BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
+	    (Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
+	     nersc_csum,scidac_csuma,scidac_csumb);
 	}
 	if ( ieee64 || ieee64big ) {
-	  csum=BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
-	    (Umu,file,NerscSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format);
+	  BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
+	    (Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format,
+	     nersc_csum,scidac_csuma,scidac_csumb);
 	}
       } else {
 	assert(0);
       }
 
-      NerscStatistics<GaugeField>(Umu,clone);
+      GaugeStatistics<GaugeField>(Umu,clone);
 
-      std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<            csum<< std::dec
+      std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec
 	       <<" header   "<<std::hex<<header.checksum<<std::dec <<std::endl;
       std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" plaquette "<<clone.plaquette
 	       <<" header    "<<header.plaquette<<std::endl;
@@ -369,30 +193,35 @@ namespace Grid {
 	std::cout << Umu[0]<<std::endl;
 	std::cout << Umu[1]<<std::endl;
       }
-      if ( csum != header.checksum ) { 
+      if ( nersc_csum != header.checksum ) { 
 	std::cerr << " checksum mismatch " << std::endl;
 	std::cerr << " plaqs " << clone.plaquette << " " << header.plaquette << std::endl;
 	std::cerr << " trace " << clone.link_trace<< " " << header.link_trace<< std::endl;
-	std::cerr << " csum  " <<std::hex<< csum << " " << header.checksum<< std::dec<< std::endl;
+	std::cerr << " nersc_csum  " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
 	exit(0);
       }
       assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
       assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
-      assert(csum == header.checksum );
+      assert(nersc_csum == header.checksum );
       
       std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
     }
 
       template<class vsimd>
-      static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,std::string file, int two_row,int bits32)
+      static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
+					    std::string file, 
+					    int two_row,
+					    int bits32)
       {
 	typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
 
 	typedef iLorentzColourMatrix<vsimd> vobj;
 	typedef typename vobj::scalar_object sobj;
 
+	FieldMetaData header;
+	///////////////////////////////////////////
 	// Following should become arguments
-	NerscField header;
+	///////////////////////////////////////////
 	header.sequence_number = 1;
 	header.ensemble_id     = "UKQCD";
 	header.ensemble_label  = "DWF";
@@ -402,32 +231,31 @@ namespace Grid {
   
 	GridBase *grid = Umu._grid;
 
-	NerscGrid(grid,header);
-	NerscStatistics<GaugeField>(Umu,header);
-	NerscMachineCharacteristics(header);
+	GridMetaData(grid,header);
+	assert(header.nd==4);
+	GaugeStatistics<GaugeField>(Umu,header);
+	MachineCharacteristics(header);
 
 	int offset;
   
 	truncate(file);
 
-	if ( two_row ) { 
-	  header.floating_point = std::string("IEEE64BIG");
-	  header.data_type      = std::string("4D_SU3_GAUGE");
-	  Nersc3x2unmunger<fobj2D,sobj> munge;
-	  offset = writeHeader(header,file);
-	  header.checksum=BinaryIO::writeLatticeObject<vobj,fobj2D>(Umu,file,munge,offset,header.floating_point);
-	  writeHeader(header,file);
-	} else { 
-	  header.floating_point = std::string("IEEE64BIG");
-	  header.data_type      = std::string("4D_SU3_GAUGE_3x3");
-	  NerscSimpleUnmunger<fobj3D,sobj> munge;
-	  offset = writeHeader(header,file);
-	  header.checksum=BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point);
-	  writeHeader(header,file);
-	}
+	// Sod it -- always write 3x3 double
+	header.floating_point = std::string("IEEE64BIG");
+	header.data_type      = std::string("4D_SU3_GAUGE_3x3");
+	GaugeSimpleUnmunger<fobj3D,sobj> munge;
+	offset = writeHeader(header,file);
+
+	uint32_t nersc_csum,scidac_csuma,scidac_csumb;
+	BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
+								  nersc_csum,scidac_csuma,scidac_csumb);
+	header.checksum = nersc_csum;
+	writeHeader(header,file);
+
 	std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum "
 		 <<std::hex<<header.checksum
 		 <<std::dec<<" plaq "<< header.plaquette <<std::endl;
+
       }
       ///////////////////////////////
       // RNG state
@@ -437,17 +265,18 @@ namespace Grid {
 	typedef typename GridParallelRNG::RngStateType RngStateType;
 
 	// Following should become arguments
-	NerscField header;
+	FieldMetaData header;
 	header.sequence_number = 1;
 	header.ensemble_id     = "UKQCD";
 	header.ensemble_label  = "DWF";
 
 	GridBase *grid = parallel._grid;
 
-	NerscGrid(grid,header);
+	GridMetaData(grid,header);
+	assert(header.nd==4);
 	header.link_trace=0.0;
 	header.plaquette=0.0;
-	NerscMachineCharacteristics(header);
+	MachineCharacteristics(header);
 
 	int offset;
   
@@ -466,7 +295,9 @@ namespace Grid {
 
 	truncate(file);
 	offset = writeHeader(header,file);
-	header.checksum = BinaryIO::writeRNG(serial,parallel,file,offset);
+	uint32_t nersc_csum,scidac_csuma,scidac_csumb;
+	BinaryIO::writeRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb);
+	header.checksum = nersc_csum;
 	offset = writeHeader(header,file);
 
 	std::cout<<GridLogMessage 
@@ -476,7 +307,7 @@ namespace Grid {
 
       }
     
-      static inline void readRNGState(GridSerialRNG &serial,GridParallelRNG & parallel,NerscField& header,std::string file)
+      static inline void readRNGState(GridSerialRNG &serial,GridParallelRNG & parallel,FieldMetaData& header,std::string file)
       {
 	typedef typename GridParallelRNG::RngStateType RngStateType;
 
@@ -484,7 +315,7 @@ namespace Grid {
 
 	int offset = readHeader(file,grid,header);
 
-	NerscField clone(header);
+	FieldMetaData clone(header);
 
 	std::string format(header.floating_point);
 	std::string data_type(header.data_type);
@@ -504,19 +335,19 @@ namespace Grid {
 
 	// depending on datatype, set up munger;
 	// munger is a function of <floating point, Real, data_type>
-	uint32_t csum=BinaryIO::readRNG(serial,parallel,file,offset);
+	uint32_t nersc_csum,scidac_csuma,scidac_csumb;
+	BinaryIO::readRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb);
 
-	if ( csum != header.checksum ) { 
-	  std::cerr << "checksum mismatch "<<std::hex<< csum <<" "<<header.checksum<<std::dec<<std::endl;
+	if ( nersc_csum != header.checksum ) { 
+	  std::cerr << "checksum mismatch "<<std::hex<< nersc_csum <<" "<<header.checksum<<std::dec<<std::endl;
 	  exit(0);
 	}
-	assert(csum == header.checksum );
+	assert(nersc_csum == header.checksum );
 
 	std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl;
       }
 
     };
 
-
   }}
 #endif
diff --git a/lib/qcd/hmc/checkpointers/BinaryCheckpointer.h b/lib/qcd/hmc/checkpointers/BinaryCheckpointer.h
index 6116a46c..59d655ad 100644
--- a/lib/qcd/hmc/checkpointers/BinaryCheckpointer.h
+++ b/lib/qcd/hmc/checkpointers/BinaryCheckpointer.h
@@ -62,36 +62,50 @@ class BinaryHmcCheckpointer : public BaseHmcCheckpointer<Impl> {
     fout.close();
   }
 
-  void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG,
-                          GridParallelRNG &pRNG) {
+  void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) { // every saveInterval-th trajectory: write RNG and configuration files, then log checksums
+
     if ((traj % Params.saveInterval) == 0) {
       std::string config, rng;
       this->build_filenames(traj, Params, config, rng);
 
+      uint32_t nersc_csum = 0; // zero-initialised so the log below never streams an indeterminate value
+      uint32_t scidac_csuma = 0;
+      uint32_t scidac_csumb = 0;
+      
       BinarySimpleUnmunger<sobj_double, sobj> munge;
       truncate(rng);
-      BinaryIO::writeRNG(sRNG, pRNG, rng, 0);
+      BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb); // same three variables are passed again to the configuration write below
       truncate(config);
-      uint32_t csum = BinaryIO::writeLatticeObject<vobj, sobj_double>(
-          U, config, munge, 0, Params.format);
+
+      BinaryIO::writeLatticeObject<vobj, sobj_double>(U, config, munge, 0, Params.format,
+						      nersc_csum,scidac_csuma,scidac_csumb); // presumably overwrites the RNG values, so the log shows configuration checksums - TODO confirm in BinaryIO
 
       std::cout << GridLogMessage << "Written Binary Configuration " << config
-                << " checksum " << std::hex << csum << std::dec << std::endl;
+                << " checksum " << std::hex 
+		<< nersc_csum   <<"/"
+		<< scidac_csuma   <<"/"
+		<< scidac_csumb 
+		<< std::dec << std::endl;
     }
+
   };
 
-  void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG,
-                         GridParallelRNG &pRNG) {
+  void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) { // reads the RNG and configuration files for trajectory traj, then logs checksums
     std::string config, rng;
     this->build_filenames(traj, Params, config, rng);
 
     BinarySimpleMunger<sobj_double, sobj> munge;
-    BinaryIO::readRNG(sRNG, pRNG, rng, 0);
-    uint32_t csum = BinaryIO::readLatticeObject<vobj, sobj_double>(
-        U, config, munge, 0, Params.format);
 
+    uint32_t nersc_csum = 0; // zero-initialised so the log below never streams an indeterminate value
+    uint32_t scidac_csuma = 0;
+    uint32_t scidac_csumb = 0;
+    BinaryIO::readRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb); // same three variables are passed again to the configuration read below
+    BinaryIO::readLatticeObject<vobj, sobj_double>(U, config, munge, 0, Params.format,
+						   nersc_csum,scidac_csuma,scidac_csumb); // presumably overwrites the RNG values - TODO confirm in BinaryIO
+    
     std::cout << GridLogMessage << "Read Binary Configuration " << config
-              << " checksum " << std::hex << csum << std::dec << std::endl;
+              << " checksums " << std::hex << nersc_csum<<"/"<<scidac_csuma<<"/"<<scidac_csumb 
+	      << std::dec << std::endl;
   };
 };
 }
diff --git a/lib/qcd/hmc/checkpointers/ILDGCheckpointer.h b/lib/qcd/hmc/checkpointers/ILDGCheckpointer.h
index 8b8f9f23..b72fc6f7 100644
--- a/lib/qcd/hmc/checkpointers/ILDGCheckpointer.h
+++ b/lib/qcd/hmc/checkpointers/ILDGCheckpointer.h
@@ -54,9 +54,9 @@ class ILDGHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
 
     // check here that the format is valid
     int ieee32big = (Params.format == std::string("IEEE32BIG"));
-    int ieee32 = (Params.format == std::string("IEEE32"));
+    int ieee32    = (Params.format == std::string("IEEE32"));
     int ieee64big = (Params.format == std::string("IEEE64BIG"));
-    int ieee64 = (Params.format == std::string("IEEE64"));
+    int ieee64    = (Params.format == std::string("IEEE64"));
 
     if (!(ieee64big || ieee32 || ieee32big || ieee64)) {
       std::cout << GridLogError << "Unrecognized file format " << Params.format
@@ -74,13 +74,17 @@ class ILDGHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
     if ((traj % Params.saveInterval) == 0) {
       std::string config, rng;
       this->build_filenames(traj, Params, config, rng);
-
-      ILDGIO IO(config, ILDGwrite);
-      BinaryIO::writeRNGSerial(sRNG, pRNG, rng, 0);
-      uint32_t csum = IO.writeConfiguration(U, Params.format);
+      
+      uint32_t nersc_csum,scidac_csuma,scidac_csumb;
+      BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
+      IldgIO::writeConfiguration(config,U, Params.format);
 
       std::cout << GridLogMessage << "Written ILDG Configuration on " << config
-                << " checksum " << std::hex << csum << std::dec << std::endl;
+                << " checksum " << std::hex 
+		<< nersc_csum<<"/"
+		<< scidac_csuma<<"/"
+		<< scidac_csumb
+		<< std::dec << std::endl;
     }
   };
 
@@ -89,12 +93,18 @@ class ILDGHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
     std::string config, rng;
     this->build_filenames(traj, Params, config, rng);
 
-    ILDGIO IO(config, ILDGread);
-    BinaryIO::readRNGSerial(sRNG, pRNG, rng, 0);
-    uint32_t csum = IO.readConfiguration(U);  // format from the header
+    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
+    BinaryIO::readRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
+
+    FieldMetaData header;
+    IldgIO::readConfiguration(config,U,header);  // format from the header
 
     std::cout << GridLogMessage << "Read ILDG Configuration from " << config
-              << " checksum " << std::hex << csum << std::dec << std::endl;
+              << " checksum " << std::hex 
+	      << nersc_csum<<"/"
+	      << scidac_csuma<<"/"
+	      << scidac_csumb
+	      << std::dec << std::endl;
   };
 };
 }
diff --git a/lib/qcd/hmc/checkpointers/NerscCheckpointer.h b/lib/qcd/hmc/checkpointers/NerscCheckpointer.h
index 395369a0..a4b1b480 100644
--- a/lib/qcd/hmc/checkpointers/NerscCheckpointer.h
+++ b/lib/qcd/hmc/checkpointers/NerscCheckpointer.h
@@ -70,7 +70,7 @@ class NerscHmcCheckpointer : public BaseHmcCheckpointer<Gimpl> {
     std::string config, rng;
     this->build_filenames(traj, Params, config, rng);
 
-    NerscField header;
+    FieldMetaData header;
     NerscIO::readRNGState(sRNG, pRNG, header, rng);
     NerscIO::readConfiguration(U, header, config);
   };
diff --git a/lib/qcd/utils/Utils.h b/lib/qcd/utils/Utils.h
index 61c81cb5..1786db54 100644
--- a/lib/qcd/utils/Utils.h
+++ b/lib/qcd/utils/Utils.h
@@ -12,7 +12,4 @@
 #include <Grid/qcd/utils/SUnAdjoint.h>
 #include <Grid/qcd/utils/SUnTwoIndex.h>
 
-
-
-
 #endif
diff --git a/lib/serialisation/XmlIO.cc b/lib/serialisation/XmlIO.cc
index b04263c9..a132a2f0 100644
--- a/lib/serialisation/XmlIO.cc
+++ b/lib/serialisation/XmlIO.cc
@@ -32,16 +32,21 @@ using namespace Grid;
 using namespace std;
 
 // Writer implementation ///////////////////////////////////////////////////////
-XmlWriter::XmlWriter(const string &fileName)
-: fileName_(fileName)
+XmlWriter::XmlWriter(const string &fileName, string toplev) : fileName_(fileName) // toplev names the top-level element; an empty string means none
 {
-  node_ = doc_.append_child();
-  node_.set_name("grid");
+  if ( toplev == std::string("") ) { // no wrapper element requested
+    node_=doc_; // node_ refers to the document itself
+  } else { 
+    node_=doc_.append_child(); // create the top-level element and make it the current node
+    node_.set_name(toplev.c_str());
+  }
 }
 
 XmlWriter::~XmlWriter(void)
 {
-  doc_.save_file(fileName_.c_str(), "  ");
+  if ( fileName_ != std::string("") ) { // an empty file name means nothing is written to disk
+    doc_.save_file(fileName_.c_str(), "  "); // second argument is the indentation string
+  }
 }
 
 void XmlWriter::push(const string &s)
@@ -53,21 +58,44 @@ void XmlWriter::pop(void)
 {
   node_ = node_.parent();
 }
-
-// Reader implementation ///////////////////////////////////////////////////////
-XmlReader::XmlReader(const string &fileName)
-: fileName_(fileName)
+std::string XmlWriter::XmlString(void) // returns the current document serialised into a string
 {
-  pugi::xml_parse_result result = doc_.load_file(fileName_.c_str());
-  
-  if ( !result )
-  {
+  std::ostringstream oss; 
+  doc_.save(oss); // pugixml streams the whole document into oss
+  return oss.str();
+}
+
+XmlReader::XmlReader(const char *xmlstring,string toplev) : fileName_("") // parses XML text held in memory; NOTE(review): a string-literal argument binds to this overload, not the file-name one
+{
+  pugi::xml_parse_result result;
+  result = doc_.load_string(xmlstring);
+  if ( !result ) { // parse failure: report and abort
     cerr << "XML error description: " << result.description() << "\n";
     cerr << "XML error offset     : " << result.offset        << "\n";
     abort();
   }
-  
-  node_ = doc_.child("grid");
+  if ( toplev == std::string("") ) { // no top-level element expected
+    node_ = doc_; // start at the document itself
+  } else { 
+    node_ = doc_.child(toplev.c_str()); // NOTE(review): no check here that the element was found
+  }
+}
+
+// Reader implementation ///////////////////////////////////////////////////////
+XmlReader::XmlReader(const string &fileName,string toplev) : fileName_(fileName) // loads and parses the XML file named fileName
+{
+  pugi::xml_parse_result result;
+  result = doc_.load_file(fileName_.c_str());
+  if ( !result ) { // load/parse failure: report and abort
+    cerr << "XML error description: " << result.description() << "\n";
+    cerr << "XML error offset     : " << result.offset        << "\n";
+    abort();
+  }
+  if ( toplev == std::string("") ) { // no top-level element expected
+    node_ = doc_; // start at the document itself
+  } else { 
+    node_ = doc_.child(toplev.c_str()); // NOTE(review): no check here that the element was found
+  }
 }
 
 bool XmlReader::push(const string &s)
diff --git a/lib/serialisation/XmlIO.h b/lib/serialisation/XmlIO.h
index f333b9aa..fcdbf1e4 100644
--- a/lib/serialisation/XmlIO.h
+++ b/lib/serialisation/XmlIO.h
@@ -44,10 +44,9 @@ namespace Grid
 {
   
   class XmlWriter: public Writer<XmlWriter>
-  {
-    
+  {    
   public:
-    XmlWriter(const std::string &fileName);
+    XmlWriter(const std::string &fileName,std::string toplev = std::string("grid") );
     virtual ~XmlWriter(void);
     void push(const std::string &s);
     void pop(void);
@@ -55,6 +54,7 @@ namespace Grid
     void writeDefault(const std::string &s, const U &x);
     template <typename U>
     void writeDefault(const std::string &s, const std::vector<U> &x);
+    std::string XmlString(void);
   private:
     pugi::xml_document doc_;
     pugi::xml_node     node_;
@@ -64,7 +64,8 @@ namespace Grid
   class XmlReader: public Reader<XmlReader>
   {
   public:
-    XmlReader(const std::string &fileName);
+    XmlReader(const char *xmlstring,std::string toplev = std::string("grid") );
+    XmlReader(const std::string &fileName,std::string toplev = std::string("grid") );
     virtual ~XmlReader(void) = default;
     bool push(const std::string &s);
     void pop(void);
@@ -118,7 +119,7 @@ namespace Grid
     std::string buf;
     
     readDefault(s, buf);
-    std::cout << s << "   " << buf << std::endl;
+    //    std::cout << s << "   " << buf << std::endl;
     fromString(output, buf);
   }
   
diff --git a/tests/IO/Test_nersc_io.cc b/tests/IO/Test_nersc_io.cc
index 14c6080d..ca04e623 100644
--- a/tests/IO/Test_nersc_io.cc
+++ b/tests/IO/Test_nersc_io.cc
@@ -64,8 +64,8 @@ int main (int argc, char ** argv)
   std::cout <<GridLogMessage<< " ...done "<<std::endl;
 
   std::string rfile("./ckpoint_rng.4000");
+  FieldMetaData rngheader;
   NerscIO::writeRNGState(sRNGa,pRNGa,rfile);
-  NerscField rngheader;
   NerscIO::readRNGState (sRNGb,pRNGb,rngheader,rfile);
 
   LatticeComplex tmpa(&Fine); random(pRNGa,tmpa);
@@ -87,7 +87,7 @@ int main (int argc, char ** argv)
   
   SU3::HotConfiguration(pRNGa,Umu);
 
-  NerscField header;
+  FieldMetaData header;
   std::string file("./ckpoint_lat.4000");
 
   int precision32 = 0;
diff --git a/tests/IO/Test_nersc_read.cc b/tests/IO/Test_nersc_read.cc
index 9e9280a1..66b2d62e 100644
--- a/tests/IO/Test_nersc_read.cc
+++ b/tests/IO/Test_nersc_read.cc
@@ -50,7 +50,7 @@ int main (int argc, char ** argv)
   LatticeGaugeField Umu(&Fine);
   std::vector<LatticeColourMatrix> U(4,&Fine);
   
-  NerscField header;
+  FieldMetaData header;
   std::string file("./ckpoint_lat");
   NerscIO::readConfiguration(Umu,header,file);
 
diff --git a/tests/IO/Test_serialisation.cc b/tests/IO/Test_serialisation.cc
index 7d911dfd..ceddee77 100644
--- a/tests/IO/Test_serialisation.cc
+++ b/tests/IO/Test_serialisation.cc
@@ -31,6 +31,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 
 
 using namespace Grid;
+using namespace Grid::QCD;
 
 GRID_SERIALIZABLE_ENUM(myenum, undef, red, 1, blue, 2, green, 3);
   
@@ -62,6 +63,7 @@ public:
   }
 };
 
+
 int16_t  i16 = 1;
 uint16_t u16 = 2;
 int32_t  i32 = 3;
@@ -237,7 +239,22 @@ int main(int argc,char **argv)
     std::cout << "Loaded (JSON) -----------------" << std::endl;
     std::cout << jcopy1 << std::endl << jveccopy1 << std::endl;
   }
-
+  
+  { 
+    ildgFormat format;
+    format.version   =1.0;
+    format.field     =std::string("su3gauge");
+    format.precision =32;
+    format.lx        =24;
+    format.ly        =24;
+    format.lz        =24;
+    format.lt        =48;
+    XmlWriter WR("ildg-format.xml","");
+    XmlWriter WRs("","");
+    write(WR,"ildgFormat",format);
+    write(WRs,"ildgFormat",format);
+    std::cout << " XmlString: " <<WRs.XmlString()<<std::endl;
+  }
 /* 
   // This is still work in progress
   {

From 56042f002c64c4caab769bdf1ddfd18147aee748 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 11 Jun 2017 23:19:20 +0100
Subject: [PATCH 068/177] New files

---
 lib/parallelIO/MetaData.h | 223 ++++++++++++++++++++++++++++++++++++++
 tests/IO/Test_ildg_io.cc  |  93 ++++++++++++++++
 2 files changed, 316 insertions(+)
 create mode 100644 lib/parallelIO/MetaData.h
 create mode 100644 tests/IO/Test_ildg_io.cc

diff --git a/lib/parallelIO/MetaData.h b/lib/parallelIO/MetaData.h
new file mode 100644
index 00000000..e91371b8
--- /dev/null
+++ b/lib/parallelIO/MetaData.h
@@ -0,0 +1,223 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/parallelIO/MetaData.h
+
+    Copyright (C) 2015
+
+
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <algorithm>
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <map>
+#include <unistd.h>
+#include <sys/utsname.h>
+#include <pwd.h>
+
+namespace Grid {
+  namespace QCD {
+
+    using namespace Grid;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    // header specification/interpretation
+    ////////////////////////////////////////////////////////////////////////////////
+    class FieldMetaData : Serializable {
+    public:
+
+      GRID_SERIALIZABLE_CLASS_MEMBERS(FieldMetaData,
+				      int, nd,
+				      std::vector<int>, dimension,
+				      std::vector<std::string>, boundary,
+				      int, data_start,
+				      std::string, hdr_version,
+				      std::string, storage_format,
+				      double, link_trace,
+				      double, plaquette,
+				      uint32_t, checksum,
+				      uint32_t, scidac_checksuma,
+				      uint32_t, scidac_checksumb,
+				      unsigned int, sequence_number,
+				      std::string, data_type,
+				      std::string, ensemble_id,
+				      std::string, ensemble_label,
+				      std::string, ildg_lfn,
+				      std::string, creator,
+				      std::string, creator_hardware,
+				      std::string, creation_date,
+				      std::string, archive_date,
+				      std::string, floating_point);
+    };
+
+    //////////////////////////////////////////////////////////////////////
+    // Bit and Physical Checksumming and QA of data
+    //////////////////////////////////////////////////////////////////////
+    inline void GridMetaData(GridBase *grid,FieldMetaData &header)
+    {
+      int nd = grid->_ndimension;
+      header.nd = nd;
+      header.dimension.resize(nd);
+      header.boundary.resize(nd);
+      for(int d=0;d<nd;d++) {
+	header.dimension[d] = grid->_fdimensions[d];
+      }
+      for(int d=0;d<nd;d++) {
+	header.boundary[d] = std::string("PERIODIC");
+      }
+    }
+    template<class GaugeField>
+    inline void GaugeStatistics(GaugeField & data,FieldMetaData &header)
+    {
+      // How to convert data precision etc...
+      header.link_trace=Grid::QCD::WilsonLoops<PeriodicGimplR>::linkTrace(data);
+      header.plaquette =Grid::QCD::WilsonLoops<PeriodicGimplR>::avgPlaquette(data);
+    }
+
+    inline void MachineCharacteristics(FieldMetaData &header)
+    {
+      // Who
+      struct passwd *pw = getpwuid (getuid());
+      if (pw) header.creator = std::string(pw->pw_name); 
+
+      // When
+      std::time_t t = std::time(nullptr);
+      std::tm tm = *std::localtime(&t);
+      std::ostringstream oss; 
+      oss << std::put_time(&tm, "%c %Z");
+      header.creation_date = oss.str();
+      header.archive_date  = header.creation_date;
+
+      // What
+      struct utsname name;  uname(&name);
+      header.creator_hardware = std::string(name.nodename)+"-";
+      header.creator_hardware+= std::string(name.machine)+"-";
+      header.creator_hardware+= std::string(name.sysname)+"-";
+      header.creator_hardware+= std::string(name.release);
+    }
+
+#define dump_meta_data(field, s)					\
+      s << "BEGIN_HEADER"      << std::endl;				\
+      s << "HDR_VERSION = "    << field.hdr_version    << std::endl;	\
+      s << "DATATYPE = "       << field.data_type      << std::endl;	\
+      s << "STORAGE_FORMAT = " << field.storage_format << std::endl;	\
+      for(int i=0;i<4;i++){						\
+	s << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ; \
+      }									\
+      s << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl; \
+      s << "PLAQUETTE  = " << std::setprecision(10) << field.plaquette  << std::endl; \
+      for(int i=0;i<4;i++){						\
+	s << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl;	\
+      }									\
+									\
+      s << "CHECKSUM (NERSC) = "<< std::hex << std::setw(10) << field.checksum << std::dec<<std::endl; \
+      s << "SCIDAC_CHECKSUMA = "<< std::hex << std::setw(10) << field.scidac_checksuma << std::dec<<std::endl; \
+      s << "SCIDAC_CHECKSUMB = "<< std::hex << std::setw(10) << field.scidac_checksumb << std::dec<<std::endl; \
+      s << "ENSEMBLE_ID = "     << field.ensemble_id      << std::endl;	\
+      s << "ENSEMBLE_LABEL = "  << field.ensemble_label   << std::endl;	\
+      s << "SEQUENCE_NUMBER = " << field.sequence_number  << std::endl;	\
+      s << "CREATOR = "         << field.creator          << std::endl;	\
+      s << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl;	\
+      s << "CREATION_DATE = "   << field.creation_date    << std::endl;	\
+      s << "ARCHIVE_DATE = "    << field.archive_date     << std::endl;	\
+      s << "FLOATING_POINT = "  << field.floating_point   << std::endl;	\
+      s << "END_HEADER"         << std::endl;
+
+
+    //////////////////////////////////////////////////////////////////////
+    // Utilities ; these are QCD aware
+    //////////////////////////////////////////////////////////////////////
+    inline void reconstruct3(LorentzColourMatrix & cm)
+    {
+      const int x=0;
+      const int y=1;
+      const int z=2;
+      for(int mu=0;mu<Nd;mu++){
+	cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
+	cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
+	cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
+      }
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    // Some data types for intermediate storage
+    ////////////////////////////////////////////////////////////////////////////////
+    template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, Nd >;
+
+    typedef iLorentzColour2x3<Complex>  LorentzColour2x3;
+    typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
+    typedef iLorentzColour2x3<ComplexD> LorentzColour2x3D;
+
+    template<class fobj,class sobj>
+    struct GaugeSimpleMunger{
+      void operator()(fobj &in, sobj &out) {
+        for (int mu = 0; mu < Nd; mu++) {
+          for (int i = 0; i < Nc; i++) {
+          for (int j = 0; j < Nc; j++) {
+	    out(mu)()(i, j) = in(mu)()(i, j);
+	  }}
+        }
+      };
+    };
+
+    template <class fobj, class sobj>
+    struct GaugeSimpleUnmunger {
+
+      void operator()(sobj &in, fobj &out) {
+        for (int mu = 0; mu < Nd; mu++) {
+          for (int i = 0; i < Nc; i++) {
+          for (int j = 0; j < Nc; j++) {
+	    out(mu)()(i, j) = in(mu)()(i, j);
+	  }}
+        }
+      };
+    };
+
+    template<class fobj,class sobj>
+    struct Gauge3x2munger{
+      void operator() (fobj &in,sobj &out){
+	for(int mu=0;mu<Nd;mu++){
+	  for(int i=0;i<2;i++){
+	  for(int j=0;j<3;j++){
+	    out(mu)()(i,j) = in(mu)(i)(j);
+	  }}
+	}
+	reconstruct3(out);
+      }
+    };
+
+    template<class fobj,class sobj>
+    struct Gauge3x2unmunger{
+      void operator() (sobj &in,fobj &out){
+	for(int mu=0;mu<Nd;mu++){
+	  for(int i=0;i<2;i++){
+	  for(int j=0;j<3;j++){
+	    out(mu)(i)(j) = in(mu)()(i,j);
+	  }}
+	}
+      }
+    };
+
+  }
+}
diff --git a/tests/IO/Test_ildg_io.cc b/tests/IO/Test_ildg_io.cc
new file mode 100644
index 00000000..1408c638
--- /dev/null
+++ b/tests/IO/Test_ildg_io.cc
@@ -0,0 +1,93 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/IO/Test_ildg_io.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  std::cout <<GridLogMessage<< " main "<<std::endl;
+
+  std::vector<int> simd_layout = GridDefaultSimd(4,vComplex::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+  //std::vector<int> latt_size  ({48,48,48,96});
+  //std::vector<int> latt_size  ({32,32,32,32});
+  std::vector<int> latt_size  ({16,16,16,32});
+  std::vector<int> clatt_size  ({4,4,4,8});
+  int orthodir=3;
+  int orthosz =latt_size[orthodir];
+    
+  GridCartesian     Fine(latt_size,simd_layout,mpi_layout);
+  GridCartesian     Coarse(clatt_size,simd_layout,mpi_layout);
+
+
+  GridParallelRNG   pRNGa(&Fine);
+  GridParallelRNG   pRNGb(&Fine);
+  GridSerialRNG     sRNGa;
+  GridSerialRNG     sRNGb;
+
+  std::cout <<GridLogMessage<< " seeding... "<<std::endl;
+  pRNGa.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+  sRNGa.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+  std::cout <<GridLogMessage<< " ...done "<<std::endl;
+
+  LatticeGaugeField Umu(&Fine);
+  LatticeGaugeField Umu_diff(&Fine);
+  LatticeGaugeField Umu_saved(&Fine);
+
+  std::vector<LatticeColourMatrix> U(4,&Fine);
+  
+  SU3::HotConfiguration(pRNGa,Umu);
+
+
+  FieldMetaData header;
+
+  std::cout <<GridLogMessage<<"**************************************"<<std::endl;
+  std::cout <<GridLogMessage<<"** Writing out  ILDG conf    *********"<<std::endl;
+  std::cout <<GridLogMessage<<"**************************************"<<std::endl;
+  std::string file("./ckpoint_ildg.4000");
+  IldgIO::writeConfiguration(file,Umu, "IEEE64BIG");
+
+  Umu_saved = Umu;
+  std::cout <<GridLogMessage<<"**************************************"<<std::endl;
+  std::cout <<GridLogMessage<<"** Reading back ILDG conf    *********"<<std::endl;
+  std::cout <<GridLogMessage<<"**************************************"<<std::endl;
+  IldgIO::readConfiguration(file,Umu,header);
+  Umu_diff = Umu - Umu_saved;
+
+  std::cout <<GridLogMessage<< "norm2 Gauge Diff = "<<norm2(Umu_diff)<<std::endl;
+
+  Grid_finalize();
+}

From eaac0044b572c953b085f9292ab0c012388bb130 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Mon, 12 Jun 2017 00:20:49 +0100
Subject: [PATCH 069/177] Compile fixes

---
 lib/parallelIO/IldgIO.h   | 2 +-
 lib/parallelIO/MetaData.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/parallelIO/IldgIO.h b/lib/parallelIO/IldgIO.h
index 237edf43..df840fb2 100644
--- a/lib/parallelIO/IldgIO.h
+++ b/lib/parallelIO/IldgIO.h
@@ -242,7 +242,7 @@ class IldgIO : public BinaryIO {
     while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
 
       uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration)
-
+      
       //////////////////////////////////////////////////////////////////
       // If not BINARY_DATA read a string and parse
       //////////////////////////////////////////////////////////////////
diff --git a/lib/parallelIO/MetaData.h b/lib/parallelIO/MetaData.h
index e91371b8..85a6e0b9 100644
--- a/lib/parallelIO/MetaData.h
+++ b/lib/parallelIO/MetaData.h
@@ -103,9 +103,9 @@ namespace Grid {
 
       // When
       std::time_t t = std::time(nullptr);
-      std::tm tm = *std::localtime(&t);
+      std::tm tm_ = *std::localtime(&t);
       std::ostringstream oss; 
-      oss << std::put_time(&tm, "%c %Z");
+      //      oss << std::put_time(&tm_, "%c %Z");
       header.creation_date = oss.str();
       header.archive_date  = header.creation_date;
 

From 769ad578f594dc456c89f240463cffaa7c15a2ca Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Mon, 12 Jun 2017 00:41:21 +0100
Subject: [PATCH 070/177] Odd new error on G++ 49 on travis

---
 lib/serialisation/MacroMagic.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/serialisation/MacroMagic.h b/lib/serialisation/MacroMagic.h
index a864989c..04f1b401 100644
--- a/lib/serialisation/MacroMagic.h
+++ b/lib/serialisation/MacroMagic.h
@@ -110,7 +110,7 @@ THE SOFTWARE.
 
 #define GRID_MACRO_MEMBER(A,B)        A B;
 #define GRID_MACRO_COMP_MEMBER(A,B) result = (result and (lhs. B == rhs. B));
-#define GRID_MACRO_OS_WRITE_MEMBER(A,B) os<< #A <<" "#B <<" = "<< obj. B <<" ; " <<std::endl;
+#define GRID_MACRO_OS_WRITE_MEMBER(A,B) os<< #A <<" " #B << " = " << obj. B << " ; " <<std::endl;
 #define GRID_MACRO_READ_MEMBER(A,B) Grid::read(RD,#B,obj. B);
 #define GRID_MACRO_WRITE_MEMBER(A,B) Grid::write(WR,#B,obj. B);
 

From a16b1e134ea9f7d7353648aa09a64ffefc275ab9 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Tue, 13 Jun 2017 10:48:43 +0100
Subject: [PATCH 071/177] gcc 4.9 fix

---
 lib/parallelIO/MetaData.h      | 8 +++++---
 tests/IO/Test_serialisation.cc | 5 ++---
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/lib/parallelIO/MetaData.h b/lib/parallelIO/MetaData.h
index 85a6e0b9..1bad07f2 100644
--- a/lib/parallelIO/MetaData.h
+++ b/lib/parallelIO/MetaData.h
@@ -37,9 +37,6 @@
 #include <pwd.h>
 
 namespace Grid {
-  namespace QCD {
-
-    using namespace Grid;
 
     ////////////////////////////////////////////////////////////////////////////////
     // header specification/interpretation
@@ -71,6 +68,11 @@ namespace Grid {
 				      std::string, floating_point);
     };
 
+  namespace QCD {
+
+    using namespace Grid;
+
+
     //////////////////////////////////////////////////////////////////////
     // Bit and Physical Checksumming and QA of data
     //////////////////////////////////////////////////////////////////////
diff --git a/tests/IO/Test_serialisation.cc b/tests/IO/Test_serialisation.cc
index ceddee77..6d918787 100644
--- a/tests/IO/Test_serialisation.cc
+++ b/tests/IO/Test_serialisation.cc
@@ -45,8 +45,8 @@ public:
                           double, y,
                           bool , b,
                           std::vector<double>, array,
-                          std::vector<std::vector<double>>, twodimarray,
-                          std::vector<std::vector<std::vector<Complex>>>, cmplx3darray
+                          std::vector<std::vector<double> >, twodimarray,
+                          std::vector<std::vector<std::vector<Complex> > >, cmplx3darray
                           );
   myclass() {}
   myclass(int i)
@@ -63,7 +63,6 @@ public:
   }
 };
 
-
 int16_t  i16 = 1;
 uint16_t u16 = 2;
 int32_t  i32 = 3;

From 0494feec98f1c53b2ac20cab2a4e159637ade84f Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Tue, 13 Jun 2017 12:00:23 +0100
Subject: [PATCH 072/177] Libz dependency

---
 configure.ac            | 4 ++++
 lib/parallelIO/IldgIO.h | 3 +++
 2 files changed, 7 insertions(+)

diff --git a/configure.ac b/configure.ac
index 2fc9dfec..f7284d48 100644
--- a/configure.ac
+++ b/configure.ac
@@ -184,6 +184,10 @@ AC_SEARCH_LIBS([limeCreateReader], [lime],
 In order to use ILGG file format please install or provide the correct path to your installation
 Info at: http://usqcd.jlab.org/usqcd-docs/c-lime/)])
 
+AC_SEARCH_LIBS([crc32], [z],
+               [AC_DEFINE([HAVE_ZLIB], [1], [Define to 1 if you have the `LIBZ' library])]
+               [have_zlib=true],
+	       [AC_MSG_ERROR(zlib library was not found in your system.)])
 
 AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp],
                [AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])]
diff --git a/lib/parallelIO/IldgIO.h b/lib/parallelIO/IldgIO.h
index df840fb2..a6810b0d 100644
--- a/lib/parallelIO/IldgIO.h
+++ b/lib/parallelIO/IldgIO.h
@@ -379,6 +379,9 @@ class IldgIO : public BinaryIO {
       assert( scidac_csuma ==FieldMetaData_.scidac_checksuma);
       assert( scidac_csumb ==FieldMetaData_.scidac_checksumb);
       std::cout << GridLogMessage<<"SciDAC checksums match " << std::endl;
+    } else { 
+      std::cout << GridLogWarning<<"SciDAC checksums not found. This is unsafe. " << std::endl;
+      //Could choose to fail ?
     }
 
     if ( found_FieldMetaData || found_usqcdInfo ) {

From 91199a8ea0907ff1b074066ae566a318b803e437 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Tue, 13 Jun 2017 12:21:29 +0100
Subject: [PATCH 073/177] openmpi is not const safe

---
 lib/parallelIO/BinaryIO.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h
index bc3da38b..7226ccba 100644
--- a/lib/parallelIO/BinaryIO.h
+++ b/lib/parallelIO/BinaryIO.h
@@ -376,7 +376,7 @@ class BinaryIO {
       if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
 #ifdef USE_MPI_IO
 	std::cout<< GridLogMessage<< "MPI read I/O "<< file<< std::endl;
-	ierr=MPI_File_open(grid->communicator, file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);    assert(ierr==0);
+	ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);    assert(ierr==0);
 	ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);    assert(ierr==0);
 	ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status);    assert(ierr==0);
 	MPI_File_close(&fh);
@@ -426,7 +426,7 @@ class BinaryIO {
       if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
 #ifdef USE_MPI_IO
 	std::cout<< GridLogMessage<< "MPI write I/O "<< file<< std::endl;
-	ierr=MPI_File_open(grid->communicator, file.c_str(), MPI_MODE_RDWR|MPI_MODE_CREATE,MPI_INFO_NULL, &fh); assert(ierr==0);
+	ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDWR|MPI_MODE_CREATE,MPI_INFO_NULL, &fh); assert(ierr==0);
 	ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);                        assert(ierr==0);
 	ierr=MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);                                        assert(ierr==0);
 	MPI_File_close(&fh);

From e7564f8330eceac22e73b61cca4110bdb2ea5b09 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Tue, 13 Jun 2017 12:22:50 +0100
Subject: [PATCH 074/177] Starting a test for reading an ILDG file.

---
 tests/IO/Test_ildg_read.cc | 112 +++++++++++++++++++++++++++++++++++++
 1 file changed, 112 insertions(+)
 create mode 100644 tests/IO/Test_ildg_read.cc

diff --git a/tests/IO/Test_ildg_read.cc b/tests/IO/Test_ildg_read.cc
new file mode 100644
index 00000000..70a46dbf
--- /dev/null
+++ b/tests/IO/Test_ildg_read.cc
@@ -0,0 +1,112 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/IO/Test_ildg_read.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+
+  std::vector<int> simd_layout = GridDefaultSimd(4,vComplex::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+  std::vector<int> latt_size = GridDefaultLatt();
+  int orthodir=3;
+  int orthosz =latt_size[orthodir];
+    
+  GridCartesian     Fine(latt_size,simd_layout,mpi_layout);
+
+  LatticeGaugeField Umu(&Fine);
+  std::vector<LatticeColourMatrix> U(4,&Fine);
+  
+  FieldMetaData header;
+  std::string file("./ckpoint_lat");
+  IldgIO::readConfiguration(file,Umu,header);
+
+  for(int mu=0;mu<Nd;mu++){
+    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
+  }
+
+  // Painful ; fix syntactical niceness
+  LatticeComplex LinkTrace(&Fine);
+  LinkTrace=zero;
+  for(int mu=0;mu<Nd;mu++){
+    LinkTrace = LinkTrace + trace(U[mu]);
+  }
+
+  // (1+2+3)=6 = N(N-1)/2 terms
+  LatticeComplex Plaq(&Fine);
+
+  Plaq = zero;
+
+  for(int mu=1;mu<Nd;mu++){
+    for(int nu=0;nu<mu;nu++){
+      Plaq = Plaq + trace(U[mu]*Cshift(U[nu],mu,1)*adj(Cshift(U[mu],nu,1))*adj(U[nu]));
+    }
+  }
+
+  double vol = Fine.gSites();
+  Complex PlaqScale(1.0/vol/6.0/3.0);
+  std::cout<<GridLogMessage <<"PlaqScale" << PlaqScale<<std::endl;
+
+  std::vector<TComplex> Plaq_T(orthosz);
+  sliceSum(Plaq,Plaq_T,Nd-1);
+  int Nt = Plaq_T.size();
+
+  TComplex Plaq_T_sum; 
+  Plaq_T_sum=zero;
+  for(int t=0;t<Nt;t++){
+    Plaq_T_sum = Plaq_T_sum+Plaq_T[t];
+    Complex Pt=TensorRemove(Plaq_T[t]);
+    std::cout<<GridLogMessage << "sliced ["<<t<<"]" <<Pt*PlaqScale*Real(Nt)<<std::endl;
+  }
+
+  {
+    Complex Pt = TensorRemove(Plaq_T_sum);
+    std::cout<<GridLogMessage << "total " <<Pt*PlaqScale<<std::endl;
+  }  
+
+
+  TComplex Tp = sum(Plaq);
+  Complex p  = TensorRemove(Tp);
+  std::cout<<GridLogMessage << "calculated plaquettes " <<p*PlaqScale<<std::endl;
+
+
+  Complex LinkTraceScale(1.0/vol/4.0/3.0);
+  TComplex Tl = sum(LinkTrace);
+  Complex l  = TensorRemove(Tl);
+  std::cout<<GridLogMessage << "calculated link trace " <<l*LinkTraceScale<<std::endl;
+
+  Grid_finalize();
+}

From 12ccc73cf512f09cebda7a8fd0dc13fdce0d9e3e Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Wed, 14 Jun 2017 05:19:17 +0100
Subject: [PATCH 075/177] Serialisation no compile fix

---
 tests/IO/Test_serialisation.cc | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/tests/IO/Test_serialisation.cc b/tests/IO/Test_serialisation.cc
index 6d918787..d5b52044 100644
--- a/tests/IO/Test_serialisation.cc
+++ b/tests/IO/Test_serialisation.cc
@@ -239,21 +239,6 @@ int main(int argc,char **argv)
     std::cout << jcopy1 << std::endl << jveccopy1 << std::endl;
   }
   
-  { 
-    ildgFormat format;
-    format.version   =1.0;
-    format.field     =std::string("su3gauge");
-    format.precision =32;
-    format.lx        =24;
-    format.ly        =24;
-    format.lz        =24;
-    format.lt        =48;
-    XmlWriter WR("ildg-format.xml","");
-    XmlWriter WRs("","");
-    write(WR,"ildgFormat",format);
-    write(WRs,"ildgFormat",format);
-    std::cout << " XmlString: " <<WRs.XmlString()<<std::endl;
-  }
 /* 
   // This is still work in progress
   {

From 2ad54c5a0222946571f133837f1a2cfaef807181 Mon Sep 17 00:00:00 2001
From: Lanny91 <andrew.lawson@gmail.com>
Date: Wed, 14 Jun 2017 10:53:39 +0100
Subject: [PATCH 076/177] QPX exchange support

---
 lib/simd/Grid_qpx.h | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h
index cbca9118..9fc8ef3c 100644
--- a/lib/simd/Grid_qpx.h
+++ b/lib/simd/Grid_qpx.h
@@ -375,6 +375,49 @@ namespace Optimization {
     FLOAT_WRAP_2(operator(), inline)
   };
 
+  //////////////////////////////////////////////
+  // Exchange support
+#define FLOAT_WRAP_EXCHANGE(fn) \
+  static inline void fn(vector4float &out1, vector4float &out2, \
+                        vector4float in1,  vector4float in2) \
+  { \
+    vector4double out1d, out2d, in1d, in2d; \
+    in1d  = Vset()(in1);   \
+    in2d  = Vset()(in2);   \
+    fn(out1d, out2d, in1d, in2d); \
+    Vstore()(out1d, out1); \
+    Vstore()(out2d, out2); \
+  }
+
+  struct Exchange{
+
+    // double precision
+    static inline void Exchange0(vector4double &out1, vector4double &out2,
+                                 vector4double in1,  vector4double in2) {
+      out1 = vec_perm(in1, in2, vec_gpci(0145));
+      out2 = vec_perm(in1, in2, vec_gpci(02367));
+    }
+    static inline void Exchange1(vector4double &out1, vector4double &out2,
+                                 vector4double in1,  vector4double in2) {
+      out1 = vec_perm(in1, in2, vec_gpci(0426));
+      out2 = vec_perm(in1, in2, vec_gpci(01537));
+    }
+    static inline void Exchange2(vector4double &out1, vector4double &out2,
+                                 vector4double in1,  vector4double in2) {
+      assert(0);
+    }
+    static inline void Exchange3(vector4double &out1, vector4double &out2,
+                                 vector4double in1,  vector4double in2) {
+      assert(0);
+    }
+
+    // single precision
+    FLOAT_WRAP_EXCHANGE(Exchange0);
+    FLOAT_WRAP_EXCHANGE(Exchange1);
+    FLOAT_WRAP_EXCHANGE(Exchange2);
+    FLOAT_WRAP_EXCHANGE(Exchange3);
+  };
+
   struct Permute{
     //Complex double
     static inline vector4double Permute0(vector4double v){ //0123 -> 2301

From 735cbdb983703fd3ffadc6133d792b4d058a897b Mon Sep 17 00:00:00 2001
From: Lanny91 <andrew.lawson@gmail.com>
Date: Wed, 14 Jun 2017 10:55:10 +0100
Subject: [PATCH 077/177] QPX Integer reduction (+ integer reduction test)

---
 lib/simd/Grid_qpx.h | 11 +++++++----
 tests/Test_simd.cc  | 47 +++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h
index 9fc8ef3c..00dbace5 100644
--- a/lib/simd/Grid_qpx.h
+++ b/lib/simd/Grid_qpx.h
@@ -540,10 +540,13 @@ namespace Optimization {
   
   //Integer Reduce
   template<>
-  inline Integer Reduce<Integer, int>::operator()(int in){
-    // FIXME unimplemented
-    printf("Reduce : Missing integer implementation -> FIX\n");
-    assert(0);
+  inline Integer Reduce<Integer, veci>::operator()(veci in){
+    Integer a = 0;
+    for (unsigned int i = 0; i < W<Integer>::r; ++i)
+    {
+        a += in.v[i];
+    }
+    return a;
   }
 }
 
diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc
index c0bbef1d..b2e8d68e 100644
--- a/tests/Test_simd.cc
+++ b/tests/Test_simd.cc
@@ -183,8 +183,6 @@ void IntTester(const functor &func)
 {
   typedef Integer  scal;
   typedef vInteger vec;
-  GridSerialRNG          sRNG;
-  sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
   int Nsimd = vec::Nsimd();
 
@@ -287,6 +285,50 @@ void ReductionTester(const functor &func)
 }
 
 
+template<class reduced,class scal, class vec,class functor > 
+void IntReductionTester(const functor &func)
+{
+  int Nsimd = vec::Nsimd();
+
+  std::vector<scal> input1(Nsimd);
+  std::vector<scal> input2(Nsimd);
+  reduced result(0);
+  reduced reference(0);
+  reduced tmp;
+
+  std::vector<vec,alignedAllocator<vec> > buf(3);
+  vec & v_input1 = buf[0];
+  vec & v_input2 = buf[1];
+
+  for(int i=0;i<Nsimd;i++){
+    input1[i] = (i + 1) * 30;
+    input2[i] = (i + 1) * 20;
+  }
+
+  merge<vec,scal>(v_input1,input1);
+  merge<vec,scal>(v_input2,input2);
+
+  func.template vfunc<reduced,vec>(result,v_input1,v_input2);
+
+  for(int i=0;i<Nsimd;i++) {
+    func.template sfunc<reduced,scal>(tmp,input1[i],input2[i]);
+    reference+=tmp;
+  }
+
+  std::cout<<GridLogMessage << " " << func.name()<<std::endl;
+
+  int ok=0;
+  if ( reference-result != 0 ){
+    std::cout<<GridLogMessage<< "*****" << std::endl;
+    std::cout<<GridLogMessage<< reference-result << " " <<reference<< " " << result<<std::endl;
+    ok++;
+  }
+  if ( ok==0 ) {
+    std::cout<<GridLogMessage << " OK!" <<std::endl;
+  }
+  assert(ok==0);
+}
+
 
 class funcPermute {
 public:
@@ -691,6 +733,7 @@ int main (int argc, char ** argv)
   IntTester(funcPlus());
   IntTester(funcMinus());
   IntTester(funcTimes());
+  IntReductionTester<Integer, Integer, vInteger>(funcReduce());
 
   std::cout<<GridLogMessage << "==================================="<<  std::endl;
   std::cout<<GridLogMessage << "Testing precisionChange            "<<  std::endl;

From 07b2c1b25338008410c772867c0b1b43ef2c7248 Mon Sep 17 00:00:00 2001
From: Lanny91 <andrew.lawson@gmail.com>
Date: Fri, 16 Jun 2017 15:04:26 +0100
Subject: [PATCH 078/177] Placeholder precision change functions to allow Grid
 to compile with QPX (warning: no actual functionality)

---
 lib/simd/Grid_qpx.h | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h
index 00dbace5..8de7bde8 100644
--- a/lib/simd/Grid_qpx.h
+++ b/lib/simd/Grid_qpx.h
@@ -374,6 +374,41 @@ namespace Optimization {
     // Complex float
     FLOAT_WRAP_2(operator(), inline)
   };
+#define USE_FP16
+  struct PrecisionChange { // QPX placeholders: every conversion below logs an error and then hits assert(0)
+    static inline vech StoH (const vector4float &a, const vector4float &b) {
+      vech ret; // never assigned; only returned if the assert is compiled out
+      std::cout << GridLogError << "QPX single to half precision conversion not yet supported." << std::endl;
+      assert(0);
+      return ret;
+    }
+    static inline void  HtoS (vech h, vector4float &sa, vector4float &sb) { // outputs sa/sb are left untouched
+      std::cout << GridLogError << "QPX half to single precision conversion not yet supported." << std::endl;
+      assert(0);
+    }
+    static inline vector4float DtoS (vector4double a, vector4double b) {
+      vector4float ret; // never assigned; only returned if the assert is compiled out
+      std::cout << GridLogError << "QPX double to single precision conversion not yet supported." << std::endl;
+      assert(0);
+      return ret;
+    }
+    static inline void StoD (vector4float s, vector4double &a, vector4double &b) { // outputs a/b are left untouched
+      std::cout << GridLogError << "QPX single to double precision conversion not yet supported." << std::endl;
+      assert(0);
+    }
+    static inline vech DtoH (vector4double a, vector4double b, 
+                             vector4double c, vector4double d) {
+      vech ret; // never assigned; only returned if the assert is compiled out
+      std::cout << GridLogError << "QPX double to half precision conversion not yet supported." << std::endl;
+      assert(0);
+      return ret;
+    }
+    static inline void HtoD (vech h, vector4double &a, vector4double &b, 
+                                     vector4double &c, vector4double &d) { // outputs a..d are left untouched
+      std::cout << GridLogError << "QPX half to double precision conversion not yet supported." << std::endl;
+      assert(0);
+    }
+  };
 
   //////////////////////////////////////////////
   // Exchange support
@@ -552,6 +587,7 @@ namespace Optimization {
 
 ////////////////////////////////////////////////////////////////////////////////
 // Here assign types
+typedef Optimization::vech         SIMD_Htype;  // Half precision type
 typedef Optimization::vector4float SIMD_Ftype;  // Single precision type
 typedef vector4double              SIMD_Dtype; // Double precision type
 typedef Optimization::veci         SIMD_Itype; // Integer type

From a833f88c3237f9c941e9eb79ad459d0e260d2a2b Mon Sep 17 00:00:00 2001
From: Lanny91 <andrew.lawson@gmail.com>
Date: Fri, 16 Jun 2017 15:58:47 +0100
Subject: [PATCH 079/177] Added missing SIMD integer reduction implementation
 for AVX, AVX-512, SSE4, IMCI

---
 lib/simd/Grid_avx.h    | 25 ++++++++++++++++++++++---
 lib/simd/Grid_avx512.h | 22 +++++++++++++++++++---
 lib/simd/Grid_imci.h   |  4 +---
 lib/simd/Grid_sse4.h   |  6 +++---
 4 files changed, 45 insertions(+), 12 deletions(-)

diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h
index 52be9c05..57d9064d 100644
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -701,9 +701,28 @@ namespace Optimization {
   //Integer Reduce
   template<>
   inline Integer Reduce<Integer, __m256i>::operator()(__m256i in){
-    // FIXME unimplemented
-    printf("Reduce : Missing integer implementation -> FIX\n");
-    assert(0);
+    __m128i ret; // after either branch, element 0 holds the sum of all eight 32-bit lanes
+#if defined (AVX2)
+    // AVX2 horizontal adds operate within each 128-bit half of the register;
+    // use an SSE add to combine the two half sums for the result.
+    __m256i v1, v2;
+    __m128i u1, u2;
+    v1  = _mm256_hadd_epi32(in, in);
+    v2  = _mm256_hadd_epi32(v1, v1);
+    u1  = _mm256_castsi256_si128(v2);      // lower half
+    u2  = _mm256_extracti128_si256(v2, 1); // upper half
+    ret = _mm_add_epi32(u1, u2);           // operands are __m128i, so the 128-bit add is required
+#else
+    // No AVX horizontal add; extract upper and lower halves of register & use
+    // SSE intrinsics.
+    __m128i u1, u2, u3;
+    u1  = _mm256_extractf128_si256(in, 0); // lower half
+    u2  = _mm256_extractf128_si256(in, 1); // upper half
+    u3  = _mm_add_epi32(u1, u2);
+    u1  = _mm_hadd_epi32(u3, u3);
+    ret = _mm_hadd_epi32(u1, u1);
+#endif
+    return _mm_cvtsi128_si32(ret);
   }
 
 }
diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h
index ba054665..458a8f7c 100644
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -543,6 +543,24 @@ namespace Optimization {
      u512d conv; conv.v = v1;
      return conv.f[0];
   }
+  
+  //Integer Reduce
+  template<>
+  inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
+    // No full vector reduce, use AVX to add upper and lower halves of register
+    // and perform AVX reduction. Returns the sum of all sixteen 32-bit lanes.
+    __m256i v1, v2, v3;
+    __m128i u1, u2, ret;
+    v1  = _mm512_castsi512_si256(in);       // lower half
+    v2  = _mm512_extracti32x8_epi32(in, 1); // upper half
+    v3  = _mm256_add_epi32(v1, v2);
+    v1  = _mm256_hadd_epi32(v3, v3);
+    v2  = _mm256_hadd_epi32(v1, v1);
+    u1  = _mm256_castsi256_si128(v2);       // lower half
+    u2  = _mm256_extracti128_si256(v2, 1);  // upper half
+    ret = _mm_add_epi32(u1, u2);            // operands are __m128i, so the 128-bit add is required
+    return _mm_cvtsi128_si32(ret);
+  }
 #else
   //Complex float Reduce
   template<>
@@ -570,9 +588,7 @@ namespace Optimization {
   //Integer Reduce
   template<>
   inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
-    // FIXME unimplemented
-    printf("Reduce : Missing integer implementation -> FIX\n");
-    assert(0);
+    return _mm512_reduce_add_epi32(in); // sum of all packed 32-bit integers in the register
   }
 #endif
   
diff --git a/lib/simd/Grid_imci.h b/lib/simd/Grid_imci.h
index 173e57d8..a1dae565 100644
--- a/lib/simd/Grid_imci.h
+++ b/lib/simd/Grid_imci.h
@@ -401,9 +401,7 @@ namespace Optimization {
   //Integer Reduce
   template<>
   inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
-    // FIXME unimplemented
-    printf("Reduce : Missing integer implementation -> FIX\n");
-    assert(0);
+    return _mm512_reduce_add_epi32(in); // sum of all packed 32-bit integers in the register
   }
   
   
diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h
index 2fb2df76..0b1f9ffb 100644
--- a/lib/simd/Grid_sse4.h
+++ b/lib/simd/Grid_sse4.h
@@ -570,9 +570,9 @@ namespace Optimization {
   //Integer Reduce
   template<>
   inline Integer Reduce<Integer, __m128i>::operator()(__m128i in){
-    // FIXME unimplemented
-   printf("Reduce : Missing integer implementation -> FIX\n");
-    assert(0);
+    __m128i v1 = _mm_hadd_epi32(in, in); // pairwise sums: (a0+a1, a2+a3, a0+a1, a2+a3)
+    __m128i v2 = _mm_hadd_epi32(v1, v1); // every element now holds a0+a1+a2+a3
+    return _mm_cvtsi128_si32(v2);        // read the total from element 0
   }
 }
 

From d57217017075d38c8f170fe7b141ea6d7f662c16 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 18 Jun 2017 00:10:20 +0100
Subject: [PATCH 080/177] Update for SciDAC

---
 lib/GridStd.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/GridStd.h b/lib/GridStd.h
index 959ba9ac..097e62ab 100644
--- a/lib/GridStd.h
+++ b/lib/GridStd.h
@@ -7,6 +7,7 @@
 #include <cassert>
 #include <complex>
 #include <vector>
+#include <string>
 #include <iostream>
 #include <iomanip>
 #include <random>

From 8e9be9f84f0aa38e94dfafa81d525526fbed9bc1 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 18 Jun 2017 00:10:42 +0100
Subject: [PATCH 081/177] Updates for SciDAC IO

---
 lib/parallelIO/BinaryIO.h | 135 +++++++++++++++++---------------------
 1 file changed, 59 insertions(+), 76 deletions(-)

diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h
index 7226ccba..117bec01 100644
--- a/lib/parallelIO/BinaryIO.h
+++ b/lib/parallelIO/BinaryIO.h
@@ -69,46 +69,6 @@ inline uint64_t Grid_ntohll(uint64_t A) {
 }
 #endif
 
-/////////////////////////////////////////////////////////////////////////////////
-// Simple classes for precision conversion
-/////////////////////////////////////////////////////////////////////////////////
-template <class fobj, class sobj>
-struct BinarySimpleUnmunger {
-  typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
-  typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;
-  
-  void operator()(sobj &in, fobj &out) {
-    // take word by word and transform accoding to the status
-    fobj_stype *out_buffer = (fobj_stype *)&out;
-    sobj_stype *in_buffer = (sobj_stype *)&in;
-    size_t fobj_words = sizeof(out) / sizeof(fobj_stype);
-    size_t sobj_words = sizeof(in) / sizeof(sobj_stype);
-    assert(fobj_words == sobj_words);
-    
-    for (unsigned int word = 0; word < sobj_words; word++)
-      out_buffer[word] = in_buffer[word];  // type conversion on the fly
-    
-  }
-};
-
-template <class fobj, class sobj>
-struct BinarySimpleMunger {
-  typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
-  typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;
-
-  void operator()(fobj &in, sobj &out) {
-    // take word by word and transform accoding to the status
-    fobj_stype *in_buffer = (fobj_stype *)&in;
-    sobj_stype *out_buffer = (sobj_stype *)&out;
-    size_t fobj_words = sizeof(in) / sizeof(fobj_stype);
-    size_t sobj_words = sizeof(out) / sizeof(sobj_stype);
-    assert(fobj_words == sobj_words);
-    
-    for (unsigned int word = 0; word < sobj_words; word++)
-      out_buffer[word] = in_buffer[word];  // type conversion on the fly
-    
-  }
-};
 // A little helper
 inline void removeWhitespace(std::string &key)
 {
@@ -126,11 +86,7 @@ class BinaryIO {
   // more byte manipulation helpers
   /////////////////////////////////////////////////////////////////////////////
 
-  template<class vobj> static inline void Uint32Checksum(Lattice<vobj> &lat,				      
-							 uint32_t &nersc_csum,
-							 uint32_t &scidac_csuma,
-							 uint32_t &scidac_csumb)
-
+  template<class vobj> static inline void Uint32Checksum(Lattice<vobj> &lat,uint32_t &nersc_csum)
   {
     typedef typename vobj::scalar_object sobj;
 
@@ -140,15 +96,38 @@ class BinaryIO {
     std::vector<sobj> scalardata(lsites); 
     unvectorizeToLexOrdArray(scalardata,lat);    
 
-    Uint32Checksum(grid,scalardata,nersc_csum,scidac_csuma,scidac_csumb);
+    NerscChecksum(grid,scalardata,nersc_csum);
   }
   
-  template<class fobj>
-    static inline void Uint32Checksum(GridBase *grid,
-				      std::vector<fobj> &fbuf,
-				      uint32_t &nersc_csum,
-				      uint32_t &scidac_csuma,
-				      uint32_t &scidac_csumb)
+  template<class fobj> static inline void NerscChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &nersc_csum)
+  {
+    const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
+    // Adds every 32-bit word of the first lsites entries of fbuf into nersc_csum.
+    // nersc_csum is accumulated into, not reset, so the caller's starting value is kept.
+    uint64_t lsites              =grid->lSites();
+    if (fbuf.size()==1) { // single-entry buffer: checksum just that one entry
+      lsites=1;
+    }
+
+#pragma omp parallel
+    { 
+      uint32_t nersc_csum_thr=0; // per-thread partial sum
+
+#pragma omp for
+      for(uint64_t local_site=0;local_site<lsites;local_site++){
+	uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
+	for(uint64_t j=0;j<size32;j++){
+	  nersc_csum_thr=nersc_csum_thr+site_buf[j];
+	}
+      }
+      // Fold each thread's partial sum into the shared result, one thread at a time.
+#pragma omp critical
+      {
+	nersc_csum  += nersc_csum_thr;
+      }
+    }
+  }
+  template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb)
   {
     const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
 
@@ -156,6 +135,9 @@ class BinaryIO {
     int nd = grid->_ndimension;
 
     uint64_t lsites              =grid->lSites();
+    if (fbuf.size()==1) {
+      lsites=1;
+    }
     std::vector<int> local_vol   =grid->LocalDimensions();
     std::vector<int> local_start =grid->LocalStarts();
     std::vector<int> global_vol  =grid->FullDimensions();
@@ -163,21 +145,15 @@ class BinaryIO {
 #pragma omp parallel
     { 
       std::vector<int> coor(nd);
-      uint32_t nersc_csum_thr=0;
       uint32_t scidac_csuma_thr=0;
       uint32_t scidac_csumb_thr=0;
       uint32_t site_crc=0;
-      uint32_t zcrc = crc32(0L, Z_NULL, 0);
 
 #pragma omp for
       for(uint64_t local_site=0;local_site<lsites;local_site++){
 
 	uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
 
-	for(uint64_t j=0;j<size32;j++){
-	  nersc_csum_thr=nersc_csum_thr+site_buf[j];
-	}
-
 	/* 
 	 * Scidac csum  is rather more heavyweight
 	 */
@@ -185,23 +161,24 @@ class BinaryIO {
 
 	Lexicographic::CoorFromIndex(coor,local_site,local_vol);
 
-	for(int d=0;d<nd;d++) 
+	for(int d=0;d<nd;d++) {
 	  coor[d] = coor[d]+local_start[d];
+	}
 
 	Lexicographic::IndexFromCoor(coor,global_site,global_vol);
 
 	uint32_t gsite29   = global_site%29;
 	uint32_t gsite31   = global_site%31;
-
-	site_crc = crc32(zcrc,(unsigned char *)site_buf,sizeof(fobj));
-
+	
+	site_crc = crc32(0,(unsigned char *)site_buf,sizeof(fobj));
+	//	std::cout << "Site "<<local_site << " crc "<<std::hex<<site_crc<<std::dec<<std::endl;
+	//	std::cout << "Site "<<local_site << std::hex<<site_buf[0] <<site_buf[1]<<std::dec <<std::endl;
 	scidac_csuma_thr ^= site_crc<<gsite29 | site_crc>>(32-gsite29);
 	scidac_csumb_thr ^= site_crc<<gsite31 | site_crc>>(32-gsite31);
       }
 
 #pragma omp critical
       {
-	nersc_csum  += nersc_csum_thr;
 	scidac_csuma^= scidac_csuma_thr;
 	scidac_csumb^= scidac_csumb_thr;
       }
@@ -386,7 +363,8 @@ class BinaryIO {
 	assert(0);
 #endif
       } else { 
-	std::cout<< GridLogMessage<< "C++ read I/O "<< file<< std::endl;
+	std::cout<< GridLogMessage<< "C++ read I/O "<< file<<" : "
+		 << iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
 	std::ifstream fin;
 	fin.open(file,std::ios::binary|std::ios::in);
 	if ( control & BINARYIO_MASTER_APPEND )  {
@@ -402,22 +380,24 @@ class BinaryIO {
       grid->Barrier();
 
       bstimer.Start();
+      ScidacChecksum(grid,iodata,scidac_csuma,scidac_csumb);
       if (ieee32big) be32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
       if (ieee32)    le32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
       if (ieee64big) be64toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
       if (ieee64)    le64toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
-      Uint32Checksum(grid,iodata,nersc_csum,scidac_csuma,scidac_csumb);
+      NerscChecksum(grid,iodata,nersc_csum);
       bstimer.Stop();
     }
     
     if ( control & BINARYIO_WRITE ) { 
 
       bstimer.Start();
-      Uint32Checksum(grid,iodata,nersc_csum,scidac_csuma,scidac_csumb);
+      NerscChecksum(grid,iodata,nersc_csum);
       if (ieee32big) htobe32_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
       if (ieee32)    htole32_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
       if (ieee64big) htobe64_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
       if (ieee64)    htole64_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
+      ScidacChecksum(grid,iodata,scidac_csuma,scidac_csumb);
       bstimer.Stop();
 
       grid->Barrier();
@@ -436,9 +416,9 @@ class BinaryIO {
 	assert(0);
 #endif
       } else { 
-	std::cout<< GridLogMessage<< "C++ write I/O "<< file<< std::endl;
-	std::ofstream fout;
-	fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
+	std::ofstream fout; fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
+	std::cout<< GridLogMessage<< "C++ write I/O "<< file<<" : "
+		 << iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
 	if ( control & BINARYIO_MASTER_APPEND )  {
 	  fout.seekp(0,fout.end);
 	} else {
@@ -467,9 +447,6 @@ class BinaryIO {
     grid->GlobalXOR(scidac_csuma);
     grid->GlobalXOR(scidac_csumb);
     grid->Barrier();
-    //    std::cout << "Binary IO NERSC  checksum  0x"<<std::hex<<nersc_csum  <<std::dec<<std::endl;
-    //    std::cout << "Binary IO SCIDAC checksuma 0x"<<std::hex<<scidac_csuma<<std::dec<<std::endl;
-    //    std::cout << "Binary IO SCIDAC checksumb 0x"<<std::hex<<scidac_csumb<<std::dec<<std::endl;
   }
 
   /////////////////////////////////////////////////////////////////////////////
@@ -603,9 +580,9 @@ class BinaryIO {
     scidac_csuma = scidac_csuma ^ scidac_csuma_tmp;
     scidac_csumb = scidac_csumb ^ scidac_csumb_tmp;
 
-    //    std::cout << GridLogMessage << "RNG file nersc_checksum   " << std::hex << nersc_csum << std::dec << std::endl;
-    //    std::cout << GridLogMessage << "RNG file scidac_checksuma " << std::hex << scidac_csuma << std::dec << std::endl;
-    //    std::cout << GridLogMessage << "RNG file scidac_checksumb " << std::hex << scidac_csumb << std::dec << std::endl;
+    std::cout << GridLogMessage << "RNG file nersc_checksum   " << std::hex << nersc_csum << std::dec << std::endl;
+    std::cout << GridLogMessage << "RNG file scidac_checksuma " << std::hex << scidac_csuma << std::dec << std::endl;
+    std::cout << GridLogMessage << "RNG file scidac_checksumb " << std::hex << scidac_csumb << std::dec << std::endl;
 
     std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl;
   }
@@ -658,8 +635,14 @@ class BinaryIO {
     }
     IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_MASTER_APPEND,
 	     nersc_csum_tmp,scidac_csuma_tmp,scidac_csumb_tmp);
+
+    nersc_csum   = nersc_csum   + nersc_csum_tmp;
+    scidac_csuma = scidac_csuma ^ scidac_csuma_tmp;
+    scidac_csumb = scidac_csumb ^ scidac_csumb_tmp;
     
-    //    std::cout << GridLogMessage << "RNG file checksum " << std::hex << csum << std::dec << std::endl;
+    std::cout << GridLogMessage << "RNG file checksum " << std::hex << nersc_csum    << std::dec << std::endl;
+    std::cout << GridLogMessage << "RNG file checksuma " << std::hex << scidac_csuma << std::dec << std::endl;
+    std::cout << GridLogMessage << "RNG file checksumb " << std::hex << scidac_csumb << std::dec << std::endl;
     std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl;
   }
 };

From 0ab555b4f502454598bc6e7acc3a66d46d9ded91 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 18 Jun 2017 00:11:02 +0100
Subject: [PATCH 082/177] SciDAC I/O and ILDG improvements

---
 lib/parallelIO/IldgIO.h      | 552 ++++++++++++++++++++++++++---------
 lib/parallelIO/IldgIOtypes.h | 149 ++++++++--
 2 files changed, 551 insertions(+), 150 deletions(-)

diff --git a/lib/parallelIO/IldgIO.h b/lib/parallelIO/IldgIO.h
index a6810b0d..9a1612d5 100644
--- a/lib/parallelIO/IldgIO.h
+++ b/lib/parallelIO/IldgIO.h
@@ -27,6 +27,7 @@ directory
 #ifndef GRID_ILDG_IO_H
 #define GRID_ILDG_IO_H
 
+#ifdef HAVE_LIME
 #include <algorithm>
 #include <fstream>
 #include <iomanip>
@@ -37,31 +38,153 @@ directory
 #include <sys/utsname.h>
 #include <unistd.h>
 
-#ifdef HAVE_LIME
-
+//Lime is a must have for this functionality
 extern "C" {  // for linkage
 #include "lime.h"
 }
 
-
-// Unused SCIDAC records names
-// SCIDAC_PRIVATE_FILE_XML   "scidac-private-file-xml"
-// SCIDAC_SITELIST           "scidac-sitelist"
-// SCIDAC_FILE_XML           "scidac-file-xml"
-// SCIDAC_RIVATE_RECORD_XML "scidac-private-record-xml"
-// SCIDAC_RECORD_XML         "scidac-record-xml"
-// SCIDAC_BINARY_DATA        "scidac-binary-data"
-//
-// Scidac checksum: CRC32 every site, xor reduce some hash of this.
-// https://github.com/usqcd-software/qio/blob/master/lib/dml/DML_utils.c
-
 namespace Grid {
 namespace QCD {
 
-class IldgIO : public BinaryIO {
+ template<class word> inline std::string ScidacWordMnemonic(void){ return std::string("unknown"); } // fallback for word types without a specialisation below
+ template<> inline std::string ScidacWordMnemonic<double>  (void){ return std::string("D"); }
+ template<> inline std::string ScidacWordMnemonic<float>   (void){ return std::string("F"); }
+ template<> inline std::string ScidacWordMnemonic< int32_t>(void){ return std::string("I32_t"); }
+ template<> inline std::string ScidacWordMnemonic<uint32_t>(void){ return std::string("U32_t"); }
+ template<> inline std::string ScidacWordMnemonic< int64_t>(void){ return std::string("I64_t"); }
+ template<> inline std::string ScidacWordMnemonic<uint64_t>(void){ return std::string("U64_t"); }
+
+ template<class vobj> std::string ScidacRecordTypeString(int &colors, int &spins, int & typesize,int &datacount) { 
+   // Returns a "GRID_..." type-name string for vobj; colors, spins, typesize and datacount are outputs.
+   typedef typename getPrecision<vobj>::real_scalar_type stype;
+
+   int _ColourN       = indexRank<ColourIndex,vobj>();
+   int _ColourScalar  =  isScalar<ColourIndex,vobj>();
+   int _ColourVector  =  isVector<ColourIndex,vobj>();
+   int _ColourMatrix  =  isMatrix<ColourIndex,vobj>();
+
+   int _SpinN       = indexRank<SpinIndex,vobj>();
+   int _SpinScalar  =  isScalar<SpinIndex,vobj>();
+   int _SpinVector  =  isVector<SpinIndex,vobj>();
+   int _SpinMatrix  =  isMatrix<SpinIndex,vobj>();
+
+   int _LorentzN       = indexRank<LorentzIndex,vobj>();
+   int _LorentzScalar  =  isScalar<LorentzIndex,vobj>();
+   int _LorentzVector  =  isVector<LorentzIndex,vobj>();
+   int _LorentzMatrix  =  isMatrix<LorentzIndex,vobj>();
+
+   std::stringstream stream;
+   // The name starts with GRID_ followed by the mnemonic of the real scalar word type.
+   stream << "GRID_";
+   stream << ScidacWordMnemonic<stype>();
+
+   //   std::cout << " Lorentz N/S/V/M : " << _LorentzN<<" "<<_LorentzScalar<<"/"<<_LorentzVector<<"/"<<_LorentzMatrix<<std::endl;
+   //   std::cout << " Spin    N/S/V/M : " << _SpinN   <<" "<<_SpinScalar   <<"/"<<_SpinVector   <<"/"<<_SpinMatrix<<std::endl;
+   //   std::cout << " Colour  N/S/V/M : " << _ColourN <<" "<<_ColourScalar <<"/"<<_ColourVector <<"/"<<_ColourMatrix<<std::endl;
+   // Suffixes are appended in Lorentz, spin, colour order; scalar indices contribute nothing.
+   if ( _LorentzVector )   stream << "_LorentzVector"<<_LorentzN;
+   if ( _LorentzMatrix )   stream << "_LorentzMatrix"<<_LorentzN;
+
+   if ( _SpinVector )   stream << "_SpinVector"<<_SpinN;
+   if ( _SpinMatrix )   stream << "_SpinMatrix"<<_SpinN;
+
+   if ( _ColourVector )   stream << "_ColourVector"<<_ColourN;
+   if ( _ColourMatrix )   stream << "_ColourMatrix"<<_ColourN;
+
+   if ( _ColourScalar && _LorentzScalar && _SpinScalar )   stream << "_Complex";
+   // typesize: sizeof(scalar_type) times the colour and spin extents (squared for matrices).
+   // The Lorentz extent is not folded into typesize; it is reported through datacount.
+   typesize = sizeof(typename vobj::scalar_type);
+
+   if ( _ColourMatrix ) typesize*= _ColourN*_ColourN;
+   else                 typesize*= _ColourN;
+
+   if ( _SpinMatrix )   typesize*= _SpinN*_SpinN;
+   else                 typesize*= _SpinN;
+
+   colors    = _ColourN;
+   spins     = _SpinN;
+   datacount = _LorentzN;
+
+   return stream.str();
+ }
+ 
+ template<class vobj> std::string ScidacRecordTypeString(Lattice<vobj> & lat,int &colors, int &spins, int & typesize,int &datacount) { 
+   return ScidacRecordTypeString<vobj>(colors,spins,typesize,datacount); // lat only serves to deduce vobj; it is not read
+ }
+
+ template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
+					  FieldMetaData &header,
+					  scidacRecord & _scidacRecord,
+					  scidacFile   & _scidacFile) 
+ {
+   typedef typename getPrecision<vobj>::real_scalar_type stype;
+   // header is handed to PrepareMetaData; _scidacFile and _scidacRecord are assigned below.
+   /////////////////////////////////////
+   // Pull Grid's metadata
+   /////////////////////////////////////
+   PrepareMetaData(field,header);
+
+   /////////////////////////////////////
+   // Scidac Private File structure
+   /////////////////////////////////////
+   _scidacFile              = scidacFile(field._grid);
+
+   /////////////////////////////////////
+   // Scidac Private Record structure
+   /////////////////////////////////////
+   scidacRecord sr;
+   sr.datatype   = ScidacRecordTypeString(field,sr.colors,sr.spins,sr.typesize,sr.datacount);
+   sr.date       = header.creation_date;
+   sr.precision  = ScidacWordMnemonic<stype>();
+   sr.recordtype = GRID_IO_FIELD;
+   // Hand the completed record to the caller in one assignment.
+   _scidacRecord = sr;
+
+   std::cout << GridLogMessage << "Build SciDAC datatype " <<sr.datatype<<std::endl;
+ }
+ 
+ ///////////////////////////////////////////////////////
+ // Scidac checksum
+ ///////////////////////////////////////////////////////
+ static int scidacChecksumVerify(scidacChecksum &scidacChecksum_,uint32_t scidac_csuma,uint32_t scidac_csumb)
+ {
+   // The stored sums are hexadecimal strings: parse both, then return 1 only when each equals its computed value.
+   const uint32_t stored_a = stoull(scidacChecksum_.suma,0,16);
+   const uint32_t stored_b = stoull(scidacChecksum_.sumb,0,16);
+   const bool both_match = (scidac_csuma == stored_a) && (scidac_csumb == stored_b);
+   return both_match ? 1 : 0;
+ }
+
+////////////////////////////////////////////////////////////////////////////////////
+// Lime, ILDG and Scidac I/O classes
+////////////////////////////////////////////////////////////////////////////////////
+class LimeIO : public BinaryIO {
  public:
 
-  static int createHeader(std::string message, int MB, int ME, size_t PayloadSize, LimeWriter* L)
+   ///////////////////////////////////////////////////
+   // FIXME: format for RNG? Now just binary out instead
+   // FIXME: Make interface able to write multiple records
+   // FIXME: Split into LimeReader and LimeWriter
+   ///////////////////////////////////////////////////
+   /*
+   FILE * File;
+   LimeWriter LimeW;
+   LimeReader LimeR;
+   template<class serialisable_object>
+   int readObject(serialisable_object &object,std::string object_name,std::string record_name)
+
+  int createLimeRecordHeader(std::string message, int MB, int ME, size_t PayloadSize);
+  template<class serialisable_object>
+  int writeObject(int MB,int ME,serialisable_object &object,std::string object_name,std::string record_name)
+  template<class vobj>
+  int writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string filename,std::string record_name)
+   */
+  ///////////////////////////////////////////////////////
+  // Lime utility functions
+  ///////////////////////////////////////////////////////
+
+  static int createLimeRecordHeader(std::string message, int MB, int ME, size_t PayloadSize, LimeWriter* L)
   {
     LimeRecordHeader *h;
     h = limeCreateHeader(MB, ME, const_cast<char *>(message.c_str()), PayloadSize);
@@ -70,6 +193,9 @@ class IldgIO : public BinaryIO {
     return LIME_SUCCESS;
   }
 
+  ////////////////////////////////////////////
+  // Write a generic serialisable object
+  ////////////////////////////////////////////
   template<class serialisable_object>
   static void writeLimeObject(int MB,int ME,serialisable_object &object,std::string object_name,std::string record_name, LimeWriter *LimeW)
   {
@@ -81,24 +207,232 @@ class IldgIO : public BinaryIO {
     }
     uint64_t nbytes = xmlstring.size();
     LimeRecordHeader *h = limeCreateHeader(MB, ME,(char *)record_name.c_str(), nbytes);
-    assert(limeWriteRecordHeader(h, LimeW)>=0);
-    assert(limeWriteRecordData(&xmlstring[0], &nbytes, LimeW)>=0);
-    limeWriterCloseRecord(LimeW);
+    int err=limeWriteRecordHeader(h, LimeW); assert(err>=0);
+    err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0);
+    err=limeWriterCloseRecord(LimeW);  assert(err>=0);
     limeDestroyHeader(h);
   }
+  ////////////////////////////////////////////
+  // Read a generic serialisable object from the first record whose type begins with record_name
+  ////////////////////////////////////////////
+  template<class serialisable_object>
+  static void readLimeObject(serialisable_object &object,std::string object_name,std::string record_name, LimeReader *LimeR)
+  {
+    // Records are consumed from the reader's current position; asserts if no record matches.
+    // should this be a do while; can we miss a first record??
+    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
 
-  static unsigned int writeHeader(FieldMetaData &header, LimeWriter *LimeW) {
+      uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration)
+      // strncmp yields 0 on equality, so the match test has to be negated
+      if ( !strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) )  ) {
+	std::vector<char> xmlc(nbytes+1,'\0'); // one extra byte keeps the buffer NUL-terminated
+	limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);    
+	XmlReader RD(&xmlc[0],"");
+	read(RD,object_name,object);
+	return;
+      }
+
+    }  
+    assert(0); // ran out of records without finding record_name
+  }
+
+  ////////////////////////////////////////////
+  // Read a generic lattice field and verify checksum
+  ////////////////////////////////////////////
+  template<class vobj>
+  static void readLimeLatticeBinaryObject(Lattice<vobj> &field,std::string filename,std::string record_name,FILE *File, LimeReader *LimeR)
+  {
+    typedef typename vobj::scalar_object sobj;
+    scidacChecksum scidacChecksum_;
+    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
+
+    std::string format = getFormatString<vobj>();
+
+    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
+
+      std::cout << GridLogMessage << limeReaderType(LimeR) <<std::endl;
+	
+      if ( !strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) )  ) {
+	// Matching record found: filename and File's current offset are handed to BinaryIO for the payload.
+	// Template arguments mirror the <vobj,sobj> pair used by writeLimeLatticeBinaryObject.
+	off_t offset= ftell(File);
+	BinarySimpleMunger<sobj,sobj> munge;
+	BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
+
+	/////////////////////////////////////////////
+	// Read the checksum record that follows (first record of type SCIDAC_CHECKSUM)
+	/////////////////////////////////////////////
+	readLimeObject(scidacChecksum_,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM),LimeR);
+
+	/////////////////////////////////////////////
+	// Verify checksums; a mismatch is fatal rather than silently ignored
+	/////////////////////////////////////////////
+	int ok=scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb); assert(ok==1);
+	return;
+      }
+    } // NOTE(review): falls out silently, leaving field unread, when no record matches record_name
+  }
+
+  ////////////////////////////////////////////
+  // Write a generic lattice field and csum
+  ////////////////////////////////////////////
+  template<class vobj>
+  static void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string filename,std::string record_name,FILE *File, LimeWriter *LimeW)
+  {
+    // Emits two records: the binary payload under record_name, then a SCIDAC_CHECKSUM record.
+    ////////////////////////////////////////////
+    // Create record header
+    ////////////////////////////////////////////
+    typedef typename vobj::scalar_object sobj;
+    int err;
+    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
+    uint64_t PayloadSize = sizeof(sobj) * field._grid->_gsites;
+    createLimeRecordHeader(record_name, 0, 0, PayloadSize, LimeW);
+
+    ////////////////////////////////////////////////////////////////////
+    // NB: FILE and iostream are jointly writing disjoint sequences in
+    // the same file through different file handles (integer units).
+    // 
+    // These are both buffered, so why I think this code is right is as follows.
+    //
+    // i)  write record header to FILE *File, telegraphing the size. 
+    // ii) ftell reads the offset from FILE *File .
+    // iii) iostream / MPI Open independently seek this offset. Write sequence direct to disk.
+    //      Closes iostream and flushes.
+    // iv) fseek on FILE * to end of this disjoint section. NOTE(review): no fseek is issued in this function -- confirm where it happens.
+    //  v) Continue writing scidac record.
+    ////////////////////////////////////////////////////////////////////
+    off_t offset = ftell(File);
+    std::string format = getFormatString<vobj>();
+    BinarySimpleMunger<sobj,sobj> munge;
+    BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
+    err=limeWriterCloseRecord(LimeW);  assert(err>=0);
+    ////////////////////////////////////////
+    // Write checksum element, propagating forward from the BinaryIO
+    // Always pair a checksum with a binary object, and close message
+    ////////////////////////////////////////
+    scidacChecksum checksum;
+    std::stringstream streama; streama << std::hex << scidac_csuma;
+    std::stringstream streamb; streamb << std::hex << scidac_csumb;
+    checksum.suma= streama.str();
+    checksum.sumb= streamb.str();
+    std::cout << GridLogMessage<<" writing scidac checksums "<<std::hex<<scidac_csuma<<"/"<<scidac_csumb<<std::dec<<std::endl;
+    writeLimeObject(0,1,checksum,std::string("scidacChecksum"    ),std::string(SCIDAC_CHECKSUM),LimeW);
+  }
+  // Could end the LIME base class here
+};
+
+class ScidacIO : public LimeIO {
+ public:
+   /*
+    LimeWriter *LimeW;
+    LimeReader *LimeR;
+    FILE *File;
+  template<class userFile>
+  int open(std::string filename,GridBase *grid,userFile &_userFile,int volfmt) {
+ 
+  }
+  void close(void) {
+ 
+  }
+  template<class vobj,class userRecord>
+  int writeScidacField(Lattice<vobj> &field,userRecord &_userRecord,int volfmt) 
+  template<class vobj,class userRecord>
+  int  readScidacField(Lattice<vobj> &field,userRecord &_userRecord,int volfmt) 
+   */
+  ////////////////////////////////////////////////
+  // Write generic lattice field in scidac format
+  ////////////////////////////////////////////////
+  template <class vobj,class userFile, class userRecord>
+  static void writeScidacField(std::string filename,Lattice<vobj> &field,userFile _userFile,userRecord _userRecord) 
+  {
+    // Creates (or truncates) filename and writes one message: the Grid header, the user and
+    // private file/record XML records, then the binary field data followed by its checksum.
+    // Asserts if the file cannot be opened or the Lime writer cannot be created.
+
+    ////////////////////////////////////////
+    // fill the Grid header
+    ////////////////////////////////////////
+    FieldMetaData header;
+    scidacRecord  _scidacRecord;
+    scidacFile    _scidacFile;
+
+    ScidacMetaData(field,header,_scidacRecord,_scidacFile);
+
+    //////////////////////////////////////////////
+    // Fill the Lime file record by record
+    //////////////////////////////////////////////
+    FILE *File = fopen(filename.c_str(), "w"); assert(File != NULL); // fail before handing a null FILE to Lime
+    LimeWriter *LimeW = limeCreateWriter(File);
+    assert(LimeW != NULL );
+
+    writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT),LimeW); // Open message 
+    writeLimeObject(0,0,_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML),LimeW);
+    writeLimeObject(0,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML),LimeW);
+    writeLimeObject(0,0,_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML),LimeW);
+    writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML),LimeW);
+    writeLimeLatticeBinaryObject(field,filename,std::string(ILDG_BINARY_DATA),File,LimeW);      // Closes message with checksum
+
+    limeDestroyWriter(LimeW);
+    fclose(File);
+  }
+};
+
+class IldgIO : public ScidacIO {
+ public:
+
+  ///////////////////////////////////
+  // A little helper
+  ///////////////////////////////////
+  static void writeLimeIldgLFN(std::string &LFN,LimeWriter *LimeW)
+  {
+    uint64_t PayloadSize = LFN.size();
+    int err;
+    createLimeRecordHeader(ILDG_DATA_LFN, 0 , 0, PayloadSize, LimeW);
+    err=limeWriteRecordData(const_cast<char*>(LFN.c_str()), &PayloadSize, LimeW); assert(err>=0);
+    err=limeWriterCloseRecord(LimeW); assert(err>=0);
+  }
+
+  ////////////////////////////////////////////////////////////////
+  // Special ILDG operations ; gauge configs only.
+  // Don't require scidac records EXCEPT checksum
+  // Use Grid MetaData object if present.
+  ////////////////////////////////////////////////////////////////
+  template <class vsimd>
+  static void writeConfiguration(std::string filename,Lattice<iLorentzColourMatrix<vsimd> > &Umu) 
+  {
+    GridBase * grid = Umu._grid;
+    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
+    typedef iLorentzColourMatrix<vsimd> vobj;
+    typedef typename vobj::scalar_object sobj;
 
     uint64_t nbytes;
 
-    ildgFormat ildgfmt ;
-    usqcdInfo info;
+    ////////////////////////////////////////
+    // fill the Grid header
+    ////////////////////////////////////////
+    FieldMetaData header;
+    scidacRecord  _scidacRecord;
+    scidacFile    _scidacFile;
+
+    ScidacMetaData(Umu,header,_scidacRecord,_scidacFile);
+
+    std::string format = header.floating_point;
+
+    assert ( (format == std::string("IEEE32BIG"))  
+           ||(format == std::string("IEEE64BIG")) );
 
     //////////////////////////////////////////////////////
     // Fill ILDG header data struct
     //////////////////////////////////////////////////////
+    ildgFormat ildgfmt ;
     ildgfmt.field     = std::string("su3gauge");
-    ildgfmt.precision = 64;
+
+    if ( format == std::string("IEEE32BIG") ) { 
+      ildgfmt.precision = 32;
+    } else { 
+      ildgfmt.precision = 64;
+    }
     ildgfmt.version = 1.0;
     ildgfmt.lx = header.dimension[0];
     ildgfmt.ly = header.dimension[1];
@@ -107,108 +441,59 @@ class IldgIO : public BinaryIO {
     assert(header.nd==4);
     assert(header.nd==header.dimension.size());
 
+    //////////////////////////////////////////////////////////////////////////////
+    // Fill the USQCD info field
+    //////////////////////////////////////////////////////////////////////////////
+    usqcdInfo info;
     info.version=1.0;
     info.plaq   = header.plaquette;
     info.linktr = header.link_trace;
 
-    // Following scidac file downloaded from NERSC under MILC
-    // Begin message, keep open on successive records
-    //Message 1
-    // Type:           scidac-private-file-xml <scidacFile><version>1.1</version><spacetime>4</spacetime><dims>16 16 16 48 </dims><volfmt>0</volfmt></scidacFile>
-    // Type:           scidac-file-xml         <title>MILC ILDG archival gauge configuration</title>
-    //Message 2
-    // Type:           scidac-private-record-xml <scidacRecord><version>1.0</version><date>Thu May 11 00:11:33 2006 UTC</date><globaldata>0</globaldata>
-    //                    <datatype>QDP_F3_ColorMatrix</datatype><precision>F</precision><colors>3</colors><typesize>72</typesize><datacount>4</datacount></scidacRecord>
-    // Type:           scidac-record-xml 
-    // Type:           ildg-format
-    // Type:           ildg-data-lfn
-    // Type:           ildg-binary-data
-    // Type:           scidac-checksum
-
-    writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT),LimeW);
-    writeLimeObject(0,0,info   ,std::string("usqcdInfo"    ),std::string(USQCD_INFO ),LimeW);
-    writeLimeObject(0,0,ildgfmt,std::string("ildgFormat")   ,std::string(ILDG_FORMAT),LimeW);
-    // LFN is not a serializable object
-    {
-      std::string LFN = header.ildg_lfn; 
-      uint64_t PayloadSize = LFN.size();
-      createHeader(ILDG_DATA_LFN, 0 , 0, PayloadSize, LimeW);
-      limeWriteRecordData(const_cast<char*>(LFN.c_str()), &PayloadSize, LimeW);
-      limeWriterCloseRecord(LimeW);
-    }
-    return 0;
-  }
-
-  template <class vsimd>
-  static void writeConfiguration(std::string filename,Lattice<iLorentzColourMatrix<vsimd> > &Umu, std::string format) {
+    std::cout << GridLogMessage << " Writing config; IldgIO "<<std::endl;
+    //////////////////////////////////////////////
+    // Fill the Lime file record by record
+    //////////////////////////////////////////////
 
     FILE *File = fopen(filename.c_str(), "w");
-    LimeWriter *LimeW = limeCreateWriter(File);
-
-    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
-    typedef iLorentzColourMatrix<vsimd> vobj;
-    typedef typename vobj::scalar_object sobj;
-    typedef LorentzColourMatrixD fobj;
-
-    GridBase * grid = Umu._grid;
-
-    ////////////////////////////////////////
-    // fill the headers
-    ////////////////////////////////////////
-    FieldMetaData header;
-
-    GridMetaData(grid,header); 
-    GaugeStatistics<GaugeField>(Umu,header);
-    MachineCharacteristics(header);
-
-    assert( (format=="IEEE64BIG") || (format=="IEEE32BIG"));
-    header.floating_point = format;
-    header.checksum = 0x0; // unused in ILDG
-    writeHeader(header,LimeW);
-
-    ////////////////////////////////////////
-    // Write data record header
-    ////////////////////////////////////////
-    uint64_t PayloadSize = sizeof(fobj) * Umu._grid->_gsites;
-    createHeader(ILDG_BINARY_DATA, 0, 0, PayloadSize, LimeW);
-    
-    off_t offset = ftell(File);
-    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
-    GaugeSimpleMunger<sobj, fobj> munge;
-    BinaryIO::writeLatticeObject<vobj, fobj >(Umu, filename, munge, offset, header.floating_point,
-					      nersc_csum,scidac_csuma,scidac_csumb);
-    limeWriterCloseRecord(LimeW);
-
-    ////////////////////////////////////////
-    // Write checksum element, propagaing forward from the BinaryIO
-    ////////////////////////////////////////
-    scidacChecksum checksum;
-    checksum.suma= scidac_csuma;
-    checksum.sumb= scidac_csumb;
-    //    std::cout << " writing scidac checksums "<<std::hex<<scidac_csuma<<"/"<<scidac_csumb<<std::dec<<std::endl;
-    writeLimeObject(0,1,checksum,std::string("scidacChecksum"    ),std::string(SCIDAC_CHECKSUM),LimeW);
-
+    LimeWriter *LimeW = limeCreateWriter(File); assert(LimeW != NULL);
+    writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT),LimeW); // Open message 
+    writeLimeObject(0,0,info,info.SerialisableClassName(),std::string(SCIDAC_FILE_XML),LimeW);
+    writeLimeObject(0,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML),LimeW);
+    writeLimeObject(0,0,info,info.SerialisableClassName(),std::string(SCIDAC_RECORD_XML),LimeW);
+    writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML),LimeW);
+    writeLimeObject(0,0,ildgfmt,std::string("ildgFormat")   ,std::string(ILDG_FORMAT),LimeW); // rec
+    writeLimeIldgLFN(header.ildg_lfn, LimeW);                                                 // rec
+    writeLimeLatticeBinaryObject(Umu,filename,std::string(ILDG_BINARY_DATA),File,LimeW);      // Closes message with checksum
+    limeDestroyWriter(LimeW);
     fclose(File);
   }
 
+  ////////////////////////////////////////////////////////////////
+  // Read either Grid/SciDAC/ILDG configuration
+  // Don't require scidac records EXCEPT checksum
+  // Use Grid MetaData object if present.
+  // Else use ILDG MetaData object if present.
+  // Else use SciDAC MetaData object if present.
+  ////////////////////////////////////////////////////////////////
   template <class vsimd>
   static void readConfiguration(std::string filename,Lattice<iLorentzColourMatrix<vsimd> > &Umu, FieldMetaData &FieldMetaData_) {
 
     typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
-    typedef LorentzColourMatrixD sobjd;
-    typedef LorentzColourMatrixF sobjf;
-    typedef iLorentzColourMatrix<vsimd> itype;
-    typedef LorentzColourMatrix sobj;
+    typedef typename GaugeField::vector_object  vobj;
+    typedef typename vobj::scalar_object sobj;
+
+    typedef LorentzColourMatrixF fobj;
+    typedef LorentzColourMatrixD dobj;
 
     GridBase *grid = Umu._grid;
 
     std::vector<int> dims = Umu._grid->FullDimensions();
+
     assert(dims.size()==4);
 
     FILE *File = fopen(filename.c_str(), "r");
     LimeReader *LimeR = limeCreateReader(File);
 
-
     // Metadata holders
     ildgFormat     ildgFormat_    ;
     std::string    ildgLFN_       ;
@@ -263,8 +548,6 @@ class IldgIO : public BinaryIO {
 	  if ( ildgFormat_.precision == 64 ) format = std::string("IEEE64BIG");
 	  if ( ildgFormat_.precision == 32 ) format = std::string("IEEE32BIG");
 
-	  //	  std::cout << "This is an ILDG format record : "<<format<<std::endl;
-
 	  assert( ildgFormat_.lx == dims[0]);
 	  assert( ildgFormat_.ly == dims[1]);
 	  assert( ildgFormat_.lz == dims[2]);
@@ -275,7 +558,6 @@ class IldgIO : public BinaryIO {
 
 	if ( !strncmp(limeReaderType(LimeR), ILDG_DATA_LFN,strlen(ILDG_DATA_LFN)) ) {
 	  FieldMetaData_.ildg_lfn = std::string(&xmlc[0]);
-	  //	  std::cout << "ILDG logical file name "<< FieldMetaData_.ildg_lfn << std::endl;
 	  found_ildgLFN = 1;
 	}
 
@@ -284,8 +566,6 @@ class IldgIO : public BinaryIO {
 	  XmlReader RD(&xmlc[0],"");
 	  read(RD,"FieldMetaData",FieldMetaData_);
 
-	  //	  std::cout << "Grid header found : format is "<<FieldMetaData_.floating_point<<std::endl;
-
 	  format = FieldMetaData_.floating_point;
 
 	  assert(FieldMetaData_.dimension[0] == dims[0]);
@@ -296,19 +576,15 @@ class IldgIO : public BinaryIO {
 	  found_FieldMetaData = 1;
 	}
 
-	if ( !strncmp(limeReaderType(LimeR), USQCD_INFO,strlen(USQCD_INFO)) ) { 
+	if ( !strncmp(limeReaderType(LimeR), SCIDAC_RECORD_XML,strlen(SCIDAC_RECORD_XML)) ) { 
 	  XmlReader RD(&xmlc[0],"");
-	  read(RD,USQCD_INFO,usqcdInfo_);
-	  //	  std::cout << "USQCD info record found " <<std::endl;
+	  read(RD,"usqcdInfo",usqcdInfo_);
 	  found_usqcdInfo = 1;
 	}
 
 	if ( !strncmp(limeReaderType(LimeR), SCIDAC_CHECKSUM,strlen(SCIDAC_CHECKSUM)) ) { 
 	  XmlReader RD(&xmlc[0],"");
 	  read(RD,"scidacChecksum",scidacChecksum_);
-	  FieldMetaData_.scidac_checksuma = scidacChecksum_.suma;
-	  FieldMetaData_.scidac_checksumb = scidacChecksum_.sumb;
-	  //std::cout << " Read Out "<<scidacChecksum_.version<<"/"<< scidacChecksum_.suma<<"/"<<scidacChecksum_.sumb<<std::endl;
 	  found_scidacChecksum = 1;
 	}
 
@@ -316,11 +592,17 @@ class IldgIO : public BinaryIO {
 	/////////////////////////////////
 	// Binary data
 	/////////////////////////////////
-	std::cout << GridLogMessage << ILDG_BINARY_DATA << std::endl;
+	std::cout << GridLogMessage << "ILDG Binary record found : "  ILDG_BINARY_DATA << std::endl;
 	off_t offset= ftell(File);
-	GaugeSimpleMunger<sobjd, sobj> munge;
-	BinaryIO::readLatticeObject< itype, sobjd >(Umu, filename, munge, offset, format,
-						    nersc_csum,scidac_csuma,scidac_csumb);
+
+	if ( format == std::string("IEEE64BIG") ) {
+	  GaugeSimpleMunger<dobj, sobj> munge;
+	  BinaryIO::readLatticeObject< vobj, dobj >(Umu, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
+	} else { 
+	  GaugeSimpleMunger<fobj, sobj> munge;
+	  BinaryIO::readLatticeObject< vobj, fobj >(Umu, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
+	}
+
 	found_ildgBinary = 1;
       }
 
@@ -328,8 +610,10 @@ class IldgIO : public BinaryIO {
 
     //////////////////////////////////////////////////////
     // Minimally must find binary segment and checksum
+    // Since this is an ILDG reader require ILDG format
     //////////////////////////////////////////////////////
     assert(found_ildgBinary);
+    assert(found_ildgFormat);
     assert(found_scidacChecksum);
 
     // Must find something with the lattice dimensions
@@ -337,9 +621,7 @@ class IldgIO : public BinaryIO {
 
     if ( found_FieldMetaData ) {
 
-      std::cout << GridLogMessage<<"a Grid MetaData was record found: configuration was probably written by Grid ! Yay ! "<<std::endl;
-      //      std::cout << "Read Grid Plaqette  "<<FieldMetaData_.plaquette<<std::endl;
-      //      std::cout << "Read Grid LinkTrace "<<FieldMetaData_.link_trace<<std::endl;
+      std::cout << GridLogMessage<<"Grid MetaData was record found: configuration was probably written by Grid ! Yay ! "<<std::endl;
 
     } else { 
 
@@ -354,8 +636,8 @@ class IldgIO : public BinaryIO {
       FieldMetaData_.hdr_version = vers.str();
       FieldMetaData_.data_type = std::string("4D_SU3_GAUGE_3X3");
 
-      assert(FieldMetaData_.nd==4);
-      assert(FieldMetaData_.dimension.size()==4);
+      FieldMetaData_.nd=4;
+      FieldMetaData_.dimension.resize(4);
 
       FieldMetaData_.dimension[0] = ildgFormat_.lx ;
       FieldMetaData_.dimension[1] = ildgFormat_.ly ;
@@ -365,38 +647,42 @@ class IldgIO : public BinaryIO {
       if ( found_usqcdInfo ) { 
 	FieldMetaData_.plaquette = usqcdInfo_.plaq;
 	FieldMetaData_.link_trace= usqcdInfo_.linktr;
-	//	std::cout << "This configuration was probably written by USQCD and not Grid "<<std::endl;
-	//	std::cout << "Read USQCD Plaquette  "<<FieldMetaData_.plaquette<<std::endl;
-	//	std::cout << "Read USQCD LinkTrace  "<<FieldMetaData_.link_trace<<std::endl;
+	std::cout << GridLogMessage <<"This configuration was probably written by USQCD "<<std::endl;
+	std::cout << GridLogMessage <<"USQCD xml record Plaquette : "<<FieldMetaData_.plaquette<<std::endl;
+	std::cout << GridLogMessage <<"USQCD xml record LinkTrace : "<<FieldMetaData_.link_trace<<std::endl;
       } else { 
 	FieldMetaData_.plaquette = 0.0;
 	FieldMetaData_.link_trace= 0.0;
-	std::cout << "Uhoh... This configuration is unsafe and contains no recognised checksum or physics records that can verify it !!! "<<std::endl;
+	std::cout << GridLogWarning << "This configuration is unsafe with no plaquette records that can verify it !!! "<<std::endl;
       }
     }
 
+    ////////////////////////////////////////////////////////////
+    // Really really want to mandate a scidac checksum
+    ////////////////////////////////////////////////////////////
     if ( found_scidacChecksum ) {
+      FieldMetaData_.scidac_checksuma = stoull(scidacChecksum_.suma,0,16);
+      FieldMetaData_.scidac_checksumb = stoull(scidacChecksum_.sumb,0,16);
+      scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb);
       assert( scidac_csuma ==FieldMetaData_.scidac_checksuma);
       assert( scidac_csumb ==FieldMetaData_.scidac_checksumb);
       std::cout << GridLogMessage<<"SciDAC checksums match " << std::endl;
     } else { 
       std::cout << GridLogWarning<<"SciDAC checksums not found. This is unsafe. " << std::endl;
-      //Could choose to fail ?
+      assert(0); // Can I insist always checksum ?
     }
 
     if ( found_FieldMetaData || found_usqcdInfo ) {
       FieldMetaData checker;
-      GaugeStatistics<GaugeField>(Umu,checker);
+      GaugeStatistics(Umu,checker);
       assert(fabs(checker.plaquette  - FieldMetaData_.plaquette )<1.0e-5);
       assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
       std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl;
     }
   }
+ };
 
-  // format for RNG? Now just binary out
-};
-}
-}
+}}
 
 //HAVE_LIME
 #endif
diff --git a/lib/parallelIO/IldgIOtypes.h b/lib/parallelIO/IldgIOtypes.h
index 8e1316eb..c3a5321c 100644
--- a/lib/parallelIO/IldgIOtypes.h
+++ b/lib/parallelIO/IldgIOtypes.h
@@ -34,16 +34,110 @@ extern "C" { // for linkage
 
 namespace Grid {
 
+/////////////////////////////////////////////////////////////////////////////////
+// Data representation of records that enter ILDG and SciDac formats
+/////////////////////////////////////////////////////////////////////////////////
+
 #define GRID_FORMAT      "grid-format"
 #define ILDG_FORMAT      "ildg-format"
 #define ILDG_BINARY_DATA "ildg-binary-data"
 #define ILDG_DATA_LFN    "ildg-data-lfn"
-#define USQCD_INFO       "usqcdInfo"
-#define SCIDAC_CHECKSUM  "scidac-checksum"
+#define SCIDAC_CHECKSUM           "scidac-checksum"
+#define SCIDAC_PRIVATE_FILE_XML   "scidac-private-file-xml"
+#define SCIDAC_FILE_XML           "scidac-file-xml"
+#define SCIDAC_PRIVATE_RECORD_XML "scidac-private-record-xml"
+#define SCIDAC_RECORD_XML         "scidac-record-xml"
+#define SCIDAC_BINARY_DATA        "scidac-binary-data"
+// Unused SCIDAC records names; could move to support this functionality
+#define SCIDAC_SITELIST           "scidac-sitelist"
+
+  ////////////////////////////////////////////////////////////
+  const int GRID_IO_SINGLEFILE = 0; // hardcode lift from QIO compat
+  const int GRID_IO_MULTIFILE  = 1; // hardcode lift from QIO compat
+  const int GRID_IO_FIELD      = 0; // hardcode lift from QIO compat
+  const int GRID_IO_GLOBAL     = 1; // hardcode lift from QIO compat
+  ////////////////////////////////////////////////////////////
 
 /////////////////////////////////////////////////////////////////////////////////
-// Data representation of records that enter ILDG and SciDac formats
+// QIO uses mandatory "private" records with a fixed format.
+// Private is in principle "opaque"; however, it can't be changed now because that would break existing
+// file compatibility, so it should be correct to assume the undocumented but de facto file structure.
 /////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////
+// Scidac private file xml
+// <?xml version="1.0" encoding="UTF-8"?><scidacFile><version>1.1</version><spacetime>4</spacetime><dims>16 16 16 32 </dims><volfmt>0</volfmt></scidacFile>
+////////////////////////
+struct scidacFile : Serializable {
+ public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(scidacFile,
+                                  double, version,
+                                  int, spacetime,
+				  std::string, dims, // must convert to int
+                                  int, volfmt);
+
+  std::vector<int> getDimensions(void) { 
+    std::stringstream stream(dims);
+    std::vector<int> dimensions;
+    int n;
+    while(stream >> n){
+      dimensions.push_back(n);
+    }
+    return dimensions;
+  }
+
+  void setDimensions(std::vector<int> dimensions) { 
+    char delimiter = ' ';
+    std::stringstream stream;
+    for(int i=0;i<dimensions.size();i++){ 
+      stream << dimensions[i];
+      if ( i != dimensions.size()-1) { 
+	stream << delimiter <<std::endl;
+      }
+    }
+    dims = stream.str();
+  }
+
+  // Constructor provides Grid
+  scidacFile() =default; // default constructor
+  scidacFile(GridBase * grid){
+    version      = 1.0;
+    spacetime    = grid->_ndimension;
+    setDimensions(grid->FullDimensions()); 
+    volfmt       = GRID_IO_SINGLEFILE;
+  }
+
+};
+
+///////////////////////////////////////////////////////////////////////
+// scidac-private-record-xml : example
+// <scidacRecord>
+// <version>1.1</version><date>Tue Jul 26 21:14:44 2011 UTC</date><recordtype>0</recordtype>
+// <datatype>QDP_D3_ColorMatrix</datatype><precision>D</precision><colors>3</colors><spins>4</spins>
+// <typesize>144</typesize><datacount>4</datacount>
+// </scidacRecord>
+///////////////////////////////////////////////////////////////////////
+
+struct scidacRecord : Serializable {
+ public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(scidacRecord,
+                                  double, version,
+                                  std::string, date,
+				  int, recordtype,
+				  std::string, datatype,
+				  std::string, precision,
+				  int, colors,
+				  int, spins,
+				  int, typesize,
+				  int, datacount);
+
+  scidacRecord() { version =1.0; }
+
+};
+
+////////////////////////
+// ILDG format
+////////////////////////
 struct ildgFormat : Serializable {
 public:
   GRID_SERIALIZABLE_CLASS_MEMBERS(ildgFormat,
@@ -54,10 +148,11 @@ public:
 				  int, ly,
 				  int, lz,
 				  int, lt);
-  ildgFormat() { 
-    version=1.0; 
-  };
+  ildgFormat() { version=1.0; };
 };
+////////////////////////
+// USQCD info
+////////////////////////
 struct usqcdInfo : Serializable { 
  public:
   GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdInfo,
@@ -69,7 +164,36 @@ struct usqcdInfo : Serializable {
     version=1.0; 
   };
 };
+////////////////////////
+// Scidac Checksum
+////////////////////////
+struct scidacChecksum : Serializable { 
+ public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(scidacChecksum,
+				  double, version,
+				  std::string, suma,
+				  std::string, sumb);
+  scidacChecksum() { 
+    version=1.0; 
+  };
+};
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Type:           scidac-file-xml         <title>MILC ILDG archival gauge configuration</title>
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Type:           
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////
+// Scidac private file xml 
+// <?xml version="1.0" encoding="UTF-8"?><scidacFile><version>1.1</version><spacetime>4</spacetime><dims>16 16 16 32 </dims><volfmt>0</volfmt></scidacFile> 
+////////////////////////                                                                                                                                                                              
+
+#if 0
+////////////////////////////////////////////////////////////////////////////////////////
+// From http://www.physics.utah.edu/~detar/scidac/qio_2p3.pdf
+////////////////////////////////////////////////////////////////////////////////////////
 struct usqcdPropFile : Serializable { 
  public:
   GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdPropFile,
@@ -100,17 +224,8 @@ struct usqcdPropInfo : Serializable {
     version=1.0; 
   };
 };
-struct scidacChecksum : Serializable { 
- public:
-  GRID_SERIALIZABLE_CLASS_MEMBERS(scidacChecksum,
-				  double, version,
-				  uint32_t, suma,
-				  uint32_t, sumb);
-  scidacChecksum() { 
-    version=1.0; 
-    suma=sumb=0;
-  };
-};
+#endif
+
 }
 #endif
 #endif

From ae4de947989d1c9299b7dbeb8c1a570f745a84d7 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 18 Jun 2017 00:11:23 +0100
Subject: [PATCH 083/177] SciDAC I/O support

---
 lib/parallelIO/MetaData.h | 124 ++++++++++++++++++++++++++++++++++----
 lib/parallelIO/NerscIO.h  |   4 +-
 2 files changed, 114 insertions(+), 14 deletions(-)

diff --git a/lib/parallelIO/MetaData.h b/lib/parallelIO/MetaData.h
index 1bad07f2..6d45d0a5 100644
--- a/lib/parallelIO/MetaData.h
+++ b/lib/parallelIO/MetaData.h
@@ -38,9 +38,24 @@
 
 namespace Grid {
 
-    ////////////////////////////////////////////////////////////////////////////////
-    // header specification/interpretation
-    ////////////////////////////////////////////////////////////////////////////////
+  ///////////////////////////////////////////////////////
+  // Precision mapping
+  ///////////////////////////////////////////////////////
+  template<class vobj> static std::string getFormatString (void)
+  {
+    std::string format;
+    typedef typename getPrecision<vobj>::real_scalar_type stype;
+    if ( sizeof(stype) == sizeof(float) ) {
+      format = std::string("IEEE32BIG");
+    }
+    if ( sizeof(stype) == sizeof(double) ) {
+      format = std::string("IEEE64BIG");
+    }
+    return format;
+  }
+  ////////////////////////////////////////////////////////////////////////////////
+  // header specification/interpretation
+  ////////////////////////////////////////////////////////////////////////////////
     class FieldMetaData : Serializable {
     public:
 
@@ -66,8 +81,15 @@ namespace Grid {
 				      std::string, creation_date,
 				      std::string, archive_date,
 				      std::string, floating_point);
+      FieldMetaData(void) { 
+	nd=4;
+	dimension.resize(4);
+	boundary.resize(4);
+      }
     };
 
+
+
   namespace QCD {
 
     using namespace Grid;
@@ -89,13 +111,6 @@ namespace Grid {
 	header.boundary[d] = std::string("PERIODIC");
       }
     }
-    template<class GaugeField>
-    inline void GaugeStatistics(GaugeField & data,FieldMetaData &header)
-    {
-      // How to convert data precision etc...
-      header.link_trace=Grid::QCD::WilsonLoops<PeriodicGimplR>::linkTrace(data);
-      header.plaquette =Grid::QCD::WilsonLoops<PeriodicGimplR>::avgPlaquette(data);
-    }
 
     inline void MachineCharacteristics(FieldMetaData &header)
     {
@@ -133,7 +148,7 @@ namespace Grid {
 	s << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl;	\
       }									\
 									\
-      s << "CHECKSUM (NERSC) = "<< std::hex << std::setw(10) << field.checksum << std::dec<<std::endl; \
+      s << "CHECKSUM = "<< std::hex << std::setw(10) << field.checksum << std::dec<<std::endl; \
       s << "SCIDAC_CHECKSUMA = "<< std::hex << std::setw(10) << field.scidac_checksuma << std::dec<<std::endl; \
       s << "SCIDAC_CHECKSUMB = "<< std::hex << std::setw(10) << field.scidac_checksumb << std::dec<<std::endl; \
       s << "ENSEMBLE_ID = "     << field.ensemble_id      << std::endl;	\
@@ -146,6 +161,48 @@ namespace Grid {
       s << "FLOATING_POINT = "  << field.floating_point   << std::endl;	\
       s << "END_HEADER"         << std::endl;
 
+template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMetaData &header)
+{
+  GridBase *grid = field._grid;
+  std::string format = getFormatString<vobj>();
+   header.floating_point = format;
+   header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
+   GridMetaData(grid,header); 
+   MachineCharacteristics(header);
+ }
+ inline void GaugeStatistics(Lattice<vLorentzColourMatrixF> & data,FieldMetaData &header)
+ {
+   // How to convert data precision etc...
+   header.link_trace=Grid::QCD::WilsonLoops<PeriodicGimplF>::linkTrace(data);
+   header.plaquette =Grid::QCD::WilsonLoops<PeriodicGimplF>::avgPlaquette(data);
+ }
+ inline void GaugeStatistics(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
+ {
+   // How to convert data precision etc...
+   header.link_trace=Grid::QCD::WilsonLoops<PeriodicGimplD>::linkTrace(data);
+   header.plaquette =Grid::QCD::WilsonLoops<PeriodicGimplD>::avgPlaquette(data);
+ }
+ template<> inline void PrepareMetaData<vLorentzColourMatrixF>(Lattice<vLorentzColourMatrixF> & field, FieldMetaData &header)
+ {
+   
+   GridBase *grid = field._grid;
+   std::string format = getFormatString<vLorentzColourMatrixF>();
+   header.floating_point = format;
+   header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
+   GridMetaData(grid,header); 
+   GaugeStatistics(field,header);
+   MachineCharacteristics(header);
+ }
+ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header)
+ {
+   GridBase *grid = field._grid;
+   std::string format = getFormatString<vLorentzColourMatrixD>();
+   header.floating_point = format;
+   header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
+   GridMetaData(grid,header); 
+   GaugeStatistics(field,header);
+   MachineCharacteristics(header);
+ }
 
     //////////////////////////////////////////////////////////////////////
     // Utilities ; these are QCD aware
@@ -171,6 +228,48 @@ namespace Grid {
     typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
     typedef iLorentzColour2x3<ComplexD> LorentzColour2x3D;
 
+/////////////////////////////////////////////////////////////////////////////////
+// Simple classes for precision conversion
+/////////////////////////////////////////////////////////////////////////////////
+template <class fobj, class sobj>
+struct BinarySimpleUnmunger {
+  typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
+  typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;
+  
+  void operator()(sobj &in, fobj &out) {
+    // take word by word and transform according to the status
+    fobj_stype *out_buffer = (fobj_stype *)&out;
+    sobj_stype *in_buffer = (sobj_stype *)&in;
+    size_t fobj_words = sizeof(out) / sizeof(fobj_stype);
+    size_t sobj_words = sizeof(in) / sizeof(sobj_stype);
+    assert(fobj_words == sobj_words);
+    
+    for (unsigned int word = 0; word < sobj_words; word++)
+      out_buffer[word] = in_buffer[word];  // type conversion on the fly
+    
+  }
+};
+
+template <class fobj, class sobj>
+struct BinarySimpleMunger {
+  typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
+  typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;
+
+  void operator()(fobj &in, sobj &out) {
+    // take word by word and transform according to the status
+    fobj_stype *in_buffer = (fobj_stype *)&in;
+    sobj_stype *out_buffer = (sobj_stype *)&out;
+    size_t fobj_words = sizeof(in) / sizeof(fobj_stype);
+    size_t sobj_words = sizeof(out) / sizeof(sobj_stype);
+    assert(fobj_words == sobj_words);
+    
+    for (unsigned int word = 0; word < sobj_words; word++)
+      out_buffer[word] = in_buffer[word];  // type conversion on the fly
+    
+  }
+};
+
+
     template<class fobj,class sobj>
     struct GaugeSimpleMunger{
       void operator()(fobj &in, sobj &out) {
@@ -220,6 +319,7 @@ namespace Grid {
 	}
       }
     };
-
   }
+
+
 }
diff --git a/lib/parallelIO/NerscIO.h b/lib/parallelIO/NerscIO.h
index cc37b537..786839f2 100644
--- a/lib/parallelIO/NerscIO.h
+++ b/lib/parallelIO/NerscIO.h
@@ -179,7 +179,7 @@ namespace Grid {
 	assert(0);
       }
 
-      GaugeStatistics<GaugeField>(Umu,clone);
+      GaugeStatistics(Umu,clone);
 
       std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec
 	       <<" header   "<<std::hex<<header.checksum<<std::dec <<std::endl;
@@ -233,7 +233,7 @@ namespace Grid {
 
 	GridMetaData(grid,header);
 	assert(header.nd==4);
-	GaugeStatistics<GaugeField>(Umu,header);
+	GaugeStatistics(Umu,header);
 	MachineCharacteristics(header);
 
 	int offset;

From 46879e165814015c8d82195771573df01a1edd66 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 18 Jun 2017 00:11:45 +0100
Subject: [PATCH 084/177] Complex defined in Impl even for gauge.

---
 lib/qcd/action/fermion/FermionOperatorImpl.h |  6 ------
 lib/qcd/action/gauge/GaugeImplTypes.h        | 16 +++++++++++-----
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h
index 20458b6d..524179f5 100644
--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@@ -644,19 +644,16 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
 
     INHERIT_GIMPL_TYPES(Gimpl);
       
-    template <typename vtype> using iImplScalar            = iScalar<iScalar<iScalar<vtype> > >;
     template <typename vtype> using iImplSpinor            = iScalar<iScalar<iVector<vtype, Dimension> > >;
     template <typename vtype> using iImplHalfSpinor        = iScalar<iScalar<iVector<vtype, Dimension> > >;
     template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
     template <typename vtype> using iImplPropagator        = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
     
-    typedef iImplScalar<Simd>            SiteComplex;
     typedef iImplSpinor<Simd>            SiteSpinor;
     typedef iImplHalfSpinor<Simd>        SiteHalfSpinor;
     typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
     typedef iImplPropagator<Simd>        SitePropagator;
     
-    typedef Lattice<SiteComplex>           ComplexField;
     typedef Lattice<SiteSpinor>            FermionField;
     typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
     typedef Lattice<SitePropagator> PropagatorField;
@@ -775,7 +772,6 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
 
     INHERIT_GIMPL_TYPES(Gimpl);
 
-    template <typename vtype> using iImplScalar            = iScalar<iScalar<iScalar<vtype> > >;
     template <typename vtype> using iImplSpinor            = iScalar<iScalar<iVector<vtype, Dimension> > >;
     template <typename vtype> using iImplHalfSpinor        = iScalar<iScalar<iVector<vtype, Dimension> > >;
     template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
@@ -792,12 +788,10 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
     typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
     typedef Lattice<SitePropagator> PropagatorField;
     
-    typedef iImplScalar<Simd>            SiteComplex;
     typedef iImplSpinor<Simd>            SiteSpinor;
     typedef iImplHalfSpinor<Simd>        SiteHalfSpinor;
 
     
-    typedef Lattice<SiteComplex>           ComplexField;
     typedef Lattice<SiteSpinor>            FermionField;
     
     typedef SimpleCompressor<SiteSpinor> Compressor;
diff --git a/lib/qcd/action/gauge/GaugeImplTypes.h b/lib/qcd/action/gauge/GaugeImplTypes.h
index 9d36eead..0c0df219 100644
--- a/lib/qcd/action/gauge/GaugeImplTypes.h
+++ b/lib/qcd/action/gauge/GaugeImplTypes.h
@@ -40,12 +40,15 @@ namespace QCD {
   typedef typename GImpl::Simd Simd;                \
   typedef typename GImpl::LinkField GaugeLinkField; \
   typedef typename GImpl::Field GaugeField;         \
+  typedef typename GImpl::ComplexField ComplexField;\
   typedef typename GImpl::SiteField SiteGaugeField; \
+  typedef typename GImpl::SiteComplex SiteComplex;  \
   typedef typename GImpl::SiteLink SiteGaugeLink;
 
-#define INHERIT_FIELD_TYPES(Impl)             \
-  typedef typename Impl::Simd Simd;           \
-  typedef typename Impl::SiteField SiteField; \
+#define INHERIT_FIELD_TYPES(Impl)		    \
+  typedef typename Impl::Simd Simd;		    \
+  typedef typename Impl::ComplexField ComplexField; \
+  typedef typename Impl::SiteField SiteField;	    \
   typedef typename Impl::Field Field;
 
 // hardcodes the exponential approximation in the template
@@ -53,12 +56,15 @@ template <class S, int Nrepresentation = Nc, int Nexp = 12 > class GaugeImplType
 public:
   typedef S Simd;
 
-  template <typename vtype> using iImplGaugeLink  = iScalar<iScalar<iMatrix<vtype, Nrepresentation>>>;
-  template <typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation>>, Nd>;
+  template <typename vtype> using iImplScalar     = iScalar<iScalar<iScalar<vtype> > >;
+  template <typename vtype> using iImplGaugeLink  = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
+  template <typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd>;
 
+  typedef iImplScalar<Simd>     SiteComplex;
   typedef iImplGaugeLink<Simd>  SiteLink;
   typedef iImplGaugeField<Simd> SiteField;
 
+  typedef Lattice<SiteComplex> ComplexField;
   typedef Lattice<SiteLink>  LinkField; 
   typedef Lattice<SiteField> Field;
 

From b96daf53a0c060c530eee3769861133d764589cf Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 18 Jun 2017 00:12:15 +0100
Subject: [PATCH 085/177] Query tensor structures

---
 lib/tensors/Tensor_index.h | 60 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/lib/tensors/Tensor_index.h b/lib/tensors/Tensor_index.h
index 7f34f3ac..f114baf8 100644
--- a/lib/tensors/Tensor_index.h
+++ b/lib/tensors/Tensor_index.h
@@ -47,6 +47,28 @@ template<int Level>
 class TensorIndexRecursion {
 
  public:
+
+  ////////////////////////////////////////////////////
+  // Type Queries
+  ////////////////////////////////////////////////////
+  template<class vtype>       static inline int indexRank(const iScalar<vtype> tmp)  { return TensorIndexRecursion<Level-1>::indexRank(tmp._internal);  }
+  template<class vtype,int N> static inline int indexRank(const iVector<vtype,N> tmp){ return TensorIndexRecursion<Level-1>::indexRank(tmp._internal[0]);  }
+  template<class vtype,int N> static inline int indexRank(const iMatrix<vtype,N> tmp){ return TensorIndexRecursion<Level-1>::indexRank(tmp._internal[0][0]);  }
+
+  template<class vtype>       static inline int isScalar(const iScalar<vtype> tmp)  { return TensorIndexRecursion<Level-1>::isScalar(tmp._internal);  }
+  template<class vtype,int N> static inline int isScalar(const iVector<vtype,N> tmp){ return TensorIndexRecursion<Level-1>::isScalar(tmp._internal[0]);  }
+  template<class vtype,int N> static inline int isScalar(const iMatrix<vtype,N> tmp){ return TensorIndexRecursion<Level-1>::isScalar(tmp._internal[0][0]);  }
+
+  template<class vtype>       static inline int isVector(const iScalar<vtype> tmp)  { return TensorIndexRecursion<Level-1>::isVector(tmp._internal);  }
+  template<class vtype,int N> static inline int isVector(const iVector<vtype,N> tmp){ return TensorIndexRecursion<Level-1>::isVector(tmp._internal[0]);  }
+  template<class vtype,int N> static inline int isVector(const iMatrix<vtype,N> tmp){ return TensorIndexRecursion<Level-1>::isVector(tmp._internal[0][0]);  }
+
+  template<class vtype>       static inline int isMatrix(const iScalar<vtype> tmp)  { return TensorIndexRecursion<Level-1>::isMatrix(tmp._internal);  }
+  template<class vtype,int N> static inline int isMatrix(const iVector<vtype,N> tmp){ return TensorIndexRecursion<Level-1>::isMatrix(tmp._internal[0]);  }
+  template<class vtype,int N> static inline int isMatrix(const iMatrix<vtype,N> tmp){ return TensorIndexRecursion<Level-1>::isMatrix(tmp._internal[0][0]);  }
+  ////////////////////////////////////////////////////
+  // Trace
+  ////////////////////////////////////////////////////
   template<class vtype>
   static auto traceIndex(const iScalar<vtype> arg) ->  iScalar<decltype(TensorIndexRecursion<Level-1>::traceIndex(arg._internal))> 
   {
@@ -215,6 +237,24 @@ class TensorIndexRecursion {
 template<>
 class TensorIndexRecursion<0> {
  public:
+  ////////////////////////////////////////////////////
+  // Type Queries
+  ////////////////////////////////////////////////////
+  template<class vtype>       static inline int indexRank(const iScalar<vtype> tmp)  { return 1; }
+  template<class vtype,int N> static inline int indexRank(const iVector<vtype,N> tmp){ return N; }
+  template<class vtype,int N> static inline int indexRank(const iMatrix<vtype,N> tmp){ return N; }
+
+  template<class vtype>       static inline int isScalar(const iScalar<vtype> tmp)  { return true;}
+  template<class vtype,int N> static inline int isScalar(const iVector<vtype,N> tmp){ return false;}
+  template<class vtype,int N> static inline int isScalar(const iMatrix<vtype,N> tmp){ return false;}
+
+  template<class vtype>       static inline int isVector(const iScalar<vtype> tmp)  { return false;}
+  template<class vtype,int N> static inline int isVector(const iVector<vtype,N> tmp){ return true;}
+  template<class vtype,int N> static inline int isVector(const iMatrix<vtype,N> tmp){ return false;}
+
+  template<class vtype>       static inline int isMatrix(const iScalar<vtype> tmp)  { return false;}
+  template<class vtype,int N> static inline int isMatrix(const iVector<vtype,N> tmp){ return false;}
+  template<class vtype,int N> static inline int isMatrix(const iMatrix<vtype,N> tmp){ return true;}
 
   /////////////////////////////////////////
   // Ends recursion for trace (scalar/vector/matrix)
@@ -302,6 +342,26 @@ class TensorIndexRecursion<0> {
 ////////////////////////////////////////////////////////////////////////////////////////////////////////
 // External wrappers
 ////////////////////////////////////////////////////////////////////////////////////////////////////////
+template<int Level,class vtype> inline int indexRank(void)
+{
+  vtype tmp;
+  return TensorIndexRecursion<Level>::indexRank(tmp);
+}
+template<int Level,class vtype> inline int isScalar(void)
+{
+  vtype tmp;
+  return TensorIndexRecursion<Level>::isScalar(tmp);
+}
+template<int Level,class vtype> inline int isVector(void)
+{
+  vtype tmp;
+  return TensorIndexRecursion<Level>::isVector(tmp);
+}
+template<int Level,class vtype> inline int isMatrix(void)
+{
+  vtype tmp;
+  return TensorIndexRecursion<Level>::isMatrix(tmp);
+}
 
 template<int Level,class vtype> inline auto traceIndex (const vtype &arg) -> RemoveCRV(TensorIndexRecursion<Level>::traceIndex(arg))
 {

From ae39ec85a3b89072d9ea325cb953068a064ec822 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 18 Jun 2017 00:12:48 +0100
Subject: [PATCH 086/177] ComplexField defined

---
 lib/qcd/utils/WilsonLoops.h | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/lib/qcd/utils/WilsonLoops.h b/lib/qcd/utils/WilsonLoops.h
index 5382882e..ea713ec8 100644
--- a/lib/qcd/utils/WilsonLoops.h
+++ b/lib/qcd/utils/WilsonLoops.h
@@ -73,7 +73,7 @@ public:
   //////////////////////////////////////////////////
   // trace of directed plaquette oriented in mu,nu plane
   //////////////////////////////////////////////////
-  static void traceDirPlaquette(LatticeComplex &plaq,
+  static void traceDirPlaquette(ComplexField &plaq,
                                 const std::vector<GaugeMat> &U, const int mu,
                                 const int nu) {
     GaugeMat sp(U[0]._grid);
@@ -83,9 +83,9 @@ public:
   //////////////////////////////////////////////////
   // sum over all planes of plaquette
   //////////////////////////////////////////////////
-  static void sitePlaquette(LatticeComplex &Plaq,
+  static void sitePlaquette(ComplexField &Plaq,
                             const std::vector<GaugeMat> &U) {
-    LatticeComplex sitePlaq(U[0]._grid);
+    ComplexField sitePlaq(U[0]._grid);
     Plaq = zero;
     for (int mu = 1; mu < Nd; mu++) {
       for (int nu = 0; nu < mu; nu++) {
@@ -104,11 +104,11 @@ public:
       U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
     }
 
-    LatticeComplex Plaq(Umu._grid);
+    ComplexField Plaq(Umu._grid);
 
     sitePlaquette(Plaq, U);
-    TComplex Tp = sum(Plaq);
-    Complex p = TensorRemove(Tp);
+    auto Tp = sum(Plaq);
+    auto p = TensorRemove(Tp);
     return p.real();
   }
 
@@ -129,15 +129,15 @@ public:
   static RealD linkTrace(const GaugeLorentz &Umu) {
     std::vector<GaugeMat> U(Nd, Umu._grid);
 
-    LatticeComplex Tr(Umu._grid);
+    ComplexField Tr(Umu._grid);
     Tr = zero;
     for (int mu = 0; mu < Nd; mu++) {
       U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
       Tr = Tr + trace(U[mu]);
     }
 
-    TComplex Tp = sum(Tr);
-    Complex p = TensorRemove(Tp);
+    auto Tp = sum(Tr);
+    auto p = TensorRemove(Tp);
 
     double vol = Umu._grid->gSites();
 
@@ -330,8 +330,8 @@ public:
 
     double coeff = 8.0/(32.0*M_PI*M_PI);
 
-    LatticeComplex qfield = coeff*trace(Bx*Ex + By*Ey + Bz*Ez);
-    TComplex Tq = sum(qfield);
+    ComplexField qfield = coeff*trace(Bx*Ex + By*Ey + Bz*Ez);
+    auto Tq = sum(qfield);
     return TensorRemove(Tq).real();
   }
 
@@ -350,16 +350,16 @@ public:
                adj(Gimpl::CovShiftForward(
                    U[nu], nu, Gimpl::CovShiftForward(U[nu], nu, U[mu])));
   }
-  static void traceDirRectangle(LatticeComplex &rect,
+  static void traceDirRectangle(ComplexField &rect,
                                 const std::vector<GaugeMat> &U, const int mu,
                                 const int nu) {
     GaugeMat sp(U[0]._grid);
     dirRectangle(sp, U, mu, nu);
     rect = trace(sp);
   }
-  static void siteRectangle(LatticeComplex &Rect,
+  static void siteRectangle(ComplexField &Rect,
                             const std::vector<GaugeMat> &U) {
-    LatticeComplex siteRect(U[0]._grid);
+    ComplexField siteRect(U[0]._grid);
     Rect = zero;
     for (int mu = 1; mu < Nd; mu++) {
       for (int nu = 0; nu < mu; nu++) {
@@ -379,12 +379,12 @@ public:
       U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
     }
 
-    LatticeComplex Rect(Umu._grid);
+    ComplexField Rect(Umu._grid);
 
     siteRectangle(Rect, U);
 
-    TComplex Tp = sum(Rect);
-    Complex p = TensorRemove(Tp);
+    auto Tp = sum(Rect);
+    auto p = TensorRemove(Tp);
     return p.real();
   }
   //////////////////////////////////////////////////

From 1d18d95d4f1457e2f37f0237db79873a346873df Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 18 Jun 2017 00:13:03 +0100
Subject: [PATCH 087/177] Class name return

---
 lib/serialisation/MacroMagic.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/serialisation/MacroMagic.h b/lib/serialisation/MacroMagic.h
index 04f1b401..774c947f 100644
--- a/lib/serialisation/MacroMagic.h
+++ b/lib/serialisation/MacroMagic.h
@@ -115,6 +115,7 @@ THE SOFTWARE.
 #define GRID_MACRO_WRITE_MEMBER(A,B) Grid::write(WR,#B,obj. B);
 
 #define GRID_SERIALIZABLE_CLASS_MEMBERS(cname,...)\
+  std::string SerialisableClassName(void) {return std::string(#cname);}	\
 GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__))\
 template <typename T>\
 static inline void write(Writer<T> &WR,const std::string &s, const cname &obj){ \

From e6d984b484f9679bf1240414b2df239bc888e595 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 18 Jun 2017 00:13:22 +0100
Subject: [PATCH 088/177] ILDG tests

---
 tests/IO/Test_ildg_io.cc   | 2 +-
 tests/IO/Test_ildg_read.cc | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/IO/Test_ildg_io.cc b/tests/IO/Test_ildg_io.cc
index 1408c638..199773ab 100644
--- a/tests/IO/Test_ildg_io.cc
+++ b/tests/IO/Test_ildg_io.cc
@@ -78,7 +78,7 @@ int main (int argc, char ** argv)
   std::cout <<GridLogMessage<<"** Writing out  ILDG conf    *********"<<std::endl;
   std::cout <<GridLogMessage<<"**************************************"<<std::endl;
   std::string file("./ckpoint_ildg.4000");
-  IldgIO::writeConfiguration(file,Umu, "IEEE64BIG");
+  IldgIO::writeConfiguration(file,Umu);
 
   Umu_saved = Umu;
   std::cout <<GridLogMessage<<"**************************************"<<std::endl;
diff --git a/tests/IO/Test_ildg_read.cc b/tests/IO/Test_ildg_read.cc
index 70a46dbf..9ea1d412 100644
--- a/tests/IO/Test_ildg_read.cc
+++ b/tests/IO/Test_ildg_read.cc
@@ -51,7 +51,7 @@ int main (int argc, char ** argv)
   std::vector<LatticeColourMatrix> U(4,&Fine);
   
   FieldMetaData header;
-  std::string file("./ckpoint_lat");
+  std::string file("./ildg.file");
   IldgIO::readConfiguration(file,Umu,header);
 
   for(int mu=0;mu<Nd;mu++){

From 1300b0b04b8bd4a00c6a1b3ee6b93a5984bf88f5 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Mon, 19 Jun 2017 01:01:48 +0100
Subject: [PATCH 089/177] Update to enable multiple records per file more
 consistent with SciDAC. open, close, write records...

---
 lib/parallelIO/IldgIO.h                      | 285 ++++++++++---------
 lib/qcd/hmc/checkpointers/ILDGCheckpointer.h |  10 +-
 tests/IO/Test_ildg_io.cc                     |  10 +-
 tests/IO/Test_ildg_read.cc                   |   5 +-
 4 files changed, 173 insertions(+), 137 deletions(-)

diff --git a/lib/parallelIO/IldgIO.h b/lib/parallelIO/IldgIO.h
index 9a1612d5..1d1b5e0c 100644
--- a/lib/parallelIO/IldgIO.h
+++ b/lib/parallelIO/IldgIO.h
@@ -38,14 +38,17 @@ directory
 #include <sys/utsname.h>
 #include <unistd.h>
 
-//Lime is a must have for this functionality
-extern "C" {  // for linkage
+//C-Lime is a must have for this functionality
+extern "C" {  
 #include "lime.h"
 }
 
 namespace Grid {
 namespace QCD {
 
+  /////////////////////////////////
+  // Encode word types as strings
+  /////////////////////////////////
  template<class word> inline std::string ScidacWordMnemonic(void){ return std::string("unknown"); }
  template<> inline std::string ScidacWordMnemonic<double>  (void){ return std::string("D"); }
  template<> inline std::string ScidacWordMnemonic<float>   (void){ return std::string("F"); }
@@ -54,6 +57,9 @@ namespace QCD {
  template<> inline std::string ScidacWordMnemonic< int64_t>(void){ return std::string("I64_t"); }
  template<> inline std::string ScidacWordMnemonic<uint64_t>(void){ return std::string("U64_t"); }
 
+  /////////////////////////////////////////
+  // Encode a generic tensor as a string
+  /////////////////////////////////////////
  template<class vobj> std::string ScidacRecordTypeString(int &colors, int &spins, int & typesize,int &datacount) { 
 
    typedef typename getPrecision<vobj>::real_scalar_type stype;
@@ -113,6 +119,10 @@ namespace QCD {
    return ScidacRecordTypeString<vobj>(colors,spins,typesize,datacount);
  };
 
+
+ ////////////////////////////////////////////////////////////
+ // Helper to fill out metadata
+ ////////////////////////////////////////////////////////////
  template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
 					  FieldMetaData &header,
 					  scidacRecord & _scidacRecord,
@@ -159,88 +169,38 @@ namespace QCD {
 ////////////////////////////////////////////////////////////////////////////////////
 // Lime, ILDG and Scidac I/O classes
 ////////////////////////////////////////////////////////////////////////////////////
-class LimeIO : public BinaryIO {
+class GridLimeReader : public BinaryIO {
  public:
-
    ///////////////////////////////////////////////////
    // FIXME: format for RNG? Now just binary out instead
-   // FIXME: Make interface able to write multiple records
-   // FIXME: Split into LimeReader and LimeWriter
    ///////////////////////////////////////////////////
-   /*
-   FILE * File;
-   LimeWriter LimeW;
-   LimeReader LimeR;
-   template<class serialisable_object>
-   int readObject(serialisable_object &object,std::string object_name,std::string record_name)
 
-  int createLimeRecordHeader(std::string message, int MB, int ME, size_t PayloadSize);
-  template<class serialisable_object>
-  int writeObject(int MB,int ME,serialisable_object &object,std::string object_name,std::string record_name)
-  template<class vobj>
-  int writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string filename,std::string record_name)
-   */
-  ///////////////////////////////////////////////////////
-  // Lime utility functions
-  ///////////////////////////////////////////////////////
+   FILE       *File;
+   LimeReader *LimeR;
+   std::string filename;
 
-  static int createLimeRecordHeader(std::string message, int MB, int ME, size_t PayloadSize, LimeWriter* L)
-  {
-    LimeRecordHeader *h;
-    h = limeCreateHeader(MB, ME, const_cast<char *>(message.c_str()), PayloadSize);
-    assert(limeWriteRecordHeader(h, L) >= 0);
-    limeDestroyHeader(h);
-    return LIME_SUCCESS;
-  }
-
-  ////////////////////////////////////////////
-  // Write a generic serialisable object
-  ////////////////////////////////////////////
-  template<class serialisable_object>
-  static void writeLimeObject(int MB,int ME,serialisable_object &object,std::string object_name,std::string record_name, LimeWriter *LimeW)
-  {
-    std::string xmlstring;
-    {
-      XmlWriter WR("","");
-      write(WR,object_name,object);
-      xmlstring = WR.XmlString();
-    }
-    uint64_t nbytes = xmlstring.size();
-    LimeRecordHeader *h = limeCreateHeader(MB, ME,(char *)record_name.c_str(), nbytes);
-    int err=limeWriteRecordHeader(h, LimeW); assert(err>=0);
-    err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0);
-    err=limeWriterCloseRecord(LimeW);  assert(err>=0);
-    limeDestroyHeader(h);
-  }
-  ////////////////////////////////////////////
-  // Read a generic serialisable object
-  ////////////////////////////////////////////
-  template<class serialisable_object>
-  static void readLimeObject(serialisable_object &object,std::string object_name,std::string record_name, LimeReader *LimeR)
-  {
-    std::string xmlstring;
-    // should this be a do while; can we miss a first record??
-    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
-
-      uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration)
-
-      if ( strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) )  ) {
-	std::vector<char> xmlc(nbytes+1,'\0');
-	limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);    
-	XmlReader RD(&xmlc[0],"");
-	read(RD,object_name,object);
-	return;
-      }
-
-    }  
-    assert(0);
-  }
+   /////////////////////////////////////////////
+   // Open the file
+   /////////////////////////////////////////////
+   void open(std::string &_filename) 
+   {
+     filename= _filename;
+     File = fopen(filename.c_str(), "r");
+     LimeR = limeCreateReader(File);
+   }
+   /////////////////////////////////////////////
+   // Close the file
+   /////////////////////////////////////////////
+   void close(void){
+     fclose(File);
+     //     limeDestroyReader(LimeR);
+   }
 
   ////////////////////////////////////////////
   // Read a generic lattice field and verify checksum
   ////////////////////////////////////////////
   template<class vobj>
-  static void readLimeLatticeBinaryObject(Lattice<vobj> &field,std::string filename,std::string record_name,FILE *File, LimeReader *LimeR)
+  void readLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
   {
     typedef typename vobj::scalar_object sobj;
     scidacChecksum scidacChecksum_;
@@ -262,7 +222,7 @@ class LimeIO : public BinaryIO {
 	/////////////////////////////////////////////
 	// Insist checksum is next record
 	/////////////////////////////////////////////
-	readLimeObject(scidacChecksum_,std::string("scidacChecksum"),record_name,LimeR);
+	readLimeObject(scidacChecksum_,std::string("scidacChecksum"),record_name);
 
 	/////////////////////////////////////////////
 	// Verify checksums
@@ -272,14 +232,91 @@ class LimeIO : public BinaryIO {
       }
     }
   }
+  ////////////////////////////////////////////
+  // Read a generic serialisable object
+  ////////////////////////////////////////////
+  template<class serialisable_object>
+  void readLimeObject(serialisable_object &object,std::string object_name,std::string record_name)
+  {
+    std::string xmlstring;
+    // should this be a do while; can we miss a first record??
+    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
 
+      uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration)
+
+      if ( strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) )  ) {
+	std::vector<char> xmlc(nbytes+1,'\0');
+	limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);    
+	XmlReader RD(&xmlc[0],"");
+	read(RD,object_name,object);
+	return;
+      }
+
+    }  
+    assert(0);
+  }
+};
+
+class GridLimeWriter : public BinaryIO {
+ public:
+   ///////////////////////////////////////////////////
+   // FIXME: format for RNG? Now just binary out instead
+   ///////////////////////////////////////////////////
+
+   FILE       *File;
+   LimeWriter *LimeW;
+   std::string filename;
+
+   void open(std::string &_filename) { 
+     filename= _filename;
+     File = fopen(filename.c_str(), "w");
+     LimeW = limeCreateWriter(File); assert(LimeW != NULL );
+   }
+   /////////////////////////////////////////////
+   // Close the file
+   /////////////////////////////////////////////
+   void close(void) {
+     fclose(File);
+     //  limeDestroyWriter(LimeW);
+   }
+  ///////////////////////////////////////////////////////
+  // Lime utility functions
+  ///////////////////////////////////////////////////////
+  int createLimeRecordHeader(std::string message, int MB, int ME, size_t PayloadSize)
+  {
+    LimeRecordHeader *h;
+    h = limeCreateHeader(MB, ME, const_cast<char *>(message.c_str()), PayloadSize);
+    assert(limeWriteRecordHeader(h, LimeW) >= 0);
+    limeDestroyHeader(h);
+    return LIME_SUCCESS;
+  }
+  ////////////////////////////////////////////
+  // Write a generic serialisable object
+  ////////////////////////////////////////////
+  template<class serialisable_object>
+  void writeLimeObject(int MB,int ME,serialisable_object &object,std::string object_name,std::string record_name)
+  {
+    std::string xmlstring;
+    {
+      XmlWriter WR("","");
+      write(WR,object_name,object);
+      xmlstring = WR.XmlString();
+    }
+    uint64_t nbytes = xmlstring.size();
+    int err;
+    LimeRecordHeader *h = limeCreateHeader(MB, ME,(char *)record_name.c_str(), nbytes); assert(h!= NULL);
+
+    err=limeWriteRecordHeader(h, LimeW);                    assert(err>=0);
+    err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0);
+    err=limeWriterCloseRecord(LimeW);                       assert(err>=0);
+    limeDestroyHeader(h);
+  }
   ////////////////////////////////////////////
   // Write a generic lattice field and csum
   ////////////////////////////////////////////
   template<class vobj>
-  static void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string filename,std::string record_name,FILE *File, LimeWriter *LimeW)
+  void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
   {
-
     ////////////////////////////////////////////
     // Create record header
     ////////////////////////////////////////////
@@ -287,7 +324,7 @@ class LimeIO : public BinaryIO {
     int err;
     uint32_t nersc_csum,scidac_csuma,scidac_csumb;
     uint64_t PayloadSize = sizeof(sobj) * field._grid->_gsites;
-    createLimeRecordHeader(record_name, 0, 0, PayloadSize, LimeW);
+    createLimeRecordHeader(record_name, 0, 0, PayloadSize);
 
     ////////////////////////////////////////////////////////////////////
     // NB: FILE and iostream are jointly writing disjoint sequences in the
@@ -317,34 +354,25 @@ class LimeIO : public BinaryIO {
     checksum.suma= streama.str();
     checksum.sumb= streamb.str();
     std::cout << GridLogMessage<<" writing scidac checksums "<<std::hex<<scidac_csuma<<"/"<<scidac_csumb<<std::dec<<std::endl;
-    writeLimeObject(0,1,checksum,std::string("scidacChecksum"    ),std::string(SCIDAC_CHECKSUM),LimeW);
+    writeLimeObject(0,1,checksum,std::string("scidacChecksum"    ),std::string(SCIDAC_CHECKSUM));
   }
-  // Could end the LIME base class here
 };
 
-class ScidacIO : public LimeIO {
+class ScidacWriter : public GridLimeWriter {
  public:
-   /*
-    LimeWriter *LimeW;
-    LimeReader *LimeR;
-    FILE *File;
-  template<class userFile>
-  int open(std::string filename,GridBase *grid,userFile &_userFile,int volfmt) {
- 
-  }
-  void close(void) {
- 
-  }
-  template<class vobj,class userRecord>
-  int writeScidacField(Lattice<vobj> &field,userRecord &_userRecord,int volfmt) 
-  template<class vobj,class userRecord>
-  int  readScidacField(Lattice<vobj> &field,userRecord &_userRecord,int volfmt) 
-   */
+
+   template<class SerialisableUserFile>
+   void writeScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile)
+   {
+     scidacFile    _scidacFile(grid);
+     writeLimeObject(1,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
+     writeLimeObject(0,1,_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
+   }
   ////////////////////////////////////////////////
   // Write generic lattice field in scidac format
   ////////////////////////////////////////////////
-  template <class vobj,class userFile, class userRecord>
-  static void writeScidacField(std::string filename,Lattice<vobj> &field,userFile _userFile,userRecord _userRecord) 
+   template <class vobj, class userRecord>
+  void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord) 
   {
     typedef typename vobj::scalar_object sobj;
     uint64_t nbytes;
@@ -362,34 +390,25 @@ class ScidacIO : public LimeIO {
     //////////////////////////////////////////////
     // Fill the Lime file record by record
     //////////////////////////////////////////////
-    FILE *File = fopen(filename.c_str(), "w");
-    LimeWriter *LimeW = limeCreateWriter(File);
-    assert(LimeW != NULL );
-
-    writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT),LimeW); // Open message 
-    writeLimeObject(0,0,_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML),LimeW);
-    writeLimeObject(0,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML),LimeW);
-    writeLimeObject(0,0,_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML),LimeW);
-    writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML),LimeW);
-    writeLimeLatticeBinaryObject(field,filename,std::string(ILDG_BINARY_DATA),File,LimeW);      // Closes message with checksum
-
-    limeDestroyWriter(LimeW);
-    fclose(File);
+    writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message 
+    writeLimeObject(0,0,_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
+    writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
+    writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));      // Closes message with checksum
   }
 };
 
-class IldgIO : public ScidacIO {
+class IldgWriter : public ScidacWriter {
  public:
 
   ///////////////////////////////////
   // A little helper
   ///////////////////////////////////
-  static void writeLimeIldgLFN(std::string &LFN,LimeWriter *LimeW)
+  void writeLimeIldgLFN(std::string &LFN)
   {
     uint64_t PayloadSize = LFN.size();
     int err;
-    createLimeRecordHeader(ILDG_DATA_LFN, 0 , 0, PayloadSize, LimeW);
-    err=limeWriteRecordData(const_cast<char*>(LFN.c_str()), &PayloadSize, LimeW); assert(err>=0);
+    createLimeRecordHeader(ILDG_DATA_LFN, 0 , 0, PayloadSize);
+    err=limeWriteRecordData(const_cast<char*>(LFN.c_str()), &PayloadSize,LimeW); assert(err>=0);
     err=limeWriterCloseRecord(LimeW); assert(err>=0);
   }
 
@@ -399,7 +418,7 @@ class IldgIO : public ScidacIO {
   // Use Grid MetaData object if present.
   ////////////////////////////////////////////////////////////////
   template <class vsimd>
-  static void writeConfiguration(std::string filename,Lattice<iLorentzColourMatrix<vsimd> > &Umu) 
+  void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,int sequence,std::string LFN,std::string description) 
   {
     GridBase * grid = Umu._grid;
     typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
@@ -418,6 +437,10 @@ class IldgIO : public ScidacIO {
     ScidacMetaData(Umu,header,_scidacRecord,_scidacFile);
 
     std::string format = header.floating_point;
+    header.ensemble_id    = description;
+    header.ensemble_label = description;
+    header.sequence_number = sequence;
+    header.ildg_lfn = LFN;
 
     assert ( (format == std::string("IEEE32BIG"))  
            ||(format == std::string("IEEE64BIG")) );
@@ -453,20 +476,21 @@ class IldgIO : public ScidacIO {
     //////////////////////////////////////////////
     // Fill the Lime file record by record
     //////////////////////////////////////////////
-
-    FILE *File = fopen(filename.c_str(), "w");
-    LimeWriter *LimeW = limeCreateWriter(File); assert(LimeW != NULL);
-    writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT),LimeW); // Open message 
-    writeLimeObject(0,0,info,info.SerialisableClassName(),std::string(SCIDAC_FILE_XML),LimeW);
-    writeLimeObject(0,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML),LimeW);
-    writeLimeObject(0,0,info,info.SerialisableClassName(),std::string(SCIDAC_RECORD_XML),LimeW);
-    writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML),LimeW);
-    writeLimeObject(0,0,ildgfmt,std::string("ildgFormat")   ,std::string(ILDG_FORMAT),LimeW); // rec
-    writeLimeIldgLFN(header.ildg_lfn, LimeW);                                                 // rec
-    writeLimeLatticeBinaryObject(Umu,filename,std::string(ILDG_BINARY_DATA),File,LimeW);      // Closes message with checksum
-    limeDestroyWriter(LimeW);
+    writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message 
+    writeLimeObject(0,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
+    writeLimeObject(0,1,info,info.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
+    writeLimeObject(1,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
+    writeLimeObject(0,0,info,info.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
+    writeLimeObject(0,0,ildgfmt,std::string("ildgFormat")   ,std::string(ILDG_FORMAT)); // rec
+    writeLimeIldgLFN(header.ildg_lfn);                                                 // rec
+    writeLimeLatticeBinaryObject(Umu,std::string(ILDG_BINARY_DATA));      // Closes message with checksum
+    //    limeDestroyWriter(LimeW);
     fclose(File);
   }
+};
+
+class IldgReader : public GridLimeReader {
+ public:
 
   ////////////////////////////////////////////////////////////////
   // Read either Grid/SciDAC/ILDG configuration
@@ -476,7 +500,7 @@ class IldgIO : public ScidacIO {
   // Else use SciDAC MetaData object if present.
   ////////////////////////////////////////////////////////////////
   template <class vsimd>
-  static void readConfiguration(std::string filename,Lattice<iLorentzColourMatrix<vsimd> > &Umu, FieldMetaData &FieldMetaData_) {
+  void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu, FieldMetaData &FieldMetaData_) {
 
     typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
     typedef typename GaugeField::vector_object  vobj;
@@ -491,9 +515,6 @@ class IldgIO : public ScidacIO {
 
     assert(dims.size()==4);
 
-    FILE *File = fopen(filename.c_str(), "r");
-    LimeReader *LimeR = limeCreateReader(File);
-
     // Metadata holders
     ildgFormat     ildgFormat_    ;
     std::string    ildgLFN_       ;
diff --git a/lib/qcd/hmc/checkpointers/ILDGCheckpointer.h b/lib/qcd/hmc/checkpointers/ILDGCheckpointer.h
index b72fc6f7..118a8e25 100644
--- a/lib/qcd/hmc/checkpointers/ILDGCheckpointer.h
+++ b/lib/qcd/hmc/checkpointers/ILDGCheckpointer.h
@@ -77,7 +77,10 @@ class ILDGHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
       
       uint32_t nersc_csum,scidac_csuma,scidac_csumb;
       BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
-      IldgIO::writeConfiguration(config,U, Params.format);
+      IldgWriter _IldgWriter;
+      _IldgWriter.open(config);
+      _IldgWriter.writeConfiguration(U, traj, config, config);
+      _IldgWriter.close();
 
       std::cout << GridLogMessage << "Written ILDG Configuration on " << config
                 << " checksum " << std::hex 
@@ -97,7 +100,10 @@ class ILDGHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
     BinaryIO::readRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
 
     FieldMetaData header;
-    IldgIO::readConfiguration(config,U,header);  // format from the header
+    IldgReader _IldgReader;
+    _IldgReader.open(config);
+    _IldgReader.readConfiguration(config,U,header);  // format from the header
+    _IldgReader.close();
 
     std::cout << GridLogMessage << "Read ILDG Configuration from " << config
               << " checksum " << std::hex 
diff --git a/tests/IO/Test_ildg_io.cc b/tests/IO/Test_ildg_io.cc
index 199773ab..e3e9d385 100644
--- a/tests/IO/Test_ildg_io.cc
+++ b/tests/IO/Test_ildg_io.cc
@@ -78,13 +78,19 @@ int main (int argc, char ** argv)
   std::cout <<GridLogMessage<<"** Writing out  ILDG conf    *********"<<std::endl;
   std::cout <<GridLogMessage<<"**************************************"<<std::endl;
   std::string file("./ckpoint_ildg.4000");
-  IldgIO::writeConfiguration(file,Umu);
+  IldgWriter _IldgWriter;
+  _IldgWriter.open(file);
+  _IldgWriter.writeConfiguration(Umu,4000,std::string("dummy_ildg_LFN"),std::string("dummy_config"));
+  _IldgWriter.close();
 
   Umu_saved = Umu;
   std::cout <<GridLogMessage<<"**************************************"<<std::endl;
   std::cout <<GridLogMessage<<"** Reading back ILDG conf    *********"<<std::endl;
   std::cout <<GridLogMessage<<"**************************************"<<std::endl;
-  IldgIO::readConfiguration(file,Umu,header);
+  IldgReader _IldgReader;
+  _IldgReader.open(file);
+  _IldgReader.readConfiguration(Umu,header);
+  _IldgReader.close();
   Umu_diff = Umu - Umu_saved;
 
   std::cout <<GridLogMessage<< "norm2 Gauge Diff = "<<norm2(Umu_diff)<<std::endl;
diff --git a/tests/IO/Test_ildg_read.cc b/tests/IO/Test_ildg_read.cc
index 9ea1d412..cb1f2efc 100644
--- a/tests/IO/Test_ildg_read.cc
+++ b/tests/IO/Test_ildg_read.cc
@@ -52,7 +52,10 @@ int main (int argc, char ** argv)
   
   FieldMetaData header;
   std::string file("./ildg.file");
-  IldgIO::readConfiguration(file,Umu,header);
+  IldgReader IR;
+  IR.open(file);
+  IR.readConfiguration(Umu,header);
+  IR.close();
 
   for(int mu=0;mu<Nd;mu++){
     U[mu] = PeekIndex<LorentzIndex>(Umu,mu);

From 8b7049f737617f67815433b52a7888874f7ffec1 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Mon, 19 Jun 2017 08:46:07 +0100
Subject: [PATCH 090/177] Improved detection of usqcdInfo for plaq/linktr

---
 lib/parallelIO/IldgIO.h | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/lib/parallelIO/IldgIO.h b/lib/parallelIO/IldgIO.h
index 1d1b5e0c..17ce4a06 100644
--- a/lib/parallelIO/IldgIO.h
+++ b/lib/parallelIO/IldgIO.h
@@ -598,9 +598,14 @@ class IldgReader : public GridLimeReader {
 	}
 
 	if ( !strncmp(limeReaderType(LimeR), SCIDAC_RECORD_XML,strlen(SCIDAC_RECORD_XML)) ) { 
-	  XmlReader RD(&xmlc[0],"");
-	  read(RD,"usqcdInfo",usqcdInfo_);
-	  found_usqcdInfo = 1;
+	  std::string xmls(&xmlc[0]);
+	  // is it a USQCD info field
+	  if ( xmls.find(std::string("usqcdInfo")) != std::string::npos ) { 
+	    std::cout << GridLogMessage<<"...found a usqcdInfo field"<<std::endl;
+	    XmlReader RD(&xmlc[0],"");
+	    read(RD,"usqcdInfo",usqcdInfo_);
+	    found_usqcdInfo = 1;
+	  }
 	}
 
 	if ( !strncmp(limeReaderType(LimeR), SCIDAC_CHECKSUM,strlen(SCIDAC_CHECKSUM)) ) { 

From cfe3cd76d1543d1f13b3b53c6b029f154347cc1a Mon Sep 17 00:00:00 2001
From: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Date: Mon, 19 Jun 2017 14:04:21 +0100
Subject: [PATCH 091/177] Block solver improvements

---
 .../iterative/BlockConjugateGradient.h        |  19 +-
 lib/lattice/Lattice_reduction.h               | 189 +++++++++++++++++-
 .../solver/Test_staggered_block_cg_unprec.cc  |   7 +-
 3 files changed, 192 insertions(+), 23 deletions(-)

diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h
index d90194ae..53e11fa7 100644
--- a/lib/algorithms/iterative/BlockConjugateGradient.h
+++ b/lib/algorithms/iterative/BlockConjugateGradient.h
@@ -42,7 +42,7 @@ class BlockConjugateGradient : public OperatorFunction<Field> {
 
   typedef typename Field::scalar_type scomplex;
 
-  const int blockDim = 0;
+  int blockDim ;
 
   int Nblock;
   bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
@@ -51,14 +51,15 @@ class BlockConjugateGradient : public OperatorFunction<Field> {
   Integer MaxIterations;
   Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
   
-  BlockConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
+  BlockConjugateGradient(int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
     : Tolerance(tol),
+    blockDim(_Orthog),
     MaxIterations(maxit),
     ErrorOnNoConverge(err_on_no_conv){};
 
 void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
 {
-  int Orthog = 0; // First dimension is block dim
+  int Orthog = blockDim; // First dimension is block dim; this is an assumption
   Nblock = Src._grid->_fdimensions[Orthog];
 
   std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
@@ -179,7 +180,7 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
 
       Linop.HermOp(Psi, AP);
       AP = AP-Src;
-      std::cout << GridLogMessage <<"\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
+      std::cout << GridLogMessage <<"\t A__ True residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
 
       std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
       std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
@@ -209,8 +210,7 @@ class MultiRHSConjugateGradient : public OperatorFunction<Field> {
 
   typedef typename Field::scalar_type scomplex;
 
-  const int blockDim = 0;
-
+  int blockDim;
   int Nblock;
   bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
                            // Defaults true.
@@ -218,14 +218,15 @@ class MultiRHSConjugateGradient : public OperatorFunction<Field> {
   Integer MaxIterations;
   Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
   
-   MultiRHSConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
+  MultiRHSConjugateGradient(int Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
     : Tolerance(tol),
+    blockDim(Orthog),
     MaxIterations(maxit),
     ErrorOnNoConverge(err_on_no_conv){};
 
 void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
 {
-  int Orthog = 0; // First dimension is block dim
+  int Orthog = blockDim; // First dimension is block dim
   Nblock = Src._grid->_fdimensions[Orthog];
 
   std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
@@ -285,12 +286,10 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
     MatrixTimer.Stop();
 
     // Alpha
-    //    sliceInnerProductVectorTest(v_pAp_test,P,AP,Orthog);
     sliceInnerTimer.Start();
     sliceInnerProductVector(v_pAp,P,AP,Orthog);
     sliceInnerTimer.Stop();
     for(int b=0;b<Nblock;b++){
-      //      std::cout << " "<< v_pAp[b]<<" "<< v_pAp_test[b]<<std::endl;
       v_alpha[b] = v_rr[b]/real(v_pAp[b]);
     }
 
diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h
index 14234fe0..78f88ce3 100644
--- a/lib/lattice/Lattice_reduction.h
+++ b/lib/lattice/Lattice_reduction.h
@@ -1,4 +1,4 @@
- /*************************************************************************************
+/*************************************************************************************
     Grid physics library, www.github.com/paboyle/Grid 
     Source file: ./lib/lattice/Lattice_reduction.h
     Copyright (C) 2015
@@ -462,13 +462,15 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
   typedef typename vobj::vector_type vector_type;
 
   int Nblock = X._grid->GlobalDimensions()[Orthog];
-  
+
   GridBase *FullGrid  = X._grid;
   GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
-  
+
   Lattice<vobj> Xslice(SliceGrid);
   Lattice<vobj> Rslice(SliceGrid);
-  
+
+#if 0
+  // R[i] = Y[i] + X[j] a(j,i) 
   for(int i=0;i<Nblock;i++){
     ExtractSlice(Rslice,Y,i,Orthog);
     for(int j=0;j<Nblock;j++){
@@ -477,8 +479,90 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
     }
     InsertSlice(Rslice,R,i,Orthog);
   }
+#endif
+#if 0
+  int nh =  FullGrid->_ndimension;
+  int nl = SliceGrid->_ndimension;
+
+#pragma omp parallel 
+{ 
+
+  std::vector<int> lcoor(nl); // sliced coor
+  std::vector<int> hcoor(nh); // unsliced coor
+  std::vector<sobj> s_x(Nblock);
+
+#pragma omp for
+  for(int idx=0;idx<SliceGrid->lSites();idx++){
+
+    SliceGrid->LocalIndexToLocalCoor(idx,lcoor); 
+
+    int ddl=0;
+    for(int d=0;d<nh;d++){
+      if ( d!=Orthog ) { 
+	hcoor[d]=lcoor[ddl++];
+      }
+    }
+
+    sobj dot;
+    for(int i=0;i<Nblock;i++){
+      hcoor[Orthog] = i;
+      peekLocalSite(s_x[i],X,hcoor);
+    }
+
+    for(int i=0;i<Nblock;i++){
+      hcoor[Orthog] = i;
+      peekLocalSite(dot,Y,hcoor);
+      for(int j=0;j<Nblock;j++){
+	dot = dot + s_x[j]*(scale*aa(j,i));
+      }
+      pokeLocalSite(dot,R,hcoor);
+    }
+  }
+}
+#endif
+
+#if 1
+  assert( FullGrid->_simd_layout[Orthog]==1);
+  int nh =  FullGrid->_ndimension;
+  int nl = SliceGrid->_ndimension;
+
+
+  //FIXME package in a convenient iterator
+  //Should loop over a plane orthogonal to direction "Orthog"
+  int stride=FullGrid->_slice_stride[Orthog];
+  int block =FullGrid->_slice_block [Orthog];
+  int nblock=FullGrid->_slice_nblock[Orthog];
+  int ostride=FullGrid->_ostride[Orthog];
+#pragma omp parallel 
+  {
+
+    std::vector<vobj> s_x(Nblock);
+
+#pragma omp for collapse(2)
+    for(int n=0;n<nblock;n++){
+    for(int b=0;b<block;b++){
+      int o  = n*stride + b;
+
+
+      for(int i=0;i<Nblock;i++){
+	s_x[i] = X[o+i*ostride];
+      }
+
+      vobj dot;
+
+      for(int i=0;i<Nblock;i++){
+	dot = Y[o+i*ostride];
+	for(int j=0;j<Nblock;j++){
+	  dot = dot + s_x[j]*(scale*aa(j,i));
+	}
+	R[o+i*ostride]=dot;
+      }
+    }}
+  }
+#endif
 };
 
+
 template<class vobj>
 static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
 {
@@ -497,7 +581,8 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
   Lattice<vobj> Rslice(SliceGrid);
   
   mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-  
+
+#if 0  
   for(int i=0;i<Nblock;i++){
     ExtractSlice(Lslice,lhs,i,Orthog);
     for(int j=0;j<Nblock;j++){
@@ -505,12 +590,96 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
       mat(i,j) = innerProduct(Lslice,Rslice);
     }
   }
-#undef FORCE_DIAG
-#ifdef FORCE_DIAG
-  for(int i=0;i<Nblock;i++){
-    for(int j=0;j<Nblock;j++){
-      if ( i != j ) mat(i,j)=0.0;
+#endif
+
+#if 0
+  int nh =  FullGrid->_ndimension;
+  int nl = SliceGrid->_ndimension;
+
+#pragma omp parallel 
+{ 
+  std::vector<int> lcoor(nl); // sliced coor
+  std::vector<int> hcoor(nh); // unsliced coor
+  std::vector<sobj> Left(Nblock);
+  std::vector<sobj> Right(Nblock);
+  Eigen::MatrixXcd  mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+#pragma omp for
+  for(int idx=0;idx<SliceGrid->lSites();idx++){
+
+    SliceGrid->LocalIndexToLocalCoor(idx,lcoor); 
+
+    int ddl=0;
+    for(int d=0;d<nh;d++){
+      if ( d!=Orthog ) { 
+	hcoor[d]=lcoor[ddl++];
+      }
     }
+
+    // Get the scalar objects
+    for(int i=0;i<Nblock;i++){
+      hcoor[Orthog] = i;
+      peekLocalSite(Left[i] ,lhs,hcoor);
+      peekLocalSite(Right[i],rhs,hcoor);
+    }
+
+    for(int i=0;i<Nblock;i++){
+    for(int j=0;j<Nblock;j++){
+      std::complex<double> ip = innerProduct(Left[i],Right[j]);
+      mat_thread(i,j) += ip;
+    }}
+  }
+
+#pragma omp critical
+  {
+    mat += mat_thread;
+  }  
+
+}
+#endif
+
+#if 1
+  assert( FullGrid->_simd_layout[Orthog]==1);
+  int nh =  FullGrid->_ndimension;
+  int nl = SliceGrid->_ndimension;
+
+  //FIXME package in a convenient iterator
+  //Should loop over a plane orthogonal to direction "Orthog"
+  int stride=FullGrid->_slice_stride[Orthog];
+  int block =FullGrid->_slice_block [Orthog];
+  int nblock=FullGrid->_slice_nblock[Orthog];
+  int ostride=FullGrid->_ostride[Orthog];
+
+  typedef typename vobj::vector_typeD vector_typeD;
+
+#pragma omp parallel 
+  {
+    std::vector<vobj> Left(Nblock);
+    std::vector<vobj> Right(Nblock);
+    Eigen::MatrixXcd  mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+#pragma omp for collapse(2)
+    for(int n=0;n<nblock;n++){
+    for(int b=0;b<block;b++){
+
+      int o  = n*stride + b;
+
+      for(int i=0;i<Nblock;i++){
+	Left [i] = lhs[o+i*ostride];
+	Right[i] = rhs[o+i*ostride];
+      }
+
+      for(int i=0;i<Nblock;i++){
+      for(int j=0;j<Nblock;j++){
+	auto tmp = innerProduct(Left[i],Right[j]);
+	vector_typeD rtmp = TensorRemove(tmp);
+	mat_thread(i,j) += Reduce(rtmp);
+      }}
+    }}
+#pragma omp critical
+    {
+      mat += mat_thread;
+    }  
   }
 #endif
   return;
diff --git a/tests/solver/Test_staggered_block_cg_unprec.cc b/tests/solver/Test_staggered_block_cg_unprec.cc
index 44e8fb52..8da93195 100644
--- a/tests/solver/Test_staggered_block_cg_unprec.cc
+++ b/tests/solver/Test_staggered_block_cg_unprec.cc
@@ -74,13 +74,14 @@ int main (int argc, char ** argv)
 
   LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);
 
-  RealD mass=0.01;
+  RealD mass=0.003;
   ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass);
   MdagMLinearOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds);
 
   ConjugateGradient<FermionField> CG(1.0e-8,10000);
-  BlockConjugateGradient<FermionField> BCG(1.0e-8,10000);
-  MultiRHSConjugateGradient<FermionField> mCG(1.0e-8,10000);
+  int blockDim = 0;
+  BlockConjugateGradient<FermionField>    BCG(blockDim,1.0e-8,10000);
+  MultiRHSConjugateGradient<FermionField> mCG(blockDim,1.0e-8,10000);
 
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   std::cout << GridLogMessage << " Calling 4d CG "<<std::endl;

From abc4de0fd2e779040b4600d362954688888199b0 Mon Sep 17 00:00:00 2001
From: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Date: Mon, 19 Jun 2017 22:03:03 +0100
Subject: [PATCH 092/177] Fix make tests failing to compile

---
 lib/simd/Grid_vector_types.h   | 4 ++--
 lib/tensors/Tensor_class.h     | 9 ++++++++-
 lib/tensors/Tensor_exp.h       | 7 +++++--
 tests/core/Test_GaugeAction.cc | 2 +-
 tests/core/Test_RectPlaq.cc    | 2 +-
 tests/core/Test_main.cc        | 2 +-
 6 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index 1ebe7379..e05fecc4 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -751,8 +751,8 @@ inline Grid_simd<std::complex<R>, V> toComplex(const Grid_simd<R, V> &in) {
 
   conv.v = in.v;
   for (int i = 0; i < Rsimd::Nsimd(); i += 2) {
-    assert(conv.s[i + 1] ==
-           conv.s[i]);  // trap any cases where real was not duplicated
+    assert(conv.s[i + 1] == conv.s[i]);  
+    // trap any cases where real was not duplicated
     // indicating the SIMD grids of real and imag assignment did not correctly
     // match
     conv.s[i + 1] = 0.0;  // zero imaginary parts
diff --git a/lib/tensors/Tensor_class.h b/lib/tensors/Tensor_class.h
index cb90da6c..c7f868db 100644
--- a/lib/tensors/Tensor_class.h
+++ b/lib/tensors/Tensor_class.h
@@ -156,11 +156,18 @@ class iScalar {
 
   // convert from a something to a scalar via constructor of something arg
   template <class T, typename std::enable_if<!isGridTensor<T>::value, T>::type * = nullptr>
-    strong_inline iScalar<vtype> operator=(T arg) {
+  strong_inline iScalar<vtype> operator=(T arg) {
     _internal = arg;
     return *this;
   }
 
+  // Convert elements
+  template <class ttype>
+  strong_inline iScalar<vtype> operator=(iScalar<ttype> &&arg) {
+    _internal = arg._internal;
+    return *this;
+  }
+
   friend std::ostream &operator<<(std::ostream &stream,const iScalar<vtype> &o) {
     stream << "S {" << o._internal << "}";
     return stream;
diff --git a/lib/tensors/Tensor_exp.h b/lib/tensors/Tensor_exp.h
index e18fed70..f7eee8f0 100644
--- a/lib/tensors/Tensor_exp.h
+++ b/lib/tensors/Tensor_exp.h
@@ -80,8 +80,11 @@ template<class vtype, int N> inline iVector<vtype, N> Exponentiate(const iVector
       mat iQ2 = arg*arg*alpha*alpha;
       mat iQ3 = arg*iQ2*alpha;   
       // sign in c0 from the conventions on the Ta
-      c0 = -imag( trace(iQ3) ) * one_over_three;  
-      c1 = -real( trace(iQ2) ) * one_over_two;
+      scalar imQ3, reQ2;
+      imQ3 = imag( trace(iQ3) );
+      reQ2 = real( trace(iQ2) );
+      c0 = -imQ3 * one_over_three;  
+      c1 = -reQ2 * one_over_two;
 
       // Cayley Hamilton checks to machine precision, tested
       tmp = c1 * one_over_three;
diff --git a/tests/core/Test_GaugeAction.cc b/tests/core/Test_GaugeAction.cc
index 2f0535f1..572f19fb 100644
--- a/tests/core/Test_GaugeAction.cc
+++ b/tests/core/Test_GaugeAction.cc
@@ -73,7 +73,7 @@ int main (int argc, char ** argv)
 
   std::vector<LatticeColourMatrix> U(4,&Fine);
   
-  NerscField header;
+  FieldMetaData header;
   
   std::string file("./ckpoint_lat.4000");
   NerscIO::readConfiguration(Umu,header,file);
diff --git a/tests/core/Test_RectPlaq.cc b/tests/core/Test_RectPlaq.cc
index 9154f879..2e9cc832 100644
--- a/tests/core/Test_RectPlaq.cc
+++ b/tests/core/Test_RectPlaq.cc
@@ -90,7 +90,7 @@ int main (int argc, char ** argv)
 
   std::vector<LatticeColourMatrix> U(4,&Fine);
   
-  NerscField header;
+  FieldMetaData header;
   
   std::string file("./ckpoint_lat.4000");
   NerscIO::readConfiguration(Umu,header,file);
diff --git a/tests/core/Test_main.cc b/tests/core/Test_main.cc
index 921298c7..378f49bd 100644
--- a/tests/core/Test_main.cc
+++ b/tests/core/Test_main.cc
@@ -336,7 +336,7 @@ int main(int argc, char **argv) {
 
       std::cout << GridLogMessage << "norm cMmat : " << norm2(cMat)
                 << std::endl;
-      cMat = expMat(cMat, ComplexD(1.0, 0.0));
+      cMat = expMat(cMat,1.0);// ComplexD(1.0, 0.0));
       std::cout << GridLogMessage << "norm expMat: " << norm2(cMat)
                 << std::endl;
       peekSite(cm, cMat, mysite);

From 0a8faac2713c981be4a61c06d90ce0d6c5de211a Mon Sep 17 00:00:00 2001
From: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Date: Mon, 19 Jun 2017 22:54:18 +0100
Subject: [PATCH 093/177] Fix make tests compile

---
 lib/qcd/action/scalar/ScalarImpl.h | 13 ++++++++-----
 tests/debug/Test_cayley_ldop_cr.cc |  2 +-
 tests/solver/Test_dwf_hdcr.cc      |  2 +-
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/lib/qcd/action/scalar/ScalarImpl.h b/lib/qcd/action/scalar/ScalarImpl.h
index ee2d2fb8..0116b4f9 100644
--- a/lib/qcd/action/scalar/ScalarImpl.h
+++ b/lib/qcd/action/scalar/ScalarImpl.h
@@ -15,6 +15,8 @@ namespace Grid {
     
     typedef iImplField<Simd> SiteField;
     
+    template <typename vtype> using iImplScalar= iScalar<iScalar<iScalar<vtype   > > >;
+    typedef iImplScalar<Simd> ComplexField;
     
     typedef Lattice<SiteField> Field;
     
@@ -51,13 +53,14 @@ namespace Grid {
   public:
     typedef S Simd;
     
-    template <typename vtype>
-    using iImplField = iScalar<iScalar<iMatrix<vtype, N> > >;
-    
+    template <typename vtype> using iImplField = iScalar<iScalar<iMatrix<vtype, N> > >;
+
     typedef iImplField<Simd> SiteField;
-    
-    
     typedef Lattice<SiteField> Field;
+
+    template <typename vtype> using iImplScalar= iScalar<iScalar<iScalar<vtype   > > >;
+    typedef iImplScalar<Simd> ComplexField;
+    
     
     static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
       gaussian(pRNG, P);
diff --git a/tests/debug/Test_cayley_ldop_cr.cc b/tests/debug/Test_cayley_ldop_cr.cc
index dfda43d2..cbefdd46 100644
--- a/tests/debug/Test_cayley_ldop_cr.cc
+++ b/tests/debug/Test_cayley_ldop_cr.cc
@@ -67,7 +67,7 @@ int main (int argc, char ** argv)
   LatticeFermion    err(FGrid);
   LatticeGaugeField Umu(UGrid); 
 
-  NerscField header;
+  FieldMetaData header;
   std::string file("./ckpoint_lat.400");
   NerscIO::readConfiguration(Umu,header,file);
 
diff --git a/tests/solver/Test_dwf_hdcr.cc b/tests/solver/Test_dwf_hdcr.cc
index 64ca0b33..c553ba0a 100644
--- a/tests/solver/Test_dwf_hdcr.cc
+++ b/tests/solver/Test_dwf_hdcr.cc
@@ -516,7 +516,7 @@ int main (int argc, char ** argv)
   LatticeColourMatrix U(UGrid);
   LatticeColourMatrix zz(UGrid);
 
-  NerscField header;
+  FieldMetaData header;
   std::string file("./ckpoint_lat.4000");
   NerscIO::readConfiguration(Umu,header,file);
 

From e9cc21900f00b81a17ab87d649e014edc99c636b Mon Sep 17 00:00:00 2001
From: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Date: Tue, 20 Jun 2017 12:37:41 +0100
Subject: [PATCH 094/177] Block solver complete for staggered. Now stable on
 mass 0.003 and gives 8x (!) speed up on Haswell laptop vs. standard CG for 8
 RHS solves.

166 iterations vs. 537 iterations so algorithmic gain + 2x in flop rate gain.

Better than a slap in the face with a wet kipper.
---
 .../iterative/BlockConjugateGradient.h        | 295 ++++++++++++++++--
 lib/lattice/Lattice_reduction.h               | 235 +++-----------
 .../solver/Test_staggered_block_cg_unprec.cc  |  13 +-
 3 files changed, 321 insertions(+), 222 deletions(-)

diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h
index 53e11fa7..f8b83b1f 100644
--- a/lib/algorithms/iterative/BlockConjugateGradient.h
+++ b/lib/algorithms/iterative/BlockConjugateGradient.h
@@ -33,6 +33,8 @@ directory
 
 namespace Grid {
 
+enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS };
+
 //////////////////////////////////////////////////////////////////////////
 // Block conjugate gradient. Dimension zero should be the block direction
 //////////////////////////////////////////////////////////////////////////
@@ -40,24 +42,274 @@ template <class Field>
 class BlockConjugateGradient : public OperatorFunction<Field> {
  public:
 
+
   typedef typename Field::scalar_type scomplex;
 
   int blockDim ;
-
   int Nblock;
+
+  BlockCGtype CGtype;
   bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
                            // Defaults true.
   RealD Tolerance;
   Integer MaxIterations;
   Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
   
-  BlockConjugateGradient(int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
+  BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
     : Tolerance(tol),
+    CGtype(cgtype),
     blockDim(_Orthog),
     MaxIterations(maxit),
     ErrorOnNoConverge(err_on_no_conv){};
 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Thin QR factorisation (google it)
+////////////////////////////////////////////////////////////////////////////////////////////////////
+void ThinQRfact (Eigen::MatrixXcd &m_rr,
+		 Eigen::MatrixXcd &C,
+		 Eigen::MatrixXcd &Cinv,
+		 Field & Q,
+		 const Field & R)
+{
+  int Orthog = blockDim; // First dimension is block dim; this is an assumption
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  //Dimensions
+  // R_{ferm x Nblock} =  Q_{ferm x Nblock} x  C_{Nblock x Nblock} -> ferm x Nblock
+  //
+  // Rdag R = m_rr = Herm = L L^dag        <-- Cholesky decomposition (LLT routine in Eigen)
+  //
+  //   Q  C = R => Q = R C^{-1}
+  //
+  // Want  Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock} 
+  //
+  // Set C = L^{dag}, and then Q^dag Q = ident 
+  //
+  // Checks:
+  // Cdag C = Rdag R ; passes.
+  // QdagQ  = 1      ; passes
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  sliceInnerProductMatrix(m_rr,R,R,Orthog);
+
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Cholesky from Eigen
+  // Eigen also provides ldlt(), which is documented as more stable
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  Eigen::MatrixXcd L    = m_rr.llt().matrixL(); 
+
+  C    = L.adjoint();
+  Cinv = C.inverse();
+
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Q = R C^{-1}
+  //
+  // Q_j  = R_i Cinv(i,j) 
+  //
+  // NB maddMatrix conventions are Right multiplication X[j] a[j,i] already
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Uses sliceMulMatrix (not sliceMaddMatrix), so no zero vector is needed as the addend
+  sliceMulMatrix(Q,Cinv,R,Orthog);
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Call one of several implementations
+////////////////////////////////////////////////////////////////////////////////////////////////////
 void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
+{
+  if ( CGtype == BlockCGrQ ) {
+    BlockCGrQsolve(Linop,Src,Psi);
+  } else if (CGtype == BlockCG ) {
+    BlockCGsolve(Linop,Src,Psi);
+  } else if (CGtype == CGmultiRHS ) {
+    CGmultiRHSsolve(Linop,Src,Psi);
+  } else {
+    assert(0);
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////
+// BlockCGrQ implementation:
+//--------------------------
+// X is guess/Solution
+// B is RHS
+// Solve A X_i = B_i    ;        i refers to Nblock index
+////////////////////////////////////////////////////////////////////////////
+void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X) 
+{
+  int Orthog = blockDim; // First dimension is block dim; this is an assumption
+  Nblock = B._grid->_fdimensions[Orthog];
+
+  std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
+
+  X.checkerboard = B.checkerboard;
+  conformable(X, B);
+
+  Field tmp(B);
+  Field Q(B);
+  Field D(B);
+  Field Z(B);
+  Field AD(B);
+
+  Eigen::MatrixXcd m_DZ     = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+  Eigen::MatrixXcd m_M      = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+  Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+  Eigen::MatrixXcd m_C      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_Cinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_S      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_Sinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+  Eigen::MatrixXcd m_tmp    = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+  Eigen::MatrixXcd m_tmp1   = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+
+  // Initial residual computation & set up
+  std::vector<RealD> residuals(Nblock);
+  std::vector<RealD> ssq(Nblock);
+
+  sliceNorm(ssq,B,Orthog);
+  RealD sssum=0;
+  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
+
+  sliceNorm(residuals,B,Orthog);
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
+
+  sliceNorm(residuals,X,Orthog);
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
+
+  /************************************************************************
+   * Block conjugate gradient rQ (Sebastian Birk Thesis, after Dubrulle 2001)
+   ************************************************************************
+   * Dimensions:
+   *
+   *   X,B==(Nferm x Nblock)
+   *   A==(Nferm x Nferm)
+   *  
+   * Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
+   * 
+   * QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
+   * for k: 
+   *   Z  = AD
+   *   M  = [D^dag Z]^{-1}
+   *   X  = X + D MC
+   *   QS = Q - ZM
+   *   D  = Q + D S^dag
+   *   C  = S C
+   */
+  ///////////////////////////////////////
+  // Initial block: residual of the initial guess X; initial search direction is D = Q
+  ///////////////////////////////////////
+  std::cout << GridLogMessage<<"BlockCGrQ algorithm initialisation " <<std::endl;
+
+  //1.  QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
+
+  Linop.HermOp(X, AD);
+  tmp = B - AD;  
+  ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
+  D=Q;
+
+  std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
+
+  ///////////////////////////////////////
+  // Timers
+  ///////////////////////////////////////
+  GridStopWatch sliceInnerTimer;
+  GridStopWatch sliceMaddTimer;
+  GridStopWatch QRTimer;
+  GridStopWatch MatrixTimer;
+  GridStopWatch SolverTimer;
+  SolverTimer.Start();
+
+  int k;
+  for (k = 1; k <= MaxIterations; k++) {
+
+    //3. Z  = AD
+    MatrixTimer.Start();
+    Linop.HermOp(D, Z);      
+    MatrixTimer.Stop();
+
+    //4. M  = [D^dag Z]^{-1}
+    sliceInnerTimer.Start();
+    sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
+    sliceInnerTimer.Stop();
+    m_M       = m_DZ.inverse();
+
+    //5. X  = X + D MC
+    m_tmp     = m_M * m_C;
+    sliceMaddTimer.Start();
+    sliceMaddMatrix(X,m_tmp, D,X,Orthog);     
+    sliceMaddTimer.Stop();
+
+    //6. QS = Q - ZM
+    sliceMaddTimer.Start();
+    sliceMaddMatrix(tmp,m_M,Z,Q,Orthog,-1.0);
+    sliceMaddTimer.Stop();
+    QRTimer.Start();
+    ThinQRfact (m_rr, m_S, m_Sinv, Q, tmp);
+    QRTimer.Stop();
+    
+    //7. D  = Q + D S^dag
+    m_tmp = m_S.adjoint();
+    sliceMaddTimer.Start();
+    sliceMaddMatrix(D,m_tmp,D,Q,Orthog);
+    sliceMaddTimer.Stop();
+
+    //8. C  = S C
+    m_C = m_S*m_C;
+    
+    /*********************
+     * convergence monitor
+     *********************
+     */
+    m_rr = m_C.adjoint() * m_C;
+
+    RealD max_resid=0;
+    RealD rrsum=0;
+    RealD rr;
+
+    for(int b=0;b<Nblock;b++) {
+      rrsum+=real(m_rr(b,b));
+      rr = real(m_rr(b,b))/ssq[b];
+      if ( rr > max_resid ) max_resid = rr;
+    }
+
+    std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
+	      <<" ave "<<std::sqrt(rrsum/sssum) << " max "<< max_resid <<std::endl;
+
+    if ( max_resid < Tolerance*Tolerance ) { 
+
+      SolverTimer.Stop();
+
+      std::cout << GridLogMessage<<"BlockCGrQ converged in "<<k<<" iterations"<<std::endl;
+
+      for(int b=0;b<Nblock;b++){
+	std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
+		  << std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
+      }
+      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
+
+      Linop.HermOp(X, AD);
+      AD = AD-B;
+      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AD)/norm2(B)) <<std::endl;
+
+      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
+      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
+      std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
+      std::cout << GridLogMessage << "\tThinQRfact " << QRTimer.Elapsed()  <<std::endl;
+	    
+      IterationsToComplete = k;
+      return;
+    }
+
+  }
+  std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
+
+  if (ErrorOnNoConverge) assert(0);
+  IterationsToComplete = k;
+}
+//////////////////////////////////////////////////////////////////////////
+// Block conjugate gradient (original O'Leary algorithm). Dimension zero should be the block direction
+//////////////////////////////////////////////////////////////////////////
+void BlockCGsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
 {
   int Orthog = blockDim; // First dimension is block dim; this is an assumption
   Nblock = Src._grid->_fdimensions[Orthog];
@@ -163,8 +415,9 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
      *********************
      */
     RealD max_resid=0;
+    RealD rr;
     for(int b=0;b<Nblock;b++){
-      RealD rr = real(m_rr(b,b))/ssq[b];
+      rr = real(m_rr(b,b))/ssq[b];
       if ( rr > max_resid ) max_resid = rr;
     }
     
@@ -174,13 +427,14 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
 
       std::cout << GridLogMessage<<"BlockCG converged in "<<k<<" iterations"<<std::endl;
       for(int b=0;b<Nblock;b++){
-	std::cout << GridLogMessage<< "\t\tblock "<<b<<" resid "<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
+	std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
+		  << std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
       }
       std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
 
       Linop.HermOp(Psi, AP);
       AP = AP-Src;
-      std::cout << GridLogMessage <<"\t A__ True residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
+      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
 
       std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
       std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
@@ -198,33 +452,11 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
   if (ErrorOnNoConverge) assert(0);
   IterationsToComplete = k;
 }
-};
-
-
 //////////////////////////////////////////////////////////////////////////
 // multiRHS conjugate gradient. Dimension zero should be the block direction
+// Use this for spread out across nodes
 //////////////////////////////////////////////////////////////////////////
-template <class Field>
-class MultiRHSConjugateGradient : public OperatorFunction<Field> {
- public:
-
-  typedef typename Field::scalar_type scomplex;
-
-  int blockDim;
-  int Nblock;
-  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
-                           // Defaults true.
-  RealD Tolerance;
-  Integer MaxIterations;
-  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
-  
-  MultiRHSConjugateGradient(int Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
-    : Tolerance(tol),
-    blockDim(Orthog),
-    MaxIterations(maxit),
-    ErrorOnNoConverge(err_on_no_conv){};
-
-void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
+void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
 {
   int Orthog = blockDim; // First dimension is block dim
   Nblock = Src._grid->_fdimensions[Orthog];
@@ -331,7 +563,7 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
 
       std::cout << GridLogMessage<<"MultiRHS solver converged in " <<k<<" iterations"<<std::endl;
       for(int b=0;b<Nblock;b++){
-	std::cout << GridLogMessage<< "\t\tBlock "<<b<<" resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl;
+	std::cout << GridLogMessage<< "\t\tBlock "<<b<<" computed resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl;
       }
       std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
 
@@ -357,9 +589,8 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
   if (ErrorOnNoConverge) assert(0);
   IterationsToComplete = k;
 }
+
 };
 
-
-
 }
 #endif
diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h
index 78f88ce3..c5b20f3c 100644
--- a/lib/lattice/Lattice_reduction.h
+++ b/lib/lattice/Lattice_reduction.h
@@ -369,71 +369,6 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
   }
 };
 
-
-/*
-template<class vobj>
-static void sliceMaddVectorSlow (Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
-			     int Orthog,RealD scale=1.0) 
-{    
-  // FIXME: Implementation is slow
-  // Best base the linear combination by constructing a 
-  // set of vectors of size grid->_rdimensions[Orthog].
-  typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-  
-  int Nblock = X._grid->GlobalDimensions()[Orthog];
-  
-  GridBase *FullGrid  = X._grid;
-  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
-  
-  Lattice<vobj> Xslice(SliceGrid);
-  Lattice<vobj> Rslice(SliceGrid);
-  // If we based this on Cshift it would work for spread out
-  // but it would be even slower
-  for(int i=0;i<Nblock;i++){
-    ExtractSlice(Rslice,Y,i,Orthog);
-    ExtractSlice(Xslice,X,i,Orthog);
-    Rslice = Rslice + Xslice*(scale*a[i]);
-    InsertSlice(Rslice,R,i,Orthog);
-  }
-};
-template<class vobj>
-static void sliceInnerProductVectorSlow( std::vector<ComplexD> & vec, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
-  {
-    // FIXME: Implementation is slow
-    // Look at localInnerProduct implementation,
-    // and do inside a site loop with block strided iterators
-    typedef typename vobj::scalar_object sobj;
-    typedef typename vobj::scalar_type scalar_type;
-    typedef typename vobj::vector_type vector_type;
-    typedef typename vobj::tensor_reduced scalar;
-    typedef typename scalar::scalar_object  scomplex;
-  
-    int Nblock = lhs._grid->GlobalDimensions()[Orthog];
-    vec.resize(Nblock);
-    std::vector<scomplex> sip(Nblock);
-    Lattice<scalar> IP(lhs._grid); 
-    IP=localInnerProduct(lhs,rhs);
-    sliceSum(IP,sip,Orthog);
-  
-    for(int ss=0;ss<Nblock;ss++){
-      vec[ss] = TensorRemove(sip[ss]);
-    }
-  }
-*/
-
-//////////////////////////////////////////////////////////////////////////////////////////
-// FIXME: Implementation is slow
-// If we based this on Cshift it would work for spread out
-// but it would be even slower
-//
-// Repeated extract slice is inefficient
-//
-// Best base the linear combination by constructing a 
-// set of vectors of size grid->_rdimensions[Orthog].
-//////////////////////////////////////////////////////////////////////////////////////////
-
 inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
 {
   int NN    = BlockSolverGrid->_ndimension;
@@ -453,7 +388,6 @@ inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Or
   return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); 
 }
 
-
 template<class vobj>
 static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
 {    
@@ -469,64 +403,10 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
   Lattice<vobj> Xslice(SliceGrid);
   Lattice<vobj> Rslice(SliceGrid);
 
-#if 0
-  // R[i] = Y[i] + X[j] a(j,i) 
-  for(int i=0;i<Nblock;i++){
-    ExtractSlice(Rslice,Y,i,Orthog);
-    for(int j=0;j<Nblock;j++){
-      ExtractSlice(Xslice,X,j,Orthog);
-      Rslice = Rslice + Xslice*(scale*aa(j,i));
-    }
-    InsertSlice(Rslice,R,i,Orthog);
-  }
-#endif
-#if 0
-  int nh =  FullGrid->_ndimension;
-  int nl = SliceGrid->_ndimension;
-
-#pragma omp parallel 
-{ 
-
-  std::vector<int> lcoor(nl); // sliced coor
-  std::vector<int> hcoor(nh); // unsliced coor
-  std::vector<sobj> s_x(Nblock);
-
-#pragma omp for
-  for(int idx=0;idx<SliceGrid->lSites();idx++){
-
-    SliceGrid->LocalIndexToLocalCoor(idx,lcoor); 
-
-    int ddl=0;
-    for(int d=0;d<nh;d++){
-      if ( d!=Orthog ) { 
-	hcoor[d]=lcoor[ddl++];
-      }
-    }
-
-    sobj dot;
-    for(int i=0;i<Nblock;i++){
-      hcoor[Orthog] = i;
-      peekLocalSite(s_x[i],X,hcoor);
-    }
-
-    for(int i=0;i<Nblock;i++){
-      hcoor[Orthog] = i;
-      peekLocalSite(dot,Y,hcoor);
-      for(int j=0;j<Nblock;j++){
-	dot = dot + s_x[j]*(scale*aa(j,i));
-      }
-      pokeLocalSite(dot,R,hcoor);
-    }
-  }
-}
-#endif
-
-#if 1
   assert( FullGrid->_simd_layout[Orthog]==1);
   int nh =  FullGrid->_ndimension;
   int nl = SliceGrid->_ndimension;
 
-
   //FIXME package in a convenient iterator
   //Should loop over a plane orthogonal to direction "Orthog"
   int stride=FullGrid->_slice_stride[Orthog];
@@ -535,7 +415,6 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
   int ostride=FullGrid->_ostride[Orthog];
 #pragma omp parallel 
   {
-
     std::vector<vobj> s_x(Nblock);
 
 #pragma omp for collapse(2)
@@ -543,13 +422,11 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
     for(int b=0;b<block;b++){
       int o  = n*stride + b;
 
-
       for(int i=0;i<Nblock;i++){
 	s_x[i] = X[o+i*ostride];
       }
 
       vobj dot;
-
       for(int i=0;i<Nblock;i++){
 	dot = Y[o+i*ostride];
 	for(int j=0;j<Nblock;j++){
@@ -559,15 +436,63 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
       }
     }}
   }
-#endif
+};
+
+template<class vobj>
+static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0) 
+{    
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
+  int Nblock = X._grid->GlobalDimensions()[Orthog];
+
+  GridBase *FullGrid  = X._grid;
+  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+
+  Lattice<vobj> Xslice(SliceGrid);
+  Lattice<vobj> Rslice(SliceGrid);
+
+  assert( FullGrid->_simd_layout[Orthog]==1);
+  int nh =  FullGrid->_ndimension;
+  int nl = SliceGrid->_ndimension;
+
+  //FIXME package in a convenient iterator
+  //Should loop over a plane orthogonal to direction "Orthog"
+  int stride=FullGrid->_slice_stride[Orthog];
+  int block =FullGrid->_slice_block [Orthog];
+  int nblock=FullGrid->_slice_nblock[Orthog];
+  int ostride=FullGrid->_ostride[Orthog];
+#pragma omp parallel 
+  {
+    std::vector<vobj> s_x(Nblock);
+
+#pragma omp for collapse(2)
+    for(int n=0;n<nblock;n++){
+    for(int b=0;b<block;b++){
+      int o  = n*stride + b;
+
+      for(int i=0;i<Nblock;i++){
+	s_x[i] = X[o+i*ostride];
+      }
+
+      vobj dot;
+      for(int i=0;i<Nblock;i++){
+	dot = s_x[0]*(scale*aa(0,i));
+	for(int j=1;j<Nblock;j++){
+	  dot = dot + s_x[j]*(scale*aa(j,i));
+	}
+	R[o+i*ostride]=dot;
+      }
+    }}
+  }
+
 };
 
 
 template<class vobj>
 static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
 {
-  // FIXME: Implementation is slow
-  // Not sure of best solution.. think about it
   typedef typename vobj::scalar_object sobj;
   typedef typename vobj::scalar_type scalar_type;
   typedef typename vobj::vector_type vector_type;
@@ -582,63 +507,6 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
   
   mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
 
-#if 0  
-  for(int i=0;i<Nblock;i++){
-    ExtractSlice(Lslice,lhs,i,Orthog);
-    for(int j=0;j<Nblock;j++){
-      ExtractSlice(Rslice,rhs,j,Orthog);
-      mat(i,j) = innerProduct(Lslice,Rslice);
-    }
-  }
-#endif
-
-#if 0
-  int nh =  FullGrid->_ndimension;
-  int nl = SliceGrid->_ndimension;
-
-#pragma omp parallel 
-{ 
-  std::vector<int> lcoor(nl); // sliced coor
-  std::vector<int> hcoor(nh); // unsliced coor
-  std::vector<sobj> Left(Nblock);
-  std::vector<sobj> Right(Nblock);
-  Eigen::MatrixXcd  mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-
-#pragma omp for
-  for(int idx=0;idx<SliceGrid->lSites();idx++){
-
-    SliceGrid->LocalIndexToLocalCoor(idx,lcoor); 
-
-    int ddl=0;
-    for(int d=0;d<nh;d++){
-      if ( d!=Orthog ) { 
-	hcoor[d]=lcoor[ddl++];
-      }
-    }
-
-    // Get the scalar objects
-    for(int i=0;i<Nblock;i++){
-      hcoor[Orthog] = i;
-      peekLocalSite(Left[i] ,lhs,hcoor);
-      peekLocalSite(Right[i],rhs,hcoor);
-    }
-
-    for(int i=0;i<Nblock;i++){
-    for(int j=0;j<Nblock;j++){
-      std::complex<double> ip = innerProduct(Left[i],Right[j]);
-      mat_thread(i,j) += ip;
-    }}
-  }
-
-#pragma omp critical
-  {
-    mat += mat_thread;
-  }  
-
-}
-#endif
-
-#if 1
   assert( FullGrid->_simd_layout[Orthog]==1);
   int nh =  FullGrid->_ndimension;
   int nl = SliceGrid->_ndimension;
@@ -681,7 +549,6 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
       mat += mat_thread;
     }  
   }
-#endif
   return;
 }
 
diff --git a/tests/solver/Test_staggered_block_cg_unprec.cc b/tests/solver/Test_staggered_block_cg_unprec.cc
index 8da93195..8db41e98 100644
--- a/tests/solver/Test_staggered_block_cg_unprec.cc
+++ b/tests/solver/Test_staggered_block_cg_unprec.cc
@@ -51,7 +51,7 @@ int main (int argc, char ** argv)
   typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField; 
   typename ImprovedStaggeredFermion5DR::ImplParams params; 
 
-  const int Ls=4;
+  const int Ls=8;
 
   Grid_init(&argc,&argv);
 
@@ -80,12 +80,13 @@ int main (int argc, char ** argv)
 
   ConjugateGradient<FermionField> CG(1.0e-8,10000);
   int blockDim = 0;
-  BlockConjugateGradient<FermionField>    BCG(blockDim,1.0e-8,10000);
-  MultiRHSConjugateGradient<FermionField> mCG(blockDim,1.0e-8,10000);
+  BlockConjugateGradient<FermionField>    BCGrQ(BlockCGrQ,blockDim,1.0e-8,10000);
+  BlockConjugateGradient<FermionField>    BCG  (BlockCG,blockDim,1.0e-8,10000);
+  BlockConjugateGradient<FermionField>    mCG  (CGmultiRHS,blockDim,1.0e-8,10000);
 
-  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
+  std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
   std::cout << GridLogMessage << " Calling 4d CG "<<std::endl;
-  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
+  std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
   ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass);
   MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d);
   FermionField src4d(UGrid); random(pRNG,src4d);
@@ -112,7 +113,7 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << " Calling Block CG for "<<Ls <<" right hand sides" <<std::endl;
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   result=zero;
-  BCG(HermOp,src,result);
+  BCGrQ(HermOp,src,result);
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 
 

From 1e8a2e1621044c831c2ed402c7d784d0b1cd4052 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Tue, 20 Jun 2017 17:24:55 +0100
Subject: [PATCH 095/177] various compatibility fixes after merge

---
 lib/qcd/action/gauge/Photon.h      |  6 ++++--
 lib/qcd/action/scalar/ScalarImpl.h | 17 +++++++++++++----
 tests/IO/Test_ildg_io.cc           |  2 ++
 tests/IO/Test_ildg_read.cc         |  2 ++
 4 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/lib/qcd/action/gauge/Photon.h b/lib/qcd/action/gauge/Photon.h
index 1512d4e3..7e21a1de 100644
--- a/lib/qcd/action/gauge/Photon.h
+++ b/lib/qcd/action/gauge/Photon.h
@@ -41,11 +41,13 @@ namespace QCD{
     template <typename vtype>
     using iImplGaugeField = iVector<iScalar<iScalar<vtype>>, Nd>;
     
-    typedef iImplGaugeLink<Simd> SiteLink;
+    typedef iImplGaugeLink<Simd>  SiteLink;
     typedef iImplGaugeField<Simd> SiteField;
+    typedef SiteField             SiteComplex;
     
-    typedef Lattice<SiteLink> LinkField;
+    typedef Lattice<SiteLink>  LinkField;
     typedef Lattice<SiteField> Field;
+    typedef Field              ComplexField;
   };
   
   typedef QedGimpl<vComplex> QedGimplR;
diff --git a/lib/qcd/action/scalar/ScalarImpl.h b/lib/qcd/action/scalar/ScalarImpl.h
index 868bfc84..5342a1fa 100644
--- a/lib/qcd/action/scalar/ScalarImpl.h
+++ b/lib/qcd/action/scalar/ScalarImpl.h
@@ -15,8 +15,10 @@ class ScalarImplTypes {
 
     typedef iImplField<Simd> SiteField;
     typedef SiteField        SitePropagator;
+    typedef SiteField        SiteComplex;
     
     typedef Lattice<SiteField> Field;
+    typedef Field              ComplexField;
     typedef Field              FermionField;
     typedef Field              PropagatorField;
     
@@ -92,11 +94,18 @@ class ScalarImplTypes {
   public:
     typedef S Simd;
     template <typename vtype>
-    using iImplField = iScalar<iScalar<iMatrix<vtype, N> > >;
+    using iImplField   = iScalar<iScalar<iMatrix<vtype, N>>>;
+    template <typename vtype>
+    using iImplComplex = iScalar<iScalar<iScalar<vtype>>>;
 
-    typedef iImplField<Simd> SiteField;
-
-    typedef Lattice<SiteField> Field;
+    typedef iImplField<Simd>   SiteField;
+    typedef SiteField          SitePropagator;
+    typedef iImplComplex<Simd> SiteComplex;
+    
+    typedef Lattice<SiteField>   Field;
+    typedef Lattice<SiteComplex> ComplexField;
+    typedef Field                FermionField;
+    typedef Field                PropagatorField;
 
     static inline void generate_momenta(Field& P, GridParallelRNG& pRNG) {
       QCD::SU<N>::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
diff --git a/tests/IO/Test_ildg_io.cc b/tests/IO/Test_ildg_io.cc
index e3e9d385..6aac2e38 100644
--- a/tests/IO/Test_ildg_io.cc
+++ b/tests/IO/Test_ildg_io.cc
@@ -36,6 +36,7 @@ using namespace Grid::QCD;
 
 int main (int argc, char ** argv)
 {
+#ifdef HAVE_LIME
   Grid_init(&argc,&argv);
 
   std::cout <<GridLogMessage<< " main "<<std::endl;
@@ -96,4 +97,5 @@ int main (int argc, char ** argv)
   std::cout <<GridLogMessage<< "norm2 Gauge Diff = "<<norm2(Umu_diff)<<std::endl;
 
   Grid_finalize();
+#endif
 }
diff --git a/tests/IO/Test_ildg_read.cc b/tests/IO/Test_ildg_read.cc
index cb1f2efc..958fbe4e 100644
--- a/tests/IO/Test_ildg_read.cc
+++ b/tests/IO/Test_ildg_read.cc
@@ -36,6 +36,7 @@ using namespace Grid::QCD;
 
 int main (int argc, char ** argv)
 {
+#ifdef HAVE_LIME
   Grid_init(&argc,&argv);
 
 
@@ -112,4 +113,5 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "calculated link trace " <<l*LinkTraceScale<<std::endl;
 
   Grid_finalize();
+#endif
 }

From 0486ff8e7901dccd53f47031cececf04af70f1fd Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Tue, 20 Jun 2017 18:46:01 +0100
Subject: [PATCH 096/177] Improved the Lanczos

---
 TODO                                          |  28 +-
 lib/algorithms/densematrix/DenseMatrix.h      | 137 ---
 lib/algorithms/densematrix/Francis.h          | 525 ----------
 lib/algorithms/densematrix/Householder.h      | 242 -----
 .../iterative/ImplicitlyRestartedLanczos.h    | 987 ++++--------------
 lib/qcd/hmc/checkpointers/ILDGCheckpointer.h  |   2 +-
 tests/solver/Test_dwf_lanczos.cc              |   2 +-
 7 files changed, 211 insertions(+), 1712 deletions(-)
 delete mode 100644 lib/algorithms/densematrix/DenseMatrix.h
 delete mode 100644 lib/algorithms/densematrix/Francis.h
 delete mode 100644 lib/algorithms/densematrix/Householder.h

diff --git a/TODO b/TODO
index a5d4cabd..eeb7dfa5 100644
--- a/TODO
+++ b/TODO
@@ -1,24 +1,28 @@
 TODO:
 ---------------
 
-Peter's work list:
-1)- Precision conversion and sort out localConvert      <-- 
-2)- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- 
-
--- Profile CG, BlockCG, etc... Flop count/rate -- PARTIAL, time but no flop/s yet
--- Physical propagator interface
--- Conserved currents
--- GaugeFix into central location
--- Multigrid Wilson and DWF, compare to other Multigrid implementations
--- HDCR resume
+Large item work list:
+1)- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- 
+2)- MultiRHS with spread out extra dim
+3)- BG/Q port and check
+4)- Precision conversion and sort out localConvert      <-- partial
+  - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
+5)- Physical propagator interface
+6)- Conserved currents
+7)- Multigrid Wilson and DWF, compare to other Multigrid implementations
+8)- HDCR resume
 
 Recent DONE 
+-- GaugeFix into central location                      <-- DONE
+-- Scidac and Ildg metadata handling                   <-- DONE
+-- Binary I/O MPI2 IO                                  <-- DONE
 -- Binary I/O speed up & x-strips                      <-- DONE
 -- Cut down the exterior overhead                      <-- DONE
 -- Interior legs from SHM comms                        <-- DONE
 -- Half-precision comms                                <-- DONE
--- Merge high precision reduction into develop        
--- multiRHS DWF; benchmark on Cori/BNL for comms elimination
+-- Merge high precision reduction into develop         <-- DONE
+-- BlockCG, BCGrQ                                      <-- DONE
+-- multiRHS DWF; benchmark on Cori/BNL for comms elimination <-- DONE
    -- slice* linalg routines for multiRHS, BlockCG    
 
 -----
diff --git a/lib/algorithms/densematrix/DenseMatrix.h b/lib/algorithms/densematrix/DenseMatrix.h
deleted file mode 100644
index d86add21..00000000
--- a/lib/algorithms/densematrix/DenseMatrix.h
+++ /dev/null
@@ -1,137 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/iterative/DenseMatrix.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef GRID_DENSE_MATRIX_H
-#define GRID_DENSE_MATRIX_H
-
-namespace Grid {
-    /////////////////////////////////////////////////////////////
-    // Matrix untils
-    /////////////////////////////////////////////////////////////
-
-template<class T> using DenseVector = std::vector<T>;
-template<class T> using DenseMatrix = DenseVector<DenseVector<T> >;
-
-template<class T> void Size(DenseVector<T> & vec, int &N) 
-{ 
-  N= vec.size();
-}
-template<class T> void Size(DenseMatrix<T> & mat, int &N,int &M) 
-{ 
-  N= mat.size();
-  M= mat[0].size();
-}
-
-template<class T> void SizeSquare(DenseMatrix<T> & mat, int &N) 
-{ 
-  int M; Size(mat,N,M);
-  assert(N==M);
-}
-
-template<class T> void Resize(DenseVector<T > & mat, int N) { 
-  mat.resize(N);
-}
-template<class T> void Resize(DenseMatrix<T > & mat, int N, int M) { 
-  mat.resize(N);
-  for(int i=0;i<N;i++){
-    mat[i].resize(M);
-  }
-}
-template<class T> void Fill(DenseMatrix<T> & mat, T&val) { 
-  int N,M;
-  Size(mat,N,M);
-  for(int i=0;i<N;i++){
-  for(int j=0;j<M;j++){
-    mat[i][j] = val;
-  }}
-}
-
-/** Transpose of a matrix **/
-template<class T> DenseMatrix<T> Transpose(DenseMatrix<T> & mat){
-  int N,M;
-  Size(mat,N,M);
-  DenseMatrix<T> C; Resize(C,M,N);
-  for(int i=0;i<M;i++){
-  for(int j=0;j<N;j++){
-    C[i][j] = mat[j][i];
-  }} 
-  return C;
-}
-/** Set DenseMatrix to unit matrix **/
-template<class T> void Unity(DenseMatrix<T> &A){
-  int N;  SizeSquare(A,N);
-  for(int i=0;i<N;i++){
-    for(int j=0;j<N;j++){
-      if ( i==j ) A[i][j] = 1;
-      else        A[i][j] = 0;
-    } 
-  } 
-}
-
-/** Add C * I to matrix **/
-template<class T>
-void PlusUnit(DenseMatrix<T> & A,T c){
-  int dim;  SizeSquare(A,dim);
-  for(int i=0;i<dim;i++){A[i][i] = A[i][i] + c;} 
-}
-
-/** return the Hermitian conjugate of matrix **/
-template<class T>
-DenseMatrix<T> HermitianConj(DenseMatrix<T> &mat){
-
-  int dim; SizeSquare(mat,dim);
-
-  DenseMatrix<T> C; Resize(C,dim,dim);
-
-  for(int i=0;i<dim;i++){
-    for(int j=0;j<dim;j++){
-      C[i][j] = conj(mat[j][i]);
-    } 
-  } 
-  return C;
-}
-/**Get a square submatrix**/
-template <class T>
-DenseMatrix<T> GetSubMtx(DenseMatrix<T> &A,int row_st, int row_end, int col_st, int col_end)
-{
-  DenseMatrix<T> H; Resize(H,row_end - row_st,col_end-col_st);
-
-  for(int i = row_st; i<row_end; i++){
-  for(int j = col_st; j<col_end; j++){
-    H[i-row_st][j-col_st]=A[i][j];
-  }}
-  return H;
-}
-
-}
-
-#include "Householder.h"
-#include "Francis.h"
-
-#endif
-
diff --git a/lib/algorithms/densematrix/Francis.h b/lib/algorithms/densematrix/Francis.h
deleted file mode 100644
index 08ecbd7b..00000000
--- a/lib/algorithms/densematrix/Francis.h
+++ /dev/null
@@ -1,525 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/iterative/Francis.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef FRANCIS_H
-#define FRANCIS_H
-
-#include <cstdlib>
-#include <string>
-#include <cmath>
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-#include <fstream>
-#include <complex>
-#include <algorithm>
-
-//#include <timer.h>
-//#include <lapacke.h>
-//#include <Eigen/Dense>
-
-namespace Grid {
-
-template <class T> int SymmEigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small);
-template <class T> int     Eigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small);
-
-/**
-  Find the eigenvalues of an upper hessenberg matrix using the Francis QR algorithm.
-H =
-      x  x  x  x  x  x  x  x  x
-      x  x  x  x  x  x  x  x  x
-      0  x  x  x  x  x  x  x  x
-      0  0  x  x  x  x  x  x  x
-      0  0  0  x  x  x  x  x  x
-      0  0  0  0  x  x  x  x  x
-      0  0  0  0  0  x  x  x  x
-      0  0  0  0  0  0  x  x  x
-      0  0  0  0  0  0  0  x  x
-Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthagonal/unitary.
-**/
-template <class T>
-int QReigensystem(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small)
-{
-  DenseMatrix<T> H = Hin; 
-
-  int N ; SizeSquare(H,N);
-  int M = N;
-
-  Fill(evals,0);
-  Fill(evecs,0);
-
-  T s,t,x=0,y=0,z=0;
-  T u,d;
-  T apd,amd,bc;
-  DenseVector<T> p(N,0);
-  T nrm = Norm(H);    ///DenseMatrix Norm
-  int n, m;
-  int e = 0;
-  int it = 0;
-  int tot_it = 0;
-  int l = 0;
-  int r = 0;
-  DenseMatrix<T> P; Resize(P,N,N); Unity(P);
-  DenseVector<int> trows(N,0);
-
-  /// Check if the matrix is really hessenberg, if not abort
-  RealD sth = 0;
-  for(int j=0;j<N;j++){
-    for(int i=j+2;i<N;i++){
-      sth = abs(H[i][j]);
-      if(sth > small){
-	std::cout << "Non hessenberg H = " << sth << " > " << small << std::endl;
-	exit(1);
-      }
-    }
-  }
-
-  do{
-    std::cout << "Francis QR Step N = " << N << std::endl;
-    /** Check for convergence
-      x  x  x  x  x
-      0  x  x  x  x
-      0  0  x  x  x
-      0  0  x  x  x
-      0  0  0  0  x
-      for this matrix l = 4
-     **/
-    do{
-      l = Chop_subdiag(H,nrm,e,small);
-      r = 0;    ///May have converged on more than one eval
-      ///Single eval
-      if(l == N-1){
-        evals[e] = H[l][l];
-        N--; e++; r++; it = 0;
-      }
-      ///RealD eval
-      if(l == N-2){
-        trows[l+1] = 1;    ///Needed for UTSolve
-        apd = H[l][l] + H[l+1][l+1];
-        amd = H[l][l] - H[l+1][l+1];
-        bc =  (T)4.0*H[l+1][l]*H[l][l+1];
-        evals[e]   = (T)0.5*( apd + sqrt(amd*amd + bc) );
-        evals[e+1] = (T)0.5*( apd - sqrt(amd*amd + bc) );
-        N-=2; e+=2; r++; it = 0;
-      }
-    } while(r>0);
-
-    if(N ==0) break;
-
-    DenseVector<T > ck; Resize(ck,3);
-    DenseVector<T> v;   Resize(v,3);
-
-    for(int m = N-3; m >= l; m--){
-      ///Starting vector essentially random shift.
-      if(it%10 == 0 && N >= 3 && it > 0){
-        s = (T)1.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) );
-        t = (T)0.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) );
-        x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t;
-        y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s);
-        z = H[m+1][m]*H[m+2][m+1];
-      }
-      ///Starting vector implicit Q theorem
-      else{
-        s = (H[N-2][N-2] + H[N-1][N-1]);
-        t = (H[N-2][N-2]*H[N-1][N-1] - H[N-2][N-1]*H[N-1][N-2]);
-        x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t;
-        y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s);
-        z = H[m+1][m]*H[m+2][m+1];
-      }
-      ck[0] = x; ck[1] = y; ck[2] = z;
-
-      if(m == l) break;
-
-      /** Some stupid thing from numerical recipies, seems to work**/
-      // PAB.. for heaven's sake quote page, purpose, evidence it works.
-      //       what sort of comment is that!?!?!?
-      u=abs(H[m][m-1])*(abs(y)+abs(z));
-      d=abs(x)*(abs(H[m-1][m-1])+abs(H[m][m])+abs(H[m+1][m+1]));
-      if ((T)abs(u+d) == (T)abs(d) ){
-	l = m; break;
-      }
-
-      //if (u < small){l = m; break;}
-    }
-    if(it > 100000){
-     std::cout << "QReigensystem: bugger it got stuck after 100000 iterations" << std::endl;
-     std::cout << "got " << e << " evals " << l << " " << N << std::endl;
-      exit(1);
-    }
-    normalize(ck);    ///Normalization cancels in PHP anyway
-    T beta;
-    Householder_vector<T >(ck, 0, 2, v, beta);
-    Householder_mult<T >(H,v,beta,0,l,l+2,0);
-    Householder_mult<T >(H,v,beta,0,l,l+2,1);
-    ///Accumulate eigenvector
-    Householder_mult<T >(P,v,beta,0,l,l+2,1);
-    int sw = 0;      ///Are we on the last row?
-    for(int k=l;k<N-2;k++){
-      x = H[k+1][k];
-      y = H[k+2][k];
-      z = (T)0.0;
-      if(k+3 <= N-1){
-	z = H[k+3][k];
-      } else{
-	sw = 1; 
-	v[2] = (T)0.0;
-      }
-      ck[0] = x; ck[1] = y; ck[2] = z;
-      normalize(ck);
-      Householder_vector<T >(ck, 0, 2-sw, v, beta);
-      Householder_mult<T >(H,v, beta,0,k+1,k+3-sw,0);
-      Householder_mult<T >(H,v, beta,0,k+1,k+3-sw,1);
-      ///Accumulate eigenvector
-      Householder_mult<T >(P,v, beta,0,k+1,k+3-sw,1);
-    }
-    it++;
-    tot_it++;
-  }while(N > 1);
-  N = evals.size();
-  ///Annoying - UT solves in reverse order;
-  DenseVector<T> tmp; Resize(tmp,N);
-  for(int i=0;i<N;i++){
-    tmp[i] = evals[N-i-1];
-  } 
-  evals = tmp;
-  UTeigenvectors(H, trows, evals, evecs);
-  for(int i=0;i<evals.size();i++){evecs[i] = P*evecs[i]; normalize(evecs[i]);}
-  return tot_it;
-}
-
-template <class T>
-int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small)
-{
-  /**
-  Find the eigenvalues of an upper Hessenberg matrix using the Wilkinson QR algorithm.
-  H =
-  x  x  0  0  0  0
-  x  x  x  0  0  0
-  0  x  x  x  0  0
-  0  0  x  x  x  0
-  0  0  0  x  x  x
-  0  0  0  0  x  x
-  Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthagonal/unitary.  **/
-  return my_Wilkinson(Hin, evals, evecs, small, small);
-}
-
-template <class T>
-int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small, RealD tol)
-{
-  int N; SizeSquare(Hin,N);
-  int M = N;
-
-  ///I don't want to modify the input but matricies must be passed by reference
-  //Scale a matrix by its "norm"
-  //RealD Hnorm = abs( Hin.LargestDiag() ); H =  H*(1.0/Hnorm);
-  DenseMatrix<T> H;  H = Hin;
-  
-  RealD Hnorm = abs(Norm(Hin));
-  H = H * (1.0 / Hnorm);
-
-  // TODO use openmp and memset
-  Fill(evals,0);
-  Fill(evecs,0);
-
-  T s, t, x = 0, y = 0, z = 0;
-  T u, d;
-  T apd, amd, bc;
-  DenseVector<T> p; Resize(p,N); Fill(p,0);
-
-  T nrm = Norm(H);    ///DenseMatrix Norm
-  int n, m;
-  int e = 0;
-  int it = 0;
-  int tot_it = 0;
-  int l = 0;
-  int r = 0;
-  DenseMatrix<T> P; Resize(P,N,N);
-  Unity(P);
-  DenseVector<int> trows(N, 0);
-  /// Check if the matrix is really symm tridiag
-  RealD sth = 0;
-  for(int j = 0; j < N; ++j)
-  {
-    for(int i = j + 2; i < N; ++i)
-    {
-      if(abs(H[i][j]) > tol || abs(H[j][i]) > tol)
-      {
-	std::cout << "Non Tridiagonal H(" << i << ","<< j << ") = |" << Real( real( H[j][i] ) ) << "| > " << tol << std::endl;
-	std::cout << "Warning tridiagonalize and call again" << std::endl;
-        // exit(1); // see what is going on
-        //return;
-      }
-    }
-  }
-
-  do{
-    do{
-      //Jasper
-      //Check if the subdiagonal term is small enough (<small)
-      //if true then it is converged.
-      //check start from H.dim - e - 1
-      //How to deal with more than 2 are converged?
-      //What if Chop_symm_subdiag return something int the middle?
-      //--------------
-      l = Chop_symm_subdiag(H,nrm, e, small);
-      r = 0;    ///May have converged on more than one eval
-      //Jasper
-      //In this case
-      // x  x  0  0  0  0
-      // x  x  x  0  0  0
-      // 0  x  x  x  0  0
-      // 0  0  x  x  x  0
-      // 0  0  0  x  x  0
-      // 0  0  0  0  0  x  <- l
-      //--------------
-      ///Single eval
-      if(l == N - 1)
-      {
-        evals[e] = H[l][l];
-        N--;
-        e++;
-        r++;
-        it = 0;
-      }
-      //Jasper
-      // x  x  0  0  0  0
-      // x  x  x  0  0  0
-      // 0  x  x  x  0  0
-      // 0  0  x  x  0  0
-      // 0  0  0  0  x  x  <- l
-      // 0  0  0  0  x  x
-      //--------------
-      ///RealD eval
-      if(l == N - 2)
-      {
-        trows[l + 1] = 1;    ///Needed for UTSolve
-        apd = H[l][l] + H[l + 1][ l + 1];
-        amd = H[l][l] - H[l + 1][l + 1];
-        bc =  (T) 4.0 * H[l + 1][l] * H[l][l + 1];
-        evals[e] = (T) 0.5 * (apd + sqrt(amd * amd + bc));
-        evals[e + 1] = (T) 0.5 * (apd - sqrt(amd * amd + bc));
-        N -= 2;
-        e += 2;
-        r++;
-        it = 0;
-      }
-    }while(r > 0);
-    //Jasper
-    //Already converged
-    //--------------
-    if(N == 0) break;
-
-    DenseVector<T> ck,v; Resize(ck,2); Resize(v,2);
-
-    for(int m = N - 3; m >= l; m--)
-    {
-      ///Starting vector essentially random shift.
-      if(it%10 == 0 && N >= 3 && it > 0)
-      {
-        t = abs(H[N - 1][N - 2]) + abs(H[N - 2][N - 3]);
-        x = H[m][m] - t;
-        z = H[m + 1][m];
-      } else {
-      ///Starting vector implicit Q theorem
-        d = (H[N - 2][N - 2] - H[N - 1][N - 1]) * (T) 0.5;
-        t =  H[N - 1][N - 1] - H[N - 1][N - 2] * H[N - 1][N - 2] 
-	  / (d + sign(d) * sqrt(d * d + H[N - 1][N - 2] * H[N - 1][N - 2]));
-        x = H[m][m] - t;
-        z = H[m + 1][m];
-      }
-      //Jasper
-      //why it is here????
-      //-----------------------
-      if(m == l)
-        break;
-
-      u = abs(H[m][m - 1]) * (abs(y) + abs(z));
-      d = abs(x) * (abs(H[m - 1][m - 1]) + abs(H[m][m]) + abs(H[m + 1][m + 1]));
-      if ((T)abs(u + d) == (T)abs(d))
-      {
-        l = m;
-        break;
-      }
-    }
-    //Jasper
-    if(it > 1000000)
-    {
-      std::cout << "Wilkinson: bugger it got stuck after 100000 iterations" << std::endl;
-      std::cout << "got " << e << " evals " << l << " " << N << std::endl;
-      exit(1);
-    }
-    //
-    T s, c;
-    Givens_calc<T>(x, z, c, s);
-    Givens_mult<T>(H, l, l + 1, c, -s, 0);
-    Givens_mult<T>(H, l, l + 1, c,  s, 1);
-    Givens_mult<T>(P, l, l + 1, c,  s, 1);
-    //
-    for(int k = l; k < N - 2; ++k)
-    {
-      x = H.A[k + 1][k];
-      z = H.A[k + 2][k];
-      Givens_calc<T>(x, z, c, s);
-      Givens_mult<T>(H, k + 1, k + 2, c, -s, 0);
-      Givens_mult<T>(H, k + 1, k + 2, c,  s, 1);
-      Givens_mult<T>(P, k + 1, k + 2, c,  s, 1);
-    }
-    it++;
-    tot_it++;
-  }while(N > 1);
-
-  N = evals.size();
-  ///Annoying - UT solves in reverse order;
-  DenseVector<T> tmp(N);
-  for(int i = 0; i < N; ++i)
-    tmp[i] = evals[N-i-1];
-  evals = tmp;
-  //
-  UTeigenvectors(H, trows, evals, evecs);
-  //UTSymmEigenvectors(H, trows, evals, evecs);
-  for(int i = 0; i < evals.size(); ++i)
-  {
-    evecs[i] = P * evecs[i];
-    normalize(evecs[i]);
-    evals[i] = evals[i] * Hnorm;
-  }
-  // // FIXME this is to test
-  // Hin.write("evecs3", evecs);
-  // Hin.write("evals3", evals);
-  // // check rsd
-  // for(int i = 0; i < M; i++) {
-  //   vector<T> Aevec = Hin * evecs[i];
-  //   RealD norm2(0.);
-  //   for(int j = 0; j < M; j++) {
-  //     norm2 += (Aevec[j] - evals[i] * evecs[i][j]) * (Aevec[j] - evals[i] * evecs[i][j]);
-  //   }
-  // }
-  return tot_it;
-}
-
-template <class T>
-void Hess(DenseMatrix<T > &A, DenseMatrix<T> &Q, int start){
-
-  /**
-  turn a matrix A =
-  x  x  x  x  x
-  x  x  x  x  x
-  x  x  x  x  x
-  x  x  x  x  x
-  x  x  x  x  x
-  into
-  x  x  x  x  x
-  x  x  x  x  x
-  0  x  x  x  x
-  0  0  x  x  x
-  0  0  0  x  x
-  with householder rotations
-  Slow.
-  */
-  int N ; SizeSquare(A,N);
-  DenseVector<T > p; Resize(p,N); Fill(p,0);
-
-  for(int k=start;k<N-2;k++){
-    //cerr << "hess" << k << std::endl;
-    DenseVector<T > ck,v; Resize(ck,N-k-1); Resize(v,N-k-1);
-    for(int i=k+1;i<N;i++){ck[i-k-1] = A(i,k);}  ///kth column
-    normalize(ck);    ///Normalization cancels in PHP anyway
-    T beta;
-    Householder_vector<T >(ck, 0, ck.size()-1, v, beta);  ///Householder vector
-    Householder_mult<T>(A,v,beta,start,k+1,N-1,0);  ///A -> PA
-    Householder_mult<T >(A,v,beta,start,k+1,N-1,1);  ///PA -> PAP^H
-    ///Accumulate eigenvector
-    Householder_mult<T >(Q,v,beta,start,k+1,N-1,1);  ///Q -> QP^H
-  }
-  /*for(int l=0;l<N-2;l++){
-    for(int k=l+2;k<N;k++){
-    A(0,k,l);
-    }
-    }*/
-}
-
-template <class T>
-void Tri(DenseMatrix<T > &A, DenseMatrix<T> &Q, int start){
-///Tridiagonalize a matrix
-  int N; SizeSquare(A,N);
-  Hess(A,Q,start);
-  /*for(int l=0;l<N-2;l++){
-    for(int k=l+2;k<N;k++){
-    A(0,l,k);
-    }
-    }*/
-}
-
-template <class T>
-void ForceTridiagonal(DenseMatrix<T> &A){
-///Tridiagonalize a matrix
-  int N ; SizeSquare(A,N);
-  for(int l=0;l<N-2;l++){
-    for(int k=l+2;k<N;k++){
-      A[l][k]=0;
-      A[k][l]=0;
-    }
-  }
-}
-
-template <class T>
-int my_SymmEigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
-  ///Solve a symmetric eigensystem, not necessarily in tridiagonal form
-  int N; SizeSquare(Ain,N);
-  DenseMatrix<T > A; A = Ain;
-  DenseMatrix<T > Q; Resize(Q,N,N); Unity(Q);
-  Tri(A,Q,0);
-  int it = my_Wilkinson<T>(A, evals, evecs, small);
-  for(int k=0;k<N;k++){evecs[k] = Q*evecs[k];}
-  return it;
-}
-
-
-template <class T>
-int Wilkinson(DenseMatrix<T> &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
-  return my_Wilkinson(Ain, evals, evecs, small);
-}
-
-template <class T>
-int SymmEigensystem(DenseMatrix<T> &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
-  return my_SymmEigensystem(Ain, evals, evecs, small);
-}
-
-template <class T>
-int Eigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
-///Solve a general eigensystem, not necessarily in tridiagonal form
-  int N = Ain.dim;
-  DenseMatrix<T > A(N); A = Ain;
-  DenseMatrix<T > Q(N);Q.Unity();
-  Hess(A,Q,0);
-  int it = QReigensystem<T>(A, evals, evecs, small);
-  for(int k=0;k<N;k++){evecs[k] = Q*evecs[k];}
-  return it;
-}
-
-}
-#endif
diff --git a/lib/algorithms/densematrix/Householder.h b/lib/algorithms/densematrix/Householder.h
deleted file mode 100644
index 0c6b7d0b..00000000
--- a/lib/algorithms/densematrix/Householder.h
+++ /dev/null
@@ -1,242 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/iterative/Householder.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef HOUSEHOLDER_H
-#define HOUSEHOLDER_H
-
-#define TIMER(A) std::cout << GridLogMessage << __FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
-#define ENTER()  std::cout << GridLogMessage << "ENTRY "<<__FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
-#define LEAVE()  std::cout << GridLogMessage << "EXIT  "<<__FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
-
-#include <cstdlib>
-#include <string>
-#include <cmath>
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-#include <fstream>
-#include <complex>
-#include <algorithm>
-
-namespace Grid {
-/** Comparison function for finding the max element in a vector **/
-template <class T> bool cf(T i, T j) { 
-  return abs(i) < abs(j); 
-}
-
-/** 
-	Calculate a real Givens angle 
- **/
-template <class T> inline void Givens_calc(T y, T z, T &c, T &s){
-
-  RealD mz = (RealD)abs(z);
-  
-  if(mz==0.0){
-    c = 1; s = 0;
-  }
-  if(mz >= (RealD)abs(y)){
-    T t = -y/z;
-    s = (T)1.0 / sqrt ((T)1.0 + t * t);
-    c = s * t;
-  } else {
-    T t = -z/y;
-    c = (T)1.0 / sqrt ((T)1.0 + t * t);
-    s = c * t;
-  }
-}
-
-template <class T> inline void Givens_mult(DenseMatrix<T> &A,  int i, int k, T c, T s, int dir)
-{
-  int q ; SizeSquare(A,q);
-
-  if(dir == 0){
-    for(int j=0;j<q;j++){
-      T nu = A[i][j];
-      T w  = A[k][j];
-      A[i][j] = (c*nu + s*w);
-      A[k][j] = (-s*nu + c*w);
-    }
-  }
-
-  if(dir == 1){
-    for(int j=0;j<q;j++){
-      T nu = A[j][i];
-      T w  = A[j][k];
-      A[j][i] = (c*nu - s*w);
-      A[j][k] = (s*nu + c*w);
-    }
-  }
-}
-
-/**
-	from input = x;
-	Compute the complex Householder vector, v, such that
-	P = (I - b v transpose(v) )
-	b = 2/v.v
-
-	P | x |    | x | k = 0
-	| x |    | 0 | 
-	| x | =  | 0 |
-	| x |    | 0 | j = 3
-	| x |	   | x |
-
-	These are the "Unreduced" Householder vectors.
-
- **/
-template <class T> inline void Householder_vector(DenseVector<T> input, int k, int j, DenseVector<T> &v, T &beta)
-{
-  int N ; Size(input,N);
-  T m = *max_element(input.begin() + k, input.begin() + j + 1, cf<T> );
-
-  if(abs(m) > 0.0){
-    T alpha = 0;
-
-    for(int i=k; i<j+1; i++){
-      v[i] = input[i]/m;
-      alpha = alpha + v[i]*conj(v[i]);
-    }
-    alpha = sqrt(alpha);
-    beta = (T)1.0/(alpha*(alpha + abs(v[k]) ));
-
-    if(abs(v[k]) > 0.0)  v[k] = v[k] + (v[k]/abs(v[k]))*alpha;
-    else                 v[k] = -alpha;
-  } else{
-    for(int i=k; i<j+1; i++){
-      v[i] = 0.0;
-    } 
-  }
-}
-
-/**
-	from input = x;
-	Compute the complex Householder vector, v, such that
-	P = (I - b v transpose(v) )
-	b = 2/v.v
-
-	Px = alpha*e_dir
-
-	These are the "Unreduced" Householder vectors.
-
- **/
-
-template <class T> inline void Householder_vector(DenseVector<T> input, int k, int j, int dir, DenseVector<T> &v, T &beta)
-{
-  int N = input.size();
-  T m = *max_element(input.begin() + k, input.begin() + j + 1, cf);
-  
-  if(abs(m) > 0.0){
-    T alpha = 0;
-
-    for(int i=k; i<j+1; i++){
-      v[i] = input[i]/m;
-      alpha = alpha + v[i]*conj(v[i]);
-    }
-    
-    alpha = sqrt(alpha);
-    beta = 1.0/(alpha*(alpha + abs(v[dir]) ));
-	
-    if(abs(v[dir]) > 0.0) v[dir] = v[dir] + (v[dir]/abs(v[dir]))*alpha;
-    else                  v[dir] = -alpha;
-  }else{
-    for(int i=k; i<j+1; i++){
-      v[i] = 0.0;
-    } 
-  }
-}
-
-/**
-	Compute the product PA if trans = 0
-	AP if trans = 1
-	P = (I - b v transpose(v) )
-	b = 2/v.v
-	start at element l of matrix A
-	v is of length j - k + 1 of v are nonzero
- **/
-
-template <class T> inline void Householder_mult(DenseMatrix<T> &A , DenseVector<T> v, T beta, int l, int k, int j, int trans)
-{
-  int N ; SizeSquare(A,N);
-
-  if(abs(beta) > 0.0){
-    for(int p=l; p<N; p++){
-      T s = 0;
-      if(trans==0){
-	for(int i=k;i<j+1;i++) s += conj(v[i-k])*A[i][p];
-	s *= beta;
-	for(int i=k;i<j+1;i++){ A[i][p] = A[i][p]-s*conj(v[i-k]);}
-      } else {
-	for(int i=k;i<j+1;i++){ s += conj(v[i-k])*A[p][i];}
-	s *= beta;
-	for(int i=k;i<j+1;i++){ A[p][i]=A[p][i]-s*conj(v[i-k]);}
-      }
-    }
-  }
-}
-
-/**
-	Compute the product PA if trans = 0
-	AP if trans = 1
-	P = (I - b v transpose(v) )
-	b = 2/v.v
-	start at element l of matrix A
-	v is of length j - k + 1 of v are nonzero
-	A is tridiagonal
- **/
-template <class T> inline void Householder_mult_tri(DenseMatrix<T> &A , DenseVector<T> v, T beta, int l, int M, int k, int j, int trans)
-{
-  if(abs(beta) > 0.0){
-
-    int N ; SizeSquare(A,N);
-
-    DenseMatrix<T> tmp; Resize(tmp,N,N); Fill(tmp,0); 
-
-    T s;
-    for(int p=l; p<M; p++){
-      s = 0;
-      if(trans==0){
-	for(int i=k;i<j+1;i++) s = s + conj(v[i-k])*A[i][p];
-      }else{
-	for(int i=k;i<j+1;i++) s = s + v[i-k]*A[p][i];
-      }
-      s = beta*s;
-      if(trans==0){
-	for(int i=k;i<j+1;i++) tmp[i][p] = tmp(i,p) - s*v[i-k];
-      }else{
-	for(int i=k;i<j+1;i++) tmp[p][i] = tmp[p][i] - s*conj(v[i-k]);
-      }
-    }
-    for(int p=l; p<M; p++){
-      if(trans==0){
-	for(int i=k;i<j+1;i++) A[i][p] = A[i][p] + tmp[i][p];
-      }else{
-	for(int i=k;i<j+1;i++) A[p][i] = A[p][i] + tmp[p][i];
-      }
-    }
-  }
-}
-}
-#endif
diff --git a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
index 3aa54360..acd67592 100644
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -39,7 +39,9 @@ void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e,
                    int *info);
 #endif
 
-#include <Grid/algorithms/densematrix/DenseMatrix.h>
+template<class T> using DenseVector = std::vector<T>;
+
+//#include <Grid/algorithms/densematrix/DenseMatrix.h>
 #include <Grid/algorithms/iterative/EigenSort.h>
 
 namespace Grid {
@@ -47,104 +49,85 @@ namespace Grid {
 /////////////////////////////////////////////////////////////
 // Implicitly restarted lanczos
 /////////////////////////////////////////////////////////////
-
-
 template<class Field> 
-    class ImplicitlyRestartedLanczos {
+class ImplicitlyRestartedLanczos {
 
-    const RealD small = 1.0e-16;
 public:       
-    int lock;
-    int get;
-    int Niter;
-    int converged;
+  int Niter;   // Max iterations
+  int Nstop;   // Number of evecs checked for convergence
+  int Nk;      // Number of converged sought
+  int Nm;      // Nm -- total number of vectors
 
-    int Nstop;   // Number of evecs checked for convergence
-    int Nk;      // Number of converged sought
-    int Np;      // Np -- Number of spare vecs in kryloc space
-    int Nm;      // Nm -- total number of vectors
+  RealD eresid;
 
-    RealD eresid;
+  ////////////////////////////////////
+  // Embedded objects
+  ////////////////////////////////////
+           SortEigen<Field> _sort;
+  LinearOperatorBase<Field> &_Linop;
+    OperatorFunction<Field> &_poly;
 
-    SortEigen<Field> _sort;
+  /////////////////////////
+  // Constructor
+  /////////////////////////
+ ImplicitlyRestartedLanczos(LinearOperatorBase<Field> &Linop, // op
+			    OperatorFunction<Field> & poly,   // polynmial
+			    int _Nstop, // sought vecs
+			    int _Nk,    // sought vecs
+			    int _Nm,    // total vecs
+			    RealD _eresid, // resid in lmdue deficit 
+			    int _Niter) : // Max iterations
+    _Linop(Linop),    _poly(poly),
+    Nstop(_Nstop), Nk(_Nk), Nm(_Nm),
+    eresid(_eresid),  Niter(_Niter)  { };
 
-//    GridCartesian &_fgrid;
-
-    LinearOperatorBase<Field> &_Linop;
-
-    OperatorFunction<Field>   &_poly;
-
-    /////////////////////////
-    // Constructor
-    /////////////////////////
-    void init(void){};
-    void Abort(int ff, DenseVector<RealD> &evals,  DenseVector<DenseVector<RealD> > &evecs);
-
-    ImplicitlyRestartedLanczos(
-				LinearOperatorBase<Field> &Linop, // op
-			       OperatorFunction<Field> & poly,   // polynmial
-			       int _Nstop, // sought vecs
-			       int _Nk, // sought vecs
-			       int _Nm, // spare vecs
-			       RealD _eresid, // resid in lmdue deficit 
-			       int _Niter) : // Max iterations
-      _Linop(Linop),
-      _poly(poly),
-      Nstop(_Nstop),
-      Nk(_Nk),
-      Nm(_Nm),
-      eresid(_eresid),
-      Niter(_Niter)
-    { 
-      Np = Nm-Nk; assert(Np>0);
-    };
-
-    ImplicitlyRestartedLanczos(
-				LinearOperatorBase<Field> &Linop, // op
+#if 0
+    ImplicitlyRestartedLanczos(LinearOperatorBase<Field> &Linop, // op
 			       OperatorFunction<Field> & poly,   // polynmial
 			       int _Nk, // sought vecs
-			       int _Nm, // spare vecs
+			       int _Nm, // total vecs
 			       RealD _eresid, // resid in lmdue deficit 
 			       int _Niter) : // Max iterations
-      _Linop(Linop),
-      _poly(poly),
-      Nstop(_Nk),
-      Nk(_Nk),
-      Nm(_Nm),
-      eresid(_eresid),
-      Niter(_Niter)
-    { 
-      Np = Nm-Nk; assert(Np>0);
-    };
+    _Linop(Linop),      _poly(poly),
+    Nstop(_Nk), Nk(_Nk), Nm(_Nm),      
+    eresid(_eresid),      Niter(_Niter) { };
+#endif
 
-    /////////////////////////
-    // Sanity checked this routine (step) against Saad.
-    /////////////////////////
-    void RitzMatrix(DenseVector<Field>& evec,int k){
+#if 0
+    void calc(DenseVector<RealD>& eval,
+	      DenseVector<Field>& evec,
+	      const Field& src,
+	      int& Nconv);
 
-      if(1) return;
+    void step(DenseVector<RealD>& lmd,
+	      DenseVector<RealD>& lme, 
+	      DenseVector<Field>& evec,
+	      Field& w,int Nm,int k);
 
-      GridBase *grid = evec[0]._grid;
-      Field w(grid);
-      std::cout << "RitzMatrix "<<std::endl;
-      for(int i=0;i<k;i++){
-	_poly(_Linop,evec[i],w);
-	std::cout << "["<<i<<"] ";
-	for(int j=0;j<k;j++){
-	  ComplexD in = innerProduct(evec[j],w);
-	  if ( fabs((double)i-j)>1 ) { 
-	    if (abs(in) >1.0e-9 )  { 
-	      std::cout<<"oops"<<std::endl;
-	      abort();
-	    } else 
-	      std::cout << " 0 ";
-	  } else { 
-	    std::cout << " "<<in<<" ";
-	  }
-	}
-	std::cout << std::endl;
-      }
-    }
+    void setUnit_Qt(int Nm, DenseVector<RealD> &Qt) ;
+
+    static RealD normalise(Field& v) ;
+    void orthogonalize(Field& w, DenseVector<Field>& evec, int k);
+    void diagonalize(DenseVector<RealD>& lmd,
+		     DenseVector<RealD>& lme, 
+		     int N2, int N1,
+		     DenseVector<RealD>& Qt,
+		     GridBase *grid);
+
+    void qr_decomp(DenseVector<RealD>& lmd,
+		   DenseVector<RealD>& lme,
+		   int Nk, int Nm,
+		   DenseVector<RealD>& Qt,
+		   RealD Dsh, int kmin, int kmax);
+
+#ifdef USE_LAPACK
+    void diagonalize_lapack(DenseVector<RealD>& lmd,
+			    DenseVector<RealD>& lme, 
+			    int N1, int N2,
+			    DenseVector<RealD>& Qt,
+			    GridBase *grid);
+#endif
+#endif
 
 /* Saad PP. 195
 1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0
@@ -161,12 +144,12 @@ public:
 	      DenseVector<Field>& evec,
 	      Field& w,int Nm,int k)
     {
+      const RealD tiny = 1.0e-20;
       assert( k< Nm );
       
       _poly(_Linop,evec[k],w);      // 3. wk:=Avk−βkv_{k−1}
-      if(k>0){
-	w -= lme[k-1] * evec[k-1];
-      }    
+
+      if(k>0) w -= lme[k-1] * evec[k-1];
 
       ComplexD zalph = innerProduct(evec[k],w); // 4. αk:=(wk,vk)
       RealD     alph = real(zalph);
@@ -176,29 +159,20 @@ public:
       RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
                                  // 7. vk+1 := wk/βk+1
 
-//	std::cout << "alpha = " << zalph << " beta "<<beta<<std::endl;
-      const RealD tiny = 1.0e-20;
-      if ( beta < tiny ) { 
-	std::cout << " beta is tiny "<<beta<<std::endl;
-     }
       lmd[k] = alph;
-      lme[k]  = beta;
+      lme[k] = beta;
 
-      if (k>0) { 
-	orthogonalize(w,evec,k); // orthonormalise
-      }
-      
-      if(k < Nm-1) evec[k+1] = w;
+      if ( k > 0 ) orthogonalize(w,evec,k); // orthonormalise
+      if ( k < Nm-1) evec[k+1] = w;
+
+      if ( beta < tiny ) std::cout << " beta is tiny "<<beta<<std::endl;
     }
-
-    void qr_decomp(DenseVector<RealD>& lmd,
-		   DenseVector<RealD>& lme,
-		   int Nk,
-		   int Nm,
-		   DenseVector<RealD>& Qt,
-		   RealD Dsh, 
-		   int kmin,
-		   int kmax)
+      
+    void qr_decomp(DenseVector<RealD>& lmd,   // Nm 
+		   DenseVector<RealD>& lme,   // Nm 
+		   int Nk, int Nm,
+		   DenseVector<RealD>& Qt,     // Nm x Nm matrix
+		   RealD Dsh, int kmin, int kmax)
     {
       int k = kmin-1;
       RealD x;
@@ -218,7 +192,7 @@ public:
       lme[k+1] = c*lme[k+1];
       
       for(int i=0; i<Nk; ++i){
-	RealD Qtmp1 = Qt[i+Nm*k  ];
+	RealD Qtmp1 = Qt[i+Nm*k    ];
 	RealD Qtmp2 = Qt[i+Nm*(k+1)];
 	Qt[i+Nm*k    ] = c*Qtmp1 - s*Qtmp2;
 	Qt[i+Nm*(k+1)] = s*Qtmp1 + c*Qtmp2; 
@@ -254,92 +228,88 @@ public:
       }
     }
 
+
 #ifdef USE_LAPACK
     void diagonalize_lapack(DenseVector<RealD>& lmd,
-		     DenseVector<RealD>& lme, 
-		     int N1,
-		     int N2,
-		     DenseVector<RealD>& Qt,
-		     GridBase *grid){
-  const int size = Nm;
-//  tevals.resize(size);
-//  tevecs.resize(size);
-  int NN = N1;
-  double evals_tmp[NN];
-  double evec_tmp[NN][NN];
-  memset(evec_tmp[0],0,sizeof(double)*NN*NN);
-//  double AA[NN][NN];
-  double DD[NN];
-  double EE[NN];
-  for (int i = 0; i< NN; i++)
-    for (int j = i - 1; j <= i + 1; j++)
-      if ( j < NN && j >= 0 ) {
-        if (i==j) DD[i] = lmd[i];
-        if (i==j) evals_tmp[i] = lmd[i];
-        if (j==(i-1)) EE[j] = lme[j];
+			    DenseVector<RealD>& lme, 
+			    int N1,
+			    int N2,
+			    DenseVector<RealD>& Qt,
+			    GridBase *grid)
+    {
+      const int size = Nm;
+      int NN = N1;
+      double evals_tmp[NN];
+      double evec_tmp[NN][NN];
+      memset(evec_tmp[0],0,sizeof(double)*NN*NN);
+      double DD[NN];
+      double EE[NN];
+      for (int i = 0; i< NN; i++) {
+	for (int j = i - 1; j <= i + 1; j++) {
+	  if ( j < NN && j >= 0 ) {
+	    if (i==j) DD[i] = lmd[i];
+	    if (i==j) evals_tmp[i] = lmd[i];
+	    if (j==(i-1)) EE[j] = lme[j];
+	  }
+	}
       }
-  int evals_found;
-  int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ;
-  int liwork =  3+NN*10 ;
-  int iwork[liwork];
-  double work[lwork];
-  int isuppz[2*NN];
-  char jobz = 'V'; // calculate evals & evecs
-  char range = 'I'; // calculate all evals
-  //    char range = 'A'; // calculate all evals
-  char uplo = 'U'; // refer to upper half of original matrix
-  char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
-  int ifail[NN];
-  int info;
-//  int total = QMP_get_number_of_nodes();
-//  int node = QMP_get_node_number();
-//  GridBase *grid = evec[0]._grid;
-  int total = grid->_Nprocessors;
-  int node = grid->_processor;
-  int interval = (NN/total)+1;
-  double vl = 0.0, vu = 0.0;
-  int il = interval*node+1 , iu = interval*(node+1);
-  if (iu > NN)  iu=NN;
-  double tol = 0.0;
-    if (1) {
-      memset(evals_tmp,0,sizeof(double)*NN);
-      if ( il <= NN){
-        printf("total=%d node=%d il=%d iu=%d\n",total,node,il,iu);
-        LAPACK_dstegr(&jobz, &range, &NN,
-            (double*)DD, (double*)EE,
-            &vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A'
-            &tol, // tolerance
-            &evals_found, evals_tmp, (double*)evec_tmp, &NN,
-            isuppz,
-            work, &lwork, iwork, &liwork,
-            &info);
-        for (int i = iu-1; i>= il-1; i--){
-          printf("node=%d evals_found=%d evals_tmp[%d] = %g\n",node,evals_found, i - (il-1),evals_tmp[i - (il-1)]);
-          evals_tmp[i] = evals_tmp[i - (il-1)];
-          if (il>1) evals_tmp[i-(il-1)]=0.;
-          for (int j = 0; j< NN; j++){
-            evec_tmp[i][j] = evec_tmp[i - (il-1)][j];
-            if (il>1) evec_tmp[i-(il-1)][j]=0.;
-          }
-        }
+      int evals_found;
+      int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ;
+      int liwork =  3+NN*10 ;
+      int iwork[liwork];
+      double work[lwork];
+      int isuppz[2*NN];
+      char jobz = 'V'; // calculate evals & evecs
+      char range = 'I'; // calculate all evals
+      //    char range = 'A'; // calculate all evals
+      char uplo = 'U'; // refer to upper half of original matrix
+      char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
+      int ifail[NN];
+      int info;
+      int total = grid->_Nprocessors;
+      int node  = grid->_processor;
+      int interval = (NN/total)+1;
+      double vl = 0.0, vu = 0.0;
+      int il = interval*node+1 , iu = interval*(node+1);
+      if (iu > NN)  iu=NN;
+      double tol = 0.0;
+      if (1) {
+	memset(evals_tmp,0,sizeof(double)*NN);
+	if ( il <= NN){
+	  LAPACK_dstegr(&jobz, &range, &NN,
+			(double*)DD, (double*)EE,
+			&vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A'
+			&tol, // tolerance
+			&evals_found, evals_tmp, (double*)evec_tmp, &NN,
+			isuppz,
+			work, &lwork, iwork, &liwork,
+			&info);
+	  for (int i = iu-1; i>= il-1; i--){
+	    evals_tmp[i] = evals_tmp[i - (il-1)];
+	    if (il>1) evals_tmp[i-(il-1)]=0.;
+	    for (int j = 0; j< NN; j++){
+	      evec_tmp[i][j] = evec_tmp[i - (il-1)][j];
+	      if (il>1) evec_tmp[i-(il-1)][j]=0.;
+	    }
+	  }
+	}
+	{
+	  grid->GlobalSumVector(evals_tmp,NN);
+	  grid->GlobalSumVector((double*)evec_tmp,NN*NN);
+	}
+      } 
+      // cheating a bit.
+      // It is better to sort instead of just reversing it, 
+      // but the document of the routine says evals are sorted in increasing order. 
+      // qr gives evals in decreasing order.
+      for(int i=0;i<NN;i++){
+	for(int j=0;j<NN;j++)
+	  Qt[(NN-1-i)*N2+j]=evec_tmp[i][j];
+	lmd [NN-1-i]=evals_tmp[i];
       }
-      {
-//        QMP_sum_double_array(evals_tmp,NN);
-//        QMP_sum_double_array((double *)evec_tmp,NN*NN);
-         grid->GlobalSumVector(evals_tmp,NN);
-         grid->GlobalSumVector((double*)evec_tmp,NN*NN);
-      }
-    } 
-// cheating a bit. It is better to sort instead of just reversing it, but the document of the routine says evals are sorted in increasing order. qr gives evals in decreasing order.
-  for(int i=0;i<NN;i++){
-    for(int j=0;j<NN;j++)
-      Qt[(NN-1-i)*N2+j]=evec_tmp[i][j];
-      lmd [NN-1-i]=evals_tmp[i];
-  }
-}
+    }
 #endif
 
-
     void diagonalize(DenseVector<RealD>& lmd,
 		     DenseVector<RealD>& lme, 
 		     int N2,
@@ -354,24 +324,23 @@ public:
     if(!check_lapack)
 	return diagonalize_lapack(lmd,lme,N2,N1,Qt,grid);
 
-	DenseVector <RealD> lmd2(N1);
-	DenseVector <RealD> lme2(N1);
-	DenseVector<RealD> Qt2(N1*N1);
-         for(int k=0; k<N1; ++k){
-	    lmd2[k] = lmd[k];
-	    lme2[k] = lme[k];
-	  }
-         for(int k=0; k<N1*N1; ++k)
-	Qt2[k] = Qt[k];
-
-//	diagonalize_lapack(lmd2,lme2,Nm2,Nm,Qt,grid);
+    DenseVector <RealD> lmd2(N1);
+    DenseVector <RealD> lme2(N1);
+    DenseVector<RealD> Qt2(N1*N1);
+    for(int k=0; k<N1; ++k){
+      lmd2[k] = lmd[k];
+      lme2[k] = lme[k];
+    }
+    for(int k=0; k<N1*N1; ++k){
+      Qt2[k] = Qt[k];
+    }
 #endif
 
       int Niter = 100*N1;
       int kmin = 1;
       int kmax = N2;
-      // (this should be more sophisticated)
 
+      // (this should be more sophisticated)
       for(int iter=0; iter<Niter; ++iter){
 
 	// determination of 2x2 leading submatrix
@@ -393,21 +362,17 @@ public:
 	}
 	Niter = iter;
 #ifdef USE_LAPACK
-    if(check_lapack){
-	const double SMALL=1e-8;
-	diagonalize_lapack(lmd2,lme2,N2,N1,Qt2,grid);
-	DenseVector <RealD> lmd3(N2);
-         for(int k=0; k<N2; ++k) lmd3[k]=lmd[k];
-        _sort.push(lmd3,N2);
-        _sort.push(lmd2,N2);
-         for(int k=0; k<N2; ++k){
+	if(check_lapack){
+	  const double SMALL=1e-8;
+	  diagonalize_lapack(lmd2,lme2,N2,N1,Qt2,grid);
+	  DenseVector <RealD> lmd3(N2);
+	  for(int k=0; k<N2; ++k) lmd3[k]=lmd[k];
+	  _sort.push(lmd3,N2);
+	  _sort.push(lmd2,N2);
+	  for(int k=0; k<N2; ++k){
 	    if (fabs(lmd2[k] - lmd3[k]) >SMALL)  std::cout <<"lmd(qr) lmd(lapack) "<< k << ": " << lmd2[k] <<" "<< lmd3[k] <<std::endl;
-//	    if (fabs(lme2[k] - lme[k]) >SMALL)  std::cout <<"lme(qr)-lme(lapack) "<< k << ": " << lme2[k] - lme[k] <<std::endl;
 	  }
-         for(int k=0; k<N1*N1; ++k){
-//	    if (fabs(Qt2[k] - Qt[k]) >SMALL)  std::cout <<"Qt(qr)-Qt(lapack) "<< k << ": " << Qt2[k] - Qt[k] <<std::endl;
 	}
-    }
 #endif
 	return;
 
@@ -424,7 +389,6 @@ public:
       abort();
     }
 
-#if 1
     static RealD normalise(Field& v) 
     {
       RealD nn = norm2(v);
@@ -457,6 +421,7 @@ public:
       normalise(w);
     }
 
+
     void setUnit_Qt(int Nm, DenseVector<RealD> &Qt) {
       for(int i=0; i<Qt.size(); ++i) Qt[i] = 0.0;
       for(int k=0; k<Nm; ++k) Qt[k + k*Nm] = 1.0;
@@ -488,10 +453,11 @@ until convergence
 	GridBase *grid = evec[0]._grid;
 	assert(grid == src._grid);
 
-	std::cout << " -- Nk = " << Nk << " Np = "<< Np << std::endl;
-	std::cout << " -- Nm = " << Nm << std::endl;
-	std::cout << " -- size of eval   = " << eval.size() << std::endl;
-	std::cout << " -- size of evec  = " << evec.size() << std::endl;
+	std::cout << " -- seek   Nk    = " << Nk    <<" vectors"<< std::endl;
+	std::cout << " -- accept Nstop = " << Nstop <<" vectors"<< std::endl;
+	std::cout << " -- total  Nm    = " << Nm    <<" vectors"<< std::endl;
+	std::cout << " -- size of eval = " << eval.size() << std::endl;
+	std::cout << " -- size of evec = " << evec.size() << std::endl;
 	
 	assert(Nm == evec.size() && Nm == eval.size());
 	
@@ -514,39 +480,25 @@ until convergence
 	RealD beta_k;
   
 	// Set initial vector
-	// (uniform vector) Why not src??
-	//	evec[0] = 1.0;
 	evec[0] = src;
 	std:: cout <<"norm2(src)= " << norm2(src)<<std::endl;
-// << src._grid  << std::endl;
+
 	normalise(evec[0]);
 	std:: cout <<"norm2(evec[0])= " << norm2(evec[0]) <<std::endl;
-// << evec[0]._grid << std::endl;
 	
 	// Initial Nk steps
 	for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
-//	std:: cout <<"norm2(evec[1])= " << norm2(evec[1]) << std::endl;
-//	std:: cout <<"norm2(evec[2])= " << norm2(evec[2]) << std::endl;
-	RitzMatrix(evec,Nk);
-	for(int k=0; k<Nk; ++k){
-//	std:: cout <<"eval " << k << " " <<eval[k] << std::endl;
-//	std:: cout <<"lme " << k << " " << lme[k] << std::endl;
-	}
 
 	// Restarting loop begins
-	for(int iter = 0; iter<Niter; ++iter){
+	int iter;
+	for(iter = 0; iter<Niter; ++iter){
 
 	  std::cout<<"\n Restart iteration = "<< iter << std::endl;
 
-	  // 
-	  // Rudy does a sort first which looks very different. Getting fed up with sorting out the algo defs.
-	  // We loop over 
-	  //
 	  for(int k=Nk; k<Nm; ++k) step(eval,lme,evec,f,Nm,k);
+
 	  f *= lme[Nm-1];
 
-	  RitzMatrix(evec,k2);
-	  
 	  // getting eigenvalues
 	  for(int k=0; k<Nm; ++k){
 	    eval2[k] = eval[k+k1-1];
@@ -561,10 +513,9 @@ until convergence
 	  // Implicitly shifted QR transformations
 	  setUnit_Qt(Nm,Qt);
 	  for(int ip=k2; ip<Nm; ++ip){ 
-	std::cout << "qr_decomp "<< ip << " "<< eval2[ip] << std::endl;
+	    //	    std::cout << "qr_decomp "<< ip << " "<< eval2[ip] << std::endl;
 	    qr_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
-		
-	}
+	  }
     
 	  for(int i=0; i<(Nk+1); ++i) B[i] = 0.0;
 	  
@@ -599,18 +550,14 @@ until convergence
 	  
 	  for(int j = 0; j<Nk; ++j){
 	    for(int k = 0; k<Nk; ++k){
-	    B[j].checkerboard = evec[k].checkerboard;
+	      B[j].checkerboard = evec[k].checkerboard;
 	      B[j] += Qt[k+j*Nm] * evec[k];
 	    }
-//	    std::cout << "norm(B["<<j<<"])="<<norm2(B[j])<<std::endl;
 	  }
-//	_sort.push(eval2,B,Nk);
 
 	  Nconv = 0;
-	  //	  std::cout << std::setiosflags(std::ios_base::scientific);
 	  for(int i=0; i<Nk; ++i){
 
-//	    _poly(_Linop,B[i],v);
 	    _Linop.HermOp(B[i],v);
 	    
 	    RealD vnum = real(innerProduct(B[i],v)); // HermOp.
@@ -624,15 +571,13 @@ until convergence
 	    std::cout << "eval = "<<std::setw(25)<< std::setiosflags(std::ios_base::left)<< eval2[i];
 	    std::cout <<" |H B[i] - eval[i]B[i]|^2 "<< std::setw(25)<< std::setiosflags(std::ios_base::right)<< vv<< std::endl;
 	    
-	// change the criteria as evals are supposed to be sorted, all evals smaller(larger) than Nstop should have converged
+	    // change the criteria as evals are supposed to be sorted, all evals smaller(larger) than Nstop should have converged
 	    if((vv<eresid*eresid) && (i == Nconv) ){
 	      Iconv[Nconv] = i;
 	      ++Nconv;
 	    }
 
 	  }  // i-loop end
-	  //	  std::cout << std::resetiosflags(std::ios_base::scientific);
-
 
 	  std::cout<<" #modes converged: "<<Nconv<<std::endl;
 
@@ -655,556 +600,10 @@ until convergence
       _sort.push(eval,evec,Nconv);
 
       std::cout << "\n Converged\n Summary :\n";
-      std::cout << " -- Iterations  = "<< Nconv  << "\n";
+      std::cout << " -- Iterations  = "<< iter   << "\n";
       std::cout << " -- beta(k)     = "<< beta_k << "\n";
       std::cout << " -- Nconv       = "<< Nconv  << "\n";
      }
-
-    /////////////////////////////////////////////////
-    // Adapted from Rudy's lanczos factor routine
-    /////////////////////////////////////////////////
-    int Lanczos_Factor(int start, int end,  int cont,
-		       DenseVector<Field> & bq, 
-		       Field &bf,
-		       DenseMatrix<RealD> &H){
-      
-      GridBase *grid = bq[0]._grid;
-
-      RealD beta;  
-      RealD sqbt;  
-      RealD alpha;
-
-      for(int i=start;i<Nm;i++){
-	for(int j=start;j<Nm;j++){
-	  H[i][j]=0.0;
-	}
-      }
-
-      std::cout<<"Lanczos_Factor start/end " <<start <<"/"<<end<<std::endl;
-
-      // Starting from scratch, bq[0] contains a random vector and |bq[0]| = 1
-      int first;
-      if(start == 0){
-
-	std::cout << "start == 0\n"; //TESTING
-
-	_poly(_Linop,bq[0],bf);
-
-	alpha = real(innerProduct(bq[0],bf));//alpha =  bq[0]^dag A bq[0]
-
-	std::cout << "alpha = " << alpha << std::endl;
-	
-	bf = bf - alpha * bq[0];  //bf =  A bq[0] - alpha bq[0]
-
-	H[0][0]=alpha;
-
-	std::cout << "Set H(0,0) to " << H[0][0] << std::endl;
-
-	first = 1;
-
-      } else {
-
-	first = start;
-
-      }
-
-      // I think start==0 and cont==zero are the same. Test this
-      // If so I can drop "cont" parameter?
-      if( cont ) assert(start!=0);
-
-      if( start==0 ) assert(cont!=0);
-
-      if( cont){
-
-	beta = 0;sqbt = 0;
-
-	std::cout << "cont is true so setting beta to zero\n";
-
-      }	else {
-
-	beta = norm2(bf);
-	sqbt = sqrt(beta);
-
-	std::cout << "beta = " << beta << std::endl;
-      }
-
-      for(int j=first;j<end;j++){
-
-	std::cout << "Factor j " << j <<std::endl;
-
-	if(cont){ // switches to factoring; understand start!=0 and initial bf value is right.
-	  bq[j] = bf; cont = false;
-	}else{
-	  bq[j] = (1.0/sqbt)*bf ;
-
-	  H[j][j-1]=H[j-1][j] = sqbt;
-	}
-
-	_poly(_Linop,bq[j],bf);
-
-	bf = bf - (1.0/sqbt)*bq[j-1]; 	       //bf = A bq[j] - beta bq[j-1] // PAB this comment was incorrect in beta term??
-
-	alpha = real(innerProduct(bq[j],bf));  //alpha = bq[j]^dag A bq[j]
-
-	bf = bf - alpha*bq[j];                 //bf = A bq[j] - beta bq[j-1] - alpha bq[j]
-	RealD fnorm = norm2(bf);
-
-	RealD bck = sqrt( real( conjugate(alpha)*alpha ) + beta );
-
-	beta = fnorm;
-	sqbt = sqrt(beta);
-	std::cout << "alpha = " << alpha << " fnorm = " << fnorm << '\n';
-
-	///Iterative refinement of orthogonality V = [ bq[0]  bq[1]  ...  bq[M] ]
-	int re = 0;
-	// FIXME undefined params; how set in Rudy's code
-	int ref =0;
-	Real rho = 1.0e-8;
-
-	while( re == ref || (sqbt < rho * bck && re < 5) ){
-
-	  Field tmp2(grid);
-	  Field tmp1(grid);
-
-	  //bex = V^dag bf
-	  DenseVector<ComplexD> bex(j+1);
-	  for(int k=0;k<j+1;k++){
-	    bex[k] = innerProduct(bq[k],bf);
-	  }
-	  
-	  zero_fermion(tmp2);
-	  //tmp2 = V s
-	  for(int l=0;l<j+1;l++){
-	    RealD nrm = norm2(bq[l]);
-	    axpy(tmp1,0.0,bq[l],bq[l]); scale(tmp1,bex[l]); 	//tmp1 = V[j] bex[j]
-	    axpy(tmp2,1.0,tmp2,tmp1);					//tmp2 += V[j] bex[j]
-	  }
-
-	  //bf = bf - V V^dag bf.   Subtracting off any component in span { V[j] } 
-	  RealD btc = axpy_norm(bf,-1.0,tmp2,bf);
-	  alpha = alpha + real(bex[j]);	      sqbt = sqrt(real(btc));	      
-	  // FIXME is alpha real in RUDY's code?
-	  RealD nmbex = 0;for(int k=0;k<j+1;k++){nmbex = nmbex + real( conjugate(bex[k])*bex[k]  );}
-	  bck = sqrt( nmbex );
-	  re++;
-	}
-	std::cout << "Iteratively refined orthogonality, changes alpha\n";
-	if(re > 1) std::cout << "orthagonality refined " << re << " times" <<std::endl;
-	H[j][j]=alpha;
-      }
-
-      return end;
-    }
-
-    void EigenSort(DenseVector<double> evals,
-		   DenseVector<Field>  evecs){
-      int N= evals.size();
-      _sort.push(evals,evecs, evals.size(),N);
-    }
-
-    void ImplicitRestart(int TM, DenseVector<RealD> &evals,  DenseVector<DenseVector<RealD> > &evecs, DenseVector<Field> &bq, Field &bf, int cont)
-    {
-      std::cout << "ImplicitRestart begin. Eigensort starting\n";
-
-      DenseMatrix<RealD> H; Resize(H,Nm,Nm);
-
-      EigenSort(evals, evecs);
-
-      ///Assign shifts
-      int K=Nk;
-      int M=Nm;
-      int P=Np;
-      int converged=0;
-      if(K - converged < 4) P = (M - K-1); //one
-      //      DenseVector<RealD> shifts(P + shift_extra.size());
-      DenseVector<RealD> shifts(P);
-      for(int k = 0; k < P; ++k)
-	shifts[k] = evals[k]; 
-
-      /// Shift to form a new H and q
-      DenseMatrix<RealD> Q; Resize(Q,TM,TM);
-      Unity(Q);
-      Shift(Q, shifts); // H is implicitly passed in in Rudy's Shift routine
-
-      int ff = K;
-
-      /// Shifted H defines a new K step Arnoldi factorization
-      RealD  beta = H[ff][ff-1]; 
-      RealD  sig  = Q[TM - 1][ff - 1];
-      std::cout << "beta = " << beta << " sig = " << real(sig) <<std::endl;
-
-      std::cout << "TM = " << TM << " ";
-      std::cout << norm2(bq[0]) << " -- before" <<std::endl;
-
-      /// q -> q Q
-      times_real(bq, Q, TM);
-
-      std::cout << norm2(bq[0]) << " -- after " << ff <<std::endl;
-      bf =  beta* bq[ff] + sig* bf;
-
-      /// Do the rest of the factorization
-      ff = Lanczos_Factor(ff, M,cont,bq,bf,H);
-      
-      if(ff < M)
-	Abort(ff, evals, evecs);
-    }
-
-///Run the Eigensolver
-    void Run(int cont, DenseVector<Field> &bq, Field &bf, DenseVector<DenseVector<RealD> > & evecs,DenseVector<RealD> &evals)
-    {
-      init();
-
-      int M=Nm;
-
-      DenseMatrix<RealD> H; Resize(H,Nm,Nm);
-      Resize(evals,Nm);
-      Resize(evecs,Nm);
-
-      int ff = Lanczos_Factor(0, M, cont, bq,bf,H); // 0--M to begin with
-
-      if(ff < M) {
-	std::cout << "Krylov: aborting ff "<<ff <<" "<<M<<std::endl;
-	abort(); // Why would this happen?
-      }
-
-      int itcount = 0;
-      bool stop = false;
-
-      for(int it = 0; it < Niter && (converged < Nk); ++it) {
-
-	std::cout << "Krylov: Iteration --> " << it << std::endl;
-	int lock_num = lock ? converged : 0;
-	DenseVector<RealD> tevals(M - lock_num );
-	DenseMatrix<RealD> tevecs; Resize(tevecs,M - lock_num,M - lock_num);
-	  
-	//check residual of polynominal 
-	TestConv(H,M, tevals, tevecs);
-
-	if(converged >= Nk)
-	    break;
-
-	ImplicitRestart(ff, tevals,tevecs,H);
-      }
-      Wilkinson<RealD>(H, evals, evecs, small); 
-      //      Check();
-
-      std::cout << "Done  "<<std::endl;
-
-    }
-
-   ///H - shift I = QR; H = Q* H Q
-    void Shift(DenseMatrix<RealD> & H,DenseMatrix<RealD> &Q, DenseVector<RealD> shifts) {
-      
-      int P; Size(shifts,P);
-      int M; SizeSquare(Q,M);
-
-      Unity(Q);
-
-      int lock_num = lock ? converged : 0;
-
-      RealD t_Househoulder_vector(0.0);
-      RealD t_Househoulder_mult(0.0);
-
-      for(int i=0;i<P;i++){
-
-	RealD x, y, z;
-	DenseVector<RealD> ck(3), v(3);
-	  
-	x = H[lock_num+0][lock_num+0]-shifts[i];
-	y = H[lock_num+1][lock_num+0];
-	ck[0] = x; ck[1] = y; ck[2] = 0; 
-
-	normalise(ck);	///Normalization cancels in PHP anyway
-	RealD beta;
-
-	Householder_vector<RealD>(ck, 0, 2, v, beta);
-	Householder_mult<RealD>(H,v,beta,0,lock_num+0,lock_num+2,0);
-	Householder_mult<RealD>(H,v,beta,0,lock_num+0,lock_num+2,1);
-	///Accumulate eigenvector
-	Householder_mult<RealD>(Q,v,beta,0,lock_num+0,lock_num+2,1);
-	  
-	int sw = 0;
-	for(int k=lock_num+0;k<M-2;k++){
-
-	  x = H[k+1][k]; 
-	  y = H[k+2][k]; 
-	  z = (RealD)0.0;
-	  if(k+3 <= M-1){
-	    z = H[k+3][k];
-	  }else{
-	    sw = 1; v[2] = 0.0;
-	  }
-
-	  ck[0] = x; ck[1] = y; ck[2] = z;
-
-	  normalise(ck);
-
-	  Householder_vector<RealD>(ck, 0, 2-sw, v, beta);
-	  Householder_mult<RealD>(H,v, beta,0,k+1,k+3-sw,0);
-	  Householder_mult<RealD>(H,v, beta,0,k+1,k+3-sw,1);
-	  ///Accumulate eigenvector
-	  Householder_mult<RealD>(Q,v, beta,0,k+1,k+3-sw,1);
-	}
-      }
-    }
-
-    void TestConv(DenseMatrix<RealD> & H,int SS, 
-		  DenseVector<Field> &bq, Field &bf,
-		  DenseVector<RealD> &tevals, DenseVector<DenseVector<RealD> > &tevecs, 
-		  int lock, int converged)
-    {
-      std::cout << "Converged " << converged << " so far." << std::endl;
-      int lock_num = lock ? converged : 0;
-      int M = Nm;
-
-      ///Active Factorization
-      DenseMatrix<RealD> AH; Resize(AH,SS - lock_num,SS - lock_num );
-
-      AH = GetSubMtx(H,lock_num, SS, lock_num, SS);
-
-      int NN=tevals.size();
-      int AHsize=SS-lock_num;
-
-      RealD small=1.0e-16;
-      Wilkinson<RealD>(AH, tevals, tevecs, small);
-
-      EigenSort(tevals, tevecs);
-
-      RealD resid_nrm=  norm2(bf);
-
-      if(!lock) converged = 0;
-#if 0
-      for(int i = SS - lock_num - 1; i >= SS - Nk && i >= 0; --i){
-
-	RealD diff = 0;
-	diff = abs( tevecs[i][Nm - 1 - lock_num] ) * resid_nrm;
-
-	std::cout << "residual estimate " << SS-1-i << " " << diff << " of (" << tevals[i] << ")" << std::endl;
-
-	if(diff < converged) {
-
-	  if(lock) {
-	    
-	    DenseMatrix<RealD> Q; Resize(Q,M,M);
-	    bool herm = true; 
-
-	    Lock(H, Q, tevals[i], converged, small, SS, herm);
-
-	    times_real(bq, Q, bq.size());
-	    bf = Q[M - 1][M - 1]* bf;
-	    lock_num++;
-	  }
-	  converged++;
-	  std::cout << " converged on eval " << converged << " of " << Nk << std::endl;
-	} else {
-	  break;
-	}
-      }
-#endif
-      std::cout << "Got " << converged << " so far " <<std::endl;	
-    }
-
-    ///Check
-    void Check(DenseVector<RealD> &evals,
-	       DenseVector<DenseVector<RealD> > &evecs) {
-
-      DenseVector<RealD> goodval(this->get);
-
-      EigenSort(evals,evecs);
-
-      int NM = Nm;
-
-      DenseVector< DenseVector<RealD> > V; Size(V,NM);
-      DenseVector<RealD> QZ(NM*NM);
-
-      for(int i = 0; i < NM; i++){
-	for(int j = 0; j < NM; j++){
-	  // evecs[i][j];
-	}
-      }
-    }
-
-
-/**
-   There is some matrix Q such that for any vector y
-   Q.e_1 = y and Q is unitary.
-**/
-  template<class T>
-  static T orthQ(DenseMatrix<T> &Q, DenseVector<T> y){
-    int N = y.size();	//Matrix Size
-    Fill(Q,0.0);
-    T tau;
-    for(int i=0;i<N;i++){
-      Q[i][0]=y[i];
-    }
-    T sig = conj(y[0])*y[0];
-    T tau0 = abs(sqrt(sig));
-    
-    for(int j=1;j<N;j++){
-      sig += conj(y[j])*y[j]; 
-      tau = abs(sqrt(sig) ); 	
-
-      if(abs(tau0) > 0.0){
-	
-	T gam = conj( (y[j]/tau)/tau0 );
-	for(int k=0;k<=j-1;k++){  
-	  Q[k][j]=-gam*y[k];
-	}
-	Q[j][j]=tau0/tau;
-      } else {
-	Q[j-1][j]=1.0;
-      }
-      tau0 = tau;
-    }
-    return tau;
-  }
-
-/**
-	There is some matrix Q such that for any vector y
-	Q.e_k = y and Q is unitary.
-**/
-  template< class T>
-  static T orthU(DenseMatrix<T> &Q, DenseVector<T> y){
-    T tau = orthQ(Q,y);
-    SL(Q);
-    return tau;
-  }
-
-
-/**
-	Wind up with a matrix with the first con rows untouched
-
-say con = 2
-	Q is such that Qdag H Q has {x, x, val, 0, 0, 0, 0, ...} as 1st colum
-	and the matrix is upper hessenberg
-	and with f and Q appropriately modidied with Q is the arnoldi factorization
-
-**/
-
-template<class T>
-static void Lock(DenseMatrix<T> &H, 	// Hess mtx	
-		 DenseMatrix<T> &Q, 	// Lock Transform
-		 T val, 		// value to be locked
-		 int con, 	// number already locked
-		 RealD small,
-		 int dfg,
-		 bool herm)
-{	
-  //ForceTridiagonal(H);
-
-  int M = H.dim;
-  DenseVector<T> vec; Resize(vec,M-con);
-
-  DenseMatrix<T> AH; Resize(AH,M-con,M-con);
-  AH = GetSubMtx(H,con, M, con, M);
-
-  DenseMatrix<T> QQ; Resize(QQ,M-con,M-con);
-
-  Unity(Q);   Unity(QQ);
-  
-  DenseVector<T> evals; Resize(evals,M-con);
-  DenseMatrix<T> evecs; Resize(evecs,M-con,M-con);
-
-  Wilkinson<T>(AH, evals, evecs, small);
-
-  int k=0;
-  RealD cold = abs( val - evals[k]); 
-  for(int i=1;i<M-con;i++){
-    RealD cnew = abs( val - evals[i]);
-    if( cnew < cold ){k = i; cold = cnew;}
-  }
-  vec = evecs[k];
-
-  ComplexD tau;
-  orthQ(QQ,vec);
-  //orthQM(QQ,AH,vec);
-
-  AH = Hermitian(QQ)*AH;
-  AH = AH*QQ;
-
-  for(int i=con;i<M;i++){
-    for(int j=con;j<M;j++){
-      Q[i][j]=QQ[i-con][j-con];
-      H[i][j]=AH[i-con][j-con];
-    }
-  }
-
-  for(int j = M-1; j>con+2; j--){
-
-    DenseMatrix<T> U; Resize(U,j-1-con,j-1-con);
-    DenseVector<T> z; Resize(z,j-1-con); 
-    T nm = norm(z); 
-    for(int k = con+0;k<j-1;k++){
-      z[k-con] = conj( H(j,k+1) );
-    }
-    normalise(z);
-
-    RealD tmp = 0;
-    for(int i=0;i<z.size()-1;i++){tmp = tmp + abs(z[i]);}
-
-    if(tmp < small/( (RealD)z.size()-1.0) ){ continue;}	
-
-    tau = orthU(U,z);
-
-    DenseMatrix<T> Hb; Resize(Hb,j-1-con,M);	
-	
-    for(int a = 0;a<M;a++){
-      for(int b = 0;b<j-1-con;b++){
-	T sum = 0;
-	for(int c = 0;c<j-1-con;c++){
-	  sum += H[a][con+1+c]*U[c][b];
-	}//sum += H(a,con+1+c)*U(c,b);}
-	Hb[b][a] = sum;
-      }
-    }
-	
-    for(int k=con+1;k<j;k++){
-      for(int l=0;l<M;l++){
-	H[l][k] = Hb[k-1-con][l];
-      }
-    }//H(Hb[k-1-con][l] , l,k);}}
-
-    DenseMatrix<T> Qb; Resize(Qb,M,M);	
-	
-    for(int a = 0;a<M;a++){
-      for(int b = 0;b<j-1-con;b++){
-	T sum = 0;
-	for(int c = 0;c<j-1-con;c++){
-	  sum += Q[a][con+1+c]*U[c][b];
-	}//sum += Q(a,con+1+c)*U(c,b);}
-	Qb[b][a] = sum;
-      }
-    }
-	
-    for(int k=con+1;k<j;k++){
-      for(int l=0;l<M;l++){
-	Q[l][k] = Qb[k-1-con][l];
-      }
-    }//Q(Qb[k-1-con][l] , l,k);}}
-
-    DenseMatrix<T> Hc; Resize(Hc,M,M);	
-	
-    for(int a = 0;a<j-1-con;a++){
-      for(int b = 0;b<M;b++){
-	T sum = 0;
-	for(int c = 0;c<j-1-con;c++){
-	  sum += conj( U[c][a] )*H[con+1+c][b];
-	}//sum += conj( U(c,a) )*H(con+1+c,b);}
-	Hc[b][a] = sum;
-      }
-    }
-
-    for(int k=0;k<M;k++){
-      for(int l=con+1;l<j;l++){
-	H[l][k] = Hc[k][l-1-con];
-      }
-    }//H(Hc[k][l-1-con] , l,k);}}
-
-  }
-}
-#endif
-
-
  };
 
 }
diff --git a/lib/qcd/hmc/checkpointers/ILDGCheckpointer.h b/lib/qcd/hmc/checkpointers/ILDGCheckpointer.h
index 118a8e25..3bcdc77a 100644
--- a/lib/qcd/hmc/checkpointers/ILDGCheckpointer.h
+++ b/lib/qcd/hmc/checkpointers/ILDGCheckpointer.h
@@ -102,7 +102,7 @@ class ILDGHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
     FieldMetaData header;
     IldgReader _IldgReader;
     _IldgReader.open(config);
-    _IldgReader.readConfiguration(config,U,header);  // format from the header
+    _IldgReader.readConfiguration(U,header);  // format from the header
     _IldgReader.close();
 
     std::cout << GridLogMessage << "Read ILDG Configuration from " << config
diff --git a/tests/solver/Test_dwf_lanczos.cc b/tests/solver/Test_dwf_lanczos.cc
index bb978186..48cca378 100644
--- a/tests/solver/Test_dwf_lanczos.cc
+++ b/tests/solver/Test_dwf_lanczos.cc
@@ -54,7 +54,7 @@ int main (int argc, char ** argv)
   GridParallelRNG          RNG5rb(FrbGrid);  RNG5.SeedFixedIntegers(seeds5);
 
   LatticeGaugeField Umu(UGrid); 
-  SU3::TepidConfiguration(RNG4, Umu);
+  SU3::HotConfiguration(RNG4, Umu);
 
   std::vector<LatticeColourMatrix> U(4,UGrid);
   for(int mu=0;mu<Nd;mu++){

From 7e3528686080357933ff87400fadb181abfd8f35 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Wed, 21 Jun 2017 02:26:03 +0100
Subject: [PATCH 097/177] Simplified lanczos, added Eigen diagonalisation.
 Curious if we can deprecate dependency on BLAS. Will see when we get 48^3
 running on our BG/Q port

---
 .../iterative/BlockConjugateGradient.h        |    7 +-
 lib/algorithms/iterative/EigenSort.h          |   81 --
 .../iterative/ImplicitlyRestartedLanczos.h    | 1074 +++++++++--------
 tests/solver/Test_dwf_lanczos.cc              |    9 +-
 4 files changed, 547 insertions(+), 624 deletions(-)
 delete mode 100644 lib/algorithms/iterative/EigenSort.h

diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h
index f8b83b1f..9418f63c 100644
--- a/lib/algorithms/iterative/BlockConjugateGradient.h
+++ b/lib/algorithms/iterative/BlockConjugateGradient.h
@@ -56,11 +56,8 @@ class BlockConjugateGradient : public OperatorFunction<Field> {
   Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
   
   BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
-    : Tolerance(tol),
-    CGtype(cgtype),
-    blockDim(_Orthog),
-    MaxIterations(maxit),
-    ErrorOnNoConverge(err_on_no_conv){};
+    : Tolerance(tol), CGtype(cgtype),   blockDim(_Orthog),  MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv)
+  {};
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Thin QR factorisation (google it)
diff --git a/lib/algorithms/iterative/EigenSort.h b/lib/algorithms/iterative/EigenSort.h
deleted file mode 100644
index 23621544..00000000
--- a/lib/algorithms/iterative/EigenSort.h
+++ /dev/null
@@ -1,81 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/iterative/EigenSort.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef GRID_EIGENSORT_H
-#define GRID_EIGENSORT_H
-
-
-namespace Grid {
-    /////////////////////////////////////////////////////////////
-    // Eigen sorter to begin with
-    /////////////////////////////////////////////////////////////
-
-template<class Field>
-class SortEigen {
- private:
-  
-//hacking for testing for now
- private:
-  static bool less_lmd(RealD left,RealD right){
-    return left > right;
-  }  
-  static bool less_pair(std::pair<RealD,Field const*>& left,
-                        std::pair<RealD,Field const*>& right){
-    return left.first > (right.first);
-  }  
-  
-  
- public:
-
-  void push(DenseVector<RealD>& lmd,
-            DenseVector<Field>& evec,int N) {
-    DenseVector<Field> cpy(lmd.size(),evec[0]._grid);
-    for(int i=0;i<lmd.size();i++) cpy[i] = evec[i];
-    
-    DenseVector<std::pair<RealD, Field const*> > emod(lmd.size());    
-    for(int i=0;i<lmd.size();++i)
-      emod[i] = std::pair<RealD,Field const*>(lmd[i],&cpy[i]);
-
-    partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair);
-
-    typename DenseVector<std::pair<RealD, Field const*> >::iterator it = emod.begin();
-    for(int i=0;i<N;++i){
-      lmd[i]=it->first;
-      evec[i]=*(it->second);
-      ++it;
-    }
-  }
-  void push(DenseVector<RealD>& lmd,int N) {
-    std::partial_sort(lmd.begin(),lmd.begin()+N,lmd.end(),less_lmd);
-  }
-  bool saturated(RealD lmd, RealD thrs) {
-    return fabs(lmd) > fabs(thrs);
-  }
-};
-
-}
-#endif
diff --git a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
index acd67592..571bf1b2 100644
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -7,7 +7,8 @@
     Copyright (C) 2015
 
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Chulwoo Jung
+Author: Guido Cossu
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -31,35 +32,71 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 
 #include <string.h> //memset
 
-#ifdef USE_LAPACK
-void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e,
-                   double *vl, double *vu, int *il, int *iu, double *abstol,
-                   int *m, double *w, double *z, int *ldz, int *isuppz,
-                   double *work, int *lwork, int *iwork, int *liwork,
-                   int *info);
-#endif
-
-template<class T> using DenseVector = std::vector<T>;
-
-//#include <Grid/algorithms/densematrix/DenseMatrix.h>
-#include <Grid/algorithms/iterative/EigenSort.h>
-
 namespace Grid {
 
+  enum IRLdiagonalisation { 
+    IRLdiagonaliseWithDSTEGR,
+    IRLdiagonaliseWithQR,
+    IRLdiagonaliseWithEigen
+  };
+  ////////////////////////////////////////////////////////////////////////////////
+  // Helper class for sorting the evalues AND evectors by Field
+  // Use pointer swizzle on vectors
+  ////////////////////////////////////////////////////////////////////////////////
+template<class Field>
+class SortEigen {
+ private:
+  static bool less_lmd(RealD left,RealD right){
+    return left > right;
+  }  
+  static bool less_pair(std::pair<RealD,Field const*>& left,
+                        std::pair<RealD,Field const*>& right){
+    return left.first > (right.first);
+  }  
+  
+ public:
+  void push(std::vector<RealD>& lmd,std::vector<Field>& evec,int N) {
+    
+    ////////////////////////////////////////////////////////////////////////
+    // PAB: FIXME: VERY VERY VERY wasteful: takes a copy of the entire vector set.
+    //    : The vector reorder should be done by pointer swizzle somehow
+    ////////////////////////////////////////////////////////////////////////
+    std::vector<Field> cpy(lmd.size(),evec[0]._grid);
+    for(int i=0;i<lmd.size();i++) cpy[i] = evec[i];
+    
+    std::vector<std::pair<RealD, Field const*> > emod(lmd.size());    
+
+    for(int i=0;i<lmd.size();++i)  emod[i] = std::pair<RealD,Field const*>(lmd[i],&cpy[i]);
+
+    partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair);
+
+    typename std::vector<std::pair<RealD, Field const*> >::iterator it = emod.begin();
+    for(int i=0;i<N;++i){
+      lmd[i]=it->first;
+      evec[i]=*(it->second);
+      ++it;
+    }
+  }
+  void push(std::vector<RealD>& lmd,int N) {
+    std::partial_sort(lmd.begin(),lmd.begin()+N,lmd.end(),less_lmd);
+  }
+  bool saturated(RealD lmd, RealD thrs) {
+    return fabs(lmd) > fabs(thrs);
+  }
+};
+
 /////////////////////////////////////////////////////////////
 // Implicitly restarted lanczos
 /////////////////////////////////////////////////////////////
 template<class Field> 
 class ImplicitlyRestartedLanczos {
-
-public:       
-  int Niter;   // Max iterations
-  int Nstop;   // Number of evecs checked for convergence
-  int Nk;      // Number of converged sought
-  int Nm;      // Nm -- total number of vectors
-
+private:       
+  int MaxIter;   // Max iterations
+  int Nstop;     // Number of evecs checked for convergence
+  int Nk;        // Number of converged sought
+  int Nm;        // Nm -- total number of vectors
   RealD eresid;
-
+  IRLdiagonalisation diagonalisation;
   ////////////////////////////////////
   // Embedded objects
   ////////////////////////////////////
@@ -70,362 +107,20 @@ public:
   /////////////////////////
   // Constructor
   /////////////////////////
+public:       
  ImplicitlyRestartedLanczos(LinearOperatorBase<Field> &Linop, // op
-			    OperatorFunction<Field> & poly,   // polynmial
-			    int _Nstop, // sought vecs
+			    OperatorFunction<Field> & poly,   // polynomial
+			    int _Nstop, // really sought vecs
 			    int _Nk,    // sought vecs
 			    int _Nm,    // total vecs
-			    RealD _eresid, // resid in lmdue deficit 
-			    int _Niter) : // Max iterations
+			    RealD _eresid, // resid in lmd deficit 
+			    int _MaxIter,  // Max iterations
+			    IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen ) :
     _Linop(Linop),    _poly(poly),
-    Nstop(_Nstop), Nk(_Nk), Nm(_Nm),
-    eresid(_eresid),  Niter(_Niter)  { };
-
-#if 0
-    ImplicitlyRestartedLanczos(LinearOperatorBase<Field> &Linop, // op
-			       OperatorFunction<Field> & poly,   // polynmial
-			       int _Nk, // sought vecs
-			       int _Nm, // total vecs
-			       RealD _eresid, // resid in lmdue deficit 
-			       int _Niter) : // Max iterations
-    _Linop(Linop),      _poly(poly),
-    Nstop(_Nk), Nk(_Nk), Nm(_Nm),      
-    eresid(_eresid),      Niter(_Niter) { };
-#endif
-
-#if 0
-    void calc(DenseVector<RealD>& eval,
-	      DenseVector<Field>& evec,
-	      const Field& src,
-	      int& Nconv);
-
-    void step(DenseVector<RealD>& lmd,
-	      DenseVector<RealD>& lme, 
-	      DenseVector<Field>& evec,
-	      Field& w,int Nm,int k);
-
-    void setUnit_Qt(int Nm, DenseVector<RealD> &Qt) ;
-
-    static RealD normalise(Field& v) ;
-    void orthogonalize(Field& w, DenseVector<Field>& evec, int k);
-    void diagonalize(DenseVector<RealD>& lmd,
-		     DenseVector<RealD>& lme, 
-		     int N2, int N1,
-		     DenseVector<RealD>& Qt,
-		     GridBase *grid);
-
-    void qr_decomp(DenseVector<RealD>& lmd,
-		   DenseVector<RealD>& lme,
-		   int Nk, int Nm,
-		   DenseVector<RealD>& Qt,
-		   RealD Dsh, int kmin, int kmax);
-
-#ifdef USE_LAPACK
-    void diagonalize_lapack(DenseVector<RealD>& lmd,
-			    DenseVector<RealD>& lme, 
-			    int N1, int N2,
-			    DenseVector<RealD>& Qt,
-			    GridBase *grid);
-#endif
-#endif
-
-/* Saad PP. 195
-1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0
-2. For k = 1,2,...,m Do:
-3. wk:=Avk−βkv_{k−1}      
-4. αk:=(wk,vk)       // 
-5. wk:=wk−αkvk       // wk orthog vk 
-6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
-7. vk+1 := wk/βk+1
-8. EndDo
- */
-    void step(DenseVector<RealD>& lmd,
-	      DenseVector<RealD>& lme, 
-	      DenseVector<Field>& evec,
-	      Field& w,int Nm,int k)
-    {
-      const RealD tiny = 1.0e-20;
-      assert( k< Nm );
-      
-      _poly(_Linop,evec[k],w);      // 3. wk:=Avk−βkv_{k−1}
-
-      if(k>0) w -= lme[k-1] * evec[k-1];
-
-      ComplexD zalph = innerProduct(evec[k],w); // 4. αk:=(wk,vk)
-      RealD     alph = real(zalph);
-
-      w = w - alph * evec[k];// 5. wk:=wk−αkvk
-
-      RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
-                                 // 7. vk+1 := wk/βk+1
-
-      lmd[k] = alph;
-      lme[k] = beta;
-
-      if ( k > 0 ) orthogonalize(w,evec,k); // orthonormalise
-      if ( k < Nm-1) evec[k+1] = w;
-
-      if ( beta < tiny ) std::cout << " beta is tiny "<<beta<<std::endl;
-    }
-      
-    void qr_decomp(DenseVector<RealD>& lmd,   // Nm 
-		   DenseVector<RealD>& lme,   // Nm 
-		   int Nk, int Nm,
-		   DenseVector<RealD>& Qt,     // Nm x Nm matrix
-		   RealD Dsh, int kmin, int kmax)
-    {
-      int k = kmin-1;
-      RealD x;
-
-      RealD Fden = 1.0/hypot(lmd[k]-Dsh,lme[k]);
-      RealD c = ( lmd[k] -Dsh) *Fden;
-      RealD s = -lme[k] *Fden;
-      
-      RealD tmpa1 = lmd[k];
-      RealD tmpa2 = lmd[k+1];
-      RealD tmpb  = lme[k];
-
-      lmd[k]   = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
-      lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
-      lme[k]   = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
-      x        =-s*lme[k+1];
-      lme[k+1] = c*lme[k+1];
-      
-      for(int i=0; i<Nk; ++i){
-	RealD Qtmp1 = Qt[i+Nm*k    ];
-	RealD Qtmp2 = Qt[i+Nm*(k+1)];
-	Qt[i+Nm*k    ] = c*Qtmp1 - s*Qtmp2;
-	Qt[i+Nm*(k+1)] = s*Qtmp1 + c*Qtmp2; 
-      }
-
-      // Givens transformations
-      for(int k = kmin; k < kmax-1; ++k){
-
-	RealD Fden = 1.0/hypot(x,lme[k-1]);
-	RealD c = lme[k-1]*Fden;
-	RealD s = - x*Fden;
-	
-	RealD tmpa1 = lmd[k];
-	RealD tmpa2 = lmd[k+1];
-	RealD tmpb  = lme[k];
-
-	lmd[k]   = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
-	lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
-	lme[k]   = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
-	lme[k-1] = c*lme[k-1] -s*x;
-
-	if(k != kmax-2){
-	  x = -s*lme[k+1];
-	  lme[k+1] = c*lme[k+1];
-	}
-
-	for(int i=0; i<Nk; ++i){
-	  RealD Qtmp1 = Qt[i+Nm*k    ];
-	  RealD Qtmp2 = Qt[i+Nm*(k+1)];
-	  Qt[i+Nm*k    ] = c*Qtmp1 -s*Qtmp2;
-	  Qt[i+Nm*(k+1)] = s*Qtmp1 +c*Qtmp2;
-	}
-      }
-    }
-
-
-#ifdef USE_LAPACK
-    void diagonalize_lapack(DenseVector<RealD>& lmd,
-			    DenseVector<RealD>& lme, 
-			    int N1,
-			    int N2,
-			    DenseVector<RealD>& Qt,
-			    GridBase *grid)
-    {
-      const int size = Nm;
-      int NN = N1;
-      double evals_tmp[NN];
-      double evec_tmp[NN][NN];
-      memset(evec_tmp[0],0,sizeof(double)*NN*NN);
-      double DD[NN];
-      double EE[NN];
-      for (int i = 0; i< NN; i++) {
-	for (int j = i - 1; j <= i + 1; j++) {
-	  if ( j < NN && j >= 0 ) {
-	    if (i==j) DD[i] = lmd[i];
-	    if (i==j) evals_tmp[i] = lmd[i];
-	    if (j==(i-1)) EE[j] = lme[j];
-	  }
-	}
-      }
-      int evals_found;
-      int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ;
-      int liwork =  3+NN*10 ;
-      int iwork[liwork];
-      double work[lwork];
-      int isuppz[2*NN];
-      char jobz = 'V'; // calculate evals & evecs
-      char range = 'I'; // calculate all evals
-      //    char range = 'A'; // calculate all evals
-      char uplo = 'U'; // refer to upper half of original matrix
-      char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
-      int ifail[NN];
-      int info;
-      int total = grid->_Nprocessors;
-      int node  = grid->_processor;
-      int interval = (NN/total)+1;
-      double vl = 0.0, vu = 0.0;
-      int il = interval*node+1 , iu = interval*(node+1);
-      if (iu > NN)  iu=NN;
-      double tol = 0.0;
-      if (1) {
-	memset(evals_tmp,0,sizeof(double)*NN);
-	if ( il <= NN){
-	  LAPACK_dstegr(&jobz, &range, &NN,
-			(double*)DD, (double*)EE,
-			&vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A'
-			&tol, // tolerance
-			&evals_found, evals_tmp, (double*)evec_tmp, &NN,
-			isuppz,
-			work, &lwork, iwork, &liwork,
-			&info);
-	  for (int i = iu-1; i>= il-1; i--){
-	    evals_tmp[i] = evals_tmp[i - (il-1)];
-	    if (il>1) evals_tmp[i-(il-1)]=0.;
-	    for (int j = 0; j< NN; j++){
-	      evec_tmp[i][j] = evec_tmp[i - (il-1)][j];
-	      if (il>1) evec_tmp[i-(il-1)][j]=0.;
-	    }
-	  }
-	}
-	{
-	  grid->GlobalSumVector(evals_tmp,NN);
-	  grid->GlobalSumVector((double*)evec_tmp,NN*NN);
-	}
-      } 
-      // cheating a bit.
-      // It is better to sort instead of just reversing it, 
-      // but the document of the routine says evals are sorted in increasing order. 
-      // qr gives evals in decreasing order.
-      for(int i=0;i<NN;i++){
-	for(int j=0;j<NN;j++)
-	  Qt[(NN-1-i)*N2+j]=evec_tmp[i][j];
-	lmd [NN-1-i]=evals_tmp[i];
-      }
-    }
-#endif
-
-    void diagonalize(DenseVector<RealD>& lmd,
-		     DenseVector<RealD>& lme, 
-		     int N2,
-		     int N1,
-		     DenseVector<RealD>& Qt,
-		     GridBase *grid)
-    {
-
-#ifdef USE_LAPACK
-    const int check_lapack=0; // just use lapack if 0, check against lapack if 1
-
-    if(!check_lapack)
-	return diagonalize_lapack(lmd,lme,N2,N1,Qt,grid);
-
-    DenseVector <RealD> lmd2(N1);
-    DenseVector <RealD> lme2(N1);
-    DenseVector<RealD> Qt2(N1*N1);
-    for(int k=0; k<N1; ++k){
-      lmd2[k] = lmd[k];
-      lme2[k] = lme[k];
-    }
-    for(int k=0; k<N1*N1; ++k){
-      Qt2[k] = Qt[k];
-    }
-#endif
-
-      int Niter = 100*N1;
-      int kmin = 1;
-      int kmax = N2;
-
-      // (this should be more sophisticated)
-      for(int iter=0; iter<Niter; ++iter){
-
-	// determination of 2x2 leading submatrix
-	RealD dsub = lmd[kmax-1]-lmd[kmax-2];
-	RealD dd = sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]);
-	RealD Dsh = 0.5*(lmd[kmax-2]+lmd[kmax-1] +dd*(dsub/fabs(dsub)));
-	// (Dsh: shift)
-	
-	// transformation
-	qr_decomp(lmd,lme,N2,N1,Qt,Dsh,kmin,kmax);
-	
-	// Convergence criterion (redef of kmin and kamx)
-	for(int j=kmax-1; j>= kmin; --j){
-	  RealD dds = fabs(lmd[j-1])+fabs(lmd[j]);
-	  if(fabs(lme[j-1])+dds > dds){
-	    kmax = j+1;
-	    goto continued;
-	  }
-	}
-	Niter = iter;
-#ifdef USE_LAPACK
-	if(check_lapack){
-	  const double SMALL=1e-8;
-	  diagonalize_lapack(lmd2,lme2,N2,N1,Qt2,grid);
-	  DenseVector <RealD> lmd3(N2);
-	  for(int k=0; k<N2; ++k) lmd3[k]=lmd[k];
-	  _sort.push(lmd3,N2);
-	  _sort.push(lmd2,N2);
-	  for(int k=0; k<N2; ++k){
-	    if (fabs(lmd2[k] - lmd3[k]) >SMALL)  std::cout <<"lmd(qr) lmd(lapack) "<< k << ": " << lmd2[k] <<" "<< lmd3[k] <<std::endl;
-	  }
-	}
-#endif
-	return;
-
-      continued:
-	for(int j=0; j<kmax-1; ++j){
-	  RealD dds = fabs(lmd[j])+fabs(lmd[j+1]);
-	  if(fabs(lme[j])+dds > dds){
-	    kmin = j+1;
-	    break;
-	  }
-	}
-      }
-      std::cout << "[QL method] Error - Too many iteration: "<<Niter<<"\n";
-      abort();
-    }
-
-    static RealD normalise(Field& v) 
-    {
-      RealD nn = norm2(v);
-      nn = sqrt(nn);
-      v = v * (1.0/nn);
-      return nn;
-    }
-
-    void orthogonalize(Field& w,
-		       DenseVector<Field>& evec,
-		       int k)
-    {
-      typedef typename Field::scalar_type MyComplex;
-      MyComplex ip;
-
-      if ( 0 ) {
-	for(int j=0; j<k; ++j){
-	  normalise(evec[j]);
-	  for(int i=0;i<j;i++){
-	    ip = innerProduct(evec[i],evec[j]); // are the evecs normalised? ; this assumes so.
-	    evec[j] = evec[j] - ip *evec[i];
-	  }
-	}
-      }
-
-      for(int j=0; j<k; ++j){
-	ip = innerProduct(evec[j],w); // are the evecs normalised? ; this assumes so.
-	w = w - ip * evec[j];
-      }
-      normalise(w);
-    }
-
-
-    void setUnit_Qt(int Nm, DenseVector<RealD> &Qt) {
-      for(int i=0; i<Qt.size(); ++i) Qt[i] = 0.0;
-      for(int k=0; k<Nm; ++k) Qt[k + k*Nm] = 1.0;
-    }
+      Nstop(_Nstop), Nk(_Nk), Nm(_Nm),
+      eresid(_eresid),  MaxIter(_MaxIter),
+      diagonalisation(_diagonalisation)
+      { };
 
 /* Rudy Arthur's thesis pp.137
 ------------------------
@@ -443,169 +138,482 @@ repeat
   HK =HM(1:K,1:K)
   →AVK =VKHK +fKe†K † Extend to an M = K + P step factorization AVM = VMHM + fMeM
 until convergence
- */
-    void calc(DenseVector<RealD>& eval,
-	      DenseVector<Field>& evec,
-	      const Field& src,
-	      int& Nconv)
-      {
-
-	GridBase *grid = evec[0]._grid;
-	assert(grid == src._grid);
-
-	std::cout << " -- seek   Nk    = " << Nk    <<" vectors"<< std::endl;
-	std::cout << " -- accept Nstop = " << Nstop <<" vectors"<< std::endl;
-	std::cout << " -- total  Nm    = " << Nm    <<" vectors"<< std::endl;
-	std::cout << " -- size of eval = " << eval.size() << std::endl;
-	std::cout << " -- size of evec = " << evec.size() << std::endl;
-	
-	assert(Nm == evec.size() && Nm == eval.size());
-	
-	DenseVector<RealD> lme(Nm);  
-	DenseVector<RealD> lme2(Nm);
-	DenseVector<RealD> eval2(Nm);
-	DenseVector<RealD> Qt(Nm*Nm);
-	DenseVector<int>   Iconv(Nm);
-
-	DenseVector<Field>  B(Nm,grid); // waste of space replicating
-	
-	Field f(grid);
-	Field v(grid);
-  
-	int k1 = 1;
-	int k2 = Nk;
-
-	Nconv = 0;
-
-	RealD beta_k;
-  
-	// Set initial vector
-	evec[0] = src;
-	std:: cout <<"norm2(src)= " << norm2(src)<<std::endl;
-
-	normalise(evec[0]);
-	std:: cout <<"norm2(evec[0])= " << norm2(evec[0]) <<std::endl;
-	
-	// Initial Nk steps
-	for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
-
-	// Restarting loop begins
-	int iter;
-	for(iter = 0; iter<Niter; ++iter){
-
-	  std::cout<<"\n Restart iteration = "<< iter << std::endl;
-
-	  for(int k=Nk; k<Nm; ++k) step(eval,lme,evec,f,Nm,k);
-
-	  f *= lme[Nm-1];
-
-	  // getting eigenvalues
-	  for(int k=0; k<Nm; ++k){
-	    eval2[k] = eval[k+k1-1];
-	    lme2[k] = lme[k+k1-1];
-	  }
-	  setUnit_Qt(Nm,Qt);
-	  diagonalize(eval2,lme2,Nm,Nm,Qt,grid);
-
-	  // sorting
-	  _sort.push(eval2,Nm);
-	  
-	  // Implicitly shifted QR transformations
-	  setUnit_Qt(Nm,Qt);
-	  for(int ip=k2; ip<Nm; ++ip){ 
-	    //	    std::cout << "qr_decomp "<< ip << " "<< eval2[ip] << std::endl;
-	    qr_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
-	  }
+*/
+  void calc(std::vector<RealD>& eval,  std::vector<Field>& evec, const Field& src, int& Nconv)
+  {
     
-	  for(int i=0; i<(Nk+1); ++i) B[i] = 0.0;
-	  
-	  for(int j=k1-1; j<k2+1; ++j){
-	    for(int k=0; k<Nm; ++k){
-	    B[j].checkerboard = evec[k].checkerboard;
-	      B[j] += Qt[k+Nm*j] * evec[k];
-	    }
-	  }
-	  for(int j=k1-1; j<k2+1; ++j) evec[j] = B[j];
-
-	  // Compressed vector f and beta(k2)
-	  f *= Qt[Nm-1+Nm*(k2-1)];
-	  f += lme[k2-1] * evec[k2];
-	  beta_k = norm2(f);
-	  beta_k = sqrt(beta_k);
-	  std::cout<<" beta(k) = "<<beta_k<<std::endl;
-
-	  RealD betar = 1.0/beta_k;
-	  evec[k2] = betar * f;
-	  lme[k2-1] = beta_k;
-
-	  // Convergence test
-	  for(int k=0; k<Nm; ++k){    
-	    eval2[k] = eval[k];
-	    lme2[k] = lme[k];
-	  }
-	  setUnit_Qt(Nm,Qt);
-	  diagonalize(eval2,lme2,Nk,Nm,Qt,grid);
-	  
-	  for(int k = 0; k<Nk; ++k) B[k]=0.0;
-	  
-	  for(int j = 0; j<Nk; ++j){
-	    for(int k = 0; k<Nk; ++k){
-	      B[j].checkerboard = evec[k].checkerboard;
-	      B[j] += Qt[k+j*Nm] * evec[k];
-	    }
-	  }
-
-	  Nconv = 0;
-	  for(int i=0; i<Nk; ++i){
-
-	    _Linop.HermOp(B[i],v);
-	    
-	    RealD vnum = real(innerProduct(B[i],v)); // HermOp.
-	    RealD vden = norm2(B[i]);
-	    eval2[i] = vnum/vden;
-	    v -= eval2[i]*B[i];
-	    RealD vv = norm2(v);
-	    
-	    std::cout.precision(13);
-	    std::cout << "[" << std::setw(3)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
-	    std::cout << "eval = "<<std::setw(25)<< std::setiosflags(std::ios_base::left)<< eval2[i];
-	    std::cout <<" |H B[i] - eval[i]B[i]|^2 "<< std::setw(25)<< std::setiosflags(std::ios_base::right)<< vv<< std::endl;
-	    
-	    // change the criteria as evals are supposed to be sorted, all evals smaller(larger) than Nstop should have converged
-	    if((vv<eresid*eresid) && (i == Nconv) ){
-	      Iconv[Nconv] = i;
-	      ++Nconv;
-	    }
-
-	  }  // i-loop end
-
-	  std::cout<<" #modes converged: "<<Nconv<<std::endl;
-
-	  if( Nconv>=Nstop ){
-	    goto converged;
-	  }
-	} // end of iter loop
+    GridBase *grid = evec[0]._grid;
+    assert(grid == src._grid);
+    
+    std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
+    std::cout << GridLogMessage <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 /  "<< MaxIter<< std::endl;
+    std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
+    std::cout << GridLogMessage <<" -- seek   Nk    = " << Nk    <<" vectors"<< std::endl;
+    std::cout << GridLogMessage <<" -- accept Nstop = " << Nstop <<" vectors"<< std::endl;
+    std::cout << GridLogMessage <<" -- total  Nm    = " << Nm    <<" vectors"<< std::endl;
+    std::cout << GridLogMessage <<" -- size of eval = " << eval.size() << std::endl;
+    std::cout << GridLogMessage <<" -- size of evec = " << evec.size() << std::endl;
+    if ( diagonalisation == IRLdiagonaliseWithDSTEGR ) {
+      std::cout << GridLogMessage << "Diagonalisation is DSTEGR "<<std::endl;
+    } else if ( diagonalisation == IRLdiagonaliseWithQR ) { 
+      std::cout << GridLogMessage << "Diagonalisation is QR "<<std::endl;
+    }  else if ( diagonalisation == IRLdiagonaliseWithEigen ) { 
+      std::cout << GridLogMessage << "Diagonalisation is Eigen "<<std::endl;
+    }
+    std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
+    
+    assert(Nm == evec.size() && Nm == eval.size());
 	
-	std::cout<<"\n NOT converged.\n";
-	abort();
+    std::vector<RealD> lme(Nm);  
+    std::vector<RealD> lme2(Nm);
+    std::vector<RealD> eval2(Nm);
+    Eigen::MatrixXd    Qt = Eigen::MatrixXd::Zero(Nm,Nm);
+    std::vector<int>   Iconv(Nm);
+
+    std::vector<Field>  B(Nm,grid); // waste of space replicating
+    
+    Field f(grid);
+    Field v(grid);
+    
+    int k1 = 1;
+    int k2 = Nk;
+    
+    Nconv = 0;
+    
+    RealD beta_k;
+  
+    // Set initial vector
+    evec[0] = src;
+    std::cout << GridLogMessage <<"norm2(src)= " << norm2(src)<<std::endl;
+    
+    normalise(evec[0]);
+    std::cout << GridLogMessage <<"norm2(evec[0])= " << norm2(evec[0]) <<std::endl;
+    
+    // Initial Nk steps
+    for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
+    
+    // Restarting loop begins
+    int iter;
+    for(iter = 0; iter<MaxIter; ++iter){
+      
+      std::cout<< GridLogMessage <<" **********************"<< std::endl;
+      std::cout<< GridLogMessage <<" Restart iteration = "<< iter << std::endl;
+      std::cout<< GridLogMessage <<" **********************"<< std::endl;
+      
+      for(int k=Nk; k<Nm; ++k) step(eval,lme,evec,f,Nm,k);
+      
+      f *= lme[Nm-1];
+      
+      // getting eigenvalues
+      for(int k=0; k<Nm; ++k){
+	eval2[k] = eval[k+k1-1];
+	lme2[k] = lme[k+k1-1];
+      }
+      Qt = Eigen::MatrixXd::Identity(Nm,Nm);
+      diagonalize(eval2,lme2,Nm,Nm,Qt,grid);
+
+      // sorting
+      _sort.push(eval2,Nm);
+      
+      // Implicitly shifted QR transformations
+      Qt = Eigen::MatrixXd::Identity(Nm,Nm);
+      for(int ip=k2; ip<Nm; ++ip){ 
+	qr_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
+      }
+    
+      for(int i=0; i<(Nk+1); ++i) B[i] = 0.0;
+	  
+      for(int j=k1-1; j<k2+1; ++j){
+	for(int k=0; k<Nm; ++k){
+	  B[j].checkerboard = evec[k].checkerboard;
+	  B[j] += Qt(j,k) * evec[k];
+	}
+      }
+      for(int j=k1-1; j<k2+1; ++j) evec[j] = B[j];
+      
+      // Compressed vector f and beta(k2)
+      f *= Qt(k2-1,Nm-1);
+      f += lme[k2-1] * evec[k2];
+      beta_k = norm2(f);
+      beta_k = sqrt(beta_k);
+      std::cout<< GridLogMessage<<" beta(k) = "<<beta_k<<std::endl;
+      
+      RealD betar = 1.0/beta_k;
+      evec[k2] = betar * f;
+      lme[k2-1] = beta_k;
+      
+      // Convergence test
+      for(int k=0; k<Nm; ++k){    
+	eval2[k] = eval[k];
+	lme2[k] = lme[k];
+      }
+      Qt = Eigen::MatrixXd::Identity(Nm,Nm);
+      diagonalize(eval2,lme2,Nk,Nm,Qt,grid);
+      
+      for(int k = 0; k<Nk; ++k) B[k]=0.0;
+      
+      for(int j = 0; j<Nk; ++j){
+	for(int k = 0; k<Nk; ++k){
+	  B[j].checkerboard = evec[k].checkerboard;
+	  B[j] += Qt(j,k) * evec[k];
+	}
+      }
+
+      Nconv = 0;
+      for(int i=0; i<Nk; ++i){
 	
-      converged:
-       // Sorting
-       eval.resize(Nconv);
-       evec.resize(Nconv,grid);
-       for(int i=0; i<Nconv; ++i){
-         eval[i] = eval2[Iconv[i]];
-         evec[i] = B[Iconv[i]];
-       }
-      _sort.push(eval,evec,Nconv);
+	_Linop.HermOp(B[i],v);
+	    
+	RealD vnum = real(innerProduct(B[i],v)); // HermOp.
+	RealD vden = norm2(B[i]);
+	eval2[i] = vnum/vden;
+	v -= eval2[i]*B[i];
+	RealD vv = norm2(v);
+	
+	std::cout.precision(13);
+	std::cout << GridLogMessage << "[" << std::setw(3)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
+	std::cout << "eval = "<<std::setw(25)<< std::setiosflags(std::ios_base::left)<< eval2[i];
+	std::cout << " |H B[i] - eval[i]B[i]|^2 "<< std::setw(25)<< std::setiosflags(std::ios_base::right)<< vv<< std::endl;
+	
+	// change the criteria as evals are supposed to be sorted, all evals smaller(larger) than Nstop should have converged
+	if((vv<eresid*eresid) && (i == Nconv) ){
+	  Iconv[Nconv] = i;
+	  ++Nconv;
+	}
+	
+      }  // i-loop end
+      
+      std::cout<< GridLogMessage <<" #modes converged: "<<Nconv<<std::endl;
 
-      std::cout << "\n Converged\n Summary :\n";
-      std::cout << " -- Iterations  = "<< iter   << "\n";
-      std::cout << " -- beta(k)     = "<< beta_k << "\n";
-      std::cout << " -- Nconv       = "<< Nconv  << "\n";
-     }
- };
+      if( Nconv>=Nstop ){
+	goto converged;
+      }
+    } // end of iter loop
+    
+    std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
+    std::cout<< GridLogError    <<" ImplicitlyRestartedLanczos::calc() NOT converged.";
+    std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
+    abort();
+	
+  converged:
+    // Sorting
+    eval.resize(Nconv);
+    evec.resize(Nconv,grid);
+    for(int i=0; i<Nconv; ++i){
+      eval[i] = eval2[Iconv[i]];
+      evec[i] = B[Iconv[i]];
+    }
+    _sort.push(eval,evec,Nconv);
+    
+    std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
+    std::cout << GridLogMessage << "ImplicitlyRestartedLanczos CONVERGED ; Summary :\n";
+    std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
+    std::cout << GridLogMessage << " -- Iterations  = "<< iter   << "\n";
+    std::cout << GridLogMessage << " -- beta(k)     = "<< beta_k << "\n";
+    std::cout << GridLogMessage << " -- Nconv       = "<< Nconv  << "\n";
+    std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
+  }
 
-}
+private:
+/* Saad PP. 195
+1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0
+2. For k = 1,2,...,m Do:
+3. wk:=Avk−βkv_{k−1}      
+4. αk:=(wk,vk)       // 
+5. wk:=wk−αkvk       // wk orthog vk 
+6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
+7. vk+1 := wk/βk+1
+8. EndDo
+ */
+  void step(std::vector<RealD>& lmd,
+	    std::vector<RealD>& lme, 
+	    std::vector<Field>& evec,
+	    Field& w,int Nm,int k)
+  {
+    const RealD tiny = 1.0e-20;
+    assert( k< Nm );
+    
+    _poly(_Linop,evec[k],w);      // 3. wk:=Avk−βkv_{k−1}
+    
+    if(k>0) w -= lme[k-1] * evec[k-1];
+    
+    ComplexD zalph = innerProduct(evec[k],w); // 4. αk:=(wk,vk)
+    RealD     alph = real(zalph);
+    
+    w = w - alph * evec[k];// 5. wk:=wk−αkvk
+    
+    RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
+    // 7. vk+1 := wk/βk+1
+    
+    lmd[k] = alph;
+    lme[k] = beta;
+    
+    if ( k > 0 ) orthogonalize(w,evec,k); // orthonormalise
+    if ( k < Nm-1) evec[k+1] = w;
+    
+    if ( beta < tiny ) std::cout << GridLogMessage << " beta is tiny "<<beta<<std::endl;
+  }
+      
+  ///////////////////////////////////////////////////////////////////
+  // 
+  //
+  ///////////////////////////////////////////////////////////////////
+  void qr_decomp(std::vector<RealD>& lmd,   // Nm 
+		 std::vector<RealD>& lme,   // Nm 
+		 int Nk, int Nm,            // Nk, Nm
+		 Eigen::MatrixXd& Qt,       // Nm x Nm matrix
+		 RealD Dsh, int kmin, int kmax)
+  {
+    int k = kmin-1;
+    RealD x;
+    
+    RealD Fden = 1.0/hypot(lmd[k]-Dsh,lme[k]);
+    RealD c = ( lmd[k] -Dsh) *Fden;
+    RealD s = -lme[k] *Fden;
+      
+    RealD tmpa1 = lmd[k];
+    RealD tmpa2 = lmd[k+1];
+    RealD tmpb  = lme[k];
+
+    lmd[k]   = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
+    lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
+    lme[k]   = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
+    x        =-s*lme[k+1];
+    lme[k+1] = c*lme[k+1];
+      
+    for(int i=0; i<Nk; ++i){
+      RealD Qtmp1 = Qt(k,i);
+      RealD Qtmp2 = Qt(k+1,i);
+      Qt(k,i)  = c*Qtmp1 - s*Qtmp2;
+      Qt(k+1,i)= s*Qtmp1 + c*Qtmp2; 
+    }
+
+    // Givens transformations
+    for(int k = kmin; k < kmax-1; ++k){
+      
+      RealD Fden = 1.0/hypot(x,lme[k-1]);
+      RealD c = lme[k-1]*Fden;
+      RealD s = - x*Fden;
+	
+      RealD tmpa1 = lmd[k];
+      RealD tmpa2 = lmd[k+1];
+      RealD tmpb  = lme[k];
+
+      lmd[k]   = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
+      lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
+      lme[k]   = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
+      lme[k-1] = c*lme[k-1] -s*x;
+
+      if(k != kmax-2){
+	x = -s*lme[k+1];
+	lme[k+1] = c*lme[k+1];
+      }
+
+      for(int i=0; i<Nk; ++i){
+	RealD Qtmp1 = Qt(k,i);
+	RealD Qtmp2 = Qt(k+1,i);
+	Qt(k,i)     = c*Qtmp1 -s*Qtmp2;
+	Qt(k+1,i)   = s*Qtmp1 +c*Qtmp2;
+      }
+    }
+  }
+
+  void diagonalize(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
+		   int Nk, int Nm,   
+		   Eigen::MatrixXd & Qt,
+		   GridBase *grid)
+  {
+    Qt = Eigen::MatrixXd::Identity(Nm,Nm);
+    if ( diagonalisation == IRLdiagonaliseWithDSTEGR ) {
+      diagonalize_lapack(lmd,lme,Nk,Nm,Qt,grid);
+    } else if ( diagonalisation == IRLdiagonaliseWithQR ) { 
+      diagonalize_QR(lmd,lme,Nk,Nm,Qt,grid);
+    }  else if ( diagonalisation == IRLdiagonaliseWithEigen ) { 
+      diagonalize_Eigen(lmd,lme,Nk,Nm,Qt,grid);
+    } else { 
+      assert(0);
+    }
+  }
+
+#ifdef USE_LAPACK
+void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e,
+                   double *vl, double *vu, int *il, int *iu, double *abstol,
+                   int *m, double *w, double *z, int *ldz, int *isuppz,
+                   double *work, int *lwork, int *iwork, int *liwork,
+                   int *info);
 #endif
 
+void diagonalize_lapack(std::vector<RealD>& lmd, // in: diagonal; out: eigenvalues in reverse of the order LAPACK returned
+			std::vector<RealD>& lme, // in: off-diagonal entries (copied into EE)
+			int Nk, int Nm,  // Nk is the problem size NN; Nm only feeds the unused 'size'
+			Eigen::MatrixXd& Qt, // out: row NN-1-i receives evec_tmp[i]
+			GridBase *grid) // supplies _Nprocessors, _processor and GlobalSumVector
+{
+#ifdef USE_LAPACK
+  const int size = Nm; // unused below
+  int NN = Nk;
+  double evals_tmp[NN]; // NOTE(review): runtime-sized arrays like these are a compiler extension in C++
+  double evec_tmp[NN][NN];
+  memset(evec_tmp[0],0,sizeof(double)*NN*NN);
+  double DD[NN]; // diagonal handed to LAPACK
+  double EE[NN]; // off-diagonal handed to LAPACK
+  for (int i = 0; i< NN; i++) {
+    for (int j = i - 1; j <= i + 1; j++) {
+      if ( j < NN && j >= 0 ) {
+	if (i==j) DD[i] = lmd[i];
+	if (i==j) evals_tmp[i] = lmd[i];
+	if (j==(i-1)) EE[j] = lme[j];
+      }
+    }
+  }
+  int evals_found;
+  int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ;
+  int liwork =  3+NN*10 ;
+  int iwork[liwork];
+  double work[lwork];
+  int isuppz[2*NN];
+  char jobz = 'V'; // calculate evals & evecs
+  char range = 'I'; // 'I': calculate only the il-th through iu-th evals (the slice set up below)
+  //    char range = 'A'; // calculate all evals
+  char uplo = 'U'; // refer to upper half of original matrix (unused below)
+  char compz = 'I'; // Compute eigenvectors of tridiagonal matrix (unused below)
+  int ifail[NN]; // unused below
+  int info;
+  int total = grid->_Nprocessors;
+  int node  = grid->_processor;
+  int interval = (NN/total)+1; // number of eigenvalue indices assigned per node
+  double vl = 0.0, vu = 0.0;
+  int il = interval*node+1 , iu = interval*(node+1); // 1-based index range for this node
+  if (iu > NN)  iu=NN;
+  double tol = 0.0;
+  if (1) {
+    memset(evals_tmp,0,sizeof(double)*NN);
+    if ( il <= NN){ // nodes whose range starts beyond NN contribute only zeros
+      LAPACK_dstegr(&jobz, &range, &NN,
+		    (double*)DD, (double*)EE,
+		    &vl, &vu, &il, &iu, // these four are ignored if second parameter is 'A'
+		    &tol, // tolerance
+		    &evals_found, evals_tmp, (double*)evec_tmp, &NN,
+		    isuppz,
+		    work, &lwork, iwork, &liwork,
+		    &info); // NOTE(review): info is not checked after the call
+      for (int i = iu-1; i>= il-1; i--){ // move the local results from slots 0.. up to slots il-1..iu-1
+	evals_tmp[i] = evals_tmp[i - (il-1)];
+	if (il>1) evals_tmp[i-(il-1)]=0.;
+	for (int j = 0; j< NN; j++){
+	  evec_tmp[i][j] = evec_tmp[i - (il-1)][j];
+	  if (il>1) evec_tmp[i-(il-1)][j]=0.;
+	}
+      }
+    }
+    {
+      grid->GlobalSumVector(evals_tmp,NN); // combine the per-node slices; slots a node did not fill are zero
+      grid->GlobalSumVector((double*)evec_tmp,NN*NN);
+    }
+  } 
+  // Safer to sort instead of just reversing it, 
+  // but the document of the routine says evals are sorted in increasing order. 
+  // qr gives evals in decreasing order.
+  for(int i=0;i<NN;i++){
+    lmd [NN-1-i]=evals_tmp[i];
+    for(int j=0;j<NN;j++){
+      Qt((NN-1-i),j)=evec_tmp[i][j];
+    }
+  }
+#else 
+  assert(0); // built without USE_LAPACK: this backend is unavailable
+#endif
+}
+
+  void diagonalize_QR(std::vector<RealD>& lmd, std::vector<RealD>& lme, // diagonal / off-diagonal, handed to qr_decomp each sweep
+		      int Nk, int Nm,   // Nk: initial upper end of the active window; Nm also sets the iteration cap
+		      Eigen::MatrixXd & Qt, // passed through to qr_decomp
+		      GridBase *grid) // not used by this routine
+  {
+    int Niter = 100*Nm; // iteration cap; running past it aborts below
+    int kmin = 1;
+    int kmax = Nk;
+
+    // Repeat shifted QR sweeps on the window [kmin,kmax] until its off-diagonals are negligible.
+    for(int iter=0; iter<Niter; ++iter){
+      
+      // shift Dsh from the 2x2 block at the bottom of the active window
+      RealD dsub = lmd[kmax-1]-lmd[kmax-2];
+      RealD dd = sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]);
+      // sign(dsub), taking +1 at dsub==0: the former dsub/fabs(dsub) evaluated 0/0 = NaN there
+      RealD Dsh = 0.5*(lmd[kmax-2]+lmd[kmax-1] +dd*(dsub < 0.0 ? -1.0 : 1.0));
+	
+      // transformation
+      qr_decomp(lmd,lme,Nk,Nm,Qt,Dsh,kmin,kmax); // Nk, Nm
+	
+      // Convergence criterion (redefinition of kmin and kmax)
+      for(int j=kmax-1; j>= kmin; --j){
+	RealD dds = fabs(lmd[j-1])+fabs(lmd[j]);
+	if(fabs(lme[j-1])+dds > dds){ // lme[j-1] still changes the sum: not yet negligible
+	  kmax = j+1;
+	  goto continued;
+	}
+      }
+      Niter = iter;
+      return; // no significant off-diagonal left in the window: converged
+
+    continued:
+      for(int j=0; j<kmax-1; ++j){ // raise kmin to the first significant off-diagonal
+	RealD dds = fabs(lmd[j])+fabs(lmd[j+1]);
+	if(fabs(lme[j])+dds > dds){
+	  kmin = j+1;
+	  break;
+	}
+      }
+    }
+    std::cout << GridLogError << "[QL method] Error - Too many iteration: "<<Niter<<"\n";
+    abort();
+  }
+
+  void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
+			 int Nk, int Nm,  
+			 Eigen::MatrixXd & Qt, // Nm x Nm
+			 GridBase *grid)
+  {
+    Eigen::MatrixXd TriDiag = Eigen::MatrixXd::Zero(Nk,Nk);
+
+    for(int i=0;i<Nk;i++)   TriDiag(i,i)   = lmd[i];
+    for(int i=0;i<Nk-1;i++) TriDiag(i,i+1) = lme[i];
+    for(int i=0;i<Nk-1;i++) TriDiag(i+1,i) = lme[i];
+    
+    Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> eigensolver(TriDiag);
+
+    for (int i = 0; i < Nk; i++) {
+      lmd[Nk-1-i] = eigensolver.eigenvalues()(i);
+    }
+    for (int i = 0; i < Nk; i++) {
+      for (int j = 0; j < Nk; j++) {
+	Qt(Nk-1-i,j) = eigensolver.eigenvectors()(j,i);
+      }
+    }
+  }
+
+
+  static RealD normalise(Field& v) 
+  {
+    RealD nn = norm2(v);
+    nn = sqrt(nn);
+    v = v * (1.0/nn);
+    return nn;
+  }
+  
+  void orthogonalize(Field& w, std::vector<Field>& evec, int k)
+  {
+    typedef typename Field::scalar_type MyComplex;
+    MyComplex ip;
+    
+    for(int j=0; j<k; ++j){
+      ip = innerProduct(evec[j],w); // are the evecs normalised? ; this assumes so.
+      w = w - ip * evec[j];
+    }
+    normalise(w);
+  }
+
+ };
+}
+#endif
diff --git a/tests/solver/Test_dwf_lanczos.cc b/tests/solver/Test_dwf_lanczos.cc
index 48cca378..1dd5dae3 100644
--- a/tests/solver/Test_dwf_lanczos.cc
+++ b/tests/solver/Test_dwf_lanczos.cc
@@ -92,16 +92,15 @@ int main (int argc, char ** argv)
 
   
   std::vector<RealD>          eval(Nm);
-  FermionField    src(FrbGrid); gaussian(RNG5rb,src);
+  FermionField    src(FrbGrid); 
+  gaussian(RNG5rb,src);
   std::vector<FermionField> evec(Nm,FrbGrid);
   for(int i=0;i<1;i++){
-    std::cout << i<<" / "<< Nm<< " grid pointer "<<evec[i]._grid<<std::endl;
+    std::cout << GridLogMessage <<i<<" / "<< Nm<< " grid pointer "<<evec[i]._grid<<std::endl;
   };
 
   int Nconv;
-  IRL.calc(eval,evec,
-	   src,
-	   Nconv);
+  IRL.calc(eval,evec,src,Nconv);
 
 
   Grid_finalize();

From e8b95bd35b00b25384de0019dd454af853883f08 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Wed, 21 Jun 2017 02:50:09 +0100
Subject: [PATCH 098/177] Clean up finished. Could shrink Lanczos to around 400
 lines at a push

---
 .../iterative/ImplicitlyRestartedLanczos.h    | 114 +++++++++---------
 tests/debug/Test_synthetic_lanczos.cc         |   4 +-
 2 files changed, 62 insertions(+), 56 deletions(-)

diff --git a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
index 571bf1b2..a8723f32 100644
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -39,10 +39,11 @@ namespace Grid {
     IRLdiagonaliseWithQR,
     IRLdiagonaliseWithEigen
   };
-  ////////////////////////////////////////////////////////////////////////////////
-  // Helper class for sorting the evalues AND evectors by Field
-  // Use pointer swizzle on vectors
-  ////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+// Helper class for sorting the evalues AND evectors by Field
+// Use pointer swizzle on vectors
+////////////////////////////////////////////////////////////////////////////////
 template<class Field>
 class SortEigen {
  private:
@@ -90,7 +91,9 @@ class SortEigen {
 /////////////////////////////////////////////////////////////
 template<class Field> 
 class ImplicitlyRestartedLanczos {
+
 private:       
+
   int MaxIter;   // Max iterations
   int Nstop;     // Number of evecs checked for convergence
   int Nk;        // Number of converged sought
@@ -122,6 +125,29 @@ public:
       diagonalisation(_diagonalisation)
       { };
 
+  ////////////////////////////////
+  // Helpers
+  ////////////////////////////////
+  static RealD normalise(Field& v) 
+  {
+    RealD nn = norm2(v);
+    nn = sqrt(nn);
+    v = v * (1.0/nn);
+    return nn;
+  }
+  
+  void orthogonalize(Field& w, std::vector<Field>& evec, int k)
+  {
+    typedef typename Field::scalar_type MyComplex;
+    MyComplex ip;
+    
+    for(int j=0; j<k; ++j){
+      ip = innerProduct(evec[j],w); 
+      w = w - ip * evec[j];
+    }
+    normalise(w);
+  }
+
 /* Rudy Arthur's thesis pp.137
 ------------------------
 Require: M > K P = M − K †
@@ -167,9 +193,10 @@ until convergence
     std::vector<RealD> lme(Nm);  
     std::vector<RealD> lme2(Nm);
     std::vector<RealD> eval2(Nm);
-    Eigen::MatrixXd    Qt = Eigen::MatrixXd::Zero(Nm,Nm);
-    std::vector<int>   Iconv(Nm);
 
+    Eigen::MatrixXd    Qt = Eigen::MatrixXd::Zero(Nm,Nm);
+
+    std::vector<int>   Iconv(Nm);
     std::vector<Field>  B(Nm,grid); // waste of space replicating
     
     Field f(grid);
@@ -218,6 +245,7 @@ until convergence
       // Implicitly shifted QR transformations
       Qt = Eigen::MatrixXd::Identity(Nm,Nm);
       for(int ip=k2; ip<Nm; ++ip){ 
+	// Eigen replacement for qr_decomp ???
 	qr_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
       }
     
@@ -354,10 +382,32 @@ private:
     if ( beta < tiny ) std::cout << GridLogMessage << " beta is tiny "<<beta<<std::endl;
   }
       
-  ///////////////////////////////////////////////////////////////////
-  // 
-  //
-  ///////////////////////////////////////////////////////////////////
+  void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
+			 int Nk, int Nm,  
+			 Eigen::MatrixXd & Qt, // Nm x Nm
+			 GridBase *grid)
+  {
+    Eigen::MatrixXd TriDiag = Eigen::MatrixXd::Zero(Nk,Nk);
+
+    for(int i=0;i<Nk;i++)   TriDiag(i,i)   = lmd[i];
+    for(int i=0;i<Nk-1;i++) TriDiag(i,i+1) = lme[i];
+    for(int i=0;i<Nk-1;i++) TriDiag(i+1,i) = lme[i];
+    
+    Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> eigensolver(TriDiag);
+
+    for (int i = 0; i < Nk; i++) {
+      lmd[Nk-1-i] = eigensolver.eigenvalues()(i);
+    }
+    for (int i = 0; i < Nk; i++) {
+      for (int j = 0; j < Nk; j++) {
+	Qt(Nk-1-i,j) = eigensolver.eigenvectors()(j,i);
+      }
+    }
+  }
+  ///////////////////////////////////////////////////////////////////////////
+  // File could end here if settle on Eigen ???
+  ///////////////////////////////////////////////////////////////////////////
+
   void qr_decomp(std::vector<RealD>& lmd,   // Nm 
 		 std::vector<RealD>& lme,   // Nm 
 		 int Nk, int Nm,            // Nk, Nm
@@ -570,50 +620,6 @@ void diagonalize_lapack(std::vector<RealD>& lmd,
     abort();
   }
 
-  void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
-			 int Nk, int Nm,  
-			 Eigen::MatrixXd & Qt, // Nm x Nm
-			 GridBase *grid)
-  {
-    Eigen::MatrixXd TriDiag = Eigen::MatrixXd::Zero(Nk,Nk);
-
-    for(int i=0;i<Nk;i++)   TriDiag(i,i)   = lmd[i];
-    for(int i=0;i<Nk-1;i++) TriDiag(i,i+1) = lme[i];
-    for(int i=0;i<Nk-1;i++) TriDiag(i+1,i) = lme[i];
-    
-    Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> eigensolver(TriDiag);
-
-    for (int i = 0; i < Nk; i++) {
-      lmd[Nk-1-i] = eigensolver.eigenvalues()(i);
-    }
-    for (int i = 0; i < Nk; i++) {
-      for (int j = 0; j < Nk; j++) {
-	Qt(Nk-1-i,j) = eigensolver.eigenvectors()(j,i);
-      }
-    }
-  }
-
-
-  static RealD normalise(Field& v) 
-  {
-    RealD nn = norm2(v);
-    nn = sqrt(nn);
-    v = v * (1.0/nn);
-    return nn;
-  }
-  
-  void orthogonalize(Field& w, std::vector<Field>& evec, int k)
-  {
-    typedef typename Field::scalar_type MyComplex;
-    MyComplex ip;
-    
-    for(int j=0; j<k; ++j){
-      ip = innerProduct(evec[j],w); // are the evecs normalised? ; this assumes so.
-      w = w - ip * evec[j];
-    }
-    normalise(w);
-  }
-
  };
 }
 #endif
diff --git a/tests/debug/Test_synthetic_lanczos.cc b/tests/debug/Test_synthetic_lanczos.cc
index 8ffbcbe9..32fd6f32 100644
--- a/tests/debug/Test_synthetic_lanczos.cc
+++ b/tests/debug/Test_synthetic_lanczos.cc
@@ -133,8 +133,8 @@ int main (int argc, char ** argv)
   int Nconv;
   RealD eresid = 1.0e-6;
 
-  ImplicitlyRestartedLanczos<LatticeComplex> IRL(HermOp,X,Nk,Nm,eresid,Nit);
-  ImplicitlyRestartedLanczos<LatticeComplex> ChebyIRL(HermOp,Cheby,Nk,Nm,eresid,Nit);
+  ImplicitlyRestartedLanczos<LatticeComplex> IRL(HermOp,X,Nk,Nk,Nm,eresid,Nit);
+  ImplicitlyRestartedLanczos<LatticeComplex> ChebyIRL(HermOp,Cheby,Nk,Nk,Nm,eresid,Nit);
 
   LatticeComplex src(grid); gaussian(RNG,src);
   {

From ef4f2b8c410d449ff0beea1682cfc3de9bda3f79 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Wed, 21 Jun 2017 09:22:20 +0100
Subject: [PATCH 099/177] todo update

---
 TODO | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/TODO b/TODO
index eeb7dfa5..8f80903e 100644
--- a/TODO
+++ b/TODO
@@ -2,8 +2,8 @@ TODO:
 ---------------
 
 Large item work list:
-1)- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- 
-2)- MultiRHS with spread out extra dim
+1)- MultiRHS with spread out extra dim
+2)- Christoph's local basis expansion Lanczos
 3)- BG/Q port and check
 4)- Precision conversion and sort out localConvert      <-- partial
   - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
@@ -13,6 +13,7 @@ Large item work list:
 8)- HDCR resume
 
 Recent DONE 
+-- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- DONE
 -- GaugeFix into central location                      <-- DONE
 -- Scidac and Ildg metadata handling                   <-- DONE
 -- Binary I/O MPI2 IO                                  <-- DONE

From 9e56c6573007ccc857571aefa2ce3b6851f7b891 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Wed, 21 Jun 2017 14:02:58 +0100
Subject: [PATCH 100/177] Updated TODO list

---
 TODO | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/TODO b/TODO
index 8f80903e..001c6c0c 100644
--- a/TODO
+++ b/TODO
@@ -2,7 +2,8 @@ TODO:
 ---------------
 
 Large item work list:
-1)- MultiRHS with spread out extra dim
+1)- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O
+
 2)- Christoph's local basis expansion Lanczos
 3)- BG/Q port and check
 4)- Precision conversion and sort out localConvert      <-- partial

From af71c63f4ce48ccbe9bfdaf40d4171913483add7 Mon Sep 17 00:00:00 2001
From: Lanny91 <andrew.lawson@gmail.com>
Date: Fri, 23 Jun 2017 11:03:12 +0200
Subject: [PATCH 101/177] AVX2 fix

---
 lib/simd/Grid_avx.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h
index 57d9064d..f4634432 100644
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -711,7 +711,7 @@ namespace Optimization {
     v2  = _mm256_hadd_epi32(v1, v1);
     u1  = _mm256_castsi256_si128(v2);      // upper half
     u2  = _mm256_extracti128_si256(v2, 1); // lower half
-    ret = _mm256_add_epi32(u1, u2);
+    ret = _mm_add_epi32(u1, u2);
 #else
     // No AVX horizontal add; extract upper and lower halves of register & use
     // SSE intrinsics.

From 56abbdf4c2fa3848fe9037cf95cf5e4930631d3a Mon Sep 17 00:00:00 2001
From: Lanny91 <andrew.lawson@gmail.com>
Date: Fri, 23 Jun 2017 11:09:14 +0200
Subject: [PATCH 102/177] AVX512 integer reduce fix (for non-intel compiler)

---
 lib/simd/Grid_avx512.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h
index 458a8f7c..85d27421 100644
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -558,7 +558,7 @@ namespace Optimization {
     v2  = _mm256_hadd_epi32(v1, v1);
     u1  = _mm256_castsi256_si128(v2)        // upper half
     u2  = _mm256_extracti128_si256(v2, 1);  // lower half
-    ret = _mm256_add_epi32(u1, u2);
+    ret = _mm_add_epi32(u1, u2);
     return _mm_cvtsi128_si32(ret);
   }
 #else

From 869b99ec1efde04d94bdd02eb041a457accb930e Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sat, 24 Jun 2017 10:55:54 +0100
Subject: [PATCH 103/177] Threaded calls to multiple communicators

---
 lib/communicator/Communicator_mpit.cc | 260 ++++++++++++++++++++++++++
 1 file changed, 260 insertions(+)
 create mode 100644 lib/communicator/Communicator_mpit.cc

diff --git a/lib/communicator/Communicator_mpit.cc b/lib/communicator/Communicator_mpit.cc
new file mode 100644
index 00000000..07522900
--- /dev/null
+++ b/lib/communicator/Communicator_mpit.cc
@@ -0,0 +1,260 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/communicator/Communicator_mpit.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/GridCore.h>
+#include <Grid/GridQCDcore.h>
+#include <Grid/qcd/action/ActionCore.h>
+#include <mpi.h>
+
+namespace Grid {
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Info that is set up once and is independent of the Cartesian layout
+///////////////////////////////////////////////////////////////////////////////////////////////////
+MPI_Comm CartesianCommunicator::communicator_world;
+
+// Should error check all MPI calls.
+void CartesianCommunicator::Init(int *argc, char ***argv) { // initialises MPI if needed, then duplicates MPI_COMM_WORLD
+  int flag;
+  int provided; // thread level granted by MPI_Init_thread; only set when MPI is initialised here
+  MPI_Initialized(&flag); // needed to coexist with other libs apparently
+  if ( !flag ) { // NOTE(review): if MPI was already initialised, the granted thread level is never queried
+    MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
+    if ( provided != MPI_THREAD_MULTIPLE ) { // full thread support not granted
+      QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
+    }
+  }
+  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); // duplicate stored in the static communicator_world
+  ShmInitGeneric();
+}
+
+CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
+{
+  _ndimension = processors.size();
+  std::vector<int> periodic(_ndimension,1);
+
+  _Nprocessors=1;
+  _processors = processors;
+  _processor_coor.resize(_ndimension);
+  
+  MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
+  MPI_Comm_rank(communicator,&_processor);
+  MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
+
+  for(int i=0;i<_ndimension;i++){
+    _Nprocessors*=_processors[i];
+  }
+
+  communicator_halo.resize (2*_ndimension);
+  for(int i=0;i<_ndimension*2;i++){
+    MPI_Comm_dup(communicator,&communicator_halo[i]);
+  }
+  
+  int Size; 
+  MPI_Comm_size(communicator,&Size);
+  
+  assert(Size==_Nprocessors);
+}
+void CartesianCommunicator::GlobalSum(uint32_t &u){
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
+  assert(ierr==0);
+}
+void CartesianCommunicator::GlobalSum(uint64_t &u){
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
+  assert(ierr==0);
+}
+void CartesianCommunicator::GlobalXOR(uint32_t &u){
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
+  assert(ierr==0);
+}
+void CartesianCommunicator::GlobalXOR(uint64_t &u){
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
+  assert(ierr==0);
+}
+void CartesianCommunicator::GlobalSum(float &f){
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
+  assert(ierr==0);
+}
+void CartesianCommunicator::GlobalSumVector(float *f,int N)
+{
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
+  assert(ierr==0);
+}
+void CartesianCommunicator::GlobalSum(double &d)
+{
+  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
+  assert(ierr==0);
+}
+void CartesianCommunicator::GlobalSumVector(double *d,int N)
+{
+  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
+  assert(ierr==0);
+}
+void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
+{
+  int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
+  assert(ierr==0);
+}
+int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
+{
+  int rank;
+  int ierr=MPI_Cart_rank  (communicator, &coor[0], &rank);
+  assert(ierr==0);
+  return rank;
+}
+void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
+{
+  coor.resize(_ndimension);
+  int ierr=MPI_Cart_coords  (communicator, rank, _ndimension,&coor[0]);
+  assert(ierr==0);
+}
+
+// Basic Halo comms primitive
+void CartesianCommunicator::SendToRecvFrom(void *xmit,
+					   int dest,
+					   void *recv,
+					   int from,
+					   int bytes)
+{
+  std::vector<CommsRequest_t> reqs(0);
+  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
+  SendToRecvFromComplete(reqs);
+}
+
+void CartesianCommunicator::SendRecvPacket(void *xmit,
+					   void *recv,
+					   int sender,
+					   int receiver,
+					   int bytes)
+{
+  MPI_Status stat;
+  assert(sender != receiver);
+  int tag = sender;
+  if ( _processor == sender ) {
+    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
+  }
+  if ( _processor == receiver ) { 
+    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
+  }
+}
+
+// Basic Halo comms primitive
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+						void *xmit,
+						int dest,
+						void *recv,
+						int from,
+						int bytes)
+{
+  int myrank = _processor;
+  int ierr;
+  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { 
+    MPI_Request xrq;
+    MPI_Request rrq;
+
+    ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
+    ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
+    
+    assert(ierr==0);
+    list.push_back(xrq);
+    list.push_back(rrq);
+  } else { 
+    // Give the CPU to MPI immediately; can use threads to overlap optionally
+    ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
+		      recv,bytes,MPI_CHAR,from, from,
+		      communicator,MPI_STATUS_IGNORE);
+    assert(ierr==0);
+  }
+}
+void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list) // list: requests queued by SendToRecvFromBegin
+{
+  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { // the other policy completes inside Begin via MPI_Sendrecv
+    int nreq=list.size();
+    std::vector<MPI_Status> status(nreq);
+    int ierr = MPI_Waitall(nreq,list.data(),status.data()); // data() is well-defined on an empty vector, &v[0] is not
+    assert(ierr==0);
+  }
+}
+
+void CartesianCommunicator::Barrier(void)
+{
+  int ierr = MPI_Barrier(communicator);
+  assert(ierr==0);
+}
+
+void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
+{
+  int ierr=MPI_Bcast(data,
+		     bytes,
+		     MPI_BYTE,
+		     root,
+		     communicator);
+  assert(ierr==0);
+}
+  ///////////////////////////////////////////////////////
+  // Should only be used prior to Grid Init finished.
+  // Check for this?
+  ///////////////////////////////////////////////////////
+int CartesianCommunicator::RankWorld(void){ 
+  int r; 
+  MPI_Comm_rank(communicator_world,&r);
+  return r;
+}
+void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
+{
+  int ierr= MPI_Bcast(data,
+		      bytes,
+		      MPI_BYTE,
+		      root,
+		      communicator_world);
+  assert(ierr==0);
+}
+
+  double CartesianCommunicator::StencilSendToRecvFromBegin(int dir, // index into communicator_halo
+							   std::vector<CommsRequest_t> &list, // not used: the exchange below is blocking
+							   void *xmit, // send buffer
+							   int xmit_to_rank, // destination rank
+							   void *recv, // receive buffer
+							   int recv_from_rank, // source rank
+							   int bytes) // size of each of the two transfers
+{
+  int myrank = _processor; // used as the tag of the outgoing message
+  int ierr;
+  // Give the CPU to MPI immediately; can use threads to overlap optionally
+  ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,xmit_to_rank,myrank, // was 'dest': not declared in this function
+		    recv,bytes,MPI_CHAR,recv_from_rank, recv_from_rank, // was 'from'; incoming tag is the sender's rank
+		    communicator_halo[dir],MPI_STATUS_IGNORE);
+  assert(ierr==0);
+  return 2.0*bytes; // bytes sent plus bytes received
+}
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall){ };
+
+
+
+}
+

From d2e8372df3c0a39b9eb2c000c7f190c670a75501 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Sat, 24 Jun 2017 23:03:39 +0100
Subject: [PATCH 104/177] SU(N) algebra fix (was not working)

---
 lib/qcd/utils/SUn.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/qcd/utils/SUn.h b/lib/qcd/utils/SUn.h
index 99a620bc..8f0c0a7b 100644
--- a/lib/qcd/utils/SUn.h
+++ b/lib/qcd/utils/SUn.h
@@ -716,8 +716,7 @@ template<typename GaugeField,typename GaugeMat>
 
     for (int a = 0; a < AdjointDimension; a++) {
       generator(a, Ta);
-      auto tmp = - 2.0 * (trace(timesI(Ta) * in)) * scale;// 2.0 for the normalization of the trace in the fundamental rep
-      pokeColour(h_out, tmp, a);
+      pokeColour(h_out, - 2.0 * (trace(timesI(Ta) * in)) * scale, a);
     }
   }
 

From 0af740dc1521656ee549094fea038176791d6cac Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Sat, 24 Jun 2017 23:04:05 +0100
Subject: [PATCH 105/177] minor scalar HMC code improvement

---
 lib/qcd/action/scalar/ScalarImpl.h              | 8 +++++---
 lib/qcd/action/scalar/ScalarInteractionAction.h | 2 +-
 lib/qcd/hmc/HMC.h                               | 2 +-
 lib/qcd/hmc/HMCResourceManager.h                | 3 ++-
 4 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/lib/qcd/action/scalar/ScalarImpl.h b/lib/qcd/action/scalar/ScalarImpl.h
index 5342a1fa..174553a2 100644
--- a/lib/qcd/action/scalar/ScalarImpl.h
+++ b/lib/qcd/action/scalar/ScalarImpl.h
@@ -93,6 +93,8 @@ class ScalarImplTypes {
   class ScalarAdjMatrixImplTypes {
   public:
     typedef S Simd;
+    typedef QCD::SU<N> Group;
+    
     template <typename vtype>
     using iImplField   = iScalar<iScalar<iMatrix<vtype, N>>>;
     template <typename vtype>
@@ -108,7 +110,7 @@ class ScalarImplTypes {
     typedef Field                PropagatorField;
 
     static inline void generate_momenta(Field& P, GridParallelRNG& pRNG) {
-      QCD::SU<N>::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
+      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
     }
 
     static inline Field projectForce(Field& P) {return P;}
@@ -122,11 +124,11 @@ class ScalarImplTypes {
     }
 
     static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
-      QCD::SU<N>::LieRandomize(pRNG, U);
+      Group::LieRandomize(pRNG, U);
     }
 
     static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
-      QCD::SU<N>::LieRandomize(pRNG, U, 0.01);
+      Group::LieRandomize(pRNG, U, 0.01);
     }
 
     static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
diff --git a/lib/qcd/action/scalar/ScalarInteractionAction.h b/lib/qcd/action/scalar/ScalarInteractionAction.h
index 5f4c630c..1ff8fd37 100644
--- a/lib/qcd/action/scalar/ScalarInteractionAction.h
+++ b/lib/qcd/action/scalar/ScalarInteractionAction.h
@@ -98,7 +98,7 @@ namespace Grid {
 	      permute(temp2, *temp, permute_type);
 	      action._odata[i] -= temp2*(*t_p) + (*t_p)*temp2;
 	    } else {
-	      action._odata[i] -= *temp*(*t_p) + (*t_p)*(*temp);
+	      action._odata[i] -= (*temp)*(*t_p) + (*t_p)*(*temp);
 	    }
 	  } else {
 	    action._odata[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset];
diff --git a/lib/qcd/hmc/HMC.h b/lib/qcd/hmc/HMC.h
index ac690b60..5688bb24 100644
--- a/lib/qcd/hmc/HMC.h
+++ b/lib/qcd/hmc/HMC.h
@@ -76,7 +76,7 @@ struct HMCparameters: Serializable {
 
   template < class ReaderClass > 
   void initialize(Reader<ReaderClass> &TheReader){
-  	std::cout << "Reading HMC\n";
+  	std::cout << GridLogMessage << "Reading HMC\n";
   	read(TheReader, "HMC", *this);
   }
 
diff --git a/lib/qcd/hmc/HMCResourceManager.h b/lib/qcd/hmc/HMCResourceManager.h
index 9f4c99a9..cf0000ed 100644
--- a/lib/qcd/hmc/HMCResourceManager.h
+++ b/lib/qcd/hmc/HMCResourceManager.h
@@ -253,6 +253,7 @@ class HMCResourceManager {
   template<class T, class... Types>
   void AddObservable(Types&&... Args){
     ObservablesList.push_back(std::unique_ptr<T>(new T(std::forward<Types>(Args)...)));
+    ObservablesList.back()->print_parameters();
   }
 
   std::vector<HmcObservable<typename ImplementationPolicy::Field>* > GetObservables(){
@@ -297,4 +298,4 @@ private:
 }
 }
 
-#endif  // HMC_RESOURCE_MANAGER_H
\ No newline at end of file
+#endif  // HMC_RESOURCE_MANAGER_H

From 54e94360ad06cde7edbaeede2cf18eb0d5a1227b Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sat, 24 Jun 2017 23:10:24 +0100
Subject: [PATCH 106/177] Experimental: Multiple communicators to see if we can
 avoid thread locks in --enable-comms=mpit

---
 benchmarks/Benchmark_comms.cc             | 27 ++++++-----
 configure.ac                              | 10 ++---
 lib/Makefile.am                           |  4 +-
 lib/communicator/Communicator_base.cc     | 22 +++++----
 lib/communicator/Communicator_base.h      | 20 +++++----
 lib/communicator/Communicator_mpi3.cc     | 12 ++---
 lib/communicator/Communicator_mpit.cc     | 26 ++++++-----
 lib/cshift/Cshift.h                       |  2 +-
 lib/log/Log.cc                            |  2 +-
 lib/parallelIO/BinaryIO.h                 |  2 +-
 lib/qcd/action/fermion/WilsonFermion5D.cc | 55 +++++++++++------------
 lib/stencil/Stencil.h                     | 45 ++++++++++++++++---
 lib/util/Init.cc                          |  2 +-
 13 files changed, 139 insertions(+), 90 deletions(-)

diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc
index 532532f8..753b8a58 100644
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -68,7 +68,7 @@ int main (int argc, char ** argv)
 
   int Nloop=100;
   int nmu=0;
-  int maxlat=24;
+  int maxlat=32;
   for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
 
   std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
@@ -80,7 +80,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   header();
   for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],
       				    lat*mpi_layout[1],
@@ -163,7 +163,7 @@ int main (int argc, char ** argv)
   header();
 
   for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){
 
       std::vector<int> latt_size  ({lat,lat,lat,lat});
 
@@ -249,7 +249,7 @@ int main (int argc, char ** argv)
   header();
 
   for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],
       				    lat*mpi_layout[1],
@@ -299,7 +299,7 @@ int main (int argc, char ** argv)
 					      xmit_to_rank,
 					      (void *)&rbuf[mu][0],
 					      recv_from_rank,
-					      bytes);
+					      bytes,mu);
 	
 	    comm_proc = mpi_layout[mu]-1;
 	  
@@ -310,11 +310,11 @@ int main (int argc, char ** argv)
 					      xmit_to_rank,
 					      (void *)&rbuf[mu+4][0],
 					      recv_from_rank,
-					      bytes);
+					      bytes,mu+4);
 	  
 	  }
 	}
-	Grid.StencilSendToRecvFromComplete(requests);
+	Grid.StencilSendToRecvFromComplete(requests,0);
 	Grid.Barrier();
 	double stop=usecond();
 	t_time[i] = stop-start; // microseconds
@@ -346,7 +346,7 @@ int main (int argc, char ** argv)
   header();
 
   for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],
       				    lat*mpi_layout[1],
@@ -393,8 +393,8 @@ int main (int argc, char ** argv)
 					      xmit_to_rank,
 					      (void *)&rbuf[mu][0],
 					      recv_from_rank,
-					      bytes);
-	    Grid.StencilSendToRecvFromComplete(requests);
+					      bytes,mu);
+	    Grid.StencilSendToRecvFromComplete(requests,mu);
 	    requests.resize(0);
 
 	    comm_proc = mpi_layout[mu]-1;
@@ -406,8 +406,8 @@ int main (int argc, char ** argv)
 					      xmit_to_rank,
 					      (void *)&rbuf[mu+4][0],
 					      recv_from_rank,
-					      bytes);
-	    Grid.StencilSendToRecvFromComplete(requests);
+					      bytes,mu+4);
+	    Grid.StencilSendToRecvFromComplete(requests,mu+4);
 	    requests.resize(0);
 	  
 	  }
@@ -435,6 +435,9 @@ int main (int argc, char ** argv)
  
     }
   }    
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 
   Grid_finalize();
 }
diff --git a/configure.ac b/configure.ac
index f7284d48..9a596bd0 100644
--- a/configure.ac
+++ b/configure.ac
@@ -324,14 +324,14 @@ case ${ac_COMMS} in
         AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
         comms_type='none'
      ;;
-     mpi3l*)
-       AC_DEFINE([GRID_COMMS_MPI3L],[1],[GRID_COMMS_MPI3L] )
-       comms_type='mpi3l'
-     ;;
      mpi3*)
         AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
         comms_type='mpi3'
      ;;
+     mpit)
+        AC_DEFINE([GRID_COMMS_MPIT],[1],[GRID_COMMS_MPIT] )
+        comms_type='mpit'
+     ;;
      mpi*)
         AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
         comms_type='mpi'
@@ -359,7 +359,7 @@ esac
 AM_CONDITIONAL(BUILD_COMMS_SHMEM, [ test "${comms_type}X" == "shmemX" ])
 AM_CONDITIONAL(BUILD_COMMS_MPI,   [ test "${comms_type}X" == "mpiX" ])
 AM_CONDITIONAL(BUILD_COMMS_MPI3,  [ test "${comms_type}X" == "mpi3X" ] )
-AM_CONDITIONAL(BUILD_COMMS_MPI3L, [ test "${comms_type}X" == "mpi3lX" ] )
+AM_CONDITIONAL(BUILD_COMMS_MPIT,  [ test "${comms_type}X" == "mpitX" ] )
 AM_CONDITIONAL(BUILD_COMMS_NONE,  [ test "${comms_type}X" == "noneX" ])
 
 ############### RNG selection
diff --git a/lib/Makefile.am b/lib/Makefile.am
index fac622ca..6dd7899e 100644
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -10,8 +10,8 @@ if BUILD_COMMS_MPI3
   extra_sources+=communicator/Communicator_base.cc
 endif
 
-if BUILD_COMMS_MPI3L
-  extra_sources+=communicator/Communicator_mpi3_leader.cc
+if BUILD_COMMS_MPIT
+  extra_sources+=communicator/Communicator_mpit.cc
   extra_sources+=communicator/Communicator_base.cc
 endif
 
diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc
index 557fef48..a5edf8e9 100644
--- a/lib/communicator/Communicator_base.cc
+++ b/lib/communicator/Communicator_base.cc
@@ -89,25 +89,31 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
   GlobalSumVector((double *)c,2*N);
 }
 
-#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L)
+#if !defined( GRID_COMMS_MPI3) 
 
 int                      CartesianCommunicator::NodeCount(void)    { return ProcessorCount();};
 int                      CartesianCommunicator::RankCount(void)    { return ProcessorCount();};
-
+#endif
+#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPIT)
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-						       void *xmit,
-						       int xmit_to_rank,
-						       void *recv,
-						       int recv_from_rank,
-						       int bytes)
+							 void *xmit,
+							 int xmit_to_rank,
+							 void *recv,
+							 int recv_from_rank,
+							 int bytes, int dir)
 {
+  // Discard the "dir"
   SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
   return 2.0*bytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
   SendToRecvFromComplete(waitall);
 }
+#endif
+
+#if !defined( GRID_COMMS_MPI3) 
+
 void CartesianCommunicator::StencilBarrier(void){};
 
 commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;
diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h
index 12a8429f..4e471b43 100644
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@@ -38,7 +38,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_COMMS_MPI3
 #include <mpi.h>
 #endif
-#ifdef GRID_COMMS_MPI3L
+#ifdef GRID_COMMS_MPIT
 #include <mpi.h>
 #endif
 #ifdef GRID_COMMS_SHMEM
@@ -64,7 +64,7 @@ class CartesianCommunicator {
   std::vector<int> _processor_coor;  // linear processor coordinate
   unsigned long _ndimension;
 
-#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L)
+#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
   static MPI_Comm communicator_world;
          MPI_Comm communicator;
   typedef MPI_Request CommsRequest_t;
@@ -72,6 +72,10 @@ class CartesianCommunicator {
   typedef int CommsRequest_t;
 #endif
 
+#if defined (GRID_COMMS_MPIT)
+  std::vector<MPI_Comm> communicator_halo;
+#endif
+
   ////////////////////////////////////////////////////////////////////
   // Helper functionality for SHM Windows common to all other impls
   ////////////////////////////////////////////////////////////////////
@@ -212,13 +216,13 @@ class CartesianCommunicator {
   void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
 
   double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-				  void *xmit,
-				  int xmit_to_rank,
-				  void *recv,
-				  int recv_from_rank,
-				  int bytes);
+				    void *xmit,
+				    int xmit_to_rank,
+				    void *recv,
+				    int recv_from_rank,
+				    int bytes,int dir);
   
-  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
+  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
   void StencilBarrier(void);
 
   ////////////////////////////////////////////////////////////
diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc
index 632eb991..8046fef6 100644
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ -600,11 +600,11 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 }
 
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-						       void *xmit,
-						       int dest,
-						       void *recv,
-						       int from,
-						       int bytes)
+							 void *xmit,
+							 int dest,
+							 void *recv,
+							 int from,
+							 int bytes,int dir)
 {
   MPI_Request xrq;
   MPI_Request rrq;
@@ -643,7 +643,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 
   return off_node_bytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
   SendToRecvFromComplete(waitall);
 }
diff --git a/lib/communicator/Communicator_mpit.cc b/lib/communicator/Communicator_mpit.cc
index 07522900..24a518ec 100644
--- a/lib/communicator/Communicator_mpit.cc
+++ b/lib/communicator/Communicator_mpit.cc
@@ -235,24 +235,30 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
   assert(ierr==0);
 }
 
-  double CartesianCommunicator::StencilSendToRecvFromBegin(int dir,
-							   std::vector<CommsRequest_t> &list,
-							   void *xmit,
-							   int xmit_to_rank,
-							   void *recv,
-							   int recv_from_rank,
-							   int bytes)
+double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+							 void *xmit,
+							 int xmit_to_rank,
+							 void *recv,
+							 int recv_from_rank,
+							 int bytes,int dir)
 {
+
   int myrank = _processor;
   int ierr;
+  assert(dir < communicator_halo.size());
+
+  //  std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
   // Give the CPU to MPI immediately; can use threads to overlap optionally
-  ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
-		    recv,bytes,MPI_CHAR,from, from,
+  ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,xmit_to_rank,myrank,
+		    recv,bytes,MPI_CHAR,recv_from_rank, recv_from_rank,
 		    communicator_halo[dir],MPI_STATUS_IGNORE);
   assert(ierr==0);
   return 2.0*bytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall){ };
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
+{ 
+  // Do nothing
+};
 
 
 
diff --git a/lib/cshift/Cshift.h b/lib/cshift/Cshift.h
index cd162e35..7d0caeee 100644
--- a/lib/cshift/Cshift.h
+++ b/lib/cshift/Cshift.h
@@ -42,7 +42,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/cshift/Cshift_mpi.h>
 #endif 
 
-#ifdef GRID_COMMS_MPI3L
+#ifdef GRID_COMMS_MPIT
 #include <Grid/cshift/Cshift_mpi.h>
 #endif 
 
diff --git a/lib/log/Log.cc b/lib/log/Log.cc
index 69a9a0a8..65dc2812 100644
--- a/lib/log/Log.cc
+++ b/lib/log/Log.cc
@@ -95,7 +95,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
 ////////////////////////////////////////////////////////////
 void Grid_quiesce_nodes(void) {
   int me = 0;
-#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L)
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
   MPI_Comm_rank(MPI_COMM_WORLD, &me);
 #endif
 #ifdef GRID_COMMS_SHMEM
diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h
index 117bec01..480afa01 100644
--- a/lib/parallelIO/BinaryIO.h
+++ b/lib/parallelIO/BinaryIO.h
@@ -29,7 +29,7 @@
 #ifndef GRID_BINARY_IO_H
 #define GRID_BINARY_IO_H
 
-#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) 
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT) 
 #define USE_MPI_IO
 #else
 #undef  USE_MPI_IO
diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc
index 27319fb0..6a6bc1f8 100644
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -379,7 +379,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 {
 #ifdef GRID_OMP
   //  assert((dag==DaggerNo) ||(dag==DaggerYes));
-  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
 
   Compressor compressor(dag);
 
@@ -388,46 +387,46 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 
   DhopFaceTime-=usecond();
   st.HaloExchangeOptGather(in,compressor);
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
   DhopFaceTime+=usecond();
-  std::vector<std::vector<CommsRequest_t> > reqs;
 
   // Rely on async comms; start comms before merge of local data
+  DhopComputeTime-=usecond();
   DhopCommTime-=usecond();
-  st.CommunicateBegin(reqs);
-
-  DhopFaceTime-=usecond();
-  st.CommsMergeSHM(compressor);
-  DhopFaceTime+=usecond();
-
-  // Perhaps use omp task and region
 #pragma omp parallel 
   { 
-    int nthreads = omp_get_num_threads();
-    int me = omp_get_thread_num();
-    int myoff, mywork;
+    // Should time this somehow; hard as the threads fork nowait
+    st.CommunicateThreaded();
 
-    GridThread::GetWork(len,me-1,mywork,myoff,nthreads-1);
-    int sF = LLs * myoff;
-
-    if ( me == 0 ) {
-      st.CommunicateComplete(reqs);
-      DhopCommTime+=usecond();
-    } else { 
-      // Interior links in stencil
-      if ( me==1 ) DhopComputeTime-=usecond();
-      if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
-      else      	    Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
-      if ( me==1 ) DhopComputeTime+=usecond();
+  if (dag == DaggerYes) {
+#pragma omp for
+    for (int ss = 0; ss < U._grid->oSites(); ss++) {
+      int sU = ss;
+      int sF = LLs * sU;
+      Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
+    }
+  } else {
+#pragma omp for
+    for (int ss = 0; ss < U._grid->oSites(); ss++) {
+      int sU = ss;
+      int sF = LLs * sU;
+      Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
     }
   }
+#pragma omp single
+  DhopComputeTime+=usecond();
+
+#pragma omp taskwait 
+
+#pragma omp single
+  DhopCommTime+=usecond();
+  } // Closes parallel region and waits the comms (I hope)
+
 
   DhopFaceTime-=usecond();
   st.CommsMerge(compressor);
   DhopFaceTime+=usecond();
 
-  // Load imbalance alert. Should use dynamic schedule OMP for loop
-  // Perhaps create a list of only those sites with face work, and 
-  // load balance process the list.
   DhopComputeTime2-=usecond();
   if (dag == DaggerYes) {
     int sz=st.surface_list.size();
@@ -448,11 +447,9 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 #else 
   assert(0);
 #endif
-
 }
 
 
-
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
 					 DoubledGaugeField & U,
diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h
index 2894778a..17db64d8 100644
--- a/lib/stencil/Stencil.h
+++ b/lib/stencil/Stencil.h
@@ -248,24 +248,57 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
   //////////////////////////////////////////
   // Comms packet queue for asynch thread
   //////////////////////////////////////////
+  void CommunicateThreaded()
+  {
+    for(int i=0;i<Packets.size();i++){
+#pragma omp task 
+      {
+	double start;
+	double stop;
+	start = usecond();
+	uint64_t bytes;
+	std::vector<CommsRequest_t> reqs;
+	bytes=_grid->StencilSendToRecvFromBegin(reqs,
+					  Packets[i].send_buf,
+					  Packets[i].to_rank,
+					  Packets[i].recv_buf,
+					  Packets[i].from_rank,
+					  Packets[i].bytes,i);
+	_grid->StencilSendToRecvFromComplete(reqs,i);
+	// Last task logged; this is approximate but hard to catch
+	// the last to complete
+	stop = usecond();
+	stop = stop - start;
+
+	if ( i==0 ) commtime+=stop;
+
+#pragma omp critical
+	{
+	  comms_bytes+=bytes;
+	}
+
+      }
+    }
+    
+  }
   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
   {
     reqs.resize(Packets.size());
     commtime-=usecond();
     for(int i=0;i<Packets.size();i++){
       comms_bytes+=_grid->StencilSendToRecvFromBegin(reqs[i],
-					  Packets[i].send_buf,
-					  Packets[i].to_rank,
-					  Packets[i].recv_buf,
-					  Packets[i].from_rank,
-					  Packets[i].bytes);
+						     Packets[i].send_buf,
+						     Packets[i].to_rank,
+						     Packets[i].recv_buf,
+						     Packets[i].from_rank,
+						     Packets[i].bytes,i);
     }
   }
 
   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
   {
     for(int i=0;i<Packets.size();i++){
-      _grid->StencilSendToRecvFromComplete(reqs[i]);
+      _grid->StencilSendToRecvFromComplete(reqs[i],i);
     }
     commtime+=usecond();
   }
diff --git a/lib/util/Init.cc b/lib/util/Init.cc
index fe3b1734..fc701ac1 100644
--- a/lib/util/Init.cc
+++ b/lib/util/Init.cc
@@ -393,7 +393,7 @@ void Grid_init(int *argc,char ***argv)
 
 void Grid_finalize(void)
 {
-#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
+#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
   MPI_Finalize();
   Grid_unquiesce_nodes();
 #endif

From 7d7220cbd72278050a1cfda6a083a87b85fecbca Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Tue, 27 Jun 2017 14:38:45 +0100
Subject: [PATCH 107/177] scalar: lambda/4! convention

---
 lib/qcd/action/scalar/ScalarInteractionAction.h | 4 ++--
 tests/hmc/Test_hmc_ScalarActionNxN.cc           | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/lib/qcd/action/scalar/ScalarInteractionAction.h b/lib/qcd/action/scalar/ScalarInteractionAction.h
index 1ff8fd37..ac2d4fbb 100644
--- a/lib/qcd/action/scalar/ScalarInteractionAction.h
+++ b/lib/qcd/action/scalar/ScalarInteractionAction.h
@@ -81,7 +81,7 @@ namespace Grid {
       phiStencil.HaloExchange(p, compressor);
       Field action(p._grid), pshift(p._grid), phisquared(p._grid);
       phisquared = p*p;
-      action = (2.0*Ndim + mass_square)*phisquared + lambda*phisquared*phisquared;
+      action = (2.0*Ndim + mass_square)*phisquared + lambda/24.*phisquared*phisquared;
       for (int mu = 0; mu < Ndim; mu++) {
 	//  pshift = Cshift(p, mu, +1);  // not efficient, implement with stencils
 	parallel_for (int i = 0; i < p._grid->oSites(); i++) {
@@ -113,7 +113,7 @@ namespace Grid {
 
     virtual void deriv(const Field &p, Field &force) {
       assert(p._grid->Nd() == Ndim);
-      force = (2.0*Ndim + mass_square)*p + 2.0*lambda*p*p*p;
+      force = (2.0*Ndim + mass_square)*p + lambda/12.*p*p*p;
       // move this outside
       static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
       phiStencil.HaloExchange(p, compressor);
diff --git a/tests/hmc/Test_hmc_ScalarActionNxN.cc b/tests/hmc/Test_hmc_ScalarActionNxN.cc
index a7490f51..a4dad1a3 100644
--- a/tests/hmc/Test_hmc_ScalarActionNxN.cc
+++ b/tests/hmc/Test_hmc_ScalarActionNxN.cc
@@ -45,7 +45,7 @@ using namespace Grid;
 using namespace Grid::QCD;
 
 template <class Impl>
-class MagLogger : public HmcObservable<typename Impl::Field> {
+class MagMeas : public HmcObservable<typename Impl::Field> {
 public:
   typedef typename Impl::Field Field;
   typedef typename Impl::Simd::scalar_type Trace;
@@ -72,13 +72,13 @@ private:
 };
 
 template <class Impl>
-class MagMod: public ObservableModule<MagLogger<Impl>, NoParameters>{
-  typedef ObservableModule<MagLogger<Impl>, NoParameters> ObsBase;
+class MagMod: public ObservableModule<MagMeas<Impl>, NoParameters>{
+  typedef ObservableModule<MagMeas<Impl>, NoParameters> ObsBase;
   using ObsBase::ObsBase; // for constructors
   
   // acquire resource
   virtual void initialize(){
-    this->ObservablePtr.reset(new MagLogger<Impl>());
+    this->ObservablePtr.reset(new MagMeas<Impl>());
   }
 public:
   MagMod(): ObsBase(NoParameters()){}

From 15e87a460725f07dd380bd21b538b43b687a0551 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Tue, 27 Jun 2017 14:39:27 +0100
Subject: [PATCH 108/177] HDF5 IO fix

---
 lib/serialisation/Hdf5IO.cc | 4 +++-
 lib/serialisation/Hdf5IO.h  | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/lib/serialisation/Hdf5IO.cc b/lib/serialisation/Hdf5IO.cc
index b9bb0b87..1fb7be0c 100644
--- a/lib/serialisation/Hdf5IO.cc
+++ b/lib/serialisation/Hdf5IO.cc
@@ -65,10 +65,12 @@ Hdf5Reader::Hdf5Reader(const std::string &fileName)
                       Hdf5Type<unsigned int>::type());
 }
 
-void Hdf5Reader::push(const std::string &s)
+bool Hdf5Reader::push(const std::string &s)
 {
   group_ = group_.openGroup(s);
   path_.push_back(s);
+  
+  return true;
 }
 
 void Hdf5Reader::pop(void)
diff --git a/lib/serialisation/Hdf5IO.h b/lib/serialisation/Hdf5IO.h
index 2f891cd4..94ad9736 100644
--- a/lib/serialisation/Hdf5IO.h
+++ b/lib/serialisation/Hdf5IO.h
@@ -54,7 +54,7 @@ namespace Grid
   public:
     Hdf5Reader(const std::string &fileName);
     virtual ~Hdf5Reader(void) = default;
-    void push(const std::string &s);
+    bool push(const std::string &s);
     void pop(void);
     template <typename U>
     void readDefault(const std::string &s, U &output);

From 07de925127e15fe7b43e31a9e9f3f2298f5f4261 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Wed, 28 Jun 2017 12:45:44 +0100
Subject: [PATCH 109/177] minor scalar action fixes

---
 lib/qcd/action/scalar/ScalarImpl.h              | 4 ++--
 lib/qcd/action/scalar/ScalarInteractionAction.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/qcd/action/scalar/ScalarImpl.h b/lib/qcd/action/scalar/ScalarImpl.h
index 174553a2..f85ab840 100644
--- a/lib/qcd/action/scalar/ScalarImpl.h
+++ b/lib/qcd/action/scalar/ScalarImpl.h
@@ -124,11 +124,11 @@ class ScalarImplTypes {
     }
 
     static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
-      Group::LieRandomize(pRNG, U);
+      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U);
     }
 
     static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
-      Group::LieRandomize(pRNG, U, 0.01);
+      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U, 0.01);
     }
 
     static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
diff --git a/lib/qcd/action/scalar/ScalarInteractionAction.h b/lib/qcd/action/scalar/ScalarInteractionAction.h
index ac2d4fbb..4d189352 100644
--- a/lib/qcd/action/scalar/ScalarInteractionAction.h
+++ b/lib/qcd/action/scalar/ScalarInteractionAction.h
@@ -81,7 +81,7 @@ namespace Grid {
       phiStencil.HaloExchange(p, compressor);
       Field action(p._grid), pshift(p._grid), phisquared(p._grid);
       phisquared = p*p;
-      action = (2.0*Ndim + mass_square)*phisquared + lambda/24.*phisquared*phisquared;
+      action = (2.0*Ndim + mass_square)*phisquared - lambda/24.*phisquared*phisquared;
       for (int mu = 0; mu < Ndim; mu++) {
 	//  pshift = Cshift(p, mu, +1);  // not efficient, implement with stencils
 	parallel_for (int i = 0; i < p._grid->oSites(); i++) {
@@ -113,7 +113,7 @@ namespace Grid {
 
     virtual void deriv(const Field &p, Field &force) {
       assert(p._grid->Nd() == Ndim);
-      force = (2.0*Ndim + mass_square)*p + lambda/12.*p*p*p;
+      force = (2.0*Ndim + mass_square)*p - lambda/12.*p*p*p;
       // move this outside
       static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
       phiStencil.HaloExchange(p, compressor);

From 08e04b96761a03c703899a7ee6ca3f42dddcf2d2 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Wed, 28 Jun 2017 15:30:06 +0100
Subject: [PATCH 110/177] Better benchmarks

---
 benchmarks/Benchmark_memory_bandwidth.cc | 44 ++++++++++----------
 benchmarks/Benchmark_su3.cc              | 52 ++++++++++++------------
 2 files changed, 48 insertions(+), 48 deletions(-)

diff --git a/benchmarks/Benchmark_memory_bandwidth.cc b/benchmarks/Benchmark_memory_bandwidth.cc
index 1aa088f8..1136dfe0 100644
--- a/benchmarks/Benchmark_memory_bandwidth.cc
+++ b/benchmarks/Benchmark_memory_bandwidth.cc
@@ -55,9 +55,9 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-  uint64_t lmax=64;
-#define NLOOP (100*lmax*lmax*lmax*lmax/vol)
-  for(int lat=4;lat<=lmax;lat+=4){
+  uint64_t lmax=96;
+#define NLOOP (10*lmax*lmax*lmax*lmax/vol)
+  for(int lat=8;lat<=lmax;lat+=8){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
       int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
@@ -65,11 +65,11 @@ int main (int argc, char ** argv)
 
       uint64_t Nloop=NLOOP;
 
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
-      LatticeVec z(&Grid); //random(pRNG,z);
-      LatticeVec x(&Grid); //random(pRNG,x);
-      LatticeVec y(&Grid); //random(pRNG,y);
+      LatticeVec z(&Grid); random(pRNG,z);
+      LatticeVec x(&Grid); random(pRNG,x);
+      LatticeVec y(&Grid); random(pRNG,y);
       double a=2.0;
 
 
@@ -94,17 +94,17 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
   
-  for(int lat=4;lat<=lmax;lat+=4){
+  for(int lat=8;lat<=lmax;lat+=8){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
       int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
-      LatticeVec z(&Grid); //random(pRNG,z);
-      LatticeVec x(&Grid); //random(pRNG,x);
-      LatticeVec y(&Grid); //random(pRNG,y);
+      LatticeVec z(&Grid); random(pRNG,z);
+      LatticeVec x(&Grid); random(pRNG,x);
+      LatticeVec y(&Grid); random(pRNG,y);
       double a=2.0;
 
       uint64_t Nloop=NLOOP;
@@ -129,7 +129,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
 
-  for(int lat=4;lat<=lmax;lat+=4){
+  for(int lat=8;lat<=lmax;lat+=8){
 
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
@@ -138,11 +138,11 @@ int main (int argc, char ** argv)
 
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
-      LatticeVec z(&Grid); //random(pRNG,z);
-      LatticeVec x(&Grid); //random(pRNG,x);
-      LatticeVec y(&Grid); //random(pRNG,y);
+      LatticeVec z(&Grid); random(pRNG,z);
+      LatticeVec x(&Grid); random(pRNG,x);
+      LatticeVec y(&Grid); random(pRNG,y);
       RealD a=2.0;
 
 
@@ -166,17 +166,17 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
 
-  for(int lat=4;lat<=lmax;lat+=4){
+  for(int lat=8;lat<=lmax;lat+=8){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
       int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
       uint64_t Nloop=NLOOP;
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
-      LatticeVec z(&Grid); //random(pRNG,z);
-      LatticeVec x(&Grid); //random(pRNG,x);
-      LatticeVec y(&Grid); //random(pRNG,y);
+      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+      LatticeVec z(&Grid); random(pRNG,z);
+      LatticeVec x(&Grid); random(pRNG,x);
+      LatticeVec y(&Grid); random(pRNG,y);
       RealD a=2.0;
       Real nn;      
       double start=usecond();
diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc
index 3d7f9bc9..035af2d9 100644
--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@@ -37,12 +37,12 @@ int main (int argc, char ** argv)
   Grid_init(&argc,&argv);
 #define LMAX (64)
 
-  int Nloop=20;
+  int64_t Nloop=20;
 
   std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
   std::vector<int> mpi_layout  = GridDefaultMpi();
 
-  int threads = GridThread::GetThreads();
+  int64_t threads = GridThread::GetThreads();
   std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
@@ -54,16 +54,16 @@ int main (int argc, char ** argv)
   for(int lat=2;lat<=LMAX;lat+=2){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+      int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
-      LatticeColourMatrix z(&Grid);// random(pRNG,z);
-      LatticeColourMatrix x(&Grid);// random(pRNG,x);
-      LatticeColourMatrix y(&Grid);// random(pRNG,y);
+      LatticeColourMatrix z(&Grid); random(pRNG,z);
+      LatticeColourMatrix x(&Grid); random(pRNG,x);
+      LatticeColourMatrix y(&Grid); random(pRNG,y);
 
       double start=usecond();
-      for(int i=0;i<Nloop;i++){
+      for(int64_t i=0;i<Nloop;i++){
 	x=x*y;
       }
       double stop=usecond();
@@ -86,17 +86,17 @@ int main (int argc, char ** argv)
   for(int lat=2;lat<=LMAX;lat+=2){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+      int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
-      LatticeColourMatrix z(&Grid); //random(pRNG,z);
-      LatticeColourMatrix x(&Grid); //random(pRNG,x);
-      LatticeColourMatrix y(&Grid); //random(pRNG,y);
+      LatticeColourMatrix z(&Grid); random(pRNG,z);
+      LatticeColourMatrix x(&Grid); random(pRNG,x);
+      LatticeColourMatrix y(&Grid); random(pRNG,y);
 
       double start=usecond();
-      for(int i=0;i<Nloop;i++){
+      for(int64_t i=0;i<Nloop;i++){
 	z=x*y;
       }
       double stop=usecond();
@@ -117,17 +117,17 @@ int main (int argc, char ** argv)
   for(int lat=2;lat<=LMAX;lat+=2){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+      int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
-      LatticeColourMatrix z(&Grid); //random(pRNG,z);
-      LatticeColourMatrix x(&Grid); //random(pRNG,x);
-      LatticeColourMatrix y(&Grid); //random(pRNG,y);
+      LatticeColourMatrix z(&Grid); random(pRNG,z);
+      LatticeColourMatrix x(&Grid); random(pRNG,x);
+      LatticeColourMatrix y(&Grid); random(pRNG,y);
 
       double start=usecond();
-      for(int i=0;i<Nloop;i++){
+      for(int64_t i=0;i<Nloop;i++){
 	mult(z,x,y);
       }
       double stop=usecond();
@@ -148,17 +148,17 @@ int main (int argc, char ** argv)
   for(int lat=2;lat<=LMAX;lat+=2){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+      int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
-      LatticeColourMatrix z(&Grid); //random(pRNG,z);
-      LatticeColourMatrix x(&Grid); //random(pRNG,x);
-      LatticeColourMatrix y(&Grid); //random(pRNG,y);
+      LatticeColourMatrix z(&Grid); random(pRNG,z);
+      LatticeColourMatrix x(&Grid); random(pRNG,x);
+      LatticeColourMatrix y(&Grid); random(pRNG,y);
 
       double start=usecond();
-      for(int i=0;i<Nloop;i++){
+      for(int64_t i=0;i<Nloop;i++){
 	mac(z,x,y);
       }
       double stop=usecond();

From 6f5a5cd9b3269932a720804aebe8b7046d4b68fe Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Wed, 28 Jun 2017 23:27:02 +0100
Subject: [PATCH 111/177] Improved threaded comms benchmark

---
 TODO                          | 11 ++--
 benchmarks/Benchmark_comms.cc | 94 +++++++++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+), 3 deletions(-)

diff --git a/TODO b/TODO
index 001c6c0c..3d29215e 100644
--- a/TODO
+++ b/TODO
@@ -2,10 +2,13 @@ TODO:
 ---------------
 
 Large item work list:
-1)- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O
+1)- I/O;    There appear to be issues with MPI IO and NERSC with large files.
+            Possible 2GB limit reappeared. GPFS driver in Intel MPI.
+
+2)- BG/Q port and check
+
+3)- Christoph's local basis expansion Lanczos; port to use Lattice_transfer features
 
-2)- Christoph's local basis expansion Lanczos
-3)- BG/Q port and check
 4)- Precision conversion and sort out localConvert      <-- partial
   - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
 5)- Physical propagator interface
@@ -14,6 +17,8 @@ Large item work list:
 8)- HDCR resume
 
 Recent DONE 
+
+-- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O.  <--- DONE
 -- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- DONE
 -- GaugeFix into central location                      <-- DONE
 -- Scidac and Ildg metadata handling                   <-- DONE
diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc
index 753b8a58..698f9d25 100644
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -435,6 +435,100 @@ int main (int argc, char ** argv)
  
     }
   }    
+
+
+
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  header();
+
+  for(int lat=4;lat<=maxlat;lat+=4){
+    for(int Ls=8;Ls<=8;Ls*=2){
+
+      std::vector<int> latt_size  ({lat*mpi_layout[0],
+      				    lat*mpi_layout[1],
+      				    lat*mpi_layout[2],
+      				    lat*mpi_layout[3]});
+
+      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+      RealD Nrank = Grid._Nprocessors;
+      RealD Nnode = Grid.NodeCount();
+      RealD ppn = Nrank/Nnode;
+
+      std::vector<HalfSpinColourVectorD *> xbuf(8);
+      std::vector<HalfSpinColourVectorD *> rbuf(8);
+      Grid.ShmBufferFreeAll();
+      for(int d=0;d<8;d++){
+	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+      }
+
+      int ncomm;
+      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+      double dbytes;
+      for(int i=0;i<Nloop;i++){
+	double start=usecond();
+
+	// requests is declared per iteration inside the threaded loop below: a single vector handed to Begin/Complete and resized by every thread would be a data race
+	dbytes=0;
+	ncomm=0;
+
+	parallel_for(int dir=0;dir<8;dir++){
+	  std::vector<CartesianCommunicator::CommsRequest_t> requests;
+	  double tbytes;
+	  int mu =dir % 4;
+
+	  if (mpi_layout[mu]>1 ) {
+#pragma omp atomic
+	    ncomm++;
+	    int xmit_to_rank;
+	    int recv_from_rank;
+	    if ( dir == mu ) { 
+	      int comm_proc=1;
+	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+	    } else { 
+	      int comm_proc = mpi_layout[mu]-1;
+	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+	    }
+	    tbytes= Grid.StencilSendToRecvFromBegin(requests,
+						    (void *)&xbuf[dir][0],
+						    xmit_to_rank,
+						    (void *)&rbuf[dir][0],
+						    recv_from_rank,
+						    bytes,dir);
+	    Grid.StencilSendToRecvFromComplete(requests,dir);
+	    requests.resize(0);
+
+#pragma omp atomic
+	    dbytes+=tbytes;
+	  }
+	}
+	Grid.Barrier();
+	double stop=usecond();
+	t_time[i] = stop-start; // microseconds
+      }
+
+      timestat.statistics(t_time);
+
+      dbytes=dbytes*ppn;
+      double xbytes    = dbytes*0.5;
+      double rbytes    = dbytes*0.5;
+      double bidibytes = dbytes;
+
+
+      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
+               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
+               <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
+               <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
+               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
+               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
+ 
+    }
+  }    
+
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;

From 8859a151cc844b8170729285dc2a272e4ffa4940 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Thu, 29 Jun 2017 11:30:29 +0100
Subject: [PATCH 112/177] Small corrections to the NEON port

---
 configure.ac                  |  2 +-
 lib/qcd/smearing/WilsonFlow.h |  9 ++++-----
 lib/simd/Grid_neon.h          | 15 +++++----------
 lib/simd/Grid_vector_types.h  |  2 +-
 4 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/configure.ac b/configure.ac
index a69b97e3..75cf7891 100644
--- a/configure.ac
+++ b/configure.ac
@@ -250,7 +250,7 @@ case ${ax_cv_cxx_compiler_vendor} in
         SIMD_FLAGS='';;
       NEONv8)
         AC_DEFINE([NEONV8],[1],[ARMv8 NEON])
-        SIMD_FLAGS='';;
+        SIMD_FLAGS='-march=armv8-a';;
       QPX|BGQ)
         AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
         SIMD_FLAGS='';;
diff --git a/lib/qcd/smearing/WilsonFlow.h b/lib/qcd/smearing/WilsonFlow.h
index 5e9f2d95..4f5c0d43 100644
--- a/lib/qcd/smearing/WilsonFlow.h
+++ b/lib/qcd/smearing/WilsonFlow.h
@@ -108,7 +108,7 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
     if (maxTau - taus < epsilon){
         epsilon = maxTau-taus;
     }
-    std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
+    //std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
     GaugeField Z(U._grid);
     GaugeField Zprime(U._grid);
     GaugeField tmp(U._grid), Uprime(U._grid);
@@ -138,10 +138,10 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
     // adjust integration step
     
     taus += epsilon;
-    std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
+    //std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
     
     epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.);
-    std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
+    //std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
 
 }
 
@@ -166,7 +166,6 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const {
     out = in;
     for (unsigned int step = 1; step <= Nstep; step++) {
         auto start = std::chrono::high_resolution_clock::now();
-        std::cout << GridLogMessage << "Evolution time :"<< tau(step) << std::endl;
         evolve_step(out);
         auto end = std::chrono::high_resolution_clock::now();
         std::chrono::duration<double> diff = end - start;
@@ -191,7 +190,7 @@ void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, Re
     unsigned int step = 0;
     do{
         step++;
-        std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
+        //std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
         evolve_step_adaptive(out, maxTau);
         std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
             << step << "  "
diff --git a/lib/simd/Grid_neon.h b/lib/simd/Grid_neon.h
index 38815389..d6eb9c5a 100644
--- a/lib/simd/Grid_neon.h
+++ b/lib/simd/Grid_neon.h
@@ -6,9 +6,9 @@
 
     Copyright (C) 2015
 
-Author: Nils Meyer <nils.meyer@ur.de>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: neo <cossu@post.kek.jp>
+    Author: Nils Meyer <nils.meyer@ur.de>
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+    Author: neo <cossu@post.kek.jp>
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -27,7 +27,7 @@ Author: neo <cossu@post.kek.jp>
     See the full license in the file "LICENSE" in the top level distribution directory
     *************************************************************************************/
     /*  END LEGAL */
-//----------------------------------------------------------------------
+
 /*
 
   ARMv8 NEON intrinsics layer by
@@ -37,9 +37,6 @@ Author: neo <cossu@post.kek.jp>
   SFB/TRR55
 
 */
-//----------------------------------------------------------------------
-//#ifndef ARM_NEON
-//#define ARM_NEON
 
 #ifndef GEN_SIMD_WIDTH
 #define GEN_SIMD_WIDTH 16u
@@ -606,6 +603,4 @@ namespace Optimization {
   typedef Optimization::TimesMinusI TimesMinusISIMD;
   typedef Optimization::TimesI      TimesISIMD;
 
-}
-
-//#endif // ARM_NEON
+}
\ No newline at end of file
diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index e05fecc4..27585547 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -53,7 +53,7 @@ directory
 #if defined IMCI
 #include "Grid_imci.h"
 #endif
-#ifdef NEONv8
+#ifdef NEONV8
 #include "Grid_neon.h"
 #endif
 #if defined QPX

From bf630a6821ea8923fc9690a03f621f6d69b31f4e Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Thu, 29 Jun 2017 11:42:25 +0100
Subject: [PATCH 113/177] README file update

---
 README.md | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 9432abe1..5d168298 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@
 
 License: GPL v2.
 
-Last update Nov 2016.
+Last update June 2017.
 
 _Please do not send pull requests to the `master` branch which is reserved for releases._
 
@@ -78,14 +78,17 @@ optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a signifi
 for most programmers.
 
 The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
-Presently SSE4 (128 bit) AVX, AVX2, QPX (256 bit), IMCI, and AVX512 (512 bit) targets are supported (ARM NEON on the way).
+Presently SSE4, ARM NEON (128 bits) AVX, AVX2, QPX (256 bits), IMCI and AVX512 (512 bits) targets are supported.
 
-These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. These may be useful in themselves for other programmers.
+These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. 
 The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
 
 MPI, OpenMP, and SIMD parallelism are present in the library.
 Please see https://arxiv.org/abs/1512.03487 for more detail.
 
+### Required libraries
+Grid requires [GMP](https://gmplib.org/), [MPFR](http://www.mpfr.org/) and optionally [HDF5](https://support.hdfgroup.org/HDF5/) and [LIME](http://usqcd-software.github.io/c-lime/) (for ILDG file format support) to be installed.
+
 ### Quick start
 First, start by cloning the repository:
 
@@ -173,7 +176,8 @@ The following options can be use with the `--enable-simd=` option to target diff
 | `AVXFMA4`   | AVX (256 bit) + FMA4                   |
 | `AVX2`      | AVX 2 (256 bit)                        |
 | `AVX512`    | AVX 512 bit                            |
-| `QPX`       | QPX (256 bit)                          |
+| `NEONv8`    | ARM NEON (128 bit)                     |
+| `QPX`       | IBM QPX (256 bit)                      |
 
 Alternatively, some CPU codenames can be directly used:
 

From 09d09d0fe5bce853e1b42115371cd935a4e29cc0 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Thu, 29 Jun 2017 11:48:11 +0100
Subject: [PATCH 114/177] Update README.md

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 5d168298..1f0b450c 100644
--- a/README.md
+++ b/README.md
@@ -84,7 +84,7 @@ These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal
 The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
 
 MPI, OpenMP, and SIMD parallelism are present in the library.
-Please see https://arxiv.org/abs/1512.03487 for more detail.
+Please see [this paper](https://arxiv.org/abs/1512.03487) for more detail.
 
 ### Required libraries
 Grid requires [GMP](https://gmplib.org/), [MPFR](http://www.mpfr.org/) and optionally [HDF5](https://support.hdfgroup.org/HDF5/) and [LIME](http://usqcd-software.github.io/c-lime/) (for ILDG file format support) to be installed.
@@ -176,7 +176,7 @@ The following options can be use with the `--enable-simd=` option to target diff
 | `AVXFMA4`   | AVX (256 bit) + FMA4                   |
 | `AVX2`      | AVX 2 (256 bit)                        |
 | `AVX512`    | AVX 512 bit                            |
-| `NEONv8`    | ARM NEON (128 bit)                     |
+| `NEONv8`    | [ARM NEON](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0024a/ch07s03.html) (128 bit)                     |
 | `QPX`       | IBM QPX (256 bit)                      |
 
 Alternatively, some CPU codenames can be directly used:
@@ -216,4 +216,4 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed. If you are w
              --with-mpfr=<path>       \
              --enable-mkl             \
              CXX=CC CC=cc
-```
\ No newline at end of file
+```

From ac1f1838bc9c143a3e2091e75d3f68e4455d0231 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 30 Jun 2017 10:15:32 +0100
Subject: [PATCH 115/177] KNL only

---
 lib/perfmon/PerfCount.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/perfmon/PerfCount.cc b/lib/perfmon/PerfCount.cc
index 4778295a..c6f92b9f 100644
--- a/lib/perfmon/PerfCount.cc
+++ b/lib/perfmon/PerfCount.cc
@@ -40,7 +40,7 @@ const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::Performan
   { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES          ,  "CPUCYCLES.........." , INSTRUCTIONS},
   { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS        ,  "INSTRUCTIONS......." , CPUCYCLES   },
     // 4
-#ifdef AVX512
+#ifdef KNL
     { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES    },
     { PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS  },
     { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS    },

From 2d3737a133b6f1208849cd8580badba4ff152a4d Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 30 Jun 2017 10:15:59 +0100
Subject: [PATCH 116/177] O3, KNL

---
 configure.ac | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index f7284d48..8175e8b0 100644
--- a/configure.ac
+++ b/configure.ac
@@ -27,7 +27,7 @@ AX_GXX_VERSION
 AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"],
       [version of g++ that will compile the code])
 
-CXXFLAGS="-g $CXXFLAGS"
+CXXFLAGS="-O3 $CXXFLAGS"
 
 
 ############### Checks for typedefs, structures, and compiler characteristics
@@ -241,6 +241,7 @@ case ${ax_cv_cxx_compiler_vendor} in
         SIMD_FLAGS='';;
       KNL)
         AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
+        AC_DEFINE([KNL],[1],[Knights landing processor])
         SIMD_FLAGS='-march=knl';;
       GEN)
         AC_DEFINE([GEN],[1],[generic vector code])
@@ -276,6 +277,7 @@ case ${ax_cv_cxx_compiler_vendor} in
         SIMD_FLAGS='';;
       KNL)
         AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing])
+        AC_DEFINE([KNL],[1],[Knights landing processor])
         SIMD_FLAGS='-xmic-avx512';;
       GEN)
         AC_DEFINE([GEN],[1],[generic vector code])

From 694b305cab39e1b7870ca57107521679486c611a Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 30 Jun 2017 10:16:13 +0100
Subject: [PATCH 117/177] Update to reporting

---
 benchmarks/Benchmark_dwf.cc | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc
index a071c050..d50cc3a0 100644
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -165,7 +165,7 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
 
   DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-  int ncall =1000;
+  int ncall =500;
   if (1) {
     FGrid->Barrier();
     Dw.ZeroCounters();
@@ -302,6 +302,7 @@ int main (int argc, char ** argv)
       std::cout<< "sD ERR   \n " << err  <<std::endl;
     }
     assert(sum < 1.0e-4);
+
     
     if(1){
       std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
@@ -381,8 +382,23 @@ int main (int argc, char ** argv)
       }
       assert(error<1.0e-4);
     }
+
+  if(1){
+    std::cout << "Single cache warm call to sDw.Dhop " <<std::endl;
+    for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
+      sDw.Dhop(ssrc,sresult,0);
+      PerformanceCounter Counter(i);
+      Counter.Start();
+      sDw.Dhop(ssrc,sresult,0);
+      Counter.Stop();
+      Counter.Report();
+    }
   }
 
+  }
+
+
+
   if (1)
   { // Naive wilson dag implementation
     ref = zero;

From b73bd151bb69a416dbf2fe455e87c1f85d5e59b3 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 30 Jun 2017 10:16:35 +0100
Subject: [PATCH 118/177] Switch off counters by default

---
 benchmarks/Benchmark_dwf.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc
index d50cc3a0..7814ec7d 100644
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -383,7 +383,7 @@ int main (int argc, char ** argv)
       assert(error<1.0e-4);
     }
 
-  if(1){
+  if(0){
     std::cout << "Single cache warm call to sDw.Dhop " <<std::endl;
     for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
       sDw.Dhop(ssrc,sresult,0);

From 38325ebbc61cce2d0f6b394fd15d96333d6e370e Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 30 Jun 2017 10:23:51 +0100
Subject: [PATCH 119/177] Interleave code path; not enabled

---
 lib/stencil/Lebesgue.cc | 25 ++++++++++++++++++++++++-
 lib/stencil/Lebesgue.h  |  2 ++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/lib/stencil/Lebesgue.cc b/lib/stencil/Lebesgue.cc
index 4551878c..0c644fc1 100644
--- a/lib/stencil/Lebesgue.cc
+++ b/lib/stencil/Lebesgue.cc
@@ -51,8 +51,31 @@ LebesgueOrder::LebesgueOrder(GridBase *_grid)
   if ( Block[0]==0) ZGraph();
   else if ( Block[1]==0) NoBlocking();
   else CartesianBlocking();
-}
 
+  if (0) {
+    std::cout << "Thread Interleaving"<<std::endl;
+    ThreadInterleave();
+  } 
+}
+void LebesgueOrder::ThreadInterleave(void)
+{
+  // Regroup _LebesgueReorder so the blocks of 8 consecutive entries dealt to
+  // each thread (block b goes to thread b % threads) become contiguous.
+  const int vol       = _LebesgueReorder.size();
+  const int threads   = GridThread::GetThreads();
+  const int blockbits = 3; // log2 of the block length: ss>>3 is the block index
+  std::vector<IndexInteger> throrder;
+  throrder.reserve(vol);   // every entry is emitted exactly once
+
+  for(int t=0;t<threads;t++){
+    for(int ss=0;ss<vol;ss++){
+       if ( ( ss >> blockbits) % threads == t ) { 
+         throrder.push_back(_LebesgueReorder[ss]);
+       }
+    }
+  }
+  _LebesgueReorder.swap(throrder); // install the new order without copying
+}
 void LebesgueOrder::NoBlocking(void) 
 {
   std::cout<<GridLogDebug<<"Lexicographic : no cache blocking"<<std::endl;
diff --git a/lib/stencil/Lebesgue.h b/lib/stencil/Lebesgue.h
index 1675d16c..7db0cc6b 100644
--- a/lib/stencil/Lebesgue.h
+++ b/lib/stencil/Lebesgue.h
@@ -70,6 +70,8 @@ namespace Grid {
 		  std::vector<IndexInteger> & xi,
 		  std::vector<IndexInteger> &dims);
 
+    void ThreadInterleave(void);
+
   private:
     std::vector<IndexInteger> _LebesgueReorder;
 

From f20eceb6cd6469c496e07e01055a08c0e0e4f7c8 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 30 Jun 2017 10:48:27 +0100
Subject: [PATCH 120/177] First touch once per page in a threaded loop

---
 lib/allocator/AlignedAllocator.h | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/lib/allocator/AlignedAllocator.h b/lib/allocator/AlignedAllocator.h
index 6e85ab27..54090024 100644
--- a/lib/allocator/AlignedAllocator.h
+++ b/lib/allocator/AlignedAllocator.h
@@ -98,7 +98,12 @@ public:
 #else
     if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes);
 #endif
-
+    // First touch optimise in threaded loop
+    uint8_t *cp = (uint8_t *)ptr;
+#pragma omp parallel for
+    for(size_type n=0;n<bytes;n+=4096){
+      cp[n]=0;
+    }
     return ptr;
   }
 
@@ -186,6 +191,12 @@ public:
 #else
     _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
 #endif
+    size_type bytes = __n*sizeof(_Tp);
+    uint8_t *cp = (uint8_t *)ptr;
+#pragma omp parallel for
+    for(size_type n=0;n<bytes;n+=4096){
+      cp[n]=0;
+    }
     return ptr;
   }
   void deallocate(pointer __p, size_type) { 

From 7a788db3dc07914e4e4daa0219edcd406ea6f35e Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 30 Jun 2017 10:49:08 +0100
Subject: [PATCH 121/177] Guard first touch

---
 lib/allocator/AlignedAllocator.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/allocator/AlignedAllocator.h b/lib/allocator/AlignedAllocator.h
index 54090024..4513ce26 100644
--- a/lib/allocator/AlignedAllocator.h
+++ b/lib/allocator/AlignedAllocator.h
@@ -100,7 +100,9 @@ public:
 #endif
     // First touch optimise in threaded loop
     uint8_t *cp = (uint8_t *)ptr;
+#ifdef GRID_OMP
 #pragma omp parallel for
+#endif
     for(size_type n=0;n<bytes;n+=4096){
       cp[n]=0;
     }

From b5a6e4f1fd7d562537b4914e4dc21e81448d4bc6 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 30 Jun 2017 10:53:22 +0100
Subject: [PATCH 122/177] Best option for Xeon cache blocking set

---
 lib/stencil/Lebesgue.cc | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/lib/stencil/Lebesgue.cc b/lib/stencil/Lebesgue.cc
index 0c644fc1..2880e4b6 100644
--- a/lib/stencil/Lebesgue.cc
+++ b/lib/stencil/Lebesgue.cc
@@ -32,8 +32,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 namespace Grid {
 
 int LebesgueOrder::UseLebesgueOrder;
+#ifdef KNL
 std::vector<int> LebesgueOrder::Block({8,2,2,2});
-
+#else
+std::vector<int> LebesgueOrder::Block({2,2,2,2});
+#endif
 LebesgueOrder::IndexInteger LebesgueOrder::alignup(IndexInteger n){
   n--;           // 1000 0011 --> 1000 0010
   n |= n >> 1;   // 1000 0010 | 0100 0001 = 1100 0011

From f3b0a92e71af2577afb68c3021b1f9a8467f3e8e Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@users.noreply.github.com>
Date: Sat, 1 Jul 2017 09:48:00 +0100
Subject: [PATCH 123/177] Update README.md

---
 README.md | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 94 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 1f0b450c..072f7404 100644
--- a/README.md
+++ b/README.md
@@ -158,7 +158,6 @@ The following options can be use with the `--enable-comms=` option to target dif
 | `none`         | no communications                                             |
 | `mpi[-auto]`   | MPI communications                                            |
 | `mpi3[-auto]`  | MPI communications using MPI 3 shared memory                  |
-| `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model |
 | `shmem `       | Cray SHMEM communications                                     |
 
 For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts. Use the standard versions instead.  
@@ -199,21 +198,109 @@ The following configuration is recommended for the Intel Knights Landing platfor
 ``` bash
 ../configure --enable-precision=double\
              --enable-simd=KNL        \
-             --enable-comms=mpi-auto \
-             --with-gmp=<path>        \
-             --with-mpfr=<path>       \
+             --enable-comms=mpi-auto  \
              --enable-mkl             \
              CXX=icpc MPICXX=mpiicpc
 ```
+The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
 
-where `<path>` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
+If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
 
 ``` bash
 ../configure --enable-precision=double\
              --enable-simd=KNL        \
              --enable-comms=mpi       \
-             --with-gmp=<path>        \
-             --with-mpfr=<path>       \
              --enable-mkl             \
              CXX=CC CC=cc
 ```
+
+If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
+```            --with-gmp=<path>        \
+               --with-mpfr=<path>       \
+```
+where `<path>` is the UNIX prefix where GMP and MPFR are installed. 
+
+Knights Landing with Intel Omnipath adapters with two adapters per node 
+presently performs better with use of more than one rank per node, using shared memory 
+for interior communication. This is the mpi3 communications implementation. 
+We recommend four ranks per node for best performance, but optimum is local volume dependent.
+
+``` bash
+../configure --enable-precision=double\
+             --enable-simd=KNL        \
+             --enable-comms=mpi3      \
+             --enable-mkl             \
+             CXX=mpiicpc
+```
+
+### Build setup for Intel Haswell Xeon platform
+
+The following configuration is recommended for the Intel Knights Landing platform:
+
+``` bash
+../configure --enable-precision=double\
+             --enable-simd=AVX2       \
+             --enable-comms=mpi3      \
+             --enable-mkl             \
+             CXX=mpiicpc
+```
+The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
+
+If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
+```            --with-gmp=<path>        \
+               --with-mpfr=<path>       \
+```
+where `<path>` is the UNIX prefix where GMP and MPFR are installed. 
+
+If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
+
+``` bash
+../configure --enable-precision=double\
+             --enable-simd=AVX2       \
+             --enable-comms=mpi3      \
+             --enable-mkl             \
+             CXX=CC CC=cc
+```
+Since Dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of 
+one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using
+```
+        export I_MPI_PIN=1
+```
+This is the default.
+
+### Build setup for Intel Skylake Xeon platform
+
+The following configuration is recommended for the Intel Knights Landing platform:
+
+``` bash
+../configure --enable-precision=double\
+             --enable-simd=AVX512     \
+             --enable-comms=mpi3      \
+             --enable-mkl             \
+             CXX=mpiicpc
+```
+The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
+
+If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
+```            --with-gmp=<path>        \
+               --with-mpfr=<path>       \
+```
+where `<path>` is the UNIX prefix where GMP and MPFR are installed. 
+
+If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
+
+``` bash
+../configure --enable-precision=double\
+             --enable-simd=AVX512     \
+             --enable-comms=mpi3      \
+             --enable-mkl             \
+             CXX=CC CC=cc
+```
+Since Dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of 
+one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using
+```
+        export I_MPI_PIN=1
+```
+This is the default. 
+
+

From e18929eaa0c8e6de539abf2c2ef259ea0816ea7e Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@users.noreply.github.com>
Date: Sat, 1 Jul 2017 09:53:15 +0100
Subject: [PATCH 124/177] Update README.md

---
 README.md | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 072f7404..f3645b3a 100644
--- a/README.md
+++ b/README.md
@@ -215,7 +215,8 @@ If you are working on a Cray machine that does not use the `mpiicpc` wrapper, pl
 ```
 
 If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
-```            --with-gmp=<path>        \
+``` bash
+               --with-gmp=<path>        \
                --with-mpfr=<path>       \
 ```
 where `<path>` is the UNIX prefix where GMP and MPFR are installed. 
@@ -228,26 +229,27 @@ We recommend four ranks per node for best performance, but optimum is local volu
 ``` bash
 ../configure --enable-precision=double\
              --enable-simd=KNL        \
-             --enable-comms=mpi3      \
+             --enable-comms=mpi3-auto \
              --enable-mkl             \
-             CXX=mpiicpc
+             CXX=icpc MPICXX=mpiicpc 
 ```
 
 ### Build setup for Intel Haswell Xeon platform
 
-The following configuration is recommended for the Intel Knights Landing platform:
+The following configuration is recommended for the Intel Haswell platform:
 
 ``` bash
 ../configure --enable-precision=double\
              --enable-simd=AVX2       \
-             --enable-comms=mpi3      \
+             --enable-comms=mpi3-auto \
              --enable-mkl             \
-             CXX=mpiicpc
+             CXX=icpc MPICXX=mpiicpc
 ```
 The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
 
 If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
-```            --with-gmp=<path>        \
+``` bash
+               --with-gmp=<path>        \
                --with-mpfr=<path>       \
 ```
 where `<path>` is the UNIX prefix where GMP and MPFR are installed. 
@@ -270,7 +272,7 @@ This is the default.
 
 ### Build setup for Intel Skylake Xeon platform
 
-The following configuration is recommended for the Intel Knights Landing platform:
+The following configuration is recommended for the Intel Skylake platform:
 
 ``` bash
 ../configure --enable-precision=double\
@@ -282,7 +284,8 @@ The following configuration is recommended for the Intel Knights Landing platfor
 The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
 
 If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
-```            --with-gmp=<path>        \
+``` bash
+               --with-gmp=<path>        \
                --with-mpfr=<path>       \
 ```
 where `<path>` is the UNIX prefix where GMP and MPFR are installed. 
@@ -298,7 +301,7 @@ If you are working on a Cray machine that does not use the `mpiicpc` wrapper, pl
 ```
 Since Dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of 
 one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using
-```
+``` bash
         export I_MPI_PIN=1
 ```
 This is the default. 

From 251a97fe1be59f28686e1d07f8576c7d9f815517 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@users.noreply.github.com>
Date: Sat, 1 Jul 2017 09:55:36 +0100
Subject: [PATCH 125/177] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index f3645b3a..f9fd7ab5 100644
--- a/README.md
+++ b/README.md
@@ -301,7 +301,7 @@ If you are working on a Cray machine that does not use the `mpiicpc` wrapper, pl
 ```
 Since Dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of 
 one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using
-``` bash
+``` 
         export I_MPI_PIN=1
 ```
 This is the default. 

From 1354b46338bfaaa338e4e3ad7430e8b8fe087057 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@users.noreply.github.com>
Date: Sat, 1 Jul 2017 10:04:32 +0100
Subject: [PATCH 126/177] Update README.md

---
 README.md | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/README.md b/README.md
index f9fd7ab5..8f0babd9 100644
--- a/README.md
+++ b/README.md
@@ -306,4 +306,20 @@ one rank per socket. If using the Intel MPI library, threads should be pinned to
 ```
 This is the default. 
 
+### Build setup for laptops, other compilers, non-cluster builds
+
+Many versions of g++ and clang++ work with Grid, and involve merely replacing CXX (and MPICXX),
+and omit the enable-mkl flag. 
+
+Single node builds are enabled with 
+```
+            --enable-comms=none
+```
+
+FFTW support that is not in the default search path may then be enabled with
+```
+    --with-fftw=<installpath>
+```
+
+BLAS will not be compiled in by default, and Lanczos will default to Eigen diagonalisation.
 

From 3d09e3e9e0c3b24e1646db3083aba01537bcf88a Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@users.noreply.github.com>
Date: Sat, 1 Jul 2017 10:05:46 +0100
Subject: [PATCH 127/177] Update README.md

---
 README.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/README.md b/README.md
index 8f0babd9..8f7a3d42 100644
--- a/README.md
+++ b/README.md
@@ -306,6 +306,14 @@ one rank per socket. If using the Intel MPI library, threads should be pinned to
 ```
 This is the default. 
 
+### Build setup for BlueGene/Q
+
+To be written...
+
+### Build setup for ARM Neon
+
+To be written..
+
 ### Build setup for laptops, other compilers, non-cluster builds
 
 Many versions of g++ and clang++ work with Grid, and involve merely replacing CXX (and MPICXX),

From 37263fd9b181f1190ff201203da6ac6a431e045d Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@users.noreply.github.com>
Date: Sat, 1 Jul 2017 10:06:24 +0100
Subject: [PATCH 128/177] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8f7a3d42..afb751f5 100644
--- a/README.md
+++ b/README.md
@@ -312,7 +312,7 @@ To be written...
 
 ### Build setup for ARM Neon
 
-To be written..
+To be written...
 
 ### Build setup for laptops, other compilers, non-cluster builds
 

From b68ad0cc0bf6ab479199020fd6b976229c0cb047 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@users.noreply.github.com>
Date: Sat, 1 Jul 2017 10:20:07 +0100
Subject: [PATCH 129/177] Update README.md

---
 README.md | 74 ++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 43 insertions(+), 31 deletions(-)

diff --git a/README.md b/README.md
index afb751f5..a786bc6c 100644
--- a/README.md
+++ b/README.md
@@ -22,6 +22,37 @@ Last update June 2017.
 
 _Please do not send pull requests to the `master` branch which is reserved for releases._
 
+
+
+### Description
+This library provides data parallel C++ container classes with internal memory layout
+that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
+are provided, similar to HPF and cmfortran, and user control is given over the mapping of
+array indices to both MPI tasks and SIMD processing elements.
+
+* Identically shaped arrays can then be processed with perfect data parallelisation.
+* Such identically shaped arrays are called conformable arrays.
+
+The transformation is based on the observation that Cartesian array processing involves
+identical processing to be performed on different regions of the Cartesian array.
+
+The library will both geometrically decompose into MPI tasks and across SIMD lanes.
+Local vector loops are parallelised with OpenMP pragmas.
+
+Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but
+optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification
+for most programmers.
+
+The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
+Presently SSE4, ARM NEON (128 bits) AVX, AVX2, QPX (256 bits), IMCI and AVX512 (512 bits) targets are supported.
+
+These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. 
+The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
+
+MPI, OpenMP, and SIMD parallelism are present in the library.
+Please see [this paper](https://arxiv.org/abs/1512.03487) for more detail.
+
+
 ### Compilers
 
 Intel ICPC v16.0.3 and later
@@ -56,38 +87,19 @@ When you file an issue, please go though the following checklist:
 6. Attach the output of `make V=1`.
 7. Describe the issue and any previous attempt to solve it. If relevant, show how to reproduce the issue using a minimal working example.
 
-
-
-### Description
-This library provides data parallel C++ container classes with internal memory layout
-that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
-are provided, similar to HPF and cmfortran, and user control is given over the mapping of
-array indices to both MPI tasks and SIMD processing elements.
-
-* Identically shaped arrays then be processed with perfect data parallelisation.
-* Such identically shaped arrays are called conformable arrays.
-
-The transformation is based on the observation that Cartesian array processing involves
-identical processing to be performed on different regions of the Cartesian array.
-
-The library will both geometrically decompose into MPI tasks and across SIMD lanes.
-Local vector loops are parallelised with OpenMP pragmas.
-
-Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but
-optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification
-for most programmers.
-
-The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
-Presently SSE4, ARM NEON (128 bits) AVX, AVX2, QPX (256 bits), IMCI and AVX512 (512 bits) targets are supported.
-
-These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. 
-The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
-
-MPI, OpenMP, and SIMD parallelism are present in the library.
-Please see [this paper](https://arxiv.org/abs/1512.03487) for more detail.
-
 ### Required libraries
-Grid requires [GMP](https://gmplib.org/), [MPFR](http://www.mpfr.org/) and optionally [HDF5](https://support.hdfgroup.org/HDF5/) and [LIME](http://usqcd-software.github.io/c-lime/) (for ILDG file format support) to be installed.
+Grid requires:
+[GMP](https://gmplib.org/), 
+[MPFR](http://www.mpfr.org/) 
+
+Bootstrapping grid downloads and uses for internal dense matrix (non-QCD operations) the Eigen library.
+
+Grid optionally uses:
+[HDF5](https://support.hdfgroup.org/HDF5/)  
+[LIME](http://usqcd-software.github.io/c-lime/) (for ILDG file format support) 
+[FFTW](http://www.fftw.org) (Either generic or via the Intel MKL library)
+[LAPACK]( either generic or Intel MKL library)
+
 
 ### Quick start
 First, start by cloning the repository:

From 7b0237b0819d6981767a0189f7550546a58a8683 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@users.noreply.github.com>
Date: Sat, 1 Jul 2017 10:24:41 +0100
Subject: [PATCH 130/177] Update README.md

---
 README.md | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index a786bc6c..3572be26 100644
--- a/README.md
+++ b/README.md
@@ -89,16 +89,22 @@ When you file an issue, please go though the following checklist:
 
 ### Required libraries
 Grid requires:
+
 [GMP](https://gmplib.org/), 
+
 [MPFR](http://www.mpfr.org/) 
 
 Bootstrapping grid downloads and uses for internal dense matrix (non-QCD operations) the Eigen library.
 
 Grid optionally uses:
+
 [HDF5](https://support.hdfgroup.org/HDF5/)  
-[LIME](http://usqcd-software.github.io/c-lime/) (for ILDG file format support) 
-[FFTW](http://www.fftw.org) (Either generic or via the Intel MKL library)
-[LAPACK]( either generic or Intel MKL library)
+
+[LIME](http://usqcd-software.github.io/c-lime/) for ILDG and SciDAC file format support. 
+
+[FFTW](http://www.fftw.org) either generic version or via the Intel MKL library.
+
+LAPACK either generic version or Intel MKL library.
 
 
 ### Quick start

From 40e119c61cac619b7fa1874e5fa7ccdc1dcb77cb Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Sat, 8 Jul 2017 22:27:11 -0400
Subject: [PATCH 131/177] NUMA improvements worth preserving from AMD EPYC
 tests

---
 benchmarks/Benchmark_memory_bandwidth.cc | 48 ++++++++++++------------
 lib/allocator/AlignedAllocator.h         |  3 +-
 lib/communicator/Communicator_mpi3.cc    | 20 +++++++++-
 3 files changed, 45 insertions(+), 26 deletions(-)

diff --git a/benchmarks/Benchmark_memory_bandwidth.cc b/benchmarks/Benchmark_memory_bandwidth.cc
index 1136dfe0..848f271d 100644
--- a/benchmarks/Benchmark_memory_bandwidth.cc
+++ b/benchmarks/Benchmark_memory_bandwidth.cc
@@ -60,16 +60,16 @@ int main (int argc, char ** argv)
   for(int lat=8;lat<=lmax;lat+=8){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+      int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 
       uint64_t Nloop=NLOOP;
 
-      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
-      LatticeVec z(&Grid); random(pRNG,z);
-      LatticeVec x(&Grid); random(pRNG,x);
-      LatticeVec y(&Grid); random(pRNG,y);
+      LatticeVec z(&Grid);// random(pRNG,z);
+      LatticeVec x(&Grid);// random(pRNG,x);
+      LatticeVec y(&Grid);// random(pRNG,y);
       double a=2.0;
 
 
@@ -83,7 +83,7 @@ int main (int argc, char ** argv)
       double time = (stop-start)/Nloop*1000;
       
       double flops=vol*Nvec*2;// mul,add
-      double bytes=3*vol*Nvec*sizeof(Real);
+      double bytes=3.0*vol*Nvec*sizeof(Real);
       std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;
 
     }
@@ -97,14 +97,14 @@ int main (int argc, char ** argv)
   for(int lat=8;lat<=lmax;lat+=8){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+      int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 
-      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
-      LatticeVec z(&Grid); random(pRNG,z);
-      LatticeVec x(&Grid); random(pRNG,x);
-      LatticeVec y(&Grid); random(pRNG,y);
+      LatticeVec z(&Grid);// random(pRNG,z);
+      LatticeVec x(&Grid);// random(pRNG,x);
+      LatticeVec y(&Grid);// random(pRNG,y);
       double a=2.0;
 
       uint64_t Nloop=NLOOP;
@@ -119,7 +119,7 @@ int main (int argc, char ** argv)
       double time = (stop-start)/Nloop*1000;
      
       double flops=vol*Nvec*2;// mul,add
-      double bytes=3*vol*Nvec*sizeof(Real);
+      double bytes=3.0*vol*Nvec*sizeof(Real);
       std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;
 
     }
@@ -133,16 +133,16 @@ int main (int argc, char ** argv)
 
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+      int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
       uint64_t Nloop=NLOOP;
 
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 
-      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
-      LatticeVec z(&Grid); random(pRNG,z);
-      LatticeVec x(&Grid); random(pRNG,x);
-      LatticeVec y(&Grid); random(pRNG,y);
+      LatticeVec z(&Grid);// random(pRNG,z);
+      LatticeVec x(&Grid);// random(pRNG,x);
+      LatticeVec y(&Grid);// random(pRNG,y);
       RealD a=2.0;
 
 
@@ -154,7 +154,7 @@ int main (int argc, char ** argv)
       double stop=usecond();
       double time = (stop-start)/Nloop*1000;
       
-      double bytes=2*vol*Nvec*sizeof(Real);
+      double bytes=2.0*vol*Nvec*sizeof(Real);
       double flops=vol*Nvec*1;// mul
       std::cout<<GridLogMessage <<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;
 
@@ -169,14 +169,14 @@ int main (int argc, char ** argv)
   for(int lat=8;lat<=lmax;lat+=8){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+      int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
       uint64_t Nloop=NLOOP;
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 
-      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-      LatticeVec z(&Grid); random(pRNG,z);
-      LatticeVec x(&Grid); random(pRNG,x);
-      LatticeVec y(&Grid); random(pRNG,y);
+      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+      LatticeVec z(&Grid);// random(pRNG,z);
+      LatticeVec x(&Grid);// random(pRNG,x);
+      LatticeVec y(&Grid);// random(pRNG,y);
       RealD a=2.0;
       Real nn;      
       double start=usecond();
@@ -187,7 +187,7 @@ int main (int argc, char ** argv)
       double stop=usecond();
       double time = (stop-start)/Nloop*1000;
       
-      double bytes=vol*Nvec*sizeof(Real);
+      double bytes=1.0*vol*Nvec*sizeof(Real);
       double flops=vol*Nvec*2;// mul,add
       std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"<<(stop-start)/1000./1000.<< "\t\t " <<std::endl;
 
diff --git a/lib/allocator/AlignedAllocator.h b/lib/allocator/AlignedAllocator.h
index 4513ce26..db86c435 100644
--- a/lib/allocator/AlignedAllocator.h
+++ b/lib/allocator/AlignedAllocator.h
@@ -195,7 +195,8 @@ public:
 #endif
     size_type bytes = __n*sizeof(_Tp);
     uint8_t *cp = (uint8_t *)ptr;
-#pragma omp parallel for
+    // One touch per 4k page, static OMP loop to catch same loop order
+#pragma omp parallel for schedule(static)
     for(size_type n=0;n<bytes;n+=4096){
       cp[n]=0;
     }
diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc
index 632eb991..f5646d44 100644
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ -37,7 +37,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <sys/ipc.h>
 #include <sys/shm.h>
 #include <sys/mman.h>
-//#include <zlib.h>
+#include <zlib.h>
+#include <numaif.h>
 #ifndef SHM_HUGETLB
 #define SHM_HUGETLB 04000
 #endif
@@ -214,6 +215,23 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
       void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
       if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
       assert(((uint64_t)ptr&0x3F)==0);
+
+	int status;
+	int flags=MPOL_MF_MOVE;
+#ifdef KNL
+	int nodes=1; // numa domain == MCDRAM
+	// Find out if in SNC2,SNC4 mode ?
+#else
+	int nodes=r; // numa domain == MPI ID
+#endif
+	unsigned long count=1;
+      for(uint64_t page=0;page<size;page+=4096){
+	void *pages = (void *) ( page + (uint64_t)ptr );
+	uint64_t *cow_it = (uint64_t *)pages;	*cow_it = 1;
+	ierr= move_pages(0,count, &pages,&nodes,&status,flags);
+	if (ierr && (page==0)) perror("numa relocate command failed");
+      }
+
       ShmCommBufs[r] =ptr;
       
     }

From 8a4714a4a6ca1e9a613e097e892f9c78cb05c4e1 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@users.noreply.github.com>
Date: Sun, 9 Jul 2017 00:11:54 +0100
Subject: [PATCH 132/177] Update README.md

---
 README.md | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/README.md b/README.md
index 3572be26..e0a9bb14 100644
--- a/README.md
+++ b/README.md
@@ -324,6 +324,60 @@ one rank per socket. If using the Intel MPI library, threads should be pinned to
 ```
 This is the default. 
 
+### Build setup for AMD EPYC / RYZEN
+
+The AMD EPYC is a multichip module comprising 32 cores spread over four distinct chips each with 8 cores.
+So, even with a single socket node there is a quad-chip module. Dual socket nodes with 64 cores total
+are common. Each chip within the module exposes a separate NUMA domain.
+There are four NUMA domains per socket and we recommend one MPI rank per NUMA domain.
+MPI-3 is recommended with the use of four ranks per socket,
+and 8 threads per rank. 
+
+The following configuration is recommended for the AMD EPYC platform.
+
+``` bash
+../configure --enable-precision=double\
+             --enable-simd=AVX2       \
+             --enable-comms=mpi3 \
+             CXX=mpicxx 
+```
+
+If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
+``` bash
+               --with-gmp=<path>        \
+               --with-mpfr=<path>       \
+```
+where `<path>` is the UNIX prefix where GMP and MPFR are installed. 
+
+Using MPICH and g++ v4.9.2, best performance can be obtained using explicit GOMP_CPU_AFFINITY flags for each MPI rank.
+This can be done by invoking MPI on a wrapper script omp_bind.sh to handle this. 
+
+It is recommended to run 8 MPI ranks on a single dual socket AMD EPYC, with 8 threads per rank using MPI3 and
+shared memory to communicate within this node:
+
+mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --mpi 2.2.2.1 --dslash-unroll --threads 8 --grid 16.16.16.16 --cacheblocking 4.4.4.4 
+
+Where omp_bind.sh does the following:
+```
+#!/bin/bash
+
+numanode=` expr $PMI_RANK % 8 `
+basecore=`expr $numanode \* 16`
+core0=`expr $basecore + 0 `
+core1=`expr $basecore + 2 `
+core2=`expr $basecore + 4 `
+core3=`expr $basecore + 6 `
+core4=`expr $basecore + 8 `
+core5=`expr $basecore + 10 `
+core6=`expr $basecore + 12 `
+core7=`expr $basecore + 14 `
+
+export GOMP_CPU_AFFINITY="$core0 $core1 $core2 $core3 $core4 $core5 $core6 $core7"
+echo GOMP_CPU_AFFINITY $GOMP_CPU_AFFINITY
+
+$@
+```
+
 ### Build setup for BlueGene/Q
 
 To be written...

From dc6f078246b006ad1b3e61c513273b73f8f0da81 Mon Sep 17 00:00:00 2001
From: azusayamaguchi <ayamaguc@staffmail.ed.ac.uk>
Date: Tue, 11 Jul 2017 14:15:08 +0100
Subject: [PATCH 133/177] fixed the header file for mpi3

---
 configure.ac                          |  8 +++++++-
 lib/communicator/Communicator_mpi3.cc | 18 +++++++++++-------
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/configure.ac b/configure.ac
index 8c43d67a..dc6754da 100644
--- a/configure.ac
+++ b/configure.ac
@@ -51,6 +51,7 @@ AC_CHECK_HEADERS(malloc/malloc.h)
 AC_CHECK_HEADERS(malloc.h)
 AC_CHECK_HEADERS(endian.h)
 AC_CHECK_HEADERS(execinfo.h)
+AC_CHECK_HEADERS(numaif.h)
 AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
 AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
 
@@ -186,9 +187,14 @@ Info at: http://usqcd.jlab.org/usqcd-docs/c-lime/)])
 
 AC_SEARCH_LIBS([crc32], [z],
                [AC_DEFINE([HAVE_ZLIB], [1], [Define to 1 if you have the `LIBZ' library])]
-               [have_zlib=true],
+               [have_zlib=true] [LIBS="${LIBS} -lz"],
 	       [AC_MSG_ERROR(zlib library was not found in your system.)])
 
+AC_SEARCH_LIBS([move_pages], [numa],
+               [AC_DEFINE([HAVE_LIBNUMA], [1], [Define to 1 if you have the `LIBNUMA' library])]
+               [have_libnuma=true] [LIBS="${LIBS} -lnuma"],
+	       [AC_MSG_WARN(libnuma library was not found in your system. Some optimisations will not apply)])
+
 AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp],
                [AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])]
                [have_hdf5=true]
diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc
index f5646d44..4192300b 100644
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ -38,7 +38,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <sys/shm.h>
 #include <sys/mman.h>
 #include <zlib.h>
+#ifdef HAVE_NUMAIF_H
 #include <numaif.h>
+#endif
 #ifndef SHM_HUGETLB
 #define SHM_HUGETLB 04000
 #endif
@@ -216,6 +218,8 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
       if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
       assert(((uint64_t)ptr&0x3F)==0);
 
+      // Try to force numa domain on the shm segment if we have numaif.h
+#ifdef HAVE_NUMAIF_H
 	int status;
 	int flags=MPOL_MF_MOVE;
 #ifdef KNL
@@ -225,13 +229,13 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
 	int nodes=r; // numa domain == MPI ID
 #endif
 	unsigned long count=1;
-      for(uint64_t page=0;page<size;page+=4096){
-	void *pages = (void *) ( page + (uint64_t)ptr );
-	uint64_t *cow_it = (uint64_t *)pages;	*cow_it = 1;
-	ierr= move_pages(0,count, &pages,&nodes,&status,flags);
-	if (ierr && (page==0)) perror("numa relocate command failed");
-      }
-
+	for(uint64_t page=0;page<size;page+=4096){
+	  void *pages = (void *) ( page + (uint64_t)ptr );
+	  uint64_t *cow_it = (uint64_t *)pages;	*cow_it = 1;
+	  ierr= move_pages(0,count, &pages,&nodes,&status,flags);
+	  if (ierr && (page==0)) perror("numa relocate command failed");
+	}
+#endif
       ShmCommBufs[r] =ptr;
       
     }

From 659d7d1a4051a950d6a14bf7513a0847010de926 Mon Sep 17 00:00:00 2001
From: azusayamaguchi <ayamaguc@staffmail.ed.ac.uk>
Date: Wed, 12 Jul 2017 15:01:48 +0100
Subject: [PATCH 134/177] For test/solver Fixed

---
 lib/lattice/Lattice_reduction.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h
index c5b20f3c..38982891 100644
--- a/lib/lattice/Lattice_reduction.h
+++ b/lib/lattice/Lattice_reduction.h
@@ -540,7 +540,8 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
       for(int i=0;i<Nblock;i++){
       for(int j=0;j<Nblock;j++){
 	auto tmp = innerProduct(Left[i],Right[j]);
-	vector_typeD rtmp = TensorRemove(tmp);
+	//	vector_typeD rtmp = TensorRemove(tmp);
+	auto rtmp = TensorRemove(tmp);
 	mat_thread(i,j) += Reduce(rtmp);
       }}
     }}

From 2d8aff36fe46cd692634ceacbacf43308d204fb4 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@users.noreply.github.com>
Date: Fri, 14 Jul 2017 22:52:16 +0100
Subject: [PATCH 135/177] Update README.md

---
 README.md | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/README.md b/README.md
index e0a9bb14..ea20d0ec 100644
--- a/README.md
+++ b/README.md
@@ -324,6 +324,17 @@ one rank per socket. If using the Intel MPI library, threads should be pinned to
 ```
 This is the default. 
 
+** Expected Skylake Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): **
+
+mpirun -n 2 benchmarks/Benchmark_dwf --grid 16.16.16.16 --mpi 2.1.1.1 --cacheblocking 2.2.2.2 --dslash-asm --shm 1024 --threads 18 
+Average mflops/s per call per node (full): ** 498739 ** 4d vec
+Average mflops/s per call per node (full): ** 457786 ** 4d vec, fp16 comms
+Average mflops/s per call per node (full): ** 572645 ** 5d vec
+Average mflops/s per call per node (full): ** 721206 ** 5d vec, red black
+Average mflops/s per call per node (full): ** 634542 ** 4d vec, red black
+
+
+
 ### Build setup for AMD EPYC / RYZEN
 
 The AMD EPYC is a multichip module comprising 32 cores spread over four distinct chips each with 8 cores.
@@ -378,6 +389,17 @@ echo GOMP_CUP_AFFINITY $GOMP_CPU_AFFINITY
 $@
 ```
 
+Performance:
+
+** Expected EPYC 7601 Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): **
+
+mpirun  -np 8 ./omp_bind.sh ./Benchmark_dwf --threads 8 --mpi 2.2.2.1 --dslash-unroll --grid 16.16.16.16 --cacheblocking 4.4.4.4
+Average mflops/s per call per node (full): **420235** 4d vec
+Average mflops/s per call per node (full): **437617** 4d vec, fp16 comms
+Average mflops/s per call per node (full): **522988** 5d vec
+Average mflops/s per call per node (full): **588984** 5d vec, red black
+Average mflops/s per call per node (full): **508423** 4d vec, red black
+
 ### Build setup for BlueGene/Q
 
 To be written...

From 169f4b2711f0131f1909738c2b631ced3e47c9e1 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@users.noreply.github.com>
Date: Fri, 14 Jul 2017 22:56:06 +0100
Subject: [PATCH 136/177] Update README.md

---
 README.md | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index ea20d0ec..124c7bfa 100644
--- a/README.md
+++ b/README.md
@@ -327,11 +327,11 @@ This is the default.
 ** Expected Skylake Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): **
 
 mpirun -n 2 benchmarks/Benchmark_dwf --grid 16.16.16.16 --mpi 2.1.1.1 --cacheblocking 2.2.2.2 --dslash-asm --shm 1024 --threads 18 
-Average mflops/s per call per node (full): ** 498739 ** 4d vec
-Average mflops/s per call per node (full): ** 457786 ** 4d vec, fp16 comms
-Average mflops/s per call per node (full): ** 572645 ** 5d vec
-Average mflops/s per call per node (full): ** 721206 ** 5d vec, red black
-Average mflops/s per call per node (full): ** 634542 ** 4d vec, red black
+- Average mflops/s per call per node (full):  498739 : 4d vec
+- Average mflops/s per call per node (full):  457786 : 4d vec, fp16 comms
+- Average mflops/s per call per node (full):  572645 : 5d vec
+- Average mflops/s per call per node (full):  721206 : 5d vec, red black
+- Average mflops/s per call per node (full):  634542 : 4d vec, red black
 
 
 
@@ -391,14 +391,14 @@ $@
 
 Performance:
 
-** Expected EPYC 7601 Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): **
+### Expected EPYC 7601 Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): **
 
 mpirun  -np 8 ./omp_bind.sh ./Benchmark_dwf --threads 8 --mpi 2.2.2.1 --dslash-unroll --grid 16.16.16.16 --cacheblocking 4.4.4.4
-Average mflops/s per call per node (full): **420235** 4d vec
-Average mflops/s per call per node (full): **437617** 4d vec, fp16 comms
-Average mflops/s per call per node (full): **522988** 5d vec
-Average mflops/s per call per node (full): **588984** 5d vec, red black
-Average mflops/s per call per node (full): **508423** 4d vec, red black
+- Average mflops/s per call per node (full): 420235 : 4d vec
+- Average mflops/s per call per node (full): 437617 : 4d vec, fp16 comms
+- Average mflops/s per call per node (full): 522988 : 5d vec
+- Average mflops/s per call per node (full): 588984 : 5d vec, red black
+- Average mflops/s per call per node (full): 508423 : 4d vec, red black
 
 ### Build setup for BlueGene/Q
 

From f038c6babe1ec5cd3772c4bcb892d19709dc96f5 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@users.noreply.github.com>
Date: Fri, 14 Jul 2017 22:59:16 +0100
Subject: [PATCH 137/177] Update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 124c7bfa..a185063e 100644
--- a/README.md
+++ b/README.md
@@ -324,7 +324,7 @@ one rank per socket. If using the Intel MPI library, threads should be pinned to
 ```
 This is the default. 
 
-** Expected Skylake Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): **
+#### Expected Skylake Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): 
 
 mpirun -n 2 benchmarks/Benchmark_dwf --grid 16.16.16.16 --mpi 2.1.1.1 --cacheblocking 2.2.2.2 --dslash-asm --shm 1024 --threads 18 
 - Average mflops/s per call per node (full):  498739 : 4d vec
@@ -391,7 +391,7 @@ $@
 
 Performance:
 
-### Expected EPYC 7601 Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): **
+#### Expected AMD EPYC 7601 dual socket (single prec, single node 32+32 cores) performance using NUMA MPI mapping): 
 
 mpirun  -np 8 ./omp_bind.sh ./Benchmark_dwf --threads 8 --mpi 2.2.2.1 --dslash-unroll --grid 16.16.16.16 --cacheblocking 4.4.4.4
 - Average mflops/s per call per node (full): 420235 : 4d vec

From fe4912880d3ceaf96023e5074682cc4ee43cb871 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@users.noreply.github.com>
Date: Mon, 17 Jul 2017 09:53:07 +0100
Subject: [PATCH 138/177] Update README.md

---
 README.md | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index a185063e..1e0988f3 100644
--- a/README.md
+++ b/README.md
@@ -327,12 +327,8 @@ This is the default.
 #### Expected Skylake Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): 
 
 mpirun -n 2 benchmarks/Benchmark_dwf --grid 16.16.16.16 --mpi 2.1.1.1 --cacheblocking 2.2.2.2 --dslash-asm --shm 1024 --threads 18 
-- Average mflops/s per call per node (full):  498739 : 4d vec
-- Average mflops/s per call per node (full):  457786 : 4d vec, fp16 comms
-- Average mflops/s per call per node (full):  572645 : 5d vec
-- Average mflops/s per call per node (full):  721206 : 5d vec, red black
-- Average mflops/s per call per node (full):  634542 : 4d vec, red black
 
+TBA
 
 
 ### Build setup for AMD EPYC / RYZEN
@@ -394,11 +390,8 @@ Performance:
 #### Expected AMD EPYC 7601 dual socket (single prec, single node 32+32 cores) performance using NUMA MPI mapping): 
 
 mpirun  -np 8 ./omp_bind.sh ./Benchmark_dwf --threads 8 --mpi 2.2.2.1 --dslash-unroll --grid 16.16.16.16 --cacheblocking 4.4.4.4
-- Average mflops/s per call per node (full): 420235 : 4d vec
-- Average mflops/s per call per node (full): 437617 : 4d vec, fp16 comms
-- Average mflops/s per call per node (full): 522988 : 5d vec
-- Average mflops/s per call per node (full): 588984 : 5d vec, red black
-- Average mflops/s per call per node (full): 508423 : 4d vec, red black
+
+TBA
 
 ### Build setup for BlueGene/Q
 

From 0f214ad427c2f903bc5effeb453f5bed27034cc5 Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@phys.columbia.edu>
Date: Fri, 21 Jul 2017 11:13:51 -0400
Subject: [PATCH 139/177] Moved FourierAcceleratedGaugeFixer into Grid::QCD
 namespace and removed 'using namespace' directives from header

---
 lib/qcd/utils/GaugeFix.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/lib/qcd/utils/GaugeFix.h b/lib/qcd/utils/GaugeFix.h
index 4ff216e4..f2ea1aa2 100644
--- a/lib/qcd/utils/GaugeFix.h
+++ b/lib/qcd/utils/GaugeFix.h
@@ -26,12 +26,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
     /*  END LEGAL */
 //#include <Grid/Grid.h>
 
-using namespace Grid;
-using namespace Grid::QCD;
+namespace Grid {
+namespace QCD {
 
 template <class Gimpl> 
 class FourierAcceleratedGaugeFixer  : public Gimpl {
-  public:
+ public:
   INHERIT_GIMPL_TYPES(Gimpl);
 
   typedef typename Gimpl::GaugeLinkField GaugeMat;
@@ -186,3 +186,5 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
   }  
 };
 
+}
+}

From 56967818626452a318c058684b9594adca4f7fa4 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Wed, 26 Jul 2017 12:07:34 +0100
Subject: [PATCH 140/177] Debug error in Tensor mult

---
 lib/tensors/Tensor_arith_mul.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/tensors/Tensor_arith_mul.h b/lib/tensors/Tensor_arith_mul.h
index c24853b7..a474db9c 100644
--- a/lib/tensors/Tensor_arith_mul.h
+++ b/lib/tensors/Tensor_arith_mul.h
@@ -98,7 +98,9 @@ template<class rtype,class vtype,class mtype,int N>
 strong_inline void mult(iVector<rtype,N> * __restrict__ ret,
                  const iVector<vtype,N> * __restrict__ rhs,
                  const iScalar<mtype> * __restrict__ lhs){
-    mult(ret,lhs,rhs);
+    for(int c1=0;c1<N;c1++){
+        mult(&ret->_internal[c1],&rhs->_internal[c1],&lhs->_internal);
+    }                 
 }
     
 

From 237cfd11ab493e1ea8ffaf24fc1da5171b8b929a Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Wed, 26 Jul 2017 12:08:51 +0100
Subject: [PATCH 141/177] Solving the spurious O2 flags

---
 configure.ac | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index dc6754da..a028fb0a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -13,6 +13,10 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
 ################ Get git info
 #AC_REVISION([m4_esyscmd_s([./scripts/configure.commit])])
 
+################ Set flags
+# do not move!
+CXXFLAGS="-O3 $CXXFLAGS"
+
 ############### Checks for programs
 AC_PROG_CXX
 AC_PROG_RANLIB
@@ -27,7 +31,6 @@ AX_GXX_VERSION
 AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"],
       [version of g++ that will compile the code])
 
-CXXFLAGS="-O3 $CXXFLAGS"
 
 
 ############### Checks for typedefs, structures, and compiler characteristics

From 7abc5613bde6fb4e704145b0f2a4c8fa19090944 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Wed, 26 Jul 2017 16:21:17 +0100
Subject: [PATCH 142/177] Added smearing to the topological charge observable

---
 lib/qcd/modules/ObservableModules.h      | 15 ++---
 lib/qcd/observables/topological_charge.h | 70 +++++++++++++++++++++---
 tests/hmc/Test_hmc_WilsonGauge.cc        |  5 +-
 3 files changed, 73 insertions(+), 17 deletions(-)

diff --git a/lib/qcd/modules/ObservableModules.h b/lib/qcd/modules/ObservableModules.h
index 579fc1ec..24511617 100644
--- a/lib/qcd/modules/ObservableModules.h
+++ b/lib/qcd/modules/ObservableModules.h
@@ -84,8 +84,6 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters>
   typedef ObservableModule<PlaquetteLogger<Impl>, NoParameters> ObsBase;
   using ObsBase::ObsBase; // for constructors
 
-
-
   // acquire resource
   virtual void initialize(){
     this->ObservablePtr.reset(new PlaquetteLogger<Impl>());
@@ -94,23 +92,22 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters>
   PlaquetteMod(): ObsBase(NoParameters()){}
 };
 
+
 template < class Impl >
-class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, NoParameters>{
-  typedef ObservableModule<TopologicalCharge<Impl>, NoParameters> ObsBase;
+class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters>{
+  typedef ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters> ObsBase;
   using ObsBase::ObsBase; // for constructors
 
-
-
   // acquire resource
   virtual void initialize(){
-    this->ObservablePtr.reset(new TopologicalCharge<Impl>());
+    this->ObservablePtr.reset(new TopologicalCharge<Impl>(this->Par_));
   }
   public:
-  TopologicalChargeMod(): ObsBase(NoParameters()){}
+  TopologicalChargeMod(TopologyObsParameters Par): ObsBase(Par){}
+  TopologicalChargeMod(): ObsBase(){}
 };
 
 
-
 }// QCD temporarily here
 
 
diff --git a/lib/qcd/observables/topological_charge.h b/lib/qcd/observables/topological_charge.h
index 5d09c420..c2c419fb 100644
--- a/lib/qcd/observables/topological_charge.h
+++ b/lib/qcd/observables/topological_charge.h
@@ -33,9 +33,45 @@ directory
 namespace Grid {
 namespace QCD {
 
+struct TopologySmearingParameters : Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(TopologySmearingParameters,
+    int, steps,
+    float, step_size,
+    int, meas_interval,
+    float, maxTau);
+
+    TopologySmearingParameters(int s = 0, float ss = 0.0f, int mi = 0, float mT = 0.0f):
+        steps(s), step_size(ss), meas_interval(mi), maxTau(mT){}
+
+    template < class ReaderClass >
+    TopologySmearingParameters(Reader<ReaderClass>& Reader){
+        read(Reader, "Smearing", *this);  
+    }  
+};
+
+
+
+struct TopologyObsParameters : Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(TopologyObsParameters,
+      int, interval,
+      bool, do_smearing,
+      TopologySmearingParameters, Smearing);  
+
+    TopologyObsParameters(int interval = 1, bool smearing = false): // 'smearing' is the on/off flag, not a step count
+        interval(interval), do_smearing(smearing){} // Smearing is default-constructed (all-zero flow parameters)
+
+    template <class ReaderClass >
+      TopologyObsParameters(Reader<ReaderClass>& Reader){
+        read(Reader, "TopologyMeasurement", *this);
+  }
+};
+
+
 // this is only defined for a gauge theory
 template <class Impl>
 class TopologicalCharge : public HmcObservable<typename Impl::Field> {
+    TopologyObsParameters Pars;
+
  public:
     // here forces the Impl to be of gauge fields
     // if not the compiler will complain
@@ -44,20 +80,40 @@ class TopologicalCharge : public HmcObservable<typename Impl::Field> {
     // necessary for HmcObservable compatibility
     typedef typename Impl::Field Field;
 
+    TopologicalCharge(int interval = 1, bool do_smearing = false):
+        Pars(interval, do_smearing){}
+    
+    TopologicalCharge(TopologyObsParameters P):Pars(P){
+        std::cout << GridLogDebug << "Creating TopologicalCharge " << std::endl;
+    }
+
     void TrajectoryComplete(int traj,
                             Field &U,
                             GridSerialRNG &sRNG,
                             GridParallelRNG &pRNG) {
 
-    Real q = WilsonLoops<Impl>::TopologicalCharge(U);
+    if (traj%Pars.interval == 0){
+        // Smearing
+        Field Usmear = U;
+        int def_prec = std::cout.precision();
+        
+        if (Pars.do_smearing){
+            // using wilson flow by default here
+            std::cout << "1. " << Pars.Smearing.step_size << std::endl;
+            WilsonFlow<PeriodicGimplR> WF(Pars.Smearing.steps, Pars.Smearing.step_size, Pars.Smearing.meas_interval);
+            WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau);
+            Real T0   = WF.energyDensityPlaquette(Usmear);
+            std::cout << GridLogMessage << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
+                      << "T0                : [ " << traj << " ] "<< T0 << std::endl;
+        }
 
-    int def_prec = std::cout.precision();
+        Real q    = WilsonLoops<Impl>::TopologicalCharge(Usmear);
+        std::cout << GridLogMessage
+            << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
+            << "Topological Charge: [ " << traj << " ] "<< q << std::endl;
 
-    std::cout << GridLogMessage
-        << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
-        << "Topological Charge: [ " << traj << " ] "<< q << std::endl;
-
-    std::cout.precision(def_prec);
+        std::cout.precision(def_prec);
+        }
     }
 
 };
diff --git a/tests/hmc/Test_hmc_WilsonGauge.cc b/tests/hmc/Test_hmc_WilsonGauge.cc
index b2d5fb02..4cf6d923 100644
--- a/tests/hmc/Test_hmc_WilsonGauge.cc
+++ b/tests/hmc/Test_hmc_WilsonGauge.cc
@@ -66,7 +66,10 @@ int main(int argc, char **argv) {
   typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
   typedef TopologicalChargeMod<HMCWrapper::ImplPolicy> QObs;
   TheHMC.Resources.AddObservable<PlaqObs>();
-  TheHMC.Resources.AddObservable<QObs>();
+  TopologyObsParameters TopParams;
+  TopParams.interval = 1;
+  TopParams.do_smearing = false; 
+  TheHMC.Resources.AddObservable<QObs>(TopParams);
   //////////////////////////////////////////////
 
   /////////////////////////////////////////////////////////////

From c0485d799d915637fdc455dfa900ee9786f7cd69 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Wed, 26 Jul 2017 16:26:04 +0100
Subject: [PATCH 143/177] Explicit parameter declaration in the WilsonGauge
 test

---
 lib/qcd/observables/topological_charge.h | 1 -
 tests/hmc/Test_hmc_WilsonGauge.cc        | 8 ++++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/lib/qcd/observables/topological_charge.h b/lib/qcd/observables/topological_charge.h
index c2c419fb..5af8d77b 100644
--- a/lib/qcd/observables/topological_charge.h
+++ b/lib/qcd/observables/topological_charge.h
@@ -99,7 +99,6 @@ class TopologicalCharge : public HmcObservable<typename Impl::Field> {
         
         if (Pars.do_smearing){
             // using wilson flow by default here
-            std::cout << "1. " << Pars.Smearing.step_size << std::endl;
             WilsonFlow<PeriodicGimplR> WF(Pars.Smearing.steps, Pars.Smearing.step_size, Pars.Smearing.meas_interval);
             WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau);
             Real T0   = WF.energyDensityPlaquette(Usmear);
diff --git a/tests/hmc/Test_hmc_WilsonGauge.cc b/tests/hmc/Test_hmc_WilsonGauge.cc
index 4cf6d923..05bf81a2 100644
--- a/tests/hmc/Test_hmc_WilsonGauge.cc
+++ b/tests/hmc/Test_hmc_WilsonGauge.cc
@@ -67,8 +67,12 @@ int main(int argc, char **argv) {
   typedef TopologicalChargeMod<HMCWrapper::ImplPolicy> QObs;
   TheHMC.Resources.AddObservable<PlaqObs>();
   TopologyObsParameters TopParams;
-  TopParams.interval = 1;
-  TopParams.do_smearing = false; 
+  TopParams.interval = 5;
+  TopParams.do_smearing = true;
+  TopParams.Smearing.steps = 200;
+  TopParams.Smearing.step_size = 0.01;
+  TopParams.Smearing.meas_interval = 50;
+  TopParams.Smearing.maxTau = 2.0; 
   TheHMC.Resources.AddObservable<QObs>(TopParams);
   //////////////////////////////////////////////
 

From c7036f671754710c41de00cb0fa90a6e35104467 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Thu, 27 Jul 2017 11:15:09 +0100
Subject: [PATCH 144/177] Adding checks for libm and libstdc++

---
 configure.ac | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/configure.ac b/configure.ac
index a028fb0a..bf078b13 100644
--- a/configure.ac
+++ b/configure.ac
@@ -58,6 +58,10 @@ AC_CHECK_HEADERS(numaif.h)
 AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
 AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
 
+############## Standard libraries
+AC_CHECK_LIB([m],[cos])
+AC_CHECK_LIB([stdc++],[abort])
+
 ############### GMP and MPFR
 AC_ARG_WITH([gmp],
     [AS_HELP_STRING([--with-gmp=prefix],

From 8bd869da37fc3911665213f96e431e3b60cb0332 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Thu, 27 Jul 2017 15:12:50 +0100
Subject: [PATCH 145/177] Correcting a bug in the IO routines

---
 lib/parallelIO/BinaryIO.h | 133 ++++++++++++++++++++++++--------------
 1 file changed, 86 insertions(+), 47 deletions(-)

diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h
index 117bec01..108e7ef8 100644
--- a/lib/parallelIO/BinaryIO.h
+++ b/lib/parallelIO/BinaryIO.h
@@ -98,35 +98,39 @@ class BinaryIO {
 
     NerscChecksum(grid,scalardata,nersc_csum);
   }
-  
-  template<class fobj> static inline void NerscChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &nersc_csum)
+
+  template <class fobj>
+  static inline void NerscChecksum(GridBase *grid, std::vector<fobj> &fbuf, uint32_t &nersc_csum)
   {
-    const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
+    const uint64_t size32 = sizeof(fobj) / sizeof(uint32_t);
 
-
-    uint64_t lsites              =grid->lSites();
-    if (fbuf.size()==1) {
-      lsites=1;
+    uint64_t lsites = grid->lSites();
+    if (fbuf.size() == 1)
+    {
+      lsites = 1;
     }
 
-#pragma omp parallel
-    { 
-      uint32_t nersc_csum_thr=0;
+    #pragma omp parallel
+    {
+      uint32_t nersc_csum_thr = 0;
 
-#pragma omp for
-      for(uint64_t local_site=0;local_site<lsites;local_site++){
-	uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
-	for(uint64_t j=0;j<size32;j++){
-	  nersc_csum_thr=nersc_csum_thr+site_buf[j];
-	}
+      #pragma omp for
+      for (uint64_t local_site = 0; local_site < lsites; local_site++)
+      {
+        uint32_t *site_buf = (uint32_t *)&fbuf[local_site];
+        for (uint64_t j = 0; j < size32; j++)
+        {
+          nersc_csum_thr = nersc_csum_thr + site_buf[j];
+        }
       }
 
-#pragma omp critical
+      #pragma omp critical
       {
-	nersc_csum  += nersc_csum_thr;
+        nersc_csum += nersc_csum_thr;
       }
     }
   }
+
   template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb)
   {
     const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
@@ -266,7 +270,7 @@ class BinaryIO {
     grid->Barrier();
     GridStopWatch timer; 
     GridStopWatch bstimer;
-
+    
     nersc_csum=0;
     scidac_csuma=0;
     scidac_csumb=0;
@@ -362,18 +366,22 @@ class BinaryIO {
 #else 
 	assert(0);
 #endif
-      } else { 
-	std::cout<< GridLogMessage<< "C++ read I/O "<< file<<" : "
-		 << iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
-	std::ifstream fin;
-	fin.open(file,std::ios::binary|std::ios::in);
-	if ( control & BINARYIO_MASTER_APPEND )  {
-	  fin.seekg(-sizeof(fobj),fin.end);
-	} else { 
-	  fin.seekg(offset+myrank*lsites*sizeof(fobj));
-	}
-	fin.read((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fin.fail()==0);
-	fin.close();
+      } else {
+        std::cout << GridLogMessage << "C++ read I/O " << file << " : "
+                  << iodata.size() * sizeof(fobj) << " bytes" << std::endl;
+        std::ifstream fin;
+        fin.open(file, std::ios::binary | std::ios::in);
+        if (control & BINARYIO_MASTER_APPEND)
+        {
+          fin.seekg(-sizeof(fobj), fin.end);
+        }
+        else
+        {
+          fin.seekg(offset + myrank * lsites * sizeof(fobj));
+        }
+        fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj));
+        assert(fin.fail() == 0);
+        fin.close();
       }
       timer.Stop();
 
@@ -416,19 +424,47 @@ class BinaryIO {
 	assert(0);
 #endif
       } else { 
-	std::ofstream fout; fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
-	std::cout<< GridLogMessage<< "C++ write I/O "<< file<<" : "
-		 << iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
-	if ( control & BINARYIO_MASTER_APPEND )  {
+        
+	std::ofstream fout; 
+  fout.exceptions ( std::fstream::failbit | std::fstream::badbit );
+  try {
+    fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
+  } catch (const std::fstream::failure& exc) {
+    std::cout << GridLogError << "Error in opening the file " << file << " for output" <<std::endl;
+    std::cout << GridLogError << "Exception description: " << exc.what() << std::endl;
+    std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl;
+    #ifdef USE_MPI_IO
+    MPI_Abort(MPI_COMM_WORLD,1);
+    #else
+    exit(1);
+    #endif
+  }
+	std::cout << GridLogMessage<< "C++ write I/O "<< file<<" : "
+		        << iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
+	
+  if ( control & BINARYIO_MASTER_APPEND )  {
 	  fout.seekp(0,fout.end);
 	} else {
 	  fout.seekp(offset+myrank*lsites*sizeof(fobj));
 	}
-	fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fout.fail()==0);
+  
+  try { // fout was put in exception mode above, so a failed write throws instead of setting failbit silently
+  	fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));//assert( fout.fail()==0);
+  }
+  catch (const std::fstream::failure& exc) {
+    std::cout << GridLogError << "Exception in writing file " << file << std::endl; // routed through GridLogError like the open-failure handler
+    std::cout << GridLogError << "Exception description: "<< exc.what() << std::endl;
+    #ifdef USE_MPI_IO
+    MPI_Abort(MPI_COMM_WORLD,1);
+    #else
+    exit(1);
+    #endif
+  }
+
 	fout.close();
-      }
-      timer.Stop();
-    }
+  }
+  timer.Stop();
+  }
 
     std::cout<<GridLogMessage<<"IOobject: ";
     if ( control & BINARYIO_READ) std::cout << " read  ";
@@ -442,11 +478,14 @@ class BinaryIO {
     //////////////////////////////////////////////////////////////////////////////
     // Safety check
     //////////////////////////////////////////////////////////////////////////////
-    grid->Barrier();
-    grid->GlobalSum(nersc_csum);
-    grid->GlobalXOR(scidac_csuma);
-    grid->GlobalXOR(scidac_csumb);
-    grid->Barrier();
+    // if the data size is 1 we do not want to sum over the MPI ranks
+    if (iodata.size() != 1){
+      grid->Barrier();
+      grid->GlobalSum(nersc_csum);
+      grid->GlobalXOR(scidac_csuma);
+      grid->GlobalXOR(scidac_csumb);
+      grid->Barrier();
+    }
   }
 
   /////////////////////////////////////////////////////////////////////////////
@@ -546,9 +585,9 @@ class BinaryIO {
     int gsites = grid->gSites();
     int lsites = grid->lSites();
 
-    uint32_t nersc_csum_tmp;
-    uint32_t scidac_csuma_tmp;
-    uint32_t scidac_csumb_tmp;
+    uint32_t nersc_csum_tmp   = 0;
+    uint32_t scidac_csuma_tmp = 0;
+    uint32_t scidac_csumb_tmp = 0;
 
     GridStopWatch timer;
 

From 14d53e1c9eb8eb1ef684148728c075813814612e Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Sat, 29 Jul 2017 13:06:53 -0400
Subject: [PATCH 146/177] Threaded MPI calls patches

---
 benchmarks/Benchmark_dwf.cc               |  2 +-
 lib/allocator/AlignedAllocator.h          | 10 ++-
 lib/communicator/Communicator_base.cc     |  4 +-
 lib/communicator/Communicator_base.h      | 14 ++++-
 lib/communicator/Communicator_mpit.cc     | 25 +++++++-
 lib/qcd/action/fermion/WilsonFermion5D.cc | 74 ++++++++++++++---------
 lib/stencil/Stencil.h                     | 59 +++++++++---------
 lib/util/Init.cc                          |  6 +-
 8 files changed, 128 insertions(+), 66 deletions(-)

diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc
index a071c050..0264905c 100644
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -489,7 +489,7 @@ int main (int argc, char ** argv)
 
   //assert(norm2(src_e)<1.0e-4);
   //assert(norm2(src_o)<1.0e-4);
-
+  exit(0);
   Grid_finalize();
 }
 
diff --git a/lib/allocator/AlignedAllocator.h b/lib/allocator/AlignedAllocator.h
index 6e85ab27..7fd9496f 100644
--- a/lib/allocator/AlignedAllocator.h
+++ b/lib/allocator/AlignedAllocator.h
@@ -92,11 +92,15 @@ public:
     size_type bytes = __n*sizeof(_Tp);
 
     _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
-    
+    //////////////////
+    // Hack 2MB align; could make option probably doesn't need configurability
+    //////////////////
+//define GRID_ALLOC_ALIGN (128)
+#define GRID_ALLOC_ALIGN (2*1024*1024)
 #ifdef HAVE_MM_MALLOC_H
-    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,128);
+    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
 #else
-    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes);
+    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
 #endif
 
     return ptr;
diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc
index a5edf8e9..67bfaed0 100644
--- a/lib/communicator/Communicator_base.cc
+++ b/lib/communicator/Communicator_base.cc
@@ -34,7 +34,9 @@ namespace Grid {
 ///////////////////////////////////////////////////////////////
 void *              CartesianCommunicator::ShmCommBuf;
 uint64_t            CartesianCommunicator::MAX_MPI_SHM_BYTES   = 128*1024*1024; 
-CartesianCommunicator::CommunicatorPolicy_t  CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
+CartesianCommunicator::CommunicatorPolicy_t  
+CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
+int CartesianCommunicator::nCommThreads = -1;
 
 /////////////////////////////////
 // Alloc, free shmem region
diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h
index 4e471b43..84dbedb4 100644
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@@ -54,8 +54,9 @@ class CartesianCommunicator {
   // 128MB shared memory for comms enought for 48^4 local vol comms
   // Give external control (command line override?) of this
 
-  static const int      MAXLOG2RANKSPERNODE = 16;            
-  static uint64_t MAX_MPI_SHM_BYTES;
+  static const int MAXLOG2RANKSPERNODE = 16;            
+  static uint64_t  MAX_MPI_SHM_BYTES;
+  static int       nCommThreads;
 
   // Communicator should know nothing of the physics grid, only processor grid.
   int              _Nprocessors;     // How many in all
@@ -125,7 +126,7 @@ class CartesianCommunicator {
   enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
   static CommunicatorPolicy_t CommunicatorPolicy;
   static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
-
+  
   size_t heap_top;
   size_t heap_bytes;
 
@@ -215,6 +216,12 @@ class CartesianCommunicator {
   
   void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
 
+  double StencilSendToRecvFrom(void *xmit,
+			       int xmit_to_rank,
+			       void *recv,
+			       int recv_from_rank,
+			       int bytes,int dir);
+
   double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 				    void *xmit,
 				    int xmit_to_rank,
@@ -222,6 +229,7 @@ class CartesianCommunicator {
 				    int recv_from_rank,
 				    int bytes,int dir);
   
+  
   void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
   void StencilBarrier(void);
 
diff --git a/lib/communicator/Communicator_mpit.cc b/lib/communicator/Communicator_mpit.cc
index 24a518ec..f522701c 100644
--- a/lib/communicator/Communicator_mpit.cc
+++ b/lib/communicator/Communicator_mpit.cc
@@ -242,7 +242,8 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 							 int recv_from_rank,
 							 int bytes,int dir)
 {
-
+  assert(false);
+  /*
   int myrank = _processor;
   int ierr;
   assert(dir < communicator_halo.size());
@@ -254,6 +255,28 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 		    communicator_halo[dir],MPI_STATUS_IGNORE);
   assert(ierr==0);
   return 2.0*bytes;
+  */
+}
+
+double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
+						    int xmit_to_rank,
+						    void *recv,
+						    int recv_from_rank,
+						    int bytes,int dir)
+{ // Blocking exchange on communicator_halo[dir]: posts recv+send, waits for both, returns 2*bytes
+  int myrank = _processor;
+  int ierr;
+  assert(dir < communicator_halo.size());
+  
+  //  std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
+  // Give the CPU to MPI immediately; can use threads to overlap optionally
+  MPI_Request req[2];
+  ierr = MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, // message tag is the sender's rank
+	    communicator_halo[dir],&req[1]); assert(ierr==MPI_SUCCESS);
+  ierr = MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank,myrank,
+	    communicator_halo[dir], &req[0]); assert(ierr==MPI_SUCCESS);
+  ierr = MPI_Waitall(2, req, MPI_STATUSES_IGNORE); assert(ierr==MPI_SUCCESS); // do not return before both transfers complete
+  return 2.0*bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 { 
diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc
index 6a6bc1f8..0b6c9e3d 100644
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -391,37 +391,57 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
   DhopFaceTime+=usecond();
 
   // Rely on async comms; start comms before merge of local data
-  DhopComputeTime-=usecond();
-  DhopCommTime-=usecond();
-#pragma omp parallel 
+  double ctime=0;
+  double ptime=0;
+  //  DhopComputeTime-=usecond();
+  //  DhopCommTime-=usecond();
+#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
   { 
-    // Should time this somehow; hard as the threads fork nowait
-    st.CommunicateThreaded();
-
-  if (dag == DaggerYes) {
-#pragma omp for
-    for (int ss = 0; ss < U._grid->oSites(); ss++) {
-      int sU = ss;
-      int sF = LLs * sU;
-      Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
+    int tid = omp_get_thread_num();
+    int nthreads = omp_get_num_threads();
+    int ncomms = CartesianCommunicator::nCommThreads;
+    if (ncomms == -1) ncomms = st.Packets.size(); 
+    assert(nthreads > ncomms);
+    if (tid >= ncomms) {
+      double start = usecond();
+      nthreads -= ncomms;
+      int ttid = tid - ncomms;
+      int n = U._grid->oSites();
+      int chunk = n / nthreads;
+      int rem = n % nthreads;
+      int myblock, myn;
+      if (ttid < rem) {
+	myblock = ttid * chunk + ttid;
+	myn = chunk+1;
+      } else {
+	myblock = ttid*chunk + rem;
+	myn = chunk;
+      }
+      
+      // do the compute
+      if (dag == DaggerYes) {
+	for (int ss = myblock; ss < myblock+myn; ++ss) {
+	  int sU = ss;
+	  int sF = LLs * sU;
+	  Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
+	}
+      } else {
+	for (int ss = myblock; ss < myblock+myn; ++ss) {
+	  int sU = ss;
+	  int sF = LLs * sU;
+	  Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
+	}
+      }
+	ptime = usecond() - start;
     }
-  } else {
-#pragma omp for
-    for (int ss = 0; ss < U._grid->oSites(); ss++) {
-      int sU = ss;
-      int sF = LLs * sU;
-      Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
+    {
+      double start = usecond();
+      st.CommunicateThreaded();
+      ctime = usecond() - start;
     }
   }
-#pragma omp single
-  DhopComputeTime+=usecond();
-
-#pragma omp taskwait 
-
-#pragma omp single
-  DhopCommTime+=usecond();
-  } // Closes parallel region and waits the comms (I hope)
-
+  DhopCommTime += ctime;
+  DhopComputeTime+=ptime;
 
   DhopFaceTime-=usecond();
   st.CommsMerge(compressor);
diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h
index 17db64d8..d1d7a7e0 100644
--- a/lib/stencil/Stencil.h
+++ b/lib/stencil/Stencil.h
@@ -185,6 +185,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
   double splicetime;
   double nosplicetime;
   double calls;
+  std::vector<double> comms_bytesthr;
+  std::vector<double> commtimethr;
 
   ////////////////////////////////////////
   // Stencil query
@@ -250,36 +252,22 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
   //////////////////////////////////////////
   void CommunicateThreaded()
   {
-    for(int i=0;i<Packets.size();i++){
-#pragma omp task 
-      {
-	double start;
-	double stop;
-	start = usecond();
-	uint64_t bytes;
-	std::vector<CommsRequest_t> reqs;
-	bytes=_grid->StencilSendToRecvFromBegin(reqs,
-					  Packets[i].send_buf,
-					  Packets[i].to_rank,
-					  Packets[i].recv_buf,
-					  Packets[i].from_rank,
-					  Packets[i].bytes,i);
-	_grid->StencilSendToRecvFromComplete(reqs,i);
-	// Last task logged; this is approximate but hard to catch
-	// the last to complete
-	stop = usecond();
-	stop = stop - start;
-
-	if ( i==0 ) commtime+=stop;
-
-#pragma omp critical
-	{
-	  comms_bytes+=bytes;
-	}
-
+    // must be called in parallel region
+    int mythread = omp_get_thread_num();
+    int nthreads = CartesianCommunicator::nCommThreads;
+    if (nthreads == -1) nthreads = Packets.size();
+    if (mythread < nthreads) {
+      for (int i = mythread; i < Packets.size(); i += nthreads) {
+	double start = usecond();
+	uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf,
+						      Packets[i].to_rank,
+						      Packets[i].recv_buf,
+						      Packets[i].from_rank,
+						      Packets[i].bytes,i);
+	comms_bytesthr[mythread] += bytes;
+	commtimethr[mythread] += usecond() - start;
       }
     }
-    
   }
   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
   {
@@ -475,7 +463,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 		  int checkerboard,
 		  const std::vector<int> &directions,
 		  const std::vector<int> &distances) 
-   :   _permute_type(npoints), _comm_buf_size(npoints)
+   : _permute_type(npoints), 
+    _comm_buf_size(npoints),
+    comms_bytesthr(npoints), 
+       commtimethr(npoints)
   {
     face_table_computed=0;
     _npoints = npoints;
@@ -1029,6 +1020,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
   void ZeroCounters(void) {
     gathertime = 0.;
     commtime = 0.;
+    commtimethr.assign(commtimethr.size(), 0.);       // sizeof(vector) is the object size, not the payload: reset every element
+    comms_bytesthr.assign(comms_bytesthr.size(), 0.); // same for the per-thread byte counters; also safe when empty
     halogtime = 0.;
     mergetime = 0.;
     decompresstime = 0.;
@@ -1044,6 +1037,14 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 #define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
     RealD NP = _grid->_Nprocessors;
     RealD NN = _grid->NodeCount();
+    double t = 0;
+    // Fold the per-thread counters into the totals: take the largest per-thread
+    // time (comms were done in parallel) but add up the bytes over all slots.
+    for (size_t i = 0; i < commtimethr.size(); ++i) { // both vectors are sized npoints by the constructor
+      comms_bytes += comms_bytesthr[i];
+      if (t < commtimethr[i]) t = commtimethr[i];
+    }
+    commtime += t;
     
     _grid->GlobalSum(commtime);    commtime/=NP;
     if ( calls > 0. ) {
diff --git a/lib/util/Init.cc b/lib/util/Init.cc
index fc701ac1..ef875429 100644
--- a/lib/util/Init.cc
+++ b/lib/util/Init.cc
@@ -359,7 +359,11 @@ void Grid_init(int *argc,char ***argv)
   if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
     LebesgueOrder::UseLebesgueOrder=1;
   }
-
+  CartesianCommunicator::nCommThreads = -1;
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--commthreads") ){
+    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--commthreads");
+    GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads);
+  }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
     arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
     GridCmdOptionIntVector(arg,LebesgueOrder::Block);

From 175f393f9d1b3dda4da435a6d995003eddb7b257 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Fri, 4 Aug 2017 12:14:10 +0100
Subject: [PATCH 147/177] Binary IO error checking

---
 lib/parallelIO/BinaryIO.h | 34 +++++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h
index 108e7ef8..f56f6514 100644
--- a/lib/parallelIO/BinaryIO.h
+++ b/lib/parallelIO/BinaryIO.h
@@ -413,13 +413,33 @@ class BinaryIO {
       timer.Start();
       if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
 #ifdef USE_MPI_IO
-	std::cout<< GridLogMessage<< "MPI write I/O "<< file<< std::endl;
-	ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDWR|MPI_MODE_CREATE,MPI_INFO_NULL, &fh); assert(ierr==0);
-	ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);                        assert(ierr==0);
-	ierr=MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);                                        assert(ierr==0);
-	MPI_File_close(&fh);
-	MPI_Type_free(&fileArray);
-	MPI_Type_free(&localArray);
+        std::cout << GridLogMessage << "MPI write I/O " << file << std::endl;
+        ierr = MPI_File_open(grid->communicator, (char *)file.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
+        std::cout << GridLogMessage << "Checking for errors" << std::endl;
+        if (ierr != MPI_SUCCESS)
+        {
+          char error_string[MPI_MAX_ERROR_STRING]; // minimum size MPI_Error_string requires
+          int length_of_error_string, error_class;
+          // Print the error class text and the specific error text, then abort every rank.
+          MPI_Error_class(ierr, &error_class);
+          MPI_Error_string(error_class, error_string, &length_of_error_string);
+          fprintf(stderr, "%3d: %s\n", myrank, error_string);
+          MPI_Error_string(ierr, error_string, &length_of_error_string);
+          fprintf(stderr, "%3d: %s\n", myrank, error_string);
+          MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0);
+        }
+        // File is open: install the file view, then write the local array collectively.
+        std::cout << GridLogDebug << "MPI write I/O set view " << file << std::endl;
+        ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
+        assert(ierr == 0);
+
+        std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl;
+        ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
+        assert(ierr == 0);
+
+        MPI_File_close(&fh);
+        MPI_Type_free(&fileArray);
+        MPI_Type_free(&localArray);
 #else 
 	assert(0);
 #endif

From 4fe182e5a7c4b1d1dddc022706a71f1c0432cda5 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Sun, 6 Aug 2017 10:46:19 +0100
Subject: [PATCH 148/177] Added high level HMC support for overriding default
 SIMD lane decomposition

---
 lib/cartesian/Cartesian_base.h      | 23 ++++----
 lib/qcd/hmc/HMCResourceManager.h    | 14 ++++-
 lib/qcd/hmc/HMC_GridModules.h       | 92 +++++++++++++++++++++--------
 lib/util/Init.cc                    |  2 +-
 tests/hmc/Test_hmc_EOMobiusRatio.cc | 13 ++--
 5 files changed, 98 insertions(+), 46 deletions(-)

diff --git a/lib/cartesian/Cartesian_base.h b/lib/cartesian/Cartesian_base.h
index 0db6ce0d..f4f9a269 100644
--- a/lib/cartesian/Cartesian_base.h
+++ b/lib/cartesian/Cartesian_base.h
@@ -185,17 +185,18 @@ public:
     ////////////////////////////////////////////////////////////////
 
     void show_decomposition(){
-      std::cout << GridLogMessage << "Full Dimensions    : " << _fdimensions << std::endl;
-      std::cout << GridLogMessage << "Global Dimensions  : " << _gdimensions << std::endl;
-      std::cout << GridLogMessage << "Local Dimensions   : " << _ldimensions << std::endl;
-      std::cout << GridLogMessage << "Reduced Dimensions : " << _rdimensions << std::endl;
-      std::cout << GridLogMessage << "Outer strides      : " << _ostride << std::endl;
-      std::cout << GridLogMessage << "Inner strides      : " << _istride << std::endl;
-      std::cout << GridLogMessage << "iSites             : " << _isites << std::endl;
-      std::cout << GridLogMessage << "oSites             : " << _osites << std::endl;
-      std::cout << GridLogMessage << "lSites             : " << lSites() << std::endl;        
-      std::cout << GridLogMessage << "gSites             : " << gSites() << std::endl;
-      std::cout << GridLogMessage << "Nd                 : " << _ndimension << std::endl;             
+      std::cout << GridLogMessage << "\tFull Dimensions    : " << _fdimensions << std::endl;
+      std::cout << GridLogMessage << "\tSIMD layout        : " << _simd_layout << std::endl;
+      std::cout << GridLogMessage << "\tGlobal Dimensions  : " << _gdimensions << std::endl;
+      std::cout << GridLogMessage << "\tLocal Dimensions   : " << _ldimensions << std::endl;
+      std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl;
+      std::cout << GridLogMessage << "\tOuter strides      : " << _ostride << std::endl;
+      std::cout << GridLogMessage << "\tInner strides      : " << _istride << std::endl;
+      std::cout << GridLogMessage << "\tiSites             : " << _isites << std::endl;
+      std::cout << GridLogMessage << "\toSites             : " << _osites << std::endl;
+      std::cout << GridLogMessage << "\tlSites             : " << lSites() << std::endl;        
+      std::cout << GridLogMessage << "\tgSites             : " << gSites() << std::endl;
+      std::cout << GridLogMessage << "\tNd                 : " << _ndimension << std::endl;             
     } 
 
     ////////////////////////////////////////////////////////////////
diff --git a/lib/qcd/hmc/HMCResourceManager.h b/lib/qcd/hmc/HMCResourceManager.h
index cf0000ed..3e20a8c1 100644
--- a/lib/qcd/hmc/HMCResourceManager.h
+++ b/lib/qcd/hmc/HMCResourceManager.h
@@ -165,7 +165,7 @@ class HMCResourceManager {
   // Grids
   //////////////////////////////////////////////////////////////
 
-  void AddGrid(std::string s, GridModule& M) {
+  void AddGrid(const std::string s, GridModule& M) {
     // Check for name clashes
     auto search = Grids.find(s);
     if (search != Grids.end()) {
@@ -174,14 +174,24 @@ class HMCResourceManager {
       exit(1);
     }
     Grids[s] = std::move(M);
+    std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" <<std::endl;
+    std::cout << GridLogMessage << "HMCResourceManager:" << std::endl;
+    std::cout << GridLogMessage << "Created grid set with name '" << s << "' and decomposition for the full cartesian " << std::endl;
+    Grids[s].show_full_decomposition();
+    std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" <<std::endl;
   }
 
   // Add a named grid set, 4d shortcut
-  void AddFourDimGrid(std::string s) {
+  void AddFourDimGrid(const std::string s) {
     GridFourDimModule<vComplex> Mod;
     AddGrid(s, Mod);
   }
 
+  // Add a named grid set, 4d shortcut + tweak simd lanes
+  void AddFourDimGrid(const std::string s, const std::vector<int> simd_decomposition) {
+    GridFourDimModule<vComplex> Mod(simd_decomposition);
+    AddGrid(s, Mod);
+  }
 
 
   GridCartesian* GetCartesian(std::string s = "") {
diff --git a/lib/qcd/hmc/HMC_GridModules.h b/lib/qcd/hmc/HMC_GridModules.h
index 8331c02b..0f34e9a7 100644
--- a/lib/qcd/hmc/HMC_GridModules.h
+++ b/lib/qcd/hmc/HMC_GridModules.h
@@ -33,28 +33,29 @@ directory
 namespace Grid {
 
 // Resources
-// Modules for grids 
+// Modules for grids
 
 // Introduce another namespace HMCModules?
 
-class GridModuleParameters: Serializable{   
+class GridModuleParameters: Serializable{
 public:
   GRID_SERIALIZABLE_CLASS_MEMBERS(GridModuleParameters,
   std::string, lattice,
   std::string, mpi);
 
-  std::vector<int> getLattice(){return strToVec<int>(lattice);}
-  std::vector<int> getMpi()    {return strToVec<int>(mpi);}
+  std::vector<int> getLattice() const {return strToVec<int>(lattice);}
+  std::vector<int> getMpi()     const {return strToVec<int>(mpi);}
 
-  void check(){
-    if (getLattice().size() != getMpi().size()) {
-      std::cout << GridLogError 
+
+  void check() const {
+    if (getLattice().size() != getMpi().size() ) {
+      std::cout << GridLogError
                 << "Error in GridModuleParameters: lattice and mpi dimensions "
                    "do not match"
                 << std::endl;
       exit(1);
     }
-  }    
+  }
 
   template <class ReaderClass>
   GridModuleParameters(Reader<ReaderClass>& Reader, std::string n = "LatticeGrid"):name(n) {
@@ -75,51 +76,94 @@ private:
 // Lower level class
 class GridModule {
  public:
-  GridCartesian* get_full() { 
+  GridCartesian* get_full() {
     std::cout << GridLogDebug << "Getting cartesian in module"<< std::endl;
     return grid_.get(); }
-  GridRedBlackCartesian* get_rb() { 
+  GridRedBlackCartesian* get_rb() {
     std::cout << GridLogDebug << "Getting rb-cartesian in module"<< std::endl;
     return rbgrid_.get(); }
 
   void set_full(GridCartesian* grid) { grid_.reset(grid); }
   void set_rb(GridRedBlackCartesian* rbgrid) { rbgrid_.reset(rbgrid); }
+  void show_full_decomposition(){ grid_->show_decomposition(); }
+  void show_rb_decomposition(){ rbgrid_->show_decomposition(); }
 
  protected:
   std::unique_ptr<GridCartesian> grid_;
   std::unique_ptr<GridRedBlackCartesian> rbgrid_;
-  
+
 };
 
 ////////////////////////////////////
 // Classes for the user
 ////////////////////////////////////
 // Note: the space time grid should be out of the QCD namespace
-template< class vector_type>
-class GridFourDimModule : public GridModule {
- public:
-  GridFourDimModule() {
+template <class vector_type>
+class GridFourDimModule : public GridModule
+{
+public:
+  GridFourDimModule()
+  {
     using namespace QCD;
     set_full(SpaceTimeGrid::makeFourDimGrid(
-        GridDefaultLatt(), GridDefaultSimd(4, vector_type::Nsimd()),
+        GridDefaultLatt(), 
+        GridDefaultSimd(4, vector_type::Nsimd()),
         GridDefaultMpi()));
     set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
   }
 
-  GridFourDimModule(GridModuleParameters Params) {
+  // Build the 4d full and red-black grids with a caller-supplied SIMD lane layout.
+  // tweak_simd must hold 4 entries whose product equals vector_type::Nsimd();
+  // any violation is fatal (exit(1)) so the grid pointers are never left unset.
+  GridFourDimModule(const std::vector<int> tweak_simd)
+  {
+    using namespace QCD;
+    if (tweak_simd.size() != 4)
+    {
+      std::cout << GridLogError
+                << "Error in GridFourDimModule: SIMD size different from 4"
+                << std::endl;
+      exit(1);
+    }
+
+    // The per-dimension lane counts multiply (not add) up to the vector width.
+    int simd_prod = 1;
+    for (auto &n : tweak_simd)
+      simd_prod *= n;
+    std::cout << GridLogDebug << "TweakSIMD: " << tweak_simd << "  Product: " << simd_prod << std::endl;
+
+    if (simd_prod != vector_type::Nsimd())
+    {
+      std::cout << GridLogError
+                << "Error in GridFourDimModule: product of SIMD lanes must be "
+                << vector_type::Nsimd() << std::endl;
+      // Abort here: returning would leave grid_/rbgrid_ null for later dereference.
+      exit(1);
+    }
+
+    set_full(SpaceTimeGrid::makeFourDimGrid(
+        GridDefaultLatt(), tweak_simd, GridDefaultMpi()));
+    set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
+  }
+
+  GridFourDimModule(const GridModuleParameters Params)
+  {
     using namespace QCD;
-    Params.check();
     std::vector<int> lattice_v = Params.getLattice();
     std::vector<int> mpi_v = Params.getMpi();
-    if (lattice_v.size() == 4) {
+    if (lattice_v.size() == 4)
+    {
       set_full(SpaceTimeGrid::makeFourDimGrid(
-          lattice_v, GridDefaultSimd(4, vector_type::Nsimd()),
+          lattice_v, 
+          GridDefaultSimd(4, vector_type::Nsimd()),
           mpi_v));
       set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
-    } else {
-      std::cout << GridLogError 
-          << "Error in GridFourDimModule: lattice dimension different from 4"
-          << std::endl;
+    }
+    else
+    {
+      std::cout << GridLogError
+                << "Error in GridFourDimModule: lattice dimension different from 4"
+                << std::endl;
       exit(1);
     }
   }
diff --git a/lib/util/Init.cc b/lib/util/Init.cc
index fe3b1734..35a569ba 100644
--- a/lib/util/Init.cc
+++ b/lib/util/Init.cc
@@ -377,7 +377,7 @@ void Grid_init(int *argc,char ***argv)
   std::cout << GridLogDebug << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
 
   if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
-    std::cout<<GridLogMessage<<"Grid Decomposition\n";
+    std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
     std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
     std::cout<<GridLogMessage<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
     std::cout<<GridLogMessage<<"\tvRealF         : "<<sizeof(vRealF)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
diff --git a/tests/hmc/Test_hmc_EOMobiusRatio.cc b/tests/hmc/Test_hmc_EOMobiusRatio.cc
index 4b5da356..4b4555e3 100644
--- a/tests/hmc/Test_hmc_EOMobiusRatio.cc
+++ b/tests/hmc/Test_hmc_EOMobiusRatio.cc
@@ -40,12 +40,6 @@ namespace Grid{
 				    double, StoppingCondition,
 				    int, MaxCGIterations,
 				    bool, ApplySmearing);
-
-    //template <class ReaderClass >
-    //FermionParameters(Reader<ReaderClass>& Reader){
-    //  read(Reader, "Mobius", *this);
-    //}
-
   };
 
   
@@ -113,9 +107,12 @@ int main(int argc, char **argv) {
   bool ApplySmearing = MyParams.Mobius.ApplySmearing;
   
   
+  // Use this if you want to tweak the default decomposition
+  std::vector<int> simd_lanes({2,2,1,1});
 
-  // Grid from the command line
-  TheHMC.Resources.AddFourDimGrid("gauge");
+  // Grid from the command line arguments --grid and --mpi
+  // drop the simd_lanes argument to fall back to the default decomposition for the SIMD lanes
+  TheHMC.Resources.AddFourDimGrid("gauge", simd_lanes);
   // Possibile to create the module by hand 
   // hardcoding parameters or using a Reader
 

From dbe4d7850c1e132f538e4aead7869ba703a21ec5 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Sun, 6 Aug 2017 10:49:45 +0100
Subject: [PATCH 149/177] Make a test file compatible with all architectures

---
 tests/hmc/Test_hmc_EOMobiusRatio.cc | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tests/hmc/Test_hmc_EOMobiusRatio.cc b/tests/hmc/Test_hmc_EOMobiusRatio.cc
index 4b4555e3..d6ca96db 100644
--- a/tests/hmc/Test_hmc_EOMobiusRatio.cc
+++ b/tests/hmc/Test_hmc_EOMobiusRatio.cc
@@ -108,11 +108,16 @@ int main(int argc, char **argv) {
   
   
   // Use this if you want to tweak the default decomposition
-  std::vector<int> simd_lanes({2,2,1,1});
+  // commented out as very architecture specific
+  
+  //std::vector<int> simd_lanes({2,2,1,1});
 
   // Grid from the command line arguments --grid and --mpi
   // drop the simd_lanes argument to fall back to the default decomposition for the SIMD lanes
-  TheHMC.Resources.AddFourDimGrid("gauge", simd_lanes);
+  
+  //TheHMC.Resources.AddFourDimGrid("gauge", simd_lanes); // tweak the SIMD lanes
+  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
+  
   // Possibile to create the module by hand 
   // hardcoding parameters or using a Reader
 

From 06e6f8de00528ede75f248f98d48eca715d79630 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Tue, 8 Aug 2017 10:22:12 +0100
Subject: [PATCH 150/177] Check that the reduced dim is an integer

---
 lib/cartesian/Cartesian_red_black.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/cartesian/Cartesian_red_black.h b/lib/cartesian/Cartesian_red_black.h
index 3037de00..e58999c5 100644
--- a/lib/cartesian/Cartesian_red_black.h
+++ b/lib/cartesian/Cartesian_red_black.h
@@ -176,7 +176,8 @@ public:
 
 	// Use a reduced simd grid
 	_simd_layout[d] = simd_layout[d];
-	_rdimensions[d]= _ldimensions[d]/_simd_layout[d];
+	_rdimensions[d]= _ldimensions[d]/_simd_layout[d]; // this is not checking if this is integer
+  assert(_rdimensions[d]*_simd_layout[d] == _ldimensions[d]);
 	assert(_rdimensions[d]>0);
 
 	// all elements of a simd vector must have same checkerboard.

From 44051aecd1eb0abc7a61ac814654491804455347 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Tue, 8 Aug 2017 10:31:12 +0100
Subject: [PATCH 151/177] Checking for integer divisions in cartesian full

---
 lib/cartesian/Cartesian_full.h | 130 +++++++++++++++++----------------
 1 file changed, 67 insertions(+), 63 deletions(-)

diff --git a/lib/cartesian/Cartesian_full.h b/lib/cartesian/Cartesian_full.h
index b0e47fa4..815e3b22 100644
--- a/lib/cartesian/Cartesian_full.h
+++ b/lib/cartesian/Cartesian_full.h
@@ -62,77 +62,81 @@ public:
       return shift;
     }
     GridCartesian(const std::vector<int> &dimensions,
-		  const std::vector<int> &simd_layout,
-		  const std::vector<int> &processor_grid
-		  ) : GridBase(processor_grid)
+                  const std::vector<int> &simd_layout,
+                  const std::vector<int> &processor_grid) : GridBase(processor_grid)
     {
-        ///////////////////////
-        // Grid information
-        ///////////////////////
-        _ndimension = dimensions.size();
-            
-        _fdimensions.resize(_ndimension);
-        _gdimensions.resize(_ndimension);
-        _ldimensions.resize(_ndimension);
-        _rdimensions.resize(_ndimension);
-        _simd_layout.resize(_ndimension);
-	_lstart.resize(_ndimension);
-	_lend.resize(_ndimension);
-            
-        _ostride.resize(_ndimension);
-        _istride.resize(_ndimension);
-            
-        _fsites = _gsites = _osites = _isites = 1;
+      ///////////////////////
+      // Grid information
+      ///////////////////////
+      _ndimension = dimensions.size();
 
-        for(int d=0;d<_ndimension;d++){
-	  _fdimensions[d] = dimensions[d]; // Global dimensions
-	  _gdimensions[d] = _fdimensions[d]; // Global dimensions
-	  _simd_layout[d] = simd_layout[d];
-	  _fsites = _fsites * _fdimensions[d];
-	  _gsites = _gsites * _gdimensions[d];
+      _fdimensions.resize(_ndimension);
+      _gdimensions.resize(_ndimension);
+      _ldimensions.resize(_ndimension);
+      _rdimensions.resize(_ndimension);
+      _simd_layout.resize(_ndimension);
+      _lstart.resize(_ndimension);
+      _lend.resize(_ndimension);
 
-	  //FIXME check for exact division
+      _ostride.resize(_ndimension);
+      _istride.resize(_ndimension);
 
-	  // Use a reduced simd grid
-	  _ldimensions[d]= _gdimensions[d]/_processors[d];  //local dimensions
-	  _rdimensions[d]= _ldimensions[d]/_simd_layout[d]; //overdecomposition
-	  _lstart[d]     = _processor_coor[d]*_ldimensions[d];
-	  _lend[d]       = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1;
-	  _osites  *= _rdimensions[d];
-	  _isites  *= _simd_layout[d];
-                
-	  // Addressing support
-	  if ( d==0 ) {
-	    _ostride[d] = 1;
-	    _istride[d] = 1;
-	  } else {
-	    _ostride[d] = _ostride[d-1]*_rdimensions[d-1];
-	    _istride[d] = _istride[d-1]*_simd_layout[d-1];
-	  }
+      _fsites = _gsites = _osites = _isites = 1;
+
+      for (int d = 0; d < _ndimension; d++)
+      {
+        _fdimensions[d] = dimensions[d];   // Global dimensions
+        _gdimensions[d] = _fdimensions[d]; // Global dimensions
+        _simd_layout[d] = simd_layout[d];
+        _fsites = _fsites * _fdimensions[d];
+        _gsites = _gsites * _gdimensions[d];
+
+        // Use a reduced simd grid
+        _ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
+        assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
+
+        _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
+        assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
+
+        _lstart[d] = _processor_coor[d] * _ldimensions[d];
+        _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
+        _osites *= _rdimensions[d];
+        _isites *= _simd_layout[d];
+
+        // Addressing support
+        if (d == 0)
+        {
+          _ostride[d] = 1;
+          _istride[d] = 1;
         }
-        
-        ///////////////////////
-        // subplane information
-        ///////////////////////
-        _slice_block.resize(_ndimension);
-        _slice_stride.resize(_ndimension);
-        _slice_nblock.resize(_ndimension);
-            
-        int block =1;
-        int nblock=1;
-        for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];
-            
-        for(int d=0;d<_ndimension;d++){
-            nblock/=_rdimensions[d];
-            _slice_block[d] =block;
-            _slice_stride[d]=_ostride[d]*_rdimensions[d];
-            _slice_nblock[d]=nblock;
-            block = block*_rdimensions[d];
+        else
+        {
+          _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
+          _istride[d] = _istride[d - 1] * _simd_layout[d - 1];
         }
+      }
 
+      ///////////////////////
+      // subplane information
+      ///////////////////////
+      _slice_block.resize(_ndimension);
+      _slice_stride.resize(_ndimension);
+      _slice_nblock.resize(_ndimension);
+
+      int block = 1;
+      int nblock = 1;
+      for (int d = 0; d < _ndimension; d++)
+        nblock *= _rdimensions[d];
+
+      for (int d = 0; d < _ndimension; d++)
+      {
+        nblock /= _rdimensions[d];
+        _slice_block[d] = block;
+        _slice_stride[d] = _ostride[d] * _rdimensions[d];
+        _slice_nblock[d] = nblock;
+        block = block * _rdimensions[d];
+      }
     };
 };
-
-
 }
 #endif

From 8a3fe60a27e4573faca940efd33d18a7d468c764 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Tue, 8 Aug 2017 11:36:20 +0100
Subject: [PATCH 152/177] Added more  asserts at grid creation time

---
 lib/cartesian/Cartesian_red_black.h | 192 +++++++++++++++-------------
 1 file changed, 105 insertions(+), 87 deletions(-)

diff --git a/lib/cartesian/Cartesian_red_black.h b/lib/cartesian/Cartesian_red_black.h
index e58999c5..b1a5b9ef 100644
--- a/lib/cartesian/Cartesian_red_black.h
+++ b/lib/cartesian/Cartesian_red_black.h
@@ -131,21 +131,21 @@ public:
       Init(dimensions,simd_layout,processor_grid,checker_dim_mask,0);
     }
     void Init(const std::vector<int> &dimensions,
-	      const std::vector<int> &simd_layout,
-	      const std::vector<int> &processor_grid,
-	      const std::vector<int> &checker_dim_mask,
-	      int checker_dim)
+              const std::vector<int> &simd_layout,
+              const std::vector<int> &processor_grid,
+              const std::vector<int> &checker_dim_mask,
+              int checker_dim)
     {
-    ///////////////////////
-    // Grid information
-    ///////////////////////
+      ///////////////////////
+      // Grid information
+      ///////////////////////
       _checker_dim = checker_dim;
-      assert(checker_dim_mask[checker_dim]==1);
+      assert(checker_dim_mask[checker_dim] == 1);
       _ndimension = dimensions.size();
-      assert(checker_dim_mask.size()==_ndimension);
-      assert(processor_grid.size()==_ndimension);
-      assert(simd_layout.size()==_ndimension);
-      
+      assert(checker_dim_mask.size() == _ndimension);
+      assert(processor_grid.size() == _ndimension);
+      assert(simd_layout.size() == _ndimension);
+
       _fdimensions.resize(_ndimension);
       _gdimensions.resize(_ndimension);
       _ldimensions.resize(_ndimension);
@@ -153,115 +153,133 @@ public:
       _simd_layout.resize(_ndimension);
       _lstart.resize(_ndimension);
       _lend.resize(_ndimension);
-      
+
       _ostride.resize(_ndimension);
       _istride.resize(_ndimension);
-      
+
       _fsites = _gsites = _osites = _isites = 1;
-	
-      _checker_dim_mask=checker_dim_mask;
 
-      for(int d=0;d<_ndimension;d++){
-	_fdimensions[d] = dimensions[d];
-	_gdimensions[d] = _fdimensions[d];
-	_fsites = _fsites * _fdimensions[d];
-	_gsites = _gsites * _gdimensions[d];
-        
-	if (d==_checker_dim) {
-	  _gdimensions[d] = _gdimensions[d]/2; // Remove a checkerboard
-	}
-	_ldimensions[d] = _gdimensions[d]/_processors[d];
-	_lstart[d]     = _processor_coor[d]*_ldimensions[d];
-	_lend[d]       = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1;
+      _checker_dim_mask = checker_dim_mask;
 
-	// Use a reduced simd grid
-	_simd_layout[d] = simd_layout[d];
-	_rdimensions[d]= _ldimensions[d]/_simd_layout[d]; // this is not checking if this is integer
-  assert(_rdimensions[d]*_simd_layout[d] == _ldimensions[d]);
-	assert(_rdimensions[d]>0);
+      for (int d = 0; d < _ndimension; d++)
+      {
+        _fdimensions[d] = dimensions[d];
+        _gdimensions[d] = _fdimensions[d];
+        _fsites = _fsites * _fdimensions[d];
+        _gsites = _gsites * _gdimensions[d];
 
-	// all elements of a simd vector must have same checkerboard.
-	// If Ls vectorised, this must still be the case; e.g. dwf rb5d
-	if ( _simd_layout[d]>1 ) {
-	  if ( checker_dim_mask[d] ) { 
-	    assert( (_rdimensions[d]&0x1) == 0 );
-	  }
-	}
+        if (d == _checker_dim)
+        {
+          assert((_gdimensions[d] & 0x1) == 0);
+          _gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
+        }
+        _ldimensions[d] = _gdimensions[d] / _processors[d];
+        assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
+        _lstart[d] = _processor_coor[d] * _ldimensions[d];
+        _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
 
-	_osites *= _rdimensions[d];
-	_isites *= _simd_layout[d];
-        
-	// Addressing support
-	if ( d==0 ) {
-	  _ostride[d] = 1;
-	  _istride[d] = 1;
-	} else {
-	  _ostride[d] = _ostride[d-1]*_rdimensions[d-1];
-	  _istride[d] = _istride[d-1]*_simd_layout[d-1];
-	}
+        // Use a reduced simd grid
+        _simd_layout[d] = simd_layout[d];
+        _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; // this is not checking if this is integer
+        assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
+        assert(_rdimensions[d] > 0);
 
+        // all elements of a simd vector must have same checkerboard.
+        // If Ls vectorised, this must still be the case; e.g. dwf rb5d
+        if (_simd_layout[d] > 1)
+        {
+          if (checker_dim_mask[d])
+          {
+            assert((_rdimensions[d] & 0x1) == 0);
+          }
+        }
 
+        _osites *= _rdimensions[d];
+        _isites *= _simd_layout[d];
+
+        // Addressing support
+        if (d == 0)
+        {
+          _ostride[d] = 1;
+          _istride[d] = 1;
+        }
+        else
+        {
+          _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
+          _istride[d] = _istride[d - 1] * _simd_layout[d - 1];
+        }
       }
-            
+
       ////////////////////////////////////////////////////////////////////////////////////////////
       // subplane information
       ////////////////////////////////////////////////////////////////////////////////////////////
       _slice_block.resize(_ndimension);
       _slice_stride.resize(_ndimension);
       _slice_nblock.resize(_ndimension);
-        
-      int block =1;
-      int nblock=1;
-      for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];
-      
-      for(int d=0;d<_ndimension;d++){
-	nblock/=_rdimensions[d];
-	_slice_block[d] =block;
-	_slice_stride[d]=_ostride[d]*_rdimensions[d];
-	_slice_nblock[d]=nblock;
-	block = block*_rdimensions[d];
+
+      int block = 1;
+      int nblock = 1;
+      for (int d = 0; d < _ndimension; d++)
+        nblock *= _rdimensions[d];
+
+      for (int d = 0; d < _ndimension; d++)
+      {
+        nblock /= _rdimensions[d];
+        _slice_block[d] = block;
+        _slice_stride[d] = _ostride[d] * _rdimensions[d];
+        _slice_nblock[d] = nblock;
+        block = block * _rdimensions[d];
       }
 
       ////////////////////////////////////////////////
       // Create a checkerboard lookup table
       ////////////////////////////////////////////////
       int rvol = 1;
-      for(int d=0;d<_ndimension;d++){
-	rvol=rvol * _rdimensions[d];
+      for (int d = 0; d < _ndimension; d++)
+      {
+        rvol = rvol * _rdimensions[d];
       }
       _checker_board.resize(rvol);
-      for(int osite=0;osite<_osites;osite++){
-	_checker_board[osite] = CheckerBoardFromOindex (osite);
+      for (int osite = 0; osite < _osites; osite++)
+      {
+        _checker_board[osite] = CheckerBoardFromOindex(osite);
       }
-      
     };
-protected:
+
+  protected:
     virtual int oIndex(std::vector<int> &coor)
     {
-      int idx=0;
-      for(int d=0;d<_ndimension;d++) {
-	if( d==_checker_dim ) {
-	  idx+=_ostride[d]*((coor[d]/2)%_rdimensions[d]);
-	} else {
-	  idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
-	}
+      int idx = 0;
+      for (int d = 0; d < _ndimension; d++)
+      {
+        if (d == _checker_dim)
+        {
+          idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]);
+        }
+        else
+        {
+          idx += _ostride[d] * (coor[d] % _rdimensions[d]);
+        }
       }
       return idx;
     };
-        
+
     virtual int iIndex(std::vector<int> &lcoor)
     {
-        int idx=0;
-        for(int d=0;d<_ndimension;d++) {
-	  if( d==_checker_dim ) {
-	    idx+=_istride[d]*(lcoor[d]/(2*_rdimensions[d]));
-	  } else { 
-	    idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
-	  }
-	}
-        return idx;
+      int idx = 0;
+      for (int d = 0; d < _ndimension; d++)
+      {
+        if (d == _checker_dim)
+        {
+          idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d]));
+        }
+        else
+        {
+          idx += _istride[d] * (lcoor[d] / _rdimensions[d]);
+        }
+      }
+      return idx;
     }
 };
-
 }
 #endif

From fd367d8bfd95ec193b9528c59d7846508bf82296 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Wed, 16 Aug 2017 09:42:57 +0100
Subject: [PATCH 153/177] Debugging the PointerCache

---
 lib/allocator/AlignedAllocator.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/allocator/AlignedAllocator.cc b/lib/allocator/AlignedAllocator.cc
index 4249a72e..04de20bf 100644
--- a/lib/allocator/AlignedAllocator.cc
+++ b/lib/allocator/AlignedAllocator.cc
@@ -11,7 +11,7 @@ int PointerCache::victim;
 
 void *PointerCache::Insert(void *ptr,size_t bytes) {
 
-  if (bytes < 4096 ) return NULL;
+  if (bytes < 4096 ) return ptr;
 
 #ifdef GRID_OMP
   assert(omp_in_parallel()==0);

From bcefdd7c4eff147242ededf040653449c2d573c9 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Sat, 19 Aug 2017 12:49:02 -0400
Subject: [PATCH 154/177] Align both allocator calls to 2MB

---
 lib/allocator/AlignedAllocator.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/allocator/AlignedAllocator.h b/lib/allocator/AlignedAllocator.h
index 7fd9496f..39734b53 100644
--- a/lib/allocator/AlignedAllocator.h
+++ b/lib/allocator/AlignedAllocator.h
@@ -186,9 +186,9 @@ public:
   pointer allocate(size_type __n, const void* _p= 0) 
   {
 #ifdef HAVE_MM_MALLOC_H
-    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
+    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),GRID_ALLOC_ALIGN);
 #else
-    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
+    _Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,__n*sizeof(_Tp));
 #endif
     return ptr;
   }

From 9e658de2383620b5aa002f319b85442ab24d8115 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Sat, 19 Aug 2017 12:52:44 -0400
Subject: [PATCH 155/177] Use Vector

---
 benchmarks/Benchmark_comms.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc
index 698f9d25..491fba1e 100644
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -92,8 +92,8 @@ int main (int argc, char ** argv)
       RealD Nnode = Grid.NodeCount();
       RealD ppn = Nrank/Nnode;
 
-      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
-      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+      Vector<Vector<HalfSpinColourVectorD> > xbuf(8,Vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+      Vector<Vector<HalfSpinColourVectorD> > rbuf(8,Vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
 
       int ncomm;
       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
@@ -172,8 +172,8 @@ int main (int argc, char ** argv)
       RealD Nnode = Grid.NodeCount();
       RealD ppn = Nrank/Nnode;
 
-      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
-      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+      Vector<Vector<HalfSpinColourVectorD> > xbuf(8,Vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+      Vector<Vector<HalfSpinColourVectorD> > rbuf(8,Vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
 
 
       int ncomm;

From d6472eda8d00c8d0ffc60760a4dd9462702ac00b Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Sat, 19 Aug 2017 12:53:18 -0400
Subject: [PATCH 156/177] Use mmap

---
 lib/communicator/Communicator_base.cc | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc
index 67bfaed0..6767495f 100644
--- a/lib/communicator/Communicator_base.cc
+++ b/lib/communicator/Communicator_base.cc
@@ -26,6 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
     *************************************************************************************/
     /*  END LEGAL */
 #include <Grid/GridCore.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <limits.h>
+#include <sys/mman.h>
 
 namespace Grid {
 
@@ -129,8 +133,15 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
   return NULL;
 }
 void CartesianCommunicator::ShmInitGeneric(void){
+#if 1
+  ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE,  MAP_HUGETLB| MAP_SHARED | MAP_ANONYMOUS, -1, 0); 
+  if (ShmCommBuf == (void *)MAP_FAILED) exit(EXIT_FAILURE);  
+  std::cout << "ShmCommBuf "<<ShmCommBuf<<std::endl;
+#else 
   ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
   ShmCommBuf=(void *)&ShmBufStorageVector[0];
+#endif
+  bzero(ShmCommBuf,MAX_MPI_SHM_BYTES);
 }
 
 #endif

From 2f619482b8160cd859da636ffbaad064ba4fc02f Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Sat, 19 Aug 2017 12:53:59 -0400
Subject: [PATCH 157/177] Enable blocking stencil send

---
 lib/communicator/Communicator_mpit.cc | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/lib/communicator/Communicator_mpit.cc b/lib/communicator/Communicator_mpit.cc
index f522701c..c0fb47fd 100644
--- a/lib/communicator/Communicator_mpit.cc
+++ b/lib/communicator/Communicator_mpit.cc
@@ -242,20 +242,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 							 int recv_from_rank,
 							 int bytes,int dir)
 {
-  assert(false);
-  /*
   int myrank = _processor;
   int ierr;
   assert(dir < communicator_halo.size());
 
-  //  std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
-  // Give the CPU to MPI immediately; can use threads to overlap optionally
   ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,xmit_to_rank,myrank,
 		    recv,bytes,MPI_CHAR,recv_from_rank, recv_from_rank,
 		    communicator_halo[dir],MPI_STATUS_IGNORE);
   assert(ierr==0);
   return 2.0*bytes;
-  */
 }
 
 double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,

From 0b0cf62193f6c44126bf0b01e77d6fcdcb0d09bd Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Sat, 19 Aug 2017 13:18:50 -0400
Subject: [PATCH 158/177] Fix mpi 3 interface change

---
 lib/communicator/Communicator_mpi3.cc | 11 +++++++++++
 lib/communicator/Communicator_mpit.cc | 25 +++++++------------------
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc
index 46e4745c..e6e33d33 100644
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ -621,6 +621,17 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
   }
 }
 
+// Blocking stencil exchange: Begin followed by Complete; returns the value reported by Begin.
+// (Previously control fell off the end of this non-void function, which is undefined behaviour.)
+double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,int dest,void *recv,int from,
+						     int bytes,int dir)
+{
+  std::vector<CommsRequest_t> list;
+  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
+  StencilSendToRecvFromComplete(list,dir);
+  return offbytes;
+}
+
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int dest,
diff --git a/lib/communicator/Communicator_mpit.cc b/lib/communicator/Communicator_mpit.cc
index c0fb47fd..9a9b26d2 100644
--- a/lib/communicator/Communicator_mpit.cc
+++ b/lib/communicator/Communicator_mpit.cc
@@ -242,17 +242,12 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 							 int recv_from_rank,
 							 int bytes,int dir)
 {
-  int myrank = _processor;
-  int ierr;
-  assert(dir < communicator_halo.size());
-
-  ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,xmit_to_rank,myrank,
-		    recv,bytes,MPI_CHAR,recv_from_rank, recv_from_rank,
-		    communicator_halo[dir],MPI_STATUS_IGNORE);
-  assert(ierr==0);
-  return 2.0*bytes;
+  StencilSendToRecvFrom(xmit,xmit_to_rank,recv,recv_from_rank,bytes,dir);
 }
-
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
+{ 
+  // Do nothing: StencilSendToRecvFromBegin forwards to the blocking StencilSendToRecvFrom, which waits on both of its requests itself
+};
 double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
 						    int xmit_to_rank,
 						    void *recv,
@@ -266,17 +261,11 @@ double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
   //  std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
   // Give the CPU to MPI immediately; can use threads to overlap optionally
   MPI_Request req[2];
-  MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank,
-	    communicator_halo[dir],&req[1]);
-  MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank,myrank,
-	    communicator_halo[dir], &req[0]);
+  MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
+  MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank,myrank,          communicator_halo[dir], &req[0]);
   MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
   return 2.0*bytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
-{ 
-  // Do nothing
-};
 
 
 

From bfef525ed2474c0cfe1047e0351ab58ce525ff10 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sat, 19 Aug 2017 23:10:12 +0100
Subject: [PATCH 159/177] New benchmark prep

---
 benchmarks/Benchmark_ITT.cc | 518 ++++++++++++++++++++++++++++++++++++
 1 file changed, 518 insertions(+)
 create mode 100644 benchmarks/Benchmark_ITT.cc

diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc
new file mode 100644
index 00000000..4f16b1de
--- /dev/null
+++ b/benchmarks/Benchmark_ITT.cc
@@ -0,0 +1,518 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./benchmarks/Benchmark_ITT.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+
+// Summary statistics (mean, standard error of the mean, min, max) of a set of timing samples.
+struct time_statistics{
+  double mean;
+  double err;
+  double min;
+  double max;
+  // Fills the fields from v. Degenerate inputs are handled explicitly: an empty v zeroes
+  // every field (no division by zero, no dereference of end()); one sample gives err=0.
+  void statistics(const std::vector<double> &v){
+      mean = err = min = max = 0.0;
+      if ( v.empty() ) return;
+      const double n = v.size();
+      mean = std::accumulate(v.begin(), v.end(), 0.0) / n;
+      double sq_sum = 0.0;   // sum of squared deviations from the mean
+      for (double x : v) sq_sum += (x - mean)*(x - mean);
+      if ( v.size() > 1 ) err = std::sqrt(sq_sum / (n*(n - 1)));
+      auto result = std::minmax_element(v.begin(), v.end());
+      min = *result.first;  max = *result.second;
+  }
+};
+
+void comms_header(){ // column headings; Benchmark::Comms calls this before printing its result rows
+  std::cout <<GridLogMessage << " L  "<<"\t"<<" Ls  "<<"\t" <<std::setw(11)<<"bytes";
+  std::cout <<"MB/s uni (err/min/max)"<<"\t\t"<<"MB/s bidi (err/min/max)"<<std::endl;
+};
+
+Gamma::Algebra Gmu [] = { // indexed by direction mu in Benchmark::DWF's reference operator: Gamma(Gmu[mu])
+  Gamma::Algebra::GammaX,
+  Gamma::Algebra::GammaY,
+  Gamma::Algebra::GammaZ,
+  Gamma::Algebra::GammaT
+};
+struct controls { // one benchmark configuration; Benchmark::DWF iterates over a table of these
+  int Opt;          // value assigned to QCD::WilsonKernelsStatic::Opt
+  int CommsOverlap; // value assigned to QCD::WilsonKernelsStatic::Comms
+  Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch; // passed to SetCommunicatorPolicy
+  //  int HugePages;
+};
+
+class Benchmark {
+public:
+  // Logs how Grid is laid out for this run: the thread count, the MPI task grid and, for each
+  // vector type, its size in bits together with the layout returned by GridDefaultSimd(4,Nsimd).
+  static void Decomposition (void ) {
+    const int nthreads = GridThread::GetThreads();
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl
+             <<GridLogMessage << "= Grid is setup to use "<<nthreads<<" threads"<<std::endl
+             <<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n"
+             <<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl
+             <<GridLogMessage<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
+    std::cout<<GridLogMessage<<"\tvReal          : "<<sizeof(vReal )*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vReal::Nsimd()))<<std::endl;
+    std::cout<<GridLogMessage<<"\tvRealF         : "<<sizeof(vRealF)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
+    std::cout<<GridLogMessage<<"\tvRealD         : "<<sizeof(vRealD)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
+    std::cout<<GridLogMessage<<"\tvComplex       : "<<sizeof(vComplex )*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplex::Nsimd()))<<std::endl;
+    std::cout<<GridLogMessage<<"\tvComplexF      : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
+    std::cout<<GridLogMessage<<"\tvComplexD      : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  }
+
+  // Benchmarks the threaded stencil halo exchange and prints the achieved bandwidth.
+  // For each local volume lat^4 (lat=4,8,..,maxlat; Ls=8) the 8 stencil legs are exchanged
+  // Nloop times through StencilSendToRecvFromBegin/Complete; rates are bytes per timed
+  // microsecond, shown under the MB/s headings written by comms_header().
+  static void Comms(void)
+  {
+    int Nloop=100;
+    int maxlat=32;
+
+    std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
+    std::vector<int> mpi_layout  = GridDefaultMpi();
+
+    // Dimensions split over more than one rank. nmu used to stay at its initial 0, so the
+    // banner below always reported "0 dimensions".
+    int nmu=0;
+    for(int mu=0;mu<4;mu++) if (mpi_layout[mu]>1 ) nmu++;
+
+    std::vector<double> t_time(Nloop);
+    time_statistics timestat;
+
+    std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
+    std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+    comms_header();
+
+    for(int lat=4;lat<=maxlat;lat+=4){
+      for(int Ls=8;Ls<=8;Ls*=2){
+
+        std::vector<int> latt_size  ({lat*mpi_layout[0],
+              lat*mpi_layout[1],
+              lat*mpi_layout[2],
+              lat*mpi_layout[3]});
+
+        GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+        RealD Nrank = Grid._Nprocessors;
+        RealD Nnode = Grid.NodeCount();
+        RealD ppn = Nrank/Nnode;
+
+        std::vector<HalfSpinColourVectorD *> xbuf(8);
+        std::vector<HalfSpinColourVectorD *> rbuf(8);
+        Grid.ShmBufferFreeAll();
+        for(int d=0;d<8;d++){
+          xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+          rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+          bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+          bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+        }
+
+        int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+        double dbytes;
+        for(int i=0;i<Nloop;i++){
+          double start=usecond();
+
+          dbytes=0;
+
+          parallel_for(int dir=0;dir<8;dir++){
+
+            double tbytes;
+            int mu =dir % 4;
+
+            if (mpi_layout[mu]>1 ) {
+
+              // One request list per leg: parallel_for iterations presumably run on different OpenMP
+              // threads (note the atomic below), so one std::vector shared by all legs was a data race.
+              std::vector<CartesianCommunicator::CommsRequest_t> requests;
+              int xmit_to_rank;
+              int recv_from_rank;
+              if ( dir == mu ) {
+                int comm_proc=1;
+                Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+              } else {
+                int comm_proc = mpi_layout[mu]-1;
+                Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+              }
+              tbytes= Grid.StencilSendToRecvFromBegin(requests,
+                                                      (void *)&xbuf[dir][0],
+                                                      xmit_to_rank,
+                                                      (void *)&rbuf[dir][0],
+                                                      recv_from_rank,
+                                                      bytes,dir);
+              Grid.StencilSendToRecvFromComplete(requests,dir);
+
+#pragma omp atomic
+              dbytes+=tbytes;
+            }
+          }
+          Grid.Barrier();
+          double stop=usecond();
+          t_time[i] = stop-start; // microseconds
+        }
+
+        timestat.statistics(t_time);
+
+        dbytes=dbytes*ppn;
+        double xbytes    = dbytes*0.5;
+        double bidibytes = dbytes;
+
+        std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
+                 <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
+                 <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
+                 <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
+                 << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
+                 << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
+
+      }
+    }
+    return;
+  }
+
+  // Streaming benchmark: times z=a*x-y on lattices of local size lat^4 and prints bytes, GB/s, Gflop/s and seconds.
+  static void Memory(void)
+  {
+    const int Nvec=8;
+    typedef Lattice< iVector< vReal,Nvec> > LatticeVec;
+    typedef iVector<vReal,Nvec> Vec;
+    std::vector<int> simd_layout = GridDefaultSimd(Nd,vReal::Nsimd());
+    std::vector<int> mpi_layout  = GridDefaultMpi();
+
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "= Benchmarking a*x + y bandwidth"<<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
+    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
+  
+  uint64_t lmax=48;
+#define NLOOP (10*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
+
+    GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+    for(int lat=8;lat<=lmax;lat+=4){
+
+      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
+      // Widen before multiplying: the product of four int extents can exceed INT_MAX on large machines
+      int64_t vol= int64_t(latt_size[0])*latt_size[1]*latt_size[2]*latt_size[3];
+      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+      Vec rn ; random(sRNG,rn);
+
+      LatticeVec z(&Grid); z=rn;
+      LatticeVec x(&Grid); x=rn;
+      LatticeVec y(&Grid); y=rn;
+      double a=2.0;
+
+      uint64_t Nloop=NLOOP;
+
+      double start=usecond();
+      for(uint64_t i=0;i<Nloop;i++){
+	z=a*x-y;
+        x._odata[0]=z._odata[0]; // force serial dependency to prevent optimise away
+        y._odata[4]=z._odata[4];
+      }
+      double stop=usecond();
+      double time = (stop-start)/Nloop*1000; // per-iteration time; ns if usecond() is in microseconds, so bytes/time is GB/s
+     
+      double flops=vol*Nvec*2;// mul,add
+      double bytes=3.0*vol*Nvec*sizeof(Real);
+      std::cout<<GridLogMessage<<std::setprecision(3) 
+	       << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;
+
+    }
+  };
+
+
+  // Benchmarks the even-odd domain wall hopping term Dw.DhopEO on an L^4 volume per node with
+  // fifth dimension Ls, for each kernel/comms case in Cases, checking against a naive Wilson term.
+  static void DWF(int Ls,int L)
+  {
+    RealD mass=0.1;
+    RealD M5  =1.8;
+    double mflops;
+    double mflops_best = 0;
+    double mflops_worst= 0;
+    ///////////////////////////////////////////////////////
+    // Set/Get the layout & grid size
+    ///////////////////////////////////////////////////////
+    int threads = GridThread::GetThreads();
+    std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
+    std::vector<int> local({L,L,L,L});
+
+    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}), 
+								       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+    uint64_t NP = TmpGrid->RankCount();
+    uint64_t NN = TmpGrid->NodeCount();
+    uint64_t SHM=NP/NN;
+
+    std::vector<int> internal;
+    if      ( SHM == 1 )   internal = std::vector<int>({1,1,1,1});
+    else if ( SHM == 2 )   internal = std::vector<int>({2,1,1,1});
+    else if ( SHM == 4 )   internal = std::vector<int>({2,2,1,1});
+    else if ( SHM == 8 )   internal = std::vector<int>({2,2,2,1});
+    else assert(0);
+
+    std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
+    std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
+
+    ///////// Welcome message ////////////
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
+    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
+    std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
+    std::cout<<GridLogMessage << "* MPI ranks      : "<<GridCmdVectorIntToString(mpi)<<std::endl;
+    std::cout<<GridLogMessage << "* Intranode      : "<<GridCmdVectorIntToString(internal)<<std::endl;
+    std::cout<<GridLogMessage << "* nodes          : "<<GridCmdVectorIntToString(nodes)<<std::endl;
+    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+    // NOTE(review): the grids obtained from SpaceTimeGrid::make* (here and TmpGrid above) are never deleted in this function - confirm ownership
+    ///////// Lattice Init ////////////
+    GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+    GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+    GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+    
+    ///////// RNG Init ////////////
+    std::vector<int> seeds4({1,2,3,4});
+    std::vector<int> seeds5({5,6,7,8});
+    GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+    GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
+    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+    ///////// Source preparation ////////////
+    LatticeFermion src   (FGrid); random(RNG5,src);
+    LatticeFermion ref   (FGrid);
+    LatticeFermion tmp   (FGrid);
+
+    RealD N2 = 1.0/::sqrt(norm2(src));
+    src = src*N2;
+    
+    LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu); 
+
+    DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+
+    ////////////////////////////////////
+    // Naive wilson implementation
+    ////////////////////////////////////
+    {
+      LatticeGaugeField Umu5d(FGrid); 
+      std::vector<LatticeColourMatrix> U(4,FGrid);
+      for(int ss=0;ss<Umu._grid->oSites();ss++){
+	for(int s=0;s<Ls;s++){
+	  Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
+	}
+      }
+      ref = zero;
+      for(int mu=0;mu<Nd;mu++){
+	U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
+      }
+      for(int mu=0;mu<Nd;mu++){
+	
+	tmp = U[mu]*Cshift(src,mu+1,1);
+	ref=ref + tmp - Gamma(Gmu[mu])*tmp;
+	
+	tmp =adj(U[mu])*src;
+	tmp =Cshift(tmp,mu+1,-1);
+	ref=ref + tmp + Gamma(Gmu[mu])*tmp;
+      }
+      ref = -0.5*ref;
+    }
+
+    LatticeFermion src_e (FrbGrid);
+    LatticeFermion src_o (FrbGrid);
+    LatticeFermion r_e   (FrbGrid);
+    LatticeFermion r_o   (FrbGrid);
+    LatticeFermion r_eo  (FGrid);
+    LatticeFermion err   (FGrid);
+    {
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd,src_o,src);
+
+#if defined(AVX512) 
+      const int num_cases = 6;
+#else
+      const int num_cases = 4;
+#endif
+      controls Cases [] = {
+#if defined(AVX512) 
+	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
+	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
+#endif
+	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
+	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
+	{ QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
+	{ QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }
+      }; 
+
+      for(int c=0;c<num_cases;c++) {
+
+	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
+	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
+	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
+	if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+
+	QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
+	QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt;
+	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
+	int nwarm = 10;
+	double t0=usecond();
+	FGrid->Barrier();
+	for(int i=0;i<nwarm;i++){
+	  Dw.DhopEO(src_o,r_e,DaggerNo);
+	}
+	FGrid->Barrier();
+	double t1=usecond();
+	uint64_t ncall = (uint64_t) (2.5*1000.0*1000.0*nwarm/(t1-t0)); // cast the whole estimate: "(uint64_t) 2.5*..." truncated 2.5 to 2
+	if ( ncall < 2 ) ncall = 2; // time_statistics needs at least two samples for its error estimate
+	FGrid->Broadcast(0,&ncall,sizeof(ncall));
+	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
+	Dw.ZeroCounters();
+
+	time_statistics timestat;
+	std::vector<double> t_time(ncall);
+	for(uint64_t i=0;i<ncall;i++){
+	  t0=usecond();
+	  Dw.DhopEO(src_o,r_e,DaggerNo);
+	  t1=usecond();
+	  t_time[i] = t1-t0;
+	}
+	FGrid->Barrier();
+	
+	double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+	double flops=(1344.0*volume)/2;
+	double mf_hi, mf_lo, mf_err;
+
+	timestat.statistics(t_time);
+	mf_hi = flops/timestat.min;
+	mf_lo = flops/timestat.max;
+	mf_err= flops/timestat.min * timestat.err/timestat.mean;
+
+	mflops = flops/timestat.mean;
+
+	if ( mflops_best == 0   ) mflops_best = mflops;
+	if ( mflops_worst== 0   ) mflops_worst= mflops;
+	if ( mflops>mflops_best ) mflops_best = mflops;
+	if ( mflops<mflops_worst) mflops_worst= mflops;
+
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
+
+	Dw.Report();
+
+	Dw.DhopEO(src_o,r_e,DaggerNo);
+	Dw.DhopOE(src_e,r_o,DaggerNo);
+	setCheckerboard(r_eo,r_o);
+	setCheckerboard(r_eo,r_e);
+	err = r_eo-ref; 
+	std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
+	assert((norm2(err)<1.0e-4));
+
+      }
+      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best  mflop/s        =   "<< mflops_best <<std::endl;
+      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s        =   "<< mflops_worst<<std::endl;
+      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< mflops_worst/mflops_best <<std::endl;
+      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+    }
+  }
+
+};
+
+// Entry point: initialises Grid, reports the decomposition, then runs every enabled
+// benchmark section in turn (memory, comms, Wilson with Ls=1, domain wall with Ls=16).
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
+  LebesgueOrder::Block = std::vector<int>({2,2,2,2});
+
+  Benchmark::Decomposition();
+
+  // Hard-coded switches selecting which sections run
+  const bool run_memory = true;
+  const bool run_comms  = true;
+  const bool run_su3    = false;
+  const bool run_wilson = true;
+  const bool run_dwf    = true;
+
+  // Prints a section title framed by two rules
+  auto section = [](const char *title) {
+    const char *rule = "==================================================================================";
+    std::cout<<GridLogMessage << rule  <<std::endl;
+    std::cout<<GridLogMessage << title <<std::endl;
+    std::cout<<GridLogMessage << rule  <<std::endl;
+  };
+
+  if ( run_memory ) {
+    section(" Memory benchmark ");
+    Benchmark::Memory();
+  }
+
+  if ( run_comms ) {
+    section(" Communications benchmark ");
+    Benchmark::Comms();
+  }
+
+  if ( run_su3 ) {
+    // empty for now
+  }
+
+  if ( run_wilson ) {
+    const int Ls=1;
+    section(" Wilson dslash 4D vectorised");
+    const int volumes[] = {16,24,32};
+    for(int L : volumes) Benchmark::DWF(Ls,L);
+  }
+
+  if ( run_dwf ) {
+    const int Ls=16;
+    section(" Domain wall dslash 4D vectorised");
+    const int volumes[] = {8,12,16,24};
+    for(int L : volumes) Benchmark::DWF(Ls,L);
+  }
+
+  Grid_finalize();
+}

From 6d0d064a6c62f2c269df1a6aa557152543f8c186 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sat, 19 Aug 2017 23:11:30 +0100
Subject: [PATCH 160/177] Update TODO

---
 TODO | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/TODO b/TODO
index 001c6c0c..cccc5f45 100644
--- a/TODO
+++ b/TODO
@@ -2,18 +2,18 @@ TODO:
 ---------------
 
 Large item work list:
-1)- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O
 
+1)- BG/Q port and check
 2)- Christoph's local basis expansion Lanczos
-3)- BG/Q port and check
-4)- Precision conversion and sort out localConvert      <-- partial
+3)- Precision conversion and sort out localConvert      <-- partial
   - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
-5)- Physical propagator interface
-6)- Conserved currents
-7)- Multigrid Wilson and DWF, compare to other Multigrid implementations
-8)- HDCR resume
+4)- Physical propagator interface
+5)- Conserved currents
+6)- Multigrid Wilson and DWF, compare to other Multigrid implementations
+7)- HDCR resume
 
 Recent DONE 
+-- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O
 -- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- DONE
 -- GaugeFix into central location                      <-- DONE
 -- Scidac and Ildg metadata handling                   <-- DONE

From a446d95c3393d697f987434ac594950d18017b7a Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 20 Aug 2017 01:10:50 +0100
Subject: [PATCH 161/177] Trying to pass TeamCity and Travis

---
 benchmarks/Benchmark_ITT.cc               | 12 ++++++------
 lib/communicator/Communicator_base.cc     |  6 +++++-
 lib/communicator/Communicator_base.h      | 19 +++++++++++++------
 lib/communicator/Communicator_mpi3.cc     | 17 +++++++++++++----
 lib/qcd/action/fermion/WilsonFermion5D.cc | 18 ++++++++++--------
 lib/stencil/Stencil.h                     |  7 ++++++-
 lib/util/Init.cc                          | 18 ++++++++++++++----
 7 files changed, 67 insertions(+), 30 deletions(-)

diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc
index 4f16b1de..9bf7d0a5 100644
--- a/benchmarks/Benchmark_ITT.cc
+++ b/benchmarks/Benchmark_ITT.cc
@@ -218,7 +218,7 @@ public:
     std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
   
   uint64_t lmax=48;
-#define NLOOP (10*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
+#define NLOOP (50*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
 
     GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
     for(int lat=8;lat<=lmax;lat+=4){
@@ -368,7 +368,7 @@ public:
       const int num_cases = 4;
 #endif
       controls Cases [] = {
-#if defined(AVX512) 
+#ifdef AVX512
 	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
 #endif
@@ -380,6 +380,10 @@ public:
 
       for(int c=0;c<num_cases;c++) {
 
+	QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
+	QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt;
+	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
+
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
@@ -390,10 +394,6 @@ public:
 	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 
-
-	QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
-	QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt;
-	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
 	int nwarm = 10;
 	double t0=usecond();
 	FGrid->Barrier();
diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc
index 6767495f..3ce3a774 100644
--- a/lib/communicator/Communicator_base.cc
+++ b/lib/communicator/Communicator_base.cc
@@ -41,6 +41,7 @@ uint64_t            CartesianCommunicator::MAX_MPI_SHM_BYTES   = 128*1024*1024;
 CartesianCommunicator::CommunicatorPolicy_t  
 CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
 int CartesianCommunicator::nCommThreads = -1;
+int CartesianCommunicator::Hugepages = 0;
 
 /////////////////////////////////
 // Alloc, free shmem region
@@ -134,7 +135,10 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
 }
 void CartesianCommunicator::ShmInitGeneric(void){
 #if 1
-  ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE,  MAP_HUGETLB| MAP_SHARED | MAP_ANONYMOUS, -1, 0); 
+
+  int mmap_flag = MAP_SHARED | MAP_ANONYMOUS;
+  if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
+  ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0); 
   if (ShmCommBuf == (void *)MAP_FAILED) exit(EXIT_FAILURE);  
   std::cout << "ShmCommBuf "<<ShmCommBuf<<std::endl;
 #else 
diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h
index 84dbedb4..ac7d94f3 100644
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@@ -50,13 +50,24 @@ namespace Grid {
 class CartesianCommunicator {
   public:    
 
-  // 65536 ranks per node adequate for now
+
+  ////////////////////////////////////////////
+  // Isend/Irecv/Wait, or Sendrecv blocking
+  ////////////////////////////////////////////
+  enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
+  static CommunicatorPolicy_t CommunicatorPolicy;
+  static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
+
+  ///////////////////////////////////////////
+  // Up to 65536 ranks per node adequate for now
   // 128MB shared memory for comms enought for 48^4 local vol comms
   // Give external control (command line override?) of this
-
+  ///////////////////////////////////////////
   static const int MAXLOG2RANKSPERNODE = 16;            
   static uint64_t  MAX_MPI_SHM_BYTES;
   static int       nCommThreads;
+  // use explicit huge pages
+  static int       Hugepages;
 
   // Communicator should know nothing of the physics grid, only processor grid.
   int              _Nprocessors;     // How many in all
@@ -122,10 +133,6 @@ class CartesianCommunicator {
   /////////////////////////////////
   static void * ShmCommBuf;
 
-  // Isend/Irecv/Wait, or Sendrecv blocking
-  enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
-  static CommunicatorPolicy_t CommunicatorPolicy;
-  static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
   
   size_t heap_top;
   size_t heap_bytes;
diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc
index e6e33d33..4f769971 100644
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ -41,8 +41,13 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef HAVE_NUMAIF_H
 #include <numaif.h>
 #endif
+
+// Make up for linex deficiencies
 #ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
+#define SHM_HUGETLB 0x0
+#endif
+#ifndef MAP_HUGETLB
+#define MAP_HUGETLB 0x0
 #endif
 
 namespace Grid {
@@ -213,8 +218,11 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
       int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
       if ( fd < 0 ) {	perror("failed shm_open");	assert(0);      }
       ftruncate(fd, size);
+      
+      int mmap_flag = MAP_SHARED;
+      if (Hugepages) mmap_flag |= MAP_HUGETLB;
+      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
 
-      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
       if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
       assert(((uint64_t)ptr&0x3F)==0);
 
@@ -628,8 +636,9 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 						     int bytes,int dir)
 {
   std::vector<CommsRequest_t> list;
-  StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
+  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
   StencilSendToRecvFromComplete(list,dir);
+  return offbytes;
 }
 
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
@@ -671,7 +680,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
   }
 
   if ( CommunicatorPolicy == CommunicatorPolicySequential ) { 
-    this->StencilSendToRecvFromComplete(list);
+    this->StencilSendToRecvFromComplete(list,dir);
   }
 
   return off_node_bytes;
diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc
index 0b6c9e3d..404ecce0 100644
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -135,10 +135,11 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
 template<class Impl>
 void WilsonFermion5D<Impl>::Report(void)
 {
-    std::vector<int> latt = GridDefaultLatt();          
-    RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
-    RealD NP = _FourDimGrid->_Nprocessors;
-    RealD NN = _FourDimGrid->NodeCount();
+  RealD NP     = _FourDimGrid->_Nprocessors;
+  RealD NN     = _FourDimGrid->NodeCount();
+  RealD volume = Ls;  
+  std::vector<int> latt = _FourDimGrid->GlobalDimensions();
+  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
 
   if ( DhopCalls > 0 ) {
     std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
@@ -390,17 +391,18 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
   st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
   DhopFaceTime+=usecond();
 
-  // Rely on async comms; start comms before merge of local data
   double ctime=0;
   double ptime=0;
-  //  DhopComputeTime-=usecond();
-  //  DhopCommTime-=usecond();
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Ugly explicit thread mapping introduced for OPA reasons.
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
 #pragma omp parallel reduction(max:ctime) reduction(max:ptime)
   { 
     int tid = omp_get_thread_num();
     int nthreads = omp_get_num_threads();
     int ncomms = CartesianCommunicator::nCommThreads;
-    if (ncomms == -1) ncomms = st.Packets.size(); 
+    if (ncomms == -1) ncomms = 1;
     assert(nthreads > ncomms);
     if (tid >= ncomms) {
       double start = usecond();
diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h
index d1d7a7e0..cca67587 100644
--- a/lib/stencil/Stencil.h
+++ b/lib/stencil/Stencil.h
@@ -252,10 +252,15 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
   //////////////////////////////////////////
   void CommunicateThreaded()
   {
+#ifdef GRID_OMP
     // must be called in parallel region
     int mythread = omp_get_thread_num();
     int nthreads = CartesianCommunicator::nCommThreads;
-    if (nthreads == -1) nthreads = Packets.size();
+#else
+    int mythread = 0;
+    int nthreads = 1;
+#endif
+    if (nthreads == -1) nthreads = 1;
     if (mythread < nthreads) {
       for (int i = mythread; i < Packets.size(); i += nthreads) {
 	double start = usecond();
diff --git a/lib/util/Init.cc b/lib/util/Init.cc
index 39a726cf..3fd8b4cd 100644
--- a/lib/util/Init.cc
+++ b/lib/util/Init.cc
@@ -222,6 +222,11 @@ void Grid_init(int *argc,char ***argv)
     CartesianCommunicator::MAX_MPI_SHM_BYTES = MB*1024*1024;
   }
 
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--shm-hugepages") ){
+    CartesianCommunicator::Hugepages = 1;
+  }
+
+
   if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
     Grid_debug_handler_init();
   }
@@ -304,6 +309,7 @@ void Grid_init(int *argc,char ***argv)
     std::cout<<GridLogMessage<<"  --threads n     : default number of OMP threads"<<std::endl;
     std::cout<<GridLogMessage<<"  --grid n.n.n.n  : default Grid size"<<std::endl;
     std::cout<<GridLogMessage<<"  --shm  M        : allocate M megabytes of shared memory for comms"<<std::endl;
+    std::cout<<GridLogMessage<<"  --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;    
     std::cout<<GridLogMessage<<std::endl;
     std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
     std::cout<<GridLogMessage<<std::endl;
@@ -317,7 +323,7 @@ void Grid_init(int *argc,char ***argv)
     std::cout<<GridLogMessage<<std::endl;
     std::cout<<GridLogMessage<<"  --comms-concurrent : Asynchronous MPI calls; several dirs at a time "<<std::endl;    
     std::cout<<GridLogMessage<<"  --comms-sequential : Synchronous MPI calls; one dirs at a time "<<std::endl;    
-    std::cout<<GridLogMessage<<"  --comms-overlap : Overlap comms with compute "<<std::endl;    
+    std::cout<<GridLogMessage<<"  --comms-overlap    : Overlap comms with compute "<<std::endl;    
     std::cout<<GridLogMessage<<std::endl;
     std::cout<<GridLogMessage<<"  --dslash-generic: Wilson kernel for generic Nc"<<std::endl;    
     std::cout<<GridLogMessage<<"  --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;    
@@ -356,12 +362,13 @@ void Grid_init(int *argc,char ***argv)
   if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sequential") ){
     CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
   }
+
   if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
     LebesgueOrder::UseLebesgueOrder=1;
   }
   CartesianCommunicator::nCommThreads = -1;
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--commthreads") ){
-    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--commthreads");
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){
+    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads");
     GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads);
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
@@ -378,7 +385,10 @@ void Grid_init(int *argc,char ***argv)
 		  Grid_default_latt,
 		  Grid_default_mpi);
 
-  std::cout << GridLogDebug << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
+  std::cout << GridLogMessage << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
+  if ( CartesianCommunicator::Hugepages) {
+    std::cout << GridLogMessage << "Mapped stencil comms buffers as MAP_HUGETLB "<<std::endl;
+  }
 
   if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
     std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";

From 383ca7d39239d99a90e11df7a73918bea0268927 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 20 Aug 2017 01:27:48 +0100
Subject: [PATCH 162/177] Switch off comms for now until
 feature/multi-communicator is merged

---
 benchmarks/Benchmark_ITT.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc
index 4f16b1de..91524149 100644
--- a/benchmarks/Benchmark_ITT.cc
+++ b/benchmarks/Benchmark_ITT.cc
@@ -160,7 +160,7 @@ public:
 		int comm_proc = mpi_layout[mu]-1;
 		Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	      }
-#if 1
+#if 0
 	      tbytes= Grid.StencilSendToRecvFromBegin(requests,
 						      (void *)&xbuf[dir][0],
 						      xmit_to_rank,

From 11062fb6861153ffafa6d821f8ee53f01f5f72a4 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 20 Aug 2017 01:37:07 +0100
Subject: [PATCH 163/177] Comms none fail fix

---
 lib/communicator/Communicator_base.cc | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc
index 3ce3a774..2e6626be 100644
--- a/lib/communicator/Communicator_base.cc
+++ b/lib/communicator/Communicator_base.cc
@@ -102,6 +102,18 @@ int                      CartesianCommunicator::NodeCount(void)    { return Proc
 int                      CartesianCommunicator::RankCount(void)    { return ProcessorCount();};
 #endif
 #if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPIT)
+double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
+						     int xmit_to_rank,
+						     void *recv,
+						     int recv_from_rank,
+						     int bytes, int dir)
+{
+  std::vector<CommsRequest_t> list;
+  // Discard the "dir"
+  SendToRecvFromBegin   (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
+  SendToRecvFromComplete(list);
+  return 2.0*bytes;
+}
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int xmit_to_rank,

From 1cdf99966810227f180452393973c87ae4a301c4 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 20 Aug 2017 02:39:10 +0100
Subject: [PATCH 164/177] Moving multicommunicator into mpi3 also for threading

---
 lib/communicator/Communicator_base.h  |  8 ++++----
 lib/communicator/Communicator_mpi3.cc | 12 ++++++++++--
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h
index ac7d94f3..ac866ced 100644
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@@ -78,15 +78,15 @@ class CartesianCommunicator {
 
 #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
   static MPI_Comm communicator_world;
-         MPI_Comm communicator;
+
+  MPI_Comm              communicator;
+  std::vector<MPI_Comm> communicator_halo;
+
   typedef MPI_Request CommsRequest_t;
 #else 
   typedef int CommsRequest_t;
 #endif
 
-#if defined (GRID_COMMS_MPIT)
-  std::vector<MPI_Comm> communicator_halo;
-#endif
 
   ////////////////////////////////////////////////////////////////////
   // Helper functionality for SHM Windows common to all other impls
diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc
index 4f769971..9e5dfb97 100644
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ -405,8 +405,14 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 { 
   int ierr;
   communicator=communicator_world;
+
   _ndimension = processors.size();
 
+  communicator_halo.resize (2*_ndimension);
+  for(int i=0;i<_ndimension*2;i++){
+    MPI_Comm_dup(communicator,&communicator_halo[i]);
+  }
+
   ////////////////////////////////////////////////////////////////
   // Assert power of two shm_size.
   ////////////////////////////////////////////////////////////////
@@ -648,6 +654,8 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 							 int from,
 							 int bytes,int dir)
 {
+  assert(dir < communicator_halo.size());
+
   MPI_Request xrq;
   MPI_Request rrq;
 
@@ -666,14 +674,14 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
   gfrom = MPI_UNDEFINED;
 #endif
   if ( gfrom ==MPI_UNDEFINED) {
-    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
+    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[dir],&rrq);
     assert(ierr==0);
     list.push_back(rrq);
     off_node_bytes+=bytes;
   }
 
   if ( gdest == MPI_UNDEFINED ) {
-    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
+    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[dir],&xrq);
     assert(ierr==0);
     list.push_back(xrq);
     off_node_bytes+=bytes;

From ae56e556c64fe1b2cdd2d302ef63294cc322aeaf Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 20 Aug 2017 02:53:12 +0100
Subject: [PATCH 165/177] finalise issue on new OPA revert

---
 benchmarks/Benchmark_dwf.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc
index 98ce0a07..3858226e 100644
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -503,9 +503,9 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
   std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
 
-  //assert(norm2(src_e)<1.0e-4);
-  //assert(norm2(src_o)<1.0e-4);
-  exit(0);
+  assert(norm2(src_e)<1.0e-4);
+  assert(norm2(src_o)<1.0e-4);
   Grid_finalize();
+  exit(0);
 }
 

From b49bec0cec4a876462fa1f042556d595f3d88416 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 20 Aug 2017 03:08:54 +0100
Subject: [PATCH 166/177] MAP_HUGETLB portability fix

---
 lib/communicator/Communicator_base.cc |  2 ++
 lib/communicator/Communicator_mpi3.cc | 15 +++++++--------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc
index 2e6626be..3378c56a 100644
--- a/lib/communicator/Communicator_base.cc
+++ b/lib/communicator/Communicator_base.cc
@@ -149,7 +149,9 @@ void CartesianCommunicator::ShmInitGeneric(void){
 #if 1
 
   int mmap_flag = MAP_SHARED | MAP_ANONYMOUS;
+#ifdef MAP_HUGETLB
   if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
+#endif
   ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0); 
   if (ShmCommBuf == (void *)MAP_FAILED) exit(EXIT_FAILURE);  
   std::cout << "ShmCommBuf "<<ShmCommBuf<<std::endl;
diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc
index 9e5dfb97..204993fd 100644
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ -42,13 +42,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <numaif.h>
 #endif
 
-// Make up for linex deficiencies
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 0x0
-#endif
-#ifndef MAP_HUGETLB
-#define MAP_HUGETLB 0x0
-#endif
 
 namespace Grid {
 
@@ -220,7 +213,9 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
       ftruncate(fd, size);
       
       int mmap_flag = MAP_SHARED;
+#ifdef MAP_HUGETLB
       if (Hugepages) mmap_flag |= MAP_HUGETLB;
+#endif
       void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
 
       if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
@@ -274,7 +269,11 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
     for(int r=0;r<ShmSize;r++){
       size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
       key_t key   = 0x4545 + r;
-      if ((shmids[r]= shmget(key,size, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) {
+      int flags = IPC_CREAT | SHM_R | SHM_W;
+#ifdef SHM_HUGETLB
+      flags|=SHM_HUGETLB;
+#endif
+      if ((shmids[r]= shmget(key,size, flags)) < 0) {
 	int errsv = errno;
 	printf("Errno %d\n",errsv);
 	perror("shmget");

From d9cd4f027336b650f7da0add0f3f380611cda7db Mon Sep 17 00:00:00 2001
From: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Date: Wed, 23 Aug 2017 15:07:18 +0100
Subject: [PATCH 167/177] Staggered multinode block cg debugged. Missing global
 sum. Code stalls and resumes on KNL at cambridge. Curious.

CG iterations take 23 ms each, followed by pauses of 3200 ms. The mean bandwidth
is reported as 200 MB/s, and comms are dominant in the report. However, the timing
behaviour suggests it is *bursty*... Could it be swapping to disk?
---
 .../iterative/BlockConjugateGradient.h        |  9 ++-
 lib/lattice/Lattice_reduction.h               | 38 ++++++++----
 .../fermion/ImprovedStaggeredFermion5D.cc     | 60 +++++++++++++++++++
 .../fermion/ImprovedStaggeredFermion5D.h      | 10 ++++
 .../solver/Test_staggered_block_cg_unprec.cc  |  8 ++-
 5 files changed, 110 insertions(+), 15 deletions(-)

diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h
index 9418f63c..d7817c05 100644
--- a/lib/algorithms/iterative/BlockConjugateGradient.h
+++ b/lib/algorithms/iterative/BlockConjugateGradient.h
@@ -199,7 +199,12 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
 
   Linop.HermOp(X, AD);
   tmp = B - AD;  
+  //std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl;
   ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
+  //std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl;
+  //std::cout << GridLogMessage << " m_rr " << m_rr<<std::endl;
+  //std::cout << GridLogMessage << " m_C " << m_C<<std::endl;
+  //std::cout << GridLogMessage << " m_Cinv " << m_Cinv<<std::endl;
   D=Q;
 
   std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
@@ -221,13 +226,15 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
     MatrixTimer.Start();
     Linop.HermOp(D, Z);      
     MatrixTimer.Stop();
+    //std::cout << GridLogMessage << " norm2 Z " <<norm2(Z)<<std::endl;
 
     //4. M  = [D^dag Z]^{-1}
     sliceInnerTimer.Start();
     sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
     sliceInnerTimer.Stop();
     m_M       = m_DZ.inverse();
-
+    //std::cout << GridLogMessage << " m_DZ " <<m_DZ<<std::endl;
+    
     //5. X  = X + D MC
     m_tmp     = m_M * m_C;
     sliceMaddTimer.Start();
diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h
index 38982891..db012c8c 100644
--- a/lib/lattice/Lattice_reduction.h
+++ b/lib/lattice/Lattice_reduction.h
@@ -369,6 +369,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
   }
 };
 
+/*
 inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
 {
   int NN    = BlockSolverGrid->_ndimension;
@@ -387,6 +388,7 @@ inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Or
   }
   return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); 
 }
+*/
 
 template<class vobj>
 static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
@@ -398,14 +400,15 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
   int Nblock = X._grid->GlobalDimensions()[Orthog];
 
   GridBase *FullGrid  = X._grid;
-  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
 
-  Lattice<vobj> Xslice(SliceGrid);
-  Lattice<vobj> Rslice(SliceGrid);
+  //  Lattice<vobj> Xslice(SliceGrid);
+  //  Lattice<vobj> Rslice(SliceGrid);
 
   assert( FullGrid->_simd_layout[Orthog]==1);
   int nh =  FullGrid->_ndimension;
-  int nl = SliceGrid->_ndimension;
+  //  int nl = SliceGrid->_ndimension;
+  int nl = nh-1;
 
   //FIXME package in a convenient iterator
   //Should loop over a plane orthogonal to direction "Orthog"
@@ -448,14 +451,14 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
   int Nblock = X._grid->GlobalDimensions()[Orthog];
 
   GridBase *FullGrid  = X._grid;
-  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
-
-  Lattice<vobj> Xslice(SliceGrid);
-  Lattice<vobj> Rslice(SliceGrid);
+  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+  //  Lattice<vobj> Xslice(SliceGrid);
+  //  Lattice<vobj> Rslice(SliceGrid);
 
   assert( FullGrid->_simd_layout[Orthog]==1);
   int nh =  FullGrid->_ndimension;
-  int nl = SliceGrid->_ndimension;
+  //  int nl = SliceGrid->_ndimension;
+  int nl=1;
 
   //FIXME package in a convenient iterator
   //Should loop over a plane orthogonal to direction "Orthog"
@@ -498,18 +501,19 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
   typedef typename vobj::vector_type vector_type;
   
   GridBase *FullGrid  = lhs._grid;
-  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
   
   int Nblock = FullGrid->GlobalDimensions()[Orthog];
   
-  Lattice<vobj> Lslice(SliceGrid);
-  Lattice<vobj> Rslice(SliceGrid);
+  //  Lattice<vobj> Lslice(SliceGrid);
+  //  Lattice<vobj> Rslice(SliceGrid);
   
   mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
 
   assert( FullGrid->_simd_layout[Orthog]==1);
   int nh =  FullGrid->_ndimension;
-  int nl = SliceGrid->_ndimension;
+  //  int nl = SliceGrid->_ndimension;
+  int nl = nh-1;
 
   //FIXME package in a convenient iterator
   //Should loop over a plane orthogonal to direction "Orthog"
@@ -550,6 +554,14 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
       mat += mat_thread;
     }  
   }
+
+  for(int i=0;i<Nblock;i++){
+  for(int j=0;j<Nblock;j++){
+    ComplexD sum = mat(i,j);
+    FullGrid->GlobalSum(sum);
+    mat(i,j)=sum;
+  }}
+
   return;
 }
 
diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
index 61a3c559..7d988d89 100644
--- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
+++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
@@ -230,8 +230,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
 {
   Compressor compressor;
   int LLs = in._grid->_rdimensions[0];
+
+
+
+  DhopTotalTime -= usecond();
+  DhopCommTime -= usecond();
   st.HaloExchange(in,compressor);
+  DhopCommTime += usecond();
   
+  DhopComputeTime -= usecond();
   // Dhop takes the 4d grid from U, and makes a 5d index for fermion
   if (dag == DaggerYes) {
     parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
@@ -244,12 +251,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
 	Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
     }
   }
+  DhopComputeTime += usecond();
+  DhopTotalTime   += usecond();
 }
 
 
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 {
+  DhopCalls+=1;
   conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
   conformable(in._grid,out._grid); // drops the cb check
 
@@ -261,6 +271,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
+  DhopCalls+=1;
   conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
   conformable(in._grid,out._grid); // drops the cb check
 
@@ -272,6 +283,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
+  DhopCalls+=2;
   conformable(in._grid,FermionGrid()); // verifies full grid
   conformable(in._grid,out._grid);
 
@@ -280,6 +292,54 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
   DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
 }
 
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::Report(void) 
+{
+  std::vector<int> latt = GridDefaultLatt();          
+  RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
+  RealD NP = _FourDimGrid->_Nprocessors;
+  RealD NN = _FourDimGrid->NodeCount();
+
+  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls   : " 
+	    << DhopCalls   << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime   /Calls       : " 
+	    << DhopTotalTime   / DhopCalls << " us" << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime    /Calls       : " 
+	    << DhopCommTime    / DhopCalls << " us" << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls        : " 
+	    << DhopComputeTime / DhopCalls << " us" << std::endl;
+
+  // Average the compute time
+  _FourDimGrid->GlobalSum(DhopComputeTime);
+  DhopComputeTime/=NP;
+
+  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
+  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
+  
+  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
+  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil"    <<std::endl;  Stencil.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven"<<std::endl;  StencilEven.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
+}
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void) 
+{
+  DhopCalls       = 0;
+  DhopTotalTime    = 0;
+  DhopCommTime    = 0;
+  DhopComputeTime = 0;
+  Stencil.ZeroCounters();
+  StencilEven.ZeroCounters();
+  StencilOdd.ZeroCounters();
+}
 
 /////////////////////////////////////////////////////////////////////////
 // Implement the general interface. Here we use SAME mass on all slices
diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h
index 4961da49..ca1a955a 100644
--- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@@ -55,6 +55,16 @@ namespace QCD {
       FermionField _tmp;
       FermionField &tmp(void) { return _tmp; }
 
+      ////////////////////////////////////////
+      // Performance monitoring
+      ////////////////////////////////////////
+      void Report(void);
+      void ZeroCounters(void);
+      double DhopTotalTime;
+      double DhopCalls;
+      double DhopCommTime;
+      double DhopComputeTime;
+
       ///////////////////////////////////////////////////////////////
       // Implement the abstract base
       ///////////////////////////////////////////////////////////////
diff --git a/tests/solver/Test_staggered_block_cg_unprec.cc b/tests/solver/Test_staggered_block_cg_unprec.cc
index 8db41e98..f54bc3b2 100644
--- a/tests/solver/Test_staggered_block_cg_unprec.cc
+++ b/tests/solver/Test_staggered_block_cg_unprec.cc
@@ -75,7 +75,7 @@ int main (int argc, char ** argv)
   LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);
 
   RealD mass=0.003;
-  ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass);
+  ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass); 
   MdagMLinearOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds);
 
   ConjugateGradient<FermionField> CG(1.0e-8,10000);
@@ -99,21 +99,27 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << " Calling 5d CG for "<<Ls <<" right hand sides" <<std::endl;
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   result=zero;
+  Ds.ZeroCounters();
   CG(HermOp,src,result);
+  Ds.Report();
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   std::cout << GridLogMessage << " Calling multiRHS CG for "<<Ls <<" right hand sides" <<std::endl;
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   result=zero;
+  Ds.ZeroCounters();
   mCG(HermOp,src,result);
+  Ds.Report();
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   std::cout << GridLogMessage << " Calling Block CG for "<<Ls <<" right hand sides" <<std::endl;
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   result=zero;
+  Ds.ZeroCounters();
   BCGrQ(HermOp,src,result);
+  Ds.Report();
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 
 

From 5fa386ddc96fc13146e528ea4d3d92ec9552c49e Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 24 Aug 2017 10:17:52 +0100
Subject: [PATCH 168/177] FFT test compile fixed

---
 lib/qcd/utils/GaugeFix.h    | 3 +++
 tests/core/Test_fft_gfix.cc | 4 ++++
 2 files changed, 7 insertions(+)

diff --git a/lib/qcd/utils/GaugeFix.h b/lib/qcd/utils/GaugeFix.h
index f2ea1aa2..c4ea31aa 100644
--- a/lib/qcd/utils/GaugeFix.h
+++ b/lib/qcd/utils/GaugeFix.h
@@ -26,6 +26,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
     /*  END LEGAL */
 //#include <Grid/Grid.h>
 
+#ifndef GRID_QCD_GAUGE_FIX_H
+#define GRID_QCD_GAUGE_FIX_H
 namespace Grid {
 namespace QCD {
 
@@ -188,3 +190,4 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
 
 }
 }
+#endif
diff --git a/tests/core/Test_fft_gfix.cc b/tests/core/Test_fft_gfix.cc
index 9732eb85..916c4b0b 100644
--- a/tests/core/Test_fft_gfix.cc
+++ b/tests/core/Test_fft_gfix.cc
@@ -28,6 +28,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
     /*  END LEGAL */
 #include <Grid/Grid.h>
 
+using namespace Grid;
+using namespace Grid::QCD;
+
 int main (int argc, char ** argv)
 {
   std::vector<int> seeds({1,2,3,4});
@@ -82,6 +85,7 @@ int main (int argc, char ** argv)
 
   Uorg = Uorg - Umu;
   std::cout << " Norm Difference "<< norm2(Uorg) << std::endl;
+  std::cout << " Norm "<< norm2(Umu) << std::endl;
 
 
   std::cout<< "*****************************************************************" <<std::endl;

From 102ea9ae668a1c8eef506113348355e9d78fd522 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Thu, 24 Aug 2017 18:17:09 +0100
Subject: [PATCH 169/177] CI update

---
 .travis.yml | 68 -----------------------------------------------------
 README.md   | 16 +------------
 2 files changed, 1 insertion(+), 83 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 64dae823..7d8203ce 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,68 +9,6 @@ matrix:
     - os:        osx
       osx_image: xcode8.3
       compiler: clang
-    - compiler: gcc
-      dist: trusty
-      sudo: required
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - g++-4.9
-            - libmpfr-dev
-            - libgmp-dev
-            - libmpc-dev
-            - libopenmpi-dev
-            - openmpi-bin
-            - binutils-dev
-      env: VERSION=-4.9
-    - compiler: gcc
-      dist: trusty
-      sudo: required
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - g++-5
-            - libmpfr-dev
-            - libgmp-dev
-            - libmpc-dev
-            - libopenmpi-dev
-            - openmpi-bin
-            - binutils-dev
-      env: VERSION=-5
-    - compiler: clang
-      dist: trusty
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - g++-4.8
-            - libmpfr-dev
-            - libgmp-dev
-            - libmpc-dev
-            - libopenmpi-dev
-            - openmpi-bin
-            - binutils-dev
-      env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
-    - compiler: clang
-      dist: trusty
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - g++-4.8
-            - libmpfr-dev
-            - libgmp-dev
-            - libmpc-dev
-            - libopenmpi-dev
-            - openmpi-bin
-            - binutils-dev
-      env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
       
 before_install:
     - export GRIDDIR=`pwd`
@@ -106,9 +44,3 @@ script:
     - make -j4
     - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
     - make check
-    - echo make clean
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto ; fi
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then make -j4; fi
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
-
-
diff --git a/README.md b/README.md
index 1e0988f3..13dd6996 100644
--- a/README.md
+++ b/README.md
@@ -1,18 +1,4 @@
-# Grid
-<table>
-<tr>
-    <td>Last stable release</td>
-    <td><a href="https://travis-ci.org/paboyle/Grid">
-    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=master"></a>
-    </td>
-</tr>
-<tr>
-    <td>Development branch</td>
-    <td><a href="https://travis-ci.org/paboyle/Grid">
-    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=develop"></a>
-    </td>
-</tr>
-</table>
+# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=Grid&tab=projectOverview) [![Travis status](https://travis-ci.org/paboyle/Grid.svg?branch=develop)](https://travis-ci.org/paboyle/Grid)
 
 **Data parallel C++ mathematical object library.**
 

From c3b1263e75212356fc1aa061cd226db70f4f00fc Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 25 Aug 2017 09:25:54 +0100
Subject: [PATCH 170/177] Benchmark prep

---
 benchmarks/Benchmark_ITT.cc               | 322 +++++++++++++++++++---
 benchmarks/Benchmark_comms.cc             |  30 +-
 lib/allocator/AlignedAllocator.h          |   5 +
 lib/communicator/Communicator_base.cc     |   6 +-
 lib/communicator/Communicator_mpi3.cc     |   5 +-
 lib/communicator/Communicator_mpit.cc     |  19 +-
 lib/qcd/action/fermion/CayleyFermion5D.cc |  12 +-
 lib/qcd/action/fermion/WilsonCompressor.h |  41 ++-
 lib/qcd/action/fermion/WilsonFermion5D.cc |  11 +
 lib/stencil/Stencil.h                     | 114 +++++++-
 10 files changed, 494 insertions(+), 71 deletions(-)

diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc
index 9bf7d0a5..c5226ee1 100644
--- a/benchmarks/Benchmark_ITT.cc
+++ b/benchmarks/Benchmark_ITT.cc
@@ -32,6 +32,19 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
+typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
+typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
+typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
+
+
+std::vector<int> L_list;
+std::vector<int> Ls_list;
+std::vector<double> mflop_list;
+
+double mflop_ref;
+double mflop_ref_err;
+
+int NN_global;
 
 struct time_statistics{
   double mean;
@@ -95,13 +108,15 @@ public:
 
   static void Comms(void)
   {
-    int Nloop=100;
+    int Nloop=1000;
     int nmu=0;
     int maxlat=32;
 
     std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
     std::vector<int> mpi_layout  = GridDefaultMpi();
 
+    for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
+
     std::vector<double> t_time(Nloop);
     time_statistics timestat;
 
@@ -133,13 +148,14 @@ public:
 	  bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	}
 
-	int ncomm;
 	int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+	int ncomm;
 	double dbytes;
+	std::vector<double> times(Nloop);
 	for(int i=0;i<Nloop;i++){
+
 	  double start=usecond();
 
-	  std::vector<CartesianCommunicator::CommsRequest_t> requests;
 	  dbytes=0;
 	  ncomm=0;
 
@@ -150,7 +166,6 @@ public:
 
 	    if (mpi_layout[mu]>1 ) {
 	        
-	      ncomm++;
 	      int xmit_to_rank;
 	      int recv_from_rank;
 	      if ( dir == mu ) { 
@@ -160,18 +175,18 @@ public:
 		int comm_proc = mpi_layout[mu]-1;
 		Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	      }
-#if 1
-	      tbytes= Grid.StencilSendToRecvFromBegin(requests,
-						      (void *)&xbuf[dir][0],
-						      xmit_to_rank,
-						      (void *)&rbuf[dir][0],
-						      recv_from_rank,
-						      bytes,dir);
-	      Grid.StencilSendToRecvFromComplete(requests,dir);
-#endif
-	      requests.resize(0);
-
+	      tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
+						 (void *)&rbuf[dir][0], recv_from_rank,
+						 bytes,dir);
+	  
+#ifdef GRID_OMP
 #pragma omp atomic
+#endif
+	      ncomm++;
+
+#ifdef GRID_OMP
+#pragma omp atomic
+#endif
 	      dbytes+=tbytes;
 	    }
 	  }
@@ -181,13 +196,15 @@ public:
 	}
 
 	timestat.statistics(t_time);
+	//	for(int i=0;i<t_time.size();i++){
+	  //	  std::cout << i<<" "<<t_time[i]<<std::endl;
+	//	}
 
 	dbytes=dbytes*ppn;
 	double xbytes    = dbytes*0.5;
 	double rbytes    = dbytes*0.5;
 	double bidibytes = dbytes;
 
-
 	std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
 		 <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
 		 <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
@@ -196,7 +213,8 @@ public:
 		 << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
 
  
-      }
+	
+	    }
     }    
 
     return;
@@ -218,7 +236,7 @@ public:
     std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
   
   uint64_t lmax=48;
-#define NLOOP (50*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
+#define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
 
     GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
     for(int lat=8;lat<=lmax;lat+=4){
@@ -253,8 +271,7 @@ public:
     }
   };
 
-
-  static void DWF(int Ls,int L)
+  static double DWF5(int Ls,int L)
   {
     RealD mass=0.1;
     RealD M5  =1.8;
@@ -262,6 +279,7 @@ public:
     double mflops;
     double mflops_best = 0;
     double mflops_worst= 0;
+    std::vector<double> mflops_all;
 
     ///////////////////////////////////////////////////////
     // Set/Get the layout & grid size
@@ -274,6 +292,189 @@ public:
 								       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
     uint64_t NP = TmpGrid->RankCount();
     uint64_t NN = TmpGrid->NodeCount();
+    NN_global=NN;
+    uint64_t SHM=NP/NN;
+
+    std::vector<int> internal;
+    if      ( SHM == 1 )   internal = std::vector<int>({1,1,1,1});
+    else if ( SHM == 2 )   internal = std::vector<int>({2,1,1,1});
+    else if ( SHM == 4 )   internal = std::vector<int>({2,2,1,1});
+    else if ( SHM == 8 )   internal = std::vector<int>({2,2,2,1});
+    else assert(0);
+
+    std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
+    std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
+
+    ///////// Welcome message ////////////
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "Benchmark DWF Ls vec on "<<L<<"^4 local volume "<<std::endl;
+    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
+    std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
+    std::cout<<GridLogMessage << "* MPI ranks      : "<<GridCmdVectorIntToString(mpi)<<std::endl;
+    std::cout<<GridLogMessage << "* Intranode      : "<<GridCmdVectorIntToString(internal)<<std::endl;
+    std::cout<<GridLogMessage << "* nodes          : "<<GridCmdVectorIntToString(nodes)<<std::endl;
+    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+    ///////// Lattice Init ////////////
+    GridCartesian         * UGrid    = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+    GridRedBlackCartesian * UrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+    GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
+    GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
+    GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
+    GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
+
+    ///////// RNG Init ////////////
+    std::vector<int> seeds4({1,2,3,4});
+    std::vector<int> seeds5({5,6,7,8});
+    GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+    GridParallelRNG          RNG5(sFGrid);  RNG5.SeedFixedIntegers(seeds5);
+    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+    ///////// Source preparation ////////////
+    LatticeFermion src   (sFGrid); random(RNG5,src);
+    LatticeFermion tmp   (sFGrid);
+
+    RealD N2 = 1.0/::sqrt(norm2(src));
+    src = src*N2;
+    
+    LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu); 
+
+    WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
+    LatticeFermion src_e (sFrbGrid);
+    LatticeFermion src_o (sFrbGrid);
+    LatticeFermion r_e   (sFrbGrid);
+    LatticeFermion r_o   (sFrbGrid);
+    LatticeFermion r_eo  (sFGrid);
+    LatticeFermion err   (sFGrid);
+    {
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd,src_o,src);
+
+#if defined(AVX512) 
+      const int num_cases = 6;
+      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
+#else
+      const int num_cases = 4;
+      std::string fmt("U/S ; U/O ; G/S ; G/O ");
+#endif
+      controls Cases [] = {
+#ifdef AVX512
+	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
+	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
+#endif
+	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
+	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
+	{ QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
+	{ QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }
+      }; 
+
+      for(int c=0;c<num_cases;c++) {
+
+	QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
+	QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt;
+	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
+
+	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
+	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
+	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
+	if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+	int nwarm = 200;
+	double t0=usecond();
+	sFGrid->Barrier();
+	for(int i=0;i<nwarm;i++){
+	  sDw.DhopEO(src_o,r_e,DaggerNo);
+	}
+	sFGrid->Barrier();
+	double t1=usecond();
+	//	uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
+	//	if (ncall < 500) ncall = 500;
+	uint64_t ncall = 1000;
+
+	sFGrid->Broadcast(0,&ncall,sizeof(ncall));
+
+	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
+	sDw.ZeroCounters();
+
+	time_statistics timestat;
+	std::vector<double> t_time(ncall);
+	for(uint64_t i=0;i<ncall;i++){
+	  t0=usecond();
+	  sDw.DhopEO(src_o,r_e,DaggerNo);
+	  t1=usecond();
+	  t_time[i] = t1-t0;
+	}
+	sFGrid->Barrier();
+	
+	double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+	double flops=(1344.0*volume)/2;
+	double mf_hi, mf_lo, mf_err;
+
+	timestat.statistics(t_time);
+	mf_hi = flops/timestat.min;
+	mf_lo = flops/timestat.max;
+	mf_err= flops/timestat.min * timestat.err/timestat.mean;
+
+	mflops = flops/timestat.mean;
+	mflops_all.push_back(mflops);
+	if ( mflops_best == 0   ) mflops_best = mflops;
+	if ( mflops_worst== 0   ) mflops_worst= mflops;
+	if ( mflops>mflops_best ) mflops_best = mflops;
+	if ( mflops<mflops_worst) mflops_worst= mflops;
+
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per rank   "<< mflops/NP<<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per node   "<< mflops/NN<<std::endl;
+
+	sDw.Report();
+
+      }
+      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
+      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
+      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< mflops_worst/mflops_best <<std::endl;
+      std::cout<<GridLogMessage <<fmt << std::endl;
+      std::cout<<GridLogMessage ;
+
+      for(int i=0;i<mflops_all.size();i++){
+	std::cout<<mflops_all[i]/NN<<" ; " ;
+      }
+      std::cout<<std::endl;
+      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+    }
+    return mflops_best;
+  }
+
+  static double DWF(int Ls,int L)
+  {
+    RealD mass=0.1;
+    RealD M5  =1.8;
+
+    double mflops;
+    double mflops_best = 0;
+    double mflops_worst= 0;
+    std::vector<double> mflops_all;
+
+    ///////////////////////////////////////////////////////
+    // Set/Get the layout & grid size
+    ///////////////////////////////////////////////////////
+    int threads = GridThread::GetThreads();
+    std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
+    std::vector<int> local({L,L,L,L});
+
+    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}), 
+								       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+    uint64_t NP = TmpGrid->RankCount();
+    uint64_t NN = TmpGrid->NodeCount();
+    NN_global=NN;
     uint64_t SHM=NP/NN;
 
     std::vector<int> internal;
@@ -364,13 +565,15 @@ public:
 
 #if defined(AVX512) 
       const int num_cases = 6;
+      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
 #else
       const int num_cases = 4;
+      std::string fmt("U/S ; U/O ; G/S ; G/O ");
 #endif
       controls Cases [] = {
 #ifdef AVX512
-	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
+	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
 #endif
 	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
@@ -394,7 +597,7 @@ public:
 	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 
-	int nwarm = 10;
+	int nwarm = 200;
 	double t0=usecond();
 	FGrid->Barrier();
 	for(int i=0;i<nwarm;i++){
@@ -402,7 +605,10 @@ public:
 	}
 	FGrid->Barrier();
 	double t1=usecond();
-	uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
+	//	uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
+	//	if (ncall < 500) ncall = 500;
+	uint64_t ncall = 1000;
+
 	FGrid->Broadcast(0,&ncall,sizeof(ncall));
 
 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
@@ -428,7 +634,7 @@ public:
 	mf_err= flops/timestat.min * timestat.err/timestat.mean;
 
 	mflops = flops/timestat.mean;
-
+	mflops_all.push_back(mflops);
 	if ( mflops_best == 0   ) mflops_best = mflops;
 	if ( mflops_worst== 0   ) mflops_worst= mflops;
 	if ( mflops>mflops_best ) mflops_best = mflops;
@@ -450,12 +656,20 @@ public:
 
       }
       std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best  mflop/s        =   "<< mflops_best <<std::endl;
-      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s        =   "<< mflops_worst<<std::endl;
+      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
+      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
       std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< mflops_worst/mflops_best <<std::endl;
+      std::cout<<GridLogMessage <<fmt << std::endl;
+      std::cout<<GridLogMessage ;
+
+      for(int i=0;i<mflops_all.size();i++){
+	std::cout<<mflops_all[i]/NN<<" ; " ;
+      }
+      std::cout<<std::endl;
       std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 
     }
+    return mflops_best;
   }
 
 };
@@ -493,26 +707,66 @@ int main (int argc, char ** argv)
     // empty for now
   }
 
+  int sel=2;
+  std::vector<int> L_list({8,12,16,24});
+  std::vector<double> wilson;
+  std::vector<double> dwf4;
+  std::vector<double> dwf5;
+
   if ( do_wilson ) {
     int Ls=1;
     std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
     std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
     std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    Benchmark::DWF(Ls,16);
-    Benchmark::DWF(Ls,24);
-    Benchmark::DWF(Ls,32);
+    for(int l=0;l<L_list.size();l++){
+      wilson.push_back(Benchmark::DWF(1,L_list[l]));
+    }
   }
 
+  int Ls=16;
   if ( do_dwf ) {
-    int Ls=16;
     std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
     std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
     std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    Benchmark::DWF(Ls,8);
-    Benchmark::DWF(Ls,12);
-    Benchmark::DWF(Ls,16);
-    Benchmark::DWF(Ls,24);
+    for(int l=0;l<L_list.size();l++){
+      dwf4.push_back(Benchmark::DWF(Ls,L_list[l]));
+    }
   }
 
+  if ( do_dwf ) {
+    // 5D-vectorised pass: DWF5() returns its best mflop/s for each local volume in L_list.
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " Domain wall dslash 5D vectorised" <<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    for(int l=0;l<L_list.size();l++){
+      dwf5.push_back(Benchmark::DWF5(Ls,L_list[l]));
+    }
+    // wilson[] is filled only under do_wilson; pad it with zeros so the tables below never index past its end.
+    if ( wilson.size() < L_list.size() ) wilson.resize(L_list.size(),0.0);
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "L \t\t Wilson \t DWF4 \t DWF5 " <<std::endl;
+    for(int l=0;l<L_list.size();l++){
+      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t "<<dwf4[l]<<" \t "<<dwf5[l] <<std::endl;
+    }
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    // Per-node figures divide by NN_global, the node count recorded by the DWF/DWF5 calls.
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    int NN=NN_global;
+    std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4  \t\t DWF5 " <<std::endl;
+    for(int l=0;l<L_list.size();l++){
+      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<" \t "<<dwf5[l] /NN<<std::endl;
+    }
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    // Reference point: the per-node 4D-vectorised DWF result at L_list[sel].
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " Comparison point result: "  << dwf4[sel]/NN <<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  }
+
+
   Grid_finalize();
 }
diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc
index 491fba1e..a270e3fa 100644
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -92,11 +92,16 @@ int main (int argc, char ** argv)
       RealD Nnode = Grid.NodeCount();
       RealD ppn = Nrank/Nnode;
 
-      Vector<Vector<HalfSpinColourVectorD> > xbuf(8,Vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
-      Vector<Vector<HalfSpinColourVectorD> > rbuf(8,Vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+      std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);	
+      std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
 
       int ncomm;
       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+      for(int mu=0;mu<8;mu++){
+	xbuf[mu].resize(lat*lat*lat*Ls);
+	rbuf[mu].resize(lat*lat*lat*Ls);
+	//	std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
+      }
 
       for(int i=0;i<Nloop;i++){
       double start=usecond();
@@ -112,7 +117,6 @@ int main (int argc, char ** argv)
 	    int comm_proc=1;
 	    int xmit_to_rank;
 	    int recv_from_rank;
-	    
 	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	    Grid.SendToRecvFromBegin(requests,
 				   (void *)&xbuf[mu][0],
@@ -172,9 +176,14 @@ int main (int argc, char ** argv)
       RealD Nnode = Grid.NodeCount();
       RealD ppn = Nrank/Nnode;
 
-      Vector<Vector<HalfSpinColourVectorD> > xbuf(8,Vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
-      Vector<Vector<HalfSpinColourVectorD> > rbuf(8,Vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+      std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);
+      std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
 
+      for(int mu=0;mu<8;mu++){
+	xbuf[mu].resize(lat*lat*lat*Ls);
+	rbuf[mu].resize(lat*lat*lat*Ls);
+	//	std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
+      }
 
       int ncomm;
       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
@@ -493,14 +502,9 @@ int main (int argc, char ** argv)
 	      int comm_proc = mpi_layout[mu]-1;
 	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	    }
-	    tbytes= Grid.StencilSendToRecvFromBegin(requests,
-						    (void *)&xbuf[dir][0],
-						    xmit_to_rank,
-						    (void *)&rbuf[dir][0],
-						    recv_from_rank,
-						    bytes,dir);
-	    Grid.StencilSendToRecvFromComplete(requests,dir);
-	    requests.resize(0);
+
+	    tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
+					       (void *)&rbuf[dir][0], recv_from_rank, bytes,dir);
 
 #pragma omp atomic
 	    dbytes+=tbytes;
diff --git a/lib/allocator/AlignedAllocator.h b/lib/allocator/AlignedAllocator.h
index 3609d8ab..c5ad0883 100644
--- a/lib/allocator/AlignedAllocator.h
+++ b/lib/allocator/AlignedAllocator.h
@@ -92,6 +92,9 @@ public:
     size_type bytes = __n*sizeof(_Tp);
 
     _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
+    //    if ( ptr != NULL ) 
+    //      std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl;
+
     //////////////////
     // Hack 2MB align; could make option probably doesn't need configurability
     //////////////////
@@ -102,6 +105,7 @@ public:
 #else
     if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
 #endif
+    //    std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl;
     // First touch optimise in threaded loop
     uint8_t *cp = (uint8_t *)ptr;
 #ifdef GRID_OMP
@@ -115,6 +119,7 @@ public:
 
   void deallocate(pointer __p, size_type __n) { 
     size_type bytes = __n * sizeof(_Tp);
+
     pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
 
 #ifdef HAVE_MM_MALLOC_H
diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc
index 3378c56a..956de0d2 100644
--- a/lib/communicator/Communicator_base.cc
+++ b/lib/communicator/Communicator_base.cc
@@ -153,8 +153,10 @@ void CartesianCommunicator::ShmInitGeneric(void){
   if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
 #endif
   ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0); 
-  if (ShmCommBuf == (void *)MAP_FAILED) exit(EXIT_FAILURE);  
-  std::cout << "ShmCommBuf "<<ShmCommBuf<<std::endl;
+  if (ShmCommBuf == (void *)MAP_FAILED) {
+    perror("mmap failed ");
+    exit(EXIT_FAILURE);  
+  }
 #else 
   ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
   ShmCommBuf=(void *)&ShmBufStorageVector[0];
diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc
index 204993fd..cb7fa390 100644
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ -221,8 +221,9 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
       if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
       assert(((uint64_t)ptr&0x3F)==0);
 
-      // Try to force numa domain on the shm segment if we have numaif.h
-#ifdef HAVE_NUMAIF_H
+// Experiments; try to force numa domain on the shm segment if we have numaif.h (currently disabled via #if 0)
+#if 0
+//#ifdef HAVE_NUMAIF_H
 	int status;
 	int flags=MPOL_MF_MOVE;
 #ifdef KNL
diff --git a/lib/communicator/Communicator_mpit.cc b/lib/communicator/Communicator_mpit.cc
index 9a9b26d2..eb6ef87d 100644
--- a/lib/communicator/Communicator_mpit.cc
+++ b/lib/communicator/Communicator_mpit.cc
@@ -242,11 +242,24 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 							 int recv_from_rank,
 							 int bytes,int dir)
 {
-  StencilSendToRecvFrom(xmit,xmit_to_rank,recv,recv_from_rank,bytes,dir);
+  int myrank = _processor;
+  int ierr;
+  assert(dir < communicator_halo.size());
+  
+  //  std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
+  // Give the CPU to MPI immediately; can use threads to overlap optionally
+  MPI_Request req[2];
+  MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
+  MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank  ,myrank        , communicator_halo[dir],&req[0]);
+
+  list.push_back(req[0]);
+  list.push_back(req[1]);
+  return 2.0*bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 { 
-  // Do nothing
+  int nreq=waitall.size();
+  MPI_Waitall(nreq, &waitall[0], MPI_STATUSES_IGNORE);
 };
 double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
 						    int xmit_to_rank,
@@ -262,7 +275,7 @@ double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
   // Give the CPU to MPI immediately; can use threads to overlap optionally
   MPI_Request req[2];
   MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
-  MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank,myrank,          communicator_halo[dir], &req[0]);
+  MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank  ,myrank        , communicator_halo[dir],&req[0]);
   MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
   return 2.0*bytes;
 }
diff --git a/lib/qcd/action/fermion/CayleyFermion5D.cc b/lib/qcd/action/fermion/CayleyFermion5D.cc
index 46ba3793..5e67d1f1 100644
--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@@ -429,7 +429,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
   
   for(int i=0;i<Ls;i++){
     bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);     
-    //    assert(fabs(bee[i])>0.0);
+    assert(fabs(bee[i])>0.0);
     cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
     beo[i]=as[i]*bs[i];
     ceo[i]=-as[i]*cs[i];
@@ -455,11 +455,17 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
     dee[i] = bee[i];
     
     if ( i < Ls-1 ) {
+
+      assert(fabs(bee[i])>0.0);
+      assert(fabs(bee[0])>0.0);
       
       lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
       
       leem[i]=mass*cee[Ls-1]/bee[0];
-      for(int j=0;j<i;j++)  leem[i]*= aee[j]/bee[j+1];
+      for(int j=0;j<i;j++) {
+	assert(fabs(bee[j+1])>0.0);
+	leem[i]*= aee[j]/bee[j+1];
+      }
       
       uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row
       
@@ -478,7 +484,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
   { 
     Coeff_t delta_d=mass*cee[Ls-1];
     for(int j=0;j<Ls-1;j++) {
-      //      assert(fabs(bee[j])>0.0);
+      assert(fabs(bee[j])>0.0);
       delta_d *= cee[j]/bee[j];
     }
     dee[Ls-1] += delta_d;
diff --git a/lib/qcd/action/fermion/WilsonCompressor.h b/lib/qcd/action/fermion/WilsonCompressor.h
index 96cbe1ec..30c6d838 100644
--- a/lib/qcd/action/fermion/WilsonCompressor.h
+++ b/lib/qcd/action/fermion/WilsonCompressor.h
@@ -238,7 +238,35 @@ template<typename HCS,typename HS,typename S> using WilsonCompressor = WilsonCom
 template<class vobj,class cobj>
 class WilsonStencil : public CartesianStencil<vobj,cobj> {
 public:
-
+  double timer0;
+  double timer1;
+  double timer2;
+  double timer3;
+  double timer4;
+  double timer5;
+  double timer6;
+  uint64_t callsi;
+  void ZeroCountersi(void)
+  {
+    std::cout << GridLogMessage << " ZeroCountersi()"<<std::endl;
+    timer0=0;
+    timer1=0;
+    timer2=0;
+    timer3=0;
+    timer4=0;
+    timer5=0;
+    timer6=0;
+    callsi=0;
+  }
+  void Reporti(int calls)
+  {
+    std::cout << GridLogMessage << " Reporti() calls " <<callsi << calls<<std::endl;
+    if ( timer0 ) std::cout << GridLogMessage << " timer0 (HaloGatherOpt) " <<timer0/calls <<std::endl;
+    if ( timer1 ) std::cout << GridLogMessage << " timer1 (Communicate)   " <<timer1/calls <<std::endl;
+    if ( timer2 ) std::cout << GridLogMessage << " timer2 (CommsMerge )   " <<timer2/calls <<std::endl;
+    if ( timer3 ) std::cout << GridLogMessage << " timer3 (commsMergeShm) " <<timer3/calls <<std::endl;
+    if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
+  }
   typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
 
   std::vector<int> same_node;
@@ -252,6 +280,7 @@ public:
     : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) ,
     same_node(npoints)
   { 
+    ZeroCountersi();
     surface_list.resize(0);
   };
 
@@ -282,17 +311,25 @@ public:
   {
     std::vector<std::vector<CommsRequest_t> > reqs;
     this->HaloExchangeOptGather(source,compress);
+    double t1=usecond();
     this->CommunicateBegin(reqs);
     this->CommunicateComplete(reqs);
+    double t2=usecond(); timer1 += t2-t1;
     this->CommsMerge(compress);
+    double t3=usecond(); timer2 += t3-t2;
     this->CommsMergeSHM(compress);
+    double t4=usecond(); timer3 += t4-t3;
   }
   
   template <class compressor>
   void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress) 
   {
     this->Prepare();
+    double t0=usecond();
     this->HaloGatherOpt(source,compress);
+    double t1=usecond();
+    timer0 += t1-t0;
+    callsi++;
   }
 
   template <class compressor>
@@ -304,7 +341,9 @@ public:
     typedef typename compressor::SiteHalfSpinor     SiteHalfSpinor;
     typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor;
 
+    this->mpi3synctime_g-=usecond();
     this->_grid->StencilBarrier();
+    this->mpi3synctime_g+=usecond();
 
     assert(source._grid==this->_grid);
     this->halogtime-=usecond();
diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc
index 404ecce0..c5b0f872 100644
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -185,6 +185,11 @@ void WilsonFermion5D<Impl>::Report(void)
     std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl;  StencilEven.Report();
     std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
   }
+  if ( DhopCalls > 0){
+    std::cout << GridLogMessage << "WilsonFermion5D Stencil     Reporti()"    <<std::endl;  Stencil.Reporti(DhopCalls);
+    std::cout << GridLogMessage << "WilsonFermion5D StencilEven Reporti()"<<std::endl;  StencilEven.Reporti(DhopCalls);
+    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd  Reporti()" <<std::endl;  StencilOdd.Reporti(DhopCalls);
+  }
 }
 
 template<class Impl>
@@ -204,6 +209,9 @@ void WilsonFermion5D<Impl>::ZeroCounters(void) {
   Stencil.ZeroCounters();
   StencilEven.ZeroCounters();
   StencilOdd.ZeroCounters();
+  Stencil.ZeroCountersi();
+  StencilEven.ZeroCountersi();
+  StencilOdd.ZeroCountersi();
 }
 
 
@@ -445,6 +453,9 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
   DhopCommTime += ctime;
   DhopComputeTime+=ptime;
 
+  // First to enter, last to leave timing
+  st.CollateThreads();
+
   DhopFaceTime-=usecond();
   st.CommsMerge(compressor);
   DhopFaceTime+=usecond();
diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h
index cca67587..ad454bcb 100644
--- a/lib/stencil/Stencil.h
+++ b/lib/stencil/Stencil.h
@@ -176,6 +176,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
   // Timing info; ugly; possibly temporary
   /////////////////////////////////////////
   double commtime;
+  double mpi3synctime;
+  double mpi3synctime_g;
+  double shmmergetime;
   double gathertime;
   double gathermtime;
   double halogtime;
@@ -185,8 +188,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
   double splicetime;
   double nosplicetime;
   double calls;
-  std::vector<double> comms_bytesthr;
-  std::vector<double> commtimethr;
+  std::vector<double> comm_bytes_thr;
+  std::vector<double> comm_time_thr;
+  std::vector<double> comm_enter_thr;
+  std::vector<double> comm_leave_thr;
 
   ////////////////////////////////////////
   // Stencil query
@@ -262,18 +267,45 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 #endif
     if (nthreads == -1) nthreads = 1;
     if (mythread < nthreads) {
+      comm_enter_thr[mythread] = usecond();
       for (int i = mythread; i < Packets.size(); i += nthreads) {
-	double start = usecond();
 	uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf,
 						      Packets[i].to_rank,
 						      Packets[i].recv_buf,
 						      Packets[i].from_rank,
 						      Packets[i].bytes,i);
-	comms_bytesthr[mythread] += bytes;
-	commtimethr[mythread] += usecond() - start;
+	comm_bytes_thr[mythread] += bytes;
       }
+      comm_leave_thr[mythread]= usecond();
+      comm_time_thr[mythread] += comm_leave_thr[mythread] - comm_enter_thr[mythread];
     }
   }
+  
+  void CollateThreads(void) // Fold the per-thread comms stamps and byte counts into commtime / comms_bytes, then clear them.
+  {
+    int nthreads = CartesianCommunicator::nCommThreads; // NOTE(review): not clamped when nCommThreads == -1 (Communicate() maps -1 to 1); the loop is then skipped - confirm intended
+    double first=0.0;
+    double last =0.0;
+    // first/last end up as the earliest nonzero enter stamp (0 if none) and the latest leave stamp over all threads
+    for(int t=0;t<nthreads;t++) {
+      // read this thread's stamps and add its bytes before the slots are cleared
+      double t0 = comm_enter_thr[t];
+      double t1 = comm_leave_thr[t];
+      comms_bytes+=comm_bytes_thr[t];
+      // clear the per-thread slots so the next collation starts from zero
+      comm_enter_thr[t] = 0.0;
+      comm_leave_thr[t] = 0.0;
+      comm_time_thr[t]   = 0.0;
+      comm_bytes_thr[t]=0;
+      // a stamp of 0.0 is treated as "not set" when searching for the minimum
+      if ( first == 0.0 ) first = t0;                   // first is t0
+      if ( (t0 > 0.0) && ( t0 < first ) ) first = t0;   // min time seen
+
+      if ( t1 > last ) last = t1;                       // max time seen
+      
+    }
+    commtime+= last-first; // span from first thread entering to last thread leaving
+  }
   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
   {
     reqs.resize(Packets.size());
@@ -295,14 +327,48 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
     }
     commtime+=usecond();
   }
+  void Communicate(void) // Exchange every Packets[] entry through StencilSendToRecvFrom, recording bytes and time per thread.
+  {
+#ifdef GRID_OMP
+#pragma omp parallel 
+    {
+      // must be called in parallel region
+      int mythread  = omp_get_thread_num();
+      int maxthreads= omp_get_max_threads();
+      int nthreads = CartesianCommunicator::nCommThreads;
+      assert(nthreads <= maxthreads);
+      // nCommThreads == -1 is treated as a single comms thread
+      if (nthreads == -1) nthreads = 1;
+#else
+      int mythread = 0;
+      int nthreads = 1;
+#endif
+      if (mythread < nthreads) { // NOTE(review): the *_thr vectors are sized npoints by the constructor but indexed by thread id - confirm nCommThreads <= npoints
+	for (int i = mythread; i < Packets.size(); i += nthreads) { // thread t handles packets t, t+nthreads, ...
+	  double start = usecond();
+	  comm_bytes_thr[mythread] += _grid->StencilSendToRecvFrom(Packets[i].send_buf,
+								   Packets[i].to_rank,
+								   Packets[i].recv_buf,
+								   Packets[i].from_rank,
+								   Packets[i].bytes,i);
+	  comm_time_thr[mythread] += usecond() - start; // usecond() delta around this one packet
+	}
+      }
+#ifdef GRID_OMP
+    }
+#endif
+  }
   
   template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress) 
   {
     std::vector<std::vector<CommsRequest_t> > reqs;
     Prepare();
     HaloGather(source,compress);
+    // Concurrent
     CommunicateBegin(reqs);
     CommunicateComplete(reqs);
+    // Sequential
+    // Communicate();
     CommsMergeSHM(compress); 
     CommsMerge(compress); 
   }
@@ -363,7 +429,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
   template<class compressor>
   void HaloGather(const Lattice<vobj> &source,compressor &compress)
   {
+    mpi3synctime_g-=usecond();
     _grid->StencilBarrier();// Synch shared memory on a single nodes
+    mpi3synctime_g+=usecond();
 
     // conformable(source._grid,_grid);
     assert(source._grid==_grid);
@@ -423,8 +491,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
     CommsMerge(decompress,Mergers,Decompressions); 
   }
   template<class decompressor>  void CommsMergeSHM(decompressor decompress) {
+    mpi3synctime-=usecond();    
     _grid->StencilBarrier();// Synch shared memory on a single nodes
+    mpi3synctime+=usecond();    
+    shmmergetime-=usecond();    
     CommsMerge(decompress,MergersSHM,DecompressionsSHM);
+    shmmergetime+=usecond();    
   }
 
   template<class decompressor>
@@ -470,8 +542,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 		  const std::vector<int> &distances) 
    : _permute_type(npoints), 
     _comm_buf_size(npoints),
-    comms_bytesthr(npoints), 
-       commtimethr(npoints)
+    comm_bytes_thr(npoints), 
+    comm_enter_thr(npoints),
+    comm_leave_thr(npoints), 
+       comm_time_thr(npoints)
   {
     face_table_computed=0;
     _npoints = npoints;
@@ -1025,8 +1099,15 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
   void ZeroCounters(void) {
     gathertime = 0.;
     commtime = 0.;
-    memset(&commtimethr[0], 0, sizeof(commtimethr));
-    memset(&comms_bytesthr[0], 0, sizeof(comms_bytesthr));
+    mpi3synctime=0.;
+    mpi3synctime_g=0.;
+    shmmergetime=0.;
+    for(int i=0;i<_npoints;i++){
+      comm_time_thr[i]=0;
+      comm_bytes_thr[i]=0;
+      comm_enter_thr[i]=0;
+      comm_leave_thr[i]=0;
+    }
     halogtime = 0.;
     mergetime = 0.;
     decompresstime = 0.;
@@ -1043,13 +1124,17 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
     RealD NP = _grid->_Nprocessors;
     RealD NN = _grid->NodeCount();
     double t = 0;
-    // if commtimethr is set they were all done in parallel so take the max
+    // if comm_time_thr is set they were all done in parallel so take the max
     // but add up the bytes
+    int threaded = 0 ;
     for (int i = 0; i < 8; ++i) {
-      comms_bytes += comms_bytesthr[i];
-      if (t < commtimethr[i]) t = commtimethr[i];
+      if ( comm_time_thr[i]>0.0 ) {
+	threaded = 1;
+	comms_bytes += comm_bytes_thr[i];
+	if (t < comm_time_thr[i]) t = comm_time_thr[i];
+      }
     }
-    commtime += t;
+    if (threaded) commtime += t;
     
     _grid->GlobalSum(commtime);    commtime/=NP;
     if ( calls > 0. ) {
@@ -1065,6 +1150,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 	std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl;
 	std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl;
       }
+      PRINTIT(mpi3synctime);
+      PRINTIT(mpi3synctime_g);
+      PRINTIT(shmmergetime);
       PRINTIT(splicetime);
       PRINTIT(nosplicetime);
     }

From c289699d9a4a064a3770b6fca6c4ecf6c55f3553 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 25 Aug 2017 11:41:01 +0100
Subject: [PATCH 171/177] updated from cambridge mpi3 shakeout

---
 benchmarks/Benchmark_ITT.cc               | 4 ++--
 lib/qcd/action/fermion/WilsonCompressor.h | 7 +++++--
 lib/stencil/Stencil.h                     | 8 ++++----
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc
index c5226ee1..bd75dd8e 100644
--- a/benchmarks/Benchmark_ITT.cc
+++ b/benchmarks/Benchmark_ITT.cc
@@ -108,7 +108,7 @@ public:
 
   static void Comms(void)
   {
-    int Nloop=1000;
+    int Nloop=200;
     int nmu=0;
     int maxlat=32;
 
@@ -197,7 +197,7 @@ public:
 
 	timestat.statistics(t_time);
 	//	for(int i=0;i<t_time.size();i++){
-	  //	  std::cout << i<<" "<<t_time[i]<<std::endl;
+	//	  std::cout << i<<" "<<t_time[i]<<std::endl;
 	//	}
 
 	dbytes=dbytes*ppn;
diff --git a/lib/qcd/action/fermion/WilsonCompressor.h b/lib/qcd/action/fermion/WilsonCompressor.h
index 30c6d838..406476b0 100644
--- a/lib/qcd/action/fermion/WilsonCompressor.h
+++ b/lib/qcd/action/fermion/WilsonCompressor.h
@@ -312,8 +312,11 @@ public:
     std::vector<std::vector<CommsRequest_t> > reqs;
     this->HaloExchangeOptGather(source,compress);
     double t1=usecond();
-    this->CommunicateBegin(reqs);
-    this->CommunicateComplete(reqs);
+    // Asynchronous MPI calls multidirectional, Isend etc...
+    //    this->CommunicateBegin(reqs);
+    //    this->CommunicateComplete(reqs);
+    // Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways.
+    this->Communicate();
     double t2=usecond(); timer1 += t2-t1;
     this->CommsMerge(compress);
     double t3=usecond(); timer2 += t3-t2;
diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h
index ad454bcb..cd0792d5 100644
--- a/lib/stencil/Stencil.h
+++ b/lib/stencil/Stencil.h
@@ -365,10 +365,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
     Prepare();
     HaloGather(source,compress);
     // Concurrent
-    CommunicateBegin(reqs);
-    CommunicateComplete(reqs);
-    // Sequential
-    // Communicate();
+    //CommunicateBegin(reqs);
+    //CommunicateComplete(reqs);
+    // Sequential, possibly threaded
+    Communicate();
     CommsMergeSHM(compress); 
     CommsMerge(compress); 
   }

From 3a582174053732f4e5645367b750fd446d8fcb1d Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 25 Aug 2017 14:29:53 +0100
Subject: [PATCH 172/177] Updated

---
 benchmarks/Benchmark_ITT.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc
index bd75dd8e..2edae8d0 100644
--- a/benchmarks/Benchmark_ITT.cc
+++ b/benchmarks/Benchmark_ITT.cc
@@ -386,7 +386,7 @@ public:
 	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 
-	int nwarm = 200;
+	int nwarm = 100;
 	double t0=usecond();
 	sFGrid->Barrier();
 	for(int i=0;i<nwarm;i++){
@@ -396,7 +396,7 @@ public:
 	double t1=usecond();
 	//	uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
 	//	if (ncall < 500) ncall = 500;
-	uint64_t ncall = 1000;
+	uint64_t ncall = 500;
 
 	sFGrid->Broadcast(0,&ncall,sizeof(ncall));
 

From d0f3d525d5dfb6cd7a2f5fe3be5a69c7ddc1306e Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 25 Aug 2017 19:33:54 +0100
Subject: [PATCH 173/177] Optimal block size for KNL

---
 benchmarks/Benchmark_ITT.cc | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc
index 2edae8d0..c0ce451f 100644
--- a/benchmarks/Benchmark_ITT.cc
+++ b/benchmarks/Benchmark_ITT.cc
@@ -679,8 +679,11 @@ int main (int argc, char ** argv)
   Grid_init(&argc,&argv);
 
   CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
+#ifdef KNL
+  LebesgueOrder::Block = std::vector<int>({8,2,2,2});
+#else
   LebesgueOrder::Block = std::vector<int>({2,2,2,2});
-
+#endif
   Benchmark::Decomposition();
 
   int do_memory=1;

From f68b5de9c8798779ef2657b9c2d469174ae8f53a Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Fri, 25 Aug 2017 19:35:21 +0100
Subject: [PATCH 174/177] No compile fix on Clang

---
 lib/qcd/action/fermion/CayleyFermion5D.cc | 12 ++++++------
 lib/qcd/action/fermion/WilsonCompressor.h |  4 ----
 lib/qcd/action/fermion/WilsonFermion5D.cc |  5 +++--
 3 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/lib/qcd/action/fermion/CayleyFermion5D.cc b/lib/qcd/action/fermion/CayleyFermion5D.cc
index 5e67d1f1..838b1c3d 100644
--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@@ -414,7 +414,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
   for(int i=0; i < Ls; i++){
     as[i] = 1.0;
     omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code
-    //    assert(fabs(omega[i])>0.0);
+    assert(omega[i]!=Coeff_t(0.0));
     bs[i] = 0.5*(bpc/omega[i] + bmc);
     cs[i] = 0.5*(bpc/omega[i] - bmc);
   }
@@ -429,7 +429,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
   
   for(int i=0;i<Ls;i++){
     bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);     
-    assert(fabs(bee[i])>0.0);
+    assert(bee[i]!=Coeff_t(0.0));
     cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
     beo[i]=as[i]*bs[i];
     ceo[i]=-as[i]*cs[i];
@@ -456,14 +456,14 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
     
     if ( i < Ls-1 ) {
 
-      assert(fabs(bee[i])>0.0);
-      assert(fabs(bee[0])>0.0);
+      assert(bee[i]!=Coeff_t(0.0));
+      assert(bee[0]!=Coeff_t(0.0));
       
       lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
       
       leem[i]=mass*cee[Ls-1]/bee[0];
       for(int j=0;j<i;j++) {
-	assert(fabs(bee[j+1])>0.0);
+	assert(bee[j+1]!=Coeff_t(0.0));
 	leem[i]*= aee[j]/bee[j+1];
       }
       
@@ -484,7 +484,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
   { 
     Coeff_t delta_d=mass*cee[Ls-1];
     for(int j=0;j<Ls-1;j++) {
-      assert(fabs(bee[j])>0.0);
+      assert(bee[j] != Coeff_t(0.0));
       delta_d *= cee[j]/bee[j];
     }
     dee[Ls-1] += delta_d;
diff --git a/lib/qcd/action/fermion/WilsonCompressor.h b/lib/qcd/action/fermion/WilsonCompressor.h
index 406476b0..cc5c3c63 100644
--- a/lib/qcd/action/fermion/WilsonCompressor.h
+++ b/lib/qcd/action/fermion/WilsonCompressor.h
@@ -248,7 +248,6 @@ public:
   uint64_t callsi;
   void ZeroCountersi(void)
   {
-    std::cout << GridLogMessage << " ZeroCountersi()"<<std::endl;
     timer0=0;
     timer1=0;
     timer2=0;
@@ -260,7 +259,6 @@ public:
   }
   void Reporti(int calls)
   {
-    std::cout << GridLogMessage << " Reporti() calls " <<callsi << calls<<std::endl;
     if ( timer0 ) std::cout << GridLogMessage << " timer0 (HaloGatherOpt) " <<timer0/calls <<std::endl;
     if ( timer1 ) std::cout << GridLogMessage << " timer1 (Communicate)   " <<timer1/calls <<std::endl;
     if ( timer2 ) std::cout << GridLogMessage << " timer2 (CommsMerge )   " <<timer2/calls <<std::endl;
@@ -290,7 +288,6 @@ public:
     // Here we know the distance is 1 for WilsonStencil
     for(int point=0;point<this->_npoints;point++){
       same_node[point] = this->SameNode(point);
-      //      std::cout << " dir " <<point<<" same_node " <<same_node[point]<<std::endl;
     }
     
     for(int site = 0 ;site< vol4;site++){
@@ -365,7 +362,6 @@ public:
     int dag = compress.dag;
     int face_idx=0;
     if ( dag ) { 
-      //	std::cout << " Optimised Dagger compress " <<std::endl;
       assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
       assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
       assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc
index c5b0f872..1da58ddb 100644
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -123,12 +123,13 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
   int vol4;
   vol4=FourDimGrid.oSites();
   Stencil.BuildSurfaceList(LLs,vol4);
+
   vol4=FourDimRedBlackGrid.oSites();
   StencilEven.BuildSurfaceList(LLs,vol4);
    StencilOdd.BuildSurfaceList(LLs,vol4);
 
-  std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
-                       <<" " << StencilEven.surface_list.size()<<std::endl;
+   //  std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
+   //                       <<" " << StencilEven.surface_list.size()<<std::endl;
 
 }
      

From ad89abb018274ae29b372311e674070436e8bf9b Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Fri, 25 Aug 2017 20:43:37 +0100
Subject: [PATCH 175/177] Fix

---
 benchmarks/Benchmark_ITT.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc
index 58fdb84a..c0ce451f 100644
--- a/benchmarks/Benchmark_ITT.cc
+++ b/benchmarks/Benchmark_ITT.cc
@@ -181,6 +181,7 @@ public:
 	  
 #ifdef GRID_OMP
 #pragma omp atomic
+#endif
 	      ncomm++;
 
 #ifdef GRID_OMP

From 54a5e6c1d0ec1cf1b66dac5ba407db49bc7e1016 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Fri, 25 Aug 2017 22:36:08 +0100
Subject: [PATCH 176/177] Check if we get huge pages on linux. Larry Meadows
 piece of magic.

---
 lib/allocator/AlignedAllocator.cc | 33 +++++++++++++++++++++++++++++++
 lib/allocator/AlignedAllocator.h  |  2 ++
 2 files changed, 35 insertions(+)

diff --git a/lib/allocator/AlignedAllocator.cc b/lib/allocator/AlignedAllocator.cc
index 04de20bf..764bd732 100644
--- a/lib/allocator/AlignedAllocator.cc
+++ b/lib/allocator/AlignedAllocator.cc
@@ -63,4 +63,37 @@ void *PointerCache::Lookup(size_t bytes) {
   return NULL;
 }
 
+
+// Linux only: print how many 4k pages of [Buf,Buf+BYTES) are not physically contiguous within 512-page (2MB) groups.
+// NOTE(review): newer kernels may hide PFNs in pagemap from unprivileged readers, which would skew the count - confirm.
+void check_huge_pages(void *Buf,uint64_t BYTES)
+{
+#ifdef __linux__
+  int fd = open("/proc/self/pagemap", O_RDONLY);
+  assert(fd >= 0);
+  const int page_size = 4096;
+  uint64_t virt_pfn = (uint64_t)Buf / page_size;
+  off_t offset = sizeof(uint64_t) * virt_pfn; // pagemap holds one 64-bit entry per virtual page
+  uint64_t npages = (BYTES + page_size-1) / page_size;
+  std::vector<uint64_t> pagedata(npages); // heap storage: npages is caller-sized and a stack VLA is non-standard C++
+  off_t pos = lseek(fd, offset, SEEK_SET);
+  assert(pos == offset);
+  ssize_t ret = ::read(fd, pagedata.data(), sizeof(uint64_t)*npages);
+  assert(ret == (ssize_t)(sizeof(uint64_t) * npages));
+  ::close(fd); // the descriptor used to be leaked on every call
+  int nhugepages = npages / 512;
+  int n4ktotal = 0, nnothuge = 0;
+  for (int i = 0; i < nhugepages; ++i) {
+    uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size; // mask keeps the low 55 bits of the entry
+    for (int j = 0; j < 512; ++j) {
+      uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
+      ++n4ktotal;
+      if (pageaddr != baseaddr + j * page_size) ++nnothuge; // page j is not at base + j*4k
+    }
+  }
+  int rank = CartesianCommunicator::RankWorld();
+  printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
+#endif
+}
+
 }
diff --git a/lib/allocator/AlignedAllocator.h b/lib/allocator/AlignedAllocator.h
index c5ad0883..e64a5949 100644
--- a/lib/allocator/AlignedAllocator.h
+++ b/lib/allocator/AlignedAllocator.h
@@ -64,6 +64,8 @@ namespace Grid {
 
   };
 
+  void check_huge_pages(void *Buf,uint64_t BYTES);
+
 ////////////////////////////////////////////////////////////////////
 // A lattice of something, but assume the something is SIMDized.
 ////////////////////////////////////////////////////////////////////

From 4b4c2a715b319bcc7060ef9ae8aa983c49471167 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sat, 26 Aug 2017 11:38:04 +0100
Subject: [PATCH 177/177] fcntl.h needed

---
 lib/allocator/AlignedAllocator.cc | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/lib/allocator/AlignedAllocator.cc b/lib/allocator/AlignedAllocator.cc
index 764bd732..967b2571 100644
--- a/lib/allocator/AlignedAllocator.cc
+++ b/lib/allocator/AlignedAllocator.cc
@@ -1,7 +1,5 @@
-
-
-
 #include <Grid/GridCore.h>
+#include <fcntl.h>
 
 namespace Grid {