From 037bb6ea7397fcedddcd94515786849351e4d507 Mon Sep 17 00:00:00 2001
From: Daniel Richtmann <daniel.richtmann@gmail.com>
Date: Mon, 16 Mar 2020 14:07:52 +0100
Subject: [PATCH 01/51] Check in reader for openqcd configs

This reader is suboptimal in the sense that it opens the entire config on every MPI rank.
---
 Grid/parallelIO/OpenQcdIO.h  | 153 +++++++++++++++++++++++++++++++++++
 Grid/qcd/hmc/HMC_aggregate.h |   1 +
 tests/IO/Test_openqcd_io.cc  |  55 +++++++++++++
 3 files changed, 209 insertions(+)
 create mode 100644 Grid/parallelIO/OpenQcdIO.h
 create mode 100644 tests/IO/Test_openqcd_io.cc
diff --git a/Grid/parallelIO/OpenQcdIO.h b/Grid/parallelIO/OpenQcdIO.h
new file mode 100644
index 00000000..f340e8fc
--- /dev/null
+++ b/Grid/parallelIO/OpenQcdIO.h
@@ -0,0 +1,153 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/parallelIO/OpenQcdIO.h
+
+Copyright (C) 2015 - 2020
+
+Author: Daniel Richtmann <daniel.richtmann@ur.de>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+struct OpenQcdHeader : Serializable { // header record; readHeader() fills it with the first sizeof(OpenQcdHeader) bytes of the file
+  GRID_SERIALIZABLE_CLASS_MEMBERS(OpenQcdHeader,
+                                  int,    Nt,    // lattice extent in t
+                                  int,    Nx,    // lattice extent in x
+                                  int,    Ny,    // lattice extent in y
+                                  int,    Nz,    // lattice extent in z
+                                  double, plaq); // plaquette as stored in the file; readHeader() divides it by normalisationFactor
+};
+
+class OpenQcdIO : public BinaryIO {
+public:
+  static constexpr double normalisationFactor = Nc; // normalisation difference: grid 18, openqcd 6
+
+  static inline int readHeader(std::string file, GridBase* grid, FieldMetaData& field) { // fills field from the file header; returns the offset where the link data starts
+    OpenQcdHeader header;
+    // the header is read as sizeof(OpenQcdHeader) raw bytes from the start of the file
+    {
+      std::ifstream fin(file, std::ios::in | std::ios::binary);
+      fin.read(reinterpret_cast<char*>(&header), sizeof(OpenQcdHeader));
+      assert(!fin.fail()); // also fires when the file could not be opened
+      field.data_start = fin.tellg(); // stream position right behind the header
+      fin.close();
+    }
+
+    header.plaq /= normalisationFactor; // bring the stored plaquette to Grid's normalisation
+
+    // sanity check (should trigger on endian issues)
+    assert(0 < header.Nt && header.Nt <= 1024);
+    assert(0 < header.Nx && header.Nx <= 1024);
+    assert(0 < header.Ny && header.Ny <= 1024);
+    assert(0 < header.Nz && header.Nz <= 1024);
+    // store the extents in x,y,z,t order and require the target grid to have the same shape
+    field.dimension[0] = header.Nx;
+    field.dimension[1] = header.Ny;
+    field.dimension[2] = header.Nz;
+    field.dimension[3] = header.Nt;
+
+    assert(grid->_ndimension == Nd);
+    for(int d = 0; d < Nd; d++)
+      assert(grid->_fdimensions[d] == field.dimension[d]);
+
+    field.plaquette = header.plaq;
+
+    return field.data_start; // NOTE(review): narrowed to int here although readConfiguration stores the result in a uint64_t
+  }
+
+  template<class vsimd>
+  static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
+                                       FieldMetaData&                        header,
+                                       std::string                           file) {
+    auto grid = dynamic_cast<GridCartesian*>(Umu.Grid());
+    assert(grid != nullptr);
+    assert((grid->_ndimension == Nd) && (Nd == 4));
+
+    uint64_t offset = readHeader(file, Umu.Grid(), header);
+    FieldMetaData clone(header);
+
+    // NOTE: This version is suboptimal because we read in the full file on every rank
+    std::vector<ColourMatrix> data(grid->gSites() * 4);
+    {
+      auto fin = std::fstream(file, std::ios::in | std::ios::binary);
+      fin.seekg(offset);
+      fin.read((char *)data.data(), data.size() * sizeof(ColourMatrix));
+      fin.close();
+    }
+
+    // global lattice size
+    Coordinate fdim = grid->FullDimensions();
+
+    // coordinate of this process
+    Coordinate pcoor;
+    grid->ProcessorCoorFromRank(CartesianCommunicator::RankWorld(), pcoor);
+
+    // loop over local indices
+    thread_for(idx, grid->lSites(), {
+      // convert local index to global coordinate
+      Coordinate lcoor, gcoor;
+      grid->LocalIndexToLocalCoor(idx, lcoor);
+      grid->ProcessorCoorLocalCoorToGlobalCoor(pcoor, lcoor, gcoor);
+
+      // openQCD stores links attached to odd sites
+      bool neg = (gcoor[Xdir] + gcoor[Ydir] + gcoor[Zdir] + gcoor[Tdir]) % 2 != 1;
+
+      LorentzColourMatrix site_data;
+      for (int mu = 0; mu < 4; ++mu) {
+        // determine the site at which it is stored
+        Coordinate c = gcoor;
+        if (neg)
+          c[mu] = (c[mu] + 1) % grid->FullDimensions()[mu];
+
+        // site-index in the OpenQCD format (which uses t,x,y,z order)
+        int openqcd_idx = (c[Tdir] * fdim[Xdir] * fdim[Ydir] * fdim[Zdir]
+                        +  c[Xdir] * fdim[Ydir] * fdim[Zdir]
+                        +  c[Ydir] * fdim[Zdir]
+                        +  c[Zdir])/2;
+        int openqcd_mu = (mu + 1) % 4;
+
+        // pick the colour-matrix out
+        site_data(mu) =
+            data[8 * openqcd_idx + 2 * openqcd_mu + (neg ? 1 : 0)]();
+      }
+
+      pokeLocalSite(site_data, Umu, lcoor);
+    });
+
+    GaugeStatistics(Umu, clone);
+
+    std::cout << GridLogMessage << "OpenQcd Configuration " << file << " plaquette "
+              << std::setprecision(15)
+              << clone.plaquette << " header " << header.plaquette
+              << " difference " << fabs(clone.plaquette - header.plaquette)
+              << std::endl;
+
+    if(fabs(clone.plaquette - header.plaquette) >= 1.0e-5) std::cout << " Plaquette mismatch " << std::endl;
+    assert(fabs(clone.plaquette - header.plaquette) < 1.0e-5);
+
+    std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl;
+  }
+};
+
+NAMESPACE_END(Grid);
diff --git a/Grid/qcd/hmc/HMC_aggregate.h b/Grid/qcd/hmc/HMC_aggregate.h
index e4d2ce83..94c745e1 100644
--- a/Grid/qcd/hmc/HMC_aggregate.h
+++ b/Grid/qcd/hmc/HMC_aggregate.h
@@ -39,6 +39,7 @@ directory
 #include <Grid/parallelIO/IldgIOtypes.h>
 #include <Grid/parallelIO/IldgIO.h>
 #include <Grid/parallelIO/NerscIO.h>
+#include <Grid/parallelIO/OpenQcdIO.h>
 NAMESPACE_CHECK(Ildg);
 
 #include <Grid/qcd/hmc/checkpointers/CheckPointers.h>
diff --git a/tests/IO/Test_openqcd_io.cc b/tests/IO/Test_openqcd_io.cc
new file mode 100644
index 00000000..2a5769bd
--- /dev/null
+++ b/tests/IO/Test_openqcd_io.cc
@@ -0,0 +1,55 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./tests/io/Test_openqcd_io.cc
+
+Copyright (C) 2015 - 2020
+
+Author: Daniel Richtmann <daniel.richtmann@ur.de>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/Grid.h>
+
+using namespace Grid;
+
+int main(int argc, char** argv) {
+  Grid_init(&argc, &argv);
+
+  GridCartesian* grid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
+                                                       GridDefaultSimd(Nd, vComplexD::Nsimd()),
+                                                       GridDefaultMpi());
+
+  LatticeGaugeField Umu(grid);
+
+  FieldMetaData header;
+
+  if(!Grid::GridCmdOptionExists(argv, argv + argc, "--config")) {
+    std::cout << GridLogError << "You need to use --config /path/to/openqcd_config" << std::endl;
+    abort();
+  }
+
+  std::string file = Grid::GridCmdOptionPayload(argv, argv + argc, "--config");
+  assert(!file.empty());
+
+  OpenQcdIO::readConfiguration(Umu, header, file);
+
+  Grid_finalize();
+}

From c9b737a4e7bb62cf6d22dace3458b108043e5023 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Mon, 16 Mar 2020 17:58:30 -0400
Subject: [PATCH 02/51] make trace,adj,transpose unary operators

---
 Grid/lattice/Lattice.h                            |  4 ++--
 Grid/lattice/Lattice_trace.h                      |  2 ++
 .../WilsonCloverFermionImplementation.h           |  8 ++++----
 Grid/qcd/action/gauge/GaugeImplementations.h      |  2 +-
 Grid/qcd/utils/WilsonLoops.h                      |  4 ++--
 bootstrap.sh                                      | 15 ++++++++-------
 6 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/Grid/lattice/Lattice.h b/Grid/lattice/Lattice.h
index 1eea98ed..6eac0362 100644
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@@ -31,11 +31,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_ET.h>
 #include <Grid/lattice/Lattice_arith.h>
 #include <Grid/lattice/Lattice_trace.h>
-#include <Grid/lattice/Lattice_transpose.h>
+//#include <Grid/lattice/Lattice_transpose.h>
 #include <Grid/lattice/Lattice_local.h>
 #include <Grid/lattice/Lattice_reduction.h>
 #include <Grid/lattice/Lattice_peekpoke.h>
-#include <Grid/lattice/Lattice_reality.h>
+//#include <Grid/lattice/Lattice_reality.h>
 #include <Grid/lattice/Lattice_comparison_utils.h>
 #include <Grid/lattice/Lattice_comparison.h>
 #include <Grid/lattice/Lattice_coordinate.h>
diff --git a/Grid/lattice/Lattice_trace.h b/Grid/lattice/Lattice_trace.h
index 93444e0c..6b5f67d2 100644
--- a/Grid/lattice/Lattice_trace.h
+++ b/Grid/lattice/Lattice_trace.h
@@ -37,6 +37,7 @@ NAMESPACE_BEGIN(Grid);
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Trace
 ////////////////////////////////////////////////////////////////////////////////////////////////////
+/*
 template<class vobj>
 inline auto trace(const Lattice<vobj> &lhs)  -> Lattice<decltype(trace(vobj()))>
 {
@@ -48,6 +49,7 @@ inline auto trace(const Lattice<vobj> &lhs)  -> Lattice<decltype(trace(vobj()))>
   });
   return ret;
 };
+*/
     
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Trace Index level dependent operation
diff --git a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
index 5744d3bb..9d99d9e7 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
@@ -132,14 +132,14 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
   pickCheckerboard(Even, CloverTermEven, CloverTerm);
   pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
 
-  pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
-  pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
+  pickCheckerboard(Even, CloverTermDagEven, closure(adj(CloverTerm)));
+  pickCheckerboard(Odd, CloverTermDagOdd, closure(adj(CloverTerm)));
 
   pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
   pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
 
-  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
-  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
+  pickCheckerboard(Even, CloverTermInvDagEven, closure(adj(CloverTermInv)));
+  pickCheckerboard(Odd, CloverTermInvDagOdd, closure(adj(CloverTermInv)));
 }
 
 template <class Impl>
diff --git a/Grid/qcd/action/gauge/GaugeImplementations.h b/Grid/qcd/action/gauge/GaugeImplementations.h
index a14aec1b..19bc5aa6 100644
--- a/Grid/qcd/action/gauge/GaugeImplementations.h
+++ b/Grid/qcd/action/gauge/GaugeImplementations.h
@@ -59,7 +59,7 @@ public:
   }
   static inline GaugeLinkField
   CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
-    return Cshift(adj(Link), mu, -1);
+    return Cshift(closure(adj(Link)), mu, -1);
   }
   static inline GaugeLinkField
   CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
diff --git a/Grid/qcd/utils/WilsonLoops.h b/Grid/qcd/utils/WilsonLoops.h
index 0367c9fa..fdd53698 100644
--- a/Grid/qcd/utils/WilsonLoops.h
+++ b/Grid/qcd/utils/WilsonLoops.h
@@ -485,7 +485,7 @@ public:
 
         // Up staple    ___ ___
         //             |       |
-        tmp = Cshift(adj(U[nu]), nu, -1);
+        tmp = Cshift(closure(adj(U[nu])), nu, -1);
         tmp = adj(U2[mu]) * tmp;
         tmp = Cshift(tmp, mu, -2);
 
@@ -519,7 +519,7 @@ public:
         //
         //      |  |
 
-        tmp = Cshift(adj(U2[nu]), nu, -2);
+        tmp = Cshift(closure(adj(U2[nu])), nu, -2);
         tmp = Gimpl::CovShiftBackward(U[mu], mu, tmp);
         tmp = U2[nu] * Cshift(tmp, nu, 2);
         Stap += Cshift(tmp, mu, 1);
diff --git a/bootstrap.sh b/bootstrap.sh
index 4bd3de5e..5ea7ce32 100755
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -1,19 +1,20 @@
 #!/usr/bin/env bash
 set -e
 
-EIGEN_URL='https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.tar.bz2'
+#https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.tar.bz2
+EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.3.7.tar.bz2'
 EIGEN_SHA256SUM='685adf14bd8e9c015b78097c1dc22f2f01343756f196acdc76a678e1ae352e11'
 
 
 echo "-- deploying Eigen source..."
 ARC=`basename ${EIGEN_URL}`
 wget ${EIGEN_URL} --no-check-certificate
-if command -v sha256sum; then
-   echo "$EIGEN_SHA256SUM  $(basename "$EIGEN_URL")" \
-      | sha256sum --check || exit 1
-else
-   echo "WARNING: could not verify checksum, please install sha256sum" >&2
-fi
+#if command -v sha256sum; then
+#   echo "$EIGEN_SHA256SUM  $(basename "$EIGEN_URL")" \
+#      | sha256sum --check || exit 1
+#else
+#   echo "WARNING: could not verify checksum, please install sha256sum" >&2
+#fi
 ./scripts/update_eigen.sh ${ARC}
 rm ${ARC}
 # patch for non-portable includes in Eigen 3.3.5

From 60db3133d38e8e4433627e6351e38cafe210c175 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Mon, 16 Mar 2020 17:59:56 -0400
Subject: [PATCH 03/51] make trace,adj,transpose unary operators

---
 bootstrap.sh | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/bootstrap.sh b/bootstrap.sh
index 5ea7ce32..4bd3de5e 100755
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -1,20 +1,19 @@
 #!/usr/bin/env bash
 set -e
 
-#https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.tar.bz2
-EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.3.7.tar.bz2'
+EIGEN_URL='https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.tar.bz2'
 EIGEN_SHA256SUM='685adf14bd8e9c015b78097c1dc22f2f01343756f196acdc76a678e1ae352e11'
 
 
 echo "-- deploying Eigen source..."
 ARC=`basename ${EIGEN_URL}`
 wget ${EIGEN_URL} --no-check-certificate
-#if command -v sha256sum; then
-#   echo "$EIGEN_SHA256SUM  $(basename "$EIGEN_URL")" \
-#      | sha256sum --check || exit 1
-#else
-#   echo "WARNING: could not verify checksum, please install sha256sum" >&2
-#fi
+if command -v sha256sum; then
+   echo "$EIGEN_SHA256SUM  $(basename "$EIGEN_URL")" \
+      | sha256sum --check || exit 1
+else
+   echo "WARNING: could not verify checksum, please install sha256sum" >&2
+fi
 ./scripts/update_eigen.sh ${ARC}
 rm ${ARC}
 # patch for non-portable includes in Eigen 3.3.5

From 989af658071f5d9fc92adc0d6e0ab9775b3e0e51 Mon Sep 17 00:00:00 2001
From: Daniel Richtmann <daniel.richtmann@gmail.com>
Date: Mon, 23 Mar 2020 17:33:18 +0100
Subject: [PATCH 04/51] Check in parallel reader for openqcd configs

---
 Grid/parallelIO/MetaData.h                 |  24 ++
 Grid/parallelIO/OpenQcdIO.h                | 165 ++++++++----
 Grid/parallelIO/OpenQcdIOChromaReference.h | 281 +++++++++++++++++++++
 Grid/qcd/hmc/HMC_aggregate.h               |   3 +
 tests/IO/Test_openqcd_io.cc                |  51 +++-
 5 files changed, 466 insertions(+), 58 deletions(-)
 create mode 100644 Grid/parallelIO/OpenQcdIOChromaReference.h

diff --git a/Grid/parallelIO/MetaData.h b/Grid/parallelIO/MetaData.h
index 2e211838..4c1cfbdb 100644
--- a/Grid/parallelIO/MetaData.h
+++ b/Grid/parallelIO/MetaData.h
@@ -301,6 +301,30 @@ struct GaugeSimpleUnmunger {
   };
 };
 
+template<class fobj,class sobj>
+struct GaugeDoubleStoredMunger{ // copies one double-stored gauge site from fobj (in) to sobj (out)
+  void operator()(fobj &in, sobj &out) {
+    for (int mu = 0; mu < Nds; mu++) { // all Nds link slots of the site
+      for (int i = 0; i < Nc; i++) {
+        for (int j = 0; j < Nc; j++) {
+          out(mu)()(i, j) = in(mu)()(i, j); // element-wise; presumably converts precision when the two types differ — TODO confirm
+        }}
+    }
+  };
+};
+
+template <class fobj, class sobj>
+struct GaugeDoubleStoredUnmunger { // reverse direction of GaugeDoubleStoredMunger: copies one site from sobj (in) to fobj (out)
+  void operator()(sobj &in, fobj &out) {
+    for (int mu = 0; mu < Nds; mu++) { // all Nds link slots of the site
+      for (int i = 0; i < Nc; i++) {
+        for (int j = 0; j < Nc; j++) {
+          out(mu)()(i, j) = in(mu)()(i, j); // element-wise; presumably converts precision when the two types differ — TODO confirm
+        }}
+    }
+  };
+};
+
 template<class fobj,class sobj>
 struct Gauge3x2munger{
   void operator() (fobj &in,sobj &out){
diff --git a/Grid/parallelIO/OpenQcdIO.h b/Grid/parallelIO/OpenQcdIO.h
index f340e8fc..00911595 100644
--- a/Grid/parallelIO/OpenQcdIO.h
+++ b/Grid/parallelIO/OpenQcdIO.h
@@ -67,6 +67,10 @@ public:
     field.dimension[2] = header.Nz;
     field.dimension[3] = header.Nt;
 
+    std::cout << GridLogDebug << "header: " << header << std::endl;
+    std::cout << GridLogDebug << "grid dimensions: " << grid->_fdimensions << std::endl;
+    std::cout << GridLogDebug << "file dimensions: " << field.dimension << std::endl;
+
     assert(grid->_ndimension == Nd);
     for(int d = 0; d < Nd; d++)
       assert(grid->_fdimensions[d] == field.dimension[d]);
@@ -80,74 +84,141 @@ public:
   static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
                                        FieldMetaData&                        header,
                                        std::string                           file) {
+    typedef Lattice<iDoubleStoredColourMatrix<vsimd>> DoubleStoredGaugeField;
+    // this reader is written for the 4d, three-colour case only
+    assert(Ns == 4 and Nd == 4 and Nc == 3);
+
     auto grid = dynamic_cast<GridCartesian*>(Umu.Grid());
-    assert(grid != nullptr);
-    assert((grid->_ndimension == Nd) && (Nd == 4));
+    assert(grid != nullptr); assert(grid->_ndimension == Nd);
 
     uint64_t offset = readHeader(file, Umu.Grid(), header);
+
     FieldMetaData clone(header);
 
-    // NOTE: This version is suboptimal because we read in the full file on every rank
-    std::vector<ColourMatrix> data(grid->gSites() * 4);
-    {
-      auto fin = std::fstream(file, std::ios::in | std::ios::binary);
-      fin.seekg(offset);
-      fin.read((char *)data.data(), data.size() * sizeof(ColourMatrix));
-      fin.close();
-    }
+    std::string format("IEEE64"); // they always store little endian double precision
+    uint32_t    nersc_csum, scidac_csuma, scidac_csumb; // checksum outputs of IOobject; not inspected in this function
 
-    // global lattice size
-    Coordinate fdim = grid->FullDimensions();
+    GridCartesian*         grid_openqcd = createOpenQcdGrid(grid);
+    GridRedBlackCartesian* grid_rb      = SpaceTimeGrid::makeFourDimRedBlackGrid(grid); // NOTE(review): neither helper grid is deleted in this function
 
-    // coordinate of this process
-    Coordinate pcoor;
-    grid->ProcessorCoorFromRank(CartesianCommunicator::RankWorld(), pcoor);
+    typedef DoubleStoredColourMatrixD                                              fobj;
+    typedef typename DoubleStoredGaugeField::vector_object::scalar_object          sobj;
+    typedef typename DoubleStoredGaugeField::vector_object::Realified::scalar_type word;
 
-    // loop over local indices
-    thread_for(idx, grid->lSites(), {
-      // convert local index to global coordinate
-      Coordinate lcoor, gcoor;
-      grid->LocalIndexToLocalCoor(idx, lcoor);
-      grid->ProcessorCoorLocalCoorToGlobalCoor(pcoor, lcoor, gcoor);
+    word w = 0;
 
-      // openQCD stores links attached to odd sites
-      bool neg = (gcoor[Xdir] + gcoor[Ydir] + gcoor[Zdir] + gcoor[Tdir]) % 2 != 1;
+    std::vector<fobj> iodata(grid_openqcd->lSites()); // Munge, checksum, byte order in here
+    std::vector<sobj> scalardata(grid->lSites()); // one entry per local site; only the odd-site entries are written below
 
-      LorentzColourMatrix site_data;
-      for (int mu = 0; mu < 4; ++mu) {
-        // determine the site at which it is stored
-        Coordinate c = gcoor;
-        if (neg)
-          c[mu] = (c[mu] + 1) % grid->FullDimensions()[mu];
+    IOobject(w, grid_openqcd, iodata, file, offset, format, BINARYIO_READ | BINARYIO_LEXICOGRAPHIC,
+             nersc_csum, scidac_csuma, scidac_csumb);
 
-        // site-index in the OpenQCD format (which uses t,x,y,z order)
-        int openqcd_idx = (c[Tdir] * fdim[Xdir] * fdim[Ydir] * fdim[Zdir]
-                        +  c[Xdir] * fdim[Ydir] * fdim[Zdir]
-                        +  c[Ydir] * fdim[Zdir]
-                        +  c[Zdir])/2;
-        int openqcd_mu = (mu + 1) % 4;
+    GridStopWatch timer;
+    timer.Start();
 
-        // pick the colour-matrix out
-        site_data(mu) =
-            data[8 * openqcd_idx + 2 * openqcd_mu + (neg ? 1 : 0)]();
-      }
+    DoubleStoredGaugeField Umu_ds(grid);
 
-      pokeLocalSite(site_data, Umu, lcoor);
+    auto munge = GaugeDoubleStoredMunger<DoubleStoredColourMatrixD, DoubleStoredColourMatrix>();
+    // NOTE(review): munge is instantiated with DoubleStoredColourMatrix rather than the sobj typedef above — confirm they agree for every vsimd
+    Coordinate ldim = grid->LocalDimensions();
+    thread_for(idx_g, grid->lSites(), {
+        Coordinate coor;
+        grid->LocalIndexToLocalCoor(idx_g, coor);
+        // only sites on the Odd checkerboard have data in the file; all other sites are skipped
+        bool isOdd = grid_rb->CheckerBoard(coor) == Odd;
+
+        if(!isOdd) continue; // NOTE(review): relies on thread_for expanding to a plain loop body — confirm
+        // index into iodata: t,x,y,z order with z fastest, halved because only every second site is stored
+        int idx_o = (coor[Tdir] * ldim[Xdir] * ldim[Ydir] * ldim[Zdir]
+                  +  coor[Xdir] * ldim[Ydir] * ldim[Zdir]
+                  +  coor[Ydir] * ldim[Zdir]
+                  +  coor[Zdir])/2;
+
+        munge(iodata[idx_o], scalardata[idx_g]);
     });
 
+    grid->Barrier(); timer.Stop();
+    std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: munge overhead " << timer.Elapsed() << std::endl;
+
+    timer.Reset(); timer.Start();
+
+    vectorizeFromLexOrdArray(scalardata, Umu_ds);
+
+    grid->Barrier(); timer.Stop();
+    std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: vectorize overhead " << timer.Elapsed() << std::endl;
+
+    timer.Reset(); timer.Start();
+    // turn the double-stored field into the usual one-link-per-site-and-direction field
+    undoDoubleStore(Umu, Umu_ds);
+
+    grid->Barrier(); timer.Stop();
+    std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: redistribute overhead " << timer.Elapsed() << std::endl;
+
     GaugeStatistics(Umu, clone);
 
-    std::cout << GridLogMessage << "OpenQcd Configuration " << file << " plaquette "
-              << std::setprecision(15)
-              << clone.plaquette << " header " << header.plaquette
-              << " difference " << fabs(clone.plaquette - header.plaquette)
-              << std::endl;
+    RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
 
-    if(fabs(clone.plaquette - header.plaquette) >= 1.0e-5) std::cout << " Plaquette mismatch " << std::endl;
-    assert(fabs(clone.plaquette - header.plaquette) < 1.0e-5);
+    // clang-format off
+    std::cout << GridLogMessage << "OpenQcd Configuration " << file
+              << " plaquette " << clone.plaquette
+              << " header " << header.plaquette
+              << " difference " << plaq_diff
+              << std::endl;
+    // clang-format on
+    // the tolerance depends on getPrecision<vsimd> and grows with the square root of the number of ranks
+    RealD precTol = (getPrecision<vsimd>::value == 1) ? 2e-7 : 2e-15;
+    RealD tol     = precTol * std::sqrt(grid->_Nprocessors); // taken from RQCD chroma code
+
+    if(plaq_diff >= tol)
+      std::cout << " Plaquette mismatch (diff = " << plaq_diff << ", tol = " << tol << ")" << std::endl;
+    assert(plaq_diff < tol);
 
     std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl;
   }
+
+  template<class vsimd> // placeholder: writing is not supported, calling this aborts the run
+  static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
+                                        std::string                           file) {
+    std::cout << GridLogError << "Writing to openQCD file format is not implemented" << std::endl;
+    exit(EXIT_FAILURE); // terminates the process; neither argument is used
+  }
+
+private:
+  static inline GridCartesian* createOpenQcdGrid(GridCartesian* grid) {
+    // exploit GridCartesian to be able to still use IOobject: build a second grid shaped like the site layout of the file
+    Coordinate gdim  = grid->GlobalDimensions();
+    Coordinate ldim  = grid->LocalDimensions();
+    Coordinate pcoor = grid->ThisProcessorCoor();
+
+    // openqcd does rb on the z direction, so the file holds only half the z extent
+    gdim[Zdir] /= 2;
+    ldim[Zdir] /= 2;
+
+    // and has the order T X Y Z (from slowest to fastest)
+    std::swap(gdim[Xdir], gdim[Zdir]);
+    std::swap(ldim[Xdir], ldim[Zdir]);
+    std::swap(pcoor[Xdir], pcoor[Zdir]);
+    // NOTE(review): ProcessorGrid() is passed unswapped while gdim, ldim and pcoor are swapped — confirm for MPI layouts that differ in x and z
+    GridCartesian* ret   = SpaceTimeGrid::makeFourDimGrid(gdim, grid->_simd_layout, grid->ProcessorGrid());
+    ret->_ldimensions    = ldim;  // overwrite what the factory computed with the swapped local extents
+    ret->_processor_coor = pcoor; // likewise for the processor coordinate
+    return ret; // presumably owned by the caller — TODO confirm; readConfiguration does not delete it
+  }
+
+  template<class vsimd> // builds the Nd-component gauge field Umu from the double-stored field Umu_ds
+  static inline void undoDoubleStore(Lattice<iLorentzColourMatrix<vsimd>>&            Umu,
+                                     Lattice<iDoubleStoredColourMatrix<vsimd>> const& Umu_ds) {
+    conformable(Umu.Grid(), Umu_ds.Grid());
+    Lattice<iColourMatrix<vsimd>> U(Umu.Grid()); // scratch field for one direction
+
+    // they store T+, T-, X+, X-, Y+, Y-, Z+, Z-
+    for(int mu_g = 0; mu_g < Nd; ++mu_g) {
+      int mu_o = (mu_g + 1) % Nd; // Grid direction index (x,y,z,t) -> openqcd direction index (t,x,y,z)
+      U        = PeekIndex<LorentzIndex>(Umu_ds, 2 * mu_o) // slot 2*mu_o is taken as is ...
+               + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 2 * mu_o + 1), mu_g, +1); // ... slot 2*mu_o+1 is shifted by one site along mu_g and added
+      PokeIndex<LorentzIndex>(Umu, U, mu_g); // presumably the sum never overlaps because only odd sites of Umu_ds carry data — TODO confirm
+    }
+  }
 };
 
 NAMESPACE_END(Grid);
diff --git a/Grid/parallelIO/OpenQcdIOChromaReference.h b/Grid/parallelIO/OpenQcdIOChromaReference.h
new file mode 100644
index 00000000..bab54fe8
--- /dev/null
+++ b/Grid/parallelIO/OpenQcdIOChromaReference.h
@@ -0,0 +1,281 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/parallelIO/OpenQcdIOChromaReference.h
+
+Copyright (C) 2015 - 2020
+
+Author: Daniel Richtmann <daniel.richtmann@ur.de>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+#include <ios>
+#include <iostream>
+#include <limits>
+#include <iomanip>
+#include <mpi.h>
+#include <ostream>
+#include <string>
+
+#define CHECK {std::cerr << __FILE__ << " @l " << __LINE__ << ": CHECK" << grid->ThisRank() << std::endl;} // debug trace of file, line and rank; needs a variable named grid in scope
+#define CHECK_VAR(a)   { std::cerr << __FILE__ << "@l" << __LINE__ << " on "<< grid->ThisRank() << ": " << __func__ << " " << #a << "=" << (a) << std::endl; } // same, plus function name and the name and value of a
+// #undef CHECK    (uncomment this line and the next to compile the CHECK traces out)
+// #define CHECK
+
+NAMESPACE_BEGIN(Grid);
+
+class ParRdr { // MPI-IO reader for openQCD gauge files: every rank reads the odd sites of its own sub-lattice
+private:
+  bool const swap; // set to false in the constructor and not read anywhere in this class
+
+  MPI_Status status; // status of the most recent read
+  MPI_File   fp;     // opened in the constructor, closed in the destructor
+
+  int err; // return code of the most recent MPI call
+
+  MPI_Datatype oddSiteType;  // contiguous block of bytes holding all links stored for one odd site
+  MPI_Datatype fileViewType; // subarray of oddSiteType elements that selects this rank's part of the file
+
+  GridBase* grid; // not owned; provides dimensions, processor coordinate and rank
+
+public:
+  ParRdr(MPI_Comm comm, std::string const& filename, GridBase* gridPtr) // opens filename read-only on comm
+    : swap(false)
+    , grid(gridPtr) {
+    err = MPI_File_open(comm, const_cast<char*>(filename.c_str()), MPI_MODE_RDONLY, MPI_INFO_NULL, &fp);
+    assert(err == MPI_SUCCESS);
+  }
+  // NOTE(review): copying is not disabled although the destructor closes fp; a copy would close it twice
+  virtual ~ParRdr() { MPI_File_close(&fp); }
+
+  inline void errInfo(int const err, std::string const& func) { // report a failed MPI call on stderr
+    if(err == MPI_SUCCESS) return; // success is not an error: print nothing
+    char estring[MPI_MAX_ERROR_STRING];
+    int  eclass = -1, len = 0;
+    MPI_Error_class(err, &eclass); MPI_Error_string(err, estring, &len);
+    std::cerr << func << " - Error " << eclass << ": " << estring << std::endl;
+  }
+
+  int readHeader(FieldMetaData& field) { // reads and validates the header, fills field, returns the offset of the link data
+    assert((grid->_ndimension == Nd) && (Nd == 4));
+    assert(Nc == 3);
+
+    OpenQcdHeader header;
+
+    readBlock(reinterpret_cast<char*>(&header), 0, sizeof(OpenQcdHeader), MPI_CHAR); // collective read of the raw header bytes at offset 0
+
+    header.plaq /= 3.; // TODO change this into normalizationfactor
+
+    // sanity check (should trigger on endian issues) TODO remove?
+    assert(0 < header.Nt && header.Nt <= 1024);
+    assert(0 < header.Nx && header.Nx <= 1024);
+    assert(0 < header.Ny && header.Ny <= 1024);
+    assert(0 < header.Nz && header.Nz <= 1024);
+
+    field.dimension[0] = header.Nx;
+    field.dimension[1] = header.Ny;
+    field.dimension[2] = header.Nz;
+    field.dimension[3] = header.Nt;
+
+    for(int d = 0; d < Nd; d++)
+      assert(grid->FullDimensions()[d] == field.dimension[d]);
+
+    field.plaquette = header.plaq;
+
+    field.data_start = sizeof(OpenQcdHeader);
+
+    return field.data_start;
+  }
+  // Collective read of nbytes elements of datatype (an element count, despite the name) at offset pos of the current file view.
+  void readBlock(void* const dest, uint64_t const pos, uint64_t const nbytes, MPI_Datatype const datatype) {
+    err = MPI_File_read_at_all(fp, pos, dest, nbytes, datatype, &status);
+    errInfo(err, "MPI_File_read_at_all"); // prints only when the call failed
+    // CHECK_VAR(err)
+
+    int read = -1;
+    MPI_Get_count(&status, datatype, &read);
+    // CHECK_VAR(read)
+    assert(nbytes == (uint64_t)read); // a short read is treated as fatal
+    assert(err == MPI_SUCCESS);
+  }
+  // Build the MPI datatypes for one stored site and for this rank's window into the file.
+  void createTypes() {
+    constexpr int elem_size = Nd * 2 * 2 * Nc * Nc * sizeof(double); // 2_complex 2_fwdbwd
+
+    err = MPI_Type_contiguous(elem_size, MPI_BYTE, &oddSiteType); assert(err == MPI_SUCCESS);
+    err = MPI_Type_commit(&oddSiteType); assert(err == MPI_SUCCESS);
+
+    Coordinate const L = grid->GlobalDimensions();
+    Coordinate const l = grid->LocalDimensions();
+    Coordinate const i = grid->ThisProcessorCoor();
+    // extents are listed fastest index first (MPI_ORDER_FORTRAN): z/2, y, x, t
+    Coordinate sizes({L[2] / 2, L[1], L[0], L[3]});
+    Coordinate subsizes({l[2] / 2, l[1], l[0], l[3]});
+    Coordinate starts({i[2] * l[2] / 2, i[1] * l[1], i[0] * l[0], i[3] * l[3]});
+
+    err = MPI_Type_create_subarray(grid->_ndimension, &sizes[0], &subsizes[0], &starts[0], MPI_ORDER_FORTRAN, oddSiteType, &fileViewType); assert(err == MPI_SUCCESS);
+    err = MPI_Type_commit(&fileViewType); assert(err == MPI_SUCCESS);
+  }
+
+  void freeTypes() { // releases both datatypes made by createTypes()
+    err = MPI_Type_free(&fileViewType); assert(err == MPI_SUCCESS);
+    err = MPI_Type_free(&oddSiteType); assert(err == MPI_SUCCESS);
+  }
+  // Read this rank's odd-site links into domain_buff (resized to Nd * lSites matrices) and fill meta; always returns true.
+  bool readGauge(std::vector<ColourMatrixD>& domain_buff, FieldMetaData& meta) {
+    auto hdr_offset = readHeader(meta);
+    CHECK
+    createTypes();
+    err = MPI_File_set_view(fp, hdr_offset, oddSiteType, fileViewType, "native", MPI_INFO_NULL); errInfo(err, "MPI_File_set_view0"); assert(err == MPI_SUCCESS);
+    CHECK
+    int const domainSites = grid->lSites();
+    domain_buff.resize(Nd * domainSites); // 2_fwdbwd * 4_Nd * domainSites / 2_onlyodd
+
+    // the actual READ
+    constexpr uint64_t cm_size   = 2 * Nc * Nc * sizeof(double);    // 2_complex
+    constexpr uint64_t os_size   = Nd * 2 * cm_size;                // 2_fwdbwd
+    constexpr uint64_t max_elems = std::numeric_limits<int>::max(); // int addressable elems: floor is fine
+    uint64_t const     n_os      = domainSites / 2;
+    // read in chunks of at most max_elems odd sites per call
+    for(uint64_t os_idx = 0; os_idx < n_os;) {
+      uint64_t const read_os = os_idx + max_elems <= n_os ? max_elems : n_os - os_idx;
+      uint64_t const cm      = os_idx * Nd * 2; // index of the first colour matrix of odd site os_idx
+      readBlock(&(domain_buff[cm]), os_idx, read_os, oddSiteType);
+      os_idx += read_os;
+    }
+    // switch back to a plain byte view before the datatypes are released
+    CHECK
+    err = MPI_File_set_view(fp, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);
+    errInfo(err, "MPI_File_set_view1");
+    assert(err == MPI_SUCCESS);
+    freeTypes();
+
+    std::cout << GridLogMessage << "read sum: " << n_os * os_size << " bytes" << std::endl;
+    return true;
+  }
+};
+
+class OpenQcdIOChromaReference : public BinaryIO { // reads a gauge configuration through ParRdr::readGauge and checks its plaquette against the header
+public:
+  template<class vsimd>
+  static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
+                                       Grid::FieldMetaData&                  header,
+                                       std::string                           file) {
+    typedef Lattice<iDoubleStoredColourMatrix<vsimd>> DoubledGaugeField;
+
+    assert(Ns == 4 and Nd == 4 and Nc == 3); // redistribute() below hard-codes component indices 0..7
+
+    auto grid = Umu.Grid();
+
+    typedef ColourMatrixD fobj;
+
+    std::vector<fobj> iodata(
+      Nd * grid->lSites()); // actual size = 2*Nd*lsites but have only lsites/2 sites in file
+
+    {
+      ParRdr rdr(MPI_COMM_WORLD, file, grid);
+      rdr.readGauge(iodata, header);
+    } // equivalent to using binaryio
+
+    std::vector<iDoubleStoredColourMatrix<typename vsimd::scalar_type>> Umu_ds_scalar(grid->lSites());
+
+    copyToLatticeObject(Umu_ds_scalar, iodata, grid); // equivalent to munging
+
+    DoubledGaugeField Umu_ds(grid);
+
+    vectorizeFromLexOrdArray(Umu_ds_scalar, Umu_ds);
+
+    redistribute(Umu, Umu_ds); // equivalent to undoDoublestore
+
+    FieldMetaData clone(header); // statistics are recomputed into a copy so header keeps the values read from file
+
+    GaugeStatistics(Umu, clone);
+
+    RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
+
+    // clang-format off
+    std::cout << GridLogMessage << "OpenQcd Configuration " << file
+              << " plaquette " << clone.plaquette
+              << " header " << header.plaquette
+              << " difference " << plaq_diff
+              << std::endl;
+    // clang-format on
+
+    RealD precTol = (getPrecision<vsimd>::value == 1) ? 2e-7 : 2e-15;
+    RealD tol     = precTol * std::sqrt(grid->_Nprocessors); // taken from RQCD chroma code
+
+    if(plaq_diff >= tol)
+      std::cout << " Plaquette mismatch (diff = " << plaq_diff << ", tol = " << tol << ")" << std::endl;
+    assert(plaq_diff < tol); // a plaquette mismatch is fatal
+
+    std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl;
+  }
+
+private:
+  template<class vsimd>
+  static inline void redistribute(Lattice<iLorentzColourMatrix<vsimd>>&            Umu,
+                                  Lattice<iDoubleStoredColourMatrix<vsimd>> const& Umu_ds) {
+    Grid::conformable(Umu.Grid(), Umu_ds.Grid());
+    Lattice<iColourMatrix<vsimd>> U(Umu.Grid()); // scratch field; below Umu[mu] = ds[a] + Cshift(ds[a+1], mu, +1) for (a,mu) = (2,0),(4,1),(6,2),(0,3)
+
+    U = PeekIndex<LorentzIndex>(Umu_ds, 2) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 3), 0, +1); PokeIndex<LorentzIndex>(Umu, U, 0);
+    U = PeekIndex<LorentzIndex>(Umu_ds, 4) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 5), 1, +1); PokeIndex<LorentzIndex>(Umu, U, 1);
+    U = PeekIndex<LorentzIndex>(Umu_ds, 6) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 7), 2, +1); PokeIndex<LorentzIndex>(Umu, U, 2);
+    U = PeekIndex<LorentzIndex>(Umu_ds, 0) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 1), 3, +1); PokeIndex<LorentzIndex>(Umu, U, 3);
+  }
+
+  static inline void copyToLatticeObject(std::vector<DoubleStoredColourMatrix>& u_fb,
+                                         std::vector<ColourMatrixD> const&      node_buff,
+                                         GridBase*                              grid) {
+    assert(node_buff.size() == Nd * grid->lSites());
+
+    Coordinate const& l = grid->LocalDimensions();
+
+    Coordinate coord(Nd);
+    int&       x = coord[0]; // x,y,z,t alias the entries of coord, so the loops below update coord directly
+    int&       y = coord[1];
+    int&       z = coord[2];
+    int&       t = coord[3];
+
+    int buff_idx = 0;
+    for(t = 0; t < l[3]; ++t) // IMPORTANT: openQCD file ordering
+      for(x = 0; x < l[0]; ++x)
+        for(y = 0; y < l[1]; ++y)
+          for(z = 0; z < l[2]; ++z) {
+            if((t + z + y + x) % 2 == 0) continue; // entries of u_fb at even local-parity sites are left untouched; NOTE(review): parity uses local coordinates - confirm it matches global parity
+
+            int local_idx;
+            Lexicographic::IndexFromCoor(coord, local_idx, grid->LocalDimensions());
+            for(int mu = 0; mu < 2 * Nd; ++mu)
+              for(int c1 = 0; c1 < Nc; ++c1) {
+                for(int c2 = 0; c2 < Nc; ++c2) {
+                  u_fb[local_idx](mu)()(c1,c2) = node_buff[mu+buff_idx]()()(c1,c2);
+                }
+              }
+            buff_idx += 2 * Nd; // each copied site consumes 2*Nd consecutive matrices of node_buff
+          }
+
+    assert(node_buff.size() == buff_idx); // the whole buffer must have been consumed
+  }
+};
+
+NAMESPACE_END(Grid);
diff --git a/Grid/qcd/hmc/HMC_aggregate.h b/Grid/qcd/hmc/HMC_aggregate.h
index 94c745e1..cb510953 100644
--- a/Grid/qcd/hmc/HMC_aggregate.h
+++ b/Grid/qcd/hmc/HMC_aggregate.h
@@ -40,6 +40,9 @@ directory
 #include <Grid/parallelIO/IldgIO.h>
 #include <Grid/parallelIO/NerscIO.h>
 #include <Grid/parallelIO/OpenQcdIO.h>
+#if !defined(GRID_COMMS_NONE)
+#include <Grid/parallelIO/OpenQcdIOChromaReference.h>
+#endif
 NAMESPACE_CHECK(Ildg);
 
 #include <Grid/qcd/hmc/checkpointers/CheckPointers.h>
diff --git a/tests/IO/Test_openqcd_io.cc b/tests/IO/Test_openqcd_io.cc
index 2a5769bd..83b498c2 100644
--- a/tests/IO/Test_openqcd_io.cc
+++ b/tests/IO/Test_openqcd_io.cc
@@ -28,28 +28,57 @@ See the full license in the file "LICENSE" in the top level distribution directo
 
 #include <Grid/Grid.h>
 
+#if defined(GRID_COMMS_NONE)
+#error This test requires Grid compiled with MPI
+#endif
+
 using namespace Grid;
 
 int main(int argc, char** argv) {
   Grid_init(&argc, &argv);
 
-  GridCartesian* grid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
-                                                       GridDefaultSimd(Nd, vComplexD::Nsimd()),
-                                                       GridDefaultMpi());
+  auto simd_layout = GridDefaultSimd(Nd, vComplex::Nsimd());
+  auto mpi_layout  = GridDefaultMpi();
+  auto latt_size   = GridDefaultLatt();
 
-  LatticeGaugeField Umu(grid);
+  GridCartesian grid(latt_size, simd_layout, mpi_layout);
 
-  FieldMetaData header;
+  GridParallelRNG pRNG(&grid);
 
-  if(!Grid::GridCmdOptionExists(argv, argv + argc, "--config")) {
-    std::cout << GridLogError << "You need to use --config /path/to/openqcd_config" << std::endl;
-    abort();
+  pRNG.SeedFixedIntegers(std::vector<int>({45, 12, 81, 9}));
+
+  LatticeGaugeField Umu_ref(&grid);
+  LatticeGaugeField Umu_me(&grid);
+  LatticeGaugeField Umu_diff(&grid);
+
+  FieldMetaData header_ref;
+  FieldMetaData header_me;
+
+  Umu_ref = Zero();
+  Umu_me  = Zero();
+
+  std::string file("/home/daniel/configs/openqcd/test_16x8_pbcn6");
+
+  if(GridCmdOptionExists(argv, argv + argc, "--config")) {
+    file = GridCmdOptionPayload(argv, argv + argc, "--config");
+    std::cout << "file: " << file << std::endl;
+    assert(!file.empty());
   }
 
-  std::string file = Grid::GridCmdOptionPayload(argv, argv + argc, "--config");
-  assert(!file.empty());
+  OpenQcdIOChromaReference::readConfiguration(Umu_ref, header_ref, file);
+  OpenQcdIO::readConfiguration(Umu_me, header_me, file);
 
-  OpenQcdIO::readConfiguration(Umu, header, file);
+  std::cout << GridLogMessage << header_ref << std::endl;
+  std::cout << GridLogMessage << header_me << std::endl;
+
+  Umu_diff = Umu_ref - Umu_me;
+
+  // clang-format off
+  std::cout << GridLogMessage
+            << "norm2(Umu_ref) = " << norm2(Umu_ref)
+            << " norm2(Umu_me) = " << norm2(Umu_me)
+            << " norm2(Umu_diff) = " << norm2(Umu_diff) << std::endl;
+  // clang-format on
 
   Grid_finalize();
 }

From a2188ea875246e16d2081142a62e59f85961a6c4 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Thu, 26 Mar 2020 09:12:36 -0400
Subject: [PATCH 05/51] remove debugging printf from
 WilsonKernelsImplementation

---
 .../fermion/implementation/WilsonKernelsImplementation.h    | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
index a787fa79..5600d25a 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@@ -372,19 +372,19 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;}
 #ifndef GRID_NVCC
      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;}
-     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSite); printf(".");    return;}
+     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSite);    return;}
 #endif
    } else if( interior ) {
      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
 #ifndef GRID_NVCC
      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt);    return;}
-     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt); printf("-");    return;}
+     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt);    return;}
 #endif
    } else if( exterior ) { 
      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteExt); return;}
 #ifndef GRID_NVCC
      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt);    return;}
-     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteExt); printf("+");    return;}
+     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteExt);    return;}
 #endif
    }
    assert(0 && " Kernel optimisation case not covered ");

From 856d168e4131b915ecab245753b44b690662c881 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Sun, 29 Mar 2020 07:56:05 -0400
Subject: [PATCH 06/51] global sum over vectors of uint64_t

---
 Grid/communicator/Communicator_base.h  | 1 +
 Grid/communicator/Communicator_mpi3.cc | 4 ++++
 Grid/communicator/Communicator_none.cc | 3 ++-
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h
index 11dbfcbb..436d75ef 100644
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@@ -114,6 +114,7 @@ public:
   void GlobalSumVector(RealD *,int N);
   void GlobalSum(uint32_t &);
   void GlobalSum(uint64_t &);
+  void GlobalSumVector(uint64_t*,int N);
   void GlobalSum(ComplexF &c);
   void GlobalSumVector(ComplexF *c,int N);
   void GlobalSum(ComplexD &c);
diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
index 2576b1fa..0e525674 100644
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -255,6 +255,10 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
   assert(ierr==0);
 }
+void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){ // in-place element-wise MPI_SUM of the N entries of u over communicator
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
+  assert(ierr==0);
+}
 void CartesianCommunicator::GlobalXOR(uint32_t &u){
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
   assert(ierr==0);
diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc
index b8a15a0e..81900371 100644
--- a/Grid/communicator/Communicator_none.cc
+++ b/Grid/communicator/Communicator_none.cc
@@ -70,9 +70,10 @@ CartesianCommunicator::~CartesianCommunicator(){}
 void CartesianCommunicator::GlobalSum(float &){}
 void CartesianCommunicator::GlobalSumVector(float *,int N){}
 void CartesianCommunicator::GlobalSum(double &){}
+void CartesianCommunicator::GlobalSumVector(double *,int N){}
 void CartesianCommunicator::GlobalSum(uint32_t &){}
 void CartesianCommunicator::GlobalSum(uint64_t &){}
-void CartesianCommunicator::GlobalSumVector(double *,int N){}
+void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){} // no-op stub in Communicator_none.cc: the input is left unchanged
 void CartesianCommunicator::GlobalXOR(uint32_t &){}
 void CartesianCommunicator::GlobalXOR(uint64_t &){}
 

From 5fc8a273e7e8f1a29acebad0237b5e0bef2ecb97 Mon Sep 17 00:00:00 2001
From: Daniel Richtmann <daniel.richtmann@gmail.com>
Date: Mon, 6 Apr 2020 11:30:50 +0200
Subject: [PATCH 07/51] Fused innerProduct + norm2 on first argument operation

---
 Grid/lattice/Lattice_reduction.h |  58 +++++++++++++-
 tests/Test_innerproduct_norm.cc  | 126 +++++++++++++++++++++++++++++++
 2 files changed, 183 insertions(+), 1 deletion(-)
 create mode 100644 tests/Test_innerproduct_norm.cc

diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
index 3c5b03e5..de2efd72 100644
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -204,8 +204,64 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
   grid->GlobalSum(nrm);
   return nrm; 
 }
-
  
+template<class vobj> strong_inline void
+innerProduct_norm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Lattice<vobj> &right)
+{
+  conformable(left,right);
+
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_typeD vector_type;
+  Vector<ComplexD> tmp(2); // tmp[0] accumulates innerProduct(left,right), tmp[1] accumulates innerProduct(left,left)
+
+  GridBase *grid = left.Grid(); // fused reduction: ip and nrm are produced in one site loop and one GlobalSumVector call
+
+  auto left_v=left.View();
+  auto right_v=right.View();
+
+  const uint64_t nsimd = grid->Nsimd();
+  const uint64_t sites = grid->oSites();
+
+#ifdef GRID_NVCC
+  // GPU: per-site partial results are stored with coalescedWrite and reduced with sumD_gpu
+  typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
+  typedef decltype(innerProduct(left_v[0],left_v[0])) norm_t;
+  Vector<inner_t> inner_tmp(sites);
+  Vector<norm_t> norm_tmp(sites);
+  auto inner_tmp_v = &inner_tmp[0];
+  auto norm_tmp_v = &norm_tmp[0];
+
+  accelerator_for( ss, sites, nsimd,{
+      auto left_tmp = left_v(ss);
+      coalescedWrite(inner_tmp_v[ss],innerProduct(left_tmp,right_v(ss)));
+      coalescedWrite(norm_tmp_v[ss],innerProduct(left_tmp,left_tmp));
+  });
+
+  tmp[0] = TensorRemove(sumD_gpu(inner_tmp_v,sites));
+  tmp[1] = TensorRemove(sumD_gpu(norm_tmp_v,sites));
+#else
+  // CPU: per-site partial results are formed with innerProductD and reduced with sum
+  typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
+  typedef decltype(innerProductD(left_v[0],left_v[0])) norm_t;
+  Vector<inner_t> inner_tmp(sites);
+  Vector<norm_t> norm_tmp(sites);
+  auto inner_tmp_v = &inner_tmp[0];
+  auto norm_tmp_v = &norm_tmp[0];
+
+  accelerator_for( ss, sites, nsimd,{
+      auto left_tmp = left_v(ss);
+      inner_tmp_v[ss] = innerProductD(left_tmp,right_v(ss));
+      norm_tmp_v[ss] = innerProductD(left_tmp,left_tmp);
+  });
+  // Already promoted to double
+  tmp[0] = TensorRemove(sum(inner_tmp_v,sites));
+  tmp[1] = TensorRemove(sum(norm_tmp_v,sites));
+#endif
+  grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector
+  ip = tmp[0];
+  nrm = real(tmp[1]); // only the real part of the accumulated innerProduct(left,left) is returned
+}
+
 template<class Op,class T1>
 inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
   ->typename decltype(expr.op.func(eval(0,expr.arg1)))::scalar_object
diff --git a/tests/Test_innerproduct_norm.cc b/tests/Test_innerproduct_norm.cc
new file mode 100644
index 00000000..85c98521
--- /dev/null
+++ b/tests/Test_innerproduct_norm.cc
@@ -0,0 +1,126 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./tests/Test_innerproduct_norm.cc
+
+Copyright (C) 2015
+
+Author: Daniel Richtmann <daniel.richtmann@ur.de>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+
+int main(int argc, char** argv) {
+  Grid_init(&argc, &argv);
+
+  const int nIter = 100; // repetitions of each variant inside the timed loops
+
+  // clang-format off
+  GridCartesian *Grid_d = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi());
+  GridCartesian *Grid_f = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
+  // clang-format on
+
+  GridParallelRNG pRNG_d(Grid_d);
+  GridParallelRNG pRNG_f(Grid_f);
+
+  std::vector<int> seeds_d({1, 2, 3, 4});
+  std::vector<int> seeds_f({5, 6, 7, 8});
+
+  pRNG_d.SeedFixedIntegers(seeds_d);
+  pRNG_f.SeedFixedIntegers(seeds_f);
+
+  // clang-format off
+  LatticeFermionD x_d(Grid_d); random(pRNG_d, x_d);
+  LatticeFermionD y_d(Grid_d); random(pRNG_d, y_d);
+  LatticeFermionF x_f(Grid_f); random(pRNG_f, x_f);
+  LatticeFermionF y_f(Grid_f); random(pRNG_f, y_f);
+  // clang-format on
+
+  GridStopWatch sw_ref; // times the separate innerProduct + norm2 calls
+  GridStopWatch sw_res; // times the fused call
+
+  { // double precision
+    ComplexD ip_d_ref, ip_d_res, diff_ip_d;
+    RealD    norm2_d_ref, norm2_d_res, diff_norm2_d;
+
+    sw_ref.Reset();
+    sw_ref.Start();
+    for(int i = 0; i < nIter; ++i) {
+      ip_d_ref    = innerProduct(x_d, y_d);
+      norm2_d_ref = norm2(x_d);
+    }
+    sw_ref.Stop();
+
+    sw_res.Reset();
+    sw_res.Start();
+    for(int i = 0; i < nIter; ++i) { innerProduct_norm(ip_d_res, norm2_d_res, x_d, y_d); }
+    sw_res.Stop();
+
+    diff_ip_d    = ip_d_ref - ip_d_res;
+    diff_norm2_d = norm2_d_ref - norm2_d_res;
+
+    // clang-format off
+    std::cout << GridLogMessage << "Double: ip_ref = " << ip_d_ref << " ip_res = " << ip_d_res << " diff = " << diff_ip_d << std::endl;
+    std::cout << GridLogMessage << "Double: norm2_ref = " << norm2_d_ref << " norm2_res = " << norm2_d_res << " diff = " << diff_norm2_d << std::endl;
+    std::cout << GridLogMessage << "Double: time_ref = " << sw_ref.Elapsed() << " time_res = " << sw_res.Elapsed() << std::endl;
+    // clang-format on
+
+    assert(diff_ip_d == 0.); // the fused result must agree exactly with the separate calls, not just within a tolerance
+    assert(diff_norm2_d == 0.);
+
+    std::cout << GridLogMessage << "Double: all checks passed" << std::endl;
+  }
+
+  { // single precision
+    ComplexD ip_f_ref, ip_f_res, diff_ip_f;
+    RealD    norm2_f_ref, norm2_f_res, diff_norm2_f;
+
+    sw_ref.Reset();
+    sw_ref.Start();
+    for(int i = 0; i < nIter; ++i) {
+      ip_f_ref    = innerProduct(x_f, y_f);
+      norm2_f_ref = norm2(x_f);
+    }
+    sw_ref.Stop();
+
+    sw_res.Reset();
+    sw_res.Start();
+    for(int i = 0; i < nIter; ++i) { innerProduct_norm(ip_f_res, norm2_f_res, x_f, y_f); }
+    sw_res.Stop();
+
+    diff_ip_f    = ip_f_ref - ip_f_res;
+    diff_norm2_f = norm2_f_ref - norm2_f_res;
+
+    // clang-format off
+    std::cout << GridLogMessage << "Single: ip_ref = " << ip_f_ref << " ip_res = " << ip_f_res << " diff = " << diff_ip_f << std::endl;
+    std::cout << GridLogMessage << "Single: norm2_ref = " << norm2_f_ref << " norm2_res = " << norm2_f_res << " diff = " << diff_norm2_f << std::endl;
+    std::cout << GridLogMessage << "Single: time_ref = " << sw_ref.Elapsed() << " time_res = " << sw_res.Elapsed() << std::endl;
+    // clang-format on
+
+    assert(diff_ip_f == 0.); // same exact-agreement requirement for the single-precision fields
+    assert(diff_norm2_f == 0.);
+
+    std::cout << GridLogMessage << "Single: all checks passed" << std::endl;
+  }
+
+  Grid_finalize();
+}

From 091d5c605ef041dbb95ba88e8a8b49c8a7b966af Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Fri, 17 Apr 2020 04:25:28 -0400
Subject: [PATCH 08/51] towards more precise blocking

---
 Grid/lattice/Lattice_reduction.h |  2 +-
 Grid/tensors/Tensor_class.h      |  2 +
 Grid/tensors/Tensor_inner.h      | 73 ++++++++++++++++++++++++++++++++
 Grid/tensors/Tensor_traits.h     | 20 +++++++++
 4 files changed, 96 insertions(+), 1 deletion(-)

diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
index de2efd72..d3f5f9ae 100644
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -206,7 +206,7 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
 }
  
 template<class vobj> strong_inline void
-innerProduct_norm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Lattice<vobj> &right)
+innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Lattice<vobj> &right)
 {
   conformable(left,right);
 
diff --git a/Grid/tensors/Tensor_class.h b/Grid/tensors/Tensor_class.h
index 75e42721..dbcbae8d 100644
--- a/Grid/tensors/Tensor_class.h
+++ b/Grid/tensors/Tensor_class.h
@@ -6,6 +6,7 @@ Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Michael Marshall <michael.marshall@ed.ac.au>
+Author: Christoph Lehner <christoph@lhnr.de>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -55,6 +56,7 @@ class GridTensorBase {};
   using Complexified    = typename Traits::Complexified; \
   using Realified       = typename Traits::Realified; \
   using DoublePrecision = typename Traits::DoublePrecision; \
+  using DoublePrecision2= typename Traits::DoublePrecision2; \
   static constexpr int TensorLevel = Traits::TensorLevel
 
 template <class vtype>
diff --git a/Grid/tensors/Tensor_inner.h b/Grid/tensors/Tensor_inner.h
index 03f72966..c052adcf 100644
--- a/Grid/tensors/Tensor_inner.h
+++ b/Grid/tensors/Tensor_inner.h
@@ -8,6 +8,7 @@
 
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Christoph Lehner <christoph@lhnr.de>
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -194,6 +195,78 @@ auto innerProductD (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decl
   ret._internal = innerProductD(lhs._internal,rhs._internal);
   return ret;
 }
+
+
+//////////////////////////////////////
+// innerProductD2: precision promotion without inner sum
+//////////////////////////////////////
+accelerator_inline vComplexD2 TensorRemove(const vComplexD2 & x) { return x; };
+accelerator_inline vRealD2 TensorRemove(const vRealD2 & x) { return x; };
+
+accelerator_inline ComplexD innerProductD2(const ComplexF &l,const ComplexF &r){  return innerProduct(l,r); } // NOTE(review): innerProduct is called on the single-precision arguments; only the return type is double - confirm intended
+accelerator_inline ComplexD innerProductD2(const ComplexD &l,const ComplexD &r){  return innerProduct(l,r); }
+accelerator_inline RealD    innerProductD2(const RealD    &l,const RealD    &r){  return innerProduct(l,r); }
+accelerator_inline RealD    innerProductD2(const RealF    &l,const RealF    &r){  return innerProduct(l,r); } // NOTE(review): same remark as for the ComplexF overload
+
+accelerator_inline vComplexD innerProductD2(const vComplexD &l,const vComplexD &r){  return innerProduct(l,r); } // double-precision vectors are forwarded to innerProduct unchanged
+accelerator_inline vRealD    innerProductD2(const vRealD    &l,const vRealD    &r){  return innerProduct(l,r); }
+
+accelerator_inline vComplexD2 innerProductD2(const vComplexF &l,const vComplexF &r)
+{  // returns two vComplexD inner products, one per half produced by StoD, without adding them together
+  vComplexD la,lb;
+  vComplexD ra,rb;
+  Optimization::PrecisionChange::StoD(l.v,la.v,lb.v); // presumably widens l into two double-precision vectors la/lb - confirm against PrecisionChange
+  Optimization::PrecisionChange::StoD(r.v,ra.v,rb.v);
+  vComplexD2 ret;
+  ret._internal[0] = innerProduct(la,ra);
+  ret._internal[1] = innerProduct(lb,rb);
+  return ret;
+}
+accelerator_inline vRealD2 innerProductD2(const vRealF &l,const vRealF &r)
+{  // real counterpart of the vComplexF overload: two vRealD inner products, kept separate
+  vRealD la,lb;
+  vRealD ra,rb;
+  Optimization::PrecisionChange::StoD(l.v,la.v,lb.v); // presumably widens l into two double-precision vectors la/lb - confirm against PrecisionChange
+  Optimization::PrecisionChange::StoD(r.v,ra.v,rb.v);
+  vRealD2 ret;
+  ret._internal[0]=innerProduct(la,ra);
+  ret._internal[1]=innerProduct(lb,rb); // second half
+  return ret;
+}
+
+// Now do it for vector, matrix, scalar
+template<class l,class r,int N> accelerator_inline
+  auto innerProductD2 (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(innerProductD2(lhs._internal[0],rhs._internal[0]))>
+{
+  typedef decltype(innerProductD2(lhs._internal[0],rhs._internal[0])) ret_t;
+  iScalar<ret_t> ret;
+  zeroit(ret);
+  for(int c1=0;c1<N;c1++){ // accumulate the element-wise innerProductD2 over the N vector entries
+    ret._internal += innerProductD2(lhs._internal[c1],rhs._internal[c1]);
+  }
+  return ret;
+}
+template<class l,class r,int N> accelerator_inline
+  auto innerProductD2 (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(innerProductD2(lhs._internal[0][0],rhs._internal[0][0]))>
+{
+  typedef decltype(innerProductD2(lhs._internal[0][0],rhs._internal[0][0])) ret_t;
+  iScalar<ret_t> ret;
+  ret=Zero(); // the iVector overload clears its accumulator with zeroit(ret) instead
+  for(int c1=0;c1<N;c1++){ // accumulate the element-wise innerProductD2 over all N*N matrix entries
+    for(int c2=0;c2<N;c2++){
+      ret._internal+=innerProductD2(lhs._internal[c1][c2],rhs._internal[c1][c2]);
+    }}
+  return ret;
+}
+template<class l,class r> accelerator_inline
+  auto innerProductD2 (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(innerProductD2(lhs._internal,rhs._internal))>
+{
+  typedef decltype(innerProductD2(lhs._internal,rhs._internal)) ret_t;
+  iScalar<ret_t> ret;
+  ret._internal = innerProductD2(lhs._internal,rhs._internal); // forwards to the overload for the wrapped type
+  return ret;
+}
+
 //////////////////////
 // Keep same precison
 //////////////////////
diff --git a/Grid/tensors/Tensor_traits.h b/Grid/tensors/Tensor_traits.h
index 9067d43d..afb1f916 100644
--- a/Grid/tensors/Tensor_traits.h
+++ b/Grid/tensors/Tensor_traits.h
@@ -6,6 +6,7 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Christopher Kelly <ckelly@phys.columbia.edu>
 Author: Michael Marshall <michael.marshall@ed.ac.au>
+Author: Christoph Lehner <christoph@lhnr.de>
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
     the Free Software Foundation; either version 2 of the License, or
@@ -37,6 +38,10 @@ NAMESPACE_BEGIN(Grid);
   template<class T, int N> struct isGridTensor<iVector<T, N>> : public std::true_type  { static constexpr bool notvalue = false; };
   template<class T, int N> struct isGridTensor<iMatrix<T, N>> : public std::true_type  { static constexpr bool notvalue = false; };
 
+  // To store double-precision data in single-precision grids for precision promoted localInnerProductD
+  typedef iVector<vComplexD,2> vComplexD2;
+  typedef iVector<vRealD,2> vRealD2;
+
 //////////////////////////////////////////////////////////////////////////////////
 // Want to recurse: GridTypeMapper<Matrix<vComplexD> >::scalar_type == ComplexD.
 // Use of a helper class like this allows us to template specialise and "dress"
@@ -81,6 +86,7 @@ NAMESPACE_BEGIN(Grid);
     typedef ComplexF Complexified;
     typedef RealF Realified;
     typedef RealD DoublePrecision;
+    typedef RealD DoublePrecision2;
   };
   template<> struct GridTypeMapper<RealD> : public GridTypeMapper_Base {
     typedef RealD scalar_type;
@@ -93,6 +99,7 @@ NAMESPACE_BEGIN(Grid);
     typedef ComplexD Complexified;
     typedef RealD Realified;
     typedef RealD DoublePrecision;
+    typedef RealD DoublePrecision2;
   };
   template<> struct GridTypeMapper<ComplexF> : public GridTypeMapper_Base {
     typedef ComplexF scalar_type;
@@ -105,6 +112,7 @@ NAMESPACE_BEGIN(Grid);
     typedef ComplexF Complexified;
     typedef RealF Realified;
     typedef ComplexD DoublePrecision;
+    typedef ComplexD DoublePrecision2;
   };
   template<> struct GridTypeMapper<ComplexD> : public GridTypeMapper_Base {
     typedef ComplexD scalar_type;
@@ -117,6 +125,7 @@ NAMESPACE_BEGIN(Grid);
     typedef ComplexD Complexified;
     typedef RealD Realified;
     typedef ComplexD DoublePrecision;
+    typedef ComplexD DoublePrecision2;
   };
   template<> struct GridTypeMapper<Integer> : public GridTypeMapper_Base {
     typedef Integer scalar_type;
@@ -129,6 +138,7 @@ NAMESPACE_BEGIN(Grid);
     typedef void Complexified;
     typedef void Realified;
     typedef void DoublePrecision;
+    typedef void DoublePrecision2;
   };
 
   template<> struct GridTypeMapper<vRealF> : public GridTypeMapper_Base {
@@ -142,6 +152,7 @@ NAMESPACE_BEGIN(Grid);
     typedef vComplexF Complexified;
     typedef vRealF Realified;
     typedef vRealD DoublePrecision;
+    typedef vRealD2 DoublePrecision2;
   };
   template<> struct GridTypeMapper<vRealD> : public GridTypeMapper_Base {
     typedef RealD  scalar_type;
@@ -154,6 +165,7 @@ NAMESPACE_BEGIN(Grid);
     typedef vComplexD Complexified;
     typedef vRealD Realified;
     typedef vRealD DoublePrecision;
+    typedef vRealD DoublePrecision2;
   };
   template<> struct GridTypeMapper<vRealH> : public GridTypeMapper_Base {
     // Fixme this is incomplete until Grid supports fp16 or bfp16 arithmetic types
@@ -167,6 +179,7 @@ NAMESPACE_BEGIN(Grid);
     typedef vComplexH Complexified;
     typedef vRealH Realified;
     typedef vRealD DoublePrecision;
+    typedef vRealD DoublePrecision2;
   };
   template<> struct GridTypeMapper<vComplexH> : public GridTypeMapper_Base {
     // Fixme this is incomplete until Grid supports fp16 or bfp16 arithmetic types
@@ -180,6 +193,7 @@ NAMESPACE_BEGIN(Grid);
     typedef vComplexH Complexified;
     typedef vRealH Realified;
     typedef vComplexD DoublePrecision;
+    typedef vComplexD DoublePrecision2;
   };
   template<> struct GridTypeMapper<vComplexF> : public GridTypeMapper_Base {
     typedef ComplexF  scalar_type;
@@ -192,6 +206,7 @@ NAMESPACE_BEGIN(Grid);
     typedef vComplexF Complexified;
     typedef vRealF Realified;
     typedef vComplexD DoublePrecision;
+    typedef vComplexD2 DoublePrecision2;
   };
   template<> struct GridTypeMapper<vComplexD> : public GridTypeMapper_Base {
     typedef ComplexD  scalar_type;
@@ -204,6 +219,7 @@ NAMESPACE_BEGIN(Grid);
     typedef vComplexD Complexified;
     typedef vRealD Realified;
     typedef vComplexD DoublePrecision;
+    typedef vComplexD DoublePrecision2;
   };
   template<> struct GridTypeMapper<vInteger> : public GridTypeMapper_Base {
     typedef  Integer scalar_type;
@@ -216,6 +232,7 @@ NAMESPACE_BEGIN(Grid);
     typedef void Complexified;
     typedef void Realified;
     typedef void DoublePrecision;
+    typedef void DoublePrecision2;
   };
 
 #define GridTypeMapper_RepeatedTypes \
@@ -234,6 +251,7 @@ NAMESPACE_BEGIN(Grid);
     using Complexified    = iScalar<typename BaseTraits::Complexified>;
     using Realified       = iScalar<typename BaseTraits::Realified>;
     using DoublePrecision = iScalar<typename BaseTraits::DoublePrecision>;
+    using DoublePrecision2= iScalar<typename BaseTraits::DoublePrecision2>;
     static constexpr int Rank = BaseTraits::Rank + 1;
     static constexpr std::size_t count = BaseTraits::count;
     static constexpr int Dimension(int dim) {
@@ -248,6 +266,7 @@ NAMESPACE_BEGIN(Grid);
     using Complexified    = iVector<typename BaseTraits::Complexified,    N>;
     using Realified       = iVector<typename BaseTraits::Realified,       N>;
     using DoublePrecision = iVector<typename BaseTraits::DoublePrecision, N>;
+    using DoublePrecision2= iVector<typename BaseTraits::DoublePrecision2, N>;
     static constexpr int Rank = BaseTraits::Rank + 1;
     static constexpr std::size_t count = BaseTraits::count * N;
     static constexpr int Dimension(int dim) {
@@ -262,6 +281,7 @@ NAMESPACE_BEGIN(Grid);
     using Complexified    = iMatrix<typename BaseTraits::Complexified,    N>;
     using Realified       = iMatrix<typename BaseTraits::Realified,       N>;
     using DoublePrecision = iMatrix<typename BaseTraits::DoublePrecision, N>;
+    using DoublePrecision2= iMatrix<typename BaseTraits::DoublePrecision2, N>;
     static constexpr int Rank = BaseTraits::Rank + 2;
     static constexpr std::size_t count = BaseTraits::count * N * N;
     static constexpr int Dimension(int dim) {

From f1fe444d4fa205896f1d3c3b37c44567086a3b96 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Fri, 24 Apr 2020 06:27:20 -0400
Subject: [PATCH 09/51] blocked precision promotion infrastructure upgrade

---
 Grid/tensors/Tensor_inner.h     |  1 +
 Grid/tensors/Tensor_traits.h    | 24 +++++++++++++++++++++---
 tests/Test_innerproduct_norm.cc |  4 ++--
 3 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/Grid/tensors/Tensor_inner.h b/Grid/tensors/Tensor_inner.h
index c052adcf..fd651cae 100644
--- a/Grid/tensors/Tensor_inner.h
+++ b/Grid/tensors/Tensor_inner.h
@@ -200,6 +200,7 @@ auto innerProductD (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decl
 //////////////////////////////////////
 // innerProductD2: precision promotion without inner sum
 //////////////////////////////////////
+
 accelerator_inline vComplexD2 TensorRemove(const vComplexD2 & x) { return x; };
 accelerator_inline vRealD2 TensorRemove(const vRealD2 & x) { return x; };
 
diff --git a/Grid/tensors/Tensor_traits.h b/Grid/tensors/Tensor_traits.h
index afb1f916..5359c547 100644
--- a/Grid/tensors/Tensor_traits.h
+++ b/Grid/tensors/Tensor_traits.h
@@ -38,10 +38,28 @@ NAMESPACE_BEGIN(Grid);
   template<class T, int N> struct isGridTensor<iVector<T, N>> : public std::true_type  { static constexpr bool notvalue = false; };
   template<class T, int N> struct isGridTensor<iMatrix<T, N>> : public std::true_type  { static constexpr bool notvalue = false; };
 
-  // To store double-precision data in single-precision grids for precision promoted localInnerProductD
+  // Trait: isGridScalar<X>::value is true only when X is an iScalar<T>; notvalue is always the negation of value
+  template<typename T>     struct isGridScalar                : public std::false_type { static constexpr bool notvalue = true; };
+  template<class T>        struct isGridScalar<iScalar<T>>    : public std::true_type  { static constexpr bool notvalue = false; };
+
+  // Store double-precision data in single-precision grids for precision promoted localInnerProductD
+  typedef iVector<ComplexD,2> ComplexD2; // two-component ComplexD vector; GridTypeMapper uses it as a DoublePrecision2 type
   typedef iVector<vComplexD,2> vComplexD2;
+  typedef iVector<RealD,2> RealD2; // two-component RealD vector; GridTypeMapper uses it as a DoublePrecision2 type
   typedef iVector<vRealD,2> vRealD2;
 
+  // Trait: value is true for exactly the eight types specialised below (v-prefixed and plain, real and complex, F and D); false for anything else
+  template<typename T>     struct isGridFundamental                : public std::false_type { static constexpr bool notvalue = true; };
+  template<>               struct isGridFundamental<vComplexF>     : public std::true_type  { static constexpr bool notvalue = false; };
+  template<>               struct isGridFundamental<vComplexD>     : public std::true_type  { static constexpr bool notvalue = false; };
+  template<>               struct isGridFundamental<vRealF>        : public std::true_type  { static constexpr bool notvalue = false; };
+  template<>               struct isGridFundamental<vRealD>        : public std::true_type  { static constexpr bool notvalue = false; };
+  template<>               struct isGridFundamental<ComplexF>      : public std::true_type  { static constexpr bool notvalue = false; };
+  template<>               struct isGridFundamental<ComplexD>      : public std::true_type  { static constexpr bool notvalue = false; };
+  template<>               struct isGridFundamental<RealF>         : public std::true_type  { static constexpr bool notvalue = false; };
+  template<>               struct isGridFundamental<RealD>         : public std::true_type  { static constexpr bool notvalue = false; };
+
+
 //////////////////////////////////////////////////////////////////////////////////
 // Want to recurse: GridTypeMapper<Matrix<vComplexD> >::scalar_type == ComplexD.
 // Use of a helper class like this allows us to template specialise and "dress"
@@ -86,7 +104,7 @@ NAMESPACE_BEGIN(Grid);
     typedef ComplexF Complexified;
     typedef RealF Realified;
     typedef RealD DoublePrecision;
-    typedef RealD DoublePrecision2;
+    typedef RealD2 DoublePrecision2;
   };
   template<> struct GridTypeMapper<RealD> : public GridTypeMapper_Base {
     typedef RealD scalar_type;
@@ -112,7 +130,7 @@ NAMESPACE_BEGIN(Grid);
     typedef ComplexF Complexified;
     typedef RealF Realified;
     typedef ComplexD DoublePrecision;
-    typedef ComplexD DoublePrecision2;
+    typedef ComplexD2 DoublePrecision2;
   };
   template<> struct GridTypeMapper<ComplexD> : public GridTypeMapper_Base {
     typedef ComplexD scalar_type;
diff --git a/tests/Test_innerproduct_norm.cc b/tests/Test_innerproduct_norm.cc
index 85c98521..a8718c6b 100644
--- a/tests/Test_innerproduct_norm.cc
+++ b/tests/Test_innerproduct_norm.cc
@@ -72,7 +72,7 @@ int main(int argc, char** argv) {
 
     sw_res.Reset();
     sw_res.Start();
-    for(int i = 0; i < nIter; ++i) { innerProduct_norm(ip_d_res, norm2_d_res, x_d, y_d); }
+    for(int i = 0; i < nIter; ++i) { innerProductNorm(ip_d_res, norm2_d_res, x_d, y_d); }
     sw_res.Stop();
 
     diff_ip_d    = ip_d_ref - ip_d_res;
@@ -104,7 +104,7 @@ int main(int argc, char** argv) {
 
     sw_res.Reset();
     sw_res.Start();
-    for(int i = 0; i < nIter; ++i) { innerProduct_norm(ip_f_res, norm2_f_res, x_f, y_f); }
+    for(int i = 0; i < nIter; ++i) { innerProductNorm(ip_f_res, norm2_f_res, x_f, y_f); }
     sw_res.Stop();
 
     diff_ip_f    = ip_f_ref - ip_f_res;

From f4033ad8cb32c34debe1623d84eab7c0d79116d5 Mon Sep 17 00:00:00 2001
From: ferben <ferben@debian.felix.com>
Date: Mon, 27 Apr 2020 17:46:14 +0100
Subject: [PATCH 10/51] baryon speedup by a factor 2

---
 Grid/qcd/utils/BaryonUtils.h | 416 ++++++++++++++++++++++++++++++++++-
 1 file changed, 407 insertions(+), 9 deletions(-)

diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h
index d65b9176..18d6f84b 100644
--- a/Grid/qcd/utils/BaryonUtils.h
+++ b/Grid/qcd/utils/BaryonUtils.h
@@ -46,7 +46,8 @@ public:
   typedef typename SpinMatrixField::vector_object sobj;
 
   static const int epsilon[6][3] ;
-  static const Complex epsilon_sgn[6];
+  //previous declaration, kept for reference: static const Complex epsilon_sgn[6];
+  static const double epsilon_sgn[6]; // one sign per row of epsilon[]; the values are +/-1.0, so a real type holds them exactly
 
   private: 
   template <class mobj, class robj>
@@ -60,6 +61,62 @@ public:
 				 const int parity,
 				 const int * wick_contractions,
   				 robj &result);
+  template <class mobj, class robj, int w0, int w1, int w2, int w3, int w4, int w5> // w0..w5: compile-time flags, one per Wick contraction term
+  static void baryon_site_macro(const mobj &D1, // single-site worker: every enabled term is accumulated inside one shared loop nest
+				 const mobj &D2,
+				 const mobj &D3,
+				 const Gamma GammaA_left,
+				 const Gamma GammaB_left,
+				 const Gamma GammaA_right,
+				 const Gamma GammaB_right,
+				 const int parity,
+  				 robj &result); // result()()() is added to / subtracted from, never reset
+  template <class mobj, class robj>
+  static void baryon_site_macro(const mobj &D1, // runtime front end: picks the <w0..w5> instantiation matching wick_contractions[0..5]
+				 const mobj &D2,
+				 const mobj &D3,
+				 const Gamma GammaA_left,
+				 const Gamma GammaB_left,
+				 const Gamma GammaA_right,
+				 const Gamma GammaB_right,
+				 const int parity,
+				 const int * wick_contractions, // six entries are read, each compared against 0 and 1
+  				 robj &result);
+  template <class mobj, class robj>
+  static inline void baryon_site_template(unsigned int mask, const mobj &D1, // runtime-mask entry point; forwards to BaryonSiteHelper for dispatch
+						 const mobj &D2,
+						 const mobj &D3,
+				                 const Gamma GammaA_left,
+				                 const Gamma GammaB_left,
+				                 const Gamma GammaA_right,
+		                 		 const Gamma GammaB_right,
+						 const int parity,
+						 robj &result);
+  template <unsigned int mask, class mobj, class robj> // mask: six flag bits, bit 5 enables contraction 0 ... bit 0 enables contraction 5
+  static inline void baryon_site_template(const mobj &D1, // single-site worker specialised on the compile-time mask
+						 const mobj &D2,
+						 const mobj &D3,
+				                 const Gamma GammaA_left,
+				                 const Gamma GammaB_left,
+				                 const Gamma GammaA_right,
+		                 		 const Gamma GammaB_right,
+						 const int parity,
+						 robj &result);
+						 
+  template <unsigned int maxMask> // maxMask: the mask value this instantiation tests for
+  struct BaryonSiteHelper // recursive helper that turns a runtime mask into the compile-time argument of baryon_site_template<mask>
+  {
+  template <class mobj, class robj>
+    static inline void function(const unsigned int mask, const mobj &D1, // handles mask == maxMask itself, otherwise defers to the next lower maxMask
+						 const mobj &D2,
+						 const mobj &D3,
+				                 const Gamma GammaA_left,
+				                 const Gamma GammaB_left,
+				                 const Gamma GammaA_right,
+		                 		 const Gamma GammaB_right,
+						 const int parity,
+						 robj &result);
+						 };
   public:
   static void ContractBaryons(const PropagatorField &q1_left,
 				 const PropagatorField &q2_left,
@@ -151,14 +208,18 @@ public:
 
 template <class FImpl> 
 const int BaryonUtils<FImpl>::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
-template <class FImpl> 
+/*template <class FImpl> 
 const Complex BaryonUtils<FImpl>::epsilon_sgn[6] = {Complex(1),
 						    Complex(1),
 						    Complex(1),
 						    Complex(-1),
 						    Complex(-1),
 						    Complex(-1)};
+*/
+template <class FImpl> 
+const double BaryonUtils<FImpl>::epsilon_sgn[6] = {1.0,1.0,1.0,-1.0,-1.0,-1.0}; // +1 for the three cyclic rows of epsilon[], -1 for the three anti-cyclic rows
 
+// Original baryon_site implementation: each enabled Wick contraction runs its own separate spin-loop nest.
 template <class FImpl>
 template <class mobj, class robj>
 void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
@@ -188,13 +249,15 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
         int a_right = epsilon[ie_right][0]; //a'
         int b_right = epsilon[ie_right][1]; //b'
         int c_right = epsilon[ie_right][2]; //c'
+	//complex<double> ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right];
+	double ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right];
         //This is the \delta_{456}^{123} part
 	if (wick_contraction[0]){
           auto D2g = D2 * GammaB_left;
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() += epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,gamma_left)(c_right,c_left)*D2g()(alpha_right,beta_left)(a_right,a_left)*gD3()(alpha_right,beta_left)(b_right,b_left);
+	    result()()() += ee * pD1()(gamma_left,gamma_left)(c_right,c_left)*D2g()(alpha_right,beta_left)(a_right,a_left)*gD3()(alpha_right,beta_left)(b_right,b_left);
           }}}
   	}	  
         //This is the \delta_{456}^{231} part
@@ -203,7 +266,7 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() += epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1g()(gamma_left,beta_left)(c_right,a_left)*D2()(alpha_right,beta_left)(a_right,b_left)*gD3()(alpha_right,gamma_left)(b_right,c_left);
+	    result()()() += ee * pD1g()(gamma_left,beta_left)(c_right,a_left)*D2()(alpha_right,beta_left)(a_right,b_left)*gD3()(alpha_right,gamma_left)(b_right,c_left);
           }}}
         }	  
         //This is the \delta_{456}^{312} part
@@ -212,7 +275,7 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() += epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,beta_left)(c_right,b_left)*D2()(alpha_right,gamma_left)(a_right,c_left)*gD3g()(alpha_right,beta_left)(b_right,a_left);
+	    result()()() += ee * pD1()(gamma_left,beta_left)(c_right,b_left)*D2()(alpha_right,gamma_left)(a_right,c_left)*gD3g()(alpha_right,beta_left)(b_right,a_left);
           }}}
         }	  
         //This is the \delta_{456}^{132} part
@@ -221,7 +284,7 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() -= epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,gamma_left)(c_right,c_left)*D2()(alpha_right,beta_left)(a_right,b_left)*gD3g()(alpha_right,beta_left)(b_right,a_left);
+	    result()()() -= ee * pD1()(gamma_left,gamma_left)(c_right,c_left)*D2()(alpha_right,beta_left)(a_right,b_left)*gD3g()(alpha_right,beta_left)(b_right,a_left);
           }}}
         }	  
         //This is the \delta_{456}^{321} part
@@ -230,7 +293,7 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() -= epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,beta_left)(c_right,b_left)*D2g()(alpha_right,beta_left)(a_right,a_left)*gD3()(alpha_right,gamma_left)(b_right,c_left);
+	    result()()() -= ee * pD1()(gamma_left,beta_left)(c_right,b_left)*D2g()(alpha_right,beta_left)(a_right,a_left)*gD3()(alpha_right,gamma_left)(b_right,c_left);
           }}}
         }	  
         //This is the \delta_{456}^{213} part
@@ -239,13 +302,284 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() -= epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1g()(gamma_left,beta_left)(c_right,a_left)*D2()(alpha_right,gamma_left)(a_right,c_left)*gD3()(alpha_right,beta_left)(b_right,b_left);
+	    result()()() -= ee * pD1g()(gamma_left,beta_left)(c_right,a_left)*D2()(alpha_right,gamma_left)(a_right,c_left)*gD3()(alpha_right,beta_left)(b_right,b_left);
           }}}
         }	  
       }
     }
 }
 
+template <class FImpl> // Single-site baryon contraction: all six Wick terms share one loop nest and are switched by template flags
+template <class mobj, class robj, int w0, int w1, int w2, int w3, int w4, int w5> // wN != 0 enables term N (terms 0-2 are added, 3-5 subtracted)
+void BaryonUtils<FImpl>::baryon_site_macro(const mobj &D1,
+						 const mobj &D2,
+						 const mobj &D3,
+				                 const Gamma GammaA_left,
+				                 const Gamma GammaB_left,
+				                 const Gamma GammaA_right,
+		                 		 const Gamma GammaB_right,
+						 const int parity,
+						 robj &result) // result()()() is accumulated into; it is not reset here
+{
+
+  Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4)
+
+    auto gD1a = GammaA_left * GammaA_right * D1;
+    auto gD1b = GammaA_left * g4 * GammaA_right * D1;
+    auto pD1 = 0.5* (gD1a + (double)parity * gD1b); // parity-projected D1
+    auto gD3 = GammaB_right * D3;
+
+    auto D2g = D2 * GammaB_left; // the three *g products are formed once here, outside every loop
+    auto pD1g = pD1 * GammaB_left;
+    auto gD3g = gD3 * GammaB_left;
+
+    for (int ie_left=0; ie_left < 6 ; ie_left++){ // all six rows of epsilon[] on the left ...
+      int a_left = epsilon[ie_left][0]; //a
+      int b_left = epsilon[ie_left][1]; //b
+      int c_left = epsilon[ie_left][2]; //c
+      for (int ie_right=0; ie_right < 6 ; ie_right++){ // ... combined with all six on the right
+        int a_right = epsilon[ie_right][0]; //a'
+        int b_right = epsilon[ie_right][1]; //b'
+        int c_right = epsilon[ie_right][2]; //c'
+	double ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right]; // product of the two permutation signs
+        //All six terms share this loop nest; each operand is read once per index combination and reused by every enabled term
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
+            auto eepD1 = ee * pD1()(gamma_left,gamma_left)(c_right,c_left); // depends on gamma_left only, so hoisted out of the inner loops
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+            auto gD3_ag = gD3()(alpha_right,gamma_left)(b_right,c_left); // independent of beta_left
+	    auto D2_ag = D2()(alpha_right,gamma_left)(a_right,c_left); // independent of beta_left
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+            auto eepD1g_gb = ee * pD1g()(gamma_left,beta_left)(c_right,a_left);
+            auto eepD1_gb = ee * pD1()(gamma_left,beta_left)(c_right,b_left);
+	    auto D2g_ab = D2g()(alpha_right,beta_left)(a_right,a_left);
+	    auto D2_ab = D2()(alpha_right,beta_left)(a_right,b_left);
+	    auto gD3_ab = gD3()(alpha_right,beta_left)(b_right,b_left);
+	    auto gD3g_ab = gD3g()(alpha_right,beta_left)(b_right,a_left);
+	    if(w0){ // the \delta_{456}^{123} part
+	        result()()() += eepD1*D2g_ab*gD3_ab;
+	    }
+  	    if(w1){ // the \delta_{456}^{231} part
+		result()()() += eepD1g_gb*D2_ab*gD3_ag;
+	    }
+	    if(w2){ // the \delta_{456}^{312} part
+		result()()() += eepD1_gb*D2_ag*gD3g_ab;
+	    }
+	    if(w3){ // the \delta_{456}^{132} part
+    		result()()() -= eepD1*D2_ab*gD3g_ab;
+	    }
+    	    if(w4){ // the \delta_{456}^{321} part
+		result()()() -= eepD1_gb*D2g_ab*gD3_ag;
+	    }
+            if(w5){ // the \delta_{456}^{213} part
+    	        result()()() -= eepD1g_gb*D2_ag*gD3_ab;
+            }
+  	  }}}
+      }
+    }
+}
+
+#define BARYON_SITE(w0, w1, w2, w3, w4, w5, D1, D2, D3, GA_l, GB_l, GA_r, GB_r, parity, wick_contraction, result) /* runs the <w0..w5> specialisation only when wick_contraction[0..5] equal w0..w5 exactly; mobj and robj must be visible at the expansion site */ \
+	if((wick_contraction[0] == w0) && (wick_contraction[1] == w1) &&  (wick_contraction[2] == w2) &&  (wick_contraction[3] == w3) &&  (wick_contraction[4] == w4) &&  (wick_contraction[5] == w5)) \
+{\
+ baryon_site_macro<mobj, robj, w0, w1, w2, w3, w4, w5>( D1, D2, D3, GA_l, GB_l, GA_r, GB_r, parity, result );\
+}
+
+template <class FImpl> // Runtime front end: maps the six wick_contraction flags onto the matching baryon_site_macro<w0..w5> instantiation
+template <class mobj, class robj>
+void BaryonUtils<FImpl>::baryon_site_macro(const mobj &D1,
+						 const mobj &D2,
+						 const mobj &D3,
+				                 const Gamma GammaA_left,
+				                 const Gamma GammaB_left,
+				                 const Gamma GammaA_right,
+		                 		 const Gamma GammaB_right,
+						 const int parity,
+						 const int * wick_contraction, // NOTE(review): an entry other than 0 or 1 matches no line below and leaves result untouched; baryon_site tests the entries for truthiness instead
+						 robj &result)
+{ // one BARYON_SITE line per 0/1 combination (64 in total); they are independent ifs, so every comparison runs on each call
+BARYON_SITE( 0 , 0 , 0 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 0 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 0 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 0 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 0 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 0 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 0 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 0 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 1 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 1 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 1 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 1 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 1 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 1 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 1 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 1 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 0 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 0 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 0 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 0 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 0 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 0 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 0 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 0 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 1 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 1 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 1 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 1 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 1 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 1 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 1 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 1 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 0 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 0 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 0 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 0 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 0 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 0 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 0 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 0 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 1 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 1 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 1 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 1 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 1 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 1 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 1 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 1 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 0 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 0 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 0 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 0 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 0 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 0 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 0 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 0 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 1 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 1 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 1 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 1 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 1 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 1 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 1 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 1 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+ 
+}
+
+
+template <class FImpl> // Single-site baryon contraction specialised on a compile-time mask; all six Wick terms share one loop nest
+template <unsigned int mask, class mobj, class robj> // mask: six flag bits, decoded just below
+inline void BaryonUtils<FImpl>::baryon_site_template(const mobj &D1,
+						 const mobj &D2,
+						 const mobj &D3,
+				                 const Gamma GammaA_left,
+				                 const Gamma GammaB_left,
+				                 const Gamma GammaA_right,
+		                 		 const Gamma GammaB_right,
+						 const int parity,
+						 robj &result) // result()()() is accumulated into; it is not reset here
+{
+    constexpr bool wick_contraction_0 = ((mask & (1 << 5)) >> 5); // bit 5 (the highest of the six) switches contraction 0 ...
+    constexpr bool wick_contraction_1 = ((mask & (1 << 4)) >> 4);
+    constexpr bool wick_contraction_2 = ((mask & (1 << 3)) >> 3);
+    constexpr bool wick_contraction_3 = ((mask & (1 << 2)) >> 2);
+    constexpr bool wick_contraction_4 = ((mask & (1 << 1)) >> 1);
+    constexpr bool wick_contraction_5 = ((mask & (1 << 0)) >> 0); // ... down to bit 0, which switches contraction 5
+
+  Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4)
+
+    auto gD1a = GammaA_left * GammaA_right * D1;
+    auto gD1b = GammaA_left * g4 * GammaA_right * D1;
+    auto pD1 = 0.5* (gD1a + (double)parity * gD1b); // parity-projected D1
+    auto gD3 = GammaB_right * D3;
+
+    auto D2g = D2 * GammaB_left; // the three *g products are formed once here, outside every loop
+    auto pD1g = pD1 * GammaB_left;
+    auto gD3g = gD3 * GammaB_left;
+
+    for (int ie_left=0; ie_left < 6 ; ie_left++){ // all six rows of epsilon[] on the left ...
+      int a_left = epsilon[ie_left][0]; //a
+      int b_left = epsilon[ie_left][1]; //b
+      int c_left = epsilon[ie_left][2]; //c
+      for (int ie_right=0; ie_right < 6 ; ie_right++){ // ... combined with all six on the right
+        int a_right = epsilon[ie_right][0]; //a'
+        int b_right = epsilon[ie_right][1]; //b'
+        int c_right = epsilon[ie_right][2]; //c'
+	double ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right]; // product of the two permutation signs
+        //All six terms share this loop nest; each operand is read once per index combination and reused by every enabled term
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
+            auto eepD1 = ee * pD1()(gamma_left,gamma_left)(c_right,c_left); // depends on gamma_left only, so hoisted out of the inner loops
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+            auto gD3_ag = gD3()(alpha_right,gamma_left)(b_right,c_left); // independent of beta_left
+	    auto D2_ag = D2()(alpha_right,gamma_left)(a_right,c_left); // independent of beta_left
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+            auto eepD1g_gb = ee * pD1g()(gamma_left,beta_left)(c_right,a_left);
+            auto eepD1_gb = ee * pD1()(gamma_left,beta_left)(c_right,b_left);
+	    auto D2g_ab = D2g()(alpha_right,beta_left)(a_right,a_left);
+	    auto D2_ab = D2()(alpha_right,beta_left)(a_right,b_left);
+	    auto gD3_ab = gD3()(alpha_right,beta_left)(b_right,b_left);
+	    auto gD3g_ab = gD3g()(alpha_right,beta_left)(b_right,a_left);
+	    if(wick_contraction_0){ // the \delta_{456}^{123} part
+	      result()()() += eepD1*D2g_ab*gD3_ab;
+	    }
+  	    if(wick_contraction_1){ // the \delta_{456}^{231} part
+		    result()()() += eepD1g_gb*D2_ab*gD3_ag;
+	    }
+	    if(wick_contraction_2){ // the \delta_{456}^{312} part
+		    result()()() += eepD1_gb*D2_ag*gD3g_ab;
+	    }
+            if(wick_contraction_3){ // the \delta_{456}^{132} part
+    		result()()() -= eepD1*D2_ab*gD3g_ab;
+	    }
+    	    if(wick_contraction_4){ // the \delta_{456}^{321} part
+		result()()() -= eepD1_gb*D2g_ab*gD3_ag;
+	    }
+            if(wick_contraction_5){ // the \delta_{456}^{213} part
+    	        result()()() -= eepD1g_gb*D2_ag*gD3_ab;
+            }
+  	  }}}
+      }
+    }
+}
+
+// Dispatches a runtime contraction mask to the matching compile-time baryon_site_template<mask>():
+// maxMask is counted down one step per call until it equals mask. A mask larger than the starting
+// maxMask matches nothing; that used to re-enter BaryonSiteHelper<0> without end and now asserts.
+template <class FImpl>
+template <unsigned int maxMask>
+template <class mobj, class robj>
+inline void BaryonUtils<FImpl>::BaryonSiteHelper<maxMask>::function(const unsigned int mask,
+		const mobj &D1, const mobj &D2, const mobj &D3,
+		const Gamma GammaA_left, const Gamma GammaB_left,
+		const Gamma GammaA_right, const Gamma GammaB_right,
+		const int parity, robj &result)
+{
+    if (mask == maxMask) {
+        baryon_site_template<maxMask, mobj, robj>(D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result);
+    } else if (maxMask > 0) {
+        // The ternary keeps the maxMask == 0 instantiation from naming BaryonSiteHelper<0u - 1>.
+        BaryonSiteHelper<(maxMask>0) ? maxMask-1 : 0>::function(mask, D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result);
+    } else {
+        // Every candidate down to 0 was rejected, so mask is out of range: stop here instead of recursing forever.
+        assert(0 && "BaryonSiteHelper: contraction mask exceeds maxMask");
+    }
+}
+
+// Top-level entry point: takes the contraction mask at run time and hands it to BaryonSiteHelper<63> for dispatch
+template <class FImpl>
+template <class mobj, class robj>
+inline void BaryonUtils<FImpl>::baryon_site_template(const unsigned int mask, const mobj &D1,
+						 const mobj &D2,
+						 const mobj &D3,
+				                 const Gamma GammaA_left,
+				                 const Gamma GammaB_left,
+				                 const Gamma GammaA_right,
+		                 		 const Gamma GammaB_right,
+						 const int parity,
+						 robj &result)
+{
+    BaryonSiteHelper<63>::function(mask, D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result); // 63 = 0b111111, all six flag bits set; NOTE(review): a mask above 63 has no matching specialisation
+}
+
+
 template<class FImpl>
 void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
 						 const PropagatorField &q2_left,
@@ -259,6 +593,10 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
 						 const int parity,
 						 ComplexField &baryon_corr)
 {
+    const std::chrono::system_clock::time_point start{ std::chrono::system_clock::now() };
+    std::time_t now = std::chrono::system_clock::to_time_t( start );
+    std::cout << "Setup start " << std::ctime( &now );
+
   std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
     std::cout << "GammaA (left) " << (GammaA_left.g) <<  std::endl;
     std::cout << "GammaB (left) " << (GammaB_left.g) <<  std::endl;
@@ -278,6 +616,16 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
   auto v2 = q2_left.View();
   auto v3 = q3_left.View();
 
+    const std::chrono::system_clock::time_point stop{ std::chrono::system_clock::now() };
+    now = std::chrono::system_clock::to_time_t( stop );
+    const std::chrono::duration<double> duration_seconds = stop - start;
+    const double seconds{ ( duration_seconds.count() ) };
+    std::cout << "Setup stop " << std::ctime( &now )
+              << "Total duration " << std::fixed << std::setprecision(5) << seconds << " seconds." << std::endl;
+
+    const std::chrono::system_clock::time_point start2{ std::chrono::system_clock::now() };
+    now = std::chrono::system_clock::to_time_t( start2 );
+    std::cout << "Normal Loop start " << std::ctime( &now );
  // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
   thread_for(ss,grid->oSites(),{
   //for(int ss=0; ss < grid->oSites(); ss++){
@@ -290,6 +638,56 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
     baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
     vbaryon_corr[ss] = result; 
   }  );//end loop over lattice sites
+    const std::chrono::system_clock::time_point stop2{ std::chrono::system_clock::now() };
+    now = std::chrono::system_clock::to_time_t( stop2 );
+    const std::chrono::duration<double> duration_seconds2 = stop2 - start2;
+    const double seconds2{ ( duration_seconds2.count() ) };
+    std::cout << "Normal Loop stop " << std::ctime( &now )
+              << "Total duration " << std::fixed << std::setprecision(5) << seconds2 << " seconds." << std::endl;
+	      const std::chrono::system_clock::time_point start4{ std::chrono::system_clock::now() };
+    now = std::chrono::system_clock::to_time_t( start4 );
+    std::cout << "Opt-macro Loop start " << std::ctime( &now );
+ // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
+  thread_for(ss,grid->oSites(),{
+  //for(int ss=0; ss < grid->oSites(); ss++){
+
+    auto D1 = v1[ss];
+    auto D2 = v2[ss];
+    auto D3 = v3[ss];
+
+    vobj result=Zero();
+    baryon_site_macro(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
+    vbaryon_corr[ss] = result; 
+  }  );//end loop over lattice sites
+    const std::chrono::system_clock::time_point stop4{ std::chrono::system_clock::now() };
+    now = std::chrono::system_clock::to_time_t( stop4 );
+    const std::chrono::duration<double> duration_seconds4 = stop4 - start4;
+    const double seconds4{ ( duration_seconds4.count() ) };
+    std::cout << "Opt-macro Loop stop " << std::ctime( &now )
+              << "Total duration " << std::fixed << std::setprecision(5) << seconds4 << " seconds." << std::endl; 
+    const std::chrono::system_clock::time_point start3{ std::chrono::system_clock::now() };
+    now = std::chrono::system_clock::to_time_t( start3 );
+    int wick_id=32*wick_contraction[0]+16*wick_contraction[1]+8*wick_contraction[2]+4*wick_contraction[3]+2*wick_contraction[4]+wick_contraction[5];
+    std::cout << "Opt-template Loop start " << std::ctime( &now );
+ // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
+  thread_for(ss,grid->oSites(),{
+  //for(int ss=0; ss < grid->oSites(); ss++){
+
+    auto D1 = v1[ss];
+    auto D2 = v2[ss];
+    auto D3 = v3[ss];
+
+    vobj result=Zero();
+    baryon_site_template(wick_id,D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,result);
+    vbaryon_corr[ss] = result; 
+  }  );//end loop over lattice sites
+    const std::chrono::system_clock::time_point stop3{ std::chrono::system_clock::now() };
+    now = std::chrono::system_clock::to_time_t( stop3 );
+    const std::chrono::duration<double> duration_seconds3 = stop3 - start3;
+    const double seconds3{ ( duration_seconds3.count() ) };
+    std::cout << "Opt-template Loop stop " << std::ctime( &now )
+              << "Total duration " << std::fixed << std::setprecision(5) << seconds3 << " seconds." << std::endl;
+
 }
 template <class FImpl>
 template <class mobj, class robj>
@@ -318,7 +716,7 @@ void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
     wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0;
 
      result=Zero();
-     baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
+     baryon_site<decltype(D1),decltype(result)>(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
 }
 
 /***********************************************************************

From 6240e02619be0b0c5bda84173c0dc6cba62aef84 Mon Sep 17 00:00:00 2001
From: ferben <ferben@debian.felix.com>
Date: Mon, 27 Apr 2020 18:50:53 +0100
Subject: [PATCH 11/51] added assertion to avoid potential infinite loop

---
 Grid/qcd/utils/BaryonUtils.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h
index 18d6f84b..a392f223 100644
--- a/Grid/qcd/utils/BaryonUtils.h
+++ b/Grid/qcd/utils/BaryonUtils.h
@@ -553,6 +553,7 @@ inline void BaryonUtils<FImpl>::BaryonSiteHelper<maxMask>::function(const unsign
 						 const int parity,
 						 robj &result)
 {
+    assert(mask <= maxMask);
     if (mask == maxMask)
     {
         baryon_site_template<maxMask,decltype(D1),decltype(result)>(D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result);

From dee96cbf8296b9f16dc378c78cdcb74302da77c5 Mon Sep 17 00:00:00 2001
From: Christopher Kelly <giltirn@gmail.com>
Date: Wed, 29 Apr 2020 10:37:11 -0400
Subject: [PATCH 12/51] Added workaround in configure to still catch Cuda
 compiler when nvcc with extra arguments (eg -ccbin) is used as CXX

---
 configure.ac | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/configure.ac b/configure.ac
index fb69ca0e..46559507 100644
--- a/configure.ac
+++ b/configure.ac
@@ -274,12 +274,20 @@ case ${ac_gen_scalar} in
 esac
 
 ##################### Compiler dependent choices
-case ${CXX} in 
+
+#Strip any optional compiler arguments from nvcc call (eg -ccbin) for compiler comparison
+CXXBASE=${CXX}
+CXXTEST=${CXX}
+if echo "${CXX}" | grep -q "nvcc"; then
+  CXXTEST="nvcc"
+fi   
+
+case ${CXXTEST} in 
   nvcc) 
 #    CXX="nvcc -keep -v -x cu "
 #    CXXLD="nvcc -v -link"
-    CXX="nvcc -x cu "
-    CXXLD="nvcc -link"
+    CXX="${CXXBASE} -x cu "
+    CXXLD="${CXXBASE} -link"
 #    CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing -Xcompiler -Wno-unusable-partial-specialization --expt-extended-lambda --expt-relaxed-constexpr"
     CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
     if test $ac_openmp = yes; then

From dbaeefaeef334eabd3feeb73c70da13c9a548bcf Mon Sep 17 00:00:00 2001
From: Michael Marshall <43034299+mmphys@users.noreply.github.com>
Date: Thu, 30 Apr 2020 15:02:51 +0100
Subject: [PATCH 13/51] All Eigen::TensorMap objects are fixed (i.e. cannot be
 dynamically resized)

---
 Grid/serialisation/BaseIO.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/Grid/serialisation/BaseIO.h b/Grid/serialisation/BaseIO.h
index bf424fc7..49406201 100644
--- a/Grid/serialisation/BaseIO.h
+++ b/Grid/serialisation/BaseIO.h
@@ -87,11 +87,7 @@ namespace Grid {
     template<typename Scalar_, typename Dimensions_, int Options_, typename IndexType>
     struct is_tensor_fixed<Eigen::TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType>>
         : public std::true_type {};
-    template<typename Scalar_, typename Dimensions_, int Options_, typename IndexType,
-              int MapOptions_, template <class> class MapPointer_>
-    struct is_tensor_fixed<Eigen::TensorMap<Eigen::TensorFixedSize<Scalar_, Dimensions_,
-                                            Options_, IndexType>, MapOptions_, MapPointer_>>
-        : public std::true_type {};
+    template<typename T, int MapOptions_, template <class> class MapPointer_> struct is_tensor_fixed<Eigen::TensorMap<T, MapOptions_, MapPointer_>> : public std::true_type {}; // any TensorMap is fixed-size, whatever its map options / pointer policy
 
     // Is this a variable-size Eigen tensor
     template<typename T, typename V = void> struct is_tensor_variable : public std::false_type {};

From 4a4b9e305d39dc4be0d1497eab81d432a831dbf4 Mon Sep 17 00:00:00 2001
From: Michael Marshall <43034299+mmphys@users.noreply.github.com>
Date: Thu, 30 Apr 2020 19:40:04 +0100
Subject: [PATCH 14/51] Fix: strToVec enters infinite loop and exhausts memory
 if operator>> fails before the end of string, e.g. if parsing "0_0_0" for
 momentum instead of "0 0 0".

---
 Grid/serialisation/VectorUtils.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/Grid/serialisation/VectorUtils.h b/Grid/serialisation/VectorUtils.h
index a5a73992..dd5ff0b8 100644
--- a/Grid/serialisation/VectorUtils.h
+++ b/Grid/serialisation/VectorUtils.h
@@ -432,12 +432,10 @@ namespace Grid {
   std::vector<T> strToVec(const std::string s)
   {
     std::istringstream sstr(s);
-    T                  buf;
     std::vector<T>     v;
     
-    while(!sstr.eof())
+    for(T buf; sstr >> buf;)
     {
-      sstr >> buf;
       v.push_back(buf);
     }
     

From ddb192bac73f7d0b2329f8bd43fe4d64e31f90e1 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <lehner@login1.summit.olcf.ornl.gov>
Date: Thu, 30 Apr 2020 16:09:57 -0400
Subject: [PATCH 15/51] re-work double precision promotion for summit

---
 Grid/lattice/Lattice_reduction.h |  2 +-
 Grid/parallelIO/NerscIO.h        |  2 +-
 Grid/tensors/Tensor_traits.h     | 92 ++++++++++++++++++++++++++++++--
 3 files changed, 90 insertions(+), 6 deletions(-)

diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
index d3f5f9ae..e2690cfe 100644
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -234,7 +234,7 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
   accelerator_for( ss, sites, nsimd,{
       auto left_tmp = left_v(ss);
       coalescedWrite(inner_tmp_v[ss],innerProduct(left_tmp,right_v(ss)));
-      coalescedWrite(norm_tmp_v[ss],innerProduct(left_tmp,left_tmp)));
+      coalescedWrite(norm_tmp_v[ss],innerProduct(left_tmp,left_tmp));
   });
 
   tmp[0] = TensorRemove(sumD_gpu(inner_tmp_v,sites));
diff --git a/Grid/parallelIO/NerscIO.h b/Grid/parallelIO/NerscIO.h
index d3b62d1f..5522ba91 100644
--- a/Grid/parallelIO/NerscIO.h
+++ b/Grid/parallelIO/NerscIO.h
@@ -146,7 +146,7 @@ public:
     int ieee32big = (format == std::string("IEEE32BIG"));
     int ieee32    = (format == std::string("IEEE32"));
     int ieee64big = (format == std::string("IEEE64BIG"));
-    int ieee64    = (format == std::string("IEEE64"));
+    int ieee64    = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
 
     uint32_t nersc_csum,scidac_csuma,scidac_csumb;
     // depending on datatype, set up munger;
diff --git a/Grid/tensors/Tensor_traits.h b/Grid/tensors/Tensor_traits.h
index 5359c547..04d7343e 100644
--- a/Grid/tensors/Tensor_traits.h
+++ b/Grid/tensors/Tensor_traits.h
@@ -43,10 +43,38 @@ NAMESPACE_BEGIN(Grid);
   template<class T>        struct isGridScalar<iScalar<T>>    : public std::true_type  { static constexpr bool notvalue = false; };
 
   // Store double-precision data in single-precision grids for precision promoted localInnerProductD
-  typedef iVector<ComplexD,2> ComplexD2;
-  typedef iVector<vComplexD,2> vComplexD2;
-  typedef iVector<RealD,2> RealD2;
-  typedef iVector<vRealD,2> vRealD2;
+  template<typename T> // T is the element type; the pair holds two of them
+  class TypePair {
+  public:
+    T _internal[2];
+    accelerator_inline TypePair<T>& operator=(const Grid::Zero& o) { // zero both elements; accelerator_inline to match add() below
+      _internal[0] = Zero();
+      _internal[1] = Zero();
+      return *this;
+    }
+
+    accelerator_inline TypePair<T> operator+(const TypePair<T>& o) const { // element-wise sum, returned by value
+      TypePair<T> r;
+      r._internal[0] = _internal[0] + o._internal[0];
+      r._internal[1] = _internal[1] + o._internal[1];
+      return r;
+    }
+
+    accelerator_inline TypePair<T>& operator+=(const TypePair<T>& o) { // element-wise in-place sum
+      _internal[0] += o._internal[0];
+      _internal[1] += o._internal[1];
+      return *this;
+    }
+
+    friend accelerator_inline void add(TypePair<T>* ret, const TypePair<T>* a, const TypePair<T>* b) { // pointer form: forwards to add() on each element
+      add(&ret->_internal[0],&a->_internal[0],&b->_internal[0]);
+      add(&ret->_internal[1],&a->_internal[1],&b->_internal[1]);
+    }
+  };
+  typedef TypePair<ComplexD> ComplexD2;
+  typedef TypePair<RealD> RealD2;
+  typedef TypePair<vComplexD> vComplexD2;
+  typedef TypePair<vRealD> vRealD2;
 
   // Traits to identify fundamental data types
   template<typename T>     struct isGridFundamental                : public std::false_type { static constexpr bool notvalue = true; };
@@ -58,6 +86,10 @@ NAMESPACE_BEGIN(Grid);
   template<>               struct isGridFundamental<ComplexD>      : public std::true_type  { static constexpr bool notvalue = false; };
   template<>               struct isGridFundamental<RealF>         : public std::true_type  { static constexpr bool notvalue = false; };
   template<>               struct isGridFundamental<RealD>         : public std::true_type  { static constexpr bool notvalue = false; };
+  template<>               struct isGridFundamental<vComplexD2>    : public std::true_type  { static constexpr bool notvalue = false; };
+  template<>               struct isGridFundamental<vRealD2>       : public std::true_type  { static constexpr bool notvalue = false; };
+  template<>               struct isGridFundamental<ComplexD2>     : public std::true_type  { static constexpr bool notvalue = false; };
+  template<>               struct isGridFundamental<RealD2>        : public std::true_type  { static constexpr bool notvalue = false; };
 
 
 //////////////////////////////////////////////////////////////////////////////////
@@ -119,6 +151,19 @@ NAMESPACE_BEGIN(Grid);
     typedef RealD DoublePrecision;
     typedef RealD DoublePrecision2;
   };
+  template<> struct GridTypeMapper<RealD2> : public GridTypeMapper_Base { // type table for the scalar real pair type
+    typedef RealD2 scalar_type;
+    typedef RealD2 scalar_typeD;
+    typedef RealD2 vector_type;
+    typedef RealD2 vector_typeD;
+    typedef RealD2 tensor_reduced;
+    typedef RealD2 scalar_object;
+    typedef RealD2 scalar_objectD;
+    typedef ComplexD2 Complexified; // the only entry that is not RealD2
+    typedef RealD2 Realified;
+    typedef RealD2 DoublePrecision; // maps to itself
+    typedef RealD2 DoublePrecision2;
+  };
   template<> struct GridTypeMapper<ComplexF> : public GridTypeMapper_Base {
     typedef ComplexF scalar_type;
     typedef ComplexD scalar_typeD;
@@ -145,6 +190,19 @@ NAMESPACE_BEGIN(Grid);
     typedef ComplexD DoublePrecision;
     typedef ComplexD DoublePrecision2;
   };
+  template<> struct GridTypeMapper<ComplexD2> : public GridTypeMapper_Base { // type table for the scalar complex pair type
+    typedef ComplexD2 scalar_type;
+    typedef ComplexD2 scalar_typeD;
+    typedef ComplexD2 vector_type;
+    typedef ComplexD2 vector_typeD;
+    typedef ComplexD2 tensor_reduced;
+    typedef ComplexD2 scalar_object;
+    typedef ComplexD2 scalar_objectD;
+    typedef ComplexD2 Complexified;
+    typedef RealD2 Realified; // the only entry that is not ComplexD2
+    typedef ComplexD2 DoublePrecision; // maps to itself
+    typedef ComplexD2 DoublePrecision2;
+  };
   template<> struct GridTypeMapper<Integer> : public GridTypeMapper_Base {
     typedef Integer scalar_type;
     typedef Integer scalar_typeD;
@@ -185,6 +243,19 @@ NAMESPACE_BEGIN(Grid);
     typedef vRealD DoublePrecision;
     typedef vRealD DoublePrecision2;
   };
+  template<> struct GridTypeMapper<vRealD2> : public GridTypeMapper_Base { // type table for the vectorised real pair type
+    typedef RealD2  scalar_type; // scalar_* entries use the non-vectorised pair RealD2
+    typedef RealD2  scalar_typeD;
+    typedef vRealD2 vector_type;
+    typedef vRealD2 vector_typeD;
+    typedef vRealD2 tensor_reduced;
+    typedef RealD2  scalar_object;
+    typedef RealD2  scalar_objectD;
+    typedef vComplexD2 Complexified;
+    typedef vRealD2 Realified;
+    typedef vRealD2 DoublePrecision; // maps to itself
+    typedef vRealD2 DoublePrecision2;
+  };
   template<> struct GridTypeMapper<vRealH> : public GridTypeMapper_Base {
     // Fixme this is incomplete until Grid supports fp16 or bfp16 arithmetic types
     typedef RealF  scalar_type;
@@ -239,6 +310,19 @@ NAMESPACE_BEGIN(Grid);
     typedef vComplexD DoublePrecision;
     typedef vComplexD DoublePrecision2;
   };
+  template<> struct GridTypeMapper<vComplexD2> : public GridTypeMapper_Base { // type table for the vectorised complex pair type
+    typedef ComplexD2  scalar_type; // scalar_* entries use the non-vectorised pair ComplexD2
+    typedef ComplexD2  scalar_typeD;
+    typedef vComplexD2 vector_type;
+    typedef vComplexD2 vector_typeD;
+    typedef vComplexD2 tensor_reduced;
+    typedef ComplexD2  scalar_object;
+    typedef ComplexD2  scalar_objectD;
+    typedef vComplexD2 Complexified;
+    typedef vRealD2 Realified; // real counterpart is the vectorised real pair
+    typedef vComplexD2 DoublePrecision; // maps to itself
+    typedef vComplexD2 DoublePrecision2;
+  };
   template<> struct GridTypeMapper<vInteger> : public GridTypeMapper_Base {
     typedef  Integer scalar_type;
     typedef  Integer scalar_typeD;

From c8af498a2abb195f606b389a12824bf3e65315f7 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <lehner@login1.summit.olcf.ornl.gov>
Date: Fri, 1 May 2020 03:45:50 -0400
Subject: [PATCH 16/51] BinaryIO fix for alternative little-endian format name
 (used in 96I ensemble)

---
 Grid/parallelIO/BinaryIO.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/parallelIO/BinaryIO.h b/Grid/parallelIO/BinaryIO.h
index f90c34a9..1f11add9 100644
--- a/Grid/parallelIO/BinaryIO.h
+++ b/Grid/parallelIO/BinaryIO.h
@@ -341,7 +341,7 @@ class BinaryIO {
     int ieee32big = (format == std::string("IEEE32BIG"));
     int ieee32    = (format == std::string("IEEE32"));
     int ieee64big = (format == std::string("IEEE64BIG"));
-    int ieee64    = (format == std::string("IEEE64"));
+    int ieee64    = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
     assert(ieee64||ieee32|ieee64big||ieee32big);
     assert((ieee64+ieee32+ieee64big+ieee32big)==1);
     //////////////////////////////////////////////////////////////////////////////

From 63cf201ee7ad32271537e5bbfe70d4df049a83f5 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <lehner@login5.summit.olcf.ornl.gov>
Date: Sat, 2 May 2020 11:38:42 -0400
Subject: [PATCH 17/51] Add AdviseInfrequentUse

---
 Grid/lattice/Lattice_base.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h
index ec7c54ec..a16b4fa8 100644
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -9,6 +9,7 @@ Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Christoph Lehner <christoph@lhnr.de>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -75,6 +76,14 @@ public:
     if (grid) conformable(grid, _grid);
     else      grid = _grid;
   };
+
+  // Advise that the data is used infrequently.  This can
+  // significantly influence performance of bulk storage.
+  accelerator_inline void AdviseInfrequentUse() {
+#ifdef __CUDA_ARCH__
+    cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId);
+#endif
+  };
 };
 
 /////////////////////////////////////////////////////////////////////////////////////////

From 949be9605cf2d801439ec38ad85aa9554b29eb58 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <lehner@login1.summit.olcf.ornl.gov>
Date: Sat, 2 May 2020 16:20:03 -0400
Subject: [PATCH 18/51] fix pragmas

---
 Grid/lattice/Lattice_base.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h
index a16b4fa8..271ab43e 100644
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -80,8 +80,10 @@ public:
   // Advise that the data is used infrequently.  This can
   // significantly influence performance of bulk storage.
   accelerator_inline void AdviseInfrequentUse() {
-#ifdef __CUDA_ARCH__
+#ifdef GRID_NVCC
+#ifndef __CUDA_ARCH__ // only on host
     cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId);
+#endif
 #endif
   };
 };

From 38532753f40a28b98d648e0896138f9fab9c3cf9 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <lehner@login4.summit.olcf.ornl.gov>
Date: Sun, 3 May 2020 08:58:32 -0400
Subject: [PATCH 19/51] interface cleanup

---
 Grid/lattice/Lattice_base.h | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h
index 271ab43e..1e8b4152 100644
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -50,6 +50,14 @@ void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
   assert(lhs == rhs);
 }
 
+////////////////////////////////////////////////////////////////////////////
+// Advise for memory management
+////////////////////////////////////////////////////////////////////////////
+enum LatticeAcceleratorAdvise {
+  AdviseInfrequentUse = 0x1    // Advise that the data is used infrequently.  This can
+                               // significantly influence performance of bulk storage.
+};
+
 ////////////////////////////////////////////////////////////////////////////
 // Minimal base class containing only data valid to access from accelerator
 // _odata will be a managed pointer in CUDA
@@ -77,12 +85,12 @@ public:
     else      grid = _grid;
   };
 
-  // Advise that the data is used infrequently.  This can
-  // significantly influence performance of bulk storage.
-  accelerator_inline void AdviseInfrequentUse() {
+  accelerator_inline void Advise(int advise) {
 #ifdef GRID_NVCC
 #ifndef __CUDA_ARCH__ // only on host
-    cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId);
+    if (advise & AdviseInfrequentUse) {
+      cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId);
+    }
 #endif
 #endif
   };

From 9bfa51bffbd3550b4ee2f3cd7594ccb5bbe75535 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Sun, 3 May 2020 09:12:52 -0400
Subject: [PATCH 20/51] cleanup comment

---
 Grid/lattice/Lattice_base.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h
index 1e8b4152..157c647b 100644
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -51,7 +51,7 @@ void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
 }
 
 ////////////////////////////////////////////////////////////////////////////
-// Advise for memory management
+// Advise the LatticeAccelerator class
 ////////////////////////////////////////////////////////////////////////////
 enum LatticeAcceleratorAdvise {
   AdviseInfrequentUse = 0x1    // Advise that the data is used infrequently.  This can

From 2a1387e992781258ffaf636142bb0bb13b168997 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Sun, 3 May 2020 17:27:11 -0400
Subject: [PATCH 21/51] rankInnerProduct

---
 Grid/lattice/Lattice_reduction.h | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
index e2690cfe..8acbde66 100644
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -5,6 +5,7 @@
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Christoph Lehner <christoph@lhnr.de>
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
     the Free Software Foundation; either version 2 of the License, or
@@ -93,7 +94,7 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
 
 // Double inner product
 template<class vobj>
-inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
+inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
 {
   typedef typename vobj::scalar_type scalar_type;
   typedef typename vobj::vector_typeD vector_type;
@@ -137,11 +138,18 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
   })
   nrm = TensorRemove(sum(inner_tmp_v,sites));
 #endif
-  grid->GlobalSum(nrm);
-
   return nrm;
 }
 
+template<class vobj>
+inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
+  // Take the value from rankInnerProduct, then pass it through GlobalSum on left's grid.
+  ComplexD ip = rankInnerProduct(left,right);
+  left.Grid()->GlobalSum(ip);
+  return ip;
+}
+
+
 /////////////////////////
 // Fast axpby_norm
 // z = a x + b y

From 04863f8f3835bbfec9f913139eddc1dca717bbb2 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <lehner@batch1.summit.olcf.ornl.gov>
Date: Mon, 4 May 2020 16:07:03 -0400
Subject: [PATCH 22/51] debug new AcceleratorView

---
 Grid/lattice/Lattice_ET.h        |  3 +-
 Grid/lattice/Lattice_arith.h     | 69 ++++++++++++++++---------------
 Grid/lattice/Lattice_base.h      | 71 +++++++++++++++++++++++++++-----
 Grid/lattice/Lattice_reduction.h | 14 +++----
 4 files changed, 105 insertions(+), 52 deletions(-)

diff --git a/Grid/lattice/Lattice_ET.h b/Grid/lattice/Lattice_ET.h
index cf7147b9..da63d5e6 100644
--- a/Grid/lattice/Lattice_ET.h
+++ b/Grid/lattice/Lattice_ET.h
@@ -9,6 +9,7 @@ Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
+Author: Christoph Lehner <christoph@lhnr.de>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -94,7 +95,7 @@ const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg)
 template <class lobj> accelerator_inline 
 const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg) 
 {
-  auto view = arg.View();
+  auto view = arg.AcceleratorView(ViewRead);
   return view[ss];
 }
 
diff --git a/Grid/lattice/Lattice_arith.h b/Grid/lattice/Lattice_arith.h
index 3543d6aa..c4a67620 100644
--- a/Grid/lattice/Lattice_arith.h
+++ b/Grid/lattice/Lattice_arith.h
@@ -7,6 +7,7 @@
     Copyright (C) 2015
 
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Christoph Lehner <christoph@lhnr.de>
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -36,9 +37,9 @@ NAMESPACE_BEGIN(Grid);
 template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
   ret.Checkerboard() = lhs.Checkerboard();
-  auto ret_v = ret.View();
-  auto lhs_v = lhs.View();
-  auto rhs_v = rhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
+  auto rhs_v = rhs.AcceleratorView(ViewRead);
   conformable(ret,rhs);
   conformable(lhs,rhs);
   accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
@@ -55,9 +56,9 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
   ret.Checkerboard() = lhs.Checkerboard();
   conformable(ret,rhs);
   conformable(lhs,rhs);
-  auto ret_v = ret.View();
-  auto lhs_v = lhs.View();
-  auto rhs_v = rhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
+  auto rhs_v = rhs.AcceleratorView(ViewRead);
   accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
     decltype(coalescedRead(obj1())) tmp;
     auto lhs_t=lhs_v(ss);
@@ -72,9 +73,9 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
   ret.Checkerboard() = lhs.Checkerboard();
   conformable(ret,rhs);
   conformable(lhs,rhs);
-  auto ret_v = ret.View();
-  auto lhs_v = lhs.View();
-  auto rhs_v = rhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
+  auto rhs_v = rhs.AcceleratorView(ViewRead);
   accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
     decltype(coalescedRead(obj1())) tmp;
     auto lhs_t=lhs_v(ss);
@@ -88,9 +89,9 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
   ret.Checkerboard() = lhs.Checkerboard();
   conformable(ret,rhs);
   conformable(lhs,rhs);
-  auto ret_v = ret.View();
-  auto lhs_v = lhs.View();
-  auto rhs_v = rhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
+  auto rhs_v = rhs.AcceleratorView(ViewRead);
   accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
     decltype(coalescedRead(obj1())) tmp;
     auto lhs_t=lhs_v(ss);
@@ -107,8 +108,8 @@ template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
   ret.Checkerboard() = lhs.Checkerboard();
   conformable(lhs,ret);
-  auto ret_v = ret.View();
-  auto lhs_v = lhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
   accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
     decltype(coalescedRead(obj1())) tmp;
     mult(&tmp,&lhs_v(ss),&rhs);
@@ -120,8 +121,8 @@ template<class obj1,class obj2,class obj3> inline
 void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
   ret.Checkerboard() = lhs.Checkerboard();
   conformable(ret,lhs);
-  auto ret_v = ret.View();
-  auto lhs_v = lhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
   accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
     decltype(coalescedRead(obj1())) tmp;
     auto lhs_t=lhs_v(ss);
@@ -134,8 +135,8 @@ template<class obj1,class obj2,class obj3> inline
 void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
   ret.Checkerboard() = lhs.Checkerboard();
   conformable(ret,lhs);
-  auto ret_v = ret.View();
-  auto lhs_v = lhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
   accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
     decltype(coalescedRead(obj1())) tmp;
     auto lhs_t=lhs_v(ss);
@@ -147,8 +148,8 @@ template<class obj1,class obj2,class obj3> inline
 void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
   ret.Checkerboard() = lhs.Checkerboard();
   conformable(lhs,ret);
-  auto ret_v = ret.View();
-  auto lhs_v = lhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
   accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
     decltype(coalescedRead(obj1())) tmp;
     auto lhs_t=lhs_v(ss);
@@ -164,8 +165,8 @@ template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
   ret.Checkerboard() = rhs.Checkerboard();
   conformable(ret,rhs);
-  auto ret_v = ret.View();
-  auto rhs_v = lhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto rhs_v = lhs.AcceleratorView(ViewRead);
   accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
     decltype(coalescedRead(obj1())) tmp;
     auto rhs_t=rhs_v(ss);
@@ -178,8 +179,8 @@ template<class obj1,class obj2,class obj3> inline
 void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
   ret.Checkerboard() = rhs.Checkerboard();
   conformable(ret,rhs);
-  auto ret_v = ret.View();
-  auto rhs_v = lhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto rhs_v = lhs.AcceleratorView(ViewRead);
   accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
     decltype(coalescedRead(obj1())) tmp;
     auto rhs_t=rhs_v(ss);
@@ -192,8 +193,8 @@ template<class obj1,class obj2,class obj3> inline
 void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
   ret.Checkerboard() = rhs.Checkerboard();
   conformable(ret,rhs);
-  auto ret_v = ret.View();
-  auto rhs_v = lhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto rhs_v = lhs.AcceleratorView(ViewRead);
   accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
     decltype(coalescedRead(obj1())) tmp;
     auto rhs_t=rhs_v(ss);
@@ -205,8 +206,8 @@ template<class obj1,class obj2,class obj3> inline
 void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
   ret.Checkerboard() = rhs.Checkerboard();
   conformable(ret,rhs);
-  auto ret_v = ret.View();
-  auto rhs_v = lhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto rhs_v = lhs.AcceleratorView(ViewRead);
   accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
     decltype(coalescedRead(obj1())) tmp;
     auto rhs_t=rhs_v(ss);
@@ -220,9 +221,9 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
   ret.Checkerboard() = x.Checkerboard();
   conformable(ret,x);
   conformable(x,y);
-  auto ret_v = ret.View();
-  auto x_v = x.View();
-  auto y_v = y.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto x_v = x.AcceleratorView(ViewRead);
+  auto y_v = y.AcceleratorView(ViewRead);
   accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
     auto tmp = a*x_v(ss)+y_v(ss);
     coalescedWrite(ret_v[ss],tmp);
@@ -233,9 +234,9 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
   ret.Checkerboard() = x.Checkerboard();
   conformable(ret,x);
   conformable(x,y);
-  auto ret_v = ret.View();
-  auto x_v = x.View();
-  auto y_v = y.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto x_v = x.AcceleratorView(ViewRead);
+  auto y_v = y.AcceleratorView(ViewRead);
   accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
     auto tmp = a*x_v(ss)+b*y_v(ss);
     coalescedWrite(ret_v[ss],tmp);
diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h
index 157c647b..30aa6b06 100644
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -54,8 +54,20 @@ void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
 // Advise the LatticeAccelerator class
 ////////////////////////////////////////////////////////////////////////////
 enum LatticeAcceleratorAdvise {
-  AdviseInfrequentUse = 0x1    // Advise that the data is used infrequently.  This can
-                               // significantly influence performance of bulk storage.
+  AdviseInfrequentUse = 0x1,    // Advise that the data is used infrequently.  This can
+                                // significantly influence performance of bulk storage.
+  AdviseReadMostly = 0x2,       // Data will mostly be read.  On some architectures
+                                // enables read-only copies of memory to be kept on
+                                // host and device.
+};
+
+////////////////////////////////////////////////////////////////////////////
+// View Access Mode
+////////////////////////////////////////////////////////////////////////////
+enum ViewMode {
+  ViewRead = 0x1,
+  ViewWrite = 0x2,
+  ViewReadWrite = 0x3
 };
 
 ////////////////////////////////////////////////////////////////////////////
@@ -91,6 +103,29 @@ public:
     if (advise & AdviseInfrequentUse) {
       cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId);
     }
+    if (advise & AdviseReadMostly) {
+      cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetReadMostly,-1);
+    }
+#endif
+#endif
+  };
+
+  accelerator_inline void AcceleratorPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
+#ifdef GRID_NVCC
+#ifndef __CUDA_ARCH__ // only on host
+    int target;
+    cudaGetDevice(&target);
+    cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),target);
+    std::cout<< GridLogMessage << "To Device " << target << std::endl;
+#endif
+#endif
+  };
+
+  accelerator_inline void HostPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
+#ifdef GRID_NVCC
+#ifndef __CUDA_ARCH__ // only on host
+    cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId);
+    std::cout<< GridLogMessage << "To Host" << std::endl;
 #endif
 #endif
   };
@@ -225,9 +260,23 @@ public:
   // The view is trivially copy constructible and may be copied to an accelerator device
   // in device lambdas
   /////////////////////////////////////////////////////////////////////////////////
-  LatticeView<vobj> View (void) const 
+  LatticeView<vobj> View (void) const // deprecated, should pick AcceleratorView for accelerator_for
+  {                                   //                     and HostView        for thread_for
+    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
+    return accessor;
+  }
+
+  LatticeView<vobj> AcceleratorView(int mode = ViewReadWrite) const 
   {
     LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
+    accessor.AcceleratorPrefetch(mode);
+    return accessor;
+  }
+
+  LatticeView<vobj> HostView(int mode = ViewReadWrite) const 
+  {
+    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
+    accessor.HostPrefetch(mode);
     return accessor;
   }
   
@@ -251,7 +300,7 @@ public:
     assert( (cb==Odd) || (cb==Even));
     this->checkerboard=cb;
 
-    auto me  = View();
+    auto me  = AcceleratorView(ViewWrite);
     accelerator_for(ss,me.size(),1,{
       auto tmp = eval(ss,expr);
       vstream(me[ss],tmp);
@@ -270,7 +319,7 @@ public:
     assert( (cb==Odd) || (cb==Even));
     this->checkerboard=cb;
 
-    auto me  = View();
+    auto me  = AcceleratorView(ViewWrite);
     accelerator_for(ss,me.size(),1,{
       auto tmp = eval(ss,expr);
       vstream(me[ss],tmp);
@@ -288,7 +337,7 @@ public:
     CBFromExpression(cb,expr);
     assert( (cb==Odd) || (cb==Even));
     this->checkerboard=cb;
-    auto me  = View();
+    auto me  = AcceleratorView(ViewWrite);
     accelerator_for(ss,me.size(),1,{
       auto tmp = eval(ss,expr);
       vstream(me[ss],tmp);
@@ -399,8 +448,9 @@ public:
     typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
     conformable(*this,r);
     this->checkerboard = r.Checkerboard();
-    auto me =   View();
-    auto him= r.View();
+    std::cout << GridLogMessage << "Copy other" << std::endl;
+    auto me =   AcceleratorView(ViewWrite);
+    auto him= r.AcceleratorView(ViewRead);
     accelerator_for(ss,me.size(),vobj::Nsimd(),{
       coalescedWrite(me[ss],him(ss));
     });
@@ -413,8 +463,9 @@ public:
   inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
     this->checkerboard = r.Checkerboard();
     conformable(*this,r);
-    auto me =   View();
-    auto him= r.View();
+    std::cout << GridLogMessage << "Copy same" << std::endl;
+    auto me =   AcceleratorView(ViewWrite);
+    auto him= r.AcceleratorView(ViewRead);
     accelerator_for(ss,me.size(),vobj::Nsimd(),{
       coalescedWrite(me[ss],him(ss));
     });
diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
index 8acbde66..3147823d 100644
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -103,8 +103,8 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
   GridBase *grid = left.Grid();
   
   // Might make all code paths go this way.
-  auto left_v = left.View();
-  auto right_v=right.View();
+  auto left_v = left.AcceleratorView(ViewRead);
+  auto right_v=right.AcceleratorView(ViewRead);
 
   const uint64_t nsimd = grid->Nsimd();
   const uint64_t sites = grid->oSites();
@@ -175,9 +175,9 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
   
   GridBase *grid = x.Grid();
 
-  auto x_v=x.View();
-  auto y_v=y.View();
-  auto z_v=z.View();
+  auto x_v=x.AcceleratorView(ViewRead);
+  auto y_v=y.AcceleratorView(ViewRead);
+  auto z_v=z.AcceleratorView(ViewWrite);
 
   const uint64_t nsimd = grid->Nsimd();
   const uint64_t sites = grid->oSites();
@@ -224,8 +224,8 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
 
   GridBase *grid = left.Grid();
 
-  auto left_v=left.View();
-  auto right_v=right.View();
+  auto left_v=left.AcceleratorView(ViewRead);
+  auto right_v=right.AcceleratorView(ViewRead);
 
   const uint64_t nsimd = grid->Nsimd();
   const uint64_t sites = grid->oSites();

From 6b64727161b4328d54ed919be2979ba518614c37 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <lehner@login1.summit.olcf.ornl.gov>
Date: Tue, 5 May 2020 05:05:36 -0400
Subject: [PATCH 23/51] Comment out debug log output

---
 Grid/lattice/Lattice_base.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h
index 30aa6b06..0b03dea0 100644
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -116,7 +116,7 @@ public:
     int target;
     cudaGetDevice(&target);
     cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),target);
-    std::cout<< GridLogMessage << "To Device " << target << std::endl;
+    //std::cout<< GridLogMessage << "To Device " << target << std::endl;
 #endif
 #endif
   };
@@ -125,7 +125,7 @@ public:
 #ifdef GRID_NVCC
 #ifndef __CUDA_ARCH__ // only on host
     cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId);
-    std::cout<< GridLogMessage << "To Host" << std::endl;
+    //std::cout<< GridLogMessage << "To Host" << std::endl;
 #endif
 #endif
   };
@@ -448,7 +448,7 @@ public:
     typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
     conformable(*this,r);
     this->checkerboard = r.Checkerboard();
-    std::cout << GridLogMessage << "Copy other" << std::endl;
+    //std::cout << GridLogMessage << "Copy other" << std::endl;
     auto me =   AcceleratorView(ViewWrite);
     auto him= r.AcceleratorView(ViewRead);
     accelerator_for(ss,me.size(),vobj::Nsimd(),{
@@ -463,7 +463,7 @@ public:
   inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
     this->checkerboard = r.Checkerboard();
     conformable(*this,r);
-    std::cout << GridLogMessage << "Copy same" << std::endl;
+    //std::cout << GridLogMessage << "Copy same" << std::endl;
     auto me =   AcceleratorView(ViewWrite);
     auto him= r.AcceleratorView(ViewRead);
     accelerator_for(ss,me.size(),vobj::Nsimd(),{

From e9b295f967c03bfe200880cd834721b2583dced5 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <lehner@login2.summit.olcf.ornl.gov>
Date: Wed, 6 May 2020 08:42:28 -0400
Subject: [PATCH 24/51] Synchronize blocking infrastructure with GPT

---
 .../iterative/ImplicitlyRestartedLanczos.h    | 205 -------------
 Grid/lattice/Lattice.h                        |   2 +-
 Grid/lattice/Lattice_base.h                   |   5 -
 Grid/lattice/Lattice_transfer.h               | 286 +++++++++++++-----
 4 files changed, 216 insertions(+), 282 deletions(-)

diff --git a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
index 8bee43cc..3d0a2a75 100644
--- a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -37,211 +37,6 @@ Author: Christoph Lehner <clehner@bnl.gov>
 
 NAMESPACE_BEGIN(Grid); 
 
-  ////////////////////////////////////////////////////////
-  // Move following 100 LOC to lattice/Lattice_basis.h
-  ////////////////////////////////////////////////////////
-template<class Field>
-void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k) 
-{
-  // If assume basis[j] are already orthonormal,
-  // can take all inner products in parallel saving 2x bandwidth
-  // Save 3x bandwidth on the second line of loop.
-  // perhaps 2.5x speed up.
-  // 2x overall in Multigrid Lanczos  
-  for(int j=0; j<k; ++j){
-    auto ip = innerProduct(basis[j],w);
-    w = w - ip*basis[j];
-  }
-}
-
-template<class Field>
-void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm) 
-{
-  typedef decltype(basis[0].View()) View;
-  auto tmp_v = basis[0].View();
-  Vector<View> basis_v(basis.size(),tmp_v);
-  typedef typename Field::vector_object vobj;
-  GridBase* grid = basis[0].Grid();
-
-  for(int k=0;k<basis.size();k++){
-    basis_v[k] = basis[k].View();
-  }
-#if 0
-  std::vector < vobj , commAllocator<vobj> > Bt(thread_max() * Nm); // Thread private
-  thread_region
-  {
-    vobj* B = Bt.data() + Nm * thread_num();
-
-    thread_for_in_region(ss, grid->oSites(),{
-      for(int j=j0; j<j1; ++j) B[j]=0.;
-      
-      for(int j=j0; j<j1; ++j){
-	for(int k=k0; k<k1; ++k){
-	  B[j] +=Qt(j,k) * basis_v[k][ss];
-	}
-      }
-      for(int j=j0; j<j1; ++j){
-	basis_v[j][ss] = B[j];
-      }
-    });
-  }
-#else
-
-  int nrot = j1-j0;
-
-
-  uint64_t oSites   =grid->oSites();
-  uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
-
-  //  printf("BasisRotate %d %d nrot %d siteBlock %d\n",j0,j1,nrot,siteBlock);
-
-  Vector <vobj> Bt(siteBlock * nrot); 
-  auto Bp=&Bt[0];
-
-  // GPU readable copy of Eigen matrix
-  Vector<double> Qt_jv(Nm*Nm);
-  double *Qt_p = & Qt_jv[0];
-  for(int k=0;k<Nm;++k){
-    for(int j=0;j<Nm;++j){
-      Qt_p[j*Nm+k]=Qt(j,k);
-    }
-  }
-
-  // Block the loop to keep storage footprint down
-  vobj zz=Zero();
-  for(uint64_t s=0;s<oSites;s+=siteBlock){
-
-    // remaining work in this block
-    int ssites=MIN(siteBlock,oSites-s);
-
-    // zero out the accumulators
-    accelerator_for(ss,siteBlock*nrot,vobj::Nsimd(),{
-	auto z=coalescedRead(zz);
-	coalescedWrite(Bp[ss],z);
-    });
-
-    accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
-	
-      int j =sj%nrot;
-      int jj  =j0+j;
-      int ss =sj/nrot;
-      int sss=ss+s;
-
-      for(int k=k0; k<k1; ++k){
-	auto tmp = coalescedRead(Bp[ss*nrot+j]);
-	coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_v[k][sss]));
-      }
-    });
-
-    accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
-      int j =sj%nrot;
-      int jj  =j0+j;
-      int ss =sj/nrot;
-      int sss=ss+s;
-      coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
-    });
-  }
-#endif
-}
-
-// Extract a single rotated vector
-template<class Field>
-void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) 
-{
-  typedef decltype(basis[0].View()) View;
-  typedef typename Field::vector_object vobj;
-  GridBase* grid = basis[0].Grid();
-
-  result.Checkerboard() = basis[0].Checkerboard();
-  auto result_v=result.View();
-  Vector<View> basis_v(basis.size(),result_v);
-  for(int k=0;k<basis.size();k++){
-    basis_v[k] = basis[k].View();
-  }
-  vobj zz=Zero();
-  Vector<double> Qt_jv(Nm);
-  double * Qt_j = & Qt_jv[0];
-  for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
-  accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
-    auto B=coalescedRead(zz);
-    for(int k=k0; k<k1; ++k){
-      B +=Qt_j[k] * coalescedRead(basis_v[k][ss]);
-    }
-    coalescedWrite(result_v[ss], B);
-  });
-}
-
-template<class Field>
-void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx) 
-{
-  int vlen = idx.size();
-
-  assert(vlen>=1);
-  assert(vlen<=sort_vals.size());
-  assert(vlen<=_v.size());
-
-  for (size_t i=0;i<vlen;i++) {
-
-    if (idx[i] != i) {
-
-      //////////////////////////////////////
-      // idx[i] is a table of desired sources giving a permutation.
-      // Swap v[i] with v[idx[i]].
-      // Find  j>i for which _vnew[j] = _vold[i],
-      // track the move idx[j] => idx[i]
-      // track the move idx[i] => i
-      //////////////////////////////////////
-      size_t j;
-      for (j=i;j<idx.size();j++)
-	if (idx[j]==i)
-	  break;
-
-      assert(idx[i] > i);     assert(j!=idx.size());      assert(idx[j]==i);
-
-      swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy
-      std::swap(sort_vals[i],sort_vals[idx[i]]);
-
-      idx[j] = idx[i];
-      idx[i] = i;
-    }
-  }
-}
-
-inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals) 
-{
-  std::vector<int> idx(sort_vals.size());
-  std::iota(idx.begin(), idx.end(), 0);
-
-  // sort indexes based on comparing values in v
-  std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) {
-    return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]);
-  });
-  return idx;
-}
-
-template<class Field>
-void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse) 
-{
-  std::vector<int> idx = basisSortGetIndex(sort_vals);
-  if (reverse)
-    std::reverse(idx.begin(), idx.end());
-  
-  basisReorderInPlace(_v,sort_vals,idx);
-}
-
-// PAB: faster to compute the inner products first then fuse loops.
-// If performance critical can improve.
-template<class Field>
-void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
-  result = Zero();
-  assert(_v.size()==eval.size());
-  int N = (int)_v.size();
-  for (int i=0;i<N;i++) {
-    Field& tmp = _v[i];
-    axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
-  }
-}
-
 /////////////////////////////////////////////////////////////
 // Implicitly restarted lanczos
 /////////////////////////////////////////////////////////////
diff --git a/Grid/lattice/Lattice.h b/Grid/lattice/Lattice.h
index 6eac0362..13c0a885 100644
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@@ -43,4 +43,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_rng.h>
 #include <Grid/lattice/Lattice_unary.h>
 #include <Grid/lattice/Lattice_transfer.h>
-
+#include <Grid/lattice/Lattice_basis.h>
diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h
index 0b03dea0..74525cc1 100644
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -116,7 +116,6 @@ public:
     int target;
     cudaGetDevice(&target);
     cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),target);
-    //std::cout<< GridLogMessage << "To Device " << target << std::endl;
 #endif
 #endif
   };
@@ -125,7 +124,6 @@ public:
 #ifdef GRID_NVCC
 #ifndef __CUDA_ARCH__ // only on host
     cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId);
-    //std::cout<< GridLogMessage << "To Host" << std::endl;
 #endif
 #endif
   };
@@ -425,7 +423,6 @@ public:
   // copy constructor
   ///////////////////////////////////////////
   Lattice(const Lattice& r){ 
-    //    std::cout << "Lattice constructor(const Lattice &) "<<this<<std::endl; 
     this->_grid = r.Grid();
     resize(this->_grid->oSites());
     *this = r;
@@ -448,7 +445,6 @@ public:
     typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
     conformable(*this,r);
     this->checkerboard = r.Checkerboard();
-    //std::cout << GridLogMessage << "Copy other" << std::endl;
     auto me =   AcceleratorView(ViewWrite);
     auto him= r.AcceleratorView(ViewRead);
     accelerator_for(ss,me.size(),vobj::Nsimd(),{
@@ -463,7 +459,6 @@ public:
   inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
     this->checkerboard = r.Checkerboard();
     conformable(*this,r);
-    //std::cout << GridLogMessage << "Copy same" << std::endl;
     auto me =   AcceleratorView(ViewWrite);
     auto him= r.AcceleratorView(ViewRead);
     accelerator_for(ss,me.size(),vobj::Nsimd(),{
diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h
index c80e7db2..c23ddcdc 100644
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -6,6 +6,7 @@
     Copyright (C) 2015
 
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Christoph Lehner <christoph@lhnr.de>
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -63,6 +64,7 @@ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,con
     }
   });
 }
+
 template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
   int cb = half.Checkerboard();
   auto half_v = half.View();
@@ -81,25 +83,130 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
     }
   });
 }
-  
-template<class vobj,class CComplex,int nbasis>
+
+////////////////////////////////////////////////////////////////////////////////////////////
+// Flexible Type Conversion for internal promotion to double as well as graceful
+// treatment of scalar-compatible types
+////////////////////////////////////////////////////////////////////////////////////////////
+accelerator_inline void convertType(ComplexD & out, const std::complex<double> & in) {
+  out = in;
+}
+
+accelerator_inline void convertType(ComplexF & out, const std::complex<float> & in) {
+  out = in;
+}
+
+#ifdef __CUDA_ARCH__
+accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
+  ((ComplexF*)&out)[SIMTlane(vComplexF::Nsimd())] = in;
+}
+accelerator_inline void convertType(vComplexD & out, const ComplexD & in) {
+  ((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd())] = in;
+}
+accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) {
+  ((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd()*2)] = in;
+}
+#endif
+
+accelerator_inline void convertType(vComplexF & out, const vComplexD2 & in) {
+  out.v = Optimization::PrecisionChange::DtoS(in._internal[0].v,in._internal[1].v);
+}
+
+accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) {
+  Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v);
+}
+
+template<typename T1,typename T2,int N>
+  accelerator_inline void convertType(iMatrix<T1,N> & out, const iMatrix<T2,N> & in);
+template<typename T1,typename T2,int N>
+  accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & in);
+
+template<typename T1,typename T2, typename std::enable_if<!isGridScalar<T1>::value, T1>::type* = nullptr>
+accelerator_inline void convertType(T1 & out, const iScalar<T2> & in) {
+  convertType(out,in._internal);
+}
+
+template<typename T1,typename T2>
+accelerator_inline void convertType(iScalar<T1> & out, const T2 & in) {
+  convertType(out._internal,in);
+}
+
+template<typename T1,typename T2,int N>
+accelerator_inline void convertType(iMatrix<T1,N> & out, const iMatrix<T2,N> & in) {
+  for (int i=0;i<N;i++)
+    for (int j=0;j<N;j++)
+      convertType(out._internal[i][j],in._internal[i][j]);
+}
+
+template<typename T1,typename T2,int N>
+accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & in) {
+  for (int i=0;i<N;i++)
+    convertType(out._internal[i],in._internal[i]);
+}
+
+template<typename T, typename std::enable_if<isGridFundamental<T>::value, T>::type* = nullptr>
+accelerator_inline void convertType(T & out, const T & in) {
+  out = in;
+}
+
+template<typename T1,typename T2>
+accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
+  auto out_v = out.AcceleratorView(ViewWrite);
+  auto in_v  = in.AcceleratorView(ViewRead);
+
+  accelerator_for(ss,out_v.size(),T1::Nsimd(),{
+      convertType(out_v[ss],in_v(ss));
+    });
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////
+// precision-promoted local inner product
+////////////////////////////////////////////////////////////////////////////////////////////
+template<class vobj>
+inline auto localInnerProductD(const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
+-> Lattice<iScalar<decltype(TensorRemove(innerProductD2(lhs.View()[0],rhs.View()[0])))>>
+{
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
+  auto rhs_v = rhs.AcceleratorView(ViewRead);
+
+  typedef decltype(TensorRemove(innerProductD2(lhs_v[0],rhs_v[0]))) t_inner;
+  Lattice<iScalar<t_inner>> ret(lhs.Grid());
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+
+  accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
+      convertType(ret_v[ss],innerProductD2(lhs_v(ss),rhs_v(ss)));
+    });
+
+  return ret;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////
+// block routines
+////////////////////////////////////////////////////////////////////////////////////////////
+template<class vobj,class CComplex,int nbasis,class VLattice>
 inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
-			  const             Lattice<vobj>   &fineData,
-			  const std::vector<Lattice<vobj> > &Basis)
+			   const             Lattice<vobj>   &fineData,
+			   const VLattice &Basis)
 {
   GridBase * fine  = fineData.Grid();
   GridBase * coarse= coarseData.Grid();
 
-  Lattice<CComplex> ip(coarse); 
+  Lattice<iScalar<CComplex>> ip(coarse);
+  Lattice<vobj>     fineDataRed = fineData;
 
   //  auto fineData_   = fineData.View();
-  auto coarseData_ = coarseData.View();
-  auto ip_         = ip.View();
+  auto coarseData_ = coarseData.AcceleratorView(ViewWrite);
+  auto ip_         = ip.AcceleratorView(ViewReadWrite);
   for(int v=0;v<nbasis;v++) {
-    blockInnerProduct(ip,Basis[v],fineData);
+    blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine>
     accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
-	coalescedWrite(coarseData_[sc](v),ip_(sc));
+	convertType(coarseData_[sc](v),ip_[sc]);
       });
+
+    // improve numerical stability of projection
+    // |fine> = |fine> - <basis|fine> |basis>
+    ip=-ip;
+    blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed); 
   }
 }
 
@@ -166,11 +273,11 @@ inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
   return;
 }
 
-template<class vobj,class CComplex>
-inline void blockZAXPY(Lattice<vobj> &fineZ,
-		       const Lattice<CComplex> &coarseA,
-		       const Lattice<vobj> &fineX,
-		       const Lattice<vobj> &fineY)
+template<class vobj,class vobj2,class CComplex>
+  inline void blockZAXPY(Lattice<vobj> &fineZ,
+			 const Lattice<CComplex> &coarseA,
+			 const Lattice<vobj2> &fineX,
+			 const Lattice<vobj> &fineY)
 {
   GridBase * fine  = fineZ.Grid();
   GridBase * coarse= coarseA.Grid();
@@ -182,7 +289,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
   conformable(fineX,fineZ);
 
   int _ndimension = coarse->_ndimension;
-  
+
   Coordinate  block_r      (_ndimension);
 
   // FIXME merge with subdivide checking routine as this is redundant
@@ -191,29 +298,65 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
     assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
   }
 
-  auto fineZ_  = fineZ.View();
-  auto fineX_  = fineX.View();
-  auto fineY_  = fineY.View();
-  auto coarseA_= coarseA.View();
+  auto fineZ_  = fineZ.AcceleratorView(ViewWrite);
+  auto fineX_  = fineX.AcceleratorView(ViewRead);
+  auto fineY_  = fineY.AcceleratorView(ViewRead);
+  auto coarseA_= coarseA.AcceleratorView(ViewRead);
 
   accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
-    
-    int sc;
-    Coordinate coor_c(_ndimension);
-    Coordinate coor_f(_ndimension);
 
-    Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
-    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
-    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
+      int sc;
+      Coordinate coor_c(_ndimension);
+      Coordinate coor_f(_ndimension);
 
-    // z = A x + y
-    coalescedWrite(fineZ_[sf],coarseA_(sc)*fineX_(sf)+fineY_(sf));
+      Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
+      for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
+      Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
 
-  });
+      // z = A x + y
+#ifdef __CUDA_ARCH__
+      typename vobj2::tensor_reduced::scalar_object cA;
+      typename vobj::scalar_object cAx;
+#else
+      typename vobj2::tensor_reduced cA;
+      vobj cAx;
+#endif
+      convertType(cA,TensorRemove(coarseA_(sc)));
+      auto prod = cA*fineX_(sf);
+      convertType(cAx,prod);
+      coalescedWrite(fineZ_[sf],cAx+fineY_(sf));
+
+    });
 
   return;
 }
+
 template<class vobj,class CComplex>
+  inline void blockInnerProductD(Lattice<CComplex> &CoarseInner,
+				 const Lattice<vobj> &fineX,
+				 const Lattice<vobj> &fineY)
+{
+  typedef iScalar<decltype(TensorRemove(innerProductD2(vobj(),vobj())))> dotp;
+
+  GridBase *coarse(CoarseInner.Grid());
+  GridBase *fine  (fineX.Grid());
+
+  Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
+  Lattice<dotp> coarse_inner(coarse);
+
+  auto CoarseInner_  = CoarseInner.AcceleratorView(ViewWrite);
+  auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite);
+
+  // Precision promotion
+  fine_inner = localInnerProductD(fineX,fineY);
+  blockSum(coarse_inner,fine_inner);
+  accelerator_for(ss, coarse->oSites(), 1, {
+      convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss]));
+    });
+ 
+}
+
+template<class vobj,class CComplex> // deprecate
 inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
 			      const Lattice<vobj> &fineX,
 			      const Lattice<vobj> &fineY)
@@ -227,8 +370,8 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
   Lattice<dotp> coarse_inner(coarse);
 
   // Precision promotion?
-  auto CoarseInner_  = CoarseInner.View();
-  auto coarse_inner_ = coarse_inner.View();
+  auto CoarseInner_  = CoarseInner.AcceleratorView(ViewWrite);
+  auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite);
 
   fine_inner = localInnerProduct(fineX,fineY);
   blockSum(coarse_inner,fine_inner);
@@ -236,6 +379,7 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
     CoarseInner_[ss] = coarse_inner_[ss];
   });
 }
+
 template<class vobj,class CComplex>
 inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
 {
@@ -248,7 +392,7 @@ inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
 // useful in multigrid project;
 // Generic name : Coarsen?
 template<class vobj>
-inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
+inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData) 
 {
   GridBase * fine  = fineData.Grid();
   GridBase * coarse= coarseData.Grid();
@@ -256,42 +400,41 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
   subdivides(coarse,fine); // require they map
 
   int _ndimension = coarse->_ndimension;
-  
+
   Coordinate  block_r      (_ndimension);
-  
+
   for(int d=0 ; d<_ndimension;d++){
     block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
   }
   int blockVol = fine->oSites()/coarse->oSites();
 
-  // Turn this around to loop threaded over sc and interior loop 
-  // over sf would thread better
-  auto coarseData_ = coarseData.View();
-  auto fineData_   = fineData.View();
+  auto coarseData_ = coarseData.AcceleratorView(ViewReadWrite);
+  auto fineData_   = fineData.AcceleratorView(ViewRead);
 
   accelerator_for(sc,coarse->oSites(),1,{
 
-    // One thread per sub block
-    Coordinate coor_c(_ndimension);
-    Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions);  // Block coordinate
-    coarseData_[sc]=Zero();
+      // One thread per sub block
+      Coordinate coor_c(_ndimension);
+      Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions);  // Block coordinate
+      coarseData_[sc]=Zero();
 
-    for(int sb=0;sb<blockVol;sb++){
-      
-      int sf;
-      Coordinate coor_b(_ndimension);
-      Coordinate coor_f(_ndimension);
-      Lexicographic::CoorFromIndex(coor_b,sb,block_r);               // Block sub coordinate
-      for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
-      Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
+      for(int sb=0;sb<blockVol;sb++){
 
-      coarseData_[sc]=coarseData_[sc]+fineData_[sf];
-    }
+	int sf;
+	Coordinate coor_b(_ndimension);
+	Coordinate coor_f(_ndimension);
+	Lexicographic::CoorFromIndex(coor_b,sb,block_r);               // Block sub coordinate
+	for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
+	Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
 
-  });
+	coarseData_[sc]=coarseData_[sc]+fineData_[sf];
+      }
+
+    });
   return;
 }
 
+
 template<class vobj>
 inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vobj> &picked,Coordinate coor)
 {
@@ -313,8 +456,8 @@ inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vob
   }
 }
 
-template<class vobj,class CComplex>
-inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> > &Basis)
+template<class CComplex,class VLattice>
+inline void blockOrthonormalize(Lattice<CComplex> &ip,VLattice &Basis)
 {
   GridBase *coarse = ip.Grid();
   GridBase *fine   = Basis[0].Grid();
@@ -322,23 +465,30 @@ inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> >
   int       nbasis = Basis.size() ;
 
   // checks
-  subdivides(coarse,fine); 
+  subdivides(coarse,fine);
   for(int i=0;i<nbasis;i++){
     conformable(Basis[i].Grid(),fine);
   }
 
   for(int v=0;v<nbasis;v++) {
     for(int u=0;u<v;u++) {
-      //Inner product & remove component 
-      blockInnerProduct(ip,Basis[u],Basis[v]);
+      //Inner product & remove component
+      blockInnerProductD(ip,Basis[u],Basis[v]);
       ip = -ip;
-      blockZAXPY<vobj,CComplex> (Basis[v],ip,Basis[u],Basis[v]);
+      blockZAXPY(Basis[v],ip,Basis[u],Basis[v]);
     }
     blockNormalise(ip,Basis[v]);
   }
 }
 
+template<class vobj,class CComplex>
+inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> > &Basis) // deprecated: inaccurate name, retains the earlier name and signature
+{
+  blockOrthonormalize(ip,Basis); // plain forward; blockOrthonormalize also calls blockNormalise on every vector
+}
+
 #if 0
+// TODO: CPU optimized version here
 template<class vobj,class CComplex,int nbasis>
 inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 			 Lattice<vobj>   &fineData,
@@ -383,24 +533,18 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
   
 }
 #else
-template<class vobj,class CComplex,int nbasis>
+template<class vobj,class CComplex,int nbasis,class VLattice> // VLattice: any container of fine-grid lattices indexable as Basis[i]
 inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 			 Lattice<vobj>   &fineData,
-			 const std::vector<Lattice<vobj> > &Basis)
+			 const VLattice &Basis)
 {
   GridBase * fine  = fineData.Grid();
   GridBase * coarse= coarseData.Grid();
-
   fineData=Zero();
   for(int i=0;i<nbasis;i++) {
     Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
-    Lattice<CComplex> cip(coarse);
-    auto cip_ = cip.View();
-    auto  ip_ =  ip.View();
-    accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
-	coalescedWrite(cip_[sc], ip_(sc)());
-    });
-    blockZAXPY<vobj,CComplex >(fineData,cip,Basis[i],fineData);
+    auto  ip_ =  ip.AcceleratorView(ViewRead); // NOTE(review): ip_ is never read in this function - confirm the view is still needed
+    blockZAXPY(fineData,ip,Basis[i],fineData); // presumably fineData = ip*Basis[i] + fineData, block by block - confirm against blockZAXPY
   }
 }
 #endif
@@ -470,8 +614,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
   Coordinate rdt = Tg->_rdimensions;
   Coordinate ist = Tg->_istride;
   Coordinate ost = Tg->_ostride;
-  auto t_v = To.View();
-  auto f_v = From.View();
+  auto t_v = To.AcceleratorView(ViewWrite);
+  auto f_v = From.AcceleratorView(ViewRead);
   accelerator_for(idx,Fg->lSites(),1,{
     sobj s;
     Coordinate Fcoor(nd);

From 87984ece7d135f236c671ab3caeaafd7beb4158f Mon Sep 17 00:00:00 2001
From: Christoph Lehner <lehner@login2.summit.olcf.ornl.gov>
Date: Wed, 6 May 2020 08:47:18 -0400
Subject: [PATCH 25/51] add Lattice_basis.h

---
 Grid/lattice/Lattice_basis.h | 236 +++++++++++++++++++++++++++++++++++
 1 file changed, 236 insertions(+)
 create mode 100644 Grid/lattice/Lattice_basis.h

diff --git a/Grid/lattice/Lattice_basis.h b/Grid/lattice/Lattice_basis.h
new file mode 100644
index 00000000..f1126936
--- /dev/null
+++ b/Grid/lattice/Lattice_basis.h
@@ -0,0 +1,236 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/lattice/Lattice_basis.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Christoph Lehner <christoph@lhnr.de>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+template<class Field>
+void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
+{
+  // For each of the first k basis vectors in turn, subtract from w the term
+  // innerProduct(basis[j],w)*basis[j]; each inner product sees the updated w.
+  // Possible optimisation (from the original note): if basis[j] are already
+  // orthonormal, all inner products can be taken in parallel and the update
+  // fused, saving bandwidth - perhaps 2.5x here, 2x overall in Multigrid Lanczos.
+  for(int col=0; col<k; col++){
+    auto overlap = innerProduct(basis[col],w);
+    w = w - overlap*basis[col];
+  }
+}
+// basisRotate: in place, basis[j] <- sum_{k0<=k<k1} Qt(j,k)*basis[k] for every j0<=j<j1, computed site by site.
+template<class VField, class Matrix>
+void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) 
+{
+  typedef decltype(basis[0]) Field; // NOTE(review): Field is not referenced again in this function
+  typedef decltype(basis[0].View()) View; // NOTE(review): assumes View() and AcceleratorView() return the same type - confirm
+  auto tmp_v = basis[0].AcceleratorView(ViewReadWrite);
+  Vector<View> basis_v(basis.size(),tmp_v); // every slot starts as tmp_v and is reassigned in the loop below
+  typedef typename std::remove_reference<decltype(tmp_v[0])>::type vobj;
+  GridBase* grid = basis[0].Grid();
+  // one read/write view per basis vector
+  for(int k=0;k<basis.size();k++){
+    basis_v[k] = basis[k].AcceleratorView(ViewReadWrite);
+  }
+  // Without GRID_NVCC: threaded loop over the outer sites using the accumulator array B
+#ifndef GRID_NVCC
+  thread_region
+  {
+    std::vector < vobj > B(Nm); // Thread private; indexed by j itself, so j1<=Nm is required
+    thread_for_in_region(ss, grid->oSites(),{
+	for(int j=j0; j<j1; ++j) B[j]=0.;
+	// finish all accumulation for this site before any basis_v[j][ss] is overwritten
+	for(int j=j0; j<j1; ++j){
+	  for(int k=k0; k<k1; ++k){
+	    B[j] +=Qt(j,k) * basis_v[k][ss];
+	  }
+	}
+	for(int j=j0; j<j1; ++j){
+	  basis_v[j][ss] = B[j];
+	}
+      });
+  }
+#else
+  int nrot = j1-j0;
+  if (!nrot) // edge case not handled gracefully by Cuda
+    return;
+  // sites are processed in chunks of siteBlock = ceil(oSites/nrot) sites
+  uint64_t oSites   =grid->oSites();
+  uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
+
+  Vector <vobj> Bt(siteBlock * nrot); // scratch accumulators, addressed as Bp[site_in_chunk*nrot + j]
+  auto Bp=&Bt[0];
+
+  // GPU readable copy of matrix, row-major (Qt_p[j*Nm+k] = Qt(j,k)), stored as double
+  Vector<double> Qt_jv(Nm*Nm);
+  double *Qt_p = & Qt_jv[0];
+  thread_for(i,Nm*Nm,{
+      int j = i/Nm;
+      int k = i%Nm;
+      Qt_p[i]=Qt(j,k);
+    });
+
+  // Block the loop to keep storage footprint down
+  for(uint64_t s=0;s<oSites;s+=siteBlock){
+
+    // remaining work in this block
+    int ssites=MIN(siteBlock,oSites-s);
+
+    // zero out the accumulators
+    accelerator_for(ss,siteBlock*nrot,vobj::Nsimd(),{
+	decltype(coalescedRead(Bp[ss])) z;
+	z=Zero();
+	coalescedWrite(Bp[ss],z);
+      });
+
+    accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
+	// sj enumerates (site, rotated vector) pairs with the vector index j running fastest
+	int j =sj%nrot;
+	int jj  =j0+j;
+	int ss =sj/nrot;
+	int sss=ss+s;
+
+	for(int k=k0; k<k1; ++k){
+	  auto tmp = coalescedRead(Bp[ss*nrot+j]);
+	  coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_v[k][sss]));
+	}
+      });
+    // copy the accumulated values of this chunk back into the rotated vectors
+    accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
+	int j =sj%nrot;
+	int jj  =j0+j;
+	int ss =sj/nrot;
+	int sss=ss+s;
+	coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
+      });
+  }
+#endif
+}
+
+// Build one rotated vector: result = sum_{k0<=k<k1} Qt(j,k)*basis[k]; the basis is only read
+template<class Field>
+void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
+{
+  using View = decltype(basis[0].AcceleratorView());
+  using vobj = typename Field::vector_object;
+  GridBase* grid = basis[0].Grid();
+  result.Checkerboard() = basis[0].Checkerboard();
+  // Open the output view first, then one read-only view per basis vector
+  auto out_v=result.AcceleratorView(ViewWrite);
+  Vector<View> in_v(basis.size(),out_v);
+  for(int b=0;b<basis.size();b++){
+    in_v[b] = basis[b].AcceleratorView(ViewRead);
+  }
+  vobj zero_site=Zero();
+  Vector<double> row_store(Nm);
+  double * row = & row_store[0];
+  for(int c=0;c<Nm;++c) row[c]=Qt(j,c);
+  accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
+    auto acc=coalescedRead(zero_site);
+    for(int c=k0; c<k1; ++c){
+      acc +=row[c] * coalescedRead(in_v[c][ss]);
+    }
+    coalescedWrite(out_v[ss], acc);
+  });
+}
+
+template<class Field>
+void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx)
+{
+  //////////////////////////////////////
+  // idx is a table of desired sources: on exit, slot p of _v and sort_vals
+  // holds what was stored in slot idx[p] on entry.
+  // The permutation is applied by successive swaps; idx is updated along
+  // the way and ends up as the identity.
+  //////////////////////////////////////
+  int vlen = idx.size();
+  assert(vlen>=1);
+  assert(vlen<=sort_vals.size());
+  assert(vlen<=_v.size());
+
+  for (size_t dst=0;dst<vlen;dst++) {
+    if (idx[dst] == dst) continue; // slot already holds its final element
+
+    // Find the slot whose desired source is the element currently at dst
+    size_t holder=dst;
+    while (holder<idx.size()) {
+      if (idx[holder]==dst) break;
+      holder++;
+    }
+    assert(idx[dst] > dst);     // slots before dst are settled, so the source lies further on
+    assert(holder!=idx.size()); // idx has to be a permutation
+    assert(idx[holder]==dst);
+
+    swap(_v[dst],_v[idx[dst]]); // should use vector move constructor, no data copy
+    std::swap(sort_vals[dst],sort_vals[idx[dst]]);
+
+    // The element slot `holder` is waiting for has just moved to idx[dst]
+    idx[holder] = idx[dst];
+    idx[dst] = dst;
+  }
+}
+
+inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals)
+{
+  // Index permutation that lists the entries of sort_vals by increasing absolute value
+  std::vector<int> order(sort_vals.size());
+  std::iota(order.begin(), order.end(), 0);
+  auto smaller_magnitude = [&sort_vals](int lhs, int rhs) {
+    return ::fabs(sort_vals[lhs]) < ::fabs(sort_vals[rhs]);
+  };
+  std::sort(order.begin(), order.end(), smaller_magnitude);
+  return order;
+}
+
+template<class Field>
+void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse)
+{
+  // Take the index order from basisSortGetIndex (flipped when reverse is set),
+  // then permute _v and sort_vals together with basisReorderInPlace.
+  std::vector<int> order = basisSortGetIndex(sort_vals);
+  if (reverse) { std::reverse(order.begin(), order.end()); }
+  basisReorderInPlace(_v,sort_vals,order);
+}
+
+// basisDeflate: result = sum_i v_i * <v_i,src_orig> / eval[i]  (presumes axpy(r,a,x,y) yields r = a*x + y - confirm).
+// PAB: faster to compute the inner products first then fuse loops. If performance critical can improve.
+template<class Field>
+void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
+  result = Zero();
+  assert(_v.size()==eval.size());
+  int N = (int)_v.size();
+  for (int i=0;i<N;i++) {
+    const Field& tmp = _v[i]; // _v is const, so its elements can only bind to a const reference
+    axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
+  }
+}
+
+NAMESPACE_END(Grid);

From 56e2f7d088aa36af4990742822c8cd47363d2391 Mon Sep 17 00:00:00 2001
From: ferben <ferben@debian.felix.com>
Date: Thu, 7 May 2020 10:03:45 +0100
Subject: [PATCH 26/51] deleted test routines. cleaned up fast version. assert
 Ns=4,Nc=3.

---
 Grid/qcd/utils/BaryonUtils.h | 414 ++++-------------------------------
 1 file changed, 40 insertions(+), 374 deletions(-)

diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h
index a392f223..241395c3 100644
--- a/Grid/qcd/utils/BaryonUtils.h
+++ b/Grid/qcd/utils/BaryonUtils.h
@@ -46,44 +46,11 @@ public:
   typedef typename SpinMatrixField::vector_object sobj;
 
   static const int epsilon[6][3] ;
-  //static const Complex epsilon_sgn[6];
   static const double epsilon_sgn[6];
 
   private: 
   template <class mobj, class robj>
-  static void baryon_site(const mobj &D1,
-				 const mobj &D2,
-				 const mobj &D3,
-				 const Gamma GammaA_left,
-				 const Gamma GammaB_left,
-				 const Gamma GammaA_right,
-				 const Gamma GammaB_right,
-				 const int parity,
-				 const int * wick_contractions,
-  				 robj &result);
-  template <class mobj, class robj, int w0, int w1, int w2, int w3, int w4, int w5>
-  static void baryon_site_macro(const mobj &D1,
-				 const mobj &D2,
-				 const mobj &D3,
-				 const Gamma GammaA_left,
-				 const Gamma GammaB_left,
-				 const Gamma GammaA_right,
-				 const Gamma GammaB_right,
-				 const int parity,
-  				 robj &result);
-  template <class mobj, class robj>
-  static void baryon_site_macro(const mobj &D1,
-				 const mobj &D2,
-				 const mobj &D3,
-				 const Gamma GammaA_left,
-				 const Gamma GammaB_left,
-				 const Gamma GammaA_right,
-				 const Gamma GammaB_right,
-				 const int parity,
-				 const int * wick_contractions,
-  				 robj &result);
-  template <class mobj, class robj>
-  static inline void baryon_site_template(unsigned int mask, const mobj &D1,
+  static inline void baryon_site(unsigned int mask, const mobj &D1,
 						 const mobj &D2,
 						 const mobj &D3,
 				                 const Gamma GammaA_left,
@@ -93,7 +60,7 @@ public:
 						 const int parity,
 						 robj &result);
   template <unsigned int mask, class mobj, class robj>
-  static inline void baryon_site_template(const mobj &D1,
+  static inline void baryon_site(const mobj &D1,
 						 const mobj &D2,
 						 const mobj &D3,
 				                 const Gamma GammaA_left,
@@ -208,266 +175,12 @@ public:
 
 template <class FImpl> 
 const int BaryonUtils<FImpl>::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
-/*template <class FImpl> 
-const Complex BaryonUtils<FImpl>::epsilon_sgn[6] = {Complex(1),
-						    Complex(1),
-						    Complex(1),
-						    Complex(-1),
-						    Complex(-1),
-						    Complex(-1)};
-*/
 template <class FImpl> 
 const double BaryonUtils<FImpl>::epsilon_sgn[6] = {1.0,1.0,1.0,-1.0,-1.0,-1.0};
 
-//This is the old version
-template <class FImpl>
-template <class mobj, class robj>
-void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
-						 const mobj &D2,
-						 const mobj &D3,
-				                 const Gamma GammaA_left,
-				                 const Gamma GammaB_left,
-				                 const Gamma GammaA_right,
-		                 		 const Gamma GammaB_right,
-						 const int parity,
-						 const int * wick_contraction,
-						 robj &result)
-{
-
-  Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4)
-
-    auto gD1a = GammaA_left * GammaA_right * D1;
-    auto gD1b = GammaA_left * g4 * GammaA_right * D1;
-    auto pD1 = 0.5* (gD1a + (double)parity * gD1b);
-    auto gD3 = GammaB_right * D3;
-
-    for (int ie_left=0; ie_left < 6 ; ie_left++){
-      int a_left = epsilon[ie_left][0]; //a
-      int b_left = epsilon[ie_left][1]; //b
-      int c_left = epsilon[ie_left][2]; //c
-      for (int ie_right=0; ie_right < 6 ; ie_right++){
-        int a_right = epsilon[ie_right][0]; //a'
-        int b_right = epsilon[ie_right][1]; //b'
-        int c_right = epsilon[ie_right][2]; //c'
-	//complex<double> ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right];
-	double ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right];
-        //This is the \delta_{456}^{123} part
-	if (wick_contraction[0]){
-          auto D2g = D2 * GammaB_left;
-	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
-	  for (int beta_left=0; beta_left<Ns; beta_left++){
-	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() += ee * pD1()(gamma_left,gamma_left)(c_right,c_left)*D2g()(alpha_right,beta_left)(a_right,a_left)*gD3()(alpha_right,beta_left)(b_right,b_left);
-          }}}
-  	}	  
-        //This is the \delta_{456}^{231} part
-	if (wick_contraction[1]){
-          auto pD1g = pD1 * GammaB_left;
-	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
-	  for (int beta_left=0; beta_left<Ns; beta_left++){
-	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() += ee * pD1g()(gamma_left,beta_left)(c_right,a_left)*D2()(alpha_right,beta_left)(a_right,b_left)*gD3()(alpha_right,gamma_left)(b_right,c_left);
-          }}}
-        }	  
-        //This is the \delta_{456}^{312} part
-	if (wick_contraction[2]){
-          auto gD3g = gD3 * GammaB_left;
-	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
-	  for (int beta_left=0; beta_left<Ns; beta_left++){
-	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() += ee * pD1()(gamma_left,beta_left)(c_right,b_left)*D2()(alpha_right,gamma_left)(a_right,c_left)*gD3g()(alpha_right,beta_left)(b_right,a_left);
-          }}}
-        }	  
-        //This is the \delta_{456}^{132} part
-	if (wick_contraction[3]){
-          auto gD3g = gD3 * GammaB_left;
-	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
-	  for (int beta_left=0; beta_left<Ns; beta_left++){
-	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() -= ee * pD1()(gamma_left,gamma_left)(c_right,c_left)*D2()(alpha_right,beta_left)(a_right,b_left)*gD3g()(alpha_right,beta_left)(b_right,a_left);
-          }}}
-        }	  
-        //This is the \delta_{456}^{321} part
-	if (wick_contraction[4]){
-          auto D2g = D2 * GammaB_left;
-	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
-	  for (int beta_left=0; beta_left<Ns; beta_left++){
-	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() -= ee * pD1()(gamma_left,beta_left)(c_right,b_left)*D2g()(alpha_right,beta_left)(a_right,a_left)*gD3()(alpha_right,gamma_left)(b_right,c_left);
-          }}}
-        }	  
-        //This is the \delta_{456}^{213} part
-	if (wick_contraction[5]){
-          auto pD1g = pD1 * GammaB_left;
-	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
-	  for (int beta_left=0; beta_left<Ns; beta_left++){
-	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() -= ee * pD1g()(gamma_left,beta_left)(c_right,a_left)*D2()(alpha_right,gamma_left)(a_right,c_left)*gD3()(alpha_right,beta_left)(b_right,b_left);
-          }}}
-        }	  
-      }
-    }
-}
-
-template <class FImpl>
-template <class mobj, class robj, int w0, int w1, int w2, int w3, int w4, int w5>
-void BaryonUtils<FImpl>::baryon_site_macro(const mobj &D1,
-						 const mobj &D2,
-						 const mobj &D3,
-				                 const Gamma GammaA_left,
-				                 const Gamma GammaB_left,
-				                 const Gamma GammaA_right,
-		                 		 const Gamma GammaB_right,
-						 const int parity,
-						 robj &result)
-{
-
-  Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4)
-
-    auto gD1a = GammaA_left * GammaA_right * D1;
-    auto gD1b = GammaA_left * g4 * GammaA_right * D1;
-    auto pD1 = 0.5* (gD1a + (double)parity * gD1b);
-    auto gD3 = GammaB_right * D3;
-
-    auto D2g = D2 * GammaB_left;
-    auto pD1g = pD1 * GammaB_left;
-    auto gD3g = gD3 * GammaB_left;
-
-    for (int ie_left=0; ie_left < 6 ; ie_left++){
-      int a_left = epsilon[ie_left][0]; //a
-      int b_left = epsilon[ie_left][1]; //b
-      int c_left = epsilon[ie_left][2]; //c
-      for (int ie_right=0; ie_right < 6 ; ie_right++){
-        int a_right = epsilon[ie_right][0]; //a'
-        int b_right = epsilon[ie_right][1]; //b'
-        int c_right = epsilon[ie_right][2]; //c'
-	double ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right];
-        //All parts together
-	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-            auto eepD1 = ee * pD1()(gamma_left,gamma_left)(c_right,c_left);
-	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
-            auto gD3_ag = gD3()(alpha_right,gamma_left)(b_right,c_left);
-	    auto D2_ag = D2()(alpha_right,gamma_left)(a_right,c_left);
-	  for (int beta_left=0; beta_left<Ns; beta_left++){
-            auto eepD1g_gb = ee * pD1g()(gamma_left,beta_left)(c_right,a_left);
-            auto eepD1_gb = ee * pD1()(gamma_left,beta_left)(c_right,b_left);
-	    auto D2g_ab = D2g()(alpha_right,beta_left)(a_right,a_left);
-	    auto D2_ab = D2()(alpha_right,beta_left)(a_right,b_left);
-	    auto gD3_ab = gD3()(alpha_right,beta_left)(b_right,b_left);
-	    auto gD3g_ab = gD3g()(alpha_right,beta_left)(b_right,a_left);
-	    if(w0){
-	        result()()() += eepD1*D2g_ab*gD3_ab;
-	    }
-  	    if(w1){
-		result()()() += eepD1g_gb*D2_ab*gD3_ag;
-	    }
-	    if(w2){
-		result()()() += eepD1_gb*D2_ag*gD3g_ab;
-	    }
-	    if(w3){
-    		result()()() -= eepD1*D2_ab*gD3g_ab;
-	    }
-    	    if(w4){
-		result()()() -= eepD1_gb*D2g_ab*gD3_ag;
-	    }
-            if(w5){
-    	        result()()() -= eepD1g_gb*D2_ag*gD3_ab;
-            }
-  	  }}}
-      }
-    }
-}
-
-#define BARYON_SITE(w0, w1, w2, w3, w4, w5, D1, D2, D3, GA_l, GB_l, GA_r, GB_r, parity, wick_contraction, result) \
-	if((wick_contraction[0] == w0) && (wick_contraction[1] == w1) &&  (wick_contraction[2] == w2) &&  (wick_contraction[3] == w3) &&  (wick_contraction[4] == w4) &&  (wick_contraction[5] == w5)) \
-{\
- baryon_site_macro<mobj, robj, w0, w1, w2, w3, w4, w5>( D1, D2, D3, GA_l, GB_l, GA_r, GB_r, parity, result );\
-}
-
-template <class FImpl>
-template <class mobj, class robj>
-void BaryonUtils<FImpl>::baryon_site_macro(const mobj &D1,
-						 const mobj &D2,
-						 const mobj &D3,
-				                 const Gamma GammaA_left,
-				                 const Gamma GammaB_left,
-				                 const Gamma GammaA_right,
-		                 		 const Gamma GammaB_right,
-						 const int parity,
-						 const int * wick_contraction,
-						 robj &result)
-{
-BARYON_SITE( 0 , 0 , 0 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 0 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 0 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 0 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 0 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 0 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 0 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 0 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 1 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 1 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 1 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 1 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 1 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 1 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 1 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 1 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 0 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 0 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 0 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 0 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 0 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 0 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 0 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 0 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 1 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 1 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 1 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 1 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 1 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 1 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 1 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 1 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 0 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 0 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 0 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 0 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 0 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 0 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 0 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 0 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 1 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 1 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 1 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 1 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 1 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 1 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 1 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 1 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 0 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 0 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 0 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 0 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 0 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 0 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 0 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 0 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 1 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 1 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 1 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 1 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 1 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 1 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 1 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 1 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
- 
-}
-
-
 template <class FImpl>
 template <unsigned int mask, class mobj, class robj>
-inline void BaryonUtils<FImpl>::baryon_site_template(const mobj &D1,
+inline void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
 						 const mobj &D2,
 						 const mobj &D3,
 				                 const Gamma GammaA_left,
@@ -556,7 +269,7 @@ inline void BaryonUtils<FImpl>::BaryonSiteHelper<maxMask>::function(const unsign
     assert(mask <= maxMask);
     if (mask == maxMask)
     {
-        baryon_site_template<maxMask,decltype(D1),decltype(result)>(D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result);
+        baryon_site<maxMask,decltype(D1),decltype(result)>(D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result);
     }
     else
     {
@@ -567,7 +280,7 @@ inline void BaryonUtils<FImpl>::BaryonSiteHelper<maxMask>::function(const unsign
 // top-level function
 template <class FImpl>
 template <class mobj, class robj>
-inline void BaryonUtils<FImpl>::baryon_site_template(const unsigned int mask, const mobj &D1,
+inline void BaryonUtils<FImpl>::baryon_site(const unsigned int mask, const mobj &D1,
 						 const mobj &D2,
 						 const mobj &D3,
 				                 const Gamma GammaA_left,
@@ -594,40 +307,30 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
 						 const int parity,
 						 ComplexField &baryon_corr)
 {
-    const std::chrono::system_clock::time_point start{ std::chrono::system_clock::now() };
-    std::time_t now = std::chrono::system_clock::to_time_t( start );
-    std::cout << "Setup start " << std::ctime( &now );
+   
+    assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
+    assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
 
-  std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
+    std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
     std::cout << "GammaA (left) " << (GammaA_left.g) <<  std::endl;
     std::cout << "GammaB (left) " << (GammaB_left.g) <<  std::endl;
     std::cout << "GammaA (right) " << (GammaA_right.g) <<  std::endl;
     std::cout << "GammaB (right) " << (GammaB_right.g) <<  std::endl;
  
-  assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
+    assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
 
-  GridBase *grid = q1_left.Grid();
+    GridBase *grid = q1_left.Grid();
 
-  int wick_contraction[6];
-  for (int ie=0; ie < 6 ; ie++)
-    wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0;
+    int wick_id;
+    for (int ie=0; ie < 6 ; ie++)
+      wick_id = ((quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0) << (5-ie);
 
-  auto vbaryon_corr= baryon_corr.View();
-  auto v1 = q1_left.View();
-  auto v2 = q2_left.View();
-  auto v3 = q3_left.View();
+    auto vbaryon_corr= baryon_corr.View();
+    auto v1 = q1_left.View();
+    auto v2 = q2_left.View();
+    auto v3 = q3_left.View();
 
-    const std::chrono::system_clock::time_point stop{ std::chrono::system_clock::now() };
-    now = std::chrono::system_clock::to_time_t( stop );
-    const std::chrono::duration<double> duration_seconds = stop - start;
-    const double seconds{ ( duration_seconds.count() ) };
-    std::cout << "Setup stop " << std::ctime( &now )
-              << "Total duration " << std::fixed << std::setprecision(5) << seconds << " seconds." << std::endl;
-
-    const std::chrono::system_clock::time_point start2{ std::chrono::system_clock::now() };
-    now = std::chrono::system_clock::to_time_t( start2 );
-    std::cout << "Normal Loop start " << std::ctime( &now );
- // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
+    // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
   thread_for(ss,grid->oSites(),{
   //for(int ss=0; ss < grid->oSites(); ss++){
 
@@ -636,58 +339,9 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
     auto D3 = v3[ss];
 
     vobj result=Zero();
-    baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
+    baryon_site(wick_id,D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,result);
     vbaryon_corr[ss] = result; 
   }  );//end loop over lattice sites
-    const std::chrono::system_clock::time_point stop2{ std::chrono::system_clock::now() };
-    now = std::chrono::system_clock::to_time_t( stop2 );
-    const std::chrono::duration<double> duration_seconds2 = stop2 - start2;
-    const double seconds2{ ( duration_seconds2.count() ) };
-    std::cout << "Normal Loop stop " << std::ctime( &now )
-              << "Total duration " << std::fixed << std::setprecision(5) << seconds2 << " seconds." << std::endl;
-	      const std::chrono::system_clock::time_point start4{ std::chrono::system_clock::now() };
-    now = std::chrono::system_clock::to_time_t( start4 );
-    std::cout << "Opt-macro Loop start " << std::ctime( &now );
- // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
-  thread_for(ss,grid->oSites(),{
-  //for(int ss=0; ss < grid->oSites(); ss++){
-
-    auto D1 = v1[ss];
-    auto D2 = v2[ss];
-    auto D3 = v3[ss];
-
-    vobj result=Zero();
-    baryon_site_macro(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
-    vbaryon_corr[ss] = result; 
-  }  );//end loop over lattice sites
-    const std::chrono::system_clock::time_point stop4{ std::chrono::system_clock::now() };
-    now = std::chrono::system_clock::to_time_t( stop4 );
-    const std::chrono::duration<double> duration_seconds4 = stop4 - start4;
-    const double seconds4{ ( duration_seconds4.count() ) };
-    std::cout << "Opt-macro Loop stop " << std::ctime( &now )
-              << "Total duration " << std::fixed << std::setprecision(5) << seconds4 << " seconds." << std::endl; 
-    const std::chrono::system_clock::time_point start3{ std::chrono::system_clock::now() };
-    now = std::chrono::system_clock::to_time_t( start3 );
-    int wick_id=32*wick_contraction[0]+16*wick_contraction[1]+8*wick_contraction[2]+4*wick_contraction[3]+2*wick_contraction[4]+wick_contraction[5];
-    std::cout << "Opt-template Loop start " << std::ctime( &now );
- // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
-  thread_for(ss,grid->oSites(),{
-  //for(int ss=0; ss < grid->oSites(); ss++){
-
-    auto D1 = v1[ss];
-    auto D2 = v2[ss];
-    auto D3 = v3[ss];
-
-    vobj result=Zero();
-    baryon_site_template(wick_id,D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,result);
-    vbaryon_corr[ss] = result; 
-  }  );//end loop over lattice sites
-    const std::chrono::system_clock::time_point stop3{ std::chrono::system_clock::now() };
-    now = std::chrono::system_clock::to_time_t( stop3 );
-    const std::chrono::duration<double> duration_seconds3 = stop3 - start3;
-    const double seconds3{ ( duration_seconds3.count() ) };
-    std::cout << "Opt-template Loop stop " << std::ctime( &now )
-              << "Total duration " << std::fixed << std::setprecision(5) << seconds3 << " seconds." << std::endl;
 
 }
 template <class FImpl>
@@ -704,20 +358,24 @@ void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
 						 const int parity,
 						 robj &result)
 {
-  std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
+
+    assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
+    assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
+    
+    std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
     std::cout << "GammaA (left) " << (GammaA_left.g) <<  std::endl;
     std::cout << "GammaB (left) " << (GammaB_left.g) <<  std::endl;
     std::cout << "GammaA (right) " << (GammaA_right.g) <<  std::endl;
     std::cout << "GammaB (right) " << (GammaB_right.g) <<  std::endl;
  
-  assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
+    assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
 
-  int wick_contraction[6];
-  for (int ie=0; ie < 6 ; ie++)
-    wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0;
-
-     result=Zero();
-     baryon_site<decltype(D1),decltype(result)>(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
+    int wick_id;
+    for (int ie=0; ie < 6 ; ie++)
+      wick_id = ((quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0) << (5-ie);
+  
+    result=Zero();
+    baryon_site(wick_id,D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,result);
 }
 
 /***********************************************************************
@@ -957,6 +615,10 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
 						 const std::string op,
 						 SpinMatrixField &stn_corr)
 {
+
+  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
+  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
+
   GridBase *grid = qs_ti.Grid();
 
   auto vcorr= stn_corr.View();
@@ -994,6 +656,10 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti,
 						 const std::string op,
 						 SpinMatrixField &stn_corr)
 {
+
+  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
+  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
+
   GridBase *grid = qs_ti.Grid();
 
   auto vcorr= stn_corr.View();

From 253bcc3426a212675cc497147c6142c6700102ee Mon Sep 17 00:00:00 2001
From: ferben <ferben@debian.felix.com>
Date: Thu, 7 May 2020 18:03:17 +0100
Subject: [PATCH 27/51] back to old version

---
 Grid/qcd/utils/BaryonUtils.h | 249 +++++++++++++++--------------------
 1 file changed, 105 insertions(+), 144 deletions(-)

diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h
index 241395c3..6cf526c3 100644
--- a/Grid/qcd/utils/BaryonUtils.h
+++ b/Grid/qcd/utils/BaryonUtils.h
@@ -46,44 +46,20 @@ public:
   typedef typename SpinMatrixField::vector_object sobj;
 
   static const int epsilon[6][3] ;
-  static const double epsilon_sgn[6];
+  static const Complex epsilon_sgn[6];
 
   private: 
   template <class mobj, class robj>
-  static inline void baryon_site(unsigned int mask, const mobj &D1,
-						 const mobj &D2,
-						 const mobj &D3,
-				                 const Gamma GammaA_left,
-				                 const Gamma GammaB_left,
-				                 const Gamma GammaA_right,
-		                 		 const Gamma GammaB_right,
-						 const int parity,
-						 robj &result);
-  template <unsigned int mask, class mobj, class robj>
-  static inline void baryon_site(const mobj &D1,
-						 const mobj &D2,
-						 const mobj &D3,
-				                 const Gamma GammaA_left,
-				                 const Gamma GammaB_left,
-				                 const Gamma GammaA_right,
-		                 		 const Gamma GammaB_right,
-						 const int parity,
-						 robj &result);
-						 
-  template <unsigned int maxMask>
-  struct BaryonSiteHelper
-  {
-  template <class mobj, class robj>
-    static inline void function(const unsigned int mask, const mobj &D1,
-						 const mobj &D2,
-						 const mobj &D3,
-				                 const Gamma GammaA_left,
-				                 const Gamma GammaB_left,
-				                 const Gamma GammaA_right,
-		                 		 const Gamma GammaB_right,
-						 const int parity,
-						 robj &result);
-						 };
+  static void baryon_site(const mobj &D1,
+				 const mobj &D2,
+				 const mobj &D3,
+				 const Gamma GammaA_left,
+				 const Gamma GammaB_left,
+				 const Gamma GammaA_right,
+				 const Gamma GammaB_right,
+				 const int parity,
+				 const int * wick_contractions,
+  				 robj &result);
   public:
   static void ContractBaryons(const PropagatorField &q1_left,
 				 const PropagatorField &q2_left,
@@ -176,11 +152,17 @@ public:
 template <class FImpl> 
 const int BaryonUtils<FImpl>::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
 template <class FImpl> 
-const double BaryonUtils<FImpl>::epsilon_sgn[6] = {1.0,1.0,1.0,-1.0,-1.0,-1.0};
+const Complex BaryonUtils<FImpl>::epsilon_sgn[6] = {Complex(1),
+						    Complex(1),
+						    Complex(1),
+						    Complex(-1),
+						    Complex(-1),
+						    Complex(-1)};
 
+//This is the old version
 template <class FImpl>
-template <unsigned int mask, class mobj, class robj>
-inline void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
+template <class mobj, class robj>
+void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
 						 const mobj &D2,
 						 const mobj &D3,
 				                 const Gamma GammaA_left,
@@ -188,14 +170,9 @@ inline void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
 				                 const Gamma GammaA_right,
 		                 		 const Gamma GammaB_right,
 						 const int parity,
+						 const int * wick_contraction,
 						 robj &result)
 {
-    constexpr bool wick_contraction_0 = ((mask & (1 << 5)) >> 5);
-    constexpr bool wick_contraction_1 = ((mask & (1 << 4)) >> 4);
-    constexpr bool wick_contraction_2 = ((mask & (1 << 3)) >> 3);
-    constexpr bool wick_contraction_3 = ((mask & (1 << 2)) >> 2);
-    constexpr bool wick_contraction_4 = ((mask & (1 << 1)) >> 1);
-    constexpr bool wick_contraction_5 = ((mask & (1 << 0)) >> 0);
 
   Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4)
 
@@ -216,84 +193,77 @@ inline void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
         int a_right = epsilon[ie_right][0]; //a'
         int b_right = epsilon[ie_right][1]; //b'
         int c_right = epsilon[ie_right][2]; //c'
-	double ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right];
-        //All parts together
+	Complex ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right];
+        //This is the \delta_{456}^{123} part
+	if (wick_contraction[0]){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
             auto eepD1 = ee * pD1()(gamma_left,gamma_left)(c_right,c_left);
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+	    auto D2g_ab = D2g()(alpha_right,beta_left)(a_right,a_left);
+	    auto gD3_ab = gD3()(alpha_right,beta_left)(b_right,b_left);
+	        result()()() += eepD1*D2g_ab*gD3_ab;
+          }}}
+  	}	  
+        //This is the \delta_{456}^{231} part
+	if (wick_contraction[1]){
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
             auto gD3_ag = gD3()(alpha_right,gamma_left)(b_right,c_left);
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+            auto eepD1g_gb = ee * pD1g()(gamma_left,beta_left)(c_right,a_left);
+	    auto D2_ab = D2()(alpha_right,beta_left)(a_right,b_left);
+		result()()() += eepD1g_gb*D2_ab*gD3_ag;
+          }}}
+        }	  
+        //This is the \delta_{456}^{312} part
+	if (wick_contraction[2]){
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+	    auto D2_ag = D2()(alpha_right,gamma_left)(a_right,c_left);
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+            auto eepD1_gb = ee * pD1()(gamma_left,beta_left)(c_right,b_left);
+	    auto gD3g_ab = gD3g()(alpha_right,beta_left)(b_right,a_left);
+		result()()() += eepD1_gb*D2_ag*gD3g_ab;
+          }}}
+        }	  
+        //This is the \delta_{456}^{132} part
+	if (wick_contraction[3]){
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
+            auto eepD1 = ee * pD1()(gamma_left,gamma_left)(c_right,c_left);
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+	    auto D2_ab = D2()(alpha_right,beta_left)(a_right,b_left);
+	    auto gD3g_ab = gD3g()(alpha_right,beta_left)(b_right,a_left);
+    		result()()() -= eepD1*D2_ab*gD3g_ab;
+          }}}
+        }	  
+        //This is the \delta_{456}^{321} part
+	if (wick_contraction[4]){
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+            auto gD3_ag = gD3()(alpha_right,gamma_left)(b_right,c_left);
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+            auto eepD1_gb = ee * pD1()(gamma_left,beta_left)(c_right,b_left);
+	    auto D2g_ab = D2g()(alpha_right,beta_left)(a_right,a_left);
+		result()()() -= eepD1_gb*D2g_ab*gD3_ag;
+          }}}
+        }	  
+        //This is the \delta_{456}^{213} part
+	if (wick_contraction[5]){
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	    auto D2_ag = D2()(alpha_right,gamma_left)(a_right,c_left);
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
             auto eepD1g_gb = ee * pD1g()(gamma_left,beta_left)(c_right,a_left);
-            auto eepD1_gb = ee * pD1()(gamma_left,beta_left)(c_right,b_left);
-	    auto D2g_ab = D2g()(alpha_right,beta_left)(a_right,a_left);
-	    auto D2_ab = D2()(alpha_right,beta_left)(a_right,b_left);
 	    auto gD3_ab = gD3()(alpha_right,beta_left)(b_right,b_left);
-	    auto gD3g_ab = gD3g()(alpha_right,beta_left)(b_right,a_left);
-	    if(wick_contraction_0){
-	      result()()() += eepD1*D2g_ab*gD3_ab;
-	    }
-  	    if(wick_contraction_1){
-		    result()()() += eepD1g_gb*D2_ab*gD3_ag;
-	    }
-	    if(wick_contraction_2){
-		    result()()() += eepD1_gb*D2_ag*gD3g_ab;
-	    }
-            if(wick_contraction_3){
-    		result()()() -= eepD1*D2_ab*gD3g_ab;
-	    }
-    	    if(wick_contraction_4){
-		result()()() -= eepD1_gb*D2g_ab*gD3_ag;
-	    }
-            if(wick_contraction_5){
     	        result()()() -= eepD1g_gb*D2_ag*gD3_ab;
-            }
-  	  }}}
+          }}}
+        }	  
       }
     }
 }
 
-template <class FImpl>
-template <unsigned int maxMask>
-template <class mobj, class robj>
-inline void BaryonUtils<FImpl>::BaryonSiteHelper<maxMask>::function(const unsigned int mask, const mobj &D1,
-						 const mobj &D2,
-						 const mobj &D3,
-				                 const Gamma GammaA_left,
-				                 const Gamma GammaB_left,
-				                 const Gamma GammaA_right,
-		                 		 const Gamma GammaB_right,
-						 const int parity,
-						 robj &result)
-{
-    assert(mask <= maxMask);
-    if (mask == maxMask)
-    {
-        baryon_site<maxMask,decltype(D1),decltype(result)>(D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result);
-    }
-    else
-    {
-        BaryonSiteHelper<(maxMask>0) ? maxMask-1 : 0>::function(mask, D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result);
-    }
-}
-
-// top-level function
-template <class FImpl>
-template <class mobj, class robj>
-inline void BaryonUtils<FImpl>::baryon_site(const unsigned int mask, const mobj &D1,
-						 const mobj &D2,
-						 const mobj &D3,
-				                 const Gamma GammaA_left,
-				                 const Gamma GammaB_left,
-				                 const Gamma GammaA_right,
-		                 		 const Gamma GammaB_right,
-						 const int parity,
-						 robj &result)
-{
-    BaryonSiteHelper<63>::function(mask, D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result);
-}
-
-
 template<class FImpl>
 void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
 						 const PropagatorField &q2_left,
@@ -307,30 +277,30 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
 						 const int parity,
 						 ComplexField &baryon_corr)
 {
-   
-    assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
-    assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
 
-    std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
+  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
+  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
+
+  std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
     std::cout << "GammaA (left) " << (GammaA_left.g) <<  std::endl;
     std::cout << "GammaB (left) " << (GammaB_left.g) <<  std::endl;
     std::cout << "GammaA (right) " << (GammaA_right.g) <<  std::endl;
     std::cout << "GammaB (right) " << (GammaB_right.g) <<  std::endl;
  
-    assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
+  assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
 
-    GridBase *grid = q1_left.Grid();
+  GridBase *grid = q1_left.Grid();
 
-    int wick_id;
-    for (int ie=0; ie < 6 ; ie++)
-      wick_id = ((quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0) << (5-ie);
+  int wick_contraction[6];
+  for (int ie=0; ie < 6 ; ie++)
+    wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0;
 
-    auto vbaryon_corr= baryon_corr.View();
-    auto v1 = q1_left.View();
-    auto v2 = q2_left.View();
-    auto v3 = q3_left.View();
+  auto vbaryon_corr= baryon_corr.View();
+  auto v1 = q1_left.View();
+  auto v2 = q2_left.View();
+  auto v3 = q3_left.View();
 
-    // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
+ // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
   thread_for(ss,grid->oSites(),{
   //for(int ss=0; ss < grid->oSites(); ss++){
 
@@ -339,10 +309,9 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
     auto D3 = v3[ss];
 
     vobj result=Zero();
-    baryon_site(wick_id,D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,result);
+    baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
     vbaryon_corr[ss] = result; 
   }  );//end loop over lattice sites
-
 }
 template <class FImpl>
 template <class mobj, class robj>
@@ -359,23 +328,23 @@ void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
 						 robj &result)
 {
 
-    assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
-    assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
-    
-    std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
+  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
+  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
+
+  std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
     std::cout << "GammaA (left) " << (GammaA_left.g) <<  std::endl;
     std::cout << "GammaB (left) " << (GammaB_left.g) <<  std::endl;
     std::cout << "GammaA (right) " << (GammaA_right.g) <<  std::endl;
     std::cout << "GammaB (right) " << (GammaB_right.g) <<  std::endl;
  
-    assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
+  assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
 
-    int wick_id;
-    for (int ie=0; ie < 6 ; ie++)
-      wick_id = ((quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0) << (5-ie);
-  
-    result=Zero();
-    baryon_site(wick_id,D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,result);
+  int wick_contraction[6];
+  for (int ie=0; ie < 6 ; ie++)
+    wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0;
+
+     result=Zero();
+     baryon_site<decltype(D1),decltype(result)>(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
 }
 
 /***********************************************************************
@@ -615,10 +584,6 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
 						 const std::string op,
 						 SpinMatrixField &stn_corr)
 {
-
-  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
-  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
-
   GridBase *grid = qs_ti.Grid();
 
   auto vcorr= stn_corr.View();
@@ -656,10 +621,6 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti,
 						 const std::string op,
 						 SpinMatrixField &stn_corr)
 {
-
-  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
-  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
-
   GridBase *grid = qs_ti.Grid();
 
   auto vcorr= stn_corr.View();

From 42bb5f0721de5cfdf0d16c6eda66a9fd9f4d13c6 Mon Sep 17 00:00:00 2001
From: ferben <ferben@debian.felix.com>
Date: Thu, 7 May 2020 18:06:12 +0100
Subject: [PATCH 28/51] assertion

---
 Grid/qcd/utils/BaryonUtils.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h
index 6cf526c3..fa2f3376 100644
--- a/Grid/qcd/utils/BaryonUtils.h
+++ b/Grid/qcd/utils/BaryonUtils.h
@@ -584,6 +584,10 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
 						 const std::string op,
 						 SpinMatrixField &stn_corr)
 {
+
+  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
+  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
+
   GridBase *grid = qs_ti.Grid();
 
   auto vcorr= stn_corr.View();
@@ -621,6 +625,10 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti,
 						 const std::string op,
 						 SpinMatrixField &stn_corr)
 {
+
+  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
+  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
+
   GridBase *grid = qs_ti.Grid();
 
   auto vcorr= stn_corr.View();

From 21ca182c368ba2415d874c744b8def244c0b37dd Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 8 May 2020 09:18:24 -0400
Subject: [PATCH 29/51] Comments remove

---
 Grid/algorithms/LinearOperator.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/Grid/algorithms/LinearOperator.h b/Grid/algorithms/LinearOperator.h
index 50600d2d..a7fa1a90 100644
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@@ -257,13 +257,11 @@ public:
       virtual  RealD Mpc      (const Field &in, Field &out) {
       Field tmp(in.Grid());
       tmp.Checkerboard() = !in.Checkerboard();
-	//std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << "  _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
 
 	_Mat.Meooe(in,tmp);
 	_Mat.MooeeInv(tmp,out);
 	_Mat.Meooe(out,tmp);
 
-      //std::cout << "cb in " << in.Checkerboard() << "  cb out " << out.Checkerboard() << std::endl;
 	_Mat.Mooee(in,out);
 	return axpy_norm(out,-1.0,tmp,out);
       }

From 6859a3e1d4cf2487b6d2f7f560f845df4fd5a7af Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 8 May 2020 09:19:12 -0400
Subject: [PATCH 30/51] Schur operator

---
 benchmarks/Benchmark_schur.cc | 176 ++++++++++++++++++++++++++++++++++
 1 file changed, 176 insertions(+)
 create mode 100644 benchmarks/Benchmark_schur.cc

diff --git a/benchmarks/Benchmark_schur.cc b/benchmarks/Benchmark_schur.cc
new file mode 100644
index 00000000..afee31b0
--- /dev/null
+++ b/benchmarks/Benchmark_schur.cc
@@ -0,0 +1,176 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./benchmarks/Benchmark_schur.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+
+  Gamma::Algebra Gmu [] = { // Gamma-algebra identifiers for X, Y, Z, T. NOTE(review): not referenced anywhere else in this file
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ,
+    Gamma::Algebra::GammaT
+  };
+
+void benchDw(std::vector<int> & L, int Ls);
+
+int main (int argc, char ** argv) // Benchmark driver: prints a banner, then calls benchDw once per lattice volume stored in latts
+{
+  Grid_init(&argc,&argv);
+
+  // Ls is forwarded to benchDw, which passes it to the five-dimensional grid constructors
+  const int Ls=12;
+  std::vector< std::vector<int> > latts;
+#if 0
+  latts.push_back(std::vector<int> ({24,24,24,24}) );
+  latts.push_back(std::vector<int> ({48,24,24,24}) );
+  latts.push_back(std::vector<int> ({96,24,24,24}) );
+  latts.push_back(std::vector<int> ({96,48,24,24}) );
+  //  latts.push_back(std::vector<int> ({96,48,48,24}) );
+  //  latts.push_back(std::vector<int> ({96,48,48,48}) );
+#else
+  //  latts.push_back(std::vector<int> ({96,48,48,48}) );
+  latts.push_back(std::vector<int> ({96,96,96,192}) ); // the only volume enabled while the #if 0 branch above is disabled
+#endif
+
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl; // report whichever kernel option is currently selected
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking DWF"<<std::endl;
+  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "Volume \t\t\tProcs \t SchurDiagOne "<<std::endl; // column header for the rows benchDw prints
+  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
+
+  for (int l=0;l<latts.size();l++){
+    std::vector<int> latt4 = latts[l];
+    std::cout << GridLogMessage <<"\t";
+    for(int d=0;d<Nd;d++){
+      std::cout<<latt4[d]<<"x"; // each extent is followed by "x", so the row starts with d0xd1x...x
+    }
+    std::cout <<Ls<<"\t" ; // Ls completes the volume label; benchDw appends the timings to the same line
+    benchDw (latt4,Ls);
+  }
+  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
+  Grid_finalize();
+}
+
+
+void benchDw(std::vector<int> & latt4, int Ls) // Times Mpc.Mpc and a Chebyshev polynomial of Mpc on the volume latt4 x Ls and prints flop rates to std::cout
+{
+  /////////////////////////////////////////////////////////////////////////////////////
+  // Flop accounting used for the rates printed below, for Nc=3
+  /////////////////////////////////////////////////////////////////////////////////////
+  // Dw :  Ls*24*(7+48)= Ls*1320 
+  //
+  // M5D:  Ls*(4*2*Nc mul + 4*2*Nc madd ) = 3*4*2*Nc*Ls = Ls*72
+  // Meo:  Ls*24*(7+48) + Ls*72 = Ls*1392 
+  //
+  // Mee:  3*Ns*2*Nc*Ls  // Chroma 6*N5*Nc*Ns 
+  //
+  // LeemInv : 2*2*Nc*madd*Ls
+  // LeeInv  : 2*2*Nc*madd*Ls
+  // DeeInv  : 4*2*Nc*mul *Ls
+  // UeeInv  : 2*2*Nc*madd*Ls
+  // UeemInv : 2*2*Nc*madd*Ls = Nc*Ls*(8+8+8+8+8) = 40*Nc*Ls// Chroma (10*N5 - 8)*Nc*Ns ~ (40 N5 - 32)Nc flops
+  // QUDA counts as dense LsxLs real matrix x Ls x NcNsNreim => Nc*4*2 x Ls^2 FMA = 16Nc Ls^2 flops
+  // Mpc => 1452*cbvol*2*Ls flops // 
+  //     => (1344+Ls*48)*Ls*cbvol*2 flops QUDA = 1920 @Ls=12 and 2112 @Ls=16
+  /////////////////////////////////////////////////////////////////////////////////////
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); // NOTE(review): none of the four grid pointers is deleted in this function - confirm ownership
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+  //  long unsigned int single_site_flops     = 8*Nc*(7+16*Nc)*Ls;
+  long unsigned int single_site_mpc_flops = 8*Nc*(7+16*Nc)*2*Ls + 40*Nc*2*Ls + 4*Nc*2*Ls; // NOTE(review): equals 2904*Ls for Nc=3, while the block above quotes 1452*cbvol*2*Ls - confirm the intended normalisation
+  long unsigned int single_site_quda_flops = 8*Nc*(7+16*Nc)*2*Ls + 16*Nc*Ls*Ls + 4*Nc*2*Ls; // same hopping and final terms, with the 16*Nc*Ls^2 dense count from the block above in the middle
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  // NOTE(review): seeds4, seeds5 and cm (below) are not referenced again in this function
+
+  ColourMatrixF cm = ComplexF(1.0,0.0);
+
+  int ncall=300; // number of timed Mpc applications
+  RealD mass=0.1;
+  RealD M5  =1.8;
+  RealD NP = UGrid->_Nprocessors; // printed as the first column of the result row
+  double volume=1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; // product of latt4[0..Nd-1]; Ls is not included
+
+  LatticeGaugeFieldF Umu(UGrid); Umu=Zero(); // the operator is built on an all-zero Umu
+  MobiusFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.5,0.5);
+  // Source (set to 1.0) and result (set to zero) fields, both on FrbGrid
+  LatticeFermionF src_o (FrbGrid); src_o=1.0;
+  LatticeFermionF r_o   (FrbGrid); r_o=Zero();
+
+  int order =151; // passed to Chebyshev and also used in its flop estimate below
+  SchurDiagOneOperator<MobiusFermionF,LatticeFermionF>  Mpc(Dw);
+  Chebyshev<LatticeFermionF>      Cheby(0.0,60.0,order);
+
+  {
+    Mpc.Mpc(src_o,r_o); // three untimed applications before the timed loop
+    Mpc.Mpc(src_o,r_o);
+    Mpc.Mpc(src_o,r_o);
+
+    double t0=usecond(); // presumably microseconds: the elapsed time is divided by 1e6 and labelled "s" below
+    for(int i=0;i<ncall;i++){
+      Mpc.Mpc(src_o,r_o);
+    }
+    double t1=usecond();
+
+    double flops=(single_site_mpc_flops*volume*ncall); // Mpc has 1 - Moo^-1 Moe Mee^-1 Meo  so CB cancels.
+    std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0); // NP, then the rate using single_site_mpc_flops (flops per usecond() unit)
+    flops=(single_site_quda_flops*volume*ncall);
+    std::cout <<"\t"<<flops/(t1-t0)<<"\t"<<(t1-t0)/1000./1000.<<" s\t"; // rate using single_site_quda_flops, then elapsed time
+
+    // Cheby uses MpcDagMpc so 2x flops; each of the 100 iterations runs one untimed and one timed application
+    for(int i=0;i<100;i++){
+    Cheby(Mpc,src_o,r_o);
+    t0=usecond();
+    Cheby(Mpc,src_o,r_o);
+    t1=usecond();
+    flops=(single_site_mpc_flops*volume*2*order);
+    std::cout <<"\t"<<flops/(t1-t0);
+    flops=(single_site_quda_flops*volume*2*order);
+    std::cout <<"\t"<<flops/(t1-t0) << "\t" << (t1-t0)/1000./1000. <<" s";
+    std::cout <<std::endl; // one output line per Chebyshev timing; the first also carries the Mpc columns printed above
+    }
+  }
+  //  Dw.Report();
+}
+
+
+

From 93920c481146e8ad46e8abacd1c314c4d65b9571 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 8 May 2020 09:19:54 -0400
Subject: [PATCH 31/51] Remove verbose

---
 Grid/qcd/action/fermion/MobiusFermion.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/qcd/action/fermion/MobiusFermion.h b/Grid/qcd/action/fermion/MobiusFermion.h
index 1cbb6609..1e948092 100644
--- a/Grid/qcd/action/fermion/MobiusFermion.h
+++ b/Grid/qcd/action/fermion/MobiusFermion.h
@@ -59,7 +59,7 @@ public:
   {
     RealD eps = 1.0;
 
-    std::cout<<GridLogMessage << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Tanh approx"<<std::endl;
+    //    std::cout<<GridLogMessage << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Tanh approx"<<std::endl;
     Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
     assert(zdata->n==this->Ls);
 	

From 1d65e2f62ccb3555e6fec9413960bd7b8f46b84d Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 8 May 2020 09:20:54 -0400
Subject: [PATCH 32/51] Slightly faster Chebyshev; ifdef'ed out the fastest
 until tested numerics. Lifted from HDCR setup

---
 Grid/algorithms/approx/Chebyshev.h | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/Grid/algorithms/approx/Chebyshev.h b/Grid/algorithms/approx/Chebyshev.h
index 133db2b4..c0b0646d 100644
--- a/Grid/algorithms/approx/Chebyshev.h
+++ b/Grid/algorithms/approx/Chebyshev.h
@@ -234,10 +234,9 @@ public:
 
     GridBase *grid=in.Grid();
 
-    // std::cout << "Chevyshef(): in.Grid()="<<in.Grid()<<std::endl;
-    //std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
-
     int vol=grid->gSites();
+    typedef typename Field::vector_type vector_type;
+    constexpr int Nsimd = vector_type::Nsimd();
 
     Field T0(grid); T0 = in;  
     Field T1(grid); 
@@ -258,14 +257,27 @@ public:
     //    out = ()*T0 + Coeffs[1]*T1;
     axpby(out,0.5*Coeffs[0],Coeffs[1],T0,T1);
     for(int n=2;n<order;n++){
-	
+
       Linop.HermOp(*Tn,y);
-      //     y=xscale*y+mscale*(*Tn);
-      //      *Tnp=2.0*y-(*Tnm);
-      //      out=out+Coeffs[n]* (*Tnp);
+#if 0
+      auto y_v = y.View();
+      auto Tn_v = Tn->View();
+      auto Tnp_v = Tnp->View();
+      auto Tnm_v = Tnm->View();
+      accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, {
+	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
+	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
+      });
+      if ( Coeffs[n] != 0.0) {
+	axpy(out,Coeffs[n],*Tnp,out);
+      }
+#else
       axpby(y,xscale,mscale,y,(*Tn));
       axpby(*Tnp,2.0,-1.0,y,(*Tnm));
-      axpy(out,Coeffs[n],*Tnp,out);
+      if ( Coeffs[n] != 0.0) {
+	axpy(out,Coeffs[n],*Tnp,out);
+      }
+#endif
       // Cycle pointers to avoid copies
       Field *swizzle = Tnm;
       Tnm    =Tn;

From 0c570824f24e2033b15d5efa291430561a9c2c83 Mon Sep 17 00:00:00 2001
From: Daniel Richtmann <daniel.richtmann@gmail.com>
Date: Tue, 21 Apr 2020 13:26:43 +0200
Subject: [PATCH 33/51] Add missing declaration of GridCmdOptionInt

---
 Grid/util/Init.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Grid/util/Init.h b/Grid/util/Init.h
index f7f032ba..dad963a0 100644
--- a/Grid/util/Init.h
+++ b/Grid/util/Init.h
@@ -56,6 +56,7 @@ std::string GridCmdVectorIntToString(const VectorInt & vec);
 void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec);
 template<class VectorInt>
 void GridCmdOptionIntVector(std::string &str,VectorInt & vec);
+void GridCmdOptionInt(std::string &str,int & val);
 
 
 void GridParseLayout(char **argv,int argc,

From 779e3c74425c9d5e0a65abc0128e215a911fe90a Mon Sep 17 00:00:00 2001
From: Daniel Richtmann <daniel.richtmann@gmail.com>
Date: Tue, 21 Apr 2020 13:30:08 +0200
Subject: [PATCH 34/51] Const-correctness for retrieval routines of
 GridStopWatch

---
 Grid/perfmon/Timer.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Grid/perfmon/Timer.h b/Grid/perfmon/Timer.h
index 88b4e1cc..2a44faee 100644
--- a/Grid/perfmon/Timer.h
+++ b/Grid/perfmon/Timer.h
@@ -110,15 +110,15 @@ public:
 #endif
     accumulator = std::chrono::duration_cast<GridUsecs>(start-start); 
   }
-  GridTime Elapsed(void) {
+  GridTime Elapsed(void) const {
     assert(running == false);
     return std::chrono::duration_cast<GridTime>( accumulator );
   }
-  uint64_t useconds(void){
+  uint64_t useconds(void) const {
     assert(running == false);
     return (uint64_t) accumulator.count();
   }
-  bool isRunning(void){
+  bool isRunning(void) const {
     return running;
   }
 };

From ab0c5d77fbce6a0525fbe4385d45bec58f3bb3cb Mon Sep 17 00:00:00 2001
From: Daniel Richtmann <daniel.richtmann@gmail.com>
Date: Wed, 22 Apr 2020 19:50:30 +0200
Subject: [PATCH 35/51] Correct NonHermitianSchurOperatorBase

---
 Grid/algorithms/LinearOperator.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Grid/algorithms/LinearOperator.h b/Grid/algorithms/LinearOperator.h
index a7fa1a90..c41f8eef 100644
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@@ -364,6 +364,9 @@ public:
         void OpDir(const Field& in, Field& out, int dir, int disp) {
           assert(0);
         }
+        void OpDirAll(const Field& in, std::vector<Field>& out){ // counterpart to OpDir that takes a vector of output fields; not supported on this operator
+          assert(0); // no implementation is provided here: trips the assertion, same as OpDir above
+        };
     };
 
     template<class Matrix, class Field>

From c83471bfd098c4ad36f5f368231c68dfa48ca6bf Mon Sep 17 00:00:00 2001
From: Daniel Richtmann <daniel.richtmann@gmail.com>
Date: Thu, 23 Apr 2020 10:54:19 +0200
Subject: [PATCH 36/51] Fix missing checkerboards for adj and conjugate

---
 Grid/lattice/Lattice_reality.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Grid/lattice/Lattice_reality.h b/Grid/lattice/Lattice_reality.h
index 7373b2f9..96af8dab 100644
--- a/Grid/lattice/Lattice_reality.h
+++ b/Grid/lattice/Lattice_reality.h
@@ -40,6 +40,7 @@ NAMESPACE_BEGIN(Grid);
 
 template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
   Lattice<vobj> ret(lhs.Grid());
+  ret.Checkerboard()=lhs.Checkerboard();
   auto lhs_v = lhs.View();
   auto ret_v = ret.View();
   accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
@@ -50,6 +51,7 @@ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
 
 template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
   Lattice<vobj> ret(lhs.Grid());
+  ret.Checkerboard() = lhs.Checkerboard();
   auto lhs_v = lhs.View();
   auto ret_v = ret.View();
   accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {

From 2b576fc1852cf0dcb526b21848586f16fc730ce1 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 8 May 2020 18:54:29 -0400
Subject: [PATCH 37/51] Remove commented-out dead code

---
 benchmarks/Benchmark_staggered.cc | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/benchmarks/Benchmark_staggered.cc b/benchmarks/Benchmark_staggered.cc
index 93086927..17b73c57 100644
--- a/benchmarks/Benchmark_staggered.cc
+++ b/benchmarks/Benchmark_staggered.cc
@@ -88,25 +88,6 @@ int main (int argc, char ** argv)
     U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
   }
   ref = Zero();
-  /*  
-  { // Naive wilson implementation
-    ref = Zero();
-    for(int mu=0;mu<Nd;mu++){
-      //    ref =  src + Gamma(Gamma::GammaX)* src ; // 1-gamma_x
-      tmp = U[mu]*Cshift(src,mu,1);
-      for(int i=0;i<ref._odata.size();i++){
-	ref[i]+= tmp[i] - Gamma(Gmu[mu])*tmp[i]; ;
-      }
-
-      tmp =adj(U[mu])*src;
-      tmp =Cshift(tmp,mu,-1);
-      for(int i=0;i<ref._odata.size();i++){
-	ref[i]+= tmp[i] + Gamma(Gmu[mu])*tmp[i]; ;
-      }
-    }
-  }
-  ref = -0.5*ref;
-  */
 
   RealD mass=0.1;
   RealD c1=9.0/8.0;

From ee1de82a532858200260cc47f658640b9f73a643 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 8 May 2020 18:54:50 -0400
Subject: [PATCH 38/51] Working ITT benchmark again

---
 benchmarks/Benchmark_ITT.cc | 584 +++++++++++++++---------------------
 1 file changed, 235 insertions(+), 349 deletions(-)

diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc
index 7ad4a147..1bb77aff 100644
--- a/benchmarks/Benchmark_ITT.cc
+++ b/benchmarks/Benchmark_ITT.cc
@@ -30,7 +30,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 
 using namespace Grid;
 
-
 std::vector<int> L_list;
 std::vector<int> Ls_list;
 std::vector<double> mflop_list;
@@ -76,7 +75,6 @@ struct controls {
   int Opt;
   int CommsOverlap;
   Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch;
-  //  int HugePages;
 };
 
 class Benchmark {
@@ -119,14 +117,15 @@ public:
     std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
     comms_header();
 
-    for(int lat=4;lat<=maxlat;lat+=4){
-      for(int Ls=8;Ls<=8;Ls*=2){
+    for(int lat=16;lat<=maxlat;lat+=8){
+      //      for(int Ls=8;Ls<=8;Ls*=2){
+      { int Ls=12;
 
 	Coordinate latt_size  ({lat*mpi_layout[0],
 	      lat*mpi_layout[1],
 	      lat*mpi_layout[2],
 	      lat*mpi_layout[3]});
-
+	std::cout << GridLogMessage<< latt_size <<std::endl;
 	GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 	RealD Nrank = Grid._Nprocessors;
 	RealD Nnode = Grid.NodeCount();
@@ -184,9 +183,6 @@ public:
 	}
 
 	timestat.statistics(t_time);
-	//	for(int i=0;i<t_time.size();i++){
-	//	  std::cout << i<<" "<<t_time[i]<<std::endl;
-	//	}
 
 	dbytes=dbytes*ppn;
 	double xbytes    = dbytes*0.5;
@@ -199,8 +195,6 @@ public:
 		 <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
 		 << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
 		 << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
-
- 
 	
 	    }
     }    
@@ -227,14 +221,15 @@ public:
     uint64_t NN;
 
 
-  uint64_t lmax=48;
+  uint64_t lmax=32;
 #define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
 
     GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-    for(int lat=8;lat<=lmax;lat+=4){
+    for(int lat=8;lat<=lmax;lat+=8){
 
       Coordinate latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
       int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 
       //      NP= Grid.RankCount();
@@ -270,191 +265,8 @@ public:
     }
   };
 
-#if 0
-  static double DWF5(int Ls,int L)
-  {
-    //    RealD mass=0.1;
-    RealD M5  =1.8;
 
-    double mflops;
-    double mflops_best = 0;
-    double mflops_worst= 0;
-    std::vector<double> mflops_all;
-
-    ///////////////////////////////////////////////////////
-    // Set/Get the layout & grid size
-    ///////////////////////////////////////////////////////
-    int threads = GridThread::GetThreads();
-    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
-    Coordinate local({L,L,L,L});
-
-    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(Coordinate({64,64,64,64}), 
-								       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
-    uint64_t NP = TmpGrid->RankCount();
-    uint64_t NN = TmpGrid->NodeCount();
-    NN_global=NN;
-    uint64_t SHM=NP/NN;
-
-    Coordinate internal;
-    if      ( SHM == 1 )   internal = Coordinate({1,1,1,1});
-    else if ( SHM == 2 )   internal = Coordinate({2,1,1,1});
-    else if ( SHM == 4 )   internal = Coordinate({2,2,1,1});
-    else if ( SHM == 8 )   internal = Coordinate({2,2,2,1});
-    else assert(0);
-
-    Coordinate nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
-    Coordinate latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
-
-    ///////// Welcome message ////////////
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << "Benchmark DWF Ls vec on "<<L<<"^4 local volume "<<std::endl;
-    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
-    std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
-    std::cout<<GridLogMessage << "* MPI ranks      : "<<GridCmdVectorIntToString(mpi)<<std::endl;
-    std::cout<<GridLogMessage << "* Intranode      : "<<GridCmdVectorIntToString(internal)<<std::endl;
-    std::cout<<GridLogMessage << "* nodes          : "<<GridCmdVectorIntToString(nodes)<<std::endl;
-    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-
-    ///////// Lattice Init ////////////
-    GridCartesian         * UGrid    = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
-    GridRedBlackCartesian * UrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
-    GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
-    GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
-    GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
-    GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
-
-    ///////// RNG Init ////////////
-    std::vector<int> seeds4({1,2,3,4});
-    std::vector<int> seeds5({5,6,7,8});
-    GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
-    GridParallelRNG          RNG5(sFGrid);  RNG5.SeedFixedIntegers(seeds5);
-    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
-
-    ///////// Source preparation ////////////
-    LatticeFermion src   (sFGrid); 
-    LatticeFermion tmp   (sFGrid);
-    std::cout << GridLogMessage << "allocated src and tmp" << std::endl;
-    random(RNG5,src);
-    std::cout << GridLogMessage << "intialised random source" << std::endl;
-
-    RealD N2 = 1.0/::sqrt(norm2(src));
-    src = src*N2;
-    
-    LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu); 
-
-    WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
-    LatticeFermion src_e (sFrbGrid);
-    LatticeFermion src_o (sFrbGrid);
-    LatticeFermion r_e   (sFrbGrid);
-    LatticeFermion r_o   (sFrbGrid);
-    LatticeFermion r_eo  (sFGrid);
-    LatticeFermion err   (sFGrid);
-    {
-
-      pickCheckerboard(Even,src_e,src);
-      pickCheckerboard(Odd,src_o,src);
-
-#if defined(AVX512) 
-      const int num_cases = 6;
-      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
-#else
-      const int num_cases = 4;
-      std::string fmt("U/S ; U/O ; G/S ; G/O ");
-#endif
-      controls Cases [] = {
-#ifdef AVX512
-	{  WilsonKernelsStatic::OptInlineAsm ,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
-	{  WilsonKernelsStatic::OptInlineAsm ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
-#endif
-	{  WilsonKernelsStatic::OptHandUnroll,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
-	{  WilsonKernelsStatic::OptHandUnroll,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
-	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
-	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }
-      }; 
-
-      for(int c=0;c<num_cases;c++) {
-
-	 WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
-	 WilsonKernelsStatic::Opt   = Cases[c].Opt;
-	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
-
-	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
-	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
-	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
-	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
-	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
-	if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
-	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
-	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-
-	int nwarm = 100;
-	uint64_t ncall = 1000;
-
-	double t0=usecond();
-	sFGrid->Barrier();
-	for(int i=0;i<nwarm;i++){
-	  sDw.DhopEO(src_o,r_e,DaggerNo);
-	}
-	sFGrid->Barrier();
-	double t1=usecond();
-
-	sDw.ZeroCounters();
-	time_statistics timestat;
-	std::vector<double> t_time(ncall);
-	for(uint64_t i=0;i<ncall;i++){
-	  t0=usecond();
-	  sDw.DhopEO(src_o,r_e,DaggerNo);
-	  t1=usecond();
-	  t_time[i] = t1-t0;
-	}
-	sFGrid->Barrier();
-	
-	double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-	double flops=(1344.0*volume)/2;
-	double mf_hi, mf_lo, mf_err;
-
-	timestat.statistics(t_time);
-	mf_hi = flops/timestat.min;
-	mf_lo = flops/timestat.max;
-	mf_err= flops/timestat.min * timestat.err/timestat.mean;
-
-	mflops = flops/timestat.mean;
-	mflops_all.push_back(mflops);
-	if ( mflops_best == 0   ) mflops_best = mflops;
-	if ( mflops_worst== 0   ) mflops_worst= mflops;
-	if ( mflops>mflops_best ) mflops_best = mflops;
-	if ( mflops<mflops_worst) mflops_worst= mflops;
-
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per rank   "<< mflops/NP<<std::endl;
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per node   "<< mflops/NN<<std::endl;
-
-	sDw.Report();
-
-      }
-      double robust = mflops_worst/mflops_best;;
-      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
-      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
-
-      std::cout<<GridLogMessage <<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< robust <<std::endl;
-      std::cout<<GridLogMessage <<fmt << std::endl;
-      std::cout<<GridLogMessage;
-
-      for(int i=0;i<mflops_all.size();i++){
-	std::cout<<mflops_all[i]/NN<<" ; " ;
-      }
-      std::cout<<std::endl;
-      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-
-    }
-    return mflops_best;
-  }
-#endif
-
-  static double DWF(int Ls,int L, double & robust)
+  static double DWF(int Ls,int L)
   {
     RealD mass=0.1;
     RealD M5  =1.8;
@@ -471,37 +283,30 @@ public:
     Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
     Coordinate local({L,L,L,L});
 
-    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(Coordinate({64,64,64,64}), 
-								       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(Coordinate({72,72,72,72}), 
+								       GridDefaultSimd(Nd,vComplex::Nsimd()),
+								       GridDefaultMpi());
     uint64_t NP = TmpGrid->RankCount();
     uint64_t NN = TmpGrid->NodeCount();
     NN_global=NN;
     uint64_t SHM=NP/NN;
 
-    Coordinate internal;
-    if      ( SHM == 1 )   internal = Coordinate({1,1,1,1});
-    else if ( SHM == 2 )   internal = Coordinate({2,1,1,1});
-    else if ( SHM == 4 )   internal = Coordinate({2,2,1,1});
-    else if ( SHM == 8 )   internal = Coordinate({2,2,2,1});
-    else assert(0);
-
-    Coordinate nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
-    Coordinate latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
+    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
 
     ///////// Welcome message ////////////
     std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
     std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
     std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
     std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
-    std::cout<<GridLogMessage << "* MPI ranks      : "<<GridCmdVectorIntToString(mpi)<<std::endl;
-    std::cout<<GridLogMessage << "* Intranode      : "<<GridCmdVectorIntToString(internal)<<std::endl;
-    std::cout<<GridLogMessage << "* nodes          : "<<GridCmdVectorIntToString(nodes)<<std::endl;
+    std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl;
+    std::cout<<GridLogMessage << "* nodes          : "<<NN  <<std::endl;
+    std::cout<<GridLogMessage << "* ranks/node     : "<<SHM <<std::endl;
+    std::cout<<GridLogMessage << "* ranks geom     : "<<GridCmdVectorIntToString(mpi)<<std::endl;
     std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
     std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 
-
     ///////// Lattice Init ////////////
-    GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+    GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
     GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
     GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
     GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
@@ -514,74 +319,31 @@ public:
     GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
     std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
 
+    typedef DomainWallFermionF Action;
+    typedef typename Action::FermionField Fermion;
+    typedef LatticeGaugeFieldF Gauge;
+    
     ///////// Source preparation ////////////
-    LatticeFermion src   (FGrid); random(RNG5,src);
-    LatticeFermion ref   (FGrid);
-    LatticeFermion tmp   (FGrid);
+    Gauge Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu); 
+    Fermion src   (FGrid); random(RNG5,src);
+    Fermion src_e (FrbGrid);
+    Fermion src_o (FrbGrid);
+    Fermion r_e   (FrbGrid);
+    Fermion r_o   (FrbGrid);
+    Fermion r_eo  (FGrid);
+    Action Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 
-    RealD N2 = 1.0/::sqrt(norm2(src));
-    std::cout<<GridLogMessage << "Normalising src  "<< N2 <<std::endl;
-    src = src*N2;
-    
-    LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu); 
-    
-
-    DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-
-    ////////////////////////////////////
-    // Naive wilson implementation
-    ////////////////////////////////////
-    {
-      LatticeGaugeField Umu5d(FGrid); 
-      std::vector<LatticeColourMatrix> U(4,FGrid);
-      auto Umu_v = Umu.View();
-      auto Umu5d_v = Umu5d.View();
-      for(int ss=0;ss<Umu.Grid()->oSites();ss++){
-	for(int s=0;s<Ls;s++){
-	  Umu5d_v[Ls*ss+s] = Umu_v[ss];
-	}
-      }
-      ref = Zero();
-      for(int mu=0;mu<Nd;mu++){
-	U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
-      }
-      for(int mu=0;mu<Nd;mu++){
-	
-	tmp = U[mu]*Cshift(src,mu+1,1);
-	ref=ref + tmp - Gamma(Gmu[mu])*tmp;
-	
-	tmp =adj(U[mu])*src;
-	tmp =Cshift(tmp,mu+1,-1);
-	ref=ref + tmp + Gamma(Gmu[mu])*tmp;
-      }
-      ref = -0.5*ref;
-    }
-
-    LatticeFermion src_e (FrbGrid);
-    LatticeFermion src_o (FrbGrid);
-    LatticeFermion r_e   (FrbGrid);
-    LatticeFermion r_o   (FrbGrid);
-    LatticeFermion r_eo  (FGrid);
-    LatticeFermion err   (FGrid);
     {
 
       pickCheckerboard(Even,src_e,src);
       pickCheckerboard(Odd,src_o,src);
 
-#if defined(AVX512) 
-      const int num_cases = 6;
-      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
-#else
       const int num_cases = 4;
-      std::string fmt("U/S ; U/O ; G/S ; G/O ");
-#endif
+      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
+
       controls Cases [] = {
-#ifdef AVX512
-	{  WilsonKernelsStatic::OptInlineAsm ,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
-	{  WilsonKernelsStatic::OptInlineAsm ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
-#endif
-	{  WilsonKernelsStatic::OptHandUnroll,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
-	{  WilsonKernelsStatic::OptHandUnroll,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
+	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
+	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
 	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }
       }; 
@@ -594,15 +356,12 @@ public:
 
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
-	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
-	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
 	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
-	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
-	if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
-	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
+	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 
-	int nwarm = 200;
+	int nwarm = 10;
 	double t0=usecond();
 	FGrid->Barrier();
 	for(int i=0;i<nwarm;i++){
@@ -610,9 +369,7 @@ public:
 	}
 	FGrid->Barrier();
 	double t1=usecond();
-	//	uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
-	//	if (ncall < 500) ncall = 500;
-	uint64_t ncall = 1000;
+	uint64_t ncall = 50;
 
 	FGrid->Broadcast(0,&ncall,sizeof(ncall));
 
@@ -649,24 +406,11 @@ public:
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
 
-	Dw.Report();
-
-	Dw.DhopEO(src_o,r_e,DaggerNo);
-	Dw.DhopOE(src_e,r_o,DaggerNo);
-	setCheckerboard(r_eo,r_o);
-	setCheckerboard(r_eo,r_e);
-	err = r_eo-ref; 
-	RealD absref = norm2(ref);
-	RealD abserr = norm2(err);
-	std::cout<<GridLogMessage << "norm diff   "<< abserr << " / " << absref<<std::endl;
-	assert(abserr<1.0e-4);
-
       }
-      robust = mflops_worst/mflops_best;
+
       std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
       std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
       std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
-      std::cout<<GridLogMessage << std::fixed<<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< robust  <<std::endl;
       std::cout<<GridLogMessage <<fmt << std::endl;
       std::cout<<GridLogMessage ;
 
@@ -680,8 +424,166 @@ public:
     return mflops_best;
   }
 
+
+  static double Staggered(int L)
+  {
+    double mflops;
+    double mflops_best = 0;
+    double mflops_worst= 0;
+    std::vector<double> mflops_all;
+
+    ///////////////////////////////////////////////////////
+    // Set/Get the layout & grid size
+    ///////////////////////////////////////////////////////
+    int threads = GridThread::GetThreads();
+    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
+    Coordinate local({L,L,L,L});
+    
+    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(Coordinate({72,72,72,72}), 
+								       GridDefaultSimd(Nd,vComplex::Nsimd()),
+								       GridDefaultMpi());
+    uint64_t NP = TmpGrid->RankCount();
+    uint64_t NN = TmpGrid->NodeCount();
+    NN_global=NN;
+    uint64_t SHM=NP/NN;
+
+    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
+
+    ///////// Welcome message ////////////
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "Benchmark ImprovedStaggered on "<<L<<"^4 local volume "<<std::endl;
+    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
+    std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl;
+    std::cout<<GridLogMessage << "* nodes          : "<<NN  <<std::endl;
+    std::cout<<GridLogMessage << "* ranks/node     : "<<SHM <<std::endl;
+    std::cout<<GridLogMessage << "* ranks geom     : "<<GridCmdVectorIntToString(mpi)<<std::endl;
+    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+    ///////// Lattice Init ////////////
+    GridCartesian         * FGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
+    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
+    
+    ///////// RNG Init ////////////
+    std::vector<int> seeds4({1,2,3,4});
+    GridParallelRNG          RNG4(FGrid);  RNG4.SeedFixedIntegers(seeds4);
+    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+    RealD mass=0.1;
+    RealD c1=9.0/8.0;
+    RealD c2=-1.0/24.0;
+    RealD u0=1.0;
+
+    typedef ImprovedStaggeredFermionF Action;
+    typedef typename Action::FermionField Fermion; 
+    typedef LatticeGaugeFieldF Gauge;
+    
+    Gauge Umu(FGrid);  SU3::HotConfiguration(RNG4,Umu); 
+
+    typename Action::ImplParams params;
+    Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params);
+
+    ///////// Source preparation ////////////
+    Fermion src   (FGrid); random(RNG4,src);
+    Fermion src_e (FrbGrid);
+    Fermion src_o (FrbGrid);
+    Fermion r_e   (FrbGrid);
+    Fermion r_o   (FrbGrid);
+    Fermion r_eo  (FGrid);
+  
+    {
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd,src_o,src);
+    
+      const int num_cases = 4;
+      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
+      
+      controls Cases [] = {
+	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
+	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
+	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
+	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }
+      }; 
+
+      for(int c=0;c<num_cases;c++) {
+	
+	StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap;
+	StaggeredKernelsStatic::Opt   = Cases[c].Opt;
+	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
+      
+	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+	if ( StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc StaggeredKernels" <<std::endl;
+	if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
+	if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
+	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+	
+	int nwarm = 10;
+	double t0=usecond();
+	FGrid->Barrier();
+	for(int i=0;i<nwarm;i++){
+	  Ds.DhopEO(src_o,r_e,DaggerNo);
+	}
+	FGrid->Barrier();
+	double t1=usecond();
+	uint64_t ncall = 500;
+
+	FGrid->Broadcast(0,&ncall,sizeof(ncall));
+
+	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
+	Ds.ZeroCounters();
+
+	time_statistics timestat;
+	std::vector<double> t_time(ncall);
+	for(uint64_t i=0;i<ncall;i++){
+	  t0=usecond();
+	  Ds.DhopEO(src_o,r_e,DaggerNo);
+	  t1=usecond();
+	  t_time[i] = t1-t0;
+	}
+	FGrid->Barrier();
+	
+	double volume=1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+	double flops=(1146.0*volume)/2;
+	double mf_hi, mf_lo, mf_err;
+	
+	timestat.statistics(t_time);
+	mf_hi = flops/timestat.min;
+	mf_lo = flops/timestat.max;
+	mf_err= flops/timestat.min * timestat.err/timestat.mean;
+
+	mflops = flops/timestat.mean;
+	mflops_all.push_back(mflops);
+	if ( mflops_best == 0   ) mflops_best = mflops;
+	if ( mflops_worst== 0   ) mflops_worst= mflops;
+	if ( mflops>mflops_best ) mflops_best = mflops;
+	if ( mflops<mflops_worst) mflops_worst= mflops;
+	
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
+      
+      }
+
+      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+      std::cout<<GridLogMessage << L<<"^4  Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
+      std::cout<<GridLogMessage << L<<"^4  Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
+      std::cout<<GridLogMessage <<fmt << std::endl;
+      std::cout<<GridLogMessage ;
+
+      for(int i=0;i<mflops_all.size();i++){
+	std::cout<<mflops_all[i]/NN<<" ; " ;
+      }
+      std::cout<<std::endl;
+    }
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    return mflops_best;
+  }
 };
 
+
+
 int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
@@ -696,62 +598,50 @@ int main (int argc, char ** argv)
 
   int do_memory=1;
   int do_comms =1;
-  int do_su3   =0;
-  int do_wilson=1;
-  int do_dwf   =1;
 
-  if ( do_su3 ) {
-    // empty for now
-  }
-#if 1
   int sel=2;
-  Coordinate L_list({8,12,16,24});
-#else
-  int sel=1;
-  Coordinate L_list({8,12});
-#endif
+  std::vector<int> L_list({16,24,32});
   int selm1=sel-1;
-  std::vector<double> robust_list;
 
   std::vector<double> wilson;
   std::vector<double> dwf4;
-  std::vector<double> dwf5;
+  std::vector<double> staggered;
 
-  if ( do_wilson ) {
-    int Ls=1;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    for(int l=0;l<L_list.size();l++){
-      double robust;
-      wilson.push_back(Benchmark::DWF(Ls,L_list[l],robust));
-    }
+  int Ls=1;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  for(int l=0;l<L_list.size();l++){
+    wilson.push_back(Benchmark::DWF(Ls,L_list[l]));
   }
 
-  int Ls=16;
-  if ( do_dwf ) {
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    for(int l=0;l<L_list.size();l++){
-      double robust;
-      double result = Benchmark::DWF(Ls,L_list[l],robust) ;
-      dwf4.push_back(result);
-      robust_list.push_back(robust);
-    }
+  Ls=12;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  for(int l=0;l<L_list.size();l++){
+    double result = Benchmark::DWF(Ls,L_list[l]) ;
+    dwf4.push_back(result);
   }
 
-  if ( do_dwf ) {
+  /*
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << " Improved Staggered dslash 4D vectorised" <<std::endl;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  for(int l=0;l<L_list.size();l++){
+    double result = Benchmark::Staggered(L_list[l]) ;
+    staggered.push_back(result);
+  }
+  */
 
   std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
   std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
   std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "L \t\t Wilson \t DWF4 " <<std::endl;
+  std::cout<<GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\tt Staggered" <<std::endl;
   for(int l=0;l<L_list.size();l++){
-    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t "<<dwf4[l] <<std::endl;
+    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t\t "<<dwf4[l] <<std::endl;
   }
   std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  }
 
   int NN=NN_global;
   if ( do_memory ) {
@@ -768,24 +658,20 @@ int main (int argc, char ** argv)
     Benchmark::Comms();
   }
 
-  if ( do_dwf ) {
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4  " <<std::endl;
-  for(int l=0;l<L_list.size();l++){
-    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<std::endl;
-  }
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4  " <<std::endl;
+    for(int l=0;l<L_list.size();l++){
+      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<std::endl;
+    }
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << " Comparison point     result: "  << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
-  std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
-  std::cout<<std::setprecision(3);
-  std::cout<<GridLogMessage << " Comparison point robustness: "  << robust_list[sel] <<std::endl;
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-
-  }
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " Comparison point     result: "  << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
+    std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
+    std::cout<<std::setprecision(3);
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 
   Grid_finalize();
 }

From efe5bc6a3cb7f2069646883f29a16d3f345f74f8 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Sat, 9 May 2020 22:27:56 -0400
Subject: [PATCH 39/51] Split allocator cache into two pools of different sizes

---
 Grid/allocator/AlignedAllocator.cc | 69 +++++++++++++++---------------
 Grid/allocator/AlignedAllocator.h  | 17 ++++----
 2 files changed, 44 insertions(+), 42 deletions(-)

diff --git a/Grid/allocator/AlignedAllocator.cc b/Grid/allocator/AlignedAllocator.cc
index d53c4dc2..77646410 100644
--- a/Grid/allocator/AlignedAllocator.cc
+++ b/Grid/allocator/AlignedAllocator.cc
@@ -6,21 +6,19 @@ NAMESPACE_BEGIN(Grid);
 MemoryStats *MemoryProfiler::stats = nullptr;
 bool         MemoryProfiler::debug = false;
 
-#ifdef GRID_NVCC
-#define SMALL_LIMIT (0)
-#else
-#define SMALL_LIMIT (4096)
-#endif
-
-#ifdef POINTER_CACHE
-int PointerCache::victim;
-
+int PointerCache::Victim;
+int PointerCache::VictimSmall;
 PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
+PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheSmall];
 
-void *PointerCache::Insert(void *ptr,size_t bytes) {
-
-  if (bytes < SMALL_LIMIT ) return ptr;
-
+void *PointerCache::Insert(void *ptr,size_t bytes) 
+{
+  if (bytes < GRID_ALLOC_SMALL_LIMIT ) 
+    return Insert(ptr,bytes,EntriesSmall,NcacheSmall,VictimSmall);
+  return Insert(ptr,bytes,Entries,Ncache,Victim);  
+}
+void *PointerCache::Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) 
+{
 #ifdef GRID_OMP
   assert(omp_in_parallel()==0);
 #endif 
@@ -28,8 +26,8 @@ void *PointerCache::Insert(void *ptr,size_t bytes) {
   void * ret = NULL;
   int v = -1;
 
-  for(int e=0;e<Ncache;e++) {
-    if ( Entries[e].valid==0 ) {
+  for(int e=0;e<ncache;e++) {
+    if ( entries[e].valid==0 ) {
       v=e; 
       break;
     }
@@ -37,40 +35,43 @@ void *PointerCache::Insert(void *ptr,size_t bytes) {
 
   if ( v==-1 ) {
     v=victim;
-    victim = (victim+1)%Ncache;
+    victim = (victim+1)%ncache;
   }
 
-  if ( Entries[v].valid ) {
-    ret = Entries[v].address;
-    Entries[v].valid = 0;
-    Entries[v].address = NULL;
-    Entries[v].bytes = 0;
+  if ( entries[v].valid ) {
+    ret = entries[v].address;
+    entries[v].valid = 0;
+    entries[v].address = NULL;
+    entries[v].bytes = 0;
   }
 
-  Entries[v].address=ptr;
-  Entries[v].bytes  =bytes;
-  Entries[v].valid  =1;
+  entries[v].address=ptr;
+  entries[v].bytes  =bytes;
+  entries[v].valid  =1;
 
   return ret;
 }
 
-void *PointerCache::Lookup(size_t bytes) {
-
-  if (bytes < SMALL_LIMIT ) return NULL;
-
+void *PointerCache::Lookup(size_t bytes)
+{
+  if (bytes < GRID_ALLOC_SMALL_LIMIT ) 
+    return Lookup(bytes,EntriesSmall,NcacheSmall);
+  return Lookup(bytes,Entries,Ncache);
+}
+void *PointerCache::Lookup(size_t bytes,PointerCacheEntry *entries,int ncache) 
+{
 #ifdef GRID_OMP
   assert(omp_in_parallel()==0);
 #endif 
-
-  for(int e=0;e<Ncache;e++){
-    if ( Entries[e].valid && ( Entries[e].bytes == bytes ) ) {
-      Entries[e].valid = 0;
-      return Entries[e].address;
+  for(int e=0;e<ncache;e++){
+    if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
+      entries[e].valid = 0;
+      return entries[e].address;
     }
   }
   return NULL;
 }
-#endif
+
 
 void check_huge_pages(void *Buf,uint64_t BYTES)
 {
diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h
index 8c189be8..d6e2e073 100644
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -42,21 +42,19 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 
 #define POINTER_CACHE
 #define GRID_ALLOC_ALIGN (2*1024*1024)
+#define GRID_ALLOC_SMALL_LIMIT (4096)
 
 NAMESPACE_BEGIN(Grid);
 
 // Move control to configure.ac and Config.h?
-#ifdef POINTER_CACHE
+
 class PointerCache {
 private:
 /*Pinning pages is costly*/
 /*Could maintain separate large and small allocation caches*/
-#ifdef GRID_NVCC 
-  static const int Ncache=128;
-#else
+/* Could make these configurable, perhaps up to a max size*/
+  static const int NcacheSmall=128; 
   static const int Ncache=8;
-#endif
-  static int victim;
 
   typedef struct { 
     void *address;
@@ -65,14 +63,17 @@ private:
   } PointerCacheEntry;
     
   static PointerCacheEntry Entries[Ncache];
+  static int Victim;
+  static PointerCacheEntry EntriesSmall[NcacheSmall];
+  static int VictimSmall;
 
 public:
 
   static void *Insert(void *ptr,size_t bytes) ;
+  static void *Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) ;
   static void *Lookup(size_t bytes) ;
-
+  static void *Lookup(size_t bytes,PointerCacheEntry *entries,int ncache) ;
 };
-#endif  
 
 std::string sizeString(size_t bytes);
 

From 2bb2c68e15572bcc6012bfe2694bdca10948463f Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Sat, 9 May 2020 22:57:21 -0400
Subject: [PATCH 40/51] Separate pools for small and large allocations cache

---
 Grid/allocator/AlignedAllocator.cc               | 16 ++++++++++++++--
 Grid/allocator/AlignedAllocator.h                | 12 +++++++-----
 .../CayleyFermion5DImplementation.h              |  3 ++-
 Grid/util/Init.cc                                |  2 ++
 4 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/Grid/allocator/AlignedAllocator.cc b/Grid/allocator/AlignedAllocator.cc
index 77646410..976dfbdc 100644
--- a/Grid/allocator/AlignedAllocator.cc
+++ b/Grid/allocator/AlignedAllocator.cc
@@ -6,11 +6,23 @@ NAMESPACE_BEGIN(Grid);
 MemoryStats *MemoryProfiler::stats = nullptr;
 bool         MemoryProfiler::debug = false;
 
+int PointerCache::NcacheSmall = PointerCache::NcacheSmallMax;
+int PointerCache::Ncache      = PointerCache::NcacheMax;
 int PointerCache::Victim;
 int PointerCache::VictimSmall;
-PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
-PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheSmall];
+PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::NcacheMax];
+PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheSmallMax];
 
+void PointerCache::Init(void)
+{
+  char * str;
+  str= getenv("GRID_ALLOC_NCACHE_LARGE");
+  if ( str ) Ncache = atoi(str);
+  if ( (Ncache<0) || (Ncache > NcacheMax)) Ncache = NcacheMax;
+  str= getenv("GRID_ALLOC_NCACHE_SMALL");
+  if ( str ) NcacheSmall = atoi(str);
+  if ( (NcacheSmall<0) || (NcacheSmall > NcacheSmallMax)) NcacheSmall = NcacheSmallMax;
+}
 void *PointerCache::Insert(void *ptr,size_t bytes) 
 {
   if (bytes < GRID_ALLOC_SMALL_LIMIT ) 
diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h
index d6e2e073..77167299 100644
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -53,8 +53,10 @@ private:
 /*Pinning pages is costly*/
 /*Could maintain separate large and small allocation caches*/
 /* Could make these configurable, perhaps up to a max size*/
-  static const int NcacheSmall=128; 
-  static const int Ncache=8;
+  static const int NcacheSmallMax=128; 
+  static const int NcacheMax=16;
+  static int NcacheSmall;
+  static int Ncache;
 
   typedef struct { 
     void *address;
@@ -62,13 +64,13 @@ private:
     int valid;
   } PointerCacheEntry;
     
-  static PointerCacheEntry Entries[Ncache];
+  static PointerCacheEntry Entries[NcacheMax];
   static int Victim;
-  static PointerCacheEntry EntriesSmall[NcacheSmall];
+  static PointerCacheEntry EntriesSmall[NcacheSmallMax];
   static int VictimSmall;
 
 public:
-
+  static void Init(void);
   static void *Insert(void *ptr,size_t bytes) ;
   static void *Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) ;
   static void *Lookup(size_t bytes) ;
diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
index c80d2425..e379026c 100644
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@@ -779,9 +779,9 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
   assert(mu>=0);
   assert(mu<Nd);
 
-  int tshift = (mu == Nd-1) ? 1 : 0;
 
 #if 0
+  int tshift = (mu == Nd-1) ? 1 : 0;
   ////////////////////////////////////////////////
   // SHAMIR CASE 
   ////////////////////////////////////////////////
@@ -829,6 +829,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 #endif
 
 #ifndef GRID_NVCC
+  int tshift = (mu == Nd-1) ? 1 : 0;
   ////////////////////////////////////////////////
   // GENERAL CAYLEY CASE
   ////////////////////////////////////////////////
diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc
index 570f4234..1b672141 100644
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -355,6 +355,8 @@ void Grid_init(int *argc,char ***argv)
   //////////////////////////////////////////////////////////
   GridGpuInit(); // Must come first to set device prior to MPI init
 
+  PointerCache::Init();
+
   if( GridCmdOptionExists(*argv,*argv+*argc,"--shm") ){
     int MB;
     arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm");

From ea08f193e7bdd7fcb8d18a8713f0f5387def9b2f Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Sun, 10 May 2020 05:24:26 -0400
Subject: [PATCH 41/51] Allocator cache split into large/small pools

---
 Grid/algorithms/approx/Chebyshev.h |  2 +-
 Grid/allocator/AlignedAllocator.cc | 10 +++++++++-
 Grid/communicator/SharedMemory.cc  |  4 +++-
 benchmarks/Benchmark_schur.cc      |  4 ++--
 4 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/Grid/algorithms/approx/Chebyshev.h b/Grid/algorithms/approx/Chebyshev.h
index c0b0646d..584ed1d5 100644
--- a/Grid/algorithms/approx/Chebyshev.h
+++ b/Grid/algorithms/approx/Chebyshev.h
@@ -236,7 +236,6 @@ public:
 
     int vol=grid->gSites();
     typedef typename Field::vector_type vector_type;
-    constexpr int Nsimd = vector_type::Nsimd();
 
     Field T0(grid); T0 = in;  
     Field T1(grid); 
@@ -264,6 +263,7 @@ public:
       auto Tn_v = Tn->View();
       auto Tnp_v = Tnp->View();
       auto Tnm_v = Tnm->View();
+      constexpr int Nsimd = vector_type::Nsimd();
       accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, {
 	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
 	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
diff --git a/Grid/allocator/AlignedAllocator.cc b/Grid/allocator/AlignedAllocator.cc
index 976dfbdc..ef6459ed 100644
--- a/Grid/allocator/AlignedAllocator.cc
+++ b/Grid/allocator/AlignedAllocator.cc
@@ -7,7 +7,11 @@ MemoryStats *MemoryProfiler::stats = nullptr;
 bool         MemoryProfiler::debug = false;
 
 int PointerCache::NcacheSmall = PointerCache::NcacheSmallMax;
-int PointerCache::Ncache      = PointerCache::NcacheMax;
+#ifdef GRID_CUDA
+int PointerCache::Ncache      = 32;
+#else 
+int PointerCache::Ncache      = 8;
+#endif
 int PointerCache::Victim;
 int PointerCache::VictimSmall;
 PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::NcacheMax];
@@ -16,12 +20,16 @@ PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheS
 void PointerCache::Init(void)
 {
   char * str;
+
   str= getenv("GRID_ALLOC_NCACHE_LARGE");
   if ( str ) Ncache = atoi(str);
   if ( (Ncache<0) || (Ncache > NcacheMax)) Ncache = NcacheMax;
+
   str= getenv("GRID_ALLOC_NCACHE_SMALL");
   if ( str ) NcacheSmall = atoi(str);
   if ( (NcacheSmall<0) || (NcacheSmall > NcacheSmallMax)) NcacheSmall = NcacheSmallMax;
+
+  //  printf("Aligned alloocator cache: large %d/%d small %d/%d\n",Ncache,NcacheMax,NcacheSmall,NcacheSmallMax);
 }
 void *PointerCache::Insert(void *ptr,size_t bytes) 
 {
diff --git a/Grid/communicator/SharedMemory.cc b/Grid/communicator/SharedMemory.cc
index 5bca9764..de10da3d 100644
--- a/Grid/communicator/SharedMemory.cc
+++ b/Grid/communicator/SharedMemory.cc
@@ -74,7 +74,9 @@ void *SharedMemory::ShmBufferMalloc(size_t bytes){
   if (heap_bytes >= heap_size) {
     std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
     std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
-    std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl;
+    std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
+    std::cout<< " Current bytes is " << (heap_bytes/(1024*1024)) <<"MB"<<std::endl;
+    std::cout<< " Current heap  is " << (heap_size/(1024*1024)) <<"MB"<<std::endl;
     assert(heap_bytes<heap_size);
   }
   //std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl;
diff --git a/benchmarks/Benchmark_schur.cc b/benchmarks/Benchmark_schur.cc
index afee31b0..8171998a 100644
--- a/benchmarks/Benchmark_schur.cc
+++ b/benchmarks/Benchmark_schur.cc
@@ -47,7 +47,7 @@ int main (int argc, char ** argv)
 
   const int Ls=12;
   std::vector< std::vector<int> > latts;
-#if 0
+#if 1
   latts.push_back(std::vector<int> ({24,24,24,24}) );
   latts.push_back(std::vector<int> ({48,24,24,24}) );
   latts.push_back(std::vector<int> ({96,24,24,24}) );
@@ -157,7 +157,7 @@ void benchDw(std::vector<int> & latt4, int Ls)
     std::cout <<"\t"<<flops/(t1-t0)<<"\t"<<(t1-t0)/1000./1000.<<" s\t";
 
     // Cheby uses MpcDagMpc so 2x flops
-    for(int i=0;i<100;i++){
+    for(int i=0;i<1;i++){
     Cheby(Mpc,src_o,r_o);
     t0=usecond();
     Cheby(Mpc,src_o,r_o);

From 82f71643a4cad05bd48a0528371ca0bd6b608417 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 12 May 2020 17:55:53 -0400
Subject: [PATCH 42/51] Remove the norm in MdagM

---
 Grid/algorithms/CoarsenedMatrix.h             |  31 +-
 Grid/algorithms/LinearOperator.h              | 628 +++++++++---------
 Grid/algorithms/SparseMatrix.h                |  14 +-
 Grid/qcd/action/fermion/CayleyFermion5D.h     |   4 +-
 .../fermion/ContinuedFractionFermion5D.h      |   4 +-
 .../action/fermion/DomainWallEOFAFermion.h    |   4 +-
 Grid/qcd/action/fermion/FermionOperator.h     |   9 +-
 .../action/fermion/ImprovedStaggeredFermion.h |   4 +-
 .../fermion/ImprovedStaggeredFermion5D.h      |   5 +-
 Grid/qcd/action/fermion/MobiusEOFAFermion.h   |   4 +-
 .../action/fermion/PartialFractionFermion5D.h |   4 +-
 Grid/qcd/action/fermion/WilsonCloverFermion.h |   5 +-
 Grid/qcd/action/fermion/WilsonFermion.h       |   4 +-
 Grid/qcd/action/fermion/WilsonFermion5D.h     |   5 +-
 Grid/qcd/action/fermion/WilsonTMFermion5D.h   |   8 +-
 .../CayleyFermion5DImplementation.h           |   6 +-
 ...ContinuedFractionFermion5DImplementation.h |   7 +-
 .../DomainWallEOFAFermionImplementation.h     |   6 +-
 ...ImprovedStaggeredFermion5DImplementation.h |  29 +-
 .../ImprovedStaggeredFermionImplementation.h  |  44 +-
 .../MobiusEOFAFermionImplementation.h         |   6 +-
 .../PartialFractionFermion5DImplementation.h  |   6 +-
 .../WilsonCloverFermionImplementation.h       |   6 +-
 .../WilsonFermionImplementation.h             |  52 +-
 Grid/qcd/modules/Registration.h               |   8 +-
 Grid/qcd/smearing/GaugeConfiguration.h        |   2 +-
 Grid/qcd/utils/CovariantCshift.h              |  40 ++
 27 files changed, 469 insertions(+), 476 deletions(-)

diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h
index a6b01986..4c26f799 100644
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@@ -541,17 +541,14 @@ public:
   ///////////////////////
   GridBase * Grid(void)         { return _grid; };   // this is all the linalg routines need to know
 
-  RealD M (const CoarseVector &in, CoarseVector &out){
-
+  void M (const CoarseVector &in, CoarseVector &out)
+  {
     conformable(_grid,in.Grid());
     conformable(in.Grid(),out.Grid());
 
-    //    RealD Nin = norm2(in);
     SimpleCompressor<siteVector> compressor;
 
-    double comms_usec = -usecond();
     Stencil.HaloExchange(in,compressor);
-    comms_usec += usecond();
 
     auto in_v = in.View();
     auto out_v = out.View();
@@ -565,12 +562,7 @@ public:
     typedef decltype(coalescedRead(in_v[0])) calcVector;
     typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
 
-    GridStopWatch ArithmeticTimer;
     int osites=Grid()->oSites();
-    //    double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint;
-    //    double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex);
-    double usecs =-usecond();
-    // assert(geom.npoint==9);
 
     accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
       int ss = sss/nbasis;
@@ -598,23 +590,9 @@ public:
       }
       coalescedWrite(out_v[ss](b),res,lane);
     });
-    usecs +=usecond();
-
-    double nrm_usec=-usecond();
-    RealD Nout= norm2(out);
-    nrm_usec+=usecond();
-
-    /*
-        std::cout << GridLogMessage << "\tNorm        " << nrm_usec << " us" <<std::endl;
-        std::cout << GridLogMessage << "\tHalo        " << comms_usec << " us" <<std::endl;
-        std::cout << GridLogMessage << "\tMatrix      " << usecs << " us" <<std::endl;
-        std::cout << GridLogMessage << "\t  mflop/s   " << flops/usecs<<std::endl;
-        std::cout << GridLogMessage << "\t  MB/s      " << bytes/usecs<<std::endl;
-    */
-    return Nout;
   };
 
-  RealD Mdag (const CoarseVector &in, CoarseVector &out)
+  void Mdag (const CoarseVector &in, CoarseVector &out)
   {
     if(hermitian) {
       // corresponds to Petrov-Galerkin coarsening
@@ -625,7 +603,6 @@ public:
       G5C(tmp, in); 
       M(tmp, out);
       G5C(out, out);
-      return norm2(out);
     }
   };
   void MdirComms(const CoarseVector &in)
@@ -870,8 +847,6 @@ public:
 	    auto A_self  = A[self_stencil].View();
 
 	    accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
-	    //      if( disp!= 0 ) { accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });}
-	    //	    accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_self[ss](j,i),A_self(ss)(j,i)+iZProj_v(ss)); });
 
 	  }
 	}
diff --git a/Grid/algorithms/LinearOperator.h b/Grid/algorithms/LinearOperator.h
index c41f8eef..1add212c 100644
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@@ -43,7 +43,6 @@ NAMESPACE_BEGIN(Grid);
 /////////////////////////////////////////////////////////////////////////////////////////////
 template<class Field> class LinearOperatorBase {
 public:
-
   // Support for coarsening to a multigrid
   virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base
   virtual void OpDir  (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base
@@ -94,7 +93,10 @@ public:
     _Mat.Mdag(in,out);
   }
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-    _Mat.MdagM(in,out,n1,n2);
+    _Mat.MdagM(in,out);
+    ComplexD dot = innerProduct(in,out);
+    n1=real(dot);
+    n2=norm2(out);
   }
   void HermOp(const Field &in, Field &out){
     _Mat.MdagM(in,out);
@@ -131,17 +133,14 @@ public:
     assert(0);
   }
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-    _Mat.MdagM(in,out,n1,n2);
-    out = out + _shift*in;
-
-    ComplexD dot;	
-    dot= innerProduct(in,out);
+    HermOp(in,out);
+    ComplexD dot = innerProduct(in,out);
     n1=real(dot);
     n2=norm2(out);
   }
   void HermOp(const Field &in, Field &out){
-    RealD n1,n2;
-    HermOpAndNorm(in,out,n1,n2);
+    _Mat.MdagM(in,out);
+    out = out + _shift*in;
   }
 };
 
@@ -170,7 +169,7 @@ public:
     _Mat.M(in,out);
   }
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-    _Mat.M(in,out);
+    HermOp(in,out);
     ComplexD dot= innerProduct(in,out); n1=real(dot);
     n2=norm2(out);
   }
@@ -208,339 +207,305 @@ public:
   }
 };
 
-    //////////////////////////////////////////////////////////
-    // Even Odd Schur decomp operators; there are several
-    // ways to introduce the even odd checkerboarding
-    //////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////
+// Even Odd Schur decomp operators; there are several
+// ways to introduce the even odd checkerboarding
+//////////////////////////////////////////////////////////
 
-    template<class Field>
-    class SchurOperatorBase :  public LinearOperatorBase<Field> {
-    public:
-      virtual  RealD Mpc      (const Field &in, Field &out) =0;
-      virtual  RealD MpcDag   (const Field &in, Field &out) =0;
-      virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
-      Field tmp(in.Grid());
-      tmp.Checkerboard() = in.Checkerboard();
-	ni=Mpc(in,tmp);
-	no=MpcDag(tmp,out);
-      }
-      virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-      out.Checkerboard() = in.Checkerboard();
-	MpcDagMpc(in,out,n1,n2);
-      }
-      virtual void HermOp(const Field &in, Field &out){
-	RealD n1,n2;
-	HermOpAndNorm(in,out,n1,n2);
-      }
-      void Op     (const Field &in, Field &out){
-	Mpc(in,out);
-      }
-      void AdjOp     (const Field &in, Field &out){ 
-	MpcDag(in,out);
-      }
-      // Support for coarsening to a multigrid
-      void OpDiag (const Field &in, Field &out) {
-	assert(0); // must coarsen the unpreconditioned system
-      }
-      void OpDir  (const Field &in, Field &out,int dir,int disp) {
-	assert(0);
-      }
-      void OpDirAll  (const Field &in, std::vector<Field> &out){
-	assert(0);
-      };
-    };
-    template<class Matrix,class Field>
-    class SchurDiagMooeeOperator :  public SchurOperatorBase<Field> {
-    public:
-      Matrix &_Mat;
-      SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
-      virtual  RealD Mpc      (const Field &in, Field &out) {
-      Field tmp(in.Grid());
-      tmp.Checkerboard() = !in.Checkerboard();
-
-	_Mat.Meooe(in,tmp);
-	_Mat.MooeeInv(tmp,out);
-	_Mat.Meooe(out,tmp);
-
-	_Mat.Mooee(in,out);
-	return axpy_norm(out,-1.0,tmp,out);
-      }
-      virtual  RealD MpcDag   (const Field &in, Field &out){
-	Field tmp(in.Grid());
-
-	_Mat.MeooeDag(in,tmp);
-        _Mat.MooeeInvDag(tmp,out);
-	_Mat.MeooeDag(out,tmp);
-
-	_Mat.MooeeDag(in,out);
-	return axpy_norm(out,-1.0,tmp,out);
-      }
-    };
-    template<class Matrix,class Field>
-      class SchurDiagOneOperator :  public SchurOperatorBase<Field> {
-    protected:
-      Matrix &_Mat;
-    public:
-      SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){};
-
-      virtual  RealD Mpc      (const Field &in, Field &out) {
-	Field tmp(in.Grid());
-
-	_Mat.Meooe(in,out);
-	_Mat.MooeeInv(out,tmp);
-	_Mat.Meooe(tmp,out);
-	_Mat.MooeeInv(out,tmp);
-
-	return axpy_norm(out,-1.0,tmp,in);
-      }
-      virtual  RealD MpcDag   (const Field &in, Field &out){
-	Field tmp(in.Grid());
-
-	_Mat.MooeeInvDag(in,out);
-	_Mat.MeooeDag(out,tmp);
-	_Mat.MooeeInvDag(tmp,out);
-	_Mat.MeooeDag(out,tmp);
-
-	return axpy_norm(out,-1.0,tmp,in);
-      }
-    };
-    template<class Matrix,class Field>
-      class SchurDiagTwoOperator :  public SchurOperatorBase<Field> {
-    protected:
-      Matrix &_Mat;
-    public:
-      SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};
-
-      virtual  RealD Mpc      (const Field &in, Field &out) {
-	Field tmp(in.Grid());
-
-	_Mat.MooeeInv(in,out);
-	_Mat.Meooe(out,tmp);
-	_Mat.MooeeInv(tmp,out);
-	_Mat.Meooe(out,tmp);
-
-	return axpy_norm(out,-1.0,tmp,in);
-      }
-      virtual  RealD MpcDag   (const Field &in, Field &out){
-	Field tmp(in.Grid());
-
-	_Mat.MeooeDag(in,out);
-	_Mat.MooeeInvDag(out,tmp);
-	_Mat.MeooeDag(tmp,out);
-	_Mat.MooeeInvDag(out,tmp);
-
-	return axpy_norm(out,-1.0,tmp,in);
-      }
-    };
-
-    template<class Field>
-    class NonHermitianSchurOperatorBase :  public LinearOperatorBase<Field> 
-    {
-      public:
-        virtual RealD Mpc      (const Field& in, Field& out) = 0;
-        virtual RealD MpcDag   (const Field& in, Field& out) = 0;
-        virtual void  MpcDagMpc(const Field& in, Field& out, RealD& ni, RealD& no) {
-          Field tmp(in.Grid());
-          tmp.Checkerboard() = in.Checkerboard();
-	        ni = Mpc(in,tmp);
-	        no = MpcDag(tmp,out);
-        }
-        virtual void HermOpAndNorm(const Field& in, Field& out, RealD& n1, RealD& n2) {
-          assert(0);
-        }
-        virtual void HermOp(const Field& in, Field& out) {
-          assert(0);
-        }
-        void Op(const Field& in, Field& out) {
-          Mpc(in, out);
-        }
-        void AdjOp(const Field& in, Field& out) { 
-          MpcDag(in, out);
-        }
-        // Support for coarsening to a multigrid
-        void OpDiag(const Field& in, Field& out) {
-          assert(0); // must coarsen the unpreconditioned system
-        }
-        void OpDir(const Field& in, Field& out, int dir, int disp) {
-          assert(0);
-        }
-        void OpDirAll(const Field& in, std::vector<Field>& out){
-          assert(0);
-        };
-    };
-
-    template<class Matrix, class Field>
-    class NonHermitianSchurDiagMooeeOperator :  public NonHermitianSchurOperatorBase<Field> 
-    {
-      public:
-        Matrix& _Mat;
-        NonHermitianSchurDiagMooeeOperator(Matrix& Mat): _Mat(Mat){};
-        virtual RealD Mpc(const Field& in, Field& out) {
-          Field tmp(in.Grid());
-          tmp.Checkerboard() = !in.Checkerboard();
-
-  	      _Mat.Meooe(in, tmp);
-	        _Mat.MooeeInv(tmp, out);
-	        _Mat.Meooe(out, tmp);
-
-	        _Mat.Mooee(in, out);
-	
-          return axpy_norm(out, -1.0, tmp, out);
-        }
-        virtual RealD MpcDag(const Field& in, Field& out) {
-	        Field tmp(in.Grid());
-
-	        _Mat.MeooeDag(in, tmp);
-          _Mat.MooeeInvDag(tmp, out);
-	        _Mat.MeooeDag(out, tmp);
-
-	        _Mat.MooeeDag(in, out);
-	
-          return axpy_norm(out, -1.0, tmp, out);
-      }
-    };
-    
-    template<class Matrix,class Field>
-    class NonHermitianSchurDiagOneOperator : public NonHermitianSchurOperatorBase<Field> 
-    {
-      protected:
-        Matrix &_Mat;
-    
-      public:
-        NonHermitianSchurDiagOneOperator (Matrix& Mat): _Mat(Mat){};
-        virtual RealD Mpc(const Field& in, Field& out) {
-	        Field tmp(in.Grid());
-
-	        _Mat.Meooe(in, out);
-	        _Mat.MooeeInv(out, tmp);
-	        _Mat.Meooe(tmp, out);
-	        _Mat.MooeeInv(out, tmp);
-
-	        return axpy_norm(out, -1.0, tmp, in);
-        }
-        virtual RealD MpcDag(const Field& in, Field& out) {
-	        Field tmp(in.Grid());
-
-	        _Mat.MooeeInvDag(in, out);
-	        _Mat.MeooeDag(out, tmp);
-	        _Mat.MooeeInvDag(tmp, out);
-	        _Mat.MeooeDag(out, tmp);
-
-	        return axpy_norm(out, -1.0, tmp, in);
-        }
-    };
-
-    template<class Matrix, class Field>
-    class NonHermitianSchurDiagTwoOperator : public NonHermitianSchurOperatorBase<Field> 
-    {
-      protected:
-        Matrix& _Mat;
-    
-      public:
-        NonHermitianSchurDiagTwoOperator(Matrix& Mat): _Mat(Mat){};
-
-        virtual RealD Mpc(const Field& in, Field& out) {
-          Field tmp(in.Grid());
-
-	        _Mat.MooeeInv(in, out);
-	        _Mat.Meooe(out, tmp);
-	        _Mat.MooeeInv(tmp, out);
-	        _Mat.Meooe(out, tmp);
-
-	        return axpy_norm(out, -1.0, tmp, in);
-        }
-        virtual RealD MpcDag(const Field& in, Field& out) {
-	        Field tmp(in.Grid());
-
-          _Mat.MeooeDag(in, out);
-          _Mat.MooeeInvDag(out, tmp);
-          _Mat.MeooeDag(tmp, out);
-          _Mat.MooeeInvDag(out, tmp);
-
-          return axpy_norm(out, -1.0, tmp, in);
-        }
-    };
-
-    ///////////////////////////////////////////////////////////////////////////////////////////////////
-    // Left  handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta  -->  ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta
-    // Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta  -->  ( 1 - Moe Mee^-1 Meo Moo^-1) phi=eta ; psi = Moo^-1 phi
-    ///////////////////////////////////////////////////////////////////////////////////////////////////
-    template<class Matrix,class Field> using SchurDiagOneRH = SchurDiagTwoOperator<Matrix,Field> ;
-    template<class Matrix,class Field> using SchurDiagOneLH = SchurDiagOneOperator<Matrix,Field> ;
-    ///////////////////////////////////////////////////////////////////////////////////////////////////
-    //  Staggered use
-    ///////////////////////////////////////////////////////////////////////////////////////////////////
-    template<class Matrix,class Field>
-      class SchurStaggeredOperator :  public SchurOperatorBase<Field> {
-    protected:
-      Matrix &_Mat;
-      Field tmp;
-      RealD mass;
-      double tMpc;
-      double tIP;
-      double tMeo;
-      double taxpby_norm;
-      uint64_t ncall;
-public:
-      void Report(void)
-      {
-	std::cout << GridLogMessage << " HermOpAndNorm.Mpc "<< tMpc/ncall<<" usec "<<std::endl;
-	std::cout << GridLogMessage << " HermOpAndNorm.IP  "<< tIP /ncall<<" usec "<<std::endl;
-	std::cout << GridLogMessage << " Mpc.MeoMoe        "<< tMeo/ncall<<" usec "<<std::endl;
-	std::cout << GridLogMessage << " Mpc.axpby_norm    "<< taxpby_norm/ncall<<" usec "<<std::endl;
-      }
-      SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid()) 
-      { 
-	assert( _Mat.isTrivialEE() );
-	mass = _Mat.Mass();
-	tMpc=0;
-	tIP =0;
-        tMeo=0;
-        taxpby_norm=0;
-	ncall=0;
-      }
+template<class Field>
+class SchurOperatorBase :  public LinearOperatorBase<Field> {
+ public:
+  virtual  void Mpc      (const Field &in, Field &out) =0;
+  virtual  void MpcDag   (const Field &in, Field &out) =0;
+  virtual  void MpcDagMpc(const Field &in, Field &out) {
+    Field tmp(in.Grid());
+    tmp.Checkerboard() = in.Checkerboard();
+    Mpc(in,tmp);
+    MpcDag(tmp,out);
+  }
   virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-	ncall++;
-	tMpc-=usecond();
-    n2 = Mpc(in,out);
-	tMpc+=usecond();
-	tIP-=usecond();
-    ComplexD dot= innerProduct(in,out);
-	tIP+=usecond();
-    n1 = real(dot);
+    out.Checkerboard() = in.Checkerboard();
+    MpcDagMpc(in,out);
+    ComplexD dot= innerProduct(in,out); 
+    n1=real(dot);
+    n2=norm2(out);
   }
   virtual void HermOp(const Field &in, Field &out){
-	ncall++;
-	tMpc-=usecond();
-	_Mat.Meooe(in,out);
-	_Mat.Meooe(out,tmp);
-	tMpc+=usecond();
-	taxpby_norm-=usecond();
-	axpby(out,-1.0,mass*mass,tmp,in);
-	taxpby_norm+=usecond();
+    out.Checkerboard() = in.Checkerboard();
+    MpcDagMpc(in,out);
   }
-  virtual  RealD Mpc      (const Field &in, Field &out) 
-  {
+  void Op     (const Field &in, Field &out){
+    Mpc(in,out);
+  }
+  void AdjOp     (const Field &in, Field &out){ 
+    MpcDag(in,out);
+  }
+  // Support for coarsening to a multigrid
+  void OpDiag (const Field &in, Field &out) {
+    assert(0); // must coarsen the unpreconditioned system
+  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {
+    assert(0);
+  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){
+    assert(0);
+  };
+};
+template<class Matrix,class Field>
+  class SchurDiagMooeeOperator :  public SchurOperatorBase<Field> { // Mpc/MpcDag assembled from _Mat's Mooee, Meooe and MooeeInv pieces
+ public:
+    Matrix &_Mat; // wrapped matrix, held by reference
+    SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
+    virtual  void Mpc      (const Field &in, Field &out) { // returns nothing; callers needing a norm call norm2(out)
+      Field tmp(in.Grid());
+      tmp.Checkerboard() = !in.Checkerboard();
+      // out = Mooee in - Meooe MooeeInv Meooe in   (assuming axpy(r,a,x,y) forms r = a*x + y -- confirm)
+      _Mat.Meooe(in,tmp);
+      _Mat.MooeeInv(tmp,out);
+      _Mat.Meooe(out,tmp);
+      _Mat.Mooee(in,out);
+      axpy(out,-1.0,tmp,out);
+    }
+    virtual void MpcDag   (const Field &in, Field &out){
+      Field tmp(in.Grid());
+      // out = MooeeDag in - MeooeDag MooeeInvDag MeooeDag in   (same axpy assumption as Mpc)
+      _Mat.MeooeDag(in,tmp);
+      _Mat.MooeeInvDag(tmp,out);
+      _Mat.MeooeDag(out,tmp);
+      _Mat.MooeeDag(in,out);
+      axpy(out,-1.0,tmp,out);
+    }
+};
+template<class Matrix,class Field>
+  class SchurDiagOneOperator :  public SchurOperatorBase<Field> {
+ protected:
+    Matrix &_Mat; // wrapped matrix, held by reference
+ public:
+    SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){};
+    // Mpc: out = in - MooeeInv Meooe MooeeInv Meooe in, rightmost applied first (assuming axpy(r,a,x,y) forms r = a*x + y -- confirm)
+    virtual void Mpc      (const Field &in, Field &out) {
+      Field tmp(in.Grid());
 
+      _Mat.Meooe(in,out);
+      _Mat.MooeeInv(out,tmp);
+      _Mat.Meooe(tmp,out);
+      _Mat.MooeeInv(out,tmp);
+      axpy(out,-1.0,tmp,in);
+    }
+    virtual void MpcDag   (const Field &in, Field &out){
+      Field tmp(in.Grid());
+      // out = in - MeooeDag MooeeInvDag MeooeDag MooeeInvDag in, rightmost applied first
+      _Mat.MooeeInvDag(in,out);
+      _Mat.MeooeDag(out,tmp);
+      _Mat.MooeeInvDag(tmp,out);
+      _Mat.MeooeDag(out,tmp);
+      axpy(out,-1.0,tmp,in);
+    }
+};
+template<class Matrix,class Field>
+  class SchurDiagTwoOperator :  public SchurOperatorBase<Field> {
+ protected:
+    Matrix &_Mat; // wrapped matrix, held by reference
+ public:
+    SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};
+    // Both methods return nothing; callers needing a norm call norm2(out) themselves.
+    virtual void Mpc      (const Field &in, Field &out) {
+      Field tmp(in.Grid());
+      // out = in - Meooe MooeeInv Meooe MooeeInv in, rightmost applied first (assuming axpy(r,a,x,y) forms r = a*x + y -- confirm)
+      _Mat.MooeeInv(in,out);
+      _Mat.Meooe(out,tmp);
+      _Mat.MooeeInv(tmp,out);
+      _Mat.Meooe(out,tmp);
+      
+      axpy(out,-1.0,tmp,in);
+    }
+    virtual  void MpcDag   (const Field &in, Field &out){
+      Field tmp(in.Grid());
+      // out = in - MooeeInvDag MeooeDag MooeeInvDag MeooeDag in, rightmost applied first
+      _Mat.MeooeDag(in,out);
+      _Mat.MooeeInvDag(out,tmp);
+      _Mat.MeooeDag(tmp,out);
+      _Mat.MooeeInvDag(out,tmp);
+
+      axpy(out,-1.0,tmp,in);
+    }
+};
+
+template<class Field>
+class NonHermitianSchurOperatorBase :  public LinearOperatorBase<Field> 
+{
+ public:
+  virtual void  Mpc      (const Field& in, Field& out) = 0; // supplied by subclasses; exposed through Op()
+  virtual void  MpcDag   (const Field& in, Field& out) = 0; // supplied by subclasses; exposed through AdjOp()
+  virtual void  MpcDagMpc(const Field& in, Field& out) { // out = MpcDag(Mpc(in)), staged through a temporary
+    Field tmp(in.Grid());
+    tmp.Checkerboard() = in.Checkerboard(); // intermediate carries the input's checkerboard
+    Mpc(in,tmp);
+    MpcDag(tmp,out);
+  }
+  virtual void HermOpAndNorm(const Field& in, Field& out, RealD& n1, RealD& n2) {
+    assert(0); // unsupported on this base: always trips the assertion; n1/n2 are never written
+  }
+  virtual void HermOp(const Field& in, Field& out) {
+    assert(0); // unsupported on this base: always trips the assertion
+  }
+  void Op(const Field& in, Field& out) {
+    Mpc(in, out);
+  }
+  void AdjOp(const Field& in, Field& out) { 
+    MpcDag(in, out);
+  }
+  // Support for coarsening to a multigrid
+  void OpDiag(const Field& in, Field& out) {
+    assert(0); // must coarsen the unpreconditioned system
+  }
+  void OpDir(const Field& in, Field& out, int dir, int disp) {
+    assert(0); // not available on the preconditioned operator (see OpDiag)
+  }
+  void OpDirAll(const Field& in, std::vector<Field>& out){
+    assert(0); // not available on the preconditioned operator (see OpDiag)
+  };
+};
+
+template<class Matrix, class Field>
+class NonHermitianSchurDiagMooeeOperator :  public NonHermitianSchurOperatorBase<Field> 
+{
+ public:
+  Matrix& _Mat; // wrapped matrix, held by reference
+ NonHermitianSchurDiagMooeeOperator(Matrix& Mat): _Mat(Mat){};
+  virtual void Mpc(const Field& in, Field& out) { // returns nothing; callers needing a norm call norm2(out)
+    Field tmp(in.Grid());
+    tmp.Checkerboard() = !in.Checkerboard();
+    // tmp = Meooe MooeeInv Meooe in
+    _Mat.Meooe(in, tmp);
+    _Mat.MooeeInv(tmp, out);
+    _Mat.Meooe(out, tmp);
+    
+    _Mat.Mooee(in, out);
+    // out = Mooee in - tmp   (assuming axpy(r,a,x,y) forms r = a*x + y -- confirm)
+    axpy(out, -1.0, tmp, out);
+  }
+  virtual void MpcDag(const Field& in, Field& out) {
+    Field tmp(in.Grid());
+    // tmp = MeooeDag MooeeInvDag MeooeDag in
+    _Mat.MeooeDag(in, tmp);
+    _Mat.MooeeInvDag(tmp, out);
+    _Mat.MeooeDag(out, tmp);
+	  
+    _Mat.MooeeDag(in, out);
+    // out = MooeeDag in - tmp   (same axpy assumption as Mpc)
+    axpy(out, -1.0, tmp, out);
+  }
+};
+    
+template<class Matrix,class Field>
+class NonHermitianSchurDiagOneOperator : public NonHermitianSchurOperatorBase<Field> 
+{
+ protected:
+  Matrix &_Mat; // wrapped matrix, held by reference
+  
+ public:
+  NonHermitianSchurDiagOneOperator (Matrix& Mat): _Mat(Mat){};
+  virtual void Mpc(const Field& in, Field& out) {
+    Field tmp(in.Grid());
+    // out = in - MooeeInv Meooe MooeeInv Meooe in, rightmost applied first (assuming axpy(r,a,x,y) forms r = a*x + y -- confirm)
+    _Mat.Meooe(in, out);
+    _Mat.MooeeInv(out, tmp);
+    _Mat.Meooe(tmp, out);
+    _Mat.MooeeInv(out, tmp);
+
+    axpy(out, -1.0, tmp, in);
+  }
+  virtual void MpcDag(const Field& in, Field& out) {
+    Field tmp(in.Grid());
+    // out = in - MeooeDag MooeeInvDag MeooeDag MooeeInvDag in, rightmost applied first
+    _Mat.MooeeInvDag(in, out);
+    _Mat.MeooeDag(out, tmp);
+    _Mat.MooeeInvDag(tmp, out);
+    _Mat.MeooeDag(out, tmp);
+    
+    axpy(out, -1.0, tmp, in);
+  }
+};
+
+template<class Matrix, class Field>
+class NonHermitianSchurDiagTwoOperator : public NonHermitianSchurOperatorBase<Field> 
+{
+ protected:
+  Matrix& _Mat; // wrapped matrix, held by reference
+  
+ public:
+ NonHermitianSchurDiagTwoOperator(Matrix& Mat): _Mat(Mat){};
+
+  virtual void Mpc(const Field& in, Field& out) {
+    Field tmp(in.Grid());
+    // out = in - Meooe MooeeInv Meooe MooeeInv in, rightmost applied first (assuming axpy(r,a,x,y) forms r = a*x + y -- confirm)
+    _Mat.MooeeInv(in, out);
+    _Mat.Meooe(out, tmp);
+    _Mat.MooeeInv(tmp, out);
+    _Mat.Meooe(out, tmp);
+
+    axpy(out, -1.0, tmp, in);
+  }
+  virtual void MpcDag(const Field& in, Field& out) {
+    Field tmp(in.Grid());
+    // out = in - MooeeInvDag MeooeDag MooeeInvDag MeooeDag in, rightmost applied first
+    _Mat.MeooeDag(in, out);
+    _Mat.MooeeInvDag(out, tmp);
+    _Mat.MeooeDag(tmp, out);
+    _Mat.MooeeInvDag(out, tmp);
+
+    axpy(out, -1.0, tmp, in);
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Left  handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta  -->  ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta
+// Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta  -->  ( 1 - Moe Mee^-1 Meo Moo^-1) phi=eta ; psi = Moo^-1 phi
+///////////////////////////////////////////////////////////////////////////////////////////////////
+template<class Matrix,class Field> using SchurDiagOneRH = SchurDiagTwoOperator<Matrix,Field> ;
+template<class Matrix,class Field> using SchurDiagOneLH = SchurDiagOneOperator<Matrix,Field> ;
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//  Staggered use
+///////////////////////////////////////////////////////////////////////////////////////////////////
+template<class Matrix,class Field>
+class SchurStaggeredOperator :  public SchurOperatorBase<Field> {
+ protected:
+  Matrix &_Mat;
+  Field tmp;
+  RealD mass;
+ public:
+  SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid()) 
+  { 
+    assert( _Mat.isTrivialEE() );
+    mass = _Mat.Mass();
+  }
+  virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ // out = Mpc(in); n1 = Re<in|out>; n2 = |out|^2
+    Mpc(in,out);
+    ComplexD dot= innerProduct(in,out);
+    n1 = real(dot);
+    n2 = norm2(out); // report the real norm like the other HermOpAndNorm implementations; a constant 0 would mislead any caller reading n2
+  }
+  virtual void HermOp(const Field &in, Field &out){ // applies Mpc only; no n1/n2 are computed here
+    Mpc(in,out);
+    // Mpc already carries out the full sequence that used to be spelled out here:
+    //    _Mat.Meooe(in,out); _Mat.Meooe(out,tmp);
+    //    axpby(out,-1.0,mass*mass,tmp,in);
+  }
+  virtual  void Mpc      (const Field &in, Field &out) 
+  {
     Field tmp(in.Grid());
     Field tmp2(in.Grid());
+	
+    //    _Mat.Mooee(in,out);
+    //    _Mat.Mooee(out,tmp);
 
-    //    std::cout << GridLogIterative << " HermOp.Mpc "<<std::endl;
-    _Mat.Mooee(in,out);
-    _Mat.Mooee(out,tmp);
-    //    std::cout << GridLogIterative << " HermOp.MooeeMooee "<<std::endl;
-
-    tMeo-=usecond();
     _Mat.Meooe(in,out);
     _Mat.Meooe(out,tmp);
-    tMeo+=usecond();
-    taxpby_norm-=usecond();
-    RealD nn=axpby_norm(out,-1.0,mass*mass,tmp,in);
-    taxpby_norm+=usecond();
-    return nn;
+    axpby(out,-1.0,mass*mass,tmp,in);
   }
-  virtual  RealD MpcDag   (const Field &in, Field &out){
-    return Mpc(in,out);
+  virtual  void MpcDag   (const Field &in, Field &out){
+    Mpc(in,out);
   }
   virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
     assert(0);// Never need with staggered
@@ -548,7 +513,6 @@ public:
 };
 template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>;
 
-
 /////////////////////////////////////////////////////////////
 // Base classes for functions of operators
 /////////////////////////////////////////////////////////////
diff --git a/Grid/algorithms/SparseMatrix.h b/Grid/algorithms/SparseMatrix.h
index b959f53c..8a265b3f 100644
--- a/Grid/algorithms/SparseMatrix.h
+++ b/Grid/algorithms/SparseMatrix.h
@@ -38,16 +38,12 @@ template<class Field> class SparseMatrixBase {
 public:
   virtual GridBase *Grid(void) =0;
   // Full checkerboar operations
-  virtual RealD M    (const Field &in, Field &out)=0;
-  virtual RealD Mdag (const Field &in, Field &out)=0;
-  virtual void  MdagM(const Field &in, Field &out,RealD &ni,RealD &no) {
-    Field tmp (in.Grid());
-    ni=M(in,tmp);
-    no=Mdag(tmp,out);
-  }
+  virtual void  M    (const Field &in, Field &out)=0;
+  virtual void  Mdag (const Field &in, Field &out)=0;
   virtual void  MdagM(const Field &in, Field &out) {
-    RealD ni, no;
-    MdagM(in,out,ni,no);
+    Field tmp (in.Grid());
+    M(in,tmp);
+    Mdag(tmp,out);
   }
   virtual  void Mdiag    (const Field &in, Field &out)=0;
   virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)=0;
diff --git a/Grid/qcd/action/fermion/CayleyFermion5D.h b/Grid/qcd/action/fermion/CayleyFermion5D.h
index f27f4c23..c7d68d73 100644
--- a/Grid/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@@ -40,8 +40,8 @@ public:
 public:
 
   // override multiply
-  virtual RealD  M    (const FermionField &in, FermionField &out);
-  virtual RealD  Mdag (const FermionField &in, FermionField &out);
+  virtual void   M    (const FermionField &in, FermionField &out);
+  virtual void   Mdag (const FermionField &in, FermionField &out);
 
   // half checkerboard operations
   virtual void   Meooe       (const FermionField &in, FermionField &out);
diff --git a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
index 5aa7bfbd..2300afd3 100644
--- a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
+++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
@@ -41,8 +41,8 @@ public:
 public:
 
   // override multiply
-  virtual RealD  M    (const FermionField &in, FermionField &out);
-  virtual RealD  Mdag (const FermionField &in, FermionField &out);
+  virtual void   M    (const FermionField &in, FermionField &out);
+  virtual void   Mdag (const FermionField &in, FermionField &out);
 
   // half checkerboard operaions
   virtual void   Meooe       (const FermionField &in, FermionField &out);
diff --git a/Grid/qcd/action/fermion/DomainWallEOFAFermion.h b/Grid/qcd/action/fermion/DomainWallEOFAFermion.h
index a2d0e733..bcc97176 100644
--- a/Grid/qcd/action/fermion/DomainWallEOFAFermion.h
+++ b/Grid/qcd/action/fermion/DomainWallEOFAFermion.h
@@ -53,8 +53,8 @@ public:
   virtual void  DtildeInv  (const FermionField& in, FermionField& out);
 
   // override multiply
-  virtual RealD M          (const FermionField& in, FermionField& out);
-  virtual RealD Mdag       (const FermionField& in, FermionField& out);
+  virtual void  M          (const FermionField& in, FermionField& out);
+  virtual void  Mdag       (const FermionField& in, FermionField& out);
 
   // half checkerboard operations
   virtual void  Mooee      (const FermionField& in, FermionField& out);
diff --git a/Grid/qcd/action/fermion/FermionOperator.h b/Grid/qcd/action/fermion/FermionOperator.h
index f0c2a039..570e350d 100644
--- a/Grid/qcd/action/fermion/FermionOperator.h
+++ b/Grid/qcd/action/fermion/FermionOperator.h
@@ -58,8 +58,8 @@ public:
   virtual GridBase *GaugeRedBlackGrid(void)   =0;
 
   // override multiply
-  virtual RealD  M    (const FermionField &in, FermionField &out)=0;
-  virtual RealD  Mdag (const FermionField &in, FermionField &out)=0;
+  virtual void  M    (const FermionField &in, FermionField &out)=0;
+  virtual void  Mdag (const FermionField &in, FermionField &out)=0;
 
   // half checkerboard operaions
   virtual void   Meooe       (const FermionField &in, FermionField &out)=0;
@@ -86,15 +86,14 @@ public:
   virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
   virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
 
-
   virtual void  Mdiag  (const FermionField &in, FermionField &out) { Mooee(in,out);};   // Same as Mooee applied to both CB's
   virtual void  Mdir   (const FermionField &in, FermionField &out,int dir,int disp)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
   virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
 
 
-      virtual void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);};
+  virtual void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);};
 
-      virtual void  FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary,std::vector<double> twist) 
+  virtual void  FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary,std::vector<double> twist) 
       {
 	FFT theFFT((GridCartesian *) in.Grid());
 
diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
index 0cfae7b6..ecf44ed7 100644
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
@@ -71,8 +71,8 @@ public:
   // override multiply; cut number routines if pass dagger argument
   // and also make interface more uniformly consistent
   //////////////////////////////////////////////////////////////////
-  RealD M(const FermionField &in, FermionField &out);
-  RealD Mdag(const FermionField &in, FermionField &out);
+  void M(const FermionField &in, FermionField &out);
+  void Mdag(const FermionField &in, FermionField &out);
 
   /////////////////////////////////////////////////////////
   // half checkerboard operations
diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
index 0ce1c701..d1bb0e9c 100644
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@@ -1,4 +1,3 @@
-
 /*************************************************************************************
 
     Grid physics library, www.github.com/paboyle/Grid 
@@ -74,8 +73,8 @@ public:
   GridBase *FermionRedBlackGrid(void)    { return _FiveDimRedBlackGrid;}
 
   // full checkerboard operations; leave unimplemented as abstract for now
-  RealD  M    (const FermionField &in, FermionField &out);
-  RealD  Mdag (const FermionField &in, FermionField &out);
+  void  M    (const FermionField &in, FermionField &out);
+  void  Mdag (const FermionField &in, FermionField &out);
 
   // half checkerboard operations
   void   Meooe       (const FermionField &in, FermionField &out);
diff --git a/Grid/qcd/action/fermion/MobiusEOFAFermion.h b/Grid/qcd/action/fermion/MobiusEOFAFermion.h
index 6b214233..6e4f79eb 100644
--- a/Grid/qcd/action/fermion/MobiusEOFAFermion.h
+++ b/Grid/qcd/action/fermion/MobiusEOFAFermion.h
@@ -56,8 +56,8 @@ public:
   virtual void  DtildeInv        (const FermionField& in, FermionField& out);
 
   // override multiply
-  virtual RealD M                (const FermionField& in, FermionField& out);
-  virtual RealD Mdag             (const FermionField& in, FermionField& out);
+  virtual void  M                (const FermionField& in, FermionField& out);
+  virtual void  Mdag             (const FermionField& in, FermionField& out);
 
   // half checkerboard operations
   virtual void  Mooee            (const FermionField& in, FermionField& out);
diff --git a/Grid/qcd/action/fermion/PartialFractionFermion5D.h b/Grid/qcd/action/fermion/PartialFractionFermion5D.h
index 928abd3f..54f8547f 100644
--- a/Grid/qcd/action/fermion/PartialFractionFermion5D.h
+++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.h
@@ -47,8 +47,8 @@ public:
   void   M_internal(const FermionField &in, FermionField &out,int dag);
 
   // override multiply
-  virtual RealD  M    (const FermionField &in, FermionField &out);
-  virtual RealD  Mdag (const FermionField &in, FermionField &out);
+  virtual void   M    (const FermionField &in, FermionField &out);
+  virtual void   Mdag (const FermionField &in, FermionField &out);
 
   // half checkerboard operaions
   virtual void   Meooe       (const FermionField &in, FermionField &out);
diff --git a/Grid/qcd/action/fermion/WilsonCloverFermion.h b/Grid/qcd/action/fermion/WilsonCloverFermion.h
index 3847b0d9..4b25d00e 100644
--- a/Grid/qcd/action/fermion/WilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h
@@ -109,9 +109,8 @@ public:
     ImportGauge(_Umu);
   }
 
-  virtual RealD M(const FermionField &in, FermionField &out);
-  virtual RealD Mdag(const FermionField &in, FermionField &out);
-
+  virtual void M(const FermionField &in, FermionField &out);
+  virtual void Mdag(const FermionField &in, FermionField &out);
   virtual void Mooee(const FermionField &in, FermionField &out);
   virtual void MooeeDag(const FermionField &in, FermionField &out);
   virtual void MooeeInv(const FermionField &in, FermionField &out);
diff --git a/Grid/qcd/action/fermion/WilsonFermion.h b/Grid/qcd/action/fermion/WilsonFermion.h
index 2e0bc9bf..1c4dd3cf 100644
--- a/Grid/qcd/action/fermion/WilsonFermion.h
+++ b/Grid/qcd/action/fermion/WilsonFermion.h
@@ -78,8 +78,8 @@ public:
   // override multiply; cut number routines if pass dagger argument
   // and also make interface more uniformly consistent
   //////////////////////////////////////////////////////////////////
-  virtual RealD M(const FermionField &in, FermionField &out);
-  virtual RealD Mdag(const FermionField &in, FermionField &out);
+  virtual void  M(const FermionField &in, FermionField &out);
+  virtual void  Mdag(const FermionField &in, FermionField &out);
 
   /////////////////////////////////////////////////////////
   // half checkerboard operations
diff --git a/Grid/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h
index ea71376c..804b1d10 100644
--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@@ -1,4 +1,3 @@
-
 /*************************************************************************************
 
     Grid physics library, www.github.com/paboyle/Grid 
@@ -99,8 +98,8 @@ public:
   GridBase *FermionRedBlackGrid(void)    { return _FiveDimRedBlackGrid;}
 
   // full checkerboard operations; leave unimplemented as abstract for now
-  virtual RealD  M    (const FermionField &in, FermionField &out){assert(0); return 0.0;};
-  virtual RealD  Mdag (const FermionField &in, FermionField &out){assert(0); return 0.0;};
+  virtual void   M    (const FermionField &in, FermionField &out){assert(0);};
+  virtual void   Mdag (const FermionField &in, FermionField &out){assert(0);};
 
   // half checkerboard operations; leave unimplemented as abstract for now
   virtual void   Meooe       (const FermionField &in, FermionField &out){assert(0);};
diff --git a/Grid/qcd/action/fermion/WilsonTMFermion5D.h b/Grid/qcd/action/fermion/WilsonTMFermion5D.h
index 71acf763..982e722a 100644
--- a/Grid/qcd/action/fermion/WilsonTMFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonTMFermion5D.h
@@ -120,7 +120,8 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
     }
   }
   
-  virtual RealD M(const FermionField &in, FermionField &out) {
+  virtual void M(const FermionField &in, FermionField &out) 
+  {
     out.Checkerboard() = in.Checkerboard();
     this->Dhop(in, out, DaggerNo);
     FermionField tmp(out.Grid());
@@ -129,11 +130,12 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
       ComplexD b(0.0,this->mu[s]);
       axpbg5y_ssp(tmp,a,in,b,in,s,s);
     }
-    return axpy_norm(out, 1.0, tmp, out);
+    axpy(out, 1.0, tmp, out);
   }
   
   // needed for fast PV
-  void update(const std::vector<RealD>& _mass, const std::vector<RealD>& _mu) {
+  void update(const std::vector<RealD>& _mass, const std::vector<RealD>& _mu) 
+  {
     assert(_mass.size() == _mu.size());
     assert(_mass.size() == this->FermionGrid()->_fdimensions[0]);
     this->mass = _mass;
diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
index e379026c..e9675b36 100644
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@@ -323,7 +323,7 @@ void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField
 }
 
 template<class Impl>
-RealD CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
+void CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
 {
   FermionField Din(psi.Grid());
   
@@ -335,11 +335,10 @@ RealD CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
   axpby(chi,1.0,1.0,chi,psi); 
   
   M5D(psi,chi);
-  return(norm2(chi));
 }
 
 template<class Impl>
-RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
+void CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
 {
   // Under adjoint
   //D1+        D1- P-    ->   D1+^dag   P+ D2-^dag
@@ -354,7 +353,6 @@ RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
   M5Ddag(psi,chi);
   // ((b D_W + D_w hop terms +1) on s-diag
   axpby (chi,1.0,1.0,chi,psi); 
-  return norm2(chi);
 }
 
 // half checkerboard operations
diff --git a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h
index beeb3e00..6687800e 100644
--- a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h
@@ -94,7 +94,7 @@ void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Ap
 
 
 template<class Impl>
-RealD  ContinuedFractionFermion5D<Impl>::M           (const FermionField &psi, FermionField &chi)
+void ContinuedFractionFermion5D<Impl>::M           (const FermionField &psi, FermionField &chi)
 {
   int Ls = this->Ls;
 
@@ -116,15 +116,14 @@ RealD  ContinuedFractionFermion5D<Impl>::M           (const FermionField &psi, F
     }
     sign=-sign; 
   }
-  return norm2(chi);
 }
 template<class Impl>
-RealD  ContinuedFractionFermion5D<Impl>::Mdag        (const FermionField &psi, FermionField &chi)
+void ContinuedFractionFermion5D<Impl>::Mdag        (const FermionField &psi, FermionField &chi)
 {
   // This matrix is already hermitian. (g5 Dw) = Dw dag g5 = (g5 Dw)dag
   // The rest of matrix is symmetric.
   // Can ignore "dag"
-  return M(psi,chi);
+  M(psi,chi);
 }
 template<class Impl>
 void  ContinuedFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
diff --git a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h
index 3684fd6c..64ee4033 100644
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h
@@ -89,7 +89,7 @@ void DomainWallEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionFiel
 /*****************************************************************************************************/
 
 template<class Impl>
-RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
+void DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
 {
   FermionField Din(psi.Grid());
 
@@ -97,11 +97,10 @@ RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
   this->DW(Din, chi, DaggerNo);
   axpby(chi, 1.0, 1.0, chi, psi);
   this->M5D(psi, chi);
-  return(norm2(chi));
 }
 
 template<class Impl>
-RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
+void DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
 {
   FermionField Din(psi.Grid());
 
@@ -109,7 +108,6 @@ RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& c
   this->MeooeDag5D(Din, chi);
   this->M5Ddag(psi, chi);
   axpby(chi, 1.0, 1.0, chi, psi);
-  return(norm2(chi));
 }
 
 /********************************************************************
diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h
index 23692d49..44a201c1 100644
--- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h
@@ -548,21 +548,24 @@ void ImprovedStaggeredFermion5D<Impl>::MdirAll(const FermionField &in, std::vect
   assert(0);
 }
 template <class Impl>
-RealD ImprovedStaggeredFermion5D<Impl>::M(const FermionField &in, FermionField &out) {
+void ImprovedStaggeredFermion5D<Impl>::M(const FermionField &in, FermionField &out) 
+{
   out.Checkerboard() = in.Checkerboard();
   Dhop(in, out, DaggerNo);
-  return axpy_norm(out, mass, in, out);
+  axpy(out, mass, in, out);
 }
 
 template <class Impl>
-RealD ImprovedStaggeredFermion5D<Impl>::Mdag(const FermionField &in, FermionField &out) {
+void ImprovedStaggeredFermion5D<Impl>::Mdag(const FermionField &in, FermionField &out) 
+{
   out.Checkerboard() = in.Checkerboard();
   Dhop(in, out, DaggerYes);
-  return axpy_norm(out, mass, in, out);
+  axpy(out, mass, in, out);
 }
 
 template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out) {
+void ImprovedStaggeredFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out) 
+{
   if (in.Checkerboard() == Odd) {
     DhopEO(in, out, DaggerNo);
   } else {
@@ -570,7 +573,8 @@ void ImprovedStaggeredFermion5D<Impl>::Meooe(const FermionField &in, FermionFiel
   }
 }
 template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
+void ImprovedStaggeredFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out) 
+{
   if (in.Checkerboard() == Odd) {
     DhopEO(in, out, DaggerYes);
   } else {
@@ -579,27 +583,30 @@ void ImprovedStaggeredFermion5D<Impl>::MeooeDag(const FermionField &in, FermionF
 }
 
 template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out) {
+void ImprovedStaggeredFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out) 
+{
   out.Checkerboard() = in.Checkerboard();
   typename FermionField::scalar_type scal(mass);
   out = scal * in;
 }
 
 template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
+void ImprovedStaggeredFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out) 
+{
   out.Checkerboard() = in.Checkerboard();
   Mooee(in, out);
 }
 
 template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
+void ImprovedStaggeredFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out) 
+{
   out.Checkerboard() = in.Checkerboard();
   out = (1.0 / (mass)) * in;
 }
 
 template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::MooeeInvDag(const FermionField &in,
-						   FermionField &out) {
+void ImprovedStaggeredFermion5D<Impl>::MooeeInvDag(const FermionField &in,FermionField &out) 
+{
   out.Checkerboard() = in.Checkerboard();
   MooeeInv(in, out);
 }
diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
index 37675da0..57f4cb89 100644
--- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
@@ -171,21 +171,24 @@ void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin,const
 /////////////////////////////
 
 template <class Impl>
-RealD ImprovedStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out) {
+void ImprovedStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out) 
+{
   out.Checkerboard() = in.Checkerboard();
   Dhop(in, out, DaggerNo);
-  return axpy_norm(out, mass, in, out);
+  axpy(out, mass, in, out);
 }
 
 template <class Impl>
-RealD ImprovedStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
+void ImprovedStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out) 
+{
   out.Checkerboard() = in.Checkerboard();
   Dhop(in, out, DaggerYes);
-  return axpy_norm(out, mass, in, out);
+  axpy(out, mass, in, out);
 }
 
 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
+void ImprovedStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out) 
+{
   if (in.Checkerboard() == Odd) {
     DhopEO(in, out, DaggerNo);
   } else {
@@ -193,7 +196,8 @@ void ImprovedStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField
   }
 }
 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
+void ImprovedStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) 
+{
   if (in.Checkerboard() == Odd) {
     DhopEO(in, out, DaggerYes);
   } else {
@@ -202,27 +206,30 @@ void ImprovedStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionFie
 }
 
 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
+void ImprovedStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out) 
+{
   out.Checkerboard() = in.Checkerboard();
   typename FermionField::scalar_type scal(mass);
   out = scal * in;
 }
 
 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
+void ImprovedStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) 
+{
   out.Checkerboard() = in.Checkerboard();
   Mooee(in, out);
 }
 
 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
+void ImprovedStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) 
+{
   out.Checkerboard() = in.Checkerboard();
   out = (1.0 / (mass)) * in;
 }
 
 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in,
-						 FermionField &out) {
+void ImprovedStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in,FermionField &out) 
+{
   out.Checkerboard() = in.Checkerboard();
   MooeeInv(in, out);
 }
@@ -234,7 +241,8 @@ void ImprovedStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in,
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, 
 						   GaugeField & mat,
-						   const FermionField &A, const FermionField &B, int dag) {
+						   const FermionField &A, const FermionField &B, int dag) 
+{
   assert((dag == DaggerNo) || (dag == DaggerYes));
 
   Compressor compressor;
@@ -284,8 +292,8 @@ void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGauge
 }
 
 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
-
+void ImprovedStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) 
+{
   conformable(U.Grid(), _grid);
   conformable(U.Grid(), V.Grid());
   conformable(U.Grid(), mat.Grid());
@@ -296,8 +304,8 @@ void ImprovedStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionFie
 }
 
 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
-
+void ImprovedStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) 
+{
   conformable(U.Grid(), _cbgrid);
   conformable(U.Grid(), V.Grid());
   conformable(U.Grid(), mat.Grid());
@@ -310,8 +318,8 @@ void ImprovedStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionF
 }
 
 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
-
+void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) 
+{
   conformable(U.Grid(), _cbgrid);
   conformable(U.Grid(), V.Grid());
   conformable(U.Grid(), mat.Grid());
diff --git a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h
index 256423e6..9b9db178 100644
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h
@@ -166,7 +166,7 @@ void MobiusEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& c
 /*****************************************************************************************************/
 
 template<class Impl>
-RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
+void MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
 {
   FermionField Din(psi.Grid());
 
@@ -174,11 +174,10 @@ RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
   this->DW(Din, chi, DaggerNo);
   axpby(chi, 1.0, 1.0, chi, psi);
   this->M5D(psi, chi);
-  return(norm2(chi));
 }
 
 template<class Impl>
-RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
+void MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
 {
   FermionField Din(psi.Grid());
 
@@ -186,7 +185,6 @@ RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
   this->MeooeDag5D(Din, chi);
   this->M5Ddag(psi, chi);
   axpby(chi, 1.0, 1.0, chi, psi);
-  return(norm2(chi));
 }
 
 /********************************************************************
diff --git a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
index edc674cc..0206828b 100644
--- a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
@@ -269,16 +269,14 @@ void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi
 }
 
 template<class Impl>
-RealD  PartialFractionFermion5D<Impl>::M    (const FermionField &in, FermionField &out)
+void PartialFractionFermion5D<Impl>::M    (const FermionField &in, FermionField &out)
 {
   M_internal(in,out,DaggerNo);
-  return norm2(out);
 }
 template<class Impl>
-RealD  PartialFractionFermion5D<Impl>::Mdag (const FermionField &in, FermionField &out)
+void PartialFractionFermion5D<Impl>::Mdag (const FermionField &in, FermionField &out)
 {
   M_internal(in,out,DaggerYes);
-  return norm2(out);
 }
 
 template<class Impl>
diff --git a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
index 9d99d9e7..36447153 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
@@ -35,7 +35,7 @@ NAMESPACE_BEGIN(Grid);
 
 // *NOT* EO
 template <class Impl>
-RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
 {
   FermionField temp(out.Grid());
 
@@ -47,11 +47,10 @@ RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
   Mooee(in, temp);
 
   out += temp;
-  return norm2(out);
 }
 
 template <class Impl>
-RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
 {
   FermionField temp(out.Grid());
 
@@ -63,7 +62,6 @@ RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
   MooeeDag(in, temp);
 
   out += temp;
-  return norm2(out);
 }
 
 template <class Impl>
diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
index be05fcf8..5267e0c1 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@@ -102,21 +102,24 @@ void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
 /////////////////////////////
 
 template <class Impl>
-RealD WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) {
+void WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) 
+{
   out.Checkerboard() = in.Checkerboard();
   Dhop(in, out, DaggerNo);
-  return axpy_norm(out, diag_mass, in, out);
+  axpy(out, diag_mass, in, out);
 }
 
 template <class Impl>
-RealD WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
+void WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out) 
+{
   out.Checkerboard() = in.Checkerboard();
   Dhop(in, out, DaggerYes);
-  return axpy_norm(out, diag_mass, in, out);
+  axpy(out, diag_mass, in, out);
 }
 
 template <class Impl>
-void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
+void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out) 
+{
   if (in.Checkerboard() == Odd) {
     DhopEO(in, out, DaggerNo);
   } else {
@@ -125,7 +128,8 @@ void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
 }
 
 template <class Impl>
-void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
+void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) 
+{
   if (in.Checkerboard() == Odd) {
     DhopEO(in, out, DaggerYes);
   } else {
@@ -134,26 +138,30 @@ void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
 }
   
 template <class Impl>
-void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
+void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) 
+{
   out.Checkerboard() = in.Checkerboard();
   typename FermionField::scalar_type scal(diag_mass);
   out = scal * in;
 }
 
 template <class Impl>
-void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
+void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) 
+{
   out.Checkerboard() = in.Checkerboard();
   Mooee(in, out);
 }
 
 template<class Impl>
-void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
+void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) 
+{
   out.Checkerboard() = in.Checkerboard();
   out = (1.0/(diag_mass))*in;
 }
   
 template<class Impl>
-void WilsonFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) {
+void WilsonFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) 
+{
   out.Checkerboard() = in.Checkerboard();
   MooeeInv(in,out);
 }
@@ -249,7 +257,8 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
 }
 
 template <class Impl>
-void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
+void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) 
+{
   conformable(U.Grid(), _grid);
   conformable(U.Grid(), V.Grid());
   conformable(U.Grid(), mat.Grid());
@@ -260,7 +269,8 @@ void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, cons
 }
 
 template <class Impl>
-void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
+void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) 
+{
   conformable(U.Grid(), _cbgrid);
   conformable(U.Grid(), V.Grid());
   //conformable(U.Grid(), mat.Grid()); not general, leaving as a comment (Guido)
@@ -274,7 +284,8 @@ void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, co
 }
 
 template <class Impl>
-void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
+void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) 
+{
   conformable(U.Grid(), _cbgrid);
   conformable(U.Grid(), V.Grid());
   //conformable(U.Grid(), mat.Grid());
@@ -287,7 +298,8 @@ void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, co
 }
 
 template <class Impl>
-void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) {
+void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) 
+{
   conformable(in.Grid(), _grid);  // verifies full grid
   conformable(in.Grid(), out.Grid());
 
@@ -297,7 +309,8 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da
 }
 
 template <class Impl>
-void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) {
+void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) 
+{
   conformable(in.Grid(), _cbgrid);    // verifies half grid
   conformable(in.Grid(), out.Grid());  // drops the cb check
 
@@ -308,7 +321,8 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
 }
 
 template <class Impl>
-void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) {
+void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) 
+{
   conformable(in.Grid(), _cbgrid);    // verifies half grid
   conformable(in.Grid(), out.Grid());  // drops the cb check
 
@@ -386,7 +400,8 @@ template <class Impl>
 void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
 						      DoubledGaugeField &U,
 						      const FermionField &in,
-						      FermionField &out, int dag) {
+						      FermionField &out, int dag) 
+{
   assert((dag == DaggerNo) || (dag == DaggerYes));
 
   Compressor compressor(dag);
@@ -436,7 +451,8 @@ template <class Impl>
 void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
                                        DoubledGaugeField &U,
                                        const FermionField &in,
-                                       FermionField &out, int dag) {
+                                       FermionField &out, int dag) 
+{
   assert((dag == DaggerNo) || (dag == DaggerYes));
   Compressor compressor(dag);
   st.HaloExchange(in, compressor);
diff --git a/Grid/qcd/modules/Registration.h b/Grid/qcd/modules/Registration.h
index ec28f020..459e1d0b 100644
--- a/Grid/qcd/modules/Registration.h
+++ b/Grid/qcd/modules/Registration.h
@@ -80,10 +80,10 @@ static Registrar<OneFlavourRatioEOFModule<FermionImplementationPolicy>,
 
 static Registrar< ConjugateGradientModule<WilsonFermionR::FermionField>,   
                   HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CGWFmodXMLInit("ConjugateGradient"); 
-static Registrar< BiCGSTABModule<WilsonFermionR::FermionField>,   
-                  HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CGWFmodXMLInit("BiCGSTAB"); 
-static Registrar< ConjugateResidualModule<WilsonFermionR::FermionField>,   
-                  HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CRWFmodXMLInit("ConjugateResidual"); 
+//static Registrar< BiCGSTABModule<WilsonFermionR::FermionField>,   
+//                  HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CGWFmodXMLInit("BiCGSTAB"); 
+//static Registrar< ConjugateResidualModule<WilsonFermionR::FermionField>,   
+//                  HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CRWFmodXMLInit("ConjugateResidual"); 
 
 // add the staggered, scalar versions here
 
diff --git a/Grid/qcd/smearing/GaugeConfiguration.h b/Grid/qcd/smearing/GaugeConfiguration.h
index f4d00c72..6f2ff2e7 100644
--- a/Grid/qcd/smearing/GaugeConfiguration.h
+++ b/Grid/qcd/smearing/GaugeConfiguration.h
@@ -49,7 +49,7 @@ public:
 
 private:
   const unsigned int smearingLevels;
-  Smear_Stout<Gimpl> StoutSmearing;
+  Smear_Stout<Gimpl> &StoutSmearing;
   std::vector<GaugeField> SmearedSet;
 
   // Member functions
diff --git a/Grid/qcd/utils/CovariantCshift.h b/Grid/qcd/utils/CovariantCshift.h
index ed96f3bf..6ac69150 100644
--- a/Grid/qcd/utils/CovariantCshift.h
+++ b/Grid/qcd/utils/CovariantCshift.h
@@ -52,6 +52,26 @@ namespace PeriodicBC {
     tmp = adj(Link)*field;
     return Cshift(tmp,mu,-1);// moves towards positive mu
   }
+
+  // Forward covariant shift of an unevaluated unary expression: build a Lattice from it, then forward to the Lattice overload.
+  template<class gauge,typename Op, typename T1> auto
+    CovShiftForward(const Lattice<gauge> &Link, int mu,
+		    const LatticeUnaryExpression<Op,T1> &expr)
+    -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> 
+  {
+    Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> closed(expr);
+    return CovShiftForward(Link,mu,closed);
+  }
+  // Backward covariant shift of an unevaluated unary expression: build a Lattice from it, then forward to the Lattice overload.
+  template<class gauge,typename Op, typename T1> auto
+    CovShiftBackward(const Lattice<gauge> &Link, int mu,
+		     const LatticeUnaryExpression<Op,T1> &expr)
+    -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> 
+  {
+    Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> arg(expr);
+    return CovShiftBackward(Link,mu,arg); // must dispatch to the backward shift, not CovShiftForward
+  }
+
 }
 
 
@@ -122,6 +142,26 @@ namespace ConjugateBC {
     return Cshift(tmp,mu,-1);// moves towards positive mu
   }
 
+  // Forward covariant shift of an unevaluated unary expression: build a Lattice from it, then forward to the Lattice overload.
+  template<class gauge,typename Op, typename T1> auto
+    CovShiftForward(const Lattice<gauge> &Link, int mu,
+		    const LatticeUnaryExpression<Op,T1> &expr)
+    -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> 
+  {
+    Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> closed(expr);
+    return CovShiftForward(Link,mu,closed);
+  }
+  // Backward covariant shift of an unevaluated unary expression: build a Lattice from it, then forward to the Lattice overload.
+  template<class gauge,typename Op, typename T1> auto
+    CovShiftBackward(const Lattice<gauge> &Link, int mu,
+		     const LatticeUnaryExpression<Op,T1> &expr)
+    -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> 
+  {
+    Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> arg(expr);
+    return CovShiftBackward(Link,mu,arg); // must dispatch to the backward shift, not CovShiftForward
+  }
+
+
 }
 
 

From cb7ee375628d00ca387bb9eba13ead6cf4cc6c73 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 12 May 2020 17:56:40 -0400
Subject: [PATCH 43/51] Close expressions in arg to cshift

---
 Grid/cshift/Cshift.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/Grid/cshift/Cshift.h b/Grid/cshift/Cshift.h
index 07ec8412..9150579c 100644
--- a/Grid/cshift/Cshift.h
+++ b/Grid/cshift/Cshift.h
@@ -49,4 +49,29 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_COMMS_SHMEM
 #include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
 #endif 
+
+NAMESPACE_BEGIN(Grid);
+
+// Cshift overload for an unevaluated unary lattice expression:
+// closure(expr) is handed to Cshift in place of the expression itself.
+template<typename Op, typename T1> auto
+Cshift(const LatticeUnaryExpression<Op,T1> &expr,int dim,int shift)
+  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> 
+{ return Cshift(closure(expr),dim,shift); }
+// Cshift overload for an unevaluated binary lattice expression:
+// closure(expr) is handed to Cshift in place of the expression itself.
+template <class Op, class T1, class T2> auto
+Cshift(const LatticeBinaryExpression<Op,T1,T2> &expr,int dim,int shift)
+  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> 
+{ return Cshift(closure(expr),dim,shift); }
+// Cshift overload for an unevaluated trinary lattice expression:
+// closure(expr) is handed to Cshift in place of the expression itself.
+template <class Op, class T1, class T2, class T3> auto
+Cshift(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr,int dim,int shift)
+  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),
+				   eval(0, expr.arg2),
+				   eval(0, expr.arg3)))> 
+{ return Cshift(closure(expr),dim,shift); }
+NAMESPACE_END(Grid);
+
 #endif

From 0e3c49f687c7fdc8398b9c2859053265ed1da121 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 12 May 2020 17:57:01 -0400
Subject: [PATCH 44/51] TransposeIndex was broken by Christoph

---
 Grid/lattice/Lattice.h           | 2 +-
 Grid/lattice/Lattice_transpose.h | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/Grid/lattice/Lattice.h b/Grid/lattice/Lattice.h
index 13c0a885..036633b4 100644
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@@ -31,7 +31,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_ET.h>
 #include <Grid/lattice/Lattice_arith.h>
 #include <Grid/lattice/Lattice_trace.h>
-//#include <Grid/lattice/Lattice_transpose.h>
+#include <Grid/lattice/Lattice_transpose.h>
 #include <Grid/lattice/Lattice_local.h>
 #include <Grid/lattice/Lattice_reduction.h>
 #include <Grid/lattice/Lattice_peekpoke.h>
diff --git a/Grid/lattice/Lattice_transpose.h b/Grid/lattice/Lattice_transpose.h
index 9b0b3483..6fe08c10 100644
--- a/Grid/lattice/Lattice_transpose.h
+++ b/Grid/lattice/Lattice_transpose.h
@@ -38,6 +38,7 @@ NAMESPACE_BEGIN(Grid);
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Transpose
 ////////////////////////////////////////////////////////////////////////////////////////////////////
+/*
 template<class vobj>
 inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
   Lattice<vobj> ret(lhs.Grid());
@@ -48,7 +49,8 @@ inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
   });
   return ret;
 };
-    
+*/    
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Index level dependent transpose
 ////////////////////////////////////////////////////////////////////////////////////////////////////

From 8b5b55b68228f1433108bde7d6dbef68dedc05fa Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 12 May 2020 17:57:24 -0400
Subject: [PATCH 45/51] Make all tests compile with current Grid, mostly MdagM
 removal of norms fixes but a few minor issues fixed too

---
 tests/IO/Test_openqcd_io.cc                     |  6 +++---
 tests/core/Test_contfrac_even_odd.cc            |  8 ++++----
 tests/core/Test_dwf_eofa_even_odd.cc            |  8 ++++----
 tests/core/Test_dwf_even_odd.cc                 |  8 ++++----
 tests/core/Test_gpwilson_even_odd.cc            |  8 ++++----
 tests/core/Test_mobius_eofa_even_odd.cc         |  8 ++++----
 tests/core/Test_mobius_even_odd.cc              |  8 ++++----
 tests/core/Test_staggered.cc                    |  8 ++++----
 tests/core/Test_staggered5D.cc                  |  8 ++++----
 tests/core/Test_wilson_even_odd.cc              |  8 ++++----
 tests/core/Test_wilson_twisted_mass_even_odd.cc |  8 ++++----
 tests/core/Test_zmobius_even_odd.cc             |  8 ++++----
 tests/hmc/Test_multishift_sqrt.cc               |  1 +
 tests/solver/Test_dwf_fpgcr.cc                  | 11 ++++++-----
 tests/solver/Test_staggered_block_cg_prec.cc    | 11 -----------
 tests/solver/Test_zMADWF_prec.cc                |  2 +-
 16 files changed, 55 insertions(+), 64 deletions(-)

diff --git a/tests/IO/Test_openqcd_io.cc b/tests/IO/Test_openqcd_io.cc
index 83b498c2..765509a9 100644
--- a/tests/IO/Test_openqcd_io.cc
+++ b/tests/IO/Test_openqcd_io.cc
@@ -28,13 +28,12 @@ See the full license in the file "LICENSE" in the top level distribution directo
 
 #include <Grid/Grid.h>
 
-#if defined(GRID_COMMS_NONE)
-#error This test requires Grid compiled with MPI
-#endif
+
 
 using namespace Grid;
 
 int main(int argc, char** argv) {
+#if !defined(GRID_COMMS_NONE)
   Grid_init(&argc, &argv);
 
   auto simd_layout = GridDefaultSimd(Nd, vComplex::Nsimd());
@@ -81,4 +80,5 @@ int main(int argc, char** argv) {
   // clang-format on
 
   Grid_finalize();
+#endif
 }
diff --git a/tests/core/Test_contfrac_even_odd.cc b/tests/core/Test_contfrac_even_odd.cc
index 25affd00..5311f869 100644
--- a/tests/core/Test_contfrac_even_odd.cc
+++ b/tests/core/Test_contfrac_even_odd.cc
@@ -238,11 +238,11 @@ void  TestWhat(What & Ddwf,
   RealD t1,t2;
 
   SchurDiagMooeeOperator<What,LatticeFermion> HermOpEO(Ddwf);
-  HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2);
-  HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2);
+  HermOpEO.MpcDagMpc(chi_e,dchi_e);
+  HermOpEO.MpcDagMpc(chi_o,dchi_o);
   
-  HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2);
-  HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2);
+  HermOpEO.MpcDagMpc(phi_e,dphi_e);
+  HermOpEO.MpcDagMpc(phi_o,dphi_o);
 
   pDce = innerProduct(phi_e,dchi_e);
   pDco = innerProduct(phi_o,dchi_o);
diff --git a/tests/core/Test_dwf_eofa_even_odd.cc b/tests/core/Test_dwf_eofa_even_odd.cc
index 1d2f2909..01fff9ea 100644
--- a/tests/core/Test_dwf_eofa_even_odd.cc
+++ b/tests/core/Test_dwf_eofa_even_odd.cc
@@ -218,11 +218,11 @@ int main (int argc, char ** argv)
     RealD t1,t2;
 
     SchurDiagMooeeOperator<DomainWallEOFAFermionR,LatticeFermion> HermOpEO(Ddwf);
-    HermOpEO.MpcDagMpc(chi_e, dchi_e, t1, t2);
-    HermOpEO.MpcDagMpc(chi_o, dchi_o, t1, t2);
+    HermOpEO.MpcDagMpc(chi_e, dchi_e);
+    HermOpEO.MpcDagMpc(chi_o, dchi_o);
 
-    HermOpEO.MpcDagMpc(phi_e, dphi_e, t1, t2);
-    HermOpEO.MpcDagMpc(phi_o, dphi_o, t1, t2);
+    HermOpEO.MpcDagMpc(phi_e, dphi_e);
+    HermOpEO.MpcDagMpc(phi_o, dphi_o);
 
     pDce = innerProduct(phi_e, dchi_e);
     pDco = innerProduct(phi_o, dchi_o);
diff --git a/tests/core/Test_dwf_even_odd.cc b/tests/core/Test_dwf_even_odd.cc
index d654e588..6093ee8f 100644
--- a/tests/core/Test_dwf_even_odd.cc
+++ b/tests/core/Test_dwf_even_odd.cc
@@ -216,11 +216,11 @@ int main (int argc, char ** argv)
 
 
   SchurDiagMooeeOperator<DomainWallFermionR,LatticeFermion> HermOpEO(Ddwf);
-  HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2);
-  HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2);
+  HermOpEO.MpcDagMpc(chi_e,dchi_e);
+  HermOpEO.MpcDagMpc(chi_o,dchi_o);
 
-  HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2);
-  HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2);
+  HermOpEO.MpcDagMpc(phi_e,dphi_e);
+  HermOpEO.MpcDagMpc(phi_o,dphi_o);
 
   pDce = innerProduct(phi_e,dchi_e);
   pDco = innerProduct(phi_o,dchi_o);
diff --git a/tests/core/Test_gpwilson_even_odd.cc b/tests/core/Test_gpwilson_even_odd.cc
index ac4cde99..bf37f4d5 100644
--- a/tests/core/Test_gpwilson_even_odd.cc
+++ b/tests/core/Test_gpwilson_even_odd.cc
@@ -201,11 +201,11 @@ int main (int argc, char ** argv)
   RealD t1,t2;
 
   SchurDiagMooeeOperator<GparityWilsonFermionR,FermionField> HermOpEO(Dw);
-  HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2);
-  HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2);
+  HermOpEO.MpcDagMpc(chi_e,dchi_e);
+  HermOpEO.MpcDagMpc(chi_o,dchi_o);
 
-  HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2);
-  HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2);
+  HermOpEO.MpcDagMpc(phi_e,dphi_e);
+  HermOpEO.MpcDagMpc(phi_o,dphi_o);
 
   pDce = innerProduct(phi_e,dchi_e);
   pDco = innerProduct(phi_o,dchi_o);
diff --git a/tests/core/Test_mobius_eofa_even_odd.cc b/tests/core/Test_mobius_eofa_even_odd.cc
index bfd53c72..68091229 100644
--- a/tests/core/Test_mobius_eofa_even_odd.cc
+++ b/tests/core/Test_mobius_eofa_even_odd.cc
@@ -220,11 +220,11 @@ int main (int argc, char ** argv)
     RealD t1,t2;
 
     SchurDiagMooeeOperator<MobiusEOFAFermionR,LatticeFermion> HermOpEO(Ddwf);
-    HermOpEO.MpcDagMpc(chi_e, dchi_e, t1, t2);
-    HermOpEO.MpcDagMpc(chi_o, dchi_o, t1, t2);
+    HermOpEO.MpcDagMpc(chi_e, dchi_e);
+    HermOpEO.MpcDagMpc(chi_o, dchi_o);
 
-    HermOpEO.MpcDagMpc(phi_e, dphi_e, t1, t2);
-    HermOpEO.MpcDagMpc(phi_o, dphi_o, t1, t2);
+    HermOpEO.MpcDagMpc(phi_e, dphi_e);
+    HermOpEO.MpcDagMpc(phi_o, dphi_o);
 
     pDce = innerProduct(phi_e, dchi_e);
     pDco = innerProduct(phi_o, dchi_o);
diff --git a/tests/core/Test_mobius_even_odd.cc b/tests/core/Test_mobius_even_odd.cc
index 0a035dc8..7f808cac 100644
--- a/tests/core/Test_mobius_even_odd.cc
+++ b/tests/core/Test_mobius_even_odd.cc
@@ -266,11 +266,11 @@ int main (int argc, char ** argv)
 
 
   SchurDiagMooeeOperator<MobiusFermionR,LatticeFermion> HermOpEO(Ddwf);
-  HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2);
-  HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2);
+  HermOpEO.MpcDagMpc(chi_e,dchi_e);
+  HermOpEO.MpcDagMpc(chi_o,dchi_o);
 
-  HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2);
-  HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2);
+  HermOpEO.MpcDagMpc(phi_e,dphi_e);
+  HermOpEO.MpcDagMpc(phi_o,dphi_o);
 
   pDce = innerProduct(phi_e,dchi_e);
   pDco = innerProduct(phi_o,dchi_o);
diff --git a/tests/core/Test_staggered.cc b/tests/core/Test_staggered.cc
index c85d4090..1f42ff0d 100644
--- a/tests/core/Test_staggered.cc
+++ b/tests/core/Test_staggered.cc
@@ -270,11 +270,11 @@ int main (int argc, char ** argv)
   pickCheckerboard(Odd ,phi_o,phi);
 
   SchurDiagMooeeOperator<ImprovedStaggeredFermionR,FermionField> HermOpEO(Ds);
-  HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2);
-  HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2);
+  HermOpEO.MpcDagMpc(chi_e,dchi_e);
+  HermOpEO.MpcDagMpc(chi_o,dchi_o);
 
-  HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2);
-  HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2);
+  HermOpEO.MpcDagMpc(phi_e,dphi_e);
+  HermOpEO.MpcDagMpc(phi_o,dphi_o);
 
   pDce = innerProduct(phi_e,dchi_e);
   pDco = innerProduct(phi_o,dchi_o);
diff --git a/tests/core/Test_staggered5D.cc b/tests/core/Test_staggered5D.cc
index 7055d183..68b8e537 100644
--- a/tests/core/Test_staggered5D.cc
+++ b/tests/core/Test_staggered5D.cc
@@ -289,11 +289,11 @@ int main (int argc, char ** argv)
   pickCheckerboard(Odd ,phi_o,phi);
 
   SchurDiagMooeeOperator<ImprovedStaggeredFermion5DR,FermionField> HermOpEO(Ds);
-  HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2);
-  HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2);
+  HermOpEO.MpcDagMpc(chi_e,dchi_e);
+  HermOpEO.MpcDagMpc(chi_o,dchi_o);
 
-  HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2);
-  HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2);
+  HermOpEO.MpcDagMpc(phi_e,dphi_e);
+  HermOpEO.MpcDagMpc(phi_o,dphi_o);
 
   pDce = innerProduct(phi_e,dchi_e);
   pDco = innerProduct(phi_o,dchi_o);
diff --git a/tests/core/Test_wilson_even_odd.cc b/tests/core/Test_wilson_even_odd.cc
index 8f325f1c..dc49cf81 100644
--- a/tests/core/Test_wilson_even_odd.cc
+++ b/tests/core/Test_wilson_even_odd.cc
@@ -207,11 +207,11 @@ int main (int argc, char ** argv)
   RealD t1,t2;
 
   SchurDiagMooeeOperator<WilsonFermionR,LatticeFermion> HermOpEO(Dw);
-  HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2);
-  HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2);
+  HermOpEO.MpcDagMpc(chi_e,dchi_e);
+  HermOpEO.MpcDagMpc(chi_o,dchi_o);
 
-  HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2);
-  HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2);
+  HermOpEO.MpcDagMpc(phi_e,dphi_e);
+  HermOpEO.MpcDagMpc(phi_o,dphi_o);
 
   pDce = innerProduct(phi_e,dchi_e);
   pDco = innerProduct(phi_o,dchi_o);
diff --git a/tests/core/Test_wilson_twisted_mass_even_odd.cc b/tests/core/Test_wilson_twisted_mass_even_odd.cc
index 58b0b60f..ba80fd0e 100644
--- a/tests/core/Test_wilson_twisted_mass_even_odd.cc
+++ b/tests/core/Test_wilson_twisted_mass_even_odd.cc
@@ -208,11 +208,11 @@ int main (int argc, char ** argv)
   RealD t1,t2;
 
   SchurDiagMooeeOperator<WilsonTMFermionR,LatticeFermion> HermOpEO(Dw);
-  HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2);
-  HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2);
+  HermOpEO.MpcDagMpc(chi_e,dchi_e);
+  HermOpEO.MpcDagMpc(chi_o,dchi_o);
 
-  HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2);
-  HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2);
+  HermOpEO.MpcDagMpc(phi_e,dphi_e);
+  HermOpEO.MpcDagMpc(phi_o,dphi_o);
 
   pDce = innerProduct(phi_e,dchi_e);
   pDco = innerProduct(phi_o,dchi_o);
diff --git a/tests/core/Test_zmobius_even_odd.cc b/tests/core/Test_zmobius_even_odd.cc
index 1150930b..a52e9bc2 100644
--- a/tests/core/Test_zmobius_even_odd.cc
+++ b/tests/core/Test_zmobius_even_odd.cc
@@ -280,11 +280,11 @@ int main (int argc, char ** argv)
 
 
   SchurDiagMooeeOperator<ZMobiusFermionR,LatticeFermion> HermOpEO(Ddwf);
-  HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2);
-  HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2);
+  HermOpEO.MpcDagMpc(chi_e,dchi_e);
+  HermOpEO.MpcDagMpc(chi_o,dchi_o);
 
-  HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2);
-  HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2);
+  HermOpEO.MpcDagMpc(phi_e,dphi_e);
+  HermOpEO.MpcDagMpc(phi_o,dphi_o);
 
   pDce = innerProduct(phi_e,dchi_e);
   pDco = innerProduct(phi_o,dchi_o);
diff --git a/tests/hmc/Test_multishift_sqrt.cc b/tests/hmc/Test_multishift_sqrt.cc
index 834e6fc8..f8477220 100644
--- a/tests/hmc/Test_multishift_sqrt.cc
+++ b/tests/hmc/Test_multishift_sqrt.cc
@@ -57,6 +57,7 @@ public:
   // Support for coarsening to a multigrid
   void OpDiag (const Field &in, Field &out) {};
   void OpDir  (const Field &in, Field &out,int dir,int disp){};
+  void OpDirAll  (const Field &in, std::vector<Field> &out) {}; // Abstract base
 
   void Op     (const Field &in, Field &out){
     out = scale * in;
diff --git a/tests/solver/Test_dwf_fpgcr.cc b/tests/solver/Test_dwf_fpgcr.cc
index 226bd933..156f678a 100644
--- a/tests/solver/Test_dwf_fpgcr.cc
+++ b/tests/solver/Test_dwf_fpgcr.cc
@@ -70,9 +70,6 @@ int main (int argc, char ** argv)
 
   SU3::HotConfiguration(RNG4,Umu);
 
-  TrivialPrecon<LatticeFermion> simple;
-
-  PrecGeneralisedConjugateResidual<LatticeFermion> PGCR(1.0e-6,10000,simple,4,160);
 
   ConjugateResidual<LatticeFermion> CR(1.0e-6,10000);
 
@@ -86,15 +83,19 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage<<"* Solving with MdagM VPGCR "<<std::endl;
   std::cout<<GridLogMessage<<"*********************************************************"<<std::endl;
   MdagMLinearOperator<DomainWallFermionR,LatticeFermion> HermOp(Ddwf);
+  TrivialPrecon<LatticeFermion> simple;
+  PrecGeneralisedConjugateResidual<LatticeFermion> PGCR(1.0e-6,10000,HermOp,simple,4,160);
+
   result=Zero();
-  PGCR(HermOp,src,result);
+  PGCR(src,result);
 
   std::cout<<GridLogMessage<<"*********************************************************"<<std::endl;
   std::cout<<GridLogMessage<<"* Solving with g5-VPGCR "<<std::endl;
   std::cout<<GridLogMessage<<"*********************************************************"<<std::endl;
   Gamma5R5HermitianLinearOperator<DomainWallFermionR,LatticeFermion> g5HermOp(Ddwf);
+  PrecGeneralisedConjugateResidual<LatticeFermion> PGCR5(1.0e-6,10000,g5HermOp,simple,4,160);
   result=Zero();
-  PGCR(g5HermOp,src,result);
+  PGCR5(src,result);
 
   std::cout<<GridLogMessage<<"*********************************************************"<<std::endl;
   std::cout<<GridLogMessage<<"* Solving with MdagM-CR "<<std::endl;
diff --git a/tests/solver/Test_staggered_block_cg_prec.cc b/tests/solver/Test_staggered_block_cg_prec.cc
index 95156939..2499fc8a 100644
--- a/tests/solver/Test_staggered_block_cg_prec.cc
+++ b/tests/solver/Test_staggered_block_cg_prec.cc
@@ -128,9 +128,7 @@ int main (int argc, char ** argv)
     std::cout<<GridLogMessage << "usec    =   "<< (t2-t1)<<std::endl;
     std::cout<<GridLogMessage << "flops   =   "<< flops<<std::endl;
     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t2-t1)<<std::endl;
-    HermOp4d.Report();
   }
-  Ds4d.Report();
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 
 
@@ -148,9 +146,7 @@ int main (int argc, char ** argv)
     std::cout<<GridLogMessage << "usec    =   "<< (t2-t1)<<std::endl;
     std::cout<<GridLogMessage << "flops   =   "<< flops<<std::endl;
     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t2-t1)<<std::endl;
-    HermOp.Report();
   }
-  Ds.Report();
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -167,10 +163,8 @@ int main (int argc, char ** argv)
     std::cout<<GridLogMessage << "usec    =   "<< (t2-t1)<<std::endl;
     std::cout<<GridLogMessage << "flops   =   "<< flops<<std::endl;
     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t2-t1)<<std::endl;
-    HermOp.Report();
   }
 
-  Ds.Report();
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -187,9 +181,7 @@ int main (int argc, char ** argv)
     std::cout<<GridLogMessage << "usec    =   "<< (t2-t1)<<std::endl;
     std::cout<<GridLogMessage << "flops   =   "<< flops<<std::endl;
     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t2-t1)<<std::endl;
-    HermOp.Report();
   }
-  Ds.Report();
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -206,9 +198,7 @@ int main (int argc, char ** argv)
     std::cout<<GridLogMessage << "usec    =   "<< (t2-t1)<<std::endl;
     std::cout<<GridLogMessage << "flops   =   "<< flops<<std::endl;
     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t2-t1)<<std::endl;
-    HermOp.Report();
   }
-  Ds.Report();
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 
   std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
@@ -232,7 +222,6 @@ int main (int argc, char ** argv)
     std::cout<<GridLogMessage << "usec    =   "<< (t2-t1)<<std::endl;
     std::cout<<GridLogMessage << "flops   =   "<< flops<<std::endl;
     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t2-t1)<<std::endl;
-    //    HermOp4d.Report();
   }
 
 
diff --git a/tests/solver/Test_zMADWF_prec.cc b/tests/solver/Test_zMADWF_prec.cc
index cbcdd055..13b4c3b2 100644
--- a/tests/solver/Test_zMADWF_prec.cc
+++ b/tests/solver/Test_zMADWF_prec.cc
@@ -220,7 +220,7 @@ void run(const TestParams &params){
 
   GridStopWatch CGTimer;
   
-  typename RunParamsOuter::HermOpType<MobiusFermionD> HermOpEO_outer(D_outer);
+  typename RunParamsOuter::template HermOpType<MobiusFermionD> HermOpEO_outer(D_outer);
 
   CGTimer.Start();
   CG_outer(HermOpEO_outer, src_o_outer, result_o_outer);

From 2e652431e5248461d4fecb75b37b0b00fc40e405 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 12 May 2020 18:56:47 -0400
Subject: [PATCH 46/51] Fix for no-compile on Summit

---
 Grid/qcd/smearing/GaugeConfiguration.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/qcd/smearing/GaugeConfiguration.h b/Grid/qcd/smearing/GaugeConfiguration.h
index 6f2ff2e7..f4d00c72 100644
--- a/Grid/qcd/smearing/GaugeConfiguration.h
+++ b/Grid/qcd/smearing/GaugeConfiguration.h
@@ -49,7 +49,7 @@ public:
 
 private:
   const unsigned int smearingLevels;
-  Smear_Stout<Gimpl> &StoutSmearing;
+  Smear_Stout<Gimpl> StoutSmearing;
   std::vector<GaugeField> SmearedSet;
 
   // Member functions

From 5aa60be17d5ee1263c13f5b65190980f8d05e58e Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Fri, 15 May 2020 20:00:34 +0100
Subject: [PATCH 47/51] SerialisableClassName method for serialisable enum, and
 boolean to test if a serialisable object is an enum

---
 Grid/serialisation/MacroMagic.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/Grid/serialisation/MacroMagic.h b/Grid/serialisation/MacroMagic.h
index 7866327e..0495b91e 100644
--- a/Grid/serialisation/MacroMagic.h
+++ b/Grid/serialisation/MacroMagic.h
@@ -114,7 +114,8 @@ THE SOFTWARE.
 #define GRID_MACRO_WRITE_MEMBER(A,B) ::Grid::write(WR,#B,obj. B);
 
 #define GRID_SERIALIZABLE_CLASS_MEMBERS(cname,...)\
-  std::string SerialisableClassName(void) const {return std::string(#cname);}	\
+static inline std::string SerialisableClassName(void) {return std::string(#cname);}	\
+static constexpr bool isEnum = false; \
 GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__))\
 template <typename T>\
 static inline void write(Writer<T> &WR,const std::string &s, const cname &obj){ \
@@ -162,6 +163,8 @@ public:\
 public:\
   accelerator name(void)     : value_(undefname) {};		\
   accelerator name(int value): value_(value) {};			\
+  static inline std::string SerialisableClassName(void) {return std::string(#name);}\
+  static constexpr bool isEnum = true; \
   template <typename T>\
   static inline void write(::Grid::Writer<T> &WR,const std::string &s, const name &obj) \
   {\

From d708c0258dd572777064de3c5e9885020f1a0781 Mon Sep 17 00:00:00 2001
From: ferben <ferben@debian.felix.com>
Date: Tue, 19 May 2020 13:48:00 +0100
Subject: [PATCH 48/51] some cleanup in BaryonUtils

---
 Grid/qcd/utils/BaryonUtils.h | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h
index fa2f3376..9205802b 100644
--- a/Grid/qcd/utils/BaryonUtils.h
+++ b/Grid/qcd/utils/BaryonUtils.h
@@ -300,9 +300,15 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
   auto v2 = q2_left.View();
   auto v3 = q3_left.View();
 
- // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
-  thread_for(ss,grid->oSites(),{
-  //for(int ss=0; ss < grid->oSites(); ss++){
+  Real bytes =0.;
+  for (int ie=0; ie < 6 ; ie++){
+    //bytes += 3. * (grid->oSites() * 12. * 12. * sizeof(Complex)) * wick_contraction[ie]; // size of the 3 propagatorFields
+    bytes += grid->oSites() * 36. * 4. * 4. * sizeof(Complex) * wick_contraction[ie];  //number of operations
+  }
+  double t=0.;
+  t =-usecond();
+
+  accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
 
     auto D1 = v1[ss];
     auto D2 = v2[ss];
@@ -312,6 +318,11 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
     baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
     vbaryon_corr[ss] = result; 
   }  );//end loop over lattice sites
+
+  t += usecond();
+
+  std::cout << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl;
+
 }
 template <class FImpl>
 template <class mobj, class robj>
@@ -595,8 +606,7 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
   auto vd_tf = qd_tf.View();
   auto vs_ti = qs_ti.View();
 
- // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
-  thread_for(ss,grid->oSites(),{
+  accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
     auto Dq_loop = vq_loop[ss];
     auto Dd_tf = vd_tf[ss];
     auto Ds_ti = vs_ti[ss];
@@ -637,8 +647,7 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti,
   auto vd_tf = qd_tf.View();
   auto vs_ti = qs_ti.View();
 
- // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
-  thread_for(ss,grid->oSites(),{
+  accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
     auto Dq_ti = vq_ti[ss];
     auto Dq_tf = vq_tf[ss];
     auto Dd_tf = vd_tf[ss];

From 1f154fe652d90eb0de6ab3a662dedf4db6143ffb Mon Sep 17 00:00:00 2001
From: ferben <ferben@debian.felix.com>
Date: Tue, 19 May 2020 13:48:56 +0100
Subject: [PATCH 49/51] some cleanup in BaryonUtils

---
 Grid/qcd/utils/BaryonUtils.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h
index 9205802b..8c8ed59c 100644
--- a/Grid/qcd/utils/BaryonUtils.h
+++ b/Grid/qcd/utils/BaryonUtils.h
@@ -305,7 +305,7 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
     //bytes += 3. * (grid->oSites() * 12. * 12. * sizeof(Complex)) * wick_contraction[ie]; // size of the 3 propagatorFields
     bytes += grid->oSites() * 36. * 4. * 4. * sizeof(Complex) * wick_contraction[ie];  //number of operations
   }
-  double t=0.;
+  Real t=0.;
   t =-usecond();
 
   accelerator_for(ss, grid->oSites(), grid->Nsimd(), {

From 6c6812a5cab2f4920d355501427116cc79b090e4 Mon Sep 17 00:00:00 2001
From: ferben <ferben@debian.felix.com>
Date: Wed, 20 May 2020 12:26:57 +0100
Subject: [PATCH 50/51] GB/s output

---
 Grid/qcd/utils/BaryonUtils.h | 116 +++++++++++++++++++----------------
 1 file changed, 63 insertions(+), 53 deletions(-)

diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h
index 8c8ed59c..23267270 100644
--- a/Grid/qcd/utils/BaryonUtils.h
+++ b/Grid/qcd/utils/BaryonUtils.h
@@ -46,7 +46,7 @@ public:
   typedef typename SpinMatrixField::vector_object sobj;
 
   static const int epsilon[6][3] ;
-  static const Complex epsilon_sgn[6];
+  static const Real epsilon_sgn[6];
 
   private: 
   template <class mobj, class robj>
@@ -151,13 +151,16 @@ public:
 
 template <class FImpl> 
 const int BaryonUtils<FImpl>::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
-template <class FImpl> 
+/*template <class FImpl> 
 const Complex BaryonUtils<FImpl>::epsilon_sgn[6] = {Complex(1),
 						    Complex(1),
 						    Complex(1),
 						    Complex(-1),
 						    Complex(-1),
 						    Complex(-1)};
+*/
+template <class FImpl> 
+const Real BaryonUtils<FImpl>::epsilon_sgn[6] = {1.,1.,1.,-1.,-1.,-1.};
 
 //This is the old version
 template <class FImpl>
@@ -174,13 +177,11 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
 						 robj &result)
 {
 
-  Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4)
-
+    Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4)
     auto gD1a = GammaA_left * GammaA_right * D1;
     auto gD1b = GammaA_left * g4 * GammaA_right * D1;
-    auto pD1 = 0.5* (gD1a + (double)parity * gD1b);
+    auto pD1 = 0.5* (gD1a + (Real)parity * gD1b);
     auto gD3 = GammaB_right * D3;
-
     auto D2g = D2 * GammaB_left;
     auto pD1g = pD1 * GammaB_left;
     auto gD3g = gD3 * GammaB_left;
@@ -193,72 +194,78 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
         int a_right = epsilon[ie_right][0]; //a'
         int b_right = epsilon[ie_right][1]; //b'
         int c_right = epsilon[ie_right][2]; //c'
-	Complex ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right];
+	Real ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right];
         //This is the \delta_{456}^{123} part
 	if (wick_contraction[0]){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
             auto eepD1 = ee * pD1()(gamma_left,gamma_left)(c_right,c_left);
-	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
-	  for (int beta_left=0; beta_left<Ns; beta_left++){
-	    auto D2g_ab = D2g()(alpha_right,beta_left)(a_right,a_left);
-	    auto gD3_ab = gD3()(alpha_right,beta_left)(b_right,b_left);
-	        result()()() += eepD1*D2g_ab*gD3_ab;
-          }}}
+	    for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+	    for (int beta_left=0; beta_left<Ns; beta_left++){
+	      auto D2g_ab = D2g()(alpha_right,beta_left)(a_right,a_left);
+	      auto gD3_ab = gD3()(alpha_right,beta_left)(b_right,b_left);
+	      result()()() += eepD1*D2g_ab*gD3_ab;
+            }}
+	  }
   	}	  
         //This is the \delta_{456}^{231} part
 	if (wick_contraction[1]){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
             auto gD3_ag = gD3()(alpha_right,gamma_left)(b_right,c_left);
-	  for (int beta_left=0; beta_left<Ns; beta_left++){
-            auto eepD1g_gb = ee * pD1g()(gamma_left,beta_left)(c_right,a_left);
-	    auto D2_ab = D2()(alpha_right,beta_left)(a_right,b_left);
-		result()()() += eepD1g_gb*D2_ab*gD3_ag;
-          }}}
+	    for (int beta_left=0; beta_left<Ns; beta_left++){
+              auto eepD1g_gb = ee * pD1g()(gamma_left,beta_left)(c_right,a_left);
+	      auto D2_ab = D2()(alpha_right,beta_left)(a_right,b_left);
+	      result()()() += eepD1g_gb*D2_ab*gD3_ag;
+            }
+	  }}
         }	  
         //This is the \delta_{456}^{312} part
 	if (wick_contraction[2]){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	    auto D2_ag = D2()(alpha_right,gamma_left)(a_right,c_left);
-	  for (int beta_left=0; beta_left<Ns; beta_left++){
-            auto eepD1_gb = ee * pD1()(gamma_left,beta_left)(c_right,b_left);
-	    auto gD3g_ab = gD3g()(alpha_right,beta_left)(b_right,a_left);
-		result()()() += eepD1_gb*D2_ag*gD3g_ab;
-          }}}
+	    for (int beta_left=0; beta_left<Ns; beta_left++){
+              auto eepD1_gb = ee * pD1()(gamma_left,beta_left)(c_right,b_left);
+	      auto gD3g_ab = gD3g()(alpha_right,beta_left)(b_right,a_left);
+                result()()() += eepD1_gb*D2_ag*gD3g_ab;
+            }
+	  }}
         }	  
         //This is the \delta_{456}^{132} part
 	if (wick_contraction[3]){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
             auto eepD1 = ee * pD1()(gamma_left,gamma_left)(c_right,c_left);
-	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
-	  for (int beta_left=0; beta_left<Ns; beta_left++){
-	    auto D2_ab = D2()(alpha_right,beta_left)(a_right,b_left);
-	    auto gD3g_ab = gD3g()(alpha_right,beta_left)(b_right,a_left);
-    		result()()() -= eepD1*D2_ab*gD3g_ab;
-          }}}
+	    for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+	    for (int beta_left=0; beta_left<Ns; beta_left++){
+	      auto D2_ab = D2()(alpha_right,beta_left)(a_right,b_left);
+	      auto gD3g_ab = gD3g()(alpha_right,beta_left)(b_right,a_left);
+    	      result()()() -= eepD1*D2_ab*gD3g_ab;
+            }}
+	  }
         }	  
         //This is the \delta_{456}^{321} part
 	if (wick_contraction[4]){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
             auto gD3_ag = gD3()(alpha_right,gamma_left)(b_right,c_left);
-	  for (int beta_left=0; beta_left<Ns; beta_left++){
-            auto eepD1_gb = ee * pD1()(gamma_left,beta_left)(c_right,b_left);
-	    auto D2g_ab = D2g()(alpha_right,beta_left)(a_right,a_left);
-		result()()() -= eepD1_gb*D2g_ab*gD3_ag;
-          }}}
+	    for (int beta_left=0; beta_left<Ns; beta_left++){
+              auto eepD1_gb = ee * pD1()(gamma_left,beta_left)(c_right,b_left);
+	      auto D2g_ab = D2g()(alpha_right,beta_left)(a_right,a_left);
+	      result()()() -= eepD1_gb*D2g_ab*gD3_ag;
+            }
+	  }}
         }	  
         //This is the \delta_{456}^{213} part
 	if (wick_contraction[5]){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	    auto D2_ag = D2()(alpha_right,gamma_left)(a_right,c_left);
-	  for (int beta_left=0; beta_left<Ns; beta_left++){
-            auto eepD1g_gb = ee * pD1g()(gamma_left,beta_left)(c_right,a_left);
-	    auto gD3_ab = gD3()(alpha_right,beta_left)(b_right,b_left);
-    	        result()()() -= eepD1g_gb*D2_ag*gD3_ab;
-          }}}
+	    for (int beta_left=0; beta_left<Ns; beta_left++){
+              auto eepD1g_gb = ee * pD1g()(gamma_left,beta_left)(c_right,a_left);
+	      auto gD3_ab = gD3()(alpha_right,beta_left)(b_right,b_left);
+    	      result()()() -= eepD1g_gb*D2_ag*gD3_ab;
+            }
+	  }}
         }	  
       }
     }
@@ -282,10 +289,10 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
   assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
 
   std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
-    std::cout << "GammaA (left) " << (GammaA_left.g) <<  std::endl;
-    std::cout << "GammaB (left) " << (GammaB_left.g) <<  std::endl;
-    std::cout << "GammaA (right) " << (GammaA_right.g) <<  std::endl;
-    std::cout << "GammaB (right) " << (GammaB_right.g) <<  std::endl;
+  std::cout << "GammaA (left) " << (GammaA_left.g) <<  std::endl;
+  std::cout << "GammaB (left) " << (GammaB_left.g) <<  std::endl;
+  std::cout << "GammaA (right) " << (GammaA_right.g) <<  std::endl;
+  std::cout << "GammaB (right) " << (GammaB_right.g) <<  std::endl;
  
   assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
 
@@ -301,19 +308,22 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
   auto v3 = q3_left.View();
 
   Real bytes =0.;
+  bytes += grid->oSites() * (432.*sizeof(vComplex) + 126.*sizeof(int) + 36.*sizeof(Real));
   for (int ie=0; ie < 6 ; ie++){
-    //bytes += 3. * (grid->oSites() * 12. * 12. * sizeof(Complex)) * wick_contraction[ie]; // size of the 3 propagatorFields
-    bytes += grid->oSites() * 36. * 4. * 4. * sizeof(Complex) * wick_contraction[ie];  //number of operations
+    if(ie==0 or ie==3){
+       bytes += grid->oSites() * (4.*sizeof(int) + 4752.*sizeof(vComplex)) * wick_contraction[ie];
+    }
+    else{
+       bytes += grid->oSites() * (64.*sizeof(int) + 5184.*sizeof(vComplex)) * wick_contraction[ie];
+    }
   }
   Real t=0.;
   t =-usecond();
 
   accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
-
     auto D1 = v1[ss];
     auto D2 = v2[ss];
     auto D3 = v3[ss];
-
     vobj result=Zero();
     baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
     vbaryon_corr[ss] = result; 
@@ -343,10 +353,10 @@ void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
   assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
 
   std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
-    std::cout << "GammaA (left) " << (GammaA_left.g) <<  std::endl;
-    std::cout << "GammaB (left) " << (GammaB_left.g) <<  std::endl;
-    std::cout << "GammaA (right) " << (GammaA_right.g) <<  std::endl;
-    std::cout << "GammaB (right) " << (GammaB_right.g) <<  std::endl;
+  std::cout << "GammaA (left) " << (GammaA_left.g) <<  std::endl;
+  std::cout << "GammaB (left) " << (GammaB_left.g) <<  std::endl;
+  std::cout << "GammaA (right) " << (GammaA_right.g) <<  std::endl;
+  std::cout << "GammaB (right) " << (GammaB_right.g) <<  std::endl;
  
   assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
 
@@ -354,8 +364,8 @@ void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
   for (int ie=0; ie < 6 ; ie++)
     wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0;
 
-     result=Zero();
-     baryon_site<decltype(D1),decltype(result)>(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
+  result=Zero();
+  baryon_site<decltype(D1),decltype(result)>(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
 }
 
 /***********************************************************************

From c5c2dbc0cef01817a4442093dc94e440f7193e09 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 2 Jun 2020 14:21:49 -0400
Subject: [PATCH 51/51] Optional CUDA info

---
 Grid/allocator/AlignedAllocator.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h
index 77167299..2976aed7 100644
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -92,6 +92,13 @@ public:
   static bool        debug;
 };
 
+#ifdef GRID_NVCC /* CUDA build: profilerCudaMeminfo logs device memory usage at GridLogDebug level */
+#define profilerCudaMeminfo /* query free/total device memory; print only if the query succeeded */ \
+  { size_t f = 0, t = 0; if (cudaMemGetInfo(&f, &t) == cudaSuccess) { std::cout << GridLogDebug << "[Memory debug] Cuda free "<<f<<"/"<<t << std::endl; } }
+#else /* non-CUDA build: the macro expands to nothing, so `profilerCudaMeminfo;` is an empty statement */
+#define profilerCudaMeminfo
+#endif /* GRID_NVCC */
+
 #define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
 #define profilerDebugPrint						\
   if (MemoryProfiler::stats)						\
@@ -106,7 +113,8 @@ public:
 		<< std::endl;						\
       std::cout << GridLogDebug << "[Memory debug] freed  : " << memString(s->totalFreed) \
 		<< std::endl;						\
-    }
+    }									\
+  profilerCudaMeminfo;
 
 #define profilerAllocate(bytes)						\
   if (MemoryProfiler::stats)						\