2023-01-27 16:35:49 +00:00
28 changed files with 1831 additions and 1224 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,4 @@
 /.vscode
-build*
 .buildutils
 autom4te.cache
 config.*
@@ -12,4 +11,7 @@ install-sh
 missing
 Makefile.in
 .DS_Store
-*~
+*~
+/*/env
+/*/build
+/Grid/json.hpp
--- a/Grid/.clang-format
+++ b/Grid/.clang-format
@@ -0,0 +1,14 @@
+{
+  BasedOnStyle: LLVM,
+  UseTab: Never,
+  IndentWidth: 2,
+  TabWidth: 2,
+  BreakBeforeBraces: Allman,
+  AllowShortIfStatementsOnASingleLine: false,
+  IndentCaseLabels: false,
+  ColumnLimit: 90,
+  AccessModifierOffset: -4,
+  NamespaceIndentation: All,
+  FixNamespaceComments: false,
+  SortIncludes: true,
+}
--- a/Grid/Benchmark_Grid.cpp
+++ b/Grid/Benchmark_Grid.cpp
--- a/Grid/Benchmark_IO.cpp
+++ b/Grid/Benchmark_IO.cpp
@@ -32,23 +32,13 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 #ifdef HAVE_LIME
 using namespace Grid;

-std::string filestem(const int l)
-{
-  return "iobench_l" + std::to_string(l);
-}
+std::string filestem(const int l) { return "iobench_l" + std::to_string(l); }

-int vol(const int i)
-{
-  return BENCH_IO_LMIN + 2 * i;
-}
+int vol(const int i) { return BENCH_IO_LMIN + 2 * i; }

-int volInd(const int l)
-{
-  return (l - BENCH_IO_LMIN) / 2;
-}
+int volInd(const int l) { return (l - BENCH_IO_LMIN) / 2; }

-template <typename Mat>
-void stats(Mat &mean, Mat &stdDev, const std::vector<Mat> &data)
+template <typename Mat> void stats(Mat &mean, Mat &stdDev, const std::vector<Mat> &data)
 {
  auto nr = data[0].rows(), nc = data[0].cols();
  Eigen::MatrixXd sqSum(nr, nc);
@@ -66,11 +56,11 @@ void stats(Mat &mean, Mat &stdDev, const std::vector<Mat> &data)
  mean /= n;
 }

-#define grid_printf(...)        \
-  {                             \
-    char _buf[1024];            \
-    sprintf(_buf, __VA_ARGS__); \
-    MSG << _buf;                \
+#define grid_printf(...)                                                                 \
+  {                                                                                      \
+    char _buf[1024];                                                                     \
+    sprintf(_buf, __VA_ARGS__);                                                          \
+    MSG << _buf;                                                                         \
  }

 enum
@@ -173,47 +163,49 @@ int main(int argc, char **argv)
  MSG << "SUMMARY" << std::endl;
  MSG << BIGSEP << std::endl;
  MSG << "Summary of individual results (all results in MB/s)." << std::endl;
-  MSG << "Every second colum gives the standard deviation of the previous column." << std::endl;
+  MSG << "Every second colum gives the standard deviation of the previous column."
+      << std::endl;
  MSG << std::endl;
-  grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n",
-              "L", "std read", "std dev", "std write", "std dev",
-              "Grid read", "std dev", "Grid write", "std dev");
+  grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n", "L", "std read", "std dev",
+              "std write", "std dev", "Grid read", "std dev", "Grid write", "std dev");
  for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
  {
-    grid_printf("%4d %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n",
-                l, mean(volInd(l), sRead), stdDev(volInd(l), sRead),
-                mean(volInd(l), sWrite), stdDev(volInd(l), sWrite),
-                mean(volInd(l), gRead), stdDev(volInd(l), gRead),
-                mean(volInd(l), gWrite), stdDev(volInd(l), gWrite));
+    grid_printf("%4d %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", l,
+                mean(volInd(l), sRead), stdDev(volInd(l), sRead), mean(volInd(l), sWrite),
+                stdDev(volInd(l), sWrite), mean(volInd(l), gRead),
+                stdDev(volInd(l), gRead), mean(volInd(l), gWrite),
+                stdDev(volInd(l), gWrite));
  }
  MSG << std::endl;
-  MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)" << std::endl;
+  MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)"
+      << std::endl;
  MSG << std::endl;
-  grid_printf("%4s %12s %12s %12s %12s\n",
-              "L", "std read", "std write", "Grid read", "Grid write");
+  grid_printf("%4s %12s %12s %12s %12s\n", "L", "std read", "std write", "Grid read",
+              "Grid write");
  for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
  {
-    grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n",
-                l, rob(volInd(l), sRead), rob(volInd(l), sWrite),
-                rob(volInd(l), gRead), rob(volInd(l), gWrite));
+    grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n", l, rob(volInd(l), sRead),
+                rob(volInd(l), sWrite), rob(volInd(l), gRead), rob(volInd(l), gWrite));
  }
  MSG << std::endl;
-  MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX << "^4 (all results in MB/s)." << std::endl;
-  MSG << "Every second colum gives the standard deviation of the previous column." << std::endl;
+  MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX
+      << "^4 (all results in MB/s)." << std::endl;
+  MSG << "Every second colum gives the standard deviation of the previous column."
+      << std::endl;
  MSG << std::endl;
-  grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n",
-              "std read", "std dev", "std write", "std dev",
-              "Grid read", "std dev", "Grid write", "std dev");
-  grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n",
-              avMean(sRead), avStdDev(sRead), avMean(sWrite), avStdDev(sWrite),
-              avMean(gRead), avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
+  grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n", "std read", "std dev",
+              "std write", "std dev", "Grid read", "std dev", "Grid write", "std dev");
+  grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", avMean(sRead),
+              avStdDev(sRead), avMean(sWrite), avStdDev(sWrite), avMean(gRead),
+              avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
  MSG << std::endl;
-  MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)" << std::endl;
+  MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)"
+      << std::endl;
  MSG << std::endl;
-  grid_printf("%12s %12s %12s %12s\n",
-              "std read", "std write", "Grid read", "Grid write");
-  grid_printf("%12.1f %12.1f %12.1f %12.1f\n",
-              avRob(sRead), avRob(sWrite), avRob(gRead), avRob(gWrite));
+  grid_printf("%12s %12s %12s %12s\n", "std read", "std write", "Grid read",
+              "Grid write");
+  grid_printf("%12.1f %12.1f %12.1f %12.1f\n", avRob(sRead), avRob(sWrite), avRob(gRead),
+              avRob(gWrite));

  Grid_finalize();

--- a/Grid/Benchmark_IO.hpp
+++ b/Grid/Benchmark_IO.hpp
@@ -20,9 +20,9 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.

 #include <Grid/Grid.h>
 #define MSG std::cout << GridLogMessage
-#define SEP \
+#define SEP                                                                              \
  "-----------------------------------------------------------------------------"
-#define BIGSEP \
+#define BIGSEP                                                                           \
  "============================================================================="
 #ifdef HAVE_LIME

@@ -36,16 +36,15 @@ namespace Grid

  // AP 06/10/2020: Standard C version in case one is suspicious of the C++ API
  //
-  // template <typename Field>
-  // void stdWrite(const std::string filestem, Field &vec)
+  // template <typename Field> void stdWrite(const std::string filestem, Field &vec)
  // {
-  //   std::string   rankStr = std::to_string(vec.Grid()->ThisRank());
-  //   std::FILE     *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "wb");
-  //   size_t        size;
-  //   uint32_t      crc;
+  //   std::string rankStr = std::to_string(vec.Grid()->ThisRank());
+  //   std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "wb");
+  //   size_t size;
+  //   uint32_t crc;
  //   GridStopWatch ioWatch, crcWatch;

-  //   size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
+  //   size = vec.Grid()->lSites() * sizeof(typename Field::scalar_object);
  //   autoView(vec_v, vec, CpuRead);
  //   crcWatch.Start();
  //   crc = GridChecksum::crc32(vec_v.cpu_ptr, size);
@@ -53,36 +52,39 @@ namespace Grid
  //   crcWatch.Stop();
  //   MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
  //   ioWatch.Start();
-  //   std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file);
+  //   std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object),
+  //               vec.Grid()->lSites(), file);
  //   ioWatch.Stop();
  //   std::fclose(file);
  //   size *= vec.Grid()->ProcessorCount();
  //   auto &p = BinaryIO::lastPerf;
-  //   p.size            = size;
-  //   p.time            = ioWatch.useconds();
-  //   p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
+  //   p.size = size;
+  //   p.time = ioWatch.useconds();
+  //   p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
  //   MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
-  //       << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
+  //       << ", "
+  //       << p.mbytesPerSecond
+  //       << " MB/s" << std::endl;
  //   MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
  // }
-  //
-  // template <typename Field>
-  // void stdRead(Field &vec, const std::string filestem)
+
+  // template <typename Field> void stdRead(Field &vec, const std::string filestem)
  // {
-  //   std::string   rankStr = std::to_string(vec.Grid()->ThisRank());
-  //   std::FILE     *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "rb");
-  //   size_t        size;
-  //   uint32_t      crcRead, crcData;
+  //   std::string rankStr = std::to_string(vec.Grid()->ThisRank());
+  //   std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "rb");
+  //   size_t size;
+  //   uint32_t crcRead, crcData;
  //   GridStopWatch ioWatch, crcWatch;

-  //   size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
+  //   size = vec.Grid()->lSites() * sizeof(typename Field::scalar_object);
  //   crcWatch.Start();
  //   std::fread(&crcRead, sizeof(uint32_t), 1, file);
  //   crcWatch.Stop();
  //   {
  //     autoView(vec_v, vec, CpuWrite);
  //     ioWatch.Start();
-  //     std::fread(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file);
+  //     std::fread(vec_v.cpu_ptr, sizeof(typename Field::scalar_object),
+  //                vec.Grid()->lSites(), file);
  //     ioWatch.Stop();
  //     std::fclose(file);
  //   }
@@ -96,19 +98,19 @@ namespace Grid
  //   assert(crcData == crcRead);
  //   size *= vec.Grid()->ProcessorCount();
  //   auto &p = BinaryIO::lastPerf;
-  //   p.size            = size;
-  //   p.time            = ioWatch.useconds();
-  //   p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
-  //   MSG << "Std I/O read: Read " <<  p.size << " bytes in " << ioWatch.Elapsed()
-  //       << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
+  //   p.size = size;
+  //   p.time = ioWatch.useconds();
+  //   p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
+  //   MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() << ", "
+  //       << p.mbytesPerSecond << " MB/s" << std::endl;
  //   MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
  // }

-  template <typename Field>
-  void stdWrite(const std::string filestem, Field &vec)
+  template <typename Field> void stdWrite(const std::string filestem, Field &vec)
  {
    std::string rankStr = std::to_string(vec.Grid()->ThisRank());
-    std::ofstream file(filestem + "." + rankStr + ".bin", std::ios::out | std::ios::binary);
+    std::ofstream file(filestem + "." + rankStr + ".bin",
+                       std::ios::out | std::ios::binary);
    size_t size, sizec;
    uint32_t crc;
    GridStopWatch ioWatch, crcWatch;
@@ -130,16 +132,16 @@ namespace Grid
    p.size = size;
    p.time = ioWatch.useconds();
    p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
-    MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
-        << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
+    MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() << ", "
+        << p.mbytesPerSecond << " MB/s" << std::endl;
    MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
  }

-  template <typename Field>
-  void stdRead(Field &vec, const std::string filestem)
+  template <typename Field> void stdRead(Field &vec, const std::string filestem)
  {
    std::string rankStr = std::to_string(vec.Grid()->ThisRank());
-    std::ifstream file(filestem + "." + rankStr + ".bin", std::ios::in | std::ios::binary);
+    std::ifstream file(filestem + "." + rankStr + ".bin",
+                       std::ios::in | std::ios::binary);
    size_t size, sizec;
    uint32_t crcRead, crcData;
    GridStopWatch ioWatch, crcWatch;
@@ -168,13 +170,12 @@ namespace Grid
    p.size = size;
    p.time = ioWatch.useconds();
    p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
-    MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed()
-        << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
+    MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() << ", "
+        << p.mbytesPerSecond << " MB/s" << std::endl;
    MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
  }

-  template <typename Field>
-  void limeWrite(const std::string filestem, Field &vec)
+  template <typename Field> void limeWrite(const std::string filestem, Field &vec)
  {
    emptyUserRecord record;
    ScidacWriter binWriter(vec.Grid()->IsBoss());
@@ -184,8 +185,7 @@ namespace Grid
    binWriter.close();
  }

-  template <typename Field>
-  void limeRead(Field &vec, const std::string filestem)
+  template <typename Field> void limeRead(Field &vec, const std::string filestem)
  {
    emptyUserRecord record;
    ScidacReader binReader;
@@ -225,12 +225,13 @@ namespace Grid

  template <typename Field>
  void writeBenchmark(const Coordinate &latt, const std::string filename,
-                      const WriterFn<Field> &write,
-                      const unsigned int Ls = 1, const bool rb = false)
+                      const WriterFn<Field> &write, const unsigned int Ls = 1,
+                      const bool rb = false)
  {
    auto mpi = GridDefaultMpi();
    auto simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd());
-    std::shared_ptr<GridCartesian> gBasePt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi));
+    std::shared_ptr<GridCartesian> gBasePt(
+        SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi));
    std::shared_ptr<GridBase> gPt;
    std::random_device rd;

@@ -251,12 +252,13 @@ namespace Grid

  template <typename Field>
  void readBenchmark(const Coordinate &latt, const std::string filename,
-                     const ReaderFn<Field> &read,
-                     const unsigned int Ls = 1, const bool rb = false)
+                     const ReaderFn<Field> &read, const unsigned int Ls = 1,
+                     const bool rb = false)
  {
    auto mpi = GridDefaultMpi();
    auto simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd());
-    std::shared_ptr<GridCartesian> gBasePt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi));
+    std::shared_ptr<GridCartesian> gBasePt(
+        SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi));
    std::shared_ptr<GridBase> gPt;

    makeGrid(gPt, gBasePt, Ls, rb);
--- a/Grid/Benchmark_ITT.cpp
+++ b/Grid/Benchmark_ITT.cpp
@@ -1,801 +0,0 @@
-/*
-Copyright © 2015 Peter Boyle <paboyle@ph.ed.ac.uk>
-Copyright © 2022 Antonin Portelli <antonin.portelli@me.com>
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include <Grid/Grid.h>
-
-using namespace Grid;
-
-std::vector<int> L_list;
-std::vector<int> Ls_list;
-std::vector<double> mflop_list;
-
-double mflop_ref;
-double mflop_ref_err;
-
-int NN_global;
-
-struct time_statistics
-{
-  double mean;
-  double err;
-  double min;
-  double max;
-
-  void statistics(std::vector<double> v)
-  {
-    double sum = std::accumulate(v.begin(), v.end(), 0.0);
-    mean = sum / v.size();
-
-    std::vector<double> diff(v.size());
-    std::transform(v.begin(), v.end(), diff.begin(), [=](double x)
-                   { return x - mean; });
-    double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
-    err = std::sqrt(sq_sum / (v.size() * (v.size() - 1)));
-
-    auto result = std::minmax_element(v.begin(), v.end());
-    min = *result.first;
-    max = *result.second;
-  }
-};
-
-void comms_header()
-{
-  std::cout << GridLogMessage << " L  "
-            << "\t"
-            << " Ls  "
-            << "\t"
-            << "bytes\t MB/s uni (err/min/max) \t\t MB/s bidi (err/min/max)" << std::endl;
-};
-
-Gamma::Algebra Gmu[] = {
-    Gamma::Algebra::GammaX,
-    Gamma::Algebra::GammaY,
-    Gamma::Algebra::GammaZ,
-    Gamma::Algebra::GammaT};
-struct controls
-{
-  int Opt;
-  int CommsOverlap;
-  Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch;
-};
-
-class Benchmark
-{
-public:
-  static void Decomposition(void)
-  {
-
-    int threads = GridThread::GetThreads();
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
-    std::cout << GridLogMessage << "= Grid is setup to use " << threads << " threads" << std::endl;
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
-    std::cout << GridLogMessage << "Grid Default Decomposition patterns\n";
-    std::cout << GridLogMessage << "\tOpenMP threads : " << GridThread::GetThreads() << std::endl;
-    std::cout << GridLogMessage << "\tMPI tasks      : " << GridCmdVectorIntToString(GridDefaultMpi()) << std::endl;
-    std::cout << GridLogMessage << "\tvReal          : " << sizeof(vReal) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vReal::Nsimd())) << std::endl;
-    std::cout << GridLogMessage << "\tvRealF         : " << sizeof(vRealF) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vRealF::Nsimd())) << std::endl;
-    std::cout << GridLogMessage << "\tvRealD         : " << sizeof(vRealD) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vRealD::Nsimd())) << std::endl;
-    std::cout << GridLogMessage << "\tvComplex       : " << sizeof(vComplex) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vComplex::Nsimd())) << std::endl;
-    std::cout << GridLogMessage << "\tvComplexF      : " << sizeof(vComplexF) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vComplexF::Nsimd())) << std::endl;
-    std::cout << GridLogMessage << "\tvComplexD      : " << sizeof(vComplexD) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vComplexD::Nsimd())) << std::endl;
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
-  }
-
-  static void Comms(void)
-  {
-    int Nloop = 200;
-    int nmu = 0;
-    int maxlat = 32;
-
-    Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
-    Coordinate mpi_layout = GridDefaultMpi();
-
-    for (int mu = 0; mu < Nd; mu++)
-      if (mpi_layout[mu] > 1)
-        nmu++;
-
-    std::vector<double> t_time(Nloop);
-    time_statistics timestat;
-
-    std::cout << GridLogMessage << "====================================================================================================" << std::endl;
-    std::cout << GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in " << nmu << " dimensions" << std::endl;
-    std::cout << GridLogMessage << "====================================================================================================" << std::endl;
-    comms_header();
-
-    for (int lat = 16; lat <= maxlat; lat += 8)
-    {
-      //      for(int Ls=8;Ls<=8;Ls*=2){
-      {
-        int Ls = 12;
-
-        Coordinate latt_size({lat * mpi_layout[0],
-                              lat * mpi_layout[1],
-                              lat * mpi_layout[2],
-                              lat * mpi_layout[3]});
-
-        GridCartesian Grid(latt_size, simd_layout, mpi_layout);
-        RealD Nrank = Grid._Nprocessors;
-        RealD Nnode = Grid.NodeCount();
-        RealD ppn = Nrank / Nnode;
-
-        std::vector<HalfSpinColourVectorD *> xbuf(8);
-        std::vector<HalfSpinColourVectorD *> rbuf(8);
-        // Grid.ShmBufferFreeAll();
-        uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD);
-        for (int d = 0; d < 8; d++)
-        {
-          xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
-          rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
-          //	  bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-          //	  bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-        }
-
-        //	int ncomm;
-        double dbytes;
-
-        for (int dir = 0; dir < 8; dir++)
-        {
-          int mu = dir % 4;
-          if (mpi_layout[mu] > 1)
-          {
-
-            std::vector<double> times(Nloop);
-            for (int i = 0; i < Nloop; i++)
-            {
-
-              dbytes = 0;
-              double start = usecond();
-              int xmit_to_rank;
-              int recv_from_rank;
-
-              if (dir == mu)
-              {
-                int comm_proc = 1;
-                Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
-              }
-              else
-              {
-                int comm_proc = mpi_layout[mu] - 1;
-                Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
-              }
-              Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
-                                  (void *)&rbuf[dir][0], recv_from_rank,
-                                  bytes);
-              dbytes += bytes;
-
-              double stop = usecond();
-              t_time[i] = stop - start; // microseconds
-            }
-            timestat.statistics(t_time);
-
-            dbytes = dbytes * ppn;
-            double xbytes = dbytes * 0.5;
-            double bidibytes = dbytes;
-
-            std::cout << GridLogMessage << lat << "\t" << Ls << "\t "
-                      << bytes << " \t "
-                      << xbytes / timestat.mean << " \t " << xbytes * timestat.err / (timestat.mean * timestat.mean) << " \t "
-                      << xbytes / timestat.max << " " << xbytes / timestat.min
-                      << "\t\t" << bidibytes / timestat.mean << "  " << bidibytes * timestat.err / (timestat.mean * timestat.mean) << " "
-                      << bidibytes / timestat.max << " " << bidibytes / timestat.min << std::endl;
-          }
-        }
-        for (int d = 0; d < 8; d++)
-        {
-          acceleratorFreeDevice(xbuf[d]);
-          acceleratorFreeDevice(rbuf[d]);
-        }
-      }
-    }
-    return;
-  }
-
-  static void Memory(void)
-  {
-    const int Nvec = 8;
-    typedef Lattice<iVector<vReal, Nvec>> LatticeVec;
-    typedef iVector<vReal, Nvec> Vec;
-
-    Coordinate simd_layout = GridDefaultSimd(Nd, vReal::Nsimd());
-    Coordinate mpi_layout = GridDefaultMpi();
-
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
-    std::cout << GridLogMessage << "= Benchmarking a*x + y bandwidth" << std::endl;
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
-    std::cout << GridLogMessage << "  L  "
-              << "\t\t"
-              << "bytes"
-              << "\t\t\t"
-              << "GB/s"
-              << "\t\t"
-              << "Gflop/s"
-              << "\t\t seconds"
-              << "\t\tGB/s / node" << std::endl;
-    std::cout << GridLogMessage << "----------------------------------------------------------" << std::endl;
-
-    //    uint64_t NP;
-    uint64_t NN;
-
-    uint64_t lmax = 32;
-#define NLOOP (1000 * lmax * lmax * lmax * lmax / lat / lat / lat / lat)
-
-    GridSerialRNG sRNG;
-    sRNG.SeedFixedIntegers(std::vector<int>({45, 12, 81, 9}));
-    for (int lat = 8; lat <= lmax; lat += 8)
-    {
-
-      Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2], lat * mpi_layout[3]});
-      int64_t vol = latt_size[0] * latt_size[1] * latt_size[2] * latt_size[3];
-
-      GridCartesian Grid(latt_size, simd_layout, mpi_layout);
-
-      //      NP= Grid.RankCount();
-      NN = Grid.NodeCount();
-
-      Vec rn;
-      random(sRNG, rn);
-
-      LatticeVec z(&Grid);
-      z = Zero();
-      LatticeVec x(&Grid);
-      x = Zero();
-      LatticeVec y(&Grid);
-      y = Zero();
-      double a = 2.0;
-
-      uint64_t Nloop = NLOOP;
-
-      double start = usecond();
-      for (int i = 0; i < Nloop; i++)
-      {
-        z = a * x - y;
-      }
-      double stop = usecond();
-      double time = (stop - start) / Nloop * 1000;
-
-      double flops = vol * Nvec * 2; // mul,add
-      double bytes = 3.0 * vol * Nvec * sizeof(Real);
-      std::cout << GridLogMessage << std::setprecision(3)
-                << lat << "\t\t" << bytes << "   \t\t" << bytes / time << "\t\t" << flops / time << "\t\t" << (stop - start) / 1000. / 1000.
-                << "\t\t" << bytes / time / NN << std::endl;
-    }
-  };
-
-  static void SU4(void)
-  {
-    const int Nc4 = 4;
-    typedef Lattice<iMatrix<vComplexF, Nc4>> LatticeSU4;
-
-    Coordinate simd_layout = GridDefaultSimd(Nd, vComplexF::Nsimd());
-    Coordinate mpi_layout = GridDefaultMpi();
-
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
-    std::cout << GridLogMessage << "= Benchmarking z = y*x SU(4) bandwidth" << std::endl;
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
-    std::cout << GridLogMessage << "  L  "
-              << "\t\t"
-              << "bytes"
-              << "\t\t\t"
-              << "GB/s"
-              << "\t\t"
-              << "Gflop/s"
-              << "\t\t seconds"
-              << "\t\tGB/s / node" << std::endl;
-    std::cout << GridLogMessage << "----------------------------------------------------------" << std::endl;
-
-    uint64_t NN;
-
-    uint64_t lmax = 32;
-
-    GridSerialRNG sRNG;
-    sRNG.SeedFixedIntegers(std::vector<int>({45, 12, 81, 9}));
-    for (int lat = 8; lat <= lmax; lat += 8)
-    {
-
-      Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2], lat * mpi_layout[3]});
-      int64_t vol = latt_size[0] * latt_size[1] * latt_size[2] * latt_size[3];
-
-      GridCartesian Grid(latt_size, simd_layout, mpi_layout);
-
-      NN = Grid.NodeCount();
-
-      LatticeSU4 z(&Grid);
-      z = Zero();
-      LatticeSU4 x(&Grid);
-      x = Zero();
-      LatticeSU4 y(&Grid);
-      y = Zero();
-      //      double a=2.0;
-
-      uint64_t Nloop = NLOOP;
-
-      double start = usecond();
-      for (int i = 0; i < Nloop; i++)
-      {
-        z = x * y;
-      }
-      double stop = usecond();
-      double time = (stop - start) / Nloop * 1000;
-
-      double flops = vol * Nc4 * Nc4 * (6 + (Nc4 - 1) * 8); // mul,add
-      double bytes = 3.0 * vol * Nc4 * Nc4 * 2 * sizeof(RealF);
-      std::cout << GridLogMessage << std::setprecision(3)
-                << lat << "\t\t" << bytes << "   \t\t" << bytes / time << "\t\t" << flops / time << "\t\t" << (stop - start) / 1000. / 1000.
-                << "\t\t" << bytes / time / NN << std::endl;
-    }
-  };
-
-  static double DWF(int Ls, int L)
-  {
-    RealD mass = 0.1;
-    RealD M5 = 1.8;
-
-    double mflops;
-    double mflops_best = 0;
-    double mflops_worst = 0;
-    std::vector<double> mflops_all;
-
-    ///////////////////////////////////////////////////////
-    // Set/Get the layout & grid size
-    ///////////////////////////////////////////////////////
-    int threads = GridThread::GetThreads();
-    Coordinate mpi = GridDefaultMpi();
-    assert(mpi.size() == 4);
-    Coordinate local({L, L, L, L});
-    Coordinate latt4({local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]});
-
-    GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4,
-                                                            GridDefaultSimd(Nd, vComplex::Nsimd()),
-                                                            GridDefaultMpi());
-    uint64_t NP = TmpGrid->RankCount();
-    uint64_t NN = TmpGrid->NodeCount();
-    NN_global = NN;
-    uint64_t SHM = NP / NN;
-
-    ///////// Welcome message ////////////
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
-    std::cout << GridLogMessage << "Benchmark DWF on " << L << "^4 local volume " << std::endl;
-    std::cout << GridLogMessage << "* Nc             : " << Nc << std::endl;
-    std::cout << GridLogMessage << "* Global volume  : " << GridCmdVectorIntToString(latt4) << std::endl;
-    std::cout << GridLogMessage << "* Ls             : " << Ls << std::endl;
-    std::cout << GridLogMessage << "* ranks          : " << NP << std::endl;
-    std::cout << GridLogMessage << "* nodes          : " << NN << std::endl;
-    std::cout << GridLogMessage << "* ranks/node     : " << SHM << std::endl;
-    std::cout << GridLogMessage << "* ranks geom     : " << GridCmdVectorIntToString(mpi) << std::endl;
-    std::cout << GridLogMessage << "* Using " << threads << " threads" << std::endl;
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
-
-    ///////// Lattice Init ////////////
-    GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
-    GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
-    GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
-    GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
-
-    ///////// RNG Init ////////////
-    std::vector<int> seeds4({1, 2, 3, 4});
-    std::vector<int> seeds5({5, 6, 7, 8});
-    GridParallelRNG RNG4(UGrid);
-    RNG4.SeedFixedIntegers(seeds4);
-    GridParallelRNG RNG5(FGrid);
-    RNG5.SeedFixedIntegers(seeds5);
-    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
-
-    typedef DomainWallFermionF Action;
-    typedef typename Action::FermionField Fermion;
-    typedef LatticeGaugeFieldF Gauge;
-
-    ///////// Source preparation ////////////
-    Gauge Umu(UGrid);
-    SU<Nc>::HotConfiguration(RNG4, Umu);
-    Fermion src(FGrid);
-    random(RNG5, src);
-    Fermion src_e(FrbGrid);
-    Fermion src_o(FrbGrid);
-    Fermion r_e(FrbGrid);
-    Fermion r_o(FrbGrid);
-    Fermion r_eo(FGrid);
-    Action Dw(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5);
-
-    {
-
-      pickCheckerboard(Even, src_e, src);
-      pickCheckerboard(Odd, src_o, src);
-
-      const int num_cases = 4;
-      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
-
-      controls Cases[] = {
-          {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicyConcurrent},
-          {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicyConcurrent},
-          {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicySequential},
-          {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicySequential}};
-
-      for (int c = 0; c < num_cases; c++)
-      {
-
-        WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
-        WilsonKernelsStatic::Opt = Cases[c].Opt;
-        CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
-
-        std::cout << GridLogMessage << "==================================================================================" << std::endl;
-        if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric)
-          std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl;
-        if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
-          std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl;
-        if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
-          std::cout << GridLogMessage << "* Using sequential Comms/Compute" << std::endl;
-        std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
-        std::cout << GridLogMessage << "==================================================================================" << std::endl;
-
-        int nwarm = 10;
-        double t0 = usecond();
-        FGrid->Barrier();
-        for (int i = 0; i < nwarm; i++)
-        {
-          Dw.DhopEO(src_o, r_e, DaggerNo);
-        }
-        FGrid->Barrier();
-        double t1 = usecond();
-        uint64_t ncall = 500;
-
-        FGrid->Broadcast(0, &ncall, sizeof(ncall));
-
-        //	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
-        Dw.ZeroCounters();
-
-        time_statistics timestat;
-        std::vector<double> t_time(ncall);
-        for (uint64_t i = 0; i < ncall; i++)
-        {
-          t0 = usecond();
-          Dw.DhopEO(src_o, r_e, DaggerNo);
-          t1 = usecond();
-          t_time[i] = t1 - t0;
-        }
-        FGrid->Barrier();
-
-        double volume = Ls;
-        for (int mu = 0; mu < Nd; mu++)
-          volume = volume * latt4[mu];
-
-          // Nc=3 gives
-          // 1344= 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8
-          // 1344 = Nc* (6+(Nc-1)*8)*2*Nd + Nd*Nc*2*2  + Nd*Nc*Ns*2
-          //	double flops=(1344.0*volume)/2;
-#if 0
-	double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + Nd*Nc*Ns  + Nd*Nc*Ns*2;
-#else
-        double fps = Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2;
-#endif
-        double flops = (fps * volume) / 2;
-        double mf_hi, mf_lo, mf_err;
-
-        timestat.statistics(t_time);
-        mf_hi = flops / timestat.min;
-        mf_lo = flops / timestat.max;
-        mf_err = flops / timestat.min * timestat.err / timestat.mean;
-
-        mflops = flops / timestat.mean;
-        mflops_all.push_back(mflops);
-        if (mflops_best == 0)
-          mflops_best = mflops;
-        if (mflops_worst == 0)
-          mflops_worst = mflops;
-        if (mflops > mflops_best)
-          mflops_best = mflops;
-        if (mflops < mflops_worst)
-          mflops_worst = mflops;
-
-        std::cout << GridLogMessage << "Deo FlopsPerSite is " << fps << std::endl;
-        std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s =   " << mflops << " (" << mf_err << ") " << mf_lo << "-" << mf_hi << std::endl;
-        std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s per rank   " << mflops / NP << std::endl;
-        std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s per node   " << mflops / NN << std::endl;
-      }
-
-      std::cout << GridLogMessage << "==================================================================================" << std::endl;
-      std::cout << GridLogMessage << L << "^4 x " << Ls << " Deo Best  mflop/s        =   " << mflops_best << " ; " << mflops_best / NN << " per node " << std::endl;
-      std::cout << GridLogMessage << L << "^4 x " << Ls << " Deo Worst mflop/s        =   " << mflops_worst << " ; " << mflops_worst / NN << " per node " << std::endl;
-      std::cout << GridLogMessage << fmt << std::endl;
-      std::cout << GridLogMessage;
-
-      for (int i = 0; i < mflops_all.size(); i++)
-      {
-        std::cout << mflops_all[i] / NN << " ; ";
-      }
-      std::cout << std::endl;
-      std::cout << GridLogMessage << "==================================================================================" << std::endl;
-    }
-    return mflops_best;
-  }
-
-  static double Staggered(int L)
-  {
-    double mflops;
-    double mflops_best = 0;
-    double mflops_worst = 0;
-    std::vector<double> mflops_all;
-
-    ///////////////////////////////////////////////////////
-    // Set/Get the layout & grid size
-    ///////////////////////////////////////////////////////
-    int threads = GridThread::GetThreads();
-    Coordinate mpi = GridDefaultMpi();
-    assert(mpi.size() == 4);
-    Coordinate local({L, L, L, L});
-    Coordinate latt4({local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]});
-
-    GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4,
-                                                            GridDefaultSimd(Nd, vComplex::Nsimd()),
-                                                            GridDefaultMpi());
-    uint64_t NP = TmpGrid->RankCount();
-    uint64_t NN = TmpGrid->NodeCount();
-    NN_global = NN;
-    uint64_t SHM = NP / NN;
-
-    ///////// Welcome message ////////////
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
-    std::cout << GridLogMessage << "Benchmark ImprovedStaggered on " << L << "^4 local volume " << std::endl;
-    std::cout << GridLogMessage << "* Global volume  : " << GridCmdVectorIntToString(latt4) << std::endl;
-    std::cout << GridLogMessage << "* ranks          : " << NP << std::endl;
-    std::cout << GridLogMessage << "* nodes          : " << NN << std::endl;
-    std::cout << GridLogMessage << "* ranks/node     : " << SHM << std::endl;
-    std::cout << GridLogMessage << "* ranks geom     : " << GridCmdVectorIntToString(mpi) << std::endl;
-    std::cout << GridLogMessage << "* Using " << threads << " threads" << std::endl;
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
-
-    ///////// Lattice Init ////////////
-    GridCartesian *FGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
-    GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
-
-    ///////// RNG Init ////////////
-    std::vector<int> seeds4({1, 2, 3, 4});
-    GridParallelRNG RNG4(FGrid);
-    RNG4.SeedFixedIntegers(seeds4);
-    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
-
-    RealD mass = 0.1;
-    RealD c1 = 9.0 / 8.0;
-    RealD c2 = -1.0 / 24.0;
-    RealD u0 = 1.0;
-
-    typedef ImprovedStaggeredFermionF Action;
-    typedef typename Action::FermionField Fermion;
-    typedef LatticeGaugeFieldF Gauge;
-
-    Gauge Umu(FGrid);
-    SU<Nc>::HotConfiguration(RNG4, Umu);
-
-    typename Action::ImplParams params;
-    Action Ds(Umu, Umu, *FGrid, *FrbGrid, mass, c1, c2, u0, params);
-
-    ///////// Source preparation ////////////
-    Fermion src(FGrid);
-    random(RNG4, src);
-    Fermion src_e(FrbGrid);
-    Fermion src_o(FrbGrid);
-    Fermion r_e(FrbGrid);
-    Fermion r_o(FrbGrid);
-    Fermion r_eo(FGrid);
-
-    {
-
-      pickCheckerboard(Even, src_e, src);
-      pickCheckerboard(Odd, src_o, src);
-
-      const int num_cases = 4;
-      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
-
-      controls Cases[] = {
-          {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicyConcurrent},
-          {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicyConcurrent},
-          {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicySequential},
-          {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicySequential}};
-
-      for (int c = 0; c < num_cases; c++)
-      {
-
-        StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap;
-        StaggeredKernelsStatic::Opt = Cases[c].Opt;
-        CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
-
-        std::cout << GridLogMessage << "==================================================================================" << std::endl;
-        if (StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric)
-          std::cout << GridLogMessage << "* Using GENERIC Nc StaggeredKernels" << std::endl;
-        if (StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute)
-          std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl;
-        if (StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsThenCompute)
-          std::cout << GridLogMessage << "* Using sequential Comms/Compute" << std::endl;
-        std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
-        std::cout << GridLogMessage << "==================================================================================" << std::endl;
-
-        int nwarm = 10;
-        double t0 = usecond();
-        FGrid->Barrier();
-        for (int i = 0; i < nwarm; i++)
-        {
-          Ds.DhopEO(src_o, r_e, DaggerNo);
-        }
-        FGrid->Barrier();
-        double t1 = usecond();
-        uint64_t ncall = 500;
-
-        FGrid->Broadcast(0, &ncall, sizeof(ncall));
-
-        //	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
-        Ds.ZeroCounters();
-
-        time_statistics timestat;
-        std::vector<double> t_time(ncall);
-        for (uint64_t i = 0; i < ncall; i++)
-        {
-          t0 = usecond();
-          Ds.DhopEO(src_o, r_e, DaggerNo);
-          t1 = usecond();
-          t_time[i] = t1 - t0;
-        }
-        FGrid->Barrier();
-
-        double volume = 1;
-        for (int mu = 0; mu < Nd; mu++)
-          volume = volume * latt4[mu];
-        double flops = (1146.0 * volume) / 2;
-        double mf_hi, mf_lo, mf_err;
-
-        timestat.statistics(t_time);
-        mf_hi = flops / timestat.min;
-        mf_lo = flops / timestat.max;
-        mf_err = flops / timestat.min * timestat.err / timestat.mean;
-
-        mflops = flops / timestat.mean;
-        mflops_all.push_back(mflops);
-        if (mflops_best == 0)
-          mflops_best = mflops;
-        if (mflops_worst == 0)
-          mflops_worst = mflops;
-        if (mflops > mflops_best)
-          mflops_best = mflops;
-        if (mflops < mflops_worst)
-          mflops_worst = mflops;
-
-        std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s =   " << mflops << " (" << mf_err << ") " << mf_lo << "-" << mf_hi << std::endl;
-        std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s per rank   " << mflops / NP << std::endl;
-        std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s per node   " << mflops / NN << std::endl;
-      }
-
-      std::cout << GridLogMessage << "==================================================================================" << std::endl;
-      std::cout << GridLogMessage << L << "^4  Deo Best  mflop/s        =   " << mflops_best << " ; " << mflops_best / NN << " per node " << std::endl;
-      std::cout << GridLogMessage << L << "^4  Deo Worst mflop/s        =   " << mflops_worst << " ; " << mflops_worst / NN << " per node " << std::endl;
-      std::cout << GridLogMessage << fmt << std::endl;
-      std::cout << GridLogMessage;
-
-      for (int i = 0; i < mflops_all.size(); i++)
-      {
-        std::cout << mflops_all[i] / NN << " ; ";
-      }
-      std::cout << std::endl;
-    }
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
-    return mflops_best;
-  }
-};
-
-int main(int argc, char **argv)
-{
-  Grid_init(&argc, &argv);
-
-  CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
-#ifdef KNL
-  LebesgueOrder::Block = std::vector<int>({8, 2, 2, 2});
-#else
-  LebesgueOrder::Block = std::vector<int>({2, 2, 2, 2});
-#endif
-  Benchmark::Decomposition();
-
-  int do_su4 = 1;
-  int do_memory = 1;
-  int do_comms = 1;
-
-  int sel = 4;
-  std::vector<int> L_list({8, 12, 16, 24, 32});
-  int selm1 = sel - 1;
-
-  std::vector<double> wilson;
-  std::vector<double> dwf4;
-  std::vector<double> staggered;
-
-  int Ls = 1;
-  std::cout << GridLogMessage << "==================================================================================" << std::endl;
-  std::cout << GridLogMessage << " Wilson dslash 4D vectorised" << std::endl;
-  std::cout << GridLogMessage << "==================================================================================" << std::endl;
-  for (int l = 0; l < L_list.size(); l++)
-  {
-    wilson.push_back(Benchmark::DWF(Ls, L_list[l]));
-  }
-
-  Ls = 12;
-  std::cout << GridLogMessage << "==================================================================================" << std::endl;
-  std::cout << GridLogMessage << " Domain wall dslash 4D vectorised" << std::endl;
-  std::cout << GridLogMessage << "==================================================================================" << std::endl;
-  for (int l = 0; l < L_list.size(); l++)
-  {
-    double result = Benchmark::DWF(Ls, L_list[l]);
-    dwf4.push_back(result);
-  }
-
-  std::cout << GridLogMessage << "==================================================================================" << std::endl;
-  std::cout << GridLogMessage << " Improved Staggered dslash 4D vectorised" << std::endl;
-  std::cout << GridLogMessage << "==================================================================================" << std::endl;
-  for (int l = 0; l < L_list.size(); l++)
-  {
-    double result = Benchmark::Staggered(L_list[l]);
-    staggered.push_back(result);
-  }
-
-  std::cout << GridLogMessage << "==================================================================================" << std::endl;
-  std::cout << GridLogMessage << " Summary table Ls=" << Ls << std::endl;
-  std::cout << GridLogMessage << "==================================================================================" << std::endl;
-  std::cout << GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\t Staggered" << std::endl;
-  for (int l = 0; l < L_list.size(); l++)
-  {
-    std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] << " \t\t " << dwf4[l] << " \t\t " << staggered[l] << std::endl;
-  }
-  std::cout << GridLogMessage << "==================================================================================" << std::endl;
-
-  int NN = NN_global;
-  if (do_memory)
-  {
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
-    std::cout << GridLogMessage << " Memory benchmark " << std::endl;
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
-    Benchmark::Memory();
-  }
-
-  if (do_su4)
-  {
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
-    std::cout << GridLogMessage << " SU(4) benchmark " << std::endl;
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
-    Benchmark::SU4();
-  }
-
-  if (do_comms)
-  {
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
-    std::cout << GridLogMessage << " Communications benchmark " << std::endl;
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
-    Benchmark::Comms();
-  }
-
-  std::cout << GridLogMessage << "==================================================================================" << std::endl;
-  std::cout << GridLogMessage << " Per Node Summary table Ls=" << Ls << std::endl;
-  std::cout << GridLogMessage << "==================================================================================" << std::endl;
-  std::cout << GridLogMessage << " L \t\t Wilson\t\t DWF4\t\t Staggered " << std::endl;
-  for (int l = 0; l < L_list.size(); l++)
-  {
-    std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] / NN << " \t " << dwf4[l] / NN << " \t " << staggered[l] / NN << std::endl;
-  }
-  std::cout << GridLogMessage << "==================================================================================" << std::endl;
-
-  std::cout << GridLogMessage << "==================================================================================" << std::endl;
-  std::cout << GridLogMessage << " Comparison point     result: " << 0.5 * (dwf4[sel] + dwf4[selm1]) / NN << " Mflop/s per node" << std::endl;
-  std::cout << GridLogMessage << " Comparison point is 0.5*(" << dwf4[sel] / NN << "+" << dwf4[selm1] / NN << ") " << std::endl;
-  std::cout << std::setprecision(3);
-  std::cout << GridLogMessage << "==================================================================================" << std::endl;
-
-  Grid_finalize();
-}
--- a/Grid/Benchmark_comms_host_device.cpp
+++ b/Grid/Benchmark_comms_host_device.cpp
@@ -21,231 +21,245 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 using namespace std;
 using namespace Grid;

-struct time_statistics{
+struct time_statistics // mean, error, minimum and maximum of a sample of doubles
+{
  double mean;
  double err;
  double min;
  double max;

-  void statistics(std::vector<double> v){
-      double sum = std::accumulate(v.begin(), v.end(), 0.0);
-      mean = sum / v.size();
+  void statistics(std::vector<double> v) // fills all four members from v; v is copied (by value)
+  {
+    double sum = std::accumulate(v.begin(), v.end(), 0.0);
+    mean = sum / v.size(); // NOTE(review): divides by zero when v is empty

-      std::vector<double> diff(v.size());
-      std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
-      double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
-      err = std::sqrt(sq_sum / (v.size()*(v.size() - 1)));
+    std::vector<double> diff(v.size()); // deviations of each sample from the mean
+    std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
+    double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0); // sum of squared deviations
+    err = std::sqrt(sq_sum / (v.size() * (v.size() - 1))); // NOTE(review): denominator is 0 when v.size() < 2

-      auto result = std::minmax_element(v.begin(), v.end());
-      min = *result.first;
-      max = *result.second;
-}
+    auto result = std::minmax_element(v.begin(), v.end()); // pair of iterators (smallest, largest)
+    min = *result.first; // NOTE(review): both iterators equal v.end() when v is empty
+    max = *result.second;
+  }
 };

-void header(){
-  std::cout <<GridLogMessage << " L  "<<"\t"<<" Ls  "<<"\t"
-            <<std::setw(11)<<"bytes\t\t"<<"MB/s uni"<<"\t"<<"MB/s bidi"<<std::endl;
-};
-
-int main (int argc, char ** argv)
+void header() // prints one line of column titles: L, Ls, bytes, MB/s uni, MB/s bidi
 {
-  Grid_init(&argc,&argv);
+  std::cout << GridLogMessage << " L  "
+            << "\t"
+            << " Ls  "
+            << "\t" << std::setw(11) << "bytes\t\t" // setw(11) applies to the "bytes\t\t" literal
+            << "MB/s uni"
+            << "\t"
+            << "MB/s bidi" << std::endl;
+}; // NOTE(review): the ';' after the function body is an empty declaration

-  Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
-  Coordinate mpi_layout  = GridDefaultMpi();
+int main(int argc, char **argv) // times SendToRecvFrom exchanges, first from std::vector buffers, then from device allocations
+{
+  Grid_init(&argc, &argv);
+
+  Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
+  Coordinate mpi_layout = GridDefaultMpi();
  int threads = GridThread::GetThreads();
-  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads"
+            << std::endl;

-  int Nloop=250;
-  int nmu=0;
-  int maxlat=32;
-  for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
+  int Nloop = 250; // exchanges timed per (lat, mu) measurement
+  int nmu = 0; // counts dimensions with mpi_layout[mu] > 1; not read again in main
+  int maxlat = 32; // largest lat value benchmarked
+  for (int mu = 0; mu < Nd; mu++)
+    if (mpi_layout[mu] > 1)
+      nmu++;

-  std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
+  std::cout << GridLogMessage << "Number of iterations to average: " << Nloop
+            << std::endl;
  std::vector<double> t_time(Nloop);
  //  time_statistics timestat;

-  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "= Benchmarking sequential halo exchange from host memory "<<std::endl;
-  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout << GridLogMessage
+            << "========================================================================="
+               "==========================="
+            << std::endl;
+  std::cout << GridLogMessage
+            << "= Benchmarking sequential halo exchange from host memory " << std::endl;
+  std::cout << GridLogMessage
+            << "========================================================================="
+               "==========================="
+            << std::endl;
  header();

-  for(int lat=8;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=8;Ls*=2){
+  for (int lat = 8; lat <= maxlat; lat += 4)
+  {
+    for (int Ls = 8; Ls <= 8; Ls *= 2) // runs exactly once, with Ls = 8
+    {

-      Coordinate latt_size  ({lat*mpi_layout[0],
-	                      lat*mpi_layout[1],
-      			      lat*mpi_layout[2],
-      			      lat*mpi_layout[3]});
+      Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
+                            lat * mpi_layout[3]}); // lat scaled by mpi_layout in each of the 4 dimensions

-      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+      GridCartesian Grid(latt_size, simd_layout, mpi_layout);
      RealD Nrank = Grid._Nprocessors;
      RealD Nnode = Grid.NodeCount();
-      RealD ppn = Nrank/Nnode;
+      RealD ppn = Nrank / Nnode; // processors per node

-      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8);
-      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8);
+      std::vector<std::vector<HalfSpinColourVectorD>> xbuf(8); // send buffers
+      std::vector<std::vector<HalfSpinColourVectorD>> rbuf(8); // receive buffers

-      for(int mu=0;mu<8;mu++){
-	xbuf[mu].resize(lat*lat*lat*Ls);
-	rbuf[mu].resize(lat*lat*lat*Ls);
+      for (int mu = 0; mu < 8; mu++) // slots mu and mu + 4 serve the two exchanges of dimension mu
+      {
+        xbuf[mu].resize(lat * lat * lat * Ls);
+        rbuf[mu].resize(lat * lat * lat * Ls);
      }
-      uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+      uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD); // size of one buffer

      int ncomm;

-      for(int mu=0;mu<4;mu++){
-	if (mpi_layout[mu]>1 ) {
-	double start=usecond();
-	for(int i=0;i<Nloop;i++){
+      for (int mu = 0; mu < 4; mu++)
+      {
+        if (mpi_layout[mu] > 1) // dimensions with mpi_layout[mu] == 1 are neither timed nor reported
+        {
+          double start = usecond();
+          for (int i = 0; i < Nloop; i++)
+          {

-	  ncomm=0;
-	
-	  
-	    ncomm++;
-	    int comm_proc=1;
-	    int xmit_to_rank;
-	    int recv_from_rank;
-	    
-	    {
-	      std::vector<CommsRequest_t> requests;
-	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	      Grid.SendToRecvFrom((void *)&xbuf[mu][0],
-				  xmit_to_rank,
-				  (void *)&rbuf[mu][0],
-				  recv_from_rank,
-				  bytes);
-	    }
+            ncomm = 0; // reset on every iteration, so ncomm is 1 after the loop

-	    comm_proc = mpi_layout[mu]-1;
-	    {
-	      std::vector<CommsRequest_t> requests;
-	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	      Grid.SendToRecvFrom((void *)&xbuf[mu+4][0],
-				  xmit_to_rank,
-				  (void *)&rbuf[mu+4][0],
-				  recv_from_rank,
-				  bytes);
-	    }
-	}
-	Grid.Barrier();
-	double stop=usecond();
-        double mean=(stop-start)/Nloop;      
-      double dbytes    = bytes*ppn;
-      double xbytes    = dbytes*2.0*ncomm;
-      double rbytes    = xbytes;
-      double bidibytes = xbytes+rbytes;
+            ncomm++;
+            int comm_proc = 1; // first exchange: shift of 1 along mu
+            int xmit_to_rank;
+            int recv_from_rank;

-      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
-               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)<<" "
-               <<std::right<< xbytes/mean<<"  "
-               << "\t\t"<<std::setw(7)<< bidibytes/mean<< std::endl;
+            {
+              std::vector<CommsRequest_t> requests; // declared but not used in this scope
+              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
+              Grid.SendToRecvFrom((void *)&xbuf[mu][0], xmit_to_rank,
+                                  (void *)&rbuf[mu][0], recv_from_rank, bytes);
+            }

+            comm_proc = mpi_layout[mu] - 1; // second exchange: shift of mpi_layout[mu] - 1 along mu
+            {
+              std::vector<CommsRequest_t> requests; // declared but not used in this scope
+              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
+              Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0], xmit_to_rank,
+                                  (void *)&rbuf[mu + 4][0], recv_from_rank, bytes);
+            }
+          }
+          Grid.Barrier();
+          double stop = usecond();
+          double mean = (stop - start) / Nloop; // elapsed usecond() time per loop iteration
+          double dbytes = bytes * ppn; // one buffer's bytes times processors per node
+          double xbytes = dbytes * 2.0 * ncomm; // 2.0 presumably counts the two exchanges per iteration - confirm
+          double rbytes = xbytes;
+          double bidibytes = xbytes + rbytes; // sent plus received

-	
-	}
+          std::cout << GridLogMessage << std::setw(4) << lat << "\t" << Ls << "\t"
+                    << std::setw(11) << bytes << std::fixed << std::setprecision(1)
+                    << std::setw(7) << " " << std::right << xbytes / mean << "  "
+                    << "\t\t" << std::setw(7) << bidibytes / mean << std::endl;
+        }
      }
-
-
-      
    }
  }

-  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "= Benchmarking sequential halo exchange from GPU memory "<<std::endl;
-  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout << GridLogMessage
+            << "========================================================================="
+               "==========================="
+            << std::endl;
+  std::cout << GridLogMessage
+            << "= Benchmarking sequential halo exchange from GPU memory " << std::endl;
+  std::cout << GridLogMessage
+            << "========================================================================="
+               "==========================="
+            << std::endl;
  header();

-  for(int lat=8;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=8;Ls*=2){
+  for (int lat = 8; lat <= maxlat; lat += 4)
+  {
+    for (int Ls = 8; Ls <= 8; Ls *= 2) // runs exactly once, with Ls = 8
+    {

-      Coordinate latt_size  ({lat*mpi_layout[0],
-	                      lat*mpi_layout[1],
-      			      lat*mpi_layout[2],
-      			      lat*mpi_layout[3]});
+      Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
+                            lat * mpi_layout[3]}); // lat scaled by mpi_layout in each of the 4 dimensions

-      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+      GridCartesian Grid(latt_size, simd_layout, mpi_layout);
      RealD Nrank = Grid._Nprocessors;
      RealD Nnode = Grid.NodeCount();
-      RealD ppn = Nrank/Nnode;
-
+      RealD ppn = Nrank / Nnode; // processors per node

      std::vector<HalfSpinColourVectorD *> xbuf(8);
      std::vector<HalfSpinColourVectorD *> rbuf(8);

-      uint64_t bytes = lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
-      for(int d=0;d<8;d++){
-	xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
-	rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
+      uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD); // size of one buffer
+      for (int d = 0; d < 8; d++)
+      {
+        xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); // freed at the end of this Ls iteration
+        rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
      }

      int ncomm;

-      for(int mu=0;mu<4;mu++){
-	if (mpi_layout[mu]>1 ) {
-	double start=usecond();
-	for(int i=0;i<Nloop;i++){
+      for (int mu = 0; mu < 4; mu++)
+      {
+        if (mpi_layout[mu] > 1) // dimensions with mpi_layout[mu] == 1 are neither timed nor reported
+        {
+          double start = usecond();
+          for (int i = 0; i < Nloop; i++)
+          {

-	  ncomm=0;
-	
-	  
-	    ncomm++;
-	    int comm_proc=1;
-	    int xmit_to_rank;
-	    int recv_from_rank;
-	    
-	    {
-	      std::vector<CommsRequest_t> requests;
-	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	      Grid.SendToRecvFrom((void *)&xbuf[mu][0],
-				  xmit_to_rank,
-				  (void *)&rbuf[mu][0],
-				  recv_from_rank,
-				  bytes);
-	    }
+            ncomm = 0; // reset on every iteration, so ncomm is 1 after the loop

-	    comm_proc = mpi_layout[mu]-1;
-	    {
-	      std::vector<CommsRequest_t> requests;
-	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	      Grid.SendToRecvFrom((void *)&xbuf[mu+4][0],
-				  xmit_to_rank,
-				  (void *)&rbuf[mu+4][0],
-				  recv_from_rank,
-				  bytes);
-	    }
-	}
-	Grid.Barrier();
-	double stop=usecond();
-        double mean=(stop-start)/Nloop;      
-      double dbytes    = bytes*ppn;
-      double xbytes    = dbytes*2.0*ncomm;
-      double rbytes    = xbytes;
-      double bidibytes = xbytes+rbytes;
+            ncomm++;
+            int comm_proc = 1; // first exchange: shift of 1 along mu
+            int xmit_to_rank;
+            int recv_from_rank;

-      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
-               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)<<" "
-               <<std::right<< xbytes/mean<<"  "
-               << "\t\t"<<std::setw(7)<< bidibytes/mean<< std::endl;
+            {
+              std::vector<CommsRequest_t> requests; // declared but not used in this scope
+              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
+              Grid.SendToRecvFrom((void *)&xbuf[mu][0], xmit_to_rank, // &xbuf[mu][0] is the pointer xbuf[mu] itself
+                                  (void *)&rbuf[mu][0], recv_from_rank, bytes);
+            }

+            comm_proc = mpi_layout[mu] - 1; // second exchange: shift of mpi_layout[mu] - 1 along mu
+            {
+              std::vector<CommsRequest_t> requests; // declared but not used in this scope
+              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
+              Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0], xmit_to_rank,
+                                  (void *)&rbuf[mu + 4][0], recv_from_rank, bytes);
+            }
+          }
+          Grid.Barrier();
+          double stop = usecond();
+          double mean = (stop - start) / Nloop; // elapsed usecond() time per loop iteration
+          double dbytes = bytes * ppn; // one buffer's bytes times processors per node
+          double xbytes = dbytes * 2.0 * ncomm; // 2.0 presumably counts the two exchanges per iteration - confirm
+          double rbytes = xbytes;
+          double bidibytes = xbytes + rbytes; // sent plus received

-	
-	}
+          std::cout << GridLogMessage << std::setw(4) << lat << "\t" << Ls << "\t"
+                    << std::setw(11) << bytes << std::fixed << std::setprecision(1)
+                    << std::setw(7) << " " << std::right << xbytes / mean << "  "
+                    << "\t\t" << std::setw(7) << bidibytes / mean << std::endl;
+        }
      }

-      for(int d=0;d<8;d++){
-	acceleratorFreeDevice(xbuf[d]);
-	acceleratorFreeDevice(rbuf[d]);
+      for (int d = 0; d < 8; d++) // release every buffer allocated for this (lat, Ls) pass
+      {
+        acceleratorFreeDevice(xbuf[d]);
+        acceleratorFreeDevice(rbuf[d]);
      }
-
-      
    }
  }

-
-  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl;
-  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout << GridLogMessage
+            << "========================================================================="
+               "==========================="
+            << std::endl;
+  std::cout << GridLogMessage << "= All done; Bye Bye" << std::endl;
+  std::cout << GridLogMessage
+            << "========================================================================="
+               "==========================="
+            << std::endl;

  Grid_finalize();
 }
--- a/Grid/Benchmark_dwf_fp32.cpp
+++ b/Grid/Benchmark_dwf_fp32.cpp
@@ -1,6 +1,7 @@
 /*
 Copyright © 2015 Peter Boyle <paboyle@ph.ed.ac.uk>
 Copyright © 2022 Antonin Portelli <antonin.portelli@me.com>
+Copyright © 2023 Simon Bürger <simon.buerger@rwth-aachen.de>

 This program is free software; you can redistribute it and/or
 modify it under the terms of the GNU General Public License
@@ -16,6 +17,7 @@ You should have received a copy of the GNU General Public License
 along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

+#include "json.hpp"
 #include <Grid/Grid.h>
 #ifdef GRID_CUDA
 #define CUDA_PROFILE
@@ -28,17 +30,13 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 using namespace std;
 using namespace Grid;

-template <class d>
-struct scal
+template <class d> struct scal
 {
  d internal;
 };

-Gamma::Algebra Gmu[] = {
-    Gamma::Algebra::GammaX,
-    Gamma::Algebra::GammaY,
-    Gamma::Algebra::GammaZ,
-    Gamma::Algebra::GammaT};
+Gamma::Algebra Gmu[] = {Gamma::Algebra::GammaX, Gamma::Algebra::GammaY,
+                        Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT};

 int main(int argc, char **argv)
 {
@@ -48,24 +46,41 @@ int main(int argc, char **argv)

  Coordinate latt4 = GridDefaultLatt();
  int Ls = 16;
+  std::string json_filename = ""; // empty indicates no json output
+  nlohmann::json json;
+
+  // benchmark specific command line arguments
  for (int i = 0; i < argc; i++)
+  {
    if (std::string(argv[i]) == "-Ls")
    {
      std::stringstream ss(argv[i + 1]);
      ss >> Ls;
    }
+    if (i + 1 < argc && std::string(argv[i]) == "--json-out")
+      json_filename = argv[i + 1]; // guarded: a trailing flag has no value to read
+  }

  GridLogLayout();

  long unsigned int single_site_flops = 8 * Nc * (7 + 16 * Nc);

-  GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
+  json["single_site_flops"] = single_site_flops;
+
+  GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
+      GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+
  GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
  GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);

+  json["grid"] = FGrid->FullDimensions().toVector();
+  json["local_grid"] = FGrid->LocalDimensions().toVector();
+
  std::cout << GridLogMessage << "Making s innermost grids" << std::endl;
-  GridCartesian *sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(), GridDefaultMpi());
+  GridCartesian *sUGrid =
+      SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(), GridDefaultMpi());
+
  GridRedBlackCartesian *sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
  GridCartesian *sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls, UGrid);
  GridRedBlackCartesian *sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls, UGrid);
@@ -177,13 +192,28 @@ int main(int argc, char **argv)
  RealD NP = UGrid->_Nprocessors;
  RealD NN = UGrid->NodeCount();

-  std::cout << GridLogMessage << "*****************************************************************" << std::endl;
-  std::cout << GridLogMessage << "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" << std::endl;
-  std::cout << GridLogMessage << "*****************************************************************" << std::endl;
-  std::cout << GridLogMessage << "*****************************************************************" << std::endl;
-  std::cout << GridLogMessage << "* Benchmarking DomainWallFermionR::Dhop                  " << std::endl;
-  std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd() << std::endl;
-  std::cout << GridLogMessage << "* VComplexF size is " << sizeof(vComplexF) << " B" << std::endl;
+  json["ranks"] = NP;
+  json["nodes"] = NN;
+
+  std::cout << GridLogMessage
+            << "*****************************************************************"
+            << std::endl;
+  std::cout << GridLogMessage
+            << "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm"
+            << std::endl;
+  std::cout << GridLogMessage
+            << "*****************************************************************"
+            << std::endl;
+  std::cout << GridLogMessage
+            << "*****************************************************************"
+            << std::endl;
+  std::cout << GridLogMessage
+            << "* Benchmarking DomainWallFermionR::Dhop                  " << std::endl;
+  std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd()
+            << std::endl;
+  std::cout << GridLogMessage << "* VComplexF size is " << sizeof(vComplexF) << " B"
+            << std::endl;
+
  if (sizeof(RealF) == 4)
    std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
  if (sizeof(RealF) == 8)
@@ -199,8 +229,11 @@ int main(int argc, char **argv)
  if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll)
    std::cout << GridLogMessage << "* Using Nc=3       WilsonKernels" << std::endl;
  if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm)
+
    std::cout << GridLogMessage << "* Using Asm Nc=3   WilsonKernels" << std::endl;
-  std::cout << GridLogMessage << "*****************************************************************" << std::endl;
+  std::cout << GridLogMessage
+            << "*****************************************************************"
+            << std::endl;

  DomainWallFermionF Dw(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5);
  int ncall = 300;
@@ -230,19 +263,38 @@ int main(int argc, char **argv)
    auto simdwidth = sizeof(vComplex);

    // RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
-    double data_rf = volume * ((2 * Nd + 1) * Nd * Nc + 2 * Nd * Nc * Nc) * simdwidth / nsimd * ncall / (1024. * 1024. * 1024.);
+    double data_rf = volume * ((2 * Nd + 1) * Nd * Nc + 2 * Nd * Nc * Nc) * simdwidth /
+                     nsimd * ncall / (1024. * 1024. * 1024.);

    // mem: Nd Wilson * Ls, Nd gauge, Nc colors
-    double data_mem = (volume * (2 * Nd + 1) * Nd * Nc + (volume / Ls) * 2 * Nd * Nc * Nc) * simdwidth / nsimd * ncall / (1024. * 1024. * 1024.);
+    double data_mem =
+        (volume * (2 * Nd + 1) * Nd * Nc + (volume / Ls) * 2 * Nd * Nc * Nc) * simdwidth /
+        nsimd * ncall / (1024. * 1024. * 1024.);

-    std::cout << GridLogMessage << "Called Dw " << ncall << " times in " << t1 - t0 << " us" << std::endl;
+    json["Dw"]["calls"] = ncall;
+    json["Dw"]["time"] = t1 - t0;
+    json["Dw"]["mflops"] = flops / (t1 - t0);
+    json["Dw"]["mflops_per_rank"] = flops / (t1 - t0) / NP;
+    json["Dw"]["mflops_per_node"] = flops / (t1 - t0) / NN;
+    json["Dw"]["RF"] = 1000000. * data_rf / ((t1 - t0));
+    json["Dw"]["mem"] = 1000000. * data_mem / ((t1 - t0));
+
+    std::cout << GridLogMessage << "Called Dw " << ncall << " times in " << t1 - t0
+              << " us" << std::endl;
    //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
    //    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
    std::cout << GridLogMessage << "mflop/s =   " << flops / (t1 - t0) << std::endl;
-    std::cout << GridLogMessage << "mflop/s per rank =  " << flops / (t1 - t0) / NP << std::endl;
-    std::cout << GridLogMessage << "mflop/s per node =  " << flops / (t1 - t0) / NN << std::endl;
-    std::cout << GridLogMessage << "RF  GiB/s (base 2) =   " << 1000000. * data_rf / ((t1 - t0)) << std::endl;
-    std::cout << GridLogMessage << "mem GiB/s (base 2) =   " << 1000000. * data_mem / ((t1 - t0)) << std::endl;
+    std::cout << GridLogMessage << "mflop/s per rank =  " << flops / (t1 - t0) / NP
+              << std::endl;
+    std::cout << GridLogMessage << "mflop/s per node =  " << flops / (t1 - t0) / NN
+              << std::endl;
+
+    std::cout << GridLogMessage
+              << "RF  GiB/s (base 2) =   " << 1000000. * data_rf / ((t1 - t0))
+              << std::endl;
+    std::cout << GridLogMessage
+              << "mem GiB/s (base 2) =   " << 1000000. * data_mem / ((t1 - t0))
+              << std::endl;
    err = ref - result;
    std::cout << GridLogMessage << "norm diff   " << norm2(err) << std::endl;
    // exit(0);
@@ -313,7 +365,10 @@ int main(int argc, char **argv)
  }
  //  dump=1;
  Dw.Dhop(src, result, 1);
-  std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
+
+  std::cout << GridLogMessage
+            << "Compare to naive wilson implementation Dag to verify correctness"
+            << std::endl;
  std::cout << GridLogMessage << "Called DwDag" << std::endl;
  std::cout << GridLogMessage << "norm dag result " << norm2(result) << std::endl;
  std::cout << GridLogMessage << "norm dag ref    " << norm2(ref) << std::endl;
@@ -333,7 +388,8 @@ int main(int argc, char **argv)
  LatticeFermionF r_o(FrbGrid);
  LatticeFermionF r_eo(FGrid);

-  std::cout << GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec" << std::endl;
+  std::cout << GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"
+            << std::endl;
  pickCheckerboard(Even, src_e, src);
  pickCheckerboard(Odd, src_o, src);

@@ -341,9 +397,14 @@ int main(int argc, char **argv)
  std::cout << GridLogMessage << "src_o" << norm2(src_o) << std::endl;

  // S-direction is INNERMOST and takes no part in the parity.
-  std::cout << GridLogMessage << "*********************************************************" << std::endl;
-  std::cout << GridLogMessage << "* Benchmarking DomainWallFermionF::DhopEO                " << std::endl;
-  std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd() << std::endl;
+  std::cout << GridLogMessage
+
+            << "*********************************************************" << std::endl;
+  std::cout << GridLogMessage
+            << "* Benchmarking DomainWallFermionF::DhopEO                " << std::endl;
+  std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd()
+            << std::endl;
+
  if (sizeof(RealF) == 4)
    std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
  if (sizeof(RealF) == 8)
@@ -360,7 +421,9 @@ int main(int argc, char **argv)
    std::cout << GridLogMessage << "* Using Nc=3       WilsonKernels" << std::endl;
  if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm)
    std::cout << GridLogMessage << "* Using Asm Nc=3   WilsonKernels" << std::endl;
-  std::cout << GridLogMessage << "*********************************************************" << std::endl;
+  std::cout << GridLogMessage
+            << "*********************************************************" << std::endl;
+
  {
    Dw.ZeroCounters();
    FGrid->Barrier();
@@ -386,9 +449,18 @@ int main(int argc, char **argv)
      volume = volume * latt4[mu];
    double flops = (single_site_flops * volume * ncall) / 2.0;

+    json["Deo"]["calls"] = ncall;
+    json["Deo"]["time"] = t1 - t0;
+    json["Deo"]["mflops"] = flops / (t1 - t0);
+    json["Deo"]["mflops_per_rank"] = flops / (t1 - t0) / NP;
+    json["Deo"]["mflops_per_node"] = flops / (t1 - t0) / NN;
+
    std::cout << GridLogMessage << "Deo mflop/s =   " << flops / (t1 - t0) << std::endl;
-    std::cout << GridLogMessage << "Deo mflop/s per rank   " << flops / (t1 - t0) / NP << std::endl;
-    std::cout << GridLogMessage << "Deo mflop/s per node   " << flops / (t1 - t0) / NN << std::endl;
+    std::cout << GridLogMessage << "Deo mflop/s per rank   " << flops / (t1 - t0) / NP
+              << std::endl;
+    std::cout << GridLogMessage << "Deo mflop/s per node   " << flops / (t1 - t0) / NN
+              << std::endl;
+
    Dw.Report();
  }
  Dw.DhopEO(src_o, r_e, DaggerNo);
@@ -420,6 +492,21 @@ int main(int argc, char **argv)

  assert(norm2(src_e) < 1.0e-4);
  assert(norm2(src_o) < 1.0e-4);
+
+  if (!json_filename.empty())
+  {
+    std::cout << GridLogMessage << "writing benchmark results to " << json_filename
+              << std::endl;
+
+    int me = 0;
+    MPI_Comm_rank(MPI_COMM_WORLD, &me);
+    if (me == 0)
+    {
+      std::ofstream json_file(json_filename);
+      json_file << std::setw(4) << json;
+    }
+  }
+
  Grid_finalize();
  exit(0);
 }
--- a/Grid/Common.hpp
+++ b/Grid/Common.hpp
@@ -0,0 +1,36 @@
+/*
+Copyright © 2022 Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef Grid_Benchmarks_Common_hpp_
+#define Grid_Benchmarks_Common_hpp_
+
+#ifndef GRID_MSG
+#define GRID_MSG std::cout << GridLogMessage
+#endif
+
+#ifndef GRID_MSG_MAXSIZE
+#define GRID_MSG_MAXSIZE 1024
+#endif
+
+#define grid_printf(...)                                                                 \
+  {                                                                                      \
+    char _buf[GRID_MSG_MAXSIZE];                                                         \
+    snprintf(_buf, GRID_MSG_MAXSIZE, __VA_ARGS__);                                       \
+    GRID_MSG << _buf;                                                                    \
+  }
+
+#endif // Grid_Benchmarks_Common_hpp_
--- a/Grid/Makefile.am
+++ b/Grid/Makefile.am
@@ -3,10 +3,10 @@ ACLOCAL_AMFLAGS = -I .buildutils/m4
 bin_PROGRAMS =                \
  Benchmark_comms_host_device \
  Benchmark_dwf_fp32          \
-  Benchmark_ITT               \
+  Benchmark_Grid              \
  Benchmark_IO
  
 Benchmark_comms_host_device_SOURCES = Benchmark_comms_host_device.cpp
 Benchmark_dwf_fp32_SOURCES          = Benchmark_dwf_fp32.cpp
-Benchmark_ITT_SOURCES               = Benchmark_ITT.cpp
+Benchmark_Grid_SOURCES              = Benchmark_Grid.cpp
 Benchmark_IO_SOURCES                = Benchmark_IO.cpp
--- a/Grid/Readme.md
+++ b/Grid/Readme.md
@@ -0,0 +1,69 @@
+# Grid benchmarks
+
+This folder contains benchmarks for the [Grid](https://github.com/paboyle/Grid) library.
+The benchmarks can be summarised as follows
+
+- `Benchmark_Grid`: This benchmark measures floating-point performance for various fermion
+matrices, as well as bandwidth for different operations. Measurements are
+performed for a fixed range of problem sizes.
+
+## TL;DR
+Build and install Grid, all dependencies, and the benchmark with
+```bash
+systems/<system>/bootstrap-env.sh <env_dir> # build dependencies, takes a long time
+./build-grid.sh <env_dir> <config>          # build Grid
+./build-benchmark.sh <env_dir> <config>     # build benchmarks
+```
+where `<env_dir>` is an arbitrary directory where every product will be stored, `<system>`
+is a sub-directory of `systems` containing system-specific scripts 
+(an existing preset or your own), and finally `<config>` is the name of a build config
+in `systems/<system>/grid-config.json`. After a successful execution the benchmark binaries
+will be in `<env_dir>/prefix/gridbench_<config>`.
+
+## Environment setup
+A complete runtime environment can be deployed using scripts from this repository. System-specific scripts are in the `systems` directory.
+
+You should first deploy the environment for the specific system you are using, for example
+```bash
+systems/tursa/bootstrap-env.sh ./env
+```
+will deploy the relevant environment for the [Tursa](https://www.epcc.ed.ac.uk/hpc-services/dirac-tursa-gpu) supercomputer in `./env`. This step might compile from source a large set
+of packages, and might take some time to complete.
+
+After that, the environment directory (`./env` in the example above) will contain an `env.sh` file that needs to be sourced to activate the environment
+```bash
+source ./env/env.sh
+```
+Additional scripts `env-*.sh` can then be sourced to activate more specific environments;
+this should be done after sourcing `env.sh` as above.
+
+## Building the benchmarks
+The environment directory contains a `grid-config.json` file specifying compilation flag
+configurations for Grid (please see Grid's repository for documentation). All entries have 
+the form
+```json
+{
+  "name": "foo",          // name of the configuration
+  "env-script": "bar.sh", // script to source before building 
+                          // (path relative to the environment directory)
+  "commit": "...",        // Grid commit to use 
+                          // (anything that can be an argument of git checkout)
+  "config-options": "...", // options to pass to the configure script
+  "env" : {               // environment variables
+    "VAR": "value"        // export VAR="value" before building
+  }
+}
+```
+Grid can then be built with
+```
+./build-grid.sh <env_dir> <config>
+```
+where `<env_dir>` is the environment directory and `<config>` is the build config name in 
+`grid-config.json`. Similarly, the benchmarks can then be built with
+```
+./build-benchmark.sh <env_dir> <config>
+```
+
+## Running the benchmarks
+After building the benchmarks as above you can find the binaries in 
+`<env_dir>/prefix/gridbench_<config>`.
--- a/Grid/bootstrap.sh
+++ b/Grid/bootstrap.sh
@@ -2,5 +2,10 @@

 set -euo pipefail

+json_url='https://raw.githubusercontent.com/nlohmann/json/bc889afb4c5bf1c0d8ee29ef35eaaf4c8bef8a5d/single_include/nlohmann/json.hpp'
+
+if [ ! -f json.hpp ]; then
+  wget ${json_url}
+fi
 mkdir -p .buildutils/m4
 autoreconf -fvi
--- a/Grid/build-benchmark.sh
+++ b/Grid/build-benchmark.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+# shellcheck disable=SC1090,SC1091
+
+set -euo pipefail
+
+if (( $# != 2 )); then
+    echo "usage: $(basename "$0") <environment directory> <config>" 1>&2
+    exit 1
+fi
+env_dir=$1
+cfg=$2
+
+call_dir=$(pwd -P)
+script_dir="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")"
+cd "${env_dir}"
+env_dir=$(pwd -P)
+cd "${call_dir}"
+build_dir="${env_dir}/build/Grid-benchmarks/${cfg}"
+mkdir -p "${build_dir}"
+source "${env_dir}/env.sh"
+entry=$(jq ".configs[]|select(.name==\"${cfg}\")" "${env_dir}"/grid-config.json)
+env_script=$(echo "${entry}" | jq -r ".\"env-script\"")
+cd "${build_dir}" || exit 1 # executed script, not a function: 'return' is invalid here
+source "${env_dir}/${env_script}"
+if [ ! -f Makefile ]; then
+    "${script_dir}/configure" --with-grid="${env_dir}/prefix/grid_${cfg}" \
+                            --prefix="${env_dir}/prefix/gridbench_${cfg}"
+fi
+make -j 128
+make install
+cd "${call_dir}"
--- a/Grid/build-grid.sh
+++ b/Grid/build-grid.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+# shellcheck disable=SC1090,SC1091
+
+set -euo pipefail
+
+if (( $# != 2 )); then
+    echo "usage: $(basename "$0") <environment directory> <config>" 1>&2
+    exit 1
+fi
+env_dir=$1
+cfg=$2
+
+call_dir=$(pwd -P)
+cd "${env_dir}"
+env_dir=$(pwd -P)
+cd "${call_dir}"
+build_dir="${env_dir}/build/Grid/${cfg}"
+if [ -d "${build_dir}" ]; then
+    echo "error: directory '${build_dir}' exists"
+    exit 1
+fi
+mkdir -p "${build_dir}"
+source "${env_dir}/env.sh"
+entry=$(jq ".configs[]|select(.name==\"${cfg}\")" "${env_dir}"/grid-config.json)
+IFS=" " read -r -a args <<< "$(echo "${entry}" | jq -r ".\"config-options\"")"
+env_script=$(echo "${entry}" | jq -r ".\"env-script\"")
+cd "${build_dir}" || exit 1 # executed script, not a function: 'return' is invalid here
+source "${env_dir}/${env_script}"
+extra_env=$(mktemp)
+echo "${entry}" | jq -r '.env|to_entries|map("export \(.key)='\''\(.value|tostring)'\''")|.[]' > "${extra_env}"
+commit=$(echo "${entry}" | jq -r ".commit")
+git clone https://github.com/paboyle/Grid.git "${build_dir}"
+cd "${build_dir}"
+git checkout "${commit}"
+./bootstrap.sh
+mkdir build; cd build
+source "${extra_env}"
+../configure --prefix="${env_dir}/prefix/grid_${cfg}" "${args[@]}"
+make -j128
+make install
+rm -rf "${extra_env}"
+cd "${call_dir}"
--- a/Grid/configure.ac
+++ b/Grid/configure.ac
@@ -3,55 +3,69 @@ AC_INIT([lattice-bench], [0.1], [antonin.portelli@me.com])
 AC_CANONICAL_BUILD
 AC_CANONICAL_HOST
 AC_CANONICAL_TARGET
-AC_CONFIG_SRCDIR([Benchmark_ITT.cpp])
+AC_CONFIG_SRCDIR([Benchmark_Grid.cpp])
 AC_CONFIG_MACRO_DIR([.buildutils/m4])
 AC_CONFIG_HEADERS([config.h])
 AM_INIT_AUTOMAKE([-Wall -Werror foreign])
 m4_ifdef([AM_SILENT_RULES],[AM_SILENT_RULES([yes])])

-# Checks for programs.
-AC_PROG_CXX
-AC_PROG_CC
-AC_PROG_RANLIB
-AM_PROG_AR
-AC_LANG([C++])
-
 AC_ARG_WITH([grid],
    [AS_HELP_STRING([--with-grid=<prefix>],
    [try this for a non-standard install prefix of Grid])],
    [PATH="$with_grid/bin$PATH_SEPARATOR$PATH"]
    [CXXFLAGS="$CXXFLAGS -I$with_grid/include"]
    [LDFLAGS="$LDFLAGS -L$with_grid/lib"])
+
 AC_CHECK_PROG([GRIDCONF],[grid-config],[yes])
 if test x"$GRIDCONF" != x"yes" ; then
    AC_MSG_ERROR([grid-config not found])
 fi
+if test x"$CXX" = x ; then # '=' is the portable string comparison for test
+    CXX="`grid-config --cxx`"
+elif test "$CXX" != "`grid-config --cxx`" ; then
+    AC_MSG_WARN([CXX differs from that reported by grid-config])
+fi
+if test x"$CXXLD" = x ; then
+    CXXLD="`grid-config --cxxld`"
+elif test "$CXXLD" != "`grid-config --cxxld`" ; then
+    AC_MSG_WARN([CXXLD differs from that reported by grid-config])
+fi
 CXXFLAGS="$CXXFLAGS `grid-config --cxxflags`"
-LDFLAGS="$LDFLAGS `grid-config --ldflags`"
 CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS"
+
+AC_PROG_CC
+AM_PROG_CC_C_O
+AC_PROG_CXX
+
+LDFLAGS="$LDFLAGS `grid-config --ldflags`"
 LDFLAGS="$AM_LDFLAGS $LDFLAGS"
-LIBS=" -lGrid $LIBS `grid-config --libs`"
+LIBS=" -ldl -lGrid $LIBS `grid-config --libs`"
+
+AC_PROG_RANLIB
+AM_PROG_AR
+AC_LANG([C++])

 AC_MSG_CHECKING([that a minimal Grid program compiles]);
 AC_LINK_IFELSE(
-        [AC_LANG_SOURCE([[
+	[AC_LANG_SOURCE([[
    #include <Grid/Grid.h>
-
+    
    using namespace Grid;
-
+    
    int main(int argc, char *argv[])
    {
        Grid_init(&argc, &argv);
        Grid_finalize();
-
+        
        return 0;
    }
-
+    
    ]])],
-        [AC_MSG_RESULT([yes])],
+	[AC_MSG_RESULT([yes])],
    [AC_MSG_RESULT([no])]
-    [AC_MSG_ERROR([Could not compile a minimal Grid program])])
+    [AC_MSG_ERROR([impossible to compile a minimal Grid program])])

+AC_SUBST([CXXLD])
 AC_SUBST([AM_CXXFLAGS])
 AC_SUBST([AM_LDFLAGS])
 AC_CONFIG_FILES([Makefile])
--- a/Grid/systems/tursa/bootstrap-env.sh
+++ b/Grid/systems/tursa/bootstrap-env.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+if (( $# != 1 )); then
+    echo "usage: $(basename "$0") <environment directory>" 1>&2
+    exit 1
+fi
+dir=$1
+
+call_dir=$(pwd -P)
+script_dir="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")"
+if [ -d "${dir}" ]; then
+    echo "error: directory '${dir}' exists"
+    exit 1
+fi
+mkdir -p "${dir}"
+cd "${dir}"
+git clone https://github.com/spack/spack.git
+cd "${call_dir}"
+cp "${script_dir}"/files/* "${dir}"
+cp "${script_dir}/env.sh" "${script_dir}/grid-config.json" "${dir}"
+source "${dir}"/spack/share/spack/setup-env.sh
+"${script_dir}"/spack-bootstrap.sh "${dir}"
--- a/Grid/systems/tursa/env-cpu.sh
+++ b/Grid/systems/tursa/env-cpu.sh
@@ -1,5 +0,0 @@
-#!/usr/bin/env bash
-# shellcheck disable=SC2046
-
-script_dir="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")"
-spack load $(cat "${script_dir}"/grid-cpu.spack)
--- a/Grid/systems/tursa/env-gpu.sh
+++ b/Grid/systems/tursa/env-gpu.sh
@@ -1,5 +0,0 @@
-#!/usr/bin/env bash
-# shellcheck disable=SC2046
-
-script_dir="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")"
-spack load $(cat "${script_dir}"/grid-gpu.spack)
--- a/Grid/systems/tursa/env.sh
+++ b/Grid/systems/tursa/env.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+# shellcheck disable=SC1091
+
+GRIDENVDIR="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")"
+export GRIDENVDIR
+export PATH="${GRIDENVDIR}/prefix/base/bin:${PATH}"
+# ACLOCAL_PATH may be unset: expand it conditionally so that sourcing this file from a
+# script running under `set -u` does not abort, and no empty path entry is appended
+export ACLOCAL_PATH="${GRIDENVDIR}/prefix/base/share/aclocal${ACLOCAL_PATH:+:${ACLOCAL_PATH}}"
+source "${GRIDENVDIR}"/spack/share/spack/setup-env.sh
--- a/Grid/systems/tursa/files/cpu-mpi-wrapper.sh
+++ b/Grid/systems/tursa/files/cpu-mpi-wrapper.sh
--- a/Grid/systems/tursa/files/env-cpu.sh
+++ b/Grid/systems/tursa/files/env-cpu.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+
+spack env activate grid-cpu
+spack env status
--- a/Grid/systems/tursa/files/env-gpu.sh
+++ b/Grid/systems/tursa/files/env-gpu.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+
+spack env activate grid-gpu
+spack env status
--- a/Grid/systems/tursa/files/gpu-mpi-wrapper.sh
+++ b/Grid/systems/tursa/files/gpu-mpi-wrapper.sh
--- a/Grid/systems/tursa/grid-config.json
+++ b/Grid/systems/tursa/grid-config.json
@@ -0,0 +1,25 @@
+{
+  "configs": [
+    {
+      "name": "gpu",
+      "env-script": "env-gpu.sh",
+      "commit": "796abfad80625d81bb16af7ff6ec612a836f17d8",
+      "config-options": "--enable-comms=mpi --enable-simd=GPU --enable-alloc-align=4k --enable-accelerator-cshift --enable-shm=nvlink --enable-gen-simd-width=64 --enable-accelerator=cuda --disable-unified --enable-gparity=no --enable-fermion-reps=no",
+      "env" : {
+        "CXX": "nvcc",
+        "LDFLAGS": "-cudart shared",
+        "CXXFLAGS": "-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared"
+      }
+    },
+    {
+      "name": "cpu",
+      "env-script": "env-cpu.sh",
+      "commit": "796abfad80625d81bb16af7ff6ec612a836f17d8",
+      "config-options": "--enable-comms=mpi-auto --enable-simd=AVX2 --enable-alloc-align=4k --enable-shm=shmget --enable-gparity=no --enable-fermion-reps=no",
+      "env" : {
+        "CXX": "clang++",
+        "MPICXX": "mpicxx"
+      }
+    }
+  ]
+}
--- a/Grid/systems/tursa/grid-cpu.spack
+++ b/Grid/systems/tursa/grid-cpu.spack
@@ -1,10 +0,0 @@
-gcc@9.4.0 
-llvm@12.0.1 
-ucx@1.12.0.CPU%gcc@9.4.0 
-openmpi@4.1.1.CPU%gcc@9.4.0
-hdf5^openmpi@4.1.1.CPU%gcc@9.4.0
-fftw^openmpi@4.1.1.CPU%gcc@9.4.0
-openssl
-gmp%gcc@9.4.0 
-mpfr%gcc@9.4.0
-c-lime
--- a/Grid/systems/tursa/grid-gpu.spack
+++ b/Grid/systems/tursa/grid-gpu.spack
@@ -1,10 +0,0 @@
-gcc@9.4.0
-cuda@11.4.0
-ucx@1.12.0.GPU%gcc@9.4.0
-openmpi@4.1.1.GPU%gcc@9.4.0
-hdf5^openmpi@4.1.1.GPU%gcc@9.4.0
-fftw^openmpi@4.1.1.GPU%gcc@9.4.0
-openssl
-gmp%gcc@9.4.0
-mpfr%gcc@9.4.0
-c-lime
--- a/Grid/systems/tursa/spack-bootstrap.sh
+++ b/Grid/systems/tursa/spack-bootstrap.sh
@@ -2,16 +2,19 @@
 # shellcheck disable=SC2016
 set -euo pipefail

-GCC='gcc@9.4.0'
-CUDA='cuda@11.4.0'
-HDF5='hdf5@1.10.7'
+gcc_spec='gcc@9.4.0'
+cuda_spec='cuda@11.4.0'
+hdf5_spec='hdf5@1.10.7'

 if (( $# != 1 )); then
    echo "usage: $(basename "$0") <env dir>" 1>&2
    exit 1
 fi
-ENVDIR=$1
-CWD=$(pwd -P)
+dir=$1
+cwd=$(pwd -P)
+cd "${dir}"
+dir=$(pwd -P)
+cd "${cwd}"

 # General configuration ########################################################
 # build with 128 tasks
@@ -33,107 +36,92 @@ echo 'packages:
 spack config --scope site add -f external.yaml
 rm external.yaml

+# Base compilers ###############################################################
 # configure system base
 spack compiler find --scope site

-# Base packages ################################################################
-# install GCC
-spack install ${GCC}
-spack load ${GCC}
+# install GCC, CUDA & LLVM
+spack install ${gcc_spec} ${cuda_spec} llvm
+
+spack load llvm
 spack compiler find --scope site
-spack unload ${GCC}
+spack unload llvm

-# clean
-spack clean
-spack gc -y
-
-# install CUDA
-spack install ${CUDA}
-
-# install development tools
-dev_tools=("autoconf" "automake" "libtool" "git")
-spack install "${dev_tools[@]}"
-
-# create view for CLI & dev tools
-spack view symlink -i "${ENVDIR}/prefix/base" "${dev_tools[@]}"
-
-# install clang
-spack install llvm@12.0.1
-
-# locate new compilers
-spack load llvm@12.0.1
+spack load ${gcc_spec}
 spack compiler find --scope site
-spack unload llvm@12.0.1
+spack unload ${gcc_spec}

 # Manual compilation of OpenMPI & UCX ##########################################
 # set build directories
-mkdir -p "${ENVDIR}"/build
-cd "${ENVDIR}"/build
+mkdir -p "${dir}"/build
+cd "${dir}"/build

-spack load ${GCC} ${CUDA}
+spack load ${gcc_spec} ${cuda_spec}

-CUDA_PATH=$(which nvcc | sed "s/bin/@/g" | cut -d "@" -f1)
-GDRCOPY_PATH=/mnt/lustre/tursafs1/apps/gdrcopy/2.3.1
+cuda_path=$(spack find --format "{prefix}" cuda)
+gdrcopy_path=/mnt/lustre/tursafs1/apps/gdrcopy/2.3.1

 # Install ucx 1.12.0
-UCX_URL=https://github.com/openucx/ucx/releases/download/v1.12.0/ucx-1.12.0.tar.gz
+ucx_url=https://github.com/openucx/ucx/releases/download/v1.12.0/ucx-1.12.0.tar.gz

 echo "-- building UCX from source"
-wget ${UCX_URL}
-UCX_AR=$(basename ${UCX_URL})
-tar -xvf "${UCX_AR}"
-cd "${UCX_AR%.tar.gz}"
+wget ${ucx_url}
+ucx_ar=$(basename ${ucx_url})
+tar -xvf "${ucx_ar}"
+cd "${ucx_ar%.tar.gz}"

 # ucx gpu build
-mkdir build_gpu; cd build_gpu
+mkdir -p build_gpu; cd build_gpu
 ../configure --build=x86_64-redhat-linux-gnu --host=x86_64-redhat-linux-gnu    \
-             --disable-dependency-tracking --prefix="${ENVDIR}"/prefix/ucx_gpu \
+             --disable-dependency-tracking --prefix="${dir}"/prefix/ucx_gpu    \
             --enable-devel-headers --enable-examples --enable-optimizations   \
-             --with-gdrcopy=${GDRCOPY_PATH} --with-verbs --disable-logging     \
+             --with-gdrcopy=${gdrcopy_path} --with-verbs --disable-logging     \
             --disable-debug --disable-assertions --enable-cma                 \
             --with-knem=/opt/knem-1.1.4.90mlnx1/ --with-rdmacm                \
             --without-rocm --without-ugni --without-java                      \
-             --enable-compiler-opt=3 --with-cuda="${CUDA_PATH}" --without-cm   \
+             --enable-compiler-opt=3 --with-cuda="${cuda_path}" --without-cm   \
             --with-rc --with-ud --with-dc --with-mlx5-dv --with-dm            \
-             --enable-mt LDFLAGS=-L${GDRCOPY_PATH}/lib
+             --enable-mt --without-go LDFLAGS=-L${gdrcopy_path}/lib
 make -j 128
 make install
 cd ..

 # ucx cpu build
-mkdir build_cpu; cd build_cpu
+mkdir -p build_cpu; cd build_cpu
 ../configure --build=x86_64-redhat-linux-gnu --host=x86_64-redhat-linux-gnu    \
-             --disable-dependency-tracking --prefix="${ENVDIR}"/prefix/ucx_cpu \
+             --disable-dependency-tracking --prefix="${dir}"/prefix/ucx_cpu    \
             --enable-devel-headers --enable-examples --enable-optimizations   \
             --with-verbs --disable-logging --disable-debug                    \
             --disable-assertions --enable-mt --enable-cma                     \
             --with-knem=/opt/knem-1.1.4.90mlnx1/ --with-rdmacm                \
             --without-rocm --without-ugni --without-java                      \
             --enable-compiler-opt=3 --without-cm --without-ugni --with-rc     \
-             --with-ud --with-dc --with-mlx5-dv --with-dm --enable-mt
+             --with-ud --with-dc --with-mlx5-dv --with-dm --enable-mt --without-go
 make -j 128
 make install

-cd "${ENVDIR}"/build
+cd "${dir}"/build

-# Install openmpi 4.1.1 (needs to be done on a gpu node)
-OMPI_URL=https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.1.tar.gz
+# Install openmpi 4.1.1
+ompi_url=https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.1.tar.gz

 echo "-- building OpenMPI from source"

-wget ${OMPI_URL}
-OMPI_AR=$(basename ${OMPI_URL})
-tar -xvf "${OMPI_AR}"
-cd "${OMPI_AR%.tar.gz}"
+wget ${ompi_url}
+ompi_ar=$(basename ${ompi_url})
+tar -xvf "${ompi_ar}"
+cd "${ompi_ar%.tar.gz}"
+export AUTOMAKE_JOBS=128
+./autogen.pl -f

 # openmpi gpu build
 mkdir build_gpu; cd build_gpu
-../configure --prefix="${ENVDIR}"/prefix/ompi_gpu --without-xpmem \
-             --with-ucx="${ENVDIR}"/prefix/ucx_gpu                \
-             --with-ucx-libdir="${ENVDIR}"/prefix/ucx_gpu/lib     \
+../configure --prefix="${dir}"/prefix/ompi_gpu --without-xpmem    \
+             --with-ucx="${dir}"/prefix/ucx_gpu                   \
+             --with-ucx-libdir="${dir}"/prefix/ucx_gpu/lib        \
             --with-knem=/opt/knem-1.1.4.90mlnx1/                 \
             --enable-mca-no-build=btl-uct                        \
-             --with-cuda="${CUDA_PATH}" --disable-getpwuid        \
+             --with-cuda="${cuda_path}" --disable-getpwuid        \
             --with-verbs --with-slurm --enable-mpi-fortran=all   \
             --with-pmix=internal --with-libevent=internal
 make -j 128 
@@ -142,61 +130,76 @@ cd ..

 # openmpi cpu build
 mkdir build_cpu; cd build_cpu
-../configure --prefix="${ENVDIR}"/prefix/ompi_cpu --without-xpmem \
-             --with-ucx="${ENVDIR}"/prefix/ucx_cpu                \
-             --with-ucx-libdir="${ENVDIR}"/prefix/ucx_cpu/lib     \
+../configure --prefix="${dir}"/prefix/ompi_cpu --without-xpmem    \
+             --with-ucx="${dir}"/prefix/ucx_cpu                   \
+             --with-ucx-libdir="${dir}"/prefix/ucx_cpu/lib        \
             --with-knem=/opt/knem-1.1.4.90mlnx1/                 \
             --enable-mca-no-build=btl-uct --disable-getpwuid     \
             --with-verbs --with-slurm --enable-mpi-fortran=all   \
             --with-pmix=internal --with-libevent=internal
 make -j 128 
 make install
-cd "${ENVDIR}"
+cd "${dir}"

 # Add externals to spack
 echo "packages:
  ucx:
    externals:
    - spec: \"ucx@1.12.0.GPU%gcc@9.4.0\"
-      prefix: ${ENVDIR}/prefix/ucx_gpu
+      prefix: ${dir}/prefix/ucx_gpu
    - spec: \"ucx@1.12.0.CPU%gcc@9.4.0\"
-      prefix: ${ENVDIR}/prefix/ucx_cpu
+      prefix: ${dir}/prefix/ucx_cpu
    buildable: False
  openmpi:
    externals:
    - spec: \"openmpi@4.1.1.GPU%gcc@9.4.0\"
-      prefix: ${ENVDIR}/prefix/ompi_gpu
+      prefix: ${dir}/prefix/ompi_gpu
    - spec: \"openmpi@4.1.1.CPU%gcc@9.4.0\"
-      prefix: ${ENVDIR}/prefix/ompi_cpu
+      prefix: ${dir}/prefix/ompi_cpu
    buildable: False" > spack.yaml

 spack config --scope site add -f spack.yaml
 rm spack.yaml
-spack install ucx@1.12.0.GPU%gcc@9.4.0
-spack install ucx@1.12.0.CPU%gcc@9.4.0
-spack install openmpi@4.1.1.GPU%gcc@9.4.0
-spack install openmpi@4.1.1.CPU%gcc@9.4.0
+spack install ucx@1.12.0.GPU%gcc@9.4.0 openmpi@4.1.1.GPU%gcc@9.4.0
+spack install ucx@1.12.0.CPU%gcc@9.4.0 openmpi@4.1.1.CPU%gcc@9.4.0

-# Install Grid dependencies ####################################################
-cd "${CWD}"
+cd "${cwd}"

-OPENMPIGPUHASH=$(spack find --format "{hash}" openmpi@4.1.1.GPU)
-OPENMPICPUHASH=$(spack find --format "{hash}" openmpi@4.1.1.CPU)
+# environments #################################################################
+dev_tools=("autoconf" "automake" "libtool" "jq")
+ompi_gpu_hash=$(spack find --format "{hash}" openmpi@4.1.1.GPU)
+ompi_cpu_hash=$(spack find --format "{hash}" openmpi@4.1.1.CPU)

-spack install ${HDF5}+cxx+threadsafe ^/"${OPENMPIGPUHASH}"
-spack install ${HDF5}+cxx+threadsafe ^/"${OPENMPICPUHASH}"
-spack install fftw ^/"${OPENMPIGPUHASH}"
-spack install fftw ^/"${OPENMPICPUHASH}"
-spack install openssl gmp mpfr c-lime
+spack env create grid-gpu
+spack env activate grid-gpu
+spack add ${gcc_spec} ${cuda_spec} "${dev_tools[@]}" 
+spack add ucx@1.12.0.GPU%gcc@9.4.0 openmpi@4.1.1.GPU%gcc@9.4.0
+spack add ${hdf5_spec}+cxx+threadsafe ^/"${ompi_gpu_hash}"
+spack add fftw ^/"${ompi_gpu_hash}"
+spack add openssl gmp mpfr c-lime
+spack install
+spack env deactivate
+
+spack env create grid-cpu
+spack env activate grid-cpu
+spack add llvm "${dev_tools[@]}" 
+spack add ucx@1.12.0.CPU%gcc@9.4.0 openmpi@4.1.1.CPU%gcc@9.4.0
+spack add ${hdf5_spec}+cxx+threadsafe ^/"${ompi_cpu_hash}"
+spack add fftw ^/"${ompi_cpu_hash}"
+spack add openssl gmp mpfr c-lime
+spack install
+spack env deactivate

 # Final setup ##################################################################
 spack clean
+spack gc -y

 # add more environment variables in module loading
-spack config --scope site add 'modules:prefix_inspections:lib:[LIBRARY_PATH]'
+spack config --scope site add 'modules:prefix_inspections:lib:[LD_LIBRARY_PATH,LIBRARY_PATH]'
+spack config --scope site add 'modules:prefix_inspections:lib64:[LD_LIBRARY_PATH,LIBRARY_PATH]'
 spack config --scope site add 'modules:prefix_inspections:include:[C_INCLUDE_PATH,CPLUS_INCLUDE_PATH,INCLUDE]'
 spack module tcl refresh -y

 # permission change for group access
-chmod -R g+rw "${ENVDIR}/spack/var/spack/cache"
-setfacl -d -R -m g::rwX "${ENVDIR}/spack/var/spack/cache"
+chmod -R g+rw "${dir}/spack/var/spack/cache"
+setfacl -d -R -m g::rwX "${dir}/spack/var/spack/cache"
--- a/Readme.md
+++ b/Readme.md
@@ -0,0 +1,8 @@
+# Lattice benchmarks
+
+This repository is an attempt at packaging benchmarks for various libraries used for
+lattice field theory simulations. It currently features only the Grid library, but
+more will be added later.
+
+Libraries:
+- [Grid](https://github.com/aportelli/) - [Documentation](Grid/Readme.md)