forked from portelli/lattice-benchmarks
Compare commits
56 Commits
cb538bfbf1
9056e9023c
c049a2ad0b
fb43d16830
6fa2e6bcd0
fb4c456776
3fbb8ea346
86b160cb5c
dc411017bb
b2cc780690
6d87396576
e9d084ce09
32e301fc67
eaa4feee43
025f9dab50
3a561091d9
191c0cfca5
6f9af8acad
371a329457
f81cb198ab
a7e1d9e67f
19c9dcb6ae
7d89380b80
4cd67805b9
f7e607eae4
a267986800
a1ec08cdb3
fb6c79d9ca
d7647afa72
ba00493c7d
6055e0503c
6ea093fc80
fa47ec5bbe
7235bfde4c
e5c61c2db1
80c80049d7
ce0d4d9457
cc4c0255bc
bdfb94bf11
af950e6e28
14fb2fddc2
5198bbe1cd
5f9abbb8d0
9b6c6d4d40
43e264d29b
51eae5723e
ce890a8fc2
8f1a556afa
58080730ae
f180cbb8ec
5098f57f08
0e2b7225db
7b689a8c94
77c75ea5b8
78c464d1d8
8f043343fb
File diff suppressed because it is too large
@ -32,7 +32,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 #ifdef HAVE_LIME
 using namespace Grid;
 
-std::string filestem(const int l) { return "iobench_l" + std::to_string(l); }
+std::string filestem(const int l) { return "io/iobench_l" + std::to_string(l); }
 
 int vol(const int i) { return BENCH_IO_LMIN + 2 * i; }
 
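The new filestem writes the benchmark files under an `io/` subdirectory. A minimal preparation sketch, assuming the benchmark does not create that directory itself:

```bash
# Hypothetical preparation step before running Benchmark_IO: the iobench_l*
# files are now written under io/, so create the directory first
# (assumption: the benchmark does not create it on its own).
mkdir -p io
```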
@ -56,13 +56,6 @@ template <typename Mat> void stats(Mat &mean, Mat &stdDev, const std::vector<Mat
 mean /= n;
 }
 
-#define grid_printf(...) \
-{ \
-char _buf[1024]; \
-sprintf(_buf, __VA_ARGS__); \
-MSG << _buf; \
-}
-
 enum
 {
 sRead = 0,
@ -83,58 +76,58 @@ int main(int argc, char **argv)
 std::vector<Eigen::VectorXd> avPerf(BENCH_IO_NPASS, Eigen::VectorXd::Zero(4));
 std::vector<int> latt;
 
-MSG << "Grid is setup to use " << threads << " threads" << std::endl;
-MSG << "MPI partition " << mpi << std::endl;
+GRID_MSG << "Grid is setup to use " << threads << " threads" << std::endl;
+GRID_MSG << "MPI partition " << mpi << std::endl;
 for (unsigned int i = 0; i < BENCH_IO_NPASS; ++i)
 {
-MSG << BIGSEP << std::endl;
-MSG << "Pass " << i + 1 << "/" << BENCH_IO_NPASS << std::endl;
-MSG << BIGSEP << std::endl;
-MSG << SEP << std::endl;
-MSG << "Benchmark std write" << std::endl;
-MSG << SEP << std::endl;
+grid_big_sep();
+GRID_MSG << "Pass " << i + 1 << "/" << BENCH_IO_NPASS << std::endl;
+grid_big_sep();
+grid_small_sep();
+GRID_MSG << "Benchmark std write" << std::endl;
+grid_small_sep();
 for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
 {
 latt = {l * mpi[0], l * mpi[1], l * mpi[2], l * mpi[3]};
 
-MSG << "-- Local volume " << l << "^4" << std::endl;
+GRID_MSG << "-- Local volume " << l << "^4" << std::endl;
 writeBenchmark<LatticeFermion>(latt, filestem(l), stdWrite<LatticeFermion>);
 perf[i](volInd(l), sWrite) = BinaryIO::lastPerf.mbytesPerSecond;
 }
 
-MSG << SEP << std::endl;
-MSG << "Benchmark std read" << std::endl;
-MSG << SEP << std::endl;
+grid_small_sep();
+GRID_MSG << "Benchmark std read" << std::endl;
+grid_small_sep();
 for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
 {
 latt = {l * mpi[0], l * mpi[1], l * mpi[2], l * mpi[3]};
 
-MSG << "-- Local volume " << l << "^4" << std::endl;
+GRID_MSG << "-- Local volume " << l << "^4" << std::endl;
 readBenchmark<LatticeFermion>(latt, filestem(l), stdRead<LatticeFermion>);
 perf[i](volInd(l), sRead) = BinaryIO::lastPerf.mbytesPerSecond;
 }
 
 #ifdef HAVE_LIME
-MSG << SEP << std::endl;
-MSG << "Benchmark Grid C-Lime write" << std::endl;
-MSG << SEP << std::endl;
+grid_small_sep();
+GRID_MSG << "Benchmark Grid C-Lime write" << std::endl;
+grid_small_sep();
 for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
 {
 latt = {l * mpi[0], l * mpi[1], l * mpi[2], l * mpi[3]};
 
-MSG << "-- Local volume " << l << "^4" << std::endl;
+GRID_MSG << "-- Local volume " << l << "^4" << std::endl;
 writeBenchmark<LatticeFermion>(latt, filestem(l), limeWrite<LatticeFermion>);
 perf[i](volInd(l), gWrite) = BinaryIO::lastPerf.mbytesPerSecond;
 }
 
-MSG << SEP << std::endl;
-MSG << "Benchmark Grid C-Lime read" << std::endl;
-MSG << SEP << std::endl;
+grid_small_sep();
+GRID_MSG << "Benchmark Grid C-Lime read" << std::endl;
+grid_small_sep();
 for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
 {
 latt = {l * mpi[0], l * mpi[1], l * mpi[2], l * mpi[3]};
 
-MSG << "-- Local volume " << l << "^4" << std::endl;
+GRID_MSG << "-- Local volume " << l << "^4" << std::endl;
 readBenchmark<LatticeFermion>(latt, filestem(l), limeRead<LatticeFermion>);
 perf[i](volInd(l), gRead) = BinaryIO::lastPerf.mbytesPerSecond;
 }
@ -159,13 +152,13 @@ int main(int argc, char **argv)
 avRob.fill(100.);
 avRob -= 100. * avStdDev.cwiseQuotient(avMean.cwiseAbs());
 
-MSG << BIGSEP << std::endl;
-MSG << "SUMMARY" << std::endl;
-MSG << BIGSEP << std::endl;
-MSG << "Summary of individual results (all results in MB/s)." << std::endl;
-MSG << "Every second colum gives the standard deviation of the previous column."
+grid_big_sep();
+GRID_MSG << "SUMMARY" << std::endl;
+grid_big_sep();
+GRID_MSG << "Summary of individual results (all results in MB/s)." << std::endl;
+GRID_MSG << "Every second colum gives the standard deviation of the previous column."
 << std::endl;
-MSG << std::endl;
+GRID_MSG << std::endl;
 grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n", "L", "std read", "std dev",
 "std write", "std dev", "Grid read", "std dev", "Grid write", "std dev");
 for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
@ -176,10 +169,10 @@ int main(int argc, char **argv)
 stdDev(volInd(l), gRead), mean(volInd(l), gWrite),
 stdDev(volInd(l), gWrite));
 }
-MSG << std::endl;
-MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)"
+GRID_MSG << std::endl;
+GRID_MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)"
 << std::endl;
-MSG << std::endl;
+GRID_MSG << std::endl;
 grid_printf("%4s %12s %12s %12s %12s\n", "L", "std read", "std write", "Grid read",
 "Grid write");
 for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
@ -187,21 +180,21 @@ int main(int argc, char **argv)
 grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n", l, rob(volInd(l), sRead),
 rob(volInd(l), sWrite), rob(volInd(l), gRead), rob(volInd(l), gWrite));
 }
-MSG << std::endl;
-MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX
+GRID_MSG << std::endl;
+GRID_MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX
 << "^4 (all results in MB/s)." << std::endl;
-MSG << "Every second colum gives the standard deviation of the previous column."
+GRID_MSG << "Every second colum gives the standard deviation of the previous column."
 << std::endl;
-MSG << std::endl;
+GRID_MSG << std::endl;
 grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n", "std read", "std dev",
 "std write", "std dev", "Grid read", "std dev", "Grid write", "std dev");
 grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", avMean(sRead),
 avStdDev(sRead), avMean(sWrite), avStdDev(sWrite), avMean(gRead),
 avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
-MSG << std::endl;
-MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)"
+GRID_MSG << std::endl;
+GRID_MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)"
 << std::endl;
-MSG << std::endl;
+GRID_MSG << std::endl;
 grid_printf("%12s %12s %12s %12s\n", "std read", "std write", "Grid read",
 "Grid write");
 grid_printf("%12.1f %12.1f %12.1f %12.1f\n", avRob(sRead), avRob(sWrite), avRob(gRead),
@ -18,12 +18,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 #ifndef Benchmark_IO_hpp_
 #define Benchmark_IO_hpp_
 
+#include "Common.hpp"
 #include <Grid/Grid.h>
-#define MSG std::cout << GridLogMessage
-#define SEP \
-"-----------------------------------------------------------------------------"
-#define BIGSEP \
-"============================================================================="
 #ifdef HAVE_LIME
 
 namespace Grid
@ -50,9 +46,9 @@ namespace Grid
 // crc = GridChecksum::crc32(vec_v.cpu_ptr, size);
 // std::fwrite(&crc, sizeof(uint32_t), 1, file);
 // crcWatch.Stop();
-// MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
-// ioWatch.Start();
-// std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object),
+// GRID_MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec <<
+// std::endl; ioWatch.Start(); std::fwrite(vec_v.cpu_ptr, sizeof(typename
+// Field::scalar_object),
 // vec.Grid()->lSites(), file);
 // ioWatch.Stop();
 // std::fclose(file);
@ -61,11 +57,11 @@ namespace Grid
 // p.size = size;
 // p.time = ioWatch.useconds();
 // p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
-// MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
+// GRID_MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
 // << ",
 // "
 // << p.mbytesPerSecond << " MB/s" << std::endl;
-// MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
+// GRID_MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
 // }
 
 // template <typename Field> void stdRead(Field &vec, const std::string filestem)
@ -94,16 +90,14 @@ namespace Grid
 // crcData = GridChecksum::crc32(vec_v.cpu_ptr, size);
 // crcWatch.Stop();
 // }
-// MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
-// assert(crcData == crcRead);
-// size *= vec.Grid()->ProcessorCount();
-// auto &p = BinaryIO::lastPerf;
-// p.size = size;
-// p.time = ioWatch.useconds();
+// GRID_MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec <<
+// std::endl; assert(crcData == crcRead); size *= vec.Grid()->ProcessorCount(); auto
+// &p = BinaryIO::lastPerf; p.size = size; p.time = ioWatch.useconds();
 // p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
-// MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() << ", "
+// GRID_MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() <<
+// ", "
 // << p.mbytesPerSecond << " MB/s" << std::endl;
-// MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
+// GRID_MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
 // }
 
 template <typename Field> void stdWrite(const std::string filestem, Field &vec)
@ -122,7 +116,7 @@ namespace Grid
 crc = GridChecksum::crc32(vec_v.cpu_ptr, size);
 file.write(reinterpret_cast<char *>(&crc), sizeof(uint32_t) / sizeof(char));
 crcWatch.Stop();
-MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
+GRID_MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
 ioWatch.Start();
 file.write(reinterpret_cast<char *>(vec_v.cpu_ptr), sizec);
 file.flush();
@ -132,9 +126,9 @@ namespace Grid
 p.size = size;
 p.time = ioWatch.useconds();
 p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
-MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() << ", "
-<< p.mbytesPerSecond << " MB/s" << std::endl;
-MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
+GRID_MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
+<< ", " << p.mbytesPerSecond << " MB/s" << std::endl;
+GRID_MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
 }
 
 template <typename Field> void stdRead(Field &vec, const std::string filestem)
@ -163,16 +157,17 @@ namespace Grid
 crcData = GridChecksum::crc32(vec_v.cpu_ptr, size);
 crcWatch.Stop();
 }
-MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
+GRID_MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec
+<< std::endl;
 assert(crcData == crcRead);
 size *= vec.Grid()->ProcessorCount();
 auto &p = BinaryIO::lastPerf;
 p.size = size;
 p.time = ioWatch.useconds();
 p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
-MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() << ", "
-<< p.mbytesPerSecond << " MB/s" << std::endl;
-MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
+GRID_MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed()
+<< ", " << p.mbytesPerSecond << " MB/s" << std::endl;
+GRID_MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
 }
 
 template <typename Field> void limeWrite(const std::string filestem, Field &vec)
@ -1,265 +0,0 @@
|
|||||||
/*
|
|
||||||
Copyright © 2015 Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
||||||
Copyright © 2022 Antonin Portelli <antonin.portelli@me.com>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or
|
|
||||||
modify it under the terms of the GNU General Public License
|
|
||||||
as published by the Free Software Foundation; either version 2
|
|
||||||
of the License, or (at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License
|
|
||||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <Grid/Grid.h>
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
using namespace Grid;
|
|
||||||
|
|
||||||
struct time_statistics
|
|
||||||
{
|
|
||||||
double mean;
|
|
||||||
double err;
|
|
||||||
double min;
|
|
||||||
double max;
|
|
||||||
|
|
||||||
void statistics(std::vector<double> v)
|
|
||||||
{
|
|
||||||
double sum = std::accumulate(v.begin(), v.end(), 0.0);
|
|
||||||
mean = sum / v.size();
|
|
||||||
|
|
||||||
std::vector<double> diff(v.size());
|
|
||||||
std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
|
|
||||||
double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
|
|
||||||
err = std::sqrt(sq_sum / (v.size() * (v.size() - 1)));
|
|
||||||
|
|
||||||
auto result = std::minmax_element(v.begin(), v.end());
|
|
||||||
min = *result.first;
|
|
||||||
max = *result.second;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
void header()
|
|
||||||
{
|
|
||||||
std::cout << GridLogMessage << " L "
|
|
||||||
<< "\t"
|
|
||||||
<< " Ls "
|
|
||||||
<< "\t" << std::setw(11) << "bytes\t\t"
|
|
||||||
<< "MB/s uni"
|
|
||||||
<< "\t"
|
|
||||||
<< "MB/s bidi" << std::endl;
|
|
||||||
};
|
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
|
||||||
{
|
|
||||||
Grid_init(&argc, &argv);
|
|
||||||
|
|
||||||
Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
|
|
||||||
Coordinate mpi_layout = GridDefaultMpi();
|
|
||||||
int threads = GridThread::GetThreads();
|
|
||||||
std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads"
|
|
||||||
<< std::endl;
|
|
||||||
|
|
||||||
int Nloop = 250;
|
|
||||||
int nmu = 0;
|
|
||||||
int maxlat = 32;
|
|
||||||
for (int mu = 0; mu < Nd; mu++)
|
|
||||||
if (mpi_layout[mu] > 1)
|
|
||||||
nmu++;
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Number of iterations to average: " << Nloop
|
|
||||||
<< std::endl;
|
|
||||||
std::vector<double> t_time(Nloop);
|
|
||||||
// time_statistics timestat;
|
|
||||||
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "========================================================================="
|
|
||||||
"==========================="
|
|
||||||
<< std::endl;
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "= Benchmarking sequential halo exchange from host memory " << std::endl;
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "========================================================================="
|
|
||||||
"==========================="
|
|
||||||
<< std::endl;
|
|
||||||
header();
|
|
||||||
|
|
||||||
for (int lat = 8; lat <= maxlat; lat += 4)
|
|
||||||
{
|
|
||||||
for (int Ls = 8; Ls <= 8; Ls *= 2)
|
|
||||||
{
|
|
||||||
|
|
||||||
Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
|
|
||||||
lat * mpi_layout[3]});
|
|
||||||
|
|
||||||
GridCartesian Grid(latt_size, simd_layout, mpi_layout);
|
|
||||||
RealD Nrank = Grid._Nprocessors;
|
|
||||||
RealD Nnode = Grid.NodeCount();
|
|
||||||
RealD ppn = Nrank / Nnode;
|
|
||||||
|
|
||||||
std::vector<std::vector<HalfSpinColourVectorD>> xbuf(8);
|
|
||||||
std::vector<std::vector<HalfSpinColourVectorD>> rbuf(8);
|
|
||||||
|
|
||||||
for (int mu = 0; mu < 8; mu++)
|
|
||||||
{
|
|
||||||
xbuf[mu].resize(lat * lat * lat * Ls);
|
|
||||||
rbuf[mu].resize(lat * lat * lat * Ls);
|
|
||||||
}
|
|
||||||
uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD);
|
|
||||||
|
|
||||||
int ncomm;
|
|
||||||
|
|
||||||
for (int mu = 0; mu < 4; mu++)
|
|
||||||
{
|
|
||||||
if (mpi_layout[mu] > 1)
|
|
||||||
{
|
|
||||||
double start = usecond();
|
|
||||||
for (int i = 0; i < Nloop; i++)
|
|
||||||
{
|
|
||||||
|
|
||||||
ncomm = 0;
|
|
||||||
|
|
||||||
ncomm++;
|
|
||||||
int comm_proc = 1;
|
|
||||||
int xmit_to_rank;
|
|
||||||
int recv_from_rank;
|
|
||||||
|
|
||||||
{
|
|
||||||
std::vector<CommsRequest_t> requests;
|
|
||||||
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
|
|
||||||
Grid.SendToRecvFrom((void *)&xbuf[mu][0], xmit_to_rank,
|
|
||||||
(void *)&rbuf[mu][0], recv_from_rank, bytes);
|
|
||||||
}
|
|
||||||
|
|
||||||
comm_proc = mpi_layout[mu] - 1;
|
|
||||||
{
|
|
||||||
std::vector<CommsRequest_t> requests;
|
|
||||||
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
|
|
||||||
Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0], xmit_to_rank,
|
|
||||||
(void *)&rbuf[mu + 4][0], recv_from_rank, bytes);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Grid.Barrier();
|
|
||||||
double stop = usecond();
|
|
||||||
double mean = (stop - start) / Nloop;
|
|
||||||
double dbytes = bytes * ppn;
|
|
||||||
double xbytes = dbytes * 2.0 * ncomm;
|
|
||||||
double rbytes = xbytes;
|
|
||||||
double bidibytes = xbytes + rbytes;
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << std::setw(4) << lat << "\t" << Ls << "\t"
|
|
||||||
<< std::setw(11) << bytes << std::fixed << std::setprecision(1)
|
|
||||||
<< std::setw(7) << " " << std::right << xbytes / mean << " "
|
|
||||||
<< "\t\t" << std::setw(7) << bidibytes / mean << std::endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "========================================================================="
|
|
||||||
"==========================="
|
|
||||||
<< std::endl;
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "= Benchmarking sequential halo exchange from GPU memory " << std::endl;
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "========================================================================="
|
|
||||||
"==========================="
|
|
||||||
<< std::endl;
|
|
||||||
header();
|
|
||||||
|
|
||||||
for (int lat = 8; lat <= maxlat; lat += 4)
|
|
||||||
{
|
|
||||||
for (int Ls = 8; Ls <= 8; Ls *= 2)
|
|
||||||
{
|
|
||||||
|
|
||||||
Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
|
|
||||||
lat * mpi_layout[3]});
|
|
||||||
|
|
||||||
GridCartesian Grid(latt_size, simd_layout, mpi_layout);
|
|
||||||
RealD Nrank = Grid._Nprocessors;
|
|
||||||
RealD Nnode = Grid.NodeCount();
|
|
||||||
RealD ppn = Nrank / Nnode;
|
|
||||||
|
|
||||||
std::vector<HalfSpinColourVectorD *> xbuf(8);
|
|
||||||
std::vector<HalfSpinColourVectorD *> rbuf(8);
|
|
||||||
|
|
||||||
uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD);
|
|
||||||
for (int d = 0; d < 8; d++)
|
|
||||||
{
|
|
||||||
xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
|
|
||||||
rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
|
|
||||||
}
|
|
||||||
|
|
||||||
int ncomm;
|
|
||||||
|
|
||||||
for (int mu = 0; mu < 4; mu++)
|
|
||||||
{
|
|
||||||
if (mpi_layout[mu] > 1)
|
|
||||||
{
|
|
||||||
double start = usecond();
|
|
||||||
for (int i = 0; i < Nloop; i++)
|
|
||||||
{
|
|
||||||
|
|
||||||
ncomm = 0;
|
|
||||||
|
|
||||||
ncomm++;
|
|
||||||
int comm_proc = 1;
|
|
||||||
int xmit_to_rank;
|
|
||||||
int recv_from_rank;
|
|
||||||
|
|
||||||
{
|
|
||||||
std::vector<CommsRequest_t> requests;
|
|
||||||
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
|
|
||||||
Grid.SendToRecvFrom((void *)&xbuf[mu][0], xmit_to_rank,
|
|
||||||
(void *)&rbuf[mu][0], recv_from_rank, bytes);
|
|
||||||
}
|
|
||||||
|
|
||||||
comm_proc = mpi_layout[mu] - 1;
|
|
||||||
{
|
|
||||||
std::vector<CommsRequest_t> requests;
|
|
||||||
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
|
|
||||||
Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0], xmit_to_rank,
|
|
||||||
(void *)&rbuf[mu + 4][0], recv_from_rank, bytes);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Grid.Barrier();
|
|
||||||
double stop = usecond();
|
|
||||||
double mean = (stop - start) / Nloop;
|
|
||||||
double dbytes = bytes * ppn;
|
|
||||||
double xbytes = dbytes * 2.0 * ncomm;
|
|
||||||
double rbytes = xbytes;
|
|
||||||
double bidibytes = xbytes + rbytes;
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << std::setw(4) << lat << "\t" << Ls << "\t"
|
|
||||||
<< std::setw(11) << bytes << std::fixed << std::setprecision(1)
|
|
||||||
<< std::setw(7) << " " << std::right << xbytes / mean << " "
|
|
||||||
<< "\t\t" << std::setw(7) << bidibytes / mean << std::endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int d = 0; d < 8; d++)
|
|
||||||
{
|
|
||||||
acceleratorFreeDevice(xbuf[d]);
|
|
||||||
acceleratorFreeDevice(rbuf[d]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "========================================================================="
|
|
||||||
"==========================="
|
|
||||||
<< std::endl;
|
|
||||||
std::cout << GridLogMessage << "= All done; Bye Bye" << std::endl;
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "========================================================================="
|
|
||||||
"==========================="
|
|
||||||
<< std::endl;
|
|
||||||
|
|
||||||
Grid_finalize();
|
|
||||||
}
|
|
@ -1,512 +0,0 @@
|
|||||||
/*
|
|
||||||
Copyright © 2015 Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
||||||
Copyright © 2022 Antonin Portelli <antonin.portelli@me.com>
|
|
||||||
Copyright © 2023 Simon Bürger <simon.buerger@rwth-aachen.de>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or
|
|
||||||
modify it under the terms of the GNU General Public License
|
|
||||||
as published by the Free Software Foundation; either version 2
|
|
||||||
of the License, or (at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License
|
|
||||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "json.hpp"
|
|
||||||
#include <Grid/Grid.h>
|
|
||||||
#ifdef GRID_CUDA
|
|
||||||
#define CUDA_PROFILE
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef CUDA_PROFILE
|
|
||||||
#include <cuda_profiler_api.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
using namespace Grid;
|
|
||||||
|
|
||||||
template <class d> struct scal
|
|
||||||
{
|
|
||||||
d internal;
|
|
||||||
};
|
|
||||||
|
|
||||||
Gamma::Algebra Gmu[] = {Gamma::Algebra::GammaX, Gamma::Algebra::GammaY,
|
|
||||||
Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT};
|
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
|
||||||
{
|
|
||||||
Grid_init(&argc, &argv);
|
|
||||||
|
|
||||||
int threads = GridThread::GetThreads();
|
|
||||||
|
|
||||||
Coordinate latt4 = GridDefaultLatt();
|
|
||||||
int Ls = 16;
|
|
||||||
std::string json_filename = ""; // empty indicates no json output
|
|
||||||
nlohmann::json json;
|
|
||||||
|
|
||||||
// benchmark specific command line arguments
|
|
||||||
for (int i = 0; i < argc; i++)
|
|
||||||
{
|
|
||||||
if (std::string(argv[i]) == "-Ls")
|
|
||||||
{
|
|
||||||
std::stringstream ss(argv[i + 1]);
|
|
||||||
ss >> Ls;
|
|
||||||
}
|
|
||||||
if (std::string(argv[i]) == "--json-out")
|
|
||||||
json_filename = argv[i + 1];
|
|
||||||
}
|
|
||||||
|
|
||||||
GridLogLayout();
|
|
||||||
|
|
||||||
long unsigned int single_site_flops = 8 * Nc * (7 + 16 * Nc);
|
|
||||||
|
|
||||||
json["single_site_flops"] = single_site_flops;
|
|
||||||
|
|
||||||
GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
|
|
||||||
GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
|
|
||||||
GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
|
||||||
|
|
||||||
GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
|
|
||||||
GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
|
|
||||||
|
|
||||||
json["grid"] = FGrid->FullDimensions().toVector();
|
|
||||||
json["local_grid"] = FGrid->LocalDimensions().toVector();
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Making s innermost grids" << std::endl;
|
|
||||||
GridCartesian *sUGrid =
|
|
||||||
SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(), GridDefaultMpi());
|
|
||||||
|
|
||||||
GridRedBlackCartesian *sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
|
|
||||||
GridCartesian *sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls, UGrid);
|
|
||||||
GridRedBlackCartesian *sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls, UGrid);
|
|
||||||
|
|
||||||
std::vector<int> seeds4({1, 2, 3, 4});
|
|
||||||
std::vector<int> seeds5({5, 6, 7, 8});
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
|
|
||||||
GridParallelRNG RNG4(UGrid);
|
|
||||||
RNG4.SeedUniqueString(std::string("The 4D RNG"));
|
|
||||||
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
|
|
||||||
GridParallelRNG RNG5(FGrid);
|
|
||||||
RNG5.SeedUniqueString(std::string("The 5D RNG"));
|
|
||||||
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
|
|
||||||
|
|
||||||
LatticeFermionF src(FGrid);
|
|
||||||
random(RNG5, src);
|
|
||||||
#if 0
|
|
||||||
src = Zero();
|
|
||||||
{
|
|
||||||
Coordinate origin({0,0,0,latt4[2]-1,0});
|
|
||||||
SpinColourVectorF tmp;
|
|
||||||
tmp=Zero();
|
|
||||||
tmp()(0)(0)=Complex(-2.0,0.0);
|
|
||||||
std::cout << " source site 0 " << tmp<<std::endl;
|
|
||||||
pokeSite(tmp,src,origin);
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
RealD N2 = 1.0 / ::sqrt(norm2(src));
|
|
||||||
src = src * N2;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
LatticeFermionF result(FGrid);
|
|
||||||
result = Zero();
|
|
||||||
LatticeFermionF ref(FGrid);
|
|
||||||
ref = Zero();
|
|
||||||
LatticeFermionF tmp(FGrid);
|
|
||||||
LatticeFermionF err(FGrid);
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
|
|
||||||
LatticeGaugeFieldF Umu(UGrid);
|
|
||||||
SU<Nc>::HotConfiguration(RNG4, Umu);
|
|
||||||
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
|
|
||||||
#if 0
|
|
||||||
Umu=1.0;
|
|
||||||
for(int mu=0;mu<Nd;mu++){
|
|
||||||
LatticeColourMatrixF ttmp(UGrid);
|
|
||||||
ttmp = PeekIndex<LorentzIndex>(Umu,mu);
|
|
||||||
// if (mu !=2 ) ttmp = 0;
|
|
||||||
// ttmp = ttmp* pow(10.0,mu);
|
|
||||||
PokeIndex<LorentzIndex>(Umu,ttmp,mu);
|
|
||||||
}
|
|
||||||
std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
////////////////////////////////////
|
|
||||||
// Naive wilson implementation
|
|
||||||
////////////////////////////////////
|
|
||||||
// replicate across fifth dimension
|
|
||||||
// LatticeGaugeFieldF Umu5d(FGrid);
|
|
||||||
std::vector<LatticeColourMatrixF> U(4, UGrid);
|
|
||||||
for (int mu = 0; mu < Nd; mu++)
|
|
||||||
{
|
|
||||||
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
|
|
||||||
}
|
|
||||||
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
|
|
||||||
|
|
||||||
if (1)
|
|
||||||
{
|
|
||||||
ref = Zero();
|
|
||||||
for (int mu = 0; mu < Nd; mu++)
|
|
||||||
{
|
|
||||||
|
|
||||||
tmp = Cshift(src, mu + 1, 1);
|
|
||||||
{
|
|
||||||
autoView(tmp_v, tmp, CpuWrite);
|
|
||||||
autoView(U_v, U[mu], CpuRead);
|
|
||||||
for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
|
|
||||||
{
|
|
||||||
for (int s = 0; s < Ls; s++)
|
|
||||||
{
|
|
||||||
tmp_v[Ls * ss + s] = U_v[ss] * tmp_v[Ls * ss + s];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ref = ref + tmp - Gamma(Gmu[mu]) * tmp;
|
|
||||||
|
|
||||||
{
|
|
||||||
autoView(tmp_v, tmp, CpuWrite);
|
|
||||||
autoView(U_v, U[mu], CpuRead);
|
|
||||||
autoView(src_v, src, CpuRead);
|
|
||||||
for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
|
|
||||||
{
|
|
||||||
for (int s = 0; s < Ls; s++)
|
|
||||||
{
|
|
||||||
tmp_v[Ls * ss + s] = adj(U_v[ss]) * src_v[Ls * ss + s];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
tmp = Cshift(tmp, mu + 1, -1);
|
|
||||||
ref = ref + tmp + Gamma(Gmu[mu]) * tmp;
|
|
||||||
}
|
|
||||||
ref = -0.5 * ref;
|
|
||||||
}
|
|
||||||
|
|
||||||
RealD mass = 0.1;
|
|
||||||
RealD M5 = 1.8;
|
|
||||||
|
|
||||||
RealD NP = UGrid->_Nprocessors;
|
|
||||||
RealD NN = UGrid->NodeCount();
|
|
||||||
|
|
||||||
json["ranks"] = NP;
|
|
||||||
json["nodes"] = NN;
|
|
||||||
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "*****************************************************************"
|
|
||||||
<< std::endl;
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm"
|
|
||||||
<< std::endl;
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "*****************************************************************"
|
|
||||||
<< std::endl;
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "*****************************************************************"
|
|
||||||
<< std::endl;
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "* Benchmarking DomainWallFermionR::Dhop " << std::endl;
|
|
||||||
std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd()
|
|
||||||
<< std::endl;
|
|
||||||
std::cout << GridLogMessage << "* VComplexF size is " << sizeof(vComplexF) << " B"
|
|
||||||
<< std::endl;
|
|
||||||
|
|
||||||
if (sizeof(RealF) == 4)
|
|
||||||
std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
|
|
||||||
if (sizeof(RealF) == 8)
|
|
||||||
std::cout << GridLogMessage << "* DOUBLE precision " << std::endl;
|
|
||||||
#ifdef GRID_OMP
|
|
||||||
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
|
|
||||||
std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl;
|
|
||||||
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
|
|
||||||
std::cout << GridLogMessage << "* Using sequential comms compute" << std::endl;
|
|
||||||
#endif
|
|
||||||
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric)
|
|
||||||
std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl;
|
|
||||||
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll)
|
|
||||||
std::cout << GridLogMessage << "* Using Nc=3 WilsonKernels" << std::endl;
|
|
||||||
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm)
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "* Using Asm Nc=3 WilsonKernels" << std::endl;
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "*****************************************************************"
|
|
||||||
<< std::endl;
|
|
||||||
|
|
||||||
DomainWallFermionF Dw(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5);
|
|
||||||
int ncall = 300;
|
|
||||||
|
|
||||||
if (1)
|
|
||||||
{
|
|
||||||
FGrid->Barrier();
|
|
||||||
Dw.ZeroCounters();
|
|
||||||
Dw.Dhop(src, result, 0);
|
|
||||||
std::cout << GridLogMessage << "Called warmup" << std::endl;
|
|
||||||
double t0 = usecond();
|
|
||||||
for (int i = 0; i < ncall; i++)
|
|
||||||
{
|
|
||||||
__SSC_START;
|
|
||||||
Dw.Dhop(src, result, 0);
|
|
||||||
__SSC_STOP;
|
|
||||||
}
|
|
||||||
double t1 = usecond();
|
|
||||||
FGrid->Barrier();
|
|
||||||
|
|
||||||
double volume = Ls;
|
|
||||||
for (int mu = 0; mu < Nd; mu++)
|
|
||||||
volume = volume * latt4[mu];
|
|
||||||
double flops = single_site_flops * volume * ncall;
|
|
||||||
|
|
||||||
auto nsimd = vComplex::Nsimd();
|
|
||||||
auto simdwidth = sizeof(vComplex);
|
|
||||||
|
|
||||||
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
|
|
||||||
double data_rf = volume * ((2 * Nd + 1) * Nd * Nc + 2 * Nd * Nc * Nc) * simdwidth /
|
|
||||||
nsimd * ncall / (1024. * 1024. * 1024.);
|
|
||||||
|
|
||||||
// mem: Nd Wilson * Ls, Nd gauge, Nc colors
|
|
||||||
double data_mem =
|
|
||||||
(volume * (2 * Nd + 1) * Nd * Nc + (volume / Ls) * 2 * Nd * Nc * Nc) * simdwidth /
|
|
||||||
nsimd * ncall / (1024. * 1024. * 1024.);
|
|
||||||
|
|
||||||
json["Dw"]["calls"] = ncall;
|
|
||||||
json["Dw"]["time"] = t1 - t0;
|
|
||||||
json["Dw"]["mflops"] = flops / (t1 - t0);
|
|
||||||
json["Dw"]["mflops_per_rank"] = flops / (t1 - t0) / NP;
|
|
||||||
json["Dw"]["mflops_per_node"] = flops / (t1 - t0) / NN;
|
|
||||||
json["Dw"]["RF"] = 1000000. * data_rf / ((t1 - t0));
|
|
||||||
json["Dw"]["mem"] = 1000000. * data_mem / ((t1 - t0));
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Called Dw " << ncall << " times in " << t1 - t0
|
|
||||||
<< " us" << std::endl;
|
|
||||||
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
|
||||||
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
|
|
||||||
std::cout << GridLogMessage << "mflop/s = " << flops / (t1 - t0) << std::endl;
|
|
||||||
std::cout << GridLogMessage << "mflop/s per rank = " << flops / (t1 - t0) / NP
|
|
||||||
<< std::endl;
|
|
||||||
std::cout << GridLogMessage << "mflop/s per node = " << flops / (t1 - t0) / NN
|
|
||||||
<< std::endl;
|
|
||||||
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "RF GiB/s (base 2) = " << 1000000. * data_rf / ((t1 - t0))
|
|
||||||
<< std::endl;
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "mem GiB/s (base 2) = " << 1000000. * data_mem / ((t1 - t0))
|
|
||||||
<< std::endl;
|
|
||||||
err = ref - result;
|
|
||||||
std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
|
|
||||||
// exit(0);
|
|
||||||
|
|
||||||
if ((norm2(err) > 1.0e-4))
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
std::cout << "RESULT\n " << result<<std::endl;
|
|
||||||
std::cout << "REF \n " << ref <<std::endl;
|
|
||||||
std::cout << "ERR \n " << err <<std::endl;
|
|
||||||
*/
|
|
||||||
std::cout << GridLogMessage << "WRONG RESULT" << std::endl;
|
|
||||||
FGrid->Barrier();
|
|
||||||
exit(-1);
|
|
||||||
}
|
|
||||||
assert(norm2(err) < 1.0e-4);
|
|
||||||
Dw.Report();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (1)
|
|
||||||
{ // Naive wilson dag implementation
|
|
||||||
ref = Zero();
|
|
||||||
for (int mu = 0; mu < Nd; mu++)
|
|
||||||
{
|
|
||||||
|
|
||||||
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
|
|
||||||
tmp = Cshift(src, mu + 1, 1);
|
|
||||||
{
|
|
||||||
autoView(ref_v, ref, CpuWrite);
|
|
||||||
autoView(tmp_v, tmp, CpuRead);
|
|
||||||
autoView(U_v, U[mu], CpuRead);
|
|
||||||
for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
|
|
||||||
{
|
|
||||||
for (int s = 0; s < Ls; s++)
|
|
||||||
{
|
|
||||||
int i = s + Ls * ss;
|
|
||||||
ref_v[i] += U_v[ss] * (tmp_v[i] + Gamma(Gmu[mu]) * tmp_v[i]);
|
|
||||||
;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
autoView(tmp_v, tmp, CpuWrite);
|
|
||||||
autoView(U_v, U[mu], CpuRead);
|
|
||||||
autoView(src_v, src, CpuRead);
|
|
||||||
for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
|
|
||||||
{
|
|
||||||
for (int s = 0; s < Ls; s++)
|
|
||||||
{
|
|
||||||
tmp_v[Ls * ss + s] = adj(U_v[ss]) * src_v[Ls * ss + s];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// tmp =adj(U[mu])*src;
|
|
||||||
tmp = Cshift(tmp, mu + 1, -1);
|
|
||||||
{
|
|
||||||
autoView(ref_v, ref, CpuWrite);
|
|
||||||
autoView(tmp_v, tmp, CpuRead);
|
|
||||||
for (int i = 0; i < ref_v.size(); i++)
|
|
||||||
{
|
|
||||||
ref_v[i] += tmp_v[i] - Gamma(Gmu[mu]) * tmp_v[i];
|
|
||||||
;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ref = -0.5 * ref;
|
|
||||||
}
|
|
||||||
// dump=1;
|
|
||||||
Dw.Dhop(src, result, 1);
|
|
||||||
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "Compare to naive wilson implementation Dag to verify correctness"
|
|
||||||
<< std::endl;
|
|
||||||
std::cout << GridLogMessage << "Called DwDag" << std::endl;
|
|
||||||
std::cout << GridLogMessage << "norm dag result " << norm2(result) << std::endl;
|
|
||||||
std::cout << GridLogMessage << "norm dag ref " << norm2(ref) << std::endl;
|
|
||||||
err = ref - result;
|
|
||||||
std::cout << GridLogMessage << "norm dag diff " << norm2(err) << std::endl;
|
|
||||||
if ((norm2(err) > 1.0e-4))
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
std::cout<< "DAG RESULT\n " <<ref << std::endl;
|
|
||||||
std::cout<< "DAG sRESULT\n " <<result << std::endl;
|
|
||||||
std::cout<< "DAG ERR \n " << err <<std::endl;
|
|
||||||
*/
|
|
||||||
}
|
|
||||||
LatticeFermionF src_e(FrbGrid);
|
|
||||||
LatticeFermionF src_o(FrbGrid);
|
|
||||||
LatticeFermionF r_e(FrbGrid);
|
|
||||||
LatticeFermionF r_o(FrbGrid);
|
|
||||||
LatticeFermionF r_eo(FGrid);
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"
|
|
||||||
<< std::endl;
|
|
||||||
pickCheckerboard(Even, src_e, src);
|
|
||||||
pickCheckerboard(Odd, src_o, src);
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "src_e" << norm2(src_e) << std::endl;
|
|
||||||
std::cout << GridLogMessage << "src_o" << norm2(src_o) << std::endl;
|
|
||||||
|
|
||||||
// S-direction is INNERMOST and takes no part in the parity.
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
|
|
||||||
<< "*********************************************************" << std::endl;
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "* Benchmarking DomainWallFermionF::DhopEO " << std::endl;
|
|
||||||
std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd()
|
|
||||||
<< std::endl;
|
|
||||||
|
|
||||||
if (sizeof(RealF) == 4)
|
|
||||||
std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
|
|
||||||
if (sizeof(RealF) == 8)
|
|
||||||
std::cout << GridLogMessage << "* DOUBLE precision " << std::endl;
|
|
||||||
#ifdef GRID_OMP
|
|
||||||
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
|
|
||||||
std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl;
|
|
||||||
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
|
|
||||||
std::cout << GridLogMessage << "* Using sequential comms compute" << std::endl;
|
|
||||||
#endif
|
|
||||||
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric)
|
|
||||||
std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl;
|
|
||||||
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll)
|
|
||||||
std::cout << GridLogMessage << "* Using Nc=3 WilsonKernels" << std::endl;
|
|
||||||
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm)
|
|
||||||
std::cout << GridLogMessage << "* Using Asm Nc=3 WilsonKernels" << std::endl;
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "*********************************************************" << std::endl;
|
|
||||||
|
|
||||||
{
|
|
||||||
Dw.ZeroCounters();
|
|
||||||
FGrid->Barrier();
|
|
||||||
Dw.DhopEO(src_o, r_e, DaggerNo);
|
|
||||||
double t0 = usecond();
|
|
||||||
for (int i = 0; i < ncall; i++)
|
|
||||||
{
|
|
||||||
#ifdef CUDA_PROFILE
|
|
||||||
if (i == 10)
|
|
||||||
cudaProfilerStart();
|
|
||||||
#endif
|
|
||||||
Dw.DhopEO(src_o, r_e, DaggerNo);
|
|
||||||
#ifdef CUDA_PROFILE
|
|
||||||
if (i == 20)
|
|
||||||
cudaProfilerStop();
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
double t1 = usecond();
|
|
||||||
FGrid->Barrier();
|
|
||||||
|
|
||||||
double volume = Ls;
|
|
||||||
for (int mu = 0; mu < Nd; mu++)
|
|
||||||
volume = volume * latt4[mu];
|
|
||||||
double flops = (single_site_flops * volume * ncall) / 2.0;
|
|
||||||
|
|
||||||
json["Deo"]["calls"] = ncall;
|
|
||||||
json["Deo"]["time"] = t1 - t0;
|
|
||||||
json["Deo"]["mflops"] = flops / (t1 - t0);
|
|
||||||
json["Deo"]["mflops_per_rank"] = flops / (t1 - t0) / NP;
|
|
||||||
json["Deo"]["mflops_per_node"] = flops / (t1 - t0) / NN;
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Deo mflop/s = " << flops / (t1 - t0) << std::endl;
|
|
||||||
std::cout << GridLogMessage << "Deo mflop/s per rank " << flops / (t1 - t0) / NP
|
|
||||||
<< std::endl;
|
|
||||||
std::cout << GridLogMessage << "Deo mflop/s per node " << flops / (t1 - t0) / NN
|
|
||||||
<< std::endl;
|
|
||||||
|
|
||||||
Dw.Report();
|
|
||||||
}
|
|
||||||
Dw.DhopEO(src_o, r_e, DaggerNo);
|
|
||||||
Dw.DhopOE(src_e, r_o, DaggerNo);
|
|
||||||
Dw.Dhop(src, result, DaggerNo);
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "r_e" << norm2(r_e) << std::endl;
|
|
||||||
std::cout << GridLogMessage << "r_o" << norm2(r_o) << std::endl;
|
|
||||||
std::cout << GridLogMessage << "res" << norm2(result) << std::endl;
|
|
||||||
|
|
||||||
setCheckerboard(r_eo, r_o);
|
|
||||||
setCheckerboard(r_eo, r_e);
|
|
||||||
|
|
||||||
err = r_eo - result;
|
|
||||||
std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
|
|
||||||
if ((norm2(err) > 1.0e-4))
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
|
|
||||||
std::cout<< "Deo REF\n " <<result << std::endl;
|
|
||||||
std::cout<< "Deo ERR \n " << err <<std::endl;
|
|
||||||
*/
|
|
||||||
}
|
|
||||||
|
|
||||||
pickCheckerboard(Even, src_e, err);
|
|
||||||
pickCheckerboard(Odd, src_o, err);
|
|
||||||
std::cout << GridLogMessage << "norm diff even " << norm2(src_e) << std::endl;
|
|
||||||
std::cout << GridLogMessage << "norm diff odd " << norm2(src_o) << std::endl;
|
|
||||||
|
|
||||||
assert(norm2(src_e) < 1.0e-4);
|
|
||||||
assert(norm2(src_o) < 1.0e-4);
|
|
||||||
|
|
||||||
if (!json_filename.empty())
|
|
||||||
{
|
|
||||||
std::cout << GridLogMessage << "writing benchmark results to " << json_filename
|
|
||||||
<< std::endl;
|
|
||||||
|
|
||||||
int me = 0;
|
|
||||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
|
||||||
if (me == 0)
|
|
||||||
{
|
|
||||||
std::ofstream json_file(json_filename);
|
|
||||||
json_file << std::setw(4) << json;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Grid_finalize();
|
|
||||||
exit(0);
|
|
||||||
}
|
@ -26,6 +26,20 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 #define GRID_MSG_MAXSIZE 1024
 #endif
 
+#define GRID_BIG_SEP \
+"==============================================================================="
+#define GRID_SMALL_SEP "------------------------------------------"
+
+#define grid_big_sep() \
+{ \
+GRID_MSG << GRID_BIG_SEP << std::endl; \
+}
+
+#define grid_small_sep() \
+{ \
+GRID_MSG << GRID_SMALL_SEP << std::endl; \
+}
+
 #define grid_printf(...) \
 { \
 char _buf[GRID_MSG_MAXSIZE]; \
@ -1,12 +1,8 @@
 ACLOCAL_AMFLAGS = -I .buildutils/m4
 
 bin_PROGRAMS = \
-Benchmark_comms_host_device \
-Benchmark_dwf_fp32 \
 Benchmark_Grid \
 Benchmark_IO
 
-Benchmark_comms_host_device_SOURCES = Benchmark_comms_host_device.cpp
-Benchmark_dwf_fp32_SOURCES = Benchmark_dwf_fp32.cpp
 Benchmark_Grid_SOURCES = Benchmark_Grid.cpp
 Benchmark_IO_SOURCES = Benchmark_IO.cpp
@ -6,6 +6,7 @@ The benchmarks can be summarised as follows
 - `Benchmark_Grid`: This benchmark measures floating point performances for various fermion
 matrices, as well as bandwidth measurements for different operations. Measurements are
 performed for a fixed range of problem sizes.
+- `Benchmark_IO`: Parallel I/O benchmark.
 
 ## TL;DR
 Build and install Grid, all dependencies, and the benchmark with
@ -28,7 +29,7 @@ You should first deploy the environment for the specific system you are using, f
 systems/tursa/bootstrap-env.sh ./env
 ```
 will deploy the relevant environment for the [Tursa](https://www.epcc.ed.ac.uk/hpc-services/dirac-tursa-gpu) supercomputer in `./env`. This step might compile from source a large set
-of packages, and might take some time to complete.
+of packages, and take some time to complete.
 
 After that, the environment directory (`./env` in the example above) will contain a `env.sh` file that needs to be sourced to activate the environment
 ```bash
@ -66,4 +67,84 @@ where `<env_dir>` is the environment directory and `<config>` is the build confi
|
|||||||
|
|
||||||
## Running the benchmarks
|
## Running the benchmarks
|
||||||
After building the benchmarks as above you can find the binaries in
|
After building the benchmarks as above you can find the binaries in
|
||||||
`<env_dir>/prefix/gridbench_<config>`.
|
`<env_dir>/prefix/gridbench_<config>`. Depending on the system selected, the environment
|
||||||
|
directory might also contain batch script examples. More information about the benchmarks
|
||||||
|
is provided below.
|
||||||
|
|
||||||
|
### `Benchmark_Grid`
|
||||||
|
This benchmark performs flop/s measurement for typical lattice QCD sparse matrices, as
|
||||||
|
well as memory and inter-process bandwidth measurement using Grid routines. The benchmark
|
||||||
|
command accept any Grid flag (see complete list with `--help`), as well as a
|
||||||
|
`--json-out <file>` flag to save the measurement results in JSON to `<file>`. The
|
||||||
|
benchmarks are performed on a fix set of problem sizes, and the Grid flag `--grid` will
|
||||||
|
be ignored.
|
||||||
|
|
||||||
|
The resulting metrics are as follows, all data size units are in base 2
|
||||||
|
(i.e. 1 kB = 1024 B).
|
||||||
|
|
||||||
|
*Memory bandwidth*
|
||||||
|
|
||||||
|
One sub-benchmark measure the memory bandwidth using a lattice version of the `axpy` BLAS
|
||||||
|
routine, in a similar fashion to the STREAM benchmark. The JSON entries under `"axpy"`
|
||||||
|
have the form
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"GBps": 215.80653375861607, // bandwidth in GB/s/node
|
||||||
|
"GFlops": 19.310041765757834, // FP performance (double precision)
|
||||||
|
"L": 8, // local lattice volume
|
||||||
|
"size_MB": 3.0 // memory size in MB/node
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
A second benchmark performs site-wise SU(4) matrix multiplication, and has a higher
|
||||||
|
arithmetic intensity than the `axpy` one (although it is still memory-bound).
|
||||||
|
The JSON entries under `"SU4"` have the form
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"GBps": 394.76639187026865, // bandwidth in GB/s/node
|
||||||
|
"GFlops": 529.8464820758512, // FP performance (single precision)
|
||||||
|
"L": 8, // local lattice size
|
||||||
|
"size_MB": 6.0 // memory size in MB/node
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
*Inter-process bandwidth*
|
||||||
|
|
||||||
|
This sub-benchmark measures the achieved bidirectional bandwidth in threaded halo exchange
|
||||||
|
using routines in Grid. The exchange is performed in each direction on the MPI Cartesian
|
||||||
|
grid which is parallelised across at least 2 processes. The resulting bandwidth is related
|
||||||
|
to node-local transfers (inter-CPU, NVLink, ...) or network transfers depending on the MPI
|
||||||
|
decomposition. he JSON entries under `"comms"` have the form
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"L": 40, // local lattice size
|
||||||
|
"bytes": 73728000, // payload size in B/rank
|
||||||
|
"dir": 2, // direction of the exchange, 8 possible directions
|
||||||
|
// (0: +x, 1: +y, ..., 5: -x, 6: -y, ...)
|
||||||
|
"rate_GBps": {
|
||||||
|
"error": 6.474271894240327, // standard deviation across measurements (GB/s/node)
|
||||||
|
"max": 183.10546875, // maximum measured bandwidth (GB/s/node)
|
||||||
|
"mean": 175.21747026766676 // average measured bandwidth (GB/s/node)
|
||||||
|
},
|
||||||
|
"time_usec": 3135.055 // average transfer time (microseconds)
|
||||||
|
}
|
||||||
|
```

*Floating-point performance*

This sub-benchmark measures the achieved floating-point performance using the
Wilson fermion, domain-wall fermion, and staggered fermion sparse matrices from Grid.
The best performances are recorded under `"results"` in the `"flops"` section of the
JSON output, e.g.
```json
{
  "Gflops_dwf4": 366.5251173474483,       // domain-wall in Gflop/s/node (single precision)
  "Gflops_staggered": 7.5982861018529455, // staggered in Gflop/s/node (single precision)
  "Gflops_wilson": 15.221839719288932,    // Wilson in Gflop/s/node (single precision)
  "L": 8                                  // local lattice size
}
```
Here "best" means the best across a number of different implementations of the routines.
Please see the log of the benchmark for an additional breakdown. Finally, the JSON output
contains a "comparison point", which is the average of the best L=24 and L=32 domain-wall
performances.
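
As an illustration, the comparison point can be recomputed from the raw entries with `jq`,
assuming the `"flops"` → `"results"` nesting described above and a `result.json` output
file:
```bash
# average of the best L=24 and L=32 domain-wall figures (Gflop/s/node)
jq '[.flops.results[] | select(.L == 24 or .L == 32) | .Gflops_dwf4] | add / length' result.json
```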
@@ -20,8 +20,12 @@ mkdir -p "${build_dir}"
 source "${env_dir}/env.sh"
 entry=$(jq ".configs[]|select(.name==\"${cfg}\")" "${env_dir}"/grid-config.json)
 env_script=$(echo "${entry}" | jq -r ".\"env-script\"")
-cd "${build_dir}" || return
 source "${env_dir}/${env_script}"
+cd "${script_dir}"
+if [ ! -f configure ]; then
+  ./bootstrap.sh
+fi
+cd "${build_dir}"
 if [ ! -f Makefile ]; then
   "${script_dir}/configure" --with-grid="${env_dir}/prefix/grid_${cfg}" \
     --prefix="${env_dir}/prefix/gridbench_${cfg}"

@@ -1,8 +1,8 @@
 #!/usr/bin/env bash
 # shellcheck disable=SC1091

-GRIDENVDIR="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")"
-export GRIDENVDIR
-export PATH="${GRIDENVDIR}/prefix/base/bin:${PATH}"
-export ACLOCAL_PATH="${GRIDENVDIR}/prefix/base/share/aclocal:${ACLOCAL_PATH}"
-source "${GRIDENVDIR}"/spack/share/spack/setup-env.sh
+env_dir="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")"
+mkdir -p ~/.config/lattice-benchmarks
+echo "${env_dir}" > ~/.config/lattice-benchmarks/grid-env
+source "${env_dir}/spack/share/spack/setup-env.sh"
+spack load jq git

0 Grid/systems/tursa/files/cpu-mpi-wrapper.sh (Normal file → Executable file)
5 Grid/systems/tursa/files/gpu-mpi-wrapper.sh (Normal file → Executable file)
@@ -1,13 +1,12 @@
 #!/usr/bin/env bash

 lrank=$OMPI_COMM_WORLD_LOCAL_RANK
-numa1=$(( 2 * lrank))
-numa2=$(( 2 * lrank + 1 ))
+numa1=$((lrank))
 netdev=mlx5_${lrank}:1

 export CUDA_VISIBLE_DEVICES=$OMPI_COMM_WORLD_LOCAL_RANK
 export UCX_NET_DEVICES=${netdev}
-BINDING="--interleave=$numa1,$numa2"
+BINDING="--interleave=$numa1"

 echo "$(hostname) - $lrank device=$CUDA_VISIBLE_DEVICES binding=$BINDING"

17 Grid/systems/tursa/files/ompi-gpu.sh (Normal file)
@@ -0,0 +1,17 @@
#!/usr/bin/env bash

# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=8
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n

# IO environment ###############################################################
export OMPI_MCA_io=romio321
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
60 Grid/systems/tursa/files/run.gpu.16nodes.sh (Normal file)
@@ -0,0 +1,60 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170

#SBATCH -J benchmark-grid-16
#SBATCH -t 1:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --qos=standard
#SBATCH --no-requeue
#SBATCH --gpu-freq=1410

set -euo pipefail

# load environment #############################################################
env_cfg="${HOME}/.config/lattice-benchmarks/grid-env"
if [ ! -f "${env_cfg}" ]; then
  echo "error: ${env_cfg} does not exist, did you execute 'source env.sh' with your user account?"
  exit 1
fi
env_dir="$(readlink -f "$(cat "${env_cfg}")")"
source "${env_dir}/env.sh"      # load base Spack environment
source "${env_dir}/env-gpu.sh"  # load GPU-specific packages
source "${env_dir}/ompi-gpu.sh" # set GPU-specific OpenMPI variables

# application and parameters ###################################################
app="${env_dir}/prefix/gridbench_gpu/bin/Benchmark_Grid"

# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"

date > "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd "${app}" > "${job_info_dir}/ldd"
md5sum "${app}" > "${job_info_dir}/app-hash"
readelf -a "${app}" > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"

# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
  "${env_dir}/gpu-mpi-wrapper.sh" \
  "${app}" \
  --json-out "${job_info_dir}/result.json" \
  --mpi 1.4.4.4 \
  --accelerator-threads 8 \
  --threads 8 \
  --shm 2048 &> "${job_info_dir}/log"

# if we reach this point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"

################################################################################
60 Grid/systems/tursa/files/run.gpu.1nodes.sh (Normal file)
@@ -0,0 +1,60 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170

#SBATCH -J benchmark-grid-1
#SBATCH -t 1:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=4
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --qos=standard
#SBATCH --no-requeue
#SBATCH --gpu-freq=1410

set -euo pipefail

# load environment #############################################################
env_cfg="${HOME}/.config/lattice-benchmarks/grid-env"
if [ ! -f "${env_cfg}" ]; then
  echo "error: ${env_cfg} does not exist, did you execute 'source env.sh' with your user account?"
  exit 1
fi
env_dir="$(readlink -f "$(cat "${env_cfg}")")"
source "${env_dir}/env.sh"      # load base Spack environment
source "${env_dir}/env-gpu.sh"  # load GPU-specific packages
source "${env_dir}/ompi-gpu.sh" # set GPU-specific OpenMPI variables

# application and parameters ###################################################
app="${env_dir}/prefix/gridbench_gpu/bin/Benchmark_Grid"

# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"

date > "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd "${app}" > "${job_info_dir}/ldd"
md5sum "${app}" > "${job_info_dir}/app-hash"
readelf -a "${app}" > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"

# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
  "${env_dir}/gpu-mpi-wrapper.sh" \
  "${app}" \
  --json-out "${job_info_dir}/result.json" \
  --mpi 1.1.1.4 \
  --accelerator-threads 8 \
  --threads 8 \
  --shm 2048 &> "${job_info_dir}/log"

# if we reach this point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"

################################################################################
60 Grid/systems/tursa/files/run.gpu.32nodes.sh (Normal file)
@@ -0,0 +1,60 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170

#SBATCH -J benchmark-grid-32
#SBATCH -t 1:00:00
#SBATCH --nodes=32
#SBATCH --ntasks=128
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --qos=standard
#SBATCH --no-requeue
#SBATCH --gpu-freq=1410

set -euo pipefail

# load environment #############################################################
env_cfg="${HOME}/.config/lattice-benchmarks/grid-env"
if [ ! -f "${env_cfg}" ]; then
  echo "error: ${env_cfg} does not exist, did you execute 'source env.sh' with your user account?"
  exit 1
fi
env_dir="$(readlink -f "$(cat "${env_cfg}")")"
source "${env_dir}/env.sh"      # load base Spack environment
source "${env_dir}/env-gpu.sh"  # load GPU-specific packages
source "${env_dir}/ompi-gpu.sh" # set GPU-specific OpenMPI variables

# application and parameters ###################################################
app="${env_dir}/prefix/gridbench_gpu/bin/Benchmark_Grid"

# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"

date > "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd "${app}" > "${job_info_dir}/ldd"
md5sum "${app}" > "${job_info_dir}/app-hash"
readelf -a "${app}" > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"

# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
  "${env_dir}/gpu-mpi-wrapper.sh" \
  "${app}" \
  --json-out "${job_info_dir}/result.json" \
  --mpi 1.4.4.8 \
  --accelerator-threads 8 \
  --threads 8 \
  --shm 2048 &> "${job_info_dir}/log"

# if we reach this point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"

################################################################################
@@ -4,7 +4,13 @@ set -euo pipefail

 gcc_spec='gcc@9.4.0'
 cuda_spec='cuda@11.4.0'
-hdf5_spec='hdf5@1.10.7'
+# hdf5 and fftw depend on OpenMPI, which we install manually. To make sure this
+# dependency is picked up by spack, we specify the compiler here explicitly. For
+# most other packages we don't really care about the compiler (i.e. system
+# compiler versus ${gcc_spec})
+hdf5_spec="hdf5@1.10.7+cxx+threadsafe%${gcc_spec}"
+fftw_spec="fftw%${gcc_spec}"

 if (( $# != 1 )); then
   echo "usage: $(basename "$0") <env dir>" 1>&2

@@ -18,7 +24,7 @@ cd "${cwd}"

 # General configuration ########################################################
 # build with 128 tasks
 echo 'config:
   build_jobs: 128
   build_stage:
     - $spack/var/spack/stage

@@ -38,26 +44,23 @@ rm external.yaml

 # Base compilers ###############################################################
 # configure system base

+spack env create base
+spack env activate base
 spack compiler find --scope site

-# install GCC, CUDA & LLVM
-spack install ${gcc_spec} ${cuda_spec} llvm
-spack load llvm
+# install GCC, CUDA
+spack add ${gcc_spec} ${cuda_spec}
+spack concretize
+spack env depfile -o Makefile.tmp
+make -j128 -f Makefile.tmp
 spack compiler find --scope site
-spack unload llvm

-spack load ${gcc_spec}
-spack compiler find --scope site
-spack unload ${gcc_spec}

 # Manual compilation of OpenMPI & UCX ##########################################
 # set build directories
 mkdir -p "${dir}"/build
 cd "${dir}"/build

-spack load ${gcc_spec} ${cuda_spec}

 cuda_path=$(spack find --format "{prefix}" cuda)
 gdrcopy_path=/mnt/lustre/tursafs1/apps/gdrcopy/2.3.1

@@ -77,7 +80,7 @@ mkdir -p build_gpu; cd build_gpu
   --enable-devel-headers --enable-examples --enable-optimizations \
   --with-gdrcopy=${gdrcopy_path} --with-verbs --disable-logging \
   --disable-debug --disable-assertions --enable-cma \
-  --with-knem=/opt/knem-1.1.4.90mlnx1/ --with-rdmacm \
+  --with-knem=/opt/knem-1.1.4.90mlnx2/ --with-rdmacm \
   --without-rocm --without-ugni --without-java \
   --enable-compiler-opt=3 --with-cuda="${cuda_path}" --without-cm \
   --with-rc --with-ud --with-dc --with-mlx5-dv --with-dm \

@@ -93,7 +96,7 @@ mkdir -p build_cpu; cd build_cpu
   --enable-devel-headers --enable-examples --enable-optimizations \
   --with-verbs --disable-logging --disable-debug \
   --disable-assertions --enable-mt --enable-cma \
-  --with-knem=/opt/knem-1.1.4.90mlnx1/ --with-rdmacm \
+  --with-knem=/opt/knem-1.1.4.90mlnx2/ --with-rdmacm \
   --without-rocm --without-ugni --without-java \
   --enable-compiler-opt=3 --without-cm --without-ugni --with-rc \
   --with-ud --with-dc --with-mlx5-dv --with-dm --enable-mt --without-go

@@ -119,13 +122,13 @@ mkdir build_gpu; cd build_gpu
 ../configure --prefix="${dir}"/prefix/ompi_gpu --without-xpmem \
   --with-ucx="${dir}"/prefix/ucx_gpu \
   --with-ucx-libdir="${dir}"/prefix/ucx_gpu/lib \
-  --with-knem=/opt/knem-1.1.4.90mlnx1/ \
+  --with-knem=/opt/knem-1.1.4.90mlnx2/ \
   --enable-mca-no-build=btl-uct \
   --with-cuda="${cuda_path}" --disable-getpwuid \
   --with-verbs --with-slurm --enable-mpi-fortran=all \
   --with-pmix=internal --with-libevent=internal
 make -j 128
 make install
 cd ..

 # openmpi cpu build

@@ -133,7 +136,7 @@ mkdir build_cpu; cd build_cpu
 ../configure --prefix="${dir}"/prefix/ompi_cpu --without-xpmem \
   --with-ucx="${dir}"/prefix/ucx_cpu \
   --with-ucx-libdir="${dir}"/prefix/ucx_cpu/lib \
-  --with-knem=/opt/knem-1.1.4.90mlnx1/ \
+  --with-knem=/opt/knem-1.1.4.90mlnx2/ \
   --enable-mca-no-build=btl-uct --disable-getpwuid \
   --with-verbs --with-slurm --enable-mpi-fortran=all \
   --with-pmix=internal --with-libevent=internal

@@ -141,65 +144,65 @@ make -j 128
 make install
 cd "${dir}"

+ucx_spec_gpu="ucx@1.12.0.GPU%${gcc_spec}"
+ucx_spec_cpu="ucx@1.12.0.CPU%${gcc_spec}"
+openmpi_spec_gpu="openmpi@4.1.1.GPU%${gcc_spec}"
+openmpi_spec_cpu="openmpi@4.1.1.CPU%${gcc_spec}"

 # Add externals to spack
 echo "packages:
   ucx:
     externals:
-    - spec: \"ucx@1.12.0.GPU%gcc@9.4.0\"
+    - spec: \"${ucx_spec_gpu}\"
       prefix: ${dir}/prefix/ucx_gpu
-    - spec: \"ucx@1.12.0.CPU%gcc@9.4.0\"
+    - spec: \"${ucx_spec_cpu}\"
       prefix: ${dir}/prefix/ucx_cpu
     buildable: False
   openmpi:
     externals:
-    - spec: \"openmpi@4.1.1.GPU%gcc@9.4.0\"
+    - spec: \"${openmpi_spec_gpu}\"
      prefix: ${dir}/prefix/ompi_gpu
-    - spec: \"openmpi@4.1.1.CPU%gcc@9.4.0\"
+    - spec: \"${openmpi_spec_cpu}\"
      prefix: ${dir}/prefix/ompi_cpu
     buildable: False" > spack.yaml

 spack config --scope site add -f spack.yaml
 rm spack.yaml
-spack install ucx@1.12.0.GPU%gcc@9.4.0 openmpi@4.1.1.GPU%gcc@9.4.0
-spack install ucx@1.12.0.CPU%gcc@9.4.0 openmpi@4.1.1.CPU%gcc@9.4.0
+spack env deactivate

 cd "${cwd}"

 # environments #################################################################
-dev_tools=("autoconf" "automake" "libtool" "jq")
-ompi_gpu_hash=$(spack find --format "{hash}" openmpi@4.1.1.GPU)
-ompi_cpu_hash=$(spack find --format "{hash}" openmpi@4.1.1.CPU)
+dev_tools=("autoconf" "automake" "libtool" "jq" "git")

 spack env create grid-gpu
 spack env activate grid-gpu
-spack add ${gcc_spec} ${cuda_spec} "${dev_tools[@]}"
-spack add ucx@1.12.0.GPU%gcc@9.4.0 openmpi@4.1.1.GPU%gcc@9.4.0
-spack add ${hdf5_spec}+cxx+threadsafe ^/"${ompi_gpu_hash}"
-spack add fftw ^/"${ompi_gpu_hash}"
-spack add openssl gmp mpfr c-lime
-spack install
+spack compiler find --scope site
+spack add ${gcc_spec} ${cuda_spec} ${ucx_spec_gpu} ${openmpi_spec_gpu}
+spack add ${hdf5_spec} ${fftw_spec}
+spack add openssl gmp mpfr c-lime "${dev_tools[@]}"
+spack concretize
+spack env depfile -o Makefile.tmp
+make -j128 -f Makefile.tmp
 spack env deactivate

 spack env create grid-cpu
 spack env activate grid-cpu
-spack add llvm "${dev_tools[@]}"
-spack add ucx@1.12.0.CPU%gcc@9.4.0 openmpi@4.1.1.CPU%gcc@9.4.0
-spack add ${hdf5_spec}+cxx+threadsafe ^/"${ompi_cpu_hash}"
-spack add fftw ^/"${ompi_cpu_hash}"
-spack add openssl gmp mpfr c-lime
-spack install
+spack compiler find --scope site
+spack add ${gcc_spec} ${ucx_spec_cpu} ${openmpi_spec_cpu}
+spack add ${hdf5_spec} ${fftw_spec}
+spack add openssl gmp mpfr c-lime "${dev_tools[@]}"
+spack concretize
+spack env depfile -o Makefile.tmp
+make -j128 -f Makefile.tmp
 spack env deactivate

 # Final setup ##################################################################
 spack clean
-spack gc -y
+#spack gc -y # "spack gc" tends to get hung up for unknown reasons

 # add more environment variables in module loading
 spack config --scope site add 'modules:prefix_inspections:lib:[LD_LIBRARY_PATH,LIBRARY_PATH]'
 spack config --scope site add 'modules:prefix_inspections:lib64:[LD_LIBRARY_PATH,LIBRARY_PATH]'
 spack config --scope site add 'modules:prefix_inspections:include:[C_INCLUDE_PATH,CPLUS_INCLUDE_PATH,INCLUDE]'
 spack module tcl refresh -y

-# permission change for group access
-chmod -R g+rw "${dir}/spack/var/spack/cache"
-setfacl -d -R -m g::rwX "${dir}/spack/var/spack/cache"
14 Quda/.clang-format (Normal file)
@@ -0,0 +1,14 @@
{
  BasedOnStyle: LLVM,
  UseTab: Never,
  IndentWidth: 2,
  TabWidth: 2,
  BreakBeforeBraces: Allman,
  AllowShortIfStatementsOnASingleLine: false,
  IndentCaseLabels: false,
  ColumnLimit: 90,
  AccessModifierOffset: -4,
  NamespaceIndentation: All,
  FixNamespaceComments: false,
  SortIncludes: true,
}
458 Quda/Benchmark_Quda.cpp (Normal file)
@@ -0,0 +1,458 @@
#include <algorithm>
#include <array>
#include <blas_quda.h>
#include <cassert>
#include <chrono>
#include <color_spinor_field.h>
#include <communicator_quda.h>
#include <dirac_quda.h>
#include <fstream>
#include <gauge_tools.h>
#include <iomanip>
#include <memory>
#include <mpi.h>
#include <sstream>
#include <stdio.h>
#include <stdlib.h>

// remove to use QUDA's own flop counting instead of Grid's convention
#define FLOP_COUNTING_GRID

#include "json.hpp"
using nlohmann::json;
json json_results;

using namespace quda;

// thanks chatGPT :)
std::string get_timestamp()
{
  // Get the current time
  auto now = std::chrono::system_clock::now();

  // Convert the current time to a time_t object
  std::time_t currentTime = std::chrono::system_clock::to_time_t(now);

  // Format the time using std::put_time
  std::stringstream ss;
  ss << std::put_time(std::localtime(&currentTime), "%Y%m%d %H:%M:%S");

  return ss.str();
}

// This is the MPI grid, i.e. the layout of ranks
int nranks = -1;
std::array<int, 4> mpi_grid = {1, 1, 1, 1};

// run f() in a loop for roughly target_time seconds
// returns seconds per iteration it took
template <class F> double bench(F const &f, double target_time, int niter_warmup = 5)
{
  device_timer_t timer;
  timer.start();
  for (int iter = 0; iter < niter_warmup; ++iter)
    f();
  timer.stop();

  double secs = timer.last() / niter_warmup;
  int niter = std::max(1, int(target_time / secs));
  // niter = std::min(1000, niter);
  // printfQuda("during warmup took %f s/iter, deciding on %d iters\n", secs, niter);

  // important: each rank has its own timer, so their measurements can slightly vary. But
  // 'niter' needs to be consistent (bug took me a couple hours to track down)
  comm_broadcast_global(&niter, sizeof(niter), 0);

  timer.reset(__FUNCTION__, __FILE__, __LINE__);
  timer.start();
  for (int iter = 0; iter < niter; ++iter)
    f();
  timer.stop();
  return timer.last() / niter;
}

void initComms(int argc, char **argv)
{
  // init MPI communication
  MPI_Init(&argc, &argv);

  MPI_Comm_size(MPI_COMM_WORLD, &nranks);
  assert(1 <= nranks && nranks <= 100000);

  mpi_grid[3] = nranks;

  // this maps coordinates to rank number
  auto lex_rank_from_coords = [](int const *coords, void *)
  {
    int rank = coords[0];
    for (int i = 1; i < 4; i++)
      rank = mpi_grid[i] * rank + coords[i];
    return rank;
  };

  initCommsGridQuda(4, mpi_grid.data(), lex_rank_from_coords, nullptr);

  for (int d = 0; d < 4; d++)
    if (mpi_grid[d] > 1)
      commDimPartitionedSet(d);

  json_results["geometry"]["ranks"] = nranks;
  json_results["geometry"]["mpi"] = mpi_grid;
}

// creates a random gauge field. L = local(!) size
cudaGaugeField make_gauge_field(int L)
{
  GaugeFieldParam param;

  // dimension and type of the lattice object
  param.nDim = 4;
  param.x[0] = L;
  param.x[1] = L;
  param.x[2] = L;
  param.x[3] = L;

  // number of colors. potentially confusingly, QUDA sometimes uses the word "color" for
  // things unrelated to physical color. things like "nColor=32" do pop up in deflation
  // solvers where it (to my understanding) refers to the number of (parallelly processed)
  // deflation vectors.
  param.nColor = 3;

  // boundary conditions (don't really care for a benchmark)
  param.t_boundary = QUDA_PERIODIC_T;

  // for this benchmark we only need "SINGLE" and/or "DOUBLE" precision. But smaller
  // precisions are available in QUDA too
  param.setPrecision(QUDA_SINGLE_PRECISION);

  // no even/odd subset, we want a full lattice
  param.siteSubset = QUDA_FULL_SITE_SUBSET;

  // what kind of 3x3 matrices the field contains. A proper gauge field has SU(3)
  // matrices, but (for example) smeared/thick links could have non-unitary links.
  param.link_type = QUDA_SU3_LINKS;

  // "NULL" does not initialize the field upon creation, "ZERO" would set everything to 0
  param.create = QUDA_NULL_FIELD_CREATE;

  // field should be allocated directly on the accelerator/GPU
  param.location = QUDA_CUDA_FIELD_LOCATION;

  // "reconstruct" here means reconstructing a SU(3) matrix from fewer than 18 real
  // numbers (=3x3 complex numbers). Great feature in production (saving
  // memory/cache/network bandwidth), not used for this benchmark.
  param.reconstruct = QUDA_RECONSTRUCT_NO;

  // "ghostExchange" would often be called "halo exchange" outside of QUDA. This has
  // nothing to do with ghost fields from continuum/perturbative qcd.
  param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;

  // This controls the physical order of elements. "float2" is the default
  param.order = QUDA_FLOAT2_GAUGE_ORDER;

  // this means the field is a LORENTZ vector (which a gauge field must be). Has nothing
  // to do with spin.
  param.geometry = QUDA_VECTOR_GEOMETRY;

  // create the field and fill with random SU(3) matrices
  // std::cout << param << std::endl; // double-check parameters
  auto U = cudaGaugeField(param);
  gaugeGauss(U, /*seed=*/1234, 1.0);
  return U;
}

// create a random source vector (L = local size)
ColorSpinorField make_source(int L, int Ls = 1)
{
  // NOTE: `param.x` directly determines the size of the (local, per rank) memory
  // allocation. Thus for checkerboarding, we have to specify x=(L/2,L,L,L) to get a
  // physical local volume of L^4, thus implicitly choosing a dimension for the
  // checkerboarding (shouldn't really matter of course which one).
  ColorSpinorParam param;
  param.nColor = 3;
  param.nSpin = 4;
  param.nVec = 1; // only a single vector
  param.pad = 0;
  param.siteSubset = QUDA_PARITY_SITE_SUBSET;
  param.nDim = Ls == 1 ? 4 : 5;
  param.x[0] = L / 2;
  param.x[1] = L;
  param.x[2] = L;
  param.x[3] = L;
  param.x[4] = Ls;
  param.pc_type = QUDA_4D_PC;
  param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;

  // somewhat surprisingly, the DiracWilson::Dslash(...) function only works with the
  // UKQCD_GAMMA_BASIS
  param.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;

  param.create = QUDA_NULL_FIELD_CREATE; // do not (zero-) initialize the field
  param.setPrecision(QUDA_SINGLE_PRECISION);
  param.location = QUDA_CUDA_FIELD_LOCATION;

  // create the field and fill it with random values
  auto src = ColorSpinorField(param);
  quda::RNG rng(src, 1234);
  spinorNoise(src, rng, QUDA_NOISE_GAUSS);
  /*printfQuda(
      "created src with norm = %f (sanity check: should be close to %f) and %f bytes\n",
      blas::norm2(src), 2.0 * 12 * geom[0] * geom[1] * geom[2] * geom[3],
      src.Bytes() * 1.0);*/
  // src.PrintDims();

  return src;
}

void benchmark_wilson(std::vector<int> const &L_list, double target_time)
{
  printfQuda("==================== wilson dirac operator ====================\n");
#ifdef FLOP_COUNTING_GRID
  printfQuda("IMPORTANT: flop counting as in Benchmark_Grid\n");
#else
  printfQuda("IMPORTANT: flop counting by QUDA's own convention (different from "
             "Benchmark_Grid)\n");
#endif
  printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank");

  for (int L : L_list)
  {
    // printfQuda("starting wilson L=%d\n", L);

    auto U = make_gauge_field(L);
    auto src = make_source(L);

    // create (Wilson) dirac operator
    DiracParam param;
    param.kappa = 0.10;
    param.dagger = QUDA_DAG_NO;
    param.matpcType = QUDA_MATPC_EVEN_EVEN;
    auto dirac = DiracWilson(param);

    // insert gauge field into the dirac operator
    // (the additional nullptr's are for smeared links and fancy preconditioners and such.
    // Not used for simple Wilson fermions)
    dirac.updateFields(&U, nullptr, nullptr, nullptr);
    auto res = ColorSpinorField(ColorSpinorParam(src));
    auto f = [&]() { dirac.Dslash(res, src, QUDA_EVEN_PARITY); };

    // first run to get the quda tuning out of the way
    dirac.Flops(); // reset flops counter
    f();
    double flops = 1.0 * dirac.Flops();

    // actual benchmarking
    auto start_time = get_timestamp();
    double secs = bench(f, target_time);
    auto end_time = get_timestamp();

#ifdef FLOP_COUNTING_GRID
    // this is the flop counting from Benchmark_Grid
    double Nc = 3;
    double Nd = 4;
    double Ns = 4;
    flops = (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2);
    flops *= L * L * L * L / 2.0;
#endif

    printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9);

    json tmp;
    tmp["L"] = L;
    tmp["Gflops_wilson"] = flops / secs * 1e-9;
    tmp["start_time"] = start_time;
    tmp["end_time"] = end_time;
    json_results["flops"]["results"].push_back(tmp);
  }
}

void benchmark_dwf(std::vector<int> const &L_list, double target_time)
{
  printfQuda("==================== domain wall dirac operator ====================\n");
#ifdef FLOP_COUNTING_GRID
  printfQuda("IMPORTANT: flop counting as in Benchmark_Grid\n");
#else
  printfQuda("IMPORTANT: flop counting by QUDA's own convention (different from "
             "Benchmark_Grid)\n");
#endif
  printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank");
  int Ls = 12;
  for (int L : L_list)
  {
    // printfQuda("starting dwf L=%d\n", L);
    auto U = make_gauge_field(L);
    auto src = make_source(L, Ls);

    // create dirac operator
    DiracParam param;
    param.kappa = 0.10;
    param.Ls = Ls;
    param.m5 = 0.1;
    param.dagger = QUDA_DAG_NO;
    param.matpcType = QUDA_MATPC_EVEN_EVEN;
    auto dirac = DiracDomainWall(param);

    // insert gauge field into the dirac operator
    // (the additional nullptr's are for smeared links and fancy preconditioners and such)
    dirac.updateFields(&U, nullptr, nullptr, nullptr);
    auto res = ColorSpinorField(ColorSpinorParam(src));
    auto f = [&]() { dirac.Dslash(res, src, QUDA_EVEN_PARITY); };

    // first run to get the quda tuning out of the way
    dirac.Flops(); // reset flops counter
    f();
    double flops = 1.0 * dirac.Flops();

    // actual benchmarking
    auto start_time = get_timestamp();
    double secs = bench(f, target_time);
    auto end_time = get_timestamp();

#ifdef FLOP_COUNTING_GRID
    // this is the flop counting from Benchmark_Grid
    double Nc = 3;
    double Nd = 4;
    double Ns = 4;
    flops = (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2);
    flops *= L * L * L * L * Ls / 2.0;
#endif

    printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9);
    json tmp;
    tmp["L"] = L;
    tmp["Gflops_dwf4"] = flops / secs * 1e-9;
    tmp["start_time"] = start_time;
    tmp["end_time"] = end_time;
    json_results["flops"]["results"].push_back(tmp);
  }
}

void benchmark_axpy(std::vector<int> const &L_list, double target_time)
{
  // number of iterations for warmup / measurement
  // (feel free to change for noise/time tradeoff)
  constexpr int niter_warmup = 5;

  printfQuda("==================== axpy / memory ====================\n");

  ColorSpinorParam param;
  param.nDim = 4;   // 4-dimensional lattice
  param.x[4] = 1;   // no fifth dimension
  param.nColor = 3; // supported values for nSpin/nColor are configured when compiling
                    // QUDA. "3*4" will probably always be enabled, so we stick with this
  param.nSpin = 4;
  param.nVec = 1;                            // just a single vector
  param.siteSubset = QUDA_FULL_SITE_SUBSET;  // full lattice = no odd/even
  param.pad = 0;                             // no padding
  param.create = QUDA_NULL_FIELD_CREATE;     // do not (zero-) initialize the field
  param.location = QUDA_CUDA_FIELD_LOCATION; // field should reside on GPU
  param.setPrecision(QUDA_SINGLE_PRECISION);

  // the following don't matter for an axpy benchmark, but we need to choose something
  param.pc_type = QUDA_4D_PC;
  param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
  param.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;

  printfQuda("%5s %15s %15s %15s %15s\n", "L", "size (MiB/rank)", "time (usec)",
             "GiB/s/rank", "Gflop/s/rank");
  for (int L : L_list)
  {
    // printfQuda("starting axpy L=%d\n", L);
    // IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()`
    // are LOCAL, i.e. per rank / per GPU

    param.x[0] = L;
    param.x[1] = L;
    param.x[2] = L;
    param.x[3] = L;

    // number of (real) elements in one (local) field
    size_t field_elements = 2 * param.x[0] * param.x[1] * param.x[2] * param.x[3] *
                            param.nColor * param.nSpin;

    // create the field(s)
    auto fieldA = ColorSpinorField(param);
    auto fieldB = ColorSpinorField(param);
    assert(fieldA.Bytes() == sizeof(float) * field_elements); // sanity check
    assert(fieldB.Bytes() == sizeof(float) * field_elements); // sanity check

    // fill fields with random values
    quda::RNG rng(fieldA, 1234);
    spinorNoise(fieldA, rng, QUDA_NOISE_GAUSS);
    spinorNoise(fieldB, rng, QUDA_NOISE_GAUSS);

    // number of operations / bytes per iteration
    // axpy is one addition, one multiplication, two reads, one write
    double flops = 2 * field_elements;
    double memory = 3 * sizeof(float) * field_elements;

    auto f = [&]() { blas::axpy(1.234, fieldA, fieldB); };

    // first run to get the quda tuning out of the way
    f();

    // actual benchmarking
    auto start_time = get_timestamp();
    double secs = bench(f, target_time);
    auto end_time = get_timestamp();

    double mem_MiB = memory / 1024. / 1024.;
    double GBps = mem_MiB / 1024 / secs;
    printfQuda("%5d %15.2f %15.2f %15.2f %15.2f\n", L, mem_MiB, secs * 1e6, GBps,
               flops / secs * 1e-9);

    json tmp;
    tmp["L"] = L;
    tmp["size_MB"] = mem_MiB;
    tmp["GBps"] = GBps;
    tmp["GFlops"] = flops / secs * 1e-9;
    tmp["start_time"] = start_time;
    tmp["end_time"] = end_time;
    json_results["axpy"].push_back(tmp);
  }
}

int main(int argc, char **argv)
{
  std::string json_filename = ""; // empty indicates no json output
  for (int i = 0; i < argc - 1; i++)
  {
    if (std::string(argv[i]) == "--json-out")
      json_filename = argv[i + 1];
  }

  initComms(argc, argv);

  initQuda(-1); // -1 for multi-gpu. otherwise this selects the device to be used

  // verbosity options are:
  // SILENT, SUMMARIZE, VERBOSE, DEBUG_VERBOSE
  setVerbosity(QUDA_SUMMARIZE);

  printfQuda("MPI layout = %d %d %d %d\n", mpi_grid[0], mpi_grid[1], mpi_grid[2],
             mpi_grid[3]);

  benchmark_axpy({8, 12, 16, 24, 32, 48}, 1.0);

  setVerbosity(QUDA_SILENT);
  benchmark_wilson({8, 12, 16, 24, 32, 48}, 1.0);
  benchmark_dwf({8, 12, 16, 24, 32}, 1.0);
  setVerbosity(QUDA_SUMMARIZE);

  printfQuda("==================== done with all benchmarks ====================\n");

  if (!json_filename.empty())
  {
    printfQuda("writing benchmark results to %s\n", json_filename.c_str());

    int me = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    if (me == 0)
    {
      std::ofstream json_file(json_filename);
      json_file << std::setw(2) << json_results;
    }
  }

  endQuda();
  quda::comm_finalize();
  MPI_Finalize();
}
30 Quda/Readme.md (Normal file)
@@ -0,0 +1,30 @@
# QUDA benchmarks

This folder contains benchmarks for the [QUDA](https://github.com/lattice/quda) library.

- `Benchmark_Quda`: this benchmark measures the floating-point performance of fermion
  matrices (Wilson and DWF), as well as memory bandwidth (using a simple `axpy` operation).
  Measurements are performed for a fixed range of problem sizes.

## Building
After setting up your compilation environment (Tursa: `source /home/dp207/dp207/shared/env/production/env-{base,gpu}.sh`):
```bash
./build-quda.sh <env_dir>      # build QUDA
./build-benchmark.sh <env_dir> # build the benchmark
```
where `<env_dir>` is an arbitrary directory where every product will be stored.

## Running the Benchmark

The benchmark should be run as
```bash
mpirun -np <ranks> <env_dir>/prefix/qudabench/Benchmark_Quda
```
where `<ranks>` is the total number of GPUs to use. On Tursa this is 4 times the number of nodes.

Notes:
- on Tursa, the `wrapper.sh` script that is typically used with Grid is not necessary.
- due to QUDA's automatic tuning, the benchmark might take significantly longer to run than `Benchmark_Grid` (even though it does fewer things).
- setting `QUDA_ENABLE_TUNING=0` disables all tuning (this degrades performance severely). By default, tuning is turned on.
- setting `QUDA_RESOURCE_PATH=<some folder>` enables QUDA to save and reuse optimal tuning parameters, making repeated runs much faster.
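
For example, a run that caches tuning parameters between invocations could look like the
sketch below (the `tuning` directory name is just an example; any writable folder works,
and 4 ranks corresponds to one Tursa node):
```bash
# reuse QUDA tuning parameters across runs
mkdir -p tuning
export QUDA_RESOURCE_PATH=$(pwd)/tuning
mpirun -np 4 "<env_dir>/prefix/qudabench/Benchmark_Quda"
```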
32 Quda/build-benchmark.sh (Executable file)
@@ -0,0 +1,32 @@
#!/usr/bin/env bash
# shellcheck disable=SC1090,SC1091

set -euo pipefail

if (( $# != 1 )); then
  echo "usage: $(basename "$0") <environment directory>" 1>&2
  exit 1
fi
env_dir=$1

# TODO: this is Tursa-specific; have not figured out the correct way to do this.
EXTRA_LIBS="/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/stubs/libcuda.so /home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/stubs/libnvidia-ml.so"

# NOTE: these flags need to be in sync with QUDA's compilation options (see build-quda.sh)
BUILD_FLAGS="-O3 -std=c++17 -DMPI_COMMS -DMULTI_GPU -DQUDA_PRECISION=12 -DQUDA_RECONSTRUCT=4"

call_dir=$(pwd -P)
script_dir="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")"
cd "${env_dir}"
env_dir=$(pwd -P)
cd "${call_dir}"
BUILD_DIR="${env_dir}/build/Quda-benchmarks"
PREFIX_DIR="${env_dir}/prefix/qudabench"
QUDA_DIR=${env_dir}/prefix/quda
mkdir -p "${BUILD_DIR}"
mkdir -p "${PREFIX_DIR}"

LINK_FLAGS="-Wl,-rpath,$QUDA_DIR/lib: $QUDA_DIR/lib/libquda.so $EXTRA_LIBS -lpthread -lmpi"

g++ $BUILD_FLAGS -I$QUDA_DIR/include/targets/cuda -I$QUDA_DIR/include -c -o $BUILD_DIR/Benchmark_Quda.o $script_dir/Benchmark_Quda.cpp
g++ -g -O3 $BUILD_DIR/Benchmark_Quda.o -o $PREFIX_DIR/Benchmark_Quda $LINK_FLAGS -lmpi
36 Quda/build-quda.sh (Executable file)
@@ -0,0 +1,36 @@
#!/usr/bin/env bash
# shellcheck disable=SC1090,SC1091

BUILD_FLAGS="-O3 -std=c++17"
QUDA_FLAGS="-DQUDA_MPI=ON -DQUDA_PRECISION=14 -DQUDA_RECONSTRUCT=4 -DQUDA_GPU_ARCH=sm_80"

set -euo pipefail

if (( $# != 1 )); then
  echo "usage: $(basename "$0") <environment directory>" 1>&2
  exit 1
fi
env_dir=$1

call_dir=$(pwd -P)
mkdir -p "${env_dir}"
cd "${env_dir}"
env_dir=$(pwd -P)
cd "${call_dir}"

build_dir="${env_dir}/build/quda"
if [ -d "${build_dir}" ]; then
  echo "error: directory '${build_dir}' exists"
  exit 1
fi
mkdir -p "${build_dir}"

git clone https://github.com/lattice/quda.git "${build_dir}"
cd "${build_dir}"

mkdir build; cd build
cmake .. $QUDA_FLAGS -DCMAKE_INSTALL_PREFIX="${env_dir}/prefix/quda"
make -j128
make install

cd "${call_dir}"
21 Quda/env.sh (Normal file)
@@ -0,0 +1,21 @@
module load gcc/9.3.0
module load cuda/11.4.1
module load openmpi/4.1.1-cuda11.4

export QUDA_RESOURCE_PATH=$(pwd)/tuning
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx # by Fabian. no idea what this is
#export UCX_TLS=rc,rc_x,sm,cuda_copy,cuda_ipc,gdr_copy
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_THRESH=16384
export UCX_RNDV_SCHEME=put_zcopy
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n

export OMPI_MCA_io=romio321
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3

export QUDA_REORDER_LOCATION=GPU # this is the default anyway