Initial commit

2023-01-13 19:00:20 +00:00
commit 2a64c41a8c
10 changed files with 2404 additions and 0 deletions
@@ -0,0 +1,15 @@
+/.vscode
+build*
+.buildutils
+autom4te.cache
+config.*
+compile
+ar-lib
+aclocal.m4
+configure
+depcomp
+install-sh
+missing
+Makefile.in
+.DS_Store
+*~
@@ -0,0 +1,224 @@
+/*
+Copyright © 2022 Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "Benchmark_IO.hpp"
+
+// Smallest local lattice extent that is benchmarked (overridable at build time).
+#if !defined(BENCH_IO_LMIN)
+#define BENCH_IO_LMIN 8
+#endif
+
+// Largest local lattice extent that is benchmarked (overridable at build time).
+#if !defined(BENCH_IO_LMAX)
+#define BENCH_IO_LMAX 32
+#endif
+
+// Number of times the whole benchmark sequence is repeated.
+#if !defined(BENCH_IO_NPASS)
+#define BENCH_IO_NPASS 10
+#endif
+
+#ifdef HAVE_LIME
+using namespace Grid;
+
+/// Returns the base name of the benchmark files used for local extent `l`.
+std::string filestem(const int l)
+{
+  std::string stem("iobench_l");
+
+  stem += std::to_string(l);
+
+  return stem;
+}
+
+/// Maps a volume index to its local extent: index 0 is BENCH_IO_LMIN and
+/// consecutive indices are two lattice sites apart.
+int vol(const int i)
+{
+  const int step = 2;
+
+  return BENCH_IO_LMIN + step * i;
+}
+
+/// Maps a local extent `l` to its row in the performance tables
+/// (inverse of vol() for the extents that are actually benchmarked).
+int volInd(const int l)
+{
+  const int offset = l - BENCH_IO_LMIN;
+
+  return offset / 2;
+}
+
+/// Computes the element-wise sample mean and standard deviation of a set of
+/// equally-shaped Eigen matrices (or vectors).
+///
+/// @param mean    receives the element-wise mean of `data`
+/// @param stdDev  receives the element-wise standard deviation, computed with
+///                the (n - 1) normalisation
+/// @param data    the samples; at least two entries are required
+template <typename Mat>
+void stats(Mat &mean, Mat &stdDev, const std::vector<Mat> &data)
+{
+  // Validate the sample count before data[0] is dereferenced below.
+  assert(data.size() > 1);
+
+  auto nr = data[0].rows(), nc = data[0].cols();
+  Eigen::MatrixXd sqSum(nr, nc);
+  double n = static_cast<double>(data.size());
+
+  mean = Mat::Zero(nr, nc);
+  sqSum = Mat::Zero(nr, nc);
+  for (auto &d : data)
+  {
+    mean += d;
+    sqSum += d.cwiseProduct(d);
+  }
+  // At this point `mean` still holds the plain sum. Rounding can push the
+  // variance of (nearly) identical samples slightly below zero, which would
+  // turn into NaN under the square root, hence the clamp at zero.
+  stdDev = ((sqSum - mean.cwiseProduct(mean) / n) / (n - 1.)).cwiseMax(0.).cwiseSqrt();
+  mean /= n;
+}
+
+#define grid_printf(...)        \
+  {                             \
+    char _buf[1024];            \
+    sprintf(_buf, __VA_ARGS__); \
+    MSG << _buf;                \
+  }
+
+// Column indices of the performance tables. The "s" entries hold the std I/O
+// results and the "g" entries the Grid C-Lime results.
+enum
+{
+  sRead,
+  sWrite,
+  gRead,
+  gWrite
+};
+
+/// Benchmark driver.
+///
+/// Runs BENCH_IO_NPASS passes. Each pass writes and then reads back a
+/// LatticeFermion for every local extent L = BENCH_IO_LMIN, +2, ...,
+/// BENCH_IO_LMAX, first with the std I/O routines and then with the C-Lime
+/// ones, and records the bandwidth left in BinaryIO::lastPerf. A summary with
+/// mean, standard deviation and "robustness" over the passes is printed last.
+int main(int argc, char **argv)
+{
+  Grid_init(&argc, &argv);
+
+  // Volume averages cover L = 24 .. BENCH_IO_LMAX. When the smallest
+  // benchmarked extent is above 24 the average starts there instead, so that
+  // volInd() never produces a negative row index.
+  const int avgLmin = (BENCH_IO_LMIN > 24) ? BENCH_IO_LMIN : 24;
+  int64_t threads = GridThread::GetThreads();
+  auto mpi = GridDefaultMpi();
+  unsigned int nVol = (BENCH_IO_LMAX - BENCH_IO_LMIN) / 2 + 1;
+  // Number of volumes entering the average; zero when BENCH_IO_LMAX < avgLmin.
+  unsigned int nRelVol = (BENCH_IO_LMAX >= avgLmin) ? (BENCH_IO_LMAX - avgLmin) / 2 + 1 : 0;
+  std::vector<Eigen::MatrixXd> perf(BENCH_IO_NPASS, Eigen::MatrixXd::Zero(nVol, 4));
+  std::vector<Eigen::VectorXd> avPerf(BENCH_IO_NPASS, Eigen::VectorXd::Zero(4));
+  std::vector<int> latt;
+
+  // Runs one write benchmark over all local volumes and stores the measured
+  // bandwidth in column `col` of the table of pass `pass`.
+  auto benchWrite = [&](const unsigned int pass, const char *title,
+                        const WriterFn<LatticeFermion> &write, const int col)
+  {
+    MSG << SEP << std::endl;
+    MSG << title << std::endl;
+    MSG << SEP << std::endl;
+    for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
+    {
+      latt = {l * mpi[0], l * mpi[1], l * mpi[2], l * mpi[3]};
+
+      MSG << "-- Local volume " << l << "^4" << std::endl;
+      writeBenchmark<LatticeFermion>(latt, filestem(l), write);
+      perf[pass](volInd(l), col) = BinaryIO::lastPerf.mbytesPerSecond;
+    }
+  };
+
+  // Same as benchWrite, for a read benchmark.
+  auto benchRead = [&](const unsigned int pass, const char *title,
+                       const ReaderFn<LatticeFermion> &read, const int col)
+  {
+    MSG << SEP << std::endl;
+    MSG << title << std::endl;
+    MSG << SEP << std::endl;
+    for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
+    {
+      latt = {l * mpi[0], l * mpi[1], l * mpi[2], l * mpi[3]};
+
+      MSG << "-- Local volume " << l << "^4" << std::endl;
+      readBenchmark<LatticeFermion>(latt, filestem(l), read);
+      perf[pass](volInd(l), col) = BinaryIO::lastPerf.mbytesPerSecond;
+    }
+  };
+
+  MSG << "Grid is setup to use " << threads << " threads" << std::endl;
+  MSG << "MPI partition " << mpi << std::endl;
+  for (unsigned int i = 0; i < BENCH_IO_NPASS; ++i)
+  {
+    MSG << BIGSEP << std::endl;
+    MSG << "Pass " << i + 1 << "/" << BENCH_IO_NPASS << std::endl;
+    MSG << BIGSEP << std::endl;
+
+    // Each read follows the matching write so that the files it needs exist.
+    // No inner HAVE_LIME guard is needed: this whole function is only
+    // compiled when HAVE_LIME is defined.
+    benchWrite(i, "Benchmark std write", stdWrite<LatticeFermion>, sWrite);
+    benchRead(i, "Benchmark std read", stdRead<LatticeFermion>, sRead);
+    benchWrite(i, "Benchmark Grid C-Lime write", limeWrite<LatticeFermion>, gWrite);
+    benchRead(i, "Benchmark Grid C-Lime read", limeRead<LatticeFermion>, gRead);
+
+    // Average each of the four columns over the large volumes.
+    avPerf[i].fill(0.);
+    for (int f = 0; f < 4; ++f)
+      for (int l = avgLmin; l <= BENCH_IO_LMAX; l += 2)
+      {
+        avPerf[i](f) += perf[i](volInd(l), f);
+      }
+    if (nRelVol > 0)
+    {
+      avPerf[i] /= nRelVol;
+    }
+  }
+
+  Eigen::MatrixXd mean(nVol, 4), stdDev(nVol, 4), rob(nVol, 4);
+  Eigen::VectorXd avMean(4), avStdDev(4), avRob(4);
+
+  stats(mean, stdDev, perf);
+  stats(avMean, avStdDev, avPerf);
+  // Robustness in percent: 100 * (1 - std dev / |mean|).
+  rob.fill(100.);
+  rob -= 100. * stdDev.cwiseQuotient(mean.cwiseAbs());
+  avRob.fill(100.);
+  avRob -= 100. * avStdDev.cwiseQuotient(avMean.cwiseAbs());
+
+  MSG << BIGSEP << std::endl;
+  MSG << "SUMMARY" << std::endl;
+  MSG << BIGSEP << std::endl;
+  MSG << "Summary of individual results (all results in MB/s)." << std::endl;
+  MSG << "Every second colum gives the standard deviation of the previous column." << std::endl;
+  MSG << std::endl;
+  grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n",
+              "L", "std read", "std dev", "std write", "std dev",
+              "Grid read", "std dev", "Grid write", "std dev");
+  for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
+  {
+    grid_printf("%4d %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n",
+                l, mean(volInd(l), sRead), stdDev(volInd(l), sRead),
+                mean(volInd(l), sWrite), stdDev(volInd(l), sWrite),
+                mean(volInd(l), gRead), stdDev(volInd(l), gRead),
+                mean(volInd(l), gWrite), stdDev(volInd(l), gWrite));
+  }
+  MSG << std::endl;
+  MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)" << std::endl;
+  MSG << std::endl;
+  grid_printf("%4s %12s %12s %12s %12s\n",
+              "L", "std read", "std write", "Grid read", "Grid write");
+  for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
+  {
+    grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n",
+                l, rob(volInd(l), sRead), rob(volInd(l), sWrite),
+                rob(volInd(l), gRead), rob(volInd(l), gWrite));
+  }
+  MSG << std::endl;
+  MSG << "Summary of results averaged over local volumes " << avgLmin << "^4-" << BENCH_IO_LMAX << "^4 (all results in MB/s)." << std::endl;
+  MSG << "Every second colum gives the standard deviation of the previous column." << std::endl;
+  MSG << std::endl;
+  grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n",
+              "std read", "std dev", "std write", "std dev",
+              "Grid read", "std dev", "Grid write", "std dev");
+  grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n",
+              avMean(sRead), avStdDev(sRead), avMean(sWrite), avStdDev(sWrite),
+              avMean(gRead), avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
+  MSG << std::endl;
+  MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)" << std::endl;
+  MSG << std::endl;
+  grid_printf("%12s %12s %12s %12s\n",
+              "std read", "std write", "Grid read", "Grid write");
+  grid_printf("%12.1f %12.1f %12.1f %12.1f\n",
+              avRob(sRead), avRob(sWrite), avRob(gRead), avRob(gWrite));
+
+  Grid_finalize();
+
+  return EXIT_SUCCESS;
+}
+#else
+// Stub entry point compiled when HAVE_LIME is not defined: does nothing.
+int main(int, char **)
+{
+  return 0;
+}
+#endif
@@ -0,0 +1,273 @@
+/*
+Copyright © 2022 Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef Benchmark_IO_hpp_
+#define Benchmark_IO_hpp_
+
+#include <Grid/Grid.h>
+// Stream expression every benchmark log line starts with.
+#define MSG std::cout << GridLogMessage
+// Thin and thick horizontal rules used to structure the log output.
+#define SEP "-----------------------------------------------------------------------------"
+#define BIGSEP "============================================================================="
+#ifdef HAVE_LIME
+
+namespace Grid
+{
+
+  // Callable signatures accepted by the benchmark drivers: a writer takes
+  // (file name, field) and a reader takes (field, file name). The file name
+  // is passed by value.
+  template <typename Field>
+  using WriterFn = std::function<void(std::string, Field &)>;
+  template <typename Field>
+  using ReaderFn = std::function<void(Field &, std::string)>;
+
+  // AP 06/10/2020: Standard C version in case one is suspicious of the C++ API
+  //
+  // template <typename Field>
+  // void stdWrite(const std::string filestem, Field &vec)
+  // {
+  //   std::string   rankStr = std::to_string(vec.Grid()->ThisRank());
+  //   std::FILE     *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "wb");
+  //   size_t        size;
+  //   uint32_t      crc;
+  //   GridStopWatch ioWatch, crcWatch;
+
+  //   size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
+  //   autoView(vec_v, vec, CpuRead);
+  //   crcWatch.Start();
+  //   crc = GridChecksum::crc32(vec_v.cpu_ptr, size);
+  //   std::fwrite(&crc, sizeof(uint32_t), 1, file);
+  //   crcWatch.Stop();
+  //   MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
+  //   ioWatch.Start();
+  //   std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file);
+  //   ioWatch.Stop();
+  //   std::fclose(file);
+  //   size *= vec.Grid()->ProcessorCount();
+  //   auto &p = BinaryIO::lastPerf;
+  //   p.size            = size;
+  //   p.time            = ioWatch.useconds();
+  //   p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
+  //   MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
+  //       << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
+  //   MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
+  // }
+  //
+  // template <typename Field>
+  // void stdRead(Field &vec, const std::string filestem)
+  // {
+  //   std::string   rankStr = std::to_string(vec.Grid()->ThisRank());
+  //   std::FILE     *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "rb");
+  //   size_t        size;
+  //   uint32_t      crcRead, crcData;
+  //   GridStopWatch ioWatch, crcWatch;
+
+  //   size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
+  //   crcWatch.Start();
+  //   std::fread(&crcRead, sizeof(uint32_t), 1, file);
+  //   crcWatch.Stop();
+  //   {
+  //     autoView(vec_v, vec, CpuWrite);
+  //     ioWatch.Start();
+  //     std::fread(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file);
+  //     ioWatch.Stop();
+  //     std::fclose(file);
+  //   }
+  //   {
+  //     autoView(vec_v, vec, CpuRead);
+  //     crcWatch.Start();
+  //     crcData = GridChecksum::crc32(vec_v.cpu_ptr, size);
+  //     crcWatch.Stop();
+  //   }
+  //   MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
+  //   assert(crcData == crcRead);
+  //   size *= vec.Grid()->ProcessorCount();
+  //   auto &p = BinaryIO::lastPerf;
+  //   p.size            = size;
+  //   p.time            = ioWatch.useconds();
+  //   p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
+  //   MSG << "Std I/O read: Read " <<  p.size << " bytes in " << ioWatch.Elapsed()
+  //       << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
+  //   MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
+  // }
+
+  /// Writes the rank-local data of `vec` to "<filestem>.<rank>.bin" with
+  /// std::ofstream: first the CRC32 of the data, then the raw data itself.
+  ///
+  /// Only the data write (including the flush) is timed for the bandwidth;
+  /// the checksum computation and its write are timed separately. The result
+  /// is stored in BinaryIO::lastPerf, with the size scaled by the number of
+  /// processes.
+  ///
+  /// @param filestem  file name without the ".<rank>.bin" suffix
+  /// @param vec       field whose local data is written
+  template <typename Field>
+  void stdWrite(const std::string filestem, Field &vec)
+  {
+    std::string rankStr = std::to_string(vec.Grid()->ThisRank());
+    std::ofstream file(filestem + "." + rankStr + ".bin", std::ios::out | std::ios::binary);
+    size_t size;
+    uint32_t crc;
+    GridStopWatch ioWatch, crcWatch;
+
+    // Timing writes to a stream that never opened would report a bogus
+    // bandwidth, so stop here instead.
+    assert(file.is_open());
+
+    size = vec.Grid()->lSites() * sizeof(typename Field::scalar_object);
+    autoView(vec_v, vec, CpuRead);
+    crcWatch.Start();
+    crc = GridChecksum::crc32(vec_v.cpu_ptr, size);
+    file.write(reinterpret_cast<char *>(&crc), sizeof(uint32_t));
+    crcWatch.Stop();
+    MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
+    ioWatch.Start();
+    file.write(reinterpret_cast<char *>(vec_v.cpu_ptr), size);
+    file.flush();
+    ioWatch.Stop();
+    // A failed write (e.g. full disk) must not be reported as a measurement.
+    assert(file.good());
+    size *= vec.Grid()->ProcessorCount();
+    auto &p = BinaryIO::lastPerf;
+    p.size = size;
+    p.time = ioWatch.useconds();
+    p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
+    MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
+        << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
+    MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
+  }
+
+  /// Reads back a file produced by stdWrite ("<filestem>.<rank>.bin") into the
+  /// rank-local data of `vec` and checks it against the stored CRC32.
+  ///
+  /// Only the data read is timed for the bandwidth; reading the stored
+  /// checksum and recomputing it are timed separately. The result is stored
+  /// in BinaryIO::lastPerf, with the size scaled by the number of processes.
+  ///
+  /// @param vec       field receiving the data
+  /// @param filestem  file name without the ".<rank>.bin" suffix
+  template <typename Field>
+  void stdRead(Field &vec, const std::string filestem)
+  {
+    std::string rankStr = std::to_string(vec.Grid()->ThisRank());
+    std::ifstream file(filestem + "." + rankStr + ".bin", std::ios::in | std::ios::binary);
+    size_t size;
+    // Zero-initialised so that a failed read never leaves it indeterminate.
+    uint32_t crcRead = 0, crcData;
+    GridStopWatch ioWatch, crcWatch;
+
+    // A missing file would otherwise be "read" at a meaningless speed.
+    assert(file.is_open());
+
+    size = vec.Grid()->lSites() * sizeof(typename Field::scalar_object);
+    crcWatch.Start();
+    file.read(reinterpret_cast<char *>(&crcRead), sizeof(uint32_t));
+    crcWatch.Stop();
+    {
+      autoView(vec_v, vec, CpuWrite);
+      ioWatch.Start();
+      file.read(reinterpret_cast<char *>(vec_v.cpu_ptr), size);
+      ioWatch.Stop();
+    }
+    // Catches truncated files before the checksum comparison.
+    assert(file.good());
+    {
+      autoView(vec_v, vec, CpuRead);
+      crcWatch.Start();
+      crcData = GridChecksum::crc32(vec_v.cpu_ptr, size);
+      crcWatch.Stop();
+    }
+    MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
+    assert(crcData == crcRead);
+    size *= vec.Grid()->ProcessorCount();
+    auto &p = BinaryIO::lastPerf;
+    p.size = size;
+    p.time = ioWatch.useconds();
+    p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
+    MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed()
+        << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
+    MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
+  }
+
+  /// Writes `vec` as a SciDAC field record to "<filestem>.lime.bin" through
+  /// Grid's ScidacWriter, with an empty user record.
+  template <typename Field>
+  void limeWrite(const std::string filestem, Field &vec)
+  {
+    emptyUserRecord userRecord;
+    ScidacWriter writer(vec.Grid()->IsBoss());
+    const std::string path = filestem + ".lime.bin";
+
+    writer.open(path);
+    writer.writeScidacFieldRecord(vec, userRecord);
+    writer.close();
+  }
+
+  /// Reads a SciDAC field record from "<filestem>.lime.bin" into `vec`
+  /// through Grid's ScidacReader; the user record is read and discarded.
+  template <typename Field>
+  void limeRead(Field &vec, const std::string filestem)
+  {
+    emptyUserRecord userRecord;
+    ScidacReader reader;
+    const std::string path = filestem + ".lime.bin";
+
+    reader.open(path);
+    reader.readScidacFieldRecord(vec, userRecord);
+    reader.close();
+  }
+
+  /// Selects the grid a benchmark field lives on, derived from the 4D base
+  /// grid `gBasePt`.
+  ///
+  /// @param gPt      receives the resulting grid; it takes ownership of a
+  ///                 newly made grid, or shares `gBasePt` when neither a
+  ///                 fifth dimension nor red-black is requested
+  /// @param gBasePt  4D base grid
+  /// @param Ls       a value above 1 selects a five-dimensional grid
+  /// @param rb       true selects the red-black variant
+  inline void makeGrid(std::shared_ptr<GridBase> &gPt,
+                       const std::shared_ptr<GridCartesian> &gBasePt,
+                       const unsigned int Ls = 1, const bool rb = false)
+  {
+    const bool fiveDim = (Ls > 1);
+
+    if (rb && fiveDim)
+    {
+      gPt.reset(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, gBasePt.get()));
+    }
+    else if (rb)
+    {
+      gPt.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(gBasePt.get()));
+    }
+    else if (fiveDim)
+    {
+      gPt.reset(SpaceTimeGrid::makeFiveDimGrid(Ls, gBasePt.get()));
+    }
+    else
+    {
+      gPt = gBasePt;
+    }
+  }
+
+  /// Builds a grid for the global lattice `latt`, fills a field with random
+  /// numbers and hands it to the `write` callback.
+  ///
+  /// The parallel RNG is seeded with eight values drawn from
+  /// std::random_device, so the written data differs from call to call.
+  ///
+  /// @param latt      global 4D lattice dimensions
+  /// @param filename  name passed through to `write`
+  /// @param write     routine performing (and timing) the actual write
+  /// @param Ls, rb    grid selection, see makeGrid
+  template <typename Field>
+  void writeBenchmark(const Coordinate &latt, const std::string filename,
+                      const WriterFn<Field> &write,
+                      const unsigned int Ls = 1, const bool rb = false)
+  {
+    auto mpiLayout = GridDefaultMpi();
+    auto simdLayout = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd());
+    std::shared_ptr<GridCartesian> baseGrid(SpaceTimeGrid::makeFourDimGrid(latt, simdLayout, mpiLayout));
+    std::shared_ptr<GridBase> grid;
+    std::random_device entropy;
+
+    makeGrid(grid, baseGrid, Ls, rb);
+
+    GridParallelRNG rng(grid.get());
+    Field vec(grid.get());
+
+    // Draw the eight seeds one after the other, in order.
+    std::vector<int> seeds;
+    for (int s = 0; s < 8; ++s)
+    {
+      seeds.push_back(static_cast<int>(entropy()));
+    }
+    rng.SeedFixedIntegers(seeds);
+
+    random(rng, vec);
+    write(filename, vec);
+  }
+
+  /// Builds a grid for the global lattice `latt`, creates a field on it and
+  /// hands it to the `read` callback to be filled from file.
+  ///
+  /// @param latt      global 4D lattice dimensions
+  /// @param filename  name passed through to `read`
+  /// @param read      routine performing (and timing) the actual read
+  /// @param Ls, rb    grid selection, see makeGrid
+  template <typename Field>
+  void readBenchmark(const Coordinate &latt, const std::string filename,
+                     const ReaderFn<Field> &read,
+                     const unsigned int Ls = 1, const bool rb = false)
+  {
+    auto mpiLayout = GridDefaultMpi();
+    auto simdLayout = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd());
+    std::shared_ptr<GridCartesian> baseGrid(SpaceTimeGrid::makeFourDimGrid(latt, simdLayout, mpiLayout));
+    std::shared_ptr<GridBase> grid;
+
+    makeGrid(grid, baseGrid, Ls, rb);
+
+    Field vec(grid.get());
+
+    read(vec, filename);
+  }
+
+}
+
+#endif // LIME
+#endif // Benchmark_IO_hpp_
@@ -0,0 +1,801 @@
+/*
+Copyright © 2015 Peter Boyle <paboyle@ph.ed.ac.uk>
+Copyright © 2022 Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <Grid/Grid.h>
+
+using namespace Grid;
+
+// File-scope benchmark state.
+// NOTE(review): presumably the lattice sizes, Ls values and Mflop/s figures
+// collected across runs -- confirm against the code that fills them.
+std::vector<int> L_list, Ls_list;
+std::vector<double> mflop_list;
+
+double mflop_ref, mflop_ref_err;
+
+// Node count, recorded by Benchmark::DWF.
+int NN_global;
+
+struct time_statistics
+{
+  double mean;
+  double err;
+  double min;
+  double max;
+
+  void statistics(std::vector<double> v)
+  {
+    double sum = std::accumulate(v.begin(), v.end(), 0.0);
+    mean = sum / v.size();
+
+    std::vector<double> diff(v.size());
+    std::transform(v.begin(), v.end(), diff.begin(), [=](double x)
+                   { return x - mean; });
+    double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
+    err = std::sqrt(sq_sum / (v.size() * (v.size() - 1)));
+
+    auto result = std::minmax_element(v.begin(), v.end());
+    min = *result.first;
+    max = *result.second;
+  }
+};
+
+/// Prints the column titles of the table produced by Benchmark::Comms.
+void comms_header()
+{
+  // Adjacent literals are concatenated by the compiler into one title line.
+  const char *const titles = " L  "
+                             "\t"
+                             " Ls  "
+                             "\t"
+                             "bytes\t MB/s uni (err/min/max) \t\t MB/s bidi (err/min/max)";
+
+  std::cout << GridLogMessage << titles << std::endl;
+}
+
+// Gamma algebra elements of the four directions, in X, Y, Z, T order.
+Gamma::Algebra Gmu[4] = {
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ,
+    Gamma::Algebra::GammaT};
+
+// Bundle of run options.
+// NOTE(review): presumably one kernel / communication configuration per
+// instance -- confirm against the code that uses it.
+struct controls
+{
+  int Opt;
+  int CommsOverlap;
+  Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch;
+};
+
+class Benchmark
+{
+public:
+  /// Logs the thread count, the MPI task layout and, for each SIMD vector
+  /// type, its size in bits and its default 4D SIMD layout.
+  static void Decomposition(void)
+  {
+    const char *const rule = "==================================================================================";
+    const int threads = GridThread::GetThreads();
+
+    std::cout << GridLogMessage << rule << std::endl;
+    std::cout << GridLogMessage << "= Grid is setup to use " << threads << " threads" << std::endl;
+    std::cout << GridLogMessage << rule << std::endl;
+    std::cout << GridLogMessage << "Grid Default Decomposition patterns\n";
+    std::cout << GridLogMessage << "\tOpenMP threads : " << GridThread::GetThreads() << std::endl;
+    std::cout << GridLogMessage << "\tMPI tasks      : " << GridCmdVectorIntToString(GridDefaultMpi()) << std::endl;
+    std::cout << GridLogMessage << "\tvReal          : " << sizeof(vReal) * 8 << "bits ; "
+              << GridCmdVectorIntToString(GridDefaultSimd(4, vReal::Nsimd())) << std::endl;
+    std::cout << GridLogMessage << "\tvRealF         : " << sizeof(vRealF) * 8 << "bits ; "
+              << GridCmdVectorIntToString(GridDefaultSimd(4, vRealF::Nsimd())) << std::endl;
+    std::cout << GridLogMessage << "\tvRealD         : " << sizeof(vRealD) * 8 << "bits ; "
+              << GridCmdVectorIntToString(GridDefaultSimd(4, vRealD::Nsimd())) << std::endl;
+    std::cout << GridLogMessage << "\tvComplex       : " << sizeof(vComplex) * 8 << "bits ; "
+              << GridCmdVectorIntToString(GridDefaultSimd(4, vComplex::Nsimd())) << std::endl;
+    std::cout << GridLogMessage << "\tvComplexF      : " << sizeof(vComplexF) * 8 << "bits ; "
+              << GridCmdVectorIntToString(GridDefaultSimd(4, vComplexF::Nsimd())) << std::endl;
+    std::cout << GridLogMessage << "\tvComplexD      : " << sizeof(vComplexD) * 8 << "bits ; "
+              << GridCmdVectorIntToString(GridDefaultSimd(4, vComplexD::Nsimd())) << std::endl;
+    std::cout << GridLogMessage << rule << std::endl;
+  }
+
+  /// Benchmarks SendToRecvFrom transfers between neighbouring ranks.
+  ///
+  /// For each local extent lat = 16, 24, 32 (with Ls = 12) a buffer of
+  /// lat^3 * Ls HalfSpinColourVectorD is exchanged Nloop times in each of
+  /// the 8 directions whose dimension is split over more than one MPI rank.
+  /// One table line is printed per direction with the uni- and
+  /// bidirectional rates derived from the mean, error, max and min of the
+  /// measured times (bytes per microsecond, i.e. MB/s).
+  static void Comms(void)
+  {
+    const int Nloop = 200;
+    const int maxlat = 32;
+    const int Ls = 12;
+    int nmu = 0;
+
+    Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
+    Coordinate mpi_layout = GridDefaultMpi();
+
+    // Count the dimensions that actually involve communication.
+    for (int mu = 0; mu < Nd; mu++)
+      if (mpi_layout[mu] > 1)
+        nmu++;
+
+    std::vector<double> t_time(Nloop);
+    time_statistics timestat;
+
+    std::cout << GridLogMessage << "====================================================================================================" << std::endl;
+    std::cout << GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in " << nmu << " dimensions" << std::endl;
+    std::cout << GridLogMessage << "====================================================================================================" << std::endl;
+    comms_header();
+
+    for (int lat = 16; lat <= maxlat; lat += 8)
+    {
+      Coordinate latt_size({lat * mpi_layout[0],
+                            lat * mpi_layout[1],
+                            lat * mpi_layout[2],
+                            lat * mpi_layout[3]});
+
+      GridCartesian Grid(latt_size, simd_layout, mpi_layout);
+      RealD Nrank = Grid._Nprocessors;
+      RealD Nnode = Grid.NodeCount();
+      RealD ppn = Nrank / Nnode;
+
+      // One send and one receive buffer per direction.
+      std::vector<HalfSpinColourVectorD *> xbuf(8);
+      std::vector<HalfSpinColourVectorD *> rbuf(8);
+      uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD);
+      for (int d = 0; d < 8; d++)
+      {
+        xbuf[d] = static_cast<HalfSpinColourVectorD *>(acceleratorAllocDevice(bytes));
+        rbuf[d] = static_cast<HalfSpinColourVectorD *>(acceleratorAllocDevice(bytes));
+      }
+
+      for (int dir = 0; dir < 8; dir++)
+      {
+        int mu = dir % 4;
+        if (mpi_layout[mu] > 1)
+        {
+          for (int i = 0; i < Nloop; i++)
+          {
+            double start = usecond();
+            int xmit_to_rank;
+            int recv_from_rank;
+
+            // Directions 0-3 shift by one rank along mu, directions 4-7 by
+            // mpi_layout[mu] - 1 ranks.
+            if (dir == mu)
+            {
+              int comm_proc = 1;
+              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
+            }
+            else
+            {
+              int comm_proc = mpi_layout[mu] - 1;
+              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
+            }
+            Grid.SendToRecvFrom(static_cast<void *>(xbuf[dir]), xmit_to_rank,
+                                static_cast<void *>(rbuf[dir]), recv_from_rank,
+                                bytes);
+
+            double stop = usecond();
+            t_time[i] = stop - start; // microseconds
+          }
+          timestat.statistics(t_time);
+
+          // Bytes moved by one exchange, scaled by the ranks per node.
+          double dbytes = bytes * ppn;
+          double xbytes = dbytes * 0.5;
+          double bidibytes = dbytes;
+
+          std::cout << GridLogMessage << lat << "\t" << Ls << "\t "
+                    << bytes << " \t "
+                    << xbytes / timestat.mean << " \t " << xbytes * timestat.err / (timestat.mean * timestat.mean) << " \t "
+                    << xbytes / timestat.max << " " << xbytes / timestat.min
+                    << "\t\t" << bidibytes / timestat.mean << "  " << bidibytes * timestat.err / (timestat.mean * timestat.mean) << " "
+                    << bidibytes / timestat.max << " " << bidibytes / timestat.min << std::endl;
+        }
+      }
+      for (int d = 0; d < 8; d++)
+      {
+        acceleratorFreeDevice(xbuf[d]);
+        acceleratorFreeDevice(rbuf[d]);
+      }
+    }
+  }
+
+  /// Benchmarks the memory bandwidth of the lattice-wide kernel
+  /// z = a * x - y on fields of Nvec = 8 reals per site, for local extents
+  /// lat = 8, 16, 24, 32.
+  ///
+  /// The repeat count (NLOOP) scales as (lmax / lat)^4, so every volume does
+  /// a comparable amount of work. One table line is printed per volume;
+  /// `time` is in nanoseconds per iteration, so bytes / time is GB/s and
+  /// flops / time is Gflop/s.
+  static void Memory(void)
+  {
+    const int Nvec = 8;
+    typedef Lattice<iVector<vReal, Nvec>> LatticeVec;
+    typedef iVector<vReal, Nvec> Vec;
+
+    Coordinate simd_layout = GridDefaultSimd(Nd, vReal::Nsimd());
+    Coordinate mpi_layout = GridDefaultMpi();
+
+    std::cout << GridLogMessage << "==================================================================================" << std::endl;
+    std::cout << GridLogMessage << "= Benchmarking a*x + y bandwidth" << std::endl;
+    std::cout << GridLogMessage << "==================================================================================" << std::endl;
+    std::cout << GridLogMessage << "  L  "
+              << "\t\t"
+              << "bytes"
+              << "\t\t\t"
+              << "GB/s"
+              << "\t\t"
+              << "Gflop/s"
+              << "\t\t seconds"
+              << "\t\tGB/s / node" << std::endl;
+    std::cout << GridLogMessage << "----------------------------------------------------------" << std::endl;
+
+    uint64_t NN;
+
+    uint64_t lmax = 32;
+    // Repeat count for the current `lat`; expands using the local names
+    // `lmax` and `lat`, and is also used by SU4().
+#define NLOOP (1000 * lmax * lmax * lmax * lmax / lat / lat / lat / lat)
+
+    GridSerialRNG sRNG;
+    sRNG.SeedFixedIntegers(std::vector<int>({45, 12, 81, 9}));
+    // `lat` stays an int because it is used to build the integer lattice
+    // coordinates; the bound is converted to match it.
+    for (int lat = 8; lat <= static_cast<int>(lmax); lat += 8)
+    {
+
+      Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2], lat * mpi_layout[3]});
+      int64_t vol = latt_size[0] * latt_size[1] * latt_size[2] * latt_size[3];
+
+      GridCartesian Grid(latt_size, simd_layout, mpi_layout);
+
+      NN = Grid.NodeCount();
+
+      Vec rn;
+      random(sRNG, rn);
+
+      LatticeVec z(&Grid);
+      z = Zero();
+      LatticeVec x(&Grid);
+      x = Zero();
+      LatticeVec y(&Grid);
+      y = Zero();
+      double a = 2.0;
+
+      uint64_t Nloop = NLOOP;
+
+      double start = usecond();
+      // Counter has the same type as Nloop (no signed/unsigned comparison).
+      for (uint64_t i = 0; i < Nloop; i++)
+      {
+        z = a * x - y;
+      }
+      double stop = usecond();
+      double time = (stop - start) / Nloop * 1000;
+
+      double flops = vol * Nvec * 2; // mul,add
+      double bytes = 3.0 * vol * Nvec * sizeof(Real);
+      std::cout << GridLogMessage << std::setprecision(3)
+                << lat << "\t\t" << bytes << "   \t\t" << bytes / time << "\t\t" << flops / time << "\t\t" << (stop - start) / 1000. / 1000.
+                << "\t\t" << bytes / time / NN << std::endl;
+    }
+  }
+
+  /// Benchmarks the bandwidth of the lattice-wide product z = x * y of
+  /// single-precision complex 4x4 matrix fields, for local extents
+  /// lat = 8, 16, 24, 32.
+  ///
+  /// The repeat count comes from the NLOOP macro defined in Memory(). One
+  /// table line is printed per volume; `time` is in nanoseconds per
+  /// iteration, so bytes / time is GB/s and flops / time is Gflop/s.
+  static void SU4(void)
+  {
+    const int Nc4 = 4;
+    typedef Lattice<iMatrix<vComplexF, Nc4>> LatticeSU4;
+
+    Coordinate simd_layout = GridDefaultSimd(Nd, vComplexF::Nsimd());
+    Coordinate mpi_layout = GridDefaultMpi();
+
+    std::cout << GridLogMessage << "==================================================================================" << std::endl;
+    std::cout << GridLogMessage << "= Benchmarking z = y*x SU(4) bandwidth" << std::endl;
+    std::cout << GridLogMessage << "==================================================================================" << std::endl;
+    std::cout << GridLogMessage << "  L  "
+              << "\t\t"
+              << "bytes"
+              << "\t\t\t"
+              << "GB/s"
+              << "\t\t"
+              << "Gflop/s"
+              << "\t\t seconds"
+              << "\t\tGB/s / node" << std::endl;
+    std::cout << GridLogMessage << "----------------------------------------------------------" << std::endl;
+
+    uint64_t NN;
+
+    uint64_t lmax = 32;
+
+    GridSerialRNG sRNG;
+    sRNG.SeedFixedIntegers(std::vector<int>({45, 12, 81, 9}));
+    // `lat` stays an int because it is used to build the integer lattice
+    // coordinates; the bound is converted to match it.
+    for (int lat = 8; lat <= static_cast<int>(lmax); lat += 8)
+    {
+
+      Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2], lat * mpi_layout[3]});
+      int64_t vol = latt_size[0] * latt_size[1] * latt_size[2] * latt_size[3];
+
+      GridCartesian Grid(latt_size, simd_layout, mpi_layout);
+
+      NN = Grid.NodeCount();
+
+      LatticeSU4 z(&Grid);
+      z = Zero();
+      LatticeSU4 x(&Grid);
+      x = Zero();
+      LatticeSU4 y(&Grid);
+      y = Zero();
+
+      uint64_t Nloop = NLOOP;
+
+      double start = usecond();
+      // Counter has the same type as Nloop (no signed/unsigned comparison).
+      for (uint64_t i = 0; i < Nloop; i++)
+      {
+        z = x * y;
+      }
+      double stop = usecond();
+      double time = (stop - start) / Nloop * 1000;
+
+      // Per matrix element: Nc4 complex multiplies (6 flops each) and
+      // Nc4 - 1 complex adds (2 flops each), i.e. 6 + (Nc4 - 1) * 8.
+      double flops = vol * Nc4 * Nc4 * (6 + (Nc4 - 1) * 8); // mul,add
+      double bytes = 3.0 * vol * Nc4 * Nc4 * 2 * sizeof(RealF);
+      std::cout << GridLogMessage << std::setprecision(3)
+                << lat << "\t\t" << bytes << "   \t\t" << bytes / time << "\t\t" << flops / time << "\t\t" << (stop - start) / 1000. / 1000.
+                << "\t\t" << bytes / time / NN << std::endl;
+    }
+  }
+
+  /**
+   * Time Dw.DhopEO of a single-precision domain-wall action (DomainWallFermionF)
+   * on an L^4 local volume per rank, with fifth-dimension extent Ls.
+   *
+   * Four kernel/communication configurations are run in turn. Each one performs
+   * 10 warm-up calls followed by 500 individually timed calls; the timings are
+   * reduced with time_statistics and logged as "mflop/s" figures.
+   *
+   * Side effects visible here: NN_global is set to the node count, and
+   * WilsonKernelsStatic::Comms, WilsonKernelsStatic::Opt and the communicator
+   * policy are overwritten (they are left at the values of the last case).
+   *
+   * NOTE(review): none of the grids created with SpaceTimeGrid::make* below is
+   * deleted in this function — presumably they are heap objects that leak on
+   * every call; confirm against the Grid API.
+   *
+   * @param Ls  extent of the fifth dimension
+   * @param L   local lattice extent per rank in each of the four directions
+   * @return    the largest mean flop rate (flops / timestat.mean) over the four cases
+   */
+  static double DWF(int Ls, int L)
+  {
+    RealD mass = 0.1;
+    RealD M5 = 1.8;
+
+    double mflops;
+    // 0 is used as a "not yet set" sentinel for best/worst (see the update below).
+    double mflops_best = 0;
+    double mflops_worst = 0;
+    std::vector<double> mflops_all;
+
+    ///////////////////////////////////////////////////////
+    // Set/Get the layout & grid size
+    ///////////////////////////////////////////////////////
+    int threads = GridThread::GetThreads();
+    Coordinate mpi = GridDefaultMpi();
+    assert(mpi.size() == 4);
+    Coordinate local({L, L, L, L});
+    // Global lattice = local extent times the number of ranks in each direction.
+    Coordinate latt4({local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]});
+
+    // TmpGrid is only used to query the rank and node counts.
+    GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4,
+                                                            GridDefaultSimd(Nd, vComplex::Nsimd()),
+                                                            GridDefaultMpi());
+    uint64_t NP = TmpGrid->RankCount();
+    uint64_t NN = TmpGrid->NodeCount();
+    NN_global = NN;
+    uint64_t SHM = NP / NN; // ranks per node
+
+    ///////// Welcome message ////////////
+    std::cout << GridLogMessage << "==================================================================================" << std::endl;
+    std::cout << GridLogMessage << "Benchmark DWF on " << L << "^4 local volume " << std::endl;
+    std::cout << GridLogMessage << "* Nc             : " << Nc << std::endl;
+    std::cout << GridLogMessage << "* Global volume  : " << GridCmdVectorIntToString(latt4) << std::endl;
+    std::cout << GridLogMessage << "* Ls             : " << Ls << std::endl;
+    std::cout << GridLogMessage << "* ranks          : " << NP << std::endl;
+    std::cout << GridLogMessage << "* nodes          : " << NN << std::endl;
+    std::cout << GridLogMessage << "* ranks/node     : " << SHM << std::endl;
+    std::cout << GridLogMessage << "* ranks geom     : " << GridCmdVectorIntToString(mpi) << std::endl;
+    std::cout << GridLogMessage << "* Using " << threads << " threads" << std::endl;
+    std::cout << GridLogMessage << "==================================================================================" << std::endl;
+
+    ///////// Lattice Init ////////////
+    // The working grids use the single-precision (vComplexF) SIMD layout.
+    GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
+    GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+    GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
+    GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
+
+    ///////// RNG Init ////////////
+    // Fixed seeds: the random gauge field and source are the same on every run.
+    std::vector<int> seeds4({1, 2, 3, 4});
+    std::vector<int> seeds5({5, 6, 7, 8});
+    GridParallelRNG RNG4(UGrid);
+    RNG4.SeedFixedIntegers(seeds4);
+    GridParallelRNG RNG5(FGrid);
+    RNG5.SeedFixedIntegers(seeds5);
+    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+    typedef DomainWallFermionF Action;
+    typedef typename Action::FermionField Fermion;
+    typedef LatticeGaugeFieldF Gauge;
+
+    ///////// Source preparation ////////////
+    Gauge Umu(UGrid);
+    SU<Nc>::HotConfiguration(RNG4, Umu);
+    Fermion src(FGrid);
+    random(RNG5, src);
+    // Only src_o and r_e are used by the timed calls; src_e is filled but not
+    // read, and r_o / r_eo are never touched in this function.
+    Fermion src_e(FrbGrid);
+    Fermion src_o(FrbGrid);
+    Fermion r_e(FrbGrid);
+    Fermion r_o(FrbGrid);
+    Fermion r_eo(FGrid);
+    Action Dw(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5);
+
+    {
+
+      pickCheckerboard(Even, src_e, src);
+      pickCheckerboard(Odd, src_o, src);
+
+      const int num_cases = 4;
+      // Column legend printed above the per-case results; one entry per row of
+      // Cases[] below, in the same order.
+      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
+
+      // {kernel optimisation, comms/compute overlap mode, communicator policy}
+      controls Cases[] = {
+          {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicyConcurrent},
+          {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicyConcurrent},
+          {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicySequential},
+          {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicySequential}};
+
+      for (int c = 0; c < num_cases; c++)
+      {
+
+        // Select the configuration through global/static state.
+        WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
+        WilsonKernelsStatic::Opt = Cases[c].Opt;
+        CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
+
+        std::cout << GridLogMessage << "==================================================================================" << std::endl;
+        if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric)
+          std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl;
+        if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
+          std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl;
+        if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
+          std::cout << GridLogMessage << "* Using sequential Comms/Compute" << std::endl;
+        std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
+        std::cout << GridLogMessage << "==================================================================================" << std::endl;
+
+        // Warm-up: the t0/t1 values taken around it are overwritten in the
+        // timed loop before being read, so the warm-up duration is not reported.
+        int nwarm = 10;
+        double t0 = usecond();
+        FGrid->Barrier();
+        for (int i = 0; i < nwarm; i++)
+        {
+          Dw.DhopEO(src_o, r_e, DaggerNo);
+        }
+        FGrid->Barrier();
+        double t1 = usecond();
+        uint64_t ncall = 500;
+
+        // ncall is a constant here; the broadcast (root argument 0) presumably
+        // dates from a version where it was estimated per rank — TODO confirm.
+        FGrid->Broadcast(0, &ncall, sizeof(ncall));
+
+        //	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
+        Dw.ZeroCounters();
+
+        // Time every call individually; no barrier is placed between calls.
+        time_statistics timestat;
+        std::vector<double> t_time(ncall);
+        for (uint64_t i = 0; i < ncall; i++)
+        {
+          t0 = usecond();
+          Dw.DhopEO(src_o, r_e, DaggerNo);
+          t1 = usecond();
+          t_time[i] = t1 - t0;
+        }
+        FGrid->Barrier();
+
+        // Global 5D volume: Ls times the product of the global 4D extents.
+        double volume = Ls;
+        for (int mu = 0; mu < Nd; mu++)
+          volume = volume * latt4[mu];
+
+          // Nc=3 gives
+          // 1344= 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8
+          // 1344 = Nc* (6+(Nc-1)*8)*2*Nd + Nd*Nc*2*2  + Nd*Nc*Ns*2
+          //	double flops=(1344.0*volume)/2;
+#if 0
+	double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + Nd*Nc*Ns  + Nd*Nc*Ns*2;
+#else
+        // Flops per site used for the rate (the active variant of the formula).
+        double fps = Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2;
+#endif
+        // Half the volume: presumably because DhopEO acts on one checkerboard.
+        double flops = (fps * volume) / 2;
+        double mf_hi, mf_lo, mf_err;
+
+        // Rates are flops divided by times in usecond() units; the log lines
+        // below label them "mflop/s".
+        timestat.statistics(t_time);
+        mf_hi = flops / timestat.min;
+        mf_lo = flops / timestat.max;
+        mf_err = flops / timestat.min * timestat.err / timestat.mean;
+
+        mflops = flops / timestat.mean;
+        mflops_all.push_back(mflops);
+        // Track best/worst across cases; the first case initialises both.
+        if (mflops_best == 0)
+          mflops_best = mflops;
+        if (mflops_worst == 0)
+          mflops_worst = mflops;
+        if (mflops > mflops_best)
+          mflops_best = mflops;
+        if (mflops < mflops_worst)
+          mflops_worst = mflops;
+
+        std::cout << GridLogMessage << "Deo FlopsPerSite is " << fps << std::endl;
+        std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s =   " << mflops << " (" << mf_err << ") " << mf_lo << "-" << mf_hi << std::endl;
+        std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s per rank   " << mflops / NP << std::endl;
+        std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s per node   " << mflops / NN << std::endl;
+      }
+
+      // Summary over the four cases, then the per-node rate of each case in
+      // the order given by fmt.
+      std::cout << GridLogMessage << "==================================================================================" << std::endl;
+      std::cout << GridLogMessage << L << "^4 x " << Ls << " Deo Best  mflop/s        =   " << mflops_best << " ; " << mflops_best / NN << " per node " << std::endl;
+      std::cout << GridLogMessage << L << "^4 x " << Ls << " Deo Worst mflop/s        =   " << mflops_worst << " ; " << mflops_worst / NN << " per node " << std::endl;
+      std::cout << GridLogMessage << fmt << std::endl;
+      std::cout << GridLogMessage;
+
+      for (int i = 0; i < mflops_all.size(); i++)
+      {
+        std::cout << mflops_all[i] / NN << " ; ";
+      }
+      std::cout << std::endl;
+      std::cout << GridLogMessage << "==================================================================================" << std::endl;
+    }
+    return mflops_best;
+  }
+
+  /**
+   * Time Ds.DhopEO of a single-precision improved staggered action
+   * (ImprovedStaggeredFermionF) on an L^4 local volume per rank.
+   *
+   * Four kernel/communication configurations are run in turn. Each one performs
+   * 10 warm-up calls followed by 500 individually timed calls; the timings are
+   * reduced with time_statistics and logged as "mflop/s" figures.
+   *
+   * Side effects visible here: NN_global is set to the node count, and
+   * StaggeredKernelsStatic::Comms, StaggeredKernelsStatic::Opt and the
+   * communicator policy are overwritten (left at the values of the last case).
+   *
+   * NOTE(review): none of the grids created with SpaceTimeGrid::make* below is
+   * deleted in this function — presumably they leak on every call; confirm
+   * against the Grid API.
+   *
+   * @param L  local lattice extent per rank in each of the four directions
+   * @return   the largest mean flop rate (flops / timestat.mean) over the four cases
+   */
+  static double Staggered(int L)
+  {
+    double mflops;
+    // 0 is used as a "not yet set" sentinel for best/worst (see the update below).
+    double mflops_best = 0;
+    double mflops_worst = 0;
+    std::vector<double> mflops_all;
+
+    ///////////////////////////////////////////////////////
+    // Set/Get the layout & grid size
+    ///////////////////////////////////////////////////////
+    int threads = GridThread::GetThreads();
+    Coordinate mpi = GridDefaultMpi();
+    assert(mpi.size() == 4);
+    Coordinate local({L, L, L, L});
+    // Global lattice = local extent times the number of ranks in each direction.
+    Coordinate latt4({local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]});
+
+    // TmpGrid is only used to query the rank and node counts.
+    GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4,
+                                                            GridDefaultSimd(Nd, vComplex::Nsimd()),
+                                                            GridDefaultMpi());
+    uint64_t NP = TmpGrid->RankCount();
+    uint64_t NN = TmpGrid->NodeCount();
+    NN_global = NN;
+    uint64_t SHM = NP / NN; // ranks per node
+
+    ///////// Welcome message ////////////
+    std::cout << GridLogMessage << "==================================================================================" << std::endl;
+    std::cout << GridLogMessage << "Benchmark ImprovedStaggered on " << L << "^4 local volume " << std::endl;
+    std::cout << GridLogMessage << "* Global volume  : " << GridCmdVectorIntToString(latt4) << std::endl;
+    std::cout << GridLogMessage << "* ranks          : " << NP << std::endl;
+    std::cout << GridLogMessage << "* nodes          : " << NN << std::endl;
+    std::cout << GridLogMessage << "* ranks/node     : " << SHM << std::endl;
+    std::cout << GridLogMessage << "* ranks geom     : " << GridCmdVectorIntToString(mpi) << std::endl;
+    std::cout << GridLogMessage << "* Using " << threads << " threads" << std::endl;
+    std::cout << GridLogMessage << "==================================================================================" << std::endl;
+
+    ///////// Lattice Init ////////////
+    // The working grids use the single-precision (vComplexF) SIMD layout.
+    GridCartesian *FGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
+    GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
+
+    ///////// RNG Init ////////////
+    // Fixed seeds: the random gauge field and source are the same on every run.
+    std::vector<int> seeds4({1, 2, 3, 4});
+    GridParallelRNG RNG4(FGrid);
+    RNG4.SeedFixedIntegers(seeds4);
+    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+    // Parameters handed to the action constructor below.
+    RealD mass = 0.1;
+    RealD c1 = 9.0 / 8.0;
+    RealD c2 = -1.0 / 24.0;
+    RealD u0 = 1.0;
+
+    typedef ImprovedStaggeredFermionF Action;
+    typedef typename Action::FermionField Fermion;
+    typedef LatticeGaugeFieldF Gauge;
+
+    Gauge Umu(FGrid);
+    SU<Nc>::HotConfiguration(RNG4, Umu);
+
+    typename Action::ImplParams params;
+    // The same gauge field Umu is passed for both gauge-field arguments.
+    Action Ds(Umu, Umu, *FGrid, *FrbGrid, mass, c1, c2, u0, params);
+
+    ///////// Source preparation ////////////
+    Fermion src(FGrid);
+    random(RNG4, src);
+    // Only src_o and r_e are used by the timed calls; src_e is filled but not
+    // read, and r_o / r_eo are never touched in this function.
+    Fermion src_e(FrbGrid);
+    Fermion src_o(FrbGrid);
+    Fermion r_e(FrbGrid);
+    Fermion r_o(FrbGrid);
+    Fermion r_eo(FGrid);
+
+    {
+
+      pickCheckerboard(Even, src_e, src);
+      pickCheckerboard(Odd, src_o, src);
+
+      const int num_cases = 4;
+      // Column legend printed above the per-case results; one entry per row of
+      // Cases[] below, in the same order.
+      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
+
+      // {kernel optimisation, comms/compute overlap mode, communicator policy}
+      controls Cases[] = {
+          {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicyConcurrent},
+          {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicyConcurrent},
+          {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicySequential},
+          {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicySequential}};
+
+      for (int c = 0; c < num_cases; c++)
+      {
+
+        // Select the configuration through global/static state.
+        StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap;
+        StaggeredKernelsStatic::Opt = Cases[c].Opt;
+        CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
+
+        std::cout << GridLogMessage << "==================================================================================" << std::endl;
+        if (StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric)
+          std::cout << GridLogMessage << "* Using GENERIC Nc StaggeredKernels" << std::endl;
+        if (StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute)
+          std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl;
+        if (StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsThenCompute)
+          std::cout << GridLogMessage << "* Using sequential Comms/Compute" << std::endl;
+        std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
+        std::cout << GridLogMessage << "==================================================================================" << std::endl;
+
+        // Warm-up: the t0/t1 values taken around it are overwritten in the
+        // timed loop before being read, so the warm-up duration is not reported.
+        int nwarm = 10;
+        double t0 = usecond();
+        FGrid->Barrier();
+        for (int i = 0; i < nwarm; i++)
+        {
+          Ds.DhopEO(src_o, r_e, DaggerNo);
+        }
+        FGrid->Barrier();
+        double t1 = usecond();
+        uint64_t ncall = 500;
+
+        // ncall is a constant here; the broadcast (root argument 0) presumably
+        // dates from a version where it was estimated per rank — TODO confirm.
+        FGrid->Broadcast(0, &ncall, sizeof(ncall));
+
+        //	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
+        Ds.ZeroCounters();
+
+        // Time every call individually; no barrier is placed between calls.
+        time_statistics timestat;
+        std::vector<double> t_time(ncall);
+        for (uint64_t i = 0; i < ncall; i++)
+        {
+          t0 = usecond();
+          Ds.DhopEO(src_o, r_e, DaggerNo);
+          t1 = usecond();
+          t_time[i] = t1 - t0;
+        }
+        FGrid->Barrier();
+
+        // Global 4D volume: product of the global extents.
+        double volume = 1;
+        for (int mu = 0; mu < Nd; mu++)
+          volume = volume * latt4[mu];
+        // 1146 is the hard-coded flop count per site used for the rate; the
+        // factor 1/2 is presumably because DhopEO acts on one checkerboard.
+        double flops = (1146.0 * volume) / 2;
+        double mf_hi, mf_lo, mf_err;
+
+        // Rates are flops divided by times in usecond() units; the log lines
+        // below label them "mflop/s".
+        timestat.statistics(t_time);
+        mf_hi = flops / timestat.min;
+        mf_lo = flops / timestat.max;
+        mf_err = flops / timestat.min * timestat.err / timestat.mean;
+
+        mflops = flops / timestat.mean;
+        mflops_all.push_back(mflops);
+        // Track best/worst across cases; the first case initialises both.
+        if (mflops_best == 0)
+          mflops_best = mflops;
+        if (mflops_worst == 0)
+          mflops_worst = mflops;
+        if (mflops > mflops_best)
+          mflops_best = mflops;
+        if (mflops < mflops_worst)
+          mflops_worst = mflops;
+
+        std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s =   " << mflops << " (" << mf_err << ") " << mf_lo << "-" << mf_hi << std::endl;
+        std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s per rank   " << mflops / NP << std::endl;
+        std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s per node   " << mflops / NN << std::endl;
+      }
+
+      // Summary over the four cases, then the per-node rate of each case in
+      // the order given by fmt.
+      std::cout << GridLogMessage << "==================================================================================" << std::endl;
+      std::cout << GridLogMessage << L << "^4  Deo Best  mflop/s        =   " << mflops_best << " ; " << mflops_best / NN << " per node " << std::endl;
+      std::cout << GridLogMessage << L << "^4  Deo Worst mflop/s        =   " << mflops_worst << " ; " << mflops_worst / NN << " per node " << std::endl;
+      std::cout << GridLogMessage << fmt << std::endl;
+      std::cout << GridLogMessage;
+
+      for (int i = 0; i < mflops_all.size(); i++)
+      {
+        std::cout << mflops_all[i] / NN << " ; ";
+      }
+      std::cout << std::endl;
+    }
+    std::cout << GridLogMessage << "==================================================================================" << std::endl;
+    return mflops_best;
+  }
+};
+
+/**
+ * Benchmark driver.
+ *
+ * Runs Benchmark::DWF with Ls = 1 (logged as "Wilson") and Ls = 12 (logged as
+ * "Domain wall"), then Benchmark::Staggered, for every local size in the list
+ * {8, 12, 16, 24, 32}. Afterwards it runs the memory, SU(4) and communications
+ * benchmarks and prints the total and per-node summary tables, followed by a
+ * "comparison point": the mean of the last two domain-wall results per node.
+ */
+int main(int argc, char **argv)
+{
+  Grid_init(&argc, &argv);
+
+  // One separator line of the log.
+  const auto rule = []()
+  {
+    std::cout << GridLogMessage << "==================================================================================" << std::endl;
+  };
+  // Section banner: separator, title, separator.
+  const auto banner = [&rule](const std::string &title)
+  {
+    rule();
+    std::cout << GridLogMessage << title << std::endl;
+    rule();
+  };
+
+  CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
+#ifdef KNL
+  LebesgueOrder::Block = std::vector<int>({8, 2, 2, 2});
+#else
+  LebesgueOrder::Block = std::vector<int>({2, 2, 2, 2});
+#endif
+  Benchmark::Decomposition();
+
+  // Switches for the optional benchmark sections (all enabled).
+  int runSu4 = 1;
+  int runMemory = 1;
+  int runComms = 1;
+
+  // Indices (into the size list) of the two entries averaged at the end.
+  int upper = 4;
+  std::vector<int> localSizes({8, 12, 16, 24, 32});
+  int lower = upper - 1;
+
+  std::vector<double> wilsonRate;
+  std::vector<double> dwfRate;
+  std::vector<double> staggeredRate;
+
+  int Ls = 1;
+  banner(" Wilson dslash 4D vectorised");
+  for (const int L : localSizes)
+  {
+    wilsonRate.push_back(Benchmark::DWF(Ls, L));
+  }
+
+  Ls = 12;
+  banner(" Domain wall dslash 4D vectorised");
+  for (const int L : localSizes)
+  {
+    dwfRate.push_back(Benchmark::DWF(Ls, L));
+  }
+
+  banner(" Improved Staggered dslash 4D vectorised");
+  for (const int L : localSizes)
+  {
+    staggeredRate.push_back(Benchmark::Staggered(L));
+  }
+
+  // Total rates, one row per local size.
+  rule();
+  std::cout << GridLogMessage << " Summary table Ls=" << Ls << std::endl;
+  rule();
+  std::cout << GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\t Staggered" << std::endl;
+  for (std::size_t row = 0; row < localSizes.size(); ++row)
+  {
+    std::cout << GridLogMessage << localSizes[row] << " \t\t " << wilsonRate[row] << " \t\t " << dwfRate[row] << " \t\t " << staggeredRate[row] << std::endl;
+  }
+  rule();
+
+  // Node count recorded by the last DWF/Staggered call.
+  int NN = NN_global;
+  if (runMemory)
+  {
+    banner(" Memory benchmark ");
+    Benchmark::Memory();
+  }
+
+  if (runSu4)
+  {
+    banner(" SU(4) benchmark ");
+    Benchmark::SU4();
+  }
+
+  if (runComms)
+  {
+    banner(" Communications benchmark ");
+    Benchmark::Comms();
+  }
+
+  // Same table as above, divided by the node count.
+  rule();
+  std::cout << GridLogMessage << " Per Node Summary table Ls=" << Ls << std::endl;
+  rule();
+  std::cout << GridLogMessage << " L \t\t Wilson\t\t DWF4\t\t Staggered " << std::endl;
+  for (std::size_t row = 0; row < localSizes.size(); ++row)
+  {
+    std::cout << GridLogMessage << localSizes[row] << " \t\t " << wilsonRate[row] / NN << " \t " << dwfRate[row] / NN << " \t " << staggeredRate[row] / NN << std::endl;
+  }
+  rule();
+
+  // Comparison point: mean of the two largest-volume domain-wall results per node.
+  rule();
+  std::cout << GridLogMessage << " Comparison point     result: " << 0.5 * (dwfRate[upper] + dwfRate[lower]) / NN << " Mflop/s per node" << std::endl;
+  std::cout << GridLogMessage << " Comparison point is 0.5*(" << dwfRate[upper] / NN << "+" << dwfRate[lower] / NN << ") " << std::endl;
+  std::cout << std::setprecision(3);
+  rule();
+
+  Grid_finalize();
+}
@@ -0,0 +1,251 @@
+/*
+Copyright © 2015 Peter Boyle <paboyle@ph.ed.ac.uk>
+Copyright © 2022 Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+
+/**
+ * Summary statistics of a set of timing samples.
+ *
+ * After statistics() has been called:
+ *   mean : arithmetic mean of the samples
+ *   err  : standard error of the mean, sqrt(sum((x - mean)^2) / (n * (n - 1)))
+ *   min  : smallest sample
+ *   max  : largest sample
+ */
+struct time_statistics{
+  double mean = 0.0;
+  double err = 0.0;
+  double min = 0.0;
+  double max = 0.0;
+
+  /**
+   * Compute mean, err, min and max from the samples in v.
+   *
+   * An empty input sets all four fields to 0. A single sample gives err = 0,
+   * since the standard error is undefined for n = 1 and would otherwise be 0/0.
+   *
+   * @param v  timing samples (taken by const reference; not modified)
+   */
+  void statistics(const std::vector<double> &v){
+    const std::size_t n = v.size();
+
+    // Nothing to reduce: avoid dividing by zero and dereferencing end().
+    if (n == 0) {
+      mean = err = min = max = 0.0;
+      return;
+    }
+
+    mean = std::accumulate(v.begin(), v.end(), 0.0) / n;
+
+    // Sum of squared deviations from the mean. The mean is captured by value
+    // explicitly instead of capturing `this` implicitly through [=].
+    const double centre = mean;
+    const double sq_sum = std::accumulate(v.begin(), v.end(), 0.0,
+                                          [centre](double acc, double x) { return acc + (x - centre) * (x - centre); });
+    err = (n > 1) ? std::sqrt(sq_sum / (static_cast<double>(n) * (n - 1))) : 0.0;
+
+    const auto extrema = std::minmax_element(v.begin(), v.end());
+    min = *extrema.first;
+    max = *extrema.second;
+  }
+};
+
+// Print the column titles of the bandwidth tables: local size, Ls, message
+// size in bytes, and the unidirectional / bidirectional rates.
+void header()
+{
+  std::cout << GridLogMessage << " L  " << "\t" << " Ls  " << "\t";
+  std::cout << std::setw(11) << "bytes\t\t" << "MB/s uni" << "\t" << "MB/s bidi" << std::endl;
+}
+
+/**
+ * Halo-exchange bandwidth benchmark.
+ *
+ * Two passes are made over local sizes lat = 8, 12, ..., 32 with Ls = 8: the
+ * first exchanges buffers held in std::vector storage, the second exchanges
+ * buffers obtained from acceleratorAllocDevice. In each pass, for every
+ * direction mu that is split over more than one rank, Nloop iterations of two
+ * SendToRecvFrom calls are timed as a whole and one table row is printed with
+ * the message size and the derived uni/bidirectional rates.
+ */
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
+  Coordinate mpi_layout  = GridDefaultMpi();
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
+  int Nloop=250;
+  // nmu counts the directions split over several ranks; it is not used afterwards.
+  int nmu=0;
+  int maxlat=32;
+  for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
+
+  std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
+  // t_time is allocated but never filled or read in this function.
+  std::vector<double> t_time(Nloop);
+  //  time_statistics timestat;
+
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking sequential halo exchange from host memory "<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  header();
+
+  for(int lat=8;lat<=maxlat;lat+=4){
+    // This loop body runs once, with Ls = 8.
+    for(int Ls=8;Ls<=8;Ls*=2){
+
+      // Global lattice = local extent times the number of ranks in each direction.
+      Coordinate latt_size  ({lat*mpi_layout[0],
+	                      lat*mpi_layout[1],
+      			      lat*mpi_layout[2],
+      			      lat*mpi_layout[3]});
+
+      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+      RealD Nrank = Grid._Nprocessors;
+      RealD Nnode = Grid.NodeCount();
+      RealD ppn = Nrank/Nnode; // ranks per node
+
+      // Send (xbuf) and receive (rbuf) buffers: indices 0-3 are used for the
+      // first exchange in direction mu, indices 4-7 for the second.
+      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8);
+      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8);
+
+      for(int mu=0;mu<8;mu++){
+	xbuf[mu].resize(lat*lat*lat*Ls);
+	rbuf[mu].resize(lat*lat*lat*Ls);
+      }
+      // Size of one message: lat^3 * Ls half-spinors.
+      uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+
+      int ncomm;
+
+      for(int mu=0;mu<4;mu++){
+	// Only directions split over more than one rank are benchmarked.
+	if (mpi_layout[mu]>1 ) {
+	double start=usecond();
+	for(int i=0;i<Nloop;i++){
+
+	  // ncomm is reset and incremented once per iteration, so it is 1 after the loop.
+	  ncomm=0;
+	
+	  
+	    ncomm++;
+	    int comm_proc=1;
+	    int xmit_to_rank;
+	    int recv_from_rank;
+	    
+	    // First exchange: shift by 1 rank in direction mu.
+	    // (The requests vector is declared but not used.)
+	    {
+	      std::vector<CommsRequest_t> requests;
+	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+	      Grid.SendToRecvFrom((void *)&xbuf[mu][0],
+				  xmit_to_rank,
+				  (void *)&rbuf[mu][0],
+				  recv_from_rank,
+				  bytes);
+	    }
+
+	    // Second exchange: shift by mpi_layout[mu]-1 ranks, presumably the
+	    // neighbour in the opposite direction on a periodic rank grid.
+	    comm_proc = mpi_layout[mu]-1;
+	    {
+	      std::vector<CommsRequest_t> requests;
+	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+	      Grid.SendToRecvFrom((void *)&xbuf[mu+4][0],
+				  xmit_to_rank,
+				  (void *)&rbuf[mu+4][0],
+				  recv_from_rank,
+				  bytes);
+	    }
+	}
+	Grid.Barrier();
+	double stop=usecond();
+	// Average time per iteration, in usecond() units.
+        double mean=(stop-start)/Nloop;      
+      // Bytes per iteration: message size times ranks per node, times the two
+      // exchanges per iteration; the received amount is taken equal to the sent one.
+      double dbytes    = bytes*ppn;
+      double xbytes    = dbytes*2.0*ncomm;
+      double rbytes    = xbytes;
+      double bidibytes = xbytes+rbytes;
+
+      // One table row; the rates are bytes per usecond() unit, which header() labels MB/s.
+      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
+               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)<<" "
+               <<std::right<< xbytes/mean<<"  "
+               << "\t\t"<<std::setw(7)<< bidibytes/mean<< std::endl;
+
+
+	
+	}
+      }
+
+
+      
+    }
+  }
+
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking sequential halo exchange from GPU memory "<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  header();
+
+  // Second pass: same measurement, with buffers from acceleratorAllocDevice.
+  for(int lat=8;lat<=maxlat;lat+=4){
+    // This loop body runs once, with Ls = 8.
+    for(int Ls=8;Ls<=8;Ls*=2){
+
+      Coordinate latt_size  ({lat*mpi_layout[0],
+	                      lat*mpi_layout[1],
+      			      lat*mpi_layout[2],
+      			      lat*mpi_layout[3]});
+
+      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+      RealD Nrank = Grid._Nprocessors;
+      RealD Nnode = Grid.NodeCount();
+      RealD ppn = Nrank/Nnode; // ranks per node
+
+
+      // Raw buffer pointers: indices 0-3 for the first exchange, 4-7 for the second.
+      std::vector<HalfSpinColourVectorD *> xbuf(8);
+      std::vector<HalfSpinColourVectorD *> rbuf(8);
+
+      uint64_t bytes = lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+      for(int d=0;d<8;d++){
+	xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
+	rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
+      }
+
+      int ncomm;
+
+      for(int mu=0;mu<4;mu++){
+	// Only directions split over more than one rank are benchmarked.
+	if (mpi_layout[mu]>1 ) {
+	double start=usecond();
+	for(int i=0;i<Nloop;i++){
+
+	  // ncomm is reset and incremented once per iteration, so it is 1 after the loop.
+	  ncomm=0;
+	
+	  
+	    ncomm++;
+	    int comm_proc=1;
+	    int xmit_to_rank;
+	    int recv_from_rank;
+	    
+	    // First exchange: shift by 1 rank in direction mu. &xbuf[mu][0] only
+	    // forms the address of the first element, i.e. the allocated pointer itself.
+	    {
+	      std::vector<CommsRequest_t> requests;
+	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+	      Grid.SendToRecvFrom((void *)&xbuf[mu][0],
+				  xmit_to_rank,
+				  (void *)&rbuf[mu][0],
+				  recv_from_rank,
+				  bytes);
+	    }
+
+	    // Second exchange: shift by mpi_layout[mu]-1 ranks, presumably the
+	    // neighbour in the opposite direction on a periodic rank grid.
+	    comm_proc = mpi_layout[mu]-1;
+	    {
+	      std::vector<CommsRequest_t> requests;
+	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+	      Grid.SendToRecvFrom((void *)&xbuf[mu+4][0],
+				  xmit_to_rank,
+				  (void *)&rbuf[mu+4][0],
+				  recv_from_rank,
+				  bytes);
+	    }
+	}
+	Grid.Barrier();
+	double stop=usecond();
+	// Average time per iteration, in usecond() units.
+        double mean=(stop-start)/Nloop;      
+      // Bytes per iteration: message size times ranks per node, times the two
+      // exchanges per iteration; the received amount is taken equal to the sent one.
+      double dbytes    = bytes*ppn;
+      double xbytes    = dbytes*2.0*ncomm;
+      double rbytes    = xbytes;
+      double bidibytes = xbytes+rbytes;
+
+      // One table row; the rates are bytes per usecond() unit, which header() labels MB/s.
+      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
+               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)<<" "
+               <<std::right<< xbytes/mean<<"  "
+               << "\t\t"<<std::setw(7)<< bidibytes/mean<< std::endl;
+
+
+	
+	}
+      }
+
+      // Release the buffers allocated for this lattice size.
+      for(int d=0;d<8;d++){
+	acceleratorFreeDevice(xbuf[d]);
+	acceleratorFreeDevice(rbuf[d]);
+      }
+
+      
+    }
+  }
+
+
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+
+  Grid_finalize();
+}
@@ -0,0 +1,425 @@
+/*
+Copyright © 2015 Peter Boyle <paboyle@ph.ed.ac.uk>
+Copyright © 2022 Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <Grid/Grid.h>
+#ifdef GRID_CUDA
+#define CUDA_PROFILE
+#endif
+
+#ifdef CUDA_PROFILE
+#include <cuda_profiler_api.h>
+#endif
+
+using namespace std;
+using namespace Grid;
+
+// Minimal aggregate wrapping a single value of the template type.
+template <typename T>
+struct scal
+{
+  T internal;
+};
+
+// Gamma matrices in X, Y, Z, T order; main() indexes this table with the
+// direction mu.
+Gamma::Algebra Gmu[] = {Gamma::Algebra::GammaX, Gamma::Algebra::GammaY,
+                        Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT};
+
+// Benchmark and self-check of the single-precision domain-wall hopping term.
+//
+// Steps, in order:
+//   1. initialise Grid, read the optional "-Ls <n>" argument, build the grids;
+//   2. draw a random normalised 5D source and a random gauge field;
+//   3. build a Cshift-based reference for the hopping term;
+//   4. time Dw.Dhop over ncall calls and compare with the reference
+//      (the program exits with -1 when norm2 of the difference exceeds 1e-4);
+//   5. build the daggered reference and print the difference to Dw.Dhop(...,1);
+//   6. time Dw.DhopEO and check that DhopEO + DhopOE reproduces Dhop.
+//
+// Command line: the usual Grid options plus "-Ls <n>" (default 16).
+int main(int argc, char **argv)
+{
+  Grid_init(&argc, &argv);
+
+  // NOTE(review): this value is not used anywhere below.
+  int threads = GridThread::GetThreads();
+
+  Coordinate latt4 = GridDefaultLatt();
+  // Extent of the fifth dimension; "-Ls <n>" overrides the default.
+  int Ls = 16;
+  for (int i = 0; i < argc; i++)
+  {
+    // Only read the value when an argument really follows the flag:
+    // argv[argc] is a null pointer and must not be converted to a string.
+    if ((std::string(argv[i]) == "-Ls") && (i + 1 < argc))
+    {
+      std::stringstream ss(argv[i + 1]);
+      ss >> Ls;
+    }
+  }
+
+  GridLogLayout();
+
+  // Flop count attributed to one 5D site per Dhop call (used for the rates below).
+  long unsigned int single_site_flops = 8 * Nc * (7 + 16 * Nc);
+
+  GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
+  GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
+  GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
+
+  // NOTE(review): the four s-innermost grids below are created but not
+  // referenced again in this function.
+  std::cout << GridLogMessage << "Making s innermost grids" << std::endl;
+  GridCartesian *sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(), GridDefaultMpi());
+  GridRedBlackCartesian *sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
+  GridCartesian *sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls, UGrid);
+  GridRedBlackCartesian *sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls, UGrid);
+
+  std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
+  GridParallelRNG RNG4(UGrid);
+  RNG4.SeedUniqueString(std::string("The 4D RNG"));
+  std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
+  GridParallelRNG RNG5(FGrid);
+  RNG5.SeedUniqueString(std::string("The 5D RNG"));
+  std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+  // Random source, rescaled to unit norm (the disabled branch is a point source).
+  LatticeFermionF src(FGrid);
+  random(RNG5, src);
+#if 0
+  src = Zero();
+  {
+    Coordinate origin({0,0,0,latt4[2]-1,0});
+    SpinColourVectorF tmp;
+    tmp=Zero();
+    tmp()(0)(0)=Complex(-2.0,0.0);
+    std::cout << " source site 0 " << tmp<<std::endl;
+    pokeSite(tmp,src,origin);
+  }
+#else
+  RealD N2 = 1.0 / ::sqrt(norm2(src));
+  src = src * N2;
+#endif
+
+  LatticeFermionF result(FGrid);
+  result = Zero();
+  LatticeFermionF ref(FGrid);
+  ref = Zero();
+  LatticeFermionF tmp(FGrid);
+  LatticeFermionF err(FGrid);
+
+  std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
+  LatticeGaugeFieldF Umu(UGrid);
+  SU<Nc>::HotConfiguration(RNG4, Umu);
+  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
+#if 0
+  Umu=1.0;
+  for(int mu=0;mu<Nd;mu++){
+    LatticeColourMatrixF ttmp(UGrid);
+    ttmp = PeekIndex<LorentzIndex>(Umu,mu);
+    //    if (mu !=2 ) ttmp = 0;
+    //    ttmp = ttmp* pow(10.0,mu);
+    PokeIndex<LorentzIndex>(Umu,ttmp,mu);
+  }
+  std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
+#endif
+
+  ////////////////////////////////////
+  // Naive wilson implementation
+  ////////////////////////////////////
+  // replicate across fifth dimension
+  //  LatticeGaugeFieldF Umu5d(FGrid);
+  // One colour-matrix field per direction, extracted from Umu.
+  std::vector<LatticeColourMatrixF> U(4, UGrid);
+  for (int mu = 0; mu < Nd; mu++)
+  {
+    U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
+  }
+  std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
+
+  if (1)
+  {
+    ref = Zero();
+    for (int mu = 0; mu < Nd; mu++)
+    {
+
+      // Forward hop: shift the source, then multiply by the 4D link; the
+      // same link U_v[ss] is applied to all Ls entries Ls * ss + s.
+      tmp = Cshift(src, mu + 1, 1);
+      {
+        autoView(tmp_v, tmp, CpuWrite);
+        autoView(U_v, U[mu], CpuRead);
+        for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
+        {
+          for (int s = 0; s < Ls; s++)
+          {
+            tmp_v[Ls * ss + s] = U_v[ss] * tmp_v[Ls * ss + s];
+          }
+        }
+      }
+      ref = ref + tmp - Gamma(Gmu[mu]) * tmp;
+
+      // Backward hop: multiply by the adjoint link first, then shift back.
+      {
+        autoView(tmp_v, tmp, CpuWrite);
+        autoView(U_v, U[mu], CpuRead);
+        autoView(src_v, src, CpuRead);
+        for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
+        {
+          for (int s = 0; s < Ls; s++)
+          {
+            tmp_v[Ls * ss + s] = adj(U_v[ss]) * src_v[Ls * ss + s];
+          }
+        }
+      }
+      tmp = Cshift(tmp, mu + 1, -1);
+      ref = ref + tmp + Gamma(Gmu[mu]) * tmp;
+    }
+    ref = -0.5 * ref;
+  }
+
+  RealD mass = 0.1;
+  RealD M5 = 1.8;
+
+  // Rank and node counts used to normalise the reported rates.
+  RealD NP = UGrid->_Nprocessors;
+  RealD NN = UGrid->NodeCount();
+
+  std::cout << GridLogMessage << "*****************************************************************" << std::endl;
+  std::cout << GridLogMessage << "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" << std::endl;
+  std::cout << GridLogMessage << "*****************************************************************" << std::endl;
+  std::cout << GridLogMessage << "*****************************************************************" << std::endl;
+  std::cout << GridLogMessage << "* Benchmarking DomainWallFermionR::Dhop                  " << std::endl;
+  std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd() << std::endl;
+  std::cout << GridLogMessage << "* VComplexF size is " << sizeof(vComplexF) << " B" << std::endl;
+  if (sizeof(RealF) == 4)
+    std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
+  if (sizeof(RealF) == 8)
+    std::cout << GridLogMessage << "* DOUBLE precision " << std::endl;
+#ifdef GRID_OMP
+  if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
+    std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl;
+  if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
+    std::cout << GridLogMessage << "* Using sequential comms compute" << std::endl;
+#endif
+  if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric)
+    std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl;
+  if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll)
+    std::cout << GridLogMessage << "* Using Nc=3       WilsonKernels" << std::endl;
+  if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm)
+    std::cout << GridLogMessage << "* Using Asm Nc=3   WilsonKernels" << std::endl;
+  std::cout << GridLogMessage << "*****************************************************************" << std::endl;
+
+  DomainWallFermionF Dw(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5);
+  // Number of timed kernel calls in each benchmark loop.
+  int ncall = 300;
+
+  if (1)
+  {
+    FGrid->Barrier();
+    Dw.ZeroCounters();
+    // One untimed call before the measurement starts.
+    Dw.Dhop(src, result, 0);
+    std::cout << GridLogMessage << "Called warmup" << std::endl;
+    double t0 = usecond();
+    for (int i = 0; i < ncall; i++)
+    {
+      __SSC_START;
+      Dw.Dhop(src, result, 0);
+      __SSC_STOP;
+    }
+    double t1 = usecond();
+    FGrid->Barrier();
+
+    // Global 5D volume: Ls times the product of the 4D lattice extents.
+    double volume = Ls;
+    for (int mu = 0; mu < Nd; mu++)
+      volume = volume * latt4[mu];
+    double flops = single_site_flops * volume * ncall;
+
+    // NOTE(review): these use vComplex while the fields are the F variants;
+    // confirm the per-element size is the intended one for the traffic estimates.
+    auto nsimd = vComplex::Nsimd();
+    auto simdwidth = sizeof(vComplex);
+
+    // RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
+    double data_rf = volume * ((2 * Nd + 1) * Nd * Nc + 2 * Nd * Nc * Nc) * simdwidth / nsimd * ncall / (1024. * 1024. * 1024.);
+
+    // mem: Nd Wilson * Ls, Nd gauge, Nc colors
+    double data_mem = (volume * (2 * Nd + 1) * Nd * Nc + (volume / Ls) * 2 * Nd * Nc * Nc) * simdwidth / nsimd * ncall / (1024. * 1024. * 1024.);
+
+    // t1 - t0 is in microseconds, so flops / (t1 - t0) is already in Mflop/s
+    // and GiB / us needs the factor 1e6 to become GiB/s.
+    std::cout << GridLogMessage << "Called Dw " << ncall << " times in " << t1 - t0 << " us" << std::endl;
+    //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
+    //    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
+    std::cout << GridLogMessage << "mflop/s =   " << flops / (t1 - t0) << std::endl;
+    std::cout << GridLogMessage << "mflop/s per rank =  " << flops / (t1 - t0) / NP << std::endl;
+    std::cout << GridLogMessage << "mflop/s per node =  " << flops / (t1 - t0) / NN << std::endl;
+    std::cout << GridLogMessage << "RF  GiB/s (base 2) =   " << 1000000. * data_rf / ((t1 - t0)) << std::endl;
+    std::cout << GridLogMessage << "mem GiB/s (base 2) =   " << 1000000. * data_mem / ((t1 - t0)) << std::endl;
+    err = ref - result;
+    std::cout << GridLogMessage << "norm diff   " << norm2(err) << std::endl;
+    // exit(0);
+
+    // A mismatch with the reference is fatal: report it and exit with -1.
+    if ((norm2(err) > 1.0e-4))
+    {
+      /*
+      std::cout << "RESULT\n " << result<<std::endl;
+      std::cout << "REF   \n " << ref   <<std::endl;
+      std::cout << "ERR   \n " << err   <<std::endl;
+      */
+      std::cout << GridLogMessage << "WRONG RESULT" << std::endl;
+      FGrid->Barrier();
+      exit(-1);
+    }
+    assert(norm2(err) < 1.0e-4);
+    Dw.Report();
+  }
+
+  if (1)
+  { // Naive wilson dag implementation
+    ref = Zero();
+    for (int mu = 0; mu < Nd; mu++)
+    {
+
+      //    ref =  src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
+      tmp = Cshift(src, mu + 1, 1);
+      {
+        autoView(ref_v, ref, CpuWrite);
+        autoView(tmp_v, tmp, CpuRead);
+        autoView(U_v, U[mu], CpuRead);
+        for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
+        {
+          for (int s = 0; s < Ls; s++)
+          {
+            int i = s + Ls * ss;
+            ref_v[i] += U_v[ss] * (tmp_v[i] + Gamma(Gmu[mu]) * tmp_v[i]);
+          }
+        }
+      }
+
+      {
+        autoView(tmp_v, tmp, CpuWrite);
+        autoView(U_v, U[mu], CpuRead);
+        autoView(src_v, src, CpuRead);
+        for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
+        {
+          for (int s = 0; s < Ls; s++)
+          {
+            tmp_v[Ls * ss + s] = adj(U_v[ss]) * src_v[Ls * ss + s];
+          }
+        }
+      }
+      //      tmp =adj(U[mu])*src;
+      tmp = Cshift(tmp, mu + 1, -1);
+      {
+        autoView(ref_v, ref, CpuWrite);
+        autoView(tmp_v, tmp, CpuRead);
+        for (int i = 0; i < ref_v.size(); i++)
+        {
+          ref_v[i] += tmp_v[i] - Gamma(Gmu[mu]) * tmp_v[i];
+        }
+      }
+    }
+    ref = -0.5 * ref;
+  }
+  //  dump=1;
+  // The daggered comparison is only printed; no check below aborts on it.
+  Dw.Dhop(src, result, 1);
+  std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
+  std::cout << GridLogMessage << "Called DwDag" << std::endl;
+  std::cout << GridLogMessage << "norm dag result " << norm2(result) << std::endl;
+  std::cout << GridLogMessage << "norm dag ref    " << norm2(ref) << std::endl;
+  err = ref - result;
+  std::cout << GridLogMessage << "norm dag diff   " << norm2(err) << std::endl;
+  if ((norm2(err) > 1.0e-4))
+  {
+    /*
+      std::cout<< "DAG RESULT\n "  <<ref     << std::endl;
+      std::cout<< "DAG sRESULT\n " <<result  << std::endl;
+      std::cout<< "DAG ERR   \n "  << err    <<std::endl;
+    */
+  }
+  LatticeFermionF src_e(FrbGrid);
+  LatticeFermionF src_o(FrbGrid);
+  LatticeFermionF r_e(FrbGrid);
+  LatticeFermionF r_o(FrbGrid);
+  LatticeFermionF r_eo(FGrid);
+
+  std::cout << GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec" << std::endl;
+  pickCheckerboard(Even, src_e, src);
+  pickCheckerboard(Odd, src_o, src);
+
+  std::cout << GridLogMessage << "src_e" << norm2(src_e) << std::endl;
+  std::cout << GridLogMessage << "src_o" << norm2(src_o) << std::endl;
+
+  // S-direction is INNERMOST and takes no part in the parity.
+  std::cout << GridLogMessage << "*********************************************************" << std::endl;
+  std::cout << GridLogMessage << "* Benchmarking DomainWallFermionF::DhopEO                " << std::endl;
+  std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd() << std::endl;
+  if (sizeof(RealF) == 4)
+    std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
+  if (sizeof(RealF) == 8)
+    std::cout << GridLogMessage << "* DOUBLE precision " << std::endl;
+#ifdef GRID_OMP
+  if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
+    std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl;
+  if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
+    std::cout << GridLogMessage << "* Using sequential comms compute" << std::endl;
+#endif
+  if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric)
+    std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl;
+  if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll)
+    std::cout << GridLogMessage << "* Using Nc=3       WilsonKernels" << std::endl;
+  if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm)
+    std::cout << GridLogMessage << "* Using Asm Nc=3   WilsonKernels" << std::endl;
+  std::cout << GridLogMessage << "*********************************************************" << std::endl;
+  {
+    Dw.ZeroCounters();
+    FGrid->Barrier();
+    // One untimed call before the measurement starts.
+    Dw.DhopEO(src_o, r_e, DaggerNo);
+    double t0 = usecond();
+    for (int i = 0; i < ncall; i++)
+    {
+      // With CUDA profiling enabled, the profiler is started at call 10 and
+      // stopped after call 20.
+#ifdef CUDA_PROFILE
+      if (i == 10)
+        cudaProfilerStart();
+#endif
+      Dw.DhopEO(src_o, r_e, DaggerNo);
+#ifdef CUDA_PROFILE
+      if (i == 20)
+        cudaProfilerStop();
+#endif
+    }
+    double t1 = usecond();
+    FGrid->Barrier();
+
+    double volume = Ls;
+    for (int mu = 0; mu < Nd; mu++)
+      volume = volume * latt4[mu];
+    // Half the full-volume flop count is attributed to one DhopEO call.
+    double flops = (single_site_flops * volume * ncall) / 2.0;
+
+    std::cout << GridLogMessage << "Deo mflop/s =   " << flops / (t1 - t0) << std::endl;
+    std::cout << GridLogMessage << "Deo mflop/s per rank   " << flops / (t1 - t0) / NP << std::endl;
+    std::cout << GridLogMessage << "Deo mflop/s per node   " << flops / (t1 - t0) / NN << std::endl;
+    Dw.Report();
+  }
+  // Consistency check: the two checkerboarded results, merged into one
+  // full-lattice field, must agree with the unpreconditioned Dhop.
+  Dw.DhopEO(src_o, r_e, DaggerNo);
+  Dw.DhopOE(src_e, r_o, DaggerNo);
+  Dw.Dhop(src, result, DaggerNo);
+
+  std::cout << GridLogMessage << "r_e" << norm2(r_e) << std::endl;
+  std::cout << GridLogMessage << "r_o" << norm2(r_o) << std::endl;
+  std::cout << GridLogMessage << "res" << norm2(result) << std::endl;
+
+  setCheckerboard(r_eo, r_o);
+  setCheckerboard(r_eo, r_e);
+
+  err = r_eo - result;
+  std::cout << GridLogMessage << "norm diff   " << norm2(err) << std::endl;
+  if ((norm2(err) > 1.0e-4))
+  {
+    /*
+  std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
+  std::cout<< "Deo REF\n " <<result  << std::endl;
+  std::cout<< "Deo ERR   \n " << err <<std::endl;
+    */
+  }
+
+  // src_e / src_o are reused here to hold the even and odd parts of the error.
+  pickCheckerboard(Even, src_e, err);
+  pickCheckerboard(Odd, src_o, err);
+  std::cout << GridLogMessage << "norm diff even  " << norm2(src_e) << std::endl;
+  std::cout << GridLogMessage << "norm diff odd   " << norm2(src_o) << std::endl;
+
+  assert(norm2(src_e) < 1.0e-4);
+  assert(norm2(src_o) < 1.0e-4);
+  Grid_finalize();
+  exit(0);
+}
@@ -0,0 +1,339 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+                            NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
@@ -0,0 +1,16 @@
+# Automake rules for the lattice-bench benchmark programs.
+ACLOCAL_AMFLAGS = -I .buildutils/m4
+
+# One executable per benchmark, installed to bindir.
+bin_PROGRAMS =                \
+  Benchmark_comms_host_device \
+  Benchmark_dwf_fp32          \
+  Benchmark_ITT               \
+  Benchmark_IO
+
+Benchmark_comms_host_device_SOURCES = Benchmark_comms_host_device.cpp
+Benchmark_dwf_fp32_SOURCES          = Benchmark_dwf_fp32.cpp
+Benchmark_ITT_SOURCES               = Benchmark_ITT.cpp
+# Benchmark_IO.cpp includes Benchmark_IO.hpp; listing the header here makes
+# sure it is shipped by "make dist".
+Benchmark_IO_SOURCES                = Benchmark_IO.cpp Benchmark_IO.hpp
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+mkdir -p .buildutils/m4
+autoreconf -fvi
@@ -0,0 +1,73 @@
+# Autoconf configuration for the lattice-bench benchmark suite.
+# Locates a Grid installation through grid-config, appends the compiler and
+# linker flags it reports, and verifies that a minimal Grid program links.
+AC_PREREQ([2.69])
+AC_INIT([lattice-bench], [0.1], [antonin.portelli@me.com])
+AC_CANONICAL_BUILD
+AC_CANONICAL_HOST
+AC_CANONICAL_TARGET
+AC_CONFIG_SRCDIR([Benchmark_ITT.cpp])
+AC_CONFIG_MACRO_DIR([.buildutils/m4])
+AC_CONFIG_HEADERS([config.h])
+AM_INIT_AUTOMAKE([-Wall -Werror foreign])
+m4_ifdef([AM_SILENT_RULES],[AM_SILENT_RULES([yes])])
+
+# Checks for programs.
+AC_PROG_CXX
+AC_PROG_CC
+AC_PROG_RANLIB
+AM_PROG_AR
+AC_LANG([C++])
+
+# Optional Grid install prefix. A bare --with-grid carries no prefix, so it
+# falls back to the PATH lookup below instead of injecting bogus yes/bin,
+# -Iyes/include and -Lyes/lib entries; --without-grid is rejected because
+# Grid is mandatory.
+AC_ARG_WITH([grid],
+    [AS_HELP_STRING([--with-grid=<prefix>],
+    [try this for a non-standard install prefix of Grid])],
+    [AS_CASE([$with_grid],
+        [yes|""], [:],
+        [no], [AC_MSG_ERROR([Grid is required; --without-grid is not supported])],
+        [PATH="$with_grid/bin$PATH_SEPARATOR$PATH"
+         CXXFLAGS="$CXXFLAGS -I$with_grid/include"
+         LDFLAGS="$LDFLAGS -L$with_grid/lib"])])
+
+# grid-config must be reachable through PATH, possibly extended above.
+AC_CHECK_PROG([GRIDCONF],[grid-config],[yes])
+if test x"$GRIDCONF" != x"yes" ; then
+    AC_MSG_ERROR([grid-config not found])
+fi
+
+# Append the flags reported by grid-config, then prepend any AM_* flags.
+CXXFLAGS="$CXXFLAGS `grid-config --cxxflags`"
+LDFLAGS="$LDFLAGS `grid-config --ldflags`"
+CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS"
+LDFLAGS="$AM_LDFLAGS $LDFLAGS"
+LIBS=" -lGrid $LIBS `grid-config --libs`"
+
+# Sanity check: abort now if the collected flags cannot link against Grid.
+AC_MSG_CHECKING([that a minimal Grid program compiles])
+AC_LINK_IFELSE(
+    [AC_LANG_SOURCE([[
+    #include <Grid/Grid.h>
+
+    using namespace Grid;
+
+    int main(int argc, char *argv[])
+    {
+        Grid_init(&argc, &argv);
+        Grid_finalize();
+
+        return 0;
+    }
+
+    ]])],
+    [AC_MSG_RESULT([yes])],
+    [AC_MSG_RESULT([no])
+     AC_MSG_ERROR([Could not compile a minimal Grid program])])
+
+AC_SUBST([AM_CXXFLAGS])
+AC_SUBST([AM_LDFLAGS])
+AC_CONFIG_FILES([Makefile])
+AC_OUTPUT