Memory benchmarks cleanup and fix

2023-01-28 14:18:48 +00:00
parent 78c464d1d8
commit 77c75ea5b8
1 changed file with 34 additions and 50 deletions
--- a/Grid/Benchmark_Grid.cpp
+++ b/Grid/Benchmark_Grid.cpp
@@ -2,7 +2,7 @@
 Copyright © 2015 Peter Boyle <paboyle@ph.ed.ac.uk>
 Copyright © 2022 Antonin Portelli <antonin.portelli@me.com>
-This is a refactoring of Benchmark_ITT.cpp from Grid
+This is a fork of Benchmark_ITT.cpp from Grid
 This program is free software; you can redistribute it and/or
 modify it under the terms of the GNU General Public License
@@ -85,10 +85,8 @@ class Benchmark
    int threads = GridThread::GetThreads();
    grid_big_sep();
    std::cout << GridLogMessage << "= Grid is setup to use " << threads << " threads"
              << std::endl;
    grid_big_sep();
    std::cout << GridLogMessage << "Grid Default Decomposition patterns\n";
    grid_small_sep();
    std::cout << GridLogMessage << "\tOpenMP threads : " << GridThread::GetThreads()
              << std::endl;
    std::cout << GridLogMessage
@@ -233,24 +231,15 @@ class Benchmark
    Coordinate simd_layout = GridDefaultSimd(Nd, vReal::Nsimd());
    Coordinate mpi_layout = GridDefaultMpi();
-    grid_big_sep();
+    std::cout << GridLogMessage << "Benchmarking a*x + y bandwidth" << std::endl;
-    std::cout << GridLogMessage << "= Benchmarking a*x + y bandwidth" << std::endl;
+    grid_small_sep();
-    grid_big_sep();
+    grid_printf("%5s %15s %15s %15s %15s\n", "L", "size (MB/node)", "time (usec)",
-    std::cout << GridLogMessage << "  L  "
+                "GB/s/node", "Gflop/s/node");
              << "\t\t"
              << "bytes"
              << "\t\t\t"
              << "GB/s"
              << "\t\t"
              << "Gflop/s"
              << "\t\t seconds"
              << "\t\tGB/s / node" << std::endl;
    //    uint64_t NP;
    uint64_t NN;
-
+    uint64_t lmax = 64;
-    uint64_t lmax = 32;
+#define NLOOP (200 * lmax * lmax * lmax / lat / lat / lat)
-#define NLOOP (1000 * lmax * lmax * lmax * lmax / lat / lat / lat / lat)
+#define NWARMUP 50
    GridSerialRNG sRNG;
    sRNG.SeedFixedIntegers(std::vector<int>({45, 12, 81, 9}));
@@ -259,11 +248,10 @@ class Benchmark
      Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
                            lat * mpi_layout[3]});
-      int64_t vol = latt_size[0] * latt_size[1] * latt_size[2] * latt_size[3];
+      uint64_t vol = latt_size[0] * latt_size[1] * latt_size[2] * latt_size[3];
      GridCartesian Grid(latt_size, simd_layout, mpi_layout);
      //      NP= Grid.RankCount();
      NN = Grid.NodeCount();
      Vec rn;
@@ -279,20 +267,23 @@ class Benchmark
      uint64_t Nloop = NLOOP;
      for (int i = 0; i < NWARMUP; i++)
      {
        z = a * x - y;
      }
      double start = usecond();
      for (int i = 0; i < Nloop; i++)
      {
        z = a * x - y;
      }
      double stop = usecond();
-      double time = (stop - start) / Nloop * 1000;
+      double time = (stop - start) / Nloop / 1.e6;
-      double flops = vol * Nvec * 2; // mul,add
+      double flops = vol * Nvec * 2 / 1.e9; // mul,add
-      double bytes = 3.0 * vol * Nvec * sizeof(Real);
+      double bytes = 3.0 * vol * Nvec * sizeof(Real) / 1024. / 1024.;
-      std::cout << GridLogMessage << std::setprecision(3) << lat << "\t\t" << bytes
+
-                << "   \t\t" << bytes / time << "\t\t" << flops / time << "\t\t"
+      grid_printf("%5d %15.2f %15.2f %15.2f %15.2f\n", lat, bytes / NN, time * 1.e6,
-                << (stop - start) / 1000. / 1000. << "\t\t" << bytes / time / NN
+                  bytes / time / NN / 1024., flops / time / NN);
                << std::endl;
      nlohmann::json tmp;
      tmp["L"] = lat;
@@ -311,22 +302,14 @@ class Benchmark
    Coordinate simd_layout = GridDefaultSimd(Nd, vComplexF::Nsimd());
    Coordinate mpi_layout = GridDefaultMpi();
-    grid_big_sep();
+    std::cout << GridLogMessage << "Benchmarking z = y*x SU(4) bandwidth" << std::endl;
-    std::cout << GridLogMessage << "= Benchmarking z = y*x SU(4) bandwidth" << std::endl;
+    grid_small_sep();
-    grid_big_sep();
+    grid_printf("%5s %15s %15s %15s %15s\n", "L", "size (MB/node)", "time (usec)",
-    std::cout << GridLogMessage << "  L  "
+                "GB/s/node", "Gflop/s/node");
              << "\t\t"
              << "bytes"
              << "\t\t\t"
              << "GB/s"
              << "\t\t"
              << "Gflop/s"
              << "\t\t seconds"
              << "\t\tGB/s / node" << std::endl;
    uint64_t NN;
-    uint64_t lmax = 32;
+    uint64_t lmax = 48;
    GridSerialRNG sRNG;
    sRNG.SeedFixedIntegers(std::vector<int>({45, 12, 81, 9}));
@@ -347,24 +330,25 @@ class Benchmark
      x = Zero();
      LatticeSU4 y(&Grid);
      y = Zero();
      //      double a=2.0;
      uint64_t Nloop = NLOOP;
      for (int i = 0; i < NWARMUP; i++)
      {
        z = x * y;
      }
      double start = usecond();
      for (int i = 0; i < Nloop; i++)
      {
        z = x * y;
      }
      double stop = usecond();
-      double time = (stop - start) / Nloop * 1000;
+      double time = (stop - start) / Nloop / 1.e6;
-      double flops = vol * Nc4 * Nc4 * (6 + (Nc4 - 1) * 8); // mul,add
+      double flops = vol * Nc4 * Nc4 * (6 + (Nc4 - 1) * 8) / 1.e9; // mul,add
-      double bytes = 3.0 * vol * Nc4 * Nc4 * 2 * sizeof(RealF);
+      double bytes = 3.0 * vol * Nc4 * Nc4 * 2 * sizeof(RealF) / 1024. / 1024.;
-      std::cout << GridLogMessage << std::setprecision(3) << lat << "\t\t" << bytes
+      grid_printf("%5d %15.2f %15.2f %15.2f %15.2f\n", lat, bytes / NN, time * 1.e6,
-                << "   \t\t" << bytes / time << "\t\t" << flops / time << "\t\t"
+                  bytes / time / NN / 1024., flops / time / NN);
                << (stop - start) / 1000. / 1000. << "\t\t" << bytes / time / NN
                << std::endl;
      nlohmann::json tmp;
      tmp["L"] = lat;