choose iteration count automatically

2023-06-20 18:08:34 +01:00
parent 8cd10019db
commit 7648ed7496
1 changed files with 61 additions and 60 deletions
--- a/Quda/Benchmark_Quda.cpp
+++ b/Quda/Benchmark_Quda.cpp
@@ -36,6 +36,29 @@ double get_timestamp()
 int nranks = -1;
 std::array<int, 4> mpi_grid = {1, 1, 1, 1};

+// Benchmarks f(): times niter_warmup calls to estimate the per-call cost, then times a
+// main loop sized to take roughly target_time seconds. Returns seconds per iteration of that loop.
+template <class F> double bench(F const &f, double target_time, int niter_warmup = 5)
+{
+  device_timer_t timer;
+  timer.start();
+  for (int iter = 0; iter < niter_warmup; ++iter)
+    f();
+  timer.stop();
+
+  double secs = timer.last() / niter_warmup; // per-call estimate taken from the warmup loop
+  int niter = std::max(1, int(target_time / secs)); // at least 1; NOTE(review): no guard for secs == 0 or niter_warmup <= 0
+  // niter = std::min(1000, niter);
+  // printfQuda("during warmup took %f s/iter, deciding on %d iters\n", secs, niter);
+
+  timer.reset(__FUNCTION__, __FILE__, __LINE__); // reset before the measured loop; presumably discards the warmup interval - confirm
+  timer.start();
+  for (int iter = 0; iter < niter; ++iter)
+    f();
+  timer.stop();
+  return timer.last() / niter; // average over the measured loop only
+}
+
 void initComms(int argc, char **argv)
 {
  // init MPI communication
@@ -169,10 +192,8 @@ ColorSpinorField make_source(int L, int Ls = 1)
  return src;
 }

-void benchmark_wilson(std::vector<int> const &L_list, int niter)
+void benchmark_wilson(std::vector<int> const &L_list, double target_time)
 {
-  int niter_warmup = 10;
-
  printfQuda("==================== wilson dirac operator ====================\n");
 #ifdef FLOP_COUNTING_GRID
  printfQuda("IMPORTANT: flop counting as in Benchmark_Grid\n");
@@ -184,6 +205,8 @@ void benchmark_wilson(std::vector<int> const &L_list, int niter)

  for (int L : L_list)
  {
+    // printfQuda("starting wilson L=%d\n", L);
+
    auto U = make_gauge_field(L);
    auto src = make_source(L);

@@ -198,35 +221,26 @@ void benchmark_wilson(std::vector<int> const &L_list, int niter)
    // (the additional nullptr's are for smeared links and fancy preconditioners and such.
    // Not used for simple Wilson fermions)
    dirac.updateFields(&U, nullptr, nullptr, nullptr);
-
    auto res = ColorSpinorField(ColorSpinorParam(src));
+    auto f = [&]() { dirac.Dslash(res, src, QUDA_EVEN_PARITY); };

-    // couple iterations without timing to warm up
-    for (int iter = 0; iter < niter_warmup; ++iter)
-      dirac.Dslash(res, src, QUDA_EVEN_PARITY);
-
-    // actual benchmark with timings
+    // first run to get the quda tuning out of the way
    dirac.Flops(); // reset flops counter
-    device_timer_t device_timer;
-    device_timer.start();
-    double start_time = get_timestamp();
-    for (int iter = 0; iter < niter; ++iter)
-      dirac.Dslash(res, src, QUDA_EVEN_PARITY);
-    double end_time = get_timestamp();
-    device_timer.stop();
+    f();
+    double flops = 1.0 * dirac.Flops();

-    double secs = device_timer.last() / niter;
+    // actual benchmarking
+    double start_time = get_timestamp();
+    double secs = bench(f, target_time);
+    double end_time = get_timestamp();

 #ifdef FLOP_COUNTING_GRID
    // this is the flop counting from Benchmark_Grid
    double Nc = 3;
    double Nd = 4;
    double Ns = 4;
-    double flops =
-        (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2);
+    flops = (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2);
    flops *= L * L * L * L / 2.0;
-#else
-    double flops = 1.0 * dirac.Flops() / niter;
 #endif

    printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9);
@@ -240,10 +254,8 @@ void benchmark_wilson(std::vector<int> const &L_list, int niter)
  }
 }

-void benchmark_dwf(std::vector<int> const &L_list, int niter)
+void benchmark_dwf(std::vector<int> const &L_list, double target_time)
 {
-  int niter_warmup = 10;
-
  printfQuda("==================== domain wall dirac operator ====================\n");
 #ifdef FLOP_COUNTING_GRID
  printfQuda("IMPORTANT: flop counting as in Benchmark_Grid\n");
@@ -255,6 +267,7 @@ void benchmark_dwf(std::vector<int> const &L_list, int niter)
  int Ls = 12;
  for (int L : L_list)
  {
+    // printfQuda("starting dwf L=%d\n", L);
    auto U = make_gauge_field(L);
    auto src = make_source(L, Ls);

@@ -270,35 +283,26 @@ void benchmark_dwf(std::vector<int> const &L_list, int niter)
    // insert gauge field into the dirac operator
    // (the additional nullptr's are for smeared links and fancy preconditioners and such)
    dirac.updateFields(&U, nullptr, nullptr, nullptr);
-
    auto res = ColorSpinorField(ColorSpinorParam(src));
+    auto f = [&]() { dirac.Dslash(res, src, QUDA_EVEN_PARITY); };

-    // couple iterations without timing to warm up
-    for (int iter = 0; iter < niter_warmup; ++iter)
-      dirac.Dslash(res, src, QUDA_EVEN_PARITY);
-
-    // actual benchmark with timings
+    // first run to get the quda tuning out of the way
    dirac.Flops(); // reset flops counter
-    device_timer_t device_timer;
-    device_timer.start();
-    double start_time = get_timestamp();
-    for (int iter = 0; iter < niter; ++iter)
-      dirac.Dslash(res, src, QUDA_EVEN_PARITY);
-    double end_time = get_timestamp();
-    device_timer.stop();
+    f();
+    double flops = 1.0 * dirac.Flops();

-    double secs = device_timer.last() / niter;
+    // actual benchmarking
+    double start_time = get_timestamp();
+    double secs = bench(f, target_time);
+    double end_time = get_timestamp();

 #ifdef FLOP_COUNTING_GRID
    // this is the flop counting from Benchmark_Grid
    double Nc = 3;
    double Nd = 4;
    double Ns = 4;
-    double flops =
-        (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2);
+    flops = (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2);
    flops *= L * L * L * L * Ls / 2.0;
-#else
-    double flops = 1.0 * dirac.Flops() / niter;
 #endif

    printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9);
@@ -311,11 +315,11 @@ void benchmark_dwf(std::vector<int> const &L_list, int niter)
  }
 }

-void benchmark_axpy(std::vector<int> const &L_list, int niter)
+void benchmark_axpy(std::vector<int> const &L_list, double target_time)
 {
  // number of iterations for warmup / measurement
  // (feel free to change for noise/time tradeoff)
-  constexpr int niter_warmup = 10;
+  constexpr int niter_warmup = 5;

  printfQuda("==================== axpy / memory ====================\n");

@@ -341,8 +345,9 @@ void benchmark_axpy(std::vector<int> const &L_list, int niter)
             "GiB/s/rank", "Gflop/s/rank");
  for (int L : L_list)
  {
-    // IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()`
-    //            are LOCAL, i.e. per rank / per GPU
+    // printfQuda("starting axpy L=%d\n", L);
+    //  IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()`
+    //             are LOCAL, i.e. per rank / per GPU

    param.x[0] = L;
    param.x[1] = L;
@@ -369,20 +374,16 @@ void benchmark_axpy(std::vector<int> const &L_list, int niter)
    double flops = 2 * field_elements;
    double memory = 3 * sizeof(float) * field_elements;

-    // do some iterations to to let QUDA do its internal tuning and also stabilize cache
-    // behaviour and such
-    for (int iter = 0; iter < niter_warmup; ++iter)
-      blas::axpy(1.234, fieldA, fieldB);
+    auto f = [&]() { blas::axpy(1.234, fieldA, fieldB); };

-    // running the actual benchmark
-    device_timer_t device_timer;
-    device_timer.start();
+    // first run to get the quda tuning out of the way
+    f();
+
+    // actual benchmarking
    double start_time = get_timestamp();
-    for (int iter = 0; iter < niter; ++iter)
-      blas::axpy(1.234, fieldA, fieldB);
+    double secs = bench(f, target_time);
    double end_time = get_timestamp();
-    device_timer.stop();
-    double secs = device_timer.last() / niter; // seconds per iteration
+
    double mem_MiB = memory / 1024. / 1024.;
    double GBps = mem_MiB / 1024 / secs;
    printfQuda("%5d %15.2f %15.2f %15.2f %15.2f\n", L, mem_MiB, secs * 1e6, GBps,
@@ -419,11 +420,11 @@ int main(int argc, char **argv)
  printfQuda("MPI layout = %d %d %d %d\n", mpi_grid[0], mpi_grid[1], mpi_grid[2],
             mpi_grid[3]);

-  benchmark_axpy({8, 12, 16, 24, 32, 48}, 20);
+  benchmark_axpy({8, 12, 16, 24, 32, 48}, 1.0);

  setVerbosity(QUDA_SILENT);
-  benchmark_wilson({8, 12, 16, 24, 32, 48}, 20);
-  benchmark_dwf({8, 12, 16, 24, 32}, 20);
+  benchmark_wilson({8, 12, 16, 24, 32, 48}, 1.0);
+  benchmark_dwf({8, 12, 16, 24, 32}, 1.0);
  setVerbosity(QUDA_SUMMARIZE);

  printfQuda("==================== done with all benchmarks ====================\n");