From 7648ed7496a74bac5b5b24a10a17cd54f33507a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20B=C3=BCrger?= <simon.buerger@rwth-aachen.de>
Date: Tue, 20 Jun 2023 18:08:34 +0100
Subject: [PATCH] choose iteration count automatically

---
 Quda/Benchmark_Quda.cpp | 121 ++++++++++++++++++++--------------------
 1 file changed, 61 insertions(+), 60 deletions(-)
diff --git a/Quda/Benchmark_Quda.cpp b/Quda/Benchmark_Quda.cpp
index 689cf32..67c81bc 100644
--- a/Quda/Benchmark_Quda.cpp
+++ b/Quda/Benchmark_Quda.cpp
@@ -36,6 +36,29 @@ double get_timestamp()
 int nranks = -1;
 std::array<int, 4> mpi_grid = {1, 1, 1, 1};
 
+// Run f() for roughly target_time seconds and return the seconds per iteration; a timed
+// warmup of niter_warmup calls (at least one) picks the iteration count.
+// NOTE(review): niter comes from this process's timer; if f() communicates across ranks, confirm they agree.
+template <class F> double bench(F const &f, double target_time, int niter_warmup = 5)
+{
+  niter_warmup = std::max(1, niter_warmup); // an estimate needs at least one timed call
+  device_timer_t timer;
+  timer.start();
+  for (int iter = 0; iter < niter_warmup; ++iter)
+    f();
+  timer.stop();
+
+  // clamp while still a double: int(x) is undefined for x above INT_MAX, inf or NaN
+  double want = target_time / (timer.last() / niter_warmup);
+  int niter = want > 1 ? int(std::min(want, 1e9)) : 1;
+  timer.reset(__FUNCTION__, __FILE__, __LINE__);
+  timer.start();
+  for (int iter = 0; iter < niter; ++iter)
+    f();
+  timer.stop();
+  return timer.last() / niter;
+}
+
 void initComms(int argc, char **argv)
 {
   // init MPI communication
@@ -169,10 +192,8 @@ ColorSpinorField make_source(int L, int Ls = 1)
   return src;
 }
 
-void benchmark_wilson(std::vector<int> const &L_list, int niter)
+void benchmark_wilson(std::vector<int> const &L_list, double target_time)
 {
-  int niter_warmup = 10;
-
   printfQuda("==================== wilson dirac operator ====================\n");
 #ifdef FLOP_COUNTING_GRID
   printfQuda("IMPORTANT: flop counting as in Benchmark_Grid\n");
@@ -184,6 +205,8 @@ void benchmark_wilson(std::vector<int> const &L_list, int niter)
 
   for (int L : L_list)
   {
+    // printfQuda("starting wilson L=%d\n", L);
+
     auto U = make_gauge_field(L);
     auto src = make_source(L);
 
@@ -198,35 +221,26 @@ void benchmark_wilson(std::vector<int> const &L_list, int niter)
     // (the additional nullptr's are for smeared links and fancy preconditioners and such.
     // Not used for simple Wilson fermions)
     dirac.updateFields(&U, nullptr, nullptr, nullptr);
-
     auto res = ColorSpinorField(ColorSpinorParam(src));
+    auto f = [&]() { dirac.Dslash(res, src, QUDA_EVEN_PARITY); };
 
-    // couple iterations without timing to warm up
-    for (int iter = 0; iter < niter_warmup; ++iter)
-      dirac.Dslash(res, src, QUDA_EVEN_PARITY);
-
-    // actual benchmark with timings
+    // first run to get the quda tuning out of the way
     dirac.Flops(); // reset flops counter
-    device_timer_t device_timer;
-    device_timer.start();
-    double start_time = get_timestamp();
-    for (int iter = 0; iter < niter; ++iter)
-      dirac.Dslash(res, src, QUDA_EVEN_PARITY);
-    double end_time = get_timestamp();
-    device_timer.stop();
+    f();
+    double flops = 1.0 * dirac.Flops();
 
-    double secs = device_timer.last() / niter;
+    // actual benchmarking
+    double start_time = get_timestamp();
+    double secs = bench(f, target_time);
+    double end_time = get_timestamp();
 
 #ifdef FLOP_COUNTING_GRID
     // this is the flop counting from Benchmark_Grid
     double Nc = 3;
     double Nd = 4;
     double Ns = 4;
-    double flops =
-        (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2);
+    flops = (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2);
     flops *= L * L * L * L / 2.0;
-#else
-    double flops = 1.0 * dirac.Flops() / niter;
 #endif
 
     printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9);
@@ -240,10 +254,8 @@ void benchmark_wilson(std::vector<int> const &L_list, int niter)
   }
 }
 
-void benchmark_dwf(std::vector<int> const &L_list, int niter)
+void benchmark_dwf(std::vector<int> const &L_list, double target_time)
 {
-  int niter_warmup = 10;
-
   printfQuda("==================== domain wall dirac operator ====================\n");
 #ifdef FLOP_COUNTING_GRID
   printfQuda("IMPORTANT: flop counting as in Benchmark_Grid\n");
@@ -255,6 +267,7 @@ void benchmark_dwf(std::vector<int> const &L_list, int niter)
   int Ls = 12;
   for (int L : L_list)
   {
+    // printfQuda("starting dwf L=%d\n", L);
     auto U = make_gauge_field(L);
     auto src = make_source(L, Ls);
 
@@ -270,35 +283,26 @@ void benchmark_dwf(std::vector<int> const &L_list, int niter)
     // insert gauge field into the dirac operator
     // (the additional nullptr's are for smeared links and fancy preconditioners and such)
     dirac.updateFields(&U, nullptr, nullptr, nullptr);
-
     auto res = ColorSpinorField(ColorSpinorParam(src));
+    auto f = [&]() { dirac.Dslash(res, src, QUDA_EVEN_PARITY); };
 
-    // couple iterations without timing to warm up
-    for (int iter = 0; iter < niter_warmup; ++iter)
-      dirac.Dslash(res, src, QUDA_EVEN_PARITY);
-
-    // actual benchmark with timings
+    // first run to get the quda tuning out of the way
     dirac.Flops(); // reset flops counter
-    device_timer_t device_timer;
-    device_timer.start();
-    double start_time = get_timestamp();
-    for (int iter = 0; iter < niter; ++iter)
-      dirac.Dslash(res, src, QUDA_EVEN_PARITY);
-    double end_time = get_timestamp();
-    device_timer.stop();
+    f();
+    double flops = 1.0 * dirac.Flops();
 
-    double secs = device_timer.last() / niter;
+    // actual benchmarking
+    double start_time = get_timestamp();
+    double secs = bench(f, target_time);
+    double end_time = get_timestamp();
 
 #ifdef FLOP_COUNTING_GRID
     // this is the flop counting from Benchmark_Grid
     double Nc = 3;
     double Nd = 4;
     double Ns = 4;
-    double flops =
-        (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2);
+    flops = (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2);
     flops *= L * L * L * L * Ls / 2.0;
-#else
-    double flops = 1.0 * dirac.Flops() / niter;
 #endif
 
     printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9);
@@ -311,11 +315,11 @@ void benchmark_dwf(std::vector<int> const &L_list, int niter)
   }
 }
 
-void benchmark_axpy(std::vector<int> const &L_list, int niter)
+void benchmark_axpy(std::vector<int> const &L_list, double target_time)
 {
   // number of iterations for warmup / measurement
   // (feel free to change for noise/time tradeoff)
-  constexpr int niter_warmup = 10;
+  constexpr int niter_warmup = 5;
 
   printfQuda("==================== axpy / memory ====================\n");
 
@@ -341,8 +345,9 @@ void benchmark_axpy(std::vector<int> const &L_list, int niter)
              "GiB/s/rank", "Gflop/s/rank");
   for (int L : L_list)
   {
-    // IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()`
-    //            are LOCAL, i.e. per rank / per GPU
+    // printfQuda("starting axpy L=%d\n", L);
+    //  IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()`
+    //             are LOCAL, i.e. per rank / per GPU
 
     param.x[0] = L;
     param.x[1] = L;
@@ -369,20 +374,16 @@ void benchmark_axpy(std::vector<int> const &L_list, int niter)
     double flops = 2 * field_elements;
     double memory = 3 * sizeof(float) * field_elements;
 
-    // do some iterations to to let QUDA do its internal tuning and also stabilize cache
-    // behaviour and such
-    for (int iter = 0; iter < niter_warmup; ++iter)
-      blas::axpy(1.234, fieldA, fieldB);
+    auto f = [&]() { blas::axpy(1.234, fieldA, fieldB); };
 
-    // running the actual benchmark
-    device_timer_t device_timer;
-    device_timer.start();
+    // first run to get the quda tuning out of the way
+    f();
+
+    // actual benchmarking
     double start_time = get_timestamp();
-    for (int iter = 0; iter < niter; ++iter)
-      blas::axpy(1.234, fieldA, fieldB);
+    double secs = bench(f, target_time);
     double end_time = get_timestamp();
-    device_timer.stop();
-    double secs = device_timer.last() / niter; // seconds per iteration
+
     double mem_MiB = memory / 1024. / 1024.;
     double GBps = mem_MiB / 1024 / secs;
     printfQuda("%5d %15.2f %15.2f %15.2f %15.2f\n", L, mem_MiB, secs * 1e6, GBps,
@@ -419,11 +420,11 @@ int main(int argc, char **argv)
   printfQuda("MPI layout = %d %d %d %d\n", mpi_grid[0], mpi_grid[1], mpi_grid[2],
              mpi_grid[3]);
 
-  benchmark_axpy({8, 12, 16, 24, 32, 48}, 20);
+  benchmark_axpy({8, 12, 16, 24, 32, 48}, 1.0);
 
   setVerbosity(QUDA_SILENT);
-  benchmark_wilson({8, 12, 16, 24, 32, 48}, 20);
-  benchmark_dwf({8, 12, 16, 24, 32}, 20);
+  benchmark_wilson({8, 12, 16, 24, 32, 48}, 1.0);
+  benchmark_dwf({8, 12, 16, 24, 32}, 1.0);
   setVerbosity(QUDA_SUMMARIZE);
 
   printfQuda("==================== done with all benchmarks ====================\n");