choose iteration count automatically

2023-06-20 18:08:34 +01:00
parent 3fbb8ea346
commit fb4c456776
1 changed files with 61 additions and 60 deletions
--- a/Quda/Benchmark_Quda.cpp
+++ b/Quda/Benchmark_Quda.cpp
@ -36,6 +36,29 @@ double get_timestamp()
 int nranks = -1;
 std::array<int, 4> mpi_grid = {1, 1, 1, 1};
 // Benchmark a callable: run f() in a loop for roughly target_time seconds.
 //
 // A short warmup of niter_warmup calls is timed first; the average time per call
 // measured there decides how many iterations the measured loop will run.
 //
 //   f             callable taking no arguments
 //   target_time   desired duration of the measured loop, in seconds
 //   niter_warmup  number of warmup calls used to estimate the cost of one call
 //
 // Returns the seconds per iteration measured in the second (non-warmup) loop.
 //
 // NOTE(review): the iteration count is derived from this process' own timing. If
 // f() communicates between ranks, ranks that pick different counts could wait on
 // each other -- confirm that all ranks end up with the same count.
 template <class F> double bench(F const &f, double target_time, int niter_warmup = 5)
 {
  // upper bound for the iteration count, small enough to always fit into an int
  constexpr double niter_max = 1e9;

  device_timer_t timer;
  timer.start();
  for (int iter = 0; iter < niter_warmup; ++iter)
    f();
  timer.stop();

  // average warmup time per call; stays zero when no warmup was requested so that
  // we never divide by zero
  double secs = 0.0;
  if (niter_warmup > 0)
    secs = timer.last() / niter_warmup;

  // Converting a double that does not fit into an int is undefined behaviour, so
  // the ratio is only converted once it is known to lie in [1, niter_max].
  // Without a usable warmup measurement (zero, negative or NaN) there is nothing
  // to extrapolate from and a single iteration is run.
  int niter = 1;
  if (secs > 0.0)
  {
    double const estimate = target_time / secs;
    if (estimate >= 1.0)
      niter = int(std::min(estimate, niter_max));
  }
  // printfQuda("during warmup took %f s/iter, deciding on %d iters\n", secs, niter);

  // discard the warmup measurement before timing the real loop
  timer.reset(__FUNCTION__, __FILE__, __LINE__);
  timer.start();
  for (int iter = 0; iter < niter; ++iter)
    f();
  timer.stop();
  return timer.last() / niter;
 }
 void initComms(int argc, char **argv)
 {
  // init MPI communication
@ -169,10 +192,8 @@ ColorSpinorField make_source(int L, int Ls = 1)
  return src;
 }
-void benchmark_wilson(std::vector<int> const &L_list, int niter)
+void benchmark_wilson(std::vector<int> const &L_list, double target_time)
 {
  int niter_warmup = 10;
  printfQuda("==================== wilson dirac operator ====================\n");
 #ifdef FLOP_COUNTING_GRID
  printfQuda("IMPORTANT: flop counting as in Benchmark_Grid\n");
@ -184,6 +205,8 @@ void benchmark_wilson(std::vector<int> const &L_list, int niter)
  for (int L : L_list)
  {
    // printfQuda("starting wilson L=%d\n", L);
    auto U = make_gauge_field(L);
    auto src = make_source(L);
@ -198,35 +221,26 @@ void benchmark_wilson(std::vector<int> const &L_list, int niter)
    // (the additional nullptr's are for smeared links and fancy preconditioners and such.
    // Not used for simple Wilson fermions)
    dirac.updateFields(&U, nullptr, nullptr, nullptr);
    auto res = ColorSpinorField(ColorSpinorParam(src));
    auto f = [&]() { dirac.Dslash(res, src, QUDA_EVEN_PARITY); };
-    // couple iterations without timing to warm up
+    // first run to get the quda tuning out of the way
    for (int iter = 0; iter < niter_warmup; ++iter)
      dirac.Dslash(res, src, QUDA_EVEN_PARITY);
    // actual benchmark with timings
    dirac.Flops(); // reset flops counter
-    device_timer_t device_timer;
+    f();
-    device_timer.start();
+    double flops = 1.0 * dirac.Flops();
    double start_time = get_timestamp();
    for (int iter = 0; iter < niter; ++iter)
      dirac.Dslash(res, src, QUDA_EVEN_PARITY);
    double end_time = get_timestamp();
    device_timer.stop();
-    double secs = device_timer.last() / niter;
+    // actual benchmarking
    double start_time = get_timestamp();
    double secs = bench(f, target_time);
    double end_time = get_timestamp();
 #ifdef FLOP_COUNTING_GRID
    // this is the flop counting from Benchmark_Grid
    double Nc = 3;
    double Nd = 4;
    double Ns = 4;
-    double flops =
+    flops = (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2);
        (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2);
    flops *= L * L * L * L / 2.0;
 #else
    double flops = 1.0 * dirac.Flops() / niter;
 #endif
    printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9);
@ -240,10 +254,8 @@ void benchmark_wilson(std::vector<int> const &L_list, int niter)
  }
 }
-void benchmark_dwf(std::vector<int> const &L_list, int niter)
+void benchmark_dwf(std::vector<int> const &L_list, double target_time)
 {
  int niter_warmup = 10;
  printfQuda("==================== domain wall dirac operator ====================\n");
 #ifdef FLOP_COUNTING_GRID
  printfQuda("IMPORTANT: flop counting as in Benchmark_Grid\n");
@ -255,6 +267,7 @@ void benchmark_dwf(std::vector<int> const &L_list, int niter)
  int Ls = 12;
  for (int L : L_list)
  {
    // printfQuda("starting dwf L=%d\n", L);
    auto U = make_gauge_field(L);
    auto src = make_source(L, Ls);
@ -270,35 +283,26 @@ void benchmark_dwf(std::vector<int> const &L_list, int niter)
    // insert gauge field into the dirac operator
    // (the additional nullptr's are for smeared links and fancy preconditioners and such)
    dirac.updateFields(&U, nullptr, nullptr, nullptr);
    auto res = ColorSpinorField(ColorSpinorParam(src));
    auto f = [&]() { dirac.Dslash(res, src, QUDA_EVEN_PARITY); };
-    // couple iterations without timing to warm up
+    // first run to get the quda tuning out of the way
    for (int iter = 0; iter < niter_warmup; ++iter)
      dirac.Dslash(res, src, QUDA_EVEN_PARITY);
    // actual benchmark with timings
    dirac.Flops(); // reset flops counter
-    device_timer_t device_timer;
+    f();
-    device_timer.start();
+    double flops = 1.0 * dirac.Flops();
    double start_time = get_timestamp();
    for (int iter = 0; iter < niter; ++iter)
      dirac.Dslash(res, src, QUDA_EVEN_PARITY);
    double end_time = get_timestamp();
    device_timer.stop();
-    double secs = device_timer.last() / niter;
+    // actual benchmarking
    double start_time = get_timestamp();
    double secs = bench(f, target_time);
    double end_time = get_timestamp();
 #ifdef FLOP_COUNTING_GRID
    // this is the flop counting from Benchmark_Grid
    double Nc = 3;
    double Nd = 4;
    double Ns = 4;
-    double flops =
+    flops = (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2);
        (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2);
    flops *= L * L * L * L * Ls / 2.0;
 #else
    double flops = 1.0 * dirac.Flops() / niter;
 #endif
    printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9);
@ -311,11 +315,11 @@ void benchmark_dwf(std::vector<int> const &L_list, int niter)
  }
 }
-void benchmark_axpy(std::vector<int> const &L_list, int niter)
+void benchmark_axpy(std::vector<int> const &L_list, double target_time)
 {
  // number of iterations for warmup / measurement
  // (feel free to change for noise/time tradeoff)
-  constexpr int niter_warmup = 10;
+  constexpr int niter_warmup = 5;
  printfQuda("==================== axpy / memory ====================\n");
@ -341,8 +345,9 @@ void benchmark_axpy(std::vector<int> const &L_list, int niter)
             "GiB/s/rank", "Gflop/s/rank");
  for (int L : L_list)
  {
-    // IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()`
+    // printfQuda("starting axpy L=%d\n", L);
-    //            are LOCAL, i.e. per rank / per GPU
+    //  IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()`
    //             are LOCAL, i.e. per rank / per GPU
    param.x[0] = L;
    param.x[1] = L;
@ -369,20 +374,16 @@ void benchmark_axpy(std::vector<int> const &L_list, int niter)
    double flops = 2 * field_elements;
    double memory = 3 * sizeof(float) * field_elements;
-    // do some iterations to to let QUDA do its internal tuning and also stabilize cache
+    auto f = [&]() { blas::axpy(1.234, fieldA, fieldB); };
    // behaviour and such
    for (int iter = 0; iter < niter_warmup; ++iter)
      blas::axpy(1.234, fieldA, fieldB);
-    // running the actual benchmark
+    // first run to get the quda tuning out of the way
-    device_timer_t device_timer;
+    f();
-    device_timer.start();
+
    // actual benchmarking
    double start_time = get_timestamp();
-    for (int iter = 0; iter < niter; ++iter)
+    double secs = bench(f, target_time);
      blas::axpy(1.234, fieldA, fieldB);
    double end_time = get_timestamp();
-    device_timer.stop();
+
    double secs = device_timer.last() / niter; // seconds per iteration
    double mem_MiB = memory / 1024. / 1024.;
    double GBps = mem_MiB / 1024 / secs;
    printfQuda("%5d %15.2f %15.2f %15.2f %15.2f\n", L, mem_MiB, secs * 1e6, GBps,
@ -419,11 +420,11 @@ int main(int argc, char **argv)
  printfQuda("MPI layout = %d %d %d %d\n", mpi_grid[0], mpi_grid[1], mpi_grid[2],
             mpi_grid[3]);
-  benchmark_axpy({8, 12, 16, 24, 32, 48}, 20);
+  benchmark_axpy({8, 12, 16, 24, 32, 48}, 1.0);
  setVerbosity(QUDA_SILENT);
-  benchmark_wilson({8, 12, 16, 24, 32, 48}, 20);
+  benchmark_wilson({8, 12, 16, 24, 32, 48}, 1.0);
-  benchmark_dwf({8, 12, 16, 24, 32}, 20);
+  benchmark_dwf({8, 12, 16, 24, 32}, 1.0);
  setVerbosity(QUDA_SUMMARIZE);
  printfQuda("==================== done with all benchmarks ====================\n");