From 7648ed7496a74bac5b5b24a10a17cd54f33507a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20B=C3=BCrger?= Date: Tue, 20 Jun 2023 18:08:34 +0100 Subject: [PATCH] choose iteration count automatically --- Quda/Benchmark_Quda.cpp | 121 ++++++++++++++++++++-------------------- 1 file changed, 61 insertions(+), 60 deletions(-) diff --git a/Quda/Benchmark_Quda.cpp b/Quda/Benchmark_Quda.cpp index 689cf32..67c81bc 100644 --- a/Quda/Benchmark_Quda.cpp +++ b/Quda/Benchmark_Quda.cpp @@ -36,6 +36,29 @@ double get_timestamp() int nranks = -1; std::array mpi_grid = {1, 1, 1, 1}; +// run f() in a loop for roughly target_time seconds +// returns seconds per iteration it took +template double bench(F const &f, double target_time, int niter_warmup = 5) +{ + device_timer_t timer; + timer.start(); + for (int iter = 0; iter < niter_warmup; ++iter) + f(); + timer.stop(); + + double secs = timer.last() / niter_warmup; + int niter = std::max(1, int(target_time / secs)); + // niter = std::min(1000, niter); + // printfQuda("during warmup took %f s/iter, deciding on %d iters\n", secs, niter); + + timer.reset(__FUNCTION__, __FILE__, __LINE__); + timer.start(); + for (int iter = 0; iter < niter; ++iter) + f(); + timer.stop(); + return timer.last() / niter; +} + void initComms(int argc, char **argv) { // init MPI communication @@ -169,10 +192,8 @@ ColorSpinorField make_source(int L, int Ls = 1) return src; } -void benchmark_wilson(std::vector const &L_list, int niter) +void benchmark_wilson(std::vector const &L_list, double target_time) { - int niter_warmup = 10; - printfQuda("==================== wilson dirac operator ====================\n"); #ifdef FLOP_COUNTING_GRID printfQuda("IMPORTANT: flop counting as in Benchmark_Grid\n"); @@ -184,6 +205,8 @@ void benchmark_wilson(std::vector const &L_list, int niter) for (int L : L_list) { + // printfQuda("starting wilson L=%d\n", L); + auto U = make_gauge_field(L); auto src = make_source(L); @@ -198,35 +221,26 @@ void benchmark_wilson(std::vector const &L_list, int niter) // (the additional nullptr's are for smeared links and fancy preconditioners and such. // Not used for simple Wilson fermions) dirac.updateFields(&U, nullptr, nullptr, nullptr); - auto res = ColorSpinorField(ColorSpinorParam(src)); + auto f = [&]() { dirac.Dslash(res, src, QUDA_EVEN_PARITY); }; - // couple iterations without timing to warm up - for (int iter = 0; iter < niter_warmup; ++iter) - dirac.Dslash(res, src, QUDA_EVEN_PARITY); - - // actual benchmark with timings + // first run to get the quda tuning out of the way dirac.Flops(); // reset flops counter - device_timer_t device_timer; - device_timer.start(); - double start_time = get_timestamp(); - for (int iter = 0; iter < niter; ++iter) - dirac.Dslash(res, src, QUDA_EVEN_PARITY); - double end_time = get_timestamp(); - device_timer.stop(); + f(); + double flops = 1.0 * dirac.Flops(); - double secs = device_timer.last() / niter; + // actual benchmarking + double start_time = get_timestamp(); + double secs = bench(f, target_time); + double end_time = get_timestamp(); #ifdef FLOP_COUNTING_GRID // this is the flop counting from Benchmark_Grid double Nc = 3; double Nd = 4; double Ns = 4; - double flops = - (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2); + flops = (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2); flops *= L * L * L * L / 2.0; -#else - double flops = 1.0 * dirac.Flops() / niter; #endif printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9); @@ -240,10 +254,8 @@ void benchmark_wilson(std::vector const &L_list, int niter) } } -void benchmark_dwf(std::vector const &L_list, int niter) +void benchmark_dwf(std::vector const &L_list, double target_time) { - int niter_warmup = 10; - printfQuda("==================== domain wall dirac operator ====================\n"); #ifdef FLOP_COUNTING_GRID printfQuda("IMPORTANT: flop counting as in Benchmark_Grid\n"); @@ -255,6 +267,7 @@ void benchmark_dwf(std::vector const &L_list, int niter) int Ls = 12; for (int L : L_list) { + // printfQuda("starting dwf L=%d\n", L); auto U = make_gauge_field(L); auto src = make_source(L, Ls); @@ -270,35 +283,26 @@ void benchmark_dwf(std::vector const &L_list, int niter) // insert gauge field into the dirac operator // (the additional nullptr's are for smeared links and fancy preconditioners and such) dirac.updateFields(&U, nullptr, nullptr, nullptr); - auto res = ColorSpinorField(ColorSpinorParam(src)); + auto f = [&]() { dirac.Dslash(res, src, QUDA_EVEN_PARITY); }; - // couple iterations without timing to warm up - for (int iter = 0; iter < niter_warmup; ++iter) - dirac.Dslash(res, src, QUDA_EVEN_PARITY); - - // actual benchmark with timings + // first run to get the quda tuning out of the way dirac.Flops(); // reset flops counter - device_timer_t device_timer; - device_timer.start(); - double start_time = get_timestamp(); - for (int iter = 0; iter < niter; ++iter) - dirac.Dslash(res, src, QUDA_EVEN_PARITY); - double end_time = get_timestamp(); - device_timer.stop(); + f(); + double flops = 1.0 * dirac.Flops(); - double secs = device_timer.last() / niter; + // actual benchmarking + double start_time = get_timestamp(); + double secs = bench(f, target_time); + double end_time = get_timestamp(); #ifdef FLOP_COUNTING_GRID // this is the flop counting from Benchmark_Grid double Nc = 3; double Nd = 4; double Ns = 4; - double flops = - (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2); + flops = (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2); flops *= L * L * L * L * Ls / 2.0; -#else - double flops = 1.0 * dirac.Flops() / niter; #endif printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9); @@ -311,11 +315,11 @@ void benchmark_dwf(std::vector const &L_list, int niter) } } -void benchmark_axpy(std::vector const &L_list, int niter) +void benchmark_axpy(std::vector const &L_list, double target_time) { // number of iterations for warmup / measurement // (feel free to change for noise/time tradeoff) - constexpr int niter_warmup = 10; + constexpr int niter_warmup = 5; printfQuda("==================== axpy / memory ====================\n"); @@ -341,8 +345,9 @@ void benchmark_axpy(std::vector const &L_list, int niter) "GiB/s/rank", "Gflop/s/rank"); for (int L : L_list) { - // IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()` - // are LOCAL, i.e. per rank / per GPU + // printfQuda("starting axpy L=%d\n", L); + // IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()` + // are LOCAL, i.e. per rank / per GPU param.x[0] = L; param.x[1] = L; @@ -369,20 +374,16 @@ void benchmark_axpy(std::vector const &L_list, int niter) double flops = 2 * field_elements; double memory = 3 * sizeof(float) * field_elements; - // do some iterations to to let QUDA do its internal tuning and also stabilize cache - // behaviour and such - for (int iter = 0; iter < niter_warmup; ++iter) - blas::axpy(1.234, fieldA, fieldB); + auto f = [&]() { blas::axpy(1.234, fieldA, fieldB); }; - // running the actual benchmark - device_timer_t device_timer; - device_timer.start(); + // first run to get the quda tuning out of the way + f(); + + // actual benchmarking double start_time = get_timestamp(); - for (int iter = 0; iter < niter; ++iter) - blas::axpy(1.234, fieldA, fieldB); + double secs = bench(f, target_time); double end_time = get_timestamp(); - device_timer.stop(); - double secs = device_timer.last() / niter; // seconds per iteration + double mem_MiB = memory / 1024. / 1024.; double GBps = mem_MiB / 1024 / secs; printfQuda("%5d %15.2f %15.2f %15.2f %15.2f\n", L, mem_MiB, secs * 1e6, GBps, @@ -419,11 +420,11 @@ int main(int argc, char **argv) printfQuda("MPI layout = %d %d %d %d\n", mpi_grid[0], mpi_grid[1], mpi_grid[2], mpi_grid[3]); - benchmark_axpy({8, 12, 16, 24, 32, 48}, 20); + benchmark_axpy({8, 12, 16, 24, 32, 48}, 1.0); setVerbosity(QUDA_SILENT); - benchmark_wilson({8, 12, 16, 24, 32, 48}, 20); - benchmark_dwf({8, 12, 16, 24, 32}, 20); + benchmark_wilson({8, 12, 16, 24, 32, 48}, 1.0); + benchmark_dwf({8, 12, 16, 24, 32}, 1.0); setVerbosity(QUDA_SUMMARIZE); printfQuda("==================== done with all benchmarks ====================\n");