forked from portelli/lattice-benchmarks
		
	choose iteration count automatically
This commit is contained in:
		| @@ -36,6 +36,29 @@ double get_timestamp() | ||||
| int nranks = -1; | ||||
| std::array<int, 4> mpi_grid = {1, 1, 1, 1}; | ||||
|  | ||||
| // run f() in a loop for roughly target_time seconds | ||||
| // returns seconds per iteration it took | ||||
| template <class F> double bench(F const &f, double target_time, int niter_warmup = 5) | ||||
| { | ||||
|   device_timer_t timer; | ||||
|   timer.start(); | ||||
|   for (int iter = 0; iter < niter_warmup; ++iter) | ||||
|     f(); | ||||
|   timer.stop(); | ||||
|  | ||||
|   double secs = timer.last() / niter_warmup; | ||||
|   int niter = std::max(1, int(target_time / secs)); | ||||
|   // niter = std::min(1000, niter); | ||||
|   // printfQuda("during warmup took %f s/iter, deciding on %d iters\n", secs, niter); | ||||
|  | ||||
|   timer.reset(__FUNCTION__, __FILE__, __LINE__); | ||||
|   timer.start(); | ||||
|   for (int iter = 0; iter < niter; ++iter) | ||||
|     f(); | ||||
|   timer.stop(); | ||||
|   return timer.last() / niter; | ||||
| } | ||||
|  | ||||
| void initComms(int argc, char **argv) | ||||
| { | ||||
|   // init MPI communication | ||||
| @@ -169,10 +192,8 @@ ColorSpinorField make_source(int L, int Ls = 1) | ||||
|   return src; | ||||
| } | ||||
|  | ||||
| void benchmark_wilson(std::vector<int> const &L_list, int niter) | ||||
| void benchmark_wilson(std::vector<int> const &L_list, double target_time) | ||||
| { | ||||
|   int niter_warmup = 10; | ||||
|  | ||||
|   printfQuda("==================== wilson dirac operator ====================\n"); | ||||
| #ifdef FLOP_COUNTING_GRID | ||||
|   printfQuda("IMPORTANT: flop counting as in Benchmark_Grid\n"); | ||||
| @@ -184,6 +205,8 @@ void benchmark_wilson(std::vector<int> const &L_list, int niter) | ||||
|  | ||||
|   for (int L : L_list) | ||||
|   { | ||||
|     // printfQuda("starting wilson L=%d\n", L); | ||||
|  | ||||
|     auto U = make_gauge_field(L); | ||||
|     auto src = make_source(L); | ||||
|  | ||||
| @@ -198,35 +221,26 @@ void benchmark_wilson(std::vector<int> const &L_list, int niter) | ||||
|     // (the additional nullptr's are for smeared links and fancy preconditioners and such. | ||||
|     // Not used for simple Wilson fermions) | ||||
|     dirac.updateFields(&U, nullptr, nullptr, nullptr); | ||||
|  | ||||
|     auto res = ColorSpinorField(ColorSpinorParam(src)); | ||||
|     auto f = [&]() { dirac.Dslash(res, src, QUDA_EVEN_PARITY); }; | ||||
|  | ||||
|     // couple iterations without timing to warm up | ||||
|     for (int iter = 0; iter < niter_warmup; ++iter) | ||||
|       dirac.Dslash(res, src, QUDA_EVEN_PARITY); | ||||
|  | ||||
|     // actual benchmark with timings | ||||
|     // first run to get the quda tuning out of the way | ||||
|     dirac.Flops(); // reset flops counter | ||||
|     device_timer_t device_timer; | ||||
|     device_timer.start(); | ||||
|     double start_time = get_timestamp(); | ||||
|     for (int iter = 0; iter < niter; ++iter) | ||||
|       dirac.Dslash(res, src, QUDA_EVEN_PARITY); | ||||
|     double end_time = get_timestamp(); | ||||
|     device_timer.stop(); | ||||
|     f(); | ||||
|     double flops = 1.0 * dirac.Flops(); | ||||
|  | ||||
|     double secs = device_timer.last() / niter; | ||||
|     // actual benchmarking | ||||
|     double start_time = get_timestamp(); | ||||
|     double secs = bench(f, target_time); | ||||
|     double end_time = get_timestamp(); | ||||
|  | ||||
| #ifdef FLOP_COUNTING_GRID | ||||
|     // this is the flop counting from Benchmark_Grid | ||||
|     double Nc = 3; | ||||
|     double Nd = 4; | ||||
|     double Ns = 4; | ||||
|     double flops = | ||||
|         (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2); | ||||
|     flops = (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2); | ||||
|     flops *= L * L * L * L / 2.0; | ||||
| #else | ||||
|     double flops = 1.0 * dirac.Flops() / niter; | ||||
| #endif | ||||
|  | ||||
|     printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9); | ||||
| @@ -240,10 +254,8 @@ void benchmark_wilson(std::vector<int> const &L_list, int niter) | ||||
|   } | ||||
| } | ||||
|  | ||||
| void benchmark_dwf(std::vector<int> const &L_list, int niter) | ||||
| void benchmark_dwf(std::vector<int> const &L_list, double target_time) | ||||
| { | ||||
|   int niter_warmup = 10; | ||||
|  | ||||
|   printfQuda("==================== domain wall dirac operator ====================\n"); | ||||
| #ifdef FLOP_COUNTING_GRID | ||||
|   printfQuda("IMPORTANT: flop counting as in Benchmark_Grid\n"); | ||||
| @@ -255,6 +267,7 @@ void benchmark_dwf(std::vector<int> const &L_list, int niter) | ||||
|   int Ls = 12; | ||||
|   for (int L : L_list) | ||||
|   { | ||||
|     // printfQuda("starting dwf L=%d\n", L); | ||||
|     auto U = make_gauge_field(L); | ||||
|     auto src = make_source(L, Ls); | ||||
|  | ||||
| @@ -270,35 +283,26 @@ void benchmark_dwf(std::vector<int> const &L_list, int niter) | ||||
|     // insert gauge field into the dirac operator | ||||
|     // (the additional nullptr's are for smeared links and fancy preconditioners and such) | ||||
|     dirac.updateFields(&U, nullptr, nullptr, nullptr); | ||||
|  | ||||
|     auto res = ColorSpinorField(ColorSpinorParam(src)); | ||||
|     auto f = [&]() { dirac.Dslash(res, src, QUDA_EVEN_PARITY); }; | ||||
|  | ||||
|     // couple iterations without timing to warm up | ||||
|     for (int iter = 0; iter < niter_warmup; ++iter) | ||||
|       dirac.Dslash(res, src, QUDA_EVEN_PARITY); | ||||
|  | ||||
|     // actual benchmark with timings | ||||
|     // first run to get the quda tuning out of the way | ||||
|     dirac.Flops(); // reset flops counter | ||||
|     device_timer_t device_timer; | ||||
|     device_timer.start(); | ||||
|     double start_time = get_timestamp(); | ||||
|     for (int iter = 0; iter < niter; ++iter) | ||||
|       dirac.Dslash(res, src, QUDA_EVEN_PARITY); | ||||
|     double end_time = get_timestamp(); | ||||
|     device_timer.stop(); | ||||
|     f(); | ||||
|     double flops = 1.0 * dirac.Flops(); | ||||
|  | ||||
|     double secs = device_timer.last() / niter; | ||||
|     // actual benchmarking | ||||
|     double start_time = get_timestamp(); | ||||
|     double secs = bench(f, target_time); | ||||
|     double end_time = get_timestamp(); | ||||
|  | ||||
| #ifdef FLOP_COUNTING_GRID | ||||
|     // this is the flop counting from Benchmark_Grid | ||||
|     double Nc = 3; | ||||
|     double Nd = 4; | ||||
|     double Ns = 4; | ||||
|     double flops = | ||||
|         (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2); | ||||
|     flops = (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2); | ||||
|     flops *= L * L * L * L * Ls / 2.0; | ||||
| #else | ||||
|     double flops = 1.0 * dirac.Flops() / niter; | ||||
| #endif | ||||
|  | ||||
|     printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9); | ||||
| @@ -311,11 +315,11 @@ void benchmark_dwf(std::vector<int> const &L_list, int niter) | ||||
|   } | ||||
| } | ||||
|  | ||||
| void benchmark_axpy(std::vector<int> const &L_list, int niter) | ||||
| void benchmark_axpy(std::vector<int> const &L_list, double target_time) | ||||
| { | ||||
|   // number of iterations for warmup / measurement | ||||
|   // (feel free to change for noise/time tradeoff) | ||||
|   constexpr int niter_warmup = 10; | ||||
|   constexpr int niter_warmup = 5; | ||||
|  | ||||
|   printfQuda("==================== axpy / memory ====================\n"); | ||||
|  | ||||
| @@ -341,8 +345,9 @@ void benchmark_axpy(std::vector<int> const &L_list, int niter) | ||||
|              "GiB/s/rank", "Gflop/s/rank"); | ||||
|   for (int L : L_list) | ||||
|   { | ||||
|     // IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()` | ||||
|     //            are LOCAL, i.e. per rank / per GPU | ||||
|     // printfQuda("starting axpy L=%d\n", L); | ||||
|     //  IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()` | ||||
|     //             are LOCAL, i.e. per rank / per GPU | ||||
|  | ||||
|     param.x[0] = L; | ||||
|     param.x[1] = L; | ||||
| @@ -369,20 +374,16 @@ void benchmark_axpy(std::vector<int> const &L_list, int niter) | ||||
|     double flops = 2 * field_elements; | ||||
|     double memory = 3 * sizeof(float) * field_elements; | ||||
|  | ||||
|     // do some iterations to to let QUDA do its internal tuning and also stabilize cache | ||||
|     // behaviour and such | ||||
|     for (int iter = 0; iter < niter_warmup; ++iter) | ||||
|       blas::axpy(1.234, fieldA, fieldB); | ||||
|     auto f = [&]() { blas::axpy(1.234, fieldA, fieldB); }; | ||||
|  | ||||
|     // running the actual benchmark | ||||
|     device_timer_t device_timer; | ||||
|     device_timer.start(); | ||||
|     // first run to get the quda tuning out of the way | ||||
|     f(); | ||||
|  | ||||
|     // actual benchmarking | ||||
|     double start_time = get_timestamp(); | ||||
|     for (int iter = 0; iter < niter; ++iter) | ||||
|       blas::axpy(1.234, fieldA, fieldB); | ||||
|     double secs = bench(f, target_time); | ||||
|     double end_time = get_timestamp(); | ||||
|     device_timer.stop(); | ||||
|     double secs = device_timer.last() / niter; // seconds per iteration | ||||
|  | ||||
|     double mem_MiB = memory / 1024. / 1024.; | ||||
|     double GBps = mem_MiB / 1024 / secs; | ||||
|     printfQuda("%5d %15.2f %15.2f %15.2f %15.2f\n", L, mem_MiB, secs * 1e6, GBps, | ||||
| @@ -419,11 +420,11 @@ int main(int argc, char **argv) | ||||
|   printfQuda("MPI layout = %d %d %d %d\n", mpi_grid[0], mpi_grid[1], mpi_grid[2], | ||||
|              mpi_grid[3]); | ||||
|  | ||||
|   benchmark_axpy({8, 12, 16, 24, 32, 48}, 20); | ||||
|   benchmark_axpy({8, 12, 16, 24, 32, 48}, 1.0); | ||||
|  | ||||
|   setVerbosity(QUDA_SILENT); | ||||
|   benchmark_wilson({8, 12, 16, 24, 32, 48}, 20); | ||||
|   benchmark_dwf({8, 12, 16, 24, 32}, 20); | ||||
|   benchmark_wilson({8, 12, 16, 24, 32, 48}, 1.0); | ||||
|   benchmark_dwf({8, 12, 16, 24, 32}, 1.0); | ||||
|   setVerbosity(QUDA_SUMMARIZE); | ||||
|  | ||||
|   printfQuda("==================== done with all benchmarks ====================\n"); | ||||
|   | ||||
		Reference in New Issue
	
	Block a user