diff --git a/Quda/Benchmark_Quda.cpp b/Quda/Benchmark_Quda.cpp
index 3ba7f84..84b8565 100644
--- a/Quda/Benchmark_Quda.cpp
+++ b/Quda/Benchmark_Quda.cpp
@@ -4,17 +4,22 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
 
-using namespace quda;
-
 // remove to use QUDA's own flop counting instead of Grid's convention
 #define FLOP_COUNTING_GRID
 
+#include "json.hpp"
+using nlohmann::json;
+json json_results;
+
+using namespace quda;
+
 // This is the MPI grid, i.e. the layout of ranks
 int nranks = -1;
 std::array<int, 4> mpi_grid = {1, 1, 1, 1};
@@ -43,6 +48,9 @@ void initComms(int argc, char **argv)
   for (int d = 0; d < 4; d++)
     if (mpi_grid[d] > 1)
       commDimPartitionedSet(d);
+
+  json_results["geometry"]["ranks"] = nranks;
+  json_results["geometry"]["mpi"] = mpi_grid;
 }
 
 // creates a random gauge field. L = local(!) size
@@ -149,9 +157,8 @@ ColorSpinorField make_source(int L, int Ls = 1)
   return src;
 }
 
-void benchmark_wilson()
+void benchmark_wilson(std::vector<int> const &L_list, int niter)
 {
-  int niter = 20;
   int niter_warmup = 10;
 
   printfQuda("==================== wilson dirac operator ====================\n");
@@ -163,7 +170,7 @@
 #endif
   printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank");
 
-  for (int L : {8, 12, 16, 24, 32, 48})
+  for (int L : L_list)
   {
     auto U = make_gauge_field(L);
     auto src = make_source(L);
@@ -180,18 +187,18 @@
     // Not used for simple Wilson fermions)
     dirac.updateFields(&U, nullptr, nullptr, nullptr);
 
-    auto tmp = ColorSpinorField(ColorSpinorParam(src));
+    auto res = ColorSpinorField(ColorSpinorParam(src));
 
     // couple iterations without timing to warm up
     for (int iter = 0; iter < niter_warmup; ++iter)
-      dirac.Dslash(tmp, src, QUDA_EVEN_PARITY);
+      dirac.Dslash(res, src, QUDA_EVEN_PARITY);
 
     // actual benchmark with timings
     dirac.Flops(); // reset flops counter
     device_timer_t device_timer;
     device_timer.start();
     for (int iter = 0; iter < niter; ++iter)
-      dirac.Dslash(tmp, src, QUDA_EVEN_PARITY);
+      dirac.Dslash(res, src, QUDA_EVEN_PARITY);
     device_timer.stop();
 
     double secs = device_timer.last() / niter;
@@ -209,12 +216,16 @@
 #endif
 
     printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9);
+
+    json tmp;
+    tmp["L"] = L;
+    tmp["Gflops_wilson"] = flops / secs * 1e-9;
+    json_results["flops"]["results"].push_back(tmp);
   }
 }
 
-void benchmark_dwf()
+void benchmark_dwf(std::vector<int> const &L_list, int niter)
 {
-  int niter = 20;
   int niter_warmup = 10;
 
   printfQuda("==================== domain wall dirac operator ====================\n");
@@ -226,7 +237,7 @@
 #endif
   printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank");
   int Ls = 12;
-  for (int L : {8, 12, 16, 24})
+  for (int L : L_list)
   {
     auto U = make_gauge_field(L);
     auto src = make_source(L, Ls);
@@ -244,18 +255,18 @@
     // (the additional nullptr's are for smeared links and fancy preconditioners and such)
     dirac.updateFields(&U, nullptr, nullptr, nullptr);
 
-    auto tmp = ColorSpinorField(ColorSpinorParam(src));
+    auto res = ColorSpinorField(ColorSpinorParam(src));
 
     // couple iterations without timing to warm up
     for (int iter = 0; iter < niter_warmup; ++iter)
-      dirac.Dslash(tmp, src, QUDA_EVEN_PARITY);
+      dirac.Dslash(res, src, QUDA_EVEN_PARITY);
 
     // actual benchmark with timings
     dirac.Flops(); // reset flops counter
     device_timer_t device_timer;
     device_timer.start();
     for (int iter = 0; iter < niter; ++iter)
-      dirac.Dslash(tmp, src, QUDA_EVEN_PARITY);
+      dirac.Dslash(res, src, QUDA_EVEN_PARITY);
     device_timer.stop();
 
     double secs = device_timer.last() / niter;
@@ -273,15 +284,18 @@
 #endif
 
     printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9);
+    json tmp;
+    tmp["L"] = L;
+    tmp["Gflops_dwf4"] = flops / secs * 1e-9;
+    json_results["flops"]["results"].push_back(tmp);
   }
 }
 
-void benchmark_axpy()
+void benchmark_axpy(std::vector<int> const &L_list, int niter)
 {
   // number of iterations for warmup / measurement
   // (feel free to change for noise/time tradeoff)
   constexpr int niter_warmup = 10;
-  constexpr int niter = 20;
 
   printfQuda("==================== axpy / memory ====================\n");
 
@@ -305,7 +319,6 @@
   printfQuda("%5s %15s %15s %15s %15s\n", "L", "size (MiB/rank)", "time (usec)",
              "GiB/s/rank", "Gflop/s/rank");
 
-  std::vector<int> L_list = {8, 12, 16, 24, 32, 48};
   for (int L : L_list)
   {
     // IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()`
@@ -348,14 +361,29 @@
       blas::axpy(1.234, fieldA, fieldB);
     device_timer.stop();
     double secs = device_timer.last() / niter; // seconds per iteration
+    double mem_MiB = memory / 1024. / 1024.;
+    double GBps = mem_MiB / 1024 / secs;
+    printfQuda("%5d %15.2f %15.2f %15.2f %15.2f\n", L, mem_MiB, secs * 1e6, GBps,
+               flops / secs * 1e-9);
 
-    printfQuda("%5d %15.2f %15.2f %15.2f %15.2f\n", L, memory / 1024. / 1024., secs * 1e6,
-               memory / secs / 1024. / 1024. / 1024., flops / secs * 1e-9);
+    json tmp;
+    tmp["L"] = L;
+    tmp["size_MB"] = mem_MiB;
+    tmp["GBps"] = GBps;
+    tmp["GFlops"] = flops / secs * 1e-9;
+    json_results["axpy"].push_back(tmp);
   }
 }
 
 int main(int argc, char **argv)
 {
+  std::string json_filename = ""; // empty indicates no json output
+  for (int i = 0; i < argc; i++)
+  {
+    if (std::string(argv[i]) == "--json-out")
+      json_filename = argv[i + 1];
+  }
+
   initComms(argc, argv);
 
   initQuda(-1); // -1 for multi-gpu. otherwise this selects the device to be used
@@ -367,14 +395,28 @@
   printfQuda("MPI layout = %d %d %d %d\n", mpi_grid[0], mpi_grid[1], mpi_grid[2],
              mpi_grid[3]);
 
-  benchmark_axpy();
+  benchmark_axpy({8, 12, 16, 24, 32, 48}, 20);
 
   setVerbosity(QUDA_SILENT);
-  benchmark_wilson();
-  benchmark_dwf();
+  benchmark_wilson({8, 12, 16, 24, 32, 48}, 20);
+  benchmark_dwf({8, 12, 16, 24, 32}, 20);
   setVerbosity(QUDA_SUMMARIZE);
 
   printfQuda("==================== done with all benchmarks ====================\n");
+
+  if (!json_filename.empty())
+  {
+    printfQuda("writing benchmark results to %s\n", json_filename.c_str());
+
+    int me = 0;
+    MPI_Comm_rank(MPI_COMM_WORLD, &me);
+    if (me == 0)
+    {
+      std::ofstream json_file(json_filename);
+      json_file << std::setw(2) << json_results;
+    }
+  }
+
   endQuda();
   quda::comm_finalize();
   MPI_Finalize();
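
A note on the argument handling added to main(): the parsing loop assigns argv[i + 1] whenever it sees --json-out, so a command line that ends with a bare --json-out and no filename reads argv[argc], which is a null pointer, and constructing std::string from it is undefined behaviour. Below is a minimal bounds-checked sketch of the same parsing; it is an illustration only, not part of the patch, and simply reuses the json_filename variable and --json-out flag from the diff above.

  // Sketch: same --json-out parsing as in the patch, but the flag is only
  // accepted when a filename actually follows it (never reads argv[argc]).
  std::string json_filename = ""; // empty indicates no json output
  for (int i = 1; i + 1 < argc; i++)
  {
    if (std::string(argv[i]) == "--json-out")
      json_filename = argv[i + 1];
  }

With the patch applied, passing e.g. "--json-out results.json" (any filename) makes rank 0 write the accumulated json_results object, i.e. the geometry block plus the per-L wilson/dwf flops entries and the axpy bandwidth entries, to that file at the end of the run; without the flag the program behaves as before and prints only to stdout.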