benchmark-quda #3

Open
simon.buerger wants to merge 16 commits from simon.buerger/lattice-benchmarks:benchmark-quda into main
Showing only changes of commit 0d588d065a - Show all commits

View File

@ -4,17 +4,22 @@
#include <cassert> #include <cassert>
#include <color_spinor_field.h> #include <color_spinor_field.h>
#include <dirac_quda.h> #include <dirac_quda.h>
#include <fstream>
#include <gauge_tools.h> #include <gauge_tools.h>
#include <memory> #include <memory>
#include <mpi.h> #include <mpi.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
using namespace quda;
// remove to use QUDA's own flop counting instead of Grid's convention // remove to use QUDA's own flop counting instead of Grid's convention
#define FLOP_COUNTING_GRID #define FLOP_COUNTING_GRID
#include "json.hpp"
using nlohmann::json;
json json_results;
using namespace quda;
// This is the MPI grid, i.e. the layout of ranks // This is the MPI grid, i.e. the layout of ranks
int nranks = -1; int nranks = -1;
std::array<int, 4> mpi_grid = {1, 1, 1, 1}; std::array<int, 4> mpi_grid = {1, 1, 1, 1};
@ -43,6 +48,9 @@ void initComms(int argc, char **argv)
for (int d = 0; d < 4; d++) for (int d = 0; d < 4; d++)
if (mpi_grid[d] > 1) if (mpi_grid[d] > 1)
commDimPartitionedSet(d); commDimPartitionedSet(d);
json_results["geometry"]["ranks"] = nranks;
json_results["geometry"]["mpi"] = mpi_grid;
} }
// creates a random gauge field. L = local(!) size // creates a random gauge field. L = local(!) size
@ -149,9 +157,8 @@ ColorSpinorField make_source(int L, int Ls = 1)
return src; return src;
} }
void benchmark_wilson() void benchmark_wilson(std::vector<int> const &L_list, int niter)
{ {
int niter = 20;
int niter_warmup = 10; int niter_warmup = 10;
printfQuda("==================== wilson dirac operator ====================\n"); printfQuda("==================== wilson dirac operator ====================\n");
@ -163,7 +170,7 @@ void benchmark_wilson()
#endif #endif
printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank"); printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank");
for (int L : {8, 12, 16, 24, 32, 48}) for (int L : L_list)
{ {
auto U = make_gauge_field(L); auto U = make_gauge_field(L);
auto src = make_source(L); auto src = make_source(L);
@ -180,18 +187,18 @@ void benchmark_wilson()
// Not used for simple Wilson fermions) // Not used for simple Wilson fermions)
dirac.updateFields(&U, nullptr, nullptr, nullptr); dirac.updateFields(&U, nullptr, nullptr, nullptr);
auto tmp = ColorSpinorField(ColorSpinorParam(src)); auto res = ColorSpinorField(ColorSpinorParam(src));
// couple iterations without timing to warm up // couple iterations without timing to warm up
for (int iter = 0; iter < niter_warmup; ++iter) for (int iter = 0; iter < niter_warmup; ++iter)
dirac.Dslash(tmp, src, QUDA_EVEN_PARITY); dirac.Dslash(res, src, QUDA_EVEN_PARITY);
// actual benchmark with timings // actual benchmark with timings
dirac.Flops(); // reset flops counter dirac.Flops(); // reset flops counter
device_timer_t device_timer; device_timer_t device_timer;
device_timer.start(); device_timer.start();
for (int iter = 0; iter < niter; ++iter) for (int iter = 0; iter < niter; ++iter)
dirac.Dslash(tmp, src, QUDA_EVEN_PARITY); dirac.Dslash(res, src, QUDA_EVEN_PARITY);
device_timer.stop(); device_timer.stop();
double secs = device_timer.last() / niter; double secs = device_timer.last() / niter;
@ -209,12 +216,16 @@ void benchmark_wilson()
#endif #endif
printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9); printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9);
json tmp;
tmp["L"] = L;
tmp["Gflops_wilson"] = flops / secs * 1e-9;
json_results["flops"]["results"].push_back(tmp);
} }
} }
void benchmark_dwf() void benchmark_dwf(std::vector<int> const &L_list, int niter)
{ {
int niter = 20;
int niter_warmup = 10; int niter_warmup = 10;
printfQuda("==================== domain wall dirac operator ====================\n"); printfQuda("==================== domain wall dirac operator ====================\n");
@ -226,7 +237,7 @@ void benchmark_dwf()
#endif #endif
printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank"); printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank");
int Ls = 12; int Ls = 12;
for (int L : {8, 12, 16, 24}) for (int L : L_list)
{ {
auto U = make_gauge_field(L); auto U = make_gauge_field(L);
auto src = make_source(L, Ls); auto src = make_source(L, Ls);
@ -244,18 +255,18 @@ void benchmark_dwf()
// (the additional nullptr's are for smeared links and fancy preconditioners and such) // (the additional nullptr's are for smeared links and fancy preconditioners and such)
dirac.updateFields(&U, nullptr, nullptr, nullptr); dirac.updateFields(&U, nullptr, nullptr, nullptr);
auto tmp = ColorSpinorField(ColorSpinorParam(src)); auto res = ColorSpinorField(ColorSpinorParam(src));
// couple iterations without timing to warm up // couple iterations without timing to warm up
for (int iter = 0; iter < niter_warmup; ++iter) for (int iter = 0; iter < niter_warmup; ++iter)
dirac.Dslash(tmp, src, QUDA_EVEN_PARITY); dirac.Dslash(res, src, QUDA_EVEN_PARITY);
// actual benchmark with timings // actual benchmark with timings
dirac.Flops(); // reset flops counter dirac.Flops(); // reset flops counter
device_timer_t device_timer; device_timer_t device_timer;
device_timer.start(); device_timer.start();
for (int iter = 0; iter < niter; ++iter) for (int iter = 0; iter < niter; ++iter)
dirac.Dslash(tmp, src, QUDA_EVEN_PARITY); dirac.Dslash(res, src, QUDA_EVEN_PARITY);
device_timer.stop(); device_timer.stop();
double secs = device_timer.last() / niter; double secs = device_timer.last() / niter;
@ -273,15 +284,18 @@ void benchmark_dwf()
#endif #endif
printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9); printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9);
json tmp;
tmp["L"] = L;
tmp["Gflops_dwf4"] = flops / secs * 1e-9;
json_results["flops"]["results"].push_back(tmp);
} }
} }
void benchmark_axpy() void benchmark_axpy(std::vector<int> const &L_list, int niter)
{ {
// number of iterations for warmup / measurement // number of iterations for warmup / measurement
// (feel free to change for noise/time tradeoff) // (feel free to change for noise/time tradeoff)
constexpr int niter_warmup = 10; constexpr int niter_warmup = 10;
constexpr int niter = 20;
printfQuda("==================== axpy / memory ====================\n"); printfQuda("==================== axpy / memory ====================\n");
@ -305,7 +319,6 @@ void benchmark_axpy()
printfQuda("%5s %15s %15s %15s %15s\n", "L", "size (MiB/rank)", "time (usec)", printfQuda("%5s %15s %15s %15s %15s\n", "L", "size (MiB/rank)", "time (usec)",
"GiB/s/rank", "Gflop/s/rank"); "GiB/s/rank", "Gflop/s/rank");
std::vector L_list = {8, 12, 16, 24, 32, 48};
for (int L : L_list) for (int L : L_list)
{ {
// IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()` // IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()`
@ -348,14 +361,29 @@ void benchmark_axpy()
blas::axpy(1.234, fieldA, fieldB); blas::axpy(1.234, fieldA, fieldB);
device_timer.stop(); device_timer.stop();
double secs = device_timer.last() / niter; // seconds per iteration double secs = device_timer.last() / niter; // seconds per iteration
double mem_MiB = memory / 1024. / 1024.;
double GBps = mem_MiB / 1024 / secs;
printfQuda("%5d %15.2f %15.2f %15.2f %15.2f\n", L, mem_MiB, secs * 1e6, GBps,
flops / secs * 1e-9);
printfQuda("%5d %15.2f %15.2f %15.2f %15.2f\n", L, memory / 1024. / 1024., secs * 1e6, json tmp;
memory / secs / 1024. / 1024. / 1024., flops / secs * 1e-9); tmp["L"] = L;
tmp["size_MB"] = mem_MiB;
tmp["GBps"] = GBps;
tmp["GFlops"] = flops / secs * 1e-9;
json_results["axpy"].push_back(tmp);
} }
} }
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
std::string json_filename = ""; // empty indicates no json output
for (int i = 0; i < argc; i++)
{
if (std::string(argv[i]) == "--json-out")
json_filename = argv[i + 1];
}
initComms(argc, argv); initComms(argc, argv);
initQuda(-1); // -1 for multi-gpu. otherwise this selects the device to be used initQuda(-1); // -1 for multi-gpu. otherwise this selects the device to be used
@ -367,14 +395,28 @@ int main(int argc, char **argv)
printfQuda("MPI layout = %d %d %d %d\n", mpi_grid[0], mpi_grid[1], mpi_grid[2], printfQuda("MPI layout = %d %d %d %d\n", mpi_grid[0], mpi_grid[1], mpi_grid[2],
mpi_grid[3]); mpi_grid[3]);
benchmark_axpy(); benchmark_axpy({8, 12, 16, 24, 32, 48}, 20);
setVerbosity(QUDA_SILENT); setVerbosity(QUDA_SILENT);
benchmark_wilson(); benchmark_wilson({8, 12, 16, 24, 32, 48}, 20);
benchmark_dwf(); benchmark_dwf({8, 12, 16, 24, 32}, 20);
setVerbosity(QUDA_SUMMARIZE); setVerbosity(QUDA_SUMMARIZE);
printfQuda("==================== done with all benchmarks ====================\n"); printfQuda("==================== done with all benchmarks ====================\n");
if (!json_filename.empty())
{
printfQuda("writing benchmark results to %s\n", json_filename.c_str());
int me = 0;
MPI_Comm_rank(MPI_COMM_WORLD, &me);
if (me == 0)
{
std::ofstream json_file(json_filename);
json_file << std::setw(2) << json_results;
}
}
endQuda(); endQuda();
quda::comm_finalize(); quda::comm_finalize();
MPI_Finalize(); MPI_Finalize();