benchmark-quda #3

Open
simon.buerger wants to merge 16 commits from simon.buerger/lattice-benchmarks:benchmark-quda into main
Showing only changes of commit 0d588d065a - Show all commits

View File

@ -4,17 +4,22 @@
#include <cassert>
#include <color_spinor_field.h>
#include <dirac_quda.h>
#include <fstream>
#include <gauge_tools.h>
#include <memory>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
using namespace quda;
// remove to use QUDA's own flop counting instead of Grid's convention
#define FLOP_COUNTING_GRID
#include "json.hpp"
using nlohmann::json;
json json_results;
using namespace quda;
// This is the MPI grid, i.e. the layout of ranks
int nranks = -1;
std::array<int, 4> mpi_grid = {1, 1, 1, 1};
@ -43,6 +48,9 @@ void initComms(int argc, char **argv)
for (int d = 0; d < 4; d++)
if (mpi_grid[d] > 1)
commDimPartitionedSet(d);
json_results["geometry"]["ranks"] = nranks;
json_results["geometry"]["mpi"] = mpi_grid;
}
// creates a random gauge field. L = local(!) size
@ -149,9 +157,8 @@ ColorSpinorField make_source(int L, int Ls = 1)
return src;
}
void benchmark_wilson()
void benchmark_wilson(std::vector<int> const &L_list, int niter)
{
int niter = 20;
int niter_warmup = 10;
printfQuda("==================== wilson dirac operator ====================\n");
@ -163,7 +170,7 @@ void benchmark_wilson()
#endif
printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank");
for (int L : {8, 12, 16, 24, 32, 48})
for (int L : L_list)
{
auto U = make_gauge_field(L);
auto src = make_source(L);
@ -180,18 +187,18 @@ void benchmark_wilson()
// Not used for simple Wilson fermions)
dirac.updateFields(&U, nullptr, nullptr, nullptr);
auto tmp = ColorSpinorField(ColorSpinorParam(src));
auto res = ColorSpinorField(ColorSpinorParam(src));
// a couple of untimed iterations to warm up
for (int iter = 0; iter < niter_warmup; ++iter)
dirac.Dslash(tmp, src, QUDA_EVEN_PARITY);
dirac.Dslash(res, src, QUDA_EVEN_PARITY);
// actual benchmark with timings
dirac.Flops(); // reset flops counter
device_timer_t device_timer;
device_timer.start();
for (int iter = 0; iter < niter; ++iter)
dirac.Dslash(tmp, src, QUDA_EVEN_PARITY);
dirac.Dslash(res, src, QUDA_EVEN_PARITY);
device_timer.stop();
double secs = device_timer.last() / niter;
@ -209,12 +216,16 @@ void benchmark_wilson()
#endif
printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9);
json tmp;
tmp["L"] = L;
tmp["Gflops_wilson"] = flops / secs * 1e-9;
json_results["flops"]["results"].push_back(tmp);
}
}
void benchmark_dwf()
void benchmark_dwf(std::vector<int> const &L_list, int niter)
{
int niter = 20;
int niter_warmup = 10;
printfQuda("==================== domain wall dirac operator ====================\n");
@ -226,7 +237,7 @@ void benchmark_dwf()
#endif
printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank");
int Ls = 12;
for (int L : {8, 12, 16, 24})
for (int L : L_list)
{
auto U = make_gauge_field(L);
auto src = make_source(L, Ls);
@ -244,18 +255,18 @@ void benchmark_dwf()
// (the additional nullptr's are for smeared links and fancy preconditioners and such)
dirac.updateFields(&U, nullptr, nullptr, nullptr);
auto tmp = ColorSpinorField(ColorSpinorParam(src));
auto res = ColorSpinorField(ColorSpinorParam(src));
// a couple of untimed iterations to warm up
for (int iter = 0; iter < niter_warmup; ++iter)
dirac.Dslash(tmp, src, QUDA_EVEN_PARITY);
dirac.Dslash(res, src, QUDA_EVEN_PARITY);
// actual benchmark with timings
dirac.Flops(); // reset flops counter
device_timer_t device_timer;
device_timer.start();
for (int iter = 0; iter < niter; ++iter)
dirac.Dslash(tmp, src, QUDA_EVEN_PARITY);
dirac.Dslash(res, src, QUDA_EVEN_PARITY);
device_timer.stop();
double secs = device_timer.last() / niter;
@ -273,15 +284,18 @@ void benchmark_dwf()
#endif
printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9);
json tmp;
tmp["L"] = L;
tmp["Gflops_dwf4"] = flops / secs * 1e-9;
json_results["flops"]["results"].push_back(tmp);
}
}
void benchmark_axpy()
void benchmark_axpy(std::vector<int> const &L_list, int niter)
{
// number of iterations for warmup / measurement
// (feel free to change for noise/time tradeoff)
constexpr int niter_warmup = 10;
constexpr int niter = 20;
printfQuda("==================== axpy / memory ====================\n");
@ -305,7 +319,6 @@ void benchmark_axpy()
printfQuda("%5s %15s %15s %15s %15s\n", "L", "size (MiB/rank)", "time (usec)",
"GiB/s/rank", "Gflop/s/rank");
std::vector L_list = {8, 12, 16, 24, 32, 48};
for (int L : L_list)
{
// IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()`
@ -348,14 +361,29 @@ void benchmark_axpy()
blas::axpy(1.234, fieldA, fieldB);
device_timer.stop();
double secs = device_timer.last() / niter; // seconds per iteration
double mem_MiB = memory / 1024. / 1024.;
double GBps = mem_MiB / 1024 / secs;
printfQuda("%5d %15.2f %15.2f %15.2f %15.2f\n", L, mem_MiB, secs * 1e6, GBps,
flops / secs * 1e-9);
printfQuda("%5d %15.2f %15.2f %15.2f %15.2f\n", L, memory / 1024. / 1024., secs * 1e6,
memory / secs / 1024. / 1024. / 1024., flops / secs * 1e-9);
json tmp;
tmp["L"] = L;
tmp["size_MB"] = mem_MiB;
tmp["GBps"] = GBps;
tmp["GFlops"] = flops / secs * 1e-9;
json_results["axpy"].push_back(tmp);
}
}
int main(int argc, char **argv)
{
// Scan the command line for "--json-out <file>". An empty filename means
// that no JSON output is written.
std::string json_filename = ""; // empty indicates no json output
for (int i = 0; i < argc; i++)
{
  if (std::string(argv[i]) == "--json-out")
  {
    // The flag needs a value after it. Without this check a trailing
    // "--json-out" would read argv[argc], which is a null pointer, and
    // assigning a null char pointer to a std::string is undefined behavior.
    if (i + 1 < argc)
      json_filename = argv[i + 1];
    else
      fprintf(stderr, "--json-out requires a filename argument; no json output will be written\n");
  }
}
initComms(argc, argv);
initQuda(-1); // -1 for multi-gpu. otherwise this selects the device to be used
@ -367,14 +395,28 @@ int main(int argc, char **argv)
printfQuda("MPI layout = %d %d %d %d\n", mpi_grid[0], mpi_grid[1], mpi_grid[2],
mpi_grid[3]);
benchmark_axpy();
benchmark_axpy({8, 12, 16, 24, 32, 48}, 20);
setVerbosity(QUDA_SILENT);
benchmark_wilson();
benchmark_dwf();
benchmark_wilson({8, 12, 16, 24, 32, 48}, 20);
benchmark_dwf({8, 12, 16, 24, 32}, 20);
setVerbosity(QUDA_SUMMARIZE);
printfQuda("==================== done with all benchmarks ====================\n");
if (!json_filename.empty())
{
printfQuda("writing benchmark results to %s\n", json_filename.c_str());
int me = 0;
MPI_Comm_rank(MPI_COMM_WORLD, &me);
if (me == 0)
{
std::ofstream json_file(json_filename);
json_file << std::setw(2) << json_results;
}
}
endQuda();
quda::comm_finalize();
MPI_Finalize();