benchmark-quda #3
@ -4,17 +4,22 @@
|
||||
#include <cassert>
|
||||
#include <color_spinor_field.h>
|
||||
#include <dirac_quda.h>
|
||||
#include <fstream>
|
||||
#include <gauge_tools.h>
|
||||
#include <memory>
|
||||
#include <mpi.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
using namespace quda;
|
||||
|
||||
// remove this define to use QUDA's own flop counting instead of Grid's convention
|
||||
#define FLOP_COUNTING_GRID
|
||||
|
||||
#include "json.hpp"
|
||||
using nlohmann::json;
|
||||
json json_results;
|
||||
|
||||
using namespace quda;
|
||||
|
||||
// This is the MPI grid, i.e. the layout of ranks
|
||||
int nranks = -1;
|
||||
std::array<int, 4> mpi_grid = {1, 1, 1, 1};
|
||||
@ -43,6 +48,9 @@ void initComms(int argc, char **argv)
|
||||
for (int d = 0; d < 4; d++)
|
||||
if (mpi_grid[d] > 1)
|
||||
commDimPartitionedSet(d);
|
||||
|
||||
json_results["geometry"]["ranks"] = nranks;
|
||||
json_results["geometry"]["mpi"] = mpi_grid;
|
||||
}
|
||||
|
||||
// creates a random gauge field. L = local(!) size
|
||||
@ -149,9 +157,8 @@ ColorSpinorField make_source(int L, int Ls = 1)
|
||||
return src;
|
||||
}
|
||||
|
||||
void benchmark_wilson()
|
||||
void benchmark_wilson(std::vector<int> const &L_list, int niter)
|
||||
{
|
||||
int niter = 20;
|
||||
int niter_warmup = 10;
|
||||
|
||||
printfQuda("==================== wilson dirac operator ====================\n");
|
||||
@ -163,7 +170,7 @@ void benchmark_wilson()
|
||||
#endif
|
||||
printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank");
|
||||
|
||||
for (int L : {8, 12, 16, 24, 32, 48})
|
||||
for (int L : L_list)
|
||||
{
|
||||
auto U = make_gauge_field(L);
|
||||
auto src = make_source(L);
|
||||
@ -180,18 +187,18 @@ void benchmark_wilson()
|
||||
// Not used for simple Wilson fermions)
|
||||
dirac.updateFields(&U, nullptr, nullptr, nullptr);
|
||||
|
||||
auto tmp = ColorSpinorField(ColorSpinorParam(src));
|
||||
auto res = ColorSpinorField(ColorSpinorParam(src));
|
||||
|
||||
// run a couple of iterations without timing to warm up
|
||||
for (int iter = 0; iter < niter_warmup; ++iter)
|
||||
dirac.Dslash(tmp, src, QUDA_EVEN_PARITY);
|
||||
dirac.Dslash(res, src, QUDA_EVEN_PARITY);
|
||||
|
||||
// actual benchmark with timings
|
||||
dirac.Flops(); // reset flops counter
|
||||
device_timer_t device_timer;
|
||||
device_timer.start();
|
||||
for (int iter = 0; iter < niter; ++iter)
|
||||
dirac.Dslash(tmp, src, QUDA_EVEN_PARITY);
|
||||
dirac.Dslash(res, src, QUDA_EVEN_PARITY);
|
||||
device_timer.stop();
|
||||
|
||||
double secs = device_timer.last() / niter;
|
||||
@ -209,12 +216,16 @@ void benchmark_wilson()
|
||||
#endif
|
||||
|
||||
printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9);
|
||||
|
||||
json tmp;
|
||||
tmp["L"] = L;
|
||||
tmp["Gflops_wilson"] = flops / secs * 1e-9;
|
||||
json_results["flops"]["results"].push_back(tmp);
|
||||
}
|
||||
}
|
||||
|
||||
void benchmark_dwf()
|
||||
void benchmark_dwf(std::vector<int> const &L_list, int niter)
|
||||
{
|
||||
int niter = 20;
|
||||
int niter_warmup = 10;
|
||||
|
||||
printfQuda("==================== domain wall dirac operator ====================\n");
|
||||
@ -226,7 +237,7 @@ void benchmark_dwf()
|
||||
#endif
|
||||
printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank");
|
||||
int Ls = 12;
|
||||
for (int L : {8, 12, 16, 24})
|
||||
for (int L : L_list)
|
||||
{
|
||||
auto U = make_gauge_field(L);
|
||||
auto src = make_source(L, Ls);
|
||||
@ -244,18 +255,18 @@ void benchmark_dwf()
|
||||
// (the additional nullptr's are for smeared links and fancy preconditioners and such)
|
||||
dirac.updateFields(&U, nullptr, nullptr, nullptr);
|
||||
|
||||
auto tmp = ColorSpinorField(ColorSpinorParam(src));
|
||||
auto res = ColorSpinorField(ColorSpinorParam(src));
|
||||
|
||||
// run a couple of iterations without timing to warm up
|
||||
for (int iter = 0; iter < niter_warmup; ++iter)
|
||||
dirac.Dslash(tmp, src, QUDA_EVEN_PARITY);
|
||||
dirac.Dslash(res, src, QUDA_EVEN_PARITY);
|
||||
|
||||
// actual benchmark with timings
|
||||
dirac.Flops(); // reset flops counter
|
||||
device_timer_t device_timer;
|
||||
device_timer.start();
|
||||
for (int iter = 0; iter < niter; ++iter)
|
||||
dirac.Dslash(tmp, src, QUDA_EVEN_PARITY);
|
||||
dirac.Dslash(res, src, QUDA_EVEN_PARITY);
|
||||
device_timer.stop();
|
||||
|
||||
double secs = device_timer.last() / niter;
|
||||
@ -273,15 +284,18 @@ void benchmark_dwf()
|
||||
#endif
|
||||
|
||||
printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9);
|
||||
json tmp;
|
||||
tmp["L"] = L;
|
||||
tmp["Gflops_dwf4"] = flops / secs * 1e-9;
|
||||
json_results["flops"]["results"].push_back(tmp);
|
||||
}
|
||||
}
|
||||
|
||||
void benchmark_axpy()
|
||||
void benchmark_axpy(std::vector<int> const &L_list, int niter)
|
||||
{
|
||||
// number of iterations for warmup / measurement
|
||||
// (feel free to change for noise/time tradeoff)
|
||||
constexpr int niter_warmup = 10;
|
||||
constexpr int niter = 20;
|
||||
|
||||
printfQuda("==================== axpy / memory ====================\n");
|
||||
|
||||
@ -305,7 +319,6 @@ void benchmark_axpy()
|
||||
|
||||
printfQuda("%5s %15s %15s %15s %15s\n", "L", "size (MiB/rank)", "time (usec)",
|
||||
"GiB/s/rank", "Gflop/s/rank");
|
||||
std::vector L_list = {8, 12, 16, 24, 32, 48};
|
||||
for (int L : L_list)
|
||||
{
|
||||
// IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()`
|
||||
@ -348,14 +361,29 @@ void benchmark_axpy()
|
||||
blas::axpy(1.234, fieldA, fieldB);
|
||||
device_timer.stop();
|
||||
double secs = device_timer.last() / niter; // seconds per iteration
|
||||
double mem_MiB = memory / 1024. / 1024.;
|
||||
double GBps = mem_MiB / 1024 / secs;
|
||||
printfQuda("%5d %15.2f %15.2f %15.2f %15.2f\n", L, mem_MiB, secs * 1e6, GBps,
|
||||
flops / secs * 1e-9);
|
||||
|
||||
printfQuda("%5d %15.2f %15.2f %15.2f %15.2f\n", L, memory / 1024. / 1024., secs * 1e6,
|
||||
memory / secs / 1024. / 1024. / 1024., flops / secs * 1e-9);
|
||||
json tmp;
|
||||
tmp["L"] = L;
|
||||
tmp["size_MB"] = mem_MiB;
|
||||
tmp["GBps"] = GBps;
|
||||
tmp["GFlops"] = flops / secs * 1e-9;
|
||||
json_results["axpy"].push_back(tmp);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
std::string json_filename = ""; // empty indicates no json output
|
||||
for (int i = 0; i < argc; i++)
|
||||
{
|
||||
if (std::string(argv[i]) == "--json-out")
|
||||
json_filename = argv[i + 1];
|
||||
}
|
||||
|
||||
initComms(argc, argv);
|
||||
|
||||
initQuda(-1); // -1 for multi-gpu. otherwise this selects the device to be used
|
||||
@ -367,14 +395,28 @@ int main(int argc, char **argv)
|
||||
printfQuda("MPI layout = %d %d %d %d\n", mpi_grid[0], mpi_grid[1], mpi_grid[2],
|
||||
mpi_grid[3]);
|
||||
|
||||
benchmark_axpy();
|
||||
benchmark_axpy({8, 12, 16, 24, 32, 48}, 20);
|
||||
|
||||
setVerbosity(QUDA_SILENT);
|
||||
benchmark_wilson();
|
||||
benchmark_dwf();
|
||||
benchmark_wilson({8, 12, 16, 24, 32, 48}, 20);
|
||||
benchmark_dwf({8, 12, 16, 24, 32}, 20);
|
||||
setVerbosity(QUDA_SUMMARIZE);
|
||||
|
||||
printfQuda("==================== done with all benchmarks ====================\n");
|
||||
|
||||
if (!json_filename.empty())
|
||||
{
|
||||
printfQuda("writing benchmark results to %s\n", json_filename.c_str());
|
||||
|
||||
int me = 0;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
||||
if (me == 0)
|
||||
{
|
||||
std::ofstream json_file(json_filename);
|
||||
json_file << std::setw(2) << json_results;
|
||||
}
|
||||
}
|
||||
|
||||
endQuda();
|
||||
quda::comm_finalize();
|
||||
MPI_Finalize();
|
||||
|
Loading…
Reference in New Issue
Block a user