benchmark-quda #3
@ -4,17 +4,22 @@
|
|||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <color_spinor_field.h>
|
#include <color_spinor_field.h>
|
||||||
#include <dirac_quda.h>
|
#include <dirac_quda.h>
|
||||||
|
#include <fstream>
|
||||||
#include <gauge_tools.h>
|
#include <gauge_tools.h>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <mpi.h>
|
#include <mpi.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
|
||||||
using namespace quda;
|
|
||||||
|
|
||||||
// remove to use QUDA's own flop counting instead of Grid's convention
|
// remove to use QUDA's own flop counting instead of Grid's convention
|
||||||
#define FLOP_COUNTING_GRID
|
#define FLOP_COUNTING_GRID
|
||||||
|
|
||||||
|
#include "json.hpp"
|
||||||
|
using nlohmann::json;
|
||||||
|
json json_results;
|
||||||
|
|
||||||
|
using namespace quda;
|
||||||
|
|
||||||
// This is the MPI grid, i.e. the layout of ranks
|
// This is the MPI grid, i.e. the layout of ranks
|
||||||
int nranks = -1;
|
int nranks = -1;
|
||||||
std::array<int, 4> mpi_grid = {1, 1, 1, 1};
|
std::array<int, 4> mpi_grid = {1, 1, 1, 1};
|
||||||
@ -43,6 +48,9 @@ void initComms(int argc, char **argv)
|
|||||||
for (int d = 0; d < 4; d++)
|
for (int d = 0; d < 4; d++)
|
||||||
if (mpi_grid[d] > 1)
|
if (mpi_grid[d] > 1)
|
||||||
commDimPartitionedSet(d);
|
commDimPartitionedSet(d);
|
||||||
|
|
||||||
|
json_results["geometry"]["ranks"] = nranks;
|
||||||
|
json_results["geometry"]["mpi"] = mpi_grid;
|
||||||
}
|
}
|
||||||
|
|
||||||
// creates a random gauge field. L = local(!) size
|
// creates a random gauge field. L = local(!) size
|
||||||
@ -149,9 +157,8 @@ ColorSpinorField make_source(int L, int Ls = 1)
|
|||||||
return src;
|
return src;
|
||||||
}
|
}
|
||||||
|
|
||||||
void benchmark_wilson()
|
void benchmark_wilson(std::vector<int> const &L_list, int niter)
|
||||||
{
|
{
|
||||||
int niter = 20;
|
|
||||||
int niter_warmup = 10;
|
int niter_warmup = 10;
|
||||||
|
|
||||||
printfQuda("==================== wilson dirac operator ====================\n");
|
printfQuda("==================== wilson dirac operator ====================\n");
|
||||||
@ -163,7 +170,7 @@ void benchmark_wilson()
|
|||||||
#endif
|
#endif
|
||||||
printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank");
|
printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank");
|
||||||
|
|
||||||
for (int L : {8, 12, 16, 24, 32, 48})
|
for (int L : L_list)
|
||||||
{
|
{
|
||||||
auto U = make_gauge_field(L);
|
auto U = make_gauge_field(L);
|
||||||
auto src = make_source(L);
|
auto src = make_source(L);
|
||||||
@ -180,18 +187,18 @@ void benchmark_wilson()
|
|||||||
// Not used for simple Wilson fermions)
|
// Not used for simple Wilson fermions)
|
||||||
dirac.updateFields(&U, nullptr, nullptr, nullptr);
|
dirac.updateFields(&U, nullptr, nullptr, nullptr);
|
||||||
|
|
||||||
auto tmp = ColorSpinorField(ColorSpinorParam(src));
|
auto res = ColorSpinorField(ColorSpinorParam(src));
|
||||||
|
|
||||||
// couple iterations without timing to warm up
|
// couple iterations without timing to warm up
|
||||||
for (int iter = 0; iter < niter_warmup; ++iter)
|
for (int iter = 0; iter < niter_warmup; ++iter)
|
||||||
dirac.Dslash(tmp, src, QUDA_EVEN_PARITY);
|
dirac.Dslash(res, src, QUDA_EVEN_PARITY);
|
||||||
|
|
||||||
// actual benchmark with timings
|
// actual benchmark with timings
|
||||||
dirac.Flops(); // reset flops counter
|
dirac.Flops(); // reset flops counter
|
||||||
device_timer_t device_timer;
|
device_timer_t device_timer;
|
||||||
device_timer.start();
|
device_timer.start();
|
||||||
for (int iter = 0; iter < niter; ++iter)
|
for (int iter = 0; iter < niter; ++iter)
|
||||||
dirac.Dslash(tmp, src, QUDA_EVEN_PARITY);
|
dirac.Dslash(res, src, QUDA_EVEN_PARITY);
|
||||||
device_timer.stop();
|
device_timer.stop();
|
||||||
|
|
||||||
double secs = device_timer.last() / niter;
|
double secs = device_timer.last() / niter;
|
||||||
@ -209,12 +216,16 @@ void benchmark_wilson()
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9);
|
printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9);
|
||||||
|
|
||||||
|
json tmp;
|
||||||
|
tmp["L"] = L;
|
||||||
|
tmp["Gflops_wilson"] = flops / secs * 1e-9;
|
||||||
|
json_results["flops"]["results"].push_back(tmp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void benchmark_dwf()
|
void benchmark_dwf(std::vector<int> const &L_list, int niter)
|
||||||
{
|
{
|
||||||
int niter = 20;
|
|
||||||
int niter_warmup = 10;
|
int niter_warmup = 10;
|
||||||
|
|
||||||
printfQuda("==================== domain wall dirac operator ====================\n");
|
printfQuda("==================== domain wall dirac operator ====================\n");
|
||||||
@ -226,7 +237,7 @@ void benchmark_dwf()
|
|||||||
#endif
|
#endif
|
||||||
printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank");
|
printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank");
|
||||||
int Ls = 12;
|
int Ls = 12;
|
||||||
for (int L : {8, 12, 16, 24})
|
for (int L : L_list)
|
||||||
{
|
{
|
||||||
auto U = make_gauge_field(L);
|
auto U = make_gauge_field(L);
|
||||||
auto src = make_source(L, Ls);
|
auto src = make_source(L, Ls);
|
||||||
@ -244,18 +255,18 @@ void benchmark_dwf()
|
|||||||
// (the additional nullptr's are for smeared links and fancy preconditioners and such)
|
// (the additional nullptr's are for smeared links and fancy preconditioners and such)
|
||||||
dirac.updateFields(&U, nullptr, nullptr, nullptr);
|
dirac.updateFields(&U, nullptr, nullptr, nullptr);
|
||||||
|
|
||||||
auto tmp = ColorSpinorField(ColorSpinorParam(src));
|
auto res = ColorSpinorField(ColorSpinorParam(src));
|
||||||
|
|
||||||
// couple iterations without timing to warm up
|
// couple iterations without timing to warm up
|
||||||
for (int iter = 0; iter < niter_warmup; ++iter)
|
for (int iter = 0; iter < niter_warmup; ++iter)
|
||||||
dirac.Dslash(tmp, src, QUDA_EVEN_PARITY);
|
dirac.Dslash(res, src, QUDA_EVEN_PARITY);
|
||||||
|
|
||||||
// actual benchmark with timings
|
// actual benchmark with timings
|
||||||
dirac.Flops(); // reset flops counter
|
dirac.Flops(); // reset flops counter
|
||||||
device_timer_t device_timer;
|
device_timer_t device_timer;
|
||||||
device_timer.start();
|
device_timer.start();
|
||||||
for (int iter = 0; iter < niter; ++iter)
|
for (int iter = 0; iter < niter; ++iter)
|
||||||
dirac.Dslash(tmp, src, QUDA_EVEN_PARITY);
|
dirac.Dslash(res, src, QUDA_EVEN_PARITY);
|
||||||
device_timer.stop();
|
device_timer.stop();
|
||||||
|
|
||||||
double secs = device_timer.last() / niter;
|
double secs = device_timer.last() / niter;
|
||||||
@ -273,15 +284,18 @@ void benchmark_dwf()
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9);
|
printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9);
|
||||||
|
json tmp;
|
||||||
|
tmp["L"] = L;
|
||||||
|
tmp["Gflops_dwf4"] = flops / secs * 1e-9;
|
||||||
|
json_results["flops"]["results"].push_back(tmp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void benchmark_axpy()
|
void benchmark_axpy(std::vector<int> const &L_list, int niter)
|
||||||
{
|
{
|
||||||
// number of iterations for warmup / measurement
|
// number of iterations for warmup / measurement
|
||||||
// (feel free to change for noise/time tradeoff)
|
// (feel free to change for noise/time tradeoff)
|
||||||
constexpr int niter_warmup = 10;
|
constexpr int niter_warmup = 10;
|
||||||
constexpr int niter = 20;
|
|
||||||
|
|
||||||
printfQuda("==================== axpy / memory ====================\n");
|
printfQuda("==================== axpy / memory ====================\n");
|
||||||
|
|
||||||
@ -305,7 +319,6 @@ void benchmark_axpy()
|
|||||||
|
|
||||||
printfQuda("%5s %15s %15s %15s %15s\n", "L", "size (MiB/rank)", "time (usec)",
|
printfQuda("%5s %15s %15s %15s %15s\n", "L", "size (MiB/rank)", "time (usec)",
|
||||||
"GiB/s/rank", "Gflop/s/rank");
|
"GiB/s/rank", "Gflop/s/rank");
|
||||||
std::vector L_list = {8, 12, 16, 24, 32, 48};
|
|
||||||
for (int L : L_list)
|
for (int L : L_list)
|
||||||
{
|
{
|
||||||
// IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()`
|
// IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()`
|
||||||
@ -348,14 +361,29 @@ void benchmark_axpy()
|
|||||||
blas::axpy(1.234, fieldA, fieldB);
|
blas::axpy(1.234, fieldA, fieldB);
|
||||||
device_timer.stop();
|
device_timer.stop();
|
||||||
double secs = device_timer.last() / niter; // seconds per iteration
|
double secs = device_timer.last() / niter; // seconds per iteration
|
||||||
|
double mem_MiB = memory / 1024. / 1024.;
|
||||||
|
double GBps = mem_MiB / 1024 / secs;
|
||||||
|
printfQuda("%5d %15.2f %15.2f %15.2f %15.2f\n", L, mem_MiB, secs * 1e6, GBps,
|
||||||
|
flops / secs * 1e-9);
|
||||||
|
|
||||||
printfQuda("%5d %15.2f %15.2f %15.2f %15.2f\n", L, memory / 1024. / 1024., secs * 1e6,
|
json tmp;
|
||||||
memory / secs / 1024. / 1024. / 1024., flops / secs * 1e-9);
|
tmp["L"] = L;
|
||||||
|
tmp["size_MB"] = mem_MiB;
|
||||||
|
tmp["GBps"] = GBps;
|
||||||
|
tmp["GFlops"] = flops / secs * 1e-9;
|
||||||
|
json_results["axpy"].push_back(tmp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
|
std::string json_filename = ""; // empty indicates no json output
|
||||||
|
for (int i = 0; i < argc; i++)
|
||||||
|
{
|
||||||
|
if (std::string(argv[i]) == "--json-out")
|
||||||
|
json_filename = argv[i + 1];
|
||||||
|
}
|
||||||
|
|
||||||
initComms(argc, argv);
|
initComms(argc, argv);
|
||||||
|
|
||||||
initQuda(-1); // -1 for multi-gpu. otherwise this selects the device to be used
|
initQuda(-1); // -1 for multi-gpu. otherwise this selects the device to be used
|
||||||
@ -367,14 +395,28 @@ int main(int argc, char **argv)
|
|||||||
printfQuda("MPI layout = %d %d %d %d\n", mpi_grid[0], mpi_grid[1], mpi_grid[2],
|
printfQuda("MPI layout = %d %d %d %d\n", mpi_grid[0], mpi_grid[1], mpi_grid[2],
|
||||||
mpi_grid[3]);
|
mpi_grid[3]);
|
||||||
|
|
||||||
benchmark_axpy();
|
benchmark_axpy({8, 12, 16, 24, 32, 48}, 20);
|
||||||
|
|
||||||
setVerbosity(QUDA_SILENT);
|
setVerbosity(QUDA_SILENT);
|
||||||
benchmark_wilson();
|
benchmark_wilson({8, 12, 16, 24, 32, 48}, 20);
|
||||||
benchmark_dwf();
|
benchmark_dwf({8, 12, 16, 24, 32}, 20);
|
||||||
setVerbosity(QUDA_SUMMARIZE);
|
setVerbosity(QUDA_SUMMARIZE);
|
||||||
|
|
||||||
printfQuda("==================== done with all benchmarks ====================\n");
|
printfQuda("==================== done with all benchmarks ====================\n");
|
||||||
|
|
||||||
|
if (!json_filename.empty())
|
||||||
|
{
|
||||||
|
printfQuda("writing benchmark results to %s\n", json_filename.c_str());
|
||||||
|
|
||||||
|
int me = 0;
|
||||||
|
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
||||||
|
if (me == 0)
|
||||||
|
{
|
||||||
|
std::ofstream json_file(json_filename);
|
||||||
|
json_file << std::setw(2) << json_results;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
endQuda();
|
endQuda();
|
||||||
quda::comm_finalize();
|
quda::comm_finalize();
|
||||||
MPI_Finalize();
|
MPI_Finalize();
|
||||||
|
Loading…
Reference in New Issue
Block a user