diff --git a/Grid/Benchmark_Grid.cpp b/Grid/Benchmark_Grid.cpp
index 59bbdc2..d27707c 100644
--- a/Grid/Benchmark_Grid.cpp
+++ b/Grid/Benchmark_Grid.cpp
@@ -73,6 +73,8 @@ class Benchmark
         {local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]});
     GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid(
         latt4, GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
+    Grid::Coordinate shm;
+    GlobalSharedMemory::GetShmDims(mpi, shm);
     uint64_t NP = TmpGrid->RankCount();
     uint64_t NN = TmpGrid->NodeCount();
@@ -85,7 +87,9 @@ class Benchmark
     std::cout << GridLogMessage << "* OpenMP threads : " << GridThread::GetThreads()
               << std::endl;
-    std::cout << GridLogMessage << "* MPI tasks : " << GridCmdVectorIntToString(mpi)
+    std::cout << GridLogMessage << "* MPI layout : " << GridCmdVectorIntToString(mpi)
+              << std::endl;
+    std::cout << GridLogMessage << "* Shm layout : " << GridCmdVectorIntToString(shm)
               << std::endl;
     std::cout << GridLogMessage << "* vReal : " << sizeof(vReal) * 8 << "bits ; "
@@ -118,6 +122,7 @@ class Benchmark
     for (unsigned int i = 0; i < mpi.size(); ++i)
     {
       tmp["mpi"].push_back(mpi[i]);
+      tmp["shm"].push_back(shm[i]);
     }
     tmp["ranks"] = NP;
     tmp["nodes"] = NN;
@@ -132,6 +137,8 @@ class Benchmark
     Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
     Coordinate mpi_layout = GridDefaultMpi();
+    Coordinate shm_layout;
+    GlobalSharedMemory::GetShmDims(mpi_layout, shm_layout);
     for (int mu = 0; mu < Nd; mu++)
       if (mpi_layout[mu] > 1)
@@ -143,8 +150,8 @@ class Benchmark
     std::cout << GridLogMessage << "Benchmarking threaded STENCIL halo exchange in "
               << nmu << " dimensions" << std::endl;
     grid_small_sep();
-    grid_printf("%5s %5s %15s %15s %15s %15s %15s\n", "L", "dir", "payload (B)",
-                "time (usec)", "rate (GB/s/node)", "std dev", "max");
+    grid_printf("%5s %5s %7s %15s %15s %15s %15s %15s\n", "L", "dir", "shm",
+                "payload (B)", "time (usec)", "rate (GB/s/node)", "std dev", "max");
     for (int lat = 16; lat <= maxlat; lat += 8)
     {
@@ -173,74 +180,80 @@ class Benchmark
       for (int dir = 0; dir < 8; dir++)
       {
         int mu = dir % 4;
-        if (mpi_layout[mu] > 1)
-        {
-          std::vector<double> times(Nloop);
-          for (int i = 0; i < NWARMUP; i++)
-          {
-            int xmit_to_rank;
-            int recv_from_rank;
-
-            if (dir == mu)
-            {
-              int comm_proc = 1;
-              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
-            }
-            else
-            {
-              int comm_proc = mpi_layout[mu] - 1;
-              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
-            }
-            Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
-                                (void *)&rbuf[dir][0], recv_from_rank, bytes);
-          }
-          for (int i = 0; i < Nloop; i++)
-          {
-
-            dbytes = 0;
-            double start = usecond();
-            int xmit_to_rank;
-            int recv_from_rank;
-
-            if (dir == mu)
-            {
-              int comm_proc = 1;
-              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
-            }
-            else
-            {
-              int comm_proc = mpi_layout[mu] - 1;
-              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
-            }
-            Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
-                                (void *)&rbuf[dir][0], recv_from_rank, bytes);
-            dbytes += bytes;
-
-            double stop = usecond();
-            t_time[i] = stop - start; // microseconds
-          }
-          timestat.statistics(t_time);
-
-          dbytes = dbytes * ppn;
-          double bidibytes = 2. * dbytes;
-          double rate = bidibytes / (timestat.mean / 1.e6) / 1024. / 1024. / 1024.;
-          double rate_err = rate * timestat.err / timestat.mean;
-          double rate_max = rate * timestat.mean / timestat.min;
-          grid_printf("%5d %5d %15d %15.2f %15.2f %15.1f %15.2f\n", lat, dir, bytes,
-                      timestat.mean, rate, rate_err, rate_max);
-          nlohmann::json tmp;
-          nlohmann::json tmp_rate;
-          tmp["L"] = lat;
-          tmp["dir"] = dir;
-          tmp["bytes"] = bytes;
-          tmp["time_usec"] = timestat.mean;
-          tmp_rate["mean"] = rate;
-          tmp_rate["error"] = rate_err;
-          tmp_rate["max"] = rate_max;
-          tmp["rate_GBps"] = tmp_rate;
-          json_results["comms"].push_back(tmp);
-        }
+        if (mpi_layout[mu] == 1) // skip directions that are not distributed
+          continue;
+        bool is_shm = mpi_layout[mu] == shm_layout[mu];
+        bool is_partial_shm = !is_shm && shm_layout[mu] != 1;
+
+        std::vector<double> times(Nloop);
+        for (int i = 0; i < NWARMUP; i++)
+        {
+          int xmit_to_rank;
+          int recv_from_rank;
+
+          if (dir == mu)
+          {
+            int comm_proc = 1;
+            Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
+          }
+          else
+          {
+            int comm_proc = mpi_layout[mu] - 1;
+            Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
+          }
+          Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, (void *)&rbuf[dir][0],
+                              recv_from_rank, bytes);
+        }
+        for (int i = 0; i < Nloop; i++)
+        {
+          dbytes = 0;
+          double start = usecond();
+          int xmit_to_rank;
+          int recv_from_rank;
+
+          if (dir == mu)
+          {
+            int comm_proc = 1;
+            Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
+          }
+          else
+          {
+            int comm_proc = mpi_layout[mu] - 1;
+            Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
+          }
+          Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, (void *)&rbuf[dir][0],
+                              recv_from_rank, bytes);
+          dbytes += bytes;
+
+          double stop = usecond();
+          t_time[i] = stop - start; // microseconds
+        }
+        timestat.statistics(t_time);
+
+        dbytes = dbytes * ppn;
+        double bidibytes = 2. * dbytes;
+        double rate = bidibytes / (timestat.mean / 1.e6) / 1024. / 1024. / 1024.;
+        double rate_err = rate * timestat.err / timestat.mean;
+        double rate_max = rate * timestat.mean / timestat.min;
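+        // rate: send+receive volume per iteration (2 * dbytes), summed over the ppn
+        // ranks of a node and divided by the mean iteration time, reported in GiB/s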
+        grid_printf("%5d %5d %7s %15d %15.2f %15.2f %15.1f %15.2f\n", lat, dir,
+                    is_shm ? "yes" : is_partial_shm ? "partial" : "no",
+                    bytes, timestat.mean, rate, rate_err, rate_max);
+        nlohmann::json tmp;
+        nlohmann::json tmp_rate;
+        tmp["L"] = lat;
+        tmp["dir"] = dir;
+        tmp["shared_mem"] = is_shm;
+        tmp["partial_shared_mem"] = is_partial_shm;
+        tmp["bytes"] = bytes;
+        tmp["time_usec"] = timestat.mean;
+        tmp_rate["mean"] = rate;
+        tmp_rate["error"] = rate_err;
+        tmp_rate["max"] = rate_max;
+        tmp["rate_GBps"] = tmp_rate;
+        json_results["comms"].push_back(tmp);
       }
       for (int d = 0; d < 8; d++)
      {
diff --git a/Quda/.clang-format b/Quda/.clang-format
new file mode 100644
index 0000000..9d54a25
--- /dev/null
+++ b/Quda/.clang-format
@@ -0,0 +1,14 @@
+{
+  BasedOnStyle: LLVM,
+  UseTab: Never,
+  IndentWidth: 2,
+  TabWidth: 2,
+  BreakBeforeBraces: Allman,
+  AllowShortIfStatementsOnASingleLine: false,
+  IndentCaseLabels: false,
+  ColumnLimit: 90,
+  AccessModifierOffset: -4,
+  NamespaceIndentation: All,
+  FixNamespaceComments: false,
+  SortIncludes: true,
+}
diff --git a/Quda/Benchmark_Quda.cpp b/Quda/Benchmark_Quda.cpp
new file mode 100644
index 0000000..7421652
--- /dev/null
+++ b/Quda/Benchmark_Quda.cpp
@@ -0,0 +1,458 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+// remove to use QUDA's own flop counting instead of Grid's convention
+#define FLOP_COUNTING_GRID
+
+#include "json.hpp"
+using nlohmann::json;
+json json_results;
+
+using namespace quda;
+
+// thanks chatGPT :)
+std::string get_timestamp()
+{
+  // Get the current time
+  auto now = std::chrono::system_clock::now();
+
+  // Convert the current time to a time_t object
+  std::time_t currentTime = std::chrono::system_clock::to_time_t(now);
+
+  // Format the time using std::put_time
+  std::stringstream ss;
+  ss << std::put_time(std::localtime(&currentTime), "%Y%m%d %H:%M:%S");
+
+  return ss.str();
+}
+
+// This is the MPI grid, i.e. the layout of ranks
+int nranks = -1;
+std::array<int, 4> mpi_grid = {1, 1, 1, 1};
+
+// run f() in a loop for roughly target_time seconds
+// returns seconds per iteration it took
+template <class F> double bench(F const &f, double target_time, int niter_warmup = 5)
+{
+  device_timer_t timer;
+  timer.start();
+  for (int iter = 0; iter < niter_warmup; ++iter)
+    f();
+  timer.stop();
+
+  double secs = timer.last() / niter_warmup;
+  int niter = std::max(1, int(target_time / secs));
+  // niter = std::min(1000, niter);
+  // printfQuda("during warmup took %f s/iter, deciding on %d iters\n", secs, niter);
+
+  // important: each rank has its own timer, so their measurements can slightly vary. But
+  // 'niter' needs to be consistent (bug took me a couple hours to track down)
+  comm_broadcast_global(&niter, sizeof(niter), 0);
+
+  timer.reset(__FUNCTION__, __FILE__, __LINE__);
+  timer.start();
+  for (int iter = 0; iter < niter; ++iter)
+    f();
+  timer.stop();
+  return timer.last() / niter;
+}
+
+void initComms(int argc, char **argv)
+{
+  // init MPI communication
+  MPI_Init(&argc, &argv);
+
+  MPI_Comm_size(MPI_COMM_WORLD, &nranks);
+  assert(1 <= nranks && nranks <= 100000);
+
+  mpi_grid[3] = nranks;
+
+  // this maps coordinates to rank number
+  auto lex_rank_from_coords = [](int const *coords, void *)
+  {
+    int rank = coords[0];
+    for (int i = 1; i < 4; i++)
+      rank = mpi_grid[i] * rank + coords[i];
+    return rank;
+  };
+
+  initCommsGridQuda(4, mpi_grid.data(), lex_rank_from_coords, nullptr);
+
+  for (int d = 0; d < 4; d++)
+    if (mpi_grid[d] > 1)
+      commDimPartitionedSet(d);
+
+  json_results["geometry"]["ranks"] = nranks;
+  json_results["geometry"]["mpi"] = mpi_grid;
+}
+
+// creates a random gauge field. L = local(!) size
+cudaGaugeField make_gauge_field(int L)
+{
+  GaugeFieldParam param;
+
+  // dimension and type of the lattice object
+  param.nDim = 4;
+  param.x[0] = L;
+  param.x[1] = L;
+  param.x[2] = L;
+  param.x[3] = L;
+
+  // number of colors. Potentially confusingly, QUDA sometimes uses the word "color" for
+  // things unrelated to physical color. Things like "nColor=32" do pop up in deflation
+  // solvers, where it (to my understanding) refers to the number of deflation vectors
+  // processed in parallel.
+  param.nColor = 3;
+
+  // boundary conditions (don't really matter for a benchmark)
+  param.t_boundary = QUDA_PERIODIC_T;
+
+  // for this benchmark we only need "SINGLE" and/or "DOUBLE" precision. But smaller
+  // precisions are available in QUDA too
+  param.setPrecision(QUDA_SINGLE_PRECISION);
+
+  // no even/odd subset, we want a full lattice
+  param.siteSubset = QUDA_FULL_SITE_SUBSET;
+
+  // what kind of 3x3 matrices the field contains. A proper gauge field has SU(3)
+  // matrices, but (for example) smeared/thick links could have non-unitary links.
+  param.link_type = QUDA_SU3_LINKS;
+
+  // "NULL" does not initialize the field upon creation, "ZERO" would set everything to 0
+  param.create = QUDA_NULL_FIELD_CREATE;
+
+  // field should be allocated directly on the accelerator/GPU
+  param.location = QUDA_CUDA_FIELD_LOCATION;
+
+  // "reconstruct" here means reconstructing a SU(3) matrix from fewer than 18 real
+  // numbers (=3x3 complex numbers). Great feature in production (saving
+  // memory/cache/network bandwidth), not used for this benchmark.
+  param.reconstruct = QUDA_RECONSTRUCT_NO;
+
+  // "ghostExchange" would often be called "halo exchange" outside of QUDA. This has
+  // nothing to do with ghost fields from continuum/perturbative QCD.
+  param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
+
+  // This controls the physical order of elements. "float2" is the default.
+  param.order = QUDA_FLOAT2_GAUGE_ORDER;
+
+  // this means the field is a LORENTZ vector (which a gauge field must be). Has nothing
+  // to do with spin.
+  param.geometry = QUDA_VECTOR_GEOMETRY;
+
+  // create the field and fill with random SU(3) matrices
+  // std::cout << param << std::endl; // double-check parameters
+  auto U = cudaGaugeField(param);
+  gaugeGauss(U, /*seed=*/1234, 1.0);
+  return U;
+}
+
+// create a random source vector (L = local size)
+ColorSpinorField make_source(int L, int Ls = 1)
+{
+  // NOTE: `param.x` directly determines the size of the (local, per rank) memory
+  // allocation. Thus for checkerboarding, we have to specify x=(L/2,L,L,L) to get a
+  // physical local volume of L^4, implicitly choosing a dimension for the
+  // checkerboarding (which one shouldn't really matter, of course).
+  ColorSpinorParam param;
+  param.nColor = 3;
+  param.nSpin = 4;
+  param.nVec = 1; // only a single vector
+  param.pad = 0;
+  param.siteSubset = QUDA_PARITY_SITE_SUBSET;
+  param.nDim = Ls == 1 ? 4 : 5;
+  param.x[0] = L / 2;
+  param.x[1] = L;
+  param.x[2] = L;
+  param.x[3] = L;
+  param.x[4] = Ls;
+  param.pc_type = QUDA_4D_PC;
+  param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
+
+  // somewhat surprisingly, the DiracWilson::Dslash(...) function only works with the
+  // UKQCD_GAMMA_BASIS
+  param.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
+
+  param.create = QUDA_NULL_FIELD_CREATE; // do not (zero-) initialize the field
+  param.setPrecision(QUDA_SINGLE_PRECISION);
+  param.location = QUDA_CUDA_FIELD_LOCATION;
+
+  // create the field and fill it with random values
+  auto src = ColorSpinorField(param);
+  quda::RNG rng(src, 1234);
+  spinorNoise(src, rng, QUDA_NOISE_GAUSS);
+  /*printfQuda(
+      "created src with norm = %f (sanity check: should be close to %f) and %f bytes\n",
+      blas::norm2(src), 2.0 * 12 * geom[0] * geom[1] * geom[2] * geom[3],
+      src.Bytes() * 1.0);*/
+  // src.PrintDims();
+
+  return src;
+}
+
+void benchmark_wilson(std::vector<int> const &L_list, double target_time)
+{
+  printfQuda("==================== wilson dirac operator ====================\n");
+#ifdef FLOP_COUNTING_GRID
+  printfQuda("IMPORTANT: flop counting as in Benchmark_Grid\n");
+#else
+  printfQuda("IMPORTANT: flop counting by QUDA's own convention (different from "
+             "Benchmark_Grid)\n");
+#endif
+  printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank");
+
+  for (int L : L_list)
+  {
+    // printfQuda("starting wilson L=%d\n", L);
+
+    auto U = make_gauge_field(L);
+    auto src = make_source(L);
+
+    // create (Wilson) dirac operator
+    DiracParam param;
+    param.kappa = 0.10;
+    param.dagger = QUDA_DAG_NO;
+    param.matpcType = QUDA_MATPC_EVEN_EVEN;
+    auto dirac = DiracWilson(param);
+
+    // insert gauge field into the dirac operator
+    // (the additional nullptr's are for smeared links and fancy preconditioners and such.
+    // Not used for simple Wilson fermions)
+    dirac.updateFields(&U, nullptr, nullptr, nullptr);
+    auto res = ColorSpinorField(ColorSpinorParam(src));
+    auto f = [&]() { dirac.Dslash(res, src, QUDA_EVEN_PARITY); };
+
+    // first run to get the quda tuning out of the way
+    dirac.Flops(); // reset flops counter
+    f();
+    double flops = 1.0 * dirac.Flops();
+
+    // actual benchmarking
+    auto start_time = get_timestamp();
+    double secs = bench(f, target_time);
+    auto end_time = get_timestamp();
+
+#ifdef FLOP_COUNTING_GRID
+    // this is the flop counting from Benchmark_Grid
+    double Nc = 3;
+    double Nd = 4;
+    double Ns = 4;
+    flops = (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2);
+    flops *= L * L * L * L / 2.0;
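+    // (for Nc=3, Ns=4, Nd=4 the formula above gives 1344 flops per site, applied to the
+    // L^4/2 sites of a single parity)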
+#endif
+
+    printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9);
+
+    json tmp;
+    tmp["L"] = L;
+    tmp["Gflops_wilson"] = flops / secs * 1e-9;
+    tmp["start_time"] = start_time;
+    tmp["end_time"] = end_time;
+    json_results["flops"]["results"].push_back(tmp);
+  }
+}
+
+void benchmark_dwf(std::vector<int> const &L_list, double target_time)
+{
+  printfQuda("==================== domain wall dirac operator ====================\n");
+#ifdef FLOP_COUNTING_GRID
+  printfQuda("IMPORTANT: flop counting as in Benchmark_Grid\n");
+#else
+  printfQuda("IMPORTANT: flop counting by QUDA's own convention (different from "
+             "Benchmark_Grid)\n");
+#endif
+  printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank");
+  int Ls = 12;
+  for (int L : L_list)
+  {
+    // printfQuda("starting dwf L=%d\n", L);
+    auto U = make_gauge_field(L);
+    auto src = make_source(L, Ls);
+
+    // create dirac operator
+    DiracParam param;
+    param.kappa = 0.10;
+    param.Ls = Ls;
+    param.m5 = 0.1;
+    param.dagger = QUDA_DAG_NO;
+    param.matpcType = QUDA_MATPC_EVEN_EVEN;
+    auto dirac = DiracDomainWall(param);
+
+    // insert gauge field into the dirac operator
+    // (the additional nullptr's are for smeared links and fancy preconditioners and such)
+    dirac.updateFields(&U, nullptr, nullptr, nullptr);
+    auto res = ColorSpinorField(ColorSpinorParam(src));
+    auto f = [&]() { dirac.Dslash(res, src, QUDA_EVEN_PARITY); };
+
+    // first run to get the quda tuning out of the way
+    dirac.Flops(); // reset flops counter
+    f();
+    double flops = 1.0 * dirac.Flops();
+
+    // actual benchmarking
+    auto start_time = get_timestamp();
+    double secs = bench(f, target_time);
+    auto end_time = get_timestamp();
+
+#ifdef FLOP_COUNTING_GRID
+    // this is the flop counting from Benchmark_Grid
+    double Nc = 3;
+    double Nd = 4;
+    double Ns = 4;
+    flops = (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2);
+    flops *= L * L * L * L * Ls / 2.0;
+#endif
+
+    printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9);
+    json tmp;
+    tmp["L"] = L;
+    tmp["Gflops_dwf4"] = flops / secs * 1e-9;
+    tmp["start_time"] = start_time;
+    tmp["end_time"] = end_time;
+    json_results["flops"]["results"].push_back(tmp);
+  }
+}
+
+void benchmark_axpy(std::vector<int> const &L_list, double target_time)
+{
+  // number of iterations for warmup / measurement
+  // (feel free to change for noise/time tradeoff)
+  constexpr int niter_warmup = 5;
+
+  printfQuda("==================== axpy / memory ====================\n");
+
+  ColorSpinorParam param;
+  param.nDim = 4;   // 4-dimensional lattice
+  param.x[4] = 1;   // no fifth dimension
+  param.nColor = 3; // supported values for nSpin/nColor are configured when compiling
+                    // QUDA. "3*4" will probably always be enabled, so we stick with this
+  param.nSpin = 4;
+  param.nVec = 1; // just a single vector
+  param.siteSubset = QUDA_FULL_SITE_SUBSET;  // full lattice = no odd/even
+  param.pad = 0;                             // no padding
+  param.create = QUDA_NULL_FIELD_CREATE;     // do not (zero-) initialize the field
+  param.location = QUDA_CUDA_FIELD_LOCATION; // field should reside on GPU
+  param.setPrecision(QUDA_SINGLE_PRECISION);
+
+  // the following don't matter for an axpy benchmark, but we need to choose something
+  param.pc_type = QUDA_4D_PC;
+  param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
+  param.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
+
+  printfQuda("%5s %15s %15s %15s %15s\n", "L", "size (MiB/rank)", "time (usec)",
+             "GiB/s/rank", "Gflop/s/rank");
+  for (int L : L_list)
+  {
+    // printfQuda("starting axpy L=%d\n", L);
+    // IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()`
+    // are LOCAL, i.e. per rank / per GPU
+
+    param.x[0] = L;
+    param.x[1] = L;
+    param.x[2] = L;
+    param.x[3] = L;
+
+    // number of (real) elements in one (local) field
+    size_t field_elements = 2 * param.x[0] * param.x[1] * param.x[2] * param.x[3] *
+                            param.nColor * param.nSpin;
+
+    // create the field(s)
+    auto fieldA = ColorSpinorField(param);
+    auto fieldB = ColorSpinorField(param);
+    assert(fieldA.Bytes() == sizeof(float) * field_elements); // sanity check
+    assert(fieldB.Bytes() == sizeof(float) * field_elements); // sanity check
+
+    // fill fields with random values
+    quda::RNG rng(fieldA, 1234);
+    spinorNoise(fieldA, rng, QUDA_NOISE_GAUSS);
+    spinorNoise(fieldB, rng, QUDA_NOISE_GAUSS);
+
+    // number of operations / bytes per iteration
+    // axpy is one addition, one multiplication, two reads, one write
+    double flops = 2 * field_elements;
+    double memory = 3 * sizeof(float) * field_elements;
+
+    auto f = [&]() { blas::axpy(1.234, fieldA, fieldB); };
+
+    // first run to get the quda tuning out of the way
+    f();
+
+    // actual benchmarking
+    auto start_time = get_timestamp();
+    double secs = bench(f, target_time);
+    auto end_time = get_timestamp();
+
+    double mem_MiB = memory / 1024. / 1024.;
+    double GBps = mem_MiB / 1024 / secs;
+    printfQuda("%5d %15.2f %15.2f %15.2f %15.2f\n", L, mem_MiB, secs * 1e6, GBps,
+               flops / secs * 1e-9);
+
+    json tmp;
+    tmp["L"] = L;
+    tmp["size_MB"] = mem_MiB;
+    tmp["GBps"] = GBps;
+    tmp["GFlops"] = flops / secs * 1e-9;
+    tmp["start_time"] = start_time;
+    tmp["end_time"] = end_time;
+    json_results["axpy"].push_back(tmp);
+  }
+}
+
+int main(int argc, char **argv)
+{
+  std::string json_filename = ""; // empty indicates no json output
+  for (int i = 0; i < argc; i++)
+  {
+    if (std::string(argv[i]) == "--json-out")
+      json_filename = argv[i + 1];
+  }
+
+  initComms(argc, argv);
+
+  initQuda(-1); // -1 for multi-gpu. otherwise this selects the device to be used
+
+  // verbosity options are:
+  // SILENT, SUMMARIZE, VERBOSE, DEBUG_VERBOSE
+  setVerbosity(QUDA_SUMMARIZE);
+
+  printfQuda("MPI layout = %d %d %d %d\n", mpi_grid[0], mpi_grid[1], mpi_grid[2],
+             mpi_grid[3]);
+
+  benchmark_axpy({8, 12, 16, 24, 32, 48}, 1.0);
+
+  setVerbosity(QUDA_SILENT);
+  benchmark_wilson({8, 12, 16, 24, 32, 48}, 1.0);
+  benchmark_dwf({8, 12, 16, 24, 32}, 1.0);
+  setVerbosity(QUDA_SUMMARIZE);
+
+  printfQuda("==================== done with all benchmarks ====================\n");
+
+  if (!json_filename.empty())
+  {
+    printfQuda("writing benchmark results to %s\n", json_filename.c_str());
+
+    int me = 0;
+    MPI_Comm_rank(MPI_COMM_WORLD, &me);
+    if (me == 0)
+    {
+      std::ofstream json_file(json_filename);
+      json_file << std::setw(2) << json_results;
+    }
+  }
+
+  endQuda();
+  quda::comm_finalize();
+  MPI_Finalize();
+}
diff --git a/Quda/Readme.md b/Quda/Readme.md
new file mode 100644
index 0000000..a162304
--- /dev/null
+++ b/Quda/Readme.md
@@ -0,0 +1,30 @@
+# QUDA benchmarks
+
+This folder contains benchmarks for the [QUDA](https://github.com/lattice/quda) library.
+
+- `Benchmark_Quda`: This benchmark measures the floating-point performance of fermion
+matrices (Wilson and DWF), as well as memory bandwidth (using a simple `axpy` operation).
+Measurements are performed for a fixed range of problem sizes.
+
+## Building
+After setting up your compilation environment (Tursa: `source /home/dp207/dp207/shared/env/production/env-{base,gpu}.sh`):
+```bash
+./build-quda.sh <environment dir>       # build QUDA
+./build-benchmark.sh <environment dir>  # build the benchmark
+```
+where `<environment dir>` is an arbitrary directory where every product will be stored.
+
+## Running the Benchmark
+
+The benchmark should be run as
+```bash
+mpirun -np <nranks> <environment dir>/prefix/qudabench/Benchmark_Quda
+```
+where `<nranks>` is the total number of GPUs to use. On Tursa this is 4 times the number of nodes.
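+
+For example, a hypothetical 4-node Tursa job (16 GPUs), caching QUDA's tuning results in `./tuning`, could look like
+```bash
+export QUDA_RESOURCE_PATH=$(pwd)/tuning
+mpirun -np 16 <environment dir>/prefix/qudabench/Benchmark_Quda --json-out results.json
+```
+(adapt the rank count and paths to your machine; `--json-out` is optional and writes the results to a JSON file).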
+
+Note:
+- on Tursa, the `wrapper.sh` script that is typically used with Grid is not necessary.
+- due to QUDA's automatic tuning, the benchmark might take significantly longer to run than `Benchmark_Grid` (even though it does fewer things).
+  - setting `QUDA_ENABLE_TUNING=0` disables all tuning (degrades performance severely). By default, it is turned on.
+  - setting `QUDA_RESOURCE_PATH=<some directory>` enables QUDA to save and reuse optimal tuning parameters, making repeated runs much faster
+ 
\ No newline at end of file
diff --git a/Quda/build-benchmark.sh b/Quda/build-benchmark.sh
new file mode 100755
index 0000000..288f3ac
--- /dev/null
+++ b/Quda/build-benchmark.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+# shellcheck disable=SC1090,SC1091
+
+set -euo pipefail
+
+if (( $# != 1 )); then
+    echo "usage: $(basename "$0") <environment dir>" 1>&2
+    exit 1
+fi
+env_dir=$1
+
+# TODO: this is Tursa specific. have not figured out the correct way to do this.
+EXTRA_LIBS="/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/stubs/libcuda.so /home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/stubs/libnvidia-ml.so"
+
+# NOTE: these flags need to be in sync with QUDA's compilation options (see build-quda.sh)
+BUILD_FLAGS="-O3 -std=c++17 -DMPI_COMMS -DMULTI_GPU -DQUDA_PRECISION=12 -DQUDA_RECONSTRUCT=4"
+
+call_dir=$(pwd -P)
+script_dir="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")"
+cd "${env_dir}"
+env_dir=$(pwd -P)
+cd "${call_dir}"
+BUILD_DIR="${env_dir}/build/Quda-benchmarks"
+PREFIX_DIR="${env_dir}/prefix/qudabench"
+QUDA_DIR=${env_dir}/prefix/quda
+mkdir -p "${BUILD_DIR}"
+mkdir -p "${PREFIX_DIR}"
+
+LINK_FLAGS="-Wl,-rpath,$QUDA_DIR/lib: $QUDA_DIR/lib/libquda.so $EXTRA_LIBS -lpthread -lmpi"
+
+g++ $BUILD_FLAGS -I$QUDA_DIR/include/targets/cuda -I$QUDA_DIR/include -c -o $BUILD_DIR/Benchmark_Quda.o $script_dir/Benchmark_Quda.cpp
+g++ -g -O3 $BUILD_DIR/Benchmark_Quda.o -o $PREFIX_DIR/Benchmark_Quda $LINK_FLAGS -lmpi
diff --git a/Quda/build-quda.sh b/Quda/build-quda.sh
new file mode 100755
index 0000000..fa2368d
--- /dev/null
+++ b/Quda/build-quda.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+# shellcheck disable=SC1090,SC1091
+
+BUILD_FLAGS="-O3 -std=c++17"
+QUDA_FLAGS="-DQUDA_MPI=ON -DQUDA_PRECISION=14 -DQUDA_RECONSTRUCT=4 -DQUDA_GPU_ARCH=sm_80"
+
+set -euo pipefail
+
+if (( $# != 1 )); then
+    echo "usage: $(basename "$0") <environment dir>" 1>&2
+    exit 1
+fi
+env_dir=$1
+
+call_dir=$(pwd -P)
+mkdir -p ${env_dir}
+cd "${env_dir}"
+env_dir=$(pwd -P)
+cd "${call_dir}"
+
+build_dir="${env_dir}/build/quda"
+if [ -d "${build_dir}" ]; then
+    echo "error: directory '${build_dir}' exists"
+    exit 1
+fi
+mkdir -p "${build_dir}"
+
+git clone https://github.com/lattice/quda.git "${build_dir}"
+cd "${build_dir}"
+
+mkdir build; cd build
+cmake .. $QUDA_FLAGS -DCMAKE_INSTALL_PREFIX=${env_dir}/prefix/quda
+make -j128
+make install
+
+cd "${call_dir}"
diff --git a/Quda/env.sh b/Quda/env.sh
new file mode 100644
index 0000000..d88cde2
--- /dev/null
+++ b/Quda/env.sh
@@ -0,0 +1,21 @@
+module load gcc/9.3.0
+module load cuda/11.4.1
+module load openmpi/4.1.1-cuda11.4
+
+export QUDA_RESOURCE_PATH=$(pwd)/tuning
+export OMP_NUM_THREADS=4
+export OMPI_MCA_btl=^uct,openib
+export OMPI_MCA_pml=ucx # by Fabian, no idea what this is
+#export UCX_TLS=rc,rc_x,sm,cuda_copy,cuda_ipc,gdr_copy
+export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
+export UCX_RNDV_THRESH=16384
+export UCX_RNDV_SCHEME=put_zcopy
+export UCX_IB_GPU_DIRECT_RDMA=yes
+export UCX_MEMTYPE_CACHE=n
+
+export OMPI_MCA_io=romio321
+export OMPI_MCA_btl_openib_allow_ib=true
+export OMPI_MCA_btl_openib_device_type=infiniband
+export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
+
+export QUDA_REORDER_LOCATION=GPU # this is the default anyway