From 9de49f867241ede742b412dfc573dabef5097a2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20B=C3=BCrger?= Date: Mon, 24 Apr 2023 11:35:47 +0100 Subject: [PATCH] fix scaling conventions for multi-gpu --- Quda/Benchmark_Quda.cpp | 251 ++++++++++++++++++++++------------------ 1 file changed, 137 insertions(+), 114 deletions(-) diff --git a/Quda/Benchmark_Quda.cpp b/Quda/Benchmark_Quda.cpp index 0687a06..3a0b795 100644 --- a/Quda/Benchmark_Quda.cpp +++ b/Quda/Benchmark_Quda.cpp @@ -1,56 +1,58 @@ #include #include #include +#include #include -#include -// #include +#include +#include #include +#include #include #include -#include -#include -#include - using namespace quda; -QudaPrecision smoother_halo_prec = QUDA_INVALID_PRECISION; +// This is the MPI grid, i.e. the layout of ranks +int nranks = -1; +std::array mpi_grid = {1, 1, 1, 1}; -// This is the MPI grid, i.e. the layout of ranks, not the lattice volume -std::array gridsize = {1, 1, 1, 4}; - -void initComms(int argc, char **argv, std::array const &commDims) +void initComms(int argc, char **argv) { // init MPI communication MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &nranks); + assert(1 <= nranks && nranks <= 100000); + + mpi_grid[3] = nranks; + // this maps coordinates to rank number auto lex_rank_from_coords = [](int const *coords, void *) { int rank = coords[0]; for (int i = 1; i < 4; i++) - rank = gridsize[i] * rank + coords[i]; + rank = mpi_grid[i] * rank + coords[i]; return rank; }; - initCommsGridQuda(4, commDims.data(), lex_rank_from_coords, nullptr); + initCommsGridQuda(4, mpi_grid.data(), lex_rank_from_coords, nullptr); for (int d = 0; d < 4; d++) - if (gridsize[d] > 1) + if (mpi_grid[d] > 1) commDimPartitionedSet(d); } -// creates a random gauge field -cudaGaugeField make_gauge_field(std::array const &geom) +// creates a random gauge field. L = local(!) size +cudaGaugeField make_gauge_field(int L) { GaugeFieldParam param; // dimension and type of the lattice object param.nDim = 4; - param.x[0] = geom[0]; - param.x[1] = geom[1]; - param.x[2] = geom[2]; - param.x[3] = geom[3]; + param.x[0] = L; + param.x[1] = L; + param.x[2] = L; + param.x[3] = L; // number of colors. potentially confusingly, QUDA sometimes uses the word "color" to // things unrelated with physical color. things like "nColor=32" do pop up in deflation @@ -101,8 +103,8 @@ cudaGaugeField make_gauge_field(std::array const &geom) return U; } -// create a random source vector -ColorSpinorField make_source(std::array const &geom) +// create a random source vector (L = local size) +ColorSpinorField make_source(int L) { ColorSpinorParam param; param.nColor = 3; @@ -111,10 +113,10 @@ ColorSpinorField make_source(std::array const &geom) param.pad = 0; param.siteSubset = QUDA_FULL_SITE_SUBSET; param.nDim = 4; - param.x[0] = geom[0]; - param.x[1] = geom[1]; - param.x[2] = geom[2]; - param.x[3] = geom[3]; + param.x[0] = L; + param.x[1] = L; + param.x[2] = L; + param.x[3] = L; param.x[4] = 1; // no fifth dimension param.pc_type = QUDA_4D_PC; param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; @@ -136,130 +138,151 @@ ColorSpinorField make_source(std::array const &geom) return src; } -void benchmark(int L, int niter) +void benchmark_wilson() { - std::array geom = {L, L, L, L}; + int niter = 20; + int niter_warmup = 10; - printfQuda("======================= benchmarking L=%d =======================\n", L); + printfQuda("==================== wilson dirac operator ====================\n"); + printfQuda("IMPORTANT: QUDAs own flop counting. Probably not the same as in Grid.\n"); + printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank"); - auto U = make_gauge_field(geom); - printfQuda("created random gauge field, %.3f GiB (sanity check: should be %.3f)\n", - U.Bytes() / 1024. / 1024. / 1024., - 1.0 * L * L * L * L * 4 * 18 * 8 / 1024. / 1024. / 1024.); - auto src = make_source(geom); - printfQuda("created random source, %.3f GiB (sanity check: should be %.3f)\n", - src.Bytes() / 1024. / 1024. / 1024., - 1.0 * L * L * L * L * 12 * 2 * 8 / 1024. / 1024. / 1024.); + for (int L : {8, 12, 16, 24, 32}) + { + auto U = make_gauge_field(L); + auto src = make_source(L); - // create (Wilson) dirac operator - DiracParam param; - param.kappa = 0.10; - param.dagger = QUDA_DAG_NO; - param.matpcType = QUDA_MATPC_EVEN_EVEN; - auto dirac = DiracWilson(param); + // create (Wilson) dirac operator + DiracParam param; + param.kappa = 0.10; + param.dagger = QUDA_DAG_NO; + param.matpcType = QUDA_MATPC_EVEN_EVEN; + auto dirac = DiracWilson(param); - // insert gauge field into the dirac operator - // (the additional nullptr's are for smeared links and fancy preconditioners and such. - // Not used for simple Wilson fermions) - dirac.updateFields(&U, nullptr, nullptr, nullptr); + // insert gauge field into the dirac operator + // (the additional nullptr's are for smeared links and fancy preconditioners and such. + // Not used for simple Wilson fermions) + dirac.updateFields(&U, nullptr, nullptr, nullptr); - auto tmp = ColorSpinorField(ColorSpinorParam(src)); + auto tmp = ColorSpinorField(ColorSpinorParam(src)); - printfQuda("benchmarking Dirac operator. geom=(%d,%d,%d,%d), niter=%d\n", geom[0], - geom[1], geom[2], geom[3], niter); + // couple iterations without timing to warm up + for (int iter = 0; iter < niter_warmup; ++iter) + dirac.M(tmp, src); - // couple iterations without timing to warm up - printfQuda("warmup...\n"); - for (int iter = 0; iter < 20; ++iter) - dirac.M(tmp, src); + // actual benchmark with timings + dirac.Flops(); // reset flops counter + device_timer_t device_timer; + device_timer.start(); + for (int iter = 0; iter < niter; ++iter) + dirac.M(tmp, src); + device_timer.stop(); - printfQuda("running...\n"); - dirac.Flops(); // reset flops counter - device_timer_t device_timer; - device_timer.start(); - for (int iter = 0; iter < niter; ++iter) - dirac.M(tmp, src); - device_timer.stop(); + double secs = device_timer.last() / niter; + double flops = 1.0 * dirac.Flops() / niter; - double secs = device_timer.last(); - double gflops = (dirac.Flops() * 1e-9) / secs; - printfQuda("Gflops = %6.2f\n", gflops); + printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9); + } } -void benchmark_axpy(int L) +void benchmark_axpy() { - printfQuda("================ axpy L=%d ==============\n", L); + // number of iterations for warmup / measurement + // (feel free to change for noise/time tradeoff) + constexpr int niter_warmup = 10; + constexpr int niter = 20; + + printfQuda("==================== axpy / memory ====================\n"); ColorSpinorParam param; - param.nColor = 3; + param.nDim = 4; // 4-dimensional lattice + param.x[4] = 1; // no fifth dimension + param.nColor = 3; // supported values for nSpin/nColor are configured when compiling + // QUDA. "3*4" will probably always be enabled, so we stick with this param.nSpin = 4; - param.nVec = 1; - param.pad = 0; - param.siteSubset = QUDA_FULL_SITE_SUBSET; - param.nDim = 4; - param.x[0] = L; - param.x[1] = L; - param.x[2] = L; - param.x[3] = L; - param.x[4] = 1; // no fifth dimension + param.nVec = 1; // just a single vector + param.siteSubset = QUDA_FULL_SITE_SUBSET; // full lattice = no odd/even + param.pad = 0; // no padding + param.create = QUDA_NULL_FIELD_CREATE; // do not (zero-) initilize the field + param.location = QUDA_CUDA_FIELD_LOCATION; // field should reside on GPU + param.setPrecision(QUDA_DOUBLE_PRECISION); + + // the following dont matter for an axpy benchmark, but need to choose something param.pc_type = QUDA_4D_PC; param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; param.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; - param.create = QUDA_NULL_FIELD_CREATE; // do not (zero-) initilize the field - param.setPrecision(QUDA_DOUBLE_PRECISION); - param.location = QUDA_CUDA_FIELD_LOCATION; - // create the field and fill it with random values - auto fieldA = ColorSpinorField(param); - auto fieldB = ColorSpinorField(param); - quda::RNG rng(fieldA, 1234); - auto size_bytes = size_t(8) * 2 * param.x[0] * param.x[1] * param.x[2] * param.x[3] * - param.nColor * param.nSpin; - assert(fieldA.Bytes() == size_bytes); // sanity check - assert(fieldB.Bytes() == size_bytes); // sanity check - spinorNoise(fieldA, rng, QUDA_NOISE_GAUSS); - spinorNoise(fieldB, rng, QUDA_NOISE_GAUSS); + printfQuda("%5s %15s %15s %15s %15s\n", "L", "size (MiB/rank)", "time (usec)", + "GiB/s/rank", "Gflop/s/rank"); + std::vector L_list = {8, 12, 16, 24, 32}; + for (int L : L_list) + { + // IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()` + // are LOCAL, i.e. per rank / per GPU - // number of (real) elements in the field = number of fma instructions to do - double flops_per_iter = - 2 * param.x[0] * param.x[1] * param.x[2] * param.x[3] * param.nColor * param.nSpin; + param.x[0] = L; + param.x[1] = L; + param.x[2] = L; + param.x[3] = L; - int niter = 20; + // number of (real) elements in one (local) field + size_t field_elements = 2 * param.x[0] * param.x[1] * param.x[2] * param.x[3] * + param.nColor * param.nSpin; - printfQuda("warmup...\n"); - for (int iter = 0; iter < 10; ++iter) - blas::axpy(1.234, fieldA, fieldB); + // create the field(s) + auto fieldA = ColorSpinorField(param); + auto fieldB = ColorSpinorField(param); + assert(fieldA.Bytes() == sizeof(double) * field_elements); // sanity check + assert(fieldB.Bytes() == sizeof(double) * field_elements); // sanity check - printfQuda("running...\n"); - device_timer_t device_timer; - device_timer.start(); - for (int iter = 0; iter < niter; ++iter) - blas::axpy(1.234, fieldA, fieldB); // fieldB += 1.234*fieldA - device_timer.stop(); + // fill fields with random values + quda::RNG rng(fieldA, 1234); + spinorNoise(fieldA, rng, QUDA_NOISE_GAUSS); + spinorNoise(fieldB, rng, QUDA_NOISE_GAUSS); - double secs = device_timer.last(); - double gflops = (flops_per_iter * niter) * 1e-9 / secs; - printfQuda("Gflops = %6.2f\n", gflops); - printfQuda("bytes = %6.2f GiB\n", 3. * fieldA.Bytes() / 1024. / 1024. / 1024.); - printfQuda("bandwidth = %6.2f GiB/s\n", - fieldA.Bytes() * 3 / 1024. / 1024. / 1024. * niter / secs); + // number of operations / bytes per iteration + // axpy is one addition, one multiplication, two read, one write + double flops = 2 * field_elements; + double memory = 3 * sizeof(double) * field_elements; + + // do some iterations to to let QUDA do its internal tuning and also stabilize cache + // behaviour and such + for (int iter = 0; iter < niter_warmup; ++iter) + blas::axpy(1.234, fieldA, fieldB); + + // running the actual benchmark + device_timer_t device_timer; + device_timer.start(); + for (int iter = 0; iter < niter; ++iter) + blas::axpy(1.234, fieldA, fieldB); + device_timer.stop(); + double secs = device_timer.last() / niter; // seconds per iteration + + printfQuda("%5d %15.2f %15.2f %15.2f %15.2f\n", L, memory / 1024. / 1024., secs * 1e6, + memory / secs / 1024. / 1024. / 1024., flops / secs * 1e-9); + } } int main(int argc, char **argv) { - initComms(argc, argv, gridsize); + initComms(argc, argv); initQuda(-1); // -1 for multi-gpu. otherwise this selects the device to be used // verbosity options are: // SILENT, SUMMARIZE, VERBOSE, DEBUG_VERBOSE - setVerbosity(QUDA_VERBOSE); + setVerbosity(QUDA_SUMMARIZE); - for (int L : {8, 12, 16, 24, 32}) - benchmark_axpy(L); - for (int L : {16, 24, 32, 48, 64}) - benchmark(L, 100); + printfQuda("MPI layout = %d %d %d %d\n", mpi_grid[0], mpi_grid[1], mpi_grid[2], + mpi_grid[3]); + benchmark_axpy(); + + setVerbosity(QUDA_SILENT); + benchmark_wilson(); + setVerbosity(QUDA_SUMMARIZE); + + printfQuda("==================== done with all benchmarks ====================\n"); endQuda(); quda::comm_finalize(); MPI_Finalize();