From 32e301fc674f990f5fa3912be5749ce98f6fdc34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20B=C3=BCrger?= Date: Mon, 5 Jun 2023 17:07:07 +0100 Subject: [PATCH] add DWF benchmark --- Quda/Benchmark_Quda.cpp | 106 +++++++++++++++++++++++++++++++++++----- Quda/env.sh | 1 + 2 files changed, 96 insertions(+), 11 deletions(-) diff --git a/Quda/Benchmark_Quda.cpp b/Quda/Benchmark_Quda.cpp index 37b3dea..a1ea744 100644 --- a/Quda/Benchmark_Quda.cpp +++ b/Quda/Benchmark_Quda.cpp @@ -12,6 +12,9 @@ using namespace quda; +// remove to use QUDA's own flop counting instead of Grid's convention +#define FLOP_COUNTING_GRID + // This is the MPI grid, i.e. the layout of ranks int nranks = -1; std::array mpi_grid = {1, 1, 1, 1}; @@ -65,7 +68,7 @@ cudaGaugeField make_gauge_field(int L) // for this benchmark we only need "SINGLE" and/or "DOUBLE" precision. But smaller // precisions are available in QUDA too - param.setPrecision(QUDA_DOUBLE_PRECISION); + param.setPrecision(QUDA_SINGLE_PRECISION); // no even/odd subset, we want a full lattice param.siteSubset = QUDA_FULL_SITE_SUBSET; @@ -104,7 +107,7 @@ cudaGaugeField make_gauge_field(int L) } // create a random source vector (L = local size) -ColorSpinorField make_source(int L) +ColorSpinorField make_source(int L, int Ls = 1) { // NOTE: `param.x` directly determines the size of the (local, per rank) memory // allocation. Thus for checkerboarding, we have to specifly x=(L/2,L,L,L) to get a @@ -116,12 +119,12 @@ ColorSpinorField make_source(int L) param.nVec = 1; // only a single vector param.pad = 0; param.siteSubset = QUDA_PARITY_SITE_SUBSET; - param.nDim = 4; + param.nDim = Ls == 1 ? 4 : 5; param.x[0] = L / 2; param.x[1] = L; param.x[2] = L; param.x[3] = L; - param.x[4] = 1; // no fifth dimension + param.x[4] = Ls; param.pc_type = QUDA_4D_PC; param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; @@ -130,7 +133,7 @@ ColorSpinorField make_source(int L) param.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; param.create = QUDA_NULL_FIELD_CREATE; // do not (zero-) initilize the field - param.setPrecision(QUDA_DOUBLE_PRECISION); + param.setPrecision(QUDA_SINGLE_PRECISION); param.location = QUDA_CUDA_FIELD_LOCATION; // create the field and fill it with random values @@ -152,10 +155,15 @@ void benchmark_wilson() int niter_warmup = 10; printfQuda("==================== wilson dirac operator ====================\n"); - printfQuda("IMPORTANT: QUDAs own flop counting. Probably not the same as in Grid.\n"); +#ifdef FLOP_COUNTING_GRID + printfQuda("IMPORTANT: flop counting as in Benchmark_Grid\n"); +#else + printfQuda("IMPORTANT: flop counting by QUDA's own convention (different from " + "Benchmark_Grid)\n"); +#endif printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank"); - for (int L : {8, 12, 16, 24, 32}) + for (int L : {8, 12, 16, 24, 32, 48}) { auto U = make_gauge_field(L); auto src = make_source(L); @@ -187,7 +195,82 @@ void benchmark_wilson() device_timer.stop(); double secs = device_timer.last() / niter; + +#ifdef FLOP_COUNTING_GRID + // this is the flop counting from Benchmark_Grid + double Nc = 3; + double Nd = 4; + double Ns = 4; + double flops = + (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2); + flops *= L * L * L * L / 2.0; +#else double flops = 1.0 * dirac.Flops() / niter; +#endif + + printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9); + } +} + +void benchmark_dwf() +{ + int niter = 20; + int niter_warmup = 10; + + printfQuda("==================== domain wall dirac operator ====================\n"); +#ifdef FLOP_COUNTING_GRID + printfQuda("IMPORTANT: flop counting as in Benchmark_Grid\n"); +#else + printfQuda("IMPORTANT: flop counting by QUDA's own convention (different from " + "Benchmark_Grid)\n"); +#endif + printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank"); + int Ls = 12; + for (int L : {8, 12, 16, 24, 32, 48}) + { + auto U = make_gauge_field(L); + auto src = make_source(L, Ls); + + // create dirac operator + DiracParam param; + param.kappa = 0.10; + param.Ls = Ls; + param.m5 = 0.1; + param.dagger = QUDA_DAG_NO; + param.matpcType = QUDA_MATPC_EVEN_EVEN; + auto dirac = DiracDomainWall(param); + + // insert gauge field into the dirac operator + // (the additional nullptr's are for smeared links and fancy preconditioners and such) + dirac.updateFields(&U, nullptr, nullptr, nullptr); + + auto tmp = ColorSpinorField(ColorSpinorParam(src)); + + // couple iterations without timing to warm up + for (int iter = 0; iter < niter_warmup; ++iter) + dirac.Dslash(tmp, src, QUDA_EVEN_PARITY); + + // actual benchmark with timings + dirac.Flops(); // reset flops counter + device_timer_t device_timer; + device_timer.start(); + for (int iter = 0; iter < niter; ++iter) + dirac.Dslash(tmp, src, QUDA_EVEN_PARITY); + device_timer.stop(); + + double secs = device_timer.last() / niter; + +#ifdef FLOP_COUNTING_GRID + // this is the flop counting from Benchmark_Grid + double Nc = 3; + double Nd = 4; + double Ns = 4; + double flops = + (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2); + flops *= L * L * L * L * Ls / 2.0; +#else + double flops = 1.0 * dirac.Flops() / niter; +#endif printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9); } @@ -213,7 +296,7 @@ void benchmark_axpy() param.pad = 0; // no padding param.create = QUDA_NULL_FIELD_CREATE; // do not (zero-) initilize the field param.location = QUDA_CUDA_FIELD_LOCATION; // field should reside on GPU - param.setPrecision(QUDA_DOUBLE_PRECISION); + param.setPrecision(QUDA_SINGLE_PRECISION); // the following dont matter for an axpy benchmark, but need to choose something param.pc_type = QUDA_4D_PC; @@ -240,8 +323,8 @@ void benchmark_axpy() // create the field(s) auto fieldA = ColorSpinorField(param); auto fieldB = ColorSpinorField(param); - assert(fieldA.Bytes() == sizeof(double) * field_elements); // sanity check - assert(fieldB.Bytes() == sizeof(double) * field_elements); // sanity check + assert(fieldA.Bytes() == sizeof(float) * field_elements); // sanity check + assert(fieldB.Bytes() == sizeof(float) * field_elements); // sanity check // fill fields with random values quda::RNG rng(fieldA, 1234); @@ -251,7 +334,7 @@ void benchmark_axpy() // number of operations / bytes per iteration // axpy is one addition, one multiplication, two read, one write double flops = 2 * field_elements; - double memory = 3 * sizeof(double) * field_elements; + double memory = 3 * sizeof(float) * field_elements; // do some iterations to to let QUDA do its internal tuning and also stabilize cache // behaviour and such @@ -288,6 +371,7 @@ int main(int argc, char **argv) setVerbosity(QUDA_SILENT); benchmark_wilson(); + benchmark_dwf(); setVerbosity(QUDA_SUMMARIZE); printfQuda("==================== done with all benchmarks ====================\n"); diff --git a/Quda/env.sh b/Quda/env.sh index f73234d..d88cde2 100644 --- a/Quda/env.sh +++ b/Quda/env.sh @@ -2,6 +2,7 @@ module load gcc/9.3.0 module load cuda/11.4.1 module load openmpi/4.1.1-cuda11.4 +export QUDA_RESOURCE_PATH=$(pwd)/tuning export OMP_NUM_THREADS=4 export OMPI_MCA_btl=^uct,openib export OMPI_MCA_pml=ucx # by fabian. no idea what this is