From abb5fcfbb16859807065ae623309d50bc84a0511 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20B=C3=BCrger?= Date: Fri, 31 Mar 2023 18:03:39 +0100 Subject: [PATCH 01/16] first draft of Quda Benchmark --- Quda/.clang-format | 14 ++++ Quda/Benchmark_Quda.cpp | 169 ++++++++++++++++++++++++++++++++++++++++ Quda/build.sh | 11 +++ 3 files changed, 194 insertions(+) create mode 100644 Quda/.clang-format create mode 100644 Quda/Benchmark_Quda.cpp create mode 100755 Quda/build.sh diff --git a/Quda/.clang-format b/Quda/.clang-format new file mode 100644 index 0000000..9d54a25 --- /dev/null +++ b/Quda/.clang-format @@ -0,0 +1,14 @@ +{ + BasedOnStyle: LLVM, + UseTab: Never, + IndentWidth: 2, + TabWidth: 2, + BreakBeforeBraces: Allman, + AllowShortIfStatementsOnASingleLine: false, + IndentCaseLabels: false, + ColumnLimit: 90, + AccessModifierOffset: -4, + NamespaceIndentation: All, + FixNamespaceComments: false, + SortIncludes: true, +} diff --git a/Quda/Benchmark_Quda.cpp b/Quda/Benchmark_Quda.cpp new file mode 100644 index 0000000..4ce5e7b --- /dev/null +++ b/Quda/Benchmark_Quda.cpp @@ -0,0 +1,169 @@ +#include +#include +#include +#include +#include +// #include +#include +#include +#include + +#include +#include +#include + +using namespace quda; + +QudaPrecision smoother_halo_prec = QUDA_INVALID_PRECISION; + +std::array gridsize = {1, 1, 1, 4}; + +void initComms(int argc, char **argv, std::array const &commDims) +{ + // init MPI communication + MPI_Init(&argc, &argv); + + // this maps coordinates to rank number + auto lex_rank_from_coords = [](int const *coords, void *) + { + int rank = coords[0]; + for (int i = 1; i < 4; i++) + rank = gridsize[i] * rank + coords[i]; + return rank; + }; + initCommsGridQuda(4, commDims.data(), lex_rank_from_coords, nullptr); + + for (int d = 0; d < 4; d++) + if (gridsize[d] > 1) + commDimPartitionedSet(d); +} + +// creates a random gauge field +cudaGaugeField make_gauge_field(std::array const &geom) +{ + GaugeFieldParam param; + + // dimension and type of the lattice object + param.nDim = 4; + param.nColor = 3; + param.x[0] = geom[0]; + param.x[1] = geom[1]; + param.x[2] = geom[2]; + param.x[3] = geom[3]; + param.t_boundary = QUDA_PERIODIC_T; + param.siteSubset = QUDA_FULL_SITE_SUBSET; // no even/odd, just a full lattice + param.link_type = QUDA_SU3_LINKS; + param.setPrecision(QUDA_DOUBLE_PRECISION); + + param.create = QUDA_NULL_FIELD_CREATE; // do not (zero-) initilize the fields + param.location = QUDA_CUDA_FIELD_LOCATION; // field should live on the accelerator + + // turn off advanced features we dont care about for this benchmark + param.reconstruct = QUDA_RECONSTRUCT_NO; + param.ghostExchange = QUDA_GHOST_EXCHANGE_NO; + + // these control the physical data layout. 
Might be interesting to try out different + // settings + param.order = QUDA_FLOAT2_GAUGE_ORDER; + param.geometry = QUDA_SCALAR_GEOMETRY; + + // create the field and fill with random SU(3) matrices + auto U = cudaGaugeField(param); + quda::RNG rng(U, /*seed=*/1234); + gaugeGauss(U, rng, 1.0); + return U; +} + +// create a random source vector +ColorSpinorField make_source(std::array const &geom) +{ + ColorSpinorParam param; + param.nColor = 3; + param.nSpin = 4; + param.nVec = 1; // only a single vector + param.pad = 0; + param.siteSubset = QUDA_FULL_SITE_SUBSET; + param.nDim = 4; + param.x[0] = geom[0]; + param.x[1] = geom[1]; + param.x[2] = geom[2]; + param.x[3] = geom[3]; + param.x[4] = 1; // no fifth dimension + param.pc_type = QUDA_4D_PC; + param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; + param.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; + param.create = QUDA_NULL_FIELD_CREATE; // do not (zero-) initilize the field + param.setPrecision(QUDA_DOUBLE_PRECISION); + param.location = QUDA_CUDA_FIELD_LOCATION; + + // create the field and fill it with random values + auto src = ColorSpinorField(param); + quda::RNG rng(src, 1234); + spinorNoise(src, rng, QUDA_NOISE_GAUSS); + printfQuda( + "created src with norm = %f (sanity check: should be close to %f) and %f bytes\n", + blas::norm2(src), 2.0 * 12 * geom[0] * geom[1] * geom[2] * geom[3], + src.Bytes() * 1.0); + src.PrintDims(); + + return src; +} + +void benchmark(int L, int niter) +{ + std::array geom = {L, L, L, L}; + + auto U = make_gauge_field(geom); + auto src = make_source(geom); + + // create (Wilson) dirac operator + DiracParam param; + param.kappa = 0.10; + param.dagger = QUDA_DAG_NO; + param.matpcType = QUDA_MATPC_EVEN_EVEN; + auto dirac = DiracWilson(param); + + // insert gauge field into the dirac operator + // (the additional nullptr's are for smeared links and fancy preconditioners and such. + // Not used for simple Wilson fermions) + dirac.updateFields(&U, nullptr, nullptr, nullptr); + + auto tmp = ColorSpinorField(ColorSpinorParam(src)); + + printfQuda("benchmarking Dirac operator. geom=(%d,%d,%d,%d), niter=%d\n", geom[0], + geom[1], geom[2], geom[3], niter); + + // couple iterations without timing to warm up + for (int iter = 0; iter < 20; ++iter) + dirac.M(tmp, src); + + dirac.Flops(); // reset flops counter + device_timer_t device_timer; + device_timer.start(); + for (int iter = 0; iter < niter; ++iter) + dirac.M(tmp, src); + device_timer.stop(); + + double secs = device_timer.last(); + double gflops = (dirac.Flops() * 1e-9) / secs; + printfQuda("Gflops = %6.1f\n", gflops); +} + +int main(int argc, char **argv) +{ + initComms(argc, argv, gridsize); + + // -1 for multi-gpu. 
otherwise this selects the device to be used + initQuda(-1); + + // verbosity options are: + // SILENT, SUMMARIZE, VERBOSE, DEBUG_VERBOSE + setVerbosity(QUDA_SUMMARIZE); + + for (int L : {8, 16, 24, 32}) + benchmark(L, 1000); + + endQuda(); + quda::comm_finalize(); + MPI_Finalize(); +} diff --git a/Quda/build.sh b/Quda/build.sh new file mode 100755 index 0000000..ea3221f --- /dev/null +++ b/Quda/build.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +set -e + + +COMPILE_FLAGS="-DMPI_COMMS -DMULTI_GPU -DQUDA_PRECISION=14 -DQUDA_RECONSTRUCT=7 -g -O3 -Wall -Wextra -pthread -std=c++17" +LINK_FLAGS="-g -O3 -Wl,-rpath -Wl,/mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib -Wl,--enable-new-dtags -L/mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib -pthread -Wl,-rpath,/home/dp207/dp207/dc-burg2/quda_build/tests:/home/dp207/dp207/dc-burg2/quda_build/lib:/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/stubs:/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64: ../../quda_install/lib/libquda.so /home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/stubs/libcuda.so /home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/stubs/libnvidia-ml.so /home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart_static.a -ldl /usr/lib64/librt.so /home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcublas.so /home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcufft.so -lpthread /home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so" + +$CXX $COMPILE_FLAGS -I/home/dp207/dp207/dc-burg2/quda_install/include/ -o Benchmark_Quda.o -c Benchmark_Quda.cpp +$CXX $LINK_FLAGS Benchmark_Quda.o -o Benchmark_Quda + -- 2.45.2 From b95984c230ddbab72ba70b1338ff1d616d2c6b73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20B=C3=BCrger?= Date: Fri, 21 Apr 2023 10:38:28 +0100 Subject: [PATCH 02/16] add quda axpy/memory benchmark --- Quda/Benchmark_Quda.cpp | 141 +++++++++++++++++++++++++++++++++------- Quda/build.sh | 17 +++-- 2 files changed, 127 insertions(+), 31 deletions(-) diff --git a/Quda/Benchmark_Quda.cpp b/Quda/Benchmark_Quda.cpp index 4ce5e7b..0687a06 100644 --- a/Quda/Benchmark_Quda.cpp +++ b/Quda/Benchmark_Quda.cpp @@ -16,6 +16,7 @@ using namespace quda; QudaPrecision smoother_halo_prec = QUDA_INVALID_PRECISION; +// This is the MPI grid, i.e. 
the layout of ranks, not the lattice volume std::array gridsize = {1, 1, 1, 4}; void initComms(int argc, char **argv, std::array const &commDims) @@ -31,6 +32,7 @@ void initComms(int argc, char **argv, std::array const &commDims) rank = gridsize[i] * rank + coords[i]; return rank; }; + initCommsGridQuda(4, commDims.data(), lex_rank_from_coords, nullptr); for (int d = 0; d < 4; d++) @@ -45,32 +47,57 @@ cudaGaugeField make_gauge_field(std::array const &geom) // dimension and type of the lattice object param.nDim = 4; - param.nColor = 3; param.x[0] = geom[0]; param.x[1] = geom[1]; param.x[2] = geom[2]; param.x[3] = geom[3]; + + // number of colors. potentially confusingly, QUDA sometimes uses the word "color" to + // things unrelated with physical color. things like "nColor=32" do pop up in deflation + // solvers where it (to my understanding) refers to the number of (parallely processed) + // deflation vectors. + param.nColor = 3; + + // boundary conditions (dont really care for benchmark) param.t_boundary = QUDA_PERIODIC_T; - param.siteSubset = QUDA_FULL_SITE_SUBSET; // no even/odd, just a full lattice - param.link_type = QUDA_SU3_LINKS; + + // for this benchmark we only need "SINGLE" and/or "DOUBLE" precision. But smaller + // precisions are available in QUDA too param.setPrecision(QUDA_DOUBLE_PRECISION); - param.create = QUDA_NULL_FIELD_CREATE; // do not (zero-) initilize the fields - param.location = QUDA_CUDA_FIELD_LOCATION; // field should live on the accelerator + // no even/odd subset, we want a full lattice + param.siteSubset = QUDA_FULL_SITE_SUBSET; - // turn off advanced features we dont care about for this benchmark + // what kind of 3x3 matrices the field contains. A proper gauge field has SU(3) + // matrices, but (for example) smeared/thick links could have non-unitary links. + param.link_type = QUDA_SU3_LINKS; + + // "NULL" does not initialize the field upon creation, "ZERO" would set everything to 0 + param.create = QUDA_NULL_FIELD_CREATE; + + // field should be allocated directly on the accelerator/GPU + param.location = QUDA_CUDA_FIELD_LOCATION; + + // "reconstruct" here means reconstructing a SU(3) matrix from fewer than 18 real + // numbers (=3x3 complex numbers). Great feature in production (saving + // memory/cache/network bandwidth), not used for this benchmark. param.reconstruct = QUDA_RECONSTRUCT_NO; + + // "ghostExchange" would often be called "halo exchange" outside of Quda. This has + // nothing to do with ghost fields from continuum/perturbative qcd. param.ghostExchange = QUDA_GHOST_EXCHANGE_NO; - // these control the physical data layout. Might be interesting to try out different - // settings + // This controls the physical order of elements. "float2" is the the default param.order = QUDA_FLOAT2_GAUGE_ORDER; - param.geometry = QUDA_SCALAR_GEOMETRY; + + // this means the field is a LORENTZ vector (which a gauge field must be). Has nothing + // to do with spin. 
+ param.geometry = QUDA_VECTOR_GEOMETRY; // create the field and fill with random SU(3) matrices + // std::cout << param << std::endl; // double-check parameters auto U = cudaGaugeField(param); - quda::RNG rng(U, /*seed=*/1234); - gaugeGauss(U, rng, 1.0); + gaugeGauss(U, /*seed=*/1234, 1.0); return U; } @@ -100,11 +127,11 @@ ColorSpinorField make_source(std::array const &geom) auto src = ColorSpinorField(param); quda::RNG rng(src, 1234); spinorNoise(src, rng, QUDA_NOISE_GAUSS); - printfQuda( + /*printfQuda( "created src with norm = %f (sanity check: should be close to %f) and %f bytes\n", blas::norm2(src), 2.0 * 12 * geom[0] * geom[1] * geom[2] * geom[3], - src.Bytes() * 1.0); - src.PrintDims(); + src.Bytes() * 1.0);*/ + // src.PrintDims(); return src; } @@ -113,8 +140,16 @@ void benchmark(int L, int niter) { std::array geom = {L, L, L, L}; + printfQuda("======================= benchmarking L=%d =======================\n", L); + auto U = make_gauge_field(geom); + printfQuda("created random gauge field, %.3f GiB (sanity check: should be %.3f)\n", + U.Bytes() / 1024. / 1024. / 1024., + 1.0 * L * L * L * L * 4 * 18 * 8 / 1024. / 1024. / 1024.); auto src = make_source(geom); + printfQuda("created random source, %.3f GiB (sanity check: should be %.3f)\n", + src.Bytes() / 1024. / 1024. / 1024., + 1.0 * L * L * L * L * 12 * 2 * 8 / 1024. / 1024. / 1024.); // create (Wilson) dirac operator DiracParam param; @@ -134,9 +169,11 @@ void benchmark(int L, int niter) geom[1], geom[2], geom[3], niter); // couple iterations without timing to warm up + printfQuda("warmup...\n"); for (int iter = 0; iter < 20; ++iter) dirac.M(tmp, src); + printfQuda("running...\n"); dirac.Flops(); // reset flops counter device_timer_t device_timer; device_timer.start(); @@ -146,22 +183,82 @@ void benchmark(int L, int niter) double secs = device_timer.last(); double gflops = (dirac.Flops() * 1e-9) / secs; - printfQuda("Gflops = %6.1f\n", gflops); + printfQuda("Gflops = %6.2f\n", gflops); +} + +void benchmark_axpy(int L) +{ + printfQuda("================ axpy L=%d ==============\n", L); + + ColorSpinorParam param; + param.nColor = 3; + param.nSpin = 4; + param.nVec = 1; + param.pad = 0; + param.siteSubset = QUDA_FULL_SITE_SUBSET; + param.nDim = 4; + param.x[0] = L; + param.x[1] = L; + param.x[2] = L; + param.x[3] = L; + param.x[4] = 1; // no fifth dimension + param.pc_type = QUDA_4D_PC; + param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; + param.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; + param.create = QUDA_NULL_FIELD_CREATE; // do not (zero-) initilize the field + param.setPrecision(QUDA_DOUBLE_PRECISION); + param.location = QUDA_CUDA_FIELD_LOCATION; + + // create the field and fill it with random values + auto fieldA = ColorSpinorField(param); + auto fieldB = ColorSpinorField(param); + quda::RNG rng(fieldA, 1234); + auto size_bytes = size_t(8) * 2 * param.x[0] * param.x[1] * param.x[2] * param.x[3] * + param.nColor * param.nSpin; + assert(fieldA.Bytes() == size_bytes); // sanity check + assert(fieldB.Bytes() == size_bytes); // sanity check + spinorNoise(fieldA, rng, QUDA_NOISE_GAUSS); + spinorNoise(fieldB, rng, QUDA_NOISE_GAUSS); + + // number of (real) elements in the field = number of fma instructions to do + double flops_per_iter = + 2 * param.x[0] * param.x[1] * param.x[2] * param.x[3] * param.nColor * param.nSpin; + + int niter = 20; + + printfQuda("warmup...\n"); + for (int iter = 0; iter < 10; ++iter) + blas::axpy(1.234, fieldA, fieldB); + + printfQuda("running...\n"); + device_timer_t device_timer; + 
device_timer.start(); + for (int iter = 0; iter < niter; ++iter) + blas::axpy(1.234, fieldA, fieldB); // fieldB += 1.234*fieldA + device_timer.stop(); + + double secs = device_timer.last(); + double gflops = (flops_per_iter * niter) * 1e-9 / secs; + printfQuda("Gflops = %6.2f\n", gflops); + printfQuda("bytes = %6.2f GiB\n", 3. * fieldA.Bytes() / 1024. / 1024. / 1024.); + printfQuda("bandwidth = %6.2f GiB/s\n", + fieldA.Bytes() * 3 / 1024. / 1024. / 1024. * niter / secs); } int main(int argc, char **argv) { initComms(argc, argv, gridsize); - // -1 for multi-gpu. otherwise this selects the device to be used - initQuda(-1); + initQuda(-1); // -1 for multi-gpu. otherwise this selects the device to be used - // verbosity options are: - // SILENT, SUMMARIZE, VERBOSE, DEBUG_VERBOSE - setVerbosity(QUDA_SUMMARIZE); + // verbosity options are: + // SILENT, SUMMARIZE, VERBOSE, DEBUG_VERBOSE + setVerbosity(QUDA_VERBOSE); - for (int L : {8, 16, 24, 32}) - benchmark(L, 1000); + for (int L : {8, 12, 16, 24, 32}) + benchmark_axpy(L); + for (int L : {16, 24, 32, 48, 64}) + benchmark(L, 100); endQuda(); quda::comm_finalize(); diff --git a/Quda/build.sh b/Quda/build.sh index ea3221f..960c9c8 100755 --- a/Quda/build.sh +++ b/Quda/build.sh @@ -1,11 +1,10 @@ #!/bin/bash +#CXX=/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/bin/g++ +QUDA_BUILD=/home/dp207/dp207/dc-burg2/quda_build +QUDA_SRC=/home/dp207/dp207/dc-burg2/quda +#QUDA_BUILD= -set -e - - -COMPILE_FLAGS="-DMPI_COMMS -DMULTI_GPU -DQUDA_PRECISION=14 -DQUDA_RECONSTRUCT=7 -g -O3 -Wall -Wextra -pthread -std=c++17" -LINK_FLAGS="-g -O3 -Wl,-rpath -Wl,/mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib -Wl,--enable-new-dtags -L/mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib -pthread -Wl,-rpath,/home/dp207/dp207/dc-burg2/quda_build/tests:/home/dp207/dp207/dc-burg2/quda_build/lib:/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/stubs:/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64: ../../quda_install/lib/libquda.so /home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/stubs/libcuda.so /home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/stubs/libnvidia-ml.so /home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart_static.a -ldl /usr/lib64/librt.so /home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcublas.so /home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcufft.so -lpthread /home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so" - -$CXX $COMPILE_FLAGS -I/home/dp207/dp207/dc-burg2/quda_install/include/ -o Benchmark_Quda.o -c Benchmark_Quda.cpp -$CXX $LINK_FLAGS Benchmark_Quda.o -o Benchmark_Quda - +FLAGS="-DMPI_COMMS -DMULTI_GPU -DQUDA_PRECISION=14 -DQUDA_RECONSTRUCT=7 -g -O3 -Wall -Wextra -std=c++17 " +$CXX $FLAGS -I$QUDA_BUILD/include/targets/cuda -I$QUDA_SRC/include 
-I$QUDA_BUILD/include -isystem $QUDA_SRC/include/externals -isystem $QUDA_BUILD/_deps/eigen-src -c -o Benchmark_Quda.o Benchmark_Quda.cpp +LINK_FLAGS="-Wl,-rpath,$QUDA_BUILD/tests:$QUDA_BUILD/lib:/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/stubs: $QUDA_BUILD/lib/libquda.so /home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/stubs/libcuda.so /home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/stubs/libnvidia-ml.so /home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart_static.a -ldl /usr/lib64/librt.so /home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcublas.so /home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcufft.so -lpthread" +$CXX -g -O3 Benchmark_Quda.o -o Benchmark_Quda $LINK_FLAGS -lmpi -- 2.45.2 From 176b1ba7761cdce447d445556b6922318ca5b061 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20B=C3=BCrger?= Date: Fri, 21 Apr 2023 17:27:49 +0100 Subject: [PATCH 03/16] tidy up the wilson benchmark and add environment script --- Quda/env.sh | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 Quda/env.sh diff --git a/Quda/env.sh b/Quda/env.sh new file mode 100644 index 0000000..f73234d --- /dev/null +++ b/Quda/env.sh @@ -0,0 +1,20 @@ +module load gcc/9.3.0 +module load cuda/11.4.1 +module load openmpi/4.1.1-cuda11.4 + +export OMP_NUM_THREADS=4 +export OMPI_MCA_btl=^uct,openib +export OMPI_MCA_pml=ucx # by fabian. no idea what this is +#export UCX_TLS=rc,rc_x,sm,cuda_copy,cuda_ipc,gdr_copy +export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc +export UCX_RNDV_THRESH=16384 +export UCX_RNDV_SCHEME=put_zcopy +export UCX_IB_GPU_DIRECT_RDMA=yes +export UCX_MEMTYPE_CACHE=n + +export OMPI_MCA_io=romio321 +export OMPI_MCA_btl_openib_allow_ib=true +export OMPI_MCA_btl_openib_device_type=infiniband +export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3 + +export QUDA_REORDER_LOCATION=GPU # this is the default anyway -- 2.45.2 From 9de49f867241ede742b412dfc573dabef5097a2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20B=C3=BCrger?= Date: Mon, 24 Apr 2023 11:35:47 +0100 Subject: [PATCH 04/16] fix scaling conventions for multi-gpu --- Quda/Benchmark_Quda.cpp | 251 ++++++++++++++++++++++------------------ 1 file changed, 137 insertions(+), 114 deletions(-) diff --git a/Quda/Benchmark_Quda.cpp b/Quda/Benchmark_Quda.cpp index 0687a06..3a0b795 100644 --- a/Quda/Benchmark_Quda.cpp +++ b/Quda/Benchmark_Quda.cpp @@ -1,56 +1,58 @@ #include #include #include +#include #include -#include -// #include +#include +#include #include +#include #include #include -#include -#include -#include - using namespace quda; -QudaPrecision smoother_halo_prec = QUDA_INVALID_PRECISION; +// This is the MPI grid, i.e. the layout of ranks +int nranks = -1; +std::array mpi_grid = {1, 1, 1, 1}; -// This is the MPI grid, i.e. 
the layout of ranks, not the lattice volume -std::array gridsize = {1, 1, 1, 4}; - -void initComms(int argc, char **argv, std::array const &commDims) +void initComms(int argc, char **argv) { // init MPI communication MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &nranks); + assert(1 <= nranks && nranks <= 100000); + + mpi_grid[3] = nranks; + // this maps coordinates to rank number auto lex_rank_from_coords = [](int const *coords, void *) { int rank = coords[0]; for (int i = 1; i < 4; i++) - rank = gridsize[i] * rank + coords[i]; + rank = mpi_grid[i] * rank + coords[i]; return rank; }; - initCommsGridQuda(4, commDims.data(), lex_rank_from_coords, nullptr); + initCommsGridQuda(4, mpi_grid.data(), lex_rank_from_coords, nullptr); for (int d = 0; d < 4; d++) - if (gridsize[d] > 1) + if (mpi_grid[d] > 1) commDimPartitionedSet(d); } -// creates a random gauge field -cudaGaugeField make_gauge_field(std::array const &geom) +// creates a random gauge field. L = local(!) size +cudaGaugeField make_gauge_field(int L) { GaugeFieldParam param; // dimension and type of the lattice object param.nDim = 4; - param.x[0] = geom[0]; - param.x[1] = geom[1]; - param.x[2] = geom[2]; - param.x[3] = geom[3]; + param.x[0] = L; + param.x[1] = L; + param.x[2] = L; + param.x[3] = L; // number of colors. potentially confusingly, QUDA sometimes uses the word "color" to // things unrelated with physical color. things like "nColor=32" do pop up in deflation @@ -101,8 +103,8 @@ cudaGaugeField make_gauge_field(std::array const &geom) return U; } -// create a random source vector -ColorSpinorField make_source(std::array const &geom) +// create a random source vector (L = local size) +ColorSpinorField make_source(int L) { ColorSpinorParam param; param.nColor = 3; @@ -111,10 +113,10 @@ ColorSpinorField make_source(std::array const &geom) param.pad = 0; param.siteSubset = QUDA_FULL_SITE_SUBSET; param.nDim = 4; - param.x[0] = geom[0]; - param.x[1] = geom[1]; - param.x[2] = geom[2]; - param.x[3] = geom[3]; + param.x[0] = L; + param.x[1] = L; + param.x[2] = L; + param.x[3] = L; param.x[4] = 1; // no fifth dimension param.pc_type = QUDA_4D_PC; param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; @@ -136,130 +138,151 @@ ColorSpinorField make_source(std::array const &geom) return src; } -void benchmark(int L, int niter) +void benchmark_wilson() { - std::array geom = {L, L, L, L}; + int niter = 20; + int niter_warmup = 10; - printfQuda("======================= benchmarking L=%d =======================\n", L); + printfQuda("==================== wilson dirac operator ====================\n"); + printfQuda("IMPORTANT: QUDAs own flop counting. Probably not the same as in Grid.\n"); + printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank"); - auto U = make_gauge_field(geom); - printfQuda("created random gauge field, %.3f GiB (sanity check: should be %.3f)\n", - U.Bytes() / 1024. / 1024. / 1024., - 1.0 * L * L * L * L * 4 * 18 * 8 / 1024. / 1024. / 1024.); - auto src = make_source(geom); - printfQuda("created random source, %.3f GiB (sanity check: should be %.3f)\n", - src.Bytes() / 1024. / 1024. / 1024., - 1.0 * L * L * L * L * 12 * 2 * 8 / 1024. / 1024. 
/ 1024.); + for (int L : {8, 12, 16, 24, 32}) + { + auto U = make_gauge_field(L); + auto src = make_source(L); - // create (Wilson) dirac operator - DiracParam param; - param.kappa = 0.10; - param.dagger = QUDA_DAG_NO; - param.matpcType = QUDA_MATPC_EVEN_EVEN; - auto dirac = DiracWilson(param); + // create (Wilson) dirac operator + DiracParam param; + param.kappa = 0.10; + param.dagger = QUDA_DAG_NO; + param.matpcType = QUDA_MATPC_EVEN_EVEN; + auto dirac = DiracWilson(param); - // insert gauge field into the dirac operator - // (the additional nullptr's are for smeared links and fancy preconditioners and such. - // Not used for simple Wilson fermions) - dirac.updateFields(&U, nullptr, nullptr, nullptr); + // insert gauge field into the dirac operator + // (the additional nullptr's are for smeared links and fancy preconditioners and such. + // Not used for simple Wilson fermions) + dirac.updateFields(&U, nullptr, nullptr, nullptr); - auto tmp = ColorSpinorField(ColorSpinorParam(src)); + auto tmp = ColorSpinorField(ColorSpinorParam(src)); - printfQuda("benchmarking Dirac operator. geom=(%d,%d,%d,%d), niter=%d\n", geom[0], - geom[1], geom[2], geom[3], niter); + // couple iterations without timing to warm up + for (int iter = 0; iter < niter_warmup; ++iter) + dirac.M(tmp, src); - // couple iterations without timing to warm up - printfQuda("warmup...\n"); - for (int iter = 0; iter < 20; ++iter) - dirac.M(tmp, src); + // actual benchmark with timings + dirac.Flops(); // reset flops counter + device_timer_t device_timer; + device_timer.start(); + for (int iter = 0; iter < niter; ++iter) + dirac.M(tmp, src); + device_timer.stop(); - printfQuda("running...\n"); - dirac.Flops(); // reset flops counter - device_timer_t device_timer; - device_timer.start(); - for (int iter = 0; iter < niter; ++iter) - dirac.M(tmp, src); - device_timer.stop(); + double secs = device_timer.last() / niter; + double flops = 1.0 * dirac.Flops() / niter; - double secs = device_timer.last(); - double gflops = (dirac.Flops() * 1e-9) / secs; - printfQuda("Gflops = %6.2f\n", gflops); + printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9); + } } -void benchmark_axpy(int L) +void benchmark_axpy() { - printfQuda("================ axpy L=%d ==============\n", L); + // number of iterations for warmup / measurement + // (feel free to change for noise/time tradeoff) + constexpr int niter_warmup = 10; + constexpr int niter = 20; + + printfQuda("==================== axpy / memory ====================\n"); ColorSpinorParam param; - param.nColor = 3; + param.nDim = 4; // 4-dimensional lattice + param.x[4] = 1; // no fifth dimension + param.nColor = 3; // supported values for nSpin/nColor are configured when compiling + // QUDA. 
"3*4" will probably always be enabled, so we stick with this param.nSpin = 4; - param.nVec = 1; - param.pad = 0; - param.siteSubset = QUDA_FULL_SITE_SUBSET; - param.nDim = 4; - param.x[0] = L; - param.x[1] = L; - param.x[2] = L; - param.x[3] = L; - param.x[4] = 1; // no fifth dimension + param.nVec = 1; // just a single vector + param.siteSubset = QUDA_FULL_SITE_SUBSET; // full lattice = no odd/even + param.pad = 0; // no padding + param.create = QUDA_NULL_FIELD_CREATE; // do not (zero-) initilize the field + param.location = QUDA_CUDA_FIELD_LOCATION; // field should reside on GPU + param.setPrecision(QUDA_DOUBLE_PRECISION); + + // the following dont matter for an axpy benchmark, but need to choose something param.pc_type = QUDA_4D_PC; param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; param.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; - param.create = QUDA_NULL_FIELD_CREATE; // do not (zero-) initilize the field - param.setPrecision(QUDA_DOUBLE_PRECISION); - param.location = QUDA_CUDA_FIELD_LOCATION; - // create the field and fill it with random values - auto fieldA = ColorSpinorField(param); - auto fieldB = ColorSpinorField(param); - quda::RNG rng(fieldA, 1234); - auto size_bytes = size_t(8) * 2 * param.x[0] * param.x[1] * param.x[2] * param.x[3] * - param.nColor * param.nSpin; - assert(fieldA.Bytes() == size_bytes); // sanity check - assert(fieldB.Bytes() == size_bytes); // sanity check - spinorNoise(fieldA, rng, QUDA_NOISE_GAUSS); - spinorNoise(fieldB, rng, QUDA_NOISE_GAUSS); + printfQuda("%5s %15s %15s %15s %15s\n", "L", "size (MiB/rank)", "time (usec)", + "GiB/s/rank", "Gflop/s/rank"); + std::vector L_list = {8, 12, 16, 24, 32}; + for (int L : L_list) + { + // IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()` + // are LOCAL, i.e. per rank / per GPU - // number of (real) elements in the field = number of fma instructions to do - double flops_per_iter = - 2 * param.x[0] * param.x[1] * param.x[2] * param.x[3] * param.nColor * param.nSpin; + param.x[0] = L; + param.x[1] = L; + param.x[2] = L; + param.x[3] = L; - int niter = 20; + // number of (real) elements in one (local) field + size_t field_elements = 2 * param.x[0] * param.x[1] * param.x[2] * param.x[3] * + param.nColor * param.nSpin; - printfQuda("warmup...\n"); - for (int iter = 0; iter < 10; ++iter) - blas::axpy(1.234, fieldA, fieldB); + // create the field(s) + auto fieldA = ColorSpinorField(param); + auto fieldB = ColorSpinorField(param); + assert(fieldA.Bytes() == sizeof(double) * field_elements); // sanity check + assert(fieldB.Bytes() == sizeof(double) * field_elements); // sanity check - printfQuda("running...\n"); - device_timer_t device_timer; - device_timer.start(); - for (int iter = 0; iter < niter; ++iter) - blas::axpy(1.234, fieldA, fieldB); // fieldB += 1.234*fieldA - device_timer.stop(); + // fill fields with random values + quda::RNG rng(fieldA, 1234); + spinorNoise(fieldA, rng, QUDA_NOISE_GAUSS); + spinorNoise(fieldB, rng, QUDA_NOISE_GAUSS); - double secs = device_timer.last(); - double gflops = (flops_per_iter * niter) * 1e-9 / secs; - printfQuda("Gflops = %6.2f\n", gflops); - printfQuda("bytes = %6.2f GiB\n", 3. * fieldA.Bytes() / 1024. / 1024. / 1024.); - printfQuda("bandwidth = %6.2f GiB/s\n", - fieldA.Bytes() * 3 / 1024. / 1024. / 1024. 
* niter / secs); + // number of operations / bytes per iteration + // axpy is one addition, one multiplication, two read, one write + double flops = 2 * field_elements; + double memory = 3 * sizeof(double) * field_elements; + + // do some iterations to to let QUDA do its internal tuning and also stabilize cache + // behaviour and such + for (int iter = 0; iter < niter_warmup; ++iter) + blas::axpy(1.234, fieldA, fieldB); + + // running the actual benchmark + device_timer_t device_timer; + device_timer.start(); + for (int iter = 0; iter < niter; ++iter) + blas::axpy(1.234, fieldA, fieldB); + device_timer.stop(); + double secs = device_timer.last() / niter; // seconds per iteration + + printfQuda("%5d %15.2f %15.2f %15.2f %15.2f\n", L, memory / 1024. / 1024., secs * 1e6, + memory / secs / 1024. / 1024. / 1024., flops / secs * 1e-9); + } } int main(int argc, char **argv) { - initComms(argc, argv, gridsize); + initComms(argc, argv); initQuda(-1); // -1 for multi-gpu. otherwise this selects the device to be used // verbosity options are: // SILENT, SUMMARIZE, VERBOSE, DEBUG_VERBOSE - setVerbosity(QUDA_VERBOSE); + setVerbosity(QUDA_SUMMARIZE); - for (int L : {8, 12, 16, 24, 32}) - benchmark_axpy(L); - for (int L : {16, 24, 32, 48, 64}) - benchmark(L, 100); + printfQuda("MPI layout = %d %d %d %d\n", mpi_grid[0], mpi_grid[1], mpi_grid[2], + mpi_grid[3]); + benchmark_axpy(); + + setVerbosity(QUDA_SILENT); + benchmark_wilson(); + setVerbosity(QUDA_SUMMARIZE); + + printfQuda("==================== done with all benchmarks ====================\n"); endQuda(); quda::comm_finalize(); MPI_Finalize(); -- 2.45.2 From 0af6b9047a5897816b05e1b88d8916b23ec772e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20B=C3=BCrger?= Date: Mon, 24 Apr 2023 14:58:53 +0100 Subject: [PATCH 05/16] benchmark Dslash(...) instead of full M(...) --- Quda/Benchmark_Quda.cpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/Quda/Benchmark_Quda.cpp b/Quda/Benchmark_Quda.cpp index 3a0b795..37b3dea 100644 --- a/Quda/Benchmark_Quda.cpp +++ b/Quda/Benchmark_Quda.cpp @@ -106,21 +106,29 @@ cudaGaugeField make_gauge_field(int L) // create a random source vector (L = local size) ColorSpinorField make_source(int L) { + // NOTE: `param.x` directly determines the size of the (local, per rank) memory + // allocation. Thus for checkerboarding, we have to specifly x=(L/2,L,L,L) to get a + // physical local volume of L^4, thus implicity choosing a dimension for the + // checkerboarding (shouldnt really matter of course which one). ColorSpinorParam param; param.nColor = 3; param.nSpin = 4; param.nVec = 1; // only a single vector param.pad = 0; - param.siteSubset = QUDA_FULL_SITE_SUBSET; + param.siteSubset = QUDA_PARITY_SITE_SUBSET; param.nDim = 4; - param.x[0] = L; + param.x[0] = L / 2; param.x[1] = L; param.x[2] = L; param.x[3] = L; param.x[4] = 1; // no fifth dimension param.pc_type = QUDA_4D_PC; param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; - param.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; + + // somewhat surprisingly, the DiracWilson::Dslash(...) 
function only works with the + // UKQCD_GAMMA_BASIS + param.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; + param.create = QUDA_NULL_FIELD_CREATE; // do not (zero-) initilize the field param.setPrecision(QUDA_DOUBLE_PRECISION); param.location = QUDA_CUDA_FIELD_LOCATION; @@ -168,14 +176,14 @@ void benchmark_wilson() // couple iterations without timing to warm up for (int iter = 0; iter < niter_warmup; ++iter) - dirac.M(tmp, src); + dirac.Dslash(tmp, src, QUDA_EVEN_PARITY); // actual benchmark with timings dirac.Flops(); // reset flops counter device_timer_t device_timer; device_timer.start(); for (int iter = 0; iter < niter; ++iter) - dirac.M(tmp, src); + dirac.Dslash(tmp, src, QUDA_EVEN_PARITY); device_timer.stop(); double secs = device_timer.last() / niter; -- 2.45.2 From 6c1598173754d5e9a634022a2ad181fd39e9d323 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20B=C3=BCrger?= Date: Mon, 5 Jun 2023 17:07:07 +0100 Subject: [PATCH 06/16] add DWF benchmark --- Quda/Benchmark_Quda.cpp | 106 +++++++++++++++++++++++++++++++++++----- Quda/env.sh | 1 + 2 files changed, 96 insertions(+), 11 deletions(-) diff --git a/Quda/Benchmark_Quda.cpp b/Quda/Benchmark_Quda.cpp index 37b3dea..a1ea744 100644 --- a/Quda/Benchmark_Quda.cpp +++ b/Quda/Benchmark_Quda.cpp @@ -12,6 +12,9 @@ using namespace quda; +// remove to use QUDA's own flop counting instead of Grid's convention +#define FLOP_COUNTING_GRID + // This is the MPI grid, i.e. the layout of ranks int nranks = -1; std::array mpi_grid = {1, 1, 1, 1}; @@ -65,7 +68,7 @@ cudaGaugeField make_gauge_field(int L) // for this benchmark we only need "SINGLE" and/or "DOUBLE" precision. But smaller // precisions are available in QUDA too - param.setPrecision(QUDA_DOUBLE_PRECISION); + param.setPrecision(QUDA_SINGLE_PRECISION); // no even/odd subset, we want a full lattice param.siteSubset = QUDA_FULL_SITE_SUBSET; @@ -104,7 +107,7 @@ cudaGaugeField make_gauge_field(int L) } // create a random source vector (L = local size) -ColorSpinorField make_source(int L) +ColorSpinorField make_source(int L, int Ls = 1) { // NOTE: `param.x` directly determines the size of the (local, per rank) memory // allocation. Thus for checkerboarding, we have to specifly x=(L/2,L,L,L) to get a @@ -116,12 +119,12 @@ ColorSpinorField make_source(int L) param.nVec = 1; // only a single vector param.pad = 0; param.siteSubset = QUDA_PARITY_SITE_SUBSET; - param.nDim = 4; + param.nDim = Ls == 1 ? 4 : 5; param.x[0] = L / 2; param.x[1] = L; param.x[2] = L; param.x[3] = L; - param.x[4] = 1; // no fifth dimension + param.x[4] = Ls; param.pc_type = QUDA_4D_PC; param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; @@ -130,7 +133,7 @@ ColorSpinorField make_source(int L) param.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; param.create = QUDA_NULL_FIELD_CREATE; // do not (zero-) initilize the field - param.setPrecision(QUDA_DOUBLE_PRECISION); + param.setPrecision(QUDA_SINGLE_PRECISION); param.location = QUDA_CUDA_FIELD_LOCATION; // create the field and fill it with random values @@ -152,10 +155,15 @@ void benchmark_wilson() int niter_warmup = 10; printfQuda("==================== wilson dirac operator ====================\n"); - printfQuda("IMPORTANT: QUDAs own flop counting. 
Probably not the same as in Grid.\n"); +#ifdef FLOP_COUNTING_GRID + printfQuda("IMPORTANT: flop counting as in Benchmark_Grid\n"); +#else + printfQuda("IMPORTANT: flop counting by QUDA's own convention (different from " + "Benchmark_Grid)\n"); +#endif printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank"); - for (int L : {8, 12, 16, 24, 32}) + for (int L : {8, 12, 16, 24, 32, 48}) { auto U = make_gauge_field(L); auto src = make_source(L); @@ -187,7 +195,82 @@ void benchmark_wilson() device_timer.stop(); double secs = device_timer.last() / niter; + +#ifdef FLOP_COUNTING_GRID + // this is the flop counting from Benchmark_Grid + double Nc = 3; + double Nd = 4; + double Ns = 4; + double flops = + (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2); + flops *= L * L * L * L / 2.0; +#else double flops = 1.0 * dirac.Flops() / niter; +#endif + + printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9); + } +} + +void benchmark_dwf() +{ + int niter = 20; + int niter_warmup = 10; + + printfQuda("==================== domain wall dirac operator ====================\n"); +#ifdef FLOP_COUNTING_GRID + printfQuda("IMPORTANT: flop counting as in Benchmark_Grid\n"); +#else + printfQuda("IMPORTANT: flop counting by QUDA's own convention (different from " + "Benchmark_Grid)\n"); +#endif + printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank"); + int Ls = 12; + for (int L : {8, 12, 16, 24, 32, 48}) + { + auto U = make_gauge_field(L); + auto src = make_source(L, Ls); + + // create dirac operator + DiracParam param; + param.kappa = 0.10; + param.Ls = Ls; + param.m5 = 0.1; + param.dagger = QUDA_DAG_NO; + param.matpcType = QUDA_MATPC_EVEN_EVEN; + auto dirac = DiracDomainWall(param); + + // insert gauge field into the dirac operator + // (the additional nullptr's are for smeared links and fancy preconditioners and such) + dirac.updateFields(&U, nullptr, nullptr, nullptr); + + auto tmp = ColorSpinorField(ColorSpinorParam(src)); + + // couple iterations without timing to warm up + for (int iter = 0; iter < niter_warmup; ++iter) + dirac.Dslash(tmp, src, QUDA_EVEN_PARITY); + + // actual benchmark with timings + dirac.Flops(); // reset flops counter + device_timer_t device_timer; + device_timer.start(); + for (int iter = 0; iter < niter; ++iter) + dirac.Dslash(tmp, src, QUDA_EVEN_PARITY); + device_timer.stop(); + + double secs = device_timer.last() / niter; + +#ifdef FLOP_COUNTING_GRID + // this is the flop counting from Benchmark_Grid + double Nc = 3; + double Nd = 4; + double Ns = 4; + double flops = + (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2); + flops *= L * L * L * L * Ls / 2.0; +#else + double flops = 1.0 * dirac.Flops() / niter; +#endif printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9); } @@ -213,7 +296,7 @@ void benchmark_axpy() param.pad = 0; // no padding param.create = QUDA_NULL_FIELD_CREATE; // do not (zero-) initilize the field param.location = QUDA_CUDA_FIELD_LOCATION; // field should reside on GPU - param.setPrecision(QUDA_DOUBLE_PRECISION); + param.setPrecision(QUDA_SINGLE_PRECISION); // the following dont matter for an axpy benchmark, but need to choose something param.pc_type = QUDA_4D_PC; @@ -240,8 +323,8 @@ void benchmark_axpy() // create the field(s) auto fieldA = ColorSpinorField(param); auto fieldB = ColorSpinorField(param); - assert(fieldA.Bytes() == sizeof(double) * field_elements); // sanity check - assert(fieldB.Bytes() == sizeof(double) * field_elements); // sanity check + 
assert(fieldA.Bytes() == sizeof(float) * field_elements); // sanity check + assert(fieldB.Bytes() == sizeof(float) * field_elements); // sanity check // fill fields with random values quda::RNG rng(fieldA, 1234); @@ -251,7 +334,7 @@ void benchmark_axpy() // number of operations / bytes per iteration // axpy is one addition, one multiplication, two read, one write double flops = 2 * field_elements; - double memory = 3 * sizeof(double) * field_elements; + double memory = 3 * sizeof(float) * field_elements; // do some iterations to to let QUDA do its internal tuning and also stabilize cache // behaviour and such @@ -288,6 +371,7 @@ int main(int argc, char **argv) setVerbosity(QUDA_SILENT); benchmark_wilson(); + benchmark_dwf(); setVerbosity(QUDA_SUMMARIZE); printfQuda("==================== done with all benchmarks ====================\n"); diff --git a/Quda/env.sh b/Quda/env.sh index f73234d..d88cde2 100644 --- a/Quda/env.sh +++ b/Quda/env.sh @@ -2,6 +2,7 @@ module load gcc/9.3.0 module load cuda/11.4.1 module load openmpi/4.1.1-cuda11.4 +export QUDA_RESOURCE_PATH=$(pwd)/tuning export OMP_NUM_THREADS=4 export OMPI_MCA_btl=^uct,openib export OMPI_MCA_pml=ucx # by fabian. no idea what this is -- 2.45.2 From 6a11511000056d339a21bbe7a513b1ff69bcbeb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20B=C3=BCrger?= Date: Fri, 9 Jun 2023 18:07:36 +0100 Subject: [PATCH 07/16] better range of lattice sizes --- Quda/Benchmark_Quda.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Quda/Benchmark_Quda.cpp b/Quda/Benchmark_Quda.cpp index a1ea744..3ba7f84 100644 --- a/Quda/Benchmark_Quda.cpp +++ b/Quda/Benchmark_Quda.cpp @@ -226,7 +226,7 @@ void benchmark_dwf() #endif printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank"); int Ls = 12; - for (int L : {8, 12, 16, 24, 32, 48}) + for (int L : {8, 12, 16, 24}) { auto U = make_gauge_field(L); auto src = make_source(L, Ls); @@ -305,7 +305,7 @@ void benchmark_axpy() printfQuda("%5s %15s %15s %15s %15s\n", "L", "size (MiB/rank)", "time (usec)", "GiB/s/rank", "Gflop/s/rank"); - std::vector L_list = {8, 12, 16, 24, 32}; + std::vector L_list = {8, 12, 16, 24, 32, 48}; for (int L : L_list) { // IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()` -- 2.45.2 From bd68360c2cdfba03b37d694467ede1215dd3c203 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20B=C3=BCrger?= Date: Fri, 9 Jun 2023 18:09:31 +0100 Subject: [PATCH 08/16] clean up build script a bit --- Quda/build-benchmark.sh | 32 ++++++++++++++++++++++++++++++++ Quda/build-quda.sh | 36 ++++++++++++++++++++++++++++++++++++ Quda/build.sh | 10 ---------- 3 files changed, 68 insertions(+), 10 deletions(-) create mode 100755 Quda/build-benchmark.sh create mode 100755 Quda/build-quda.sh delete mode 100755 Quda/build.sh diff --git a/Quda/build-benchmark.sh b/Quda/build-benchmark.sh new file mode 100755 index 0000000..9a9892c --- /dev/null +++ b/Quda/build-benchmark.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# shellcheck disable=SC1090,SC1091 + +set -euo pipefail + +if (( $# != 1 )); then + echo "usage: $(basename "$0") " 1>&2 + exit 1 +fi +env_dir=$1 + +# TODO: this is Tursa specific. have not figured out the correct way to do this. 
+EXTRA_LIBS="/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/stubs/libcuda.so /home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/stubs/libnvidia-ml.so" + +# NOTE: these flags need to be in sync with Qudas compilation options (see build-quda.sh) +BUILD_FLAGS="-O3 -std=c++17 -DMPI_COMMS -DMULTI_GPU -DQUDA_PRECISION=12 -DQUDA_RECONSTRUCT=4" + +call_dir=$(pwd -P) +script_dir="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")" +cd "${env_dir}" +env_dir=$(pwd -P) +cd "${call_dir}" +BUILD_DIR="${env_dir}/build/Quda-benchmarks" +PREFIX_DIR="${env_dir}/prefix/qudabench" +QUDA_DIR=${env_dir}/prefix/quda +mkdir -p "${BUILD_DIR}" +mkdir -p "${PREFIX_DIR}" + +LINK_FLAGS="-Wl,-rpath,$QUDA_DIR/lib: $QUDA_DIR/lib/libquda.so $EXTRA_LIBS -lpthread -lmpi" + +g++ $BUILD_FLAGS -I$QUDA_DIR/include -c -o $BUILD_DIR/Benchmark_Quda.o $script_dir/Benchmark_Quda.cpp +g++ -g -O3 $BUILD_DIR/Benchmark_Quda.o -o $PREFIX_DIR/Benchmark_Quda $LINK_FLAGS -lmpi diff --git a/Quda/build-quda.sh b/Quda/build-quda.sh new file mode 100755 index 0000000..fa2368d --- /dev/null +++ b/Quda/build-quda.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +# shellcheck disable=SC1090,SC1091 + +BUILD_FLAGS="-O3 -std=c++17" +QUDA_FLAGS="-DQUDA_MPI=ON -DQUDA_PRECISION=14 -DQUDA_RECONSTRUCT=4 -DQUDA_GPU_ARCH=sm_80" + +set -euo pipefail + +if (( $# != 1 )); then + echo "usage: $(basename "$0") " 1>&2 + exit 1 +fi +env_dir=$1 + +call_dir=$(pwd -P) +mkdir -p ${env_dir} +cd "${env_dir}" +env_dir=$(pwd -P) +cd "${call_dir}" + +build_dir="${env_dir}/build/quda" +if [ -d "${build_dir}" ]; then + echo "error: directory '${build_dir}' exists" + exit 1 +fi +mkdir -p "${build_dir}" + +git clone https://github.com/lattice/quda.git "${build_dir}" +cd "${build_dir}" + +mkdir build; cd build +cmake .. 
$QUDA_FLAGS -DCMAKE_INSTALL_PREFIX=${env_dir}/prefix/quda +make -j128 +make install + +cd "${call_dir}" diff --git a/Quda/build.sh b/Quda/build.sh deleted file mode 100755 index 960c9c8..0000000 --- a/Quda/build.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -#CXX=/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/bin/g++ -QUDA_BUILD=/home/dp207/dp207/dc-burg2/quda_build -QUDA_SRC=/home/dp207/dp207/dc-burg2/quda -#QUDA_BUILD= - -FLAGS="-DMPI_COMMS -DMULTI_GPU -DQUDA_PRECISION=14 -DQUDA_RECONSTRUCT=7 -g -O3 -Wall -Wextra -std=c++17 " -$CXX $FLAGS -I$QUDA_BUILD/include/targets/cuda -I$QUDA_SRC/include -I$QUDA_BUILD/include -isystem $QUDA_SRC/include/externals -isystem $QUDA_BUILD/_deps/eigen-src -c -o Benchmark_Quda.o Benchmark_Quda.cpp -LINK_FLAGS="-Wl,-rpath,$QUDA_BUILD/tests:$QUDA_BUILD/lib:/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/stubs: $QUDA_BUILD/lib/libquda.so /home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/stubs/libcuda.so /home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/stubs/libnvidia-ml.so /home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart_static.a -ldl /usr/lib64/librt.so /home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcublas.so /home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcufft.so -lpthread" -$CXX -g -O3 Benchmark_Quda.o -o Benchmark_Quda $LINK_FLAGS -lmpi -- 2.45.2 From 7193ef4c4a4db387fcb0b10204da16089b18d92d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20B=C3=BCrger?= Date: Fri, 9 Jun 2023 18:09:49 +0100 Subject: [PATCH 09/16] add Readme.md to Quda benchmark --- Quda/Readme.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 Quda/Readme.md diff --git a/Quda/Readme.md b/Quda/Readme.md new file mode 100644 index 0000000..5c7520e --- /dev/null +++ b/Quda/Readme.md @@ -0,0 +1,25 @@ +# QUDA benchmarks + +This folder contains benchmarks for the [QUDA](https://github.com/lattice/quda) library. + +- `Benchmark_Quda`: This benchmark measure floating point performances of fermion +matrices (Wilson and DWF), as well as memory bandwidth (using a simple `axpy` operation). Measurements are +performed for a fixed range of problem sizes. + +## Building +After setting up your compilation environment (Tursa: `source /home/dp207/dp207/shared/env/production/env-{base,gpu}.sh`): +```bash +./build-quda.sh # build Quda +./build-benchmark.sh # build benchmark +``` +where `` is an arbitrary directory where every product will be stored. + +## Running the Benchmark + +The benchmark should be run as +```bash +mpirun -np /prefix/qudabench/Benchmark_Quda +``` +where `` is the total number of GPU's to use. On Tursa this is 4 times the number of nodes. + +Note: on Tursa, the `wrapper.sh` script that is typically used with Grid is not necessary. 
\ No newline at end of file -- 2.45.2 From a1ad41bb06a4a4563de6948cdf312c115a631773 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20B=C3=BCrger?= Date: Fri, 9 Jun 2023 18:20:50 +0100 Subject: [PATCH 10/16] Update 'Quda/Readme.md' --- Quda/Readme.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Quda/Readme.md b/Quda/Readme.md index 5c7520e..a162304 100644 --- a/Quda/Readme.md +++ b/Quda/Readme.md @@ -22,4 +22,9 @@ mpirun -np /prefix/qudabench/Benchmark_Quda ``` where `` is the total number of GPU's to use. On Tursa this is 4 times the number of nodes. -Note: on Tursa, the `wrapper.sh` script that is typically used with Grid is not necessary. \ No newline at end of file +Note: +- on Tursa, the `wrapper.sh` script that is typically used with Grid is not necessary. +- due to Qudas automatic tuning, the benchmark might take significantly longer to run than `Benchmark_Grid` (even though it does fewer things). + - setting `QUDA_ENABLE_TUNING=0` disables all tuning (degrades performance severely). By default, it is turned on. + - setting `QUDA_RESOURCE_PATH=` enables Quda to save and reuse optimal tuning parameters, making repeated runs much faster + \ No newline at end of file -- 2.45.2 From 38527db14d334bacf879b6035977e9f6bd251043 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20B=C3=BCrger?= Date: Wed, 12 Apr 2023 11:40:39 +0100 Subject: [PATCH 11/16] add indication of shared-memory directions in comms benchmark --- Grid/Benchmark_Grid.cpp | 139 ++++++++++++++++++++++------------------ 1 file changed, 76 insertions(+), 63 deletions(-) diff --git a/Grid/Benchmark_Grid.cpp b/Grid/Benchmark_Grid.cpp index 59bbdc2..d27707c 100644 --- a/Grid/Benchmark_Grid.cpp +++ b/Grid/Benchmark_Grid.cpp @@ -73,6 +73,8 @@ class Benchmark {local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]}); GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid( latt4, GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi()); + Grid::Coordinate shm; + GlobalSharedMemory::GetShmDims(mpi, shm); uint64_t NP = TmpGrid->RankCount(); uint64_t NN = TmpGrid->NodeCount(); @@ -85,7 +87,9 @@ class Benchmark std::cout << GridLogMessage << "* OpenMP threads : " << GridThread::GetThreads() << std::endl; - std::cout << GridLogMessage << "* MPI tasks : " << GridCmdVectorIntToString(mpi) + std::cout << GridLogMessage << "* MPI layout : " << GridCmdVectorIntToString(mpi) + << std::endl; + std::cout << GridLogMessage << "* Shm layout : " << GridCmdVectorIntToString(shm) << std::endl; std::cout << GridLogMessage << "* vReal : " << sizeof(vReal) * 8 << "bits ; " @@ -118,6 +122,7 @@ class Benchmark for (unsigned int i = 0; i < mpi.size(); ++i) { tmp["mpi"].push_back(mpi[i]); + tmp["shm"].push_back(shm[i]); } tmp["ranks"] = NP; tmp["nodes"] = NN; @@ -132,6 +137,8 @@ class Benchmark Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd()); Coordinate mpi_layout = GridDefaultMpi(); + Coordinate shm_layout; + GlobalSharedMemory::GetShmDims(mpi_layout, shm_layout); for (int mu = 0; mu < Nd; mu++) if (mpi_layout[mu] > 1) @@ -143,8 +150,8 @@ class Benchmark std::cout << GridLogMessage << "Benchmarking threaded STENCIL halo exchange in " << nmu << " dimensions" << std::endl; grid_small_sep(); - grid_printf("%5s %5s %15s %15s %15s %15s %15s\n", "L", "dir", "payload (B)", - "time (usec)", "rate (GB/s/node)", "std dev", "max"); + grid_printf("%5s %5s %7s %15s %15s %15s %15s %15s\n", "L", "dir", "shm", + "payload (B)", "time (usec)", "rate (GB/s/node)", "std dev", "max"); for (int lat = 16; lat <= 
maxlat; lat += 8) { @@ -173,74 +180,80 @@ class Benchmark for (int dir = 0; dir < 8; dir++) { int mu = dir % 4; - if (mpi_layout[mu] > 1) + if (mpi_layout[mu] == 1) // skip directions that are not distributed + continue; + bool is_shm = mpi_layout[mu] == shm_layout[mu]; + bool is_partial_shm = !is_shm && shm_layout[mu] != 1; + + std::vector times(Nloop); + for (int i = 0; i < NWARMUP; i++) + { + int xmit_to_rank; + int recv_from_rank; + + if (dir == mu) + { + int comm_proc = 1; + Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); + } + else + { + int comm_proc = mpi_layout[mu] - 1; + Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); + } + Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, (void *)&rbuf[dir][0], + recv_from_rank, bytes); + } + for (int i = 0; i < Nloop; i++) { - std::vector times(Nloop); - for (int i = 0; i < NWARMUP; i++) + dbytes = 0; + double start = usecond(); + int xmit_to_rank; + int recv_from_rank; + + if (dir == mu) { - int xmit_to_rank; - int recv_from_rank; - - if (dir == mu) - { - int comm_proc = 1; - Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); - } - else - { - int comm_proc = mpi_layout[mu] - 1; - Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); - } - Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, - (void *)&rbuf[dir][0], recv_from_rank, bytes); + int comm_proc = 1; + Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); } - for (int i = 0; i < Nloop; i++) + else { - - dbytes = 0; - double start = usecond(); - int xmit_to_rank; - int recv_from_rank; - - if (dir == mu) - { - int comm_proc = 1; - Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); - } - else - { - int comm_proc = mpi_layout[mu] - 1; - Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); - } - Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, - (void *)&rbuf[dir][0], recv_from_rank, bytes); - dbytes += bytes; - - double stop = usecond(); - t_time[i] = stop - start; // microseconds + int comm_proc = mpi_layout[mu] - 1; + Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); } - timestat.statistics(t_time); + Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, (void *)&rbuf[dir][0], + recv_from_rank, bytes); + dbytes += bytes; - dbytes = dbytes * ppn; - double bidibytes = 2. * dbytes; - double rate = bidibytes / (timestat.mean / 1.e6) / 1024. / 1024. / 1024.; - double rate_err = rate * timestat.err / timestat.mean; - double rate_max = rate * timestat.mean / timestat.min; - grid_printf("%5d %5d %15d %15.2f %15.2f %15.1f %15.2f\n", lat, dir, bytes, - timestat.mean, rate, rate_err, rate_max); - nlohmann::json tmp; - nlohmann::json tmp_rate; - tmp["L"] = lat; - tmp["dir"] = dir; - tmp["bytes"] = bytes; - tmp["time_usec"] = timestat.mean; - tmp_rate["mean"] = rate; - tmp_rate["error"] = rate_err; - tmp_rate["max"] = rate_max; - tmp["rate_GBps"] = tmp_rate; - json_results["comms"].push_back(tmp); + double stop = usecond(); + t_time[i] = stop - start; // microseconds } + timestat.statistics(t_time); + + dbytes = dbytes * ppn; + double bidibytes = 2. * dbytes; + double rate = bidibytes / (timestat.mean / 1.e6) / 1024. / 1024. / 1024.; + double rate_err = rate * timestat.err / timestat.mean; + double rate_max = rate * timestat.mean / timestat.min; + grid_printf("%5d %5d %7s %15d %15.2f %15.2f %15.1f %15.2f\n", lat, dir, + is_shm ? "yes" + : is_partial_shm ? 
"partial" + : "no", + bytes, timestat.mean, rate, rate_err, rate_max); + nlohmann::json tmp; + nlohmann::json tmp_rate; + tmp["L"] = lat; + tmp["dir"] = dir; + tmp["shared_mem"] = is_shm; + tmp["partial_shared_mem"] = is_partial_shm; + tmp["bytes"] = bytes; + tmp["time_usec"] = timestat.mean; + tmp_rate["mean"] = rate; + tmp_rate["error"] = rate_err; + tmp_rate["max"] = rate_max; + tmp["rate_GBps"] = tmp_rate; + json_results["comms"].push_back(tmp); } for (int d = 0; d < 8; d++) { -- 2.45.2 From 0d588d065a9adb542bc482638d2fb4090f266a40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20B=C3=BCrger?= Date: Mon, 19 Jun 2023 18:08:50 +0100 Subject: [PATCH 12/16] add json output to Benchmark_Quda --- Quda/Benchmark_Quda.cpp | 86 ++++++++++++++++++++++++++++++----------- 1 file changed, 64 insertions(+), 22 deletions(-) diff --git a/Quda/Benchmark_Quda.cpp b/Quda/Benchmark_Quda.cpp index 3ba7f84..84b8565 100644 --- a/Quda/Benchmark_Quda.cpp +++ b/Quda/Benchmark_Quda.cpp @@ -4,17 +4,22 @@ #include #include #include +#include #include #include #include #include #include -using namespace quda; - // remove to use QUDA's own flop counting instead of Grid's convention #define FLOP_COUNTING_GRID +#include "json.hpp" +using nlohmann::json; +json json_results; + +using namespace quda; + // This is the MPI grid, i.e. the layout of ranks int nranks = -1; std::array mpi_grid = {1, 1, 1, 1}; @@ -43,6 +48,9 @@ void initComms(int argc, char **argv) for (int d = 0; d < 4; d++) if (mpi_grid[d] > 1) commDimPartitionedSet(d); + + json_results["geometry"]["ranks"] = nranks; + json_results["geometry"]["mpi"] = mpi_grid; } // creates a random gauge field. L = local(!) size @@ -149,9 +157,8 @@ ColorSpinorField make_source(int L, int Ls = 1) return src; } -void benchmark_wilson() +void benchmark_wilson(std::vector const &L_list, int niter) { - int niter = 20; int niter_warmup = 10; printfQuda("==================== wilson dirac operator ====================\n"); @@ -163,7 +170,7 @@ void benchmark_wilson() #endif printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank"); - for (int L : {8, 12, 16, 24, 32, 48}) + for (int L : L_list) { auto U = make_gauge_field(L); auto src = make_source(L); @@ -180,18 +187,18 @@ void benchmark_wilson() // Not used for simple Wilson fermions) dirac.updateFields(&U, nullptr, nullptr, nullptr); - auto tmp = ColorSpinorField(ColorSpinorParam(src)); + auto res = ColorSpinorField(ColorSpinorParam(src)); // couple iterations without timing to warm up for (int iter = 0; iter < niter_warmup; ++iter) - dirac.Dslash(tmp, src, QUDA_EVEN_PARITY); + dirac.Dslash(res, src, QUDA_EVEN_PARITY); // actual benchmark with timings dirac.Flops(); // reset flops counter device_timer_t device_timer; device_timer.start(); for (int iter = 0; iter < niter; ++iter) - dirac.Dslash(tmp, src, QUDA_EVEN_PARITY); + dirac.Dslash(res, src, QUDA_EVEN_PARITY); device_timer.stop(); double secs = device_timer.last() / niter; @@ -209,12 +216,16 @@ void benchmark_wilson() #endif printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9); + + json tmp; + tmp["L"] = L; + tmp["Gflops_wilson"] = flops / secs * 1e-9; + json_results["flops"]["results"].push_back(tmp); } } -void benchmark_dwf() +void benchmark_dwf(std::vector const &L_list, int niter) { - int niter = 20; int niter_warmup = 10; printfQuda("==================== domain wall dirac operator ====================\n"); @@ -226,7 +237,7 @@ void benchmark_dwf() #endif printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank"); int Ls = 12; - for (int L : 
{8, 12, 16, 24}) + for (int L : L_list) { auto U = make_gauge_field(L); auto src = make_source(L, Ls); @@ -244,18 +255,18 @@ void benchmark_dwf() // (the additional nullptr's are for smeared links and fancy preconditioners and such) dirac.updateFields(&U, nullptr, nullptr, nullptr); - auto tmp = ColorSpinorField(ColorSpinorParam(src)); + auto res = ColorSpinorField(ColorSpinorParam(src)); // couple iterations without timing to warm up for (int iter = 0; iter < niter_warmup; ++iter) - dirac.Dslash(tmp, src, QUDA_EVEN_PARITY); + dirac.Dslash(res, src, QUDA_EVEN_PARITY); // actual benchmark with timings dirac.Flops(); // reset flops counter device_timer_t device_timer; device_timer.start(); for (int iter = 0; iter < niter; ++iter) - dirac.Dslash(tmp, src, QUDA_EVEN_PARITY); + dirac.Dslash(res, src, QUDA_EVEN_PARITY); device_timer.stop(); double secs = device_timer.last() / niter; @@ -273,15 +284,18 @@ void benchmark_dwf() #endif printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9); + json tmp; + tmp["L"] = L; + tmp["Gflops_dwf4"] = flops / secs * 1e-9; + json_results["flops"]["results"].push_back(tmp); } } -void benchmark_axpy() +void benchmark_axpy(std::vector const &L_list, int niter) { // number of iterations for warmup / measurement // (feel free to change for noise/time tradeoff) constexpr int niter_warmup = 10; - constexpr int niter = 20; printfQuda("==================== axpy / memory ====================\n"); @@ -305,7 +319,6 @@ void benchmark_axpy() printfQuda("%5s %15s %15s %15s %15s\n", "L", "size (MiB/rank)", "time (usec)", "GiB/s/rank", "Gflop/s/rank"); - std::vector L_list = {8, 12, 16, 24, 32, 48}; for (int L : L_list) { // IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()` @@ -348,14 +361,29 @@ void benchmark_axpy() blas::axpy(1.234, fieldA, fieldB); device_timer.stop(); double secs = device_timer.last() / niter; // seconds per iteration + double mem_MiB = memory / 1024. / 1024.; + double GBps = mem_MiB / 1024 / secs; + printfQuda("%5d %15.2f %15.2f %15.2f %15.2f\n", L, mem_MiB, secs * 1e6, GBps, + flops / secs * 1e-9); - printfQuda("%5d %15.2f %15.2f %15.2f %15.2f\n", L, memory / 1024. / 1024., secs * 1e6, - memory / secs / 1024. / 1024. / 1024., flops / secs * 1e-9); + json tmp; + tmp["L"] = L; + tmp["size_MB"] = mem_MiB; + tmp["GBps"] = GBps; + tmp["GFlops"] = flops / secs * 1e-9; + json_results["axpy"].push_back(tmp); } } int main(int argc, char **argv) { + std::string json_filename = ""; // empty indicates no json output + for (int i = 0; i < argc; i++) + { + if (std::string(argv[i]) == "--json-out") + json_filename = argv[i + 1]; + } + initComms(argc, argv); initQuda(-1); // -1 for multi-gpu. 
@@ -367,14 +395,28 @@
   printfQuda("MPI layout = %d %d %d %d\n", mpi_grid[0], mpi_grid[1], mpi_grid[2],
              mpi_grid[3]);
 
-  benchmark_axpy();
+  benchmark_axpy({8, 12, 16, 24, 32, 48}, 20);
 
   setVerbosity(QUDA_SILENT);
-  benchmark_wilson();
-  benchmark_dwf();
+  benchmark_wilson({8, 12, 16, 24, 32, 48}, 20);
+  benchmark_dwf({8, 12, 16, 24, 32}, 20);
   setVerbosity(QUDA_SUMMARIZE);
 
   printfQuda("==================== done with all benchmarks ====================\n");
+
+  if (!json_filename.empty())
+  {
+    printfQuda("writing benchmark results to %s\n", json_filename.c_str());
+
+    int me = 0;
+    MPI_Comm_rank(MPI_COMM_WORLD, &me);
+    if (me == 0)
+    {
+      std::ofstream json_file(json_filename);
+      json_file << std::setw(2) << json_results;
+    }
+  }
+
   endQuda();
   quda::comm_finalize();
   MPI_Finalize();
-- 
2.45.2


From 8cd10019dbd31e94510c5257b8d051ab0afabc15 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20B=C3=BCrger?=
Date: Mon, 19 Jun 2023 18:22:24 +0100
Subject: [PATCH 13/16] add timestamps to benchmarks

---
 Quda/Benchmark_Quda.cpp | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/Quda/Benchmark_Quda.cpp b/Quda/Benchmark_Quda.cpp
index 84b8565..689cf32 100644
--- a/Quda/Benchmark_Quda.cpp
+++ b/Quda/Benchmark_Quda.cpp
@@ -2,6 +2,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -20,6 +21,17 @@ json json_results;
 
 using namespace quda;
 
+// timestamp = seconds since program start.
+// these are written to the json output with the goal of later matching them against
+// power-measurements to determine energy efficiency.
+using Clock = std::chrono::steady_clock;
+Clock::time_point program_start_time = Clock::now();
+double get_timestamp()
+{
+  auto dur = Clock::now() - program_start_time;
+  return std::chrono::duration_cast<std::chrono::microseconds>(dur).count() * 1.0e-6;
+}
+
 // This is the MPI grid, i.e. the layout of ranks
 int nranks = -1;
 std::array<int, 4> mpi_grid = {1, 1, 1, 1};
 
@@ -197,8 +209,10 @@
     dirac.Flops(); // reset flops counter
     device_timer_t device_timer;
     device_timer.start();
+    double start_time = get_timestamp();
     for (int iter = 0; iter < niter; ++iter)
       dirac.Dslash(res, src, QUDA_EVEN_PARITY);
+    double end_time = get_timestamp();
     device_timer.stop();
 
     double secs = device_timer.last() / niter;
@@ -220,6 +234,8 @@
     json tmp;
     tmp["L"] = L;
     tmp["Gflops_wilson"] = flops / secs * 1e-9;
+    tmp["start_time"] = start_time;
+    tmp["end_time"] = end_time;
     json_results["flops"]["results"].push_back(tmp);
   }
 }
@@ -265,8 +281,10 @@
     dirac.Flops(); // reset flops counter
     device_timer_t device_timer;
     device_timer.start();
+    double start_time = get_timestamp();
     for (int iter = 0; iter < niter; ++iter)
       dirac.Dslash(res, src, QUDA_EVEN_PARITY);
+    double end_time = get_timestamp();
     device_timer.stop();
 
     double secs = device_timer.last() / niter;
@@ -287,6 +305,8 @@
     json tmp;
     tmp["L"] = L;
     tmp["Gflops_dwf4"] = flops / secs * 1e-9;
+    tmp["start_time"] = start_time;
+    tmp["end_time"] = end_time;
     json_results["flops"]["results"].push_back(tmp);
   }
 }
@@ -357,8 +377,10 @@
     // running the actual benchmark
     device_timer_t device_timer;
     device_timer.start();
+    double start_time = get_timestamp();
     for (int iter = 0; iter < niter; ++iter)
       blas::axpy(1.234, fieldA, fieldB);
+    double end_time = get_timestamp();
     device_timer.stop();
     double secs = device_timer.last() / niter; // seconds per iteration
     double mem_MiB = memory / 1024. / 1024.;
@@ -371,6 +393,8 @@
     tmp["L"] = L;
     tmp["size_MB"] = mem_MiB;
     tmp["GBps"] = GBps;
     tmp["GFlops"] = flops / secs * 1e-9;
+    tmp["start_time"] = start_time;
+    tmp["end_time"] = end_time;
     json_results["axpy"].push_back(tmp);
   }
 }
-- 
2.45.2


From 7648ed7496a74bac5b5b24a10a17cd54f33507a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20B=C3=BCrger?=
Date: Tue, 20 Jun 2023 18:08:34 +0100
Subject: [PATCH 14/16] choose iteration count automatically

---
 Quda/Benchmark_Quda.cpp | 121 ++++++++++++++++++++++++------------------------
 1 file changed, 61 insertions(+), 60 deletions(-)

diff --git a/Quda/Benchmark_Quda.cpp b/Quda/Benchmark_Quda.cpp
index 689cf32..67c81bc 100644
--- a/Quda/Benchmark_Quda.cpp
+++ b/Quda/Benchmark_Quda.cpp
@@ -36,6 +36,29 @@
 int nranks = -1;
 std::array<int, 4> mpi_grid = {1, 1, 1, 1};
 
+// run f() in a loop for roughly target_time seconds
+// returns seconds per iteration it took
+template <class F> double bench(F const &f, double target_time, int niter_warmup = 5)
+{
+  device_timer_t timer;
+  timer.start();
+  for (int iter = 0; iter < niter_warmup; ++iter)
+    f();
+  timer.stop();
+
+  double secs = timer.last() / niter_warmup;
+  int niter = std::max(1, int(target_time / secs));
+  // niter = std::min(1000, niter);
+  // printfQuda("during warmup took %f s/iter, deciding on %d iters\n", secs, niter);
+
+  timer.reset(__FUNCTION__, __FILE__, __LINE__);
+  timer.start();
+  for (int iter = 0; iter < niter; ++iter)
+    f();
+  timer.stop();
+  return timer.last() / niter;
+}
+
 void initComms(int argc, char **argv)
 {
   // init MPI communication
@@ -169,10 +192,8 @@ ColorSpinorField make_source(int L, int Ls = 1)
   return src;
 }
 
-void benchmark_wilson(std::vector<int> const &L_list, int niter)
+void benchmark_wilson(std::vector<int> const &L_list, double target_time)
 {
-  int niter_warmup = 10;
-
   printfQuda("==================== wilson dirac operator ====================\n");
 #ifdef FLOP_COUNTING_GRID
   printfQuda("IMPORTANT: flop counting as in Benchmark_Grid\n");
@@ -184,6 +205,8 @@
 
   for (int L : L_list)
   {
+    // printfQuda("starting wilson L=%d\n", L);
+
     auto U = make_gauge_field(L);
     auto src = make_source(L);
 
@@ -198,35 +221,26 @@
     // (the additional nullptr's are for smeared links and fancy preconditioners and such.
     // Not used for simple Wilson fermions)
     dirac.updateFields(&U, nullptr, nullptr, nullptr);
-
     auto res = ColorSpinorField(ColorSpinorParam(src));
+    auto f = [&]() { dirac.Dslash(res, src, QUDA_EVEN_PARITY); };
 
-    // couple iterations without timing to warm up
-    for (int iter = 0; iter < niter_warmup; ++iter)
-      dirac.Dslash(res, src, QUDA_EVEN_PARITY);
-
-    // actual benchmark with timings
+    // first run to get the quda tuning out of the way
     dirac.Flops(); // reset flops counter
-    device_timer_t device_timer;
-    device_timer.start();
-    double start_time = get_timestamp();
-    for (int iter = 0; iter < niter; ++iter)
-      dirac.Dslash(res, src, QUDA_EVEN_PARITY);
-    double end_time = get_timestamp();
-    device_timer.stop();
+    f();
+    double flops = 1.0 * dirac.Flops();
 
-    double secs = device_timer.last() / niter;
+    // actual benchmarking
+    double start_time = get_timestamp();
+    double secs = bench(f, target_time);
+    double end_time = get_timestamp();
 
 #ifdef FLOP_COUNTING_GRID
     // this is the flop counting from Benchmark_Grid
     double Nc = 3;
     double Nd = 4;
     double Ns = 4;
-    double flops =
-        (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2);
+    flops = (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2);
     flops *= L * L * L * L / 2.0;
-#else
-    double flops = 1.0 * dirac.Flops() / niter;
 #endif
 
     printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9);
@@ -240,10 +254,8 @@
   }
 }
 
-void benchmark_dwf(std::vector<int> const &L_list, int niter)
+void benchmark_dwf(std::vector<int> const &L_list, double target_time)
 {
-  int niter_warmup = 10;
-
   printfQuda("==================== domain wall dirac operator ====================\n");
 #ifdef FLOP_COUNTING_GRID
   printfQuda("IMPORTANT: flop counting as in Benchmark_Grid\n");
@@ -255,6 +267,7 @@
   int Ls = 12;
   for (int L : L_list)
   {
+    // printfQuda("starting dwf L=%d\n", L);
     auto U = make_gauge_field(L);
     auto src = make_source(L, Ls);
 
@@ -270,35 +283,26 @@
     // insert gauge field into the dirac operator
     // (the additional nullptr's are for smeared links and fancy preconditioners and such)
     dirac.updateFields(&U, nullptr, nullptr, nullptr);
-
     auto res = ColorSpinorField(ColorSpinorParam(src));
+    auto f = [&]() { dirac.Dslash(res, src, QUDA_EVEN_PARITY); };
 
-    // couple iterations without timing to warm up
-    for (int iter = 0; iter < niter_warmup; ++iter)
-      dirac.Dslash(res, src, QUDA_EVEN_PARITY);
-
-    // actual benchmark with timings
+    // first run to get the quda tuning out of the way
     dirac.Flops(); // reset flops counter
-    device_timer_t device_timer;
-    device_timer.start();
-    double start_time = get_timestamp();
-    for (int iter = 0; iter < niter; ++iter)
-      dirac.Dslash(res, src, QUDA_EVEN_PARITY);
-    double end_time = get_timestamp();
-    device_timer.stop();
+    f();
+    double flops = 1.0 * dirac.Flops();
 
-    double secs = device_timer.last() / niter;
+    // actual benchmarking
+    double start_time = get_timestamp();
+    double secs = bench(f, target_time);
+    double end_time = get_timestamp();
 
 #ifdef FLOP_COUNTING_GRID
     // this is the flop counting from Benchmark_Grid
     double Nc = 3;
     double Nd = 4;
     double Ns = 4;
-    double flops =
-        (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2);
+    flops = (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2);
     flops *= L * L * L * L * Ls / 2.0;
-#else
-    double flops = 1.0 * dirac.Flops() / niter;
 #endif
 
     printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9);
@@ -311,11 +315,11 @@
   }
 }
 
-void benchmark_axpy(std::vector<int> const &L_list, int niter)
+void benchmark_axpy(std::vector<int> const &L_list, double target_time)
 {
   // number of iterations for warmup / measurement
   // (feel free to change for noise/time tradeoff)
-  constexpr int niter_warmup = 10;
+  constexpr int niter_warmup = 5;
 
   printfQuda("==================== axpy / memory ====================\n");
 
@@ -341,8 +345,9 @@
              "GiB/s/rank", "Gflop/s/rank");
   for (int L : L_list)
   {
-    // IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()`
-    // are LOCAL, i.e. per rank / per GPU
+    // printfQuda("starting axpy L=%d\n", L);
+    // IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()`
+    // are LOCAL, i.e. per rank / per GPU
 
     param.x[0] = L;
     param.x[1] = L;
@@ -369,20 +374,16 @@
     double flops = 2 * field_elements;
     double memory = 3 * sizeof(float) * field_elements;
 
-    // do some iterations to to let QUDA do its internal tuning and also stabilize cache
-    // behaviour and such
-    for (int iter = 0; iter < niter_warmup; ++iter)
-      blas::axpy(1.234, fieldA, fieldB);
+    auto f = [&]() { blas::axpy(1.234, fieldA, fieldB); };
 
-    // running the actual benchmark
-    device_timer_t device_timer;
-    device_timer.start();
+    // first run to get the quda tuning out of the way
+    f();
+
+    // actual benchmarking
     double start_time = get_timestamp();
-    for (int iter = 0; iter < niter; ++iter)
-      blas::axpy(1.234, fieldA, fieldB);
+    double secs = bench(f, target_time);
     double end_time = get_timestamp();
-    device_timer.stop();
-    double secs = device_timer.last() / niter; // seconds per iteration
+
     double mem_MiB = memory / 1024. / 1024.;
     double GBps = mem_MiB / 1024 / secs;
     printfQuda("%5d %15.2f %15.2f %15.2f %15.2f\n", L, mem_MiB, secs * 1e6, GBps,
@@ -419,11 +420,11 @@
   printfQuda("MPI layout = %d %d %d %d\n", mpi_grid[0], mpi_grid[1], mpi_grid[2],
              mpi_grid[3]);
 
-  benchmark_axpy({8, 12, 16, 24, 32, 48}, 20);
+  benchmark_axpy({8, 12, 16, 24, 32, 48}, 1.0);
 
   setVerbosity(QUDA_SILENT);
-  benchmark_wilson({8, 12, 16, 24, 32, 48}, 20);
-  benchmark_dwf({8, 12, 16, 24, 32}, 20);
+  benchmark_wilson({8, 12, 16, 24, 32, 48}, 1.0);
+  benchmark_dwf({8, 12, 16, 24, 32}, 1.0);
   setVerbosity(QUDA_SUMMARIZE);
 
   printfQuda("==================== done with all benchmarks ====================\n");
-- 
2.45.2


From 5103c2a592808bb1c0ac82c6363e778c102359d8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20B=C3=BCrger?=
Date: Wed, 21 Jun 2023 14:42:03 +0100
Subject: [PATCH 15/16] fix bug that made benchmark_quda hang randomly

---
 Quda/Benchmark_Quda.cpp | 5 +++++
 Quda/build-benchmark.sh | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/Quda/Benchmark_Quda.cpp b/Quda/Benchmark_Quda.cpp
index 67c81bc..11d8c6e 100644
--- a/Quda/Benchmark_Quda.cpp
+++ b/Quda/Benchmark_Quda.cpp
@@ -4,6 +4,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -51,6 +52,10 @@ template <class F> double bench(F const &f, double target_time, int niter_warmup
   // niter = std::min(1000, niter);
   // printfQuda("during warmup took %f s/iter, deciding on %d iters\n", secs, niter);
 
+  // important: each rank has its own timer, so their measurements can slightly vary. But
+  // 'niter' needs to be consistent (bug took me a couple hours to track down)
+  comm_broadcast_global(&niter, sizeof(niter), 0);
+
   timer.reset(__FUNCTION__, __FILE__, __LINE__);
   timer.start();
   for (int iter = 0; iter < niter; ++iter)
diff --git a/Quda/build-benchmark.sh b/Quda/build-benchmark.sh
index 9a9892c..288f3ac 100755
--- a/Quda/build-benchmark.sh
+++ b/Quda/build-benchmark.sh
@@ -28,5 +28,5 @@ mkdir -p "${PREFIX_DIR}"
 
 LINK_FLAGS="-Wl,-rpath,$QUDA_DIR/lib: $QUDA_DIR/lib/libquda.so $EXTRA_LIBS -lpthread -lmpi"
 
-g++ $BUILD_FLAGS -I$QUDA_DIR/include -c -o $BUILD_DIR/Benchmark_Quda.o $script_dir/Benchmark_Quda.cpp
+g++ $BUILD_FLAGS -I$QUDA_DIR/include/targets/cuda -I$QUDA_DIR/include -c -o $BUILD_DIR/Benchmark_Quda.o $script_dir/Benchmark_Quda.cpp
 g++ -g -O3 $BUILD_DIR/Benchmark_Quda.o -o $PREFIX_DIR/Benchmark_Quda $LINK_FLAGS -lmpi
-- 
2.45.2


From 2351edf3f78c1b290359ed6c2fecebfe73de311f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20B=C3=BCrger?=
Date: Wed, 28 Jun 2023 13:29:58 +0100
Subject: [PATCH 16/16] log iso-timestamp instead of seconds-since-start

---
 Quda/Benchmark_Quda.cpp | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/Quda/Benchmark_Quda.cpp b/Quda/Benchmark_Quda.cpp
index 11d8c6e..7421652 100644
--- a/Quda/Benchmark_Quda.cpp
+++ b/Quda/Benchmark_Quda.cpp
@@ -22,15 +22,20 @@ json json_results;
 
 using namespace quda;
 
-// timestamp = seconds since program start.
-// these are written to the json output with the goal of later matching them against
-// power-measurements to determine energy efficiency.
-using Clock = std::chrono::steady_clock;
-Clock::time_point program_start_time = Clock::now();
-double get_timestamp()
+// thanks chatGPT :)
+std::string get_timestamp()
 {
-  auto dur = Clock::now() - program_start_time;
-  return std::chrono::duration_cast<std::chrono::microseconds>(dur).count() * 1.0e-6;
+  // Get the current time
+  auto now = std::chrono::system_clock::now();
+
+  // Convert the current time to a time_t object
+  std::time_t currentTime = std::chrono::system_clock::to_time_t(now);
+
+  // Format the time using std::put_time
+  std::stringstream ss;
+  ss << std::put_time(std::localtime(&currentTime), "%Y%m%d %H:%M:%S");
+
+  return ss.str();
 }
 
 // This is the MPI grid, i.e. the layout of ranks
@@ -235,9 +240,9 @@
     double flops = 1.0 * dirac.Flops();
 
     // actual benchmarking
-    double start_time = get_timestamp();
+    auto start_time = get_timestamp();
     double secs = bench(f, target_time);
-    double end_time = get_timestamp();
+    auto end_time = get_timestamp();
 
 #ifdef FLOP_COUNTING_GRID
     // this is the flop counting from Benchmark_Grid
@@ -297,9 +302,9 @@
     double flops = 1.0 * dirac.Flops();
 
     // actual benchmarking
-    double start_time = get_timestamp();
+    auto start_time = get_timestamp();
     double secs = bench(f, target_time);
-    double end_time = get_timestamp();
+    auto end_time = get_timestamp();
 
 #ifdef FLOP_COUNTING_GRID
     // this is the flop counting from Benchmark_Grid
@@ -385,9 +390,9 @@
     f();
 
     // actual benchmarking
-    double start_time = get_timestamp();
+    auto start_time = get_timestamp();
     double secs = bench(f, target_time);
-    double end_time = get_timestamp();
+    auto end_time = get_timestamp();
 
     double mem_MiB = memory / 1024. / 1024.;
     double GBps = mem_MiB / 1024 / secs;
-- 
2.45.2
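
Patches 14 and 15 above work together: bench() derives the iteration count from a per-rank warmup measurement, and the broadcast then pins that count to rank 0's value so every rank runs the same number of (collective) operator calls. Below is a condensed, QUDA-free sketch of the same pattern, for reference only: it substitutes plain MPI_Wtime and MPI_Bcast for QUDA's device timer and comm_broadcast_global, and a dummy workload for dirac.Dslash; bench_sketch and the workload are illustrative names, not part of the benchmark.

#include <mpi.h>

#include <algorithm>
#include <cstdio>

// Warm up, choose niter so the timed loop runs for roughly target_time
// seconds, then measure; returns seconds per iteration.
template <class F> double bench_sketch(F const &f, double target_time, int niter_warmup = 5)
{
  double t0 = MPI_Wtime();
  for (int iter = 0; iter < niter_warmup; ++iter)
    f();
  double secs = (MPI_Wtime() - t0) / niter_warmup;

  int niter = std::max(1, int(target_time / secs));

  // Each rank times its own warmup, so niter can differ between ranks. If f()
  // contains a collective (halo exchange, global sum), mismatched loop counts
  // leave some ranks waiting forever; broadcasting rank 0's choice avoids that.
  MPI_Bcast(&niter, 1, MPI_INT, 0, MPI_COMM_WORLD);

  t0 = MPI_Wtime();
  for (int iter = 0; iter < niter; ++iter)
    f();
  return (MPI_Wtime() - t0) / niter;
}

int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);
  volatile double x = 1.0; // dummy workload standing in for dirac.Dslash(...)
  double secs = bench_sketch(
      [&]
      {
        for (int i = 0; i < (1 << 20); ++i)
          x = x * 1.000001;
      },
      /*target_time=*/0.5);
  std::printf("%.3g s/iter\n", secs);
  MPI_Finalize();
  return 0;
}

Without the broadcast, a rank with a marginally faster warmup picks a larger niter and blocks inside a collective that the other ranks never enter; that is exactly the random hang fixed in patch 15.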
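The timestamps exist, per the comment introduced in patch 13, to be matched later against separate power measurements. A hypothetical post-processing sketch in that spirit follows; the power-log format (one "epoch_seconds watts" pair per line), the file names, and the helper names are all assumptions, and only the L, start_time, and end_time fields correspond to the JSON that Benchmark_Quda actually writes (as strings, in the patch-16 format).

#include <algorithm>
#include <cstdio>
#include <ctime>
#include <fstream>
#include <iomanip>
#include <sstream>
#include <string>
#include <vector>

#include "json.hpp" // same single-header nlohmann/json the benchmark uses

// Parse the "%Y%m%d %H:%M:%S" stamps written by get_timestamp() back into
// epoch seconds (local time, matching the std::localtime call above).
double parse_stamp(std::string const &s)
{
  std::tm tm = {};
  std::istringstream ss(s);
  ss >> std::get_time(&tm, "%Y%m%d %H:%M:%S");
  tm.tm_isdst = -1;
  return double(std::mktime(&tm));
}

struct PowerSample
{
  double t, watts;
};

// Trapezoidal integral of sampled power over [t0, t1], in joules.
double energy(std::vector<PowerSample> const &log, double t0, double t1)
{
  double e = 0;
  for (size_t i = 1; i < log.size(); ++i)
  {
    double a = std::max(log[i - 1].t, t0);
    double b = std::min(log[i].t, t1);
    if (a < b)
      e += 0.5 * (log[i - 1].watts + log[i].watts) * (b - a);
  }
  return e;
}

int main()
{
  std::vector<PowerSample> log;
  std::ifstream pf("power.log"); // hypothetical sampler output
  for (PowerSample s; pf >> s.t >> s.watts;)
    log.push_back(s);

  nlohmann::json results;
  std::ifstream jf("benchmark.json"); // output of Benchmark_Quda --json-out
  jf >> results;

  for (auto const &r : results["flops"]["results"])
  {
    double joules = energy(log, parse_stamp(r["start_time"].get<std::string>()),
                           parse_stamp(r["end_time"].get<std::string>()));
    std::printf("L=%2d  %.1f J\n", r["L"].get<int>(), joules);
  }
  return 0;
}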