From 22caa158cc93414b3706e1705d299134943976e4 Mon Sep 17 00:00:00 2001
From: Antonin Portelli
Date: Fri, 9 Oct 2020 20:29:40 +0100
Subject: [PATCH] multi-pass I/O benchmark, with statistics and robustness summary

---
 benchmarks/Benchmark_IO.cc  | 202 ++++++++++++++++++++++++++++--------
 benchmarks/Benchmark_IO.hpp |  54 +++++++---
 2 files changed, 198 insertions(+), 58 deletions(-)

diff --git a/benchmarks/Benchmark_IO.cc b/benchmarks/Benchmark_IO.cc
index 5e4cef9f..0393257d 100644
--- a/benchmarks/Benchmark_IO.cc
+++ b/benchmarks/Benchmark_IO.cc
@@ -1,8 +1,16 @@
 #include "Benchmark_IO.hpp"
 
+#ifndef BENCH_IO_LMIN
+#define BENCH_IO_LMIN 20
+#endif
+
 #ifndef BENCH_IO_LMAX
-#define BENCH_IO_LMAX 40
+#define BENCH_IO_LMAX 30
+#endif
+
+#ifndef BENCH_IO_NPASS
+#define BENCH_IO_NPASS 10
 #endif
 
 using namespace Grid;
 
@@ ... @@ std::string filestem(const int l)
   return "iobench_l" + std::to_string(l);
 }
 
+int vol(const int i)
+{
+  return BENCH_IO_LMIN + 2*i;
+}
+
+int volInd(const int l)
+{
+  return (l - BENCH_IO_LMIN)/2;
+}
+
+template <typename Mat>
+void stats(Mat &mean, Mat &stdDev, const std::vector<Mat> &data)
+{
+  auto            nr = data[0].rows(), nc = data[0].cols();
+  Eigen::MatrixXd sqSum(nr, nc);
+  double          n = static_cast<double>(data.size());
+
+  assert(n > 1.);
+  mean  = Mat::Zero(nr, nc);
+  sqSum = Mat::Zero(nr, nc);
+  for (auto &d: data)
+  {
+    mean  += d;
+    sqSum += d.cwiseProduct(d);
+  }
+  stdDev = ((sqSum - mean.cwiseProduct(mean)/n)/(n - 1.)).cwiseSqrt();
+  mean  /= n;
+}
+
+#define grid_printf(...) \
+do { MSG << ""; printf(__VA_ARGS__); } while (0)
+
+enum {sRead = 0, sWrite = 1, gRead = 2, gWrite = 3};
+
 int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
 
-  int64_t threads = GridThread::GetThreads();
-  auto    mpi     = GridDefaultMpi();
-  std::vector<int> latt;
+  int64_t      threads = GridThread::GetThreads();
+  auto         mpi     = GridDefaultMpi();
+  unsigned int nVol    = (BENCH_IO_LMAX - BENCH_IO_LMIN)/2 + 1;
+  unsigned int nRelVol = (BENCH_IO_LMAX - 24)/2 + 1;
+  std::vector<Eigen::MatrixXd> perf(BENCH_IO_NPASS, Eigen::MatrixXd::Zero(nVol, 4));
+  std::vector<Eigen::VectorXd> avPerf(BENCH_IO_NPASS, Eigen::VectorXd::Zero(4));
+  std::vector<int>             latt;
 
   MSG << "Grid is setup to use " << threads << " threads" << std::endl;
   MSG << "MPI partition " << mpi << std::endl;
-
-  MSG << SEP << std::endl;
-  MSG << "Benchmark std write" << std::endl;
-  MSG << SEP << std::endl;
-  for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
+  for (unsigned int i = 0; i < BENCH_IO_NPASS; ++i)
   {
-    latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
+    MSG << BIGSEP << std::endl;
+    MSG << "Pass " << i + 1 << "/" << BENCH_IO_NPASS << std::endl;
+    MSG << BIGSEP << std::endl;
+    MSG << SEP << std::endl;
+    MSG << "Benchmark std write" << std::endl;
+    MSG << SEP << std::endl;
+    for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
+    {
+      latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
 
-    MSG << "-- Local volume " << l << "^4" << std::endl;
-    writeBenchmark<LatticeFermion>(latt, filestem(l), stdWrite<LatticeFermion>);
-  }
+      MSG << "-- Local volume " << l << "^4" << std::endl;
+      writeBenchmark<LatticeFermion>(latt, filestem(l), stdWrite<LatticeFermion>);
+      perf[i](volInd(l), sWrite) = BinaryIO::lastPerf.mbytesPerSecond;
+    }
 
-  MSG << SEP << std::endl;
-  MSG << "Benchmark std read" << std::endl;
-  MSG << SEP << std::endl;
-  for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
-  {
-    latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
+    MSG << SEP << std::endl;
+    MSG << "Benchmark std read" << std::endl;
+    MSG << SEP << std::endl;
+    for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
+    {
+      latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
 
-    MSG << "-- Local volume " << l << "^4" << std::endl;
-    readBenchmark<LatticeFermion>(latt, filestem(l), stdRead<LatticeFermion>);
-  }
+      MSG << "-- Local volume " << l << "^4" << std::endl;
+      readBenchmark<LatticeFermion>(latt, filestem(l), stdRead<LatticeFermion>);
+      perf[i](volInd(l), sRead) = BinaryIO::lastPerf.mbytesPerSecond;
+    }
 
-#ifdef HAVE_LIME
-  MSG << SEP << std::endl;
-  MSG << "Benchmark Grid C-Lime write" << std::endl;
-  MSG << SEP << std::endl;
-  for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
-  {
-    latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
+    #ifdef HAVE_LIME
+    MSG << SEP << std::endl;
+    MSG << "Benchmark Grid C-Lime write" << std::endl;
+    MSG << SEP << std::endl;
+    for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
+    {
+      latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
 
-    MSG << "-- Local volume " << l << "^4" << std::endl;
-    writeBenchmark<LatticeFermion>(latt, filestem(l), limeWrite<LatticeFermion>);
-  }
+      MSG << "-- Local volume " << l << "^4" << std::endl;
+      writeBenchmark<LatticeFermion>(latt, filestem(l), limeWrite<LatticeFermion>);
+      perf[i](volInd(l), gWrite) = BinaryIO::lastPerf.mbytesPerSecond;
+    }
 
-  MSG << SEP << std::endl;
-  MSG << "Benchmark Grid C-Lime read" << std::endl;
-  MSG << SEP << std::endl;
-  for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
-  {
-    latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
+    MSG << SEP << std::endl;
+    MSG << "Benchmark Grid C-Lime read" << std::endl;
+    MSG << SEP << std::endl;
+    for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
+    {
+      latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
 
-    MSG << "-- Local volume " << l << "^4" << std::endl;
-    readBenchmark<LatticeFermion>(latt, filestem(l), limeRead<LatticeFermion>);
-  }
+      MSG << "-- Local volume " << l << "^4" << std::endl;
+      readBenchmark<LatticeFermion>(latt, filestem(l), limeRead<LatticeFermion>);
+      perf[i](volInd(l), gRead) = BinaryIO::lastPerf.mbytesPerSecond;
+    }
 #endif
+    avPerf[i].fill(0.);
+    for (int f = 0; f < 4; ++f)
+    for (int l = 24; l <= BENCH_IO_LMAX; l += 2)
+    {
+      avPerf[i](f) += perf[i](volInd(l), f);
+    }
+    avPerf[i] /= nRelVol;
+  }
+
+  Eigen::MatrixXd mean(nVol, 4), stdDev(nVol, 4), rob(nVol, 4);
+  Eigen::VectorXd avMean(4), avStdDev(4), avRob(4);
+  double          n = BENCH_IO_NPASS;
+
+  stats(mean, stdDev, perf);
+  stats(avMean, avStdDev, avPerf);
+  rob.fill(100.);
+  rob -= 100.*stdDev.cwiseQuotient(mean.cwiseAbs());
+  avRob.fill(100.);
+  avRob -= 100.*avStdDev.cwiseQuotient(avMean.cwiseAbs());
+
+  MSG << BIGSEP << std::endl;
+  MSG << "SUMMARY" << std::endl;
+  MSG << BIGSEP << std::endl;
+  MSG << "Summary of individual results (all results in MB/s)." << std::endl;
+  MSG << "Every second column gives the standard deviation of the previous column." << std::endl;
+  MSG << std::endl;
+  grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n",
+              "L", "std read", "std dev", "std write", "std dev",
+              "Grid read", "std dev", "Grid write", "std dev");
+  for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
+  {
+    grid_printf("%4d %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n",
+                l, mean(volInd(l), sRead), stdDev(volInd(l), sRead),
+                mean(volInd(l), sWrite), stdDev(volInd(l), sWrite),
+                mean(volInd(l), gRead), stdDev(volInd(l), gRead),
+                mean(volInd(l), gWrite), stdDev(volInd(l), gWrite));
+  }
+  MSG << std::endl;
+  MSG << "Robustness of individual results, in %. (rob = 100% * (1 - std dev / mean))" << std::endl;
+  MSG << std::endl;
+  grid_printf("%4s %12s %12s %12s %12s\n",
+              "L", "std read", "std write", "Grid read", "Grid write");
+  for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
+  {
+    grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n",
+                l, rob(volInd(l), sRead), rob(volInd(l), sWrite),
+                rob(volInd(l), gRead), rob(volInd(l), gWrite));
+  }
+  MSG << std::endl;
+  MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX << "^4 (all results in MB/s)." << std::endl;
+  MSG << "Every second column gives the standard deviation of the previous column." << std::endl;
+  MSG << std::endl;
+  grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n",
+              "std read", "std dev", "std write", "std dev",
+              "Grid read", "std dev", "Grid write", "std dev");
+  grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n",
+              avMean(sRead), avStdDev(sRead), avMean(sWrite), avStdDev(sWrite),
+              avMean(gRead), avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
+  MSG << std::endl;
+  MSG << "Robustness of volume-averaged results, in %. (rob = 100% * (1 - std dev / mean))" << std::endl;
+  MSG << std::endl;
+  grid_printf("%12s %12s %12s %12s\n",
+              "std read", "std write", "Grid read", "Grid write");
+  grid_printf("%12.1f %12.1f %12.1f %12.1f\n",
+              avRob(sRead), avRob(sWrite), avRob(gRead), avRob(gWrite));
 
   Grid_finalize();
 
diff --git a/benchmarks/Benchmark_IO.hpp b/benchmarks/Benchmark_IO.hpp
index 39af14ba..c4a6ca58 100644
--- a/benchmarks/Benchmark_IO.hpp
+++ b/benchmarks/Benchmark_IO.hpp
@@ -5,6 +5,8 @@
 #ifdef HAVE_LIME
 #define MSG std::cout << GridLogMessage
 #define SEP \
+"-----------------------------------------------------------------------------"
+#define BIGSEP \
 "============================================================================="
 
 namespace Grid {
@@ -37,9 +39,12 @@ using ReaderFn = std::function<void(Field &, const std::string)>;
 //   ioWatch.Stop();
 //   std::fclose(file);
 //   size *= vec.Grid()->ProcessorCount();
-//   MSG << "Std I/O write: Wrote " << size << " bytes in " << ioWatch.Elapsed()
-//       << ", performance " << size/1024./1024./(ioWatch.useconds()/1.e6)
-//       << " MB/s" << std::endl;
+//   auto &p = BinaryIO::lastPerf;
+//   p.size            = size;
+//   p.time            = ioWatch.useconds();
+//   p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
+//   MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
+//       << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
 //   MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
 // }
 //
@@ -72,9 +77,12 @@ using ReaderFn = std::function<void(Field &, const std::string)>;
 //   MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
 //   assert(crcData == crcRead);
 //   size *= vec.Grid()->ProcessorCount();
-//   MSG << "Std I/O read: Read " << size << " bytes in " << ioWatch.Elapsed()
-//       << ", performance " << size/1024./1024./(ioWatch.useconds()/1.e6)
-//       << " MB/s" << std::endl;
+//   auto &p = BinaryIO::lastPerf;
+//   p.size            = size;
+//   p.time            = ioWatch.useconds();
+//   p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
+//   MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed()
+//       << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
 //   MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
 // }
 
@@ -100,9 +108,12 @@ void stdWrite(const std::string filestem, Field &vec)
   file.flush();
   ioWatch.Stop();
   size *= vec.Grid()->ProcessorCount();
-  MSG << "Std I/O write: Wrote " << size << " bytes in " << ioWatch.Elapsed()
-      << ", " << size/1024./1024./(ioWatch.useconds()/1.e6)
-      << " MB/s" << std::endl;
+  auto &p = BinaryIO::lastPerf;
+  p.size            = size;
+  p.time            = ioWatch.useconds();
+  p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
+  MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
+      << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
   MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
 }
 
@@ -135,9 +146,12 @@ void stdRead(Field &vec, const std::string filestem)
   MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
   assert(crcData == crcRead);
   size *= vec.Grid()->ProcessorCount();
-  MSG << "Std I/O read: Read " << size << " bytes in " << ioWatch.Elapsed()
-      << ", " << size/1024./1024./(ioWatch.useconds()/1.e6)
-      << " MB/s" << std::endl;
+  auto &p = BinaryIO::lastPerf;
+  p.size            = size;
+  p.time            = ioWatch.useconds();
+  p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
+  MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed()
+      << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
   MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
 }
 
@@ -200,12 +214,18 @@ void writeBenchmark(const Coordinate &latt, const std::string filename,
   auto simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd());
   std::shared_ptr<GridCartesian> gBasePt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi));
   std::shared_ptr<GridBase>      gPt;
+  std::random_device             rd;
 
   makeGrid(gPt, gBasePt, Ls, rb);
 
-  GridBase *g = gPt.get();
-  GridParallelRNG rng(g);
-  Field vec(g);
+  GridBase        *g = gPt.get();
+  GridParallelRNG rng(g);
+  Field           vec(g);
+
+  rng.SeedFixedIntegers({static_cast<int>(rd()), static_cast<int>(rd()),
+                         static_cast<int>(rd()), static_cast<int>(rd()),
+                         static_cast<int>(rd()), static_cast<int>(rd()),
+                         static_cast<int>(rd()), static_cast<int>(rd())});
   random(rng, vec);
   write(filename, vec);
 
@@ -223,8 +243,8 @@ void readBenchmark(const Coordinate &latt, const std::string filename,
 
   makeGrid(gPt, gBasePt, Ls, rb);
 
-  GridBase *g = gPt.get();
-  Field vec(g);
+  GridBase *g = gPt.get();
+  Field    vec(g);
 
   read(vec, filename);
 }
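
Note on the statistics: for each local volume and I/O function the summary reports the mean and unbiased standard deviation over the BENCH_IO_NPASS passes, plus a "robustness" figure rob = 100% * (1 - std dev / mean). The standalone sketch below is not part of the patch; it only illustrates the single-pass reduction performed by stats() and the robustness computation, assuming Eigen 3 is available. The 2x2 shape and the sample values are hypothetical stand-ins for the (volume x function) performance tables filled by the benchmark.

    // Illustration of the stats() reduction and robustness figure (hypothetical data).
    #include <cassert>
    #include <cstdio>
    #include <vector>
    #include <Eigen/Dense>

    int main(void)
    {
      // three hypothetical benchmark passes of a 2x2 performance table (MB/s)
      std::vector<Eigen::MatrixXd> data(3, Eigen::MatrixXd::Zero(2, 2));

      data[0] << 100., 200., 300., 400.;
      data[1] << 110., 190., 310., 390.;
      data[2] <<  90., 210., 290., 410.;

      double          n     = static_cast<double>(data.size());
      Eigen::MatrixXd mean  = Eigen::MatrixXd::Zero(2, 2);
      Eigen::MatrixXd sqSum = Eigen::MatrixXd::Zero(2, 2);

      assert(n > 1.);
      for (auto &d: data)
      {
        mean  += d;                 // accumulate sum(x)
        sqSum += d.cwiseProduct(d); // accumulate sum(x^2)
      }
      // unbiased sample std dev: sqrt((sum(x^2) - sum(x)^2/n)/(n - 1))
      Eigen::MatrixXd stdDev = ((sqSum - mean.cwiseProduct(mean)/n)/(n - 1.)).cwiseSqrt();

      mean /= n;
      // robustness in %: 100*(1 - stdDev/|mean|); 100% means perfectly reproducible
      Eigen::MatrixXd rob = 100.*(Eigen::MatrixXd::Ones(2, 2)
                                  - stdDev.cwiseQuotient(mean.cwiseAbs()));

      // prints: mean = 100.0 MB/s, std dev = 10.0 MB/s, rob = 90.0%
      std::printf("mean = %.1f MB/s, std dev = %.1f MB/s, rob = %.1f%%\n",
                  mean(0, 0), stdDev(0, 0), rob(0, 0));

      return 0;
    }

Accumulating sum(x) and sum(x^2) lets a single loop over the passes produce both moments, which is why stats() leaves the running sum in mean and divides by n only at the end.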