diff --git a/.gitignore b/.gitignore
index 8478350..8dac3db 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,4 @@
 /.vscode
-build*
 .buildutils
 autom4te.cache
 config.*
@@ -12,4 +11,7 @@ install-sh
 missing
 Makefile.in
 .DS_Store
-*~
\ No newline at end of file
+*~
+/*/env
+/*/build
+/Grid/json.hpp
\ No newline at end of file
diff --git a/Grid/.clang-format b/Grid/.clang-format
new file mode 100644
index 0000000..9d54a25
--- /dev/null
+++ b/Grid/.clang-format
@@ -0,0 +1,14 @@
+{
+  BasedOnStyle: LLVM,
+  UseTab: Never,
+  IndentWidth: 2,
+  TabWidth: 2,
+  BreakBeforeBraces: Allman,
+  AllowShortIfStatementsOnASingleLine: false,
+  IndentCaseLabels: false,
+  ColumnLimit: 90,
+  AccessModifierOffset: -4,
+  NamespaceIndentation: All,
+  FixNamespaceComments: false,
+  SortIncludes: true,
+}
diff --git a/Grid/Benchmark_Grid.cpp b/Grid/Benchmark_Grid.cpp
new file mode 100644
index 0000000..9fd4ced
--- /dev/null
+++ b/Grid/Benchmark_Grid.cpp
@@ -0,0 +1,1054 @@
+/*
+Copyright © 2015 Peter Boyle
+Copyright © 2022 Antonin Portelli
+
+This is a refactoring of Benchmark_ITT.cpp from Grid
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "Common.hpp"
+#include "json.hpp"
+#include <Grid/Grid.h>
+
+using namespace Grid;
+
+std::vector<int> L_list;
+std::vector<int> Ls_list;
+std::vector<double> mflop_list;
+
+double mflop_ref;
+double mflop_ref_err;
+
+int NN_global;
+
+nlohmann::json json_results;
+
+struct time_statistics
+{
+  double mean;
+  double err;
+  double min;
+  double max;
+
+  void statistics(std::vector<double> v)
+  {
+    double sum = std::accumulate(v.begin(), v.end(), 0.0);
+    mean = sum / v.size();
+
+    std::vector<double> diff(v.size());
+    std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
+    double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
+    err = std::sqrt(sq_sum / (v.size() * (v.size() - 1)));
+
+    auto result = std::minmax_element(v.begin(), v.end());
+    min = *result.first;
+    max = *result.second;
+  }
+};
+
+void comms_header()
+{
+  std::cout << GridLogMessage << " L "
+            << "\t"
+            << " Ls "
+            << "\t"
+            << "bytes\t MB/s uni (err/min/max) \t\t MB/s bidi (err/min/max)" << std::endl;
+};
+
+Gamma::Algebra Gmu[] = {Gamma::Algebra::GammaX, Gamma::Algebra::GammaY,
+                        Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT};
+
+struct controls
+{
+  int Opt;
+  int CommsOverlap;
+  Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch;
+};
+
+class Benchmark
+{
+public:
+  static void Decomposition(void)
+  {
+
+    int threads = GridThread::GetThreads();
+    std::cout << GridLogMessage
+              << "======================================================================="
+                 "==========="
+              << std::endl;
+    std::cout << GridLogMessage << "= Grid is setup to use " << threads << " threads"
+              << std::endl;
+    std::cout << GridLogMessage
+              << "======================================================================="
+                 "==========="
+              << std::endl;
+    std::cout << GridLogMessage << "Grid Default Decomposition patterns\n";
+    std::cout << GridLogMessage << "\tOpenMP threads : " << GridThread::GetThreads()
threads : " << GridThread::GetThreads() + << std::endl; + std::cout << GridLogMessage + << "\tMPI tasks : " << GridCmdVectorIntToString(GridDefaultMpi()) + << std::endl; + std::cout << GridLogMessage << "\tvReal : " << sizeof(vReal) * 8 << "bits ; " + << GridCmdVectorIntToString(GridDefaultSimd(4, vReal::Nsimd())) + << std::endl; + std::cout << GridLogMessage << "\tvRealF : " << sizeof(vRealF) * 8 + << "bits ; " + << GridCmdVectorIntToString(GridDefaultSimd(4, vRealF::Nsimd())) + << std::endl; + std::cout << GridLogMessage << "\tvRealD : " << sizeof(vRealD) * 8 + << "bits ; " + << GridCmdVectorIntToString(GridDefaultSimd(4, vRealD::Nsimd())) + << std::endl; + std::cout << GridLogMessage << "\tvComplex : " << sizeof(vComplex) * 8 + << "bits ; " + << GridCmdVectorIntToString(GridDefaultSimd(4, vComplex::Nsimd())) + << std::endl; + std::cout << GridLogMessage << "\tvComplexF : " << sizeof(vComplexF) * 8 + << "bits ; " + << GridCmdVectorIntToString(GridDefaultSimd(4, vComplexF::Nsimd())) + << std::endl; + std::cout << GridLogMessage << "\tvComplexD : " << sizeof(vComplexD) * 8 + << "bits ; " + << GridCmdVectorIntToString(GridDefaultSimd(4, vComplexD::Nsimd())) + << std::endl; + } + + static void Comms(void) + { + int Nloop = 200; + int nmu = 0; + int maxlat = 48; + + Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + + for (int mu = 0; mu < Nd; mu++) + if (mpi_layout[mu] > 1) + nmu++; + + std::vector t_time(Nloop); + time_statistics timestat; + + std::cout << GridLogMessage + << "=======================================================================" + "=============================" + << std::endl; + std::cout << GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in " + << nmu << " dimensions" << std::endl; + std::cout << GridLogMessage + << "=======================================================================" + "=============================" + << std::endl; + grid_printf("%5s %5s %15s %15s %15s %15s %15s\n", "L", "dir", "payload (B)", + "time (usec)", "rate (GB/s)", "std dev", "max"); + + for (int lat = 16; lat <= maxlat; lat += 8) + { + int Ls = 12; + + Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2], + lat * mpi_layout[3]}); + + GridCartesian Grid(latt_size, simd_layout, mpi_layout); + RealD Nrank = Grid._Nprocessors; + RealD Nnode = Grid.NodeCount(); + RealD ppn = Nrank / Nnode; + + std::vector xbuf(8); + std::vector rbuf(8); + uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD); + for (int d = 0; d < 8; d++) + { + xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); + rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); + } + + double dbytes; + + for (int dir = 0; dir < 8; dir++) + { + int mu = dir % 4; + if (mpi_layout[mu] > 1) + { + + std::vector times(Nloop); + for (int i = 0; i < Nloop; i++) + { + + dbytes = 0; + double start = usecond(); + int xmit_to_rank; + int recv_from_rank; + + if (dir == mu) + { + int comm_proc = 1; + Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); + } + else + { + int comm_proc = mpi_layout[mu] - 1; + Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); + } + Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, + (void *)&rbuf[dir][0], recv_from_rank, bytes); + dbytes += bytes; + + double stop = usecond(); + t_time[i] = stop - start; // microseconds + } + timestat.statistics(t_time); + + dbytes = dbytes * ppn; + double bidibytes = 2. 
+          double rate = bidibytes / (timestat.mean / 1.e6) / 1024. / 1024. / 1024.;
+          double rate_err = rate * timestat.err / timestat.mean;
+          double rate_max = rate * timestat.mean / timestat.min;
+          grid_printf("%5d %5d %15d %15.2f %15.2f %15.1f %15.2f\n", lat, dir, bytes,
+                      timestat.mean, rate, rate_err, rate_max);
+          nlohmann::json tmp;
+          tmp["L"] = lat;
+          tmp["dir"] = dir;
+          tmp["bytes"] = bytes;
+          tmp["time"] = timestat.mean;
+          tmp["GB_per_second"] = rate;
+          json_results["comms"].push_back(tmp);
+        }
+      }
+      for (int d = 0; d < 8; d++)
+      {
+        acceleratorFreeDevice(xbuf[d]);
+        acceleratorFreeDevice(rbuf[d]);
+      }
+    }
+    return;
+  }
+
+  static void Memory(void)
+  {
+    const int Nvec = 8;
+    typedef Lattice<iVector<vReal, Nvec>> LatticeVec;
+    typedef iVector<vReal, Nvec> Vec;
+
+    Coordinate simd_layout = GridDefaultSimd(Nd, vReal::Nsimd());
+    Coordinate mpi_layout = GridDefaultMpi();
+
+    std::cout << GridLogMessage
+              << "======================================================================="
+                 "==========="
+              << std::endl;
+    std::cout << GridLogMessage << "= Benchmarking a*x + y bandwidth" << std::endl;
+    std::cout << GridLogMessage
+              << "======================================================================="
+                 "==========="
+              << std::endl;
+    std::cout << GridLogMessage << " L "
+              << "\t\t"
+              << "bytes"
+              << "\t\t\t"
+              << "GB/s"
+              << "\t\t"
+              << "Gflop/s"
+              << "\t\t seconds"
+              << "\t\tGB/s / node" << std::endl;
+    std::cout << GridLogMessage
+              << "----------------------------------------------------------"
+              << std::endl;
+
+    // uint64_t NP;
+    uint64_t NN;
+
+    uint64_t lmax = 32;
+#define NLOOP (1000 * lmax * lmax * lmax * lmax / lat / lat / lat / lat)
+
+    GridSerialRNG sRNG;
+    sRNG.SeedFixedIntegers(std::vector<int>({45, 12, 81, 9}));
+    for (int lat = 8; lat <= lmax; lat += 8)
+    {
+
+      Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
+                            lat * mpi_layout[3]});
+      int64_t vol = latt_size[0] * latt_size[1] * latt_size[2] * latt_size[3];
+
+      GridCartesian Grid(latt_size, simd_layout, mpi_layout);
+
+      // NP= Grid.RankCount();
+      NN = Grid.NodeCount();
+
+      Vec rn;
+      random(sRNG, rn);
+
+      LatticeVec z(&Grid);
+      z = Zero();
+      LatticeVec x(&Grid);
+      x = Zero();
+      LatticeVec y(&Grid);
+      y = Zero();
+      double a = 2.0;
+
+      uint64_t Nloop = NLOOP;
+
+      double start = usecond();
+      for (int i = 0; i < Nloop; i++)
+      {
+        z = a * x - y;
+      }
+      double stop = usecond();
+      double time = (stop - start) / Nloop * 1000;
+
+      double flops = vol * Nvec * 2; // mul,add
+      double bytes = 3.0 * vol * Nvec * sizeof(Real);
+      std::cout << GridLogMessage << std::setprecision(3) << lat << "\t\t" << bytes
+                << " \t\t" << bytes / time << "\t\t" << flops / time << "\t\t"
+                << (stop - start) / 1000. / 1000. << "\t\t" << bytes / time / NN
+                << std::endl;
<< "\t\t" << bytes / time / NN + << std::endl; + + nlohmann::json tmp; + tmp["L"] = lat; + tmp["bytes"] = bytes; + tmp["gflops"] = flops / time; + tmp["GB_per_second"] = bytes / time; + json_results["axpy"].push_back(tmp); + } + }; + + static void SU4(void) + { + const int Nc4 = 4; + typedef Lattice> LatticeSU4; + + Coordinate simd_layout = GridDefaultSimd(Nd, vComplexF::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; + std::cout << GridLogMessage << "= Benchmarking z = y*x SU(4) bandwidth" << std::endl; + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; + std::cout << GridLogMessage << " L " + << "\t\t" + << "bytes" + << "\t\t\t" + << "GB/s" + << "\t\t" + << "Gflop/s" + << "\t\t seconds" + << "\t\tGB/s / node" << std::endl; + std::cout << GridLogMessage + << "----------------------------------------------------------" + << std::endl; + + uint64_t NN; + + uint64_t lmax = 32; + + GridSerialRNG sRNG; + sRNG.SeedFixedIntegers(std::vector({45, 12, 81, 9})); + for (int lat = 8; lat <= lmax; lat += 8) + { + + Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2], + lat * mpi_layout[3]}); + int64_t vol = latt_size[0] * latt_size[1] * latt_size[2] * latt_size[3]; + + GridCartesian Grid(latt_size, simd_layout, mpi_layout); + + NN = Grid.NodeCount(); + + LatticeSU4 z(&Grid); + z = Zero(); + LatticeSU4 x(&Grid); + x = Zero(); + LatticeSU4 y(&Grid); + y = Zero(); + // double a=2.0; + + uint64_t Nloop = NLOOP; + + double start = usecond(); + for (int i = 0; i < Nloop; i++) + { + z = x * y; + } + double stop = usecond(); + double time = (stop - start) / Nloop * 1000; + + double flops = vol * Nc4 * Nc4 * (6 + (Nc4 - 1) * 8); // mul,add + double bytes = 3.0 * vol * Nc4 * Nc4 * 2 * sizeof(RealF); + std::cout << GridLogMessage << std::setprecision(3) << lat << "\t\t" << bytes + << " \t\t" << bytes / time << "\t\t" << flops / time << "\t\t" + << (stop - start) / 1000. / 1000. 
<< "\t\t" << bytes / time / NN + << std::endl; + + nlohmann::json tmp; + tmp["L"] = lat; + tmp["bytes"] = bytes; + tmp["GB_per_second"] = bytes / time; + tmp["gflops"] = flops / time; + json_results["SU4"].push_back(tmp); + } + }; + + static double DWF(int Ls, int L) + { + RealD mass = 0.1; + RealD M5 = 1.8; + + double mflops; + double mflops_best = 0; + double mflops_worst = 0; + std::vector mflops_all; + + /////////////////////////////////////////////////////// + // Set/Get the layout & grid size + /////////////////////////////////////////////////////// + int threads = GridThread::GetThreads(); + Coordinate mpi = GridDefaultMpi(); + assert(mpi.size() == 4); + Coordinate local({L, L, L, L}); + Coordinate latt4( + {local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]}); + + GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid( + latt4, GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi()); + uint64_t NP = TmpGrid->RankCount(); + uint64_t NN = TmpGrid->NodeCount(); + NN_global = NN; + uint64_t SHM = NP / NN; + + ///////// Welcome message //////////// + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; + std::cout << GridLogMessage << "Benchmark DWF on " << L << "^4 local volume " + << std::endl; + std::cout << GridLogMessage << "* Nc : " << Nc << std::endl; + std::cout << GridLogMessage + << "* Global volume : " << GridCmdVectorIntToString(latt4) << std::endl; + std::cout << GridLogMessage << "* Ls : " << Ls << std::endl; + std::cout << GridLogMessage << "* ranks : " << NP << std::endl; + std::cout << GridLogMessage << "* nodes : " << NN << std::endl; + std::cout << GridLogMessage << "* ranks/node : " << SHM << std::endl; + std::cout << GridLogMessage << "* ranks geom : " << GridCmdVectorIntToString(mpi) + << std::endl; + std::cout << GridLogMessage << "* Using " << threads << " threads" << std::endl; + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; + + ///////// Lattice Init //////////// + GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid( + latt4, GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi()); + GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid); + GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid); + + ///////// RNG Init //////////// + std::vector seeds4({1, 2, 3, 4}); + std::vector seeds5({5, 6, 7, 8}); + GridParallelRNG RNG4(UGrid); + RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG RNG5(FGrid); + RNG5.SeedFixedIntegers(seeds5); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + typedef DomainWallFermionF Action; + typedef typename Action::FermionField Fermion; + typedef LatticeGaugeFieldF Gauge; + + ///////// Source preparation //////////// + Gauge Umu(UGrid); + SU::HotConfiguration(RNG4, Umu); + Fermion src(FGrid); + random(RNG5, src); + Fermion src_e(FrbGrid); + Fermion src_o(FrbGrid); + Fermion r_e(FrbGrid); + Fermion r_o(FrbGrid); + Fermion r_eo(FGrid); + Action Dw(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5); + + { + + pickCheckerboard(Even, src_e, src); + pickCheckerboard(Odd, src_o, src); + + const int num_cases = 4; + std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S "); + + controls Cases[] = { + {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsThenCompute, + 
+          {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsAndCompute,
+           CartesianCommunicator::CommunicatorPolicyConcurrent},
+          {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsThenCompute,
+           CartesianCommunicator::CommunicatorPolicySequential},
+          {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsAndCompute,
+           CartesianCommunicator::CommunicatorPolicySequential}};
+
+      for (int c = 0; c < num_cases; c++)
+      {
+
+        WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
+        WilsonKernelsStatic::Opt = Cases[c].Opt;
+        CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
+
+        std::cout << GridLogMessage
+                  << "==================================================================="
+                     "==============="
+                  << std::endl;
+        if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric)
+          std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl;
+        if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
+          std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl;
+        if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
+          std::cout << GridLogMessage << "* Using sequential Comms/Compute" << std::endl;
+        std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
+        std::cout << GridLogMessage
+                  << "==================================================================="
+                     "==============="
+                  << std::endl;
+
+        int nwarm = 10;
+        double t0 = usecond();
+        FGrid->Barrier();
+        for (int i = 0; i < nwarm; i++)
+        {
+          Dw.DhopEO(src_o, r_e, DaggerNo);
+        }
+        FGrid->Barrier();
+        double t1 = usecond();
+        uint64_t ncall = 500;
+
+        FGrid->Broadcast(0, &ncall, sizeof(ncall));
+
+        // std::cout << GridLogMessage << " Estimate " << ncall << " calls per
+        // second"<<std::endl;
+
+        time_statistics timestat;
+        std::vector<double> t_time(ncall);
+        for (uint64_t i = 0; i < ncall; i++)
+        {
+          t0 = usecond();
+          Dw.DhopEO(src_o, r_e, DaggerNo);
+          t1 = usecond();
+          t_time[i] = t1 - t0;
+        }
+        FGrid->Barrier();
+
+        double volume = Ls;
+        for (int mu = 0; mu < Nd; mu++)
+          volume = volume * latt4[mu];
+
+        // Nc=3 gives
+        // 1344= 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8
+        // 1344 = Nc* (6+(Nc-1)*8)*2*Nd + Nd*Nc*2*2 + Nd*Nc*Ns*2
+        // double flops=(1344.0*volume)/2;
+#if 0
+        double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + Nd*Nc*Ns + Nd*Nc*Ns*2;
+#else
+        double fps =
+            Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2;
+#endif
+        double flops = (fps * volume) / 2;
+        double mf_hi, mf_lo, mf_err;
+
+        timestat.statistics(t_time);
+        mf_hi = flops / timestat.min;
+        mf_lo = flops / timestat.max;
+        mf_err = flops / timestat.min * timestat.err / timestat.mean;
+
+        mflops = flops / timestat.mean;
+        mflops_all.push_back(mflops);
+        if (mflops_best == 0)
+          mflops_best = mflops;
+        if (mflops_worst == 0)
+          mflops_worst = mflops;
+        if (mflops > mflops_best)
+          mflops_best = mflops;
+        if (mflops < mflops_worst)
+          mflops_worst = mflops;
+
+        std::cout << GridLogMessage << "Deo FlopsPerSite is " << fps << std::endl;
+        std::cout << GridLogMessage << std::fixed << std::setprecision(1)
+                  << "Deo mflop/s = " << mflops << " (" << mf_err << ") " << mf_lo
+                  << "-" << mf_hi << std::endl;
+        std::cout << GridLogMessage << std::fixed << std::setprecision(1)
+                  << "Deo mflop/s per rank " << mflops / NP << std::endl;
+        std::cout << GridLogMessage << std::fixed << std::setprecision(1)
+                  << "Deo mflop/s per node " << mflops / NN << std::endl;
+      }
+
+      std::cout << GridLogMessage
+                << "====================================================================="
+                   "============="
+ << std::endl; + std::cout << GridLogMessage << L << "^4 x " << Ls + << " Deo Best mflop/s = " << mflops_best << " ; " + << mflops_best / NN << " per node " << std::endl; + std::cout << GridLogMessage << L << "^4 x " << Ls + << " Deo Worst mflop/s = " << mflops_worst << " ; " + << mflops_worst / NN << " per node " << std::endl; + std::cout << GridLogMessage << fmt << std::endl; + std::cout << GridLogMessage; + + for (int i = 0; i < mflops_all.size(); i++) + { + std::cout << mflops_all[i] / NN << " ; "; + } + std::cout << std::endl; + std::cout << GridLogMessage + << "=====================================================================" + "=============" + << std::endl; + } + return mflops_best; + } + + static double Staggered(int L) + { + double mflops; + double mflops_best = 0; + double mflops_worst = 0; + std::vector mflops_all; + + /////////////////////////////////////////////////////// + // Set/Get the layout & grid size + /////////////////////////////////////////////////////// + int threads = GridThread::GetThreads(); + Coordinate mpi = GridDefaultMpi(); + assert(mpi.size() == 4); + Coordinate local({L, L, L, L}); + Coordinate latt4( + {local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]}); + + GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid( + latt4, GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi()); + uint64_t NP = TmpGrid->RankCount(); + uint64_t NN = TmpGrid->NodeCount(); + NN_global = NN; + uint64_t SHM = NP / NN; + + ///////// Welcome message //////////// + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; + std::cout << GridLogMessage << "Benchmark ImprovedStaggered on " << L + << "^4 local volume " << std::endl; + std::cout << GridLogMessage + << "* Global volume : " << GridCmdVectorIntToString(latt4) << std::endl; + std::cout << GridLogMessage << "* ranks : " << NP << std::endl; + std::cout << GridLogMessage << "* nodes : " << NN << std::endl; + std::cout << GridLogMessage << "* ranks/node : " << SHM << std::endl; + std::cout << GridLogMessage << "* ranks geom : " << GridCmdVectorIntToString(mpi) + << std::endl; + std::cout << GridLogMessage << "* Using " << threads << " threads" << std::endl; + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; + + ///////// Lattice Init //////////// + GridCartesian *FGrid = SpaceTimeGrid::makeFourDimGrid( + latt4, GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi()); + GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid); + + ///////// RNG Init //////////// + std::vector seeds4({1, 2, 3, 4}); + GridParallelRNG RNG4(FGrid); + RNG4.SeedFixedIntegers(seeds4); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + RealD mass = 0.1; + RealD c1 = 9.0 / 8.0; + RealD c2 = -1.0 / 24.0; + RealD u0 = 1.0; + + typedef ImprovedStaggeredFermionF Action; + typedef typename Action::FermionField Fermion; + typedef LatticeGaugeFieldF Gauge; + + Gauge Umu(FGrid); + SU::HotConfiguration(RNG4, Umu); + + typename Action::ImplParams params; + Action Ds(Umu, Umu, *FGrid, *FrbGrid, mass, c1, c2, u0, params); + + ///////// Source preparation //////////// + Fermion src(FGrid); + random(RNG4, src); + Fermion src_e(FrbGrid); + Fermion src_o(FrbGrid); + Fermion r_e(FrbGrid); + Fermion r_o(FrbGrid); + Fermion r_eo(FGrid); + + { + + pickCheckerboard(Even, src_e, src); + pickCheckerboard(Odd, src_o, src); + + 
const int num_cases = 4; + std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S "); + + controls Cases[] = { + {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsThenCompute, + CartesianCommunicator::CommunicatorPolicyConcurrent}, + {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsAndCompute, + CartesianCommunicator::CommunicatorPolicyConcurrent}, + {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsThenCompute, + CartesianCommunicator::CommunicatorPolicySequential}, + {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsAndCompute, + CartesianCommunicator::CommunicatorPolicySequential}}; + + for (int c = 0; c < num_cases; c++) + { + + StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap; + StaggeredKernelsStatic::Opt = Cases[c].Opt; + CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch); + + std::cout << GridLogMessage + << "===================================================================" + "===============" + << std::endl; + if (StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric) + std::cout << GridLogMessage << "* Using GENERIC Nc StaggeredKernels" + << std::endl; + if (StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute) + std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl; + if (StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsThenCompute) + std::cout << GridLogMessage << "* Using sequential Comms/Compute" << std::endl; + std::cout << GridLogMessage << "* SINGLE precision " << std::endl; + std::cout << GridLogMessage + << "===================================================================" + "===============" + << std::endl; + + int nwarm = 10; + double t0 = usecond(); + FGrid->Barrier(); + for (int i = 0; i < nwarm; i++) + { + Ds.DhopEO(src_o, r_e, DaggerNo); + } + FGrid->Barrier(); + double t1 = usecond(); + uint64_t ncall = 500; + + FGrid->Broadcast(0, &ncall, sizeof(ncall)); + + // std::cout << GridLogMessage << " Estimate " << ncall << " calls per + // second"< t_time(ncall); + for (uint64_t i = 0; i < ncall; i++) + { + t0 = usecond(); + Ds.DhopEO(src_o, r_e, DaggerNo); + t1 = usecond(); + t_time[i] = t1 - t0; + } + FGrid->Barrier(); + + double volume = 1; + for (int mu = 0; mu < Nd; mu++) + volume = volume * latt4[mu]; + double flops = (1146.0 * volume) / 2; + double mf_hi, mf_lo, mf_err; + + timestat.statistics(t_time); + mf_hi = flops / timestat.min; + mf_lo = flops / timestat.max; + mf_err = flops / timestat.min * timestat.err / timestat.mean; + + mflops = flops / timestat.mean; + mflops_all.push_back(mflops); + if (mflops_best == 0) + mflops_best = mflops; + if (mflops_worst == 0) + mflops_worst = mflops; + if (mflops > mflops_best) + mflops_best = mflops; + if (mflops < mflops_worst) + mflops_worst = mflops; + + std::cout << GridLogMessage << std::fixed << std::setprecision(1) + << "Deo mflop/s = " << mflops << " (" << mf_err << ") " << mf_lo + << "-" << mf_hi << std::endl; + std::cout << GridLogMessage << std::fixed << std::setprecision(1) + << "Deo mflop/s per rank " << mflops / NP << std::endl; + std::cout << GridLogMessage << std::fixed << std::setprecision(1) + << "Deo mflop/s per node " << mflops / NN << std::endl; + } + + std::cout << GridLogMessage + << "=====================================================================" + "=============" + << std::endl; + std::cout << GridLogMessage << L + << "^4 Deo Best mflop/s = " << mflops_best << " ; " + << mflops_best / NN << " per node " << std::endl; + std::cout << 
GridLogMessage << L + << "^4 Deo Worst mflop/s = " << mflops_worst << " ; " + << mflops_worst / NN << " per node " << std::endl; + std::cout << GridLogMessage << fmt << std::endl; + std::cout << GridLogMessage; + + for (int i = 0; i < mflops_all.size(); i++) + { + std::cout << mflops_all[i] / NN << " ; "; + } + std::cout << std::endl; + } + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; + return mflops_best; + } +}; + +int main(int argc, char **argv) +{ + Grid_init(&argc, &argv); + + std::string json_filename = ""; // empty indicates no json output + for (int i = 0; i < argc; i++) + { + if (std::string(argv[i]) == "--json-out") + json_filename = argv[i + 1]; + } + + CartesianCommunicator::SetCommunicatorPolicy( + CartesianCommunicator::CommunicatorPolicySequential); +#ifdef KNL + LebesgueOrder::Block = std::vector({8, 2, 2, 2}); +#else + LebesgueOrder::Block = std::vector({2, 2, 2, 2}); +#endif + Benchmark::Decomposition(); + + int do_su4 = 1; + int do_memory = 1; + int do_comms = 1; + int do_flops = 1; + int Ls = 1; + + int sel = 4; + std::vector L_list({8, 12, 16, 24, 32}); + int selm1 = sel - 1; + + std::vector wilson; + std::vector dwf4; + std::vector staggered; + + if (do_flops) + { + Ls = 1; + std::cout + << GridLogMessage + << "=========================================================================" + "=========" + << std::endl; + std::cout << GridLogMessage << " Wilson dslash 4D vectorised" << std::endl; + std::cout + << GridLogMessage + << "=========================================================================" + "=========" + << std::endl; + for (int l = 0; l < L_list.size(); l++) + { + wilson.push_back(Benchmark::DWF(Ls, L_list[l])); + } + + Ls = 12; + std::cout + << GridLogMessage + << "=========================================================================" + "=========" + << std::endl; + std::cout << GridLogMessage << " Domain wall dslash 4D vectorised" << std::endl; + std::cout + << GridLogMessage + << "=========================================================================" + "=========" + << std::endl; + for (int l = 0; l < L_list.size(); l++) + { + double result = Benchmark::DWF(Ls, L_list[l]); + dwf4.push_back(result); + } + + std::cout + << GridLogMessage + << "=========================================================================" + "=========" + << std::endl; + std::cout << GridLogMessage << " Improved Staggered dslash 4D vectorised" + << std::endl; + std::cout + << GridLogMessage + << "=========================================================================" + "=========" + << std::endl; + for (int l = 0; l < L_list.size(); l++) + { + double result = Benchmark::Staggered(L_list[l]); + staggered.push_back(result); + } + + std::cout + << GridLogMessage + << "=========================================================================" + "=========" + << std::endl; + std::cout << GridLogMessage << " Summary table Ls=" << Ls << std::endl; + std::cout + << GridLogMessage + << "=========================================================================" + "=========" + << std::endl; + std::cout << GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\t Staggered" << std::endl; + for (int l = 0; l < L_list.size(); l++) + { + std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] << " \t\t " + << dwf4[l] << " \t\t " << staggered[l] << std::endl; + nlohmann::json tmp; + tmp["L"] = L_list[l]; + tmp["mflops_wilson"] = wilson[l]; + tmp["mflops_dwf4"] = dwf4[l]; + 
tmp["mflops_staggered"] = staggered[l]; + json_results["flops"].push_back(tmp); + } + std::cout + << GridLogMessage + << "=========================================================================" + "=========" + << std::endl; + } + + int NN = NN_global; + if (do_memory) + { + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; + std::cout << GridLogMessage << " Memory benchmark " << std::endl; + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; + Benchmark::Memory(); + } + + if (do_su4) + { + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; + std::cout << GridLogMessage << " SU(4) benchmark " << std::endl; + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; + Benchmark::SU4(); + } + + if (do_comms) + { + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; + std::cout << GridLogMessage << " Communications benchmark " << std::endl; + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; + Benchmark::Comms(); + } + + if (do_flops) + { + std::cout + << GridLogMessage + << "=========================================================================" + "=========" + << std::endl; + std::cout << GridLogMessage << " Per Node Summary table Ls=" << Ls << std::endl; + std::cout + << GridLogMessage + << "=========================================================================" + "=========" + << std::endl; + std::cout << GridLogMessage << " L \t\t Wilson\t\t DWF4\t\t Staggered " << std::endl; + for (int l = 0; l < L_list.size(); l++) + { + std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] / NN << " \t " + << dwf4[l] / NN << " \t " << staggered[l] / NN << std::endl; + } + std::cout + << GridLogMessage + << "=========================================================================" + "=========" + << std::endl; + + std::cout + << GridLogMessage + << "=========================================================================" + "=========" + << std::endl; + std::cout << GridLogMessage + << " Comparison point result: " << 0.5 * (dwf4[sel] + dwf4[selm1]) / NN + << " Mflop/s per node" << std::endl; + std::cout << GridLogMessage << " Comparison point is 0.5*(" << dwf4[sel] / NN << "+" + << dwf4[selm1] / NN << ") " << std::endl; + std::cout << std::setprecision(3); + std::cout + << GridLogMessage + << "=========================================================================" + "=========" + << std::endl; + } + + if (!json_filename.empty()) + { + std::cout << GridLogMessage << "writing benchmark results to " << json_filename + << std::endl; + + int me = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &me); + if (me == 0) + { + std::ofstream json_file(json_filename); + json_file << std::setw(4) << json_results; + } + } + + Grid_finalize(); +} diff --git a/Grid/Benchmark_IO.cpp b/Grid/Benchmark_IO.cpp index da6b78f..96ef3e3 100644 --- a/Grid/Benchmark_IO.cpp +++ b/Grid/Benchmark_IO.cpp @@ -32,23 +32,13 @@ along with this program. If not, see . 
#ifdef HAVE_LIME using namespace Grid; -std::string filestem(const int l) -{ - return "iobench_l" + std::to_string(l); -} +std::string filestem(const int l) { return "iobench_l" + std::to_string(l); } -int vol(const int i) -{ - return BENCH_IO_LMIN + 2 * i; -} +int vol(const int i) { return BENCH_IO_LMIN + 2 * i; } -int volInd(const int l) -{ - return (l - BENCH_IO_LMIN) / 2; -} +int volInd(const int l) { return (l - BENCH_IO_LMIN) / 2; } -template -void stats(Mat &mean, Mat &stdDev, const std::vector &data) +template void stats(Mat &mean, Mat &stdDev, const std::vector &data) { auto nr = data[0].rows(), nc = data[0].cols(); Eigen::MatrixXd sqSum(nr, nc); @@ -66,11 +56,11 @@ void stats(Mat &mean, Mat &stdDev, const std::vector &data) mean /= n; } -#define grid_printf(...) \ - { \ - char _buf[1024]; \ - sprintf(_buf, __VA_ARGS__); \ - MSG << _buf; \ +#define grid_printf(...) \ + { \ + char _buf[1024]; \ + sprintf(_buf, __VA_ARGS__); \ + MSG << _buf; \ } enum @@ -173,47 +163,49 @@ int main(int argc, char **argv) MSG << "SUMMARY" << std::endl; MSG << BIGSEP << std::endl; MSG << "Summary of individual results (all results in MB/s)." << std::endl; - MSG << "Every second colum gives the standard deviation of the previous column." << std::endl; + MSG << "Every second colum gives the standard deviation of the previous column." + << std::endl; MSG << std::endl; - grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n", - "L", "std read", "std dev", "std write", "std dev", - "Grid read", "std dev", "Grid write", "std dev"); + grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n", "L", "std read", "std dev", + "std write", "std dev", "Grid read", "std dev", "Grid write", "std dev"); for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) { - grid_printf("%4d %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", - l, mean(volInd(l), sRead), stdDev(volInd(l), sRead), - mean(volInd(l), sWrite), stdDev(volInd(l), sWrite), - mean(volInd(l), gRead), stdDev(volInd(l), gRead), - mean(volInd(l), gWrite), stdDev(volInd(l), gWrite)); + grid_printf("%4d %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", l, + mean(volInd(l), sRead), stdDev(volInd(l), sRead), mean(volInd(l), sWrite), + stdDev(volInd(l), sWrite), mean(volInd(l), gRead), + stdDev(volInd(l), gRead), mean(volInd(l), gWrite), + stdDev(volInd(l), gWrite)); } MSG << std::endl; - MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)" << std::endl; + MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)" + << std::endl; MSG << std::endl; - grid_printf("%4s %12s %12s %12s %12s\n", - "L", "std read", "std write", "Grid read", "Grid write"); + grid_printf("%4s %12s %12s %12s %12s\n", "L", "std read", "std write", "Grid read", + "Grid write"); for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) { - grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n", - l, rob(volInd(l), sRead), rob(volInd(l), sWrite), - rob(volInd(l), gRead), rob(volInd(l), gWrite)); + grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n", l, rob(volInd(l), sRead), + rob(volInd(l), sWrite), rob(volInd(l), gRead), rob(volInd(l), gWrite)); } MSG << std::endl; - MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX << "^4 (all results in MB/s)." << std::endl; - MSG << "Every second colum gives the standard deviation of the previous column." << std::endl; + MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX + << "^4 (all results in MB/s)." 
<< std::endl; + MSG << "Every second colum gives the standard deviation of the previous column." + << std::endl; MSG << std::endl; - grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n", - "std read", "std dev", "std write", "std dev", - "Grid read", "std dev", "Grid write", "std dev"); - grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", - avMean(sRead), avStdDev(sRead), avMean(sWrite), avStdDev(sWrite), - avMean(gRead), avStdDev(gRead), avMean(gWrite), avStdDev(gWrite)); + grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n", "std read", "std dev", + "std write", "std dev", "Grid read", "std dev", "Grid write", "std dev"); + grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", avMean(sRead), + avStdDev(sRead), avMean(sWrite), avStdDev(sWrite), avMean(gRead), + avStdDev(gRead), avMean(gWrite), avStdDev(gWrite)); MSG << std::endl; - MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)" << std::endl; + MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)" + << std::endl; MSG << std::endl; - grid_printf("%12s %12s %12s %12s\n", - "std read", "std write", "Grid read", "Grid write"); - grid_printf("%12.1f %12.1f %12.1f %12.1f\n", - avRob(sRead), avRob(sWrite), avRob(gRead), avRob(gWrite)); + grid_printf("%12s %12s %12s %12s\n", "std read", "std write", "Grid read", + "Grid write"); + grid_printf("%12.1f %12.1f %12.1f %12.1f\n", avRob(sRead), avRob(sWrite), avRob(gRead), + avRob(gWrite)); Grid_finalize(); diff --git a/Grid/Benchmark_IO.hpp b/Grid/Benchmark_IO.hpp index 24e2214..d71e943 100644 --- a/Grid/Benchmark_IO.hpp +++ b/Grid/Benchmark_IO.hpp @@ -20,9 +20,9 @@ along with this program. If not, see . #include #define MSG std::cout << GridLogMessage -#define SEP \ +#define SEP \ "-----------------------------------------------------------------------------" -#define BIGSEP \ +#define BIGSEP \ "=============================================================================" #ifdef HAVE_LIME @@ -36,16 +36,15 @@ namespace Grid // AP 06/10/2020: Standard C version in case one is suspicious of the C++ API // - // template - // void stdWrite(const std::string filestem, Field &vec) + // template void stdWrite(const std::string filestem, Field &vec) // { - // std::string rankStr = std::to_string(vec.Grid()->ThisRank()); - // std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "wb"); - // size_t size; - // uint32_t crc; + // std::string rankStr = std::to_string(vec.Grid()->ThisRank()); + // std::FILE *file = std::fopen((filestem + "." 
+ rankStr + ".bin").c_str(), "wb"); + // size_t size; + // uint32_t crc; // GridStopWatch ioWatch, crcWatch; - // size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object); + // size = vec.Grid()->lSites() * sizeof(typename Field::scalar_object); // autoView(vec_v, vec, CpuRead); // crcWatch.Start(); // crc = GridChecksum::crc32(vec_v.cpu_ptr, size); @@ -53,36 +52,39 @@ namespace Grid // crcWatch.Stop(); // MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl; // ioWatch.Start(); - // std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file); + // std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), + // vec.Grid()->lSites(), file); // ioWatch.Stop(); // std::fclose(file); // size *= vec.Grid()->ProcessorCount(); // auto &p = BinaryIO::lastPerf; - // p.size = size; - // p.time = ioWatch.useconds(); - // p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6); + // p.size = size; + // p.time = ioWatch.useconds(); + // p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6); // MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() - // << ", " << p.mbytesPerSecond << " MB/s" << std::endl; + // << ", + // " + // << p.mbytesPerSecond << " MB/s" << std::endl; // MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl; // } - // - // template - // void stdRead(Field &vec, const std::string filestem) + + // template void stdRead(Field &vec, const std::string filestem) // { - // std::string rankStr = std::to_string(vec.Grid()->ThisRank()); - // std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "rb"); - // size_t size; - // uint32_t crcRead, crcData; + // std::string rankStr = std::to_string(vec.Grid()->ThisRank()); + // std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "rb"); + // size_t size; + // uint32_t crcRead, crcData; // GridStopWatch ioWatch, crcWatch; - // size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object); + // size = vec.Grid()->lSites() * sizeof(typename Field::scalar_object); // crcWatch.Start(); // std::fread(&crcRead, sizeof(uint32_t), 1, file); // crcWatch.Stop(); // { // autoView(vec_v, vec, CpuWrite); // ioWatch.Start(); - // std::fread(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file); + // std::fread(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), + // vec.Grid()->lSites(), file); // ioWatch.Stop(); // std::fclose(file); // } @@ -96,19 +98,19 @@ namespace Grid // assert(crcData == crcRead); // size *= vec.Grid()->ProcessorCount(); // auto &p = BinaryIO::lastPerf; - // p.size = size; - // p.time = ioWatch.useconds(); - // p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6); - // MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() - // << ", " << p.mbytesPerSecond << " MB/s" << std::endl; + // p.size = size; + // p.time = ioWatch.useconds(); + // p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6); + // MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() << ", " + // << p.mbytesPerSecond << " MB/s" << std::endl; // MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl; // } - template - void stdWrite(const std::string filestem, Field &vec) + template void stdWrite(const std::string filestem, Field &vec) { std::string rankStr = std::to_string(vec.Grid()->ThisRank()); - std::ofstream file(filestem + "." 
+ rankStr + ".bin", std::ios::out | std::ios::binary); + std::ofstream file(filestem + "." + rankStr + ".bin", + std::ios::out | std::ios::binary); size_t size, sizec; uint32_t crc; GridStopWatch ioWatch, crcWatch; @@ -130,16 +132,16 @@ namespace Grid p.size = size; p.time = ioWatch.useconds(); p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6); - MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() - << ", " << p.mbytesPerSecond << " MB/s" << std::endl; + MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() << ", " + << p.mbytesPerSecond << " MB/s" << std::endl; MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl; } - template - void stdRead(Field &vec, const std::string filestem) + template void stdRead(Field &vec, const std::string filestem) { std::string rankStr = std::to_string(vec.Grid()->ThisRank()); - std::ifstream file(filestem + "." + rankStr + ".bin", std::ios::in | std::ios::binary); + std::ifstream file(filestem + "." + rankStr + ".bin", + std::ios::in | std::ios::binary); size_t size, sizec; uint32_t crcRead, crcData; GridStopWatch ioWatch, crcWatch; @@ -168,13 +170,12 @@ namespace Grid p.size = size; p.time = ioWatch.useconds(); p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6); - MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() - << ", " << p.mbytesPerSecond << " MB/s" << std::endl; + MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() << ", " + << p.mbytesPerSecond << " MB/s" << std::endl; MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl; } - template - void limeWrite(const std::string filestem, Field &vec) + template void limeWrite(const std::string filestem, Field &vec) { emptyUserRecord record; ScidacWriter binWriter(vec.Grid()->IsBoss()); @@ -184,8 +185,7 @@ namespace Grid binWriter.close(); } - template - void limeRead(Field &vec, const std::string filestem) + template void limeRead(Field &vec, const std::string filestem) { emptyUserRecord record; ScidacReader binReader; @@ -225,12 +225,13 @@ namespace Grid template void writeBenchmark(const Coordinate &latt, const std::string filename, - const WriterFn &write, - const unsigned int Ls = 1, const bool rb = false) + const WriterFn &write, const unsigned int Ls = 1, + const bool rb = false) { auto mpi = GridDefaultMpi(); auto simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd()); - std::shared_ptr gBasePt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi)); + std::shared_ptr gBasePt( + SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi)); std::shared_ptr gPt; std::random_device rd; @@ -251,12 +252,13 @@ namespace Grid template void readBenchmark(const Coordinate &latt, const std::string filename, - const ReaderFn &read, - const unsigned int Ls = 1, const bool rb = false) + const ReaderFn &read, const unsigned int Ls = 1, + const bool rb = false) { auto mpi = GridDefaultMpi(); auto simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd()); - std::shared_ptr gBasePt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi)); + std::shared_ptr gBasePt( + SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi)); std::shared_ptr gPt; makeGrid(gPt, gBasePt, Ls, rb); diff --git a/Grid/Benchmark_ITT.cpp b/Grid/Benchmark_ITT.cpp deleted file mode 100644 index e618aff..0000000 --- a/Grid/Benchmark_ITT.cpp +++ /dev/null @@ -1,801 +0,0 @@ -/* -Copyright © 2015 Peter Boyle -Copyright © 2022 Antonin Portelli - -This program is free software; you can 
redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . -*/ - -#include - -using namespace Grid; - -std::vector L_list; -std::vector Ls_list; -std::vector mflop_list; - -double mflop_ref; -double mflop_ref_err; - -int NN_global; - -struct time_statistics -{ - double mean; - double err; - double min; - double max; - - void statistics(std::vector v) - { - double sum = std::accumulate(v.begin(), v.end(), 0.0); - mean = sum / v.size(); - - std::vector diff(v.size()); - std::transform(v.begin(), v.end(), diff.begin(), [=](double x) - { return x - mean; }); - double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0); - err = std::sqrt(sq_sum / (v.size() * (v.size() - 1))); - - auto result = std::minmax_element(v.begin(), v.end()); - min = *result.first; - max = *result.second; - } -}; - -void comms_header() -{ - std::cout << GridLogMessage << " L " - << "\t" - << " Ls " - << "\t" - << "bytes\t MB/s uni (err/min/max) \t\t MB/s bidi (err/min/max)" << std::endl; -}; - -Gamma::Algebra Gmu[] = { - Gamma::Algebra::GammaX, - Gamma::Algebra::GammaY, - Gamma::Algebra::GammaZ, - Gamma::Algebra::GammaT}; -struct controls -{ - int Opt; - int CommsOverlap; - Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch; -}; - -class Benchmark -{ -public: - static void Decomposition(void) - { - - int threads = GridThread::GetThreads(); - std::cout << GridLogMessage << "==================================================================================" << std::endl; - std::cout << GridLogMessage << "= Grid is setup to use " << threads << " threads" << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; - std::cout << GridLogMessage << "Grid Default Decomposition patterns\n"; - std::cout << GridLogMessage << "\tOpenMP threads : " << GridThread::GetThreads() << std::endl; - std::cout << GridLogMessage << "\tMPI tasks : " << GridCmdVectorIntToString(GridDefaultMpi()) << std::endl; - std::cout << GridLogMessage << "\tvReal : " << sizeof(vReal) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vReal::Nsimd())) << std::endl; - std::cout << GridLogMessage << "\tvRealF : " << sizeof(vRealF) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vRealF::Nsimd())) << std::endl; - std::cout << GridLogMessage << "\tvRealD : " << sizeof(vRealD) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vRealD::Nsimd())) << std::endl; - std::cout << GridLogMessage << "\tvComplex : " << sizeof(vComplex) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vComplex::Nsimd())) << std::endl; - std::cout << GridLogMessage << "\tvComplexF : " << sizeof(vComplexF) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vComplexF::Nsimd())) << std::endl; - std::cout << GridLogMessage << "\tvComplexD : " << sizeof(vComplexD) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vComplexD::Nsimd())) << std::endl; - std::cout << GridLogMessage << 
"==================================================================================" << std::endl; - } - - static void Comms(void) - { - int Nloop = 200; - int nmu = 0; - int maxlat = 32; - - Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd()); - Coordinate mpi_layout = GridDefaultMpi(); - - for (int mu = 0; mu < Nd; mu++) - if (mpi_layout[mu] > 1) - nmu++; - - std::vector t_time(Nloop); - time_statistics timestat; - - std::cout << GridLogMessage << "====================================================================================================" << std::endl; - std::cout << GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in " << nmu << " dimensions" << std::endl; - std::cout << GridLogMessage << "====================================================================================================" << std::endl; - comms_header(); - - for (int lat = 16; lat <= maxlat; lat += 8) - { - // for(int Ls=8;Ls<=8;Ls*=2){ - { - int Ls = 12; - - Coordinate latt_size({lat * mpi_layout[0], - lat * mpi_layout[1], - lat * mpi_layout[2], - lat * mpi_layout[3]}); - - GridCartesian Grid(latt_size, simd_layout, mpi_layout); - RealD Nrank = Grid._Nprocessors; - RealD Nnode = Grid.NodeCount(); - RealD ppn = Nrank / Nnode; - - std::vector xbuf(8); - std::vector rbuf(8); - // Grid.ShmBufferFreeAll(); - uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD); - for (int d = 0; d < 8; d++) - { - xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); - rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); - // bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); - // bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); - } - - // int ncomm; - double dbytes; - - for (int dir = 0; dir < 8; dir++) - { - int mu = dir % 4; - if (mpi_layout[mu] > 1) - { - - std::vector times(Nloop); - for (int i = 0; i < Nloop; i++) - { - - dbytes = 0; - double start = usecond(); - int xmit_to_rank; - int recv_from_rank; - - if (dir == mu) - { - int comm_proc = 1; - Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); - } - else - { - int comm_proc = mpi_layout[mu] - 1; - Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); - } - Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, - (void *)&rbuf[dir][0], recv_from_rank, - bytes); - dbytes += bytes; - - double stop = usecond(); - t_time[i] = stop - start; // microseconds - } - timestat.statistics(t_time); - - dbytes = dbytes * ppn; - double xbytes = dbytes * 0.5; - double bidibytes = dbytes; - - std::cout << GridLogMessage << lat << "\t" << Ls << "\t " - << bytes << " \t " - << xbytes / timestat.mean << " \t " << xbytes * timestat.err / (timestat.mean * timestat.mean) << " \t " - << xbytes / timestat.max << " " << xbytes / timestat.min - << "\t\t" << bidibytes / timestat.mean << " " << bidibytes * timestat.err / (timestat.mean * timestat.mean) << " " - << bidibytes / timestat.max << " " << bidibytes / timestat.min << std::endl; - } - } - for (int d = 0; d < 8; d++) - { - acceleratorFreeDevice(xbuf[d]); - acceleratorFreeDevice(rbuf[d]); - } - } - } - return; - } - - static void Memory(void) - { - const int Nvec = 8; - typedef Lattice> LatticeVec; - typedef iVector Vec; - - Coordinate simd_layout = GridDefaultSimd(Nd, vReal::Nsimd()); - Coordinate mpi_layout = GridDefaultMpi(); - - std::cout << GridLogMessage << "==================================================================================" << std::endl; - std::cout << GridLogMessage << "= Benchmarking a*x + 
y bandwidth" << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; - std::cout << GridLogMessage << " L " - << "\t\t" - << "bytes" - << "\t\t\t" - << "GB/s" - << "\t\t" - << "Gflop/s" - << "\t\t seconds" - << "\t\tGB/s / node" << std::endl; - std::cout << GridLogMessage << "----------------------------------------------------------" << std::endl; - - // uint64_t NP; - uint64_t NN; - - uint64_t lmax = 32; -#define NLOOP (1000 * lmax * lmax * lmax * lmax / lat / lat / lat / lat) - - GridSerialRNG sRNG; - sRNG.SeedFixedIntegers(std::vector({45, 12, 81, 9})); - for (int lat = 8; lat <= lmax; lat += 8) - { - - Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2], lat * mpi_layout[3]}); - int64_t vol = latt_size[0] * latt_size[1] * latt_size[2] * latt_size[3]; - - GridCartesian Grid(latt_size, simd_layout, mpi_layout); - - // NP= Grid.RankCount(); - NN = Grid.NodeCount(); - - Vec rn; - random(sRNG, rn); - - LatticeVec z(&Grid); - z = Zero(); - LatticeVec x(&Grid); - x = Zero(); - LatticeVec y(&Grid); - y = Zero(); - double a = 2.0; - - uint64_t Nloop = NLOOP; - - double start = usecond(); - for (int i = 0; i < Nloop; i++) - { - z = a * x - y; - } - double stop = usecond(); - double time = (stop - start) / Nloop * 1000; - - double flops = vol * Nvec * 2; // mul,add - double bytes = 3.0 * vol * Nvec * sizeof(Real); - std::cout << GridLogMessage << std::setprecision(3) - << lat << "\t\t" << bytes << " \t\t" << bytes / time << "\t\t" << flops / time << "\t\t" << (stop - start) / 1000. / 1000. - << "\t\t" << bytes / time / NN << std::endl; - } - }; - - static void SU4(void) - { - const int Nc4 = 4; - typedef Lattice> LatticeSU4; - - Coordinate simd_layout = GridDefaultSimd(Nd, vComplexF::Nsimd()); - Coordinate mpi_layout = GridDefaultMpi(); - - std::cout << GridLogMessage << "==================================================================================" << std::endl; - std::cout << GridLogMessage << "= Benchmarking z = y*x SU(4) bandwidth" << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; - std::cout << GridLogMessage << " L " - << "\t\t" - << "bytes" - << "\t\t\t" - << "GB/s" - << "\t\t" - << "Gflop/s" - << "\t\t seconds" - << "\t\tGB/s / node" << std::endl; - std::cout << GridLogMessage << "----------------------------------------------------------" << std::endl; - - uint64_t NN; - - uint64_t lmax = 32; - - GridSerialRNG sRNG; - sRNG.SeedFixedIntegers(std::vector({45, 12, 81, 9})); - for (int lat = 8; lat <= lmax; lat += 8) - { - - Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2], lat * mpi_layout[3]}); - int64_t vol = latt_size[0] * latt_size[1] * latt_size[2] * latt_size[3]; - - GridCartesian Grid(latt_size, simd_layout, mpi_layout); - - NN = Grid.NodeCount(); - - LatticeSU4 z(&Grid); - z = Zero(); - LatticeSU4 x(&Grid); - x = Zero(); - LatticeSU4 y(&Grid); - y = Zero(); - // double a=2.0; - - uint64_t Nloop = NLOOP; - - double start = usecond(); - for (int i = 0; i < Nloop; i++) - { - z = x * y; - } - double stop = usecond(); - double time = (stop - start) / Nloop * 1000; - - double flops = vol * Nc4 * Nc4 * (6 + (Nc4 - 1) * 8); // mul,add - double bytes = 3.0 * vol * Nc4 * Nc4 * 2 * sizeof(RealF); - std::cout << GridLogMessage << std::setprecision(3) - << lat << "\t\t" << bytes << " \t\t" << bytes / time << "\t\t" << flops / time << "\t\t" 
<< (stop - start) / 1000. / 1000. - << "\t\t" << bytes / time / NN << std::endl; - } - }; - - static double DWF(int Ls, int L) - { - RealD mass = 0.1; - RealD M5 = 1.8; - - double mflops; - double mflops_best = 0; - double mflops_worst = 0; - std::vector mflops_all; - - /////////////////////////////////////////////////////// - // Set/Get the layout & grid size - /////////////////////////////////////////////////////// - int threads = GridThread::GetThreads(); - Coordinate mpi = GridDefaultMpi(); - assert(mpi.size() == 4); - Coordinate local({L, L, L, L}); - Coordinate latt4({local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]}); - - GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4, - GridDefaultSimd(Nd, vComplex::Nsimd()), - GridDefaultMpi()); - uint64_t NP = TmpGrid->RankCount(); - uint64_t NN = TmpGrid->NodeCount(); - NN_global = NN; - uint64_t SHM = NP / NN; - - ///////// Welcome message //////////// - std::cout << GridLogMessage << "==================================================================================" << std::endl; - std::cout << GridLogMessage << "Benchmark DWF on " << L << "^4 local volume " << std::endl; - std::cout << GridLogMessage << "* Nc : " << Nc << std::endl; - std::cout << GridLogMessage << "* Global volume : " << GridCmdVectorIntToString(latt4) << std::endl; - std::cout << GridLogMessage << "* Ls : " << Ls << std::endl; - std::cout << GridLogMessage << "* ranks : " << NP << std::endl; - std::cout << GridLogMessage << "* nodes : " << NN << std::endl; - std::cout << GridLogMessage << "* ranks/node : " << SHM << std::endl; - std::cout << GridLogMessage << "* ranks geom : " << GridCmdVectorIntToString(mpi) << std::endl; - std::cout << GridLogMessage << "* Using " << threads << " threads" << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; - - ///////// Lattice Init //////////// - GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi()); - GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); - GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid); - GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid); - - ///////// RNG Init //////////// - std::vector seeds4({1, 2, 3, 4}); - std::vector seeds5({5, 6, 7, 8}); - GridParallelRNG RNG4(UGrid); - RNG4.SeedFixedIntegers(seeds4); - GridParallelRNG RNG5(FGrid); - RNG5.SeedFixedIntegers(seeds5); - std::cout << GridLogMessage << "Initialised RNGs" << std::endl; - - typedef DomainWallFermionF Action; - typedef typename Action::FermionField Fermion; - typedef LatticeGaugeFieldF Gauge; - - ///////// Source preparation //////////// - Gauge Umu(UGrid); - SU::HotConfiguration(RNG4, Umu); - Fermion src(FGrid); - random(RNG5, src); - Fermion src_e(FrbGrid); - Fermion src_o(FrbGrid); - Fermion r_e(FrbGrid); - Fermion r_o(FrbGrid); - Fermion r_eo(FGrid); - Action Dw(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5); - - { - - pickCheckerboard(Even, src_e, src); - pickCheckerboard(Odd, src_o, src); - - const int num_cases = 4; - std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S "); - - controls Cases[] = { - {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicyConcurrent}, - {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicyConcurrent}, - 
{WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicySequential}, - {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicySequential}}; - - for (int c = 0; c < num_cases; c++) - { - - WilsonKernelsStatic::Comms = Cases[c].CommsOverlap; - WilsonKernelsStatic::Opt = Cases[c].Opt; - CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch); - - std::cout << GridLogMessage << "==================================================================================" << std::endl; - if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric) - std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl; - if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute) - std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl; - if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) - std::cout << GridLogMessage << "* Using sequential Comms/Compute" << std::endl; - std::cout << GridLogMessage << "* SINGLE precision " << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; - - int nwarm = 10; - double t0 = usecond(); - FGrid->Barrier(); - for (int i = 0; i < nwarm; i++) - { - Dw.DhopEO(src_o, r_e, DaggerNo); - } - FGrid->Barrier(); - double t1 = usecond(); - uint64_t ncall = 500; - - FGrid->Broadcast(0, &ncall, sizeof(ncall)); - - // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); - for (uint64_t i = 0; i < ncall; i++) - { - t0 = usecond(); - Dw.DhopEO(src_o, r_e, DaggerNo); - t1 = usecond(); - t_time[i] = t1 - t0; - } - FGrid->Barrier(); - - double volume = Ls; - for (int mu = 0; mu < Nd; mu++) - volume = volume * latt4[mu]; - - // Nc=3 gives - // 1344= 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8 - // 1344 = Nc* (6+(Nc-1)*8)*2*Nd + Nd*Nc*2*2 + Nd*Nc*Ns*2 - // double flops=(1344.0*volume)/2; -#if 0 - double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + Nd*Nc*Ns + Nd*Nc*Ns*2; -#else - double fps = Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2; -#endif - double flops = (fps * volume) / 2; - double mf_hi, mf_lo, mf_err; - - timestat.statistics(t_time); - mf_hi = flops / timestat.min; - mf_lo = flops / timestat.max; - mf_err = flops / timestat.min * timestat.err / timestat.mean; - - mflops = flops / timestat.mean; - mflops_all.push_back(mflops); - if (mflops_best == 0) - mflops_best = mflops; - if (mflops_worst == 0) - mflops_worst = mflops; - if (mflops > mflops_best) - mflops_best = mflops; - if (mflops < mflops_worst) - mflops_worst = mflops; - - std::cout << GridLogMessage << "Deo FlopsPerSite is " << fps << std::endl; - std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s = " << mflops << " (" << mf_err << ") " << mf_lo << "-" << mf_hi << std::endl; - std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s per rank " << mflops / NP << std::endl; - std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s per node " << mflops / NN << std::endl; - } - - std::cout << GridLogMessage << "==================================================================================" << std::endl; - std::cout << GridLogMessage << L << "^4 x " << Ls << " Deo Best mflop/s = " << mflops_best << " ; " << mflops_best / NN << " per node " << std::endl; - std::cout << GridLogMessage << L << "^4 x " << Ls << " Deo 
Worst mflop/s = " << mflops_worst << " ; " << mflops_worst / NN << " per node " << std::endl; - std::cout << GridLogMessage << fmt << std::endl; - std::cout << GridLogMessage; - - for (int i = 0; i < mflops_all.size(); i++) - { - std::cout << mflops_all[i] / NN << " ; "; - } - std::cout << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; - } - return mflops_best; - } - - static double Staggered(int L) - { - double mflops; - double mflops_best = 0; - double mflops_worst = 0; - std::vector mflops_all; - - /////////////////////////////////////////////////////// - // Set/Get the layout & grid size - /////////////////////////////////////////////////////// - int threads = GridThread::GetThreads(); - Coordinate mpi = GridDefaultMpi(); - assert(mpi.size() == 4); - Coordinate local({L, L, L, L}); - Coordinate latt4({local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]}); - - GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4, - GridDefaultSimd(Nd, vComplex::Nsimd()), - GridDefaultMpi()); - uint64_t NP = TmpGrid->RankCount(); - uint64_t NN = TmpGrid->NodeCount(); - NN_global = NN; - uint64_t SHM = NP / NN; - - ///////// Welcome message //////////// - std::cout << GridLogMessage << "==================================================================================" << std::endl; - std::cout << GridLogMessage << "Benchmark ImprovedStaggered on " << L << "^4 local volume " << std::endl; - std::cout << GridLogMessage << "* Global volume : " << GridCmdVectorIntToString(latt4) << std::endl; - std::cout << GridLogMessage << "* ranks : " << NP << std::endl; - std::cout << GridLogMessage << "* nodes : " << NN << std::endl; - std::cout << GridLogMessage << "* ranks/node : " << SHM << std::endl; - std::cout << GridLogMessage << "* ranks geom : " << GridCmdVectorIntToString(mpi) << std::endl; - std::cout << GridLogMessage << "* Using " << threads << " threads" << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; - - ///////// Lattice Init //////////// - GridCartesian *FGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi()); - GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid); - - ///////// RNG Init //////////// - std::vector seeds4({1, 2, 3, 4}); - GridParallelRNG RNG4(FGrid); - RNG4.SeedFixedIntegers(seeds4); - std::cout << GridLogMessage << "Initialised RNGs" << std::endl; - - RealD mass = 0.1; - RealD c1 = 9.0 / 8.0; - RealD c2 = -1.0 / 24.0; - RealD u0 = 1.0; - - typedef ImprovedStaggeredFermionF Action; - typedef typename Action::FermionField Fermion; - typedef LatticeGaugeFieldF Gauge; - - Gauge Umu(FGrid); - SU::HotConfiguration(RNG4, Umu); - - typename Action::ImplParams params; - Action Ds(Umu, Umu, *FGrid, *FrbGrid, mass, c1, c2, u0, params); - - ///////// Source preparation //////////// - Fermion src(FGrid); - random(RNG4, src); - Fermion src_e(FrbGrid); - Fermion src_o(FrbGrid); - Fermion r_e(FrbGrid); - Fermion r_o(FrbGrid); - Fermion r_eo(FGrid); - - { - - pickCheckerboard(Even, src_e, src); - pickCheckerboard(Odd, src_o, src); - - const int num_cases = 4; - std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S "); - - controls Cases[] = { - {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicyConcurrent}, - {StaggeredKernelsStatic::OptGeneric, 
StaggeredKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicyConcurrent}, - {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicySequential}, - {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicySequential}}; - - for (int c = 0; c < num_cases; c++) - { - - StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap; - StaggeredKernelsStatic::Opt = Cases[c].Opt; - CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch); - - std::cout << GridLogMessage << "==================================================================================" << std::endl; - if (StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric) - std::cout << GridLogMessage << "* Using GENERIC Nc StaggeredKernels" << std::endl; - if (StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute) - std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl; - if (StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsThenCompute) - std::cout << GridLogMessage << "* Using sequential Comms/Compute" << std::endl; - std::cout << GridLogMessage << "* SINGLE precision " << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; - - int nwarm = 10; - double t0 = usecond(); - FGrid->Barrier(); - for (int i = 0; i < nwarm; i++) - { - Ds.DhopEO(src_o, r_e, DaggerNo); - } - FGrid->Barrier(); - double t1 = usecond(); - uint64_t ncall = 500; - - FGrid->Broadcast(0, &ncall, sizeof(ncall)); - - // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); - for (uint64_t i = 0; i < ncall; i++) - { - t0 = usecond(); - Ds.DhopEO(src_o, r_e, DaggerNo); - t1 = usecond(); - t_time[i] = t1 - t0; - } - FGrid->Barrier(); - - double volume = 1; - for (int mu = 0; mu < Nd; mu++) - volume = volume * latt4[mu]; - double flops = (1146.0 * volume) / 2; - double mf_hi, mf_lo, mf_err; - - timestat.statistics(t_time); - mf_hi = flops / timestat.min; - mf_lo = flops / timestat.max; - mf_err = flops / timestat.min * timestat.err / timestat.mean; - - mflops = flops / timestat.mean; - mflops_all.push_back(mflops); - if (mflops_best == 0) - mflops_best = mflops; - if (mflops_worst == 0) - mflops_worst = mflops; - if (mflops > mflops_best) - mflops_best = mflops; - if (mflops < mflops_worst) - mflops_worst = mflops; - - std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s = " << mflops << " (" << mf_err << ") " << mf_lo << "-" << mf_hi << std::endl; - std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s per rank " << mflops / NP << std::endl; - std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s per node " << mflops / NN << std::endl; - } - - std::cout << GridLogMessage << "==================================================================================" << std::endl; - std::cout << GridLogMessage << L << "^4 Deo Best mflop/s = " << mflops_best << " ; " << mflops_best / NN << " per node " << std::endl; - std::cout << GridLogMessage << L << "^4 Deo Worst mflop/s = " << mflops_worst << " ; " << mflops_worst / NN << " per node " << std::endl; - std::cout << GridLogMessage << fmt << std::endl; - std::cout << GridLogMessage; - - for (int i = 0; i < mflops_all.size(); i++) - { - std::cout << mflops_all[i] / NN << " ; "; - } - std::cout << std::endl; - } - 
std::cout << GridLogMessage << "==================================================================================" << std::endl; - return mflops_best; - } -}; - -int main(int argc, char **argv) -{ - Grid_init(&argc, &argv); - - CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential); -#ifdef KNL - LebesgueOrder::Block = std::vector({8, 2, 2, 2}); -#else - LebesgueOrder::Block = std::vector({2, 2, 2, 2}); -#endif - Benchmark::Decomposition(); - - int do_su4 = 1; - int do_memory = 1; - int do_comms = 1; - - int sel = 4; - std::vector L_list({8, 12, 16, 24, 32}); - int selm1 = sel - 1; - - std::vector wilson; - std::vector dwf4; - std::vector staggered; - - int Ls = 1; - std::cout << GridLogMessage << "==================================================================================" << std::endl; - std::cout << GridLogMessage << " Wilson dslash 4D vectorised" << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; - for (int l = 0; l < L_list.size(); l++) - { - wilson.push_back(Benchmark::DWF(Ls, L_list[l])); - } - - Ls = 12; - std::cout << GridLogMessage << "==================================================================================" << std::endl; - std::cout << GridLogMessage << " Domain wall dslash 4D vectorised" << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; - for (int l = 0; l < L_list.size(); l++) - { - double result = Benchmark::DWF(Ls, L_list[l]); - dwf4.push_back(result); - } - - std::cout << GridLogMessage << "==================================================================================" << std::endl; - std::cout << GridLogMessage << " Improved Staggered dslash 4D vectorised" << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; - for (int l = 0; l < L_list.size(); l++) - { - double result = Benchmark::Staggered(L_list[l]); - staggered.push_back(result); - } - - std::cout << GridLogMessage << "==================================================================================" << std::endl; - std::cout << GridLogMessage << " Summary table Ls=" << Ls << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; - std::cout << GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\t Staggered" << std::endl; - for (int l = 0; l < L_list.size(); l++) - { - std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] << " \t\t " << dwf4[l] << " \t\t " << staggered[l] << std::endl; - } - std::cout << GridLogMessage << "==================================================================================" << std::endl; - - int NN = NN_global; - if (do_memory) - { - std::cout << GridLogMessage << "==================================================================================" << std::endl; - std::cout << GridLogMessage << " Memory benchmark " << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; - Benchmark::Memory(); - } - - if (do_su4) - { - std::cout << GridLogMessage << "==================================================================================" << std::endl; - std::cout << GridLogMessage << " SU(4) benchmark " << std::endl; - std::cout << GridLogMessage << 
"==================================================================================" << std::endl; - Benchmark::SU4(); - } - - if (do_comms) - { - std::cout << GridLogMessage << "==================================================================================" << std::endl; - std::cout << GridLogMessage << " Communications benchmark " << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; - Benchmark::Comms(); - } - - std::cout << GridLogMessage << "==================================================================================" << std::endl; - std::cout << GridLogMessage << " Per Node Summary table Ls=" << Ls << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; - std::cout << GridLogMessage << " L \t\t Wilson\t\t DWF4\t\t Staggered " << std::endl; - for (int l = 0; l < L_list.size(); l++) - { - std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] / NN << " \t " << dwf4[l] / NN << " \t " << staggered[l] / NN << std::endl; - } - std::cout << GridLogMessage << "==================================================================================" << std::endl; - - std::cout << GridLogMessage << "==================================================================================" << std::endl; - std::cout << GridLogMessage << " Comparison point result: " << 0.5 * (dwf4[sel] + dwf4[selm1]) / NN << " Mflop/s per node" << std::endl; - std::cout << GridLogMessage << " Comparison point is 0.5*(" << dwf4[sel] / NN << "+" << dwf4[selm1] / NN << ") " << std::endl; - std::cout << std::setprecision(3); - std::cout << GridLogMessage << "==================================================================================" << std::endl; - - Grid_finalize(); -} diff --git a/Grid/Benchmark_comms_host_device.cpp b/Grid/Benchmark_comms_host_device.cpp index 56227c4..e213859 100644 --- a/Grid/Benchmark_comms_host_device.cpp +++ b/Grid/Benchmark_comms_host_device.cpp @@ -21,231 +21,245 @@ along with this program. If not, see . 
using namespace std; using namespace Grid; -struct time_statistics{ +struct time_statistics +{ double mean; double err; double min; double max; - void statistics(std::vector v){ - double sum = std::accumulate(v.begin(), v.end(), 0.0); - mean = sum / v.size(); + void statistics(std::vector v) + { + double sum = std::accumulate(v.begin(), v.end(), 0.0); + mean = sum / v.size(); - std::vector diff(v.size()); - std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; }); - double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0); - err = std::sqrt(sq_sum / (v.size()*(v.size() - 1))); + std::vector diff(v.size()); + std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; }); + double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0); + err = std::sqrt(sq_sum / (v.size() * (v.size() - 1))); - auto result = std::minmax_element(v.begin(), v.end()); - min = *result.first; - max = *result.second; -} + auto result = std::minmax_element(v.begin(), v.end()); + min = *result.first; + max = *result.second; + } }; -void header(){ - std::cout <1) nmu++; + int Nloop = 250; + int nmu = 0; + int maxlat = 32; + for (int mu = 0; mu < Nd; mu++) + if (mpi_layout[mu] > 1) + nmu++; - std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl; + std::cout << GridLogMessage << "Number of iterations to average: " << Nloop + << std::endl; std::vector t_time(Nloop); // time_statistics timestat; - std::cout< > xbuf(8); - std::vector > rbuf(8); + std::vector> xbuf(8); + std::vector> rbuf(8); - for(int mu=0;mu<8;mu++){ - xbuf[mu].resize(lat*lat*lat*Ls); - rbuf[mu].resize(lat*lat*lat*Ls); + for (int mu = 0; mu < 8; mu++) + { + xbuf[mu].resize(lat * lat * lat * Ls); + rbuf[mu].resize(lat * lat * lat * Ls); } - uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD); int ncomm; - for(int mu=0;mu<4;mu++){ - if (mpi_layout[mu]>1 ) { - double start=usecond(); - for(int i=0;i 1) + { + double start = usecond(); + for (int i = 0; i < Nloop; i++) + { - ncomm=0; - - - ncomm++; - int comm_proc=1; - int xmit_to_rank; - int recv_from_rank; - - { - std::vector requests; - Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); - Grid.SendToRecvFrom((void *)&xbuf[mu][0], - xmit_to_rank, - (void *)&rbuf[mu][0], - recv_from_rank, - bytes); - } + ncomm = 0; - comm_proc = mpi_layout[mu]-1; - { - std::vector requests; - Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); - Grid.SendToRecvFrom((void *)&xbuf[mu+4][0], - xmit_to_rank, - (void *)&rbuf[mu+4][0], - recv_from_rank, - bytes); - } - } - Grid.Barrier(); - double stop=usecond(); - double mean=(stop-start)/Nloop; - double dbytes = bytes*ppn; - double xbytes = dbytes*2.0*ncomm; - double rbytes = xbytes; - double bidibytes = xbytes+rbytes; + ncomm++; + int comm_proc = 1; + int xmit_to_rank; + int recv_from_rank; - std::cout< requests; + Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); + Grid.SendToRecvFrom((void *)&xbuf[mu][0], xmit_to_rank, + (void *)&rbuf[mu][0], recv_from_rank, bytes); + } + comm_proc = mpi_layout[mu] - 1; + { + std::vector requests; + Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); + Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0], xmit_to_rank, + (void *)&rbuf[mu + 4][0], recv_from_rank, bytes); + } + } + Grid.Barrier(); + double stop = usecond(); + double mean = (stop - start) / Nloop; + double dbytes = bytes * ppn; + double 
xbytes = dbytes * 2.0 * ncomm; + double rbytes = xbytes; + double bidibytes = xbytes + rbytes; - - } + std::cout << GridLogMessage << std::setw(4) << lat << "\t" << Ls << "\t" + << std::setw(11) << bytes << std::fixed << std::setprecision(1) + << std::setw(7) << " " << std::right << xbytes / mean << " " + << "\t\t" << std::setw(7) << bidibytes / mean << std::endl; + } } - - - } } - std::cout< xbuf(8); std::vector rbuf(8); - uint64_t bytes = lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); - for(int d=0;d<8;d++){ - xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); - rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); + uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD); + for (int d = 0; d < 8; d++) + { + xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); + rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); } int ncomm; - for(int mu=0;mu<4;mu++){ - if (mpi_layout[mu]>1 ) { - double start=usecond(); - for(int i=0;i 1) + { + double start = usecond(); + for (int i = 0; i < Nloop; i++) + { - ncomm=0; - - - ncomm++; - int comm_proc=1; - int xmit_to_rank; - int recv_from_rank; - - { - std::vector requests; - Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); - Grid.SendToRecvFrom((void *)&xbuf[mu][0], - xmit_to_rank, - (void *)&rbuf[mu][0], - recv_from_rank, - bytes); - } + ncomm = 0; - comm_proc = mpi_layout[mu]-1; - { - std::vector requests; - Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); - Grid.SendToRecvFrom((void *)&xbuf[mu+4][0], - xmit_to_rank, - (void *)&rbuf[mu+4][0], - recv_from_rank, - bytes); - } - } - Grid.Barrier(); - double stop=usecond(); - double mean=(stop-start)/Nloop; - double dbytes = bytes*ppn; - double xbytes = dbytes*2.0*ncomm; - double rbytes = xbytes; - double bidibytes = xbytes+rbytes; + ncomm++; + int comm_proc = 1; + int xmit_to_rank; + int recv_from_rank; - std::cout< requests; + Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); + Grid.SendToRecvFrom((void *)&xbuf[mu][0], xmit_to_rank, + (void *)&rbuf[mu][0], recv_from_rank, bytes); + } + comm_proc = mpi_layout[mu] - 1; + { + std::vector requests; + Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); + Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0], xmit_to_rank, + (void *)&rbuf[mu + 4][0], recv_from_rank, bytes); + } + } + Grid.Barrier(); + double stop = usecond(); + double mean = (stop - start) / Nloop; + double dbytes = bytes * ppn; + double xbytes = dbytes * 2.0 * ncomm; + double rbytes = xbytes; + double bidibytes = xbytes + rbytes; - - } + std::cout << GridLogMessage << std::setw(4) << lat << "\t" << Ls << "\t" + << std::setw(11) << bytes << std::fixed << std::setprecision(1) + << std::setw(7) << " " << std::right << xbytes / mean << " " + << "\t\t" << std::setw(7) << bidibytes / mean << std::endl; + } } - for(int d=0;d<8;d++){ - acceleratorFreeDevice(xbuf[d]); - acceleratorFreeDevice(rbuf[d]); + for (int d = 0; d < 8; d++) + { + acceleratorFreeDevice(xbuf[d]); + acceleratorFreeDevice(rbuf[d]); } - - } } - - std::cout< Copyright © 2022 Antonin Portelli +Copyright © 2023 Simon Bürger This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License @@ -16,6 +17,7 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . */ +#include "json.hpp" #include #ifdef GRID_CUDA #define CUDA_PROFILE @@ -28,17 +30,13 @@ along with this program. If not, see . 
using namespace std; using namespace Grid; -template -struct scal +template struct scal { d internal; }; -Gamma::Algebra Gmu[] = { - Gamma::Algebra::GammaX, - Gamma::Algebra::GammaY, - Gamma::Algebra::GammaZ, - Gamma::Algebra::GammaT}; +Gamma::Algebra Gmu[] = {Gamma::Algebra::GammaX, Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT}; int main(int argc, char **argv) { @@ -48,24 +46,41 @@ int main(int argc, char **argv) Coordinate latt4 = GridDefaultLatt(); int Ls = 16; + std::string json_filename = ""; // empty indicates no json output + nlohmann::json json; + + // benchmark specific command line arguments for (int i = 0; i < argc; i++) + { if (std::string(argv[i]) == "-Ls") { std::stringstream ss(argv[i + 1]); ss >> Ls; } + if (std::string(argv[i]) == "--json-out") + json_filename = argv[i + 1]; + } GridLogLayout(); long unsigned int single_site_flops = 8 * Nc * (7 + 16 * Nc); - GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi()); + json["single_site_flops"] = single_site_flops; + + GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid( + GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi()); GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid); GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid); + json["grid"] = FGrid->FullDimensions().toVector(); + json["local_grid"] = FGrid->LocalDimensions().toVector(); + std::cout << GridLogMessage << "Making s innermost grids" << std::endl; - GridCartesian *sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(), GridDefaultMpi()); + GridCartesian *sUGrid = + SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(), GridDefaultMpi()); + GridRedBlackCartesian *sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid); GridCartesian *sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls, UGrid); GridRedBlackCartesian *sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls, UGrid); @@ -177,13 +192,28 @@ int main(int argc, char **argv) RealD NP = UGrid->_Nprocessors; RealD NN = UGrid->NodeCount(); - std::cout << GridLogMessage << "*****************************************************************" << std::endl; - std::cout << GridLogMessage << "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" << std::endl; - std::cout << GridLogMessage << "*****************************************************************" << std::endl; - std::cout << GridLogMessage << "*****************************************************************" << std::endl; - std::cout << GridLogMessage << "* Benchmarking DomainWallFermionR::Dhop " << std::endl; - std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd() << std::endl; - std::cout << GridLogMessage << "* VComplexF size is " << sizeof(vComplexF) << " B" << std::endl; + json["ranks"] = NP; + json["nodes"] = NN; + + std::cout << GridLogMessage + << "*****************************************************************" + << std::endl; + std::cout << GridLogMessage + << "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" + << std::endl; + std::cout << GridLogMessage + << "*****************************************************************" + << std::endl; + std::cout << GridLogMessage + << "*****************************************************************" + << std::endl; + std::cout << GridLogMessage + << "* Benchmarking 
DomainWallFermionR::Dhop " << std::endl; + std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd() + << std::endl; + std::cout << GridLogMessage << "* VComplexF size is " << sizeof(vComplexF) << " B" + << std::endl; + if (sizeof(RealF) == 4) std::cout << GridLogMessage << "* SINGLE precision " << std::endl; if (sizeof(RealF) == 8) @@ -199,8 +229,11 @@ int main(int argc, char **argv) if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage << "* Using Nc=3 WilsonKernels" << std::endl; if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm) + std::cout << GridLogMessage << "* Using Asm Nc=3 WilsonKernels" << std::endl; - std::cout << GridLogMessage << "*****************************************************************" << std::endl; + std::cout << GridLogMessage + << "*****************************************************************" + << std::endl; DomainWallFermionF Dw(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5); int ncall = 300; @@ -230,19 +263,38 @@ int main(int argc, char **argv) auto simdwidth = sizeof(vComplex); // RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors - double data_rf = volume * ((2 * Nd + 1) * Nd * Nc + 2 * Nd * Nc * Nc) * simdwidth / nsimd * ncall / (1024. * 1024. * 1024.); + double data_rf = volume * ((2 * Nd + 1) * Nd * Nc + 2 * Nd * Nc * Nc) * simdwidth / + nsimd * ncall / (1024. * 1024. * 1024.); // mem: Nd Wilson * Ls, Nd gauge, Nc colors - double data_mem = (volume * (2 * Nd + 1) * Nd * Nc + (volume / Ls) * 2 * Nd * Nc * Nc) * simdwidth / nsimd * ncall / (1024. * 1024. * 1024.); + double data_mem = + (volume * (2 * Nd + 1) * Nd * Nc + (volume / Ls) * 2 * Nd * Nc * Nc) * simdwidth / + nsimd * ncall / (1024. * 1024. * 1024.); - std::cout << GridLogMessage << "Called Dw " << ncall << " times in " << t1 - t0 << " us" << std::endl; + json["Dw"]["calls"] = ncall; + json["Dw"]["time"] = t1 - t0; + json["Dw"]["mflops"] = flops / (t1 - t0); + json["Dw"]["mflops_per_rank"] = flops / (t1 - t0) / NP; + json["Dw"]["mflops_per_node"] = flops / (t1 - t0) / NN; + json["Dw"]["RF"] = 1000000. * data_rf / ((t1 - t0)); + json["Dw"]["mem"] = 1000000. 
* data_mem / ((t1 - t0)); + + std::cout << GridLogMessage << "Called Dw " << ncall << " times in " << t1 - t0 + << " us" << std::endl; // std::cout<Barrier(); @@ -386,9 +449,18 @@ int main(int argc, char **argv) volume = volume * latt4[mu]; double flops = (single_site_flops * volume * ncall) / 2.0; + json["Deo"]["calls"] = ncall; + json["Deo"]["time"] = t1 - t0; + json["Deo"]["mflops"] = flops / (t1 - t0); + json["Deo"]["mflops_per_rank"] = flops / (t1 - t0) / NP; + json["Deo"]["mflops_per_node"] = flops / (t1 - t0) / NN; + std::cout << GridLogMessage << "Deo mflop/s = " << flops / (t1 - t0) << std::endl; - std::cout << GridLogMessage << "Deo mflop/s per rank " << flops / (t1 - t0) / NP << std::endl; - std::cout << GridLogMessage << "Deo mflop/s per node " << flops / (t1 - t0) / NN << std::endl; + std::cout << GridLogMessage << "Deo mflop/s per rank " << flops / (t1 - t0) / NP + << std::endl; + std::cout << GridLogMessage << "Deo mflop/s per node " << flops / (t1 - t0) / NN + << std::endl; + Dw.Report(); } Dw.DhopEO(src_o, r_e, DaggerNo); @@ -420,6 +492,21 @@ int main(int argc, char **argv) assert(norm2(src_e) < 1.0e-4); assert(norm2(src_o) < 1.0e-4); + + if (!json_filename.empty()) + { + std::cout << GridLogMessage << "writing benchmark results to " << json_filename + << std::endl; + + int me = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &me); + if (me == 0) + { + std::ofstream json_file(json_filename); + json_file << std::setw(4) << json; + } + } + Grid_finalize(); exit(0); } diff --git a/Grid/Common.hpp b/Grid/Common.hpp new file mode 100644 index 0000000..ccae02c --- /dev/null +++ b/Grid/Common.hpp @@ -0,0 +1,36 @@ +/* +Copyright © 2022 Antonin Portelli + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +*/ + +#ifndef Grid_Benchmarks_Common_hpp_ +#define Grid_Benchmarks_Common_hpp_ + +#ifndef GRID_MSG +#define GRID_MSG std::cout << GridLogMessage +#endif + +#ifndef GRID_MSG_MAXSIZE +#define GRID_MSG_MAXSIZE 1024 +#endif + +#define grid_printf(...) \ + { \ + char _buf[GRID_MSG_MAXSIZE]; \ + snprintf(_buf, GRID_MSG_MAXSIZE, __VA_ARGS__); \ + GRID_MSG << _buf; \ + } + +#endif // Grid_Benchmarks_Common_hpp_ diff --git a/Grid/Makefile.am b/Grid/Makefile.am index 8557f11..a9da13e 100644 --- a/Grid/Makefile.am +++ b/Grid/Makefile.am @@ -3,10 +3,10 @@ ACLOCAL_AMFLAGS = -I .buildutils/m4 bin_PROGRAMS = \ Benchmark_comms_host_device \ Benchmark_dwf_fp32 \ - Benchmark_ITT \ + Benchmark_Grid \ Benchmark_IO Benchmark_comms_host_device_SOURCES = Benchmark_comms_host_device.cpp Benchmark_dwf_fp32_SOURCES = Benchmark_dwf_fp32.cpp -Benchmark_ITT_SOURCES = Benchmark_ITT.cpp +Benchmark_Grid_SOURCES = Benchmark_Grid.cpp Benchmark_IO_SOURCES = Benchmark_IO.cpp diff --git a/Grid/Readme.md b/Grid/Readme.md new file mode 100644 index 0000000..10d6bbd --- /dev/null +++ b/Grid/Readme.md @@ -0,0 +1,69 @@ +# Grid benchmarks + +This folder contains benchmarks for the [Grid](https://github.com/aportelli/) library. 
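+The benchmarks are ordinary Grid programs, so they accept Grid's generic runtime flags
+(e.g. `--mpi` and `--threads`) in addition to any benchmark-specific options. A purely
+illustrative launch sketch (the launcher, rank count, and flag values below are
+placeholders to adapt to your system):
+```bash
+# hypothetical 4-rank launch of the main benchmark binary
+mpirun -np 4 ./Benchmark_Grid --mpi 1.1.1.4 --threads 8
+```
+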
+The benchmarks can be summarised as follows + +- `Benchmark_Grid`: This benchmark measures floating-point performance for various fermion +matrices, as well as bandwidth measurements for different operations. Measurements are +performed for a fixed range of problem sizes. + +## TL;DR +Build and install Grid, all dependencies, and the benchmark with +```bash +systems/<system>/bootstrap-env.sh <env_dir> # build dependencies, takes a long time +./build-grid.sh <env_dir> <config> # build Grid +./build-benchmark.sh <env_dir> <config> # build benchmarks +``` +where `<env_dir>` is an arbitrary directory where every product will be stored, `<system>` +is a sub-directory of `systems` containing system-specific scripts +(an existing preset or your own), and finally `<config>` is the name of a build config +in `systems/<system>/grid-config.json`. After a successful execution the benchmark binaries +will be in `<env_dir>/prefix/gridbench_<config>`. + +## Environment setup +A complete runtime environment can be deployed using the scripts from this repository. System-specific scripts are in the `systems` directory. + +You should first deploy the environment for the specific system you are using, for example +```bash +systems/tursa/bootstrap-env.sh ./env +``` +will deploy the relevant environment for the [Tursa](https://www.epcc.ed.ac.uk/hpc-services/dirac-tursa-gpu) supercomputer in `./env`. This step might compile from source a large set +of packages, and might take some time to complete. + +After that, the environment directory (`./env` in the example above) will contain an `env.sh` file that needs to be sourced to activate the environment +```bash +source ./env/env.sh +``` +Additional scripts `env-*.sh` can be sourced afterwards to activate more specific environments; +this should be done after sourcing `env.sh` as above. + +## Building the benchmarks +The environment directory contains a `grid-config.json` file specifying compilation flag +configurations for Grid (please see Grid's repository for documentation). All entries have +the form +```json +{ + "name": "foo", // name of the configuration + "env-script": "bar.sh", // script to source before building + // (path relative to the environment directory) + "commit": "...", // Grid commit to use + // (anything that can be an argument of git checkout) + "config-options": "...", // options to pass to the configure script + "env" : { // environment variables + "VAR": "value" // export VAR="value" before building + } +} +``` +Grid can then be built with +``` +./build-grid.sh <env_dir> <config> +``` +where `<env_dir>` is the environment directory and `<config>` is the build config name in +`grid-config.json`. Similarly, the benchmarks can then be built with +``` +./build-benchmark.sh <env_dir> <config> +``` + +## Running the benchmarks +After building the benchmarks as above you can find the binaries in +`<env_dir>/prefix/gridbench_<config>`. \ No newline at end of file diff --git a/Grid/bootstrap.sh b/Grid/bootstrap.sh index 808ebeb..ad5a273 100755 --- a/Grid/bootstrap.sh +++ b/Grid/bootstrap.sh @@ -2,5 +2,10 @@ set -euo pipefail +json_url='https://raw.githubusercontent.com/nlohmann/json/bc889afb4c5bf1c0d8ee29ef35eaaf4c8bef8a5d/single_include/nlohmann/json.hpp' + +if [ !
-f json.hpp ]; then + wget ${json_url} +fi mkdir -p .buildutils/m4 autoreconf -fvi diff --git a/Grid/build-benchmark.sh b/Grid/build-benchmark.sh new file mode 100755 index 0000000..92ce696 --- /dev/null +++ b/Grid/build-benchmark.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# shellcheck disable=SC1090,SC1091 + +set -euo pipefail + +if (( $# != 2 )); then + echo "usage: $(basename "$0") " 1>&2 + exit 1 +fi +env_dir=$1 +cfg=$2 + +call_dir=$(pwd -P) +script_dir="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")" +cd "${env_dir}" +env_dir=$(pwd -P) +cd "${call_dir}" +build_dir="${env_dir}/build/Grid-benchmarks/${cfg}" +mkdir -p "${build_dir}" +source "${env_dir}/env.sh" +entry=$(jq ".configs[]|select(.name==\"${cfg}\")" "${env_dir}"/grid-config.json) +env_script=$(echo "${entry}" | jq -r ".\"env-script\"") +cd "${build_dir}" || return +source "${env_dir}/${env_script}" +if [ ! -f Makefile ]; then + "${script_dir}/configure" --with-grid="${env_dir}/prefix/grid_${cfg}" \ + --prefix="${env_dir}/prefix/gridbench_${cfg}" +fi +make -j 128 +make install +cd "${call_dir}" diff --git a/Grid/build-grid.sh b/Grid/build-grid.sh new file mode 100755 index 0000000..c8af0a4 --- /dev/null +++ b/Grid/build-grid.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +# shellcheck disable=SC1090,SC1091 + +set -euo pipefail + +if (( $# != 2 )); then + echo "usage: $(basename "$0") " 1>&2 + exit 1 +fi +env_dir=$1 +cfg=$2 + +call_dir=$(pwd -P) +cd "${env_dir}" +env_dir=$(pwd -P) +cd "${call_dir}" +build_dir="${env_dir}/build/Grid/${cfg}" +if [ -d "${build_dir}" ]; then + echo "error: directory '${build_dir}' exists" + exit 1 +fi +mkdir -p "${build_dir}" +source "${env_dir}/env.sh" +entry=$(jq ".configs[]|select(.name==\"${cfg}\")" "${env_dir}"/grid-config.json) +IFS=" " read -r -a args <<< "$(echo "${entry}" | jq -r ".\"config-options\"")" +env_script=$(echo "${entry}" | jq -r ".\"env-script\"") +cd "${build_dir}" || return +source "${env_dir}/${env_script}" +extra_env=$(mktemp) +echo "${entry}" | jq -r '.env|to_entries|map("export \(.key)='\''\(.value|tostring)'\''")|.[]' > "${extra_env}" +commit=$(echo "${entry}" | jq -r ".commit") +git clone https://github.com/paboyle/Grid.git "${build_dir}" +cd "${build_dir}" +git checkout "${commit}" +./bootstrap.sh +mkdir build; cd build +source "${extra_env}" +../configure --prefix="${env_dir}/prefix/grid_${cfg}" "${args[@]}" +make -j128 +make install +rm -rf "${extra_env}" +cd "${call_dir}" diff --git a/Grid/configure.ac b/Grid/configure.ac index e5a5bf5..a2dfe93 100644 --- a/Grid/configure.ac +++ b/Grid/configure.ac @@ -3,55 +3,69 @@ AC_INIT([lattice-bench], [0.1], [antonin.portelli@me.com]) AC_CANONICAL_BUILD AC_CANONICAL_HOST AC_CANONICAL_TARGET -AC_CONFIG_SRCDIR([Benchmark_ITT.cpp]) +AC_CONFIG_SRCDIR([Benchmark_Grid.cpp]) AC_CONFIG_MACRO_DIR([.buildutils/m4]) AC_CONFIG_HEADERS([config.h]) AM_INIT_AUTOMAKE([-Wall -Werror foreign]) m4_ifdef([AM_SILENT_RULES],[AM_SILENT_RULES([yes])]) -# Checks for programs. 
-AC_PROG_CXX -AC_PROG_CC -AC_PROG_RANLIB -AM_PROG_AR -AC_LANG([C++]) - AC_ARG_WITH([grid], [AS_HELP_STRING([--with-grid=], [try this for a non-standard install prefix of Grid])], [PATH="$with_grid/bin$PATH_SEPARATOR$PATH"] [CXXFLAGS="$CXXFLAGS -I$with_grid/include"] [LDFLAGS="$LDFLAGS -L$with_grid/lib"]) + AC_CHECK_PROG([GRIDCONF],[grid-config],[yes]) if test x"$GRIDCONF" != x"yes" ; then AC_MSG_ERROR([grid-config not found]) fi +if test x"$CXX" == x ; then + CXX="`grid-config --cxx`" +elif test "$CXX" != "`grid-config --cxx`" ; then + AC_MSG_WARN([CXX differs from that reported by grid-config]) +fi +if test x"$CXXLD" == x ; then + CXXLD="`grid-config --cxxld`" +elif test "$CXXLD" != "`grid-config --cxxld`" ; then + AC_MSG_WARN([CXXLD differs from that reported by grid-config]) +fi CXXFLAGS="$CXXFLAGS `grid-config --cxxflags`" -LDFLAGS="$LDFLAGS `grid-config --ldflags`" CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS" + +AC_PROG_CC +AM_PROG_CC_C_O +AC_PROG_CXX + +LDFLAGS="$LDFLAGS `grid-config --ldflags`" LDFLAGS="$AM_LDFLAGS $LDFLAGS" -LIBS=" -lGrid $LIBS `grid-config --libs`" +LIBS=" -ldl -lGrid $LIBS `grid-config --libs`" + +AC_PROG_RANLIB +AM_PROG_AR +AC_LANG([C++]) AC_MSG_CHECKING([that a minimal Grid program compiles]); AC_LINK_IFELSE( - [AC_LANG_SOURCE([[ + [AC_LANG_SOURCE([[ #include - + using namespace Grid; - + int main(int argc, char *argv[]) { Grid_init(&argc, &argv); Grid_finalize(); - + return 0; } - + ]])], - [AC_MSG_RESULT([yes])], + [AC_MSG_RESULT([yes])], [AC_MSG_RESULT([no])] - [AC_MSG_ERROR([Could not compile a minimal Grid program])]) + [AC_MSG_ERROR([impossible to compile a minimal Grid program])]) +AC_SUBST([CXXLD]) AC_SUBST([AM_CXXFLAGS]) AC_SUBST([AM_LDFLAGS]) AC_CONFIG_FILES([Makefile]) diff --git a/Grid/systems/tursa/bootstrap-env.sh b/Grid/systems/tursa/bootstrap-env.sh new file mode 100755 index 0000000..d0e62c8 --- /dev/null +++ b/Grid/systems/tursa/bootstrap-env.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +set -euo pipefail + +if (( $# != 1 )); then + echo "usage: $(basename "$0") " 1>&2 + exit 1 +fi +dir=$1 + +call_dir=$(pwd -P) +script_dir="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")" +if [ -d "${dir}" ]; then + echo "error: directory '${dir}' exists" + exit 1 +fi +mkdir -p "${dir}" +cd "${dir}" +git clone https://github.com/spack/spack.git +cd "${call_dir}" +cp "${script_dir}"/files/* "${dir}" +cp "${script_dir}/env.sh" "${script_dir}/grid-config.json" "${dir}" +source "${dir}"/spack/share/spack/setup-env.sh +"${script_dir}"/spack-bootstrap.sh "${dir}" diff --git a/Grid/systems/tursa/env-cpu.sh b/Grid/systems/tursa/env-cpu.sh deleted file mode 100644 index 2df240b..0000000 --- a/Grid/systems/tursa/env-cpu.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/usr/bin/env bash -# shellcheck disable=SC2046 - -script_dir="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")" -spack load $(cat "${script_dir}"/grid-cpu.spack) diff --git a/Grid/systems/tursa/env-gpu.sh b/Grid/systems/tursa/env-gpu.sh deleted file mode 100644 index 0d0d85e..0000000 --- a/Grid/systems/tursa/env-gpu.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/usr/bin/env bash -# shellcheck disable=SC2046 - -script_dir="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")" -spack load $(cat "${script_dir}"/grid-gpu.spack) diff --git a/Grid/systems/tursa/env.sh b/Grid/systems/tursa/env.sh new file mode 100644 index 0000000..9534176 --- /dev/null +++ b/Grid/systems/tursa/env.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash +# shellcheck disable=SC1091 + +GRIDENVDIR="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")" +export GRIDENVDIR +export 
PATH="${GRIDENVDIR}/prefix/base/bin:${PATH}" +export ACLOCAL_PATH="${GRIDENVDIR}/prefix/base/share/aclocal:${ACLOCAL_PATH}" +source "${GRIDENVDIR}"/spack/share/spack/setup-env.sh diff --git a/Grid/systems/tursa/cpu-mpi-wrapper.sh b/Grid/systems/tursa/files/cpu-mpi-wrapper.sh similarity index 100% rename from Grid/systems/tursa/cpu-mpi-wrapper.sh rename to Grid/systems/tursa/files/cpu-mpi-wrapper.sh diff --git a/Grid/systems/tursa/files/env-cpu.sh b/Grid/systems/tursa/files/env-cpu.sh new file mode 100644 index 0000000..ec64373 --- /dev/null +++ b/Grid/systems/tursa/files/env-cpu.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +spack env activate grid-cpu +spack env status diff --git a/Grid/systems/tursa/files/env-gpu.sh b/Grid/systems/tursa/files/env-gpu.sh new file mode 100644 index 0000000..231647f --- /dev/null +++ b/Grid/systems/tursa/files/env-gpu.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +spack env activate grid-gpu +spack env status diff --git a/Grid/systems/tursa/gpu-mpi-wrapper.sh b/Grid/systems/tursa/files/gpu-mpi-wrapper.sh similarity index 100% rename from Grid/systems/tursa/gpu-mpi-wrapper.sh rename to Grid/systems/tursa/files/gpu-mpi-wrapper.sh diff --git a/Grid/systems/tursa/grid-config.json b/Grid/systems/tursa/grid-config.json new file mode 100644 index 0000000..d91d9e6 --- /dev/null +++ b/Grid/systems/tursa/grid-config.json @@ -0,0 +1,25 @@ +{ + "configs": [ + { + "name": "gpu", + "env-script": "env-gpu.sh", + "commit": "796abfad80625d81bb16af7ff6ec612a836f17d8", + "config-options": "--enable-comms=mpi --enable-simd=GPU --enable-alloc-align=4k --enable-accelerator-cshift --enable-shm=nvlink --enable-gen-simd-width=64 --enable-accelerator=cuda --disable-unified --enable-gparity=no --enable-fermion-reps=no", + "env" : { + "CXX": "nvcc", + "LDFLAGS": "-cudart shared", + "CXXFLAGS": "-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared" + } + }, + { + "name": "cpu", + "env-script": "env-cpu.sh", + "commit": "796abfad80625d81bb16af7ff6ec612a836f17d8", + "config-options": "--enable-comms=mpi-auto --enable-simd=AVX2 --enable-alloc-align=4k --enable-shm=shmget --enable-gparity=no --enable-fermion-reps=no", + "env" : { + "CXX": "clang++", + "MPICXX": "mpicxx" + } + } + ] +} diff --git a/Grid/systems/tursa/grid-cpu.spack b/Grid/systems/tursa/grid-cpu.spack deleted file mode 100644 index 85e1164..0000000 --- a/Grid/systems/tursa/grid-cpu.spack +++ /dev/null @@ -1,10 +0,0 @@ -gcc@9.4.0 -llvm@12.0.1 -ucx@1.12.0.CPU%gcc@9.4.0 -openmpi@4.1.1.CPU%gcc@9.4.0 -hdf5^openmpi@4.1.1.CPU%gcc@9.4.0 -fftw^openmpi@4.1.1.CPU%gcc@9.4.0 -openssl -gmp%gcc@9.4.0 -mpfr%gcc@9.4.0 -c-lime diff --git a/Grid/systems/tursa/grid-gpu.spack b/Grid/systems/tursa/grid-gpu.spack deleted file mode 100644 index f3d9660..0000000 --- a/Grid/systems/tursa/grid-gpu.spack +++ /dev/null @@ -1,10 +0,0 @@ -gcc@9.4.0 -cuda@11.4.0 -ucx@1.12.0.GPU%gcc@9.4.0 -openmpi@4.1.1.GPU%gcc@9.4.0 -hdf5^openmpi@4.1.1.GPU%gcc@9.4.0 -fftw^openmpi@4.1.1.GPU%gcc@9.4.0 -openssl -gmp%gcc@9.4.0 -mpfr%gcc@9.4.0 -c-lime diff --git a/Grid/systems/tursa/spack-bootstrap.sh b/Grid/systems/tursa/spack-bootstrap.sh old mode 100644 new mode 100755 index 33e515c..582cb93 --- a/Grid/systems/tursa/spack-bootstrap.sh +++ b/Grid/systems/tursa/spack-bootstrap.sh @@ -2,16 +2,19 @@ # shellcheck disable=SC2016 set -euo pipefail -GCC='gcc@9.4.0' -CUDA='cuda@11.4.0' -HDF5='hdf5@1.10.7' +gcc_spec='gcc@9.4.0' +cuda_spec='cuda@11.4.0' +hdf5_spec='hdf5@1.10.7' if (( $# != 1 )); then echo "usage: $(basename "$0") " 1>&2 exit 1 fi -ENVDIR=$1 -CWD=$(pwd 
-P) +dir=$1 +cwd=$(pwd -P) +cd "${dir}" +dir=$(pwd -P) +cd "${cwd}" # General configuration ######################################################## # build with 128 tasks @@ -33,107 +36,92 @@ echo 'packages: spack config --scope site add -f external.yaml rm external.yaml +# Base compilers ############################################################### # configure system base spack compiler find --scope site -# Base packages ################################################################ -# install GCC -spack install ${GCC} -spack load ${GCC} +# install GCC, CUDA & LLVM +spack install ${gcc_spec} ${cuda_spec} llvm + +spack load llvm spack compiler find --scope site -spack unload ${GCC} +spack unload llvm -# clean -spack clean -spack gc -y - -# install CUDA -spack install ${CUDA} - -# install development tools -dev_tools=("autoconf" "automake" "libtool" "git") -spack install "${dev_tools[@]}" - -# create view for CLI & dev tools -spack view symlink -i "${ENVDIR}/prefix/base" "${dev_tools[@]}" - -# install clang -spack install llvm@12.0.1 - -# locate new compilers -spack load llvm@12.0.1 +spack load ${gcc_spec} spack compiler find --scope site -spack unload llvm@12.0.1 +spack unload ${gcc_spec} # Manual compilation of OpenMPI & UCX ########################################## # set build directories -mkdir -p "${ENVDIR}"/build -cd "${ENVDIR}"/build +mkdir -p "${dir}"/build +cd "${dir}"/build -spack load ${GCC} ${CUDA} +spack load ${gcc_spec} ${cuda_spec} -CUDA_PATH=$(which nvcc | sed "s/bin/@/g" | cut -d "@" -f1) -GDRCOPY_PATH=/mnt/lustre/tursafs1/apps/gdrcopy/2.3.1 +cuda_path=$(spack find --format "{prefix}" cuda) +gdrcopy_path=/mnt/lustre/tursafs1/apps/gdrcopy/2.3.1 # Install ucx 1.12.0 -UCX_URL=https://github.com/openucx/ucx/releases/download/v1.12.0/ucx-1.12.0.tar.gz +ucx_url=https://github.com/openucx/ucx/releases/download/v1.12.0/ucx-1.12.0.tar.gz echo "-- building UCX from source" -wget ${UCX_URL} -UCX_AR=$(basename ${UCX_URL}) -tar -xvf "${UCX_AR}" -cd "${UCX_AR%.tar.gz}" +wget ${ucx_url} +ucx_ar=$(basename ${ucx_url}) +tar -xvf "${ucx_ar}" +cd "${ucx_ar%.tar.gz}" # ucx gpu build -mkdir build_gpu; cd build_gpu +mkdir -p build_gpu; cd build_gpu ../configure --build=x86_64-redhat-linux-gnu --host=x86_64-redhat-linux-gnu \ - --disable-dependency-tracking --prefix="${ENVDIR}"/prefix/ucx_gpu \ + --disable-dependency-tracking --prefix="${dir}"/prefix/ucx_gpu \ --enable-devel-headers --enable-examples --enable-optimizations \ - --with-gdrcopy=${GDRCOPY_PATH} --with-verbs --disable-logging \ + --with-gdrcopy=${gdrcopy_path} --with-verbs --disable-logging \ --disable-debug --disable-assertions --enable-cma \ --with-knem=/opt/knem-1.1.4.90mlnx1/ --with-rdmacm \ --without-rocm --without-ugni --without-java \ - --enable-compiler-opt=3 --with-cuda="${CUDA_PATH}" --without-cm \ + --enable-compiler-opt=3 --with-cuda="${cuda_path}" --without-cm \ --with-rc --with-ud --with-dc --with-mlx5-dv --with-dm \ - --enable-mt LDFLAGS=-L${GDRCOPY_PATH}/lib + --enable-mt --without-go LDFLAGS=-L${gdrcopy_path}/lib make -j 128 make install cd .. 
# ucx cpu build -mkdir build_cpu; cd build_cpu +mkdir -p build_cpu; cd build_cpu ../configure --build=x86_64-redhat-linux-gnu --host=x86_64-redhat-linux-gnu \ - --disable-dependency-tracking --prefix="${ENVDIR}"/prefix/ucx_cpu \ + --disable-dependency-tracking --prefix="${dir}"/prefix/ucx_cpu \ --enable-devel-headers --enable-examples --enable-optimizations \ --with-verbs --disable-logging --disable-debug \ --disable-assertions --enable-mt --enable-cma \ --with-knem=/opt/knem-1.1.4.90mlnx1/ --with-rdmacm \ --without-rocm --without-ugni --without-java \ --enable-compiler-opt=3 --without-cm --without-ugni --with-rc \ - --with-ud --with-dc --with-mlx5-dv --with-dm --enable-mt + --with-ud --with-dc --with-mlx5-dv --with-dm --enable-mt --without-go make -j 128 make install -cd "${ENVDIR}"/build +cd "${dir}"/build -# Install openmpi 4.1.1 (needs to be done on a gpu node) -OMPI_URL=https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.1.tar.gz +# Install openmpi 4.1.1 +ompi_url=https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.1.tar.gz echo "-- building OpenMPI from source" -wget ${OMPI_URL} -OMPI_AR=$(basename ${OMPI_URL}) -tar -xvf "${OMPI_AR}" -cd "${OMPI_AR%.tar.gz}" +wget ${ompi_url} +ompi_ar=$(basename ${ompi_url}) +tar -xvf "${ompi_ar}" +cd "${ompi_ar%.tar.gz}" +export AUTOMAKE_JOBS=128 +./autogen.pl -f # openmpi gpu build mkdir build_gpu; cd build_gpu -../configure --prefix="${ENVDIR}"/prefix/ompi_gpu --without-xpmem \ - --with-ucx="${ENVDIR}"/prefix/ucx_gpu \ - --with-ucx-libdir="${ENVDIR}"/prefix/ucx_gpu/lib \ +../configure --prefix="${dir}"/prefix/ompi_gpu --without-xpmem \ + --with-ucx="${dir}"/prefix/ucx_gpu \ + --with-ucx-libdir="${dir}"/prefix/ucx_gpu/lib \ --with-knem=/opt/knem-1.1.4.90mlnx1/ \ --enable-mca-no-build=btl-uct \ - --with-cuda="${CUDA_PATH}" --disable-getpwuid \ + --with-cuda="${cuda_path}" --disable-getpwuid \ --with-verbs --with-slurm --enable-mpi-fortran=all \ --with-pmix=internal --with-libevent=internal make -j 128 @@ -142,61 +130,76 @@ cd .. 
# openmpi cpu build mkdir build_cpu; cd build_cpu -../configure --prefix="${ENVDIR}"/prefix/ompi_cpu --without-xpmem \ - --with-ucx="${ENVDIR}"/prefix/ucx_cpu \ - --with-ucx-libdir="${ENVDIR}"/prefix/ucx_cpu/lib \ +../configure --prefix="${dir}"/prefix/ompi_cpu --without-xpmem \ + --with-ucx="${dir}"/prefix/ucx_cpu \ + --with-ucx-libdir="${dir}"/prefix/ucx_cpu/lib \ --with-knem=/opt/knem-1.1.4.90mlnx1/ \ --enable-mca-no-build=btl-uct --disable-getpwuid \ --with-verbs --with-slurm --enable-mpi-fortran=all \ --with-pmix=internal --with-libevent=internal make -j 128 make install -cd "${ENVDIR}" +cd "${dir}" # Add externals to spack echo "packages: ucx: externals: - spec: \"ucx@1.12.0.GPU%gcc@9.4.0\" - prefix: ${ENVDIR}/prefix/ucx_gpu + prefix: ${dir}/prefix/ucx_gpu - spec: \"ucx@1.12.0.CPU%gcc@9.4.0\" - prefix: ${ENVDIR}/prefix/ucx_cpu + prefix: ${dir}/prefix/ucx_cpu buildable: False openmpi: externals: - spec: \"openmpi@4.1.1.GPU%gcc@9.4.0\" - prefix: ${ENVDIR}/prefix/ompi_gpu + prefix: ${dir}/prefix/ompi_gpu - spec: \"openmpi@4.1.1.CPU%gcc@9.4.0\" - prefix: ${ENVDIR}/prefix/ompi_cpu + prefix: ${dir}/prefix/ompi_cpu buildable: False" > spack.yaml spack config --scope site add -f spack.yaml rm spack.yaml -spack install ucx@1.12.0.GPU%gcc@9.4.0 -spack install ucx@1.12.0.CPU%gcc@9.4.0 -spack install openmpi@4.1.1.GPU%gcc@9.4.0 -spack install openmpi@4.1.1.CPU%gcc@9.4.0 +spack install ucx@1.12.0.GPU%gcc@9.4.0 openmpi@4.1.1.GPU%gcc@9.4.0 +spack install ucx@1.12.0.CPU%gcc@9.4.0 openmpi@4.1.1.CPU%gcc@9.4.0 -# Install Grid dependencies #################################################### -cd "${CWD}" +cd "${cwd}" -OPENMPIGPUHASH=$(spack find --format "{hash}" openmpi@4.1.1.GPU) -OPENMPICPUHASH=$(spack find --format "{hash}" openmpi@4.1.1.CPU) +# environments ################################################################# +dev_tools=("autoconf" "automake" "libtool" "jq") +ompi_gpu_hash=$(spack find --format "{hash}" openmpi@4.1.1.GPU) +ompi_cpu_hash=$(spack find --format "{hash}" openmpi@4.1.1.CPU) -spack install ${HDF5}+cxx+threadsafe ^/"${OPENMPIGPUHASH}" -spack install ${HDF5}+cxx+threadsafe ^/"${OPENMPICPUHASH}" -spack install fftw ^/"${OPENMPIGPUHASH}" -spack install fftw ^/"${OPENMPICPUHASH}" -spack install openssl gmp mpfr c-lime +spack env create grid-gpu +spack env activate grid-gpu +spack add ${gcc_spec} ${cuda_spec} "${dev_tools[@]}" +spack add ucx@1.12.0.GPU%gcc@9.4.0 openmpi@4.1.1.GPU%gcc@9.4.0 +spack add ${hdf5_spec}+cxx+threadsafe ^/"${ompi_gpu_hash}" +spack add fftw ^/"${ompi_gpu_hash}" +spack add openssl gmp mpfr c-lime +spack install +spack env deactivate + +spack env create grid-cpu +spack env activate grid-cpu +spack add llvm "${dev_tools[@]}" +spack add ucx@1.12.0.CPU%gcc@9.4.0 openmpi@4.1.1.CPU%gcc@9.4.0 +spack add ${hdf5_spec}+cxx+threadsafe ^/"${ompi_cpu_hash}" +spack add fftw ^/"${ompi_cpu_hash}" +spack add openssl gmp mpfr c-lime +spack install +spack env deactivate # Final setup ################################################################## spack clean +spack gc -y # add more environment variables in module loading -spack config --scope site add 'modules:prefix_inspections:lib:[LIBRARY_PATH]' +spack config --scope site add 'modules:prefix_inspections:lib:[LD_LIBRARY_PATH,LIBRARY_PATH]' +spack config --scope site add 'modules:prefix_inspections:lib64:[LD_LIBRARY_PATH,LIBRARY_PATH]' spack config --scope site add 'modules:prefix_inspections:include:[C_INCLUDE_PATH,CPLUS_INCLUDE_PATH,INCLUDE]' spack module tcl refresh -y # permission change for group access -chmod -R g+rw 
"${ENVDIR}/spack/var/spack/cache" -setfacl -d -R -m g::rwX "${ENVDIR}/spack/var/spack/cache" +chmod -R g+rw "${dir}/spack/var/spack/cache" +setfacl -d -R -m g::rwX "${dir}/spack/var/spack/cache" diff --git a/Readme.md b/Readme.md new file mode 100644 index 0000000..b3aefae --- /dev/null +++ b/Readme.md @@ -0,0 +1,8 @@ +# Lattice benchmarks + +This repository is an attempt at packaging benchmarks for various libraries used for +lattice field theory simulations. It is currently only featuring the Grid library but +more will be added later. + +Libraries: +- [Grid](https://github.com/aportelli/) - [Documentation](Grid/Readme.md) \ No newline at end of file