diff --git a/Grid/Benchmark_Grid.cpp b/Grid/Benchmark_Grid.cpp index 1495d13..3e0dd4d 100644 --- a/Grid/Benchmark_Grid.cpp +++ b/Grid/Benchmark_Grid.cpp @@ -44,8 +44,7 @@ struct time_statistics mean = sum / v.size(); std::vector diff(v.size()); - std::transform(v.begin(), v.end(), diff.begin(), [=](double x) - { return x - mean; }); + std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; }); double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0); err = std::sqrt(sq_sum / (v.size() * (v.size() - 1))); @@ -64,11 +63,8 @@ void comms_header() << "bytes\t MB/s uni (err/min/max) \t\t MB/s bidi (err/min/max)" << std::endl; }; -Gamma::Algebra Gmu[] = { - Gamma::Algebra::GammaX, - Gamma::Algebra::GammaY, - Gamma::Algebra::GammaZ, - Gamma::Algebra::GammaT}; +Gamma::Algebra Gmu[] = {Gamma::Algebra::GammaX, Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT}; struct controls { @@ -84,19 +80,49 @@ class Benchmark { int threads = GridThread::GetThreads(); - std::cout << GridLogMessage << "==================================================================================" << std::endl; - std::cout << GridLogMessage << "= Grid is setup to use " << threads << " threads" << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; + std::cout << GridLogMessage << "= Grid is setup to use " << threads << " threads" + << std::endl; + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; std::cout << GridLogMessage << "Grid Default Decomposition patterns\n"; - std::cout << GridLogMessage << "\tOpenMP threads : " << GridThread::GetThreads() << std::endl; - std::cout << GridLogMessage << "\tMPI tasks : " << GridCmdVectorIntToString(GridDefaultMpi()) << std::endl; - std::cout << GridLogMessage << "\tvReal : " << sizeof(vReal) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vReal::Nsimd())) << std::endl; - std::cout << GridLogMessage << "\tvRealF : " << sizeof(vRealF) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vRealF::Nsimd())) << std::endl; - std::cout << GridLogMessage << "\tvRealD : " << sizeof(vRealD) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vRealD::Nsimd())) << std::endl; - std::cout << GridLogMessage << "\tvComplex : " << sizeof(vComplex) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vComplex::Nsimd())) << std::endl; - std::cout << GridLogMessage << "\tvComplexF : " << sizeof(vComplexF) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vComplexF::Nsimd())) << std::endl; - std::cout << GridLogMessage << "\tvComplexD : " << sizeof(vComplexD) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vComplexD::Nsimd())) << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage << "\tOpenMP threads : " << GridThread::GetThreads() + << std::endl; + std::cout << GridLogMessage + << "\tMPI tasks : " << GridCmdVectorIntToString(GridDefaultMpi()) + << std::endl; + std::cout << GridLogMessage << "\tvReal : " << sizeof(vReal) * 8 << "bits ; " + << GridCmdVectorIntToString(GridDefaultSimd(4, vReal::Nsimd())) + << std::endl; + std::cout << GridLogMessage << "\tvRealF : " << sizeof(vRealF) * 8 + << "bits ; " + << GridCmdVectorIntToString(GridDefaultSimd(4, vRealF::Nsimd())) + << std::endl; + std::cout << GridLogMessage << "\tvRealD : " << sizeof(vRealD) * 8 + << "bits ; " + << GridCmdVectorIntToString(GridDefaultSimd(4, vRealD::Nsimd())) + << std::endl; + std::cout << GridLogMessage << "\tvComplex : " << sizeof(vComplex) * 8 + << "bits ; " + << GridCmdVectorIntToString(GridDefaultSimd(4, vComplex::Nsimd())) + << std::endl; + std::cout << GridLogMessage << "\tvComplexF : " << sizeof(vComplexF) * 8 + << "bits ; " + << GridCmdVectorIntToString(GridDefaultSimd(4, vComplexF::Nsimd())) + << std::endl; + std::cout << GridLogMessage << "\tvComplexD : " << sizeof(vComplexD) * 8 + << "bits ; " + << GridCmdVectorIntToString(GridDefaultSimd(4, vComplexD::Nsimd())) + << std::endl; + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; } static void Comms(void) @@ -115,18 +141,23 @@ class Benchmark std::vector t_time(Nloop); time_statistics timestat; - std::cout << GridLogMessage << "====================================================================================================" << std::endl; - std::cout << GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in " << nmu << " dimensions" << std::endl; - std::cout << GridLogMessage << "====================================================================================================" << std::endl; + std::cout << GridLogMessage + << "=======================================================================" + "=============================" + << std::endl; + std::cout << GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in " + << nmu << " dimensions" << std::endl; + std::cout << GridLogMessage + << "=======================================================================" + "=============================" + << std::endl; comms_header(); for (int lat = 16; lat <= maxlat; lat += 8) { int Ls = 12; - Coordinate latt_size({lat * mpi_layout[0], - lat * mpi_layout[1], - lat * mpi_layout[2], + Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2], lat * mpi_layout[3]}); GridCartesian Grid(latt_size, simd_layout, mpi_layout); @@ -173,8 +204,7 @@ class Benchmark Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); } Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, - (void *)&rbuf[dir][0], recv_from_rank, - bytes); + (void *)&rbuf[dir][0], recv_from_rank, bytes); dbytes += bytes; double stop = usecond(); @@ -186,12 +216,14 @@ class Benchmark double xbytes = dbytes * 0.5; double bidibytes = dbytes; - std::cout << GridLogMessage << lat << "\t" << Ls << "\t " - << bytes << " \t " - << xbytes / timestat.mean << " \t " << xbytes * timestat.err / (timestat.mean * timestat.mean) << " \t " - << xbytes / timestat.max << " " << xbytes / timestat.min - << "\t\t" << bidibytes / timestat.mean << " " << bidibytes * timestat.err / (timestat.mean * timestat.mean) << " " - << bidibytes / timestat.max << " " << bidibytes / timestat.min << std::endl; + std::cout << GridLogMessage << lat << "\t" << Ls << "\t " << bytes << " \t " + << xbytes / timestat.mean << " \t " + << xbytes * timestat.err / (timestat.mean * timestat.mean) << " \t " + << xbytes / timestat.max << " " << xbytes / timestat.min << "\t\t" + << bidibytes / timestat.mean << " " + << bidibytes * timestat.err / (timestat.mean * timestat.mean) << " " + << bidibytes / timestat.max << " " << bidibytes / timestat.min + << std::endl; } } for (int d = 0; d < 8; d++) @@ -212,9 +244,15 @@ class Benchmark Coordinate simd_layout = GridDefaultSimd(Nd, vReal::Nsimd()); Coordinate mpi_layout = GridDefaultMpi(); - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; std::cout << GridLogMessage << "= Benchmarking a*x + y bandwidth" << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; std::cout << GridLogMessage << " L " << "\t\t" << "bytes" @@ -224,7 +262,9 @@ class Benchmark << "Gflop/s" << "\t\t seconds" << "\t\tGB/s / node" << std::endl; - std::cout << GridLogMessage << "----------------------------------------------------------" << std::endl; + std::cout << GridLogMessage + << "----------------------------------------------------------" + << std::endl; // uint64_t NP; uint64_t NN; @@ -237,7 +277,8 @@ class Benchmark for (int lat = 8; lat <= lmax; lat += 8) { - Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2], lat * mpi_layout[3]}); + Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2], + lat * mpi_layout[3]}); int64_t vol = latt_size[0] * latt_size[1] * latt_size[2] * latt_size[3]; GridCartesian Grid(latt_size, simd_layout, mpi_layout); @@ -268,9 +309,10 @@ class Benchmark double flops = vol * Nvec * 2; // mul,add double bytes = 3.0 * vol * Nvec * sizeof(Real); - std::cout << GridLogMessage << std::setprecision(3) - << lat << "\t\t" << bytes << " \t\t" << bytes / time << "\t\t" << flops / time << "\t\t" << (stop - start) / 1000. / 1000. - << "\t\t" << bytes / time / NN << std::endl; + std::cout << GridLogMessage << std::setprecision(3) << lat << "\t\t" << bytes + << " \t\t" << bytes / time << "\t\t" << flops / time << "\t\t" + << (stop - start) / 1000. / 1000. << "\t\t" << bytes / time / NN + << std::endl; } }; @@ -282,9 +324,15 @@ class Benchmark Coordinate simd_layout = GridDefaultSimd(Nd, vComplexF::Nsimd()); Coordinate mpi_layout = GridDefaultMpi(); - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; std::cout << GridLogMessage << "= Benchmarking z = y*x SU(4) bandwidth" << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; std::cout << GridLogMessage << " L " << "\t\t" << "bytes" @@ -294,7 +342,9 @@ class Benchmark << "Gflop/s" << "\t\t seconds" << "\t\tGB/s / node" << std::endl; - std::cout << GridLogMessage << "----------------------------------------------------------" << std::endl; + std::cout << GridLogMessage + << "----------------------------------------------------------" + << std::endl; uint64_t NN; @@ -305,7 +355,8 @@ class Benchmark for (int lat = 8; lat <= lmax; lat += 8) { - Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2], lat * mpi_layout[3]}); + Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2], + lat * mpi_layout[3]}); int64_t vol = latt_size[0] * latt_size[1] * latt_size[2] * latt_size[3]; GridCartesian Grid(latt_size, simd_layout, mpi_layout); @@ -332,9 +383,10 @@ class Benchmark double flops = vol * Nc4 * Nc4 * (6 + (Nc4 - 1) * 8); // mul,add double bytes = 3.0 * vol * Nc4 * Nc4 * 2 * sizeof(RealF); - std::cout << GridLogMessage << std::setprecision(3) - << lat << "\t\t" << bytes << " \t\t" << bytes / time << "\t\t" << flops / time << "\t\t" << (stop - start) / 1000. / 1000. - << "\t\t" << bytes / time / NN << std::endl; + std::cout << GridLogMessage << std::setprecision(3) << lat << "\t\t" << bytes + << " \t\t" << bytes / time << "\t\t" << flops / time << "\t\t" + << (stop - start) / 1000. / 1000. << "\t\t" << bytes / time / NN + << std::endl; } }; @@ -355,31 +407,41 @@ class Benchmark Coordinate mpi = GridDefaultMpi(); assert(mpi.size() == 4); Coordinate local({L, L, L, L}); - Coordinate latt4({local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]}); + Coordinate latt4( + {local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]}); - GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4, - GridDefaultSimd(Nd, vComplex::Nsimd()), - GridDefaultMpi()); + GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid( + latt4, GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi()); uint64_t NP = TmpGrid->RankCount(); uint64_t NN = TmpGrid->NodeCount(); NN_global = NN; uint64_t SHM = NP / NN; ///////// Welcome message //////////// - std::cout << GridLogMessage << "==================================================================================" << std::endl; - std::cout << GridLogMessage << "Benchmark DWF on " << L << "^4 local volume " << std::endl; + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; + std::cout << GridLogMessage << "Benchmark DWF on " << L << "^4 local volume " + << std::endl; std::cout << GridLogMessage << "* Nc : " << Nc << std::endl; - std::cout << GridLogMessage << "* Global volume : " << GridCmdVectorIntToString(latt4) << std::endl; + std::cout << GridLogMessage + << "* Global volume : " << GridCmdVectorIntToString(latt4) << std::endl; std::cout << GridLogMessage << "* Ls : " << Ls << std::endl; std::cout << GridLogMessage << "* ranks : " << NP << std::endl; std::cout << GridLogMessage << "* nodes : " << NN << std::endl; std::cout << GridLogMessage << "* ranks/node : " << SHM << std::endl; - std::cout << GridLogMessage << "* ranks geom : " << GridCmdVectorIntToString(mpi) << std::endl; + std::cout << GridLogMessage << "* ranks geom : " << GridCmdVectorIntToString(mpi) + << std::endl; std::cout << GridLogMessage << "* Using " << threads << " threads" << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; ///////// Lattice Init //////////// - GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi()); + GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid( + latt4, GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi()); GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid); GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid); @@ -418,10 +480,14 @@ class Benchmark std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S "); controls Cases[] = { - {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicyConcurrent}, - {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicyConcurrent}, - {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicySequential}, - {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicySequential}}; + {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsThenCompute, + CartesianCommunicator::CommunicatorPolicyConcurrent}, + {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsAndCompute, + CartesianCommunicator::CommunicatorPolicyConcurrent}, + {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsThenCompute, + CartesianCommunicator::CommunicatorPolicySequential}, + {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsAndCompute, + CartesianCommunicator::CommunicatorPolicySequential}}; for (int c = 0; c < num_cases; c++) { @@ -430,7 +496,10 @@ class Benchmark WilsonKernelsStatic::Opt = Cases[c].Opt; CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch); - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "===================================================================" + "===============" + << std::endl; if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric) std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl; if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute) @@ -438,7 +507,10 @@ class Benchmark if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage << "* Using sequential Comms/Compute" << std::endl; std::cout << GridLogMessage << "* SINGLE precision " << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "===================================================================" + "===============" + << std::endl; int nwarm = 10; double t0 = usecond(); @@ -453,7 +525,8 @@ class Benchmark FGrid->Broadcast(0, &ncall, sizeof(ncall)); - // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<RankCount(); uint64_t NN = TmpGrid->NodeCount(); NN_global = NN; uint64_t SHM = NP / NN; ///////// Welcome message //////////// - std::cout << GridLogMessage << "==================================================================================" << std::endl; - std::cout << GridLogMessage << "Benchmark ImprovedStaggered on " << L << "^4 local volume " << std::endl; - std::cout << GridLogMessage << "* Global volume : " << GridCmdVectorIntToString(latt4) << std::endl; + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; + std::cout << GridLogMessage << "Benchmark ImprovedStaggered on " << L + << "^4 local volume " << std::endl; + std::cout << GridLogMessage + << "* Global volume : " << GridCmdVectorIntToString(latt4) << std::endl; std::cout << GridLogMessage << "* ranks : " << NP << std::endl; std::cout << GridLogMessage << "* nodes : " << NN << std::endl; std::cout << GridLogMessage << "* ranks/node : " << SHM << std::endl; - std::cout << GridLogMessage << "* ranks geom : " << GridCmdVectorIntToString(mpi) << std::endl; + std::cout << GridLogMessage << "* ranks geom : " << GridCmdVectorIntToString(mpi) + << std::endl; std::cout << GridLogMessage << "* Using " << threads << " threads" << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; ///////// Lattice Init //////////// - GridCartesian *FGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi()); + GridCartesian *FGrid = SpaceTimeGrid::makeFourDimGrid( + latt4, GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi()); GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid); ///////// RNG Init //////////// @@ -599,10 +697,14 @@ class Benchmark std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S "); controls Cases[] = { - {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicyConcurrent}, - {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicyConcurrent}, - {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicySequential}, - {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicySequential}}; + {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsThenCompute, + CartesianCommunicator::CommunicatorPolicyConcurrent}, + {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsAndCompute, + CartesianCommunicator::CommunicatorPolicyConcurrent}, + {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsThenCompute, + CartesianCommunicator::CommunicatorPolicySequential}, + {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsAndCompute, + CartesianCommunicator::CommunicatorPolicySequential}}; for (int c = 0; c < num_cases; c++) { @@ -611,15 +713,22 @@ class Benchmark StaggeredKernelsStatic::Opt = Cases[c].Opt; CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch); - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "===================================================================" + "===============" + << std::endl; if (StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric) - std::cout << GridLogMessage << "* Using GENERIC Nc StaggeredKernels" << std::endl; + std::cout << GridLogMessage << "* Using GENERIC Nc StaggeredKernels" + << std::endl; if (StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute) std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl; if (StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsThenCompute) std::cout << GridLogMessage << "* Using sequential Comms/Compute" << std::endl; std::cout << GridLogMessage << "* SINGLE precision " << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "===================================================================" + "===============" + << std::endl; int nwarm = 10; double t0 = usecond(); @@ -634,7 +743,8 @@ class Benchmark FGrid->Broadcast(0, &ncall, sizeof(ncall)); - // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<({8, 2, 2, 2}); #else @@ -717,83 +842,148 @@ int main(int argc, char **argv) std::vector staggered; int Ls = 1; - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "=========================================================================" + "=========" + << std::endl; std::cout << GridLogMessage << " Wilson dslash 4D vectorised" << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "=========================================================================" + "=========" + << std::endl; for (int l = 0; l < L_list.size(); l++) { wilson.push_back(Benchmark::DWF(Ls, L_list[l])); } Ls = 12; - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "=========================================================================" + "=========" + << std::endl; std::cout << GridLogMessage << " Domain wall dslash 4D vectorised" << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "=========================================================================" + "=========" + << std::endl; for (int l = 0; l < L_list.size(); l++) { double result = Benchmark::DWF(Ls, L_list[l]); dwf4.push_back(result); } - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "=========================================================================" + "=========" + << std::endl; std::cout << GridLogMessage << " Improved Staggered dslash 4D vectorised" << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "=========================================================================" + "=========" + << std::endl; for (int l = 0; l < L_list.size(); l++) { double result = Benchmark::Staggered(L_list[l]); staggered.push_back(result); } - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "=========================================================================" + "=========" + << std::endl; std::cout << GridLogMessage << " Summary table Ls=" << Ls << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "=========================================================================" + "=========" + << std::endl; std::cout << GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\t Staggered" << std::endl; for (int l = 0; l < L_list.size(); l++) { - std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] << " \t\t " << dwf4[l] << " \t\t " << staggered[l] << std::endl; + std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] << " \t\t " + << dwf4[l] << " \t\t " << staggered[l] << std::endl; } - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "=========================================================================" + "=========" + << std::endl; int NN = NN_global; if (do_memory) { - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; std::cout << GridLogMessage << " Memory benchmark " << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; Benchmark::Memory(); } if (do_su4) { - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; std::cout << GridLogMessage << " SU(4) benchmark " << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; Benchmark::SU4(); } if (do_comms) { - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; std::cout << GridLogMessage << " Communications benchmark " << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "=======================================================================" + "===========" + << std::endl; Benchmark::Comms(); } - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "=========================================================================" + "=========" + << std::endl; std::cout << GridLogMessage << " Per Node Summary table Ls=" << Ls << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "=========================================================================" + "=========" + << std::endl; std::cout << GridLogMessage << " L \t\t Wilson\t\t DWF4\t\t Staggered " << std::endl; for (int l = 0; l < L_list.size(); l++) { - std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] / NN << " \t " << dwf4[l] / NN << " \t " << staggered[l] / NN << std::endl; + std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] / NN << " \t " + << dwf4[l] / NN << " \t " << staggered[l] / NN << std::endl; } - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "=========================================================================" + "=========" + << std::endl; - std::cout << GridLogMessage << "==================================================================================" << std::endl; - std::cout << GridLogMessage << " Comparison point result: " << 0.5 * (dwf4[sel] + dwf4[selm1]) / NN << " Mflop/s per node" << std::endl; - std::cout << GridLogMessage << " Comparison point is 0.5*(" << dwf4[sel] / NN << "+" << dwf4[selm1] / NN << ") " << std::endl; + std::cout << GridLogMessage + << "=========================================================================" + "=========" + << std::endl; + std::cout << GridLogMessage + << " Comparison point result: " << 0.5 * (dwf4[sel] + dwf4[selm1]) / NN + << " Mflop/s per node" << std::endl; + std::cout << GridLogMessage << " Comparison point is 0.5*(" << dwf4[sel] / NN << "+" + << dwf4[selm1] / NN << ") " << std::endl; std::cout << std::setprecision(3); - std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage + << "=========================================================================" + "=========" + << std::endl; Grid_finalize(); } diff --git a/Grid/Benchmark_IO.hpp b/Grid/Benchmark_IO.hpp index 24e2214..d71e943 100644 --- a/Grid/Benchmark_IO.hpp +++ b/Grid/Benchmark_IO.hpp @@ -20,9 +20,9 @@ along with this program. If not, see . #include #define MSG std::cout << GridLogMessage -#define SEP \ +#define SEP \ "-----------------------------------------------------------------------------" -#define BIGSEP \ +#define BIGSEP \ "=============================================================================" #ifdef HAVE_LIME @@ -36,16 +36,15 @@ namespace Grid // AP 06/10/2020: Standard C version in case one is suspicious of the C++ API // - // template - // void stdWrite(const std::string filestem, Field &vec) + // template void stdWrite(const std::string filestem, Field &vec) // { - // std::string rankStr = std::to_string(vec.Grid()->ThisRank()); - // std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "wb"); - // size_t size; - // uint32_t crc; + // std::string rankStr = std::to_string(vec.Grid()->ThisRank()); + // std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "wb"); + // size_t size; + // uint32_t crc; // GridStopWatch ioWatch, crcWatch; - // size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object); + // size = vec.Grid()->lSites() * sizeof(typename Field::scalar_object); // autoView(vec_v, vec, CpuRead); // crcWatch.Start(); // crc = GridChecksum::crc32(vec_v.cpu_ptr, size); @@ -53,36 +52,39 @@ namespace Grid // crcWatch.Stop(); // MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl; // ioWatch.Start(); - // std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file); + // std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), + // vec.Grid()->lSites(), file); // ioWatch.Stop(); // std::fclose(file); // size *= vec.Grid()->ProcessorCount(); // auto &p = BinaryIO::lastPerf; - // p.size = size; - // p.time = ioWatch.useconds(); - // p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6); + // p.size = size; + // p.time = ioWatch.useconds(); + // p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6); // MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() - // << ", " << p.mbytesPerSecond << " MB/s" << std::endl; + // << ", + // " + // << p.mbytesPerSecond << " MB/s" << std::endl; // MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl; // } - // - // template - // void stdRead(Field &vec, const std::string filestem) + + // template void stdRead(Field &vec, const std::string filestem) // { - // std::string rankStr = std::to_string(vec.Grid()->ThisRank()); - // std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "rb"); - // size_t size; - // uint32_t crcRead, crcData; + // std::string rankStr = std::to_string(vec.Grid()->ThisRank()); + // std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "rb"); + // size_t size; + // uint32_t crcRead, crcData; // GridStopWatch ioWatch, crcWatch; - // size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object); + // size = vec.Grid()->lSites() * sizeof(typename Field::scalar_object); // crcWatch.Start(); // std::fread(&crcRead, sizeof(uint32_t), 1, file); // crcWatch.Stop(); // { // autoView(vec_v, vec, CpuWrite); // ioWatch.Start(); - // std::fread(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file); + // std::fread(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), + // vec.Grid()->lSites(), file); // ioWatch.Stop(); // std::fclose(file); // } @@ -96,19 +98,19 @@ namespace Grid // assert(crcData == crcRead); // size *= vec.Grid()->ProcessorCount(); // auto &p = BinaryIO::lastPerf; - // p.size = size; - // p.time = ioWatch.useconds(); - // p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6); - // MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() - // << ", " << p.mbytesPerSecond << " MB/s" << std::endl; + // p.size = size; + // p.time = ioWatch.useconds(); + // p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6); + // MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() << ", " + // << p.mbytesPerSecond << " MB/s" << std::endl; // MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl; // } - template - void stdWrite(const std::string filestem, Field &vec) + template void stdWrite(const std::string filestem, Field &vec) { std::string rankStr = std::to_string(vec.Grid()->ThisRank()); - std::ofstream file(filestem + "." + rankStr + ".bin", std::ios::out | std::ios::binary); + std::ofstream file(filestem + "." + rankStr + ".bin", + std::ios::out | std::ios::binary); size_t size, sizec; uint32_t crc; GridStopWatch ioWatch, crcWatch; @@ -130,16 +132,16 @@ namespace Grid p.size = size; p.time = ioWatch.useconds(); p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6); - MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() - << ", " << p.mbytesPerSecond << " MB/s" << std::endl; + MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() << ", " + << p.mbytesPerSecond << " MB/s" << std::endl; MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl; } - template - void stdRead(Field &vec, const std::string filestem) + template void stdRead(Field &vec, const std::string filestem) { std::string rankStr = std::to_string(vec.Grid()->ThisRank()); - std::ifstream file(filestem + "." + rankStr + ".bin", std::ios::in | std::ios::binary); + std::ifstream file(filestem + "." + rankStr + ".bin", + std::ios::in | std::ios::binary); size_t size, sizec; uint32_t crcRead, crcData; GridStopWatch ioWatch, crcWatch; @@ -168,13 +170,12 @@ namespace Grid p.size = size; p.time = ioWatch.useconds(); p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6); - MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() - << ", " << p.mbytesPerSecond << " MB/s" << std::endl; + MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() << ", " + << p.mbytesPerSecond << " MB/s" << std::endl; MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl; } - template - void limeWrite(const std::string filestem, Field &vec) + template void limeWrite(const std::string filestem, Field &vec) { emptyUserRecord record; ScidacWriter binWriter(vec.Grid()->IsBoss()); @@ -184,8 +185,7 @@ namespace Grid binWriter.close(); } - template - void limeRead(Field &vec, const std::string filestem) + template void limeRead(Field &vec, const std::string filestem) { emptyUserRecord record; ScidacReader binReader; @@ -225,12 +225,13 @@ namespace Grid template void writeBenchmark(const Coordinate &latt, const std::string filename, - const WriterFn &write, - const unsigned int Ls = 1, const bool rb = false) + const WriterFn &write, const unsigned int Ls = 1, + const bool rb = false) { auto mpi = GridDefaultMpi(); auto simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd()); - std::shared_ptr gBasePt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi)); + std::shared_ptr gBasePt( + SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi)); std::shared_ptr gPt; std::random_device rd; @@ -251,12 +252,13 @@ namespace Grid template void readBenchmark(const Coordinate &latt, const std::string filename, - const ReaderFn &read, - const unsigned int Ls = 1, const bool rb = false) + const ReaderFn &read, const unsigned int Ls = 1, + const bool rb = false) { auto mpi = GridDefaultMpi(); auto simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd()); - std::shared_ptr gBasePt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi)); + std::shared_ptr gBasePt( + SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi)); std::shared_ptr gPt; makeGrid(gPt, gBasePt, Ls, rb);