diff --git a/Grid/Benchmark_Grid.cpp b/Grid/Benchmark_Grid.cpp
index 262e0eb..d8b8de9 100644
--- a/Grid/Benchmark_Grid.cpp
+++ b/Grid/Benchmark_Grid.cpp
@@ -264,6 +264,97 @@ class Benchmark
     return;
   }
 
+  /// Measure point-to-point MPI latency between every ordered pair of ranks.
+  ///
+  /// For every pair (from, to) with from != to, an 8-byte message is sent from
+  /// `from` to `to` and straight back, so each sample timed with usecond() is a
+  /// round trip. The first Nwarmup iterations are discarded; the remaining Nloop
+  /// samples are reduced with time_statistics, printed, and appended to
+  /// json_results["latency"].
+  static void Latency(void)
+  {
+    int Nwarmup = 100;
+    int Nloop = 1000;
+
+    Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
+    Coordinate mpi_layout = GridDefaultMpi();
+
+    std::cout << GridLogMessage << "Benchmarking point-to-point latency" << std::endl;
+    grid_small_sep();
+    grid_printf("from to mean(usec) err min\n");
+
+    int lat = 8; // dummy lattice size. Not actually used.
+    Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
+                          lat * mpi_layout[3]});
+
+    GridCartesian Grid(latt_size, simd_layout, mpi_layout);
+
+    int ranks;
+    int me;
+    MPI_Comm_size(Grid.communicator, &ranks);
+    MPI_Comm_rank(Grid.communicator, &me);
+    assert(ranks == Grid._Nprocessors);
+    assert(me == Grid._processor);
+
+    int bytes = 8;
+    void *buf_from = acceleratorAllocDevice(bytes);
+    void *buf_to = acceleratorAllocDevice(bytes);
+    nlohmann::json json_latency;
+    for (int from = 0; from < ranks; ++from)
+      for (int to = 0; to < ranks; ++to)
+      {
+        if (from == to)
+          continue;
+
+        // Element type must be double: samples are double and are broadcast as MPI_DOUBLE.
+        std::vector<double> t_time(Nloop);
+        time_statistics timestat;
+        MPI_Status status;
+
+        // Negative indices are warm-up iterations whose timings are not recorded.
+        for (int i = -Nwarmup; i < Nloop; ++i)
+        {
+          double start = usecond();
+          if (from == me)
+          {
+            auto err = MPI_Send(buf_from, bytes, MPI_CHAR, to, 0, Grid.communicator);
+            assert(err == MPI_SUCCESS);
+            err = MPI_Recv(buf_to, bytes, MPI_CHAR, to, 0, Grid.communicator, &status);
+            assert(err == MPI_SUCCESS);
+          }
+          if (to == me)
+          {
+            auto err =
+                MPI_Recv(buf_to, bytes, MPI_CHAR, from, 0, Grid.communicator, &status);
+            assert(err == MPI_SUCCESS);
+            err = MPI_Send(buf_from, bytes, MPI_CHAR, from, 0, Grid.communicator);
+            assert(err == MPI_SUCCESS);
+          }
+          double stop = usecond();
+          if (i >= 0)
+            t_time[i] = stop - start;
+        }
+        // important: only the 'from' rank has a trustworthy time
+        MPI_Bcast(t_time.data(), Nloop, MPI_DOUBLE, from, Grid.communicator);
+
+        timestat.statistics(t_time);
+        grid_printf("%2d %2d %15.2f %15.1f %15.2f\n", from, to, timestat.mean,
+                    timestat.err, timestat.min);
+        nlohmann::json tmp;
+        tmp["from"] = from;
+        tmp["to"] = to;
+        tmp["time_usec"] = timestat.mean;
+        tmp["time_usec_error"] = timestat.err;
+        // The stored value is the minimum, so the key is named accordingly.
+        tmp["time_usec_min"] = timestat.min;
+        json_latency.push_back(tmp);
+      }
+    json_results["latency"] = json_latency;
+
+    acceleratorFreeDevice(buf_from);
+    acceleratorFreeDevice(buf_to);
+  }
+
   static void Memory(void)
   {
     const int Nvec = 8;
@@ -805,6 +896,7 @@ int main(int argc, char **argv)
   int do_su4 = 1;
   int do_memory = 1;
   int do_comms = 1;
+  int do_latency = 1;
   int do_flops = 1;
   int Ls = 1;
 
@@ -840,6 +932,14 @@ int main(int argc, char **argv)
     Benchmark::Comms();
   }
 
+  if (do_latency)
+  {
+    grid_big_sep();
+    std::cout << GridLogMessage << " Latency benchmark " << std::endl;
+    grid_big_sep();
+    Benchmark::Latency();
+  }
+
   if (do_flops)
   {
     Ls = 1;