Compare commits
	
		
			1 Commits
		
	
	
		
			main
			...
			ee07f3c892
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| ee07f3c892 | 
| @@ -1,7 +1,7 @@ | |||||||
| /* | /* | ||||||
| Copyright © 2015 Peter Boyle <paboyle@ph.ed.ac.uk> | Copyright © 2015 Peter Boyle <paboyle@ph.ed.ac.uk> | ||||||
| Copyright © 2022 Antonin Portelli <antonin.portelli@me.com> | Copyright © 2022 Antonin Portelli <antonin.portelli@me.com> | ||||||
| Copyright © 2024 Simon Buerger <simon.buerger@rwth-aachen.de> | Copyright © 2022 Simon Buerger <simon.buerger@rwth-aachen.de> | ||||||
|  |  | ||||||
| This is a fork of Benchmark_ITT.cpp from Grid | This is a fork of Benchmark_ITT.cpp from Grid | ||||||
|  |  | ||||||
| @@ -29,43 +29,6 @@ int NN_global; | |||||||
|  |  | ||||||
| nlohmann::json json_results; | nlohmann::json json_results; | ||||||
|  |  | ||||||
| // NOTE: Grid::GridClock is just a typedef to |  | ||||||
| // `std::chrono::high_resolution_clock`, but `Grid::usecond` rounds to |  | ||||||
| // microseconds (no idea why, probably wasnt ever relevant before), so we need |  | ||||||
| // our own wrapper here. |  | ||||||
| double usecond_precise() |  | ||||||
| { |  | ||||||
|   using namespace std::chrono; |  | ||||||
|   auto nsecs = duration_cast<nanoseconds>(GridClock::now() - Grid::theProgramStart); |  | ||||||
|   return nsecs.count() * 1e-3; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| std::vector<std::string> get_mpi_hostnames() |  | ||||||
| { |  | ||||||
|   int world_size; |  | ||||||
|   MPI_Comm_size(MPI_COMM_WORLD, &world_size); |  | ||||||
|  |  | ||||||
|   char hostname[MPI_MAX_PROCESSOR_NAME]; |  | ||||||
|   int name_len = 0; |  | ||||||
|   MPI_Get_processor_name(hostname, &name_len); |  | ||||||
|  |  | ||||||
|   // Allocate buffer to gather all hostnames |  | ||||||
|   std::vector<char> all_hostnames(world_size * MPI_MAX_PROCESSOR_NAME); |  | ||||||
|  |  | ||||||
|   // Use MPI_Allgather to gather all hostnames on all ranks |  | ||||||
|   MPI_Allgather(hostname, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, all_hostnames.data(), |  | ||||||
|                 MPI_MAX_PROCESSOR_NAME, MPI_CHAR, MPI_COMM_WORLD); |  | ||||||
|  |  | ||||||
|   // Convert the gathered hostnames back into a vector of std::string |  | ||||||
|   std::vector<std::string> hostname_list(world_size); |  | ||||||
|   for (int i = 0; i < world_size; ++i) |  | ||||||
|   { |  | ||||||
|     hostname_list[i] = std::string(&all_hostnames[i * MPI_MAX_PROCESSOR_NAME]); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   return hostname_list; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| struct time_statistics | struct time_statistics | ||||||
| { | { | ||||||
|   double mean; |   double mean; | ||||||
| @@ -110,7 +73,7 @@ class Benchmark | |||||||
|         {local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]}); |         {local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]}); | ||||||
|     GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid( |     GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid( | ||||||
|         latt4, GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi()); |         latt4, GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi()); | ||||||
|     Grid::Coordinate shm(4, 1); |     Grid::Coordinate shm; | ||||||
|     GlobalSharedMemory::GetShmDims(mpi, shm); |     GlobalSharedMemory::GetShmDims(mpi, shm); | ||||||
|  |  | ||||||
|     uint64_t NP = TmpGrid->RankCount(); |     uint64_t NP = TmpGrid->RankCount(); | ||||||
| @@ -174,7 +137,7 @@ class Benchmark | |||||||
|  |  | ||||||
|     Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd()); |     Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd()); | ||||||
|     Coordinate mpi_layout = GridDefaultMpi(); |     Coordinate mpi_layout = GridDefaultMpi(); | ||||||
|     Coordinate shm_layout(Nd, 1); |     Coordinate shm_layout; | ||||||
|     GlobalSharedMemory::GetShmDims(mpi_layout, shm_layout); |     GlobalSharedMemory::GetShmDims(mpi_layout, shm_layout); | ||||||
|  |  | ||||||
|     for (int mu = 0; mu < Nd; mu++) |     for (int mu = 0; mu < Nd; mu++) | ||||||
| @@ -273,7 +236,7 @@ class Benchmark | |||||||
|         double rate = bidibytes / (timestat.mean / 1.e6) / 1024. / 1024. / 1024.; |         double rate = bidibytes / (timestat.mean / 1.e6) / 1024. / 1024. / 1024.; | ||||||
|         double rate_err = rate * timestat.err / timestat.mean; |         double rate_err = rate * timestat.err / timestat.mean; | ||||||
|         double rate_max = rate * timestat.mean / timestat.min; |         double rate_max = rate * timestat.mean / timestat.min; | ||||||
|         grid_printf("%5d %5d %7s %15llu %15.2f %15.2f %15.1f %15.2f\n", lat, dir, |         grid_printf("%5d %5d %7s %15d %15.2f %15.2f %15.1f %15.2f\n", lat, dir, | ||||||
|                     is_shm           ? "yes" |                     is_shm           ? "yes" | ||||||
|                     : is_partial_shm ? "partial" |                     : is_partial_shm ? "partial" | ||||||
|                                      : "no", |                                      : "no", | ||||||
| @@ -303,166 +266,124 @@ class Benchmark | |||||||
|  |  | ||||||
|   static void Latency(void) |   static void Latency(void) | ||||||
|   { |   { | ||||||
|     int Nwarmup = 100; |     int Nloop = 200; | ||||||
|     int Nloop = 300; |     int nmu = 0; | ||||||
|  |  | ||||||
|     std::cout << GridLogMessage << "Benchmarking point-to-point latency" << std::endl; |     Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd()); | ||||||
|     grid_small_sep(); |     Coordinate mpi_layout = GridDefaultMpi(); | ||||||
|     grid_printf("from to      mean(usec)           err           max\n"); |     Coordinate shm_layout; | ||||||
|  |     GlobalSharedMemory::GetShmDims(mpi_layout, shm_layout); | ||||||
|  |  | ||||||
|     int ranks; |     for (int mu = 0; mu < Nd; mu++) | ||||||
|     int me; |       if (mpi_layout[mu] > 1) | ||||||
|     MPI_Comm_size(MPI_COMM_WORLD, &ranks); |         nmu++; | ||||||
|     MPI_Comm_rank(MPI_COMM_WORLD, &me); |  | ||||||
|  |  | ||||||
|     int bytes = 8; |  | ||||||
|     void *buf_from = acceleratorAllocDevice(bytes); |  | ||||||
|     void *buf_to = acceleratorAllocDevice(bytes); |  | ||||||
|     nlohmann::json json_latency; |  | ||||||
|     for (int from = 0; from < ranks; ++from) |  | ||||||
|       for (int to = 0; to < ranks; ++to) |  | ||||||
|       { |  | ||||||
|         if (from == to) |  | ||||||
|           continue; |  | ||||||
|  |  | ||||||
|     std::vector<double> t_time(Nloop); |     std::vector<double> t_time(Nloop); | ||||||
|     time_statistics timestat; |     time_statistics timestat; | ||||||
|         MPI_Status status; |  | ||||||
|  |  | ||||||
|         for (int i = -Nwarmup; i < Nloop; ++i) |     std::cout << GridLogMessage << "Benchmarking Latency to neighbors in " << nmu | ||||||
|         { |               << " dimensions" << std::endl; | ||||||
|           double start = usecond_precise(); |  | ||||||
|           if (from == me) |  | ||||||
|           { |  | ||||||
|             auto err = MPI_Send(buf_from, bytes, MPI_CHAR, to, 0, MPI_COMM_WORLD); |  | ||||||
|             assert(err == MPI_SUCCESS); |  | ||||||
|           } |  | ||||||
|           if (to == me) |  | ||||||
|           { |  | ||||||
|             auto err = |  | ||||||
|                 MPI_Recv(buf_to, bytes, MPI_CHAR, from, 0, MPI_COMM_WORLD, &status); |  | ||||||
|             assert(err == MPI_SUCCESS); |  | ||||||
|           } |  | ||||||
|           double stop = usecond_precise(); |  | ||||||
|           if (i >= 0) |  | ||||||
|             t_time[i] = stop - start; |  | ||||||
|         } |  | ||||||
|         // important: only 'from' and 'to' have meaningful timings. we use |  | ||||||
|         // 'from's. |  | ||||||
|         MPI_Bcast(t_time.data(), Nloop, MPI_DOUBLE, from, MPI_COMM_WORLD); |  | ||||||
|  |  | ||||||
|         timestat.statistics(t_time); |  | ||||||
|         grid_printf("%2d %2d %15.4f %15.3f %15.4f\n", from, to, timestat.mean, |  | ||||||
|                     timestat.err, timestat.max); |  | ||||||
|         nlohmann::json tmp; |  | ||||||
|         tmp["from"] = from; |  | ||||||
|         tmp["to"] = to; |  | ||||||
|         tmp["time_usec"] = timestat.mean; |  | ||||||
|         tmp["time_usec_error"] = timestat.err; |  | ||||||
|         tmp["time_usec_min"] = timestat.min; |  | ||||||
|         tmp["time_usec_max"] = timestat.max; |  | ||||||
|         tmp["time_usec_full"] = t_time; |  | ||||||
|         json_latency.push_back(tmp); |  | ||||||
|       } |  | ||||||
|     json_results["latency"] = json_latency; |  | ||||||
|  |  | ||||||
|     acceleratorFreeDevice(buf_from); |  | ||||||
|     acceleratorFreeDevice(buf_to); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   static void P2P(void) |  | ||||||
|   { |  | ||||||
|     // IMPORTANT: The P2P benchmark uses "MPI_COMM_WORLD" communicator, which is |  | ||||||
|     // not the quite the same as Grid.communicator. Practically speaking, the |  | ||||||
|     // latter one contains the same MPI-ranks but in a different order. Grid |  | ||||||
|     // does this make sure it can exploit ranks with shared memory (i.e. |  | ||||||
|     // multiple ranks on the same node) as best as possible. |  | ||||||
|  |  | ||||||
|     // buffer-size to benchmark. This number is the same as the largest one used |  | ||||||
|     // in the "Comms()" benchmark. ( L=48, Ls=12, double-prec-complex, |  | ||||||
|     // half-color-spin-vector. ). Mostly an arbitrary choice, but nice to match |  | ||||||
|     // it here |  | ||||||
|     size_t bytes = 127401984; |  | ||||||
|  |  | ||||||
|     int Nwarmup = 20; |  | ||||||
|     int Nloop = 100; |  | ||||||
|  |  | ||||||
|     std::cout << GridLogMessage << "Benchmarking point-to-point bandwidth" << std::endl; |  | ||||||
|     grid_small_sep(); |     grid_small_sep(); | ||||||
|     grid_printf("from to      mean(usec)           err           min           " |     grid_printf("%5s %7s %15s %15s %15s\n", "dir", "shm", "time (usec)", "std dev", | ||||||
|                 "bytes    rate (GiB/s)\n"); |                 "min"); | ||||||
|  |  | ||||||
|     int ranks; |     int lat = 8; // dummy lattice size. Not really used. | ||||||
|     int me; |     Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2], | ||||||
|     MPI_Comm_size(MPI_COMM_WORLD, &ranks); |                           lat * mpi_layout[3]}); | ||||||
|     MPI_Comm_rank(MPI_COMM_WORLD, &me); |  | ||||||
|  |  | ||||||
|     void *buf_from = acceleratorAllocDevice(bytes); |     GridCartesian Grid(latt_size, simd_layout, mpi_layout); | ||||||
|     void *buf_to = acceleratorAllocDevice(bytes); |     RealD Nrank = Grid._Nprocessors; | ||||||
|     nlohmann::json json_p2p; |     RealD Nnode = Grid.NodeCount(); | ||||||
|     for (int from = 0; from < ranks; ++from) |     RealD ppn = Nrank / Nnode; | ||||||
|       for (int to = 0; to < ranks; ++to) |  | ||||||
|  |     std::vector<HalfSpinColourVectorD *> xbuf(8); | ||||||
|  |     std::vector<HalfSpinColourVectorD *> rbuf(8); | ||||||
|  |     uint64_t bytes = 8; | ||||||
|  |     for (int d = 0; d < 8; d++) | ||||||
|     { |     { | ||||||
|         if (from == to) |       xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); | ||||||
|  |       rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     double dbytes; | ||||||
|  | #define NWARMUP 50 | ||||||
|  |  | ||||||
|  |     for (int dir = 0; dir < 8; dir++) | ||||||
|  |     { | ||||||
|  |       int mu = dir % 4; | ||||||
|  |       if (mpi_layout[mu] == 1) // skip directions that are not distributed | ||||||
|         continue; |         continue; | ||||||
|  |       bool is_shm = mpi_layout[mu] == shm_layout[mu]; | ||||||
|  |       bool is_partial_shm = !is_shm && shm_layout[mu] != 1; | ||||||
|  |  | ||||||
|         std::vector<double> t_time(Nloop); |       std::vector<double> times(Nloop); | ||||||
|         time_statistics timestat; |       for (int i = 0; i < NWARMUP; i++) | ||||||
|         MPI_Status status; |       { | ||||||
|  |         int xmit_to_rank; | ||||||
|  |         int recv_from_rank; | ||||||
|  |  | ||||||
|         for (int i = -Nwarmup; i < Nloop; ++i) |         if (dir == mu) | ||||||
|         { |         { | ||||||
|           double start = usecond_precise(); |           int comm_proc = 1; | ||||||
|           if (from == me) |           Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); | ||||||
|  |         } | ||||||
|  |         else | ||||||
|         { |         { | ||||||
|             auto err = MPI_Send(buf_from, bytes, MPI_CHAR, to, 0, MPI_COMM_WORLD); |           int comm_proc = mpi_layout[mu] - 1; | ||||||
|             assert(err == MPI_SUCCESS); |           Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); | ||||||
|         } |         } | ||||||
|           if (to == me) |         Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, (void *)&rbuf[dir][0], | ||||||
|  |                             recv_from_rank, bytes); | ||||||
|  |       } | ||||||
|  |       for (int i = 0; i < Nloop; i++) | ||||||
|       { |       { | ||||||
|             auto err = |  | ||||||
|                 MPI_Recv(buf_to, bytes, MPI_CHAR, from, 0, MPI_COMM_WORLD, &status); |  | ||||||
|             assert(err == MPI_SUCCESS); |  | ||||||
|           } |  | ||||||
|           double stop = usecond_precise(); |  | ||||||
|           if (i >= 0) |  | ||||||
|             t_time[i] = stop - start; |  | ||||||
|         } |  | ||||||
|         // important: only 'from' and 'to' have meaningful timings. we use |  | ||||||
|         // 'from's. |  | ||||||
|         MPI_Bcast(t_time.data(), Nloop, MPI_DOUBLE, from, MPI_COMM_WORLD); |  | ||||||
|  |  | ||||||
|  |         dbytes = 0; | ||||||
|  |         double start = usecond(); | ||||||
|  |         int xmit_to_rank; | ||||||
|  |         int recv_from_rank; | ||||||
|  |  | ||||||
|  |         if (dir == mu) | ||||||
|  |         { | ||||||
|  |           int comm_proc = 1; | ||||||
|  |           Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); | ||||||
|  |         } | ||||||
|  |         else | ||||||
|  |         { | ||||||
|  |           int comm_proc = mpi_layout[mu] - 1; | ||||||
|  |           Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); | ||||||
|  |         } | ||||||
|  |         Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, (void *)&rbuf[dir][0], | ||||||
|  |                             recv_from_rank, bytes); | ||||||
|  |         dbytes += bytes; | ||||||
|  |  | ||||||
|  |         double stop = usecond(); | ||||||
|  |         t_time[i] = stop - start; // microseconds | ||||||
|  |       } | ||||||
|       timestat.statistics(t_time); |       timestat.statistics(t_time); | ||||||
|         double rate = bytes / (timestat.mean / 1.e6) / 1024. / 1024. / 1024.; |  | ||||||
|         double rate_err = rate * timestat.err / timestat.mean; |  | ||||||
|         double rate_max = rate * timestat.mean / timestat.min; |  | ||||||
|         double rate_min = rate * timestat.mean / timestat.max; |  | ||||||
|  |  | ||||||
|         grid_printf("%2d %2d %15.4f %15.3f %15.4f %15zu %15.2f\n", from, to, timestat.mean, |  | ||||||
|                     timestat.err, timestat.min, bytes, rate); |  | ||||||
|  |  | ||||||
|  |       grid_printf("%5d %7s %15.2f %15.1f %15.2f\n", dir, | ||||||
|  |                   is_shm           ? "yes" | ||||||
|  |                   : is_partial_shm ? "partial" | ||||||
|  |                                    : "no", | ||||||
|  |                   timestat.mean, timestat.err, timestat.min); | ||||||
|       nlohmann::json tmp; |       nlohmann::json tmp; | ||||||
|         tmp["from"] = from; |  | ||||||
|         tmp["to"] = to; |  | ||||||
|         tmp["bytes"] = bytes; |  | ||||||
|         tmp["time_usec"] = timestat.mean; |  | ||||||
|         tmp["time_usec_error"] = timestat.err; |  | ||||||
|         tmp["time_usec_min"] = timestat.min; |  | ||||||
|         tmp["time_usec_max"] = timestat.max; |  | ||||||
|         tmp["time_usec_full"] = t_time; |  | ||||||
|       nlohmann::json tmp_rate; |       nlohmann::json tmp_rate; | ||||||
|         tmp_rate["mean"] = rate; |       tmp["dir"] = dir; | ||||||
|         tmp_rate["error"] = rate_err; |       tmp["shared_mem"] = is_shm; | ||||||
|         tmp_rate["max"] = rate_max; |       tmp["partial_shared_mem"] = is_partial_shm; | ||||||
|         tmp_rate["min"] = rate_min; |       tmp["time_usec"] = timestat.mean; | ||||||
|         tmp["rate_GBps"] = tmp_rate; |       tmp["time_usec_error"] = timestat.err; | ||||||
|  |       tmp["time_usec_max"] = timestat.min; | ||||||
|         json_p2p.push_back(tmp); |       json_results["latency"].push_back(tmp); | ||||||
|  |     } | ||||||
|  |     for (int d = 0; d < 8; d++) | ||||||
|  |     { | ||||||
|  |       acceleratorFreeDevice(xbuf[d]); | ||||||
|  |       acceleratorFreeDevice(rbuf[d]); | ||||||
|     } |     } | ||||||
|     json_results["p2p"] = json_p2p; |  | ||||||
|  |  | ||||||
|     acceleratorFreeDevice(buf_from); |     return; | ||||||
|     acceleratorFreeDevice(buf_to); |  | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   static void Memory(void) |   static void Memory(void) | ||||||
| @@ -726,6 +647,8 @@ class Benchmark | |||||||
|  |  | ||||||
|         FGrid->Broadcast(0, &ncall, sizeof(ncall)); |         FGrid->Broadcast(0, &ncall, sizeof(ncall)); | ||||||
|  |  | ||||||
|  |         Dw.ZeroCounters(); | ||||||
|  |  | ||||||
|         time_statistics timestat; |         time_statistics timestat; | ||||||
|         std::vector<double> t_time(ncall); |         std::vector<double> t_time(ncall); | ||||||
|         for (uint64_t i = 0; i < ncall; i++) |         for (uint64_t i = 0; i < ncall; i++) | ||||||
| @@ -920,6 +843,7 @@ class Benchmark | |||||||
|         uint64_t ncall = 500; |         uint64_t ncall = 500; | ||||||
|  |  | ||||||
|         FGrid->Broadcast(0, &ncall, sizeof(ncall)); |         FGrid->Broadcast(0, &ncall, sizeof(ncall)); | ||||||
|  |         Ds.ZeroCounters(); | ||||||
|  |  | ||||||
|         time_statistics timestat; |         time_statistics timestat; | ||||||
|         std::vector<double> t_time(ncall); |         std::vector<double> t_time(ncall); | ||||||
| @@ -987,54 +911,29 @@ int main(int argc, char **argv) | |||||||
| { | { | ||||||
|   Grid_init(&argc, &argv); |   Grid_init(&argc, &argv); | ||||||
|  |  | ||||||
|   int Ls = 1; |  | ||||||
|   bool do_su4 = true; |  | ||||||
|   bool do_memory = true; |  | ||||||
|   bool do_comms = true; |  | ||||||
|   bool do_flops = true; |  | ||||||
|  |  | ||||||
|   // NOTE: these two take O((number of ranks)^2) time, which might be a lot, so they are |  | ||||||
|   // off by default |  | ||||||
|   bool do_latency = false; |  | ||||||
|   bool do_p2p = false; |  | ||||||
|  |  | ||||||
|   std::string json_filename = ""; // empty indicates no json output |   std::string json_filename = ""; // empty indicates no json output | ||||||
|   for (int i = 0; i < argc; i++) |   for (int i = 0; i < argc; i++) | ||||||
|   { |   { | ||||||
|     auto arg = std::string(argv[i]); |     if (std::string(argv[i]) == "--json-out") | ||||||
|     if (arg == "--json-out") |  | ||||||
|       json_filename = argv[i + 1]; |       json_filename = argv[i + 1]; | ||||||
|     if (arg == "--benchmark-su4") |  | ||||||
|       do_su4 = true; |  | ||||||
|     if (arg == "--benchmark-memory") |  | ||||||
|       do_memory = true; |  | ||||||
|     if (arg == "--benchmark-comms") |  | ||||||
|       do_comms = true; |  | ||||||
|     if (arg == "--benchmark-flops") |  | ||||||
|       do_flops = true; |  | ||||||
|     if (arg == "--benchmark-latency") |  | ||||||
|       do_latency = true; |  | ||||||
|     if (arg == "--benchmark-p2p") |  | ||||||
|       do_p2p = true; |  | ||||||
|     if (arg == "--no-benchmark-su4") |  | ||||||
|       do_su4 = false; |  | ||||||
|     if (arg == "--no-benchmark-memory") |  | ||||||
|       do_memory = false; |  | ||||||
|     if (arg == "--no-benchmark-comms") |  | ||||||
|       do_comms = false; |  | ||||||
|     if (arg == "--no-benchmark-flops") |  | ||||||
|       do_flops = false; |  | ||||||
|     if (arg == "--no-benchmark-latency") |  | ||||||
|       do_latency = false; |  | ||||||
|     if (arg == "--no-benchmark-p2p") |  | ||||||
|       do_p2p = false; |  | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   CartesianCommunicator::SetCommunicatorPolicy( |   CartesianCommunicator::SetCommunicatorPolicy( | ||||||
|       CartesianCommunicator::CommunicatorPolicySequential); |       CartesianCommunicator::CommunicatorPolicySequential); | ||||||
|  | #ifdef KNL | ||||||
|  |   LebesgueOrder::Block = std::vector<int>({8, 2, 2, 2}); | ||||||
|  | #else | ||||||
|  |   LebesgueOrder::Block = std::vector<int>({2, 2, 2, 2}); | ||||||
|  | #endif | ||||||
|   Benchmark::Decomposition(); |   Benchmark::Decomposition(); | ||||||
|  |  | ||||||
|  |   int do_su4 = 1; | ||||||
|  |   int do_memory = 1; | ||||||
|  |   int do_comms = 1; | ||||||
|  |   int do_latency = 1; | ||||||
|  |   int do_flops = 1; | ||||||
|  |   int Ls = 1; | ||||||
|  |  | ||||||
|   int sel = 4; |   int sel = 4; | ||||||
|   std::vector<int> L_list({8, 12, 16, 24, 32}); |   std::vector<int> L_list({8, 12, 16, 24, 32}); | ||||||
|   int selm1 = sel - 1; |   int selm1 = sel - 1; | ||||||
| @@ -1075,14 +974,6 @@ int main(int argc, char **argv) | |||||||
|     Benchmark::Latency(); |     Benchmark::Latency(); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   if (do_p2p) |  | ||||||
|   { |  | ||||||
|     grid_big_sep(); |  | ||||||
|     std::cout << GridLogMessage << " Point-To-Point benchmark " << std::endl; |  | ||||||
|     grid_big_sep(); |  | ||||||
|     Benchmark::P2P(); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   if (do_flops) |   if (do_flops) | ||||||
|   { |   { | ||||||
|     Ls = 1; |     Ls = 1; | ||||||
| @@ -1142,8 +1033,6 @@ int main(int argc, char **argv) | |||||||
|     json_results["flops"] = tmp_flops; |     json_results["flops"] = tmp_flops; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   json_results["hostnames"] = get_mpi_hostnames(); |  | ||||||
|  |  | ||||||
|   if (!json_filename.empty()) |   if (!json_filename.empty()) | ||||||
|   { |   { | ||||||
|     std::cout << GridLogMessage << "writing benchmark results to " << json_filename |     std::cout << GridLogMessage << "writing benchmark results to " << json_filename | ||||||
|   | |||||||
| @@ -1,12 +1,13 @@ | |||||||
| #!/usr/bin/env bash | #!/usr/bin/env bash | ||||||
|  |  | ||||||
| lrank=$OMPI_COMM_WORLD_LOCAL_RANK | lrank=$OMPI_COMM_WORLD_LOCAL_RANK | ||||||
| numa1=$((lrank)) | numa1=$(( 2 * lrank)) | ||||||
|  | numa2=$(( 2 * lrank + 1 )) | ||||||
| netdev=mlx5_${lrank}:1 | netdev=mlx5_${lrank}:1 | ||||||
|  |  | ||||||
| export CUDA_VISIBLE_DEVICES=$OMPI_COMM_WORLD_LOCAL_RANK | export CUDA_VISIBLE_DEVICES=$OMPI_COMM_WORLD_LOCAL_RANK | ||||||
| export UCX_NET_DEVICES=${netdev} | export UCX_NET_DEVICES=${netdev} | ||||||
| BINDING="--interleave=$numa1" | BINDING="--interleave=$numa1,$numa2" | ||||||
|  |  | ||||||
| echo "$(hostname) - $lrank device=$CUDA_VISIBLE_DEVICES binding=$BINDING" | echo "$(hostname) - $lrank device=$CUDA_VISIBLE_DEVICES binding=$BINDING" | ||||||
|  |  | ||||||
|   | |||||||
| @@ -80,7 +80,7 @@ mkdir -p build_gpu; cd build_gpu | |||||||
|              --enable-devel-headers --enable-examples --enable-optimizations   \ |              --enable-devel-headers --enable-examples --enable-optimizations   \ | ||||||
|              --with-gdrcopy=${gdrcopy_path} --with-verbs --disable-logging     \ |              --with-gdrcopy=${gdrcopy_path} --with-verbs --disable-logging     \ | ||||||
|              --disable-debug --disable-assertions --enable-cma                 \ |              --disable-debug --disable-assertions --enable-cma                 \ | ||||||
|              --with-knem=/opt/knem-1.1.4.90mlnx2/ --with-rdmacm                \ |              --with-knem=/opt/knem-1.1.4.90mlnx1/ --with-rdmacm                \ | ||||||
|              --without-rocm --without-ugni --without-java                      \ |              --without-rocm --without-ugni --without-java                      \ | ||||||
|              --enable-compiler-opt=3 --with-cuda="${cuda_path}" --without-cm   \ |              --enable-compiler-opt=3 --with-cuda="${cuda_path}" --without-cm   \ | ||||||
|              --with-rc --with-ud --with-dc --with-mlx5-dv --with-dm            \ |              --with-rc --with-ud --with-dc --with-mlx5-dv --with-dm            \ | ||||||
| @@ -96,7 +96,7 @@ mkdir -p build_cpu; cd build_cpu | |||||||
|              --enable-devel-headers --enable-examples --enable-optimizations   \ |              --enable-devel-headers --enable-examples --enable-optimizations   \ | ||||||
|              --with-verbs --disable-logging --disable-debug                    \ |              --with-verbs --disable-logging --disable-debug                    \ | ||||||
|              --disable-assertions --enable-mt --enable-cma                     \ |              --disable-assertions --enable-mt --enable-cma                     \ | ||||||
|               --with-knem=/opt/knem-1.1.4.90mlnx2/--with-rdmacm                \ |              --with-knem=/opt/knem-1.1.4.90mlnx1/ --with-rdmacm                \ | ||||||
|              --without-rocm --without-ugni --without-java                      \ |              --without-rocm --without-ugni --without-java                      \ | ||||||
|              --enable-compiler-opt=3 --without-cm --without-ugni --with-rc     \ |              --enable-compiler-opt=3 --without-cm --without-ugni --with-rc     \ | ||||||
|              --with-ud --with-dc --with-mlx5-dv --with-dm --enable-mt --without-go |              --with-ud --with-dc --with-mlx5-dv --with-dm --enable-mt --without-go | ||||||
| @@ -122,7 +122,7 @@ mkdir build_gpu; cd build_gpu | |||||||
| ../configure --prefix="${dir}"/prefix/ompi_gpu --without-xpmem    \ | ../configure --prefix="${dir}"/prefix/ompi_gpu --without-xpmem    \ | ||||||
|              --with-ucx="${dir}"/prefix/ucx_gpu                   \ |              --with-ucx="${dir}"/prefix/ucx_gpu                   \ | ||||||
|              --with-ucx-libdir="${dir}"/prefix/ucx_gpu/lib        \ |              --with-ucx-libdir="${dir}"/prefix/ucx_gpu/lib        \ | ||||||
|              --with-knem=/opt/knem-1.1.4.90mlnx2/                 \ |              --with-knem=/opt/knem-1.1.4.90mlnx1/                 \ | ||||||
|              --enable-mca-no-build=btl-uct                        \ |              --enable-mca-no-build=btl-uct                        \ | ||||||
|              --with-cuda="${cuda_path}" --disable-getpwuid        \ |              --with-cuda="${cuda_path}" --disable-getpwuid        \ | ||||||
|              --with-verbs --with-slurm --enable-mpi-fortran=all   \ |              --with-verbs --with-slurm --enable-mpi-fortran=all   \ | ||||||
| @@ -136,7 +136,7 @@ mkdir build_cpu; cd build_cpu | |||||||
| ../configure --prefix="${dir}"/prefix/ompi_cpu --without-xpmem    \ | ../configure --prefix="${dir}"/prefix/ompi_cpu --without-xpmem    \ | ||||||
|              --with-ucx="${dir}"/prefix/ucx_cpu                   \ |              --with-ucx="${dir}"/prefix/ucx_cpu                   \ | ||||||
|              --with-ucx-libdir="${dir}"/prefix/ucx_cpu/lib        \ |              --with-ucx-libdir="${dir}"/prefix/ucx_cpu/lib        \ | ||||||
|              --with-knem=/opt/knem-1.1.4.90mlnx2/                 \ |              --with-knem=/opt/knem-1.1.4.90mlnx1/                 \ | ||||||
|              --enable-mca-no-build=btl-uct --disable-getpwuid     \ |              --enable-mca-no-build=btl-uct --disable-getpwuid     \ | ||||||
|              --with-verbs --with-slurm --enable-mpi-fortran=all   \ |              --with-verbs --with-slurm --enable-mpi-fortran=all   \ | ||||||
|              --with-pmix=internal --with-libevent=internal |              --with-pmix=internal --with-libevent=internal | ||||||
|   | |||||||
| @@ -1,14 +0,0 @@ | |||||||
| { |  | ||||||
|   BasedOnStyle: LLVM, |  | ||||||
|   UseTab: Never, |  | ||||||
|   IndentWidth: 2, |  | ||||||
|   TabWidth: 2, |  | ||||||
|   BreakBeforeBraces: Allman, |  | ||||||
|   AllowShortIfStatementsOnASingleLine: false, |  | ||||||
|   IndentCaseLabels: false, |  | ||||||
|   ColumnLimit: 90, |  | ||||||
|   AccessModifierOffset: -4, |  | ||||||
|   NamespaceIndentation: All, |  | ||||||
|   FixNamespaceComments: false, |  | ||||||
|   SortIncludes: true, |  | ||||||
| } |  | ||||||
| @@ -1,458 +0,0 @@ | |||||||
| #include <algorithm> |  | ||||||
| #include <array> |  | ||||||
| #include <blas_quda.h> |  | ||||||
| #include <cassert> |  | ||||||
| #include <chrono> |  | ||||||
| #include <color_spinor_field.h> |  | ||||||
| #include <communicator_quda.h> |  | ||||||
| #include <dirac_quda.h> |  | ||||||
| #include <fstream> |  | ||||||
| #include <gauge_tools.h> |  | ||||||
| #include <memory> |  | ||||||
| #include <mpi.h> |  | ||||||
| #include <stdio.h> |  | ||||||
| #include <stdlib.h> |  | ||||||
|  |  | ||||||
| // remove to use QUDA's own flop counting instead of Grid's convention |  | ||||||
| #define FLOP_COUNTING_GRID |  | ||||||
|  |  | ||||||
| #include "json.hpp" |  | ||||||
| using nlohmann::json; |  | ||||||
| json json_results; |  | ||||||
|  |  | ||||||
| using namespace quda; |  | ||||||
|  |  | ||||||
| // thanks chatGPT :) |  | ||||||
| std::string get_timestamp() |  | ||||||
| { |  | ||||||
|   // Get the current time |  | ||||||
|   auto now = std::chrono::system_clock::now(); |  | ||||||
|  |  | ||||||
|   // Convert the current time to a time_t object |  | ||||||
|   std::time_t currentTime = std::chrono::system_clock::to_time_t(now); |  | ||||||
|  |  | ||||||
|   // Format the time using std::put_time |  | ||||||
|   std::stringstream ss; |  | ||||||
|   ss << std::put_time(std::localtime(¤tTime), "%Y%m%d %H:%M:%S"); |  | ||||||
|  |  | ||||||
|   return ss.str(); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| // This is the MPI grid, i.e. the layout of ranks |  | ||||||
| int nranks = -1; |  | ||||||
| std::array<int, 4> mpi_grid = {1, 1, 1, 1}; |  | ||||||
|  |  | ||||||
| // run f() in a loop for roughly target_time seconds |  | ||||||
| // returns seconds per iteration it took |  | ||||||
| template <class F> double bench(F const &f, double target_time, int niter_warmup = 5) |  | ||||||
| { |  | ||||||
|   device_timer_t timer; |  | ||||||
|   timer.start(); |  | ||||||
|   for (int iter = 0; iter < niter_warmup; ++iter) |  | ||||||
|     f(); |  | ||||||
|   timer.stop(); |  | ||||||
|  |  | ||||||
|   double secs = timer.last() / niter_warmup; |  | ||||||
|   int niter = std::max(1, int(target_time / secs)); |  | ||||||
|   // niter = std::min(1000, niter); |  | ||||||
|   // printfQuda("during warmup took %f s/iter, deciding on %d iters\n", secs, niter); |  | ||||||
|  |  | ||||||
|   // important: each rank has its own timer, so their measurements can slightly vary. But |  | ||||||
|   // 'niter' needs to be consistent (bug took me a couple hours to track down) |  | ||||||
|   comm_broadcast_global(&niter, sizeof(niter), 0); |  | ||||||
|  |  | ||||||
|   timer.reset(__FUNCTION__, __FILE__, __LINE__); |  | ||||||
|   timer.start(); |  | ||||||
|   for (int iter = 0; iter < niter; ++iter) |  | ||||||
|     f(); |  | ||||||
|   timer.stop(); |  | ||||||
|   return timer.last() / niter; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| void initComms(int argc, char **argv) |  | ||||||
| { |  | ||||||
|   // init MPI communication |  | ||||||
|   MPI_Init(&argc, &argv); |  | ||||||
|  |  | ||||||
|   MPI_Comm_size(MPI_COMM_WORLD, &nranks); |  | ||||||
|   assert(1 <= nranks && nranks <= 100000); |  | ||||||
|  |  | ||||||
|   mpi_grid[3] = nranks; |  | ||||||
|  |  | ||||||
|   // this maps coordinates to rank number |  | ||||||
|   auto lex_rank_from_coords = [](int const *coords, void *) |  | ||||||
|   { |  | ||||||
|     int rank = coords[0]; |  | ||||||
|     for (int i = 1; i < 4; i++) |  | ||||||
|       rank = mpi_grid[i] * rank + coords[i]; |  | ||||||
|     return rank; |  | ||||||
|   }; |  | ||||||
|  |  | ||||||
|   initCommsGridQuda(4, mpi_grid.data(), lex_rank_from_coords, nullptr); |  | ||||||
|  |  | ||||||
|   for (int d = 0; d < 4; d++) |  | ||||||
|     if (mpi_grid[d] > 1) |  | ||||||
|       commDimPartitionedSet(d); |  | ||||||
|  |  | ||||||
|   json_results["geometry"]["ranks"] = nranks; |  | ||||||
|   json_results["geometry"]["mpi"] = mpi_grid; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| // creates a random gauge field. L = local(!) size |  | ||||||
| cudaGaugeField make_gauge_field(int L) |  | ||||||
| { |  | ||||||
|   GaugeFieldParam param; |  | ||||||
|  |  | ||||||
|   // dimension and type of the lattice object |  | ||||||
|   param.nDim = 4; |  | ||||||
|   param.x[0] = L; |  | ||||||
|   param.x[1] = L; |  | ||||||
|   param.x[2] = L; |  | ||||||
|   param.x[3] = L; |  | ||||||
|  |  | ||||||
|   // number of colors. potentially confusingly, QUDA sometimes uses the word "color" to |  | ||||||
|   // things unrelated with physical color. things like "nColor=32" do pop up in deflation |  | ||||||
|   // solvers where it (to my understanding) refers to the number of (parallely processed) |  | ||||||
|   // deflation vectors. |  | ||||||
|   param.nColor = 3; |  | ||||||
|  |  | ||||||
|   // boundary conditions (dont really care for benchmark) |  | ||||||
|   param.t_boundary = QUDA_PERIODIC_T; |  | ||||||
|  |  | ||||||
|   // for this benchmark we only need "SINGLE" and/or "DOUBLE" precision. But smaller |  | ||||||
|   // precisions are available in QUDA too |  | ||||||
|   param.setPrecision(QUDA_SINGLE_PRECISION); |  | ||||||
|  |  | ||||||
|   // no even/odd subset, we want a full lattice |  | ||||||
|   param.siteSubset = QUDA_FULL_SITE_SUBSET; |  | ||||||
|  |  | ||||||
|   // what kind of 3x3 matrices the field contains. A proper gauge field has SU(3) |  | ||||||
|   // matrices, but (for example) smeared/thick links could have non-unitary links. |  | ||||||
|   param.link_type = QUDA_SU3_LINKS; |  | ||||||
|  |  | ||||||
|   // "NULL" does not initialize the field upon creation, "ZERO" would set everything to 0 |  | ||||||
|   param.create = QUDA_NULL_FIELD_CREATE; |  | ||||||
|  |  | ||||||
|   // field should be allocated directly on the accelerator/GPU |  | ||||||
|   param.location = QUDA_CUDA_FIELD_LOCATION; |  | ||||||
|  |  | ||||||
|   // "reconstruct" here means reconstructing a SU(3) matrix from fewer than 18 real |  | ||||||
|   // numbers (=3x3 complex numbers). Great feature in production (saving |  | ||||||
|   // memory/cache/network bandwidth), not used for this benchmark. |  | ||||||
|   param.reconstruct = QUDA_RECONSTRUCT_NO; |  | ||||||
|  |  | ||||||
|   // "ghostExchange" would often be called "halo exchange" outside of Quda. This has |  | ||||||
|   // nothing to do with ghost fields from continuum/perturbative qcd. |  | ||||||
|   param.ghostExchange = QUDA_GHOST_EXCHANGE_NO; |  | ||||||
|  |  | ||||||
|   // This controls the physical order of elements. "float2" is the the default |  | ||||||
|   param.order = QUDA_FLOAT2_GAUGE_ORDER; |  | ||||||
|  |  | ||||||
|   // this means the field is a LORENTZ vector (which a gauge field must be). Has nothing |  | ||||||
|   // to do with spin. |  | ||||||
|   param.geometry = QUDA_VECTOR_GEOMETRY; |  | ||||||
|  |  | ||||||
|   // create the field and fill with random SU(3) matrices |  | ||||||
|   // std::cout << param << std::endl; // double-check parameters |  | ||||||
|   auto U = cudaGaugeField(param); |  | ||||||
|   gaugeGauss(U, /*seed=*/1234, 1.0); |  | ||||||
|   return U; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| // create a random source vector (L = local size) |  | ||||||
| ColorSpinorField make_source(int L, int Ls = 1) |  | ||||||
| { |  | ||||||
|   // NOTE: `param.x` directly determines the size of the (local, per rank) memory |  | ||||||
|   // allocation. Thus for checkerboarding, we have to specifly x=(L/2,L,L,L) to get a |  | ||||||
|   // physical local volume of L^4, thus implicity choosing a dimension for the |  | ||||||
|   // checkerboarding (shouldnt really matter of course which one). |  | ||||||
|   ColorSpinorParam param; |  | ||||||
|   param.nColor = 3; |  | ||||||
|   param.nSpin = 4; |  | ||||||
|   param.nVec = 1; // only a single vector |  | ||||||
|   param.pad = 0; |  | ||||||
|   param.siteSubset = QUDA_PARITY_SITE_SUBSET; |  | ||||||
|   param.nDim = Ls == 1 ? 4 : 5; |  | ||||||
|   param.x[0] = L / 2; |  | ||||||
|   param.x[1] = L; |  | ||||||
|   param.x[2] = L; |  | ||||||
|   param.x[3] = L; |  | ||||||
|   param.x[4] = Ls; |  | ||||||
|   param.pc_type = QUDA_4D_PC; |  | ||||||
|   param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; |  | ||||||
|  |  | ||||||
|   // somewhat surprisingly, the DiracWilson::Dslash(...) function only works with the |  | ||||||
|   // UKQCD_GAMMA_BASIS |  | ||||||
|   param.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; |  | ||||||
|  |  | ||||||
|   param.create = QUDA_NULL_FIELD_CREATE; // do not (zero-) initilize the field |  | ||||||
|   param.setPrecision(QUDA_SINGLE_PRECISION); |  | ||||||
|   param.location = QUDA_CUDA_FIELD_LOCATION; |  | ||||||
|  |  | ||||||
|   // create the field and fill it with random values |  | ||||||
|   auto src = ColorSpinorField(param); |  | ||||||
|   quda::RNG rng(src, 1234); |  | ||||||
|   spinorNoise(src, rng, QUDA_NOISE_GAUSS); |  | ||||||
|   /*printfQuda( |  | ||||||
|       "created src with norm = %f (sanity check: should be close to %f) and %f bytes\n", |  | ||||||
|       blas::norm2(src), 2.0 * 12 * geom[0] * geom[1] * geom[2] * geom[3], |  | ||||||
|       src.Bytes() * 1.0);*/ |  | ||||||
|   // src.PrintDims(); |  | ||||||
|  |  | ||||||
|   return src; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| void benchmark_wilson(std::vector<int> const &L_list, double target_time) |  | ||||||
| { |  | ||||||
|   printfQuda("==================== wilson dirac operator ====================\n"); |  | ||||||
| #ifdef FLOP_COUNTING_GRID |  | ||||||
|   printfQuda("IMPORTANT: flop counting as in Benchmark_Grid\n"); |  | ||||||
| #else |  | ||||||
|   printfQuda("IMPORTANT: flop counting by QUDA's own convention (different from " |  | ||||||
|              "Benchmark_Grid)\n"); |  | ||||||
| #endif |  | ||||||
|   printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank"); |  | ||||||
|  |  | ||||||
|   for (int L : L_list) |  | ||||||
|   { |  | ||||||
|     // printfQuda("starting wilson L=%d\n", L); |  | ||||||
|  |  | ||||||
|     auto U = make_gauge_field(L); |  | ||||||
|     auto src = make_source(L); |  | ||||||
|  |  | ||||||
|     // create (Wilson) dirac operator |  | ||||||
|     DiracParam param; |  | ||||||
|     param.kappa = 0.10; |  | ||||||
|     param.dagger = QUDA_DAG_NO; |  | ||||||
|     param.matpcType = QUDA_MATPC_EVEN_EVEN; |  | ||||||
|     auto dirac = DiracWilson(param); |  | ||||||
|  |  | ||||||
|     // insert gauge field into the dirac operator |  | ||||||
|     // (the additional nullptr's are for smeared links and fancy preconditioners and such. |  | ||||||
|     // Not used for simple Wilson fermions) |  | ||||||
|     dirac.updateFields(&U, nullptr, nullptr, nullptr); |  | ||||||
|     auto res = ColorSpinorField(ColorSpinorParam(src)); |  | ||||||
|     auto f = [&]() { dirac.Dslash(res, src, QUDA_EVEN_PARITY); }; |  | ||||||
|  |  | ||||||
|     // first run to get the quda tuning out of the way |  | ||||||
|     dirac.Flops(); // reset flops counter |  | ||||||
|     f(); |  | ||||||
|     double flops = 1.0 * dirac.Flops(); |  | ||||||
|  |  | ||||||
|     // actual benchmarking |  | ||||||
|     auto start_time = get_timestamp(); |  | ||||||
|     double secs = bench(f, target_time); |  | ||||||
|     auto end_time = get_timestamp(); |  | ||||||
|  |  | ||||||
| #ifdef FLOP_COUNTING_GRID |  | ||||||
|     // this is the flop counting from Benchmark_Grid |  | ||||||
|     double Nc = 3; |  | ||||||
|     double Nd = 4; |  | ||||||
|     double Ns = 4; |  | ||||||
|     flops = (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2); |  | ||||||
|     flops *= L * L * L * L / 2.0; |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|     printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9); |  | ||||||
|  |  | ||||||
|     json tmp; |  | ||||||
|     tmp["L"] = L; |  | ||||||
|     tmp["Gflops_wilson"] = flops / secs * 1e-9; |  | ||||||
|     tmp["start_time"] = start_time; |  | ||||||
|     tmp["end_time"] = end_time; |  | ||||||
|     json_results["flops"]["results"].push_back(tmp); |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| void benchmark_dwf(std::vector<int> const &L_list, double target_time) |  | ||||||
| { |  | ||||||
|   printfQuda("==================== domain wall dirac operator ====================\n"); |  | ||||||
| #ifdef FLOP_COUNTING_GRID |  | ||||||
|   printfQuda("IMPORTANT: flop counting as in Benchmark_Grid\n"); |  | ||||||
| #else |  | ||||||
|   printfQuda("IMPORTANT: flop counting by QUDA's own convention (different from " |  | ||||||
|              "Benchmark_Grid)\n"); |  | ||||||
| #endif |  | ||||||
|   printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank"); |  | ||||||
|   int Ls = 12; |  | ||||||
|   for (int L : L_list) |  | ||||||
|   { |  | ||||||
|     // printfQuda("starting dwf L=%d\n", L); |  | ||||||
|     auto U = make_gauge_field(L); |  | ||||||
|     auto src = make_source(L, Ls); |  | ||||||
|  |  | ||||||
|     // create dirac operator |  | ||||||
|     DiracParam param; |  | ||||||
|     param.kappa = 0.10; |  | ||||||
|     param.Ls = Ls; |  | ||||||
|     param.m5 = 0.1; |  | ||||||
|     param.dagger = QUDA_DAG_NO; |  | ||||||
|     param.matpcType = QUDA_MATPC_EVEN_EVEN; |  | ||||||
|     auto dirac = DiracDomainWall(param); |  | ||||||
|  |  | ||||||
|     // insert gauge field into the dirac operator |  | ||||||
|     // (the additional nullptr's are for smeared links and fancy preconditioners and such) |  | ||||||
|     dirac.updateFields(&U, nullptr, nullptr, nullptr); |  | ||||||
|     auto res = ColorSpinorField(ColorSpinorParam(src)); |  | ||||||
|     auto f = [&]() { dirac.Dslash(res, src, QUDA_EVEN_PARITY); }; |  | ||||||
|  |  | ||||||
|     // first run to get the quda tuning out of the way |  | ||||||
|     dirac.Flops(); // reset flops counter |  | ||||||
|     f(); |  | ||||||
|     double flops = 1.0 * dirac.Flops(); |  | ||||||
|  |  | ||||||
|     // actual benchmarking |  | ||||||
|     auto start_time = get_timestamp(); |  | ||||||
|     double secs = bench(f, target_time); |  | ||||||
|     auto end_time = get_timestamp(); |  | ||||||
|  |  | ||||||
| #ifdef FLOP_COUNTING_GRID |  | ||||||
|     // this is the flop counting from Benchmark_Grid |  | ||||||
|     double Nc = 3; |  | ||||||
|     double Nd = 4; |  | ||||||
|     double Ns = 4; |  | ||||||
|     flops = (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2); |  | ||||||
|     flops *= L * L * L * L * Ls / 2.0; |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|     printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9); |  | ||||||
|     json tmp; |  | ||||||
|     tmp["L"] = L; |  | ||||||
|     tmp["Gflops_dwf4"] = flops / secs * 1e-9; |  | ||||||
|     tmp["start_time"] = start_time; |  | ||||||
|     tmp["end_time"] = end_time; |  | ||||||
|     json_results["flops"]["results"].push_back(tmp); |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| void benchmark_axpy(std::vector<int> const &L_list, double target_time) |  | ||||||
| { |  | ||||||
|   // number of iterations for warmup / measurement |  | ||||||
|   // (feel free to change for noise/time tradeoff) |  | ||||||
|   constexpr int niter_warmup = 5; |  | ||||||
|  |  | ||||||
|   printfQuda("==================== axpy / memory ====================\n"); |  | ||||||
|  |  | ||||||
|   ColorSpinorParam param; |  | ||||||
|   param.nDim = 4;   // 4-dimensional lattice |  | ||||||
|   param.x[4] = 1;   // no fifth dimension |  | ||||||
|   param.nColor = 3; // supported values for nSpin/nColor are configured when compiling |  | ||||||
|                     // QUDA. "3*4" will probably always be enabled, so we stick with this |  | ||||||
|   param.nSpin = 4; |  | ||||||
|   param.nVec = 1;                            // just a single vector |  | ||||||
|   param.siteSubset = QUDA_FULL_SITE_SUBSET;  // full lattice = no odd/even |  | ||||||
|   param.pad = 0;                             // no padding |  | ||||||
|   param.create = QUDA_NULL_FIELD_CREATE;     // do not (zero-) initilize the field |  | ||||||
|   param.location = QUDA_CUDA_FIELD_LOCATION; // field should reside on GPU |  | ||||||
|   param.setPrecision(QUDA_SINGLE_PRECISION); |  | ||||||
|  |  | ||||||
|   // the following dont matter for an axpy benchmark, but need to choose something |  | ||||||
|   param.pc_type = QUDA_4D_PC; |  | ||||||
|   param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; |  | ||||||
|   param.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; |  | ||||||
|  |  | ||||||
|   printfQuda("%5s %15s %15s %15s %15s\n", "L", "size (MiB/rank)", "time (usec)", |  | ||||||
|              "GiB/s/rank", "Gflop/s/rank"); |  | ||||||
|   for (int L : L_list) |  | ||||||
|   { |  | ||||||
|     // printfQuda("starting axpy L=%d\n", L); |  | ||||||
|     //  IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()` |  | ||||||
|     //             are LOCAL, i.e. per rank / per GPU |  | ||||||
|  |  | ||||||
|     param.x[0] = L; |  | ||||||
|     param.x[1] = L; |  | ||||||
|     param.x[2] = L; |  | ||||||
|     param.x[3] = L; |  | ||||||
|  |  | ||||||
|     // number of (real) elements in one (local) field |  | ||||||
|     size_t field_elements = 2 * param.x[0] * param.x[1] * param.x[2] * param.x[3] * |  | ||||||
|                             param.nColor * param.nSpin; |  | ||||||
|  |  | ||||||
|     // create the field(s) |  | ||||||
|     auto fieldA = ColorSpinorField(param); |  | ||||||
|     auto fieldB = ColorSpinorField(param); |  | ||||||
|     assert(fieldA.Bytes() == sizeof(float) * field_elements); // sanity check |  | ||||||
|     assert(fieldB.Bytes() == sizeof(float) * field_elements); // sanity check |  | ||||||
|  |  | ||||||
|     // fill fields with random values |  | ||||||
|     quda::RNG rng(fieldA, 1234); |  | ||||||
|     spinorNoise(fieldA, rng, QUDA_NOISE_GAUSS); |  | ||||||
|     spinorNoise(fieldB, rng, QUDA_NOISE_GAUSS); |  | ||||||
|  |  | ||||||
|     // number of operations / bytes per iteration |  | ||||||
|     // axpy is one addition, one multiplication, two read, one write |  | ||||||
|     double flops = 2 * field_elements; |  | ||||||
|     double memory = 3 * sizeof(float) * field_elements; |  | ||||||
|  |  | ||||||
|     auto f = [&]() { blas::axpy(1.234, fieldA, fieldB); }; |  | ||||||
|  |  | ||||||
|     // first run to get the quda tuning out of the way |  | ||||||
|     f(); |  | ||||||
|  |  | ||||||
|     // actual benchmarking |  | ||||||
|     auto start_time = get_timestamp(); |  | ||||||
|     double secs = bench(f, target_time); |  | ||||||
|     auto end_time = get_timestamp(); |  | ||||||
|  |  | ||||||
|     double mem_MiB = memory / 1024. / 1024.; |  | ||||||
|     double GBps = mem_MiB / 1024 / secs; |  | ||||||
|     printfQuda("%5d %15.2f %15.2f %15.2f %15.2f\n", L, mem_MiB, secs * 1e6, GBps, |  | ||||||
|                flops / secs * 1e-9); |  | ||||||
|  |  | ||||||
|     json tmp; |  | ||||||
|     tmp["L"] = L; |  | ||||||
|     tmp["size_MB"] = mem_MiB; |  | ||||||
|     tmp["GBps"] = GBps; |  | ||||||
|     tmp["GFlops"] = flops / secs * 1e-9; |  | ||||||
|     tmp["start_time"] = start_time; |  | ||||||
|     tmp["end_time"] = end_time; |  | ||||||
|     json_results["axpy"].push_back(tmp); |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| int main(int argc, char **argv) |  | ||||||
| { |  | ||||||
|   std::string json_filename = ""; // empty indicates no json output |  | ||||||
|   for (int i = 0; i < argc; i++) |  | ||||||
|   { |  | ||||||
|     if (std::string(argv[i]) == "--json-out") |  | ||||||
|       json_filename = argv[i + 1]; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   initComms(argc, argv); |  | ||||||
|  |  | ||||||
|   initQuda(-1); // -1 for multi-gpu. otherwise this selects the device to be used |  | ||||||
|  |  | ||||||
|   //  verbosity options are: |  | ||||||
|   //  SILENT, SUMMARIZE, VERBOSE, DEBUG_VERBOSE |  | ||||||
|   setVerbosity(QUDA_SUMMARIZE); |  | ||||||
|  |  | ||||||
|   printfQuda("MPI layout = %d %d %d %d\n", mpi_grid[0], mpi_grid[1], mpi_grid[2], |  | ||||||
|              mpi_grid[3]); |  | ||||||
|  |  | ||||||
|   benchmark_axpy({8, 12, 16, 24, 32, 48}, 1.0); |  | ||||||
|  |  | ||||||
|   setVerbosity(QUDA_SILENT); |  | ||||||
|   benchmark_wilson({8, 12, 16, 24, 32, 48}, 1.0); |  | ||||||
|   benchmark_dwf({8, 12, 16, 24, 32}, 1.0); |  | ||||||
|   setVerbosity(QUDA_SUMMARIZE); |  | ||||||
|  |  | ||||||
|   printfQuda("==================== done with all benchmarks ====================\n"); |  | ||||||
|  |  | ||||||
|   if (!json_filename.empty()) |  | ||||||
|   { |  | ||||||
|     printfQuda("writing benchmark results to %s\n", json_filename.c_str()); |  | ||||||
|  |  | ||||||
|     int me = 0; |  | ||||||
|     MPI_Comm_rank(MPI_COMM_WORLD, &me); |  | ||||||
|     if (me == 0) |  | ||||||
|     { |  | ||||||
|       std::ofstream json_file(json_filename); |  | ||||||
|       json_file << std::setw(2) << json_results; |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   endQuda(); |  | ||||||
|   quda::comm_finalize(); |  | ||||||
|   MPI_Finalize(); |  | ||||||
| } |  | ||||||
| @@ -1,30 +0,0 @@ | |||||||
| # QUDA benchmarks |  | ||||||
|  |  | ||||||
| This folder contains benchmarks for the [QUDA](https://github.com/lattice/quda) library. |  | ||||||
|  |  | ||||||
| - `Benchmark_Quda`: This benchmark measure floating point performances of fermion |  | ||||||
| matrices (Wilson and DWF), as well as memory bandwidth (using a simple `axpy` operation). Measurements are |  | ||||||
| performed for a fixed range of problem sizes. |  | ||||||
|  |  | ||||||
| ## Building |  | ||||||
| After setting up your compilation environment (Tursa: `source /home/dp207/dp207/shared/env/production/env-{base,gpu}.sh`): |  | ||||||
| ```bash |  | ||||||
| ./build-quda.sh <env_dir>          # build Quda |  | ||||||
| ./build-benchmark.sh <env_dir>     # build benchmark |  | ||||||
| ``` |  | ||||||
| where `<env_dir>` is an arbitrary directory where every product will be stored. |  | ||||||
|  |  | ||||||
| ## Running the Benchmark |  | ||||||
|  |  | ||||||
| The benchmark should be run as |  | ||||||
| ```bash |  | ||||||
| mpirun -np <ranks> <env_dir>/prefix/qudabench/Benchmark_Quda |  | ||||||
| ``` |  | ||||||
| where `<ranks>` is the total number of GPU's to use. On Tursa this is 4 times the number of nodes. |  | ||||||
|  |  | ||||||
| Note: |  | ||||||
| - on Tursa, the `wrapper.sh` script that is typically used with Grid is not necessary. |  | ||||||
| - due to Qudas automatic tuning, the benchmark might take significantly longer to run than `Benchmark_Grid` (even though it does fewer things). |  | ||||||
|   - setting `QUDA_ENABLE_TUNING=0` disables all tuning (degrades performance severely). By default, it is turned on. |  | ||||||
|   - setting `QUDA_RESOURCE_PATH=<some folder>` enables Quda to save and reuse optimal tuning parameters, making repeated runs much faster |  | ||||||
|    |  | ||||||
| @@ -1,32 +0,0 @@ | |||||||
| #!/usr/bin/env bash |  | ||||||
| # shellcheck disable=SC1090,SC1091 |  | ||||||
|  |  | ||||||
| set -euo pipefail |  | ||||||
|  |  | ||||||
| if (( $# != 1 )); then |  | ||||||
|     echo "usage: $(basename "$0") <environment directory>" 1>&2 |  | ||||||
|     exit 1 |  | ||||||
| fi |  | ||||||
| env_dir=$1 |  | ||||||
|  |  | ||||||
| # TODO: this is Tursa specific. have not figured out the correct way to do this. |  | ||||||
| EXTRA_LIBS="/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/stubs/libcuda.so /home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/stubs/libnvidia-ml.so" |  | ||||||
|  |  | ||||||
| # NOTE: these flags need to be in sync with Qudas compilation options (see build-quda.sh) |  | ||||||
| BUILD_FLAGS="-O3 -std=c++17 -DMPI_COMMS -DMULTI_GPU -DQUDA_PRECISION=12 -DQUDA_RECONSTRUCT=4" |  | ||||||
|  |  | ||||||
| call_dir=$(pwd -P) |  | ||||||
| script_dir="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")" |  | ||||||
| cd "${env_dir}" |  | ||||||
| env_dir=$(pwd -P) |  | ||||||
| cd "${call_dir}" |  | ||||||
| BUILD_DIR="${env_dir}/build/Quda-benchmarks" |  | ||||||
| PREFIX_DIR="${env_dir}/prefix/qudabench" |  | ||||||
| QUDA_DIR=${env_dir}/prefix/quda |  | ||||||
| mkdir -p "${BUILD_DIR}" |  | ||||||
| mkdir -p "${PREFIX_DIR}" |  | ||||||
|  |  | ||||||
| LINK_FLAGS="-Wl,-rpath,$QUDA_DIR/lib: $QUDA_DIR/lib/libquda.so $EXTRA_LIBS -lpthread -lmpi" |  | ||||||
|  |  | ||||||
| g++ $BUILD_FLAGS -I$QUDA_DIR/include/targets/cuda  -I$QUDA_DIR/include   -c -o $BUILD_DIR/Benchmark_Quda.o  $script_dir/Benchmark_Quda.cpp |  | ||||||
| g++ -g -O3 $BUILD_DIR/Benchmark_Quda.o -o $PREFIX_DIR/Benchmark_Quda $LINK_FLAGS -lmpi |  | ||||||
| @@ -1,36 +0,0 @@ | |||||||
| #!/usr/bin/env bash |  | ||||||
| # shellcheck disable=SC1090,SC1091 |  | ||||||
|  |  | ||||||
| BUILD_FLAGS="-O3 -std=c++17" |  | ||||||
| QUDA_FLAGS="-DQUDA_MPI=ON -DQUDA_PRECISION=14 -DQUDA_RECONSTRUCT=4 -DQUDA_GPU_ARCH=sm_80" |  | ||||||
|  |  | ||||||
| set -euo pipefail |  | ||||||
|  |  | ||||||
| if (( $# != 1 )); then |  | ||||||
|     echo "usage: $(basename "$0") <environment directory>" 1>&2 |  | ||||||
|     exit 1 |  | ||||||
| fi |  | ||||||
| env_dir=$1 |  | ||||||
|  |  | ||||||
| call_dir=$(pwd -P) |  | ||||||
| mkdir -p ${env_dir} |  | ||||||
| cd "${env_dir}" |  | ||||||
| env_dir=$(pwd -P) |  | ||||||
| cd "${call_dir}" |  | ||||||
|  |  | ||||||
| build_dir="${env_dir}/build/quda" |  | ||||||
| if [ -d "${build_dir}" ]; then |  | ||||||
|     echo "error: directory '${build_dir}' exists" |  | ||||||
|     exit 1 |  | ||||||
| fi |  | ||||||
| mkdir -p "${build_dir}" |  | ||||||
|  |  | ||||||
| git clone https://github.com/lattice/quda.git "${build_dir}" |  | ||||||
| cd "${build_dir}" |  | ||||||
|  |  | ||||||
| mkdir build; cd build |  | ||||||
| cmake .. $QUDA_FLAGS -DCMAKE_INSTALL_PREFIX=${env_dir}/prefix/quda |  | ||||||
| make -j128 |  | ||||||
| make install |  | ||||||
|  |  | ||||||
| cd "${call_dir}" |  | ||||||
							
								
								
									
										21
									
								
								Quda/env.sh
									
									
									
									
									
								
							
							
						
						
									
										21
									
								
								Quda/env.sh
									
									
									
									
									
								
							| @@ -1,21 +0,0 @@ | |||||||
| module load gcc/9.3.0 |  | ||||||
| module load cuda/11.4.1 |  | ||||||
| module load openmpi/4.1.1-cuda11.4 |  | ||||||
|  |  | ||||||
| export QUDA_RESOURCE_PATH=$(pwd)/tuning |  | ||||||
| export OMP_NUM_THREADS=4 |  | ||||||
| export OMPI_MCA_btl=^uct,openib |  | ||||||
| export OMPI_MCA_pml=ucx # by fabian. no idea what this is |  | ||||||
| #export UCX_TLS=rc,rc_x,sm,cuda_copy,cuda_ipc,gdr_copy |  | ||||||
| export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc |  | ||||||
| export UCX_RNDV_THRESH=16384 |  | ||||||
| export UCX_RNDV_SCHEME=put_zcopy |  | ||||||
| export UCX_IB_GPU_DIRECT_RDMA=yes |  | ||||||
| export UCX_MEMTYPE_CACHE=n |  | ||||||
|  |  | ||||||
| export OMPI_MCA_io=romio321 |  | ||||||
| export OMPI_MCA_btl_openib_allow_ib=true |  | ||||||
| export OMPI_MCA_btl_openib_device_type=infiniband |  | ||||||
| export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3 |  | ||||||
|  |  | ||||||
| export QUDA_REORDER_LOCATION=GPU # this is the default anyway |  | ||||||
		Reference in New Issue
	
	Block a user