forked from portelli/lattice-benchmarks

Compare commits: a1ad41bb06 ... main (18 commits)
| SHA1 |
|---|
| 371a329457 |
| f81cb198ab |
| a7e1d9e67f |
| 19c9dcb6ae |
| 7d89380b80 |
| 4cd67805b9 |
| f7e607eae4 |
| a267986800 |
| a1ec08cdb3 |
| fb6c79d9ca |
| d7647afa72 |
| ba00493c7d |
| 6055e0503c |
| 6ea093fc80 |
| fa47ec5bbe |
| 7235bfde4c |
| e5c61c2db1 |
| 80c80049d7 |

| @@ -1,7 +1,7 @@ | |||||||
| /* | /* | ||||||
| Copyright © 2015 Peter Boyle <paboyle@ph.ed.ac.uk> | Copyright © 2015 Peter Boyle <paboyle@ph.ed.ac.uk> | ||||||
| Copyright © 2022 Antonin Portelli <antonin.portelli@me.com> | Copyright © 2022 Antonin Portelli <antonin.portelli@me.com> | ||||||
| Copyright © 2022 Simon Buerger <simon.buerger@rwth-aachen.de> | Copyright © 2024 Simon Buerger <simon.buerger@rwth-aachen.de> | ||||||
|  |  | ||||||
| This is a fork of Benchmark_ITT.cpp from Grid | This is a fork of Benchmark_ITT.cpp from Grid | ||||||
|  |  | ||||||
| @@ -29,6 +29,43 @@ int NN_global; | |||||||
|  |  | ||||||
| nlohmann::json json_results; | nlohmann::json json_results; | ||||||
|  |  | ||||||
|  | // NOTE: Grid::GridClock is just a typedef to | ||||||
|  | // `std::chrono::high_resolution_clock`, but `Grid::usecond` rounds to | ||||||
|  | // microseconds (no idea why, probably wasn't ever relevant before), so we need | ||||||
|  | // our own wrapper here. | ||||||
|  | double usecond_precise() | ||||||
|  | { | ||||||
|  |   using namespace std::chrono; | ||||||
|  |   auto nsecs = duration_cast<nanoseconds>(GridClock::now() - Grid::theProgramStart); | ||||||
|  |   return nsecs.count() * 1e-3; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | std::vector<std::string> get_mpi_hostnames() | ||||||
|  | { | ||||||
|  |   int world_size; | ||||||
|  |   MPI_Comm_size(MPI_COMM_WORLD, &world_size); | ||||||
|  |  | ||||||
|  |   char hostname[MPI_MAX_PROCESSOR_NAME]; | ||||||
|  |   int name_len = 0; | ||||||
|  |   MPI_Get_processor_name(hostname, &name_len); | ||||||
|  |  | ||||||
|  |   // Allocate buffer to gather all hostnames | ||||||
|  |   std::vector<char> all_hostnames(world_size * MPI_MAX_PROCESSOR_NAME); | ||||||
|  |  | ||||||
|  |   // Use MPI_Allgather to gather all hostnames on all ranks | ||||||
|  |   MPI_Allgather(hostname, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, all_hostnames.data(), | ||||||
|  |                 MPI_MAX_PROCESSOR_NAME, MPI_CHAR, MPI_COMM_WORLD); | ||||||
|  |  | ||||||
|  |   // Convert the gathered hostnames back into a vector of std::string | ||||||
|  |   std::vector<std::string> hostname_list(world_size); | ||||||
|  |   for (int i = 0; i < world_size; ++i) | ||||||
|  |   { | ||||||
|  |     hostname_list[i] = std::string(&all_hostnames[i * MPI_MAX_PROCESSOR_NAME]); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   return hostname_list; | ||||||
|  | } | ||||||
|  |  | ||||||
| struct time_statistics | struct time_statistics | ||||||
| { | { | ||||||
|   double mean; |   double mean; | ||||||
| @@ -73,6 +110,8 @@ class Benchmark | |||||||
|         {local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]}); |         {local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]}); | ||||||
|     GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid( |     GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid( | ||||||
|         latt4, GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi()); |         latt4, GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi()); | ||||||
|  |     Grid::Coordinate shm(4, 1); | ||||||
|  |     GlobalSharedMemory::GetShmDims(mpi, shm); | ||||||
|  |  | ||||||
|     uint64_t NP = TmpGrid->RankCount(); |     uint64_t NP = TmpGrid->RankCount(); | ||||||
|     uint64_t NN = TmpGrid->NodeCount(); |     uint64_t NN = TmpGrid->NodeCount(); | ||||||
| @@ -85,7 +124,9 @@ class Benchmark | |||||||
|     std::cout << GridLogMessage << "* OpenMP threads : " << GridThread::GetThreads() |     std::cout << GridLogMessage << "* OpenMP threads : " << GridThread::GetThreads() | ||||||
|               << std::endl; |               << std::endl; | ||||||
|  |  | ||||||
|     std::cout << GridLogMessage << "* MPI tasks      : " << GridCmdVectorIntToString(mpi) |     std::cout << GridLogMessage << "* MPI layout     : " << GridCmdVectorIntToString(mpi) | ||||||
|  |               << std::endl; | ||||||
|  |     std::cout << GridLogMessage << "* Shm layout     : " << GridCmdVectorIntToString(shm) | ||||||
|               << std::endl; |               << std::endl; | ||||||
|  |  | ||||||
|     std::cout << GridLogMessage << "* vReal          : " << sizeof(vReal) * 8 << "bits ; " |     std::cout << GridLogMessage << "* vReal          : " << sizeof(vReal) * 8 << "bits ; " | ||||||
| @@ -118,6 +159,7 @@ class Benchmark | |||||||
|     for (unsigned int i = 0; i < mpi.size(); ++i) |     for (unsigned int i = 0; i < mpi.size(); ++i) | ||||||
|     { |     { | ||||||
|       tmp["mpi"].push_back(mpi[i]); |       tmp["mpi"].push_back(mpi[i]); | ||||||
|  |       tmp["shm"].push_back(shm[i]); | ||||||
|     } |     } | ||||||
|     tmp["ranks"] = NP; |     tmp["ranks"] = NP; | ||||||
|     tmp["nodes"] = NN; |     tmp["nodes"] = NN; | ||||||
| @@ -132,6 +174,8 @@ class Benchmark | |||||||
|  |  | ||||||
|     Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd()); |     Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd()); | ||||||
|     Coordinate mpi_layout = GridDefaultMpi(); |     Coordinate mpi_layout = GridDefaultMpi(); | ||||||
|  |     Coordinate shm_layout(Nd, 1); | ||||||
|  |     GlobalSharedMemory::GetShmDims(mpi_layout, shm_layout); | ||||||
|  |  | ||||||
|     for (int mu = 0; mu < Nd; mu++) |     for (int mu = 0; mu < Nd; mu++) | ||||||
|       if (mpi_layout[mu] > 1) |       if (mpi_layout[mu] > 1) | ||||||
| @@ -143,8 +187,8 @@ class Benchmark | |||||||
|     std::cout << GridLogMessage << "Benchmarking threaded STENCIL halo exchange in " |     std::cout << GridLogMessage << "Benchmarking threaded STENCIL halo exchange in " | ||||||
|               << nmu << " dimensions" << std::endl; |               << nmu << " dimensions" << std::endl; | ||||||
|     grid_small_sep(); |     grid_small_sep(); | ||||||
|     grid_printf("%5s %5s %15s %15s %15s %15s %15s\n", "L", "dir", "payload (B)", |     grid_printf("%5s %5s %7s %15s %15s %15s %15s %15s\n", "L", "dir", "shm", | ||||||
|                 "time (usec)", "rate (GB/s/node)", "std dev", "max"); |                 "payload (B)", "time (usec)", "rate (GB/s/node)", "std dev", "max"); | ||||||
|  |  | ||||||
|     for (int lat = 16; lat <= maxlat; lat += 8) |     for (int lat = 16; lat <= maxlat; lat += 8) | ||||||
|     { |     { | ||||||
| @@ -173,74 +217,80 @@ class Benchmark | |||||||
|       for (int dir = 0; dir < 8; dir++) |       for (int dir = 0; dir < 8; dir++) | ||||||
|       { |       { | ||||||
|         int mu = dir % 4; |         int mu = dir % 4; | ||||||
|         if (mpi_layout[mu] > 1) |         if (mpi_layout[mu] == 1) // skip directions that are not distributed | ||||||
|  |           continue; | ||||||
|  |         bool is_shm = mpi_layout[mu] == shm_layout[mu]; | ||||||
|  |         bool is_partial_shm = !is_shm && shm_layout[mu] != 1; | ||||||
|  |  | ||||||
|  |         std::vector<double> times(Nloop); | ||||||
|  |         for (int i = 0; i < NWARMUP; i++) | ||||||
|  |         { | ||||||
|  |           int xmit_to_rank; | ||||||
|  |           int recv_from_rank; | ||||||
|  |  | ||||||
|  |           if (dir == mu) | ||||||
|  |           { | ||||||
|  |             int comm_proc = 1; | ||||||
|  |             Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); | ||||||
|  |           } | ||||||
|  |           else | ||||||
|  |           { | ||||||
|  |             int comm_proc = mpi_layout[mu] - 1; | ||||||
|  |             Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); | ||||||
|  |           } | ||||||
|  |           Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, (void *)&rbuf[dir][0], | ||||||
|  |                               recv_from_rank, bytes); | ||||||
|  |         } | ||||||
|  |         for (int i = 0; i < Nloop; i++) | ||||||
|         { |         { | ||||||
|  |  | ||||||
|           std::vector<double> times(Nloop); |           dbytes = 0; | ||||||
|           for (int i = 0; i < NWARMUP; i++) |           double start = usecond(); | ||||||
|  |           int xmit_to_rank; | ||||||
|  |           int recv_from_rank; | ||||||
|  |  | ||||||
|  |           if (dir == mu) | ||||||
|           { |           { | ||||||
|             int xmit_to_rank; |             int comm_proc = 1; | ||||||
|             int recv_from_rank; |             Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); | ||||||
|  |  | ||||||
|             if (dir == mu) |  | ||||||
|             { |  | ||||||
|               int comm_proc = 1; |  | ||||||
|               Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); |  | ||||||
|             } |  | ||||||
|             else |  | ||||||
|             { |  | ||||||
|               int comm_proc = mpi_layout[mu] - 1; |  | ||||||
|               Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); |  | ||||||
|             } |  | ||||||
|             Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, |  | ||||||
|                                 (void *)&rbuf[dir][0], recv_from_rank, bytes); |  | ||||||
|           } |           } | ||||||
|           for (int i = 0; i < Nloop; i++) |           else | ||||||
|           { |           { | ||||||
|  |             int comm_proc = mpi_layout[mu] - 1; | ||||||
|             dbytes = 0; |             Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); | ||||||
|             double start = usecond(); |  | ||||||
|             int xmit_to_rank; |  | ||||||
|             int recv_from_rank; |  | ||||||
|  |  | ||||||
|             if (dir == mu) |  | ||||||
|             { |  | ||||||
|               int comm_proc = 1; |  | ||||||
|               Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); |  | ||||||
|             } |  | ||||||
|             else |  | ||||||
|             { |  | ||||||
|               int comm_proc = mpi_layout[mu] - 1; |  | ||||||
|               Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); |  | ||||||
|             } |  | ||||||
|             Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, |  | ||||||
|                                 (void *)&rbuf[dir][0], recv_from_rank, bytes); |  | ||||||
|             dbytes += bytes; |  | ||||||
|  |  | ||||||
|             double stop = usecond(); |  | ||||||
|             t_time[i] = stop - start; // microseconds |  | ||||||
|           } |           } | ||||||
|           timestat.statistics(t_time); |           Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, (void *)&rbuf[dir][0], | ||||||
|  |                               recv_from_rank, bytes); | ||||||
|  |           dbytes += bytes; | ||||||
|  |  | ||||||
|           dbytes = dbytes * ppn; |           double stop = usecond(); | ||||||
|           double bidibytes = 2. * dbytes; |           t_time[i] = stop - start; // microseconds | ||||||
|           double rate = bidibytes / (timestat.mean / 1.e6) / 1024. / 1024. / 1024.; |  | ||||||
|           double rate_err = rate * timestat.err / timestat.mean; |  | ||||||
|           double rate_max = rate * timestat.mean / timestat.min; |  | ||||||
|           grid_printf("%5d %5d %15d %15.2f %15.2f %15.1f %15.2f\n", lat, dir, bytes, |  | ||||||
|                       timestat.mean, rate, rate_err, rate_max); |  | ||||||
|           nlohmann::json tmp; |  | ||||||
|           nlohmann::json tmp_rate; |  | ||||||
|           tmp["L"] = lat; |  | ||||||
|           tmp["dir"] = dir; |  | ||||||
|           tmp["bytes"] = bytes; |  | ||||||
|           tmp["time_usec"] = timestat.mean; |  | ||||||
|           tmp_rate["mean"] = rate; |  | ||||||
|           tmp_rate["error"] = rate_err; |  | ||||||
|           tmp_rate["max"] = rate_max; |  | ||||||
|           tmp["rate_GBps"] = tmp_rate; |  | ||||||
|           json_results["comms"].push_back(tmp); |  | ||||||
|         } |         } | ||||||
|  |         timestat.statistics(t_time); | ||||||
|  |  | ||||||
|  |         dbytes = dbytes * ppn; | ||||||
|  |         double bidibytes = 2. * dbytes; | ||||||
|  |         double rate = bidibytes / (timestat.mean / 1.e6) / 1024. / 1024. / 1024.; | ||||||
|  |         double rate_err = rate * timestat.err / timestat.mean; | ||||||
|  |         double rate_max = rate * timestat.mean / timestat.min; | ||||||
|  |         grid_printf("%5d %5d %7s %15d %15.2f %15.2f %15.1f %15.2f\n", lat, dir, | ||||||
|  |                     is_shm           ? "yes" | ||||||
|  |                     : is_partial_shm ? "partial" | ||||||
|  |                                      : "no", | ||||||
|  |                     bytes, timestat.mean, rate, rate_err, rate_max); | ||||||
|  |         nlohmann::json tmp; | ||||||
|  |         nlohmann::json tmp_rate; | ||||||
|  |         tmp["L"] = lat; | ||||||
|  |         tmp["dir"] = dir; | ||||||
|  |         tmp["shared_mem"] = is_shm; | ||||||
|  |         tmp["partial_shared_mem"] = is_partial_shm; | ||||||
|  |         tmp["bytes"] = bytes; | ||||||
|  |         tmp["time_usec"] = timestat.mean; | ||||||
|  |         tmp_rate["mean"] = rate; | ||||||
|  |         tmp_rate["error"] = rate_err; | ||||||
|  |         tmp_rate["max"] = rate_max; | ||||||
|  |         tmp["rate_GBps"] = tmp_rate; | ||||||
|  |         json_results["comms"].push_back(tmp); | ||||||
|       } |       } | ||||||
|       for (int d = 0; d < 8; d++) |       for (int d = 0; d < 8; d++) | ||||||
|       { |       { | ||||||
| @@ -251,6 +301,170 @@ class Benchmark | |||||||
|     return; |     return; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   static void Latency(void) | ||||||
|  |   { | ||||||
|  |     int Nwarmup = 100; | ||||||
|  |     int Nloop = 300; | ||||||
|  |  | ||||||
|  |     std::cout << GridLogMessage << "Benchmarking point-to-point latency" << std::endl; | ||||||
|  |     grid_small_sep(); | ||||||
|  |     grid_printf("from to      mean(usec)           err           max\n"); | ||||||
|  |  | ||||||
|  |     int ranks; | ||||||
|  |     int me; | ||||||
|  |     MPI_Comm_size(MPI_COMM_WORLD, &ranks); | ||||||
|  |     MPI_Comm_rank(MPI_COMM_WORLD, &me); | ||||||
|  |  | ||||||
|  |     int bytes = 8; | ||||||
|  |     void *buf_from = acceleratorAllocDevice(bytes); | ||||||
|  |     void *buf_to = acceleratorAllocDevice(bytes); | ||||||
|  |     nlohmann::json json_latency; | ||||||
|  |     for (int from = 0; from < ranks; ++from) | ||||||
|  |       for (int to = 0; to < ranks; ++to) | ||||||
|  |       { | ||||||
|  |         if (from == to) | ||||||
|  |           continue; | ||||||
|  |  | ||||||
|  |         std::vector<double> t_time(Nloop); | ||||||
|  |         time_statistics timestat; | ||||||
|  |         MPI_Status status; | ||||||
|  |  | ||||||
|  |         for (int i = -Nwarmup; i < Nloop; ++i) | ||||||
|  |         { | ||||||
|  |           double start = usecond_precise(); | ||||||
|  |           if (from == me) | ||||||
|  |           { | ||||||
|  |             auto err = MPI_Send(buf_from, bytes, MPI_CHAR, to, 0, MPI_COMM_WORLD); | ||||||
|  |             assert(err == MPI_SUCCESS); | ||||||
|  |           } | ||||||
|  |           if (to == me) | ||||||
|  |           { | ||||||
|  |             auto err = | ||||||
|  |                 MPI_Recv(buf_to, bytes, MPI_CHAR, from, 0, MPI_COMM_WORLD, &status); | ||||||
|  |             assert(err == MPI_SUCCESS); | ||||||
|  |           } | ||||||
|  |           double stop = usecond_precise(); | ||||||
|  |           if (i >= 0) | ||||||
|  |             t_time[i] = stop - start; | ||||||
|  |         } | ||||||
|  |         // important: only 'from' and 'to' have meaningful timings. we use | ||||||
|  |         // 'from's. | ||||||
|  |         MPI_Bcast(t_time.data(), Nloop, MPI_DOUBLE, from, MPI_COMM_WORLD); | ||||||
|  |  | ||||||
|  |         timestat.statistics(t_time); | ||||||
|  |         grid_printf("%2d %2d %15.4f %15.3f %15.4f\n", from, to, timestat.mean, | ||||||
|  |                     timestat.err, timestat.max); | ||||||
|  |         nlohmann::json tmp; | ||||||
|  |         tmp["from"] = from; | ||||||
|  |         tmp["to"] = to; | ||||||
|  |         tmp["time_usec"] = timestat.mean; | ||||||
|  |         tmp["time_usec_error"] = timestat.err; | ||||||
|  |         tmp["time_usec_min"] = timestat.min; | ||||||
|  |         tmp["time_usec_max"] = timestat.max; | ||||||
|  |         tmp["time_usec_full"] = t_time; | ||||||
|  |         json_latency.push_back(tmp); | ||||||
|  |       } | ||||||
|  |     json_results["latency"] = json_latency; | ||||||
|  |  | ||||||
|  |     acceleratorFreeDevice(buf_from); | ||||||
|  |     acceleratorFreeDevice(buf_to); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   static void P2P(void) | ||||||
|  |   { | ||||||
|  |     // IMPORTANT: The P2P benchmark uses the "MPI_COMM_WORLD" communicator, | ||||||
|  |     // which is not quite the same as Grid.communicator. Practically speaking, | ||||||
|  |     // the latter contains the same MPI ranks but in a different order. Grid | ||||||
|  |     // does this to make sure it can exploit ranks with shared memory (i.e. | ||||||
|  |     // multiple ranks on the same node) as well as possible. | ||||||
|  |  | ||||||
|  |     // Buffer size to benchmark. This is the same as the largest size used in | ||||||
|  |     // the "Comms()" benchmark (L=48, Ls=12, double-precision complex, | ||||||
|  |     // half colour-spin vector). Mostly an arbitrary choice, but it is nice to | ||||||
|  |     // match it here. | ||||||
|  |     size_t bytes = 127401984; | ||||||
|  |  | ||||||
|  |     int Nwarmup = 20; | ||||||
|  |     int Nloop = 100; | ||||||
|  |  | ||||||
|  |     std::cout << GridLogMessage << "Benchmarking point-to-point bandwidth" << std::endl; | ||||||
|  |     grid_small_sep(); | ||||||
|  |     grid_printf("from to      mean(usec)           err           min           " | ||||||
|  |                 "bytes    rate (GiB/s)\n"); | ||||||
|  |  | ||||||
|  |     int ranks; | ||||||
|  |     int me; | ||||||
|  |     MPI_Comm_size(MPI_COMM_WORLD, &ranks); | ||||||
|  |     MPI_Comm_rank(MPI_COMM_WORLD, &me); | ||||||
|  |  | ||||||
|  |     void *buf_from = acceleratorAllocDevice(bytes); | ||||||
|  |     void *buf_to = acceleratorAllocDevice(bytes); | ||||||
|  |     nlohmann::json json_p2p; | ||||||
|  |     for (int from = 0; from < ranks; ++from) | ||||||
|  |       for (int to = 0; to < ranks; ++to) | ||||||
|  |       { | ||||||
|  |         if (from == to) | ||||||
|  |           continue; | ||||||
|  |  | ||||||
|  |         std::vector<double> t_time(Nloop); | ||||||
|  |         time_statistics timestat; | ||||||
|  |         MPI_Status status; | ||||||
|  |  | ||||||
|  |         for (int i = -Nwarmup; i < Nloop; ++i) | ||||||
|  |         { | ||||||
|  |           double start = usecond_precise(); | ||||||
|  |           if (from == me) | ||||||
|  |           { | ||||||
|  |             auto err = MPI_Send(buf_from, bytes, MPI_CHAR, to, 0, MPI_COMM_WORLD); | ||||||
|  |             assert(err == MPI_SUCCESS); | ||||||
|  |           } | ||||||
|  |           if (to == me) | ||||||
|  |           { | ||||||
|  |             auto err = | ||||||
|  |                 MPI_Recv(buf_to, bytes, MPI_CHAR, from, 0, MPI_COMM_WORLD, &status); | ||||||
|  |             assert(err == MPI_SUCCESS); | ||||||
|  |           } | ||||||
|  |           double stop = usecond_precise(); | ||||||
|  |           if (i >= 0) | ||||||
|  |             t_time[i] = stop - start; | ||||||
|  |         } | ||||||
|  |         // important: only 'from' and 'to' have meaningful timings. we use | ||||||
|  |         // 'from's. | ||||||
|  |         MPI_Bcast(t_time.data(), Nloop, MPI_DOUBLE, from, MPI_COMM_WORLD); | ||||||
|  |  | ||||||
|  |         timestat.statistics(t_time); | ||||||
|  |         double rate = bytes / (timestat.mean / 1.e6) / 1024. / 1024. / 1024.; | ||||||
|  |         double rate_err = rate * timestat.err / timestat.mean; | ||||||
|  |         double rate_max = rate * timestat.mean / timestat.min; | ||||||
|  |         double rate_min = rate * timestat.mean / timestat.max; | ||||||
|  |  | ||||||
|  |         grid_printf("%2d %2d %15.4f %15.3f %15.4f %15d %15.2f\n", from, to, timestat.mean, | ||||||
|  |                     timestat.err, timestat.min, bytes, rate); | ||||||
|  |  | ||||||
|  |         nlohmann::json tmp; | ||||||
|  |         tmp["from"] = from; | ||||||
|  |         tmp["to"] = to; | ||||||
|  |         tmp["bytes"] = bytes; | ||||||
|  |         tmp["time_usec"] = timestat.mean; | ||||||
|  |         tmp["time_usec_error"] = timestat.err; | ||||||
|  |         tmp["time_usec_min"] = timestat.min; | ||||||
|  |         tmp["time_usec_max"] = timestat.max; | ||||||
|  |         tmp["time_usec_full"] = t_time; | ||||||
|  |         nlohmann::json tmp_rate; | ||||||
|  |         tmp_rate["mean"] = rate; | ||||||
|  |         tmp_rate["error"] = rate_err; | ||||||
|  |         tmp_rate["max"] = rate_max; | ||||||
|  |         tmp_rate["min"] = rate_min; | ||||||
|  |         tmp["rate_GBps"] = tmp_rate; | ||||||
|  |  | ||||||
|  |         json_p2p.push_back(tmp); | ||||||
|  |       } | ||||||
|  |     json_results["p2p"] = json_p2p; | ||||||
|  |  | ||||||
|  |     acceleratorFreeDevice(buf_from); | ||||||
|  |     acceleratorFreeDevice(buf_to); | ||||||
|  |   } | ||||||
|  |  | ||||||
|   static void Memory(void) |   static void Memory(void) | ||||||
|   { |   { | ||||||
|     const int Nvec = 8; |     const int Nvec = 8; | ||||||
| @@ -512,8 +726,6 @@ class Benchmark | |||||||
|  |  | ||||||
|         FGrid->Broadcast(0, &ncall, sizeof(ncall)); |         FGrid->Broadcast(0, &ncall, sizeof(ncall)); | ||||||
|  |  | ||||||
|         Dw.ZeroCounters(); |  | ||||||
|  |  | ||||||
|         time_statistics timestat; |         time_statistics timestat; | ||||||
|         std::vector<double> t_time(ncall); |         std::vector<double> t_time(ncall); | ||||||
|         for (uint64_t i = 0; i < ncall; i++) |         for (uint64_t i = 0; i < ncall; i++) | ||||||
| @@ -708,7 +920,6 @@ class Benchmark | |||||||
|         uint64_t ncall = 500; |         uint64_t ncall = 500; | ||||||
|  |  | ||||||
|         FGrid->Broadcast(0, &ncall, sizeof(ncall)); |         FGrid->Broadcast(0, &ncall, sizeof(ncall)); | ||||||
|         Ds.ZeroCounters(); |  | ||||||
|  |  | ||||||
|         time_statistics timestat; |         time_statistics timestat; | ||||||
|         std::vector<double> t_time(ncall); |         std::vector<double> t_time(ncall); | ||||||
| @@ -776,11 +987,47 @@ int main(int argc, char **argv) | |||||||
| { | { | ||||||
|   Grid_init(&argc, &argv); |   Grid_init(&argc, &argv); | ||||||
|  |  | ||||||
|  |   int Ls = 1; | ||||||
|  |   bool do_su4 = true; | ||||||
|  |   bool do_memory = true; | ||||||
|  |   bool do_comms = true; | ||||||
|  |   bool do_flops = true; | ||||||
|  |  | ||||||
|  |   // NOTE: these two take O((number of ranks)^2) time, which might be a lot, so they are | ||||||
|  |   // off by default | ||||||
|  |   bool do_latency = false; | ||||||
|  |   bool do_p2p = false; | ||||||
|  |  | ||||||
|   std::string json_filename = ""; // empty indicates no json output |   std::string json_filename = ""; // empty indicates no json output | ||||||
|   for (int i = 0; i < argc; i++) |   for (int i = 0; i < argc; i++) | ||||||
|   { |   { | ||||||
|     if (std::string(argv[i]) == "--json-out") |     auto arg = std::string(argv[i]); | ||||||
|  |     if (arg == "--json-out") | ||||||
|       json_filename = argv[i + 1]; |       json_filename = argv[i + 1]; | ||||||
|  |     if (arg == "--benchmark-su4") | ||||||
|  |       do_su4 = true; | ||||||
|  |     if (arg == "--benchmark-memory") | ||||||
|  |       do_memory = true; | ||||||
|  |     if (arg == "--benchmark-comms") | ||||||
|  |       do_comms = true; | ||||||
|  |     if (arg == "--benchmark-flops") | ||||||
|  |       do_flops = true; | ||||||
|  |     if (arg == "--benchmark-latency") | ||||||
|  |       do_latency = true; | ||||||
|  |     if (arg == "--benchmark-p2p") | ||||||
|  |       do_p2p = true; | ||||||
|  |     if (arg == "--no-benchmark-su4") | ||||||
|  |       do_su4 = false; | ||||||
|  |     if (arg == "--no-benchmark-memory") | ||||||
|  |       do_memory = false; | ||||||
|  |     if (arg == "--no-benchmark-comms") | ||||||
|  |       do_comms = false; | ||||||
|  |     if (arg == "--no-benchmark-flops") | ||||||
|  |       do_flops = false; | ||||||
|  |     if (arg == "--no-benchmark-latency") | ||||||
|  |       do_latency = false; | ||||||
|  |     if (arg == "--no-benchmark-p2p") | ||||||
|  |       do_p2p = false; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   CartesianCommunicator::SetCommunicatorPolicy( |   CartesianCommunicator::SetCommunicatorPolicy( | ||||||
| @@ -792,12 +1039,6 @@ int main(int argc, char **argv) | |||||||
| #endif | #endif | ||||||
|   Benchmark::Decomposition(); |   Benchmark::Decomposition(); | ||||||
|  |  | ||||||
|   int do_su4 = 1; |  | ||||||
|   int do_memory = 1; |  | ||||||
|   int do_comms = 1; |  | ||||||
|   int do_flops = 1; |  | ||||||
|   int Ls = 1; |  | ||||||
|  |  | ||||||
|   int sel = 4; |   int sel = 4; | ||||||
|   std::vector<int> L_list({8, 12, 16, 24, 32}); |   std::vector<int> L_list({8, 12, 16, 24, 32}); | ||||||
|   int selm1 = sel - 1; |   int selm1 = sel - 1; | ||||||
| @@ -830,6 +1071,22 @@ int main(int argc, char **argv) | |||||||
|     Benchmark::Comms(); |     Benchmark::Comms(); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   if (do_latency) | ||||||
|  |   { | ||||||
|  |     grid_big_sep(); | ||||||
|  |     std::cout << GridLogMessage << " Latency benchmark " << std::endl; | ||||||
|  |     grid_big_sep(); | ||||||
|  |     Benchmark::Latency(); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   if (do_p2p) | ||||||
|  |   { | ||||||
|  |     grid_big_sep(); | ||||||
|  |     std::cout << GridLogMessage << " Point-To-Point benchmark " << std::endl; | ||||||
|  |     grid_big_sep(); | ||||||
|  |     Benchmark::P2P(); | ||||||
|  |   } | ||||||
|  |  | ||||||
|   if (do_flops) |   if (do_flops) | ||||||
|   { |   { | ||||||
|     Ls = 1; |     Ls = 1; | ||||||
| @@ -889,6 +1146,8 @@ int main(int argc, char **argv) | |||||||
|     json_results["flops"] = tmp_flops; |     json_results["flops"] = tmp_flops; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   json_results["hostnames"] = get_mpi_hostnames(); | ||||||
|  |  | ||||||
|   if (!json_filename.empty()) |   if (!json_filename.empty()) | ||||||
|   { |   { | ||||||
|     std::cout << GridLogMessage << "writing benchmark results to " << json_filename |     std::cout << GridLogMessage << "writing benchmark results to " << json_filename | ||||||
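
The buffer-size comment in P2P() above can be checked directly: 48³ × 12 sites × 96 bytes per double-precision half colour-spin vector (2 spin × 3 colour × 2 real/imaginary × 8 bytes) gives exactly 127401984. A minimal standalone sketch of that arithmetic, assuming this interpretation of the quoted parameters (it is not part of the patch):

```cpp
// Standalone check (assumption): the hard-coded P2P buffer size corresponds to
// L=48, Ls=12 and a double-precision half colour-spin vector, i.e.
// 2 spin x 3 colour x complex<double> = 96 bytes per site.
#include <cstddef>
#include <cstdio>

int main()
{
  constexpr std::size_t L = 48, Ls = 12;
  constexpr std::size_t site_bytes = 2 * 3 * 2 * sizeof(double); // 96 bytes
  constexpr std::size_t payload = L * L * L * Ls * site_bytes;
  static_assert(payload == 127401984, "matches the hard-coded buffer size");
  std::printf("payload = %zu bytes\n", payload);
  return 0;
}
```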
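The rank-ordering caveat in the same function (MPI_COMM_WORLD versus Grid.communicator) means that rank i in one communicator is generally a different process than rank i in the other. A hedged sketch of how the two numberings can be related with plain MPI; grid_comm is a placeholder for whatever communicator handle Grid exposes and is not defined by this patch:

```cpp
// Sketch: translate the calling process's MPI_COMM_WORLD rank into its rank in
// a reordered communicator (e.g. Grid's). `grid_comm` is a placeholder handle.
#include <mpi.h>

int translate_world_rank(MPI_Comm grid_comm)
{
  int world_rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

  MPI_Group world_group, grid_group;
  MPI_Comm_group(MPI_COMM_WORLD, &world_group);
  MPI_Comm_group(grid_comm, &grid_group);

  int grid_rank;
  // Map a single rank from the world group's numbering to the other group's.
  MPI_Group_translate_ranks(world_group, 1, &world_rank, grid_group, &grid_rank);

  MPI_Group_free(&world_group);
  MPI_Group_free(&grid_group);
  return grid_rank; // MPI_UNDEFINED if this process is not in grid_comm
}
```
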
| @@ -4,7 +4,13 @@ set -euo pipefail | |||||||
|  |  | ||||||
| gcc_spec='gcc@9.4.0' | gcc_spec='gcc@9.4.0' | ||||||
| cuda_spec='cuda@11.4.0' | cuda_spec='cuda@11.4.0' | ||||||
| hdf5_spec='hdf5@1.10.7' |  | ||||||
|  | # hdf5 and fftw depend on OpenMPI, which we install manually. To make sure this | ||||||
|  | # dependency is picked up by Spack, we specify the compiler here explicitly. For | ||||||
|  | # most other packages we don't really care about the compiler (i.e. system | ||||||
|  | # compiler versus ${gcc_spec}). | ||||||
|  | hdf5_spec="hdf5@1.10.7+cxx+threadsafe%${gcc_spec}" | ||||||
|  | fftw_spec="fftw%${gcc_spec}" | ||||||
|  |  | ||||||
| if (( $# != 1 )); then | if (( $# != 1 )); then | ||||||
|     echo "usage: $(basename "$0") <env dir>" 1>&2 |     echo "usage: $(basename "$0") <env dir>" 1>&2 | ||||||
| @@ -38,26 +44,23 @@ rm external.yaml | |||||||
|  |  | ||||||
| # Base compilers ############################################################### | # Base compilers ############################################################### | ||||||
| # configure system base | # configure system base | ||||||
|  |  | ||||||
|  | spack env create base | ||||||
|  | spack env activate base | ||||||
| spack compiler find --scope site | spack compiler find --scope site | ||||||
|  |  | ||||||
| # install GCC, CUDA & LLVM | # install GCC, CUDA | ||||||
| spack install ${gcc_spec} ${cuda_spec} llvm | spack add ${gcc_spec} ${cuda_spec} | ||||||
|  | spack concretize | ||||||
| spack load llvm | spack env depfile -o Makefile.tmp | ||||||
|  | make -j128 -f Makefile.tmp | ||||||
| spack compiler find --scope site | spack compiler find --scope site | ||||||
| spack unload llvm |  | ||||||
|  |  | ||||||
| spack load ${gcc_spec} |  | ||||||
| spack compiler find --scope site |  | ||||||
| spack unload ${gcc_spec} |  | ||||||
|  |  | ||||||
| # Manual compilation of OpenMPI & UCX ########################################## | # Manual compilation of OpenMPI & UCX ########################################## | ||||||
| # set build directories | # set build directories | ||||||
| mkdir -p "${dir}"/build | mkdir -p "${dir}"/build | ||||||
| cd "${dir}"/build | cd "${dir}"/build | ||||||
|  |  | ||||||
| spack load ${gcc_spec} ${cuda_spec} |  | ||||||
|  |  | ||||||
| cuda_path=$(spack find --format "{prefix}" cuda) | cuda_path=$(spack find --format "{prefix}" cuda) | ||||||
| gdrcopy_path=/mnt/lustre/tursafs1/apps/gdrcopy/2.3.1 | gdrcopy_path=/mnt/lustre/tursafs1/apps/gdrcopy/2.3.1 | ||||||
|  |  | ||||||
| @@ -141,60 +144,62 @@ make -j 128 | |||||||
| make install | make install | ||||||
| cd "${dir}" | cd "${dir}" | ||||||
|  |  | ||||||
|  | ucx_spec_gpu="ucx@1.12.0.GPU%${gcc_spec}" | ||||||
|  | ucx_spec_cpu="ucx@1.12.0.CPU%${gcc_spec}" | ||||||
|  | openmpi_spec_gpu="openmpi@4.1.1.GPU%${gcc_spec}" | ||||||
|  | openmpi_spec_cpu="openmpi@4.1.1.CPU%${gcc_spec}" | ||||||
|  |  | ||||||
| # Add externals to spack | # Add externals to spack | ||||||
| echo "packages: | echo "packages: | ||||||
|   ucx: |   ucx: | ||||||
|     externals: |     externals: | ||||||
|     - spec: \"ucx@1.12.0.GPU%gcc@9.4.0\" |     - spec: \"${ucx_spec_gpu}\" | ||||||
|       prefix: ${dir}/prefix/ucx_gpu |       prefix: ${dir}/prefix/ucx_gpu | ||||||
|     - spec: \"ucx@1.12.0.CPU%gcc@9.4.0\" |     - spec: \"${ucx_spec_cpu}\" | ||||||
|       prefix: ${dir}/prefix/ucx_cpu |       prefix: ${dir}/prefix/ucx_cpu | ||||||
|     buildable: False |     buildable: False | ||||||
|   openmpi: |   openmpi: | ||||||
|     externals: |     externals: | ||||||
|     - spec: \"openmpi@4.1.1.GPU%gcc@9.4.0\" |     - spec: \"${openmpi_spec_gpu}\" | ||||||
|       prefix: ${dir}/prefix/ompi_gpu |       prefix: ${dir}/prefix/ompi_gpu | ||||||
|     - spec: \"openmpi@4.1.1.CPU%gcc@9.4.0\" |     - spec: \"${openmpi_spec_cpu}\" | ||||||
|       prefix: ${dir}/prefix/ompi_cpu |       prefix: ${dir}/prefix/ompi_cpu | ||||||
|     buildable: False" > spack.yaml |     buildable: False" > spack.yaml | ||||||
|  |  | ||||||
| spack config --scope site add -f spack.yaml | spack config --scope site add -f spack.yaml | ||||||
| rm spack.yaml | rm spack.yaml | ||||||
| spack install ucx@1.12.0.GPU%gcc@9.4.0 openmpi@4.1.1.GPU%gcc@9.4.0 | spack env deactivate | ||||||
| spack install ucx@1.12.0.CPU%gcc@9.4.0 openmpi@4.1.1.CPU%gcc@9.4.0 |  | ||||||
|  |  | ||||||
| cd "${cwd}" | cd "${cwd}" | ||||||
|  |  | ||||||
| # environments ################################################################# | # environments ################################################################# | ||||||
| dev_tools=("autoconf" "automake" "libtool" "jq" "git") | dev_tools=("autoconf" "automake" "libtool" "jq" "git") | ||||||
| ompi_gpu_hash=$(spack find --format "{hash}" openmpi@4.1.1.GPU) |  | ||||||
| ompi_cpu_hash=$(spack find --format "{hash}" openmpi@4.1.1.CPU) |  | ||||||
|  |  | ||||||
| spack env create grid-gpu | spack env create grid-gpu | ||||||
| spack env activate grid-gpu | spack env activate grid-gpu | ||||||
| spack add ${gcc_spec} ${cuda_spec} "${dev_tools[@]}"  | spack compiler find --scope site | ||||||
| spack add ucx@1.12.0.GPU%gcc@9.4.0 openmpi@4.1.1.GPU%gcc@9.4.0 | spack add ${gcc_spec} ${cuda_spec} ${ucx_spec_gpu} ${openmpi_spec_gpu} | ||||||
| spack add ${hdf5_spec}+cxx+threadsafe ^/"${ompi_gpu_hash}" | spack add ${hdf5_spec} ${fftw_spec} | ||||||
| spack add fftw ^/"${ompi_gpu_hash}" | spack add openssl gmp mpfr c-lime "${dev_tools[@]}" | ||||||
| spack add openssl gmp mpfr c-lime | spack concretize | ||||||
| spack install | spack env depfile -o Makefile.tmp | ||||||
|  | make -j128 -f Makefile.tmp | ||||||
| spack env deactivate | spack env deactivate | ||||||
|  |  | ||||||
| spack env create grid-cpu | spack env create grid-cpu | ||||||
| spack env activate grid-cpu | spack env activate grid-cpu | ||||||
| spack add llvm "${dev_tools[@]}"  | spack compiler find --scope site | ||||||
| spack add ucx@1.12.0.CPU%gcc@9.4.0 openmpi@4.1.1.CPU%gcc@9.4.0 | spack add ${gcc_spec} ${ucx_spec_cpu} ${openmpi_spec_cpu} | ||||||
| spack add ${hdf5_spec}+cxx+threadsafe ^/"${ompi_cpu_hash}" | spack add ${hdf5_spec} ${fftw_spec} | ||||||
| spack add fftw ^/"${ompi_cpu_hash}" | spack add openssl gmp mpfr c-lime "${dev_tools[@]}" | ||||||
| spack add openssl gmp mpfr c-lime | spack concretize | ||||||
| spack install | spack env depfile -o Makefile.tmp | ||||||
|  | make -j128 -f Makefile.tmp | ||||||
| spack env deactivate | spack env deactivate | ||||||
|  |  | ||||||
| spack install jq git |  | ||||||
|  |  | ||||||
| # Final setup ################################################################## | # Final setup ################################################################## | ||||||
| spack clean | spack clean | ||||||
| spack gc -y | #spack gc -y  # "spack gc" tends to get hung up for unknown reasons | ||||||
|  |  | ||||||
| # add more environment variables in module loading | # add more environment variables in module loading | ||||||
| spack config --scope site add 'modules:prefix_inspections:lib:[LD_LIBRARY_PATH,LIBRARY_PATH]' | spack config --scope site add 'modules:prefix_inspections:lib:[LD_LIBRARY_PATH,LIBRARY_PATH]' | ||||||