Compare commits
	
		
			1 Commits
		
	
	
		
			main
			...
			ee07f3c892
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| ee07f3c892 | 
| @@ -1,7 +1,7 @@ | ||||
| /* | ||||
| Copyright © 2015 Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| Copyright © 2022 Antonin Portelli <antonin.portelli@me.com> | ||||
| Copyright © 2024 Simon Buerger <simon.buerger@rwth-aachen.de> | ||||
| Copyright © 2022 Simon Buerger <simon.buerger@rwth-aachen.de> | ||||
|  | ||||
| This is a fork of Benchmark_ITT.cpp from Grid | ||||
|  | ||||
| @@ -29,43 +29,6 @@ int NN_global; | ||||
|  | ||||
| nlohmann::json json_results; | ||||
|  | ||||
| // NOTE: Grid::GridClock is just a typedef to | ||||
| // `std::chrono::high_resolution_clock`, but `Grid::usecond` rounds to | ||||
| // microseconds (no idea why, probably wasnt ever relevant before), so we need | ||||
| // our own wrapper here. | ||||
| double usecond_precise() | ||||
| { | ||||
|   using namespace std::chrono; | ||||
|   auto nsecs = duration_cast<nanoseconds>(GridClock::now() - Grid::theProgramStart); | ||||
|   return nsecs.count() * 1e-3; | ||||
| } | ||||
|  | ||||
| std::vector<std::string> get_mpi_hostnames() | ||||
| { | ||||
|   int world_size; | ||||
|   MPI_Comm_size(MPI_COMM_WORLD, &world_size); | ||||
|  | ||||
|   char hostname[MPI_MAX_PROCESSOR_NAME]; | ||||
|   int name_len = 0; | ||||
|   MPI_Get_processor_name(hostname, &name_len); | ||||
|  | ||||
|   // Allocate buffer to gather all hostnames | ||||
|   std::vector<char> all_hostnames(world_size * MPI_MAX_PROCESSOR_NAME); | ||||
|  | ||||
|   // Use MPI_Allgather to gather all hostnames on all ranks | ||||
|   MPI_Allgather(hostname, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, all_hostnames.data(), | ||||
|                 MPI_MAX_PROCESSOR_NAME, MPI_CHAR, MPI_COMM_WORLD); | ||||
|  | ||||
|   // Convert the gathered hostnames back into a vector of std::string | ||||
|   std::vector<std::string> hostname_list(world_size); | ||||
|   for (int i = 0; i < world_size; ++i) | ||||
|   { | ||||
|     hostname_list[i] = std::string(&all_hostnames[i * MPI_MAX_PROCESSOR_NAME]); | ||||
|   } | ||||
|  | ||||
|   return hostname_list; | ||||
| } | ||||
|  | ||||
| struct time_statistics | ||||
| { | ||||
|   double mean; | ||||
| @@ -110,7 +73,7 @@ class Benchmark | ||||
|         {local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]}); | ||||
|     GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid( | ||||
|         latt4, GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi()); | ||||
|     Grid::Coordinate shm(4, 1); | ||||
|     Grid::Coordinate shm; | ||||
|     GlobalSharedMemory::GetShmDims(mpi, shm); | ||||
|  | ||||
|     uint64_t NP = TmpGrid->RankCount(); | ||||
| @@ -174,7 +137,7 @@ class Benchmark | ||||
|  | ||||
|     Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd()); | ||||
|     Coordinate mpi_layout = GridDefaultMpi(); | ||||
|     Coordinate shm_layout(Nd, 1); | ||||
|     Coordinate shm_layout; | ||||
|     GlobalSharedMemory::GetShmDims(mpi_layout, shm_layout); | ||||
|  | ||||
|     for (int mu = 0; mu < Nd; mu++) | ||||
| @@ -273,7 +236,7 @@ class Benchmark | ||||
|         double rate = bidibytes / (timestat.mean / 1.e6) / 1024. / 1024. / 1024.; | ||||
|         double rate_err = rate * timestat.err / timestat.mean; | ||||
|         double rate_max = rate * timestat.mean / timestat.min; | ||||
|         grid_printf("%5d %5d %7s %15llu %15.2f %15.2f %15.1f %15.2f\n", lat, dir, | ||||
|         grid_printf("%5d %5d %7s %15d %15.2f %15.2f %15.1f %15.2f\n", lat, dir, | ||||
|                     is_shm           ? "yes" | ||||
|                     : is_partial_shm ? "partial" | ||||
|                                      : "no", | ||||
| @@ -303,166 +266,124 @@ class Benchmark | ||||
|  | ||||
|   static void Latency(void) | ||||
|   { | ||||
|     int Nwarmup = 100; | ||||
|     int Nloop = 300; | ||||
|     int Nloop = 200; | ||||
|     int nmu = 0; | ||||
|  | ||||
|     std::cout << GridLogMessage << "Benchmarking point-to-point latency" << std::endl; | ||||
|     Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd()); | ||||
|     Coordinate mpi_layout = GridDefaultMpi(); | ||||
|     Coordinate shm_layout; | ||||
|     GlobalSharedMemory::GetShmDims(mpi_layout, shm_layout); | ||||
|  | ||||
|     for (int mu = 0; mu < Nd; mu++) | ||||
|       if (mpi_layout[mu] > 1) | ||||
|         nmu++; | ||||
|  | ||||
|     std::vector<double> t_time(Nloop); | ||||
|     time_statistics timestat; | ||||
|  | ||||
|     std::cout << GridLogMessage << "Benchmarking Latency to neighbors in " << nmu | ||||
|               << " dimensions" << std::endl; | ||||
|     grid_small_sep(); | ||||
|     grid_printf("from to      mean(usec)           err           max\n"); | ||||
|     grid_printf("%5s %7s %15s %15s %15s\n", "dir", "shm", "time (usec)", "std dev", | ||||
|                 "min"); | ||||
|  | ||||
|     int ranks; | ||||
|     int me; | ||||
|     MPI_Comm_size(MPI_COMM_WORLD, &ranks); | ||||
|     MPI_Comm_rank(MPI_COMM_WORLD, &me); | ||||
|     int lat = 8; // dummy lattice size. Not really used. | ||||
|     Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2], | ||||
|                           lat * mpi_layout[3]}); | ||||
|  | ||||
|     int bytes = 8; | ||||
|     void *buf_from = acceleratorAllocDevice(bytes); | ||||
|     void *buf_to = acceleratorAllocDevice(bytes); | ||||
|     nlohmann::json json_latency; | ||||
|     for (int from = 0; from < ranks; ++from) | ||||
|       for (int to = 0; to < ranks; ++to) | ||||
|     GridCartesian Grid(latt_size, simd_layout, mpi_layout); | ||||
|     RealD Nrank = Grid._Nprocessors; | ||||
|     RealD Nnode = Grid.NodeCount(); | ||||
|     RealD ppn = Nrank / Nnode; | ||||
|  | ||||
|     std::vector<HalfSpinColourVectorD *> xbuf(8); | ||||
|     std::vector<HalfSpinColourVectorD *> rbuf(8); | ||||
|     uint64_t bytes = 8; | ||||
|     for (int d = 0; d < 8; d++) | ||||
|     { | ||||
|       xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); | ||||
|       rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); | ||||
|     } | ||||
|  | ||||
|     double dbytes; | ||||
| #define NWARMUP 50 | ||||
|  | ||||
|     for (int dir = 0; dir < 8; dir++) | ||||
|     { | ||||
|       int mu = dir % 4; | ||||
|       if (mpi_layout[mu] == 1) // skip directions that are not distributed | ||||
|         continue; | ||||
|       bool is_shm = mpi_layout[mu] == shm_layout[mu]; | ||||
|       bool is_partial_shm = !is_shm && shm_layout[mu] != 1; | ||||
|  | ||||
|       std::vector<double> times(Nloop); | ||||
|       for (int i = 0; i < NWARMUP; i++) | ||||
|       { | ||||
|         if (from == to) | ||||
|           continue; | ||||
|         int xmit_to_rank; | ||||
|         int recv_from_rank; | ||||
|  | ||||
|         std::vector<double> t_time(Nloop); | ||||
|         time_statistics timestat; | ||||
|         MPI_Status status; | ||||
|  | ||||
|         for (int i = -Nwarmup; i < Nloop; ++i) | ||||
|         if (dir == mu) | ||||
|         { | ||||
|           double start = usecond_precise(); | ||||
|           if (from == me) | ||||
|           { | ||||
|             auto err = MPI_Send(buf_from, bytes, MPI_CHAR, to, 0, MPI_COMM_WORLD); | ||||
|             assert(err == MPI_SUCCESS); | ||||
|           } | ||||
|           if (to == me) | ||||
|           { | ||||
|             auto err = | ||||
|                 MPI_Recv(buf_to, bytes, MPI_CHAR, from, 0, MPI_COMM_WORLD, &status); | ||||
|             assert(err == MPI_SUCCESS); | ||||
|           } | ||||
|           double stop = usecond_precise(); | ||||
|           if (i >= 0) | ||||
|             t_time[i] = stop - start; | ||||
|           int comm_proc = 1; | ||||
|           Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); | ||||
|         } | ||||
|         // important: only 'from' and 'to' have meaningful timings. we use | ||||
|         // 'from's. | ||||
|         MPI_Bcast(t_time.data(), Nloop, MPI_DOUBLE, from, MPI_COMM_WORLD); | ||||
|  | ||||
|         timestat.statistics(t_time); | ||||
|         grid_printf("%2d %2d %15.4f %15.3f %15.4f\n", from, to, timestat.mean, | ||||
|                     timestat.err, timestat.max); | ||||
|         nlohmann::json tmp; | ||||
|         tmp["from"] = from; | ||||
|         tmp["to"] = to; | ||||
|         tmp["time_usec"] = timestat.mean; | ||||
|         tmp["time_usec_error"] = timestat.err; | ||||
|         tmp["time_usec_min"] = timestat.min; | ||||
|         tmp["time_usec_max"] = timestat.max; | ||||
|         tmp["time_usec_full"] = t_time; | ||||
|         json_latency.push_back(tmp); | ||||
|         else | ||||
|         { | ||||
|           int comm_proc = mpi_layout[mu] - 1; | ||||
|           Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); | ||||
|         } | ||||
|         Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, (void *)&rbuf[dir][0], | ||||
|                             recv_from_rank, bytes); | ||||
|       } | ||||
|     json_results["latency"] = json_latency; | ||||
|  | ||||
|     acceleratorFreeDevice(buf_from); | ||||
|     acceleratorFreeDevice(buf_to); | ||||
|   } | ||||
|  | ||||
|   static void P2P(void) | ||||
|   { | ||||
|     // IMPORTANT: The P2P benchmark uses "MPI_COMM_WORLD" communicator, which is | ||||
|     // not the quite the same as Grid.communicator. Practically speaking, the | ||||
|     // latter one contains the same MPI-ranks but in a different order. Grid | ||||
|     // does this make sure it can exploit ranks with shared memory (i.e. | ||||
|     // multiple ranks on the same node) as best as possible. | ||||
|  | ||||
|     // buffer-size to benchmark. This number is the same as the largest one used | ||||
|     // in the "Comms()" benchmark. ( L=48, Ls=12, double-prec-complex, | ||||
|     // half-color-spin-vector. ). Mostly an arbitrary choice, but nice to match | ||||
|     // it here | ||||
|     size_t bytes = 127401984; | ||||
|  | ||||
|     int Nwarmup = 20; | ||||
|     int Nloop = 100; | ||||
|  | ||||
|     std::cout << GridLogMessage << "Benchmarking point-to-point bandwidth" << std::endl; | ||||
|     grid_small_sep(); | ||||
|     grid_printf("from to      mean(usec)           err           min           " | ||||
|                 "bytes    rate (GiB/s)\n"); | ||||
|  | ||||
|     int ranks; | ||||
|     int me; | ||||
|     MPI_Comm_size(MPI_COMM_WORLD, &ranks); | ||||
|     MPI_Comm_rank(MPI_COMM_WORLD, &me); | ||||
|  | ||||
|     void *buf_from = acceleratorAllocDevice(bytes); | ||||
|     void *buf_to = acceleratorAllocDevice(bytes); | ||||
|     nlohmann::json json_p2p; | ||||
|     for (int from = 0; from < ranks; ++from) | ||||
|       for (int to = 0; to < ranks; ++to) | ||||
|       for (int i = 0; i < Nloop; i++) | ||||
|       { | ||||
|         if (from == to) | ||||
|           continue; | ||||
|  | ||||
|         std::vector<double> t_time(Nloop); | ||||
|         time_statistics timestat; | ||||
|         MPI_Status status; | ||||
|         dbytes = 0; | ||||
|         double start = usecond(); | ||||
|         int xmit_to_rank; | ||||
|         int recv_from_rank; | ||||
|  | ||||
|         for (int i = -Nwarmup; i < Nloop; ++i) | ||||
|         if (dir == mu) | ||||
|         { | ||||
|           double start = usecond_precise(); | ||||
|           if (from == me) | ||||
|           { | ||||
|             auto err = MPI_Send(buf_from, bytes, MPI_CHAR, to, 0, MPI_COMM_WORLD); | ||||
|             assert(err == MPI_SUCCESS); | ||||
|           } | ||||
|           if (to == me) | ||||
|           { | ||||
|             auto err = | ||||
|                 MPI_Recv(buf_to, bytes, MPI_CHAR, from, 0, MPI_COMM_WORLD, &status); | ||||
|             assert(err == MPI_SUCCESS); | ||||
|           } | ||||
|           double stop = usecond_precise(); | ||||
|           if (i >= 0) | ||||
|             t_time[i] = stop - start; | ||||
|           int comm_proc = 1; | ||||
|           Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); | ||||
|         } | ||||
|         // important: only 'from' and 'to' have meaningful timings. we use | ||||
|         // 'from's. | ||||
|         MPI_Bcast(t_time.data(), Nloop, MPI_DOUBLE, from, MPI_COMM_WORLD); | ||||
|         else | ||||
|         { | ||||
|           int comm_proc = mpi_layout[mu] - 1; | ||||
|           Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); | ||||
|         } | ||||
|         Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, (void *)&rbuf[dir][0], | ||||
|                             recv_from_rank, bytes); | ||||
|         dbytes += bytes; | ||||
|  | ||||
|         timestat.statistics(t_time); | ||||
|         double rate = bytes / (timestat.mean / 1.e6) / 1024. / 1024. / 1024.; | ||||
|         double rate_err = rate * timestat.err / timestat.mean; | ||||
|         double rate_max = rate * timestat.mean / timestat.min; | ||||
|         double rate_min = rate * timestat.mean / timestat.max; | ||||
|  | ||||
|         grid_printf("%2d %2d %15.4f %15.3f %15.4f %15zu %15.2f\n", from, to, timestat.mean, | ||||
|                     timestat.err, timestat.min, bytes, rate); | ||||
|  | ||||
|         nlohmann::json tmp; | ||||
|         tmp["from"] = from; | ||||
|         tmp["to"] = to; | ||||
|         tmp["bytes"] = bytes; | ||||
|         tmp["time_usec"] = timestat.mean; | ||||
|         tmp["time_usec_error"] = timestat.err; | ||||
|         tmp["time_usec_min"] = timestat.min; | ||||
|         tmp["time_usec_max"] = timestat.max; | ||||
|         tmp["time_usec_full"] = t_time; | ||||
|         nlohmann::json tmp_rate; | ||||
|         tmp_rate["mean"] = rate; | ||||
|         tmp_rate["error"] = rate_err; | ||||
|         tmp_rate["max"] = rate_max; | ||||
|         tmp_rate["min"] = rate_min; | ||||
|         tmp["rate_GBps"] = tmp_rate; | ||||
|  | ||||
|         json_p2p.push_back(tmp); | ||||
|         double stop = usecond(); | ||||
|         t_time[i] = stop - start; // microseconds | ||||
|       } | ||||
|     json_results["p2p"] = json_p2p; | ||||
|       timestat.statistics(t_time); | ||||
|  | ||||
|     acceleratorFreeDevice(buf_from); | ||||
|     acceleratorFreeDevice(buf_to); | ||||
|       grid_printf("%5d %7s %15.2f %15.1f %15.2f\n", dir, | ||||
|                   is_shm           ? "yes" | ||||
|                   : is_partial_shm ? "partial" | ||||
|                                    : "no", | ||||
|                   timestat.mean, timestat.err, timestat.min); | ||||
|       nlohmann::json tmp; | ||||
|       nlohmann::json tmp_rate; | ||||
|       tmp["dir"] = dir; | ||||
|       tmp["shared_mem"] = is_shm; | ||||
|       tmp["partial_shared_mem"] = is_partial_shm; | ||||
|       tmp["time_usec"] = timestat.mean; | ||||
|       tmp["time_usec_error"] = timestat.err; | ||||
|       tmp["time_usec_max"] = timestat.min; | ||||
|       json_results["latency"].push_back(tmp); | ||||
|     } | ||||
|     for (int d = 0; d < 8; d++) | ||||
|     { | ||||
|       acceleratorFreeDevice(xbuf[d]); | ||||
|       acceleratorFreeDevice(rbuf[d]); | ||||
|     } | ||||
|  | ||||
|     return; | ||||
|   } | ||||
|  | ||||
|   static void Memory(void) | ||||
| @@ -726,6 +647,8 @@ class Benchmark | ||||
|  | ||||
|         FGrid->Broadcast(0, &ncall, sizeof(ncall)); | ||||
|  | ||||
|         Dw.ZeroCounters(); | ||||
|  | ||||
|         time_statistics timestat; | ||||
|         std::vector<double> t_time(ncall); | ||||
|         for (uint64_t i = 0; i < ncall; i++) | ||||
| @@ -920,6 +843,7 @@ class Benchmark | ||||
|         uint64_t ncall = 500; | ||||
|  | ||||
|         FGrid->Broadcast(0, &ncall, sizeof(ncall)); | ||||
|         Ds.ZeroCounters(); | ||||
|  | ||||
|         time_statistics timestat; | ||||
|         std::vector<double> t_time(ncall); | ||||
| @@ -987,54 +911,29 @@ int main(int argc, char **argv) | ||||
| { | ||||
|   Grid_init(&argc, &argv); | ||||
|  | ||||
|   int Ls = 1; | ||||
|   bool do_su4 = true; | ||||
|   bool do_memory = true; | ||||
|   bool do_comms = true; | ||||
|   bool do_flops = true; | ||||
|  | ||||
|   // NOTE: these two take O((number of ranks)^2) time, which might be a lot, so they are | ||||
|   // off by default | ||||
|   bool do_latency = false; | ||||
|   bool do_p2p = false; | ||||
|  | ||||
|   std::string json_filename = ""; // empty indicates no json output | ||||
|   for (int i = 0; i < argc; i++) | ||||
|   { | ||||
|     auto arg = std::string(argv[i]); | ||||
|     if (arg == "--json-out") | ||||
|     if (std::string(argv[i]) == "--json-out") | ||||
|       json_filename = argv[i + 1]; | ||||
|     if (arg == "--benchmark-su4") | ||||
|       do_su4 = true; | ||||
|     if (arg == "--benchmark-memory") | ||||
|       do_memory = true; | ||||
|     if (arg == "--benchmark-comms") | ||||
|       do_comms = true; | ||||
|     if (arg == "--benchmark-flops") | ||||
|       do_flops = true; | ||||
|     if (arg == "--benchmark-latency") | ||||
|       do_latency = true; | ||||
|     if (arg == "--benchmark-p2p") | ||||
|       do_p2p = true; | ||||
|     if (arg == "--no-benchmark-su4") | ||||
|       do_su4 = false; | ||||
|     if (arg == "--no-benchmark-memory") | ||||
|       do_memory = false; | ||||
|     if (arg == "--no-benchmark-comms") | ||||
|       do_comms = false; | ||||
|     if (arg == "--no-benchmark-flops") | ||||
|       do_flops = false; | ||||
|     if (arg == "--no-benchmark-latency") | ||||
|       do_latency = false; | ||||
|     if (arg == "--no-benchmark-p2p") | ||||
|       do_p2p = false; | ||||
|   } | ||||
|  | ||||
|   CartesianCommunicator::SetCommunicatorPolicy( | ||||
|       CartesianCommunicator::CommunicatorPolicySequential); | ||||
|  | ||||
| #ifdef KNL | ||||
|   LebesgueOrder::Block = std::vector<int>({8, 2, 2, 2}); | ||||
| #else | ||||
|   LebesgueOrder::Block = std::vector<int>({2, 2, 2, 2}); | ||||
| #endif | ||||
|   Benchmark::Decomposition(); | ||||
|  | ||||
|   int do_su4 = 1; | ||||
|   int do_memory = 1; | ||||
|   int do_comms = 1; | ||||
|   int do_latency = 1; | ||||
|   int do_flops = 1; | ||||
|   int Ls = 1; | ||||
|  | ||||
|   int sel = 4; | ||||
|   std::vector<int> L_list({8, 12, 16, 24, 32}); | ||||
|   int selm1 = sel - 1; | ||||
| @@ -1075,14 +974,6 @@ int main(int argc, char **argv) | ||||
|     Benchmark::Latency(); | ||||
|   } | ||||
|  | ||||
|   if (do_p2p) | ||||
|   { | ||||
|     grid_big_sep(); | ||||
|     std::cout << GridLogMessage << " Point-To-Point benchmark " << std::endl; | ||||
|     grid_big_sep(); | ||||
|     Benchmark::P2P(); | ||||
|   } | ||||
|  | ||||
|   if (do_flops) | ||||
|   { | ||||
|     Ls = 1; | ||||
| @@ -1142,8 +1033,6 @@ int main(int argc, char **argv) | ||||
|     json_results["flops"] = tmp_flops; | ||||
|   } | ||||
|  | ||||
|   json_results["hostnames"] = get_mpi_hostnames(); | ||||
|  | ||||
|   if (!json_filename.empty()) | ||||
|   { | ||||
|     std::cout << GridLogMessage << "writing benchmark results to " << json_filename | ||||
|   | ||||
| @@ -1,12 +1,13 @@ | ||||
| #!/usr/bin/env bash | ||||
|  | ||||
| lrank=$OMPI_COMM_WORLD_LOCAL_RANK | ||||
| numa1=$((lrank)) | ||||
| numa1=$(( 2 * lrank)) | ||||
| numa2=$(( 2 * lrank + 1 )) | ||||
| netdev=mlx5_${lrank}:1 | ||||
|  | ||||
| export CUDA_VISIBLE_DEVICES=$OMPI_COMM_WORLD_LOCAL_RANK | ||||
| export UCX_NET_DEVICES=${netdev} | ||||
| BINDING="--interleave=$numa1" | ||||
| BINDING="--interleave=$numa1,$numa2" | ||||
|  | ||||
| echo "$(hostname) - $lrank device=$CUDA_VISIBLE_DEVICES binding=$BINDING" | ||||
|  | ||||
|   | ||||
| @@ -80,7 +80,7 @@ mkdir -p build_gpu; cd build_gpu | ||||
|              --enable-devel-headers --enable-examples --enable-optimizations   \ | ||||
|              --with-gdrcopy=${gdrcopy_path} --with-verbs --disable-logging     \ | ||||
|              --disable-debug --disable-assertions --enable-cma                 \ | ||||
|              --with-knem=/opt/knem-1.1.4.90mlnx2/ --with-rdmacm                \ | ||||
|              --with-knem=/opt/knem-1.1.4.90mlnx1/ --with-rdmacm                \ | ||||
|              --without-rocm --without-ugni --without-java                      \ | ||||
|              --enable-compiler-opt=3 --with-cuda="${cuda_path}" --without-cm   \ | ||||
|              --with-rc --with-ud --with-dc --with-mlx5-dv --with-dm            \ | ||||
| @@ -96,7 +96,7 @@ mkdir -p build_cpu; cd build_cpu | ||||
|              --enable-devel-headers --enable-examples --enable-optimizations   \ | ||||
|              --with-verbs --disable-logging --disable-debug                    \ | ||||
|              --disable-assertions --enable-mt --enable-cma                     \ | ||||
|               --with-knem=/opt/knem-1.1.4.90mlnx2/--with-rdmacm                \ | ||||
|              --with-knem=/opt/knem-1.1.4.90mlnx1/ --with-rdmacm                \ | ||||
|              --without-rocm --without-ugni --without-java                      \ | ||||
|              --enable-compiler-opt=3 --without-cm --without-ugni --with-rc     \ | ||||
|              --with-ud --with-dc --with-mlx5-dv --with-dm --enable-mt --without-go | ||||
| @@ -122,7 +122,7 @@ mkdir build_gpu; cd build_gpu | ||||
| ../configure --prefix="${dir}"/prefix/ompi_gpu --without-xpmem    \ | ||||
|              --with-ucx="${dir}"/prefix/ucx_gpu                   \ | ||||
|              --with-ucx-libdir="${dir}"/prefix/ucx_gpu/lib        \ | ||||
|              --with-knem=/opt/knem-1.1.4.90mlnx2/                 \ | ||||
|              --with-knem=/opt/knem-1.1.4.90mlnx1/                 \ | ||||
|              --enable-mca-no-build=btl-uct                        \ | ||||
|              --with-cuda="${cuda_path}" --disable-getpwuid        \ | ||||
|              --with-verbs --with-slurm --enable-mpi-fortran=all   \ | ||||
| @@ -136,7 +136,7 @@ mkdir build_cpu; cd build_cpu | ||||
| ../configure --prefix="${dir}"/prefix/ompi_cpu --without-xpmem    \ | ||||
|              --with-ucx="${dir}"/prefix/ucx_cpu                   \ | ||||
|              --with-ucx-libdir="${dir}"/prefix/ucx_cpu/lib        \ | ||||
|              --with-knem=/opt/knem-1.1.4.90mlnx2/                 \ | ||||
|              --with-knem=/opt/knem-1.1.4.90mlnx1/                 \ | ||||
|              --enable-mca-no-build=btl-uct --disable-getpwuid     \ | ||||
|              --with-verbs --with-slurm --enable-mpi-fortran=all   \ | ||||
|              --with-pmix=internal --with-libevent=internal | ||||
|   | ||||
| @@ -1,14 +0,0 @@ | ||||
| { | ||||
|   BasedOnStyle: LLVM, | ||||
|   UseTab: Never, | ||||
|   IndentWidth: 2, | ||||
|   TabWidth: 2, | ||||
|   BreakBeforeBraces: Allman, | ||||
|   AllowShortIfStatementsOnASingleLine: false, | ||||
|   IndentCaseLabels: false, | ||||
|   ColumnLimit: 90, | ||||
|   AccessModifierOffset: -4, | ||||
|   NamespaceIndentation: All, | ||||
|   FixNamespaceComments: false, | ||||
|   SortIncludes: true, | ||||
| } | ||||
| @@ -1,458 +0,0 @@ | ||||
| #include <algorithm> | ||||
| #include <array> | ||||
| #include <blas_quda.h> | ||||
| #include <cassert> | ||||
| #include <chrono> | ||||
| #include <color_spinor_field.h> | ||||
| #include <communicator_quda.h> | ||||
| #include <dirac_quda.h> | ||||
| #include <fstream> | ||||
| #include <gauge_tools.h> | ||||
| #include <memory> | ||||
| #include <mpi.h> | ||||
| #include <stdio.h> | ||||
| #include <stdlib.h> | ||||
|  | ||||
| // remove to use QUDA's own flop counting instead of Grid's convention | ||||
| #define FLOP_COUNTING_GRID | ||||
|  | ||||
| #include "json.hpp" | ||||
| using nlohmann::json; | ||||
| json json_results; | ||||
|  | ||||
| using namespace quda; | ||||
|  | ||||
| // thanks chatGPT :) | ||||
| std::string get_timestamp() | ||||
| { | ||||
|   // Get the current time | ||||
|   auto now = std::chrono::system_clock::now(); | ||||
|  | ||||
|   // Convert the current time to a time_t object | ||||
|   std::time_t currentTime = std::chrono::system_clock::to_time_t(now); | ||||
|  | ||||
|   // Format the time using std::put_time | ||||
|   std::stringstream ss; | ||||
|   ss << std::put_time(std::localtime(¤tTime), "%Y%m%d %H:%M:%S"); | ||||
|  | ||||
|   return ss.str(); | ||||
| } | ||||
|  | ||||
| // This is the MPI grid, i.e. the layout of ranks | ||||
| int nranks = -1; | ||||
| std::array<int, 4> mpi_grid = {1, 1, 1, 1}; | ||||
|  | ||||
| // run f() in a loop for roughly target_time seconds | ||||
| // returns seconds per iteration it took | ||||
| template <class F> double bench(F const &f, double target_time, int niter_warmup = 5) | ||||
| { | ||||
|   device_timer_t timer; | ||||
|   timer.start(); | ||||
|   for (int iter = 0; iter < niter_warmup; ++iter) | ||||
|     f(); | ||||
|   timer.stop(); | ||||
|  | ||||
|   double secs = timer.last() / niter_warmup; | ||||
|   int niter = std::max(1, int(target_time / secs)); | ||||
|   // niter = std::min(1000, niter); | ||||
|   // printfQuda("during warmup took %f s/iter, deciding on %d iters\n", secs, niter); | ||||
|  | ||||
|   // important: each rank has its own timer, so their measurements can slightly vary. But | ||||
|   // 'niter' needs to be consistent (bug took me a couple hours to track down) | ||||
|   comm_broadcast_global(&niter, sizeof(niter), 0); | ||||
|  | ||||
|   timer.reset(__FUNCTION__, __FILE__, __LINE__); | ||||
|   timer.start(); | ||||
|   for (int iter = 0; iter < niter; ++iter) | ||||
|     f(); | ||||
|   timer.stop(); | ||||
|   return timer.last() / niter; | ||||
| } | ||||
|  | ||||
| void initComms(int argc, char **argv) | ||||
| { | ||||
|   // init MPI communication | ||||
|   MPI_Init(&argc, &argv); | ||||
|  | ||||
|   MPI_Comm_size(MPI_COMM_WORLD, &nranks); | ||||
|   assert(1 <= nranks && nranks <= 100000); | ||||
|  | ||||
|   mpi_grid[3] = nranks; | ||||
|  | ||||
|   // this maps coordinates to rank number | ||||
|   auto lex_rank_from_coords = [](int const *coords, void *) | ||||
|   { | ||||
|     int rank = coords[0]; | ||||
|     for (int i = 1; i < 4; i++) | ||||
|       rank = mpi_grid[i] * rank + coords[i]; | ||||
|     return rank; | ||||
|   }; | ||||
|  | ||||
|   initCommsGridQuda(4, mpi_grid.data(), lex_rank_from_coords, nullptr); | ||||
|  | ||||
|   for (int d = 0; d < 4; d++) | ||||
|     if (mpi_grid[d] > 1) | ||||
|       commDimPartitionedSet(d); | ||||
|  | ||||
|   json_results["geometry"]["ranks"] = nranks; | ||||
|   json_results["geometry"]["mpi"] = mpi_grid; | ||||
| } | ||||
|  | ||||
| // creates a random gauge field. L = local(!) size | ||||
| cudaGaugeField make_gauge_field(int L) | ||||
| { | ||||
|   GaugeFieldParam param; | ||||
|  | ||||
|   // dimension and type of the lattice object | ||||
|   param.nDim = 4; | ||||
|   param.x[0] = L; | ||||
|   param.x[1] = L; | ||||
|   param.x[2] = L; | ||||
|   param.x[3] = L; | ||||
|  | ||||
|   // number of colors. potentially confusingly, QUDA sometimes uses the word "color" to | ||||
|   // things unrelated with physical color. things like "nColor=32" do pop up in deflation | ||||
|   // solvers where it (to my understanding) refers to the number of (parallely processed) | ||||
|   // deflation vectors. | ||||
|   param.nColor = 3; | ||||
|  | ||||
|   // boundary conditions (dont really care for benchmark) | ||||
|   param.t_boundary = QUDA_PERIODIC_T; | ||||
|  | ||||
|   // for this benchmark we only need "SINGLE" and/or "DOUBLE" precision. But smaller | ||||
|   // precisions are available in QUDA too | ||||
|   param.setPrecision(QUDA_SINGLE_PRECISION); | ||||
|  | ||||
|   // no even/odd subset, we want a full lattice | ||||
|   param.siteSubset = QUDA_FULL_SITE_SUBSET; | ||||
|  | ||||
|   // what kind of 3x3 matrices the field contains. A proper gauge field has SU(3) | ||||
|   // matrices, but (for example) smeared/thick links could have non-unitary links. | ||||
|   param.link_type = QUDA_SU3_LINKS; | ||||
|  | ||||
|   // "NULL" does not initialize the field upon creation, "ZERO" would set everything to 0 | ||||
|   param.create = QUDA_NULL_FIELD_CREATE; | ||||
|  | ||||
|   // field should be allocated directly on the accelerator/GPU | ||||
|   param.location = QUDA_CUDA_FIELD_LOCATION; | ||||
|  | ||||
|   // "reconstruct" here means reconstructing a SU(3) matrix from fewer than 18 real | ||||
|   // numbers (=3x3 complex numbers). Great feature in production (saving | ||||
|   // memory/cache/network bandwidth), not used for this benchmark. | ||||
|   param.reconstruct = QUDA_RECONSTRUCT_NO; | ||||
|  | ||||
|   // "ghostExchange" would often be called "halo exchange" outside of Quda. This has | ||||
|   // nothing to do with ghost fields from continuum/perturbative qcd. | ||||
|   param.ghostExchange = QUDA_GHOST_EXCHANGE_NO; | ||||
|  | ||||
|   // This controls the physical order of elements. "float2" is the the default | ||||
|   param.order = QUDA_FLOAT2_GAUGE_ORDER; | ||||
|  | ||||
|   // this means the field is a LORENTZ vector (which a gauge field must be). Has nothing | ||||
|   // to do with spin. | ||||
|   param.geometry = QUDA_VECTOR_GEOMETRY; | ||||
|  | ||||
|   // create the field and fill with random SU(3) matrices | ||||
|   // std::cout << param << std::endl; // double-check parameters | ||||
|   auto U = cudaGaugeField(param); | ||||
|   gaugeGauss(U, /*seed=*/1234, 1.0); | ||||
|   return U; | ||||
| } | ||||
|  | ||||
| // create a random source vector (L = local size) | ||||
| ColorSpinorField make_source(int L, int Ls = 1) | ||||
| { | ||||
|   // NOTE: `param.x` directly determines the size of the (local, per rank) memory | ||||
|   // allocation. Thus for checkerboarding, we have to specifly x=(L/2,L,L,L) to get a | ||||
|   // physical local volume of L^4, thus implicity choosing a dimension for the | ||||
|   // checkerboarding (shouldnt really matter of course which one). | ||||
|   ColorSpinorParam param; | ||||
|   param.nColor = 3; | ||||
|   param.nSpin = 4; | ||||
|   param.nVec = 1; // only a single vector | ||||
|   param.pad = 0; | ||||
|   param.siteSubset = QUDA_PARITY_SITE_SUBSET; | ||||
|   param.nDim = Ls == 1 ? 4 : 5; | ||||
|   param.x[0] = L / 2; | ||||
|   param.x[1] = L; | ||||
|   param.x[2] = L; | ||||
|   param.x[3] = L; | ||||
|   param.x[4] = Ls; | ||||
|   param.pc_type = QUDA_4D_PC; | ||||
|   param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; | ||||
|  | ||||
|   // somewhat surprisingly, the DiracWilson::Dslash(...) function only works with the | ||||
|   // UKQCD_GAMMA_BASIS | ||||
|   param.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; | ||||
|  | ||||
|   param.create = QUDA_NULL_FIELD_CREATE; // do not (zero-) initilize the field | ||||
|   param.setPrecision(QUDA_SINGLE_PRECISION); | ||||
|   param.location = QUDA_CUDA_FIELD_LOCATION; | ||||
|  | ||||
|   // create the field and fill it with random values | ||||
|   auto src = ColorSpinorField(param); | ||||
|   quda::RNG rng(src, 1234); | ||||
|   spinorNoise(src, rng, QUDA_NOISE_GAUSS); | ||||
|   /*printfQuda( | ||||
|       "created src with norm = %f (sanity check: should be close to %f) and %f bytes\n", | ||||
|       blas::norm2(src), 2.0 * 12 * geom[0] * geom[1] * geom[2] * geom[3], | ||||
|       src.Bytes() * 1.0);*/ | ||||
|   // src.PrintDims(); | ||||
|  | ||||
|   return src; | ||||
| } | ||||
|  | ||||
| void benchmark_wilson(std::vector<int> const &L_list, double target_time) | ||||
| { | ||||
|   printfQuda("==================== wilson dirac operator ====================\n"); | ||||
| #ifdef FLOP_COUNTING_GRID | ||||
|   printfQuda("IMPORTANT: flop counting as in Benchmark_Grid\n"); | ||||
| #else | ||||
|   printfQuda("IMPORTANT: flop counting by QUDA's own convention (different from " | ||||
|              "Benchmark_Grid)\n"); | ||||
| #endif | ||||
|   printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank"); | ||||
|  | ||||
|   for (int L : L_list) | ||||
|   { | ||||
|     // printfQuda("starting wilson L=%d\n", L); | ||||
|  | ||||
|     auto U = make_gauge_field(L); | ||||
|     auto src = make_source(L); | ||||
|  | ||||
|     // create (Wilson) dirac operator | ||||
|     DiracParam param; | ||||
|     param.kappa = 0.10; | ||||
|     param.dagger = QUDA_DAG_NO; | ||||
|     param.matpcType = QUDA_MATPC_EVEN_EVEN; | ||||
|     auto dirac = DiracWilson(param); | ||||
|  | ||||
|     // insert gauge field into the dirac operator | ||||
|     // (the additional nullptr's are for smeared links and fancy preconditioners and such. | ||||
|     // Not used for simple Wilson fermions) | ||||
|     dirac.updateFields(&U, nullptr, nullptr, nullptr); | ||||
|     auto res = ColorSpinorField(ColorSpinorParam(src)); | ||||
|     auto f = [&]() { dirac.Dslash(res, src, QUDA_EVEN_PARITY); }; | ||||
|  | ||||
|     // first run to get the quda tuning out of the way | ||||
|     dirac.Flops(); // reset flops counter | ||||
|     f(); | ||||
|     double flops = 1.0 * dirac.Flops(); | ||||
|  | ||||
|     // actual benchmarking | ||||
|     auto start_time = get_timestamp(); | ||||
|     double secs = bench(f, target_time); | ||||
|     auto end_time = get_timestamp(); | ||||
|  | ||||
| #ifdef FLOP_COUNTING_GRID | ||||
|     // this is the flop counting from Benchmark_Grid | ||||
|     double Nc = 3; | ||||
|     double Nd = 4; | ||||
|     double Ns = 4; | ||||
|     flops = (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2); | ||||
|     flops *= L * L * L * L / 2.0; | ||||
| #endif | ||||
|  | ||||
|     printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9); | ||||
|  | ||||
|     json tmp; | ||||
|     tmp["L"] = L; | ||||
|     tmp["Gflops_wilson"] = flops / secs * 1e-9; | ||||
|     tmp["start_time"] = start_time; | ||||
|     tmp["end_time"] = end_time; | ||||
|     json_results["flops"]["results"].push_back(tmp); | ||||
|   } | ||||
| } | ||||
|  | ||||
| void benchmark_dwf(std::vector<int> const &L_list, double target_time) | ||||
| { | ||||
|   printfQuda("==================== domain wall dirac operator ====================\n"); | ||||
| #ifdef FLOP_COUNTING_GRID | ||||
|   printfQuda("IMPORTANT: flop counting as in Benchmark_Grid\n"); | ||||
| #else | ||||
|   printfQuda("IMPORTANT: flop counting by QUDA's own convention (different from " | ||||
|              "Benchmark_Grid)\n"); | ||||
| #endif | ||||
|   printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank"); | ||||
|   int Ls = 12; | ||||
|   for (int L : L_list) | ||||
|   { | ||||
|     // printfQuda("starting dwf L=%d\n", L); | ||||
|     auto U = make_gauge_field(L); | ||||
|     auto src = make_source(L, Ls); | ||||
|  | ||||
|     // create dirac operator | ||||
|     DiracParam param; | ||||
|     param.kappa = 0.10; | ||||
|     param.Ls = Ls; | ||||
|     param.m5 = 0.1; | ||||
|     param.dagger = QUDA_DAG_NO; | ||||
|     param.matpcType = QUDA_MATPC_EVEN_EVEN; | ||||
|     auto dirac = DiracDomainWall(param); | ||||
|  | ||||
|     // insert gauge field into the dirac operator | ||||
|     // (the additional nullptr's are for smeared links and fancy preconditioners and such) | ||||
|     dirac.updateFields(&U, nullptr, nullptr, nullptr); | ||||
|     auto res = ColorSpinorField(ColorSpinorParam(src)); | ||||
|     auto f = [&]() { dirac.Dslash(res, src, QUDA_EVEN_PARITY); }; | ||||
|  | ||||
|     // first run to get the quda tuning out of the way | ||||
|     dirac.Flops(); // reset flops counter | ||||
|     f(); | ||||
|     double flops = 1.0 * dirac.Flops(); | ||||
|  | ||||
|     // actual benchmarking | ||||
|     auto start_time = get_timestamp(); | ||||
|     double secs = bench(f, target_time); | ||||
|     auto end_time = get_timestamp(); | ||||
|  | ||||
| #ifdef FLOP_COUNTING_GRID | ||||
|     // this is the flop counting from Benchmark_Grid | ||||
|     double Nc = 3; | ||||
|     double Nd = 4; | ||||
|     double Ns = 4; | ||||
|     flops = (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2); | ||||
|     flops *= L * L * L * L * Ls / 2.0; | ||||
| #endif | ||||
|  | ||||
|     printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9); | ||||
|     json tmp; | ||||
|     tmp["L"] = L; | ||||
|     tmp["Gflops_dwf4"] = flops / secs * 1e-9; | ||||
|     tmp["start_time"] = start_time; | ||||
|     tmp["end_time"] = end_time; | ||||
|     json_results["flops"]["results"].push_back(tmp); | ||||
|   } | ||||
| } | ||||
|  | ||||
| void benchmark_axpy(std::vector<int> const &L_list, double target_time) | ||||
| { | ||||
|   // number of iterations for warmup / measurement | ||||
|   // (feel free to change for noise/time tradeoff) | ||||
|   constexpr int niter_warmup = 5; | ||||
|  | ||||
|   printfQuda("==================== axpy / memory ====================\n"); | ||||
|  | ||||
|   ColorSpinorParam param; | ||||
|   param.nDim = 4;   // 4-dimensional lattice | ||||
|   param.x[4] = 1;   // no fifth dimension | ||||
|   param.nColor = 3; // supported values for nSpin/nColor are configured when compiling | ||||
|                     // QUDA. "3*4" will probably always be enabled, so we stick with this | ||||
|   param.nSpin = 4; | ||||
|   param.nVec = 1;                            // just a single vector | ||||
|   param.siteSubset = QUDA_FULL_SITE_SUBSET;  // full lattice = no odd/even | ||||
|   param.pad = 0;                             // no padding | ||||
|   param.create = QUDA_NULL_FIELD_CREATE;     // do not (zero-) initilize the field | ||||
|   param.location = QUDA_CUDA_FIELD_LOCATION; // field should reside on GPU | ||||
|   param.setPrecision(QUDA_SINGLE_PRECISION); | ||||
|  | ||||
|   // the following dont matter for an axpy benchmark, but need to choose something | ||||
|   param.pc_type = QUDA_4D_PC; | ||||
|   param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; | ||||
|   param.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; | ||||
|  | ||||
|   printfQuda("%5s %15s %15s %15s %15s\n", "L", "size (MiB/rank)", "time (usec)", | ||||
|              "GiB/s/rank", "Gflop/s/rank"); | ||||
|   for (int L : L_list) | ||||
|   { | ||||
|     // printfQuda("starting axpy L=%d\n", L); | ||||
|     //  IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()` | ||||
|     //             are LOCAL, i.e. per rank / per GPU | ||||
|  | ||||
|     param.x[0] = L; | ||||
|     param.x[1] = L; | ||||
|     param.x[2] = L; | ||||
|     param.x[3] = L; | ||||
|  | ||||
|     // number of (real) elements in one (local) field | ||||
|     size_t field_elements = 2 * param.x[0] * param.x[1] * param.x[2] * param.x[3] * | ||||
|                             param.nColor * param.nSpin; | ||||
|  | ||||
|     // create the field(s) | ||||
|     auto fieldA = ColorSpinorField(param); | ||||
|     auto fieldB = ColorSpinorField(param); | ||||
|     assert(fieldA.Bytes() == sizeof(float) * field_elements); // sanity check | ||||
|     assert(fieldB.Bytes() == sizeof(float) * field_elements); // sanity check | ||||
|  | ||||
|     // fill fields with random values | ||||
|     quda::RNG rng(fieldA, 1234); | ||||
|     spinorNoise(fieldA, rng, QUDA_NOISE_GAUSS); | ||||
|     spinorNoise(fieldB, rng, QUDA_NOISE_GAUSS); | ||||
|  | ||||
|     // number of operations / bytes per iteration | ||||
|     // axpy is one addition, one multiplication, two read, one write | ||||
|     double flops = 2 * field_elements; | ||||
|     double memory = 3 * sizeof(float) * field_elements; | ||||
|  | ||||
|     auto f = [&]() { blas::axpy(1.234, fieldA, fieldB); }; | ||||
|  | ||||
|     // first run to get the quda tuning out of the way | ||||
|     f(); | ||||
|  | ||||
|     // actual benchmarking | ||||
|     auto start_time = get_timestamp(); | ||||
|     double secs = bench(f, target_time); | ||||
|     auto end_time = get_timestamp(); | ||||
|  | ||||
|     double mem_MiB = memory / 1024. / 1024.; | ||||
|     double GBps = mem_MiB / 1024 / secs; | ||||
|     printfQuda("%5d %15.2f %15.2f %15.2f %15.2f\n", L, mem_MiB, secs * 1e6, GBps, | ||||
|                flops / secs * 1e-9); | ||||
|  | ||||
|     json tmp; | ||||
|     tmp["L"] = L; | ||||
|     tmp["size_MB"] = mem_MiB; | ||||
|     tmp["GBps"] = GBps; | ||||
|     tmp["GFlops"] = flops / secs * 1e-9; | ||||
|     tmp["start_time"] = start_time; | ||||
|     tmp["end_time"] = end_time; | ||||
|     json_results["axpy"].push_back(tmp); | ||||
|   } | ||||
| } | ||||
|  | ||||
| int main(int argc, char **argv) | ||||
| { | ||||
|   std::string json_filename = ""; // empty indicates no json output | ||||
|   for (int i = 0; i < argc; i++) | ||||
|   { | ||||
|     if (std::string(argv[i]) == "--json-out") | ||||
|       json_filename = argv[i + 1]; | ||||
|   } | ||||
|  | ||||
|   initComms(argc, argv); | ||||
|  | ||||
|   initQuda(-1); // -1 for multi-gpu. otherwise this selects the device to be used | ||||
|  | ||||
|   //  verbosity options are: | ||||
|   //  SILENT, SUMMARIZE, VERBOSE, DEBUG_VERBOSE | ||||
|   setVerbosity(QUDA_SUMMARIZE); | ||||
|  | ||||
|   printfQuda("MPI layout = %d %d %d %d\n", mpi_grid[0], mpi_grid[1], mpi_grid[2], | ||||
|              mpi_grid[3]); | ||||
|  | ||||
|   benchmark_axpy({8, 12, 16, 24, 32, 48}, 1.0); | ||||
|  | ||||
|   setVerbosity(QUDA_SILENT); | ||||
|   benchmark_wilson({8, 12, 16, 24, 32, 48}, 1.0); | ||||
|   benchmark_dwf({8, 12, 16, 24, 32}, 1.0); | ||||
|   setVerbosity(QUDA_SUMMARIZE); | ||||
|  | ||||
|   printfQuda("==================== done with all benchmarks ====================\n"); | ||||
|  | ||||
|   if (!json_filename.empty()) | ||||
|   { | ||||
|     printfQuda("writing benchmark results to %s\n", json_filename.c_str()); | ||||
|  | ||||
|     int me = 0; | ||||
|     MPI_Comm_rank(MPI_COMM_WORLD, &me); | ||||
|     if (me == 0) | ||||
|     { | ||||
|       std::ofstream json_file(json_filename); | ||||
|       json_file << std::setw(2) << json_results; | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   endQuda(); | ||||
|   quda::comm_finalize(); | ||||
|   MPI_Finalize(); | ||||
| } | ||||
| @@ -1,30 +0,0 @@ | ||||
| # QUDA benchmarks | ||||
|  | ||||
| This folder contains benchmarks for the [QUDA](https://github.com/lattice/quda) library. | ||||
|  | ||||
| - `Benchmark_Quda`: This benchmark measure floating point performances of fermion | ||||
| matrices (Wilson and DWF), as well as memory bandwidth (using a simple `axpy` operation). Measurements are | ||||
| performed for a fixed range of problem sizes. | ||||
|  | ||||
| ## Building | ||||
| After setting up your compilation environment (Tursa: `source /home/dp207/dp207/shared/env/production/env-{base,gpu}.sh`): | ||||
| ```bash | ||||
| ./build-quda.sh <env_dir>          # build Quda | ||||
| ./build-benchmark.sh <env_dir>     # build benchmark | ||||
| ``` | ||||
| where `<env_dir>` is an arbitrary directory where every product will be stored. | ||||
|  | ||||
| ## Running the Benchmark | ||||
|  | ||||
| The benchmark should be run as | ||||
| ```bash | ||||
| mpirun -np <ranks> <env_dir>/prefix/qudabench/Benchmark_Quda | ||||
| ``` | ||||
| where `<ranks>` is the total number of GPU's to use. On Tursa this is 4 times the number of nodes. | ||||
|  | ||||
| Note: | ||||
| - on Tursa, the `wrapper.sh` script that is typically used with Grid is not necessary. | ||||
| - due to Qudas automatic tuning, the benchmark might take significantly longer to run than `Benchmark_Grid` (even though it does fewer things). | ||||
|   - setting `QUDA_ENABLE_TUNING=0` disables all tuning (degrades performance severely). By default, it is turned on. | ||||
|   - setting `QUDA_RESOURCE_PATH=<some folder>` enables Quda to save and reuse optimal tuning parameters, making repeated runs much faster | ||||
|    | ||||
| @@ -1,32 +0,0 @@ | ||||
| #!/usr/bin/env bash | ||||
| # shellcheck disable=SC1090,SC1091 | ||||
|  | ||||
| set -euo pipefail | ||||
|  | ||||
| if (( $# != 1 )); then | ||||
|     echo "usage: $(basename "$0") <environment directory>" 1>&2 | ||||
|     exit 1 | ||||
| fi | ||||
| env_dir=$1 | ||||
|  | ||||
| # TODO: this is Tursa specific. have not figured out the correct way to do this. | ||||
| EXTRA_LIBS="/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/stubs/libcuda.so /home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/stubs/libnvidia-ml.so" | ||||
|  | ||||
| # NOTE: these flags need to be in sync with Qudas compilation options (see build-quda.sh) | ||||
| BUILD_FLAGS="-O3 -std=c++17 -DMPI_COMMS -DMULTI_GPU -DQUDA_PRECISION=12 -DQUDA_RECONSTRUCT=4" | ||||
|  | ||||
| call_dir=$(pwd -P) | ||||
| script_dir="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")" | ||||
| cd "${env_dir}" | ||||
| env_dir=$(pwd -P) | ||||
| cd "${call_dir}" | ||||
| BUILD_DIR="${env_dir}/build/Quda-benchmarks" | ||||
| PREFIX_DIR="${env_dir}/prefix/qudabench" | ||||
| QUDA_DIR=${env_dir}/prefix/quda | ||||
| mkdir -p "${BUILD_DIR}" | ||||
| mkdir -p "${PREFIX_DIR}" | ||||
|  | ||||
| LINK_FLAGS="-Wl,-rpath,$QUDA_DIR/lib: $QUDA_DIR/lib/libquda.so $EXTRA_LIBS -lpthread -lmpi" | ||||
|  | ||||
| g++ $BUILD_FLAGS -I$QUDA_DIR/include/targets/cuda  -I$QUDA_DIR/include   -c -o $BUILD_DIR/Benchmark_Quda.o  $script_dir/Benchmark_Quda.cpp | ||||
| g++ -g -O3 $BUILD_DIR/Benchmark_Quda.o -o $PREFIX_DIR/Benchmark_Quda $LINK_FLAGS -lmpi | ||||
| @@ -1,36 +0,0 @@ | ||||
| #!/usr/bin/env bash | ||||
| # shellcheck disable=SC1090,SC1091 | ||||
|  | ||||
| BUILD_FLAGS="-O3 -std=c++17" | ||||
| QUDA_FLAGS="-DQUDA_MPI=ON -DQUDA_PRECISION=14 -DQUDA_RECONSTRUCT=4 -DQUDA_GPU_ARCH=sm_80" | ||||
|  | ||||
| set -euo pipefail | ||||
|  | ||||
| if (( $# != 1 )); then | ||||
|     echo "usage: $(basename "$0") <environment directory>" 1>&2 | ||||
|     exit 1 | ||||
| fi | ||||
| env_dir=$1 | ||||
|  | ||||
| call_dir=$(pwd -P) | ||||
| mkdir -p ${env_dir} | ||||
| cd "${env_dir}" | ||||
| env_dir=$(pwd -P) | ||||
| cd "${call_dir}" | ||||
|  | ||||
| build_dir="${env_dir}/build/quda" | ||||
| if [ -d "${build_dir}" ]; then | ||||
|     echo "error: directory '${build_dir}' exists" | ||||
|     exit 1 | ||||
| fi | ||||
| mkdir -p "${build_dir}" | ||||
|  | ||||
| git clone https://github.com/lattice/quda.git "${build_dir}" | ||||
| cd "${build_dir}" | ||||
|  | ||||
| mkdir build; cd build | ||||
| cmake .. $QUDA_FLAGS -DCMAKE_INSTALL_PREFIX=${env_dir}/prefix/quda | ||||
| make -j128 | ||||
| make install | ||||
|  | ||||
| cd "${call_dir}" | ||||
							
								
								
									
										21
									
								
								Quda/env.sh
									
									
									
									
									
								
							
							
						
						
									
										21
									
								
								Quda/env.sh
									
									
									
									
									
								
							| @@ -1,21 +0,0 @@ | ||||
| module load gcc/9.3.0 | ||||
| module load cuda/11.4.1 | ||||
| module load openmpi/4.1.1-cuda11.4 | ||||
|  | ||||
| export QUDA_RESOURCE_PATH=$(pwd)/tuning | ||||
| export OMP_NUM_THREADS=4 | ||||
| export OMPI_MCA_btl=^uct,openib | ||||
| export OMPI_MCA_pml=ucx # by fabian. no idea what this is | ||||
| #export UCX_TLS=rc,rc_x,sm,cuda_copy,cuda_ipc,gdr_copy | ||||
| export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc | ||||
| export UCX_RNDV_THRESH=16384 | ||||
| export UCX_RNDV_SCHEME=put_zcopy | ||||
| export UCX_IB_GPU_DIRECT_RDMA=yes | ||||
| export UCX_MEMTYPE_CACHE=n | ||||
|  | ||||
| export OMPI_MCA_io=romio321 | ||||
| export OMPI_MCA_btl_openib_allow_ib=true | ||||
| export OMPI_MCA_btl_openib_device_type=infiniband | ||||
| export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3 | ||||
|  | ||||
| export QUDA_REORDER_LOCATION=GPU # this is the default anyway | ||||
		Reference in New Issue
	
	Block a user