forked from portelli/lattice-benchmarks
		
	fix scaling conventions for multi-gpu
This commit is contained in:
		| @@ -1,56 +1,58 @@ | ||||
| #include <algorithm> | ||||
| #include <array> | ||||
| #include <blas_quda.h> | ||||
| #include <cassert> | ||||
| #include <color_spinor_field.h> | ||||
| #include <mpi.h> | ||||
| // #include <quda_internal.h> | ||||
| #include <dirac_quda.h> | ||||
| #include <gauge_tools.h> | ||||
| #include <memory> | ||||
| #include <mpi.h> | ||||
| #include <stdio.h> | ||||
| #include <stdlib.h> | ||||
|  | ||||
| #include <cassert> | ||||
| #include <dirac_quda.h> | ||||
| #include <gauge_tools.h> | ||||
|  | ||||
| using namespace quda; | ||||
|  | ||||
| QudaPrecision smoother_halo_prec = QUDA_INVALID_PRECISION; | ||||
| // This is the MPI grid, i.e. the layout of ranks | ||||
| int nranks = -1; | ||||
| std::array<int, 4> mpi_grid = {1, 1, 1, 1}; | ||||
|  | ||||
| // This is the MPI grid, i.e. the layout of ranks, not the lattice volume | ||||
| std::array<int, 4> gridsize = {1, 1, 1, 4}; | ||||
|  | ||||
| void initComms(int argc, char **argv, std::array<int, 4> const &commDims) | ||||
| void initComms(int argc, char **argv) | ||||
| { | ||||
|   // init MPI communication | ||||
|   MPI_Init(&argc, &argv); | ||||
|  | ||||
|   MPI_Comm_size(MPI_COMM_WORLD, &nranks); | ||||
|   assert(1 <= nranks && nranks <= 100000); | ||||
|  | ||||
|   mpi_grid[3] = nranks; | ||||
|  | ||||
|   // this maps coordinates to rank number | ||||
|   auto lex_rank_from_coords = [](int const *coords, void *) | ||||
|   { | ||||
|     int rank = coords[0]; | ||||
|     for (int i = 1; i < 4; i++) | ||||
|       rank = gridsize[i] * rank + coords[i]; | ||||
|       rank = mpi_grid[i] * rank + coords[i]; | ||||
|     return rank; | ||||
|   }; | ||||
|  | ||||
|   initCommsGridQuda(4, commDims.data(), lex_rank_from_coords, nullptr); | ||||
|   initCommsGridQuda(4, mpi_grid.data(), lex_rank_from_coords, nullptr); | ||||
|  | ||||
|   for (int d = 0; d < 4; d++) | ||||
|     if (gridsize[d] > 1) | ||||
|     if (mpi_grid[d] > 1) | ||||
|       commDimPartitionedSet(d); | ||||
| } | ||||
|  | ||||
| // creates a random gauge field | ||||
| cudaGaugeField make_gauge_field(std::array<int, 4> const &geom) | ||||
| // creates a random gauge field. L = local(!) size | ||||
| cudaGaugeField make_gauge_field(int L) | ||||
| { | ||||
|   GaugeFieldParam param; | ||||
|  | ||||
|   // dimension and type of the lattice object | ||||
|   param.nDim = 4; | ||||
|   param.x[0] = geom[0]; | ||||
|   param.x[1] = geom[1]; | ||||
|   param.x[2] = geom[2]; | ||||
|   param.x[3] = geom[3]; | ||||
|   param.x[0] = L; | ||||
|   param.x[1] = L; | ||||
|   param.x[2] = L; | ||||
|   param.x[3] = L; | ||||
|  | ||||
|   // number of colors. potentially confusingly, QUDA sometimes uses the word "color" to | ||||
|   // things unrelated with physical color. things like "nColor=32" do pop up in deflation | ||||
| @@ -101,8 +103,8 @@ cudaGaugeField make_gauge_field(std::array<int, 4> const &geom) | ||||
|   return U; | ||||
| } | ||||
|  | ||||
| // create a random source vector | ||||
| ColorSpinorField make_source(std::array<int, 4> const &geom) | ||||
| // create a random source vector (L = local size) | ||||
| ColorSpinorField make_source(int L) | ||||
| { | ||||
|   ColorSpinorParam param; | ||||
|   param.nColor = 3; | ||||
| @@ -111,10 +113,10 @@ ColorSpinorField make_source(std::array<int, 4> const &geom) | ||||
|   param.pad = 0; | ||||
|   param.siteSubset = QUDA_FULL_SITE_SUBSET; | ||||
|   param.nDim = 4; | ||||
|   param.x[0] = geom[0]; | ||||
|   param.x[1] = geom[1]; | ||||
|   param.x[2] = geom[2]; | ||||
|   param.x[3] = geom[3]; | ||||
|   param.x[0] = L; | ||||
|   param.x[1] = L; | ||||
|   param.x[2] = L; | ||||
|   param.x[3] = L; | ||||
|   param.x[4] = 1; // no fifth dimension | ||||
|   param.pc_type = QUDA_4D_PC; | ||||
|   param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; | ||||
| @@ -136,130 +138,151 @@ ColorSpinorField make_source(std::array<int, 4> const &geom) | ||||
|   return src; | ||||
| } | ||||
|  | ||||
| void benchmark(int L, int niter) | ||||
| void benchmark_wilson() | ||||
| { | ||||
|   std::array<int, 4> geom = {L, L, L, L}; | ||||
|   int niter = 20; | ||||
|   int niter_warmup = 10; | ||||
|  | ||||
|   printfQuda("=======================  benchmarking L=%d =======================\n", L); | ||||
|   printfQuda("==================== wilson dirac operator ====================\n"); | ||||
|   printfQuda("IMPORTANT: QUDAs own flop counting. Probably not the same as in Grid.\n"); | ||||
|   printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank"); | ||||
|  | ||||
|   auto U = make_gauge_field(geom); | ||||
|   printfQuda("created random gauge field, %.3f GiB (sanity check: should be %.3f)\n", | ||||
|              U.Bytes() / 1024. / 1024. / 1024., | ||||
|              1.0 * L * L * L * L * 4 * 18 * 8 / 1024. / 1024. / 1024.); | ||||
|   auto src = make_source(geom); | ||||
|   printfQuda("created random source, %.3f GiB (sanity check: should be %.3f)\n", | ||||
|              src.Bytes() / 1024. / 1024. / 1024., | ||||
|              1.0 * L * L * L * L * 12 * 2 * 8 / 1024. / 1024. / 1024.); | ||||
|   for (int L : {8, 12, 16, 24, 32}) | ||||
|   { | ||||
|     auto U = make_gauge_field(L); | ||||
|     auto src = make_source(L); | ||||
|  | ||||
|   // create (Wilson) dirac operator | ||||
|   DiracParam param; | ||||
|   param.kappa = 0.10; | ||||
|   param.dagger = QUDA_DAG_NO; | ||||
|   param.matpcType = QUDA_MATPC_EVEN_EVEN; | ||||
|   auto dirac = DiracWilson(param); | ||||
|     // create (Wilson) dirac operator | ||||
|     DiracParam param; | ||||
|     param.kappa = 0.10; | ||||
|     param.dagger = QUDA_DAG_NO; | ||||
|     param.matpcType = QUDA_MATPC_EVEN_EVEN; | ||||
|     auto dirac = DiracWilson(param); | ||||
|  | ||||
|   // insert gauge field into the dirac operator | ||||
|   // (the additional nullptr's are for smeared links and fancy preconditioners and such. | ||||
|   // Not used for simple Wilson fermions) | ||||
|   dirac.updateFields(&U, nullptr, nullptr, nullptr); | ||||
|     // insert gauge field into the dirac operator | ||||
|     // (the additional nullptr's are for smeared links and fancy preconditioners and such. | ||||
|     // Not used for simple Wilson fermions) | ||||
|     dirac.updateFields(&U, nullptr, nullptr, nullptr); | ||||
|  | ||||
|   auto tmp = ColorSpinorField(ColorSpinorParam(src)); | ||||
|     auto tmp = ColorSpinorField(ColorSpinorParam(src)); | ||||
|  | ||||
|   printfQuda("benchmarking Dirac operator. geom=(%d,%d,%d,%d), niter=%d\n", geom[0], | ||||
|              geom[1], geom[2], geom[3], niter); | ||||
|     // couple iterations without timing to warm up | ||||
|     for (int iter = 0; iter < niter_warmup; ++iter) | ||||
|       dirac.M(tmp, src); | ||||
|  | ||||
|   // couple iterations without timing to warm up | ||||
|   printfQuda("warmup...\n"); | ||||
|   for (int iter = 0; iter < 20; ++iter) | ||||
|     dirac.M(tmp, src); | ||||
|     // actual benchmark with timings | ||||
|     dirac.Flops(); // reset flops counter | ||||
|     device_timer_t device_timer; | ||||
|     device_timer.start(); | ||||
|     for (int iter = 0; iter < niter; ++iter) | ||||
|       dirac.M(tmp, src); | ||||
|     device_timer.stop(); | ||||
|  | ||||
|   printfQuda("running...\n"); | ||||
|   dirac.Flops(); // reset flops counter | ||||
|   device_timer_t device_timer; | ||||
|   device_timer.start(); | ||||
|   for (int iter = 0; iter < niter; ++iter) | ||||
|     dirac.M(tmp, src); | ||||
|   device_timer.stop(); | ||||
|     double secs = device_timer.last() / niter; | ||||
|     double flops = 1.0 * dirac.Flops() / niter; | ||||
|  | ||||
|   double secs = device_timer.last(); | ||||
|   double gflops = (dirac.Flops() * 1e-9) / secs; | ||||
|   printfQuda("Gflops = %6.2f\n", gflops); | ||||
|     printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9); | ||||
|   } | ||||
| } | ||||
|  | ||||
| void benchmark_axpy(int L) | ||||
| void benchmark_axpy() | ||||
| { | ||||
|   printfQuda("================ axpy L=%d ==============\n", L); | ||||
|   // number of iterations for warmup / measurement | ||||
|   // (feel free to change for noise/time tradeoff) | ||||
|   constexpr int niter_warmup = 10; | ||||
|   constexpr int niter = 20; | ||||
|  | ||||
|   printfQuda("==================== axpy / memory ====================\n"); | ||||
|  | ||||
|   ColorSpinorParam param; | ||||
|   param.nColor = 3; | ||||
|   param.nDim = 4;   // 4-dimensional lattice | ||||
|   param.x[4] = 1;   // no fifth dimension | ||||
|   param.nColor = 3; // supported values for nSpin/nColor are configured when compiling | ||||
|                     // QUDA. "3*4" will probably always be enabled, so we stick with this | ||||
|   param.nSpin = 4; | ||||
|   param.nVec = 1; | ||||
|   param.pad = 0; | ||||
|   param.siteSubset = QUDA_FULL_SITE_SUBSET; | ||||
|   param.nDim = 4; | ||||
|   param.x[0] = L; | ||||
|   param.x[1] = L; | ||||
|   param.x[2] = L; | ||||
|   param.x[3] = L; | ||||
|   param.x[4] = 1; // no fifth dimension | ||||
|   param.nVec = 1;                            // just a single vector | ||||
|   param.siteSubset = QUDA_FULL_SITE_SUBSET;  // full lattice = no odd/even | ||||
|   param.pad = 0;                             // no padding | ||||
|   param.create = QUDA_NULL_FIELD_CREATE;     // do not (zero-) initilize the field | ||||
|   param.location = QUDA_CUDA_FIELD_LOCATION; // field should reside on GPU | ||||
|   param.setPrecision(QUDA_DOUBLE_PRECISION); | ||||
|  | ||||
|   // the following dont matter for an axpy benchmark, but need to choose something | ||||
|   param.pc_type = QUDA_4D_PC; | ||||
|   param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; | ||||
|   param.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; | ||||
|   param.create = QUDA_NULL_FIELD_CREATE; // do not (zero-) initilize the field | ||||
|   param.setPrecision(QUDA_DOUBLE_PRECISION); | ||||
|   param.location = QUDA_CUDA_FIELD_LOCATION; | ||||
|  | ||||
|   // create the field and fill it with random values | ||||
|   auto fieldA = ColorSpinorField(param); | ||||
|   auto fieldB = ColorSpinorField(param); | ||||
|   quda::RNG rng(fieldA, 1234); | ||||
|   auto size_bytes = size_t(8) * 2 * param.x[0] * param.x[1] * param.x[2] * param.x[3] * | ||||
|                     param.nColor * param.nSpin; | ||||
|   assert(fieldA.Bytes() == size_bytes); // sanity check | ||||
|   assert(fieldB.Bytes() == size_bytes); // sanity check | ||||
|   spinorNoise(fieldA, rng, QUDA_NOISE_GAUSS); | ||||
|   spinorNoise(fieldB, rng, QUDA_NOISE_GAUSS); | ||||
|   printfQuda("%5s %15s %15s %15s %15s\n", "L", "size (MiB/rank)", "time (usec)", | ||||
|              "GiB/s/rank", "Gflop/s/rank"); | ||||
|   std::vector L_list = {8, 12, 16, 24, 32}; | ||||
|   for (int L : L_list) | ||||
|   { | ||||
|     // IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()` | ||||
|     //            are LOCAL, i.e. per rank / per GPU | ||||
|  | ||||
|   // number of (real) elements in the field = number of fma instructions to do | ||||
|   double flops_per_iter = | ||||
|       2 * param.x[0] * param.x[1] * param.x[2] * param.x[3] * param.nColor * param.nSpin; | ||||
|     param.x[0] = L; | ||||
|     param.x[1] = L; | ||||
|     param.x[2] = L; | ||||
|     param.x[3] = L; | ||||
|  | ||||
|   int niter = 20; | ||||
|     // number of (real) elements in one (local) field | ||||
|     size_t field_elements = 2 * param.x[0] * param.x[1] * param.x[2] * param.x[3] * | ||||
|                             param.nColor * param.nSpin; | ||||
|  | ||||
|   printfQuda("warmup...\n"); | ||||
|   for (int iter = 0; iter < 10; ++iter) | ||||
|     blas::axpy(1.234, fieldA, fieldB); | ||||
|     // create the field(s) | ||||
|     auto fieldA = ColorSpinorField(param); | ||||
|     auto fieldB = ColorSpinorField(param); | ||||
|     assert(fieldA.Bytes() == sizeof(double) * field_elements); // sanity check | ||||
|     assert(fieldB.Bytes() == sizeof(double) * field_elements); // sanity check | ||||
|  | ||||
|   printfQuda("running...\n"); | ||||
|   device_timer_t device_timer; | ||||
|   device_timer.start(); | ||||
|   for (int iter = 0; iter < niter; ++iter) | ||||
|     blas::axpy(1.234, fieldA, fieldB); // fieldB += 1.234*fieldA | ||||
|   device_timer.stop(); | ||||
|     // fill fields with random values | ||||
|     quda::RNG rng(fieldA, 1234); | ||||
|     spinorNoise(fieldA, rng, QUDA_NOISE_GAUSS); | ||||
|     spinorNoise(fieldB, rng, QUDA_NOISE_GAUSS); | ||||
|  | ||||
|   double secs = device_timer.last(); | ||||
|   double gflops = (flops_per_iter * niter) * 1e-9 / secs; | ||||
|   printfQuda("Gflops = %6.2f\n", gflops); | ||||
|   printfQuda("bytes = %6.2f GiB\n", 3. * fieldA.Bytes() / 1024. / 1024. / 1024.); | ||||
|   printfQuda("bandwidth = %6.2f GiB/s\n", | ||||
|              fieldA.Bytes() * 3 / 1024. / 1024. / 1024. * niter / secs); | ||||
|     // number of operations / bytes per iteration | ||||
|     // axpy is one addition, one multiplication, two read, one write | ||||
|     double flops = 2 * field_elements; | ||||
|     double memory = 3 * sizeof(double) * field_elements; | ||||
|  | ||||
|     // do some iterations to to let QUDA do its internal tuning and also stabilize cache | ||||
|     // behaviour and such | ||||
|     for (int iter = 0; iter < niter_warmup; ++iter) | ||||
|       blas::axpy(1.234, fieldA, fieldB); | ||||
|  | ||||
|     // running the actual benchmark | ||||
|     device_timer_t device_timer; | ||||
|     device_timer.start(); | ||||
|     for (int iter = 0; iter < niter; ++iter) | ||||
|       blas::axpy(1.234, fieldA, fieldB); | ||||
|     device_timer.stop(); | ||||
|     double secs = device_timer.last() / niter; // seconds per iteration | ||||
|  | ||||
|     printfQuda("%5d %15.2f %15.2f %15.2f %15.2f\n", L, memory / 1024. / 1024., secs * 1e6, | ||||
|                memory / secs / 1024. / 1024. / 1024., flops / secs * 1e-9); | ||||
|   } | ||||
| } | ||||
|  | ||||
| int main(int argc, char **argv) | ||||
| { | ||||
|   initComms(argc, argv, gridsize); | ||||
|   initComms(argc, argv); | ||||
|  | ||||
|   initQuda(-1); // -1 for multi-gpu. otherwise this selects the device to be used | ||||
|  | ||||
|   //  verbosity options are: | ||||
|   //  SILENT, SUMMARIZE, VERBOSE, DEBUG_VERBOSE | ||||
|   setVerbosity(QUDA_VERBOSE); | ||||
|   setVerbosity(QUDA_SUMMARIZE); | ||||
|  | ||||
|   for (int L : {8, 12, 16, 24, 32}) | ||||
|     benchmark_axpy(L); | ||||
|   for (int L : {16, 24, 32, 48, 64}) | ||||
|     benchmark(L, 100); | ||||
|   printfQuda("MPI layout = %d %d %d %d\n", mpi_grid[0], mpi_grid[1], mpi_grid[2], | ||||
|              mpi_grid[3]); | ||||
|  | ||||
|   benchmark_axpy(); | ||||
|  | ||||
|   setVerbosity(QUDA_SILENT); | ||||
|   benchmark_wilson(); | ||||
|   setVerbosity(QUDA_SUMMARIZE); | ||||
|  | ||||
|   printfQuda("==================== done with all benchmarks ====================\n"); | ||||
|   endQuda(); | ||||
|   quda::comm_finalize(); | ||||
|   MPI_Finalize(); | ||||
|   | ||||
		Reference in New Issue
	
	Block a user