Finale cleanup!
This commit is contained in:
parent f180cbb8ec
commit 8f1a556afa
@@ -1,6 +1,7 @@
 /*
 Copyright © 2015 Peter Boyle <paboyle@ph.ed.ac.uk>
 Copyright © 2022 Antonin Portelli <antonin.portelli@me.com>
+Copyright © 2022 Simon Buerger <simon.buerger@rwth-aachen.de>

 This is a fork of Benchmark_ITT.cpp from Grid

@@ -24,13 +25,6 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.

 using namespace Grid;

-std::vector<int> L_list;
-std::vector<int> Ls_list;
-std::vector<double> mflop_list;
-
-double mflop_ref;
-double mflop_ref_err;
-
 int NN_global;

 nlohmann::json json_results;
@@ -58,18 +52,6 @@ struct time_statistics
   }
 };

-void comms_header()
-{
-  std::cout << GridLogMessage << " L "
-            << "\t"
-            << " Ls "
-            << "\t"
-            << "bytes\t MB/s uni (err/min/max) \t\t MB/s bidi (err/min/max)" << std::endl;
-};
-
-Gamma::Algebra Gmu[] = {Gamma::Algebra::GammaX, Gamma::Algebra::GammaY,
-                        Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT};
-
 struct controls
 {
   int Opt;
@@ -133,10 +115,9 @@ class Benchmark
     std::vector<double> t_time(Nloop);
     time_statistics timestat;

-    grid_big_sep();
-    std::cout << GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "
+    std::cout << GridLogMessage << "Benchmarking threaded STENCIL halo exchange in "
              << nmu << " dimensions" << std::endl;
-    grid_big_sep();
+    grid_small_sep();
     grid_printf("%5s %5s %15s %15s %15s %15s %15s\n", "L", "dir", "payload (B)",
                 "time (usec)", "rate (GB/s)", "std dev", "max");

@@ -368,10 +349,10 @@ class Benchmark
     RealD mass = 0.1;
     RealD M5 = 1.8;

-    double mflops;
-    double mflops_best = 0;
-    double mflops_worst = 0;
-    std::vector<double> mflops_all;
+    double gflops;
+    double gflops_best = 0;
+    double gflops_worst = 0;
+    std::vector<double> gflops_all;

     ///////////////////////////////////////////////////////
     // Set/Get the layout & grid size
@@ -486,8 +467,6 @@ class Benchmark

     FGrid->Broadcast(0, &ncall, sizeof(ncall));

-    // std::cout << GridLogMessage << " Estimate " << ncall << " calls per
-    // second"<<std::endl;
     Dw.ZeroCounters();

     time_statistics timestat;
@@ -515,60 +494,60 @@ class Benchmark
        double fps =
            Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2;
 #endif
-        double flops = (fps * volume) / 2;
-        double mf_hi, mf_lo, mf_err;
+        double flops = (fps * volume) / 2.;
+        double gf_hi, gf_lo, gf_err;

        timestat.statistics(t_time);
-        mf_hi = flops / timestat.min;
-        mf_lo = flops / timestat.max;
-        mf_err = flops / timestat.min * timestat.err / timestat.mean;
+        gf_hi = flops / timestat.min / 1000.;
+        gf_lo = flops / timestat.max / 1000.;
+        gf_err = flops / timestat.min * timestat.err / timestat.mean / 1000.;

-        mflops = flops / timestat.mean;
-        mflops_all.push_back(mflops);
-        if (mflops_best == 0)
-          mflops_best = mflops;
-        if (mflops_worst == 0)
-          mflops_worst = mflops;
-        if (mflops > mflops_best)
-          mflops_best = mflops;
-        if (mflops < mflops_worst)
-          mflops_worst = mflops;
+        gflops = flops / timestat.mean / 1000.;
+        gflops_all.push_back(gflops);
+        if (gflops_best == 0)
+          gflops_best = gflops;
+        if (gflops_worst == 0)
+          gflops_worst = gflops;
+        if (gflops > gflops_best)
+          gflops_best = gflops;
+        if (gflops < gflops_worst)
+          gflops_worst = gflops;

        std::cout << GridLogMessage << "Deo FlopsPerSite is " << fps << std::endl;
        std::cout << GridLogMessage << std::fixed << std::setprecision(1)
-                  << "Deo mflop/s = " << mflops << " (" << mf_err << ") " << mf_lo
-                  << "-" << mf_hi << std::endl;
+                  << "Deo Gflop/s = " << gflops << " (" << gf_err << ") " << gf_lo
+                  << "-" << gf_hi << std::endl;
        std::cout << GridLogMessage << std::fixed << std::setprecision(1)
-                  << "Deo mflop/s per rank " << mflops / NP << std::endl;
+                  << "Deo Gflop/s per rank " << gflops / NP << std::endl;
        std::cout << GridLogMessage << std::fixed << std::setprecision(1)
-                  << "Deo mflop/s per node " << mflops / NN << std::endl;
+                  << "Deo Gflop/s per node " << gflops / NN << std::endl;
      }

      grid_small_sep();
      std::cout << GridLogMessage << L << "^4 x " << Ls
-                << " Deo Best mflop/s = " << mflops_best << " ; "
-                << mflops_best / NN << " per node " << std::endl;
+                << " Deo Best Gflop/s = " << gflops_best << " ; "
+                << gflops_best / NN << " per node " << std::endl;
      std::cout << GridLogMessage << L << "^4 x " << Ls
-                << " Deo Worst mflop/s = " << mflops_worst << " ; "
-                << mflops_worst / NN << " per node " << std::endl;
+                << " Deo Worst Gflop/s = " << gflops_worst << " ; "
+                << gflops_worst / NN << " per node " << std::endl;
      std::cout << GridLogMessage << fmt << std::endl;
      std::cout << GridLogMessage;

-      for (int i = 0; i < mflops_all.size(); i++)
+      for (int i = 0; i < gflops_all.size(); i++)
      {
-        std::cout << mflops_all[i] / NN << " ; ";
+        std::cout << gflops_all[i] / NN << " ; ";
      }
      std::cout << std::endl;
    }
-    return mflops_best;
+    return gflops_best;
  }

  static double Staggered(int L)
  {
-    double mflops;
-    double mflops_best = 0;
-    double mflops_worst = 0;
-    std::vector<double> mflops_all;
+    double gflops;
+    double gflops_best = 0;
+    double gflops_worst = 0;
+    std::vector<double> gflops_all;

    ///////////////////////////////////////////////////////
    // Set/Get the layout & grid size
@@ -700,51 +679,51 @@ class Benchmark
        double volume = 1;
        for (int mu = 0; mu < Nd; mu++)
          volume = volume * latt4[mu];
-        double flops = (1146.0 * volume) / 2;
-        double mf_hi, mf_lo, mf_err;
+        double flops = (1146.0 * volume) / 2.;
+        double gf_hi, gf_lo, gf_err;

        timestat.statistics(t_time);
-        mf_hi = flops / timestat.min;
-        mf_lo = flops / timestat.max;
-        mf_err = flops / timestat.min * timestat.err / timestat.mean;
+        gf_hi = flops / timestat.min / 1000.;
+        gf_lo = flops / timestat.max / 1000.;
+        gf_err = flops / timestat.min * timestat.err / timestat.mean / 1000.;

-        mflops = flops / timestat.mean;
-        mflops_all.push_back(mflops);
-        if (mflops_best == 0)
-          mflops_best = mflops;
-        if (mflops_worst == 0)
-          mflops_worst = mflops;
-        if (mflops > mflops_best)
-          mflops_best = mflops;
-        if (mflops < mflops_worst)
-          mflops_worst = mflops;
+        gflops = flops / timestat.mean / 1000.;
+        gflops_all.push_back(gflops);
+        if (gflops_best == 0)
+          gflops_best = gflops;
+        if (gflops_worst == 0)
+          gflops_worst = gflops;
+        if (gflops > gflops_best)
+          gflops_best = gflops;
+        if (gflops < gflops_worst)
+          gflops_worst = gflops;

        std::cout << GridLogMessage << std::fixed << std::setprecision(1)
-                  << "Deo mflop/s = " << mflops << " (" << mf_err << ") " << mf_lo
-                  << "-" << mf_hi << std::endl;
+                  << "Deo Gflop/s = " << gflops << " (" << gf_err << ") " << gf_lo
+                  << "-" << gf_hi << std::endl;
        std::cout << GridLogMessage << std::fixed << std::setprecision(1)
-                  << "Deo mflop/s per rank " << mflops / NP << std::endl;
+                  << "Deo Gflop/s per rank " << gflops / NP << std::endl;
        std::cout << GridLogMessage << std::fixed << std::setprecision(1)
-                  << "Deo mflop/s per node " << mflops / NN << std::endl;
+                  << "Deo Gflop/s per node " << gflops / NN << std::endl;
      }

      grid_small_sep();
      std::cout << GridLogMessage << L
-                << "^4 Deo Best mflop/s = " << mflops_best << " ; "
-                << mflops_best / NN << " per node " << std::endl;
+                << "^4 Deo Best Gflop/s = " << gflops_best << " ; "
+                << gflops_best / NN << " per node " << std::endl;
      std::cout << GridLogMessage << L
-                << "^4 Deo Worst mflop/s = " << mflops_worst << " ; "
-                << mflops_worst / NN << " per node " << std::endl;
+                << "^4 Deo Worst Gflop/s = " << gflops_worst << " ; "
+                << gflops_worst / NN << " per node " << std::endl;
      std::cout << GridLogMessage << fmt << std::endl;
      std::cout << GridLogMessage;

-      for (int i = 0; i < mflops_all.size(); i++)
+      for (int i = 0; i < gflops_all.size(); i++)
      {
-        std::cout << mflops_all[i] / NN << " ; ";
+        std::cout << gflops_all[i] / NN << " ; ";
      }
      std::cout << std::endl;
    }
-    return mflops_best;
+    return gflops_best;
  }
};

@@ -782,6 +761,30 @@ int main(int argc, char **argv)
  std::vector<double> dwf4;
  std::vector<double> staggered;

+  if (do_memory)
+  {
+    grid_big_sep();
+    std::cout << GridLogMessage << " Memory benchmark " << std::endl;
+    grid_big_sep();
+    Benchmark::Memory();
+  }
+
+  if (do_su4)
+  {
+    grid_big_sep();
+    std::cout << GridLogMessage << " SU(4) benchmark " << std::endl;
+    grid_big_sep();
+    Benchmark::SU4();
+  }
+
+  if (do_comms)
+  {
+    grid_big_sep();
+    std::cout << GridLogMessage << " Communications benchmark " << std::endl;
+    grid_big_sep();
+    Benchmark::Comms();
+  }
+
  if (do_flops)
  {
    Ls = 1;
@@ -810,68 +813,35 @@ int main(int argc, char **argv)
      staggered.push_back(result);
    }

+    int NN = NN_global;
+
    grid_big_sep();
-    std::cout << GridLogMessage << " Summary table Ls=" << Ls << std::endl;
+    std::cout << GridLogMessage << "Gflop/s/node Summary table Ls=" << Ls << std::endl;
    grid_big_sep();
-    std::cout << GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\t Staggered" << std::endl;
+    grid_printf("%5s %12s %12s %12s\n", "L", "Wilson", "DWF", "Staggered");
+    nlohmann::json tmp_flops;
    for (int l = 0; l < L_list.size(); l++)
    {
-      std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] << " \t\t "
-                << dwf4[l] << " \t\t " << staggered[l] << std::endl;
+      grid_printf("%5d %12.2f %12.2f %12.2f\n", L_list[l], wilson[l] / NN, dwf4[l] / NN,
+                  staggered[l] / NN);
+
      nlohmann::json tmp;
      tmp["L"] = L_list[l];
-      tmp["Mflops_wilson"] = wilson[l];
-      tmp["Mflops_dwf4"] = dwf4[l];
-      tmp["Mflops_staggered"] = staggered[l];
-      json_results["flops"].push_back(tmp);
-    }
-  }
-
-  int NN = NN_global;
-  if (do_memory)
-  {
-    grid_big_sep();
-    std::cout << GridLogMessage << " Memory benchmark " << std::endl;
-    grid_big_sep();
-    Benchmark::Memory();
-  }
-
-  if (do_su4)
-  {
-    grid_big_sep();
-    std::cout << GridLogMessage << " SU(4) benchmark " << std::endl;
-    grid_big_sep();
-    Benchmark::SU4();
-  }
-
-  if (do_comms)
-  {
-    grid_big_sep();
-    std::cout << GridLogMessage << " Communications benchmark " << std::endl;
-    grid_big_sep();
-    Benchmark::Comms();
-  }
-
-  if (do_flops)
-  {
-    grid_big_sep();
-    std::cout << GridLogMessage << " Per Node Summary table Ls=" << Ls << std::endl;
-    grid_big_sep();
-    std::cout << GridLogMessage << " L \t\t Wilson\t\t DWF4\t\t Staggered " << std::endl;
-    for (int l = 0; l < L_list.size(); l++)
-    {
-      std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] / NN << " \t "
-                << dwf4[l] / NN << " \t " << staggered[l] / NN << std::endl;
+      tmp["Gflops_wilson"] = wilson[l] / NN;
+      tmp["Gflops_dwf4"] = dwf4[l] / NN;
+      tmp["Gflops_staggered"] = staggered[l] / NN;
+      tmp_flops["results"].push_back(tmp);
    }
    grid_big_sep();
    std::cout << GridLogMessage
              << " Comparison point result: " << 0.5 * (dwf4[sel] + dwf4[selm1]) / NN
-              << " Mflop/s per node" << std::endl;
+              << " Gflop/s per node" << std::endl;
    std::cout << GridLogMessage << " Comparison point is 0.5*(" << dwf4[sel] / NN << "+"
              << dwf4[selm1] / NN << ") " << std::endl;
    std::cout << std::setprecision(3);
    grid_big_sep();
-    json_results["comp_point_Mflops"] = 0.5 * (dwf4[sel] + dwf4[selm1]) / NN;
+    tmp_flops["comparison_point_Gflops"] = 0.5 * (dwf4[sel] + dwf4[selm1]) / NN;
+    json_results["flops"] = tmp_flops;
  }

  if (!json_filename.empty())
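Note on the unit change above: the time_statistics values are wall-clock times in microseconds (as the "time (usec)" column in the communications table suggests), so flops / timestat.mean is numerically a Mflop/s figure, and the extra / 1000. factors introduced by this commit express it in Gflop/s instead. A minimal standalone sketch of that conversion, with hypothetical numbers and not part of the commit:

#include <iostream>

int main(void)
{
  // hypothetical measurement: 1.2e9 floating-point operations in 2500 us of wall time
  double flops = 1.2e9;
  double mean_usec = 2500.;

  double mflops = flops / mean_usec;         // operations per microsecond == Mflop/s
  double gflops = flops / mean_usec / 1000.; // the same quantity expressed in Gflop/s

  std::cout << mflops << " Mflop/s = " << gflops << " Gflop/s" << std::endl;
  return 0;
}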
Grid/systems/tursa/files/run.cpu.template.sh (new file)
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+# shellcheck disable=SC1091,SC2050,SC2170
+
+## This set of slurm settings assumes that the AMD chips are using bios setting NPS4 (4 mpi taks per socket).
+
+#SBATCH -J @job-name@
+#SBATCH -A @budget@
+#SBATCH -t 48:00:00
+#SBATCH --nodes=@nnodes@
+#SBATCH --ntasks=@ntasks@
+#SBATCH --ntasks-per-node=8
+#SBATCH --cpus-per-task=32
+#SBATCH --partition=@partition@
+#SBATCH --output=%x.%j.out
+#SBATCH --error=%x.%j.err
+#SBATCH --qos=standard
+#SBATCH --no-requeue
+
+set -e
+
+# OpenMP/OpenMPI/UCX environment ###############################################
+export OMP_NUM_THREADS=16
+export OMP_DISPLAY_AFFINITY=true
+export OMPI_MCA_btl=^uct,openib
+export OMPI_MCA_pml=ucx
+export UCX_TLS=rc,sm,self
+export UCX_RNDV_THRESH=16384
+export UCX_MEMTYPE_CACHE=n
+export UCX_NET_DEVICES=mlx5_0:1
+
+export OMPI_MCA_BTL_SM_USE_KNEM=1
+export OMPI_MCA_coll_hcoll_enable=1
+export OMPI_MCA_coll_hcoll_np=0
+
+# IO environment ###############################################################
+if [ @nnodes@ -eq 1 ]; then
+  export OMPI_MCA_io=ompio
+else
+  export OMPI_MCA_io=romio321
+fi
+
+export OMPI_MCA_btl_openib_allow_ib=true
+export OMPI_MCA_btl_openib_device_type=infiniband
+export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3 # are these needed here?
+
+# load environment #############################################################
+env_dir="$(readlink -f @env-dir@)"
+source "${env_dir}/env-base.sh"
+if [ "${SLURM_JOB_PARTITION}" = 'cpu' ]; then
+  source "${env_dir}/env-cpu.sh"
+else
+  echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
+  exit 1
+fi
+
+# application and parameters ###################################################
+app='@application@'
+opt='--comms-overlap --comms-concurrent'
+par='@par@'
+
+# collect job information ######################################################
+job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
+mkdir -p "${job_info_dir}"
+
+date > "${job_info_dir}/start-date"
+set > "${job_info_dir}/env"
+ldd ${app} > "${job_info_dir}/ldd"
+md5sum ${app} > "${job_info_dir}/app-hash"
+readelf -a ${app} > "${job_info_dir}/elf"
+echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
+cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
+if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
+
+# run! #########################################################################
+mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
+  ./cpu-mpi-wrapper.sh \
+  ${app} "${par}" "${opt[@]}" \
+  --mpi @mpi-geom@ \
+  --grid @grid-geom@ \
+  --shm 2048 &> "${job_info_dir}/log"
+
+# if we reach that point the application exited successfully ###################
+touch "${job_info_dir}/success"
+date > "${job_info_dir}/end-date"
+
+################################################################################
Grid/systems/tursa/files/run.gpu.template.sh (new file)
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+# shellcheck disable=SC1091,SC2050,SC2170
+
+# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
+
+#SBATCH -J @job-name@
+#SBATCH -A @budget@
+#SBATCH -t 48:00:00
+#SBATCH --nodes=@nnodes@
+#SBATCH --ntasks=@ntasks@
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=8
+#SBATCH --partition=@partition@
+#SBATCH --gres=gpu:4
+#SBATCH --output=%x.%j.out
+#SBATCH --error=%x.%j.err
+#SBATCH --qos=standard
+#SBATCH --no-requeue
+
+set -e
+
+# OpenMP/OpenMPI/UCX environment ###############################################
+export OMP_NUM_THREADS=8
+export OMPI_MCA_btl=^uct,openib
+export OMPI_MCA_pml=ucx
+export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
+export UCX_RNDV_SCHEME=put_zcopy
+export UCX_RNDV_THRESH=16384
+export UCX_IB_GPU_DIRECT_RDMA=yes
+export UCX_MEMTYPE_CACHE=n
+
+# IO environment ###############################################################
+
+if [ @nnodes@ -eq 1 ]; then
+  export OMPI_MCA_io=ompio
+else
+  export OMPI_MCA_io=romio321
+fi
+export OMPI_MCA_btl_openib_allow_ib=true
+export OMPI_MCA_btl_openib_device_type=infiniband
+export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
+
+# load environment #############################################################
+env_dir="$(readlink -f @env-dir@)"
+source "${env_dir}/env-base.sh"
+if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
+  source "${env_dir}/env-gpu.sh"
+else
+  echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
+  exit 1
+fi
+
+# application and parameters ###################################################
+app='@application@'
+opt=('--comms-overlap' '--comms-concurrent')
+par='@par@'
+
+# collect job information ######################################################
+job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
+mkdir -p "${job_info_dir}"
+
+date > "${job_info_dir}/start-date"
+set > "${job_info_dir}/env"
+ldd ${app} > "${job_info_dir}/ldd"
+md5sum ${app} > "${job_info_dir}/app-hash"
+readelf -a ${app} > "${job_info_dir}/elf"
+echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
+cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
+if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
+
+# run! #########################################################################
+mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
+  ./gpu-mpi-wrapper.sh \
+  ${app} "${par}" "${opt[@]}" \
+  --mpi @mpi-geom@ \
+  --accelerator-threads 8 \
+  --grid @grid-geom@ \
+  --shm 2048 &> "${job_info_dir}/log"
+
+# if we reach that point the application exited successfully ###################
+touch "${job_info_dir}/success"
+date > "${job_info_dir}/end-date"
+
+################################################################################