Finale cleanup!
This commit is contained in:
parent f180cbb8ec
commit 8f1a556afa
@@ -1,6 +1,7 @@
 /*
 Copyright © 2015 Peter Boyle <paboyle@ph.ed.ac.uk>
 Copyright © 2022 Antonin Portelli <antonin.portelli@me.com>
+Copyright © 2022 Simon Buerger <simon.buerger@rwth-aachen.de>

 This is a fork of Benchmark_ITT.cpp from Grid

@@ -24,13 +25,6 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.

 using namespace Grid;

-std::vector<int> L_list;
-std::vector<int> Ls_list;
-std::vector<double> mflop_list;
-
-double mflop_ref;
-double mflop_ref_err;
-
 int NN_global;

 nlohmann::json json_results;
@@ -58,18 +52,6 @@ struct time_statistics
   }
 };

-void comms_header()
-{
-  std::cout << GridLogMessage << " L "
-            << "\t"
-            << " Ls "
-            << "\t"
-            << "bytes\t MB/s uni (err/min/max) \t\t MB/s bidi (err/min/max)" << std::endl;
-};
-
-Gamma::Algebra Gmu[] = {Gamma::Algebra::GammaX, Gamma::Algebra::GammaY,
-                        Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT};
-
 struct controls
 {
   int Opt;
@@ -133,10 +115,9 @@ class Benchmark
     std::vector<double> t_time(Nloop);
     time_statistics timestat;

-    grid_big_sep();
-    std::cout << GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "
+    std::cout << GridLogMessage << "Benchmarking threaded STENCIL halo exchange in "
              << nmu << " dimensions" << std::endl;
-    grid_big_sep();
+    grid_small_sep();
     grid_printf("%5s %5s %15s %15s %15s %15s %15s\n", "L", "dir", "payload (B)",
                 "time (usec)", "rate (GB/s)", "std dev", "max");

@@ -368,10 +349,10 @@ class Benchmark
     RealD mass = 0.1;
     RealD M5 = 1.8;

-    double mflops;
-    double mflops_best = 0;
-    double mflops_worst = 0;
-    std::vector<double> mflops_all;
+    double gflops;
+    double gflops_best = 0;
+    double gflops_worst = 0;
+    std::vector<double> gflops_all;

     ///////////////////////////////////////////////////////
     // Set/Get the layout & grid size
@@ -486,8 +467,6 @@ class Benchmark

     FGrid->Broadcast(0, &ncall, sizeof(ncall));

-    // std::cout << GridLogMessage << " Estimate " << ncall << " calls per
-    // second"<<std::endl;
     Dw.ZeroCounters();

     time_statistics timestat;
@@ -515,60 +494,60 @@ class Benchmark
        double fps =
            Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2;
 #endif
-        double flops = (fps * volume) / 2;
-        double mf_hi, mf_lo, mf_err;
+        double flops = (fps * volume) / 2.;
+        double gf_hi, gf_lo, gf_err;

        timestat.statistics(t_time);
-        mf_hi = flops / timestat.min;
-        mf_lo = flops / timestat.max;
-        mf_err = flops / timestat.min * timestat.err / timestat.mean;
+        gf_hi = flops / timestat.min / 1000.;
+        gf_lo = flops / timestat.max / 1000.;
+        gf_err = flops / timestat.min * timestat.err / timestat.mean / 1000.;

-        mflops = flops / timestat.mean;
-        mflops_all.push_back(mflops);
-        if (mflops_best == 0)
-          mflops_best = mflops;
-        if (mflops_worst == 0)
-          mflops_worst = mflops;
-        if (mflops > mflops_best)
-          mflops_best = mflops;
-        if (mflops < mflops_worst)
-          mflops_worst = mflops;
+        gflops = flops / timestat.mean / 1000.;
+        gflops_all.push_back(gflops);
+        if (gflops_best == 0)
+          gflops_best = gflops;
+        if (gflops_worst == 0)
+          gflops_worst = gflops;
+        if (gflops > gflops_best)
+          gflops_best = gflops;
+        if (gflops < gflops_worst)
+          gflops_worst = gflops;

        std::cout << GridLogMessage << "Deo FlopsPerSite is " << fps << std::endl;
        std::cout << GridLogMessage << std::fixed << std::setprecision(1)
-                  << "Deo mflop/s = " << mflops << " (" << mf_err << ") " << mf_lo
-                  << "-" << mf_hi << std::endl;
+                  << "Deo Gflop/s = " << gflops << " (" << gf_err << ") " << gf_lo
+                  << "-" << gf_hi << std::endl;
        std::cout << GridLogMessage << std::fixed << std::setprecision(1)
-                  << "Deo mflop/s per rank " << mflops / NP << std::endl;
+                  << "Deo Gflop/s per rank " << gflops / NP << std::endl;
        std::cout << GridLogMessage << std::fixed << std::setprecision(1)
-                  << "Deo mflop/s per node " << mflops / NN << std::endl;
+                  << "Deo Gflop/s per node " << gflops / NN << std::endl;
      }

      grid_small_sep();
      std::cout << GridLogMessage << L << "^4 x " << Ls
-                << " Deo Best mflop/s = " << mflops_best << " ; "
-                << mflops_best / NN << " per node " << std::endl;
+                << " Deo Best Gflop/s = " << gflops_best << " ; "
+                << gflops_best / NN << " per node " << std::endl;
      std::cout << GridLogMessage << L << "^4 x " << Ls
-                << " Deo Worst mflop/s = " << mflops_worst << " ; "
-                << mflops_worst / NN << " per node " << std::endl;
+                << " Deo Worst Gflop/s = " << gflops_worst << " ; "
+                << gflops_worst / NN << " per node " << std::endl;
      std::cout << GridLogMessage << fmt << std::endl;
      std::cout << GridLogMessage;

-      for (int i = 0; i < mflops_all.size(); i++)
+      for (int i = 0; i < gflops_all.size(); i++)
      {
-        std::cout << mflops_all[i] / NN << " ; ";
+        std::cout << gflops_all[i] / NN << " ; ";
      }
      std::cout << std::endl;
    }
-    return mflops_best;
+    return gflops_best;
  }

  static double Staggered(int L)
  {
-    double mflops;
-    double mflops_best = 0;
-    double mflops_worst = 0;
-    std::vector<double> mflops_all;
+    double gflops;
+    double gflops_best = 0;
+    double gflops_worst = 0;
+    std::vector<double> gflops_all;

    ///////////////////////////////////////////////////////
    // Set/Get the layout & grid size
@@ -700,51 +679,51 @@ class Benchmark
        double volume = 1;
        for (int mu = 0; mu < Nd; mu++)
          volume = volume * latt4[mu];
-        double flops = (1146.0 * volume) / 2;
-        double mf_hi, mf_lo, mf_err;
+        double flops = (1146.0 * volume) / 2.;
+        double gf_hi, gf_lo, gf_err;

        timestat.statistics(t_time);
-        mf_hi = flops / timestat.min;
-        mf_lo = flops / timestat.max;
-        mf_err = flops / timestat.min * timestat.err / timestat.mean;
+        gf_hi = flops / timestat.min / 1000.;
+        gf_lo = flops / timestat.max / 1000.;
+        gf_err = flops / timestat.min * timestat.err / timestat.mean / 1000.;

-        mflops = flops / timestat.mean;
-        mflops_all.push_back(mflops);
-        if (mflops_best == 0)
-          mflops_best = mflops;
-        if (mflops_worst == 0)
-          mflops_worst = mflops;
-        if (mflops > mflops_best)
-          mflops_best = mflops;
-        if (mflops < mflops_worst)
-          mflops_worst = mflops;
+        gflops = flops / timestat.mean / 1000.;
+        gflops_all.push_back(gflops);
+        if (gflops_best == 0)
+          gflops_best = gflops;
+        if (gflops_worst == 0)
+          gflops_worst = gflops;
+        if (gflops > gflops_best)
+          gflops_best = gflops;
+        if (gflops < gflops_worst)
+          gflops_worst = gflops;

        std::cout << GridLogMessage << std::fixed << std::setprecision(1)
-                  << "Deo mflop/s = " << mflops << " (" << mf_err << ") " << mf_lo
-                  << "-" << mf_hi << std::endl;
+                  << "Deo Gflop/s = " << gflops << " (" << gf_err << ") " << gf_lo
+                  << "-" << gf_hi << std::endl;
        std::cout << GridLogMessage << std::fixed << std::setprecision(1)
-                  << "Deo mflop/s per rank " << mflops / NP << std::endl;
+                  << "Deo Gflop/s per rank " << gflops / NP << std::endl;
        std::cout << GridLogMessage << std::fixed << std::setprecision(1)
-                  << "Deo mflop/s per node " << mflops / NN << std::endl;
+                  << "Deo Gflop/s per node " << gflops / NN << std::endl;
      }

      grid_small_sep();
      std::cout << GridLogMessage << L
-                << "^4 Deo Best mflop/s = " << mflops_best << " ; "
-                << mflops_best / NN << " per node " << std::endl;
+                << "^4 Deo Best Gflop/s = " << gflops_best << " ; "
+                << gflops_best / NN << " per node " << std::endl;
      std::cout << GridLogMessage << L
-                << "^4 Deo Worst mflop/s = " << mflops_worst << " ; "
-                << mflops_worst / NN << " per node " << std::endl;
+                << "^4 Deo Worst Gflop/s = " << gflops_worst << " ; "
+                << gflops_worst / NN << " per node " << std::endl;
      std::cout << GridLogMessage << fmt << std::endl;
      std::cout << GridLogMessage;

-      for (int i = 0; i < mflops_all.size(); i++)
+      for (int i = 0; i < gflops_all.size(); i++)
      {
-        std::cout << mflops_all[i] / NN << " ; ";
+        std::cout << gflops_all[i] / NN << " ; ";
      }
      std::cout << std::endl;
    }
-    return mflops_best;
+    return gflops_best;
  }
};

@@ -782,6 +761,30 @@ int main(int argc, char **argv)
  std::vector<double> dwf4;
  std::vector<double> staggered;

+  if (do_memory)
+  {
+    grid_big_sep();
+    std::cout << GridLogMessage << " Memory benchmark " << std::endl;
+    grid_big_sep();
+    Benchmark::Memory();
+  }
+
+  if (do_su4)
+  {
+    grid_big_sep();
+    std::cout << GridLogMessage << " SU(4) benchmark " << std::endl;
+    grid_big_sep();
+    Benchmark::SU4();
+  }
+
+  if (do_comms)
+  {
+    grid_big_sep();
+    std::cout << GridLogMessage << " Communications benchmark " << std::endl;
+    grid_big_sep();
+    Benchmark::Comms();
+  }
+
  if (do_flops)
  {
    Ls = 1;
@@ -810,68 +813,35 @@ int main(int argc, char **argv)
      staggered.push_back(result);
    }

+    int NN = NN_global;
+
    grid_big_sep();
-    std::cout << GridLogMessage << " Summary table Ls=" << Ls << std::endl;
+    std::cout << GridLogMessage << "Gflop/s/node Summary table Ls=" << Ls << std::endl;
    grid_big_sep();
-    std::cout << GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\t Staggered" << std::endl;
+    grid_printf("%5s %12s %12s %12s\n", "L", "Wilson", "DWF", "Staggered");
+    nlohmann::json tmp_flops;
    for (int l = 0; l < L_list.size(); l++)
    {
-      std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] << " \t\t "
-                << dwf4[l] << " \t\t " << staggered[l] << std::endl;
+      grid_printf("%5d %12.2f %12.2f %12.2f\n", L_list[l], wilson[l] / NN, dwf4[l] / NN,
+                  staggered[l] / NN);
+
      nlohmann::json tmp;
      tmp["L"] = L_list[l];
-      tmp["Mflops_wilson"] = wilson[l];
-      tmp["Mflops_dwf4"] = dwf4[l];
-      tmp["Mflops_staggered"] = staggered[l];
-      json_results["flops"].push_back(tmp);
-    }
-  }
-
-  int NN = NN_global;
-  if (do_memory)
-  {
-    grid_big_sep();
-    std::cout << GridLogMessage << " Memory benchmark " << std::endl;
-    grid_big_sep();
-    Benchmark::Memory();
-  }
-
-  if (do_su4)
-  {
-    grid_big_sep();
-    std::cout << GridLogMessage << " SU(4) benchmark " << std::endl;
-    grid_big_sep();
-    Benchmark::SU4();
-  }
-
-  if (do_comms)
-  {
-    grid_big_sep();
-    std::cout << GridLogMessage << " Communications benchmark " << std::endl;
-    grid_big_sep();
-    Benchmark::Comms();
-  }
-
-  if (do_flops)
-  {
-    grid_big_sep();
-    std::cout << GridLogMessage << " Per Node Summary table Ls=" << Ls << std::endl;
-    grid_big_sep();
-    std::cout << GridLogMessage << " L \t\t Wilson\t\t DWF4\t\t Staggered " << std::endl;
-    for (int l = 0; l < L_list.size(); l++)
-    {
-      std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] / NN << " \t "
-                << dwf4[l] / NN << " \t " << staggered[l] / NN << std::endl;
+      tmp["Gflops_wilson"] = wilson[l] / NN;
+      tmp["Gflops_dwf4"] = dwf4[l] / NN;
+      tmp["Gflops_staggered"] = staggered[l] / NN;
+      tmp_flops["results"].push_back(tmp);
    }
    grid_big_sep();
    std::cout << GridLogMessage
              << " Comparison point result: " << 0.5 * (dwf4[sel] + dwf4[selm1]) / NN
-              << " Mflop/s per node" << std::endl;
+              << " Gflop/s per node" << std::endl;
    std::cout << GridLogMessage << " Comparison point is 0.5*(" << dwf4[sel] / NN << "+"
              << dwf4[selm1] / NN << ") " << std::endl;
    std::cout << std::setprecision(3);
    grid_big_sep();
-    json_results["comp_point_Mflops"] = 0.5 * (dwf4[sel] + dwf4[selm1]) / NN;
+    tmp_flops["comparison_point_Gflops"] = 0.5 * (dwf4[sel] + dwf4[selm1]) / NN;
+    json_results["flops"] = tmp_flops;
  }

  if (!json_filename.empty())
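Note on the unit change above: the time_statistics values are wall-clock times in microseconds (as the "time (usec)" column in the communications table suggests), so flops / timestat.mean is numerically a Mflop/s figure, and the extra / 1000. factors introduced by this commit express it in Gflop/s instead. A minimal standalone sketch of that conversion, with hypothetical numbers and not part of the commit:

#include <iostream>

int main(void)
{
  // hypothetical measurement: 1.2e9 floating-point operations in 2500 us of wall time
  double flops = 1.2e9;
  double mean_usec = 2500.;

  double mflops = flops / mean_usec;         // operations per microsecond == Mflop/s
  double gflops = flops / mean_usec / 1000.; // the same quantity expressed in Gflop/s

  std::cout << mflops << " Mflop/s = " << gflops << " Gflop/s" << std::endl;
  return 0;
}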
Grid/systems/tursa/files/run.cpu.template.sh (new file)
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+# shellcheck disable=SC1091,SC2050,SC2170
+
+## This set of slurm settings assumes that the AMD chips are using bios setting NPS4 (4 mpi taks per socket).
+
+#SBATCH -J @job-name@
+#SBATCH -A @budget@
+#SBATCH -t 48:00:00
+#SBATCH --nodes=@nnodes@
+#SBATCH --ntasks=@ntasks@
+#SBATCH --ntasks-per-node=8
+#SBATCH --cpus-per-task=32
+#SBATCH --partition=@partition@
+#SBATCH --output=%x.%j.out
+#SBATCH --error=%x.%j.err
+#SBATCH --qos=standard
+#SBATCH --no-requeue
+
+set -e
+
+# OpenMP/OpenMPI/UCX environment ###############################################
+export OMP_NUM_THREADS=16
+export OMP_DISPLAY_AFFINITY=true
+export OMPI_MCA_btl=^uct,openib
+export OMPI_MCA_pml=ucx
+export UCX_TLS=rc,sm,self
+export UCX_RNDV_THRESH=16384
+export UCX_MEMTYPE_CACHE=n
+export UCX_NET_DEVICES=mlx5_0:1
+
+export OMPI_MCA_BTL_SM_USE_KNEM=1
+export OMPI_MCA_coll_hcoll_enable=1
+export OMPI_MCA_coll_hcoll_np=0
+
+# IO environment ###############################################################
+if [ @nnodes@ -eq 1 ]; then
+  export OMPI_MCA_io=ompio
+else
+  export OMPI_MCA_io=romio321
+fi
+
+export OMPI_MCA_btl_openib_allow_ib=true
+export OMPI_MCA_btl_openib_device_type=infiniband
+export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3 # are these needed here?
+
+# load environment #############################################################
+env_dir="$(readlink -f @env-dir@)"
+source "${env_dir}/env-base.sh"
+if [ "${SLURM_JOB_PARTITION}" = 'cpu' ]; then
+  source "${env_dir}/env-cpu.sh"
+else
+  echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
+  exit 1
+fi
+
+# application and parameters ###################################################
+app='@application@'
+opt='--comms-overlap --comms-concurrent'
+par='@par@'
+
+# collect job information ######################################################
+job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
+mkdir -p "${job_info_dir}"
+
+date > "${job_info_dir}/start-date"
+set > "${job_info_dir}/env"
+ldd ${app} > "${job_info_dir}/ldd"
+md5sum ${app} > "${job_info_dir}/app-hash"
+readelf -a ${app} > "${job_info_dir}/elf"
+echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
+cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
+if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
+
+# run! #########################################################################
+mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
+  ./cpu-mpi-wrapper.sh \
+  ${app} "${par}" "${opt[@]}" \
+  --mpi @mpi-geom@ \
+  --grid @grid-geom@ \
+  --shm 2048 &> "${job_info_dir}/log"
+
+# if we reach that point the application exited successfully ###################
+touch "${job_info_dir}/success"
+date > "${job_info_dir}/end-date"
+
+################################################################################
Grid/systems/tursa/files/run.gpu.template.sh (new file)
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+# shellcheck disable=SC1091,SC2050,SC2170
+
+# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
+
+#SBATCH -J @job-name@
+#SBATCH -A @budget@
+#SBATCH -t 48:00:00
+#SBATCH --nodes=@nnodes@
+#SBATCH --ntasks=@ntasks@
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=8
+#SBATCH --partition=@partition@
+#SBATCH --gres=gpu:4
+#SBATCH --output=%x.%j.out
+#SBATCH --error=%x.%j.err
+#SBATCH --qos=standard
+#SBATCH --no-requeue
+
+set -e
+
+# OpenMP/OpenMPI/UCX environment ###############################################
+export OMP_NUM_THREADS=8
+export OMPI_MCA_btl=^uct,openib
+export OMPI_MCA_pml=ucx
+export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
+export UCX_RNDV_SCHEME=put_zcopy
+export UCX_RNDV_THRESH=16384
+export UCX_IB_GPU_DIRECT_RDMA=yes
+export UCX_MEMTYPE_CACHE=n
+
+# IO environment ###############################################################
+
+if [ @nnodes@ -eq 1 ]; then
+  export OMPI_MCA_io=ompio
+else
+  export OMPI_MCA_io=romio321
+fi
+export OMPI_MCA_btl_openib_allow_ib=true
+export OMPI_MCA_btl_openib_device_type=infiniband
+export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
+
+# load environment #############################################################
+env_dir="$(readlink -f @env-dir@)"
+source "${env_dir}/env-base.sh"
+if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
+  source "${env_dir}/env-gpu.sh"
+else
+  echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
+  exit 1
+fi
+
+# application and parameters ###################################################
+app='@application@'
+opt=('--comms-overlap' '--comms-concurrent')
+par='@par@'
+
+# collect job information ######################################################
+job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
+mkdir -p "${job_info_dir}"
+
+date > "${job_info_dir}/start-date"
+set > "${job_info_dir}/env"
+ldd ${app} > "${job_info_dir}/ldd"
+md5sum ${app} > "${job_info_dir}/app-hash"
+readelf -a ${app} > "${job_info_dir}/elf"
+echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
+cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
+if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
+
+# run! #########################################################################
+mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
+  ./gpu-mpi-wrapper.sh \
+  ${app} "${par}" "${opt[@]}" \
+  --mpi @mpi-geom@ \
+  --accelerator-threads 8 \
+  --grid @grid-geom@ \
+  --shm 2048 &> "${job_info_dir}/log"
+
+# if we reach that point the application exited successfully ###################
+touch "${job_info_dir}/success"
+date > "${job_info_dir}/end-date"
+
+################################################################################