70 lines
3.0 KiB
Bash
70 lines
3.0 KiB
Bash
|
#!/usr/bin/env bash
#
# Print a summary table of a GPU clock-limit scan: one row per clock limit,
# from 210 MHz to 1410 MHz in steps of 15 MHz.
#
# Usage:
#   <script> <job db> <job table> <rack power db> <rack power table>
#
# For every clock limit the script
#   * requires exactly 4 rows in the job table,
#   * takes the window [latest job start, earliest job end],
#   * reads rack energy / median rack power over that window,
#   * accumulates GPU energy and average power from each job's SMI database
#     (table clock_limit_<c>),
#   * averages the two "Average mflops/s per call per node (full)" figures
#     found in each job's log over the 4 jobs.
#
# External tools: sqlite3, datamash, bc, grep, awk, and
# ./get-rack12-energy.sh (resolved relative to the current directory).
#
# Diagnostics go to stderr; the script exits non-zero on the first problem.

set -euo pipefail

if (( $# != 4 )); then
  echo "usage: $(basename "$0") <job db> <job table> <rack power db> <rack power table>" 1>&2
  exit 1
fi
job_db=$1
job_table=$2
power_db=$3
power_table=$4

# Log line carrying a performance figure; expected exactly twice per log
# (first occurrence is reported as Dhop, second as DhopEO).
perf_pattern='Average mflops/s per call per node (full)'

# Column 6 is a power (value is divided by 1000 below), hence kW, not kWh.
cat <<'EOF'
# Columns
# -------
# 1 clock limit (MHz)
# 2 job duration (s)
# 3 rack energy (kWh)
# 4 rack median power (kW)
# 5 GPU energy (kWh)
# 6 GPU average power (kW)
# 7 Dhop average performance (TFlop/s/node)
# 8 DhopEO average performance (TFlop/s/node)
EOF

for (( c = 210; c <= 1410; c += 15 )); do
  # Validate the job count first: with no matching rows MAX()/MIN() below
  # print an empty string and the duration arithmetic would fail cryptically.
  njobs=$(sqlite3 "${job_db}" "SELECT COUNT(job_dir) FROM ${job_table} WHERE clock_limit = ${c}")
  if (( njobs != 4 )); then
    echo "error: number of jobs should be 4" 1>&2
    exit 1
  fi

  # Window common to all jobs at this clock limit.
  start=$(sqlite3 "${job_db}" "SELECT MAX(start) FROM ${job_table} WHERE clock_limit = ${c}")
  end=$(sqlite3 "${job_db}" "SELECT MIN(end) FROM ${job_table} WHERE clock_limit = ${c}")
  duration=$((end - start))

  sample_count=$(sqlite3 "${power_db}" "SELECT COUNT(*) FROM ${power_table} WHERE timestamp >= ${start} and timestamp <= ${end};")
  if (( sample_count == 0 )); then
    echo "error: no rack power sample" 1>&2
    exit 1
  fi

  energy=$(./get-rack12-energy.sh "${power_db}" "${power_table}" "${start}" "${end}")
  power_med=$(sqlite3 "${power_db}" "SELECT (rack_1 + rack_2)/1000. FROM ${power_table} WHERE timestamp >= ${start} and timestamp <= ${end};" | datamash median 1)

  gpu_energy='0.'
  gpu_power='0.'
  for nodes in 8 16; do
    smi_dbs=$(sqlite3 "${job_db}" "SELECT smi_db FROM ${job_table} WHERE clock_limit = ${c} AND nodes=${nodes}")
    # Read one path per line on fd 3 so that paths are neither word-split nor
    # glob-expanded, and commands in the loop body keep their own stdin.
    while IFS= read -r smi_db <&3; do
      [[ -n "${smi_db}" ]] || continue
      job_energy=$(sqlite3 "${smi_db}" "SELECT ${nodes}*SUM(power) FROM clock_limit_${c};")
      # NOTE(review): the factor 4 is presumably the number of GPUs per node - confirm.
      job_power=$(sqlite3 "${smi_db}" "SELECT ${nodes}*4*AVG(power)/1000. FROM clock_limit_${c};")
      # SUM()/AVG() over an empty table yield NULL, which sqlite3 prints as an
      # empty string; bc would then be handed an incomplete expression.
      if [[ -z "${job_energy}" || -z "${job_power}" ]]; then
        echo "error: no GPU power sample in ${smi_db} for clock limit ${c}" 1>&2
        exit 1
      fi
      gpu_energy=$(echo "${gpu_energy} + ${job_energy}" | bc -l)
      gpu_power=$(echo "${gpu_power} + ${job_power}" | bc -l)
    done 3<<<"${smi_dbs}"
  done

  job_dirs=$(sqlite3 "${job_db}" "SELECT job_dir FROM ${job_table} WHERE clock_limit = ${c}")
  dhop_perf='0.'
  dhopeo_perf='0.'
  while IFS= read -r job_dir <&3; do
    [[ -n "${job_dir}" ]] || continue
    log="${job_dir}/log"
    # grep -c exits 1 on zero matches and prints nothing for an unreadable
    # file; compare as a string so both cases are reported as ill-formed.
    match_count=$(grep -c "${perf_pattern}" "${log}" || true)
    if [[ "${match_count}" != 2 ]]; then
      echo "error: log ${log} is ill-formed" 1>&2
      exit 1
    fi
    # Last field of the matching line, scaled by 1e-6 (Mflop/s -> Tflop/s).
    tmp_dhop_perf=$(grep "${perf_pattern}" "${log}" | head -n1 | awk '{printf("%.6f", $NF/1.e+6)}')
    tmp_dhopeo_perf=$(grep "${perf_pattern}" "${log}" | tail -n1 | awk '{printf("%.6f", $NF/1.e+6)}')
    dhop_perf=$(echo "${dhop_perf} + ${tmp_dhop_perf}" | bc -l)
    dhopeo_perf=$(echo "${dhopeo_perf} + ${tmp_dhopeo_perf}" | bc -l)
  done 3<<<"${job_dirs}"

  # Average over the 4 jobs (count enforced at the top of the iteration).
  dhop_perf=$(echo "${dhop_perf}/4." | bc -l)
  dhopeo_perf=$(echo "${dhopeo_perf}/4." | bc -l)
  # NOTE(review): 3600000 = 3600 * 1000, i.e. W*s -> kWh; this assumes SMI
  # power samples are in watts at one sample per second - confirm.
  gpu_energy=$(echo "${gpu_energy}/3600000." | bc -l)

  printf '%5d %5d %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f\n' "${c}" "${duration}" "${energy}" "${power_med}" "${gpu_energy}" "${gpu_power}" "${dhop_perf}" "${dhopeo_perf}"
done
|