forked from portelli/lattice-benchmarks
Compare commits
56 Commits
cb538bfbf1
9056e9023c
c049a2ad0b
fb43d16830
6fa2e6bcd0
fb4c456776
3fbb8ea346
86b160cb5c
dc411017bb
b2cc780690
6d87396576
e9d084ce09
32e301fc67
eaa4feee43
025f9dab50
3a561091d9
191c0cfca5
6f9af8acad
371a329457
f81cb198ab
a7e1d9e67f
19c9dcb6ae
7d89380b80
4cd67805b9
f7e607eae4
a267986800
a1ec08cdb3
fb6c79d9ca
d7647afa72
ba00493c7d
6055e0503c
6ea093fc80
fa47ec5bbe
7235bfde4c
e5c61c2db1
80c80049d7
ce0d4d9457
cc4c0255bc
bdfb94bf11
af950e6e28
14fb2fddc2
5198bbe1cd
5f9abbb8d0
9b6c6d4d40
43e264d29b
51eae5723e
ce890a8fc2
8f1a556afa
58080730ae
f180cbb8ec
5098f57f08
0e2b7225db
7b689a8c94
77c75ea5b8
78c464d1d8
8f043343fb
File diff suppressed because it is too large
@ -32,7 +32,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 #ifdef HAVE_LIME
 using namespace Grid;
 
-std::string filestem(const int l) { return "iobench_l" + std::to_string(l); }
+std::string filestem(const int l) { return "io/iobench_l" + std::to_string(l); }
 
 int vol(const int i) { return BENCH_IO_LMIN + 2 * i; }
 
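The new filestem writes the benchmark files under an `io/` subdirectory. A minimal preparation sketch, assuming the benchmark does not create that directory itself:

```bash
# Hypothetical preparation step before running Benchmark_IO: the iobench_l*
# files are now written under io/, so create the directory first
# (assumption: the benchmark does not create it on its own).
mkdir -p io
```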
@ -56,13 +56,6 @@ template <typename Mat> void stats(Mat &mean, Mat &stdDev, const std::vector<Mat
 mean /= n;
 }
 
-#define grid_printf(...) \
-{ \
-char _buf[1024]; \
-sprintf(_buf, __VA_ARGS__); \
-MSG << _buf; \
-}
-
 enum
 {
 sRead = 0,
@ -83,58 +76,58 @@ int main(int argc, char **argv)
 std::vector<Eigen::VectorXd> avPerf(BENCH_IO_NPASS, Eigen::VectorXd::Zero(4));
 std::vector<int> latt;
 
-MSG << "Grid is setup to use " << threads << " threads" << std::endl;
-MSG << "MPI partition " << mpi << std::endl;
+GRID_MSG << "Grid is setup to use " << threads << " threads" << std::endl;
+GRID_MSG << "MPI partition " << mpi << std::endl;
 for (unsigned int i = 0; i < BENCH_IO_NPASS; ++i)
 {
-MSG << BIGSEP << std::endl;
-MSG << "Pass " << i + 1 << "/" << BENCH_IO_NPASS << std::endl;
-MSG << BIGSEP << std::endl;
-MSG << SEP << std::endl;
-MSG << "Benchmark std write" << std::endl;
-MSG << SEP << std::endl;
+grid_big_sep();
+GRID_MSG << "Pass " << i + 1 << "/" << BENCH_IO_NPASS << std::endl;
+grid_big_sep();
+grid_small_sep();
+GRID_MSG << "Benchmark std write" << std::endl;
+grid_small_sep();
 for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
 {
 latt = {l * mpi[0], l * mpi[1], l * mpi[2], l * mpi[3]};
 
-MSG << "-- Local volume " << l << "^4" << std::endl;
+GRID_MSG << "-- Local volume " << l << "^4" << std::endl;
 writeBenchmark<LatticeFermion>(latt, filestem(l), stdWrite<LatticeFermion>);
 perf[i](volInd(l), sWrite) = BinaryIO::lastPerf.mbytesPerSecond;
 }
 
-MSG << SEP << std::endl;
-MSG << "Benchmark std read" << std::endl;
-MSG << SEP << std::endl;
+grid_small_sep();
+GRID_MSG << "Benchmark std read" << std::endl;
+grid_small_sep();
 for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
 {
 latt = {l * mpi[0], l * mpi[1], l * mpi[2], l * mpi[3]};
 
-MSG << "-- Local volume " << l << "^4" << std::endl;
+GRID_MSG << "-- Local volume " << l << "^4" << std::endl;
 readBenchmark<LatticeFermion>(latt, filestem(l), stdRead<LatticeFermion>);
 perf[i](volInd(l), sRead) = BinaryIO::lastPerf.mbytesPerSecond;
 }
 
 #ifdef HAVE_LIME
-MSG << SEP << std::endl;
-MSG << "Benchmark Grid C-Lime write" << std::endl;
-MSG << SEP << std::endl;
+grid_small_sep();
+GRID_MSG << "Benchmark Grid C-Lime write" << std::endl;
+grid_small_sep();
 for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
 {
 latt = {l * mpi[0], l * mpi[1], l * mpi[2], l * mpi[3]};
 
-MSG << "-- Local volume " << l << "^4" << std::endl;
+GRID_MSG << "-- Local volume " << l << "^4" << std::endl;
 writeBenchmark<LatticeFermion>(latt, filestem(l), limeWrite<LatticeFermion>);
 perf[i](volInd(l), gWrite) = BinaryIO::lastPerf.mbytesPerSecond;
 }
 
-MSG << SEP << std::endl;
-MSG << "Benchmark Grid C-Lime read" << std::endl;
-MSG << SEP << std::endl;
+grid_small_sep();
+GRID_MSG << "Benchmark Grid C-Lime read" << std::endl;
+grid_small_sep();
 for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
 {
 latt = {l * mpi[0], l * mpi[1], l * mpi[2], l * mpi[3]};
 
-MSG << "-- Local volume " << l << "^4" << std::endl;
+GRID_MSG << "-- Local volume " << l << "^4" << std::endl;
 readBenchmark<LatticeFermion>(latt, filestem(l), limeRead<LatticeFermion>);
 perf[i](volInd(l), gRead) = BinaryIO::lastPerf.mbytesPerSecond;
 }
@ -159,13 +152,13 @@ int main(int argc, char **argv)
 avRob.fill(100.);
 avRob -= 100. * avStdDev.cwiseQuotient(avMean.cwiseAbs());
 
-MSG << BIGSEP << std::endl;
-MSG << "SUMMARY" << std::endl;
-MSG << BIGSEP << std::endl;
-MSG << "Summary of individual results (all results in MB/s)." << std::endl;
-MSG << "Every second colum gives the standard deviation of the previous column."
+grid_big_sep();
+GRID_MSG << "SUMMARY" << std::endl;
+grid_big_sep();
+GRID_MSG << "Summary of individual results (all results in MB/s)." << std::endl;
+GRID_MSG << "Every second colum gives the standard deviation of the previous column."
 << std::endl;
-MSG << std::endl;
+GRID_MSG << std::endl;
 grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n", "L", "std read", "std dev",
 "std write", "std dev", "Grid read", "std dev", "Grid write", "std dev");
 for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
@ -176,10 +169,10 @@ int main(int argc, char **argv)
 stdDev(volInd(l), gRead), mean(volInd(l), gWrite),
 stdDev(volInd(l), gWrite));
 }
-MSG << std::endl;
-MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)"
+GRID_MSG << std::endl;
+GRID_MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)"
 << std::endl;
-MSG << std::endl;
+GRID_MSG << std::endl;
 grid_printf("%4s %12s %12s %12s %12s\n", "L", "std read", "std write", "Grid read",
 "Grid write");
 for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
@ -187,21 +180,21 @@ int main(int argc, char **argv)
 grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n", l, rob(volInd(l), sRead),
 rob(volInd(l), sWrite), rob(volInd(l), gRead), rob(volInd(l), gWrite));
 }
-MSG << std::endl;
-MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX
+GRID_MSG << std::endl;
+GRID_MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX
 << "^4 (all results in MB/s)." << std::endl;
-MSG << "Every second colum gives the standard deviation of the previous column."
+GRID_MSG << "Every second colum gives the standard deviation of the previous column."
 << std::endl;
-MSG << std::endl;
+GRID_MSG << std::endl;
 grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n", "std read", "std dev",
 "std write", "std dev", "Grid read", "std dev", "Grid write", "std dev");
 grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", avMean(sRead),
 avStdDev(sRead), avMean(sWrite), avStdDev(sWrite), avMean(gRead),
 avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
-MSG << std::endl;
-MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)"
+GRID_MSG << std::endl;
+GRID_MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)"
 << std::endl;
-MSG << std::endl;
+GRID_MSG << std::endl;
 grid_printf("%12s %12s %12s %12s\n", "std read", "std write", "Grid read",
 "Grid write");
 grid_printf("%12.1f %12.1f %12.1f %12.1f\n", avRob(sRead), avRob(sWrite), avRob(gRead),
@ -18,12 +18,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 #ifndef Benchmark_IO_hpp_
 #define Benchmark_IO_hpp_
 
+#include "Common.hpp"
 #include <Grid/Grid.h>
-#define MSG std::cout << GridLogMessage
-#define SEP \
-"-----------------------------------------------------------------------------"
-#define BIGSEP \
-"============================================================================="
 #ifdef HAVE_LIME
 
 namespace Grid
@ -50,9 +46,9 @@ namespace Grid
 // crc = GridChecksum::crc32(vec_v.cpu_ptr, size);
 // std::fwrite(&crc, sizeof(uint32_t), 1, file);
 // crcWatch.Stop();
-// MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
-// ioWatch.Start();
-// std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object),
+// GRID_MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec <<
+// std::endl; ioWatch.Start(); std::fwrite(vec_v.cpu_ptr, sizeof(typename
+// Field::scalar_object),
 // vec.Grid()->lSites(), file);
 // ioWatch.Stop();
 // std::fclose(file);
@ -61,11 +57,11 @@ namespace Grid
 // p.size = size;
 // p.time = ioWatch.useconds();
 // p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
-// MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
+// GRID_MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
 // << ",
 // "
 // << p.mbytesPerSecond << " MB/s" << std::endl;
-// MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
+// GRID_MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
 // }
 
 // template <typename Field> void stdRead(Field &vec, const std::string filestem)
@ -94,16 +90,14 @@ namespace Grid
 // crcData = GridChecksum::crc32(vec_v.cpu_ptr, size);
 // crcWatch.Stop();
 // }
-// MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
-// assert(crcData == crcRead);
-// size *= vec.Grid()->ProcessorCount();
-// auto &p = BinaryIO::lastPerf;
-// p.size = size;
-// p.time = ioWatch.useconds();
+// GRID_MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec <<
+// std::endl; assert(crcData == crcRead); size *= vec.Grid()->ProcessorCount(); auto
+// &p = BinaryIO::lastPerf; p.size = size; p.time = ioWatch.useconds();
 // p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
-// MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() << ", "
+// GRID_MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() <<
+// ", "
 // << p.mbytesPerSecond << " MB/s" << std::endl;
-// MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
+// GRID_MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
 // }
 
 template <typename Field> void stdWrite(const std::string filestem, Field &vec)
@ -122,7 +116,7 @@ namespace Grid
 crc = GridChecksum::crc32(vec_v.cpu_ptr, size);
 file.write(reinterpret_cast<char *>(&crc), sizeof(uint32_t) / sizeof(char));
 crcWatch.Stop();
-MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
+GRID_MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
 ioWatch.Start();
 file.write(reinterpret_cast<char *>(vec_v.cpu_ptr), sizec);
 file.flush();
@ -132,9 +126,9 @@ namespace Grid
 p.size = size;
 p.time = ioWatch.useconds();
 p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
-MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() << ", "
-<< p.mbytesPerSecond << " MB/s" << std::endl;
-MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
+GRID_MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
+<< ", " << p.mbytesPerSecond << " MB/s" << std::endl;
+GRID_MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
 }
 
 template <typename Field> void stdRead(Field &vec, const std::string filestem)
@ -163,16 +157,17 @@ namespace Grid
 crcData = GridChecksum::crc32(vec_v.cpu_ptr, size);
 crcWatch.Stop();
 }
-MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
+GRID_MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec
+<< std::endl;
 assert(crcData == crcRead);
 size *= vec.Grid()->ProcessorCount();
 auto &p = BinaryIO::lastPerf;
 p.size = size;
 p.time = ioWatch.useconds();
 p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
-MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() << ", "
-<< p.mbytesPerSecond << " MB/s" << std::endl;
-MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
+GRID_MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed()
+<< ", " << p.mbytesPerSecond << " MB/s" << std::endl;
+GRID_MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
 }
 
 template <typename Field> void limeWrite(const std::string filestem, Field &vec)
@ -1,265 +0,0 @@
|
|||||||
/*
|
|
||||||
Copyright © 2015 Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
||||||
Copyright © 2022 Antonin Portelli <antonin.portelli@me.com>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or
|
|
||||||
modify it under the terms of the GNU General Public License
|
|
||||||
as published by the Free Software Foundation; either version 2
|
|
||||||
of the License, or (at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License
|
|
||||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <Grid/Grid.h>
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
using namespace Grid;
|
|
||||||
|
|
||||||
struct time_statistics
|
|
||||||
{
|
|
||||||
double mean;
|
|
||||||
double err;
|
|
||||||
double min;
|
|
||||||
double max;
|
|
||||||
|
|
||||||
void statistics(std::vector<double> v)
|
|
||||||
{
|
|
||||||
double sum = std::accumulate(v.begin(), v.end(), 0.0);
|
|
||||||
mean = sum / v.size();
|
|
||||||
|
|
||||||
std::vector<double> diff(v.size());
|
|
||||||
std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
|
|
||||||
double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
|
|
||||||
err = std::sqrt(sq_sum / (v.size() * (v.size() - 1)));
|
|
||||||
|
|
||||||
auto result = std::minmax_element(v.begin(), v.end());
|
|
||||||
min = *result.first;
|
|
||||||
max = *result.second;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
void header()
|
|
||||||
{
|
|
||||||
std::cout << GridLogMessage << " L "
|
|
||||||
<< "\t"
|
|
||||||
<< " Ls "
|
|
||||||
<< "\t" << std::setw(11) << "bytes\t\t"
|
|
||||||
<< "MB/s uni"
|
|
||||||
<< "\t"
|
|
||||||
<< "MB/s bidi" << std::endl;
|
|
||||||
};
|
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
|
||||||
{
|
|
||||||
Grid_init(&argc, &argv);
|
|
||||||
|
|
||||||
Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
|
|
||||||
Coordinate mpi_layout = GridDefaultMpi();
|
|
||||||
int threads = GridThread::GetThreads();
|
|
||||||
std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads"
|
|
||||||
<< std::endl;
|
|
||||||
|
|
||||||
int Nloop = 250;
|
|
||||||
int nmu = 0;
|
|
||||||
int maxlat = 32;
|
|
||||||
for (int mu = 0; mu < Nd; mu++)
|
|
||||||
if (mpi_layout[mu] > 1)
|
|
||||||
nmu++;
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Number of iterations to average: " << Nloop
|
|
||||||
<< std::endl;
|
|
||||||
std::vector<double> t_time(Nloop);
|
|
||||||
// time_statistics timestat;
|
|
||||||
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "========================================================================="
|
|
||||||
"==========================="
|
|
||||||
<< std::endl;
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "= Benchmarking sequential halo exchange from host memory " << std::endl;
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "========================================================================="
|
|
||||||
"==========================="
|
|
||||||
<< std::endl;
|
|
||||||
header();
|
|
||||||
|
|
||||||
for (int lat = 8; lat <= maxlat; lat += 4)
|
|
||||||
{
|
|
||||||
for (int Ls = 8; Ls <= 8; Ls *= 2)
|
|
||||||
{
|
|
||||||
|
|
||||||
Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
|
|
||||||
lat * mpi_layout[3]});
|
|
||||||
|
|
||||||
GridCartesian Grid(latt_size, simd_layout, mpi_layout);
|
|
||||||
RealD Nrank = Grid._Nprocessors;
|
|
||||||
RealD Nnode = Grid.NodeCount();
|
|
||||||
RealD ppn = Nrank / Nnode;
|
|
||||||
|
|
||||||
std::vector<std::vector<HalfSpinColourVectorD>> xbuf(8);
|
|
||||||
std::vector<std::vector<HalfSpinColourVectorD>> rbuf(8);
|
|
||||||
|
|
||||||
for (int mu = 0; mu < 8; mu++)
|
|
||||||
{
|
|
||||||
xbuf[mu].resize(lat * lat * lat * Ls);
|
|
||||||
rbuf[mu].resize(lat * lat * lat * Ls);
|
|
||||||
}
|
|
||||||
uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD);
|
|
||||||
|
|
||||||
int ncomm;
|
|
||||||
|
|
||||||
for (int mu = 0; mu < 4; mu++)
|
|
||||||
{
|
|
||||||
if (mpi_layout[mu] > 1)
|
|
||||||
{
|
|
||||||
double start = usecond();
|
|
||||||
for (int i = 0; i < Nloop; i++)
|
|
||||||
{
|
|
||||||
|
|
||||||
ncomm = 0;
|
|
||||||
|
|
||||||
ncomm++;
|
|
||||||
int comm_proc = 1;
|
|
||||||
int xmit_to_rank;
|
|
||||||
int recv_from_rank;
|
|
||||||
|
|
||||||
{
|
|
||||||
std::vector<CommsRequest_t> requests;
|
|
||||||
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
|
|
||||||
Grid.SendToRecvFrom((void *)&xbuf[mu][0], xmit_to_rank,
|
|
||||||
(void *)&rbuf[mu][0], recv_from_rank, bytes);
|
|
||||||
}
|
|
||||||
|
|
||||||
comm_proc = mpi_layout[mu] - 1;
|
|
||||||
{
|
|
||||||
std::vector<CommsRequest_t> requests;
|
|
||||||
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
|
|
||||||
Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0], xmit_to_rank,
|
|
||||||
(void *)&rbuf[mu + 4][0], recv_from_rank, bytes);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Grid.Barrier();
|
|
||||||
double stop = usecond();
|
|
||||||
double mean = (stop - start) / Nloop;
|
|
||||||
double dbytes = bytes * ppn;
|
|
||||||
double xbytes = dbytes * 2.0 * ncomm;
|
|
||||||
double rbytes = xbytes;
|
|
||||||
double bidibytes = xbytes + rbytes;
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << std::setw(4) << lat << "\t" << Ls << "\t"
|
|
||||||
<< std::setw(11) << bytes << std::fixed << std::setprecision(1)
|
|
||||||
<< std::setw(7) << " " << std::right << xbytes / mean << " "
|
|
||||||
<< "\t\t" << std::setw(7) << bidibytes / mean << std::endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "========================================================================="
|
|
||||||
"==========================="
|
|
||||||
<< std::endl;
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "= Benchmarking sequential halo exchange from GPU memory " << std::endl;
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "========================================================================="
|
|
||||||
"==========================="
|
|
||||||
<< std::endl;
|
|
||||||
header();
|
|
||||||
|
|
||||||
for (int lat = 8; lat <= maxlat; lat += 4)
|
|
||||||
{
|
|
||||||
for (int Ls = 8; Ls <= 8; Ls *= 2)
|
|
||||||
{
|
|
||||||
|
|
||||||
Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
|
|
||||||
lat * mpi_layout[3]});
|
|
||||||
|
|
||||||
GridCartesian Grid(latt_size, simd_layout, mpi_layout);
|
|
||||||
RealD Nrank = Grid._Nprocessors;
|
|
||||||
RealD Nnode = Grid.NodeCount();
|
|
||||||
RealD ppn = Nrank / Nnode;
|
|
||||||
|
|
||||||
std::vector<HalfSpinColourVectorD *> xbuf(8);
|
|
||||||
std::vector<HalfSpinColourVectorD *> rbuf(8);
|
|
||||||
|
|
||||||
uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD);
|
|
||||||
for (int d = 0; d < 8; d++)
|
|
||||||
{
|
|
||||||
xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
|
|
||||||
rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
|
|
||||||
}
|
|
||||||
|
|
||||||
int ncomm;
|
|
||||||
|
|
||||||
for (int mu = 0; mu < 4; mu++)
|
|
||||||
{
|
|
||||||
if (mpi_layout[mu] > 1)
|
|
||||||
{
|
|
||||||
double start = usecond();
|
|
||||||
for (int i = 0; i < Nloop; i++)
|
|
||||||
{
|
|
||||||
|
|
||||||
ncomm = 0;
|
|
||||||
|
|
||||||
ncomm++;
|
|
||||||
int comm_proc = 1;
|
|
||||||
int xmit_to_rank;
|
|
||||||
int recv_from_rank;
|
|
||||||
|
|
||||||
{
|
|
||||||
std::vector<CommsRequest_t> requests;
|
|
||||||
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
|
|
||||||
Grid.SendToRecvFrom((void *)&xbuf[mu][0], xmit_to_rank,
|
|
||||||
(void *)&rbuf[mu][0], recv_from_rank, bytes);
|
|
||||||
}
|
|
||||||
|
|
||||||
comm_proc = mpi_layout[mu] - 1;
|
|
||||||
{
|
|
||||||
std::vector<CommsRequest_t> requests;
|
|
||||||
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
|
|
||||||
Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0], xmit_to_rank,
|
|
||||||
(void *)&rbuf[mu + 4][0], recv_from_rank, bytes);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Grid.Barrier();
|
|
||||||
double stop = usecond();
|
|
||||||
double mean = (stop - start) / Nloop;
|
|
||||||
double dbytes = bytes * ppn;
|
|
||||||
double xbytes = dbytes * 2.0 * ncomm;
|
|
||||||
double rbytes = xbytes;
|
|
||||||
double bidibytes = xbytes + rbytes;
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << std::setw(4) << lat << "\t" << Ls << "\t"
|
|
||||||
<< std::setw(11) << bytes << std::fixed << std::setprecision(1)
|
|
||||||
<< std::setw(7) << " " << std::right << xbytes / mean << " "
|
|
||||||
<< "\t\t" << std::setw(7) << bidibytes / mean << std::endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int d = 0; d < 8; d++)
|
|
||||||
{
|
|
||||||
acceleratorFreeDevice(xbuf[d]);
|
|
||||||
acceleratorFreeDevice(rbuf[d]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "========================================================================="
|
|
||||||
"==========================="
|
|
||||||
<< std::endl;
|
|
||||||
std::cout << GridLogMessage << "= All done; Bye Bye" << std::endl;
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "========================================================================="
|
|
||||||
"==========================="
|
|
||||||
<< std::endl;
|
|
||||||
|
|
||||||
Grid_finalize();
|
|
||||||
}
|
|
@ -1,512 +0,0 @@
|
|||||||
/*
|
|
||||||
Copyright © 2015 Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
||||||
Copyright © 2022 Antonin Portelli <antonin.portelli@me.com>
|
|
||||||
Copyright © 2023 Simon Bürger <simon.buerger@rwth-aachen.de>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or
|
|
||||||
modify it under the terms of the GNU General Public License
|
|
||||||
as published by the Free Software Foundation; either version 2
|
|
||||||
of the License, or (at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License
|
|
||||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "json.hpp"
|
|
||||||
#include <Grid/Grid.h>
|
|
||||||
#ifdef GRID_CUDA
|
|
||||||
#define CUDA_PROFILE
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef CUDA_PROFILE
|
|
||||||
#include <cuda_profiler_api.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
using namespace Grid;
|
|
||||||
|
|
||||||
template <class d> struct scal
|
|
||||||
{
|
|
||||||
d internal;
|
|
||||||
};
|
|
||||||
|
|
||||||
Gamma::Algebra Gmu[] = {Gamma::Algebra::GammaX, Gamma::Algebra::GammaY,
|
|
||||||
Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT};
|
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
|
||||||
{
|
|
||||||
Grid_init(&argc, &argv);
|
|
||||||
|
|
||||||
int threads = GridThread::GetThreads();
|
|
||||||
|
|
||||||
Coordinate latt4 = GridDefaultLatt();
|
|
||||||
int Ls = 16;
|
|
||||||
std::string json_filename = ""; // empty indicates no json output
|
|
||||||
nlohmann::json json;
|
|
||||||
|
|
||||||
// benchmark specific command line arguments
|
|
||||||
for (int i = 0; i < argc; i++)
|
|
||||||
{
|
|
||||||
if (std::string(argv[i]) == "-Ls")
|
|
||||||
{
|
|
||||||
std::stringstream ss(argv[i + 1]);
|
|
||||||
ss >> Ls;
|
|
||||||
}
|
|
||||||
if (std::string(argv[i]) == "--json-out")
|
|
||||||
json_filename = argv[i + 1];
|
|
||||||
}
|
|
||||||
|
|
||||||
GridLogLayout();
|
|
||||||
|
|
||||||
long unsigned int single_site_flops = 8 * Nc * (7 + 16 * Nc);
|
|
||||||
|
|
||||||
json["single_site_flops"] = single_site_flops;
|
|
||||||
|
|
||||||
GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
|
|
||||||
GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
|
|
||||||
GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
|
||||||
|
|
||||||
GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
|
|
||||||
GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
|
|
||||||
|
|
||||||
json["grid"] = FGrid->FullDimensions().toVector();
|
|
||||||
json["local_grid"] = FGrid->LocalDimensions().toVector();
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Making s innermost grids" << std::endl;
|
|
||||||
GridCartesian *sUGrid =
|
|
||||||
SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(), GridDefaultMpi());
|
|
||||||
|
|
||||||
GridRedBlackCartesian *sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
|
|
||||||
GridCartesian *sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls, UGrid);
|
|
||||||
GridRedBlackCartesian *sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls, UGrid);
|
|
||||||
|
|
||||||
std::vector<int> seeds4({1, 2, 3, 4});
|
|
||||||
std::vector<int> seeds5({5, 6, 7, 8});
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
|
|
||||||
GridParallelRNG RNG4(UGrid);
|
|
||||||
RNG4.SeedUniqueString(std::string("The 4D RNG"));
|
|
||||||
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
|
|
||||||
GridParallelRNG RNG5(FGrid);
|
|
||||||
RNG5.SeedUniqueString(std::string("The 5D RNG"));
|
|
||||||
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
|
|
||||||
|
|
||||||
LatticeFermionF src(FGrid);
|
|
||||||
random(RNG5, src);
|
|
||||||
#if 0
|
|
||||||
src = Zero();
|
|
||||||
{
|
|
||||||
Coordinate origin({0,0,0,latt4[2]-1,0});
|
|
||||||
SpinColourVectorF tmp;
|
|
||||||
tmp=Zero();
|
|
||||||
tmp()(0)(0)=Complex(-2.0,0.0);
|
|
||||||
std::cout << " source site 0 " << tmp<<std::endl;
|
|
||||||
pokeSite(tmp,src,origin);
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
RealD N2 = 1.0 / ::sqrt(norm2(src));
|
|
||||||
src = src * N2;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
LatticeFermionF result(FGrid);
|
|
||||||
result = Zero();
|
|
||||||
LatticeFermionF ref(FGrid);
|
|
||||||
ref = Zero();
|
|
||||||
LatticeFermionF tmp(FGrid);
|
|
||||||
LatticeFermionF err(FGrid);
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
|
|
||||||
LatticeGaugeFieldF Umu(UGrid);
|
|
||||||
SU<Nc>::HotConfiguration(RNG4, Umu);
|
|
||||||
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
|
|
||||||
#if 0
|
|
||||||
Umu=1.0;
|
|
||||||
for(int mu=0;mu<Nd;mu++){
|
|
||||||
LatticeColourMatrixF ttmp(UGrid);
|
|
||||||
ttmp = PeekIndex<LorentzIndex>(Umu,mu);
|
|
||||||
// if (mu !=2 ) ttmp = 0;
|
|
||||||
// ttmp = ttmp* pow(10.0,mu);
|
|
||||||
PokeIndex<LorentzIndex>(Umu,ttmp,mu);
|
|
||||||
}
|
|
||||||
std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
////////////////////////////////////
|
|
||||||
// Naive wilson implementation
|
|
||||||
////////////////////////////////////
|
|
||||||
// replicate across fifth dimension
|
|
||||||
// LatticeGaugeFieldF Umu5d(FGrid);
|
|
||||||
std::vector<LatticeColourMatrixF> U(4, UGrid);
|
|
||||||
for (int mu = 0; mu < Nd; mu++)
|
|
||||||
{
|
|
||||||
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
|
|
||||||
}
|
|
||||||
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
|
|
||||||
|
|
||||||
if (1)
|
|
||||||
{
|
|
||||||
ref = Zero();
|
|
||||||
for (int mu = 0; mu < Nd; mu++)
|
|
||||||
{
|
|
||||||
|
|
||||||
tmp = Cshift(src, mu + 1, 1);
|
|
||||||
{
|
|
||||||
autoView(tmp_v, tmp, CpuWrite);
|
|
||||||
autoView(U_v, U[mu], CpuRead);
|
|
||||||
for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
|
|
||||||
{
|
|
||||||
for (int s = 0; s < Ls; s++)
|
|
||||||
{
|
|
||||||
tmp_v[Ls * ss + s] = U_v[ss] * tmp_v[Ls * ss + s];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ref = ref + tmp - Gamma(Gmu[mu]) * tmp;
|
|
||||||
|
|
||||||
{
|
|
||||||
autoView(tmp_v, tmp, CpuWrite);
|
|
||||||
autoView(U_v, U[mu], CpuRead);
|
|
||||||
autoView(src_v, src, CpuRead);
|
|
||||||
for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
|
|
||||||
{
|
|
||||||
for (int s = 0; s < Ls; s++)
|
|
||||||
{
|
|
||||||
tmp_v[Ls * ss + s] = adj(U_v[ss]) * src_v[Ls * ss + s];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
tmp = Cshift(tmp, mu + 1, -1);
|
|
||||||
ref = ref + tmp + Gamma(Gmu[mu]) * tmp;
|
|
||||||
}
|
|
||||||
ref = -0.5 * ref;
|
|
||||||
}
|
|
||||||
|
|
||||||
RealD mass = 0.1;
|
|
||||||
RealD M5 = 1.8;
|
|
||||||
|
|
||||||
RealD NP = UGrid->_Nprocessors;
|
|
||||||
RealD NN = UGrid->NodeCount();
|
|
||||||
|
|
||||||
json["ranks"] = NP;
|
|
||||||
json["nodes"] = NN;
|
|
||||||
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "*****************************************************************"
|
|
||||||
<< std::endl;
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm"
|
|
||||||
<< std::endl;
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "*****************************************************************"
|
|
||||||
<< std::endl;
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "*****************************************************************"
|
|
||||||
<< std::endl;
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "* Benchmarking DomainWallFermionR::Dhop " << std::endl;
|
|
||||||
std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd()
|
|
||||||
<< std::endl;
|
|
||||||
std::cout << GridLogMessage << "* VComplexF size is " << sizeof(vComplexF) << " B"
|
|
||||||
<< std::endl;
|
|
||||||
|
|
||||||
if (sizeof(RealF) == 4)
|
|
||||||
std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
|
|
||||||
if (sizeof(RealF) == 8)
|
|
||||||
std::cout << GridLogMessage << "* DOUBLE precision " << std::endl;
|
|
||||||
#ifdef GRID_OMP
|
|
||||||
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
|
|
||||||
std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl;
|
|
||||||
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
|
|
||||||
std::cout << GridLogMessage << "* Using sequential comms compute" << std::endl;
|
|
||||||
#endif
|
|
||||||
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric)
|
|
||||||
std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl;
|
|
||||||
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll)
|
|
||||||
std::cout << GridLogMessage << "* Using Nc=3 WilsonKernels" << std::endl;
|
|
||||||
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm)
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "* Using Asm Nc=3 WilsonKernels" << std::endl;
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "*****************************************************************"
|
|
||||||
<< std::endl;
|
|
||||||
|
|
||||||
DomainWallFermionF Dw(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5);
|
|
||||||
int ncall = 300;
|
|
||||||
|
|
||||||
if (1)
|
|
||||||
{
|
|
||||||
FGrid->Barrier();
|
|
||||||
Dw.ZeroCounters();
|
|
||||||
Dw.Dhop(src, result, 0);
|
|
||||||
std::cout << GridLogMessage << "Called warmup" << std::endl;
|
|
||||||
double t0 = usecond();
|
|
||||||
for (int i = 0; i < ncall; i++)
|
|
||||||
{
|
|
||||||
__SSC_START;
|
|
||||||
Dw.Dhop(src, result, 0);
|
|
||||||
__SSC_STOP;
|
|
||||||
}
|
|
||||||
double t1 = usecond();
|
|
||||||
FGrid->Barrier();
|
|
||||||
|
|
||||||
double volume = Ls;
|
|
||||||
for (int mu = 0; mu < Nd; mu++)
|
|
||||||
volume = volume * latt4[mu];
|
|
||||||
double flops = single_site_flops * volume * ncall;
|
|
||||||
|
|
||||||
auto nsimd = vComplex::Nsimd();
|
|
||||||
auto simdwidth = sizeof(vComplex);
|
|
||||||
|
|
||||||
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
|
|
||||||
double data_rf = volume * ((2 * Nd + 1) * Nd * Nc + 2 * Nd * Nc * Nc) * simdwidth /
|
|
||||||
nsimd * ncall / (1024. * 1024. * 1024.);
|
|
||||||
|
|
||||||
// mem: Nd Wilson * Ls, Nd gauge, Nc colors
|
|
||||||
double data_mem =
|
|
||||||
(volume * (2 * Nd + 1) * Nd * Nc + (volume / Ls) * 2 * Nd * Nc * Nc) * simdwidth /
|
|
||||||
nsimd * ncall / (1024. * 1024. * 1024.);
|
|
||||||
|
|
||||||
json["Dw"]["calls"] = ncall;
|
|
||||||
json["Dw"]["time"] = t1 - t0;
|
|
||||||
json["Dw"]["mflops"] = flops / (t1 - t0);
|
|
||||||
json["Dw"]["mflops_per_rank"] = flops / (t1 - t0) / NP;
|
|
||||||
json["Dw"]["mflops_per_node"] = flops / (t1 - t0) / NN;
|
|
||||||
json["Dw"]["RF"] = 1000000. * data_rf / ((t1 - t0));
|
|
||||||
json["Dw"]["mem"] = 1000000. * data_mem / ((t1 - t0));
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Called Dw " << ncall << " times in " << t1 - t0
|
|
||||||
<< " us" << std::endl;
|
|
||||||
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
|
||||||
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
|
|
||||||
std::cout << GridLogMessage << "mflop/s = " << flops / (t1 - t0) << std::endl;
|
|
||||||
std::cout << GridLogMessage << "mflop/s per rank = " << flops / (t1 - t0) / NP
|
|
||||||
<< std::endl;
|
|
||||||
std::cout << GridLogMessage << "mflop/s per node = " << flops / (t1 - t0) / NN
|
|
||||||
<< std::endl;
|
|
||||||
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "RF GiB/s (base 2) = " << 1000000. * data_rf / ((t1 - t0))
|
|
||||||
<< std::endl;
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "mem GiB/s (base 2) = " << 1000000. * data_mem / ((t1 - t0))
|
|
||||||
<< std::endl;
|
|
||||||
err = ref - result;
|
|
||||||
std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
|
|
||||||
// exit(0);
|
|
||||||
|
|
||||||
if ((norm2(err) > 1.0e-4))
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
std::cout << "RESULT\n " << result<<std::endl;
|
|
||||||
std::cout << "REF \n " << ref <<std::endl;
|
|
||||||
std::cout << "ERR \n " << err <<std::endl;
|
|
||||||
*/
|
|
||||||
std::cout << GridLogMessage << "WRONG RESULT" << std::endl;
|
|
||||||
FGrid->Barrier();
|
|
||||||
exit(-1);
|
|
||||||
}
|
|
||||||
assert(norm2(err) < 1.0e-4);
|
|
||||||
Dw.Report();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (1)
|
|
||||||
{ // Naive wilson dag implementation
|
|
||||||
ref = Zero();
|
|
||||||
for (int mu = 0; mu < Nd; mu++)
|
|
||||||
{
|
|
||||||
|
|
||||||
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
|
|
||||||
tmp = Cshift(src, mu + 1, 1);
|
|
||||||
{
|
|
||||||
autoView(ref_v, ref, CpuWrite);
|
|
||||||
autoView(tmp_v, tmp, CpuRead);
|
|
||||||
autoView(U_v, U[mu], CpuRead);
|
|
||||||
for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
|
|
||||||
{
|
|
||||||
for (int s = 0; s < Ls; s++)
|
|
||||||
{
|
|
||||||
int i = s + Ls * ss;
|
|
||||||
ref_v[i] += U_v[ss] * (tmp_v[i] + Gamma(Gmu[mu]) * tmp_v[i]);
|
|
||||||
;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
autoView(tmp_v, tmp, CpuWrite);
|
|
||||||
autoView(U_v, U[mu], CpuRead);
|
|
||||||
autoView(src_v, src, CpuRead);
|
|
||||||
for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
|
|
||||||
{
|
|
||||||
for (int s = 0; s < Ls; s++)
|
|
||||||
{
|
|
||||||
tmp_v[Ls * ss + s] = adj(U_v[ss]) * src_v[Ls * ss + s];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// tmp =adj(U[mu])*src;
|
|
||||||
tmp = Cshift(tmp, mu + 1, -1);
|
|
||||||
{
|
|
||||||
autoView(ref_v, ref, CpuWrite);
|
|
||||||
autoView(tmp_v, tmp, CpuRead);
|
|
||||||
for (int i = 0; i < ref_v.size(); i++)
|
|
||||||
{
|
|
||||||
ref_v[i] += tmp_v[i] - Gamma(Gmu[mu]) * tmp_v[i];
|
|
||||||
;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ref = -0.5 * ref;
|
|
||||||
}
|
|
||||||
// dump=1;
|
|
||||||
Dw.Dhop(src, result, 1);
|
|
||||||
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "Compare to naive wilson implementation Dag to verify correctness"
|
|
||||||
<< std::endl;
|
|
||||||
std::cout << GridLogMessage << "Called DwDag" << std::endl;
|
|
||||||
std::cout << GridLogMessage << "norm dag result " << norm2(result) << std::endl;
|
|
||||||
std::cout << GridLogMessage << "norm dag ref " << norm2(ref) << std::endl;
|
|
||||||
err = ref - result;
|
|
||||||
std::cout << GridLogMessage << "norm dag diff " << norm2(err) << std::endl;
|
|
||||||
if ((norm2(err) > 1.0e-4))
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
std::cout<< "DAG RESULT\n " <<ref << std::endl;
|
|
||||||
std::cout<< "DAG sRESULT\n " <<result << std::endl;
|
|
||||||
std::cout<< "DAG ERR \n " << err <<std::endl;
|
|
||||||
*/
|
|
||||||
}
|
|
||||||
LatticeFermionF src_e(FrbGrid);
|
|
||||||
LatticeFermionF src_o(FrbGrid);
|
|
||||||
LatticeFermionF r_e(FrbGrid);
|
|
||||||
LatticeFermionF r_o(FrbGrid);
|
|
||||||
LatticeFermionF r_eo(FGrid);
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"
|
|
||||||
<< std::endl;
|
|
||||||
pickCheckerboard(Even, src_e, src);
|
|
||||||
pickCheckerboard(Odd, src_o, src);
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "src_e" << norm2(src_e) << std::endl;
|
|
||||||
std::cout << GridLogMessage << "src_o" << norm2(src_o) << std::endl;
|
|
||||||
|
|
||||||
// S-direction is INNERMOST and takes no part in the parity.
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
|
|
||||||
<< "*********************************************************" << std::endl;
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "* Benchmarking DomainWallFermionF::DhopEO " << std::endl;
|
|
||||||
std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd()
|
|
||||||
<< std::endl;
|
|
||||||
|
|
||||||
if (sizeof(RealF) == 4)
|
|
||||||
std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
|
|
||||||
if (sizeof(RealF) == 8)
|
|
||||||
std::cout << GridLogMessage << "* DOUBLE precision " << std::endl;
|
|
||||||
#ifdef GRID_OMP
|
|
||||||
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
|
|
||||||
std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl;
|
|
||||||
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
|
|
||||||
std::cout << GridLogMessage << "* Using sequential comms compute" << std::endl;
|
|
||||||
#endif
|
|
||||||
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric)
|
|
||||||
std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl;
|
|
||||||
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll)
|
|
||||||
std::cout << GridLogMessage << "* Using Nc=3 WilsonKernels" << std::endl;
|
|
||||||
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm)
|
|
||||||
std::cout << GridLogMessage << "* Using Asm Nc=3 WilsonKernels" << std::endl;
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< "*********************************************************" << std::endl;
|
|
||||||
|
|
||||||
{
|
|
||||||
Dw.ZeroCounters();
|
|
||||||
FGrid->Barrier();
|
|
||||||
Dw.DhopEO(src_o, r_e, DaggerNo);
|
|
||||||
double t0 = usecond();
|
|
||||||
for (int i = 0; i < ncall; i++)
|
|
||||||
{
|
|
||||||
#ifdef CUDA_PROFILE
|
|
||||||
if (i == 10)
|
|
||||||
cudaProfilerStart();
|
|
||||||
#endif
|
|
||||||
Dw.DhopEO(src_o, r_e, DaggerNo);
|
|
||||||
#ifdef CUDA_PROFILE
|
|
||||||
if (i == 20)
|
|
||||||
cudaProfilerStop();
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
double t1 = usecond();
|
|
||||||
FGrid->Barrier();
|
|
||||||
|
|
||||||
double volume = Ls;
|
|
||||||
for (int mu = 0; mu < Nd; mu++)
|
|
||||||
volume = volume * latt4[mu];
|
|
||||||
double flops = (single_site_flops * volume * ncall) / 2.0;
|
|
||||||
|
|
||||||
json["Deo"]["calls"] = ncall;
|
|
||||||
json["Deo"]["time"] = t1 - t0;
|
|
||||||
json["Deo"]["mflops"] = flops / (t1 - t0);
|
|
||||||
json["Deo"]["mflops_per_rank"] = flops / (t1 - t0) / NP;
|
|
||||||
json["Deo"]["mflops_per_node"] = flops / (t1 - t0) / NN;
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Deo mflop/s = " << flops / (t1 - t0) << std::endl;
|
|
||||||
std::cout << GridLogMessage << "Deo mflop/s per rank " << flops / (t1 - t0) / NP
|
|
||||||
<< std::endl;
|
|
||||||
std::cout << GridLogMessage << "Deo mflop/s per node " << flops / (t1 - t0) / NN
|
|
||||||
<< std::endl;
|
|
||||||
|
|
||||||
Dw.Report();
|
|
||||||
}
|
|
||||||
Dw.DhopEO(src_o, r_e, DaggerNo);
|
|
||||||
Dw.DhopOE(src_e, r_o, DaggerNo);
|
|
||||||
Dw.Dhop(src, result, DaggerNo);
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "r_e" << norm2(r_e) << std::endl;
|
|
||||||
std::cout << GridLogMessage << "r_o" << norm2(r_o) << std::endl;
|
|
||||||
std::cout << GridLogMessage << "res" << norm2(result) << std::endl;
|
|
||||||
|
|
||||||
setCheckerboard(r_eo, r_o);
|
|
||||||
setCheckerboard(r_eo, r_e);
|
|
||||||
|
|
||||||
err = r_eo - result;
|
|
||||||
std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
|
|
||||||
if ((norm2(err) > 1.0e-4))
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
|
|
||||||
std::cout<< "Deo REF\n " <<result << std::endl;
|
|
||||||
std::cout<< "Deo ERR \n " << err <<std::endl;
|
|
||||||
*/
|
|
||||||
}
|
|
||||||
|
|
||||||
pickCheckerboard(Even, src_e, err);
|
|
||||||
pickCheckerboard(Odd, src_o, err);
|
|
||||||
std::cout << GridLogMessage << "norm diff even " << norm2(src_e) << std::endl;
|
|
||||||
std::cout << GridLogMessage << "norm diff odd " << norm2(src_o) << std::endl;
|
|
||||||
|
|
||||||
assert(norm2(src_e) < 1.0e-4);
|
|
||||||
assert(norm2(src_o) < 1.0e-4);
|
|
||||||
|
|
||||||
if (!json_filename.empty())
|
|
||||||
{
|
|
||||||
std::cout << GridLogMessage << "writing benchmark results to " << json_filename
|
|
||||||
<< std::endl;
|
|
||||||
|
|
||||||
int me = 0;
|
|
||||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
|
||||||
if (me == 0)
|
|
||||||
{
|
|
||||||
std::ofstream json_file(json_filename);
|
|
||||||
json_file << std::setw(4) << json;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Grid_finalize();
|
|
||||||
exit(0);
|
|
||||||
}
|
@ -26,6 +26,20 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 #define GRID_MSG_MAXSIZE 1024
 #endif
 
+#define GRID_BIG_SEP \
+"==============================================================================="
+#define GRID_SMALL_SEP "------------------------------------------"
+
+#define grid_big_sep() \
+{ \
+GRID_MSG << GRID_BIG_SEP << std::endl; \
+}
+
+#define grid_small_sep() \
+{ \
+GRID_MSG << GRID_SMALL_SEP << std::endl; \
+}
+
 #define grid_printf(...) \
 { \
 char _buf[GRID_MSG_MAXSIZE]; \
@ -1,12 +1,8 @@
 ACLOCAL_AMFLAGS = -I .buildutils/m4
 
 bin_PROGRAMS = \
-Benchmark_comms_host_device \
-Benchmark_dwf_fp32 \
 Benchmark_Grid \
 Benchmark_IO
 
-Benchmark_comms_host_device_SOURCES = Benchmark_comms_host_device.cpp
-Benchmark_dwf_fp32_SOURCES = Benchmark_dwf_fp32.cpp
 Benchmark_Grid_SOURCES = Benchmark_Grid.cpp
 Benchmark_IO_SOURCES = Benchmark_IO.cpp
@ -6,6 +6,7 @@ The benchmarks can be summarised as follows
 - `Benchmark_Grid`: This benchmark measures floating point performances for various fermion
 matrices, as well as bandwidth measurements for different operations. Measurements are
 performed for a fixed range of problem sizes.
+- `Benchmark_IO`: Parallel I/O benchmark.
 
 ## TL;DR
 Build and install Grid, all dependencies, and the benchmark with
@ -28,7 +29,7 @@ You should first deploy the environment for the specific system you are using, f
 systems/tursa/bootstrap-env.sh ./env
 ```
 will deploy the relevant environment for the [Tursa](https://www.epcc.ed.ac.uk/hpc-services/dirac-tursa-gpu) supercomputer in `./env`. This step might compile from source a large set
-of packages, and might take some time to complete.
+of packages, and take some time to complete.
 
 After that, the environment directory (`./env` in the example above) will contain a `env.sh` file that needs to be sourced to activate the environment
 ```bash
@ -66,4 +67,84 @@ where `<env_dir>` is the environment directory and `<config>` is the build confi
|
|||||||
|
|
||||||
## Running the benchmarks
|
## Running the benchmarks
|
||||||
After building the benchmarks as above you can find the binaries in
|
After building the benchmarks as above you can find the binaries in
|
||||||
`<env_dir>/prefix/gridbench_<config>`.
|
`<env_dir>/prefix/gridbench_<config>`. Depending on the system selected, the environment
|
||||||
|
directory might also contain batch script examples. More information about the benchmarks
|
||||||
|
is provided below.
|
||||||
|
|
||||||
|
### `Benchmark_Grid`
|
||||||
|
This benchmark performs flop/s measurement for typical lattice QCD sparse matrices, as
|
||||||
|
well as memory and inter-process bandwidth measurement using Grid routines. The benchmark
|
||||||
|
command accept any Grid flag (see complete list with `--help`), as well as a
|
||||||
|
`--json-out <file>` flag to save the measurement results in JSON to `<file>`. The
|
||||||
|
benchmarks are performed on a fix set of problem sizes, and the Grid flag `--grid` will
|
||||||
|
be ignored.
|
||||||
|
|
||||||
|
The resulting metrics are as follows, all data size units are in base 2
|
||||||
|
(i.e. 1 kB = 1024 B).
|
||||||
|
|
||||||
|
*Memory bandwidth*
|
||||||
|
|
||||||
|
One sub-benchmark measure the memory bandwidth using a lattice version of the `axpy` BLAS
|
||||||
|
routine, in a similar fashion to the STREAM benchmark. The JSON entries under `"axpy"`
|
||||||
|
have the form
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"GBps": 215.80653375861607, // bandwidth in GB/s/node
|
||||||
|
"GFlops": 19.310041765757834, // FP performance (double precision)
|
||||||
|
"L": 8, // local lattice volume
|
||||||
|
"size_MB": 3.0 // memory size in MB/node
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
A second benchmark performs site-wise SU(4) matrix multiplication, and has a higher
|
||||||
|
arithmetic intensity than the `axpy` one (although it is still memory-bound).
|
||||||
|
The JSON entries under `"SU4"` have the form
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"GBps": 394.76639187026865, // bandwidth in GB/s/node
|
||||||
|
"GFlops": 529.8464820758512, // FP performance (single precision)
|
||||||
|
"L": 8, // local lattice size
|
||||||
|
"size_MB": 6.0 // memory size in MB/node
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
*Inter-process bandwidth*
|
||||||
|
|
||||||
|
This sub-benchmark measures the achieved bidirectional bandwidth in threaded halo exchange
|
||||||
|
using routines in Grid. The exchange is performed in each direction on the MPI Cartesian
|
||||||
|
grid which is parallelised across at least 2 processes. The resulting bandwidth is related
|
||||||
|
to node-local transfers (inter-CPU, NVLink, ...) or network transfers depending on the MPI
|
||||||
|
decomposition. he JSON entries under `"comms"` have the form
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"L": 40, // local lattice size
|
||||||
|
"bytes": 73728000, // payload size in B/rank
|
||||||
|
"dir": 2, // direction of the exchange, 8 possible directions
|
||||||
|
// (0: +x, 1: +y, ..., 5: -x, 6: -y, ...)
|
||||||
|
"rate_GBps": {
|
||||||
|
"error": 6.474271894240327, // standard deviation across measurements (GB/s/node)
|
||||||
|
"max": 183.10546875, // maximum measured bandwidth (GB/s/node)
|
||||||
|
"mean": 175.21747026766676 // average measured bandwidth (GB/s/node)
|
||||||
|
},
|
||||||
|
"time_usec": 3135.055 // average transfer time (microseconds)
|
||||||
|
}
|
||||||
|
```

*Floating-point performance*

This sub-benchmark measures the achieved floating-point performance using the
Wilson fermion, domain-wall fermion, and staggered fermion sparse matrices from Grid.
The best performances are recorded under `"results"` in the `"flops"` section of the
JSON output, e.g.
```json
{
  "Gflops_dwf4": 366.5251173474483,       // domain-wall in Gflop/s/node (single precision)
  "Gflops_staggered": 7.5982861018529455, // staggered in Gflop/s/node (single precision)
  "Gflops_wilson": 15.221839719288932,    // Wilson in Gflop/s/node (single precision)
  "L": 8                                  // local lattice size
}
```
Here "best" means the best across a number of different implementations of the routines.
Please see the log of the benchmark for an additional breakdown. Finally, the JSON output
contains a "comparison point", which is the average of the best L=24 and L=32 domain-wall
performances.
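
As an illustration, the comparison point can be recomputed from the raw entries with `jq`,
assuming the `"flops"` → `"results"` nesting described above and a `result.json` output
file:
```bash
# average of the best L=24 and L=32 domain-wall figures (Gflop/s/node)
jq '[.flops.results[] | select(.L == 24 or .L == 32) | .Gflops_dwf4] | add / length' result.json
```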
@@ -20,8 +20,12 @@ mkdir -p "${build_dir}"
 source "${env_dir}/env.sh"
 entry=$(jq ".configs[]|select(.name==\"${cfg}\")" "${env_dir}"/grid-config.json)
 env_script=$(echo "${entry}" | jq -r ".\"env-script\"")
-cd "${build_dir}" || return
 source "${env_dir}/${env_script}"
+cd "${script_dir}"
+if [ ! -f configure ]; then
+  ./bootstrap.sh
+fi
+cd "${build_dir}"
 if [ ! -f Makefile ]; then
   "${script_dir}/configure" --with-grid="${env_dir}/prefix/grid_${cfg}" \
     --prefix="${env_dir}/prefix/gridbench_${cfg}"

@@ -1,8 +1,8 @@
 #!/usr/bin/env bash
 # shellcheck disable=SC1091

-GRIDENVDIR="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")"
-export GRIDENVDIR
-export PATH="${GRIDENVDIR}/prefix/base/bin:${PATH}"
-export ACLOCAL_PATH="${GRIDENVDIR}/prefix/base/share/aclocal:${ACLOCAL_PATH}"
-source "${GRIDENVDIR}"/spack/share/spack/setup-env.sh
+env_dir="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")"
+mkdir -p ~/.config/lattice-benchmarks
+echo "${env_dir}" > ~/.config/lattice-benchmarks/grid-env
+source "${env_dir}/spack/share/spack/setup-env.sh"
+spack load jq git

0 Grid/systems/tursa/files/cpu-mpi-wrapper.sh (Normal file → Executable file)
5 Grid/systems/tursa/files/gpu-mpi-wrapper.sh (Normal file → Executable file)
@@ -1,13 +1,12 @@
 #!/usr/bin/env bash

 lrank=$OMPI_COMM_WORLD_LOCAL_RANK
-numa1=$(( 2 * lrank))
-numa2=$(( 2 * lrank + 1 ))
+numa1=$((lrank))
 netdev=mlx5_${lrank}:1

 export CUDA_VISIBLE_DEVICES=$OMPI_COMM_WORLD_LOCAL_RANK
 export UCX_NET_DEVICES=${netdev}
-BINDING="--interleave=$numa1,$numa2"
+BINDING="--interleave=$numa1"

 echo "$(hostname) - $lrank device=$CUDA_VISIBLE_DEVICES binding=$BINDING"

17 Grid/systems/tursa/files/ompi-gpu.sh (Normal file)
@@ -0,0 +1,17 @@
#!/usr/bin/env bash

# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=8
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n

# IO environment ###############################################################
export OMPI_MCA_io=romio321
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
60 Grid/systems/tursa/files/run.gpu.16nodes.sh (Normal file)
@@ -0,0 +1,60 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170

#SBATCH -J benchmark-grid-16
#SBATCH -t 1:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --qos=standard
#SBATCH --no-requeue
#SBATCH --gpu-freq=1410

set -euo pipefail

# load environment #############################################################
env_cfg="${HOME}/.config/lattice-benchmarks/grid-env"
if [ ! -f "${env_cfg}" ]; then
  echo "error: ${env_cfg} does not exist, did you execute 'source env.sh' with your user account?"
  exit 1
fi
env_dir="$(readlink -f "$(cat "${env_cfg}")")"
source "${env_dir}/env.sh"      # load base Spack environment
source "${env_dir}/env-gpu.sh"  # load GPU-specific packages
source "${env_dir}/ompi-gpu.sh" # set GPU-specific OpenMPI variables

# application and parameters ###################################################
app="${env_dir}/prefix/gridbench_gpu/bin/Benchmark_Grid"

# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"

date > "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd "${app}" > "${job_info_dir}/ldd"
md5sum "${app}" > "${job_info_dir}/app-hash"
readelf -a "${app}" > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"

# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
  "${env_dir}/gpu-mpi-wrapper.sh" \
  "${app}" \
  --json-out "${job_info_dir}/result.json" \
  --mpi 1.4.4.4 \
  --accelerator-threads 8 \
  --threads 8 \
  --shm 2048 &> "${job_info_dir}/log"

# if we reach this point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"

################################################################################
60 Grid/systems/tursa/files/run.gpu.1nodes.sh (Normal file)
@@ -0,0 +1,60 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170

#SBATCH -J benchmark-grid-1
#SBATCH -t 1:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=4
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --qos=standard
#SBATCH --no-requeue
#SBATCH --gpu-freq=1410

set -euo pipefail

# load environment #############################################################
env_cfg="${HOME}/.config/lattice-benchmarks/grid-env"
if [ ! -f "${env_cfg}" ]; then
  echo "error: ${env_cfg} does not exist, did you execute 'source env.sh' with your user account?"
  exit 1
fi
env_dir="$(readlink -f "$(cat "${env_cfg}")")"
source "${env_dir}/env.sh"      # load base Spack environment
source "${env_dir}/env-gpu.sh"  # load GPU-specific packages
source "${env_dir}/ompi-gpu.sh" # set GPU-specific OpenMPI variables

# application and parameters ###################################################
app="${env_dir}/prefix/gridbench_gpu/bin/Benchmark_Grid"

# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"

date > "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd "${app}" > "${job_info_dir}/ldd"
md5sum "${app}" > "${job_info_dir}/app-hash"
readelf -a "${app}" > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"

# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
  "${env_dir}/gpu-mpi-wrapper.sh" \
  "${app}" \
  --json-out "${job_info_dir}/result.json" \
  --mpi 1.1.1.4 \
  --accelerator-threads 8 \
  --threads 8 \
  --shm 2048 &> "${job_info_dir}/log"

# if we reach this point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"

################################################################################
60 Grid/systems/tursa/files/run.gpu.32nodes.sh (Normal file)
@@ -0,0 +1,60 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170

#SBATCH -J benchmark-grid-32
#SBATCH -t 1:00:00
#SBATCH --nodes=32
#SBATCH --ntasks=128
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --qos=standard
#SBATCH --no-requeue
#SBATCH --gpu-freq=1410

set -euo pipefail

# load environment #############################################################
env_cfg="${HOME}/.config/lattice-benchmarks/grid-env"
if [ ! -f "${env_cfg}" ]; then
  echo "error: ${env_cfg} does not exist, did you execute 'source env.sh' with your user account?"
  exit 1
fi
env_dir="$(readlink -f "$(cat "${env_cfg}")")"
source "${env_dir}/env.sh"      # load base Spack environment
source "${env_dir}/env-gpu.sh"  # load GPU-specific packages
source "${env_dir}/ompi-gpu.sh" # set GPU-specific OpenMPI variables

# application and parameters ###################################################
app="${env_dir}/prefix/gridbench_gpu/bin/Benchmark_Grid"

# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"

date > "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd "${app}" > "${job_info_dir}/ldd"
md5sum "${app}" > "${job_info_dir}/app-hash"
readelf -a "${app}" > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"

# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
  "${env_dir}/gpu-mpi-wrapper.sh" \
  "${app}" \
  --json-out "${job_info_dir}/result.json" \
  --mpi 1.4.4.8 \
  --accelerator-threads 8 \
  --threads 8 \
  --shm 2048 &> "${job_info_dir}/log"

# if we reach this point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"

################################################################################
@@ -4,7 +4,13 @@ set -euo pipefail

 gcc_spec='gcc@9.4.0'
 cuda_spec='cuda@11.4.0'
-hdf5_spec='hdf5@1.10.7'
+# hdf5 and fftw depend on OpenMPI, which we install manually. To make sure this
+# dependency is picked up by spack, we specify the compiler here explicitly. For
+# most other packages we don't really care about the compiler (i.e. system
+# compiler versus ${gcc_spec})
+hdf5_spec="hdf5@1.10.7+cxx+threadsafe%${gcc_spec}"
+fftw_spec="fftw%${gcc_spec}"

 if (( $# != 1 )); then
   echo "usage: $(basename "$0") <env dir>" 1>&2

@@ -18,7 +24,7 @@ cd "${cwd}"

 # General configuration ########################################################
 # build with 128 tasks
 echo 'config:
   build_jobs: 128
   build_stage:
     - $spack/var/spack/stage

@@ -38,26 +44,23 @@ rm external.yaml

 # Base compilers ###############################################################
 # configure system base

+spack env create base
+spack env activate base
 spack compiler find --scope site

-# install GCC, CUDA & LLVM
-spack install ${gcc_spec} ${cuda_spec} llvm
-spack load llvm
+# install GCC, CUDA
+spack add ${gcc_spec} ${cuda_spec}
+spack concretize
+spack env depfile -o Makefile.tmp
+make -j128 -f Makefile.tmp
 spack compiler find --scope site
-spack unload llvm

-spack load ${gcc_spec}
-spack compiler find --scope site
-spack unload ${gcc_spec}

 # Manual compilation of OpenMPI & UCX ##########################################
 # set build directories
 mkdir -p "${dir}"/build
 cd "${dir}"/build

-spack load ${gcc_spec} ${cuda_spec}

 cuda_path=$(spack find --format "{prefix}" cuda)
 gdrcopy_path=/mnt/lustre/tursafs1/apps/gdrcopy/2.3.1

@@ -77,7 +80,7 @@ mkdir -p build_gpu; cd build_gpu
   --enable-devel-headers --enable-examples --enable-optimizations \
   --with-gdrcopy=${gdrcopy_path} --with-verbs --disable-logging \
   --disable-debug --disable-assertions --enable-cma \
-  --with-knem=/opt/knem-1.1.4.90mlnx1/ --with-rdmacm \
+  --with-knem=/opt/knem-1.1.4.90mlnx2/ --with-rdmacm \
   --without-rocm --without-ugni --without-java \
   --enable-compiler-opt=3 --with-cuda="${cuda_path}" --without-cm \
   --with-rc --with-ud --with-dc --with-mlx5-dv --with-dm \

@@ -93,7 +96,7 @@ mkdir -p build_cpu; cd build_cpu
   --enable-devel-headers --enable-examples --enable-optimizations \
   --with-verbs --disable-logging --disable-debug \
   --disable-assertions --enable-mt --enable-cma \
-  --with-knem=/opt/knem-1.1.4.90mlnx1/ --with-rdmacm \
+  --with-knem=/opt/knem-1.1.4.90mlnx2/ --with-rdmacm \
   --without-rocm --without-ugni --without-java \
   --enable-compiler-opt=3 --without-cm --without-ugni --with-rc \
   --with-ud --with-dc --with-mlx5-dv --with-dm --enable-mt --without-go

@@ -119,13 +122,13 @@ mkdir build_gpu; cd build_gpu
 ../configure --prefix="${dir}"/prefix/ompi_gpu --without-xpmem \
   --with-ucx="${dir}"/prefix/ucx_gpu \
   --with-ucx-libdir="${dir}"/prefix/ucx_gpu/lib \
-  --with-knem=/opt/knem-1.1.4.90mlnx1/ \
+  --with-knem=/opt/knem-1.1.4.90mlnx2/ \
   --enable-mca-no-build=btl-uct \
   --with-cuda="${cuda_path}" --disable-getpwuid \
   --with-verbs --with-slurm --enable-mpi-fortran=all \
   --with-pmix=internal --with-libevent=internal
 make -j 128
 make install
 cd ..

 # openmpi cpu build

@@ -133,7 +136,7 @@ mkdir build_cpu; cd build_cpu
 ../configure --prefix="${dir}"/prefix/ompi_cpu --without-xpmem \
   --with-ucx="${dir}"/prefix/ucx_cpu \
   --with-ucx-libdir="${dir}"/prefix/ucx_cpu/lib \
-  --with-knem=/opt/knem-1.1.4.90mlnx1/ \
+  --with-knem=/opt/knem-1.1.4.90mlnx2/ \
   --enable-mca-no-build=btl-uct --disable-getpwuid \
   --with-verbs --with-slurm --enable-mpi-fortran=all \
   --with-pmix=internal --with-libevent=internal

@@ -141,65 +144,65 @@ make -j 128
 make install
 cd "${dir}"

+ucx_spec_gpu="ucx@1.12.0.GPU%${gcc_spec}"
+ucx_spec_cpu="ucx@1.12.0.CPU%${gcc_spec}"
+openmpi_spec_gpu="openmpi@4.1.1.GPU%${gcc_spec}"
+openmpi_spec_cpu="openmpi@4.1.1.CPU%${gcc_spec}"

 # Add externals to spack
 echo "packages:
   ucx:
     externals:
-    - spec: \"ucx@1.12.0.GPU%gcc@9.4.0\"
+    - spec: \"${ucx_spec_gpu}\"
       prefix: ${dir}/prefix/ucx_gpu
-    - spec: \"ucx@1.12.0.CPU%gcc@9.4.0\"
+    - spec: \"${ucx_spec_cpu}\"
       prefix: ${dir}/prefix/ucx_cpu
     buildable: False
   openmpi:
     externals:
-    - spec: \"openmpi@4.1.1.GPU%gcc@9.4.0\"
+    - spec: \"${openmpi_spec_gpu}\"
      prefix: ${dir}/prefix/ompi_gpu
-    - spec: \"openmpi@4.1.1.CPU%gcc@9.4.0\"
+    - spec: \"${openmpi_spec_cpu}\"
      prefix: ${dir}/prefix/ompi_cpu
     buildable: False" > spack.yaml

 spack config --scope site add -f spack.yaml
 rm spack.yaml
-spack install ucx@1.12.0.GPU%gcc@9.4.0 openmpi@4.1.1.GPU%gcc@9.4.0
-spack install ucx@1.12.0.CPU%gcc@9.4.0 openmpi@4.1.1.CPU%gcc@9.4.0
+spack env deactivate

 cd "${cwd}"

 # environments #################################################################
-dev_tools=("autoconf" "automake" "libtool" "jq")
-ompi_gpu_hash=$(spack find --format "{hash}" openmpi@4.1.1.GPU)
-ompi_cpu_hash=$(spack find --format "{hash}" openmpi@4.1.1.CPU)
+dev_tools=("autoconf" "automake" "libtool" "jq" "git")

 spack env create grid-gpu
 spack env activate grid-gpu
-spack add ${gcc_spec} ${cuda_spec} "${dev_tools[@]}"
-spack add ucx@1.12.0.GPU%gcc@9.4.0 openmpi@4.1.1.GPU%gcc@9.4.0
-spack add ${hdf5_spec}+cxx+threadsafe ^/"${ompi_gpu_hash}"
-spack add fftw ^/"${ompi_gpu_hash}"
-spack add openssl gmp mpfr c-lime
-spack install
+spack compiler find --scope site
+spack add ${gcc_spec} ${cuda_spec} ${ucx_spec_gpu} ${openmpi_spec_gpu}
+spack add ${hdf5_spec} ${fftw_spec}
+spack add openssl gmp mpfr c-lime "${dev_tools[@]}"
+spack concretize
+spack env depfile -o Makefile.tmp
+make -j128 -f Makefile.tmp
 spack env deactivate

 spack env create grid-cpu
 spack env activate grid-cpu
-spack add llvm "${dev_tools[@]}"
-spack add ucx@1.12.0.CPU%gcc@9.4.0 openmpi@4.1.1.CPU%gcc@9.4.0
-spack add ${hdf5_spec}+cxx+threadsafe ^/"${ompi_cpu_hash}"
-spack add fftw ^/"${ompi_cpu_hash}"
-spack add openssl gmp mpfr c-lime
-spack install
+spack compiler find --scope site
+spack add ${gcc_spec} ${ucx_spec_cpu} ${openmpi_spec_cpu}
+spack add ${hdf5_spec} ${fftw_spec}
+spack add openssl gmp mpfr c-lime "${dev_tools[@]}"
+spack concretize
+spack env depfile -o Makefile.tmp
+make -j128 -f Makefile.tmp
 spack env deactivate

 # Final setup ##################################################################
 spack clean
-spack gc -y
+#spack gc -y # "spack gc" tends to get hung up for unknown reasons

 # add more environment variables in module loading
 spack config --scope site add 'modules:prefix_inspections:lib:[LD_LIBRARY_PATH,LIBRARY_PATH]'
 spack config --scope site add 'modules:prefix_inspections:lib64:[LD_LIBRARY_PATH,LIBRARY_PATH]'
 spack config --scope site add 'modules:prefix_inspections:include:[C_INCLUDE_PATH,CPLUS_INCLUDE_PATH,INCLUDE]'
 spack module tcl refresh -y

-# permission change for group access
-chmod -R g+rw "${dir}/spack/var/spack/cache"
-setfacl -d -R -m g::rwX "${dir}/spack/var/spack/cache"
14 Quda/.clang-format (Normal file)
@@ -0,0 +1,14 @@
{
  BasedOnStyle: LLVM,
  UseTab: Never,
  IndentWidth: 2,
  TabWidth: 2,
  BreakBeforeBraces: Allman,
  AllowShortIfStatementsOnASingleLine: false,
  IndentCaseLabels: false,
  ColumnLimit: 90,
  AccessModifierOffset: -4,
  NamespaceIndentation: All,
  FixNamespaceComments: false,
  SortIncludes: true,
}
458 Quda/Benchmark_Quda.cpp (Normal file)
@@ -0,0 +1,458 @@
#include <algorithm>
#include <array>
#include <blas_quda.h>
#include <cassert>
#include <chrono>
#include <color_spinor_field.h>
#include <communicator_quda.h>
#include <dirac_quda.h>
#include <fstream>
#include <gauge_tools.h>
#include <iomanip>
#include <memory>
#include <mpi.h>
#include <sstream>
#include <stdio.h>
#include <stdlib.h>

// remove to use QUDA's own flop counting instead of Grid's convention
#define FLOP_COUNTING_GRID

#include "json.hpp"
using nlohmann::json;
json json_results;

using namespace quda;

// thanks chatGPT :)
std::string get_timestamp()
{
  // Get the current time
  auto now = std::chrono::system_clock::now();

  // Convert the current time to a time_t object
  std::time_t currentTime = std::chrono::system_clock::to_time_t(now);

  // Format the time using std::put_time
  std::stringstream ss;
  ss << std::put_time(std::localtime(&currentTime), "%Y%m%d %H:%M:%S");

  return ss.str();
}

// This is the MPI grid, i.e. the layout of ranks
int nranks = -1;
std::array<int, 4> mpi_grid = {1, 1, 1, 1};

// run f() in a loop for roughly target_time seconds
// returns seconds per iteration it took
template <class F> double bench(F const &f, double target_time, int niter_warmup = 5)
{
  device_timer_t timer;
  timer.start();
  for (int iter = 0; iter < niter_warmup; ++iter)
    f();
  timer.stop();

  double secs = timer.last() / niter_warmup;
  int niter = std::max(1, int(target_time / secs));
  // niter = std::min(1000, niter);
  // printfQuda("during warmup took %f s/iter, deciding on %d iters\n", secs, niter);

  // important: each rank has its own timer, so their measurements can slightly vary. But
  // 'niter' needs to be consistent (bug took me a couple hours to track down)
  comm_broadcast_global(&niter, sizeof(niter), 0);

  timer.reset(__FUNCTION__, __FILE__, __LINE__);
  timer.start();
  for (int iter = 0; iter < niter; ++iter)
    f();
  timer.stop();
  return timer.last() / niter;
}

void initComms(int argc, char **argv)
{
  // init MPI communication
  MPI_Init(&argc, &argv);

  MPI_Comm_size(MPI_COMM_WORLD, &nranks);
  assert(1 <= nranks && nranks <= 100000);

  mpi_grid[3] = nranks;

  // this maps coordinates to rank number
  auto lex_rank_from_coords = [](int const *coords, void *)
  {
    int rank = coords[0];
    for (int i = 1; i < 4; i++)
      rank = mpi_grid[i] * rank + coords[i];
    return rank;
  };

  initCommsGridQuda(4, mpi_grid.data(), lex_rank_from_coords, nullptr);

  for (int d = 0; d < 4; d++)
    if (mpi_grid[d] > 1)
      commDimPartitionedSet(d);

  json_results["geometry"]["ranks"] = nranks;
  json_results["geometry"]["mpi"] = mpi_grid;
}

// creates a random gauge field. L = local(!) size
cudaGaugeField make_gauge_field(int L)
{
  GaugeFieldParam param;

  // dimension and type of the lattice object
  param.nDim = 4;
  param.x[0] = L;
  param.x[1] = L;
  param.x[2] = L;
  param.x[3] = L;

  // number of colors. potentially confusingly, QUDA sometimes uses the word "color" for
  // things unrelated to physical color. things like "nColor=32" do pop up in deflation
  // solvers where it (to my understanding) refers to the number of (parallelly processed)
  // deflation vectors.
  param.nColor = 3;

  // boundary conditions (don't really care for a benchmark)
  param.t_boundary = QUDA_PERIODIC_T;

  // for this benchmark we only need "SINGLE" and/or "DOUBLE" precision. But smaller
  // precisions are available in QUDA too
  param.setPrecision(QUDA_SINGLE_PRECISION);

  // no even/odd subset, we want a full lattice
  param.siteSubset = QUDA_FULL_SITE_SUBSET;

  // what kind of 3x3 matrices the field contains. A proper gauge field has SU(3)
  // matrices, but (for example) smeared/thick links could have non-unitary links.
  param.link_type = QUDA_SU3_LINKS;

  // "NULL" does not initialize the field upon creation, "ZERO" would set everything to 0
  param.create = QUDA_NULL_FIELD_CREATE;

  // field should be allocated directly on the accelerator/GPU
  param.location = QUDA_CUDA_FIELD_LOCATION;

  // "reconstruct" here means reconstructing a SU(3) matrix from fewer than 18 real
  // numbers (=3x3 complex numbers). Great feature in production (saving
  // memory/cache/network bandwidth), not used for this benchmark.
  param.reconstruct = QUDA_RECONSTRUCT_NO;

  // "ghostExchange" would often be called "halo exchange" outside of QUDA. This has
  // nothing to do with ghost fields from continuum/perturbative qcd.
  param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;

  // This controls the physical order of elements. "float2" is the default
  param.order = QUDA_FLOAT2_GAUGE_ORDER;

  // this means the field is a LORENTZ vector (which a gauge field must be). Has nothing
  // to do with spin.
  param.geometry = QUDA_VECTOR_GEOMETRY;

  // create the field and fill with random SU(3) matrices
  // std::cout << param << std::endl; // double-check parameters
  auto U = cudaGaugeField(param);
  gaugeGauss(U, /*seed=*/1234, 1.0);
  return U;
}

// create a random source vector (L = local size)
ColorSpinorField make_source(int L, int Ls = 1)
{
  // NOTE: `param.x` directly determines the size of the (local, per rank) memory
  // allocation. Thus for checkerboarding, we have to specify x=(L/2,L,L,L) to get a
  // physical local volume of L^4, thus implicitly choosing a dimension for the
  // checkerboarding (shouldn't really matter of course which one).
  ColorSpinorParam param;
  param.nColor = 3;
  param.nSpin = 4;
  param.nVec = 1; // only a single vector
  param.pad = 0;
  param.siteSubset = QUDA_PARITY_SITE_SUBSET;
  param.nDim = Ls == 1 ? 4 : 5;
  param.x[0] = L / 2;
  param.x[1] = L;
  param.x[2] = L;
  param.x[3] = L;
  param.x[4] = Ls;
  param.pc_type = QUDA_4D_PC;
  param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;

  // somewhat surprisingly, the DiracWilson::Dslash(...) function only works with the
  // UKQCD_GAMMA_BASIS
  param.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;

  param.create = QUDA_NULL_FIELD_CREATE; // do not (zero-) initialize the field
  param.setPrecision(QUDA_SINGLE_PRECISION);
  param.location = QUDA_CUDA_FIELD_LOCATION;

  // create the field and fill it with random values
  auto src = ColorSpinorField(param);
  quda::RNG rng(src, 1234);
  spinorNoise(src, rng, QUDA_NOISE_GAUSS);
  /*printfQuda(
      "created src with norm = %f (sanity check: should be close to %f) and %f bytes\n",
      blas::norm2(src), 2.0 * 12 * geom[0] * geom[1] * geom[2] * geom[3],
      src.Bytes() * 1.0);*/
  // src.PrintDims();

  return src;
}

void benchmark_wilson(std::vector<int> const &L_list, double target_time)
{
  printfQuda("==================== wilson dirac operator ====================\n");
#ifdef FLOP_COUNTING_GRID
  printfQuda("IMPORTANT: flop counting as in Benchmark_Grid\n");
#else
  printfQuda("IMPORTANT: flop counting by QUDA's own convention (different from "
             "Benchmark_Grid)\n");
#endif
  printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank");

  for (int L : L_list)
  {
    // printfQuda("starting wilson L=%d\n", L);

    auto U = make_gauge_field(L);
    auto src = make_source(L);

    // create (Wilson) dirac operator
    DiracParam param;
    param.kappa = 0.10;
    param.dagger = QUDA_DAG_NO;
    param.matpcType = QUDA_MATPC_EVEN_EVEN;
    auto dirac = DiracWilson(param);

    // insert gauge field into the dirac operator
    // (the additional nullptr's are for smeared links and fancy preconditioners and such.
    // Not used for simple Wilson fermions)
    dirac.updateFields(&U, nullptr, nullptr, nullptr);
    auto res = ColorSpinorField(ColorSpinorParam(src));
    auto f = [&]() { dirac.Dslash(res, src, QUDA_EVEN_PARITY); };

    // first run to get the quda tuning out of the way
    dirac.Flops(); // reset flops counter
    f();
    double flops = 1.0 * dirac.Flops();

    // actual benchmarking
    auto start_time = get_timestamp();
    double secs = bench(f, target_time);
    auto end_time = get_timestamp();

#ifdef FLOP_COUNTING_GRID
    // this is the flop counting from Benchmark_Grid
    double Nc = 3;
    double Nd = 4;
    double Ns = 4;
    flops = (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2);
    flops *= L * L * L * L / 2.0;
#endif

    printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9);

    json tmp;
    tmp["L"] = L;
    tmp["Gflops_wilson"] = flops / secs * 1e-9;
    tmp["start_time"] = start_time;
    tmp["end_time"] = end_time;
    json_results["flops"]["results"].push_back(tmp);
  }
}

void benchmark_dwf(std::vector<int> const &L_list, double target_time)
{
  printfQuda("==================== domain wall dirac operator ====================\n");
#ifdef FLOP_COUNTING_GRID
  printfQuda("IMPORTANT: flop counting as in Benchmark_Grid\n");
#else
  printfQuda("IMPORTANT: flop counting by QUDA's own convention (different from "
             "Benchmark_Grid)\n");
#endif
  printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank");
  int Ls = 12;
  for (int L : L_list)
  {
    // printfQuda("starting dwf L=%d\n", L);
    auto U = make_gauge_field(L);
    auto src = make_source(L, Ls);

    // create dirac operator
    DiracParam param;
    param.kappa = 0.10;
    param.Ls = Ls;
    param.m5 = 0.1;
    param.dagger = QUDA_DAG_NO;
    param.matpcType = QUDA_MATPC_EVEN_EVEN;
    auto dirac = DiracDomainWall(param);

    // insert gauge field into the dirac operator
    // (the additional nullptr's are for smeared links and fancy preconditioners and such)
    dirac.updateFields(&U, nullptr, nullptr, nullptr);
    auto res = ColorSpinorField(ColorSpinorParam(src));
    auto f = [&]() { dirac.Dslash(res, src, QUDA_EVEN_PARITY); };

    // first run to get the quda tuning out of the way
    dirac.Flops(); // reset flops counter
    f();
    double flops = 1.0 * dirac.Flops();

    // actual benchmarking
    auto start_time = get_timestamp();
    double secs = bench(f, target_time);
    auto end_time = get_timestamp();

#ifdef FLOP_COUNTING_GRID
    // this is the flop counting from Benchmark_Grid
    double Nc = 3;
    double Nd = 4;
    double Ns = 4;
    flops = (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2);
    flops *= L * L * L * L * Ls / 2.0;
#endif

    printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9);
    json tmp;
    tmp["L"] = L;
    tmp["Gflops_dwf4"] = flops / secs * 1e-9;
    tmp["start_time"] = start_time;
    tmp["end_time"] = end_time;
    json_results["flops"]["results"].push_back(tmp);
  }
}

void benchmark_axpy(std::vector<int> const &L_list, double target_time)
{
  // number of iterations for warmup / measurement
  // (feel free to change for noise/time tradeoff)
  constexpr int niter_warmup = 5;

  printfQuda("==================== axpy / memory ====================\n");

  ColorSpinorParam param;
  param.nDim = 4;   // 4-dimensional lattice
  param.x[4] = 1;   // no fifth dimension
  param.nColor = 3; // supported values for nSpin/nColor are configured when compiling
                    // QUDA. "3*4" will probably always be enabled, so we stick with this
  param.nSpin = 4;
  param.nVec = 1;                            // just a single vector
  param.siteSubset = QUDA_FULL_SITE_SUBSET;  // full lattice = no odd/even
  param.pad = 0;                             // no padding
  param.create = QUDA_NULL_FIELD_CREATE;     // do not (zero-) initialize the field
  param.location = QUDA_CUDA_FIELD_LOCATION; // field should reside on GPU
  param.setPrecision(QUDA_SINGLE_PRECISION);

  // the following don't matter for an axpy benchmark, but we need to choose something
  param.pc_type = QUDA_4D_PC;
  param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
  param.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;

  printfQuda("%5s %15s %15s %15s %15s\n", "L", "size (MiB/rank)", "time (usec)",
             "GiB/s/rank", "Gflop/s/rank");
  for (int L : L_list)
  {
    // printfQuda("starting axpy L=%d\n", L);
    // IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()`
    // are LOCAL, i.e. per rank / per GPU

    param.x[0] = L;
    param.x[1] = L;
    param.x[2] = L;
    param.x[3] = L;

    // number of (real) elements in one (local) field
    size_t field_elements = 2 * param.x[0] * param.x[1] * param.x[2] * param.x[3] *
                            param.nColor * param.nSpin;

    // create the field(s)
    auto fieldA = ColorSpinorField(param);
    auto fieldB = ColorSpinorField(param);
    assert(fieldA.Bytes() == sizeof(float) * field_elements); // sanity check
    assert(fieldB.Bytes() == sizeof(float) * field_elements); // sanity check

    // fill fields with random values
    quda::RNG rng(fieldA, 1234);
    spinorNoise(fieldA, rng, QUDA_NOISE_GAUSS);
    spinorNoise(fieldB, rng, QUDA_NOISE_GAUSS);

    // number of operations / bytes per iteration
    // axpy is one addition, one multiplication, two reads, one write
    double flops = 2 * field_elements;
    double memory = 3 * sizeof(float) * field_elements;

    auto f = [&]() { blas::axpy(1.234, fieldA, fieldB); };

    // first run to get the quda tuning out of the way
    f();

    // actual benchmarking
    auto start_time = get_timestamp();
    double secs = bench(f, target_time);
    auto end_time = get_timestamp();

    double mem_MiB = memory / 1024. / 1024.;
    double GBps = mem_MiB / 1024 / secs;
    printfQuda("%5d %15.2f %15.2f %15.2f %15.2f\n", L, mem_MiB, secs * 1e6, GBps,
               flops / secs * 1e-9);

    json tmp;
    tmp["L"] = L;
    tmp["size_MB"] = mem_MiB;
    tmp["GBps"] = GBps;
    tmp["GFlops"] = flops / secs * 1e-9;
    tmp["start_time"] = start_time;
    tmp["end_time"] = end_time;
    json_results["axpy"].push_back(tmp);
  }
}

int main(int argc, char **argv)
{
  std::string json_filename = ""; // empty indicates no json output
  for (int i = 0; i < argc - 1; i++)
  {
    if (std::string(argv[i]) == "--json-out")
      json_filename = argv[i + 1];
  }

  initComms(argc, argv);

  initQuda(-1); // -1 for multi-gpu. otherwise this selects the device to be used

  // verbosity options are:
  // SILENT, SUMMARIZE, VERBOSE, DEBUG_VERBOSE
  setVerbosity(QUDA_SUMMARIZE);

  printfQuda("MPI layout = %d %d %d %d\n", mpi_grid[0], mpi_grid[1], mpi_grid[2],
             mpi_grid[3]);

  benchmark_axpy({8, 12, 16, 24, 32, 48}, 1.0);

  setVerbosity(QUDA_SILENT);
  benchmark_wilson({8, 12, 16, 24, 32, 48}, 1.0);
  benchmark_dwf({8, 12, 16, 24, 32}, 1.0);
  setVerbosity(QUDA_SUMMARIZE);

  printfQuda("==================== done with all benchmarks ====================\n");

  if (!json_filename.empty())
  {
    printfQuda("writing benchmark results to %s\n", json_filename.c_str());

    int me = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    if (me == 0)
    {
      std::ofstream json_file(json_filename);
      json_file << std::setw(2) << json_results;
    }
  }

  endQuda();
  quda::comm_finalize();
  MPI_Finalize();
}
30 Quda/Readme.md (Normal file)
@@ -0,0 +1,30 @@
# QUDA benchmarks

This folder contains benchmarks for the [QUDA](https://github.com/lattice/quda) library.

- `Benchmark_Quda`: this benchmark measures the floating-point performance of fermion
  matrices (Wilson and DWF), as well as memory bandwidth (using a simple `axpy` operation).
  Measurements are performed for a fixed range of problem sizes.

## Building
After setting up your compilation environment (Tursa: `source /home/dp207/dp207/shared/env/production/env-{base,gpu}.sh`):
```bash
./build-quda.sh <env_dir>      # build QUDA
./build-benchmark.sh <env_dir> # build the benchmark
```
where `<env_dir>` is an arbitrary directory where every product will be stored.

## Running the Benchmark

The benchmark should be run as
```bash
mpirun -np <ranks> <env_dir>/prefix/qudabench/Benchmark_Quda
```
where `<ranks>` is the total number of GPUs to use. On Tursa this is 4 times the number of nodes.

Notes:
- on Tursa, the `wrapper.sh` script that is typically used with Grid is not necessary.
- due to QUDA's automatic tuning, the benchmark might take significantly longer to run than `Benchmark_Grid` (even though it does fewer things).
- setting `QUDA_ENABLE_TUNING=0` disables all tuning (this degrades performance severely). By default, tuning is turned on.
- setting `QUDA_RESOURCE_PATH=<some folder>` enables QUDA to save and reuse optimal tuning parameters, making repeated runs much faster.
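
For example, a run that caches tuning parameters between invocations could look like the
sketch below (the `tuning` directory name is just an example; any writable folder works,
and 4 ranks corresponds to one Tursa node):
```bash
# reuse QUDA tuning parameters across runs
mkdir -p tuning
export QUDA_RESOURCE_PATH=$(pwd)/tuning
mpirun -np 4 "<env_dir>/prefix/qudabench/Benchmark_Quda"
```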
32 Quda/build-benchmark.sh (Executable file)
@@ -0,0 +1,32 @@
#!/usr/bin/env bash
# shellcheck disable=SC1090,SC1091

set -euo pipefail

if (( $# != 1 )); then
  echo "usage: $(basename "$0") <environment directory>" 1>&2
  exit 1
fi
env_dir=$1

# TODO: this is Tursa-specific; have not figured out the correct way to do this.
EXTRA_LIBS="/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/stubs/libcuda.so /home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/stubs/libnvidia-ml.so"

# NOTE: these flags need to be in sync with QUDA's compilation options (see build-quda.sh)
BUILD_FLAGS="-O3 -std=c++17 -DMPI_COMMS -DMULTI_GPU -DQUDA_PRECISION=12 -DQUDA_RECONSTRUCT=4"

call_dir=$(pwd -P)
script_dir="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")"
cd "${env_dir}"
env_dir=$(pwd -P)
cd "${call_dir}"
BUILD_DIR="${env_dir}/build/Quda-benchmarks"
PREFIX_DIR="${env_dir}/prefix/qudabench"
QUDA_DIR=${env_dir}/prefix/quda
mkdir -p "${BUILD_DIR}"
mkdir -p "${PREFIX_DIR}"

LINK_FLAGS="-Wl,-rpath,$QUDA_DIR/lib: $QUDA_DIR/lib/libquda.so $EXTRA_LIBS -lpthread -lmpi"

g++ $BUILD_FLAGS -I$QUDA_DIR/include/targets/cuda -I$QUDA_DIR/include -c -o $BUILD_DIR/Benchmark_Quda.o $script_dir/Benchmark_Quda.cpp
g++ -g -O3 $BUILD_DIR/Benchmark_Quda.o -o $PREFIX_DIR/Benchmark_Quda $LINK_FLAGS -lmpi
36 Quda/build-quda.sh (Executable file)
@@ -0,0 +1,36 @@
#!/usr/bin/env bash
# shellcheck disable=SC1090,SC1091

BUILD_FLAGS="-O3 -std=c++17"
QUDA_FLAGS="-DQUDA_MPI=ON -DQUDA_PRECISION=14 -DQUDA_RECONSTRUCT=4 -DQUDA_GPU_ARCH=sm_80"

set -euo pipefail

if (( $# != 1 )); then
  echo "usage: $(basename "$0") <environment directory>" 1>&2
  exit 1
fi
env_dir=$1

call_dir=$(pwd -P)
mkdir -p "${env_dir}"
cd "${env_dir}"
env_dir=$(pwd -P)
cd "${call_dir}"

build_dir="${env_dir}/build/quda"
if [ -d "${build_dir}" ]; then
  echo "error: directory '${build_dir}' exists"
  exit 1
fi
mkdir -p "${build_dir}"

git clone https://github.com/lattice/quda.git "${build_dir}"
cd "${build_dir}"

mkdir build; cd build
cmake .. $QUDA_FLAGS -DCMAKE_INSTALL_PREFIX="${env_dir}/prefix/quda"
make -j128
make install

cd "${call_dir}"
21 Quda/env.sh (Normal file)
@@ -0,0 +1,21 @@
module load gcc/9.3.0
module load cuda/11.4.1
module load openmpi/4.1.1-cuda11.4

export QUDA_RESOURCE_PATH=$(pwd)/tuning
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx # by Fabian. no idea what this is
#export UCX_TLS=rc,rc_x,sm,cuda_copy,cuda_ipc,gdr_copy
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_THRESH=16384
export UCX_RNDV_SCHEME=put_zcopy
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n

export OMPI_MCA_io=romio321
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3

export QUDA_REORDER_LOCATION=GPU # this is the default anyway