Compare commits
1 Commit
| Author | SHA1 | Date |
|---|---|---|
| | 2efbad5f70 | |
File diff suppressed because it is too large
@ -32,7 +32,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
#ifdef HAVE_LIME
|
||||
using namespace Grid;
|
||||
|
||||
std::string filestem(const int l) { return "io/iobench_l" + std::to_string(l); }
|
||||
std::string filestem(const int l) { return "iobench_l" + std::to_string(l); }
|
||||
|
||||
int vol(const int i) { return BENCH_IO_LMIN + 2 * i; }
|
||||
|
||||
@ -56,6 +56,13 @@ template <typename Mat> void stats(Mat &mean, Mat &stdDev, const std::vector<Mat
|
||||
mean /= n;
|
||||
}
|
||||
|
||||
#define grid_printf(...) \
|
||||
{ \
|
||||
char _buf[1024]; \
|
||||
sprintf(_buf, __VA_ARGS__); \
|
||||
MSG << _buf; \
|
||||
}
|
||||
|
||||
enum
|
||||
{
|
||||
sRead = 0,
|
||||
@ -76,58 +83,58 @@ int main(int argc, char **argv)
|
||||
std::vector<Eigen::VectorXd> avPerf(BENCH_IO_NPASS, Eigen::VectorXd::Zero(4));
|
||||
std::vector<int> latt;
|
||||
|
||||
GRID_MSG << "Grid is setup to use " << threads << " threads" << std::endl;
|
||||
GRID_MSG << "MPI partition " << mpi << std::endl;
|
||||
MSG << "Grid is setup to use " << threads << " threads" << std::endl;
|
||||
MSG << "MPI partition " << mpi << std::endl;
|
||||
for (unsigned int i = 0; i < BENCH_IO_NPASS; ++i)
|
||||
{
|
||||
grid_big_sep();
|
||||
GRID_MSG << "Pass " << i + 1 << "/" << BENCH_IO_NPASS << std::endl;
|
||||
grid_big_sep();
|
||||
grid_small_sep();
|
||||
GRID_MSG << "Benchmark std write" << std::endl;
|
||||
grid_small_sep();
|
||||
MSG << BIGSEP << std::endl;
|
||||
MSG << "Pass " << i + 1 << "/" << BENCH_IO_NPASS << std::endl;
|
||||
MSG << BIGSEP << std::endl;
|
||||
MSG << SEP << std::endl;
|
||||
MSG << "Benchmark std write" << std::endl;
|
||||
MSG << SEP << std::endl;
|
||||
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
|
||||
{
|
||||
latt = {l * mpi[0], l * mpi[1], l * mpi[2], l * mpi[3]};
|
||||
|
||||
GRID_MSG << "-- Local volume " << l << "^4" << std::endl;
|
||||
MSG << "-- Local volume " << l << "^4" << std::endl;
|
||||
writeBenchmark<LatticeFermion>(latt, filestem(l), stdWrite<LatticeFermion>);
|
||||
perf[i](volInd(l), sWrite) = BinaryIO::lastPerf.mbytesPerSecond;
|
||||
}
|
||||
|
||||
grid_small_sep();
|
||||
GRID_MSG << "Benchmark std read" << std::endl;
|
||||
grid_small_sep();
|
||||
MSG << SEP << std::endl;
|
||||
MSG << "Benchmark std read" << std::endl;
|
||||
MSG << SEP << std::endl;
|
||||
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
|
||||
{
|
||||
latt = {l * mpi[0], l * mpi[1], l * mpi[2], l * mpi[3]};
|
||||
|
||||
GRID_MSG << "-- Local volume " << l << "^4" << std::endl;
|
||||
MSG << "-- Local volume " << l << "^4" << std::endl;
|
||||
readBenchmark<LatticeFermion>(latt, filestem(l), stdRead<LatticeFermion>);
|
||||
perf[i](volInd(l), sRead) = BinaryIO::lastPerf.mbytesPerSecond;
|
||||
}
|
||||
|
||||
#ifdef HAVE_LIME
|
||||
grid_small_sep();
|
||||
GRID_MSG << "Benchmark Grid C-Lime write" << std::endl;
|
||||
grid_small_sep();
|
||||
MSG << SEP << std::endl;
|
||||
MSG << "Benchmark Grid C-Lime write" << std::endl;
|
||||
MSG << SEP << std::endl;
|
||||
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
|
||||
{
|
||||
latt = {l * mpi[0], l * mpi[1], l * mpi[2], l * mpi[3]};
|
||||
|
||||
GRID_MSG << "-- Local volume " << l << "^4" << std::endl;
|
||||
MSG << "-- Local volume " << l << "^4" << std::endl;
|
||||
writeBenchmark<LatticeFermion>(latt, filestem(l), limeWrite<LatticeFermion>);
|
||||
perf[i](volInd(l), gWrite) = BinaryIO::lastPerf.mbytesPerSecond;
|
||||
}
|
||||
|
||||
grid_small_sep();
|
||||
GRID_MSG << "Benchmark Grid C-Lime read" << std::endl;
|
||||
grid_small_sep();
|
||||
MSG << SEP << std::endl;
|
||||
MSG << "Benchmark Grid C-Lime read" << std::endl;
|
||||
MSG << SEP << std::endl;
|
||||
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
|
||||
{
|
||||
latt = {l * mpi[0], l * mpi[1], l * mpi[2], l * mpi[3]};
|
||||
|
||||
GRID_MSG << "-- Local volume " << l << "^4" << std::endl;
|
||||
MSG << "-- Local volume " << l << "^4" << std::endl;
|
||||
readBenchmark<LatticeFermion>(latt, filestem(l), limeRead<LatticeFermion>);
|
||||
perf[i](volInd(l), gRead) = BinaryIO::lastPerf.mbytesPerSecond;
|
||||
}
|
||||
@ -152,13 +159,13 @@ int main(int argc, char **argv)
|
||||
avRob.fill(100.);
|
||||
avRob -= 100. * avStdDev.cwiseQuotient(avMean.cwiseAbs());
|
||||
|
||||
grid_big_sep();
|
||||
GRID_MSG << "SUMMARY" << std::endl;
|
||||
grid_big_sep();
|
||||
GRID_MSG << "Summary of individual results (all results in MB/s)." << std::endl;
|
||||
GRID_MSG << "Every second colum gives the standard deviation of the previous column."
|
||||
<< std::endl;
|
||||
GRID_MSG << std::endl;
|
||||
MSG << BIGSEP << std::endl;
|
||||
MSG << "SUMMARY" << std::endl;
|
||||
MSG << BIGSEP << std::endl;
|
||||
MSG << "Summary of individual results (all results in MB/s)." << std::endl;
|
||||
MSG << "Every second colum gives the standard deviation of the previous column."
|
||||
<< std::endl;
|
||||
MSG << std::endl;
|
||||
grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n", "L", "std read", "std dev",
|
||||
"std write", "std dev", "Grid read", "std dev", "Grid write", "std dev");
|
||||
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
|
||||
@ -169,10 +176,10 @@ int main(int argc, char **argv)
|
||||
stdDev(volInd(l), gRead), mean(volInd(l), gWrite),
|
||||
stdDev(volInd(l), gWrite));
|
||||
}
|
||||
GRID_MSG << std::endl;
|
||||
GRID_MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)"
|
||||
<< std::endl;
|
||||
GRID_MSG << std::endl;
|
||||
MSG << std::endl;
|
||||
MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)"
|
||||
<< std::endl;
|
||||
MSG << std::endl;
|
||||
grid_printf("%4s %12s %12s %12s %12s\n", "L", "std read", "std write", "Grid read",
|
||||
"Grid write");
|
||||
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
|
||||
@ -180,21 +187,21 @@ int main(int argc, char **argv)
|
||||
grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n", l, rob(volInd(l), sRead),
|
||||
rob(volInd(l), sWrite), rob(volInd(l), gRead), rob(volInd(l), gWrite));
|
||||
}
|
||||
GRID_MSG << std::endl;
|
||||
GRID_MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX
|
||||
<< "^4 (all results in MB/s)." << std::endl;
|
||||
GRID_MSG << "Every second colum gives the standard deviation of the previous column."
|
||||
<< std::endl;
|
||||
GRID_MSG << std::endl;
|
||||
MSG << std::endl;
|
||||
MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX
|
||||
<< "^4 (all results in MB/s)." << std::endl;
|
||||
MSG << "Every second colum gives the standard deviation of the previous column."
|
||||
<< std::endl;
|
||||
MSG << std::endl;
|
||||
grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n", "std read", "std dev",
|
||||
"std write", "std dev", "Grid read", "std dev", "Grid write", "std dev");
|
||||
grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", avMean(sRead),
|
||||
avStdDev(sRead), avMean(sWrite), avStdDev(sWrite), avMean(gRead),
|
||||
avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
|
||||
GRID_MSG << std::endl;
|
||||
GRID_MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)"
|
||||
<< std::endl;
|
||||
GRID_MSG << std::endl;
|
||||
MSG << std::endl;
|
||||
MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)"
|
||||
<< std::endl;
|
||||
MSG << std::endl;
|
||||
grid_printf("%12s %12s %12s %12s\n", "std read", "std write", "Grid read",
|
||||
"Grid write");
|
||||
grid_printf("%12.1f %12.1f %12.1f %12.1f\n", avRob(sRead), avRob(sWrite), avRob(gRead),
|
||||
|
@ -18,8 +18,12 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
#ifndef Benchmark_IO_hpp_
|
||||
#define Benchmark_IO_hpp_
|
||||
|
||||
#include "Common.hpp"
|
||||
#include <Grid/Grid.h>
|
||||
#define MSG std::cout << GridLogMessage
|
||||
#define SEP \
|
||||
"-----------------------------------------------------------------------------"
|
||||
#define BIGSEP \
|
||||
"============================================================================="
|
||||
#ifdef HAVE_LIME
|
||||
|
||||
namespace Grid
|
||||
@ -46,9 +50,9 @@ namespace Grid
|
||||
// crc = GridChecksum::crc32(vec_v.cpu_ptr, size);
|
||||
// std::fwrite(&crc, sizeof(uint32_t), 1, file);
|
||||
// crcWatch.Stop();
|
||||
// GRID_MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec <<
|
||||
// std::endl; ioWatch.Start(); std::fwrite(vec_v.cpu_ptr, sizeof(typename
|
||||
// Field::scalar_object),
|
||||
// MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
|
||||
// ioWatch.Start();
|
||||
// std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object),
|
||||
// vec.Grid()->lSites(), file);
|
||||
// ioWatch.Stop();
|
||||
// std::fclose(file);
|
||||
@ -57,11 +61,11 @@ namespace Grid
|
||||
// p.size = size;
|
||||
// p.time = ioWatch.useconds();
|
||||
// p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
|
||||
// GRID_MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
|
||||
// MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
|
||||
// << ",
|
||||
// "
|
||||
// << p.mbytesPerSecond << " MB/s" << std::endl;
|
||||
// GRID_MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
|
||||
// MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
|
||||
// }
|
||||
|
||||
// template <typename Field> void stdRead(Field &vec, const std::string filestem)
|
||||
@ -90,14 +94,16 @@ namespace Grid
|
||||
// crcData = GridChecksum::crc32(vec_v.cpu_ptr, size);
|
||||
// crcWatch.Stop();
|
||||
// }
|
||||
// GRID_MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec <<
|
||||
// std::endl; assert(crcData == crcRead); size *= vec.Grid()->ProcessorCount(); auto
|
||||
// &p = BinaryIO::lastPerf; p.size = size; p.time = ioWatch.useconds();
|
||||
// MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
|
||||
// assert(crcData == crcRead);
|
||||
// size *= vec.Grid()->ProcessorCount();
|
||||
// auto &p = BinaryIO::lastPerf;
|
||||
// p.size = size;
|
||||
// p.time = ioWatch.useconds();
|
||||
// p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
|
||||
// GRID_MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() <<
|
||||
// ", "
|
||||
// MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() << ", "
|
||||
// << p.mbytesPerSecond << " MB/s" << std::endl;
|
||||
// GRID_MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
|
||||
// MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
|
||||
// }
|
||||
|
||||
template <typename Field> void stdWrite(const std::string filestem, Field &vec)
|
||||
@ -116,7 +122,7 @@ namespace Grid
|
||||
crc = GridChecksum::crc32(vec_v.cpu_ptr, size);
|
||||
file.write(reinterpret_cast<char *>(&crc), sizeof(uint32_t) / sizeof(char));
|
||||
crcWatch.Stop();
|
||||
GRID_MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
|
||||
MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
|
||||
ioWatch.Start();
|
||||
file.write(reinterpret_cast<char *>(vec_v.cpu_ptr), sizec);
|
||||
file.flush();
|
||||
@ -126,9 +132,9 @@ namespace Grid
|
||||
p.size = size;
|
||||
p.time = ioWatch.useconds();
|
||||
p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
|
||||
GRID_MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
|
||||
<< ", " << p.mbytesPerSecond << " MB/s" << std::endl;
|
||||
GRID_MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
|
||||
MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() << ", "
|
||||
<< p.mbytesPerSecond << " MB/s" << std::endl;
|
||||
MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
|
||||
}
|
||||
|
||||
template <typename Field> void stdRead(Field &vec, const std::string filestem)
|
||||
@ -157,17 +163,16 @@ namespace Grid
|
||||
crcData = GridChecksum::crc32(vec_v.cpu_ptr, size);
|
||||
crcWatch.Stop();
|
||||
}
|
||||
GRID_MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec
|
||||
<< std::endl;
|
||||
MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
|
||||
assert(crcData == crcRead);
|
||||
size *= vec.Grid()->ProcessorCount();
|
||||
auto &p = BinaryIO::lastPerf;
|
||||
p.size = size;
|
||||
p.time = ioWatch.useconds();
|
||||
p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
|
||||
GRID_MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed()
|
||||
<< ", " << p.mbytesPerSecond << " MB/s" << std::endl;
|
||||
GRID_MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
|
||||
MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() << ", "
|
||||
<< p.mbytesPerSecond << " MB/s" << std::endl;
|
||||
MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
|
||||
}
|
||||
|
||||
template <typename Field> void limeWrite(const std::string filestem, Field &vec)
|
||||
|
265
Grid/Benchmark_comms_host_device.cpp
Normal file
@ -0,0 +1,265 @@
|
||||
/*
|
||||
Copyright © 2015 Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Copyright © 2022 Antonin Portelli <antonin.portelli@me.com>
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU General Public License
|
||||
as published by the Free Software Foundation; either version 2
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <Grid/Grid.h>
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
|
||||
struct time_statistics
|
||||
{
|
||||
double mean;
|
||||
double err;
|
||||
double min;
|
||||
double max;
|
||||
|
||||
void statistics(std::vector<double> v)
|
||||
{
|
||||
double sum = std::accumulate(v.begin(), v.end(), 0.0);
|
||||
mean = sum / v.size();
|
||||
|
||||
std::vector<double> diff(v.size());
|
||||
std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
|
||||
double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
|
||||
err = std::sqrt(sq_sum / (v.size() * (v.size() - 1)));
|
||||
|
||||
auto result = std::minmax_element(v.begin(), v.end());
|
||||
min = *result.first;
|
||||
max = *result.second;
|
||||
}
|
||||
};
|
||||
|
||||
void header()
|
||||
{
|
||||
std::cout << GridLogMessage << " L "
|
||||
<< "\t"
|
||||
<< " Ls "
|
||||
<< "\t" << std::setw(11) << "bytes\t\t"
|
||||
<< "MB/s uni"
|
||||
<< "\t"
|
||||
<< "MB/s bidi" << std::endl;
|
||||
};
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
Grid_init(&argc, &argv);
|
||||
|
||||
Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
|
||||
Coordinate mpi_layout = GridDefaultMpi();
|
||||
int threads = GridThread::GetThreads();
|
||||
std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads"
|
||||
<< std::endl;
|
||||
|
||||
int Nloop = 250;
|
||||
int nmu = 0;
|
||||
int maxlat = 32;
|
||||
for (int mu = 0; mu < Nd; mu++)
|
||||
if (mpi_layout[mu] > 1)
|
||||
nmu++;
|
||||
|
||||
std::cout << GridLogMessage << "Number of iterations to average: " << Nloop
|
||||
<< std::endl;
|
||||
std::vector<double> t_time(Nloop);
|
||||
// time_statistics timestat;
|
||||
|
||||
std::cout << GridLogMessage
|
||||
<< "========================================================================="
|
||||
"==========================="
|
||||
<< std::endl;
|
||||
std::cout << GridLogMessage
|
||||
<< "= Benchmarking sequential halo exchange from host memory " << std::endl;
|
||||
std::cout << GridLogMessage
|
||||
<< "========================================================================="
|
||||
"==========================="
|
||||
<< std::endl;
|
||||
header();
|
||||
|
||||
for (int lat = 8; lat <= maxlat; lat += 4)
|
||||
{
|
||||
for (int Ls = 8; Ls <= 8; Ls *= 2)
|
||||
{
|
||||
|
||||
Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
|
||||
lat * mpi_layout[3]});
|
||||
|
||||
GridCartesian Grid(latt_size, simd_layout, mpi_layout);
|
||||
RealD Nrank = Grid._Nprocessors;
|
||||
RealD Nnode = Grid.NodeCount();
|
||||
RealD ppn = Nrank / Nnode;
|
||||
|
||||
std::vector<std::vector<HalfSpinColourVectorD>> xbuf(8);
|
||||
std::vector<std::vector<HalfSpinColourVectorD>> rbuf(8);
|
||||
|
||||
for (int mu = 0; mu < 8; mu++)
|
||||
{
|
||||
xbuf[mu].resize(lat * lat * lat * Ls);
|
||||
rbuf[mu].resize(lat * lat * lat * Ls);
|
||||
}
|
||||
uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD);
|
||||
|
||||
int ncomm;
|
||||
|
||||
for (int mu = 0; mu < 4; mu++)
|
||||
{
|
||||
if (mpi_layout[mu] > 1)
|
||||
{
|
||||
double start = usecond();
|
||||
for (int i = 0; i < Nloop; i++)
|
||||
{
|
||||
|
||||
ncomm = 0;
|
||||
|
||||
ncomm++;
|
||||
int comm_proc = 1;
|
||||
int xmit_to_rank;
|
||||
int recv_from_rank;
|
||||
|
||||
{
|
||||
std::vector<CommsRequest_t> requests;
|
||||
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
|
||||
Grid.SendToRecvFrom((void *)&xbuf[mu][0], xmit_to_rank,
|
||||
(void *)&rbuf[mu][0], recv_from_rank, bytes);
|
||||
}
|
||||
|
||||
comm_proc = mpi_layout[mu] - 1;
|
||||
{
|
||||
std::vector<CommsRequest_t> requests;
|
||||
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
|
||||
Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0], xmit_to_rank,
|
||||
(void *)&rbuf[mu + 4][0], recv_from_rank, bytes);
|
||||
}
|
||||
}
|
||||
Grid.Barrier();
|
||||
double stop = usecond();
|
||||
double mean = (stop - start) / Nloop;
|
||||
double dbytes = bytes * ppn;
|
||||
double xbytes = dbytes * 2.0 * ncomm;
|
||||
double rbytes = xbytes;
|
||||
double bidibytes = xbytes + rbytes;
|
||||
|
||||
std::cout << GridLogMessage << std::setw(4) << lat << "\t" << Ls << "\t"
|
||||
<< std::setw(11) << bytes << std::fixed << std::setprecision(1)
|
||||
<< std::setw(7) << " " << std::right << xbytes / mean << " "
|
||||
<< "\t\t" << std::setw(7) << bidibytes / mean << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << GridLogMessage
|
||||
<< "========================================================================="
|
||||
"==========================="
|
||||
<< std::endl;
|
||||
std::cout << GridLogMessage
|
||||
<< "= Benchmarking sequential halo exchange from GPU memory " << std::endl;
|
||||
std::cout << GridLogMessage
|
||||
<< "========================================================================="
|
||||
"==========================="
|
||||
<< std::endl;
|
||||
header();
|
||||
|
||||
for (int lat = 8; lat <= maxlat; lat += 4)
|
||||
{
|
||||
for (int Ls = 8; Ls <= 8; Ls *= 2)
|
||||
{
|
||||
|
||||
Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
|
||||
lat * mpi_layout[3]});
|
||||
|
||||
GridCartesian Grid(latt_size, simd_layout, mpi_layout);
|
||||
RealD Nrank = Grid._Nprocessors;
|
||||
RealD Nnode = Grid.NodeCount();
|
||||
RealD ppn = Nrank / Nnode;
|
||||
|
||||
std::vector<HalfSpinColourVectorD *> xbuf(8);
|
||||
std::vector<HalfSpinColourVectorD *> rbuf(8);
|
||||
|
||||
uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD);
|
||||
for (int d = 0; d < 8; d++)
|
||||
{
|
||||
xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
|
||||
rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
|
||||
}
|
||||
|
||||
int ncomm;
|
||||
|
||||
for (int mu = 0; mu < 4; mu++)
|
||||
{
|
||||
if (mpi_layout[mu] > 1)
|
||||
{
|
||||
double start = usecond();
|
||||
for (int i = 0; i < Nloop; i++)
|
||||
{
|
||||
|
||||
ncomm = 0;
|
||||
|
||||
ncomm++;
|
||||
int comm_proc = 1;
|
||||
int xmit_to_rank;
|
||||
int recv_from_rank;
|
||||
|
||||
{
|
||||
std::vector<CommsRequest_t> requests;
|
||||
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
|
||||
Grid.SendToRecvFrom((void *)&xbuf[mu][0], xmit_to_rank,
|
||||
(void *)&rbuf[mu][0], recv_from_rank, bytes);
|
||||
}
|
||||
|
||||
comm_proc = mpi_layout[mu] - 1;
|
||||
{
|
||||
std::vector<CommsRequest_t> requests;
|
||||
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
|
||||
Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0], xmit_to_rank,
|
||||
(void *)&rbuf[mu + 4][0], recv_from_rank, bytes);
|
||||
}
|
||||
}
|
||||
Grid.Barrier();
|
||||
double stop = usecond();
|
||||
double mean = (stop - start) / Nloop;
|
||||
double dbytes = bytes * ppn;
|
||||
double xbytes = dbytes * 2.0 * ncomm;
|
||||
double rbytes = xbytes;
|
||||
double bidibytes = xbytes + rbytes;
|
||||
|
||||
std::cout << GridLogMessage << std::setw(4) << lat << "\t" << Ls << "\t"
|
||||
<< std::setw(11) << bytes << std::fixed << std::setprecision(1)
|
||||
<< std::setw(7) << " " << std::right << xbytes / mean << " "
|
||||
<< "\t\t" << std::setw(7) << bidibytes / mean << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
for (int d = 0; d < 8; d++)
|
||||
{
|
||||
acceleratorFreeDevice(xbuf[d]);
|
||||
acceleratorFreeDevice(rbuf[d]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << GridLogMessage
|
||||
<< "========================================================================="
|
||||
"==========================="
|
||||
<< std::endl;
|
||||
std::cout << GridLogMessage << "= All done; Bye Bye" << std::endl;
|
||||
std::cout << GridLogMessage
|
||||
<< "========================================================================="
|
||||
"==========================="
|
||||
<< std::endl;
|
||||
|
||||
Grid_finalize();
|
||||
}
|
512
Grid/Benchmark_dwf_fp32.cpp
Normal file
@ -0,0 +1,512 @@
|
||||
/*
|
||||
Copyright © 2015 Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Copyright © 2022 Antonin Portelli <antonin.portelli@me.com>
|
||||
Copyright © 2023 Simon Bürger <simon.buerger@rwth-aachen.de>
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU General Public License
|
||||
as published by the Free Software Foundation; either version 2
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "json.hpp"
|
||||
#include <Grid/Grid.h>
|
||||
#ifdef GRID_CUDA
|
||||
#define CUDA_PROFILE
|
||||
#endif
|
||||
|
||||
#ifdef CUDA_PROFILE
|
||||
#include <cuda_profiler_api.h>
|
||||
#endif
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
|
||||
template <class d> struct scal
|
||||
{
|
||||
d internal;
|
||||
};
|
||||
|
||||
Gamma::Algebra Gmu[] = {Gamma::Algebra::GammaX, Gamma::Algebra::GammaY,
|
||||
Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT};
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
Grid_init(&argc, &argv);
|
||||
|
||||
int threads = GridThread::GetThreads();
|
||||
|
||||
Coordinate latt4 = GridDefaultLatt();
|
||||
int Ls = 16;
|
||||
std::string json_filename = ""; // empty indicates no json output
|
||||
nlohmann::json json;
|
||||
|
||||
// benchmark specific command line arguments
|
||||
for (int i = 0; i < argc; i++)
|
||||
{
|
||||
if (std::string(argv[i]) == "-Ls")
|
||||
{
|
||||
std::stringstream ss(argv[i + 1]);
|
||||
ss >> Ls;
|
||||
}
|
||||
if (std::string(argv[i]) == "--json-out")
|
||||
json_filename = argv[i + 1];
|
||||
}
|
||||
|
||||
GridLogLayout();
|
||||
|
||||
long unsigned int single_site_flops = 8 * Nc * (7 + 16 * Nc);
|
||||
|
||||
json["single_site_flops"] = single_site_flops;
|
||||
|
||||
GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
|
||||
GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
|
||||
GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||
|
||||
GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
|
||||
GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
|
||||
|
||||
json["grid"] = FGrid->FullDimensions().toVector();
|
||||
json["local_grid"] = FGrid->LocalDimensions().toVector();
|
||||
|
||||
std::cout << GridLogMessage << "Making s innermost grids" << std::endl;
|
||||
GridCartesian *sUGrid =
|
||||
SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(), GridDefaultMpi());
|
||||
|
||||
GridRedBlackCartesian *sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
|
||||
GridCartesian *sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls, UGrid);
|
||||
GridRedBlackCartesian *sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls, UGrid);
|
||||
|
||||
std::vector<int> seeds4({1, 2, 3, 4});
|
||||
std::vector<int> seeds5({5, 6, 7, 8});
|
||||
|
||||
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
|
||||
GridParallelRNG RNG4(UGrid);
|
||||
RNG4.SeedUniqueString(std::string("The 4D RNG"));
|
||||
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
|
||||
GridParallelRNG RNG5(FGrid);
|
||||
RNG5.SeedUniqueString(std::string("The 5D RNG"));
|
||||
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
|
||||
|
||||
LatticeFermionF src(FGrid);
|
||||
random(RNG5, src);
|
||||
#if 0
|
||||
src = Zero();
|
||||
{
|
||||
Coordinate origin({0,0,0,latt4[2]-1,0});
|
||||
SpinColourVectorF tmp;
|
||||
tmp=Zero();
|
||||
tmp()(0)(0)=Complex(-2.0,0.0);
|
||||
std::cout << " source site 0 " << tmp<<std::endl;
|
||||
pokeSite(tmp,src,origin);
|
||||
}
|
||||
#else
|
||||
RealD N2 = 1.0 / ::sqrt(norm2(src));
|
||||
src = src * N2;
|
||||
#endif
|
||||
|
||||
LatticeFermionF result(FGrid);
|
||||
result = Zero();
|
||||
LatticeFermionF ref(FGrid);
|
||||
ref = Zero();
|
||||
LatticeFermionF tmp(FGrid);
|
||||
LatticeFermionF err(FGrid);
|
||||
|
||||
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
|
||||
LatticeGaugeFieldF Umu(UGrid);
|
||||
SU<Nc>::HotConfiguration(RNG4, Umu);
|
||||
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
|
||||
#if 0
|
||||
Umu=1.0;
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
LatticeColourMatrixF ttmp(UGrid);
|
||||
ttmp = PeekIndex<LorentzIndex>(Umu,mu);
|
||||
// if (mu !=2 ) ttmp = 0;
|
||||
// ttmp = ttmp* pow(10.0,mu);
|
||||
PokeIndex<LorentzIndex>(Umu,ttmp,mu);
|
||||
}
|
||||
std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
|
||||
#endif
|
||||
|
||||
////////////////////////////////////
|
||||
// Naive wilson implementation
|
||||
////////////////////////////////////
|
||||
// replicate across fifth dimension
|
||||
// LatticeGaugeFieldF Umu5d(FGrid);
|
||||
std::vector<LatticeColourMatrixF> U(4, UGrid);
|
||||
for (int mu = 0; mu < Nd; mu++)
|
||||
{
|
||||
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
|
||||
}
|
||||
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
|
||||
|
||||
if (1)
|
||||
{
|
||||
ref = Zero();
|
||||
for (int mu = 0; mu < Nd; mu++)
|
||||
{
|
||||
|
||||
tmp = Cshift(src, mu + 1, 1);
|
||||
{
|
||||
autoView(tmp_v, tmp, CpuWrite);
|
||||
autoView(U_v, U[mu], CpuRead);
|
||||
for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
|
||||
{
|
||||
for (int s = 0; s < Ls; s++)
|
||||
{
|
||||
tmp_v[Ls * ss + s] = U_v[ss] * tmp_v[Ls * ss + s];
|
||||
}
|
||||
}
|
||||
}
|
||||
ref = ref + tmp - Gamma(Gmu[mu]) * tmp;
|
||||
|
||||
{
|
||||
autoView(tmp_v, tmp, CpuWrite);
|
||||
autoView(U_v, U[mu], CpuRead);
|
||||
autoView(src_v, src, CpuRead);
|
||||
for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
|
||||
{
|
||||
for (int s = 0; s < Ls; s++)
|
||||
{
|
||||
tmp_v[Ls * ss + s] = adj(U_v[ss]) * src_v[Ls * ss + s];
|
||||
}
|
||||
}
|
||||
}
|
||||
tmp = Cshift(tmp, mu + 1, -1);
|
||||
ref = ref + tmp + Gamma(Gmu[mu]) * tmp;
|
||||
}
|
||||
ref = -0.5 * ref;
|
||||
}
|
||||
|
||||
RealD mass = 0.1;
|
||||
RealD M5 = 1.8;
|
||||
|
||||
RealD NP = UGrid->_Nprocessors;
|
||||
RealD NN = UGrid->NodeCount();
|
||||
|
||||
json["ranks"] = NP;
|
||||
json["nodes"] = NN;
|
||||
|
||||
std::cout << GridLogMessage
|
||||
<< "*****************************************************************"
|
||||
<< std::endl;
|
||||
std::cout << GridLogMessage
|
||||
<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm"
|
||||
<< std::endl;
|
||||
std::cout << GridLogMessage
|
||||
<< "*****************************************************************"
|
||||
<< std::endl;
|
||||
std::cout << GridLogMessage
|
||||
<< "*****************************************************************"
|
||||
<< std::endl;
|
||||
std::cout << GridLogMessage
|
||||
<< "* Benchmarking DomainWallFermionR::Dhop " << std::endl;
|
||||
std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd()
|
||||
<< std::endl;
|
||||
std::cout << GridLogMessage << "* VComplexF size is " << sizeof(vComplexF) << " B"
|
||||
<< std::endl;
|
||||
|
||||
if (sizeof(RealF) == 4)
|
||||
std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
|
||||
if (sizeof(RealF) == 8)
|
||||
std::cout << GridLogMessage << "* DOUBLE precision " << std::endl;
|
||||
#ifdef GRID_OMP
|
||||
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
|
||||
std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl;
|
||||
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
|
||||
std::cout << GridLogMessage << "* Using sequential comms compute" << std::endl;
|
||||
#endif
|
||||
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric)
|
||||
std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl;
|
||||
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll)
|
||||
std::cout << GridLogMessage << "* Using Nc=3 WilsonKernels" << std::endl;
|
||||
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm)
|
||||
|
||||
std::cout << GridLogMessage << "* Using Asm Nc=3 WilsonKernels" << std::endl;
|
||||
std::cout << GridLogMessage
|
||||
<< "*****************************************************************"
|
||||
<< std::endl;
|
||||
|
||||
DomainWallFermionF Dw(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5);
|
||||
int ncall = 300;
|
||||
|
||||
if (1)
|
||||
{
|
||||
FGrid->Barrier();
|
||||
Dw.ZeroCounters();
|
||||
Dw.Dhop(src, result, 0);
|
||||
std::cout << GridLogMessage << "Called warmup" << std::endl;
|
||||
double t0 = usecond();
|
||||
for (int i = 0; i < ncall; i++)
|
||||
{
|
||||
__SSC_START;
|
||||
Dw.Dhop(src, result, 0);
|
||||
__SSC_STOP;
|
||||
}
|
||||
double t1 = usecond();
|
||||
FGrid->Barrier();
|
||||
|
||||
double volume = Ls;
|
||||
for (int mu = 0; mu < Nd; mu++)
|
||||
volume = volume * latt4[mu];
|
||||
double flops = single_site_flops * volume * ncall;
|
||||
|
||||
auto nsimd = vComplex::Nsimd();
|
||||
auto simdwidth = sizeof(vComplex);
|
||||
|
||||
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
|
||||
double data_rf = volume * ((2 * Nd + 1) * Nd * Nc + 2 * Nd * Nc * Nc) * simdwidth /
|
||||
nsimd * ncall / (1024. * 1024. * 1024.);
|
||||
|
||||
// mem: Nd Wilson * Ls, Nd gauge, Nc colors
|
||||
double data_mem =
|
||||
(volume * (2 * Nd + 1) * Nd * Nc + (volume / Ls) * 2 * Nd * Nc * Nc) * simdwidth /
|
||||
nsimd * ncall / (1024. * 1024. * 1024.);
|
||||
|
||||
json["Dw"]["calls"] = ncall;
|
||||
json["Dw"]["time"] = t1 - t0;
|
||||
json["Dw"]["mflops"] = flops / (t1 - t0);
|
||||
json["Dw"]["mflops_per_rank"] = flops / (t1 - t0) / NP;
|
||||
json["Dw"]["mflops_per_node"] = flops / (t1 - t0) / NN;
|
||||
json["Dw"]["RF"] = 1000000. * data_rf / ((t1 - t0));
|
||||
json["Dw"]["mem"] = 1000000. * data_mem / ((t1 - t0));
|
||||
|
||||
std::cout << GridLogMessage << "Called Dw " << ncall << " times in " << t1 - t0
|
||||
<< " us" << std::endl;
|
||||
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
||||
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
|
||||
std::cout << GridLogMessage << "mflop/s = " << flops / (t1 - t0) << std::endl;
|
||||
std::cout << GridLogMessage << "mflop/s per rank = " << flops / (t1 - t0) / NP
|
||||
<< std::endl;
|
||||
std::cout << GridLogMessage << "mflop/s per node = " << flops / (t1 - t0) / NN
|
||||
<< std::endl;
|
||||
|
||||
std::cout << GridLogMessage
|
||||
<< "RF GiB/s (base 2) = " << 1000000. * data_rf / ((t1 - t0))
|
||||
<< std::endl;
|
||||
std::cout << GridLogMessage
|
||||
<< "mem GiB/s (base 2) = " << 1000000. * data_mem / ((t1 - t0))
|
||||
<< std::endl;
|
||||
err = ref - result;
|
||||
std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
|
||||
// exit(0);
|
||||
|
||||
if ((norm2(err) > 1.0e-4))
|
||||
{
|
||||
/*
|
||||
std::cout << "RESULT\n " << result<<std::endl;
|
||||
std::cout << "REF \n " << ref <<std::endl;
|
||||
std::cout << "ERR \n " << err <<std::endl;
|
||||
*/
|
||||
std::cout << GridLogMessage << "WRONG RESULT" << std::endl;
|
||||
FGrid->Barrier();
|
||||
exit(-1);
|
||||
}
|
||||
assert(norm2(err) < 1.0e-4);
|
||||
Dw.Report();
|
||||
}
|
||||
|
||||
if (1)
|
||||
{ // Naive wilson dag implementation
|
||||
ref = Zero();
|
||||
for (int mu = 0; mu < Nd; mu++)
|
||||
{
|
||||
|
||||
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
|
||||
tmp = Cshift(src, mu + 1, 1);
|
||||
{
|
||||
autoView(ref_v, ref, CpuWrite);
|
||||
autoView(tmp_v, tmp, CpuRead);
|
||||
autoView(U_v, U[mu], CpuRead);
|
||||
for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
|
||||
{
|
||||
for (int s = 0; s < Ls; s++)
|
||||
{
|
||||
int i = s + Ls * ss;
|
||||
ref_v[i] += U_v[ss] * (tmp_v[i] + Gamma(Gmu[mu]) * tmp_v[i]);
|
||||
;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
autoView(tmp_v, tmp, CpuWrite);
|
||||
autoView(U_v, U[mu], CpuRead);
|
||||
autoView(src_v, src, CpuRead);
|
||||
for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
|
||||
{
|
||||
for (int s = 0; s < Ls; s++)
|
||||
{
|
||||
tmp_v[Ls * ss + s] = adj(U_v[ss]) * src_v[Ls * ss + s];
|
||||
}
|
||||
}
|
||||
}
|
||||
// tmp =adj(U[mu])*src;
|
||||
tmp = Cshift(tmp, mu + 1, -1);
|
||||
{
|
||||
autoView(ref_v, ref, CpuWrite);
|
||||
autoView(tmp_v, tmp, CpuRead);
|
||||
for (int i = 0; i < ref_v.size(); i++)
|
||||
{
|
||||
ref_v[i] += tmp_v[i] - Gamma(Gmu[mu]) * tmp_v[i];
|
||||
;
|
||||
}
|
||||
}
|
||||
}
|
||||
ref = -0.5 * ref;
|
||||
}
|
||||
// dump=1;
|
||||
Dw.Dhop(src, result, 1);
|
||||
|
||||
std::cout << GridLogMessage
|
||||
<< "Compare to naive wilson implementation Dag to verify correctness"
|
||||
<< std::endl;
|
||||
std::cout << GridLogMessage << "Called DwDag" << std::endl;
|
||||
std::cout << GridLogMessage << "norm dag result " << norm2(result) << std::endl;
|
||||
std::cout << GridLogMessage << "norm dag ref " << norm2(ref) << std::endl;
|
||||
err = ref - result;
|
||||
std::cout << GridLogMessage << "norm dag diff " << norm2(err) << std::endl;
|
||||
if ((norm2(err) > 1.0e-4))
|
||||
{
|
||||
/*
|
||||
std::cout<< "DAG RESULT\n " <<ref << std::endl;
|
||||
std::cout<< "DAG sRESULT\n " <<result << std::endl;
|
||||
std::cout<< "DAG ERR \n " << err <<std::endl;
|
||||
*/
|
||||
}
|
||||
LatticeFermionF src_e(FrbGrid);
|
||||
LatticeFermionF src_o(FrbGrid);
|
||||
LatticeFermionF r_e(FrbGrid);
|
||||
LatticeFermionF r_o(FrbGrid);
|
||||
LatticeFermionF r_eo(FGrid);
|
||||
|
||||
std::cout << GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"
|
||||
<< std::endl;
|
||||
pickCheckerboard(Even, src_e, src);
|
||||
pickCheckerboard(Odd, src_o, src);
|
||||
|
||||
std::cout << GridLogMessage << "src_e" << norm2(src_e) << std::endl;
|
||||
std::cout << GridLogMessage << "src_o" << norm2(src_o) << std::endl;
|
||||
|
||||
// S-direction is INNERMOST and takes no part in the parity.
|
||||
std::cout << GridLogMessage
|
||||
|
||||
<< "*********************************************************" << std::endl;
|
||||
std::cout << GridLogMessage
|
||||
<< "* Benchmarking DomainWallFermionF::DhopEO " << std::endl;
|
||||
std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd()
|
||||
<< std::endl;
|
||||
|
||||
if (sizeof(RealF) == 4)
|
||||
std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
|
||||
if (sizeof(RealF) == 8)
|
||||
std::cout << GridLogMessage << "* DOUBLE precision " << std::endl;
|
||||
#ifdef GRID_OMP
|
||||
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
|
||||
std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl;
|
||||
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
|
||||
std::cout << GridLogMessage << "* Using sequential comms compute" << std::endl;
|
||||
#endif
|
||||
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric)
|
||||
std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl;
|
||||
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll)
|
||||
std::cout << GridLogMessage << "* Using Nc=3 WilsonKernels" << std::endl;
|
||||
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm)
|
||||
std::cout << GridLogMessage << "* Using Asm Nc=3 WilsonKernels" << std::endl;
|
||||
std::cout << GridLogMessage
|
||||
<< "*********************************************************" << std::endl;
|
||||
|
||||
{
|
||||
Dw.ZeroCounters();
|
||||
FGrid->Barrier();
|
||||
Dw.DhopEO(src_o, r_e, DaggerNo);
|
||||
double t0 = usecond();
|
||||
for (int i = 0; i < ncall; i++)
|
||||
{
|
||||
#ifdef CUDA_PROFILE
|
||||
if (i == 10)
|
||||
cudaProfilerStart();
|
||||
#endif
|
||||
Dw.DhopEO(src_o, r_e, DaggerNo);
|
||||
#ifdef CUDA_PROFILE
|
||||
if (i == 20)
|
||||
cudaProfilerStop();
|
||||
#endif
|
||||
}
|
||||
double t1 = usecond();
|
||||
FGrid->Barrier();
|
||||
|
||||
double volume = Ls;
|
||||
for (int mu = 0; mu < Nd; mu++)
|
||||
volume = volume * latt4[mu];
|
||||
double flops = (single_site_flops * volume * ncall) / 2.0;
|
||||
|
||||
json["Deo"]["calls"] = ncall;
|
||||
json["Deo"]["time"] = t1 - t0;
|
||||
json["Deo"]["mflops"] = flops / (t1 - t0);
|
||||
json["Deo"]["mflops_per_rank"] = flops / (t1 - t0) / NP;
|
||||
json["Deo"]["mflops_per_node"] = flops / (t1 - t0) / NN;
|
||||
|
||||
std::cout << GridLogMessage << "Deo mflop/s = " << flops / (t1 - t0) << std::endl;
|
||||
std::cout << GridLogMessage << "Deo mflop/s per rank " << flops / (t1 - t0) / NP
|
||||
<< std::endl;
|
||||
std::cout << GridLogMessage << "Deo mflop/s per node " << flops / (t1 - t0) / NN
|
||||
<< std::endl;
|
||||
|
||||
Dw.Report();
|
||||
}
|
||||
Dw.DhopEO(src_o, r_e, DaggerNo);
|
||||
Dw.DhopOE(src_e, r_o, DaggerNo);
|
||||
Dw.Dhop(src, result, DaggerNo);
|
||||
|
||||
std::cout << GridLogMessage << "r_e" << norm2(r_e) << std::endl;
|
||||
std::cout << GridLogMessage << "r_o" << norm2(r_o) << std::endl;
|
||||
std::cout << GridLogMessage << "res" << norm2(result) << std::endl;
|
||||
|
||||
setCheckerboard(r_eo, r_o);
|
||||
setCheckerboard(r_eo, r_e);
|
||||
|
||||
err = r_eo - result;
|
||||
std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
|
||||
if ((norm2(err) > 1.0e-4))
|
||||
{
|
||||
/*
|
||||
std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
|
||||
std::cout<< "Deo REF\n " <<result << std::endl;
|
||||
std::cout<< "Deo ERR \n " << err <<std::endl;
|
||||
*/
|
||||
}
|
||||
|
||||
pickCheckerboard(Even, src_e, err);
|
||||
pickCheckerboard(Odd, src_o, err);
|
||||
std::cout << GridLogMessage << "norm diff even " << norm2(src_e) << std::endl;
|
||||
std::cout << GridLogMessage << "norm diff odd " << norm2(src_o) << std::endl;
|
||||
|
||||
assert(norm2(src_e) < 1.0e-4);
|
||||
assert(norm2(src_o) < 1.0e-4);
|
||||
|
||||
if (!json_filename.empty())
|
||||
{
|
||||
std::cout << GridLogMessage << "writing benchmark results to " << json_filename
|
||||
<< std::endl;
|
||||
|
||||
int me = 0;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
||||
if (me == 0)
|
||||
{
|
||||
std::ofstream json_file(json_filename);
|
||||
json_file << std::setw(4) << json;
|
||||
}
|
||||
}
|
||||
|
||||
Grid_finalize();
|
||||
exit(0);
|
||||
}
|
@ -26,20 +26,6 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
#define GRID_MSG_MAXSIZE 1024
|
||||
#endif
|
||||
|
||||
#define GRID_BIG_SEP \
|
||||
"==============================================================================="
|
||||
#define GRID_SMALL_SEP "------------------------------------------"
|
||||
|
||||
#define grid_big_sep() \
|
||||
{ \
|
||||
GRID_MSG << GRID_BIG_SEP << std::endl; \
|
||||
}
|
||||
|
||||
#define grid_small_sep() \
|
||||
{ \
|
||||
GRID_MSG << GRID_SMALL_SEP << std::endl; \
|
||||
}
|
||||
|
||||
#define grid_printf(...) \
|
||||
{ \
|
||||
char _buf[GRID_MSG_MAXSIZE]; \
|
||||
|
@ -1,8 +1,12 @@
|
||||
ACLOCAL_AMFLAGS = -I .buildutils/m4
|
||||
|
||||
bin_PROGRAMS = \
|
||||
Benchmark_comms_host_device \
|
||||
Benchmark_dwf_fp32 \
|
||||
Benchmark_Grid \
|
||||
Benchmark_IO
|
||||
|
||||
|
||||
Benchmark_comms_host_device_SOURCES = Benchmark_comms_host_device.cpp
|
||||
Benchmark_dwf_fp32_SOURCES = Benchmark_dwf_fp32.cpp
|
||||
Benchmark_Grid_SOURCES = Benchmark_Grid.cpp
|
||||
Benchmark_IO_SOURCES = Benchmark_IO.cpp
|
||||
|
@ -6,7 +6,6 @@ The benchmarks can be summarised as follows
- `Benchmark_Grid`: This benchmark measures floating-point performance for various fermion
matrices, as well as bandwidth measurements for different operations. Measurements are
performed for a fixed range of problem sizes.
- `Benchmark_IO`: Parallel I/O benchmark.

## TL;DR
Build and install Grid, all dependencies, and the benchmark with
@ -29,7 +28,7 @@ You should first deploy the environment for the specific system you are using, f
systems/tursa/bootstrap-env.sh ./env
```
will deploy the relevant environment for the [Tursa](https://www.epcc.ed.ac.uk/hpc-services/dirac-tursa-gpu) supercomputer in `./env`. This step might compile from source a large set
of packages, and take some time to complete.
of packages, and might take some time to complete.

After that, the environment directory (`./env` in the example above) will contain an `env.sh` file that needs to be sourced to activate the environment
```bash
@ -67,84 +66,4 @@ where `<env_dir>` is the environment directory and `<config>` is the build confi

## Running the benchmarks
After building the benchmarks as above, you can find the binaries in
`<env_dir>/prefix/gridbench_<config>`. Depending on the system selected, the environment
directory might also contain batch script examples. More information about the benchmarks
is provided below.

### `Benchmark_Grid`
This benchmark performs flop/s measurements for typical lattice QCD sparse matrices, as
well as memory and inter-process bandwidth measurements using Grid routines. The benchmark
command accepts any Grid flag (see the complete list with `--help`), as well as a
`--json-out <file>` flag to save the measurement results in JSON to `<file>`. The
benchmarks are performed on a fixed set of problem sizes, and the Grid flag `--grid` will
be ignored.
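
As an illustration, a minimal launch (assuming the environment has been sourced and a GPU
build named `gridbench_gpu`; the launcher, binding wrapper, and flag values are
system-dependent, see the Tursa batch scripts for a complete example) could look like
```bash
# run on 4 MPI ranks and save the measurements to result.json;
# the --mpi/--threads/--shm values below are placeholders to adapt to the machine
mpirun -np 4 \
  "<env_dir>/prefix/gridbench_gpu/bin/Benchmark_Grid" \
  --json-out result.json \
  --mpi 1.1.1.4 \
  --threads 8 \
  --shm 2048
```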

The resulting metrics are as follows; all data size units are in base 2
(i.e. 1 kB = 1024 B).

*Memory bandwidth*

One sub-benchmark measures the memory bandwidth using a lattice version of the `axpy` BLAS
routine, in a similar fashion to the STREAM benchmark. The JSON entries under `"axpy"`
have the form
```json
{
  "GBps": 215.80653375861607,   // bandwidth in GB/s/node
  "GFlops": 19.310041765757834, // FP performance (double precision)
  "L": 8,                       // local lattice volume
  "size_MB": 3.0                // memory size in MB/node
}
```

A second benchmark performs site-wise SU(4) matrix multiplication, and has a higher
arithmetic intensity than the `axpy` one (although it is still memory-bound).
The JSON entries under `"SU4"` have the form
```json
{
  "GBps": 394.76639187026865,  // bandwidth in GB/s/node
  "GFlops": 529.8464820758512, // FP performance (single precision)
  "L": 8,                      // local lattice size
  "size_MB": 6.0               // memory size in MB/node
}
```

*Inter-process bandwidth*

This sub-benchmark measures the achieved bidirectional bandwidth in threaded halo exchange
using routines in Grid. The exchange is performed in each direction on the MPI Cartesian
grid which is parallelised across at least 2 processes. The resulting bandwidth is related
to node-local transfers (inter-CPU, NVLink, ...) or network transfers, depending on the MPI
decomposition. The JSON entries under `"comms"` have the form
```json
{
  "L": 40,           // local lattice size
  "bytes": 73728000, // payload size in B/rank
  "dir": 2,          // direction of the exchange, 8 possible directions
                     // (0: +x, 1: +y, ..., 5: -x, 6: -y, ...)
  "rate_GBps": {
    "error": 6.474271894240327, // standard deviation across measurements (GB/s/node)
    "max": 183.10546875,        // maximum measured bandwidth (GB/s/node)
    "mean": 175.21747026766676  // average measured bandwidth (GB/s/node)
  },
  "time_usec": 3135.055 // average transfer time (microseconds)
}
```

*Floating-point performance*

This sub-benchmark measures the achieved floating-point performance using the
Wilson fermion, domain-wall fermion, and staggered fermion sparse matrices from Grid.
The best performances are recorded in the `"flops"` and `"results"` sections of the
JSON output, e.g.
```json
{
  "Gflops_dwf4": 366.5251173474483,       // domain-wall in Gflop/s/node (single precision)
  "Gflops_staggered": 7.5982861018529455, // staggered in Gflop/s/node (single precision)
  "Gflops_wilson": 15.221839719288932,    // Wilson in Gflop/s/node (single precision)
  "L": 8                                  // local lattice size
}
```
Here "best" means across a number of different implementations of the routines. Please
see the log of the benchmark for an additional breakdown. Finally, the JSON output
contains a "comparison point", which is the average of the L=24 and L=32 best
domain-wall performances.
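
As a convenience, the headline numbers can be pulled out of a result file with `jq`
(already used by the build scripts). The sketch below assumes that `"flops"` holds an
array of records like the one above; adjust the filter to the exact layout of your output.
```bash
# print the local lattice size and best domain-wall performance of each record
jq -r '.flops[] | "L=\(.L)  dwf4=\(.Gflops_dwf4) Gflop/s/node"' result.json
```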

`<env_dir>/prefix/gridbench_<config>`.
@ -20,12 +20,8 @@ mkdir -p "${build_dir}"
|
||||
source "${env_dir}/env.sh"
|
||||
entry=$(jq ".configs[]|select(.name==\"${cfg}\")" "${env_dir}"/grid-config.json)
|
||||
env_script=$(echo "${entry}" | jq -r ".\"env-script\"")
|
||||
cd "${build_dir}" || return
|
||||
source "${env_dir}/${env_script}"
|
||||
cd "${script_dir}"
|
||||
if [ ! -f configure ]; then
|
||||
./bootstrap.sh
|
||||
fi
|
||||
cd "${build_dir}"
|
||||
if [ ! -f Makefile ]; then
|
||||
"${script_dir}/configure" --with-grid="${env_dir}/prefix/grid_${cfg}" \
|
||||
--prefix="${env_dir}/prefix/gridbench_${cfg}"
|
||||
|
@ -1,8 +1,8 @@
|
||||
#!/usr/bin/env bash
|
||||
# shellcheck disable=SC1091
|
||||
|
||||
env_dir="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")"
|
||||
mkdir -p ~/.config/lattice-benchmarks
|
||||
echo "${env_dir}" > ~/.config/lattice-benchmarks/grid-env
|
||||
source "${env_dir}/spack/share/spack/setup-env.sh"
|
||||
spack load jq git
|
||||
GRIDENVDIR="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")"
|
||||
export GRIDENVDIR
|
||||
export PATH="${GRIDENVDIR}/prefix/base/bin:${PATH}"
|
||||
export ACLOCAL_PATH="${GRIDENVDIR}/prefix/base/share/aclocal:${ACLOCAL_PATH}"
|
||||
source "${GRIDENVDIR}"/spack/share/spack/setup-env.sh
|
||||
|
0
Grid/systems/tursa/files/cpu-mpi-wrapper.sh
Executable file → Normal file
5
Grid/systems/tursa/files/gpu-mpi-wrapper.sh
Executable file → Normal file
@ -1,12 +1,13 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
|
||||
numa1=$((lrank))
|
||||
numa1=$(( 2 * lrank))
|
||||
numa2=$(( 2 * lrank + 1 ))
|
||||
netdev=mlx5_${lrank}:1
|
||||
|
||||
export CUDA_VISIBLE_DEVICES=$OMPI_COMM_WORLD_LOCAL_RANK
|
||||
export UCX_NET_DEVICES=${netdev}
|
||||
BINDING="--interleave=$numa1"
|
||||
BINDING="--interleave=$numa1,$numa2"
|
||||
|
||||
echo "$(hostname) - $lrank device=$CUDA_VISIBLE_DEVICES binding=$BINDING"
|
||||
|
||||
|
@ -1,17 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# OpenMP/OpenMPI/UCX environment ###############################################
|
||||
export OMP_NUM_THREADS=8
|
||||
export OMPI_MCA_btl=^uct,openib
|
||||
export OMPI_MCA_pml=ucx
|
||||
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
|
||||
export UCX_RNDV_SCHEME=put_zcopy
|
||||
export UCX_RNDV_THRESH=16384
|
||||
export UCX_IB_GPU_DIRECT_RDMA=yes
|
||||
export UCX_MEMTYPE_CACHE=n
|
||||
|
||||
# IO environment ###############################################################
|
||||
export OMPI_MCA_io=romio321
|
||||
export OMPI_MCA_btl_openib_allow_ib=true
|
||||
export OMPI_MCA_btl_openib_device_type=infiniband
|
||||
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
|
@ -1,60 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
# shellcheck disable=SC1091,SC2050,SC2170
|
||||
|
||||
#SBATCH -J benchmark-grid-16
|
||||
#SBATCH -t 1:00:00
|
||||
#SBATCH --nodes=16
|
||||
#SBATCH --ntasks=64
|
||||
#SBATCH --ntasks-per-node=4
|
||||
#SBATCH --cpus-per-task=8
|
||||
#SBATCH --partition=gpu
|
||||
#SBATCH --gres=gpu:4
|
||||
#SBATCH --output=%x.%j.out
|
||||
#SBATCH --error=%x.%j.err
|
||||
#SBATCH --qos=standard
|
||||
#SBATCH --no-requeue
|
||||
#SBATCH --gpu-freq=1410
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# load environment #############################################################
|
||||
env_cfg="${HOME}/.config/lattice-benchmarks/grid-env"
|
||||
if [ ! -f "${env_cfg}" ]; then
|
||||
echo "error: ${env_cfg} does not exists, did you execute 'source env.sh' with your user account?"
|
||||
exit 1
|
||||
fi
|
||||
env_dir="$(readlink -f "$(cat "${env_cfg}")")"
|
||||
source "${env_dir}/env.sh" # load base Spack environment
|
||||
source "${env_dir}/env-gpu.sh" # load GPU-sepcific packages
|
||||
source "${env_dir}/ompi-gpu.sh" # set GPU-specific OpenMPI variables
|
||||
|
||||
# application and parameters ###################################################
|
||||
app="${env_dir}/prefix/gridbench_gpu/bin/Benchmark_Grid"
|
||||
|
||||
# collect job information ######################################################
|
||||
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
|
||||
mkdir -p "${job_info_dir}"
|
||||
|
||||
date > "${job_info_dir}/start-date"
|
||||
set > "${job_info_dir}/env"
|
||||
ldd "${app}" > "${job_info_dir}/ldd"
|
||||
md5sum "${app}" > "${job_info_dir}/app-hash"
|
||||
readelf -a "${app}" > "${job_info_dir}/elf"
|
||||
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
|
||||
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
|
||||
|
||||
# run! #########################################################################
|
||||
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
|
||||
"${env_dir}/gpu-mpi-wrapper.sh" \
|
||||
"${app}" \
|
||||
--json-out "${job_info_dir}/result.json" \
|
||||
--mpi 1.4.4.4 \
|
||||
--accelerator-threads 8 \
|
||||
--threads 8 \
|
||||
--shm 2048 &> "${job_info_dir}/log"
|
||||
|
||||
# if we reach that point the application exited successfully ###################
|
||||
touch "${job_info_dir}/success"
|
||||
date > "${job_info_dir}/end-date"
|
||||
|
||||
################################################################################
|
@ -1,60 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
# shellcheck disable=SC1091,SC2050,SC2170
|
||||
|
||||
#SBATCH -J benchmark-grid-1
|
||||
#SBATCH -t 1:00:00
|
||||
#SBATCH --nodes=1
|
||||
#SBATCH --ntasks=4
|
||||
#SBATCH --ntasks-per-node=4
|
||||
#SBATCH --cpus-per-task=8
|
||||
#SBATCH --partition=gpu
|
||||
#SBATCH --gres=gpu:4
|
||||
#SBATCH --output=%x.%j.out
|
||||
#SBATCH --error=%x.%j.err
|
||||
#SBATCH --qos=standard
|
||||
#SBATCH --no-requeue
|
||||
#SBATCH --gpu-freq=1410
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# load environment #############################################################
|
||||
env_cfg="${HOME}/.config/lattice-benchmarks/grid-env"
|
||||
if [ ! -f "${env_cfg}" ]; then
|
||||
echo "error: ${env_cfg} does not exists, did you execute 'source env.sh' with your user account?"
|
||||
exit 1
|
||||
fi
|
||||
env_dir="$(readlink -f "$(cat "${env_cfg}")")"
|
||||
source "${env_dir}/env.sh" # load base Spack environment
|
||||
source "${env_dir}/env-gpu.sh" # load GPU-sepcific packages
|
||||
source "${env_dir}/ompi-gpu.sh" # set GPU-specific OpenMPI variables
|
||||
|
||||
# application and parameters ###################################################
|
||||
app="${env_dir}/prefix/gridbench_gpu/bin/Benchmark_Grid"
|
||||
|
||||
# collect job information ######################################################
|
||||
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
|
||||
mkdir -p "${job_info_dir}"
|
||||
|
||||
date > "${job_info_dir}/start-date"
|
||||
set > "${job_info_dir}/env"
|
||||
ldd "${app}" > "${job_info_dir}/ldd"
|
||||
md5sum "${app}" > "${job_info_dir}/app-hash"
|
||||
readelf -a "${app}" > "${job_info_dir}/elf"
|
||||
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
|
||||
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
|
||||
|
||||
# run! #########################################################################
|
||||
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
|
||||
"${env_dir}/gpu-mpi-wrapper.sh" \
|
||||
"${app}" \
|
||||
--json-out "${job_info_dir}/result.json" \
|
||||
--mpi 1.1.1.4 \
|
||||
--accelerator-threads 8 \
|
||||
--threads 8 \
|
||||
--shm 2048 &> "${job_info_dir}/log"
|
||||
|
||||
# if we reach that point the application exited successfully ###################
|
||||
touch "${job_info_dir}/success"
|
||||
date > "${job_info_dir}/end-date"
|
||||
|
||||
################################################################################
|
@ -1,60 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
# shellcheck disable=SC1091,SC2050,SC2170
|
||||
|
||||
#SBATCH -J benchmark-grid-32
|
||||
#SBATCH -t 1:00:00
|
||||
#SBATCH --nodes=32
|
||||
#SBATCH --ntasks=128
|
||||
#SBATCH --ntasks-per-node=4
|
||||
#SBATCH --cpus-per-task=8
|
||||
#SBATCH --partition=gpu
|
||||
#SBATCH --gres=gpu:4
|
||||
#SBATCH --output=%x.%j.out
|
||||
#SBATCH --error=%x.%j.err
|
||||
#SBATCH --qos=standard
|
||||
#SBATCH --no-requeue
|
||||
#SBATCH --gpu-freq=1410
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# load environment #############################################################
|
||||
env_cfg="${HOME}/.config/lattice-benchmarks/grid-env"
|
||||
if [ ! -f "${env_cfg}" ]; then
|
||||
echo "error: ${env_cfg} does not exists, did you execute 'source env.sh' with your user account?"
|
||||
exit 1
|
||||
fi
|
||||
env_dir="$(readlink -f "$(cat "${env_cfg}")")"
|
||||
source "${env_dir}/env.sh" # load base Spack environment
|
||||
source "${env_dir}/env-gpu.sh" # load GPU-sepcific packages
|
||||
source "${env_dir}/ompi-gpu.sh" # set GPU-specific OpenMPI variables
|
||||
|
||||
# application and parameters ###################################################
|
||||
app="${env_dir}/prefix/gridbench_gpu/bin/Benchmark_Grid"
|
||||
|
||||
# collect job information ######################################################
|
||||
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
|
||||
mkdir -p "${job_info_dir}"
|
||||
|
||||
date > "${job_info_dir}/start-date"
|
||||
set > "${job_info_dir}/env"
|
||||
ldd "${app}" > "${job_info_dir}/ldd"
|
||||
md5sum "${app}" > "${job_info_dir}/app-hash"
|
||||
readelf -a "${app}" > "${job_info_dir}/elf"
|
||||
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
|
||||
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
|
||||
|
||||
# run! #########################################################################
|
||||
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
|
||||
"${env_dir}/gpu-mpi-wrapper.sh" \
|
||||
"${app}" \
|
||||
--json-out "${job_info_dir}/result.json" \
|
||||
--mpi 1.4.4.8 \
|
||||
--accelerator-threads 8 \
|
||||
--threads 8 \
|
||||
--shm 2048 &> "${job_info_dir}/log"
|
||||
|
||||
# if we reach that point the application exited successfully ###################
|
||||
touch "${job_info_dir}/success"
|
||||
date > "${job_info_dir}/end-date"
|
||||
|
||||
################################################################################
|
@ -4,13 +4,7 @@ set -euo pipefail

gcc_spec='gcc@9.4.0'
cuda_spec='cuda@11.4.0'

# hdf5 and fftw depend on OpenMPI, which we install manually. To make sure this
# dependency is picked by spack, we specify the compiler here explicitly. For
# most other packages we don't really care about the compiler (i.e. system
# compiler versus ${gcc_spec})
hdf5_spec="hdf5@1.10.7+cxx+threadsafe%${gcc_spec}"
fftw_spec="fftw%${gcc_spec}"
hdf5_spec='hdf5@1.10.7'

if (( $# != 1 )); then
  echo "usage: $(basename "$0") <env dir>" 1>&2
@ -24,7 +18,7 @@ cd "${cwd}"

# General configuration ########################################################
# build with 128 tasks
echo 'config:
echo 'config:
  build_jobs: 128
  build_stage:
    - $spack/var/spack/stage
@ -44,23 +38,26 @@ rm external.yaml

# Base compilers ###############################################################
# configure system base

spack env create base
spack env activate base
spack compiler find --scope site

# install GCC, CUDA
spack add ${gcc_spec} ${cuda_spec}
spack concretize
spack env depfile -o Makefile.tmp
make -j128 -f Makefile.tmp
# install GCC, CUDA & LLVM
spack install ${gcc_spec} ${cuda_spec} llvm

spack load llvm
spack compiler find --scope site
spack unload llvm

spack load ${gcc_spec}
spack compiler find --scope site
spack unload ${gcc_spec}

# Manual compilation of OpenMPI & UCX ##########################################
# set build directories
mkdir -p "${dir}"/build
cd "${dir}"/build

spack load ${gcc_spec} ${cuda_spec}

cuda_path=$(spack find --format "{prefix}" cuda)
gdrcopy_path=/mnt/lustre/tursafs1/apps/gdrcopy/2.3.1

@ -80,7 +77,7 @@ mkdir -p build_gpu; cd build_gpu
  --enable-devel-headers --enable-examples --enable-optimizations \
  --with-gdrcopy=${gdrcopy_path} --with-verbs --disable-logging \
  --disable-debug --disable-assertions --enable-cma \
  --with-knem=/opt/knem-1.1.4.90mlnx2/ --with-rdmacm \
  --with-knem=/opt/knem-1.1.4.90mlnx1/ --with-rdmacm \
  --without-rocm --without-ugni --without-java \
  --enable-compiler-opt=3 --with-cuda="${cuda_path}" --without-cm \
  --with-rc --with-ud --with-dc --with-mlx5-dv --with-dm \
@ -96,7 +93,7 @@ mkdir -p build_cpu; cd build_cpu
  --enable-devel-headers --enable-examples --enable-optimizations \
  --with-verbs --disable-logging --disable-debug \
  --disable-assertions --enable-mt --enable-cma \
  --with-knem=/opt/knem-1.1.4.90mlnx2/--with-rdmacm \
  --with-knem=/opt/knem-1.1.4.90mlnx1/ --with-rdmacm \
  --without-rocm --without-ugni --without-java \
  --enable-compiler-opt=3 --without-cm --without-ugni --with-rc \
  --with-ud --with-dc --with-mlx5-dv --with-dm --enable-mt --without-go
@ -122,13 +119,13 @@ mkdir build_gpu; cd build_gpu
../configure --prefix="${dir}"/prefix/ompi_gpu --without-xpmem \
  --with-ucx="${dir}"/prefix/ucx_gpu \
  --with-ucx-libdir="${dir}"/prefix/ucx_gpu/lib \
  --with-knem=/opt/knem-1.1.4.90mlnx2/ \
  --with-knem=/opt/knem-1.1.4.90mlnx1/ \
  --enable-mca-no-build=btl-uct \
  --with-cuda="${cuda_path}" --disable-getpwuid \
  --with-verbs --with-slurm --enable-mpi-fortran=all \
  --with-pmix=internal --with-libevent=internal
make -j 128
make install
make -j 128
make install
cd ..

# openmpi cpu build
@ -136,7 +133,7 @@ mkdir build_cpu; cd build_cpu
../configure --prefix="${dir}"/prefix/ompi_cpu --without-xpmem \
  --with-ucx="${dir}"/prefix/ucx_cpu \
  --with-ucx-libdir="${dir}"/prefix/ucx_cpu/lib \
  --with-knem=/opt/knem-1.1.4.90mlnx2/ \
  --with-knem=/opt/knem-1.1.4.90mlnx1/ \
  --enable-mca-no-build=btl-uct --disable-getpwuid \
  --with-verbs --with-slurm --enable-mpi-fortran=all \
  --with-pmix=internal --with-libevent=internal
@ -144,65 +141,65 @@ make -j 128
make install
cd "${dir}"

ucx_spec_gpu="ucx@1.12.0.GPU%${gcc_spec}"
ucx_spec_cpu="ucx@1.12.0.CPU%${gcc_spec}"
openmpi_spec_gpu="openmpi@4.1.1.GPU%${gcc_spec}"
openmpi_spec_cpu="openmpi@4.1.1.CPU%${gcc_spec}"

# Add externals to spack
echo "packages:
  ucx:
    externals:
    - spec: \"${ucx_spec_gpu}\"
    - spec: \"ucx@1.12.0.GPU%gcc@9.4.0\"
      prefix: ${dir}/prefix/ucx_gpu
    - spec: \"${ucx_spec_cpu}\"
    - spec: \"ucx@1.12.0.CPU%gcc@9.4.0\"
      prefix: ${dir}/prefix/ucx_cpu
    buildable: False
  openmpi:
    externals:
    - spec: \"${openmpi_spec_gpu}\"
    - spec: \"openmpi@4.1.1.GPU%gcc@9.4.0\"
      prefix: ${dir}/prefix/ompi_gpu
    - spec: \"${openmpi_spec_cpu}\"
    - spec: \"openmpi@4.1.1.CPU%gcc@9.4.0\"
      prefix: ${dir}/prefix/ompi_cpu
    buildable: False" > spack.yaml

spack config --scope site add -f spack.yaml
rm spack.yaml
spack env deactivate
spack install ucx@1.12.0.GPU%gcc@9.4.0 openmpi@4.1.1.GPU%gcc@9.4.0
spack install ucx@1.12.0.CPU%gcc@9.4.0 openmpi@4.1.1.CPU%gcc@9.4.0

cd "${cwd}"

# environments #################################################################
dev_tools=("autoconf" "automake" "libtool" "jq" "git")
dev_tools=("autoconf" "automake" "libtool" "jq")
ompi_gpu_hash=$(spack find --format "{hash}" openmpi@4.1.1.GPU)
ompi_cpu_hash=$(spack find --format "{hash}" openmpi@4.1.1.CPU)

spack env create grid-gpu
spack env activate grid-gpu
spack compiler find --scope site
spack add ${gcc_spec} ${cuda_spec} ${ucx_spec_gpu} ${openmpi_spec_gpu}
spack add ${hdf5_spec} ${fftw_spec}
spack add openssl gmp mpfr c-lime "${dev_tools[@]}"
spack concretize
spack env depfile -o Makefile.tmp
make -j128 -f Makefile.tmp
spack add ${gcc_spec} ${cuda_spec} "${dev_tools[@]}"
spack add ucx@1.12.0.GPU%gcc@9.4.0 openmpi@4.1.1.GPU%gcc@9.4.0
spack add ${hdf5_spec}+cxx+threadsafe ^/"${ompi_gpu_hash}"
spack add fftw ^/"${ompi_gpu_hash}"
spack add openssl gmp mpfr c-lime
spack install
spack env deactivate

spack env create grid-cpu
spack env activate grid-cpu
spack compiler find --scope site
spack add ${gcc_spec} ${ucx_spec_cpu} ${openmpi_spec_cpu}
spack add ${hdf5_spec} ${fftw_spec}
spack add openssl gmp mpfr c-lime "${dev_tools[@]}"
spack concretize
spack env depfile -o Makefile.tmp
make -j128 -f Makefile.tmp
spack add llvm "${dev_tools[@]}"
spack add ucx@1.12.0.CPU%gcc@9.4.0 openmpi@4.1.1.CPU%gcc@9.4.0
spack add ${hdf5_spec}+cxx+threadsafe ^/"${ompi_cpu_hash}"
spack add fftw ^/"${ompi_cpu_hash}"
spack add openssl gmp mpfr c-lime
spack install
spack env deactivate

# Final setup ##################################################################
spack clean
#spack gc -y # "spack gc" tends to get hung up for unknown reasons
spack gc -y

# add more environment variables in module loading
spack config --scope site add 'modules:prefix_inspections:lib:[LD_LIBRARY_PATH,LIBRARY_PATH]'
spack config --scope site add 'modules:prefix_inspections:lib64:[LD_LIBRARY_PATH,LIBRARY_PATH]'
spack config --scope site add 'modules:prefix_inspections:include:[C_INCLUDE_PATH,CPLUS_INCLUDE_PATH,INCLUDE]'
spack module tcl refresh -y

# permission change for group access
chmod -R g+rw "${dir}/spack/var/spack/cache"
setfacl -d -R -m g::rwX "${dir}/spack/var/spack/cache"
@ -1,14 +0,0 @@
{
  BasedOnStyle: LLVM,
  UseTab: Never,
  IndentWidth: 2,
  TabWidth: 2,
  BreakBeforeBraces: Allman,
  AllowShortIfStatementsOnASingleLine: false,
  IndentCaseLabels: false,
  ColumnLimit: 90,
  AccessModifierOffset: -4,
  NamespaceIndentation: All,
  FixNamespaceComments: false,
  SortIncludes: true,
}
@ -1,458 +0,0 @@
#include <algorithm>
#include <array>
#include <blas_quda.h>
#include <cassert>
#include <chrono>
#include <color_spinor_field.h>
#include <communicator_quda.h>
#include <dirac_quda.h>
#include <fstream>
#include <gauge_tools.h>
#include <memory>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

// remove to use QUDA's own flop counting instead of Grid's convention
#define FLOP_COUNTING_GRID

#include "json.hpp"
using nlohmann::json;
json json_results;

using namespace quda;

// thanks chatGPT :)
std::string get_timestamp()
{
  // Get the current time
  auto now = std::chrono::system_clock::now();

  // Convert the current time to a time_t object
  std::time_t currentTime = std::chrono::system_clock::to_time_t(now);

  // Format the time using std::put_time
  std::stringstream ss;
  ss << std::put_time(std::localtime(&currentTime), "%Y%m%d %H:%M:%S");

  return ss.str();
}

// This is the MPI grid, i.e. the layout of ranks
int nranks = -1;
std::array<int, 4> mpi_grid = {1, 1, 1, 1};

// run f() in a loop for roughly target_time seconds;
// returns the seconds per iteration it took
template <class F> double bench(F const &f, double target_time, int niter_warmup = 5)
{
  device_timer_t timer;
  timer.start();
  for (int iter = 0; iter < niter_warmup; ++iter)
    f();
  timer.stop();

  double secs = timer.last() / niter_warmup;
  int niter = std::max(1, int(target_time / secs));
  // niter = std::min(1000, niter);
  // printfQuda("during warmup took %f s/iter, deciding on %d iters\n", secs, niter);

  // important: each rank has its own timer, so their measurements can slightly vary. But
  // 'niter' needs to be consistent (bug took me a couple hours to track down)
  comm_broadcast_global(&niter, sizeof(niter), 0);

  timer.reset(__FUNCTION__, __FILE__, __LINE__);
  timer.start();
  for (int iter = 0; iter < niter; ++iter)
    f();
  timer.stop();
  return timer.last() / niter;
}

void initComms(int argc, char **argv)
{
  // init MPI communication
  MPI_Init(&argc, &argv);

  MPI_Comm_size(MPI_COMM_WORLD, &nranks);
  assert(1 <= nranks && nranks <= 100000);

  mpi_grid[3] = nranks;

  // this maps coordinates to rank number
  auto lex_rank_from_coords = [](int const *coords, void *)
  {
    int rank = coords[0];
    for (int i = 1; i < 4; i++)
      rank = mpi_grid[i] * rank + coords[i];
    return rank;
  };

  initCommsGridQuda(4, mpi_grid.data(), lex_rank_from_coords, nullptr);

  for (int d = 0; d < 4; d++)
    if (mpi_grid[d] > 1)
      commDimPartitionedSet(d);

  json_results["geometry"]["ranks"] = nranks;
  json_results["geometry"]["mpi"] = mpi_grid;
}
// creates a random gauge field. L = local(!) size
cudaGaugeField make_gauge_field(int L)
{
  GaugeFieldParam param;

  // dimension and type of the lattice object
  param.nDim = 4;
  param.x[0] = L;
  param.x[1] = L;
  param.x[2] = L;
  param.x[3] = L;

  // number of colors. potentially confusingly, QUDA sometimes uses the word "color" for
  // things unrelated to physical color. things like "nColor=32" do pop up in deflation
  // solvers where it (to my understanding) refers to the number of deflation vectors
  // processed in parallel.
  param.nColor = 3;

  // boundary conditions (don't really matter for a benchmark)
  param.t_boundary = QUDA_PERIODIC_T;

  // for this benchmark we only need "SINGLE" and/or "DOUBLE" precision. But smaller
  // precisions are available in QUDA too
  param.setPrecision(QUDA_SINGLE_PRECISION);

  // no even/odd subset, we want a full lattice
  param.siteSubset = QUDA_FULL_SITE_SUBSET;

  // what kind of 3x3 matrices the field contains. A proper gauge field has SU(3)
  // matrices, but (for example) smeared/thick links could have non-unitary links.
  param.link_type = QUDA_SU3_LINKS;

  // "NULL" does not initialize the field upon creation, "ZERO" would set everything to 0
  param.create = QUDA_NULL_FIELD_CREATE;

  // field should be allocated directly on the accelerator/GPU
  param.location = QUDA_CUDA_FIELD_LOCATION;

  // "reconstruct" here means reconstructing an SU(3) matrix from fewer than 18 real
  // numbers (= 3x3 complex numbers). Great feature in production (saving
  // memory/cache/network bandwidth), not used for this benchmark.
  param.reconstruct = QUDA_RECONSTRUCT_NO;

  // "ghostExchange" would often be called "halo exchange" outside of QUDA. This has
  // nothing to do with ghost fields from continuum/perturbative QCD.
  param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;

  // This controls the physical order of elements. "float2" is the default
  param.order = QUDA_FLOAT2_GAUGE_ORDER;

  // this means the field is a LORENTZ vector (which a gauge field must be). Has nothing
  // to do with spin.
  param.geometry = QUDA_VECTOR_GEOMETRY;

  // create the field and fill with random SU(3) matrices
  // std::cout << param << std::endl; // double-check parameters
  auto U = cudaGaugeField(param);
  gaugeGauss(U, /*seed=*/1234, 1.0);
  return U;
}

// create a random source vector (L = local size)
ColorSpinorField make_source(int L, int Ls = 1)
{
  // NOTE: `param.x` directly determines the size of the (local, per rank) memory
  // allocation. Thus for checkerboarding, we have to specify x=(L/2,L,L,L) to get a
  // physical local volume of L^4, thus implicitly choosing a dimension for the
  // checkerboarding (shouldn't really matter of course which one).
  ColorSpinorParam param;
  param.nColor = 3;
  param.nSpin = 4;
  param.nVec = 1; // only a single vector
  param.pad = 0;
  param.siteSubset = QUDA_PARITY_SITE_SUBSET;
  param.nDim = Ls == 1 ? 4 : 5;
  param.x[0] = L / 2;
  param.x[1] = L;
  param.x[2] = L;
  param.x[3] = L;
  param.x[4] = Ls;
  param.pc_type = QUDA_4D_PC;
  param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;

  // somewhat surprisingly, the DiracWilson::Dslash(...) function only works with the
  // UKQCD_GAMMA_BASIS
  param.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;

  param.create = QUDA_NULL_FIELD_CREATE; // do not (zero-) initialize the field
  param.setPrecision(QUDA_SINGLE_PRECISION);
  param.location = QUDA_CUDA_FIELD_LOCATION;

  // create the field and fill it with random values
  auto src = ColorSpinorField(param);
  quda::RNG rng(src, 1234);
  spinorNoise(src, rng, QUDA_NOISE_GAUSS);
  /*printfQuda(
      "created src with norm = %f (sanity check: should be close to %f) and %f bytes\n",
      blas::norm2(src), 2.0 * 12 * geom[0] * geom[1] * geom[2] * geom[3],
      src.Bytes() * 1.0);*/
  // src.PrintDims();

  return src;
}
void benchmark_wilson(std::vector<int> const &L_list, double target_time)
{
  printfQuda("==================== wilson dirac operator ====================\n");
#ifdef FLOP_COUNTING_GRID
  printfQuda("IMPORTANT: flop counting as in Benchmark_Grid\n");
#else
  printfQuda("IMPORTANT: flop counting by QUDA's own convention (different from "
             "Benchmark_Grid)\n");
#endif
  printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank");

  for (int L : L_list)
  {
    // printfQuda("starting wilson L=%d\n", L);

    auto U = make_gauge_field(L);
    auto src = make_source(L);

    // create (Wilson) dirac operator
    DiracParam param;
    param.kappa = 0.10;
    param.dagger = QUDA_DAG_NO;
    param.matpcType = QUDA_MATPC_EVEN_EVEN;
    auto dirac = DiracWilson(param);

    // insert gauge field into the dirac operator
    // (the additional nullptr's are for smeared links and fancy preconditioners and such.
    // Not used for simple Wilson fermions)
    dirac.updateFields(&U, nullptr, nullptr, nullptr);
    auto res = ColorSpinorField(ColorSpinorParam(src));
    auto f = [&]() { dirac.Dslash(res, src, QUDA_EVEN_PARITY); };

    // first run to get the quda tuning out of the way
    dirac.Flops(); // reset flops counter
    f();
    double flops = 1.0 * dirac.Flops();

    // actual benchmarking
    auto start_time = get_timestamp();
    double secs = bench(f, target_time);
    auto end_time = get_timestamp();

#ifdef FLOP_COUNTING_GRID
    // this is the flop counting from Benchmark_Grid
    double Nc = 3;
    double Nd = 4;
    double Ns = 4;
    flops = (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2);
    flops *= L * L * L * L / 2.0;
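    // quick sanity check of the formula above: with Nc = 3, Ns = 4, Nd = 4 it gives
    //   3 * (6 + 2 * 8) * 4 * 4 + 2 * 4 * 3 * 4 + 2 * 4 * 3 * 4 * 2 = 1056 + 96 + 192
    //   = 1344 flop per site, multiplied by the L^4 / 2 even-parity sites per rank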
#endif

    printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9);

    json tmp;
    tmp["L"] = L;
    tmp["Gflops_wilson"] = flops / secs * 1e-9;
    tmp["start_time"] = start_time;
    tmp["end_time"] = end_time;
    json_results["flops"]["results"].push_back(tmp);
  }
}

void benchmark_dwf(std::vector<int> const &L_list, double target_time)
{
  printfQuda("==================== domain wall dirac operator ====================\n");
#ifdef FLOP_COUNTING_GRID
  printfQuda("IMPORTANT: flop counting as in Benchmark_Grid\n");
#else
  printfQuda("IMPORTANT: flop counting by QUDA's own convention (different from "
             "Benchmark_Grid)\n");
#endif
  printfQuda("%5s %15s %15s\n", "L", "time (usec)", "Gflop/s/rank");
  int Ls = 12;
  for (int L : L_list)
  {
    // printfQuda("starting dwf L=%d\n", L);
    auto U = make_gauge_field(L);
    auto src = make_source(L, Ls);

    // create dirac operator
    DiracParam param;
    param.kappa = 0.10;
    param.Ls = Ls;
    param.m5 = 0.1;
    param.dagger = QUDA_DAG_NO;
    param.matpcType = QUDA_MATPC_EVEN_EVEN;
    auto dirac = DiracDomainWall(param);

    // insert gauge field into the dirac operator
    // (the additional nullptr's are for smeared links and fancy preconditioners and such)
    dirac.updateFields(&U, nullptr, nullptr, nullptr);
    auto res = ColorSpinorField(ColorSpinorParam(src));
    auto f = [&]() { dirac.Dslash(res, src, QUDA_EVEN_PARITY); };

    // first run to get the quda tuning out of the way
    dirac.Flops(); // reset flops counter
    f();
    double flops = 1.0 * dirac.Flops();

    // actual benchmarking
    auto start_time = get_timestamp();
    double secs = bench(f, target_time);
    auto end_time = get_timestamp();

#ifdef FLOP_COUNTING_GRID
    // this is the flop counting from Benchmark_Grid
    double Nc = 3;
    double Nd = 4;
    double Ns = 4;
    flops = (Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2);
    flops *= L * L * L * L * Ls / 2.0;
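    // same per-site count as the Wilson operator above (1344 flop for Nc = 3, Ns = 4,
    // Nd = 4), here applied to all Ls slices of the L^4 / 2 even-parity sites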
#endif

    printfQuda("%5d %15.2f %15.2f\n", L, secs * 1e6, flops / secs * 1e-9);
    json tmp;
    tmp["L"] = L;
    tmp["Gflops_dwf4"] = flops / secs * 1e-9;
    tmp["start_time"] = start_time;
    tmp["end_time"] = end_time;
    json_results["flops"]["results"].push_back(tmp);
  }
}

void benchmark_axpy(std::vector<int> const &L_list, double target_time)
{
  // number of iterations for warmup / measurement
  // (feel free to change for noise/time tradeoff)
  constexpr int niter_warmup = 5;

  printfQuda("==================== axpy / memory ====================\n");

  ColorSpinorParam param;
  param.nDim = 4;   // 4-dimensional lattice
  param.x[4] = 1;   // no fifth dimension
  param.nColor = 3; // supported values for nSpin/nColor are configured when compiling
                    // QUDA. "3*4" will probably always be enabled, so we stick with this
  param.nSpin = 4;
  param.nVec = 1;                            // just a single vector
  param.siteSubset = QUDA_FULL_SITE_SUBSET;  // full lattice = no odd/even
  param.pad = 0;                             // no padding
  param.create = QUDA_NULL_FIELD_CREATE;     // do not (zero-) initialize the field
  param.location = QUDA_CUDA_FIELD_LOCATION; // field should reside on GPU
  param.setPrecision(QUDA_SINGLE_PRECISION);

  // the following don't matter for an axpy benchmark, but we need to choose something
  param.pc_type = QUDA_4D_PC;
  param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
  param.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;

  printfQuda("%5s %15s %15s %15s %15s\n", "L", "size (MiB/rank)", "time (usec)",
             "GiB/s/rank", "Gflop/s/rank");
  for (int L : L_list)
  {
    // printfQuda("starting axpy L=%d\n", L);
    // IMPORTANT: all of `param.x`, `field_elements`, `field.Bytes()`
    // are LOCAL, i.e. per rank / per GPU

    param.x[0] = L;
    param.x[1] = L;
    param.x[2] = L;
    param.x[3] = L;

    // number of (real) elements in one (local) field
    size_t field_elements = 2 * param.x[0] * param.x[1] * param.x[2] * param.x[3] *
                            param.nColor * param.nSpin;

    // create the field(s)
    auto fieldA = ColorSpinorField(param);
    auto fieldB = ColorSpinorField(param);
    assert(fieldA.Bytes() == sizeof(float) * field_elements); // sanity check
    assert(fieldB.Bytes() == sizeof(float) * field_elements); // sanity check

    // fill fields with random values
    quda::RNG rng(fieldA, 1234);
    spinorNoise(fieldA, rng, QUDA_NOISE_GAUSS);
    spinorNoise(fieldB, rng, QUDA_NOISE_GAUSS);

    // number of operations / bytes per iteration
    // axpy is one addition, one multiplication, two reads, one write
    double flops = 2 * field_elements;
    double memory = 3 * sizeof(float) * field_elements;
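    // sanity check: 2 flop for every 3 * sizeof(float) = 12 bytes moved, i.e. an
    // arithmetic intensity of ~0.17 flop/byte, so this kernel is memory-bandwidth bound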

    auto f = [&]() { blas::axpy(1.234, fieldA, fieldB); };

    // first run to get the quda tuning out of the way
    f();

    // actual benchmarking
    auto start_time = get_timestamp();
    double secs = bench(f, target_time);
    auto end_time = get_timestamp();

    double mem_MiB = memory / 1024. / 1024.;
    double GBps = mem_MiB / 1024 / secs;
    printfQuda("%5d %15.2f %15.2f %15.2f %15.2f\n", L, mem_MiB, secs * 1e6, GBps,
               flops / secs * 1e-9);

    json tmp;
    tmp["L"] = L;
    tmp["size_MB"] = mem_MiB;
    tmp["GBps"] = GBps;
    tmp["GFlops"] = flops / secs * 1e-9;
    tmp["start_time"] = start_time;
    tmp["end_time"] = end_time;
    json_results["axpy"].push_back(tmp);
  }
}

int main(int argc, char **argv)
{
  std::string json_filename = ""; // empty indicates no json output
  for (int i = 0; i + 1 < argc; i++)
  {
    if (std::string(argv[i]) == "--json-out")
      json_filename = argv[i + 1];
  }

  initComms(argc, argv);

  initQuda(-1); // -1 for multi-gpu. otherwise this selects the device to be used

  // verbosity options are:
  // SILENT, SUMMARIZE, VERBOSE, DEBUG_VERBOSE
  setVerbosity(QUDA_SUMMARIZE);

  printfQuda("MPI layout = %d %d %d %d\n", mpi_grid[0], mpi_grid[1], mpi_grid[2],
             mpi_grid[3]);

  benchmark_axpy({8, 12, 16, 24, 32, 48}, 1.0);

  setVerbosity(QUDA_SILENT);
  benchmark_wilson({8, 12, 16, 24, 32, 48}, 1.0);
  benchmark_dwf({8, 12, 16, 24, 32}, 1.0);
  setVerbosity(QUDA_SUMMARIZE);

  printfQuda("==================== done with all benchmarks ====================\n");

  if (!json_filename.empty())
  {
    printfQuda("writing benchmark results to %s\n", json_filename.c_str());

    int me = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    if (me == 0)
    {
      std::ofstream json_file(json_filename);
      json_file << std::setw(2) << json_results;
    }
  }

  endQuda();
  quda::comm_finalize();
  MPI_Finalize();
}
@ -1,30 +0,0 @@
# QUDA benchmarks

This folder contains benchmarks for the [QUDA](https://github.com/lattice/quda) library.

- `Benchmark_Quda`: measures the floating-point performance of fermion matrices
  (Wilson and DWF), as well as memory bandwidth (using a simple `axpy` operation).
  Measurements are performed for a fixed range of problem sizes.

## Building
After setting up your compilation environment (Tursa: `source /home/dp207/dp207/shared/env/production/env-{base,gpu}.sh`):
```bash
./build-quda.sh <env_dir>      # build QUDA
./build-benchmark.sh <env_dir> # build benchmark
```
where `<env_dir>` is an arbitrary directory where every product will be stored.

## Running the Benchmark
The benchmark should be run as
```bash
mpirun -np <ranks> <env_dir>/prefix/qudabench/Benchmark_Quda
```
where `<ranks>` is the total number of GPUs to use. On Tursa this is 4 times the number of nodes.

Note:
- on Tursa, the `wrapper.sh` script that is typically used with Grid is not necessary.
- due to QUDA's automatic tuning, the benchmark might take significantly longer to run than `Benchmark_Grid` (even though it does fewer things).
- setting `QUDA_ENABLE_TUNING=0` disables all tuning (degrades performance severely). By default, it is turned on.
- setting `QUDA_RESOURCE_PATH=<some folder>` enables QUDA to save and reuse optimal tuning parameters, making repeated runs much faster (see the sketch below).
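As a minimal sketch (the `tuning` directory name, the rank count, and the output file name are examples, not something the build scripts create for you), a run that caches tuning parameters between invocations could look like:
```bash
# reuse autotuning results across runs (directory name is an arbitrary choice)
mkdir -p tuning
export QUDA_RESOURCE_PATH=$(pwd)/tuning
mpirun -np 16 <env_dir>/prefix/qudabench/Benchmark_Quda --json-out result.json
```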
@ -1,32 +0,0 @@
#!/usr/bin/env bash
# shellcheck disable=SC1090,SC1091

set -euo pipefail

if (( $# != 1 )); then
  echo "usage: $(basename "$0") <environment directory>" 1>&2
  exit 1
fi
env_dir=$1

# TODO: this is Tursa specific. I have not figured out the correct way to do this.
EXTRA_LIBS="/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/stubs/libcuda.so /home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/stubs/libnvidia-ml.so"

# NOTE: these flags need to be in sync with QUDA's compilation options (see build-quda.sh)
BUILD_FLAGS="-O3 -std=c++17 -DMPI_COMMS -DMULTI_GPU -DQUDA_PRECISION=12 -DQUDA_RECONSTRUCT=4"

call_dir=$(pwd -P)
script_dir="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")"
cd "${env_dir}"
env_dir=$(pwd -P)
cd "${call_dir}"
BUILD_DIR="${env_dir}/build/Quda-benchmarks"
PREFIX_DIR="${env_dir}/prefix/qudabench"
QUDA_DIR=${env_dir}/prefix/quda
mkdir -p "${BUILD_DIR}"
mkdir -p "${PREFIX_DIR}"

LINK_FLAGS="-Wl,-rpath,$QUDA_DIR/lib: $QUDA_DIR/lib/libquda.so $EXTRA_LIBS -lpthread -lmpi"

g++ $BUILD_FLAGS -I$QUDA_DIR/include/targets/cuda -I$QUDA_DIR/include -c -o $BUILD_DIR/Benchmark_Quda.o $script_dir/Benchmark_Quda.cpp
g++ -g -O3 $BUILD_DIR/Benchmark_Quda.o -o $PREFIX_DIR/Benchmark_Quda $LINK_FLAGS -lmpi
@ -1,36 +0,0 @@
#!/usr/bin/env bash
# shellcheck disable=SC1090,SC1091

BUILD_FLAGS="-O3 -std=c++17"
QUDA_FLAGS="-DQUDA_MPI=ON -DQUDA_PRECISION=14 -DQUDA_RECONSTRUCT=4 -DQUDA_GPU_ARCH=sm_80"

set -euo pipefail

if (( $# != 1 )); then
  echo "usage: $(basename "$0") <environment directory>" 1>&2
  exit 1
fi
env_dir=$1

call_dir=$(pwd -P)
mkdir -p ${env_dir}
cd "${env_dir}"
env_dir=$(pwd -P)
cd "${call_dir}"

build_dir="${env_dir}/build/quda"
if [ -d "${build_dir}" ]; then
  echo "error: directory '${build_dir}' exists"
  exit 1
fi
mkdir -p "${build_dir}"

git clone https://github.com/lattice/quda.git "${build_dir}"
cd "${build_dir}"

mkdir build; cd build
cmake .. $QUDA_FLAGS -DCMAKE_INSTALL_PREFIX=${env_dir}/prefix/quda
make -j128
make install

cd "${call_dir}"
Quda/env.sh
@ -1,21 +0,0 @@
module load gcc/9.3.0
module load cuda/11.4.1
module load openmpi/4.1.1-cuda11.4

export QUDA_RESOURCE_PATH=$(pwd)/tuning
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx # select the UCX point-to-point messaging layer (PML)
#export UCX_TLS=rc,rc_x,sm,cuda_copy,cuda_ipc,gdr_copy
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_THRESH=16384
export UCX_RNDV_SCHEME=put_zcopy
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n

export OMPI_MCA_io=romio321
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3

export QUDA_REORDER_LOCATION=GPU # this is the default anyway