Merge pull request 'main' (#2) from portelli/lattice-benchmarks:main into main

Reviewed-on: #2
Author: Fabian Joswig
Date:   2023-01-27 16:35:48 +00:00
Commit: 2efbad5f70
28 changed files with 1831 additions and 1224 deletions

.gitignore
@@ -1,5 +1,4 @@
 /.vscode
-build*
 .buildutils
 autom4te.cache
 config.*
@@ -12,4 +11,7 @@ install-sh
 missing
 Makefile.in
 .DS_Store
 *~
+/*/env
+/*/build
+/Grid/json.hpp

Grid/.clang-format (new file)
@@ -0,0 +1,14 @@
+{
+BasedOnStyle: LLVM,
+UseTab: Never,
+IndentWidth: 2,
+TabWidth: 2,
+BreakBeforeBraces: Allman,
+AllowShortIfStatementsOnASingleLine: false,
+IndentCaseLabels: false,
+ColumnLimit: 90,
+AccessModifierOffset: -4,
+NamespaceIndentation: All,
+FixNamespaceComments: false,
+SortIncludes: true,
+}
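As an illustration (our own fragment, not part of the commit): under these settings clang-format produces Allman-brace, two-space-indented code capped at 90 columns, and even short if statements keep their braces on separate lines.

// Illustrative only: a made-up fragment formatted to match the options above
// (BreakBeforeBraces: Allman, IndentWidth: 2,
//  AllowShortIfStatementsOnASingleLine: false).
int volInd(const int l)
{
  if (l < 0)
  {
    return -1; // short ifs are still broken onto separate braced lines
  }
  return l / 2;
}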

Grid/Benchmark_Grid.cpp (new file, 1054 lines)

(File diff suppressed because it is too large.)

Grid/Benchmark_IO.cpp

@@ -32,23 +32,13 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 #ifdef HAVE_LIME
 using namespace Grid;
 
-std::string filestem(const int l)
-{
-  return "iobench_l" + std::to_string(l);
-}
+std::string filestem(const int l) { return "iobench_l" + std::to_string(l); }
 
-int vol(const int i)
-{
-  return BENCH_IO_LMIN + 2 * i;
-}
+int vol(const int i) { return BENCH_IO_LMIN + 2 * i; }
 
-int volInd(const int l)
-{
-  return (l - BENCH_IO_LMIN) / 2;
-}
+int volInd(const int l) { return (l - BENCH_IO_LMIN) / 2; }
 
-template <typename Mat>
-void stats(Mat &mean, Mat &stdDev, const std::vector<Mat> &data)
+template <typename Mat> void stats(Mat &mean, Mat &stdDev, const std::vector<Mat> &data)
 {
   auto nr = data[0].rows(), nc = data[0].cols();
   Eigen::MatrixXd sqSum(nr, nc);
@@ -66,11 +56,11 @@ void stats(Mat &mean, Mat &stdDev, const std::vector<Mat> &data)
   mean /= n;
 }
 
 #define grid_printf(...) \
   { \
     char _buf[1024]; \
     sprintf(_buf, __VA_ARGS__); \
    MSG << _buf; \
   }
 
 enum
@@ -173,47 +163,49 @@ int main(int argc, char **argv)
   MSG << "SUMMARY" << std::endl;
   MSG << BIGSEP << std::endl;
   MSG << "Summary of individual results (all results in MB/s)." << std::endl;
-  MSG << "Every second colum gives the standard deviation of the previous column." << std::endl;
+  MSG << "Every second colum gives the standard deviation of the previous column."
+      << std::endl;
   MSG << std::endl;
-  grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n",
-              "L", "std read", "std dev", "std write", "std dev",
-              "Grid read", "std dev", "Grid write", "std dev");
+  grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n", "L", "std read", "std dev",
+              "std write", "std dev", "Grid read", "std dev", "Grid write", "std dev");
   for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
   {
-    grid_printf("%4d %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n",
-                l, mean(volInd(l), sRead), stdDev(volInd(l), sRead),
-                mean(volInd(l), sWrite), stdDev(volInd(l), sWrite),
-                mean(volInd(l), gRead), stdDev(volInd(l), gRead),
-                mean(volInd(l), gWrite), stdDev(volInd(l), gWrite));
+    grid_printf("%4d %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", l,
+                mean(volInd(l), sRead), stdDev(volInd(l), sRead), mean(volInd(l), sWrite),
+                stdDev(volInd(l), sWrite), mean(volInd(l), gRead),
+                stdDev(volInd(l), gRead), mean(volInd(l), gWrite),
+                stdDev(volInd(l), gWrite));
   }
   MSG << std::endl;
-  MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)" << std::endl;
+  MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)"
+      << std::endl;
   MSG << std::endl;
-  grid_printf("%4s %12s %12s %12s %12s\n",
-              "L", "std read", "std write", "Grid read", "Grid write");
+  grid_printf("%4s %12s %12s %12s %12s\n", "L", "std read", "std write", "Grid read",
+              "Grid write");
   for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
   {
-    grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n",
-                l, rob(volInd(l), sRead), rob(volInd(l), sWrite),
-                rob(volInd(l), gRead), rob(volInd(l), gWrite));
+    grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n", l, rob(volInd(l), sRead),
+                rob(volInd(l), sWrite), rob(volInd(l), gRead), rob(volInd(l), gWrite));
   }
   MSG << std::endl;
-  MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX << "^4 (all results in MB/s)." << std::endl;
-  MSG << "Every second colum gives the standard deviation of the previous column." << std::endl;
+  MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX
+      << "^4 (all results in MB/s)." << std::endl;
+  MSG << "Every second colum gives the standard deviation of the previous column."
+      << std::endl;
   MSG << std::endl;
-  grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n",
-              "std read", "std dev", "std write", "std dev",
-              "Grid read", "std dev", "Grid write", "std dev");
-  grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n",
-              avMean(sRead), avStdDev(sRead), avMean(sWrite), avStdDev(sWrite),
-              avMean(gRead), avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
+  grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n", "std read", "std dev",
+              "std write", "std dev", "Grid read", "std dev", "Grid write", "std dev");
+  grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", avMean(sRead),
+              avStdDev(sRead), avMean(sWrite), avStdDev(sWrite), avMean(gRead),
+              avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
   MSG << std::endl;
-  MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)" << std::endl;
+  MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)"
+      << std::endl;
   MSG << std::endl;
-  grid_printf("%12s %12s %12s %12s\n",
-              "std read", "std write", "Grid read", "Grid write");
-  grid_printf("%12.1f %12.1f %12.1f %12.1f\n",
-              avRob(sRead), avRob(sWrite), avRob(gRead), avRob(gWrite));
+  grid_printf("%12s %12s %12s %12s\n", "std read", "std write", "Grid read",
+              "Grid write");
+  grid_printf("%12.1f %12.1f %12.1f %12.1f\n", avRob(sRead), avRob(sWrite), avRob(gRead),
+              avRob(gWrite));
 
   Grid_finalize();
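A note on the metric: the "robustness" printed by this summary is, as the code's own label says, the relative spread subtracted from 100%,

    rob = 100% x (1 - stddev / mean)

so a perfectly reproducible bandwidth measurement scores 100%, and larger run-to-run scatter pushes the score down.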

Grid/Benchmark_IO.hpp

@@ -20,9 +20,9 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 #include <Grid/Grid.h>
 
 #define MSG std::cout << GridLogMessage
 #define SEP \
   "-----------------------------------------------------------------------------"
 #define BIGSEP \
   "============================================================================="
 
 #ifdef HAVE_LIME
@@ -36,16 +36,15 @@ namespace Grid
   // AP 06/10/2020: Standard C version in case one is suspicious of the C++ API
   //
-  // template <typename Field>
-  // void stdWrite(const std::string filestem, Field &vec)
+  // template <typename Field> void stdWrite(const std::string filestem, Field &vec)
   // {
   //   std::string rankStr = std::to_string(vec.Grid()->ThisRank());
   //   std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "wb");
   //   size_t size;
   //   uint32_t crc;
   //   GridStopWatch ioWatch, crcWatch;
-  //   size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
+  //   size = vec.Grid()->lSites() * sizeof(typename Field::scalar_object);
   //   autoView(vec_v, vec, CpuRead);
   //   crcWatch.Start();
   //   crc = GridChecksum::crc32(vec_v.cpu_ptr, size);
@@ -53,36 +52,39 @@ namespace Grid
   //   crcWatch.Stop();
   //   MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
   //   ioWatch.Start();
-  //   std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file);
+  //   std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object),
+  //   vec.Grid()->lSites(), file);
   //   ioWatch.Stop();
   //   std::fclose(file);
   //   size *= vec.Grid()->ProcessorCount();
   //   auto &p = BinaryIO::lastPerf;
   //   p.size = size;
   //   p.time = ioWatch.useconds();
-  //   p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
-  //   MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
-  //       << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
+  //   p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
+  //   MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
+  //   << ",
+  //   "
+  //   << p.mbytesPerSecond << " MB/s" << std::endl;
   //   MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
   // }
   //
-  // template <typename Field>
-  // void stdRead(Field &vec, const std::string filestem)
+  // template <typename Field> void stdRead(Field &vec, const std::string filestem)
   // {
   //   std::string rankStr = std::to_string(vec.Grid()->ThisRank());
   //   std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "rb");
   //   size_t size;
   //   uint32_t crcRead, crcData;
   //   GridStopWatch ioWatch, crcWatch;
-  //   size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
+  //   size = vec.Grid()->lSites() * sizeof(typename Field::scalar_object);
   //   crcWatch.Start();
   //   std::fread(&crcRead, sizeof(uint32_t), 1, file);
   //   crcWatch.Stop();
   //   {
   //     autoView(vec_v, vec, CpuWrite);
   //     ioWatch.Start();
-  //     std::fread(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file);
+  //     std::fread(vec_v.cpu_ptr, sizeof(typename Field::scalar_object),
+  //     vec.Grid()->lSites(), file);
   //     ioWatch.Stop();
   //     std::fclose(file);
   //   }
@@ -96,19 +98,19 @@ namespace Grid
   //   assert(crcData == crcRead);
   //   size *= vec.Grid()->ProcessorCount();
   //   auto &p = BinaryIO::lastPerf;
   //   p.size = size;
   //   p.time = ioWatch.useconds();
-  //   p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
-  //   MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed()
-  //       << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
+  //   p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
+  //   MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() << ", "
+  //       << p.mbytesPerSecond << " MB/s" << std::endl;
   //   MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
   // }
 
-  template <typename Field>
-  void stdWrite(const std::string filestem, Field &vec)
+  template <typename Field> void stdWrite(const std::string filestem, Field &vec)
   {
     std::string rankStr = std::to_string(vec.Grid()->ThisRank());
-    std::ofstream file(filestem + "." + rankStr + ".bin", std::ios::out | std::ios::binary);
+    std::ofstream file(filestem + "." + rankStr + ".bin",
+                       std::ios::out | std::ios::binary);
     size_t size, sizec;
     uint32_t crc;
     GridStopWatch ioWatch, crcWatch;
@@ -130,16 +132,16 @@ namespace Grid
     p.size = size;
     p.time = ioWatch.useconds();
     p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
-    MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
-        << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
+    MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() << ", "
+        << p.mbytesPerSecond << " MB/s" << std::endl;
     MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
   }
 
-  template <typename Field>
-  void stdRead(Field &vec, const std::string filestem)
+  template <typename Field> void stdRead(Field &vec, const std::string filestem)
   {
     std::string rankStr = std::to_string(vec.Grid()->ThisRank());
-    std::ifstream file(filestem + "." + rankStr + ".bin", std::ios::in | std::ios::binary);
+    std::ifstream file(filestem + "." + rankStr + ".bin",
+                       std::ios::in | std::ios::binary);
     size_t size, sizec;
     uint32_t crcRead, crcData;
     GridStopWatch ioWatch, crcWatch;
@@ -168,13 +170,12 @@ namespace Grid
     p.size = size;
     p.time = ioWatch.useconds();
     p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
-    MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed()
-        << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
+    MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() << ", "
+        << p.mbytesPerSecond << " MB/s" << std::endl;
     MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
   }
 
-  template <typename Field>
-  void limeWrite(const std::string filestem, Field &vec)
+  template <typename Field> void limeWrite(const std::string filestem, Field &vec)
   {
     emptyUserRecord record;
     ScidacWriter binWriter(vec.Grid()->IsBoss());
@@ -184,8 +185,7 @@ namespace Grid
     binWriter.close();
   }
 
-  template <typename Field>
-  void limeRead(Field &vec, const std::string filestem)
+  template <typename Field> void limeRead(Field &vec, const std::string filestem)
   {
     emptyUserRecord record;
     ScidacReader binReader;
@@ -225,12 +225,13 @@ namespace Grid
   template <typename Field>
   void writeBenchmark(const Coordinate &latt, const std::string filename,
-                      const WriterFn<Field> &write,
-                      const unsigned int Ls = 1, const bool rb = false)
+                      const WriterFn<Field> &write, const unsigned int Ls = 1,
+                      const bool rb = false)
   {
     auto mpi = GridDefaultMpi();
     auto simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd());
-    std::shared_ptr<GridCartesian> gBasePt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi));
+    std::shared_ptr<GridCartesian> gBasePt(
+        SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi));
     std::shared_ptr<GridBase> gPt;
 
     std::random_device rd;
@@ -251,12 +252,13 @@ namespace Grid
   template <typename Field>
   void readBenchmark(const Coordinate &latt, const std::string filename,
-                     const ReaderFn<Field> &read,
-                     const unsigned int Ls = 1, const bool rb = false)
+                     const ReaderFn<Field> &read, const unsigned int Ls = 1,
+                     const bool rb = false)
   {
     auto mpi = GridDefaultMpi();
     auto simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd());
-    std::shared_ptr<GridCartesian> gBasePt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi));
+    std::shared_ptr<GridCartesian> gBasePt(
+        SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi));
     std::shared_ptr<GridBase> gPt;
     makeGrid(gPt, gBasePt, Ls, rb);
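All four benchmark functions in this header derive their bandwidth figure the same way, from the byte count and the stopwatch's microseconds. As a standalone sketch (the helper name is ours, not the header's):

// Sketch of the MB/s figure computed by stdWrite/stdRead/limeWrite/limeRead:
// bytes are converted to MiB and microseconds to seconds before dividing.
#include <cstdint>

double mbytesPerSecond(std::uint64_t sizeBytes, double usec)
{
  return sizeBytes / 1024. / 1024. / (usec / 1.e6);
}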

Grid/Benchmark_ITT.cpp (deleted; the full file follows)

@@ -1,801 +0,0 @@
/*
Copyright © 2015 Peter Boyle <paboyle@ph.ed.ac.uk>
Copyright © 2022 Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <Grid/Grid.h>
using namespace Grid;
std::vector<int> L_list;
std::vector<int> Ls_list;
std::vector<double> mflop_list;
double mflop_ref;
double mflop_ref_err;
int NN_global;
struct time_statistics
{
double mean;
double err;
double min;
double max;
void statistics(std::vector<double> v)
{
double sum = std::accumulate(v.begin(), v.end(), 0.0);
mean = sum / v.size();
std::vector<double> diff(v.size());
std::transform(v.begin(), v.end(), diff.begin(), [=](double x)
{ return x - mean; });
double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
err = std::sqrt(sq_sum / (v.size() * (v.size() - 1)));
auto result = std::minmax_element(v.begin(), v.end());
min = *result.first;
max = *result.second;
}
};
void comms_header()
{
std::cout << GridLogMessage << " L "
<< "\t"
<< " Ls "
<< "\t"
<< "bytes\t MB/s uni (err/min/max) \t\t MB/s bidi (err/min/max)" << std::endl;
};
Gamma::Algebra Gmu[] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT};
struct controls
{
int Opt;
int CommsOverlap;
Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch;
};
class Benchmark
{
public:
static void Decomposition(void)
{
int threads = GridThread::GetThreads();
std::cout << GridLogMessage << "==================================================================================" << std::endl;
std::cout << GridLogMessage << "= Grid is setup to use " << threads << " threads" << std::endl;
std::cout << GridLogMessage << "==================================================================================" << std::endl;
std::cout << GridLogMessage << "Grid Default Decomposition patterns\n";
std::cout << GridLogMessage << "\tOpenMP threads : " << GridThread::GetThreads() << std::endl;
std::cout << GridLogMessage << "\tMPI tasks : " << GridCmdVectorIntToString(GridDefaultMpi()) << std::endl;
std::cout << GridLogMessage << "\tvReal : " << sizeof(vReal) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vReal::Nsimd())) << std::endl;
std::cout << GridLogMessage << "\tvRealF : " << sizeof(vRealF) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vRealF::Nsimd())) << std::endl;
std::cout << GridLogMessage << "\tvRealD : " << sizeof(vRealD) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vRealD::Nsimd())) << std::endl;
std::cout << GridLogMessage << "\tvComplex : " << sizeof(vComplex) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vComplex::Nsimd())) << std::endl;
std::cout << GridLogMessage << "\tvComplexF : " << sizeof(vComplexF) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vComplexF::Nsimd())) << std::endl;
std::cout << GridLogMessage << "\tvComplexD : " << sizeof(vComplexD) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vComplexD::Nsimd())) << std::endl;
std::cout << GridLogMessage << "==================================================================================" << std::endl;
}
static void Comms(void)
{
int Nloop = 200;
int nmu = 0;
int maxlat = 32;
Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
for (int mu = 0; mu < Nd; mu++)
if (mpi_layout[mu] > 1)
nmu++;
std::vector<double> t_time(Nloop);
time_statistics timestat;
std::cout << GridLogMessage << "====================================================================================================" << std::endl;
std::cout << GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in " << nmu << " dimensions" << std::endl;
std::cout << GridLogMessage << "====================================================================================================" << std::endl;
comms_header();
for (int lat = 16; lat <= maxlat; lat += 8)
{
// for(int Ls=8;Ls<=8;Ls*=2){
{
int Ls = 12;
Coordinate latt_size({lat * mpi_layout[0],
lat * mpi_layout[1],
lat * mpi_layout[2],
lat * mpi_layout[3]});
GridCartesian Grid(latt_size, simd_layout, mpi_layout);
RealD Nrank = Grid._Nprocessors;
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank / Nnode;
std::vector<HalfSpinColourVectorD *> xbuf(8);
std::vector<HalfSpinColourVectorD *> rbuf(8);
// Grid.ShmBufferFreeAll();
uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD);
for (int d = 0; d < 8; d++)
{
xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
// bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
// bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
}
// int ncomm;
double dbytes;
for (int dir = 0; dir < 8; dir++)
{
int mu = dir % 4;
if (mpi_layout[mu] > 1)
{
std::vector<double> times(Nloop);
for (int i = 0; i < Nloop; i++)
{
dbytes = 0;
double start = usecond();
int xmit_to_rank;
int recv_from_rank;
if (dir == mu)
{
int comm_proc = 1;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
}
else
{
int comm_proc = mpi_layout[mu] - 1;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
}
Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
(void *)&rbuf[dir][0], recv_from_rank,
bytes);
dbytes += bytes;
double stop = usecond();
t_time[i] = stop - start; // microseconds
}
timestat.statistics(t_time);
dbytes = dbytes * ppn;
double xbytes = dbytes * 0.5;
double bidibytes = dbytes;
std::cout << GridLogMessage << lat << "\t" << Ls << "\t "
<< bytes << " \t "
<< xbytes / timestat.mean << " \t " << xbytes * timestat.err / (timestat.mean * timestat.mean) << " \t "
<< xbytes / timestat.max << " " << xbytes / timestat.min
<< "\t\t" << bidibytes / timestat.mean << " " << bidibytes * timestat.err / (timestat.mean * timestat.mean) << " "
<< bidibytes / timestat.max << " " << bidibytes / timestat.min << std::endl;
}
}
for (int d = 0; d < 8; d++)
{
acceleratorFreeDevice(xbuf[d]);
acceleratorFreeDevice(rbuf[d]);
}
}
}
return;
}
static void Memory(void)
{
const int Nvec = 8;
typedef Lattice<iVector<vReal, Nvec>> LatticeVec;
typedef iVector<vReal, Nvec> Vec;
Coordinate simd_layout = GridDefaultSimd(Nd, vReal::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
std::cout << GridLogMessage << "==================================================================================" << std::endl;
std::cout << GridLogMessage << "= Benchmarking a*x + y bandwidth" << std::endl;
std::cout << GridLogMessage << "==================================================================================" << std::endl;
std::cout << GridLogMessage << " L "
<< "\t\t"
<< "bytes"
<< "\t\t\t"
<< "GB/s"
<< "\t\t"
<< "Gflop/s"
<< "\t\t seconds"
<< "\t\tGB/s / node" << std::endl;
std::cout << GridLogMessage << "----------------------------------------------------------" << std::endl;
// uint64_t NP;
uint64_t NN;
uint64_t lmax = 32;
#define NLOOP (1000 * lmax * lmax * lmax * lmax / lat / lat / lat / lat)
GridSerialRNG sRNG;
sRNG.SeedFixedIntegers(std::vector<int>({45, 12, 81, 9}));
for (int lat = 8; lat <= lmax; lat += 8)
{
Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2], lat * mpi_layout[3]});
int64_t vol = latt_size[0] * latt_size[1] * latt_size[2] * latt_size[3];
GridCartesian Grid(latt_size, simd_layout, mpi_layout);
// NP= Grid.RankCount();
NN = Grid.NodeCount();
Vec rn;
random(sRNG, rn);
LatticeVec z(&Grid);
z = Zero();
LatticeVec x(&Grid);
x = Zero();
LatticeVec y(&Grid);
y = Zero();
double a = 2.0;
uint64_t Nloop = NLOOP;
double start = usecond();
for (int i = 0; i < Nloop; i++)
{
z = a * x - y;
}
double stop = usecond();
double time = (stop - start) / Nloop * 1000;
double flops = vol * Nvec * 2; // mul,add
double bytes = 3.0 * vol * Nvec * sizeof(Real);
std::cout << GridLogMessage << std::setprecision(3)
<< lat << "\t\t" << bytes << " \t\t" << bytes / time << "\t\t" << flops / time << "\t\t" << (stop - start) / 1000. / 1000.
<< "\t\t" << bytes / time / NN << std::endl;
}
};
static void SU4(void)
{
const int Nc4 = 4;
typedef Lattice<iMatrix<vComplexF, Nc4>> LatticeSU4;
Coordinate simd_layout = GridDefaultSimd(Nd, vComplexF::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
std::cout << GridLogMessage << "==================================================================================" << std::endl;
std::cout << GridLogMessage << "= Benchmarking z = y*x SU(4) bandwidth" << std::endl;
std::cout << GridLogMessage << "==================================================================================" << std::endl;
std::cout << GridLogMessage << " L "
<< "\t\t"
<< "bytes"
<< "\t\t\t"
<< "GB/s"
<< "\t\t"
<< "Gflop/s"
<< "\t\t seconds"
<< "\t\tGB/s / node" << std::endl;
std::cout << GridLogMessage << "----------------------------------------------------------" << std::endl;
uint64_t NN;
uint64_t lmax = 32;
GridSerialRNG sRNG;
sRNG.SeedFixedIntegers(std::vector<int>({45, 12, 81, 9}));
for (int lat = 8; lat <= lmax; lat += 8)
{
Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2], lat * mpi_layout[3]});
int64_t vol = latt_size[0] * latt_size[1] * latt_size[2] * latt_size[3];
GridCartesian Grid(latt_size, simd_layout, mpi_layout);
NN = Grid.NodeCount();
LatticeSU4 z(&Grid);
z = Zero();
LatticeSU4 x(&Grid);
x = Zero();
LatticeSU4 y(&Grid);
y = Zero();
// double a=2.0;
uint64_t Nloop = NLOOP;
double start = usecond();
for (int i = 0; i < Nloop; i++)
{
z = x * y;
}
double stop = usecond();
double time = (stop - start) / Nloop * 1000;
double flops = vol * Nc4 * Nc4 * (6 + (Nc4 - 1) * 8); // mul,add
double bytes = 3.0 * vol * Nc4 * Nc4 * 2 * sizeof(RealF);
std::cout << GridLogMessage << std::setprecision(3)
<< lat << "\t\t" << bytes << " \t\t" << bytes / time << "\t\t" << flops / time << "\t\t" << (stop - start) / 1000. / 1000.
<< "\t\t" << bytes / time / NN << std::endl;
}
};
static double DWF(int Ls, int L)
{
RealD mass = 0.1;
RealD M5 = 1.8;
double mflops;
double mflops_best = 0;
double mflops_worst = 0;
std::vector<double> mflops_all;
///////////////////////////////////////////////////////
// Set/Get the layout & grid size
///////////////////////////////////////////////////////
int threads = GridThread::GetThreads();
Coordinate mpi = GridDefaultMpi();
assert(mpi.size() == 4);
Coordinate local({L, L, L, L});
Coordinate latt4({local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]});
GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4,
GridDefaultSimd(Nd, vComplex::Nsimd()),
GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount();
uint64_t NN = TmpGrid->NodeCount();
NN_global = NN;
uint64_t SHM = NP / NN;
///////// Welcome message ////////////
std::cout << GridLogMessage << "==================================================================================" << std::endl;
std::cout << GridLogMessage << "Benchmark DWF on " << L << "^4 local volume " << std::endl;
std::cout << GridLogMessage << "* Nc : " << Nc << std::endl;
std::cout << GridLogMessage << "* Global volume : " << GridCmdVectorIntToString(latt4) << std::endl;
std::cout << GridLogMessage << "* Ls : " << Ls << std::endl;
std::cout << GridLogMessage << "* ranks : " << NP << std::endl;
std::cout << GridLogMessage << "* nodes : " << NN << std::endl;
std::cout << GridLogMessage << "* ranks/node : " << SHM << std::endl;
std::cout << GridLogMessage << "* ranks geom : " << GridCmdVectorIntToString(mpi) << std::endl;
std::cout << GridLogMessage << "* Using " << threads << " threads" << std::endl;
std::cout << GridLogMessage << "==================================================================================" << std::endl;
///////// Lattice Init ////////////
GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
///////// RNG Init ////////////
std::vector<int> seeds4({1, 2, 3, 4});
std::vector<int> seeds5({5, 6, 7, 8});
GridParallelRNG RNG4(UGrid);
RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5(FGrid);
RNG5.SeedFixedIntegers(seeds5);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
typedef DomainWallFermionF Action;
typedef typename Action::FermionField Fermion;
typedef LatticeGaugeFieldF Gauge;
///////// Source preparation ////////////
Gauge Umu(UGrid);
SU<Nc>::HotConfiguration(RNG4, Umu);
Fermion src(FGrid);
random(RNG5, src);
Fermion src_e(FrbGrid);
Fermion src_o(FrbGrid);
Fermion r_e(FrbGrid);
Fermion r_o(FrbGrid);
Fermion r_eo(FGrid);
Action Dw(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5);
{
pickCheckerboard(Even, src_e, src);
pickCheckerboard(Odd, src_o, src);
const int num_cases = 4;
std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
controls Cases[] = {
{WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicyConcurrent},
{WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicyConcurrent},
{WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicySequential},
{WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicySequential}};
for (int c = 0; c < num_cases; c++)
{
WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
WilsonKernelsStatic::Opt = Cases[c].Opt;
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
std::cout << GridLogMessage << "==================================================================================" << std::endl;
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric)
std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl;
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl;
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
std::cout << GridLogMessage << "* Using sequential Comms/Compute" << std::endl;
std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
std::cout << GridLogMessage << "==================================================================================" << std::endl;
int nwarm = 10;
double t0 = usecond();
FGrid->Barrier();
for (int i = 0; i < nwarm; i++)
{
Dw.DhopEO(src_o, r_e, DaggerNo);
}
FGrid->Barrier();
double t1 = usecond();
uint64_t ncall = 500;
FGrid->Broadcast(0, &ncall, sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
Dw.ZeroCounters();
time_statistics timestat;
std::vector<double> t_time(ncall);
for (uint64_t i = 0; i < ncall; i++)
{
t0 = usecond();
Dw.DhopEO(src_o, r_e, DaggerNo);
t1 = usecond();
t_time[i] = t1 - t0;
}
FGrid->Barrier();
double volume = Ls;
for (int mu = 0; mu < Nd; mu++)
volume = volume * latt4[mu];
// Nc=3 gives
// 1344= 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8
// 1344 = Nc* (6+(Nc-1)*8)*2*Nd + Nd*Nc*2*2 + Nd*Nc*Ns*2
// double flops=(1344.0*volume)/2;
#if 0
double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + Nd*Nc*Ns + Nd*Nc*Ns*2;
#else
double fps = Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2;
#endif
double flops = (fps * volume) / 2;
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops / timestat.min;
mf_lo = flops / timestat.max;
mf_err = flops / timestat.min * timestat.err / timestat.mean;
mflops = flops / timestat.mean;
mflops_all.push_back(mflops);
if (mflops_best == 0)
mflops_best = mflops;
if (mflops_worst == 0)
mflops_worst = mflops;
if (mflops > mflops_best)
mflops_best = mflops;
if (mflops < mflops_worst)
mflops_worst = mflops;
std::cout << GridLogMessage << "Deo FlopsPerSite is " << fps << std::endl;
std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s = " << mflops << " (" << mf_err << ") " << mf_lo << "-" << mf_hi << std::endl;
std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s per rank " << mflops / NP << std::endl;
std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s per node " << mflops / NN << std::endl;
}
std::cout << GridLogMessage << "==================================================================================" << std::endl;
std::cout << GridLogMessage << L << "^4 x " << Ls << " Deo Best mflop/s = " << mflops_best << " ; " << mflops_best / NN << " per node " << std::endl;
std::cout << GridLogMessage << L << "^4 x " << Ls << " Deo Worst mflop/s = " << mflops_worst << " ; " << mflops_worst / NN << " per node " << std::endl;
std::cout << GridLogMessage << fmt << std::endl;
std::cout << GridLogMessage;
for (int i = 0; i < mflops_all.size(); i++)
{
std::cout << mflops_all[i] / NN << " ; ";
}
std::cout << std::endl;
std::cout << GridLogMessage << "==================================================================================" << std::endl;
}
return mflops_best;
}
static double Staggered(int L)
{
double mflops;
double mflops_best = 0;
double mflops_worst = 0;
std::vector<double> mflops_all;
///////////////////////////////////////////////////////
// Set/Get the layout & grid size
///////////////////////////////////////////////////////
int threads = GridThread::GetThreads();
Coordinate mpi = GridDefaultMpi();
assert(mpi.size() == 4);
Coordinate local({L, L, L, L});
Coordinate latt4({local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]});
GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4,
GridDefaultSimd(Nd, vComplex::Nsimd()),
GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount();
uint64_t NN = TmpGrid->NodeCount();
NN_global = NN;
uint64_t SHM = NP / NN;
///////// Welcome message ////////////
std::cout << GridLogMessage << "==================================================================================" << std::endl;
std::cout << GridLogMessage << "Benchmark ImprovedStaggered on " << L << "^4 local volume " << std::endl;
std::cout << GridLogMessage << "* Global volume : " << GridCmdVectorIntToString(latt4) << std::endl;
std::cout << GridLogMessage << "* ranks : " << NP << std::endl;
std::cout << GridLogMessage << "* nodes : " << NN << std::endl;
std::cout << GridLogMessage << "* ranks/node : " << SHM << std::endl;
std::cout << GridLogMessage << "* ranks geom : " << GridCmdVectorIntToString(mpi) << std::endl;
std::cout << GridLogMessage << "* Using " << threads << " threads" << std::endl;
std::cout << GridLogMessage << "==================================================================================" << std::endl;
///////// Lattice Init ////////////
GridCartesian *FGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
///////// RNG Init ////////////
std::vector<int> seeds4({1, 2, 3, 4});
GridParallelRNG RNG4(FGrid);
RNG4.SeedFixedIntegers(seeds4);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
RealD mass = 0.1;
RealD c1 = 9.0 / 8.0;
RealD c2 = -1.0 / 24.0;
RealD u0 = 1.0;
typedef ImprovedStaggeredFermionF Action;
typedef typename Action::FermionField Fermion;
typedef LatticeGaugeFieldF Gauge;
Gauge Umu(FGrid);
SU<Nc>::HotConfiguration(RNG4, Umu);
typename Action::ImplParams params;
Action Ds(Umu, Umu, *FGrid, *FrbGrid, mass, c1, c2, u0, params);
///////// Source preparation ////////////
Fermion src(FGrid);
random(RNG4, src);
Fermion src_e(FrbGrid);
Fermion src_o(FrbGrid);
Fermion r_e(FrbGrid);
Fermion r_o(FrbGrid);
Fermion r_eo(FGrid);
{
pickCheckerboard(Even, src_e, src);
pickCheckerboard(Odd, src_o, src);
const int num_cases = 4;
std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
controls Cases[] = {
{StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicyConcurrent},
{StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicyConcurrent},
{StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicySequential},
{StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicySequential}};
for (int c = 0; c < num_cases; c++)
{
StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap;
StaggeredKernelsStatic::Opt = Cases[c].Opt;
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
std::cout << GridLogMessage << "==================================================================================" << std::endl;
if (StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric)
std::cout << GridLogMessage << "* Using GENERIC Nc StaggeredKernels" << std::endl;
if (StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute)
std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl;
if (StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsThenCompute)
std::cout << GridLogMessage << "* Using sequential Comms/Compute" << std::endl;
std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
std::cout << GridLogMessage << "==================================================================================" << std::endl;
int nwarm = 10;
double t0 = usecond();
FGrid->Barrier();
for (int i = 0; i < nwarm; i++)
{
Ds.DhopEO(src_o, r_e, DaggerNo);
}
FGrid->Barrier();
double t1 = usecond();
uint64_t ncall = 500;
FGrid->Broadcast(0, &ncall, sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
Ds.ZeroCounters();
time_statistics timestat;
std::vector<double> t_time(ncall);
for (uint64_t i = 0; i < ncall; i++)
{
t0 = usecond();
Ds.DhopEO(src_o, r_e, DaggerNo);
t1 = usecond();
t_time[i] = t1 - t0;
}
FGrid->Barrier();
double volume = 1;
for (int mu = 0; mu < Nd; mu++)
volume = volume * latt4[mu];
double flops = (1146.0 * volume) / 2;
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops / timestat.min;
mf_lo = flops / timestat.max;
mf_err = flops / timestat.min * timestat.err / timestat.mean;
mflops = flops / timestat.mean;
mflops_all.push_back(mflops);
if (mflops_best == 0)
mflops_best = mflops;
if (mflops_worst == 0)
mflops_worst = mflops;
if (mflops > mflops_best)
mflops_best = mflops;
if (mflops < mflops_worst)
mflops_worst = mflops;
std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s = " << mflops << " (" << mf_err << ") " << mf_lo << "-" << mf_hi << std::endl;
std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s per rank " << mflops / NP << std::endl;
std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s per node " << mflops / NN << std::endl;
}
std::cout << GridLogMessage << "==================================================================================" << std::endl;
std::cout << GridLogMessage << L << "^4 Deo Best mflop/s = " << mflops_best << " ; " << mflops_best / NN << " per node " << std::endl;
std::cout << GridLogMessage << L << "^4 Deo Worst mflop/s = " << mflops_worst << " ; " << mflops_worst / NN << " per node " << std::endl;
std::cout << GridLogMessage << fmt << std::endl;
std::cout << GridLogMessage;
for (int i = 0; i < mflops_all.size(); i++)
{
std::cout << mflops_all[i] / NN << " ; ";
}
std::cout << std::endl;
}
std::cout << GridLogMessage << "==================================================================================" << std::endl;
return mflops_best;
}
};
int main(int argc, char **argv)
{
Grid_init(&argc, &argv);
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
#ifdef KNL
LebesgueOrder::Block = std::vector<int>({8, 2, 2, 2});
#else
LebesgueOrder::Block = std::vector<int>({2, 2, 2, 2});
#endif
Benchmark::Decomposition();
int do_su4 = 1;
int do_memory = 1;
int do_comms = 1;
int sel = 4;
std::vector<int> L_list({8, 12, 16, 24, 32});
int selm1 = sel - 1;
std::vector<double> wilson;
std::vector<double> dwf4;
std::vector<double> staggered;
int Ls = 1;
std::cout << GridLogMessage << "==================================================================================" << std::endl;
std::cout << GridLogMessage << " Wilson dslash 4D vectorised" << std::endl;
std::cout << GridLogMessage << "==================================================================================" << std::endl;
for (int l = 0; l < L_list.size(); l++)
{
wilson.push_back(Benchmark::DWF(Ls, L_list[l]));
}
Ls = 12;
std::cout << GridLogMessage << "==================================================================================" << std::endl;
std::cout << GridLogMessage << " Domain wall dslash 4D vectorised" << std::endl;
std::cout << GridLogMessage << "==================================================================================" << std::endl;
for (int l = 0; l < L_list.size(); l++)
{
double result = Benchmark::DWF(Ls, L_list[l]);
dwf4.push_back(result);
}
std::cout << GridLogMessage << "==================================================================================" << std::endl;
std::cout << GridLogMessage << " Improved Staggered dslash 4D vectorised" << std::endl;
std::cout << GridLogMessage << "==================================================================================" << std::endl;
for (int l = 0; l < L_list.size(); l++)
{
double result = Benchmark::Staggered(L_list[l]);
staggered.push_back(result);
}
std::cout << GridLogMessage << "==================================================================================" << std::endl;
std::cout << GridLogMessage << " Summary table Ls=" << Ls << std::endl;
std::cout << GridLogMessage << "==================================================================================" << std::endl;
std::cout << GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\t Staggered" << std::endl;
for (int l = 0; l < L_list.size(); l++)
{
std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] << " \t\t " << dwf4[l] << " \t\t " << staggered[l] << std::endl;
}
std::cout << GridLogMessage << "==================================================================================" << std::endl;
int NN = NN_global;
if (do_memory)
{
std::cout << GridLogMessage << "==================================================================================" << std::endl;
std::cout << GridLogMessage << " Memory benchmark " << std::endl;
std::cout << GridLogMessage << "==================================================================================" << std::endl;
Benchmark::Memory();
}
if (do_su4)
{
std::cout << GridLogMessage << "==================================================================================" << std::endl;
std::cout << GridLogMessage << " SU(4) benchmark " << std::endl;
std::cout << GridLogMessage << "==================================================================================" << std::endl;
Benchmark::SU4();
}
if (do_comms)
{
std::cout << GridLogMessage << "==================================================================================" << std::endl;
std::cout << GridLogMessage << " Communications benchmark " << std::endl;
std::cout << GridLogMessage << "==================================================================================" << std::endl;
Benchmark::Comms();
}
std::cout << GridLogMessage << "==================================================================================" << std::endl;
std::cout << GridLogMessage << " Per Node Summary table Ls=" << Ls << std::endl;
std::cout << GridLogMessage << "==================================================================================" << std::endl;
std::cout << GridLogMessage << " L \t\t Wilson\t\t DWF4\t\t Staggered " << std::endl;
for (int l = 0; l < L_list.size(); l++)
{
std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] / NN << " \t " << dwf4[l] / NN << " \t " << staggered[l] / NN << std::endl;
}
std::cout << GridLogMessage << "==================================================================================" << std::endl;
std::cout << GridLogMessage << "==================================================================================" << std::endl;
std::cout << GridLogMessage << " Comparison point result: " << 0.5 * (dwf4[sel] + dwf4[selm1]) / NN << " Mflop/s per node" << std::endl;
std::cout << GridLogMessage << " Comparison point is 0.5*(" << dwf4[sel] / NN << "+" << dwf4[selm1] / NN << ") " << std::endl;
std::cout << std::setprecision(3);
std::cout << GridLogMessage << "==================================================================================" << std::endl;
Grid_finalize();
}
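One detail of the deleted benchmark worth a second look is the Wilson/DWF flop count. The formula in the #else branch reproduces the "1344" figure quoted in the code's comments for Nc = 3; a quick standalone check (our own snippet, not part of the commit):

// Standalone check that the #else branch formula gives the commented value
// fps = Nc*(6+(Nc-1)*8)*Ns*Nd + 2*Nd*Nc*Ns + 2*Nd*Nc*Ns*2 for Nc=3, Ns=4, Nd=4.
#include <cassert>

int main()
{
  const int Nc = 3, Ns = 4, Nd = 4;
  const int fps = Nc * (6 + (Nc - 1) * 8) * Ns * Nd // 3 * 22 * 4 * 4 = 1056
                  + 2 * Nd * Nc * Ns                // 2 * 4 * 3 * 4 = 96
                  + 2 * Nd * Nc * Ns * 2;           // 2 * 4 * 3 * 4 * 2 = 192
  assert(fps == 1344); // 1056 + 96 + 192, matching the source comments
  return 0;
}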

Grid/Benchmark_comms_host_device.cpp

@@ -21,231 +21,245 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
(This hunk only re-breaks lines and normalises whitespace; the reformatted text is shown.)

using namespace std;
using namespace Grid;

struct time_statistics
{
  double mean;
  double err;
  double min;
  double max;

  void statistics(std::vector<double> v)
  {
    double sum = std::accumulate(v.begin(), v.end(), 0.0);
    mean = sum / v.size();

    std::vector<double> diff(v.size());
    std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
    double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
    err = std::sqrt(sq_sum / (v.size() * (v.size() - 1)));

    auto result = std::minmax_element(v.begin(), v.end());
    min = *result.first;
    max = *result.second;
  }
};

void header()
{
  std::cout << GridLogMessage << " L "
            << "\t"
            << " Ls "
            << "\t" << std::setw(11) << "bytes\t\t"
            << "MB/s uni"
            << "\t"
            << "MB/s bidi" << std::endl;
};

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
  Coordinate mpi_layout = GridDefaultMpi();
  int threads = GridThread::GetThreads();
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads"
            << std::endl;

  int Nloop = 250;
  int nmu = 0;
  int maxlat = 32;
  for (int mu = 0; mu < Nd; mu++)
    if (mpi_layout[mu] > 1)
      nmu++;

  std::cout << GridLogMessage << "Number of iterations to average: " << Nloop
            << std::endl;
  std::vector<double> t_time(Nloop);
  // time_statistics timestat;

  std::cout << GridLogMessage
            << "========================================================================="
               "==========================="
            << std::endl;
  std::cout << GridLogMessage
            << "= Benchmarking sequential halo exchange from host memory " << std::endl;
  std::cout << GridLogMessage
            << "========================================================================="
               "==========================="
            << std::endl;
  header();

  for (int lat = 8; lat <= maxlat; lat += 4)
  {
    for (int Ls = 8; Ls <= 8; Ls *= 2)
    {
      Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
                            lat * mpi_layout[3]});

      GridCartesian Grid(latt_size, simd_layout, mpi_layout);
      RealD Nrank = Grid._Nprocessors;
      RealD Nnode = Grid.NodeCount();
      RealD ppn = Nrank / Nnode;

      std::vector<std::vector<HalfSpinColourVectorD>> xbuf(8);
      std::vector<std::vector<HalfSpinColourVectorD>> rbuf(8);

      for (int mu = 0; mu < 8; mu++)
      {
        xbuf[mu].resize(lat * lat * lat * Ls);
        rbuf[mu].resize(lat * lat * lat * Ls);
      }
      uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD);

      int ncomm;

      for (int mu = 0; mu < 4; mu++)
      {
        if (mpi_layout[mu] > 1)
        {
          double start = usecond();
          for (int i = 0; i < Nloop; i++)
          {
            ncomm = 0;

            ncomm++;
            int comm_proc = 1;
            int xmit_to_rank;
            int recv_from_rank;

            {
              std::vector<CommsRequest_t> requests;
              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
              Grid.SendToRecvFrom((void *)&xbuf[mu][0], xmit_to_rank,
                                  (void *)&rbuf[mu][0], recv_from_rank, bytes);
            }

            comm_proc = mpi_layout[mu] - 1;
            {
              std::vector<CommsRequest_t> requests;
              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
              Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0], xmit_to_rank,
                                  (void *)&rbuf[mu + 4][0], recv_from_rank, bytes);
            }
          }
          Grid.Barrier();
          double stop = usecond();
          double mean = (stop - start) / Nloop;
          double dbytes = bytes * ppn;
          double xbytes = dbytes * 2.0 * ncomm;
          double rbytes = xbytes;
          double bidibytes = xbytes + rbytes;

          std::cout << GridLogMessage << std::setw(4) << lat << "\t" << Ls << "\t"
                    << std::setw(11) << bytes << std::fixed << std::setprecision(1)
                    << std::setw(7) << " " << std::right << xbytes / mean << " "
                    << "\t\t" << std::setw(7) << bidibytes / mean << std::endl;
        }
      }
    }
  }

  std::cout << GridLogMessage
            << "========================================================================="
               "==========================="
            << std::endl;
  std::cout << GridLogMessage
            << "= Benchmarking sequential halo exchange from GPU memory " << std::endl;
  std::cout << GridLogMessage
            << "========================================================================="
               "==========================="
            << std::endl;
  header();

  for (int lat = 8; lat <= maxlat; lat += 4)
  {
    for (int Ls = 8; Ls <= 8; Ls *= 2)
    {
      Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
                            lat * mpi_layout[3]});

      GridCartesian Grid(latt_size, simd_layout, mpi_layout);
      RealD Nrank = Grid._Nprocessors;
      RealD Nnode = Grid.NodeCount();
      RealD ppn = Nrank / Nnode;

      std::vector<HalfSpinColourVectorD *> xbuf(8);
      std::vector<HalfSpinColourVectorD *> rbuf(8);

      uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD);
      for (int d = 0; d < 8; d++)
      {
        xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
        rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
      }

      int ncomm;

      for (int mu = 0; mu < 4; mu++)
      {
        if (mpi_layout[mu] > 1)
        {
          double start = usecond();
          for (int i = 0; i < Nloop; i++)
          {
            ncomm = 0;

            ncomm++;
            int comm_proc = 1;
            int xmit_to_rank;
            int recv_from_rank;

            {
              std::vector<CommsRequest_t> requests;
              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
              Grid.SendToRecvFrom((void *)&xbuf[mu][0], xmit_to_rank,
                                  (void *)&rbuf[mu][0], recv_from_rank, bytes);
            }

            comm_proc = mpi_layout[mu] - 1;
            {
              std::vector<CommsRequest_t> requests;
              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
              Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0], xmit_to_rank,
                                  (void *)&rbuf[mu + 4][0], recv_from_rank, bytes);
            }
          }
          Grid.Barrier();
          double stop = usecond();
          double mean = (stop - start) / Nloop;
          double dbytes = bytes * ppn;
          double xbytes = dbytes * 2.0 * ncomm;
          double rbytes = xbytes;
          double bidibytes = xbytes + rbytes;

          std::cout << GridLogMessage << std::setw(4) << lat << "\t" << Ls << "\t"
                    << std::setw(11) << bytes << std::fixed << std::setprecision(1)
                    << std::setw(7) << " " << std::right << xbytes / mean << " "
                    << "\t\t" << std::setw(7) << bidibytes / mean << std::endl;
        }
      }
      for (int d = 0; d < 8; d++)
      {
        acceleratorFreeDevice(xbuf[d]);
        acceleratorFreeDevice(rbuf[d]);
      }
    }
  }

  std::cout << GridLogMessage
            << "========================================================================="
               "==========================="
            << std::endl;
  std::cout << GridLogMessage << "= All done; Bye Bye" << std::endl;
  std::cout << GridLogMessage
            << "========================================================================="
               "==========================="
            << std::endl;

  Grid_finalize();
}

View File

@ -1,6 +1,7 @@
/*
Copyright © 2015 Peter Boyle <paboyle@ph.ed.ac.uk>
Copyright © 2022 Antonin Portelli <antonin.portelli@me.com>
Copyright © 2023 Simon Bürger <simon.buerger@rwth-aachen.de>

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License

@ -16,6 +17,7 @@ You should have received a copy of the GNU General Public License

along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

#include "json.hpp"
#include <Grid/Grid.h>
#ifdef GRID_CUDA
#define CUDA_PROFILE

@ -28,17 +30,13 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.

using namespace std;
using namespace Grid;

template <class d> struct scal
{
  d internal;
};

Gamma::Algebra Gmu[] = {Gamma::Algebra::GammaX, Gamma::Algebra::GammaY,
                        Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT};

int main(int argc, char **argv)
{

@ -48,24 +46,41 @@ int main(int argc, char **argv)

  Coordinate latt4 = GridDefaultLatt();
  int Ls = 16;

  std::string json_filename = ""; // empty indicates no json output
  nlohmann::json json;

  // benchmark specific command line arguments
  for (int i = 0; i < argc; i++)
  {
    if (std::string(argv[i]) == "-Ls")
    {
      std::stringstream ss(argv[i + 1]);
      ss >> Ls;
    }
    if (std::string(argv[i]) == "--json-out")
      json_filename = argv[i + 1];
  }

  GridLogLayout();

  long unsigned int single_site_flops = 8 * Nc * (7 + 16 * Nc);
  json["single_site_flops"] = single_site_flops;

  GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
      GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
  GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
  json["grid"] = FGrid->FullDimensions().toVector();
  json["local_grid"] = FGrid->LocalDimensions().toVector();

  std::cout << GridLogMessage << "Making s innermost grids" << std::endl;
  GridCartesian *sUGrid =
      SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(), GridDefaultMpi());
  GridRedBlackCartesian *sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
  GridCartesian *sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls, UGrid);
  GridRedBlackCartesian *sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls, UGrid);

@ -177,13 +192,28 @@ int main(int argc, char **argv)

  RealD NP = UGrid->_Nprocessors;
  RealD NN = UGrid->NodeCount();
  json["ranks"] = NP;
  json["nodes"] = NN;

  std::cout << GridLogMessage
            << "*****************************************************************"
            << std::endl;
  std::cout << GridLogMessage
            << "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm"
            << std::endl;
  std::cout << GridLogMessage
            << "*****************************************************************"
            << std::endl;
  std::cout << GridLogMessage
            << "*****************************************************************"
            << std::endl;
  std::cout << GridLogMessage
            << "* Benchmarking DomainWallFermionR::Dhop " << std::endl;
  std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd()
            << std::endl;
  std::cout << GridLogMessage << "* VComplexF size is " << sizeof(vComplexF) << " B"
            << std::endl;
  if (sizeof(RealF) == 4)
    std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
  if (sizeof(RealF) == 8)

@ -199,8 +229,11 @@ int main(int argc, char **argv)

  if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll)
    std::cout << GridLogMessage << "* Using Nc=3 WilsonKernels" << std::endl;
  if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm)
    std::cout << GridLogMessage << "* Using Asm Nc=3 WilsonKernels" << std::endl;
  std::cout << GridLogMessage
            << "*****************************************************************"
            << std::endl;

  DomainWallFermionF Dw(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5);
  int ncall = 300;

@ -230,19 +263,38 @@ int main(int argc, char **argv)

  auto simdwidth = sizeof(vComplex);

  // RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
  double data_rf = volume * ((2 * Nd + 1) * Nd * Nc + 2 * Nd * Nc * Nc) * simdwidth /
                   nsimd * ncall / (1024. * 1024. * 1024.);

  // mem: Nd Wilson * Ls, Nd gauge, Nc colors
  double data_mem =
      (volume * (2 * Nd + 1) * Nd * Nc + (volume / Ls) * 2 * Nd * Nc * Nc) * simdwidth /
      nsimd * ncall / (1024. * 1024. * 1024.);

  json["Dw"]["calls"] = ncall;
  json["Dw"]["time"] = t1 - t0;
  json["Dw"]["mflops"] = flops / (t1 - t0);
  json["Dw"]["mflops_per_rank"] = flops / (t1 - t0) / NP;
  json["Dw"]["mflops_per_node"] = flops / (t1 - t0) / NN;
  json["Dw"]["RF"] = 1000000. * data_rf / ((t1 - t0));
  json["Dw"]["mem"] = 1000000. * data_mem / ((t1 - t0));

  std::cout << GridLogMessage << "Called Dw " << ncall << " times in " << t1 - t0
            << " us" << std::endl;
  // std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
  // std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
  std::cout << GridLogMessage << "mflop/s = " << flops / (t1 - t0) << std::endl;
  std::cout << GridLogMessage << "mflop/s per rank = " << flops / (t1 - t0) / NP
            << std::endl;
  std::cout << GridLogMessage << "mflop/s per node = " << flops / (t1 - t0) / NN
            << std::endl;
  std::cout << GridLogMessage
            << "RF GiB/s (base 2) = " << 1000000. * data_rf / ((t1 - t0))
            << std::endl;
  std::cout << GridLogMessage
            << "mem GiB/s (base 2) = " << 1000000. * data_mem / ((t1 - t0))
            << std::endl;
  err = ref - result;
  std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
  // exit(0);

@ -313,7 +365,10 @@ int main(int argc, char **argv)

  }
  // dump=1;
  Dw.Dhop(src, result, 1);

  std::cout << GridLogMessage
            << "Compare to naive wilson implementation Dag to verify correctness"
            << std::endl;
  std::cout << GridLogMessage << "Called DwDag" << std::endl;
  std::cout << GridLogMessage << "norm dag result " << norm2(result) << std::endl;
  std::cout << GridLogMessage << "norm dag ref " << norm2(ref) << std::endl;

@ -333,7 +388,8 @@ int main(int argc, char **argv)

  LatticeFermionF r_o(FrbGrid);
  LatticeFermionF r_eo(FGrid);
  std::cout << GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"
            << std::endl;

  pickCheckerboard(Even, src_e, src);
  pickCheckerboard(Odd, src_o, src);

@ -341,9 +397,14 @@ int main(int argc, char **argv)

  std::cout << GridLogMessage << "src_o" << norm2(src_o) << std::endl;

  // S-direction is INNERMOST and takes no part in the parity.
  std::cout << GridLogMessage
            << "*********************************************************" << std::endl;
  std::cout << GridLogMessage
            << "* Benchmarking DomainWallFermionF::DhopEO " << std::endl;
  std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd()
            << std::endl;
  if (sizeof(RealF) == 4)
    std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
  if (sizeof(RealF) == 8)

@ -360,7 +421,9 @@ int main(int argc, char **argv)

    std::cout << GridLogMessage << "* Using Nc=3 WilsonKernels" << std::endl;
  if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm)
    std::cout << GridLogMessage << "* Using Asm Nc=3 WilsonKernels" << std::endl;
  std::cout << GridLogMessage
            << "*********************************************************" << std::endl;
  {
    Dw.ZeroCounters();
    FGrid->Barrier();

@ -386,9 +449,18 @@ int main(int argc, char **argv)

      volume = volume * latt4[mu];
    double flops = (single_site_flops * volume * ncall) / 2.0;

    json["Deo"]["calls"] = ncall;
    json["Deo"]["time"] = t1 - t0;
    json["Deo"]["mflops"] = flops / (t1 - t0);
    json["Deo"]["mflops_per_rank"] = flops / (t1 - t0) / NP;
    json["Deo"]["mflops_per_node"] = flops / (t1 - t0) / NN;

    std::cout << GridLogMessage << "Deo mflop/s = " << flops / (t1 - t0) << std::endl;
    std::cout << GridLogMessage << "Deo mflop/s per rank " << flops / (t1 - t0) / NP
              << std::endl;
    std::cout << GridLogMessage << "Deo mflop/s per node " << flops / (t1 - t0) / NN
              << std::endl;
    Dw.Report();
  }
  Dw.DhopEO(src_o, r_e, DaggerNo);

@ -420,6 +492,21 @@ int main(int argc, char **argv)

  assert(norm2(src_e) < 1.0e-4);
  assert(norm2(src_o) < 1.0e-4);

  if (!json_filename.empty())
  {
    std::cout << GridLogMessage << "writing benchmark results to " << json_filename
              << std::endl;

    int me = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    if (me == 0)
    {
      std::ofstream json_file(json_filename);
      json_file << std::setw(4) << json;
    }
  }

  Grid_finalize();
  exit(0);
}

36
Grid/Common.hpp Normal file
View File

@ -0,0 +1,36 @@
/*
Copyright © 2022 Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef Grid_Benchmarks_Common_hpp_
#define Grid_Benchmarks_Common_hpp_
#ifndef GRID_MSG
#define GRID_MSG std::cout << GridLogMessage
#endif
#ifndef GRID_MSG_MAXSIZE
#define GRID_MSG_MAXSIZE 1024
#endif
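// grid_printf: printf-style formatted output routed through the Grid logger,
// e.g. grid_printf("%4d %12.1f\n", l, mean);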
#define grid_printf(...) \
{ \
char _buf[GRID_MSG_MAXSIZE]; \
snprintf(_buf, GRID_MSG_MAXSIZE, __VA_ARGS__); \
GRID_MSG << _buf; \
}
#endif // Grid_Benchmarks_Common_hpp_

View File

@ -3,10 +3,10 @@ ACLOCAL_AMFLAGS = -I .buildutils/m4
bin_PROGRAMS = \
    Benchmark_comms_host_device \
    Benchmark_dwf_fp32 \
    Benchmark_Grid \
    Benchmark_IO

Benchmark_comms_host_device_SOURCES = Benchmark_comms_host_device.cpp
Benchmark_dwf_fp32_SOURCES = Benchmark_dwf_fp32.cpp
Benchmark_Grid_SOURCES = Benchmark_Grid.cpp
Benchmark_IO_SOURCES = Benchmark_IO.cpp

69
Grid/Readme.md Normal file
View File

@ -0,0 +1,69 @@
# Grid benchmarks
This folder contains benchmarks for the [Grid](https://github.com/aportelli/) library.
The benchmarks can be summarised as follows:

- `Benchmark_Grid`: measures the floating-point performance of various fermion
  matrices, as well as the bandwidth of different operations. Measurements are
  performed over a fixed range of problem sizes.
## TL;DR
Build and install Grid, all dependencies, and the benchmark with
```bash
systems/<system>/bootstrap-env.sh <env_dir> # build dependencies, takes a long time
./build-grid.sh <env_dir> <config> # build Grid
./build-benchmark.sh <env_dir> <config> # build benchmarks
```
where `<env_dir>` is an arbitrary directory where all build products will be stored, `<system>`
is a sub-directory of `systems` containing system-specific scripts
(an existing preset or your own), and finally `<config>` is the name of a build config
in `systems/<system>/grid-config.json`. After a successful execution, the benchmark binaries
will be in `<env_dir>/prefix/gridbench_<config>`.
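
For example, using the shipped Tursa preset and its `cpu` build config, the whole
sequence could look like:

```bash
systems/tursa/bootstrap-env.sh ./env # deploy the Tursa environment in ./env
./build-grid.sh ./env cpu            # build Grid with the "cpu" config
./build-benchmark.sh ./env cpu       # build the benchmarks against that Grid
```

after which the binaries sit in `./env/prefix/gridbench_cpu`.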
## Environment setup
A complete runtime environment can be deployed using scripts from this repository. System-specific scripts are in the `systems` directory.
You should first deploy the environment for the specific system you are using, for example
```bash
systems/tursa/bootstrap-env.sh ./env
```
will deploy the relevant environment for the [Tursa](https://www.epcc.ed.ac.uk/hpc-services/dirac-tursa-gpu) supercomputer in `./env`. This step may compile a large set
of packages from source and can take a long time to complete.
After that, the environment directory (`./env` in the example above) will contain an `env.sh` file that needs to be sourced to activate the environment
```bash
source ./env/env.sh
```
Additional scripts `env-*.sh` can be sourced afterwards to activate more specific
environments; this should always be done after sourcing `env.sh` as above.
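
For instance, assuming the environment was deployed in `./env` as above, activating
the CPU-specific environment would look like:

```bash
source ./env/env.sh     # always first
source ./env/env-cpu.sh # then the more specific environment
```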
## Building the benchmarks
The environment directory contains a `grid-config.json` file specifying compilation-flag
configurations for Grid (please see Grid's repository for documentation). All entries have
the form
```json
{
"name": "foo", // name of the configuration
"env-script": "bar.sh", // script to source before building
// (path relative to the environment directory)
"commit": "...", // Grid commit to use
// (anything that can be an argument of git checkout)
"config-options": "..." // options to pass to the configure script,
"env" : { // environment variables
"VAR": "value" // export VAR="value" before building
}
}
```
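
As a practical note, the build scripts below read these entries with `jq`, and a
configuration can be inspected from the command line the same way; for example,
assuming the environment directory `./env`:

```bash
# print the configure options of the "cpu" configuration
jq -r '.configs[] | select(.name == "cpu") | ."config-options"' ./env/grid-config.json
```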
Grid can then be built with
```bash
./build-grid.sh <env_dir> <config>
```
where `<env_dir>` is the environment directory and `<config>` is the build config name in
`grid-config.json`. Similarly, the benchmarks can then be built with
```bash
./build-benchmark.sh <env_dir> <config>
```
## Running the benchmarks
After building the benchmarks as above, you can find the binaries in
`<env_dir>/prefix/gridbench_<config>`.
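
For example, a hypothetical 4-rank run of the domain-wall benchmark from that
directory, writing machine-readable results to a JSON file (`--grid` and `--mpi` are
standard Grid command-line options; `-Ls` and `--json-out` are parsed by the benchmark
itself):

```bash
mpirun -np 4 ./Benchmark_dwf_fp32 --grid 16.16.16.16 --mpi 1.1.2.2 \
    -Ls 16 --json-out dwf.json
```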

View File

@ -2,5 +2,10 @@
set -euo pipefail

json_url='https://raw.githubusercontent.com/nlohmann/json/bc889afb4c5bf1c0d8ee29ef35eaaf4c8bef8a5d/single_include/nlohmann/json.hpp'

if [ ! -f json.hpp ]; then
    wget ${json_url}
fi

mkdir -p .buildutils/m4
autoreconf -fvi

31
Grid/build-benchmark.sh Executable file
View File

@ -0,0 +1,31 @@
#!/usr/bin/env bash
# shellcheck disable=SC1090,SC1091
set -euo pipefail
if (( $# != 2 )); then
echo "usage: $(basename "$0") <environment directory> <config>" 1>&2
exit 1
fi
env_dir=$1
cfg=$2
call_dir=$(pwd -P)
script_dir="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")"
cd "${env_dir}"
env_dir=$(pwd -P)
cd "${call_dir}"
build_dir="${env_dir}/build/Grid-benchmarks/${cfg}"
mkdir -p "${build_dir}"
source "${env_dir}/env.sh"
entry=$(jq ".configs[]|select(.name==\"${cfg}\")" "${env_dir}"/grid-config.json)
env_script=$(echo "${entry}" | jq -r ".\"env-script\"")
cd "${build_dir}" || return
source "${env_dir}/${env_script}"
if [ ! -f Makefile ]; then
"${script_dir}/configure" --with-grid="${env_dir}/prefix/grid_${cfg}" \
--prefix="${env_dir}/prefix/gridbench_${cfg}"
fi
make -j 128
make install
cd "${call_dir}"

42
Grid/build-grid.sh Executable file
View File

@ -0,0 +1,42 @@
#!/usr/bin/env bash
# shellcheck disable=SC1090,SC1091
set -euo pipefail
if (( $# != 2 )); then
echo "usage: $(basename "$0") <environment directory> <config>" 1>&2
exit 1
fi
env_dir=$1
cfg=$2
call_dir=$(pwd -P)
cd "${env_dir}"
env_dir=$(pwd -P)
cd "${call_dir}"
build_dir="${env_dir}/build/Grid/${cfg}"
if [ -d "${build_dir}" ]; then
echo "error: directory '${build_dir}' exists"
exit 1
fi
mkdir -p "${build_dir}"
source "${env_dir}/env.sh"
entry=$(jq ".configs[]|select(.name==\"${cfg}\")" "${env_dir}"/grid-config.json)
IFS=" " read -r -a args <<< "$(echo "${entry}" | jq -r ".\"config-options\"")"
env_script=$(echo "${entry}" | jq -r ".\"env-script\"")
cd "${build_dir}" || return
source "${env_dir}/${env_script}"
extra_env=$(mktemp)
echo "${entry}" | jq -r '.env|to_entries|map("export \(.key)='\''\(.value|tostring)'\''")|.[]' > "${extra_env}"
commit=$(echo "${entry}" | jq -r ".commit")
git clone https://github.com/paboyle/Grid.git "${build_dir}"
cd "${build_dir}"
git checkout "${commit}"
./bootstrap.sh
mkdir build; cd build
source "${extra_env}"
../configure --prefix="${env_dir}/prefix/grid_${cfg}" "${args[@]}"
make -j128
make install
rm -rf "${extra_env}"
cd "${call_dir}"

View File

@ -3,55 +3,69 @@ AC_INIT([lattice-bench], [0.1], [antonin.portelli@me.com])
AC_CANONICAL_BUILD
AC_CANONICAL_HOST
AC_CANONICAL_TARGET
AC_CONFIG_SRCDIR([Benchmark_Grid.cpp])
AC_CONFIG_MACRO_DIR([.buildutils/m4])
AC_CONFIG_HEADERS([config.h])
AM_INIT_AUTOMAKE([-Wall -Werror foreign])
m4_ifdef([AM_SILENT_RULES],[AM_SILENT_RULES([yes])])

AC_ARG_WITH([grid],
    [AS_HELP_STRING([--with-grid=<prefix>],
    [try this for a non-standard install prefix of Grid])],
    [PATH="$with_grid/bin$PATH_SEPARATOR$PATH"]
    [CXXFLAGS="$CXXFLAGS -I$with_grid/include"]
    [LDFLAGS="$LDFLAGS -L$with_grid/lib"])

AC_CHECK_PROG([GRIDCONF],[grid-config],[yes])
if test x"$GRIDCONF" != x"yes" ; then
    AC_MSG_ERROR([grid-config not found])
fi
if test x"$CXX" == x ; then
    CXX="`grid-config --cxx`"
elif test "$CXX" != "`grid-config --cxx`" ; then
    AC_MSG_WARN([CXX differs from that reported by grid-config])
fi
if test x"$CXXLD" == x ; then
    CXXLD="`grid-config --cxxld`"
elif test "$CXXLD" != "`grid-config --cxxld`" ; then
    AC_MSG_WARN([CXXLD differs from that reported by grid-config])
fi
CXXFLAGS="$CXXFLAGS `grid-config --cxxflags`"
CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS"
AC_PROG_CC
AM_PROG_CC_C_O
AC_PROG_CXX
LDFLAGS="$LDFLAGS `grid-config --ldflags`"
LDFLAGS="$AM_LDFLAGS $LDFLAGS"
LIBS=" -ldl -lGrid $LIBS `grid-config --libs`"
AC_PROG_RANLIB
AM_PROG_AR
AC_LANG([C++])

AC_MSG_CHECKING([that a minimal Grid program compiles]);
AC_LINK_IFELSE(
    [AC_LANG_SOURCE([[
    #include <Grid/Grid.h>

    using namespace Grid;

    int main(int argc, char *argv[])
    {
      Grid_init(&argc, &argv);
      Grid_finalize();

      return 0;
    }

    ]])],
    [AC_MSG_RESULT([yes])],
    [AC_MSG_RESULT([no])]
    [AC_MSG_ERROR([impossible to compile a minimal Grid program])])

AC_SUBST([CXXLD])
AC_SUBST([AM_CXXFLAGS])
AC_SUBST([AM_LDFLAGS])
AC_CONFIG_FILES([Makefile])

View File

@ -0,0 +1,24 @@
#!/usr/bin/env bash
set -euo pipefail
if (( $# != 1 )); then
echo "usage: $(basename "$0") <environment directory>" 1>&2
exit 1
fi
dir=$1
call_dir=$(pwd -P)
script_dir="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")"
if [ -d "${dir}" ]; then
echo "error: directory '${dir}' exists"
exit 1
fi
mkdir -p "${dir}"
cd "${dir}"
git clone https://github.com/spack/spack.git
cd "${call_dir}"
cp "${script_dir}"/files/* "${dir}"
cp "${script_dir}/env.sh" "${script_dir}/grid-config.json" "${dir}"
source "${dir}"/spack/share/spack/setup-env.sh
"${script_dir}"/spack-bootstrap.sh "${dir}"

View File

@ -1,5 +0,0 @@
#!/usr/bin/env bash
# shellcheck disable=SC2046
script_dir="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")"
spack load $(cat "${script_dir}"/grid-cpu.spack)

View File

@ -1,5 +0,0 @@
#!/usr/bin/env bash
# shellcheck disable=SC2046
script_dir="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")"
spack load $(cat "${script_dir}"/grid-gpu.spack)

View File

@ -0,0 +1,8 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091
GRIDENVDIR="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")"
export GRIDENVDIR
export PATH="${GRIDENVDIR}/prefix/base/bin:${PATH}"
export ACLOCAL_PATH="${GRIDENVDIR}/prefix/base/share/aclocal:${ACLOCAL_PATH}"
source "${GRIDENVDIR}"/spack/share/spack/setup-env.sh

View File

@ -0,0 +1,4 @@
#!/usr/bin/env bash
spack env activate grid-cpu
spack env status

View File

@ -0,0 +1,4 @@
#!/usr/bin/env bash
spack env activate grid-gpu
spack env status

View File

@ -0,0 +1,25 @@
{
"configs": [
{
"name": "gpu",
"env-script": "env-gpu.sh",
"commit": "796abfad80625d81bb16af7ff6ec612a836f17d8",
"config-options": "--enable-comms=mpi --enable-simd=GPU --enable-alloc-align=4k --enable-accelerator-cshift --enable-shm=nvlink --enable-gen-simd-width=64 --enable-accelerator=cuda --disable-unified --enable-gparity=no --enable-fermion-reps=no",
"env" : {
"CXX": "nvcc",
"LDFLAGS": "-cudart shared",
"CXXFLAGS": "-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared"
}
},
{
"name": "cpu",
"env-script": "env-cpu.sh",
"commit": "796abfad80625d81bb16af7ff6ec612a836f17d8",
"config-options": "--enable-comms=mpi-auto --enable-simd=AVX2 --enable-alloc-align=4k --enable-shm=shmget --enable-gparity=no --enable-fermion-reps=no",
"env" : {
"CXX": "clang++",
"MPICXX": "mpicxx"
}
}
]
}

View File

@ -1,10 +0,0 @@
gcc@9.4.0
llvm@12.0.1
ucx@1.12.0.CPU%gcc@9.4.0
openmpi@4.1.1.CPU%gcc@9.4.0
hdf5^openmpi@4.1.1.CPU%gcc@9.4.0
fftw^openmpi@4.1.1.CPU%gcc@9.4.0
openssl
gmp%gcc@9.4.0
mpfr%gcc@9.4.0
c-lime

View File

@ -1,10 +0,0 @@
gcc@9.4.0
cuda@11.4.0
ucx@1.12.0.GPU%gcc@9.4.0
openmpi@4.1.1.GPU%gcc@9.4.0
hdf5^openmpi@4.1.1.GPU%gcc@9.4.0
fftw^openmpi@4.1.1.GPU%gcc@9.4.0
openssl
gmp%gcc@9.4.0
mpfr%gcc@9.4.0
c-lime

169
Grid/systems/tursa/spack-bootstrap.sh Normal file → Executable file
View File

@ -2,16 +2,19 @@
# shellcheck disable=SC2016
set -euo pipefail

gcc_spec='gcc@9.4.0'
cuda_spec='cuda@11.4.0'
hdf5_spec='hdf5@1.10.7'

if (( $# != 1 )); then
    echo "usage: $(basename "$0") <env dir>" 1>&2
    exit 1
fi
dir=$1
cwd=$(pwd -P)
cd "${dir}"
dir=$(pwd -P)
cd "${cwd}"

# General configuration ########################################################
# build with 128 tasks

@ -33,107 +36,92 @@ echo 'packages:

spack config --scope site add -f external.yaml
rm external.yaml

# Base compilers ###############################################################
# configure system base
spack compiler find --scope site

# install GCC, CUDA & LLVM
spack install ${gcc_spec} ${cuda_spec} llvm

spack load llvm
spack compiler find --scope site
spack unload llvm

spack load ${gcc_spec}
spack compiler find --scope site
spack unload ${gcc_spec}

# Manual compilation of OpenMPI & UCX ##########################################
# set build directories
mkdir -p "${dir}"/build
cd "${dir}"/build

spack load ${gcc_spec} ${cuda_spec}

cuda_path=$(spack find --format "{prefix}" cuda)
gdrcopy_path=/mnt/lustre/tursafs1/apps/gdrcopy/2.3.1

# Install ucx 1.12.0
ucx_url=https://github.com/openucx/ucx/releases/download/v1.12.0/ucx-1.12.0.tar.gz

echo "-- building UCX from source"
wget ${ucx_url}
ucx_ar=$(basename ${ucx_url})
tar -xvf "${ucx_ar}"
cd "${ucx_ar%.tar.gz}"

# ucx gpu build
mkdir -p build_gpu; cd build_gpu
../configure --build=x86_64-redhat-linux-gnu --host=x86_64-redhat-linux-gnu \
             --disable-dependency-tracking --prefix="${dir}"/prefix/ucx_gpu \
             --enable-devel-headers --enable-examples --enable-optimizations \
             --with-gdrcopy=${gdrcopy_path} --with-verbs --disable-logging \
             --disable-debug --disable-assertions --enable-cma \
             --with-knem=/opt/knem-1.1.4.90mlnx1/ --with-rdmacm \
             --without-rocm --without-ugni --without-java \
             --enable-compiler-opt=3 --with-cuda="${cuda_path}" --without-cm \
             --with-rc --with-ud --with-dc --with-mlx5-dv --with-dm \
             --enable-mt --without-go LDFLAGS=-L${gdrcopy_path}/lib
make -j 128
make install
cd ..

# ucx cpu build
mkdir -p build_cpu; cd build_cpu
../configure --build=x86_64-redhat-linux-gnu --host=x86_64-redhat-linux-gnu \
             --disable-dependency-tracking --prefix="${dir}"/prefix/ucx_cpu \
             --enable-devel-headers --enable-examples --enable-optimizations \
             --with-verbs --disable-logging --disable-debug \
             --disable-assertions --enable-mt --enable-cma \
             --with-knem=/opt/knem-1.1.4.90mlnx1/ --with-rdmacm \
             --without-rocm --without-ugni --without-java \
             --enable-compiler-opt=3 --without-cm --without-ugni --with-rc \
             --with-ud --with-dc --with-mlx5-dv --with-dm --enable-mt --without-go
make -j 128
make install

cd "${dir}"/build

# Install openmpi 4.1.1
ompi_url=https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.1.tar.gz

echo "-- building OpenMPI from source"
wget ${ompi_url}
ompi_ar=$(basename ${ompi_url})
tar -xvf "${ompi_ar}"
cd "${ompi_ar%.tar.gz}"
export AUTOMAKE_JOBS=128
./autogen.pl -f

# openmpi gpu build
mkdir build_gpu; cd build_gpu
../configure --prefix="${dir}"/prefix/ompi_gpu --without-xpmem \
             --with-ucx="${dir}"/prefix/ucx_gpu \
             --with-ucx-libdir="${dir}"/prefix/ucx_gpu/lib \
             --with-knem=/opt/knem-1.1.4.90mlnx1/ \
             --enable-mca-no-build=btl-uct \
             --with-cuda="${cuda_path}" --disable-getpwuid \
             --with-verbs --with-slurm --enable-mpi-fortran=all \
             --with-pmix=internal --with-libevent=internal
make -j 128

@ -142,61 +130,76 @@ cd ..

# openmpi cpu build
mkdir build_cpu; cd build_cpu
../configure --prefix="${dir}"/prefix/ompi_cpu --without-xpmem \
             --with-ucx="${dir}"/prefix/ucx_cpu \
             --with-ucx-libdir="${dir}"/prefix/ucx_cpu/lib \
             --with-knem=/opt/knem-1.1.4.90mlnx1/ \
             --enable-mca-no-build=btl-uct --disable-getpwuid \
             --with-verbs --with-slurm --enable-mpi-fortran=all \
             --with-pmix=internal --with-libevent=internal
make -j 128
make install
cd "${dir}"

# Add externals to spack
echo "packages:
  ucx:
    externals:
    - spec: \"ucx@1.12.0.GPU%gcc@9.4.0\"
      prefix: ${dir}/prefix/ucx_gpu
    - spec: \"ucx@1.12.0.CPU%gcc@9.4.0\"
      prefix: ${dir}/prefix/ucx_cpu
    buildable: False
  openmpi:
    externals:
    - spec: \"openmpi@4.1.1.GPU%gcc@9.4.0\"
      prefix: ${dir}/prefix/ompi_gpu
    - spec: \"openmpi@4.1.1.CPU%gcc@9.4.0\"
      prefix: ${dir}/prefix/ompi_cpu
    buildable: False" > spack.yaml

spack config --scope site add -f spack.yaml
rm spack.yaml
spack install ucx@1.12.0.GPU%gcc@9.4.0 openmpi@4.1.1.GPU%gcc@9.4.0
spack install ucx@1.12.0.CPU%gcc@9.4.0 openmpi@4.1.1.CPU%gcc@9.4.0

cd "${cwd}"

# environments #################################################################
dev_tools=("autoconf" "automake" "libtool" "jq")
ompi_gpu_hash=$(spack find --format "{hash}" openmpi@4.1.1.GPU)
ompi_cpu_hash=$(spack find --format "{hash}" openmpi@4.1.1.CPU)

spack env create grid-gpu
spack env activate grid-gpu
spack add ${gcc_spec} ${cuda_spec} "${dev_tools[@]}"
spack add ucx@1.12.0.GPU%gcc@9.4.0 openmpi@4.1.1.GPU%gcc@9.4.0
spack add ${hdf5_spec}+cxx+threadsafe ^/"${ompi_gpu_hash}"
spack add fftw ^/"${ompi_gpu_hash}"
spack add openssl gmp mpfr c-lime
spack install
spack env deactivate

spack env create grid-cpu
spack env activate grid-cpu
spack add llvm "${dev_tools[@]}"
spack add ucx@1.12.0.CPU%gcc@9.4.0 openmpi@4.1.1.CPU%gcc@9.4.0
spack add ${hdf5_spec}+cxx+threadsafe ^/"${ompi_cpu_hash}"
spack add fftw ^/"${ompi_cpu_hash}"
spack add openssl gmp mpfr c-lime
spack install
spack env deactivate

# Final setup ##################################################################
spack clean
spack gc -y

# add more environment variables in module loading
spack config --scope site add 'modules:prefix_inspections:lib:[LD_LIBRARY_PATH,LIBRARY_PATH]'
spack config --scope site add 'modules:prefix_inspections:lib64:[LD_LIBRARY_PATH,LIBRARY_PATH]'
spack config --scope site add 'modules:prefix_inspections:include:[C_INCLUDE_PATH,CPLUS_INCLUDE_PATH,INCLUDE]'
spack module tcl refresh -y

# permission change for group access
chmod -R g+rw "${dir}/spack/var/spack/cache"
setfacl -d -R -m g::rwX "${dir}/spack/var/spack/cache"

8
Readme.md Normal file
View File

@ -0,0 +1,8 @@
# Lattice benchmarks
This repository packages benchmarks for various libraries used for lattice field
theory simulations. It currently features only the Grid library, but more will be
added later.
Libraries:
- [Grid](https://github.com/aportelli/) - [Documentation](Grid/Readme.md)