commit 2a64c41a8c579cfd9824f5221777fe95a37b8c0f Author: Antonin Portelli Date: Fri Jan 13 19:00:20 2023 +0000 Initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8478350 --- /dev/null +++ b/.gitignore @@ -0,0 +1,15 @@ +/.vscode +build* +.buildutils +autom4te.cache +config.* +compile +ar-lib +aclocal.m4 +configure +depcomp +install-sh +missing +Makefile.in +.DS_Store +*~ \ No newline at end of file diff --git a/Grid/Benchmark_IO.cpp b/Grid/Benchmark_IO.cpp new file mode 100644 index 0000000..da6b78f --- /dev/null +++ b/Grid/Benchmark_IO.cpp @@ -0,0 +1,224 @@ +/* +Copyright © 2022 Antonin Portelli + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +*/ + +#include "Benchmark_IO.hpp" + +#ifndef BENCH_IO_LMIN +#define BENCH_IO_LMIN 8 +#endif + +#ifndef BENCH_IO_LMAX +#define BENCH_IO_LMAX 32 +#endif + +#ifndef BENCH_IO_NPASS +#define BENCH_IO_NPASS 10 +#endif + +#ifdef HAVE_LIME +using namespace Grid; + +std::string filestem(const int l) +{ + return "iobench_l" + std::to_string(l); +} + +int vol(const int i) +{ + return BENCH_IO_LMIN + 2 * i; +} + +int volInd(const int l) +{ + return (l - BENCH_IO_LMIN) / 2; +} + +template +void stats(Mat &mean, Mat &stdDev, const std::vector &data) +{ + auto nr = data[0].rows(), nc = data[0].cols(); + Eigen::MatrixXd sqSum(nr, nc); + double n = static_cast(data.size()); + + assert(n > 1.); + mean = Mat::Zero(nr, nc); + sqSum = Mat::Zero(nr, nc); + for (auto &d : data) + { + mean += d; + sqSum += d.cwiseProduct(d); + } + stdDev = ((sqSum - mean.cwiseProduct(mean) / n) / (n - 1.)).cwiseSqrt(); + mean /= n; +} + +#define grid_printf(...) 
\ + { \ + char _buf[1024]; \ + sprintf(_buf, __VA_ARGS__); \ + MSG << _buf; \ + } + +enum +{ + sRead = 0, + sWrite = 1, + gRead = 2, + gWrite = 3 +}; + +int main(int argc, char **argv) +{ + Grid_init(&argc, &argv); + + int64_t threads = GridThread::GetThreads(); + auto mpi = GridDefaultMpi(); + unsigned int nVol = (BENCH_IO_LMAX - BENCH_IO_LMIN) / 2 + 1; + unsigned int nRelVol = (BENCH_IO_LMAX - 24) / 2 + 1; + std::vector perf(BENCH_IO_NPASS, Eigen::MatrixXd::Zero(nVol, 4)); + std::vector avPerf(BENCH_IO_NPASS, Eigen::VectorXd::Zero(4)); + std::vector latt; + + MSG << "Grid is setup to use " << threads << " threads" << std::endl; + MSG << "MPI partition " << mpi << std::endl; + for (unsigned int i = 0; i < BENCH_IO_NPASS; ++i) + { + MSG << BIGSEP << std::endl; + MSG << "Pass " << i + 1 << "/" << BENCH_IO_NPASS << std::endl; + MSG << BIGSEP << std::endl; + MSG << SEP << std::endl; + MSG << "Benchmark std write" << std::endl; + MSG << SEP << std::endl; + for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) + { + latt = {l * mpi[0], l * mpi[1], l * mpi[2], l * mpi[3]}; + + MSG << "-- Local volume " << l << "^4" << std::endl; + writeBenchmark(latt, filestem(l), stdWrite); + perf[i](volInd(l), sWrite) = BinaryIO::lastPerf.mbytesPerSecond; + } + + MSG << SEP << std::endl; + MSG << "Benchmark std read" << std::endl; + MSG << SEP << std::endl; + for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) + { + latt = {l * mpi[0], l * mpi[1], l * mpi[2], l * mpi[3]}; + + MSG << "-- Local volume " << l << "^4" << std::endl; + readBenchmark(latt, filestem(l), stdRead); + perf[i](volInd(l), sRead) = BinaryIO::lastPerf.mbytesPerSecond; + } + +#ifdef HAVE_LIME + MSG << SEP << std::endl; + MSG << "Benchmark Grid C-Lime write" << std::endl; + MSG << SEP << std::endl; + for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) + { + latt = {l * mpi[0], l * mpi[1], l * mpi[2], l * mpi[3]}; + + MSG << "-- Local volume " << l << "^4" << std::endl; + writeBenchmark(latt, filestem(l), limeWrite); + perf[i](volInd(l), gWrite) = BinaryIO::lastPerf.mbytesPerSecond; + } + + MSG << SEP << std::endl; + MSG << "Benchmark Grid C-Lime read" << std::endl; + MSG << SEP << std::endl; + for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) + { + latt = {l * mpi[0], l * mpi[1], l * mpi[2], l * mpi[3]}; + + MSG << "-- Local volume " << l << "^4" << std::endl; + readBenchmark(latt, filestem(l), limeRead); + perf[i](volInd(l), gRead) = BinaryIO::lastPerf.mbytesPerSecond; + } +#endif + avPerf[i].fill(0.); + for (int f = 0; f < 4; ++f) + for (int l = 24; l <= BENCH_IO_LMAX; l += 2) + { + avPerf[i](f) += perf[i](volInd(l), f); + } + avPerf[i] /= nRelVol; + } + + Eigen::MatrixXd mean(nVol, 4), stdDev(nVol, 4), rob(nVol, 4); + Eigen::VectorXd avMean(4), avStdDev(4), avRob(4); + // double n = BENCH_IO_NPASS; + + stats(mean, stdDev, perf); + stats(avMean, avStdDev, avPerf); + rob.fill(100.); + rob -= 100. * stdDev.cwiseQuotient(mean.cwiseAbs()); + avRob.fill(100.); + avRob -= 100. * avStdDev.cwiseQuotient(avMean.cwiseAbs()); + + MSG << BIGSEP << std::endl; + MSG << "SUMMARY" << std::endl; + MSG << BIGSEP << std::endl; + MSG << "Summary of individual results (all results in MB/s)." << std::endl; + MSG << "Every second colum gives the standard deviation of the previous column." 
<< std::endl; + MSG << std::endl; + grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n", + "L", "std read", "std dev", "std write", "std dev", + "Grid read", "std dev", "Grid write", "std dev"); + for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) + { + grid_printf("%4d %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", + l, mean(volInd(l), sRead), stdDev(volInd(l), sRead), + mean(volInd(l), sWrite), stdDev(volInd(l), sWrite), + mean(volInd(l), gRead), stdDev(volInd(l), gRead), + mean(volInd(l), gWrite), stdDev(volInd(l), gWrite)); + } + MSG << std::endl; + MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)" << std::endl; + MSG << std::endl; + grid_printf("%4s %12s %12s %12s %12s\n", + "L", "std read", "std write", "Grid read", "Grid write"); + for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) + { + grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n", + l, rob(volInd(l), sRead), rob(volInd(l), sWrite), + rob(volInd(l), gRead), rob(volInd(l), gWrite)); + } + MSG << std::endl; + MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX << "^4 (all results in MB/s)." << std::endl; + MSG << "Every second colum gives the standard deviation of the previous column." << std::endl; + MSG << std::endl; + grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n", + "std read", "std dev", "std write", "std dev", + "Grid read", "std dev", "Grid write", "std dev"); + grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", + avMean(sRead), avStdDev(sRead), avMean(sWrite), avStdDev(sWrite), + avMean(gRead), avStdDev(gRead), avMean(gWrite), avStdDev(gWrite)); + MSG << std::endl; + MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)" << std::endl; + MSG << std::endl; + grid_printf("%12s %12s %12s %12s\n", + "std read", "std write", "Grid read", "Grid write"); + grid_printf("%12.1f %12.1f %12.1f %12.1f\n", + avRob(sRead), avRob(sWrite), avRob(gRead), avRob(gWrite)); + + Grid_finalize(); + + return EXIT_SUCCESS; +} +#else +int main(int argc, char **argv) {} +#endif diff --git a/Grid/Benchmark_IO.hpp b/Grid/Benchmark_IO.hpp new file mode 100644 index 0000000..24e2214 --- /dev/null +++ b/Grid/Benchmark_IO.hpp @@ -0,0 +1,273 @@ +/* +Copyright © 2022 Antonin Portelli + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . 
+*/ + +#ifndef Benchmark_IO_hpp_ +#define Benchmark_IO_hpp_ + +#include +#define MSG std::cout << GridLogMessage +#define SEP \ + "-----------------------------------------------------------------------------" +#define BIGSEP \ + "=============================================================================" +#ifdef HAVE_LIME + +namespace Grid +{ + + template + using WriterFn = std::function; + template + using ReaderFn = std::function; + + // AP 06/10/2020: Standard C version in case one is suspicious of the C++ API + // + // template + // void stdWrite(const std::string filestem, Field &vec) + // { + // std::string rankStr = std::to_string(vec.Grid()->ThisRank()); + // std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "wb"); + // size_t size; + // uint32_t crc; + // GridStopWatch ioWatch, crcWatch; + + // size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object); + // autoView(vec_v, vec, CpuRead); + // crcWatch.Start(); + // crc = GridChecksum::crc32(vec_v.cpu_ptr, size); + // std::fwrite(&crc, sizeof(uint32_t), 1, file); + // crcWatch.Stop(); + // MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl; + // ioWatch.Start(); + // std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file); + // ioWatch.Stop(); + // std::fclose(file); + // size *= vec.Grid()->ProcessorCount(); + // auto &p = BinaryIO::lastPerf; + // p.size = size; + // p.time = ioWatch.useconds(); + // p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6); + // MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() + // << ", " << p.mbytesPerSecond << " MB/s" << std::endl; + // MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl; + // } + // + // template + // void stdRead(Field &vec, const std::string filestem) + // { + // std::string rankStr = std::to_string(vec.Grid()->ThisRank()); + // std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "rb"); + // size_t size; + // uint32_t crcRead, crcData; + // GridStopWatch ioWatch, crcWatch; + + // size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object); + // crcWatch.Start(); + // std::fread(&crcRead, sizeof(uint32_t), 1, file); + // crcWatch.Stop(); + // { + // autoView(vec_v, vec, CpuWrite); + // ioWatch.Start(); + // std::fread(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file); + // ioWatch.Stop(); + // std::fclose(file); + // } + // { + // autoView(vec_v, vec, CpuRead); + // crcWatch.Start(); + // crcData = GridChecksum::crc32(vec_v.cpu_ptr, size); + // crcWatch.Stop(); + // } + // MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl; + // assert(crcData == crcRead); + // size *= vec.Grid()->ProcessorCount(); + // auto &p = BinaryIO::lastPerf; + // p.size = size; + // p.time = ioWatch.useconds(); + // p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6); + // MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() + // << ", " << p.mbytesPerSecond << " MB/s" << std::endl; + // MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl; + // } + + template + void stdWrite(const std::string filestem, Field &vec) + { + std::string rankStr = std::to_string(vec.Grid()->ThisRank()); + std::ofstream file(filestem + "." 
+ rankStr + ".bin", std::ios::out | std::ios::binary); + size_t size, sizec; + uint32_t crc; + GridStopWatch ioWatch, crcWatch; + + size = vec.Grid()->lSites() * sizeof(typename Field::scalar_object); + sizec = size / sizeof(char); // just in case of... + autoView(vec_v, vec, CpuRead); + crcWatch.Start(); + crc = GridChecksum::crc32(vec_v.cpu_ptr, size); + file.write(reinterpret_cast(&crc), sizeof(uint32_t) / sizeof(char)); + crcWatch.Stop(); + MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl; + ioWatch.Start(); + file.write(reinterpret_cast(vec_v.cpu_ptr), sizec); + file.flush(); + ioWatch.Stop(); + size *= vec.Grid()->ProcessorCount(); + auto &p = BinaryIO::lastPerf; + p.size = size; + p.time = ioWatch.useconds(); + p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6); + MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() + << ", " << p.mbytesPerSecond << " MB/s" << std::endl; + MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl; + } + + template + void stdRead(Field &vec, const std::string filestem) + { + std::string rankStr = std::to_string(vec.Grid()->ThisRank()); + std::ifstream file(filestem + "." + rankStr + ".bin", std::ios::in | std::ios::binary); + size_t size, sizec; + uint32_t crcRead, crcData; + GridStopWatch ioWatch, crcWatch; + + size = vec.Grid()->lSites() * sizeof(typename Field::scalar_object); + sizec = size / sizeof(char); // just in case of... + crcWatch.Start(); + file.read(reinterpret_cast(&crcRead), sizeof(uint32_t) / sizeof(char)); + crcWatch.Stop(); + { + autoView(vec_v, vec, CpuWrite); + ioWatch.Start(); + file.read(reinterpret_cast(vec_v.cpu_ptr), sizec); + ioWatch.Stop(); + } + { + autoView(vec_v, vec, CpuRead); + crcWatch.Start(); + crcData = GridChecksum::crc32(vec_v.cpu_ptr, size); + crcWatch.Stop(); + } + MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl; + assert(crcData == crcRead); + size *= vec.Grid()->ProcessorCount(); + auto &p = BinaryIO::lastPerf; + p.size = size; + p.time = ioWatch.useconds(); + p.mbytesPerSecond = size / 1024. / 1024. 
/ (ioWatch.useconds() / 1.e6); + MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() + << ", " << p.mbytesPerSecond << " MB/s" << std::endl; + MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl; + } + + template + void limeWrite(const std::string filestem, Field &vec) + { + emptyUserRecord record; + ScidacWriter binWriter(vec.Grid()->IsBoss()); + + binWriter.open(filestem + ".lime.bin"); + binWriter.writeScidacFieldRecord(vec, record); + binWriter.close(); + } + + template + void limeRead(Field &vec, const std::string filestem) + { + emptyUserRecord record; + ScidacReader binReader; + + binReader.open(filestem + ".lime.bin"); + binReader.readScidacFieldRecord(vec, record); + binReader.close(); + } + + inline void makeGrid(std::shared_ptr &gPt, + const std::shared_ptr &gBasePt, + const unsigned int Ls = 1, const bool rb = false) + { + if (rb) + { + if (Ls > 1) + { + gPt.reset(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, gBasePt.get())); + } + else + { + gPt.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(gBasePt.get())); + } + } + else + { + if (Ls > 1) + { + gPt.reset(SpaceTimeGrid::makeFiveDimGrid(Ls, gBasePt.get())); + } + else + { + gPt = gBasePt; + } + } + } + + template + void writeBenchmark(const Coordinate &latt, const std::string filename, + const WriterFn &write, + const unsigned int Ls = 1, const bool rb = false) + { + auto mpi = GridDefaultMpi(); + auto simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd()); + std::shared_ptr gBasePt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi)); + std::shared_ptr gPt; + std::random_device rd; + + makeGrid(gPt, gBasePt, Ls, rb); + + GridBase *g = gPt.get(); + GridParallelRNG rng(g); + Field vec(g); + + rng.SeedFixedIntegers({static_cast(rd()), static_cast(rd()), + static_cast(rd()), static_cast(rd()), + static_cast(rd()), static_cast(rd()), + static_cast(rd()), static_cast(rd())}); + + random(rng, vec); + write(filename, vec); + } + + template + void readBenchmark(const Coordinate &latt, const std::string filename, + const ReaderFn &read, + const unsigned int Ls = 1, const bool rb = false) + { + auto mpi = GridDefaultMpi(); + auto simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd()); + std::shared_ptr gBasePt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi)); + std::shared_ptr gPt; + + makeGrid(gPt, gBasePt, Ls, rb); + + GridBase *g = gPt.get(); + Field vec(g); + + read(vec, filename); + } + +} + +#endif // LIME +#endif // Benchmark_IO_hpp_ diff --git a/Grid/Benchmark_ITT.cpp b/Grid/Benchmark_ITT.cpp new file mode 100644 index 0000000..e618aff --- /dev/null +++ b/Grid/Benchmark_ITT.cpp @@ -0,0 +1,801 @@ +/* +Copyright © 2015 Peter Boyle +Copyright © 2022 Antonin Portelli + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . 
+*/ + +#include + +using namespace Grid; + +std::vector L_list; +std::vector Ls_list; +std::vector mflop_list; + +double mflop_ref; +double mflop_ref_err; + +int NN_global; + +struct time_statistics +{ + double mean; + double err; + double min; + double max; + + void statistics(std::vector v) + { + double sum = std::accumulate(v.begin(), v.end(), 0.0); + mean = sum / v.size(); + + std::vector diff(v.size()); + std::transform(v.begin(), v.end(), diff.begin(), [=](double x) + { return x - mean; }); + double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0); + err = std::sqrt(sq_sum / (v.size() * (v.size() - 1))); + + auto result = std::minmax_element(v.begin(), v.end()); + min = *result.first; + max = *result.second; + } +}; + +void comms_header() +{ + std::cout << GridLogMessage << " L " + << "\t" + << " Ls " + << "\t" + << "bytes\t MB/s uni (err/min/max) \t\t MB/s bidi (err/min/max)" << std::endl; +}; + +Gamma::Algebra Gmu[] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT}; +struct controls +{ + int Opt; + int CommsOverlap; + Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch; +}; + +class Benchmark +{ +public: + static void Decomposition(void) + { + + int threads = GridThread::GetThreads(); + std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage << "= Grid is setup to use " << threads << " threads" << std::endl; + std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage << "Grid Default Decomposition patterns\n"; + std::cout << GridLogMessage << "\tOpenMP threads : " << GridThread::GetThreads() << std::endl; + std::cout << GridLogMessage << "\tMPI tasks : " << GridCmdVectorIntToString(GridDefaultMpi()) << std::endl; + std::cout << GridLogMessage << "\tvReal : " << sizeof(vReal) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vReal::Nsimd())) << std::endl; + std::cout << GridLogMessage << "\tvRealF : " << sizeof(vRealF) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vRealF::Nsimd())) << std::endl; + std::cout << GridLogMessage << "\tvRealD : " << sizeof(vRealD) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vRealD::Nsimd())) << std::endl; + std::cout << GridLogMessage << "\tvComplex : " << sizeof(vComplex) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vComplex::Nsimd())) << std::endl; + std::cout << GridLogMessage << "\tvComplexF : " << sizeof(vComplexF) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vComplexF::Nsimd())) << std::endl; + std::cout << GridLogMessage << "\tvComplexD : " << sizeof(vComplexD) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vComplexD::Nsimd())) << std::endl; + std::cout << GridLogMessage << "==================================================================================" << std::endl; + } + + static void Comms(void) + { + int Nloop = 200; + int nmu = 0; + int maxlat = 32; + + Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + + for (int mu = 0; mu < Nd; mu++) + if (mpi_layout[mu] > 1) + nmu++; + + std::vector t_time(Nloop); + time_statistics timestat; + + std::cout << GridLogMessage << "====================================================================================================" << std::endl; + 
std::cout << GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in " << nmu << " dimensions" << std::endl; + std::cout << GridLogMessage << "====================================================================================================" << std::endl; + comms_header(); + + for (int lat = 16; lat <= maxlat; lat += 8) + { + // for(int Ls=8;Ls<=8;Ls*=2){ + { + int Ls = 12; + + Coordinate latt_size({lat * mpi_layout[0], + lat * mpi_layout[1], + lat * mpi_layout[2], + lat * mpi_layout[3]}); + + GridCartesian Grid(latt_size, simd_layout, mpi_layout); + RealD Nrank = Grid._Nprocessors; + RealD Nnode = Grid.NodeCount(); + RealD ppn = Nrank / Nnode; + + std::vector xbuf(8); + std::vector rbuf(8); + // Grid.ShmBufferFreeAll(); + uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD); + for (int d = 0; d < 8; d++) + { + xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); + rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); + // bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + // bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + } + + // int ncomm; + double dbytes; + + for (int dir = 0; dir < 8; dir++) + { + int mu = dir % 4; + if (mpi_layout[mu] > 1) + { + + std::vector times(Nloop); + for (int i = 0; i < Nloop; i++) + { + + dbytes = 0; + double start = usecond(); + int xmit_to_rank; + int recv_from_rank; + + if (dir == mu) + { + int comm_proc = 1; + Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); + } + else + { + int comm_proc = mpi_layout[mu] - 1; + Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); + } + Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, + (void *)&rbuf[dir][0], recv_from_rank, + bytes); + dbytes += bytes; + + double stop = usecond(); + t_time[i] = stop - start; // microseconds + } + timestat.statistics(t_time); + + dbytes = dbytes * ppn; + double xbytes = dbytes * 0.5; + double bidibytes = dbytes; + + std::cout << GridLogMessage << lat << "\t" << Ls << "\t " + << bytes << " \t " + << xbytes / timestat.mean << " \t " << xbytes * timestat.err / (timestat.mean * timestat.mean) << " \t " + << xbytes / timestat.max << " " << xbytes / timestat.min + << "\t\t" << bidibytes / timestat.mean << " " << bidibytes * timestat.err / (timestat.mean * timestat.mean) << " " + << bidibytes / timestat.max << " " << bidibytes / timestat.min << std::endl; + } + } + for (int d = 0; d < 8; d++) + { + acceleratorFreeDevice(xbuf[d]); + acceleratorFreeDevice(rbuf[d]); + } + } + } + return; + } + + static void Memory(void) + { + const int Nvec = 8; + typedef Lattice> LatticeVec; + typedef iVector Vec; + + Coordinate simd_layout = GridDefaultSimd(Nd, vReal::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + + std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage << "= Benchmarking a*x + y bandwidth" << std::endl; + std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage << " L " + << "\t\t" + << "bytes" + << "\t\t\t" + << "GB/s" + << "\t\t" + << "Gflop/s" + << "\t\t seconds" + << "\t\tGB/s / node" << std::endl; + std::cout << GridLogMessage << "----------------------------------------------------------" << std::endl; + + // uint64_t NP; + uint64_t NN; + + uint64_t lmax = 32; +#define NLOOP (1000 * lmax * lmax * lmax * lmax / lat / lat / lat / lat) + + 
GridSerialRNG sRNG; + sRNG.SeedFixedIntegers(std::vector({45, 12, 81, 9})); + for (int lat = 8; lat <= lmax; lat += 8) + { + + Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2], lat * mpi_layout[3]}); + int64_t vol = latt_size[0] * latt_size[1] * latt_size[2] * latt_size[3]; + + GridCartesian Grid(latt_size, simd_layout, mpi_layout); + + // NP= Grid.RankCount(); + NN = Grid.NodeCount(); + + Vec rn; + random(sRNG, rn); + + LatticeVec z(&Grid); + z = Zero(); + LatticeVec x(&Grid); + x = Zero(); + LatticeVec y(&Grid); + y = Zero(); + double a = 2.0; + + uint64_t Nloop = NLOOP; + + double start = usecond(); + for (int i = 0; i < Nloop; i++) + { + z = a * x - y; + } + double stop = usecond(); + double time = (stop - start) / Nloop * 1000; + + double flops = vol * Nvec * 2; // mul,add + double bytes = 3.0 * vol * Nvec * sizeof(Real); + std::cout << GridLogMessage << std::setprecision(3) + << lat << "\t\t" << bytes << " \t\t" << bytes / time << "\t\t" << flops / time << "\t\t" << (stop - start) / 1000. / 1000. + << "\t\t" << bytes / time / NN << std::endl; + } + }; + + static void SU4(void) + { + const int Nc4 = 4; + typedef Lattice> LatticeSU4; + + Coordinate simd_layout = GridDefaultSimd(Nd, vComplexF::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + + std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage << "= Benchmarking z = y*x SU(4) bandwidth" << std::endl; + std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage << " L " + << "\t\t" + << "bytes" + << "\t\t\t" + << "GB/s" + << "\t\t" + << "Gflop/s" + << "\t\t seconds" + << "\t\tGB/s / node" << std::endl; + std::cout << GridLogMessage << "----------------------------------------------------------" << std::endl; + + uint64_t NN; + + uint64_t lmax = 32; + + GridSerialRNG sRNG; + sRNG.SeedFixedIntegers(std::vector({45, 12, 81, 9})); + for (int lat = 8; lat <= lmax; lat += 8) + { + + Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2], lat * mpi_layout[3]}); + int64_t vol = latt_size[0] * latt_size[1] * latt_size[2] * latt_size[3]; + + GridCartesian Grid(latt_size, simd_layout, mpi_layout); + + NN = Grid.NodeCount(); + + LatticeSU4 z(&Grid); + z = Zero(); + LatticeSU4 x(&Grid); + x = Zero(); + LatticeSU4 y(&Grid); + y = Zero(); + // double a=2.0; + + uint64_t Nloop = NLOOP; + + double start = usecond(); + for (int i = 0; i < Nloop; i++) + { + z = x * y; + } + double stop = usecond(); + double time = (stop - start) / Nloop * 1000; + + double flops = vol * Nc4 * Nc4 * (6 + (Nc4 - 1) * 8); // mul,add + double bytes = 3.0 * vol * Nc4 * Nc4 * 2 * sizeof(RealF); + std::cout << GridLogMessage << std::setprecision(3) + << lat << "\t\t" << bytes << " \t\t" << bytes / time << "\t\t" << flops / time << "\t\t" << (stop - start) / 1000. / 1000. 
+ << "\t\t" << bytes / time / NN << std::endl; + } + }; + + static double DWF(int Ls, int L) + { + RealD mass = 0.1; + RealD M5 = 1.8; + + double mflops; + double mflops_best = 0; + double mflops_worst = 0; + std::vector mflops_all; + + /////////////////////////////////////////////////////// + // Set/Get the layout & grid size + /////////////////////////////////////////////////////// + int threads = GridThread::GetThreads(); + Coordinate mpi = GridDefaultMpi(); + assert(mpi.size() == 4); + Coordinate local({L, L, L, L}); + Coordinate latt4({local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]}); + + GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4, + GridDefaultSimd(Nd, vComplex::Nsimd()), + GridDefaultMpi()); + uint64_t NP = TmpGrid->RankCount(); + uint64_t NN = TmpGrid->NodeCount(); + NN_global = NN; + uint64_t SHM = NP / NN; + + ///////// Welcome message //////////// + std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage << "Benchmark DWF on " << L << "^4 local volume " << std::endl; + std::cout << GridLogMessage << "* Nc : " << Nc << std::endl; + std::cout << GridLogMessage << "* Global volume : " << GridCmdVectorIntToString(latt4) << std::endl; + std::cout << GridLogMessage << "* Ls : " << Ls << std::endl; + std::cout << GridLogMessage << "* ranks : " << NP << std::endl; + std::cout << GridLogMessage << "* nodes : " << NN << std::endl; + std::cout << GridLogMessage << "* ranks/node : " << SHM << std::endl; + std::cout << GridLogMessage << "* ranks geom : " << GridCmdVectorIntToString(mpi) << std::endl; + std::cout << GridLogMessage << "* Using " << threads << " threads" << std::endl; + std::cout << GridLogMessage << "==================================================================================" << std::endl; + + ///////// Lattice Init //////////// + GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi()); + GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid); + GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid); + + ///////// RNG Init //////////// + std::vector seeds4({1, 2, 3, 4}); + std::vector seeds5({5, 6, 7, 8}); + GridParallelRNG RNG4(UGrid); + RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG RNG5(FGrid); + RNG5.SeedFixedIntegers(seeds5); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + typedef DomainWallFermionF Action; + typedef typename Action::FermionField Fermion; + typedef LatticeGaugeFieldF Gauge; + + ///////// Source preparation //////////// + Gauge Umu(UGrid); + SU::HotConfiguration(RNG4, Umu); + Fermion src(FGrid); + random(RNG5, src); + Fermion src_e(FrbGrid); + Fermion src_o(FrbGrid); + Fermion r_e(FrbGrid); + Fermion r_o(FrbGrid); + Fermion r_eo(FGrid); + Action Dw(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5); + + { + + pickCheckerboard(Even, src_e, src); + pickCheckerboard(Odd, src_o, src); + + const int num_cases = 4; + std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S "); + + controls Cases[] = { + {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicyConcurrent}, + {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicyConcurrent}, + {WilsonKernelsStatic::OptGeneric, 
WilsonKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicySequential}, + {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicySequential}}; + + for (int c = 0; c < num_cases; c++) + { + + WilsonKernelsStatic::Comms = Cases[c].CommsOverlap; + WilsonKernelsStatic::Opt = Cases[c].Opt; + CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch); + + std::cout << GridLogMessage << "==================================================================================" << std::endl; + if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric) + std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl; + if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute) + std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl; + if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) + std::cout << GridLogMessage << "* Using sequential Comms/Compute" << std::endl; + std::cout << GridLogMessage << "* SINGLE precision " << std::endl; + std::cout << GridLogMessage << "==================================================================================" << std::endl; + + int nwarm = 10; + double t0 = usecond(); + FGrid->Barrier(); + for (int i = 0; i < nwarm; i++) + { + Dw.DhopEO(src_o, r_e, DaggerNo); + } + FGrid->Barrier(); + double t1 = usecond(); + uint64_t ncall = 500; + + FGrid->Broadcast(0, &ncall, sizeof(ncall)); + + // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); + for (uint64_t i = 0; i < ncall; i++) + { + t0 = usecond(); + Dw.DhopEO(src_o, r_e, DaggerNo); + t1 = usecond(); + t_time[i] = t1 - t0; + } + FGrid->Barrier(); + + double volume = Ls; + for (int mu = 0; mu < Nd; mu++) + volume = volume * latt4[mu]; + + // Nc=3 gives + // 1344= 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8 + // 1344 = Nc* (6+(Nc-1)*8)*2*Nd + Nd*Nc*2*2 + Nd*Nc*Ns*2 + // double flops=(1344.0*volume)/2; +#if 0 + double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + Nd*Nc*Ns + Nd*Nc*Ns*2; +#else + double fps = Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2; +#endif + double flops = (fps * volume) / 2; + double mf_hi, mf_lo, mf_err; + + timestat.statistics(t_time); + mf_hi = flops / timestat.min; + mf_lo = flops / timestat.max; + mf_err = flops / timestat.min * timestat.err / timestat.mean; + + mflops = flops / timestat.mean; + mflops_all.push_back(mflops); + if (mflops_best == 0) + mflops_best = mflops; + if (mflops_worst == 0) + mflops_worst = mflops; + if (mflops > mflops_best) + mflops_best = mflops; + if (mflops < mflops_worst) + mflops_worst = mflops; + + std::cout << GridLogMessage << "Deo FlopsPerSite is " << fps << std::endl; + std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s = " << mflops << " (" << mf_err << ") " << mf_lo << "-" << mf_hi << std::endl; + std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s per rank " << mflops / NP << std::endl; + std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s per node " << mflops / NN << std::endl; + } + + std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage << L << "^4 x " << Ls << " Deo Best mflop/s = " << mflops_best << " ; " << mflops_best / NN << " per node " << std::endl; + std::cout << GridLogMessage << L << "^4 x " << Ls << " Deo Worst mflop/s = " << mflops_worst << 
" ; " << mflops_worst / NN << " per node " << std::endl; + std::cout << GridLogMessage << fmt << std::endl; + std::cout << GridLogMessage; + + for (int i = 0; i < mflops_all.size(); i++) + { + std::cout << mflops_all[i] / NN << " ; "; + } + std::cout << std::endl; + std::cout << GridLogMessage << "==================================================================================" << std::endl; + } + return mflops_best; + } + + static double Staggered(int L) + { + double mflops; + double mflops_best = 0; + double mflops_worst = 0; + std::vector mflops_all; + + /////////////////////////////////////////////////////// + // Set/Get the layout & grid size + /////////////////////////////////////////////////////// + int threads = GridThread::GetThreads(); + Coordinate mpi = GridDefaultMpi(); + assert(mpi.size() == 4); + Coordinate local({L, L, L, L}); + Coordinate latt4({local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]}); + + GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4, + GridDefaultSimd(Nd, vComplex::Nsimd()), + GridDefaultMpi()); + uint64_t NP = TmpGrid->RankCount(); + uint64_t NN = TmpGrid->NodeCount(); + NN_global = NN; + uint64_t SHM = NP / NN; + + ///////// Welcome message //////////// + std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage << "Benchmark ImprovedStaggered on " << L << "^4 local volume " << std::endl; + std::cout << GridLogMessage << "* Global volume : " << GridCmdVectorIntToString(latt4) << std::endl; + std::cout << GridLogMessage << "* ranks : " << NP << std::endl; + std::cout << GridLogMessage << "* nodes : " << NN << std::endl; + std::cout << GridLogMessage << "* ranks/node : " << SHM << std::endl; + std::cout << GridLogMessage << "* ranks geom : " << GridCmdVectorIntToString(mpi) << std::endl; + std::cout << GridLogMessage << "* Using " << threads << " threads" << std::endl; + std::cout << GridLogMessage << "==================================================================================" << std::endl; + + ///////// Lattice Init //////////// + GridCartesian *FGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi()); + GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid); + + ///////// RNG Init //////////// + std::vector seeds4({1, 2, 3, 4}); + GridParallelRNG RNG4(FGrid); + RNG4.SeedFixedIntegers(seeds4); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + RealD mass = 0.1; + RealD c1 = 9.0 / 8.0; + RealD c2 = -1.0 / 24.0; + RealD u0 = 1.0; + + typedef ImprovedStaggeredFermionF Action; + typedef typename Action::FermionField Fermion; + typedef LatticeGaugeFieldF Gauge; + + Gauge Umu(FGrid); + SU::HotConfiguration(RNG4, Umu); + + typename Action::ImplParams params; + Action Ds(Umu, Umu, *FGrid, *FrbGrid, mass, c1, c2, u0, params); + + ///////// Source preparation //////////// + Fermion src(FGrid); + random(RNG4, src); + Fermion src_e(FrbGrid); + Fermion src_o(FrbGrid); + Fermion r_e(FrbGrid); + Fermion r_o(FrbGrid); + Fermion r_eo(FGrid); + + { + + pickCheckerboard(Even, src_e, src); + pickCheckerboard(Odd, src_o, src); + + const int num_cases = 4; + std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S "); + + controls Cases[] = { + {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicyConcurrent}, + {StaggeredKernelsStatic::OptGeneric, 
StaggeredKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicyConcurrent}, + {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicySequential}, + {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicySequential}}; + + for (int c = 0; c < num_cases; c++) + { + + StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap; + StaggeredKernelsStatic::Opt = Cases[c].Opt; + CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch); + + std::cout << GridLogMessage << "==================================================================================" << std::endl; + if (StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric) + std::cout << GridLogMessage << "* Using GENERIC Nc StaggeredKernels" << std::endl; + if (StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute) + std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl; + if (StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsThenCompute) + std::cout << GridLogMessage << "* Using sequential Comms/Compute" << std::endl; + std::cout << GridLogMessage << "* SINGLE precision " << std::endl; + std::cout << GridLogMessage << "==================================================================================" << std::endl; + + int nwarm = 10; + double t0 = usecond(); + FGrid->Barrier(); + for (int i = 0; i < nwarm; i++) + { + Ds.DhopEO(src_o, r_e, DaggerNo); + } + FGrid->Barrier(); + double t1 = usecond(); + uint64_t ncall = 500; + + FGrid->Broadcast(0, &ncall, sizeof(ncall)); + + // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); + for (uint64_t i = 0; i < ncall; i++) + { + t0 = usecond(); + Ds.DhopEO(src_o, r_e, DaggerNo); + t1 = usecond(); + t_time[i] = t1 - t0; + } + FGrid->Barrier(); + + double volume = 1; + for (int mu = 0; mu < Nd; mu++) + volume = volume * latt4[mu]; + double flops = (1146.0 * volume) / 2; + double mf_hi, mf_lo, mf_err; + + timestat.statistics(t_time); + mf_hi = flops / timestat.min; + mf_lo = flops / timestat.max; + mf_err = flops / timestat.min * timestat.err / timestat.mean; + + mflops = flops / timestat.mean; + mflops_all.push_back(mflops); + if (mflops_best == 0) + mflops_best = mflops; + if (mflops_worst == 0) + mflops_worst = mflops; + if (mflops > mflops_best) + mflops_best = mflops; + if (mflops < mflops_worst) + mflops_worst = mflops; + + std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s = " << mflops << " (" << mf_err << ") " << mf_lo << "-" << mf_hi << std::endl; + std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s per rank " << mflops / NP << std::endl; + std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s per node " << mflops / NN << std::endl; + } + + std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage << L << "^4 Deo Best mflop/s = " << mflops_best << " ; " << mflops_best / NN << " per node " << std::endl; + std::cout << GridLogMessage << L << "^4 Deo Worst mflop/s = " << mflops_worst << " ; " << mflops_worst / NN << " per node " << std::endl; + std::cout << GridLogMessage << fmt << std::endl; + std::cout << GridLogMessage; + + for (int i = 0; i < mflops_all.size(); i++) + { + std::cout << mflops_all[i] / NN << " ; "; + } + std::cout << std::endl; + } + 
std::cout << GridLogMessage << "==================================================================================" << std::endl; + return mflops_best; + } +}; + +int main(int argc, char **argv) +{ + Grid_init(&argc, &argv); + + CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential); +#ifdef KNL + LebesgueOrder::Block = std::vector({8, 2, 2, 2}); +#else + LebesgueOrder::Block = std::vector({2, 2, 2, 2}); +#endif + Benchmark::Decomposition(); + + int do_su4 = 1; + int do_memory = 1; + int do_comms = 1; + + int sel = 4; + std::vector L_list({8, 12, 16, 24, 32}); + int selm1 = sel - 1; + + std::vector wilson; + std::vector dwf4; + std::vector staggered; + + int Ls = 1; + std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage << " Wilson dslash 4D vectorised" << std::endl; + std::cout << GridLogMessage << "==================================================================================" << std::endl; + for (int l = 0; l < L_list.size(); l++) + { + wilson.push_back(Benchmark::DWF(Ls, L_list[l])); + } + + Ls = 12; + std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage << " Domain wall dslash 4D vectorised" << std::endl; + std::cout << GridLogMessage << "==================================================================================" << std::endl; + for (int l = 0; l < L_list.size(); l++) + { + double result = Benchmark::DWF(Ls, L_list[l]); + dwf4.push_back(result); + } + + std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage << " Improved Staggered dslash 4D vectorised" << std::endl; + std::cout << GridLogMessage << "==================================================================================" << std::endl; + for (int l = 0; l < L_list.size(); l++) + { + double result = Benchmark::Staggered(L_list[l]); + staggered.push_back(result); + } + + std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage << " Summary table Ls=" << Ls << std::endl; + std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\t Staggered" << std::endl; + for (int l = 0; l < L_list.size(); l++) + { + std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] << " \t\t " << dwf4[l] << " \t\t " << staggered[l] << std::endl; + } + std::cout << GridLogMessage << "==================================================================================" << std::endl; + + int NN = NN_global; + if (do_memory) + { + std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage << " Memory benchmark " << std::endl; + std::cout << GridLogMessage << "==================================================================================" << std::endl; + Benchmark::Memory(); + } + + if (do_su4) + { + std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage << " SU(4) benchmark " << std::endl; + std::cout << GridLogMessage << 
"==================================================================================" << std::endl; + Benchmark::SU4(); + } + + if (do_comms) + { + std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage << " Communications benchmark " << std::endl; + std::cout << GridLogMessage << "==================================================================================" << std::endl; + Benchmark::Comms(); + } + + std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage << " Per Node Summary table Ls=" << Ls << std::endl; + std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage << " L \t\t Wilson\t\t DWF4\t\t Staggered " << std::endl; + for (int l = 0; l < L_list.size(); l++) + { + std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] / NN << " \t " << dwf4[l] / NN << " \t " << staggered[l] / NN << std::endl; + } + std::cout << GridLogMessage << "==================================================================================" << std::endl; + + std::cout << GridLogMessage << "==================================================================================" << std::endl; + std::cout << GridLogMessage << " Comparison point result: " << 0.5 * (dwf4[sel] + dwf4[selm1]) / NN << " Mflop/s per node" << std::endl; + std::cout << GridLogMessage << " Comparison point is 0.5*(" << dwf4[sel] / NN << "+" << dwf4[selm1] / NN << ") " << std::endl; + std::cout << std::setprecision(3); + std::cout << GridLogMessage << "==================================================================================" << std::endl; + + Grid_finalize(); +} diff --git a/Grid/Benchmark_comms_host_device.cpp b/Grid/Benchmark_comms_host_device.cpp new file mode 100644 index 0000000..56227c4 --- /dev/null +++ b/Grid/Benchmark_comms_host_device.cpp @@ -0,0 +1,251 @@ +/* +Copyright © 2015 Peter Boyle +Copyright © 2022 Antonin Portelli + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . 
+*/ + +#include + +using namespace std; +using namespace Grid; + +struct time_statistics{ + double mean; + double err; + double min; + double max; + + void statistics(std::vector v){ + double sum = std::accumulate(v.begin(), v.end(), 0.0); + mean = sum / v.size(); + + std::vector diff(v.size()); + std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; }); + double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0); + err = std::sqrt(sq_sum / (v.size()*(v.size() - 1))); + + auto result = std::minmax_element(v.begin(), v.end()); + min = *result.first; + max = *result.second; +} +}; + +void header(){ + std::cout <1) nmu++; + + std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl; + std::vector t_time(Nloop); + // time_statistics timestat; + + std::cout< > xbuf(8); + std::vector > rbuf(8); + + for(int mu=0;mu<8;mu++){ + xbuf[mu].resize(lat*lat*lat*Ls); + rbuf[mu].resize(lat*lat*lat*Ls); + } + uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + + int ncomm; + + for(int mu=0;mu<4;mu++){ + if (mpi_layout[mu]>1 ) { + double start=usecond(); + for(int i=0;i requests; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + Grid.SendToRecvFrom((void *)&xbuf[mu][0], + xmit_to_rank, + (void *)&rbuf[mu][0], + recv_from_rank, + bytes); + } + + comm_proc = mpi_layout[mu]-1; + { + std::vector requests; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + Grid.SendToRecvFrom((void *)&xbuf[mu+4][0], + xmit_to_rank, + (void *)&rbuf[mu+4][0], + recv_from_rank, + bytes); + } + } + Grid.Barrier(); + double stop=usecond(); + double mean=(stop-start)/Nloop; + double dbytes = bytes*ppn; + double xbytes = dbytes*2.0*ncomm; + double rbytes = xbytes; + double bidibytes = xbytes+rbytes; + + std::cout< xbuf(8); + std::vector rbuf(8); + + uint64_t bytes = lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + for(int d=0;d<8;d++){ + xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); + rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); + } + + int ncomm; + + for(int mu=0;mu<4;mu++){ + if (mpi_layout[mu]>1 ) { + double start=usecond(); + for(int i=0;i requests; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + Grid.SendToRecvFrom((void *)&xbuf[mu][0], + xmit_to_rank, + (void *)&rbuf[mu][0], + recv_from_rank, + bytes); + } + + comm_proc = mpi_layout[mu]-1; + { + std::vector requests; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + Grid.SendToRecvFrom((void *)&xbuf[mu+4][0], + xmit_to_rank, + (void *)&rbuf[mu+4][0], + recv_from_rank, + bytes); + } + } + Grid.Barrier(); + double stop=usecond(); + double mean=(stop-start)/Nloop; + double dbytes = bytes*ppn; + double xbytes = dbytes*2.0*ncomm; + double rbytes = xbytes; + double bidibytes = xbytes+rbytes; + + std::cout< +Copyright © 2022 Antonin Portelli + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . 
+*/ + +#include +#ifdef GRID_CUDA +#define CUDA_PROFILE +#endif + +#ifdef CUDA_PROFILE +#include +#endif + +using namespace std; +using namespace Grid; + +template +struct scal +{ + d internal; +}; + +Gamma::Algebra Gmu[] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT}; + +int main(int argc, char **argv) +{ + Grid_init(&argc, &argv); + + int threads = GridThread::GetThreads(); + + Coordinate latt4 = GridDefaultLatt(); + int Ls = 16; + for (int i = 0; i < argc; i++) + if (std::string(argv[i]) == "-Ls") + { + std::stringstream ss(argv[i + 1]); + ss >> Ls; + } + + GridLogLayout(); + + long unsigned int single_site_flops = 8 * Nc * (7 + 16 * Nc); + + GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi()); + GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid); + GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid); + + std::cout << GridLogMessage << "Making s innermost grids" << std::endl; + GridCartesian *sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(), GridDefaultMpi()); + GridRedBlackCartesian *sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid); + GridCartesian *sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls, UGrid); + GridRedBlackCartesian *sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls, UGrid); + + std::vector seeds4({1, 2, 3, 4}); + std::vector seeds5({5, 6, 7, 8}); + + std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; + GridParallelRNG RNG4(UGrid); + RNG4.SeedUniqueString(std::string("The 4D RNG")); + std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; + GridParallelRNG RNG5(FGrid); + RNG5.SeedUniqueString(std::string("The 5D RNG")); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + LatticeFermionF src(FGrid); + random(RNG5, src); +#if 0 + src = Zero(); + { + Coordinate origin({0,0,0,latt4[2]-1,0}); + SpinColourVectorF tmp; + tmp=Zero(); + tmp()(0)(0)=Complex(-2.0,0.0); + std::cout << " source site 0 " << tmp<::HotConfiguration(RNG4, Umu); + std::cout << GridLogMessage << "Random gauge initialised " << std::endl; +#if 0 + Umu=1.0; + for(int mu=0;mu(Umu,mu); + // if (mu !=2 ) ttmp = 0; + // ttmp = ttmp* pow(10.0,mu); + PokeIndex(Umu,ttmp,mu); + } + std::cout << GridLogMessage << "Forced to diagonal " << std::endl; +#endif + + //////////////////////////////////// + // Naive wilson implementation + //////////////////////////////////// + // replicate across fifth dimension + // LatticeGaugeFieldF Umu5d(FGrid); + std::vector U(4, UGrid); + for (int mu = 0; mu < Nd; mu++) + { + U[mu] = PeekIndex(Umu, mu); + } + std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl; + + if (1) + { + ref = Zero(); + for (int mu = 0; mu < Nd; mu++) + { + + tmp = Cshift(src, mu + 1, 1); + { + autoView(tmp_v, tmp, CpuWrite); + autoView(U_v, U[mu], CpuRead); + for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++) + { + for (int s = 0; s < Ls; s++) + { + tmp_v[Ls * ss + s] = U_v[ss] * tmp_v[Ls * ss + s]; + } + } + } + ref = ref + tmp - Gamma(Gmu[mu]) * tmp; + + { + autoView(tmp_v, tmp, CpuWrite); + autoView(U_v, U[mu], CpuRead); + autoView(src_v, src, CpuRead); + for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++) + { + for (int s = 0; s < Ls; s++) + { + tmp_v[Ls * ss + s] = adj(U_v[ss]) * src_v[Ls * ss + s]; + } + } + } + tmp = Cshift(tmp, mu + 1, -1); + 
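+      // At this point tmp holds U_mu^dag(x - mu) * src(x - mu), so the line below
+      // accumulates the backward hop (1 + gamma_mu) U_mu^dag(x - mu) src(x - mu);
+      // together with the forward (1 - gamma_mu) U_mu(x) src(x + mu) term added
+      // above and the overall -0.5 factor applied after the loop, this builds the
+      // naive Wilson hopping term used as a reference to check Dw.Dhop. The shift
+      // direction is mu + 1 because the four spacetime directions are shifted in
+      // dimensions 1-4 of the five-dimensional field, dimension 0 being s (Ls).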
ref = ref + tmp + Gamma(Gmu[mu]) * tmp; + } + ref = -0.5 * ref; + } + + RealD mass = 0.1; + RealD M5 = 1.8; + + RealD NP = UGrid->_Nprocessors; + RealD NN = UGrid->NodeCount(); + + std::cout << GridLogMessage << "*****************************************************************" << std::endl; + std::cout << GridLogMessage << "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" << std::endl; + std::cout << GridLogMessage << "*****************************************************************" << std::endl; + std::cout << GridLogMessage << "*****************************************************************" << std::endl; + std::cout << GridLogMessage << "* Benchmarking DomainWallFermionR::Dhop " << std::endl; + std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd() << std::endl; + std::cout << GridLogMessage << "* VComplexF size is " << sizeof(vComplexF) << " B" << std::endl; + if (sizeof(RealF) == 4) + std::cout << GridLogMessage << "* SINGLE precision " << std::endl; + if (sizeof(RealF) == 8) + std::cout << GridLogMessage << "* DOUBLE precision " << std::endl; +#ifdef GRID_OMP + if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute) + std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl; + if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) + std::cout << GridLogMessage << "* Using sequential comms compute" << std::endl; +#endif + if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric) + std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl; + if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) + std::cout << GridLogMessage << "* Using Nc=3 WilsonKernels" << std::endl; + if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm) + std::cout << GridLogMessage << "* Using Asm Nc=3 WilsonKernels" << std::endl; + std::cout << GridLogMessage << "*****************************************************************" << std::endl; + + DomainWallFermionF Dw(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5); + int ncall = 300; + + if (1) + { + FGrid->Barrier(); + Dw.ZeroCounters(); + Dw.Dhop(src, result, 0); + std::cout << GridLogMessage << "Called warmup" << std::endl; + double t0 = usecond(); + for (int i = 0; i < ncall; i++) + { + __SSC_START; + Dw.Dhop(src, result, 0); + __SSC_STOP; + } + double t1 = usecond(); + FGrid->Barrier(); + + double volume = Ls; + for (int mu = 0; mu < Nd; mu++) + volume = volume * latt4[mu]; + double flops = single_site_flops * volume * ncall; + + auto nsimd = vComplex::Nsimd(); + auto simdwidth = sizeof(vComplex); + + // RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors + double data_rf = volume * ((2 * Nd + 1) * Nd * Nc + 2 * Nd * Nc * Nc) * simdwidth / nsimd * ncall / (1024. * 1024. * 1024.); + + // mem: Nd Wilson * Ls, Nd gauge, Nc colors + double data_mem = (volume * (2 * Nd + 1) * Nd * Nc + (volume / Ls) * 2 * Nd * Nc * Nc) * simdwidth / nsimd * ncall / (1024. * 1024. 
+
+    std::cout << GridLogMessage << "Called Dw " << ncall << " times in " << t1 - t0 << " us" << std::endl;
+    // std::cout << GridLogMessage << "norm result " << norm2(result) << std::endl;
+    // std::cout << GridLogMessage << "norm ref    " << norm2(ref) << std::endl;
+    std::cout << GridLogMessage << "mflop/s = " << flops / (t1 - t0) << std::endl;
+    std::cout << GridLogMessage << "mflop/s per rank " << flops / (t1 - t0) / NP << std::endl;
+    std::cout << GridLogMessage << "mflop/s per node " << flops / (t1 - t0) / NN << std::endl;
+    std::cout << GridLogMessage << "RF  GiB/s (base 2) = " << 1000000. * data_rf / (t1 - t0) << std::endl;
+    std::cout << GridLogMessage << "mem GiB/s (base 2) = " << 1000000. * data_mem / (t1 - t0) << std::endl;
+    err = ref - result;
+    std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
+    if ((norm2(err) > 1.0e-4))
+    {
+      /*
+      std::cout << "RESULT\n " << result << std::endl;
+      std::cout << "REF   \n " << ref << std::endl;
+      std::cout << "ERR   \n " << err << std::endl;
+      */
+      FGrid->Barrier();
+      exit(-1);
+    }
+    assert(norm2(err) < 1.0e-4);
+    Dw.Report();
+  }
+
+  if (1)
+  { // Naive wilson dag implementation
+    ref = Zero();
+    for (int mu = 0; mu < Nd; mu++)
+    {
+      // ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
+      tmp = Cshift(src, mu + 1, 1);
+      {
+        autoView(ref_v, ref, CpuWrite);
+        autoView(tmp_v, tmp, CpuRead);
+        autoView(U_v, U[mu], CpuRead);
+        for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
+        {
+          for (int s = 0; s < Ls; s++)
+          {
+            int i = s + Ls * ss;
+            ref_v[i] += U_v[ss] * (tmp_v[i] + Gamma(Gmu[mu]) * tmp_v[i]);
+          }
+        }
+      }
+
+      {
+        autoView(tmp_v, tmp, CpuWrite);
+        autoView(U_v, U[mu], CpuRead);
+        autoView(src_v, src, CpuRead);
+        for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
+        {
+          for (int s = 0; s < Ls; s++)
+          {
+            tmp_v[Ls * ss + s] = adj(U_v[ss]) * src_v[Ls * ss + s];
+          }
+        }
+      }
+      // tmp = adj(U[mu])*src;
+      tmp = Cshift(tmp, mu + 1, -1);
+      {
+        autoView(ref_v, ref, CpuWrite);
+        autoView(tmp_v, tmp, CpuRead);
+        for (int i = 0; i < ref_v.size(); i++)
+        {
+          ref_v[i] += tmp_v[i] - Gamma(Gmu[mu]) * tmp_v[i];
+        }
+      }
+    }
+    ref = -0.5 * ref;
+  }
+  // dump=1;
+  Dw.Dhop(src, result, 1);
+  std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
+  std::cout << GridLogMessage << "Called DwDag" << std::endl;
+  std::cout << GridLogMessage << "norm dag result " << norm2(result) << std::endl;
+  std::cout << GridLogMessage << "norm dag ref " << norm2(ref) << std::endl;
+  err = ref - result;
+  std::cout << GridLogMessage << "norm dag diff " << norm2(err) << std::endl;
+  if ((norm2(err) > 1.0e-4))
+  {
+    /*
+    std::cout << "DAG RESULT\n " << ref << std::endl;
+    std::cout << "DAG sresult\n " << result << std::endl;
+    std::cout << "DAG ERR   \n " << err << std::endl;
+    */
+  }
+
+  LatticeFermionF src_e(FrbGrid);
+  LatticeFermionF src_o(FrbGrid);
+  LatticeFermionF r_e(FrbGrid);
+  LatticeFermionF r_o(FrbGrid);
+  LatticeFermionF r_eo(FGrid);
+
+  std::cout << GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec" << std::endl;
+  pickCheckerboard(Even, src_e, src);
+  pickCheckerboard(Odd, src_o, src);
+
+  {
+    Dw.ZeroCounters();
+    FGrid->Barrier();
+    Dw.DhopEO(src_o, r_e, DaggerNo);
+    double t0 = usecond();
+    for (int i = 0; i < ncall; i++)
+    {
+#ifdef CUDA_PROFILE
+      if (i == 10)
+        cudaProfilerStart();
+#endif
+      Dw.DhopEO(src_o, r_e, DaggerNo);
+#ifdef CUDA_PROFILE
+      if (i == 20)
+        cudaProfilerStop();
+#endif
+    }
+    double t1 = usecond();
+    FGrid->Barrier();
+
+    double volume = Ls;
+    for (int mu = 0; mu < Nd; mu++)
+      volume = volume * latt4[mu];
+    double flops = (single_site_flops * volume * ncall) / 2.0;
+
+    std::cout << GridLogMessage << "Deo mflop/s = " << flops / (t1 - t0) << std::endl;
+    std::cout << GridLogMessage << "Deo mflop/s per rank " << flops / (t1 - t0) / NP << std::endl;
+    std::cout << GridLogMessage << "Deo mflop/s per node " << flops / (t1 - t0) / NN << std::endl;
+    Dw.Report();
+  }
+  Dw.DhopEO(src_o, r_e, DaggerNo);
+  Dw.DhopOE(src_e, r_o, DaggerNo);
+  Dw.Dhop(src, result, DaggerNo);
+
+  std::cout << GridLogMessage << "r_e" << norm2(r_e) << std::endl;
+  std::cout << GridLogMessage << "r_o" << norm2(r_o) << std::endl;
+  std::cout << GridLogMessage << "res" << norm2(result) << std::endl;
+
+  setCheckerboard(r_eo, r_o);
+  setCheckerboard(r_eo, r_e);
+
+  err = r_eo - result;
+  std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
+  if ((norm2(err) > 1.0e-4))
+  {
+    /*
+    std::cout << "Deo RESULT\n " << r_eo << std::endl;
+    std::cout << "Deo REF   \n " << result << std::endl;
+    std::cout << "Deo ERR   \n " << err << std::endl;
+    */
+  }
+
+  Grid_finalize();
+}
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year> <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/Grid/Makefile.am b/Grid/Makefile.am
new file mode 100644
index 0000000..8557f11
--- /dev/null
+++ b/Grid/Makefile.am
@@ -0,0 +1,12 @@
+ACLOCAL_AMFLAGS = -I .buildutils/m4
+
+bin_PROGRAMS = \
+	Benchmark_comms_host_device \
+	Benchmark_dwf_fp32 \
+	Benchmark_ITT \
+	Benchmark_IO
+
+Benchmark_comms_host_device_SOURCES = Benchmark_comms_host_device.cpp
+Benchmark_dwf_fp32_SOURCES = Benchmark_dwf_fp32.cpp
+Benchmark_ITT_SOURCES = Benchmark_ITT.cpp
+Benchmark_IO_SOURCES = Benchmark_IO.cpp
diff --git a/Grid/bootstrap.sh b/Grid/bootstrap.sh
new file mode 100755
index 0000000..808ebeb
--- /dev/null
+++ b/Grid/bootstrap.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+mkdir -p .buildutils/m4
+autoreconf -fvi
diff --git a/Grid/configure.ac b/Grid/configure.ac
new file mode 100644
index 0000000..e5a5bf5
--- /dev/null
+++ b/Grid/configure.ac
@@ -0,0 +1,58 @@
+AC_PREREQ([2.69])
+AC_INIT([lattice-bench], [0.1], [antonin.portelli@me.com])
+AC_CANONICAL_BUILD
+AC_CANONICAL_HOST
+AC_CANONICAL_TARGET
+AC_CONFIG_SRCDIR([Benchmark_ITT.cpp])
+AC_CONFIG_MACRO_DIR([.buildutils/m4])
+AC_CONFIG_HEADERS([config.h])
+AM_INIT_AUTOMAKE([-Wall -Werror foreign])
+m4_ifdef([AM_SILENT_RULES],[AM_SILENT_RULES([yes])])
+
+# Checks for programs.
+AC_PROG_CXX
+AC_PROG_CC
+AC_PROG_RANLIB
+AM_PROG_AR
+AC_LANG([C++])
+
+AC_ARG_WITH([grid],
+    [AS_HELP_STRING([--with-grid=<prefix>],
+    [try this for a non-standard install prefix of Grid])],
+    [PATH="$with_grid/bin$PATH_SEPARATOR$PATH"]
+    [CXXFLAGS="$CXXFLAGS -I$with_grid/include"]
+    [LDFLAGS="$LDFLAGS -L$with_grid/lib"])
+AC_CHECK_PROG([GRIDCONF],[grid-config],[yes])
+if test x"$GRIDCONF" != x"yes" ; then
+    AC_MSG_ERROR([grid-config not found])
+fi
+CXXFLAGS="$CXXFLAGS `grid-config --cxxflags`"
+LDFLAGS="$LDFLAGS `grid-config --ldflags`"
+CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS"
+LDFLAGS="$AM_LDFLAGS $LDFLAGS"
+LIBS=" -lGrid $LIBS `grid-config --libs`"
+
+AC_MSG_CHECKING([that a minimal Grid program compiles]);
+AC_LINK_IFELSE(
+    [AC_LANG_SOURCE([[
+    #include <Grid/Grid.h>
+
+    using namespace Grid;
+
+    int main(int argc, char *argv[])
+    {
+      Grid_init(&argc, &argv);
+      Grid_finalize();
+
+      return 0;
+    }
+
+    ]])],
+    [AC_MSG_RESULT([yes])],
+    [AC_MSG_RESULT([no])]
+    [AC_MSG_ERROR([Could not compile a minimal Grid program])])
+
+AC_SUBST([AM_CXXFLAGS])
+AC_SUBST([AM_LDFLAGS])
+AC_CONFIG_FILES([Makefile])
+AC_OUTPUT
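
Editorial note (not part of the patch above): every benchmark in this commit follows the same measurement pattern, namely a warm-up call, a barrier, a timed loop of ncall applications bracketed by usecond(), and a rate derived from a flop or byte count. The sketch below condenses that pattern into a minimal standalone Grid program; it times Cshift on a 4d fermion field purely as an illustration, and the grid, field, and loop names are local to this example rather than taken from the sources in the commit.

// Minimal sketch of the timing pattern used by the benchmarks above.
// Builds against Grid in the same way as the configure.ac link test.
#include <Grid/Grid.h>

using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  // 4d grid sized from the command line, as elsewhere in this repository
  GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
      GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());

  GridParallelRNG RNG4(UGrid);
  RNG4.SeedUniqueString(std::string("The 4D RNG"));

  LatticeFermionF src(UGrid), res(UGrid);
  random(RNG4, src);

  const int ncall = 100; // number of timed applications, as in the Dhop loops

  // warm-up, then barrier so all ranks enter the timed region together
  res = Cshift(src, 0, 1);
  UGrid->Barrier();

  double t0 = usecond();
  for (int i = 0; i < ncall; i++)
  {
    res = Cshift(src, 0, 1); // operation under test (illustrative only)
  }
  double t1 = usecond();
  UGrid->Barrier();

  std::cout << GridLogMessage << "Called Cshift " << ncall << " times in "
            << t1 - t0 << " us" << std::endl;
  std::cout << GridLogMessage << "norm2(res) = " << norm2(res) << std::endl;

  Grid_finalize();

  return 0;
}

A skeleton like this, compiled the way the configure.ac test program is, is a convenient starting point when adding a new Benchmark_* target to Makefile.am.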