Compare commits

1 commit: main...main

17 changed files with 1321 additions and 690 deletions

View File

@ -1,9 +1,8 @@
/*
Copyright © 2015 Peter Boyle <paboyle@ph.ed.ac.uk>
Copyright © 2022 Antonin Portelli <antonin.portelli@me.com>
Copyright © 2022 Simon Buerger <simon.buerger@rwth-aachen.de>
This is a fork of Benchmark_ITT.cpp from Grid
This is a refactoring of Benchmark_ITT.cpp from Grid
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
@ -25,6 +24,13 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
using namespace Grid;
std::vector<int> L_list;
std::vector<int> Ls_list;
std::vector<double> mflop_list;
double mflop_ref;
double mflop_ref_err;
int NN_global;
nlohmann::json json_results;
@ -52,6 +58,18 @@ struct time_statistics
}
};
void comms_header()
{
std::cout << GridLogMessage << " L "
<< "\t"
<< " Ls "
<< "\t"
<< "bytes\t MB/s uni (err/min/max) \t\t MB/s bidi (err/min/max)" << std::endl;
};
Gamma::Algebra Gmu[] = {Gamma::Algebra::GammaX, Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT};
struct controls
{
int Opt;
@ -64,69 +82,47 @@ class Benchmark
public:
static void Decomposition(void)
{
nlohmann::json tmp;
int threads = GridThread::GetThreads();
Grid::Coordinate mpi = GridDefaultMpi();
assert(mpi.size() == 4);
Coordinate local({8, 8, 8, 8});
Coordinate latt4(
{local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]});
GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid(
latt4, GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
Grid::Coordinate shm(4, 1);
GlobalSharedMemory::GetShmDims(mpi, shm);
uint64_t NP = TmpGrid->RankCount();
uint64_t NN = TmpGrid->NodeCount();
NN_global = NN;
uint64_t SHM = NP / NN;
grid_big_sep();
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << "= Grid is setup to use " << threads << " threads"
<< std::endl;
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << "Grid Default Decomposition patterns\n";
grid_small_sep();
std::cout << GridLogMessage << "* OpenMP threads : " << GridThread::GetThreads()
std::cout << GridLogMessage << "\tOpenMP threads : " << GridThread::GetThreads()
<< std::endl;
std::cout << GridLogMessage << "* MPI layout : " << GridCmdVectorIntToString(mpi)
std::cout << GridLogMessage
<< "\tMPI tasks : " << GridCmdVectorIntToString(GridDefaultMpi())
<< std::endl;
std::cout << GridLogMessage << "* Shm layout : " << GridCmdVectorIntToString(shm)
<< std::endl;
std::cout << GridLogMessage << "* vReal : " << sizeof(vReal) * 8 << "bits ; "
std::cout << GridLogMessage << "\tvReal : " << sizeof(vReal) * 8 << "bits ; "
<< GridCmdVectorIntToString(GridDefaultSimd(4, vReal::Nsimd()))
<< std::endl;
std::cout << GridLogMessage << "* vRealF : " << sizeof(vRealF) * 8
std::cout << GridLogMessage << "\tvRealF : " << sizeof(vRealF) * 8
<< "bits ; "
<< GridCmdVectorIntToString(GridDefaultSimd(4, vRealF::Nsimd()))
<< std::endl;
std::cout << GridLogMessage << "* vRealD : " << sizeof(vRealD) * 8
std::cout << GridLogMessage << "\tvRealD : " << sizeof(vRealD) * 8
<< "bits ; "
<< GridCmdVectorIntToString(GridDefaultSimd(4, vRealD::Nsimd()))
<< std::endl;
std::cout << GridLogMessage << "* vComplex : " << sizeof(vComplex) * 8
std::cout << GridLogMessage << "\tvComplex : " << sizeof(vComplex) * 8
<< "bits ; "
<< GridCmdVectorIntToString(GridDefaultSimd(4, vComplex::Nsimd()))
<< std::endl;
std::cout << GridLogMessage << "* vComplexF : " << sizeof(vComplexF) * 8
std::cout << GridLogMessage << "\tvComplexF : " << sizeof(vComplexF) * 8
<< "bits ; "
<< GridCmdVectorIntToString(GridDefaultSimd(4, vComplexF::Nsimd()))
<< std::endl;
std::cout << GridLogMessage << "* vComplexD : " << sizeof(vComplexD) * 8
std::cout << GridLogMessage << "\tvComplexD : " << sizeof(vComplexD) * 8
<< "bits ; "
<< GridCmdVectorIntToString(GridDefaultSimd(4, vComplexD::Nsimd()))
<< std::endl;
std::cout << GridLogMessage << "* ranks : " << NP << std::endl;
std::cout << GridLogMessage << "* nodes : " << NN << std::endl;
std::cout << GridLogMessage << "* ranks/node : " << SHM << std::endl;
for (unsigned int i = 0; i < mpi.size(); ++i)
{
tmp["mpi"].push_back(mpi[i]);
tmp["shm"].push_back(shm[i]);
}
tmp["ranks"] = NP;
tmp["nodes"] = NN;
json_results["geometry"] = tmp;
}
static void Comms(void)
@ -137,8 +133,6 @@ class Benchmark
Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
Coordinate shm_layout(Nd, 1);
GlobalSharedMemory::GetShmDims(mpi_layout, shm_layout);
for (int mu = 0; mu < Nd; mu++)
if (mpi_layout[mu] > 1)
@ -147,11 +141,18 @@ class Benchmark
std::vector<double> t_time(Nloop);
time_statistics timestat;
std::cout << GridLogMessage << "Benchmarking threaded STENCIL halo exchange in "
std::cout << GridLogMessage
<< "======================================================================="
"============================="
<< std::endl;
std::cout << GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "
<< nmu << " dimensions" << std::endl;
grid_small_sep();
grid_printf("%5s %5s %7s %15s %15s %15s %15s %15s\n", "L", "dir", "shm",
"payload (B)", "time (usec)", "rate (GB/s/node)", "std dev", "max");
std::cout << GridLogMessage
<< "======================================================================="
"============================="
<< std::endl;
grid_printf("%5s %5s %15s %15s %15s %15s %15s\n", "L", "dir", "payload (B)",
"time (usec)", "rate (GB/s)", "std dev", "max");
for (int lat = 16; lat <= maxlat; lat += 8)
{
@ -175,85 +176,56 @@ class Benchmark
}
double dbytes;
#define NWARMUP 50
for (int dir = 0; dir < 8; dir++)
{
int mu = dir % 4;
if (mpi_layout[mu] == 1) // skip directions that are not distributed
continue;
bool is_shm = mpi_layout[mu] == shm_layout[mu];
bool is_partial_shm = !is_shm && shm_layout[mu] != 1;
std::vector<double> times(Nloop);
for (int i = 0; i < NWARMUP; i++)
{
int xmit_to_rank;
int recv_from_rank;
if (dir == mu)
{
int comm_proc = 1;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
}
else
{
int comm_proc = mpi_layout[mu] - 1;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
}
Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, (void *)&rbuf[dir][0],
recv_from_rank, bytes);
}
for (int i = 0; i < Nloop; i++)
if (mpi_layout[mu] > 1)
{
dbytes = 0;
double start = usecond();
int xmit_to_rank;
int recv_from_rank;
if (dir == mu)
std::vector<double> times(Nloop);
for (int i = 0; i < Nloop; i++)
{
int comm_proc = 1;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
}
else
{
int comm_proc = mpi_layout[mu] - 1;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
}
Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, (void *)&rbuf[dir][0],
recv_from_rank, bytes);
dbytes += bytes;
double stop = usecond();
t_time[i] = stop - start; // microseconds
dbytes = 0;
double start = usecond();
int xmit_to_rank;
int recv_from_rank;
if (dir == mu)
{
int comm_proc = 1;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
}
else
{
int comm_proc = mpi_layout[mu] - 1;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
}
Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
(void *)&rbuf[dir][0], recv_from_rank, bytes);
dbytes += bytes;
double stop = usecond();
t_time[i] = stop - start; // microseconds
}
timestat.statistics(t_time);
dbytes = dbytes * ppn;
double bidibytes = 2. * dbytes;
double rate = bidibytes / (timestat.mean / 1.e6) / 1024. / 1024. / 1024.;
double rate_err = rate * timestat.err / timestat.mean;
double rate_max = rate * timestat.mean / timestat.min;
grid_printf("%5d %5d %15d %15.2f %15.2f %15.1f %15.2f\n", lat, dir, bytes,
timestat.mean, rate, rate_err, rate_max);
nlohmann::json tmp;
tmp["L"] = lat;
tmp["dir"] = dir;
tmp["bytes"] = bytes;
tmp["time"] = timestat.mean;
tmp["GB_per_second"] = rate;
json_results["comms"].push_back(tmp);
}
timestat.statistics(t_time);
dbytes = dbytes * ppn;
double bidibytes = 2. * dbytes;
double rate = bidibytes / (timestat.mean / 1.e6) / 1024. / 1024. / 1024.;
double rate_err = rate * timestat.err / timestat.mean;
double rate_max = rate * timestat.mean / timestat.min;
grid_printf("%5d %5d %7s %15d %15.2f %15.2f %15.1f %15.2f\n", lat, dir,
is_shm ? "yes"
: is_partial_shm ? "partial"
: "no",
bytes, timestat.mean, rate, rate_err, rate_max);
nlohmann::json tmp;
nlohmann::json tmp_rate;
tmp["L"] = lat;
tmp["dir"] = dir;
tmp["shared_mem"] = is_shm;
tmp["partial_shared_mem"] = is_partial_shm;
tmp["bytes"] = bytes;
tmp["time_usec"] = timestat.mean;
tmp_rate["mean"] = rate;
tmp_rate["error"] = rate_err;
tmp_rate["max"] = rate_max;
tmp["rate_GBps"] = tmp_rate;
json_results["comms"].push_back(tmp);
}
for (int d = 0; d < 8; d++)
{
@ -273,14 +245,33 @@ class Benchmark
Coordinate simd_layout = GridDefaultSimd(Nd, vReal::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
std::cout << GridLogMessage << "Benchmarking a*x + y bandwidth" << std::endl;
grid_small_sep();
grid_printf("%5s %15s %15s %15s %15s\n", "L", "size (MB/node)", "time (usec)",
"GB/s/node", "Gflop/s/node");
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << "= Benchmarking a*x + y bandwidth" << std::endl;
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << " L "
<< "\t\t"
<< "bytes"
<< "\t\t\t"
<< "GB/s"
<< "\t\t"
<< "Gflop/s"
<< "\t\t seconds"
<< "\t\tGB/s / node" << std::endl;
std::cout << GridLogMessage
<< "----------------------------------------------------------"
<< std::endl;
// uint64_t NP;
uint64_t NN;
uint64_t lmax = 64;
#define NLOOP (200 * lmax * lmax * lmax / lat / lat / lat)
uint64_t lmax = 32;
#define NLOOP (1000 * lmax * lmax * lmax * lmax / lat / lat / lat / lat)
GridSerialRNG sRNG;
sRNG.SeedFixedIntegers(std::vector<int>({45, 12, 81, 9}));
@ -289,11 +280,11 @@ class Benchmark
Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
lat * mpi_layout[3]});
double vol =
static_cast<double>(latt_size[0]) * latt_size[1] * latt_size[2] * latt_size[3];
int64_t vol = latt_size[0] * latt_size[1] * latt_size[2] * latt_size[3];
GridCartesian Grid(latt_size, simd_layout, mpi_layout);
// NP= Grid.RankCount();
NN = Grid.NodeCount();
Vec rn;
@ -309,29 +300,26 @@ class Benchmark
uint64_t Nloop = NLOOP;
for (int i = 0; i < NWARMUP; i++)
{
z = a * x - y;
}
double start = usecond();
for (int i = 0; i < Nloop; i++)
{
z = a * x - y;
}
double stop = usecond();
double time = (stop - start) / Nloop / 1.e6;
double time = (stop - start) / Nloop * 1000;
double flops = vol * Nvec * 2 / 1.e9; // mul,add
double bytes = 3.0 * vol * Nvec * sizeof(Real) / 1024. / 1024.;
grid_printf("%5d %15.2f %15.2f %15.2f %15.2f\n", lat, bytes / NN, time * 1.e6,
bytes / time / NN / 1024., flops / time / NN);
double flops = vol * Nvec * 2; // mul,add
double bytes = 3.0 * vol * Nvec * sizeof(Real);
std::cout << GridLogMessage << std::setprecision(3) << lat << "\t\t" << bytes
<< " \t\t" << bytes / time << "\t\t" << flops / time << "\t\t"
<< (stop - start) / 1000. / 1000. << "\t\t" << bytes / time / NN
<< std::endl;
nlohmann::json tmp;
tmp["L"] = lat;
tmp["size_MB"] = bytes / NN;
tmp["GBps"] = bytes / time / NN / 1024.;
tmp["GFlops"] = flops / time / NN;
tmp["bytes"] = bytes;
tmp["gflops"] = flops / time;
tmp["GB_per_second"] = bytes / time;
json_results["axpy"].push_back(tmp);
}
};
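
Editorial note on the a*x + y numbers above: the traffic model charges three streamed vectors per element (read x, read y, write z), hence bytes = 3 * vol * Nvec * sizeof(Real), and two flops per element (one multiply, one add/subtract). A standalone sketch of the same bookkeeping, using made-up example values rather than anything measured by the benchmark:

#include <cstdio>

int main()
{
  // Illustrative values only: 32^4 local volume, Nvec = 8, double precision.
  const double vol = 32.0 * 32 * 32 * 32;
  const double Nvec = 8;
  const double bytes = 3.0 * vol * Nvec * sizeof(double); // read x, read y, write z
  const double flops = 2.0 * vol * Nvec;                  // one mul + one add per element
  const double t_us = 100.0;                              // hypothetical time per iteration
  std::printf("traffic %.1f MB, %.1f GB/s, %.1f Gflop/s\n", bytes / 1.e6,
              bytes / (t_us * 1.e3), flops / (t_us * 1.e3));
  return 0;
}
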
@ -344,14 +332,31 @@ class Benchmark
Coordinate simd_layout = GridDefaultSimd(Nd, vComplexF::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
std::cout << GridLogMessage << "Benchmarking z = y*x SU(4) bandwidth" << std::endl;
grid_small_sep();
grid_printf("%5s %15s %15s %15s %15s\n", "L", "size (MB/node)", "time (usec)",
"GB/s/node", "Gflop/s/node");
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << "= Benchmarking z = y*x SU(4) bandwidth" << std::endl;
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << " L "
<< "\t\t"
<< "bytes"
<< "\t\t\t"
<< "GB/s"
<< "\t\t"
<< "Gflop/s"
<< "\t\t seconds"
<< "\t\tGB/s / node" << std::endl;
std::cout << GridLogMessage
<< "----------------------------------------------------------"
<< std::endl;
uint64_t NN;
uint64_t lmax = 48;
uint64_t lmax = 32;
GridSerialRNG sRNG;
sRNG.SeedFixedIntegers(std::vector<int>({45, 12, 81, 9}));
@ -360,8 +365,7 @@ class Benchmark
Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
lat * mpi_layout[3]});
double vol =
static_cast<double>(latt_size[0]) * latt_size[1] * latt_size[2] * latt_size[3];
int64_t vol = latt_size[0] * latt_size[1] * latt_size[2] * latt_size[3];
GridCartesian Grid(latt_size, simd_layout, mpi_layout);
@ -373,31 +377,30 @@ class Benchmark
x = Zero();
LatticeSU4 y(&Grid);
y = Zero();
// double a=2.0;
uint64_t Nloop = NLOOP;
for (int i = 0; i < NWARMUP; i++)
{
z = x * y;
}
double start = usecond();
for (int i = 0; i < Nloop; i++)
{
z = x * y;
}
double stop = usecond();
double time = (stop - start) / Nloop / 1.e6;
double time = (stop - start) / Nloop * 1000;
double flops = vol * Nc4 * Nc4 * (6 + (Nc4 - 1) * 8) / 1.e9; // mul,add
double bytes = 3.0 * vol * Nc4 * Nc4 * 2 * sizeof(RealF) / 1024. / 1024.;
grid_printf("%5d %15.2f %15.2f %15.2f %15.2f\n", lat, bytes / NN, time * 1.e6,
bytes / time / NN / 1024., flops / time / NN);
double flops = vol * Nc4 * Nc4 * (6 + (Nc4 - 1) * 8); // mul,add
double bytes = 3.0 * vol * Nc4 * Nc4 * 2 * sizeof(RealF);
std::cout << GridLogMessage << std::setprecision(3) << lat << "\t\t" << bytes
<< " \t\t" << bytes / time << "\t\t" << flops / time << "\t\t"
<< (stop - start) / 1000. / 1000. << "\t\t" << bytes / time / NN
<< std::endl;
nlohmann::json tmp;
tmp["L"] = lat;
tmp["size_MB"] = bytes / NN;
tmp["GBps"] = bytes / time / NN / 1024.;
tmp["GFlops"] = flops / time / NN;
tmp["bytes"] = bytes;
tmp["GB_per_second"] = bytes / time;
tmp["gflops"] = flops / time;
json_results["SU4"].push_back(tmp);
}
};
@ -407,10 +410,10 @@ class Benchmark
RealD mass = 0.1;
RealD M5 = 1.8;
double gflops;
double gflops_best = 0;
double gflops_worst = 0;
std::vector<double> gflops_all;
double mflops;
double mflops_best = 0;
double mflops_worst = 0;
std::vector<double> mflops_all;
///////////////////////////////////////////////////////
// Set/Get the layout & grid size
@ -430,7 +433,10 @@ class Benchmark
uint64_t SHM = NP / NN;
///////// Welcome message ////////////
grid_big_sep();
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << "Benchmark DWF on " << L << "^4 local volume "
<< std::endl;
std::cout << GridLogMessage << "* Nc : " << Nc << std::endl;
@ -443,7 +449,10 @@ class Benchmark
std::cout << GridLogMessage << "* ranks geom : " << GridCmdVectorIntToString(mpi)
<< std::endl;
std::cout << GridLogMessage << "* Using " << threads << " threads" << std::endl;
grid_big_sep();
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
///////// Lattice Init ////////////
GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
@ -502,7 +511,10 @@ class Benchmark
WilsonKernelsStatic::Opt = Cases[c].Opt;
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
grid_small_sep();
std::cout << GridLogMessage
<< "==================================================================="
"==============="
<< std::endl;
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric)
std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl;
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
@ -510,7 +522,10 @@ class Benchmark
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
std::cout << GridLogMessage << "* Using sequential Comms/Compute" << std::endl;
std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
grid_small_sep();
std::cout << GridLogMessage
<< "==================================================================="
"==============="
<< std::endl;
int nwarm = 10;
double t0 = usecond();
@ -525,6 +540,10 @@ class Benchmark
FGrid->Broadcast(0, &ncall, sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per
// second"<<std::endl;
Dw.ZeroCounters();
time_statistics timestat;
std::vector<double> t_time(ncall);
for (uint64_t i = 0; i < ncall; i++)
@ -550,60 +569,67 @@ class Benchmark
double fps =
Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2;
#endif
double flops = (fps * volume) / 2.;
double gf_hi, gf_lo, gf_err;
double flops = (fps * volume) / 2;
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
gf_hi = flops / timestat.min / 1000.;
gf_lo = flops / timestat.max / 1000.;
gf_err = flops / timestat.min * timestat.err / timestat.mean / 1000.;
mf_hi = flops / timestat.min;
mf_lo = flops / timestat.max;
mf_err = flops / timestat.min * timestat.err / timestat.mean;
gflops = flops / timestat.mean / 1000.;
gflops_all.push_back(gflops);
if (gflops_best == 0)
gflops_best = gflops;
if (gflops_worst == 0)
gflops_worst = gflops;
if (gflops > gflops_best)
gflops_best = gflops;
if (gflops < gflops_worst)
gflops_worst = gflops;
mflops = flops / timestat.mean;
mflops_all.push_back(mflops);
if (mflops_best == 0)
mflops_best = mflops;
if (mflops_worst == 0)
mflops_worst = mflops;
if (mflops > mflops_best)
mflops_best = mflops;
if (mflops < mflops_worst)
mflops_worst = mflops;
std::cout << GridLogMessage << "Deo FlopsPerSite is " << fps << std::endl;
std::cout << GridLogMessage << std::fixed << std::setprecision(1)
<< "Deo Gflop/s = " << gflops << " (" << gf_err << ") " << gf_lo
<< "-" << gf_hi << std::endl;
<< "Deo mflop/s = " << mflops << " (" << mf_err << ") " << mf_lo
<< "-" << mf_hi << std::endl;
std::cout << GridLogMessage << std::fixed << std::setprecision(1)
<< "Deo Gflop/s per rank " << gflops / NP << std::endl;
<< "Deo mflop/s per rank " << mflops / NP << std::endl;
std::cout << GridLogMessage << std::fixed << std::setprecision(1)
<< "Deo Gflop/s per node " << gflops / NN << std::endl;
<< "Deo mflop/s per node " << mflops / NN << std::endl;
}
grid_small_sep();
std::cout << GridLogMessage
<< "====================================================================="
"============="
<< std::endl;
std::cout << GridLogMessage << L << "^4 x " << Ls
<< " Deo Best Gflop/s = " << gflops_best << " ; "
<< gflops_best / NN << " per node " << std::endl;
<< " Deo Best mflop/s = " << mflops_best << " ; "
<< mflops_best / NN << " per node " << std::endl;
std::cout << GridLogMessage << L << "^4 x " << Ls
<< " Deo Worst Gflop/s = " << gflops_worst << " ; "
<< gflops_worst / NN << " per node " << std::endl;
<< " Deo Worst mflop/s = " << mflops_worst << " ; "
<< mflops_worst / NN << " per node " << std::endl;
std::cout << GridLogMessage << fmt << std::endl;
std::cout << GridLogMessage;
for (int i = 0; i < gflops_all.size(); i++)
for (int i = 0; i < mflops_all.size(); i++)
{
std::cout << gflops_all[i] / NN << " ; ";
std::cout << mflops_all[i] / NN << " ; ";
}
std::cout << std::endl;
std::cout << GridLogMessage
<< "====================================================================="
"============="
<< std::endl;
}
return gflops_best;
return mflops_best;
}
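
As a worked check on the flop counting in DWF() above (not part of the patch): with the usual Grid constants Nc = 3, Ns = 4, Nd = 4 the per-site count fps evaluates to 1056 + 96 + 192 = 1344, and the factor 1/2 in flops = fps * volume / 2 accounts for the operator acting on a single checkerboard. A self-contained snippet reproducing the arithmetic:

#include <cstdio>

int main()
{
  // Worked example of the fps formula above; these constants mirror Grid's
  // defaults and are an assumption about the build being benchmarked.
  constexpr int Nc = 3, Ns = 4, Nd = 4;
  constexpr int fps = Nc * (6 + (Nc - 1) * 8) * Ns * Nd // SU(3) mat-vec per spin/dir: 1056
                      + 2 * Nd * Nc * Ns                // 96
                      + 2 * Nd * Nc * Ns * 2;           // 192
  static_assert(fps == 1344, "per-site flop count");
  std::printf("fps = %d, per checkerboarded site = %d\n", fps, fps / 2);
  return 0;
}
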
static double Staggered(int L)
{
double gflops;
double gflops_best = 0;
double gflops_worst = 0;
std::vector<double> gflops_all;
double mflops;
double mflops_best = 0;
double mflops_worst = 0;
std::vector<double> mflops_all;
///////////////////////////////////////////////////////
// Set/Get the layout & grid size
@ -623,7 +649,10 @@ class Benchmark
uint64_t SHM = NP / NN;
///////// Welcome message ////////////
grid_big_sep();
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << "Benchmark ImprovedStaggered on " << L
<< "^4 local volume " << std::endl;
std::cout << GridLogMessage
@ -634,7 +663,10 @@ class Benchmark
std::cout << GridLogMessage << "* ranks geom : " << GridCmdVectorIntToString(mpi)
<< std::endl;
std::cout << GridLogMessage << "* Using " << threads << " threads" << std::endl;
grid_big_sep();
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
///////// Lattice Init ////////////
GridCartesian *FGrid = SpaceTimeGrid::makeFourDimGrid(
@ -696,7 +728,10 @@ class Benchmark
StaggeredKernelsStatic::Opt = Cases[c].Opt;
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
grid_small_sep();
std::cout << GridLogMessage
<< "==================================================================="
"==============="
<< std::endl;
if (StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric)
std::cout << GridLogMessage << "* Using GENERIC Nc StaggeredKernels"
<< std::endl;
@ -705,7 +740,10 @@ class Benchmark
if (StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsThenCompute)
std::cout << GridLogMessage << "* Using sequential Comms/Compute" << std::endl;
std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
grid_small_sep();
std::cout << GridLogMessage
<< "==================================================================="
"==============="
<< std::endl;
int nwarm = 10;
double t0 = usecond();
@ -720,6 +758,10 @@ class Benchmark
FGrid->Broadcast(0, &ncall, sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per
// second"<<std::endl;
Ds.ZeroCounters();
time_statistics timestat;
std::vector<double> t_time(ncall);
for (uint64_t i = 0; i < ncall; i++)
@ -734,51 +776,58 @@ class Benchmark
double volume = 1;
for (int mu = 0; mu < Nd; mu++)
volume = volume * latt4[mu];
double flops = (1146.0 * volume) / 2.;
double gf_hi, gf_lo, gf_err;
double flops = (1146.0 * volume) / 2;
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
gf_hi = flops / timestat.min / 1000.;
gf_lo = flops / timestat.max / 1000.;
gf_err = flops / timestat.min * timestat.err / timestat.mean / 1000.;
mf_hi = flops / timestat.min;
mf_lo = flops / timestat.max;
mf_err = flops / timestat.min * timestat.err / timestat.mean;
gflops = flops / timestat.mean / 1000.;
gflops_all.push_back(gflops);
if (gflops_best == 0)
gflops_best = gflops;
if (gflops_worst == 0)
gflops_worst = gflops;
if (gflops > gflops_best)
gflops_best = gflops;
if (gflops < gflops_worst)
gflops_worst = gflops;
mflops = flops / timestat.mean;
mflops_all.push_back(mflops);
if (mflops_best == 0)
mflops_best = mflops;
if (mflops_worst == 0)
mflops_worst = mflops;
if (mflops > mflops_best)
mflops_best = mflops;
if (mflops < mflops_worst)
mflops_worst = mflops;
std::cout << GridLogMessage << std::fixed << std::setprecision(1)
<< "Deo Gflop/s = " << gflops << " (" << gf_err << ") " << gf_lo
<< "-" << gf_hi << std::endl;
<< "Deo mflop/s = " << mflops << " (" << mf_err << ") " << mf_lo
<< "-" << mf_hi << std::endl;
std::cout << GridLogMessage << std::fixed << std::setprecision(1)
<< "Deo Gflop/s per rank " << gflops / NP << std::endl;
<< "Deo mflop/s per rank " << mflops / NP << std::endl;
std::cout << GridLogMessage << std::fixed << std::setprecision(1)
<< "Deo Gflop/s per node " << gflops / NN << std::endl;
<< "Deo mflop/s per node " << mflops / NN << std::endl;
}
grid_small_sep();
std::cout << GridLogMessage
<< "====================================================================="
"============="
<< std::endl;
std::cout << GridLogMessage << L
<< "^4 Deo Best Gflop/s = " << gflops_best << " ; "
<< gflops_best / NN << " per node " << std::endl;
<< "^4 Deo Best mflop/s = " << mflops_best << " ; "
<< mflops_best / NN << " per node " << std::endl;
std::cout << GridLogMessage << L
<< "^4 Deo Worst Gflop/s = " << gflops_worst << " ; "
<< gflops_worst / NN << " per node " << std::endl;
<< "^4 Deo Worst mflop/s = " << mflops_worst << " ; "
<< mflops_worst / NN << " per node " << std::endl;
std::cout << GridLogMessage << fmt << std::endl;
std::cout << GridLogMessage;
for (int i = 0; i < gflops_all.size(); i++)
for (int i = 0; i < mflops_all.size(); i++)
{
std::cout << gflops_all[i] / NN << " ; ";
std::cout << mflops_all[i] / NN << " ; ";
}
std::cout << std::endl;
}
return gflops_best;
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
return mflops_best;
}
};
@ -816,87 +865,175 @@ int main(int argc, char **argv)
std::vector<double> dwf4;
std::vector<double> staggered;
if (do_memory)
{
grid_big_sep();
std::cout << GridLogMessage << " Memory benchmark " << std::endl;
grid_big_sep();
Benchmark::Memory();
}
if (do_su4)
{
grid_big_sep();
std::cout << GridLogMessage << " SU(4) benchmark " << std::endl;
grid_big_sep();
Benchmark::SU4();
}
if (do_comms)
{
grid_big_sep();
std::cout << GridLogMessage << " Communications benchmark " << std::endl;
grid_big_sep();
Benchmark::Comms();
}
if (do_flops)
{
Ls = 1;
grid_big_sep();
std::cout
<< GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
std::cout << GridLogMessage << " Wilson dslash 4D vectorised" << std::endl;
std::cout
<< GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
for (int l = 0; l < L_list.size(); l++)
{
wilson.push_back(Benchmark::DWF(Ls, L_list[l]));
}
Ls = 12;
grid_big_sep();
std::cout
<< GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
std::cout << GridLogMessage << " Domain wall dslash 4D vectorised" << std::endl;
std::cout
<< GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
for (int l = 0; l < L_list.size(); l++)
{
double result = Benchmark::DWF(Ls, L_list[l]);
dwf4.push_back(result);
}
grid_big_sep();
std::cout
<< GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
std::cout << GridLogMessage << " Improved Staggered dslash 4D vectorised"
<< std::endl;
std::cout
<< GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
for (int l = 0; l < L_list.size(); l++)
{
double result = Benchmark::Staggered(L_list[l]);
staggered.push_back(result);
}
int NN = NN_global;
grid_big_sep();
std::cout << GridLogMessage << "Gflop/s/node Summary table Ls=" << Ls << std::endl;
grid_big_sep();
grid_printf("%5s %12s %12s %12s\n", "L", "Wilson", "DWF", "Staggered");
nlohmann::json tmp_flops;
std::cout
<< GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
std::cout << GridLogMessage << " Summary table Ls=" << Ls << std::endl;
std::cout
<< GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
std::cout << GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\t Staggered" << std::endl;
for (int l = 0; l < L_list.size(); l++)
{
grid_printf("%5d %12.2f %12.2f %12.2f\n", L_list[l], wilson[l] / NN, dwf4[l] / NN,
staggered[l] / NN);
std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] << " \t\t "
<< dwf4[l] << " \t\t " << staggered[l] << std::endl;
nlohmann::json tmp;
tmp["L"] = L_list[l];
tmp["Gflops_wilson"] = wilson[l] / NN;
tmp["Gflops_dwf4"] = dwf4[l] / NN;
tmp["Gflops_staggered"] = staggered[l] / NN;
tmp_flops["results"].push_back(tmp);
tmp["mflops_wilson"] = wilson[l];
tmp["mflops_dwf4"] = dwf4[l];
tmp["mflops_staggered"] = staggered[l];
json_results["flops"].push_back(tmp);
}
grid_big_sep();
std::cout
<< GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
}
int NN = NN_global;
if (do_memory)
{
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << " Memory benchmark " << std::endl;
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
Benchmark::Memory();
}
if (do_su4)
{
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << " SU(4) benchmark " << std::endl;
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
Benchmark::SU4();
}
if (do_comms)
{
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << " Communications benchmark " << std::endl;
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
Benchmark::Comms();
}
if (do_flops)
{
std::cout
<< GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
std::cout << GridLogMessage << " Per Node Summary table Ls=" << Ls << std::endl;
std::cout
<< GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
std::cout << GridLogMessage << " L \t\t Wilson\t\t DWF4\t\t Staggered " << std::endl;
for (int l = 0; l < L_list.size(); l++)
{
std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] / NN << " \t "
<< dwf4[l] / NN << " \t " << staggered[l] / NN << std::endl;
}
std::cout
<< GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
std::cout
<< GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
std::cout << GridLogMessage
<< " Comparison point result: " << 0.5 * (dwf4[sel] + dwf4[selm1]) / NN
<< " Gflop/s per node" << std::endl;
<< " Mflop/s per node" << std::endl;
std::cout << GridLogMessage << " Comparison point is 0.5*(" << dwf4[sel] / NN << "+"
<< dwf4[selm1] / NN << ") " << std::endl;
std::cout << std::setprecision(3);
grid_big_sep();
tmp_flops["comparison_point_Gflops"] = 0.5 * (dwf4[sel] + dwf4[selm1]) / NN;
json_results["flops"] = tmp_flops;
std::cout
<< GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
}
if (!json_filename.empty())
@ -909,7 +1046,7 @@ int main(int argc, char **argv)
if (me == 0)
{
std::ofstream json_file(json_filename);
json_file << std::setw(2) << json_results;
json_file << std::setw(4) << json_results;
}
}
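
Since the results are now emitted through nlohmann::json, they can be post-processed with the same header. A minimal read-back sketch; the file name is hypothetical and the field names ("flops", "mflops_dwf4") are assumed from the branch of the diff above that writes them, so adjust to whichever schema your build produces:

#include <fstream>
#include <iostream>
#include "json.hpp"

int main()
{
  // Read back a results file written via --json-out (hypothetical name).
  std::ifstream in("benchmark_results.json");
  nlohmann::json results;
  in >> results;
  // Assumes json_results["flops"] is the array of per-L records built above.
  if (results.contains("flops"))
    for (auto &rec : results["flops"])
      std::cout << "L = " << rec["L"] << ", DWF4 " << rec["mflops_dwf4"]
                << " Mflop/s" << std::endl;
  return 0;
}
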

View File

@ -32,7 +32,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#ifdef HAVE_LIME
using namespace Grid;
std::string filestem(const int l) { return "io/iobench_l" + std::to_string(l); }
std::string filestem(const int l) { return "iobench_l" + std::to_string(l); }
int vol(const int i) { return BENCH_IO_LMIN + 2 * i; }
@ -56,6 +56,13 @@ template <typename Mat> void stats(Mat &mean, Mat &stdDev, const std::vector<Mat
mean /= n;
}
#define grid_printf(...) \
{ \
char _buf[1024]; \
sprintf(_buf, __VA_ARGS__); \
MSG << _buf; \
}
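
The grid_printf macro above formats into a fixed 1024-byte stack buffer with sprintf, so an oversized expansion would overflow it. Purely as an illustration (not part of the patch), a bounds-checked variant along the same lines, writing to std::cout instead of MSG so it stands alone:

#include <cstdio>
#include <iostream>

// Sketch only: snprintf truncates instead of overflowing, at the cost of
// silently clipping very long lines.
#define grid_printf_safe(...)                                                            \
  {                                                                                      \
    char _buf[1024];                                                                     \
    std::snprintf(_buf, sizeof(_buf), __VA_ARGS__);                                      \
    std::cout << _buf;                                                                   \
  }

int main()
{
  grid_printf_safe("%4s %12s\n", "L", "std read");
  return 0;
}
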
enum
{
sRead = 0,
@ -76,58 +83,58 @@ int main(int argc, char **argv)
std::vector<Eigen::VectorXd> avPerf(BENCH_IO_NPASS, Eigen::VectorXd::Zero(4));
std::vector<int> latt;
GRID_MSG << "Grid is setup to use " << threads << " threads" << std::endl;
GRID_MSG << "MPI partition " << mpi << std::endl;
MSG << "Grid is setup to use " << threads << " threads" << std::endl;
MSG << "MPI partition " << mpi << std::endl;
for (unsigned int i = 0; i < BENCH_IO_NPASS; ++i)
{
grid_big_sep();
GRID_MSG << "Pass " << i + 1 << "/" << BENCH_IO_NPASS << std::endl;
grid_big_sep();
grid_small_sep();
GRID_MSG << "Benchmark std write" << std::endl;
grid_small_sep();
MSG << BIGSEP << std::endl;
MSG << "Pass " << i + 1 << "/" << BENCH_IO_NPASS << std::endl;
MSG << BIGSEP << std::endl;
MSG << SEP << std::endl;
MSG << "Benchmark std write" << std::endl;
MSG << SEP << std::endl;
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
{
latt = {l * mpi[0], l * mpi[1], l * mpi[2], l * mpi[3]};
GRID_MSG << "-- Local volume " << l << "^4" << std::endl;
MSG << "-- Local volume " << l << "^4" << std::endl;
writeBenchmark<LatticeFermion>(latt, filestem(l), stdWrite<LatticeFermion>);
perf[i](volInd(l), sWrite) = BinaryIO::lastPerf.mbytesPerSecond;
}
grid_small_sep();
GRID_MSG << "Benchmark std read" << std::endl;
grid_small_sep();
MSG << SEP << std::endl;
MSG << "Benchmark std read" << std::endl;
MSG << SEP << std::endl;
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
{
latt = {l * mpi[0], l * mpi[1], l * mpi[2], l * mpi[3]};
GRID_MSG << "-- Local volume " << l << "^4" << std::endl;
MSG << "-- Local volume " << l << "^4" << std::endl;
readBenchmark<LatticeFermion>(latt, filestem(l), stdRead<LatticeFermion>);
perf[i](volInd(l), sRead) = BinaryIO::lastPerf.mbytesPerSecond;
}
#ifdef HAVE_LIME
grid_small_sep();
GRID_MSG << "Benchmark Grid C-Lime write" << std::endl;
grid_small_sep();
MSG << SEP << std::endl;
MSG << "Benchmark Grid C-Lime write" << std::endl;
MSG << SEP << std::endl;
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
{
latt = {l * mpi[0], l * mpi[1], l * mpi[2], l * mpi[3]};
GRID_MSG << "-- Local volume " << l << "^4" << std::endl;
MSG << "-- Local volume " << l << "^4" << std::endl;
writeBenchmark<LatticeFermion>(latt, filestem(l), limeWrite<LatticeFermion>);
perf[i](volInd(l), gWrite) = BinaryIO::lastPerf.mbytesPerSecond;
}
grid_small_sep();
GRID_MSG << "Benchmark Grid C-Lime read" << std::endl;
grid_small_sep();
MSG << SEP << std::endl;
MSG << "Benchmark Grid C-Lime read" << std::endl;
MSG << SEP << std::endl;
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
{
latt = {l * mpi[0], l * mpi[1], l * mpi[2], l * mpi[3]};
GRID_MSG << "-- Local volume " << l << "^4" << std::endl;
MSG << "-- Local volume " << l << "^4" << std::endl;
readBenchmark<LatticeFermion>(latt, filestem(l), limeRead<LatticeFermion>);
perf[i](volInd(l), gRead) = BinaryIO::lastPerf.mbytesPerSecond;
}
@ -152,13 +159,13 @@ int main(int argc, char **argv)
avRob.fill(100.);
avRob -= 100. * avStdDev.cwiseQuotient(avMean.cwiseAbs());
grid_big_sep();
GRID_MSG << "SUMMARY" << std::endl;
grid_big_sep();
GRID_MSG << "Summary of individual results (all results in MB/s)." << std::endl;
GRID_MSG << "Every second colum gives the standard deviation of the previous column."
<< std::endl;
GRID_MSG << std::endl;
MSG << BIGSEP << std::endl;
MSG << "SUMMARY" << std::endl;
MSG << BIGSEP << std::endl;
MSG << "Summary of individual results (all results in MB/s)." << std::endl;
MSG << "Every second colum gives the standard deviation of the previous column."
<< std::endl;
MSG << std::endl;
grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n", "L", "std read", "std dev",
"std write", "std dev", "Grid read", "std dev", "Grid write", "std dev");
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
@ -169,10 +176,10 @@ int main(int argc, char **argv)
stdDev(volInd(l), gRead), mean(volInd(l), gWrite),
stdDev(volInd(l), gWrite));
}
GRID_MSG << std::endl;
GRID_MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)"
<< std::endl;
GRID_MSG << std::endl;
MSG << std::endl;
MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)"
<< std::endl;
MSG << std::endl;
grid_printf("%4s %12s %12s %12s %12s\n", "L", "std read", "std write", "Grid read",
"Grid write");
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
@ -180,21 +187,21 @@ int main(int argc, char **argv)
grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n", l, rob(volInd(l), sRead),
rob(volInd(l), sWrite), rob(volInd(l), gRead), rob(volInd(l), gWrite));
}
GRID_MSG << std::endl;
GRID_MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX
<< "^4 (all results in MB/s)." << std::endl;
GRID_MSG << "Every second colum gives the standard deviation of the previous column."
<< std::endl;
GRID_MSG << std::endl;
MSG << std::endl;
MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX
<< "^4 (all results in MB/s)." << std::endl;
MSG << "Every second colum gives the standard deviation of the previous column."
<< std::endl;
MSG << std::endl;
grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n", "std read", "std dev",
"std write", "std dev", "Grid read", "std dev", "Grid write", "std dev");
grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", avMean(sRead),
avStdDev(sRead), avMean(sWrite), avStdDev(sWrite), avMean(gRead),
avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
GRID_MSG << std::endl;
GRID_MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)"
<< std::endl;
GRID_MSG << std::endl;
MSG << std::endl;
MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)"
<< std::endl;
MSG << std::endl;
grid_printf("%12s %12s %12s %12s\n", "std read", "std write", "Grid read",
"Grid write");
grid_printf("%12.1f %12.1f %12.1f %12.1f\n", avRob(sRead), avRob(sWrite), avRob(gRead),

View File

@ -18,8 +18,12 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#ifndef Benchmark_IO_hpp_
#define Benchmark_IO_hpp_
#include "Common.hpp"
#include <Grid/Grid.h>
#define MSG std::cout << GridLogMessage
#define SEP \
"-----------------------------------------------------------------------------"
#define BIGSEP \
"============================================================================="
#ifdef HAVE_LIME
namespace Grid
@ -46,9 +50,9 @@ namespace Grid
// crc = GridChecksum::crc32(vec_v.cpu_ptr, size);
// std::fwrite(&crc, sizeof(uint32_t), 1, file);
// crcWatch.Stop();
// GRID_MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec <<
// std::endl; ioWatch.Start(); std::fwrite(vec_v.cpu_ptr, sizeof(typename
// Field::scalar_object),
// MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
// ioWatch.Start();
// std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object),
// vec.Grid()->lSites(), file);
// ioWatch.Stop();
// std::fclose(file);
@ -57,11 +61,11 @@ namespace Grid
// p.size = size;
// p.time = ioWatch.useconds();
// p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
// GRID_MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
// MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
// << ",
// "
// << p.mbytesPerSecond << " MB/s" << std::endl;
// GRID_MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
// MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
// }
// template <typename Field> void stdRead(Field &vec, const std::string filestem)
@ -90,14 +94,16 @@ namespace Grid
// crcData = GridChecksum::crc32(vec_v.cpu_ptr, size);
// crcWatch.Stop();
// }
// GRID_MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec <<
// std::endl; assert(crcData == crcRead); size *= vec.Grid()->ProcessorCount(); auto
// &p = BinaryIO::lastPerf; p.size = size; p.time = ioWatch.useconds();
// MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
// assert(crcData == crcRead);
// size *= vec.Grid()->ProcessorCount();
// auto &p = BinaryIO::lastPerf;
// p.size = size;
// p.time = ioWatch.useconds();
// p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
// GRID_MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() <<
// ", "
// MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() << ", "
// << p.mbytesPerSecond << " MB/s" << std::endl;
// GRID_MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
// MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
// }
template <typename Field> void stdWrite(const std::string filestem, Field &vec)
@ -116,7 +122,7 @@ namespace Grid
crc = GridChecksum::crc32(vec_v.cpu_ptr, size);
file.write(reinterpret_cast<char *>(&crc), sizeof(uint32_t) / sizeof(char));
crcWatch.Stop();
GRID_MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
ioWatch.Start();
file.write(reinterpret_cast<char *>(vec_v.cpu_ptr), sizec);
file.flush();
@ -126,9 +132,9 @@ namespace Grid
p.size = size;
p.time = ioWatch.useconds();
p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
GRID_MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
<< ", " << p.mbytesPerSecond << " MB/s" << std::endl;
GRID_MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() << ", "
<< p.mbytesPerSecond << " MB/s" << std::endl;
MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
}
template <typename Field> void stdRead(Field &vec, const std::string filestem)
@ -157,17 +163,16 @@ namespace Grid
crcData = GridChecksum::crc32(vec_v.cpu_ptr, size);
crcWatch.Stop();
}
GRID_MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec
<< std::endl;
MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
assert(crcData == crcRead);
size *= vec.Grid()->ProcessorCount();
auto &p = BinaryIO::lastPerf;
p.size = size;
p.time = ioWatch.useconds();
p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
GRID_MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed()
<< ", " << p.mbytesPerSecond << " MB/s" << std::endl;
GRID_MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() << ", "
<< p.mbytesPerSecond << " MB/s" << std::endl;
MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
}
template <typename Field> void limeWrite(const std::string filestem, Field &vec)

View File

@ -0,0 +1,265 @@
/*
Copyright © 2015 Peter Boyle <paboyle@ph.ed.ac.uk>
Copyright © 2022 Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
struct time_statistics
{
double mean;
double err;
double min;
double max;
void statistics(std::vector<double> v)
{
double sum = std::accumulate(v.begin(), v.end(), 0.0);
mean = sum / v.size();
std::vector<double> diff(v.size());
std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
err = std::sqrt(sq_sum / (v.size() * (v.size() - 1)));
auto result = std::minmax_element(v.begin(), v.end());
min = *result.first;
max = *result.second;
}
};
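
For reference, err computed above is the standard error of the mean, sqrt(sum_i (x_i - mean)^2 / (N * (N - 1))). A small standalone check of the same formula on made-up timings:

#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

int main()
{
  // Same computation as time_statistics::statistics, on illustrative timings (usec).
  std::vector<double> v = {101., 98., 100., 103., 99.};
  double mean = std::accumulate(v.begin(), v.end(), 0.0) / v.size();
  double sq_sum = 0.;
  for (double x : v)
    sq_sum += (x - mean) * (x - mean);
  double err = std::sqrt(sq_sum / (v.size() * (v.size() - 1))); // standard error of the mean
  std::printf("mean = %.2f usec, err = %.2f usec\n", mean, err);
  return 0;
}
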
void header()
{
std::cout << GridLogMessage << " L "
<< "\t"
<< " Ls "
<< "\t" << std::setw(11) << "bytes\t\t"
<< "MB/s uni"
<< "\t"
<< "MB/s bidi" << std::endl;
};
int main(int argc, char **argv)
{
Grid_init(&argc, &argv);
Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
int threads = GridThread::GetThreads();
std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads"
<< std::endl;
int Nloop = 250;
int nmu = 0;
int maxlat = 32;
for (int mu = 0; mu < Nd; mu++)
if (mpi_layout[mu] > 1)
nmu++;
std::cout << GridLogMessage << "Number of iterations to average: " << Nloop
<< std::endl;
std::vector<double> t_time(Nloop);
// time_statistics timestat;
std::cout << GridLogMessage
<< "========================================================================="
"==========================="
<< std::endl;
std::cout << GridLogMessage
<< "= Benchmarking sequential halo exchange from host memory " << std::endl;
std::cout << GridLogMessage
<< "========================================================================="
"==========================="
<< std::endl;
header();
for (int lat = 8; lat <= maxlat; lat += 4)
{
for (int Ls = 8; Ls <= 8; Ls *= 2)
{
Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
lat * mpi_layout[3]});
GridCartesian Grid(latt_size, simd_layout, mpi_layout);
RealD Nrank = Grid._Nprocessors;
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank / Nnode;
std::vector<std::vector<HalfSpinColourVectorD>> xbuf(8);
std::vector<std::vector<HalfSpinColourVectorD>> rbuf(8);
for (int mu = 0; mu < 8; mu++)
{
xbuf[mu].resize(lat * lat * lat * Ls);
rbuf[mu].resize(lat * lat * lat * Ls);
}
uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD);
int ncomm;
for (int mu = 0; mu < 4; mu++)
{
if (mpi_layout[mu] > 1)
{
double start = usecond();
for (int i = 0; i < Nloop; i++)
{
ncomm = 0;
ncomm++;
int comm_proc = 1;
int xmit_to_rank;
int recv_from_rank;
{
std::vector<CommsRequest_t> requests;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
Grid.SendToRecvFrom((void *)&xbuf[mu][0], xmit_to_rank,
(void *)&rbuf[mu][0], recv_from_rank, bytes);
}
comm_proc = mpi_layout[mu] - 1;
{
std::vector<CommsRequest_t> requests;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0], xmit_to_rank,
(void *)&rbuf[mu + 4][0], recv_from_rank, bytes);
}
}
Grid.Barrier();
double stop = usecond();
double mean = (stop - start) / Nloop;
double dbytes = bytes * ppn;
double xbytes = dbytes * 2.0 * ncomm;
double rbytes = xbytes;
double bidibytes = xbytes + rbytes;
std::cout << GridLogMessage << std::setw(4) << lat << "\t" << Ls << "\t"
<< std::setw(11) << bytes << std::fixed << std::setprecision(1)
<< std::setw(7) << " " << std::right << xbytes / mean << " "
<< "\t\t" << std::setw(7) << bidibytes / mean << std::endl;
}
}
}
}
std::cout << GridLogMessage
<< "========================================================================="
"==========================="
<< std::endl;
std::cout << GridLogMessage
<< "= Benchmarking sequential halo exchange from GPU memory " << std::endl;
std::cout << GridLogMessage
<< "========================================================================="
"==========================="
<< std::endl;
header();
for (int lat = 8; lat <= maxlat; lat += 4)
{
for (int Ls = 8; Ls <= 8; Ls *= 2)
{
Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
lat * mpi_layout[3]});
GridCartesian Grid(latt_size, simd_layout, mpi_layout);
RealD Nrank = Grid._Nprocessors;
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank / Nnode;
std::vector<HalfSpinColourVectorD *> xbuf(8);
std::vector<HalfSpinColourVectorD *> rbuf(8);
uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD);
for (int d = 0; d < 8; d++)
{
xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
}
int ncomm;
for (int mu = 0; mu < 4; mu++)
{
if (mpi_layout[mu] > 1)
{
double start = usecond();
for (int i = 0; i < Nloop; i++)
{
ncomm = 0;
ncomm++;
int comm_proc = 1;
int xmit_to_rank;
int recv_from_rank;
{
std::vector<CommsRequest_t> requests;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
Grid.SendToRecvFrom((void *)&xbuf[mu][0], xmit_to_rank,
(void *)&rbuf[mu][0], recv_from_rank, bytes);
}
comm_proc = mpi_layout[mu] - 1;
{
std::vector<CommsRequest_t> requests;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0], xmit_to_rank,
(void *)&rbuf[mu + 4][0], recv_from_rank, bytes);
}
}
Grid.Barrier();
double stop = usecond();
double mean = (stop - start) / Nloop;
double dbytes = bytes * ppn;
double xbytes = dbytes * 2.0 * ncomm;
double rbytes = xbytes;
double bidibytes = xbytes + rbytes;
std::cout << GridLogMessage << std::setw(4) << lat << "\t" << Ls << "\t"
<< std::setw(11) << bytes << std::fixed << std::setprecision(1)
<< std::setw(7) << " " << std::right << xbytes / mean << " "
<< "\t\t" << std::setw(7) << bidibytes / mean << std::endl;
}
}
for (int d = 0; d < 8; d++)
{
acceleratorFreeDevice(xbuf[d]);
acceleratorFreeDevice(rbuf[d]);
}
}
}
std::cout << GridLogMessage
<< "========================================================================="
"==========================="
<< std::endl;
std::cout << GridLogMessage << "= All done; Bye Bye" << std::endl;
std::cout << GridLogMessage
<< "========================================================================="
"==========================="
<< std::endl;
Grid_finalize();
}
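
A note on units in the tables printed above: bytes is the payload in bytes and mean is in microseconds, so xbytes / mean is bytes per microsecond, which is exactly the MB/s (10^6 bytes per second) promised by the header; bidibytes / mean doubles that for traffic in both directions. An illustrative calculation, ignoring the ppn and ncomm factors applied in the loop:

#include <cstdio>

int main()
{
  // Illustrative only: a 1 MB one-way payload exchanged in 100 usec.
  double bytes = 1.0e6, mean_usec = 100.0;
  std::printf("uni  %.0f MB/s\n", bytes / mean_usec);       // bytes/usec == 1e6 bytes/s
  std::printf("bidi %.0f MB/s\n", 2.0 * bytes / mean_usec); // both directions counted
  return 0;
}
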

Grid/Benchmark_dwf_fp32.cpp (new file, 512 lines)
View File

@ -0,0 +1,512 @@
/*
Copyright © 2015 Peter Boyle <paboyle@ph.ed.ac.uk>
Copyright © 2022 Antonin Portelli <antonin.portelli@me.com>
Copyright © 2023 Simon Bürger <simon.buerger@rwth-aachen.de>
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "json.hpp"
#include <Grid/Grid.h>
#ifdef GRID_CUDA
#define CUDA_PROFILE
#endif
#ifdef CUDA_PROFILE
#include <cuda_profiler_api.h>
#endif
using namespace std;
using namespace Grid;
template <class d> struct scal
{
d internal;
};
Gamma::Algebra Gmu[] = {Gamma::Algebra::GammaX, Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT};
int main(int argc, char **argv)
{
Grid_init(&argc, &argv);
int threads = GridThread::GetThreads();
Coordinate latt4 = GridDefaultLatt();
int Ls = 16;
std::string json_filename = ""; // empty indicates no json output
nlohmann::json json;
// benchmark specific command line arguments
for (int i = 0; i < argc; i++)
{
if (std::string(argv[i]) == "-Ls")
{
std::stringstream ss(argv[i + 1]);
ss >> Ls;
}
if (std::string(argv[i]) == "--json-out")
json_filename = argv[i + 1];
}
GridLogLayout();
long unsigned int single_site_flops = 8 * Nc * (7 + 16 * Nc);
json["single_site_flops"] = single_site_flops;
GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
json["grid"] = FGrid->FullDimensions().toVector();
json["local_grid"] = FGrid->LocalDimensions().toVector();
std::cout << GridLogMessage << "Making s innermost grids" << std::endl;
GridCartesian *sUGrid =
SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(), GridDefaultMpi());
GridRedBlackCartesian *sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian *sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls, UGrid);
GridRedBlackCartesian *sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls, UGrid);
std::vector<int> seeds4({1, 2, 3, 4});
std::vector<int> seeds5({5, 6, 7, 8});
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGrid);
RNG4.SeedUniqueString(std::string("The 4D RNG"));
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGrid);
RNG5.SeedUniqueString(std::string("The 5D RNG"));
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
LatticeFermionF src(FGrid);
random(RNG5, src);
#if 0
src = Zero();
{
Coordinate origin({0,0,0,latt4[2]-1,0});
SpinColourVectorF tmp;
tmp=Zero();
tmp()(0)(0)=Complex(-2.0,0.0);
std::cout << " source site 0 " << tmp<<std::endl;
pokeSite(tmp,src,origin);
}
#else
RealD N2 = 1.0 / ::sqrt(norm2(src));
src = src * N2;
#endif
LatticeFermionF result(FGrid);
result = Zero();
LatticeFermionF ref(FGrid);
ref = Zero();
LatticeFermionF tmp(FGrid);
LatticeFermionF err(FGrid);
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
LatticeGaugeFieldF Umu(UGrid);
SU<Nc>::HotConfiguration(RNG4, Umu);
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
#if 0
Umu=1.0;
for(int mu=0;mu<Nd;mu++){
LatticeColourMatrixF ttmp(UGrid);
ttmp = PeekIndex<LorentzIndex>(Umu,mu);
// if (mu !=2 ) ttmp = 0;
// ttmp = ttmp* pow(10.0,mu);
PokeIndex<LorentzIndex>(Umu,ttmp,mu);
}
std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
#endif
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
// replicate across fifth dimension
// LatticeGaugeFieldF Umu5d(FGrid);
std::vector<LatticeColourMatrixF> U(4, UGrid);
for (int mu = 0; mu < Nd; mu++)
{
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
}
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
if (1)
{
ref = Zero();
for (int mu = 0; mu < Nd; mu++)
{
tmp = Cshift(src, mu + 1, 1);
{
autoView(tmp_v, tmp, CpuWrite);
autoView(U_v, U[mu], CpuRead);
for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
{
for (int s = 0; s < Ls; s++)
{
tmp_v[Ls * ss + s] = U_v[ss] * tmp_v[Ls * ss + s];
}
}
}
ref = ref + tmp - Gamma(Gmu[mu]) * tmp;
{
autoView(tmp_v, tmp, CpuWrite);
autoView(U_v, U[mu], CpuRead);
autoView(src_v, src, CpuRead);
for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
{
for (int s = 0; s < Ls; s++)
{
tmp_v[Ls * ss + s] = adj(U_v[ss]) * src_v[Ls * ss + s];
}
}
}
tmp = Cshift(tmp, mu + 1, -1);
ref = ref + tmp + Gamma(Gmu[mu]) * tmp;
}
ref = -0.5 * ref;
}
RealD mass = 0.1;
RealD M5 = 1.8;
RealD NP = UGrid->_Nprocessors;
RealD NN = UGrid->NodeCount();
json["ranks"] = NP;
json["nodes"] = NN;
std::cout << GridLogMessage
<< "*****************************************************************"
<< std::endl;
std::cout << GridLogMessage
<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm"
<< std::endl;
std::cout << GridLogMessage
<< "*****************************************************************"
<< std::endl;
std::cout << GridLogMessage
<< "*****************************************************************"
<< std::endl;
std::cout << GridLogMessage
<< "* Benchmarking DomainWallFermionR::Dhop " << std::endl;
std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd()
<< std::endl;
std::cout << GridLogMessage << "* VComplexF size is " << sizeof(vComplexF) << " B"
<< std::endl;
if (sizeof(RealF) == 4)
std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
if (sizeof(RealF) == 8)
std::cout << GridLogMessage << "* DOUBLE precision " << std::endl;
#ifdef GRID_OMP
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl;
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
std::cout << GridLogMessage << "* Using sequential comms compute" << std::endl;
#endif
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric)
std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl;
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll)
std::cout << GridLogMessage << "* Using Nc=3 WilsonKernels" << std::endl;
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm)
std::cout << GridLogMessage << "* Using Asm Nc=3 WilsonKernels" << std::endl;
std::cout << GridLogMessage
<< "*****************************************************************"
<< std::endl;
DomainWallFermionF Dw(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5);
int ncall = 300;
if (1)
{
FGrid->Barrier();
Dw.ZeroCounters();
Dw.Dhop(src, result, 0);
std::cout << GridLogMessage << "Called warmup" << std::endl;
double t0 = usecond();
for (int i = 0; i < ncall; i++)
{
__SSC_START;
Dw.Dhop(src, result, 0);
__SSC_STOP;
}
double t1 = usecond();
FGrid->Barrier();
double volume = Ls;
for (int mu = 0; mu < Nd; mu++)
volume = volume * latt4[mu];
double flops = single_site_flops * volume * ncall;
auto nsimd = vComplex::Nsimd();
auto simdwidth = sizeof(vComplex);
// RF traffic model: (2*Nd+1) spinors and 2*Nd gauge links per 5d site, in complex words
double data_rf = volume * ((2 * Nd + 1) * Nd * Nc + 2 * Nd * Nc * Nc) * simdwidth /
nsimd * ncall / (1024. * 1024. * 1024.);
// memory traffic model: spinors per 5d site as above, gauge links counted once per 4d site (reused across Ls)
double data_mem =
(volume * (2 * Nd + 1) * Nd * Nc + (volume / Ls) * 2 * Nd * Nc * Nc) * simdwidth /
nsimd * ncall / (1024. * 1024. * 1024.);
json["Dw"]["calls"] = ncall;
json["Dw"]["time"] = t1 - t0;
json["Dw"]["mflops"] = flops / (t1 - t0);
json["Dw"]["mflops_per_rank"] = flops / (t1 - t0) / NP;
json["Dw"]["mflops_per_node"] = flops / (t1 - t0) / NN;
json["Dw"]["RF"] = 1000000. * data_rf / ((t1 - t0));
json["Dw"]["mem"] = 1000000. * data_mem / ((t1 - t0));
std::cout << GridLogMessage << "Called Dw " << ncall << " times in " << t1 - t0
<< " us" << std::endl;
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout << GridLogMessage << "mflop/s = " << flops / (t1 - t0) << std::endl;
std::cout << GridLogMessage << "mflop/s per rank = " << flops / (t1 - t0) / NP
<< std::endl;
std::cout << GridLogMessage << "mflop/s per node = " << flops / (t1 - t0) / NN
<< std::endl;
std::cout << GridLogMessage
<< "RF GiB/s (base 2) = " << 1000000. * data_rf / ((t1 - t0))
<< std::endl;
std::cout << GridLogMessage
<< "mem GiB/s (base 2) = " << 1000000. * data_mem / ((t1 - t0))
<< std::endl;
err = ref - result;
std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
// exit(0);
if ((norm2(err) > 1.0e-4))
{
/*
std::cout << "RESULT\n " << result<<std::endl;
std::cout << "REF \n " << ref <<std::endl;
std::cout << "ERR \n " << err <<std::endl;
*/
std::cout << GridLogMessage << "WRONG RESULT" << std::endl;
FGrid->Barrier();
exit(-1);
}
assert(norm2(err) < 1.0e-4);
Dw.Report();
}
if (1)
{ // Naive wilson dag implementation
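// Daggered reference: the gamma projectors swap sign relative to Dhop,
//   ref = -1/2 sum_mu [ U_mu(x) (1 + gamma_mu) psi(x+mu)
//                     + U_mu^dag(x-mu) (1 - gamma_mu) psi(x-mu) ]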
ref = Zero();
for (int mu = 0; mu < Nd; mu++)
{
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
tmp = Cshift(src, mu + 1, 1);
{
autoView(ref_v, ref, CpuWrite);
autoView(tmp_v, tmp, CpuRead);
autoView(U_v, U[mu], CpuRead);
for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
{
for (int s = 0; s < Ls; s++)
{
int i = s + Ls * ss;
ref_v[i] += U_v[ss] * (tmp_v[i] + Gamma(Gmu[mu]) * tmp_v[i]);
}
}
}
{
autoView(tmp_v, tmp, CpuWrite);
autoView(U_v, U[mu], CpuRead);
autoView(src_v, src, CpuRead);
for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
{
for (int s = 0; s < Ls; s++)
{
tmp_v[Ls * ss + s] = adj(U_v[ss]) * src_v[Ls * ss + s];
}
}
}
// tmp =adj(U[mu])*src;
tmp = Cshift(tmp, mu + 1, -1);
{
autoView(ref_v, ref, CpuWrite);
autoView(tmp_v, tmp, CpuRead);
for (int i = 0; i < ref_v.size(); i++)
{
ref_v[i] += tmp_v[i] - Gamma(Gmu[mu]) * tmp_v[i];
}
}
}
ref = -0.5 * ref;
}
// dump=1;
Dw.Dhop(src, result, 1);
std::cout << GridLogMessage
<< "Compare to naive wilson implementation Dag to verify correctness"
<< std::endl;
std::cout << GridLogMessage << "Called DwDag" << std::endl;
std::cout << GridLogMessage << "norm dag result " << norm2(result) << std::endl;
std::cout << GridLogMessage << "norm dag ref " << norm2(ref) << std::endl;
err = ref - result;
std::cout << GridLogMessage << "norm dag diff " << norm2(err) << std::endl;
if ((norm2(err) > 1.0e-4))
{
/*
std::cout<< "DAG RESULT\n " <<ref << std::endl;
std::cout<< "DAG sRESULT\n " <<result << std::endl;
std::cout<< "DAG ERR \n " << err <<std::endl;
*/
}
LatticeFermionF src_e(FrbGrid);
LatticeFermionF src_o(FrbGrid);
LatticeFermionF r_e(FrbGrid);
LatticeFermionF r_o(FrbGrid);
LatticeFermionF r_eo(FGrid);
std::cout << GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"
<< std::endl;
pickCheckerboard(Even, src_e, src);
pickCheckerboard(Odd, src_o, src);
std::cout << GridLogMessage << "src_e" << norm2(src_e) << std::endl;
std::cout << GridLogMessage << "src_o" << norm2(src_o) << std::endl;
// S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage
<< "*********************************************************" << std::endl;
std::cout << GridLogMessage
<< "* Benchmarking DomainWallFermionF::DhopEO " << std::endl;
std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd()
<< std::endl;
if (sizeof(RealF) == 4)
std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
if (sizeof(RealF) == 8)
std::cout << GridLogMessage << "* DOUBLE precision " << std::endl;
#ifdef GRID_OMP
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl;
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
std::cout << GridLogMessage << "* Using sequential comms compute" << std::endl;
#endif
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric)
std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl;
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll)
std::cout << GridLogMessage << "* Using Nc=3 WilsonKernels" << std::endl;
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm)
std::cout << GridLogMessage << "* Using Asm Nc=3 WilsonKernels" << std::endl;
std::cout << GridLogMessage
<< "*********************************************************" << std::endl;
{
Dw.ZeroCounters();
FGrid->Barrier();
Dw.DhopEO(src_o, r_e, DaggerNo);
double t0 = usecond();
for (int i = 0; i < ncall; i++)
{
#ifdef CUDA_PROFILE
if (i == 10)
cudaProfilerStart();
#endif
Dw.DhopEO(src_o, r_e, DaggerNo);
#ifdef CUDA_PROFILE
if (i == 20)
cudaProfilerStop();
#endif
}
double t1 = usecond();
FGrid->Barrier();
double volume = Ls;
for (int mu = 0; mu < Nd; mu++)
volume = volume * latt4[mu];
double flops = (single_site_flops * volume * ncall) / 2.0;
json["Deo"]["calls"] = ncall;
json["Deo"]["time"] = t1 - t0;
json["Deo"]["mflops"] = flops / (t1 - t0);
json["Deo"]["mflops_per_rank"] = flops / (t1 - t0) / NP;
json["Deo"]["mflops_per_node"] = flops / (t1 - t0) / NN;
std::cout << GridLogMessage << "Deo mflop/s = " << flops / (t1 - t0) << std::endl;
std::cout << GridLogMessage << "Deo mflop/s per rank " << flops / (t1 - t0) / NP
<< std::endl;
std::cout << GridLogMessage << "Deo mflop/s per node " << flops / (t1 - t0) / NN
<< std::endl;
Dw.Report();
}
Dw.DhopEO(src_o, r_e, DaggerNo);
Dw.DhopOE(src_e, r_o, DaggerNo);
Dw.Dhop(src, result, DaggerNo);
std::cout << GridLogMessage << "r_e" << norm2(r_e) << std::endl;
std::cout << GridLogMessage << "r_o" << norm2(r_o) << std::endl;
std::cout << GridLogMessage << "res" << norm2(result) << std::endl;
setCheckerboard(r_eo, r_o);
setCheckerboard(r_eo, r_e);
err = r_eo - result;
std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
if ((norm2(err) > 1.0e-4))
{
/*
std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
std::cout<< "Deo REF\n " <<result << std::endl;
std::cout<< "Deo ERR \n " << err <<std::endl;
*/
}
pickCheckerboard(Even, src_e, err);
pickCheckerboard(Odd, src_o, err);
std::cout << GridLogMessage << "norm diff even " << norm2(src_e) << std::endl;
std::cout << GridLogMessage << "norm diff odd " << norm2(src_o) << std::endl;
assert(norm2(src_e) < 1.0e-4);
assert(norm2(src_o) < 1.0e-4);
if (!json_filename.empty())
{
std::cout << GridLogMessage << "writing benchmark results to " << json_filename
<< std::endl;
int me = 0;
MPI_Comm_rank(MPI_COMM_WORLD, &me);
if (me == 0)
{
std::ofstream json_file(json_filename);
json_file << std::setw(4) << json;
}
}
Grid_finalize();
exit(0);
}

View File

@ -26,20 +26,6 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#define GRID_MSG_MAXSIZE 1024
#endif
#define GRID_BIG_SEP \
"==============================================================================="
#define GRID_SMALL_SEP "------------------------------------------"
#define grid_big_sep() \
{ \
GRID_MSG << GRID_BIG_SEP << std::endl; \
}
#define grid_small_sep() \
{ \
GRID_MSG << GRID_SMALL_SEP << std::endl; \
}
#define grid_printf(...) \
{ \
char _buf[GRID_MSG_MAXSIZE]; \

View File

@ -1,8 +1,12 @@
ACLOCAL_AMFLAGS = -I .buildutils/m4
bin_PROGRAMS = \
Benchmark_comms_host_device \
Benchmark_dwf_fp32 \
Benchmark_Grid \
Benchmark_IO
Benchmark_comms_host_device_SOURCES = Benchmark_comms_host_device.cpp
Benchmark_dwf_fp32_SOURCES = Benchmark_dwf_fp32.cpp
Benchmark_Grid_SOURCES = Benchmark_Grid.cpp
Benchmark_IO_SOURCES = Benchmark_IO.cpp

View File

@ -6,7 +6,6 @@ The benchmarks can be summarised as follows
- `Benchmark_Grid`: This benchmark measures floating-point performance for various fermion
matrices, as well as memory and communication bandwidth for different operations.
Measurements are performed for a fixed range of problem sizes.
- `Benchmark_IO`: Parallel I/O benchmark.
## TL;DR
Build and install Grid, all dependencies, and the benchmark with
@ -29,7 +28,7 @@ You should first deploy the environment for the specific system you are using, f
systems/tursa/bootstrap-env.sh ./env
```
will deploy the relevant environment for the [Tursa](https://www.epcc.ed.ac.uk/hpc-services/dirac-tursa-gpu) supercomputer in `./env`. This step might compile from source a large set
of packages, and take some time to complete.
of packages, and might take some time to complete.
After that, the environment directory (`./env` in the example above) will contain an `env.sh` file that needs to be sourced to activate the environment
```bash
@ -67,84 +66,4 @@ where `<env_dir>` is the environment directory and `<config>` is the build confi
## Running the benchmarks
After building the benchmarks as above you can find the binaries in
`<env_dir>/prefix/gridbench_<config>`. Depending on the system selected, the environment
directory might also contain batch script examples. More information about the benchmarks
is provided below.
### `Benchmark_Grid`
This benchmark performs flop/s measurements for typical lattice QCD sparse matrices, as
well as memory and inter-process bandwidth measurements using Grid routines. The benchmark
command accepts any Grid flag (see the complete list with `--help`), as well as a
`--json-out <file>` flag to save the measurement results in JSON to `<file>`. The
benchmarks are performed on a fixed set of problem sizes, and the Grid flag `--grid` will
be ignored.
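For instance, a run saving its results to `result.json` could look like the following
(a sketch only: the flag values are illustrative and must be adapted to your machine, and
the binary path follows the install layout described above):

```bash
# illustrative values; any other Grid flag can be appended
mpirun -np 4 \
    <env_dir>/prefix/gridbench_<config>/bin/Benchmark_Grid \
    --mpi 1.1.1.4 \
    --threads 8 \
    --shm 2048 \
    --json-out result.json
```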
The resulting metrics are as follows; all data size units are in base 2
(i.e. 1 kB = 1024 B).
*Memory bandwidth*
One sub-benchmark measures the memory bandwidth using a lattice version of the `axpy` BLAS
routine, in a similar fashion to the STREAM benchmark. The JSON entries under `"axpy"`
have the form
```json
{
"GBps": 215.80653375861607, // bandwidth in GB/s/node
"GFlops": 19.310041765757834, // FP performance (double precision)
"L": 8, // local lattice volume
"size_MB": 3.0 // memory size in MB/node
}
```
A second benchmark performs site-wise SU(4) matrix multiplication, and has a higher
arithmetic intensity than the `axpy` one (although it is still memory-bound).
The JSON entries under `"SU4"` have the form
```json
{
"GBps": 394.76639187026865, // bandwidth in GB/s/node
"GFlops": 529.8464820758512, // FP performance (single precision)
"L": 8, // local lattice size
"size_MB": 6.0 // memory size in MB/node
}
```
*Inter-process bandwidth*
This sub-benchmark measures the achieved bidirectional bandwidth in threaded halo exchange
using routines in Grid. The exchange is performed in each direction of the MPI Cartesian
grid, which is parallelised across at least 2 processes. The resulting bandwidth reflects
node-local transfers (inter-CPU, NVLink, ...) or network transfers, depending on the MPI
decomposition. The JSON entries under `"comms"` have the form
```json
{
"L": 40, // local lattice size
"bytes": 73728000, // payload size in B/rank
"dir": 2, // direction of the exchange, 8 possible directions
// (0: +x, 1: +y, ..., 5: -x, 6: -y, ...)
"rate_GBps": {
"error": 6.474271894240327, // standard deviation across measurements (GB/s/node)
"max": 183.10546875, // maximum measured bandwidth (GB/s/node)
"mean": 175.21747026766676 // average measured bandwidth (GB/s/node)
},
"time_usec": 3135.055 // average transfer time (microseconds)
}
```
*Floating-point performances*
This sub-benchmark measures the achieved floating-point performance using the
Wilson fermion, domain-wall fermion, and staggered fermion sparse matrices from Grid.
The best performances are recorded in the `"flops"` and `"results"` sections of the
JSON output, e.g.
```json
{
"Gflops_dwf4": 366.5251173474483, // domain-wall in Gflop/s/node (single precision)
"Gflops_staggered": 7.5982861018529455, // staggered in Gflop/s/node (single precision)
"Gflops_wilson": 15.221839719288932, // Wilson in Gflop/s/node (single precision)
"L": 8 // local lattice size
}
```
Here "best" means across a number of different implementations of the routines. Please
see the log of the benchmark for an additional breakdown. Finally, the JSON output
contains a "comparison point", which is the average of the L=24 and L=32 best
domain-wall performances.
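For illustration, the same average can be recomputed from a saved JSON file with `jq`
(a sketch: the exact path to the per-size entries may differ from the `.flops.results`
guess used here):

```bash
# hypothetical layout: per-size entries stored in an array under .flops.results
jq '[.flops.results[] | select(.L == 24 or .L == 32) | .Gflops_dwf4] | add / length' \
    result.json
```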
`<env_dir>/prefix/gridbench_<config>`.

View File

@ -20,12 +20,8 @@ mkdir -p "${build_dir}"
source "${env_dir}/env.sh"
entry=$(jq ".configs[]|select(.name==\"${cfg}\")" "${env_dir}"/grid-config.json)
env_script=$(echo "${entry}" | jq -r ".\"env-script\"")
cd "${build_dir}" || return
source "${env_dir}/${env_script}"
cd "${script_dir}"
if [ ! -f configure ]; then
./bootstrap.sh
fi
cd "${build_dir}"
if [ ! -f Makefile ]; then
"${script_dir}/configure" --with-grid="${env_dir}/prefix/grid_${cfg}" \
--prefix="${env_dir}/prefix/gridbench_${cfg}"

View File

@ -1,8 +1,8 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091
env_dir="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")"
mkdir -p ~/.config/lattice-benchmarks
echo "${env_dir}" > ~/.config/lattice-benchmarks/grid-env
source "${env_dir}/spack/share/spack/setup-env.sh"
spack load jq git
GRIDENVDIR="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")"
export GRIDENVDIR
export PATH="${GRIDENVDIR}/prefix/base/bin:${PATH}"
export ACLOCAL_PATH="${GRIDENVDIR}/prefix/base/share/aclocal:${ACLOCAL_PATH}"
source "${GRIDENVDIR}"/spack/share/spack/setup-env.sh

0
Grid/systems/tursa/files/cpu-mpi-wrapper.sh Executable file → Normal file
View File

0
Grid/systems/tursa/files/gpu-mpi-wrapper.sh Executable file → Normal file
View File

View File

@ -1,17 +0,0 @@
#!/usr/bin/env bash
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=8
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
export OMPI_MCA_io=romio321
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3

View File

@ -1,60 +0,0 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
#SBATCH -J benchmark-grid-16
#SBATCH -t 1:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --qos=standard
#SBATCH --no-requeue
#SBATCH --gpu-freq=1410
set -euo pipefail
# load environment #############################################################
env_cfg="${HOME}/.config/lattice-benchmarks/grid-env"
if [ ! -f "${env_cfg}" ]; then
echo "error: ${env_cfg} does not exists, did you execute 'source env.sh' with your user account?"
exit 1
fi
env_dir="$(readlink -f "$(cat "${env_cfg}")")"
source "${env_dir}/env.sh" # load base Spack environment
source "${env_dir}/env-gpu.sh" # load GPU-sepcific packages
source "${env_dir}/ompi-gpu.sh" # set GPU-specific OpenMPI variables
# application and parameters ###################################################
app="${env_dir}/prefix/gridbench_gpu/bin/Benchmark_Grid"
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd "${app}" > "${job_info_dir}/ldd"
md5sum "${app}" > "${job_info_dir}/app-hash"
readelf -a "${app}" > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
"${env_dir}/gpu-mpi-wrapper.sh" \
"${app}" \
--json-out "${job_info_dir}/result.json" \
--mpi 1.4.4.4 \
--accelerator-threads 8 \
--threads 8 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
################################################################################

View File

@ -1,60 +0,0 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
#SBATCH -J benchmark-grid-1
#SBATCH -t 1:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=4
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --qos=standard
#SBATCH --no-requeue
#SBATCH --gpu-freq=1410
set -euo pipefail
# load environment #############################################################
env_cfg="${HOME}/.config/lattice-benchmarks/grid-env"
if [ ! -f "${env_cfg}" ]; then
echo "error: ${env_cfg} does not exists, did you execute 'source env.sh' with your user account?"
exit 1
fi
env_dir="$(readlink -f "$(cat "${env_cfg}")")"
source "${env_dir}/env.sh" # load base Spack environment
source "${env_dir}/env-gpu.sh" # load GPU-sepcific packages
source "${env_dir}/ompi-gpu.sh" # set GPU-specific OpenMPI variables
# application and parameters ###################################################
app="${env_dir}/prefix/gridbench_gpu/bin/Benchmark_Grid"
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd "${app}" > "${job_info_dir}/ldd"
md5sum "${app}" > "${job_info_dir}/app-hash"
readelf -a "${app}" > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
"${env_dir}/gpu-mpi-wrapper.sh" \
"${app}" \
--json-out "${job_info_dir}/result.json" \
--mpi 1.1.1.4 \
--accelerator-threads 8 \
--threads 8 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
################################################################################

View File

@ -1,60 +0,0 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
#SBATCH -J benchmark-grid-32
#SBATCH -t 1:00:00
#SBATCH --nodes=32
#SBATCH --ntasks=128
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --qos=standard
#SBATCH --no-requeue
#SBATCH --gpu-freq=1410
set -euo pipefail
# load environment #############################################################
env_cfg="${HOME}/.config/lattice-benchmarks/grid-env"
if [ ! -f "${env_cfg}" ]; then
echo "error: ${env_cfg} does not exists, did you execute 'source env.sh' with your user account?"
exit 1
fi
env_dir="$(readlink -f "$(cat "${env_cfg}")")"
source "${env_dir}/env.sh" # load base Spack environment
source "${env_dir}/env-gpu.sh" # load GPU-sepcific packages
source "${env_dir}/ompi-gpu.sh" # set GPU-specific OpenMPI variables
# application and parameters ###################################################
app="${env_dir}/prefix/gridbench_gpu/bin/Benchmark_Grid"
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd "${app}" > "${job_info_dir}/ldd"
md5sum "${app}" > "${job_info_dir}/app-hash"
readelf -a "${app}" > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
"${env_dir}/gpu-mpi-wrapper.sh" \
"${app}" \
--json-out "${job_info_dir}/result.json" \
--mpi 1.4.4.8 \
--accelerator-threads 8 \
--threads 8 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
################################################################################

View File

@ -4,13 +4,7 @@ set -euo pipefail
gcc_spec='gcc@9.4.0'
cuda_spec='cuda@11.4.0'
# hdf5 and fftw depend on OpenMPI, which we install manually. To make sure this
# dependency is picked up by Spack, we specify the compiler here explicitly. For
# most other packages we don't really care about the compiler (i.e. system
# compiler versus ${gcc_spec})
hdf5_spec="hdf5@1.10.7+cxx+threadsafe%${gcc_spec}"
fftw_spec="fftw%${gcc_spec}"
hdf5_spec='hdf5@1.10.7'
if (( $# != 1 )); then
echo "usage: $(basename "$0") <env dir>" 1>&2
@ -24,7 +18,7 @@ cd "${cwd}"
# General configuration ########################################################
# build with 128 tasks
echo 'config:
echo 'config:
build_jobs: 128
build_stage:
- $spack/var/spack/stage
@ -44,23 +38,26 @@ rm external.yaml
# Base compilers ###############################################################
# configure system base
spack env create base
spack env activate base
spack compiler find --scope site
# install GCC, CUDA
spack add ${gcc_spec} ${cuda_spec}
spack concretize
spack env depfile -o Makefile.tmp
make -j128 -f Makefile.tmp
# install GCC, CUDA & LLVM
spack install ${gcc_spec} ${cuda_spec} llvm
spack load llvm
spack compiler find --scope site
spack unload llvm
spack load ${gcc_spec}
spack compiler find --scope site
spack unload ${gcc_spec}
# Manual compilation of OpenMPI & UCX ##########################################
# set build directories
mkdir -p "${dir}"/build
cd "${dir}"/build
spack load ${gcc_spec} ${cuda_spec}
cuda_path=$(spack find --format "{prefix}" cuda)
gdrcopy_path=/mnt/lustre/tursafs1/apps/gdrcopy/2.3.1
@ -127,8 +124,8 @@ mkdir build_gpu; cd build_gpu
--with-cuda="${cuda_path}" --disable-getpwuid \
--with-verbs --with-slurm --enable-mpi-fortran=all \
--with-pmix=internal --with-libevent=internal
make -j 128
make install
make -j 128
make install
cd ..
# openmpi cpu build
@ -144,65 +141,65 @@ make -j 128
make install
cd "${dir}"
ucx_spec_gpu="ucx@1.12.0.GPU%${gcc_spec}"
ucx_spec_cpu="ucx@1.12.0.CPU%${gcc_spec}"
openmpi_spec_gpu="openmpi@4.1.1.GPU%${gcc_spec}"
openmpi_spec_cpu="openmpi@4.1.1.CPU%${gcc_spec}"
# Add externals to spack
echo "packages:
ucx:
externals:
- spec: \"${ucx_spec_gpu}\"
- spec: \"ucx@1.12.0.GPU%gcc@9.4.0\"
prefix: ${dir}/prefix/ucx_gpu
- spec: \"${ucx_spec_cpu}\"
- spec: \"ucx@1.12.0.CPU%gcc@9.4.0\"
prefix: ${dir}/prefix/ucx_cpu
buildable: False
openmpi:
externals:
- spec: \"${openmpi_spec_gpu}\"
- spec: \"openmpi@4.1.1.GPU%gcc@9.4.0\"
prefix: ${dir}/prefix/ompi_gpu
- spec: \"${openmpi_spec_cpu}\"
- spec: \"openmpi@4.1.1.CPU%gcc@9.4.0\"
prefix: ${dir}/prefix/ompi_cpu
buildable: False" > spack.yaml
spack config --scope site add -f spack.yaml
rm spack.yaml
spack env deactivate
spack install ucx@1.12.0.GPU%gcc@9.4.0 openmpi@4.1.1.GPU%gcc@9.4.0
spack install ucx@1.12.0.CPU%gcc@9.4.0 openmpi@4.1.1.CPU%gcc@9.4.0
cd "${cwd}"
# environments #################################################################
dev_tools=("autoconf" "automake" "libtool" "jq" "git")
dev_tools=("autoconf" "automake" "libtool" "jq")
ompi_gpu_hash=$(spack find --format "{hash}" openmpi@4.1.1.GPU)
ompi_cpu_hash=$(spack find --format "{hash}" openmpi@4.1.1.CPU)
spack env create grid-gpu
spack env activate grid-gpu
spack compiler find --scope site
spack add ${gcc_spec} ${cuda_spec} ${ucx_spec_gpu} ${openmpi_spec_gpu}
spack add ${hdf5_spec} ${fftw_spec}
spack add openssl gmp mpfr c-lime "${dev_tools[@]}"
spack concretize
spack env depfile -o Makefile.tmp
make -j128 -f Makefile.tmp
spack add ${gcc_spec} ${cuda_spec} "${dev_tools[@]}"
spack add ucx@1.12.0.GPU%gcc@9.4.0 openmpi@4.1.1.GPU%gcc@9.4.0
spack add ${hdf5_spec}+cxx+threadsafe ^/"${ompi_gpu_hash}"
spack add fftw ^/"${ompi_gpu_hash}"
spack add openssl gmp mpfr c-lime
spack install
spack env deactivate
spack env create grid-cpu
spack env activate grid-cpu
spack compiler find --scope site
spack add ${gcc_spec} ${ucx_spec_cpu} ${openmpi_spec_cpu}
spack add ${hdf5_spec} ${fftw_spec}
spack add openssl gmp mpfr c-lime "${dev_tools[@]}"
spack concretize
spack env depfile -o Makefile.tmp
make -j128 -f Makefile.tmp
spack add llvm "${dev_tools[@]}"
spack add ucx@1.12.0.CPU%gcc@9.4.0 openmpi@4.1.1.CPU%gcc@9.4.0
spack add ${hdf5_spec}+cxx+threadsafe ^/"${ompi_cpu_hash}"
spack add fftw ^/"${ompi_cpu_hash}"
spack add openssl gmp mpfr c-lime
spack install
spack env deactivate
# Final setup ##################################################################
spack clean
#spack gc -y # "spack gc" tends to get hung up for unknown reasons
spack gc -y
# add more environment variables in module loading
spack config --scope site add 'modules:prefix_inspections:lib:[LD_LIBRARY_PATH,LIBRARY_PATH]'
spack config --scope site add 'modules:prefix_inspections:lib64:[LD_LIBRARY_PATH,LIBRARY_PATH]'
spack config --scope site add 'modules:prefix_inspections:include:[C_INCLUDE_PATH,CPLUS_INCLUDE_PATH,INCLUDE]'
spack module tcl refresh -y
# permission change for group access
chmod -R g+rw "${dir}/spack/var/spack/cache"
setfacl -d -R -m g::rwX "${dir}/spack/var/spack/cache"