Compare commits
1 commit

SHA1: 2efbad5f70

@@ -1,9 +1,8 @@
/*
Copyright © 2015 Peter Boyle <paboyle@ph.ed.ac.uk>
Copyright © 2022 Antonin Portelli <antonin.portelli@me.com>
Copyright © 2022 Simon Buerger <simon.buerger@rwth-aachen.de>

This is a fork of Benchmark_ITT.cpp from Grid
This is a refactoring of Benchmark_ITT.cpp from Grid

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
@@ -25,6 +24,13 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.

using namespace Grid;

std::vector<int> L_list;
std::vector<int> Ls_list;
std::vector<double> mflop_list;

double mflop_ref;
double mflop_ref_err;

int NN_global;

nlohmann::json json_results;
@@ -52,6 +58,18 @@ struct time_statistics
}
};

void comms_header()
{
std::cout << GridLogMessage << " L "
<< "\t"
<< " Ls "
<< "\t"
<< "bytes\t MB/s uni (err/min/max) \t\t MB/s bidi (err/min/max)" << std::endl;
};

Gamma::Algebra Gmu[] = {Gamma::Algebra::GammaX, Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT};

struct controls
{
int Opt;
@@ -64,69 +82,47 @@ class Benchmark
public:
static void Decomposition(void)
{
nlohmann::json tmp;

int threads = GridThread::GetThreads();
Grid::Coordinate mpi = GridDefaultMpi();
assert(mpi.size() == 4);
Coordinate local({8, 8, 8, 8});
Coordinate latt4(
{local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]});
GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid(
latt4, GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
Grid::Coordinate shm(4, 1);
GlobalSharedMemory::GetShmDims(mpi, shm);

uint64_t NP = TmpGrid->RankCount();
uint64_t NN = TmpGrid->NodeCount();
NN_global = NN;
uint64_t SHM = NP / NN;

grid_big_sep();
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << "= Grid is setup to use " << threads << " threads"
<< std::endl;
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << "Grid Default Decomposition patterns\n";
grid_small_sep();
std::cout << GridLogMessage << "* OpenMP threads : " << GridThread::GetThreads()
std::cout << GridLogMessage << "\tOpenMP threads : " << GridThread::GetThreads()
<< std::endl;

std::cout << GridLogMessage << "* MPI layout : " << GridCmdVectorIntToString(mpi)
std::cout << GridLogMessage
<< "\tMPI tasks : " << GridCmdVectorIntToString(GridDefaultMpi())
<< std::endl;
std::cout << GridLogMessage << "* Shm layout : " << GridCmdVectorIntToString(shm)
<< std::endl;

std::cout << GridLogMessage << "* vReal : " << sizeof(vReal) * 8 << "bits ; "
std::cout << GridLogMessage << "\tvReal : " << sizeof(vReal) * 8 << "bits ; "
<< GridCmdVectorIntToString(GridDefaultSimd(4, vReal::Nsimd()))
<< std::endl;
std::cout << GridLogMessage << "* vRealF : " << sizeof(vRealF) * 8
std::cout << GridLogMessage << "\tvRealF : " << sizeof(vRealF) * 8
<< "bits ; "
<< GridCmdVectorIntToString(GridDefaultSimd(4, vRealF::Nsimd()))
<< std::endl;
std::cout << GridLogMessage << "* vRealD : " << sizeof(vRealD) * 8
std::cout << GridLogMessage << "\tvRealD : " << sizeof(vRealD) * 8
<< "bits ; "
<< GridCmdVectorIntToString(GridDefaultSimd(4, vRealD::Nsimd()))
<< std::endl;
std::cout << GridLogMessage << "* vComplex : " << sizeof(vComplex) * 8
std::cout << GridLogMessage << "\tvComplex : " << sizeof(vComplex) * 8
<< "bits ; "
<< GridCmdVectorIntToString(GridDefaultSimd(4, vComplex::Nsimd()))
<< std::endl;
std::cout << GridLogMessage << "* vComplexF : " << sizeof(vComplexF) * 8
std::cout << GridLogMessage << "\tvComplexF : " << sizeof(vComplexF) * 8
<< "bits ; "
<< GridCmdVectorIntToString(GridDefaultSimd(4, vComplexF::Nsimd()))
<< std::endl;
std::cout << GridLogMessage << "* vComplexD : " << sizeof(vComplexD) * 8
std::cout << GridLogMessage << "\tvComplexD : " << sizeof(vComplexD) * 8
<< "bits ; "
<< GridCmdVectorIntToString(GridDefaultSimd(4, vComplexD::Nsimd()))
<< std::endl;
std::cout << GridLogMessage << "* ranks : " << NP << std::endl;
std::cout << GridLogMessage << "* nodes : " << NN << std::endl;
std::cout << GridLogMessage << "* ranks/node : " << SHM << std::endl;

for (unsigned int i = 0; i < mpi.size(); ++i)
{
tmp["mpi"].push_back(mpi[i]);
tmp["shm"].push_back(shm[i]);
}
tmp["ranks"] = NP;
tmp["nodes"] = NN;
json_results["geometry"] = tmp;
}

static void Comms(void)
@@ -137,8 +133,6 @@ class Benchmark

Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
Coordinate shm_layout(Nd, 1);
GlobalSharedMemory::GetShmDims(mpi_layout, shm_layout);

for (int mu = 0; mu < Nd; mu++)
if (mpi_layout[mu] > 1)
@@ -147,11 +141,18 @@ class Benchmark
std::vector<double> t_time(Nloop);
time_statistics timestat;

std::cout << GridLogMessage << "Benchmarking threaded STENCIL halo exchange in "
std::cout << GridLogMessage
<< "======================================================================="
"============================="
<< std::endl;
std::cout << GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "
<< nmu << " dimensions" << std::endl;
grid_small_sep();
grid_printf("%5s %5s %7s %15s %15s %15s %15s %15s\n", "L", "dir", "shm",
"payload (B)", "time (usec)", "rate (GB/s/node)", "std dev", "max");
std::cout << GridLogMessage
<< "======================================================================="
"============================="
<< std::endl;
grid_printf("%5s %5s %15s %15s %15s %15s %15s\n", "L", "dir", "payload (B)",
"time (usec)", "rate (GB/s)", "std dev", "max");

for (int lat = 16; lat <= maxlat; lat += 8)
{
@@ -175,85 +176,56 @@ class Benchmark
}

double dbytes;
#define NWARMUP 50

for (int dir = 0; dir < 8; dir++)
{
int mu = dir % 4;
if (mpi_layout[mu] == 1) // skip directions that are not distributed
continue;
bool is_shm = mpi_layout[mu] == shm_layout[mu];
bool is_partial_shm = !is_shm && shm_layout[mu] != 1;

std::vector<double> times(Nloop);
for (int i = 0; i < NWARMUP; i++)
{
int xmit_to_rank;
int recv_from_rank;

if (dir == mu)
{
int comm_proc = 1;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
}
else
{
int comm_proc = mpi_layout[mu] - 1;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
}
Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, (void *)&rbuf[dir][0],
recv_from_rank, bytes);
}
for (int i = 0; i < Nloop; i++)
if (mpi_layout[mu] > 1)
{

dbytes = 0;
double start = usecond();
int xmit_to_rank;
int recv_from_rank;

if (dir == mu)
std::vector<double> times(Nloop);
for (int i = 0; i < Nloop; i++)
{
int comm_proc = 1;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
}
else
{
int comm_proc = mpi_layout[mu] - 1;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
}
Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, (void *)&rbuf[dir][0],
recv_from_rank, bytes);
dbytes += bytes;

double stop = usecond();
t_time[i] = stop - start; // microseconds
dbytes = 0;
double start = usecond();
int xmit_to_rank;
int recv_from_rank;

if (dir == mu)
{
int comm_proc = 1;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
}
else
{
int comm_proc = mpi_layout[mu] - 1;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
}
Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
(void *)&rbuf[dir][0], recv_from_rank, bytes);
dbytes += bytes;

double stop = usecond();
t_time[i] = stop - start; // microseconds
}
timestat.statistics(t_time);

dbytes = dbytes * ppn;
double bidibytes = 2. * dbytes;
double rate = bidibytes / (timestat.mean / 1.e6) / 1024. / 1024. / 1024.;
double rate_err = rate * timestat.err / timestat.mean;
double rate_max = rate * timestat.mean / timestat.min;
grid_printf("%5d %5d %15d %15.2f %15.2f %15.1f %15.2f\n", lat, dir, bytes,
timestat.mean, rate, rate_err, rate_max);
nlohmann::json tmp;
tmp["L"] = lat;
tmp["dir"] = dir;
tmp["bytes"] = bytes;
tmp["time"] = timestat.mean;
tmp["GB_per_second"] = rate;
json_results["comms"].push_back(tmp);
}
timestat.statistics(t_time);

dbytes = dbytes * ppn;
double bidibytes = 2. * dbytes;
double rate = bidibytes / (timestat.mean / 1.e6) / 1024. / 1024. / 1024.;
double rate_err = rate * timestat.err / timestat.mean;
double rate_max = rate * timestat.mean / timestat.min;
grid_printf("%5d %5d %7s %15d %15.2f %15.2f %15.1f %15.2f\n", lat, dir,
is_shm ? "yes"
: is_partial_shm ? "partial"
: "no",
bytes, timestat.mean, rate, rate_err, rate_max);
nlohmann::json tmp;
nlohmann::json tmp_rate;
tmp["L"] = lat;
tmp["dir"] = dir;
tmp["shared_mem"] = is_shm;
tmp["partial_shared_mem"] = is_partial_shm;
tmp["bytes"] = bytes;
tmp["time_usec"] = timestat.mean;
tmp_rate["mean"] = rate;
tmp_rate["error"] = rate_err;
tmp_rate["max"] = rate_max;
tmp["rate_GBps"] = tmp_rate;
json_results["comms"].push_back(tmp);
}
for (int d = 0; d < 8; d++)
{
@@ -273,14 +245,33 @@ class Benchmark
Coordinate simd_layout = GridDefaultSimd(Nd, vReal::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();

std::cout << GridLogMessage << "Benchmarking a*x + y bandwidth" << std::endl;
grid_small_sep();
grid_printf("%5s %15s %15s %15s %15s\n", "L", "size (MB/node)", "time (usec)",
"GB/s/node", "Gflop/s/node");
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << "= Benchmarking a*x + y bandwidth" << std::endl;
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << " L "
<< "\t\t"
<< "bytes"
<< "\t\t\t"
<< "GB/s"
<< "\t\t"
<< "Gflop/s"
<< "\t\t seconds"
<< "\t\tGB/s / node" << std::endl;
std::cout << GridLogMessage
<< "----------------------------------------------------------"
<< std::endl;

// uint64_t NP;
uint64_t NN;
uint64_t lmax = 64;
#define NLOOP (200 * lmax * lmax * lmax / lat / lat / lat)

uint64_t lmax = 32;
#define NLOOP (1000 * lmax * lmax * lmax * lmax / lat / lat / lat / lat)

GridSerialRNG sRNG;
sRNG.SeedFixedIntegers(std::vector<int>({45, 12, 81, 9}));
@@ -289,11 +280,11 @@ class Benchmark

Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
lat * mpi_layout[3]});
double vol =
static_cast<double>(latt_size[0]) * latt_size[1] * latt_size[2] * latt_size[3];
int64_t vol = latt_size[0] * latt_size[1] * latt_size[2] * latt_size[3];

GridCartesian Grid(latt_size, simd_layout, mpi_layout);

// NP= Grid.RankCount();
NN = Grid.NodeCount();

Vec rn;
@@ -309,29 +300,26 @@ class Benchmark

uint64_t Nloop = NLOOP;

for (int i = 0; i < NWARMUP; i++)
{
z = a * x - y;
}
double start = usecond();
for (int i = 0; i < Nloop; i++)
{
z = a * x - y;
}
double stop = usecond();
double time = (stop - start) / Nloop / 1.e6;
double time = (stop - start) / Nloop * 1000;

double flops = vol * Nvec * 2 / 1.e9; // mul,add
double bytes = 3.0 * vol * Nvec * sizeof(Real) / 1024. / 1024.;

grid_printf("%5d %15.2f %15.2f %15.2f %15.2f\n", lat, bytes / NN, time * 1.e6,
bytes / time / NN / 1024., flops / time / NN);
double flops = vol * Nvec * 2; // mul,add
double bytes = 3.0 * vol * Nvec * sizeof(Real);
std::cout << GridLogMessage << std::setprecision(3) << lat << "\t\t" << bytes
<< " \t\t" << bytes / time << "\t\t" << flops / time << "\t\t"
<< (stop - start) / 1000. / 1000. << "\t\t" << bytes / time / NN
<< std::endl;

nlohmann::json tmp;
tmp["L"] = lat;
tmp["size_MB"] = bytes / NN;
tmp["GBps"] = bytes / time / NN / 1024.;
tmp["GFlops"] = flops / time / NN;
tmp["bytes"] = bytes;
tmp["gflops"] = flops / time;
tmp["GB_per_second"] = bytes / time;
json_results["axpy"].push_back(tmp);
}
};
@@ -344,14 +332,31 @@ class Benchmark
Coordinate simd_layout = GridDefaultSimd(Nd, vComplexF::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();

std::cout << GridLogMessage << "Benchmarking z = y*x SU(4) bandwidth" << std::endl;
grid_small_sep();
grid_printf("%5s %15s %15s %15s %15s\n", "L", "size (MB/node)", "time (usec)",
"GB/s/node", "Gflop/s/node");
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << "= Benchmarking z = y*x SU(4) bandwidth" << std::endl;
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << " L "
<< "\t\t"
<< "bytes"
<< "\t\t\t"
<< "GB/s"
<< "\t\t"
<< "Gflop/s"
<< "\t\t seconds"
<< "\t\tGB/s / node" << std::endl;
std::cout << GridLogMessage
<< "----------------------------------------------------------"
<< std::endl;

uint64_t NN;

uint64_t lmax = 48;
uint64_t lmax = 32;

GridSerialRNG sRNG;
sRNG.SeedFixedIntegers(std::vector<int>({45, 12, 81, 9}));
@@ -360,8 +365,7 @@ class Benchmark

Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
lat * mpi_layout[3]});
double vol =
static_cast<double>(latt_size[0]) * latt_size[1] * latt_size[2] * latt_size[3];
int64_t vol = latt_size[0] * latt_size[1] * latt_size[2] * latt_size[3];

GridCartesian Grid(latt_size, simd_layout, mpi_layout);

@@ -373,31 +377,30 @@ class Benchmark
x = Zero();
LatticeSU4 y(&Grid);
y = Zero();
// double a=2.0;

uint64_t Nloop = NLOOP;

for (int i = 0; i < NWARMUP; i++)
{
z = x * y;
}
double start = usecond();
for (int i = 0; i < Nloop; i++)
{
z = x * y;
}
double stop = usecond();
double time = (stop - start) / Nloop / 1.e6;
double time = (stop - start) / Nloop * 1000;

double flops = vol * Nc4 * Nc4 * (6 + (Nc4 - 1) * 8) / 1.e9; // mul,add
double bytes = 3.0 * vol * Nc4 * Nc4 * 2 * sizeof(RealF) / 1024. / 1024.;
grid_printf("%5d %15.2f %15.2f %15.2f %15.2f\n", lat, bytes / NN, time * 1.e6,
bytes / time / NN / 1024., flops / time / NN);
double flops = vol * Nc4 * Nc4 * (6 + (Nc4 - 1) * 8); // mul,add
double bytes = 3.0 * vol * Nc4 * Nc4 * 2 * sizeof(RealF);
std::cout << GridLogMessage << std::setprecision(3) << lat << "\t\t" << bytes
<< " \t\t" << bytes / time << "\t\t" << flops / time << "\t\t"
<< (stop - start) / 1000. / 1000. << "\t\t" << bytes / time / NN
<< std::endl;

nlohmann::json tmp;
tmp["L"] = lat;
tmp["size_MB"] = bytes / NN;
tmp["GBps"] = bytes / time / NN / 1024.;
tmp["GFlops"] = flops / time / NN;
tmp["bytes"] = bytes;
tmp["GB_per_second"] = bytes / time;
tmp["gflops"] = flops / time;
json_results["SU4"].push_back(tmp);
}
};
@@ -407,10 +410,10 @@ class Benchmark
RealD mass = 0.1;
RealD M5 = 1.8;

double gflops;
double gflops_best = 0;
double gflops_worst = 0;
std::vector<double> gflops_all;
double mflops;
double mflops_best = 0;
double mflops_worst = 0;
std::vector<double> mflops_all;

///////////////////////////////////////////////////////
// Set/Get the layout & grid size
@@ -430,7 +433,10 @@ class Benchmark
uint64_t SHM = NP / NN;

///////// Welcome message ////////////
grid_big_sep();
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << "Benchmark DWF on " << L << "^4 local volume "
<< std::endl;
std::cout << GridLogMessage << "* Nc : " << Nc << std::endl;
@@ -443,7 +449,10 @@ class Benchmark
std::cout << GridLogMessage << "* ranks geom : " << GridCmdVectorIntToString(mpi)
<< std::endl;
std::cout << GridLogMessage << "* Using " << threads << " threads" << std::endl;
grid_big_sep();
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;

///////// Lattice Init ////////////
GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
@@ -502,7 +511,10 @@ class Benchmark
WilsonKernelsStatic::Opt = Cases[c].Opt;
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);

grid_small_sep();
std::cout << GridLogMessage
<< "==================================================================="
"==============="
<< std::endl;
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric)
std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl;
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
@@ -510,7 +522,10 @@ class Benchmark
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
std::cout << GridLogMessage << "* Using sequential Comms/Compute" << std::endl;
std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
grid_small_sep();
std::cout << GridLogMessage
<< "==================================================================="
"==============="
<< std::endl;

int nwarm = 10;
double t0 = usecond();
@@ -525,6 +540,10 @@ class Benchmark

FGrid->Broadcast(0, &ncall, sizeof(ncall));

// std::cout << GridLogMessage << " Estimate " << ncall << " calls per
// second"<<std::endl;
Dw.ZeroCounters();

time_statistics timestat;
std::vector<double> t_time(ncall);
for (uint64_t i = 0; i < ncall; i++)
@@ -550,60 +569,67 @@ class Benchmark
double fps =
Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2;
#endif
double flops = (fps * volume) / 2.;
double gf_hi, gf_lo, gf_err;
double flops = (fps * volume) / 2;
double mf_hi, mf_lo, mf_err;

timestat.statistics(t_time);
gf_hi = flops / timestat.min / 1000.;
gf_lo = flops / timestat.max / 1000.;
gf_err = flops / timestat.min * timestat.err / timestat.mean / 1000.;
mf_hi = flops / timestat.min;
mf_lo = flops / timestat.max;
mf_err = flops / timestat.min * timestat.err / timestat.mean;

gflops = flops / timestat.mean / 1000.;
gflops_all.push_back(gflops);
if (gflops_best == 0)
gflops_best = gflops;
if (gflops_worst == 0)
gflops_worst = gflops;
if (gflops > gflops_best)
gflops_best = gflops;
if (gflops < gflops_worst)
gflops_worst = gflops;
mflops = flops / timestat.mean;
mflops_all.push_back(mflops);
if (mflops_best == 0)
mflops_best = mflops;
if (mflops_worst == 0)
mflops_worst = mflops;
if (mflops > mflops_best)
mflops_best = mflops;
if (mflops < mflops_worst)
mflops_worst = mflops;

std::cout << GridLogMessage << "Deo FlopsPerSite is " << fps << std::endl;
std::cout << GridLogMessage << std::fixed << std::setprecision(1)
<< "Deo Gflop/s = " << gflops << " (" << gf_err << ") " << gf_lo
<< "-" << gf_hi << std::endl;
<< "Deo mflop/s = " << mflops << " (" << mf_err << ") " << mf_lo
<< "-" << mf_hi << std::endl;
std::cout << GridLogMessage << std::fixed << std::setprecision(1)
<< "Deo Gflop/s per rank " << gflops / NP << std::endl;
<< "Deo mflop/s per rank " << mflops / NP << std::endl;
std::cout << GridLogMessage << std::fixed << std::setprecision(1)
<< "Deo Gflop/s per node " << gflops / NN << std::endl;
<< "Deo mflop/s per node " << mflops / NN << std::endl;
}

grid_small_sep();
std::cout << GridLogMessage
<< "====================================================================="
"============="
<< std::endl;
std::cout << GridLogMessage << L << "^4 x " << Ls
<< " Deo Best Gflop/s = " << gflops_best << " ; "
<< gflops_best / NN << " per node " << std::endl;
<< " Deo Best mflop/s = " << mflops_best << " ; "
<< mflops_best / NN << " per node " << std::endl;
std::cout << GridLogMessage << L << "^4 x " << Ls
<< " Deo Worst Gflop/s = " << gflops_worst << " ; "
<< gflops_worst / NN << " per node " << std::endl;
<< " Deo Worst mflop/s = " << mflops_worst << " ; "
<< mflops_worst / NN << " per node " << std::endl;
std::cout << GridLogMessage << fmt << std::endl;
std::cout << GridLogMessage;

for (int i = 0; i < gflops_all.size(); i++)
for (int i = 0; i < mflops_all.size(); i++)
{
std::cout << gflops_all[i] / NN << " ; ";
std::cout << mflops_all[i] / NN << " ; ";
}
std::cout << std::endl;
std::cout << GridLogMessage
<< "====================================================================="
"============="
<< std::endl;
}
return gflops_best;
return mflops_best;
}

static double Staggered(int L)
{
double gflops;
double gflops_best = 0;
double gflops_worst = 0;
std::vector<double> gflops_all;
double mflops;
double mflops_best = 0;
double mflops_worst = 0;
std::vector<double> mflops_all;

///////////////////////////////////////////////////////
// Set/Get the layout & grid size
@@ -623,7 +649,10 @@ class Benchmark
uint64_t SHM = NP / NN;

///////// Welcome message ////////////
grid_big_sep();
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << "Benchmark ImprovedStaggered on " << L
<< "^4 local volume " << std::endl;
std::cout << GridLogMessage
@@ -634,7 +663,10 @@ class Benchmark
std::cout << GridLogMessage << "* ranks geom : " << GridCmdVectorIntToString(mpi)
<< std::endl;
std::cout << GridLogMessage << "* Using " << threads << " threads" << std::endl;
grid_big_sep();
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;

///////// Lattice Init ////////////
GridCartesian *FGrid = SpaceTimeGrid::makeFourDimGrid(
@@ -696,7 +728,10 @@ class Benchmark
StaggeredKernelsStatic::Opt = Cases[c].Opt;
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);

grid_small_sep();
std::cout << GridLogMessage
<< "==================================================================="
"==============="
<< std::endl;
if (StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric)
std::cout << GridLogMessage << "* Using GENERIC Nc StaggeredKernels"
<< std::endl;
@@ -705,7 +740,10 @@ class Benchmark
if (StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsThenCompute)
std::cout << GridLogMessage << "* Using sequential Comms/Compute" << std::endl;
std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
grid_small_sep();
std::cout << GridLogMessage
<< "==================================================================="
"==============="
<< std::endl;

int nwarm = 10;
double t0 = usecond();
@@ -720,6 +758,10 @@ class Benchmark

FGrid->Broadcast(0, &ncall, sizeof(ncall));

// std::cout << GridLogMessage << " Estimate " << ncall << " calls per
// second"<<std::endl;
Ds.ZeroCounters();

time_statistics timestat;
std::vector<double> t_time(ncall);
for (uint64_t i = 0; i < ncall; i++)
@@ -734,51 +776,58 @@ class Benchmark
double volume = 1;
for (int mu = 0; mu < Nd; mu++)
volume = volume * latt4[mu];
double flops = (1146.0 * volume) / 2.;
double gf_hi, gf_lo, gf_err;
double flops = (1146.0 * volume) / 2;
double mf_hi, mf_lo, mf_err;

timestat.statistics(t_time);
gf_hi = flops / timestat.min / 1000.;
gf_lo = flops / timestat.max / 1000.;
gf_err = flops / timestat.min * timestat.err / timestat.mean / 1000.;
mf_hi = flops / timestat.min;
mf_lo = flops / timestat.max;
mf_err = flops / timestat.min * timestat.err / timestat.mean;

gflops = flops / timestat.mean / 1000.;
gflops_all.push_back(gflops);
if (gflops_best == 0)
gflops_best = gflops;
if (gflops_worst == 0)
gflops_worst = gflops;
if (gflops > gflops_best)
gflops_best = gflops;
if (gflops < gflops_worst)
gflops_worst = gflops;
mflops = flops / timestat.mean;
mflops_all.push_back(mflops);
if (mflops_best == 0)
mflops_best = mflops;
if (mflops_worst == 0)
mflops_worst = mflops;
if (mflops > mflops_best)
mflops_best = mflops;
if (mflops < mflops_worst)
mflops_worst = mflops;

std::cout << GridLogMessage << std::fixed << std::setprecision(1)
<< "Deo Gflop/s = " << gflops << " (" << gf_err << ") " << gf_lo
<< "-" << gf_hi << std::endl;
<< "Deo mflop/s = " << mflops << " (" << mf_err << ") " << mf_lo
<< "-" << mf_hi << std::endl;
std::cout << GridLogMessage << std::fixed << std::setprecision(1)
<< "Deo Gflop/s per rank " << gflops / NP << std::endl;
<< "Deo mflop/s per rank " << mflops / NP << std::endl;
std::cout << GridLogMessage << std::fixed << std::setprecision(1)
<< "Deo Gflop/s per node " << gflops / NN << std::endl;
<< "Deo mflop/s per node " << mflops / NN << std::endl;
}

grid_small_sep();
std::cout << GridLogMessage
<< "====================================================================="
"============="
<< std::endl;
std::cout << GridLogMessage << L
<< "^4 Deo Best Gflop/s = " << gflops_best << " ; "
<< gflops_best / NN << " per node " << std::endl;
<< "^4 Deo Best mflop/s = " << mflops_best << " ; "
<< mflops_best / NN << " per node " << std::endl;
std::cout << GridLogMessage << L
<< "^4 Deo Worst Gflop/s = " << gflops_worst << " ; "
<< gflops_worst / NN << " per node " << std::endl;
<< "^4 Deo Worst mflop/s = " << mflops_worst << " ; "
<< mflops_worst / NN << " per node " << std::endl;
std::cout << GridLogMessage << fmt << std::endl;
std::cout << GridLogMessage;

for (int i = 0; i < gflops_all.size(); i++)
for (int i = 0; i < mflops_all.size(); i++)
{
std::cout << gflops_all[i] / NN << " ; ";
std::cout << mflops_all[i] / NN << " ; ";
}
std::cout << std::endl;
}
return gflops_best;
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
return mflops_best;
}
};

@@ -816,87 +865,175 @@ int main(int argc, char **argv)
std::vector<double> dwf4;
std::vector<double> staggered;

if (do_memory)
{
grid_big_sep();
std::cout << GridLogMessage << " Memory benchmark " << std::endl;
grid_big_sep();
Benchmark::Memory();
}

if (do_su4)
{
grid_big_sep();
std::cout << GridLogMessage << " SU(4) benchmark " << std::endl;
grid_big_sep();
Benchmark::SU4();
}

if (do_comms)
{
grid_big_sep();
std::cout << GridLogMessage << " Communications benchmark " << std::endl;
grid_big_sep();
Benchmark::Comms();
}

if (do_flops)
{
Ls = 1;
grid_big_sep();
std::cout
<< GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
std::cout << GridLogMessage << " Wilson dslash 4D vectorised" << std::endl;
std::cout
<< GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
for (int l = 0; l < L_list.size(); l++)
{
wilson.push_back(Benchmark::DWF(Ls, L_list[l]));
}

Ls = 12;
grid_big_sep();
std::cout
<< GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
std::cout << GridLogMessage << " Domain wall dslash 4D vectorised" << std::endl;
std::cout
<< GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
for (int l = 0; l < L_list.size(); l++)
{
double result = Benchmark::DWF(Ls, L_list[l]);
dwf4.push_back(result);
}

grid_big_sep();
std::cout
<< GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
std::cout << GridLogMessage << " Improved Staggered dslash 4D vectorised"
<< std::endl;
std::cout
<< GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
for (int l = 0; l < L_list.size(); l++)
{
double result = Benchmark::Staggered(L_list[l]);
staggered.push_back(result);
}

int NN = NN_global;

grid_big_sep();
std::cout << GridLogMessage << "Gflop/s/node Summary table Ls=" << Ls << std::endl;
grid_big_sep();
grid_printf("%5s %12s %12s %12s\n", "L", "Wilson", "DWF", "Staggered");
nlohmann::json tmp_flops;
std::cout
<< GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
std::cout << GridLogMessage << " Summary table Ls=" << Ls << std::endl;
std::cout
<< GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
std::cout << GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\t Staggered" << std::endl;
for (int l = 0; l < L_list.size(); l++)
{
grid_printf("%5d %12.2f %12.2f %12.2f\n", L_list[l], wilson[l] / NN, dwf4[l] / NN,
staggered[l] / NN);

std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] << " \t\t "
<< dwf4[l] << " \t\t " << staggered[l] << std::endl;
nlohmann::json tmp;
tmp["L"] = L_list[l];
tmp["Gflops_wilson"] = wilson[l] / NN;
tmp["Gflops_dwf4"] = dwf4[l] / NN;
tmp["Gflops_staggered"] = staggered[l] / NN;
tmp_flops["results"].push_back(tmp);
tmp["mflops_wilson"] = wilson[l];
tmp["mflops_dwf4"] = dwf4[l];
tmp["mflops_staggered"] = staggered[l];
json_results["flops"].push_back(tmp);
}
grid_big_sep();
std::cout
<< GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
}

int NN = NN_global;
if (do_memory)
{
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << " Memory benchmark " << std::endl;
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
Benchmark::Memory();
}

if (do_su4)
{
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << " SU(4) benchmark " << std::endl;
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
Benchmark::SU4();
}

if (do_comms)
{
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << " Communications benchmark " << std::endl;
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
Benchmark::Comms();
}

if (do_flops)
{
std::cout
<< GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
std::cout << GridLogMessage << " Per Node Summary table Ls=" << Ls << std::endl;
std::cout
<< GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
std::cout << GridLogMessage << " L \t\t Wilson\t\t DWF4\t\t Staggered " << std::endl;
for (int l = 0; l < L_list.size(); l++)
{
std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] / NN << " \t "
<< dwf4[l] / NN << " \t " << staggered[l] / NN << std::endl;
}
std::cout
<< GridLogMessage
<< "========================================================================="
"========="
<< std::endl;

std::cout
<< GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
std::cout << GridLogMessage
<< " Comparison point result: " << 0.5 * (dwf4[sel] + dwf4[selm1]) / NN
<< " Gflop/s per node" << std::endl;
<< " Mflop/s per node" << std::endl;
std::cout << GridLogMessage << " Comparison point is 0.5*(" << dwf4[sel] / NN << "+"
<< dwf4[selm1] / NN << ") " << std::endl;
std::cout << std::setprecision(3);
grid_big_sep();
tmp_flops["comparison_point_Gflops"] = 0.5 * (dwf4[sel] + dwf4[selm1]) / NN;
json_results["flops"] = tmp_flops;
std::cout
<< GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
}

if (!json_filename.empty())
@@ -909,7 +1046,7 @@ int main(int argc, char **argv)
if (me == 0)
{
std::ofstream json_file(json_filename);
json_file << std::setw(2) << json_results;
json_file << std::setw(4) << json_results;
}
}

@@ -32,7 +32,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#ifdef HAVE_LIME
using namespace Grid;

std::string filestem(const int l) { return "io/iobench_l" + std::to_string(l); }
std::string filestem(const int l) { return "iobench_l" + std::to_string(l); }

int vol(const int i) { return BENCH_IO_LMIN + 2 * i; }

@@ -56,6 +56,13 @@ template <typename Mat> void stats(Mat &mean, Mat &stdDev, const std::vector<Mat
mean /= n;
}

#define grid_printf(...) \
{ \
char _buf[1024]; \
sprintf(_buf, __VA_ARGS__); \
MSG << _buf; \
}

enum
{
sRead = 0,
@@ -76,58 +83,58 @@ int main(int argc, char **argv)
std::vector<Eigen::VectorXd> avPerf(BENCH_IO_NPASS, Eigen::VectorXd::Zero(4));
std::vector<int> latt;

GRID_MSG << "Grid is setup to use " << threads << " threads" << std::endl;
GRID_MSG << "MPI partition " << mpi << std::endl;
MSG << "Grid is setup to use " << threads << " threads" << std::endl;
MSG << "MPI partition " << mpi << std::endl;
for (unsigned int i = 0; i < BENCH_IO_NPASS; ++i)
{
grid_big_sep();
GRID_MSG << "Pass " << i + 1 << "/" << BENCH_IO_NPASS << std::endl;
grid_big_sep();
grid_small_sep();
GRID_MSG << "Benchmark std write" << std::endl;
grid_small_sep();
MSG << BIGSEP << std::endl;
MSG << "Pass " << i + 1 << "/" << BENCH_IO_NPASS << std::endl;
MSG << BIGSEP << std::endl;
MSG << SEP << std::endl;
MSG << "Benchmark std write" << std::endl;
MSG << SEP << std::endl;
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
{
latt = {l * mpi[0], l * mpi[1], l * mpi[2], l * mpi[3]};

GRID_MSG << "-- Local volume " << l << "^4" << std::endl;
MSG << "-- Local volume " << l << "^4" << std::endl;
writeBenchmark<LatticeFermion>(latt, filestem(l), stdWrite<LatticeFermion>);
perf[i](volInd(l), sWrite) = BinaryIO::lastPerf.mbytesPerSecond;
}

grid_small_sep();
GRID_MSG << "Benchmark std read" << std::endl;
grid_small_sep();
MSG << SEP << std::endl;
MSG << "Benchmark std read" << std::endl;
MSG << SEP << std::endl;
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
{
latt = {l * mpi[0], l * mpi[1], l * mpi[2], l * mpi[3]};

GRID_MSG << "-- Local volume " << l << "^4" << std::endl;
MSG << "-- Local volume " << l << "^4" << std::endl;
readBenchmark<LatticeFermion>(latt, filestem(l), stdRead<LatticeFermion>);
perf[i](volInd(l), sRead) = BinaryIO::lastPerf.mbytesPerSecond;
}

#ifdef HAVE_LIME
grid_small_sep();
GRID_MSG << "Benchmark Grid C-Lime write" << std::endl;
grid_small_sep();
MSG << SEP << std::endl;
MSG << "Benchmark Grid C-Lime write" << std::endl;
MSG << SEP << std::endl;
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
{
latt = {l * mpi[0], l * mpi[1], l * mpi[2], l * mpi[3]};

GRID_MSG << "-- Local volume " << l << "^4" << std::endl;
MSG << "-- Local volume " << l << "^4" << std::endl;
writeBenchmark<LatticeFermion>(latt, filestem(l), limeWrite<LatticeFermion>);
perf[i](volInd(l), gWrite) = BinaryIO::lastPerf.mbytesPerSecond;
}

grid_small_sep();
GRID_MSG << "Benchmark Grid C-Lime read" << std::endl;
grid_small_sep();
MSG << SEP << std::endl;
MSG << "Benchmark Grid C-Lime read" << std::endl;
MSG << SEP << std::endl;
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
{
latt = {l * mpi[0], l * mpi[1], l * mpi[2], l * mpi[3]};

GRID_MSG << "-- Local volume " << l << "^4" << std::endl;
MSG << "-- Local volume " << l << "^4" << std::endl;
readBenchmark<LatticeFermion>(latt, filestem(l), limeRead<LatticeFermion>);
perf[i](volInd(l), gRead) = BinaryIO::lastPerf.mbytesPerSecond;
}
@@ -152,13 +159,13 @@ int main(int argc, char **argv)
avRob.fill(100.);
avRob -= 100. * avStdDev.cwiseQuotient(avMean.cwiseAbs());

grid_big_sep();
GRID_MSG << "SUMMARY" << std::endl;
grid_big_sep();
GRID_MSG << "Summary of individual results (all results in MB/s)." << std::endl;
GRID_MSG << "Every second colum gives the standard deviation of the previous column."
<< std::endl;
GRID_MSG << std::endl;
MSG << BIGSEP << std::endl;
MSG << "SUMMARY" << std::endl;
MSG << BIGSEP << std::endl;
MSG << "Summary of individual results (all results in MB/s)." << std::endl;
MSG << "Every second colum gives the standard deviation of the previous column."
<< std::endl;
MSG << std::endl;
grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n", "L", "std read", "std dev",
"std write", "std dev", "Grid read", "std dev", "Grid write", "std dev");
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
@@ -169,10 +176,10 @@ int main(int argc, char **argv)
stdDev(volInd(l), gRead), mean(volInd(l), gWrite),
stdDev(volInd(l), gWrite));
}
GRID_MSG << std::endl;
GRID_MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)"
<< std::endl;
GRID_MSG << std::endl;
MSG << std::endl;
MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)"
<< std::endl;
MSG << std::endl;
grid_printf("%4s %12s %12s %12s %12s\n", "L", "std read", "std write", "Grid read",
"Grid write");
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
@@ -180,21 +187,21 @@ int main(int argc, char **argv)
grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n", l, rob(volInd(l), sRead),
rob(volInd(l), sWrite), rob(volInd(l), gRead), rob(volInd(l), gWrite));
}
GRID_MSG << std::endl;
GRID_MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX
<< "^4 (all results in MB/s)." << std::endl;
GRID_MSG << "Every second colum gives the standard deviation of the previous column."
<< std::endl;
GRID_MSG << std::endl;
MSG << std::endl;
MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX
<< "^4 (all results in MB/s)." << std::endl;
MSG << "Every second colum gives the standard deviation of the previous column."
<< std::endl;
MSG << std::endl;
grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n", "std read", "std dev",
"std write", "std dev", "Grid read", "std dev", "Grid write", "std dev");
grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", avMean(sRead),
avStdDev(sRead), avMean(sWrite), avStdDev(sWrite), avMean(gRead),
avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
GRID_MSG << std::endl;
GRID_MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)"
<< std::endl;
GRID_MSG << std::endl;
MSG << std::endl;
MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)"
<< std::endl;
MSG << std::endl;
grid_printf("%12s %12s %12s %12s\n", "std read", "std write", "Grid read",
"Grid write");
grid_printf("%12.1f %12.1f %12.1f %12.1f\n", avRob(sRead), avRob(sWrite), avRob(gRead),

@@ -18,8 +18,12 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#ifndef Benchmark_IO_hpp_
#define Benchmark_IO_hpp_

#include "Common.hpp"
#include <Grid/Grid.h>
#define MSG std::cout << GridLogMessage
#define SEP \
"-----------------------------------------------------------------------------"
#define BIGSEP \
"============================================================================="
#ifdef HAVE_LIME

namespace Grid
@@ -46,9 +50,9 @@ namespace Grid
// crc = GridChecksum::crc32(vec_v.cpu_ptr, size);
// std::fwrite(&crc, sizeof(uint32_t), 1, file);
// crcWatch.Stop();
// GRID_MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec <<
// std::endl; ioWatch.Start(); std::fwrite(vec_v.cpu_ptr, sizeof(typename
// Field::scalar_object),
// MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
// ioWatch.Start();
// std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object),
// vec.Grid()->lSites(), file);
// ioWatch.Stop();
// std::fclose(file);
@@ -57,11 +61,11 @@ namespace Grid
// p.size = size;
// p.time = ioWatch.useconds();
// p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
// GRID_MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
// MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
// << ",
// "
// << p.mbytesPerSecond << " MB/s" << std::endl;
// GRID_MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
// MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
// }

// template <typename Field> void stdRead(Field &vec, const std::string filestem)
@@ -90,14 +94,16 @@ namespace Grid
// crcData = GridChecksum::crc32(vec_v.cpu_ptr, size);
// crcWatch.Stop();
// }
// GRID_MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec <<
// std::endl; assert(crcData == crcRead); size *= vec.Grid()->ProcessorCount(); auto
// &p = BinaryIO::lastPerf; p.size = size; p.time = ioWatch.useconds();
// MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
// assert(crcData == crcRead);
// size *= vec.Grid()->ProcessorCount();
// auto &p = BinaryIO::lastPerf;
// p.size = size;
// p.time = ioWatch.useconds();
// p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
// GRID_MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() <<
// ", "
// MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() << ", "
// << p.mbytesPerSecond << " MB/s" << std::endl;
// GRID_MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
// MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
// }

template <typename Field> void stdWrite(const std::string filestem, Field &vec)
@@ -116,7 +122,7 @@ namespace Grid
crc = GridChecksum::crc32(vec_v.cpu_ptr, size);
file.write(reinterpret_cast<char *>(&crc), sizeof(uint32_t) / sizeof(char));
crcWatch.Stop();
GRID_MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
ioWatch.Start();
file.write(reinterpret_cast<char *>(vec_v.cpu_ptr), sizec);
file.flush();
@@ -126,9 +132,9 @@ namespace Grid
p.size = size;
p.time = ioWatch.useconds();
p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
GRID_MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
<< ", " << p.mbytesPerSecond << " MB/s" << std::endl;
GRID_MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() << ", "
<< p.mbytesPerSecond << " MB/s" << std::endl;
MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
}

template <typename Field> void stdRead(Field &vec, const std::string filestem)
@@ -157,17 +163,16 @@ namespace Grid
crcData = GridChecksum::crc32(vec_v.cpu_ptr, size);
crcWatch.Stop();
}
GRID_MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec
<< std::endl;
MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
assert(crcData == crcRead);
size *= vec.Grid()->ProcessorCount();
auto &p = BinaryIO::lastPerf;
p.size = size;
p.time = ioWatch.useconds();
p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
GRID_MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed()
<< ", " << p.mbytesPerSecond << " MB/s" << std::endl;
GRID_MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() << ", "
<< p.mbytesPerSecond << " MB/s" << std::endl;
MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
}

template <typename Field> void limeWrite(const std::string filestem, Field &vec)

Grid/Benchmark_comms_host_device.cpp (new file, 265 lines)
@@ -0,0 +1,265 @@
/*
Copyright © 2015 Peter Boyle <paboyle@ph.ed.ac.uk>
Copyright © 2022 Antonin Portelli <antonin.portelli@me.com>

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

#include <Grid/Grid.h>

using namespace std;
using namespace Grid;

struct time_statistics
{
double mean;
double err;
double min;
double max;

void statistics(std::vector<double> v)
{
double sum = std::accumulate(v.begin(), v.end(), 0.0);
mean = sum / v.size();

std::vector<double> diff(v.size());
std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
err = std::sqrt(sq_sum / (v.size() * (v.size() - 1)));

auto result = std::minmax_element(v.begin(), v.end());
min = *result.first;
max = *result.second;
}
};

void header()
{
std::cout << GridLogMessage << " L "
<< "\t"
<< " Ls "
<< "\t" << std::setw(11) << "bytes\t\t"
<< "MB/s uni"
<< "\t"
<< "MB/s bidi" << std::endl;
};

int main(int argc, char **argv)
{
Grid_init(&argc, &argv);

Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
int threads = GridThread::GetThreads();
std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads"
<< std::endl;

int Nloop = 250;
int nmu = 0;
int maxlat = 32;
for (int mu = 0; mu < Nd; mu++)
if (mpi_layout[mu] > 1)
nmu++;

std::cout << GridLogMessage << "Number of iterations to average: " << Nloop
<< std::endl;
std::vector<double> t_time(Nloop);
// time_statistics timestat;

std::cout << GridLogMessage
<< "========================================================================="
"==========================="
<< std::endl;
std::cout << GridLogMessage
<< "= Benchmarking sequential halo exchange from host memory " << std::endl;
std::cout << GridLogMessage
<< "========================================================================="
"==========================="
<< std::endl;
header();

for (int lat = 8; lat <= maxlat; lat += 4)
{
for (int Ls = 8; Ls <= 8; Ls *= 2)
{

Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
lat * mpi_layout[3]});

GridCartesian Grid(latt_size, simd_layout, mpi_layout);
RealD Nrank = Grid._Nprocessors;
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank / Nnode;

std::vector<std::vector<HalfSpinColourVectorD>> xbuf(8);
std::vector<std::vector<HalfSpinColourVectorD>> rbuf(8);

for (int mu = 0; mu < 8; mu++)
{
xbuf[mu].resize(lat * lat * lat * Ls);
rbuf[mu].resize(lat * lat * lat * Ls);
}
uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD);

int ncomm;

for (int mu = 0; mu < 4; mu++)
{
if (mpi_layout[mu] > 1)
{
double start = usecond();
for (int i = 0; i < Nloop; i++)
{

ncomm = 0;

ncomm++;
int comm_proc = 1;
int xmit_to_rank;
int recv_from_rank;

{
std::vector<CommsRequest_t> requests;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
Grid.SendToRecvFrom((void *)&xbuf[mu][0], xmit_to_rank,
(void *)&rbuf[mu][0], recv_from_rank, bytes);
}

comm_proc = mpi_layout[mu] - 1;
{
std::vector<CommsRequest_t> requests;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0], xmit_to_rank,
(void *)&rbuf[mu + 4][0], recv_from_rank, bytes);
}
}
Grid.Barrier();
double stop = usecond();
double mean = (stop - start) / Nloop;
double dbytes = bytes * ppn;
double xbytes = dbytes * 2.0 * ncomm;
double rbytes = xbytes;
double bidibytes = xbytes + rbytes;

std::cout << GridLogMessage << std::setw(4) << lat << "\t" << Ls << "\t"
<< std::setw(11) << bytes << std::fixed << std::setprecision(1)
<< std::setw(7) << " " << std::right << xbytes / mean << " "
<< "\t\t" << std::setw(7) << bidibytes / mean << std::endl;
}
}
}
}

std::cout << GridLogMessage
<< "========================================================================="
"==========================="
<< std::endl;
std::cout << GridLogMessage
<< "= Benchmarking sequential halo exchange from GPU memory " << std::endl;
std::cout << GridLogMessage
<< "========================================================================="
"==========================="
<< std::endl;
header();

for (int lat = 8; lat <= maxlat; lat += 4)
{
for (int Ls = 8; Ls <= 8; Ls *= 2)
{

Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
lat * mpi_layout[3]});

GridCartesian Grid(latt_size, simd_layout, mpi_layout);
RealD Nrank = Grid._Nprocessors;
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank / Nnode;

std::vector<HalfSpinColourVectorD *> xbuf(8);
std::vector<HalfSpinColourVectorD *> rbuf(8);

uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD);
for (int d = 0; d < 8; d++)
{
xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
}

int ncomm;

for (int mu = 0; mu < 4; mu++)
{
if (mpi_layout[mu] > 1)
{
double start = usecond();
for (int i = 0; i < Nloop; i++)
{

ncomm = 0;

ncomm++;
int comm_proc = 1;
int xmit_to_rank;
int recv_from_rank;

{
std::vector<CommsRequest_t> requests;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
Grid.SendToRecvFrom((void *)&xbuf[mu][0], xmit_to_rank,
(void *)&rbuf[mu][0], recv_from_rank, bytes);
}

comm_proc = mpi_layout[mu] - 1;
{
std::vector<CommsRequest_t> requests;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0], xmit_to_rank,
(void *)&rbuf[mu + 4][0], recv_from_rank, bytes);
}
}
Grid.Barrier();
double stop = usecond();
double mean = (stop - start) / Nloop;
double dbytes = bytes * ppn;
double xbytes = dbytes * 2.0 * ncomm;
double rbytes = xbytes;
double bidibytes = xbytes + rbytes;

std::cout << GridLogMessage << std::setw(4) << lat << "\t" << Ls << "\t"
<< std::setw(11) << bytes << std::fixed << std::setprecision(1)
<< std::setw(7) << " " << std::right << xbytes / mean << " "
<< "\t\t" << std::setw(7) << bidibytes / mean << std::endl;
}
}

for (int d = 0; d < 8; d++)
{
acceleratorFreeDevice(xbuf[d]);
acceleratorFreeDevice(rbuf[d]);
}
}
}

std::cout << GridLogMessage
<< "========================================================================="
"==========================="
<< std::endl;
std::cout << GridLogMessage << "= All done; Bye Bye" << std::endl;
std::cout << GridLogMessage
<< "========================================================================="
|
||||
"==========================="
|
||||
<< std::endl;
|
||||
|
||||
Grid_finalize();
|
||||
}
|
512
Grid/Benchmark_dwf_fp32.cpp
Normal file
@ -0,0 +1,512 @@
|
||||
/*
|
||||
Copyright © 2015 Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Copyright © 2022 Antonin Portelli <antonin.portelli@me.com>
|
||||
Copyright © 2023 Simon Bürger <simon.buerger@rwth-aachen.de>
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU General Public License
|
||||
as published by the Free Software Foundation; either version 2
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "json.hpp"
|
||||
#include <Grid/Grid.h>
|
||||
#ifdef GRID_CUDA
|
||||
#define CUDA_PROFILE
|
||||
#endif
|
||||
|
||||
#ifdef CUDA_PROFILE
|
||||
#include <cuda_profiler_api.h>
|
||||
#endif
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
|
||||
template <class d> struct scal
|
||||
{
|
||||
d internal;
|
||||
};
|
||||
|
||||
Gamma::Algebra Gmu[] = {Gamma::Algebra::GammaX, Gamma::Algebra::GammaY,
|
||||
Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT};
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
Grid_init(&argc, &argv);
|
||||
|
||||
int threads = GridThread::GetThreads();
|
||||
|
||||
Coordinate latt4 = GridDefaultLatt();
|
||||
int Ls = 16;
|
||||
std::string json_filename = ""; // empty indicates no json output
|
||||
nlohmann::json json;
|
||||
|
||||
// benchmark specific command line arguments
|
||||
for (int i = 0; i < argc; i++)
|
||||
{
|
||||
if (std::string(argv[i]) == "-Ls")
|
||||
{
|
||||
std::stringstream ss(argv[i + 1]);
|
||||
ss >> Ls;
|
||||
}
|
||||
if (std::string(argv[i]) == "--json-out")
|
||||
json_filename = argv[i + 1];
|
||||
}
|
||||
|
||||
GridLogLayout();
|
||||
|
||||
long unsigned int single_site_flops = 8 * Nc * (7 + 16 * Nc);
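// for Nc = 3 this evaluates to 1320, the usual flop count per site for the Wilson Dslash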
|
||||
|
||||
json["single_site_flops"] = single_site_flops;
|
||||
|
||||
GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
|
||||
GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
|
||||
GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||
|
||||
GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
|
||||
GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
|
||||
|
||||
json["grid"] = FGrid->FullDimensions().toVector();
|
||||
json["local_grid"] = FGrid->LocalDimensions().toVector();
|
||||
|
||||
std::cout << GridLogMessage << "Making s innermost grids" << std::endl;
|
||||
GridCartesian *sUGrid =
|
||||
SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(), GridDefaultMpi());
|
||||
|
||||
GridRedBlackCartesian *sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
|
||||
GridCartesian *sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls, UGrid);
|
||||
GridRedBlackCartesian *sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls, UGrid);
|
||||
|
||||
std::vector<int> seeds4({1, 2, 3, 4});
|
||||
std::vector<int> seeds5({5, 6, 7, 8});
|
||||
|
||||
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
|
||||
GridParallelRNG RNG4(UGrid);
|
||||
RNG4.SeedUniqueString(std::string("The 4D RNG"));
|
||||
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
|
||||
GridParallelRNG RNG5(FGrid);
|
||||
RNG5.SeedUniqueString(std::string("The 5D RNG"));
|
||||
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
|
||||
|
||||
LatticeFermionF src(FGrid);
|
||||
random(RNG5, src);
|
||||
#if 0
|
||||
src = Zero();
|
||||
{
|
||||
Coordinate origin({0,0,0,latt4[2]-1,0});
|
||||
SpinColourVectorF tmp;
|
||||
tmp=Zero();
|
||||
tmp()(0)(0)=Complex(-2.0,0.0);
|
||||
std::cout << " source site 0 " << tmp<<std::endl;
|
||||
pokeSite(tmp,src,origin);
|
||||
}
|
||||
#else
|
||||
RealD N2 = 1.0 / ::sqrt(norm2(src));
|
||||
src = src * N2;
|
||||
#endif
|
||||
|
||||
LatticeFermionF result(FGrid);
|
||||
result = Zero();
|
||||
LatticeFermionF ref(FGrid);
|
||||
ref = Zero();
|
||||
LatticeFermionF tmp(FGrid);
|
||||
LatticeFermionF err(FGrid);
|
||||
|
||||
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
|
||||
LatticeGaugeFieldF Umu(UGrid);
|
||||
SU<Nc>::HotConfiguration(RNG4, Umu);
|
||||
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
|
||||
#if 0
|
||||
Umu=1.0;
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
LatticeColourMatrixF ttmp(UGrid);
|
||||
ttmp = PeekIndex<LorentzIndex>(Umu,mu);
|
||||
// if (mu !=2 ) ttmp = 0;
|
||||
// ttmp = ttmp* pow(10.0,mu);
|
||||
PokeIndex<LorentzIndex>(Umu,ttmp,mu);
|
||||
}
|
||||
std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
|
||||
#endif
|
||||
|
||||
////////////////////////////////////
|
||||
// Naive wilson implementation
|
||||
////////////////////////////////////
|
||||
// replicate across fifth dimension
|
||||
// LatticeGaugeFieldF Umu5d(FGrid);
|
||||
std::vector<LatticeColourMatrixF> U(4, UGrid);
|
||||
for (int mu = 0; mu < Nd; mu++)
|
||||
{
|
||||
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
|
||||
}
|
||||
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
|
||||
|
||||
if (1)
|
||||
{
|
||||
ref = Zero();
|
||||
for (int mu = 0; mu < Nd; mu++)
|
||||
{
|
||||
|
||||
tmp = Cshift(src, mu + 1, 1);
|
||||
{
|
||||
autoView(tmp_v, tmp, CpuWrite);
|
||||
autoView(U_v, U[mu], CpuRead);
|
||||
for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
|
||||
{
|
||||
for (int s = 0; s < Ls; s++)
|
||||
{
|
||||
tmp_v[Ls * ss + s] = U_v[ss] * tmp_v[Ls * ss + s];
|
||||
}
|
||||
}
|
||||
}
|
||||
ref = ref + tmp - Gamma(Gmu[mu]) * tmp;
|
||||
|
||||
{
|
||||
autoView(tmp_v, tmp, CpuWrite);
|
||||
autoView(U_v, U[mu], CpuRead);
|
||||
autoView(src_v, src, CpuRead);
|
||||
for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
|
||||
{
|
||||
for (int s = 0; s < Ls; s++)
|
||||
{
|
||||
tmp_v[Ls * ss + s] = adj(U_v[ss]) * src_v[Ls * ss + s];
|
||||
}
|
||||
}
|
||||
}
|
||||
tmp = Cshift(tmp, mu + 1, -1);
|
||||
ref = ref + tmp + Gamma(Gmu[mu]) * tmp;
|
||||
}
|
||||
ref = -0.5 * ref;
|
||||
}
|
||||
|
||||
RealD mass = 0.1;
|
||||
RealD M5 = 1.8;
|
||||
|
||||
RealD NP = UGrid->_Nprocessors;
|
||||
RealD NN = UGrid->NodeCount();
|
||||
|
||||
json["ranks"] = NP;
|
||||
json["nodes"] = NN;
|
||||
|
||||
std::cout << GridLogMessage
|
||||
<< "*****************************************************************"
|
||||
<< std::endl;
|
||||
std::cout << GridLogMessage
|
||||
<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm"
|
||||
<< std::endl;
|
||||
std::cout << GridLogMessage
|
||||
<< "*****************************************************************"
|
||||
<< std::endl;
|
||||
std::cout << GridLogMessage
|
||||
<< "*****************************************************************"
|
||||
<< std::endl;
|
||||
std::cout << GridLogMessage
|
||||
<< "* Benchmarking DomainWallFermionR::Dhop " << std::endl;
|
||||
std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd()
|
||||
<< std::endl;
|
||||
std::cout << GridLogMessage << "* VComplexF size is " << sizeof(vComplexF) << " B"
|
||||
<< std::endl;
|
||||
|
||||
if (sizeof(RealF) == 4)
|
||||
std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
|
||||
if (sizeof(RealF) == 8)
|
||||
std::cout << GridLogMessage << "* DOUBLE precision " << std::endl;
|
||||
#ifdef GRID_OMP
|
||||
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
|
||||
std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl;
|
||||
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
|
||||
std::cout << GridLogMessage << "* Using sequential comms compute" << std::endl;
|
||||
#endif
|
||||
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric)
|
||||
std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl;
|
||||
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll)
|
||||
std::cout << GridLogMessage << "* Using Nc=3 WilsonKernels" << std::endl;
|
||||
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm)
|
||||
|
||||
std::cout << GridLogMessage << "* Using Asm Nc=3 WilsonKernels" << std::endl;
|
||||
std::cout << GridLogMessage
|
||||
<< "*****************************************************************"
|
||||
<< std::endl;
|
||||
|
||||
DomainWallFermionF Dw(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5);
|
||||
int ncall = 300;
|
||||
|
||||
if (1)
|
||||
{
|
||||
FGrid->Barrier();
|
||||
Dw.ZeroCounters();
|
||||
Dw.Dhop(src, result, 0);
|
||||
std::cout << GridLogMessage << "Called warmup" << std::endl;
|
||||
double t0 = usecond();
|
||||
for (int i = 0; i < ncall; i++)
|
||||
{
|
||||
__SSC_START;
|
||||
Dw.Dhop(src, result, 0);
|
||||
__SSC_STOP;
|
||||
}
|
||||
double t1 = usecond();
|
||||
FGrid->Barrier();
|
||||
|
||||
double volume = Ls;
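// 5d volume: Ls times the global 4d lattice volume (latt4 holds the full lattice extents)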
|
||||
for (int mu = 0; mu < Nd; mu++)
|
||||
volume = volume * latt4[mu];
|
||||
double flops = single_site_flops * volume * ncall;
|
||||
|
||||
auto nsimd = vComplex::Nsimd();
|
||||
auto simdwidth = sizeof(vComplex);
|
||||
|
||||
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
|
||||
double data_rf = volume * ((2 * Nd + 1) * Nd * Nc + 2 * Nd * Nc * Nc) * simdwidth /
|
||||
nsimd * ncall / (1024. * 1024. * 1024.);
|
||||
|
||||
// mem: Nd Wilson * Ls, Nd gauge, Nc colors
|
||||
double data_mem =
|
||||
(volume * (2 * Nd + 1) * Nd * Nc + (volume / Ls) * 2 * Nd * Nc * Nc) * simdwidth /
|
||||
nsimd * ncall / (1024. * 1024. * 1024.);
|
||||
|
||||
json["Dw"]["calls"] = ncall;
|
||||
json["Dw"]["time"] = t1 - t0;
|
||||
json["Dw"]["mflops"] = flops / (t1 - t0);
|
||||
json["Dw"]["mflops_per_rank"] = flops / (t1 - t0) / NP;
|
||||
json["Dw"]["mflops_per_node"] = flops / (t1 - t0) / NN;
|
||||
json["Dw"]["RF"] = 1000000. * data_rf / ((t1 - t0));
|
||||
json["Dw"]["mem"] = 1000000. * data_mem / ((t1 - t0));
|
||||
|
||||
std::cout << GridLogMessage << "Called Dw " << ncall << " times in " << t1 - t0
|
||||
<< " us" << std::endl;
|
||||
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
||||
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
|
||||
std::cout << GridLogMessage << "mflop/s = " << flops / (t1 - t0) << std::endl;
|
||||
std::cout << GridLogMessage << "mflop/s per rank = " << flops / (t1 - t0) / NP
|
||||
<< std::endl;
|
||||
std::cout << GridLogMessage << "mflop/s per node = " << flops / (t1 - t0) / NN
|
||||
<< std::endl;
|
||||
|
||||
std::cout << GridLogMessage
|
||||
<< "RF GiB/s (base 2) = " << 1000000. * data_rf / ((t1 - t0))
|
||||
<< std::endl;
|
||||
std::cout << GridLogMessage
|
||||
<< "mem GiB/s (base 2) = " << 1000000. * data_mem / ((t1 - t0))
|
||||
<< std::endl;
|
||||
err = ref - result;
|
||||
std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
|
||||
// exit(0);
|
||||
|
||||
if ((norm2(err) > 1.0e-4))
|
||||
{
|
||||
/*
|
||||
std::cout << "RESULT\n " << result<<std::endl;
|
||||
std::cout << "REF \n " << ref <<std::endl;
|
||||
std::cout << "ERR \n " << err <<std::endl;
|
||||
*/
|
||||
std::cout << GridLogMessage << "WRONG RESULT" << std::endl;
|
||||
FGrid->Barrier();
|
||||
exit(-1);
|
||||
}
|
||||
assert(norm2(err) < 1.0e-4);
|
||||
Dw.Report();
|
||||
}
|
||||
|
||||
if (1)
|
||||
{ // Naive wilson dag implementation
|
||||
ref = Zero();
|
||||
for (int mu = 0; mu < Nd; mu++)
|
||||
{
|
||||
|
||||
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
|
||||
tmp = Cshift(src, mu + 1, 1);
|
||||
{
|
||||
autoView(ref_v, ref, CpuWrite);
|
||||
autoView(tmp_v, tmp, CpuRead);
|
||||
autoView(U_v, U[mu], CpuRead);
|
||||
for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
|
||||
{
|
||||
for (int s = 0; s < Ls; s++)
|
||||
{
|
||||
int i = s + Ls * ss;
|
||||
ref_v[i] += U_v[ss] * (tmp_v[i] + Gamma(Gmu[mu]) * tmp_v[i]);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
autoView(tmp_v, tmp, CpuWrite);
|
||||
autoView(U_v, U[mu], CpuRead);
|
||||
autoView(src_v, src, CpuRead);
|
||||
for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
|
||||
{
|
||||
for (int s = 0; s < Ls; s++)
|
||||
{
|
||||
tmp_v[Ls * ss + s] = adj(U_v[ss]) * src_v[Ls * ss + s];
|
||||
}
|
||||
}
|
||||
}
|
||||
// tmp =adj(U[mu])*src;
|
||||
tmp = Cshift(tmp, mu + 1, -1);
|
||||
{
|
||||
autoView(ref_v, ref, CpuWrite);
|
||||
autoView(tmp_v, tmp, CpuRead);
|
||||
for (int i = 0; i < ref_v.size(); i++)
|
||||
{
|
||||
ref_v[i] += tmp_v[i] - Gamma(Gmu[mu]) * tmp_v[i];
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
ref = -0.5 * ref;
|
||||
}
|
||||
// dump=1;
|
||||
Dw.Dhop(src, result, 1);
|
||||
|
||||
std::cout << GridLogMessage
|
||||
<< "Compare to naive wilson implementation Dag to verify correctness"
|
||||
<< std::endl;
|
||||
std::cout << GridLogMessage << "Called DwDag" << std::endl;
|
||||
std::cout << GridLogMessage << "norm dag result " << norm2(result) << std::endl;
|
||||
std::cout << GridLogMessage << "norm dag ref " << norm2(ref) << std::endl;
|
||||
err = ref - result;
|
||||
std::cout << GridLogMessage << "norm dag diff " << norm2(err) << std::endl;
|
||||
if ((norm2(err) > 1.0e-4))
|
||||
{
|
||||
/*
|
||||
std::cout<< "DAG RESULT\n " <<ref << std::endl;
|
||||
std::cout<< "DAG sRESULT\n " <<result << std::endl;
|
||||
std::cout<< "DAG ERR \n " << err <<std::endl;
|
||||
*/
|
||||
}
|
||||
LatticeFermionF src_e(FrbGrid);
|
||||
LatticeFermionF src_o(FrbGrid);
|
||||
LatticeFermionF r_e(FrbGrid);
|
||||
LatticeFermionF r_o(FrbGrid);
|
||||
LatticeFermionF r_eo(FGrid);
|
||||
|
||||
std::cout << GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"
|
||||
<< std::endl;
|
||||
pickCheckerboard(Even, src_e, src);
|
||||
pickCheckerboard(Odd, src_o, src);
|
||||
|
||||
std::cout << GridLogMessage << "src_e" << norm2(src_e) << std::endl;
|
||||
std::cout << GridLogMessage << "src_o" << norm2(src_o) << std::endl;
|
||||
|
||||
// S-direction is INNERMOST and takes no part in the parity.
|
||||
std::cout << GridLogMessage
|
||||
|
||||
<< "*********************************************************" << std::endl;
|
||||
std::cout << GridLogMessage
|
||||
<< "* Benchmarking DomainWallFermionF::DhopEO " << std::endl;
|
||||
std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd()
|
||||
<< std::endl;
|
||||
|
||||
if (sizeof(RealF) == 4)
|
||||
std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
|
||||
if (sizeof(RealF) == 8)
|
||||
std::cout << GridLogMessage << "* DOUBLE precision " << std::endl;
|
||||
#ifdef GRID_OMP
|
||||
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
|
||||
std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl;
|
||||
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
|
||||
std::cout << GridLogMessage << "* Using sequential comms compute" << std::endl;
|
||||
#endif
|
||||
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric)
|
||||
std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl;
|
||||
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll)
|
||||
std::cout << GridLogMessage << "* Using Nc=3 WilsonKernels" << std::endl;
|
||||
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm)
|
||||
std::cout << GridLogMessage << "* Using Asm Nc=3 WilsonKernels" << std::endl;
|
||||
std::cout << GridLogMessage
|
||||
<< "*********************************************************" << std::endl;
|
||||
|
||||
{
|
||||
Dw.ZeroCounters();
|
||||
FGrid->Barrier();
|
||||
Dw.DhopEO(src_o, r_e, DaggerNo);
|
||||
double t0 = usecond();
|
||||
for (int i = 0; i < ncall; i++)
|
||||
{
|
||||
#ifdef CUDA_PROFILE
|
||||
if (i == 10)
|
||||
cudaProfilerStart();
|
||||
#endif
|
||||
Dw.DhopEO(src_o, r_e, DaggerNo);
|
||||
#ifdef CUDA_PROFILE
|
||||
if (i == 20)
|
||||
cudaProfilerStop();
|
||||
#endif
|
||||
}
|
||||
double t1 = usecond();
|
||||
FGrid->Barrier();
|
||||
|
||||
double volume = Ls;
|
||||
for (int mu = 0; mu < Nd; mu++)
|
||||
volume = volume * latt4[mu];
|
||||
double flops = (single_site_flops * volume * ncall) / 2.0;
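// the checkerboarded operator acts on half the sites, hence the factor 1/2 relative to the full Dhop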
|
||||
|
||||
json["Deo"]["calls"] = ncall;
|
||||
json["Deo"]["time"] = t1 - t0;
|
||||
json["Deo"]["mflops"] = flops / (t1 - t0);
|
||||
json["Deo"]["mflops_per_rank"] = flops / (t1 - t0) / NP;
|
||||
json["Deo"]["mflops_per_node"] = flops / (t1 - t0) / NN;
|
||||
|
||||
std::cout << GridLogMessage << "Deo mflop/s = " << flops / (t1 - t0) << std::endl;
|
||||
std::cout << GridLogMessage << "Deo mflop/s per rank " << flops / (t1 - t0) / NP
|
||||
<< std::endl;
|
||||
std::cout << GridLogMessage << "Deo mflop/s per node " << flops / (t1 - t0) / NN
|
||||
<< std::endl;
|
||||
|
||||
Dw.Report();
|
||||
}
|
||||
Dw.DhopEO(src_o, r_e, DaggerNo);
|
||||
Dw.DhopOE(src_e, r_o, DaggerNo);
|
||||
Dw.Dhop(src, result, DaggerNo);
|
||||
|
||||
std::cout << GridLogMessage << "r_e" << norm2(r_e) << std::endl;
|
||||
std::cout << GridLogMessage << "r_o" << norm2(r_o) << std::endl;
|
||||
std::cout << GridLogMessage << "res" << norm2(result) << std::endl;
|
||||
|
||||
setCheckerboard(r_eo, r_o);
|
||||
setCheckerboard(r_eo, r_e);
|
||||
|
||||
err = r_eo - result;
|
||||
std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
|
||||
if ((norm2(err) > 1.0e-4))
|
||||
{
|
||||
/*
|
||||
std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
|
||||
std::cout<< "Deo REF\n " <<result << std::endl;
|
||||
std::cout<< "Deo ERR \n " << err <<std::endl;
|
||||
*/
|
||||
}
|
||||
|
||||
pickCheckerboard(Even, src_e, err);
|
||||
pickCheckerboard(Odd, src_o, err);
|
||||
std::cout << GridLogMessage << "norm diff even " << norm2(src_e) << std::endl;
|
||||
std::cout << GridLogMessage << "norm diff odd " << norm2(src_o) << std::endl;
|
||||
|
||||
assert(norm2(src_e) < 1.0e-4);
|
||||
assert(norm2(src_o) < 1.0e-4);
|
||||
|
||||
if (!json_filename.empty())
|
||||
{
|
||||
std::cout << GridLogMessage << "writing benchmark results to " << json_filename
|
||||
<< std::endl;
|
||||
|
||||
int me = 0;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
||||
if (me == 0)
|
||||
{
|
||||
std::ofstream json_file(json_filename);
|
||||
json_file << std::setw(4) << json;
|
||||
}
|
||||
}
|
||||
|
||||
Grid_finalize();
|
||||
exit(0);
|
||||
}
|
@ -26,20 +26,6 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
#define GRID_MSG_MAXSIZE 1024
|
||||
#endif
|
||||
|
||||
#define GRID_BIG_SEP \
|
||||
"==============================================================================="
|
||||
#define GRID_SMALL_SEP "------------------------------------------"
|
||||
|
||||
#define grid_big_sep() \
|
||||
{ \
|
||||
GRID_MSG << GRID_BIG_SEP << std::endl; \
|
||||
}
|
||||
|
||||
#define grid_small_sep() \
|
||||
{ \
|
||||
GRID_MSG << GRID_SMALL_SEP << std::endl; \
|
||||
}
|
||||
|
||||
#define grid_printf(...) \
|
||||
{ \
|
||||
char _buf[GRID_MSG_MAXSIZE]; \
|
||||
|
@ -1,8 +1,12 @@
|
||||
ACLOCAL_AMFLAGS = -I .buildutils/m4
|
||||
|
||||
bin_PROGRAMS = \
|
||||
Benchmark_comms_host_device \
|
||||
Benchmark_dwf_fp32 \
|
||||
Benchmark_Grid \
|
||||
Benchmark_IO
|
||||
|
||||
|
||||
Benchmark_comms_host_device_SOURCES = Benchmark_comms_host_device.cpp
|
||||
Benchmark_dwf_fp32_SOURCES = Benchmark_dwf_fp32.cpp
|
||||
Benchmark_Grid_SOURCES = Benchmark_Grid.cpp
|
||||
Benchmark_IO_SOURCES = Benchmark_IO.cpp
|
||||
|
@ -6,7 +6,6 @@ The benchmarks can be summarised as follows
|
||||
- `Benchmark_Grid`: This benchmark measures floating-point performance for various fermion
|
||||
matrices, as well as bandwidth measurement for different operations. Measurements are
|
||||
performed for a fixed range of problem sizes.
|
||||
- `Benchmark_IO`: Parallel I/O benchmark.
|
||||
|
||||
## TL;DR
|
||||
Build and install Grid, all dependencies, and the benchmark with
|
||||
@ -29,7 +28,7 @@ You should first deploy the environment for the specific system you are using, f
|
||||
systems/tursa/bootstrap-env.sh ./env
|
||||
```
|
||||
will deploy the relevant environment for the [Tursa](https://www.epcc.ed.ac.uk/hpc-services/dirac-tursa-gpu) supercomputer in `./env`. This step might compile from source a large set
|
||||
of packages, and take some time to complete.
|
||||
of packages, and might take some time to complete.
|
||||
|
||||
After that, the environment directory (`./env` in the example above) will contain an `env.sh` file that needs to be sourced to activate the environment
|
||||
```bash
|
||||
@ -67,84 +66,4 @@ where `<env_dir>` is the environment directory and `<config>` is the build confi
|
||||
|
||||
## Running the benchmarks
|
||||
After building the benchmarks as above you can find the binaries in
|
||||
`<env_dir>/prefix/gridbench_<config>`. Depending on the system selected, the environment
|
||||
directory might also contain batch script examples. More information about the benchmarks
|
||||
is provided below.
|
||||
|
||||
### `Benchmark_Grid`
|
||||
This benchmark performs flop/s measurement for typical lattice QCD sparse matrices, as
|
||||
well as memory and inter-process bandwidth measurement using Grid routines. The benchmark
|
||||
command accepts any Grid flag (see the complete list with `--help`), as well as a
|
||||
`--json-out <file>` flag to save the measurement results in JSON to `<file>`. The
|
||||
benchmarks are performed on a fixed set of problem sizes, and the Grid flag `--grid` will
|
||||
be ignored.
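
For illustration, an invocation on four MPI ranks could look like the following (the launcher, binary path, and `--mpi` decomposition are placeholders to adapt to your system):
```bash
mpirun -np 4 <env_dir>/prefix/gridbench_<config>/bin/Benchmark_Grid \
  --mpi 1.1.1.4 --threads 8 --json-out result.json
```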
|
||||
|
||||
The resulting metrics are as follows; all data size units are in base 2
|
||||
(i.e. 1 kB = 1024 B).
|
||||
|
||||
*Memory bandwidth*
|
||||
|
||||
One sub-benchmark measures the memory bandwidth using a lattice version of the `axpy` BLAS
|
||||
routine, in a similar fashion to the STREAM benchmark. The JSON entries under `"axpy"`
|
||||
have the form
|
||||
```json
|
||||
{
|
||||
"GBps": 215.80653375861607, // bandwidth in GB/s/node
|
||||
"GFlops": 19.310041765757834, // FP performance (double precision)
|
||||
"L": 8, // local lattice volume
|
||||
"size_MB": 3.0 // memory size in MB/node
|
||||
}
|
||||
```
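
When the results are saved with `--json-out`, these entries can be inspected with `jq` (already part of the build environment); for instance, assuming the `"axpy"` entries form an array as above:
```bash
# print the local lattice size and bandwidth (GB/s/node) of each axpy measurement
# result.json is the file written via --json-out (placeholder name)
jq -r '.axpy[] | "\(.L) \(.GBps)"' result.json
```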
|
||||
|
||||
A second benchmark performs site-wise SU(4) matrix multiplication, and has a higher
|
||||
arithmetic intensity than the `axpy` one (although it is still memory-bound).
|
||||
The JSON entries under `"SU4"` have the form
|
||||
```json
|
||||
{
|
||||
"GBps": 394.76639187026865, // bandwidth in GB/s/node
|
||||
"GFlops": 529.8464820758512, // FP performance (single precision)
|
||||
"L": 8, // local lattice size
|
||||
"size_MB": 6.0 // memory size in MB/node
|
||||
}
|
||||
```
|
||||
|
||||
*Inter-process bandwidth*
|
||||
|
||||
This sub-benchmark measures the achieved bidirectional bandwidth in threaded halo exchange
|
||||
using routines in Grid. The exchange is performed in each direction on the MPI Cartesian
|
||||
grid which is parallelised across at least 2 processes. The resulting bandwidth is related
|
||||
to node-local transfers (inter-CPU, NVLink, ...) or network transfers depending on the MPI
|
||||
decomposition. The JSON entries under `"comms"` have the form
|
||||
```json
|
||||
{
|
||||
"L": 40, // local lattice size
|
||||
"bytes": 73728000, // payload size in B/rank
|
||||
"dir": 2, // direction of the exchange, 8 possible directions
|
||||
// (0: +x, 1: +y, ..., 5: -x, 6: -y, ...)
|
||||
"rate_GBps": {
|
||||
"error": 6.474271894240327, // standard deviation across measurements (GB/s/node)
|
||||
"max": 183.10546875, // maximum measured bandwidth (GB/s/node)
|
||||
"mean": 175.21747026766676 // average measured bandwidth (GB/s/node)
|
||||
},
|
||||
"time_usec": 3135.055 // average transfer time (microseconds)
|
||||
}
|
||||
```
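
As an example, and again assuming the `"comms"` entries form an array, the direction with the lowest mean bandwidth can be located with:
```bash
# result.json is the file written via --json-out (placeholder name)
jq '[.comms[] | {dir, L, mean_GBps: .rate_GBps.mean}] | min_by(.mean_GBps)' result.json
```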
|
||||
|
||||
*Floating-point performance*
|
||||
|
||||
This sub-benchmark measures the achieved floating-point performance using the
|
||||
Wilson fermion, domain-wall fermion, and staggered fermion sparse matrices from Grid.
|
||||
In the `"flops"` and `"results"` section of the JSON output are recorded the best
|
||||
performances, e.g.
|
||||
```json
|
||||
{
|
||||
"Gflops_dwf4": 366.5251173474483, // domain-wall in Gflop/s/node (single precision)
|
||||
"Gflops_staggered": 7.5982861018529455, // staggered in Gflop/s/node (single precision)
|
||||
"Gflops_wilson": 15.221839719288932, // Wilson in Gflop/s/node (single precision)
|
||||
"L": 8 // local lattice size
|
||||
}
|
||||
```
|
||||
Here "best" means across a number of different implementations of the routines. Please
|
||||
see the log of the benchmark for an additional breakdown. Finally, the JSON output
|
||||
contains a "comparison point", which is the average of the L=24 and L=32 best
|
||||
domain-wall performances.
|
||||
`<env_dir>/prefix/gridbench_<config>`.
|
@ -20,12 +20,8 @@ mkdir -p "${build_dir}"
|
||||
source "${env_dir}/env.sh"
|
||||
entry=$(jq ".configs[]|select(.name==\"${cfg}\")" "${env_dir}"/grid-config.json)
|
||||
env_script=$(echo "${entry}" | jq -r ".\"env-script\"")
|
||||
cd "${build_dir}" || return
|
||||
source "${env_dir}/${env_script}"
|
||||
cd "${script_dir}"
|
||||
if [ ! -f configure ]; then
|
||||
./bootstrap.sh
|
||||
fi
|
||||
cd "${build_dir}"
|
||||
if [ ! -f Makefile ]; then
|
||||
"${script_dir}/configure" --with-grid="${env_dir}/prefix/grid_${cfg}" \
|
||||
--prefix="${env_dir}/prefix/gridbench_${cfg}"
|
||||
|
@ -1,8 +1,8 @@
|
||||
#!/usr/bin/env bash
|
||||
# shellcheck disable=SC1091
|
||||
|
||||
env_dir="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")"
|
||||
mkdir -p ~/.config/lattice-benchmarks
|
||||
echo "${env_dir}" > ~/.config/lattice-benchmarks/grid-env
|
||||
source "${env_dir}/spack/share/spack/setup-env.sh"
|
||||
spack load jq git
|
||||
GRIDENVDIR="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")"
|
||||
export GRIDENVDIR
|
||||
export PATH="${GRIDENVDIR}/prefix/base/bin:${PATH}"
|
||||
export ACLOCAL_PATH="${GRIDENVDIR}/prefix/base/share/aclocal:${ACLOCAL_PATH}"
|
||||
source "${GRIDENVDIR}"/spack/share/spack/setup-env.sh
|
||||
|
0
Grid/systems/tursa/files/cpu-mpi-wrapper.sh
Executable file → Normal file
0
Grid/systems/tursa/files/gpu-mpi-wrapper.sh
Executable file → Normal file
@ -1,17 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# OpenMP/OpenMPI/UCX environment ###############################################
|
||||
export OMP_NUM_THREADS=8
|
||||
export OMPI_MCA_btl=^uct,openib
|
||||
export OMPI_MCA_pml=ucx
|
||||
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
|
||||
export UCX_RNDV_SCHEME=put_zcopy
|
||||
export UCX_RNDV_THRESH=16384
|
||||
export UCX_IB_GPU_DIRECT_RDMA=yes
|
||||
export UCX_MEMTYPE_CACHE=n
|
||||
|
||||
# IO environment ###############################################################
|
||||
export OMPI_MCA_io=romio321
|
||||
export OMPI_MCA_btl_openib_allow_ib=true
|
||||
export OMPI_MCA_btl_openib_device_type=infiniband
|
||||
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
|
@ -1,60 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
# shellcheck disable=SC1091,SC2050,SC2170
|
||||
|
||||
#SBATCH -J benchmark-grid-16
|
||||
#SBATCH -t 1:00:00
|
||||
#SBATCH --nodes=16
|
||||
#SBATCH --ntasks=64
|
||||
#SBATCH --ntasks-per-node=4
|
||||
#SBATCH --cpus-per-task=8
|
||||
#SBATCH --partition=gpu
|
||||
#SBATCH --gres=gpu:4
|
||||
#SBATCH --output=%x.%j.out
|
||||
#SBATCH --error=%x.%j.err
|
||||
#SBATCH --qos=standard
|
||||
#SBATCH --no-requeue
|
||||
#SBATCH --gpu-freq=1410
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# load environment #############################################################
|
||||
env_cfg="${HOME}/.config/lattice-benchmarks/grid-env"
|
||||
if [ ! -f "${env_cfg}" ]; then
|
||||
echo "error: ${env_cfg} does not exists, did you execute 'source env.sh' with your user account?"
|
||||
exit 1
|
||||
fi
|
||||
env_dir="$(readlink -f "$(cat "${env_cfg}")")"
|
||||
source "${env_dir}/env.sh" # load base Spack environment
|
||||
source "${env_dir}/env-gpu.sh" # load GPU-sepcific packages
|
||||
source "${env_dir}/ompi-gpu.sh" # set GPU-specific OpenMPI variables
|
||||
|
||||
# application and parameters ###################################################
|
||||
app="${env_dir}/prefix/gridbench_gpu/bin/Benchmark_Grid"
|
||||
|
||||
# collect job information ######################################################
|
||||
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
|
||||
mkdir -p "${job_info_dir}"
|
||||
|
||||
date > "${job_info_dir}/start-date"
|
||||
set > "${job_info_dir}/env"
|
||||
ldd "${app}" > "${job_info_dir}/ldd"
|
||||
md5sum "${app}" > "${job_info_dir}/app-hash"
|
||||
readelf -a "${app}" > "${job_info_dir}/elf"
|
||||
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
|
||||
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
|
||||
|
||||
# run! #########################################################################
|
||||
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
|
||||
"${env_dir}/gpu-mpi-wrapper.sh" \
|
||||
"${app}" \
|
||||
--json-out "${job_info_dir}/result.json" \
|
||||
--mpi 1.4.4.4 \
|
||||
--accelerator-threads 8 \
|
||||
--threads 8 \
|
||||
--shm 2048 &> "${job_info_dir}/log"
|
||||
|
||||
# if we reach that point the application exited successfully ###################
|
||||
touch "${job_info_dir}/success"
|
||||
date > "${job_info_dir}/end-date"
|
||||
|
||||
################################################################################
|
@ -1,60 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
# shellcheck disable=SC1091,SC2050,SC2170
|
||||
|
||||
#SBATCH -J benchmark-grid-1
|
||||
#SBATCH -t 1:00:00
|
||||
#SBATCH --nodes=1
|
||||
#SBATCH --ntasks=4
|
||||
#SBATCH --ntasks-per-node=4
|
||||
#SBATCH --cpus-per-task=8
|
||||
#SBATCH --partition=gpu
|
||||
#SBATCH --gres=gpu:4
|
||||
#SBATCH --output=%x.%j.out
|
||||
#SBATCH --error=%x.%j.err
|
||||
#SBATCH --qos=standard
|
||||
#SBATCH --no-requeue
|
||||
#SBATCH --gpu-freq=1410
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# load environment #############################################################
|
||||
env_cfg="${HOME}/.config/lattice-benchmarks/grid-env"
|
||||
if [ ! -f "${env_cfg}" ]; then
|
||||
echo "error: ${env_cfg} does not exists, did you execute 'source env.sh' with your user account?"
|
||||
exit 1
|
||||
fi
|
||||
env_dir="$(readlink -f "$(cat "${env_cfg}")")"
|
||||
source "${env_dir}/env.sh" # load base Spack environment
|
||||
source "${env_dir}/env-gpu.sh" # load GPU-sepcific packages
|
||||
source "${env_dir}/ompi-gpu.sh" # set GPU-specific OpenMPI variables
|
||||
|
||||
# application and parameters ###################################################
|
||||
app="${env_dir}/prefix/gridbench_gpu/bin/Benchmark_Grid"
|
||||
|
||||
# collect job information ######################################################
|
||||
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
|
||||
mkdir -p "${job_info_dir}"
|
||||
|
||||
date > "${job_info_dir}/start-date"
|
||||
set > "${job_info_dir}/env"
|
||||
ldd "${app}" > "${job_info_dir}/ldd"
|
||||
md5sum "${app}" > "${job_info_dir}/app-hash"
|
||||
readelf -a "${app}" > "${job_info_dir}/elf"
|
||||
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
|
||||
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
|
||||
|
||||
# run! #########################################################################
|
||||
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
|
||||
"${env_dir}/gpu-mpi-wrapper.sh" \
|
||||
"${app}" \
|
||||
--json-out "${job_info_dir}/result.json" \
|
||||
--mpi 1.1.1.4 \
|
||||
--accelerator-threads 8 \
|
||||
--threads 8 \
|
||||
--shm 2048 &> "${job_info_dir}/log"
|
||||
|
||||
# if we reach that point the application exited successfully ###################
|
||||
touch "${job_info_dir}/success"
|
||||
date > "${job_info_dir}/end-date"
|
||||
|
||||
################################################################################
|
@ -1,60 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
# shellcheck disable=SC1091,SC2050,SC2170
|
||||
|
||||
#SBATCH -J benchmark-grid-32
|
||||
#SBATCH -t 1:00:00
|
||||
#SBATCH --nodes=32
|
||||
#SBATCH --ntasks=128
|
||||
#SBATCH --ntasks-per-node=4
|
||||
#SBATCH --cpus-per-task=8
|
||||
#SBATCH --partition=gpu
|
||||
#SBATCH --gres=gpu:4
|
||||
#SBATCH --output=%x.%j.out
|
||||
#SBATCH --error=%x.%j.err
|
||||
#SBATCH --qos=standard
|
||||
#SBATCH --no-requeue
|
||||
#SBATCH --gpu-freq=1410
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# load environment #############################################################
|
||||
env_cfg="${HOME}/.config/lattice-benchmarks/grid-env"
|
||||
if [ ! -f "${env_cfg}" ]; then
|
||||
echo "error: ${env_cfg} does not exists, did you execute 'source env.sh' with your user account?"
|
||||
exit 1
|
||||
fi
|
||||
env_dir="$(readlink -f "$(cat "${env_cfg}")")"
|
||||
source "${env_dir}/env.sh" # load base Spack environment
|
||||
source "${env_dir}/env-gpu.sh" # load GPU-sepcific packages
|
||||
source "${env_dir}/ompi-gpu.sh" # set GPU-specific OpenMPI variables
|
||||
|
||||
# application and parameters ###################################################
|
||||
app="${env_dir}/prefix/gridbench_gpu/bin/Benchmark_Grid"
|
||||
|
||||
# collect job information ######################################################
|
||||
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
|
||||
mkdir -p "${job_info_dir}"
|
||||
|
||||
date > "${job_info_dir}/start-date"
|
||||
set > "${job_info_dir}/env"
|
||||
ldd "${app}" > "${job_info_dir}/ldd"
|
||||
md5sum "${app}" > "${job_info_dir}/app-hash"
|
||||
readelf -a "${app}" > "${job_info_dir}/elf"
|
||||
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
|
||||
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
|
||||
|
||||
# run! #########################################################################
|
||||
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
|
||||
"${env_dir}/gpu-mpi-wrapper.sh" \
|
||||
"${app}" \
|
||||
--json-out "${job_info_dir}/result.json" \
|
||||
--mpi 1.4.4.8 \
|
||||
--accelerator-threads 8 \
|
||||
--threads 8 \
|
||||
--shm 2048 &> "${job_info_dir}/log"
|
||||
|
||||
# if we reach that point the application exited successfully ###################
|
||||
touch "${job_info_dir}/success"
|
||||
date > "${job_info_dir}/end-date"
|
||||
|
||||
################################################################################
|
@ -4,13 +4,7 @@ set -euo pipefail
|
||||
|
||||
gcc_spec='gcc@9.4.0'
|
||||
cuda_spec='cuda@11.4.0'
|
||||
|
||||
# hdf5 and fftw depend on OpenMPI, which we install manually. To make sure this
|
||||
# dependency is picked by spack, we specify the compiler here explicitly. For
|
||||
# most other packages we don't really care about the compiler (i.e. system
|
||||
# compiler versus ${gcc_spec})
|
||||
hdf5_spec="hdf5@1.10.7+cxx+threadsafe%${gcc_spec}"
|
||||
fftw_spec="fftw%${gcc_spec}"
|
||||
hdf5_spec='hdf5@1.10.7'
|
||||
|
||||
if (( $# != 1 )); then
|
||||
echo "usage: $(basename "$0") <env dir>" 1>&2
|
||||
@ -24,7 +18,7 @@ cd "${cwd}"
|
||||
|
||||
# General configuration ########################################################
|
||||
# build with 128 tasks
|
||||
echo 'config:
|
||||
echo 'config:
|
||||
build_jobs: 128
|
||||
build_stage:
|
||||
- $spack/var/spack/stage
|
||||
@ -44,23 +38,26 @@ rm external.yaml
|
||||
|
||||
# Base compilers ###############################################################
|
||||
# configure system base
|
||||
|
||||
spack env create base
|
||||
spack env activate base
|
||||
spack compiler find --scope site
|
||||
|
||||
# install GCC, CUDA
|
||||
spack add ${gcc_spec} ${cuda_spec}
|
||||
spack concretize
|
||||
spack env depfile -o Makefile.tmp
|
||||
make -j128 -f Makefile.tmp
|
||||
# install GCC, CUDA & LLVM
|
||||
spack install ${gcc_spec} ${cuda_spec} llvm
|
||||
|
||||
spack load llvm
|
||||
spack compiler find --scope site
|
||||
spack unload llvm
|
||||
|
||||
spack load ${gcc_spec}
|
||||
spack compiler find --scope site
|
||||
spack unload ${gcc_spec}
|
||||
|
||||
# Manual compilation of OpenMPI & UCX ##########################################
|
||||
# set build directories
|
||||
mkdir -p "${dir}"/build
|
||||
cd "${dir}"/build
|
||||
|
||||
spack load ${gcc_spec} ${cuda_spec}
|
||||
|
||||
cuda_path=$(spack find --format "{prefix}" cuda)
|
||||
gdrcopy_path=/mnt/lustre/tursafs1/apps/gdrcopy/2.3.1
|
||||
|
||||
@ -127,8 +124,8 @@ mkdir build_gpu; cd build_gpu
|
||||
--with-cuda="${cuda_path}" --disable-getpwuid \
|
||||
--with-verbs --with-slurm --enable-mpi-fortran=all \
|
||||
--with-pmix=internal --with-libevent=internal
|
||||
make -j 128
|
||||
make install
|
||||
make -j 128
|
||||
make install
|
||||
cd ..
|
||||
|
||||
# openmpi cpu build
|
||||
@ -144,65 +141,65 @@ make -j 128
|
||||
make install
|
||||
cd "${dir}"
|
||||
|
||||
ucx_spec_gpu="ucx@1.12.0.GPU%${gcc_spec}"
|
||||
ucx_spec_cpu="ucx@1.12.0.CPU%${gcc_spec}"
|
||||
openmpi_spec_gpu="openmpi@4.1.1.GPU%${gcc_spec}"
|
||||
openmpi_spec_cpu="openmpi@4.1.1.CPU%${gcc_spec}"
|
||||
|
||||
# Add externals to spack
|
||||
echo "packages:
|
||||
ucx:
|
||||
externals:
|
||||
- spec: \"${ucx_spec_gpu}\"
|
||||
- spec: \"ucx@1.12.0.GPU%gcc@9.4.0\"
|
||||
prefix: ${dir}/prefix/ucx_gpu
|
||||
- spec: \"${ucx_spec_cpu}\"
|
||||
- spec: \"ucx@1.12.0.CPU%gcc@9.4.0\"
|
||||
prefix: ${dir}/prefix/ucx_cpu
|
||||
buildable: False
|
||||
openmpi:
|
||||
externals:
|
||||
- spec: \"${openmpi_spec_gpu}\"
|
||||
- spec: \"openmpi@4.1.1.GPU%gcc@9.4.0\"
|
||||
prefix: ${dir}/prefix/ompi_gpu
|
||||
- spec: \"${openmpi_spec_cpu}\"
|
||||
- spec: \"openmpi@4.1.1.CPU%gcc@9.4.0\"
|
||||
prefix: ${dir}/prefix/ompi_cpu
|
||||
buildable: False" > spack.yaml
|
||||
|
||||
spack config --scope site add -f spack.yaml
|
||||
rm spack.yaml
|
||||
spack env deactivate
|
||||
spack install ucx@1.12.0.GPU%gcc@9.4.0 openmpi@4.1.1.GPU%gcc@9.4.0
|
||||
spack install ucx@1.12.0.CPU%gcc@9.4.0 openmpi@4.1.1.CPU%gcc@9.4.0
|
||||
|
||||
cd "${cwd}"
|
||||
|
||||
# environments #################################################################
|
||||
dev_tools=("autoconf" "automake" "libtool" "jq" "git")
|
||||
dev_tools=("autoconf" "automake" "libtool" "jq")
|
||||
ompi_gpu_hash=$(spack find --format "{hash}" openmpi@4.1.1.GPU)
|
||||
ompi_cpu_hash=$(spack find --format "{hash}" openmpi@4.1.1.CPU)
|
||||
|
||||
spack env create grid-gpu
|
||||
spack env activate grid-gpu
|
||||
spack compiler find --scope site
|
||||
spack add ${gcc_spec} ${cuda_spec} ${ucx_spec_gpu} ${openmpi_spec_gpu}
|
||||
spack add ${hdf5_spec} ${fftw_spec}
|
||||
spack add openssl gmp mpfr c-lime "${dev_tools[@]}"
|
||||
spack concretize
|
||||
spack env depfile -o Makefile.tmp
|
||||
make -j128 -f Makefile.tmp
|
||||
spack add ${gcc_spec} ${cuda_spec} "${dev_tools[@]}"
|
||||
spack add ucx@1.12.0.GPU%gcc@9.4.0 openmpi@4.1.1.GPU%gcc@9.4.0
|
||||
spack add ${hdf5_spec}+cxx+threadsafe ^/"${ompi_gpu_hash}"
|
||||
spack add fftw ^/"${ompi_gpu_hash}"
|
||||
spack add openssl gmp mpfr c-lime
|
||||
spack install
|
||||
spack env deactivate
|
||||
|
||||
spack env create grid-cpu
|
||||
spack env activate grid-cpu
|
||||
spack compiler find --scope site
|
||||
spack add ${gcc_spec} ${ucx_spec_cpu} ${openmpi_spec_cpu}
|
||||
spack add ${hdf5_spec} ${fftw_spec}
|
||||
spack add openssl gmp mpfr c-lime "${dev_tools[@]}"
|
||||
spack concretize
|
||||
spack env depfile -o Makefile.tmp
|
||||
make -j128 -f Makefile.tmp
|
||||
spack add llvm "${dev_tools[@]}"
|
||||
spack add ucx@1.12.0.CPU%gcc@9.4.0 openmpi@4.1.1.CPU%gcc@9.4.0
|
||||
spack add ${hdf5_spec}+cxx+threadsafe ^/"${ompi_cpu_hash}"
|
||||
spack add fftw ^/"${ompi_cpu_hash}"
|
||||
spack add openssl gmp mpfr c-lime
|
||||
spack install
|
||||
spack env deactivate
|
||||
|
||||
# Final setup ##################################################################
|
||||
spack clean
|
||||
#spack gc -y # "spack gc" tends to get hung up for unknown reasons
|
||||
spack gc -y
|
||||
|
||||
# add more environment variables in module loading
|
||||
spack config --scope site add 'modules:prefix_inspections:lib:[LD_LIBRARY_PATH,LIBRARY_PATH]'
|
||||
spack config --scope site add 'modules:prefix_inspections:lib64:[LD_LIBRARY_PATH,LIBRARY_PATH]'
|
||||
spack config --scope site add 'modules:prefix_inspections:include:[C_INCLUDE_PATH,CPLUS_INCLUDE_PATH,INCLUDE]'
|
||||
spack module tcl refresh -y
|
||||
|
||||
# permission change for group access
|
||||
chmod -R g+rw "${dir}/spack/var/spack/cache"
|
||||
setfacl -d -R -m g::rwX "${dir}/spack/var/spack/cache"
|
||||
|