lattice-benchmarks/Grid/Benchmark_Grid.cpp

/*
Copyright © 2015 Peter Boyle <paboyle@ph.ed.ac.uk>
Copyright © 2022 Antonin Portelli <antonin.portelli@me.com>
This is a refactoring of Benchmark_ITT.cpp from Grid
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <Grid/Grid.h>

// Standard headers used directly below; Grid.h may already pull these in,
// but listing them keeps the file self-contained.
#include <algorithm>
#include <cmath>
#include <iomanip>
#include <numeric>
#include <vector>
using namespace Grid;
std::vector<int> L_list;
std::vector<int> Ls_list;
std::vector<double> mflop_list;
double mflop_ref;
double mflop_ref_err;
int NN_global;
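// Timing statistics over a set of samples (microseconds throughout this
// file): mean, standard error of the mean sqrt(sum((x-mean)^2)/(N*(N-1))),
// and the smallest/largest sample.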
struct time_statistics
{
double mean;
double err;
double min;
double max;
void statistics(std::vector<double> v)
{
double sum = std::accumulate(v.begin(), v.end(), 0.0);
mean = sum / v.size();
std::vector<double> diff(v.size());
std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
err = std::sqrt(sq_sum / (v.size() * (v.size() - 1)));
auto result = std::minmax_element(v.begin(), v.end());
min = *result.first;
max = *result.second;
}
};
void comms_header()
{
std::cout << GridLogMessage << " L "
<< "\t"
<< " Ls "
<< "\t"
<< "bytes\t MB/s uni (err/min/max) \t\t MB/s bidi (err/min/max)" << std::endl;
}
Gamma::Algebra Gmu[] = {Gamma::Algebra::GammaX, Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT};
struct controls
{
int Opt;
int CommsOverlap;
Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch;
};
class Benchmark
{
public:
static void Decomposition(void)
{
int threads = GridThread::GetThreads();
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << "= Grid is setup to use " << threads << " threads"
<< std::endl;
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << "Grid Default Decomposition patterns\n";
std::cout << GridLogMessage << "\tOpenMP threads : " << GridThread::GetThreads()
<< std::endl;
std::cout << GridLogMessage
<< "\tMPI tasks : " << GridCmdVectorIntToString(GridDefaultMpi())
<< std::endl;
std::cout << GridLogMessage << "\tvReal : " << sizeof(vReal) * 8 << "bits ; "
<< GridCmdVectorIntToString(GridDefaultSimd(4, vReal::Nsimd()))
<< std::endl;
std::cout << GridLogMessage << "\tvRealF : " << sizeof(vRealF) * 8
<< "bits ; "
<< GridCmdVectorIntToString(GridDefaultSimd(4, vRealF::Nsimd()))
<< std::endl;
std::cout << GridLogMessage << "\tvRealD : " << sizeof(vRealD) * 8
<< "bits ; "
<< GridCmdVectorIntToString(GridDefaultSimd(4, vRealD::Nsimd()))
<< std::endl;
std::cout << GridLogMessage << "\tvComplex : " << sizeof(vComplex) * 8
<< "bits ; "
<< GridCmdVectorIntToString(GridDefaultSimd(4, vComplex::Nsimd()))
<< std::endl;
std::cout << GridLogMessage << "\tvComplexF : " << sizeof(vComplexF) * 8
<< "bits ; "
<< GridCmdVectorIntToString(GridDefaultSimd(4, vComplexF::Nsimd()))
<< std::endl;
std::cout << GridLogMessage << "\tvComplexD : " << sizeof(vComplexD) * 8
<< "bits ; "
<< GridCmdVectorIntToString(GridDefaultSimd(4, vComplexD::Nsimd()))
<< std::endl;
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
}
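// Halo-exchange benchmark: for each lattice size and each partitioned
// direction (forward and backward), time Nloop SendToRecvFrom exchanges of
// one face worth of half-spinors (lat^3 x Ls) and report uni- and
// bidirectional bandwidth with error and min/max.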
static void Comms(void)
{
int Nloop = 200;
int nmu = 0;
int maxlat = 32;
Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
for (int mu = 0; mu < Nd; mu++)
if (mpi_layout[mu] > 1)
nmu++;
std::vector<double> t_time(Nloop);
time_statistics timestat;
std::cout << GridLogMessage
<< "======================================================================="
"============================="
<< std::endl;
std::cout << GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "
<< nmu << " dimensions" << std::endl;
std::cout << GridLogMessage
<< "======================================================================="
"============================="
<< std::endl;
comms_header();
for (int lat = 16; lat <= maxlat; lat += 8)
{
int Ls = 12;
Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
lat * mpi_layout[3]});
GridCartesian Grid(latt_size, simd_layout, mpi_layout);
RealD Nrank = Grid._Nprocessors;
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank / Nnode;
std::vector<HalfSpinColourVectorD *> xbuf(8);
std::vector<HalfSpinColourVectorD *> rbuf(8);
uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD);
for (int d = 0; d < 8; d++)
{
xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
// bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
// bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
}
double dbytes;
for (int dir = 0; dir < 8; dir++)
{
int mu = dir % 4;
if (mpi_layout[mu] > 1)
{
std::vector<double> times(Nloop);
for (int i = 0; i < Nloop; i++)
{
dbytes = 0;
double start = usecond();
int xmit_to_rank;
int recv_from_rank;
if (dir == mu)
{
int comm_proc = 1;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
}
else
{
int comm_proc = mpi_layout[mu] - 1;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
}
Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
(void *)&rbuf[dir][0], recv_from_rank, bytes);
dbytes += bytes;
double stop = usecond();
t_time[i] = stop - start; // microseconds
}
timestat.statistics(t_time);
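// Per-node bandwidth figures: ppn = ranks per node, 'xbytes' the
// unidirectional and 'bidibytes' the bidirectional volume. Note that dbytes
// is reset each iteration, so it ends up equal to 'bytes' for a single call.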
dbytes = dbytes * ppn;
double xbytes = dbytes * 0.5;
double bidibytes = dbytes;
std::cout << GridLogMessage << lat << "\t" << Ls << "\t " << bytes << " \t "
<< xbytes / timestat.mean << " \t "
<< xbytes * timestat.err / (timestat.mean * timestat.mean) << " \t "
<< xbytes / timestat.max << " " << xbytes / timestat.min << "\t\t"
<< bidibytes / timestat.mean << " "
<< bidibytes * timestat.err / (timestat.mean * timestat.mean) << " "
<< bidibytes / timestat.max << " " << bidibytes / timestat.min
<< std::endl;
}
}
for (int d = 0; d < 8; d++)
{
acceleratorFreeDevice(xbuf[d]);
acceleratorFreeDevice(rbuf[d]);
}
}
return;
}
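// Streaming-bandwidth benchmark: repeatedly evaluate z = a*x - y (two read
// streams plus one write, hence bytes = 3*vol*Nvec*sizeof(Real)). The time
// is converted to nanoseconds per iteration so bytes/time reads as GB/s.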
static void Memory(void)
{
const int Nvec = 8;
typedef Lattice<iVector<vReal, Nvec>> LatticeVec;
typedef iVector<vReal, Nvec> Vec;
Coordinate simd_layout = GridDefaultSimd(Nd, vReal::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << "= Benchmarking a*x - y bandwidth" << std::endl;
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << " L "
<< "\t\t"
<< "bytes"
<< "\t\t\t"
<< "GB/s"
<< "\t\t"
<< "Gflop/s"
<< "\t\t seconds"
<< "\t\tGB/s / node" << std::endl;
std::cout << GridLogMessage
<< "----------------------------------------------------------"
<< std::endl;
// uint64_t NP;
uint64_t NN;
uint64_t lmax = 32;
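// Scale the iteration count inversely with the volume so every lattice size
// does a comparable amount of work ('lat' comes from the loop in which the
// macro is expanded).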
#define NLOOP (1000 * lmax * lmax * lmax * lmax / lat / lat / lat / lat)
GridSerialRNG sRNG;
sRNG.SeedFixedIntegers(std::vector<int>({45, 12, 81, 9}));
for (int lat = 8; lat <= lmax; lat += 8)
{
Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
lat * mpi_layout[3]});
int64_t vol = latt_size[0] * latt_size[1] * latt_size[2] * latt_size[3];
GridCartesian Grid(latt_size, simd_layout, mpi_layout);
// NP= Grid.RankCount();
NN = Grid.NodeCount();
Vec rn;
random(sRNG, rn);
LatticeVec z(&Grid);
z = Zero();
LatticeVec x(&Grid);
x = Zero();
LatticeVec y(&Grid);
y = Zero();
double a = 2.0;
uint64_t Nloop = NLOOP;
double start = usecond();
for (int i = 0; i < Nloop; i++)
{
z = a * x - y;
}
double stop = usecond();
double time = (stop - start) / Nloop * 1000;
double flops = vol * Nvec * 2; // mul,add
double bytes = 3.0 * vol * Nvec * sizeof(Real);
std::cout << GridLogMessage << std::setprecision(3) << lat << "\t\t" << bytes
<< " \t\t" << bytes / time << "\t\t" << flops / time << "\t\t"
<< (stop - start) / 1000. / 1000. << "\t\t" << bytes / time / NN
<< std::endl;
}
}
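// SU(4) bandwidth benchmark: z = x*y on a lattice of 4x4 complex matrices.
// Per output element this costs one complex multiply (6 flops) plus
// (Nc4 - 1) multiply-adds (8 flops each); bytes assume three matrix streams.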
static void SU4(void)
{
const int Nc4 = 4;
typedef Lattice<iMatrix<vComplexF, Nc4>> LatticeSU4;
Coordinate simd_layout = GridDefaultSimd(Nd, vComplexF::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << "= Benchmarking z = y*x SU(4) bandwidth" << std::endl;
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << " L "
<< "\t\t"
<< "bytes"
<< "\t\t\t"
<< "GB/s"
<< "\t\t"
<< "Gflop/s"
<< "\t\t seconds"
<< "\t\tGB/s / node" << std::endl;
std::cout << GridLogMessage
<< "----------------------------------------------------------"
<< std::endl;
uint64_t NN;
uint64_t lmax = 32;
GridSerialRNG sRNG;
sRNG.SeedFixedIntegers(std::vector<int>({45, 12, 81, 9}));
for (int lat = 8; lat <= lmax; lat += 8)
{
Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
lat * mpi_layout[3]});
int64_t vol = latt_size[0] * latt_size[1] * latt_size[2] * latt_size[3];
GridCartesian Grid(latt_size, simd_layout, mpi_layout);
NN = Grid.NodeCount();
LatticeSU4 z(&Grid);
z = Zero();
LatticeSU4 x(&Grid);
x = Zero();
LatticeSU4 y(&Grid);
y = Zero();
// double a=2.0;
uint64_t Nloop = NLOOP;
double start = usecond();
for (int i = 0; i < Nloop; i++)
{
z = x * y;
}
double stop = usecond();
double time = (stop - start) / Nloop * 1000;
double flops = vol * Nc4 * Nc4 * (6 + (Nc4 - 1) * 8); // mul,add
double bytes = 3.0 * vol * Nc4 * Nc4 * 2 * sizeof(RealF);
std::cout << GridLogMessage << std::setprecision(3) << lat << "\t\t" << bytes
<< " \t\t" << bytes / time << "\t\t" << flops / time << "\t\t"
<< (stop - start) / 1000. / 1000. << "\t\t" << bytes / time / NN
<< std::endl;
}
}
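// Benchmark the single-precision even-odd domain-wall Dhop on an L^4 x Ls
// local volume, for four combinations of comms/compute overlap and
// communicator policy. Returns the best mflop/s over the cases.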
static double DWF(int Ls, int L)
{
RealD mass = 0.1;
RealD M5 = 1.8;
double mflops;
double mflops_best = 0;
double mflops_worst = 0;
std::vector<double> mflops_all;
///////////////////////////////////////////////////////
// Set/Get the layout & grid size
///////////////////////////////////////////////////////
int threads = GridThread::GetThreads();
Coordinate mpi = GridDefaultMpi();
assert(mpi.size() == 4);
Coordinate local({L, L, L, L});
Coordinate latt4(
{local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]});
GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid(
latt4, GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount();
uint64_t NN = TmpGrid->NodeCount();
NN_global = NN;
uint64_t SHM = NP / NN;
///////// Welcome message ////////////
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << "Benchmark DWF on " << L << "^4 local volume "
<< std::endl;
std::cout << GridLogMessage << "* Nc : " << Nc << std::endl;
std::cout << GridLogMessage
<< "* Global volume : " << GridCmdVectorIntToString(latt4) << std::endl;
std::cout << GridLogMessage << "* Ls : " << Ls << std::endl;
std::cout << GridLogMessage << "* ranks : " << NP << std::endl;
std::cout << GridLogMessage << "* nodes : " << NN << std::endl;
std::cout << GridLogMessage << "* ranks/node : " << SHM << std::endl;
std::cout << GridLogMessage << "* ranks geom : " << GridCmdVectorIntToString(mpi)
<< std::endl;
std::cout << GridLogMessage << "* Using " << threads << " threads" << std::endl;
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
2023-01-13 19:00:20 +00:00
///////// Lattice Init ////////////
2023-01-25 17:47:40 +00:00
GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
latt4, GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
///////// RNG Init ////////////
std::vector<int> seeds4({1, 2, 3, 4});
std::vector<int> seeds5({5, 6, 7, 8});
GridParallelRNG RNG4(UGrid);
RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5(FGrid);
RNG5.SeedFixedIntegers(seeds5);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
typedef DomainWallFermionF Action;
typedef typename Action::FermionField Fermion;
typedef LatticeGaugeFieldF Gauge;
///////// Source preparation ////////////
Gauge Umu(UGrid);
SU<Nc>::HotConfiguration(RNG4, Umu);
Fermion src(FGrid);
random(RNG5, src);
Fermion src_e(FrbGrid);
Fermion src_o(FrbGrid);
Fermion r_e(FrbGrid);
Fermion r_o(FrbGrid);
Fermion r_eo(FGrid);
Action Dw(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5);
{
pickCheckerboard(Even, src_e, src);
pickCheckerboard(Odd, src_o, src);
const int num_cases = 4;
std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
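// Legend for 'fmt', in the order of the cases below: G = generic kernels,
// S/O = sequential vs overlapped comms/compute, and the final letter is the
// communicator policy, C = concurrent, S = sequential.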
controls Cases[] = {
{WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsThenCompute,
CartesianCommunicator::CommunicatorPolicyConcurrent},
{WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsAndCompute,
CartesianCommunicator::CommunicatorPolicyConcurrent},
{WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsThenCompute,
CartesianCommunicator::CommunicatorPolicySequential},
{WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsAndCompute,
CartesianCommunicator::CommunicatorPolicySequential}};
for (int c = 0; c < num_cases; c++)
{
WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
WilsonKernelsStatic::Opt = Cases[c].Opt;
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
std::cout << GridLogMessage
<< "==================================================================="
"==============="
<< std::endl;
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric)
std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl;
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl;
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
std::cout << GridLogMessage << "* Using sequential Comms/Compute" << std::endl;
std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
std::cout << GridLogMessage
<< "==================================================================="
"==============="
<< std::endl;
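// Warm up the operator, then time ncall applications of DhopEO individually
// so that mean, error and min/max can be extracted.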
int nwarm = 10;
double t0 = usecond();
FGrid->Barrier();
for (int i = 0; i < nwarm; i++)
{
Dw.DhopEO(src_o, r_e, DaggerNo);
}
FGrid->Barrier();
double t1 = usecond();
uint64_t ncall = 500;
FGrid->Broadcast(0, &ncall, sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second" << std::endl;
Dw.ZeroCounters();
time_statistics timestat;
std::vector<double> t_time(ncall);
for (uint64_t i = 0; i < ncall; i++)
{
t0 = usecond();
Dw.DhopEO(src_o, r_e, DaggerNo);
t1 = usecond();
t_time[i] = t1 - t0;
}
FGrid->Barrier();
double volume = Ls;
for (int mu = 0; mu < Nd; mu++)
volume = volume * latt4[mu];
// Nc=3 gives
// 1344= 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8
// 1344 = Nc* (6+(Nc-1)*8)*2*Nd + Nd*Nc*2*2 + Nd*Nc*Ns*2
// double flops=(1344.0*volume)/2;
#if 0
double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + Nd*Nc*Ns + Nd*Nc*Ns*2;
#else
double fps =
Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2;
#endif
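// The even-odd operator acts on half the sites, hence the factor 1/2.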
double flops = (fps * volume) / 2;
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops / timestat.min;
mf_lo = flops / timestat.max;
mf_err = flops / timestat.min * timestat.err / timestat.mean;
mflops = flops / timestat.mean;
mflops_all.push_back(mflops);
if (mflops_best == 0)
mflops_best = mflops;
if (mflops_worst == 0)
mflops_worst = mflops;
if (mflops > mflops_best)
mflops_best = mflops;
if (mflops < mflops_worst)
mflops_worst = mflops;
std::cout << GridLogMessage << "Deo FlopsPerSite is " << fps << std::endl;
std::cout << GridLogMessage << std::fixed << std::setprecision(1)
<< "Deo mflop/s = " << mflops << " (" << mf_err << ") " << mf_lo
<< "-" << mf_hi << std::endl;
std::cout << GridLogMessage << std::fixed << std::setprecision(1)
<< "Deo mflop/s per rank " << mflops / NP << std::endl;
std::cout << GridLogMessage << std::fixed << std::setprecision(1)
<< "Deo mflop/s per node " << mflops / NN << std::endl;
}
std::cout << GridLogMessage
<< "====================================================================="
"============="
<< std::endl;
std::cout << GridLogMessage << L << "^4 x " << Ls
<< " Deo Best mflop/s = " << mflops_best << " ; "
<< mflops_best / NN << " per node " << std::endl;
std::cout << GridLogMessage << L << "^4 x " << Ls
<< " Deo Worst mflop/s = " << mflops_worst << " ; "
<< mflops_worst / NN << " per node " << std::endl;
std::cout << GridLogMessage << fmt << std::endl;
std::cout << GridLogMessage;
for (int i = 0; i < mflops_all.size(); i++)
{
std::cout << mflops_all[i] / NN << " ; ";
}
std::cout << std::endl;
std::cout << GridLogMessage
<< "====================================================================="
"============="
<< std::endl;
}
return mflops_best;
}
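// Benchmark the single-precision even-odd improved-staggered Dhop on an L^4
// local volume, using the same four comms/compute cases as DWF above.
// Returns the best mflop/s over the cases.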
static double Staggered(int L)
{
double mflops;
double mflops_best = 0;
double mflops_worst = 0;
std::vector<double> mflops_all;
///////////////////////////////////////////////////////
// Set/Get the layout & grid size
///////////////////////////////////////////////////////
int threads = GridThread::GetThreads();
Coordinate mpi = GridDefaultMpi();
assert(mpi.size() == 4);
Coordinate local({L, L, L, L});
Coordinate latt4(
{local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]});
GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid(
latt4, GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount();
uint64_t NN = TmpGrid->NodeCount();
NN_global = NN;
uint64_t SHM = NP / NN;
///////// Welcome message ////////////
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << "Benchmark ImprovedStaggered on " << L
<< "^4 local volume " << std::endl;
std::cout << GridLogMessage
<< "* Global volume : " << GridCmdVectorIntToString(latt4) << std::endl;
std::cout << GridLogMessage << "* ranks : " << NP << std::endl;
std::cout << GridLogMessage << "* nodes : " << NN << std::endl;
std::cout << GridLogMessage << "* ranks/node : " << SHM << std::endl;
std::cout << GridLogMessage << "* ranks geom : " << GridCmdVectorIntToString(mpi)
<< std::endl;
std::cout << GridLogMessage << "* Using " << threads << " threads" << std::endl;
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
///////// Lattice Init ////////////
GridCartesian *FGrid = SpaceTimeGrid::makeFourDimGrid(
latt4, GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
///////// RNG Init ////////////
std::vector<int> seeds4({1, 2, 3, 4});
GridParallelRNG RNG4(FGrid);
RNG4.SeedFixedIntegers(seeds4);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
RealD mass = 0.1;
RealD c1 = 9.0 / 8.0;
RealD c2 = -1.0 / 24.0;
RealD u0 = 1.0;
typedef ImprovedStaggeredFermionF Action;
typedef typename Action::FermionField Fermion;
typedef LatticeGaugeFieldF Gauge;
Gauge Umu(FGrid);
SU<Nc>::HotConfiguration(RNG4, Umu);
typename Action::ImplParams params;
Action Ds(Umu, Umu, *FGrid, *FrbGrid, mass, c1, c2, u0, params);
///////// Source preparation ////////////
Fermion src(FGrid);
random(RNG4, src);
Fermion src_e(FrbGrid);
Fermion src_o(FrbGrid);
Fermion r_e(FrbGrid);
Fermion r_o(FrbGrid);
Fermion r_eo(FGrid);
{
pickCheckerboard(Even, src_e, src);
pickCheckerboard(Odd, src_o, src);
const int num_cases = 4;
std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
controls Cases[] = {
{StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsThenCompute,
CartesianCommunicator::CommunicatorPolicyConcurrent},
{StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsAndCompute,
CartesianCommunicator::CommunicatorPolicyConcurrent},
{StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsThenCompute,
CartesianCommunicator::CommunicatorPolicySequential},
{StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsAndCompute,
CartesianCommunicator::CommunicatorPolicySequential}};
for (int c = 0; c < num_cases; c++)
{
StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap;
StaggeredKernelsStatic::Opt = Cases[c].Opt;
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
std::cout << GridLogMessage
<< "==================================================================="
"==============="
<< std::endl;
if (StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric)
std::cout << GridLogMessage << "* Using GENERIC Nc StaggeredKernels"
<< std::endl;
if (StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute)
std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl;
if (StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsThenCompute)
std::cout << GridLogMessage << "* Using sequential Comms/Compute" << std::endl;
std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
std::cout << GridLogMessage
<< "==================================================================="
"==============="
<< std::endl;
int nwarm = 10;
double t0 = usecond();
FGrid->Barrier();
for (int i = 0; i < nwarm; i++)
{
Ds.DhopEO(src_o, r_e, DaggerNo);
}
FGrid->Barrier();
double t1 = usecond();
uint64_t ncall = 500;
FGrid->Broadcast(0, &ncall, sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second" << std::endl;
Ds.ZeroCounters();
time_statistics timestat;
std::vector<double> t_time(ncall);
for (uint64_t i = 0; i < ncall; i++)
{
t0 = usecond();
Ds.DhopEO(src_o, r_e, DaggerNo);
t1 = usecond();
t_time[i] = t1 - t0;
}
FGrid->Barrier();
double volume = 1;
for (int mu = 0; mu < Nd; mu++)
volume = volume * latt4[mu];
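// 1146 is the flop count per site used here for the improved staggered
// operator; the factor 1/2 is again the even-odd checkerboard.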
double flops = (1146.0 * volume) / 2;
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops / timestat.min;
mf_lo = flops / timestat.max;
mf_err = flops / timestat.min * timestat.err / timestat.mean;
mflops = flops / timestat.mean;
mflops_all.push_back(mflops);
if (mflops_best == 0)
mflops_best = mflops;
if (mflops_worst == 0)
mflops_worst = mflops;
if (mflops > mflops_best)
mflops_best = mflops;
if (mflops < mflops_worst)
mflops_worst = mflops;
std::cout << GridLogMessage << std::fixed << std::setprecision(1)
<< "Deo mflop/s = " << mflops << " (" << mf_err << ") " << mf_lo
<< "-" << mf_hi << std::endl;
std::cout << GridLogMessage << std::fixed << std::setprecision(1)
<< "Deo mflop/s per rank " << mflops / NP << std::endl;
std::cout << GridLogMessage << std::fixed << std::setprecision(1)
<< "Deo mflop/s per node " << mflops / NN << std::endl;
}
std::cout << GridLogMessage
<< "====================================================================="
"============="
<< std::endl;
std::cout << GridLogMessage << L
<< "^4 Deo Best mflop/s = " << mflops_best << " ; "
<< mflops_best / NN << " per node " << std::endl;
std::cout << GridLogMessage << L
<< "^4 Deo Worst mflop/s = " << mflops_worst << " ; "
<< mflops_worst / NN << " per node " << std::endl;
std::cout << GridLogMessage << fmt << std::endl;
std::cout << GridLogMessage;
for (int i = 0; i < mflops_all.size(); i++)
{
std::cout << mflops_all[i] / NN << " ; ";
}
std::cout << std::endl;
}
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
return mflops_best;
}
};
int main(int argc, char **argv)
{
Grid_init(&argc, &argv);
CartesianCommunicator::SetCommunicatorPolicy(
CartesianCommunicator::CommunicatorPolicySequential);
#ifdef KNL
LebesgueOrder::Block = std::vector<int>({8, 2, 2, 2});
#else
LebesgueOrder::Block = std::vector<int>({2, 2, 2, 2});
#endif
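// Run order: decomposition report; dslash benchmarks (Wilson as Ls=1 DWF,
// domain wall at Ls=12, improved staggered); then the memory, SU(4) and
// comms benchmarks; finally the per-node summary and comparison point.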
Benchmark::Decomposition();
int do_su4 = 1;
int do_memory = 1;
int do_comms = 1;
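// 'sel' and 'selm1' index the two largest volumes in L_list (L = 32 and 24);
// their per-node DWF results are averaged for the final comparison point.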
int sel = 4;
std::vector<int> L_list({8, 12, 16, 24, 32});
int selm1 = sel - 1;
std::vector<double> wilson;
std::vector<double> dwf4;
std::vector<double> staggered;
int Ls = 1;
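// Ls = 1 reduces the domain-wall benchmark to a Wilson-like 4D dslash; Ls is
// reset to 12 below for the domain-wall runs.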
std::cout << GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
std::cout << GridLogMessage << " Wilson dslash 4D vectorised" << std::endl;
std::cout << GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
for (int l = 0; l < L_list.size(); l++)
{
wilson.push_back(Benchmark::DWF(Ls, L_list[l]));
}
Ls = 12;
std::cout << GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
std::cout << GridLogMessage << " Domain wall dslash 4D vectorised" << std::endl;
std::cout << GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
for (int l = 0; l < L_list.size(); l++)
{
double result = Benchmark::DWF(Ls, L_list[l]);
dwf4.push_back(result);
}
std::cout << GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
std::cout << GridLogMessage << " Improved Staggered dslash 4D vectorised" << std::endl;
std::cout << GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
for (int l = 0; l < L_list.size(); l++)
{
double result = Benchmark::Staggered(L_list[l]);
staggered.push_back(result);
}
std::cout << GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
std::cout << GridLogMessage << " Summary table Ls=" << Ls << std::endl;
std::cout << GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
std::cout << GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\t Staggered" << std::endl;
for (int l = 0; l < L_list.size(); l++)
{
std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] << " \t\t "
<< dwf4[l] << " \t\t " << staggered[l] << std::endl;
}
std::cout << GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
int NN = NN_global;
if (do_memory)
{
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << " Memory benchmark " << std::endl;
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
Benchmark::Memory();
}
if (do_su4)
{
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << " SU(4) benchmark " << std::endl;
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
Benchmark::SU4();
}
if (do_comms)
{
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
std::cout << GridLogMessage << " Communications benchmark " << std::endl;
std::cout << GridLogMessage
<< "======================================================================="
"==========="
<< std::endl;
Benchmark::Comms();
}
std::cout << GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
std::cout << GridLogMessage << " Per Node Summary table Ls=" << Ls << std::endl;
std::cout << GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
std::cout << GridLogMessage << " L \t\t Wilson\t\t DWF4\t\t Staggered " << std::endl;
for (int l = 0; l < L_list.size(); l++)
{
std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] / NN << " \t "
<< dwf4[l] / NN << " \t " << staggered[l] / NN << std::endl;
}
std::cout << GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
std::cout << GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
std::cout << GridLogMessage
<< " Comparison point result: " << 0.5 * (dwf4[sel] + dwf4[selm1]) / NN
<< " Mflop/s per node" << std::endl;
std::cout << GridLogMessage << " Comparison point is 0.5*(" << dwf4[sel] / NN << "+"
<< dwf4[selm1] / NN << ") " << std::endl;
std::cout << std::setprecision(3);
std::cout << GridLogMessage
<< "========================================================================="
"========="
<< std::endl;
Grid_finalize();
}