Merge remote-tracking branch 'upstream/main' into main

commit 5f976b361c
Simon Bürger, 2023-01-26 23:14:32 +00:00

10 changed files with 740 additions and 457 deletions

View File

@@ -6,7 +6,7 @@
 BreakBeforeBraces: Allman,
 AllowShortIfStatementsOnASingleLine: false,
 IndentCaseLabels: false,
-ColumnLimit: 0,
+ColumnLimit: 90,
 AccessModifierOffset: -4,
 NamespaceIndentation: All,
 FixNamespaceComments: false,

View File

@@ -18,6 +18,7 @@ You should have received a copy of the GNU General Public License
 along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
+#include "Common.hpp"
 #include <Grid/Grid.h>
 
 using namespace Grid;
@@ -44,8 +45,7 @@ struct time_statistics
     mean = sum / v.size();
 
     std::vector<double> diff(v.size());
-    std::transform(v.begin(), v.end(), diff.begin(), [=](double x)
-                   { return x - mean; });
+    std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
     double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
     err = std::sqrt(sq_sum / (v.size() * (v.size() - 1)));
@@ -64,11 +64,8 @@ void comms_header()
             << "bytes\t MB/s uni (err/min/max) \t\t MB/s bidi (err/min/max)" << std::endl;
 };
 
-Gamma::Algebra Gmu[] = {
-    Gamma::Algebra::GammaX,
-    Gamma::Algebra::GammaY,
-    Gamma::Algebra::GammaZ,
-    Gamma::Algebra::GammaT};
+Gamma::Algebra Gmu[] = {Gamma::Algebra::GammaX, Gamma::Algebra::GammaY,
+                        Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT};
 
 struct controls
 {
@@ -84,26 +81,52 @@ class Benchmark
   {
     int threads = GridThread::GetThreads();
 
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
-    std::cout << GridLogMessage << "= Grid is setup to use " << threads << " threads" << std::endl;
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
+    std::cout << GridLogMessage
+              << "======================================================================="
+                 "==========="
+              << std::endl;
+    std::cout << GridLogMessage << "= Grid is setup to use " << threads << " threads"
+              << std::endl;
+    std::cout << GridLogMessage
+              << "======================================================================="
+                 "==========="
+              << std::endl;
     std::cout << GridLogMessage << "Grid Default Decomposition patterns\n";
-    std::cout << GridLogMessage << "\tOpenMP threads : " << GridThread::GetThreads() << std::endl;
-    std::cout << GridLogMessage << "\tMPI tasks : " << GridCmdVectorIntToString(GridDefaultMpi()) << std::endl;
-    std::cout << GridLogMessage << "\tvReal : " << sizeof(vReal) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vReal::Nsimd())) << std::endl;
-    std::cout << GridLogMessage << "\tvRealF : " << sizeof(vRealF) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vRealF::Nsimd())) << std::endl;
-    std::cout << GridLogMessage << "\tvRealD : " << sizeof(vRealD) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vRealD::Nsimd())) << std::endl;
-    std::cout << GridLogMessage << "\tvComplex : " << sizeof(vComplex) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vComplex::Nsimd())) << std::endl;
-    std::cout << GridLogMessage << "\tvComplexF : " << sizeof(vComplexF) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vComplexF::Nsimd())) << std::endl;
-    std::cout << GridLogMessage << "\tvComplexD : " << sizeof(vComplexD) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vComplexD::Nsimd())) << std::endl;
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
+    std::cout << GridLogMessage << "\tOpenMP threads : " << GridThread::GetThreads()
+              << std::endl;
+    std::cout << GridLogMessage
+              << "\tMPI tasks : " << GridCmdVectorIntToString(GridDefaultMpi())
+              << std::endl;
+    std::cout << GridLogMessage << "\tvReal : " << sizeof(vReal) * 8 << "bits ; "
+              << GridCmdVectorIntToString(GridDefaultSimd(4, vReal::Nsimd()))
+              << std::endl;
+    std::cout << GridLogMessage << "\tvRealF : " << sizeof(vRealF) * 8
+              << "bits ; "
+              << GridCmdVectorIntToString(GridDefaultSimd(4, vRealF::Nsimd()))
+              << std::endl;
+    std::cout << GridLogMessage << "\tvRealD : " << sizeof(vRealD) * 8
+              << "bits ; "
+              << GridCmdVectorIntToString(GridDefaultSimd(4, vRealD::Nsimd()))
+              << std::endl;
+    std::cout << GridLogMessage << "\tvComplex : " << sizeof(vComplex) * 8
+              << "bits ; "
+              << GridCmdVectorIntToString(GridDefaultSimd(4, vComplex::Nsimd()))
+              << std::endl;
+    std::cout << GridLogMessage << "\tvComplexF : " << sizeof(vComplexF) * 8
+              << "bits ; "
+              << GridCmdVectorIntToString(GridDefaultSimd(4, vComplexF::Nsimd()))
+              << std::endl;
+    std::cout << GridLogMessage << "\tvComplexD : " << sizeof(vComplexD) * 8
+              << "bits ; "
+              << GridCmdVectorIntToString(GridDefaultSimd(4, vComplexD::Nsimd()))
+              << std::endl;
   }
 
   static void Comms(void)
   {
     int Nloop = 200;
     int nmu = 0;
-    int maxlat = 32;
+    int maxlat = 48;
 
     Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
     Coordinate mpi_layout = GridDefaultMpi();
@@ -115,94 +138,89 @@ class Benchmark
     std::vector<double> t_time(Nloop);
     time_statistics timestat;
 
-    std::cout << GridLogMessage << "====================================================================================================" << std::endl;
-    std::cout << GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in " << nmu << " dimensions" << std::endl;
-    std::cout << GridLogMessage << "====================================================================================================" << std::endl;
-    comms_header();
+    std::cout << GridLogMessage
+              << "======================================================================="
+                 "============================="
+              << std::endl;
+    std::cout << GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "
+              << nmu << " dimensions" << std::endl;
+    std::cout << GridLogMessage
+              << "======================================================================="
+                 "============================="
+              << std::endl;
+    grid_printf("%5s %5s %15s %15s %15s %15s %15s\n", "L", "dir", "payload (B)",
+                "time (usec)", "rate (GB/s)", "std dev", "max");
 
     for (int lat = 16; lat <= maxlat; lat += 8)
     {
-      // for(int Ls=8;Ls<=8;Ls*=2){
-      {
-        int Ls = 12;
-
-        Coordinate latt_size({lat * mpi_layout[0],
-                              lat * mpi_layout[1],
-                              lat * mpi_layout[2],
-                              lat * mpi_layout[3]});
-
-        GridCartesian Grid(latt_size, simd_layout, mpi_layout);
-        RealD Nrank = Grid._Nprocessors;
-        RealD Nnode = Grid.NodeCount();
-        RealD ppn = Nrank / Nnode;
-
-        std::vector<HalfSpinColourVectorD *> xbuf(8);
-        std::vector<HalfSpinColourVectorD *> rbuf(8);
-        // Grid.ShmBufferFreeAll();
-        uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD);
-        for (int d = 0; d < 8; d++)
-        {
-          xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
-          rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
-          // bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-          // bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-        }
-
-        // int ncomm;
-        double dbytes;
-
-        for (int dir = 0; dir < 8; dir++)
-        {
-          int mu = dir % 4;
-          if (mpi_layout[mu] > 1)
-          {
-            std::vector<double> times(Nloop);
-            for (int i = 0; i < Nloop; i++)
-            {
-              dbytes = 0;
-              double start = usecond();
-              int xmit_to_rank;
-              int recv_from_rank;
-              if (dir == mu)
-              {
-                int comm_proc = 1;
-                Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
-              }
-              else
-              {
-                int comm_proc = mpi_layout[mu] - 1;
-                Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
-              }
-              Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
-                                  (void *)&rbuf[dir][0], recv_from_rank,
-                                  bytes);
-              dbytes += bytes;
-              double stop = usecond();
-              t_time[i] = stop - start; // microseconds
-            }
-            timestat.statistics(t_time);
-
-            dbytes = dbytes * ppn;
-            double xbytes = dbytes * 0.5;
-            double bidibytes = dbytes;
-
-            std::cout << GridLogMessage << lat << "\t" << Ls << "\t "
-                      << bytes << " \t "
-                      << xbytes / timestat.mean << " \t " << xbytes * timestat.err / (timestat.mean * timestat.mean) << " \t "
-                      << xbytes / timestat.max << " " << xbytes / timestat.min
-                      << "\t\t" << bidibytes / timestat.mean << " " << bidibytes * timestat.err / (timestat.mean * timestat.mean) << " "
-                      << bidibytes / timestat.max << " " << bidibytes / timestat.min << std::endl;
-          }
-        }
-        for (int d = 0; d < 8; d++)
-        {
-          acceleratorFreeDevice(xbuf[d]);
-          acceleratorFreeDevice(rbuf[d]);
-        }
-      }
+      int Ls = 12;
+
+      Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
+                            lat * mpi_layout[3]});
+
+      GridCartesian Grid(latt_size, simd_layout, mpi_layout);
+      RealD Nrank = Grid._Nprocessors;
+      RealD Nnode = Grid.NodeCount();
+      RealD ppn = Nrank / Nnode;
+
+      std::vector<HalfSpinColourVectorD *> xbuf(8);
+      std::vector<HalfSpinColourVectorD *> rbuf(8);
+      uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD);
+      for (int d = 0; d < 8; d++)
+      {
+        xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
+        rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
+      }
+
+      double dbytes;
+
+      for (int dir = 0; dir < 8; dir++)
+      {
+        int mu = dir % 4;
+        if (mpi_layout[mu] > 1)
+        {
+          std::vector<double> times(Nloop);
+          for (int i = 0; i < Nloop; i++)
+          {
+            dbytes = 0;
+            double start = usecond();
+            int xmit_to_rank;
+            int recv_from_rank;
+            if (dir == mu)
+            {
+              int comm_proc = 1;
+              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
+            }
+            else
+            {
+              int comm_proc = mpi_layout[mu] - 1;
+              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
+            }
+            Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
+                                (void *)&rbuf[dir][0], recv_from_rank, bytes);
+            dbytes += bytes;
+
+            double stop = usecond();
+            t_time[i] = stop - start; // microseconds
+          }
+          timestat.statistics(t_time);
+
+          dbytes = dbytes * ppn;
+          double bidibytes = 2. * dbytes;
+          double rate = bidibytes / (timestat.mean / 1.e6) / 1024. / 1024. / 1024.;
+          double rate_err = rate * timestat.err / timestat.mean;
+          double rate_max = rate * timestat.mean / timestat.min;
+          grid_printf("%5d %5d %15d %15.2f %15.2f %15.1f %15.2f\n", lat, dir, bytes,
+                      timestat.mean, rate, rate_err, rate_max);
+        }
+      }
+      for (int d = 0; d < 8; d++)
+      {
+        acceleratorFreeDevice(xbuf[d]);
+        acceleratorFreeDevice(rbuf[d]);
+      }
     }
 
     return;
@@ -217,9 +235,15 @@ class Benchmark
     Coordinate simd_layout = GridDefaultSimd(Nd, vReal::Nsimd());
     Coordinate mpi_layout = GridDefaultMpi();
 
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
+    std::cout << GridLogMessage
+              << "======================================================================="
+                 "==========="
+              << std::endl;
     std::cout << GridLogMessage << "= Benchmarking a*x + y bandwidth" << std::endl;
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
+    std::cout << GridLogMessage
+              << "======================================================================="
+                 "==========="
+              << std::endl;
     std::cout << GridLogMessage << " L "
               << "\t\t"
               << "bytes"
@@ -229,7 +253,9 @@ class Benchmark
               << "Gflop/s"
              << "\t\t seconds"
               << "\t\tGB/s / node" << std::endl;
-    std::cout << GridLogMessage << "----------------------------------------------------------" << std::endl;
+    std::cout << GridLogMessage
+              << "----------------------------------------------------------"
+              << std::endl;
 
     // uint64_t NP;
     uint64_t NN;
@@ -242,7 +268,8 @@ class Benchmark
 
     for (int lat = 8; lat <= lmax; lat += 8)
     {
-      Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2], lat * mpi_layout[3]});
+      Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
+                            lat * mpi_layout[3]});
 
       int64_t vol = latt_size[0] * latt_size[1] * latt_size[2] * latt_size[3];
       GridCartesian Grid(latt_size, simd_layout, mpi_layout);
@@ -273,9 +300,10 @@ class Benchmark
       double flops = vol * Nvec * 2; // mul,add
      double bytes = 3.0 * vol * Nvec * sizeof(Real);
 
-      std::cout << GridLogMessage << std::setprecision(3)
-                << lat << "\t\t" << bytes << " \t\t" << bytes / time << "\t\t" << flops / time << "\t\t" << (stop - start) / 1000. / 1000.
-                << "\t\t" << bytes / time / NN << std::endl;
+      std::cout << GridLogMessage << std::setprecision(3) << lat << "\t\t" << bytes
+                << " \t\t" << bytes / time << "\t\t" << flops / time << "\t\t"
+                << (stop - start) / 1000. / 1000. << "\t\t" << bytes / time / NN
+                << std::endl;
     }
   };
@@ -287,9 +315,15 @@ class Benchmark
     Coordinate simd_layout = GridDefaultSimd(Nd, vComplexF::Nsimd());
     Coordinate mpi_layout = GridDefaultMpi();
 
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
+    std::cout << GridLogMessage
+              << "======================================================================="
+                 "==========="
+              << std::endl;
     std::cout << GridLogMessage << "= Benchmarking z = y*x SU(4) bandwidth" << std::endl;
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
+    std::cout << GridLogMessage
+              << "======================================================================="
+                 "==========="
+              << std::endl;
     std::cout << GridLogMessage << " L "
               << "\t\t"
               << "bytes"
@@ -299,7 +333,9 @@ class Benchmark
               << "Gflop/s"
               << "\t\t seconds"
               << "\t\tGB/s / node" << std::endl;
-    std::cout << GridLogMessage << "----------------------------------------------------------" << std::endl;
+    std::cout << GridLogMessage
+              << "----------------------------------------------------------"
+              << std::endl;
 
     uint64_t NN;
@@ -310,7 +346,8 @@ class Benchmark
 
     for (int lat = 8; lat <= lmax; lat += 8)
    {
-      Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2], lat * mpi_layout[3]});
+      Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
+                            lat * mpi_layout[3]});
 
      int64_t vol = latt_size[0] * latt_size[1] * latt_size[2] * latt_size[3];
       GridCartesian Grid(latt_size, simd_layout, mpi_layout);
@@ -337,9 +374,10 @@ class Benchmark
       double flops = vol * Nc4 * Nc4 * (6 + (Nc4 - 1) * 8); // mul,add
       double bytes = 3.0 * vol * Nc4 * Nc4 * 2 * sizeof(RealF);
 
-      std::cout << GridLogMessage << std::setprecision(3)
-                << lat << "\t\t" << bytes << " \t\t" << bytes / time << "\t\t" << flops / time << "\t\t" << (stop - start) / 1000. / 1000.
-                << "\t\t" << bytes / time / NN << std::endl;
+      std::cout << GridLogMessage << std::setprecision(3) << lat << "\t\t" << bytes
+                << " \t\t" << bytes / time << "\t\t" << flops / time << "\t\t"
+                << (stop - start) / 1000. / 1000. << "\t\t" << bytes / time / NN
+                << std::endl;
     }
   };
@@ -360,31 +398,41 @@ class Benchmark
     Coordinate mpi = GridDefaultMpi();
     assert(mpi.size() == 4);
     Coordinate local({L, L, L, L});
-    Coordinate latt4({local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]});
+    Coordinate latt4(
+        {local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]});
 
-    GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4,
-                                                            GridDefaultSimd(Nd, vComplex::Nsimd()),
-                                                            GridDefaultMpi());
+    GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid(
+        latt4, GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
 
     uint64_t NP = TmpGrid->RankCount();
     uint64_t NN = TmpGrid->NodeCount();
     NN_global = NN;
     uint64_t SHM = NP / NN;
 
     ///////// Welcome message ////////////
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
-    std::cout << GridLogMessage << "Benchmark DWF on " << L << "^4 local volume " << std::endl;
+    std::cout << GridLogMessage
+              << "======================================================================="
+                 "==========="
+              << std::endl;
+    std::cout << GridLogMessage << "Benchmark DWF on " << L << "^4 local volume "
+              << std::endl;
     std::cout << GridLogMessage << "* Nc : " << Nc << std::endl;
-    std::cout << GridLogMessage << "* Global volume : " << GridCmdVectorIntToString(latt4) << std::endl;
+    std::cout << GridLogMessage
+              << "* Global volume : " << GridCmdVectorIntToString(latt4) << std::endl;
     std::cout << GridLogMessage << "* Ls : " << Ls << std::endl;
     std::cout << GridLogMessage << "* ranks : " << NP << std::endl;
     std::cout << GridLogMessage << "* nodes : " << NN << std::endl;
     std::cout << GridLogMessage << "* ranks/node : " << SHM << std::endl;
-    std::cout << GridLogMessage << "* ranks geom : " << GridCmdVectorIntToString(mpi) << std::endl;
+    std::cout << GridLogMessage << "* ranks geom : " << GridCmdVectorIntToString(mpi)
+              << std::endl;
     std::cout << GridLogMessage << "* Using " << threads << " threads" << std::endl;
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
+    std::cout << GridLogMessage
+              << "======================================================================="
+                 "==========="
+              << std::endl;
 
     ///////// Lattice Init ////////////
-    GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
+    GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
+        latt4, GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
     GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
     GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
     GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
@@ -423,10 +471,14 @@ class Benchmark
     std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
 
     controls Cases[] = {
-        {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicyConcurrent},
-        {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicyConcurrent},
-        {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicySequential},
-        {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicySequential}};
+        {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsThenCompute,
+         CartesianCommunicator::CommunicatorPolicyConcurrent},
+        {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsAndCompute,
+         CartesianCommunicator::CommunicatorPolicyConcurrent},
+        {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsThenCompute,
+         CartesianCommunicator::CommunicatorPolicySequential},
+        {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsAndCompute,
+         CartesianCommunicator::CommunicatorPolicySequential}};
 
     for (int c = 0; c < num_cases; c++)
     {
@@ -435,7 +487,10 @@ class Benchmark
       WilsonKernelsStatic::Opt = Cases[c].Opt;
       CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
 
-      std::cout << GridLogMessage << "==================================================================================" << std::endl;
+      std::cout << GridLogMessage
+                << "==================================================================="
+                   "==============="
+                << std::endl;
       if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric)
         std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl;
       if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
@@ -443,7 +498,10 @@ class Benchmark
       if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
         std::cout << GridLogMessage << "* Using sequential Comms/Compute" << std::endl;
       std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
-      std::cout << GridLogMessage << "==================================================================================" << std::endl;
+      std::cout << GridLogMessage
+                << "==================================================================="
+                   "==============="
+                << std::endl;
 
       int nwarm = 10;
       double t0 = usecond();
@@ -458,7 +516,8 @@ class Benchmark
       FGrid->Broadcast(0, &ncall, sizeof(ncall));
 
-      // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
+      // std::cout << GridLogMessage << " Estimate " << ncall << " calls per
+      // second"<<std::endl;
       Dw.ZeroCounters();
 
       time_statistics timestat;
@@ -483,7 +542,8 @@ class Benchmark
 #if 0
       double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + Nd*Nc*Ns + Nd*Nc*Ns*2;
 #else
-      double fps = Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2;
+      double fps =
+          Nc * (6 + (Nc - 1) * 8) * Ns * Nd + 2 * Nd * Nc * Ns + 2 * Nd * Nc * Ns * 2;
 #endif
       double flops = (fps * volume) / 2;
       double mf_hi, mf_lo, mf_err;
@@ -505,14 +565,25 @@ class Benchmark
           mflops_worst = mflops;
 
        std::cout << GridLogMessage << "Deo FlopsPerSite is " << fps << std::endl;
-        std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s = " << mflops << " (" << mf_err << ") " << mf_lo << "-" << mf_hi << std::endl;
-        std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s per rank " << mflops / NP << std::endl;
-        std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s per node " << mflops / NN << std::endl;
+        std::cout << GridLogMessage << std::fixed << std::setprecision(1)
+                  << "Deo mflop/s = " << mflops << " (" << mf_err << ") " << mf_lo
+                  << "-" << mf_hi << std::endl;
+        std::cout << GridLogMessage << std::fixed << std::setprecision(1)
+                  << "Deo mflop/s per rank " << mflops / NP << std::endl;
+        std::cout << GridLogMessage << std::fixed << std::setprecision(1)
+                  << "Deo mflop/s per node " << mflops / NN << std::endl;
       }
 
-      std::cout << GridLogMessage << "==================================================================================" << std::endl;
-      std::cout << GridLogMessage << L << "^4 x " << Ls << " Deo Best mflop/s = " << mflops_best << " ; " << mflops_best / NN << " per node " << std::endl;
-      std::cout << GridLogMessage << L << "^4 x " << Ls << " Deo Worst mflop/s = " << mflops_worst << " ; " << mflops_worst / NN << " per node " << std::endl;
+      std::cout << GridLogMessage
+                << "====================================================================="
+                   "============="
+                << std::endl;
+      std::cout << GridLogMessage << L << "^4 x " << Ls
+                << " Deo Best mflop/s = " << mflops_best << " ; "
+                << mflops_best / NN << " per node " << std::endl;
+      std::cout << GridLogMessage << L << "^4 x " << Ls
+                << " Deo Worst mflop/s = " << mflops_worst << " ; "
+                << mflops_worst / NN << " per node " << std::endl;
      std::cout << GridLogMessage << fmt << std::endl;
       std::cout << GridLogMessage;
 
@@ -521,7 +592,10 @@ class Benchmark
         std::cout << mflops_all[i] / NN << " ; ";
       }
       std::cout << std::endl;
-      std::cout << GridLogMessage << "==================================================================================" << std::endl;
+      std::cout << GridLogMessage
+                << "====================================================================="
+                   "============="
+                << std::endl;
     }
     return mflops_best;
   }
@@ -540,29 +614,39 @@ class Benchmark
     Coordinate mpi = GridDefaultMpi();
     assert(mpi.size() == 4);
     Coordinate local({L, L, L, L});
-    Coordinate latt4({local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]});
+    Coordinate latt4(
+        {local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]});
 
-    GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4,
-                                                            GridDefaultSimd(Nd, vComplex::Nsimd()),
-                                                            GridDefaultMpi());
+    GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid(
+        latt4, GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
 
     uint64_t NP = TmpGrid->RankCount();
     uint64_t NN = TmpGrid->NodeCount();
     NN_global = NN;
     uint64_t SHM = NP / NN;
 
     ///////// Welcome message ////////////
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
-    std::cout << GridLogMessage << "Benchmark ImprovedStaggered on " << L << "^4 local volume " << std::endl;
-    std::cout << GridLogMessage << "* Global volume : " << GridCmdVectorIntToString(latt4) << std::endl;
+    std::cout << GridLogMessage
+              << "======================================================================="
+                 "==========="
+              << std::endl;
+    std::cout << GridLogMessage << "Benchmark ImprovedStaggered on " << L
+              << "^4 local volume " << std::endl;
+    std::cout << GridLogMessage
+              << "* Global volume : " << GridCmdVectorIntToString(latt4) << std::endl;
     std::cout << GridLogMessage << "* ranks : " << NP << std::endl;
     std::cout << GridLogMessage << "* nodes : " << NN << std::endl;
     std::cout << GridLogMessage << "* ranks/node : " << SHM << std::endl;
-    std::cout << GridLogMessage << "* ranks geom : " << GridCmdVectorIntToString(mpi) << std::endl;
+    std::cout << GridLogMessage << "* ranks geom : " << GridCmdVectorIntToString(mpi)
+              << std::endl;
     std::cout << GridLogMessage << "* Using " << threads << " threads" << std::endl;
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
+    std::cout << GridLogMessage
+              << "======================================================================="
+                 "==========="
+              << std::endl;
 
     ///////// Lattice Init ////////////
-    GridCartesian *FGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
+    GridCartesian *FGrid = SpaceTimeGrid::makeFourDimGrid(
+        latt4, GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
     GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
 
     ///////// RNG Init ////////////
@@ -604,10 +688,14 @@ class Benchmark
     std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
 
     controls Cases[] = {
-        {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicyConcurrent},
-        {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicyConcurrent},
-        {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicySequential},
-        {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicySequential}};
+        {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsThenCompute,
+         CartesianCommunicator::CommunicatorPolicyConcurrent},
+        {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsAndCompute,
+         CartesianCommunicator::CommunicatorPolicyConcurrent},
+        {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsThenCompute,
+         CartesianCommunicator::CommunicatorPolicySequential},
+        {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsAndCompute,
+         CartesianCommunicator::CommunicatorPolicySequential}};
 
     for (int c = 0; c < num_cases; c++)
     {
@@ -616,15 +704,22 @@ class Benchmark
       StaggeredKernelsStatic::Opt = Cases[c].Opt;
      CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
 
-      std::cout << GridLogMessage << "==================================================================================" << std::endl;
+      std::cout << GridLogMessage
+                << "==================================================================="
+                   "==============="
+                << std::endl;
       if (StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric)
-        std::cout << GridLogMessage << "* Using GENERIC Nc StaggeredKernels" << std::endl;
+        std::cout << GridLogMessage << "* Using GENERIC Nc StaggeredKernels"
+                  << std::endl;
       if (StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute)
         std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl;
       if (StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsThenCompute)
        std::cout << GridLogMessage << "* Using sequential Comms/Compute" << std::endl;
       std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
-      std::cout << GridLogMessage << "==================================================================================" << std::endl;
+      std::cout << GridLogMessage
+                << "==================================================================="
+                   "==============="
+                << std::endl;
 
       int nwarm = 10;
       double t0 = usecond();
@@ -639,7 +734,8 @@ class Benchmark
       FGrid->Broadcast(0, &ncall, sizeof(ncall));
 
-      // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
+      // std::cout << GridLogMessage << " Estimate " << ncall << " calls per
+      // second"<<std::endl;
       Ds.ZeroCounters();
 
      time_statistics timestat;
@@ -675,14 +771,25 @@ class Benchmark
         if (mflops < mflops_worst)
           mflops_worst = mflops;
 
-        std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s = " << mflops << " (" << mf_err << ") " << mf_lo << "-" << mf_hi << std::endl;
-        std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s per rank " << mflops / NP << std::endl;
-        std::cout << GridLogMessage << std::fixed << std::setprecision(1) << "Deo mflop/s per node " << mflops / NN << std::endl;
+        std::cout << GridLogMessage << std::fixed << std::setprecision(1)
+                  << "Deo mflop/s = " << mflops << " (" << mf_err << ") " << mf_lo
+                  << "-" << mf_hi << std::endl;
+        std::cout << GridLogMessage << std::fixed << std::setprecision(1)
+                  << "Deo mflop/s per rank " << mflops / NP << std::endl;
+        std::cout << GridLogMessage << std::fixed << std::setprecision(1)
+                  << "Deo mflop/s per node " << mflops / NN << std::endl;
       }
 
-      std::cout << GridLogMessage << "==================================================================================" << std::endl;
-      std::cout << GridLogMessage << L << "^4 Deo Best mflop/s = " << mflops_best << " ; " << mflops_best / NN << " per node " << std::endl;
-      std::cout << GridLogMessage << L << "^4 Deo Worst mflop/s = " << mflops_worst << " ; " << mflops_worst / NN << " per node " << std::endl;
+      std::cout << GridLogMessage
+                << "====================================================================="
+                   "============="
+                << std::endl;
+      std::cout << GridLogMessage << L
+                << "^4 Deo Best mflop/s = " << mflops_best << " ; "
+                << mflops_best / NN << " per node " << std::endl;
+      std::cout << GridLogMessage << L
+                << "^4 Deo Worst mflop/s = " << mflops_worst << " ; "
+                << mflops_worst / NN << " per node " << std::endl;
       std::cout << GridLogMessage << fmt << std::endl;
      std::cout << GridLogMessage;
 
@@ -692,7 +799,10 @@ class Benchmark
       }
       std::cout << std::endl;
     }
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
+    std::cout << GridLogMessage
+              << "======================================================================="
+                 "==========="
+              << std::endl;
     return mflops_best;
   }
 };
@@ -701,7 +811,8 @@ int main(int argc, char **argv)
 {
   Grid_init(&argc, &argv);
 
-  CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
+  CartesianCommunicator::SetCommunicatorPolicy(
+      CartesianCommunicator::CommunicatorPolicySequential);
 #ifdef KNL
   LebesgueOrder::Block = std::vector<int>({8, 2, 2, 2});
 #else
@@ -709,9 +820,11 @@ int main(int argc, char **argv)
 #endif
   Benchmark::Decomposition();
 
-  int do_su4 = 1;
-  int do_memory = 1;
+  int do_su4 = 0;
+  int do_memory = 0;
   int do_comms = 1;
+  int do_flops = 0;
+  int Ls = 1;
 
   int sel = 4;
   std::vector<int> L_list({8, 12, 16, 24, 32});
@@ -721,84 +834,170 @@ int main(int argc, char **argv)
   std::vector<double> dwf4;
   std::vector<double> staggered;
 
-  int Ls = 1;
-  std::cout << GridLogMessage << "==================================================================================" << std::endl;
-  std::cout << GridLogMessage << " Wilson dslash 4D vectorised" << std::endl;
-  std::cout << GridLogMessage << "==================================================================================" << std::endl;
-  for (int l = 0; l < L_list.size(); l++)
-  {
-    wilson.push_back(Benchmark::DWF(Ls, L_list[l]));
-  }
-
-  Ls = 12;
-  std::cout << GridLogMessage << "==================================================================================" << std::endl;
-  std::cout << GridLogMessage << " Domain wall dslash 4D vectorised" << std::endl;
-  std::cout << GridLogMessage << "==================================================================================" << std::endl;
-  for (int l = 0; l < L_list.size(); l++)
-  {
-    double result = Benchmark::DWF(Ls, L_list[l]);
-    dwf4.push_back(result);
-  }
-
-  std::cout << GridLogMessage << "==================================================================================" << std::endl;
-  std::cout << GridLogMessage << " Improved Staggered dslash 4D vectorised" << std::endl;
-  std::cout << GridLogMessage << "==================================================================================" << std::endl;
-  for (int l = 0; l < L_list.size(); l++)
-  {
-    double result = Benchmark::Staggered(L_list[l]);
-    staggered.push_back(result);
-  }
-
-  std::cout << GridLogMessage << "==================================================================================" << std::endl;
-  std::cout << GridLogMessage << " Summary table Ls=" << Ls << std::endl;
-  std::cout << GridLogMessage << "==================================================================================" << std::endl;
-  std::cout << GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\t Staggered" << std::endl;
-  for (int l = 0; l < L_list.size(); l++)
-  {
-    std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] << " \t\t " << dwf4[l] << " \t\t " << staggered[l] << std::endl;
-  }
-  std::cout << GridLogMessage << "==================================================================================" << std::endl;
+  if (do_flops)
+  {
+    Ls = 1;
+    std::cout
+        << GridLogMessage
+        << "========================================================================="
+           "========="
+        << std::endl;
+    std::cout << GridLogMessage << " Wilson dslash 4D vectorised" << std::endl;
+    std::cout
+        << GridLogMessage
+        << "========================================================================="
+           "========="
+        << std::endl;
+    for (int l = 0; l < L_list.size(); l++)
+    {
+      wilson.push_back(Benchmark::DWF(Ls, L_list[l]));
+    }
+
+    Ls = 12;
+    std::cout
+        << GridLogMessage
+        << "========================================================================="
+           "========="
+        << std::endl;
+    std::cout << GridLogMessage << " Domain wall dslash 4D vectorised" << std::endl;
+    std::cout
+        << GridLogMessage
+        << "========================================================================="
+           "========="
+        << std::endl;
+    for (int l = 0; l < L_list.size(); l++)
+    {
+      double result = Benchmark::DWF(Ls, L_list[l]);
+      dwf4.push_back(result);
+    }
+
+    std::cout
+        << GridLogMessage
+        << "========================================================================="
+           "========="
+        << std::endl;
+    std::cout << GridLogMessage << " Improved Staggered dslash 4D vectorised"
+              << std::endl;
+    std::cout
+        << GridLogMessage
+        << "========================================================================="
+           "========="
+        << std::endl;
+    for (int l = 0; l < L_list.size(); l++)
+    {
+      double result = Benchmark::Staggered(L_list[l]);
+      staggered.push_back(result);
+    }
+
+    std::cout
+        << GridLogMessage
+        << "========================================================================="
+           "========="
+        << std::endl;
+    std::cout << GridLogMessage << " Summary table Ls=" << Ls << std::endl;
+    std::cout
+        << GridLogMessage
+        << "========================================================================="
+           "========="
+        << std::endl;
+    std::cout << GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\t Staggered" << std::endl;
+    for (int l = 0; l < L_list.size(); l++)
+    {
+      std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] << " \t\t "
+                << dwf4[l] << " \t\t " << staggered[l] << std::endl;
+    }
+    std::cout
+        << GridLogMessage
+        << "========================================================================="
+           "========="
+        << std::endl;
+  }
 
   int NN = NN_global;
 
   if (do_memory)
   {
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
+    std::cout << GridLogMessage
+              << "======================================================================="
+                 "==========="
+              << std::endl;
     std::cout << GridLogMessage << " Memory benchmark " << std::endl;
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
+    std::cout << GridLogMessage
+              << "======================================================================="
+                 "==========="
+              << std::endl;
    Benchmark::Memory();
   }
 
   if (do_su4)
  {
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
+    std::cout << GridLogMessage
+              << "======================================================================="
+                 "==========="
+              << std::endl;
     std::cout << GridLogMessage << " SU(4) benchmark " << std::endl;
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
+    std::cout << GridLogMessage
+              << "======================================================================="
+                 "==========="
+              << std::endl;
     Benchmark::SU4();
   }
 
   if (do_comms)
   {
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
+    std::cout << GridLogMessage
+              << "======================================================================="
+                 "==========="
+              << std::endl;
     std::cout << GridLogMessage << " Communications benchmark " << std::endl;
-    std::cout << GridLogMessage << "==================================================================================" << std::endl;
+    std::cout << GridLogMessage
+              << "======================================================================="
+                 "==========="
+              << std::endl;
     Benchmark::Comms();
   }
 
-  std::cout << GridLogMessage << "==================================================================================" << std::endl;
-  std::cout << GridLogMessage << " Per Node Summary table Ls=" << Ls << std::endl;
-  std::cout << GridLogMessage << "==================================================================================" << std::endl;
-  std::cout << GridLogMessage << " L \t\t Wilson\t\t DWF4\t\t Staggered " << std::endl;
-  for (int l = 0; l < L_list.size(); l++)
-  {
-    std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] / NN << " \t " << dwf4[l] / NN << " \t " << staggered[l] / NN << std::endl;
-  }
-  std::cout << GridLogMessage << "==================================================================================" << std::endl;
-
-  std::cout << GridLogMessage << "==================================================================================" << std::endl;
-  std::cout << GridLogMessage << " Comparison point result: " << 0.5 * (dwf4[sel] + dwf4[selm1]) / NN << " Mflop/s per node" << std::endl;
-  std::cout << GridLogMessage << " Comparison point is 0.5*(" << dwf4[sel] / NN << "+" << dwf4[selm1] / NN << ") " << std::endl;
-  std::cout << std::setprecision(3);
-  std::cout << GridLogMessage << "==================================================================================" << std::endl;
+  if (do_flops)
+  {
+    std::cout
+        << GridLogMessage
+        << "========================================================================="
+           "========="
+        << std::endl;
+    std::cout << GridLogMessage << " Per Node Summary table Ls=" << Ls << std::endl;
+    std::cout
+        << GridLogMessage
+        << "========================================================================="
+           "========="
+        << std::endl;
+    std::cout << GridLogMessage << " L \t\t Wilson\t\t DWF4\t\t Staggered " << std::endl;
+    for (int l = 0; l < L_list.size(); l++)
+    {
+      std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] / NN << " \t "
+                << dwf4[l] / NN << " \t " << staggered[l] / NN << std::endl;
+    }
+    std::cout
+        << GridLogMessage
+        << "========================================================================="
+           "========="
+        << std::endl;
+
+    std::cout
+        << GridLogMessage
+        << "========================================================================="
+           "========="
+        << std::endl;
+    std::cout << GridLogMessage
+              << " Comparison point result: " << 0.5 * (dwf4[sel] + dwf4[selm1]) / NN
+              << " Mflop/s per node" << std::endl;
+    std::cout << GridLogMessage << " Comparison point is 0.5*(" << dwf4[sel] / NN << "+"
+              << dwf4[selm1] / NN << ") " << std::endl;
+    std::cout << std::setprecision(3);
+    std::cout
+        << GridLogMessage
+        << "========================================================================="
+           "========="
+        << std::endl;
+  }
 
   Grid_finalize();
 }

View File

@@ -32,23 +32,13 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 #ifdef HAVE_LIME
 
 using namespace Grid;
 
-std::string filestem(const int l)
-{
-  return "iobench_l" + std::to_string(l);
-}
+std::string filestem(const int l) { return "iobench_l" + std::to_string(l); }
 
-int vol(const int i)
-{
-  return BENCH_IO_LMIN + 2 * i;
-}
+int vol(const int i) { return BENCH_IO_LMIN + 2 * i; }
 
-int volInd(const int l)
-{
-  return (l - BENCH_IO_LMIN) / 2;
-}
+int volInd(const int l) { return (l - BENCH_IO_LMIN) / 2; }
 
-template <typename Mat>
-void stats(Mat &mean, Mat &stdDev, const std::vector<Mat> &data)
+template <typename Mat> void stats(Mat &mean, Mat &stdDev, const std::vector<Mat> &data)
 {
   auto nr = data[0].rows(), nc = data[0].cols();
   Eigen::MatrixXd sqSum(nr, nc);
@@ -66,11 +56,11 @@ void stats(Mat &mean, Mat &stdDev, const std::vector<Mat> &data)
   mean /= n;
 }
 
-#define grid_printf(...) \
-  {                      \
-    char _buf[1024];     \
-    sprintf(_buf, __VA_ARGS__); \
-    MSG << _buf;         \
-  }
+#define grid_printf(...)          \
+  {                               \
+    char _buf[1024];              \
+    sprintf(_buf, __VA_ARGS__);   \
+    MSG << _buf;                  \
+  }
 
 enum
@@ -173,47 +163,49 @@ int main(int argc, char **argv)
   MSG << "SUMMARY" << std::endl;
   MSG << BIGSEP << std::endl;
   MSG << "Summary of individual results (all results in MB/s)." << std::endl;
-  MSG << "Every second colum gives the standard deviation of the previous column." << std::endl;
+  MSG << "Every second colum gives the standard deviation of the previous column."
+      << std::endl;
   MSG << std::endl;
-  grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n",
-              "L", "std read", "std dev", "std write", "std dev",
-              "Grid read", "std dev", "Grid write", "std dev");
+  grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n", "L", "std read", "std dev",
+              "std write", "std dev", "Grid read", "std dev", "Grid write", "std dev");
   for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
   {
-    grid_printf("%4d %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n",
-                l, mean(volInd(l), sRead), stdDev(volInd(l), sRead),
-                mean(volInd(l), sWrite), stdDev(volInd(l), sWrite),
-                mean(volInd(l), gRead), stdDev(volInd(l), gRead),
-                mean(volInd(l), gWrite), stdDev(volInd(l), gWrite));
+    grid_printf("%4d %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", l,
+                mean(volInd(l), sRead), stdDev(volInd(l), sRead), mean(volInd(l), sWrite),
+                stdDev(volInd(l), sWrite), mean(volInd(l), gRead),
+                stdDev(volInd(l), gRead), mean(volInd(l), gWrite),
+                stdDev(volInd(l), gWrite));
   }
   MSG << std::endl;
-  MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)" << std::endl;
+  MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)"
+      << std::endl;
   MSG << std::endl;
-  grid_printf("%4s %12s %12s %12s %12s\n",
-              "L", "std read", "std write", "Grid read", "Grid write");
+  grid_printf("%4s %12s %12s %12s %12s\n", "L", "std read", "std write", "Grid read",
+              "Grid write");
   for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
   {
-    grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n",
-                l, rob(volInd(l), sRead), rob(volInd(l), sWrite),
-                rob(volInd(l), gRead), rob(volInd(l), gWrite));
+    grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n", l, rob(volInd(l), sRead),
+                rob(volInd(l), sWrite), rob(volInd(l), gRead), rob(volInd(l), gWrite));
   }
   MSG << std::endl;
-  MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX << "^4 (all results in MB/s)." << std::endl;
-  MSG << "Every second colum gives the standard deviation of the previous column." << std::endl;
+  MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX
+      << "^4 (all results in MB/s)." << std::endl;
+  MSG << "Every second colum gives the standard deviation of the previous column."
+      << std::endl;
   MSG << std::endl;
-  grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n",
-              "std read", "std dev", "std write", "std dev",
-              "Grid read", "std dev", "Grid write", "std dev");
-  grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n",
-              avMean(sRead), avStdDev(sRead), avMean(sWrite), avStdDev(sWrite),
-              avMean(gRead), avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
+  grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n", "std read", "std dev",
+              "std write", "std dev", "Grid read", "std dev", "Grid write", "std dev");
+  grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", avMean(sRead),
+              avStdDev(sRead), avMean(sWrite), avStdDev(sWrite), avMean(gRead),
+              avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
   MSG << std::endl;
-  MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)" << std::endl;
+  MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)"
+      << std::endl;
   MSG << std::endl;
-  grid_printf("%12s %12s %12s %12s\n",
-              "std read", "std write", "Grid read", "Grid write");
-  grid_printf("%12.1f %12.1f %12.1f %12.1f\n",
-              avRob(sRead), avRob(sWrite), avRob(gRead), avRob(gWrite));
+  grid_printf("%12s %12s %12s %12s\n", "std read", "std write", "Grid read",
+              "Grid write");
+  grid_printf("%12.1f %12.1f %12.1f %12.1f\n", avRob(sRead), avRob(sWrite), avRob(gRead),
+              avRob(gWrite));
 
   Grid_finalize();

View File

@@ -20,9 +20,9 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 #include <Grid/Grid.h>
 
 #define MSG std::cout << GridLogMessage
-#define SEP \
+#define SEP                                                                              \
   "-----------------------------------------------------------------------------"
-#define BIGSEP \
+#define BIGSEP                                                                           \
   "============================================================================="
 
 #ifdef HAVE_LIME
@ -36,16 +36,15 @@ namespace Grid
// AP 06/10/2020: Standard C version in case one is suspicious of the C++ API // AP 06/10/2020: Standard C version in case one is suspicious of the C++ API
// //
// template <typename Field> // template <typename Field> void stdWrite(const std::string filestem, Field &vec)
// void stdWrite(const std::string filestem, Field &vec)
// { // {
// std::string rankStr = std::to_string(vec.Grid()->ThisRank()); // std::string rankStr = std::to_string(vec.Grid()->ThisRank());
// std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "wb"); // std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "wb");
// size_t size; // size_t size;
// uint32_t crc; // uint32_t crc;
// GridStopWatch ioWatch, crcWatch; // GridStopWatch ioWatch, crcWatch;
// size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object); // size = vec.Grid()->lSites() * sizeof(typename Field::scalar_object);
// autoView(vec_v, vec, CpuRead); // autoView(vec_v, vec, CpuRead);
// crcWatch.Start(); // crcWatch.Start();
// crc = GridChecksum::crc32(vec_v.cpu_ptr, size); // crc = GridChecksum::crc32(vec_v.cpu_ptr, size);
@ -53,36 +52,39 @@ namespace Grid
// crcWatch.Stop(); // crcWatch.Stop();
// MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl; // MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
// ioWatch.Start(); // ioWatch.Start();
// std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file); // std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object),
// vec.Grid()->lSites(), file);
// ioWatch.Stop(); // ioWatch.Stop();
// std::fclose(file); // std::fclose(file);
// size *= vec.Grid()->ProcessorCount(); // size *= vec.Grid()->ProcessorCount();
// auto &p = BinaryIO::lastPerf; // auto &p = BinaryIO::lastPerf;
// p.size = size; // p.size = size;
// p.time = ioWatch.useconds(); // p.time = ioWatch.useconds();
// p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6); // p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
// MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() // MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
// << ", " << p.mbytesPerSecond << " MB/s" << std::endl; // << ",
// "
// << p.mbytesPerSecond << " MB/s" << std::endl;
// MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl; // MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
// } // }
//
// template <typename Field> // template <typename Field> void stdRead(Field &vec, const std::string filestem)
// void stdRead(Field &vec, const std::string filestem)
// { // {
// std::string rankStr = std::to_string(vec.Grid()->ThisRank()); // std::string rankStr = std::to_string(vec.Grid()->ThisRank());
// std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "rb"); // std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "rb");
// size_t size; // size_t size;
// uint32_t crcRead, crcData; // uint32_t crcRead, crcData;
// GridStopWatch ioWatch, crcWatch; // GridStopWatch ioWatch, crcWatch;
// size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object); // size = vec.Grid()->lSites() * sizeof(typename Field::scalar_object);
// crcWatch.Start(); // crcWatch.Start();
// std::fread(&crcRead, sizeof(uint32_t), 1, file); // std::fread(&crcRead, sizeof(uint32_t), 1, file);
// crcWatch.Stop(); // crcWatch.Stop();
// { // {
// autoView(vec_v, vec, CpuWrite); // autoView(vec_v, vec, CpuWrite);
// ioWatch.Start(); // ioWatch.Start();
// std::fread(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file); // std::fread(vec_v.cpu_ptr, sizeof(typename Field::scalar_object),
// vec.Grid()->lSites(), file);
// ioWatch.Stop(); // ioWatch.Stop();
// std::fclose(file); // std::fclose(file);
// } // }
@ -96,19 +98,19 @@ namespace Grid
// assert(crcData == crcRead); // assert(crcData == crcRead);
// size *= vec.Grid()->ProcessorCount(); // size *= vec.Grid()->ProcessorCount();
// auto &p = BinaryIO::lastPerf; // auto &p = BinaryIO::lastPerf;
// p.size = size; // p.size = size;
// p.time = ioWatch.useconds(); // p.time = ioWatch.useconds();
// p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6); // p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
// MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() // MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() << ", "
// << ", " << p.mbytesPerSecond << " MB/s" << std::endl; // << p.mbytesPerSecond << " MB/s" << std::endl;
// MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl; // MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
// } // }
template <typename Field> template <typename Field> void stdWrite(const std::string filestem, Field &vec)
void stdWrite(const std::string filestem, Field &vec)
{ {
std::string rankStr = std::to_string(vec.Grid()->ThisRank()); std::string rankStr = std::to_string(vec.Grid()->ThisRank());
std::ofstream file(filestem + "." + rankStr + ".bin", std::ios::out | std::ios::binary); std::ofstream file(filestem + "." + rankStr + ".bin",
std::ios::out | std::ios::binary);
size_t size, sizec; size_t size, sizec;
uint32_t crc; uint32_t crc;
GridStopWatch ioWatch, crcWatch; GridStopWatch ioWatch, crcWatch;
@ -130,16 +132,16 @@ namespace Grid
p.size = size; p.size = size;
p.time = ioWatch.useconds(); p.time = ioWatch.useconds();
p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6); p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() << ", "
<< ", " << p.mbytesPerSecond << " MB/s" << std::endl; << p.mbytesPerSecond << " MB/s" << std::endl;
MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl; MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
} }
template <typename Field> template <typename Field> void stdRead(Field &vec, const std::string filestem)
void stdRead(Field &vec, const std::string filestem)
{ {
std::string rankStr = std::to_string(vec.Grid()->ThisRank()); std::string rankStr = std::to_string(vec.Grid()->ThisRank());
std::ifstream file(filestem + "." + rankStr + ".bin", std::ios::in | std::ios::binary); std::ifstream file(filestem + "." + rankStr + ".bin",
std::ios::in | std::ios::binary);
size_t size, sizec; size_t size, sizec;
uint32_t crcRead, crcData; uint32_t crcRead, crcData;
GridStopWatch ioWatch, crcWatch; GridStopWatch ioWatch, crcWatch;
@ -168,13 +170,12 @@ namespace Grid
p.size = size; p.size = size;
p.time = ioWatch.useconds(); p.time = ioWatch.useconds();
p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6); p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() << ", "
<< ", " << p.mbytesPerSecond << " MB/s" << std::endl; << p.mbytesPerSecond << " MB/s" << std::endl;
MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl; MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
} }
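For reference, `stdWrite` and `stdRead` together implement a per-rank raw binary round trip: one `<filestem>.<rank>.bin` file per MPI rank, with a CRC32 stored alongside the data. A minimal driver sketch, assuming the helpers above are in scope and Grid is built with MPI (the `bench_io` filestem is arbitrary):

```cpp
#include <Grid/Grid.h>

using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);
  GridCartesian grid(GridDefaultLatt(),
                     GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
  LatticeFermionF vec(&grid); // field written out and read back
  vec = Zero();               // deterministic content for the round trip
  stdWrite("bench_io", vec);  // writes one bench_io.<rank>.bin per rank
  stdRead(vec, "bench_io");   // reads it back and checks the stored CRC32
  Grid_finalize();
  return 0;
}
```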
template <typename Field> template <typename Field> void limeWrite(const std::string filestem, Field &vec)
void limeWrite(const std::string filestem, Field &vec)
{ {
emptyUserRecord record; emptyUserRecord record;
ScidacWriter binWriter(vec.Grid()->IsBoss()); ScidacWriter binWriter(vec.Grid()->IsBoss());
@ -184,8 +185,7 @@ namespace Grid
binWriter.close(); binWriter.close();
} }
template <typename Field> template <typename Field> void limeRead(Field &vec, const std::string filestem)
void limeRead(Field &vec, const std::string filestem)
{ {
emptyUserRecord record; emptyUserRecord record;
ScidacReader binReader; ScidacReader binReader;
@ -225,12 +225,13 @@ namespace Grid
template <typename Field> template <typename Field>
void writeBenchmark(const Coordinate &latt, const std::string filename, void writeBenchmark(const Coordinate &latt, const std::string filename,
const WriterFn<Field> &write, const WriterFn<Field> &write, const unsigned int Ls = 1,
const unsigned int Ls = 1, const bool rb = false) const bool rb = false)
{ {
auto mpi = GridDefaultMpi(); auto mpi = GridDefaultMpi();
auto simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd()); auto simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd());
std::shared_ptr<GridCartesian> gBasePt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi)); std::shared_ptr<GridCartesian> gBasePt(
SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi));
std::shared_ptr<GridBase> gPt; std::shared_ptr<GridBase> gPt;
std::random_device rd; std::random_device rd;
@ -251,12 +252,13 @@ namespace Grid
template <typename Field> template <typename Field>
void readBenchmark(const Coordinate &latt, const std::string filename, void readBenchmark(const Coordinate &latt, const std::string filename,
const ReaderFn<Field> &read, const ReaderFn<Field> &read, const unsigned int Ls = 1,
const unsigned int Ls = 1, const bool rb = false) const bool rb = false)
{ {
auto mpi = GridDefaultMpi(); auto mpi = GridDefaultMpi();
auto simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd()); auto simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd());
std::shared_ptr<GridCartesian> gBasePt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi)); std::shared_ptr<GridCartesian> gBasePt(
SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi));
std::shared_ptr<GridBase> gPt; std::shared_ptr<GridBase> gPt;
makeGrid(gPt, gBasePt, Ls, rb); makeGrid(gPt, gBasePt, Ls, rb);
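For context, `writeBenchmark` and `readBenchmark` wrap any writer or reader function into a timed run on a freshly constructed grid. A hypothetical driver sketch, assuming `WriterFn`/`ReaderFn` match the helper signatures above (the `bench_io` name and the 16^4 volume are arbitrary):

```cpp
// Benchmark Lime I/O on a 16^4 lattice using the helpers defined above.
Coordinate latt({16, 16, 16, 16});
writeBenchmark<LatticeFermionF>(latt, "bench_io", limeWrite<LatticeFermionF>);
readBenchmark<LatticeFermionF>(latt, "bench_io", limeRead<LatticeFermionF>);
```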


@ -34,8 +34,7 @@ struct time_statistics
mean = sum / v.size(); mean = sum / v.size();
std::vector<double> diff(v.size()); std::vector<double> diff(v.size());
std::transform(v.begin(), v.end(), diff.begin(), [=](double x) std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
{ return x - mean; });
double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0); double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
err = std::sqrt(sq_sum / (v.size() * (v.size() - 1))); err = std::sqrt(sq_sum / (v.size() * (v.size() - 1)));
@ -50,8 +49,7 @@ void header()
std::cout << GridLogMessage << " L " std::cout << GridLogMessage << " L "
<< "\t" << "\t"
<< " Ls " << " Ls "
<< "\t" << "\t" << std::setw(11) << "bytes\t\t"
<< std::setw(11) << "bytes\t\t"
<< "MB/s uni" << "MB/s uni"
<< "\t" << "\t"
<< "MB/s bidi" << std::endl; << "MB/s bidi" << std::endl;
@ -64,7 +62,8 @@ int main(int argc, char **argv)
Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd()); Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
Coordinate mpi_layout = GridDefaultMpi(); Coordinate mpi_layout = GridDefaultMpi();
int threads = GridThread::GetThreads(); int threads = GridThread::GetThreads();
std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl; std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads"
<< std::endl;
int Nloop = 250; int Nloop = 250;
int nmu = 0; int nmu = 0;
@ -73,13 +72,21 @@ int main(int argc, char **argv)
if (mpi_layout[mu] > 1) if (mpi_layout[mu] > 1)
nmu++; nmu++;
std::cout << GridLogMessage << "Number of iterations to average: " << Nloop << std::endl; std::cout << GridLogMessage << "Number of iterations to average: " << Nloop
<< std::endl;
std::vector<double> t_time(Nloop); std::vector<double> t_time(Nloop);
// time_statistics timestat; // time_statistics timestat;
std::cout << GridLogMessage << "====================================================================================================" << std::endl; std::cout << GridLogMessage
std::cout << GridLogMessage << "= Benchmarking sequential halo exchange from host memory " << std::endl; << "========================================================================="
std::cout << GridLogMessage << "====================================================================================================" << std::endl; "==========================="
<< std::endl;
std::cout << GridLogMessage
<< "= Benchmarking sequential halo exchange from host memory " << std::endl;
std::cout << GridLogMessage
<< "========================================================================="
"==========================="
<< std::endl;
header(); header();
for (int lat = 8; lat <= maxlat; lat += 4) for (int lat = 8; lat <= maxlat; lat += 4)
@ -87,9 +94,7 @@ int main(int argc, char **argv)
for (int Ls = 8; Ls <= 8; Ls *= 2) for (int Ls = 8; Ls <= 8; Ls *= 2)
{ {
Coordinate latt_size({lat * mpi_layout[0], Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
lat * mpi_layout[1],
lat * mpi_layout[2],
lat * mpi_layout[3]}); lat * mpi_layout[3]});
GridCartesian Grid(latt_size, simd_layout, mpi_layout); GridCartesian Grid(latt_size, simd_layout, mpi_layout);
@ -127,22 +132,16 @@ int main(int argc, char **argv)
{ {
std::vector<CommsRequest_t> requests; std::vector<CommsRequest_t> requests;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
Grid.SendToRecvFrom((void *)&xbuf[mu][0], Grid.SendToRecvFrom((void *)&xbuf[mu][0], xmit_to_rank,
xmit_to_rank, (void *)&rbuf[mu][0], recv_from_rank, bytes);
(void *)&rbuf[mu][0],
recv_from_rank,
bytes);
} }
comm_proc = mpi_layout[mu] - 1; comm_proc = mpi_layout[mu] - 1;
{ {
std::vector<CommsRequest_t> requests; std::vector<CommsRequest_t> requests;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0], Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0], xmit_to_rank,
xmit_to_rank, (void *)&rbuf[mu + 4][0], recv_from_rank, bytes);
(void *)&rbuf[mu + 4][0],
recv_from_rank,
bytes);
} }
} }
Grid.Barrier(); Grid.Barrier();
@ -154,17 +153,24 @@ int main(int argc, char **argv)
double bidibytes = xbytes + rbytes; double bidibytes = xbytes + rbytes;
std::cout << GridLogMessage << std::setw(4) << lat << "\t" << Ls << "\t" std::cout << GridLogMessage << std::setw(4) << lat << "\t" << Ls << "\t"
<< std::setw(11) << bytes << std::fixed << std::setprecision(1) << std::setw(7) << " " << std::setw(11) << bytes << std::fixed << std::setprecision(1)
<< std::right << xbytes / mean << " " << std::setw(7) << " " << std::right << xbytes / mean << " "
<< "\t\t" << std::setw(7) << bidibytes / mean << std::endl; << "\t\t" << std::setw(7) << bidibytes / mean << std::endl;
} }
} }
} }
} }
std::cout << GridLogMessage << "====================================================================================================" << std::endl; std::cout << GridLogMessage
std::cout << GridLogMessage << "= Benchmarking sequential halo exchange from GPU memory " << std::endl; << "========================================================================="
std::cout << GridLogMessage << "====================================================================================================" << std::endl; "==========================="
<< std::endl;
std::cout << GridLogMessage
<< "= Benchmarking sequential halo exchange from GPU memory " << std::endl;
std::cout << GridLogMessage
<< "========================================================================="
"==========================="
<< std::endl;
header(); header();
for (int lat = 8; lat <= maxlat; lat += 4) for (int lat = 8; lat <= maxlat; lat += 4)
@ -172,9 +178,7 @@ int main(int argc, char **argv)
for (int Ls = 8; Ls <= 8; Ls *= 2) for (int Ls = 8; Ls <= 8; Ls *= 2)
{ {
Coordinate latt_size({lat * mpi_layout[0], Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
lat * mpi_layout[1],
lat * mpi_layout[2],
lat * mpi_layout[3]}); lat * mpi_layout[3]});
GridCartesian Grid(latt_size, simd_layout, mpi_layout); GridCartesian Grid(latt_size, simd_layout, mpi_layout);
@ -212,22 +216,16 @@ int main(int argc, char **argv)
{ {
std::vector<CommsRequest_t> requests; std::vector<CommsRequest_t> requests;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
Grid.SendToRecvFrom((void *)&xbuf[mu][0], Grid.SendToRecvFrom((void *)&xbuf[mu][0], xmit_to_rank,
xmit_to_rank, (void *)&rbuf[mu][0], recv_from_rank, bytes);
(void *)&rbuf[mu][0],
recv_from_rank,
bytes);
} }
comm_proc = mpi_layout[mu] - 1; comm_proc = mpi_layout[mu] - 1;
{ {
std::vector<CommsRequest_t> requests; std::vector<CommsRequest_t> requests;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank); Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0], Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0], xmit_to_rank,
xmit_to_rank, (void *)&rbuf[mu + 4][0], recv_from_rank, bytes);
(void *)&rbuf[mu + 4][0],
recv_from_rank,
bytes);
} }
} }
Grid.Barrier(); Grid.Barrier();
@ -239,8 +237,8 @@ int main(int argc, char **argv)
double bidibytes = xbytes + rbytes; double bidibytes = xbytes + rbytes;
std::cout << GridLogMessage << std::setw(4) << lat << "\t" << Ls << "\t" std::cout << GridLogMessage << std::setw(4) << lat << "\t" << Ls << "\t"
<< std::setw(11) << bytes << std::fixed << std::setprecision(1) << std::setw(7) << " " << std::setw(11) << bytes << std::fixed << std::setprecision(1)
<< std::right << xbytes / mean << " " << std::setw(7) << " " << std::right << xbytes / mean << " "
<< "\t\t" << std::setw(7) << bidibytes / mean << std::endl; << "\t\t" << std::setw(7) << bidibytes / mean << std::endl;
} }
} }
@ -253,9 +251,15 @@ int main(int argc, char **argv)
} }
} }
std::cout << GridLogMessage << "====================================================================================================" << std::endl; std::cout << GridLogMessage
<< "========================================================================="
"==========================="
<< std::endl;
std::cout << GridLogMessage << "= All done; Bye Bye" << std::endl; std::cout << GridLogMessage << "= All done; Bye Bye" << std::endl;
std::cout << GridLogMessage << "====================================================================================================" << std::endl; std::cout << GridLogMessage
<< "========================================================================="
"==========================="
<< std::endl;
Grid_finalize(); Grid_finalize();
} }


@ -30,8 +30,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
using namespace std; using namespace std;
using namespace Grid; using namespace Grid;
template <class d> template <class d> struct scal
struct scal
{ {
d internal; d internal;
}; };
@ -69,13 +68,11 @@ int main(int argc, char **argv)
json["single_site_flops"] = single_site_flops; json["single_site_flops"] = single_site_flops;
GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid( GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
GridDefaultMpi()); GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridRedBlackCartesian *UrbGrid =
SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid); GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
GridRedBlackCartesian *FrbGrid = GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
json["grid"] = FGrid->FullDimensions().toVector(); json["grid"] = FGrid->FullDimensions().toVector();
json["local_grid"] = FGrid->LocalDimensions().toVector(); json["local_grid"] = FGrid->LocalDimensions().toVector();
@ -83,11 +80,10 @@ int main(int argc, char **argv)
std::cout << GridLogMessage << "Making s innermost grids" << std::endl; std::cout << GridLogMessage << "Making s innermost grids" << std::endl;
GridCartesian *sUGrid = GridCartesian *sUGrid =
SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(), GridDefaultMpi()); SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(), GridDefaultMpi());
GridRedBlackCartesian *sUrbGrid =
SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid); GridRedBlackCartesian *sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian *sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls, UGrid); GridCartesian *sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls, UGrid);
GridRedBlackCartesian *sFrbGrid = GridRedBlackCartesian *sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls, UGrid);
SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls, UGrid);
std::vector<int> seeds4({1, 2, 3, 4}); std::vector<int> seeds4({1, 2, 3, 4});
std::vector<int> seeds5({5, 6, 7, 8}); std::vector<int> seeds5({5, 6, 7, 8});
@ -150,8 +146,7 @@ int main(int argc, char **argv)
{ {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu); U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
} }
std::cout << GridLogMessage << "Setting up Cshift based reference " std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
<< std::endl;
if (1) if (1)
{ {
@ -200,54 +195,45 @@ int main(int argc, char **argv)
json["ranks"] = NP; json["ranks"] = NP;
json["nodes"] = NN; json["nodes"] = NN;
std::cout
<< GridLogMessage
<< "*****************************************************************"
<< std::endl;
std::cout
<< GridLogMessage
<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm"
<< std::endl;
std::cout
<< GridLogMessage
<< "*****************************************************************"
<< std::endl;
std::cout
<< GridLogMessage
<< "*****************************************************************"
<< std::endl;
std::cout << GridLogMessage std::cout << GridLogMessage
<< "* Benchmarking DomainWallFermionR::Dhop " << "*****************************************************************"
<< std::endl; << std::endl;
std::cout << GridLogMessage << "* Vectorising space-time by " std::cout << GridLogMessage
<< vComplexF::Nsimd() << std::endl; << "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm"
std::cout << GridLogMessage << "* VComplexF size is " << sizeof(vComplexF) << std::endl;
<< " B" << std::endl; std::cout << GridLogMessage
<< "*****************************************************************"
<< std::endl;
std::cout << GridLogMessage
<< "*****************************************************************"
<< std::endl;
std::cout << GridLogMessage
<< "* Benchmarking DomainWallFermionR::Dhop " << std::endl;
std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd()
<< std::endl;
std::cout << GridLogMessage << "* VComplexF size is " << sizeof(vComplexF) << " B"
<< std::endl;
if (sizeof(RealF) == 4) if (sizeof(RealF) == 4)
std::cout << GridLogMessage << "* SINGLE precision " << std::endl; std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
if (sizeof(RealF) == 8) if (sizeof(RealF) == 8)
std::cout << GridLogMessage << "* DOUBLE precision " << std::endl; std::cout << GridLogMessage << "* DOUBLE precision " << std::endl;
#ifdef GRID_OMP #ifdef GRID_OMP
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute) if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl;
<< std::endl;
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
std::cout << GridLogMessage << "* Using sequential comms compute" std::cout << GridLogMessage << "* Using sequential comms compute" << std::endl;
<< std::endl;
#endif #endif
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric) if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric)
std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl;
<< std::endl;
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll)
std::cout << GridLogMessage << "* Using Nc=3 WilsonKernels" std::cout << GridLogMessage << "* Using Nc=3 WilsonKernels" << std::endl;
<< std::endl;
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm) if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm)
std::cout << GridLogMessage << "* Using Asm Nc=3 WilsonKernels"
<< std::endl; std::cout << GridLogMessage << "* Using Asm Nc=3 WilsonKernels" << std::endl;
std::cout std::cout << GridLogMessage
<< GridLogMessage << "*****************************************************************"
<< "*****************************************************************" << std::endl;
<< std::endl;
DomainWallFermionF Dw(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5); DomainWallFermionF Dw(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5);
int ncall = 300; int ncall = 300;
@ -277,16 +263,13 @@ int main(int argc, char **argv)
auto simdwidth = sizeof(vComplex); auto simdwidth = sizeof(vComplex);
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors // RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
double data_rf = volume * ((2 * Nd + 1) * Nd * Nc + 2 * Nd * Nc * Nc) * double data_rf = volume * ((2 * Nd + 1) * Nd * Nc + 2 * Nd * Nc * Nc) * simdwidth /
simdwidth / nsimd * ncall / (1024. * 1024. * 1024.); nsimd * ncall / (1024. * 1024. * 1024.);
// mem: Nd Wilson * Ls, Nd gauge, Nc colors // mem: Nd Wilson * Ls, Nd gauge, Nc colors
double data_mem = double data_mem =
(volume * (2 * Nd + 1) * Nd * Nc + (volume / Ls) * 2 * Nd * Nc * Nc) * (volume * (2 * Nd + 1) * Nd * Nc + (volume / Ls) * 2 * Nd * Nc * Nc) * simdwidth /
simdwidth / nsimd * ncall / (1024. * 1024. * 1024.); nsimd * ncall / (1024. * 1024. * 1024.);
std::cout << GridLogMessage << "Called Dw " << ncall << " times in "
<< t1 - t0 << " us" << std::endl;
json["Dw"]["calls"] = ncall; json["Dw"]["calls"] = ncall;
json["Dw"]["time"] = t1 - t0; json["Dw"]["time"] = t1 - t0;
@ -296,15 +279,16 @@ int main(int argc, char **argv)
json["Dw"]["RF"] = 1000000. * data_rf / ((t1 - t0)); json["Dw"]["RF"] = 1000000. * data_rf / ((t1 - t0));
json["Dw"]["mem"] = 1000000. * data_mem / ((t1 - t0)); json["Dw"]["mem"] = 1000000. * data_mem / ((t1 - t0));
// std::cout<<GridLogMessage << "norm result "<< std::cout << GridLogMessage << "Called Dw " << ncall << " times in " << t1 - t0
// norm2(result)<<std::endl; std::cout<<GridLogMessage << "norm ref "<< << " us" << std::endl;
// norm2(ref)<<std::endl; // std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout << GridLogMessage << "mflop/s = " << flops / (t1 - t0) // std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout << GridLogMessage << "mflop/s = " << flops / (t1 - t0) << std::endl;
std::cout << GridLogMessage << "mflop/s per rank = " << flops / (t1 - t0) / NP
<< std::endl; << std::endl;
std::cout << GridLogMessage std::cout << GridLogMessage << "mflop/s per node = " << flops / (t1 - t0) / NN
<< "mflop/s per rank = " << flops / (t1 - t0) / NP << std::endl; << std::endl;
std::cout << GridLogMessage
<< "mflop/s per node = " << flops / (t1 - t0) / NN << std::endl;
std::cout << GridLogMessage std::cout << GridLogMessage
<< "RF GiB/s (base 2) = " << 1000000. * data_rf / ((t1 - t0)) << "RF GiB/s (base 2) = " << 1000000. * data_rf / ((t1 - t0))
<< std::endl; << std::endl;
@ -381,13 +365,12 @@ int main(int argc, char **argv)
} }
// dump=1; // dump=1;
Dw.Dhop(src, result, 1); Dw.Dhop(src, result, 1);
std::cout
<< GridLogMessage std::cout << GridLogMessage
<< "Compare to naive wilson implementation Dag to verify correctness" << "Compare to naive wilson implementation Dag to verify correctness"
<< std::endl;
std::cout << GridLogMessage << "Called DwDag" << std::endl;
std::cout << GridLogMessage << "norm dag result " << norm2(result)
<< std::endl; << std::endl;
std::cout << GridLogMessage << "Called DwDag" << std::endl;
std::cout << GridLogMessage << "norm dag result " << norm2(result) << std::endl;
std::cout << GridLogMessage << "norm dag ref " << norm2(ref) << std::endl; std::cout << GridLogMessage << "norm dag ref " << norm2(ref) << std::endl;
err = ref - result; err = ref - result;
std::cout << GridLogMessage << "norm dag diff " << norm2(err) << std::endl; std::cout << GridLogMessage << "norm dag diff " << norm2(err) << std::endl;
@ -405,8 +388,7 @@ int main(int argc, char **argv)
LatticeFermionF r_o(FrbGrid); LatticeFermionF r_o(FrbGrid);
LatticeFermionF r_eo(FGrid); LatticeFermionF r_eo(FGrid);
std::cout << GridLogMessage std::cout << GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"
<< "Calling Deo and Doe and //assert Deo+Doe == Dunprec"
<< std::endl; << std::endl;
pickCheckerboard(Even, src_e, src); pickCheckerboard(Even, src_e, src);
pickCheckerboard(Odd, src_o, src); pickCheckerboard(Odd, src_o, src);
@ -416,37 +398,32 @@ int main(int argc, char **argv)
// S-direction is INNERMOST and takes no part in the parity. // S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage std::cout << GridLogMessage
<< "*********************************************************"
<< std::endl; << "*********************************************************" << std::endl;
std::cout << GridLogMessage std::cout << GridLogMessage
<< "* Benchmarking DomainWallFermionF::DhopEO " << "* Benchmarking DomainWallFermionF::DhopEO " << std::endl;
std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd()
<< std::endl; << std::endl;
std::cout << GridLogMessage << "* Vectorising space-time by "
<< vComplexF::Nsimd() << std::endl;
if (sizeof(RealF) == 4) if (sizeof(RealF) == 4)
std::cout << GridLogMessage << "* SINGLE precision " << std::endl; std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
if (sizeof(RealF) == 8) if (sizeof(RealF) == 8)
std::cout << GridLogMessage << "* DOUBLE precision " << std::endl; std::cout << GridLogMessage << "* DOUBLE precision " << std::endl;
#ifdef GRID_OMP #ifdef GRID_OMP
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute) if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl;
<< std::endl;
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
std::cout << GridLogMessage << "* Using sequential comms compute" std::cout << GridLogMessage << "* Using sequential comms compute" << std::endl;
<< std::endl;
#endif #endif
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric) if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric)
std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl;
<< std::endl;
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll)
std::cout << GridLogMessage << "* Using Nc=3 WilsonKernels" std::cout << GridLogMessage << "* Using Nc=3 WilsonKernels" << std::endl;
<< std::endl;
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm) if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm)
std::cout << GridLogMessage << "* Using Asm Nc=3 WilsonKernels" std::cout << GridLogMessage << "* Using Asm Nc=3 WilsonKernels" << std::endl;
<< std::endl;
std::cout << GridLogMessage std::cout << GridLogMessage
<< "*********************************************************" << "*********************************************************" << std::endl;
<< std::endl;
{ {
Dw.ZeroCounters(); Dw.ZeroCounters();
FGrid->Barrier(); FGrid->Barrier();
@ -478,12 +455,12 @@ int main(int argc, char **argv)
json["Deo"]["mflops_per_rank"] = flops / (t1 - t0) / NP; json["Deo"]["mflops_per_rank"] = flops / (t1 - t0) / NP;
json["Deo"]["mflops_per_node"] = flops / (t1 - t0) / NN; json["Deo"]["mflops_per_node"] = flops / (t1 - t0) / NN;
std::cout << GridLogMessage << "Deo mflop/s = " << flops / (t1 - t0) std::cout << GridLogMessage << "Deo mflop/s = " << flops / (t1 - t0) << std::endl;
std::cout << GridLogMessage << "Deo mflop/s per rank " << flops / (t1 - t0) / NP
<< std::endl; << std::endl;
std::cout << GridLogMessage << "Deo mflop/s per rank " std::cout << GridLogMessage << "Deo mflop/s per node " << flops / (t1 - t0) / NN
<< flops / (t1 - t0) / NP << std::endl; << std::endl;
std::cout << GridLogMessage << "Deo mflop/s per node "
<< flops / (t1 - t0) / NN << std::endl;
Dw.Report(); Dw.Report();
} }
Dw.DhopEO(src_o, r_e, DaggerNo); Dw.DhopEO(src_o, r_e, DaggerNo);
@ -510,18 +487,16 @@ int main(int argc, char **argv)
pickCheckerboard(Even, src_e, err); pickCheckerboard(Even, src_e, err);
pickCheckerboard(Odd, src_o, err); pickCheckerboard(Odd, src_o, err);
std::cout << GridLogMessage << "norm diff even " << norm2(src_e) std::cout << GridLogMessage << "norm diff even " << norm2(src_e) << std::endl;
<< std::endl; std::cout << GridLogMessage << "norm diff odd " << norm2(src_o) << std::endl;
std::cout << GridLogMessage << "norm diff odd " << norm2(src_o)
<< std::endl;
assert(norm2(src_e) < 1.0e-4); assert(norm2(src_e) < 1.0e-4);
assert(norm2(src_o) < 1.0e-4); assert(norm2(src_o) < 1.0e-4);
if (!json_filename.empty()) if (!json_filename.empty())
{ {
std::cout << GridLogMessage << "writing benchmark results to " std::cout << GridLogMessage << "writing benchmark results to " << json_filename
<< json_filename << std::endl; << std::endl;
int me = 0; int me = 0;
MPI_Comm_rank(MPI_COMM_WORLD, &me); MPI_Comm_rank(MPI_COMM_WORLD, &me);

Grid/Common.hpp (new file, 36 lines)

@ -0,0 +1,36 @@
/*
Copyright © 2022 Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef Grid_Benchmarks_Common_hpp_
#define Grid_Benchmarks_Common_hpp_
#ifndef GRID_MSG
#define GRID_MSG std::cout << GridLogMessage
#endif
#ifndef GRID_MSG_MAXSIZE
#define GRID_MSG_MAXSIZE 1024
#endif
#define grid_printf(...) \
{ \
char _buf[GRID_MSG_MAXSIZE]; \
snprintf(_buf, GRID_MSG_MAXSIZE, __VA_ARGS__); \
GRID_MSG << _buf; \
}
#endif // Grid_Benchmarks_Common_hpp_
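For reference, `grid_printf` combines `printf`-style formatting with Grid's tagged log stream. A minimal usage sketch (the throughput value is hypothetical, and Grid must be initialised so that `GridLogMessage` is active):

```cpp
#include "Common.hpp"
#include <Grid/Grid.h>

using namespace Grid;

// Prints a line such as "Grid : Message : ... Std I/O write: 1234.5 MB/s"
void reportRate(double rate)
{
  grid_printf("Std I/O write: %.1f MB/s\n", rate);
}
```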

Grid/Readme.md (new file, 69 lines)

@ -0,0 +1,69 @@
# Grid benchmarks
This folder contains benchmarks for the [Grid](https://github.com/aportelli/) library.
The benchmarks can be summarised as follows:
- `Benchmark_Grid`: measures floating-point performance for various fermion matrices,
  as well as bandwidth for different operations. Measurements are performed over a
  fixed range of problem sizes.
## TL;DR
Build and install Grid, all dependencies, and the benchmarks with
```bash
systems/<system>/bootstrap-env.sh <env_dir> # build dependencies, takes a long time
./build-grid.sh <env_dir> <config> # build Grid
./build-benchmark.sh <env_dir> <config> # build benchmarks
```
where `<env_dir>` is an arbitrary directory where all build products will be stored, `<system>`
is a sub-directory of `systems` containing system-specific scripts
(an existing preset or your own), and `<config>` is the name of a build config
in `systems/<system>/grid-config.json`. After a successful execution, the benchmark binaries
will be in `<env_dir>/prefix/gridbench_<config>`.
## Environment setup
A complete runtime environment can be deployed using the scripts from this repository. System-specific scripts are in the `systems` directory.
You should first deploy the environment for the specific system you are using, for example
```bash
systems/tursa/bootstrap-env.sh ./env
```
will deploy the relevant environment for the [Tursa](https://www.epcc.ed.ac.uk/hpc-services/dirac-tursa-gpu) supercomputer in `./env`. This step may compile a large set
of packages from source and can take some time to complete.
After that, the environment directory (`./env` in the example above) will contain an `env.sh` file that needs to be sourced to activate the environment:
```bash
source ./env/env.sh
```
Additional `env-*.sh` scripts can be sourced afterwards to activate more specific
environments; this must be done after sourcing `env.sh` as above.
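For example (the `env-gpu.sh` script name is hypothetical, standing in for whatever `env-*.sh` scripts your system provides):
```bash
source ./env/env.sh     # base environment, always first
source ./env/env-gpu.sh # hypothetical system-specific additions
```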
## Building the benchmarks
The environment directory contains a `grid-config.json` file specifying compilation-flag
configurations for Grid (please see Grid's repository for documentation). All entries have
the form:
```json
{
"name": "foo", // name of the configuration
"env-script": "bar.sh", // script to source before building
// (path relative to the environment directory)
"commit": "...", // Grid commit to use
// (anything that can be an argument of git checkout)
"config-options": "..." // options to pass to the configure script,
"env" : { // environment variables
"VAR": "value" // export VAR="value" before building
}
}
```
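For illustration, a hypothetical entry (the values are examples, not a tested configuration):
```json
{
  "name": "gpu-cuda",
  "env-script": "env-gpu.sh",
  "commit": "develop",
  "config-options": "--enable-accelerator=cuda --enable-comms=mpi-auto",
  "env": {
    "MPICXX": "mpicxx"
  }
}
```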
Grid can then be built with
```bash
./build-grid.sh <env_dir> <config>
```
where `<env_dir>` is the environment directory and `<config>` is the build config name in
`grid-config.json`. Similarly, the benchmarks can then be built with
```bash
./build-benchmark.sh <env_dir> <config>
```
## Running the benchmarks
After building the benchmarks as above, you can find the binaries in
`<env_dir>/prefix/gridbench_<config>`.
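For instance, a hypothetical single-process run, assuming Grid's standard `--grid` and `--mpi` geometry flags and a conventional `bin/` install layout:
```bash
source <env_dir>/env.sh
<env_dir>/prefix/gridbench_<config>/bin/Benchmark_Grid --grid 16.16.16.16 --mpi 1.1.1.1
```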


@ -16,18 +16,16 @@ cd "${env_dir}"
env_dir=$(pwd -P) env_dir=$(pwd -P)
cd "${call_dir}" cd "${call_dir}"
build_dir="${env_dir}/build/Grid-benchmarks/${cfg}" build_dir="${env_dir}/build/Grid-benchmarks/${cfg}"
if [ -d "${build_dir}" ]; then
echo "error: directory '${build_dir}' exists"
exit 1
fi
mkdir -p "${build_dir}" mkdir -p "${build_dir}"
source "${env_dir}/env.sh" source "${env_dir}/env.sh"
entry=$(jq ".configs[]|select(.name==\"${cfg}\")" "${env_dir}"/grid-config.json) entry=$(jq ".configs[]|select(.name==\"${cfg}\")" "${env_dir}"/grid-config.json)
env_script=$(echo "${entry}" | jq -r ".\"env-script\"") env_script=$(echo "${entry}" | jq -r ".\"env-script\"")
cd "${build_dir}" || return cd "${build_dir}" || return
source "${env_dir}/${env_script}" source "${env_dir}/${env_script}"
"${script_dir}/configure" --with-grid="${env_dir}/prefix/grid_${cfg}" \ if [ ! -f Makefile ]; then
--prefix="${env_dir}/prefix/gridbench_${cfg}" "${script_dir}/configure" --with-grid="${env_dir}/prefix/grid_${cfg}" \
--prefix="${env_dir}/prefix/gridbench_${cfg}"
fi
make -j 128 make -j 128
make install make install
cd "${call_dir}" cd "${call_dir}"

Readme.md (new file, 8 lines)

@ -0,0 +1,8 @@
# Lattice benchmarks
This repository is an attempt at packaging benchmarks for various libraries used for
lattice field theory simulations. It currently features only the Grid library; more
will be added later.
Libraries:
- [Grid](https://github.com/aportelli/) - [Documentation](Grid/Readme.md)