1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-10-26 17:49:33 +00:00

Compare commits

...

3 Commits

Author SHA1 Message Date
de8b2dcca3 Hadrons: faster A2A matrix load 2019-01-11 16:12:49 +00:00
efe000341d Hadrons: contractor fixes 2019-01-11 16:12:16 +00:00
11086c5c25 Hadrons: first stab at MPI contractor 2019-01-10 16:29:57 +00:00
7 changed files with 231 additions and 160 deletions

View File

@@ -62,6 +62,9 @@ using A2AMatrixSet = Eigen::TensorMap<Eigen::Tensor<T, 5, Eigen::RowMajor>>;
template <typename T>
using A2AMatrix = Eigen::Matrix<T, -1, -1, Eigen::RowMajor>;
template <typename T>
using A2AMatrixMap = Eigen::Map<A2AMatrix<T>>;
template <typename T>
using A2AMatrixTr = Eigen::Matrix<T, -1, -1, Eigen::ColMajor>;
@@ -108,7 +111,7 @@ public:
void saveBlock(const A2AMatrixSet<T> &m, const unsigned int ext, const unsigned int str,
const unsigned int i, const unsigned int j);
template <template <class> class Vec, typename VecT>
void load(Vec<VecT> &v, double *tRead = nullptr);
void load(Vec<VecT> &v, double *tRead = nullptr, const bool useCache = true);
private:
std::string filename_{""}, dataname_{""};
unsigned int nt_{0}, ni_{0}, nj_{0};
@@ -495,7 +498,7 @@ void A2AMatrixIo<T>::saveBlock(const A2AMatrixSet<T> &m,
template <typename T>
template <template <class> class Vec, typename VecT>
void A2AMatrixIo<T>::load(Vec<VecT> &v, double *tRead)
void A2AMatrixIo<T>::load(Vec<VecT> &v, double *tRead, const bool useCache)
{
#ifdef HAVE_HDF5
Hdf5Reader reader(filename_);
@@ -532,36 +535,55 @@ void A2AMatrixIo<T>::load(Vec<VecT> &v, double *tRead)
nj_ = hdim[2];
}
A2AMatrix<T> buf(ni_, nj_);
std::vector<hsize_t> count = {1, static_cast<hsize_t>(ni_),
static_cast<hsize_t>(nj_)},
stride = {1, 1, 1},
block = {1, 1, 1},
memCount = {static_cast<hsize_t>(ni_),
static_cast<hsize_t>(nj_)};
H5NS::DataSpace memspace(memCount.size(), memCount.data());
std::cout << "Loading timeslice";
std::cout.flush();
*tRead = 0.;
for (unsigned int tp1 = nt_; tp1 > 0; --tp1)
if (useCache)
{
unsigned int t = tp1 - 1;
std::vector<hsize_t> offset = {static_cast<hsize_t>(t), 0, 0};
if (t % 10 == 0)
std::vector<T> buf(nt_*ni_*nj_);
T *pt;
dataset.read(buf.data(), datatype);
pt = buf.data();
for (unsigned int t = 0; t < nt_; ++t)
{
std::cout << " " << t;
std::cout.flush();
A2AMatrixMap<T> bufMap(pt, ni_, nj_);
v[t] = bufMap.template cast<VecT>();
pt += ni_*nj_;
}
dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(),
stride.data(), block.data());
if (tRead) *tRead -= usecond();
dataset.read(buf.data(), datatype, memspace, dataspace);
if (tRead) *tRead += usecond();
v[t] = buf.template cast<VecT>();
}
std::cout << std::endl;
// if useCache = false, do I/O timeslice per timeslice (much slower)
else
{
A2AMatrix<T> buf(ni_, nj_);
std::vector<hsize_t> count = {1, static_cast<hsize_t>(ni_),
static_cast<hsize_t>(nj_)},
stride = {1, 1, 1},
block = {1, 1, 1},
memCount = {static_cast<hsize_t>(ni_),
static_cast<hsize_t>(nj_)};
H5NS::DataSpace memspace(memCount.size(), memCount.data());
std::cout << "Loading timeslice";
std::cout.flush();
*tRead = 0.;
for (unsigned int tp1 = nt_; tp1 > 0; --tp1)
{
unsigned int t = tp1 - 1;
std::vector<hsize_t> offset = {static_cast<hsize_t>(t), 0, 0};
if (t % 10 == 0)
{
std::cout << " " << t;
std::cout.flush();
}
dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(),
stride.data(), block.data());
if (tRead) *tRead -= usecond();
dataset.read(buf.data(), datatype, memspace, dataspace);
if (tRead) *tRead += usecond();
v[t] = buf.template cast<VecT>();
}
std::cout << std::endl;
}
#else
HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library");
#endif

View File

@@ -41,14 +41,6 @@ BEGIN_HADRONS_NAMESPACE
class Application
{
public:
class TrajRange: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(TrajRange,
unsigned int, start,
unsigned int, end,
unsigned int, step);
};
class GlobalPar: Serializable
{
public:

View File

@@ -263,6 +263,33 @@ void tokenReplace(std::string &str, const std::string token,
}
}
// trajectory range
class TrajRange: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(TrajRange,
unsigned int, start,
unsigned int, end,
unsigned int, step,
std::string, exclude);
inline std::vector<unsigned int> getTrajectoryList(void)
{
std::vector<unsigned int> excVec = strToVec<unsigned int>(exclude);
std::vector<unsigned int> list;
for (unsigned int t = start; t < end; t += step)
{
if (std::find(excVec.begin(), excVec.end(), t) == excVec.end())
{
list.push_back(t);
}
}
return list;
}
};
END_HADRONS_NAMESPACE
#include <Hadrons/Exceptions.hpp>

View File

@@ -34,4 +34,5 @@ nobase_libHadrons_a_HEADERS = \
Solver.hpp \
TimerArray.hpp \
VirtualMachine.hpp \
Utilities/Contractor.hpp \
$(modules_hpp)

View File

@@ -28,6 +28,25 @@ See the full license in the file "LICENSE" in the top level distribution directo
#include <Hadrons/A2AMatrix.hpp>
#include <Hadrons/DiskVector.hpp>
#include <Hadrons/TimerArray.hpp>
#include <Hadrons/Utilities/Contractor.hpp>
#ifdef GRID_COMMS_MPI3
#define GET_RANK(rank, nMpi) \
MPI_Comm_size(MPI_COMM_WORLD, &(nMpi));\
MPI_Comm_rank(MPI_COMM_WORLD, &(rank))
#define BARRIER() MPI_Barrier(MPI_COMM_WORLD)
#define GLOBAL_DSUM(x) MPI_Allreduce(MPI_IN_PLACE, &x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD)
#define GLOBAL_DMAX(x) MPI_Allreduce(MPI_IN_PLACE, &x, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD)
#define INIT() MPI_Init(NULL, NULL)
#define FINALIZE() MPI_Finalize()
#else
#define GET_RANK(rank, nMpi) (nMpi) = 1; (rank) = 0
#define BARRIER()
#define GLOBAL_DSUM(x)
#define GLOBAL_DMAX(x)
#define INIT()
#define FINALIZE()
#endif
using namespace Grid;
using namespace QCD;
@@ -35,58 +54,6 @@ using namespace Hadrons;
#define TIME_MOD(t) (((t) + par.global.nt) % par.global.nt)
namespace Contractor
{
class TrajRange: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(TrajRange,
unsigned int, start,
unsigned int, end,
unsigned int, step);
};
class GlobalPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(GlobalPar,
TrajRange, trajCounter,
unsigned int, nt,
std::string, diskVectorDir,
std::string, output);
};
class A2AMatrixPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(A2AMatrixPar,
std::string, file,
std::string, dataset,
unsigned int, cacheSize,
std::string, name);
};
class ProductPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(ProductPar,
std::string, terms,
std::vector<std::string>, times,
std::string, translations,
bool, translationAverage);
};
class CorrelatorResult: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(CorrelatorResult,
std::vector<Contractor::A2AMatrixPar>, a2aMatrix,
ProductPar, contraction,
std::vector<unsigned int>, times,
std::vector<ComplexD>, correlator);
};
}
struct ContractorPar
{
Contractor::GlobalPar global;
@@ -143,6 +110,27 @@ void saveCorrelator(const Contractor::CorrelatorResult &result, const std::strin
write(writer, fileStem, result);
}
void printPerf(const double bytes, const double usec)
{
double maxt;
maxt = usec;
GLOBAL_DMAX(maxt);
std::cout << maxt/1.0e6 << " sec " << bytes/maxt*1.0e6/1024/1024/1024 << " GB/s";
}
void printPerf(const double bytes, const double busec,
const double flops, const double fusec)
{
double maxt;
printPerf(bytes, busec);
std::cout << " ";
maxt = fusec;
GLOBAL_DMAX(maxt);
std::cout << flops/fusec/1.0e3 << " GFlop/s";
}
std::set<unsigned int> parseTimeRange(const std::string str, const unsigned int nt)
{
std::regex rex("([0-9]+)|(([0-9]+)\\.\\.([0-9]+))");
@@ -184,59 +172,18 @@ std::set<unsigned int> parseTimeRange(const std::string str, const unsigned int
return tSet;
}
struct Sec
{
Sec(const double usec)
{
seconds = usec/1.0e6;
}
double seconds;
};
inline std::ostream & operator<< (std::ostream& s, const Sec &&sec)
{
s << std::setw(10) << sec.seconds << " sec";
return s;
}
struct Flops
{
Flops(const double flops, const double fusec)
{
gFlopsPerSec = flops/fusec/1.0e3;
}
double gFlopsPerSec;
};
inline std::ostream & operator<< (std::ostream& s, const Flops &&f)
{
s << std::setw(10) << f.gFlopsPerSec << " GFlop/s";
return s;
}
struct Bytes
{
Bytes(const double bytes, const double busec)
{
gBytesPerSec = bytes/busec*1.0e6/1024/1024/1024;
}
double gBytesPerSec;
};
inline std::ostream & operator<< (std::ostream& s, const Bytes &&b)
{
s << std::setw(10) << b.gBytesPerSec << " GB/s";
return s;
}
int main(int argc, char* argv[])
{
// MPI init
int nMpi, rank;
INIT();
GET_RANK(rank, nMpi);
if (rank != 0)
{
std::cout.setstate(std::ios::badbit);
}
// parse command line
std::string parFilename;
@@ -266,31 +213,68 @@ int main(int argc, char* argv[])
for (auto &p: par.a2aMatrix)
{
std::string dirName = par.global.diskVectorDir + "/" + p.name;
std::string dirName = par.global.diskVectorDir + "/" + p.name + "." + std::to_string(rank);
a2aMat.emplace(p.name, EigenDiskVector<ComplexD>(dirName, par.global.nt, p.cacheSize));
}
// trajectory loop
for (unsigned int traj = par.global.trajCounter.start;
traj < par.global.trajCounter.end; traj += par.global.trajCounter.step)
{
std::cout << ":::::::: Trajectory " << traj << std::endl;
std::vector<unsigned int> tList = par.global.trajCounter.getTrajectoryList();
unsigned int indi, inde, indPerRank;
indPerRank = tList.size()/nMpi;
indi = rank*indPerRank;
BARRIER();
for (unsigned int tInd = indi; tInd < indi + indPerRank; tInd++)
{
unsigned int traj;
if (tInd < tList.size())
{
traj = tList[tInd];
}
else
{
traj = tList.back();
}
if (nMpi > 1)
{
if (rank == 0)
{
std::cout << ":::::::: Trajectories ";
for (unsigned int r = 0; r < nMpi - 1; ++r)
{
std::cout << tList[tInd + r*indPerRank] << " ";
}
if (tInd + (nMpi - 1)*indPerRank < tList.size())
{
std::cout << tList[tInd + (nMpi - 1)*indPerRank];
}
std::cout << std::endl;
}
}
else
{
std::cout << ":::::::: Trajectory " << traj << std::endl;
}
// load data
for (auto &p: par.a2aMatrix)
{
std::string filename = p.file;
double t, size;
double t;
tokenReplace(filename, "traj", traj);
std::cout << "======== Loading '" << filename << "'" << std::endl;
std::cout << "======== Loading '" << p.file << "'" << std::endl;
BARRIER();
A2AMatrixIo<HADRONS_A2AM_IO_TYPE> a2aIo(filename, p.dataset, par.global.nt);
a2aIo.load(a2aMat.at(p.name), &t);
std::cout << "Read " << a2aIo.getSize() << " bytes in " << t/1.0e6
<< " sec, " << a2aIo.getSize()/t*1.0e6/1024/1024 << " MB/s" << std::endl;
GLOBAL_DMAX(t);
std::cout << "Read " << nMpi*a2aIo.getSize() << " bytes in " << t/1.0e6
<< " sec, " << nMpi*a2aIo.getSize()/t*1.0e6/1024/1024
<< " MB/s" << std::endl;
}
// contract
@@ -308,6 +292,7 @@ int main(int argc, char* argv[])
double fusec, busec, flops, bytes, tusec;
Contractor::CorrelatorResult result;
BARRIER();
tAr.startTimer("Total");
std::cout << "======== Contraction tr(";
for (unsigned int g = 0; g < term.size(); ++g)
@@ -358,9 +343,10 @@ int main(int argc, char* argv[])
}
tAr.stopTimer("Transpose caching");
}
bytes = par.global.nt*lastTerm[0].rows()*lastTerm[0].cols()*sizeof(ComplexD);
std::cout << Sec(tAr.getDTimer("Transpose caching")) << " "
<< Bytes(bytes, tAr.getDTimer("Transpose caching")) << std::endl;
bytes = par.global.nt*lastTerm[0].rows()*lastTerm[0].cols();
bytes *= sizeof(ComplexD)*nMpi;
printPerf(bytes, tAr.getDTimer("Transpose caching"));
std::cout << std::endl;
for (unsigned int i = 0; i < timeSeq.size(); ++i)
{
unsigned int dti = 0;
@@ -378,7 +364,7 @@ int main(int argc, char* argv[])
<< " -- positions= " << t << ", dt= " << dt << std::endl;
if (term.size() > 2)
{
std::cout << std::setw(8) << "products";
std::cout << std::setw(10) << "products ";
}
flops = 0.;
bytes = 0.;
@@ -405,11 +391,11 @@ int main(int argc, char* argv[])
}
if (term.size() > 2)
{
std::cout << Sec(tAr.getDTimer("A*B total") - busec) << " "
<< Flops(flops, tAr.getDTimer("A*B algebra") - fusec) << " "
<< Bytes(bytes, tAr.getDTimer("A*B total") - busec) << std::endl;
printPerf(bytes*nMpi, tAr.getDTimer("A*B total") - busec,
flops*nMpi, tAr.getDTimer("A*B algebra") - fusec);
std::cout << std::endl;
}
std::cout << std::setw(8) << "traces";
std::cout << std::setw(10) << "traces ";
flops = 0.;
bytes = 0.;
fusec = tAr.getDTimer("tr(A*B)");
@@ -423,9 +409,9 @@ int main(int argc, char* argv[])
bytes += 2.*prod.rows()*prod.cols()*sizeof(ComplexD);
}
tAr.stopTimer("Linear algebra");
std::cout << Sec(tAr.getDTimer("tr(A*B)") - busec) << " "
<< Flops(flops, tAr.getDTimer("tr(A*B)") - fusec) << " "
<< Bytes(bytes, tAr.getDTimer("tr(A*B)") - busec) << std::endl;
printPerf(bytes*nMpi, tAr.getDTimer("tr(A*B)") - busec,
flops*nMpi, tAr.getDTimer("tr(A*B)") - fusec);
std::cout << std::endl;
if (!p.translationAverage)
{
saveCorrelator(result, par.global.output, dt, traj);
@@ -450,5 +436,7 @@ int main(int argc, char* argv[])
}
}
FINALIZE();
return EXIT_SUCCESS;
}

View File

@@ -5,7 +5,48 @@
BEGIN_HADRONS_NAMESPACE
namespace Contractor
{
class GlobalPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(GlobalPar,
TrajRange, trajCounter,
unsigned int, nt,
std::string, diskVectorDir,
std::string, output);
};
class A2AMatrixPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(A2AMatrixPar,
std::string, file,
std::string, dataset,
unsigned int, cacheSize,
std::string, name);
};
class ProductPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(ProductPar,
std::string, terms,
std::vector<std::string>, times,
std::string, translations,
bool, translationAverage);
};
class CorrelatorResult: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(CorrelatorResult,
std::vector<Contractor::A2AMatrixPar>, a2aMatrix,
ProductPar, contraction,
std::vector<unsigned int>, times,
std::vector<ComplexD>, correlator);
};
}
END_HADRONS_NAMESPACE

View File

@@ -7,7 +7,7 @@ HadronsFermionEP64To32_SOURCES = EigenPackCast.cc
HadronsFermionEP64To32_CXXFLAGS = $(AM_CXXFLAGS) -DFIN=WilsonImplD::FermionField -DFOUT=WilsonImplF::FermionField
HadronsFermionEP64To32_LDADD = ../libHadrons.a ../../Grid/libGrid.a
HadronsContractor_SOURCES = Contractor.cc
HadronsContractor_SOURCES = Contractor.cc Contractor.hpp
HadronsContractor_LDADD = ../libHadrons.a ../../Grid/libGrid.a
HadronsContractorBenchmark_SOURCES = ContractorBenchmark.cc