1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-07-28 18:27:07 +01:00

Merge branch 'develop' into feature/gpu-port

This commit is contained in:
Peter Boyle
2018-12-13 05:11:34 +00:00
647 changed files with 49155 additions and 11160 deletions

View File

@@ -0,0 +1,454 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Utilities/Contractor.cc
Copyright (C) 2015-2018
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Hadrons/Global.hpp>
#include <Hadrons/A2AMatrix.hpp>
#include <Hadrons/DiskVector.hpp>
#include <Hadrons/TimerArray.hpp>
using namespace Grid;
using namespace QCD;
using namespace Hadrons;
#define TIME_MOD(t) (((t) + par.global.nt) % par.global.nt)
namespace Contractor
{
class TrajRange: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(TrajRange,
unsigned int, start,
unsigned int, end,
unsigned int, step);
};
class GlobalPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(GlobalPar,
TrajRange, trajCounter,
unsigned int, nt,
std::string, diskVectorDir,
std::string, output);
};
class A2AMatrixPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(A2AMatrixPar,
std::string, file,
std::string, dataset,
unsigned int, cacheSize,
std::string, name);
};
class ProductPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(ProductPar,
std::string, terms,
std::vector<std::string>, times,
std::string, translations,
bool, translationAverage);
};
class CorrelatorResult: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(CorrelatorResult,
std::vector<Contractor::A2AMatrixPar>, a2aMatrix,
ProductPar, contraction,
std::vector<unsigned int>, times,
std::vector<ComplexD>, correlator);
};
}
struct ContractorPar
{
Contractor::GlobalPar global;
std::vector<Contractor::A2AMatrixPar> a2aMatrix;
std::vector<Contractor::ProductPar> product;
};
void makeTimeSeq(std::vector<std::vector<unsigned int>> &timeSeq,
const std::vector<std::set<unsigned int>> &times,
std::vector<unsigned int> &current,
const unsigned int depth)
{
if (depth > 0)
{
for (auto t: times[times.size() - depth])
{
current[times.size() - depth] = t;
makeTimeSeq(timeSeq, times, current, depth - 1);
}
}
else
{
timeSeq.push_back(current);
}
}
void makeTimeSeq(std::vector<std::vector<unsigned int>> &timeSeq,
const std::vector<std::set<unsigned int>> &times)
{
std::vector<unsigned int> current(times.size());
makeTimeSeq(timeSeq, times, current, times.size());
}
void saveCorrelator(const Contractor::CorrelatorResult &result, const std::string dir,
const unsigned int dt, const unsigned int traj)
{
std::string fileStem = "", filename;
std::vector<std::string> terms = strToVec<std::string>(result.contraction.terms);
for (unsigned int i = 0; i < terms.size() - 1; i++)
{
fileStem += terms[i] + "_" + std::to_string(result.times[i]) + "_";
}
fileStem += terms.back();
if (!result.contraction.translationAverage)
{
fileStem += "_dt_" + std::to_string(dt);
}
filename = dir + "/" + RESULT_FILE_NAME(fileStem, traj);
std::cout << "Saving correlator to '" << filename << "'" << std::endl;
makeFileDir(dir);
ResultWriter writer(filename);
write(writer, fileStem, result);
}
std::set<unsigned int> parseTimeRange(const std::string str, const unsigned int nt)
{
std::regex rex("([0-9]+)|(([0-9]+)\\.\\.([0-9]+))");
std::smatch sm;
std::vector<std::string> rstr = strToVec<std::string>(str);
std::set<unsigned int> tSet;
for (auto &s: rstr)
{
std::regex_match(s, sm, rex);
if (sm[1].matched)
{
unsigned int t;
t = std::stoi(sm[1].str());
if (t >= nt)
{
HADRONS_ERROR(Range, "time out of range (from expression '" + str + "')");
}
tSet.insert(t);
}
else if (sm[2].matched)
{
unsigned int ta, tb;
ta = std::stoi(sm[3].str());
tb = std::stoi(sm[4].str());
if ((ta >= nt) or (tb >= nt))
{
HADRONS_ERROR(Range, "time out of range (from expression '" + str + "')");
}
for (unsigned int ti = ta; ti <= tb; ++ti)
{
tSet.insert(ti);
}
}
}
return tSet;
}
struct Sec
{
Sec(const double usec)
{
seconds = usec/1.0e6;
}
double seconds;
};
inline std::ostream & operator<< (std::ostream& s, const Sec &&sec)
{
s << std::setw(10) << sec.seconds << " sec";
return s;
}
struct Flops
{
Flops(const double flops, const double fusec)
{
gFlopsPerSec = flops/fusec/1.0e3;
}
double gFlopsPerSec;
};
inline std::ostream & operator<< (std::ostream& s, const Flops &&f)
{
s << std::setw(10) << f.gFlopsPerSec << " GFlop/s";
return s;
}
struct Bytes
{
Bytes(const double bytes, const double busec)
{
gBytesPerSec = bytes/busec*1.0e6/1024/1024/1024;
}
double gBytesPerSec;
};
inline std::ostream & operator<< (std::ostream& s, const Bytes &&b)
{
s << std::setw(10) << b.gBytesPerSec << " GB/s";
return s;
}
int main(int argc, char* argv[])
{
// parse command line
std::string parFilename;
if (argc != 2)
{
std::cerr << "usage: " << argv[0] << " <parameter file>";
std::cerr << std::endl;
return EXIT_FAILURE;
}
parFilename = argv[1];
// parse parameter file
ContractorPar par;
unsigned int nMat, nCont;
XmlReader reader(parFilename);
read(reader, "global", par.global);
read(reader, "a2aMatrix", par.a2aMatrix);
read(reader, "product", par.product);
nMat = par.a2aMatrix.size();
nCont = par.product.size();
// create diskvectors
std::map<std::string, EigenDiskVector<ComplexD>> a2aMat;
unsigned int cacheSize;
for (auto &p: par.a2aMatrix)
{
std::string dirName = par.global.diskVectorDir + "/" + p.name;
a2aMat.emplace(p.name, EigenDiskVector<ComplexD>(dirName, par.global.nt, p.cacheSize));
}
// trajectory loop
for (unsigned int traj = par.global.trajCounter.start;
traj < par.global.trajCounter.end; traj += par.global.trajCounter.step)
{
std::cout << ":::::::: Trajectory " << traj << std::endl;
// load data
for (auto &p: par.a2aMatrix)
{
std::string filename = p.file;
double t, size;
tokenReplace(filename, "traj", traj);
std::cout << "======== Loading '" << filename << "'" << std::endl;
A2AMatrixIo<HADRONS_A2AM_IO_TYPE> a2aIo(filename, p.dataset, par.global.nt);
a2aIo.load(a2aMat.at(p.name), &t);
std::cout << "Read " << a2aIo.getSize() << " bytes in " << t/1.0e6
<< " sec, " << a2aIo.getSize()/t*1.0e6/1024/1024 << " MB/s" << std::endl;
}
// contract
EigenDiskVector<ComplexD>::Matrix buf;
for (auto &p: par.product)
{
std::vector<std::string> term = strToVec<std::string>(p.terms);
std::vector<std::set<unsigned int>> times;
std::vector<std::vector<unsigned int>> timeSeq;
std::set<unsigned int> translations;
std::vector<A2AMatrixTr<ComplexD>> lastTerm(par.global.nt);
A2AMatrix<ComplexD> prod, buf, tmp;
TimerArray tAr;
double fusec, busec, flops, bytes, tusec;
Contractor::CorrelatorResult result;
tAr.startTimer("Total");
std::cout << "======== Contraction tr(";
for (unsigned int g = 0; g < term.size(); ++g)
{
std::cout << term[g] << ((g == term.size() - 1) ? ')' : '*');
}
std::cout << std::endl;
if (term.size() != p.times.size() + 1)
{
HADRONS_ERROR(Size, "number of terms (" + std::to_string(term.size())
+ ") different from number of times ("
+ std::to_string(p.times.size() + 1) + ")");
}
for (auto &s: p.times)
{
times.push_back(parseTimeRange(s, par.global.nt));
}
for (auto &m: par.a2aMatrix)
{
if (std::find(result.a2aMatrix.begin(), result.a2aMatrix.end(), m) == result.a2aMatrix.end())
{
result.a2aMatrix.push_back(m);
tokenReplace(result.a2aMatrix.back().file, "traj", traj);
}
}
result.contraction = p;
result.correlator.resize(par.global.nt, 0.);
translations = parseTimeRange(p.translations, par.global.nt);
makeTimeSeq(timeSeq, times);
std::cout << timeSeq.size()*translations.size()*(term.size() - 2) << " A*B, "
<< timeSeq.size()*translations.size()*par.global.nt << " tr(A*B)"
<< std::endl;
std::cout << "* Caching transposed last term" << std::endl;
for (unsigned int t = 0; t < par.global.nt; ++t)
{
tAr.startTimer("Disk vector overhead");
const A2AMatrix<ComplexD> &ref = a2aMat.at(term.back())[t];
tAr.stopTimer("Disk vector overhead");
tAr.startTimer("Transpose caching");
lastTerm[t].resize(ref.rows(), ref.cols());
parallel_for (unsigned int j = 0; j < ref.cols(); ++j)
for (unsigned int i = 0; i < ref.rows(); ++i)
{
lastTerm[t](i, j) = ref(i, j);
}
tAr.stopTimer("Transpose caching");
}
bytes = par.global.nt*lastTerm[0].rows()*lastTerm[0].cols()*sizeof(ComplexD);
std::cout << Sec(tAr.getDTimer("Transpose caching")) << " "
<< Bytes(bytes, tAr.getDTimer("Transpose caching")) << std::endl;
for (unsigned int i = 0; i < timeSeq.size(); ++i)
{
unsigned int dti = 0;
auto &t = timeSeq[i];
result.times = t;
for (unsigned int tLast = 0; tLast < par.global.nt; ++tLast)
{
result.correlator[tLast] = 0.;
}
for (auto &dt: translations)
{
std::cout << "* Step " << i*translations.size() + dti + 1
<< "/" << timeSeq.size()*translations.size()
<< " -- positions= " << t << ", dt= " << dt << std::endl;
if (term.size() > 2)
{
std::cout << std::setw(8) << "products";
}
flops = 0.;
bytes = 0.;
fusec = tAr.getDTimer("A*B algebra");
busec = tAr.getDTimer("A*B total");
tAr.startTimer("Linear algebra");
tAr.startTimer("Disk vector overhead");
prod = a2aMat.at(term[0])[TIME_MOD(t[0] + dt)];
tAr.stopTimer("Disk vector overhead");
for (unsigned int j = 1; j < term.size() - 1; ++j)
{
tAr.startTimer("Disk vector overhead");
const A2AMatrix<ComplexD> &ref = a2aMat.at(term[j])[TIME_MOD(t[j] + dt)];
tAr.stopTimer("Disk vector overhead");
tAr.startTimer("A*B total");
tAr.startTimer("A*B algebra");
A2AContraction::mul(tmp, prod, ref);
tAr.stopTimer("A*B algebra");
flops += A2AContraction::mulFlops(prod, ref);
prod = tmp;
tAr.stopTimer("A*B total");
bytes += 3.*tmp.rows()*tmp.cols()*sizeof(ComplexD);
}
if (term.size() > 2)
{
std::cout << Sec(tAr.getDTimer("A*B total") - busec) << " "
<< Flops(flops, tAr.getDTimer("A*B algebra") - fusec) << " "
<< Bytes(bytes, tAr.getDTimer("A*B total") - busec) << std::endl;
}
std::cout << std::setw(8) << "traces";
flops = 0.;
bytes = 0.;
fusec = tAr.getDTimer("tr(A*B)");
busec = tAr.getDTimer("tr(A*B)");
for (unsigned int tLast = 0; tLast < par.global.nt; ++tLast)
{
tAr.startTimer("tr(A*B)");
A2AContraction::accTrMul(result.correlator[TIME_MOD(tLast - dt)], prod, lastTerm[tLast]);
tAr.stopTimer("tr(A*B)");
flops += A2AContraction::accTrMulFlops(prod, lastTerm[tLast]);
bytes += 2.*prod.rows()*prod.cols()*sizeof(ComplexD);
}
tAr.stopTimer("Linear algebra");
std::cout << Sec(tAr.getDTimer("tr(A*B)") - busec) << " "
<< Flops(flops, tAr.getDTimer("tr(A*B)") - fusec) << " "
<< Bytes(bytes, tAr.getDTimer("tr(A*B)") - busec) << std::endl;
if (!p.translationAverage)
{
saveCorrelator(result, par.global.output, dt, traj);
for (unsigned int tLast = 0; tLast < par.global.nt; ++tLast)
{
result.correlator[tLast] = 0.;
}
}
dti++;
}
if (p.translationAverage)
{
for (unsigned int tLast = 0; tLast < par.global.nt; ++tLast)
{
result.correlator[tLast] /= translations.size();
}
saveCorrelator(result, par.global.output, 0, traj);
}
}
tAr.stopTimer("Total");
printTimeProfile(tAr.getTimings(), tAr.getTimer("Total"));
}
}
return EXIT_SUCCESS;
}

View File

@@ -0,0 +1,12 @@
#ifndef Hadrons_Contractor_hpp_
#define Hadrons_Contractor_hpp_
#include <Hadrons/Global.hpp>
BEGIN_HADRONS_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_Contractor_hpp_

View File

@@ -0,0 +1,434 @@
#include <Hadrons/Global.hpp>
#include <Hadrons/A2AMatrix.hpp>
#ifdef USE_MKL
#include "mkl.h"
#include "mkl_cblas.h"
#endif
using namespace Grid;
using namespace Hadrons;
#ifdef GRID_COMMS_MPI3
#define GET_RANK(rank, nMpi) \
MPI_Comm_size(MPI_COMM_WORLD, &(nMpi));\
MPI_Comm_rank(MPI_COMM_WORLD, &(rank))
#define BARRIER() MPI_Barrier(MPI_COMM_WORLD)
#define INIT() MPI_Init(NULL, NULL)
#define FINALIZE() MPI_Finalize()
#else
#define GET_RANK(rank, nMpi) (nMpi) = 1; (rank) = 0
#define BARRIER()
#define INIT()
#define FINALIZE()
#endif
template <typename Function, typename MatLeft, typename MatRight>
inline void trBenchmark(const std::string name, const MatLeft &left,
const MatRight &right, const ComplexD ref, Function fn)
{
double t, flops, bytes, n = left[0].rows()*left[0].cols();
unsigned int nMat = left.size();
int nMpi, rank;
ComplexD buf;
t = 0.;
GET_RANK(rank, nMpi);
t = -usecond();
BARRIER();
for (unsigned int i = rank*nMat/nMpi; i < (rank+1)*nMat/nMpi; ++i)
{
fn(buf, left[i], right[i]);
}
BARRIER();
t += usecond();
flops = nMat*(6.*n + 2.*(n - 1.));
bytes = nMat*(2.*n*sizeof(ComplexD));
if (rank == 0)
{
std::cout << std::setw(34) << name << ": diff= "
<< std::setw(12) << std::norm(buf-ref)
<< std::setw(10) << t/1.0e6 << " sec "
<< std::setw(10) << flops/t/1.0e3 << " GFlop/s "
<< std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s "
<< std::endl;
}
::sleep(1);
}
template <typename Function, typename MatV, typename Mat>
inline void mulBenchmark(const std::string name, const MatV &left,
const MatV &right, const Mat &ref, Function fn)
{
double t, flops, bytes;
double nr = left[0].rows(), nc = left[0].cols(), n = nr*nc;
unsigned int nMat = left.size();
int nMpi, rank;
Mat buf(left[0].rows(), left[0].rows());
t = 0.;
GET_RANK(rank, nMpi);
t = -usecond();
BARRIER();
for (unsigned int i = rank*nMat/nMpi; i < (rank+1)*nMat/nMpi; ++i)
{
fn(buf, left[i], right[i]);
}
BARRIER();
t += usecond();
flops = nMat*(nr*nr*(6.*nc + 2.*(nc - 1.)));
bytes = nMat*(2*nc*nr*sizeof(ComplexD));
if (rank == 0)
{
std::cout << std::setw(34) << name << ": diff= "
<< std::setw(12) << (buf-ref).squaredNorm()
<< std::setw(10) << t/1.0e6 << " sec "
<< std::setw(10) << flops/t/1.0e3 << " GFlop/s "
<< std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s "
<< std::endl;
}
::sleep(1);
}
#ifdef USE_MKL
template <typename MatLeft, typename MatRight>
static inline void zdotuRow(ComplexD &res, const unsigned int aRow,
const MatLeft &a, const MatRight &b)
{
const ComplexD *aPt, *bPt;
unsigned int aInc, bInc;
if (MatLeft::Options == Eigen::RowMajor)
{
aPt = a.data() + aRow*a.cols();
aInc = 1;
}
else if (MatLeft::Options == Eigen::ColMajor)
{
aPt = a.data() + aRow;
aInc = a.rows();
}
if (MatRight::Options == Eigen::RowMajor)
{
bPt = b.data() + aRow;
bInc = b.cols();
}
else if (MatRight::Options == Eigen::ColMajor)
{
bPt = b.data() + aRow*b.rows();
bInc = 1;
}
cblas_zdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
}
template <typename MatLeft, typename MatRight>
static inline void zdotuCol(ComplexD &res, const unsigned int aCol,
const MatLeft &a, const MatRight &b)
{
const ComplexD *aPt, *bPt;
unsigned int aInc, bInc;
if (MatLeft::Options == Eigen::RowMajor)
{
aPt = a.data() + aCol;
aInc = a.cols();
}
else if (MatLeft::Options == Eigen::ColMajor)
{
aPt = a.data() + aCol*a.rows();
aInc = 1;
}
if (MatRight::Options == Eigen::RowMajor)
{
bPt = b.data() + aCol*b.cols();
bInc = 1;
}
else if (MatRight::Options == Eigen::ColMajor)
{
bPt = b.data() + aCol;
bInc = b.rows();
}
cblas_zdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
}
#endif
template <typename MatLeft, typename MatRight>
void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigned int nMat)
{
std::vector<MatLeft> left;
std::vector<MatRight> right;
MatRight buf;
ComplexD ref;
int rank, nMpi;
left.resize(nMat, MatLeft::Random(ni, nj));
right.resize(nMat, MatRight::Random(nj, ni));
GET_RANK(rank, nMpi);
if (rank == 0)
{
std::cout << "==== tr(A*B) benchmarks" << std::endl;
std::cout << "A matrices use ";
if (MatLeft::Options == Eigen::RowMajor)
{
std::cout << "row-major ordering" << std::endl;
}
else if (MatLeft::Options == Eigen::ColMajor)
{
std::cout << "col-major ordering" << std::endl;
}
std::cout << "B matrices use ";
if (MatRight::Options == Eigen::RowMajor)
{
std::cout << "row-major ordering" << std::endl;
}
else if (MatRight::Options == Eigen::ColMajor)
{
std::cout << "col-major ordering" << std::endl;
}
std::cout << std::endl;
}
BARRIER();
ref = (left.back()*right.back()).trace();
trBenchmark("Hadrons A2AContraction::accTrMul", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b)
{
res = 0.;
A2AContraction::accTrMul(res, a, b);
});
trBenchmark("Naive loop rows first", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b)
{
auto nr = a.rows(), nc = a.cols();
res = 0.;
parallel_for (unsigned int i = 0; i < nr; ++i)
{
ComplexD tmp = 0.;
for (unsigned int j = 0; j < nc; ++j)
{
tmp += a(i, j)*b(j, i);
}
parallel_critical
{
res += tmp;
}
}
});
trBenchmark("Naive loop cols first", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b)
{
auto nr = a.rows(), nc = a.cols();
res = 0.;
parallel_for (unsigned int j = 0; j < nc; ++j)
{
ComplexD tmp = 0.;
for (unsigned int i = 0; i < nr; ++i)
{
tmp += a(i, j)*b(j, i);
}
parallel_critical
{
res += tmp;
}
}
});
trBenchmark("Eigen tr(A*B)", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b)
{
res = (a*b).trace();
});
trBenchmark("Eigen row-wise dot", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b)
{
res = 0.;
parallel_for (unsigned int r = 0; r < a.rows(); ++r)
{
ComplexD tmp;
tmp = a.row(r).conjugate().dot(b.col(r));
parallel_critical
{
res += tmp;
}
}
});
trBenchmark("Eigen col-wise dot", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b)
{
res = 0.;
parallel_for (unsigned int c = 0; c < a.cols(); ++c)
{
ComplexD tmp;
tmp = a.col(c).conjugate().dot(b.row(c));
parallel_critical
{
res += tmp;
}
}
});
trBenchmark("Eigen Hadamard", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b)
{
res = a.cwiseProduct(b.transpose()).sum();
});
#ifdef USE_MKL
trBenchmark("MKL row-wise zdotu", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b)
{
res = 0.;
parallel_for (unsigned int r = 0; r < a.rows(); ++r)
{
ComplexD tmp;
zdotuRow(tmp, r, a, b);
parallel_critical
{
res += tmp;
}
}
});
trBenchmark("MKL col-wise zdotu", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b)
{
res = 0.;
parallel_for (unsigned int c = 0; c < a.cols(); ++c)
{
ComplexD tmp;
zdotuCol(tmp, c, a, b);
parallel_critical
{
res += tmp;
}
}
});
#endif
BARRIER();
if (rank == 0)
{
std::cout << std::endl;
}
}
template <typename Mat>
void fullMulBenchmark(const unsigned int ni, const unsigned int nj, const unsigned int nMat)
{
std::vector<Mat> left, right;
Mat ref;
int rank, nMpi;
left.resize(nMat, Mat::Random(ni, nj));
right.resize(nMat, Mat::Random(nj, ni));
GET_RANK(rank, nMpi);
if (rank == 0)
{
std::cout << "==== A*B benchmarks" << std::endl;
std::cout << "all matrices use ";
if (Mat::Options == Eigen::RowMajor)
{
std::cout << "row-major ordering" << std::endl;
}
else if (Mat::Options == Eigen::ColMajor)
{
std::cout << "col-major ordering" << std::endl;
}
std::cout << std::endl;
}
BARRIER();
ref = left.back()*right.back();
mulBenchmark("Hadrons A2AContraction::mul", left, right, ref,
[](Mat &res, const Mat &a, const Mat &b)
{
A2AContraction::mul(res, a, b);
});
mulBenchmark("Eigen A*B", left, right, ref,
[](Mat &res, const Mat &a, const Mat &b)
{
res = a*b;
});
#ifdef USE_MKL
mulBenchmark("MKL A*B", left, right, ref,
[](Mat &res, const Mat &a, const Mat &b)
{
const ComplexD one(1., 0.), zero(0., 0.);
if (Mat::Options == Eigen::RowMajor)
{
cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
res.data(), res.cols());
}
else if (Mat::Options == Eigen::ColMajor)
{
cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
res.data(), res.rows());
}
});
#endif
BARRIER();
if (rank == 0)
{
std::cout << std::endl;
}
}
int main(int argc, char *argv[])
{
// parse command line
Eigen::Index ni, nj, nMat;
int nMpi, rank;
if (argc != 4)
{
std::cerr << "usage: " << argv[0] << " <Ni> <Nj> <#matrices>";
std::cerr << std::endl;
return EXIT_FAILURE;
}
ni = std::stoi(argv[1]);
nj = std::stoi(argv[2]);
nMat = std::stoi(argv[3]);
INIT();
GET_RANK(rank, nMpi);
if (rank == 0)
{
std::cout << "\n*** ALL-TO-ALL MATRIX CONTRACTION BENCHMARK ***\n" << std::endl;
std::cout << nMat << " couples of " << ni << "x" << nj << " matrices\n" << std::endl;
std::cout << nMpi << " MPI processes" << std::endl;
#ifdef GRID_OMP
#pragma omp parallel
{
#pragma omp single
std::cout << omp_get_num_threads() << " threads\n" << std::endl;
}
#else
std::cout << "Single-threaded\n" << std::endl;
#endif
#ifdef EIGEN_USE_MKL_ALL
std::cout << "Eigen uses the MKL" << std::endl;
#endif
std::cout << "Eigen uses " << Eigen::nbThreads() << " threads" << std::endl;
#ifdef USE_MKL
std::cout << "MKL uses " << mkl_get_max_threads() << " threads" << std::endl;
#endif
std::cout << std::endl;
}
fullTrBenchmark<A2AMatrix<ComplexD>, A2AMatrix<ComplexD>>(ni, nj, nMat);
fullTrBenchmark<A2AMatrix<ComplexD>, A2AMatrixTr<ComplexD>>(ni, nj, nMat);
fullTrBenchmark<A2AMatrixTr<ComplexD>, A2AMatrix<ComplexD>>(ni, nj, nMat);
fullTrBenchmark<A2AMatrixTr<ComplexD>, A2AMatrixTr<ComplexD>>(ni, nj, nMat);
fullMulBenchmark<A2AMatrix<ComplexD>>(ni, nj, nMat);
fullMulBenchmark<A2AMatrixTr<ComplexD>>(ni, nj, nMat);
FINALIZE();
return EXIT_SUCCESS;
}

View File

@@ -0,0 +1,217 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Utilities/EigenPackCast.cc
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Hadrons/EigenPack.hpp>
#include <Hadrons/Environment.hpp>
using namespace Grid;
using namespace QCD;
using namespace Hadrons;
template <typename FOut, typename FIn>
void convert(const std::string outFilename, const std::string inFilename,
const unsigned int Ls, const bool rb, const unsigned int size,
const bool multiFile, const bool testRead)
{
assert(outFilename != inFilename);
typedef EigenPack<FOut> EPOut;
typedef EigenPack<FIn> EPIn;
typedef typename FOut::vector_type VTypeOut;
typedef typename FIn::vector_type VTypeIn;
std::shared_ptr<GridCartesian> gInBase, gOutBase, gIn5, gOut5;
std::shared_ptr<GridRedBlackCartesian> rbgIn, rbgOut;
GridBase *gIn, *gOut;
auto dim = GridDefaultLatt();
unsigned int nd = dim.size();
auto simdOut = GridDefaultSimd(nd, VTypeOut::Nsimd());
auto simdIn = GridDefaultSimd(nd, VTypeIn::Nsimd());
gOutBase.reset(SpaceTimeGrid::makeFourDimGrid(dim, simdOut, GridDefaultMpi()));
gInBase.reset(SpaceTimeGrid::makeFourDimGrid(dim, simdIn, GridDefaultMpi()));
if (rb)
{
if (Ls > 1)
{
rbgOut.reset(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, gOutBase.get()));
rbgIn.reset(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, gInBase.get()));
}
else
{
rbgOut.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(gOutBase.get()));
rbgIn.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(gInBase.get()));
}
gOut = rbgOut.get();
gIn = rbgIn.get();
}
else
{
if (Ls > 1)
{
gOut5.reset(SpaceTimeGrid::makeFiveDimGrid(Ls, gOutBase.get()));
gIn5.reset(SpaceTimeGrid::makeFiveDimGrid(Ls, gInBase.get()));
gOut = gOut5.get();
gIn = gIn5.get();
}
else
{
gOut = gOutBase.get();
gIn = gInBase.get();
}
}
FOut bufOut(gOut);
FIn bufIn(gIn), testIn(gIn);
ScidacWriter binWriter(gOut->IsBoss());
ScidacReader binReader;
PackRecord record;
RealD eval;
LOG(Message) << "==== EIGENPACK CONVERSION" << std::endl;
LOG(Message) << "Lattice : " << gIn->GlobalDimensions() << std::endl;
LOG(Message) << "Checkerboarded: " << (rb ? "yes" : "no") << std::endl;
LOG(Message) << "In path : " << inFilename << std::endl;
LOG(Message) << "In type : " << typeName<FIn>() << std::endl;
LOG(Message) << "Out path : " << outFilename << std::endl;
LOG(Message) << "Out type : " << typeName<FOut>() << std::endl;
LOG(Message) << "#vectors : " << size << std::endl;
LOG(Message) << "Multifile : " << (multiFile ? "yes" : "no") << std::endl;
LOG(Message) << "Test read : " << (testRead ? "yes" : "no") << std::endl;
if (multiFile)
{
for(unsigned int k = 0; k < size; ++k)
{
std::string outV = outFilename + "/v" + std::to_string(k) + ".bin";
std::string inV = inFilename + "/v" + std::to_string(k) + ".bin";
LOG(Message) << "==== Converting vector " << k << std::endl;
LOG(Message) << "In : " << inV << std::endl;
LOG(Message) << "Out: " << outV << std::endl;
// conversion
LOG(Message) << "-- Doing conversion" << std::endl;
makeFileDir(outV, gOut);
binWriter.open(outV);
binReader.open(inV);
EigenPackIo::readHeader(record, binReader);
EigenPackIo::writeHeader(binWriter, record);
EigenPackIo::readElement<FIn>(bufIn, eval, k, binReader);
EigenPackIo::writeElement<FIn, FOut>(binWriter, bufIn, eval, k, &bufOut, &testIn);
binWriter.close();
binReader.close();
// read test
if (testRead)
{
LOG(Message) << "-- Test read" << std::endl;
binReader.open(outV);
EigenPackIo::readElement<FOut>(bufOut, eval, k, binReader);
binReader.close();
}
}
}
else
{
// conversion
LOG(Message) << "-- Doing conversion" << std::endl;
makeFileDir(outFilename, gOut);
binWriter.open(outFilename);
binReader.open(inFilename);
EigenPackIo::readHeader(record, binReader);
EigenPackIo::writeHeader(binWriter, record);
for(unsigned int k = 0; k < size; ++k)
{
EigenPackIo::readElement<FIn>(bufIn, eval, k, binReader);
EigenPackIo::writeElement<FIn, FOut>(binWriter, bufIn, eval, k, &bufOut, &testIn);
}
binWriter.close();
binReader.close();
// read test
if (testRead)
{
LOG(Message) << "-- Test read" << std::endl;
binReader.open(outFilename);
EigenPackIo::readHeader(record, binReader);
for(unsigned int k = 0; k < size; ++k)
{
EigenPackIo::readElement<FOut>(bufOut, eval, k, binReader);
}
binReader.close();
}
}
}
#ifndef FOUT
#warning "FOUT undefined (set to WilsonImplF::FermionField by default)"
#define FOUT WilsonImplF::FermionField
#endif
#ifndef FIN
#warning "FIN undefined (set to WilsonImplD::FermionField by default)"
#define FIN WilsonImplD::FermionField
#endif
int main(int argc, char *argv[])
{
// parse command line
std::string outFilename, inFilename;
unsigned int size, Ls;
bool rb, multiFile, testRead;
if (argc < 8)
{
std::cerr << "usage: " << argv[0] << " <out eigenpack> <in eigenpack> <Ls> <red-black {0|1}> <#vector> <multifile {0|1}> <test read {0|1}> [Grid options]";
std::cerr << std::endl;
std::exit(EXIT_FAILURE);
}
outFilename = argv[1];
inFilename = argv[2];
Ls = std::stoi(std::string(argv[3]));
rb = (std::string(argv[4]) != "0");
size = std::stoi(std::string(argv[5]));
multiFile = (std::string(argv[6]) != "0");
testRead = (std::string(argv[7]) != "0");
// initialization
Grid_init(&argc, &argv);
initLogger();
// execution
try
{
convert<FOUT, FIN>(outFilename, inFilename, Ls, rb, size, multiFile, testRead);
}
catch (const std::exception& e)
{
Exceptions::abort(e);
}
// epilogue
LOG(Message) << "Grid is finalizing now" << std::endl;
Grid_finalize();
return EXIT_SUCCESS;
}

View File

@@ -0,0 +1,80 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Hadrons/Utilities/HadronsXmlRun.cc
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Hadrons/Application.hpp>
using namespace Grid;
using namespace Hadrons;
int main(int argc, char *argv[])
{
// parse command line
std::string parameterFileName, scheduleFileName = "";
if (argc < 2)
{
std::cerr << "usage: " << argv[0] << " <parameter file> [<precomputed schedule>] [Grid options]";
std::cerr << std::endl;
std::exit(EXIT_FAILURE);
}
parameterFileName = argv[1];
if (argc > 2)
{
if (argv[2][0] != '-')
{
scheduleFileName = argv[2];
}
}
// initialization
Grid_init(&argc, &argv);
// execution
try
{
Application application(parameterFileName);
application.parseParameterFile(parameterFileName);
if (!scheduleFileName.empty())
{
application.loadSchedule(scheduleFileName);
}
application.run();
}
catch (const std::exception& e)
{
Exceptions::abort(e);
}
// epilogue
LOG(Message) << "Grid is finalizing now" << std::endl;
Grid_finalize();
return EXIT_SUCCESS;
}

View File

@@ -0,0 +1,14 @@
bin_PROGRAMS = HadronsXmlRun HadronsFermionEP64To32 HadronsContractor HadronsContractorBenchmark
HadronsXmlRun_SOURCES = HadronsXmlRun.cc
HadronsXmlRun_LDADD = ../libHadrons.a ../../Grid/libGrid.a
HadronsFermionEP64To32_SOURCES = EigenPackCast.cc
HadronsFermionEP64To32_CXXFLAGS = $(AM_CXXFLAGS) -DFIN=WilsonImplD::FermionField -DFOUT=WilsonImplF::FermionField
HadronsFermionEP64To32_LDADD = ../libHadrons.a ../../Grid/libGrid.a
HadronsContractor_SOURCES = Contractor.cc
HadronsContractor_LDADD = ../libHadrons.a ../../Grid/libGrid.a
HadronsContractorBenchmark_SOURCES = ContractorBenchmark.cc
HadronsContractorBenchmark_LDADD = ../libHadrons.a ../../Grid/libGrid.a