1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-27 22:25:56 +01:00

Hadrons: contractor performance fix

This commit is contained in:
Antonin Portelli 2018-11-16 20:59:49 +00:00
parent 8b007b5c24
commit f592ec8baa
3 changed files with 52 additions and 104 deletions

View File

@ -59,6 +59,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define parallel_for_internal PARALLEL_FOR_LOOP_INTERN for #define parallel_for_internal PARALLEL_FOR_LOOP_INTERN for
#define parallel_for_nest2 PARALLEL_NESTED_LOOP2 for #define parallel_for_nest2 PARALLEL_NESTED_LOOP2 for
#define parallel_for_nest5 PARALLEL_NESTED_LOOP5 for #define parallel_for_nest5 PARALLEL_NESTED_LOOP5 for
#define parallel_critical PARALLEL_CRITICAL
namespace Grid { namespace Grid {

View File

@ -167,51 +167,38 @@ public:
template <typename C, typename MatLeft, typename MatRight> template <typename C, typename MatLeft, typename MatRight>
static inline void accTrMul(C &acc, const MatLeft &a, const MatRight &b) static inline void accTrMul(C &acc, const MatLeft &a, const MatRight &b)
{ {
int nThreads = GridThread::GetThreads();
std::vector<C> tacc(nThreads, 0.);
if ((MatLeft::Options == Eigen::RowMajor) and if ((MatLeft::Options == Eigen::RowMajor) and
(MatRight::Options == Eigen::ColMajor)) (MatRight::Options == Eigen::ColMajor))
{ {
parallel_for (int thr = 0; thr < nThreads; ++thr) parallel_for (unsigned int r = 0; r < a.rows(); ++r)
{ {
int rt, nr;
GridThread::GetWork(a.rows(), thr, nr, rt);
for (unsigned int r = rt; r < nr + rt; ++r)
{
#ifdef USE_MKL
C tmp; C tmp;
#ifdef USE_MKL
dotuRow(tmp, r, a, b); dotuRow(tmp, r, a, b);
tacc[thr] += tmp;
#else #else
tacc[thr] += a.row(r).conjugate().dot(b.col(r)); tmp = a.row(r).conjugate().dot(b.col(r));
#endif #endif
parallel_critical
{
acc += tmp;
} }
} }
} }
else else
{ {
parallel_for (int thr = 0; thr < nThreads; ++thr) parallel_for (unsigned int c = 0; c < a.cols(); ++c)
{ {
int ct, nc;
GridThread::GetWork(a.cols(), thr, nc, ct);
for (unsigned int c = ct; c < nc + ct; ++c)
{
#ifdef USE_MKL
C tmp; C tmp;
#ifdef USE_MKL
dotuCol(tmp, c, a, b); dotuCol(tmp, c, a, b);
tacc[thr] += tmp;
#else #else
tacc[thr] += a.col(c).conjugate().dot(b.row(c)); tmp = a.col(c).conjugate().dot(b.row(c));
#endif #endif
} parallel_critical
}
}
for (int thr = 0; thr < nThreads; ++thr)
{ {
acc += tacc[thr]; acc += tmp;
}
}
} }
} }

View File

@ -199,49 +199,41 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
trBenchmark("Naive loop rows first", left, right, ref, trBenchmark("Naive loop rows first", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b) [](ComplexD &res, const MatLeft &a, const MatRight &b)
{ {
int nThreads = GridThread::GetThreads(); auto nr = a.rows(), nc = a.cols();
std::vector<ComplexD> tres(nThreads, 0.);
res = 0.; res = 0.;
parallel_for (int thr = 0; thr < nThreads; ++thr) parallel_for (unsigned int i = 0; i < nr; ++i)
{ {
int rt, nr; ComplexD tmp = 0.;
auto nc = a.cols();
GridThread::GetWork(a.rows(), thr, nr, rt);
for (unsigned int i = rt; i < nr + rt; ++i)
for (unsigned int j = 0; j < nc; ++j) for (unsigned int j = 0; j < nc; ++j)
{ {
tres[thr] += a(i, j)*b(j, i); tmp += a(i, j)*b(j, i);
} }
} parallel_critical
for (int thr = 0; thr < nThreads; ++thr)
{ {
res += tres[thr]; res += tmp;
}
} }
}); });
trBenchmark("Naive loop cols first", left, right, ref, trBenchmark("Naive loop cols first", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b) [](ComplexD &res, const MatLeft &a, const MatRight &b)
{ {
int nThreads = GridThread::GetThreads(); auto nr = a.rows(), nc = a.cols();
std::vector<ComplexD> tres(nThreads, 0.);
res = 0.; res = 0.;
parallel_for (int thr = 0; thr < nThreads; ++thr) parallel_for (unsigned int j = 0; j < nc; ++j)
{ {
int ct, nc; ComplexD tmp = 0.;
auto nr = a.rows();
GridThread::GetWork(a.cols(), thr, nc, ct);
for (unsigned int j = ct; j < nc + ct; ++j)
for (unsigned int i = 0; i < nr; ++i) for (unsigned int i = 0; i < nr; ++i)
{ {
tres[thr] += a(i, j)*b(j, i); tmp += a(i, j)*b(j, i);
} }
} parallel_critical
for (int thr = 0; thr < nThreads; ++thr)
{ {
res += tres[thr]; res += tmp;
}
} }
}); });
trBenchmark("Eigen tr(A*B)", left, right, ref, trBenchmark("Eigen tr(A*B)", left, right, ref,
@ -252,46 +244,32 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
trBenchmark("Eigen row-wise dot", left, right, ref, trBenchmark("Eigen row-wise dot", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b) [](ComplexD &res, const MatLeft &a, const MatRight &b)
{ {
int nThreads = GridThread::GetThreads();
std::vector<ComplexD> tres(nThreads, 0.);
res = 0.; res = 0.;
parallel_for (int thr = 0; thr < nThreads; ++thr) parallel_for (unsigned int r = 0; r < a.rows(); ++r)
{ {
int rt, nr; ComplexD tmp;
GridThread::GetWork(a.rows(), thr, nr, rt); tmp = a.row(r).conjugate().dot(b.col(r));
for (unsigned int i = rt; i < nr + rt; ++i) parallel_critical
{ {
tres[thr] += a.row(i).conjugate().dot(b.col(i)); res += tmp;
} }
} }
for (int thr = 0; thr < nThreads; ++thr)
{
res += tres[thr];
}
}); });
trBenchmark("Eigen col-wise dot", left, right, ref, trBenchmark("Eigen col-wise dot", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b) [](ComplexD &res, const MatLeft &a, const MatRight &b)
{ {
int nThreads = GridThread::GetThreads();
std::vector<ComplexD> tres(nThreads, 0.);
res = 0.; res = 0.;
parallel_for (int thr = 0; thr < nThreads; ++thr) parallel_for (unsigned int c = 0; c < a.cols(); ++c)
{ {
int ct, nc; ComplexD tmp;
GridThread::GetWork(a.cols(), thr, nc, ct); tmp = a.col(c).conjugate().dot(b.row(c));
for (unsigned int j = ct; j < nc + ct; ++j) parallel_critical
{ {
tres[thr] += a.col(j).conjugate().dot(b.row(j)); res += tmp;
} }
} }
for (int thr = 0; thr < nThreads; ++thr)
{
res += tres[thr];
}
}); });
trBenchmark("Eigen Hadamard", left, right, ref, trBenchmark("Eigen Hadamard", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b) [](ComplexD &res, const MatLeft &a, const MatRight &b)
@ -302,50 +280,32 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
trBenchmark("MKL row-wise zdotu", left, right, ref, trBenchmark("MKL row-wise zdotu", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b) [](ComplexD &res, const MatLeft &a, const MatRight &b)
{ {
int nThreads = GridThread::GetThreads();
std::vector<ComplexD> tres(nThreads, 0.);
res = 0.; res = 0.;
parallel_for (int thr = 0; thr < nThreads; ++thr) parallel_for (unsigned int r = 0; r < a.rows(); ++r)
{ {
ComplexD tmp; ComplexD tmp;
int rt, nr;
GridThread::GetWork(a.rows(), thr, nr, rt); zdotuRow(tmp, r, a, b);
for (unsigned int i = rt; i < nr + rt; ++i) parallel_critical
{ {
zdotuRow(tmp, i, a, b); res += tmp;
tres[thr] += tmp;
} }
} }
for (int thr = 0; thr < nThreads; ++thr)
{
res += tres[thr];
}
}); });
trBenchmark("MKL col-wise zdotu", left, right, ref, trBenchmark("MKL col-wise zdotu", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b) [](ComplexD &res, const MatLeft &a, const MatRight &b)
{ {
int nThreads = GridThread::GetThreads();
std::vector<ComplexD> tres(nThreads, 0.);
res = 0.; res = 0.;
parallel_for (int thr = 0; thr < nThreads; ++thr) parallel_for (unsigned int c = 0; c < a.cols(); ++c)
{ {
ComplexD tmp; ComplexD tmp;
int ct, nc;
GridThread::GetWork(a.cols(), thr, nc, ct); zdotuCol(tmp, c, a, b);
for (unsigned int j = ct; j < nc + ct; ++j) parallel_critical
{ {
zdotuCol(tmp, j, a, b); res += tmp;
tres[thr] += tmp;
} }
} }
for (int thr = 0; thr < nThreads; ++thr)
{
res += tres[thr];
}
}); });
#endif #endif
BARRIER(); BARRIER();