mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
Hadrons: remove the use of OpenMP reductions
This commit is contained in:
parent
2ac57370f1
commit
8b007b5c24
@ -38,14 +38,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#ifdef GRID_OMP
|
#ifdef GRID_OMP
|
||||||
#include <omp.h>
|
#include <omp.h>
|
||||||
|
|
||||||
// complex reductions
|
|
||||||
#pragma omp declare reduction(ComplexPlus:Grid::ComplexD, Grid::vComplexD, Grid::ComplexF, Grid::vComplexF: omp_out += omp_in)
|
|
||||||
|
|
||||||
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(static)")
|
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(static)")
|
||||||
#define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(static)")
|
#define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(static)")
|
||||||
#define PARALLEL_FOR_REDUCE_HELPER0(x) #x
|
|
||||||
#define PARALLEL_FOR_REDUCE_HELPER1(op, var) PARALLEL_FOR_REDUCE_HELPER0(omp parallel for schedule(static) reduction(op:var))
|
|
||||||
#define PARALLEL_FOR_LOOP_REDUCE(op, var) _Pragma(PARALLEL_FOR_REDUCE_HELPER1(op, var))
|
|
||||||
#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
|
#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
|
||||||
#define PARALLEL_NESTED_LOOP5 _Pragma("omp parallel for collapse(5)")
|
#define PARALLEL_NESTED_LOOP5 _Pragma("omp parallel for collapse(5)")
|
||||||
#define PARALLEL_REGION _Pragma("omp parallel")
|
#define PARALLEL_REGION _Pragma("omp parallel")
|
||||||
@ -63,7 +57,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#define parallel_region PARALLEL_REGION
|
#define parallel_region PARALLEL_REGION
|
||||||
#define parallel_for PARALLEL_FOR_LOOP for
|
#define parallel_for PARALLEL_FOR_LOOP for
|
||||||
#define parallel_for_internal PARALLEL_FOR_LOOP_INTERN for
|
#define parallel_for_internal PARALLEL_FOR_LOOP_INTERN for
|
||||||
#define parallel_for_reduce(op, var) PARALLEL_FOR_LOOP_REDUCE(op, var) for
|
|
||||||
#define parallel_for_nest2 PARALLEL_NESTED_LOOP2 for
|
#define parallel_for_nest2 PARALLEL_NESTED_LOOP2 for
|
||||||
#define parallel_for_nest5 PARALLEL_NESTED_LOOP5 for
|
#define parallel_for_nest5 PARALLEL_NESTED_LOOP5 for
|
||||||
|
|
||||||
|
@ -167,34 +167,53 @@ public:
|
|||||||
template <typename C, typename MatLeft, typename MatRight>
|
template <typename C, typename MatLeft, typename MatRight>
|
||||||
static inline void accTrMul(C &acc, const MatLeft &a, const MatRight &b)
|
static inline void accTrMul(C &acc, const MatLeft &a, const MatRight &b)
|
||||||
{
|
{
|
||||||
|
int nThreads = GridThread::GetThreads();
|
||||||
|
std::vector<C> tacc(nThreads, 0.);
|
||||||
|
|
||||||
if ((MatLeft::Options == Eigen::RowMajor) and
|
if ((MatLeft::Options == Eigen::RowMajor) and
|
||||||
(MatRight::Options == Eigen::ColMajor))
|
(MatRight::Options == Eigen::ColMajor))
|
||||||
{
|
{
|
||||||
parallel_for_reduce(ComplexPlus, acc) (unsigned int r = 0; r < a.rows(); ++r)
|
parallel_for (int thr = 0; thr < nThreads; ++thr)
|
||||||
|
{
|
||||||
|
int rt, nr;
|
||||||
|
|
||||||
|
GridThread::GetWork(a.rows(), thr, nr, rt);
|
||||||
|
for (unsigned int r = rt; r < nr + rt; ++r)
|
||||||
{
|
{
|
||||||
#ifdef USE_MKL
|
#ifdef USE_MKL
|
||||||
ComplexD tmp;
|
C tmp;
|
||||||
dotuRow(tmp, r, a, b);
|
dotuRow(tmp, r, a, b);
|
||||||
acc += tmp;
|
tacc[thr] += tmp;
|
||||||
#else
|
#else
|
||||||
acc += a.row(r).conjugate().dot(b.col(r));
|
tacc[thr] += a.row(r).conjugate().dot(b.col(r));
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
parallel_for_reduce(ComplexPlus, acc) (unsigned int c = 0; c < a.cols(); ++c)
|
parallel_for (int thr = 0; thr < nThreads; ++thr)
|
||||||
|
{
|
||||||
|
int ct, nc;
|
||||||
|
|
||||||
|
GridThread::GetWork(a.cols(), thr, nc, ct);
|
||||||
|
for (unsigned int c = ct; c < nc + ct; ++c)
|
||||||
{
|
{
|
||||||
#ifdef USE_MKL
|
#ifdef USE_MKL
|
||||||
ComplexD tmp;
|
C tmp;
|
||||||
dotuCol(tmp, c, a, b);
|
dotuCol(tmp, c, a, b);
|
||||||
acc += tmp;
|
tacc[thr] += tmp;
|
||||||
#else
|
#else
|
||||||
acc += a.col(c).conjugate().dot(b.row(c));
|
tacc[thr] += a.col(c).conjugate().dot(b.row(c));
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
for (int thr = 0; thr < nThreads; ++thr)
|
||||||
|
{
|
||||||
|
acc += tacc[thr];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
template <typename MatLeft, typename MatRight>
|
template <typename MatLeft, typename MatRight>
|
||||||
static inline double accTrMulFlops(const MatLeft &a, const MatRight &b)
|
static inline double accTrMulFlops(const MatLeft &a, const MatRight &b)
|
||||||
|
@ -199,23 +199,49 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
|
|||||||
trBenchmark("Naive loop rows first", left, right, ref,
|
trBenchmark("Naive loop rows first", left, right, ref,
|
||||||
[](ComplexD &res, const MatLeft &a, const MatRight &b)
|
[](ComplexD &res, const MatLeft &a, const MatRight &b)
|
||||||
{
|
{
|
||||||
|
int nThreads = GridThread::GetThreads();
|
||||||
|
std::vector<ComplexD> tres(nThreads, 0.);
|
||||||
|
|
||||||
res = 0.;
|
res = 0.;
|
||||||
auto nr = a.rows(), nc = a.cols();
|
parallel_for (int thr = 0; thr < nThreads; ++thr)
|
||||||
parallel_for_reduce(ComplexPlus, res) (unsigned int i = 0; i < nr; ++i)
|
{
|
||||||
|
int rt, nr;
|
||||||
|
auto nc = a.cols();
|
||||||
|
|
||||||
|
GridThread::GetWork(a.rows(), thr, nr, rt);
|
||||||
|
for (unsigned int i = rt; i < nr + rt; ++i)
|
||||||
for (unsigned int j = 0; j < nc; ++j)
|
for (unsigned int j = 0; j < nc; ++j)
|
||||||
{
|
{
|
||||||
res += a(i, j)*b(j, i);
|
tres[thr] += a(i, j)*b(j, i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int thr = 0; thr < nThreads; ++thr)
|
||||||
|
{
|
||||||
|
res += tres[thr];
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
trBenchmark("Naive loop cols first", left, right, ref,
|
trBenchmark("Naive loop cols first", left, right, ref,
|
||||||
[](ComplexD &res, const MatLeft &a, const MatRight &b)
|
[](ComplexD &res, const MatLeft &a, const MatRight &b)
|
||||||
{
|
{
|
||||||
|
int nThreads = GridThread::GetThreads();
|
||||||
|
std::vector<ComplexD> tres(nThreads, 0.);
|
||||||
|
|
||||||
res = 0.;
|
res = 0.;
|
||||||
auto nr = a.rows(), nc = a.cols();
|
parallel_for (int thr = 0; thr < nThreads; ++thr)
|
||||||
parallel_for_reduce(ComplexPlus, res) (unsigned int j = 0; j < nc; ++j)
|
{
|
||||||
|
int ct, nc;
|
||||||
|
auto nr = a.rows();
|
||||||
|
|
||||||
|
GridThread::GetWork(a.cols(), thr, nc, ct);
|
||||||
|
for (unsigned int j = ct; j < nc + ct; ++j)
|
||||||
for (unsigned int i = 0; i < nr; ++i)
|
for (unsigned int i = 0; i < nr; ++i)
|
||||||
{
|
{
|
||||||
res += a(i, j)*b(j, i);
|
tres[thr] += a(i, j)*b(j, i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int thr = 0; thr < nThreads; ++thr)
|
||||||
|
{
|
||||||
|
res += tres[thr];
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
trBenchmark("Eigen tr(A*B)", left, right, ref,
|
trBenchmark("Eigen tr(A*B)", left, right, ref,
|
||||||
@ -226,21 +252,45 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
|
|||||||
trBenchmark("Eigen row-wise dot", left, right, ref,
|
trBenchmark("Eigen row-wise dot", left, right, ref,
|
||||||
[](ComplexD &res, const MatLeft &a, const MatRight &b)
|
[](ComplexD &res, const MatLeft &a, const MatRight &b)
|
||||||
{
|
{
|
||||||
res = 0.;
|
int nThreads = GridThread::GetThreads();
|
||||||
|
std::vector<ComplexD> tres(nThreads, 0.);
|
||||||
|
|
||||||
parallel_for_reduce(ComplexPlus, res) (unsigned int r = 0; r < a.rows(); ++r)
|
res = 0.;
|
||||||
|
parallel_for (int thr = 0; thr < nThreads; ++thr)
|
||||||
{
|
{
|
||||||
res += a.row(r).conjugate().dot(b.col(r));
|
int rt, nr;
|
||||||
|
|
||||||
|
GridThread::GetWork(a.rows(), thr, nr, rt);
|
||||||
|
for (unsigned int i = rt; i < nr + rt; ++i)
|
||||||
|
{
|
||||||
|
tres[thr] += a.row(i).conjugate().dot(b.col(i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int thr = 0; thr < nThreads; ++thr)
|
||||||
|
{
|
||||||
|
res += tres[thr];
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
trBenchmark("Eigen col-wise dot", left, right, ref,
|
trBenchmark("Eigen col-wise dot", left, right, ref,
|
||||||
[](ComplexD &res, const MatLeft &a, const MatRight &b)
|
[](ComplexD &res, const MatLeft &a, const MatRight &b)
|
||||||
{
|
{
|
||||||
res = 0.;
|
int nThreads = GridThread::GetThreads();
|
||||||
|
std::vector<ComplexD> tres(nThreads, 0.);
|
||||||
|
|
||||||
parallel_for_reduce(ComplexPlus, res) (unsigned int c = 0; c < a.cols(); ++c)
|
res = 0.;
|
||||||
|
parallel_for (int thr = 0; thr < nThreads; ++thr)
|
||||||
{
|
{
|
||||||
res += a.col(c).conjugate().dot(b.row(c));
|
int ct, nc;
|
||||||
|
|
||||||
|
GridThread::GetWork(a.cols(), thr, nc, ct);
|
||||||
|
for (unsigned int j = ct; j < nc + ct; ++j)
|
||||||
|
{
|
||||||
|
tres[thr] += a.col(j).conjugate().dot(b.row(j));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int thr = 0; thr < nThreads; ++thr)
|
||||||
|
{
|
||||||
|
res += tres[thr];
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
trBenchmark("Eigen Hadamard", left, right, ref,
|
trBenchmark("Eigen Hadamard", left, right, ref,
|
||||||
@ -252,27 +302,49 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
|
|||||||
trBenchmark("MKL row-wise zdotu", left, right, ref,
|
trBenchmark("MKL row-wise zdotu", left, right, ref,
|
||||||
[](ComplexD &res, const MatLeft &a, const MatRight &b)
|
[](ComplexD &res, const MatLeft &a, const MatRight &b)
|
||||||
{
|
{
|
||||||
res = 0.;
|
int nThreads = GridThread::GetThreads();
|
||||||
|
std::vector<ComplexD> tres(nThreads, 0.);
|
||||||
|
|
||||||
parallel_for_reduce(ComplexPlus, res) (unsigned int r = 0; r < a.rows(); ++r)
|
res = 0.;
|
||||||
|
parallel_for (int thr = 0; thr < nThreads; ++thr)
|
||||||
{
|
{
|
||||||
ComplexD tmp;
|
ComplexD tmp;
|
||||||
|
int rt, nr;
|
||||||
|
|
||||||
zdotuRow(tmp, r, a, b);
|
GridThread::GetWork(a.rows(), thr, nr, rt);
|
||||||
res += tmp;
|
for (unsigned int i = rt; i < nr + rt; ++i)
|
||||||
|
{
|
||||||
|
zdotuRow(tmp, i, a, b);
|
||||||
|
tres[thr] += tmp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int thr = 0; thr < nThreads; ++thr)
|
||||||
|
{
|
||||||
|
res += tres[thr];
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
trBenchmark("MKL col-wise zdotu", left, right, ref,
|
trBenchmark("MKL col-wise zdotu", left, right, ref,
|
||||||
[](ComplexD &res, const MatLeft &a, const MatRight &b)
|
[](ComplexD &res, const MatLeft &a, const MatRight &b)
|
||||||
{
|
{
|
||||||
res = 0.;
|
int nThreads = GridThread::GetThreads();
|
||||||
|
std::vector<ComplexD> tres(nThreads, 0.);
|
||||||
|
|
||||||
parallel_for_reduce(ComplexPlus, res) (unsigned int c = 0; c < a.cols(); ++c)
|
res = 0.;
|
||||||
|
parallel_for (int thr = 0; thr < nThreads; ++thr)
|
||||||
{
|
{
|
||||||
ComplexD tmp;
|
ComplexD tmp;
|
||||||
|
int ct, nc;
|
||||||
|
|
||||||
zdotuCol(tmp, c, a, b);
|
GridThread::GetWork(a.cols(), thr, nc, ct);
|
||||||
res += tmp;
|
for (unsigned int j = ct; j < nc + ct; ++j)
|
||||||
|
{
|
||||||
|
zdotuCol(tmp, j, a, b);
|
||||||
|
tres[thr] += tmp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int thr = 0; thr < nThreads; ++thr)
|
||||||
|
{
|
||||||
|
res += tres[thr];
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
#endif
|
#endif
|
||||||
|
Loading…
Reference in New Issue
Block a user