diff --git a/Grid/threads/Threads.h b/Grid/threads/Threads.h index 580278d6..1c13fccb 100644 --- a/Grid/threads/Threads.h +++ b/Grid/threads/Threads.h @@ -38,14 +38,8 @@ Author: paboyle #ifdef GRID_OMP #include -// complex reductions -#pragma omp declare reduction(ComplexPlus:Grid::ComplexD, Grid::vComplexD, Grid::ComplexF, Grid::vComplexF: omp_out += omp_in) - #define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(static)") #define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(static)") -#define PARALLEL_FOR_REDUCE_HELPER0(x) #x -#define PARALLEL_FOR_REDUCE_HELPER1(op, var) PARALLEL_FOR_REDUCE_HELPER0(omp parallel for schedule(static) reduction(op:var)) -#define PARALLEL_FOR_LOOP_REDUCE(op, var) _Pragma(PARALLEL_FOR_REDUCE_HELPER1(op, var)) #define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)") #define PARALLEL_NESTED_LOOP5 _Pragma("omp parallel for collapse(5)") #define PARALLEL_REGION _Pragma("omp parallel") @@ -63,7 +57,6 @@ Author: paboyle #define parallel_region PARALLEL_REGION #define parallel_for PARALLEL_FOR_LOOP for #define parallel_for_internal PARALLEL_FOR_LOOP_INTERN for -#define parallel_for_reduce(op, var) PARALLEL_FOR_LOOP_REDUCE(op, var) for #define parallel_for_nest2 PARALLEL_NESTED_LOOP2 for #define parallel_for_nest5 PARALLEL_NESTED_LOOP5 for diff --git a/Hadrons/A2AMatrix.hpp b/Hadrons/A2AMatrix.hpp index 831d2a02..288bb7c5 100644 --- a/Hadrons/A2AMatrix.hpp +++ b/Hadrons/A2AMatrix.hpp @@ -167,33 +167,52 @@ public: template static inline void accTrMul(C &acc, const MatLeft &a, const MatRight &b) { + int nThreads = GridThread::GetThreads(); + std::vector tacc(nThreads, 0.); + if ((MatLeft::Options == Eigen::RowMajor) and (MatRight::Options == Eigen::ColMajor)) { - parallel_for_reduce(ComplexPlus, acc) (unsigned int r = 0; r < a.rows(); ++r) + parallel_for (int thr = 0; thr < nThreads; ++thr) { + int rt, nr; + + GridThread::GetWork(a.rows(), thr, nr, rt); + for (unsigned int r = rt; r < nr + rt; ++r) + { #ifdef USE_MKL - ComplexD tmp; - dotuRow(tmp, r, a, b); - acc += tmp; + C tmp; + dotuRow(tmp, r, a, b); + tacc[thr] += tmp; #else - acc += a.row(r).conjugate().dot(b.col(r)); + tacc[thr] += a.row(r).conjugate().dot(b.col(r)); #endif + } } } else { - parallel_for_reduce(ComplexPlus, acc) (unsigned int c = 0; c < a.cols(); ++c) + parallel_for (int thr = 0; thr < nThreads; ++thr) { + int ct, nc; + + GridThread::GetWork(a.cols(), thr, nc, ct); + for (unsigned int c = ct; c < nc + ct; ++c) + { #ifdef USE_MKL - ComplexD tmp; - dotuCol(tmp, c, a, b); - acc += tmp; + C tmp; + dotuCol(tmp, c, a, b); + tacc[thr] += tmp; #else - acc += a.col(c).conjugate().dot(b.row(c)); + tacc[thr] += a.col(c).conjugate().dot(b.row(c)); #endif + } } } + for (int thr = 0; thr < nThreads; ++thr) + { + acc += tacc[thr]; + } } template diff --git a/Hadrons/Utilities/ContractorBenchmark.cc b/Hadrons/Utilities/ContractorBenchmark.cc index 83d0e49f..a33f4ab4 100644 --- a/Hadrons/Utilities/ContractorBenchmark.cc +++ b/Hadrons/Utilities/ContractorBenchmark.cc @@ -199,23 +199,49 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne trBenchmark("Naive loop rows first", left, right, ref, [](ComplexD &res, const MatLeft &a, const MatRight &b) { + int nThreads = GridThread::GetThreads(); + std::vector tres(nThreads, 0.); + res = 0.; - auto nr = a.rows(), nc = a.cols(); - parallel_for_reduce(ComplexPlus, res) (unsigned int i = 0; i < nr; ++i) - for (unsigned int j = 0; j < nc; ++j) + parallel_for (int thr = 0; thr < nThreads; ++thr) { - res += a(i, j)*b(j, i); + int rt, nr; + auto nc = a.cols(); + + GridThread::GetWork(a.rows(), thr, nr, rt); + for (unsigned int i = rt; i < nr + rt; ++i) + for (unsigned int j = 0; j < nc; ++j) + { + tres[thr] += a(i, j)*b(j, i); + } + } + for (int thr = 0; thr < nThreads; ++thr) + { + res += tres[thr]; } }); trBenchmark("Naive loop cols first", left, right, ref, [](ComplexD &res, const MatLeft &a, const MatRight &b) - { + { + int nThreads = GridThread::GetThreads(); + std::vector tres(nThreads, 0.); + res = 0.; - auto nr = a.rows(), nc = a.cols(); - parallel_for_reduce(ComplexPlus, res) (unsigned int j = 0; j < nc; ++j) - for (unsigned int i = 0; i < nr; ++i) + parallel_for (int thr = 0; thr < nThreads; ++thr) { - res += a(i, j)*b(j, i); + int ct, nc; + auto nr = a.rows(); + + GridThread::GetWork(a.cols(), thr, nc, ct); + for (unsigned int j = ct; j < nc + ct; ++j) + for (unsigned int i = 0; i < nr; ++i) + { + tres[thr] += a(i, j)*b(j, i); + } + } + for (int thr = 0; thr < nThreads; ++thr) + { + res += tres[thr]; } }); trBenchmark("Eigen tr(A*B)", left, right, ref, @@ -225,22 +251,46 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne }); trBenchmark("Eigen row-wise dot", left, right, ref, [](ComplexD &res, const MatLeft &a, const MatRight &b) - { - res = 0.; + { + int nThreads = GridThread::GetThreads(); + std::vector tres(nThreads, 0.); - parallel_for_reduce(ComplexPlus, res) (unsigned int r = 0; r < a.rows(); ++r) + res = 0.; + parallel_for (int thr = 0; thr < nThreads; ++thr) { - res += a.row(r).conjugate().dot(b.col(r)); + int rt, nr; + + GridThread::GetWork(a.rows(), thr, nr, rt); + for (unsigned int i = rt; i < nr + rt; ++i) + { + tres[thr] += a.row(i).conjugate().dot(b.col(i)); + } + } + for (int thr = 0; thr < nThreads; ++thr) + { + res += tres[thr]; } }); trBenchmark("Eigen col-wise dot", left, right, ref, [](ComplexD &res, const MatLeft &a, const MatRight &b) - { - res = 0.; + { + int nThreads = GridThread::GetThreads(); + std::vector tres(nThreads, 0.); - parallel_for_reduce(ComplexPlus, res) (unsigned int c = 0; c < a.cols(); ++c) + res = 0.; + parallel_for (int thr = 0; thr < nThreads; ++thr) { - res += a.col(c).conjugate().dot(b.row(c)); + int ct, nc; + + GridThread::GetWork(a.cols(), thr, nc, ct); + for (unsigned int j = ct; j < nc + ct; ++j) + { + tres[thr] += a.col(j).conjugate().dot(b.row(j)); + } + } + for (int thr = 0; thr < nThreads; ++thr) + { + res += tres[thr]; } }); trBenchmark("Eigen Hadamard", left, right, ref, @@ -251,28 +301,50 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne #ifdef USE_MKL trBenchmark("MKL row-wise zdotu", left, right, ref, [](ComplexD &res, const MatLeft &a, const MatRight &b) - { - res = 0.; + { + int nThreads = GridThread::GetThreads(); + std::vector tres(nThreads, 0.); - parallel_for_reduce(ComplexPlus, res) (unsigned int r = 0; r < a.rows(); ++r) + res = 0.; + parallel_for (int thr = 0; thr < nThreads; ++thr) { ComplexD tmp; + int rt, nr; - zdotuRow(tmp, r, a, b); - res += tmp; + GridThread::GetWork(a.rows(), thr, nr, rt); + for (unsigned int i = rt; i < nr + rt; ++i) + { + zdotuRow(tmp, i, a, b); + tres[thr] += tmp; + } + } + for (int thr = 0; thr < nThreads; ++thr) + { + res += tres[thr]; } }); trBenchmark("MKL col-wise zdotu", left, right, ref, [](ComplexD &res, const MatLeft &a, const MatRight &b) - { - res = 0.; + { + int nThreads = GridThread::GetThreads(); + std::vector tres(nThreads, 0.); - parallel_for_reduce(ComplexPlus, res) (unsigned int c = 0; c < a.cols(); ++c) + res = 0.; + parallel_for (int thr = 0; thr < nThreads; ++thr) { ComplexD tmp; + int ct, nc; - zdotuCol(tmp, c, a, b); - res += tmp; + GridThread::GetWork(a.cols(), thr, nc, ct); + for (unsigned int j = ct; j < nc + ct; ++j) + { + zdotuCol(tmp, j, a, b); + tres[thr] += tmp; + } + } + for (int thr = 0; thr < nThreads; ++thr) + { + res += tres[thr]; } }); #endif