mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 05:54:32 +00:00 
			
		
		
		
	Hadrons: remove the use of OpenMP reductions
This commit is contained in:
		@@ -38,14 +38,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
#include <omp.h>
 | 
			
		||||
 | 
			
		||||
// complex reductions
 | 
			
		||||
#pragma omp declare reduction(ComplexPlus:Grid::ComplexD, Grid::vComplexD, Grid::ComplexF, Grid::vComplexF: omp_out += omp_in)
 | 
			
		||||
 | 
			
		||||
#define PARALLEL_FOR_LOOP        _Pragma("omp parallel for schedule(static)")
 | 
			
		||||
#define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(static)")
 | 
			
		||||
#define PARALLEL_FOR_REDUCE_HELPER0(x) #x
 | 
			
		||||
#define PARALLEL_FOR_REDUCE_HELPER1(op, var) PARALLEL_FOR_REDUCE_HELPER0(omp parallel for schedule(static) reduction(op:var))
 | 
			
		||||
#define PARALLEL_FOR_LOOP_REDUCE(op, var) _Pragma(PARALLEL_FOR_REDUCE_HELPER1(op, var))
 | 
			
		||||
#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
 | 
			
		||||
#define PARALLEL_NESTED_LOOP5 _Pragma("omp parallel for collapse(5)")
 | 
			
		||||
#define PARALLEL_REGION       _Pragma("omp parallel")
 | 
			
		||||
@@ -63,7 +57,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#define parallel_region    PARALLEL_REGION
 | 
			
		||||
#define parallel_for       PARALLEL_FOR_LOOP for
 | 
			
		||||
#define parallel_for_internal PARALLEL_FOR_LOOP_INTERN for
 | 
			
		||||
#define parallel_for_reduce(op, var) PARALLEL_FOR_LOOP_REDUCE(op, var) for
 | 
			
		||||
#define parallel_for_nest2 PARALLEL_NESTED_LOOP2 for
 | 
			
		||||
#define parallel_for_nest5 PARALLEL_NESTED_LOOP5 for
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -167,34 +167,53 @@ public:
 | 
			
		||||
    template <typename C, typename MatLeft, typename MatRight>
 | 
			
		||||
    static inline void accTrMul(C &acc, const MatLeft &a, const MatRight &b)
 | 
			
		||||
    {
 | 
			
		||||
        int            nThreads = GridThread::GetThreads();
 | 
			
		||||
        std::vector<C> tacc(nThreads, 0.);
 | 
			
		||||
 | 
			
		||||
        if ((MatLeft::Options == Eigen::RowMajor) and
 | 
			
		||||
            (MatRight::Options == Eigen::ColMajor))
 | 
			
		||||
        {
 | 
			
		||||
            parallel_for_reduce(ComplexPlus, acc) (unsigned int r = 0; r < a.rows(); ++r)
 | 
			
		||||
            parallel_for (int thr = 0; thr < nThreads; ++thr)
 | 
			
		||||
            {
 | 
			
		||||
                int rt, nr;
 | 
			
		||||
 | 
			
		||||
                GridThread::GetWork(a.rows(), thr, nr, rt);
 | 
			
		||||
                for (unsigned int r = rt; r < nr + rt; ++r)
 | 
			
		||||
                {
 | 
			
		||||
#ifdef USE_MKL
 | 
			
		||||
                ComplexD tmp;
 | 
			
		||||
                    C tmp;
 | 
			
		||||
                    dotuRow(tmp, r, a, b);
 | 
			
		||||
                acc += tmp;
 | 
			
		||||
                    tacc[thr] += tmp;
 | 
			
		||||
#else
 | 
			
		||||
                acc += a.row(r).conjugate().dot(b.col(r));
 | 
			
		||||
                    tacc[thr] += a.row(r).conjugate().dot(b.col(r));
 | 
			
		||||
#endif
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
        else
 | 
			
		||||
        {
 | 
			
		||||
            parallel_for_reduce(ComplexPlus, acc) (unsigned int c = 0; c < a.cols(); ++c)
 | 
			
		||||
            parallel_for (int thr = 0; thr < nThreads; ++thr)
 | 
			
		||||
            {
 | 
			
		||||
                int ct, nc;
 | 
			
		||||
 | 
			
		||||
                GridThread::GetWork(a.cols(), thr, nc, ct);
 | 
			
		||||
                for (unsigned int c = ct; c < nc + ct; ++c)
 | 
			
		||||
                {
 | 
			
		||||
#ifdef USE_MKL
 | 
			
		||||
                ComplexD tmp;
 | 
			
		||||
                    C tmp;
 | 
			
		||||
                    dotuCol(tmp, c, a, b);
 | 
			
		||||
                acc += tmp;
 | 
			
		||||
                    tacc[thr] += tmp;
 | 
			
		||||
#else
 | 
			
		||||
                acc += a.col(c).conjugate().dot(b.row(c));
 | 
			
		||||
                    tacc[thr] += a.col(c).conjugate().dot(b.row(c));
 | 
			
		||||
#endif
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
        for (int thr = 0; thr < nThreads; ++thr)
 | 
			
		||||
        {
 | 
			
		||||
            acc += tacc[thr];
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template <typename MatLeft, typename MatRight>
 | 
			
		||||
    static inline double accTrMulFlops(const MatLeft &a, const MatRight &b)
 | 
			
		||||
 
 | 
			
		||||
@@ -199,23 +199,49 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
 | 
			
		||||
    trBenchmark("Naive loop rows first", left, right, ref,
 | 
			
		||||
    [](ComplexD &res, const MatLeft &a, const MatRight &b)
 | 
			
		||||
    { 
 | 
			
		||||
        int                   nThreads = GridThread::GetThreads();
 | 
			
		||||
        std::vector<ComplexD> tres(nThreads, 0.);
 | 
			
		||||
 | 
			
		||||
        res = 0.;
 | 
			
		||||
        auto nr = a.rows(), nc = a.cols();
 | 
			
		||||
        parallel_for_reduce(ComplexPlus, res) (unsigned int i = 0; i < nr; ++i)
 | 
			
		||||
        parallel_for (int thr = 0; thr < nThreads; ++thr)
 | 
			
		||||
        {
 | 
			
		||||
            int  rt, nr;
 | 
			
		||||
            auto nc = a.cols();
 | 
			
		||||
 | 
			
		||||
            GridThread::GetWork(a.rows(), thr, nr, rt);
 | 
			
		||||
            for (unsigned int i = rt; i < nr + rt; ++i)
 | 
			
		||||
            for (unsigned int j = 0; j < nc; ++j)
 | 
			
		||||
            {
 | 
			
		||||
            res += a(i, j)*b(j, i);
 | 
			
		||||
                tres[thr] += a(i, j)*b(j, i);
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
        for (int thr = 0; thr < nThreads; ++thr)
 | 
			
		||||
        {
 | 
			
		||||
            res += tres[thr];
 | 
			
		||||
        }
 | 
			
		||||
    });
 | 
			
		||||
    trBenchmark("Naive loop cols first", left, right, ref,
 | 
			
		||||
    [](ComplexD &res, const MatLeft &a, const MatRight &b)
 | 
			
		||||
    {
 | 
			
		||||
        int                   nThreads = GridThread::GetThreads();
 | 
			
		||||
        std::vector<ComplexD> tres(nThreads, 0.);
 | 
			
		||||
 | 
			
		||||
        res = 0.;
 | 
			
		||||
        auto nr = a.rows(), nc = a.cols();
 | 
			
		||||
        parallel_for_reduce(ComplexPlus, res) (unsigned int j = 0; j < nc; ++j)
 | 
			
		||||
        parallel_for (int thr = 0; thr < nThreads; ++thr)
 | 
			
		||||
        {
 | 
			
		||||
            int  ct, nc;
 | 
			
		||||
            auto nr = a.rows();
 | 
			
		||||
 | 
			
		||||
            GridThread::GetWork(a.cols(), thr, nc, ct);
 | 
			
		||||
            for (unsigned int j = ct; j < nc + ct; ++j)
 | 
			
		||||
            for (unsigned int i = 0; i < nr; ++i)
 | 
			
		||||
            {
 | 
			
		||||
            res += a(i, j)*b(j, i);
 | 
			
		||||
                tres[thr] += a(i, j)*b(j, i);
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
        for (int thr = 0; thr < nThreads; ++thr)
 | 
			
		||||
        {
 | 
			
		||||
            res += tres[thr];
 | 
			
		||||
        }
 | 
			
		||||
    });
 | 
			
		||||
    trBenchmark("Eigen tr(A*B)", left, right, ref,
 | 
			
		||||
@@ -226,21 +252,45 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
 | 
			
		||||
    trBenchmark("Eigen row-wise dot", left, right, ref,
 | 
			
		||||
    [](ComplexD &res, const MatLeft &a, const MatRight &b)
 | 
			
		||||
    {
 | 
			
		||||
        res = 0.;
 | 
			
		||||
        int                   nThreads = GridThread::GetThreads();
 | 
			
		||||
        std::vector<ComplexD> tres(nThreads, 0.);
 | 
			
		||||
 | 
			
		||||
        parallel_for_reduce(ComplexPlus, res) (unsigned int r = 0; r < a.rows(); ++r)
 | 
			
		||||
        res = 0.;
 | 
			
		||||
        parallel_for (int thr = 0; thr < nThreads; ++thr)
 | 
			
		||||
        {
 | 
			
		||||
            res += a.row(r).conjugate().dot(b.col(r));
 | 
			
		||||
            int  rt, nr;
 | 
			
		||||
 | 
			
		||||
            GridThread::GetWork(a.rows(), thr, nr, rt);
 | 
			
		||||
            for (unsigned int i = rt; i < nr + rt; ++i)
 | 
			
		||||
            {
 | 
			
		||||
                tres[thr] += a.row(i).conjugate().dot(b.col(i));
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
        for (int thr = 0; thr < nThreads; ++thr)
 | 
			
		||||
        {
 | 
			
		||||
            res += tres[thr];
 | 
			
		||||
        }
 | 
			
		||||
    });
 | 
			
		||||
    trBenchmark("Eigen col-wise dot", left, right, ref,
 | 
			
		||||
    [](ComplexD &res, const MatLeft &a, const MatRight &b)
 | 
			
		||||
    {
 | 
			
		||||
        res = 0.;
 | 
			
		||||
        int                   nThreads = GridThread::GetThreads();
 | 
			
		||||
        std::vector<ComplexD> tres(nThreads, 0.);
 | 
			
		||||
 | 
			
		||||
        parallel_for_reduce(ComplexPlus, res) (unsigned int c = 0; c < a.cols(); ++c)
 | 
			
		||||
        res = 0.;
 | 
			
		||||
        parallel_for (int thr = 0; thr < nThreads; ++thr)
 | 
			
		||||
        {
 | 
			
		||||
            res += a.col(c).conjugate().dot(b.row(c));
 | 
			
		||||
            int  ct, nc;
 | 
			
		||||
 | 
			
		||||
            GridThread::GetWork(a.cols(), thr, nc, ct);
 | 
			
		||||
            for (unsigned int j = ct; j < nc + ct; ++j)
 | 
			
		||||
            {
 | 
			
		||||
                tres[thr] += a.col(j).conjugate().dot(b.row(j));
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
        for (int thr = 0; thr < nThreads; ++thr)
 | 
			
		||||
        {
 | 
			
		||||
            res += tres[thr];
 | 
			
		||||
        }
 | 
			
		||||
    });
 | 
			
		||||
    trBenchmark("Eigen Hadamard", left, right, ref,
 | 
			
		||||
@@ -252,27 +302,49 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
 | 
			
		||||
    trBenchmark("MKL row-wise zdotu", left, right, ref,
 | 
			
		||||
    [](ComplexD &res, const MatLeft &a, const MatRight &b)
 | 
			
		||||
    {
 | 
			
		||||
        res = 0.;
 | 
			
		||||
        int                   nThreads = GridThread::GetThreads();
 | 
			
		||||
        std::vector<ComplexD> tres(nThreads, 0.);
 | 
			
		||||
 | 
			
		||||
        parallel_for_reduce(ComplexPlus, res) (unsigned int r = 0; r < a.rows(); ++r)
 | 
			
		||||
        res = 0.;
 | 
			
		||||
        parallel_for (int thr = 0; thr < nThreads; ++thr)
 | 
			
		||||
        {
 | 
			
		||||
            ComplexD tmp;
 | 
			
		||||
            int      rt, nr;
 | 
			
		||||
 | 
			
		||||
            zdotuRow(tmp, r, a, b);
 | 
			
		||||
            res += tmp;
 | 
			
		||||
            GridThread::GetWork(a.rows(), thr, nr, rt);
 | 
			
		||||
            for (unsigned int i = rt; i < nr + rt; ++i)
 | 
			
		||||
            {
 | 
			
		||||
                zdotuRow(tmp, i, a, b);
 | 
			
		||||
                tres[thr] += tmp;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
        for (int thr = 0; thr < nThreads; ++thr)
 | 
			
		||||
        {
 | 
			
		||||
            res += tres[thr];
 | 
			
		||||
        }
 | 
			
		||||
    });
 | 
			
		||||
    trBenchmark("MKL col-wise zdotu", left, right, ref,
 | 
			
		||||
    [](ComplexD &res, const MatLeft &a, const MatRight &b)
 | 
			
		||||
    {
 | 
			
		||||
        res = 0.;
 | 
			
		||||
        int                   nThreads = GridThread::GetThreads();
 | 
			
		||||
        std::vector<ComplexD> tres(nThreads, 0.);
 | 
			
		||||
 | 
			
		||||
        parallel_for_reduce(ComplexPlus, res) (unsigned int c = 0; c < a.cols(); ++c)
 | 
			
		||||
        res = 0.;
 | 
			
		||||
        parallel_for (int thr = 0; thr < nThreads; ++thr)
 | 
			
		||||
        {
 | 
			
		||||
            ComplexD tmp;
 | 
			
		||||
            int      ct, nc;
 | 
			
		||||
 | 
			
		||||
            zdotuCol(tmp, c, a, b);
 | 
			
		||||
            res += tmp;
 | 
			
		||||
            GridThread::GetWork(a.cols(), thr, nc, ct);
 | 
			
		||||
            for (unsigned int j = ct; j < nc + ct; ++j)
 | 
			
		||||
            {
 | 
			
		||||
                zdotuCol(tmp, j, a, b);
 | 
			
		||||
                tres[thr] += tmp;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
        for (int thr = 0; thr < nThreads; ++thr)
 | 
			
		||||
        {
 | 
			
		||||
            res += tres[thr];
 | 
			
		||||
        }
 | 
			
		||||
    });
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user