Hadrons: contractor performance fix

2026-07-17 15:43:27 +01:00 · 2018-11-16 20:59:49 +00:00
parent 8b007b5c24
commit f592ec8baa
3 changed files with 52 additions and 104 deletions
@@ -59,6 +59,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define parallel_for_internal PARALLEL_FOR_LOOP_INTERN for
 #define parallel_for_nest2 PARALLEL_NESTED_LOOP2 for
 #define parallel_for_nest5 PARALLEL_NESTED_LOOP5 for
+#define parallel_critical PARALLEL_CRITICAL

 namespace Grid {

@@ -167,52 +167,39 @@ public:
    template <typename C, typename MatLeft, typename MatRight>
    static inline void accTrMul(C &acc, const MatLeft &a, const MatRight &b)
    {
-        int            nThreads = GridThread::GetThreads();
-        std::vector<C> tacc(nThreads, 0.);
-
        if ((MatLeft::Options == Eigen::RowMajor) and
            (MatRight::Options == Eigen::ColMajor))
        {
-            parallel_for (int thr = 0; thr < nThreads; ++thr)
+            parallel_for (unsigned int r = 0; r < a.rows(); ++r)
            {
-                int rt, nr;
-
-                GridThread::GetWork(a.rows(), thr, nr, rt);
-                for (unsigned int r = rt; r < nr + rt; ++r)
-                {
+                C tmp;
 #ifdef USE_MKL
-                    C tmp;
-                    dotuRow(tmp, r, a, b);
-                    tacc[thr] += tmp;
+                dotuRow(tmp, r, a, b);
 #else
-                    tacc[thr] += a.row(r).conjugate().dot(b.col(r));
+                tmp = a.row(r).conjugate().dot(b.col(r));
 #endif
+                parallel_critical
+                {
+                    acc += tmp;
                }
            }
        }
        else
        {
-            parallel_for (int thr = 0; thr < nThreads; ++thr)
+            parallel_for (unsigned int c = 0; c < a.cols(); ++c)
            {
-                int ct, nc;
-
-                GridThread::GetWork(a.cols(), thr, nc, ct);
-                for (unsigned int c = ct; c < nc + ct; ++c)
-                {
-#ifdef USE_MKL
-                    C tmp;
-                    dotuCol(tmp, c, a, b);
-                    tacc[thr] += tmp;
+                C tmp;
+#ifdef USE_MKL 
+                dotuCol(tmp, c, a, b);
 #else
-                    tacc[thr] += a.col(c).conjugate().dot(b.row(c));
+                tmp = a.col(c).conjugate().dot(b.row(c));
 #endif
+                parallel_critical
+                {
+                    acc += tmp;
                }
            }
        }
-        for (int thr = 0; thr < nThreads; ++thr)
-        {
-            acc += tacc[thr];
-        }
    }

    template <typename MatLeft, typename MatRight>
@@ -199,50 +199,42 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
    trBenchmark("Naive loop rows first", left, right, ref,
    [](ComplexD &res, const MatLeft &a, const MatRight &b)
    { 
-        int                   nThreads = GridThread::GetThreads();
-        std::vector<ComplexD> tres(nThreads, 0.);
-
+        auto nr = a.rows(), nc = a.cols();
+        
        res = 0.;
-        parallel_for (int thr = 0; thr < nThreads; ++thr)
+        parallel_for (unsigned int i = 0; i < nr; ++i)
        {
-            int  rt, nr;
-            auto nc = a.cols();
+            ComplexD tmp = 0.;

-            GridThread::GetWork(a.rows(), thr, nr, rt);
-            for (unsigned int i = rt; i < nr + rt; ++i)
            for (unsigned int j = 0; j < nc; ++j)
            {
-                tres[thr] += a(i, j)*b(j, i);
+                tmp += a(i, j)*b(j, i);
+            }
+            parallel_critical
+            {
+                res += tmp;
            }
-        }
-        for (int thr = 0; thr < nThreads; ++thr)
-        {
-            res += tres[thr];
        }
    });
    trBenchmark("Naive loop cols first", left, right, ref,
    [](ComplexD &res, const MatLeft &a, const MatRight &b)
    {
-        int                   nThreads = GridThread::GetThreads();
-        std::vector<ComplexD> tres(nThreads, 0.);
-
+        auto nr = a.rows(), nc = a.cols();
+        
        res = 0.;
-        parallel_for (int thr = 0; thr < nThreads; ++thr)
+        parallel_for (unsigned int j = 0; j < nc; ++j)
        {
-            int  ct, nc;
-            auto nr = a.rows();
+            ComplexD tmp = 0.;

-            GridThread::GetWork(a.cols(), thr, nc, ct);
-            for (unsigned int j = ct; j < nc + ct; ++j)
            for (unsigned int i = 0; i < nr; ++i)
            {
-                tres[thr] += a(i, j)*b(j, i);
+                tmp += a(i, j)*b(j, i);
+            }        
+            parallel_critical
+            {
+                res += tmp;
            }
        }
-        for (int thr = 0; thr < nThreads; ++thr)
-        {
-            res += tres[thr];
-        }
    });
    trBenchmark("Eigen tr(A*B)", left, right, ref,
    [](ComplexD &res, const MatLeft &a, const MatRight &b)
@@ -252,46 +244,32 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
    trBenchmark("Eigen row-wise dot", left, right, ref,
    [](ComplexD &res, const MatLeft &a, const MatRight &b)
    {
-        int                   nThreads = GridThread::GetThreads();
-        std::vector<ComplexD> tres(nThreads, 0.);
-
        res = 0.;
-        parallel_for (int thr = 0; thr < nThreads; ++thr)
+        parallel_for (unsigned int r = 0; r < a.rows(); ++r)
        {
-            int  rt, nr;
+            ComplexD tmp;

-            GridThread::GetWork(a.rows(), thr, nr, rt);
-            for (unsigned int i = rt; i < nr + rt; ++i)
+            tmp = a.row(r).conjugate().dot(b.col(r));
+            parallel_critical
            {
-                tres[thr] += a.row(i).conjugate().dot(b.col(i));
+                res += tmp;
            }
        }
-        for (int thr = 0; thr < nThreads; ++thr)
-        {
-            res += tres[thr];
-        }
    });
    trBenchmark("Eigen col-wise dot", left, right, ref,
    [](ComplexD &res, const MatLeft &a, const MatRight &b)
    {
-        int                   nThreads = GridThread::GetThreads();
-        std::vector<ComplexD> tres(nThreads, 0.);
-
        res = 0.;
-        parallel_for (int thr = 0; thr < nThreads; ++thr)
+        parallel_for (unsigned int c = 0; c < a.cols(); ++c)
        {
-            int  ct, nc;
+            ComplexD tmp;

-            GridThread::GetWork(a.cols(), thr, nc, ct);
-            for (unsigned int j = ct; j < nc + ct; ++j)
+            tmp = a.col(c).conjugate().dot(b.row(c));
+            parallel_critical
            {
-                tres[thr] += a.col(j).conjugate().dot(b.row(j));
+                res += tmp;
            }
        }
-        for (int thr = 0; thr < nThreads; ++thr)
-        {
-            res += tres[thr];
-        }
    });
    trBenchmark("Eigen Hadamard", left, right, ref,
    [](ComplexD &res, const MatLeft &a, const MatRight &b)
@@ -302,50 +280,32 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
    trBenchmark("MKL row-wise zdotu", left, right, ref,
    [](ComplexD &res, const MatLeft &a, const MatRight &b)
    {
-        int                   nThreads = GridThread::GetThreads();
-        std::vector<ComplexD> tres(nThreads, 0.);
-
        res = 0.;
-        parallel_for (int thr = 0; thr < nThreads; ++thr)
+        parallel_for (unsigned int r = 0; r < a.rows(); ++r)
        {
            ComplexD tmp;
-            int      rt, nr;

-            GridThread::GetWork(a.rows(), thr, nr, rt);
-            for (unsigned int i = rt; i < nr + rt; ++i)
+            zdotuRow(tmp, r, a, b);
+            parallel_critical
            {
-                zdotuRow(tmp, i, a, b);
-                tres[thr] += tmp;
+                res += tmp;
            }
        }
-        for (int thr = 0; thr < nThreads; ++thr)
-        {
-            res += tres[thr];
-        }
    });
    trBenchmark("MKL col-wise zdotu", left, right, ref,
    [](ComplexD &res, const MatLeft &a, const MatRight &b)
    {
-        int                   nThreads = GridThread::GetThreads();
-        std::vector<ComplexD> tres(nThreads, 0.);
-
        res = 0.;
-        parallel_for (int thr = 0; thr < nThreads; ++thr)
+        parallel_for (unsigned int c = 0; c < a.cols(); ++c)
        {
            ComplexD tmp;
-            int      ct, nc;

-            GridThread::GetWork(a.cols(), thr, nc, ct);
-            for (unsigned int j = ct; j < nc + ct; ++j)
+            zdotuCol(tmp, c, a, b);
+            parallel_critical
            {
-                zdotuCol(tmp, j, a, b);
-                tres[thr] += tmp;
+                res += tmp;
            }
        }
-        for (int thr = 0; thr < nThreads; ++thr)
-        {
-            res += tres[thr];
-        }
    });
 #endif
    BARRIER();