diff --git a/Grid/threads/Threads.h b/Grid/threads/Threads.h
index 580278d6..1c13fccb 100644
--- a/Grid/threads/Threads.h
+++ b/Grid/threads/Threads.h
@@ -38,14 +38,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_OMP
 #include <omp.h>
 
-// complex reductions
-#pragma omp declare reduction(ComplexPlus:Grid::ComplexD, Grid::vComplexD, Grid::ComplexF, Grid::vComplexF: omp_out += omp_in)
-
 #define PARALLEL_FOR_LOOP        _Pragma("omp parallel for schedule(static)")
 #define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(static)")
-#define PARALLEL_FOR_REDUCE_HELPER0(x) #x
-#define PARALLEL_FOR_REDUCE_HELPER1(op, var) PARALLEL_FOR_REDUCE_HELPER0(omp parallel for schedule(static) reduction(op:var))
-#define PARALLEL_FOR_LOOP_REDUCE(op, var) _Pragma(PARALLEL_FOR_REDUCE_HELPER1(op, var))
 #define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
 #define PARALLEL_NESTED_LOOP5 _Pragma("omp parallel for collapse(5)")
 #define PARALLEL_REGION       _Pragma("omp parallel")
@@ -63,7 +57,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define parallel_region    PARALLEL_REGION
 #define parallel_for       PARALLEL_FOR_LOOP for
 #define parallel_for_internal PARALLEL_FOR_LOOP_INTERN for
-#define parallel_for_reduce(op, var) PARALLEL_FOR_LOOP_REDUCE(op, var) for
 #define parallel_for_nest2 PARALLEL_NESTED_LOOP2 for
 #define parallel_for_nest5 PARALLEL_NESTED_LOOP5 for
 
diff --git a/Hadrons/A2AMatrix.hpp b/Hadrons/A2AMatrix.hpp
index 831d2a02..288bb7c5 100644
--- a/Hadrons/A2AMatrix.hpp
+++ b/Hadrons/A2AMatrix.hpp
@@ -167,33 +167,52 @@ public:
     template <typename C, typename MatLeft, typename MatRight>
     static inline void accTrMul(C &acc, const MatLeft &a, const MatRight &b)
     {
+        int            nThreads = GridThread::GetThreads();
+        std::vector<C> tacc(nThreads, 0.);
+
         if ((MatLeft::Options == Eigen::RowMajor) and
             (MatRight::Options == Eigen::ColMajor))
         {
-            parallel_for_reduce(ComplexPlus, acc) (unsigned int r = 0; r < a.rows(); ++r)
+            parallel_for (int thr = 0; thr < nThreads; ++thr)
             {
+                int rt, nr;
+
+                GridThread::GetWork(a.rows(), thr, nr, rt);
+                for (unsigned int r = rt; r < nr + rt; ++r)
+                {
 #ifdef USE_MKL
-                ComplexD tmp;
-                dotuRow(tmp, r, a, b);
-                acc += tmp;
+                    C tmp;
+                    dotuRow(tmp, r, a, b);
+                    tacc[thr] += tmp;
 #else
-                acc += a.row(r).conjugate().dot(b.col(r));
+                    tacc[thr] += a.row(r).conjugate().dot(b.col(r));
 #endif
+                }
             }
         }
         else
         {
-            parallel_for_reduce(ComplexPlus, acc) (unsigned int c = 0; c < a.cols(); ++c)
+            parallel_for (int thr = 0; thr < nThreads; ++thr)
             {
+                int ct, nc;
+
+                GridThread::GetWork(a.cols(), thr, nc, ct);
+                for (unsigned int c = ct; c < nc + ct; ++c)
+                {
 #ifdef USE_MKL
-                ComplexD tmp;
-                dotuCol(tmp, c, a, b);
-                acc += tmp;
+                    C tmp;
+                    dotuCol(tmp, c, a, b);
+                    tacc[thr] += tmp;
 #else
-                acc += a.col(c).conjugate().dot(b.row(c));
+                    tacc[thr] += a.col(c).conjugate().dot(b.row(c));
 #endif
+                }
             }
         }
+        for (int thr = 0; thr < nThreads; ++thr)
+        {
+            acc += tacc[thr];
+        }
     }
 
     template <typename MatLeft, typename MatRight>
diff --git a/Hadrons/Utilities/ContractorBenchmark.cc b/Hadrons/Utilities/ContractorBenchmark.cc
index 83d0e49f..a33f4ab4 100644
--- a/Hadrons/Utilities/ContractorBenchmark.cc
+++ b/Hadrons/Utilities/ContractorBenchmark.cc
@@ -199,23 +199,49 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
     trBenchmark("Naive loop rows first", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
     { 
+        int                   nThreads = GridThread::GetThreads();
+        std::vector<ComplexD> tres(nThreads, 0.);
+
         res = 0.;
-        auto nr = a.rows(), nc = a.cols();
-        parallel_for_reduce(ComplexPlus, res) (unsigned int i = 0; i < nr; ++i)
-        for (unsigned int j = 0; j < nc; ++j)
+        parallel_for (int thr = 0; thr < nThreads; ++thr)
         {
-            res += a(i, j)*b(j, i);
+            int  rt, nr;
+            auto nc = a.cols();
+
+            GridThread::GetWork(a.rows(), thr, nr, rt);
+            for (unsigned int i = rt; i < nr + rt; ++i)
+            for (unsigned int j = 0; j < nc; ++j)
+            {
+                tres[thr] += a(i, j)*b(j, i);
+            }
+        }
+        for (int thr = 0; thr < nThreads; ++thr)
+        {
+            res += tres[thr];
         }
     });
     trBenchmark("Naive loop cols first", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
-    { 
+    {
+        int                   nThreads = GridThread::GetThreads();
+        std::vector<ComplexD> tres(nThreads, 0.);
+
         res = 0.;
-        auto nr = a.rows(), nc = a.cols();
-        parallel_for_reduce(ComplexPlus, res) (unsigned int j = 0; j < nc; ++j)
-        for (unsigned int i = 0; i < nr; ++i)
+        parallel_for (int thr = 0; thr < nThreads; ++thr)
         {
-            res += a(i, j)*b(j, i);
+            int  ct, nc;
+            auto nr = a.rows();
+
+            GridThread::GetWork(a.cols(), thr, nc, ct);
+            for (unsigned int j = ct; j < nc + ct; ++j)
+            for (unsigned int i = 0; i < nr; ++i)
+            {
+                tres[thr] += a(i, j)*b(j, i);
+            }
+        }
+        for (int thr = 0; thr < nThreads; ++thr)
+        {
+            res += tres[thr];
         }
     });
     trBenchmark("Eigen tr(A*B)", left, right, ref,
@@ -225,22 +251,46 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
     });
     trBenchmark("Eigen row-wise dot", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
-    { 
-        res = 0.;
+    {
+        int                   nThreads = GridThread::GetThreads();
+        std::vector<ComplexD> tres(nThreads, 0.);
 
-        parallel_for_reduce(ComplexPlus, res) (unsigned int r = 0; r < a.rows(); ++r)
+        res = 0.;
+        parallel_for (int thr = 0; thr < nThreads; ++thr)
         {
-            res += a.row(r).conjugate().dot(b.col(r));
+            int  rt, nr;
+
+            GridThread::GetWork(a.rows(), thr, nr, rt);
+            for (unsigned int i = rt; i < nr + rt; ++i)
+            {
+                tres[thr] += a.row(i).conjugate().dot(b.col(i));
+            }
+        }
+        for (int thr = 0; thr < nThreads; ++thr)
+        {
+            res += tres[thr];
         }
     });
     trBenchmark("Eigen col-wise dot", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
-    { 
-        res = 0.;
+    {
+        int                   nThreads = GridThread::GetThreads();
+        std::vector<ComplexD> tres(nThreads, 0.);
 
-        parallel_for_reduce(ComplexPlus, res) (unsigned int c = 0; c < a.cols(); ++c)
+        res = 0.;
+        parallel_for (int thr = 0; thr < nThreads; ++thr)
         {
-            res += a.col(c).conjugate().dot(b.row(c));
+            int  ct, nc;
+
+            GridThread::GetWork(a.cols(), thr, nc, ct);
+            for (unsigned int j = ct; j < nc + ct; ++j)
+            {
+                tres[thr] += a.col(j).conjugate().dot(b.row(j));
+            }
+        }
+        for (int thr = 0; thr < nThreads; ++thr)
+        {
+            res += tres[thr];
         }
     });
     trBenchmark("Eigen Hadamard", left, right, ref,
@@ -251,28 +301,50 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
 #ifdef USE_MKL
     trBenchmark("MKL row-wise zdotu", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
-    { 
-        res = 0.;
+    {
+        int                   nThreads = GridThread::GetThreads();
+        std::vector<ComplexD> tres(nThreads, 0.);
 
-        parallel_for_reduce(ComplexPlus, res) (unsigned int r = 0; r < a.rows(); ++r)
+        res = 0.;
+        parallel_for (int thr = 0; thr < nThreads; ++thr)
         {
             ComplexD tmp;
+            int      rt, nr;
 
-            zdotuRow(tmp, r, a, b);
-            res += tmp;
+            GridThread::GetWork(a.rows(), thr, nr, rt);
+            for (unsigned int i = rt; i < nr + rt; ++i)
+            {
+                zdotuRow(tmp, i, a, b);
+                tres[thr] += tmp;
+            }
+        }
+        for (int thr = 0; thr < nThreads; ++thr)
+        {
+            res += tres[thr];
         }
     });
     trBenchmark("MKL col-wise zdotu", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
-    { 
-        res = 0.;
+    {
+        int                   nThreads = GridThread::GetThreads();
+        std::vector<ComplexD> tres(nThreads, 0.);
 
-        parallel_for_reduce(ComplexPlus, res) (unsigned int c = 0; c < a.cols(); ++c)
+        res = 0.;
+        parallel_for (int thr = 0; thr < nThreads; ++thr)
         {
             ComplexD tmp;
+            int      ct, nc;
 
-            zdotuCol(tmp, c, a, b);
-            res += tmp;
+            GridThread::GetWork(a.cols(), thr, nc, ct);
+            for (unsigned int j = ct; j < nc + ct; ++j)
+            {
+                zdotuCol(tmp, j, a, b);
+                tres[thr] += tmp;
+            }
+        }
+        for (int thr = 0; thr < nThreads; ++thr)
+        {
+            res += tres[thr];
         }
     });
 #endif