diff --git a/Grid/threads/Threads.h b/Grid/threads/Threads.h
index 6403bff9..580278d6 100644
--- a/Grid/threads/Threads.h
+++ b/Grid/threads/Threads.h
@@ -39,12 +39,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <omp.h>
 
 // complex reductions
-#pragma omp declare reduction(ComplexPlus: Grid::Complex: omp_out += omp_in)
-#pragma omp declare reduction(GridVComplexPlus: Grid::vComplex: omp_out += omp_in)
-#pragma omp declare reduction(ComplexDPlus: Grid::ComplexD: omp_out += omp_in)
-#pragma omp declare reduction(GridVComplexDPlus: Grid::vComplexD: omp_out += omp_in)
-#pragma omp declare reduction(ComplexFPlus: Grid::ComplexF: omp_out += omp_in)
-#pragma omp declare reduction(GridVComplexFPlus: Grid::vComplexF: omp_out += omp_in)
+#pragma omp declare reduction(ComplexPlus:Grid::ComplexD, Grid::vComplexD, Grid::ComplexF, Grid::vComplexF: omp_out += omp_in)
 
 #define PARALLEL_FOR_LOOP        _Pragma("omp parallel for schedule(static)")
 #define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(static)")
diff --git a/Hadrons/A2AMatrix.hpp b/Hadrons/A2AMatrix.hpp
index 9842e15b..5266d050 100644
--- a/Hadrons/A2AMatrix.hpp
+++ b/Hadrons/A2AMatrix.hpp
@@ -32,6 +32,10 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #include <Hadrons/Global.hpp>
 #include <Hadrons/TimerArray.hpp>
 #include <Grid/Eigen/unsupported/CXX11/Tensor>
+#ifdef USE_MKL
+#include "mkl.h"
+#include "mkl_cblas.h"
+#endif
 
 #ifndef HADRONS_A2AM_NAME 
 #define HADRONS_A2AM_NAME "a2aMatrix"
@@ -58,6 +62,9 @@ using A2AMatrixSet = Eigen::TensorMap<Eigen::Tensor<T, 5, Eigen::RowMajor>>;
 template <typename T>
 using A2AMatrix = Eigen::Matrix<T, -1, -1, Eigen::RowMajor>;
 
+template <typename T>
+using A2AMatrixTr = Eigen::Matrix<T, -1, -1, Eigen::ColMajor>;
+
 /******************************************************************************
  *                      Abstract class for A2A kernels                        *
  ******************************************************************************/
@@ -150,6 +157,198 @@ private:
     std::vector<IoHelper> nodeIo_;
 };
 
+/******************************************************************************
+ *                       A2A matrix contraction kernels                       *
+ ******************************************************************************/
+class A2AContraction
+{
+public:
+    // accTrMul(acc, a, b): acc += tr(a*b)
+    template <typename C, typename MatLeft, typename MatRight>
+    static inline void accTrMul(C &acc, const MatLeft &a, const MatRight &b)
+    {
+        if ((MatLeft::Options == Eigen::RowMajor) and
+            (MatRight::Options == Eigen::ColMajor))
+        {
+            parallel_for_reduce(ComplexPlus, acc) (unsigned int r = 0; r < a.rows(); ++r)
+            {
+#ifdef USE_MKL
+                ComplexD tmp;
+                dotuRow(tmp, r, a, b);
+                acc += tmp;
+#else
+                acc += a.row(r).conjugate().dot(b.col(r));
+#endif
+            }
+        }
+        else
+        {
+            parallel_for_reduce(ComplexPlus, acc) (unsigned int c = 0; c < a.cols(); ++c)
+            {
+#ifdef USE_MKL
+                ComplexD tmp;
+                dotuCol(tmp, c, a, b);
+                acc += tmp;
+#else
+                acc += a.col(c).conjugate().dot(b.row(c));
+#endif
+            }
+        }
+    }
+
+    // mul(res, a, b): res = a*b
+#ifdef USE_MKL
+    template <template <class, int...> class Mat, int... Opts>
+    static inline void mul(Mat<ComplexD, Opts...> &res, 
+                           const Mat<ComplexD, Opts...> &a, 
+                           const Mat<ComplexD, Opts...> &b)
+    {
+        static const ComplexD one(1., 0.), zero(0., 0.);
+
+        if (Mat<ComplexD, Opts...>::Options == Eigen::RowMajor)
+        {
+            cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
+                        a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
+                        res.data(), res.cols());
+        }
+        else if (Mat<ComplexD, Opts...>::Options == Eigen::ColMajor)
+        {
+            cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
+                        a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
+                        res.data(), res.rows());
+        }
+    }
+
+    template <template <class, int...> class Mat, int... Opts>
+    static inline void mul(Mat<ComplexF, Opts...> &res, 
+                           const Mat<ComplexF, Opts...> &a, 
+                           const Mat<ComplexF, Opts...> &b)
+    {
+        static const ComplexF one(1., 0.), zero(0., 0.);
+
+        if (Mat<ComplexF, Opts...>::Options == Eigen::RowMajor)
+        {
+            cblas_cgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
+                        a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
+                        res.data(), res.cols());
+        }
+        else if (Mat<ComplexF, Opts...>::Options == Eigen::ColMajor)
+        {
+            cblas_cgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
+                        a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
+                        res.data(), res.rows());
+        }
+    }
+#else
+    template <typename Mat>
+    static inline void mul(Mat &res, const Mat &a, const Mat &b)
+    {
+        res = a*b;
+    }
+#endif
+
+private:
+    template <typename C, typename MatLeft, typename MatRight>
+    static inline void makeDotRowPt(C * &aPt, unsigned int &aInc, C * &bPt, 
+                                    unsigned int &bInc, const unsigned int aRow, 
+                                    const MatLeft &a, const MatRight &b)
+    {
+        if (MatLeft::Options == Eigen::RowMajor)
+        {
+            aPt  = a.data() + aRow*a.cols();
+            aInc = 1;
+        }
+        else if (MatLeft::Options == Eigen::ColMajor)
+        {
+            aPt  = a.data() + aRow;
+            aInc = a.rows();
+        }
+        if (MatRight::Options == Eigen::RowMajor)
+        {
+            bPt  = b.data() + aRow;
+            bInc = b.cols();
+        }
+        else if (MatRight::Options == Eigen::ColMajor)
+        {
+            bPt  = b.data() + aRow*b.rows();
+            bInc = 1;
+        }
+    }
+
+#ifdef USE_MKL
+    template <typename C, typename MatLeft, typename MatRight>
+    static inline void makeDotColPt(C * &aPt, unsigned int &aInc, C * &bPt, 
+                                    unsigned int &bInc, const unsigned int aCol, 
+                                    const MatLeft &a, const MatRight &b)
+    {
+        if (MatLeft::Options == Eigen::RowMajor)
+        {
+            aPt  = a.data() + aCol;
+            aInc = a.cols();
+        }
+        else if (MatLeft::Options == Eigen::ColMajor)
+        {
+            aPt  = a.data() + aCol*a.rows();
+            aInc = 1;
+        }
+        if (MatRight::Options == Eigen::RowMajor)
+        {
+            bPt  = b.data() + aCol*b.cols();
+            bInc = 1;
+        }
+        else if (MatRight::Options == Eigen::ColMajor)
+        {
+            bPt  = b.data() + aCol;
+            bInc = b.rows();
+        }
+    }
+
+    template <typename MatLeft, typename MatRight>
+    static inline void dotuRow(ComplexF &res, const unsigned int aRow,
+                               const MatLeft &a, const MatRight &b)
+    {
+        const ComplexF *aPt, *bPt;
+        unsigned int   aInc, bInc;
+
+        makeDotRowPt(aPt, aInc, bPt, bInc, aRow, a, b);
+        cblas_cdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
+    }
+
+    template <typename MatLeft, typename MatRight>
+    static inline void dotuCol(ComplexF &res, const unsigned int aCol,
+                               const MatLeft &a, const MatRight &b)
+    {
+        const ComplexF *aPt, *bPt;
+        unsigned int   aInc, bInc;
+
+        makeDotColPt(aPt, aInc, bPt, bInc, aCol, a, b);
+        cblas_cdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
+    }
+
+    template <typename MatLeft, typename MatRight>
+    static inline void dotuRow(ComplexD &res, const unsigned int aRow,
+                               const MatLeft &a, const MatRight &b)
+    {
+        const ComplexD *aPt, *bPt;
+        unsigned int   aInc, bInc;
+
+        makeDotRowPt(aPt, aInc, bPt, bInc, aRow, a, b);
+        cblas_zdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
+    }
+
+    template <typename MatLeft, typename MatRight>
+    static inline void dotuCol(ComplexD &res, const unsigned int aCol,
+                               const MatLeft &a, const MatRight &b)
+    {
+        const ComplexD *aPt, *bPt;
+        unsigned int   aInc, bInc;
+
+        makeDotColPt(aPt, aInc, bPt, bInc, aCol, a, b);
+        cblas_zdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
+    }
+#endif
+};
+
 /******************************************************************************
  *                     A2AMatrixIo template implementation                    *
  ******************************************************************************/
diff --git a/Hadrons/Utilities/ContractorBenchmark.cc b/Hadrons/Utilities/ContractorBenchmark.cc
index 2a8b033b..83d0e49f 100644
--- a/Hadrons/Utilities/ContractorBenchmark.cc
+++ b/Hadrons/Utilities/ContractorBenchmark.cc
@@ -1,16 +1,12 @@
 #include <Hadrons/Global.hpp>
+#include <Hadrons/A2AMatrix.hpp>
 #ifdef USE_MKL
 #include "mkl.h"
 #include "mkl_cblas.h"
 #endif
-#ifdef AVX2
-#include <immintrin.h>
-#endif
 
 using namespace Grid;
-
-typedef Eigen::Matrix<ComplexD, -1, -1, Eigen::RowMajor> RMCMat;
-typedef Eigen::Matrix<ComplexD, -1, -1, Eigen::ColMajor> CMCMat;
+using namespace Hadrons;
 
 #ifdef GRID_COMMS_MPI3
 #define GET_RANK(rank, nMpi) \
@@ -50,7 +46,7 @@ inline void trBenchmark(const std::string name, const MatLeft &left,
 
     if (rank == 0)
     {
-        std::cout << std::setw(30) << name << ": diff= "
+        std::cout << std::setw(34) << name << ": diff= "
                   << std::setw(12) << std::norm(buf-ref)
                   << std::setw(10) << t/1.0e6 << " sec "
                   << std::setw(10) << flops/t/1.0e3 << " GFlop/s " 
@@ -85,16 +81,17 @@ inline void mulBenchmark(const std::string name, const MatV &left,
 
     if (rank == 0)
     {
-        std::cout << std::setw(30) << name << ": diff= "
-                << std::setw(12) << (buf-ref).squaredNorm()
-                << std::setw(10) << t/1.0e6 << " sec "
-                << std::setw(10) << flops/t/1.0e3 << " GFlop/s " 
-                << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s "
-                << std::endl;
+        std::cout << std::setw(34) << name << ": diff= "
+                  << std::setw(12) << (buf-ref).squaredNorm()
+                  << std::setw(10) << t/1.0e6 << " sec "
+                  << std::setw(10) << flops/t/1.0e3 << " GFlop/s " 
+                  << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s "
+                  << std::endl;
     }
     ::sleep(1);
 }
 
+#ifdef USE_MKL
 template <typename MatLeft, typename MatRight>
 static inline void zdotuRow(ComplexD &res, const unsigned int aRow,
                             const MatLeft &a, const MatRight &b)
@@ -154,6 +151,7 @@ static inline void zdotuCol(ComplexD &res, const unsigned int aCol,
     }
     cblas_zdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
 }
+#endif
 
 template <typename MatLeft, typename MatRight>
 void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigned int nMat)
@@ -192,12 +190,18 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
     }
     BARRIER();
     ref = (left.back()*right.back()).trace();
+    trBenchmark("Hadrons A2AContraction::accTrMul", left, right, ref,
+    [](ComplexD &res, const MatLeft &a, const MatRight &b)
+    { 
+        res = 0.;
+        A2AContraction::accTrMul(res, a, b);
+    });
     trBenchmark("Naive loop rows first", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
     { 
         res = 0.;
         auto nr = a.rows(), nc = a.cols();
-        parallel_for_reduce(ComplexDPlus, res) (unsigned int i = 0; i < nr; ++i)
+        parallel_for_reduce(ComplexPlus, res) (unsigned int i = 0; i < nr; ++i)
         for (unsigned int j = 0; j < nc; ++j)
         {
             res += a(i, j)*b(j, i);
@@ -208,7 +212,7 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
     { 
         res = 0.;
         auto nr = a.rows(), nc = a.cols();
-        parallel_for_reduce(ComplexDPlus, res) (unsigned int j = 0; j < nc; ++j)
+        parallel_for_reduce(ComplexPlus, res) (unsigned int j = 0; j < nc; ++j)
         for (unsigned int i = 0; i < nr; ++i)
         {
             res += a(i, j)*b(j, i);
@@ -224,7 +228,7 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
     { 
         res = 0.;
 
-        parallel_for_reduce(ComplexDPlus, res) (unsigned int r = 0; r < a.rows(); ++r)
+        parallel_for_reduce(ComplexPlus, res) (unsigned int r = 0; r < a.rows(); ++r)
         {
             res += a.row(r).conjugate().dot(b.col(r));
         }
@@ -234,7 +238,7 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
     { 
         res = 0.;
 
-        parallel_for_reduce(ComplexDPlus, res) (unsigned int c = 0; c < a.cols(); ++c)
+        parallel_for_reduce(ComplexPlus, res) (unsigned int c = 0; c < a.cols(); ++c)
         {
             res += a.col(c).conjugate().dot(b.row(c));
         }
@@ -250,7 +254,7 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
     { 
         res = 0.;
 
-        parallel_for_reduce(ComplexDPlus, res) (unsigned int r = 0; r < a.rows(); ++r)
+        parallel_for_reduce(ComplexPlus, res) (unsigned int r = 0; r < a.rows(); ++r)
         {
             ComplexD tmp;
 
@@ -263,7 +267,7 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
     { 
         res = 0.;
 
-        parallel_for_reduce(ComplexDPlus, res) (unsigned int c = 0; c < a.cols(); ++c)
+        parallel_for_reduce(ComplexPlus, res) (unsigned int c = 0; c < a.cols(); ++c)
         {
             ComplexD tmp;
 
@@ -305,6 +309,11 @@ void fullMulBenchmark(const unsigned int ni, const unsigned int nj, const unsign
     }
     BARRIER();
     ref = left.back()*right.back();
+    mulBenchmark("Hadrons A2AContraction::mul", left, right, ref,
+    [](Mat &res, const Mat &a, const Mat &b)
+    { 
+        A2AContraction::mul(res, a, b);
+    });
     mulBenchmark("Eigen A*B", left, right, ref,
     [](Mat &res, const Mat &a, const Mat &b)
     { 
@@ -381,12 +390,12 @@ int main(int argc, char *argv[])
         std::cout << std::endl;
     }
 
-    fullTrBenchmark<RMCMat, RMCMat>(ni, nj, nMat);
-    fullTrBenchmark<RMCMat, CMCMat>(ni, nj, nMat);
-    fullTrBenchmark<CMCMat, RMCMat>(ni, nj, nMat);
-    fullTrBenchmark<CMCMat, CMCMat>(ni, nj, nMat);
-    fullMulBenchmark<RMCMat>(ni, nj, nMat);
-    fullMulBenchmark<CMCMat>(ni, nj, nMat);
+    fullTrBenchmark<A2AMatrix<ComplexD>, A2AMatrix<ComplexD>>(ni, nj, nMat);
+    fullTrBenchmark<A2AMatrix<ComplexD>, A2AMatrixTr<ComplexD>>(ni, nj, nMat);
+    fullTrBenchmark<A2AMatrixTr<ComplexD>, A2AMatrix<ComplexD>>(ni, nj, nMat);
+    fullTrBenchmark<A2AMatrixTr<ComplexD>, A2AMatrixTr<ComplexD>>(ni, nj, nMat);
+    fullMulBenchmark<A2AMatrix<ComplexD>>(ni, nj, nMat);
+    fullMulBenchmark<A2AMatrixTr<ComplexD>>(ni, nj, nMat);
     FINALIZE();
 
     return EXIT_SUCCESS;