mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

Hadrons: final, portable form of the contractor benchmark

Antonin Portelli 2018-11-05 21:29:13 +00:00
parent 1ed4ea344d
commit 1651111d18
2 changed files with 352 additions and 213 deletions

Changed file 1 of 2 — threading macros (OpenMP)

@@ -38,8 +38,19 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifdef GRID_OMP
#include <omp.h>
// complex reductions
#pragma omp declare reduction(ComplexPlus: Grid::Complex: omp_out += omp_in)
#pragma omp declare reduction(GridVComplexPlus: Grid::vComplex: omp_out += omp_in)
#pragma omp declare reduction(ComplexDPlus: Grid::ComplexD: omp_out += omp_in)
#pragma omp declare reduction(GridVComplexDPlus: Grid::vComplexD: omp_out += omp_in)
#pragma omp declare reduction(ComplexFPlus: Grid::ComplexF: omp_out += omp_in)
#pragma omp declare reduction(GridVComplexFPlus: Grid::vComplexF: omp_out += omp_in)
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(static)")
#define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(static)")
#define PARALLEL_FOR_REDUCE_HELPER0(x) #x
#define PARALLEL_FOR_REDUCE_HELPER1(op, var) PARALLEL_FOR_REDUCE_HELPER0(omp parallel for schedule(static) reduction(op:var))
#define PARALLEL_FOR_LOOP_REDUCE(op, var) _Pragma(PARALLEL_FOR_REDUCE_HELPER1(op, var))
#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
#define PARALLEL_NESTED_LOOP5 _Pragma("omp parallel for collapse(5)")
#define PARALLEL_REGION _Pragma("omp parallel")
@@ -47,6 +58,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#else
#define PARALLEL_FOR_LOOP
#define PARALLEL_FOR_LOOP_INTERN
#define PARALLEL_FOR_LOOP_REDUCE(op, var)
#define PARALLEL_NESTED_LOOP2
#define PARALLEL_NESTED_LOOP5
#define PARALLEL_REGION
@@ -56,6 +68,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define parallel_region PARALLEL_REGION
#define parallel_for PARALLEL_FOR_LOOP for
#define parallel_for_internal PARALLEL_FOR_LOOP_INTERN for
#define parallel_for_reduce(op, var) PARALLEL_FOR_LOOP_REDUCE(op, var) for
#define parallel_for_nest2 PARALLEL_NESTED_LOOP2 for
#define parallel_for_nest5 PARALLEL_NESTED_LOOP5 for
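
A note on the new reduction machinery: _Pragma only accepts a string literal, and macro arguments cannot be spliced into a plain #pragma line, so PARALLEL_FOR_REDUCE_HELPER1 first pastes the reduction(op:var) clause into the pragma text and PARALLEL_FOR_REDUCE_HELPER0 stringizes the result. A minimal, self-contained sketch of the same pattern (macro and reduction names here are illustrative, not the Grid ones):

#include <complex>
#include <iostream>

// stringize in two steps so op and var are substituted before #x applies
#define PRAGMA_STR(x) #x
#define PRAGMA_RED(op, var) PRAGMA_STR(omp parallel for schedule(static) reduction(op:var))
#define parallel_for_reduce(op, var) _Pragma(PRAGMA_RED(op, var)) for

// OpenMP 4.0 user-defined reduction for a C++ complex type
#pragma omp declare reduction(CDPlus: std::complex<double>: omp_out += omp_in)

int main(void)
{
    std::complex<double> res = 0.;
    parallel_for_reduce(CDPlus, res) (int i = 0; i < 1000; ++i)
    {
        res += std::complex<double>(1., 0.);
    }
    std::cout << res << std::endl; // (1000,0) with or without -fopenmp
}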

Changed file 2 of 2 — Hadrons contractor benchmark

@@ -1,86 +1,346 @@
#include <Hadrons/Global.hpp>
#include <Hadrons/DiskVector.hpp>
#ifdef USE_MKL
#include "mkl.h"
#include "mkl_cblas.h"
#endif
#ifdef AVX2
#include <immintrin.h>
#endif
using namespace Grid;
#define EIGEN_ROW_MAJOR 1
#define EIGEN_COL_MAJOR 2
#ifndef EIGEN_ORDER
#define EIGEN_ORDER EIGEN_ROW_MAJOR
#endif
#if (EIGEN_ORDER == EIGEN_ROW_MAJOR)
typedef Eigen::Matrix<ComplexD, -1, -1, Eigen::RowMajor> CMat;
#elif (EIGEN_ORDER == EIGEN_COL_MAJOR)
typedef Eigen::Matrix<ComplexD, -1, -1, Eigen::ColMajor> CMat;
typedef Eigen::Matrix<ComplexD, -1, -1, Eigen::RowMajor> RMCMat;
typedef Eigen::Matrix<ComplexD, -1, -1, Eigen::ColMajor> CMCMat;
#ifdef GRID_COMMS_MPI3
#define GET_RANK(rank, nMpi) \
MPI_Comm_size(MPI_COMM_WORLD, &(nMpi));\
MPI_Comm_rank(MPI_COMM_WORLD, &(rank))
#define BARRIER() MPI_Barrier(MPI_COMM_WORLD)
#define INIT() MPI_Init(NULL, NULL)
#define FINALIZE() MPI_Finalize()
#else
#define GET_RANK(rank, nMpi) (nMpi) = 1; (rank) = 0
#define BARRIER()
#define INIT()
#define FINALIZE()
#endif
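
These four macros are what makes the benchmark portable: under GRID_COMMS_MPI3 they map to real MPI calls, otherwise they collapse to a single-rank no-op, so the loop bounds rank*nMat/nMpi .. (rank+1)*nMat/nMpi used below are valid in both builds. A small sketch of how that integer arithmetic partitions the work:

#include <iostream>

int main(void)
{
    // with the non-MPI branch (nMpi = 1, rank = 0) the bounds reduce to
    // the full range [0, nMat); integer division keeps the chunks
    // contiguous and within one element of each other in size
    const unsigned int nMat = 10, nMpi = 4;
    for (unsigned int rank = 0; rank < nMpi; ++rank)
    {
        std::cout << "rank " << rank << ": [" << rank*nMat/nMpi
                  << ", " << (rank+1)*nMat/nMpi << ")" << std::endl;
    }
    // rank 0: [0, 2)  rank 1: [2, 5)  rank 2: [5, 7)  rank 3: [7, 10)
}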
typedef std::vector<std::vector<CMat>> CMatSet;
#pragma omp declare reduction(ComplexPlus: ComplexD: omp_out += omp_in)
template <typename Function>
inline void trBenchmark(const std::string name, const CMatSet &mat,
const ComplexD ref, Function fn)
template <typename Function, typename MatLeft, typename MatRight>
inline void trBenchmark(const std::string name, const MatLeft &left,
const MatRight &right, const ComplexD ref, Function fn)
{
double t, flops, bytes, n = mat[0][0].rows()*mat[0][0].cols();
unsigned int nMat = mat[0].size();
double t, flops, bytes, n = left[0].rows()*left[0].cols();
unsigned int nMat = left.size();
int nMpi, rank;
ComplexD buf;
t = 0.;
for (unsigned int i = 0; i < nMat; ++i)
GET_RANK(rank, nMpi);
t = -usecond();
BARRIER();
for (unsigned int i = rank*nMat/nMpi; i < (rank+1)*nMat/nMpi; ++i)
{
t -= usecond();
fn(buf, mat[0][i], mat[1][i]);
t += usecond();
fn(buf, left[i], right[i]);
}
BARRIER();
t += usecond();
flops = nMat*(6.*n + 2.*(n - 1.));
bytes = nMat*(2.*n*sizeof(ComplexD));
std::cout << std::setw(30) << name << ": diff= "
<< std::setw(12) << std::norm(buf-ref)
<< std::setw(10) << t/1.0e6 << " sec "
<< std::setw(10) << flops/t/1.0e3 << " GFlop/s "
<< std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s "
<< std::endl;
if (rank == 0)
{
std::cout << std::setw(30) << name << ": diff= "
<< std::setw(12) << std::norm(buf-ref)
<< std::setw(10) << t/1.0e6 << " sec "
<< std::setw(10) << flops/t/1.0e3 << " GFlop/s "
<< std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s "
<< std::endl;
}
::sleep(1);
}
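
The flop and byte counts above follow the usual convention for complex arithmetic: one complex multiply is 6 real flops, one complex add is 2, and tr(A*B) reduces n = rows*cols element pairs with n multiplies and n-1 adds while streaming both matrices once. A sketch of what that model implies (sizes illustrative):

#include <complex>
#include <iostream>

int main(void)
{
    // tr(A*B) over n element pairs: 6n + 2(n-1) flops,
    // 2n complex<double> = 32n bytes streamed per matrix pair
    const double n = 128.*1024., nMat = 32.;
    const double flops = nMat*(6.*n + 2.*(n - 1.));
    const double bytes = nMat*2.*n*sizeof(std::complex<double>);
    std::cout << "arithmetic intensity: " << flops/bytes
              << " flop/byte" << std::endl; // ~0.25: memory-bandwidth bound
}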
template <typename Function>
inline void mulBenchmark(const std::string name, const CMatSet &mat,
const CMat ref, Function fn)
template <typename Function, typename MatV, typename Mat>
inline void mulBenchmark(const std::string name, const MatV &left,
const MatV &right, const Mat &ref, Function fn)
{
double t, flops, bytes;
double nr = mat[0][0].rows(), nc = mat[0][0].cols(), n = nr*nc;
unsigned int nMat = mat[0].size();
CMat buf(mat[0][0].rows(), mat[0][0].rows());
double nr = left[0].rows(), nc = left[0].cols(), n = nr*nc;
unsigned int nMat = left.size();
int nMpi, rank;
Mat buf(left[0].rows(), left[0].rows());
t = 0.;
for (unsigned int i = 0; i < nMat; ++i)
GET_RANK(rank, nMpi);
t = -usecond();
BARRIER();
for (unsigned int i = rank*nMat/nMpi; i < (rank+1)*nMat/nMpi; ++i)
{
t -= usecond();
fn(buf, mat[0][i], mat[1][i]);
t += usecond();
fn(buf, left[i], right[i]);
}
flops = nMat*(n*(6.*nc + 2.*(nc - 1.)));
bytes = nMat*((2.*n+nr*nr)*sizeof(ComplexD));
BARRIER();
t += usecond();
flops = nMat*(nr*nr*(6.*nc + 2.*(nc - 1.)));
bytes = nMat*(2*nc*nr*sizeof(ComplexD));
std::cout << std::setw(30) << name << ": diff= "
<< std::setw(12) << (buf-ref).squaredNorm()
<< std::setw(10) << t/1.0e6 << " sec "
<< std::setw(10) << flops/t/1.0e3 << " GFlop/s "
<< std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s "
<< std::endl;
if (rank == 0)
{
std::cout << std::setw(30) << name << ": diff= "
<< std::setw(12) << (buf-ref).squaredNorm()
<< std::setw(10) << t/1.0e6 << " sec "
<< std::setw(10) << flops/t/1.0e3 << " GFlop/s "
<< std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s "
<< std::endl;
}
::sleep(1);
}
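
The counting here is also corrected: each of the nr x nr entries of A*B is a complex dot product of length nc, so flops = nMat*nr*nr*(6*nc + 2*(nc - 1)) ~ 8*nMat*nr*nr*nc, while the byte count now streams only the two nr x nc inputs, 2*nc*nr complex doubles per pair. The flop/byte ratio therefore grows like nr/4 with the matrix size, so unlike the trace benchmarks the products should become compute-bound for large matrices.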
template <typename MatLeft, typename MatRight>
static inline void zdotuRow(ComplexD &res, const unsigned int aRow,
const MatLeft &a, const MatRight &b)
{
const ComplexD *aPt, *bPt;
unsigned int aInc, bInc;
if (MatLeft::Options == Eigen::RowMajor)
{
aPt = a.data() + aRow*a.cols();
aInc = 1;
}
else if (MatLeft::Options == Eigen::ColMajor)
{
aPt = a.data() + aRow;
aInc = a.rows();
}
if (MatRight::Options == Eigen::RowMajor)
{
bPt = b.data() + aRow;
bInc = b.cols();
}
else if (MatRight::Options == Eigen::ColMajor)
{
bPt = b.data() + aRow*b.rows();
bInc = 1;
}
cblas_zdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
}
template <typename MatLeft, typename MatRight>
static inline void zdotuCol(ComplexD &res, const unsigned int aCol,
const MatLeft &a, const MatRight &b)
{
const ComplexD *aPt, *bPt;
unsigned int aInc, bInc;
if (MatLeft::Options == Eigen::RowMajor)
{
aPt = a.data() + aCol;
aInc = a.cols();
}
else if (MatLeft::Options == Eigen::ColMajor)
{
aPt = a.data() + aCol*a.rows();
aInc = 1;
}
if (MatRight::Options == Eigen::RowMajor)
{
bPt = b.data() + aCol*b.cols();
bInc = 1;
}
else if (MatRight::Options == Eigen::ColMajor)
{
bPt = b.data() + aCol;
bInc = b.rows();
}
cblas_zdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
}
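
Both helpers reduce to a single strided cblas_zdotu_sub call: a row of a row-major matrix (or a column of a column-major one) is contiguous with stride 1, while the orthogonal direction is strided by the leading dimension. A minimal usage sketch, assuming MKL and Eigen are on the include path:

#include <complex>
#include <iostream>
#include <Eigen/Dense>
#include "mkl_cblas.h"

typedef std::complex<double> ComplexD;
typedef Eigen::Matrix<ComplexD, -1, -1, Eigen::RowMajor> RMCMat;

int main(void)
{
    RMCMat a = RMCMat::Random(4, 8), b = RMCMat::Random(8, 4);
    ComplexD res;
    // row 0 of a is contiguous; column 0 of row-major b has stride b.cols()
    cblas_zdotu_sub(a.cols(), a.data(), 1, b.data(), b.cols(), &res);
    // zdotu is the unconjugated dot product, hence the .conjugate() to
    // cancel the conjugation applied by Eigen's dot() to its left operand
    std::cout << res - a.row(0).conjugate().dot(b.col(0)) << std::endl; // ~0
}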
template <typename MatLeft, typename MatRight>
void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigned int nMat)
{
std::vector<MatLeft> left;
std::vector<MatRight> right;
MatRight buf;
ComplexD ref;
int rank, nMpi;
left.resize(nMat, MatLeft::Random(ni, nj));
right.resize(nMat, MatRight::Random(nj, ni));
GET_RANK(rank, nMpi);
if (rank == 0)
{
std::cout << "==== tr(A*B) benchmarks" << std::endl;
std::cout << "A matrices use ";
if (MatLeft::Options == Eigen::RowMajor)
{
std::cout << "row-major ordering" << std::endl;
}
else if (MatLeft::Options == Eigen::ColMajor)
{
std::cout << "col-major ordering" << std::endl;
}
std::cout << "B matrices use ";
if (MatRight::Options == Eigen::RowMajor)
{
std::cout << "row-major ordering" << std::endl;
}
else if (MatRight::Options == Eigen::ColMajor)
{
std::cout << "col-major ordering" << std::endl;
}
std::cout << std::endl;
}
BARRIER();
ref = (left.back()*right.back()).trace();
trBenchmark("Naive loop rows first", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b)
{
res = 0.;
auto nr = a.rows(), nc = a.cols();
parallel_for_reduce(ComplexDPlus, res) (unsigned int i = 0; i < nr; ++i)
for (unsigned int j = 0; j < nc; ++j)
{
res += a(i, j)*b(j, i);
}
});
trBenchmark("Naive loop cols first", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b)
{
res = 0.;
auto nr = a.rows(), nc = a.cols();
parallel_for_reduce(ComplexDPlus, res) (unsigned int j = 0; j < nc; ++j)
for (unsigned int i = 0; i < nr; ++i)
{
res += a(i, j)*b(j, i);
}
});
trBenchmark("Eigen tr(A*B)", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b)
{
res = (a*b).trace();
});
trBenchmark("Eigen row-wise dot", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b)
{
res = 0.;
parallel_for_reduce(ComplexDPlus, res) (unsigned int r = 0; r < a.rows(); ++r)
{
res += a.row(r).conjugate().dot(b.col(r));
}
});
trBenchmark("Eigen col-wise dot", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b)
{
res = 0.;
parallel_for_reduce(ComplexDPlus, res) (unsigned int c = 0; c < a.cols(); ++c)
{
res += a.col(c).conjugate().dot(b.row(c));
}
});
trBenchmark("Eigen Hadamard", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b)
{
res = a.cwiseProduct(b.transpose()).sum();
});
#ifdef USE_MKL
trBenchmark("MKL row-wise zdotu", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b)
{
res = 0.;
parallel_for_reduce(ComplexDPlus, res) (unsigned int r = 0; r < a.rows(); ++r)
{
ComplexD tmp;
zdotuRow(tmp, r, a, b);
res += tmp;
}
});
trBenchmark("MKL col-wise zdotu", left, right, ref,
[](ComplexD &res, const MatLeft &a, const MatRight &b)
{
res = 0.;
parallel_for_reduce(ComplexDPlus, res) (unsigned int c = 0; c < a.cols(); ++c)
{
ComplexD tmp;
zdotuCol(tmp, c, a, b);
res += tmp;
}
});
#endif
BARRIER();
if (rank == 0)
{
std::cout << std::endl;
}
}
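
The "Eigen Hadamard" variant relies on the identity tr(A*B) = sum_{i,j} A(i,j)*B(j,i), i.e. the sum of the elementwise product of A with B transposed; Eigen can evaluate that expression in a single pass without forming the full product. A quick check of the identity:

#include <Eigen/Dense>
#include <iostream>

int main(void)
{
    Eigen::MatrixXcd a = Eigen::MatrixXcd::Random(5, 7);
    Eigen::MatrixXcd b = Eigen::MatrixXcd::Random(7, 5);
    // tr(A*B) versus the one-pass Hadamard evaluation
    std::cout << (a*b).trace() - a.cwiseProduct(b.transpose()).sum()
              << std::endl; // ~0 up to rounding
}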
template <typename Mat>
void fullMulBenchmark(const unsigned int ni, const unsigned int nj, const unsigned int nMat)
{
std::vector<Mat> left, right;
Mat ref;
int rank, nMpi;
left.resize(nMat, Mat::Random(ni, nj));
right.resize(nMat, Mat::Random(nj, ni));
GET_RANK(rank, nMpi);
if (rank == 0)
{
std::cout << "==== A*B benchmarks" << std::endl;
std::cout << "all matrices use ";
if (Mat::Options == Eigen::RowMajor)
{
std::cout << "row-major ordering" << std::endl;
}
else if (Mat::Options == Eigen::ColMajor)
{
std::cout << "col-major ordering" << std::endl;
}
std::cout << std::endl;
}
BARRIER();
ref = left.back()*right.back();
mulBenchmark("Eigen A*B", left, right, ref,
[](Mat &res, const Mat &a, const Mat &b)
{
res = a*b;
});
#ifdef USE_MKL
mulBenchmark("MKL A*B", left, right, ref,
[](Mat &res, const Mat &a, const Mat &b)
{
const ComplexD one(1., 0.), zero(0., 0.);
if (Mat::Options == Eigen::RowMajor)
{
cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
res.data(), res.cols());
}
else if (Mat::Options == Eigen::ColMajor)
{
cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
res.data(), res.rows());
}
});
#endif
BARRIER();
if (rank == 0)
{
std::cout << std::endl;
}
}
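
For the MKL branch, note the leading dimensions: with a contiguous dense matrix the leading dimension is the length of one stored row under CblasRowMajor (lda = cols) and of one stored column under CblasColMajor (lda = rows), which is why a.cols()/b.cols()/res.cols() and a.rows()/b.rows()/res.rows() can be passed straight through from Eigen, whose default dense storage is contiguous.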
int main(int argc, char *argv[])
{
// parse command line
Eigen::Index ni, nj, nMat;
int nMpi, rank;
if (argc != 4)
{
@@ -93,175 +353,41 @@ int main(int argc, char *argv[])
nj = std::stoi(argv[2]);
nMat = std::stoi(argv[3]);
std::cout << "\n*** ALL-TO-ALL MATRIX CONTRACTION BENCHMARK ***\n" << std::endl;
std::cout << nMat << " " << ni << "x" << nj << " matrices\n" << std::endl;
#ifdef EIGEN_USE_MKL_ALL
std::cout << "Eigen uses the MKL" << std::endl;
#endif
std::cout << "Eigen uses ";
#if (EIGEN_ORDER == EIGEN_ROW_MAJOR)
std::cout << "row-major ordering" << std::endl;
#elif (EIGEN_ORDER == EIGEN_COL_MAJOR)
std::cout << "column-major ordering" << std::endl;
#endif
std::cout << "Eigen uses " << Eigen::nbThreads() << " thread(s)" << std::endl;
#ifdef USE_MKL
std::cout << "MKL uses " << mkl_get_max_threads() << " thread(s)" << std::endl;
#endif
std::cout << std::endl;
CMatSet mat(2);
CMat buf;
ComplexD ref;
mat[0].resize(nMat, Eigen::MatrixXcd::Random(ni, nj));
mat[1].resize(nMat, Eigen::MatrixXcd::Random(nj, ni));
std::cout << "==== tr(A*B) benchmarks" << std::endl;
ref = (mat[0].back()*mat[1].back()).trace();
trBenchmark("Naive loop rows first", mat, ref,
[](ComplexD &res, const CMat &a, const CMat &b)
{
res = 0.;
#pragma omp parallel for schedule(static) reduction(ComplexPlus:res)
for (unsigned int i = 0; i < a.rows(); ++i)
for (unsigned int j = 0; j < a.cols(); ++j)
{
res += a(i, j)*b(j, i);
}
});
trBenchmark("Naive loop cols first", mat, ref,
[](ComplexD &res, const CMat &a, const CMat &b)
{
res = 0.;
#pragma omp parallel for schedule(static) reduction(ComplexPlus:res)
for (unsigned int j = 0; j < a.cols(); ++j)
for (unsigned int i = 0; i < a.rows(); ++i)
{
res += a(i, j)*b(j, i);
}
});
trBenchmark("Eigen tr(A*B)", mat, ref,
[](ComplexD &res, const CMat &a, const CMat &b)
{
res = (a*b).trace();
});
trBenchmark("Eigen global dot", mat, ref,
[&buf](ComplexD &res, const CMat &a, const CMat &b)
{
buf = b.transpose();
Eigen::Map<const Eigen::VectorXcd> av(a.data(), a.rows()*a.cols());
Eigen::Map<const Eigen::VectorXcd> bv(buf.data(), b.rows()*b.cols());
res = av.conjugate().dot(bv);
});
trBenchmark("Eigen row-wise dot", mat, ref,
[](ComplexD &res, const CMat &a, const CMat &b)
{
res = 0.;
#pragma omp parallel for schedule(static) reduction(ComplexPlus:res)
for (unsigned int r = 0; r < a.rows(); ++r)
{
res += a.row(r).conjugate().dot(b.col(r));
}
});
trBenchmark("Eigen col-wise dot", mat, ref,
[](ComplexD &res, const CMat &a, const CMat &b)
{
res = 0.;
#pragma omp parallel for schedule(static) reduction(ComplexPlus:res)
for (unsigned int c = 0; c < a.cols(); ++c)
{
res += a.col(c).conjugate().dot(b.row(c));
}
});
trBenchmark("Eigen Hadamard", mat, ref,
[](ComplexD &res, const CMat &a, const CMat &b)
{
res = a.cwiseProduct(b.transpose()).sum();
});
#ifdef USE_MKL
trBenchmark("MKL row-wise zdotu", mat, ref,
[](ComplexD &res, const CMat &a, const CMat &b)
{
res = 0.;
#pragma omp parallel for schedule(static) reduction(ComplexPlus:res)
for (unsigned int r = 0; r < a.rows(); ++r)
{
ComplexD tmp;
#if (EIGEN_ORDER == EIGEN_ROW_MAJOR)
cblas_zdotu_sub(a.cols(), a.data() + r*a.cols(), 1, b.data() + r, b.cols(), &tmp);
#elif (EIGEN_ORDER == EIGEN_COL_MAJOR)
cblas_zdotu_sub(a.cols(), a.data() + r, a.rows(), b.data() + r*b.rows(), 1, &tmp);
#endif
res += tmp;
}
});
trBenchmark("MKL col-wise zdotu", mat, ref,
[](ComplexD &res, const CMat &a, const CMat &b)
{
res = 0.;
#pragma omp parallel for schedule(static) reduction(ComplexPlus:res)
for (unsigned int c = 0; c < a.cols(); ++c)
{
ComplexD tmp;
#if (EIGEN_ORDER == EIGEN_ROW_MAJOR)
cblas_zdotu_sub(a.rows(), a.data() + c, a.cols(), b.data() + c*b.cols(), 1, &tmp);
#elif (EIGEN_ORDER == EIGEN_COL_MAJOR)
cblas_zdotu_sub(a.rows(), a.data() + c*a.rows(), 1, b.data() + c, b.rows(), &tmp);
#endif
res += tmp;
}
});
#endif
std::cout << std::endl;
std::cout << "==== A*B benchmarks" << std::endl;
buf = mat[0].back()*mat[1].back();
mulBenchmark("Naive", mat, buf,
[](CMat &res, const CMat &a, const CMat &b)
{
unsigned int ni = a.rows(), nj = a.cols();
#pragma omp parallel for collapse(2)
for (unsigned int i = 0; i < ni; ++i)
for (unsigned int k = 0; k < ni; ++k)
{
res(i, k) = a(i, 0)*b(0, k);
}
#pragma omp parallel for collapse(2)
for (unsigned int i = 0; i < ni; ++i)
for (unsigned int k = 0; k < ni; ++k)
for (unsigned int j = 1; j < nj; ++j)
{
res(i, k) += a(i, j)*b(j, k);
}
});
mulBenchmark("Eigen A*B", mat, buf,
[](CMat &res, const CMat &a, const CMat &b)
{
res = a*b;
});
#ifdef USE_MKL
mulBenchmark("MKL A*B", mat, buf,
[](CMat &res, const CMat &a, const CMat &b)
INIT();
GET_RANK(rank, nMpi);
if (rank == 0)
{
const ComplexD one(1., 0.), zero(0., 0.);
#if (EIGEN_ORDER == EIGEN_ROW_MAJOR)
cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
res.data(), res.cols());
#elif (EIGEN_ORDER == EIGEN_COL_MAJOR)
cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
res.data(), res.rows());
#endif
});
std::cout << "\n*** ALL-TO-ALL MATRIX CONTRACTION BENCHMARK ***\n" << std::endl;
std::cout << nMat << " couples of " << ni << "x" << nj << " matrices\n" << std::endl;
std::cout << nMpi << " MPI processes" << std::endl;
#ifdef GRID_OMP
#pragma omp parallel
{
#pragma omp single
std::cout << omp_get_num_threads() << " threads\n" << std::endl;
}
#else
std::cout << "Single-threaded\n" << std::endl;
#endif
std::cout << std::endl;
#ifdef EIGEN_USE_MKL_ALL
std::cout << "Eigen uses the MKL" << std::endl;
#endif
std::cout << "Eigen uses " << Eigen::nbThreads() << " threads" << std::endl;
#ifdef USE_MKL
std::cout << "MKL uses " << mkl_get_max_threads() << " threads" << std::endl;
#endif
std::cout << std::endl;
}
fullTrBenchmark<RMCMat, RMCMat>(ni, nj, nMat);
fullTrBenchmark<RMCMat, CMCMat>(ni, nj, nMat);
fullTrBenchmark<CMCMat, RMCMat>(ni, nj, nMat);
fullTrBenchmark<CMCMat, CMCMat>(ni, nj, nMat);
fullMulBenchmark<RMCMat>(ni, nj, nMat);
fullMulBenchmark<CMCMat>(ni, nj, nMat);
FINALIZE();
return EXIT_SUCCESS;
}
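
A plausible way to run the result, assuming an MPI build and an executable named contractor-benchmark (the name is illustrative; the commit does not show the build target):

mpirun -np 4 ./contractor-benchmark 128 1024 32

The three arguments are ni, nj and nMat: 32 pairs of 128x1024 matrices, split across 4 MPI ranks, with each rank running its share of the pairs through the OpenMP reductions defined in the threading header above.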