From 4a7415e83c8827270390f044734063e698261b52 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Tue, 23 Oct 2018 21:00:54 +0100
Subject: [PATCH] Hadrons: contractor benchmark update

---
 Hadrons/Utilities/ContractorBenchmark.cc | 187 ++++++++++++++++++++---
 1 file changed, 165 insertions(+), 22 deletions(-)
diff --git a/Hadrons/Utilities/ContractorBenchmark.cc b/Hadrons/Utilities/ContractorBenchmark.cc
index 9a8c0f4e..d2ff7c9e 100644
--- a/Hadrons/Utilities/ContractorBenchmark.cc
+++ b/Hadrons/Utilities/ContractorBenchmark.cc
@@ -1,17 +1,30 @@
 #include <Hadrons/Global.hpp>
 #include <Hadrons/DiskVector.hpp>
+#ifdef USE_MKL
+#include "mkl.h"
+#include "mkl_cblas.h"
+#endif
 
 using namespace Grid;
 
+#define EIGEN_ROW_MAJOR 1
+#define EIGEN_COL_MAJOR 2
 #ifndef EIGEN_ORDER 
-#define EIGEN_ORDER RowMajor
+#define EIGEN_ORDER EIGEN_ROW_MAJOR
+#endif
+#if (EIGEN_ORDER == EIGEN_ROW_MAJOR)
+typedef Eigen::Matrix<ComplexD, -1, -1, Eigen::RowMajor> CMat;
+#elif (EIGEN_ORDER == EIGEN_COL_MAJOR)
+typedef Eigen::Matrix<ComplexD, -1, -1, Eigen::ColMajor> CMat;
 #endif
 
-typedef Eigen::Matrix<ComplexD, -1, -1, Eigen::EIGEN_ORDER> CMat;
 typedef std::vector<std::vector<CMat>>                      CMatSet;
 
-template <typename TwoMatFn>
-inline void trBenchmark(const std::string name, const CMatSet &mat, TwoMatFn fn)
+#pragma omp declare reduction(ComplexPlus: ComplexD: omp_out += omp_in) 
+
+template <typename Function>
+inline void trBenchmark(const std::string name, const CMatSet &mat, 
+                        const ComplexD ref, Function fn)
 {
     double       t, flops, bytes, n = mat[0][0].rows()*mat[0][0].cols();
     unsigned int nMat = mat[0].size();
@@ -27,8 +40,37 @@ inline void trBenchmark(const std::string name, const CMatSet &mat, TwoMatFn fn)
     flops = nMat*(6.*n + 2.*(n - 1.));
     bytes = nMat*(2.*n*sizeof(ComplexD));
 
-    std::cout << std::setw(30) << name << ": result= "
-              << std::setw(10) << buf
+    std::cout << std::setw(30) << name << ": diff= "
+              << std::setw(12) << std::norm(buf-ref)
+              << std::setw(10) << t/1.0e6 << " sec "
+              << std::setw(10) << flops/t/1.0e3 << " GFlop/s " 
+              << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s "
+              << std::endl;
+    ::sleep(1);
+}
+
+template <typename Function>
+inline void mulBenchmark(const std::string name, const CMatSet &mat, 
+                         const CMat ref, Function fn)
+{
+    double       t, flops, bytes;
+    double       nr = mat[0][0].rows(), nc = mat[0][0].cols(), n = nr*nc;
+    unsigned int nMat = mat[0].size();
+    CMat         buf(mat[0][0].rows(), mat[0][0].rows());
+
+    t = 0.;
+    for (unsigned int i = 0; i < nMat; ++i)
+    {
+        t -= usecond();
+        fn(buf, mat[0][i], mat[1][i]);
+        t += usecond();
+    }
+    flops = nMat*(n*(6.*nc + 2.*(nc - 1.)));
+    bytes = nMat*((2.*n+nr*nr)*sizeof(ComplexD));
+
+    std::cout << std::setw(30) << name << ": diff= "
+              << std::setw(12) << (buf-ref).squaredNorm()
+              << std::setw(10) << t/1.0e6 << " sec "
               << std::setw(10) << flops/t/1.0e3 << " GFlop/s " 
               << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s "
               << std::endl;
@@ -51,41 +93,60 @@ int main(int argc, char *argv[])
     nj   = std::stoi(argv[2]);
     nMat = std::stoi(argv[3]);
 
-    std::cout << "==== generating random matrices" << std::endl;    
-    CMatSet mat(2);
-    CMat    buf;
+    std::cout << "\n*** ALL-TO-ALL MATRIX CONTRACTION BENCHMARK ***\n" << std::endl;
+    std::cout << nMat << " " << ni << "x" << nj << " matrices\n" << std::endl;
+#ifdef EIGEN_USE_MKL_ALL
+    std::cout << "Eigen uses the MKL" << std::endl;
+#endif
+    std::cout << "Eigen uses ";
+#if (EIGEN_ORDER == EIGEN_ROW_MAJOR)
+    std::cout << "row-major ordering" << std::endl;
+#elif (EIGEN_ORDER == EIGEN_COL_MAJOR)
+    std::cout << "column-major ordering" << std::endl;
+#endif
+    std::cout << "Eigen uses " << Eigen::nbThreads() << " thread(s)" << std::endl;
+#ifdef USE_MKL
+    std::cout << "MKL   uses " << mkl_get_max_threads() << " thread(s)" << std::endl;
+#endif
+    std::cout << std::endl;
+
+    CMatSet  mat(2);
+    CMat     buf;
+    ComplexD ref;
 
     mat[0].resize(nMat, Eigen::MatrixXcd::Random(ni, nj));
     mat[1].resize(nMat, Eigen::MatrixXcd::Random(nj, ni));
 
-    std::cout << "==== tr(A*B) benchmark" << std::endl;
-    trBenchmark("Naive loop rows first", mat,
+    std::cout << "==== tr(A*B) benchmarks" << std::endl;
+    ref = (mat[0].back()*mat[1].back()).trace();
+    trBenchmark("Naive loop rows first", mat, ref,
     [](ComplexD &res, const CMat &a, const CMat &b)
     { 
         res = 0.;
-        #pragma omp parallel for schedule(static) reduction(+:res)
+        #pragma omp parallel for schedule(static) reduction(ComplexPlus:res)
         for (unsigned int i = 0; i < a.rows(); ++i)
         for (unsigned int j = 0; j < a.cols(); ++j)
         {
             res += a(i, j)*b(j, i);
         }
     });
-    trBenchmark("Naive loop cols first", mat,
+    trBenchmark("Naive loop cols first", mat, ref,
     [](ComplexD &res, const CMat &a, const CMat &b)
     { 
         res = 0.;
+        #pragma omp parallel for schedule(static) reduction(ComplexPlus:res)
         for (unsigned int j = 0; j < a.cols(); ++j)
         for (unsigned int i = 0; i < a.rows(); ++i)
         {
             res += a(i, j)*b(j, i);
         }
     });
-    trBenchmark("Eigen tr(A*B)", mat,
+    trBenchmark("Eigen tr(A*B)", mat, ref,
     [](ComplexD &res, const CMat &a, const CMat &b)
     { 
         res = (a*b).trace();
     });
-    trBenchmark("Eigen global dot", mat,
+    trBenchmark("Eigen global dot", mat, ref,
     [&buf](ComplexD &res, const CMat &a, const CMat &b)
     { 
         buf = b.transpose();
@@ -94,31 +155,113 @@ int main(int argc, char *argv[])
 
         res = av.conjugate().dot(bv);
     });
-    trBenchmark("Eigen row-wise dot", mat,
+    trBenchmark("Eigen row-wise dot", mat, ref,
     [](ComplexD &res, const CMat &a, const CMat &b)
     { 
         res = 0.;
-        #pragma omp parallel for schedule(static) reduction(+:res)
+        #pragma omp parallel for schedule(static) reduction(ComplexPlus:res)
         for (unsigned int r = 0; r < a.rows(); ++r)
         {
             res += a.row(r).conjugate().dot(b.col(r));
         }
     });
-    trBenchmark("Eigen col-wise dot", mat,
+    trBenchmark("Eigen col-wise dot", mat, ref,
     [](ComplexD &res, const CMat &a, const CMat &b)
     { 
         res = 0.;
-        #pragma omp parallel for schedule(static) reduction(+:res)
-        for (unsigned int r = 0; r < a.cols(); ++r)
+        #pragma omp parallel for schedule(static) reduction(ComplexPlus:res)
+        for (unsigned int c = 0; c < a.cols(); ++c)
         {
-            res += a.col(r).conjugate().dot(b.row(r));
+            res += a.col(c).conjugate().dot(b.row(c));
         }
     });
-    trBenchmark("Eigen Hadamard", mat,
+    trBenchmark("Eigen Hadamard", mat, ref,
     [](ComplexD &res, const CMat &a, const CMat &b)
     { 
         res = a.cwiseProduct(b.transpose()).sum();
     });
+#ifdef USE_MKL
+    trBenchmark("MKL row-wise zdotu", mat, ref,
+    [](ComplexD &res, const CMat &a, const CMat &b)
+    { 
+        res = 0.;
+        #pragma omp parallel for schedule(static) reduction(ComplexPlus:res)
+        for (unsigned int r = 0; r < a.rows(); ++r)
+        {
+            ComplexD tmp;
+
+#if (EIGEN_ORDER == EIGEN_ROW_MAJOR)
+            cblas_zdotu_sub(a.cols(), a.data() + r*a.cols(), 1, b.data() + r, b.cols(), &tmp);
+#elif (EIGEN_ORDER == EIGEN_COL_MAJOR)
+            cblas_zdotu_sub(a.cols(), a.data() + r, a.rows(), b.data() + r*b.rows(), 1, &tmp);
+#endif
+            res += tmp;
+        }
+    });
+    trBenchmark("MKL col-wise zdotu", mat, ref,
+    [](ComplexD &res, const CMat &a, const CMat &b)
+    { 
+        res = 0.;
+        #pragma omp parallel for schedule(static) reduction(ComplexPlus:res)
+        for (unsigned int c = 0; c < a.cols(); ++c)
+        {
+            ComplexD tmp;
+
+#if (EIGEN_ORDER == EIGEN_ROW_MAJOR)
+            cblas_zdotu_sub(a.rows(), a.data() + c, a.cols(), b.data() + c*b.cols(), 1, &tmp);
+#elif (EIGEN_ORDER == EIGEN_COL_MAJOR)
+            cblas_zdotu_sub(a.rows(), a.data() + c*a.rows(), 1, b.data() + c, b.rows(), &tmp);
+#endif
+            res += tmp;
+        }
+    });
+#endif
+
+    std::cout << std::endl;
+    std::cout << "==== A*B benchmarks" << std::endl;
+    buf = mat[0].back()*mat[1].back();
+    mulBenchmark("Naive", mat, buf,
+    [](CMat &res, const CMat &a, const CMat &b)
+    { 
+        unsigned int ni = a.rows(), nj = a.cols();
+
+        #pragma omp parallel for collapse(2)
+        for (unsigned int i = 0; i < ni; ++i)
+        for (unsigned int k = 0; k < ni; ++k)
+        {
+            res(i, k) = a(i, 0)*b(0, k);
+        }
+        #pragma omp parallel for collapse(2)
+        for (unsigned int i = 0; i < ni; ++i)
+        for (unsigned int k = 0; k < ni; ++k)
+        for (unsigned int j = 1; j < nj; ++j)
+        {
+            res(i, k) += a(i, j)*b(j, k);
+        }
+    });
+    mulBenchmark("Eigen A*B", mat, buf,
+    [](CMat &res, const CMat &a, const CMat &b)
+    { 
+        res = a*b;
+    });
+#ifdef USE_MKL
+    mulBenchmark("MKL A*B", mat, buf,
+    [](CMat &res, const CMat &a, const CMat &b)
+    {
+        const ComplexD one(1., 0.), zero(0., 0.);
+#if (EIGEN_ORDER == EIGEN_ROW_MAJOR)
+        cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
+                    a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
+                    res.data(), res.cols());
+#elif (EIGEN_ORDER == EIGEN_COL_MAJOR)
+        cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
+                    a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
+                    res.data(), res.rows());
+#endif
+    });
+#endif
+
+    std::cout << std::endl;
 
     return EXIT_SUCCESS;
 }