diff --git a/Hadrons/Utilities/ContractorBenchmark.cc b/Hadrons/Utilities/ContractorBenchmark.cc index 9a8c0f4e..d2ff7c9e 100644 --- a/Hadrons/Utilities/ContractorBenchmark.cc +++ b/Hadrons/Utilities/ContractorBenchmark.cc @@ -1,17 +1,30 @@ #include #include +#ifdef USE_MKL +#include "mkl.h" +#include "mkl_cblas.h" +#endif using namespace Grid; +#define EIGEN_ROW_MAJOR 1 +#define EIGEN_COL_MAJOR 2 #ifndef EIGEN_ORDER -#define EIGEN_ORDER RowMajor +#define EIGEN_ORDER EIGEN_ROW_MAJOR +#endif +#if (EIGEN_ORDER == EIGEN_ROW_MAJOR) +typedef Eigen::Matrix CMat; +#elif (EIGEN_ORDER == EIGEN_COL_MAJOR) +typedef Eigen::Matrix CMat; #endif -typedef Eigen::Matrix CMat; typedef std::vector> CMatSet; -template -inline void trBenchmark(const std::string name, const CMatSet &mat, TwoMatFn fn) +#pragma omp declare reduction(ComplexPlus: ComplexD: omp_out += omp_in) + +template +inline void trBenchmark(const std::string name, const CMatSet &mat, + const ComplexD ref, Function fn) { double t, flops, bytes, n = mat[0][0].rows()*mat[0][0].cols(); unsigned int nMat = mat[0].size(); @@ -27,8 +40,37 @@ inline void trBenchmark(const std::string name, const CMatSet &mat, TwoMatFn fn) flops = nMat*(6.*n + 2.*(n - 1.)); bytes = nMat*(2.*n*sizeof(ComplexD)); - std::cout << std::setw(30) << name << ": result= " - << std::setw(10) << buf + std::cout << std::setw(30) << name << ": diff= " + << std::setw(12) << std::norm(buf-ref) + << std::setw(10) << t/1.0e6 << " sec " + << std::setw(10) << flops/t/1.0e3 << " GFlop/s " + << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " + << std::endl; + ::sleep(1); +} + +template +inline void mulBenchmark(const std::string name, const CMatSet &mat, + const CMat ref, Function fn) +{ + double t, flops, bytes; + double nr = mat[0][0].rows(), nc = mat[0][0].cols(), n = nr*nc; + unsigned int nMat = mat[0].size(); + CMat buf(mat[0][0].rows(), mat[0][0].rows()); + + t = 0.; + for (unsigned int i = 0; i < nMat; ++i) + { + t -= usecond(); + fn(buf, mat[0][i], mat[1][i]); + t += usecond(); + } + flops = nMat*(n*(6.*nc + 2.*(nc - 1.))); + bytes = nMat*((2.*n+nr*nr)*sizeof(ComplexD)); + + std::cout << std::setw(30) << name << ": diff= " + << std::setw(12) << (buf-ref).squaredNorm() + << std::setw(10) << t/1.0e6 << " sec " << std::setw(10) << flops/t/1.0e3 << " GFlop/s " << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl; @@ -51,41 +93,60 @@ int main(int argc, char *argv[]) nj = std::stoi(argv[2]); nMat = std::stoi(argv[3]); - std::cout << "==== generating random matrices" << std::endl; - CMatSet mat(2); - CMat buf; + std::cout << "\n*** ALL-TO-ALL MATRIX CONTRACTION BENCHMARK ***\n" << std::endl; + std::cout << nMat << " " << ni << "x" << nj << " matrices\n" << std::endl; +#ifdef EIGEN_USE_MKL_ALL + std::cout << "Eigen uses the MKL" << std::endl; +#endif + std::cout << "Eigen uses "; +#if (EIGEN_ORDER == EIGEN_ROW_MAJOR) + std::cout << "row-major ordering" << std::endl; +#elif (EIGEN_ORDER == EIGEN_COL_MAJOR) + std::cout << "column-major ordering" << std::endl; +#endif + std::cout << "Eigen uses " << Eigen::nbThreads() << " thread(s)" << std::endl; +#ifdef USE_MKL + std::cout << "MKL uses " << mkl_get_max_threads() << " thread(s)" << std::endl; +#endif + std::cout << std::endl; + + CMatSet mat(2); + CMat buf; + ComplexD ref; mat[0].resize(nMat, Eigen::MatrixXcd::Random(ni, nj)); mat[1].resize(nMat, Eigen::MatrixXcd::Random(nj, ni)); - std::cout << "==== tr(A*B) benchmark" << std::endl; - trBenchmark("Naive loop rows first", mat, + std::cout << "==== tr(A*B) benchmarks" << std::endl; + ref = (mat[0].back()*mat[1].back()).trace(); + trBenchmark("Naive loop rows first", mat, ref, [](ComplexD &res, const CMat &a, const CMat &b) { res = 0.; - #pragma omp parallel for schedule(static) reduction(+:res) + #pragma omp parallel for schedule(static) reduction(ComplexPlus:res) for (unsigned int i = 0; i < a.rows(); ++i) for (unsigned int j = 0; j < a.cols(); ++j) { res += a(i, j)*b(j, i); } }); - trBenchmark("Naive loop cols first", mat, + trBenchmark("Naive loop cols first", mat, ref, [](ComplexD &res, const CMat &a, const CMat &b) { res = 0.; + #pragma omp parallel for schedule(static) reduction(ComplexPlus:res) for (unsigned int j = 0; j < a.cols(); ++j) for (unsigned int i = 0; i < a.rows(); ++i) { res += a(i, j)*b(j, i); } }); - trBenchmark("Eigen tr(A*B)", mat, + trBenchmark("Eigen tr(A*B)", mat, ref, [](ComplexD &res, const CMat &a, const CMat &b) { res = (a*b).trace(); }); - trBenchmark("Eigen global dot", mat, + trBenchmark("Eigen global dot", mat, ref, [&buf](ComplexD &res, const CMat &a, const CMat &b) { buf = b.transpose(); @@ -94,31 +155,113 @@ int main(int argc, char *argv[]) res = av.conjugate().dot(bv); }); - trBenchmark("Eigen row-wise dot", mat, + trBenchmark("Eigen row-wise dot", mat, ref, [](ComplexD &res, const CMat &a, const CMat &b) { res = 0.; - #pragma omp parallel for schedule(static) reduction(+:res) + #pragma omp parallel for schedule(static) reduction(ComplexPlus:res) for (unsigned int r = 0; r < a.rows(); ++r) { res += a.row(r).conjugate().dot(b.col(r)); } }); - trBenchmark("Eigen col-wise dot", mat, + trBenchmark("Eigen col-wise dot", mat, ref, [](ComplexD &res, const CMat &a, const CMat &b) { res = 0.; - #pragma omp parallel for schedule(static) reduction(+:res) - for (unsigned int r = 0; r < a.cols(); ++r) + #pragma omp parallel for schedule(static) reduction(ComplexPlus:res) + for (unsigned int c = 0; c < a.cols(); ++c) { - res += a.col(r).conjugate().dot(b.row(r)); + res += a.col(c).conjugate().dot(b.row(c)); } }); - trBenchmark("Eigen Hadamard", mat, + trBenchmark("Eigen Hadamard", mat, ref, [](ComplexD &res, const CMat &a, const CMat &b) { res = a.cwiseProduct(b.transpose()).sum(); }); +#ifdef USE_MKL + trBenchmark("MKL row-wise zdotu", mat, ref, + [](ComplexD &res, const CMat &a, const CMat &b) + { + res = 0.; + #pragma omp parallel for schedule(static) reduction(ComplexPlus:res) + for (unsigned int r = 0; r < a.rows(); ++r) + { + ComplexD tmp; + +#if (EIGEN_ORDER == EIGEN_ROW_MAJOR) + cblas_zdotu_sub(a.cols(), a.data() + r*a.cols(), 1, b.data() + r, b.cols(), &tmp); +#elif (EIGEN_ORDER == EIGEN_COL_MAJOR) + cblas_zdotu_sub(a.cols(), a.data() + r, a.rows(), b.data() + r*b.rows(), 1, &tmp); +#endif + res += tmp; + } + }); + trBenchmark("MKL col-wise zdotu", mat, ref, + [](ComplexD &res, const CMat &a, const CMat &b) + { + res = 0.; + #pragma omp parallel for schedule(static) reduction(ComplexPlus:res) + for (unsigned int c = 0; c < a.cols(); ++c) + { + ComplexD tmp; + +#if (EIGEN_ORDER == EIGEN_ROW_MAJOR) + cblas_zdotu_sub(a.rows(), a.data() + c, a.cols(), b.data() + c*b.cols(), 1, &tmp); +#elif (EIGEN_ORDER == EIGEN_COL_MAJOR) + cblas_zdotu_sub(a.rows(), a.data() + c*a.rows(), 1, b.data() + c, b.rows(), &tmp); +#endif + res += tmp; + } + }); +#endif + + std::cout << std::endl; + std::cout << "==== A*B benchmarks" << std::endl; + buf = mat[0].back()*mat[1].back(); + mulBenchmark("Naive", mat, buf, + [](CMat &res, const CMat &a, const CMat &b) + { + unsigned int ni = a.rows(), nj = a.cols(); + + #pragma omp parallel for collapse(2) + for (unsigned int i = 0; i < ni; ++i) + for (unsigned int k = 0; k < ni; ++k) + { + res(i, k) = a(i, 0)*b(0, k); + } + #pragma omp parallel for collapse(2) + for (unsigned int i = 0; i < ni; ++i) + for (unsigned int k = 0; k < ni; ++k) + for (unsigned int j = 1; j < nj; ++j) + { + res(i, k) += a(i, j)*b(j, k); + } + }); + mulBenchmark("Eigen A*B", mat, buf, + [](CMat &res, const CMat &a, const CMat &b) + { + res = a*b; + }); +#ifdef USE_MKL + mulBenchmark("MKL A*B", mat, buf, + [](CMat &res, const CMat &a, const CMat &b) + { + const ComplexD one(1., 0.), zero(0., 0.); +#if (EIGEN_ORDER == EIGEN_ROW_MAJOR) + cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(), + a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero, + res.data(), res.cols()); +#elif (EIGEN_ORDER == EIGEN_COL_MAJOR) + cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(), + a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero, + res.data(), res.rows()); +#endif + }); +#endif + + std::cout << std::endl; return EXIT_SUCCESS; }