Hadrons: fast A2A matrix contraction kernels

2025-12-20 12:44:30 +00:00 · 2018-11-06 19:49:09 +00:00
parent 9734e3ee58
commit 88d9922e4f
3 changed files with 234 additions and 31 deletions
--- a/Grid/threads/Threads.h
+++ b/Grid/threads/Threads.h
@@ -39,12 +39,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <omp.h>
 // complex reductions
-#pragma omp declare reduction(ComplexPlus: Grid::Complex: omp_out += omp_in)
+#pragma omp declare reduction(ComplexPlus:Grid::ComplexD, Grid::vComplexD, Grid::ComplexF, Grid::vComplexF: omp_out += omp_in)
 #pragma omp declare reduction(GridVComplexPlus: Grid::vComplex: omp_out += omp_in)
 #pragma omp declare reduction(ComplexDPlus: Grid::ComplexD: omp_out += omp_in)
 #pragma omp declare reduction(GridVComplexDPlus: Grid::vComplexD: omp_out += omp_in)
 #pragma omp declare reduction(ComplexFPlus: Grid::ComplexF: omp_out += omp_in)
 #pragma omp declare reduction(GridVComplexFPlus: Grid::vComplexF: omp_out += omp_in)
 #define PARALLEL_FOR_LOOP        _Pragma("omp parallel for schedule(static)")
 #define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(static)")
--- a/Hadrons/A2AMatrix.hpp
+++ b/Hadrons/A2AMatrix.hpp
@@ -32,6 +32,10 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #include <Hadrons/Global.hpp>
 #include <Hadrons/TimerArray.hpp>
 #include <Grid/Eigen/unsupported/CXX11/Tensor>
 #ifdef USE_MKL
 #include "mkl.h"
 #include "mkl_cblas.h"
 #endif
 #ifndef HADRONS_A2AM_NAME 
 #define HADRONS_A2AM_NAME "a2aMatrix"
@@ -58,6 +62,9 @@ using A2AMatrixSet = Eigen::TensorMap<Eigen::Tensor<T, 5, Eigen::RowMajor>>;
 template <typename T>
 using A2AMatrix = Eigen::Matrix<T, -1, -1, Eigen::RowMajor>;
 template <typename T>
 using A2AMatrixTr = Eigen::Matrix<T, -1, -1, Eigen::ColMajor>;
 /******************************************************************************
 *                      Abstract class for A2A kernels                        *
 ******************************************************************************/
@@ -150,6 +157,198 @@ private:
    std::vector<IoHelper> nodeIo_;
 };
 /******************************************************************************
 *                       A2A matrix contraction kernels                       *
 ******************************************************************************/
 class A2AContraction
 {
 public:
    // accTrMul(acc, a, b): acc += tr(a*b)
    template <typename C, typename MatLeft, typename MatRight>
    static inline void accTrMul(C &acc, const MatLeft &a, const MatRight &b)
    {
        if ((MatLeft::Options == Eigen::RowMajor) and
            (MatRight::Options == Eigen::ColMajor))
        {
            parallel_for_reduce(ComplexPlus, acc) (unsigned int r = 0; r < a.rows(); ++r)
            {
 #ifdef USE_MKL
                ComplexD tmp;
                dotuRow(tmp, r, a, b);
                acc += tmp;
 #else
                acc += a.row(r).conjugate().dot(b.col(r));
 #endif
            }
        }
        else
        {
            parallel_for_reduce(ComplexPlus, acc) (unsigned int c = 0; c < a.cols(); ++c)
            {
 #ifdef USE_MKL
                ComplexD tmp;
                dotuCol(tmp, c, a, b);
                acc += tmp;
 #else
                acc += a.col(c).conjugate().dot(b.row(c));
 #endif
            }
        }
    }
    // mul(res, a, b): res = a*b
 #ifdef USE_MKL
    template <template <class, int...> class Mat, int... Opts>
    static inline void mul(Mat<ComplexD, Opts...> &res, 
                           const Mat<ComplexD, Opts...> &a, 
                           const Mat<ComplexD, Opts...> &b)
    {
        static const ComplexD one(1., 0.), zero(0., 0.);
        if (Mat<ComplexD, Opts...>::Options == Eigen::RowMajor)
        {
            cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
                        a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
                        res.data(), res.cols());
        }
        else if (Mat<ComplexD, Opts...>::Options == Eigen::ColMajor)
        {
            cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
                        a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
                        res.data(), res.rows());
        }
    }
    template <template <class, int...> class Mat, int... Opts>
    static inline void mul(Mat<ComplexF, Opts...> &res, 
                           const Mat<ComplexF, Opts...> &a, 
                           const Mat<ComplexF, Opts...> &b)
    {
        static const ComplexF one(1., 0.), zero(0., 0.);
        if (Mat<ComplexF, Opts...>::Options == Eigen::RowMajor)
        {
            cblas_cgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
                        a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
                        res.data(), res.cols());
        }
        else if (Mat<ComplexF, Opts...>::Options == Eigen::ColMajor)
        {
            cblas_cgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
                        a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
                        res.data(), res.rows());
        }
    }
 #else
    template <typename Mat>
    static inline void mul(Mat &res, const Mat &a, const Mat &b)
    {
        res = a*b;
    }
 #endif
 private:
    template <typename C, typename MatLeft, typename MatRight>
    static inline void makeDotRowPt(C * &aPt, unsigned int &aInc, C * &bPt, 
                                    unsigned int &bInc, const unsigned int aRow, 
                                    const MatLeft &a, const MatRight &b)
    {
        if (MatLeft::Options == Eigen::RowMajor)
        {
            aPt  = a.data() + aRow*a.cols();
            aInc = 1;
        }
        else if (MatLeft::Options == Eigen::ColMajor)
        {
            aPt  = a.data() + aRow;
            aInc = a.rows();
        }
        if (MatRight::Options == Eigen::RowMajor)
        {
            bPt  = b.data() + aRow;
            bInc = b.cols();
        }
        else if (MatRight::Options == Eigen::ColMajor)
        {
            bPt  = b.data() + aRow*b.rows();
            bInc = 1;
        }
    }
 #ifdef USE_MKL
    template <typename C, typename MatLeft, typename MatRight>
    static inline void makeDotColPt(C * &aPt, unsigned int &aInc, C * &bPt, 
                                    unsigned int &bInc, const unsigned int aCol, 
                                    const MatLeft &a, const MatRight &b)
    {
        if (MatLeft::Options == Eigen::RowMajor)
        {
            aPt  = a.data() + aCol;
            aInc = a.cols();
        }
        else if (MatLeft::Options == Eigen::ColMajor)
        {
            aPt  = a.data() + aCol*a.rows();
            aInc = 1;
        }
        if (MatRight::Options == Eigen::RowMajor)
        {
            bPt  = b.data() + aCol*b.cols();
            bInc = 1;
        }
        else if (MatRight::Options == Eigen::ColMajor)
        {
            bPt  = b.data() + aCol;
            bInc = b.rows();
        }
    }
    template <typename MatLeft, typename MatRight>
    static inline void dotuRow(ComplexF &res, const unsigned int aRow,
                               const MatLeft &a, const MatRight &b)
    {
        const ComplexF *aPt, *bPt;
        unsigned int   aInc, bInc;
        makeDotRowPt(aPt, aInc, bPt, bInc, aRow, a, b);
        cblas_cdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
    }
    template <typename MatLeft, typename MatRight>
    static inline void dotuCol(ComplexF &res, const unsigned int aCol,
                               const MatLeft &a, const MatRight &b)
    {
        const ComplexF *aPt, *bPt;
        unsigned int   aInc, bInc;
        makeDotColPt(aPt, aInc, bPt, bInc, aCol, a, b);
        cblas_cdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
    }
    template <typename MatLeft, typename MatRight>
    static inline void dotuRow(ComplexD &res, const unsigned int aRow,
                               const MatLeft &a, const MatRight &b)
    {
        const ComplexD *aPt, *bPt;
        unsigned int   aInc, bInc;
        makeDotRowPt(aPt, aInc, bPt, bInc, aRow, a, b);
        cblas_zdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
    }
    template <typename MatLeft, typename MatRight>
    static inline void dotuCol(ComplexD &res, const unsigned int aCol,
                               const MatLeft &a, const MatRight &b)
    {
        const ComplexD *aPt, *bPt;
        unsigned int   aInc, bInc;
        makeDotColPt(aPt, aInc, bPt, bInc, aCol, a, b);
        cblas_zdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
    }
 #endif
 };
 /******************************************************************************
 *                     A2AMatrixIo template implementation                    *
 ******************************************************************************/
--- a/Hadrons/Utilities/ContractorBenchmark.cc
+++ b/Hadrons/Utilities/ContractorBenchmark.cc
@@ -1,16 +1,12 @@
 #include <Hadrons/Global.hpp>
 #include <Hadrons/A2AMatrix.hpp>
 #ifdef USE_MKL
 #include "mkl.h"
 #include "mkl_cblas.h"
 #endif
 #ifdef AVX2
 #include <immintrin.h>
 #endif
 using namespace Grid;
-
+using namespace Hadrons;
 typedef Eigen::Matrix<ComplexD, -1, -1, Eigen::RowMajor> RMCMat;
 typedef Eigen::Matrix<ComplexD, -1, -1, Eigen::ColMajor> CMCMat;
 #ifdef GRID_COMMS_MPI3
 #define GET_RANK(rank, nMpi) \
@@ -50,7 +46,7 @@ inline void trBenchmark(const std::string name, const MatLeft &left,
    if (rank == 0)
    {
-        std::cout << std::setw(30) << name << ": diff= "
+        std::cout << std::setw(34) << name << ": diff= "
                  << std::setw(12) << std::norm(buf-ref)
                  << std::setw(10) << t/1.0e6 << " sec "
                  << std::setw(10) << flops/t/1.0e3 << " GFlop/s " 
@@ -85,16 +81,17 @@ inline void mulBenchmark(const std::string name, const MatV &left,
    if (rank == 0)
    {
-        std::cout << std::setw(30) << name << ": diff= "
+        std::cout << std::setw(34) << name << ": diff= "
-                << std::setw(12) << (buf-ref).squaredNorm()
+                  << std::setw(12) << (buf-ref).squaredNorm()
-                << std::setw(10) << t/1.0e6 << " sec "
+                  << std::setw(10) << t/1.0e6 << " sec "
-                << std::setw(10) << flops/t/1.0e3 << " GFlop/s " 
+                  << std::setw(10) << flops/t/1.0e3 << " GFlop/s " 
-                << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s "
+                  << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s "
-                << std::endl;
+                  << std::endl;
    }
    ::sleep(1);
 }
 #ifdef USE_MKL
 template <typename MatLeft, typename MatRight>
 static inline void zdotuRow(ComplexD &res, const unsigned int aRow,
                            const MatLeft &a, const MatRight &b)
@@ -154,6 +151,7 @@ static inline void zdotuCol(ComplexD &res, const unsigned int aCol,
    }
    cblas_zdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
 }
 #endif
 template <typename MatLeft, typename MatRight>
 void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigned int nMat)
@@ -192,12 +190,18 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
    }
    BARRIER();
    ref = (left.back()*right.back()).trace();
    trBenchmark("Hadrons A2AContraction::accTrMul", left, right, ref,
    [](ComplexD &res, const MatLeft &a, const MatRight &b)
    { 
        res = 0.;
        A2AContraction::accTrMul(res, a, b);
    });
    trBenchmark("Naive loop rows first", left, right, ref,
    [](ComplexD &res, const MatLeft &a, const MatRight &b)
    { 
        res = 0.;
        auto nr = a.rows(), nc = a.cols();
-        parallel_for_reduce(ComplexDPlus, res) (unsigned int i = 0; i < nr; ++i)
+        parallel_for_reduce(ComplexPlus, res) (unsigned int i = 0; i < nr; ++i)
        for (unsigned int j = 0; j < nc; ++j)
        {
            res += a(i, j)*b(j, i);
@@ -208,7 +212,7 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
    { 
        res = 0.;
        auto nr = a.rows(), nc = a.cols();
-        parallel_for_reduce(ComplexDPlus, res) (unsigned int j = 0; j < nc; ++j)
+        parallel_for_reduce(ComplexPlus, res) (unsigned int j = 0; j < nc; ++j)
        for (unsigned int i = 0; i < nr; ++i)
        {
            res += a(i, j)*b(j, i);
@@ -224,7 +228,7 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
    { 
        res = 0.;
-        parallel_for_reduce(ComplexDPlus, res) (unsigned int r = 0; r < a.rows(); ++r)
+        parallel_for_reduce(ComplexPlus, res) (unsigned int r = 0; r < a.rows(); ++r)
        {
            res += a.row(r).conjugate().dot(b.col(r));
        }
@@ -234,7 +238,7 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
    { 
        res = 0.;
-        parallel_for_reduce(ComplexDPlus, res) (unsigned int c = 0; c < a.cols(); ++c)
+        parallel_for_reduce(ComplexPlus, res) (unsigned int c = 0; c < a.cols(); ++c)
        {
            res += a.col(c).conjugate().dot(b.row(c));
        }
@@ -250,7 +254,7 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
    { 
        res = 0.;
-        parallel_for_reduce(ComplexDPlus, res) (unsigned int r = 0; r < a.rows(); ++r)
+        parallel_for_reduce(ComplexPlus, res) (unsigned int r = 0; r < a.rows(); ++r)
        {
            ComplexD tmp;
@@ -263,7 +267,7 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
    { 
        res = 0.;
-        parallel_for_reduce(ComplexDPlus, res) (unsigned int c = 0; c < a.cols(); ++c)
+        parallel_for_reduce(ComplexPlus, res) (unsigned int c = 0; c < a.cols(); ++c)
        {
            ComplexD tmp;
@@ -305,6 +309,11 @@ void fullMulBenchmark(const unsigned int ni, const unsigned int nj, const unsign
    }
    BARRIER();
    ref = left.back()*right.back();
    mulBenchmark("Hadrons A2AContraction::mul", left, right, ref,
    [](Mat &res, const Mat &a, const Mat &b)
    { 
        A2AContraction::mul(res, a, b);
    });
    mulBenchmark("Eigen A*B", left, right, ref,
    [](Mat &res, const Mat &a, const Mat &b)
    { 
@@ -381,12 +390,12 @@ int main(int argc, char *argv[])
        std::cout << std::endl;
    }
-    fullTrBenchmark<RMCMat, RMCMat>(ni, nj, nMat);
+    fullTrBenchmark<A2AMatrix<ComplexD>, A2AMatrix<ComplexD>>(ni, nj, nMat);
-    fullTrBenchmark<RMCMat, CMCMat>(ni, nj, nMat);
+    fullTrBenchmark<A2AMatrix<ComplexD>, A2AMatrixTr<ComplexD>>(ni, nj, nMat);
-    fullTrBenchmark<CMCMat, RMCMat>(ni, nj, nMat);
+    fullTrBenchmark<A2AMatrixTr<ComplexD>, A2AMatrix<ComplexD>>(ni, nj, nMat);
-    fullTrBenchmark<CMCMat, CMCMat>(ni, nj, nMat);
+    fullTrBenchmark<A2AMatrixTr<ComplexD>, A2AMatrixTr<ComplexD>>(ni, nj, nMat);
-    fullMulBenchmark<RMCMat>(ni, nj, nMat);
+    fullMulBenchmark<A2AMatrix<ComplexD>>(ni, nj, nMat);
-    fullMulBenchmark<CMCMat>(ni, nj, nMat);
+    fullMulBenchmark<A2AMatrixTr<ComplexD>>(ni, nj, nMat);
    FINALIZE();
    return EXIT_SUCCESS;