From f4033ad8cb32c34debe1623d84eab7c0d79116d5 Mon Sep 17 00:00:00 2001 From: ferben Date: Mon, 27 Apr 2020 17:46:14 +0100 Subject: [PATCH 01/19] baryon speedup by a factor 2 --- Grid/qcd/utils/BaryonUtils.h | 416 ++++++++++++++++++++++++++++++++++- 1 file changed, 407 insertions(+), 9 deletions(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index d65b9176..18d6f84b 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -46,7 +46,8 @@ public: typedef typename SpinMatrixField::vector_object sobj; static const int epsilon[6][3] ; - static const Complex epsilon_sgn[6]; + //static const Complex epsilon_sgn[6]; + static const double epsilon_sgn[6]; private: template @@ -60,6 +61,62 @@ public: const int parity, const int * wick_contractions, robj &result); + template + static void baryon_site_macro(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int parity, + robj &result); + template + static void baryon_site_macro(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int parity, + const int * wick_contractions, + robj &result); + template + static inline void baryon_site_template(unsigned int mask, const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int parity, + robj &result); + template + static inline void baryon_site_template(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int parity, + robj &result); + + template + struct BaryonSiteHelper + { + template + static inline void function(const unsigned int mask, const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int parity, + robj &result); + }; public: static void ContractBaryons(const PropagatorField &q1_left, const PropagatorField &q2_left, @@ -151,14 +208,18 @@ public: template const int BaryonUtils::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}}; -template +/*template const Complex BaryonUtils::epsilon_sgn[6] = {Complex(1), Complex(1), Complex(1), Complex(-1), Complex(-1), Complex(-1)}; +*/ +template +const double BaryonUtils::epsilon_sgn[6] = {1.0,1.0,1.0,-1.0,-1.0,-1.0}; +//This is the old version template template void BaryonUtils::baryon_site(const mobj &D1, @@ -188,13 +249,15 @@ void BaryonUtils::baryon_site(const mobj &D1, int a_right = epsilon[ie_right][0]; //a' int b_right = epsilon[ie_right][1]; //b' int c_right = epsilon[ie_right][2]; //c' + //complex ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right]; + double ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right]; //This is the \delta_{456}^{123} part if (wick_contraction[0]){ auto D2g = D2 * GammaB_left; for (int alpha_right=0; alpha_right::baryon_site(const mobj &D1, for (int alpha_right=0; alpha_right::baryon_site(const mobj &D1, for (int alpha_right=0; alpha_right::baryon_site(const mobj &D1, for (int alpha_right=0; alpha_right::baryon_site(const mobj &D1, for (int alpha_right=0; alpha_right::baryon_site(const mobj &D1, for (int alpha_right=0; alpha_right +template +void BaryonUtils::baryon_site_macro(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int parity, + robj &result) +{ + + Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4) + + auto gD1a = GammaA_left * GammaA_right * D1; + auto gD1b = GammaA_left * g4 * GammaA_right * D1; + auto pD1 = 0.5* (gD1a + (double)parity * gD1b); + auto gD3 = GammaB_right * D3; + + auto D2g = D2 * GammaB_left; + auto pD1g = pD1 * GammaB_left; + auto gD3g = gD3 * GammaB_left; + + for (int ie_left=0; ie_left < 6 ; ie_left++){ + int a_left = epsilon[ie_left][0]; //a + int b_left = epsilon[ie_left][1]; //b + int c_left = epsilon[ie_left][2]; //c + for (int ie_right=0; ie_right < 6 ; ie_right++){ + int a_right = epsilon[ie_right][0]; //a' + int b_right = epsilon[ie_right][1]; //b' + int c_right = epsilon[ie_right][2]; //c' + double ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right]; + //All parts together + for (int gamma_left=0; gamma_left( D1, D2, D3, GA_l, GB_l, GA_r, GB_r, parity, result );\ +} + +template +template +void BaryonUtils::baryon_site_macro(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int parity, + const int * wick_contraction, + robj &result) +{ +BARYON_SITE( 0 , 0 , 0 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 0 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 0 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 0 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 0 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 0 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 0 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 0 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 1 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 1 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 1 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 1 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 1 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 1 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 1 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 1 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 0 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 0 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 0 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 0 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 0 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 0 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 0 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 0 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 1 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 1 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 1 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 1 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 1 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 1 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 1 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 1 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 0 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 0 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 0 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 0 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 0 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 0 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 0 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 0 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 1 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 1 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 1 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 1 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 1 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 1 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 1 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 1 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 0 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 0 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 0 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 0 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 0 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 0 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 0 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 0 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 1 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 1 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 1 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 1 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 1 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 1 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 1 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 1 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); + +} + + +template +template +inline void BaryonUtils::baryon_site_template(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int parity, + robj &result) +{ + constexpr bool wick_contraction_0 = ((mask & (1 << 5)) >> 5); + constexpr bool wick_contraction_1 = ((mask & (1 << 4)) >> 4); + constexpr bool wick_contraction_2 = ((mask & (1 << 3)) >> 3); + constexpr bool wick_contraction_3 = ((mask & (1 << 2)) >> 2); + constexpr bool wick_contraction_4 = ((mask & (1 << 1)) >> 1); + constexpr bool wick_contraction_5 = ((mask & (1 << 0)) >> 0); + + Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4) + + auto gD1a = GammaA_left * GammaA_right * D1; + auto gD1b = GammaA_left * g4 * GammaA_right * D1; + auto pD1 = 0.5* (gD1a + (double)parity * gD1b); + auto gD3 = GammaB_right * D3; + + auto D2g = D2 * GammaB_left; + auto pD1g = pD1 * GammaB_left; + auto gD3g = gD3 * GammaB_left; + + for (int ie_left=0; ie_left < 6 ; ie_left++){ + int a_left = epsilon[ie_left][0]; //a + int b_left = epsilon[ie_left][1]; //b + int c_left = epsilon[ie_left][2]; //c + for (int ie_right=0; ie_right < 6 ; ie_right++){ + int a_right = epsilon[ie_right][0]; //a' + int b_right = epsilon[ie_right][1]; //b' + int c_right = epsilon[ie_right][2]; //c' + double ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right]; + //All parts together + for (int gamma_left=0; gamma_left +template +template +inline void BaryonUtils::BaryonSiteHelper::function(const unsigned int mask, const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int parity, + robj &result) +{ + if (mask == maxMask) + { + baryon_site_template(D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result); + } + else + { + BaryonSiteHelper<(maxMask>0) ? maxMask-1 : 0>::function(mask, D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result); + } +} + +// top-level function +template +template +inline void BaryonUtils::baryon_site_template(const unsigned int mask, const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int parity, + robj &result) +{ + BaryonSiteHelper<63>::function(mask, D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result); +} + + template void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, const PropagatorField &q2_left, @@ -259,6 +593,10 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, const int parity, ComplexField &baryon_corr) { + const std::chrono::system_clock::time_point start{ std::chrono::system_clock::now() }; + std::time_t now = std::chrono::system_clock::to_time_t( start ); + std::cout << "Setup start " << std::ctime( &now ); + std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl; std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; @@ -278,6 +616,16 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, auto v2 = q2_left.View(); auto v3 = q3_left.View(); + const std::chrono::system_clock::time_point stop{ std::chrono::system_clock::now() }; + now = std::chrono::system_clock::to_time_t( stop ); + const std::chrono::duration duration_seconds = stop - start; + const double seconds{ ( duration_seconds.count() ) }; + std::cout << "Setup stop " << std::ctime( &now ) + << "Total duration " << std::fixed << std::setprecision(5) << seconds << " seconds." << std::endl; + + const std::chrono::system_clock::time_point start2{ std::chrono::system_clock::now() }; + now = std::chrono::system_clock::to_time_t( start2 ); + std::cout << "Normal Loop start " << std::ctime( &now ); // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { thread_for(ss,grid->oSites(),{ //for(int ss=0; ss < grid->oSites(); ss++){ @@ -290,6 +638,56 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); vbaryon_corr[ss] = result; } );//end loop over lattice sites + const std::chrono::system_clock::time_point stop2{ std::chrono::system_clock::now() }; + now = std::chrono::system_clock::to_time_t( stop2 ); + const std::chrono::duration duration_seconds2 = stop2 - start2; + const double seconds2{ ( duration_seconds2.count() ) }; + std::cout << "Normal Loop stop " << std::ctime( &now ) + << "Total duration " << std::fixed << std::setprecision(5) << seconds2 << " seconds." << std::endl; + const std::chrono::system_clock::time_point start4{ std::chrono::system_clock::now() }; + now = std::chrono::system_clock::to_time_t( start4 ); + std::cout << "Opt-macro Loop start " << std::ctime( &now ); + // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + thread_for(ss,grid->oSites(),{ + //for(int ss=0; ss < grid->oSites(); ss++){ + + auto D1 = v1[ss]; + auto D2 = v2[ss]; + auto D3 = v3[ss]; + + vobj result=Zero(); + baryon_site_macro(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); + vbaryon_corr[ss] = result; + } );//end loop over lattice sites + const std::chrono::system_clock::time_point stop4{ std::chrono::system_clock::now() }; + now = std::chrono::system_clock::to_time_t( stop4 ); + const std::chrono::duration duration_seconds4 = stop4 - start4; + const double seconds4{ ( duration_seconds4.count() ) }; + std::cout << "Opt-macro Loop stop " << std::ctime( &now ) + << "Total duration " << std::fixed << std::setprecision(5) << seconds4 << " seconds." << std::endl; + const std::chrono::system_clock::time_point start3{ std::chrono::system_clock::now() }; + now = std::chrono::system_clock::to_time_t( start3 ); + int wick_id=32*wick_contraction[0]+16*wick_contraction[1]+8*wick_contraction[2]+4*wick_contraction[3]+2*wick_contraction[4]+wick_contraction[5]; + std::cout << "Opt-template Loop start " << std::ctime( &now ); + // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + thread_for(ss,grid->oSites(),{ + //for(int ss=0; ss < grid->oSites(); ss++){ + + auto D1 = v1[ss]; + auto D2 = v2[ss]; + auto D3 = v3[ss]; + + vobj result=Zero(); + baryon_site_template(wick_id,D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,result); + vbaryon_corr[ss] = result; + } );//end loop over lattice sites + const std::chrono::system_clock::time_point stop3{ std::chrono::system_clock::now() }; + now = std::chrono::system_clock::to_time_t( stop3 ); + const std::chrono::duration duration_seconds3 = stop3 - start3; + const double seconds3{ ( duration_seconds3.count() ) }; + std::cout << "Opt-template Loop stop " << std::ctime( &now ) + << "Total duration " << std::fixed << std::setprecision(5) << seconds3 << " seconds." << std::endl; + } template template @@ -318,7 +716,7 @@ void BaryonUtils::ContractBaryons_Sliced(const mobj &D1, wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0; result=Zero(); - baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); + baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); } /*********************************************************************** From 6240e02619be0b0c5bda84173c0dc6cba62aef84 Mon Sep 17 00:00:00 2001 From: ferben Date: Mon, 27 Apr 2020 18:50:53 +0100 Subject: [PATCH 02/19] added assertion to avoid potential infinite loop --- Grid/qcd/utils/BaryonUtils.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index 18d6f84b..a392f223 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -553,6 +553,7 @@ inline void BaryonUtils::BaryonSiteHelper::function(const unsign const int parity, robj &result) { + assert(mask <= maxMask); if (mask == maxMask) { baryon_site_template(D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result); From dee96cbf8296b9f16dc378c78cdcb74302da77c5 Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Wed, 29 Apr 2020 10:37:11 -0400 Subject: [PATCH 03/19] Added workaround in configure to still catch Cuda compiler when nvcc with extra arguments (eg -ccbin) is used as CXX --- configure.ac | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/configure.ac b/configure.ac index fb69ca0e..46559507 100644 --- a/configure.ac +++ b/configure.ac @@ -274,12 +274,20 @@ case ${ac_gen_scalar} in esac ##################### Compiler dependent choices -case ${CXX} in + +#Strip any optional compiler arguments from nvcc call (eg -ccbin) for compiler comparison +CXXBASE=${CXX} +CXXTEST=${CXX} +if echo "${CXX}" | grep -q "nvcc"; then + CXXTEST="nvcc" +fi + +case ${CXXTEST} in nvcc) # CXX="nvcc -keep -v -x cu " # CXXLD="nvcc -v -link" - CXX="nvcc -x cu " - CXXLD="nvcc -link" + CXX="${CXXBASE} -x cu " + CXXLD="${CXXBASE} -link" # CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing -Xcompiler -Wno-unusable-partial-specialization --expt-extended-lambda --expt-relaxed-constexpr" CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr" if test $ac_openmp = yes; then From 56e2f7d088aa36af4990742822c8cd47363d2391 Mon Sep 17 00:00:00 2001 From: ferben Date: Thu, 7 May 2020 10:03:45 +0100 Subject: [PATCH 04/19] deleted test routines. cleaned up fast version. assert Ns=4,Nc=3. --- Grid/qcd/utils/BaryonUtils.h | 414 ++++------------------------------- 1 file changed, 40 insertions(+), 374 deletions(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index a392f223..241395c3 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -46,44 +46,11 @@ public: typedef typename SpinMatrixField::vector_object sobj; static const int epsilon[6][3] ; - //static const Complex epsilon_sgn[6]; static const double epsilon_sgn[6]; private: template - static void baryon_site(const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const int parity, - const int * wick_contractions, - robj &result); - template - static void baryon_site_macro(const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const int parity, - robj &result); - template - static void baryon_site_macro(const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const int parity, - const int * wick_contractions, - robj &result); - template - static inline void baryon_site_template(unsigned int mask, const mobj &D1, + static inline void baryon_site(unsigned int mask, const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_left, @@ -93,7 +60,7 @@ public: const int parity, robj &result); template - static inline void baryon_site_template(const mobj &D1, + static inline void baryon_site(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_left, @@ -208,266 +175,12 @@ public: template const int BaryonUtils::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}}; -/*template -const Complex BaryonUtils::epsilon_sgn[6] = {Complex(1), - Complex(1), - Complex(1), - Complex(-1), - Complex(-1), - Complex(-1)}; -*/ template const double BaryonUtils::epsilon_sgn[6] = {1.0,1.0,1.0,-1.0,-1.0,-1.0}; -//This is the old version -template -template -void BaryonUtils::baryon_site(const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const int parity, - const int * wick_contraction, - robj &result) -{ - - Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4) - - auto gD1a = GammaA_left * GammaA_right * D1; - auto gD1b = GammaA_left * g4 * GammaA_right * D1; - auto pD1 = 0.5* (gD1a + (double)parity * gD1b); - auto gD3 = GammaB_right * D3; - - for (int ie_left=0; ie_left < 6 ; ie_left++){ - int a_left = epsilon[ie_left][0]; //a - int b_left = epsilon[ie_left][1]; //b - int c_left = epsilon[ie_left][2]; //c - for (int ie_right=0; ie_right < 6 ; ie_right++){ - int a_right = epsilon[ie_right][0]; //a' - int b_right = epsilon[ie_right][1]; //b' - int c_right = epsilon[ie_right][2]; //c' - //complex ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right]; - double ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right]; - //This is the \delta_{456}^{123} part - if (wick_contraction[0]){ - auto D2g = D2 * GammaB_left; - for (int alpha_right=0; alpha_right -template -void BaryonUtils::baryon_site_macro(const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const int parity, - robj &result) -{ - - Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4) - - auto gD1a = GammaA_left * GammaA_right * D1; - auto gD1b = GammaA_left * g4 * GammaA_right * D1; - auto pD1 = 0.5* (gD1a + (double)parity * gD1b); - auto gD3 = GammaB_right * D3; - - auto D2g = D2 * GammaB_left; - auto pD1g = pD1 * GammaB_left; - auto gD3g = gD3 * GammaB_left; - - for (int ie_left=0; ie_left < 6 ; ie_left++){ - int a_left = epsilon[ie_left][0]; //a - int b_left = epsilon[ie_left][1]; //b - int c_left = epsilon[ie_left][2]; //c - for (int ie_right=0; ie_right < 6 ; ie_right++){ - int a_right = epsilon[ie_right][0]; //a' - int b_right = epsilon[ie_right][1]; //b' - int c_right = epsilon[ie_right][2]; //c' - double ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right]; - //All parts together - for (int gamma_left=0; gamma_left( D1, D2, D3, GA_l, GB_l, GA_r, GB_r, parity, result );\ -} - -template -template -void BaryonUtils::baryon_site_macro(const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const int parity, - const int * wick_contraction, - robj &result) -{ -BARYON_SITE( 0 , 0 , 0 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 0 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 0 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 0 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 0 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 0 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 0 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 0 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 1 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 1 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 1 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 1 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 1 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 1 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 1 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 1 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 0 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 0 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 0 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 0 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 0 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 0 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 0 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 0 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 1 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 1 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 1 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 1 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 1 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 1 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 1 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 1 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 0 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 0 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 0 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 0 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 0 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 0 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 0 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 0 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 1 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 1 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 1 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 1 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 1 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 1 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 1 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 1 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 0 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 0 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 0 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 0 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 0 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 0 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 0 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 0 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 1 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 1 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 1 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 1 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 1 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 1 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 1 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 1 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); - -} - - template template -inline void BaryonUtils::baryon_site_template(const mobj &D1, +inline void BaryonUtils::baryon_site(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_left, @@ -556,7 +269,7 @@ inline void BaryonUtils::BaryonSiteHelper::function(const unsign assert(mask <= maxMask); if (mask == maxMask) { - baryon_site_template(D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result); + baryon_site(D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result); } else { @@ -567,7 +280,7 @@ inline void BaryonUtils::BaryonSiteHelper::function(const unsign // top-level function template template -inline void BaryonUtils::baryon_site_template(const unsigned int mask, const mobj &D1, +inline void BaryonUtils::baryon_site(const unsigned int mask, const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_left, @@ -594,40 +307,30 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, const int parity, ComplexField &baryon_corr) { - const std::chrono::system_clock::time_point start{ std::chrono::system_clock::now() }; - std::time_t now = std::chrono::system_clock::to_time_t( start ); - std::cout << "Setup start " << std::ctime( &now ); + + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl; + std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl; std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; - assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); + assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); - GridBase *grid = q1_left.Grid(); + GridBase *grid = q1_left.Grid(); - int wick_contraction[6]; - for (int ie=0; ie < 6 ; ie++) - wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0; + int wick_id; + for (int ie=0; ie < 6 ; ie++) + wick_id = ((quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0) << (5-ie); - auto vbaryon_corr= baryon_corr.View(); - auto v1 = q1_left.View(); - auto v2 = q2_left.View(); - auto v3 = q3_left.View(); + auto vbaryon_corr= baryon_corr.View(); + auto v1 = q1_left.View(); + auto v2 = q2_left.View(); + auto v3 = q3_left.View(); - const std::chrono::system_clock::time_point stop{ std::chrono::system_clock::now() }; - now = std::chrono::system_clock::to_time_t( stop ); - const std::chrono::duration duration_seconds = stop - start; - const double seconds{ ( duration_seconds.count() ) }; - std::cout << "Setup stop " << std::ctime( &now ) - << "Total duration " << std::fixed << std::setprecision(5) << seconds << " seconds." << std::endl; - - const std::chrono::system_clock::time_point start2{ std::chrono::system_clock::now() }; - now = std::chrono::system_clock::to_time_t( start2 ); - std::cout << "Normal Loop start " << std::ctime( &now ); - // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { thread_for(ss,grid->oSites(),{ //for(int ss=0; ss < grid->oSites(); ss++){ @@ -636,58 +339,9 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, auto D3 = v3[ss]; vobj result=Zero(); - baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); + baryon_site(wick_id,D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,result); vbaryon_corr[ss] = result; } );//end loop over lattice sites - const std::chrono::system_clock::time_point stop2{ std::chrono::system_clock::now() }; - now = std::chrono::system_clock::to_time_t( stop2 ); - const std::chrono::duration duration_seconds2 = stop2 - start2; - const double seconds2{ ( duration_seconds2.count() ) }; - std::cout << "Normal Loop stop " << std::ctime( &now ) - << "Total duration " << std::fixed << std::setprecision(5) << seconds2 << " seconds." << std::endl; - const std::chrono::system_clock::time_point start4{ std::chrono::system_clock::now() }; - now = std::chrono::system_clock::to_time_t( start4 ); - std::cout << "Opt-macro Loop start " << std::ctime( &now ); - // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - thread_for(ss,grid->oSites(),{ - //for(int ss=0; ss < grid->oSites(); ss++){ - - auto D1 = v1[ss]; - auto D2 = v2[ss]; - auto D3 = v3[ss]; - - vobj result=Zero(); - baryon_site_macro(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); - vbaryon_corr[ss] = result; - } );//end loop over lattice sites - const std::chrono::system_clock::time_point stop4{ std::chrono::system_clock::now() }; - now = std::chrono::system_clock::to_time_t( stop4 ); - const std::chrono::duration duration_seconds4 = stop4 - start4; - const double seconds4{ ( duration_seconds4.count() ) }; - std::cout << "Opt-macro Loop stop " << std::ctime( &now ) - << "Total duration " << std::fixed << std::setprecision(5) << seconds4 << " seconds." << std::endl; - const std::chrono::system_clock::time_point start3{ std::chrono::system_clock::now() }; - now = std::chrono::system_clock::to_time_t( start3 ); - int wick_id=32*wick_contraction[0]+16*wick_contraction[1]+8*wick_contraction[2]+4*wick_contraction[3]+2*wick_contraction[4]+wick_contraction[5]; - std::cout << "Opt-template Loop start " << std::ctime( &now ); - // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - thread_for(ss,grid->oSites(),{ - //for(int ss=0; ss < grid->oSites(); ss++){ - - auto D1 = v1[ss]; - auto D2 = v2[ss]; - auto D3 = v3[ss]; - - vobj result=Zero(); - baryon_site_template(wick_id,D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,result); - vbaryon_corr[ss] = result; - } );//end loop over lattice sites - const std::chrono::system_clock::time_point stop3{ std::chrono::system_clock::now() }; - now = std::chrono::system_clock::to_time_t( stop3 ); - const std::chrono::duration duration_seconds3 = stop3 - start3; - const double seconds3{ ( duration_seconds3.count() ) }; - std::cout << "Opt-template Loop stop " << std::ctime( &now ) - << "Total duration " << std::fixed << std::setprecision(5) << seconds3 << " seconds." << std::endl; } template @@ -704,20 +358,24 @@ void BaryonUtils::ContractBaryons_Sliced(const mobj &D1, const int parity, robj &result) { - std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl; + + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); + + std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl; std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; - assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); + assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); - int wick_contraction[6]; - for (int ie=0; ie < 6 ; ie++) - wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0; - - result=Zero(); - baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); + int wick_id; + for (int ie=0; ie < 6 ; ie++) + wick_id = ((quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0) << (5-ie); + + result=Zero(); + baryon_site(wick_id,D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,result); } /*********************************************************************** @@ -957,6 +615,10 @@ void BaryonUtils::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, const std::string op, SpinMatrixField &stn_corr) { + + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); + GridBase *grid = qs_ti.Grid(); auto vcorr= stn_corr.View(); @@ -994,6 +656,10 @@ void BaryonUtils::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti, const std::string op, SpinMatrixField &stn_corr) { + + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); + GridBase *grid = qs_ti.Grid(); auto vcorr= stn_corr.View(); From 253bcc3426a212675cc497147c6142c6700102ee Mon Sep 17 00:00:00 2001 From: ferben Date: Thu, 7 May 2020 18:03:17 +0100 Subject: [PATCH 05/19] back to old version --- Grid/qcd/utils/BaryonUtils.h | 249 +++++++++++++++-------------------- 1 file changed, 105 insertions(+), 144 deletions(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index 241395c3..6cf526c3 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -46,44 +46,20 @@ public: typedef typename SpinMatrixField::vector_object sobj; static const int epsilon[6][3] ; - static const double epsilon_sgn[6]; + static const Complex epsilon_sgn[6]; private: template - static inline void baryon_site(unsigned int mask, const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const int parity, - robj &result); - template - static inline void baryon_site(const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const int parity, - robj &result); - - template - struct BaryonSiteHelper - { - template - static inline void function(const unsigned int mask, const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const int parity, - robj &result); - }; + static void baryon_site(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int parity, + const int * wick_contractions, + robj &result); public: static void ContractBaryons(const PropagatorField &q1_left, const PropagatorField &q2_left, @@ -176,11 +152,17 @@ public: template const int BaryonUtils::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}}; template -const double BaryonUtils::epsilon_sgn[6] = {1.0,1.0,1.0,-1.0,-1.0,-1.0}; +const Complex BaryonUtils::epsilon_sgn[6] = {Complex(1), + Complex(1), + Complex(1), + Complex(-1), + Complex(-1), + Complex(-1)}; +//This is the old version template -template -inline void BaryonUtils::baryon_site(const mobj &D1, +template +void BaryonUtils::baryon_site(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_left, @@ -188,14 +170,9 @@ inline void BaryonUtils::baryon_site(const mobj &D1, const Gamma GammaA_right, const Gamma GammaB_right, const int parity, + const int * wick_contraction, robj &result) { - constexpr bool wick_contraction_0 = ((mask & (1 << 5)) >> 5); - constexpr bool wick_contraction_1 = ((mask & (1 << 4)) >> 4); - constexpr bool wick_contraction_2 = ((mask & (1 << 3)) >> 3); - constexpr bool wick_contraction_3 = ((mask & (1 << 2)) >> 2); - constexpr bool wick_contraction_4 = ((mask & (1 << 1)) >> 1); - constexpr bool wick_contraction_5 = ((mask & (1 << 0)) >> 0); Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4) @@ -216,84 +193,77 @@ inline void BaryonUtils::baryon_site(const mobj &D1, int a_right = epsilon[ie_right][0]; //a' int b_right = epsilon[ie_right][1]; //b' int c_right = epsilon[ie_right][2]; //c' - double ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right]; - //All parts together + Complex ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right]; + //This is the \delta_{456}^{123} part + if (wick_contraction[0]){ for (int gamma_left=0; gamma_left -template -template -inline void BaryonUtils::BaryonSiteHelper::function(const unsigned int mask, const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const int parity, - robj &result) -{ - assert(mask <= maxMask); - if (mask == maxMask) - { - baryon_site(D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result); - } - else - { - BaryonSiteHelper<(maxMask>0) ? maxMask-1 : 0>::function(mask, D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result); - } -} - -// top-level function -template -template -inline void BaryonUtils::baryon_site(const unsigned int mask, const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const int parity, - robj &result) -{ - BaryonSiteHelper<63>::function(mask, D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result); -} - - template void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, const PropagatorField &q2_left, @@ -307,30 +277,30 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, const int parity, ComplexField &baryon_corr) { - - assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); - assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl; + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); + + std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl; std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; - assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); + assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); - GridBase *grid = q1_left.Grid(); + GridBase *grid = q1_left.Grid(); - int wick_id; - for (int ie=0; ie < 6 ; ie++) - wick_id = ((quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0) << (5-ie); + int wick_contraction[6]; + for (int ie=0; ie < 6 ; ie++) + wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0; - auto vbaryon_corr= baryon_corr.View(); - auto v1 = q1_left.View(); - auto v2 = q2_left.View(); - auto v3 = q3_left.View(); + auto vbaryon_corr= baryon_corr.View(); + auto v1 = q1_left.View(); + auto v2 = q2_left.View(); + auto v3 = q3_left.View(); - // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { thread_for(ss,grid->oSites(),{ //for(int ss=0; ss < grid->oSites(); ss++){ @@ -339,10 +309,9 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, auto D3 = v3[ss]; vobj result=Zero(); - baryon_site(wick_id,D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,result); + baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); vbaryon_corr[ss] = result; } );//end loop over lattice sites - } template template @@ -359,23 +328,23 @@ void BaryonUtils::ContractBaryons_Sliced(const mobj &D1, robj &result) { - assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); - assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - - std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl; + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); + + std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl; std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; - assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); + assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); - int wick_id; - for (int ie=0; ie < 6 ; ie++) - wick_id = ((quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0) << (5-ie); - - result=Zero(); - baryon_site(wick_id,D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,result); + int wick_contraction[6]; + for (int ie=0; ie < 6 ; ie++) + wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0; + + result=Zero(); + baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); } /*********************************************************************** @@ -615,10 +584,6 @@ void BaryonUtils::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, const std::string op, SpinMatrixField &stn_corr) { - - assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); - assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - GridBase *grid = qs_ti.Grid(); auto vcorr= stn_corr.View(); @@ -656,10 +621,6 @@ void BaryonUtils::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti, const std::string op, SpinMatrixField &stn_corr) { - - assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); - assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - GridBase *grid = qs_ti.Grid(); auto vcorr= stn_corr.View(); From 42bb5f0721de5cfdf0d16c6eda66a9fd9f4d13c6 Mon Sep 17 00:00:00 2001 From: ferben Date: Thu, 7 May 2020 18:06:12 +0100 Subject: [PATCH 06/19] asserrtion --- Grid/qcd/utils/BaryonUtils.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index 6cf526c3..fa2f3376 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -584,6 +584,10 @@ void BaryonUtils::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, const std::string op, SpinMatrixField &stn_corr) { + + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); + GridBase *grid = qs_ti.Grid(); auto vcorr= stn_corr.View(); @@ -621,6 +625,10 @@ void BaryonUtils::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti, const std::string op, SpinMatrixField &stn_corr) { + + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); + GridBase *grid = qs_ti.Grid(); auto vcorr= stn_corr.View(); From 21ca182c368ba2415d874c744b8def244c0b37dd Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 8 May 2020 09:18:24 -0400 Subject: [PATCH 07/19] Comments remove --- Grid/algorithms/LinearOperator.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/Grid/algorithms/LinearOperator.h b/Grid/algorithms/LinearOperator.h index 50600d2d..a7fa1a90 100644 --- a/Grid/algorithms/LinearOperator.h +++ b/Grid/algorithms/LinearOperator.h @@ -257,13 +257,11 @@ public: virtual RealD Mpc (const Field &in, Field &out) { Field tmp(in.Grid()); tmp.Checkerboard() = !in.Checkerboard(); - //std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << " _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl; _Mat.Meooe(in,tmp); _Mat.MooeeInv(tmp,out); _Mat.Meooe(out,tmp); - //std::cout << "cb in " << in.Checkerboard() << " cb out " << out.Checkerboard() << std::endl; _Mat.Mooee(in,out); return axpy_norm(out,-1.0,tmp,out); } From 6859a3e1d4cf2487b6d2f7f560f845df4fd5a7af Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 8 May 2020 09:19:12 -0400 Subject: [PATCH 08/19] Schur operator --- benchmarks/Benchmark_schur.cc | 176 ++++++++++++++++++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100644 benchmarks/Benchmark_schur.cc diff --git a/benchmarks/Benchmark_schur.cc b/benchmarks/Benchmark_schur.cc new file mode 100644 index 00000000..afee31b0 --- /dev/null +++ b/benchmarks/Benchmark_schur.cc @@ -0,0 +1,176 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./benchmarks/Benchmark_dwf.cc + + Copyright (C) 2015 + +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + +void benchDw(std::vector & L, int Ls); + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + + const int Ls=12; + std::vector< std::vector > latts; +#if 0 + latts.push_back(std::vector ({24,24,24,24}) ); + latts.push_back(std::vector ({48,24,24,24}) ); + latts.push_back(std::vector ({96,24,24,24}) ); + latts.push_back(std::vector ({96,48,24,24}) ); + // latts.push_back(std::vector ({96,48,48,24}) ); + // latts.push_back(std::vector ({96,48,48,48}) ); +#else + // latts.push_back(std::vector ({96,48,48,48}) ); + latts.push_back(std::vector ({96,96,96,192}) ); +#endif + + std::cout << GridLogMessage<< "*****************************************************************" < latt4 = latts[l]; + std::cout << GridLogMessage <<"\t"; + for(int d=0;d & latt4, int Ls) +{ + ///////////////////////////////////////////////////////////////////////////////////// + // for Nc=3 + ///////////////////////////////////////////////////////////////////////////////////// + // Dw : Ls*24*(7+48)= Ls*1320 + // + // M5D: Ls*(4*2*Nc mul + 4*2*Nc madd ) = 3*4*2*Nc*Ls = Ls*72 + // Meo: Ls*24*(7+48) + Ls*72 = Ls*1392 + // + // Mee: 3*Ns*2*Nc*Ls // Chroma 6*N5*Nc*Ns + // + // LeemInv : 2*2*Nc*madd*Ls + // LeeInv : 2*2*Nc*madd*Ls + // DeeInv : 4*2*Nc*mul *Ls + // UeeInv : 2*2*Nc*madd*Ls + // UeemInv : 2*2*Nc*madd*Ls = Nc*Ls*(8+8+8+8+8) = 40*Nc*Ls// Chroma (10*N5 - 8)*Nc*Ns ~ (40 N5 - 32)Nc flops + // QUDA counts as dense LsxLs real matrix x Ls x NcNsNreim => Nc*4*2 x Ls^2 FMA = 16Nc Ls^2 flops + // Mpc => 1452*cbvol*2*Ls flops // + // => (1344+Ls*48)*Ls*cbvol*2 flops QUDA = 1920 @Ls=12 and 2112 @Ls=16 + ///////////////////////////////////////////////////////////////////////////////////// + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + // long unsigned int single_site_flops = 8*Nc*(7+16*Nc)*Ls; + long unsigned int single_site_mpc_flops = 8*Nc*(7+16*Nc)*2*Ls + 40*Nc*2*Ls + 4*Nc*2*Ls; + long unsigned int single_site_quda_flops = 8*Nc*(7+16*Nc)*2*Ls + 16*Nc*Ls*Ls + 4*Nc*2*Ls; + std::vector seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + + + ColourMatrixF cm = ComplexF(1.0,0.0); + + int ncall=300; + RealD mass=0.1; + RealD M5 =1.8; + RealD NP = UGrid->_Nprocessors; + double volume=1; for(int mu=0;mu Mpc(Dw); + Chebyshev Cheby(0.0,60.0,order); + + { + Mpc.Mpc(src_o,r_o); + Mpc.Mpc(src_o,r_o); + Mpc.Mpc(src_o,r_o); + + double t0=usecond(); + for(int i=0;i Date: Fri, 8 May 2020 09:19:54 -0400 Subject: [PATCH 09/19] Remove verbose --- Grid/qcd/action/fermion/MobiusFermion.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/qcd/action/fermion/MobiusFermion.h b/Grid/qcd/action/fermion/MobiusFermion.h index 1cbb6609..1e948092 100644 --- a/Grid/qcd/action/fermion/MobiusFermion.h +++ b/Grid/qcd/action/fermion/MobiusFermion.h @@ -59,7 +59,7 @@ public: { RealD eps = 1.0; - std::cout<Ls);// eps is ignored for higham assert(zdata->n==this->Ls); From 1d65e2f62ccb3555e6fec9413960bd7b8f46b84d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 8 May 2020 09:20:54 -0400 Subject: [PATCH 10/19] Slightly faster Chebyshev; ifdef'ed out the fastest until tested numerics Lifteed from HDCR setup --- Grid/algorithms/approx/Chebyshev.h | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/Grid/algorithms/approx/Chebyshev.h b/Grid/algorithms/approx/Chebyshev.h index 133db2b4..c0b0646d 100644 --- a/Grid/algorithms/approx/Chebyshev.h +++ b/Grid/algorithms/approx/Chebyshev.h @@ -234,10 +234,9 @@ public: GridBase *grid=in.Grid(); - // std::cout << "Chevyshef(): in.Grid()="<({45,12,81,9})); - for(int lat=8;lat<=lmax;lat+=4){ + for(int lat=8;lat<=lmax;lat+=8){ Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + GridCartesian Grid(latt_size,simd_layout,mpi_layout); // NP= Grid.RankCount(); @@ -270,191 +265,8 @@ public: } }; -#if 0 - static double DWF5(int Ls,int L) - { - // RealD mass=0.1; - RealD M5 =1.8; - double mflops; - double mflops_best = 0; - double mflops_worst= 0; - std::vector mflops_all; - - /////////////////////////////////////////////////////// - // Set/Get the layout & grid size - /////////////////////////////////////////////////////// - int threads = GridThread::GetThreads(); - Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4); - Coordinate local({L,L,L,L}); - - GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(Coordinate({64,64,64,64}), - GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); - uint64_t NP = TmpGrid->RankCount(); - uint64_t NN = TmpGrid->NodeCount(); - NN_global=NN; - uint64_t SHM=NP/NN; - - Coordinate internal; - if ( SHM == 1 ) internal = Coordinate({1,1,1,1}); - else if ( SHM == 2 ) internal = Coordinate({2,1,1,1}); - else if ( SHM == 4 ) internal = Coordinate({2,2,1,1}); - else if ( SHM == 8 ) internal = Coordinate({2,2,2,1}); - else assert(0); - - Coordinate nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]}); - Coordinate latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]}); - - ///////// Welcome message //////////// - std::cout< seeds4({1,2,3,4}); - std::vector seeds5({5,6,7,8}); - GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - GridParallelRNG RNG5(sFGrid); RNG5.SeedFixedIntegers(seeds5); - std::cout << GridLogMessage << "Initialised RNGs" << std::endl; - - ///////// Source preparation //////////// - LatticeFermion src (sFGrid); - LatticeFermion tmp (sFGrid); - std::cout << GridLogMessage << "allocated src and tmp" << std::endl; - random(RNG5,src); - std::cout << GridLogMessage << "intialised random source" << std::endl; - - RealD N2 = 1.0/::sqrt(norm2(src)); - src = src*N2; - - LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); - - WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5); - LatticeFermion src_e (sFrbGrid); - LatticeFermion src_o (sFrbGrid); - LatticeFermion r_e (sFrbGrid); - LatticeFermion r_o (sFrbGrid); - LatticeFermion r_eo (sFGrid); - LatticeFermion err (sFGrid); - { - - pickCheckerboard(Even,src_e,src); - pickCheckerboard(Odd,src_o,src); - -#if defined(AVX512) - const int num_cases = 6; - std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O "); -#else - const int num_cases = 4; - std::string fmt("U/S ; U/O ; G/S ; G/O "); -#endif - controls Cases [] = { -#ifdef AVX512 - { WilsonKernelsStatic::OptInlineAsm , WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, - { WilsonKernelsStatic::OptInlineAsm , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }, -#endif - { WilsonKernelsStatic::OptHandUnroll, WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, - { WilsonKernelsStatic::OptHandUnroll, WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }, - { WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, - { WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential } - }; - - for(int c=0;cBarrier(); - for(int i=0;iBarrier(); - double t1=usecond(); - - sDw.ZeroCounters(); - time_statistics timestat; - std::vector t_time(ncall); - for(uint64_t i=0;iBarrier(); - - double volume=Ls; for(int mu=0;mumflops_best ) mflops_best = mflops; - if ( mflopsRankCount(); uint64_t NN = TmpGrid->NodeCount(); NN_global=NN; uint64_t SHM=NP/NN; - Coordinate internal; - if ( SHM == 1 ) internal = Coordinate({1,1,1,1}); - else if ( SHM == 2 ) internal = Coordinate({2,1,1,1}); - else if ( SHM == 4 ) internal = Coordinate({2,2,1,1}); - else if ( SHM == 8 ) internal = Coordinate({2,2,2,1}); - else assert(0); - - Coordinate nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]}); - Coordinate latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]}); + Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); ///////// Welcome message //////////// std::cout< U(4,FGrid); - auto Umu_v = Umu.View(); - auto Umu5d_v = Umu5d.View(); - for(int ss=0;ssoSites();ss++){ - for(int s=0;s(Umu5d,mu); - } - for(int mu=0;muBarrier(); for(int i=0;iBarrier(); double t1=usecond(); - // uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0); - // if (ncall < 500) ncall = 500; - uint64_t ncall = 1000; + uint64_t ncall = 50; FGrid->Broadcast(0,&ncall,sizeof(ncall)); @@ -649,24 +406,11 @@ public: std::cout< seeds4({1,2,3,4}); + GridParallelRNG RNG4(FGrid); RNG4.SeedFixedIntegers(seeds4); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + RealD mass=0.1; + RealD c1=9.0/8.0; + RealD c2=-1.0/24.0; + RealD u0=1.0; + + typedef ImprovedStaggeredFermionF Action; + typedef typename Action::FermionField Fermion; + typedef LatticeGaugeFieldF Gauge; + + Gauge Umu(FGrid); SU3::HotConfiguration(RNG4,Umu); + + typename Action::ImplParams params; + Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params); + + ///////// Source preparation //////////// + Fermion src (FGrid); random(RNG4,src); + Fermion src_e (FrbGrid); + Fermion src_o (FrbGrid); + Fermion r_e (FrbGrid); + Fermion r_o (FrbGrid); + Fermion r_eo (FGrid); + + { + + pickCheckerboard(Even,src_e,src); + pickCheckerboard(Odd,src_o,src); + + const int num_cases = 4; + std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S "); + + controls Cases [] = { + { StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }, + { StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }, + { StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, + { StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential } + }; + + for(int c=0;cBarrier(); + for(int i=0;iBarrier(); + double t1=usecond(); + uint64_t ncall = 500; + + FGrid->Broadcast(0,&ncall,sizeof(ncall)); + + // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); + for(uint64_t i=0;iBarrier(); + + double volume=1; for(int mu=0;mumflops_best ) mflops_best = mflops; + if ( mflops L_list({16,24,32}); int selm1=sel-1; - std::vector robust_list; std::vector wilson; std::vector dwf4; - std::vector dwf5; + std::vector staggered; - if ( do_wilson ) { - int Ls=1; - std::cout< Date: Sat, 9 May 2020 22:27:56 -0400 Subject: [PATCH 17/19] Split allocator cache into two pools of different sizes --- Grid/allocator/AlignedAllocator.cc | 69 +++++++++++++++--------------- Grid/allocator/AlignedAllocator.h | 17 ++++---- 2 files changed, 44 insertions(+), 42 deletions(-) diff --git a/Grid/allocator/AlignedAllocator.cc b/Grid/allocator/AlignedAllocator.cc index d53c4dc2..77646410 100644 --- a/Grid/allocator/AlignedAllocator.cc +++ b/Grid/allocator/AlignedAllocator.cc @@ -6,21 +6,19 @@ NAMESPACE_BEGIN(Grid); MemoryStats *MemoryProfiler::stats = nullptr; bool MemoryProfiler::debug = false; -#ifdef GRID_NVCC -#define SMALL_LIMIT (0) -#else -#define SMALL_LIMIT (4096) -#endif - -#ifdef POINTER_CACHE -int PointerCache::victim; - +int PointerCache::Victim; +int PointerCache::VictimSmall; PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache]; +PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheSmall]; -void *PointerCache::Insert(void *ptr,size_t bytes) { - - if (bytes < SMALL_LIMIT ) return ptr; - +void *PointerCache::Insert(void *ptr,size_t bytes) +{ + if (bytes < GRID_ALLOC_SMALL_LIMIT ) + return Insert(ptr,bytes,EntriesSmall,NcacheSmall,VictimSmall); + return Insert(ptr,bytes,Entries,Ncache,Victim); +} +void *PointerCache::Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) +{ #ifdef GRID_OMP assert(omp_in_parallel()==0); #endif @@ -28,8 +26,8 @@ void *PointerCache::Insert(void *ptr,size_t bytes) { void * ret = NULL; int v = -1; - for(int e=0;e #define POINTER_CACHE #define GRID_ALLOC_ALIGN (2*1024*1024) +#define GRID_ALLOC_SMALL_LIMIT (4096) NAMESPACE_BEGIN(Grid); // Move control to configure.ac and Config.h? -#ifdef POINTER_CACHE + class PointerCache { private: /*Pinning pages is costly*/ /*Could maintain separate large and small allocation caches*/ -#ifdef GRID_NVCC - static const int Ncache=128; -#else +/* Could make these configurable, perhaps up to a max size*/ + static const int NcacheSmall=128; static const int Ncache=8; -#endif - static int victim; typedef struct { void *address; @@ -65,14 +63,17 @@ private: } PointerCacheEntry; static PointerCacheEntry Entries[Ncache]; + static int Victim; + static PointerCacheEntry EntriesSmall[NcacheSmall]; + static int VictimSmall; public: static void *Insert(void *ptr,size_t bytes) ; + static void *Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) ; static void *Lookup(size_t bytes) ; - + static void *Lookup(size_t bytes,PointerCacheEntry *entries,int ncache) ; }; -#endif std::string sizeString(size_t bytes); From 2bb2c68e15572bcc6012bfe2694bdca10948463f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 9 May 2020 22:57:21 -0400 Subject: [PATCH 18/19] Separate pools for small and large allocations cache --- Grid/allocator/AlignedAllocator.cc | 16 ++++++++++++++-- Grid/allocator/AlignedAllocator.h | 12 +++++++----- .../CayleyFermion5DImplementation.h | 3 ++- Grid/util/Init.cc | 2 ++ 4 files changed, 25 insertions(+), 8 deletions(-) diff --git a/Grid/allocator/AlignedAllocator.cc b/Grid/allocator/AlignedAllocator.cc index 77646410..976dfbdc 100644 --- a/Grid/allocator/AlignedAllocator.cc +++ b/Grid/allocator/AlignedAllocator.cc @@ -6,11 +6,23 @@ NAMESPACE_BEGIN(Grid); MemoryStats *MemoryProfiler::stats = nullptr; bool MemoryProfiler::debug = false; +int PointerCache::NcacheSmall = PointerCache::NcacheSmallMax; +int PointerCache::Ncache = PointerCache::NcacheMax; int PointerCache::Victim; int PointerCache::VictimSmall; -PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache]; -PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheSmall]; +PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::NcacheMax]; +PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheSmallMax]; +void PointerCache::Init(void) +{ + char * str; + str= getenv("GRID_ALLOC_NCACHE_LARGE"); + if ( str ) Ncache = atoi(str); + if ( (Ncache<0) || (Ncache > NcacheMax)) Ncache = NcacheMax; + str= getenv("GRID_ALLOC_NCACHE_SMALL"); + if ( str ) NcacheSmall = atoi(str); + if ( (NcacheSmall<0) || (NcacheSmall > NcacheSmallMax)) NcacheSmall = NcacheSmallMax; +} void *PointerCache::Insert(void *ptr,size_t bytes) { if (bytes < GRID_ALLOC_SMALL_LIMIT ) diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index d6e2e073..77167299 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -53,8 +53,10 @@ private: /*Pinning pages is costly*/ /*Could maintain separate large and small allocation caches*/ /* Could make these configurable, perhaps up to a max size*/ - static const int NcacheSmall=128; - static const int Ncache=8; + static const int NcacheSmallMax=128; + static const int NcacheMax=16; + static int NcacheSmall; + static int Ncache; typedef struct { void *address; @@ -62,13 +64,13 @@ private: int valid; } PointerCacheEntry; - static PointerCacheEntry Entries[Ncache]; + static PointerCacheEntry Entries[NcacheMax]; static int Victim; - static PointerCacheEntry EntriesSmall[NcacheSmall]; + static PointerCacheEntry EntriesSmall[NcacheSmallMax]; static int VictimSmall; public: - + static void Init(void); static void *Insert(void *ptr,size_t bytes) ; static void *Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) ; static void *Lookup(size_t bytes) ; diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h index c80d2425..e379026c 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h @@ -779,9 +779,9 @@ void CayleyFermion5D::SeqConservedCurrent(PropagatorField &q_in, assert(mu>=0); assert(mu::SeqConservedCurrent(PropagatorField &q_in, #endif #ifndef GRID_NVCC + int tshift = (mu == Nd-1) ? 1 : 0; //////////////////////////////////////////////// // GENERAL CAYLEY CASE //////////////////////////////////////////////// diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 570f4234..1b672141 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -355,6 +355,8 @@ void Grid_init(int *argc,char ***argv) ////////////////////////////////////////////////////////// GridGpuInit(); // Must come first to set device prior to MPI init + PointerCache::Init(); + if( GridCmdOptionExists(*argv,*argv+*argc,"--shm") ){ int MB; arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm"); From ea08f193e7bdd7fcb8d18a8713f0f5387def9b2f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sun, 10 May 2020 05:24:26 -0400 Subject: [PATCH 19/19] Allocator cache spliit into large/small pools --- Grid/algorithms/approx/Chebyshev.h | 2 +- Grid/allocator/AlignedAllocator.cc | 10 +++++++++- Grid/communicator/SharedMemory.cc | 4 +++- benchmarks/Benchmark_schur.cc | 4 ++-- 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/Grid/algorithms/approx/Chebyshev.h b/Grid/algorithms/approx/Chebyshev.h index c0b0646d..584ed1d5 100644 --- a/Grid/algorithms/approx/Chebyshev.h +++ b/Grid/algorithms/approx/Chebyshev.h @@ -236,7 +236,6 @@ public: int vol=grid->gSites(); typedef typename Field::vector_type vector_type; - constexpr int Nsimd = vector_type::Nsimd(); Field T0(grid); T0 = in; Field T1(grid); @@ -264,6 +263,7 @@ public: auto Tn_v = Tn->View(); auto Tnp_v = Tnp->View(); auto Tnm_v = Tnm->View(); + constexpr int Nsimd = vector_type::Nsimd(); accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, { coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss)); diff --git a/Grid/allocator/AlignedAllocator.cc b/Grid/allocator/AlignedAllocator.cc index 976dfbdc..ef6459ed 100644 --- a/Grid/allocator/AlignedAllocator.cc +++ b/Grid/allocator/AlignedAllocator.cc @@ -7,7 +7,11 @@ MemoryStats *MemoryProfiler::stats = nullptr; bool MemoryProfiler::debug = false; int PointerCache::NcacheSmall = PointerCache::NcacheSmallMax; -int PointerCache::Ncache = PointerCache::NcacheMax; +#ifdef GRID_CUDA +int PointerCache::Ncache = 32; +#else +int PointerCache::Ncache = 8; +#endif int PointerCache::Victim; int PointerCache::VictimSmall; PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::NcacheMax]; @@ -16,12 +20,16 @@ PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheS void PointerCache::Init(void) { char * str; + str= getenv("GRID_ALLOC_NCACHE_LARGE"); if ( str ) Ncache = atoi(str); if ( (Ncache<0) || (Ncache > NcacheMax)) Ncache = NcacheMax; + str= getenv("GRID_ALLOC_NCACHE_SMALL"); if ( str ) NcacheSmall = atoi(str); if ( (NcacheSmall<0) || (NcacheSmall > NcacheSmallMax)) NcacheSmall = NcacheSmallMax; + + // printf("Aligned alloocator cache: large %d/%d small %d/%d\n",Ncache,NcacheMax,NcacheSmall,NcacheSmallMax); } void *PointerCache::Insert(void *ptr,size_t bytes) { diff --git a/Grid/communicator/SharedMemory.cc b/Grid/communicator/SharedMemory.cc index 5bca9764..de10da3d 100644 --- a/Grid/communicator/SharedMemory.cc +++ b/Grid/communicator/SharedMemory.cc @@ -74,7 +74,9 @@ void *SharedMemory::ShmBufferMalloc(size_t bytes){ if (heap_bytes >= heap_size) { std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm flag" < > latts; -#if 0 +#if 1 latts.push_back(std::vector ({24,24,24,24}) ); latts.push_back(std::vector ({48,24,24,24}) ); latts.push_back(std::vector ({96,24,24,24}) ); @@ -157,7 +157,7 @@ void benchDw(std::vector & latt4, int Ls) std::cout <<"\t"<