From b4c624ece68f3bac2688e8d77112e09f481966ae Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 3 Apr 2020 15:43:23 +0200 Subject: [PATCH 001/147] added A64FX support --- Grid/simd/Grid_a64fx-2.h | 977 ++++++++++++++++++++++++++++++++++ Grid/simd/Grid_vector_types.h | 71 ++- 2 files changed, 1026 insertions(+), 22 deletions(-) create mode 100644 Grid/simd/Grid_a64fx-2.h diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h new file mode 100644 index 00000000..1bb67179 --- /dev/null +++ b/Grid/simd/Grid_a64fx-2.h @@ -0,0 +1,977 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/simd/Grid_a64fx-1.h + + Copyright (C) 2020 + +Author: Nils Meyer + + Copyright (C) 2015 + Copyright (C) 2017 + +Author: Antonin Portelli + Andrew Lawson + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ + +///////////////////////////////////////////////////// +// Using SVE ACLE +///////////////////////////////////////////////////// + +static_assert(GEN_SIMD_WIDTH % 64u == 0, "A64FX SIMD vector size is 64 bytes"); + +#ifdef __ARM_FEATURE_SVE + #ifdef __clang__ + //#pragma message("Using clang compiler") + #include + #endif +#else + #pragma error "Missing SVE feature" +#endif /* __ARM_FEATURE_SVE */ + +namespace Grid { +namespace Optimization { + + // type traits giving the number of elements for each vector type + template struct W; + template <> struct W { + constexpr static unsigned int c = GEN_SIMD_WIDTH/16u; + constexpr static unsigned int r = GEN_SIMD_WIDTH/8u; + }; + template <> struct W { + constexpr static unsigned int c = GEN_SIMD_WIDTH/8u; + constexpr static unsigned int r = GEN_SIMD_WIDTH/4u; + }; + template <> struct W { + constexpr static unsigned int r = GEN_SIMD_WIDTH/4u; + }; + template <> struct W { + constexpr static unsigned int c = GEN_SIMD_WIDTH/4u; + constexpr static unsigned int r = GEN_SIMD_WIDTH/2u; + }; + template <> struct W { + constexpr static unsigned int c = GEN_SIMD_WIDTH/16u; + constexpr static unsigned int r = GEN_SIMD_WIDTH/8u; + }; + + // SIMD vector types + template + struct vec { + alignas(GEN_SIMD_WIDTH) T v[W::r]; + }; + + typedef vec vecf; + typedef vec vecd; + typedef vec vech; // half precision comms + typedef vec veci; + +}} // Grid::Optimization + + +// low-level API +namespace Grid { +namespace Optimization { + +template +struct acle{}; + +template <> +struct acle{ + typedef svfloat64_t vt; + typedef svfloat64x2_t vt2; + typedef svfloat64x4_t vt4; + typedef float64_t pt; + typedef uint64_t uint; + typedef svuint64_t svuint; + + static inline svbool_t pg1(){return svptrue_b64();} + static inline svbool_t 
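+  // pg1() builds an all-true predicate over the full vector; pg2() and
+  // pg4() enable only the first half resp. quarter of the 64-bit lanes
+  // (SV_VL4 -> 4 doubles, SV_VL2 -> 2 doubles on the 512-bit A64FX register)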
pg2(){return svptrue_pat_b64(SV_VL4);} + static inline svbool_t pg4(){return svptrue_pat_b64(SV_VL2);} + static inline vec tbl_swap(){ + const vec t = {1, 0, 3, 2, 5, 4, 7, 6}; + return t; + } + static inline vec tbl0(){ + const vec t = {4, 5, 6, 7, 0, 1, 2, 3}; + return t; + } + static inline vec tbl1(){ + const vec t = {2, 3, 0, 1, 6, 7, 4, 5}; + return t; + } + static inline svbool_t pg_even(){return svzip1_b64(svptrue_b64(), svpfalse_b());} + static inline svbool_t pg_odd() {return svzip1_b64(svpfalse_b(), svptrue_b64());} + static inline svfloat64_t zero(){return svdup_f64(0.);} +}; + +template <> +struct acle{ + typedef svfloat32_t vt; + typedef svfloat32x2_t vt2; + typedef float32_t pt; + typedef uint32_t uint; + typedef svuint32_t svuint; + + static inline svbool_t pg1(){return svptrue_b32();} + static inline svbool_t pg2(){return svptrue_pat_b32(SV_VL8);} + // exchange neighboring elements + static inline vec tbl_swap(){ + const vec t = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + return t; + } + static inline vec tbl0(){ + const vec t = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}; + return t; + } + static inline vec tbl1(){ + const vec t = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}; + return t; + } + static inline vec tbl2(){ + const vec t = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; + return t; + } + static inline svbool_t pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());} + static inline svbool_t pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());} + static inline svfloat32_t zero(){return svdup_f32(0.);} +}; + +template <> +struct acle{ + typedef svfloat16_t vt; + typedef float16_t pt; + typedef uint16_t uint; + typedef svuint16_t svuint; + + static inline svbool_t pg1(){return svptrue_b16();} + static inline svbool_t pg2(){return svptrue_pat_b16(SV_VL16);} + static inline svbool_t pg_even(){return svzip1_b16(svptrue_b16(), svpfalse_b());} + static inline svbool_t pg_odd() {return svzip1_b16(svpfalse_b(), svptrue_b16());} + static inline svfloat16_t zero(){return svdup_f16(0.);} +}; + +template <> +struct acle{ + typedef svuint32_t vt; + typedef svuint32x2_t vt2; + typedef Integer pt; + typedef uint32_t uint; + typedef svuint32_t svuint; + + //static inline svbool_t pg1(){return svptrue_b16();} + static inline svbool_t pg1(){return svptrue_b32();} + static inline svbool_t pg2(){return svptrue_pat_b32(SV_VL8);} + static inline svbool_t pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());} + static inline svbool_t pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());} +}; + +// --------------------------------------------------- + +struct Vsplat{ + // Complex float + inline vecf operator()(float a, float b){ + + vecf out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svdup_f32(a); + typename acle::vt b_v = svdup_f32(b); + typename acle::vt r_v = svzip1(a_v, b_v); + svst1(pg1, out.v, r_v); + return out; + } + + // Real float + inline vecf operator()(float a){ + + vecf out; + svbool_t pg1 = acle::pg1(); + typename acle::vt r_v = svdup_f32(a); + svst1(pg1, out.v, r_v); + return out; + } + + // Complex double + inline vecd operator()(double a, double b){ + + vecd out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svdup_f64(a); + typename acle::vt b_v = svdup_f64(b); + typename acle::vt r_v = svzip1(a_v, b_v); + svst1(pg1, out.v, r_v); + return out; + } + + // Real double + inline vecd operator()(double a){ + + vecd out; + svbool_t pg1 = acle::pg1(); + typename acle::vt r_v = svdup_f64(a); + 
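+    // all of these functors follow the same pattern: operands live in plain
+    // aligned arrays (struct vec), are loaded into SVE registers with svld1,
+    // combined, and written back with svst1; after inlining the compiler is
+    // expected to keep the intermediates in registers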
svst1(pg1, out.v, r_v); + return out; + } + + // Integer + inline vec operator()(Integer a){ + + vec out; + svbool_t pg1 = acle::pg1(); + // Add check whether Integer is really a uint32_t??? + typename acle::vt r_v = svdup_u32(a); + svst1(pg1, out.v, r_v); + return out; + } +}; + + struct Vstore{ + // Real + template + inline void operator()(vec a, T *D){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, (typename acle::pt*)&a.v); + // NOTE illegal '&' here causes SIGBUS at runtime, related to CAS-35230-H2H6T1 + // svst1(pg1, (typename acle::pt*)&D, a_v); + svst1(pg1, D, a_v); + + // non temporal version + //svstnt1(pg1, D, a_v); + } + }; + + struct Vstream{ + // Real + template + inline void operator()(T * a, vec b){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt b_v = svld1(pg1, b.v); + // FIXME non-temporal store causes compiler crash CAS-35230-H2H6T1 + svstnt1(pg1, a, b_v); + //svst1(pg1, a, b_v); + } + }; + + struct Vset{ + // Complex + template + inline vec operator()(std::complex *a){ + + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, (T*)a); + svst1(pg1, out.v, a_v); + + return out; + } + + // Real + template + inline vec operator()(T *a){ + + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a); + svst1(pg1, out.v, a_v); + + return out; + } + }; + + ///////////////////////////////////////////////////// + // Arithmetic operations + ///////////////////////////////////////////////////// + + + struct Sum{ + template + inline vec operator()(vec a, vec b){ + + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt r_v = svadd_x(pg1, a_v, b_v); + svst1(pg1, out.v, r_v); + + return out; + } + }; + + struct Sub{ + template + inline vec operator()(vec a, vec b){ + + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt r_v = svsub_x(pg1, a_v, b_v); + svst1(pg1, out.v, r_v); + + return out; + } + }; + + +struct Mult{ + template + inline vec operator()(vec a, vec b){ + + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt r_v = svmul_x(pg1, a_v, b_v); + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct MultRealPart{ + template + inline vec operator()(vec a, vec b){ + + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + + // using FCMLA + typename acle::vt z_v = acle::zero(); + typename acle::vt r_v = svcmla_x(pg1, z_v, a_v, b_v, 0); + + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct MaddRealPart{ + template + inline vec operator()(vec a, vec b, vec c){ + + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt c_v = svld1(pg1, c.v); + + // using FCMLA + typename acle::vt r_v = svcmla_x(pg1, c_v, a_v, b_v, 0); + + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct MultComplex{ + // Complex a*b + template + inline vec operator()(vec a, vec b){ + + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt z_v = __svzero(z_v); + + // using FCMLA + typename acle::vt r_v = svcmla_x(pg1, z_v, a_v, b_v, 90); + r_v = svcmla_x(pg1, r_v, a_v, b_v, 0); + + svst1(pg1, out.v, r_v); + + 
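+    // the two FCMLA passes above assemble the full complex product:
+    // rot 90 contributes (-a.im*b.im, +a.im*b.re) and rot 0 adds
+    // (+a.re*b.re, +a.re*b.im), i.e. re = a.re*b.re - a.im*b.im and
+    // im = a.re*b.im + a.im*b.re for every complex lane pair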
return out; + } +}; + +struct Div{ + // Real + template + inline vec operator()(vec a, vec b){ + + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt r_v = svdiv_x(pg1, a_v, b_v); + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct Conj{ + // Complex + template + inline vec operator()(vec a){ + + vec out; + svbool_t pg1 = acle::pg1(); + svbool_t pg_odd = acle::pg_odd(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt r_v = svneg_x(pg_odd, a_v); + svst1(pg1, out.v, r_v); + + return out; + } +}; + + + struct TimesMinusI{ + // Complex + template + inline vec operator()(vec a, vec b){ + + vec out; + const vec::uint> tbl_swap = acle::tbl_swap(); + svbool_t pg1 = acle::pg1(); + svbool_t pg_odd = acle::pg_odd(); + + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt a_v = svld1(pg1, a.v); + a_v = svtbl(a_v, tbl_swap_v); + typename acle::vt r_v = svneg_x(pg_odd, a_v); + svst1(pg1, out.v, r_v); + + return out; + } + }; + + struct TimesI{ + // Complex + template + inline vec operator()(vec a, vec b){ + + vec out; + const vec::uint> tbl_swap = acle::tbl_swap(); + svbool_t pg1 = acle::pg1(); + svbool_t pg_even = acle::pg_even(); + + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt a_v = svld1(pg1, a.v); + a_v = svtbl(a_v, tbl_swap_v); + typename acle::vt r_v = svneg_x(pg_even, a_v); + svst1(pg1, out.v, r_v); + + return out; + } + }; + + +struct PrecisionChange { + static inline vech StoH (const vecf &sa,const vecf &sb) { + + vech ret; + svbool_t pg1s = acle::pg1(); + svbool_t pg1h = acle::pg1(); + typename acle::vt sa_v = svld1(pg1s, sa.v); + typename acle::vt sb_v = svld1(pg1s, sb.v); + typename acle::vt ha_v = svcvt_f16_x(pg1s, sa_v); + typename acle::vt hb_v = svcvt_f16_x(pg1s, sb_v); + typename acle::vt r_v = svuzp1(ha_v, hb_v); + svst1(pg1h, (typename acle::pt*)&ret.v, r_v); + return ret; + } + static inline void HtoS(vech h,vecf &sa,vecf &sb) { + + svbool_t pg1h = acle::pg1(); + svbool_t pg1s = acle::pg1(); + typename acle::vt h_v = svld1(pg1h, (typename acle::pt*)&h.v); + typename acle::vt ha_v = svzip1(h_v, h_v); + typename acle::vt hb_v = svzip2(h_v, h_v); + typename acle::vt sa_v = svcvt_f32_x(pg1s, ha_v); + typename acle::vt sb_v = svcvt_f32_x(pg1s, hb_v); + svst1(pg1s, sa.v, sa_v); + svst1(pg1s, sb.v, sb_v); + } + static inline vecf DtoS (vecd a,vecd b) { + + vecf ret; + svbool_t pg1d = acle::pg1(); + svbool_t pg1s = acle::pg1(); + typename acle::vt a_v = svld1(pg1d, a.v); + typename acle::vt b_v = svld1(pg1d, b.v); + typename acle::vt sa_v = svcvt_f32_x(pg1d, a_v); + typename acle::vt sb_v = svcvt_f32_x(pg1d, b_v); + typename acle::vt r_v = svuzp1(sa_v, sb_v); + svst1(pg1s, ret.v, r_v); + return ret; + } + static inline void StoD (vecf s,vecd &a,vecd &b) { + + svbool_t pg1s = acle::pg1(); + svbool_t pg1d = acle::pg1(); + typename acle::vt s_v = svld1(pg1s, s.v); + typename acle::vt sa_v = svzip1(s_v, s_v); + typename acle::vt sb_v = svzip2(s_v, s_v); + typename acle::vt a_v = svcvt_f64_x(pg1d, sa_v); + typename acle::vt b_v = svcvt_f64_x(pg1d, sb_v); + svst1(pg1d, a.v, a_v); + svst1(pg1d, b.v, b_v); + } + static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) { + + vech ret; + svbool_t pg1d = acle::pg1(); + svbool_t pg1h = acle::pg1(); + typename acle::vt a_v = svld1(pg1d, a.v); + typename acle::vt b_v = svld1(pg1d, b.v); + typename acle::vt c_v = svld1(pg1d, c.v); + typename acle::vt d_v = svld1(pg1d, d.v); + 
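+    // each f64 -> f16 conversion below leaves its result in the low 16 bits
+    // of the 64-bit source container, so valid halves occupy every fourth
+    // 16-bit lane; the two svuzp1 stages then compact them into one
+    // contiguous vector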
typename acle::vt ha_v = svcvt_f16_x(pg1d, a_v); + typename acle::vt hb_v = svcvt_f16_x(pg1d, b_v); + typename acle::vt hc_v = svcvt_f16_x(pg1d, c_v); + typename acle::vt hd_v = svcvt_f16_x(pg1d, d_v); + typename acle::vt hab_v = svuzp1(ha_v, hb_v); + typename acle::vt hcd_v = svuzp1(hc_v, hd_v); + typename acle::vt r_v = svuzp1(hab_v, hcd_v); + svst1(pg1h, (typename acle::pt*)&ret.v, r_v); + + return ret; +/* + vecf sa,sb; + sa = DtoS(a,b); + sb = DtoS(c,d); + return StoH(sa,sb); +*/ + } + static inline void HtoD(vech h,vecd &a,vecd &b,vecd &c,vecd &d) { + + svbool_t pg1h = acle::pg1(); + svbool_t pg1d = acle::pg1(); + typename acle::vt h_v = svld1(pg1h, (typename acle::pt*)&h.v); + typename acle::vt sa_v = svzip1(h_v, h_v); + typename acle::vt sb_v = svzip2(h_v, h_v); + typename acle::vt da_v = svzip1(sa_v, sa_v); + typename acle::vt db_v = svzip2(sa_v, sa_v); + typename acle::vt dc_v = svzip1(sb_v, sb_v); + typename acle::vt dd_v = svzip2(sb_v, sb_v); + typename acle::vt a_v = svcvt_f64_x(pg1d, da_v); + typename acle::vt b_v = svcvt_f64_x(pg1d, db_v); + typename acle::vt c_v = svcvt_f64_x(pg1d, dc_v); + typename acle::vt d_v = svcvt_f64_x(pg1d, dd_v); + svst1(pg1d, a.v, a_v); + svst1(pg1d, b.v, b_v); + svst1(pg1d, c.v, c_v); + svst1(pg1d, d.v, d_v); +/* + vecf sa,sb; + HtoS(h,sa,sb); + StoD(sa,a,b); + StoD(sb,c,d); +*/ + } +}; + + + struct Exchange{ + + // Exchange0 is valid for arbitrary SVE vector length + template + static inline void Exchange0(vec &out1, vec &out2, const vec &in1, const vec &in2){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt a1_v = svld1(pg1, in1.v); + typename acle::vt a2_v = svld1(pg1, in2.v); + typename acle::vt r1_v = svext(a1_v, a1_v, (uint64_t)W::c); + r1_v = svext(r1_v, a2_v, (uint64_t)W::c); + typename acle::vt r2_v = svext(a2_v, a2_v, (uint64_t)W::c); + r2_v = svext(a1_v, r2_v, (uint64_t)W::c); + svst1(pg1, out1.v, r1_v); + svst1(pg1, out2.v, r2_v); + } + + + template + static inline void Exchange1(vec &out1, vec &out2, const vec &in1, const vec &in2){ + + svbool_t pg4 = acle::pg4(); + typename acle::vt4 in1_v4 = svld4(pg4, (typename acle::pt*)in1.v); + typename acle::vt4 in2_v4 = svld4(pg4, (typename acle::pt*)in2.v); + typename acle::vt4 out1_v4; + typename acle::vt4 out2_v4; + out1_v4.v0 = in1_v4.v0; + out1_v4.v1 = in1_v4.v1; + out1_v4.v2 = in2_v4.v0; + out1_v4.v3 = in2_v4.v1; + out2_v4.v0 = in1_v4.v2; + out2_v4.v1 = in1_v4.v3; + out2_v4.v2 = in2_v4.v2; + out2_v4.v3 = in2_v4.v3; + svst4(pg4, (typename acle::pt*)out1.v, out1_v4); + svst4(pg4, (typename acle::pt*)out2.v, out2_v4); + } + + template + static inline void Exchange2(vec &out1, vec &out2, const vec &in1, const vec &in2){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt a1_v = svld1(pg1, (typename acle::pt*)in1.v); + typename acle::vt a2_v = svld1(pg1, (typename acle::pt*)in2.v); + typename acle::vt r1_v = svtrn1(a1_v, a2_v); + typename acle::vt r2_v = svtrn2(a1_v, a2_v); + svst1(pg1, (typename acle::pt*)out1.v, r1_v); + svst1(pg1, (typename acle::pt*)out2.v, r2_v); + } + + static inline void Exchange3(vecf &out1, vecf &out2, const vecf &in1, const vecf &in2){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt a1_v = svld1(pg1, in1.v); + typename acle::vt a2_v = svld1(pg1, in2.v); + typename acle::vt r1_v = svtrn1(a1_v, a2_v); + typename acle::vt r2_v = svtrn2(a1_v, a2_v); + svst1(pg1, out1.v, r1_v); + svst1(pg1, out2.v, r2_v); + } + + static inline void Exchange3(vecd &out1, vecd &out2, const vecd &in1, const vecd &in2){ + assert(0); + return; + } +}; + + +struct Permute{ + + // 
Permute0 is valid for any SVE vector width + template + static inline vec Permute0(vec in) { + + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::vt r_v = svext(a_v, a_v, (uint64_t)(W::r / 2u)); + svst1(pg1, out.v, r_v); + return out; + } + + static inline vecd Permute1(vecd in) { + + vecd out; + const vec::uint> tbl_swap = acle::tbl1(); + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt r_v = svtbl(a_v, tbl_swap_v); + svst1(pg1, out.v, r_v); + + return out; + } + + static inline vecf Permute1(vecf in) { + + vecf out; + const vec::uint> tbl_swap = acle::tbl1(); + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt r_v = svtbl(a_v, tbl_swap_v); + svst1(pg1, out.v, r_v); + + return out; + } + + static inline vecd Permute2(vecd in) { + + vecd out; + const vec::uint> tbl_swap = acle::tbl_swap(); + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt r_v = svtbl(a_v, tbl_swap_v); + svst1(pg1, out.v, r_v); + + return out; + } + + static inline vecf Permute2(vecf in) { + + vecf out; + const vec::uint> tbl_swap = acle::tbl2(); + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt r_v = svtbl(a_v, tbl_swap_v); + svst1(pg1, out.v, r_v); + + return out; + } + + static inline vecf Permute3(vecf in) { + + vecf out; + const vec::uint> tbl_swap = acle::tbl_swap(); + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt r_v = svtbl(a_v, tbl_swap_v); + svst1(pg1, out.v, r_v); + + return out; + } + + static inline vecd Permute3(vecd in) { + return in; + } + +}; + +struct Rotate{ + + template static inline vec tRotate(vec in){ + + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::vt r_v = svext(a_v, a_v, (uint64_t)(n%W::r)); + svst1(pg1, out.v, r_v); + + return out; + } + + template + static inline vec rotate(vec in, int n){ + + switch(n){ + case 0: return tRotate<0, T>(in); break; + case 1: return tRotate<1, T>(in); break; + case 2: return tRotate<2, T>(in); break; + case 3: return tRotate<3, T>(in); break; + case 4: return tRotate<4, T>(in); break; + case 5: return tRotate<5, T>(in); break; + case 6: return tRotate<6, T>(in); break; + case 7: return tRotate<7, T>(in); break; + + case 8: return tRotate<8, T>(in); break; + case 9: return tRotate<9, T>(in); break; + case 10: return tRotate<10, T>(in); break; + case 11: return tRotate<11, T>(in); break; + case 12: return tRotate<12, T>(in); break; + case 13: return tRotate<13, T>(in); break; + case 14: return tRotate<14, T>(in); break; + case 15: return tRotate<15, T>(in); break; + default: assert(0); + } + } +}; + +// ======================================================================= +/* SVE ACLE reducedoes not compile, check later + +// tree-based reduction +#define svred(pg, v)\ +svaddv(pg, v); + +// left-to-right reduction +// #define svred(pg, v)\ +// svadda(pg, 0, v) + +template +struct Reduce{ + //Need templated class to overload output type + //General form must generate error if compiled + inline Out_type operator()(In_type &in){ + printf("Error, using 
wrong Reduce function\n"); + exit(1); + return 0; + } +}; + +//Complex float Reduce +template <> +inline Grid::ComplexF Reduce::operator()(vecf in){ + + svbool_t pg1 = acle::pg1(); + svbool_t pg_even = acle::pg_even(); + svbool_t pg_odd = acle::pg_odd(); + typename acle::vt a_v = svld1(pg1, in.v); + float a = svred(pg_even, a_v); + float b = svred(pg_odd, a_v); + + return Grid::ComplexF(a, b); + +} + +//Real float Reduce +template <> +inline Grid::RealF Reduce::operator()(vecf in){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + float a = svred(pg1, a_v); + + return a; +} + +//Complex double Reduce +template <> +inline Grid::ComplexD Reduce::operator()(vecd in){ + + svbool_t pg1 = acle::pg1(); + svbool_t pg_even = acle::pg_even(); + svbool_t pg_odd = acle::pg_odd(); + typename acle::vt a_v = svld1(pg1, in.v); + double a = svred(pg_even, a_v); + double b = svred(pg_odd, a_v); + + return Grid::ComplexD(a, b); +} + +//Real double Reduce +template <> +inline Grid::RealD Reduce::operator()(vecd in){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + double a = svred(pg1, a_v); + + return a; +} + +//Integer Reduce +template <> +inline Integer Reduce::operator()(veci in){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + Integer a = svred(pg1, a_v); + + return a; +} + +#undef svred +*/ + +// ======================================================================= + + +#define acc(v, a, off, step, n)\ +for (unsigned int i = off; i < n; i += step)\ +{\ + a += v[i];\ +} + +template +struct Reduce{ + //Need templated class to overload output type + //General form must generate error if compiled + inline Out_type operator()(In_type in){ + printf("Error, using wrong Reduce function\n"); + exit(1); + return 0; + } +}; + +//Complex float Reduce +template <> +inline Grid::ComplexF Reduce::operator()(vecf in){ + float a = 0.f, b = 0.f; + + acc(in.v, a, 0, 2, W::r); + acc(in.v, b, 1, 2, W::r); + + return Grid::ComplexF(a, b); +} + +//Real float Reduce +template<> +inline Grid::RealF Reduce::operator()(vecf in){ + float a = 0.; + + acc(in.v, a, 0, 1, W::r); + + return a; +} + +//Complex double Reduce +template<> +inline Grid::ComplexD Reduce::operator()(vecd in){ + double a = 0., b = 0.; + + acc(in.v, a, 0, 2, W::r); + acc(in.v, b, 1, 2, W::r); + + return Grid::ComplexD(a, b); +} + +//Real double Reduce +template<> +inline Grid::RealD Reduce::operator()(vecd in){ + double a = 0.f; + + acc(in.v, a, 0, 1, W::r); + + return a; +} + +//Integer Reduce +template<> +inline Integer Reduce::operator()(veci in){ + Integer a = 0; + + acc(in.v, a, 0, 1, W::r); + + return a; +} + +#undef acc // EIGEN compatibility + + +} // Optimization + +////////////////////////////////////////////////////////////////////////////////////// +// Here assign types + + typedef Optimization::vech SIMD_Htype; // Reduced precision type + typedef Optimization::vecf SIMD_Ftype; // Single precision type + typedef Optimization::vecd SIMD_Dtype; // Double precision type + typedef Optimization::veci SIMD_Itype; // Integer type + + // prefetch utilities + inline void v_prefetch0(int size, const char *ptr){}; + inline void prefetch_HINT_T0(const char *ptr){}; + + // Function name aliases + typedef Optimization::Vsplat VsplatSIMD; + typedef Optimization::Vstore VstoreSIMD; + typedef Optimization::Vset VsetSIMD; + typedef Optimization::Vstream VstreamSIMD; + template using ReduceSIMD = Optimization::Reduce; + + // Arithmetic operations + typedef Optimization::Sum 
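+  // these aliases are the interface consumed by Grid_vector_types.h, which
+  // dispatches its arithmetic through SumSIMD, MultComplexSIMD, ReduceSIMD
+  // and friends, so only this header needs to know about SVE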
SumSIMD; + typedef Optimization::Sub SubSIMD; + typedef Optimization::Div DivSIMD; + typedef Optimization::Mult MultSIMD; + typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::MultRealPart MultRealPartSIMD; + typedef Optimization::MaddRealPart MaddRealPartSIMD; + typedef Optimization::Conj ConjSIMD; + typedef Optimization::TimesMinusI TimesMinusISIMD; + typedef Optimization::TimesI TimesISIMD; + +} diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index e2b1fd07..c726660f 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -1,6 +1,6 @@ /************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid + Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/simd/Grid_vector_types.h @@ -73,7 +73,7 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; unsigned int sign_mask = 0x80000000u; Grid_half o; - + o.x = static_cast(0x0u); unsigned int sign = f.u & sign_mask; f.u ^= sign; @@ -93,7 +93,7 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { o.x = static_cast(f.u - denorm_magic.u); } else { unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd - + // update exponent, rounding bias part 1 f.u += ((unsigned int)(15 - 127) << 23) + 0xfff; // rounding bias part 2 @@ -101,7 +101,7 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { // take the bits! o.x = static_cast(f.u >> 13); } - } + } o.x |= static_cast(sign >> 16); return o; } @@ -110,9 +110,21 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { #ifdef GPU_VEC #include "Grid_gpu_vec.h" #endif +/* #ifdef GEN #include "Grid_generic.h" #endif +*/ +#ifdef GEN + #if defined A64FX // breakout A64FX SVE ACLE here + #pragma message("building for A64FX / SVE ACLE") + #define ARMCLANGHOTFIX + #include "Grid_a64fx-2.h" + #endif +#else + #include "Grid_generic.h" +#endif + #ifdef SSE4 #include "Grid_sse4.h" #endif @@ -170,7 +182,7 @@ template struct is_real struct is_integer : public std::false_type {}; template struct is_integer::value, void>::type> : public std::true_type {}; - + template using IfReal = Invoke::value, int> >; template using IfComplex = Invoke::value, int> >; template using IfInteger = Invoke::value, int> >; @@ -223,6 +235,21 @@ public: return sizeof(Vector_type) / sizeof(Scalar_type); } +#ifdef ARMCLANGHOTFIX + accelerator_inline Grid_simd &operator=(const Grid_simd &&rhs) { + svint8_t tmp = svld1(svptrue_b8(), (int8_t*)&(rhs.v)); + svst1(svptrue_b8(), (int8_t*)this, tmp); + //v = rhs.v; + return *this; + }; + + accelerator_inline Grid_simd &operator=(const Grid_simd &rhs) { + svint8_t tmp = svld1(svptrue_b8(), (int8_t*)&(rhs.v)); + svst1(svptrue_b8(), (int8_t*)this, tmp); + //v = rhs.v; + return *this; + }; +#else accelerator_inline Grid_simd &operator=(const Grid_simd &&rhs) { v = rhs.v; return *this; @@ -231,7 +258,7 @@ public: v = rhs.v; return *this; }; // faster than not declaring it and leaving to the compiler - +#endif accelerator Grid_simd() = default; accelerator_inline Grid_simd(const Grid_simd &rhs) : v(rhs.v){}; // compiles in movaps @@ -263,7 +290,7 @@ public: const Grid_simd *__restrict__ x) { *y = (*a) * (*x) + (*y); }; - + friend accelerator_inline void mult(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ l, const Grid_simd *__restrict__ r) { @@ -412,7 +439,7 @@ public: Grid_simd ret; Grid_simd::conv_t conv; 
Grid_simd::scalar_type s; - + conv.v = v.v; for (int i = 0; i < Nsimd(); i++) { s = conv.s[i]; @@ -441,7 +468,7 @@ public: return ret; } /////////////////////// - // Exchange + // Exchange // Al Ah , Bl Bh -> Al Bl Ah,Bh /////////////////////// friend accelerator_inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n) @@ -452,20 +479,20 @@ public: Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v); } else if(n==1) { Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v); - } else if(n==0) { + } else if(n==0) { Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v); } } - friend accelerator_inline void exchange0(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ + friend accelerator_inline void exchange0(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v); } - friend accelerator_inline void exchange1(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ + friend accelerator_inline void exchange1(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v); } - friend accelerator_inline void exchange2(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ + friend accelerator_inline void exchange2(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v); } - friend accelerator_inline void exchange3(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ + friend accelerator_inline void exchange3(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v); } //////////////////////////////////////////////////////////////////// @@ -490,7 +517,7 @@ public: int dist = perm & 0xF; y = rotate(b, dist); return; - } + } else if(perm==3) permute3(y, b); else if(perm==2) permute2(y, b); else if(perm==1) permute1(y, b); @@ -564,29 +591,29 @@ accelerator_inline Grid_simd rotate(Grid_simd b, int nrot) { ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot); return ret; } -template =0> +template =0> accelerator_inline void rotate( Grid_simd &ret,Grid_simd b,int nrot) { nrot = nrot % Grid_simd::Nsimd(); ret.v = Optimization::Rotate::rotate(b.v,nrot); } -template =0> +template =0> accelerator_inline void rotate(Grid_simd &ret,Grid_simd b,int nrot) { nrot = nrot % Grid_simd::Nsimd(); ret.v = Optimization::Rotate::rotate(b.v,2*nrot); } -template +template accelerator_inline void vbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ S* typepun =(S*) &src; vsplat(ret,typepun[lane]); -} -template =0> +} +template =0> accelerator_inline void rbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ S* typepun =(S*) &src; ret.v = unary(real(typepun[lane]), VsplatSIMD()); -} +} @@ -877,7 +904,7 @@ accelerator_inline typename toComplexMapper::Complexified toComplex(const conv.v = in.v; for (int i = 0; i < Rsimd::Nsimd(); i += 2) { - assert(conv.s[i + 1] == conv.s[i]); + assert(conv.s[i + 1] == conv.s[i]); // trap any cases where real was not duplicated // indicating the SIMD grids of real and imag assignment did not correctly // match From 28d49a3b6064c16ec40aed357ed8750ca295025f Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 3 Apr 2020 16:52:48 +0200 Subject: [PATCH 002/147] build problem resolved --- Grid/simd/Grid_vector_types.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Grid/simd/Grid_vector_types.h 
b/Grid/simd/Grid_vector_types.h index c726660f..e4e0832a 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -116,13 +116,13 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { #endif */ #ifdef GEN - #if defined A64FX // breakout A64FX SVE ACLE here - #pragma message("building for A64FX / SVE ACLE") - #define ARMCLANGHOTFIX - #include "Grid_a64fx-2.h" + #if defined(A64FX) // breakout A64FX SVE ACLE here + #pragma message("building for A64FX / SVE ACLE") + #define ARMCLANGHOTFIX + #include "Grid_a64fx-2.h" + #else + #include "Grid_generic.h" #endif -#else - #include "Grid_generic.h" #endif #ifdef SSE4 From 5f8a76d490ecbc7c1b3a7ff8bfa9e5888f8397df Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 3 Apr 2020 19:18:24 +0200 Subject: [PATCH 003/147] clean up, reduction in acle --- Grid/simd/Grid_a64fx-2.h | 334 +++++++++++++++++++-------------------- 1 file changed, 161 insertions(+), 173 deletions(-) diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h index 1bb67179..6968ca7a 100644 --- a/Grid/simd/Grid_a64fx-2.h +++ b/Grid/simd/Grid_a64fx-2.h @@ -2,17 +2,11 @@ Grid physics library, www.github.com/paboyle/Grid - Source file: ./lib/simd/Grid_a64fx-1.h + Source file: Grid_a64fx-2.h Copyright (C) 2020 -Author: Nils Meyer - - Copyright (C) 2015 - Copyright (C) 2017 - -Author: Antonin Portelli - Andrew Lawson + Author: Nils Meyer This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -47,8 +41,8 @@ static_assert(GEN_SIMD_WIDTH % 64u == 0, "A64FX SIMD vector size is 64 bytes"); #pragma error "Missing SVE feature" #endif /* __ARM_FEATURE_SVE */ -namespace Grid { -namespace Optimization { +NAMESPACE_BEGIN(Grid); +NAMESPACE_BEGIN(Optimization); // type traits giving the number of elements for each vector type template struct W; @@ -83,12 +77,12 @@ namespace Optimization { typedef vec vech; // half precision comms typedef vec veci; -}} // Grid::Optimization - +NAMESPACE_END(Optimization) +NAMESPACE_END(Grid) // low-level API -namespace Grid { -namespace Optimization { +NAMESPACE_BEGIN(Grid); +NAMESPACE_BEGIN(Optimization); template struct acle{}; @@ -242,21 +236,16 @@ struct Vsplat{ } }; - struct Vstore{ - // Real - template - inline void operator()(vec a, T *D){ +struct Vstore{ + // Real + template + inline void operator()(vec a, T *D){ - svbool_t pg1 = acle::pg1(); - typename acle::vt a_v = svld1(pg1, (typename acle::pt*)&a.v); - // NOTE illegal '&' here causes SIGBUS at runtime, related to CAS-35230-H2H6T1 - // svst1(pg1, (typename acle::pt*)&D, a_v); - svst1(pg1, D, a_v); - - // non temporal version - //svstnt1(pg1, D, a_v); - } - }; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, (typename acle::pt*)&a.v); + svst1(pg1, D, a_v); + } +}; struct Vstream{ // Real @@ -265,7 +254,6 @@ struct Vsplat{ svbool_t pg1 = acle::pg1(); typename acle::vt b_v = svld1(pg1, b.v); - // FIXME non-temporal store causes compiler crash CAS-35230-H2H6T1 svstnt1(pg1, a, b_v); //svst1(pg1, a, b_v); } @@ -297,40 +285,40 @@ struct Vsplat{ } }; - ///////////////////////////////////////////////////// - // Arithmetic operations - ///////////////////////////////////////////////////// +///////////////////////////////////////////////////// +// Arithmetic operations +///////////////////////////////////////////////////// - struct Sum{ - template - inline vec operator()(vec a, vec b){ +struct Sum{ + template + inline vec operator()(vec a, vec b){ - vec out; - svbool_t pg1 = 
acle::pg1(); - typename acle::vt a_v = svld1(pg1, a.v); - typename acle::vt b_v = svld1(pg1, b.v); - typename acle::vt r_v = svadd_x(pg1, a_v, b_v); - svst1(pg1, out.v, r_v); + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt r_v = svadd_x(pg1, a_v, b_v); + svst1(pg1, out.v, r_v); - return out; - } - }; + return out; + } +}; - struct Sub{ - template - inline vec operator()(vec a, vec b){ +struct Sub{ + template + inline vec operator()(vec a, vec b){ - vec out; - svbool_t pg1 = acle::pg1(); - typename acle::vt a_v = svld1(pg1, a.v); - typename acle::vt b_v = svld1(pg1, b.v); - typename acle::vt r_v = svsub_x(pg1, a_v, b_v); - svst1(pg1, out.v, r_v); + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt r_v = svsub_x(pg1, a_v, b_v); + svst1(pg1, out.v, r_v); - return out; - } - }; + return out; + } +}; struct Mult{ @@ -440,45 +428,45 @@ struct Conj{ }; - struct TimesMinusI{ - // Complex - template - inline vec operator()(vec a, vec b){ +struct TimesMinusI{ + // Complex + template + inline vec operator()(vec a, vec b){ - vec out; - const vec::uint> tbl_swap = acle::tbl_swap(); - svbool_t pg1 = acle::pg1(); - svbool_t pg_odd = acle::pg_odd(); + vec out; + const vec::uint> tbl_swap = acle::tbl_swap(); + svbool_t pg1 = acle::pg1(); + svbool_t pg_odd = acle::pg_odd(); - typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); - typename acle::vt a_v = svld1(pg1, a.v); - a_v = svtbl(a_v, tbl_swap_v); - typename acle::vt r_v = svneg_x(pg_odd, a_v); - svst1(pg1, out.v, r_v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt a_v = svld1(pg1, a.v); + a_v = svtbl(a_v, tbl_swap_v); + typename acle::vt r_v = svneg_x(pg_odd, a_v); + svst1(pg1, out.v, r_v); - return out; - } - }; + return out; + } +}; - struct TimesI{ - // Complex - template - inline vec operator()(vec a, vec b){ +struct TimesI{ + // Complex + template + inline vec operator()(vec a, vec b){ - vec out; - const vec::uint> tbl_swap = acle::tbl_swap(); - svbool_t pg1 = acle::pg1(); - svbool_t pg_even = acle::pg_even(); + vec out; + const vec::uint> tbl_swap = acle::tbl_swap(); + svbool_t pg1 = acle::pg1(); + svbool_t pg_even = acle::pg_even(); - typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); - typename acle::vt a_v = svld1(pg1, a.v); - a_v = svtbl(a_v, tbl_swap_v); - typename acle::vt r_v = svneg_x(pg_even, a_v); - svst1(pg1, out.v, r_v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt a_v = svld1(pg1, a.v); + a_v = svtbl(a_v, tbl_swap_v); + typename acle::vt r_v = svneg_x(pg_even, a_v); + svst1(pg1, out.v, r_v); - return out; - } - }; + return out; + } +}; struct PrecisionChange { @@ -587,71 +575,71 @@ struct PrecisionChange { }; - struct Exchange{ +struct Exchange{ - // Exchange0 is valid for arbitrary SVE vector length - template - static inline void Exchange0(vec &out1, vec &out2, const vec &in1, const vec &in2){ + // Exchange0 is valid for arbitrary SVE vector length + template + static inline void Exchange0(vec &out1, vec &out2, const vec &in1, const vec &in2){ - svbool_t pg1 = acle::pg1(); - typename acle::vt a1_v = svld1(pg1, in1.v); - typename acle::vt a2_v = svld1(pg1, in2.v); - typename acle::vt r1_v = svext(a1_v, a1_v, (uint64_t)W::c); - r1_v = svext(r1_v, a2_v, (uint64_t)W::c); - typename acle::vt r2_v = svext(a2_v, a2_v, (uint64_t)W::c); - r2_v = svext(a1_v, r2_v, 
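+  // svext(x, y, n) concatenates x:y and extracts one vector length starting
+  // at lane n; with n = W::c (half the real-lane count) the two steps per
+  // output yield out1 = lo(in1):lo(in2) and out2 = hi(in1):hi(in2)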
(uint64_t)W::c); - svst1(pg1, out1.v, r1_v); - svst1(pg1, out2.v, r2_v); - } + svbool_t pg1 = acle::pg1(); + typename acle::vt a1_v = svld1(pg1, in1.v); + typename acle::vt a2_v = svld1(pg1, in2.v); + typename acle::vt r1_v = svext(a1_v, a1_v, (uint64_t)W::c); + r1_v = svext(r1_v, a2_v, (uint64_t)W::c); + typename acle::vt r2_v = svext(a2_v, a2_v, (uint64_t)W::c); + r2_v = svext(a1_v, r2_v, (uint64_t)W::c); + svst1(pg1, out1.v, r1_v); + svst1(pg1, out2.v, r2_v); + } - template - static inline void Exchange1(vec &out1, vec &out2, const vec &in1, const vec &in2){ + template + static inline void Exchange1(vec &out1, vec &out2, const vec &in1, const vec &in2){ - svbool_t pg4 = acle::pg4(); - typename acle::vt4 in1_v4 = svld4(pg4, (typename acle::pt*)in1.v); - typename acle::vt4 in2_v4 = svld4(pg4, (typename acle::pt*)in2.v); - typename acle::vt4 out1_v4; - typename acle::vt4 out2_v4; - out1_v4.v0 = in1_v4.v0; - out1_v4.v1 = in1_v4.v1; - out1_v4.v2 = in2_v4.v0; - out1_v4.v3 = in2_v4.v1; - out2_v4.v0 = in1_v4.v2; - out2_v4.v1 = in1_v4.v3; - out2_v4.v2 = in2_v4.v2; - out2_v4.v3 = in2_v4.v3; - svst4(pg4, (typename acle::pt*)out1.v, out1_v4); - svst4(pg4, (typename acle::pt*)out2.v, out2_v4); - } + svbool_t pg4 = acle::pg4(); + typename acle::vt4 in1_v4 = svld4(pg4, (typename acle::pt*)in1.v); + typename acle::vt4 in2_v4 = svld4(pg4, (typename acle::pt*)in2.v); + typename acle::vt4 out1_v4; + typename acle::vt4 out2_v4; + out1_v4.v0 = in1_v4.v0; + out1_v4.v1 = in1_v4.v1; + out1_v4.v2 = in2_v4.v0; + out1_v4.v3 = in2_v4.v1; + out2_v4.v0 = in1_v4.v2; + out2_v4.v1 = in1_v4.v3; + out2_v4.v2 = in2_v4.v2; + out2_v4.v3 = in2_v4.v3; + svst4(pg4, (typename acle::pt*)out1.v, out1_v4); + svst4(pg4, (typename acle::pt*)out2.v, out2_v4); + } - template - static inline void Exchange2(vec &out1, vec &out2, const vec &in1, const vec &in2){ + template + static inline void Exchange2(vec &out1, vec &out2, const vec &in1, const vec &in2){ - svbool_t pg1 = acle::pg1(); - typename acle::vt a1_v = svld1(pg1, (typename acle::pt*)in1.v); - typename acle::vt a2_v = svld1(pg1, (typename acle::pt*)in2.v); - typename acle::vt r1_v = svtrn1(a1_v, a2_v); - typename acle::vt r2_v = svtrn2(a1_v, a2_v); - svst1(pg1, (typename acle::pt*)out1.v, r1_v); - svst1(pg1, (typename acle::pt*)out2.v, r2_v); - } + svbool_t pg1 = acle::pg1(); + typename acle::vt a1_v = svld1(pg1, (typename acle::pt*)in1.v); + typename acle::vt a2_v = svld1(pg1, (typename acle::pt*)in2.v); + typename acle::vt r1_v = svtrn1(a1_v, a2_v); + typename acle::vt r2_v = svtrn2(a1_v, a2_v); + svst1(pg1, (typename acle::pt*)out1.v, r1_v); + svst1(pg1, (typename acle::pt*)out2.v, r2_v); + } - static inline void Exchange3(vecf &out1, vecf &out2, const vecf &in1, const vecf &in2){ + static inline void Exchange3(vecf &out1, vecf &out2, const vecf &in1, const vecf &in2){ - svbool_t pg1 = acle::pg1(); - typename acle::vt a1_v = svld1(pg1, in1.v); - typename acle::vt a2_v = svld1(pg1, in2.v); - typename acle::vt r1_v = svtrn1(a1_v, a2_v); - typename acle::vt r2_v = svtrn2(a1_v, a2_v); - svst1(pg1, out1.v, r1_v); - svst1(pg1, out2.v, r2_v); - } + svbool_t pg1 = acle::pg1(); + typename acle::vt a1_v = svld1(pg1, in1.v); + typename acle::vt a2_v = svld1(pg1, in2.v); + typename acle::vt r1_v = svtrn1(a1_v, a2_v); + typename acle::vt r2_v = svtrn2(a1_v, a2_v); + svst1(pg1, out1.v, r1_v); + svst1(pg1, out2.v, r2_v); + } - static inline void Exchange3(vecd &out1, vecd &out2, const vecd &in1, const vecd &in2){ - assert(0); - return; - } + static inline void Exchange3(vecd &out1, vecd 
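&out1, vecd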
&out2, const vecd &in1, const vecd &in2){ + assert(0); + return; + } }; @@ -780,7 +768,7 @@ struct Rotate{ }; // ======================================================================= -/* SVE ACLE reducedoes not compile, check later +// SVE ACLE reduce does not compile, check later // tree-based reduction #define svred(pg, v)\ @@ -864,11 +852,11 @@ inline Integer Reduce::operator()(veci in){ } #undef svred -*/ +// */ // ======================================================================= - +/* #define acc(v, a, off, step, n)\ for (unsigned int i = off; i < n; i += step)\ {\ @@ -939,39 +927,39 @@ inline Integer Reduce::operator()(veci in){ } #undef acc // EIGEN compatibility +*/ - -} // Optimization +NAMESPACE_END(Optimization) ////////////////////////////////////////////////////////////////////////////////////// // Here assign types - typedef Optimization::vech SIMD_Htype; // Reduced precision type - typedef Optimization::vecf SIMD_Ftype; // Single precision type - typedef Optimization::vecd SIMD_Dtype; // Double precision type - typedef Optimization::veci SIMD_Itype; // Integer type +typedef Optimization::vech SIMD_Htype; // Reduced precision type +typedef Optimization::vecf SIMD_Ftype; // Single precision type +typedef Optimization::vecd SIMD_Dtype; // Double precision type +typedef Optimization::veci SIMD_Itype; // Integer type - // prefetch utilities - inline void v_prefetch0(int size, const char *ptr){}; - inline void prefetch_HINT_T0(const char *ptr){}; +// prefetch utilities +inline void v_prefetch0(int size, const char *ptr){}; +inline void prefetch_HINT_T0(const char *ptr){}; - // Function name aliases - typedef Optimization::Vsplat VsplatSIMD; - typedef Optimization::Vstore VstoreSIMD; - typedef Optimization::Vset VsetSIMD; - typedef Optimization::Vstream VstreamSIMD; - template using ReduceSIMD = Optimization::Reduce; +// Function name aliases +typedef Optimization::Vsplat VsplatSIMD; +typedef Optimization::Vstore VstoreSIMD; +typedef Optimization::Vset VsetSIMD; +typedef Optimization::Vstream VstreamSIMD; +template using ReduceSIMD = Optimization::Reduce; - // Arithmetic operations - typedef Optimization::Sum SumSIMD; - typedef Optimization::Sub SubSIMD; - typedef Optimization::Div DivSIMD; - typedef Optimization::Mult MultSIMD; - typedef Optimization::MultComplex MultComplexSIMD; - typedef Optimization::MultRealPart MultRealPartSIMD; - typedef Optimization::MaddRealPart MaddRealPartSIMD; - typedef Optimization::Conj ConjSIMD; - typedef Optimization::TimesMinusI TimesMinusISIMD; - typedef Optimization::TimesI TimesISIMD; +// Arithmetic operations +typedef Optimization::Sum SumSIMD; +typedef Optimization::Sub SubSIMD; +typedef Optimization::Div DivSIMD; +typedef Optimization::Mult MultSIMD; +typedef Optimization::MultComplex MultComplexSIMD; +typedef Optimization::MultRealPart MultRealPartSIMD; +typedef Optimization::MaddRealPart MaddRealPartSIMD; +typedef Optimization::Conj ConjSIMD; +typedef Optimization::TimesMinusI TimesMinusISIMD; +typedef Optimization::TimesI TimesISIMD; -} +NAMESPACE_END(Grid) From d8cea77707d36a4c92195cac0bf09c7cf2c670df Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 3 Apr 2020 19:22:25 +0200 Subject: [PATCH 004/147] define simd width in header --- Grid/simd/Grid_a64fx-2.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h index 6968ca7a..75097b0a 100644 --- a/Grid/simd/Grid_a64fx-2.h +++ b/Grid/simd/Grid_a64fx-2.h @@ -30,6 +30,10 @@ // Using SVE ACLE 
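+// GEN_SIMD_WIDTH below is the SIMD register width in bytes: the 64u default
+// matches the 512-bit SVE implementation of the A64FX, i.e. 8 doubles,
+// 16 floats or 32 half-precision words per vector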
///////////////////////////////////////////////////// +#ifndef GEN_SIMD_WIDTH +#define GEN_SIMD_WIDTH 64u +#endif + static_assert(GEN_SIMD_WIDTH % 64u == 0, "A64FX SIMD vector size is 64 bytes"); #ifdef __ARM_FEATURE_SVE From 46927771e360717a6c58665c299ea3237b825072 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 3 Apr 2020 19:30:48 +0200 Subject: [PATCH 005/147] reduce acle still needs overhaul --- Grid/simd/Grid_a64fx-2.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h index 75097b0a..1f14c21f 100644 --- a/Grid/simd/Grid_a64fx-2.h +++ b/Grid/simd/Grid_a64fx-2.h @@ -772,7 +772,7 @@ struct Rotate{ }; // ======================================================================= -// SVE ACLE reduce does not compile, check later +/* SVE ACLE reduce does not compile, check later // tree-based reduction #define svred(pg, v)\ @@ -856,11 +856,11 @@ inline Integer Reduce::operator()(veci in){ } #undef svred -// */ +*/ // ======================================================================= -/* + #define acc(v, a, off, step, n)\ for (unsigned int i = off; i < n; i += step)\ {\ @@ -931,7 +931,7 @@ inline Integer Reduce::operator()(veci in){ } #undef acc // EIGEN compatibility -*/ + NAMESPACE_END(Optimization) From b27e31957a91dcd9b2bfc04fcd6c16e675d13c89 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 3 Apr 2020 19:46:15 +0200 Subject: [PATCH 006/147] reduce acle revised --- Grid/simd/Grid_a64fx-2.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h index 1f14c21f..b83135f6 100644 --- a/Grid/simd/Grid_a64fx-2.h +++ b/Grid/simd/Grid_a64fx-2.h @@ -772,7 +772,7 @@ struct Rotate{ }; // ======================================================================= -/* SVE ACLE reduce does not compile, check later +// SVE ACLE reduce does not compile, check later // tree-based reduction #define svred(pg, v)\ @@ -786,7 +786,7 @@ template struct Reduce{ //Need templated class to overload output type //General form must generate error if compiled - inline Out_type operator()(In_type &in){ + inline Out_type operator()(In_type in){ printf("Error, using wrong Reduce function\n"); exit(1); return 0; @@ -856,11 +856,11 @@ inline Integer Reduce::operator()(veci in){ } #undef svred -*/ + // ======================================================================= - +/* #define acc(v, a, off, step, n)\ for (unsigned int i = off; i < n; i += step)\ {\ @@ -931,7 +931,7 @@ inline Integer Reduce::operator()(veci in){ } #undef acc // EIGEN compatibility - +*/ NAMESPACE_END(Optimization) From 15238e8d5ecc1fac5a5524deec15907ff1ef7316 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 3 Apr 2020 20:40:44 +0200 Subject: [PATCH 007/147] reduce acle works, clean up --- Grid/simd/Grid_a64fx-2.h | 105 ++++----------------------------------- 1 file changed, 10 insertions(+), 95 deletions(-) diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h index b83135f6..e44d24c9 100644 --- a/Grid/simd/Grid_a64fx-2.h +++ b/Grid/simd/Grid_a64fx-2.h @@ -251,17 +251,17 @@ struct Vstore{ } }; - struct Vstream{ - // Real - template - inline void operator()(T * a, vec b){ +struct Vstream{ + // Real + template + inline void operator()(T * a, vec b){ - svbool_t pg1 = acle::pg1(); - typename acle::vt b_v = svld1(pg1, b.v); - svstnt1(pg1, a, b_v); - //svst1(pg1, a, b_v); - } - }; + svbool_t pg1 = acle::pg1(); + typename acle::vt b_v = svld1(pg1, b.v); + svstnt1(pg1, a, b_v); + 
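+    // svstnt1 is the non-temporal store: it hints that the stored data will
+    // not be re-read soon, letting the write bypass the cache hierarchy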
//svst1(pg1, a, b_v); + } +}; struct Vset{ // Complex @@ -293,7 +293,6 @@ struct Vstore{ // Arithmetic operations ///////////////////////////////////////////////////// - struct Sum{ template inline vec operator()(vec a, vec b){ @@ -324,7 +323,6 @@ struct Sub{ } }; - struct Mult{ template inline vec operator()(vec a, vec b){ @@ -431,7 +429,6 @@ struct Conj{ } }; - struct TimesMinusI{ // Complex template @@ -472,7 +469,6 @@ struct TimesI{ } }; - struct PrecisionChange { static inline vech StoH (const vecf &sa,const vecf &sb) { @@ -578,7 +574,6 @@ struct PrecisionChange { } }; - struct Exchange{ // Exchange0 is valid for arbitrary SVE vector length @@ -646,7 +641,6 @@ struct Exchange{ } }; - struct Permute{ // Permute0 is valid for any SVE vector width @@ -771,9 +765,6 @@ struct Rotate{ } }; -// ======================================================================= -// SVE ACLE reduce does not compile, check later - // tree-based reduction #define svred(pg, v)\ svaddv(pg, v); @@ -857,82 +848,6 @@ inline Integer Reduce::operator()(veci in){ #undef svred - -// ======================================================================= - -/* -#define acc(v, a, off, step, n)\ -for (unsigned int i = off; i < n; i += step)\ -{\ - a += v[i];\ -} - -template -struct Reduce{ - //Need templated class to overload output type - //General form must generate error if compiled - inline Out_type operator()(In_type in){ - printf("Error, using wrong Reduce function\n"); - exit(1); - return 0; - } -}; - -//Complex float Reduce -template <> -inline Grid::ComplexF Reduce::operator()(vecf in){ - float a = 0.f, b = 0.f; - - acc(in.v, a, 0, 2, W::r); - acc(in.v, b, 1, 2, W::r); - - return Grid::ComplexF(a, b); -} - -//Real float Reduce -template<> -inline Grid::RealF Reduce::operator()(vecf in){ - float a = 0.; - - acc(in.v, a, 0, 1, W::r); - - return a; -} - -//Complex double Reduce -template<> -inline Grid::ComplexD Reduce::operator()(vecd in){ - double a = 0., b = 0.; - - acc(in.v, a, 0, 2, W::r); - acc(in.v, b, 1, 2, W::r); - - return Grid::ComplexD(a, b); -} - -//Real double Reduce -template<> -inline Grid::RealD Reduce::operator()(vecd in){ - double a = 0.f; - - acc(in.v, a, 0, 1, W::r); - - return a; -} - -//Integer Reduce -template<> -inline Integer Reduce::operator()(veci in){ - Integer a = 0; - - acc(in.v, a, 0, 1, W::r); - - return a; -} - -#undef acc // EIGEN compatibility -*/ - NAMESPACE_END(Optimization) ////////////////////////////////////////////////////////////////////////////////////// From 77fa586f6c90cacefa5387b9717118b8a86a14d3 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 9 Apr 2020 13:30:06 +0200 Subject: [PATCH 008/147] introduced A64FX Wilson kernels --- .../implementation/WilsonKernelsAsmA64FX.h | 660 +++++++++++++++++ .../WilsonKernelsInstantiationAsm.cc | 1 + Grid/simd/Fujitsu_A64FX_asm_double.h | 691 ++++++++++++++++++ Grid/simd/Fujitsu_A64FX_intrin_double.h | 567 ++++++++++++++ Grid/simd/Fujitsu_A64FX_undef.h | 68 ++ 5 files changed, 1987 insertions(+) create mode 100644 Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h create mode 100644 Grid/simd/Fujitsu_A64FX_asm_double.h create mode 100644 Grid/simd/Fujitsu_A64FX_intrin_double.h create mode 100644 Grid/simd/Fujitsu_A64FX_undef.h diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h new file mode 100644 index 00000000..4e428097 --- /dev/null +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h @@ -0,0 +1,660 @@ 
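+// note on the two kernel backends: A64FXINTRIN presumably selects the ACLE
+// intrinsics implementation (Fujitsu_A64FX_intrin_double.h, created above),
+// while the default is the inline-assembly variant
+// (Fujitsu_A64FX_asm_double.h); Fujitsu_A64FX_undef.h presumably clears the
+// kernel macros again after use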
+/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + + + Source file: ./lib/qcd/action/fermion/WilsonKernelsAsmA64FX.h + + Copyright (C) 2015 + +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +#if defined(A64FXINTRIN) +#pragma message("A64FX Wilson kernels intrin") +#else +#pragma message("A64FX Wilson kernels asm") +#endif + +#if defined(A64FX) + /////////////////////////////////////////////////////////// + // If we are A64FX specialise the single precision routine + /////////////////////////////////////////////////////////// +#if defined(A64FXINTRIN) +#include +#else +#include +#endif + + +/// Switch off the 5d vectorised code optimisations +#undef DWFVEC5D + +///////////////////////////////////////////////////////////////// +// XYZT vectorised, undag Kernel, single +///////////////////////////////////////////////////////////////// +#undef KERNEL_DAG +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +#undef INTERIOR_AND_EXTERIOR +#define INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int 
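+// (each specialization in this file carries no inline body: the #include
+// directly after the signature expands to a complete function body whose
+// variant is steered by the KERNEL_DAG / INTERIOR / EXTERIOR macros set
+// beforehand)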
Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + + +#undef INTERIOR_AND_EXTERIOR +#undef INTERIOR +#define EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +///////////////////////////////////////////////////////////////// +// XYZT vectorised, dag Kernel, single +///////////////////////////////////////////////////////////////// +#define KERNEL_DAG +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +#undef INTERIOR_AND_EXTERIOR +#define INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +#undef INTERIOR_AND_EXTERIOR +#undef INTERIOR +#define EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int 
ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +#undef MAYBEPERM +//#undef MULT_2SPIN +#define MAYBEPERM(A,B) +//#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) + +///////////////////////////////////////////////////////////////// +// Ls vectorised, undag Kernel, single +///////////////////////////////////////////////////////////////// + +#ifdef DWFVEC5D + +#undef KERNEL_DAG +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +#undef INTERIOR_AND_EXTERIOR +#define INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +#undef INTERIOR_AND_EXTERIOR +#undef INTERIOR +#define EXTERIOR +#undef MULT_2SPIN +#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +///////////////////////////////////////////////////////////////// +// Ls vectorised, dag Kernel, single +///////////////////////////////////////////////////////////////// +#define KERNEL_DAG 
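+// KERNEL_DAG selects the daggered hopping term (the projector/reconstruct
+// signs flip relative to the undag kernels above); the INTERIOR / EXTERIOR
+// flags below choose whether the stencil body handles all neighbours at
+// once, only the sites resident on this node, or only the halo delivered
+// by communications.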
+#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +#undef INTERIOR_AND_EXTERIOR +#define INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +#undef INTERIOR_AND_EXTERIOR +#undef INTERIOR +#define EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +#endif // VEC 5D + +//#undef COMPLEX_SIGNS +//#undef MAYBEPERM +//#undef MULT_2SPIN + +// undefine everything +#include + +/////////////////////////////////////////////////////////// +// If we are A64FX specialise the double precision routine +/////////////////////////////////////////////////////////// + +#if defined(A64FXINTRIN) +#include +#else +#include +#endif + +// KNL stuff +//#define MAYBEPERM(A,perm) if (perm) { A ; } +//#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf) +//#define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0]; + + +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR + +///////////////////////////////////////////////////////////////// +// XYZT vectorised, undag Kernel, double +///////////////////////////////////////////////////////////////// +#undef KERNEL_DAG +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR 
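+// Each specialisation below is a declaration without a body; the body is
+// supplied textually by the #include that follows it, so a single body file
+// is instantiated once per kernel variant with whatever KERNEL_DAG /
+// INTERIOR / EXTERIOR settings are active at that point. A minimal sketch
+// of the pattern (file and type names here are hypothetical, not Grid's):
+//
+//   // body.h contains only a bare compound statement:
+//   //   { out[ss] = coeff * in[ss]; }
+//   template<> void Kernels<double>::Run(int ss)   // declaration, no body
+//   #include "body.h"                              // preprocessor pastes it in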
+template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +#undef INTERIOR_AND_EXTERIOR +#define INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +#undef INTERIOR_AND_EXTERIOR +#undef INTERIOR +#define EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +///////////////////////////////////////////////////////////////// +// XYZT vectorised, dag Kernel, double +///////////////////////////////////////////////////////////////// +#define KERNEL_DAG +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, 
SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +#undef INTERIOR_AND_EXTERIOR +#define INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +#undef INTERIOR_AND_EXTERIOR +#undef INTERIOR +#define EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + + +// KNL stuff +//#undef MAYBEPERM +//#undef MULT_2SPIN +#define MAYBEPERM(A,B) +//#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) + +///////////////////////////////////////////////////////////////// +// Ls vectorised, undag Kernel, double +///////////////////////////////////////////////////////////////// +#ifdef DWFVEC5D + +#undef KERNEL_DAG +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +#undef INTERIOR_AND_EXTERIOR +#define INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const 
FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +#undef INTERIOR_AND_EXTERIOR +#undef INTERIOR +#define EXTERIOR +#undef MULT_2SPIN +#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +///////////////////////////////////////////////////////////////// +// Ls vectorised, dag Kernel, double +///////////////////////////////////////////////////////////////// +#define KERNEL_DAG +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +#undef INTERIOR_AND_EXTERIOR +#define INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +#undef INTERIOR_AND_EXTERIOR +#undef INTERIOR +#define EXTERIOR + +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView 
&U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#include + +#endif // VEC 5D + +// undefs +#include + +#endif //A64FX diff --git a/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiationAsm.cc b/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiationAsm.cc index f6f235c8..a8e9e6d9 100644 --- a/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiationAsm.cc +++ b/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiationAsm.cc @@ -37,6 +37,7 @@ directory //////////////////////////////////////////////////////////////////////// NAMESPACE_BEGIN(Grid); #include +#include #include NAMESPACE_END(Grid); diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h new file mode 100644 index 00000000..57636961 --- /dev/null +++ b/Grid/simd/Fujitsu_A64FX_asm_double.h @@ -0,0 +1,691 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: XXX + + Copyright (C) 2020 + +Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#define LOAD_CHIMU_A64FXd(x) LOAD_CHIMU_INTERLEAVED_A64FXd(x) +#define PREFETCH_CHIMU_L1(A) +#define PREFETCH_GAUGE_L1(A) +#define PREFETCH_CHIMU_L2(A) +#define PREFETCH_GAUGE_L2(A) +#define PF_GAUGE(A) +#define PREFETCH1_CHIMU(A) +#define PREFETCH_CHIMU(A) +#define LOCK_GAUGE(A) +#define UNLOCK_GAUGE(A) +#define MASK_REGS DECLARATIONS_A64FXd(A) +#define COMPLEX_SIGNS(A) +#define LOAD64(A,B) +#define SAVE_RESULT(A,B) RESULT_A64FXd(A) +#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXd(A) +#define MAYBEPERM(A,perm) if (perm) { A ; } +#define LOAD_CHI(base) LOAD_CHI_A64FXd(base) +#define ZERO_PSI ZERO_PSI_A64FXd +#define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd +#define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd +#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd +#define TP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TP_PROJ_A64FXd +#define XM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XM_PROJ_A64FXd +#define YM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YM_PROJ_A64FXd +#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZM_PROJ_A64FXd +#define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd +#define XP_RECON XP_RECON_A64FXd +#define XM_RECON XM_RECON_A64FXd +#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd +#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd +#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd +#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXd +#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd +#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd +#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd +#define PERMUTE_DIR0 PERM0_A64FXd +#define PERMUTE_DIR1 PERM1_A64FXd +#define PERMUTE_DIR2 PERM2_A64FXd +#define PERMUTE_DIR3 PERM3_A64FXd +// DECLARATIONS +#define DECLARATIONS_A64FXd(x) \ + const uint64_t lut[4][8] = { \ + {4, 5, 6, 7, 0, 1, 2, 3}, \ + {2, 3, 0, 1, 6, 7, 4, 5}, \ + {1, 0, 3, 2, 5, 4, 7, 6}, \ + {0, 1, 2, 4, 5, 6, 7, 8} };\ +asm ( \ + "fmov z31.d , 0 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// RESULT +#define RESULT_A64FXd(base) \ +{ \ +asm ( \ + "stnt1d { z0.d }, p5, [%[storeptr], -6, mul vl] \n\t" \ + "stnt1d { z1.d }, p5, [%[storeptr], -5, mul vl] \n\t" \ + "stnt1d { z2.d }, p5, [%[storeptr], -4, mul vl] \n\t" \ + "stnt1d { z3.d }, p5, [%[storeptr], -3, mul vl] \n\t" \ + "stnt1d { z4.d }, p5, [%[storeptr], -2, mul vl] \n\t" \ + "stnt1d { z5.d }, p5, [%[storeptr], -1, mul vl] \n\t" \ + "stnt1d { z6.d }, p5, [%[storeptr], 0, mul vl] \n\t" \ + "stnt1d { z7.d }, p5, [%[storeptr], 1, mul vl] \n\t" \ + "stnt1d { z8.d }, p5, [%[storeptr], 2, mul vl] \n\t" \ + "stnt1d { z9.d }, p5, [%[storeptr], 3, mul vl] \n\t" \ + "stnt1d { z10.d }, p5, [%[storeptr], 4, mul vl] \n\t" \ + "stnt1d { z11.d }, p5, [%[storeptr], 5, mul vl] \n\t" \ + : \ + : [storeptr] "r" (base + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PREFETCH_CHIMU_L2 (prefetch to L2) +#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ +{ \ +asm ( \ + "prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ + 
"prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ + : \ + : [fetchptr] "r" (base) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PREFETCH_CHIMU_L1 (prefetch to L1) +#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \ +{ \ +asm ( \ + "prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ + : \ + : [fetchptr] "r" (base) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PREFETCH_GAUGE_L2 (prefetch to L2) +#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ +{ \ + const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \ +asm ( \ + "prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ + : \ + : [fetchptr] "r" (baseU) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PREFETCH_GAUGE_L1 (prefetch to L1) +#define PREFETCH_GAUGE_L1_INTERNAL(A)_A64FXd \ +{ \ + const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \ +asm ( \ + "prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ + : \ + : [fetchptr] "r" (baseU) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHI +#define LOAD_CHI_A64FXd(base) \ +{ \ +asm ( \ + "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHIMU +#define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \ +{ \ +asm ( \ + "ptrue p5.d \n\t" \ + "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z27.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z24.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z28.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z25.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z29.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base + 2 * 3 * 64) \ + : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHIMU_0213 +#define LOAD_CHIMU_0213_A64FXd \ +{ \ + const SiteSpinor & ref(in[offset]); \ +asm ( \ + "ptrue p5.d \n\t" \ + "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z24.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z25.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z27.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z28.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z29.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (&ref[2][0]) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHIMU_0312 +#define LOAD_CHIMU_0312_A64FXd \ +{ \ + const SiteSpinor & ref(in[offset]); \ +asm ( \ + "ptrue p5.d \n\t" \ + "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z27.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z28.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z29.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z24.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z25.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (&ref[2][0]) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PERM0 +#define PERM0_A64FXd \ +asm ( \ + "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "tbl z12.d, { z12.d }, z30.d \n\t" \ + "tbl z13.d, { z13.d }, z30.d \n\t" \ + "tbl z14.d, { z14.d }, z30.d \n\t" \ + "tbl z15.d, { z15.d }, z30.d \n\t" \ + "tbl z16.d, { z16.d }, z30.d \n\t" \ + "tbl z17.d, { z17.d }, z30.d \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (0) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// PERM1 +#define PERM1_A64FXd \ +asm ( \ + "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "tbl z12.d, { z12.d }, z30.d \n\t" \ + "tbl z13.d, { z13.d }, z30.d \n\t" \ + "tbl z14.d, { z14.d }, z30.d \n\t" \ + "tbl z15.d, { z15.d }, z30.d \n\t" \ + "tbl z16.d, { z16.d }, z30.d \n\t" \ + "tbl z17.d, { z17.d }, z30.d \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (1) \ + : 
"memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// PERM2 +#define PERM2_A64FXd \ +asm ( \ + "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "tbl z12.d, { z12.d }, z30.d \n\t" \ + "tbl z13.d, { z13.d }, z30.d \n\t" \ + "tbl z14.d, { z14.d }, z30.d \n\t" \ + "tbl z15.d, { z15.d }, z30.d \n\t" \ + "tbl z16.d, { z16.d }, z30.d \n\t" \ + "tbl z17.d, { z17.d }, z30.d \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (2) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// PERM3 +#define PERM3_A64FXd + +// MULT_2SPIN +#define MULT_2SPIN_A64FXd(A) \ +{ \ + const auto & ref(U[sU][A]); \ +asm ( \ + "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z27.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "fmov z18.d , 0 \n\t" \ + "fmov z21.d , 0 \n\t" \ + "fmov z19.d , 0 \n\t" \ + "fmov z22.d , 0 \n\t" \ + "fmov z20.d , 0 \n\t" \ + "fmov z23.d , 0 \n\t" \ + "fcmla z18.d, p5/m, z24.d, z12.d, 0 \n\t" \ + "fcmla z21.d, p5/m, z24.d, z15.d, 0 \n\t" \ + "fcmla z19.d, p5/m, z25.d, z12.d, 0 \n\t" \ + "fcmla z22.d, p5/m, z25.d, z15.d, 0 \n\t" \ + "fcmla z20.d, p5/m, z26.d, z12.d, 0 \n\t" \ + "fcmla z23.d, p5/m, z26.d, z15.d, 0 \n\t" \ + "fcmla z18.d, p5/m, z24.d, z12.d, 90 \n\t" \ + "fcmla z21.d, p5/m, z24.d, z15.d, 90 \n\t" \ + "fcmla z19.d, p5/m, z25.d, z12.d, 90 \n\t" \ + "fcmla z22.d, p5/m, z25.d, z15.d, 90 \n\t" \ + "fcmla z20.d, p5/m, z26.d, z12.d, 90 \n\t" \ + "fcmla z23.d, p5/m, z26.d, z15.d, 90 \n\t" \ + "ld1d { z24.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z25.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "fcmla z18.d, p5/m, z27.d, z13.d, 0 \n\t" \ + "fcmla z21.d, p5/m, z27.d, z16.d, 0 \n\t" \ + "fcmla z19.d, p5/m, z28.d, z13.d, 0 \n\t" \ + "fcmla z22.d, p5/m, z28.d, z16.d, 0 \n\t" \ + "fcmla z20.d, p5/m, z29.d, z13.d, 0 \n\t" \ + "fcmla z23.d, p5/m, z29.d, z16.d, 0 \n\t" \ + "fcmla z18.d, p5/m, z27.d, z13.d, 90 \n\t" \ + "fcmla z21.d, p5/m, z27.d, z16.d, 90 \n\t" \ + "fcmla z19.d, p5/m, z28.d, z13.d, 90 \n\t" \ + "fcmla z22.d, p5/m, z28.d, z16.d, 90 \n\t" \ + "fcmla z20.d, p5/m, z29.d, z13.d, 90 \n\t" \ + "fcmla z23.d, p5/m, z29.d, z16.d, 90 \n\t" \ + "fcmla z18.d, p5/m, z24.d, z14.d, 0 \n\t" \ + "fcmla z21.d, p5/m, z24.d, z17.d, 0 \n\t" \ + "fcmla z19.d, p5/m, z25.d, z14.d, 0 \n\t" \ + "fcmla z22.d, p5/m, z25.d, z17.d, 0 \n\t" \ + "fcmla z20.d, p5/m, z26.d, z14.d, 0 \n\t" \ + "fcmla z23.d, p5/m, z26.d, z17.d, 0 \n\t" \ + "fcmla z18.d, p5/m, z24.d, z14.d, 90 \n\t" \ + "fcmla z21.d, p5/m, z24.d, z17.d, 90 \n\t" \ + "fcmla z19.d, p5/m, z25.d, z14.d, 90 \n\t" \ + "fcmla z22.d, p5/m, z25.d, z17.d, 90 \n\t" \ + "fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \ + "fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \ + : \ + : [fetchptr] "r" ((uint64_t)&ref[2][0]) \ + : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// XP_PROJ +#define XP_PROJ_A64FXd \ +{ \ +asm ( \ + "fcadd z12.d, p5/m, z12.d, z27.d, 90 \n\t" \ + "fcadd z13.d, p5/m, z13.d, z28.d, 90 \n\t" \ + "fcadd z14.d, p5/m, z14.d, z29.d, 90 \n\t" \ + "fcadd z15.d, p5/m, z15.d, z24.d, 90 \n\t" \ + "fcadd z16.d, p5/m, z16.d, z25.d, 90 \n\t" \ + "fcadd z17.d, p5/m, z17.d, z26.d, 90 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XP_RECON +#define XP_RECON_A64FXd \ +asm ( \ + "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \ + "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \ + "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \ + "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \ + "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \ + "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \ + "mov z0.d, z18.d \n\t" \ + "mov z1.d, z19.d \n\t" \ + "mov z2.d, z20.d \n\t" \ + "mov z3.d, z21.d \n\t" \ + "mov z4.d, z22.d \n\t" \ + "mov z5.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// XP_RECON_ACCUM +#define XP_RECON_ACCUM_A64FXd \ +asm ( \ + "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YP_PROJ +#define YP_PROJ_A64FXd \ +{ \ +asm ( \ + "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fsub z12.d, p5/m, z12.d, z27.d \n\t" \ + "fsub z13.d, p5/m, z13.d, z28.d \n\t" \ + "fsub z14.d, p5/m, z14.d, z29.d \n\t" \ + "fadd z15.d, p5/m, z15.d, z24.d \n\t" \ + "fadd z16.d, p5/m, z16.d, z25.d \n\t" \ + "fadd z17.d, p5/m, z17.d, z26.d \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (2) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// ZP_PROJ +#define ZP_PROJ_A64FXd \ +{ \ +asm ( \ + "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fcadd z12.d, p5/m, z12.d, z24.d, 90 \n\t" \ + "fcadd z13.d, p5/m, z13.d, z25.d, 90 \n\t" \ + "fcadd z14.d, p5/m, z14.d, z26.d, 90 \n\t" \ + "fcadd z15.d, p5/m, z15.d, z27.d, 270 \n\t" \ + "fcadd z16.d, p5/m, z16.d, z28.d, 270 \n\t" \ + "fcadd z17.d, p5/m, z17.d, z29.d, 270 \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (1) \ + : 
"memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// TP_PROJ +#define TP_PROJ_A64FXd \ +{ \ +asm ( \ + "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fadd z12.d, p5/m, z12.d, z24.d \n\t" \ + "fadd z13.d, p5/m, z13.d, z25.d \n\t" \ + "fadd z14.d, p5/m, z14.d, z26.d \n\t" \ + "fadd z15.d, p5/m, z15.d, z27.d \n\t" \ + "fadd z16.d, p5/m, z16.d, z28.d \n\t" \ + "fadd z17.d, p5/m, z17.d, z29.d \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (0) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XM_PROJ +#define XM_PROJ_A64FXd \ +{ \ +asm ( \ + "fcadd z12.d, p5/m, z12.d, z27.d, 270 \n\t" \ + "fcadd z13.d, p5/m, z13.d, z28.d, 270 \n\t" \ + "fcadd z14.d, p5/m, z14.d, z29.d, 270 \n\t" \ + "fcadd z15.d, p5/m, z15.d, z24.d, 270 \n\t" \ + "fcadd z16.d, p5/m, z16.d, z25.d, 270 \n\t" \ + "fcadd z17.d, p5/m, z17.d, z26.d, 270 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XM_RECON +#define XM_RECON_A64FXd \ +asm ( \ + "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \ + "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \ + "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \ + "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \ + "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \ + "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \ + "mov z0.d, z18.d \n\t" \ + "mov z1.d, z19.d \n\t" \ + "mov z2.d, z20.d \n\t" \ + "mov z3.d, z21.d \n\t" \ + "mov z4.d, z22.d \n\t" \ + "mov z5.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YM_PROJ +#define YM_PROJ_A64FXd \ +{ \ +asm ( \ + "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fadd z12.d, p5/m, z12.d, z27.d \n\t" \ + "fadd z13.d, p5/m, z13.d, z28.d \n\t" \ + "fadd z14.d, p5/m, z14.d, z29.d \n\t" \ + "fsub z15.d, p5/m, z15.d, z24.d \n\t" \ + "fsub z16.d, p5/m, z16.d, z25.d \n\t" \ + "fsub z17.d, p5/m, z17.d, z26.d \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (2) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// ZM_PROJ +#define ZM_PROJ_A64FXd \ +{ \ +asm ( \ + "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fcadd z12.d, p5/m, z12.d, z24.d, 270 \n\t" \ + "fcadd z13.d, p5/m, z13.d, z25.d, 270 \n\t" \ + "fcadd z14.d, p5/m, z14.d, z26.d, 270 \n\t" \ + "fcadd z15.d, p5/m, z15.d, z27.d, 90 \n\t" \ + "fcadd z16.d, p5/m, z16.d, z28.d, 90 \n\t" \ + "fcadd z17.d, p5/m, z17.d, z29.d, 90 \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (1) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// TM_PROJ +#define TM_PROJ_A64FXd \ +{ \ +asm ( \ + "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fsub z12.d, 
p5/m, z12.d, z24.d \n\t" \ + "fsub z13.d, p5/m, z13.d, z25.d \n\t" \ + "fsub z14.d, p5/m, z14.d, z26.d \n\t" \ + "fsub z15.d, p5/m, z15.d, z27.d \n\t" \ + "fsub z16.d, p5/m, z16.d, z28.d \n\t" \ + "fsub z17.d, p5/m, z17.d, z29.d \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (0) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XM_RECON_ACCUM +#define XM_RECON_ACCUM_A64FXd \ +asm ( \ + "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \ + "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \ + "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \ + "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \ + "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \ + "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \ + "mov z0.d, z18.d \n\t" \ + "mov z1.d, z19.d \n\t" \ + "mov z2.d, z20.d \n\t" \ + "mov z3.d, z21.d \n\t" \ + "mov z4.d, z22.d \n\t" \ + "mov z5.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YP_RECON_ACCUM +#define YP_RECON_ACCUM_A64FXd \ +asm ( \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fsub z9.d, p5/m, z9.d, z18.d \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fsub z10.d, p5/m, z10.d, z19.d \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fsub z11.d, p5/m, z11.d, z20.d \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fadd z6.d, p5/m, z6.d, z21.d \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fadd z7.d, p5/m, z7.d, z22.d \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + "fadd z8.d, p5/m, z8.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YM_RECON_ACCUM +#define YM_RECON_ACCUM_A64FXd \ +asm ( \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fadd z9.d, p5/m, z9.d, z18.d \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fadd z10.d, p5/m, z10.d, z19.d \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fadd z11.d, p5/m, z11.d, z20.d \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fsub z6.d, p5/m, z6.d, z21.d \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fsub z7.d, p5/m, z7.d, z22.d \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + "fsub z8.d, p5/m, z8.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// ZP_RECON_ACCUM +#define ZP_RECON_ACCUM_A64FXd \ +asm ( \ + "fcadd z6.d, p5/m, z6.d, z18.d, 270 \n\t" \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fcadd z7.d, p5/m, z7.d, z19.d, 270 \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fcadd z8.d, p5/m, z8.d, z20.d, 270 \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fcadd z9.d, p5/m, z9.d, z21.d, 90 \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fcadd z10.d, p5/m, z10.d, z22.d, 90 \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fcadd z11.d, p5/m, z11.d, z23.d, 90 \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// ZM_RECON_ACCUM 
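+// Accumulating variant of the Z-minus spin reconstruction: the upper two
+// spin rows take a plain add while the lower rows receive +/- i times UChi
+// via fcadd (rotation 90 adds i*x, rotation 270 adds -i*x). A sketch of the
+// same update in the ACLE style of the intrinsics header added later in
+// this patch:
+//   result_00 = svadd_x(pg1, result_00, UChi_00);       // psi_0 += chi_0
+//   result_20 = svcadd_x(pg1, result_20, UChi_00, 90);  // psi_2 += i*chi_0
+//   result_30 = svcadd_x(pg1, result_30, UChi_10, 270); // psi_3 -= i*chi_1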
+#define ZM_RECON_ACCUM_A64FXd \ +asm ( \ + "fcadd z6.d, p5/m, z6.d, z18.d, 90 \n\t" \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fcadd z7.d, p5/m, z7.d, z19.d, 90 \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fcadd z8.d, p5/m, z8.d, z20.d, 90 \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fcadd z9.d, p5/m, z9.d, z21.d, 270 \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fcadd z10.d, p5/m, z10.d, z22.d, 270 \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fcadd z11.d, p5/m, z11.d, z23.d, 270 \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// TP_RECON_ACCUM +#define TP_RECON_ACCUM_A64FXd \ +asm ( \ + "ptrue p5.d \n\t" \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fadd z6.d, p5/m, z6.d, z18.d \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fadd z7.d, p5/m, z7.d, z19.d \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fadd z8.d, p5/m, z8.d, z20.d \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fadd z9.d, p5/m, z9.d, z21.d \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fadd z10.d, p5/m, z10.d, z22.d \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + "fadd z11.d, p5/m, z11.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// TM_RECON_ACCUM +#define TM_RECON_ACCUM_A64FXd \ +asm ( \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fsub z6.d, p5/m, z6.d, z18.d \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fsub z7.d, p5/m, z7.d, z19.d \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fsub z8.d, p5/m, z8.d, z20.d \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fsub z9.d, p5/m, z9.d, z21.d \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fsub z10.d, p5/m, z10.d, z22.d \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + "fsub z11.d, p5/m, z11.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// ZERO_PSI +#define ZERO_PSI_A64FXd \ +asm ( \ + "ptrue p5.d \n\t" \ + "fmov z0.d , 0 \n\t" \ + "fmov z1.d , 0 \n\t" \ + "fmov z2.d , 0 \n\t" \ + "fmov z3.d , 0 \n\t" \ + "fmov z4.d , 0 \n\t" \ + "fmov z5.d , 0 \n\t" \ + "fmov z6.d , 0 \n\t" \ + "fmov z7.d , 0 \n\t" \ + "fmov z8.d , 0 \n\t" \ + "fmov z9.d , 0 \n\t" \ + "fmov z10.d , 0 \n\t" \ + "fmov z11.d , 0 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h new file mode 100644 index 00000000..1bafc114 --- /dev/null +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -0,0 +1,567 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: XXX + + Copyright (C) 2020 + +Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at 
your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#define LOAD_CHIMU_A64FXd(x) LOAD_CHIMU_INTERLEAVED_A64FXd(x) +#define PREFETCH_CHIMU_L1(A) +#define PREFETCH_GAUGE_L1(A) +#define PREFETCH_CHIMU_L2(A) +#define PREFETCH_GAUGE_L2(A) +#define PF_GAUGE(A) +#define PREFETCH1_CHIMU(A) +#define PREFETCH_CHIMU(A) +#define LOCK_GAUGE(A) +#define UNLOCK_GAUGE(A) +#define MASK_REGS DECLARATIONS_A64FXd(A) +#define COMPLEX_SIGNS(A) +#define LOAD64(A,B) +#define SAVE_RESULT(A,B) RESULT_A64FXd(A) +#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXd(A) +#define MAYBEPERM(A,perm) if (perm) { A ; } +#define LOAD_CHI(base) LOAD_CHI_A64FXd(base) +#define ZERO_PSI ZERO_PSI_A64FXd +#define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd +#define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd +#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd +#define TP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TP_PROJ_A64FXd +#define XM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XM_PROJ_A64FXd +#define YM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YM_PROJ_A64FXd +#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZM_PROJ_A64FXd +#define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd +#define XP_RECON XP_RECON_A64FXd +#define XM_RECON XM_RECON_A64FXd +#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd +#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd +#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd +#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXd +#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd +#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd +#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd +#define PERMUTE_DIR0 PERM0_A64FXd +#define PERMUTE_DIR1 PERM1_A64FXd +#define PERMUTE_DIR2 PERM2_A64FXd +#define PERMUTE_DIR3 PERM3_A64FXd +// DECLARATIONS +#define DECLARATIONS_A64FXd(x) \ + const uint64_t lut[4][8] = { \ + {4, 5, 6, 7, 0, 1, 2, 3}, \ + {2, 3, 0, 1, 6, 7, 4, 5}, \ + {1, 0, 3, 2, 5, 4, 7, 6}, \ + {0, 1, 2, 4, 5, 6, 7, 8} };\ + svfloat64_t result_00; \ + svfloat64_t result_01; \ + svfloat64_t result_02; \ + svfloat64_t result_10; \ + svfloat64_t result_11; \ + svfloat64_t result_12; \ + svfloat64_t result_20; \ + svfloat64_t result_21; \ + svfloat64_t result_22; \ + svfloat64_t result_30; \ + svfloat64_t result_31; \ + svfloat64_t result_32; \ + svfloat64_t Chi_00; \ + svfloat64_t Chi_01; \ + svfloat64_t Chi_02; \ + svfloat64_t Chi_10; \ + svfloat64_t Chi_11; \ + svfloat64_t Chi_12; \ + svfloat64_t UChi_00; \ + svfloat64_t UChi_01; \ + svfloat64_t UChi_02; \ + svfloat64_t UChi_10; \ + svfloat64_t UChi_11; \ + svfloat64_t UChi_12; \ + svfloat64_t U_00; \ + svfloat64_t U_10; \ + svfloat64_t U_20; \ + svfloat64_t U_01; \ + svfloat64_t U_11; \ + svfloat64_t U_21; \ + svbool_t pg1; \ + pg1 = svptrue_b64(); \ + svuint64_t table0; \ + svfloat64_t zero0; \ + zero0 = __svzero(zero0); + +#define Chimu_00 Chi_00 +#define Chimu_01 Chi_01 +#define Chimu_02 Chi_02 +#define Chimu_10 Chi_10 +#define Chimu_11 Chi_11 
+#define Chimu_12 Chi_12
+#define Chimu_20 U_00
+#define Chimu_21 U_10
+#define Chimu_22 U_20
+#define Chimu_30 U_01
+#define Chimu_31 U_11
+#define Chimu_32 U_21
+// RESULT
+#define RESULT_A64FXd(base) \
+{ \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \
+}
+// PREFETCH_CHIMU_L2 (prefetch to L2)
+#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \
+{ \
+    svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \
+}
+// PREFETCH_CHIMU_L1 (prefetch to L1)
+#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \
+{ \
+    svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \
+    svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \
+    svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \
+}
+// PREFETCH_GAUGE_L2 (prefetch to L2)
+#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \
+{ \
+    const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
+    svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \
+}
+// PREFETCH_GAUGE_L1 (prefetch to L1)
+#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \
+{ \
+    const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
+    svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \
+}
+// LOAD_CHI
+#define LOAD_CHI_A64FXd(base) \
+{ \
+    Chi_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \
+    Chi_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \
+    Chi_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \
+    Chi_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \
+    Chi_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \
+    Chi_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \
+}
+// LOAD_CHIMU
+#define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \
+{ \
+    Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \
+    Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \
+    Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \
+    Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \
+    Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \
+    Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \
+    Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \
+    Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \
+    Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \
+    Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \
+    Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \
+
Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ +} +// LOAD_CHIMU_0213 +#define LOAD_CHIMU_0213_A64FXd \ +{ \ + const SiteSpinor & ref(in[offset]); \ + Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ + Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ + Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ + Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ + Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ + Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ + Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \ + Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ + Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \ + Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ + Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \ +} +// LOAD_CHIMU_0312 +#define LOAD_CHIMU_0312_A64FXd \ +{ \ + const SiteSpinor & ref(in[offset]); \ + Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ + Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \ + Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ + Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \ + Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ + Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \ + Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ + Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ + Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ + Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ + Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ + Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ +} +// PERM0 +#define PERM0_A64FXd \ + table0 = svld1(pg1, (uint64_t*)&lut[0]); \ + Chi_00 = svtbl(Chi_00, table0); \ + Chi_01 = svtbl(Chi_01, table0); \ + Chi_02 = svtbl(Chi_02, table0); \ + Chi_10 = svtbl(Chi_10, table0); \ + Chi_11 = svtbl(Chi_11, table0); \ + Chi_12 = svtbl(Chi_12, table0); + +// PERM1 +#define PERM1_A64FXd \ + table0 = svld1(pg1, (uint64_t*)&lut[1]); \ + Chi_00 = svtbl(Chi_00, table0); \ + Chi_01 = svtbl(Chi_01, table0); \ + Chi_02 = svtbl(Chi_02, table0); \ + Chi_10 = svtbl(Chi_10, table0); \ + Chi_11 = svtbl(Chi_11, table0); \ + Chi_12 = svtbl(Chi_12, table0); + +// PERM2 +#define PERM2_A64FXd \ + table0 = svld1(pg1, (uint64_t*)&lut[2]); \ + Chi_00 = svtbl(Chi_00, table0); \ + Chi_01 = svtbl(Chi_01, table0); \ + Chi_02 = svtbl(Chi_02, table0); \ + Chi_10 = svtbl(Chi_10, table0); \ + Chi_11 = svtbl(Chi_11, table0); \ + Chi_12 = svtbl(Chi_12, table0); + +// PERM3 +#define PERM3_A64FXd + +// MULT_2SPIN +#define MULT_2SPIN_A64FXd(A) \ +{ \ + const auto & ref(U[sU][A]); \ + U_00 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -6 * 64)); \ + U_10 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -3 * 64)); \ + U_20 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + 0 * 64)); \ + U_01 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -5 * 64)); \ + U_11 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -2 * 64)); \ + U_21 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + 1 * 64)); \ + UChi_00 = __svzero(UChi_00); \ + UChi_10 = __svzero(UChi_10); \ + UChi_01 = __svzero(UChi_01); \ + UChi_11 = __svzero(UChi_11); \ + UChi_02 = __svzero(UChi_02); \ + UChi_12 = __svzero(UChi_12); \ 
+ UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 0); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 0); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 0); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 0); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 0); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 0); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 90); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 90); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 90); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ + U_00 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -4 * 64)); \ + U_10 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -1 * 64)); \ + U_20 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + 2 * 64)); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 0); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 0); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 0); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 90); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 90); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 90); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 90); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 90); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 90); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 0); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 0); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 0); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 0); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 0); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 0); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 90); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 90); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 90); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 90); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 90); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 90); \ +} +// XP_PROJ +#define XP_PROJ_A64FXd \ +{ \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 90); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 90); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 90); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 90); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 90); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 90); \ +} +// XP_RECON +#define XP_RECON_A64FXd \ + result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \ + result_21 = svcadd_x(pg1, result_21, UChi_11, 270); \ + result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \ + result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \ + result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \ + result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \ + result_00 = UChi_00; \ + result_01 = UChi_01; \ + result_02 = UChi_02; \ + result_10 = UChi_10; \ + result_11 = UChi_11; \ + result_12 = UChi_12; + +// XP_RECON_ACCUM +#define XP_RECON_ACCUM_A64FXd \ + result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \ + result_10 = svadd_x(pg1, result_10, 
UChi_10); \ + result_21 = svcadd_x(pg1, result_21, UChi_11, 270); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \ + result_12 = svadd_x(pg1, result_12, UChi_12); + +// YP_PROJ +#define YP_PROJ_A64FXd \ +{ \ + table0 = svld1(pg1, (uint64_t*)&lut[2]); \ + Chi_00 = svsub_x(pg1, Chimu_00, Chimu_30); \ + Chi_01 = svsub_x(pg1, Chimu_01, Chimu_31); \ + Chi_02 = svsub_x(pg1, Chimu_02, Chimu_32); \ + Chi_10 = svadd_x(pg1, Chimu_10, Chimu_20); \ + Chi_11 = svadd_x(pg1, Chimu_11, Chimu_21); \ + Chi_12 = svadd_x(pg1, Chimu_12, Chimu_22); \ +} +// ZP_PROJ +#define ZP_PROJ_A64FXd \ +{ \ + table0 = svld1(pg1, (uint64_t*)&lut[1]); \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 90); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 90); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 90); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 270); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 270); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 270); \ +} +// TP_PROJ +#define TP_PROJ_A64FXd \ +{ \ + table0 = svld1(pg1, (uint64_t*)&lut[0]); \ + Chi_00 = svadd_x(pg1, Chimu_00, Chimu_20); \ + Chi_01 = svadd_x(pg1, Chimu_01, Chimu_21); \ + Chi_02 = svadd_x(pg1, Chimu_02, Chimu_22); \ + Chi_10 = svadd_x(pg1, Chimu_10, Chimu_30); \ + Chi_11 = svadd_x(pg1, Chimu_11, Chimu_31); \ + Chi_12 = svadd_x(pg1, Chimu_12, Chimu_32); \ +} +// XM_PROJ +#define XM_PROJ_A64FXd \ +{ \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 270); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 270); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 270); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 270); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 270); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 270); \ +} +// XM_RECON +#define XM_RECON_A64FXd \ + result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \ + result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \ + result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \ + result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \ + result_31 = svcadd_x(pg1, result_31, UChi_01, 90); \ + result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \ + result_00 = UChi_00; \ + result_01 = UChi_01; \ + result_02 = UChi_02; \ + result_10 = UChi_10; \ + result_11 = UChi_11; \ + result_12 = UChi_12; + +// YM_PROJ +#define YM_PROJ_A64FXd \ +{ \ + table0 = svld1(pg1, (uint64_t*)&lut[2]); \ + Chi_00 = svadd_x(pg1, Chimu_00, Chimu_30); \ + Chi_01 = svadd_x(pg1, Chimu_01, Chimu_31); \ + Chi_02 = svadd_x(pg1, Chimu_02, Chimu_32); \ + Chi_10 = svsub_x(pg1, Chimu_10, Chimu_20); \ + Chi_11 = svsub_x(pg1, Chimu_11, Chimu_21); \ + Chi_12 = svsub_x(pg1, Chimu_12, Chimu_22); \ +} +// ZM_PROJ +#define ZM_PROJ_A64FXd \ +{ \ + table0 = svld1(pg1, (uint64_t*)&lut[1]); \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 270); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 270); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 270); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 90); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 90); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 90); \ +} +// TM_PROJ +#define TM_PROJ_A64FXd \ +{ \ + table0 = svld1(pg1, (uint64_t*)&lut[0]); \ + Chi_00 = svsub_x(pg1, Chimu_00, Chimu_20); \ + Chi_01 = svsub_x(pg1, Chimu_01, Chimu_21); \ + Chi_02 = svsub_x(pg1, Chimu_02, Chimu_22); \ + Chi_10 = svsub_x(pg1, Chimu_10, Chimu_30); \ + Chi_11 = svsub_x(pg1, Chimu_11, Chimu_31); \ + Chi_12 = svsub_x(pg1, Chimu_12, Chimu_32); \ +} +// XM_RECON_ACCUM +#define XM_RECON_ACCUM_A64FXd \ + result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \ + result_31 = svcadd_x(pg1, 
result_31, UChi_01, 90); \ + result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \ + result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \ + result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \ + result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \ + result_00 = UChi_00; \ + result_01 = UChi_01; \ + result_02 = UChi_02; \ + result_10 = UChi_10; \ + result_11 = UChi_11; \ + result_12 = UChi_12; + +// YP_RECON_ACCUM +#define YP_RECON_ACCUM_A64FXd \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_30 = svsub_x(pg1, result_30, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_31 = svsub_x(pg1, result_31, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_32 = svsub_x(pg1, result_32, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_20 = svadd_x(pg1, result_20, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_21 = svadd_x(pg1, result_21, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_22 = svadd_x(pg1, result_22, UChi_12); + +// YM_RECON_ACCUM +#define YM_RECON_ACCUM_A64FXd \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_30 = svadd_x(pg1, result_30, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_31 = svadd_x(pg1, result_31, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_32 = svadd_x(pg1, result_32, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_20 = svsub_x(pg1, result_20, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_21 = svsub_x(pg1, result_21, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_22 = svsub_x(pg1, result_22, UChi_12); + +// ZP_RECON_ACCUM +#define ZP_RECON_ACCUM_A64FXd \ + result_20 = svcadd_x(pg1, result_20, UChi_00, 270); \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_21 = svcadd_x(pg1, result_21, UChi_01, 270); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_22 = svcadd_x(pg1, result_22, UChi_02, 270); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_30 = svcadd_x(pg1, result_30, UChi_10, 90); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_31 = svcadd_x(pg1, result_31, UChi_11, 90); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_32 = svcadd_x(pg1, result_32, UChi_12, 90); \ + result_12 = svadd_x(pg1, result_12, UChi_12); + +// ZM_RECON_ACCUM +#define ZM_RECON_ACCUM_A64FXd \ + result_20 = svcadd_x(pg1, result_20, UChi_00, 90); \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_21 = svcadd_x(pg1, result_21, UChi_01, 90); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_22 = svcadd_x(pg1, result_22, UChi_02, 90); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_30 = svcadd_x(pg1, result_30, UChi_10, 270); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_31 = svcadd_x(pg1, result_31, UChi_11, 270); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_32 = svcadd_x(pg1, result_32, UChi_12, 270); \ + result_12 = svadd_x(pg1, result_12, UChi_12); + +// TP_RECON_ACCUM +#define TP_RECON_ACCUM_A64FXd \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_20 = svadd_x(pg1, result_20, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_21 = svadd_x(pg1, result_21, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_22 = svadd_x(pg1, result_22, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_30 = svadd_x(pg1, result_30, UChi_10); \ + result_11 = svadd_x(pg1, 
result_11, UChi_11); \ + result_31 = svadd_x(pg1, result_31, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_32 = svadd_x(pg1, result_32, UChi_12); + +// TM_RECON_ACCUM +#define TM_RECON_ACCUM_A64FXd \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_20 = svsub_x(pg1, result_20, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_21 = svsub_x(pg1, result_21, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_22 = svsub_x(pg1, result_22, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_30 = svsub_x(pg1, result_30, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_31 = svsub_x(pg1, result_31, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_32 = svsub_x(pg1, result_32, UChi_12); + +// ZERO_PSI +#define ZERO_PSI_A64FXd \ + result_00 = __svzero(result_00); \ + result_01 = __svzero(result_01); \ + result_02 = __svzero(result_02); \ + result_10 = __svzero(result_10); \ + result_11 = __svzero(result_11); \ + result_12 = __svzero(result_12); \ + result_20 = __svzero(result_20); \ + result_21 = __svzero(result_21); \ + result_22 = __svzero(result_22); \ + result_30 = __svzero(result_30); \ + result_31 = __svzero(result_31); \ + result_32 = __svzero(result_32); + diff --git a/Grid/simd/Fujitsu_A64FX_undef.h b/Grid/simd/Fujitsu_A64FX_undef.h new file mode 100644 index 00000000..07939007 --- /dev/null +++ b/Grid/simd/Fujitsu_A64FX_undef.h @@ -0,0 +1,68 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: Fujitsu_A64FX_undef.h + + Copyright (C) 2020 + +Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ + +#undef LOAD_CHIMU_A64FXd +#undef LOAD_CHIMU_A64FXf +#undef PREFETCH_CHIMU_L1 +#undef PREFETCH_GAUGE_L1 +#undef PREFETCH_CHIMU_L2 +#undef PREFETCH_GAUGE_L2 +#undef PF_GAUGE +#undef PREFETCH1_CHIMU +#undef PREFETCH_CHIMU +#undef LOCK_GAUGE +#undef UNLOCK_GAUGE +#undef MASK_REGS +#undef COMPLEX_SIGNS +#undef LOAD64 +#undef SAVE_RESULT +#undef MULT_2SPIN_DIR_PF +#undef MAYBEPERM +#undef LOAD_CHI +#undef ZERO_PSI +#undef XP_PROJMEM +#undef YP_PROJMEM +#undef ZP_PROJMEM +#undef TP_PROJMEM +#undef XM_PROJMEM +#undef YM_PROJMEM +#undef ZM_PROJMEM +#undef TM_PROJMEM +#undef XP_RECON +#undef XM_RECON +#undef YM_RECON_ACCUM +#undef ZM_RECON_ACCUM +#undef TM_RECON_ACCUM +#undef XP_RECON_ACCUM +#undef YP_RECON_ACCUM +#undef ZP_RECON_ACCUM +#undef TP_RECON_ACCUM +#undef PERMUTE_DIR0 +#undef PERMUTE_DIR1 +#undef PERMUTE_DIR2 +#undef PERMUTE_DIR3 From 8fb63f1c25417d60356ce0b370c8ebb447841686 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 9 Apr 2020 13:41:04 +0200 Subject: [PATCH 009/147] added A64FX Wilson kernels single precision --- Grid/simd/Fujitsu_A64FX_asm_single.h | 705 ++++++++++++++++++++++++ Grid/simd/Fujitsu_A64FX_intrin_single.h | 576 +++++++++++++++++++ 2 files changed, 1281 insertions(+) create mode 100644 Grid/simd/Fujitsu_A64FX_asm_single.h create mode 100644 Grid/simd/Fujitsu_A64FX_intrin_single.h diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h new file mode 100644 index 00000000..e2d2f8ef --- /dev/null +++ b/Grid/simd/Fujitsu_A64FX_asm_single.h @@ -0,0 +1,705 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: XXX + + Copyright (C) 2020 + +Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+
+ See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/* END LEGAL */
+#define LOAD_CHIMU_A64FXf(x) LOAD_CHIMU_INTERLEAVED_A64FXf(x)
+#define PREFETCH_CHIMU_L1(A)
+#define PREFETCH_GAUGE_L1(A)
+#define PREFETCH_CHIMU_L2(A)
+#define PREFETCH_GAUGE_L2(A)
+#define PF_GAUGE(A)
+#define PREFETCH1_CHIMU(A)
+#define PREFETCH_CHIMU(A)
+#define LOCK_GAUGE(A)
+#define UNLOCK_GAUGE(A)
+#define MASK_REGS DECLARATIONS_A64FXf(A)
+#define COMPLEX_SIGNS(A)
+#define LOAD64(A,B)
+#define SAVE_RESULT(A,B) RESULT_A64FXf(A)
+#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXf(A)
+#define MAYBEPERM(A,perm) if (perm) { A ; }
+#define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
+#define ZERO_PSI ZERO_PSI_A64FXf
+#define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf
+#define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf
+#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf
+#define TP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TP_PROJ_A64FXf
+#define XM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XM_PROJ_A64FXf
+#define YM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YM_PROJ_A64FXf
+#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZM_PROJ_A64FXf
+#define TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf
+#define XP_RECON XP_RECON_A64FXf
+#define XM_RECON XM_RECON_A64FXf
+#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf
+#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf
+#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf
+#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXf
+#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf
+#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf
+#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf
+#define PERMUTE_DIR0 PERM0_A64FXf
+#define PERMUTE_DIR1 PERM1_A64FXf
+#define PERMUTE_DIR2 PERM2_A64FXf
+#define PERMUTE_DIR3 PERM3_A64FXf
+// DECLARATIONS
+#define DECLARATIONS_A64FXf(x) \
+    const uint32_t lut[4][16] = { \
+        {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \
+        {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \
+        {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \
+        {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \
+asm ( \
+    "fmov z31.f , 0 \n\t" \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+);
+
+// RESULT
+#define RESULT_A64FXf(base) \
+{ \
+asm ( \
+    "stnt1d { z0.f }, p5, [%[storeptr], -6, mul vl] \n\t" \
+    "stnt1d { z1.f }, p5, [%[storeptr], -5, mul vl] \n\t" \
+    "stnt1d { z2.f }, p5, [%[storeptr], -4, mul vl] \n\t" \
+    "stnt1d { z3.f }, p5, [%[storeptr], -3, mul vl] \n\t" \
+    "stnt1d { z4.f }, p5, [%[storeptr], -2, mul vl] \n\t" \
+    "stnt1d { z5.f }, p5, [%[storeptr], -1, mul vl] \n\t" \
+    "stnt1d { z6.f }, p5, [%[storeptr], 0, mul vl] \n\t" \
+    "stnt1d { z7.f }, p5, [%[storeptr], 1, mul vl] \n\t" \
+    "stnt1d { z8.f }, p5, [%[storeptr], 2, mul vl] \n\t" \
+    "stnt1d { z9.f }, p5, [%[storeptr], 3, mul vl] \n\t" \
+    "stnt1d { z10.f }, p5, [%[storeptr], 4, mul vl] \n\t" \
+    "stnt1d { z11.f }, p5, [%[storeptr], 5, mul vl] \n\t" \
+    : \
+    : [storeptr] "r" (base + 2 * 3 * 64) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// PREFETCH_CHIMU_L2 (prefetch to L2)
+#define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \
+{ \
+asm ( \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
+    : \
+    : [fetchptr] "r" (base) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// PREFETCH_CHIMU_L1 (prefetch to L1)
+#define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \
+{ \
+asm ( \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
+    : \
+    : [fetchptr] "r" (base) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// PREFETCH_GAUGE_L2 (prefetch to L2)
+#define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \
+{ \
+    const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
+asm ( \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
+    : \
+    : [fetchptr] "r" (baseU) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// PREFETCH_GAUGE_L1 (prefetch to L1)
+#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \
+{ \
+    const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
+asm ( \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
+    : \
+    : [fetchptr] "r" (baseU) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// LOAD_CHI
+#define LOAD_CHI_A64FXf(base) \
+{ \
+asm ( \
+    "ld1d { z12.f }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
+    "ld1d { z13.f }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
+    "ld1d { z14.f }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \
+    "ld1d { z15.f }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
+    "ld1d { z16.f }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
+    "ld1d { z17.f }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \
+    : \
+    : [fetchptr] "r" (base + 2 * 3 * 64) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// LOAD_CHIMU
+#define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \
+{ \
+asm ( \
+    "ptrue p5.f \n\t" \
+    "ld1d { z12.f }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
+    "ld1d { z27.f }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \
+    "ld1d { z15.f }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
+    "ld1d { z24.f }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \
+    "ld1d { z13.f }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
+    "ld1d { z28.f }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \
+    "ld1d { z16.f }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
+    "ld1d { z25.f }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \
+    "ld1d { z14.f }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \
+    "ld1d { z29.f }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \
+    "ld1d { z17.f }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \
+    "ld1d
{ z26.f }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHIMU_0213 +#define LOAD_CHIMU_0213_A64FXf \ +{ \ + const SiteSpinor & ref(in[offset]); \ +asm ( \ + "ptrue p5.f \n\t" \ + "ld1d { z12.f }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z24.f }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z13.f }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z25.f }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z14.f }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z26.f }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1d { z15.f }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z27.f }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z16.f }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z28.f }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z17.f }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z29.f }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (&ref[2][0]) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHIMU_0312 +#define LOAD_CHIMU_0312_A64FXf \ +{ \ + const SiteSpinor & ref(in[offset]); \ +asm ( \ + "ptrue p5.f \n\t" \ + "ld1d { z12.f }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z27.f }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z13.f }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z28.f }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z14.f }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z29.f }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1d { z15.f }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z24.f }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z16.f }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z25.f }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z17.f }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z26.f }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (&ref[2][0]) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PERM0 +#define PERM0_A64FXf \ +asm ( \ + "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "tbl z12.f, { z12.f }, z30.f \n\t" \ + "tbl z13.f, { z13.f }, z30.f \n\t" \ + "tbl z14.f, { z14.f }, z30.f \n\t" \ + "tbl z15.f, { z15.f }, z30.f \n\t" \ + "tbl z16.f, { z16.f }, z30.f \n\t" \ + "tbl z17.f, { z17.f }, z30.f \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (0) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// PERM1 +#define PERM1_A64FXf \ +asm ( \ + "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "tbl z12.f, { z12.f }, z30.f \n\t" \ + "tbl z13.f, { z13.f }, z30.f \n\t" \ + "tbl z14.f, { z14.f }, z30.f \n\t" \ + "tbl z15.f, { z15.f }, z30.f \n\t" \ + "tbl z16.f, { z16.f }, z30.f \n\t" \ + "tbl z17.f, { z17.f }, z30.f \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (1) \ + : 
"memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// PERM2 +#define PERM2_A64FXf \ +asm ( \ + "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "tbl z12.f, { z12.f }, z30.f \n\t" \ + "tbl z13.f, { z13.f }, z30.f \n\t" \ + "tbl z14.f, { z14.f }, z30.f \n\t" \ + "tbl z15.f, { z15.f }, z30.f \n\t" \ + "tbl z16.f, { z16.f }, z30.f \n\t" \ + "tbl z17.f, { z17.f }, z30.f \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (2) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// PERM3 +#define PERM3_A64FXf \ +asm ( \ + "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "tbl z12.f, { z12.f }, z30.f \n\t" \ + "tbl z13.f, { z13.f }, z30.f \n\t" \ + "tbl z14.f, { z14.f }, z30.f \n\t" \ + "tbl z15.f, { z15.f }, z30.f \n\t" \ + "tbl z16.f, { z16.f }, z30.f \n\t" \ + "tbl z17.f, { z17.f }, z30.f \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (3) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// MULT_2SPIN +#define MULT_2SPIN_A64FXf(A) \ +{ \ + const auto & ref(U[sU][A]); \ +asm ( \ + "ld1d { z24.f }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z25.f }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z26.f }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z27.f }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z28.f }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z29.f }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "fmov z18.f , 0 \n\t" \ + "fmov z21.f , 0 \n\t" \ + "fmov z19.f , 0 \n\t" \ + "fmov z22.f , 0 \n\t" \ + "fmov z20.f , 0 \n\t" \ + "fmov z23.f , 0 \n\t" \ + "fcmla z18.f, p5/m, z24.f, z12.f, 0 \n\t" \ + "fcmla z21.f, p5/m, z24.f, z15.f, 0 \n\t" \ + "fcmla z19.f, p5/m, z25.f, z12.f, 0 \n\t" \ + "fcmla z22.f, p5/m, z25.f, z15.f, 0 \n\t" \ + "fcmla z20.f, p5/m, z26.f, z12.f, 0 \n\t" \ + "fcmla z23.f, p5/m, z26.f, z15.f, 0 \n\t" \ + "fcmla z18.f, p5/m, z24.f, z12.f, 90 \n\t" \ + "fcmla z21.f, p5/m, z24.f, z15.f, 90 \n\t" \ + "fcmla z19.f, p5/m, z25.f, z12.f, 90 \n\t" \ + "fcmla z22.f, p5/m, z25.f, z15.f, 90 \n\t" \ + "fcmla z20.f, p5/m, z26.f, z12.f, 90 \n\t" \ + "fcmla z23.f, p5/m, z26.f, z15.f, 90 \n\t" \ + "ld1d { z24.f }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z25.f }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z26.f }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "fcmla z18.f, p5/m, z27.f, z13.f, 0 \n\t" \ + "fcmla z21.f, p5/m, z27.f, z16.f, 0 \n\t" \ + "fcmla z19.f, p5/m, z28.f, z13.f, 0 \n\t" \ + "fcmla z22.f, p5/m, z28.f, z16.f, 0 \n\t" \ + "fcmla z20.f, p5/m, z29.f, z13.f, 0 \n\t" \ + "fcmla z23.f, p5/m, z29.f, z16.f, 0 \n\t" \ + "fcmla z18.f, p5/m, z27.f, z13.f, 90 \n\t" \ + "fcmla z21.f, p5/m, z27.f, z16.f, 90 \n\t" \ + "fcmla z19.f, p5/m, z28.f, z13.f, 90 \n\t" \ + "fcmla z22.f, p5/m, z28.f, z16.f, 90 \n\t" \ + "fcmla z20.f, p5/m, z29.f, z13.f, 90 \n\t" \ + "fcmla z23.f, p5/m, z29.f, z16.f, 90 \n\t" \ + "fcmla z18.f, p5/m, z24.f, z14.f, 0 \n\t" \ + "fcmla z21.f, p5/m, z24.f, z17.f, 0 \n\t" \ + "fcmla z19.f, p5/m, z25.f, z14.f, 0 \n\t" \ + "fcmla z22.f, p5/m, z25.f, z17.f, 0 \n\t" \ + "fcmla z20.f, p5/m, z26.f, z14.f, 0 \n\t" \ + 
"fcmla z23.f, p5/m, z26.f, z17.f, 0 \n\t" \ + "fcmla z18.f, p5/m, z24.f, z14.f, 90 \n\t" \ + "fcmla z21.f, p5/m, z24.f, z17.f, 90 \n\t" \ + "fcmla z19.f, p5/m, z25.f, z14.f, 90 \n\t" \ + "fcmla z22.f, p5/m, z25.f, z17.f, 90 \n\t" \ + "fcmla z20.f, p5/m, z26.f, z14.f, 90 \n\t" \ + "fcmla z23.f, p5/m, z26.f, z17.f, 90 \n\t" \ + : \ + : [fetchptr] "r" ((uint64_t)&ref[2][0]) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// XP_PROJ +#define XP_PROJ_A64FXf \ +{ \ +asm ( \ + "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fcadd z12.f, p5/m, z12.f, z27.f, 90 \n\t" \ + "fcadd z13.f, p5/m, z13.f, z28.f, 90 \n\t" \ + "fcadd z14.f, p5/m, z14.f, z29.f, 90 \n\t" \ + "fcadd z15.f, p5/m, z15.f, z24.f, 90 \n\t" \ + "fcadd z16.f, p5/m, z16.f, z25.f, 90 \n\t" \ + "fcadd z17.f, p5/m, z17.f, z26.f, 90 \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (3) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XP_RECON +#define XP_RECON_A64FXf \ +asm ( \ + "fcadd z6.f, p5/m, z6.f, z21.f, 270 \n\t" \ + "fcadd z7.f, p5/m, z7.f, z22.f, 270 \n\t" \ + "fcadd z8.f, p5/m, z8.f, z23.f, 270 \n\t" \ + "fcadd z9.f, p5/m, z9.f, z18.f, 270 \n\t" \ + "fcadd z10.f, p5/m, z10.f, z19.f, 270 \n\t" \ + "fcadd z11.f, p5/m, z11.f, z20.f, 270 \n\t" \ + "mov z0.f, z18.f \n\t" \ + "mov z1.f, z19.f \n\t" \ + "mov z2.f, z20.f \n\t" \ + "mov z3.f, z21.f \n\t" \ + "mov z4.f, z22.f \n\t" \ + "mov z5.f, z23.f \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// XP_RECON_ACCUM +#define XP_RECON_ACCUM_A64FXf \ +asm ( \ + "fcadd z9.f, p5/m, z9.f, z18.f, 270 \n\t" \ + "fadd z0.f, p5/m, z0.f, z18.f \n\t" \ + "fcadd z10.f, p5/m, z10.f, z19.f, 270 \n\t" \ + "fadd z1.f, p5/m, z1.f, z19.f \n\t" \ + "fcadd z11.f, p5/m, z11.f, z20.f, 270 \n\t" \ + "fadd z2.f, p5/m, z2.f, z20.f \n\t" \ + "fcadd z6.f, p5/m, z6.f, z21.f, 270 \n\t" \ + "fadd z3.f, p5/m, z3.f, z21.f \n\t" \ + "fcadd z7.f, p5/m, z7.f, z22.f, 270 \n\t" \ + "fadd z4.f, p5/m, z4.f, z22.f \n\t" \ + "fcadd z8.f, p5/m, z8.f, z23.f, 270 \n\t" \ + "fadd z5.f, p5/m, z5.f, z23.f \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YP_PROJ +#define YP_PROJ_A64FXf \ +{ \ +asm ( \ + "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fsub z12.f, p5/m, z12.f, z27.f \n\t" \ + "fsub z13.f, p5/m, z13.f, z28.f \n\t" \ + "fsub z14.f, p5/m, z14.f, z29.f \n\t" \ + "fadd z15.f, p5/m, z15.f, z24.f \n\t" \ + "fadd z16.f, p5/m, z16.f, z25.f \n\t" \ + "fadd z17.f, p5/m, z17.f, z26.f \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (2) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// ZP_PROJ +#define ZP_PROJ_A64FXf \ +{ \ +asm ( \ + "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fcadd z12.f, p5/m, z12.f, z24.f, 
90 \n\t" \ + "fcadd z13.f, p5/m, z13.f, z25.f, 90 \n\t" \ + "fcadd z14.f, p5/m, z14.f, z26.f, 90 \n\t" \ + "fcadd z15.f, p5/m, z15.f, z27.f, 270 \n\t" \ + "fcadd z16.f, p5/m, z16.f, z28.f, 270 \n\t" \ + "fcadd z17.f, p5/m, z17.f, z29.f, 270 \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (1) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// TP_PROJ +#define TP_PROJ_A64FXf \ +{ \ +asm ( \ + "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fadd z12.f, p5/m, z12.f, z24.f \n\t" \ + "fadd z13.f, p5/m, z13.f, z25.f \n\t" \ + "fadd z14.f, p5/m, z14.f, z26.f \n\t" \ + "fadd z15.f, p5/m, z15.f, z27.f \n\t" \ + "fadd z16.f, p5/m, z16.f, z28.f \n\t" \ + "fadd z17.f, p5/m, z17.f, z29.f \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (0) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XM_PROJ +#define XM_PROJ_A64FXf \ +{ \ +asm ( \ + "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fcadd z12.f, p5/m, z12.f, z27.f, 270 \n\t" \ + "fcadd z13.f, p5/m, z13.f, z28.f, 270 \n\t" \ + "fcadd z14.f, p5/m, z14.f, z29.f, 270 \n\t" \ + "fcadd z15.f, p5/m, z15.f, z24.f, 270 \n\t" \ + "fcadd z16.f, p5/m, z16.f, z25.f, 270 \n\t" \ + "fcadd z17.f, p5/m, z17.f, z26.f, 270 \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (3) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XM_RECON +#define XM_RECON_A64FXf \ +asm ( \ + "fcadd z6.f, p5/m, z6.f, z21.f, 90 \n\t" \ + "fcadd z7.f, p5/m, z7.f, z22.f, 90 \n\t" \ + "fcadd z8.f, p5/m, z8.f, z23.f, 90 \n\t" \ + "fcadd z9.f, p5/m, z9.f, z18.f, 90 \n\t" \ + "fcadd z10.f, p5/m, z10.f, z19.f, 90 \n\t" \ + "fcadd z11.f, p5/m, z11.f, z20.f, 90 \n\t" \ + "mov z0.f, z18.f \n\t" \ + "mov z1.f, z19.f \n\t" \ + "mov z2.f, z20.f \n\t" \ + "mov z3.f, z21.f \n\t" \ + "mov z4.f, z22.f \n\t" \ + "mov z5.f, z23.f \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YM_PROJ +#define YM_PROJ_A64FXf \ +{ \ +asm ( \ + "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fadd z12.f, p5/m, z12.f, z27.f \n\t" \ + "fadd z13.f, p5/m, z13.f, z28.f \n\t" \ + "fadd z14.f, p5/m, z14.f, z29.f \n\t" \ + "fsub z15.f, p5/m, z15.f, z24.f \n\t" \ + "fsub z16.f, p5/m, z16.f, z25.f \n\t" \ + "fsub z17.f, p5/m, z17.f, z26.f \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (2) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// ZM_PROJ +#define ZM_PROJ_A64FXf \ +{ \ +asm ( \ + "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fcadd z12.f, p5/m, z12.f, z24.f, 270 \n\t" \ + "fcadd z13.f, p5/m, z13.f, z25.f, 270 \n\t" \ + "fcadd z14.f, p5/m, z14.f, z26.f, 270 \n\t" \ + "fcadd z15.f, p5/m, z15.f, z27.f, 90 \n\t" \ + "fcadd z16.f, p5/m, z16.f, z28.f, 90 \n\t" \ + "fcadd z17.f, p5/m, z17.f, 
z29.f, 90 \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (1) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// TM_PROJ +#define TM_PROJ_A64FXf \ +{ \ +asm ( \ + "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fsub z12.f, p5/m, z12.f, z24.f \n\t" \ + "fsub z13.f, p5/m, z13.f, z25.f \n\t" \ + "fsub z14.f, p5/m, z14.f, z26.f \n\t" \ + "fsub z15.f, p5/m, z15.f, z27.f \n\t" \ + "fsub z16.f, p5/m, z16.f, z28.f \n\t" \ + "fsub z17.f, p5/m, z17.f, z29.f \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (0) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XM_RECON_ACCUM +#define XM_RECON_ACCUM_A64FXf \ +asm ( \ + "fcadd z9.f, p5/m, z9.f, z18.f, 90 \n\t" \ + "fcadd z10.f, p5/m, z10.f, z19.f, 90 \n\t" \ + "fcadd z11.f, p5/m, z11.f, z20.f, 90 \n\t" \ + "fcadd z6.f, p5/m, z6.f, z21.f, 90 \n\t" \ + "fcadd z7.f, p5/m, z7.f, z22.f, 90 \n\t" \ + "fcadd z8.f, p5/m, z8.f, z23.f, 90 \n\t" \ + "mov z0.f, z18.f \n\t" \ + "mov z1.f, z19.f \n\t" \ + "mov z2.f, z20.f \n\t" \ + "mov z3.f, z21.f \n\t" \ + "mov z4.f, z22.f \n\t" \ + "mov z5.f, z23.f \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YP_RECON_ACCUM +#define YP_RECON_ACCUM_A64FXf \ +asm ( \ + "fadd z0.f, p5/m, z0.f, z18.f \n\t" \ + "fsub z9.f, p5/m, z9.f, z18.f \n\t" \ + "fadd z1.f, p5/m, z1.f, z19.f \n\t" \ + "fsub z10.f, p5/m, z10.f, z19.f \n\t" \ + "fadd z2.f, p5/m, z2.f, z20.f \n\t" \ + "fsub z11.f, p5/m, z11.f, z20.f \n\t" \ + "fadd z3.f, p5/m, z3.f, z21.f \n\t" \ + "fadd z6.f, p5/m, z6.f, z21.f \n\t" \ + "fadd z4.f, p5/m, z4.f, z22.f \n\t" \ + "fadd z7.f, p5/m, z7.f, z22.f \n\t" \ + "fadd z5.f, p5/m, z5.f, z23.f \n\t" \ + "fadd z8.f, p5/m, z8.f, z23.f \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YM_RECON_ACCUM +#define YM_RECON_ACCUM_A64FXf \ +asm ( \ + "fadd z0.f, p5/m, z0.f, z18.f \n\t" \ + "fadd z9.f, p5/m, z9.f, z18.f \n\t" \ + "fadd z1.f, p5/m, z1.f, z19.f \n\t" \ + "fadd z10.f, p5/m, z10.f, z19.f \n\t" \ + "fadd z2.f, p5/m, z2.f, z20.f \n\t" \ + "fadd z11.f, p5/m, z11.f, z20.f \n\t" \ + "fadd z3.f, p5/m, z3.f, z21.f \n\t" \ + "fsub z6.f, p5/m, z6.f, z21.f \n\t" \ + "fadd z4.f, p5/m, z4.f, z22.f \n\t" \ + "fsub z7.f, p5/m, z7.f, z22.f \n\t" \ + "fadd z5.f, p5/m, z5.f, z23.f \n\t" \ + "fsub z8.f, p5/m, z8.f, z23.f \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// ZP_RECON_ACCUM +#define ZP_RECON_ACCUM_A64FXf \ +asm ( \ + "fcadd z6.f, p5/m, z6.f, z18.f, 270 \n\t" \ + "fadd z0.f, p5/m, z0.f, z18.f \n\t" \ + "fcadd z7.f, p5/m, z7.f, z19.f, 270 \n\t" \ + "fadd z1.f, p5/m, z1.f, z19.f \n\t" \ + "fcadd z8.f, p5/m, z8.f, z20.f, 270 \n\t" \ + "fadd z2.f, p5/m, z2.f, z20.f \n\t" \ + "fcadd z9.f, p5/m, z9.f, z21.f, 90 \n\t" \ + "fadd z3.f, p5/m, z3.f, 
z21.f \n\t" \ + "fcadd z10.f, p5/m, z10.f, z22.f, 90 \n\t" \ + "fadd z4.f, p5/m, z4.f, z22.f \n\t" \ + "fcadd z11.f, p5/m, z11.f, z23.f, 90 \n\t" \ + "fadd z5.f, p5/m, z5.f, z23.f \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// ZM_RECON_ACCUM +#define ZM_RECON_ACCUM_A64FXf \ +asm ( \ + "fcadd z6.f, p5/m, z6.f, z18.f, 90 \n\t" \ + "fadd z0.f, p5/m, z0.f, z18.f \n\t" \ + "fcadd z7.f, p5/m, z7.f, z19.f, 90 \n\t" \ + "fadd z1.f, p5/m, z1.f, z19.f \n\t" \ + "fcadd z8.f, p5/m, z8.f, z20.f, 90 \n\t" \ + "fadd z2.f, p5/m, z2.f, z20.f \n\t" \ + "fcadd z9.f, p5/m, z9.f, z21.f, 270 \n\t" \ + "fadd z3.f, p5/m, z3.f, z21.f \n\t" \ + "fcadd z10.f, p5/m, z10.f, z22.f, 270 \n\t" \ + "fadd z4.f, p5/m, z4.f, z22.f \n\t" \ + "fcadd z11.f, p5/m, z11.f, z23.f, 270 \n\t" \ + "fadd z5.f, p5/m, z5.f, z23.f \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// TP_RECON_ACCUM +#define TP_RECON_ACCUM_A64FXf \ +asm ( \ + "ptrue p5.f \n\t" \ + "fadd z0.f, p5/m, z0.f, z18.f \n\t" \ + "fadd z6.f, p5/m, z6.f, z18.f \n\t" \ + "fadd z1.f, p5/m, z1.f, z19.f \n\t" \ + "fadd z7.f, p5/m, z7.f, z19.f \n\t" \ + "fadd z2.f, p5/m, z2.f, z20.f \n\t" \ + "fadd z8.f, p5/m, z8.f, z20.f \n\t" \ + "fadd z3.f, p5/m, z3.f, z21.f \n\t" \ + "fadd z9.f, p5/m, z9.f, z21.f \n\t" \ + "fadd z4.f, p5/m, z4.f, z22.f \n\t" \ + "fadd z10.f, p5/m, z10.f, z22.f \n\t" \ + "fadd z5.f, p5/m, z5.f, z23.f \n\t" \ + "fadd z11.f, p5/m, z11.f, z23.f \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// TM_RECON_ACCUM +#define TM_RECON_ACCUM_A64FXf \ +asm ( \ + "fadd z0.f, p5/m, z0.f, z18.f \n\t" \ + "fsub z6.f, p5/m, z6.f, z18.f \n\t" \ + "fadd z1.f, p5/m, z1.f, z19.f \n\t" \ + "fsub z7.f, p5/m, z7.f, z19.f \n\t" \ + "fadd z2.f, p5/m, z2.f, z20.f \n\t" \ + "fsub z8.f, p5/m, z8.f, z20.f \n\t" \ + "fadd z3.f, p5/m, z3.f, z21.f \n\t" \ + "fsub z9.f, p5/m, z9.f, z21.f \n\t" \ + "fadd z4.f, p5/m, z4.f, z22.f \n\t" \ + "fsub z10.f, p5/m, z10.f, z22.f \n\t" \ + "fadd z5.f, p5/m, z5.f, z23.f \n\t" \ + "fsub z11.f, p5/m, z11.f, z23.f \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// ZERO_PSI +#define ZERO_PSI_A64FXf \ +asm ( \ + "ptrue p5.f \n\t" \ + "fmov z0.f , 0 \n\t" \ + "fmov z1.f , 0 \n\t" \ + "fmov z2.f , 0 \n\t" \ + "fmov z3.f , 0 \n\t" \ + "fmov z4.f , 0 \n\t" \ + "fmov z5.f , 0 \n\t" \ + "fmov z6.f , 0 \n\t" \ + "fmov z7.f , 0 \n\t" \ + "fmov z8.f , 0 \n\t" \ + "fmov z9.f , 0 \n\t" \ + "fmov z10.f , 0 \n\t" \ + "fmov z11.f , 0 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h new file mode 100644 index 00000000..57c0f978 --- /dev/null +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -0,0 +1,576 @@ 
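+// Intrinsics counterpart of Fujitsu_A64FX_asm_single.h: the same kernel macro
+// interface, expressed with SVE ACLE calls instead of inline assembly.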
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: XXX
+
+    Copyright (C) 2020
+
+Author: Nils Meyer
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/* END LEGAL */
+#define LOAD_CHIMU_A64FXf(x) LOAD_CHIMU_INTERLEAVED_A64FXf(x)
+#define PREFETCH_CHIMU_L1(A)
+#define PREFETCH_GAUGE_L1(A)
+#define PREFETCH_CHIMU_L2(A)
+#define PREFETCH_GAUGE_L2(A)
+#define PF_GAUGE(A)
+#define PREFETCH1_CHIMU(A)
+#define PREFETCH_CHIMU(A)
+#define LOCK_GAUGE(A)
+#define UNLOCK_GAUGE(A)
+#define MASK_REGS DECLARATIONS_A64FXf(A)
+#define COMPLEX_SIGNS(A)
+#define LOAD64(A,B)
+#define SAVE_RESULT(A,B) RESULT_A64FXf(A)
+#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXf(A)
+#define MAYBEPERM(A,perm) if (perm) { A ; }
+#define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
+#define ZERO_PSI ZERO_PSI_A64FXf
+#define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf
+#define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf
+#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf
+#define TP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TP_PROJ_A64FXf
+#define XM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XM_PROJ_A64FXf
+#define YM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YM_PROJ_A64FXf
+#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZM_PROJ_A64FXf
+#define TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf
+#define XP_RECON XP_RECON_A64FXf
+#define XM_RECON XM_RECON_A64FXf
+#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf
+#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf
+#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf
+#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXf
+#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf
+#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf
+#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf
+#define PERMUTE_DIR0 PERM0_A64FXf
+#define PERMUTE_DIR1 PERM1_A64FXf
+#define PERMUTE_DIR2 PERM2_A64FXf
+#define PERMUTE_DIR3 PERM3_A64FXf
+// DECLARATIONS
+#define DECLARATIONS_A64FXf(x) \
+    const uint32_t lut[4][16] = { \
+        {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \
+        {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \
+        {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \
+        {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \
+    svfloat32_t result_00; \
+    svfloat32_t result_01; \
+    svfloat32_t result_02; \
+    svfloat32_t result_10; \
+    svfloat32_t result_11; \
+    svfloat32_t result_12; \
+    svfloat32_t result_20; \
+    svfloat32_t result_21; \
+    svfloat32_t result_22; \
+    svfloat32_t result_30; \
+    svfloat32_t result_31; \
+    svfloat32_t result_32; \
+    svfloat32_t Chi_00; \
+    svfloat32_t Chi_01; \
+    svfloat32_t Chi_02; \
+    svfloat32_t Chi_10; \
+    svfloat32_t Chi_11; \
+    svfloat32_t Chi_12; \
+    svfloat32_t UChi_00; \
+    svfloat32_t UChi_01; \
+    svfloat32_t UChi_02; \
+    svfloat32_t UChi_10; \
+    svfloat32_t UChi_11; \
+    svfloat32_t UChi_12; \
+    svfloat32_t U_00; \
+    svfloat32_t U_10; \
+    svfloat32_t U_20; \
+    svfloat32_t U_01; \
+    svfloat32_t U_11; \
+    svfloat32_t U_21; \
+    svbool_t pg1; \
+    pg1 = svptrue_b32(); \
+    svuint32_t table0; \
+    svfloat32_t zero0; \
+    zero0 = __svzero(zero0);
+
+#define Chimu_00 Chi_00
+#define Chimu_01 Chi_01
+#define Chimu_02 Chi_02
+#define Chimu_10 Chi_10
+#define Chimu_11 Chi_11
+#define Chimu_12 Chi_12
+#define Chimu_20 U_00
+#define Chimu_21 U_10
+#define Chimu_22 U_20
+#define Chimu_30 U_01
+#define Chimu_31 U_11
+#define Chimu_32 U_21
+// RESULT
+#define RESULT_A64FXf(base) \
+{ \
+    svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \
+    svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \
+    svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \
+    svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \
+    svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \
+    svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \
+    svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \
+    svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \
+    svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \
+    svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \
+    svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \
+    svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \
+}
+// PREFETCH_CHIMU_L2 (prefetch to L2)
+#define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \
+{ \
+    svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \
+}
+// PREFETCH_CHIMU_L1 (prefetch to L1)
+#define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \
+{ \
+    svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \
+    svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \
+    svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \
+}
+// PREFETCH_GAUGE_L2 (prefetch to L2)
+#define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \
+{ \
+    const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
+    svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \
+}
+// PREFETCH_GAUGE_L1 (prefetch to L1)
+#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \
+{ \
+    const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
+    svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \
+}
+// LOAD_CHI
+#define LOAD_CHI_A64FXf(base) \
+{ \
+    Chi_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \
+    Chi_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \
+    Chi_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \
+    Chi_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \
+    Chi_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \
+    Chi_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \
+}
+// LOAD_CHIMU
+#define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \
+{ \
+    Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \
+    Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \
+    Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \
+    Chimu_20
= svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ + Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ + Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ + Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ + Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ + Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ + Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \ + Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ + Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ +} +// LOAD_CHIMU_0213 +#define LOAD_CHIMU_0213_A64FXf \ +{ \ + const SiteSpinor & ref(in[offset]); \ + Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ + Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ + Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ + Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ + Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ + Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ + Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \ + Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ + Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ + Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ + Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \ +} +// LOAD_CHIMU_0312 +#define LOAD_CHIMU_0312_A64FXf \ +{ \ + const SiteSpinor & ref(in[offset]); \ + Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ + Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \ + Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ + Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ + Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ + Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \ + Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ + Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ + Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ + Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ + Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ + Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ +} +// PERM0 +#define PERM0_A64FXf \ + table0 = svld1(pg1, (uint32_t*)&lut[0]); \ + Chi_00 = svtbl(Chi_00, table0); \ + Chi_01 = svtbl(Chi_01, table0); \ + Chi_02 = svtbl(Chi_02, table0); \ + Chi_10 = svtbl(Chi_10, table0); \ + Chi_11 = svtbl(Chi_11, table0); \ + Chi_12 = svtbl(Chi_12, table0); + +// PERM1 +#define PERM1_A64FXf \ + table0 = svld1(pg1, (uint32_t*)&lut[1]); \ + Chi_00 = svtbl(Chi_00, table0); \ + Chi_01 = svtbl(Chi_01, table0); \ + Chi_02 = svtbl(Chi_02, table0); \ + Chi_10 = svtbl(Chi_10, table0); \ + Chi_11 = svtbl(Chi_11, table0); \ + Chi_12 = svtbl(Chi_12, table0); + +// PERM2 +#define PERM2_A64FXf \ + table0 = svld1(pg1, (uint32_t*)&lut[2]); \ + Chi_00 = svtbl(Chi_00, table0); \ + Chi_01 = svtbl(Chi_01, table0); \ + Chi_02 = svtbl(Chi_02, table0); \ + Chi_10 = svtbl(Chi_10, table0); \ + Chi_11 = svtbl(Chi_11, table0); \ + Chi_12 = svtbl(Chi_12, table0); + +// PERM3 +#define PERM3_A64FXf \ + table0 = svld1(pg1, (uint32_t*)&lut[3]); \ + Chi_00 = svtbl(Chi_00, table0); \ + Chi_01 = svtbl(Chi_01, table0); \ + Chi_02 = svtbl(Chi_02, 
table0); \ + Chi_10 = svtbl(Chi_10, table0); \ + Chi_11 = svtbl(Chi_11, table0); \ + Chi_12 = svtbl(Chi_12, table0); + +// MULT_2SPIN +#define MULT_2SPIN_A64FXf(A) \ +{ \ + const auto & ref(U[sU][A]); \ + U_00 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + -6 * 64)); \ + U_10 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + -3 * 64)); \ + U_20 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + 0 * 64)); \ + U_01 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + -5 * 64)); \ + U_11 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + -2 * 64)); \ + U_21 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + 1 * 64)); \ + UChi_00 = __svzero(UChi_00); \ + UChi_10 = __svzero(UChi_10); \ + UChi_01 = __svzero(UChi_01); \ + UChi_11 = __svzero(UChi_11); \ + UChi_02 = __svzero(UChi_02); \ + UChi_12 = __svzero(UChi_12); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 0); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 0); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 0); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 0); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 0); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 0); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 90); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 90); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 90); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ + U_00 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + -4 * 64)); \ + U_10 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + -1 * 64)); \ + U_20 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + 2 * 64)); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 0); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 0); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 0); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 90); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 90); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 90); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 90); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 90); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 90); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 0); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 0); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 0); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 0); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 0); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 0); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 90); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 90); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 90); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 90); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 90); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 90); \ +} +// XP_PROJ +#define XP_PROJ_A64FXf \ +{ \ + table0 = svld1(pg1, (uint32_t*)&lut[3]); \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 90); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 90); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 90); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 90); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 90); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 90); \ +} +// XP_RECON +#define XP_RECON_A64FXf \ + result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \ + result_21 = svcadd_x(pg1, 
result_21, UChi_11, 270); \ + result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \ + result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \ + result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \ + result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \ + result_00 = UChi_00; \ + result_01 = UChi_01; \ + result_02 = UChi_02; \ + result_10 = UChi_10; \ + result_11 = UChi_11; \ + result_12 = UChi_12; + +// XP_RECON_ACCUM +#define XP_RECON_ACCUM_A64FXf \ + result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_21 = svcadd_x(pg1, result_21, UChi_11, 270); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \ + result_12 = svadd_x(pg1, result_12, UChi_12); + +// YP_PROJ +#define YP_PROJ_A64FXf \ +{ \ + table0 = svld1(pg1, (uint32_t*)&lut[2]); \ + Chi_00 = svsub_x(pg1, Chimu_00, Chimu_30); \ + Chi_01 = svsub_x(pg1, Chimu_01, Chimu_31); \ + Chi_02 = svsub_x(pg1, Chimu_02, Chimu_32); \ + Chi_10 = svadd_x(pg1, Chimu_10, Chimu_20); \ + Chi_11 = svadd_x(pg1, Chimu_11, Chimu_21); \ + Chi_12 = svadd_x(pg1, Chimu_12, Chimu_22); \ +} +// ZP_PROJ +#define ZP_PROJ_A64FXf \ +{ \ + table0 = svld1(pg1, (uint32_t*)&lut[1]); \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 90); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 90); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 90); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 270); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 270); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 270); \ +} +// TP_PROJ +#define TP_PROJ_A64FXf \ +{ \ + table0 = svld1(pg1, (uint32_t*)&lut[0]); \ + Chi_00 = svadd_x(pg1, Chimu_00, Chimu_20); \ + Chi_01 = svadd_x(pg1, Chimu_01, Chimu_21); \ + Chi_02 = svadd_x(pg1, Chimu_02, Chimu_22); \ + Chi_10 = svadd_x(pg1, Chimu_10, Chimu_30); \ + Chi_11 = svadd_x(pg1, Chimu_11, Chimu_31); \ + Chi_12 = svadd_x(pg1, Chimu_12, Chimu_32); \ +} +// XM_PROJ +#define XM_PROJ_A64FXf \ +{ \ + table0 = svld1(pg1, (uint32_t*)&lut[3]); \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 270); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 270); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 270); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 270); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 270); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 270); \ +} +// XM_RECON +#define XM_RECON_A64FXf \ + result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \ + result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \ + result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \ + result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \ + result_31 = svcadd_x(pg1, result_31, UChi_01, 90); \ + result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \ + result_00 = UChi_00; \ + result_01 = UChi_01; \ + result_02 = UChi_02; \ + result_10 = UChi_10; \ + result_11 = UChi_11; \ + result_12 = UChi_12; + +// YM_PROJ +#define YM_PROJ_A64FXf \ +{ \ + table0 = svld1(pg1, (uint32_t*)&lut[2]); \ + Chi_00 = svadd_x(pg1, Chimu_00, Chimu_30); \ + Chi_01 = svadd_x(pg1, Chimu_01, Chimu_31); \ + Chi_02 = svadd_x(pg1, Chimu_02, Chimu_32); \ + Chi_10 = svsub_x(pg1, Chimu_10, Chimu_20); \ + Chi_11 = svsub_x(pg1, Chimu_11, Chimu_21); \ + Chi_12 = svsub_x(pg1, Chimu_12, Chimu_22); \ +} +// 
ZM_PROJ +#define ZM_PROJ_A64FXf \ +{ \ + table0 = svld1(pg1, (uint32_t*)&lut[1]); \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 270); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 270); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 270); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 90); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 90); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 90); \ +} +// TM_PROJ +#define TM_PROJ_A64FXf \ +{ \ + table0 = svld1(pg1, (uint32_t*)&lut[0]); \ + Chi_00 = svsub_x(pg1, Chimu_00, Chimu_20); \ + Chi_01 = svsub_x(pg1, Chimu_01, Chimu_21); \ + Chi_02 = svsub_x(pg1, Chimu_02, Chimu_22); \ + Chi_10 = svsub_x(pg1, Chimu_10, Chimu_30); \ + Chi_11 = svsub_x(pg1, Chimu_11, Chimu_31); \ + Chi_12 = svsub_x(pg1, Chimu_12, Chimu_32); \ +} +// XM_RECON_ACCUM +#define XM_RECON_ACCUM_A64FXf \ + result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \ + result_31 = svcadd_x(pg1, result_31, UChi_01, 90); \ + result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \ + result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \ + result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \ + result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \ + result_00 = UChi_00; \ + result_01 = UChi_01; \ + result_02 = UChi_02; \ + result_10 = UChi_10; \ + result_11 = UChi_11; \ + result_12 = UChi_12; + +// YP_RECON_ACCUM +#define YP_RECON_ACCUM_A64FXf \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_30 = svsub_x(pg1, result_30, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_31 = svsub_x(pg1, result_31, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_32 = svsub_x(pg1, result_32, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_20 = svadd_x(pg1, result_20, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_21 = svadd_x(pg1, result_21, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_22 = svadd_x(pg1, result_22, UChi_12); + +// YM_RECON_ACCUM +#define YM_RECON_ACCUM_A64FXf \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_30 = svadd_x(pg1, result_30, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_31 = svadd_x(pg1, result_31, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_32 = svadd_x(pg1, result_32, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_20 = svsub_x(pg1, result_20, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_21 = svsub_x(pg1, result_21, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_22 = svsub_x(pg1, result_22, UChi_12); + +// ZP_RECON_ACCUM +#define ZP_RECON_ACCUM_A64FXf \ + result_20 = svcadd_x(pg1, result_20, UChi_00, 270); \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_21 = svcadd_x(pg1, result_21, UChi_01, 270); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_22 = svcadd_x(pg1, result_22, UChi_02, 270); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_30 = svcadd_x(pg1, result_30, UChi_10, 90); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_31 = svcadd_x(pg1, result_31, UChi_11, 90); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_32 = svcadd_x(pg1, result_32, UChi_12, 90); \ + result_12 = svadd_x(pg1, result_12, UChi_12); + +// ZM_RECON_ACCUM +#define ZM_RECON_ACCUM_A64FXf \ + result_20 = svcadd_x(pg1, result_20, UChi_00, 90); \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_21 = svcadd_x(pg1, result_21, UChi_01, 90); \ + result_01 = svadd_x(pg1, result_01, 
UChi_01); \ + result_22 = svcadd_x(pg1, result_22, UChi_02, 90); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_30 = svcadd_x(pg1, result_30, UChi_10, 270); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_31 = svcadd_x(pg1, result_31, UChi_11, 270); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_32 = svcadd_x(pg1, result_32, UChi_12, 270); \ + result_12 = svadd_x(pg1, result_12, UChi_12); + +// TP_RECON_ACCUM +#define TP_RECON_ACCUM_A64FXf \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_20 = svadd_x(pg1, result_20, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_21 = svadd_x(pg1, result_21, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_22 = svadd_x(pg1, result_22, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_30 = svadd_x(pg1, result_30, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_31 = svadd_x(pg1, result_31, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_32 = svadd_x(pg1, result_32, UChi_12); + +// TM_RECON_ACCUM +#define TM_RECON_ACCUM_A64FXf \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_20 = svsub_x(pg1, result_20, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_21 = svsub_x(pg1, result_21, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_22 = svsub_x(pg1, result_22, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_30 = svsub_x(pg1, result_30, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_31 = svsub_x(pg1, result_31, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_32 = svsub_x(pg1, result_32, UChi_12); + +// ZERO_PSI +#define ZERO_PSI_A64FXf \ + result_00 = __svzero(result_00); \ + result_01 = __svzero(result_01); \ + result_02 = __svzero(result_02); \ + result_10 = __svzero(result_10); \ + result_11 = __svzero(result_11); \ + result_12 = __svzero(result_12); \ + result_20 = __svzero(result_20); \ + result_21 = __svzero(result_21); \ + result_22 = __svzero(result_22); \ + result_30 = __svzero(result_30); \ + result_31 = __svzero(result_31); \ + result_32 = __svzero(result_32); + From 1ea85b9972548329310a7aae6e98389606c646a3 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 9 Apr 2020 13:47:21 +0200 Subject: [PATCH 010/147] Disabled build message --- Grid/simd/Grid_vector_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index e4e0832a..61f19a15 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -117,7 +117,7 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { */ #ifdef GEN #if defined(A64FX) // breakout A64FX SVE ACLE here - #pragma message("building for A64FX / SVE ACLE") + //#pragma message("building for A64FX / SVE ACLE") #define ARMCLANGHOTFIX #include "Grid_a64fx-2.h" #else From dd5a22b36b6c9e811c4f689e25714a27f27fd794 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 9 Apr 2020 14:21:27 +0200 Subject: [PATCH 011/147] revised declarations --- Grid/simd/Fujitsu_A64FX_asm_double.h | 4 ++-- Grid/simd/Fujitsu_A64FX_asm_single.h | 4 ++-- Grid/simd/Fujitsu_A64FX_intrin_double.h | 4 ++-- Grid/simd/Fujitsu_A64FX_intrin_single.h | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h index 57636961..86ebba68 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_double.h +++ 
b/Grid/simd/Fujitsu_A64FX_asm_double.h @@ -35,7 +35,7 @@ Author: Nils Meyer #define PREFETCH_CHIMU(A) #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) -#define MASK_REGS DECLARATIONS_A64FXd(A) +#define MASK_REGS DECLARATIONS_A64FXd #define COMPLEX_SIGNS(A) #define LOAD64(A,B) #define SAVE_RESULT(A,B) RESULT_A64FXd(A) @@ -65,7 +65,7 @@ Author: Nils Meyer #define PERMUTE_DIR2 PERM2_A64FXd #define PERMUTE_DIR3 PERM3_A64FXd // DECLARATIONS -#define DECLARATIONS_A64FXd(x) \ +#define DECLARATIONS_A64FXd \ const uint64_t lut[4][8] = { \ {4, 5, 6, 7, 0, 1, 2, 3}, \ {2, 3, 0, 1, 6, 7, 4, 5}, \ diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h index e2d2f8ef..c846d085 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ b/Grid/simd/Fujitsu_A64FX_asm_single.h @@ -35,7 +35,7 @@ Author: Nils Meyer #define PREFETCH_CHIMU(A) #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) -#define MASK_REGS DECLARATIONS_A64FXf(A) +#define MASK_REGS DECLARATIONS_A64FXf #define COMPLEX_SIGNS(A) #define LOAD64(A,B) #define SAVE_RESULT(A,B) RESULT_A64FXf(A) @@ -65,7 +65,7 @@ Author: Nils Meyer #define PERMUTE_DIR2 PERM2_A64FXf #define PERMUTE_DIR3 PERM3_A64FXf // DECLARATIONS -#define DECLARATIONS_A64FXf(x) \ +#define DECLARATIONS_A64FXf \ const uint32_t lut[4][8] = { \ {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h index 1bafc114..c3bb2fcc 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_double.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -35,7 +35,7 @@ Author: Nils Meyer #define PREFETCH_CHIMU(A) #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) -#define MASK_REGS DECLARATIONS_A64FXd(A) +#define MASK_REGS DECLARATIONS_A64FXd #define COMPLEX_SIGNS(A) #define LOAD64(A,B) #define SAVE_RESULT(A,B) RESULT_A64FXd(A) @@ -65,7 +65,7 @@ Author: Nils Meyer #define PERMUTE_DIR2 PERM2_A64FXd #define PERMUTE_DIR3 PERM3_A64FXd // DECLARATIONS -#define DECLARATIONS_A64FXd(x) \ +#define DECLARATIONS_A64FXd \ const uint64_t lut[4][8] = { \ {4, 5, 6, 7, 0, 1, 2, 3}, \ {2, 3, 0, 1, 6, 7, 4, 5}, \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h index 57c0f978..858d287a 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_single.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -35,7 +35,7 @@ Author: Nils Meyer #define PREFETCH_CHIMU(A) #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) -#define MASK_REGS DECLARATIONS_A64FXf(A) +#define MASK_REGS DECLARATIONS_A64FXf #define COMPLEX_SIGNS(A) #define LOAD64(A,B) #define SAVE_RESULT(A,B) RESULT_A64FXf(A) @@ -65,7 +65,7 @@ Author: Nils Meyer #define PERMUTE_DIR2 PERM2_A64FXf #define PERMUTE_DIR3 PERM3_A64FXf // DECLARATIONS -#define DECLARATIONS_A64FXf(x) \ +#define DECLARATIONS_A64FXf \ const uint32_t lut[4][8] = { \ {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ From bb46ba9b5fb39e5b08893de5978e4be61a75e4e1 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 9 Apr 2020 14:28:45 +0200 Subject: [PATCH 012/147] fixed array size in single --- Grid/simd/Fujitsu_A64FX_asm_single.h | 2 +- Grid/simd/Fujitsu_A64FX_intrin_single.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h index c846d085..8d995c1e 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ b/Grid/simd/Fujitsu_A64FX_asm_single.h @@ -66,7 +66,7 @@ 
Author: Nils Meyer #define PERMUTE_DIR3 PERM3_A64FXf // DECLARATIONS #define DECLARATIONS_A64FXf \ - const uint32_t lut[4][8] = { \ + const uint32_t lut[4][16] = { \ {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h index 858d287a..e4aeb420 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_single.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -66,7 +66,7 @@ Author: Nils Meyer #define PERMUTE_DIR3 PERM3_A64FXf // DECLARATIONS #define DECLARATIONS_A64FXf \ - const uint32_t lut[4][8] = { \ + const uint32_t lut[4][16] = { \ {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \ From 9f224a1647916c6e39aec282e8f6bf41380cac47 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 9 Apr 2020 14:30:21 +0200 Subject: [PATCH 013/147] fixed typo in single --- Grid/simd/Fujitsu_A64FX_asm_single.h | 2 +- Grid/simd/Fujitsu_A64FX_intrin_single.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h index 8d995c1e..ea869777 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ b/Grid/simd/Fujitsu_A64FX_asm_single.h @@ -70,7 +70,7 @@ Author: Nils Meyer {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \ - {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} } \ + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \ asm ( \ "fmov z31.f , 0 \n\t" \ : \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h index e4aeb420..b80c71ec 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_single.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -70,7 +70,7 @@ Author: Nils Meyer {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \ - {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} } \ + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \ svfloat32_t result_00; \ svfloat32_t result_01; \ svfloat32_t result_02; \ From 326de3646724182b1249325b8931c1eac9f53910 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 9 Apr 2020 14:44:25 +0200 Subject: [PATCH 014/147] revised sU addressing scheme --- Grid/simd/Fujitsu_A64FX_asm_double.h | 2 +- Grid/simd/Fujitsu_A64FX_asm_single.h | 2 +- Grid/simd/Fujitsu_A64FX_intrin_double.h | 2 +- Grid/simd/Fujitsu_A64FX_intrin_single.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h index 86ebba68..4ab27919 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_double.h +++ b/Grid/simd/Fujitsu_A64FX_asm_double.h @@ -283,7 +283,7 @@ asm ( \ // MULT_2SPIN #define MULT_2SPIN_A64FXd(A) \ { \ - const auto & ref(U[sU][A]); \ + const auto & ref(U[sU](A)); \ asm ( \ "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h index ea869777..d2ba4176 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ b/Grid/simd/Fujitsu_A64FX_asm_single.h @@ -295,7 +295,7 @@ asm ( \ 
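 // Hedged editorial sketch, not part of this patch: switching from
 // U[sU][A] to U[sU](A) presumably selects Grid's internal-layout
 // accessor, so that &ref can be taken as a plain byte address. The
 // gauge link for direction A is a 3x3 complex matrix of 64-byte SVE
 // vectors stored row-major, and the anchor used below, baseU + 2*3*64
 // (i.e. &ref[2][0]), sits 6 vectors past element (0,0); the 'mul vl'
 // offsets therefore resolve element (j,k) as in this hypothetical helper:
 static inline long U_elem_offset_bytes(int j, int k)
 {
   const long VL = 64;             // assumed SVE vector length in bytes
   return (3L * j + k - 6L) * VL;  // U_00 -> -6*64, U_10 -> -3*64, ..., U_22 -> +2*64
 }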
// MULT_2SPIN #define MULT_2SPIN_A64FXf(A) \ { \ - const auto & ref(U[sU][A]); \ + const auto & ref(U[sU](A)); \ asm ( \ "ld1d { z24.f }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ "ld1d { z25.f }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h index c3bb2fcc..edfc7ab1 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_double.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -261,7 +261,7 @@ Author: Nils Meyer // MULT_2SPIN #define MULT_2SPIN_A64FXd(A) \ { \ - const auto & ref(U[sU][A]); \ + const auto & ref(U[sU](A)); \ U_00 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -6 * 64)); \ U_10 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -3 * 64)); \ U_20 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + 0 * 64)); \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h index b80c71ec..26a04dfd 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_single.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -268,7 +268,7 @@ Author: Nils Meyer // MULT_2SPIN #define MULT_2SPIN_A64FXf(A) \ { \ - const auto & ref(U[sU][A]); \ + const auto & ref(U[sU](A)); \ U_00 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + -6 * 64)); \ U_10 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + -3 * 64)); \ U_20 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + 0 * 64)); \ From b140c6a4f90c53a206f6b7d46c3ed609d43643db Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 9 Apr 2020 15:01:15 +0200 Subject: [PATCH 015/147] addressing --- Grid/simd/Fujitsu_A64FX_asm_double.h | 4 ++-- Grid/simd/Fujitsu_A64FX_asm_single.h | 4 ++-- Grid/simd/Fujitsu_A64FX_intrin_double.h | 20 ++++++++++---------- Grid/simd/Fujitsu_A64FX_intrin_single.h | 20 ++++++++++---------- 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h index 4ab27919..0a65294d 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_double.h +++ b/Grid/simd/Fujitsu_A64FX_asm_double.h @@ -283,7 +283,7 @@ asm ( \ // MULT_2SPIN #define MULT_2SPIN_A64FXd(A) \ { \ - const auto & ref(U[sU](A)); \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ asm ( \ "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ @@ -337,7 +337,7 @@ asm ( \ "fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \ "fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \ : \ - : [fetchptr] "r" ((uint64_t)&ref[2][0]) \ + : [fetchptr] "r" (base + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ ); \ } diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h index d2ba4176..7e58a9d3 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ b/Grid/simd/Fujitsu_A64FX_asm_single.h @@ -295,7 +295,7 @@ asm ( \ // MULT_2SPIN #define MULT_2SPIN_A64FXf(A) \ { \ - const auto & ref(U[sU](A)); \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ asm ( \ "ld1d { z24.f }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ "ld1d { z25.f }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ @@ -349,7 +349,7 @@ asm ( \ "fcmla z20.f, p5/m, z26.f, z14.f, 90 \n\t" \ "fcmla z23.f, p5/m, z26.f, z17.f, 90 \n\t" \ : \ - : [fetchptr] "r" ((uint64_t)&ref[2][0]) \ + : [fetchptr] "r" (base + 2 * 3 * 64) \ : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ ); \ } diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h index edfc7ab1..9cf33c23 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_double.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -261,13 +261,13 @@ Author: Nils Meyer // MULT_2SPIN #define MULT_2SPIN_A64FXd(A) \ { \ - const auto & ref(U[sU](A)); \ - U_00 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -6 * 64)); \ - U_10 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -3 * 64)); \ - U_20 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + 0 * 64)); \ - U_01 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -5 * 64)); \ - U_11 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -2 * 64)); \ - U_21 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + 1 * 64)); \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + U_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ + U_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ + U_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ + U_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ + U_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ + U_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ UChi_00 = __svzero(UChi_00); \ UChi_10 = __svzero(UChi_10); \ UChi_01 = __svzero(UChi_01); \ @@ -286,9 +286,9 @@ Author: Nils Meyer UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ - U_00 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -4 * 64)); \ - U_10 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -1 * 64)); \ - U_20 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + 2 * 64)); \ + U_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ + U_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ + U_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \ UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \ UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h index 26a04dfd..2728f507 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_single.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -268,13 +268,13 @@ Author: Nils Meyer // MULT_2SPIN #define MULT_2SPIN_A64FXf(A) \ { \ - const auto & ref(U[sU](A)); \ - U_00 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + -6 * 64)); \ - U_10 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + -3 * 64)); \ - U_20 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + 0 * 64)); \ - U_01 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + -5 * 64)); \ - U_11 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + -2 * 64)); \ - U_21 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + 1 * 64)); \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + U_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ + U_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ + U_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ + U_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ + U_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ + U_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ UChi_00 = __svzero(UChi_00); \ UChi_10 = __svzero(UChi_10); \ 
UChi_01 = __svzero(UChi_01); \ @@ -293,9 +293,9 @@ Author: Nils Meyer UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ - U_00 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + -4 * 64)); \ - U_10 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + -1 * 64)); \ - U_20 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + 2 * 64)); \ + U_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ + U_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ + U_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \ UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \ UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \ From e252c1aca3f924e6ee60cc598226d25c3d39ce5e Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 9 Apr 2020 15:03:12 +0200 Subject: [PATCH 016/147] addressing --- Grid/simd/Fujitsu_A64FX_asm_double.h | 2 +- Grid/simd/Fujitsu_A64FX_asm_single.h | 2 +- Grid/simd/Fujitsu_A64FX_intrin_double.h | 18 +++++++++--------- Grid/simd/Fujitsu_A64FX_intrin_single.h | 18 +++++++++--------- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h index 0a65294d..bd9ebe5d 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_double.h +++ b/Grid/simd/Fujitsu_A64FX_asm_double.h @@ -337,7 +337,7 @@ asm ( \ "fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \ "fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \ : \ - : [fetchptr] "r" (base + 2 * 3 * 64) \ + : [fetchptr] "r" (baseU + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ ); \ } diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h index 7e58a9d3..2ece4299 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ b/Grid/simd/Fujitsu_A64FX_asm_single.h @@ -349,7 +349,7 @@ asm ( \ "fcmla z20.f, p5/m, z26.f, z14.f, 90 \n\t" \ "fcmla z23.f, p5/m, z26.f, z17.f, 90 \n\t" \ : \ - : [fetchptr] "r" (base + 2 * 3 * 64) \ + : [fetchptr] "r" (baseU + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ ); \ } diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h index 9cf33c23..4b85563c 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_double.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -262,12 +262,12 @@ Author: Nils Meyer #define MULT_2SPIN_A64FXd(A) \ { \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ - U_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - U_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - U_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - U_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - U_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - U_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ + U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ + U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ + U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ + U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ + U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 
64)); \ + U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ UChi_00 = __svzero(UChi_00); \ UChi_10 = __svzero(UChi_10); \ UChi_01 = __svzero(UChi_01); \ @@ -286,9 +286,9 @@ Author: Nils Meyer UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ - U_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - U_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - U_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \ + U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \ + U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \ UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \ UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \ UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h index 2728f507..7f8132e8 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_single.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -269,12 +269,12 @@ Author: Nils Meyer #define MULT_2SPIN_A64FXf(A) \ { \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ - U_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - U_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - U_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - U_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - U_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - U_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ + U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ + U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ + U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ + U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ + U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ + U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ UChi_00 = __svzero(UChi_00); \ UChi_10 = __svzero(UChi_10); \ UChi_01 = __svzero(UChi_01); \ @@ -293,9 +293,9 @@ Author: Nils Meyer UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ - U_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - U_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - U_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \ + U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \ + U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \ UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \ UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \ UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \ From b367cbd422fd22618a4497c394beb50324d58795 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 9 Apr 2020 15:08:45 +0200 Subject: [PATCH 017/147] defined ADD_RESULT --- Grid/Fujitsu_A64FX_asm_double.h | 692 ++++++++++++++++++++++++++++ Grid/Fujitsu_A64FX_asm_single.h | 706 +++++++++++++++++++++++++++++ Grid/Fujitsu_A64FX_intrin_double.h | 568 +++++++++++++++++++++++ Grid/Fujitsu_A64FX_intrin_single.h | 577 +++++++++++++++++++++++ Grid/Fujitsu_A64FX_undef.h | 69 +++ 5 files changed, 2612 insertions(+) create mode 100644 Grid/Fujitsu_A64FX_asm_double.h create mode 100644 Grid/Fujitsu_A64FX_asm_single.h create mode 
100644 Grid/Fujitsu_A64FX_intrin_double.h create mode 100644 Grid/Fujitsu_A64FX_intrin_single.h create mode 100644 Grid/Fujitsu_A64FX_undef.h diff --git a/Grid/Fujitsu_A64FX_asm_double.h b/Grid/Fujitsu_A64FX_asm_double.h new file mode 100644 index 00000000..4da8b3fe --- /dev/null +++ b/Grid/Fujitsu_A64FX_asm_double.h @@ -0,0 +1,692 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: XXX + + Copyright (C) 2020 + +Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#define LOAD_CHIMU_A64FXd(x) LOAD_CHIMU_INTERLEAVED_A64FXd(x) +#define PREFETCH_CHIMU_L1(A) +#define PREFETCH_GAUGE_L1(A) +#define PREFETCH_CHIMU_L2(A) +#define PREFETCH_GAUGE_L2(A) +#define PF_GAUGE(A) +#define PREFETCH1_CHIMU(A) +#define PREFETCH_CHIMU(A) +#define LOCK_GAUGE(A) +#define UNLOCK_GAUGE(A) +#define MASK_REGS DECLARATIONS_A64FXd +#define COMPLEX_SIGNS(A) +#define LOAD64(A,B) +#define SAVE_RESULT(A,B) RESULT_A64FXd(A) +#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXd(A) +#define MAYBEPERM(A,perm) if (perm) { A ; } +#define LOAD_CHI(base) LOAD_CHI_A64FXd(base) +#define ZERO_PSI ZERO_PSI_A64FXd +#define ADD_RESULT(A,B) +#define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd +#define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd +#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd +#define TP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TP_PROJ_A64FXd +#define XM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XM_PROJ_A64FXd +#define YM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YM_PROJ_A64FXd +#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZM_PROJ_A64FXd +#define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd +#define XP_RECON XP_RECON_A64FXd +#define XM_RECON XM_RECON_A64FXd +#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd +#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd +#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd +#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXd +#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd +#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd +#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd +#define PERMUTE_DIR0 PERM0_A64FXd +#define PERMUTE_DIR1 PERM1_A64FXd +#define PERMUTE_DIR2 PERM2_A64FXd +#define PERMUTE_DIR3 PERM3_A64FXd +// DECLARATIONS +#define DECLARATIONS_A64FXd \ + const uint64_t lut[4][8] = { \ + {4, 5, 6, 7, 0, 1, 2, 3}, \ + {2, 3, 0, 1, 6, 7, 4, 5}, \ + {1, 0, 3, 2, 5, 4, 7, 6}, \ + {0, 1, 2, 4, 5, 6, 7, 8} };\ +asm ( \ + "fmov z31.d , 0 \n\t" \ + : \ + : \ + : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// RESULT +#define RESULT_A64FXd(base) \ +{ \ +asm ( \ + "stnt1d { z0.d }, p5, [%[storeptr], -6, mul vl] \n\t" \ + "stnt1d { z1.d }, p5, [%[storeptr], -5, mul vl] \n\t" \ + "stnt1d { z2.d }, p5, [%[storeptr], -4, mul vl] \n\t" \ + "stnt1d { z3.d }, p5, [%[storeptr], -3, mul vl] \n\t" \ + "stnt1d { z4.d }, p5, [%[storeptr], -2, mul vl] \n\t" \ + "stnt1d { z5.d }, p5, [%[storeptr], -1, mul vl] \n\t" \ + "stnt1d { z6.d }, p5, [%[storeptr], 0, mul vl] \n\t" \ + "stnt1d { z7.d }, p5, [%[storeptr], 1, mul vl] \n\t" \ + "stnt1d { z8.d }, p5, [%[storeptr], 2, mul vl] \n\t" \ + "stnt1d { z9.d }, p5, [%[storeptr], 3, mul vl] \n\t" \ + "stnt1d { z10.d }, p5, [%[storeptr], 4, mul vl] \n\t" \ + "stnt1d { z11.d }, p5, [%[storeptr], 5, mul vl] \n\t" \ + : \ + : [storeptr] "r" (base + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PREFETCH_CHIMU_L2 (prefetch to L2) +#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ +{ \ +asm ( \ + "prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ + : \ + : [fetchptr] "r" (base) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PREFETCH_CHIMU_L1 (prefetch to L1) +#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \ +{ \ +asm ( \ + "prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ + : \ + : [fetchptr] "r" (base) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PREFETCH_GAUGE_L2 (prefetch to L2) +#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ +{ \ + const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \ +asm ( \ + "prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ + : \ + : [fetchptr] "r" (baseU) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PREFETCH_GAUGE_L1 (prefetch to L1) +#define PREFETCH_GAUGE_L1_INTERNAL(A)_A64FXd \ +{ \ + const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \ +asm ( \ + "prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ + : \ + : [fetchptr] "r" (baseU) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHI +#define LOAD_CHI_A64FXd(base) \ +{ \ +asm ( \ + "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" 
\ + "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHIMU +#define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \ +{ \ +asm ( \ + "ptrue p5.d \n\t" \ + "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z27.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z24.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z28.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z25.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z29.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHIMU_0213 +#define LOAD_CHIMU_0213_A64FXd \ +{ \ + const SiteSpinor & ref(in[offset]); \ +asm ( \ + "ptrue p5.d \n\t" \ + "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z24.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z25.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z27.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z28.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z29.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (&ref[2][0]) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHIMU_0312 +#define LOAD_CHIMU_0312_A64FXd \ +{ \ + const SiteSpinor & ref(in[offset]); \ +asm ( \ + "ptrue p5.d \n\t" \ + "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z27.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z28.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z29.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z24.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z25.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + : \ + : 
[fetchptr] "r" (&ref[2][0]) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PERM0 +#define PERM0_A64FXd \ +asm ( \ + "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "tbl z12.d, { z12.d }, z30.d \n\t" \ + "tbl z13.d, { z13.d }, z30.d \n\t" \ + "tbl z14.d, { z14.d }, z30.d \n\t" \ + "tbl z15.d, { z15.d }, z30.d \n\t" \ + "tbl z16.d, { z16.d }, z30.d \n\t" \ + "tbl z17.d, { z17.d }, z30.d \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (0) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// PERM1 +#define PERM1_A64FXd \ +asm ( \ + "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "tbl z12.d, { z12.d }, z30.d \n\t" \ + "tbl z13.d, { z13.d }, z30.d \n\t" \ + "tbl z14.d, { z14.d }, z30.d \n\t" \ + "tbl z15.d, { z15.d }, z30.d \n\t" \ + "tbl z16.d, { z16.d }, z30.d \n\t" \ + "tbl z17.d, { z17.d }, z30.d \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (1) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// PERM2 +#define PERM2_A64FXd \ +asm ( \ + "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "tbl z12.d, { z12.d }, z30.d \n\t" \ + "tbl z13.d, { z13.d }, z30.d \n\t" \ + "tbl z14.d, { z14.d }, z30.d \n\t" \ + "tbl z15.d, { z15.d }, z30.d \n\t" \ + "tbl z16.d, { z16.d }, z30.d \n\t" \ + "tbl z17.d, { z17.d }, z30.d \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (2) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// PERM3 +#define PERM3_A64FXd + +// MULT_2SPIN +#define MULT_2SPIN_A64FXd(A) \ +{ \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +asm ( \ + "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z27.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "fmov z18.d , 0 \n\t" \ + "fmov z21.d , 0 \n\t" \ + "fmov z19.d , 0 \n\t" \ + "fmov z22.d , 0 \n\t" \ + "fmov z20.d , 0 \n\t" \ + "fmov z23.d , 0 \n\t" \ + "fcmla z18.d, p5/m, z24.d, z12.d, 0 \n\t" \ + "fcmla z21.d, p5/m, z24.d, z15.d, 0 \n\t" \ + "fcmla z19.d, p5/m, z25.d, z12.d, 0 \n\t" \ + "fcmla z22.d, p5/m, z25.d, z15.d, 0 \n\t" \ + "fcmla z20.d, p5/m, z26.d, z12.d, 0 \n\t" \ + "fcmla z23.d, p5/m, z26.d, z15.d, 0 \n\t" \ + "fcmla z18.d, p5/m, z24.d, z12.d, 90 \n\t" \ + "fcmla z21.d, p5/m, z24.d, z15.d, 90 \n\t" \ + "fcmla z19.d, p5/m, z25.d, z12.d, 90 \n\t" \ + "fcmla z22.d, p5/m, z25.d, z15.d, 90 \n\t" \ + "fcmla z20.d, p5/m, z26.d, z12.d, 90 \n\t" \ + "fcmla z23.d, p5/m, z26.d, z15.d, 90 \n\t" \ + "ld1d { z24.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z25.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "fcmla z18.d, p5/m, z27.d, z13.d, 0 \n\t" \ + "fcmla z21.d, p5/m, z27.d, z16.d, 
0 \n\t" \ + "fcmla z19.d, p5/m, z28.d, z13.d, 0 \n\t" \ + "fcmla z22.d, p5/m, z28.d, z16.d, 0 \n\t" \ + "fcmla z20.d, p5/m, z29.d, z13.d, 0 \n\t" \ + "fcmla z23.d, p5/m, z29.d, z16.d, 0 \n\t" \ + "fcmla z18.d, p5/m, z27.d, z13.d, 90 \n\t" \ + "fcmla z21.d, p5/m, z27.d, z16.d, 90 \n\t" \ + "fcmla z19.d, p5/m, z28.d, z13.d, 90 \n\t" \ + "fcmla z22.d, p5/m, z28.d, z16.d, 90 \n\t" \ + "fcmla z20.d, p5/m, z29.d, z13.d, 90 \n\t" \ + "fcmla z23.d, p5/m, z29.d, z16.d, 90 \n\t" \ + "fcmla z18.d, p5/m, z24.d, z14.d, 0 \n\t" \ + "fcmla z21.d, p5/m, z24.d, z17.d, 0 \n\t" \ + "fcmla z19.d, p5/m, z25.d, z14.d, 0 \n\t" \ + "fcmla z22.d, p5/m, z25.d, z17.d, 0 \n\t" \ + "fcmla z20.d, p5/m, z26.d, z14.d, 0 \n\t" \ + "fcmla z23.d, p5/m, z26.d, z17.d, 0 \n\t" \ + "fcmla z18.d, p5/m, z24.d, z14.d, 90 \n\t" \ + "fcmla z21.d, p5/m, z24.d, z17.d, 90 \n\t" \ + "fcmla z19.d, p5/m, z25.d, z14.d, 90 \n\t" \ + "fcmla z22.d, p5/m, z25.d, z17.d, 90 \n\t" \ + "fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \ + "fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \ + : \ + : [fetchptr] "r" (baseU + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// XP_PROJ +#define XP_PROJ_A64FXd \ +{ \ +asm ( \ + "fcadd z12.d, p5/m, z12.d, z27.d, 90 \n\t" \ + "fcadd z13.d, p5/m, z13.d, z28.d, 90 \n\t" \ + "fcadd z14.d, p5/m, z14.d, z29.d, 90 \n\t" \ + "fcadd z15.d, p5/m, z15.d, z24.d, 90 \n\t" \ + "fcadd z16.d, p5/m, z16.d, z25.d, 90 \n\t" \ + "fcadd z17.d, p5/m, z17.d, z26.d, 90 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XP_RECON +#define XP_RECON_A64FXd \ +asm ( \ + "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \ + "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \ + "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \ + "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \ + "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \ + "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \ + "mov z0.d, z18.d \n\t" \ + "mov z1.d, z19.d \n\t" \ + "mov z2.d, z20.d \n\t" \ + "mov z3.d, z21.d \n\t" \ + "mov z4.d, z22.d \n\t" \ + "mov z5.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// XP_RECON_ACCUM +#define XP_RECON_ACCUM_A64FXd \ +asm ( \ + "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YP_PROJ +#define YP_PROJ_A64FXd \ +{ \ +asm ( \ + "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fsub z12.d, p5/m, z12.d, z27.d \n\t" \ + "fsub z13.d, p5/m, z13.d, z28.d \n\t" 
\ + "fsub z14.d, p5/m, z14.d, z29.d \n\t" \ + "fadd z15.d, p5/m, z15.d, z24.d \n\t" \ + "fadd z16.d, p5/m, z16.d, z25.d \n\t" \ + "fadd z17.d, p5/m, z17.d, z26.d \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (2) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// ZP_PROJ +#define ZP_PROJ_A64FXd \ +{ \ +asm ( \ + "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fcadd z12.d, p5/m, z12.d, z24.d, 90 \n\t" \ + "fcadd z13.d, p5/m, z13.d, z25.d, 90 \n\t" \ + "fcadd z14.d, p5/m, z14.d, z26.d, 90 \n\t" \ + "fcadd z15.d, p5/m, z15.d, z27.d, 270 \n\t" \ + "fcadd z16.d, p5/m, z16.d, z28.d, 270 \n\t" \ + "fcadd z17.d, p5/m, z17.d, z29.d, 270 \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (1) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// TP_PROJ +#define TP_PROJ_A64FXd \ +{ \ +asm ( \ + "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fadd z12.d, p5/m, z12.d, z24.d \n\t" \ + "fadd z13.d, p5/m, z13.d, z25.d \n\t" \ + "fadd z14.d, p5/m, z14.d, z26.d \n\t" \ + "fadd z15.d, p5/m, z15.d, z27.d \n\t" \ + "fadd z16.d, p5/m, z16.d, z28.d \n\t" \ + "fadd z17.d, p5/m, z17.d, z29.d \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (0) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XM_PROJ +#define XM_PROJ_A64FXd \ +{ \ +asm ( \ + "fcadd z12.d, p5/m, z12.d, z27.d, 270 \n\t" \ + "fcadd z13.d, p5/m, z13.d, z28.d, 270 \n\t" \ + "fcadd z14.d, p5/m, z14.d, z29.d, 270 \n\t" \ + "fcadd z15.d, p5/m, z15.d, z24.d, 270 \n\t" \ + "fcadd z16.d, p5/m, z16.d, z25.d, 270 \n\t" \ + "fcadd z17.d, p5/m, z17.d, z26.d, 270 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XM_RECON +#define XM_RECON_A64FXd \ +asm ( \ + "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \ + "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \ + "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \ + "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \ + "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \ + "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \ + "mov z0.d, z18.d \n\t" \ + "mov z1.d, z19.d \n\t" \ + "mov z2.d, z20.d \n\t" \ + "mov z3.d, z21.d \n\t" \ + "mov z4.d, z22.d \n\t" \ + "mov z5.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YM_PROJ +#define YM_PROJ_A64FXd \ +{ \ +asm ( \ + "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fadd z12.d, p5/m, z12.d, z27.d \n\t" \ + "fadd z13.d, p5/m, z13.d, z28.d \n\t" \ + "fadd z14.d, p5/m, z14.d, z29.d \n\t" \ + "fsub z15.d, p5/m, z15.d, z24.d \n\t" \ + "fsub z16.d, p5/m, z16.d, z25.d \n\t" \ + "fsub z17.d, p5/m, z17.d, z26.d \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (2) \ + : 
"memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// ZM_PROJ +#define ZM_PROJ_A64FXd \ +{ \ +asm ( \ + "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fcadd z12.d, p5/m, z12.d, z24.d, 270 \n\t" \ + "fcadd z13.d, p5/m, z13.d, z25.d, 270 \n\t" \ + "fcadd z14.d, p5/m, z14.d, z26.d, 270 \n\t" \ + "fcadd z15.d, p5/m, z15.d, z27.d, 90 \n\t" \ + "fcadd z16.d, p5/m, z16.d, z28.d, 90 \n\t" \ + "fcadd z17.d, p5/m, z17.d, z29.d, 90 \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (1) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// TM_PROJ +#define TM_PROJ_A64FXd \ +{ \ +asm ( \ + "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fsub z12.d, p5/m, z12.d, z24.d \n\t" \ + "fsub z13.d, p5/m, z13.d, z25.d \n\t" \ + "fsub z14.d, p5/m, z14.d, z26.d \n\t" \ + "fsub z15.d, p5/m, z15.d, z27.d \n\t" \ + "fsub z16.d, p5/m, z16.d, z28.d \n\t" \ + "fsub z17.d, p5/m, z17.d, z29.d \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (0) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XM_RECON_ACCUM +#define XM_RECON_ACCUM_A64FXd \ +asm ( \ + "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \ + "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \ + "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \ + "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \ + "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \ + "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \ + "mov z0.d, z18.d \n\t" \ + "mov z1.d, z19.d \n\t" \ + "mov z2.d, z20.d \n\t" \ + "mov z3.d, z21.d \n\t" \ + "mov z4.d, z22.d \n\t" \ + "mov z5.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YP_RECON_ACCUM +#define YP_RECON_ACCUM_A64FXd \ +asm ( \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fsub z9.d, p5/m, z9.d, z18.d \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fsub z10.d, p5/m, z10.d, z19.d \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fsub z11.d, p5/m, z11.d, z20.d \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fadd z6.d, p5/m, z6.d, z21.d \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fadd z7.d, p5/m, z7.d, z22.d \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + "fadd z8.d, p5/m, z8.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YM_RECON_ACCUM +#define YM_RECON_ACCUM_A64FXd \ +asm ( \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fadd z9.d, p5/m, z9.d, z18.d \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fadd z10.d, p5/m, z10.d, z19.d \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fadd z11.d, p5/m, z11.d, z20.d \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fsub z6.d, p5/m, z6.d, z21.d \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fsub z7.d, p5/m, z7.d, z22.d \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + "fsub z8.d, p5/m, z8.d, z23.d \n\t" \ + : \ + : \ + 
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// ZP_RECON_ACCUM +#define ZP_RECON_ACCUM_A64FXd \ +asm ( \ + "fcadd z6.d, p5/m, z6.d, z18.d, 270 \n\t" \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fcadd z7.d, p5/m, z7.d, z19.d, 270 \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fcadd z8.d, p5/m, z8.d, z20.d, 270 \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fcadd z9.d, p5/m, z9.d, z21.d, 90 \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fcadd z10.d, p5/m, z10.d, z22.d, 90 \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fcadd z11.d, p5/m, z11.d, z23.d, 90 \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// ZM_RECON_ACCUM +#define ZM_RECON_ACCUM_A64FXd \ +asm ( \ + "fcadd z6.d, p5/m, z6.d, z18.d, 90 \n\t" \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fcadd z7.d, p5/m, z7.d, z19.d, 90 \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fcadd z8.d, p5/m, z8.d, z20.d, 90 \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fcadd z9.d, p5/m, z9.d, z21.d, 270 \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fcadd z10.d, p5/m, z10.d, z22.d, 270 \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fcadd z11.d, p5/m, z11.d, z23.d, 270 \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// TP_RECON_ACCUM +#define TP_RECON_ACCUM_A64FXd \ +asm ( \ + "ptrue p5.d \n\t" \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fadd z6.d, p5/m, z6.d, z18.d \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fadd z7.d, p5/m, z7.d, z19.d \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fadd z8.d, p5/m, z8.d, z20.d \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fadd z9.d, p5/m, z9.d, z21.d \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fadd z10.d, p5/m, z10.d, z22.d \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + "fadd z11.d, p5/m, z11.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// TM_RECON_ACCUM +#define TM_RECON_ACCUM_A64FXd \ +asm ( \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fsub z6.d, p5/m, z6.d, z18.d \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fsub z7.d, p5/m, z7.d, z19.d \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fsub z8.d, p5/m, z8.d, z20.d \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fsub z9.d, p5/m, z9.d, z21.d \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fsub z10.d, p5/m, z10.d, z22.d \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + "fsub z11.d, p5/m, z11.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// ZERO_PSI +#define ZERO_PSI_A64FXd \ +asm ( \ + "ptrue p5.d \n\t" \ + "fmov z0.d , 0 \n\t" \ + "fmov z1.d , 0 \n\t" \ + "fmov z2.d , 0 \n\t" \ + "fmov z3.d , 0 \n\t" \ + "fmov z4.d , 0 \n\t" \ + "fmov z5.d , 0 \n\t" \ 
+ "fmov z6.d , 0 \n\t" \ + "fmov z7.d , 0 \n\t" \ + "fmov z8.d , 0 \n\t" \ + "fmov z9.d , 0 \n\t" \ + "fmov z10.d , 0 \n\t" \ + "fmov z11.d , 0 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + diff --git a/Grid/Fujitsu_A64FX_asm_single.h b/Grid/Fujitsu_A64FX_asm_single.h new file mode 100644 index 00000000..bffd6990 --- /dev/null +++ b/Grid/Fujitsu_A64FX_asm_single.h @@ -0,0 +1,706 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: XXX + + Copyright (C) 2020 + +Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#define LOAD_CHIMU_A64FXf(x) LOAD_CHIMU_INTERLEAVED_A64FXf(x) +#define PREFETCH_CHIMU_L1(A) +#define PREFETCH_GAUGE_L1(A) +#define PREFETCH_CHIMU_L2(A) +#define PREFETCH_GAUGE_L2(A) +#define PF_GAUGE(A) +#define PREFETCH1_CHIMU(A) +#define PREFETCH_CHIMU(A) +#define LOCK_GAUGE(A) +#define UNLOCK_GAUGE(A) +#define MASK_REGS DECLARATIONS_A64FXf +#define COMPLEX_SIGNS(A) +#define LOAD64(A,B) +#define SAVE_RESULT(A,B) RESULT_A64FXf(A) +#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXf(A) +#define MAYBEPERM(A,perm) if (perm) { A ; } +#define LOAD_CHI(base) LOAD_CHI_A64FXf(base) +#define ZERO_PSI ZERO_PSI_A64FXf +#define ADD_RESULT(A,B) +#define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf +#define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf +#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf +#define TP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TP_PROJ_A64FXf +#define XM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XM_PROJ_A64FXf +#define YM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YM_PROJ_A64FXf +#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZM_PROJ_A64FXf +#define TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf +#define XP_RECON XP_RECON_A64FXf +#define XM_RECON XM_RECON_A64FXf +#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf +#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf +#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf +#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXf +#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf +#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf +#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf +#define PERMUTE_DIR0 PERM0_A64FXf +#define PERMUTE_DIR1 PERM1_A64FXf +#define PERMUTE_DIR2 PERM2_A64FXf +#define PERMUTE_DIR3 PERM3_A64FXf +// DECLARATIONS +#define DECLARATIONS_A64FXf \ + const uint32_t lut[4][16] = { \ + {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ + {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 
9, 10, 11}, \
+    {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \
+    {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \
+asm ( \
+    "fmov z31.s , 0 \n\t" \
+    : \
+    : \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+);
+
+// RESULT
+#define RESULT_A64FXf(base) \
+{ \
+asm ( \
+    "stnt1w { z0.s }, p5, [%[storeptr], -6, mul vl] \n\t" \
+    "stnt1w { z1.s }, p5, [%[storeptr], -5, mul vl] \n\t" \
+    "stnt1w { z2.s }, p5, [%[storeptr], -4, mul vl] \n\t" \
+    "stnt1w { z3.s }, p5, [%[storeptr], -3, mul vl] \n\t" \
+    "stnt1w { z4.s }, p5, [%[storeptr], -2, mul vl] \n\t" \
+    "stnt1w { z5.s }, p5, [%[storeptr], -1, mul vl] \n\t" \
+    "stnt1w { z6.s }, p5, [%[storeptr], 0, mul vl] \n\t" \
+    "stnt1w { z7.s }, p5, [%[storeptr], 1, mul vl] \n\t" \
+    "stnt1w { z8.s }, p5, [%[storeptr], 2, mul vl] \n\t" \
+    "stnt1w { z9.s }, p5, [%[storeptr], 3, mul vl] \n\t" \
+    "stnt1w { z10.s }, p5, [%[storeptr], 4, mul vl] \n\t" \
+    "stnt1w { z11.s }, p5, [%[storeptr], 5, mul vl] \n\t" \
+    : \
+    : [storeptr] "r" (base + 2 * 3 * 64) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// PREFETCH_CHIMU_L2 (prefetch to L2)
+#define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \
+{ \
+asm ( \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
+    : \
+    : [fetchptr] "r" (base) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// PREFETCH_CHIMU_L1 (prefetch to L1)
+#define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \
+{ \
+asm ( \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
+    : \
+    : [fetchptr] "r" (base) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// PREFETCH_GAUGE_L2 (prefetch to L2)
+#define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \
+{ \
+    const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
+asm ( \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
+    : \
+    : [fetchptr] "r" (baseU) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// PREFETCH_GAUGE_L1 (prefetch to L1)
+#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \
+{ \
+    const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
+asm ( \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
+    : \
+    : [fetchptr] "r" (baseU) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
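
All of the load/store macros in this file index off "base + 2 * 3 * 64": the
pointer is biased to the middle of the twelve 64-byte spinor vectors so that
the signed vector-length-scaled immediates -6 .. +5 of ld1w/stnt1w reach the
whole site. A scalar sketch of the same address arithmetic, assuming the fixed
512-bit A64FX vector length (the helper name is illustrative only):

    enum { VL_BYTES = 64 };

    // address of spinor vector i, i = 0..11 (4 spins x 3 colours)
    static inline const char *spinor_vec(const char *base, int i) {
        return base + 2 * 3 * VL_BYTES + (i - 6) * VL_BYTES;
    }
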
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHI +#define LOAD_CHI_A64FXf(base) \ +{ \ +asm ( \ + "ld1d { z12.f }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z13.f }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z14.f }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z15.f }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z16.f }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z17.f }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHIMU +#define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \ +{ \ +asm ( \ + "ptrue p5.f \n\t" \ + "ld1d { z12.f }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z27.f }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z15.f }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z24.f }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z13.f }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z28.f }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z16.f }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z25.f }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z14.f }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z29.f }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1d { z17.f }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z26.f }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHIMU_0213 +#define LOAD_CHIMU_0213_A64FXf \ +{ \ + const SiteSpinor & ref(in[offset]); \ +asm ( \ + "ptrue p5.f \n\t" \ + "ld1d { z12.f }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z24.f }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z13.f }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z25.f }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z14.f }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z26.f }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1d { z15.f }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z27.f }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z16.f }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z28.f }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z17.f }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z29.f }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (&ref[2][0]) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHIMU_0312 +#define LOAD_CHIMU_0312_A64FXf \ +{ \ + const SiteSpinor & ref(in[offset]); \ +asm ( \ + "ptrue p5.f \n\t" \ + "ld1d { z12.f }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z27.f }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z13.f }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z28.f }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z14.f }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z29.f }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1d { z15.f }, p5/z, 
+// PERM0
+#define PERM0_A64FXf \
+asm ( \
+    "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
+    "tbl z12.s, { z12.s }, z30.s \n\t" \
+    "tbl z13.s, { z13.s }, z30.s \n\t" \
+    "tbl z14.s, { z14.s }, z30.s \n\t" \
+    "tbl z15.s, { z15.s }, z30.s \n\t" \
+    "tbl z16.s, { z16.s }, z30.s \n\t" \
+    "tbl z17.s, { z17.s }, z30.s \n\t" \
+    : \
+    : [tableptr] "r" (&lut[0]),[index] "i" (0) \
+    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+);
+
+// PERM1
+#define PERM1_A64FXf \
+asm ( \
+    "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
+    "tbl z12.s, { z12.s }, z30.s \n\t" \
+    "tbl z13.s, { z13.s }, z30.s \n\t" \
+    "tbl z14.s, { z14.s }, z30.s \n\t" \
+    "tbl z15.s, { z15.s }, z30.s \n\t" \
+    "tbl z16.s, { z16.s }, z30.s \n\t" \
+    "tbl z17.s, { z17.s }, z30.s \n\t" \
+    : \
+    : [tableptr] "r" (&lut[0]),[index] "i" (1) \
+    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+);
+
+// PERM2
+#define PERM2_A64FXf \
+asm ( \
+    "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
+    "tbl z12.s, { z12.s }, z30.s \n\t" \
+    "tbl z13.s, { z13.s }, z30.s \n\t" \
+    "tbl z14.s, { z14.s }, z30.s \n\t" \
+    "tbl z15.s, { z15.s }, z30.s \n\t" \
+    "tbl z16.s, { z16.s }, z30.s \n\t" \
+    "tbl z17.s, { z17.s }, z30.s \n\t" \
+    : \
+    : [tableptr] "r" (&lut[0]),[index] "i" (2) \
+    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+);
+
+// PERM3
+#define PERM3_A64FXf \
+asm ( \
+    "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
+    "tbl z12.s, { z12.s }, z30.s \n\t" \
+    "tbl z13.s, { z13.s }, z30.s \n\t" \
+    "tbl z14.s, { z14.s }, z30.s \n\t" \
+    "tbl z15.s, { z15.s }, z30.s \n\t" \
+    "tbl z16.s, { z16.s }, z30.s \n\t" \
+    "tbl z17.s, { z17.s }, z30.s \n\t" \
+    : \
+    : [tableptr] "r" (&lut[0]),[index] "i" (3) \
+    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+);
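
Each PERM macro is one table lookup: it loads a row of lut into z30 and
applies tbl to the six chi vectors, rotating complex lanes across the vector.
A minimal standalone sketch of PERM1 with ACLE intrinsics instead of inline
asm (assuming a 512-bit vector length; the function name is illustrative
only):

    #include <arm_sve.h>

    static inline svfloat32_t perm1_sketch(svfloat32_t v) {
        static const uint32_t idx[16] =
            {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11};
        svbool_t   pg    = svptrue_b32();
        svuint32_t table = svld1(pg, idx);   // same content as lut[1]
        return svtbl(v, table);              // permute complex lanes of v
    }
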
+// MULT_2SPIN
+#define MULT_2SPIN_A64FXf(A) \
+{ \
+    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
+asm ( \
+    "ld1w { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
+    "ld1w { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
+    "ld1w { z26.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \
+    "ld1w { z27.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
+    "ld1w { z28.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
+    "ld1w { z29.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \
+    "fmov z18.s , 0 \n\t" \
+    "fmov z21.s , 0 \n\t" \
+    "fmov z19.s , 0 \n\t" \
+    "fmov z22.s , 0 \n\t" \
+    "fmov z20.s , 0 \n\t" \
+    "fmov z23.s , 0 \n\t" \
+    "fcmla z18.s, p5/m, z24.s, z12.s, 0 \n\t" \
+    "fcmla z21.s, p5/m, z24.s, z15.s, 0 \n\t" \
+    "fcmla z19.s, p5/m, z25.s, z12.s, 0 \n\t" \
+    "fcmla z22.s, p5/m, z25.s, z15.s, 0 \n\t" \
+    "fcmla z20.s, p5/m, z26.s, z12.s, 0 \n\t" \
+    "fcmla z23.s, p5/m, z26.s, z15.s, 0 \n\t" \
+    "fcmla z18.s, p5/m, z24.s, z12.s, 90 \n\t" \
+    "fcmla z21.s, p5/m, z24.s, z15.s, 90 \n\t" \
+    "fcmla z19.s, p5/m, z25.s, z12.s, 90 \n\t" \
+    "fcmla z22.s, p5/m, z25.s, z15.s, 90 \n\t" \
+    "fcmla z20.s, p5/m, z26.s, z12.s, 90 \n\t" \
+    "fcmla z23.s, p5/m, z26.s, z15.s, 90 \n\t" \
+    "ld1w { z24.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \
+    "ld1w { z25.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \
+    "ld1w { z26.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \
+    "fcmla z18.s, p5/m, z27.s, z13.s, 0 \n\t" \
+    "fcmla z21.s, p5/m, z27.s, z16.s, 0 \n\t" \
+    "fcmla z19.s, p5/m, z28.s, z13.s, 0 \n\t" \
+    "fcmla z22.s, p5/m, z28.s, z16.s, 0 \n\t" \
+    "fcmla z20.s, p5/m, z29.s, z13.s, 0 \n\t" \
+    "fcmla z23.s, p5/m, z29.s, z16.s, 0 \n\t" \
+    "fcmla z18.s, p5/m, z27.s, z13.s, 90 \n\t" \
+    "fcmla z21.s, p5/m, z27.s, z16.s, 90 \n\t" \
+    "fcmla z19.s, p5/m, z28.s, z13.s, 90 \n\t" \
+    "fcmla z22.s, p5/m, z28.s, z16.s, 90 \n\t" \
+    "fcmla z20.s, p5/m, z29.s, z13.s, 90 \n\t" \
+    "fcmla z23.s, p5/m, z29.s, z16.s, 90 \n\t" \
+    "fcmla z18.s, p5/m, z24.s, z14.s, 0 \n\t" \
+    "fcmla z21.s, p5/m, z24.s, z17.s, 0 \n\t" \
+    "fcmla z19.s, p5/m, z25.s, z14.s, 0 \n\t" \
+    "fcmla z22.s, p5/m, z25.s, z17.s, 0 \n\t" \
+    "fcmla z20.s, p5/m, z26.s, z14.s, 0 \n\t" \
+    "fcmla z23.s, p5/m, z26.s, z17.s, 0 \n\t" \
+    "fcmla z18.s, p5/m, z24.s, z14.s, 90 \n\t" \
+    "fcmla z21.s, p5/m, z24.s, z17.s, 90 \n\t" \
+    "fcmla z19.s, p5/m, z25.s, z14.s, 90 \n\t" \
+    "fcmla z22.s, p5/m, z25.s, z17.s, 90 \n\t" \
+    "fcmla z20.s, p5/m, z26.s, z14.s, 90 \n\t" \
+    "fcmla z23.s, p5/m, z26.s, z17.s, 90 \n\t" \
+    : \
+    : [fetchptr] "r" (baseU + 2 * 3 * 64) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// XP_PROJ
+#define XP_PROJ_A64FXf \
+{ \
+asm ( \
+    "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
+    "fcadd z12.s, p5/m, z12.s, z27.s, 90 \n\t" \
+    "fcadd z13.s, p5/m, z13.s, z28.s, 90 \n\t" \
+    "fcadd z14.s, p5/m, z14.s, z29.s, 90 \n\t" \
+    "fcadd z15.s, p5/m, z15.s, z24.s, 90 \n\t" \
+    "fcadd z16.s, p5/m, z16.s, z25.s, 90 \n\t" \
+    "fcadd z17.s, p5/m, z17.s, z26.s, 90 \n\t" \
+    : \
+    : [tableptr] "r" (&lut[0]),[index] "i" (3) \
+    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); \
+}
+// XP_RECON
+#define XP_RECON_A64FXf \
+asm ( \
+    "fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \
+    "fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \
+    "fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \
+    "fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \
+    "fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \
+    "fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \
+    "mov z0.s, p5/m, z18.s \n\t" \
+    "mov z1.s, p5/m, z19.s \n\t" \
+    "mov z2.s, p5/m, z20.s \n\t" \
+    "mov z3.s, p5/m, z21.s \n\t" \
+    "mov z4.s, p5/m, z22.s \n\t" \
+    "mov z5.s, p5/m, z23.s \n\t" \
+    : \
+    : \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+);
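
Each 0/90 fcmla pair in MULT_2SPIN is one complex multiply-accumulate: on
interleaved re/im lanes, rotation 0 adds Re(a)*b and rotation 90 adds
i*Im(a)*b, so together they give acc += a*b, and the nine pairs compute
UChi = U * chi over the SU(3) colour index. A minimal intrinsics sketch of
one such pair (illustrative helper, not part of the patch):

    #include <arm_sve.h>

    // acc += a * b on vectors of interleaved complex singles
    static inline svfloat32_t cmadd_sketch(svbool_t pg, svfloat32_t acc,
                                           svfloat32_t a, svfloat32_t b) {
        acc = svcmla_x(pg, acc, a, b, 0);   // re += a.re*b.re, im += a.re*b.im
        acc = svcmla_x(pg, acc, a, b, 90);  // re -= a.im*b.im, im += a.im*b.re
        return acc;
    }
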
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// XP_RECON_ACCUM +#define XP_RECON_ACCUM_A64FXf \ +asm ( \ + "fcadd z9.f, p5/m, z9.f, z18.f, 270 \n\t" \ + "fadd z0.f, p5/m, z0.f, z18.f \n\t" \ + "fcadd z10.f, p5/m, z10.f, z19.f, 270 \n\t" \ + "fadd z1.f, p5/m, z1.f, z19.f \n\t" \ + "fcadd z11.f, p5/m, z11.f, z20.f, 270 \n\t" \ + "fadd z2.f, p5/m, z2.f, z20.f \n\t" \ + "fcadd z6.f, p5/m, z6.f, z21.f, 270 \n\t" \ + "fadd z3.f, p5/m, z3.f, z21.f \n\t" \ + "fcadd z7.f, p5/m, z7.f, z22.f, 270 \n\t" \ + "fadd z4.f, p5/m, z4.f, z22.f \n\t" \ + "fcadd z8.f, p5/m, z8.f, z23.f, 270 \n\t" \ + "fadd z5.f, p5/m, z5.f, z23.f \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YP_PROJ +#define YP_PROJ_A64FXf \ +{ \ +asm ( \ + "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fsub z12.f, p5/m, z12.f, z27.f \n\t" \ + "fsub z13.f, p5/m, z13.f, z28.f \n\t" \ + "fsub z14.f, p5/m, z14.f, z29.f \n\t" \ + "fadd z15.f, p5/m, z15.f, z24.f \n\t" \ + "fadd z16.f, p5/m, z16.f, z25.f \n\t" \ + "fadd z17.f, p5/m, z17.f, z26.f \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (2) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// ZP_PROJ +#define ZP_PROJ_A64FXf \ +{ \ +asm ( \ + "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fcadd z12.f, p5/m, z12.f, z24.f, 90 \n\t" \ + "fcadd z13.f, p5/m, z13.f, z25.f, 90 \n\t" \ + "fcadd z14.f, p5/m, z14.f, z26.f, 90 \n\t" \ + "fcadd z15.f, p5/m, z15.f, z27.f, 270 \n\t" \ + "fcadd z16.f, p5/m, z16.f, z28.f, 270 \n\t" \ + "fcadd z17.f, p5/m, z17.f, z29.f, 270 \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (1) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// TP_PROJ +#define TP_PROJ_A64FXf \ +{ \ +asm ( \ + "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fadd z12.f, p5/m, z12.f, z24.f \n\t" \ + "fadd z13.f, p5/m, z13.f, z25.f \n\t" \ + "fadd z14.f, p5/m, z14.f, z26.f \n\t" \ + "fadd z15.f, p5/m, z15.f, z27.f \n\t" \ + "fadd z16.f, p5/m, z16.f, z28.f \n\t" \ + "fadd z17.f, p5/m, z17.f, z29.f \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (0) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XM_PROJ +#define XM_PROJ_A64FXf \ +{ \ +asm ( \ + "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fcadd z12.f, p5/m, z12.f, z27.f, 270 \n\t" \ + "fcadd z13.f, p5/m, z13.f, z28.f, 270 \n\t" \ + "fcadd z14.f, p5/m, z14.f, z29.f, 270 \n\t" \ + "fcadd z15.f, p5/m, z15.f, z24.f, 270 \n\t" \ + "fcadd z16.f, p5/m, z16.f, z25.f, 270 \n\t" \ + "fcadd z17.f, p5/m, z17.f, z26.f, 270 \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (3) \ + : 
"memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XM_RECON +#define XM_RECON_A64FXf \ +asm ( \ + "fcadd z6.f, p5/m, z6.f, z21.f, 90 \n\t" \ + "fcadd z7.f, p5/m, z7.f, z22.f, 90 \n\t" \ + "fcadd z8.f, p5/m, z8.f, z23.f, 90 \n\t" \ + "fcadd z9.f, p5/m, z9.f, z18.f, 90 \n\t" \ + "fcadd z10.f, p5/m, z10.f, z19.f, 90 \n\t" \ + "fcadd z11.f, p5/m, z11.f, z20.f, 90 \n\t" \ + "mov z0.f, z18.f \n\t" \ + "mov z1.f, z19.f \n\t" \ + "mov z2.f, z20.f \n\t" \ + "mov z3.f, z21.f \n\t" \ + "mov z4.f, z22.f \n\t" \ + "mov z5.f, z23.f \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YM_PROJ +#define YM_PROJ_A64FXf \ +{ \ +asm ( \ + "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fadd z12.f, p5/m, z12.f, z27.f \n\t" \ + "fadd z13.f, p5/m, z13.f, z28.f \n\t" \ + "fadd z14.f, p5/m, z14.f, z29.f \n\t" \ + "fsub z15.f, p5/m, z15.f, z24.f \n\t" \ + "fsub z16.f, p5/m, z16.f, z25.f \n\t" \ + "fsub z17.f, p5/m, z17.f, z26.f \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (2) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// ZM_PROJ +#define ZM_PROJ_A64FXf \ +{ \ +asm ( \ + "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fcadd z12.f, p5/m, z12.f, z24.f, 270 \n\t" \ + "fcadd z13.f, p5/m, z13.f, z25.f, 270 \n\t" \ + "fcadd z14.f, p5/m, z14.f, z26.f, 270 \n\t" \ + "fcadd z15.f, p5/m, z15.f, z27.f, 90 \n\t" \ + "fcadd z16.f, p5/m, z16.f, z28.f, 90 \n\t" \ + "fcadd z17.f, p5/m, z17.f, z29.f, 90 \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (1) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// TM_PROJ +#define TM_PROJ_A64FXf \ +{ \ +asm ( \ + "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fsub z12.f, p5/m, z12.f, z24.f \n\t" \ + "fsub z13.f, p5/m, z13.f, z25.f \n\t" \ + "fsub z14.f, p5/m, z14.f, z26.f \n\t" \ + "fsub z15.f, p5/m, z15.f, z27.f \n\t" \ + "fsub z16.f, p5/m, z16.f, z28.f \n\t" \ + "fsub z17.f, p5/m, z17.f, z29.f \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (0) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XM_RECON_ACCUM +#define XM_RECON_ACCUM_A64FXf \ +asm ( \ + "fcadd z9.f, p5/m, z9.f, z18.f, 90 \n\t" \ + "fcadd z10.f, p5/m, z10.f, z19.f, 90 \n\t" \ + "fcadd z11.f, p5/m, z11.f, z20.f, 90 \n\t" \ + "fcadd z6.f, p5/m, z6.f, z21.f, 90 \n\t" \ + "fcadd z7.f, p5/m, z7.f, z22.f, 90 \n\t" \ + "fcadd z8.f, p5/m, z8.f, z23.f, 90 \n\t" \ + "mov z0.f, z18.f \n\t" \ + "mov z1.f, z19.f \n\t" \ + "mov z2.f, z20.f \n\t" \ + "mov z3.f, z21.f \n\t" \ + "mov z4.f, z22.f \n\t" \ + "mov z5.f, z23.f \n\t" \ + : \ + : \ + : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YP_RECON_ACCUM +#define YP_RECON_ACCUM_A64FXf \ +asm ( \ + "fadd z0.f, p5/m, z0.f, z18.f \n\t" \ + "fsub z9.f, p5/m, z9.f, z18.f \n\t" \ + "fadd z1.f, p5/m, z1.f, z19.f \n\t" \ + "fsub z10.f, p5/m, z10.f, z19.f \n\t" \ + "fadd z2.f, p5/m, z2.f, z20.f \n\t" \ + "fsub z11.f, p5/m, z11.f, z20.f \n\t" \ + "fadd z3.f, p5/m, z3.f, z21.f \n\t" \ + "fadd z6.f, p5/m, z6.f, z21.f \n\t" \ + "fadd z4.f, p5/m, z4.f, z22.f \n\t" \ + "fadd z7.f, p5/m, z7.f, z22.f \n\t" \ + "fadd z5.f, p5/m, z5.f, z23.f \n\t" \ + "fadd z8.f, p5/m, z8.f, z23.f \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YM_RECON_ACCUM +#define YM_RECON_ACCUM_A64FXf \ +asm ( \ + "fadd z0.f, p5/m, z0.f, z18.f \n\t" \ + "fadd z9.f, p5/m, z9.f, z18.f \n\t" \ + "fadd z1.f, p5/m, z1.f, z19.f \n\t" \ + "fadd z10.f, p5/m, z10.f, z19.f \n\t" \ + "fadd z2.f, p5/m, z2.f, z20.f \n\t" \ + "fadd z11.f, p5/m, z11.f, z20.f \n\t" \ + "fadd z3.f, p5/m, z3.f, z21.f \n\t" \ + "fsub z6.f, p5/m, z6.f, z21.f \n\t" \ + "fadd z4.f, p5/m, z4.f, z22.f \n\t" \ + "fsub z7.f, p5/m, z7.f, z22.f \n\t" \ + "fadd z5.f, p5/m, z5.f, z23.f \n\t" \ + "fsub z8.f, p5/m, z8.f, z23.f \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// ZP_RECON_ACCUM +#define ZP_RECON_ACCUM_A64FXf \ +asm ( \ + "fcadd z6.f, p5/m, z6.f, z18.f, 270 \n\t" \ + "fadd z0.f, p5/m, z0.f, z18.f \n\t" \ + "fcadd z7.f, p5/m, z7.f, z19.f, 270 \n\t" \ + "fadd z1.f, p5/m, z1.f, z19.f \n\t" \ + "fcadd z8.f, p5/m, z8.f, z20.f, 270 \n\t" \ + "fadd z2.f, p5/m, z2.f, z20.f \n\t" \ + "fcadd z9.f, p5/m, z9.f, z21.f, 90 \n\t" \ + "fadd z3.f, p5/m, z3.f, z21.f \n\t" \ + "fcadd z10.f, p5/m, z10.f, z22.f, 90 \n\t" \ + "fadd z4.f, p5/m, z4.f, z22.f \n\t" \ + "fcadd z11.f, p5/m, z11.f, z23.f, 90 \n\t" \ + "fadd z5.f, p5/m, z5.f, z23.f \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// ZM_RECON_ACCUM +#define ZM_RECON_ACCUM_A64FXf \ +asm ( \ + "fcadd z6.f, p5/m, z6.f, z18.f, 90 \n\t" \ + "fadd z0.f, p5/m, z0.f, z18.f \n\t" \ + "fcadd z7.f, p5/m, z7.f, z19.f, 90 \n\t" \ + "fadd z1.f, p5/m, z1.f, z19.f \n\t" \ + "fcadd z8.f, p5/m, z8.f, z20.f, 90 \n\t" \ + "fadd z2.f, p5/m, z2.f, z20.f \n\t" \ + "fcadd z9.f, p5/m, z9.f, z21.f, 270 \n\t" \ + "fadd z3.f, p5/m, z3.f, z21.f \n\t" \ + "fcadd z10.f, p5/m, z10.f, z22.f, 270 \n\t" \ + "fadd z4.f, p5/m, z4.f, z22.f \n\t" \ + "fcadd z11.f, p5/m, z11.f, z23.f, 270 \n\t" \ + "fadd z5.f, p5/m, z5.f, z23.f \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// TP_RECON_ACCUM +#define TP_RECON_ACCUM_A64FXf \ +asm ( \ + "ptrue p5.f \n\t" \ + "fadd z0.f, p5/m, z0.f, z18.f \n\t" \ + "fadd z6.f, p5/m, z6.f, z18.f \n\t" \ + "fadd z1.f, p5/m, z1.f, z19.f \n\t" \ + "fadd z7.f, p5/m, z7.f, z19.f \n\t" \ + 
"fadd z2.f, p5/m, z2.f, z20.f \n\t" \ + "fadd z8.f, p5/m, z8.f, z20.f \n\t" \ + "fadd z3.f, p5/m, z3.f, z21.f \n\t" \ + "fadd z9.f, p5/m, z9.f, z21.f \n\t" \ + "fadd z4.f, p5/m, z4.f, z22.f \n\t" \ + "fadd z10.f, p5/m, z10.f, z22.f \n\t" \ + "fadd z5.f, p5/m, z5.f, z23.f \n\t" \ + "fadd z11.f, p5/m, z11.f, z23.f \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// TM_RECON_ACCUM +#define TM_RECON_ACCUM_A64FXf \ +asm ( \ + "fadd z0.f, p5/m, z0.f, z18.f \n\t" \ + "fsub z6.f, p5/m, z6.f, z18.f \n\t" \ + "fadd z1.f, p5/m, z1.f, z19.f \n\t" \ + "fsub z7.f, p5/m, z7.f, z19.f \n\t" \ + "fadd z2.f, p5/m, z2.f, z20.f \n\t" \ + "fsub z8.f, p5/m, z8.f, z20.f \n\t" \ + "fadd z3.f, p5/m, z3.f, z21.f \n\t" \ + "fsub z9.f, p5/m, z9.f, z21.f \n\t" \ + "fadd z4.f, p5/m, z4.f, z22.f \n\t" \ + "fsub z10.f, p5/m, z10.f, z22.f \n\t" \ + "fadd z5.f, p5/m, z5.f, z23.f \n\t" \ + "fsub z11.f, p5/m, z11.f, z23.f \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// ZERO_PSI +#define ZERO_PSI_A64FXf \ +asm ( \ + "ptrue p5.f \n\t" \ + "fmov z0.f , 0 \n\t" \ + "fmov z1.f , 0 \n\t" \ + "fmov z2.f , 0 \n\t" \ + "fmov z3.f , 0 \n\t" \ + "fmov z4.f , 0 \n\t" \ + "fmov z5.f , 0 \n\t" \ + "fmov z6.f , 0 \n\t" \ + "fmov z7.f , 0 \n\t" \ + "fmov z8.f , 0 \n\t" \ + "fmov z9.f , 0 \n\t" \ + "fmov z10.f , 0 \n\t" \ + "fmov z11.f , 0 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + diff --git a/Grid/Fujitsu_A64FX_intrin_double.h b/Grid/Fujitsu_A64FX_intrin_double.h new file mode 100644 index 00000000..59f149eb --- /dev/null +++ b/Grid/Fujitsu_A64FX_intrin_double.h @@ -0,0 +1,568 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: XXX + + Copyright (C) 2020 + +Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
diff --git a/Grid/Fujitsu_A64FX_intrin_double.h b/Grid/Fujitsu_A64FX_intrin_double.h
new file mode 100644
index 00000000..59f149eb
--- /dev/null
+++ b/Grid/Fujitsu_A64FX_intrin_double.h
@@ -0,0 +1,568 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: XXX
+
+    Copyright (C) 2020
+
+Author: Nils Meyer
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/* END LEGAL */
+#define LOAD_CHIMU_A64FXd(x) LOAD_CHIMU_INTERLEAVED_A64FXd(x)
+#define PREFETCH_CHIMU_L1(A)
+#define PREFETCH_GAUGE_L1(A)
+#define PREFETCH_CHIMU_L2(A)
+#define PREFETCH_GAUGE_L2(A)
+#define PF_GAUGE(A)
+#define PREFETCH1_CHIMU(A)
+#define PREFETCH_CHIMU(A)
+#define LOCK_GAUGE(A)
+#define UNLOCK_GAUGE(A)
+#define MASK_REGS DECLARATIONS_A64FXd
+#define COMPLEX_SIGNS(A)
+#define LOAD64(A,B)
+#define SAVE_RESULT(A,B) RESULT_A64FXd(A)
+#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXd(A)
+#define MAYBEPERM(A,perm) if (perm) { A ; }
+#define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
+#define ZERO_PSI ZERO_PSI_A64FXd
+#define ADD_RESULT(A,B)
+#define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd
+#define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd
+#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd
+#define TP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TP_PROJ_A64FXd
+#define XM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XM_PROJ_A64FXd
+#define YM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YM_PROJ_A64FXd
+#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZM_PROJ_A64FXd
+#define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd
+#define XP_RECON XP_RECON_A64FXd
+#define XM_RECON XM_RECON_A64FXd
+#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd
+#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd
+#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd
+#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXd
+#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd
+#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd
+#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd
+#define PERMUTE_DIR0 PERM0_A64FXd
+#define PERMUTE_DIR1 PERM1_A64FXd
+#define PERMUTE_DIR2 PERM2_A64FXd
+#define PERMUTE_DIR3 PERM3_A64FXd
+// DECLARATIONS
+#define DECLARATIONS_A64FXd \
+    const uint64_t lut[4][8] = { \
+        {4, 5, 6, 7, 0, 1, 2, 3}, \
+        {2, 3, 0, 1, 6, 7, 4, 5}, \
+        {1, 0, 3, 2, 5, 4, 7, 6}, \
+        {0, 1, 2, 3, 4, 5, 6, 7} }; \
+    svfloat64_t result_00; \
+    svfloat64_t result_01; \
+    svfloat64_t result_02; \
+    svfloat64_t result_10; \
+    svfloat64_t result_11; \
+    svfloat64_t result_12; \
+    svfloat64_t result_20; \
+    svfloat64_t result_21; \
+    svfloat64_t result_22; \
+    svfloat64_t result_30; \
+    svfloat64_t result_31; \
+    svfloat64_t result_32; \
+    svfloat64_t Chi_00; \
+    svfloat64_t Chi_01; \
+    svfloat64_t Chi_02; \
+    svfloat64_t Chi_10; \
+    svfloat64_t Chi_11; \
+    svfloat64_t Chi_12; \
+    svfloat64_t UChi_00; \
+    svfloat64_t UChi_01; \
+    svfloat64_t UChi_02; \
+    svfloat64_t UChi_10; \
+    svfloat64_t UChi_11; \
+    svfloat64_t UChi_12; \
+    svfloat64_t U_00; \
+    svfloat64_t U_10; \
+    svfloat64_t U_20; \
+    svfloat64_t U_01; \
+    svfloat64_t U_11; \
+    svfloat64_t U_21; \
+    svbool_t pg1; \
+    pg1 = svptrue_b64(); \
+    svuint64_t table0; \
+    svfloat64_t zero0; \
+    zero0 = __svzero(zero0);
+
+#define Chimu_00 Chi_00
+#define Chimu_01 Chi_01
+#define Chimu_02 Chi_02
+#define Chimu_10 Chi_10
+#define Chimu_11 Chi_11
+#define Chimu_12 Chi_12
+#define Chimu_20 U_00
+#define Chimu_21 U_10
+#define Chimu_22 U_20
+#define Chimu_30 U_01
+#define Chimu_31 U_11
+#define Chimu_32 U_21
+// RESULT
+#define RESULT_A64FXd(base) \
+{ \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \
+}
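
The Chimu_2x/Chimu_3x aliases above deliberately reuse the U_xx variables:
the lower spin half of chimu is folded into Chi_xx by the projector before
MULT_2SPIN loads the gauge field, so twelve loaded vectors fit in six names
and the working set stays within the 32 SVE registers. The assumed lifetime,
mirroring the macros below:

    // 1. LOAD_CHIMU   fills Chi_xx and U_xx (aliased as Chimu_2x/3x)
    // 2. *_PROJ       folds Chimu_2x/3x into Chi_xx; U_xx is then dead
    // 3. MULT_2SPIN   reloads U_xx with gauge links and forms UChi_xx
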
+// PREFETCH_CHIMU_L2 (prefetch to L2)
+#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \
+{ \
+    svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \
+}
+// PREFETCH_CHIMU_L1 (prefetch to L1)
+#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \
+{ \
+    svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \
+    svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \
+    svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \
+}
+// PREFETCH_GAUGE_L2 (prefetch to L2)
+#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \
+{ \
+    const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
+    svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \
+}
+// PREFETCH_GAUGE_L1 (prefetch to L1)
+#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \
+{ \
+    const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
+    svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \
+}
+// LOAD_CHI
+#define LOAD_CHI_A64FXd(base) \
+{ \
+    Chi_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \
+    Chi_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \
+    Chi_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \
+    Chi_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \
+    Chi_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \
+    Chi_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \
+}
+// LOAD_CHIMU
+#define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \
+{ \
+    Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \
+    Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \
+    Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \
+    Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \
+    Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \
+    Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \
+    Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \
+    Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \
+    Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \
+    Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \
+    Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \
+    Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \
+}
+// LOAD_CHIMU_0213
+#define LOAD_CHIMU_0213_A64FXd \
+{ \
+    const SiteSpinor & ref(in[offset]); \
+    Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \
+    Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \
+    Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \
+    Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \
+    Chimu_02 = 
svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ + Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ + Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \ + Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ + Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \ + Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ + Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \ +} +// LOAD_CHIMU_0312 +#define LOAD_CHIMU_0312_A64FXd \ +{ \ + const SiteSpinor & ref(in[offset]); \ + Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ + Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \ + Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ + Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \ + Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ + Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \ + Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ + Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ + Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ + Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ + Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ + Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ +} +// PERM0 +#define PERM0_A64FXd \ + table0 = svld1(pg1, (uint64_t*)&lut[0]); \ + Chi_00 = svtbl(Chi_00, table0); \ + Chi_01 = svtbl(Chi_01, table0); \ + Chi_02 = svtbl(Chi_02, table0); \ + Chi_10 = svtbl(Chi_10, table0); \ + Chi_11 = svtbl(Chi_11, table0); \ + Chi_12 = svtbl(Chi_12, table0); + +// PERM1 +#define PERM1_A64FXd \ + table0 = svld1(pg1, (uint64_t*)&lut[1]); \ + Chi_00 = svtbl(Chi_00, table0); \ + Chi_01 = svtbl(Chi_01, table0); \ + Chi_02 = svtbl(Chi_02, table0); \ + Chi_10 = svtbl(Chi_10, table0); \ + Chi_11 = svtbl(Chi_11, table0); \ + Chi_12 = svtbl(Chi_12, table0); + +// PERM2 +#define PERM2_A64FXd \ + table0 = svld1(pg1, (uint64_t*)&lut[2]); \ + Chi_00 = svtbl(Chi_00, table0); \ + Chi_01 = svtbl(Chi_01, table0); \ + Chi_02 = svtbl(Chi_02, table0); \ + Chi_10 = svtbl(Chi_10, table0); \ + Chi_11 = svtbl(Chi_11, table0); \ + Chi_12 = svtbl(Chi_12, table0); + +// PERM3 +#define PERM3_A64FXd + +// MULT_2SPIN +#define MULT_2SPIN_A64FXd(A) \ +{ \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ + U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ + U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ + U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ + U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ + U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ + UChi_00 = __svzero(UChi_00); \ + UChi_10 = __svzero(UChi_10); \ + UChi_01 = __svzero(UChi_01); \ + UChi_11 = __svzero(UChi_11); \ + UChi_02 = __svzero(UChi_02); \ + UChi_12 = __svzero(UChi_12); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 0); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 0); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 0); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 0); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 0); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 0); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 90); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 
90); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 90); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ + U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \ + U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \ + U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 0); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 0); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 0); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 90); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 90); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 90); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 90); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 90); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 90); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 0); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 0); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 0); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 0); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 0); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 0); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 90); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 90); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 90); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 90); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 90); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 90); \ +} +// XP_PROJ +#define XP_PROJ_A64FXd \ +{ \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 90); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 90); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 90); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 90); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 90); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 90); \ +} +// XP_RECON +#define XP_RECON_A64FXd \ + result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \ + result_21 = svcadd_x(pg1, result_21, UChi_11, 270); \ + result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \ + result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \ + result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \ + result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \ + result_00 = UChi_00; \ + result_01 = UChi_01; \ + result_02 = UChi_02; \ + result_10 = UChi_10; \ + result_11 = UChi_11; \ + result_12 = UChi_12; + +// XP_RECON_ACCUM +#define XP_RECON_ACCUM_A64FXd \ + result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_21 = svcadd_x(pg1, result_21, UChi_11, 270); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \ + result_12 = svadd_x(pg1, result_12, UChi_12); + +// YP_PROJ +#define YP_PROJ_A64FXd \ +{ \ + table0 = svld1(pg1, (uint64_t*)&lut[2]); \ + Chi_00 = svsub_x(pg1, Chimu_00, Chimu_30); \ + Chi_01 = svsub_x(pg1, Chimu_01, Chimu_31); \ + Chi_02 = svsub_x(pg1, Chimu_02, 
Chimu_32); \ + Chi_10 = svadd_x(pg1, Chimu_10, Chimu_20); \ + Chi_11 = svadd_x(pg1, Chimu_11, Chimu_21); \ + Chi_12 = svadd_x(pg1, Chimu_12, Chimu_22); \ +} +// ZP_PROJ +#define ZP_PROJ_A64FXd \ +{ \ + table0 = svld1(pg1, (uint64_t*)&lut[1]); \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 90); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 90); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 90); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 270); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 270); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 270); \ +} +// TP_PROJ +#define TP_PROJ_A64FXd \ +{ \ + table0 = svld1(pg1, (uint64_t*)&lut[0]); \ + Chi_00 = svadd_x(pg1, Chimu_00, Chimu_20); \ + Chi_01 = svadd_x(pg1, Chimu_01, Chimu_21); \ + Chi_02 = svadd_x(pg1, Chimu_02, Chimu_22); \ + Chi_10 = svadd_x(pg1, Chimu_10, Chimu_30); \ + Chi_11 = svadd_x(pg1, Chimu_11, Chimu_31); \ + Chi_12 = svadd_x(pg1, Chimu_12, Chimu_32); \ +} +// XM_PROJ +#define XM_PROJ_A64FXd \ +{ \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 270); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 270); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 270); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 270); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 270); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 270); \ +} +// XM_RECON +#define XM_RECON_A64FXd \ + result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \ + result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \ + result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \ + result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \ + result_31 = svcadd_x(pg1, result_31, UChi_01, 90); \ + result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \ + result_00 = UChi_00; \ + result_01 = UChi_01; \ + result_02 = UChi_02; \ + result_10 = UChi_10; \ + result_11 = UChi_11; \ + result_12 = UChi_12; + +// YM_PROJ +#define YM_PROJ_A64FXd \ +{ \ + table0 = svld1(pg1, (uint64_t*)&lut[2]); \ + Chi_00 = svadd_x(pg1, Chimu_00, Chimu_30); \ + Chi_01 = svadd_x(pg1, Chimu_01, Chimu_31); \ + Chi_02 = svadd_x(pg1, Chimu_02, Chimu_32); \ + Chi_10 = svsub_x(pg1, Chimu_10, Chimu_20); \ + Chi_11 = svsub_x(pg1, Chimu_11, Chimu_21); \ + Chi_12 = svsub_x(pg1, Chimu_12, Chimu_22); \ +} +// ZM_PROJ +#define ZM_PROJ_A64FXd \ +{ \ + table0 = svld1(pg1, (uint64_t*)&lut[1]); \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 270); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 270); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 270); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 90); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 90); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 90); \ +} +// TM_PROJ +#define TM_PROJ_A64FXd \ +{ \ + table0 = svld1(pg1, (uint64_t*)&lut[0]); \ + Chi_00 = svsub_x(pg1, Chimu_00, Chimu_20); \ + Chi_01 = svsub_x(pg1, Chimu_01, Chimu_21); \ + Chi_02 = svsub_x(pg1, Chimu_02, Chimu_22); \ + Chi_10 = svsub_x(pg1, Chimu_10, Chimu_30); \ + Chi_11 = svsub_x(pg1, Chimu_11, Chimu_31); \ + Chi_12 = svsub_x(pg1, Chimu_12, Chimu_32); \ +} +// XM_RECON_ACCUM +#define XM_RECON_ACCUM_A64FXd \ + result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \ + result_31 = svcadd_x(pg1, result_31, UChi_01, 90); \ + result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \ + result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \ + result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \ + result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \ + result_00 = UChi_00; \ + result_01 = UChi_01; \ + result_02 = UChi_02; \ + result_10 = UChi_10; \ + result_11 = UChi_11; \ + result_12 = UChi_12; + +// YP_RECON_ACCUM +#define YP_RECON_ACCUM_A64FXd 
\ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_30 = svsub_x(pg1, result_30, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_31 = svsub_x(pg1, result_31, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_32 = svsub_x(pg1, result_32, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_20 = svadd_x(pg1, result_20, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_21 = svadd_x(pg1, result_21, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_22 = svadd_x(pg1, result_22, UChi_12); + +// YM_RECON_ACCUM +#define YM_RECON_ACCUM_A64FXd \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_30 = svadd_x(pg1, result_30, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_31 = svadd_x(pg1, result_31, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_32 = svadd_x(pg1, result_32, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_20 = svsub_x(pg1, result_20, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_21 = svsub_x(pg1, result_21, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_22 = svsub_x(pg1, result_22, UChi_12); + +// ZP_RECON_ACCUM +#define ZP_RECON_ACCUM_A64FXd \ + result_20 = svcadd_x(pg1, result_20, UChi_00, 270); \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_21 = svcadd_x(pg1, result_21, UChi_01, 270); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_22 = svcadd_x(pg1, result_22, UChi_02, 270); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_30 = svcadd_x(pg1, result_30, UChi_10, 90); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_31 = svcadd_x(pg1, result_31, UChi_11, 90); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_32 = svcadd_x(pg1, result_32, UChi_12, 90); \ + result_12 = svadd_x(pg1, result_12, UChi_12); + +// ZM_RECON_ACCUM +#define ZM_RECON_ACCUM_A64FXd \ + result_20 = svcadd_x(pg1, result_20, UChi_00, 90); \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_21 = svcadd_x(pg1, result_21, UChi_01, 90); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_22 = svcadd_x(pg1, result_22, UChi_02, 90); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_30 = svcadd_x(pg1, result_30, UChi_10, 270); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_31 = svcadd_x(pg1, result_31, UChi_11, 270); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_32 = svcadd_x(pg1, result_32, UChi_12, 270); \ + result_12 = svadd_x(pg1, result_12, UChi_12); + +// TP_RECON_ACCUM +#define TP_RECON_ACCUM_A64FXd \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_20 = svadd_x(pg1, result_20, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_21 = svadd_x(pg1, result_21, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_22 = svadd_x(pg1, result_22, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_30 = svadd_x(pg1, result_30, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_31 = svadd_x(pg1, result_31, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_32 = svadd_x(pg1, result_32, UChi_12); + +// TM_RECON_ACCUM +#define TM_RECON_ACCUM_A64FXd \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_20 = svsub_x(pg1, result_20, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_21 = svsub_x(pg1, result_21, UChi_01); \ + result_02 = svadd_x(pg1, 
result_02, UChi_02); \ + result_22 = svsub_x(pg1, result_22, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_30 = svsub_x(pg1, result_30, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_31 = svsub_x(pg1, result_31, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_32 = svsub_x(pg1, result_32, UChi_12); + +// ZERO_PSI +#define ZERO_PSI_A64FXd \ + result_00 = __svzero(result_00); \ + result_01 = __svzero(result_01); \ + result_02 = __svzero(result_02); \ + result_10 = __svzero(result_10); \ + result_11 = __svzero(result_11); \ + result_12 = __svzero(result_12); \ + result_20 = __svzero(result_20); \ + result_21 = __svzero(result_21); \ + result_22 = __svzero(result_22); \ + result_30 = __svzero(result_30); \ + result_31 = __svzero(result_31); \ + result_32 = __svzero(result_32); + diff --git a/Grid/Fujitsu_A64FX_intrin_single.h b/Grid/Fujitsu_A64FX_intrin_single.h new file mode 100644 index 00000000..333cbc1b --- /dev/null +++ b/Grid/Fujitsu_A64FX_intrin_single.h @@ -0,0 +1,577 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: XXX + + Copyright (C) 2020 + +Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#define LOAD_CHIMU_A64FXf(x) LOAD_CHIMU_INTERLEAVED_A64FXf(x) +#define PREFETCH_CHIMU_L1(A) +#define PREFETCH_GAUGE_L1(A) +#define PREFETCH_CHIMU_L2(A) +#define PREFETCH_GAUGE_L2(A) +#define PF_GAUGE(A) +#define PREFETCH1_CHIMU(A) +#define PREFETCH_CHIMU(A) +#define LOCK_GAUGE(A) +#define UNLOCK_GAUGE(A) +#define MASK_REGS DECLARATIONS_A64FXf +#define COMPLEX_SIGNS(A) +#define LOAD64(A,B) +#define SAVE_RESULT(A,B) RESULT_A64FXf(A) +#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXf(A) +#define MAYBEPERM(A,perm) if (perm) { A ; } +#define LOAD_CHI(base) LOAD_CHI_A64FXf(base) +#define ZERO_PSI ZERO_PSI_A64FXf +#define ADD_RESULT(A,B) +#define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf +#define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf +#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf +#define TP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TP_PROJ_A64FXf +#define XM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XM_PROJ_A64FXf +#define YM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YM_PROJ_A64FXf +#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZM_PROJ_A64FXf +#define TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf +#define XP_RECON XP_RECON_A64FXf +#define XM_RECON XM_RECON_A64FXf +#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf +#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf +#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf +#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXf +#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf +#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf +#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf +#define PERMUTE_DIR0 PERM0_A64FXf +#define PERMUTE_DIR1 PERM1_A64FXf +#define PERMUTE_DIR2 PERM2_A64FXf +#define PERMUTE_DIR3 PERM3_A64FXf +// DECLARATIONS +#define DECLARATIONS_A64FXf \ + const uint32_t lut[4][16] = { \ + {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ + {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ + {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \ + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \ + svfloat32_t result_00; \ + svfloat32_t result_01; \ + svfloat32_t result_02; \ + svfloat32_t result_10; \ + svfloat32_t result_11; \ + svfloat32_t result_12; \ + svfloat32_t result_20; \ + svfloat32_t result_21; \ + svfloat32_t result_22; \ + svfloat32_t result_30; \ + svfloat32_t result_31; \ + svfloat32_t result_32; \ + svfloat32_t Chi_00; \ + svfloat32_t Chi_01; \ + svfloat32_t Chi_02; \ + svfloat32_t Chi_10; \ + svfloat32_t Chi_11; \ + svfloat32_t Chi_12; \ + svfloat32_t UChi_00; \ + svfloat32_t UChi_01; \ + svfloat32_t UChi_02; \ + svfloat32_t UChi_10; \ + svfloat32_t UChi_11; \ + svfloat32_t UChi_12; \ + svfloat32_t U_00; \ + svfloat32_t U_10; \ + svfloat32_t U_20; \ + svfloat32_t U_01; \ + svfloat32_t U_11; \ + svfloat32_t U_21; \ + svbool_t pg1; \ + pg1 = svptrue_b32(); \ + svuint32_t table0; \ + svfloat32_t zero0; \ + zero0 = __svzero(zero0); + +#define Chimu_00 Chi_00 +#define Chimu_01 Chi_01 +#define Chimu_02 Chi_02 +#define Chimu_10 Chi_10 +#define Chimu_11 Chi_11 +#define Chimu_12 Chi_12 +#define Chimu_20 U_00 +#define Chimu_21 U_10 +#define Chimu_22 U_20 +#define Chimu_30 U_01 +#define Chimu_31 U_11 +#define Chimu_32 U_21 +// RESULT +#define RESULT_A64FXf(base) \ +{ \ + svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \ + svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64), 
result_01); \
+    svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \
+    svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \
+    svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \
+    svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \
+    svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \
+    svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \
+    svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \
+    svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \
+    svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \
+    svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \
+}
+// PREFETCH_CHIMU_L2 (prefetch to L2)
+#define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \
+{ \
+    svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \
+}
+// PREFETCH_CHIMU_L1 (prefetch to L1)
+#define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \
+{ \
+    svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \
+    svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \
+    svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \
+}
+// PREFETCH_GAUGE_L2 (prefetch to L2)
+#define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \
+{ \
+    const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
+    svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \
+}
+// PREFETCH_GAUGE_L1 (prefetch to L1)
+#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \
+{ \
+    const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
+    svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \
+}
+// LOAD_CHI
+#define LOAD_CHI_A64FXf(base) \
+{ \
+    Chi_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \
+    Chi_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \
+    Chi_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \
+    Chi_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \
+    Chi_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \
+    Chi_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \
+}
+// LOAD_CHIMU
+#define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \
+{ \
+    Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \
+    Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \
+    Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \
+    Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \
+    Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \
+    Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \
+    Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \
+    Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \
+    Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \
+    Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \
+    Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \
+    Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \
+}
+// LOAD_CHIMU_0213
+#define LOAD_CHIMU_0213_A64FXf \
+{ \
+    const SiteSpinor & ref(in[offset]); \
+    Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \
+    Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \
+    Chimu_01 = svld1(pg1,
(float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ + Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ + Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ + Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ + Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \ + Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ + Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ + Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ + Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \ +} +// LOAD_CHIMU_0312 +#define LOAD_CHIMU_0312_A64FXf \ +{ \ + const SiteSpinor & ref(in[offset]); \ + Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ + Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \ + Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ + Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ + Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ + Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \ + Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ + Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ + Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ + Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ + Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ + Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ +} +// PERM0 +#define PERM0_A64FXf \ + table0 = svld1(pg1, (uint32_t*)&lut[0]); \ + Chi_00 = svtbl(Chi_00, table0); \ + Chi_01 = svtbl(Chi_01, table0); \ + Chi_02 = svtbl(Chi_02, table0); \ + Chi_10 = svtbl(Chi_10, table0); \ + Chi_11 = svtbl(Chi_11, table0); \ + Chi_12 = svtbl(Chi_12, table0); + +// PERM1 +#define PERM1_A64FXf \ + table0 = svld1(pg1, (uint32_t*)&lut[1]); \ + Chi_00 = svtbl(Chi_00, table0); \ + Chi_01 = svtbl(Chi_01, table0); \ + Chi_02 = svtbl(Chi_02, table0); \ + Chi_10 = svtbl(Chi_10, table0); \ + Chi_11 = svtbl(Chi_11, table0); \ + Chi_12 = svtbl(Chi_12, table0); + +// PERM2 +#define PERM2_A64FXf \ + table0 = svld1(pg1, (uint32_t*)&lut[2]); \ + Chi_00 = svtbl(Chi_00, table0); \ + Chi_01 = svtbl(Chi_01, table0); \ + Chi_02 = svtbl(Chi_02, table0); \ + Chi_10 = svtbl(Chi_10, table0); \ + Chi_11 = svtbl(Chi_11, table0); \ + Chi_12 = svtbl(Chi_12, table0); + +// PERM3 +#define PERM3_A64FXf \ + table0 = svld1(pg1, (uint32_t*)&lut[3]); \ + Chi_00 = svtbl(Chi_00, table0); \ + Chi_01 = svtbl(Chi_01, table0); \ + Chi_02 = svtbl(Chi_02, table0); \ + Chi_10 = svtbl(Chi_10, table0); \ + Chi_11 = svtbl(Chi_11, table0); \ + Chi_12 = svtbl(Chi_12, table0); + +// MULT_2SPIN +#define MULT_2SPIN_A64FXf(A) \ +{ \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ + U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ + U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ + U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ + U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ + U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ + UChi_00 = __svzero(UChi_00); \ + UChi_10 = __svzero(UChi_10); \ + UChi_01 = __svzero(UChi_01); \ + UChi_11 = __svzero(UChi_11); \ + UChi_02 = __svzero(UChi_02); \ + UChi_12 = __svzero(UChi_12); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 
0); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 0); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 0); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 0); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 0); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 0); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 90); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 90); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 90); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ + U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \ + U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \ + U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 0); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 0); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 0); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 90); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 90); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 90); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 90); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 90); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 90); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 0); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 0); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 0); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 0); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 0); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 0); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 90); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 90); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 90); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 90); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 90); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 90); \ +} +// XP_PROJ +#define XP_PROJ_A64FXf \ +{ \ + table0 = svld1(pg1, (uint32_t*)&lut[3]); \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 90); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 90); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 90); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 90); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 90); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 90); \ +} +// XP_RECON +#define XP_RECON_A64FXf \ + result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \ + result_21 = svcadd_x(pg1, result_21, UChi_11, 270); \ + result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \ + result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \ + result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \ + result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \ + result_00 = UChi_00; \ + result_01 = UChi_01; \ + result_02 = UChi_02; \ + result_10 = UChi_10; \ + result_11 = UChi_11; \ + result_12 = UChi_12; + +// XP_RECON_ACCUM +#define XP_RECON_ACCUM_A64FXf \ + result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \ + result_10 = svadd_x(pg1, result_10, UChi_10); 
\ + result_21 = svcadd_x(pg1, result_21, UChi_11, 270); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \ + result_12 = svadd_x(pg1, result_12, UChi_12); + +// YP_PROJ +#define YP_PROJ_A64FXf \ +{ \ + table0 = svld1(pg1, (uint32_t*)&lut[2]); \ + Chi_00 = svsub_x(pg1, Chimu_00, Chimu_30); \ + Chi_01 = svsub_x(pg1, Chimu_01, Chimu_31); \ + Chi_02 = svsub_x(pg1, Chimu_02, Chimu_32); \ + Chi_10 = svadd_x(pg1, Chimu_10, Chimu_20); \ + Chi_11 = svadd_x(pg1, Chimu_11, Chimu_21); \ + Chi_12 = svadd_x(pg1, Chimu_12, Chimu_22); \ +} +// ZP_PROJ +#define ZP_PROJ_A64FXf \ +{ \ + table0 = svld1(pg1, (uint32_t*)&lut[1]); \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 90); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 90); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 90); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 270); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 270); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 270); \ +} +// TP_PROJ +#define TP_PROJ_A64FXf \ +{ \ + table0 = svld1(pg1, (uint32_t*)&lut[0]); \ + Chi_00 = svadd_x(pg1, Chimu_00, Chimu_20); \ + Chi_01 = svadd_x(pg1, Chimu_01, Chimu_21); \ + Chi_02 = svadd_x(pg1, Chimu_02, Chimu_22); \ + Chi_10 = svadd_x(pg1, Chimu_10, Chimu_30); \ + Chi_11 = svadd_x(pg1, Chimu_11, Chimu_31); \ + Chi_12 = svadd_x(pg1, Chimu_12, Chimu_32); \ +} +// XM_PROJ +#define XM_PROJ_A64FXf \ +{ \ + table0 = svld1(pg1, (uint32_t*)&lut[3]); \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 270); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 270); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 270); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 270); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 270); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 270); \ +} +// XM_RECON +#define XM_RECON_A64FXf \ + result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \ + result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \ + result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \ + result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \ + result_31 = svcadd_x(pg1, result_31, UChi_01, 90); \ + result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \ + result_00 = UChi_00; \ + result_01 = UChi_01; \ + result_02 = UChi_02; \ + result_10 = UChi_10; \ + result_11 = UChi_11; \ + result_12 = UChi_12; + +// YM_PROJ +#define YM_PROJ_A64FXf \ +{ \ + table0 = svld1(pg1, (uint32_t*)&lut[2]); \ + Chi_00 = svadd_x(pg1, Chimu_00, Chimu_30); \ + Chi_01 = svadd_x(pg1, Chimu_01, Chimu_31); \ + Chi_02 = svadd_x(pg1, Chimu_02, Chimu_32); \ + Chi_10 = svsub_x(pg1, Chimu_10, Chimu_20); \ + Chi_11 = svsub_x(pg1, Chimu_11, Chimu_21); \ + Chi_12 = svsub_x(pg1, Chimu_12, Chimu_22); \ +} +// ZM_PROJ +#define ZM_PROJ_A64FXf \ +{ \ + table0 = svld1(pg1, (uint32_t*)&lut[1]); \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 270); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 270); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 270); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 90); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 90); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 90); \ +} +// TM_PROJ +#define TM_PROJ_A64FXf \ +{ \ + table0 = svld1(pg1, (uint32_t*)&lut[0]); \ + Chi_00 = svsub_x(pg1, Chimu_00, Chimu_20); \ + Chi_01 = svsub_x(pg1, Chimu_01, Chimu_21); \ + Chi_02 = svsub_x(pg1, Chimu_02, Chimu_22); \ + Chi_10 = svsub_x(pg1, Chimu_10, Chimu_30); \ + Chi_11 = svsub_x(pg1, Chimu_11, Chimu_31); \ + Chi_12 = svsub_x(pg1, Chimu_12, Chimu_32); \ +} +// XM_RECON_ACCUM +#define XM_RECON_ACCUM_A64FXf \ + result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \ 
+    result_31 = svcadd_x(pg1, result_31, UChi_01, 90); \
+    result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \
+    result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \
+    result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \
+    result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \
+    result_00 = svadd_x(pg1, result_00, UChi_00); \
+    result_01 = svadd_x(pg1, result_01, UChi_01); \
+    result_02 = svadd_x(pg1, result_02, UChi_02); \
+    result_10 = svadd_x(pg1, result_10, UChi_10); \
+    result_11 = svadd_x(pg1, result_11, UChi_11); \
+    result_12 = svadd_x(pg1, result_12, UChi_12);
+
+// YP_RECON_ACCUM
+#define YP_RECON_ACCUM_A64FXf \
+    result_00 = svadd_x(pg1, result_00, UChi_00); \
+    result_30 = svsub_x(pg1, result_30, UChi_00); \
+    result_01 = svadd_x(pg1, result_01, UChi_01); \
+    result_31 = svsub_x(pg1, result_31, UChi_01); \
+    result_02 = svadd_x(pg1, result_02, UChi_02); \
+    result_32 = svsub_x(pg1, result_32, UChi_02); \
+    result_10 = svadd_x(pg1, result_10, UChi_10); \
+    result_20 = svadd_x(pg1, result_20, UChi_10); \
+    result_11 = svadd_x(pg1, result_11, UChi_11); \
+    result_21 = svadd_x(pg1, result_21, UChi_11); \
+    result_12 = svadd_x(pg1, result_12, UChi_12); \
+    result_22 = svadd_x(pg1, result_22, UChi_12);
+
+// YM_RECON_ACCUM
+#define YM_RECON_ACCUM_A64FXf \
+    result_00 = svadd_x(pg1, result_00, UChi_00); \
+    result_30 = svadd_x(pg1, result_30, UChi_00); \
+    result_01 = svadd_x(pg1, result_01, UChi_01); \
+    result_31 = svadd_x(pg1, result_31, UChi_01); \
+    result_02 = svadd_x(pg1, result_02, UChi_02); \
+    result_32 = svadd_x(pg1, result_32, UChi_02); \
+    result_10 = svadd_x(pg1, result_10, UChi_10); \
+    result_20 = svsub_x(pg1, result_20, UChi_10); \
+    result_11 = svadd_x(pg1, result_11, UChi_11); \
+    result_21 = svsub_x(pg1, result_21, UChi_11); \
+    result_12 = svadd_x(pg1, result_12, UChi_12); \
+    result_22 = svsub_x(pg1, result_22, UChi_12);
+
+// ZP_RECON_ACCUM
+#define ZP_RECON_ACCUM_A64FXf \
+    result_20 = svcadd_x(pg1, result_20, UChi_00, 270); \
+    result_00 = svadd_x(pg1, result_00, UChi_00); \
+    result_21 = svcadd_x(pg1, result_21, UChi_01, 270); \
+    result_01 = svadd_x(pg1, result_01, UChi_01); \
+    result_22 = svcadd_x(pg1, result_22, UChi_02, 270); \
+    result_02 = svadd_x(pg1, result_02, UChi_02); \
+    result_30 = svcadd_x(pg1, result_30, UChi_10, 90); \
+    result_10 = svadd_x(pg1, result_10, UChi_10); \
+    result_31 = svcadd_x(pg1, result_31, UChi_11, 90); \
+    result_11 = svadd_x(pg1, result_11, UChi_11); \
+    result_32 = svcadd_x(pg1, result_32, UChi_12, 90); \
+    result_12 = svadd_x(pg1, result_12, UChi_12);
+
+// ZM_RECON_ACCUM
+#define ZM_RECON_ACCUM_A64FXf \
+    result_20 = svcadd_x(pg1, result_20, UChi_00, 90); \
+    result_00 = svadd_x(pg1, result_00, UChi_00); \
+    result_21 = svcadd_x(pg1, result_21, UChi_01, 90); \
+    result_01 = svadd_x(pg1, result_01, UChi_01); \
+    result_22 = svcadd_x(pg1, result_22, UChi_02, 90); \
+    result_02 = svadd_x(pg1, result_02, UChi_02); \
+    result_30 = svcadd_x(pg1, result_30, UChi_10, 270); \
+    result_10 = svadd_x(pg1, result_10, UChi_10); \
+    result_31 = svcadd_x(pg1, result_31, UChi_11, 270); \
+    result_11 = svadd_x(pg1, result_11, UChi_11); \
+    result_32 = svcadd_x(pg1, result_32, UChi_12, 270); \
+    result_12 = svadd_x(pg1, result_12, UChi_12);
+
+// TP_RECON_ACCUM
+#define TP_RECON_ACCUM_A64FXf \
+    result_00 = svadd_x(pg1, result_00, UChi_00); \
+    result_20 = svadd_x(pg1, result_20, UChi_00); \
+    result_01 = svadd_x(pg1, result_01, UChi_01); \
+    result_21 = svadd_x(pg1, result_21, UChi_01); \
+    result_02 = svadd_x(pg1, result_02, UChi_02); \
+    result_22 = svadd_x(pg1, result_22, UChi_02); \
+    result_10 = svadd_x(pg1, result_10, UChi_10); \
+    result_30 = svadd_x(pg1, result_30, UChi_10); \
+
result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_31 = svadd_x(pg1, result_31, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_32 = svadd_x(pg1, result_32, UChi_12); + +// TM_RECON_ACCUM +#define TM_RECON_ACCUM_A64FXf \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_20 = svsub_x(pg1, result_20, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_21 = svsub_x(pg1, result_21, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_22 = svsub_x(pg1, result_22, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_30 = svsub_x(pg1, result_30, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_31 = svsub_x(pg1, result_31, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_32 = svsub_x(pg1, result_32, UChi_12); + +// ZERO_PSI +#define ZERO_PSI_A64FXf \ + result_00 = __svzero(result_00); \ + result_01 = __svzero(result_01); \ + result_02 = __svzero(result_02); \ + result_10 = __svzero(result_10); \ + result_11 = __svzero(result_11); \ + result_12 = __svzero(result_12); \ + result_20 = __svzero(result_20); \ + result_21 = __svzero(result_21); \ + result_22 = __svzero(result_22); \ + result_30 = __svzero(result_30); \ + result_31 = __svzero(result_31); \ + result_32 = __svzero(result_32); + diff --git a/Grid/Fujitsu_A64FX_undef.h b/Grid/Fujitsu_A64FX_undef.h new file mode 100644 index 00000000..779485d4 --- /dev/null +++ b/Grid/Fujitsu_A64FX_undef.h @@ -0,0 +1,69 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: Fujitsu_A64FX_undef.h + + Copyright (C) 2020 + +Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ + +#undef LOAD_CHIMU_A64FXd +#undef LOAD_CHIMU_A64FXf +#undef PREFETCH_CHIMU_L1 +#undef PREFETCH_GAUGE_L1 +#undef PREFETCH_CHIMU_L2 +#undef PREFETCH_GAUGE_L2 +#undef PF_GAUGE +#undef PREFETCH1_CHIMU +#undef PREFETCH_CHIMU +#undef LOCK_GAUGE +#undef UNLOCK_GAUGE +#undef MASK_REGS +#undef COMPLEX_SIGNS +#undef LOAD64 +#undef SAVE_RESULT +#undef ADD_RESULT +#undef MULT_2SPIN_DIR_PF +#undef MAYBEPERM +#undef LOAD_CHI +#undef ZERO_PSI +#undef XP_PROJMEM +#undef YP_PROJMEM +#undef ZP_PROJMEM +#undef TP_PROJMEM +#undef XM_PROJMEM +#undef YM_PROJMEM +#undef ZM_PROJMEM +#undef TM_PROJMEM +#undef XP_RECON +#undef XM_RECON +#undef YM_RECON_ACCUM +#undef ZM_RECON_ACCUM +#undef TM_RECON_ACCUM +#undef XP_RECON_ACCUM +#undef YP_RECON_ACCUM +#undef ZP_RECON_ACCUM +#undef TP_RECON_ACCUM +#undef PERMUTE_DIR0 +#undef PERMUTE_DIR1 +#undef PERMUTE_DIR2 +#undef PERMUTE_DIR3 From 2b6457dd9a43ff6d910c0c5835de5fd09f90765a Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 9 Apr 2020 15:13:19 +0200 Subject: [PATCH 018/147] added xp/xm recon accum --- Grid/Fujitsu_A64FX_asm_double.h | 2 ++ Grid/Fujitsu_A64FX_asm_single.h | 2 ++ Grid/Fujitsu_A64FX_intrin_double.h | 2 ++ Grid/Fujitsu_A64FX_intrin_single.h | 2 ++ 4 files changed, 8 insertions(+) diff --git a/Grid/Fujitsu_A64FX_asm_double.h b/Grid/Fujitsu_A64FX_asm_double.h index 4da8b3fe..0bc77176 100644 --- a/Grid/Fujitsu_A64FX_asm_double.h +++ b/Grid/Fujitsu_A64FX_asm_double.h @@ -54,6 +54,8 @@ Author: Nils Meyer #define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd #define XP_RECON XP_RECON_A64FXd #define XM_RECON XM_RECON_A64FXd +#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXd +#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd #define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd #define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd #define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd diff --git a/Grid/Fujitsu_A64FX_asm_single.h b/Grid/Fujitsu_A64FX_asm_single.h index bffd6990..7bb17e75 100644 --- a/Grid/Fujitsu_A64FX_asm_single.h +++ b/Grid/Fujitsu_A64FX_asm_single.h @@ -54,6 +54,8 @@ Author: Nils Meyer #define TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf #define XP_RECON XP_RECON_A64FXf #define XM_RECON XM_RECON_A64FXf +#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXf +#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf #define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf #define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf #define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf diff --git a/Grid/Fujitsu_A64FX_intrin_double.h b/Grid/Fujitsu_A64FX_intrin_double.h index 59f149eb..a5fbc68b 100644 --- a/Grid/Fujitsu_A64FX_intrin_double.h +++ b/Grid/Fujitsu_A64FX_intrin_double.h @@ -54,6 +54,8 @@ Author: Nils Meyer #define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd #define XP_RECON XP_RECON_A64FXd #define XM_RECON XM_RECON_A64FXd +#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXd +#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd #define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd #define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd #define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd diff --git a/Grid/Fujitsu_A64FX_intrin_single.h b/Grid/Fujitsu_A64FX_intrin_single.h index 333cbc1b..69d1cbbd 100644 --- a/Grid/Fujitsu_A64FX_intrin_single.h +++ b/Grid/Fujitsu_A64FX_intrin_single.h @@ -54,6 +54,8 @@ Author: Nils Meyer #define TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf #define XP_RECON XP_RECON_A64FXf #define XM_RECON XM_RECON_A64FXf 
+#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXf
+#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf
 #define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf
 #define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf
 #define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf

From 123f6b7a61753d672bb9c7ba235f5459f09366e3 Mon Sep 17 00:00:00 2001
From: nmeyer-ur
Date: Thu, 9 Apr 2020 15:17:19 +0200
Subject: [PATCH 019/147] add XP/XM_RECON_ACCUM to the undef header

---
 Grid/Fujitsu_A64FX_undef.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Grid/Fujitsu_A64FX_undef.h b/Grid/Fujitsu_A64FX_undef.h
index 779485d4..1cb143d4 100644
--- a/Grid/Fujitsu_A64FX_undef.h
+++ b/Grid/Fujitsu_A64FX_undef.h
@@ -56,6 +56,8 @@ Author: Nils Meyer
 #undef TM_PROJMEM
 #undef XP_RECON
 #undef XM_RECON
+#undef XP_RECON_ACCUM
+#undef XM_RECON_ACCUM
 #undef YM_RECON_ACCUM
 #undef ZM_RECON_ACCUM
 #undef TM_RECON_ACCUM
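The pairing above is deliberate: every interface macro bound in the four architecture headers also gets an entry in Fujitsu_A64FX_undef.h, so a single translation unit can clear the bindings and re-bind the generic kernel body to another implementation without redefinition warnings. A minimal sketch of the mechanism follows; the names RECON_STEP, kernel_v1 and kernel_v2 are hypothetical and only illustrate the pattern, they are not Grid code.

#include <stdio.h>

/* A generic "kernel" written against a macro interface; an undef/define
   pair re-binds it, mirroring Fujitsu_A64FX_undef.h followed by one of
   the Fujitsu_A64FX_*.h headers. */
#define RECON_STEP(x) ((x) + 1)              /* first binding  */
static int kernel_v1(int x) { return RECON_STEP(x); }

#undef RECON_STEP                            /* the undef header's job     */
#define RECON_STEP(x) ((x) * 2)              /* second binding, no warning */
static int kernel_v2(int x) { return RECON_STEP(x); }

int main(void)
{
    printf("%d %d\n", kernel_v1(3), kernel_v2(3));   /* prints: 4 6 */
    return 0;
}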
From d5708e0eb2e2aef69bcc59e40712afd789e558b4 Mon Sep 17 00:00:00 2001
From: nmeyer-ur
Date: Thu, 9 Apr 2020 15:43:34 +0200
Subject: [PATCH 020/147] remove duplicate XP_RECON_ACCUM bindings

---
 Grid/Fujitsu_A64FX_asm_double.h    | 1 -
 Grid/Fujitsu_A64FX_asm_single.h    | 1 -
 Grid/Fujitsu_A64FX_intrin_double.h | 1 -
 Grid/Fujitsu_A64FX_intrin_single.h | 1 -
 Grid/Fujitsu_A64FX_undef.h         | 1 -
 5 files changed, 5 deletions(-)

diff --git a/Grid/Fujitsu_A64FX_asm_double.h b/Grid/Fujitsu_A64FX_asm_double.h
index 0bc77176..48d07297 100644
--- a/Grid/Fujitsu_A64FX_asm_double.h
+++ b/Grid/Fujitsu_A64FX_asm_double.h
@@ -54,7 +54,6 @@ Author: Nils Meyer
 #define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd
 #define XP_RECON XP_RECON_A64FXd
 #define XM_RECON XM_RECON_A64FXd
-#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXd
 #define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd
 #define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd
 #define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd
 #define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd
diff --git a/Grid/Fujitsu_A64FX_asm_single.h b/Grid/Fujitsu_A64FX_asm_single.h
index 7bb17e75..588fc9c4 100644
--- a/Grid/Fujitsu_A64FX_asm_single.h
+++ b/Grid/Fujitsu_A64FX_asm_single.h
@@ -54,7 +54,6 @@ Author: Nils Meyer
 #define TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf
 #define XP_RECON XP_RECON_A64FXf
 #define XM_RECON XM_RECON_A64FXf
-#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXf
 #define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf
 #define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf
 #define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf
 #define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf
diff --git a/Grid/Fujitsu_A64FX_intrin_double.h b/Grid/Fujitsu_A64FX_intrin_double.h
index a5fbc68b..e7469ffb 100644
--- a/Grid/Fujitsu_A64FX_intrin_double.h
+++ b/Grid/Fujitsu_A64FX_intrin_double.h
@@ -54,7 +54,6 @@ Author: Nils Meyer
 #define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd
 #define XP_RECON XP_RECON_A64FXd
 #define XM_RECON XM_RECON_A64FXd
-#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXd
 #define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd
 #define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd
 #define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd
 #define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd
diff --git a/Grid/Fujitsu_A64FX_intrin_single.h b/Grid/Fujitsu_A64FX_intrin_single.h
index 69d1cbbd..e44374f0 100644
--- a/Grid/Fujitsu_A64FX_intrin_single.h
+++ b/Grid/Fujitsu_A64FX_intrin_single.h
@@ -54,7 +54,6 @@ Author: Nils Meyer
 #define TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf
 #define XP_RECON XP_RECON_A64FXf
 #define XM_RECON XM_RECON_A64FXf
-#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXf
 #define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf
 #define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf
 #define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf
 #define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf
diff --git a/Grid/Fujitsu_A64FX_undef.h b/Grid/Fujitsu_A64FX_undef.h
index 1cb143d4..a09e1698 100644
--- a/Grid/Fujitsu_A64FX_undef.h
+++ b/Grid/Fujitsu_A64FX_undef.h
@@ -56,7 +56,6 @@ Author: Nils Meyer
 #undef TM_PROJMEM
 #undef XP_RECON
 #undef XM_RECON
-#undef XP_RECON_ACCUM
 #undef XM_RECON_ACCUM
 #undef YM_RECON_ACCUM
 #undef ZM_RECON_ACCUM
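For reading the *_RECON_ACCUM blocks defined earlier: svcadd_x with rotation 90 adds i times its second operand to the first, and rotation 270 adds -i times it, so XP_RECON_ACCUM accumulates result_0c += UChi_0c and result_3c += -i*UChi_0c for each color c. A scalar model of the two rotations, illustrative only and not Grid code:

#include <complex.h>
#include <stdio.h>

/* svcadd_x(pg, a, b, 90)  ~ a + i*b
   svcadd_x(pg, a, b, 270) ~ a - i*b   (per complex element pair) */
static float complex cadd_90 (float complex a, float complex b) { return a + I * b; }
static float complex cadd_270(float complex a, float complex b) { return a - I * b; }

int main(void)
{
    float complex uchi = 1.0f + 2.0f * I;   /* one UChi_00 element          */
    float complex r00 = 0.0f, r30 = 0.0f;   /* result_00, result_30         */
    r00 += uchi;                            /* result_00 = svadd_x(...)     */
    r30  = cadd_270(r30, uchi);             /* result_30 = svcadd_x(..,270) */
    printf("r00 = %g%+gi  r30 = %g%+gi\n",
           crealf(r00), cimagf(r00), crealf(r30), cimagf(r30));
    return 0;
}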
From d79ab03a6cd6125a850233ff8dd9c6dc23bd8e60 Mon Sep 17 00:00:00 2001
From: nmeyer-ur
Date: Thu, 9 Apr 2020 16:19:25 +0200
Subject: [PATCH 021/147] implement ADD_RESULT via ADD_RESULT_INTERNAL

---
 Grid/Fujitsu_A64FX_asm_double.h    | 22 +++++++++++++++++++++-
 Grid/Fujitsu_A64FX_asm_single.h    | 22 +++++++++++++++++++++-
 Grid/Fujitsu_A64FX_intrin_double.h | 17 ++++++++++++++++-
 Grid/Fujitsu_A64FX_intrin_single.h | 17 ++++++++++++++++-
 4 files changed, 74 insertions(+), 4 deletions(-)

diff --git a/Grid/Fujitsu_A64FX_asm_double.h b/Grid/Fujitsu_A64FX_asm_double.h
index 48d07297..4af5a91f 100644
--- a/Grid/Fujitsu_A64FX_asm_double.h
+++ b/Grid/Fujitsu_A64FX_asm_double.h
@@ -43,7 +43,7 @@ Author: Nils Meyer
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
 #define ZERO_PSI ZERO_PSI_A64FXd
-#define ADD_RESULT(A,B)
+#define ADD_RESULT(base,basep) LOAD_CHI_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
 #define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd
 #define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd
 #define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd
@@ -691,3 +691,23 @@ asm ( \
   : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
 );
 
+// ADD_RESULT_INTERNAL
+#define ADD_RESULT_INTERNAL_A64FXd \
+asm ( \
+    "fadd z0.d, p5/m, z0.d, z12.d \n\t" \
+    "fadd z1.d, p5/m, z1.d, z13.d \n\t" \
+    "fadd z2.d, p5/m, z2.d, z14.d \n\t" \
+    "fadd z3.d, p5/m, z3.d, z15.d \n\t" \
+    "fadd z4.d, p5/m, z4.d, z16.d \n\t" \
+    "fadd z5.d, p5/m, z5.d, z17.d \n\t" \
+    "fadd z6.d, p5/m, z6.d, z24.d \n\t" \
+    "fadd z7.d, p5/m, z7.d, z25.d \n\t" \
+    "fadd z8.d, p5/m, z8.d, z26.d \n\t" \
+    "fadd z9.d, p5/m, z9.d, z27.d \n\t" \
+    "fadd z10.d, p5/m, z10.d, z28.d \n\t" \
+    "fadd z11.d, p5/m, z11.d, z29.d \n\t" \
+    : \
+    : \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+);
+
diff --git a/Grid/Fujitsu_A64FX_asm_single.h b/Grid/Fujitsu_A64FX_asm_single.h
index 588fc9c4..08d2fc53 100644
--- a/Grid/Fujitsu_A64FX_asm_single.h
+++ b/Grid/Fujitsu_A64FX_asm_single.h
@@ -43,7 +43,7 @@ Author: Nils Meyer
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
 #define ZERO_PSI ZERO_PSI_A64FXf
-#define ADD_RESULT(A,B)
+#define ADD_RESULT(base,basep) LOAD_CHI_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
 #define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf
 #define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf
 #define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf
@@ -705,3 +705,23 @@ asm ( \
   : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
 );
 
+// ADD_RESULT_INTERNAL
+#define ADD_RESULT_INTERNAL_A64FXf \
+asm ( \
+    "fadd z0.s, p5/m, z0.s, z12.s \n\t" \
+    "fadd z1.s, p5/m, z1.s, z13.s \n\t" \
+    "fadd z2.s, p5/m, z2.s, z14.s \n\t" \
+    "fadd z3.s, p5/m, z3.s, z15.s \n\t" \
+    "fadd z4.s, p5/m, z4.s, z16.s \n\t" \
+    "fadd z5.s, p5/m, z5.s, z17.s \n\t" \
+    "fadd z6.s, p5/m, z6.s, z24.s \n\t" \
+    "fadd z7.s, p5/m, z7.s, z25.s \n\t" \
+    "fadd z8.s, p5/m, z8.s, z26.s \n\t" \
+    "fadd z9.s, p5/m, z9.s, z27.s \n\t" \
+    "fadd z10.s, p5/m, z10.s, z28.s \n\t" \
+    "fadd z11.s, p5/m, z11.s, z29.s \n\t" \
+    : \
+    : \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+);
+
diff --git a/Grid/Fujitsu_A64FX_intrin_double.h b/Grid/Fujitsu_A64FX_intrin_double.h
index e7469ffb..a0167ca9 100644
--- a/Grid/Fujitsu_A64FX_intrin_double.h
+++ b/Grid/Fujitsu_A64FX_intrin_double.h
@@ -43,7 +43,7 @@ Author: Nils Meyer
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
 #define ZERO_PSI ZERO_PSI_A64FXd
-#define ADD_RESULT(A,B)
+#define ADD_RESULT(base,basep) LOAD_CHI_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
 #define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd
 #define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd
 #define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd
@@ -567,3 +567,18 @@ Author: Nils Meyer
     result_31 = __svzero(result_31); \
     result_32 = __svzero(result_32);
 
+// ADD_RESULT_INTERNAL
+#define ADD_RESULT_INTERNAL_A64FXd \
+    result_00 = svadd_x(pg1, result_00, Chimu_00); \
+    result_01 = svadd_x(pg1, result_01, Chimu_01); \
+    result_02 = svadd_x(pg1, result_02, Chimu_02); \
+    result_10 = svadd_x(pg1, result_10, Chimu_10); \
+    result_11 = svadd_x(pg1, result_11, Chimu_11); \
+    result_12 = svadd_x(pg1, result_12, Chimu_12); \
+    result_20 = svadd_x(pg1, result_20, Chimu_20); \
+    result_21 = svadd_x(pg1, result_21, Chimu_21); \
+    result_22 = svadd_x(pg1, result_22, Chimu_22); \
+    result_30 = svadd_x(pg1, result_30, Chimu_30); \
+    result_31 = svadd_x(pg1, result_31, Chimu_31); \
+    result_32 = svadd_x(pg1, result_32, Chimu_32);
+
diff --git a/Grid/Fujitsu_A64FX_intrin_single.h b/Grid/Fujitsu_A64FX_intrin_single.h
index e44374f0..6849506b 100644
--- a/Grid/Fujitsu_A64FX_intrin_single.h
+++ b/Grid/Fujitsu_A64FX_intrin_single.h
@@ -43,7 +43,7 @@ Author: Nils Meyer
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
 #define ZERO_PSI ZERO_PSI_A64FXf
-#define ADD_RESULT(A,B)
+#define ADD_RESULT(base,basep) LOAD_CHI_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
 #define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf
 #define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf
 #define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf
@@ -576,3 +576,18 @@ Author: Nils Meyer
     result_31 = __svzero(result_31); \
     result_32 = __svzero(result_32);
 
+// ADD_RESULT_INTERNAL
+#define ADD_RESULT_INTERNAL_A64FXf \
+    result_00 = svadd_x(pg1, result_00, Chimu_00); \
+    result_01 = svadd_x(pg1, result_01, Chimu_01); \
+    result_02 = svadd_x(pg1, result_02, Chimu_02); \
+    result_10 = svadd_x(pg1, result_10, Chimu_10); \
+    result_11 = svadd_x(pg1, result_11, Chimu_11); \
+    result_12 = svadd_x(pg1, result_12, Chimu_12); \
+    result_20 = svadd_x(pg1, result_20, Chimu_20); \
+    result_21 = svadd_x(pg1, result_21, Chimu_21); \
+    result_22 = svadd_x(pg1, result_22, Chimu_22); \
+    result_30 = svadd_x(pg1, result_30, Chimu_30); \
+    result_31 = svadd_x(pg1, result_31, Chimu_31); \
+    result_32 = svadd_x(pg1, result_32, Chimu_32);
+
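The new ADD_RESULT_INTERNAL blocks are plain predicated accumulates: the result registers z0-z11 (the result_* variables in the intrinsics variant) gain the spinor just loaded into z12-z17 and z24-z29. Note that LOAD_CHI only fills z12-z17 (Chi_00 through Chi_12), while ADD_RESULT_INTERNAL also reads z24-z29 (Chimu_20 through Chimu_32); the next commit switches ADD_RESULT to LOAD_CHIMU, which fills both halves. The same predicated add, written as a self-contained ACLE loop over an array; this is a sketch assuming an SVE toolchain, and add_accum_f64 is a hypothetical helper, not Grid code:

#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>

/* r[i] += x[i] under a while-loop predicate; the loop form of
   "fadd zN.d, p5/m, zN.d, zM.d". */
void add_accum_f64(double *r, const double *x, int64_t n)
{
    for (int64_t i = 0; i < n; i += svcntd()) {
        svbool_t pg = svwhilelt_b64(i, n);        /* predicate, like p5 */
        svfloat64_t rv = svld1(pg, r + i);
        svfloat64_t xv = svld1(pg, x + i);
        svst1(pg, r + i, svadd_x(pg, rv, xv));    /* predicated fadd    */
    }
}
#endif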
From 304762e7ac26aadf8909412bfe590e365a4cdb1c Mon Sep 17 00:00:00 2001
From: nmeyer-ur
Date: Thu, 9 Apr 2020 16:26:01 +0200
Subject: [PATCH 022/147] ADD_RESULT: load the full spinor via LOAD_CHIMU

---
 Grid/Fujitsu_A64FX_asm_double.h    | 2 +-
 Grid/Fujitsu_A64FX_asm_single.h    | 2 +-
 Grid/Fujitsu_A64FX_intrin_double.h | 2 +-
 Grid/Fujitsu_A64FX_intrin_single.h | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Grid/Fujitsu_A64FX_asm_double.h b/Grid/Fujitsu_A64FX_asm_double.h
index 4af5a91f..2bc5eb6e 100644
--- a/Grid/Fujitsu_A64FX_asm_double.h
+++ b/Grid/Fujitsu_A64FX_asm_double.h
@@ -43,7 +43,7 @@ Author: Nils Meyer
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
 #define ZERO_PSI ZERO_PSI_A64FXd
-#define ADD_RESULT(base,basep) LOAD_CHI_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
+#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
 #define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd
 #define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd
 #define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd
diff --git a/Grid/Fujitsu_A64FX_asm_single.h b/Grid/Fujitsu_A64FX_asm_single.h
index 08d2fc53..f20396b9 100644
--- a/Grid/Fujitsu_A64FX_asm_single.h
+++ b/Grid/Fujitsu_A64FX_asm_single.h
@@ -43,7 +43,7 @@ Author: Nils Meyer
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
 #define ZERO_PSI ZERO_PSI_A64FXf
-#define ADD_RESULT(base,basep) LOAD_CHI_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
+#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
 #define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf
 #define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf
 #define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf
diff --git a/Grid/Fujitsu_A64FX_intrin_double.h b/Grid/Fujitsu_A64FX_intrin_double.h
index a0167ca9..f58b86df 100644
--- a/Grid/Fujitsu_A64FX_intrin_double.h
+++ b/Grid/Fujitsu_A64FX_intrin_double.h
@@ -43,7 +43,7 @@ Author: Nils Meyer
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
 #define ZERO_PSI ZERO_PSI_A64FXd
-#define ADD_RESULT(base,basep) LOAD_CHI_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
+#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
 #define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd
 #define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd
 #define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd
diff --git a/Grid/Fujitsu_A64FX_intrin_single.h b/Grid/Fujitsu_A64FX_intrin_single.h
index 6849506b..93bd1dc7 100644
--- a/Grid/Fujitsu_A64FX_intrin_single.h
+++ b/Grid/Fujitsu_A64FX_intrin_single.h
@@ -43,7 +43,7 @@ Author: Nils Meyer
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
 #define ZERO_PSI ZERO_PSI_A64FXf
-#define ADD_RESULT(base,basep) LOAD_CHI_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
+#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
 #define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf
 #define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf
 #define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf
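With this change ADD_RESULT(base,basep) reads the full spinor at base, adds the accumulated result, and stores it back; RESULT writes to base, and the basep argument is unused at this point. A scalar model of the LOAD_CHIMU; ADD_RESULT_INTERNAL; RESULT sequence, illustrative only (the 4-element array stands in for the 24 reals of a site spinor, and add_result is a hypothetical name):

#include <stdio.h>

/* base[] plays the spinor in memory, result[] the z0-z11 accumulator. */
static void add_result(double *base, double *result, int n)
{
    for (int i = 0; i < n; ++i) {
        double chimu = base[i];   /* LOAD_CHIMU          */
        result[i] += chimu;       /* ADD_RESULT_INTERNAL */
        base[i] = result[i];      /* RESULT              */
    }
}

int main(void)
{
    double base[4] = {1, 1, 1, 1}, result[4] = {2, 3, 4, 5};
    add_result(base, result, 4);
    printf("%g %g %g %g\n", base[0], base[1], base[2], base[3]);  /* 3 4 5 6 */
    return 0;
}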
From bd310932f7f974fc7e276fcd1bc3497ffb4bff52 Mon Sep 17 00:00:00 2001
From: nmeyer-ur
Date: Thu, 9 Apr 2020 16:32:31 +0200
Subject: [PATCH 023/147] move A64FX headers to Grid/simd

---
 Grid/Fujitsu_A64FX_asm_double.h             | 713 -----------------
 Grid/Fujitsu_A64FX_asm_single.h             | 727 ------------------
 Grid/Fujitsu_A64FX_intrin_double.h          | 584 --------------
 Grid/Fujitsu_A64FX_intrin_single.h          | 593 --------------
 Grid/Fujitsu_A64FX_undef.h                  |  70 --
 .../implementation/WilsonKernelsAsmA64FX.h  |  11 +-
 Grid/simd/Fujitsu_A64FX_asm_double.h        |  22 +
 Grid/simd/Fujitsu_A64FX_asm_single.h        |  22 +
 Grid/simd/Fujitsu_A64FX_intrin_double.h     |  17 +
 Grid/simd/Fujitsu_A64FX_intrin_single.h     |  17 +
 Grid/simd/Fujitsu_A64FX_undef.h             |   2 +
 11 files changed, 84 insertions(+), 2694 deletions(-)
 delete mode 100644 Grid/Fujitsu_A64FX_asm_double.h
 delete mode 100644 Grid/Fujitsu_A64FX_asm_single.h
 delete mode 100644 Grid/Fujitsu_A64FX_intrin_double.h
 delete mode 100644 Grid/Fujitsu_A64FX_intrin_single.h
 delete mode 100644 Grid/Fujitsu_A64FX_undef.h

diff --git a/Grid/Fujitsu_A64FX_asm_double.h b/Grid/Fujitsu_A64FX_asm_double.h
deleted file mode 100644
index 2bc5eb6e..00000000
--- a/Grid/Fujitsu_A64FX_asm_double.h
+++ /dev/null
@@ -1,713 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: XXX
-
-    Copyright (C) 2020
-
-Author: Nils Meyer
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/* END LEGAL */
-#define LOAD_CHIMU_A64FXd(x) LOAD_CHIMU_INTERLEAVED_A64FXd(x)
-#define PREFETCH_CHIMU_L1(A)
-#define PREFETCH_GAUGE_L1(A)
-#define PREFETCH_CHIMU_L2(A)
-#define PREFETCH_GAUGE_L2(A)
-#define PF_GAUGE(A)
-#define PREFETCH1_CHIMU(A)
-#define PREFETCH_CHIMU(A)
-#define LOCK_GAUGE(A)
-#define UNLOCK_GAUGE(A)
-#define MASK_REGS DECLARATIONS_A64FXd
-#define COMPLEX_SIGNS(A)
-#define LOAD64(A,B)
-#define SAVE_RESULT(A,B) RESULT_A64FXd(A)
-#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXd(A)
-#define MAYBEPERM(A,perm) if (perm) { A ; }
-#define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
-#define ZERO_PSI ZERO_PSI_A64FXd
-#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
-#define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd
-#define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd
-#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd
-#define TP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TP_PROJ_A64FXd
-#define XM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XM_PROJ_A64FXd
-#define YM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YM_PROJ_A64FXd
-#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZM_PROJ_A64FXd
-#define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd
-#define XP_RECON XP_RECON_A64FXd
-#define XM_RECON XM_RECON_A64FXd
-#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd
-#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd
-#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd
-#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd
-#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXd
-#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd
-#define ZP_RECON_ACCUM
ZP_RECON_ACCUM_A64FXd -#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd -#define PERMUTE_DIR0 PERM0_A64FXd -#define PERMUTE_DIR1 PERM1_A64FXd -#define PERMUTE_DIR2 PERM2_A64FXd -#define PERMUTE_DIR3 PERM3_A64FXd -// DECLARATIONS -#define DECLARATIONS_A64FXd \ - const uint64_t lut[4][8] = { \ - {4, 5, 6, 7, 0, 1, 2, 3}, \ - {2, 3, 0, 1, 6, 7, 4, 5}, \ - {1, 0, 3, 2, 5, 4, 7, 6}, \ - {0, 1, 2, 4, 5, 6, 7, 8} };\ -asm ( \ - "fmov z31.d , 0 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// RESULT -#define RESULT_A64FXd(base) \ -{ \ -asm ( \ - "stnt1d { z0.d }, p5, [%[storeptr], -6, mul vl] \n\t" \ - "stnt1d { z1.d }, p5, [%[storeptr], -5, mul vl] \n\t" \ - "stnt1d { z2.d }, p5, [%[storeptr], -4, mul vl] \n\t" \ - "stnt1d { z3.d }, p5, [%[storeptr], -3, mul vl] \n\t" \ - "stnt1d { z4.d }, p5, [%[storeptr], -2, mul vl] \n\t" \ - "stnt1d { z5.d }, p5, [%[storeptr], -1, mul vl] \n\t" \ - "stnt1d { z6.d }, p5, [%[storeptr], 0, mul vl] \n\t" \ - "stnt1d { z7.d }, p5, [%[storeptr], 1, mul vl] \n\t" \ - "stnt1d { z8.d }, p5, [%[storeptr], 2, mul vl] \n\t" \ - "stnt1d { z9.d }, p5, [%[storeptr], 3, mul vl] \n\t" \ - "stnt1d { z10.d }, p5, [%[storeptr], 4, mul vl] \n\t" \ - "stnt1d { z11.d }, p5, [%[storeptr], 5, mul vl] \n\t" \ - : \ - : [storeptr] "r" (base + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_CHIMU_L2 (prefetch to L2) -#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ -{ \ -asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_CHIMU_L1 (prefetch to L1) -#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \ -{ \ -asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_GAUGE_L2 (prefetch to L2) -#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ -{ \ - const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \ -asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ - : \ - : [fetchptr] "r" (baseU) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_GAUGE_L1 (prefetch to L1) -#define PREFETCH_GAUGE_L1_INTERNAL(A)_A64FXd \ -{ \ - const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \ -asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ - "prfd PLDL1STRM, 
p5, [%[fetchptr], 4, MUL VL] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ - : \ - : [fetchptr] "r" (baseU) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHI -#define LOAD_CHI_A64FXd(base) \ -{ \ -asm ( \ - "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU -#define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \ -{ \ -asm ( \ - "ptrue p5.d \n\t" \ - "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z27.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z24.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z28.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z25.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z29.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU_0213 -#define LOAD_CHIMU_0213_A64FXd \ -{ \ - const SiteSpinor & ref(in[offset]); \ -asm ( \ - "ptrue p5.d \n\t" \ - "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z24.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z25.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z27.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z28.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z29.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (&ref[2][0]) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU_0312 -#define LOAD_CHIMU_0312_A64FXd \ -{ \ - const SiteSpinor & ref(in[offset]); \ -asm ( \ - "ptrue p5.d \n\t" \ - "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z27.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z28.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z14.d }, 
p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z29.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z24.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z25.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (&ref[2][0]) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PERM0 -#define PERM0_A64FXd \ -asm ( \ - "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "tbl z12.d, { z12.d }, z30.d \n\t" \ - "tbl z13.d, { z13.d }, z30.d \n\t" \ - "tbl z14.d, { z14.d }, z30.d \n\t" \ - "tbl z15.d, { z15.d }, z30.d \n\t" \ - "tbl z16.d, { z16.d }, z30.d \n\t" \ - "tbl z17.d, { z17.d }, z30.d \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (0) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PERM1 -#define PERM1_A64FXd \ -asm ( \ - "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "tbl z12.d, { z12.d }, z30.d \n\t" \ - "tbl z13.d, { z13.d }, z30.d \n\t" \ - "tbl z14.d, { z14.d }, z30.d \n\t" \ - "tbl z15.d, { z15.d }, z30.d \n\t" \ - "tbl z16.d, { z16.d }, z30.d \n\t" \ - "tbl z17.d, { z17.d }, z30.d \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (1) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PERM2 -#define PERM2_A64FXd \ -asm ( \ - "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "tbl z12.d, { z12.d }, z30.d \n\t" \ - "tbl z13.d, { z13.d }, z30.d \n\t" \ - "tbl z14.d, { z14.d }, z30.d \n\t" \ - "tbl z15.d, { z15.d }, z30.d \n\t" \ - "tbl z16.d, { z16.d }, z30.d \n\t" \ - "tbl z17.d, { z17.d }, z30.d \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (2) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PERM3 -#define PERM3_A64FXd - -// MULT_2SPIN -#define MULT_2SPIN_A64FXd(A) \ -{ \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ -asm ( \ - "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z27.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "fmov z18.d , 0 \n\t" \ - "fmov z21.d , 0 \n\t" \ - "fmov z19.d , 0 \n\t" \ - "fmov z22.d , 0 \n\t" \ - "fmov z20.d , 0 \n\t" \ - "fmov z23.d , 0 \n\t" \ - "fcmla z18.d, p5/m, z24.d, z12.d, 0 \n\t" \ - "fcmla z21.d, p5/m, z24.d, z15.d, 0 \n\t" \ - "fcmla z19.d, p5/m, z25.d, z12.d, 0 \n\t" \ - "fcmla z22.d, p5/m, z25.d, z15.d, 0 \n\t" \ - "fcmla z20.d, p5/m, z26.d, z12.d, 0 \n\t" \ - "fcmla z23.d, p5/m, z26.d, z15.d, 0 \n\t" \ - "fcmla z18.d, p5/m, z24.d, z12.d, 90 \n\t" \ - "fcmla z21.d, p5/m, z24.d, 
z15.d, 90 \n\t" \ - "fcmla z19.d, p5/m, z25.d, z12.d, 90 \n\t" \ - "fcmla z22.d, p5/m, z25.d, z15.d, 90 \n\t" \ - "fcmla z20.d, p5/m, z26.d, z12.d, 90 \n\t" \ - "fcmla z23.d, p5/m, z26.d, z15.d, 90 \n\t" \ - "ld1d { z24.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z25.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - "fcmla z18.d, p5/m, z27.d, z13.d, 0 \n\t" \ - "fcmla z21.d, p5/m, z27.d, z16.d, 0 \n\t" \ - "fcmla z19.d, p5/m, z28.d, z13.d, 0 \n\t" \ - "fcmla z22.d, p5/m, z28.d, z16.d, 0 \n\t" \ - "fcmla z20.d, p5/m, z29.d, z13.d, 0 \n\t" \ - "fcmla z23.d, p5/m, z29.d, z16.d, 0 \n\t" \ - "fcmla z18.d, p5/m, z27.d, z13.d, 90 \n\t" \ - "fcmla z21.d, p5/m, z27.d, z16.d, 90 \n\t" \ - "fcmla z19.d, p5/m, z28.d, z13.d, 90 \n\t" \ - "fcmla z22.d, p5/m, z28.d, z16.d, 90 \n\t" \ - "fcmla z20.d, p5/m, z29.d, z13.d, 90 \n\t" \ - "fcmla z23.d, p5/m, z29.d, z16.d, 90 \n\t" \ - "fcmla z18.d, p5/m, z24.d, z14.d, 0 \n\t" \ - "fcmla z21.d, p5/m, z24.d, z17.d, 0 \n\t" \ - "fcmla z19.d, p5/m, z25.d, z14.d, 0 \n\t" \ - "fcmla z22.d, p5/m, z25.d, z17.d, 0 \n\t" \ - "fcmla z20.d, p5/m, z26.d, z14.d, 0 \n\t" \ - "fcmla z23.d, p5/m, z26.d, z17.d, 0 \n\t" \ - "fcmla z18.d, p5/m, z24.d, z14.d, 90 \n\t" \ - "fcmla z21.d, p5/m, z24.d, z17.d, 90 \n\t" \ - "fcmla z19.d, p5/m, z25.d, z14.d, 90 \n\t" \ - "fcmla z22.d, p5/m, z25.d, z17.d, 90 \n\t" \ - "fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \ - "fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \ - : \ - : [fetchptr] "r" (baseU + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// XP_PROJ -#define XP_PROJ_A64FXd \ -{ \ -asm ( \ - "fcadd z12.d, p5/m, z12.d, z27.d, 90 \n\t" \ - "fcadd z13.d, p5/m, z13.d, z28.d, 90 \n\t" \ - "fcadd z14.d, p5/m, z14.d, z29.d, 90 \n\t" \ - "fcadd z15.d, p5/m, z15.d, z24.d, 90 \n\t" \ - "fcadd z16.d, p5/m, z16.d, z25.d, 90 \n\t" \ - "fcadd z17.d, p5/m, z17.d, z26.d, 90 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XP_RECON -#define XP_RECON_A64FXd \ -asm ( \ - "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \ - "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \ - "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \ - "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \ - "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \ - "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \ - "mov z0.d, z18.d \n\t" \ - "mov z1.d, z19.d \n\t" \ - "mov z2.d, z20.d \n\t" \ - "mov z3.d, z21.d \n\t" \ - "mov z4.d, z22.d \n\t" \ - "mov z5.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// XP_RECON_ACCUM -#define XP_RECON_ACCUM_A64FXd \ -asm ( \ - "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fcadd z8.d, p5/m, z8.d, z23.d, 270 
\n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YP_PROJ -#define YP_PROJ_A64FXd \ -{ \ -asm ( \ - "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "fsub z12.d, p5/m, z12.d, z27.d \n\t" \ - "fsub z13.d, p5/m, z13.d, z28.d \n\t" \ - "fsub z14.d, p5/m, z14.d, z29.d \n\t" \ - "fadd z15.d, p5/m, z15.d, z24.d \n\t" \ - "fadd z16.d, p5/m, z16.d, z25.d \n\t" \ - "fadd z17.d, p5/m, z17.d, z26.d \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (2) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// ZP_PROJ -#define ZP_PROJ_A64FXd \ -{ \ -asm ( \ - "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "fcadd z12.d, p5/m, z12.d, z24.d, 90 \n\t" \ - "fcadd z13.d, p5/m, z13.d, z25.d, 90 \n\t" \ - "fcadd z14.d, p5/m, z14.d, z26.d, 90 \n\t" \ - "fcadd z15.d, p5/m, z15.d, z27.d, 270 \n\t" \ - "fcadd z16.d, p5/m, z16.d, z28.d, 270 \n\t" \ - "fcadd z17.d, p5/m, z17.d, z29.d, 270 \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (1) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// TP_PROJ -#define TP_PROJ_A64FXd \ -{ \ -asm ( \ - "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "fadd z12.d, p5/m, z12.d, z24.d \n\t" \ - "fadd z13.d, p5/m, z13.d, z25.d \n\t" \ - "fadd z14.d, p5/m, z14.d, z26.d \n\t" \ - "fadd z15.d, p5/m, z15.d, z27.d \n\t" \ - "fadd z16.d, p5/m, z16.d, z28.d \n\t" \ - "fadd z17.d, p5/m, z17.d, z29.d \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (0) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_PROJ -#define XM_PROJ_A64FXd \ -{ \ -asm ( \ - "fcadd z12.d, p5/m, z12.d, z27.d, 270 \n\t" \ - "fcadd z13.d, p5/m, z13.d, z28.d, 270 \n\t" \ - "fcadd z14.d, p5/m, z14.d, z29.d, 270 \n\t" \ - "fcadd z15.d, p5/m, z15.d, z24.d, 270 \n\t" \ - "fcadd z16.d, p5/m, z16.d, z25.d, 270 \n\t" \ - "fcadd z17.d, p5/m, z17.d, z26.d, 270 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_RECON -#define XM_RECON_A64FXd \ -asm ( \ - "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \ - "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \ - "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \ - "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \ - "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \ - "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \ - "mov z0.d, z18.d \n\t" \ - "mov z1.d, z19.d \n\t" \ - "mov z2.d, z20.d \n\t" \ - "mov z3.d, z21.d \n\t" \ - "mov z4.d, z22.d \n\t" \ - "mov z5.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YM_PROJ -#define YM_PROJ_A64FXd \ -{ \ -asm ( \ - "ld1d { z30.d }, p5/z, 
[%[tableptr], %[index], mul vl] \n\t" \ - "fadd z12.d, p5/m, z12.d, z27.d \n\t" \ - "fadd z13.d, p5/m, z13.d, z28.d \n\t" \ - "fadd z14.d, p5/m, z14.d, z29.d \n\t" \ - "fsub z15.d, p5/m, z15.d, z24.d \n\t" \ - "fsub z16.d, p5/m, z16.d, z25.d \n\t" \ - "fsub z17.d, p5/m, z17.d, z26.d \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (2) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// ZM_PROJ -#define ZM_PROJ_A64FXd \ -{ \ -asm ( \ - "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "fcadd z12.d, p5/m, z12.d, z24.d, 270 \n\t" \ - "fcadd z13.d, p5/m, z13.d, z25.d, 270 \n\t" \ - "fcadd z14.d, p5/m, z14.d, z26.d, 270 \n\t" \ - "fcadd z15.d, p5/m, z15.d, z27.d, 90 \n\t" \ - "fcadd z16.d, p5/m, z16.d, z28.d, 90 \n\t" \ - "fcadd z17.d, p5/m, z17.d, z29.d, 90 \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (1) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// TM_PROJ -#define TM_PROJ_A64FXd \ -{ \ -asm ( \ - "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "fsub z12.d, p5/m, z12.d, z24.d \n\t" \ - "fsub z13.d, p5/m, z13.d, z25.d \n\t" \ - "fsub z14.d, p5/m, z14.d, z26.d \n\t" \ - "fsub z15.d, p5/m, z15.d, z27.d \n\t" \ - "fsub z16.d, p5/m, z16.d, z28.d \n\t" \ - "fsub z17.d, p5/m, z17.d, z29.d \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (0) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_RECON_ACCUM -#define XM_RECON_ACCUM_A64FXd \ -asm ( \ - "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \ - "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \ - "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \ - "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \ - "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \ - "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \ - "mov z0.d, z18.d \n\t" \ - "mov z1.d, z19.d \n\t" \ - "mov z2.d, z20.d \n\t" \ - "mov z3.d, z21.d \n\t" \ - "mov z4.d, z22.d \n\t" \ - "mov z5.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YP_RECON_ACCUM -#define YP_RECON_ACCUM_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fsub z9.d, p5/m, z9.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fsub z10.d, p5/m, z10.d, z19.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fsub z11.d, p5/m, z11.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fadd z6.d, p5/m, z6.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fadd z7.d, p5/m, z7.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - "fadd z8.d, p5/m, z8.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YM_RECON_ACCUM -#define YM_RECON_ACCUM_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fadd z9.d, p5/m, z9.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fadd z10.d, p5/m, z10.d, 
z19.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fadd z11.d, p5/m, z11.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fsub z6.d, p5/m, z6.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fsub z7.d, p5/m, z7.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - "fsub z8.d, p5/m, z8.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZP_RECON_ACCUM -#define ZP_RECON_ACCUM_A64FXd \ -asm ( \ - "fcadd z6.d, p5/m, z6.d, z18.d, 270 \n\t" \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fcadd z7.d, p5/m, z7.d, z19.d, 270 \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fcadd z8.d, p5/m, z8.d, z20.d, 270 \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fcadd z9.d, p5/m, z9.d, z21.d, 90 \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fcadd z10.d, p5/m, z10.d, z22.d, 90 \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fcadd z11.d, p5/m, z11.d, z23.d, 90 \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZM_RECON_ACCUM -#define ZM_RECON_ACCUM_A64FXd \ -asm ( \ - "fcadd z6.d, p5/m, z6.d, z18.d, 90 \n\t" \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fcadd z7.d, p5/m, z7.d, z19.d, 90 \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fcadd z8.d, p5/m, z8.d, z20.d, 90 \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fcadd z9.d, p5/m, z9.d, z21.d, 270 \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fcadd z10.d, p5/m, z10.d, z22.d, 270 \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fcadd z11.d, p5/m, z11.d, z23.d, 270 \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// TP_RECON_ACCUM -#define TP_RECON_ACCUM_A64FXd \ -asm ( \ - "ptrue p5.d \n\t" \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fadd z6.d, p5/m, z6.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fadd z7.d, p5/m, z7.d, z19.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fadd z8.d, p5/m, z8.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fadd z9.d, p5/m, z9.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fadd z10.d, p5/m, z10.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - "fadd z11.d, p5/m, z11.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// TM_RECON_ACCUM -#define TM_RECON_ACCUM_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fsub z6.d, p5/m, z6.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fsub z7.d, p5/m, z7.d, z19.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fsub z8.d, p5/m, z8.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fsub z9.d, p5/m, z9.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fsub z10.d, p5/m, z10.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - "fsub z11.d, p5/m, z11.d, z23.d \n\t" \ - : \ - : \ - : 
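
The *_RECON_ACCUM blocks above alternate one fcadd (reconstructing a lower spin component) with one fadd (accumulating the matching upper component); each pair writes disjoint registers, presumably to give the A64FX floating-point pipelines independent work. One such pair as intrinsics, a sketch with illustrative names:

#include <arm_sve.h>

// One interleaved step of ZP_RECON_ACCUM for a single colour: the lower
// component accumulates -i*UChi (rotation 270), the upper component
// accumulates UChi directly; the two updates are independent.
static inline void zp_recon_accum_step(svbool_t pg, svfloat64_t uchi,
                                       svfloat64_t *res_upper, svfloat64_t *res_lower) {
  *res_lower = svcadd_x(pg, *res_lower, uchi, 270);
  *res_upper = svadd_x(pg, *res_upper, uchi);
}
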
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZERO_PSI -#define ZERO_PSI_A64FXd \ -asm ( \ - "ptrue p5.d \n\t" \ - "fmov z0.d , 0 \n\t" \ - "fmov z1.d , 0 \n\t" \ - "fmov z2.d , 0 \n\t" \ - "fmov z3.d , 0 \n\t" \ - "fmov z4.d , 0 \n\t" \ - "fmov z5.d , 0 \n\t" \ - "fmov z6.d , 0 \n\t" \ - "fmov z7.d , 0 \n\t" \ - "fmov z8.d , 0 \n\t" \ - "fmov z9.d , 0 \n\t" \ - "fmov z10.d , 0 \n\t" \ - "fmov z11.d , 0 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ADD_RESULT_INTERNAL -#define ADD_RESULT_INTERNAL_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z12.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z13.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z14.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z15.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z16.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z17.d \n\t" \ - "fadd z6.d, p5/m, z6.d, z24.d \n\t" \ - "fadd z7.d, p5/m, z7.d, z25.d \n\t" \ - "fadd z8.d, p5/m, z8.d, z26.d \n\t" \ - "fadd z9.d, p5/m, z9.d, z27.d \n\t" \ - "fadd z10.d, p5/m, z10.d, z28.d \n\t" \ - "fadd z11.d, p5/m, z11.d, z29.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - diff --git a/Grid/Fujitsu_A64FX_asm_single.h b/Grid/Fujitsu_A64FX_asm_single.h deleted file mode 100644 index f20396b9..00000000 --- a/Grid/Fujitsu_A64FX_asm_single.h +++ /dev/null @@ -1,727 +0,0 @@ -/************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: XXX - - Copyright (C) 2020 - -Author: Nils Meyer - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
-
- See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/* END LEGAL */
-#define LOAD_CHIMU_A64FXf(x) LOAD_CHIMU_INTERLEAVED_A64FXf(x)
-#define PREFETCH_CHIMU_L1(A)
-#define PREFETCH_GAUGE_L1(A)
-#define PREFETCH_CHIMU_L2(A)
-#define PREFETCH_GAUGE_L2(A)
-#define PF_GAUGE(A)
-#define PREFETCH1_CHIMU(A)
-#define PREFETCH_CHIMU(A)
-#define LOCK_GAUGE(A)
-#define UNLOCK_GAUGE(A)
-#define MASK_REGS DECLARATIONS_A64FXf
-#define COMPLEX_SIGNS(A)
-#define LOAD64(A,B)
-#define SAVE_RESULT(A,B) RESULT_A64FXf(A)
-#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXf(A)
-#define MAYBEPERM(A,perm) if (perm) { A ; }
-#define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
-#define ZERO_PSI ZERO_PSI_A64FXf
-#define ADD_RESULT(base,base) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
-#define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf
-#define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf
-#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf
-#define TP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TP_PROJ_A64FXf
-#define XM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XM_PROJ_A64FXf
-#define YM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YM_PROJ_A64FXf
-#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZM_PROJ_A64FXf
-#define TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf
-#define XP_RECON XP_RECON_A64FXf
-#define XM_RECON XM_RECON_A64FXf
-#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf
-#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf
-#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf
-#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf
-#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXf
-#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf
-#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf
-#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf
-#define PERMUTE_DIR0 PERM0_A64FXf
-#define PERMUTE_DIR1 PERM1_A64FXf
-#define PERMUTE_DIR2 PERM2_A64FXf
-#define PERMUTE_DIR3 PERM3_A64FXf
-// DECLARATIONS
-#define DECLARATIONS_A64FXf \
- const uint32_t lut[4][16] = { \
- {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \
- {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \
- {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \
- {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \
-asm ( \
- "fmov z31.f , 0 \n\t" \
- : \
- : \
- : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-);
-
-// RESULT
-#define RESULT_A64FXf(base) \
-{ \
-asm ( \
- "stnt1d { z0.f }, p5, [%[storeptr], -6, mul vl] \n\t" \
- "stnt1d { z1.f }, p5, [%[storeptr], -5, mul vl] \n\t" \
- "stnt1d { z2.f }, p5, [%[storeptr], -4, mul vl] \n\t" \
- "stnt1d { z3.f }, p5, [%[storeptr], -3, mul vl] \n\t" \
- "stnt1d { z4.f }, p5, [%[storeptr], -2, mul vl] \n\t" \
- "stnt1d { z5.f }, p5, [%[storeptr], -1, mul vl] \n\t" \
- "stnt1d { z6.f }, p5, [%[storeptr], 0, mul vl] \n\t" \
- "stnt1d { z7.f }, p5, [%[storeptr], 1, mul vl] \n\t" \
- "stnt1d { z8.f }, p5, [%[storeptr], 2, mul vl] \n\t" \
- "stnt1d { z9.f }, p5, [%[storeptr], 3, mul vl] \n\t" \
- "stnt1d { z10.f }, p5, [%[storeptr], 4, mul vl] \n\t" \
- "stnt1d { z11.f }, p5, [%[storeptr], 5, mul vl] \n\t" \
- : \
- : [storeptr] "r" (base + 2 * 3 * 64) \
- :
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_CHIMU_L2 (prefetch to L2) -#define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \ -{ \ -asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_CHIMU_L1 (prefetch to L1) -#define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \ -{ \ -asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_GAUGE_L2 (prefetch to L2) -#define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \ -{ \ - const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \ -asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ - : \ - : [fetchptr] "r" (baseU) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_GAUGE_L1 (prefetch to L1) -#define PREFETCH_GAUGE_L1_INTERNAL(A)_A64FXf \ -{ \ - const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \ -asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ - : \ - : [fetchptr] "r" (baseU) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHI -#define LOAD_CHI_A64FXf(base) \ -{ \ -asm ( \ - "ld1d { z12.f }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z13.f }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z14.f }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z15.f }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z16.f }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z17.f }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU -#define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \ -{ \ -asm ( \ - "ptrue p5.f \n\t" \ - "ld1d { z12.f }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z27.f }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z15.f }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z24.f }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z13.f }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z28.f }, p5/z, [%[fetchptr], 4, mul 
vl] \n\t" \ - "ld1d { z16.f }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z25.f }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z14.f }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z29.f }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - "ld1d { z17.f }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z26.f }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU_0213 -#define LOAD_CHIMU_0213_A64FXf \ -{ \ - const SiteSpinor & ref(in[offset]); \ -asm ( \ - "ptrue p5.f \n\t" \ - "ld1d { z12.f }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z24.f }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z13.f }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z25.f }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z14.f }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z26.f }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - "ld1d { z15.f }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z27.f }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z16.f }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z28.f }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z17.f }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z29.f }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (&ref[2][0]) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU_0312 -#define LOAD_CHIMU_0312_A64FXf \ -{ \ - const SiteSpinor & ref(in[offset]); \ -asm ( \ - "ptrue p5.f \n\t" \ - "ld1d { z12.f }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z27.f }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z13.f }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z28.f }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z14.f }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z29.f }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - "ld1d { z15.f }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z24.f }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z16.f }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z25.f }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z17.f }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z26.f }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (&ref[2][0]) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PERM0 -#define PERM0_A64FXf \ -asm ( \ - "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "tbl z12.f, { z12.f }, z30.f \n\t" \ - "tbl z13.f, { z13.f }, z30.f \n\t" \ - "tbl z14.f, { z14.f }, z30.f \n\t" \ - "tbl z15.f, { z15.f }, z30.f \n\t" \ - "tbl z16.f, { z16.f }, z30.f \n\t" \ - "tbl z17.f, { z17.f }, z30.f \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (0) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PERM1 -#define PERM1_A64FXf \ -asm ( \ - "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "tbl z12.f, { z12.f }, z30.f \n\t" 
\ - "tbl z13.f, { z13.f }, z30.f \n\t" \ - "tbl z14.f, { z14.f }, z30.f \n\t" \ - "tbl z15.f, { z15.f }, z30.f \n\t" \ - "tbl z16.f, { z16.f }, z30.f \n\t" \ - "tbl z17.f, { z17.f }, z30.f \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (1) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PERM2 -#define PERM2_A64FXf \ -asm ( \ - "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "tbl z12.f, { z12.f }, z30.f \n\t" \ - "tbl z13.f, { z13.f }, z30.f \n\t" \ - "tbl z14.f, { z14.f }, z30.f \n\t" \ - "tbl z15.f, { z15.f }, z30.f \n\t" \ - "tbl z16.f, { z16.f }, z30.f \n\t" \ - "tbl z17.f, { z17.f }, z30.f \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (2) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PERM3 -#define PERM3_A64FXf \ -asm ( \ - "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "tbl z12.f, { z12.f }, z30.f \n\t" \ - "tbl z13.f, { z13.f }, z30.f \n\t" \ - "tbl z14.f, { z14.f }, z30.f \n\t" \ - "tbl z15.f, { z15.f }, z30.f \n\t" \ - "tbl z16.f, { z16.f }, z30.f \n\t" \ - "tbl z17.f, { z17.f }, z30.f \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (3) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// MULT_2SPIN -#define MULT_2SPIN_A64FXf(A) \ -{ \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ -asm ( \ - "ld1d { z24.f }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z25.f }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z26.f }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z27.f }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z28.f }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z29.f }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "fmov z18.f , 0 \n\t" \ - "fmov z21.f , 0 \n\t" \ - "fmov z19.f , 0 \n\t" \ - "fmov z22.f , 0 \n\t" \ - "fmov z20.f , 0 \n\t" \ - "fmov z23.f , 0 \n\t" \ - "fcmla z18.f, p5/m, z24.f, z12.f, 0 \n\t" \ - "fcmla z21.f, p5/m, z24.f, z15.f, 0 \n\t" \ - "fcmla z19.f, p5/m, z25.f, z12.f, 0 \n\t" \ - "fcmla z22.f, p5/m, z25.f, z15.f, 0 \n\t" \ - "fcmla z20.f, p5/m, z26.f, z12.f, 0 \n\t" \ - "fcmla z23.f, p5/m, z26.f, z15.f, 0 \n\t" \ - "fcmla z18.f, p5/m, z24.f, z12.f, 90 \n\t" \ - "fcmla z21.f, p5/m, z24.f, z15.f, 90 \n\t" \ - "fcmla z19.f, p5/m, z25.f, z12.f, 90 \n\t" \ - "fcmla z22.f, p5/m, z25.f, z15.f, 90 \n\t" \ - "fcmla z20.f, p5/m, z26.f, z12.f, 90 \n\t" \ - "fcmla z23.f, p5/m, z26.f, z15.f, 90 \n\t" \ - "ld1d { z24.f }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z25.f }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z26.f }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - "fcmla z18.f, p5/m, z27.f, z13.f, 0 \n\t" \ - "fcmla z21.f, p5/m, z27.f, z16.f, 0 \n\t" \ - "fcmla z19.f, p5/m, z28.f, z13.f, 0 \n\t" \ - "fcmla z22.f, p5/m, z28.f, z16.f, 0 \n\t" \ - "fcmla z20.f, p5/m, z29.f, z13.f, 0 \n\t" \ - "fcmla z23.f, p5/m, z29.f, z16.f, 0 \n\t" \ - "fcmla z18.f, p5/m, z27.f, z13.f, 90 \n\t" \ - "fcmla z21.f, p5/m, z27.f, z16.f, 90 \n\t" \ - "fcmla z19.f, p5/m, z28.f, z13.f, 90 \n\t" \ - "fcmla z22.f, p5/m, z28.f, z16.f, 90 \n\t" \ - "fcmla z20.f, p5/m, z29.f, z13.f, 90 
\n\t" \ - "fcmla z23.f, p5/m, z29.f, z16.f, 90 \n\t" \ - "fcmla z18.f, p5/m, z24.f, z14.f, 0 \n\t" \ - "fcmla z21.f, p5/m, z24.f, z17.f, 0 \n\t" \ - "fcmla z19.f, p5/m, z25.f, z14.f, 0 \n\t" \ - "fcmla z22.f, p5/m, z25.f, z17.f, 0 \n\t" \ - "fcmla z20.f, p5/m, z26.f, z14.f, 0 \n\t" \ - "fcmla z23.f, p5/m, z26.f, z17.f, 0 \n\t" \ - "fcmla z18.f, p5/m, z24.f, z14.f, 90 \n\t" \ - "fcmla z21.f, p5/m, z24.f, z17.f, 90 \n\t" \ - "fcmla z19.f, p5/m, z25.f, z14.f, 90 \n\t" \ - "fcmla z22.f, p5/m, z25.f, z17.f, 90 \n\t" \ - "fcmla z20.f, p5/m, z26.f, z14.f, 90 \n\t" \ - "fcmla z23.f, p5/m, z26.f, z17.f, 90 \n\t" \ - : \ - : [fetchptr] "r" (baseU + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// XP_PROJ -#define XP_PROJ_A64FXf \ -{ \ -asm ( \ - "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "fcadd z12.f, p5/m, z12.f, z27.f, 90 \n\t" \ - "fcadd z13.f, p5/m, z13.f, z28.f, 90 \n\t" \ - "fcadd z14.f, p5/m, z14.f, z29.f, 90 \n\t" \ - "fcadd z15.f, p5/m, z15.f, z24.f, 90 \n\t" \ - "fcadd z16.f, p5/m, z16.f, z25.f, 90 \n\t" \ - "fcadd z17.f, p5/m, z17.f, z26.f, 90 \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (3) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XP_RECON -#define XP_RECON_A64FXf \ -asm ( \ - "fcadd z6.f, p5/m, z6.f, z21.f, 270 \n\t" \ - "fcadd z7.f, p5/m, z7.f, z22.f, 270 \n\t" \ - "fcadd z8.f, p5/m, z8.f, z23.f, 270 \n\t" \ - "fcadd z9.f, p5/m, z9.f, z18.f, 270 \n\t" \ - "fcadd z10.f, p5/m, z10.f, z19.f, 270 \n\t" \ - "fcadd z11.f, p5/m, z11.f, z20.f, 270 \n\t" \ - "mov z0.f, z18.f \n\t" \ - "mov z1.f, z19.f \n\t" \ - "mov z2.f, z20.f \n\t" \ - "mov z3.f, z21.f \n\t" \ - "mov z4.f, z22.f \n\t" \ - "mov z5.f, z23.f \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// XP_RECON_ACCUM -#define XP_RECON_ACCUM_A64FXf \ -asm ( \ - "fcadd z9.f, p5/m, z9.f, z18.f, 270 \n\t" \ - "fadd z0.f, p5/m, z0.f, z18.f \n\t" \ - "fcadd z10.f, p5/m, z10.f, z19.f, 270 \n\t" \ - "fadd z1.f, p5/m, z1.f, z19.f \n\t" \ - "fcadd z11.f, p5/m, z11.f, z20.f, 270 \n\t" \ - "fadd z2.f, p5/m, z2.f, z20.f \n\t" \ - "fcadd z6.f, p5/m, z6.f, z21.f, 270 \n\t" \ - "fadd z3.f, p5/m, z3.f, z21.f \n\t" \ - "fcadd z7.f, p5/m, z7.f, z22.f, 270 \n\t" \ - "fadd z4.f, p5/m, z4.f, z22.f \n\t" \ - "fcadd z8.f, p5/m, z8.f, z23.f, 270 \n\t" \ - "fadd z5.f, p5/m, z5.f, z23.f \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YP_PROJ -#define YP_PROJ_A64FXf \ -{ \ -asm ( \ - "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "fsub z12.f, p5/m, z12.f, z27.f \n\t" \ - "fsub z13.f, p5/m, z13.f, z28.f \n\t" \ - "fsub z14.f, p5/m, z14.f, z29.f \n\t" \ - "fadd z15.f, p5/m, z15.f, z24.f \n\t" \ - "fadd z16.f, p5/m, z16.f, z25.f \n\t" \ - "fadd z17.f, p5/m, z17.f, z26.f \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (2) \ - : 
"memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// ZP_PROJ -#define ZP_PROJ_A64FXf \ -{ \ -asm ( \ - "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "fcadd z12.f, p5/m, z12.f, z24.f, 90 \n\t" \ - "fcadd z13.f, p5/m, z13.f, z25.f, 90 \n\t" \ - "fcadd z14.f, p5/m, z14.f, z26.f, 90 \n\t" \ - "fcadd z15.f, p5/m, z15.f, z27.f, 270 \n\t" \ - "fcadd z16.f, p5/m, z16.f, z28.f, 270 \n\t" \ - "fcadd z17.f, p5/m, z17.f, z29.f, 270 \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (1) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// TP_PROJ -#define TP_PROJ_A64FXf \ -{ \ -asm ( \ - "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "fadd z12.f, p5/m, z12.f, z24.f \n\t" \ - "fadd z13.f, p5/m, z13.f, z25.f \n\t" \ - "fadd z14.f, p5/m, z14.f, z26.f \n\t" \ - "fadd z15.f, p5/m, z15.f, z27.f \n\t" \ - "fadd z16.f, p5/m, z16.f, z28.f \n\t" \ - "fadd z17.f, p5/m, z17.f, z29.f \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (0) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_PROJ -#define XM_PROJ_A64FXf \ -{ \ -asm ( \ - "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "fcadd z12.f, p5/m, z12.f, z27.f, 270 \n\t" \ - "fcadd z13.f, p5/m, z13.f, z28.f, 270 \n\t" \ - "fcadd z14.f, p5/m, z14.f, z29.f, 270 \n\t" \ - "fcadd z15.f, p5/m, z15.f, z24.f, 270 \n\t" \ - "fcadd z16.f, p5/m, z16.f, z25.f, 270 \n\t" \ - "fcadd z17.f, p5/m, z17.f, z26.f, 270 \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (3) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_RECON -#define XM_RECON_A64FXf \ -asm ( \ - "fcadd z6.f, p5/m, z6.f, z21.f, 90 \n\t" \ - "fcadd z7.f, p5/m, z7.f, z22.f, 90 \n\t" \ - "fcadd z8.f, p5/m, z8.f, z23.f, 90 \n\t" \ - "fcadd z9.f, p5/m, z9.f, z18.f, 90 \n\t" \ - "fcadd z10.f, p5/m, z10.f, z19.f, 90 \n\t" \ - "fcadd z11.f, p5/m, z11.f, z20.f, 90 \n\t" \ - "mov z0.f, z18.f \n\t" \ - "mov z1.f, z19.f \n\t" \ - "mov z2.f, z20.f \n\t" \ - "mov z3.f, z21.f \n\t" \ - "mov z4.f, z22.f \n\t" \ - "mov z5.f, z23.f \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YM_PROJ -#define YM_PROJ_A64FXf \ -{ \ -asm ( \ - "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "fadd z12.f, p5/m, z12.f, z27.f \n\t" \ - "fadd z13.f, p5/m, z13.f, z28.f \n\t" \ - "fadd z14.f, p5/m, z14.f, z29.f \n\t" \ - "fsub z15.f, p5/m, z15.f, z24.f \n\t" \ - "fsub z16.f, p5/m, z16.f, z25.f \n\t" \ - "fsub z17.f, p5/m, z17.f, z26.f \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (2) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// ZM_PROJ -#define 
ZM_PROJ_A64FXf \ -{ \ -asm ( \ - "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "fcadd z12.f, p5/m, z12.f, z24.f, 270 \n\t" \ - "fcadd z13.f, p5/m, z13.f, z25.f, 270 \n\t" \ - "fcadd z14.f, p5/m, z14.f, z26.f, 270 \n\t" \ - "fcadd z15.f, p5/m, z15.f, z27.f, 90 \n\t" \ - "fcadd z16.f, p5/m, z16.f, z28.f, 90 \n\t" \ - "fcadd z17.f, p5/m, z17.f, z29.f, 90 \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (1) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// TM_PROJ -#define TM_PROJ_A64FXf \ -{ \ -asm ( \ - "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "fsub z12.f, p5/m, z12.f, z24.f \n\t" \ - "fsub z13.f, p5/m, z13.f, z25.f \n\t" \ - "fsub z14.f, p5/m, z14.f, z26.f \n\t" \ - "fsub z15.f, p5/m, z15.f, z27.f \n\t" \ - "fsub z16.f, p5/m, z16.f, z28.f \n\t" \ - "fsub z17.f, p5/m, z17.f, z29.f \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (0) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_RECON_ACCUM -#define XM_RECON_ACCUM_A64FXf \ -asm ( \ - "fcadd z9.f, p5/m, z9.f, z18.f, 90 \n\t" \ - "fcadd z10.f, p5/m, z10.f, z19.f, 90 \n\t" \ - "fcadd z11.f, p5/m, z11.f, z20.f, 90 \n\t" \ - "fcadd z6.f, p5/m, z6.f, z21.f, 90 \n\t" \ - "fcadd z7.f, p5/m, z7.f, z22.f, 90 \n\t" \ - "fcadd z8.f, p5/m, z8.f, z23.f, 90 \n\t" \ - "mov z0.f, z18.f \n\t" \ - "mov z1.f, z19.f \n\t" \ - "mov z2.f, z20.f \n\t" \ - "mov z3.f, z21.f \n\t" \ - "mov z4.f, z22.f \n\t" \ - "mov z5.f, z23.f \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YP_RECON_ACCUM -#define YP_RECON_ACCUM_A64FXf \ -asm ( \ - "fadd z0.f, p5/m, z0.f, z18.f \n\t" \ - "fsub z9.f, p5/m, z9.f, z18.f \n\t" \ - "fadd z1.f, p5/m, z1.f, z19.f \n\t" \ - "fsub z10.f, p5/m, z10.f, z19.f \n\t" \ - "fadd z2.f, p5/m, z2.f, z20.f \n\t" \ - "fsub z11.f, p5/m, z11.f, z20.f \n\t" \ - "fadd z3.f, p5/m, z3.f, z21.f \n\t" \ - "fadd z6.f, p5/m, z6.f, z21.f \n\t" \ - "fadd z4.f, p5/m, z4.f, z22.f \n\t" \ - "fadd z7.f, p5/m, z7.f, z22.f \n\t" \ - "fadd z5.f, p5/m, z5.f, z23.f \n\t" \ - "fadd z8.f, p5/m, z8.f, z23.f \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YM_RECON_ACCUM -#define YM_RECON_ACCUM_A64FXf \ -asm ( \ - "fadd z0.f, p5/m, z0.f, z18.f \n\t" \ - "fadd z9.f, p5/m, z9.f, z18.f \n\t" \ - "fadd z1.f, p5/m, z1.f, z19.f \n\t" \ - "fadd z10.f, p5/m, z10.f, z19.f \n\t" \ - "fadd z2.f, p5/m, z2.f, z20.f \n\t" \ - "fadd z11.f, p5/m, z11.f, z20.f \n\t" \ - "fadd z3.f, p5/m, z3.f, z21.f \n\t" \ - "fsub z6.f, p5/m, z6.f, z21.f \n\t" \ - "fadd z4.f, p5/m, z4.f, z22.f \n\t" \ - "fsub z7.f, p5/m, z7.f, z22.f \n\t" \ - "fadd z5.f, p5/m, z5.f, z23.f \n\t" \ - "fsub z8.f, p5/m, z8.f, z23.f \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZP_RECON_ACCUM -#define 
ZP_RECON_ACCUM_A64FXf \ -asm ( \ - "fcadd z6.f, p5/m, z6.f, z18.f, 270 \n\t" \ - "fadd z0.f, p5/m, z0.f, z18.f \n\t" \ - "fcadd z7.f, p5/m, z7.f, z19.f, 270 \n\t" \ - "fadd z1.f, p5/m, z1.f, z19.f \n\t" \ - "fcadd z8.f, p5/m, z8.f, z20.f, 270 \n\t" \ - "fadd z2.f, p5/m, z2.f, z20.f \n\t" \ - "fcadd z9.f, p5/m, z9.f, z21.f, 90 \n\t" \ - "fadd z3.f, p5/m, z3.f, z21.f \n\t" \ - "fcadd z10.f, p5/m, z10.f, z22.f, 90 \n\t" \ - "fadd z4.f, p5/m, z4.f, z22.f \n\t" \ - "fcadd z11.f, p5/m, z11.f, z23.f, 90 \n\t" \ - "fadd z5.f, p5/m, z5.f, z23.f \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZM_RECON_ACCUM -#define ZM_RECON_ACCUM_A64FXf \ -asm ( \ - "fcadd z6.f, p5/m, z6.f, z18.f, 90 \n\t" \ - "fadd z0.f, p5/m, z0.f, z18.f \n\t" \ - "fcadd z7.f, p5/m, z7.f, z19.f, 90 \n\t" \ - "fadd z1.f, p5/m, z1.f, z19.f \n\t" \ - "fcadd z8.f, p5/m, z8.f, z20.f, 90 \n\t" \ - "fadd z2.f, p5/m, z2.f, z20.f \n\t" \ - "fcadd z9.f, p5/m, z9.f, z21.f, 270 \n\t" \ - "fadd z3.f, p5/m, z3.f, z21.f \n\t" \ - "fcadd z10.f, p5/m, z10.f, z22.f, 270 \n\t" \ - "fadd z4.f, p5/m, z4.f, z22.f \n\t" \ - "fcadd z11.f, p5/m, z11.f, z23.f, 270 \n\t" \ - "fadd z5.f, p5/m, z5.f, z23.f \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// TP_RECON_ACCUM -#define TP_RECON_ACCUM_A64FXf \ -asm ( \ - "ptrue p5.f \n\t" \ - "fadd z0.f, p5/m, z0.f, z18.f \n\t" \ - "fadd z6.f, p5/m, z6.f, z18.f \n\t" \ - "fadd z1.f, p5/m, z1.f, z19.f \n\t" \ - "fadd z7.f, p5/m, z7.f, z19.f \n\t" \ - "fadd z2.f, p5/m, z2.f, z20.f \n\t" \ - "fadd z8.f, p5/m, z8.f, z20.f \n\t" \ - "fadd z3.f, p5/m, z3.f, z21.f \n\t" \ - "fadd z9.f, p5/m, z9.f, z21.f \n\t" \ - "fadd z4.f, p5/m, z4.f, z22.f \n\t" \ - "fadd z10.f, p5/m, z10.f, z22.f \n\t" \ - "fadd z5.f, p5/m, z5.f, z23.f \n\t" \ - "fadd z11.f, p5/m, z11.f, z23.f \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// TM_RECON_ACCUM -#define TM_RECON_ACCUM_A64FXf \ -asm ( \ - "fadd z0.f, p5/m, z0.f, z18.f \n\t" \ - "fsub z6.f, p5/m, z6.f, z18.f \n\t" \ - "fadd z1.f, p5/m, z1.f, z19.f \n\t" \ - "fsub z7.f, p5/m, z7.f, z19.f \n\t" \ - "fadd z2.f, p5/m, z2.f, z20.f \n\t" \ - "fsub z8.f, p5/m, z8.f, z20.f \n\t" \ - "fadd z3.f, p5/m, z3.f, z21.f \n\t" \ - "fsub z9.f, p5/m, z9.f, z21.f \n\t" \ - "fadd z4.f, p5/m, z4.f, z22.f \n\t" \ - "fsub z10.f, p5/m, z10.f, z22.f \n\t" \ - "fadd z5.f, p5/m, z5.f, z23.f \n\t" \ - "fsub z11.f, p5/m, z11.f, z23.f \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZERO_PSI -#define ZERO_PSI_A64FXf \ -asm ( \ - "ptrue p5.f \n\t" \ - "fmov z0.f , 0 \n\t" \ - "fmov z1.f , 0 \n\t" \ - "fmov z2.f , 0 \n\t" \ - "fmov z3.f , 0 \n\t" \ - "fmov z4.f , 0 \n\t" \ - "fmov z5.f , 0 \n\t" \ - "fmov z6.f , 0 \n\t" \ - "fmov z7.f , 0 \n\t" \ - "fmov z8.f , 0 \n\t" \ - "fmov z9.f , 0 \n\t" \ - "fmov z10.f , 0 \n\t" \ - "fmov z11.f , 0 \n\t" \ - : \ - : \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ADD_RESULT_INTERNAL -#define ADD_RESULT_INTERNAL_A64FXf \ -asm ( \ - "fadd z0.f, p5/m, z0.f, z12.f \n\t" \ - "fadd z1.f, p5/m, z1.f, z13.f \n\t" \ - "fadd z2.f, p5/m, z2.f, z14.f \n\t" \ - "fadd z3.f, p5/m, z3.f, z15.f \n\t" \ - "fadd z4.f, p5/m, z4.f, z16.f \n\t" \ - "fadd z5.f, p5/m, z5.f, z17.f \n\t" \ - "fadd z6.f, p5/m, z6.f, z24.f \n\t" \ - "fadd z7.f, p5/m, z7.f, z25.f \n\t" \ - "fadd z8.f, p5/m, z8.f, z26.f \n\t" \ - "fadd z9.f, p5/m, z9.f, z27.f \n\t" \ - "fadd z10.f, p5/m, z10.f, z28.f \n\t" \ - "fadd z11.f, p5/m, z11.f, z29.f \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - diff --git a/Grid/Fujitsu_A64FX_intrin_double.h b/Grid/Fujitsu_A64FX_intrin_double.h deleted file mode 100644 index f58b86df..00000000 --- a/Grid/Fujitsu_A64FX_intrin_double.h +++ /dev/null @@ -1,584 +0,0 @@ -/************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: XXX - - Copyright (C) 2020 - -Author: Nils Meyer - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
-
- See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/* END LEGAL */
-#define LOAD_CHIMU_A64FXd(x) LOAD_CHIMU_INTERLEAVED_A64FXd(x)
-#define PREFETCH_CHIMU_L1(A)
-#define PREFETCH_GAUGE_L1(A)
-#define PREFETCH_CHIMU_L2(A)
-#define PREFETCH_GAUGE_L2(A)
-#define PF_GAUGE(A)
-#define PREFETCH1_CHIMU(A)
-#define PREFETCH_CHIMU(A)
-#define LOCK_GAUGE(A)
-#define UNLOCK_GAUGE(A)
-#define MASK_REGS DECLARATIONS_A64FXd
-#define COMPLEX_SIGNS(A)
-#define LOAD64(A,B)
-#define SAVE_RESULT(A,B) RESULT_A64FXd(A)
-#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXd(A)
-#define MAYBEPERM(A,perm) if (perm) { A ; }
-#define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
-#define ZERO_PSI ZERO_PSI_A64FXd
-#define ADD_RESULT(base,base) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
-#define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd
-#define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd
-#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd
-#define TP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TP_PROJ_A64FXd
-#define XM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XM_PROJ_A64FXd
-#define YM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YM_PROJ_A64FXd
-#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZM_PROJ_A64FXd
-#define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd
-#define XP_RECON XP_RECON_A64FXd
-#define XM_RECON XM_RECON_A64FXd
-#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd
-#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd
-#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd
-#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd
-#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXd
-#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd
-#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd
-#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd
-#define PERMUTE_DIR0 PERM0_A64FXd
-#define PERMUTE_DIR1 PERM1_A64FXd
-#define PERMUTE_DIR2 PERM2_A64FXd
-#define PERMUTE_DIR3 PERM3_A64FXd
-// DECLARATIONS
-#define DECLARATIONS_A64FXd \
- const uint64_t lut[4][8] = { \
- {4, 5, 6, 7, 0, 1, 2, 3}, \
- {2, 3, 0, 1, 6, 7, 4, 5}, \
- {1, 0, 3, 2, 5, 4, 7, 6}, \
- {0, 1, 2, 4, 5, 6, 7, 8} };\
- svfloat64_t result_00; \
- svfloat64_t result_01; \
- svfloat64_t result_02; \
- svfloat64_t result_10; \
- svfloat64_t result_11; \
- svfloat64_t result_12; \
- svfloat64_t result_20; \
- svfloat64_t result_21; \
- svfloat64_t result_22; \
- svfloat64_t result_30; \
- svfloat64_t result_31; \
- svfloat64_t result_32; \
- svfloat64_t Chi_00; \
- svfloat64_t Chi_01; \
- svfloat64_t Chi_02; \
- svfloat64_t Chi_10; \
- svfloat64_t Chi_11; \
- svfloat64_t Chi_12; \
- svfloat64_t UChi_00; \
- svfloat64_t UChi_01; \
- svfloat64_t UChi_02; \
- svfloat64_t UChi_10; \
- svfloat64_t UChi_11; \
- svfloat64_t UChi_12; \
- svfloat64_t U_00; \
- svfloat64_t U_10; \
- svfloat64_t U_20; \
- svfloat64_t U_01; \
- svfloat64_t U_11; \
- svfloat64_t U_21; \
- svbool_t pg1; \
- pg1 = svptrue_b64(); \
- svuint64_t table0; \
- svfloat64_t zero0; \
- zero0 = __svzero(zero0);
-
-#define Chimu_00 Chi_00
-#define Chimu_01 Chi_01
-#define Chimu_02 Chi_02
-#define Chimu_10 Chi_10
-#define Chimu_11 Chi_11
-#define Chimu_12 Chi_12
-#define Chimu_20 U_00
-#define Chimu_21 U_10
-#define Chimu_22 U_20
-#define Chimu_30 U_01
-#define Chimu_31 U_11
-#define Chimu_32 U_21
-// RESULT
-#define RESULT_A64FXd(base) \
-{ \
- svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \
- svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64),
result_01); \ - svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \ - svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \ - svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \ - svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \ - svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \ - svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \ - svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \ - svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \ - svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \ - svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \ -} -// PREFETCH_CHIMU_L2 (prefetch to L2) -#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ -{ \ - svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \ -} -// PREFETCH_CHIMU_L1 (prefetch to L1) -#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \ -{ \ - svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \ -} -// PREFETCH_GAUGE_L2 (prefetch to L2) -#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ -{ \ - const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \ - svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \ -} -// PREFETCH_GAUGE_L1 (prefetch to L1) -#define PREFETCH_GAUGE_L1_INTERNAL(A)_A64FXd \ -{ \ - const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \ - svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ -} -// LOAD_CHI -#define LOAD_CHI_A64FXd(base) \ -{ \ - Chi_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chi_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chi_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chi_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chi_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chi_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ -} -// LOAD_CHIMU -#define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \ -{ \ - Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \ - Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ -} -// LOAD_CHIMU_0213 -#define LOAD_CHIMU_0213_A64FXd \ -{ \ - const SiteSpinor & ref(in[offset]); \ - Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_01 = svld1(pg1, 
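
Two things distinguish the intrinsics variant of RESULT: the spinor lives in named svfloat64_t variables, leaving register allocation to the compiler instead of pinning z0-z31 by hand, and the stores are svstnt1, the non-temporal form, so results presumably stream out with little cache pollution. The store of one 64-byte component, as a sketch (the fixed base + 2*3*64 offset recentres the -6..+5 displacement window used throughout):

#include <arm_sve.h>

// Non-temporal store of one spinor component at a signed 64-byte
// displacement from the recentred base, as RESULT_A64FXd does.
static inline void store_component_nt(svbool_t pg, char *base,
                                      long disp, svfloat64_t v) {
  svstnt1(pg, (float64_t*)(base + 2 * 3 * 64 + disp * 64), v);
}
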
(float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ - Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \ -} -// LOAD_CHIMU_0312 -#define LOAD_CHIMU_0312_A64FXd \ -{ \ - const SiteSpinor & ref(in[offset]); \ - Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \ - Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ -} -// PERM0 -#define PERM0_A64FXd \ - table0 = svld1(pg1, (uint64_t*)&lut[0]); \ - Chi_00 = svtbl(Chi_00, table0); \ - Chi_01 = svtbl(Chi_01, table0); \ - Chi_02 = svtbl(Chi_02, table0); \ - Chi_10 = svtbl(Chi_10, table0); \ - Chi_11 = svtbl(Chi_11, table0); \ - Chi_12 = svtbl(Chi_12, table0); - -// PERM1 -#define PERM1_A64FXd \ - table0 = svld1(pg1, (uint64_t*)&lut[1]); \ - Chi_00 = svtbl(Chi_00, table0); \ - Chi_01 = svtbl(Chi_01, table0); \ - Chi_02 = svtbl(Chi_02, table0); \ - Chi_10 = svtbl(Chi_10, table0); \ - Chi_11 = svtbl(Chi_11, table0); \ - Chi_12 = svtbl(Chi_12, table0); - -// PERM2 -#define PERM2_A64FXd \ - table0 = svld1(pg1, (uint64_t*)&lut[2]); \ - Chi_00 = svtbl(Chi_00, table0); \ - Chi_01 = svtbl(Chi_01, table0); \ - Chi_02 = svtbl(Chi_02, table0); \ - Chi_10 = svtbl(Chi_10, table0); \ - Chi_11 = svtbl(Chi_11, table0); \ - Chi_12 = svtbl(Chi_12, table0); - -// PERM3 -#define PERM3_A64FXd - -// MULT_2SPIN -#define MULT_2SPIN_A64FXd(A) \ -{ \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ - U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ - U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ - U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ - U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ - U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ - U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ - UChi_00 = __svzero(UChi_00); \ - UChi_10 = __svzero(UChi_10); \ - UChi_01 = __svzero(UChi_01); \ - UChi_11 = __svzero(UChi_11); \ - UChi_02 = __svzero(UChi_02); \ - UChi_12 = __svzero(UChi_12); \ - UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 0); \ - UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 0); \ - UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 0); \ - UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 0); \ - UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 0); \ - UChi_12 = svcmla_x(pg1, UChi_12, 
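
__svzero is a Fujitsu-compiler builtin, not part of the ACLE. A portable stand-in, under the assumption that it is nothing more than a zero fill of the accumulator:

#include <arm_sve.h>

// Portable zero fill for an accumulator (assumed equivalent of __svzero).
static inline svfloat64_t acc_zero(void) { return svdup_f64(0.0); }
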
U_20, Chi_10, 0); \ - UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 90); \ - UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 90); \ - UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 90); \ - UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ - UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ - UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ - U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \ - U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \ - U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \ - UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \ - UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \ - UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \ - UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 0); \ - UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 0); \ - UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 0); \ - UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 90); \ - UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 90); \ - UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 90); \ - UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 90); \ - UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 90); \ - UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 90); \ - UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 0); \ - UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 0); \ - UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 0); \ - UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 0); \ - UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 0); \ - UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 0); \ - UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 90); \ - UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 90); \ - UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 90); \ - UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 90); \ - UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 90); \ - UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 90); \ -} -// XP_PROJ -#define XP_PROJ_A64FXd \ -{ \ - Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 90); \ - Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 90); \ - Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 90); \ - Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 90); \ - Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 90); \ - Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 90); \ -} -// XP_RECON -#define XP_RECON_A64FXd \ - result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \ - result_21 = svcadd_x(pg1, result_21, UChi_11, 270); \ - result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \ - result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \ - result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \ - result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \ - result_00 = UChi_00; \ - result_01 = UChi_01; \ - result_02 = UChi_02; \ - result_10 = UChi_10; \ - result_11 = UChi_11; \ - result_12 = UChi_12; - -// XP_RECON_ACCUM -#define XP_RECON_ACCUM_A64FXd \ - result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \ - result_00 = svadd_x(pg1, result_00, UChi_00); \ - result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \ - result_01 = svadd_x(pg1, result_01, UChi_01); \ - result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \ - result_02 = svadd_x(pg1, result_02, UChi_02); \ - result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \ - result_10 = svadd_x(pg1, result_10, UChi_10); \ - result_21 = svcadd_x(pg1, result_21, UChi_11, 270); \ - result_11 = svadd_x(pg1, result_11, UChi_11); \ - result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \ - result_12 = svadd_x(pg1, result_12, UChi_12); - -// YP_PROJ -#define YP_PROJ_A64FXd \ -{ \ - table0 = svld1(pg1, (uint64_t*)&lut[2]); \ - 
Chi_00 = svsub_x(pg1, Chimu_00, Chimu_30); \ - Chi_01 = svsub_x(pg1, Chimu_01, Chimu_31); \ - Chi_02 = svsub_x(pg1, Chimu_02, Chimu_32); \ - Chi_10 = svadd_x(pg1, Chimu_10, Chimu_20); \ - Chi_11 = svadd_x(pg1, Chimu_11, Chimu_21); \ - Chi_12 = svadd_x(pg1, Chimu_12, Chimu_22); \ -} -// ZP_PROJ -#define ZP_PROJ_A64FXd \ -{ \ - table0 = svld1(pg1, (uint64_t*)&lut[1]); \ - Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 90); \ - Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 90); \ - Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 90); \ - Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 270); \ - Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 270); \ - Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 270); \ -} -// TP_PROJ -#define TP_PROJ_A64FXd \ -{ \ - table0 = svld1(pg1, (uint64_t*)&lut[0]); \ - Chi_00 = svadd_x(pg1, Chimu_00, Chimu_20); \ - Chi_01 = svadd_x(pg1, Chimu_01, Chimu_21); \ - Chi_02 = svadd_x(pg1, Chimu_02, Chimu_22); \ - Chi_10 = svadd_x(pg1, Chimu_10, Chimu_30); \ - Chi_11 = svadd_x(pg1, Chimu_11, Chimu_31); \ - Chi_12 = svadd_x(pg1, Chimu_12, Chimu_32); \ -} -// XM_PROJ -#define XM_PROJ_A64FXd \ -{ \ - Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 270); \ - Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 270); \ - Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 270); \ - Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 270); \ - Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 270); \ - Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 270); \ -} -// XM_RECON -#define XM_RECON_A64FXd \ - result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \ - result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \ - result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \ - result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \ - result_31 = svcadd_x(pg1, result_31, UChi_01, 90); \ - result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \ - result_00 = UChi_00; \ - result_01 = UChi_01; \ - result_02 = UChi_02; \ - result_10 = UChi_10; \ - result_11 = UChi_11; \ - result_12 = UChi_12; - -// YM_PROJ -#define YM_PROJ_A64FXd \ -{ \ - table0 = svld1(pg1, (uint64_t*)&lut[2]); \ - Chi_00 = svadd_x(pg1, Chimu_00, Chimu_30); \ - Chi_01 = svadd_x(pg1, Chimu_01, Chimu_31); \ - Chi_02 = svadd_x(pg1, Chimu_02, Chimu_32); \ - Chi_10 = svsub_x(pg1, Chimu_10, Chimu_20); \ - Chi_11 = svsub_x(pg1, Chimu_11, Chimu_21); \ - Chi_12 = svsub_x(pg1, Chimu_12, Chimu_22); \ -} -// ZM_PROJ -#define ZM_PROJ_A64FXd \ -{ \ - table0 = svld1(pg1, (uint64_t*)&lut[1]); \ - Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 270); \ - Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 270); \ - Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 270); \ - Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 90); \ - Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 90); \ - Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 90); \ -} -// TM_PROJ -#define TM_PROJ_A64FXd \ -{ \ - table0 = svld1(pg1, (uint64_t*)&lut[0]); \ - Chi_00 = svsub_x(pg1, Chimu_00, Chimu_20); \ - Chi_01 = svsub_x(pg1, Chimu_01, Chimu_21); \ - Chi_02 = svsub_x(pg1, Chimu_02, Chimu_22); \ - Chi_10 = svsub_x(pg1, Chimu_10, Chimu_30); \ - Chi_11 = svsub_x(pg1, Chimu_11, Chimu_31); \ - Chi_12 = svsub_x(pg1, Chimu_12, Chimu_32); \ -} -// XM_RECON_ACCUM -#define XM_RECON_ACCUM_A64FXd \ - result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \ - result_31 = svcadd_x(pg1, result_31, UChi_01, 90); \ - result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \ - result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \ - result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \ - result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \ - result_00 = UChi_00; \ - result_01 = UChi_01; \ - result_02 = UChi_02; \ 
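
All of these intrinsics use the _x ("don't care") predication forms together with the all-true pg1, which leaves the compiler free to emit unpredicated instructions where they exist. For example:

#include <arm_sve.h>

// With an all-true predicate, the _x form can lower to an unpredicated FADD.
static inline svfloat64_t full_width_add(svfloat64_t a, svfloat64_t b) {
  return svadd_x(svptrue_b64(), a, b);
}
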
- result_10 = UChi_10; \ - result_11 = UChi_11; \ - result_12 = UChi_12; - -// YP_RECON_ACCUM -#define YP_RECON_ACCUM_A64FXd \ - result_00 = svadd_x(pg1, result_00, UChi_00); \ - result_30 = svsub_x(pg1, result_30, UChi_00); \ - result_01 = svadd_x(pg1, result_01, UChi_01); \ - result_31 = svsub_x(pg1, result_31, UChi_01); \ - result_02 = svadd_x(pg1, result_02, UChi_02); \ - result_32 = svsub_x(pg1, result_32, UChi_02); \ - result_10 = svadd_x(pg1, result_10, UChi_10); \ - result_20 = svadd_x(pg1, result_20, UChi_10); \ - result_11 = svadd_x(pg1, result_11, UChi_11); \ - result_21 = svadd_x(pg1, result_21, UChi_11); \ - result_12 = svadd_x(pg1, result_12, UChi_12); \ - result_22 = svadd_x(pg1, result_22, UChi_12); - -// YM_RECON_ACCUM -#define YM_RECON_ACCUM_A64FXd \ - result_00 = svadd_x(pg1, result_00, UChi_00); \ - result_30 = svadd_x(pg1, result_30, UChi_00); \ - result_01 = svadd_x(pg1, result_01, UChi_01); \ - result_31 = svadd_x(pg1, result_31, UChi_01); \ - result_02 = svadd_x(pg1, result_02, UChi_02); \ - result_32 = svadd_x(pg1, result_32, UChi_02); \ - result_10 = svadd_x(pg1, result_10, UChi_10); \ - result_20 = svsub_x(pg1, result_20, UChi_10); \ - result_11 = svadd_x(pg1, result_11, UChi_11); \ - result_21 = svsub_x(pg1, result_21, UChi_11); \ - result_12 = svadd_x(pg1, result_12, UChi_12); \ - result_22 = svsub_x(pg1, result_22, UChi_12); - -// ZP_RECON_ACCUM -#define ZP_RECON_ACCUM_A64FXd \ - result_20 = svcadd_x(pg1, result_20, UChi_00, 270); \ - result_00 = svadd_x(pg1, result_00, UChi_00); \ - result_21 = svcadd_x(pg1, result_21, UChi_01, 270); \ - result_01 = svadd_x(pg1, result_01, UChi_01); \ - result_22 = svcadd_x(pg1, result_22, UChi_02, 270); \ - result_02 = svadd_x(pg1, result_02, UChi_02); \ - result_30 = svcadd_x(pg1, result_30, UChi_10, 90); \ - result_10 = svadd_x(pg1, result_10, UChi_10); \ - result_31 = svcadd_x(pg1, result_31, UChi_11, 90); \ - result_11 = svadd_x(pg1, result_11, UChi_11); \ - result_32 = svcadd_x(pg1, result_32, UChi_12, 90); \ - result_12 = svadd_x(pg1, result_12, UChi_12); - -// ZM_RECON_ACCUM -#define ZM_RECON_ACCUM_A64FXd \ - result_20 = svcadd_x(pg1, result_20, UChi_00, 90); \ - result_00 = svadd_x(pg1, result_00, UChi_00); \ - result_21 = svcadd_x(pg1, result_21, UChi_01, 90); \ - result_01 = svadd_x(pg1, result_01, UChi_01); \ - result_22 = svcadd_x(pg1, result_22, UChi_02, 90); \ - result_02 = svadd_x(pg1, result_02, UChi_02); \ - result_30 = svcadd_x(pg1, result_30, UChi_10, 270); \ - result_10 = svadd_x(pg1, result_10, UChi_10); \ - result_31 = svcadd_x(pg1, result_31, UChi_11, 270); \ - result_11 = svadd_x(pg1, result_11, UChi_11); \ - result_32 = svcadd_x(pg1, result_32, UChi_12, 270); \ - result_12 = svadd_x(pg1, result_12, UChi_12); - -// TP_RECON_ACCUM -#define TP_RECON_ACCUM_A64FXd \ - result_00 = svadd_x(pg1, result_00, UChi_00); \ - result_20 = svadd_x(pg1, result_20, UChi_00); \ - result_01 = svadd_x(pg1, result_01, UChi_01); \ - result_21 = svadd_x(pg1, result_21, UChi_01); \ - result_02 = svadd_x(pg1, result_02, UChi_02); \ - result_22 = svadd_x(pg1, result_22, UChi_02); \ - result_10 = svadd_x(pg1, result_10, UChi_10); \ - result_30 = svadd_x(pg1, result_30, UChi_10); \ - result_11 = svadd_x(pg1, result_11, UChi_11); \ - result_31 = svadd_x(pg1, result_31, UChi_11); \ - result_12 = svadd_x(pg1, result_12, UChi_12); \ - result_32 = svadd_x(pg1, result_32, UChi_12); - -// TM_RECON_ACCUM -#define TM_RECON_ACCUM_A64FXd \ - result_00 = svadd_x(pg1, result_00, UChi_00); \ - result_20 = svsub_x(pg1, result_20, UChi_00); \ - 
result_01 = svadd_x(pg1, result_01, UChi_01); \ - result_21 = svsub_x(pg1, result_21, UChi_01); \ - result_02 = svadd_x(pg1, result_02, UChi_02); \ - result_22 = svsub_x(pg1, result_22, UChi_02); \ - result_10 = svadd_x(pg1, result_10, UChi_10); \ - result_30 = svsub_x(pg1, result_30, UChi_10); \ - result_11 = svadd_x(pg1, result_11, UChi_11); \ - result_31 = svsub_x(pg1, result_31, UChi_11); \ - result_12 = svadd_x(pg1, result_12, UChi_12); \ - result_32 = svsub_x(pg1, result_32, UChi_12); - -// ZERO_PSI -#define ZERO_PSI_A64FXd \ - result_00 = __svzero(result_00); \ - result_01 = __svzero(result_01); \ - result_02 = __svzero(result_02); \ - result_10 = __svzero(result_10); \ - result_11 = __svzero(result_11); \ - result_12 = __svzero(result_12); \ - result_20 = __svzero(result_20); \ - result_21 = __svzero(result_21); \ - result_22 = __svzero(result_22); \ - result_30 = __svzero(result_30); \ - result_31 = __svzero(result_31); \ - result_32 = __svzero(result_32); - -// ADD_RESULT_INTERNAL -#define ADD_RESULT_INTERNAL_A64FXd \ - result_00 = svadd_x(pg1, result_00, Chimu_00); \ - result_01 = svadd_x(pg1, result_01, Chimu_01); \ - result_02 = svadd_x(pg1, result_02, Chimu_02); \ - result_10 = svadd_x(pg1, result_10, Chimu_10); \ - result_11 = svadd_x(pg1, result_11, Chimu_11); \ - result_12 = svadd_x(pg1, result_12, Chimu_12); \ - result_20 = svadd_x(pg1, result_20, Chimu_20); \ - result_21 = svadd_x(pg1, result_21, Chimu_21); \ - result_22 = svadd_x(pg1, result_22, Chimu_22); \ - result_30 = svadd_x(pg1, result_30, Chimu_30); \ - result_31 = svadd_x(pg1, result_31, Chimu_31); \ - result_32 = svadd_x(pg1, result_32, Chimu_32); - diff --git a/Grid/Fujitsu_A64FX_intrin_single.h b/Grid/Fujitsu_A64FX_intrin_single.h deleted file mode 100644 index 93bd1dc7..00000000 --- a/Grid/Fujitsu_A64FX_intrin_single.h +++ /dev/null @@ -1,593 +0,0 @@ -/************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: XXX - - Copyright (C) 2020 - -Author: Nils Meyer - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
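// ----------------------------------------------------------------------
// Aside: an illustrative sketch, not part of the patch. ZERO_PSI above
// clears the twelve result registers through __svzero, which standard
// ACLE does not provide; on a plain GCC/clang SVE toolchain the same
// "reg = 0" effect comes from svdup, so a fallback could look like this
// (the macro body is an assumption, shown for the double-precision file):
#include <arm_sve.h>

#ifndef __svzero
#define __svzero(x) svdup_f64(0.)  // usage: result_00 = __svzero(result_00);
#endif                             // an f32 variant would use svdup_f32(0.)
// ----------------------------------------------------------------------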
- - See the full license in the file "LICENSE" in the top level distribution directory -*************************************************************************************/ -/* END LEGAL */ -#define LOAD_CHIMU_A64FXf(x) LOAD_CHIMU_INTERLEAVED_A64FXf(x) -#define PREFETCH_CHIMU_L1(A) -#define PREFETCH_GAUGE_L1(A) -#define PREFETCH_CHIMU_L2(A) -#define PREFETCH_GAUGE_L2(A) -#define PF_GAUGE(A) -#define PREFETCH1_CHIMU(A) -#define PREFETCH_CHIMU(A) -#define LOCK_GAUGE(A) -#define UNLOCK_GAUGE(A) -#define MASK_REGS DECLARATIONS_A64FXf -#define COMPLEX_SIGNS(A) -#define LOAD64(A,B) -#define SAVE_RESULT(A,B) RESULT_A64FXf(A) -#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXf(A) -#define MAYBEPERM(A,perm) if (perm) { A ; } -#define LOAD_CHI(base) LOAD_CHI_A64FXf(base) -#define ZERO_PSI ZERO_PSI_A64FXf -#define ADD_RESULT(base,base) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) -#define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf -#define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf -#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf -#define TP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TP_PROJ_A64FXf -#define XM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XM_PROJ_A64FXf -#define YM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YM_PROJ_A64FXf -#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZM_PROJ_A64FXf -#define TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf -#define XP_RECON XP_RECON_A64FXf -#define XM_RECON XM_RECON_A64FXf -#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf -#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf -#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf -#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf -#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXf -#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf -#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf -#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf -#define PERMUTE_DIR0 PERM0_A64FXf -#define PERMUTE_DIR1 PERM1_A64FXf -#define PERMUTE_DIR2 PERM2_A64FXf -#define PERMUTE_DIR3 PERM3_A64FXf -// DECLARATIONS -#define DECLARATIONS_A64FXf \ - const uint32_t lut[4][16] = { \ - {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ - {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ - {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \ - {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \ - svfloat32_t result_00; \ - svfloat32_t result_01; \ - svfloat32_t result_02; \ - svfloat32_t result_10; \ - svfloat32_t result_11; \ - svfloat32_t result_12; \ - svfloat32_t result_20; \ - svfloat32_t result_21; \ - svfloat32_t result_22; \ - svfloat32_t result_30; \ - svfloat32_t result_31; \ - svfloat32_t result_32; \ - svfloat32_t Chi_00; \ - svfloat32_t Chi_01; \ - svfloat32_t Chi_02; \ - svfloat32_t Chi_10; \ - svfloat32_t Chi_11; \ - svfloat32_t Chi_12; \ - svfloat32_t UChi_00; \ - svfloat32_t UChi_01; \ - svfloat32_t UChi_02; \ - svfloat32_t UChi_10; \ - svfloat32_t UChi_11; \ - svfloat32_t UChi_12; \ - svfloat32_t U_00; \ - svfloat32_t U_10; \ - svfloat32_t U_20; \ - svfloat32_t U_01; \ - svfloat32_t U_11; \ - svfloat32_t U_21; \ - svbool_t pg1; \ - pg1 = svptrue_b32(); \ - svuint32_t table0; \ - svfloat32_t zero0; \ - zero0 = __svzero(zero0); - -#define Chimu_00 Chi_00 -#define Chimu_01 Chi_01 -#define Chimu_02 Chi_02 -#define Chimu_10 Chi_10 -#define Chimu_11 Chi_11 -#define Chimu_12 Chi_12 -#define Chimu_20 U_00 -#define Chimu_21 U_10 -#define Chimu_22 U_20 -#define Chimu_30 U_01 -#define Chimu_31 U_11 -#define Chimu_32 U_21 -// RESULT -#define RESULT_A64FXf(base) \ -{ \ - 
svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \ - svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \ - svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \ - svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \ - svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \ - svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \ - svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \ - svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \ - svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \ - svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \ - svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \ - svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \ -} -// PREFETCH_CHIMU_L2 (prefetch to L2) -#define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \ -{ \ - svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \ -} -// PREFETCH_CHIMU_L1 (prefetch to L1) -#define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \ -{ \ - svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \ -} -// PREFETCH_GAUGE_L2 (prefetch to L2) -#define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \ -{ \ - const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \ - svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \ -} -// PREFETCH_GAUGE_L1 (prefetch to L1) -#define PREFETCH_GAUGE_L1_INTERNAL(A)_A64FXf \ -{ \ - const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \ - svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ -} -// LOAD_CHI -#define LOAD_CHI_A64FXf(base) \ -{ \ - Chi_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chi_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chi_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chi_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chi_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chi_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ -} -// LOAD_CHIMU -#define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \ -{ \ - Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \ - Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ -} -// LOAD_CHIMU_0213 -#define LOAD_CHIMU_0213_A64FXf \ -{ \ - const SiteSpinor & ref(in[offset]); \ - Chimu_00 = svld1(pg1, (float32_t*)(base + 
2 * 3 * 64 + -6 * 64)); \ - Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ - Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \ -} -// LOAD_CHIMU_0312 -#define LOAD_CHIMU_0312_A64FXf \ -{ \ - const SiteSpinor & ref(in[offset]); \ - Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \ - Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ -} -// PERM0 -#define PERM0_A64FXf \ - table0 = svld1(pg1, (uint32_t*)&lut[0]); \ - Chi_00 = svtbl(Chi_00, table0); \ - Chi_01 = svtbl(Chi_01, table0); \ - Chi_02 = svtbl(Chi_02, table0); \ - Chi_10 = svtbl(Chi_10, table0); \ - Chi_11 = svtbl(Chi_11, table0); \ - Chi_12 = svtbl(Chi_12, table0); - -// PERM1 -#define PERM1_A64FXf \ - table0 = svld1(pg1, (uint32_t*)&lut[1]); \ - Chi_00 = svtbl(Chi_00, table0); \ - Chi_01 = svtbl(Chi_01, table0); \ - Chi_02 = svtbl(Chi_02, table0); \ - Chi_10 = svtbl(Chi_10, table0); \ - Chi_11 = svtbl(Chi_11, table0); \ - Chi_12 = svtbl(Chi_12, table0); - -// PERM2 -#define PERM2_A64FXf \ - table0 = svld1(pg1, (uint32_t*)&lut[2]); \ - Chi_00 = svtbl(Chi_00, table0); \ - Chi_01 = svtbl(Chi_01, table0); \ - Chi_02 = svtbl(Chi_02, table0); \ - Chi_10 = svtbl(Chi_10, table0); \ - Chi_11 = svtbl(Chi_11, table0); \ - Chi_12 = svtbl(Chi_12, table0); - -// PERM3 -#define PERM3_A64FXf \ - table0 = svld1(pg1, (uint32_t*)&lut[3]); \ - Chi_00 = svtbl(Chi_00, table0); \ - Chi_01 = svtbl(Chi_01, table0); \ - Chi_02 = svtbl(Chi_02, table0); \ - Chi_10 = svtbl(Chi_10, table0); \ - Chi_11 = svtbl(Chi_11, table0); \ - Chi_12 = svtbl(Chi_12, table0); - -// MULT_2SPIN -#define MULT_2SPIN_A64FXf(A) \ -{ \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ - U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ - U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ - U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ - U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ - U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ - U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ - UChi_00 = __svzero(UChi_00); \ - UChi_10 = __svzero(UChi_10); \ - UChi_01 = __svzero(UChi_01); \ - UChi_11 = 
__svzero(UChi_11); \ - UChi_02 = __svzero(UChi_02); \ - UChi_12 = __svzero(UChi_12); \ - UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 0); \ - UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 0); \ - UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 0); \ - UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 0); \ - UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 0); \ - UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 0); \ - UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 90); \ - UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 90); \ - UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 90); \ - UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ - UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ - UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ - U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \ - U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \ - U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \ - UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \ - UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \ - UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \ - UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 0); \ - UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 0); \ - UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 0); \ - UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 90); \ - UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 90); \ - UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 90); \ - UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 90); \ - UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 90); \ - UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 90); \ - UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 0); \ - UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 0); \ - UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 0); \ - UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 0); \ - UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 0); \ - UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 0); \ - UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 90); \ - UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 90); \ - UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 90); \ - UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 90); \ - UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 90); \ - UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 90); \ -} -// XP_PROJ -#define XP_PROJ_A64FXf \ -{ \ - table0 = svld1(pg1, (uint32_t*)&lut[3]); \ - Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 90); \ - Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 90); \ - Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 90); \ - Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 90); \ - Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 90); \ - Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 90); \ -} -// XP_RECON -#define XP_RECON_A64FXf \ - result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \ - result_21 = svcadd_x(pg1, result_21, UChi_11, 270); \ - result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \ - result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \ - result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \ - result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \ - result_00 = UChi_00; \ - result_01 = UChi_01; \ - result_02 = UChi_02; \ - result_10 = UChi_10; \ - result_11 = UChi_11; \ - result_12 = UChi_12; - -// XP_RECON_ACCUM -#define XP_RECON_ACCUM_A64FXf \ - result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \ - result_00 = svadd_x(pg1, result_00, UChi_00); \ - result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \ - result_01 = svadd_x(pg1, result_01, UChi_01); \ - result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \ - result_02 = 
svadd_x(pg1, result_02, UChi_02); \ - result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \ - result_10 = svadd_x(pg1, result_10, UChi_10); \ - result_21 = svcadd_x(pg1, result_21, UChi_11, 270); \ - result_11 = svadd_x(pg1, result_11, UChi_11); \ - result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \ - result_12 = svadd_x(pg1, result_12, UChi_12); - -// YP_PROJ -#define YP_PROJ_A64FXf \ -{ \ - table0 = svld1(pg1, (uint32_t*)&lut[2]); \ - Chi_00 = svsub_x(pg1, Chimu_00, Chimu_30); \ - Chi_01 = svsub_x(pg1, Chimu_01, Chimu_31); \ - Chi_02 = svsub_x(pg1, Chimu_02, Chimu_32); \ - Chi_10 = svadd_x(pg1, Chimu_10, Chimu_20); \ - Chi_11 = svadd_x(pg1, Chimu_11, Chimu_21); \ - Chi_12 = svadd_x(pg1, Chimu_12, Chimu_22); \ -} -// ZP_PROJ -#define ZP_PROJ_A64FXf \ -{ \ - table0 = svld1(pg1, (uint32_t*)&lut[1]); \ - Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 90); \ - Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 90); \ - Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 90); \ - Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 270); \ - Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 270); \ - Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 270); \ -} -// TP_PROJ -#define TP_PROJ_A64FXf \ -{ \ - table0 = svld1(pg1, (uint32_t*)&lut[0]); \ - Chi_00 = svadd_x(pg1, Chimu_00, Chimu_20); \ - Chi_01 = svadd_x(pg1, Chimu_01, Chimu_21); \ - Chi_02 = svadd_x(pg1, Chimu_02, Chimu_22); \ - Chi_10 = svadd_x(pg1, Chimu_10, Chimu_30); \ - Chi_11 = svadd_x(pg1, Chimu_11, Chimu_31); \ - Chi_12 = svadd_x(pg1, Chimu_12, Chimu_32); \ -} -// XM_PROJ -#define XM_PROJ_A64FXf \ -{ \ - table0 = svld1(pg1, (uint32_t*)&lut[3]); \ - Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 270); \ - Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 270); \ - Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 270); \ - Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 270); \ - Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 270); \ - Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 270); \ -} -// XM_RECON -#define XM_RECON_A64FXf \ - result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \ - result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \ - result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \ - result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \ - result_31 = svcadd_x(pg1, result_31, UChi_01, 90); \ - result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \ - result_00 = UChi_00; \ - result_01 = UChi_01; \ - result_02 = UChi_02; \ - result_10 = UChi_10; \ - result_11 = UChi_11; \ - result_12 = UChi_12; - -// YM_PROJ -#define YM_PROJ_A64FXf \ -{ \ - table0 = svld1(pg1, (uint32_t*)&lut[2]); \ - Chi_00 = svadd_x(pg1, Chimu_00, Chimu_30); \ - Chi_01 = svadd_x(pg1, Chimu_01, Chimu_31); \ - Chi_02 = svadd_x(pg1, Chimu_02, Chimu_32); \ - Chi_10 = svsub_x(pg1, Chimu_10, Chimu_20); \ - Chi_11 = svsub_x(pg1, Chimu_11, Chimu_21); \ - Chi_12 = svsub_x(pg1, Chimu_12, Chimu_22); \ -} -// ZM_PROJ -#define ZM_PROJ_A64FXf \ -{ \ - table0 = svld1(pg1, (uint32_t*)&lut[1]); \ - Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 270); \ - Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 270); \ - Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 270); \ - Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 90); \ - Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 90); \ - Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 90); \ -} -// TM_PROJ -#define TM_PROJ_A64FXf \ -{ \ - table0 = svld1(pg1, (uint32_t*)&lut[0]); \ - Chi_00 = svsub_x(pg1, Chimu_00, Chimu_20); \ - Chi_01 = svsub_x(pg1, Chimu_01, Chimu_21); \ - Chi_02 = svsub_x(pg1, Chimu_02, Chimu_22); \ - Chi_10 = svsub_x(pg1, Chimu_10, Chimu_30); \ - Chi_11 = svsub_x(pg1, Chimu_11, Chimu_31); \ - Chi_12 = 
svsub_x(pg1, Chimu_12, Chimu_32); \ -} -// XM_RECON_ACCUM -#define XM_RECON_ACCUM_A64FXf \ - result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \ - result_31 = svcadd_x(pg1, result_31, UChi_01, 90); \ - result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \ - result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \ - result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \ - result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \ - result_00 = UChi_00; \ - result_01 = UChi_01; \ - result_02 = UChi_02; \ - result_10 = UChi_10; \ - result_11 = UChi_11; \ - result_12 = UChi_12; - -// YP_RECON_ACCUM -#define YP_RECON_ACCUM_A64FXf \ - result_00 = svadd_x(pg1, result_00, UChi_00); \ - result_30 = svsub_x(pg1, result_30, UChi_00); \ - result_01 = svadd_x(pg1, result_01, UChi_01); \ - result_31 = svsub_x(pg1, result_31, UChi_01); \ - result_02 = svadd_x(pg1, result_02, UChi_02); \ - result_32 = svsub_x(pg1, result_32, UChi_02); \ - result_10 = svadd_x(pg1, result_10, UChi_10); \ - result_20 = svadd_x(pg1, result_20, UChi_10); \ - result_11 = svadd_x(pg1, result_11, UChi_11); \ - result_21 = svadd_x(pg1, result_21, UChi_11); \ - result_12 = svadd_x(pg1, result_12, UChi_12); \ - result_22 = svadd_x(pg1, result_22, UChi_12); - -// YM_RECON_ACCUM -#define YM_RECON_ACCUM_A64FXf \ - result_00 = svadd_x(pg1, result_00, UChi_00); \ - result_30 = svadd_x(pg1, result_30, UChi_00); \ - result_01 = svadd_x(pg1, result_01, UChi_01); \ - result_31 = svadd_x(pg1, result_31, UChi_01); \ - result_02 = svadd_x(pg1, result_02, UChi_02); \ - result_32 = svadd_x(pg1, result_32, UChi_02); \ - result_10 = svadd_x(pg1, result_10, UChi_10); \ - result_20 = svsub_x(pg1, result_20, UChi_10); \ - result_11 = svadd_x(pg1, result_11, UChi_11); \ - result_21 = svsub_x(pg1, result_21, UChi_11); \ - result_12 = svadd_x(pg1, result_12, UChi_12); \ - result_22 = svsub_x(pg1, result_22, UChi_12); - -// ZP_RECON_ACCUM -#define ZP_RECON_ACCUM_A64FXf \ - result_20 = svcadd_x(pg1, result_20, UChi_00, 270); \ - result_00 = svadd_x(pg1, result_00, UChi_00); \ - result_21 = svcadd_x(pg1, result_21, UChi_01, 270); \ - result_01 = svadd_x(pg1, result_01, UChi_01); \ - result_22 = svcadd_x(pg1, result_22, UChi_02, 270); \ - result_02 = svadd_x(pg1, result_02, UChi_02); \ - result_30 = svcadd_x(pg1, result_30, UChi_10, 90); \ - result_10 = svadd_x(pg1, result_10, UChi_10); \ - result_31 = svcadd_x(pg1, result_31, UChi_11, 90); \ - result_11 = svadd_x(pg1, result_11, UChi_11); \ - result_32 = svcadd_x(pg1, result_32, UChi_12, 90); \ - result_12 = svadd_x(pg1, result_12, UChi_12); - -// ZM_RECON_ACCUM -#define ZM_RECON_ACCUM_A64FXf \ - result_20 = svcadd_x(pg1, result_20, UChi_00, 90); \ - result_00 = svadd_x(pg1, result_00, UChi_00); \ - result_21 = svcadd_x(pg1, result_21, UChi_01, 90); \ - result_01 = svadd_x(pg1, result_01, UChi_01); \ - result_22 = svcadd_x(pg1, result_22, UChi_02, 90); \ - result_02 = svadd_x(pg1, result_02, UChi_02); \ - result_30 = svcadd_x(pg1, result_30, UChi_10, 270); \ - result_10 = svadd_x(pg1, result_10, UChi_10); \ - result_31 = svcadd_x(pg1, result_31, UChi_11, 270); \ - result_11 = svadd_x(pg1, result_11, UChi_11); \ - result_32 = svcadd_x(pg1, result_32, UChi_12, 270); \ - result_12 = svadd_x(pg1, result_12, UChi_12); - -// TP_RECON_ACCUM -#define TP_RECON_ACCUM_A64FXf \ - result_00 = svadd_x(pg1, result_00, UChi_00); \ - result_20 = svadd_x(pg1, result_20, UChi_00); \ - result_01 = svadd_x(pg1, result_01, UChi_01); \ - result_21 = svadd_x(pg1, result_21, UChi_01); \ - result_02 = svadd_x(pg1, result_02, UChi_02); \ - 
result_22 = svadd_x(pg1, result_22, UChi_02); \ - result_10 = svadd_x(pg1, result_10, UChi_10); \ - result_30 = svadd_x(pg1, result_30, UChi_10); \ - result_11 = svadd_x(pg1, result_11, UChi_11); \ - result_31 = svadd_x(pg1, result_31, UChi_11); \ - result_12 = svadd_x(pg1, result_12, UChi_12); \ - result_32 = svadd_x(pg1, result_32, UChi_12); - -// TM_RECON_ACCUM -#define TM_RECON_ACCUM_A64FXf \ - result_00 = svadd_x(pg1, result_00, UChi_00); \ - result_20 = svsub_x(pg1, result_20, UChi_00); \ - result_01 = svadd_x(pg1, result_01, UChi_01); \ - result_21 = svsub_x(pg1, result_21, UChi_01); \ - result_02 = svadd_x(pg1, result_02, UChi_02); \ - result_22 = svsub_x(pg1, result_22, UChi_02); \ - result_10 = svadd_x(pg1, result_10, UChi_10); \ - result_30 = svsub_x(pg1, result_30, UChi_10); \ - result_11 = svadd_x(pg1, result_11, UChi_11); \ - result_31 = svsub_x(pg1, result_31, UChi_11); \ - result_12 = svadd_x(pg1, result_12, UChi_12); \ - result_32 = svsub_x(pg1, result_32, UChi_12); - -// ZERO_PSI -#define ZERO_PSI_A64FXf \ - result_00 = __svzero(result_00); \ - result_01 = __svzero(result_01); \ - result_02 = __svzero(result_02); \ - result_10 = __svzero(result_10); \ - result_11 = __svzero(result_11); \ - result_12 = __svzero(result_12); \ - result_20 = __svzero(result_20); \ - result_21 = __svzero(result_21); \ - result_22 = __svzero(result_22); \ - result_30 = __svzero(result_30); \ - result_31 = __svzero(result_31); \ - result_32 = __svzero(result_32); - -// ADD_RESULT_INTERNAL -#define ADD_RESULT_INTERNAL_A64FXf \ - result_00 = svadd_x(pg1, result_00, Chimu_00); \ - result_01 = svadd_x(pg1, result_01, Chimu_01); \ - result_02 = svadd_x(pg1, result_02, Chimu_02); \ - result_10 = svadd_x(pg1, result_10, Chimu_10); \ - result_11 = svadd_x(pg1, result_11, Chimu_11); \ - result_12 = svadd_x(pg1, result_12, Chimu_12); \ - result_20 = svadd_x(pg1, result_20, Chimu_20); \ - result_21 = svadd_x(pg1, result_21, Chimu_21); \ - result_22 = svadd_x(pg1, result_22, Chimu_22); \ - result_30 = svadd_x(pg1, result_30, Chimu_30); \ - result_31 = svadd_x(pg1, result_31, Chimu_31); \ - result_32 = svadd_x(pg1, result_32, Chimu_32); - diff --git a/Grid/Fujitsu_A64FX_undef.h b/Grid/Fujitsu_A64FX_undef.h deleted file mode 100644 index a09e1698..00000000 --- a/Grid/Fujitsu_A64FX_undef.h +++ /dev/null @@ -1,70 +0,0 @@ -/************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: Fujitsu_A64FX_undef.h - - Copyright (C) 2020 - -Author: Nils Meyer - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
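// ----------------------------------------------------------------------
// Aside: an illustrative sketch, not part of the patch. PERM0..PERM3
// above all follow one pattern: load one row of the lut[4][16] index
// table into a vector and run the six Chi registers through tbl, which
// gathers lanes by index; row 3, {1, 0, 3, 2, ...}, exchanges
// neighbouring lanes. A minimal single-precision example (function name
// hypothetical, table contents copied from the DECLARATIONS block):
#include <arm_sve.h>
#include <stdint.h>

static inline svfloat32_t perm_pairs_f32(svfloat32_t v) {
  static const uint32_t idx[16] = {1, 0, 3, 2, 5, 4, 7, 6,
                                   9, 8, 11, 10, 13, 12, 15, 14};
  svbool_t pg = svptrue_b32();
  svuint32_t table = svld1_u32(pg, idx);  // index vector, one row of lut
  return svtbl_f32(v, table);             // out[i] = v[idx[i]]
}
// ----------------------------------------------------------------------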
- - See the full license in the file "LICENSE" in the top level distribution directory -*************************************************************************************/ -/* END LEGAL */ - -#undef LOAD_CHIMU_A64FXd -#undef LOAD_CHIMU_A64FXf -#undef PREFETCH_CHIMU_L1 -#undef PREFETCH_GAUGE_L1 -#undef PREFETCH_CHIMU_L2 -#undef PREFETCH_GAUGE_L2 -#undef PF_GAUGE -#undef PREFETCH1_CHIMU -#undef PREFETCH_CHIMU -#undef LOCK_GAUGE -#undef UNLOCK_GAUGE -#undef MASK_REGS -#undef COMPLEX_SIGNS -#undef LOAD64 -#undef SAVE_RESULT -#undef ADD_RESULT -#undef MULT_2SPIN_DIR_PF -#undef MAYBEPERM -#undef LOAD_CHI -#undef ZERO_PSI -#undef XP_PROJMEM -#undef YP_PROJMEM -#undef ZP_PROJMEM -#undef TP_PROJMEM -#undef XM_PROJMEM -#undef YM_PROJMEM -#undef ZM_PROJMEM -#undef TM_PROJMEM -#undef XP_RECON -#undef XM_RECON -#undef XM_RECON_ACCUM -#undef YM_RECON_ACCUM -#undef ZM_RECON_ACCUM -#undef TM_RECON_ACCUM -#undef XP_RECON_ACCUM -#undef YP_RECON_ACCUM -#undef ZP_RECON_ACCUM -#undef TP_RECON_ACCUM -#undef PERMUTE_DIR0 -#undef PERMUTE_DIR1 -#undef PERMUTE_DIR2 -#undef PERMUTE_DIR3 diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h index 4e428097..b2a2b961 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h @@ -30,13 +30,11 @@ Author: paboyle /* END LEGAL */ #pragma once -#if defined(A64FXINTRIN) -#pragma message("A64FX Wilson kernels intrin") -#else -#pragma message("A64FX Wilson kernels asm") -#endif - #if defined(A64FX) + +// undefine everything +#include + /////////////////////////////////////////////////////////// // If we are A64FX specialise the single precision routine /////////////////////////////////////////////////////////// @@ -46,7 +44,6 @@ Author: paboyle #include #endif - /// Switch off the 5d vectorised code optimisations #undef DWFVEC5D diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h index bd9ebe5d..2bc5eb6e 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_double.h +++ b/Grid/simd/Fujitsu_A64FX_asm_double.h @@ -43,6 +43,7 @@ Author: Nils Meyer #define MAYBEPERM(A,perm) if (perm) { A ; } #define LOAD_CHI(base) LOAD_CHI_A64FXd(base) #define ZERO_PSI ZERO_PSI_A64FXd +#define ADD_RESULT(base,base) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) #define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd #define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd #define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd @@ -53,6 +54,7 @@ Author: Nils Meyer #define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd #define XP_RECON XP_RECON_A64FXd #define XM_RECON XM_RECON_A64FXd +#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd #define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd #define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd #define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd @@ -689,3 +691,23 @@ asm ( \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); +// ADD_RESULT_INTERNAL +#define ADD_RESULT_INTERNAL_A64FXd \ +asm ( \ + "fadd z0.d, p5/m, z0.d, z12.d \n\t" \ + "fadd z1.d, p5/m, z1.d, z13.d \n\t" \ + "fadd z2.d, p5/m, z2.d, z14.d \n\t" \ + "fadd z3.d, p5/m, z3.d, z15.d \n\t" \ + "fadd z4.d, p5/m, z4.d, z16.d \n\t" \ + "fadd z5.d, p5/m, z5.d, z17.d \n\t" \ + "fadd z6.d, p5/m, z6.d, z24.d \n\t" \ + "fadd z7.d, 
p5/m, z7.d, z25.d \n\t" \ + "fadd z8.d, p5/m, z8.d, z26.d \n\t" \ + "fadd z9.d, p5/m, z9.d, z27.d \n\t" \ + "fadd z10.d, p5/m, z10.d, z28.d \n\t" \ + "fadd z11.d, p5/m, z11.d, z29.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h index 2ece4299..f20396b9 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ b/Grid/simd/Fujitsu_A64FX_asm_single.h @@ -43,6 +43,7 @@ Author: Nils Meyer #define MAYBEPERM(A,perm) if (perm) { A ; } #define LOAD_CHI(base) LOAD_CHI_A64FXf(base) #define ZERO_PSI ZERO_PSI_A64FXf +#define ADD_RESULT(base,base) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) #define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf #define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf #define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf @@ -53,6 +54,7 @@ Author: Nils Meyer #define TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf #define XP_RECON XP_RECON_A64FXf #define XM_RECON XM_RECON_A64FXf +#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf #define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf #define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf #define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf @@ -703,3 +705,23 @@ asm ( \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); +// ADD_RESULT_INTERNAL +#define ADD_RESULT_INTERNAL_A64FXf \ +asm ( \ + "fadd z0.f, p5/m, z0.f, z12.f \n\t" \ + "fadd z1.f, p5/m, z1.f, z13.f \n\t" \ + "fadd z2.f, p5/m, z2.f, z14.f \n\t" \ + "fadd z3.f, p5/m, z3.f, z15.f \n\t" \ + "fadd z4.f, p5/m, z4.f, z16.f \n\t" \ + "fadd z5.f, p5/m, z5.f, z17.f \n\t" \ + "fadd z6.f, p5/m, z6.f, z24.f \n\t" \ + "fadd z7.f, p5/m, z7.f, z25.f \n\t" \ + "fadd z8.f, p5/m, z8.f, z26.f \n\t" \ + "fadd z9.f, p5/m, z9.f, z27.f \n\t" \ + "fadd z10.f, p5/m, z10.f, z28.f \n\t" \ + "fadd z11.f, p5/m, z11.f, z29.f \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h index 4b85563c..f58b86df 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_double.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -43,6 +43,7 @@ Author: Nils Meyer #define MAYBEPERM(A,perm) if (perm) { A ; } #define LOAD_CHI(base) LOAD_CHI_A64FXd(base) #define ZERO_PSI ZERO_PSI_A64FXd +#define ADD_RESULT(base,base) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) #define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd #define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd #define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd @@ -53,6 +54,7 @@ Author: Nils Meyer #define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd #define XP_RECON XP_RECON_A64FXd #define XM_RECON XM_RECON_A64FXd +#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd #define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd #define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd #define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd @@ -565,3 +567,18 @@ Author: Nils Meyer result_31 = __svzero(result_31); \ result_32 = __svzero(result_32); 
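// ----------------------------------------------------------------------
// Aside: an illustrative sketch, not part of the patch. The asm variant
// of ADD_RESULT_INTERNAL above uses the merging-predicated form
// "fadd zdn, pg/m, zdn, zm" (zdn += zm wherever pg is true); per the
// DECLARATIONS register map, z0-z11 hold the result spinor while z12-z17
// and z24-z29 hold the freshly loaded Chimu. A minimal self-contained
// version of one such add, processing a single vector-length chunk
// (function name hypothetical; assumes GCC/clang SVE inline-asm support):

static inline void fadd_one_vector_f64(double *dst, const double *src) {
  asm volatile(
    "ptrue p0.d                    \n\t"   // all-true 64-bit predicate
    "ld1d  { z0.d }, p0/z, [%[d]]  \n\t"   // zeroing load of dst
    "ld1d  { z1.d }, p0/z, [%[s]]  \n\t"
    "fadd  z0.d, p0/m, z0.d, z1.d  \n\t"   // z0 += z1 where p0 is true
    "st1d  { z0.d }, p0, [%[d]]    \n\t"
    :
    : [d] "r" (dst), [s] "r" (src)
    : "p0", "z0", "z1", "cc", "memory");
}
// The intrinsics variant added just below spells the same twelve
// accumulations as svadd_x calls on the named result_/Chimu_ registers.
// ----------------------------------------------------------------------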
+// ADD_RESULT_INTERNAL +#define ADD_RESULT_INTERNAL_A64FXd \ + result_00 = svadd_x(pg1, result_00, Chimu_00); \ + result_01 = svadd_x(pg1, result_01, Chimu_01); \ + result_02 = svadd_x(pg1, result_02, Chimu_02); \ + result_10 = svadd_x(pg1, result_10, Chimu_10); \ + result_11 = svadd_x(pg1, result_11, Chimu_11); \ + result_12 = svadd_x(pg1, result_12, Chimu_12); \ + result_20 = svadd_x(pg1, result_20, Chimu_20); \ + result_21 = svadd_x(pg1, result_21, Chimu_21); \ + result_22 = svadd_x(pg1, result_22, Chimu_22); \ + result_30 = svadd_x(pg1, result_30, Chimu_30); \ + result_31 = svadd_x(pg1, result_31, Chimu_31); \ + result_32 = svadd_x(pg1, result_32, Chimu_32); + diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h index 7f8132e8..93bd1dc7 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_single.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -43,6 +43,7 @@ Author: Nils Meyer #define MAYBEPERM(A,perm) if (perm) { A ; } #define LOAD_CHI(base) LOAD_CHI_A64FXf(base) #define ZERO_PSI ZERO_PSI_A64FXf +#define ADD_RESULT(base,base) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) #define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf #define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf #define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf @@ -53,6 +54,7 @@ Author: Nils Meyer #define TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf #define XP_RECON XP_RECON_A64FXf #define XM_RECON XM_RECON_A64FXf +#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf #define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf #define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf #define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf @@ -574,3 +576,18 @@ Author: Nils Meyer result_31 = __svzero(result_31); \ result_32 = __svzero(result_32); +// ADD_RESULT_INTERNAL +#define ADD_RESULT_INTERNAL_A64FXf \ + result_00 = svadd_x(pg1, result_00, Chimu_00); \ + result_01 = svadd_x(pg1, result_01, Chimu_01); \ + result_02 = svadd_x(pg1, result_02, Chimu_02); \ + result_10 = svadd_x(pg1, result_10, Chimu_10); \ + result_11 = svadd_x(pg1, result_11, Chimu_11); \ + result_12 = svadd_x(pg1, result_12, Chimu_12); \ + result_20 = svadd_x(pg1, result_20, Chimu_20); \ + result_21 = svadd_x(pg1, result_21, Chimu_21); \ + result_22 = svadd_x(pg1, result_22, Chimu_22); \ + result_30 = svadd_x(pg1, result_30, Chimu_30); \ + result_31 = svadd_x(pg1, result_31, Chimu_31); \ + result_32 = svadd_x(pg1, result_32, Chimu_32); + diff --git a/Grid/simd/Fujitsu_A64FX_undef.h b/Grid/simd/Fujitsu_A64FX_undef.h index 07939007..a09e1698 100644 --- a/Grid/simd/Fujitsu_A64FX_undef.h +++ b/Grid/simd/Fujitsu_A64FX_undef.h @@ -41,6 +41,7 @@ Author: Nils Meyer #undef COMPLEX_SIGNS #undef LOAD64 #undef SAVE_RESULT +#undef ADD_RESULT #undef MULT_2SPIN_DIR_PF #undef MAYBEPERM #undef LOAD_CHI @@ -55,6 +56,7 @@ Author: Nils Meyer #undef TM_PROJMEM #undef XP_RECON #undef XM_RECON +#undef XM_RECON_ACCUM #undef YM_RECON_ACCUM #undef ZM_RECON_ACCUM #undef TM_RECON_ACCUM From cd1efee8666b129d6fa8747cdc56efa65e1cbe78 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 9 Apr 2020 16:35:13 +0200 Subject: [PATCH 024/147] changes --- Grid/simd/Fujitsu_A64FX_asm_double.h | 2 +- Grid/simd/Fujitsu_A64FX_asm_single.h | 2 +- Grid/simd/Fujitsu_A64FX_intrin_double.h | 2 +- Grid/simd/Fujitsu_A64FX_intrin_single.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h index 2bc5eb6e..b4e919b2 100644 --- 
a/Grid/simd/Fujitsu_A64FX_asm_double.h +++ b/Grid/simd/Fujitsu_A64FX_asm_double.h @@ -43,7 +43,7 @@ Author: Nils Meyer #define MAYBEPERM(A,perm) if (perm) { A ; } #define LOAD_CHI(base) LOAD_CHI_A64FXd(base) #define ZERO_PSI ZERO_PSI_A64FXd -#define ADD_RESULT(base,base) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) +#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) #define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd #define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd #define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h index f20396b9..772eaf38 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ b/Grid/simd/Fujitsu_A64FX_asm_single.h @@ -43,7 +43,7 @@ Author: Nils Meyer #define MAYBEPERM(A,perm) if (perm) { A ; } #define LOAD_CHI(base) LOAD_CHI_A64FXf(base) #define ZERO_PSI ZERO_PSI_A64FXf -#define ADD_RESULT(base,base) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) +#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) #define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf #define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf #define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h index f58b86df..1663fc07 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_double.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -43,7 +43,7 @@ Author: Nils Meyer #define MAYBEPERM(A,perm) if (perm) { A ; } #define LOAD_CHI(base) LOAD_CHI_A64FXd(base) #define ZERO_PSI ZERO_PSI_A64FXd -#define ADD_RESULT(base,base) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) +#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) #define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd #define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd #define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h index 93bd1dc7..0fdec23f 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_single.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -43,7 +43,7 @@ Author: Nils Meyer #define MAYBEPERM(A,perm) if (perm) { A ; } #define LOAD_CHI(base) LOAD_CHI_A64FXf(base) #define ZERO_PSI ZERO_PSI_A64FXf -#define ADD_RESULT(base,base) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) +#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) #define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf #define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf #define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf From 86c9c4da8bc1cbf57a5fa4d7e3cf04dcea2c507c Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 9 Apr 2020 16:40:06 +0200 Subject: [PATCH 025/147] changes --- Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h index b2a2b961..10603ac3 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h +++ 
b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h @@ -505,7 +505,7 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFie // KNL stuff -//#undef MAYBEPERM +#undef MAYBEPERM //#undef MULT_2SPIN #define MAYBEPERM(A,B) //#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) From 8123590a1b3d2be3388367e7490e2afcdbe44175 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 9 Apr 2020 16:45:47 +0200 Subject: [PATCH 026/147] changes --- Grid/simd/Fujitsu_A64FX_asm_single.h | 678 +++++++++++++-------------- 1 file changed, 339 insertions(+), 339 deletions(-) diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h index 772eaf38..d86d2ec5 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ b/Grid/simd/Fujitsu_A64FX_asm_single.h @@ -74,7 +74,7 @@ Author: Nils Meyer {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \ {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \ asm ( \ - "fmov z31.f , 0 \n\t" \ + "fmov z31.s , 0 \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -84,18 +84,18 @@ asm ( \ #define RESULT_A64FXf(base) \ { \ asm ( \ - "stnt1d { z0.f }, p5, [%[storeptr], -6, mul vl] \n\t" \ - "stnt1d { z1.f }, p5, [%[storeptr], -5, mul vl] \n\t" \ - "stnt1d { z2.f }, p5, [%[storeptr], -4, mul vl] \n\t" \ - "stnt1d { z3.f }, p5, [%[storeptr], -3, mul vl] \n\t" \ - "stnt1d { z4.f }, p5, [%[storeptr], -2, mul vl] \n\t" \ - "stnt1d { z5.f }, p5, [%[storeptr], -1, mul vl] \n\t" \ - "stnt1d { z6.f }, p5, [%[storeptr], 0, mul vl] \n\t" \ - "stnt1d { z7.f }, p5, [%[storeptr], 1, mul vl] \n\t" \ - "stnt1d { z8.f }, p5, [%[storeptr], 2, mul vl] \n\t" \ - "stnt1d { z9.f }, p5, [%[storeptr], 3, mul vl] \n\t" \ - "stnt1d { z10.f }, p5, [%[storeptr], 4, mul vl] \n\t" \ - "stnt1d { z11.f }, p5, [%[storeptr], 5, mul vl] \n\t" \ + "stnt1d { z0.s }, p5, [%[storeptr], -6, mul vl] \n\t" \ + "stnt1d { z1.s }, p5, [%[storeptr], -5, mul vl] \n\t" \ + "stnt1d { z2.s }, p5, [%[storeptr], -4, mul vl] \n\t" \ + "stnt1d { z3.s }, p5, [%[storeptr], -3, mul vl] \n\t" \ + "stnt1d { z4.s }, p5, [%[storeptr], -2, mul vl] \n\t" \ + "stnt1d { z5.s }, p5, [%[storeptr], -1, mul vl] \n\t" \ + "stnt1d { z6.s }, p5, [%[storeptr], 0, mul vl] \n\t" \ + "stnt1d { z7.s }, p5, [%[storeptr], 1, mul vl] \n\t" \ + "stnt1d { z8.s }, p5, [%[storeptr], 2, mul vl] \n\t" \ + "stnt1d { z9.s }, p5, [%[storeptr], 3, mul vl] \n\t" \ + "stnt1d { z10.s }, p5, [%[storeptr], 4, mul vl] \n\t" \ + "stnt1d { z11.s }, p5, [%[storeptr], 5, mul vl] \n\t" \ : \ : [storeptr] "r" (base + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -155,12 +155,12 @@ asm ( \ #define LOAD_CHI_A64FXf(base) \ { \ asm ( \ - "ld1d { z12.f }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z13.f }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z14.f }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z15.f }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z16.f }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z17.f }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z15.s }, p5/z, 
[%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ : \ : [fetchptr] "r" (base + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -170,19 +170,19 @@ asm ( \ #define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \ { \ asm ( \ - "ptrue p5.f \n\t" \ - "ld1d { z12.f }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z27.f }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z15.f }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z24.f }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z13.f }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z28.f }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z16.f }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z25.f }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z14.f }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z29.f }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - "ld1d { z17.f }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z26.f }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ptrue p5.s \n\t" \ + "ld1d { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z27.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z24.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z28.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z25.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z29.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1d { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z26.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (base + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -193,19 +193,19 @@ asm ( \ { \ const SiteSpinor & ref(in[offset]); \ asm ( \ - "ptrue p5.f \n\t" \ - "ld1d { z12.f }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z24.f }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z13.f }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z25.f }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z14.f }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z26.f }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - "ld1d { z15.f }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z27.f }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z16.f }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z28.f }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z17.f }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z29.f }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ptrue p5.s \n\t" \ + "ld1d { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z24.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z25.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z26.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1d { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z27.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z28.s 
}, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z29.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ : \ : [fetchptr] "r" (&ref[2][0]) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -216,19 +216,19 @@ asm ( \ { \ const SiteSpinor & ref(in[offset]); \ asm ( \ - "ptrue p5.f \n\t" \ - "ld1d { z12.f }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z27.f }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z13.f }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z28.f }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z14.f }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z29.f }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - "ld1d { z15.f }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z24.f }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z16.f }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z25.f }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z17.f }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z26.f }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ptrue p5.s \n\t" \ + "ld1d { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z27.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z28.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z29.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1d { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z24.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z25.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z26.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (&ref[2][0]) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -237,13 +237,13 @@ asm ( \ // PERM0 #define PERM0_A64FXf \ asm ( \ - "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "tbl z12.f, { z12.f }, z30.f \n\t" \ - "tbl z13.f, { z13.f }, z30.f \n\t" \ - "tbl z14.f, { z14.f }, z30.f \n\t" \ - "tbl z15.f, { z15.f }, z30.f \n\t" \ - "tbl z16.f, { z16.f }, z30.f \n\t" \ - "tbl z17.f, { z17.f }, z30.f \n\t" \ + "ld1d { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "tbl z12.s, { z12.s }, z30.s \n\t" \ + "tbl z13.s, { z13.s }, z30.s \n\t" \ + "tbl z14.s, { z14.s }, z30.s \n\t" \ + "tbl z15.s, { z15.s }, z30.s \n\t" \ + "tbl z16.s, { z16.s }, z30.s \n\t" \ + "tbl z17.s, { z17.s }, z30.s \n\t" \ : \ : [tableptr] "r" (&lut[0]),[index] "i" (0) \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -252,13 +252,13 @@ asm ( \ // PERM1 #define PERM1_A64FXf \ asm ( \ - "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "tbl z12.f, { z12.f }, z30.f \n\t" \ - "tbl z13.f, { z13.f }, z30.f \n\t" \ - "tbl z14.f, { z14.f }, z30.f \n\t" \ - "tbl z15.f, { z15.f }, z30.f \n\t" \ - "tbl z16.f, { z16.f }, z30.f \n\t" \ - "tbl z17.f, { z17.f }, z30.f \n\t" \ + "ld1d { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "tbl z12.s, { z12.s }, 
z30.s \n\t" \ + "tbl z13.s, { z13.s }, z30.s \n\t" \ + "tbl z14.s, { z14.s }, z30.s \n\t" \ + "tbl z15.s, { z15.s }, z30.s \n\t" \ + "tbl z16.s, { z16.s }, z30.s \n\t" \ + "tbl z17.s, { z17.s }, z30.s \n\t" \ : \ : [tableptr] "r" (&lut[0]),[index] "i" (1) \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -267,13 +267,13 @@ asm ( \ // PERM2 #define PERM2_A64FXf \ asm ( \ - "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "tbl z12.f, { z12.f }, z30.f \n\t" \ - "tbl z13.f, { z13.f }, z30.f \n\t" \ - "tbl z14.f, { z14.f }, z30.f \n\t" \ - "tbl z15.f, { z15.f }, z30.f \n\t" \ - "tbl z16.f, { z16.f }, z30.f \n\t" \ - "tbl z17.f, { z17.f }, z30.f \n\t" \ + "ld1d { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "tbl z12.s, { z12.s }, z30.s \n\t" \ + "tbl z13.s, { z13.s }, z30.s \n\t" \ + "tbl z14.s, { z14.s }, z30.s \n\t" \ + "tbl z15.s, { z15.s }, z30.s \n\t" \ + "tbl z16.s, { z16.s }, z30.s \n\t" \ + "tbl z17.s, { z17.s }, z30.s \n\t" \ : \ : [tableptr] "r" (&lut[0]),[index] "i" (2) \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -282,13 +282,13 @@ asm ( \ // PERM3 #define PERM3_A64FXf \ asm ( \ - "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "tbl z12.f, { z12.f }, z30.f \n\t" \ - "tbl z13.f, { z13.f }, z30.f \n\t" \ - "tbl z14.f, { z14.f }, z30.f \n\t" \ - "tbl z15.f, { z15.f }, z30.f \n\t" \ - "tbl z16.f, { z16.f }, z30.f \n\t" \ - "tbl z17.f, { z17.f }, z30.f \n\t" \ + "ld1d { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "tbl z12.s, { z12.s }, z30.s \n\t" \ + "tbl z13.s, { z13.s }, z30.s \n\t" \ + "tbl z14.s, { z14.s }, z30.s \n\t" \ + "tbl z15.s, { z15.s }, z30.s \n\t" \ + "tbl z16.s, { z16.s }, z30.s \n\t" \ + "tbl z17.s, { z17.s }, z30.s \n\t" \ : \ : [tableptr] "r" (&lut[0]),[index] "i" (3) \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -299,57 +299,57 @@ asm ( \ { \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ asm ( \ - "ld1d { z24.f }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z25.f }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z26.f }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z27.f }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z28.f }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z29.f }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "fmov z18.f , 0 \n\t" \ - "fmov z21.f , 0 \n\t" \ - "fmov z19.f , 0 \n\t" \ - "fmov z22.f , 0 \n\t" \ - "fmov z20.f , 0 \n\t" \ - "fmov z23.f , 0 \n\t" \ - "fcmla z18.f, p5/m, z24.f, z12.f, 0 \n\t" \ - "fcmla z21.f, p5/m, z24.f, z15.f, 0 \n\t" \ - "fcmla z19.f, p5/m, z25.f, z12.f, 0 \n\t" \ - "fcmla z22.f, p5/m, z25.f, z15.f, 0 \n\t" \ - "fcmla z20.f, p5/m, z26.f, z12.f, 0 \n\t" \ - "fcmla z23.f, p5/m, z26.f, z15.f, 0 \n\t" \ - "fcmla z18.f, p5/m, z24.f, z12.f, 90 \n\t" \ - "fcmla z21.f, p5/m, z24.f, z15.f, 90 \n\t" \ - "fcmla z19.f, p5/m, z25.f, z12.f, 90 \n\t" \ - "fcmla z22.f, p5/m, z25.f, z15.f, 90 \n\t" \ - "fcmla z20.f, p5/m, z26.f, z12.f, 90 \n\t" \ - "fcmla z23.f, p5/m, z26.f, z15.f, 90 \n\t" \ - "ld1d { z24.f }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z25.f 
}, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z26.f }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - "fcmla z18.f, p5/m, z27.f, z13.f, 0 \n\t" \ - "fcmla z21.f, p5/m, z27.f, z16.f, 0 \n\t" \ - "fcmla z19.f, p5/m, z28.f, z13.f, 0 \n\t" \ - "fcmla z22.f, p5/m, z28.f, z16.f, 0 \n\t" \ - "fcmla z20.f, p5/m, z29.f, z13.f, 0 \n\t" \ - "fcmla z23.f, p5/m, z29.f, z16.f, 0 \n\t" \ - "fcmla z18.f, p5/m, z27.f, z13.f, 90 \n\t" \ - "fcmla z21.f, p5/m, z27.f, z16.f, 90 \n\t" \ - "fcmla z19.f, p5/m, z28.f, z13.f, 90 \n\t" \ - "fcmla z22.f, p5/m, z28.f, z16.f, 90 \n\t" \ - "fcmla z20.f, p5/m, z29.f, z13.f, 90 \n\t" \ - "fcmla z23.f, p5/m, z29.f, z16.f, 90 \n\t" \ - "fcmla z18.f, p5/m, z24.f, z14.f, 0 \n\t" \ - "fcmla z21.f, p5/m, z24.f, z17.f, 0 \n\t" \ - "fcmla z19.f, p5/m, z25.f, z14.f, 0 \n\t" \ - "fcmla z22.f, p5/m, z25.f, z17.f, 0 \n\t" \ - "fcmla z20.f, p5/m, z26.f, z14.f, 0 \n\t" \ - "fcmla z23.f, p5/m, z26.f, z17.f, 0 \n\t" \ - "fcmla z18.f, p5/m, z24.f, z14.f, 90 \n\t" \ - "fcmla z21.f, p5/m, z24.f, z17.f, 90 \n\t" \ - "fcmla z19.f, p5/m, z25.f, z14.f, 90 \n\t" \ - "fcmla z22.f, p5/m, z25.f, z17.f, 90 \n\t" \ - "fcmla z20.f, p5/m, z26.f, z14.f, 90 \n\t" \ - "fcmla z23.f, p5/m, z26.f, z17.f, 90 \n\t" \ + "ld1d { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z26.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z27.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z28.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z29.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "fmov z18.s , 0 \n\t" \ + "fmov z21.s , 0 \n\t" \ + "fmov z19.s , 0 \n\t" \ + "fmov z22.s , 0 \n\t" \ + "fmov z20.s , 0 \n\t" \ + "fmov z23.s , 0 \n\t" \ + "fcmla z18.s, p5/m, z24.s, z12.s, 0 \n\t" \ + "fcmla z21.s, p5/m, z24.s, z15.s, 0 \n\t" \ + "fcmla z19.s, p5/m, z25.s, z12.s, 0 \n\t" \ + "fcmla z22.s, p5/m, z25.s, z15.s, 0 \n\t" \ + "fcmla z20.s, p5/m, z26.s, z12.s, 0 \n\t" \ + "fcmla z23.s, p5/m, z26.s, z15.s, 0 \n\t" \ + "fcmla z18.s, p5/m, z24.s, z12.s, 90 \n\t" \ + "fcmla z21.s, p5/m, z24.s, z15.s, 90 \n\t" \ + "fcmla z19.s, p5/m, z25.s, z12.s, 90 \n\t" \ + "fcmla z22.s, p5/m, z25.s, z15.s, 90 \n\t" \ + "fcmla z20.s, p5/m, z26.s, z12.s, 90 \n\t" \ + "fcmla z23.s, p5/m, z26.s, z15.s, 90 \n\t" \ + "ld1d { z24.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z25.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z26.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "fcmla z18.s, p5/m, z27.s, z13.s, 0 \n\t" \ + "fcmla z21.s, p5/m, z27.s, z16.s, 0 \n\t" \ + "fcmla z19.s, p5/m, z28.s, z13.s, 0 \n\t" \ + "fcmla z22.s, p5/m, z28.s, z16.s, 0 \n\t" \ + "fcmla z20.s, p5/m, z29.s, z13.s, 0 \n\t" \ + "fcmla z23.s, p5/m, z29.s, z16.s, 0 \n\t" \ + "fcmla z18.s, p5/m, z27.s, z13.s, 90 \n\t" \ + "fcmla z21.s, p5/m, z27.s, z16.s, 90 \n\t" \ + "fcmla z19.s, p5/m, z28.s, z13.s, 90 \n\t" \ + "fcmla z22.s, p5/m, z28.s, z16.s, 90 \n\t" \ + "fcmla z20.s, p5/m, z29.s, z13.s, 90 \n\t" \ + "fcmla z23.s, p5/m, z29.s, z16.s, 90 \n\t" \ + "fcmla z18.s, p5/m, z24.s, z14.s, 0 \n\t" \ + "fcmla z21.s, p5/m, z24.s, z17.s, 0 \n\t" \ + "fcmla z19.s, p5/m, z25.s, z14.s, 0 \n\t" \ + "fcmla z22.s, p5/m, z25.s, z17.s, 0 \n\t" \ + "fcmla z20.s, p5/m, z26.s, z14.s, 0 \n\t" \ + "fcmla z23.s, p5/m, z26.s, z17.s, 0 \n\t" \ + "fcmla z18.s, p5/m, z24.s, z14.s, 90 \n\t" \ + "fcmla z21.s, p5/m, z24.s, z17.s, 90 \n\t" \ + "fcmla z19.s, p5/m, z25.s, z14.s, 90 \n\t" \ + "fcmla z22.s, p5/m, z25.s, z17.s, 90 \n\t" \ + "fcmla z20.s, p5/m, z26.s, z14.s, 90 \n\t" \ + "fcmla z23.s, p5/m, 
z26.s, z17.s, 90 \n\t" \ : \ : [fetchptr] "r" (baseU + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -359,13 +359,13 @@ asm ( \ #define XP_PROJ_A64FXf \ { \ asm ( \ - "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "fcadd z12.f, p5/m, z12.f, z27.f, 90 \n\t" \ - "fcadd z13.f, p5/m, z13.f, z28.f, 90 \n\t" \ - "fcadd z14.f, p5/m, z14.f, z29.f, 90 \n\t" \ - "fcadd z15.f, p5/m, z15.f, z24.f, 90 \n\t" \ - "fcadd z16.f, p5/m, z16.f, z25.f, 90 \n\t" \ - "fcadd z17.f, p5/m, z17.f, z26.f, 90 \n\t" \ + "ld1d { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fcadd z12.s, p5/m, z12.s, z27.s, 90 \n\t" \ + "fcadd z13.s, p5/m, z13.s, z28.s, 90 \n\t" \ + "fcadd z14.s, p5/m, z14.s, z29.s, 90 \n\t" \ + "fcadd z15.s, p5/m, z15.s, z24.s, 90 \n\t" \ + "fcadd z16.s, p5/m, z16.s, z25.s, 90 \n\t" \ + "fcadd z17.s, p5/m, z17.s, z26.s, 90 \n\t" \ : \ : [tableptr] "r" (&lut[0]),[index] "i" (3) \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -374,18 +374,18 @@ asm ( \ // XP_RECON #define XP_RECON_A64FXf \ asm ( \ - "fcadd z6.f, p5/m, z6.f, z21.f, 270 \n\t" \ - "fcadd z7.f, p5/m, z7.f, z22.f, 270 \n\t" \ - "fcadd z8.f, p5/m, z8.f, z23.f, 270 \n\t" \ - "fcadd z9.f, p5/m, z9.f, z18.f, 270 \n\t" \ - "fcadd z10.f, p5/m, z10.f, z19.f, 270 \n\t" \ - "fcadd z11.f, p5/m, z11.f, z20.f, 270 \n\t" \ - "mov z0.f, z18.f \n\t" \ - "mov z1.f, z19.f \n\t" \ - "mov z2.f, z20.f \n\t" \ - "mov z3.f, z21.f \n\t" \ - "mov z4.f, z22.f \n\t" \ - "mov z5.f, z23.f \n\t" \ + "fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \ + "fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \ + "fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \ + "fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \ + "fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \ + "fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \ + "mov z0.s, z18.s \n\t" \ + "mov z1.s, z19.s \n\t" \ + "mov z2.s, z20.s \n\t" \ + "mov z3.s, z21.s \n\t" \ + "mov z4.s, z22.s \n\t" \ + "mov z5.s, z23.s \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -394,18 +394,18 @@ asm ( \ // XP_RECON_ACCUM #define XP_RECON_ACCUM_A64FXf \ asm ( \ - "fcadd z9.f, p5/m, z9.f, z18.f, 270 \n\t" \ - "fadd z0.f, p5/m, z0.f, z18.f \n\t" \ - "fcadd z10.f, p5/m, z10.f, z19.f, 270 \n\t" \ - "fadd z1.f, p5/m, z1.f, z19.f \n\t" \ - "fcadd z11.f, p5/m, z11.f, z20.f, 270 \n\t" \ - "fadd z2.f, p5/m, z2.f, z20.f \n\t" \ - "fcadd z6.f, p5/m, z6.f, z21.f, 270 \n\t" \ - "fadd z3.f, p5/m, z3.f, z21.f \n\t" \ - "fcadd z7.f, p5/m, z7.f, z22.f, 270 \n\t" \ - "fadd z4.f, p5/m, z4.f, z22.f \n\t" \ - "fcadd z8.f, p5/m, z8.f, z23.f, 270 \n\t" \ - "fadd z5.f, p5/m, z5.f, z23.f \n\t" \ + "fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \ + "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ + "fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \ + "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ + "fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \ + "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ + "fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \ + "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ + "fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \ + "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ + "fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \ + "fadd z5.s, 
p5/m, z5.s, z23.s \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -415,13 +415,13 @@ asm ( \ #define YP_PROJ_A64FXf \ { \ asm ( \ - "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "fsub z12.f, p5/m, z12.f, z27.f \n\t" \ - "fsub z13.f, p5/m, z13.f, z28.f \n\t" \ - "fsub z14.f, p5/m, z14.f, z29.f \n\t" \ - "fadd z15.f, p5/m, z15.f, z24.f \n\t" \ - "fadd z16.f, p5/m, z16.f, z25.f \n\t" \ - "fadd z17.f, p5/m, z17.f, z26.f \n\t" \ + "ld1d { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fsub z12.s, p5/m, z12.s, z27.s \n\t" \ + "fsub z13.s, p5/m, z13.s, z28.s \n\t" \ + "fsub z14.s, p5/m, z14.s, z29.s \n\t" \ + "fadd z15.s, p5/m, z15.s, z24.s \n\t" \ + "fadd z16.s, p5/m, z16.s, z25.s \n\t" \ + "fadd z17.s, p5/m, z17.s, z26.s \n\t" \ : \ : [tableptr] "r" (&lut[0]),[index] "i" (2) \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -431,13 +431,13 @@ asm ( \ #define ZP_PROJ_A64FXf \ { \ asm ( \ - "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "fcadd z12.f, p5/m, z12.f, z24.f, 90 \n\t" \ - "fcadd z13.f, p5/m, z13.f, z25.f, 90 \n\t" \ - "fcadd z14.f, p5/m, z14.f, z26.f, 90 \n\t" \ - "fcadd z15.f, p5/m, z15.f, z27.f, 270 \n\t" \ - "fcadd z16.f, p5/m, z16.f, z28.f, 270 \n\t" \ - "fcadd z17.f, p5/m, z17.f, z29.f, 270 \n\t" \ + "ld1d { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fcadd z12.s, p5/m, z12.s, z24.s, 90 \n\t" \ + "fcadd z13.s, p5/m, z13.s, z25.s, 90 \n\t" \ + "fcadd z14.s, p5/m, z14.s, z26.s, 90 \n\t" \ + "fcadd z15.s, p5/m, z15.s, z27.s, 270 \n\t" \ + "fcadd z16.s, p5/m, z16.s, z28.s, 270 \n\t" \ + "fcadd z17.s, p5/m, z17.s, z29.s, 270 \n\t" \ : \ : [tableptr] "r" (&lut[0]),[index] "i" (1) \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -447,13 +447,13 @@ asm ( \ #define TP_PROJ_A64FXf \ { \ asm ( \ - "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "fadd z12.f, p5/m, z12.f, z24.f \n\t" \ - "fadd z13.f, p5/m, z13.f, z25.f \n\t" \ - "fadd z14.f, p5/m, z14.f, z26.f \n\t" \ - "fadd z15.f, p5/m, z15.f, z27.f \n\t" \ - "fadd z16.f, p5/m, z16.f, z28.f \n\t" \ - "fadd z17.f, p5/m, z17.f, z29.f \n\t" \ + "ld1d { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fadd z12.s, p5/m, z12.s, z24.s \n\t" \ + "fadd z13.s, p5/m, z13.s, z25.s \n\t" \ + "fadd z14.s, p5/m, z14.s, z26.s \n\t" \ + "fadd z15.s, p5/m, z15.s, z27.s \n\t" \ + "fadd z16.s, p5/m, z16.s, z28.s \n\t" \ + "fadd z17.s, p5/m, z17.s, z29.s \n\t" \ : \ : [tableptr] "r" (&lut[0]),[index] "i" (0) \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -463,13 +463,13 @@ asm ( \ #define XM_PROJ_A64FXf \ { \ asm ( \ - "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "fcadd z12.f, p5/m, z12.f, z27.f, 270 \n\t" \ - "fcadd z13.f, p5/m, z13.f, z28.f, 270 \n\t" \ - "fcadd z14.f, p5/m, z14.f, z29.f, 270 \n\t" \ - "fcadd z15.f, p5/m, z15.f, z24.f, 270 \n\t" \ - "fcadd z16.f, p5/m, z16.f, z25.f, 270 \n\t" \ - "fcadd z17.f, p5/m, 
z17.f, z26.f, 270 \n\t" \ + "ld1d { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fcadd z12.s, p5/m, z12.s, z27.s, 270 \n\t" \ + "fcadd z13.s, p5/m, z13.s, z28.s, 270 \n\t" \ + "fcadd z14.s, p5/m, z14.s, z29.s, 270 \n\t" \ + "fcadd z15.s, p5/m, z15.s, z24.s, 270 \n\t" \ + "fcadd z16.s, p5/m, z16.s, z25.s, 270 \n\t" \ + "fcadd z17.s, p5/m, z17.s, z26.s, 270 \n\t" \ : \ : [tableptr] "r" (&lut[0]),[index] "i" (3) \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -478,18 +478,18 @@ asm ( \ // XM_RECON #define XM_RECON_A64FXf \ asm ( \ - "fcadd z6.f, p5/m, z6.f, z21.f, 90 \n\t" \ - "fcadd z7.f, p5/m, z7.f, z22.f, 90 \n\t" \ - "fcadd z8.f, p5/m, z8.f, z23.f, 90 \n\t" \ - "fcadd z9.f, p5/m, z9.f, z18.f, 90 \n\t" \ - "fcadd z10.f, p5/m, z10.f, z19.f, 90 \n\t" \ - "fcadd z11.f, p5/m, z11.f, z20.f, 90 \n\t" \ - "mov z0.f, z18.f \n\t" \ - "mov z1.f, z19.f \n\t" \ - "mov z2.f, z20.f \n\t" \ - "mov z3.f, z21.f \n\t" \ - "mov z4.f, z22.f \n\t" \ - "mov z5.f, z23.f \n\t" \ + "fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \ + "fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \ + "fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \ + "fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \ + "fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \ + "fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \ + "mov z0.s, z18.s \n\t" \ + "mov z1.s, z19.s \n\t" \ + "mov z2.s, z20.s \n\t" \ + "mov z3.s, z21.s \n\t" \ + "mov z4.s, z22.s \n\t" \ + "mov z5.s, z23.s \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -499,13 +499,13 @@ asm ( \ #define YM_PROJ_A64FXf \ { \ asm ( \ - "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "fadd z12.f, p5/m, z12.f, z27.f \n\t" \ - "fadd z13.f, p5/m, z13.f, z28.f \n\t" \ - "fadd z14.f, p5/m, z14.f, z29.f \n\t" \ - "fsub z15.f, p5/m, z15.f, z24.f \n\t" \ - "fsub z16.f, p5/m, z16.f, z25.f \n\t" \ - "fsub z17.f, p5/m, z17.f, z26.f \n\t" \ + "ld1d { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fadd z12.s, p5/m, z12.s, z27.s \n\t" \ + "fadd z13.s, p5/m, z13.s, z28.s \n\t" \ + "fadd z14.s, p5/m, z14.s, z29.s \n\t" \ + "fsub z15.s, p5/m, z15.s, z24.s \n\t" \ + "fsub z16.s, p5/m, z16.s, z25.s \n\t" \ + "fsub z17.s, p5/m, z17.s, z26.s \n\t" \ : \ : [tableptr] "r" (&lut[0]),[index] "i" (2) \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -515,13 +515,13 @@ asm ( \ #define ZM_PROJ_A64FXf \ { \ asm ( \ - "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "fcadd z12.f, p5/m, z12.f, z24.f, 270 \n\t" \ - "fcadd z13.f, p5/m, z13.f, z25.f, 270 \n\t" \ - "fcadd z14.f, p5/m, z14.f, z26.f, 270 \n\t" \ - "fcadd z15.f, p5/m, z15.f, z27.f, 90 \n\t" \ - "fcadd z16.f, p5/m, z16.f, z28.f, 90 \n\t" \ - "fcadd z17.f, p5/m, z17.f, z29.f, 90 \n\t" \ + "ld1d { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fcadd z12.s, p5/m, z12.s, z24.s, 270 \n\t" \ + "fcadd z13.s, p5/m, z13.s, z25.s, 270 \n\t" \ + "fcadd z14.s, p5/m, z14.s, z26.s, 270 \n\t" \ + "fcadd z15.s, p5/m, z15.s, z27.s, 90 \n\t" \ + "fcadd z16.s, p5/m, z16.s, z28.s, 90 \n\t" \ + "fcadd z17.s, p5/m, z17.s, z29.s, 90 \n\t" \ : \ : [tableptr] "r" (&lut[0]),[index] "i" (1) 
\ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -531,13 +531,13 @@ asm ( \ #define TM_PROJ_A64FXf \ { \ asm ( \ - "ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "fsub z12.f, p5/m, z12.f, z24.f \n\t" \ - "fsub z13.f, p5/m, z13.f, z25.f \n\t" \ - "fsub z14.f, p5/m, z14.f, z26.f \n\t" \ - "fsub z15.f, p5/m, z15.f, z27.f \n\t" \ - "fsub z16.f, p5/m, z16.f, z28.f \n\t" \ - "fsub z17.f, p5/m, z17.f, z29.f \n\t" \ + "ld1d { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "fsub z12.s, p5/m, z12.s, z24.s \n\t" \ + "fsub z13.s, p5/m, z13.s, z25.s \n\t" \ + "fsub z14.s, p5/m, z14.s, z26.s \n\t" \ + "fsub z15.s, p5/m, z15.s, z27.s \n\t" \ + "fsub z16.s, p5/m, z16.s, z28.s \n\t" \ + "fsub z17.s, p5/m, z17.s, z29.s \n\t" \ : \ : [tableptr] "r" (&lut[0]),[index] "i" (0) \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -546,18 +546,18 @@ asm ( \ // XM_RECON_ACCUM #define XM_RECON_ACCUM_A64FXf \ asm ( \ - "fcadd z9.f, p5/m, z9.f, z18.f, 90 \n\t" \ - "fcadd z10.f, p5/m, z10.f, z19.f, 90 \n\t" \ - "fcadd z11.f, p5/m, z11.f, z20.f, 90 \n\t" \ - "fcadd z6.f, p5/m, z6.f, z21.f, 90 \n\t" \ - "fcadd z7.f, p5/m, z7.f, z22.f, 90 \n\t" \ - "fcadd z8.f, p5/m, z8.f, z23.f, 90 \n\t" \ - "mov z0.f, z18.f \n\t" \ - "mov z1.f, z19.f \n\t" \ - "mov z2.f, z20.f \n\t" \ - "mov z3.f, z21.f \n\t" \ - "mov z4.f, z22.f \n\t" \ - "mov z5.f, z23.f \n\t" \ + "fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \ + "fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \ + "fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \ + "fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \ + "fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \ + "fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \ + "mov z0.s, z18.s \n\t" \ + "mov z1.s, z19.s \n\t" \ + "mov z2.s, z20.s \n\t" \ + "mov z3.s, z21.s \n\t" \ + "mov z4.s, z22.s \n\t" \ + "mov z5.s, z23.s \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -566,18 +566,18 @@ asm ( \ // YP_RECON_ACCUM #define YP_RECON_ACCUM_A64FXf \ asm ( \ - "fadd z0.f, p5/m, z0.f, z18.f \n\t" \ - "fsub z9.f, p5/m, z9.f, z18.f \n\t" \ - "fadd z1.f, p5/m, z1.f, z19.f \n\t" \ - "fsub z10.f, p5/m, z10.f, z19.f \n\t" \ - "fadd z2.f, p5/m, z2.f, z20.f \n\t" \ - "fsub z11.f, p5/m, z11.f, z20.f \n\t" \ - "fadd z3.f, p5/m, z3.f, z21.f \n\t" \ - "fadd z6.f, p5/m, z6.f, z21.f \n\t" \ - "fadd z4.f, p5/m, z4.f, z22.f \n\t" \ - "fadd z7.f, p5/m, z7.f, z22.f \n\t" \ - "fadd z5.f, p5/m, z5.f, z23.f \n\t" \ - "fadd z8.f, p5/m, z8.f, z23.f \n\t" \ + "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ + "fsub z9.s, p5/m, z9.s, z18.s \n\t" \ + "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ + "fsub z10.s, p5/m, z10.s, z19.s \n\t" \ + "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ + "fsub z11.s, p5/m, z11.s, z20.s \n\t" \ + "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ + "fadd z6.s, p5/m, z6.s, z21.s \n\t" \ + "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ + "fadd z7.s, p5/m, z7.s, z22.s \n\t" \ + "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ + "fadd z8.s, p5/m, z8.s, z23.s \n\t" \ : \ : \ : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -586,18 +586,18 @@ asm ( \ // YM_RECON_ACCUM #define YM_RECON_ACCUM_A64FXf \ asm ( \ - "fadd z0.f, p5/m, z0.f, z18.f \n\t" \ - "fadd z9.f, p5/m, z9.f, z18.f \n\t" \ - "fadd z1.f, p5/m, z1.f, z19.f \n\t" \ - "fadd z10.f, p5/m, z10.f, z19.f \n\t" \ - "fadd z2.f, p5/m, z2.f, z20.f \n\t" \ - "fadd z11.f, p5/m, z11.f, z20.f \n\t" \ - "fadd z3.f, p5/m, z3.f, z21.f \n\t" \ - "fsub z6.f, p5/m, z6.f, z21.f \n\t" \ - "fadd z4.f, p5/m, z4.f, z22.f \n\t" \ - "fsub z7.f, p5/m, z7.f, z22.f \n\t" \ - "fadd z5.f, p5/m, z5.f, z23.f \n\t" \ - "fsub z8.f, p5/m, z8.f, z23.f \n\t" \ + "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ + "fadd z9.s, p5/m, z9.s, z18.s \n\t" \ + "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ + "fadd z10.s, p5/m, z10.s, z19.s \n\t" \ + "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ + "fadd z11.s, p5/m, z11.s, z20.s \n\t" \ + "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ + "fsub z6.s, p5/m, z6.s, z21.s \n\t" \ + "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ + "fsub z7.s, p5/m, z7.s, z22.s \n\t" \ + "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ + "fsub z8.s, p5/m, z8.s, z23.s \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -606,18 +606,18 @@ asm ( \ // ZP_RECON_ACCUM #define ZP_RECON_ACCUM_A64FXf \ asm ( \ - "fcadd z6.f, p5/m, z6.f, z18.f, 270 \n\t" \ - "fadd z0.f, p5/m, z0.f, z18.f \n\t" \ - "fcadd z7.f, p5/m, z7.f, z19.f, 270 \n\t" \ - "fadd z1.f, p5/m, z1.f, z19.f \n\t" \ - "fcadd z8.f, p5/m, z8.f, z20.f, 270 \n\t" \ - "fadd z2.f, p5/m, z2.f, z20.f \n\t" \ - "fcadd z9.f, p5/m, z9.f, z21.f, 90 \n\t" \ - "fadd z3.f, p5/m, z3.f, z21.f \n\t" \ - "fcadd z10.f, p5/m, z10.f, z22.f, 90 \n\t" \ - "fadd z4.f, p5/m, z4.f, z22.f \n\t" \ - "fcadd z11.f, p5/m, z11.f, z23.f, 90 \n\t" \ - "fadd z5.f, p5/m, z5.f, z23.f \n\t" \ + "fcadd z6.s, p5/m, z6.s, z18.s, 270 \n\t" \ + "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ + "fcadd z7.s, p5/m, z7.s, z19.s, 270 \n\t" \ + "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ + "fcadd z8.s, p5/m, z8.s, z20.s, 270 \n\t" \ + "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ + "fcadd z9.s, p5/m, z9.s, z21.s, 90 \n\t" \ + "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ + "fcadd z10.s, p5/m, z10.s, z22.s, 90 \n\t" \ + "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ + "fcadd z11.s, p5/m, z11.s, z23.s, 90 \n\t" \ + "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -626,18 +626,18 @@ asm ( \ // ZM_RECON_ACCUM #define ZM_RECON_ACCUM_A64FXf \ asm ( \ - "fcadd z6.f, p5/m, z6.f, z18.f, 90 \n\t" \ - "fadd z0.f, p5/m, z0.f, z18.f \n\t" \ - "fcadd z7.f, p5/m, z7.f, z19.f, 90 \n\t" \ - "fadd z1.f, p5/m, z1.f, z19.f \n\t" \ - "fcadd z8.f, p5/m, z8.f, z20.f, 90 \n\t" \ - "fadd z2.f, p5/m, z2.f, z20.f \n\t" \ - "fcadd z9.f, p5/m, z9.f, z21.f, 270 \n\t" \ - "fadd z3.f, p5/m, z3.f, z21.f \n\t" \ - "fcadd z10.f, p5/m, z10.f, z22.f, 270 \n\t" \ - "fadd z4.f, p5/m, z4.f, z22.f \n\t" \ - "fcadd z11.f, p5/m, z11.f, z23.f, 270 \n\t" \ - "fadd z5.f, p5/m, z5.f, z23.f \n\t" \ + "fcadd z6.s, p5/m, z6.s, z18.s, 90 \n\t" \ + "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ + "fcadd z7.s, p5/m, z7.s, z19.s, 90 \n\t" \ + "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ 
+ "fcadd z8.s, p5/m, z8.s, z20.s, 90 \n\t" \ + "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ + "fcadd z9.s, p5/m, z9.s, z21.s, 270 \n\t" \ + "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ + "fcadd z10.s, p5/m, z10.s, z22.s, 270 \n\t" \ + "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ + "fcadd z11.s, p5/m, z11.s, z23.s, 270 \n\t" \ + "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -646,19 +646,19 @@ asm ( \ // TP_RECON_ACCUM #define TP_RECON_ACCUM_A64FXf \ asm ( \ - "ptrue p5.f \n\t" \ - "fadd z0.f, p5/m, z0.f, z18.f \n\t" \ - "fadd z6.f, p5/m, z6.f, z18.f \n\t" \ - "fadd z1.f, p5/m, z1.f, z19.f \n\t" \ - "fadd z7.f, p5/m, z7.f, z19.f \n\t" \ - "fadd z2.f, p5/m, z2.f, z20.f \n\t" \ - "fadd z8.f, p5/m, z8.f, z20.f \n\t" \ - "fadd z3.f, p5/m, z3.f, z21.f \n\t" \ - "fadd z9.f, p5/m, z9.f, z21.f \n\t" \ - "fadd z4.f, p5/m, z4.f, z22.f \n\t" \ - "fadd z10.f, p5/m, z10.f, z22.f \n\t" \ - "fadd z5.f, p5/m, z5.f, z23.f \n\t" \ - "fadd z11.f, p5/m, z11.f, z23.f \n\t" \ + "ptrue p5.s \n\t" \ + "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ + "fadd z6.s, p5/m, z6.s, z18.s \n\t" \ + "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ + "fadd z7.s, p5/m, z7.s, z19.s \n\t" \ + "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ + "fadd z8.s, p5/m, z8.s, z20.s \n\t" \ + "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ + "fadd z9.s, p5/m, z9.s, z21.s \n\t" \ + "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ + "fadd z10.s, p5/m, z10.s, z22.s \n\t" \ + "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ + "fadd z11.s, p5/m, z11.s, z23.s \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -667,18 +667,18 @@ asm ( \ // TM_RECON_ACCUM #define TM_RECON_ACCUM_A64FXf \ asm ( \ - "fadd z0.f, p5/m, z0.f, z18.f \n\t" \ - "fsub z6.f, p5/m, z6.f, z18.f \n\t" \ - "fadd z1.f, p5/m, z1.f, z19.f \n\t" \ - "fsub z7.f, p5/m, z7.f, z19.f \n\t" \ - "fadd z2.f, p5/m, z2.f, z20.f \n\t" \ - "fsub z8.f, p5/m, z8.f, z20.f \n\t" \ - "fadd z3.f, p5/m, z3.f, z21.f \n\t" \ - "fsub z9.f, p5/m, z9.f, z21.f \n\t" \ - "fadd z4.f, p5/m, z4.f, z22.f \n\t" \ - "fsub z10.f, p5/m, z10.f, z22.f \n\t" \ - "fadd z5.f, p5/m, z5.f, z23.f \n\t" \ - "fsub z11.f, p5/m, z11.f, z23.f \n\t" \ + "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ + "fsub z6.s, p5/m, z6.s, z18.s \n\t" \ + "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ + "fsub z7.s, p5/m, z7.s, z19.s \n\t" \ + "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ + "fsub z8.s, p5/m, z8.s, z20.s \n\t" \ + "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ + "fsub z9.s, p5/m, z9.s, z21.s \n\t" \ + "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ + "fsub z10.s, p5/m, z10.s, z22.s \n\t" \ + "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ + "fsub z11.s, p5/m, z11.s, z23.s \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -687,19 +687,19 @@ asm ( \ // ZERO_PSI #define ZERO_PSI_A64FXf \ asm ( \ - "ptrue p5.f \n\t" \ - "fmov z0.f , 0 \n\t" \ - "fmov z1.f , 0 \n\t" \ - "fmov z2.f , 0 \n\t" \ - "fmov z3.f , 0 \n\t" \ - "fmov z4.f , 0 \n\t" \ - "fmov z5.f , 0 \n\t" \ - "fmov z6.f , 0 \n\t" \ - "fmov z7.f , 0 \n\t" \ - "fmov z8.f , 0 \n\t" \ - "fmov z9.f , 0 \n\t" \ - "fmov z10.f , 0 \n\t" \ - "fmov z11.f , 0 \n\t" \ + "ptrue p5.s \n\t" \ + "fmov z0.s , 0 
\n\t" \ + "fmov z1.s , 0 \n\t" \ + "fmov z2.s , 0 \n\t" \ + "fmov z3.s , 0 \n\t" \ + "fmov z4.s , 0 \n\t" \ + "fmov z5.s , 0 \n\t" \ + "fmov z6.s , 0 \n\t" \ + "fmov z7.s , 0 \n\t" \ + "fmov z8.s , 0 \n\t" \ + "fmov z9.s , 0 \n\t" \ + "fmov z10.s , 0 \n\t" \ + "fmov z11.s , 0 \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -708,18 +708,18 @@ asm ( \ // ADD_RESULT_INTERNAL #define ADD_RESULT_INTERNAL_A64FXf \ asm ( \ - "fadd z0.f, p5/m, z0.f, z12.f \n\t" \ - "fadd z1.f, p5/m, z1.f, z13.f \n\t" \ - "fadd z2.f, p5/m, z2.f, z14.f \n\t" \ - "fadd z3.f, p5/m, z3.f, z15.f \n\t" \ - "fadd z4.f, p5/m, z4.f, z16.f \n\t" \ - "fadd z5.f, p5/m, z5.f, z17.f \n\t" \ - "fadd z6.f, p5/m, z6.f, z24.f \n\t" \ - "fadd z7.f, p5/m, z7.f, z25.f \n\t" \ - "fadd z8.f, p5/m, z8.f, z26.f \n\t" \ - "fadd z9.f, p5/m, z9.f, z27.f \n\t" \ - "fadd z10.f, p5/m, z10.f, z28.f \n\t" \ - "fadd z11.f, p5/m, z11.f, z29.f \n\t" \ + "fadd z0.s, p5/m, z0.s, z12.s \n\t" \ + "fadd z1.s, p5/m, z1.s, z13.s \n\t" \ + "fadd z2.s, p5/m, z2.s, z14.s \n\t" \ + "fadd z3.s, p5/m, z3.s, z15.s \n\t" \ + "fadd z4.s, p5/m, z4.s, z16.s \n\t" \ + "fadd z5.s, p5/m, z5.s, z17.s \n\t" \ + "fadd z6.s, p5/m, z6.s, z24.s \n\t" \ + "fadd z7.s, p5/m, z7.s, z25.s \n\t" \ + "fadd z8.s, p5/m, z8.s, z26.s \n\t" \ + "fadd z9.s, p5/m, z9.s, z27.s \n\t" \ + "fadd z10.s, p5/m, z10.s, z28.s \n\t" \ + "fadd z11.s, p5/m, z11.s, z29.s \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ From 5cdbb7e71e6f149ff4721593395a8998d24e87f0 Mon Sep 17 00:00:00 2001 From: nils meyer Date: Thu, 9 Apr 2020 21:23:39 +0200 Subject: [PATCH 027/147] fixed A64FX Dslash; compiles, but does not specialize -> assertion --- .../implementation/WilsonKernelsAsmA64FX.h | 4 +- Grid/simd/Fujitsu_A64FX_asm_double.h | 36 ++-- Grid/simd/Fujitsu_A64FX_asm_single.h | 186 +++++++++--------- Grid/simd/Fujitsu_A64FX_intrin_single.h | 24 +-- Grid/simd/Fujitsu_A64FX_undef.h | 1 + 5 files changed, 126 insertions(+), 125 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h index 10603ac3..46fbf6f3 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h @@ -30,7 +30,7 @@ Author: paboyle /* END LEGAL */ #pragma once -#if defined(A64FX) +#if defined(DSLASHA64FX) // undefine everything #include @@ -38,7 +38,7 @@ Author: paboyle /////////////////////////////////////////////////////////// // If we are A64FX specialise the single precision routine /////////////////////////////////////////////////////////// -#if defined(A64FXINTRIN) +#if defined(DSLASHINTRIN) #include #else #include diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h index b4e919b2..9269ec2a 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_double.h +++ b/Grid/simd/Fujitsu_A64FX_asm_double.h @@ -367,12 +367,12 @@ asm ( \ "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \ "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \ "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \ - "mov z0.d, z18.d \n\t" \ - "mov z1.d, z19.d \n\t" \ - "mov z2.d, z20.d \n\t" \ - "mov z3.d, z21.d \n\t" \ - "mov z4.d, z22.d \n\t" \ - "mov z5.d, z23.d \n\t" 
\ + "mov z0.d, p5/m, z18.d \n\t" \ + "mov z1.d, p5/m, z19.d \n\t" \ + "mov z2.d, p5/m, z20.d \n\t" \ + "mov z3.d, p5/m, z21.d \n\t" \ + "mov z4.d, p5/m, z22.d \n\t" \ + "mov z5.d, p5/m, z23.d \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -470,12 +470,12 @@ asm ( \ "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \ "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \ "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \ - "mov z0.d, z18.d \n\t" \ - "mov z1.d, z19.d \n\t" \ - "mov z2.d, z20.d \n\t" \ - "mov z3.d, z21.d \n\t" \ - "mov z4.d, z22.d \n\t" \ - "mov z5.d, z23.d \n\t" \ + "mov z0.d, p5/m, z18.d \n\t" \ + "mov z1.d, p5/m, z19.d \n\t" \ + "mov z2.d, p5/m, z20.d \n\t" \ + "mov z3.d, p5/m, z21.d \n\t" \ + "mov z4.d, p5/m, z22.d \n\t" \ + "mov z5.d, p5/m, z23.d \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -538,12 +538,12 @@ asm ( \ "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \ "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \ "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \ - "mov z0.d, z18.d \n\t" \ - "mov z1.d, z19.d \n\t" \ - "mov z2.d, z20.d \n\t" \ - "mov z3.d, z21.d \n\t" \ - "mov z4.d, z22.d \n\t" \ - "mov z5.d, z23.d \n\t" \ + "mov z0.d, p5/m, z18.d \n\t" \ + "mov z1.d, p5/m, z19.d \n\t" \ + "mov z2.d, p5/m, z20.d \n\t" \ + "mov z3.d, p5/m, z21.d \n\t" \ + "mov z4.d, p5/m, z22.d \n\t" \ + "mov z5.d, p5/m, z23.d \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h index d86d2ec5..ac710a97 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ b/Grid/simd/Fujitsu_A64FX_asm_single.h @@ -84,18 +84,18 @@ asm ( \ #define RESULT_A64FXf(base) \ { \ asm ( \ - "stnt1d { z0.s }, p5, [%[storeptr], -6, mul vl] \n\t" \ - "stnt1d { z1.s }, p5, [%[storeptr], -5, mul vl] \n\t" \ - "stnt1d { z2.s }, p5, [%[storeptr], -4, mul vl] \n\t" \ - "stnt1d { z3.s }, p5, [%[storeptr], -3, mul vl] \n\t" \ - "stnt1d { z4.s }, p5, [%[storeptr], -2, mul vl] \n\t" \ - "stnt1d { z5.s }, p5, [%[storeptr], -1, mul vl] \n\t" \ - "stnt1d { z6.s }, p5, [%[storeptr], 0, mul vl] \n\t" \ - "stnt1d { z7.s }, p5, [%[storeptr], 1, mul vl] \n\t" \ - "stnt1d { z8.s }, p5, [%[storeptr], 2, mul vl] \n\t" \ - "stnt1d { z9.s }, p5, [%[storeptr], 3, mul vl] \n\t" \ - "stnt1d { z10.s }, p5, [%[storeptr], 4, mul vl] \n\t" \ - "stnt1d { z11.s }, p5, [%[storeptr], 5, mul vl] \n\t" \ + "stnt1w { z0.s }, p5, [%[storeptr], -6, mul vl] \n\t" \ + "stnt1w { z1.s }, p5, [%[storeptr], -5, mul vl] \n\t" \ + "stnt1w { z2.s }, p5, [%[storeptr], -4, mul vl] \n\t" \ + "stnt1w { z3.s }, p5, [%[storeptr], -3, mul vl] \n\t" \ + "stnt1w { z4.s }, p5, [%[storeptr], -2, mul vl] \n\t" \ + "stnt1w { z5.s }, p5, [%[storeptr], -1, mul vl] \n\t" \ + "stnt1w { z6.s }, p5, [%[storeptr], 0, mul vl] \n\t" \ + "stnt1w { z7.s }, p5, [%[storeptr], 1, mul vl] \n\t" \ + "stnt1w { z8.s }, p5, [%[storeptr], 2, mul vl] \n\t" \ + "stnt1w { z9.s }, p5, [%[storeptr], 3, mul vl] \n\t" \ + "stnt1w { z10.s }, p5, [%[storeptr], 4, mul vl] \n\t" \ + "stnt1w { z11.s }, p5, [%[storeptr], 5, mul vl] \n\t" \ : \ : [storeptr] "r" (base + 2 * 3 * 
64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -155,12 +155,12 @@ asm ( \ #define LOAD_CHI_A64FXf(base) \ { \ asm ( \ - "ld1d { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ : \ : [fetchptr] "r" (base + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -171,18 +171,18 @@ asm ( \ { \ asm ( \ "ptrue p5.s \n\t" \ - "ld1d { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z27.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z24.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z28.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z25.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z29.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - "ld1d { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z26.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1w { z27.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1w { z24.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1w { z28.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1w { z25.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1w { z29.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1w { z26.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (base + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -194,18 +194,18 @@ asm ( \ const SiteSpinor & ref(in[offset]); \ asm ( \ "ptrue p5.s \n\t" \ - "ld1d { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z24.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z25.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z26.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - "ld1d { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z27.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z16.s }, p5/z, 
[%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z28.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z29.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1w { z24.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1w { z25.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1w { z26.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1w { z27.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1w { z28.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1w { z29.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ : \ : [fetchptr] "r" (&ref[2][0]) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -217,18 +217,18 @@ asm ( \ const SiteSpinor & ref(in[offset]); \ asm ( \ "ptrue p5.s \n\t" \ - "ld1d { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z27.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z28.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z29.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - "ld1d { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z24.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z25.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z26.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1w { z27.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1w { z28.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1w { z29.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1w { z24.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1w { z25.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1w { z26.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (&ref[2][0]) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -237,7 +237,7 @@ asm ( \ // PERM0 #define PERM0_A64FXf \ asm ( \ - "ld1d { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ "tbl z12.s, { z12.s }, z30.s \n\t" \ "tbl z13.s, { z13.s }, z30.s \n\t" \ "tbl z14.s, { z14.s }, z30.s \n\t" \ @@ -252,7 +252,7 @@ asm ( \ // PERM1 #define PERM1_A64FXf \ asm ( \ - "ld1d { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ "tbl z12.s, { z12.s }, z30.s \n\t" \ "tbl z13.s, { z13.s }, z30.s \n\t" \ "tbl z14.s, { z14.s }, z30.s \n\t" \ @@ -267,7 +267,7 @@ asm ( \ // PERM2 
#define PERM2_A64FXf \ asm ( \ - "ld1d { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ "tbl z12.s, { z12.s }, z30.s \n\t" \ "tbl z13.s, { z13.s }, z30.s \n\t" \ "tbl z14.s, { z14.s }, z30.s \n\t" \ @@ -282,7 +282,7 @@ asm ( \ // PERM3 #define PERM3_A64FXf \ asm ( \ - "ld1d { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ "tbl z12.s, { z12.s }, z30.s \n\t" \ "tbl z13.s, { z13.s }, z30.s \n\t" \ "tbl z14.s, { z14.s }, z30.s \n\t" \ @@ -299,12 +299,12 @@ asm ( \ { \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ asm ( \ - "ld1d { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z26.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z27.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z28.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z29.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1w { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1w { z26.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z27.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1w { z28.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1w { z29.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ "fmov z18.s , 0 \n\t" \ "fmov z21.s , 0 \n\t" \ "fmov z19.s , 0 \n\t" \ @@ -323,9 +323,9 @@ asm ( \ "fcmla z22.s, p5/m, z25.s, z15.s, 90 \n\t" \ "fcmla z20.s, p5/m, z26.s, z12.s, 90 \n\t" \ "fcmla z23.s, p5/m, z26.s, z15.s, 90 \n\t" \ - "ld1d { z24.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z25.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z26.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1w { z24.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1w { z25.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1w { z26.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ "fcmla z18.s, p5/m, z27.s, z13.s, 0 \n\t" \ "fcmla z21.s, p5/m, z27.s, z16.s, 0 \n\t" \ "fcmla z19.s, p5/m, z28.s, z13.s, 0 \n\t" \ @@ -359,7 +359,7 @@ asm ( \ #define XP_PROJ_A64FXf \ { \ asm ( \ - "ld1d { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ "fcadd z12.s, p5/m, z12.s, z27.s, 90 \n\t" \ "fcadd z13.s, p5/m, z13.s, z28.s, 90 \n\t" \ "fcadd z14.s, p5/m, z14.s, z29.s, 90 \n\t" \ @@ -380,12 +380,12 @@ asm ( \ "fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \ "fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \ "fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \ - "mov z0.s, z18.s \n\t" \ - "mov z1.s, z19.s \n\t" \ - "mov z2.s, z20.s \n\t" \ - "mov z3.s, z21.s \n\t" \ - "mov z4.s, z22.s \n\t" \ - "mov z5.s, z23.s \n\t" \ + "mov z0.s, p5/m, z18.s \n\t" \ + "mov z1.s, p5/m, z19.s \n\t" \ + "mov z2.s, p5/m, z20.s \n\t" \ + "mov z3.s, p5/m, z21.s \n\t" \ + "mov z4.s, p5/m, z22.s \n\t" \ + "mov z5.s, p5/m, z23.s \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -415,7 +415,7 @@ asm ( \ #define YP_PROJ_A64FXf \ { \ asm ( \ - "ld1d { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ "fsub z12.s, p5/m, z12.s, z27.s \n\t" \ "fsub z13.s, p5/m, z13.s, z28.s \n\t" \ "fsub z14.s, p5/m, z14.s, z29.s \n\t" \ @@ -431,7 +431,7 @@ asm ( \ #define ZP_PROJ_A64FXf \ { \ asm ( \ - 
"ld1d { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ "fcadd z12.s, p5/m, z12.s, z24.s, 90 \n\t" \ "fcadd z13.s, p5/m, z13.s, z25.s, 90 \n\t" \ "fcadd z14.s, p5/m, z14.s, z26.s, 90 \n\t" \ @@ -447,7 +447,7 @@ asm ( \ #define TP_PROJ_A64FXf \ { \ asm ( \ - "ld1d { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ "fadd z12.s, p5/m, z12.s, z24.s \n\t" \ "fadd z13.s, p5/m, z13.s, z25.s \n\t" \ "fadd z14.s, p5/m, z14.s, z26.s \n\t" \ @@ -463,7 +463,7 @@ asm ( \ #define XM_PROJ_A64FXf \ { \ asm ( \ - "ld1d { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ "fcadd z12.s, p5/m, z12.s, z27.s, 270 \n\t" \ "fcadd z13.s, p5/m, z13.s, z28.s, 270 \n\t" \ "fcadd z14.s, p5/m, z14.s, z29.s, 270 \n\t" \ @@ -484,12 +484,12 @@ asm ( \ "fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \ "fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \ "fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \ - "mov z0.s, z18.s \n\t" \ - "mov z1.s, z19.s \n\t" \ - "mov z2.s, z20.s \n\t" \ - "mov z3.s, z21.s \n\t" \ - "mov z4.s, z22.s \n\t" \ - "mov z5.s, z23.s \n\t" \ + "mov z0.s, p5/m, z18.s \n\t" \ + "mov z1.s, p5/m, z19.s \n\t" \ + "mov z2.s, p5/m, z20.s \n\t" \ + "mov z3.s, p5/m, z21.s \n\t" \ + "mov z4.s, p5/m, z22.s \n\t" \ + "mov z5.s, p5/m, z23.s \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -499,7 +499,7 @@ asm ( \ #define YM_PROJ_A64FXf \ { \ asm ( \ - "ld1d { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ "fadd z12.s, p5/m, z12.s, z27.s \n\t" \ "fadd z13.s, p5/m, z13.s, z28.s \n\t" \ "fadd z14.s, p5/m, z14.s, z29.s \n\t" \ @@ -515,7 +515,7 @@ asm ( \ #define ZM_PROJ_A64FXf \ { \ asm ( \ - "ld1d { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ "fcadd z12.s, p5/m, z12.s, z24.s, 270 \n\t" \ "fcadd z13.s, p5/m, z13.s, z25.s, 270 \n\t" \ "fcadd z14.s, p5/m, z14.s, z26.s, 270 \n\t" \ @@ -531,7 +531,7 @@ asm ( \ #define TM_PROJ_A64FXf \ { \ asm ( \ - "ld1d { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ "fsub z12.s, p5/m, z12.s, z24.s \n\t" \ "fsub z13.s, p5/m, z13.s, z25.s \n\t" \ "fsub z14.s, p5/m, z14.s, z26.s \n\t" \ @@ -552,12 +552,12 @@ asm ( \ "fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \ "fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \ "fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \ - "mov z0.s, z18.s \n\t" \ - "mov z1.s, z19.s \n\t" \ - "mov z2.s, z20.s \n\t" \ - "mov z3.s, z21.s \n\t" \ - "mov z4.s, z22.s \n\t" \ - "mov z5.s, z23.s \n\t" \ + "mov z0.s, p5/m, z18.s \n\t" \ + "mov z1.s, p5/m, z19.s \n\t" \ + "mov z2.s, p5/m, z20.s \n\t" \ + "mov z3.s, p5/m, z21.s \n\t" \ + "mov z4.s, p5/m, z22.s \n\t" \ + "mov z5.s, p5/m, z23.s \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h index 0fdec23f..c8e12652 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_single.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -229,7 
+229,7 @@ Author: Nils Meyer } // PERM0 #define PERM0_A64FXf \ - table0 = svld1(pg1, (uint32_t*)&lut[0]); \ + table0 = svld1(pg1, (float32_t*)&lut[0]); \ Chi_00 = svtbl(Chi_00, table0); \ Chi_01 = svtbl(Chi_01, table0); \ Chi_02 = svtbl(Chi_02, table0); \ @@ -239,7 +239,7 @@ Author: Nils Meyer // PERM1 #define PERM1_A64FXf \ - table0 = svld1(pg1, (uint32_t*)&lut[1]); \ + table0 = svld1(pg1, (float32_t*)&lut[1]); \ Chi_00 = svtbl(Chi_00, table0); \ Chi_01 = svtbl(Chi_01, table0); \ Chi_02 = svtbl(Chi_02, table0); \ @@ -249,7 +249,7 @@ Author: Nils Meyer // PERM2 #define PERM2_A64FXf \ - table0 = svld1(pg1, (uint32_t*)&lut[2]); \ + table0 = svld1(pg1, (float32_t*)&lut[2]); \ Chi_00 = svtbl(Chi_00, table0); \ Chi_01 = svtbl(Chi_01, table0); \ Chi_02 = svtbl(Chi_02, table0); \ @@ -259,7 +259,7 @@ Author: Nils Meyer // PERM3 #define PERM3_A64FXf \ - table0 = svld1(pg1, (uint32_t*)&lut[3]); \ + table0 = svld1(pg1, (float32_t*)&lut[3]); \ Chi_00 = svtbl(Chi_00, table0); \ Chi_01 = svtbl(Chi_01, table0); \ Chi_02 = svtbl(Chi_02, table0); \ @@ -326,7 +326,7 @@ Author: Nils Meyer // XP_PROJ #define XP_PROJ_A64FXf \ { \ - table0 = svld1(pg1, (uint32_t*)&lut[3]); \ + table0 = svld1(pg1, (float32_t*)&lut[3]); \ Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 90); \ Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 90); \ Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 90); \ @@ -367,7 +367,7 @@ Author: Nils Meyer // YP_PROJ #define YP_PROJ_A64FXf \ { \ - table0 = svld1(pg1, (uint32_t*)&lut[2]); \ + table0 = svld1(pg1, (float32_t*)&lut[2]); \ Chi_00 = svsub_x(pg1, Chimu_00, Chimu_30); \ Chi_01 = svsub_x(pg1, Chimu_01, Chimu_31); \ Chi_02 = svsub_x(pg1, Chimu_02, Chimu_32); \ @@ -378,7 +378,7 @@ Author: Nils Meyer // ZP_PROJ #define ZP_PROJ_A64FXf \ { \ - table0 = svld1(pg1, (uint32_t*)&lut[1]); \ + table0 = svld1(pg1, (float32_t*)&lut[1]); \ Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 90); \ Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 90); \ Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 90); \ @@ -389,7 +389,7 @@ Author: Nils Meyer // TP_PROJ #define TP_PROJ_A64FXf \ { \ - table0 = svld1(pg1, (uint32_t*)&lut[0]); \ + table0 = svld1(pg1, (float32_t*)&lut[0]); \ Chi_00 = svadd_x(pg1, Chimu_00, Chimu_20); \ Chi_01 = svadd_x(pg1, Chimu_01, Chimu_21); \ Chi_02 = svadd_x(pg1, Chimu_02, Chimu_22); \ @@ -400,7 +400,7 @@ Author: Nils Meyer // XM_PROJ #define XM_PROJ_A64FXf \ { \ - table0 = svld1(pg1, (uint32_t*)&lut[3]); \ + table0 = svld1(pg1, (float32_t*)&lut[3]); \ Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 270); \ Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 270); \ Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 270); \ @@ -426,7 +426,7 @@ Author: Nils Meyer // YM_PROJ #define YM_PROJ_A64FXf \ { \ - table0 = svld1(pg1, (uint32_t*)&lut[2]); \ + table0 = svld1(pg1, (float32_t*)&lut[2]); \ Chi_00 = svadd_x(pg1, Chimu_00, Chimu_30); \ Chi_01 = svadd_x(pg1, Chimu_01, Chimu_31); \ Chi_02 = svadd_x(pg1, Chimu_02, Chimu_32); \ @@ -437,7 +437,7 @@ Author: Nils Meyer // ZM_PROJ #define ZM_PROJ_A64FXf \ { \ - table0 = svld1(pg1, (uint32_t*)&lut[1]); \ + table0 = svld1(pg1, (float32_t*)&lut[1]); \ Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 270); \ Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 270); \ Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 270); \ @@ -448,7 +448,7 @@ Author: Nils Meyer // TM_PROJ #define TM_PROJ_A64FXf \ { \ - table0 = svld1(pg1, (uint32_t*)&lut[0]); \ + table0 = svld1(pg1, (float32_t*)&lut[0]); \ Chi_00 = svsub_x(pg1, Chimu_00, Chimu_20); \ Chi_01 = svsub_x(pg1, Chimu_01, Chimu_21); \ Chi_02 = svsub_x(pg1, Chimu_02, Chimu_22); \ 
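Aside on the PERM hunks above: each PERM*_A64FXf is a table-driven lane permutation — load an index vector once, then gather every data register through it with tbl (asm) or svtbl (intrinsics). Below is a minimal standalone sketch of that idiom using only ACLE intrinsics from arm_sve.h; the function name swap_pairs and the on-the-fly index computation are illustrative assumptions, not part of this patch (the kernels load precomputed index tables from lut):

    #include <arm_sve.h>

    // Illustrative only: exchange neighboring 32-bit lanes, out[i] = in[i ^ 1].
    // One representative of the permutation family the lut tables encode.
    void swap_pairs(const float *in, float *out) {
      svbool_t    pg  = svptrue_b32();         // all 32-bit lanes active
      svuint32_t  idx = svindex_u32(0, 1);     // 0,1,2,3,...
      idx = sveor_n_u32_x(pg, idx, 1);         // 1,0,3,2,... (i ^ 1)
      svfloat32_t v   = svld1_f32(pg, in);
      svst1_f32(pg, out, svtbl_f32(v, idx));   // out[i] = in[idx[i]]
    }

The same element-width reasoning motivates the ld1d -> ld1w edits to the table loads in the single-precision asm file: with .s (32-bit) lane specifiers on z30, the load of the index table has to use the 32-bit element form.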
diff --git a/Grid/simd/Fujitsu_A64FX_undef.h b/Grid/simd/Fujitsu_A64FX_undef.h index a09e1698..31abc038 100644 --- a/Grid/simd/Fujitsu_A64FX_undef.h +++ b/Grid/simd/Fujitsu_A64FX_undef.h @@ -32,6 +32,7 @@ Author: Nils Meyer #undef PREFETCH_GAUGE_L1 #undef PREFETCH_CHIMU_L2 #undef PREFETCH_GAUGE_L2 +#undef PREFETCH_GAUGE_L1_INTERNAL #undef PF_GAUGE #undef PREFETCH1_CHIMU #undef PREFETCH_CHIMU From 635246ce50346273b26b5db80c86ecba0618021b Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 9 Apr 2020 21:42:50 +0200 Subject: [PATCH 028/147] corrected typo --- Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h index 46fbf6f3..c789632f 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h @@ -353,7 +353,7 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, Double // If we are A64FX specialise the double precision routine /////////////////////////////////////////////////////////// -#if defined(A64FXINTRIN) +#if defined(DSLASHINTRIN) #include #else #include From 19eef97503199aead4b681192622e8c64641f6ff Mon Sep 17 00:00:00 2001 From: nils meyer Date: Thu, 9 Apr 2020 23:25:25 +0200 Subject: [PATCH 029/147] specialized A64FX Dslash kernels --- .../fermion/implementation/WilsonKernelsAsmA64FX.h | 12 +++++++----- .../WilsonKernelsInstantiationWilsonAdjImplD.cc | 2 ++ .../WilsonKernelsInstantiationWilsonAdjImplF.cc | 2 ++ .../WilsonKernelsInstantiationWilsonImplD.cc | 2 ++ .../WilsonKernelsInstantiationWilsonImplDF.cc | 2 ++ .../WilsonKernelsInstantiationWilsonImplF.cc | 2 ++ .../WilsonKernelsInstantiationWilsonImplFH.cc | 2 ++ .../WilsonKernelsInstantiation.cc.master | 2 ++ ...sInstantiationWilsonTwoIndexAntiSymmetricImplD.cc | 2 ++ ...sInstantiationWilsonTwoIndexAntiSymmetricImplF.cc | 2 ++ ...rnelsInstantiationWilsonTwoIndexSymmetricImplD.cc | 2 ++ ...rnelsInstantiationWilsonTwoIndexSymmetricImplF.cc | 2 ++ .../WilsonKernelsInstantiationZWilsonImplD.cc | 2 ++ .../WilsonKernelsInstantiationZWilsonImplDF.cc | 2 ++ .../WilsonKernelsInstantiationZWilsonImplF.cc | 2 ++ .../WilsonKernelsInstantiationZWilsonImplFH.cc | 2 ++ Grid/util/version.cc | 2 +- 17 files changed, 38 insertions(+), 6 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h index 46fbf6f3..e3c40fe6 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h @@ -30,7 +30,9 @@ Author: paboyle /* END LEGAL */ #pragma once -#if defined(DSLASHA64FX) +#if defined(A64FX) + +#pragma message("invoking A64FX Dslash") // undefine everything #include @@ -194,9 +196,9 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFie int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -#undef MAYBEPERM +//#undef MAYBEPERM //#undef MULT_2SPIN -#define MAYBEPERM(A,B) +//#define MAYBEPERM(A,B) //#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) ///////////////////////////////////////////////////////////////// @@ -346,7 +348,7 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, Double //#undef MAYBEPERM //#undef MULT_2SPIN -// undefine everything +// undefine #include /////////////////////////////////////////////////////////// @@ -359,7 
+361,7 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, Double
 #include
 #endif
 
-// KNL stuff
+// former KNL
 //#define MAYBEPERM(A,perm) if (perm) { A ; }
 //#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 //#define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0];
diff --git a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc
index 9af5ed85..f2c0f9d2 100644
--- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc
@@ -34,9 +34,11 @@ directory
 #ifndef AVX512
 #ifndef QPX
+#ifndef A64FX
 #include
 #endif
 #endif
+#endif
 
 NAMESPACE_BEGIN(Grid);
diff --git a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc
index 9af5ed85..f2c0f9d2 100644
--- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc
@@ -34,9 +34,11 @@ directory
 #ifndef AVX512
 #ifndef QPX
+#ifndef A64FX
 #include
 #endif
 #endif
+#endif
 
 NAMESPACE_BEGIN(Grid);
diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc
index 9af5ed85..f2c0f9d2 100644
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc
@@ -34,9 +34,11 @@ directory
 #ifndef AVX512
 #ifndef QPX
+#ifndef A64FX
 #include
 #endif
 #endif
+#endif
 
 NAMESPACE_BEGIN(Grid);
diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonKernelsInstantiationWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonKernelsInstantiationWilsonImplDF.cc
index 9af5ed85..f2c0f9d2 100644
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonKernelsInstantiationWilsonImplDF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonKernelsInstantiationWilsonImplDF.cc
@@ -34,9 +34,11 @@ directory
 #ifndef AVX512
 #ifndef QPX
+#ifndef A64FX
 #include
 #endif
 #endif
+#endif
 
 NAMESPACE_BEGIN(Grid);
diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc
index 9af5ed85..f2c0f9d2 100644
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc
@@ -34,9 +34,11 @@ directory
 #ifndef AVX512
 #ifndef QPX
+#ifndef A64FX
 #include
 #endif
 #endif
+#endif
 
 NAMESPACE_BEGIN(Grid);
diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonKernelsInstantiationWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonKernelsInstantiationWilsonImplFH.cc
index 9af5ed85..f2c0f9d2 100644
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonKernelsInstantiationWilsonImplFH.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonKernelsInstantiationWilsonImplFH.cc
@@ -34,9 +34,11 @@ directory
 #ifndef AVX512
 #ifndef QPX
+#ifndef A64FX
 #include
 #endif
 #endif
+#endif
 
 NAMESPACE_BEGIN(Grid);
diff --git a/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiation.cc.master b/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiation.cc.master
index 9af5ed85..f2c0f9d2 100644
--- a/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiation.cc.master
+++ b/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiation.cc.master
@@ -34,9 +34,11 @@ directory
 #ifndef AVX512
 #ifndef QPX
+#ifndef A64FX
 #include
 #endif
 #endif
+#endif
 
 NAMESPACE_BEGIN(Grid);
diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc
index 9af5ed85..f2c0f9d2 100644
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc
@@ -34,9 +34,11 @@ directory
 #ifndef AVX512
 #ifndef QPX
+#ifndef A64FX
 #include
 #endif
 #endif
+#endif
 
 NAMESPACE_BEGIN(Grid);
diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc
index 9af5ed85..f2c0f9d2 100644
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc
@@ -34,9 +34,11 @@ directory
 #ifndef AVX512
 #ifndef QPX
+#ifndef A64FX
 #include
 #endif
 #endif
+#endif
 
 NAMESPACE_BEGIN(Grid);
diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc
index 9af5ed85..f2c0f9d2 100644
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc
@@ -34,9 +34,11 @@ directory
 #ifndef AVX512
 #ifndef QPX
+#ifndef A64FX
 #include
 #endif
 #endif
+#endif
 
 NAMESPACE_BEGIN(Grid);
diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc
index 9af5ed85..f2c0f9d2 100644
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc
@@ -34,9 +34,11 @@ directory
 #ifndef AVX512
 #ifndef QPX
+#ifndef A64FX
 #include
 #endif
 #endif
+#endif
 
 NAMESPACE_BEGIN(Grid);
diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc
index 9af5ed85..f2c0f9d2 100644
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc
@@ -34,9 +34,11 @@ directory
 #ifndef AVX512
 #ifndef QPX
+#ifndef A64FX
 #include
 #endif
 #endif
+#endif
 
 NAMESPACE_BEGIN(Grid);
diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonKernelsInstantiationZWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonKernelsInstantiationZWilsonImplDF.cc
index 9af5ed85..f2c0f9d2 100644
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonKernelsInstantiationZWilsonImplDF.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonKernelsInstantiationZWilsonImplDF.cc
@@ -34,9 +34,11 @@ directory
 #ifndef AVX512
 #ifndef QPX
+#ifndef A64FX
 #include
 #endif
 #endif
+#endif
 
 NAMESPACE_BEGIN(Grid);
diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc
index 9af5ed85..f2c0f9d2 100644
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc
@@ -34,9 +34,11 @@ directory
 #ifndef AVX512
 #ifndef QPX
+#ifndef A64FX
 #include
 #endif
 #endif
+#endif
 
 NAMESPACE_BEGIN(Grid);
diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonKernelsInstantiationZWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonKernelsInstantiationZWilsonImplFH.cc
index 9af5ed85..f2c0f9d2 100644
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonKernelsInstantiationZWilsonImplFH.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonKernelsInstantiationZWilsonImplFH.cc
@@ -34,9 +34,11 @@ directory
 #ifndef AVX512
 #ifndef QPX
+#ifndef A64FX
 #include
 #endif
 #endif
+#endif
 
 NAMESPACE_BEGIN(Grid);
diff --git a/Grid/util/version.cc b/Grid/util/version.cc
index c9507137..984765d1 100644
--- a/Grid/util/version.cc
+++ b/Grid/util/version.cc
@@ -1,5 +1,5 @@
 #include
-#include "Version.h"
+//#include "Version.h"
 namespace Grid {
 void printHash(){
 #ifdef GITHASH

From dc9c8340bbef174a98fec501049c5f07d2979ad4 Mon Sep 17 00:00:00 2001
From: nils meyer
Date: Thu, 9 Apr 2020 23:30:23 +0200
Subject: [PATCH 030/147] switched to DSLASHINTRIN for A64FX Dslash intrinsics

---
 Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h
index e3c40fe6..139721e6 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h
@@ -355,7 +355,7 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, Double
 // If we are A64FX specialise the double precision routine
 ///////////////////////////////////////////////////////////
 
-#if defined(A64FXINTRIN)
+#if defined(DSLASHINTRIN)
 #include
 #else
 #include

From 0ad2e0815c96d2ed473df7b0c289a07a4e8220dc Mon Sep 17 00:00:00 2001
From: nmeyer-ur
Date: Fri, 10 Apr 2020 11:56:29 +0200
Subject: [PATCH 031/147] debug output in WilsonHand

---
 .../WilsonKernelsHandImplementation.h | 604 ++++++++++++++--
.../WilsonKernelsHandImplementation.h.orig | 684 ++++++++++++++++++ 2 files changed, 1216 insertions(+), 72 deletions(-) create mode 100644 Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h.orig diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h index f7b018fa..19ed3e06 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h @@ -1,6 +1,6 @@ /************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid + Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/qcd/action/fermion/WilsonKernelsHand.cc @@ -32,29 +32,29 @@ Author: paboyle #include -#undef LOAD_CHIMU -#undef LOAD_CHI +#undef LOAD_CHIMU +#undef LOAD_CHI #undef MULT_2SPIN #undef PERMUTE_DIR -#undef XP_PROJ -#undef YP_PROJ -#undef ZP_PROJ -#undef TP_PROJ -#undef XM_PROJ -#undef YM_PROJ -#undef ZM_PROJ -#undef TM_PROJ -#undef XP_RECON -#undef XP_RECON_ACCUM -#undef XM_RECON -#undef XM_RECON_ACCUM -#undef YP_RECON_ACCUM -#undef YM_RECON_ACCUM -#undef ZP_RECON_ACCUM -#undef ZM_RECON_ACCUM -#undef TP_RECON_ACCUM -#undef TM_RECON_ACCUM -#undef ZERO_RESULT +#undef XP_PROJ +#undef YP_PROJ +#undef ZP_PROJ +#undef TP_PROJ +#undef XM_PROJ +#undef YM_PROJ +#undef ZM_PROJ +#undef TM_PROJ +#undef XP_RECON +#undef XP_RECON_ACCUM +#undef XM_RECON +#undef XM_RECON_ACCUM +#undef YP_RECON_ACCUM +#undef YM_RECON_ACCUM +#undef ZP_RECON_ACCUM +#undef ZM_RECON_ACCUM +#undef TP_RECON_ACCUM +#undef TM_RECON_ACCUM +#undef ZERO_RESULT #undef Chimu_00 #undef Chimu_01 #undef Chimu_02 @@ -89,7 +89,33 @@ Author: paboyle Chimu_22=ref()(2)(2);\ Chimu_30=ref()(3)(0);\ Chimu_31=ref()(3)(1);\ - Chimu_32=ref()(3)(2);} + Chimu_32=ref()(3)(2);\ + std::cout << std::endl << "DEBUG -- LOAD_CHIMU" << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chimu_00); \ + std::cout << "Chimu_00 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chimu_01); \ + std::cout << "Chimu_01 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chimu_02); \ + std::cout << "Chimu_02 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chimu_10); \ + std::cout << "Chimu_10 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chimu_11); \ + std::cout << "Chimu_11 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chimu_12); \ + std::cout << "Chimu_12 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chimu_20); \ + std::cout << "Chimu_20 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chimu_21); \ + std::cout << "Chimu_21 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chimu_22); \ + std::cout << "Chimu_22 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chimu_30); \ + std::cout << "Chimu_30 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chimu_31); \ + std::cout << "Chimu_31 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chimu_32); \ + std::cout << "Chimu_32 -- " << debugreg << std::endl; \ +} #define LOAD_CHI\ {const SiteHalfSpinor &ref(buf[offset]); \ @@ -98,7 +124,21 @@ Author: paboyle Chi_02 = ref()(0)(2);\ Chi_10 = ref()(1)(0);\ Chi_11 = ref()(1)(1);\ - Chi_12 = ref()(1)(2);} + Chi_12 = ref()(1)(2);\ + std::cout << std::endl << "DEBUG -- 
LOAD_CHI" << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_00); \ + std::cout << "Chi_00 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_01); \ + std::cout << "Chi_01 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_02); \ + std::cout << "Chi_02 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_10); \ + std::cout << "Chi_10 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_11); \ + std::cout << "Chi_11 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_12); \ + std::cout << "Chi_12 -- " << debugreg << std::endl; \ + } // To splat or not to splat depends on the implementation #define MULT_2SPIN(A)\ @@ -129,16 +169,56 @@ Author: paboyle UChi_01+= U_10*Chi_02;\ UChi_11+= U_10*Chi_12;\ UChi_02+= U_20*Chi_02;\ - UChi_12+= U_20*Chi_12;} + UChi_12+= U_20*Chi_12;\ + std::cout << std::endl << "DEBUG -- MULT_2SPIN" << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, UChi_00); \ + std::cout << "UChi_00 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, UChi_01); \ + std::cout << "UChi_01 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, UChi_02); \ + std::cout << "UChi_02 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, UChi_10); \ + std::cout << "UChi_10 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, UChi_11); \ + std::cout << "UChi_11 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, UChi_12); \ + std::cout << "UChi_12 -- " << debugreg << std::endl; \ + } #define PERMUTE_DIR(dir) \ +std::cout << std::endl << "DEBUG -- PERM PRE" << std::endl; \ +svst1(pg1, (float64_t*)&debugreg.v, Chi_00); \ +std::cout << "Chi_00 -- " << debugreg << std::endl; \ +svst1(pg1, (float64_t*)&debugreg.v, Chi_01); \ +std::cout << "Chi_01 -- " << debugreg << std::endl; \ +svst1(pg1, (float64_t*)&debugreg.v, Chi_02); \ +std::cout << "Chi_02 -- " << debugreg << std::endl; \ +svst1(pg1, (float64_t*)&debugreg.v, Chi_10); \ +std::cout << "Chi_10 -- " << debugreg << std::endl; \ +svst1(pg1, (float64_t*)&debugreg.v, Chi_11); \ +std::cout << "Chi_11 -- " << debugreg << std::endl; \ +svst1(pg1, (float64_t*)&debugreg.v, Chi_12); \ +std::cout << "Chi_12 -- " << debugreg << std::endl; \ permute##dir(Chi_00,Chi_00);\ permute##dir(Chi_01,Chi_01);\ permute##dir(Chi_02,Chi_02);\ permute##dir(Chi_10,Chi_10);\ permute##dir(Chi_11,Chi_11);\ - permute##dir(Chi_12,Chi_12); + permute##dir(Chi_12,Chi_12);\ + std::cout << std::endl << "DEBUG -- PERM POST" << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_00); \ + std::cout << "Chi_00 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_01); \ + std::cout << "Chi_01 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_02); \ + std::cout << "Chi_02 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_10); \ + std::cout << "Chi_10 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_11); \ + std::cout << "Chi_11 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_12); \ + std::cout << "Chi_12 -- " << debugreg << std::endl; // hspin(0)=fspin(0)+timesI(fspin(3)); // hspin(1)=fspin(1)+timesI(fspin(2)); @@ -148,7 +228,20 @@ Author: paboyle Chi_02 = Chimu_02+timesI(Chimu_32);\ Chi_10 = Chimu_10+timesI(Chimu_20);\ Chi_11 = Chimu_11+timesI(Chimu_21);\ - Chi_12 = Chimu_12+timesI(Chimu_22); + Chi_12 = Chimu_12+timesI(Chimu_22);\ + std::cout << 
std::endl << "DEBUG -- XP_PROJ" << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_00); \ + std::cout << "Chi_00 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_01); \ + std::cout << "Chi_01 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_02); \ + std::cout << "Chi_02 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_10); \ + std::cout << "Chi_10 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_11); \ + std::cout << "Chi_11 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_12); \ + std::cout << "Chi_12 -- " << debugreg << std::endl; #define YP_PROJ \ Chi_00 = Chimu_00-Chimu_30;\ @@ -156,7 +249,20 @@ Author: paboyle Chi_02 = Chimu_02-Chimu_32;\ Chi_10 = Chimu_10+Chimu_20;\ Chi_11 = Chimu_11+Chimu_21;\ - Chi_12 = Chimu_12+Chimu_22; + Chi_12 = Chimu_12+Chimu_22;\ + std::cout << std::endl << "DEBUG -- YP_PROJ" << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_00); \ + std::cout << "Chi_00 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_01); \ + std::cout << "Chi_01 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_02); \ + std::cout << "Chi_02 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_10); \ + std::cout << "Chi_10 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_11); \ + std::cout << "Chi_11 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_12); \ + std::cout << "Chi_12 -- " << debugreg << std::endl; #define ZP_PROJ \ Chi_00 = Chimu_00+timesI(Chimu_20); \ @@ -164,7 +270,20 @@ Author: paboyle Chi_02 = Chimu_02+timesI(Chimu_22); \ Chi_10 = Chimu_10-timesI(Chimu_30); \ Chi_11 = Chimu_11-timesI(Chimu_31); \ - Chi_12 = Chimu_12-timesI(Chimu_32); + Chi_12 = Chimu_12-timesI(Chimu_32);\ + std::cout << std::endl << "DEBUG -- ZP_PROJ" << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_00); \ + std::cout << "Chi_00 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_01); \ + std::cout << "Chi_01 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_02); \ + std::cout << "Chi_02 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_10); \ + std::cout << "Chi_10 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_11); \ + std::cout << "Chi_11 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_12); \ + std::cout << "Chi_12 -- " << debugreg << std::endl; #define TP_PROJ \ Chi_00 = Chimu_00+Chimu_20; \ @@ -172,7 +291,20 @@ Author: paboyle Chi_02 = Chimu_02+Chimu_22; \ Chi_10 = Chimu_10+Chimu_30; \ Chi_11 = Chimu_11+Chimu_31; \ - Chi_12 = Chimu_12+Chimu_32; + Chi_12 = Chimu_12+Chimu_32;\ + std::cout << std::endl << "DEBUG -- TP_PROJ" << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_00); \ + std::cout << "Chi_00 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_01); \ + std::cout << "Chi_01 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_02); \ + std::cout << "Chi_02 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_10); \ + std::cout << "Chi_10 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_11); \ + std::cout << "Chi_11 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_12); \ + std::cout << "Chi_12 -- " << debugreg << std::endl; // hspin(0)=fspin(0)-timesI(fspin(3)); @@ -183,7 
+315,20 @@ Author: paboyle Chi_02 = Chimu_02-timesI(Chimu_32);\ Chi_10 = Chimu_10-timesI(Chimu_20);\ Chi_11 = Chimu_11-timesI(Chimu_21);\ - Chi_12 = Chimu_12-timesI(Chimu_22); + Chi_12 = Chimu_12-timesI(Chimu_22);\ + std::cout << std::endl << "DEBUG -- XM_PROJ" << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_00); \ + std::cout << "Chi_00 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_01); \ + std::cout << "Chi_01 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_02); \ + std::cout << "Chi_02 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_10); \ + std::cout << "Chi_10 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_11); \ + std::cout << "Chi_11 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_12); \ + std::cout << "Chi_12 -- " << debugreg << std::endl; #define YM_PROJ \ Chi_00 = Chimu_00+Chimu_30;\ @@ -191,7 +336,20 @@ Author: paboyle Chi_02 = Chimu_02+Chimu_32;\ Chi_10 = Chimu_10-Chimu_20;\ Chi_11 = Chimu_11-Chimu_21;\ - Chi_12 = Chimu_12-Chimu_22; + Chi_12 = Chimu_12-Chimu_22;\ + std::cout << std::endl << "DEBUG -- YM_PROJ" << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_00); \ + std::cout << "Chi_00 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_01); \ + std::cout << "Chi_01 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_02); \ + std::cout << "Chi_02 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_10); \ + std::cout << "Chi_10 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_11); \ + std::cout << "Chi_11 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_12); \ + std::cout << "Chi_12 -- " << debugreg << std::endl; #define ZM_PROJ \ Chi_00 = Chimu_00-timesI(Chimu_20); \ @@ -199,7 +357,20 @@ Author: paboyle Chi_02 = Chimu_02-timesI(Chimu_22); \ Chi_10 = Chimu_10+timesI(Chimu_30); \ Chi_11 = Chimu_11+timesI(Chimu_31); \ - Chi_12 = Chimu_12+timesI(Chimu_32); + Chi_12 = Chimu_12+timesI(Chimu_32);\ + std::cout << std::endl << "DEBUG -- ZM_PROJ" << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_00); \ + std::cout << "Chi_00 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_01); \ + std::cout << "Chi_01 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_02); \ + std::cout << "Chi_02 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_10); \ + std::cout << "Chi_10 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_11); \ + std::cout << "Chi_11 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_12); \ + std::cout << "Chi_12 -- " << debugreg << std::endl; #define TM_PROJ \ Chi_00 = Chimu_00-Chimu_20; \ @@ -207,7 +378,20 @@ Author: paboyle Chi_02 = Chimu_02-Chimu_22; \ Chi_10 = Chimu_10-Chimu_30; \ Chi_11 = Chimu_11-Chimu_31; \ - Chi_12 = Chimu_12-Chimu_32; + Chi_12 = Chimu_12-Chimu_32;\ + std::cout << std::endl << "DEBUG -- TM_PROJ" << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_00); \ + std::cout << "Chi_00 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_01); \ + std::cout << "Chi_01 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_02); \ + std::cout << "Chi_02 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_10); \ + std::cout << "Chi_10 -- " << debugreg << std::endl; \ + svst1(pg1, 
(float64_t*)&debugreg.v, Chi_11); \ + std::cout << "Chi_11 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, Chi_12); \ + std::cout << "Chi_12 -- " << debugreg << std::endl; // fspin(0)=hspin(0); // fspin(1)=hspin(1); @@ -225,7 +409,32 @@ Author: paboyle result_22 = timesMinusI(UChi_12);\ result_30 = timesMinusI(UChi_00);\ result_31 = timesMinusI(UChi_01);\ - result_32 = timesMinusI(UChi_02); + result_32 = timesMinusI(UChi_02);\ + std::cout << std::endl << "DEBUG -- XP_RECON" << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_00); \ + std::cout << "result_00 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_01); \ + std::cout << "result_01 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_02); \ + std::cout << "result_02 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_10); \ + std::cout << "result_10 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_11); \ + std::cout << "result_11 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_12); \ + std::cout << "result_12 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_20); \ + std::cout << "result_20 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_21); \ + std::cout << "result_21 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_22); \ + std::cout << "result_22 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_30); \ + std::cout << "result_30 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_31); \ + std::cout << "result_31 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_32); \ + std::cout << "result_32 -- " << debugreg << std::endl; #define XP_RECON_ACCUM\ result_00+=UChi_00;\ @@ -239,7 +448,32 @@ Author: paboyle result_22-=timesI(UChi_12);\ result_30-=timesI(UChi_00);\ result_31-=timesI(UChi_01);\ - result_32-=timesI(UChi_02); + result_32-=timesI(UChi_02);\ + std::cout << std::endl << "DEBUG -- XP_RECON_ACCUM" << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_00); \ + std::cout << "result_00 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_01); \ + std::cout << "result_01 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_02); \ + std::cout << "result_02 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_10); \ + std::cout << "result_10 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_11); \ + std::cout << "result_11 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_12); \ + std::cout << "result_12 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_20); \ + std::cout << "result_20 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_21); \ + std::cout << "result_21 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_22); \ + std::cout << "result_22 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_30); \ + std::cout << "result_30 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_31); \ + std::cout << "result_31 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_32); \ + std::cout << "result_32 -- " << debugreg << std::endl; #define XM_RECON\ result_00 = UChi_00;\ @@ -253,7 +487,32 @@ Author: paboyle 
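The projectors and reconstructors being instrumented in these hunks implement the standard Wilson half-spinor trick: each direction first compresses the four-spinor into two spin components (the *_PROJ macros), multiplies by the gauge link, and then rebuilds the lower two components from the upper ones using factors of plus or minus 1 and i (the *_RECON macros). For the X-plus leg the arithmetic reduces to the scalar sketch below; std::complex<double> stands in for the Simd type, the gauge link is taken as the identity, and a single colour component is shown, so this is an illustration rather than Grid code.

    #include <complex>
    using C = std::complex<double>;

    // One colour component of the X-plus stencil leg with a unit gauge link,
    // so the projected half spinor feeds straight into the reconstruction.
    void xp_leg(const C psi[4], C r[4]) {
      const C I(0.0, 1.0);
      C chi0 = psi[0] + I*psi[3];   // XP_PROJ: chi_0 = psi_0 + i psi_3
      C chi1 = psi[1] + I*psi[2];   //          chi_1 = psi_1 + i psi_2
      r[0] = chi0;                  // XP_RECON: upper components pass through
      r[1] = chi1;
      r[2] = -I*chi1;               // lower components are timesMinusI of the
      r[3] = -I*chi0;               // upper ones, recovered for free
    }

Only the two chi components travel through the stencil and the gauge multiply, which is why the communication buffers hold SiteHalfSpinor values rather than full spinors.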
result_22 = timesI(UChi_12);\ result_30 = timesI(UChi_00);\ result_31 = timesI(UChi_01);\ - result_32 = timesI(UChi_02); + result_32 = timesI(UChi_02);\ + std::cout << std::endl << "DEBUG -- XM_RECON" << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_00); \ + std::cout << "result_00 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_01); \ + std::cout << "result_01 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_02); \ + std::cout << "result_02 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_10); \ + std::cout << "result_10 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_11); \ + std::cout << "result_11 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_12); \ + std::cout << "result_12 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_20); \ + std::cout << "result_20 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_21); \ + std::cout << "result_21 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_22); \ + std::cout << "result_22 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_30); \ + std::cout << "result_30 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_31); \ + std::cout << "result_31 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_32); \ + std::cout << "result_32 -- " << debugreg << std::endl; #define XM_RECON_ACCUM\ result_00+= UChi_00;\ @@ -267,7 +526,32 @@ Author: paboyle result_22+= timesI(UChi_12);\ result_30+= timesI(UChi_00);\ result_31+= timesI(UChi_01);\ - result_32+= timesI(UChi_02); + result_32+= timesI(UChi_02);\ + std::cout << std::endl << "DEBUG -- XM_RECON_ACCUM" << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_00); \ + std::cout << "result_00 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_01); \ + std::cout << "result_01 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_02); \ + std::cout << "result_02 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_10); \ + std::cout << "result_10 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_11); \ + std::cout << "result_11 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_12); \ + std::cout << "result_12 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_20); \ + std::cout << "result_20 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_21); \ + std::cout << "result_21 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_22); \ + std::cout << "result_22 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_30); \ + std::cout << "result_30 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_31); \ + std::cout << "result_31 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_32); \ + std::cout << "result_32 -- " << debugreg << std::endl; #define YP_RECON_ACCUM\ result_00+= UChi_00;\ @@ -281,7 +565,32 @@ Author: paboyle result_22+= UChi_12;\ result_30-= UChi_00;\ result_31-= UChi_01;\ - result_32-= UChi_02; + result_32-= UChi_02;\ + std::cout << std::endl << "DEBUG -- YP_RECON_ACCUM" << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_00); \ + std::cout << "result_00 -- " << debugreg << 
std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_01); \ + std::cout << "result_01 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_02); \ + std::cout << "result_02 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_10); \ + std::cout << "result_10 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_11); \ + std::cout << "result_11 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_12); \ + std::cout << "result_12 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_20); \ + std::cout << "result_20 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_21); \ + std::cout << "result_21 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_22); \ + std::cout << "result_22 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_30); \ + std::cout << "result_30 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_31); \ + std::cout << "result_31 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_32); \ + std::cout << "result_32 -- " << debugreg << std::endl; #define YM_RECON_ACCUM\ result_00+= UChi_00;\ @@ -295,7 +604,32 @@ Author: paboyle result_22-= UChi_12;\ result_30+= UChi_00;\ result_31+= UChi_01;\ - result_32+= UChi_02; + result_32+= UChi_02;\ + std::cout << std::endl << "DEBUG -- YM_RECON_ACCUM" << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_00); \ + std::cout << "result_00 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_01); \ + std::cout << "result_01 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_02); \ + std::cout << "result_02 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_10); \ + std::cout << "result_10 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_11); \ + std::cout << "result_11 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_12); \ + std::cout << "result_12 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_20); \ + std::cout << "result_20 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_21); \ + std::cout << "result_21 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_22); \ + std::cout << "result_22 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_30); \ + std::cout << "result_30 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_31); \ + std::cout << "result_31 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_32); \ + std::cout << "result_32 -- " << debugreg << std::endl; #define ZP_RECON_ACCUM\ result_00+= UChi_00;\ @@ -309,7 +643,32 @@ Author: paboyle result_22-= timesI(UChi_02); \ result_30+= timesI(UChi_10); \ result_31+= timesI(UChi_11); \ - result_32+= timesI(UChi_12); + result_32+= timesI(UChi_12);\ + std::cout << std::endl << "DEBUG -- ZP_RECON_ACCUM" << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_00); \ + std::cout << "result_00 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_01); \ + std::cout << "result_01 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_02); \ + std::cout << "result_02 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_10); \ + std::cout << "result_10 -- " 
<< debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_11); \ + std::cout << "result_11 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_12); \ + std::cout << "result_12 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_20); \ + std::cout << "result_20 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_21); \ + std::cout << "result_21 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_22); \ + std::cout << "result_22 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_30); \ + std::cout << "result_30 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_31); \ + std::cout << "result_31 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_32); \ + std::cout << "result_32 -- " << debugreg << std::endl; #define ZM_RECON_ACCUM\ result_00+= UChi_00;\ @@ -323,7 +682,32 @@ Author: paboyle result_22+= timesI(UChi_02); \ result_30-= timesI(UChi_10); \ result_31-= timesI(UChi_11); \ - result_32-= timesI(UChi_12); + result_32-= timesI(UChi_12);\ + std::cout << std::endl << "DEBUG -- ZM_RECON_ACCUM" << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_00); \ + std::cout << "result_00 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_01); \ + std::cout << "result_01 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_02); \ + std::cout << "result_02 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_10); \ + std::cout << "result_10 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_11); \ + std::cout << "result_11 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_12); \ + std::cout << "result_12 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_20); \ + std::cout << "result_20 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_21); \ + std::cout << "result_21 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_22); \ + std::cout << "result_22 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_30); \ + std::cout << "result_30 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_31); \ + std::cout << "result_31 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_32); \ + std::cout << "result_32 -- " << debugreg << std::endl; #define TP_RECON_ACCUM\ result_00+= UChi_00;\ @@ -337,7 +721,32 @@ Author: paboyle result_22+= UChi_02; \ result_30+= UChi_10; \ result_31+= UChi_11; \ - result_32+= UChi_12; + result_32+= UChi_12;\ + std::cout << std::endl << "DEBUG -- TP_RECON_ACCUM" << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_00); \ + std::cout << "result_00 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_01); \ + std::cout << "result_01 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_02); \ + std::cout << "result_02 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_10); \ + std::cout << "result_10 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_11); \ + std::cout << "result_11 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_12); \ + std::cout << "result_12 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_20); \ + std::cout << 
"result_20 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_21); \ + std::cout << "result_21 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_22); \ + std::cout << "result_22 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_30); \ + std::cout << "result_30 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_31); \ + std::cout << "result_31 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_32); \ + std::cout << "result_32 -- " << debugreg << std::endl; #define TM_RECON_ACCUM\ result_00+= UChi_00;\ @@ -351,7 +760,32 @@ Author: paboyle result_22-= UChi_02; \ result_30-= UChi_10; \ result_31-= UChi_11; \ - result_32-= UChi_12; + result_32-= UChi_12;\ + std::cout << std::endl << "DEBUG -- TM_RECON_ACCUM" << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_00); \ + std::cout << "result_00 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_01); \ + std::cout << "result_01 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_02); \ + std::cout << "result_02 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_10); \ + std::cout << "result_10 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_11); \ + std::cout << "result_11 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_12); \ + std::cout << "result_12 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_20); \ + std::cout << "result_20 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_21); \ + std::cout << "result_21 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_22); \ + std::cout << "result_22 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_30); \ + std::cout << "result_30 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_31); \ + std::cout << "result_31 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_32); \ + std::cout << "result_32 -- " << debugreg << std::endl; #define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \ SE=st.GetEntry(ptype,DIR,ss); \ @@ -368,7 +802,7 @@ Author: paboyle LOAD_CHI; \ } \ MULT_2SPIN(DIR); \ - RECON; + RECON; #define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \ SE=st.GetEntry(ptype,DIR,ss); \ @@ -431,6 +865,31 @@ Author: paboyle ref()(3)(0)+=result_30; \ ref()(3)(1)+=result_31; \ ref()(3)(2)+=result_32; \ + std::cout << std::endl << "DEBUG -- RESULT" << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_00); \ + std::cout << "result_00 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_01); \ + std::cout << "result_01 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_02); \ + std::cout << "result_02 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_10); \ + std::cout << "result_10 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_11); \ + std::cout << "result_11 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_12); \ + std::cout << "result_12 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_20); \ + std::cout << "result_20 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_21); \ + std::cout << "result_21 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, 
result_22); \ + std::cout << "result_22 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_30); \ + std::cout << "result_30 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_31); \ + std::cout << "result_31 -- " << debugreg << std::endl; \ + svst1(pg1, (float64_t*)&debugreg.v, result_32); \ + std::cout << "result_32 -- " << debugreg << std::endl; \ } @@ -464,7 +923,8 @@ Author: paboyle Simd U_20; \ Simd U_01; \ Simd U_11; \ - Simd U_21; + Simd U_21;\ + Simd debugreg; #define ZERO_RESULT \ result_00=Zero(); \ @@ -478,7 +938,7 @@ Author: paboyle result_22=Zero(); \ result_30=Zero(); \ result_31=Zero(); \ - result_32=Zero(); + result_32=Zero(); #define Chimu_00 Chi_00 #define Chimu_01 Chi_01 @@ -495,7 +955,7 @@ Author: paboyle NAMESPACE_BEGIN(Grid); -template void +template void WilsonKernels::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { @@ -530,7 +990,7 @@ void WilsonKernels::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView StencilEntry *SE; int offset,local,perm, ptype; - + HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON); HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM); HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); @@ -542,7 +1002,7 @@ void WilsonKernels::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView HAND_RESULT(ss); } -template void +template void WilsonKernels::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { @@ -589,7 +1049,7 @@ void WilsonKernels::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi HAND_RESULT(ss); } -template void +template void WilsonKernels::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { @@ -641,29 +1101,29 @@ void WilsonKernels::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldVi ////////////// Wilson ; uses this implementation ///////////////////// NAMESPACE_END(Grid); -#undef LOAD_CHIMU -#undef LOAD_CHI +#undef LOAD_CHIMU +#undef LOAD_CHI #undef MULT_2SPIN #undef PERMUTE_DIR -#undef XP_PROJ -#undef YP_PROJ -#undef ZP_PROJ -#undef TP_PROJ -#undef XM_PROJ -#undef YM_PROJ -#undef ZM_PROJ -#undef TM_PROJ -#undef XP_RECON -#undef XP_RECON_ACCUM -#undef XM_RECON -#undef XM_RECON_ACCUM -#undef YP_RECON_ACCUM -#undef YM_RECON_ACCUM -#undef ZP_RECON_ACCUM -#undef ZM_RECON_ACCUM -#undef TP_RECON_ACCUM -#undef TM_RECON_ACCUM -#undef ZERO_RESULT +#undef XP_PROJ +#undef YP_PROJ +#undef ZP_PROJ +#undef TP_PROJ +#undef XM_PROJ +#undef YM_PROJ +#undef ZM_PROJ +#undef TM_PROJ +#undef XP_RECON +#undef XP_RECON_ACCUM +#undef XM_RECON +#undef XM_RECON_ACCUM +#undef YP_RECON_ACCUM +#undef YM_RECON_ACCUM +#undef ZP_RECON_ACCUM +#undef ZM_RECON_ACCUM +#undef TP_RECON_ACCUM +#undef TM_RECON_ACCUM +#undef ZERO_RESULT #undef Chimu_00 #undef Chimu_01 #undef Chimu_02 diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h.orig b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h.orig new file mode 100644 index 00000000..f7b018fa --- /dev/null +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h.orig @@ -0,0 +1,684 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/qcd/action/fermion/WilsonKernelsHand.cc + + Copyright (C) 2015 + +Author: Peter Boyle 
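Every debug hunk in this patch follows the same recipe: the SVE register of interest is spilled to an ordinary buffer with a predicated svst1 and then streamed to stdout through the Simd ostream operator. Detached from the macro plumbing, the idiom amounts to the sketch below; it assumes arm_sve.h is available, a 512-bit vector length as on A64FX, and dump_f64 is a hypothetical helper name, not a Grid function.

    #include <arm_sve.h>
    #include <cstdint>
    #include <iostream>

    // Spill one SVE double-precision register to memory and print its lanes.
    void dump_f64(svfloat64_t v) {
      alignas(64) float64_t scratch[8];        // 8 doubles = one 512-bit register
      svbool_t pg1 = svptrue_b64();            // predicate with all f64 lanes set
      svst1(pg1, scratch, v);                  // predicated store to the buffer
      for (uint64_t i = 0; i < svcntd(); i++)  // svcntd() = f64 lanes per vector
        std::cout << scratch[i] << " ";
      std::cout << std::endl;
    }

Because the predicate enables every lane, the buffer receives the full register contents, so the printout reflects exactly what the kernel computed at that point.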
+Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ + +#pragma once + +#include + + +#undef LOAD_CHIMU +#undef LOAD_CHI +#undef MULT_2SPIN +#undef PERMUTE_DIR +#undef XP_PROJ +#undef YP_PROJ +#undef ZP_PROJ +#undef TP_PROJ +#undef XM_PROJ +#undef YM_PROJ +#undef ZM_PROJ +#undef TM_PROJ +#undef XP_RECON +#undef XP_RECON_ACCUM +#undef XM_RECON +#undef XM_RECON_ACCUM +#undef YP_RECON_ACCUM +#undef YM_RECON_ACCUM +#undef ZP_RECON_ACCUM +#undef ZM_RECON_ACCUM +#undef TP_RECON_ACCUM +#undef TM_RECON_ACCUM +#undef ZERO_RESULT +#undef Chimu_00 +#undef Chimu_01 +#undef Chimu_02 +#undef Chimu_10 +#undef Chimu_11 +#undef Chimu_12 +#undef Chimu_20 +#undef Chimu_21 +#undef Chimu_22 +#undef Chimu_30 +#undef Chimu_31 +#undef Chimu_32 +#undef HAND_STENCIL_LEG +#undef HAND_STENCIL_LEG_INT +#undef HAND_STENCIL_LEG_EXT +#undef HAND_RESULT +#undef HAND_RESULT_INT +#undef HAND_RESULT_EXT + +#define REGISTER + +#define LOAD_CHIMU \ + {const SiteSpinor & ref (in[offset]); \ + Chimu_00=ref()(0)(0);\ + Chimu_01=ref()(0)(1);\ + Chimu_02=ref()(0)(2);\ + Chimu_10=ref()(1)(0);\ + Chimu_11=ref()(1)(1);\ + Chimu_12=ref()(1)(2);\ + Chimu_20=ref()(2)(0);\ + Chimu_21=ref()(2)(1);\ + Chimu_22=ref()(2)(2);\ + Chimu_30=ref()(3)(0);\ + Chimu_31=ref()(3)(1);\ + Chimu_32=ref()(3)(2);} + +#define LOAD_CHI\ + {const SiteHalfSpinor &ref(buf[offset]); \ + Chi_00 = ref()(0)(0);\ + Chi_01 = ref()(0)(1);\ + Chi_02 = ref()(0)(2);\ + Chi_10 = ref()(1)(0);\ + Chi_11 = ref()(1)(1);\ + Chi_12 = ref()(1)(2);} + +// To splat or not to splat depends on the implementation +#define MULT_2SPIN(A)\ + {auto & ref(U[sU](A)); \ + Impl::loadLinkElement(U_00,ref()(0,0)); \ + Impl::loadLinkElement(U_10,ref()(1,0)); \ + Impl::loadLinkElement(U_20,ref()(2,0)); \ + Impl::loadLinkElement(U_01,ref()(0,1)); \ + Impl::loadLinkElement(U_11,ref()(1,1)); \ + Impl::loadLinkElement(U_21,ref()(2,1)); \ + UChi_00 = U_00*Chi_00;\ + UChi_10 = U_00*Chi_10;\ + UChi_01 = U_10*Chi_00;\ + UChi_11 = U_10*Chi_10;\ + UChi_02 = U_20*Chi_00;\ + UChi_12 = U_20*Chi_10;\ + UChi_00+= U_01*Chi_01;\ + UChi_10+= U_01*Chi_11;\ + UChi_01+= U_11*Chi_01;\ + UChi_11+= U_11*Chi_11;\ + UChi_02+= U_21*Chi_01;\ + UChi_12+= U_21*Chi_11;\ + Impl::loadLinkElement(U_00,ref()(0,2)); \ + Impl::loadLinkElement(U_10,ref()(1,2)); \ + Impl::loadLinkElement(U_20,ref()(2,2)); \ + UChi_00+= U_00*Chi_02;\ + UChi_10+= U_00*Chi_12;\ + UChi_01+= U_10*Chi_02;\ + UChi_11+= U_10*Chi_12;\ + UChi_02+= U_20*Chi_02;\ + UChi_12+= U_20*Chi_12;} + + +#define PERMUTE_DIR(dir) \ + permute##dir(Chi_00,Chi_00);\ + permute##dir(Chi_01,Chi_01);\ + permute##dir(Chi_02,Chi_02);\ + permute##dir(Chi_10,Chi_10);\ + permute##dir(Chi_11,Chi_11);\ + permute##dir(Chi_12,Chi_12); + +// 
hspin(0)=fspin(0)+timesI(fspin(3)); +// hspin(1)=fspin(1)+timesI(fspin(2)); +#define XP_PROJ \ + Chi_00 = Chimu_00+timesI(Chimu_30);\ + Chi_01 = Chimu_01+timesI(Chimu_31);\ + Chi_02 = Chimu_02+timesI(Chimu_32);\ + Chi_10 = Chimu_10+timesI(Chimu_20);\ + Chi_11 = Chimu_11+timesI(Chimu_21);\ + Chi_12 = Chimu_12+timesI(Chimu_22); + +#define YP_PROJ \ + Chi_00 = Chimu_00-Chimu_30;\ + Chi_01 = Chimu_01-Chimu_31;\ + Chi_02 = Chimu_02-Chimu_32;\ + Chi_10 = Chimu_10+Chimu_20;\ + Chi_11 = Chimu_11+Chimu_21;\ + Chi_12 = Chimu_12+Chimu_22; + +#define ZP_PROJ \ + Chi_00 = Chimu_00+timesI(Chimu_20); \ + Chi_01 = Chimu_01+timesI(Chimu_21); \ + Chi_02 = Chimu_02+timesI(Chimu_22); \ + Chi_10 = Chimu_10-timesI(Chimu_30); \ + Chi_11 = Chimu_11-timesI(Chimu_31); \ + Chi_12 = Chimu_12-timesI(Chimu_32); + +#define TP_PROJ \ + Chi_00 = Chimu_00+Chimu_20; \ + Chi_01 = Chimu_01+Chimu_21; \ + Chi_02 = Chimu_02+Chimu_22; \ + Chi_10 = Chimu_10+Chimu_30; \ + Chi_11 = Chimu_11+Chimu_31; \ + Chi_12 = Chimu_12+Chimu_32; + + +// hspin(0)=fspin(0)-timesI(fspin(3)); +// hspin(1)=fspin(1)-timesI(fspin(2)); +#define XM_PROJ \ + Chi_00 = Chimu_00-timesI(Chimu_30);\ + Chi_01 = Chimu_01-timesI(Chimu_31);\ + Chi_02 = Chimu_02-timesI(Chimu_32);\ + Chi_10 = Chimu_10-timesI(Chimu_20);\ + Chi_11 = Chimu_11-timesI(Chimu_21);\ + Chi_12 = Chimu_12-timesI(Chimu_22); + +#define YM_PROJ \ + Chi_00 = Chimu_00+Chimu_30;\ + Chi_01 = Chimu_01+Chimu_31;\ + Chi_02 = Chimu_02+Chimu_32;\ + Chi_10 = Chimu_10-Chimu_20;\ + Chi_11 = Chimu_11-Chimu_21;\ + Chi_12 = Chimu_12-Chimu_22; + +#define ZM_PROJ \ + Chi_00 = Chimu_00-timesI(Chimu_20); \ + Chi_01 = Chimu_01-timesI(Chimu_21); \ + Chi_02 = Chimu_02-timesI(Chimu_22); \ + Chi_10 = Chimu_10+timesI(Chimu_30); \ + Chi_11 = Chimu_11+timesI(Chimu_31); \ + Chi_12 = Chimu_12+timesI(Chimu_32); + +#define TM_PROJ \ + Chi_00 = Chimu_00-Chimu_20; \ + Chi_01 = Chimu_01-Chimu_21; \ + Chi_02 = Chimu_02-Chimu_22; \ + Chi_10 = Chimu_10-Chimu_30; \ + Chi_11 = Chimu_11-Chimu_31; \ + Chi_12 = Chimu_12-Chimu_32; + +// fspin(0)=hspin(0); +// fspin(1)=hspin(1); +// fspin(2)=timesMinusI(hspin(1)); +// fspin(3)=timesMinusI(hspin(0)); +#define XP_RECON\ + result_00 = UChi_00;\ + result_01 = UChi_01;\ + result_02 = UChi_02;\ + result_10 = UChi_10;\ + result_11 = UChi_11;\ + result_12 = UChi_12;\ + result_20 = timesMinusI(UChi_10);\ + result_21 = timesMinusI(UChi_11);\ + result_22 = timesMinusI(UChi_12);\ + result_30 = timesMinusI(UChi_00);\ + result_31 = timesMinusI(UChi_01);\ + result_32 = timesMinusI(UChi_02); + +#define XP_RECON_ACCUM\ + result_00+=UChi_00;\ + result_01+=UChi_01;\ + result_02+=UChi_02;\ + result_10+=UChi_10;\ + result_11+=UChi_11;\ + result_12+=UChi_12;\ + result_20-=timesI(UChi_10);\ + result_21-=timesI(UChi_11);\ + result_22-=timesI(UChi_12);\ + result_30-=timesI(UChi_00);\ + result_31-=timesI(UChi_01);\ + result_32-=timesI(UChi_02); + +#define XM_RECON\ + result_00 = UChi_00;\ + result_01 = UChi_01;\ + result_02 = UChi_02;\ + result_10 = UChi_10;\ + result_11 = UChi_11;\ + result_12 = UChi_12;\ + result_20 = timesI(UChi_10);\ + result_21 = timesI(UChi_11);\ + result_22 = timesI(UChi_12);\ + result_30 = timesI(UChi_00);\ + result_31 = timesI(UChi_01);\ + result_32 = timesI(UChi_02); + +#define XM_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20+= timesI(UChi_10);\ + result_21+= timesI(UChi_11);\ + result_22+= timesI(UChi_12);\ + result_30+= timesI(UChi_00);\ + result_31+= 
timesI(UChi_01);\ + result_32+= timesI(UChi_02); + +#define YP_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20+= UChi_10;\ + result_21+= UChi_11;\ + result_22+= UChi_12;\ + result_30-= UChi_00;\ + result_31-= UChi_01;\ + result_32-= UChi_02; + +#define YM_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20-= UChi_10;\ + result_21-= UChi_11;\ + result_22-= UChi_12;\ + result_30+= UChi_00;\ + result_31+= UChi_01;\ + result_32+= UChi_02; + +#define ZP_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20-= timesI(UChi_00); \ + result_21-= timesI(UChi_01); \ + result_22-= timesI(UChi_02); \ + result_30+= timesI(UChi_10); \ + result_31+= timesI(UChi_11); \ + result_32+= timesI(UChi_12); + +#define ZM_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20+= timesI(UChi_00); \ + result_21+= timesI(UChi_01); \ + result_22+= timesI(UChi_02); \ + result_30-= timesI(UChi_10); \ + result_31-= timesI(UChi_11); \ + result_32-= timesI(UChi_12); + +#define TP_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20+= UChi_00; \ + result_21+= UChi_01; \ + result_22+= UChi_02; \ + result_30+= UChi_10; \ + result_31+= UChi_11; \ + result_32+= UChi_12; + +#define TM_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20-= UChi_00; \ + result_21-= UChi_01; \ + result_22-= UChi_02; \ + result_30-= UChi_10; \ + result_31-= UChi_11; \ + result_32-= UChi_12; + +#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \ + SE=st.GetEntry(ptype,DIR,ss); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ( local ) { \ + LOAD_CHIMU; \ + PROJ; \ + if ( perm) { \ + PERMUTE_DIR(PERM); \ + } \ + } else { \ + LOAD_CHI; \ + } \ + MULT_2SPIN(DIR); \ + RECON; + +#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \ + SE=st.GetEntry(ptype,DIR,ss); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ( local ) { \ + LOAD_CHIMU; \ + PROJ; \ + if ( perm) { \ + PERMUTE_DIR(PERM); \ + } \ + } else if ( st.same_node[DIR] ) { \ + LOAD_CHI; \ + } \ + if (local || st.same_node[DIR] ) { \ + MULT_2SPIN(DIR); \ + RECON; \ + } + +#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \ + SE=st.GetEntry(ptype,DIR,ss); \ + offset = SE->_offset; \ + if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \ + LOAD_CHI; \ + MULT_2SPIN(DIR); \ + RECON; \ + nmu++; \ + } + +#define HAND_RESULT(ss) \ + { \ + SiteSpinor & ref (out[ss]); \ + vstream(ref()(0)(0),result_00); \ + vstream(ref()(0)(1),result_01); \ + vstream(ref()(0)(2),result_02); \ + vstream(ref()(1)(0),result_10); \ + vstream(ref()(1)(1),result_11); \ + vstream(ref()(1)(2),result_12); \ + vstream(ref()(2)(0),result_20); \ + vstream(ref()(2)(1),result_21); \ + vstream(ref()(2)(2),result_22); \ + vstream(ref()(3)(0),result_30); \ + vstream(ref()(3)(1),result_31); \ + vstream(ref()(3)(2),result_32); \ + } + +#define HAND_RESULT_EXT(ss) \ + if (nmu){ \ + SiteSpinor & ref 
(out[ss]); \ + ref()(0)(0)+=result_00; \ + ref()(0)(1)+=result_01; \ + ref()(0)(2)+=result_02; \ + ref()(1)(0)+=result_10; \ + ref()(1)(1)+=result_11; \ + ref()(1)(2)+=result_12; \ + ref()(2)(0)+=result_20; \ + ref()(2)(1)+=result_21; \ + ref()(2)(2)+=result_22; \ + ref()(3)(0)+=result_30; \ + ref()(3)(1)+=result_31; \ + ref()(3)(2)+=result_32; \ + } + + +#define HAND_DECLARATIONS(a) \ + Simd result_00; \ + Simd result_01; \ + Simd result_02; \ + Simd result_10; \ + Simd result_11; \ + Simd result_12; \ + Simd result_20; \ + Simd result_21; \ + Simd result_22; \ + Simd result_30; \ + Simd result_31; \ + Simd result_32; \ + Simd Chi_00; \ + Simd Chi_01; \ + Simd Chi_02; \ + Simd Chi_10; \ + Simd Chi_11; \ + Simd Chi_12; \ + Simd UChi_00; \ + Simd UChi_01; \ + Simd UChi_02; \ + Simd UChi_10; \ + Simd UChi_11; \ + Simd UChi_12; \ + Simd U_00; \ + Simd U_10; \ + Simd U_20; \ + Simd U_01; \ + Simd U_11; \ + Simd U_21; + +#define ZERO_RESULT \ + result_00=Zero(); \ + result_01=Zero(); \ + result_02=Zero(); \ + result_10=Zero(); \ + result_11=Zero(); \ + result_12=Zero(); \ + result_20=Zero(); \ + result_21=Zero(); \ + result_22=Zero(); \ + result_30=Zero(); \ + result_31=Zero(); \ + result_32=Zero(); + +#define Chimu_00 Chi_00 +#define Chimu_01 Chi_01 +#define Chimu_02 Chi_02 +#define Chimu_10 Chi_10 +#define Chimu_11 Chi_11 +#define Chimu_12 Chi_12 +#define Chimu_20 UChi_00 +#define Chimu_21 UChi_01 +#define Chimu_22 UChi_02 +#define Chimu_30 UChi_10 +#define Chimu_31 UChi_11 +#define Chimu_32 UChi_12 + +NAMESPACE_BEGIN(Grid); + +template void +WilsonKernels::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionFieldView &in, FermionFieldView &out) +{ +// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + int offset,local,perm, ptype; + StencilEntry *SE; + + HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON); + HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM); + HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); + HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM); + HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM); + HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM); + HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); + HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM); + HAND_RESULT(ss); +} + +template +void WilsonKernels::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionFieldView &in, FermionFieldView &out) +{ + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + StencilEntry *SE; + int offset,local,perm, ptype; + + HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON); + HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM); + HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); + HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM); + HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM); + HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM); + HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); + HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM); + HAND_RESULT(ss); +} + +template void +WilsonKernels::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionFieldView &in, FermionFieldView &out) +{ +// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... 
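The three stencil-leg macros in this snapshot differ only in when a leg is executed: HAND_STENCIL_LEG always runs it, HAND_STENCIL_LEG_INT runs it only when the neighbour is local or on the same node, and HAND_STENCIL_LEG_EXT runs exactly the remaining off-node legs, counting them in nmu so that HAND_RESULT_EXT can skip sites with no exterior contribution. The dispatch condenses to the sketch below, where Entry is a hypothetical stand-in for Grid's StencilEntry and the two flags mirror SE->_is_local and st.same_node[DIR].

    struct Entry { bool is_local; bool same_node; };
    enum class Pass { All, Interior, Exterior };

    // Decide whether a stencil leg contributes during the given pass;
    // the exterior pass also counts the legs it takes in nmu.
    bool leg_active(Pass p, const Entry& e, int& nmu) {
      switch (p) {
        case Pass::All:      return true;                       // HAND_STENCIL_LEG
        case Pass::Interior: return e.is_local || e.same_node;  // ..._LEG_INT
        case Pass::Exterior:                                    // ..._LEG_EXT
          if (!e.is_local && !e.same_node) { nmu++; return true; }
          return false;
      }
      return false;
    }

Splitting the sweep this way lets the interior legs be processed while halo exchange is still in flight, with the exterior pass sweeping up once the communicated half spinors have arrived.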
+ typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + int offset,local,perm, ptype; + StencilEntry *SE; + ZERO_RESULT; + HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM); + HAND_RESULT(ss); +} + +template +void WilsonKernels::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionFieldView &in, FermionFieldView &out) +{ + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + StencilEntry *SE; + int offset,local,perm, ptype; + ZERO_RESULT; + HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM); + HAND_RESULT(ss); +} + +template void +WilsonKernels::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionFieldView &in, FermionFieldView &out) +{ +// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + int offset, ptype; + StencilEntry *SE; + int nmu=0; + ZERO_RESULT; + HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM); + HAND_RESULT_EXT(ss); +} + +template +void WilsonKernels::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionFieldView &in, FermionFieldView &out) +{ + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + StencilEntry *SE; + int offset, ptype; + int nmu=0; + ZERO_RESULT; + HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM); + HAND_RESULT_EXT(ss); +} + +////////////// Wilson ; uses this implementation ///////////////////// + +NAMESPACE_END(Grid); +#undef LOAD_CHIMU +#undef LOAD_CHI +#undef MULT_2SPIN +#undef PERMUTE_DIR +#undef XP_PROJ +#undef YP_PROJ +#undef ZP_PROJ +#undef TP_PROJ +#undef XM_PROJ +#undef YM_PROJ +#undef ZM_PROJ +#undef TM_PROJ +#undef XP_RECON +#undef XP_RECON_ACCUM +#undef XM_RECON +#undef XM_RECON_ACCUM +#undef YP_RECON_ACCUM +#undef YM_RECON_ACCUM 
+#undef ZP_RECON_ACCUM +#undef ZM_RECON_ACCUM +#undef TP_RECON_ACCUM +#undef TM_RECON_ACCUM +#undef ZERO_RESULT +#undef Chimu_00 +#undef Chimu_01 +#undef Chimu_02 +#undef Chimu_10 +#undef Chimu_11 +#undef Chimu_12 +#undef Chimu_20 +#undef Chimu_21 +#undef Chimu_22 +#undef Chimu_30 +#undef Chimu_31 +#undef Chimu_32 +#undef HAND_STENCIL_LEG +#undef HAND_STENCIL_LEG_INT +#undef HAND_STENCIL_LEG_EXT +#undef HAND_RESULT +#undef HAND_RESULT_INT +#undef HAND_RESULT_EXT From 14d0fe4d6cd7c5209a2701fe1adf61c30fc09477 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 10 Apr 2020 12:04:00 +0200 Subject: [PATCH 032/147] added predication in WilsonHand --- .../fermion/implementation/WilsonKernelsHandImplementation.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h index 19ed3e06..44d4bb46 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h @@ -924,7 +924,9 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ Simd U_01; \ Simd U_11; \ Simd U_21;\ - Simd debugreg; + Simd debugreg;\ + svbool_t pg1; \ + pg1 = svptrue_b64(); \ #define ZERO_RESULT \ result_00=Zero(); \ From a28bc0de90678d2144262928aca37df6df23b05d Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 10 Apr 2020 12:07:45 +0200 Subject: [PATCH 033/147] debug register address test in WilsonHand --- .../WilsonKernelsHandImplementation.h | 432 +++++++++--------- 1 file changed, 216 insertions(+), 216 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h index 44d4bb46..cdf1ade5 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h @@ -91,29 +91,29 @@ Author: paboyle Chimu_31=ref()(3)(1);\ Chimu_32=ref()(3)(2);\ std::cout << std::endl << "DEBUG -- LOAD_CHIMU" << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chimu_00); \ + svst1(pg1, &debugreg.v, Chimu_00); \ std::cout << "Chimu_00 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chimu_01); \ + svst1(pg1, &debugreg.v, Chimu_01); \ std::cout << "Chimu_01 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chimu_02); \ + svst1(pg1, &debugreg.v, Chimu_02); \ std::cout << "Chimu_02 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chimu_10); \ + svst1(pg1, &debugreg.v, Chimu_10); \ std::cout << "Chimu_10 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chimu_11); \ + svst1(pg1, &debugreg.v, Chimu_11); \ std::cout << "Chimu_11 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chimu_12); \ + svst1(pg1, &debugreg.v, Chimu_12); \ std::cout << "Chimu_12 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chimu_20); \ + svst1(pg1, &debugreg.v, Chimu_20); \ std::cout << "Chimu_20 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chimu_21); \ + svst1(pg1, &debugreg.v, Chimu_21); \ std::cout << "Chimu_21 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chimu_22); \ + svst1(pg1, &debugreg.v, Chimu_22); \ std::cout << "Chimu_22 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chimu_30); \ + svst1(pg1, &debugreg.v, 
Chimu_30); \ std::cout << "Chimu_30 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chimu_31); \ + svst1(pg1, &debugreg.v, Chimu_31); \ std::cout << "Chimu_31 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chimu_32); \ + svst1(pg1, &debugreg.v, Chimu_32); \ std::cout << "Chimu_32 -- " << debugreg << std::endl; \ } @@ -126,17 +126,17 @@ Author: paboyle Chi_11 = ref()(1)(1);\ Chi_12 = ref()(1)(2);\ std::cout << std::endl << "DEBUG -- LOAD_CHI" << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_00); \ + svst1(pg1, &debugreg.v, Chi_00); \ std::cout << "Chi_00 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_01); \ + svst1(pg1, &debugreg.v, Chi_01); \ std::cout << "Chi_01 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_02); \ + svst1(pg1, &debugreg.v, Chi_02); \ std::cout << "Chi_02 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_10); \ + svst1(pg1, &debugreg.v, Chi_10); \ std::cout << "Chi_10 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_11); \ + svst1(pg1, &debugreg.v, Chi_11); \ std::cout << "Chi_11 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_12); \ + svst1(pg1, &debugreg.v, Chi_12); \ std::cout << "Chi_12 -- " << debugreg << std::endl; \ } @@ -171,34 +171,34 @@ Author: paboyle UChi_02+= U_20*Chi_02;\ UChi_12+= U_20*Chi_12;\ std::cout << std::endl << "DEBUG -- MULT_2SPIN" << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, UChi_00); \ + svst1(pg1, &debugreg.v, UChi_00); \ std::cout << "UChi_00 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, UChi_01); \ + svst1(pg1, &debugreg.v, UChi_01); \ std::cout << "UChi_01 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, UChi_02); \ + svst1(pg1, &debugreg.v, UChi_02); \ std::cout << "UChi_02 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, UChi_10); \ + svst1(pg1, &debugreg.v, UChi_10); \ std::cout << "UChi_10 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, UChi_11); \ + svst1(pg1, &debugreg.v, UChi_11); \ std::cout << "UChi_11 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, UChi_12); \ + svst1(pg1, &debugreg.v, UChi_12); \ std::cout << "UChi_12 -- " << debugreg << std::endl; \ } #define PERMUTE_DIR(dir) \ std::cout << std::endl << "DEBUG -- PERM PRE" << std::endl; \ -svst1(pg1, (float64_t*)&debugreg.v, Chi_00); \ +svst1(pg1, &debugreg.v, Chi_00); \ std::cout << "Chi_00 -- " << debugreg << std::endl; \ -svst1(pg1, (float64_t*)&debugreg.v, Chi_01); \ +svst1(pg1, &debugreg.v, Chi_01); \ std::cout << "Chi_01 -- " << debugreg << std::endl; \ -svst1(pg1, (float64_t*)&debugreg.v, Chi_02); \ +svst1(pg1, &debugreg.v, Chi_02); \ std::cout << "Chi_02 -- " << debugreg << std::endl; \ -svst1(pg1, (float64_t*)&debugreg.v, Chi_10); \ +svst1(pg1, &debugreg.v, Chi_10); \ std::cout << "Chi_10 -- " << debugreg << std::endl; \ -svst1(pg1, (float64_t*)&debugreg.v, Chi_11); \ +svst1(pg1, &debugreg.v, Chi_11); \ std::cout << "Chi_11 -- " << debugreg << std::endl; \ -svst1(pg1, (float64_t*)&debugreg.v, Chi_12); \ +svst1(pg1, &debugreg.v, Chi_12); \ std::cout << "Chi_12 -- " << debugreg << std::endl; \ permute##dir(Chi_00,Chi_00);\ permute##dir(Chi_01,Chi_01);\ @@ -207,17 +207,17 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ permute##dir(Chi_11,Chi_11);\ permute##dir(Chi_12,Chi_12);\ std::cout << std::endl << "DEBUG -- PERM POST" << std::endl; \ - svst1(pg1, 
(float64_t*)&debugreg.v, Chi_00); \ + svst1(pg1, &debugreg.v, Chi_00); \ std::cout << "Chi_00 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_01); \ + svst1(pg1, &debugreg.v, Chi_01); \ std::cout << "Chi_01 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_02); \ + svst1(pg1, &debugreg.v, Chi_02); \ std::cout << "Chi_02 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_10); \ + svst1(pg1, &debugreg.v, Chi_10); \ std::cout << "Chi_10 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_11); \ + svst1(pg1, &debugreg.v, Chi_11); \ std::cout << "Chi_11 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_12); \ + svst1(pg1, &debugreg.v, Chi_12); \ std::cout << "Chi_12 -- " << debugreg << std::endl; // hspin(0)=fspin(0)+timesI(fspin(3)); @@ -230,17 +230,17 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ Chi_11 = Chimu_11+timesI(Chimu_21);\ Chi_12 = Chimu_12+timesI(Chimu_22);\ std::cout << std::endl << "DEBUG -- XP_PROJ" << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_00); \ + svst1(pg1, &debugreg.v, Chi_00); \ std::cout << "Chi_00 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_01); \ + svst1(pg1, &debugreg.v, Chi_01); \ std::cout << "Chi_01 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_02); \ + svst1(pg1, &debugreg.v, Chi_02); \ std::cout << "Chi_02 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_10); \ + svst1(pg1, &debugreg.v, Chi_10); \ std::cout << "Chi_10 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_11); \ + svst1(pg1, &debugreg.v, Chi_11); \ std::cout << "Chi_11 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_12); \ + svst1(pg1, &debugreg.v, Chi_12); \ std::cout << "Chi_12 -- " << debugreg << std::endl; #define YP_PROJ \ @@ -251,17 +251,17 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ Chi_11 = Chimu_11+Chimu_21;\ Chi_12 = Chimu_12+Chimu_22;\ std::cout << std::endl << "DEBUG -- YP_PROJ" << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_00); \ + svst1(pg1, &debugreg.v, Chi_00); \ std::cout << "Chi_00 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_01); \ + svst1(pg1, &debugreg.v, Chi_01); \ std::cout << "Chi_01 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_02); \ + svst1(pg1, &debugreg.v, Chi_02); \ std::cout << "Chi_02 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_10); \ + svst1(pg1, &debugreg.v, Chi_10); \ std::cout << "Chi_10 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_11); \ + svst1(pg1, &debugreg.v, Chi_11); \ std::cout << "Chi_11 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_12); \ + svst1(pg1, &debugreg.v, Chi_12); \ std::cout << "Chi_12 -- " << debugreg << std::endl; #define ZP_PROJ \ @@ -272,17 +272,17 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ Chi_11 = Chimu_11-timesI(Chimu_31); \ Chi_12 = Chimu_12-timesI(Chimu_32);\ std::cout << std::endl << "DEBUG -- ZP_PROJ" << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_00); \ + svst1(pg1, &debugreg.v, Chi_00); \ std::cout << "Chi_00 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_01); \ + svst1(pg1, &debugreg.v, Chi_01); \ std::cout << "Chi_01 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_02); \ + svst1(pg1, &debugreg.v, 
Chi_02); \ std::cout << "Chi_02 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_10); \ + svst1(pg1, &debugreg.v, Chi_10); \ std::cout << "Chi_10 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_11); \ + svst1(pg1, &debugreg.v, Chi_11); \ std::cout << "Chi_11 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_12); \ + svst1(pg1, &debugreg.v, Chi_12); \ std::cout << "Chi_12 -- " << debugreg << std::endl; #define TP_PROJ \ @@ -293,17 +293,17 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ Chi_11 = Chimu_11+Chimu_31; \ Chi_12 = Chimu_12+Chimu_32;\ std::cout << std::endl << "DEBUG -- TP_PROJ" << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_00); \ + svst1(pg1, &debugreg.v, Chi_00); \ std::cout << "Chi_00 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_01); \ + svst1(pg1, &debugreg.v, Chi_01); \ std::cout << "Chi_01 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_02); \ + svst1(pg1, &debugreg.v, Chi_02); \ std::cout << "Chi_02 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_10); \ + svst1(pg1, &debugreg.v, Chi_10); \ std::cout << "Chi_10 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_11); \ + svst1(pg1, &debugreg.v, Chi_11); \ std::cout << "Chi_11 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_12); \ + svst1(pg1, &debugreg.v, Chi_12); \ std::cout << "Chi_12 -- " << debugreg << std::endl; @@ -317,17 +317,17 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ Chi_11 = Chimu_11-timesI(Chimu_21);\ Chi_12 = Chimu_12-timesI(Chimu_22);\ std::cout << std::endl << "DEBUG -- XM_PROJ" << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_00); \ + svst1(pg1, &debugreg.v, Chi_00); \ std::cout << "Chi_00 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_01); \ + svst1(pg1, &debugreg.v, Chi_01); \ std::cout << "Chi_01 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_02); \ + svst1(pg1, &debugreg.v, Chi_02); \ std::cout << "Chi_02 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_10); \ + svst1(pg1, &debugreg.v, Chi_10); \ std::cout << "Chi_10 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_11); \ + svst1(pg1, &debugreg.v, Chi_11); \ std::cout << "Chi_11 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_12); \ + svst1(pg1, &debugreg.v, Chi_12); \ std::cout << "Chi_12 -- " << debugreg << std::endl; #define YM_PROJ \ @@ -338,17 +338,17 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ Chi_11 = Chimu_11-Chimu_21;\ Chi_12 = Chimu_12-Chimu_22;\ std::cout << std::endl << "DEBUG -- YM_PROJ" << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_00); \ + svst1(pg1, &debugreg.v, Chi_00); \ std::cout << "Chi_00 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_01); \ + svst1(pg1, &debugreg.v, Chi_01); \ std::cout << "Chi_01 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_02); \ + svst1(pg1, &debugreg.v, Chi_02); \ std::cout << "Chi_02 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_10); \ + svst1(pg1, &debugreg.v, Chi_10); \ std::cout << "Chi_10 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_11); \ + svst1(pg1, &debugreg.v, Chi_11); \ std::cout << "Chi_11 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_12); \ + 
svst1(pg1, &debugreg.v, Chi_12); \ std::cout << "Chi_12 -- " << debugreg << std::endl; #define ZM_PROJ \ @@ -359,17 +359,17 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ Chi_11 = Chimu_11+timesI(Chimu_31); \ Chi_12 = Chimu_12+timesI(Chimu_32);\ std::cout << std::endl << "DEBUG -- ZM_PROJ" << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_00); \ + svst1(pg1, &debugreg.v, Chi_00); \ std::cout << "Chi_00 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_01); \ + svst1(pg1, &debugreg.v, Chi_01); \ std::cout << "Chi_01 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_02); \ + svst1(pg1, &debugreg.v, Chi_02); \ std::cout << "Chi_02 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_10); \ + svst1(pg1, &debugreg.v, Chi_10); \ std::cout << "Chi_10 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_11); \ + svst1(pg1, &debugreg.v, Chi_11); \ std::cout << "Chi_11 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_12); \ + svst1(pg1, &debugreg.v, Chi_12); \ std::cout << "Chi_12 -- " << debugreg << std::endl; #define TM_PROJ \ @@ -380,17 +380,17 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ Chi_11 = Chimu_11-Chimu_31; \ Chi_12 = Chimu_12-Chimu_32;\ std::cout << std::endl << "DEBUG -- TM_PROJ" << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_00); \ + svst1(pg1, &debugreg.v, Chi_00); \ std::cout << "Chi_00 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_01); \ + svst1(pg1, &debugreg.v, Chi_01); \ std::cout << "Chi_01 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_02); \ + svst1(pg1, &debugreg.v, Chi_02); \ std::cout << "Chi_02 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_10); \ + svst1(pg1, &debugreg.v, Chi_10); \ std::cout << "Chi_10 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_11); \ + svst1(pg1, &debugreg.v, Chi_11); \ std::cout << "Chi_11 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, Chi_12); \ + svst1(pg1, &debugreg.v, Chi_12); \ std::cout << "Chi_12 -- " << debugreg << std::endl; // fspin(0)=hspin(0); @@ -411,29 +411,29 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ result_31 = timesMinusI(UChi_01);\ result_32 = timesMinusI(UChi_02);\ std::cout << std::endl << "DEBUG -- XP_RECON" << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_00); \ + svst1(pg1, &debugreg.v, result_00); \ std::cout << "result_00 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_01); \ + svst1(pg1, &debugreg.v, result_01); \ std::cout << "result_01 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_02); \ + svst1(pg1, &debugreg.v, result_02); \ std::cout << "result_02 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_10); \ + svst1(pg1, &debugreg.v, result_10); \ std::cout << "result_10 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_11); \ + svst1(pg1, &debugreg.v, result_11); \ std::cout << "result_11 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_12); \ + svst1(pg1, &debugreg.v, result_12); \ std::cout << "result_12 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_20); \ + svst1(pg1, &debugreg.v, result_20); \ std::cout << "result_20 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_21); \ + svst1(pg1, 
&debugreg.v, result_21); \ std::cout << "result_21 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_22); \ + svst1(pg1, &debugreg.v, result_22); \ std::cout << "result_22 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_30); \ + svst1(pg1, &debugreg.v, result_30); \ std::cout << "result_30 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_31); \ + svst1(pg1, &debugreg.v, result_31); \ std::cout << "result_31 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_32); \ + svst1(pg1, &debugreg.v, result_32); \ std::cout << "result_32 -- " << debugreg << std::endl; #define XP_RECON_ACCUM\ @@ -450,29 +450,29 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ result_31-=timesI(UChi_01);\ result_32-=timesI(UChi_02);\ std::cout << std::endl << "DEBUG -- XP_RECON_ACCUM" << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_00); \ + svst1(pg1, &debugreg.v, result_00); \ std::cout << "result_00 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_01); \ + svst1(pg1, &debugreg.v, result_01); \ std::cout << "result_01 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_02); \ + svst1(pg1, &debugreg.v, result_02); \ std::cout << "result_02 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_10); \ + svst1(pg1, &debugreg.v, result_10); \ std::cout << "result_10 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_11); \ + svst1(pg1, &debugreg.v, result_11); \ std::cout << "result_11 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_12); \ + svst1(pg1, &debugreg.v, result_12); \ std::cout << "result_12 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_20); \ + svst1(pg1, &debugreg.v, result_20); \ std::cout << "result_20 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_21); \ + svst1(pg1, &debugreg.v, result_21); \ std::cout << "result_21 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_22); \ + svst1(pg1, &debugreg.v, result_22); \ std::cout << "result_22 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_30); \ + svst1(pg1, &debugreg.v, result_30); \ std::cout << "result_30 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_31); \ + svst1(pg1, &debugreg.v, result_31); \ std::cout << "result_31 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_32); \ + svst1(pg1, &debugreg.v, result_32); \ std::cout << "result_32 -- " << debugreg << std::endl; #define XM_RECON\ @@ -489,29 +489,29 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ result_31 = timesI(UChi_01);\ result_32 = timesI(UChi_02);\ std::cout << std::endl << "DEBUG -- XM_RECON" << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_00); \ + svst1(pg1, &debugreg.v, result_00); \ std::cout << "result_00 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_01); \ + svst1(pg1, &debugreg.v, result_01); \ std::cout << "result_01 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_02); \ + svst1(pg1, &debugreg.v, result_02); \ std::cout << "result_02 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_10); \ + svst1(pg1, &debugreg.v, result_10); \ std::cout << "result_10 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_11); \ 
+ svst1(pg1, &debugreg.v, result_11); \ std::cout << "result_11 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_12); \ + svst1(pg1, &debugreg.v, result_12); \ std::cout << "result_12 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_20); \ + svst1(pg1, &debugreg.v, result_20); \ std::cout << "result_20 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_21); \ + svst1(pg1, &debugreg.v, result_21); \ std::cout << "result_21 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_22); \ + svst1(pg1, &debugreg.v, result_22); \ std::cout << "result_22 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_30); \ + svst1(pg1, &debugreg.v, result_30); \ std::cout << "result_30 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_31); \ + svst1(pg1, &debugreg.v, result_31); \ std::cout << "result_31 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_32); \ + svst1(pg1, &debugreg.v, result_32); \ std::cout << "result_32 -- " << debugreg << std::endl; #define XM_RECON_ACCUM\ @@ -528,29 +528,29 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ result_31+= timesI(UChi_01);\ result_32+= timesI(UChi_02);\ std::cout << std::endl << "DEBUG -- XM_RECON_ACCUM" << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_00); \ + svst1(pg1, &debugreg.v, result_00); \ std::cout << "result_00 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_01); \ + svst1(pg1, &debugreg.v, result_01); \ std::cout << "result_01 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_02); \ + svst1(pg1, &debugreg.v, result_02); \ std::cout << "result_02 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_10); \ + svst1(pg1, &debugreg.v, result_10); \ std::cout << "result_10 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_11); \ + svst1(pg1, &debugreg.v, result_11); \ std::cout << "result_11 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_12); \ + svst1(pg1, &debugreg.v, result_12); \ std::cout << "result_12 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_20); \ + svst1(pg1, &debugreg.v, result_20); \ std::cout << "result_20 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_21); \ + svst1(pg1, &debugreg.v, result_21); \ std::cout << "result_21 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_22); \ + svst1(pg1, &debugreg.v, result_22); \ std::cout << "result_22 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_30); \ + svst1(pg1, &debugreg.v, result_30); \ std::cout << "result_30 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_31); \ + svst1(pg1, &debugreg.v, result_31); \ std::cout << "result_31 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_32); \ + svst1(pg1, &debugreg.v, result_32); \ std::cout << "result_32 -- " << debugreg << std::endl; #define YP_RECON_ACCUM\ @@ -567,29 +567,29 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ result_31-= UChi_01;\ result_32-= UChi_02;\ std::cout << std::endl << "DEBUG -- YP_RECON_ACCUM" << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_00); \ + svst1(pg1, &debugreg.v, result_00); \ std::cout << "result_00 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, 
result_01); \ + svst1(pg1, &debugreg.v, result_01); \ std::cout << "result_01 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_02); \ + svst1(pg1, &debugreg.v, result_02); \ std::cout << "result_02 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_10); \ + svst1(pg1, &debugreg.v, result_10); \ std::cout << "result_10 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_11); \ + svst1(pg1, &debugreg.v, result_11); \ std::cout << "result_11 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_12); \ + svst1(pg1, &debugreg.v, result_12); \ std::cout << "result_12 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_20); \ + svst1(pg1, &debugreg.v, result_20); \ std::cout << "result_20 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_21); \ + svst1(pg1, &debugreg.v, result_21); \ std::cout << "result_21 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_22); \ + svst1(pg1, &debugreg.v, result_22); \ std::cout << "result_22 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_30); \ + svst1(pg1, &debugreg.v, result_30); \ std::cout << "result_30 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_31); \ + svst1(pg1, &debugreg.v, result_31); \ std::cout << "result_31 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_32); \ + svst1(pg1, &debugreg.v, result_32); \ std::cout << "result_32 -- " << debugreg << std::endl; #define YM_RECON_ACCUM\ @@ -606,29 +606,29 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ result_31+= UChi_01;\ result_32+= UChi_02;\ std::cout << std::endl << "DEBUG -- YM_RECON_ACCUM" << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_00); \ + svst1(pg1, &debugreg.v, result_00); \ std::cout << "result_00 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_01); \ + svst1(pg1, &debugreg.v, result_01); \ std::cout << "result_01 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_02); \ + svst1(pg1, &debugreg.v, result_02); \ std::cout << "result_02 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_10); \ + svst1(pg1, &debugreg.v, result_10); \ std::cout << "result_10 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_11); \ + svst1(pg1, &debugreg.v, result_11); \ std::cout << "result_11 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_12); \ + svst1(pg1, &debugreg.v, result_12); \ std::cout << "result_12 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_20); \ + svst1(pg1, &debugreg.v, result_20); \ std::cout << "result_20 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_21); \ + svst1(pg1, &debugreg.v, result_21); \ std::cout << "result_21 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_22); \ + svst1(pg1, &debugreg.v, result_22); \ std::cout << "result_22 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_30); \ + svst1(pg1, &debugreg.v, result_30); \ std::cout << "result_30 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_31); \ + svst1(pg1, &debugreg.v, result_31); \ std::cout << "result_31 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_32); \ + svst1(pg1, &debugreg.v, result_32); \ std::cout << 
"result_32 -- " << debugreg << std::endl; #define ZP_RECON_ACCUM\ @@ -645,29 +645,29 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ result_31+= timesI(UChi_11); \ result_32+= timesI(UChi_12);\ std::cout << std::endl << "DEBUG -- ZP_RECON_ACCUM" << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_00); \ + svst1(pg1, &debugreg.v, result_00); \ std::cout << "result_00 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_01); \ + svst1(pg1, &debugreg.v, result_01); \ std::cout << "result_01 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_02); \ + svst1(pg1, &debugreg.v, result_02); \ std::cout << "result_02 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_10); \ + svst1(pg1, &debugreg.v, result_10); \ std::cout << "result_10 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_11); \ + svst1(pg1, &debugreg.v, result_11); \ std::cout << "result_11 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_12); \ + svst1(pg1, &debugreg.v, result_12); \ std::cout << "result_12 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_20); \ + svst1(pg1, &debugreg.v, result_20); \ std::cout << "result_20 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_21); \ + svst1(pg1, &debugreg.v, result_21); \ std::cout << "result_21 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_22); \ + svst1(pg1, &debugreg.v, result_22); \ std::cout << "result_22 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_30); \ + svst1(pg1, &debugreg.v, result_30); \ std::cout << "result_30 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_31); \ + svst1(pg1, &debugreg.v, result_31); \ std::cout << "result_31 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_32); \ + svst1(pg1, &debugreg.v, result_32); \ std::cout << "result_32 -- " << debugreg << std::endl; #define ZM_RECON_ACCUM\ @@ -684,29 +684,29 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ result_31-= timesI(UChi_11); \ result_32-= timesI(UChi_12);\ std::cout << std::endl << "DEBUG -- ZM_RECON_ACCUM" << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_00); \ + svst1(pg1, &debugreg.v, result_00); \ std::cout << "result_00 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_01); \ + svst1(pg1, &debugreg.v, result_01); \ std::cout << "result_01 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_02); \ + svst1(pg1, &debugreg.v, result_02); \ std::cout << "result_02 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_10); \ + svst1(pg1, &debugreg.v, result_10); \ std::cout << "result_10 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_11); \ + svst1(pg1, &debugreg.v, result_11); \ std::cout << "result_11 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_12); \ + svst1(pg1, &debugreg.v, result_12); \ std::cout << "result_12 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_20); \ + svst1(pg1, &debugreg.v, result_20); \ std::cout << "result_20 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_21); \ + svst1(pg1, &debugreg.v, result_21); \ std::cout << "result_21 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_22); \ + svst1(pg1, 
&debugreg.v, result_22); \ std::cout << "result_22 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_30); \ + svst1(pg1, &debugreg.v, result_30); \ std::cout << "result_30 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_31); \ + svst1(pg1, &debugreg.v, result_31); \ std::cout << "result_31 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_32); \ + svst1(pg1, &debugreg.v, result_32); \ std::cout << "result_32 -- " << debugreg << std::endl; #define TP_RECON_ACCUM\ @@ -723,29 +723,29 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ result_31+= UChi_11; \ result_32+= UChi_12;\ std::cout << std::endl << "DEBUG -- TP_RECON_ACCUM" << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_00); \ + svst1(pg1, &debugreg.v, result_00); \ std::cout << "result_00 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_01); \ + svst1(pg1, &debugreg.v, result_01); \ std::cout << "result_01 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_02); \ + svst1(pg1, &debugreg.v, result_02); \ std::cout << "result_02 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_10); \ + svst1(pg1, &debugreg.v, result_10); \ std::cout << "result_10 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_11); \ + svst1(pg1, &debugreg.v, result_11); \ std::cout << "result_11 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_12); \ + svst1(pg1, &debugreg.v, result_12); \ std::cout << "result_12 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_20); \ + svst1(pg1, &debugreg.v, result_20); \ std::cout << "result_20 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_21); \ + svst1(pg1, &debugreg.v, result_21); \ std::cout << "result_21 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_22); \ + svst1(pg1, &debugreg.v, result_22); \ std::cout << "result_22 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_30); \ + svst1(pg1, &debugreg.v, result_30); \ std::cout << "result_30 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_31); \ + svst1(pg1, &debugreg.v, result_31); \ std::cout << "result_31 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_32); \ + svst1(pg1, &debugreg.v, result_32); \ std::cout << "result_32 -- " << debugreg << std::endl; #define TM_RECON_ACCUM\ @@ -762,29 +762,29 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ result_31-= UChi_11; \ result_32-= UChi_12;\ std::cout << std::endl << "DEBUG -- TM_RECON_ACCUM" << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_00); \ + svst1(pg1, &debugreg.v, result_00); \ std::cout << "result_00 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_01); \ + svst1(pg1, &debugreg.v, result_01); \ std::cout << "result_01 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_02); \ + svst1(pg1, &debugreg.v, result_02); \ std::cout << "result_02 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_10); \ + svst1(pg1, &debugreg.v, result_10); \ std::cout << "result_10 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_11); \ + svst1(pg1, &debugreg.v, result_11); \ std::cout << "result_11 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_12); \ + svst1(pg1, 
&debugreg.v, result_12); \ std::cout << "result_12 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_20); \ + svst1(pg1, &debugreg.v, result_20); \ std::cout << "result_20 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_21); \ + svst1(pg1, &debugreg.v, result_21); \ std::cout << "result_21 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_22); \ + svst1(pg1, &debugreg.v, result_22); \ std::cout << "result_22 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_30); \ + svst1(pg1, &debugreg.v, result_30); \ std::cout << "result_30 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_31); \ + svst1(pg1, &debugreg.v, result_31); \ std::cout << "result_31 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_32); \ + svst1(pg1, &debugreg.v, result_32); \ std::cout << "result_32 -- " << debugreg << std::endl; #define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \ @@ -866,29 +866,29 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ ref()(3)(1)+=result_31; \ ref()(3)(2)+=result_32; \ std::cout << std::endl << "DEBUG -- RESULT" << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_00); \ + svst1(pg1, &debugreg.v, result_00); \ std::cout << "result_00 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_01); \ + svst1(pg1, &debugreg.v, result_01); \ std::cout << "result_01 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_02); \ + svst1(pg1, &debugreg.v, result_02); \ std::cout << "result_02 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_10); \ + svst1(pg1, &debugreg.v, result_10); \ std::cout << "result_10 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_11); \ + svst1(pg1, &debugreg.v, result_11); \ std::cout << "result_11 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_12); \ + svst1(pg1, &debugreg.v, result_12); \ std::cout << "result_12 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_20); \ + svst1(pg1, &debugreg.v, result_20); \ std::cout << "result_20 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_21); \ + svst1(pg1, &debugreg.v, result_21); \ std::cout << "result_21 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_22); \ + svst1(pg1, &debugreg.v, result_22); \ std::cout << "result_22 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_30); \ + svst1(pg1, &debugreg.v, result_30); \ std::cout << "result_30 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_31); \ + svst1(pg1, &debugreg.v, result_31); \ std::cout << "result_31 -- " << debugreg << std::endl; \ - svst1(pg1, (float64_t*)&debugreg.v, result_32); \ + svst1(pg1, &debugreg.v, result_32); \ std::cout << "result_32 -- " << debugreg << std::endl; \ } From e699b7e9f9c2643f156fd1cba1b6a01131f7107d Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 10 Apr 2020 12:18:30 +0200 Subject: [PATCH 034/147] changed debug output to variable direct --- .../WilsonKernelsHandImplementation.h | 612 ++++++------------ 1 file changed, 204 insertions(+), 408 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h index cdf1ade5..6eec803a 100644 --- 
a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h @@ -91,30 +91,18 @@ Author: paboyle Chimu_31=ref()(3)(1);\ Chimu_32=ref()(3)(2);\ std::cout << std::endl << "DEBUG -- LOAD_CHIMU" << std::endl; \ - svst1(pg1, &debugreg.v, Chimu_00); \ - std::cout << "Chimu_00 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chimu_01); \ - std::cout << "Chimu_01 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chimu_02); \ - std::cout << "Chimu_02 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chimu_10); \ - std::cout << "Chimu_10 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chimu_11); \ - std::cout << "Chimu_11 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chimu_12); \ - std::cout << "Chimu_12 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chimu_20); \ - std::cout << "Chimu_20 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chimu_21); \ - std::cout << "Chimu_21 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chimu_22); \ - std::cout << "Chimu_22 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chimu_30); \ - std::cout << "Chimu_30 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chimu_31); \ - std::cout << "Chimu_31 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chimu_32); \ - std::cout << "Chimu_32 -- " << debugreg << std::endl; \ + std::cout << "Chimu_00 -- " << Chimu_00 << std::endl; \ + std::cout << "Chimu_01 -- " << Chimu_01 << std::endl; \ + std::cout << "Chimu_02 -- " << Chimu_02 << std::endl; \ + std::cout << "Chimu_10 -- " << Chimu_10 << std::endl; \ + std::cout << "Chimu_11 -- " << Chimu_11 << std::endl; \ + std::cout << "Chimu_12 -- " << Chimu_12 << std::endl; \ + std::cout << "Chimu_20 -- " << Chimu_20 << std::endl; \ + std::cout << "Chimu_21 -- " << Chimu_21 << std::endl; \ + std::cout << "Chimu_22 -- " << Chimu_22 << std::endl; \ + std::cout << "Chimu_30 -- " << Chimu_30 << std::endl; \ + std::cout << "Chimu_31 -- " << Chimu_31 << std::endl; \ + std::cout << "Chimu_32 -- " << Chimu_32 << std::endl; \ } #define LOAD_CHI\ @@ -126,18 +114,12 @@ Author: paboyle Chi_11 = ref()(1)(1);\ Chi_12 = ref()(1)(2);\ std::cout << std::endl << "DEBUG -- LOAD_CHI" << std::endl; \ - svst1(pg1, &debugreg.v, Chi_00); \ - std::cout << "Chi_00 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_01); \ - std::cout << "Chi_01 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_02); \ - std::cout << "Chi_02 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_10); \ - std::cout << "Chi_10 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_11); \ - std::cout << "Chi_11 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_12); \ - std::cout << "Chi_12 -- " << debugreg << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ } // To splat or not to splat depends on the implementation @@ -171,35 +153,23 @@ Author: paboyle UChi_02+= U_20*Chi_02;\ UChi_12+= U_20*Chi_12;\ std::cout << std::endl << "DEBUG -- MULT_2SPIN" << std::endl; \ - svst1(pg1, &debugreg.v, UChi_00); \ - std::cout << "UChi_00 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, UChi_01); \ - 
std::cout << "UChi_01 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, UChi_02); \ - std::cout << "UChi_02 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, UChi_10); \ - std::cout << "UChi_10 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, UChi_11); \ - std::cout << "UChi_11 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, UChi_12); \ - std::cout << "UChi_12 -- " << debugreg << std::endl; \ + std::cout << "UChi_00 -- " << UChi_00 << std::endl; \ + std::cout << "UChi_01 -- " << UChi_01 << std::endl; \ + std::cout << "UChi_02 -- " << UChi_02 << std::endl; \ + std::cout << "UChi_10 -- " << UChi_10 << std::endl; \ + std::cout << "UChi_11 -- " << UChi_11 << std::endl; \ + std::cout << "UChi_12 -- " << UChi_12 << std::endl; \ } #define PERMUTE_DIR(dir) \ std::cout << std::endl << "DEBUG -- PERM PRE" << std::endl; \ -svst1(pg1, &debugreg.v, Chi_00); \ -std::cout << "Chi_00 -- " << debugreg << std::endl; \ -svst1(pg1, &debugreg.v, Chi_01); \ -std::cout << "Chi_01 -- " << debugreg << std::endl; \ -svst1(pg1, &debugreg.v, Chi_02); \ -std::cout << "Chi_02 -- " << debugreg << std::endl; \ -svst1(pg1, &debugreg.v, Chi_10); \ -std::cout << "Chi_10 -- " << debugreg << std::endl; \ -svst1(pg1, &debugreg.v, Chi_11); \ -std::cout << "Chi_11 -- " << debugreg << std::endl; \ -svst1(pg1, &debugreg.v, Chi_12); \ -std::cout << "Chi_12 -- " << debugreg << std::endl; \ +std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ +std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ +std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ +std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ +std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ +std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ permute##dir(Chi_00,Chi_00);\ permute##dir(Chi_01,Chi_01);\ permute##dir(Chi_02,Chi_02);\ @@ -207,18 +177,12 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ permute##dir(Chi_11,Chi_11);\ permute##dir(Chi_12,Chi_12);\ std::cout << std::endl << "DEBUG -- PERM POST" << std::endl; \ - svst1(pg1, &debugreg.v, Chi_00); \ - std::cout << "Chi_00 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_01); \ - std::cout << "Chi_01 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_02); \ - std::cout << "Chi_02 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_10); \ - std::cout << "Chi_10 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_11); \ - std::cout << "Chi_11 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_12); \ - std::cout << "Chi_12 -- " << debugreg << std::endl; + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; // hspin(0)=fspin(0)+timesI(fspin(3)); // hspin(1)=fspin(1)+timesI(fspin(2)); @@ -230,18 +194,12 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ Chi_11 = Chimu_11+timesI(Chimu_21);\ Chi_12 = Chimu_12+timesI(Chimu_22);\ std::cout << std::endl << "DEBUG -- XP_PROJ" << std::endl; \ - svst1(pg1, &debugreg.v, Chi_00); \ - std::cout << "Chi_00 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_01); \ - std::cout << "Chi_01 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_02); \ - std::cout << "Chi_02 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_10); \ - std::cout << "Chi_10 -- " << debugreg << std::endl; \ 
- svst1(pg1, &debugreg.v, Chi_11); \ - std::cout << "Chi_11 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_12); \ - std::cout << "Chi_12 -- " << debugreg << std::endl; + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; #define YP_PROJ \ Chi_00 = Chimu_00-Chimu_30;\ @@ -251,18 +209,12 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ Chi_11 = Chimu_11+Chimu_21;\ Chi_12 = Chimu_12+Chimu_22;\ std::cout << std::endl << "DEBUG -- YP_PROJ" << std::endl; \ - svst1(pg1, &debugreg.v, Chi_00); \ - std::cout << "Chi_00 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_01); \ - std::cout << "Chi_01 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_02); \ - std::cout << "Chi_02 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_10); \ - std::cout << "Chi_10 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_11); \ - std::cout << "Chi_11 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_12); \ - std::cout << "Chi_12 -- " << debugreg << std::endl; + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; #define ZP_PROJ \ Chi_00 = Chimu_00+timesI(Chimu_20); \ @@ -272,18 +224,12 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ Chi_11 = Chimu_11-timesI(Chimu_31); \ Chi_12 = Chimu_12-timesI(Chimu_32);\ std::cout << std::endl << "DEBUG -- ZP_PROJ" << std::endl; \ - svst1(pg1, &debugreg.v, Chi_00); \ - std::cout << "Chi_00 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_01); \ - std::cout << "Chi_01 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_02); \ - std::cout << "Chi_02 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_10); \ - std::cout << "Chi_10 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_11); \ - std::cout << "Chi_11 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_12); \ - std::cout << "Chi_12 -- " << debugreg << std::endl; + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; #define TP_PROJ \ Chi_00 = Chimu_00+Chimu_20; \ @@ -293,18 +239,12 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ Chi_11 = Chimu_11+Chimu_31; \ Chi_12 = Chimu_12+Chimu_32;\ std::cout << std::endl << "DEBUG -- TP_PROJ" << std::endl; \ - svst1(pg1, &debugreg.v, Chi_00); \ - std::cout << "Chi_00 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_01); \ - std::cout << "Chi_01 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_02); \ - std::cout << "Chi_02 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_10); \ - std::cout << "Chi_10 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_11); \ - std::cout << "Chi_11 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_12); \ - std::cout << "Chi_12 -- " << 
debugreg << std::endl; + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; // hspin(0)=fspin(0)-timesI(fspin(3)); @@ -317,18 +257,12 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ Chi_11 = Chimu_11-timesI(Chimu_21);\ Chi_12 = Chimu_12-timesI(Chimu_22);\ std::cout << std::endl << "DEBUG -- XM_PROJ" << std::endl; \ - svst1(pg1, &debugreg.v, Chi_00); \ - std::cout << "Chi_00 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_01); \ - std::cout << "Chi_01 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_02); \ - std::cout << "Chi_02 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_10); \ - std::cout << "Chi_10 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_11); \ - std::cout << "Chi_11 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_12); \ - std::cout << "Chi_12 -- " << debugreg << std::endl; + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; #define YM_PROJ \ Chi_00 = Chimu_00+Chimu_30;\ @@ -338,18 +272,12 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ Chi_11 = Chimu_11-Chimu_21;\ Chi_12 = Chimu_12-Chimu_22;\ std::cout << std::endl << "DEBUG -- YM_PROJ" << std::endl; \ - svst1(pg1, &debugreg.v, Chi_00); \ - std::cout << "Chi_00 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_01); \ - std::cout << "Chi_01 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_02); \ - std::cout << "Chi_02 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_10); \ - std::cout << "Chi_10 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_11); \ - std::cout << "Chi_11 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_12); \ - std::cout << "Chi_12 -- " << debugreg << std::endl; + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; #define ZM_PROJ \ Chi_00 = Chimu_00-timesI(Chimu_20); \ @@ -359,18 +287,12 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ Chi_11 = Chimu_11+timesI(Chimu_31); \ Chi_12 = Chimu_12+timesI(Chimu_32);\ std::cout << std::endl << "DEBUG -- ZM_PROJ" << std::endl; \ - svst1(pg1, &debugreg.v, Chi_00); \ - std::cout << "Chi_00 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_01); \ - std::cout << "Chi_01 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_02); \ - std::cout << "Chi_02 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_10); \ - std::cout << "Chi_10 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_11); \ - std::cout << "Chi_11 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_12); \ - std::cout << "Chi_12 -- " << debugreg << std::endl; + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " 
<< Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; #define TM_PROJ \ Chi_00 = Chimu_00-Chimu_20; \ @@ -380,18 +302,12 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ Chi_11 = Chimu_11-Chimu_31; \ Chi_12 = Chimu_12-Chimu_32;\ std::cout << std::endl << "DEBUG -- TM_PROJ" << std::endl; \ - svst1(pg1, &debugreg.v, Chi_00); \ - std::cout << "Chi_00 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_01); \ - std::cout << "Chi_01 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_02); \ - std::cout << "Chi_02 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_10); \ - std::cout << "Chi_10 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_11); \ - std::cout << "Chi_11 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, Chi_12); \ - std::cout << "Chi_12 -- " << debugreg << std::endl; + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; // fspin(0)=hspin(0); // fspin(1)=hspin(1); @@ -411,30 +327,18 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ result_31 = timesMinusI(UChi_01);\ result_32 = timesMinusI(UChi_02);\ std::cout << std::endl << "DEBUG -- XP_RECON" << std::endl; \ - svst1(pg1, &debugreg.v, result_00); \ - std::cout << "result_00 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_01); \ - std::cout << "result_01 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_02); \ - std::cout << "result_02 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_10); \ - std::cout << "result_10 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_11); \ - std::cout << "result_11 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_12); \ - std::cout << "result_12 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_20); \ - std::cout << "result_20 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_21); \ - std::cout << "result_21 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_22); \ - std::cout << "result_22 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_30); \ - std::cout << "result_30 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_31); \ - std::cout << "result_31 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_32); \ - std::cout << "result_32 -- " << debugreg << std::endl; + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; #define XP_RECON_ACCUM\ result_00+=UChi_00;\ @@ -450,30 +354,18 @@ std::cout << "Chi_12 -- 
" << debugreg << std::endl; \ result_31-=timesI(UChi_01);\ result_32-=timesI(UChi_02);\ std::cout << std::endl << "DEBUG -- XP_RECON_ACCUM" << std::endl; \ - svst1(pg1, &debugreg.v, result_00); \ - std::cout << "result_00 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_01); \ - std::cout << "result_01 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_02); \ - std::cout << "result_02 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_10); \ - std::cout << "result_10 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_11); \ - std::cout << "result_11 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_12); \ - std::cout << "result_12 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_20); \ - std::cout << "result_20 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_21); \ - std::cout << "result_21 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_22); \ - std::cout << "result_22 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_30); \ - std::cout << "result_30 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_31); \ - std::cout << "result_31 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_32); \ - std::cout << "result_32 -- " << debugreg << std::endl; + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; #define XM_RECON\ result_00 = UChi_00;\ @@ -489,30 +381,18 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ result_31 = timesI(UChi_01);\ result_32 = timesI(UChi_02);\ std::cout << std::endl << "DEBUG -- XM_RECON" << std::endl; \ - svst1(pg1, &debugreg.v, result_00); \ - std::cout << "result_00 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_01); \ - std::cout << "result_01 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_02); \ - std::cout << "result_02 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_10); \ - std::cout << "result_10 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_11); \ - std::cout << "result_11 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_12); \ - std::cout << "result_12 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_20); \ - std::cout << "result_20 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_21); \ - std::cout << "result_21 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_22); \ - std::cout << "result_22 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_30); \ - std::cout << "result_30 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_31); \ - std::cout << "result_31 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_32); \ - std::cout << "result_32 -- " << debugreg << std::endl; + std::cout << "result_00 -- " << result_00 << std::endl; \ + 
std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; #define XM_RECON_ACCUM\ result_00+= UChi_00;\ @@ -528,30 +408,18 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ result_31+= timesI(UChi_01);\ result_32+= timesI(UChi_02);\ std::cout << std::endl << "DEBUG -- XM_RECON_ACCUM" << std::endl; \ - svst1(pg1, &debugreg.v, result_00); \ - std::cout << "result_00 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_01); \ - std::cout << "result_01 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_02); \ - std::cout << "result_02 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_10); \ - std::cout << "result_10 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_11); \ - std::cout << "result_11 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_12); \ - std::cout << "result_12 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_20); \ - std::cout << "result_20 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_21); \ - std::cout << "result_21 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_22); \ - std::cout << "result_22 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_30); \ - std::cout << "result_30 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_31); \ - std::cout << "result_31 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_32); \ - std::cout << "result_32 -- " << debugreg << std::endl; + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; #define YP_RECON_ACCUM\ result_00+= UChi_00;\ @@ -567,30 +435,18 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ result_31-= UChi_01;\ result_32-= UChi_02;\ std::cout << std::endl << "DEBUG -- YP_RECON_ACCUM" << std::endl; \ - svst1(pg1, &debugreg.v, result_00); \ - std::cout << "result_00 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_01); \ - std::cout << "result_01 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_02); \ - std::cout << "result_02 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_10); \ - std::cout << "result_10 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_11); \ - std::cout << "result_11 -- " << debugreg << std::endl; \ - svst1(pg1, 
&debugreg.v, result_12); \ - std::cout << "result_12 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_20); \ - std::cout << "result_20 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_21); \ - std::cout << "result_21 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_22); \ - std::cout << "result_22 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_30); \ - std::cout << "result_30 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_31); \ - std::cout << "result_31 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_32); \ - std::cout << "result_32 -- " << debugreg << std::endl; + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; #define YM_RECON_ACCUM\ result_00+= UChi_00;\ @@ -606,30 +462,18 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ result_31+= UChi_01;\ result_32+= UChi_02;\ std::cout << std::endl << "DEBUG -- YM_RECON_ACCUM" << std::endl; \ - svst1(pg1, &debugreg.v, result_00); \ - std::cout << "result_00 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_01); \ - std::cout << "result_01 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_02); \ - std::cout << "result_02 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_10); \ - std::cout << "result_10 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_11); \ - std::cout << "result_11 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_12); \ - std::cout << "result_12 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_20); \ - std::cout << "result_20 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_21); \ - std::cout << "result_21 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_22); \ - std::cout << "result_22 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_30); \ - std::cout << "result_30 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_31); \ - std::cout << "result_31 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_32); \ - std::cout << "result_32 -- " << debugreg << std::endl; + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; #define 
ZP_RECON_ACCUM\ result_00+= UChi_00;\ @@ -645,30 +489,18 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ result_31+= timesI(UChi_11); \ result_32+= timesI(UChi_12);\ std::cout << std::endl << "DEBUG -- ZP_RECON_ACCUM" << std::endl; \ - svst1(pg1, &debugreg.v, result_00); \ - std::cout << "result_00 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_01); \ - std::cout << "result_01 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_02); \ - std::cout << "result_02 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_10); \ - std::cout << "result_10 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_11); \ - std::cout << "result_11 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_12); \ - std::cout << "result_12 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_20); \ - std::cout << "result_20 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_21); \ - std::cout << "result_21 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_22); \ - std::cout << "result_22 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_30); \ - std::cout << "result_30 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_31); \ - std::cout << "result_31 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_32); \ - std::cout << "result_32 -- " << debugreg << std::endl; + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; #define ZM_RECON_ACCUM\ result_00+= UChi_00;\ @@ -684,30 +516,18 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ result_31-= timesI(UChi_11); \ result_32-= timesI(UChi_12);\ std::cout << std::endl << "DEBUG -- ZM_RECON_ACCUM" << std::endl; \ - svst1(pg1, &debugreg.v, result_00); \ - std::cout << "result_00 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_01); \ - std::cout << "result_01 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_02); \ - std::cout << "result_02 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_10); \ - std::cout << "result_10 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_11); \ - std::cout << "result_11 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_12); \ - std::cout << "result_12 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_20); \ - std::cout << "result_20 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_21); \ - std::cout << "result_21 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_22); \ - std::cout << "result_22 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_30); \ - std::cout << "result_30 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_31); \ - std::cout << "result_31 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_32); \ - std::cout << "result_32 
-- " << debugreg << std::endl; + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; #define TP_RECON_ACCUM\ result_00+= UChi_00;\ @@ -723,30 +543,18 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ result_31+= UChi_11; \ result_32+= UChi_12;\ std::cout << std::endl << "DEBUG -- TP_RECON_ACCUM" << std::endl; \ - svst1(pg1, &debugreg.v, result_00); \ - std::cout << "result_00 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_01); \ - std::cout << "result_01 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_02); \ - std::cout << "result_02 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_10); \ - std::cout << "result_10 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_11); \ - std::cout << "result_11 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_12); \ - std::cout << "result_12 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_20); \ - std::cout << "result_20 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_21); \ - std::cout << "result_21 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_22); \ - std::cout << "result_22 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_30); \ - std::cout << "result_30 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_31); \ - std::cout << "result_31 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_32); \ - std::cout << "result_32 -- " << debugreg << std::endl; + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; #define TM_RECON_ACCUM\ result_00+= UChi_00;\ @@ -762,30 +570,18 @@ std::cout << "Chi_12 -- " << debugreg << std::endl; \ result_31-= UChi_11; \ result_32-= UChi_12;\ std::cout << std::endl << "DEBUG -- TM_RECON_ACCUM" << std::endl; \ - svst1(pg1, &debugreg.v, result_00); \ - std::cout << "result_00 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_01); \ - std::cout << "result_01 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_02); \ - std::cout << "result_02 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_10); \ - std::cout << "result_10 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, 
result_11); \ - std::cout << "result_11 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_12); \ - std::cout << "result_12 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_20); \ - std::cout << "result_20 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_21); \ - std::cout << "result_21 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_22); \ - std::cout << "result_22 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_30); \ - std::cout << "result_30 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_31); \ - std::cout << "result_31 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_32); \ - std::cout << "result_32 -- " << debugreg << std::endl; + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; #define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \ SE=st.GetEntry(ptype,DIR,ss); \ From 7e4e1bbbc21a107e49da68cdd44f41ed4507c144 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 10 Apr 2020 12:22:04 +0200 Subject: [PATCH 035/147] changed debug output to variable direct 2 --- .../WilsonKernelsHandImplementation.h | 53 ++++++++++--------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h index 6eec803a..f951bc85 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h @@ -581,7 +581,7 @@ std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ std::cout << "result_22 -- " << result_22 << std::endl; \ std::cout << "result_30 -- " << result_30 << std::endl; \ std::cout << "result_31 -- " << result_31 << std::endl; \ - std::cout << "result_32 -- " << result_32 << std::endl; + std::cout << "result_32 -- " << result_32 << std::endl; #define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \ SE=st.GetEntry(ptype,DIR,ss); \ @@ -644,6 +644,19 @@ std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ vstream(ref()(3)(0),result_30); \ vstream(ref()(3)(1),result_31); \ vstream(ref()(3)(2),result_32); \ + std::cout << std::endl << "DEBUG -- RESULT" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << 
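
As a reading aid for the reconstruction macros above, a scalar model of what one SIMD lane computes in the XP accumulation step; xp_recon_accum is an illustrative name and std::complex stands in for a single complex lane:

#include <complex>
using cplx = std::complex<double>;

// One colour triplet per spin row, following the macro: the upper two
// spin rows accumulate UChi directly, the lower two accumulate minus i
// times the swapped upper rows (the -= timesI(...) lines).
inline void xp_recon_accum(cplx result[4][3], const cplx UChi[2][3]) {
  const cplx I(0.0, 1.0);
  for (int c = 0; c < 3; c++) {
    result[0][c] += UChi[0][c];
    result[1][c] += UChi[1][c];
    result[2][c] -= I * UChi[1][c];  // result_2c -= timesI(UChi_1c)
    result[3][c] -= I * UChi[0][c];  // result_3c -= timesI(UChi_0c)
  }
}
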
std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; } #define HAND_RESULT_EXT(ss) \ @@ -661,31 +674,19 @@ std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ ref()(3)(0)+=result_30; \ ref()(3)(1)+=result_31; \ ref()(3)(2)+=result_32; \ - std::cout << std::endl << "DEBUG -- RESULT" << std::endl; \ - svst1(pg1, &debugreg.v, result_00); \ - std::cout << "result_00 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_01); \ - std::cout << "result_01 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_02); \ - std::cout << "result_02 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_10); \ - std::cout << "result_10 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_11); \ - std::cout << "result_11 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_12); \ - std::cout << "result_12 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_20); \ - std::cout << "result_20 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_21); \ - std::cout << "result_21 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_22); \ - std::cout << "result_22 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_30); \ - std::cout << "result_30 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_31); \ - std::cout << "result_31 -- " << debugreg << std::endl; \ - svst1(pg1, &debugreg.v, result_32); \ - std::cout << "result_32 -- " << debugreg << std::endl; \ + std::cout << std::endl << "DEBUG -- RESULT EXT" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; } From 160f78c1e4b56e4052f358c9bc6a517be9943df0 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 10 Apr 2020 12:23:07 +0200 Subject: [PATCH 036/147] changed debug output to variable direct 3 --- .../fermion/implementation/WilsonKernelsHandImplementation.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h index f951bc85..6f3edbb5 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h @@ -656,7 +656,7 @@ std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ std::cout << "result_22 -- " << result_22 << std::endl; \ std::cout << "result_30 -- " << result_30 << std::endl; \ std::cout << "result_31 -- " << result_31 << std::endl; \ - std::cout << "result_32 -- " << result_32 << std::endl; + std::cout << "result_32 -- " << result_32 << std::endl;\ } #define HAND_RESULT_EXT(ss) \ @@ -686,7 +686,7 @@ std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ std::cout << "result_22 -- " << result_22 << std::endl; \ std::cout << "result_30 -- " << result_30 << std::endl; \ std::cout << 
"result_31 -- " << result_31 << std::endl; \ - std::cout << "result_32 -- " << result_32 << std::endl; + std::cout << "result_32 -- " << result_32 << std::endl;\ } From 974586bedce1c97c87cc9ed6788056f8869e2134 Mon Sep 17 00:00:00 2001 From: nils meyer Date: Fri, 10 Apr 2020 22:26:40 +0200 Subject: [PATCH 037/147] Dslash finally works; cleaned up; uses MOVPRFX in assembly --- .../WilsonKernelsImplementation.h | 6 +- Grid/simd/Fujitsu_A64FX_asm_double.h | 176 +++++++++------ Grid/simd/Fujitsu_A64FX_asm_single.h | 201 +++++++++++------- Grid/simd/Fujitsu_A64FX_intrin_double.h | 105 ++++----- Grid/simd/Fujitsu_A64FX_intrin_single.h | 108 +++++----- 5 files changed, 344 insertions(+), 252 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index a787fa79..57306ff5 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -372,19 +372,19 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} #ifndef GRID_NVCC if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;} - if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); printf("."); return;} + if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); /* printf("."); */ return;} #endif } else if( interior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;} #ifndef GRID_NVCC if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;} - if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); printf("-"); return;} + if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); /* printf("-"); */ return;} #endif } else if( exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;} #ifndef GRID_NVCC if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;} - if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); printf("+"); return;} + if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); /* printf("+"); */ return;} #endif } assert(0 && " Kernel optimisation case not covered "); diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h index 9269ec2a..e2d731e3 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_double.h +++ b/Grid/simd/Fujitsu_A64FX_asm_double.h @@ -2,7 +2,7 @@ Grid physics library, www.github.com/paboyle/Grid - Source file: XXX + Source file: Fujitsu_A64FX_asm_double.h Copyright (C) 2020 @@ -40,9 +40,9 @@ Author: Nils Meyer #define LOAD64(A,B) #define SAVE_RESULT(A,B) RESULT_A64FXd(A) #define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXd(A) -#define MAYBEPERM(A,perm) if (perm) { A ; } +#define MAYBEPERM(A,perm) { A ; } #define LOAD_CHI(base) LOAD_CHI_A64FXd(base) -#define ZERO_PSI ZERO_PSI_A64FXd +#define ZERO_PSI #define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) #define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd #define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd @@ -62,10 +62,10 @@ Author: Nils Meyer #define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd #define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd #define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd -#define 
PERMUTE_DIR0 PERM0_A64FXd -#define PERMUTE_DIR1 PERM1_A64FXd -#define PERMUTE_DIR2 PERM2_A64FXd -#define PERMUTE_DIR3 PERM3_A64FXd +#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXd; } +#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXd; } +#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXd; } +#define PERMUTE_DIR3 // DECLARATIONS #define DECLARATIONS_A64FXd \ const uint64_t lut[4][8] = { \ @@ -155,14 +155,14 @@ asm ( \ #define LOAD_CHI_A64FXd(base) \ { \ asm ( \ - "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z12.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z13.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z14.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1d { z15.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z16.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z17.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ : \ - : [fetchptr] "r" (base + 2 * 3 * 64) \ + : [fetchptr] "r" (base) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ ); \ } @@ -234,40 +234,45 @@ asm ( \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ ); \ } -// PERM0 -#define PERM0_A64FXd \ +// LOAD_TABLE0 +#define LOAD_TABLE0 \ asm ( \ "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "tbl z12.d, { z12.d }, z30.d \n\t" \ - "tbl z13.d, { z13.d }, z30.d \n\t" \ - "tbl z14.d, { z14.d }, z30.d \n\t" \ - "tbl z15.d, { z15.d }, z30.d \n\t" \ - "tbl z16.d, { z16.d }, z30.d \n\t" \ - "tbl z17.d, { z17.d }, z30.d \n\t" \ : \ : [tableptr] "r" (&lut[0]),[index] "i" (0) \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); -// PERM1 -#define PERM1_A64FXd \ +// LOAD_TABLE1 +#define LOAD_TABLE1 \ asm ( \ "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "tbl z12.d, { z12.d }, z30.d \n\t" \ - "tbl z13.d, { z13.d }, z30.d \n\t" \ - "tbl z14.d, { z14.d }, z30.d \n\t" \ - "tbl z15.d, { z15.d }, z30.d \n\t" \ - "tbl z16.d, { z16.d }, z30.d \n\t" \ - "tbl z17.d, { z17.d }, z30.d \n\t" \ : \ : [tableptr] "r" (&lut[0]),[index] "i" (1) \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); -// PERM2 -#define PERM2_A64FXd \ +// LOAD_TABLE2 +#define LOAD_TABLE2 \ asm ( \ "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (2) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// LOAD_TABLE3 +#define LOAD_TABLE3 \ +asm ( \ + "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + : \ + : [tableptr] "r" 
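
The PERMUTE_DIRn macros are split into an unconditional index-table load (LOAD_TABLEn) plus a permute guarded by perm, and MAYBEPERM no longer wraps the whole expansion in if (perm); for double precision PERMUTE_DIR3 becomes empty, presumably because that permutation pattern does not arise for the 64-bit lane layout. A scalar sketch of the resulting control flow, with plain arrays standing in for the z registers (all names illustrative):

#include <algorithm>

static unsigned table[8];                          // scratch index vector (z30)

static void load_table(const unsigned (&lut)[8]) { // LOAD_TABLEn analogue
  std::copy(lut, lut + 8, table);                  // always executed
}
static void permute(unsigned (&chi)[8]) {          // PERMn analogue (svtbl)
  unsigned tmp[8];
  for (int i = 0; i < 8; i++) tmp[i] = chi[table[i]];
  std::copy(tmp, tmp + 8, chi);
}
static void permute_dir(unsigned (&chi)[8], const unsigned (&lut)[8], int perm) {
  load_table(lut);          // hoisted out of the conditional
  if (perm) permute(chi);   // only the tbl instructions stay guarded
}
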
(&lut[0]),[index] "i" (3) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// PERM0 +#define PERM0_A64FXd \ +asm ( \ "tbl z12.d, { z12.d }, z30.d \n\t" \ "tbl z13.d, { z13.d }, z30.d \n\t" \ "tbl z14.d, { z14.d }, z30.d \n\t" \ @@ -275,8 +280,36 @@ asm ( \ "tbl z16.d, { z16.d }, z30.d \n\t" \ "tbl z17.d, { z17.d }, z30.d \n\t" \ : \ - : [tableptr] "r" (&lut[0]),[index] "i" (2) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// PERM1 +#define PERM1_A64FXd \ +asm ( \ + "tbl z12.d, { z12.d }, z30.d \n\t" \ + "tbl z13.d, { z13.d }, z30.d \n\t" \ + "tbl z14.d, { z14.d }, z30.d \n\t" \ + "tbl z15.d, { z15.d }, z30.d \n\t" \ + "tbl z16.d, { z16.d }, z30.d \n\t" \ + "tbl z17.d, { z17.d }, z30.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// PERM2 +#define PERM2_A64FXd \ +asm ( \ + "tbl z12.d, { z12.d }, z30.d \n\t" \ + "tbl z13.d, { z13.d }, z30.d \n\t" \ + "tbl z14.d, { z14.d }, z30.d \n\t" \ + "tbl z15.d, { z15.d }, z30.d \n\t" \ + "tbl z16.d, { z16.d }, z30.d \n\t" \ + "tbl z17.d, { z17.d }, z30.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); // PERM3 @@ -287,23 +320,24 @@ asm ( \ { \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ asm ( \ + "ptrue p5.d \n\t" \ "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ "ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ "ld1d { z27.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ "ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ "ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "fmov z18.d , 0 \n\t" \ - "fmov z21.d , 0 \n\t" \ - "fmov z19.d , 0 \n\t" \ - "fmov z22.d , 0 \n\t" \ - "fmov z20.d , 0 \n\t" \ - "fmov z23.d , 0 \n\t" \ + "movprfx z18.d, p5/m, z31.d \n\t" \ "fcmla z18.d, p5/m, z24.d, z12.d, 0 \n\t" \ + "movprfx z21.d, p5/m, z31.d \n\t" \ "fcmla z21.d, p5/m, z24.d, z15.d, 0 \n\t" \ + "movprfx z19.d, p5/m, z31.d \n\t" \ "fcmla z19.d, p5/m, z25.d, z12.d, 0 \n\t" \ + "movprfx z22.d, p5/m, z31.d \n\t" \ "fcmla z22.d, p5/m, z25.d, z15.d, 0 \n\t" \ + "movprfx z20.d, p5/m, z31.d \n\t" \ "fcmla z20.d, p5/m, z26.d, z12.d, 0 \n\t" \ + "movprfx z23.d, p5/m, z31.d \n\t" \ "fcmla z23.d, p5/m, z26.d, z15.d, 0 \n\t" \ "fcmla z18.d, p5/m, z24.d, z12.d, 90 \n\t" \ "fcmla z21.d, p5/m, z24.d, z15.d, 90 \n\t" \ @@ -347,6 +381,7 @@ asm ( \ #define XP_PROJ_A64FXd \ { \ asm ( \ + "ptrue p5.d \n\t" \ "fcadd z12.d, p5/m, z12.d, z27.d, 90 \n\t" \ "fcadd z13.d, p5/m, z13.d, z28.d, 90 \n\t" \ "fcadd z14.d, p5/m, z14.d, z29.d, 90 \n\t" \ @@ -361,11 +396,17 @@ asm ( \ // XP_RECON #define XP_RECON_A64FXd \ asm ( \ + "movprfx z6.d, p5/m, z31.d \n\t" \ "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \ + "movprfx 
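
Zeroing the six accumulators with separate fmov instructions is replaced by a predicated movprfx from z31 directly in front of each first fcmla, assuming z31 is held at zero by the declaration macros; the movprfx/fcmla pair can fuse, so the initialisation no longer costs extra issue slots. The intrinsics analogue of the same idea, as a sketch:

#include <arm_sve.h>

// First term of a complex multiply chain: the accumulator input is a
// zero vector, which the compiler is free to emit as movprfx + fcmla
// instead of a separate zeroing instruction.
inline svfloat64_t cmul_start(svbool_t pg, svfloat64_t u, svfloat64_t chi) {
  svfloat64_t zero = svdup_f64(0.);                 // plays the role of z31
  svfloat64_t acc  = svcmla_x(pg, zero, u, chi, 0); // real rotation
  acc = svcmla_x(pg, acc, u, chi, 90);              // imaginary rotation
  return acc;
}
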
z7.d, p5/m, z31.d \n\t" \ "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \ + "movprfx z8.d, p5/m, z31.d \n\t" \ "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \ + "movprfx z9.d, p5/m, z31.d \n\t" \ "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \ + "movprfx z10.d, p5/m, z31.d \n\t" \ "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \ + "movprfx z11.d, p5/m, z31.d \n\t" \ "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \ "mov z0.d, p5/m, z18.d \n\t" \ "mov z1.d, p5/m, z19.d \n\t" \ @@ -402,7 +443,7 @@ asm ( \ #define YP_PROJ_A64FXd \ { \ asm ( \ - "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ptrue p5.d \n\t" \ "fsub z12.d, p5/m, z12.d, z27.d \n\t" \ "fsub z13.d, p5/m, z13.d, z28.d \n\t" \ "fsub z14.d, p5/m, z14.d, z29.d \n\t" \ @@ -410,15 +451,15 @@ asm ( \ "fadd z16.d, p5/m, z16.d, z25.d \n\t" \ "fadd z17.d, p5/m, z17.d, z26.d \n\t" \ : \ - : [tableptr] "r" (&lut[0]),[index] "i" (2) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); \ } // ZP_PROJ #define ZP_PROJ_A64FXd \ { \ asm ( \ - "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ptrue p5.d \n\t" \ "fcadd z12.d, p5/m, z12.d, z24.d, 90 \n\t" \ "fcadd z13.d, p5/m, z13.d, z25.d, 90 \n\t" \ "fcadd z14.d, p5/m, z14.d, z26.d, 90 \n\t" \ @@ -426,15 +467,15 @@ asm ( \ "fcadd z16.d, p5/m, z16.d, z28.d, 270 \n\t" \ "fcadd z17.d, p5/m, z17.d, z29.d, 270 \n\t" \ : \ - : [tableptr] "r" (&lut[0]),[index] "i" (1) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); \ } // TP_PROJ #define TP_PROJ_A64FXd \ { \ asm ( \ - "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ptrue p5.d \n\t" \ "fadd z12.d, p5/m, z12.d, z24.d \n\t" \ "fadd z13.d, p5/m, z13.d, z25.d \n\t" \ "fadd z14.d, p5/m, z14.d, z26.d \n\t" \ @@ -442,14 +483,15 @@ asm ( \ "fadd z16.d, p5/m, z16.d, z28.d \n\t" \ "fadd z17.d, p5/m, z17.d, z29.d \n\t" \ : \ - : [tableptr] "r" (&lut[0]),[index] "i" (0) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); \ } // XM_PROJ #define XM_PROJ_A64FXd \ { \ asm ( \ + "ptrue p5.d \n\t" \ "fcadd z12.d, p5/m, z12.d, z27.d, 270 \n\t" \ "fcadd z13.d, p5/m, z13.d, z28.d, 270 \n\t" \ "fcadd z14.d, p5/m, z14.d, z29.d, 270 \n\t" \ @@ -464,11 +506,17 @@ asm ( \ // XM_RECON #define XM_RECON_A64FXd \ asm ( \ + "movprfx z6.d, p5/m, z31.d \n\t" \ "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \ + "movprfx z7.d, p5/m, z31.d \n\t" \ "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \ + "movprfx z8.d, p5/m, z31.d \n\t" \ "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \ + "movprfx z9.d, p5/m, 
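
The projector macros used to open with an ld1d of the permutation table, pre-loading z30 for the permute that may follow; that load now lives in LOAD_TABLEn, so the projectors only need a valid predicate. Since each macro is a standalone asm statement with p5 in its clobber list, the all-lanes predicate cannot be assumed live across statements and is re-materialised locally with ptrue. The intrinsics equivalent is simply:

#include <arm_sve.h>

inline svbool_t all_lanes_d() { return svptrue_b64(); }  // "ptrue p5.d"
inline svbool_t all_lanes_s() { return svptrue_b32(); }  // "ptrue p5.s"
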
z31.d \n\t" \ "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \ + "movprfx z10.d, p5/m, z31.d \n\t" \ "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \ + "movprfx z11.d, p5/m, z31.d \n\t" \ "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \ "mov z0.d, p5/m, z18.d \n\t" \ "mov z1.d, p5/m, z19.d \n\t" \ @@ -485,7 +533,7 @@ asm ( \ #define YM_PROJ_A64FXd \ { \ asm ( \ - "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ptrue p5.d \n\t" \ "fadd z12.d, p5/m, z12.d, z27.d \n\t" \ "fadd z13.d, p5/m, z13.d, z28.d \n\t" \ "fadd z14.d, p5/m, z14.d, z29.d \n\t" \ @@ -493,15 +541,15 @@ asm ( \ "fsub z16.d, p5/m, z16.d, z25.d \n\t" \ "fsub z17.d, p5/m, z17.d, z26.d \n\t" \ : \ - : [tableptr] "r" (&lut[0]),[index] "i" (2) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); \ } // ZM_PROJ #define ZM_PROJ_A64FXd \ { \ asm ( \ - "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ptrue p5.d \n\t" \ "fcadd z12.d, p5/m, z12.d, z24.d, 270 \n\t" \ "fcadd z13.d, p5/m, z13.d, z25.d, 270 \n\t" \ "fcadd z14.d, p5/m, z14.d, z26.d, 270 \n\t" \ @@ -509,15 +557,15 @@ asm ( \ "fcadd z16.d, p5/m, z16.d, z28.d, 90 \n\t" \ "fcadd z17.d, p5/m, z17.d, z29.d, 90 \n\t" \ : \ - : [tableptr] "r" (&lut[0]),[index] "i" (1) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); \ } // TM_PROJ #define TM_PROJ_A64FXd \ { \ asm ( \ - "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ptrue p5.d \n\t" \ "fsub z12.d, p5/m, z12.d, z24.d \n\t" \ "fsub z13.d, p5/m, z13.d, z25.d \n\t" \ "fsub z14.d, p5/m, z14.d, z26.d \n\t" \ @@ -525,8 +573,8 @@ asm ( \ "fsub z16.d, p5/m, z16.d, z28.d \n\t" \ "fsub z17.d, p5/m, z17.d, z29.d \n\t" \ : \ - : [tableptr] "r" (&lut[0]),[index] "i" (0) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); \ } // XM_RECON_ACCUM @@ -538,12 +586,12 @@ asm ( \ "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \ "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \ "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \ - "mov z0.d, p5/m, z18.d \n\t" \ - "mov z1.d, p5/m, z19.d \n\t" \ - "mov z2.d, p5/m, z20.d \n\t" \ - "mov z3.d, p5/m, z21.d \n\t" \ - "mov z4.d, p5/m, z22.d \n\t" \ - "mov z5.d, p5/m, z23.d \n\t" \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ : \ : \ : 
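
The substantive fix in these hunks is the RECON_ACCUM tail: the upper spin rows previously used predicated mov instructions, overwriting whatever earlier directions had accumulated into z0..z5, and now use fadd. In scalar form for one colour (illustrative helper, std::complex as a lane stand-in):

#include <complex>
using cplx = std::complex<double>;

inline void xm_recon_accum(cplx result[4], const cplx UChi[2]) {
  const cplx I(0.0, 1.0);
  result[2] += I * UChi[1];   // fcadd ..., 90: was already accumulating
  result[3] += I * UChi[0];
  result[0] += UChi[0];       // was "mov": result[0] = UChi[0], the bug
  result[1] += UChi[1];       // now "fadd": accumulate
}
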
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h index ac710a97..210d537e 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ b/Grid/simd/Fujitsu_A64FX_asm_single.h @@ -2,7 +2,7 @@ Grid physics library, www.github.com/paboyle/Grid - Source file: XXX + Source file: Fujitsu_A64FX_asm_single.h Copyright (C) 2020 @@ -40,9 +40,9 @@ Author: Nils Meyer #define LOAD64(A,B) #define SAVE_RESULT(A,B) RESULT_A64FXf(A) #define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXf(A) -#define MAYBEPERM(A,perm) if (perm) { A ; } +#define MAYBEPERM(A,perm) { A ; } #define LOAD_CHI(base) LOAD_CHI_A64FXf(base) -#define ZERO_PSI ZERO_PSI_A64FXf +#define ZERO_PSI #define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) #define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf #define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf @@ -62,10 +62,10 @@ Author: Nils Meyer #define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf #define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf #define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf -#define PERMUTE_DIR0 PERM0_A64FXf -#define PERMUTE_DIR1 PERM1_A64FXf -#define PERMUTE_DIR2 PERM2_A64FXf -#define PERMUTE_DIR3 PERM3_A64FXf +#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXf; } +#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXf; } +#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXf; } +#define PERMUTE_DIR3 LOAD_TABLE3; if (perm) { PERM3_A64FXf; } // DECLARATIONS #define DECLARATIONS_A64FXf \ const uint32_t lut[4][16] = { \ @@ -155,14 +155,14 @@ asm ( \ #define LOAD_CHI_A64FXf(base) \ { \ asm ( \ - "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1w { z12.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z13.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z14.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1w { z15.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1w { z16.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1w { z17.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ : \ - : [fetchptr] "r" (base + 2 * 3 * 64) \ + : [fetchptr] "r" (base) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ ); \ } @@ -234,55 +234,45 @@ asm ( \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ ); \ } -// PERM0 -#define PERM0_A64FXf \ +// LOAD_TABLE0 +#define LOAD_TABLE0 \ asm ( \ "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "tbl z12.s, { z12.s }, z30.s \n\t" \ - "tbl z13.s, { z13.s }, z30.s \n\t" \ - "tbl z14.s, { z14.s }, z30.s \n\t" \ - "tbl z15.s, { z15.s }, z30.s \n\t" \ - "tbl z16.s, { z16.s }, z30.s \n\t" \ - "tbl z17.s, { z17.s }, z30.s \n\t" \ : \ : [tableptr] "r" (&lut[0]),[index] "i" (0) \ : 
"memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); -// PERM1 -#define PERM1_A64FXf \ +// LOAD_TABLE1 +#define LOAD_TABLE1 \ asm ( \ "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "tbl z12.s, { z12.s }, z30.s \n\t" \ - "tbl z13.s, { z13.s }, z30.s \n\t" \ - "tbl z14.s, { z14.s }, z30.s \n\t" \ - "tbl z15.s, { z15.s }, z30.s \n\t" \ - "tbl z16.s, { z16.s }, z30.s \n\t" \ - "tbl z17.s, { z17.s }, z30.s \n\t" \ : \ : [tableptr] "r" (&lut[0]),[index] "i" (1) \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); -// PERM2 -#define PERM2_A64FXf \ +// LOAD_TABLE2 +#define LOAD_TABLE2 \ asm ( \ "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ - "tbl z12.s, { z12.s }, z30.s \n\t" \ - "tbl z13.s, { z13.s }, z30.s \n\t" \ - "tbl z14.s, { z14.s }, z30.s \n\t" \ - "tbl z15.s, { z15.s }, z30.s \n\t" \ - "tbl z16.s, { z16.s }, z30.s \n\t" \ - "tbl z17.s, { z17.s }, z30.s \n\t" \ : \ : [tableptr] "r" (&lut[0]),[index] "i" (2) \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); -// PERM3 -#define PERM3_A64FXf \ +// LOAD_TABLE3 +#define LOAD_TABLE3 \ asm ( \ "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (3) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// PERM0 +#define PERM0_A64FXf \ +asm ( \ "tbl z12.s, { z12.s }, z30.s \n\t" \ "tbl z13.s, { z13.s }, z30.s \n\t" \ "tbl z14.s, { z14.s }, z30.s \n\t" \ @@ -290,8 +280,50 @@ asm ( \ "tbl z16.s, { z16.s }, z30.s \n\t" \ "tbl z17.s, { z17.s }, z30.s \n\t" \ : \ - : [tableptr] "r" (&lut[0]),[index] "i" (3) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// PERM1 +#define PERM1_A64FXf \ +asm ( \ + "tbl z12.s, { z12.s }, z30.s \n\t" \ + "tbl z13.s, { z13.s }, z30.s \n\t" \ + "tbl z14.s, { z14.s }, z30.s \n\t" \ + "tbl z15.s, { z15.s }, z30.s \n\t" \ + "tbl z16.s, { z16.s }, z30.s \n\t" \ + "tbl z17.s, { z17.s }, z30.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// PERM2 +#define PERM2_A64FXf \ +asm ( \ + "tbl z12.s, { z12.s }, z30.s \n\t" \ + "tbl z13.s, { z13.s }, z30.s \n\t" \ + "tbl z14.s, { z14.s }, z30.s \n\t" \ + "tbl z15.s, { z15.s }, z30.s \n\t" \ + "tbl z16.s, { z16.s }, z30.s \n\t" \ + "tbl z17.s, { z17.s }, z30.s \n\t" \ + : \ + : \ + : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// PERM3 +#define PERM3_A64FXf \ +asm ( \ + "tbl z12.s, { z12.s }, z30.s \n\t" \ + "tbl z13.s, { z13.s }, z30.s \n\t" \ + "tbl z14.s, { z14.s }, z30.s \n\t" \ + "tbl z15.s, { z15.s }, z30.s \n\t" \ + "tbl z16.s, { z16.s }, z30.s \n\t" \ + "tbl z17.s, { z17.s }, z30.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); // MULT_2SPIN @@ -299,23 +331,24 @@ asm ( \ { \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ asm ( \ + "ptrue p5.s \n\t" \ "ld1w { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ "ld1w { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ "ld1w { z26.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ "ld1w { z27.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ "ld1w { z28.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ "ld1w { z29.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "fmov z18.s , 0 \n\t" \ - "fmov z21.s , 0 \n\t" \ - "fmov z19.s , 0 \n\t" \ - "fmov z22.s , 0 \n\t" \ - "fmov z20.s , 0 \n\t" \ - "fmov z23.s , 0 \n\t" \ + "movprfx z18.s, p5/m, z31.s \n\t" \ "fcmla z18.s, p5/m, z24.s, z12.s, 0 \n\t" \ + "movprfx z21.s, p5/m, z31.s \n\t" \ "fcmla z21.s, p5/m, z24.s, z15.s, 0 \n\t" \ + "movprfx z19.s, p5/m, z31.s \n\t" \ "fcmla z19.s, p5/m, z25.s, z12.s, 0 \n\t" \ + "movprfx z22.s, p5/m, z31.s \n\t" \ "fcmla z22.s, p5/m, z25.s, z15.s, 0 \n\t" \ + "movprfx z20.s, p5/m, z31.s \n\t" \ "fcmla z20.s, p5/m, z26.s, z12.s, 0 \n\t" \ + "movprfx z23.s, p5/m, z31.s \n\t" \ "fcmla z23.s, p5/m, z26.s, z15.s, 0 \n\t" \ "fcmla z18.s, p5/m, z24.s, z12.s, 90 \n\t" \ "fcmla z21.s, p5/m, z24.s, z15.s, 90 \n\t" \ @@ -359,7 +392,7 @@ asm ( \ #define XP_PROJ_A64FXf \ { \ asm ( \ - "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ptrue p5.s \n\t" \ "fcadd z12.s, p5/m, z12.s, z27.s, 90 \n\t" \ "fcadd z13.s, p5/m, z13.s, z28.s, 90 \n\t" \ "fcadd z14.s, p5/m, z14.s, z29.s, 90 \n\t" \ @@ -367,18 +400,24 @@ asm ( \ "fcadd z16.s, p5/m, z16.s, z25.s, 90 \n\t" \ "fcadd z17.s, p5/m, z17.s, z26.s, 90 \n\t" \ : \ - : [tableptr] "r" (&lut[0]),[index] "i" (3) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); \ } // XP_RECON #define XP_RECON_A64FXf \ asm ( \ + "movprfx z6.s, p5/m, z31.s \n\t" \ "fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \ + "movprfx z7.s, p5/m, z31.s \n\t" \ "fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \ + "movprfx z8.s, p5/m, z31.s \n\t" \ "fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \ + "movprfx z9.s, p5/m, z31.s \n\t" \ "fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \ + "movprfx z10.s, p5/m, z31.s \n\t" \ "fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \ + "movprfx z11.s, p5/m, z31.s \n\t" \ "fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \ "mov z0.s, p5/m, z18.s \n\t" \ "mov z1.s, p5/m, z19.s \n\t" \ @@ -415,7 +454,7 @@ asm ( \ #define YP_PROJ_A64FXf \ { \ asm ( \ - "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ptrue p5.s \n\t" \ "fsub z12.s, 
p5/m, z12.s, z27.s \n\t" \ "fsub z13.s, p5/m, z13.s, z28.s \n\t" \ "fsub z14.s, p5/m, z14.s, z29.s \n\t" \ @@ -423,15 +462,15 @@ asm ( \ "fadd z16.s, p5/m, z16.s, z25.s \n\t" \ "fadd z17.s, p5/m, z17.s, z26.s \n\t" \ : \ - : [tableptr] "r" (&lut[0]),[index] "i" (2) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); \ } // ZP_PROJ #define ZP_PROJ_A64FXf \ { \ asm ( \ - "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ptrue p5.s \n\t" \ "fcadd z12.s, p5/m, z12.s, z24.s, 90 \n\t" \ "fcadd z13.s, p5/m, z13.s, z25.s, 90 \n\t" \ "fcadd z14.s, p5/m, z14.s, z26.s, 90 \n\t" \ @@ -439,15 +478,15 @@ asm ( \ "fcadd z16.s, p5/m, z16.s, z28.s, 270 \n\t" \ "fcadd z17.s, p5/m, z17.s, z29.s, 270 \n\t" \ : \ - : [tableptr] "r" (&lut[0]),[index] "i" (1) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); \ } // TP_PROJ #define TP_PROJ_A64FXf \ { \ asm ( \ - "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ptrue p5.s \n\t" \ "fadd z12.s, p5/m, z12.s, z24.s \n\t" \ "fadd z13.s, p5/m, z13.s, z25.s \n\t" \ "fadd z14.s, p5/m, z14.s, z26.s \n\t" \ @@ -455,15 +494,15 @@ asm ( \ "fadd z16.s, p5/m, z16.s, z28.s \n\t" \ "fadd z17.s, p5/m, z17.s, z29.s \n\t" \ : \ - : [tableptr] "r" (&lut[0]),[index] "i" (0) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); \ } // XM_PROJ #define XM_PROJ_A64FXf \ { \ asm ( \ - "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ptrue p5.s \n\t" \ "fcadd z12.s, p5/m, z12.s, z27.s, 270 \n\t" \ "fcadd z13.s, p5/m, z13.s, z28.s, 270 \n\t" \ "fcadd z14.s, p5/m, z14.s, z29.s, 270 \n\t" \ @@ -471,18 +510,24 @@ asm ( \ "fcadd z16.s, p5/m, z16.s, z25.s, 270 \n\t" \ "fcadd z17.s, p5/m, z17.s, z26.s, 270 \n\t" \ : \ - : [tableptr] "r" (&lut[0]),[index] "i" (3) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); \ } // XM_RECON #define XM_RECON_A64FXf \ asm ( \ + "movprfx z6.s, p5/m, z31.s \n\t" \ "fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \ + "movprfx z7.s, p5/m, z31.s \n\t" \ "fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \ + "movprfx z8.s, p5/m, z31.s \n\t" \ "fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \ + "movprfx z9.s, p5/m, 
z31.s \n\t" \ "fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \ + "movprfx z10.s, p5/m, z31.s \n\t" \ "fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \ + "movprfx z11.s, p5/m, z31.s \n\t" \ "fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \ "mov z0.s, p5/m, z18.s \n\t" \ "mov z1.s, p5/m, z19.s \n\t" \ @@ -499,7 +544,7 @@ asm ( \ #define YM_PROJ_A64FXf \ { \ asm ( \ - "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ptrue p5.s \n\t" \ "fadd z12.s, p5/m, z12.s, z27.s \n\t" \ "fadd z13.s, p5/m, z13.s, z28.s \n\t" \ "fadd z14.s, p5/m, z14.s, z29.s \n\t" \ @@ -507,15 +552,15 @@ asm ( \ "fsub z16.s, p5/m, z16.s, z25.s \n\t" \ "fsub z17.s, p5/m, z17.s, z26.s \n\t" \ : \ - : [tableptr] "r" (&lut[0]),[index] "i" (2) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); \ } // ZM_PROJ #define ZM_PROJ_A64FXf \ { \ asm ( \ - "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ptrue p5.s \n\t" \ "fcadd z12.s, p5/m, z12.s, z24.s, 270 \n\t" \ "fcadd z13.s, p5/m, z13.s, z25.s, 270 \n\t" \ "fcadd z14.s, p5/m, z14.s, z26.s, 270 \n\t" \ @@ -523,15 +568,15 @@ asm ( \ "fcadd z16.s, p5/m, z16.s, z28.s, 90 \n\t" \ "fcadd z17.s, p5/m, z17.s, z29.s, 90 \n\t" \ : \ - : [tableptr] "r" (&lut[0]),[index] "i" (1) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); \ } // TM_PROJ #define TM_PROJ_A64FXf \ { \ asm ( \ - "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ptrue p5.s \n\t" \ "fsub z12.s, p5/m, z12.s, z24.s \n\t" \ "fsub z13.s, p5/m, z13.s, z25.s \n\t" \ "fsub z14.s, p5/m, z14.s, z26.s \n\t" \ @@ -539,8 +584,8 @@ asm ( \ "fsub z16.s, p5/m, z16.s, z28.s \n\t" \ "fsub z17.s, p5/m, z17.s, z29.s \n\t" \ : \ - : [tableptr] "r" (&lut[0]),[index] "i" (0) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); \ } // XM_RECON_ACCUM @@ -552,12 +597,12 @@ asm ( \ "fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \ "fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \ "fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \ - "mov z0.s, p5/m, z18.s \n\t" \ - "mov z1.s, p5/m, z19.s \n\t" \ - "mov z2.s, p5/m, z20.s \n\t" \ - "mov z3.s, p5/m, z21.s \n\t" \ - "mov z4.s, p5/m, z22.s \n\t" \ - "mov z5.s, p5/m, z23.s \n\t" \ + "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ + "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ + "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ + "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ + "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ + "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ : \ : \ : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h index 1663fc07..f94d4f47 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_double.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -2,7 +2,7 @@ Grid physics library, www.github.com/paboyle/Grid - Source file: XXX + Source file: Fujitsu_A64FX_intrin_double.h Copyright (C) 2020 @@ -40,9 +40,9 @@ Author: Nils Meyer #define LOAD64(A,B) #define SAVE_RESULT(A,B) RESULT_A64FXd(A) #define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXd(A) -#define MAYBEPERM(A,perm) if (perm) { A ; } +#define MAYBEPERM(A,perm) { A ; } #define LOAD_CHI(base) LOAD_CHI_A64FXd(base) -#define ZERO_PSI ZERO_PSI_A64FXd +#define ZERO_PSI #define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) #define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd #define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd @@ -62,10 +62,10 @@ Author: Nils Meyer #define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd #define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd #define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd -#define PERMUTE_DIR0 PERM0_A64FXd -#define PERMUTE_DIR1 PERM1_A64FXd -#define PERMUTE_DIR2 PERM2_A64FXd -#define PERMUTE_DIR3 PERM3_A64FXd +#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXd; } +#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXd; } +#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXd; } +#define PERMUTE_DIR3 // DECLARATIONS #define DECLARATIONS_A64FXd \ const uint64_t lut[4][8] = { \ @@ -170,12 +170,12 @@ Author: Nils Meyer // LOAD_CHI #define LOAD_CHI_A64FXd(base) \ { \ - Chi_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chi_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chi_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chi_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chi_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chi_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ + Chi_00 = svld1(pg1, (float64_t*)(base + 0 * 64)); \ + Chi_01 = svld1(pg1, (float64_t*)(base + 1 * 64)); \ + Chi_02 = svld1(pg1, (float64_t*)(base + 2 * 64)); \ + Chi_10 = svld1(pg1, (float64_t*)(base + 3 * 64)); \ + Chi_11 = svld1(pg1, (float64_t*)(base + 4 * 64)); \ + Chi_12 = svld1(pg1, (float64_t*)(base + 5 * 64)); \ } // LOAD_CHIMU #define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \ @@ -227,9 +227,24 @@ Author: Nils Meyer Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ } +// LOAD_TABLE0 +#define LOAD_TABLE0 \ + table0 = svld1(pg1, (uint64_t*)&lut[0]); + +// LOAD_TABLE1 +#define LOAD_TABLE1 \ + table0 = svld1(pg1, (uint64_t*)&lut[1]); + +// LOAD_TABLE2 +#define LOAD_TABLE2 \ + table0 = svld1(pg1, (uint64_t*)&lut[2]); + +// LOAD_TABLE3 +#define LOAD_TABLE3 \ + table0 = svld1(pg1, (uint64_t*)&lut[3]); + // PERM0 #define PERM0_A64FXd \ - table0 = svld1(pg1, (uint64_t*)&lut[0]); \ Chi_00 = svtbl(Chi_00, table0); \ Chi_01 = svtbl(Chi_01, table0); \ Chi_02 = svtbl(Chi_02, table0); \ @@ -239,7 +254,6 @@ Author: Nils Meyer // PERM1 #define PERM1_A64FXd \ - table0 = svld1(pg1, (uint64_t*)&lut[1]); \ Chi_00 = svtbl(Chi_00, table0); \ Chi_01 = svtbl(Chi_01, table0); \ Chi_02 = svtbl(Chi_02, table0); \ @@ -249,7 +263,6 @@ Author: Nils 
Meyer // PERM2 #define PERM2_A64FXd \ - table0 = svld1(pg1, (uint64_t*)&lut[2]); \ Chi_00 = svtbl(Chi_00, table0); \ Chi_01 = svtbl(Chi_01, table0); \ Chi_02 = svtbl(Chi_02, table0); \ @@ -270,18 +283,12 @@ Author: Nils Meyer U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ - UChi_00 = __svzero(UChi_00); \ - UChi_10 = __svzero(UChi_10); \ - UChi_01 = __svzero(UChi_01); \ - UChi_11 = __svzero(UChi_11); \ - UChi_02 = __svzero(UChi_02); \ - UChi_12 = __svzero(UChi_12); \ - UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 0); \ - UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 0); \ - UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 0); \ - UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 0); \ - UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 0); \ - UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 0); \ + UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \ + UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \ + UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \ + UChi_11 = svcmla_x(pg1, zero0, U_10, Chi_10, 0); \ + UChi_02 = svcmla_x(pg1, zero0, U_20, Chi_00, 0); \ + UChi_12 = svcmla_x(pg1, zero0, U_20, Chi_10, 0); \ UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 90); \ UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 90); \ UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 90); \ @@ -328,12 +335,12 @@ Author: Nils Meyer } // XP_RECON #define XP_RECON_A64FXd \ - result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \ - result_21 = svcadd_x(pg1, result_21, UChi_11, 270); \ - result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \ - result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \ - result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \ - result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \ + result_20 = svcadd_x(pg1, zero0, UChi_10, 270); \ + result_21 = svcadd_x(pg1, zero0, UChi_11, 270); \ + result_22 = svcadd_x(pg1, zero0, UChi_12, 270); \ + result_30 = svcadd_x(pg1, zero0, UChi_00, 270); \ + result_31 = svcadd_x(pg1, zero0, UChi_01, 270); \ + result_32 = svcadd_x(pg1, zero0, UChi_02, 270); \ result_00 = UChi_00; \ result_01 = UChi_01; \ result_02 = UChi_02; \ @@ -359,7 +366,6 @@ Author: Nils Meyer // YP_PROJ #define YP_PROJ_A64FXd \ { \ - table0 = svld1(pg1, (uint64_t*)&lut[2]); \ Chi_00 = svsub_x(pg1, Chimu_00, Chimu_30); \ Chi_01 = svsub_x(pg1, Chimu_01, Chimu_31); \ Chi_02 = svsub_x(pg1, Chimu_02, Chimu_32); \ @@ -370,7 +376,6 @@ Author: Nils Meyer // ZP_PROJ #define ZP_PROJ_A64FXd \ { \ - table0 = svld1(pg1, (uint64_t*)&lut[1]); \ Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 90); \ Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 90); \ Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 90); \ @@ -381,7 +386,6 @@ Author: Nils Meyer // TP_PROJ #define TP_PROJ_A64FXd \ { \ - table0 = svld1(pg1, (uint64_t*)&lut[0]); \ Chi_00 = svadd_x(pg1, Chimu_00, Chimu_20); \ Chi_01 = svadd_x(pg1, Chimu_01, Chimu_21); \ Chi_02 = svadd_x(pg1, Chimu_02, Chimu_22); \ @@ -401,12 +405,12 @@ Author: Nils Meyer } // XM_RECON #define XM_RECON_A64FXd \ - result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \ - result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \ - result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \ - result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \ - result_31 = svcadd_x(pg1, result_31, UChi_01, 90); \ - result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \ + result_20 = svcadd_x(pg1, zero0, UChi_10, 90); \ + result_21 = svcadd_x(pg1, zero0, UChi_11, 90); \ + result_22 = 
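
__svzero appears to be a compiler-specific builtin; the rewrite instead feeds a pre-initialised zero vector (zero0, presumably set up once in the declarations) as the accumulator input of the first svcmla. That is plain ACLE, and it hands the compiler the same movprfx opportunity exploited in the hand-written asm. Sketched with zero0 materialised locally so the snippet stands alone:

#include <arm_sve.h>

inline svfloat64_t uchi_first(svbool_t pg, svfloat64_t U, svfloat64_t Chi) {
  svfloat64_t zero0 = svdup_f64(0.);
  return svcmla_x(pg, zero0, U, Chi, 0);  // UChi = 0 + U*Chi (real rotation)
}
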
svcadd_x(pg1, zero0, UChi_12, 90); \ + result_30 = svcadd_x(pg1, zero0, UChi_00, 90); \ + result_31 = svcadd_x(pg1, zero0, UChi_01, 90); \ + result_32 = svcadd_x(pg1, zero0, UChi_02, 90); \ result_00 = UChi_00; \ result_01 = UChi_01; \ result_02 = UChi_02; \ @@ -417,7 +421,6 @@ Author: Nils Meyer // YM_PROJ #define YM_PROJ_A64FXd \ { \ - table0 = svld1(pg1, (uint64_t*)&lut[2]); \ Chi_00 = svadd_x(pg1, Chimu_00, Chimu_30); \ Chi_01 = svadd_x(pg1, Chimu_01, Chimu_31); \ Chi_02 = svadd_x(pg1, Chimu_02, Chimu_32); \ @@ -428,7 +431,6 @@ Author: Nils Meyer // ZM_PROJ #define ZM_PROJ_A64FXd \ { \ - table0 = svld1(pg1, (uint64_t*)&lut[1]); \ Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 270); \ Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 270); \ Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 270); \ @@ -439,7 +441,6 @@ Author: Nils Meyer // TM_PROJ #define TM_PROJ_A64FXd \ { \ - table0 = svld1(pg1, (uint64_t*)&lut[0]); \ Chi_00 = svsub_x(pg1, Chimu_00, Chimu_20); \ Chi_01 = svsub_x(pg1, Chimu_01, Chimu_21); \ Chi_02 = svsub_x(pg1, Chimu_02, Chimu_22); \ @@ -455,12 +456,12 @@ Author: Nils Meyer result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \ result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \ result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \ - result_00 = UChi_00; \ - result_01 = UChi_01; \ - result_02 = UChi_02; \ - result_10 = UChi_10; \ - result_11 = UChi_11; \ - result_12 = UChi_12; + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); // YP_RECON_ACCUM #define YP_RECON_ACCUM_A64FXd \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h index c8e12652..7329e4dc 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_single.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -2,7 +2,7 @@ Grid physics library, www.github.com/paboyle/Grid - Source file: XXX + Source file: Fujitsu_A64FX_intrin_single.h Copyright (C) 2020 @@ -40,9 +40,9 @@ Author: Nils Meyer #define LOAD64(A,B) #define SAVE_RESULT(A,B) RESULT_A64FXf(A) #define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXf(A) -#define MAYBEPERM(A,perm) if (perm) { A ; } +#define MAYBEPERM(A,perm) { A ; } #define LOAD_CHI(base) LOAD_CHI_A64FXf(base) -#define ZERO_PSI ZERO_PSI_A64FXf +#define ZERO_PSI #define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) #define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf #define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf @@ -62,10 +62,10 @@ Author: Nils Meyer #define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf #define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf #define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf -#define PERMUTE_DIR0 PERM0_A64FXf -#define PERMUTE_DIR1 PERM1_A64FXf -#define PERMUTE_DIR2 PERM2_A64FXf -#define PERMUTE_DIR3 PERM3_A64FXf +#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXf; } +#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXf; } +#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXf; } +#define PERMUTE_DIR3 LOAD_TABLE3; if (perm) { PERM3_A64FXf; } // DECLARATIONS #define DECLARATIONS_A64FXf \ const uint32_t lut[4][16] = { \ @@ -170,12 +170,12 @@ Author: Nils Meyer // LOAD_CHI #define LOAD_CHI_A64FXf(base) \ { \ - Chi_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chi_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - 
Chi_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chi_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chi_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chi_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ + Chi_00 = svld1(pg1, (float32_t*)(base + 0 * 64)); \ + Chi_01 = svld1(pg1, (float32_t*)(base + 1 * 64)); \ + Chi_02 = svld1(pg1, (float32_t*)(base + 2 * 64)); \ + Chi_10 = svld1(pg1, (float32_t*)(base + 3 * 64)); \ + Chi_11 = svld1(pg1, (float32_t*)(base + 4 * 64)); \ + Chi_12 = svld1(pg1, (float32_t*)(base + 5 * 64)); \ } // LOAD_CHIMU #define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \ @@ -227,9 +227,24 @@ Author: Nils Meyer Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ } +// LOAD_TABLE0 +#define LOAD_TABLE0 \ + table0 = svld1(pg1, (uint32_t*)&lut[0]); + +// LOAD_TABLE1 +#define LOAD_TABLE1 \ + table0 = svld1(pg1, (uint32_t*)&lut[1]); + +// LOAD_TABLE2 +#define LOAD_TABLE2 \ + table0 = svld1(pg1, (uint32_t*)&lut[2]); + +// LOAD_TABLE3 +#define LOAD_TABLE3 \ + table0 = svld1(pg1, (uint32_t*)&lut[3]); + // PERM0 #define PERM0_A64FXf \ - table0 = svld1(pg1, (float32_t*)&lut[0]); \ Chi_00 = svtbl(Chi_00, table0); \ Chi_01 = svtbl(Chi_01, table0); \ Chi_02 = svtbl(Chi_02, table0); \ @@ -239,7 +254,6 @@ Author: Nils Meyer // PERM1 #define PERM1_A64FXf \ - table0 = svld1(pg1, (float32_t*)&lut[1]); \ Chi_00 = svtbl(Chi_00, table0); \ Chi_01 = svtbl(Chi_01, table0); \ Chi_02 = svtbl(Chi_02, table0); \ @@ -249,7 +263,6 @@ Author: Nils Meyer // PERM2 #define PERM2_A64FXf \ - table0 = svld1(pg1, (float32_t*)&lut[2]); \ Chi_00 = svtbl(Chi_00, table0); \ Chi_01 = svtbl(Chi_01, table0); \ Chi_02 = svtbl(Chi_02, table0); \ @@ -259,7 +272,6 @@ Author: Nils Meyer // PERM3 #define PERM3_A64FXf \ - table0 = svld1(pg1, (float32_t*)&lut[3]); \ Chi_00 = svtbl(Chi_00, table0); \ Chi_01 = svtbl(Chi_01, table0); \ Chi_02 = svtbl(Chi_02, table0); \ @@ -277,18 +289,12 @@ Author: Nils Meyer U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ - UChi_00 = __svzero(UChi_00); \ - UChi_10 = __svzero(UChi_10); \ - UChi_01 = __svzero(UChi_01); \ - UChi_11 = __svzero(UChi_11); \ - UChi_02 = __svzero(UChi_02); \ - UChi_12 = __svzero(UChi_12); \ - UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 0); \ - UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 0); \ - UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 0); \ - UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 0); \ - UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 0); \ - UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 0); \ + UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \ + UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \ + UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \ + UChi_11 = svcmla_x(pg1, zero0, U_10, Chi_10, 0); \ + UChi_02 = svcmla_x(pg1, zero0, U_20, Chi_00, 0); \ + UChi_12 = svcmla_x(pg1, zero0, U_20, Chi_10, 0); \ UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 90); \ UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 90); \ UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 90); \ @@ -326,7 +332,6 @@ Author: Nils Meyer // XP_PROJ #define XP_PROJ_A64FXf \ { \ - table0 = svld1(pg1, (float32_t*)&lut[3]); \ Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 90); \ Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 90); \ Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 90); \ @@ -336,12 +341,12 @@ Author: 
Nils Meyer } // XP_RECON #define XP_RECON_A64FXf \ - result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \ - result_21 = svcadd_x(pg1, result_21, UChi_11, 270); \ - result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \ - result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \ - result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \ - result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \ + result_20 = svcadd_x(pg1, zero0, UChi_10, 270); \ + result_21 = svcadd_x(pg1, zero0, UChi_11, 270); \ + result_22 = svcadd_x(pg1, zero0, UChi_12, 270); \ + result_30 = svcadd_x(pg1, zero0, UChi_00, 270); \ + result_31 = svcadd_x(pg1, zero0, UChi_01, 270); \ + result_32 = svcadd_x(pg1, zero0, UChi_02, 270); \ result_00 = UChi_00; \ result_01 = UChi_01; \ result_02 = UChi_02; \ @@ -367,7 +372,6 @@ Author: Nils Meyer // YP_PROJ #define YP_PROJ_A64FXf \ { \ - table0 = svld1(pg1, (float32_t*)&lut[2]); \ Chi_00 = svsub_x(pg1, Chimu_00, Chimu_30); \ Chi_01 = svsub_x(pg1, Chimu_01, Chimu_31); \ Chi_02 = svsub_x(pg1, Chimu_02, Chimu_32); \ @@ -378,7 +382,6 @@ Author: Nils Meyer // ZP_PROJ #define ZP_PROJ_A64FXf \ { \ - table0 = svld1(pg1, (float32_t*)&lut[1]); \ Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 90); \ Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 90); \ Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 90); \ @@ -389,7 +392,6 @@ Author: Nils Meyer // TP_PROJ #define TP_PROJ_A64FXf \ { \ - table0 = svld1(pg1, (float32_t*)&lut[0]); \ Chi_00 = svadd_x(pg1, Chimu_00, Chimu_20); \ Chi_01 = svadd_x(pg1, Chimu_01, Chimu_21); \ Chi_02 = svadd_x(pg1, Chimu_02, Chimu_22); \ @@ -400,7 +402,6 @@ Author: Nils Meyer // XM_PROJ #define XM_PROJ_A64FXf \ { \ - table0 = svld1(pg1, (float32_t*)&lut[3]); \ Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 270); \ Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 270); \ Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 270); \ @@ -410,12 +411,12 @@ Author: Nils Meyer } // XM_RECON #define XM_RECON_A64FXf \ - result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \ - result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \ - result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \ - result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \ - result_31 = svcadd_x(pg1, result_31, UChi_01, 90); \ - result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \ + result_20 = svcadd_x(pg1, zero0, UChi_10, 90); \ + result_21 = svcadd_x(pg1, zero0, UChi_11, 90); \ + result_22 = svcadd_x(pg1, zero0, UChi_12, 90); \ + result_30 = svcadd_x(pg1, zero0, UChi_00, 90); \ + result_31 = svcadd_x(pg1, zero0, UChi_01, 90); \ + result_32 = svcadd_x(pg1, zero0, UChi_02, 90); \ result_00 = UChi_00; \ result_01 = UChi_01; \ result_02 = UChi_02; \ @@ -426,7 +427,6 @@ Author: Nils Meyer // YM_PROJ #define YM_PROJ_A64FXf \ { \ - table0 = svld1(pg1, (float32_t*)&lut[2]); \ Chi_00 = svadd_x(pg1, Chimu_00, Chimu_30); \ Chi_01 = svadd_x(pg1, Chimu_01, Chimu_31); \ Chi_02 = svadd_x(pg1, Chimu_02, Chimu_32); \ @@ -437,7 +437,6 @@ Author: Nils Meyer // ZM_PROJ #define ZM_PROJ_A64FXf \ { \ - table0 = svld1(pg1, (float32_t*)&lut[1]); \ Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 270); \ Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 270); \ Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 270); \ @@ -448,7 +447,6 @@ Author: Nils Meyer // TM_PROJ #define TM_PROJ_A64FXf \ { \ - table0 = svld1(pg1, (float32_t*)&lut[0]); \ Chi_00 = svsub_x(pg1, Chimu_00, Chimu_20); \ Chi_01 = svsub_x(pg1, Chimu_01, Chimu_21); \ Chi_02 = svsub_x(pg1, Chimu_02, Chimu_22); \ @@ -464,12 +462,12 @@ Author: Nils Meyer result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \ result_21 = svcadd_x(pg1, 
result_21, UChi_11, 90); \ result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \ - result_00 = UChi_00; \ - result_01 = UChi_01; \ - result_02 = UChi_02; \ - result_10 = UChi_10; \ - result_11 = UChi_11; \ - result_12 = UChi_12; + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); // YP_RECON_ACCUM #define YP_RECON_ACCUM_A64FXf \ From 113f277b6ac57bee08d8393cf4d011ba637059c8 Mon Sep 17 00:00:00 2001 From: nils meyer Date: Sat, 11 Apr 2020 04:55:01 +0200 Subject: [PATCH 038/147] enable dslash asm using -DA64FXASM, additionally -DDSLASHINTRIN for intrinsics impl --- .../implementation/WilsonKernelsAsmA64FX.h | 2 +- .../WilsonKernelsHandImplementation.h | 403 ++------ .../WilsonKernelsHandImplementation.h.debug | 943 ++++++++++++++++++ Grid/simd/Fujitsu_A64FX_asm_double.h | 7 - Grid/simd/Fujitsu_A64FX_asm_single.h | 7 - Grid/simd/Fujitsu_A64FX_undef.h | 4 + 6 files changed, 1020 insertions(+), 346 deletions(-) create mode 100644 Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h.debug diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h index 139721e6..d14f4b9c 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h @@ -30,7 +30,7 @@ Author: paboyle /* END LEGAL */ #pragma once -#if defined(A64FX) +#if defined(A64FXASM) #pragma message("invoking A64FX Dslash") diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h index 6f3edbb5..f7b018fa 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h @@ -1,6 +1,6 @@ /************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid + Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/qcd/action/fermion/WilsonKernelsHand.cc @@ -32,29 +32,29 @@ Author: paboyle #include -#undef LOAD_CHIMU -#undef LOAD_CHI +#undef LOAD_CHIMU +#undef LOAD_CHI #undef MULT_2SPIN #undef PERMUTE_DIR -#undef XP_PROJ -#undef YP_PROJ -#undef ZP_PROJ -#undef TP_PROJ -#undef XM_PROJ -#undef YM_PROJ -#undef ZM_PROJ -#undef TM_PROJ -#undef XP_RECON -#undef XP_RECON_ACCUM -#undef XM_RECON -#undef XM_RECON_ACCUM -#undef YP_RECON_ACCUM -#undef YM_RECON_ACCUM -#undef ZP_RECON_ACCUM -#undef ZM_RECON_ACCUM -#undef TP_RECON_ACCUM -#undef TM_RECON_ACCUM -#undef ZERO_RESULT +#undef XP_PROJ +#undef YP_PROJ +#undef ZP_PROJ +#undef TP_PROJ +#undef XM_PROJ +#undef YM_PROJ +#undef ZM_PROJ +#undef TM_PROJ +#undef XP_RECON +#undef XP_RECON_ACCUM +#undef XM_RECON +#undef XM_RECON_ACCUM +#undef YP_RECON_ACCUM +#undef YM_RECON_ACCUM +#undef ZP_RECON_ACCUM +#undef ZM_RECON_ACCUM +#undef TP_RECON_ACCUM +#undef TM_RECON_ACCUM +#undef ZERO_RESULT #undef Chimu_00 #undef Chimu_01 #undef Chimu_02 @@ -89,21 +89,7 @@ Author: paboyle Chimu_22=ref()(2)(2);\ Chimu_30=ref()(3)(0);\ Chimu_31=ref()(3)(1);\ - Chimu_32=ref()(3)(2);\ - std::cout << std::endl << "DEBUG -- LOAD_CHIMU" << std::endl; \ - std::cout << "Chimu_00 -- " << Chimu_00 << std::endl; \ - std::cout << "Chimu_01 -- " 
<< Chimu_01 << std::endl; \ - std::cout << "Chimu_02 -- " << Chimu_02 << std::endl; \ - std::cout << "Chimu_10 -- " << Chimu_10 << std::endl; \ - std::cout << "Chimu_11 -- " << Chimu_11 << std::endl; \ - std::cout << "Chimu_12 -- " << Chimu_12 << std::endl; \ - std::cout << "Chimu_20 -- " << Chimu_20 << std::endl; \ - std::cout << "Chimu_21 -- " << Chimu_21 << std::endl; \ - std::cout << "Chimu_22 -- " << Chimu_22 << std::endl; \ - std::cout << "Chimu_30 -- " << Chimu_30 << std::endl; \ - std::cout << "Chimu_31 -- " << Chimu_31 << std::endl; \ - std::cout << "Chimu_32 -- " << Chimu_32 << std::endl; \ -} + Chimu_32=ref()(3)(2);} #define LOAD_CHI\ {const SiteHalfSpinor &ref(buf[offset]); \ @@ -112,15 +98,7 @@ Author: paboyle Chi_02 = ref()(0)(2);\ Chi_10 = ref()(1)(0);\ Chi_11 = ref()(1)(1);\ - Chi_12 = ref()(1)(2);\ - std::cout << std::endl << "DEBUG -- LOAD_CHI" << std::endl; \ - std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ - std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ - std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ - std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ - std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ - std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ - } + Chi_12 = ref()(1)(2);} // To splat or not to splat depends on the implementation #define MULT_2SPIN(A)\ @@ -151,38 +129,16 @@ Author: paboyle UChi_01+= U_10*Chi_02;\ UChi_11+= U_10*Chi_12;\ UChi_02+= U_20*Chi_02;\ - UChi_12+= U_20*Chi_12;\ - std::cout << std::endl << "DEBUG -- MULT_2SPIN" << std::endl; \ - std::cout << "UChi_00 -- " << UChi_00 << std::endl; \ - std::cout << "UChi_01 -- " << UChi_01 << std::endl; \ - std::cout << "UChi_02 -- " << UChi_02 << std::endl; \ - std::cout << "UChi_10 -- " << UChi_10 << std::endl; \ - std::cout << "UChi_11 -- " << UChi_11 << std::endl; \ - std::cout << "UChi_12 -- " << UChi_12 << std::endl; \ - } + UChi_12+= U_20*Chi_12;} #define PERMUTE_DIR(dir) \ -std::cout << std::endl << "DEBUG -- PERM PRE" << std::endl; \ -std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ -std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ -std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ -std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ -std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ -std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ permute##dir(Chi_00,Chi_00);\ permute##dir(Chi_01,Chi_01);\ permute##dir(Chi_02,Chi_02);\ permute##dir(Chi_10,Chi_10);\ permute##dir(Chi_11,Chi_11);\ - permute##dir(Chi_12,Chi_12);\ - std::cout << std::endl << "DEBUG -- PERM POST" << std::endl; \ - std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ - std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ - std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ - std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ - std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ - std::cout << "Chi_12 -- " << Chi_12 << std::endl; + permute##dir(Chi_12,Chi_12); // hspin(0)=fspin(0)+timesI(fspin(3)); // hspin(1)=fspin(1)+timesI(fspin(2)); @@ -192,14 +148,7 @@ std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ Chi_02 = Chimu_02+timesI(Chimu_32);\ Chi_10 = Chimu_10+timesI(Chimu_20);\ Chi_11 = Chimu_11+timesI(Chimu_21);\ - Chi_12 = Chimu_12+timesI(Chimu_22);\ - std::cout << std::endl << "DEBUG -- XP_PROJ" << std::endl; \ - std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ - std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ - std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ - std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ - std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ - std::cout << 
"Chi_12 -- " << Chi_12 << std::endl; + Chi_12 = Chimu_12+timesI(Chimu_22); #define YP_PROJ \ Chi_00 = Chimu_00-Chimu_30;\ @@ -207,14 +156,7 @@ std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ Chi_02 = Chimu_02-Chimu_32;\ Chi_10 = Chimu_10+Chimu_20;\ Chi_11 = Chimu_11+Chimu_21;\ - Chi_12 = Chimu_12+Chimu_22;\ - std::cout << std::endl << "DEBUG -- YP_PROJ" << std::endl; \ - std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ - std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ - std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ - std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ - std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ - std::cout << "Chi_12 -- " << Chi_12 << std::endl; + Chi_12 = Chimu_12+Chimu_22; #define ZP_PROJ \ Chi_00 = Chimu_00+timesI(Chimu_20); \ @@ -222,14 +164,7 @@ std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ Chi_02 = Chimu_02+timesI(Chimu_22); \ Chi_10 = Chimu_10-timesI(Chimu_30); \ Chi_11 = Chimu_11-timesI(Chimu_31); \ - Chi_12 = Chimu_12-timesI(Chimu_32);\ - std::cout << std::endl << "DEBUG -- ZP_PROJ" << std::endl; \ - std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ - std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ - std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ - std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ - std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ - std::cout << "Chi_12 -- " << Chi_12 << std::endl; + Chi_12 = Chimu_12-timesI(Chimu_32); #define TP_PROJ \ Chi_00 = Chimu_00+Chimu_20; \ @@ -237,14 +172,7 @@ std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ Chi_02 = Chimu_02+Chimu_22; \ Chi_10 = Chimu_10+Chimu_30; \ Chi_11 = Chimu_11+Chimu_31; \ - Chi_12 = Chimu_12+Chimu_32;\ - std::cout << std::endl << "DEBUG -- TP_PROJ" << std::endl; \ - std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ - std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ - std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ - std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ - std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ - std::cout << "Chi_12 -- " << Chi_12 << std::endl; + Chi_12 = Chimu_12+Chimu_32; // hspin(0)=fspin(0)-timesI(fspin(3)); @@ -255,14 +183,7 @@ std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ Chi_02 = Chimu_02-timesI(Chimu_32);\ Chi_10 = Chimu_10-timesI(Chimu_20);\ Chi_11 = Chimu_11-timesI(Chimu_21);\ - Chi_12 = Chimu_12-timesI(Chimu_22);\ - std::cout << std::endl << "DEBUG -- XM_PROJ" << std::endl; \ - std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ - std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ - std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ - std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ - std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ - std::cout << "Chi_12 -- " << Chi_12 << std::endl; + Chi_12 = Chimu_12-timesI(Chimu_22); #define YM_PROJ \ Chi_00 = Chimu_00+Chimu_30;\ @@ -270,14 +191,7 @@ std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ Chi_02 = Chimu_02+Chimu_32;\ Chi_10 = Chimu_10-Chimu_20;\ Chi_11 = Chimu_11-Chimu_21;\ - Chi_12 = Chimu_12-Chimu_22;\ - std::cout << std::endl << "DEBUG -- YM_PROJ" << std::endl; \ - std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ - std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ - std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ - std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ - std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ - std::cout << "Chi_12 -- " << Chi_12 << std::endl; + Chi_12 = Chimu_12-Chimu_22; #define ZM_PROJ \ Chi_00 = Chimu_00-timesI(Chimu_20); \ @@ -285,14 +199,7 @@ std::cout << "Chi_12 -- " << Chi_12 << std::endl; 
\ Chi_02 = Chimu_02-timesI(Chimu_22); \ Chi_10 = Chimu_10+timesI(Chimu_30); \ Chi_11 = Chimu_11+timesI(Chimu_31); \ - Chi_12 = Chimu_12+timesI(Chimu_32);\ - std::cout << std::endl << "DEBUG -- ZM_PROJ" << std::endl; \ - std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ - std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ - std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ - std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ - std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ - std::cout << "Chi_12 -- " << Chi_12 << std::endl; + Chi_12 = Chimu_12+timesI(Chimu_32); #define TM_PROJ \ Chi_00 = Chimu_00-Chimu_20; \ @@ -300,14 +207,7 @@ std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ Chi_02 = Chimu_02-Chimu_22; \ Chi_10 = Chimu_10-Chimu_30; \ Chi_11 = Chimu_11-Chimu_31; \ - Chi_12 = Chimu_12-Chimu_32;\ - std::cout << std::endl << "DEBUG -- TM_PROJ" << std::endl; \ - std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ - std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ - std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ - std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ - std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ - std::cout << "Chi_12 -- " << Chi_12 << std::endl; + Chi_12 = Chimu_12-Chimu_32; // fspin(0)=hspin(0); // fspin(1)=hspin(1); @@ -325,20 +225,7 @@ std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ result_22 = timesMinusI(UChi_12);\ result_30 = timesMinusI(UChi_00);\ result_31 = timesMinusI(UChi_01);\ - result_32 = timesMinusI(UChi_02);\ - std::cout << std::endl << "DEBUG -- XP_RECON" << std::endl; \ - std::cout << "result_00 -- " << result_00 << std::endl; \ - std::cout << "result_01 -- " << result_01 << std::endl; \ - std::cout << "result_02 -- " << result_02 << std::endl; \ - std::cout << "result_10 -- " << result_10 << std::endl; \ - std::cout << "result_11 -- " << result_11 << std::endl; \ - std::cout << "result_12 -- " << result_12 << std::endl; \ - std::cout << "result_20 -- " << result_20 << std::endl; \ - std::cout << "result_21 -- " << result_21 << std::endl; \ - std::cout << "result_22 -- " << result_22 << std::endl; \ - std::cout << "result_30 -- " << result_30 << std::endl; \ - std::cout << "result_31 -- " << result_31 << std::endl; \ - std::cout << "result_32 -- " << result_32 << std::endl; + result_32 = timesMinusI(UChi_02); #define XP_RECON_ACCUM\ result_00+=UChi_00;\ @@ -352,20 +239,7 @@ std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ result_22-=timesI(UChi_12);\ result_30-=timesI(UChi_00);\ result_31-=timesI(UChi_01);\ - result_32-=timesI(UChi_02);\ - std::cout << std::endl << "DEBUG -- XP_RECON_ACCUM" << std::endl; \ - std::cout << "result_00 -- " << result_00 << std::endl; \ - std::cout << "result_01 -- " << result_01 << std::endl; \ - std::cout << "result_02 -- " << result_02 << std::endl; \ - std::cout << "result_10 -- " << result_10 << std::endl; \ - std::cout << "result_11 -- " << result_11 << std::endl; \ - std::cout << "result_12 -- " << result_12 << std::endl; \ - std::cout << "result_20 -- " << result_20 << std::endl; \ - std::cout << "result_21 -- " << result_21 << std::endl; \ - std::cout << "result_22 -- " << result_22 << std::endl; \ - std::cout << "result_30 -- " << result_30 << std::endl; \ - std::cout << "result_31 -- " << result_31 << std::endl; \ - std::cout << "result_32 -- " << result_32 << std::endl; + result_32-=timesI(UChi_02); #define XM_RECON\ result_00 = UChi_00;\ @@ -379,20 +253,7 @@ std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ result_22 = timesI(UChi_12);\ result_30 = timesI(UChi_00);\ result_31 = 
timesI(UChi_01);\ - result_32 = timesI(UChi_02);\ - std::cout << std::endl << "DEBUG -- XM_RECON" << std::endl; \ - std::cout << "result_00 -- " << result_00 << std::endl; \ - std::cout << "result_01 -- " << result_01 << std::endl; \ - std::cout << "result_02 -- " << result_02 << std::endl; \ - std::cout << "result_10 -- " << result_10 << std::endl; \ - std::cout << "result_11 -- " << result_11 << std::endl; \ - std::cout << "result_12 -- " << result_12 << std::endl; \ - std::cout << "result_20 -- " << result_20 << std::endl; \ - std::cout << "result_21 -- " << result_21 << std::endl; \ - std::cout << "result_22 -- " << result_22 << std::endl; \ - std::cout << "result_30 -- " << result_30 << std::endl; \ - std::cout << "result_31 -- " << result_31 << std::endl; \ - std::cout << "result_32 -- " << result_32 << std::endl; + result_32 = timesI(UChi_02); #define XM_RECON_ACCUM\ result_00+= UChi_00;\ @@ -406,20 +267,7 @@ std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ result_22+= timesI(UChi_12);\ result_30+= timesI(UChi_00);\ result_31+= timesI(UChi_01);\ - result_32+= timesI(UChi_02);\ - std::cout << std::endl << "DEBUG -- XM_RECON_ACCUM" << std::endl; \ - std::cout << "result_00 -- " << result_00 << std::endl; \ - std::cout << "result_01 -- " << result_01 << std::endl; \ - std::cout << "result_02 -- " << result_02 << std::endl; \ - std::cout << "result_10 -- " << result_10 << std::endl; \ - std::cout << "result_11 -- " << result_11 << std::endl; \ - std::cout << "result_12 -- " << result_12 << std::endl; \ - std::cout << "result_20 -- " << result_20 << std::endl; \ - std::cout << "result_21 -- " << result_21 << std::endl; \ - std::cout << "result_22 -- " << result_22 << std::endl; \ - std::cout << "result_30 -- " << result_30 << std::endl; \ - std::cout << "result_31 -- " << result_31 << std::endl; \ - std::cout << "result_32 -- " << result_32 << std::endl; + result_32+= timesI(UChi_02); #define YP_RECON_ACCUM\ result_00+= UChi_00;\ @@ -433,20 +281,7 @@ std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ result_22+= UChi_12;\ result_30-= UChi_00;\ result_31-= UChi_01;\ - result_32-= UChi_02;\ - std::cout << std::endl << "DEBUG -- YP_RECON_ACCUM" << std::endl; \ - std::cout << "result_00 -- " << result_00 << std::endl; \ - std::cout << "result_01 -- " << result_01 << std::endl; \ - std::cout << "result_02 -- " << result_02 << std::endl; \ - std::cout << "result_10 -- " << result_10 << std::endl; \ - std::cout << "result_11 -- " << result_11 << std::endl; \ - std::cout << "result_12 -- " << result_12 << std::endl; \ - std::cout << "result_20 -- " << result_20 << std::endl; \ - std::cout << "result_21 -- " << result_21 << std::endl; \ - std::cout << "result_22 -- " << result_22 << std::endl; \ - std::cout << "result_30 -- " << result_30 << std::endl; \ - std::cout << "result_31 -- " << result_31 << std::endl; \ - std::cout << "result_32 -- " << result_32 << std::endl; + result_32-= UChi_02; #define YM_RECON_ACCUM\ result_00+= UChi_00;\ @@ -460,20 +295,7 @@ std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ result_22-= UChi_12;\ result_30+= UChi_00;\ result_31+= UChi_01;\ - result_32+= UChi_02;\ - std::cout << std::endl << "DEBUG -- YM_RECON_ACCUM" << std::endl; \ - std::cout << "result_00 -- " << result_00 << std::endl; \ - std::cout << "result_01 -- " << result_01 << std::endl; \ - std::cout << "result_02 -- " << result_02 << std::endl; \ - std::cout << "result_10 -- " << result_10 << std::endl; \ - std::cout << "result_11 -- " << result_11 << std::endl; \ - std::cout << "result_12 -- " 
<< result_12 << std::endl; \ - std::cout << "result_20 -- " << result_20 << std::endl; \ - std::cout << "result_21 -- " << result_21 << std::endl; \ - std::cout << "result_22 -- " << result_22 << std::endl; \ - std::cout << "result_30 -- " << result_30 << std::endl; \ - std::cout << "result_31 -- " << result_31 << std::endl; \ - std::cout << "result_32 -- " << result_32 << std::endl; + result_32+= UChi_02; #define ZP_RECON_ACCUM\ result_00+= UChi_00;\ @@ -487,20 +309,7 @@ std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ result_22-= timesI(UChi_02); \ result_30+= timesI(UChi_10); \ result_31+= timesI(UChi_11); \ - result_32+= timesI(UChi_12);\ - std::cout << std::endl << "DEBUG -- ZP_RECON_ACCUM" << std::endl; \ - std::cout << "result_00 -- " << result_00 << std::endl; \ - std::cout << "result_01 -- " << result_01 << std::endl; \ - std::cout << "result_02 -- " << result_02 << std::endl; \ - std::cout << "result_10 -- " << result_10 << std::endl; \ - std::cout << "result_11 -- " << result_11 << std::endl; \ - std::cout << "result_12 -- " << result_12 << std::endl; \ - std::cout << "result_20 -- " << result_20 << std::endl; \ - std::cout << "result_21 -- " << result_21 << std::endl; \ - std::cout << "result_22 -- " << result_22 << std::endl; \ - std::cout << "result_30 -- " << result_30 << std::endl; \ - std::cout << "result_31 -- " << result_31 << std::endl; \ - std::cout << "result_32 -- " << result_32 << std::endl; + result_32+= timesI(UChi_12); #define ZM_RECON_ACCUM\ result_00+= UChi_00;\ @@ -514,20 +323,7 @@ std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ result_22+= timesI(UChi_02); \ result_30-= timesI(UChi_10); \ result_31-= timesI(UChi_11); \ - result_32-= timesI(UChi_12);\ - std::cout << std::endl << "DEBUG -- ZM_RECON_ACCUM" << std::endl; \ - std::cout << "result_00 -- " << result_00 << std::endl; \ - std::cout << "result_01 -- " << result_01 << std::endl; \ - std::cout << "result_02 -- " << result_02 << std::endl; \ - std::cout << "result_10 -- " << result_10 << std::endl; \ - std::cout << "result_11 -- " << result_11 << std::endl; \ - std::cout << "result_12 -- " << result_12 << std::endl; \ - std::cout << "result_20 -- " << result_20 << std::endl; \ - std::cout << "result_21 -- " << result_21 << std::endl; \ - std::cout << "result_22 -- " << result_22 << std::endl; \ - std::cout << "result_30 -- " << result_30 << std::endl; \ - std::cout << "result_31 -- " << result_31 << std::endl; \ - std::cout << "result_32 -- " << result_32 << std::endl; + result_32-= timesI(UChi_12); #define TP_RECON_ACCUM\ result_00+= UChi_00;\ @@ -541,20 +337,7 @@ std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ result_22+= UChi_02; \ result_30+= UChi_10; \ result_31+= UChi_11; \ - result_32+= UChi_12;\ - std::cout << std::endl << "DEBUG -- TP_RECON_ACCUM" << std::endl; \ - std::cout << "result_00 -- " << result_00 << std::endl; \ - std::cout << "result_01 -- " << result_01 << std::endl; \ - std::cout << "result_02 -- " << result_02 << std::endl; \ - std::cout << "result_10 -- " << result_10 << std::endl; \ - std::cout << "result_11 -- " << result_11 << std::endl; \ - std::cout << "result_12 -- " << result_12 << std::endl; \ - std::cout << "result_20 -- " << result_20 << std::endl; \ - std::cout << "result_21 -- " << result_21 << std::endl; \ - std::cout << "result_22 -- " << result_22 << std::endl; \ - std::cout << "result_30 -- " << result_30 << std::endl; \ - std::cout << "result_31 -- " << result_31 << std::endl; \ - std::cout << "result_32 -- " << result_32 << std::endl; + result_32+= 
UChi_12; #define TM_RECON_ACCUM\ result_00+= UChi_00;\ @@ -568,20 +351,7 @@ std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ result_22-= UChi_02; \ result_30-= UChi_10; \ result_31-= UChi_11; \ - result_32-= UChi_12;\ - std::cout << std::endl << "DEBUG -- TM_RECON_ACCUM" << std::endl; \ - std::cout << "result_00 -- " << result_00 << std::endl; \ - std::cout << "result_01 -- " << result_01 << std::endl; \ - std::cout << "result_02 -- " << result_02 << std::endl; \ - std::cout << "result_10 -- " << result_10 << std::endl; \ - std::cout << "result_11 -- " << result_11 << std::endl; \ - std::cout << "result_12 -- " << result_12 << std::endl; \ - std::cout << "result_20 -- " << result_20 << std::endl; \ - std::cout << "result_21 -- " << result_21 << std::endl; \ - std::cout << "result_22 -- " << result_22 << std::endl; \ - std::cout << "result_30 -- " << result_30 << std::endl; \ - std::cout << "result_31 -- " << result_31 << std::endl; \ - std::cout << "result_32 -- " << result_32 << std::endl; + result_32-= UChi_12; #define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \ SE=st.GetEntry(ptype,DIR,ss); \ @@ -598,7 +368,7 @@ std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ LOAD_CHI; \ } \ MULT_2SPIN(DIR); \ - RECON; + RECON; #define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \ SE=st.GetEntry(ptype,DIR,ss); \ @@ -644,19 +414,6 @@ std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ vstream(ref()(3)(0),result_30); \ vstream(ref()(3)(1),result_31); \ vstream(ref()(3)(2),result_32); \ - std::cout << std::endl << "DEBUG -- RESULT" << std::endl; \ - std::cout << "result_00 -- " << result_00 << std::endl; \ - std::cout << "result_01 -- " << result_01 << std::endl; \ - std::cout << "result_02 -- " << result_02 << std::endl; \ - std::cout << "result_10 -- " << result_10 << std::endl; \ - std::cout << "result_11 -- " << result_11 << std::endl; \ - std::cout << "result_12 -- " << result_12 << std::endl; \ - std::cout << "result_20 -- " << result_20 << std::endl; \ - std::cout << "result_21 -- " << result_21 << std::endl; \ - std::cout << "result_22 -- " << result_22 << std::endl; \ - std::cout << "result_30 -- " << result_30 << std::endl; \ - std::cout << "result_31 -- " << result_31 << std::endl; \ - std::cout << "result_32 -- " << result_32 << std::endl;\ } #define HAND_RESULT_EXT(ss) \ @@ -674,19 +431,6 @@ std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ ref()(3)(0)+=result_30; \ ref()(3)(1)+=result_31; \ ref()(3)(2)+=result_32; \ - std::cout << std::endl << "DEBUG -- RESULT EXT" << std::endl; \ - std::cout << "result_00 -- " << result_00 << std::endl; \ - std::cout << "result_01 -- " << result_01 << std::endl; \ - std::cout << "result_02 -- " << result_02 << std::endl; \ - std::cout << "result_10 -- " << result_10 << std::endl; \ - std::cout << "result_11 -- " << result_11 << std::endl; \ - std::cout << "result_12 -- " << result_12 << std::endl; \ - std::cout << "result_20 -- " << result_20 << std::endl; \ - std::cout << "result_21 -- " << result_21 << std::endl; \ - std::cout << "result_22 -- " << result_22 << std::endl; \ - std::cout << "result_30 -- " << result_30 << std::endl; \ - std::cout << "result_31 -- " << result_31 << std::endl; \ - std::cout << "result_32 -- " << result_32 << std::endl;\ } @@ -720,10 +464,7 @@ std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ Simd U_20; \ Simd U_01; \ Simd U_11; \ - Simd U_21;\ - Simd debugreg;\ - svbool_t pg1; \ - pg1 = svptrue_b64(); \ + Simd U_21; #define ZERO_RESULT \ result_00=Zero(); \ @@ -737,7 +478,7 @@ std::cout << "Chi_12 -- " << Chi_12 << 
std::endl; \ result_22=Zero(); \ result_30=Zero(); \ result_31=Zero(); \ - result_32=Zero(); + result_32=Zero(); #define Chimu_00 Chi_00 #define Chimu_01 Chi_01 @@ -754,7 +495,7 @@ std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ NAMESPACE_BEGIN(Grid); -template void +template void WilsonKernels::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { @@ -789,7 +530,7 @@ void WilsonKernels::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView StencilEntry *SE; int offset,local,perm, ptype; - + HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON); HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM); HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); @@ -801,7 +542,7 @@ void WilsonKernels::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView HAND_RESULT(ss); } -template void +template void WilsonKernels::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { @@ -848,7 +589,7 @@ void WilsonKernels::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi HAND_RESULT(ss); } -template void +template void WilsonKernels::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { @@ -900,29 +641,29 @@ void WilsonKernels::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldVi ////////////// Wilson ; uses this implementation ///////////////////// NAMESPACE_END(Grid); -#undef LOAD_CHIMU -#undef LOAD_CHI +#undef LOAD_CHIMU +#undef LOAD_CHI #undef MULT_2SPIN #undef PERMUTE_DIR -#undef XP_PROJ -#undef YP_PROJ -#undef ZP_PROJ -#undef TP_PROJ -#undef XM_PROJ -#undef YM_PROJ -#undef ZM_PROJ -#undef TM_PROJ -#undef XP_RECON -#undef XP_RECON_ACCUM -#undef XM_RECON -#undef XM_RECON_ACCUM -#undef YP_RECON_ACCUM -#undef YM_RECON_ACCUM -#undef ZP_RECON_ACCUM -#undef ZM_RECON_ACCUM -#undef TP_RECON_ACCUM -#undef TM_RECON_ACCUM -#undef ZERO_RESULT +#undef XP_PROJ +#undef YP_PROJ +#undef ZP_PROJ +#undef TP_PROJ +#undef XM_PROJ +#undef YM_PROJ +#undef ZM_PROJ +#undef TM_PROJ +#undef XP_RECON +#undef XP_RECON_ACCUM +#undef XM_RECON +#undef XM_RECON_ACCUM +#undef YP_RECON_ACCUM +#undef YM_RECON_ACCUM +#undef ZP_RECON_ACCUM +#undef ZM_RECON_ACCUM +#undef TP_RECON_ACCUM +#undef TM_RECON_ACCUM +#undef ZERO_RESULT #undef Chimu_00 #undef Chimu_01 #undef Chimu_02 diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h.debug b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h.debug new file mode 100644 index 00000000..6f3edbb5 --- /dev/null +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h.debug @@ -0,0 +1,943 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/qcd/action/fermion/WilsonKernelsHand.cc + + Copyright (C) 2015 + +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ + +#pragma once + +#include + + +#undef LOAD_CHIMU +#undef LOAD_CHI +#undef MULT_2SPIN +#undef PERMUTE_DIR +#undef XP_PROJ +#undef YP_PROJ +#undef ZP_PROJ +#undef TP_PROJ +#undef XM_PROJ +#undef YM_PROJ +#undef ZM_PROJ +#undef TM_PROJ +#undef XP_RECON +#undef XP_RECON_ACCUM +#undef XM_RECON +#undef XM_RECON_ACCUM +#undef YP_RECON_ACCUM +#undef YM_RECON_ACCUM +#undef ZP_RECON_ACCUM +#undef ZM_RECON_ACCUM +#undef TP_RECON_ACCUM +#undef TM_RECON_ACCUM +#undef ZERO_RESULT +#undef Chimu_00 +#undef Chimu_01 +#undef Chimu_02 +#undef Chimu_10 +#undef Chimu_11 +#undef Chimu_12 +#undef Chimu_20 +#undef Chimu_21 +#undef Chimu_22 +#undef Chimu_30 +#undef Chimu_31 +#undef Chimu_32 +#undef HAND_STENCIL_LEG +#undef HAND_STENCIL_LEG_INT +#undef HAND_STENCIL_LEG_EXT +#undef HAND_RESULT +#undef HAND_RESULT_INT +#undef HAND_RESULT_EXT + +#define REGISTER + +#define LOAD_CHIMU \ + {const SiteSpinor & ref (in[offset]); \ + Chimu_00=ref()(0)(0);\ + Chimu_01=ref()(0)(1);\ + Chimu_02=ref()(0)(2);\ + Chimu_10=ref()(1)(0);\ + Chimu_11=ref()(1)(1);\ + Chimu_12=ref()(1)(2);\ + Chimu_20=ref()(2)(0);\ + Chimu_21=ref()(2)(1);\ + Chimu_22=ref()(2)(2);\ + Chimu_30=ref()(3)(0);\ + Chimu_31=ref()(3)(1);\ + Chimu_32=ref()(3)(2);\ + std::cout << std::endl << "DEBUG -- LOAD_CHIMU" << std::endl; \ + std::cout << "Chimu_00 -- " << Chimu_00 << std::endl; \ + std::cout << "Chimu_01 -- " << Chimu_01 << std::endl; \ + std::cout << "Chimu_02 -- " << Chimu_02 << std::endl; \ + std::cout << "Chimu_10 -- " << Chimu_10 << std::endl; \ + std::cout << "Chimu_11 -- " << Chimu_11 << std::endl; \ + std::cout << "Chimu_12 -- " << Chimu_12 << std::endl; \ + std::cout << "Chimu_20 -- " << Chimu_20 << std::endl; \ + std::cout << "Chimu_21 -- " << Chimu_21 << std::endl; \ + std::cout << "Chimu_22 -- " << Chimu_22 << std::endl; \ + std::cout << "Chimu_30 -- " << Chimu_30 << std::endl; \ + std::cout << "Chimu_31 -- " << Chimu_31 << std::endl; \ + std::cout << "Chimu_32 -- " << Chimu_32 << std::endl; \ +} + +#define LOAD_CHI\ + {const SiteHalfSpinor &ref(buf[offset]); \ + Chi_00 = ref()(0)(0);\ + Chi_01 = ref()(0)(1);\ + Chi_02 = ref()(0)(2);\ + Chi_10 = ref()(1)(0);\ + Chi_11 = ref()(1)(1);\ + Chi_12 = ref()(1)(2);\ + std::cout << std::endl << "DEBUG -- LOAD_CHI" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ + } + +// To splat or not to splat depends on the implementation +#define MULT_2SPIN(A)\ + {auto & ref(U[sU](A)); \ + Impl::loadLinkElement(U_00,ref()(0,0)); \ + Impl::loadLinkElement(U_10,ref()(1,0)); \ + Impl::loadLinkElement(U_20,ref()(2,0)); \ + Impl::loadLinkElement(U_01,ref()(0,1)); \ + Impl::loadLinkElement(U_11,ref()(1,1)); \ + Impl::loadLinkElement(U_21,ref()(2,1)); \ + UChi_00 = U_00*Chi_00;\ + UChi_10 = U_00*Chi_10;\ + UChi_01 = U_10*Chi_00;\ + UChi_11 = U_10*Chi_10;\ + UChi_02 = U_20*Chi_00;\ + UChi_12 = U_20*Chi_10;\ + UChi_00+= U_01*Chi_01;\ + 
UChi_10+= U_01*Chi_11;\ + UChi_01+= U_11*Chi_01;\ + UChi_11+= U_11*Chi_11;\ + UChi_02+= U_21*Chi_01;\ + UChi_12+= U_21*Chi_11;\ + Impl::loadLinkElement(U_00,ref()(0,2)); \ + Impl::loadLinkElement(U_10,ref()(1,2)); \ + Impl::loadLinkElement(U_20,ref()(2,2)); \ + UChi_00+= U_00*Chi_02;\ + UChi_10+= U_00*Chi_12;\ + UChi_01+= U_10*Chi_02;\ + UChi_11+= U_10*Chi_12;\ + UChi_02+= U_20*Chi_02;\ + UChi_12+= U_20*Chi_12;\ + std::cout << std::endl << "DEBUG -- MULT_2SPIN" << std::endl; \ + std::cout << "UChi_00 -- " << UChi_00 << std::endl; \ + std::cout << "UChi_01 -- " << UChi_01 << std::endl; \ + std::cout << "UChi_02 -- " << UChi_02 << std::endl; \ + std::cout << "UChi_10 -- " << UChi_10 << std::endl; \ + std::cout << "UChi_11 -- " << UChi_11 << std::endl; \ + std::cout << "UChi_12 -- " << UChi_12 << std::endl; \ + } + + +#define PERMUTE_DIR(dir) \ +std::cout << std::endl << "DEBUG -- PERM PRE" << std::endl; \ +std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ +std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ +std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ +std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ +std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ +std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ + permute##dir(Chi_00,Chi_00);\ + permute##dir(Chi_01,Chi_01);\ + permute##dir(Chi_02,Chi_02);\ + permute##dir(Chi_10,Chi_10);\ + permute##dir(Chi_11,Chi_11);\ + permute##dir(Chi_12,Chi_12);\ + std::cout << std::endl << "DEBUG -- PERM POST" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; + +// hspin(0)=fspin(0)+timesI(fspin(3)); +// hspin(1)=fspin(1)+timesI(fspin(2)); +#define XP_PROJ \ + Chi_00 = Chimu_00+timesI(Chimu_30);\ + Chi_01 = Chimu_01+timesI(Chimu_31);\ + Chi_02 = Chimu_02+timesI(Chimu_32);\ + Chi_10 = Chimu_10+timesI(Chimu_20);\ + Chi_11 = Chimu_11+timesI(Chimu_21);\ + Chi_12 = Chimu_12+timesI(Chimu_22);\ + std::cout << std::endl << "DEBUG -- XP_PROJ" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; + +#define YP_PROJ \ + Chi_00 = Chimu_00-Chimu_30;\ + Chi_01 = Chimu_01-Chimu_31;\ + Chi_02 = Chimu_02-Chimu_32;\ + Chi_10 = Chimu_10+Chimu_20;\ + Chi_11 = Chimu_11+Chimu_21;\ + Chi_12 = Chimu_12+Chimu_22;\ + std::cout << std::endl << "DEBUG -- YP_PROJ" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; + +#define ZP_PROJ \ + Chi_00 = Chimu_00+timesI(Chimu_20); \ + Chi_01 = Chimu_01+timesI(Chimu_21); \ + Chi_02 = Chimu_02+timesI(Chimu_22); \ + Chi_10 = Chimu_10-timesI(Chimu_30); \ + Chi_11 = Chimu_11-timesI(Chimu_31); \ + Chi_12 = Chimu_12-timesI(Chimu_32);\ + std::cout << std::endl << "DEBUG -- ZP_PROJ" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; 
\ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; + +#define TP_PROJ \ + Chi_00 = Chimu_00+Chimu_20; \ + Chi_01 = Chimu_01+Chimu_21; \ + Chi_02 = Chimu_02+Chimu_22; \ + Chi_10 = Chimu_10+Chimu_30; \ + Chi_11 = Chimu_11+Chimu_31; \ + Chi_12 = Chimu_12+Chimu_32;\ + std::cout << std::endl << "DEBUG -- TP_PROJ" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; + + +// hspin(0)=fspin(0)-timesI(fspin(3)); +// hspin(1)=fspin(1)-timesI(fspin(2)); +#define XM_PROJ \ + Chi_00 = Chimu_00-timesI(Chimu_30);\ + Chi_01 = Chimu_01-timesI(Chimu_31);\ + Chi_02 = Chimu_02-timesI(Chimu_32);\ + Chi_10 = Chimu_10-timesI(Chimu_20);\ + Chi_11 = Chimu_11-timesI(Chimu_21);\ + Chi_12 = Chimu_12-timesI(Chimu_22);\ + std::cout << std::endl << "DEBUG -- XM_PROJ" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; + +#define YM_PROJ \ + Chi_00 = Chimu_00+Chimu_30;\ + Chi_01 = Chimu_01+Chimu_31;\ + Chi_02 = Chimu_02+Chimu_32;\ + Chi_10 = Chimu_10-Chimu_20;\ + Chi_11 = Chimu_11-Chimu_21;\ + Chi_12 = Chimu_12-Chimu_22;\ + std::cout << std::endl << "DEBUG -- YM_PROJ" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; + +#define ZM_PROJ \ + Chi_00 = Chimu_00-timesI(Chimu_20); \ + Chi_01 = Chimu_01-timesI(Chimu_21); \ + Chi_02 = Chimu_02-timesI(Chimu_22); \ + Chi_10 = Chimu_10+timesI(Chimu_30); \ + Chi_11 = Chimu_11+timesI(Chimu_31); \ + Chi_12 = Chimu_12+timesI(Chimu_32);\ + std::cout << std::endl << "DEBUG -- ZM_PROJ" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; + +#define TM_PROJ \ + Chi_00 = Chimu_00-Chimu_20; \ + Chi_01 = Chimu_01-Chimu_21; \ + Chi_02 = Chimu_02-Chimu_22; \ + Chi_10 = Chimu_10-Chimu_30; \ + Chi_11 = Chimu_11-Chimu_31; \ + Chi_12 = Chimu_12-Chimu_32;\ + std::cout << std::endl << "DEBUG -- TM_PROJ" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; + +// fspin(0)=hspin(0); +// fspin(1)=hspin(1); +// fspin(2)=timesMinusI(hspin(1)); +// fspin(3)=timesMinusI(hspin(0)); +#define XP_RECON\ + result_00 = UChi_00;\ + result_01 = 
UChi_01;\ + result_02 = UChi_02;\ + result_10 = UChi_10;\ + result_11 = UChi_11;\ + result_12 = UChi_12;\ + result_20 = timesMinusI(UChi_10);\ + result_21 = timesMinusI(UChi_11);\ + result_22 = timesMinusI(UChi_12);\ + result_30 = timesMinusI(UChi_00);\ + result_31 = timesMinusI(UChi_01);\ + result_32 = timesMinusI(UChi_02);\ + std::cout << std::endl << "DEBUG -- XP_RECON" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define XP_RECON_ACCUM\ + result_00+=UChi_00;\ + result_01+=UChi_01;\ + result_02+=UChi_02;\ + result_10+=UChi_10;\ + result_11+=UChi_11;\ + result_12+=UChi_12;\ + result_20-=timesI(UChi_10);\ + result_21-=timesI(UChi_11);\ + result_22-=timesI(UChi_12);\ + result_30-=timesI(UChi_00);\ + result_31-=timesI(UChi_01);\ + result_32-=timesI(UChi_02);\ + std::cout << std::endl << "DEBUG -- XP_RECON_ACCUM" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define XM_RECON\ + result_00 = UChi_00;\ + result_01 = UChi_01;\ + result_02 = UChi_02;\ + result_10 = UChi_10;\ + result_11 = UChi_11;\ + result_12 = UChi_12;\ + result_20 = timesI(UChi_10);\ + result_21 = timesI(UChi_11);\ + result_22 = timesI(UChi_12);\ + result_30 = timesI(UChi_00);\ + result_31 = timesI(UChi_01);\ + result_32 = timesI(UChi_02);\ + std::cout << std::endl << "DEBUG -- XM_RECON" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define XM_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= 
UChi_12;\ + result_20+= timesI(UChi_10);\ + result_21+= timesI(UChi_11);\ + result_22+= timesI(UChi_12);\ + result_30+= timesI(UChi_00);\ + result_31+= timesI(UChi_01);\ + result_32+= timesI(UChi_02);\ + std::cout << std::endl << "DEBUG -- XM_RECON_ACCUM" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define YP_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20+= UChi_10;\ + result_21+= UChi_11;\ + result_22+= UChi_12;\ + result_30-= UChi_00;\ + result_31-= UChi_01;\ + result_32-= UChi_02;\ + std::cout << std::endl << "DEBUG -- YP_RECON_ACCUM" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define YM_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20-= UChi_10;\ + result_21-= UChi_11;\ + result_22-= UChi_12;\ + result_30+= UChi_00;\ + result_31+= UChi_01;\ + result_32+= UChi_02;\ + std::cout << std::endl << "DEBUG -- YM_RECON_ACCUM" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define ZP_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20-= timesI(UChi_00); \ + result_21-= timesI(UChi_01); \ + result_22-= timesI(UChi_02); \ + result_30+= timesI(UChi_10); \ + result_31+= timesI(UChi_11); \ + result_32+= 
timesI(UChi_12);\ + std::cout << std::endl << "DEBUG -- ZP_RECON_ACCUM" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define ZM_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20+= timesI(UChi_00); \ + result_21+= timesI(UChi_01); \ + result_22+= timesI(UChi_02); \ + result_30-= timesI(UChi_10); \ + result_31-= timesI(UChi_11); \ + result_32-= timesI(UChi_12);\ + std::cout << std::endl << "DEBUG -- ZM_RECON_ACCUM" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define TP_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20+= UChi_00; \ + result_21+= UChi_01; \ + result_22+= UChi_02; \ + result_30+= UChi_10; \ + result_31+= UChi_11; \ + result_32+= UChi_12;\ + std::cout << std::endl << "DEBUG -- TP_RECON_ACCUM" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define TM_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20-= UChi_00; \ + result_21-= UChi_01; \ + result_22-= UChi_02; \ + result_30-= UChi_10; \ + result_31-= UChi_11; \ + result_32-= UChi_12;\ + std::cout << std::endl << "DEBUG -- TM_RECON_ACCUM" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << 
result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \ + SE=st.GetEntry(ptype,DIR,ss); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ( local ) { \ + LOAD_CHIMU; \ + PROJ; \ + if ( perm) { \ + PERMUTE_DIR(PERM); \ + } \ + } else { \ + LOAD_CHI; \ + } \ + MULT_2SPIN(DIR); \ + RECON; + +#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \ + SE=st.GetEntry(ptype,DIR,ss); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ( local ) { \ + LOAD_CHIMU; \ + PROJ; \ + if ( perm) { \ + PERMUTE_DIR(PERM); \ + } \ + } else if ( st.same_node[DIR] ) { \ + LOAD_CHI; \ + } \ + if (local || st.same_node[DIR] ) { \ + MULT_2SPIN(DIR); \ + RECON; \ + } + +#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \ + SE=st.GetEntry(ptype,DIR,ss); \ + offset = SE->_offset; \ + if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \ + LOAD_CHI; \ + MULT_2SPIN(DIR); \ + RECON; \ + nmu++; \ + } + +#define HAND_RESULT(ss) \ + { \ + SiteSpinor & ref (out[ss]); \ + vstream(ref()(0)(0),result_00); \ + vstream(ref()(0)(1),result_01); \ + vstream(ref()(0)(2),result_02); \ + vstream(ref()(1)(0),result_10); \ + vstream(ref()(1)(1),result_11); \ + vstream(ref()(1)(2),result_12); \ + vstream(ref()(2)(0),result_20); \ + vstream(ref()(2)(1),result_21); \ + vstream(ref()(2)(2),result_22); \ + vstream(ref()(3)(0),result_30); \ + vstream(ref()(3)(1),result_31); \ + vstream(ref()(3)(2),result_32); \ + std::cout << std::endl << "DEBUG -- RESULT" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl;\ + } + +#define HAND_RESULT_EXT(ss) \ + if (nmu){ \ + SiteSpinor & ref (out[ss]); \ + ref()(0)(0)+=result_00; \ + ref()(0)(1)+=result_01; \ + ref()(0)(2)+=result_02; \ + ref()(1)(0)+=result_10; \ + ref()(1)(1)+=result_11; \ + ref()(1)(2)+=result_12; \ + ref()(2)(0)+=result_20; \ + ref()(2)(1)+=result_21; \ + ref()(2)(2)+=result_22; \ + ref()(3)(0)+=result_30; \ + ref()(3)(1)+=result_31; \ + ref()(3)(2)+=result_32; \ + std::cout << std::endl << "DEBUG -- RESULT EXT" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << 
result_10 << std::endl; \
+  std::cout << "result_11 -- " << result_11 << std::endl; \
+  std::cout << "result_12 -- " << result_12 << std::endl; \
+  std::cout << "result_20 -- " << result_20 << std::endl; \
+  std::cout << "result_21 -- " << result_21 << std::endl; \
+  std::cout << "result_22 -- " << result_22 << std::endl; \
+  std::cout << "result_30 -- " << result_30 << std::endl; \
+  std::cout << "result_31 -- " << result_31 << std::endl; \
+  std::cout << "result_32 -- " << result_32 << std::endl;\
+  }
+
+
+#define HAND_DECLARATIONS(a) \
+  Simd result_00; \
+  Simd result_01; \
+  Simd result_02; \
+  Simd result_10; \
+  Simd result_11; \
+  Simd result_12; \
+  Simd result_20; \
+  Simd result_21; \
+  Simd result_22; \
+  Simd result_30; \
+  Simd result_31; \
+  Simd result_32; \
+  Simd Chi_00; \
+  Simd Chi_01; \
+  Simd Chi_02; \
+  Simd Chi_10; \
+  Simd Chi_11; \
+  Simd Chi_12; \
+  Simd UChi_00; \
+  Simd UChi_01; \
+  Simd UChi_02; \
+  Simd UChi_10; \
+  Simd UChi_11; \
+  Simd UChi_12; \
+  Simd U_00; \
+  Simd U_10; \
+  Simd U_20; \
+  Simd U_01; \
+  Simd U_11; \
+  Simd U_21; \
+  Simd debugreg; \
+  svbool_t pg1; \
+  pg1 = svptrue_b64();
+
+#define ZERO_RESULT \
+  result_00=Zero(); \
+  result_01=Zero(); \
+  result_02=Zero(); \
+  result_10=Zero(); \
+  result_11=Zero(); \
+  result_12=Zero(); \
+  result_20=Zero(); \
+  result_21=Zero(); \
+  result_22=Zero(); \
+  result_30=Zero(); \
+  result_31=Zero(); \
+  result_32=Zero();
+
+#define Chimu_00 Chi_00
+#define Chimu_01 Chi_01
+#define Chimu_02 Chi_02
+#define Chimu_10 Chi_10
+#define Chimu_11 Chi_11
+#define Chimu_12 Chi_12
+#define Chimu_20 UChi_00
+#define Chimu_21 UChi_01
+#define Chimu_22 UChi_02
+#define Chimu_30 UChi_10
+#define Chimu_31 UChi_11
+#define Chimu_32 UChi_12
+
+NAMESPACE_BEGIN(Grid);
+
+template<class Impl> void
+WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+                                  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
+{
+// T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
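// Note on the register map above (annotation derived from the macros, not new
// code): Chimu_20..Chimu_32 are #define'd onto the UChi_* registers, so the
// lower two spin components of the loaded spinor share storage with the
// colour-multiply results. This is safe because every *_PROJ macro has already
// consumed all four Chimu components into Chi_* before MULT_2SPIN overwrites
// UChi_*, keeping the hand-unrolled kernel at roughly 30 live Simd values.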
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+
+  HAND_DECLARATIONS(ignore);
+
+  int offset,local,perm, ptype;
+  StencilEntry *SE;
+
+  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
+  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
+  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
+  HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
+  HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
+  HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
+  HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
+  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
+  HAND_RESULT(ss);
+}
+
+template<class Impl>
+void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+                                          int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
+{
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+
+  HAND_DECLARATIONS(ignore);
+
+  StencilEntry *SE;
+  int offset,local,perm, ptype;
+
+  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
+  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
+  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
+  HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM);
+  HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM);
+  HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM);
+  HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
+  HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM);
+  HAND_RESULT(ss);
+}
+
+template<class Impl> void
+WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+                                     int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
+{
+// T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+
+  HAND_DECLARATIONS(ignore);
+
+  int offset,local,perm, ptype;
+  StencilEntry *SE;
+  ZERO_RESULT;
+  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
+  HAND_RESULT(ss);
+}
+
+template<class Impl>
+void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+                                             int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
+{
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+
+  HAND_DECLARATIONS(ignore);
+
+  StencilEntry *SE;
+  int offset,local,perm, ptype;
+  ZERO_RESULT;
+  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
+  HAND_RESULT(ss);
+}
+
+template<class Impl> void
+WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+                                     int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
+{
+// T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
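// Annotation on the Int/Ext split (derived from the macros defined earlier in
// this file): HAND_STENCIL_LEG_INT processes only legs that are local or whose
// neighbour lives on the same node, so it can run while halo communication is
// in flight; HAND_STENCIL_LEG_EXT later picks up just the off-node legs from
// the comms buffer and counts them in nmu, which lets HAND_RESULT_EXT skip the
// site entirely when no halo contribution arrived.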
+ typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + int offset, ptype; + StencilEntry *SE; + int nmu=0; + ZERO_RESULT; + HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM); + HAND_RESULT_EXT(ss); +} + +template +void WilsonKernels::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionFieldView &in, FermionFieldView &out) +{ + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + StencilEntry *SE; + int offset, ptype; + int nmu=0; + ZERO_RESULT; + HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM); + HAND_RESULT_EXT(ss); +} + +////////////// Wilson ; uses this implementation ///////////////////// + +NAMESPACE_END(Grid); +#undef LOAD_CHIMU +#undef LOAD_CHI +#undef MULT_2SPIN +#undef PERMUTE_DIR +#undef XP_PROJ +#undef YP_PROJ +#undef ZP_PROJ +#undef TP_PROJ +#undef XM_PROJ +#undef YM_PROJ +#undef ZM_PROJ +#undef TM_PROJ +#undef XP_RECON +#undef XP_RECON_ACCUM +#undef XM_RECON +#undef XM_RECON_ACCUM +#undef YP_RECON_ACCUM +#undef YM_RECON_ACCUM +#undef ZP_RECON_ACCUM +#undef ZM_RECON_ACCUM +#undef TP_RECON_ACCUM +#undef TM_RECON_ACCUM +#undef ZERO_RESULT +#undef Chimu_00 +#undef Chimu_01 +#undef Chimu_02 +#undef Chimu_10 +#undef Chimu_11 +#undef Chimu_12 +#undef Chimu_20 +#undef Chimu_21 +#undef Chimu_22 +#undef Chimu_30 +#undef Chimu_31 +#undef Chimu_32 +#undef HAND_STENCIL_LEG +#undef HAND_STENCIL_LEG_INT +#undef HAND_STENCIL_LEG_EXT +#undef HAND_RESULT +#undef HAND_RESULT_INT +#undef HAND_RESULT_EXT diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h index e2d731e3..52dd8320 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_double.h +++ b/Grid/simd/Fujitsu_A64FX_asm_double.h @@ -443,7 +443,6 @@ asm ( \ #define YP_PROJ_A64FXd \ { \ asm ( \ - "ptrue p5.d \n\t" \ "fsub z12.d, p5/m, z12.d, z27.d \n\t" \ "fsub z13.d, p5/m, z13.d, z28.d \n\t" \ "fsub z14.d, p5/m, z14.d, z29.d \n\t" \ @@ -459,7 +458,6 @@ asm ( \ #define ZP_PROJ_A64FXd \ { \ asm ( \ - "ptrue p5.d \n\t" \ "fcadd z12.d, p5/m, z12.d, z24.d, 90 \n\t" \ "fcadd z13.d, p5/m, z13.d, z25.d, 90 \n\t" \ "fcadd z14.d, p5/m, z14.d, z26.d, 90 \n\t" \ @@ -475,7 +473,6 @@ asm ( \ #define TP_PROJ_A64FXd \ { \ asm ( \ - "ptrue p5.d \n\t" \ "fadd z12.d, p5/m, z12.d, z24.d \n\t" \ "fadd z13.d, p5/m, z13.d, z25.d \n\t" \ "fadd z14.d, p5/m, z14.d, z26.d \n\t" \ @@ -491,7 +488,6 @@ asm ( \ #define XM_PROJ_A64FXd \ { \ asm ( \ - "ptrue p5.d \n\t" \ "fcadd z12.d, p5/m, z12.d, z27.d, 270 \n\t" \ "fcadd z13.d, p5/m, z13.d, z28.d, 270 \n\t" \ "fcadd z14.d, p5/m, z14.d, z29.d, 270 \n\t" \ @@ -533,7 +529,6 @@ asm ( \ #define YM_PROJ_A64FXd \ { \ asm ( \ - "ptrue p5.d \n\t" \ "fadd z12.d, p5/m, z12.d, z27.d \n\t" \ "fadd z13.d, 
p5/m, z13.d, z28.d \n\t" \ "fadd z14.d, p5/m, z14.d, z29.d \n\t" \ @@ -549,7 +544,6 @@ asm ( \ #define ZM_PROJ_A64FXd \ { \ asm ( \ - "ptrue p5.d \n\t" \ "fcadd z12.d, p5/m, z12.d, z24.d, 270 \n\t" \ "fcadd z13.d, p5/m, z13.d, z25.d, 270 \n\t" \ "fcadd z14.d, p5/m, z14.d, z26.d, 270 \n\t" \ @@ -680,7 +674,6 @@ asm ( \ // TP_RECON_ACCUM #define TP_RECON_ACCUM_A64FXd \ asm ( \ - "ptrue p5.d \n\t" \ "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ "fadd z6.d, p5/m, z6.d, z18.d \n\t" \ "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h index 210d537e..faa8249b 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ b/Grid/simd/Fujitsu_A64FX_asm_single.h @@ -454,7 +454,6 @@ asm ( \ #define YP_PROJ_A64FXf \ { \ asm ( \ - "ptrue p5.s \n\t" \ "fsub z12.s, p5/m, z12.s, z27.s \n\t" \ "fsub z13.s, p5/m, z13.s, z28.s \n\t" \ "fsub z14.s, p5/m, z14.s, z29.s \n\t" \ @@ -470,7 +469,6 @@ asm ( \ #define ZP_PROJ_A64FXf \ { \ asm ( \ - "ptrue p5.s \n\t" \ "fcadd z12.s, p5/m, z12.s, z24.s, 90 \n\t" \ "fcadd z13.s, p5/m, z13.s, z25.s, 90 \n\t" \ "fcadd z14.s, p5/m, z14.s, z26.s, 90 \n\t" \ @@ -486,7 +484,6 @@ asm ( \ #define TP_PROJ_A64FXf \ { \ asm ( \ - "ptrue p5.s \n\t" \ "fadd z12.s, p5/m, z12.s, z24.s \n\t" \ "fadd z13.s, p5/m, z13.s, z25.s \n\t" \ "fadd z14.s, p5/m, z14.s, z26.s \n\t" \ @@ -502,7 +499,6 @@ asm ( \ #define XM_PROJ_A64FXf \ { \ asm ( \ - "ptrue p5.s \n\t" \ "fcadd z12.s, p5/m, z12.s, z27.s, 270 \n\t" \ "fcadd z13.s, p5/m, z13.s, z28.s, 270 \n\t" \ "fcadd z14.s, p5/m, z14.s, z29.s, 270 \n\t" \ @@ -544,7 +540,6 @@ asm ( \ #define YM_PROJ_A64FXf \ { \ asm ( \ - "ptrue p5.s \n\t" \ "fadd z12.s, p5/m, z12.s, z27.s \n\t" \ "fadd z13.s, p5/m, z13.s, z28.s \n\t" \ "fadd z14.s, p5/m, z14.s, z29.s \n\t" \ @@ -560,7 +555,6 @@ asm ( \ #define ZM_PROJ_A64FXf \ { \ asm ( \ - "ptrue p5.s \n\t" \ "fcadd z12.s, p5/m, z12.s, z24.s, 270 \n\t" \ "fcadd z13.s, p5/m, z13.s, z25.s, 270 \n\t" \ "fcadd z14.s, p5/m, z14.s, z26.s, 270 \n\t" \ @@ -691,7 +685,6 @@ asm ( \ // TP_RECON_ACCUM #define TP_RECON_ACCUM_A64FXf \ asm ( \ - "ptrue p5.s \n\t" \ "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ "fadd z6.s, p5/m, z6.s, z18.s \n\t" \ "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ diff --git a/Grid/simd/Fujitsu_A64FX_undef.h b/Grid/simd/Fujitsu_A64FX_undef.h index 31abc038..12ade6e2 100644 --- a/Grid/simd/Fujitsu_A64FX_undef.h +++ b/Grid/simd/Fujitsu_A64FX_undef.h @@ -69,3 +69,7 @@ Author: Nils Meyer #undef PERMUTE_DIR1 #undef PERMUTE_DIR2 #undef PERMUTE_DIR3 +#undef LOAD_TABLE0 +#undef LOAD_TABLE1 +#undef LOAD_TABLE2 +#undef LOAD_TABLE3 From 581392f2f2ad8b9856caa39507681d64465c0340 Mon Sep 17 00:00:00 2001 From: nils meyer Date: Sun, 12 Apr 2020 22:06:14 +0200 Subject: [PATCH 039/147] now with pf, best results so far using intrinsics+pf --- .../implementation/WilsonKernelsAsmA64FX.h | 104 ++- .../WilsonKernelsAsmBodyA64FX.h | 225 ++++++ .../WilsonKernelsHandImplementation.h.orig | 684 ------------------ Grid/simd/Fujitsu_A64FX_asm_double.h | 50 +- Grid/simd/Fujitsu_A64FX_asm_single.h | 50 +- Grid/simd/Fujitsu_A64FX_intrin_double.h | 26 +- Grid/simd/Fujitsu_A64FX_intrin_single.h | 26 +- 7 files changed, 414 insertions(+), 751 deletions(-) create mode 100644 Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h delete mode 100644 Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h.orig diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h index 
d14f4b9c..9d74dd15 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h @@ -37,12 +37,17 @@ Author: paboyle // undefine everything #include +#define WILSONKERNELSASMBODYA64FX +#pragma message("invoking A64FX Dslash: WilsonKernelsAsmBodyA64FX.h") + /////////////////////////////////////////////////////////// // If we are A64FX specialise the single precision routine /////////////////////////////////////////////////////////// #if defined(DSLASHINTRIN) +#pragma message ("invoking A64FX Dslash: intrin") #include #else +#pragma message ("invoking A64FX Dslash: asm") #include #endif @@ -59,12 +64,20 @@ Author: paboyle template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, @@ -74,7 +87,11 @@ WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif #undef INTERIOR_AND_EXTERIOR #define INTERIOR @@ -82,23 +99,38 @@ WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include - +#endif #undef INTERIOR_AND_EXTERIOR #undef INTERIOR @@ -106,22 +138,39 @@ WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldV template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif 
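// Annotation (a sketch of the idiom, with hypothetical names -- not Grid API):
// each specialisation in this file gets its body by textual #include of either
// WilsonKernelsAsmBodyA64FX.h or the generic WilsonKernelsAsmBody.h, selected
// by the WILSONKERNELSASMBODYA64FX guard set above, e.g.
//
//   // kernel_body.h -- a bare compound statement usable as a function body
//   { for (int s = 0; s < Ls; s++) { /* ASM_LEG / RESULT macros */ } }
//
//   // kernel.cc
//   template<> void Kernel<Float>::Site(int Ls)
//   #include "kernel_body.h"
//
// The surrounding #define/#undef pairs (INTERIOR_AND_EXTERIOR, INTERIOR,
// EXTERIOR) re-specialise the same body text once per comms phase.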
template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif + ///////////////////////////////////////////////////////////////// // XYZT vectorised, dag Kernel, single @@ -133,22 +182,38 @@ WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldV template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif #undef INTERIOR_AND_EXTERIOR #define INTERIOR @@ -156,22 +221,38 @@ WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldV template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif #undef INTERIOR_AND_EXTERIOR #undef INTERIOR @@ -179,22 +260,38 @@ WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFie template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const 
FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif //#undef MAYBEPERM //#undef MULT_2SPIN @@ -348,7 +445,7 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, Double //#undef MAYBEPERM //#undef MULT_2SPIN -// undefine +// undefine #include /////////////////////////////////////////////////////////// @@ -361,7 +458,7 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, Double #include #endif -// former KNL +// former KNL //#define MAYBEPERM(A,perm) if (perm) { A ; } //#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf) //#define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0]; @@ -654,6 +751,7 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, Double #endif // VEC 5D // undefs +#undef WILSONKERNELSASMBODYA64FX #include #endif //A64FX diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h new file mode 100644 index 00000000..44bf2005 --- /dev/null +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h @@ -0,0 +1,225 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: WilsonKernelsAsmBodyA64FX.h + + Copyright (C) 2020 + +Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#ifdef KERNEL_DAG +#define DIR0_PROJMEM(base) XP_PROJMEM(base); +#define DIR1_PROJMEM(base) YP_PROJMEM(base); +#define DIR2_PROJMEM(base) ZP_PROJMEM(base); +#define DIR3_PROJMEM(base) TP_PROJMEM(base); +#define DIR4_PROJMEM(base) XM_PROJMEM(base); +#define DIR5_PROJMEM(base) YM_PROJMEM(base); +#define DIR6_PROJMEM(base) ZM_PROJMEM(base); +#define DIR7_PROJMEM(base) TM_PROJMEM(base); +#define DIR0_RECON XP_RECON +#define DIR1_RECON YP_RECON_ACCUM +#define DIR2_RECON ZP_RECON_ACCUM +#define DIR3_RECON TP_RECON_ACCUM +#define DIR4_RECON XM_RECON_ACCUM +#define DIR5_RECON YM_RECON_ACCUM +#define DIR6_RECON ZM_RECON_ACCUM +#define DIR7_RECON TM_RECON_ACCUM +#else +#define DIR0_PROJMEM(base) XM_PROJMEM(base); +#define DIR1_PROJMEM(base) YM_PROJMEM(base); +#define DIR2_PROJMEM(base) ZM_PROJMEM(base); +#define DIR3_PROJMEM(base) TM_PROJMEM(base); +#define DIR4_PROJMEM(base) XP_PROJMEM(base); +#define DIR5_PROJMEM(base) YP_PROJMEM(base); +#define DIR6_PROJMEM(base) ZP_PROJMEM(base); +#define DIR7_PROJMEM(base) TP_PROJMEM(base); +#define DIR0_RECON XM_RECON +#define DIR1_RECON YM_RECON_ACCUM +#define DIR2_RECON ZM_RECON_ACCUM +#define DIR3_RECON TM_RECON_ACCUM +#define DIR4_RECON XP_RECON_ACCUM +#define DIR5_RECON YP_RECON_ACCUM +#define DIR6_RECON ZP_RECON_ACCUM +#define DIR7_RECON TP_RECON_ACCUM +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Comms then compute kernel +//////////////////////////////////////////////////////////////////////////////// +#ifdef INTERIOR_AND_EXTERIOR + +#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + basep = st.GetPFInfo(nent,plocal); nent++; \ + if ( local ) { \ + LOAD64(%r10,isigns); \ + PROJ(base); \ + MAYBEPERM(PERMUTE_DIR,perm); \ + } else { \ + LOAD_CHI(base); \ + } \ + MULT_2SPIN_DIR_PF(Dir,basep); \ + base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ + PREFETCH_CHIMU(base); \ + LOAD64(%r10,isigns); \ + RECON; \ + +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + PF_GAUGE(Xp); \ + PREFETCH1_CHIMU(base); \ + ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) + +#define RESULT(base,basep) SAVE_RESULT(base,basep); + +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Pre comms kernel -- prefetch like normal because it is mostly right +//////////////////////////////////////////////////////////////////////////////// +#ifdef INTERIOR + +#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + basep = st.GetPFInfo(nent,plocal); nent++; \ + if ( local ) { \ + LOAD64(%r10,isigns); \ + PROJ(base); \ + MAYBEPERM(PERMUTE_DIR,perm); \ + }else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \ + base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ + if ( local || st.same_node[Dir] ) { \ + MULT_2SPIN_DIR_PF(Dir,basep); \ + PREFETCH_CHIMU(base); \ + LOAD64(%r10,isigns); \ + RECON; \ + } else { PREFETCH_CHIMU(base); } + +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + PF_GAUGE(Xp); \ + PREFETCH1_CHIMU(base); \ + { ZERO_PSI; } \ + ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) + +#define RESULT(base,basep) SAVE_RESULT(base,basep); + +#endif +//////////////////////////////////////////////////////////////////////////////// +// Post comms kernel 
+//////////////////////////////////////////////////////////////////////////////// +#ifdef EXTERIOR + + +#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + if((!local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI(base); \ + MULT_2SPIN_DIR_PF(Dir,base); \ + LOAD64(%r10,isigns); \ + RECON; \ + nmu++; \ + } + +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + nmu=0; \ + { ZERO_PSI;} \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + if((!local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI(base); \ + MULT_2SPIN_DIR_PF(Dir,base); \ + LOAD64(%r10,isigns); \ + RECON; \ + nmu++; \ + } + +#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} + +#endif +{ + int nmu; + int local,perm, ptype; + uint64_t base; + uint64_t basep; + const uint64_t plocal =(uint64_t) & in[0]; + + COMPLEX_SIGNS(isigns); + MASK_REGS; + int nmax=U.oSites(); + for(int site=0;site=nmax) ssn=0; + // int sUn=lo.Reorder(ssn); + int sUn=ssn; + LOCK_GAUGE(0); +#else + int sU =ssU; + int ssn=ssU+1; if(ssn>=nmax) ssn=0; + int sUn=ssn; +#endif + for(int s=0;s -Author: paboyle - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- - See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ - -#pragma once - -#include - - -#undef LOAD_CHIMU -#undef LOAD_CHI -#undef MULT_2SPIN -#undef PERMUTE_DIR -#undef XP_PROJ -#undef YP_PROJ -#undef ZP_PROJ -#undef TP_PROJ -#undef XM_PROJ -#undef YM_PROJ -#undef ZM_PROJ -#undef TM_PROJ -#undef XP_RECON -#undef XP_RECON_ACCUM -#undef XM_RECON -#undef XM_RECON_ACCUM -#undef YP_RECON_ACCUM -#undef YM_RECON_ACCUM -#undef ZP_RECON_ACCUM -#undef ZM_RECON_ACCUM -#undef TP_RECON_ACCUM -#undef TM_RECON_ACCUM -#undef ZERO_RESULT -#undef Chimu_00 -#undef Chimu_01 -#undef Chimu_02 -#undef Chimu_10 -#undef Chimu_11 -#undef Chimu_12 -#undef Chimu_20 -#undef Chimu_21 -#undef Chimu_22 -#undef Chimu_30 -#undef Chimu_31 -#undef Chimu_32 -#undef HAND_STENCIL_LEG -#undef HAND_STENCIL_LEG_INT -#undef HAND_STENCIL_LEG_EXT -#undef HAND_RESULT -#undef HAND_RESULT_INT -#undef HAND_RESULT_EXT - -#define REGISTER - -#define LOAD_CHIMU \ - {const SiteSpinor & ref (in[offset]); \ - Chimu_00=ref()(0)(0);\ - Chimu_01=ref()(0)(1);\ - Chimu_02=ref()(0)(2);\ - Chimu_10=ref()(1)(0);\ - Chimu_11=ref()(1)(1);\ - Chimu_12=ref()(1)(2);\ - Chimu_20=ref()(2)(0);\ - Chimu_21=ref()(2)(1);\ - Chimu_22=ref()(2)(2);\ - Chimu_30=ref()(3)(0);\ - Chimu_31=ref()(3)(1);\ - Chimu_32=ref()(3)(2);} - -#define LOAD_CHI\ - {const SiteHalfSpinor &ref(buf[offset]); \ - Chi_00 = ref()(0)(0);\ - Chi_01 = ref()(0)(1);\ - Chi_02 = ref()(0)(2);\ - Chi_10 = ref()(1)(0);\ - Chi_11 = ref()(1)(1);\ - Chi_12 = ref()(1)(2);} - -// To splat or not to splat depends on the implementation -#define MULT_2SPIN(A)\ - {auto & ref(U[sU](A)); \ - Impl::loadLinkElement(U_00,ref()(0,0)); \ - Impl::loadLinkElement(U_10,ref()(1,0)); \ - Impl::loadLinkElement(U_20,ref()(2,0)); \ - Impl::loadLinkElement(U_01,ref()(0,1)); \ - Impl::loadLinkElement(U_11,ref()(1,1)); \ - Impl::loadLinkElement(U_21,ref()(2,1)); \ - UChi_00 = U_00*Chi_00;\ - UChi_10 = U_00*Chi_10;\ - UChi_01 = U_10*Chi_00;\ - UChi_11 = U_10*Chi_10;\ - UChi_02 = U_20*Chi_00;\ - UChi_12 = U_20*Chi_10;\ - UChi_00+= U_01*Chi_01;\ - UChi_10+= U_01*Chi_11;\ - UChi_01+= U_11*Chi_01;\ - UChi_11+= U_11*Chi_11;\ - UChi_02+= U_21*Chi_01;\ - UChi_12+= U_21*Chi_11;\ - Impl::loadLinkElement(U_00,ref()(0,2)); \ - Impl::loadLinkElement(U_10,ref()(1,2)); \ - Impl::loadLinkElement(U_20,ref()(2,2)); \ - UChi_00+= U_00*Chi_02;\ - UChi_10+= U_00*Chi_12;\ - UChi_01+= U_10*Chi_02;\ - UChi_11+= U_10*Chi_12;\ - UChi_02+= U_20*Chi_02;\ - UChi_12+= U_20*Chi_12;} - - -#define PERMUTE_DIR(dir) \ - permute##dir(Chi_00,Chi_00);\ - permute##dir(Chi_01,Chi_01);\ - permute##dir(Chi_02,Chi_02);\ - permute##dir(Chi_10,Chi_10);\ - permute##dir(Chi_11,Chi_11);\ - permute##dir(Chi_12,Chi_12); - -// hspin(0)=fspin(0)+timesI(fspin(3)); -// hspin(1)=fspin(1)+timesI(fspin(2)); -#define XP_PROJ \ - Chi_00 = Chimu_00+timesI(Chimu_30);\ - Chi_01 = Chimu_01+timesI(Chimu_31);\ - Chi_02 = Chimu_02+timesI(Chimu_32);\ - Chi_10 = Chimu_10+timesI(Chimu_20);\ - Chi_11 = Chimu_11+timesI(Chimu_21);\ - Chi_12 = Chimu_12+timesI(Chimu_22); - -#define YP_PROJ \ - Chi_00 = Chimu_00-Chimu_30;\ - Chi_01 = Chimu_01-Chimu_31;\ - Chi_02 = Chimu_02-Chimu_32;\ - Chi_10 = Chimu_10+Chimu_20;\ - Chi_11 = Chimu_11+Chimu_21;\ - Chi_12 = Chimu_12+Chimu_22; - -#define ZP_PROJ \ - Chi_00 = Chimu_00+timesI(Chimu_20); \ - Chi_01 = Chimu_01+timesI(Chimu_21); \ - Chi_02 = Chimu_02+timesI(Chimu_22); \ - Chi_10 = Chimu_10-timesI(Chimu_30); \ - Chi_11 = 
Chimu_11-timesI(Chimu_31); \ - Chi_12 = Chimu_12-timesI(Chimu_32); - -#define TP_PROJ \ - Chi_00 = Chimu_00+Chimu_20; \ - Chi_01 = Chimu_01+Chimu_21; \ - Chi_02 = Chimu_02+Chimu_22; \ - Chi_10 = Chimu_10+Chimu_30; \ - Chi_11 = Chimu_11+Chimu_31; \ - Chi_12 = Chimu_12+Chimu_32; - - -// hspin(0)=fspin(0)-timesI(fspin(3)); -// hspin(1)=fspin(1)-timesI(fspin(2)); -#define XM_PROJ \ - Chi_00 = Chimu_00-timesI(Chimu_30);\ - Chi_01 = Chimu_01-timesI(Chimu_31);\ - Chi_02 = Chimu_02-timesI(Chimu_32);\ - Chi_10 = Chimu_10-timesI(Chimu_20);\ - Chi_11 = Chimu_11-timesI(Chimu_21);\ - Chi_12 = Chimu_12-timesI(Chimu_22); - -#define YM_PROJ \ - Chi_00 = Chimu_00+Chimu_30;\ - Chi_01 = Chimu_01+Chimu_31;\ - Chi_02 = Chimu_02+Chimu_32;\ - Chi_10 = Chimu_10-Chimu_20;\ - Chi_11 = Chimu_11-Chimu_21;\ - Chi_12 = Chimu_12-Chimu_22; - -#define ZM_PROJ \ - Chi_00 = Chimu_00-timesI(Chimu_20); \ - Chi_01 = Chimu_01-timesI(Chimu_21); \ - Chi_02 = Chimu_02-timesI(Chimu_22); \ - Chi_10 = Chimu_10+timesI(Chimu_30); \ - Chi_11 = Chimu_11+timesI(Chimu_31); \ - Chi_12 = Chimu_12+timesI(Chimu_32); - -#define TM_PROJ \ - Chi_00 = Chimu_00-Chimu_20; \ - Chi_01 = Chimu_01-Chimu_21; \ - Chi_02 = Chimu_02-Chimu_22; \ - Chi_10 = Chimu_10-Chimu_30; \ - Chi_11 = Chimu_11-Chimu_31; \ - Chi_12 = Chimu_12-Chimu_32; - -// fspin(0)=hspin(0); -// fspin(1)=hspin(1); -// fspin(2)=timesMinusI(hspin(1)); -// fspin(3)=timesMinusI(hspin(0)); -#define XP_RECON\ - result_00 = UChi_00;\ - result_01 = UChi_01;\ - result_02 = UChi_02;\ - result_10 = UChi_10;\ - result_11 = UChi_11;\ - result_12 = UChi_12;\ - result_20 = timesMinusI(UChi_10);\ - result_21 = timesMinusI(UChi_11);\ - result_22 = timesMinusI(UChi_12);\ - result_30 = timesMinusI(UChi_00);\ - result_31 = timesMinusI(UChi_01);\ - result_32 = timesMinusI(UChi_02); - -#define XP_RECON_ACCUM\ - result_00+=UChi_00;\ - result_01+=UChi_01;\ - result_02+=UChi_02;\ - result_10+=UChi_10;\ - result_11+=UChi_11;\ - result_12+=UChi_12;\ - result_20-=timesI(UChi_10);\ - result_21-=timesI(UChi_11);\ - result_22-=timesI(UChi_12);\ - result_30-=timesI(UChi_00);\ - result_31-=timesI(UChi_01);\ - result_32-=timesI(UChi_02); - -#define XM_RECON\ - result_00 = UChi_00;\ - result_01 = UChi_01;\ - result_02 = UChi_02;\ - result_10 = UChi_10;\ - result_11 = UChi_11;\ - result_12 = UChi_12;\ - result_20 = timesI(UChi_10);\ - result_21 = timesI(UChi_11);\ - result_22 = timesI(UChi_12);\ - result_30 = timesI(UChi_00);\ - result_31 = timesI(UChi_01);\ - result_32 = timesI(UChi_02); - -#define XM_RECON_ACCUM\ - result_00+= UChi_00;\ - result_01+= UChi_01;\ - result_02+= UChi_02;\ - result_10+= UChi_10;\ - result_11+= UChi_11;\ - result_12+= UChi_12;\ - result_20+= timesI(UChi_10);\ - result_21+= timesI(UChi_11);\ - result_22+= timesI(UChi_12);\ - result_30+= timesI(UChi_00);\ - result_31+= timesI(UChi_01);\ - result_32+= timesI(UChi_02); - -#define YP_RECON_ACCUM\ - result_00+= UChi_00;\ - result_01+= UChi_01;\ - result_02+= UChi_02;\ - result_10+= UChi_10;\ - result_11+= UChi_11;\ - result_12+= UChi_12;\ - result_20+= UChi_10;\ - result_21+= UChi_11;\ - result_22+= UChi_12;\ - result_30-= UChi_00;\ - result_31-= UChi_01;\ - result_32-= UChi_02; - -#define YM_RECON_ACCUM\ - result_00+= UChi_00;\ - result_01+= UChi_01;\ - result_02+= UChi_02;\ - result_10+= UChi_10;\ - result_11+= UChi_11;\ - result_12+= UChi_12;\ - result_20-= UChi_10;\ - result_21-= UChi_11;\ - result_22-= UChi_12;\ - result_30+= UChi_00;\ - result_31+= UChi_01;\ - result_32+= UChi_02; - -#define ZP_RECON_ACCUM\ - result_00+= UChi_00;\ - result_01+= 
UChi_01;\ - result_02+= UChi_02;\ - result_10+= UChi_10;\ - result_11+= UChi_11;\ - result_12+= UChi_12;\ - result_20-= timesI(UChi_00); \ - result_21-= timesI(UChi_01); \ - result_22-= timesI(UChi_02); \ - result_30+= timesI(UChi_10); \ - result_31+= timesI(UChi_11); \ - result_32+= timesI(UChi_12); - -#define ZM_RECON_ACCUM\ - result_00+= UChi_00;\ - result_01+= UChi_01;\ - result_02+= UChi_02;\ - result_10+= UChi_10;\ - result_11+= UChi_11;\ - result_12+= UChi_12;\ - result_20+= timesI(UChi_00); \ - result_21+= timesI(UChi_01); \ - result_22+= timesI(UChi_02); \ - result_30-= timesI(UChi_10); \ - result_31-= timesI(UChi_11); \ - result_32-= timesI(UChi_12); - -#define TP_RECON_ACCUM\ - result_00+= UChi_00;\ - result_01+= UChi_01;\ - result_02+= UChi_02;\ - result_10+= UChi_10;\ - result_11+= UChi_11;\ - result_12+= UChi_12;\ - result_20+= UChi_00; \ - result_21+= UChi_01; \ - result_22+= UChi_02; \ - result_30+= UChi_10; \ - result_31+= UChi_11; \ - result_32+= UChi_12; - -#define TM_RECON_ACCUM\ - result_00+= UChi_00;\ - result_01+= UChi_01;\ - result_02+= UChi_02;\ - result_10+= UChi_10;\ - result_11+= UChi_11;\ - result_12+= UChi_12;\ - result_20-= UChi_00; \ - result_21-= UChi_01; \ - result_22-= UChi_02; \ - result_30-= UChi_10; \ - result_31-= UChi_11; \ - result_32-= UChi_12; - -#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \ - SE=st.GetEntry(ptype,DIR,ss); \ - offset = SE->_offset; \ - local = SE->_is_local; \ - perm = SE->_permute; \ - if ( local ) { \ - LOAD_CHIMU; \ - PROJ; \ - if ( perm) { \ - PERMUTE_DIR(PERM); \ - } \ - } else { \ - LOAD_CHI; \ - } \ - MULT_2SPIN(DIR); \ - RECON; - -#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \ - SE=st.GetEntry(ptype,DIR,ss); \ - offset = SE->_offset; \ - local = SE->_is_local; \ - perm = SE->_permute; \ - if ( local ) { \ - LOAD_CHIMU; \ - PROJ; \ - if ( perm) { \ - PERMUTE_DIR(PERM); \ - } \ - } else if ( st.same_node[DIR] ) { \ - LOAD_CHI; \ - } \ - if (local || st.same_node[DIR] ) { \ - MULT_2SPIN(DIR); \ - RECON; \ - } - -#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \ - SE=st.GetEntry(ptype,DIR,ss); \ - offset = SE->_offset; \ - if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \ - LOAD_CHI; \ - MULT_2SPIN(DIR); \ - RECON; \ - nmu++; \ - } - -#define HAND_RESULT(ss) \ - { \ - SiteSpinor & ref (out[ss]); \ - vstream(ref()(0)(0),result_00); \ - vstream(ref()(0)(1),result_01); \ - vstream(ref()(0)(2),result_02); \ - vstream(ref()(1)(0),result_10); \ - vstream(ref()(1)(1),result_11); \ - vstream(ref()(1)(2),result_12); \ - vstream(ref()(2)(0),result_20); \ - vstream(ref()(2)(1),result_21); \ - vstream(ref()(2)(2),result_22); \ - vstream(ref()(3)(0),result_30); \ - vstream(ref()(3)(1),result_31); \ - vstream(ref()(3)(2),result_32); \ - } - -#define HAND_RESULT_EXT(ss) \ - if (nmu){ \ - SiteSpinor & ref (out[ss]); \ - ref()(0)(0)+=result_00; \ - ref()(0)(1)+=result_01; \ - ref()(0)(2)+=result_02; \ - ref()(1)(0)+=result_10; \ - ref()(1)(1)+=result_11; \ - ref()(1)(2)+=result_12; \ - ref()(2)(0)+=result_20; \ - ref()(2)(1)+=result_21; \ - ref()(2)(2)+=result_22; \ - ref()(3)(0)+=result_30; \ - ref()(3)(1)+=result_31; \ - ref()(3)(2)+=result_32; \ - } - - -#define HAND_DECLARATIONS(a) \ - Simd result_00; \ - Simd result_01; \ - Simd result_02; \ - Simd result_10; \ - Simd result_11; \ - Simd result_12; \ - Simd result_20; \ - Simd result_21; \ - Simd result_22; \ - Simd result_30; \ - Simd result_31; \ - Simd result_32; \ - Simd Chi_00; \ - Simd Chi_01; \ - Simd Chi_02; \ - Simd Chi_10; \ - Simd Chi_11; \ - Simd Chi_12; \ - Simd 
UChi_00; \ - Simd UChi_01; \ - Simd UChi_02; \ - Simd UChi_10; \ - Simd UChi_11; \ - Simd UChi_12; \ - Simd U_00; \ - Simd U_10; \ - Simd U_20; \ - Simd U_01; \ - Simd U_11; \ - Simd U_21; - -#define ZERO_RESULT \ - result_00=Zero(); \ - result_01=Zero(); \ - result_02=Zero(); \ - result_10=Zero(); \ - result_11=Zero(); \ - result_12=Zero(); \ - result_20=Zero(); \ - result_21=Zero(); \ - result_22=Zero(); \ - result_30=Zero(); \ - result_31=Zero(); \ - result_32=Zero(); - -#define Chimu_00 Chi_00 -#define Chimu_01 Chi_01 -#define Chimu_02 Chi_02 -#define Chimu_10 Chi_10 -#define Chimu_11 Chi_11 -#define Chimu_12 Chi_12 -#define Chimu_20 UChi_00 -#define Chimu_21 UChi_01 -#define Chimu_22 UChi_02 -#define Chimu_30 UChi_10 -#define Chimu_31 UChi_11 -#define Chimu_32 UChi_12 - -NAMESPACE_BEGIN(Grid); - -template void -WilsonKernels::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionFieldView &in, FermionFieldView &out) -{ -// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - typedef typename Simd::scalar_type S; - typedef typename Simd::vector_type V; - - HAND_DECLARATIONS(ignore); - - int offset,local,perm, ptype; - StencilEntry *SE; - - HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON); - HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM); - HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); - HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM); - HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM); - HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM); - HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); - HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM); - HAND_RESULT(ss); -} - -template -void WilsonKernels::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionFieldView &in, FermionFieldView &out) -{ - typedef typename Simd::scalar_type S; - typedef typename Simd::vector_type V; - - HAND_DECLARATIONS(ignore); - - StencilEntry *SE; - int offset,local,perm, ptype; - - HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON); - HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM); - HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); - HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM); - HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM); - HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM); - HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); - HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM); - HAND_RESULT(ss); -} - -template void -WilsonKernels::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionFieldView &in, FermionFieldView &out) -{ -// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... 
- typedef typename Simd::scalar_type S; - typedef typename Simd::vector_type V; - - HAND_DECLARATIONS(ignore); - - int offset,local,perm, ptype; - StencilEntry *SE; - ZERO_RESULT; - HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM); - HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM); - HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); - HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM); - HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM); - HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM); - HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); - HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM); - HAND_RESULT(ss); -} - -template -void WilsonKernels::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionFieldView &in, FermionFieldView &out) -{ - typedef typename Simd::scalar_type S; - typedef typename Simd::vector_type V; - - HAND_DECLARATIONS(ignore); - - StencilEntry *SE; - int offset,local,perm, ptype; - ZERO_RESULT; - HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM); - HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM); - HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); - HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM); - HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM); - HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM); - HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); - HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM); - HAND_RESULT(ss); -} - -template void -WilsonKernels::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionFieldView &in, FermionFieldView &out) -{ -// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - typedef typename Simd::scalar_type S; - typedef typename Simd::vector_type V; - - HAND_DECLARATIONS(ignore); - - int offset, ptype; - StencilEntry *SE; - int nmu=0; - ZERO_RESULT; - HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM); - HAND_RESULT_EXT(ss); -} - -template -void WilsonKernels::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionFieldView &in, FermionFieldView &out) -{ - typedef typename Simd::scalar_type S; - typedef typename Simd::vector_type V; - - HAND_DECLARATIONS(ignore); - - StencilEntry *SE; - int offset, ptype; - int nmu=0; - ZERO_RESULT; - HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM); - HAND_RESULT_EXT(ss); -} - -////////////// Wilson ; uses this implementation ///////////////////// - -NAMESPACE_END(Grid); -#undef LOAD_CHIMU -#undef LOAD_CHI -#undef MULT_2SPIN -#undef PERMUTE_DIR -#undef XP_PROJ -#undef YP_PROJ -#undef ZP_PROJ -#undef TP_PROJ -#undef XM_PROJ -#undef YM_PROJ -#undef ZM_PROJ -#undef TM_PROJ -#undef XP_RECON -#undef XP_RECON_ACCUM -#undef XM_RECON -#undef XM_RECON_ACCUM -#undef YP_RECON_ACCUM -#undef YM_RECON_ACCUM 
-#undef ZP_RECON_ACCUM -#undef ZM_RECON_ACCUM -#undef TP_RECON_ACCUM -#undef TM_RECON_ACCUM -#undef ZERO_RESULT -#undef Chimu_00 -#undef Chimu_01 -#undef Chimu_02 -#undef Chimu_10 -#undef Chimu_11 -#undef Chimu_12 -#undef Chimu_20 -#undef Chimu_21 -#undef Chimu_22 -#undef Chimu_30 -#undef Chimu_31 -#undef Chimu_32 -#undef HAND_STENCIL_LEG -#undef HAND_STENCIL_LEG_INT -#undef HAND_STENCIL_LEG_EXT -#undef HAND_RESULT -#undef HAND_RESULT_INT -#undef HAND_RESULT_EXT diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h index 52dd8320..b24fb3a8 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_double.h +++ b/Grid/simd/Fujitsu_A64FX_asm_double.h @@ -26,20 +26,20 @@ Author: Nils Meyer *************************************************************************************/ /* END LEGAL */ #define LOAD_CHIMU_A64FXd(x) LOAD_CHIMU_INTERLEAVED_A64FXd(x) -#define PREFETCH_CHIMU_L1(A) -#define PREFETCH_GAUGE_L1(A) -#define PREFETCH_CHIMU_L2(A) -#define PREFETCH_GAUGE_L2(A) +#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A) +#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) +#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A) +#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) #define PF_GAUGE(A) -#define PREFETCH1_CHIMU(A) -#define PREFETCH_CHIMU(A) +#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) +#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXd #define COMPLEX_SIGNS(A) #define LOAD64(A,B) #define SAVE_RESULT(A,B) RESULT_A64FXd(A) -#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXd(A) +#define MULT_2SPIN_DIR_PF(A,B) PREFETCH_GAUGE_L1(A); PREFETCH_CHIMU_L2(B); MULT_2SPIN_A64FXd(A); if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } #define MAYBEPERM(A,perm) { A ; } #define LOAD_CHI(base) LOAD_CHI_A64FXd(base) #define ZERO_PSI @@ -105,9 +105,9 @@ asm ( \ #define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ { \ asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ : \ : [fetchptr] "r" (base) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -117,9 +117,9 @@ asm ( \ #define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \ { \ asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ : \ : [fetchptr] "r" (base) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -128,24 +128,30 @@ asm ( \ // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ { \ - const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \ + const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL 
VL] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \ : \ : [fetchptr] "r" (baseU) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ ); \ } // PREFETCH_GAUGE_L1 (prefetch to L1) -#define PREFETCH_GAUGE_L1_INTERNAL(A)_A64FXd \ +#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ { \ - const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ : \ : [fetchptr] "r" (baseU) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h index faa8249b..e60ab381 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ b/Grid/simd/Fujitsu_A64FX_asm_single.h @@ -26,20 +26,20 @@ Author: Nils Meyer *************************************************************************************/ /* END LEGAL */ #define LOAD_CHIMU_A64FXf(x) LOAD_CHIMU_INTERLEAVED_A64FXf(x) -#define PREFETCH_CHIMU_L1(A) -#define PREFETCH_GAUGE_L1(A) -#define PREFETCH_CHIMU_L2(A) -#define PREFETCH_GAUGE_L2(A) +#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A) +#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) +#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A) +#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) #define PF_GAUGE(A) -#define PREFETCH1_CHIMU(A) -#define PREFETCH_CHIMU(A) +#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) +#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXf #define COMPLEX_SIGNS(A) #define LOAD64(A,B) #define SAVE_RESULT(A,B) RESULT_A64FXf(A) -#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXf(A) +#define MULT_2SPIN_DIR_PF(A,B) PREFETCH_GAUGE_L1(A); PREFETCH_CHIMU_L2(B); MULT_2SPIN_A64FXf(A); if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } #define MAYBEPERM(A,perm) { A ; } #define LOAD_CHI(base) LOAD_CHI_A64FXf(base) #define ZERO_PSI @@ -105,9 +105,9 @@ asm ( \ #define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \ { \ asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd 
PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ : \ : [fetchptr] "r" (base) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -117,9 +117,9 @@ asm ( \ #define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \ { \ asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ : \ : [fetchptr] "r" (base) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -128,24 +128,30 @@ asm ( \ // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \ { \ - const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \ + const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \ : \ : [fetchptr] "r" (baseU) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ ); \ } // PREFETCH_GAUGE_L1 (prefetch to L1) -#define PREFETCH_GAUGE_L1_INTERNAL(A)_A64FXf \ +#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \ { \ - const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ : \ : [fetchptr] "r" (baseU) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h index f94d4f47..9cf1c5db 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_double.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -26,20 +26,20 @@ Author: Nils Meyer *************************************************************************************/ /* END LEGAL */ #define LOAD_CHIMU_A64FXd(x) LOAD_CHIMU_INTERLEAVED_A64FXd(x) -#define PREFETCH_CHIMU_L1(A) -#define PREFETCH_GAUGE_L1(A) -#define PREFETCH_CHIMU_L2(A) -#define PREFETCH_GAUGE_L2(A) +#define 
PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A) +#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) +#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A) +#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) #define PF_GAUGE(A) -#define PREFETCH1_CHIMU(A) -#define PREFETCH_CHIMU(A) +#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) +#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXd #define COMPLEX_SIGNS(A) #define LOAD64(A,B) #define SAVE_RESULT(A,B) RESULT_A64FXd(A) -#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXd(A) +#define MULT_2SPIN_DIR_PF(A,B) PREFETCH_GAUGE_L1(A); PREFETCH_CHIMU_L2(B); MULT_2SPIN_A64FXd(A); if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } #define MAYBEPERM(A,perm) { A ; } #define LOAD_CHI(base) LOAD_CHI_A64FXd(base) #define ZERO_PSI @@ -154,15 +154,21 @@ Author: Nils Meyer // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ { \ - const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \ + const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ + svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \ } // PREFETCH_GAUGE_L1 (prefetch to L1) -#define PREFETCH_GAUGE_L1_INTERNAL(A)_A64FXd \ +#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ { \ - const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h index 7329e4dc..3d8b6bf5 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_single.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -26,20 +26,20 @@ Author: Nils Meyer *************************************************************************************/ /* END LEGAL */ #define LOAD_CHIMU_A64FXf(x) LOAD_CHIMU_INTERLEAVED_A64FXf(x) -#define PREFETCH_CHIMU_L1(A) -#define PREFETCH_GAUGE_L1(A) -#define PREFETCH_CHIMU_L2(A) -#define PREFETCH_GAUGE_L2(A) +#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A) +#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) +#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A) +#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) #define PF_GAUGE(A) -#define PREFETCH1_CHIMU(A) -#define PREFETCH_CHIMU(A) +#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) +#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXf #define COMPLEX_SIGNS(A) #define LOAD64(A,B) #define SAVE_RESULT(A,B) RESULT_A64FXf(A) -#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXf(A) +#define MULT_2SPIN_DIR_PF(A,B) PREFETCH_GAUGE_L1(A); PREFETCH_CHIMU_L2(B); MULT_2SPIN_A64FXf(A); if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } #define MAYBEPERM(A,perm) { A ; } #define LOAD_CHI(base) LOAD_CHI_A64FXf(base) 
#define ZERO_PSI @@ -154,15 +154,21 @@ Author: Nils Meyer // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \ { \ - const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \ + const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ + svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \ } // PREFETCH_GAUGE_L1 (prefetch to L1) -#define PREFETCH_GAUGE_L1_INTERNAL(A)_A64FXf \ +#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \ { \ - const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ From c12a67030ac4f455c684a4a68da86d806ab93937 Mon Sep 17 00:00:00 2001 From: nils meyer Date: Wed, 15 Apr 2020 10:55:06 +0200 Subject: [PATCH 040/147] 980 GiB/s Wilson; 680 GiB/s DW (DP) --- .../implementation/WilsonKernelsAsmA64FX.h | 409 +-- .../WilsonKernelsAsmBodyA64FX.h | 156 +- Grid/simd/Fujitsu_A64FX_asm_double.h | 38 +- Grid/simd/Fujitsu_A64FX_asm_single.h | 38 +- Grid/simd/Fujitsu_A64FX_intrin_double.h | 26 +- Grid/simd/Fujitsu_A64FX_intrin_single.h | 26 +- Grid/simd/Fujitsu_A64FX_undef.h | 2 + Grid/simd/gridverter.py | 2410 +++++++++++++++++ 8 files changed, 2787 insertions(+), 318 deletions(-) create mode 100755 Grid/simd/gridverter.py diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h index 9d74dd15..2a88414a 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h @@ -82,7 +82,11 @@ WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, @@ -293,157 +297,6 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFie #include #endif -//#undef MAYBEPERM -//#undef MULT_2SPIN -//#define MAYBEPERM(A,B) -//#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) - -///////////////////////////////////////////////////////////////// -// Ls vectorised, undag Kernel, single -///////////////////////////////////////////////////////////////// - -#ifdef DWFVEC5D - -#undef KERNEL_DAG -#define INTERIOR_AND_EXTERIOR -#undef INTERIOR -#undef EXTERIOR -template<> void -WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView 
&out) -#include - -template<> void -WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -#undef INTERIOR_AND_EXTERIOR -#define INTERIOR -#undef EXTERIOR -template<> void -WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -template<> void -WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -#undef INTERIOR_AND_EXTERIOR -#undef INTERIOR -#define EXTERIOR -#undef MULT_2SPIN -#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) -template<> void -WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -template<> void -WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -template<> void -WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -template<> void -WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -///////////////////////////////////////////////////////////////// -// Ls vectorised, dag Kernel, single -///////////////////////////////////////////////////////////////// -#define KERNEL_DAG -#define INTERIOR_AND_EXTERIOR -#undef INTERIOR -#undef EXTERIOR -template<> void -WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -template<> void -WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -#undef INTERIOR_AND_EXTERIOR -#define INTERIOR -#undef EXTERIOR -template<> void -WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int 
Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -template<> void -WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -#undef INTERIOR_AND_EXTERIOR -#undef INTERIOR -#define EXTERIOR -template<> void -WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -template<> void -WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -#endif // VEC 5D - -//#undef COMPLEX_SIGNS -//#undef MAYBEPERM -//#undef MULT_2SPIN // undefine #include @@ -478,20 +331,38 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, Double template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif + template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif + template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif #undef INTERIOR_AND_EXTERIOR #define INTERIOR @@ -499,20 +370,38 @@ WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif + template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, 
DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif + template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif #undef INTERIOR_AND_EXTERIOR #undef INTERIOR @@ -520,20 +409,38 @@ WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldV template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif + template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif + template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif ///////////////////////////////////////////////////////////////// // XYZT vectorised, dag Kernel, double @@ -545,20 +452,38 @@ WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldV template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif + template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif + template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif #undef INTERIOR_AND_EXTERIOR #define INTERIOR @@ -566,20 +491,38 @@ WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldV template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif + template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined 
(WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif + template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif #undef INTERIOR_AND_EXTERIOR #undef INTERIOR @@ -587,168 +530,40 @@ WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFie template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif + template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif + template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif -// KNL stuff -#undef MAYBEPERM -//#undef MULT_2SPIN -#define MAYBEPERM(A,B) -//#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) - -///////////////////////////////////////////////////////////////// -// Ls vectorised, undag Kernel, double -///////////////////////////////////////////////////////////////// -#ifdef DWFVEC5D - -#undef KERNEL_DAG -#define INTERIOR_AND_EXTERIOR -#undef INTERIOR -#undef EXTERIOR -template<> void -WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -template<> void -WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -#undef INTERIOR_AND_EXTERIOR -#define INTERIOR -#undef EXTERIOR -template<> void -WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -template<> void 
-WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -#undef INTERIOR_AND_EXTERIOR -#undef INTERIOR -#define EXTERIOR -#undef MULT_2SPIN -#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) -template<> void -WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -template<> void -WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -///////////////////////////////////////////////////////////////// -// Ls vectorised, dag Kernel, double -///////////////////////////////////////////////////////////////// -#define KERNEL_DAG -#define INTERIOR_AND_EXTERIOR -#undef INTERIOR -#undef EXTERIOR -template<> void -WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -template<> void -WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -#undef INTERIOR_AND_EXTERIOR -#define INTERIOR -#undef EXTERIOR -template<> void -WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -template<> void -WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -#undef INTERIOR_AND_EXTERIOR -#undef INTERIOR -#define EXTERIOR - -template<> void -WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const 
FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -template<> void -WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -#endif // VEC 5D // undefs #undef WILSONKERNELSASMBODYA64FX diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h index 44bf2005..7c1be429 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h @@ -61,6 +61,29 @@ Author: Nils Meyer #define DIR7_RECON TP_RECON_ACCUM #endif +//using namespace std; + +#undef SHOW +//#define SHOW + +#undef WHERE + +#ifdef INTERIOR_AND_EXTERIOR +#define WHERE "INT_AND_EXT" +#endif + +#ifdef INTERIOR +#define WHERE "INT" +#endif + +#ifdef EXTERIOR +#define WHERE "EXT" +#endif + +//#pragma message("here") + + + //////////////////////////////////////////////////////////////////////////////// // Comms then compute kernel //////////////////////////////////////////////////////////////////////////////// @@ -69,16 +92,17 @@ Author: Nils Meyer #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ basep = st.GetPFInfo(nent,plocal); nent++; \ if ( local ) { \ - LOAD64(%r10,isigns); \ + /* PREFETCH_GAUGE_L1(Dir); slightly worse performance */ \ PROJ(base); \ + /* PREFETCH_GAUGE_L1(Dir); slightly worse performance */ \ MAYBEPERM(PERMUTE_DIR,perm); \ } else { \ LOAD_CHI(base); \ } \ - MULT_2SPIN_DIR_PF(Dir,basep); \ base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ PREFETCH_CHIMU(base); \ - LOAD64(%r10,isigns); \ + MULT_2SPIN_DIR_PF(Dir,basep); \ + PREFETCH_GAUGE_L1(NxtDir); \ RECON; \ #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ @@ -99,17 +123,15 @@ Author: Nils Meyer #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ basep = st.GetPFInfo(nent,plocal); nent++; \ if ( local ) { \ - LOAD64(%r10,isigns); \ PROJ(base); \ MAYBEPERM(PERMUTE_DIR,perm); \ }else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \ - base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ if ( local || st.same_node[Dir] ) { \ MULT_2SPIN_DIR_PF(Dir,basep); \ - PREFETCH_CHIMU(base); \ - LOAD64(%r10,isigns); \ RECON; \ - } else { PREFETCH_CHIMU(base); } + } \ + base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ + PREFETCH_CHIMU(base); \ #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ @@ -132,7 +154,6 @@ Author: Nils Meyer if((!local)&&(!st.same_node[Dir]) ) { \ LOAD_CHI(base); \ MULT_2SPIN_DIR_PF(Dir,base); \ - LOAD64(%r10,isigns); \ RECON; \ nmu++; \ } @@ -144,7 +165,6 @@ Author: Nils Meyer if((!local)&&(!st.same_node[Dir]) ) { \ LOAD_CHI(base); \ MULT_2SPIN_DIR_PF(Dir,base); \ - LOAD64(%r10,isigns); \ RECON; \ nmu++; \ } @@ -159,7 +179,6 @@ Author: Nils Meyer uint64_t basep; const uint64_t plocal =(uint64_t) & in[0]; - COMPLEX_SIGNS(isigns); MASK_REGS; int nmax=U.oSites(); for(int site=0;site int ent=ss*8;// 2*Ndim int nent=ssn*8; + uint64_t 
delta_base, delta_base_p; + ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJMEM,DIR0_RECON); + +#ifdef SHOW + float rescale = 64. * 12.; + std::cout << "=================================================================" << std::endl; + std::cout << "ss = " << ss << " ssn = " << ssn << std::endl; + std::cout << "sU = " << sU << " ssU = " << ssU << std::endl; + std::cout << " " << std::endl; + + + std::cout << "Dir = " << Xp << " " << WHERE<< std::endl; + + std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl; + std::cout << "st.same_node[Dir] = " << st.same_node[Xp] << std::endl; + std::cout << "base = " << (base - plocal)/rescale << std::endl; + std::cout << "Basep = " << (basep - plocal)/rescale << std::endl; + //printf("U = %llu\n", (uint64_t)&[sU](Dir)); + std::cout << "----------------------------------------------------" << std::endl; +#endif + ASM_LEG(Yp,Zp,PERMUTE_DIR2,DIR1_PROJMEM,DIR1_RECON); + +#ifdef SHOW + std::cout << "Dir = " << Yp << " " << WHERE<< std::endl; + + std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl; + std::cout << "st.same_node[Dir] = " << st.same_node[Yp] << std::endl; + std::cout << "base = " << (base - plocal)/rescale << std::endl; + std::cout << "Basep = " << (basep - plocal)/rescale << std::endl; + //printf("U = %llu\n", (uint64_t)&[sU](Dir)); + std::cout << "----------------------------------------------------" << std::endl; +#endif + ASM_LEG(Zp,Tp,PERMUTE_DIR1,DIR2_PROJMEM,DIR2_RECON); + +#ifdef SHOW + std::cout << "Dir = " << Zp << " " << WHERE<< std::endl; + + std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl; + std::cout << "st.same_node[Dir] = " << st.same_node[Zp] << std::endl; + std::cout << "base = " << (base - plocal)/rescale << std::endl; + std::cout << "Basep = " << (basep - plocal)/rescale << std::endl; + //printf("U = %llu\n", (uint64_t)&[sU](Dir)); + std::cout << "----------------------------------------------------" << std::endl; +#endif + ASM_LEG(Tp,Xm,PERMUTE_DIR0,DIR3_PROJMEM,DIR3_RECON); +#ifdef SHOW + std::cout << "Dir = " << Tp << " " << WHERE<< std::endl; + + std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl; + std::cout << "st.same_node[Dir] = " << st.same_node[Tp] << std::endl; + std::cout << "base = " << (base - plocal)/rescale << std::endl; + std::cout << "Basep = " << (basep - plocal)/rescale << std::endl; + //printf("U = %llu\n", (uint64_t)&[sU](Dir)); + std::cout << "----------------------------------------------------" << std::endl; +#endif + ASM_LEG(Xm,Ym,PERMUTE_DIR3,DIR4_PROJMEM,DIR4_RECON); + +#ifdef SHOW + std::cout << "Dir = " << Xm << " " << WHERE<< std::endl; + + std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl; + std::cout << "st.same_node[Dir] = " << st.same_node[Xm] << std::endl; + std::cout << "base = " << (base - plocal)/rescale << std::endl; + std::cout << "Basep = " << (basep - plocal)/rescale << std::endl; + //printf("U = %llu\n", (uint64_t)&[sU](Dir)); + std::cout << "----------------------------------------------------" << std::endl; +#endif + ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJMEM,DIR5_RECON); + +#ifdef SHOW + std::cout << "Dir = " << Ym << " " << WHERE<< std::endl; + + std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl; + std::cout << "st.same_node[Dir] = " << st.same_node[Ym] << std::endl; + 
std::cout << "base = " << (base - plocal)/rescale << std::endl; + std::cout << "Basep = " << (basep - plocal)/rescale << std::endl; + //printf("U = %llu\n", (uint64_t)&[sU](Dir)); + std::cout << "----------------------------------------------------" << std::endl; +#endif + ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJMEM,DIR6_RECON); + +#ifdef SHOW + std::cout << "Dir = " << Zm << " " << WHERE<< std::endl; + + std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl; + std::cout << "st.same_node[Dir] = " << st.same_node[Zm] << std::endl; + std::cout << "base = " << (base - plocal)/rescale << std::endl; + std::cout << "Basep = " << (basep - plocal)/rescale << std::endl; + //printf("U = %llu\n", (uint64_t)&[sU](Dir)); + std::cout << "----------------------------------------------------" << std::endl; +#endif + ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJMEM,DIR7_RECON); +#ifdef SHOW + std::cout << "Dir = " << Tm << " " << WHERE<< std::endl; + + std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl; + std::cout << "st.same_node[Dir] = " << st.same_node[Tm] << std::endl; + std::cout << "base = " << (base - plocal)/rescale << std::endl; + std::cout << "Basep = " << (basep - plocal)/rescale << std::endl; + //printf("U = %llu\n", (uint64_t)&[sU](Dir)); + std::cout << "----------------------------------------------------" << std::endl; +#endif + #ifdef EXTERIOR if (nmu==0) break; // if (nmu!=0) std::cout << "EXT "< #define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A) #define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) #define PF_GAUGE(A) +#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(A) #define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) #define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) #define LOCK_GAUGE(A) @@ -38,8 +39,11 @@ Author: Nils Meyer #define MASK_REGS DECLARATIONS_A64FXd #define COMPLEX_SIGNS(A) #define LOAD64(A,B) -#define SAVE_RESULT(A,B) RESULT_A64FXd(A) -#define MULT_2SPIN_DIR_PF(A,B) PREFETCH_GAUGE_L1(A); PREFETCH_CHIMU_L2(B); MULT_2SPIN_A64FXd(A); if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } +#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_CHIMU_L1(B); +#define MULT_2SPIN_DIR_PF(A,B) \ + MULT_2SPIN_A64FXd(A); \ + PREFETCH_CHIMU_L2(B); \ + if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } } #define MAYBEPERM(A,perm) { A ; } #define LOAD_CHI(base) LOAD_CHI_A64FXd(base) #define ZERO_PSI @@ -321,12 +325,28 @@ asm ( \ // PERM3 #define PERM3_A64FXd +// LOAD_GAUGE +#define LOAD_GAUGE \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +{ \ +asm ( \ + "ptrue p5.d \n\t" \ + "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z27.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (baseU + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} // MULT_2SPIN #define MULT_2SPIN_A64FXd(A) \ { \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ asm ( \ - "ptrue p5.d \n\t" \ "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] 
\n\t" \ "ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ @@ -738,6 +758,18 @@ asm ( \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); +// PREFETCH_RESULT_L2_STORE (prefetch store to L2) +#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \ +{ \ +asm ( \ + "prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} // ADD_RESULT_INTERNAL #define ADD_RESULT_INTERNAL_A64FXd \ asm ( \ diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h index e60ab381..1a341cc0 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ b/Grid/simd/Fujitsu_A64FX_asm_single.h @@ -31,6 +31,7 @@ Author: Nils Meyer #define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A) #define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) #define PF_GAUGE(A) +#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(A) #define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) #define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) #define LOCK_GAUGE(A) @@ -38,8 +39,11 @@ Author: Nils Meyer #define MASK_REGS DECLARATIONS_A64FXf #define COMPLEX_SIGNS(A) #define LOAD64(A,B) -#define SAVE_RESULT(A,B) RESULT_A64FXf(A) -#define MULT_2SPIN_DIR_PF(A,B) PREFETCH_GAUGE_L1(A); PREFETCH_CHIMU_L2(B); MULT_2SPIN_A64FXf(A); if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } +#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_CHIMU_L1(B); +#define MULT_2SPIN_DIR_PF(A,B) \ + MULT_2SPIN_A64FXf(A); \ + PREFETCH_CHIMU_L2(B); \ + if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } } #define MAYBEPERM(A,perm) { A ; } #define LOAD_CHI(base) LOAD_CHI_A64FXf(base) #define ZERO_PSI @@ -332,12 +336,28 @@ asm ( \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); +// LOAD_GAUGE +#define LOAD_GAUGE \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +{ \ +asm ( \ + "ptrue p5.s \n\t" \ + "ld1w { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1w { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1w { z26.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z27.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1w { z28.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1w { z29.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (baseU + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} // MULT_2SPIN #define MULT_2SPIN_A64FXf(A) \ { \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ asm ( \ - "ptrue p5.s \n\t" \ "ld1w { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ "ld1w { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ "ld1w { z26.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ @@ -749,6 +769,18 @@ asm ( \ : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); +// PREFETCH_RESULT_L2_STORE (prefetch store to L2) +#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \ +{ \ +asm ( \ + "prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} // ADD_RESULT_INTERNAL #define ADD_RESULT_INTERNAL_A64FXf \ asm ( \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h index 9cf1c5db..08ac2966 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_double.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -31,6 +31,7 @@ Author: Nils Meyer #define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A) #define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) #define PF_GAUGE(A) +#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(A) #define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) #define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) #define LOCK_GAUGE(A) @@ -38,8 +39,11 @@ Author: Nils Meyer #define MASK_REGS DECLARATIONS_A64FXd #define COMPLEX_SIGNS(A) #define LOAD64(A,B) -#define SAVE_RESULT(A,B) RESULT_A64FXd(A) -#define MULT_2SPIN_DIR_PF(A,B) PREFETCH_GAUGE_L1(A); PREFETCH_CHIMU_L2(B); MULT_2SPIN_A64FXd(A); if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } +#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_CHIMU_L1(B); +#define MULT_2SPIN_DIR_PF(A,B) \ + MULT_2SPIN_A64FXd(A); \ + PREFETCH_CHIMU_L2(B); \ + if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } } #define MAYBEPERM(A,perm) { A ; } #define LOAD_CHI(base) LOAD_CHI_A64FXd(base) #define ZERO_PSI @@ -279,6 +283,17 @@ Author: Nils Meyer // PERM3 #define PERM3_A64FXd +// LOAD_GAUGE +#define LOAD_GAUGE \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +{ \ + U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ + U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ + U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ + U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ + U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ + U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ +} // MULT_2SPIN #define MULT_2SPIN_A64FXd(A) \ { \ @@ -574,6 +589,13 @@ Author: Nils Meyer result_31 = __svzero(result_31); \ result_32 = __svzero(result_32); +// PREFETCH_RESULT_L2_STORE (prefetch store to L2) +#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \ +{ \ + svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \ + svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \ + svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \ +} // ADD_RESULT_INTERNAL #define ADD_RESULT_INTERNAL_A64FXd \ result_00 = svadd_x(pg1, result_00, Chimu_00); \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h index 3d8b6bf5..db5555bc 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_single.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -31,6 +31,7 @@ Author: Nils Meyer #define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A) #define PREFETCH_GAUGE_L2(A) 
PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) #define PF_GAUGE(A) +#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(A) #define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) #define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) #define LOCK_GAUGE(A) @@ -38,8 +39,11 @@ Author: Nils Meyer #define MASK_REGS DECLARATIONS_A64FXf #define COMPLEX_SIGNS(A) #define LOAD64(A,B) -#define SAVE_RESULT(A,B) RESULT_A64FXf(A) -#define MULT_2SPIN_DIR_PF(A,B) PREFETCH_GAUGE_L1(A); PREFETCH_CHIMU_L2(B); MULT_2SPIN_A64FXf(A); if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } +#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_CHIMU_L1(B); +#define MULT_2SPIN_DIR_PF(A,B) \ + MULT_2SPIN_A64FXf(A); \ + PREFETCH_CHIMU_L2(B); \ + if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } } #define MAYBEPERM(A,perm) { A ; } #define LOAD_CHI(base) LOAD_CHI_A64FXf(base) #define ZERO_PSI @@ -285,6 +289,17 @@ Author: Nils Meyer Chi_11 = svtbl(Chi_11, table0); \ Chi_12 = svtbl(Chi_12, table0); +// LOAD_GAUGE +#define LOAD_GAUGE \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +{ \ + U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ + U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ + U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ + U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ + U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ + U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ +} // MULT_2SPIN #define MULT_2SPIN_A64FXf(A) \ { \ @@ -580,6 +595,13 @@ Author: Nils Meyer result_31 = __svzero(result_31); \ result_32 = __svzero(result_32); +// PREFETCH_RESULT_L2_STORE (prefetch store to L2) +#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \ +{ \ + svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \ + svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \ + svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \ +} // ADD_RESULT_INTERNAL #define ADD_RESULT_INTERNAL_A64FXf \ result_00 = svadd_x(pg1, result_00, Chimu_00); \ diff --git a/Grid/simd/Fujitsu_A64FX_undef.h b/Grid/simd/Fujitsu_A64FX_undef.h index 12ade6e2..07e30535 100644 --- a/Grid/simd/Fujitsu_A64FX_undef.h +++ b/Grid/simd/Fujitsu_A64FX_undef.h @@ -36,6 +36,8 @@ Author: Nils Meyer #undef PF_GAUGE #undef PREFETCH1_CHIMU #undef PREFETCH_CHIMU +#undef PREFETCH_RESULT_L2_STORE +#undef LOAD_GAUGE #undef LOCK_GAUGE #undef UNLOCK_GAUGE #undef MASK_REGS diff --git a/Grid/simd/gridverter.py b/Grid/simd/gridverter.py new file mode 100755 index 00000000..415f5578 --- /dev/null +++ b/Grid/simd/gridverter.py @@ -0,0 +1,2410 @@ +#!/usr/bin/python3 + +import re +import argparse +import sys + +# Grid for A64FX +# +# * should align std::vector to (multiples of) cache block size = 256 bytes + +# place benchmark runtime in cycles here ! 
+measured_cycles = 690 #1500 #775 #1500
+
+
+# command line parser
+parser = argparse.ArgumentParser(description="Dslash generator.")
+parser.add_argument("--single", action="store_true", default=False)
+parser.add_argument("--double", action="store_true", default=True)
+parser.add_argument("--debug", action="store_true", default=False)
+parser.add_argument("--gridbench", action="store_true", default=False)
+args = parser.parse_args()
+
+print(args)
+
+ASM_LOAD_CHIMU = True # load chimu
+ASM_LOAD_GAUGE = True # load gauge
+ASM_LOAD_TABLE = True # load table
+ASM_STORE = True # store result
+
+# Disable all loads and stores in asm for benchmarking purposes
+#DISABLE_ASM_LOAD_STORE = True
+DISABLE_ASM_LOAD_STORE = False
+
+if DISABLE_ASM_LOAD_STORE:
+    ASM_LOAD_CHIMU = True # load chimu
+    ASM_LOAD_GAUGE = True # load gauge
+    ASM_LOAD_TABLE = True # load table
+    ASM_STORE = False # store result
+
+# Alternative implementation using PROJ specific loads works,
+# but be careful with predication
+
+ALTERNATIVE_LOADS = False
+#ALTERNATIVE_LOADS = not ALTERNATIVE_LOADS # True
+
+# Alternative register mapping,
+# must use with my_wilson4.h and my_wilson4pf.h
+
+ALTERNATIVE_REGISTER_MAPPING = False
+ALTERNATIVE_REGISTER_MAPPING = not ALTERNATIVE_REGISTER_MAPPING
+
+if ALTERNATIVE_REGISTER_MAPPING == True:
+    ALTERNATIVE_LOADS = False
+
+# use movprfx
+MOVPRFX = False
+MOVPRFX = not MOVPRFX
+
+
+PREFETCH = False
+PREFETCH = not PREFETCH # True
+
+PRECISION = 'double' # DP by default
+PRECSUFFIX = 'A64FXd'
+if args.single == True:
+    PRECISION = 'single'
+    PRECSUFFIX = 'A64FXf'
+
+_DEBUG = False #True # insert debugging output
+if args.debug == True:
+    _DEBUG = True
+
+GRIDBENCH = False
+if args.gridbench == True:
+    GRIDBENCH = True
+
+print("PRECISION = ", PRECISION)
+print("DEBUG = ", _DEBUG)
+print("ALTERNATIVE_LOADS = ", ALTERNATIVE_LOADS)
+print("ALTERNATIVE_REGISTER_MAPPING = ", ALTERNATIVE_REGISTER_MAPPING)
+print("MOVPRFX = ", MOVPRFX)
+print("DISABLE_ASM_LOAD_STORE = ", DISABLE_ASM_LOAD_STORE)
+print("GRIDBENCH = ", GRIDBENCH)
+
+print("")
+
+#sys.exit(0)
+
+
+#_DEBUG = True # insert debugging output
+
+FETCH_BASE_PTR_COLOR_OFFSET = 2 # offset for scalar plus signed immediate addressing
+STORE_BASE_PTR_COLOR_OFFSET = 2
+# (the offset of 2 rows advances the base pointer so that all 9 color entries of a
+#  3x3 link matrix map to vl-scaled immediates in [-8, 7], the range allowed by
+#  ld1/st1 scalar-plus-immediate addressing)
+
+# 64-bit gp register usage !!! armclang 20.0 complains about the register choice !!!
+# table address: x30
+# data address: x29
+# store address: x28
+# debug address: r8
+
+# Max performance of complex FMA using FCMLA instruction
+# is 25% peak.
+#
+# Issue latency of FCMLA is 2 cycles.
+# Need 2 FCMLA instructions for complex FMA.
+# Complete complex FMA takes 4 cycles.
+# Peak throughput is 4 * 8 Flops DP = 32 Flops DP in 4 cycles.
+# A64FX FMA throughput is 4 * 8 * 2 * 2 = 128 Flops DP in 4 cycles.
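+# (i.e. the two FCMLAs of one complex FMA deliver 32 Flops DP per 4 cycles,
+#  against a machine FMA peak of 4 cycles * 8 DP lanes * 2 Flops per FMA
+#  * 2 FP pipelines = 128 Flops DP over the same window)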
+# -> 25% peak FMA +# +# In: 3x 512 bits = 192 bytes +# Out: 1x 512 bits = 64 bytes +# Tot: 4x 512 bits = 256 bytes +# +# 256 bytes * 2.2 GHz = 563.2 GB/s (base 10), 524 GB/s (base 2) + +OPT = """ +#ifdef INTERIOR + +#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + basep = st.GetPFInfo(nent,plocal); nent++; \ + if ( local ) { \ +-- LOAD64(%r10,isigns); \ + PROJ(base); \ +++ PF_GAUGE(Dir); \ + MAYBEPERM(PERMUTE_DIR,perm); \ + } else if ( st.same_node[Dir] ) { + LOAD_CHI(base); +++ PF_GAUGE(Dir); + } \ + base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ + if ( local || st.same_node[Dir] ) { \ + MULT_2SPIN_DIR_PF(Dir,basep); \ + PREFETCH_CHIMU(base); \ +-- LOAD64(%r10,isigns); \ + RECON; \ + } else { PREFETCH_CHIMU(base); } + +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ +-- PF_GAUGE(Xp); \ + PREFETCH1_CHIMU(base); \ + { ZERO_PSI; } \ + ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) + +#define RESULT(base,basep) SAVE_RESULT(base,basep); + +#endif +""" + +filename = 'XXX' +LEGAL = """/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: {} + + Copyright (C) 2020 + +Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +""" + +class Register: + + def __init__(self, variable, asmreg='X', predication=False): + global d + x = 'Y' + if predication == False: + x = asmreg # + d['asmsuffix'] + else: + x = asmreg + self.asmreg = x + self.asmregwithsuffix = asmreg + d['asmsuffix'] + self.asmregbyte = asmreg + '.b' + self.name = variable + self.asmname = variable + self.asmnamebyte = variable + '.b' + self.predication = predication + + d['registers'] += 1 + + def define(self, statement): + global d + d['C'] += F'#define {self.name} {statement}' + #d['A'] += F'#define {self.name} {statement}' + + def declare(self, predication=False): + global d + + if self.predication == False: + d['C'] += F' Simd {self.name}; \\\n' + + predtype = 'svfloat64_t' + if PRECISION == 'single': + predtype = 'svfloat32_t' + + d['I'] += F' {predtype} {self.name}; \\\n' + else: + d['I'] += F' svbool_t {self.name}; \\\n' + #d['A'] += F'#define {self.name} {self.asmreg} \n' + + def loadpredication(self, target='A'): + global d + if (target == 'A'): + d['A'] += F' "ptrue {self.asmregwithsuffix} \\n\\t" \\\n' + d['asmclobber'].append(F'"{self.asmreg}"') + + def loadtable(self, t): + global d + d['load'] += d['factor'] + gpr = d['asmtableptr'] + + cast = 'uint64_t' + asm_opcode = 'ld1d' + if PRECISION == 'single': + asm_opcode = 'ld1w' + cast = 'uint32_t' + + d['I'] += F' {self.name} = svld1(pg1, ({cast}*)&lut[{t}]); \\\n' + + # using immediate index break-out works + d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}/z, [%[tableptr], %[index], mul vl] \\n\\t" \\\n' + d['asminput'].append(F'[tableptr] "r" (&lut[0])') + d['asminput'].append(F'[index] "i" ({t})') + d['asmclobber'].append(F'"memory"') + d['asmclobber'].append(F'"cc"') + + def load(self, address, target='ALL', cast='float64_t', colors=3, offset=FETCH_BASE_PTR_COLOR_OFFSET): + global d + d['load'] += d['factor'] + indices = re.findall(r'\d+', address) + index = (int(indices[0]) - offset) * colors + int(indices[1]) + + asm_opcode = 'ld1d' + if PRECISION == 'single': + asm_opcode = 'ld1w' + cast = 'float32_t' + + gpr = d['asmfetchbaseptr'] + intrinfetchbase = d['intrinfetchbase'] + if (target in ['ALL', 'C']): + d['C'] += F' {self.name} = {address}; \\\n' + if (target in ['ALL', 'I']): + d['I'] += F' {self.name} = svld1(pg1, ({cast}*)({intrinfetchbase} + {index} * 64)); \\\n' + if (target in ['ALL', 'A']): + d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}/z, [%[fetchptr], {index}, mul vl] \\n\\t" \\\n' + + def store(self, address, cast='float64_t', colors=3, offset=STORE_BASE_PTR_COLOR_OFFSET): + global d + d['store'] += d['factor'] + indices = re.findall(r'\d+', address) + index = (int(indices[0]) - offset) * colors + int(indices[1]) + + asm_opcode = 'stnt1d' + if PRECISION == 'single': + asm_opcode = 'stnt1w' + cast = 'float32_t' + + intrinstorebase = d['intrinstorebase'] + + d['C'] += F' {address} = {self.name}; \\\n' + d['I'] += F' svstnt1(pg1, ({cast}*)({intrinstorebase} + {index} * 64), {self.name}); \\\n' + d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}, [%[storeptr], {index}, mul vl] \\n\\t" \\\n' + + def movestr(self, str): + global d + #d['move'] += d['factor'] + d['I'] += F' {self.name} = {str}; \\\n' + + def move(self, op1): + global d + d['move'] += d['factor'] + d['C'] += F' {self.name} = {op1.name}; \\\n' + d['I'] += 
F' {self.name} = {op1.name}; \\\n' + d['A'] += F' "mov {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' + + # a = a + b , a = b + c + def add(self, op1, op2=None): + global d + d['add'] += d['factor'] + if op2 is None: + d['C'] += F' {self.name} = {self.name} + {op1.name}; \\\n' + d['I'] += F' {self.name} = svadd_x(pg1, {self.name}, {op1.name}); \\\n' + d['A'] += F' "fadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' + else: + d['C'] += F' {self.name} = {op1.name} + {op2.name}; \\\n' + d['I'] += F' {self.name} = svadd_x(pg1, {op1.name}, {op2.name}); \\\n' + d['A'] += F' "fadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix} \\n\\t" \\\n' + + # a = a -b , a = b - c + def sub(self, op1, op2=None): + global d + d['sub'] += d['factor'] + if op2 is None: + d['C'] += F' {self.name} = {self.name} - {op1.name}; \\\n' + d['I'] += F' {self.name} = svsub_x(pg1, {self.name}, {op1.name}); \\\n' + d['A'] += F' "fsub {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' + else: + d['C'] += F' {self.name} = {op1.name} - {op2.name}; \\\n' + d['I'] += F' {self.name} = svsub_x(pg1, {op1.name}, {op2.name}); \\\n' + d['A'] += F' "fsub {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix} \\n\\t" \\\n' + + # a = a * b , a = b * c + def mul(self, op1, op2): + global d + d['mul'] += 2 * d['factor'] + d['C'] += F' {self.name} = {op1.name} * {op2.name}; \\\n' + d['I'] += F' {self.name} = __svzero({self.name}); \\\n' + d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' + d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' + d['A'] += F' "mov {self.asmregwithsuffix} , 0 \\n\\t" \\\n' + d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' + d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' + + def mul0(self, op1, op2, op3=None, constructive=False): + global d + d['mul'] += d['factor'] + + # no movprfx intrinsics support + if constructive == True: + d['movprfx'] += d['factor'] + d['I'] += F' {self.name} = svcmla_x(pg1, {op1.name}, {op2.name}, {op3.name}, 0); \\\n' + d['A'] += F' "movprfx {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' + d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op2.asmregwithsuffix}, {op3.asmregwithsuffix}, 0 \\n\\t" \\\n' + else: + d['C'] += F' {self.name} = {op1.name} * {op2.name}; \\\n' + d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' + d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' + + def mul1(self, op1, op2): + global d + d['mul'] += d['factor'] + d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' + d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' + + def mac(self, op1, op2): + global d + d['mac'] += 2 * d['factor'] + d['C'] += F' {self.name} = {self.name} + {op1.name} * {op2.name}; \\\n' + d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' + d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' + d['A'] += F' "fcmla 
{self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' + d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' + + def mac0(self, op1, op2): + global d + d['mac'] += d['factor'] + d['C'] += F' {self.name} = {self.name} + {op1.name} * {op2.name}; \\\n' + d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' + d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' + + def mac1(self, op1, op2): + global d + d['mac'] += d['factor'] + d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' + d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' + + def zero(self, zeroreg=False): + d['zero'] += d['factor'] + d['C'] += F' {self.name} = 0; \\\n' + d['I'] += F' {self.name} = __svzero({self.name}); \\\n' + + if zeroreg == True: + d['A'] += F' "fmov {self.asmregwithsuffix} , 0 \\n\\t" \\\n' + else: + #using mov z, zero0 issue 1c, FLA, latency 6c + #d['A'] += F' "mov {self.asmregwithsuffix} , {zero0.asmregwithsuffix} \\n\\t" \\\n' + + #using mov z, 0 issue 1c, FLA, latency 6c + d['A'] += F' "fmov {self.asmregwithsuffix} , 0 \\n\\t" \\\n' + + #using xor z, z, z issue 0.5c, FL*, latency 4c + #d['A'] += F' "eor {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {self.asmregwithsuffix} \\n\\t" \\\n' + + #using and z, z, zero0 issue 0.5c, FL*, latency 4c + #d['A'] += F' "and {self.asmregwithsuffix}, {self.asmregwithsuffix} , {zero0.asmregwithsuffix} \\n\\t" \\\n' + + #using sub z, z, z issue 0.5c, FL*, latency 9c + #d['A'] += F' "sub {self.asmregwithsuffix}, {self.asmregwithsuffix}, {self.asmregwithsuffix} \\n\\t" \\\n' + + # without table + def timesI(self, op1, tempreg=None, tablereg=None): + global d + d['timesI'] += d['factor'] + d['C'] += F' {self.name} = timesI({op1.name}); \\\n' + # correct if DEBUG enabled, wrong if DEBUG disabled; no idea what's causing this + #table.load('table2', target='I', cast='uint64_t') + #d['I'] += F' {self.name} = svtbl({op1.name}, {tablereg.name}); \\\n' + #d['I'] += F' {self.name} = svneg_x(pg2, {self.name}); \\\n' + # timesI using trn tested, works but tbl should be faster + d['I'] += F' {tempreg.name} = svtrn2({op1.name}, {op1.name}); \\\n' + d['I'] += F' {tempreg.name} = svneg_x(pg1, {tempreg.name}); \\\n' + d['I'] += F' {self.name} = svtrn1({tempreg.name}, {op1.name}); \\\n' + d['A'] += F' "trn2 {tempreg.asmregwithsuffix}, {op1.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' + d['A'] += F' "fneg {tempreg.asmregwithsuffix}, {pg1.asmreg}/m, {tempreg.asmregwithsuffix} \\n\\t" \\\n' + d['A'] += F' "trn1 {self.asmregwithsuffix}, {tempreg.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' + + def addTimesI(self, op1, op2=None, constructive=False): + global d + d['addTimesI'] += d['factor'] + + if op2 is None: + d['C'] += F' {self.name} = {self.name} + timesI({op1.name}); \\\n' + else: + d['C'] += F' {self.name} = {op1.name} + timesI({op2.name}); \\\n' + + # no movprfx intrinsics support + if constructive == True: + d['movprfx'] += d['factor'] + d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 90); \\\n' + d['A'] += F' "movprfx {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' + d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, 
{op2.asmregwithsuffix}, 90 \\n\\t" \\\n' + else: + if op2 is None: + d['C'] += F' {self.name} = {self.name} + timesI({op1.name}); \\\n' + d['I'] += F' {self.name} = svcadd_x(pg1, {self.name}, {op1.name}, 90); \\\n' + d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix}, 90 \\n\\t" \\\n' + else: + d['C'] += F' {self.name} = {op1.name} + timesI({op2.name}); \\\n' + d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 90); \\\n' + d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' + + def subTimesI(self, op1, op2=None, constructive=False): + global d + d['subTimesI'] += d['factor'] + + # no movprfx intrinsics support + if constructive == True: + d['movprfx'] += d['factor'] + d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 270); \\\n' + d['A'] += F' "movprfx {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' + d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op2.asmregwithsuffix}, 270 \\n\\t" \\\n' + else: + if op2 is None: + d['C'] += F' {self.name} = {self.name} - timesI({op1.name}); \\\n' + d['I'] += F' {self.name} = svcadd_x(pg1, {self.name}, {op1.name}, 270); \\\n' + d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix}, 270 \\n\\t" \\\n' + else: + d['C'] += F' {self.name} = {op1.name} - timesI({op2.name}); \\\n' + d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 270); \\\n' + d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 270 \\n\\t" \\\n' + + # timesMinusI is not used, def is probably wrong !!!! 
OPTIMIZATION with table + def timesMinusI(self, op1): + global d + d['timesMinusI'] += d['factor'] + d['C'] += F' {self.name} = timesMinusI({self.name}); \\\n' + d['I'] += F' {self.name} = svtrn1({op1.name}, {op1.name}); \\\n' + d['I'] += F' {self.name} = svneg_x(pg1, {self.name}); \\\n' + d['I'] += F' {self.name} = svtrn1({op1.name}, {self.name}); \\\n' + + def permute(self, dir, tablereg=None): + global d + d['permutes'] += d['factor'] + + d['C'] += F' permute{dir}({self.name}, {self.name}); \\\n' + + d['I'] += F' {self.name} = svtbl({self.name}, {tablereg.name}); \\\n' + d['A'] += F' "tbl {self.asmregwithsuffix}, {{ {self.asmregwithsuffix} }}, {tablereg.asmregwithsuffix} \\n\\t" \\\n' + + # if dir == 0: + # d['I'] += F' {self.name} = svext({self.name}, {self.name}, 4); \\\n' + # # this might not work, see intrinsics assembly + # # d['A'] += F' ext {self.name}, {self.name}, {self.name}, #4 \\\n' + # # use registers directly + # d['A'] += F' "ext {self.asmregbyte}, {self.asmregbyte}, {self.asmregbyte}, 32 \\n\\t" \\\n' + # + # elif dir in [1, 2]: + # d['I'] += F' {self.name} = svtbl({self.name}, {tablereg.name}); \\\n' + # d['A'] += F' "tbl {self.asmregwithsuffix}, {{ {self.asmregwithsuffix} }}, {tablereg.asmregwithsuffix} \\n\\t" \\\n' + + def debug(self): + global d + typecast = d['cfloat'] + gpr = d['asmdebugptr'] + vregs = d['asmclobberlist'] + if (d['debug'] == True): + d['C'] += F'std::cout << "{self.name} -- " << {self.name} << std::endl; \\\n' + + d['I'] += F'svst1(pg1, ({typecast}*)&debugreg.v, {self.name}); \\\n' + d['I'] += F'std::cout << "{self.name} -- " << debugreg << std::endl; \\\n' + #d['I'] += F'std::cout << "{self.name} -- " << {self.name} << std::endl; \\\n' + + d['A'] += F'asm ( \\\n' + d['A'] += F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\\n' # memory barrier + d['A'] += F' "str {self.asmreg}, [%[ptr]] \\n\\t" \\\n' + d['A'] += F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\\n' # memory barrier + d['A'] += F' : "=m" (debugreg.v) \\\n' + d['A'] += F' : [ptr] "r" (&debugreg.v) \\\n' + d['A'] += F' : "p5", "cc", "memory" \\\n' + d['A'] += F'); \\\n' + d['A'] += F'std::cout << "{self.name} -- " << debugreg << std::endl; \\\n' + # this form of addressing is not valid! 
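+            # ("%[ptr]" with the "r" constraint expands to the bare GPR
+            #  name; SVE STR needs the register-indirect form [xN], as in
+            #  the working "[%[ptr]]" variant above)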
+ #d['A'] += F' "str {self.asmreg}, %[ptr] \\n\\t" \\\n' +# end Register + +def define(s, target='ALL'): + x = F'#define {s} \n' + global d + if (target in ['ALL', 'C']): + d['C'] += x + if (target in ['ALL', 'I']): + d['I'] += x + if (target in ['ALL', 'A']): + d['A'] += x + +def definemultiline(s): + x = F'#define {s} \\\n' + global d + d['C'] += x + d['I'] += x + d['A'] += x + +def write(s, target='ALL'): + x = F'{s}\n' + global d + if (target in ['ALL', 'C']): + d['C'] += x + if (target in ['ALL', 'I']): + d['I'] += x + if (target in ['ALL', 'A']): + d['A'] += x + +def curlyopen(): + write(F'{{ \\') + +def curlyclose(): + write(F'}}') + +def newline(target='ALL'): + global d + + if target == 'A': + if d['A'][-2:] == '\\\n': + d['A'] = d['A'][:-2] + '\n\n' + else: + if d['C'][-2:] == '\\\n': + d['C'] = d['C'][:-2] + '\n\n' + if d['I'][-2:] == '\\\n': + d['I'] = d['I'][:-2] + '\n\n' + if d['A'][-2:] == '\\\n': + d['A'] = d['A'][:-2] + '\n\n' + +# load the base pointer for fetches +def fetch_base_ptr(address, target='A'): + global d + #d['load'] += d['factor'] + + # DEBUG + #colors=3 + #indices = re.findall(r'\d+', address) + #index = (int(indices[0]) - FETCH_BASE_PTR_COLOR_OFFSET) * colors + int(indices[1]) + #print(F'{address} (base)') + + vregs = d['asmclobberlist'] + if target == 'A': + d['asminput'].append(F'[fetchptr] "r" ({address})') + d['asmclobber'].extend(vregs) + d['asmclobber'].append(F'"memory"') + d['asmclobber'].append(F'"cc"') + if target == 'I': + #print("intrinfetchbase = ", address) + d['intrinfetchbase'] = address + +# load the base pointer for stores +def store_base_ptr(address, target='A'): + global d + #d['load'] += d['factor'] + gpr = d['asmstorebaseptr'] + vregs = d['asmclobberlist'] + if target == 'A': + d['asminput'].append(F'[storeptr] "r" ({address})') + d['asmclobber'].extend(vregs) + d['asmclobber'].append(F'"memory"') + d['asmclobber'].append(F'"cc"') + if target == 'I': + d['intrinstorebase'] = address + +def prefetch_L1(address, offset): + global d + multiplier = 4 # offset in CL, have to multiply by 4 + policy = "PLDL1STRM" # weak + #policy = "PLDL1KEEP" # strong + + d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' + d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' + +def prefetch_L2(address, offset): + global d + multiplier = 4 # offset in CL, have to multiply by 4 + policy = "PLDL2STRM" # weak + #policy = "PLDL2KEEP" # strong + + d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' + d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' + #d['A'] += + +def prefetch_L2_store(address, offset): + global d + multiplier = 4 # offset in CL, have to multiply by 4 + policy = "PSTL2STRM" # weak + #policy = "PSTL2KEEP" # strong + + d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' + d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' + #d['A'] += + +def asmopen(): + #write('asm volatile ( \\', target='A') + write('asm ( \\', target='A') + + # DEBUG + #write(F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\', target='A') # memory barrier + #write('asm volatile ( \\', target='A') + +def asmclose(): + global d + + #print(d['asminput']) + + asmin = d['asminput'] + asmin_s = '' + if len(asmin) > 0: + asmin = list(dict.fromkeys(asmin)) # remove duplicates + #print(asmin) + 
for el in asmin: + asmin_s += el + ',' + asmin_s = asmin_s[:-1] + #print("-> ", asmin_s) + + d['asminput'] = [] + + asmout = d['asmoutput'] + asmout_s = '' + if len(asmout) > 0: + asmout = list(dict.fromkeys(asmout)) # remove duplicates + for el in asmout: + asmout_s += el + ',' + asmout_s = asmout_s[:-1] + + d['asmoutput'] = [] + + # DEBUG put all regs into clobber by default + d['asmclobber'].extend(d['asmclobberlist']) + + asmclobber = d['asmclobber'] + asmclobber_s = '' + #print(asmclobber) + if len(asmclobber) > 0: + asmclobber = list(dict.fromkeys(asmclobber)) # remove duplicates + for el in asmclobber: + asmclobber_s += el + ',' + asmclobber_s = asmclobber_s[:-1] + + d['asmclobber'] = [] + + # DEBUG + #write(F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\', target='A') # memory barrier + + + write(F' : {asmout_s} \\', target='A') + write(F' : {asmin_s} \\', target='A') + write(F' : {asmclobber_s} \\', target='A') + write('); \\', target='A') + +# -------------------------------------------------------------------------------- + +# string of vector registers to be used in clobber list +#clobberlist = ['"p0"'] +clobberlist = ['"p5"'] +clobberlist.append('"cc"') +for i in range(0, 32): + clobberlist.append(F'"z{i}"') + +d = { +'debug': _DEBUG, +'C': '', +'I': '', +'A': '', +'asmsuffix': '.d', # double precision by default +'cfloat': 'float64_t', +'registers': 0, +'load': 0, +'store': 0, +'move': 0, +'movprfx': 0, +'zero': 0, +'add': 0, +'sub': 0, +'mul': 0, +'mac': 0, +'permutes': 0, +'neg': 0, +'addTimesI': 0, +'subTimesI': 0, +'timesI': 0, +'timesMinusI': 0, +'flops': 0, +'factor': 1, # multiplicity +'asmtableptr': 'x30', +'asmfetchbaseptr': 'x29', +'asmstorebaseptr': 'x28', +'asmdebugptr': 'r12', +'asminput': [], +'asmoutput': [], +'asmclobber': [], +'asmclobberlist': clobberlist, +'intrinfetchbase': '', +'intrinstorebase': '', +'cycles_LOAD_CHIMU': 0, +'cycles_PROJ': 0, +'cycles_PERM': 0, +'cycles_MULT_2SPIN': 0, +'cycles_RECON': 0, +'cycles_RESULT': 0, +'cycles_ZERO_PSI': 0, +'cycles_PREFETCH_L1': 0, +'cycles_PREFETCH_L2': 0 +} + +if PRECISION == 'single': + d['asmsuffix'] = '.s' + d['cfloat'] = 'float32_t' + +# -------------------------------------------------------------------------------- +# Grid +# -------------------------------------------------------------------------------- + +# Variables / Registers +result_00 = Register('result_00', asmreg='z0') +result_01 = Register('result_01', asmreg='z1') +result_02 = Register('result_02', asmreg='z2') +result_10 = Register('result_10', asmreg='z3') +result_11 = Register('result_11', asmreg='z4') +result_12 = Register('result_12', asmreg='z5') +result_20 = Register('result_20', asmreg='z6') +result_21 = Register('result_21', asmreg='z7') +result_22 = Register('result_22', asmreg='z8') +result_30 = Register('result_30', asmreg='z9') +result_31 = Register('result_31', asmreg='z10') +result_32 = Register('result_32', asmreg='z11') # 12 Regs +Chi_00 = Register('Chi_00', asmreg='z12') +Chi_01 = Register('Chi_01', asmreg='z13') +Chi_02 = Register('Chi_02', asmreg='z14') +Chi_10 = Register('Chi_10', asmreg='z15') +Chi_11 = Register('Chi_11', asmreg='z16') +Chi_12 = Register('Chi_12', asmreg='z17') # 6 +UChi_00 = Register('UChi_00', asmreg='z18') +UChi_01 = Register('UChi_01', asmreg='z19') +UChi_02 = Register('UChi_02', asmreg='z20') +UChi_10 = Register('UChi_10', asmreg='z21') +UChi_11 = Register('UChi_11', asmreg='z22') +UChi_12 = Register('UChi_12', asmreg='z23') # 6 +U_00 = Register('U_00', asmreg='z24') +U_10 = 
Register('U_10', asmreg='z25') +U_20 = Register('U_20', asmreg='z26') +U_01 = Register('U_01', asmreg='z27') +U_11 = Register('U_11', asmreg='z28') +U_21 = Register('U_21', asmreg='z29') # 6 -> 30 Registers + +table0 = Register('table0', asmreg='z30') +zero0 = Register('zero0', asmreg='z31') # 2 -> 32 Registers +# can't overload temp1 / table due to type mismatch using intrinsics :( +# typecasting SVE intrinsics variables is not allowed + +pg1 = Register('pg1', predication=True, asmreg='p5') +#pg2 = Register('pg2', predication=True, asmreg='p1') + +# Overloaded with Chi_* and UChi_* +Chimu_00 = Register('Chimu_00', asmreg=Chi_00.asmreg) +Chimu_01 = Register('Chimu_01', asmreg=Chi_01.asmreg) +Chimu_02 = Register('Chimu_02', asmreg=Chi_02.asmreg) +Chimu_10 = Register('Chimu_10', asmreg=Chi_10.asmreg) +Chimu_11 = Register('Chimu_11', asmreg=Chi_11.asmreg) +Chimu_12 = Register('Chimu_12', asmreg=Chi_12.asmreg) +if ALTERNATIVE_REGISTER_MAPPING == False: + Chimu_20 = Register('Chimu_20', asmreg=UChi_00.asmreg) + Chimu_21 = Register('Chimu_21', asmreg=UChi_01.asmreg) + Chimu_22 = Register('Chimu_22', asmreg=UChi_02.asmreg) + Chimu_30 = Register('Chimu_30', asmreg=UChi_10.asmreg) + Chimu_31 = Register('Chimu_31', asmreg=UChi_11.asmreg) + Chimu_32 = Register('Chimu_32', asmreg=UChi_12.asmreg) # 12 Registers +else: # wilson4.h + Chimu_20 = Register('Chimu_20', asmreg=U_00.asmreg) + Chimu_21 = Register('Chimu_21', asmreg=U_10.asmreg) + Chimu_22 = Register('Chimu_22', asmreg=U_20.asmreg) + Chimu_30 = Register('Chimu_30', asmreg=U_01.asmreg) + Chimu_31 = Register('Chimu_31', asmreg=U_11.asmreg) + Chimu_32 = Register('Chimu_32', asmreg=U_21.asmreg) + +# debugging output +def debugall(msg=None, group='ALL'): + global d + if (d['debug'] == False): + return + write(F'std::cout << std::endl << "DEBUG -- {msg}" << std::endl; \\') + if (group in ['ALL', 'result']): + result_00.debug() + result_01.debug() + result_02.debug() + result_10.debug() + result_11.debug() + result_12.debug() + result_20.debug() + result_21.debug() + result_22.debug() + result_30.debug() + result_31.debug() + result_32.debug() + if (group in ['ALL', 'Chi']): + Chi_00.debug() + Chi_01.debug() + Chi_02.debug() + Chi_10.debug() + Chi_11.debug() + Chi_12.debug() + if (group in ['ALL', 'UChi']): + UChi_00.debug() + UChi_01.debug() + UChi_02.debug() + UChi_10.debug() + UChi_11.debug() + UChi_12.debug() + if (group in ['ALL', 'U']): + U_00.debug() + U_10.debug() + U_20.debug() + U_01.debug() + U_11.debug() + U_21.debug() + if (group in ['ALL', 'Chimu']): + Chimu_00.debug() + Chimu_01.debug() + Chimu_02.debug() + Chimu_10.debug() + Chimu_11.debug() + Chimu_12.debug() + Chimu_20.debug() + Chimu_21.debug() + Chimu_22.debug() + Chimu_30.debug() + Chimu_31.debug() + Chimu_32.debug() + +# -------------------------------------------------------------------------------- +# Output +# -------------------------------------------------------------------------------- + +if ALTERNATIVE_LOADS == True: + define(F'LOAD_CHIMU_0213_PLUG LOAD_CHIMU_0213_{PRECSUFFIX}') + define(F'LOAD_CHIMU_0312_PLUG LOAD_CHIMU_0312_{PRECSUFFIX}') + define(F'LOAD_CHIMU(x)') +else: + define(F'LOAD_CHIMU_{PRECSUFFIX}(x) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(x)') + +if PREFETCH: + define(F'PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_{PRECSUFFIX}(A)') + define(F'PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_{PRECSUFFIX}(A)') + define(F'PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_{PRECSUFFIX}(A)') + define(F'PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_{PRECSUFFIX}(A)') + 
define(F'PF_GAUGE(A)') + define(F'PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_{PRECSUFFIX}(A)') + define(F'PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A)') +# define(F'PREFETCH1_CHIMU(A)') + define(F'PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A)') +else: + define(F'PREFETCH_CHIMU_L1(A)') + define(F'PREFETCH_GAUGE_L1(A)') + define(F'PREFETCH_CHIMU_L2(A)') + define(F'PREFETCH_GAUGE_L2(A)') + define(F'PF_GAUGE(A)') + define(F'PREFETCH1_CHIMU(A)') + define(F'PREFETCH_CHIMU(A)') + define(F'PREFETCH_RESULT_L2_STORE(A)') + +# standard defines +define(F'LOCK_GAUGE(A)') +define(F'UNLOCK_GAUGE(A)') +define(F'MASK_REGS DECLARATIONS_{PRECSUFFIX}') +define(F'COMPLEX_SIGNS(A)') +define(F'LOAD64(A,B)') +#define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_RESULT_L2_STORE(B);') +define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_CHIMU_L1(B);') +if PREFETCH: + definemultiline(F'MULT_2SPIN_DIR_PF(A,B) ') + write (F' MULT_2SPIN_{PRECSUFFIX}(A); \\') + write (F' PREFETCH_CHIMU_L2(B); \\') + write (F' if (s == 0) {{ if ((A == 0) || (A == 4)) {{ PREFETCH_GAUGE_L2(A); }} }}') + +# definemultiline(F'MULT_2SPIN_DIR_PF(A,B) PREFETCH_GAUGE_L1(A);') +# write (F' PREFETCH_CHIMU_L2(B); \\') +# write (F' MULT_2SPIN_{PRECSUFFIX}(A); \\') +# write (F' if (s == 0) {{ if ((A == 0) || (A == 4)) {{ PREFETCH_GAUGE_L2(A); }} }}') + newline() +else: + define(F'MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_{PRECSUFFIX}(A)') +# break out maybeperm in permutes +#define(F'MAYBEPERM(A,perm) if (perm) {{ A ; }}') +define(F'MAYBEPERM(A,perm) {{ A ; }}') +define(F'LOAD_CHI(base) LOAD_CHI_{PRECSUFFIX}(base)') +# don't need zero psi, everything is done in recons +#define(F'ZERO_PSI ZERO_PSI_{PRECSUFFIX}') +define(F'ZERO_PSI') +define(F'ADD_RESULT(base,basep) LOAD_CHIMU_{PRECSUFFIX}(base); ADD_RESULT_INTERNAL_{PRECSUFFIX}; RESULT_{PRECSUFFIX}(base)') +# loads projections +define(F'XP_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); XP_PROJ_{PRECSUFFIX}') +define(F'YP_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); YP_PROJ_{PRECSUFFIX}') +define(F'ZP_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); ZP_PROJ_{PRECSUFFIX}') +define(F'TP_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); TP_PROJ_{PRECSUFFIX}') +define(F'XM_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); XM_PROJ_{PRECSUFFIX}') +define(F'YM_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); YM_PROJ_{PRECSUFFIX}') +define(F'ZM_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); ZM_PROJ_{PRECSUFFIX}') +define(F'TM_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); TM_PROJ_{PRECSUFFIX}') +# recons +define(F'XP_RECON XP_RECON_{PRECSUFFIX}') +define(F'XM_RECON XM_RECON_{PRECSUFFIX}') +define(F'XM_RECON_ACCUM XM_RECON_ACCUM_{PRECSUFFIX}') +define(F'YM_RECON_ACCUM YM_RECON_ACCUM_{PRECSUFFIX}') +define(F'ZM_RECON_ACCUM ZM_RECON_ACCUM_{PRECSUFFIX}') +define(F'TM_RECON_ACCUM TM_RECON_ACCUM_{PRECSUFFIX}') +define(F'XP_RECON_ACCUM XP_RECON_ACCUM_{PRECSUFFIX}') +define(F'YP_RECON_ACCUM YP_RECON_ACCUM_{PRECSUFFIX}') +define(F'ZP_RECON_ACCUM ZP_RECON_ACCUM_{PRECSUFFIX}') +define(F'TP_RECON_ACCUM TP_RECON_ACCUM_{PRECSUFFIX}') +# permutes +define(F'PERMUTE_DIR0 LOAD_TABLE0; if (perm) {{ PERM0_{PRECSUFFIX}; }}') +define(F'PERMUTE_DIR1 LOAD_TABLE1; if (perm) {{ PERM1_{PRECSUFFIX}; }}') +define(F'PERMUTE_DIR2 LOAD_TABLE2; if (perm) {{ PERM2_{PRECSUFFIX}; }}') +if PRECISION == 'double': + define(F'PERMUTE_DIR3') +else: + define(F'PERMUTE_DIR3 LOAD_TABLE3; if (perm) {{ PERM3_{PRECSUFFIX}; }}') + + +write('// DECLARATIONS') +definemultiline(F'DECLARATIONS_{PRECSUFFIX}') +# debugging register +if d['debug'] == True: + write(' Simd 
debugreg; \\') +# perm tables +if PRECISION == 'double': + write(' const uint64_t lut[4][8] = { \\') + write(' {4, 5, 6, 7, 0, 1, 2, 3}, \\') #0 = swap register halves + write(' {2, 3, 0, 1, 6, 7, 4, 5}, \\') #1 = swap halves of halves + write(' {1, 0, 3, 2, 5, 4, 7, 6}, \\') #2 = swap re/im + write(' {0, 1, 2, 4, 5, 6, 7, 8} };\\') #3 = identity +else: + write(' const uint32_t lut[4][16] = { \\') + write(' {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \\') #0 = swap register halves + write(' {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \\') #1 = swap halves of halves + write(' {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \\') #2 = swap halves of halves of halves + write(' {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \\') #3 = swap re/im + +#newline(target='A') +result_00.declare() +result_01.declare() +result_02.declare() +result_10.declare() +result_11.declare() +result_12.declare() +result_20.declare() +result_21.declare() +result_22.declare() +result_30.declare() +result_31.declare() +result_32.declare() # 12 +Chi_00.declare() +Chi_01.declare() +Chi_02.declare() +Chi_10.declare() +Chi_11.declare() +Chi_12.declare() # 6 +UChi_00.declare() +UChi_01.declare() +UChi_02.declare() +UChi_10.declare() +UChi_11.declare() +UChi_12.declare() # 6 +U_00.declare() +U_10.declare() +U_20.declare() +U_01.declare() +U_11.declare() +U_21.declare() # 6 -> 30 regs + +# all true +pg1.declare() +if PRECISION == 'double': + pg1.movestr('svptrue_b64()') +else: + pg1.movestr('svptrue_b32()') + +# even elements only +#pg2.declare() +#pg2.movestr('svzip1_b64(svptrue_b64(), svpfalse_b())') + +# preload tables +# 0: swap +# 1: permute 1 +if PRECISION == 'double': + write(' svuint64_t table0; \\', target='I') # -> 31 regs +else: + write(' svuint32_t table0; \\', target='I') # -> 31 regs + +zero0.declare() + +asmopen() +zero0.zero(zeroreg=True) +asmclose() + +newline() + +define('Chimu_00 Chi_00', target='I') +define('Chimu_01 Chi_01', target='I') +define('Chimu_02 Chi_02', target='I') +define('Chimu_10 Chi_10', target='I') +define('Chimu_11 Chi_11', target='I') +define('Chimu_12 Chi_12', target='I') +if ALTERNATIVE_REGISTER_MAPPING == False: + define('Chimu_20 UChi_00', target='I') + define('Chimu_21 UChi_01', target='I') + define('Chimu_22 UChi_02', target='I') + define('Chimu_30 UChi_10', target='I') + define('Chimu_31 UChi_11', target='I') + define('Chimu_32 UChi_12', target='I') +else: # wilson4.h + define('Chimu_20 U_00', target='I') + define('Chimu_21 U_10', target='I') + define('Chimu_22 U_20', target='I') + define('Chimu_30 U_01', target='I') + define('Chimu_31 U_11', target='I') + define('Chimu_32 U_21', target='I') + +newline() + + +d['cycles_RESULT'] += 12 +write('// RESULT') +definemultiline(F'RESULT_{PRECSUFFIX}(base)') +if ASM_STORE: + curlyopen() + #write(' SiteSpinor & ref(out[ss]); \\') + asmopen() + #pg1.loadpredication() + #store_base_ptr("&ref[0][0]") + #store_base_ptr(F"&ref[{STORE_BASE_PTR_COLOR_OFFSET}][0]") + store_base_ptr(F"base + {STORE_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') + store_base_ptr(F"base + {STORE_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') + result_00.store("ref[0][0]") + result_01.store("ref[0][1]") + result_02.store("ref[0][2]") + result_10.store("ref[1][0]") + result_11.store("ref[1][1]") + result_12.store("ref[1][2]") + result_20.store("ref[2][0]") + result_21.store("ref[2][1]") + result_22.store("ref[2][2]") + result_30.store("ref[3][0]") + result_31.store("ref[3][1]") + result_32.store("ref[3][2]") + asmclose() + 
debugall('RESULT', group='result') + curlyclose() +newline() + +# prefetch spinors from memory into L2 cache +d['factor'] = 0 +d['cycles_PREFETCH_L2'] += 0 * d['factor'] +write('// PREFETCH_CHIMU_L2 (prefetch to L2)') +definemultiline(F'PREFETCH_CHIMU_L2_INTERNAL_{PRECSUFFIX}(base)') +curlyopen() +fetch_base_ptr(F"base") +asmopen() +#pg1.loadpredication() +#fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") +fetch_base_ptr(F"base", target='A') +prefetch_L2(F"base", 0) +prefetch_L2(F"base", 1) +prefetch_L2(F"base", 2) +asmclose() +curlyclose() +newline() + +# prefetch spinors from memory into L1 cache +d['factor'] = 0 +d['cycles_PREFETCH_L1'] += 0 * d['factor'] +write('// PREFETCH_CHIMU_L1 (prefetch to L1)') +definemultiline(F'PREFETCH_CHIMU_L1_INTERNAL_{PRECSUFFIX}(base)') +curlyopen() +fetch_base_ptr(F"base") +asmopen() +#pg1.loadpredication() +fetch_base_ptr(F"base", target='A') +prefetch_L1(F"base", 0) +prefetch_L1(F"base", 1) +prefetch_L1(F"base", 2) +asmclose() +curlyclose() +newline() + +# prefetch gauge from memory into L2 cache +d['factor'] = 0 +d['cycles_PREFETCH_L2'] += 0 * d['factor'] +write('// PREFETCH_GAUGE_L2 (prefetch to L2)') +definemultiline(F'PREFETCH_GAUGE_L2_INTERNAL_{PRECSUFFIX}(A)') +curlyopen() +if GRIDBENCH: # referencing differs in Grid and GridBench + write(' const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \\') +else: + write(' const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \\') +asmopen() +#pg1.loadpredication() +#fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") +fetch_base_ptr(F"baseU", target='A') +prefetch_L2(F"baseU", -1) +prefetch_L2(F"baseU", 0) +prefetch_L2(F"baseU", 1) +prefetch_L2(F"baseU", 2) +prefetch_L2(F"baseU", 3) +prefetch_L2(F"baseU", 4) +prefetch_L2(F"baseU", 5) +prefetch_L2(F"baseU", 6) +prefetch_L2(F"baseU", 7) +#prefetch_L2(F"baseU", 8) +asmclose() +curlyclose() +newline() + +# prefetch gauge from memory into L1 cache +d['factor'] = 0 +d['cycles_PREFETCH_L1'] += 0 * d['factor'] +write('// PREFETCH_GAUGE_L1 (prefetch to L1)') +definemultiline(F'PREFETCH_GAUGE_L1_INTERNAL_{PRECSUFFIX}(A)') +curlyopen() +if GRIDBENCH: # referencing differs in Grid and GridBench + write(' const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref; \\') +else: + write(' const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \\') +asmopen() +#pg1.loadpredication() +#fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") +fetch_base_ptr(F"baseU", target='A') +prefetch_L1(F"baseU", 0) +prefetch_L1(F"baseU", 1) +prefetch_L1(F"baseU", 2) +asmclose() +curlyclose() +newline() + +d['factor'] = 0 +write('// LOAD_CHI') +definemultiline(F'LOAD_CHI_{PRECSUFFIX}(base)') +if ASM_LOAD_CHIMU: + curlyopen() + #write(' const SiteSpinor & ref(in[offset]); \\') + asmopen() + #fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') + #fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') + fetch_base_ptr(F"base", target='I') + fetch_base_ptr(F"base", target='A') + + Chi_00.load("ref[0][0]", offset=0) + Chi_01.load("ref[0][1]", offset=0) + Chi_02.load("ref[0][2]", offset=0) + Chi_10.load("ref[1][0]", offset=0) + Chi_11.load("ref[1][1]", offset=0) + Chi_12.load("ref[1][2]", offset=0) + asmclose() + debugall('LOAD_CHI', group='Chi') + curlyclose() +newline() + + + +d['factor'] = 8 +# 12 loads = 12 issues, load latency = 8+1 cycles +# (not perfectly clear to me from docs) +d['cycles_LOAD_CHIMU'] += 11 * d['factor'] +write('// LOAD_CHIMU') 
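+# For orientation: in Register.load() the color offset cancels between
+# the fetch base (base + offset * 3 * 64) and the computed index
+# ((row - offset) * 3 + col), so every Chimu_rc below is an svld1 from
+# base + (3 * row + col) * 64, one 64-byte vector per spin/color entry.
+# A tiny self-check of that addressing (hypothetical helper, not part
+# of the generator itself):
+def _chimu_byte_offset(row, col, colors=3, vl_bytes=64):
+    # the FETCH_BASE_PTR_COLOR_OFFSET terms cancel, so its value is moot
+    return (row * colors + col) * vl_bytes
+
+assert _chimu_byte_offset(0, 0) == 0        # Chimu_00 sits at base
+assert _chimu_byte_offset(3, 0) == 9 * 64   # Chimu_30, 9 vectors in
+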
+definemultiline(F'LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(base)') +if ASM_LOAD_CHIMU: + curlyopen() + #write(' const SiteSpinor & ref(in[offset]); \\') + asmopen() + pg1.loadpredication() + #fetch_base_ptr("&ref[0][0]") + #fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") + fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') + fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') + # Chimu_00.load("ref[0][0]") + # Chimu_01.load("ref[0][1]") + # Chimu_02.load("ref[0][2]") + # Chimu_10.load("ref[1][0]") + # Chimu_11.load("ref[1][1]") + # Chimu_12.load("ref[1][2]") + # Chimu_20.load("ref[2][0]") + # Chimu_21.load("ref[2][1]") + # Chimu_22.load("ref[2][2]") + # Chimu_30.load("ref[3][0]") + # Chimu_31.load("ref[3][1]") + # Chimu_32.load("ref[3][2]") + + Chimu_00.load("ref[0][0]") # minimum penalty for all directions + Chimu_30.load("ref[3][0]") + Chimu_10.load("ref[1][0]") + Chimu_20.load("ref[2][0]") + + Chimu_01.load("ref[0][1]") + Chimu_31.load("ref[3][1]") + Chimu_11.load("ref[1][1]") + Chimu_21.load("ref[2][1]") + + Chimu_02.load("ref[0][2]") + Chimu_32.load("ref[3][2]") + Chimu_12.load("ref[1][2]") + Chimu_22.load("ref[2][2]") + asmclose() + debugall('LOAD_CHIMU', group='Chimu') + curlyclose() +newline() + +# alternative load chimu: dirac order 0213 +# placed into asm (...) +d['factor'] = 0 +d['cycles_LOAD_CHIMU'] += 11 * d['factor'] +write('// LOAD_CHIMU_0213') +definemultiline(F'LOAD_CHIMU_0213_{PRECSUFFIX}') +if ASM_LOAD_CHIMU: + curlyopen() + write(' const SiteSpinor & ref(in[offset]); \\') + asmopen() + pg1.loadpredication() + fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") + Chimu_00.load("ref[0][0]") # reordered + Chimu_20.load("ref[2][0]") + + Chimu_01.load("ref[0][1]") + Chimu_21.load("ref[2][1]") + + Chimu_02.load("ref[0][2]") + Chimu_22.load("ref[2][2]") + + Chimu_10.load("ref[1][0]") + Chimu_30.load("ref[3][0]") + + Chimu_11.load("ref[1][1]") + Chimu_31.load("ref[3][1]") + + Chimu_12.load("ref[1][2]") + Chimu_32.load("ref[3][2]") + asmclose() + debugall('LOAD_CHIMU_0213', group='Chimu') + curlyclose() +newline() + +# alternative load chimu: dirac order 0312 +# placed into asm (...) 
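+# (the two orderings mirror the projector pairings below: 0312 loads
+#  spin rows 0/3 and 1/2 back to back, exactly the pairs combined by
+#  X{P,M}_PROJ and Y{P,M}_PROJ, while 0213 pairs rows 0/2 and 1/3 as
+#  needed by Z{P,M}_PROJ and T{P,M}_PROJ)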
+d['factor'] = 0 +d['cycles_LOAD_CHIMU'] += 11 * d['factor'] +write('// LOAD_CHIMU_0312') +definemultiline(F'LOAD_CHIMU_0312_{PRECSUFFIX}') +if ASM_LOAD_CHIMU: + curlyopen() + write(' const SiteSpinor & ref(in[offset]); \\') + asmopen() + pg1.loadpredication() + fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") + Chimu_00.load("ref[0][0]") # reordered + Chimu_30.load("ref[3][0]") + + Chimu_01.load("ref[0][1]") + Chimu_31.load("ref[3][1]") + + Chimu_02.load("ref[0][2]") + Chimu_32.load("ref[3][2]") + + Chimu_10.load("ref[1][0]") + Chimu_20.load("ref[2][0]") + + Chimu_11.load("ref[1][1]") + Chimu_21.load("ref[2][1]") + + Chimu_12.load("ref[1][2]") + Chimu_22.load("ref[2][2]") + asmclose() + debugall('LOAD_CHIMU_0312', group='Chimu') + curlyclose() +newline() + +d['factor'] = 2 +d['cycles_PERM'] += 1 * d['factor'] +write('// LOAD_TABLE0') +definemultiline(F'LOAD_TABLE0') +asmopen() +table0.loadtable(0) +asmclose() +newline() + +d['factor'] = 2 +d['cycles_PERM'] += 1 * d['factor'] +write('// LOAD_TABLE1') +definemultiline(F'LOAD_TABLE1') +asmopen() +table0.loadtable(1) +asmclose() +newline() + +d['factor'] = 2 +d['cycles_PERM'] += 1 * d['factor'] +write('// LOAD_TABLE2') +definemultiline(F'LOAD_TABLE2') +asmopen() +table0.loadtable(2) +asmclose() +newline() + +d['factor'] = 0 +d['cycles_PERM'] += 1 * d['factor'] +write('// LOAD_TABLE3') +definemultiline(F'LOAD_TABLE3') +asmopen() +table0.loadtable(3) +asmclose() +newline() + +# 8 directions = 6x permutations +d['factor'] = 2 # factor is 0 +d['cycles_PERM'] += 6 * d['factor'] +write('// PERM0') +definemultiline(F'PERM0_{PRECSUFFIX}') +debugall('PERM0 PRE', group='Chi') +asmopen() +#table0.loadtable(0) +Chi_00.permute(0, table0) +Chi_01.permute(0, table0) +Chi_02.permute(0, table0) +Chi_10.permute(0, table0) +Chi_11.permute(0, table0) +Chi_12.permute(0, table0) +asmclose() +debugall('PERM0 POST', group='Chi') +newline() + +d['factor'] = 2 # factor is 2 +d['cycles_PERM'] += 6 * d['factor'] +write('// PERM1') +definemultiline(F'PERM1_{PRECSUFFIX}') +debugall('PERM1 PRE', group='Chi') +asmopen() +#table0.loadtable(1) +Chi_00.permute(1, table0) +Chi_01.permute(1, table0) +Chi_02.permute(1, table0) +Chi_10.permute(1, table0) +Chi_11.permute(1, table0) +Chi_12.permute(1, table0) +asmclose() +debugall('PERM1 POST', group='Chi') +newline() + +d['factor'] = 2 # factor is 2 +# PERM2 = swap real and imaginary +d['cycles_PERM'] += 6 * d['factor'] +write('// PERM2') +definemultiline(F'PERM2_{PRECSUFFIX}') +debugall('PERM2 PRE', group='Chi') +asmopen() +#table0.loadtable(2) +Chi_00.permute(2, table0) +Chi_01.permute(2, table0) +Chi_02.permute(2, table0) +Chi_10.permute(2, table0) +Chi_11.permute(2, table0) +Chi_12.permute(2, table0) +asmclose() +debugall('PERM2 POST', group='Chi') +newline() + +# PERM3 = identity (DP), so exclude from counting +d['factor'] = 0 +d['cycles_PERM'] += 6 * d['factor'] +write('// PERM3') +definemultiline(F'PERM3_{PRECSUFFIX}') +if PRECISION == 'single': + debugall('PERM3 PRE', group='Chi') + asmopen() + #table0.loadtable(3) + Chi_00.permute(3, table0) + Chi_01.permute(3, table0) + Chi_02.permute(3, table0) + Chi_10.permute(3, table0) + Chi_11.permute(3, table0) + Chi_12.permute(3, table0) + asmclose() + debugall('PERM3 POST', group='Chi') +newline() + +write('// LOAD_GAUGE') +definemultiline(F'LOAD_GAUGE') +if GRIDBENCH: # referencing differs in Grid and GridBench + write(' const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref; \\') +else: + write(' const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \\') 
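+# note: LOAD_GAUGE keeps only six of the nine gauge entries resident
+# (ref[*][0] and ref[*][1] into U_00..U_21, completing the 30-register
+# budget noted above); the remaining column ref[*][2] is streamed into
+# the same three registers from inside MULT_2SPIN ("early load" there),
+# trading a reload per direction for register pressure.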
+curlyopen() +asmopen() +pg1.loadpredication() +fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') +if ASM_LOAD_GAUGE: + fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') + U_00.load("ref[0][0]") + U_10.load("ref[1][0]") + U_20.load("ref[2][0]") + U_01.load("ref[0][1]") + U_11.load("ref[1][1]") + U_21.load("ref[2][1]") +asmclose() +curlyclose() +newline() +# XXXXXX remove loads +d['factor'] = 8 # MULT_2SPIN executes 1 time per direction = 8 times total +# assume all U loads are hidden +# FCMLA issue latency = 2 cycles +# measurement: latency = 16 cycles if FULLY pipelined !? +# spec says 6+6+9 cycles +# 6 rounds of FCMLA, each with 6 FCMLA -> 21 - 6*2 = 9 +d['cycles_MULT_2SPIN'] += 6 * 21 * d['factor'] +write('// MULT_2SPIN') +definemultiline(F'MULT_2SPIN_{PRECSUFFIX}(A)') +curlyopen() +#write(' const auto & ref(U[sU][A]); \\') +if GRIDBENCH: # referencing differs in Grid and GridBench + write(' const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref; \\') +else: + write(' const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \\') +asmopen() +#pg1.loadpredication() +#fetch_base_ptr("&ref[0][0]") +fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') +fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') +#fetch_base_ptr(F"(uint64_t)&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]", target='I') +#fetch_base_ptr(F"(uint64_t)&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]", target='A') +#fetch_base_ptr(F"&ref[0][{FETCH_BASE_PTR_COLOR_OFFSET}]") +if ASM_LOAD_GAUGE: + U_00.load("ref[0][0]") + U_10.load("ref[1][0]") + U_20.load("ref[2][0]") + U_01.load("ref[0][1]") + U_11.load("ref[1][1]") + U_21.load("ref[2][1]") + +if MOVPRFX == False: + UChi_00.zero() # implementation specific + UChi_10.zero() + UChi_01.zero() + UChi_11.zero() + UChi_02.zero() + UChi_12.zero() + + # round 1 + UChi_00.mul0(U_00, Chi_00) # FCMLA latency is 6+6+9 cycles + UChi_10.mul0(U_00, Chi_10) + UChi_01.mul0(U_10, Chi_00) + UChi_11.mul0(U_10, Chi_10) + UChi_02.mul0(U_20, Chi_00) + UChi_12.mul0(U_20, Chi_10) +else: + # round 1 + UChi_00.mul0(zero0, U_00, Chi_00, constructive=True) # FCMLA latency is 6+6+9 cycles + UChi_10.mul0(zero0, U_00, Chi_10, constructive=True) + UChi_01.mul0(zero0, U_10, Chi_00, constructive=True) + UChi_11.mul0(zero0, U_10, Chi_10, constructive=True) + UChi_02.mul0(zero0, U_20, Chi_00, constructive=True) + UChi_12.mul0(zero0, U_20, Chi_10, constructive=True) + +# round 2 +UChi_00.mul1(U_00, Chi_00) +UChi_10.mul1(U_00, Chi_10) +UChi_01.mul1(U_10, Chi_00) +UChi_11.mul1(U_10, Chi_10) +UChi_02.mul1(U_20, Chi_00) +UChi_12.mul1(U_20, Chi_10) # Chi_00 and Chi_10 available from here + +if ASM_LOAD_GAUGE: + U_00.load("ref[0][2]") # U_00, U_10, U_20 overloaded + U_10.load("ref[1][2]") # early load + U_20.load("ref[2][2]") # A --> + +# round 3 +UChi_00.mac0(U_01, Chi_01) # armclang separates fcmla(..., 0) and +UChi_10.mac0(U_01, Chi_11) # fcmla(..., 90) +UChi_01.mac0(U_11, Chi_01) # autonomously using intrinsics +UChi_11.mac0(U_11, Chi_11) +UChi_02.mac0(U_21, Chi_01) +UChi_12.mac0(U_21, Chi_11) +# round 4 +UChi_00.mac1(U_01, Chi_01) +UChi_10.mac1(U_01, Chi_11) +UChi_01.mac1(U_11, Chi_01) +UChi_11.mac1(U_11, Chi_11) +UChi_02.mac1(U_21, Chi_01) +UChi_12.mac1(U_21, Chi_11) +# round 5 +UChi_00.mac0(U_00, Chi_02) # <-- A +UChi_10.mac0(U_00, Chi_12) +UChi_01.mac0(U_10, Chi_02) +UChi_11.mac0(U_10, Chi_12) +UChi_02.mac0(U_20, Chi_02) +UChi_12.mac0(U_20, Chi_12) +# round 6 +UChi_00.mac1(U_00, Chi_02) +UChi_10.mac1(U_00, Chi_12) 
+UChi_01.mac1(U_10, Chi_02) +UChi_11.mac1(U_10, Chi_12) +UChi_02.mac1(U_20, Chi_02) +UChi_12.mac1(U_20, Chi_12) +asmclose() +debugall('MULT_2SPIN', group='UChi') +curlyclose() +newline() + + +#// hspin(0)=fspin(0)+timesI(fspin(3)); +#// hspin(1)=fspin(1)+timesI(fspin(2)); +d['factor'] = 1 +# FCADD issue latency = 1, latency is 6+9 +d['cycles_PROJ'] += 15 * d['factor'] +write('// XP_PROJ') +definemultiline(F'XP_PROJ_{PRECSUFFIX}') +if ALTERNATIVE_LOADS == True: + write(' LOAD_CHIMU_0312_PLUG \\') +curlyopen() +asmopen() +pg1.loadpredication() +Chi_00.addTimesI(Chimu_00, Chimu_30) +Chi_01.addTimesI(Chimu_01, Chimu_31) +Chi_02.addTimesI(Chimu_02, Chimu_32) +Chi_10.addTimesI(Chimu_10, Chimu_20) +Chi_11.addTimesI(Chimu_11, Chimu_21) +Chi_12.addTimesI(Chimu_12, Chimu_22) +asmclose() +debugall('XP_PROJ', group='Chi') +curlyclose() +newline() + +#// fspin(0)=hspin(0); +#// fspin(1)=hspin(1); +#// fspin(2)=timesMinusI(hspin(1)); +#// fspin(3)=timesMinusI(hspin(0)); +# does not occur in GridBench +d['factor'] = 0 +d['cycles_RECON'] += 15 * d['factor'] +write('// XP_RECON') +definemultiline(F'XP_RECON_{PRECSUFFIX}') +asmopen() +#pg1.loadpredication() +if MOVPRFX == False: + result_20.zero() + result_21.zero() + result_22.zero() + result_30.zero() + result_31.zero() + result_32.zero() + + result_20.subTimesI(UChi_10) + result_21.subTimesI(UChi_11) + result_22.subTimesI(UChi_12) + result_30.subTimesI(UChi_00) + result_31.subTimesI(UChi_01) + result_32.subTimesI(UChi_02) +else: + result_20.subTimesI(zero0, UChi_10, constructive=True) + result_21.subTimesI(zero0, UChi_11, constructive=True) + result_22.subTimesI(zero0, UChi_12, constructive=True) + result_30.subTimesI(zero0, UChi_00, constructive=True) + result_31.subTimesI(zero0, UChi_01, constructive=True) + result_32.subTimesI(zero0, UChi_02, constructive=True) + +result_00.move(UChi_00) # don't reorder ! +result_01.move(UChi_01) +result_02.move(UChi_02) +result_10.move(UChi_10) +result_11.move(UChi_11) +result_12.move(UChi_12) + +# result_00.add(UChi_00) # faster than move? 
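+# (in the static FL-pipe model at the end of this script, fadd falls
+#  into the FLA/B bucket and can issue on either FL pipe, while mov is
+#  tallied FLA-only; fadd latency is 9c vs 6c for mov, so the answer
+#  depends on which resource limits the kernel)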
+# result_01.add(UChi_01) +# result_02.add(UChi_02) +# result_10.add(UChi_10) +# result_11.add(UChi_11) +# result_12.add(UChi_12) +asmclose() +debugall('XP_RECON', group='result') +newline() + + +d['factor'] = 1 +# FCADD issue latency = 1, latency is 6+9 +d['cycles_RECON'] += 15 * d['factor'] +write('// XP_RECON_ACCUM') +definemultiline(F'XP_RECON_ACCUM_{PRECSUFFIX}') +asmopen() +#pg1.loadpredication() +# result_20.subTimesI(UChi_10) +# result_21.subTimesI(UChi_11) +# result_22.subTimesI(UChi_12) +# result_30.subTimesI(UChi_00) +# result_31.subTimesI(UChi_01) +# result_32.subTimesI(UChi_02) +# +# result_00.add(UChi_00) # reordered +# result_01.add(UChi_01) +# result_02.add(UChi_02) +# result_10.add(UChi_10) +# result_11.add(UChi_11) +# result_12.add(UChi_12) + +result_30.subTimesI(UChi_00) # reordered +result_00.add(UChi_00) + +result_31.subTimesI(UChi_01) +result_01.add(UChi_01) + +result_32.subTimesI(UChi_02) +result_02.add(UChi_02) + +result_20.subTimesI(UChi_10) +result_10.add(UChi_10) + +result_21.subTimesI(UChi_11) +result_11.add(UChi_11) + +result_22.subTimesI(UChi_12) +result_12.add(UChi_12) +asmclose() +debugall('XP_RECON_ACCUM', group='result') +newline() + +d['factor'] = 1 +# add/sub issue latency = 1, latency is 9 +d['cycles_PROJ'] += 9 * d['factor'] +write('// YP_PROJ') +definemultiline(F'YP_PROJ_{PRECSUFFIX}') +if ALTERNATIVE_LOADS == True: + write(' LOAD_CHIMU_0312_PLUG \\') +curlyopen() +asmopen() +#pg1.loadpredication() +Chi_00.sub(Chimu_00, Chimu_30) +Chi_01.sub(Chimu_01, Chimu_31) +Chi_02.sub(Chimu_02, Chimu_32) +Chi_10.add(Chimu_10, Chimu_20) +Chi_11.add(Chimu_11, Chimu_21) +Chi_12.add(Chimu_12, Chimu_22) +asmclose() +debugall('YP_PROJ', group='Chi') +curlyclose() +newline() + +d['factor'] = 1 +# FCADD issue latency = 1, latency is 6+9 +d['cycles_PROJ'] += 15 * d['factor'] +write('// ZP_PROJ') +definemultiline(F'ZP_PROJ_{PRECSUFFIX}') +if ALTERNATIVE_LOADS == True: + write(' LOAD_CHIMU_0213_PLUG \\') +curlyopen() +asmopen() +#pg1.loadpredication() +Chi_00.addTimesI(Chimu_00, Chimu_20) +Chi_01.addTimesI(Chimu_01, Chimu_21) +Chi_02.addTimesI(Chimu_02, Chimu_22) +Chi_10.subTimesI(Chimu_10, Chimu_30) +Chi_11.subTimesI(Chimu_11, Chimu_31) +Chi_12.subTimesI(Chimu_12, Chimu_32) +asmclose() +debugall('ZP_PROJ', group='Chi') +curlyclose() +newline() + +d['factor'] = 1 +# add/sub issue latency = 1, latency is 9 +d['cycles_PROJ'] += 9 * d['factor'] +write('// TP_PROJ') +definemultiline(F'TP_PROJ_{PRECSUFFIX}') +if ALTERNATIVE_LOADS == True: + write(' LOAD_CHIMU_0213_PLUG \\') +curlyopen() +asmopen() +#pg1.loadpredication() +Chi_00.add(Chimu_00, Chimu_20) +Chi_01.add(Chimu_01, Chimu_21) +Chi_02.add(Chimu_02, Chimu_22) +Chi_10.add(Chimu_10, Chimu_30) +Chi_11.add(Chimu_11, Chimu_31) +Chi_12.add(Chimu_12, Chimu_32) +asmclose() +debugall('TP_PROJ', group='Chi') +curlyclose() +newline() + +#// hspin(0)=fspin(0)-timesI(fspin(3)); +#// hspin(1)=fspin(1)-timesI(fspin(2)); + +d['factor'] = 1 +# FCADD issue latency = 1, latency is 6+9 +d['cycles_PROJ'] += 15 * d['factor'] +write('// XM_PROJ') +definemultiline(F'XM_PROJ_{PRECSUFFIX}') +if ALTERNATIVE_LOADS == True: + write(' LOAD_CHIMU_0312_PLUG \\') +curlyopen() +asmopen() +#pg1.loadpredication() +Chi_00.subTimesI(Chimu_00, Chimu_30) +Chi_01.subTimesI(Chimu_01, Chimu_31) +Chi_02.subTimesI(Chimu_02, Chimu_32) +Chi_10.subTimesI(Chimu_10, Chimu_20) +Chi_11.subTimesI(Chimu_11, Chimu_21) +Chi_12.subTimesI(Chimu_12, Chimu_22) +asmclose() +debugall('XM_PROJ sub', group='Chi') +curlyclose() +newline() + +d['factor'] = 1 +d['cycles_RECON'] += 15 * 
d['factor'] +write('// XM_RECON') +definemultiline(F'XM_RECON_{PRECSUFFIX}') +asmopen() +#pg1.loadpredication() + +# only necessary if not zeroed before +if MOVPRFX == False: + result_20.zero() + result_21.zero() + result_22.zero() + result_30.zero() + result_31.zero() + result_32.zero() + + result_20.addTimesI(UChi_10) # <-- + result_21.addTimesI(UChi_11) + result_22.addTimesI(UChi_12) + result_30.addTimesI(UChi_00) + result_31.addTimesI(UChi_01) + result_32.addTimesI(UChi_02) +else: + result_20.addTimesI(zero0, UChi_10, constructive=True) # <-- + result_21.addTimesI(zero0, UChi_11, constructive=True) + result_22.addTimesI(zero0, UChi_12, constructive=True) + result_30.addTimesI(zero0, UChi_00, constructive=True) + result_31.addTimesI(zero0, UChi_01, constructive=True) + result_32.addTimesI(zero0, UChi_02, constructive=True) + +result_00.move(UChi_00) +result_01.move(UChi_01) +result_02.move(UChi_02) +result_10.move(UChi_10) +result_11.move(UChi_11) +result_12.move(UChi_12) +asmclose() +debugall('XM_RECON result', group='result') +newline() + +d['factor'] = 1 +# add/sub issue latency = 1, latency is 9 +d['cycles_PROJ'] += 9 * d['factor'] +write('// YM_PROJ') +definemultiline(F'YM_PROJ_{PRECSUFFIX}') +if ALTERNATIVE_LOADS == True: + write(' LOAD_CHIMU_0312_PLUG \\') +curlyopen() +asmopen() +#pg1.loadpredication() +Chi_00.add(Chimu_00, Chimu_30) +Chi_01.add(Chimu_01, Chimu_31) +Chi_02.add(Chimu_02, Chimu_32) +Chi_10.sub(Chimu_10, Chimu_20) +Chi_11.sub(Chimu_11, Chimu_21) +Chi_12.sub(Chimu_12, Chimu_22) +asmclose() +debugall('YM_PROJ', group='Chi') +curlyclose() +newline() + +d['factor'] = 1 +# FCADD issue latency = 1, latency is 6+9 +d['cycles_PROJ'] += 15 * d['factor'] +write('// ZM_PROJ') +definemultiline(F'ZM_PROJ_{PRECSUFFIX}') +if ALTERNATIVE_LOADS == True: + write(' LOAD_CHIMU_0213_PLUG \\') +curlyopen() +asmopen() +#pg1.loadpredication() +Chi_00.subTimesI(Chimu_00, Chimu_20) +Chi_01.subTimesI(Chimu_01, Chimu_21) +Chi_02.subTimesI(Chimu_02, Chimu_22) +Chi_10.addTimesI(Chimu_10, Chimu_30) +Chi_11.addTimesI(Chimu_11, Chimu_31) +Chi_12.addTimesI(Chimu_12, Chimu_32) +asmclose() +debugall('ZM_PROJ', group='Chi') +curlyclose() +newline() + +d['factor'] = 1 +# add/sub issue latency = 1, latency is 9 +d['cycles_PROJ'] += 9 * d['factor'] +write('// TM_PROJ') +definemultiline(F'TM_PROJ_{PRECSUFFIX}') +if ALTERNATIVE_LOADS == True: + write(' LOAD_CHIMU_0213_PLUG \\') +curlyopen() +asmopen() +pg1.loadpredication() +Chi_00.sub(Chimu_00, Chimu_20) +Chi_01.sub(Chimu_01, Chimu_21) +Chi_02.sub(Chimu_02, Chimu_22) +Chi_10.sub(Chimu_10, Chimu_30) +Chi_11.sub(Chimu_11, Chimu_31) +Chi_12.sub(Chimu_12, Chimu_32) +asmclose() +debugall('TM_PROJ', group='Chi') +curlyclose() +newline() + +# does not occur in GridBench +d['factor'] = 0 +# add/sub issue latency = 1, latency is 9 +d['cycles_RECON'] += 15 * d['factor'] +write('// XM_RECON_ACCUM') +definemultiline(F'XM_RECON_ACCUM_{PRECSUFFIX}') +asmopen() +# result_20.addTimesI(UChi_10) +# result_21.addTimesI(UChi_11) +# result_22.addTimesI(UChi_12) +# result_30.addTimesI(UChi_00) +# result_31.addTimesI(UChi_01) +# result_32.addTimesI(UChi_02) +# +# # result_00.move(UChi_00) +# # result_01.move(UChi_01) +# # result_02.move(UChi_02) +# # result_10.move(UChi_10) +# # result_11.move(UChi_11) +# # result_12.move(UChi_12) +# +# # faster than move ? 
+# result_00.add(UChi_00) +# result_01.add(UChi_01) +# result_02.add(UChi_02) +# result_10.add(UChi_10) +# result_11.add(UChi_11) +# result_12.add(UChi_12) + +result_30.addTimesI(UChi_00) # reordered +result_31.addTimesI(UChi_01) +result_32.addTimesI(UChi_02) + +result_20.addTimesI(UChi_10) +result_21.addTimesI(UChi_11) +result_22.addTimesI(UChi_12) + +result_00.add(UChi_00) +result_01.add(UChi_01) +result_02.add(UChi_02) +result_10.add(UChi_10) +result_11.add(UChi_11) +result_12.add(UChi_12) +asmclose() +debugall('XM_RECON_ACCUM', group='result') +newline() + + + +d['factor'] = 1 +d['cycles_RECON'] += 9 * d['factor'] +write('// YP_RECON_ACCUM') +definemultiline(F'YP_RECON_ACCUM_{PRECSUFFIX}') +asmopen() +#pg1.loadpredication() +# result_00.add(UChi_00) +# result_01.add(UChi_01) +# result_02.add(UChi_02) +# result_10.add(UChi_10) +# result_11.add(UChi_11) +# result_12.add(UChi_12) +# result_20.add(UChi_10) +# result_21.add(UChi_11) +# result_22.add(UChi_12) +# result_30.sub(UChi_00) +# result_31.sub(UChi_01) +# result_32.sub(UChi_02) + +result_00.add(UChi_00) # reordered +result_30.sub(UChi_00) + +result_01.add(UChi_01) +result_31.sub(UChi_01) + +result_02.add(UChi_02) +result_32.sub(UChi_02) + +result_10.add(UChi_10) +result_20.add(UChi_10) + +result_11.add(UChi_11) +result_21.add(UChi_11) + +result_12.add(UChi_12) +result_22.add(UChi_12) +asmclose() +debugall('YP_RECON_ACCUM', group='result') +newline() + +d['factor'] = 1 +d['cycles_RECON'] += 9 * d['factor'] +write('// YM_RECON_ACCUM') +definemultiline(F'YM_RECON_ACCUM_{PRECSUFFIX}') +asmopen() +#pg1.loadpredication() +# result_00.add(UChi_00) +# result_01.add(UChi_01) +# result_02.add(UChi_02) +# result_10.add(UChi_10) +# result_11.add(UChi_11) +# result_12.add(UChi_12) +# result_20.sub(UChi_10) +# result_21.sub(UChi_11) +# result_22.sub(UChi_12) +# result_30.add(UChi_00) +# result_31.add(UChi_01) +# result_32.add(UChi_02) + +result_00.add(UChi_00) # reordered +result_30.add(UChi_00) + +result_01.add(UChi_01) +result_31.add(UChi_01) + +result_02.add(UChi_02) +result_32.add(UChi_02) + +result_10.add(UChi_10) +result_20.sub(UChi_10) + +result_11.add(UChi_11) +result_21.sub(UChi_11) + +result_12.add(UChi_12) +result_22.sub(UChi_12) +asmclose() +debugall('YM_RECON_ACCUM', group='result') +newline() + +d['factor'] = 1 +d['cycles_RECON'] += 15 * d['factor'] +write('// ZP_RECON_ACCUM') +definemultiline(F'ZP_RECON_ACCUM_{PRECSUFFIX}') +asmopen() +#pg1.loadpredication() +# result_20.subTimesI(UChi_00) +# result_21.subTimesI(UChi_01) +# result_22.subTimesI(UChi_02) +# result_30.addTimesI(UChi_10) +# result_31.addTimesI(UChi_11) +# result_32.addTimesI(UChi_12) +# +# result_00.add(UChi_00) +# result_01.add(UChi_01) +# result_02.add(UChi_02) +# result_10.add(UChi_10) +# result_11.add(UChi_11) +# result_12.add(UChi_12) +result_20.subTimesI(UChi_00) # reordered +result_00.add(UChi_00) + +result_21.subTimesI(UChi_01) +result_01.add(UChi_01) + +result_22.subTimesI(UChi_02) +result_02.add(UChi_02) + +result_30.addTimesI(UChi_10) +result_10.add(UChi_10) + +result_31.addTimesI(UChi_11) +result_11.add(UChi_11) + +result_32.addTimesI(UChi_12) +result_12.add(UChi_12) +asmclose() +debugall('ZP_RECON_ACCUM', group='result') +newline() + +d['factor'] = 1 +d['cycles_RECON'] += 15 * d['factor'] +write('// ZM_RECON_ACCUM') +definemultiline(F'ZM_RECON_ACCUM_{PRECSUFFIX}') +asmopen() +#pg1.loadpredication() +# result_20.addTimesI(UChi_00) +# result_21.addTimesI(UChi_01) +# result_22.addTimesI(UChi_02) +# result_30.subTimesI(UChi_10) +# result_31.subTimesI(UChi_11) 
+# result_32.subTimesI(UChi_12) +# +# result_00.add(UChi_00) +# result_01.add(UChi_01) +# result_02.add(UChi_02) +# result_10.add(UChi_10) +# result_11.add(UChi_11) +# result_12.add(UChi_12) +result_20.addTimesI(UChi_00) # reordered +result_00.add(UChi_00) + +result_21.addTimesI(UChi_01) +result_01.add(UChi_01) + +result_22.addTimesI(UChi_02) +result_02.add(UChi_02) + +result_30.subTimesI(UChi_10) +result_10.add(UChi_10) + +result_31.subTimesI(UChi_11) +result_11.add(UChi_11) + +result_32.subTimesI(UChi_12) +result_12.add(UChi_12) +asmclose() +debugall('ZM_RECON_ACCUM', group='result') +newline() + +d['factor'] = 1 +d['cycles_RECON'] += 9 * d['factor'] +write('// TP_RECON_ACCUM') +definemultiline(F'TP_RECON_ACCUM_{PRECSUFFIX}') +asmopen() +#pg1.loadpredication() +# result_00.add(UChi_00) +# result_01.add(UChi_01) +# result_02.add(UChi_02) +# result_10.add(UChi_10) +# result_11.add(UChi_11) +# result_12.add(UChi_12) +# result_20.add(UChi_00) +# result_21.add(UChi_01) +# result_22.add(UChi_02) +# result_30.add(UChi_10) +# result_31.add(UChi_11) +# result_32.add(UChi_12) + +result_00.add(UChi_00) # reordered +result_20.add(UChi_00) + +result_01.add(UChi_01) +result_21.add(UChi_01) + +result_02.add(UChi_02) +result_22.add(UChi_02) + +result_10.add(UChi_10) +result_30.add(UChi_10) + +result_11.add(UChi_11) +result_31.add(UChi_11) + +result_12.add(UChi_12) +result_32.add(UChi_12) +asmclose() +debugall('TP_RECON_ACCUM', group='result') +newline() + +d['factor'] = 1 +d['cycles_RECON'] += 9 * d['factor'] +write('// TM_RECON_ACCUM') +definemultiline(F'TM_RECON_ACCUM_{PRECSUFFIX}') +asmopen() +#pg1.loadpredication() +# result_00.add(UChi_00) +# result_01.add(UChi_01) +# result_02.add(UChi_02) +# result_10.add(UChi_10) +# result_11.add(UChi_11) +# result_12.add(UChi_12) +# result_20.sub(UChi_00) +# result_21.sub(UChi_01) +# result_22.sub(UChi_02) +# result_30.sub(UChi_10) +# result_31.sub(UChi_11) +# result_32.sub(UChi_12) + +result_00.add(UChi_00) # reordered +result_20.sub(UChi_00) + +result_01.add(UChi_01) +result_21.sub(UChi_01) + +result_02.add(UChi_02) +result_22.sub(UChi_02) + +result_10.add(UChi_10) +result_30.sub(UChi_10) + +result_11.add(UChi_11) +result_31.sub(UChi_11) + +result_12.add(UChi_12) +result_32.sub(UChi_12) +asmclose() +debugall('TM_RECON_ACCUM', group='result') +newline() + +d['factor'] = 0 +# have 12 instructions +# picking dual issue versions +d['cycles_ZERO_PSI'] += 6 * d['factor'] +write('// ZERO_PSI') +definemultiline(F'ZERO_PSI_{PRECSUFFIX}') +asmopen() +pg1.loadpredication() +result_00.zero() +result_01.zero() +result_02.zero() +result_10.zero() +result_11.zero() +result_12.zero() +result_20.zero() +result_21.zero() +result_22.zero() +result_30.zero() +result_31.zero() +result_32.zero() +asmclose() +#debugall('ZERO_PSI', group='result') +newline() + +d['factor'] = 0 +# prefetch store spinors into L2 cache +d['factor'] = 0 +d['cycles_PREFETCH_L2'] += 0 * d['factor'] +write('// PREFETCH_RESULT_L2_STORE (prefetch store to L2)') +definemultiline(F'PREFETCH_RESULT_L2_STORE_INTERNAL_{PRECSUFFIX}(base)') +curlyopen() +fetch_base_ptr(F"base") +asmopen() +fetch_base_ptr(F"base", target='A') +prefetch_L2_store(F"base", 0) +prefetch_L2_store(F"base", 1) +prefetch_L2_store(F"base", 2) +asmclose() +curlyclose() +newline() + +d['factor'] = 0 +write('// ADD_RESULT_INTERNAL') +definemultiline(F'ADD_RESULT_INTERNAL_{PRECSUFFIX}') +asmopen() +result_00.add(Chimu_00) +result_01.add(Chimu_01) +result_02.add(Chimu_02) +result_10.add(Chimu_10) +result_11.add(Chimu_11) +result_12.add(Chimu_12) 
+result_20.add(Chimu_20)
+result_21.add(Chimu_21)
+result_22.add(Chimu_22)
+result_30.add(Chimu_30)
+result_31.add(Chimu_31)
+result_32.add(Chimu_32)
+asmclose()
+#debugall('ZERO_PSI', group='result')
+newline()
+
+# --------------------------------------------------------------------------------
+
+# C
+f = open('w.h', 'w')
+f.write(d['C'])
+f.close()
+
+# intrin
+f = open('wi.h', 'w')
+f.write(d['I'])
+f.close()
+
+filename = ''
+if PRECISION == 'double':
+    filename = "Fujitsu_A64FX_intrin_double.h"
+else:
+    filename = "Fujitsu_A64FX_intrin_single.h"
+f = open(filename, 'w')
+f.write(LEGAL.format(filename))
+f.write(d['I'])
+f.close()
+
+
+# asm
+f = open('wa.h', 'w')
+f.write(d['A'])
+f.close()
+
+filename = ''
+if PRECISION == 'double':
+    filename = "Fujitsu_A64FX_asm_double.h"
+else:
+    filename = "Fujitsu_A64FX_asm_single.h"
+f = open(filename, 'w')
+f.write(LEGAL.format(filename))
+f.write(d['A'])
+f.close()
+
+
+# arithmetic instruction count, mul/mac = 2 instructions each
+d['acount'] = d['add'] + d['sub'] + \
+              d['mul'] + d['mac'] + d['addTimesI'] + d['subTimesI']
+
+# permutations
+d['permutes'] += 2*d['timesI'] + 1*d['timesMinusI']
+d['neg'] = 1*d['timesI'] + 1*d['timesMinusI']
+
+# instruction count, mul/mac = 2 instructions each, +/- *i = 3 instructions each
+d['icount'] = d['load'] + d['store'] + d['move'] + d['add'] + d['sub'] + \
+              d['mul'] + d['mac'] + d['permutes'] + d['neg'] + \
+              d['addTimesI'] + d['subTimesI'] + d['zero'] + d['movprfx']
+
+# flops
+d['flops'] = 4*d['mac'] + 3*d['mul'] + d['add'] + d['sub'] + \
+             d['addTimesI'] + d['subTimesI']
+
+
+
+
+
+print('Statistics')
+print('')
+print('Type                 Occurrences     Total / Arith instructions')
+print('-------------------------------------------------------------------')
+print('Variables            {:4d}'.format(d['registers']))
+print('')
+print('load                 {:4d}'.format(d['load']))
+print('store                {:4d}'.format(d['store']))
+print('move                 {:4d}'.format(d['move']))
+print('movprfx              {:4d}'.format(d['movprfx']))
+print('zero                 {:4d}'.format(d['zero']))
+print('negate               {:4d}'.format(d['neg']))
+
+
+print('add                  {:4d}            {:0.2f} / {:0.2f}'.\
+    format(d['add'], d['add'] / d['icount'], d['add'] / d['acount']))
+print('sub                  {:4d}            {:0.2f} / {:0.2f}'.\
+    format(d['sub'], d['sub'] / d['icount'], d['sub'] / d['acount']))
+print('mul                  {:4d}            {:0.2f} / {:0.2f}'.\
+    format(d['mul'], 2*d['mul'] / d['icount'], 2*d['mul'] / d['acount']))
+print('mac                  {:4d}            {:0.2f} / {:0.2f}'.\
+    format(d['mac'], 2*d['mac'] / d['icount'], 2*d['mac'] / d['acount']))
+print('addTimesI            {:4d}            {:0.2f} / {:0.2f}'.\
+    format(d['addTimesI'], 2*d['addTimesI'] / d['icount'], 2*d['addTimesI'] / d['acount']))
+print('subTimesI            {:4d}            {:0.2f} / {:0.2f}'.\
+    format(d['subTimesI'], 2*d['subTimesI'] / d['icount'], 2*d['subTimesI'] / d['acount']))
+
+print('timesI               {:4d}'.format(d['timesI']))
+print('timesMinusI          {:4d}'.format(d['timesMinusI']))
+print('permutes             {:4d}            {:0.2f}'.\
+    format(d['permutes'], d['permutes'] / d['icount']))
+print('')
+print('flops                {:4d}'.format(d['flops']))
+print('instruction count    {:4d}'.format(d['icount']))
+print('arith. instruction count {:4d}       {:0.2f}'.\
+    format(d['acount'], d['acount'] / d['icount']))
+
+
+# ---- static pipeline resources consumption ----
+FLA = 0
+FLA += 2 * d['mac'] + 2 * d['mul']
+FLA += 1 * d['addTimesI'] + 1 * d['subTimesI']
+FLA += 1 * d['move']
+FLA += 1 * d['permutes']
+FLA += 1 * d['store']
+FLA += 1 * d['zero']
+
+FLB = 0
+FLB += 1 * d['addTimesI'] + 1 * d['subTimesI']
+
+FLAB = 0
+FLAB += 1 * d['mac'] + 1 * d['mul']
+FLAB += 1 * d['add'] + 1 * d['sub']
+FLAB += 1 * d['neg'] + 1 * d['movprfx']
+#FLAB += 1 * d['zero']
+
+
+FL_slots = 2 * d['icount']
+FL_micro_ops = FLA + FLB + FLAB
+
+print('')
+print('------------------------------------------------------------------')
+print('')
+print('Static FL slot usage')
+print('')
+print('  FLA    {:4d}'.format(FLA))
+print('  FLB    {:4d}'.format(FLB))
+print('  FLA/B  {:4d}'.format(FLAB))
+
+print('')
+print('Static FL slot efficiency')
+print('')
+print('  Total FL slots     {:4d}'.format(FL_slots))
+print('  FL slots occupied  {:4d}'.format(FL_micro_ops))
+print('  FL slot efficiency {:0.2f}'.format(FL_micro_ops / FL_slots))
+
+cycles_total = d['cycles_ZERO_PSI'] + d['cycles_LOAD_CHIMU'] + \
+    d['cycles_PROJ'] + d['cycles_PERM'] + d['cycles_MULT_2SPIN'] + \
+    d['cycles_RECON'] + d['cycles_RESULT']
+cycles_total_hidden = d['cycles_ZERO_PSI'] + \
+    d['cycles_PROJ'] + d['cycles_MULT_2SPIN'] + \
+    d['cycles_RECON']
+
+# ---- dynamic estimate ----
+
+print('')
+print('Dynamic cycles estimate (incl. latencies)')
+print('')
+print('  ZERO_PSI    {:4d}'.format(d['cycles_ZERO_PSI']))
+print('  LOAD_CHIMU  {:4d}'.format(d['cycles_LOAD_CHIMU']))
+print('  PROJ        {:4d}'.format(d['cycles_PROJ']))
+print('  PERM        {:4d}'.format(d['cycles_PERM']))
+print('  MULT_2SPIN  {:4d}'.format(d['cycles_MULT_2SPIN']))
+print('  RECON       {:4d}'.format(d['cycles_RECON']))
+print('  STORE       {:4d}'.format(d['cycles_RESULT']))
+print('')
+print('  Sum         {:4d}'.format(cycles_total))
+print('')
+print('  Sum*        {:4d}'.format(cycles_total_hidden))
+print('  Total FL slots*     {:4d}'.format(cycles_total_hidden * 2))
+print('  FL slots occupied*  {:4d}'.format(FL_micro_ops))
+print('  FL slot efficiency* {:0.2f}'.format(FL_micro_ops / (2*cycles_total_hidden)))
+print('')
+print('  *load/store/PERM hidden')
+
+estimated_cycles = cycles_total_hidden
+# Estimate percent peak DP; dual issue, fma
+pp = 100 * 4 * d['flops'] / (2*2*8*estimated_cycles)
+print('')
+print('Model prediction')
+print('')
+print('  Cycles*        {:4d}'.format(estimated_cycles))
+print('  Percent peak*  {:4.1f} %'.format(pp))
+
+# estimated RF throughput in GB/s @ 2.2 GHz
+tp10 = (d['load'] + d['store']) * 64 * 2.2 / estimated_cycles
+tp2 = (d['load'] + d['store']) * 64 * 1000.**3 * 2.2 / 1024.**3 / estimated_cycles
+print('')
+print('  Estimated RF throughput*  {:4.1f} GB/s'.\
+    format(tp10))
+print('  Estimated RF throughput*  {:4.1f} GiB/s'.\
+    format(tp2))
+
+# ---- dynamic pipeline resources consumption ----
+
+runtime = measured_cycles   # runtime in cycles
+pp_runtime = 100 * 4 * d['flops'] / (2*2*8*runtime)
+runtime_FL_slots = 2 * runtime
+delta = runtime - estimated_cycles
+
+
+print('')
+print('------------------------------------------------------------------')
+print('')
+print('Dynamic runtime analysis (cycles from measurements)')
+print('')
+print('  Cycles                   {:4d}'.format(runtime))
+print('  Percent peak             {:4.1f} %'.format(pp_runtime))
+print('  Deviation from estimate  {:4d}   {:4.2f} %'.\
+    format(delta, 100. * abs(delta/runtime)))
+print('  Deviation per direction  {:4.1f}'.format(delta/8))
+
+# estimated RF throughput in GB/s @ 2.2 GHz
+tp10_rt = (d['load'] + d['store']) * 64 * 2.2 / runtime
+tp2_rt = (d['load'] + d['store']) * 64 * 1000.**3 * 2.2 / 1024.**3 / runtime
+print('')
+print('  RF throughput            {:4.1f} GB/s'.\
+    format(tp10_rt))
+print('  RF throughput            {:4.1f} GiB/s'.\
+    format(tp2_rt))
+print('')
+print('  Total FL slots     {:4d}'.format(runtime_FL_slots))
+print('  FL slots occupied  {:4d}'.format(FL_micro_ops))
+print('  FL slot efficiency {:0.2f}'.format(FL_micro_ops / runtime_FL_slots))
+print('')

From 79a385facabd13fca8726b3b1ede13a3ad9d5819 Mon Sep 17 00:00:00 2001
From: nils meyer
Date: Wed, 15 Apr 2020 11:46:55 +0200
Subject: [PATCH 041/147] disabled armclang hotfix because of a small armclang
 20.0 performance impact

---
 Grid/simd/Grid_vector_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h
index 61f19a15..c203cd9e 100644
--- a/Grid/simd/Grid_vector_types.h
+++ b/Grid/simd/Grid_vector_types.h
@@ -118,7 +118,7 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) {
 #ifdef GEN
   #if defined(A64FX) // breakout A64FX SVE ACLE here
     //#pragma message("building for A64FX / SVE ACLE")
-    #define ARMCLANGHOTFIX
+    //#define ARMCLANGHOTFIX
     #include "Grid_a64fx-2.h"
   #else
     #include "Grid_generic.h"

From 6504a098ccb2708203affc326de61e489107c2f5 Mon Sep 17 00:00:00 2001
From: nils meyer
Date: Wed, 15 Apr 2020 15:06:52 +0200
Subject: [PATCH 042/147] 999 GiB/s Wilson; 694 GiB/s DW (DP)

---
 .../WilsonKernelsAsmBodyA64FX.h               |   7 +-
 Grid/simd/Fujitsu_A64FX_asm_double.h          | 269 +++++++++---------
 Grid/simd/Fujitsu_A64FX_asm_single.h          | 269 +++++++++---------
 Grid/simd/Fujitsu_A64FX_intrin_double.h       |  46 +--
 Grid/simd/Fujitsu_A64FX_intrin_single.h       |  46 +--
 Grid/simd/Fujitsu_A64FX_undef.h               |   1 +
 6 files changed, 340 insertions(+), 298 deletions(-)

diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h
index 7c1be429..54a52468 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h
@@ -102,7 +102,6 @@ Author: Nils Meyer
   base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++;  \
   PREFETCH_CHIMU(base);                                          \
   MULT_2SPIN_DIR_PF(Dir,basep);                                  \
-  PREFETCH_GAUGE_L1(NxtDir);                                     \
   RECON;                                                         \
 
 #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)            \
@@ -318,7 +317,9 @@ Author: Nils Meyer
 //      if (nmu!=0) std::cout << "EXT "<
 std::cout << "----------------------------------------------------" << std::endl;
 #endif
-      //basep = (uint64_t) &out[ssn];
-      RESULT(base,basep);
   }
   ssU++;
   UNLOCK_GAUGE(0);
diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h
index 75d49744..7931398f 100644
--- a/Grid/simd/Fujitsu_A64FX_asm_double.h
+++ b/Grid/simd/Fujitsu_A64FX_asm_double.h
@@ -32,6 +32,7 @@ Author: Nils Meyer
 #define PREFETCH_GAUGE_L2(A)        PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A)
 #define PF_GAUGE(A)
 #define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(A)
+#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(A)
 #define PREFETCH1_CHIMU(A)          PREFETCH_CHIMU_L1(A)
 #define PREFETCH_CHIMU(A)           PREFETCH_CHIMU_L1(A)
 #define LOCK_GAUGE(A)
@@ -39,7 +40,7 @@ Author: Nils Meyer
 #define MASK_REGS                   DECLARATIONS_A64FXd
 #define COMPLEX_SIGNS(A)
 #define LOAD64(A,B)
-#define SAVE_RESULT(A,B)            RESULT_A64FXd(A); 
PREFETCH_CHIMU_L1(B); +#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B); #define MULT_2SPIN_DIR_PF(A,B) \ MULT_2SPIN_A64FXd(A); \ PREFETCH_CHIMU_L2(B); \ @@ -88,18 +89,18 @@ asm ( \ #define RESULT_A64FXd(base) \ { \ asm ( \ - "stnt1d { z0.d }, p5, [%[storeptr], -6, mul vl] \n\t" \ - "stnt1d { z1.d }, p5, [%[storeptr], -5, mul vl] \n\t" \ - "stnt1d { z2.d }, p5, [%[storeptr], -4, mul vl] \n\t" \ - "stnt1d { z3.d }, p5, [%[storeptr], -3, mul vl] \n\t" \ - "stnt1d { z4.d }, p5, [%[storeptr], -2, mul vl] \n\t" \ - "stnt1d { z5.d }, p5, [%[storeptr], -1, mul vl] \n\t" \ - "stnt1d { z6.d }, p5, [%[storeptr], 0, mul vl] \n\t" \ - "stnt1d { z7.d }, p5, [%[storeptr], 1, mul vl] \n\t" \ - "stnt1d { z8.d }, p5, [%[storeptr], 2, mul vl] \n\t" \ - "stnt1d { z9.d }, p5, [%[storeptr], 3, mul vl] \n\t" \ - "stnt1d { z10.d }, p5, [%[storeptr], 4, mul vl] \n\t" \ - "stnt1d { z11.d }, p5, [%[storeptr], 5, mul vl] \n\t" \ + "str z0, [%[storeptr], -6, mul vl] \n\t" \ + "str z1, [%[storeptr], -5, mul vl] \n\t" \ + "str z2, [%[storeptr], -4, mul vl] \n\t" \ + "str z3, [%[storeptr], -3, mul vl] \n\t" \ + "str z4, [%[storeptr], -2, mul vl] \n\t" \ + "str z5, [%[storeptr], -1, mul vl] \n\t" \ + "str z6, [%[storeptr], 0, mul vl] \n\t" \ + "str z7, [%[storeptr], 1, mul vl] \n\t" \ + "str z8, [%[storeptr], 2, mul vl] \n\t" \ + "str z9, [%[storeptr], 3, mul vl] \n\t" \ + "str z10, [%[storeptr], 4, mul vl] \n\t" \ + "str z11, [%[storeptr], 5, mul vl] \n\t" \ : \ : [storeptr] "r" (base + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -165,12 +166,12 @@ asm ( \ #define LOAD_CHI_A64FXd(base) \ { \ asm ( \ - "ld1d { z12.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z13.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z14.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - "ld1d { z15.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z16.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z17.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ldr z12, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z13, [%[fetchptr], 1, mul vl] \n\t" \ + "ldr z14, [%[fetchptr], 2, mul vl] \n\t" \ + "ldr z15, [%[fetchptr], 3, mul vl] \n\t" \ + "ldr z16, [%[fetchptr], 4, mul vl] \n\t" \ + "ldr z17, [%[fetchptr], 5, mul vl] \n\t" \ : \ : [fetchptr] "r" (base) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -181,18 +182,18 @@ asm ( \ { \ asm ( \ "ptrue p5.d \n\t" \ - "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z27.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z24.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z28.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z25.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z29.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ + "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ + 
"ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ + "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ + "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ + "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ + "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ + "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (base + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -204,18 +205,18 @@ asm ( \ const SiteSpinor & ref(in[offset]); \ asm ( \ "ptrue p5.d \n\t" \ - "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z24.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z25.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z27.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z28.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z29.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ + "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ + "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ + "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ + "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ + "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ : \ : [fetchptr] "r" (&ref[2][0]) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -227,18 +228,18 @@ asm ( \ const SiteSpinor & ref(in[offset]); \ asm ( \ "ptrue p5.d \n\t" \ - "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z27.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z28.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z29.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z24.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z25.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ + "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ + "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ + "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ + "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ + "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ + "ldr z20, 
[%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (&ref[2][0]) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -247,7 +248,7 @@ asm ( \ // LOAD_TABLE0 #define LOAD_TABLE0 \ asm ( \ - "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ : \ : [tableptr] "r" (&lut[0]),[index] "i" (0) \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -256,7 +257,7 @@ asm ( \ // LOAD_TABLE1 #define LOAD_TABLE1 \ asm ( \ - "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ : \ : [tableptr] "r" (&lut[0]),[index] "i" (1) \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -265,7 +266,7 @@ asm ( \ // LOAD_TABLE2 #define LOAD_TABLE2 \ asm ( \ - "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ : \ : [tableptr] "r" (&lut[0]),[index] "i" (2) \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -274,7 +275,7 @@ asm ( \ // LOAD_TABLE3 #define LOAD_TABLE3 \ asm ( \ - "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ : \ : [tableptr] "r" (&lut[0]),[index] "i" (3) \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -331,12 +332,12 @@ asm ( \ { \ asm ( \ "ptrue p5.d \n\t" \ - "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z27.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ : \ : [fetchptr] "r" (baseU + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -347,12 +348,12 @@ asm ( \ { \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ asm ( \ - "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z27.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z26, 
[%[fetchptr], 0, mul vl] \n\t" \ + "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ "movprfx z18.d, p5/m, z31.d \n\t" \ "fcmla z18.d, p5/m, z24.d, z12.d, 0 \n\t" \ "movprfx z21.d, p5/m, z31.d \n\t" \ @@ -371,9 +372,9 @@ asm ( \ "fcmla z22.d, p5/m, z25.d, z15.d, 90 \n\t" \ "fcmla z20.d, p5/m, z26.d, z12.d, 90 \n\t" \ "fcmla z23.d, p5/m, z26.d, z15.d, 90 \n\t" \ - "ld1d { z24.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z25.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \ + "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \ + "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \ "fcmla z18.d, p5/m, z27.d, z13.d, 0 \n\t" \ "fcmla z21.d, p5/m, z27.d, z16.d, 0 \n\t" \ "fcmla z19.d, p5/m, z28.d, z13.d, 0 \n\t" \ @@ -408,12 +409,12 @@ asm ( \ { \ asm ( \ "ptrue p5.d \n\t" \ - "fcadd z12.d, p5/m, z12.d, z27.d, 90 \n\t" \ - "fcadd z13.d, p5/m, z13.d, z28.d, 90 \n\t" \ - "fcadd z14.d, p5/m, z14.d, z29.d, 90 \n\t" \ - "fcadd z15.d, p5/m, z15.d, z24.d, 90 \n\t" \ - "fcadd z16.d, p5/m, z16.d, z25.d, 90 \n\t" \ - "fcadd z17.d, p5/m, z17.d, z26.d, 90 \n\t" \ + "fcadd z12.d, p5/m, z12.d, z21.d, 90 \n\t" \ + "fcadd z13.d, p5/m, z13.d, z22.d, 90 \n\t" \ + "fcadd z14.d, p5/m, z14.d, z23.d, 90 \n\t" \ + "fcadd z15.d, p5/m, z15.d, z18.d, 90 \n\t" \ + "fcadd z16.d, p5/m, z16.d, z19.d, 90 \n\t" \ + "fcadd z17.d, p5/m, z17.d, z20.d, 90 \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -469,12 +470,12 @@ asm ( \ #define YP_PROJ_A64FXd \ { \ asm ( \ - "fsub z12.d, p5/m, z12.d, z27.d \n\t" \ - "fsub z13.d, p5/m, z13.d, z28.d \n\t" \ - "fsub z14.d, p5/m, z14.d, z29.d \n\t" \ - "fadd z15.d, p5/m, z15.d, z24.d \n\t" \ - "fadd z16.d, p5/m, z16.d, z25.d \n\t" \ - "fadd z17.d, p5/m, z17.d, z26.d \n\t" \ + "fsub z12.d, p5/m, z12.d, z21.d \n\t" \ + "fsub z13.d, p5/m, z13.d, z22.d \n\t" \ + "fsub z14.d, p5/m, z14.d, z23.d \n\t" \ + "fadd z15.d, p5/m, z15.d, z18.d \n\t" \ + "fadd z16.d, p5/m, z16.d, z19.d \n\t" \ + "fadd z17.d, p5/m, z17.d, z20.d \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -484,12 +485,12 @@ asm ( \ #define ZP_PROJ_A64FXd \ { \ asm ( \ - "fcadd z12.d, p5/m, z12.d, z24.d, 90 \n\t" \ - "fcadd z13.d, p5/m, z13.d, z25.d, 90 \n\t" \ - "fcadd z14.d, p5/m, z14.d, z26.d, 90 \n\t" \ - "fcadd z15.d, p5/m, z15.d, z27.d, 270 \n\t" \ - "fcadd z16.d, p5/m, z16.d, z28.d, 270 \n\t" \ - "fcadd z17.d, p5/m, z17.d, z29.d, 270 \n\t" \ + "fcadd z12.d, p5/m, z12.d, z18.d, 90 \n\t" \ + "fcadd z13.d, p5/m, z13.d, z19.d, 90 \n\t" \ + "fcadd z14.d, p5/m, z14.d, z20.d, 90 \n\t" \ + "fcadd z15.d, p5/m, z15.d, z21.d, 270 \n\t" \ + "fcadd z16.d, p5/m, z16.d, z22.d, 270 \n\t" \ + "fcadd z17.d, p5/m, z17.d, z23.d, 270 \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -499,12 +500,12 @@ asm ( \ #define TP_PROJ_A64FXd \ { \ asm ( \ - "fadd z12.d, p5/m, z12.d, z24.d \n\t" \ - "fadd z13.d, p5/m, z13.d, z25.d \n\t" \ - "fadd z14.d, p5/m, z14.d, z26.d \n\t" \ - "fadd z15.d, p5/m, 
z15.d, z27.d \n\t" \ - "fadd z16.d, p5/m, z16.d, z28.d \n\t" \ - "fadd z17.d, p5/m, z17.d, z29.d \n\t" \ + "fadd z12.d, p5/m, z12.d, z18.d \n\t" \ + "fadd z13.d, p5/m, z13.d, z19.d \n\t" \ + "fadd z14.d, p5/m, z14.d, z20.d \n\t" \ + "fadd z15.d, p5/m, z15.d, z21.d \n\t" \ + "fadd z16.d, p5/m, z16.d, z22.d \n\t" \ + "fadd z17.d, p5/m, z17.d, z23.d \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -514,12 +515,12 @@ asm ( \ #define XM_PROJ_A64FXd \ { \ asm ( \ - "fcadd z12.d, p5/m, z12.d, z27.d, 270 \n\t" \ - "fcadd z13.d, p5/m, z13.d, z28.d, 270 \n\t" \ - "fcadd z14.d, p5/m, z14.d, z29.d, 270 \n\t" \ - "fcadd z15.d, p5/m, z15.d, z24.d, 270 \n\t" \ - "fcadd z16.d, p5/m, z16.d, z25.d, 270 \n\t" \ - "fcadd z17.d, p5/m, z17.d, z26.d, 270 \n\t" \ + "fcadd z12.d, p5/m, z12.d, z21.d, 270 \n\t" \ + "fcadd z13.d, p5/m, z13.d, z22.d, 270 \n\t" \ + "fcadd z14.d, p5/m, z14.d, z23.d, 270 \n\t" \ + "fcadd z15.d, p5/m, z15.d, z18.d, 270 \n\t" \ + "fcadd z16.d, p5/m, z16.d, z19.d, 270 \n\t" \ + "fcadd z17.d, p5/m, z17.d, z20.d, 270 \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -555,12 +556,12 @@ asm ( \ #define YM_PROJ_A64FXd \ { \ asm ( \ - "fadd z12.d, p5/m, z12.d, z27.d \n\t" \ - "fadd z13.d, p5/m, z13.d, z28.d \n\t" \ - "fadd z14.d, p5/m, z14.d, z29.d \n\t" \ - "fsub z15.d, p5/m, z15.d, z24.d \n\t" \ - "fsub z16.d, p5/m, z16.d, z25.d \n\t" \ - "fsub z17.d, p5/m, z17.d, z26.d \n\t" \ + "fadd z12.d, p5/m, z12.d, z21.d \n\t" \ + "fadd z13.d, p5/m, z13.d, z22.d \n\t" \ + "fadd z14.d, p5/m, z14.d, z23.d \n\t" \ + "fsub z15.d, p5/m, z15.d, z18.d \n\t" \ + "fsub z16.d, p5/m, z16.d, z19.d \n\t" \ + "fsub z17.d, p5/m, z17.d, z20.d \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -570,12 +571,12 @@ asm ( \ #define ZM_PROJ_A64FXd \ { \ asm ( \ - "fcadd z12.d, p5/m, z12.d, z24.d, 270 \n\t" \ - "fcadd z13.d, p5/m, z13.d, z25.d, 270 \n\t" \ - "fcadd z14.d, p5/m, z14.d, z26.d, 270 \n\t" \ - "fcadd z15.d, p5/m, z15.d, z27.d, 90 \n\t" \ - "fcadd z16.d, p5/m, z16.d, z28.d, 90 \n\t" \ - "fcadd z17.d, p5/m, z17.d, z29.d, 90 \n\t" \ + "fcadd z12.d, p5/m, z12.d, z18.d, 270 \n\t" \ + "fcadd z13.d, p5/m, z13.d, z19.d, 270 \n\t" \ + "fcadd z14.d, p5/m, z14.d, z20.d, 270 \n\t" \ + "fcadd z15.d, p5/m, z15.d, z21.d, 90 \n\t" \ + "fcadd z16.d, p5/m, z16.d, z22.d, 90 \n\t" \ + "fcadd z17.d, p5/m, z17.d, z23.d, 90 \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -586,12 +587,12 @@ asm ( \ { \ asm ( \ "ptrue p5.d \n\t" \ - "fsub z12.d, p5/m, z12.d, z24.d \n\t" \ - "fsub z13.d, p5/m, z13.d, z25.d \n\t" \ - "fsub z14.d, p5/m, z14.d, z26.d \n\t" \ - "fsub z15.d, p5/m, z15.d, z27.d \n\t" \ - "fsub z16.d, p5/m, z16.d, z28.d \n\t" \ - "fsub z17.d, p5/m, z17.d, z29.d \n\t" \ + "fsub z12.d, p5/m, z12.d, z18.d \n\t" \ + "fsub z13.d, p5/m, z13.d, z19.d \n\t" \ + "fsub z14.d, p5/m, z14.d, z20.d \n\t" \ + "fsub z15.d, p5/m, z15.d, z21.d \n\t" \ + "fsub z16.d, p5/m, z16.d, z22.d \n\t" \ + 
"fsub z17.d, p5/m, z17.d, z23.d \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -770,6 +771,18 @@ asm ( \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ ); \ } +// PREFETCH_RESULT_L1_STORE (prefetch store to L1) +#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base) \ +{ \ +asm ( \ + "prfd PSTL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PSTL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PSTL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} // ADD_RESULT_INTERNAL #define ADD_RESULT_INTERNAL_A64FXd \ asm ( \ @@ -779,12 +792,12 @@ asm ( \ "fadd z3.d, p5/m, z3.d, z15.d \n\t" \ "fadd z4.d, p5/m, z4.d, z16.d \n\t" \ "fadd z5.d, p5/m, z5.d, z17.d \n\t" \ - "fadd z6.d, p5/m, z6.d, z24.d \n\t" \ - "fadd z7.d, p5/m, z7.d, z25.d \n\t" \ - "fadd z8.d, p5/m, z8.d, z26.d \n\t" \ - "fadd z9.d, p5/m, z9.d, z27.d \n\t" \ - "fadd z10.d, p5/m, z10.d, z28.d \n\t" \ - "fadd z11.d, p5/m, z11.d, z29.d \n\t" \ + "fadd z6.d, p5/m, z6.d, z18.d \n\t" \ + "fadd z7.d, p5/m, z7.d, z19.d \n\t" \ + "fadd z8.d, p5/m, z8.d, z20.d \n\t" \ + "fadd z9.d, p5/m, z9.d, z21.d \n\t" \ + "fadd z10.d, p5/m, z10.d, z22.d \n\t" \ + "fadd z11.d, p5/m, z11.d, z23.d \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h index 1a341cc0..8b4442c8 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ b/Grid/simd/Fujitsu_A64FX_asm_single.h @@ -32,6 +32,7 @@ Author: Nils Meyer #define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) #define PF_GAUGE(A) #define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(A) +#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(A) #define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) #define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) #define LOCK_GAUGE(A) @@ -39,7 +40,7 @@ Author: Nils Meyer #define MASK_REGS DECLARATIONS_A64FXf #define COMPLEX_SIGNS(A) #define LOAD64(A,B) -#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_CHIMU_L1(B); +#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B); #define MULT_2SPIN_DIR_PF(A,B) \ MULT_2SPIN_A64FXf(A); \ PREFETCH_CHIMU_L2(B); \ @@ -88,18 +89,18 @@ asm ( \ #define RESULT_A64FXf(base) \ { \ asm ( \ - "stnt1w { z0.s }, p5, [%[storeptr], -6, mul vl] \n\t" \ - "stnt1w { z1.s }, p5, [%[storeptr], -5, mul vl] \n\t" \ - "stnt1w { z2.s }, p5, [%[storeptr], -4, mul vl] \n\t" \ - "stnt1w { z3.s }, p5, [%[storeptr], -3, mul vl] \n\t" \ - "stnt1w { z4.s }, p5, [%[storeptr], -2, mul vl] \n\t" \ - "stnt1w { z5.s }, p5, [%[storeptr], -1, mul vl] \n\t" \ - "stnt1w { z6.s }, p5, [%[storeptr], 0, mul vl] \n\t" \ - "stnt1w { z7.s }, p5, [%[storeptr], 1, mul vl] \n\t" \ - "stnt1w { z8.s }, p5, [%[storeptr], 2, mul vl] \n\t" \ - "stnt1w { z9.s }, p5, [%[storeptr], 3, mul vl] \n\t" \ - "stnt1w { z10.s }, p5, 
[%[storeptr], 4, mul vl] \n\t" \ - "stnt1w { z11.s }, p5, [%[storeptr], 5, mul vl] \n\t" \ + "str z0, [%[storeptr], -6, mul vl] \n\t" \ + "str z1, [%[storeptr], -5, mul vl] \n\t" \ + "str z2, [%[storeptr], -4, mul vl] \n\t" \ + "str z3, [%[storeptr], -3, mul vl] \n\t" \ + "str z4, [%[storeptr], -2, mul vl] \n\t" \ + "str z5, [%[storeptr], -1, mul vl] \n\t" \ + "str z6, [%[storeptr], 0, mul vl] \n\t" \ + "str z7, [%[storeptr], 1, mul vl] \n\t" \ + "str z8, [%[storeptr], 2, mul vl] \n\t" \ + "str z9, [%[storeptr], 3, mul vl] \n\t" \ + "str z10, [%[storeptr], 4, mul vl] \n\t" \ + "str z11, [%[storeptr], 5, mul vl] \n\t" \ : \ : [storeptr] "r" (base + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -165,12 +166,12 @@ asm ( \ #define LOAD_CHI_A64FXf(base) \ { \ asm ( \ - "ld1w { z12.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z13.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1w { z14.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - "ld1w { z15.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1w { z16.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1w { z17.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ldr z12, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z13, [%[fetchptr], 1, mul vl] \n\t" \ + "ldr z14, [%[fetchptr], 2, mul vl] \n\t" \ + "ldr z15, [%[fetchptr], 3, mul vl] \n\t" \ + "ldr z16, [%[fetchptr], 4, mul vl] \n\t" \ + "ldr z17, [%[fetchptr], 5, mul vl] \n\t" \ : \ : [fetchptr] "r" (base) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -181,18 +182,18 @@ asm ( \ { \ asm ( \ "ptrue p5.s \n\t" \ - "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1w { z27.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1w { z24.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1w { z28.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1w { z25.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1w { z29.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1w { z26.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ + "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ + "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ + "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ + "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ + "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ + "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (base + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -204,18 +205,18 @@ asm ( \ const SiteSpinor & ref(in[offset]); \ asm ( \ "ptrue p5.s \n\t" \ - "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1w { z24.s }, p5/z, [%[fetchptr], 
0, mul vl] \n\t" \ - "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1w { z25.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1w { z26.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1w { z27.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1w { z28.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1w { z29.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ + "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ + "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ + "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ + "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ + "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ : \ : [fetchptr] "r" (&ref[2][0]) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -227,18 +228,18 @@ asm ( \ const SiteSpinor & ref(in[offset]); \ asm ( \ "ptrue p5.s \n\t" \ - "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1w { z27.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1w { z28.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1w { z29.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1w { z24.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1w { z25.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1w { z26.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ + "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ + "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ + "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ + "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ + "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ + "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (&ref[2][0]) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -247,7 +248,7 @@ asm ( \ // LOAD_TABLE0 #define LOAD_TABLE0 \ asm ( \ - "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ : \ : [tableptr] "r" (&lut[0]),[index] "i" (0) \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -256,7 +257,7 @@ asm ( \ // LOAD_TABLE1 #define LOAD_TABLE1 \ asm ( \ - "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] 
\n\t" \ + "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ : \ : [tableptr] "r" (&lut[0]),[index] "i" (1) \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -265,7 +266,7 @@ asm ( \ // LOAD_TABLE2 #define LOAD_TABLE2 \ asm ( \ - "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ : \ : [tableptr] "r" (&lut[0]),[index] "i" (2) \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -274,7 +275,7 @@ asm ( \ // LOAD_TABLE3 #define LOAD_TABLE3 \ asm ( \ - "ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \ + "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ : \ : [tableptr] "r" (&lut[0]),[index] "i" (3) \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -342,12 +343,12 @@ asm ( \ { \ asm ( \ "ptrue p5.s \n\t" \ - "ld1w { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1w { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1w { z26.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z27.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1w { z28.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1w { z29.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ : \ : [fetchptr] "r" (baseU + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -358,12 +359,12 @@ asm ( \ { \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ asm ( \ - "ld1w { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1w { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1w { z26.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z27.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1w { z28.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1w { z29.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ "movprfx z18.s, p5/m, z31.s \n\t" \ "fcmla z18.s, p5/m, z24.s, z12.s, 0 \n\t" \ "movprfx z21.s, p5/m, z31.s \n\t" \ @@ -382,9 +383,9 @@ asm ( \ "fcmla z22.s, p5/m, z25.s, z15.s, 90 \n\t" \ "fcmla z20.s, p5/m, z26.s, z12.s, 90 \n\t" \ "fcmla z23.s, p5/m, z26.s, z15.s, 90 \n\t" \ - "ld1w { z24.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1w { z25.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1w { z26.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \ + "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \ + "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \ "fcmla z18.s, p5/m, z27.s, z13.s, 0 \n\t" \ "fcmla z21.s, p5/m, z27.s, z16.s, 0 \n\t" \ 
"fcmla z19.s, p5/m, z28.s, z13.s, 0 \n\t" \ @@ -419,12 +420,12 @@ asm ( \ { \ asm ( \ "ptrue p5.s \n\t" \ - "fcadd z12.s, p5/m, z12.s, z27.s, 90 \n\t" \ - "fcadd z13.s, p5/m, z13.s, z28.s, 90 \n\t" \ - "fcadd z14.s, p5/m, z14.s, z29.s, 90 \n\t" \ - "fcadd z15.s, p5/m, z15.s, z24.s, 90 \n\t" \ - "fcadd z16.s, p5/m, z16.s, z25.s, 90 \n\t" \ - "fcadd z17.s, p5/m, z17.s, z26.s, 90 \n\t" \ + "fcadd z12.s, p5/m, z12.s, z21.s, 90 \n\t" \ + "fcadd z13.s, p5/m, z13.s, z22.s, 90 \n\t" \ + "fcadd z14.s, p5/m, z14.s, z23.s, 90 \n\t" \ + "fcadd z15.s, p5/m, z15.s, z18.s, 90 \n\t" \ + "fcadd z16.s, p5/m, z16.s, z19.s, 90 \n\t" \ + "fcadd z17.s, p5/m, z17.s, z20.s, 90 \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -480,12 +481,12 @@ asm ( \ #define YP_PROJ_A64FXf \ { \ asm ( \ - "fsub z12.s, p5/m, z12.s, z27.s \n\t" \ - "fsub z13.s, p5/m, z13.s, z28.s \n\t" \ - "fsub z14.s, p5/m, z14.s, z29.s \n\t" \ - "fadd z15.s, p5/m, z15.s, z24.s \n\t" \ - "fadd z16.s, p5/m, z16.s, z25.s \n\t" \ - "fadd z17.s, p5/m, z17.s, z26.s \n\t" \ + "fsub z12.s, p5/m, z12.s, z21.s \n\t" \ + "fsub z13.s, p5/m, z13.s, z22.s \n\t" \ + "fsub z14.s, p5/m, z14.s, z23.s \n\t" \ + "fadd z15.s, p5/m, z15.s, z18.s \n\t" \ + "fadd z16.s, p5/m, z16.s, z19.s \n\t" \ + "fadd z17.s, p5/m, z17.s, z20.s \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -495,12 +496,12 @@ asm ( \ #define ZP_PROJ_A64FXf \ { \ asm ( \ - "fcadd z12.s, p5/m, z12.s, z24.s, 90 \n\t" \ - "fcadd z13.s, p5/m, z13.s, z25.s, 90 \n\t" \ - "fcadd z14.s, p5/m, z14.s, z26.s, 90 \n\t" \ - "fcadd z15.s, p5/m, z15.s, z27.s, 270 \n\t" \ - "fcadd z16.s, p5/m, z16.s, z28.s, 270 \n\t" \ - "fcadd z17.s, p5/m, z17.s, z29.s, 270 \n\t" \ + "fcadd z12.s, p5/m, z12.s, z18.s, 90 \n\t" \ + "fcadd z13.s, p5/m, z13.s, z19.s, 90 \n\t" \ + "fcadd z14.s, p5/m, z14.s, z20.s, 90 \n\t" \ + "fcadd z15.s, p5/m, z15.s, z21.s, 270 \n\t" \ + "fcadd z16.s, p5/m, z16.s, z22.s, 270 \n\t" \ + "fcadd z17.s, p5/m, z17.s, z23.s, 270 \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -510,12 +511,12 @@ asm ( \ #define TP_PROJ_A64FXf \ { \ asm ( \ - "fadd z12.s, p5/m, z12.s, z24.s \n\t" \ - "fadd z13.s, p5/m, z13.s, z25.s \n\t" \ - "fadd z14.s, p5/m, z14.s, z26.s \n\t" \ - "fadd z15.s, p5/m, z15.s, z27.s \n\t" \ - "fadd z16.s, p5/m, z16.s, z28.s \n\t" \ - "fadd z17.s, p5/m, z17.s, z29.s \n\t" \ + "fadd z12.s, p5/m, z12.s, z18.s \n\t" \ + "fadd z13.s, p5/m, z13.s, z19.s \n\t" \ + "fadd z14.s, p5/m, z14.s, z20.s \n\t" \ + "fadd z15.s, p5/m, z15.s, z21.s \n\t" \ + "fadd z16.s, p5/m, z16.s, z22.s \n\t" \ + "fadd z17.s, p5/m, z17.s, z23.s \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -525,12 +526,12 @@ asm ( \ #define XM_PROJ_A64FXf \ { \ asm ( \ - "fcadd z12.s, p5/m, z12.s, z27.s, 270 \n\t" \ - "fcadd z13.s, p5/m, z13.s, z28.s, 270 \n\t" \ - "fcadd z14.s, p5/m, z14.s, z29.s, 270 \n\t" \ - "fcadd z15.s, p5/m, z15.s, z24.s, 270 \n\t" \ - "fcadd z16.s, p5/m, z16.s, 
z25.s, 270 \n\t" \ - "fcadd z17.s, p5/m, z17.s, z26.s, 270 \n\t" \ + "fcadd z12.s, p5/m, z12.s, z21.s, 270 \n\t" \ + "fcadd z13.s, p5/m, z13.s, z22.s, 270 \n\t" \ + "fcadd z14.s, p5/m, z14.s, z23.s, 270 \n\t" \ + "fcadd z15.s, p5/m, z15.s, z18.s, 270 \n\t" \ + "fcadd z16.s, p5/m, z16.s, z19.s, 270 \n\t" \ + "fcadd z17.s, p5/m, z17.s, z20.s, 270 \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -566,12 +567,12 @@ asm ( \ #define YM_PROJ_A64FXf \ { \ asm ( \ - "fadd z12.s, p5/m, z12.s, z27.s \n\t" \ - "fadd z13.s, p5/m, z13.s, z28.s \n\t" \ - "fadd z14.s, p5/m, z14.s, z29.s \n\t" \ - "fsub z15.s, p5/m, z15.s, z24.s \n\t" \ - "fsub z16.s, p5/m, z16.s, z25.s \n\t" \ - "fsub z17.s, p5/m, z17.s, z26.s \n\t" \ + "fadd z12.s, p5/m, z12.s, z21.s \n\t" \ + "fadd z13.s, p5/m, z13.s, z22.s \n\t" \ + "fadd z14.s, p5/m, z14.s, z23.s \n\t" \ + "fsub z15.s, p5/m, z15.s, z18.s \n\t" \ + "fsub z16.s, p5/m, z16.s, z19.s \n\t" \ + "fsub z17.s, p5/m, z17.s, z20.s \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -581,12 +582,12 @@ asm ( \ #define ZM_PROJ_A64FXf \ { \ asm ( \ - "fcadd z12.s, p5/m, z12.s, z24.s, 270 \n\t" \ - "fcadd z13.s, p5/m, z13.s, z25.s, 270 \n\t" \ - "fcadd z14.s, p5/m, z14.s, z26.s, 270 \n\t" \ - "fcadd z15.s, p5/m, z15.s, z27.s, 90 \n\t" \ - "fcadd z16.s, p5/m, z16.s, z28.s, 90 \n\t" \ - "fcadd z17.s, p5/m, z17.s, z29.s, 90 \n\t" \ + "fcadd z12.s, p5/m, z12.s, z18.s, 270 \n\t" \ + "fcadd z13.s, p5/m, z13.s, z19.s, 270 \n\t" \ + "fcadd z14.s, p5/m, z14.s, z20.s, 270 \n\t" \ + "fcadd z15.s, p5/m, z15.s, z21.s, 90 \n\t" \ + "fcadd z16.s, p5/m, z16.s, z22.s, 90 \n\t" \ + "fcadd z17.s, p5/m, z17.s, z23.s, 90 \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -597,12 +598,12 @@ asm ( \ { \ asm ( \ "ptrue p5.s \n\t" \ - "fsub z12.s, p5/m, z12.s, z24.s \n\t" \ - "fsub z13.s, p5/m, z13.s, z25.s \n\t" \ - "fsub z14.s, p5/m, z14.s, z26.s \n\t" \ - "fsub z15.s, p5/m, z15.s, z27.s \n\t" \ - "fsub z16.s, p5/m, z16.s, z28.s \n\t" \ - "fsub z17.s, p5/m, z17.s, z29.s \n\t" \ + "fsub z12.s, p5/m, z12.s, z18.s \n\t" \ + "fsub z13.s, p5/m, z13.s, z19.s \n\t" \ + "fsub z14.s, p5/m, z14.s, z20.s \n\t" \ + "fsub z15.s, p5/m, z15.s, z21.s \n\t" \ + "fsub z16.s, p5/m, z16.s, z22.s \n\t" \ + "fsub z17.s, p5/m, z17.s, z23.s \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ @@ -781,6 +782,18 @@ asm ( \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ ); \ } +// PREFETCH_RESULT_L1_STORE (prefetch store to L1) +#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base) \ +{ \ +asm ( \ + "prfd PSTL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PSTL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PSTL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base) \ + : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} // ADD_RESULT_INTERNAL #define ADD_RESULT_INTERNAL_A64FXf \ asm ( \ @@ -790,12 +803,12 @@ asm ( \ "fadd z3.s, p5/m, z3.s, z15.s \n\t" \ "fadd z4.s, p5/m, z4.s, z16.s \n\t" \ "fadd z5.s, p5/m, z5.s, z17.s \n\t" \ - "fadd z6.s, p5/m, z6.s, z24.s \n\t" \ - "fadd z7.s, p5/m, z7.s, z25.s \n\t" \ - "fadd z8.s, p5/m, z8.s, z26.s \n\t" \ - "fadd z9.s, p5/m, z9.s, z27.s \n\t" \ - "fadd z10.s, p5/m, z10.s, z28.s \n\t" \ - "fadd z11.s, p5/m, z11.s, z29.s \n\t" \ + "fadd z6.s, p5/m, z6.s, z18.s \n\t" \ + "fadd z7.s, p5/m, z7.s, z19.s \n\t" \ + "fadd z8.s, p5/m, z8.s, z20.s \n\t" \ + "fadd z9.s, p5/m, z9.s, z21.s \n\t" \ + "fadd z10.s, p5/m, z10.s, z22.s \n\t" \ + "fadd z11.s, p5/m, z11.s, z23.s \n\t" \ : \ : \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h index 08ac2966..2ddb33f1 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_double.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -32,6 +32,7 @@ Author: Nils Meyer #define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) #define PF_GAUGE(A) #define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(A) +#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(A) #define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) #define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) #define LOCK_GAUGE(A) @@ -39,7 +40,7 @@ Author: Nils Meyer #define MASK_REGS DECLARATIONS_A64FXd #define COMPLEX_SIGNS(A) #define LOAD64(A,B) -#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_CHIMU_L1(B); +#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B); #define MULT_2SPIN_DIR_PF(A,B) \ MULT_2SPIN_A64FXd(A); \ PREFETCH_CHIMU_L2(B); \ @@ -119,27 +120,27 @@ Author: Nils Meyer #define Chimu_10 Chi_10 #define Chimu_11 Chi_11 #define Chimu_12 Chi_12 -#define Chimu_20 U_00 -#define Chimu_21 U_10 -#define Chimu_22 U_20 -#define Chimu_30 U_01 -#define Chimu_31 U_11 -#define Chimu_32 U_21 +#define Chimu_20 UChi_00 +#define Chimu_21 UChi_01 +#define Chimu_22 UChi_02 +#define Chimu_30 UChi_10 +#define Chimu_31 UChi_11 +#define Chimu_32 UChi_12 // RESULT #define RESULT_A64FXd(base) \ { \ - svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \ - svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \ - svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \ - svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \ - svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \ - svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \ - svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \ - svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \ - svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \ - svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \ - svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \ - svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \ + svst1(pg1, 
(float64_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \ } // PREFETCH_CHIMU_L2 (prefetch to L2) #define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ @@ -596,6 +597,13 @@ Author: Nils Meyer svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \ svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \ } +// PREFETCH_RESULT_L1_STORE (prefetch store to L1) +#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base) \ +{ \ + svprfd(pg1, (int64_t*)(base + 0), SV_PSTL1STRM); \ + svprfd(pg1, (int64_t*)(base + 256), SV_PSTL1STRM); \ + svprfd(pg1, (int64_t*)(base + 512), SV_PSTL1STRM); \ +} // ADD_RESULT_INTERNAL #define ADD_RESULT_INTERNAL_A64FXd \ result_00 = svadd_x(pg1, result_00, Chimu_00); \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h index db5555bc..3824aecf 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_single.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -32,6 +32,7 @@ Author: Nils Meyer #define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) #define PF_GAUGE(A) #define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(A) +#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(A) #define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) #define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) #define LOCK_GAUGE(A) @@ -39,7 +40,7 @@ Author: Nils Meyer #define MASK_REGS DECLARATIONS_A64FXf #define COMPLEX_SIGNS(A) #define LOAD64(A,B) -#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_CHIMU_L1(B); +#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B); #define MULT_2SPIN_DIR_PF(A,B) \ MULT_2SPIN_A64FXf(A); \ PREFETCH_CHIMU_L2(B); \ @@ -119,27 +120,27 @@ Author: Nils Meyer #define Chimu_10 Chi_10 #define Chimu_11 Chi_11 #define Chimu_12 Chi_12 -#define Chimu_20 U_00 -#define Chimu_21 U_10 -#define Chimu_22 U_20 -#define Chimu_30 U_01 -#define Chimu_31 U_11 -#define Chimu_32 U_21 +#define Chimu_20 UChi_00 +#define Chimu_21 UChi_01 +#define Chimu_22 UChi_02 +#define Chimu_30 UChi_10 +#define Chimu_31 UChi_11 +#define Chimu_32 UChi_12 // RESULT #define RESULT_A64FXf(base) \ { \ - svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \ - svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \ - svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \ - svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \ - svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \ - svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \ - svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \ - svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \ - svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \ - svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \ - svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \ - svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 
5 * 64), result_32); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \ } // PREFETCH_CHIMU_L2 (prefetch to L2) #define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \ @@ -602,6 +603,13 @@ Author: Nils Meyer svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \ svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \ } +// PREFETCH_RESULT_L1_STORE (prefetch store to L1) +#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base) \ +{ \ + svprfd(pg1, (int64_t*)(base + 0), SV_PSTL1STRM); \ + svprfd(pg1, (int64_t*)(base + 256), SV_PSTL1STRM); \ + svprfd(pg1, (int64_t*)(base + 512), SV_PSTL1STRM); \ +} // ADD_RESULT_INTERNAL #define ADD_RESULT_INTERNAL_A64FXf \ result_00 = svadd_x(pg1, result_00, Chimu_00); \ diff --git a/Grid/simd/Fujitsu_A64FX_undef.h b/Grid/simd/Fujitsu_A64FX_undef.h index 07e30535..5c41a7c4 100644 --- a/Grid/simd/Fujitsu_A64FX_undef.h +++ b/Grid/simd/Fujitsu_A64FX_undef.h @@ -37,6 +37,7 @@ Author: Nils Meyer #undef PREFETCH1_CHIMU #undef PREFETCH_CHIMU #undef PREFETCH_RESULT_L2_STORE +#undef PREFETCH_RESULT_L1_STORE #undef LOAD_GAUGE #undef LOCK_GAUGE #undef UNLOCK_GAUGE From 852db4626a3519dde32fb75e2d12a7448f2a3eb7 Mon Sep 17 00:00:00 2001 From: nils meyer Date: Wed, 15 Apr 2020 18:22:19 +0200 Subject: [PATCH 043/147] re-introduced HOTFIX cause Grid binaries give wrong results otherwise; checked in good gridverter.py --- Grid/simd/Grid_vector_types.h | 2 +- Grid/simd/gridverter.py | 87 ++++++++++++++++++++++++++++------- 2 files changed, 72 insertions(+), 17 deletions(-) diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index c203cd9e..61f19a15 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -118,7 +118,7 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { #ifdef GEN #if defined(A64FX) // breakout A64FX SVE ACLE here //#pragma message("building for A64FX / SVE ACLE") - //#define ARMCLANGHOTFIX + #define ARMCLANGHOTFIX #include "Grid_a64fx-2.h" #else #include "Grid_generic.h" diff --git a/Grid/simd/gridverter.py b/Grid/simd/gridverter.py index 415f5578..137471cd 100755 --- a/Grid/simd/gridverter.py +++ b/Grid/simd/gridverter.py @@ -47,7 +47,7 @@ ALTERNATIVE_LOADS = False # must use with my_wilson4.h and my_wilson4pf.h ALTERNATIVE_REGISTER_MAPPING = False -ALTERNATIVE_REGISTER_MAPPING = not ALTERNATIVE_REGISTER_MAPPING +#ALTERNATIVE_REGISTER_MAPPING = not ALTERNATIVE_REGISTER_MAPPING if ALTERNATIVE_REGISTER_MAPPING == True: ALTERNATIVE_LOADS = False @@ -229,15 +229,25 @@ class Register: gpr = d['asmtableptr'] cast = 'uint64_t' - asm_opcode = 'ld1d' + #asm_opcode = 'ld1d' + #if PRECISION == 'single': + # asm_opcode = 'ld1w' + # cast = 'uint32_t' + asm_opcode = 'ldr' if PRECISION == 'single': - asm_opcode = 'ld1w' + asm_opcode = 'ldr' cast = 'uint32_t' 
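+        # note: unpredicated ldr takes an immediate "mul vl" offset and needs
+        # no governing predicate; this matches the ldr/str kernels checked in
+        # with PATCH 042, where e.g. the emitted table load changed from
+        #   "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl]"
+        # to
+        #   "ldr z30, [%[tableptr], %[index], mul vl]"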
d['I'] += F' {self.name} = svld1(pg1, ({cast}*)&lut[{t}]); \\\n' # using immediate index break-out works - d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}/z, [%[tableptr], %[index], mul vl] \\n\\t" \\\n' + if asm_opcode == 'ldr': + # ldr version + d['A'] += F' "{asm_opcode} {self.asmreg}, [%[tableptr], %[index], mul vl] \\n\\t" \\\n' + else: + # ld1 version + d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}/z, [%[tableptr], %[index], mul vl] \\n\\t" \\\n' + d['asminput'].append(F'[tableptr] "r" (&lut[0])') d['asminput'].append(F'[index] "i" ({t})') d['asmclobber'].append(F'"memory"') @@ -249,9 +259,14 @@ class Register: indices = re.findall(r'\d+', address) index = (int(indices[0]) - offset) * colors + int(indices[1]) - asm_opcode = 'ld1d' + #asm_opcode = 'ld1d' + #if PRECISION == 'single': + #asm_opcode = 'ld1w' + # cast = 'float32_t' + + asm_opcode = 'ldr' if PRECISION == 'single': - asm_opcode = 'ld1w' + asm_opcode = 'ldr' cast = 'float32_t' gpr = d['asmfetchbaseptr'] @@ -259,9 +274,13 @@ class Register: if (target in ['ALL', 'C']): d['C'] += F' {self.name} = {address}; \\\n' if (target in ['ALL', 'I']): +# d['I'] += F' {self.name} = svldnt1(pg1, ({cast}*)({intrinfetchbase} + {index} * 64)); \\\n' d['I'] += F' {self.name} = svld1(pg1, ({cast}*)({intrinfetchbase} + {index} * 64)); \\\n' if (target in ['ALL', 'A']): - d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}/z, [%[fetchptr], {index}, mul vl] \\n\\t" \\\n' + if asm_opcode == 'ldr': + d['A'] += F' "{asm_opcode} {self.asmreg}, [%[fetchptr], {index}, mul vl] \\n\\t" \\\n' + else: + d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}/z, [%[fetchptr], {index}, mul vl] \\n\\t" \\\n' def store(self, address, cast='float64_t', colors=3, offset=STORE_BASE_PTR_COLOR_OFFSET): global d @@ -269,16 +288,24 @@ class Register: indices = re.findall(r'\d+', address) index = (int(indices[0]) - offset) * colors + int(indices[1]) - asm_opcode = 'stnt1d' + #asm_opcode = 'stnt1d' + #if PRECISION == 'single': + # asm_opcode = 'stnt1w' + # cast = 'float32_t' + asm_opcode = 'str' if PRECISION == 'single': - asm_opcode = 'stnt1w' + asm_opcode = 'str' cast = 'float32_t' intrinstorebase = d['intrinstorebase'] d['C'] += F' {address} = {self.name}; \\\n' - d['I'] += F' svstnt1(pg1, ({cast}*)({intrinstorebase} + {index} * 64), {self.name}); \\\n' - d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}, [%[storeptr], {index}, mul vl] \\n\\t" \\\n' + #d['I'] += F' svstnt1(pg1, ({cast}*)({intrinstorebase} + {index} * 64), {self.name}); \\\n' + d['I'] += F' svst1(pg1, ({cast}*)({intrinstorebase} + {index} * 64), {self.name}); \\\n' + if asm_opcode == 'str': + d['A'] += F' "{asm_opcode} {self.asmreg}, [%[storeptr], {index}, mul vl] \\n\\t" \\\n' + else: + d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}, [%[storeptr], {index}, mul vl] \\n\\t" \\\n' def movestr(self, str): global d @@ -621,7 +648,16 @@ def prefetch_L2_store(address, offset): d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' - #d['A'] += + +def prefetch_L1_store(address, offset): + global d + multiplier = 4 # offset in CL, have to multiply by 4 + policy = "PSTL1STRM" # weak + #policy = "PSTL2KEEP" # strong + + d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' + d['A'] += F' "prfd 
{policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' + def asmopen(): #write('asm volatile ( \\', target='A') @@ -878,9 +914,11 @@ if PREFETCH: define(F'PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_{PRECSUFFIX}(A)') define(F'PF_GAUGE(A)') define(F'PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_{PRECSUFFIX}(A)') + define(F'PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_{PRECSUFFIX}(A)') define(F'PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A)') # define(F'PREFETCH1_CHIMU(A)') define(F'PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A)') +# define(F'PREFETCH_CHIMU(A)') else: define(F'PREFETCH_CHIMU_L1(A)') define(F'PREFETCH_GAUGE_L1(A)') @@ -897,8 +935,9 @@ define(F'UNLOCK_GAUGE(A)') define(F'MASK_REGS DECLARATIONS_{PRECSUFFIX}') define(F'COMPLEX_SIGNS(A)') define(F'LOAD64(A,B)') -#define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_RESULT_L2_STORE(B);') -define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_CHIMU_L1(B);') +# prefetch chimu here is useless, because already done in last leg +#define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A);') +define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_RESULT_L2_STORE(B);') if PREFETCH: definemultiline(F'MULT_2SPIN_DIR_PF(A,B) ') write (F' MULT_2SPIN_{PRECSUFFIX}(A); \\') @@ -2156,8 +2195,7 @@ asmclose() #debugall('ZERO_PSI', group='result') newline() -d['factor'] = 0 -# prefetch store spinors into L2 cache +# prefetch store spinors to L2 cache d['factor'] = 0 d['cycles_PREFETCH_L2'] += 0 * d['factor'] write('// PREFETCH_RESULT_L2_STORE (prefetch store to L2)') @@ -2173,6 +2211,23 @@ asmclose() curlyclose() newline() +# prefetch store spinors to L1 cache +d['factor'] = 0 +d['cycles_PREFETCH_L1'] += 0 * d['factor'] +write('// PREFETCH_RESULT_L1_STORE (prefetch store to L1)') +definemultiline(F'PREFETCH_RESULT_L1_STORE_INTERNAL_{PRECSUFFIX}(base)') +curlyopen() +fetch_base_ptr(F"base") +asmopen() +fetch_base_ptr(F"base", target='A') +prefetch_L1_store(F"base", 0) +prefetch_L1_store(F"base", 1) +prefetch_L1_store(F"base", 2) +asmclose() +curlyclose() +newline() + + d['factor'] = 0 write('// ADD_RESULT_INTERNAL') definemultiline(F'ADD_RESULT_INTERNAL_{PRECSUFFIX}') From 6fdce60492bd83688692111090dfe517bf1de08d Mon Sep 17 00:00:00 2001 From: nils meyer Date: Thu, 16 Apr 2020 22:43:32 +0200 Subject: [PATCH 044/147] revised BodyA64FX; 990 GiB/s Wilson, 687 GiB/s DW using intrinsics (armclang 20.0) --- .../WilsonKernelsAsmBodyA64FX.h | 168 +++++++++------- Grid/simd/Fujitsu_A64FX_asm_double.h | 92 ++++----- Grid/simd/Fujitsu_A64FX_asm_single.h | 103 ++++------ Grid/simd/Fujitsu_A64FX_intrin_double.h | 72 +++---- Grid/simd/Fujitsu_A64FX_intrin_single.h | 78 +++----- Grid/simd/Fujitsu_A64FX_undef.h | 28 ++- Grid/simd/gridverter.py | 189 +++++------------- 7 files changed, 279 insertions(+), 451 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h index 54a52468..d77b4414 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h @@ -26,14 +26,14 @@ Author: Nils Meyer *************************************************************************************/ /* END LEGAL */ #ifdef KERNEL_DAG -#define DIR0_PROJMEM(base) XP_PROJMEM(base); -#define DIR1_PROJMEM(base) YP_PROJMEM(base); -#define DIR2_PROJMEM(base) ZP_PROJMEM(base); -#define DIR3_PROJMEM(base) TP_PROJMEM(base); -#define DIR4_PROJMEM(base) 
XM_PROJMEM(base); -#define DIR5_PROJMEM(base) YM_PROJMEM(base); -#define DIR6_PROJMEM(base) ZM_PROJMEM(base); -#define DIR7_PROJMEM(base) TM_PROJMEM(base); +#define DIR0_PROJ XP_PROJ +#define DIR1_PROJ YP_PROJ +#define DIR2_PROJ ZP_PROJ +#define DIR3_PROJ TP_PROJ +#define DIR4_PROJ XM_PROJ +#define DIR5_PROJ YM_PROJ +#define DIR6_PROJ ZM_PROJ +#define DIR7_PROJ TM_PROJ #define DIR0_RECON XP_RECON #define DIR1_RECON YP_RECON_ACCUM #define DIR2_RECON ZP_RECON_ACCUM @@ -43,14 +43,14 @@ Author: Nils Meyer #define DIR6_RECON ZM_RECON_ACCUM #define DIR7_RECON TM_RECON_ACCUM #else -#define DIR0_PROJMEM(base) XM_PROJMEM(base); -#define DIR1_PROJMEM(base) YM_PROJMEM(base); -#define DIR2_PROJMEM(base) ZM_PROJMEM(base); -#define DIR3_PROJMEM(base) TM_PROJMEM(base); -#define DIR4_PROJMEM(base) XP_PROJMEM(base); -#define DIR5_PROJMEM(base) YP_PROJMEM(base); -#define DIR6_PROJMEM(base) ZP_PROJMEM(base); -#define DIR7_PROJMEM(base) TP_PROJMEM(base); +#define DIR0_PROJ XM_PROJ +#define DIR1_PROJ YM_PROJ +#define DIR2_PROJ ZM_PROJ +#define DIR3_PROJ TM_PROJ +#define DIR4_PROJ XP_PROJ +#define DIR5_PROJ YP_PROJ +#define DIR6_PROJ ZP_PROJ +#define DIR7_PROJ TP_PROJ #define DIR0_RECON XM_RECON #define DIR1_RECON YM_RECON_ACCUM #define DIR2_RECON ZM_RECON_ACCUM @@ -91,23 +91,28 @@ Author: Nils Meyer #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ basep = st.GetPFInfo(nent,plocal); nent++; \ - if ( local ) { \ - /* PREFETCH_GAUGE_L1(Dir); slightly worse performance */ \ - PROJ(base); \ - /* PREFETCH_GAUGE_L1(Dir); slightly worse performance */ \ - MAYBEPERM(PERMUTE_DIR,perm); \ - } else { \ - LOAD_CHI(base); \ - } \ + if ( local ) { \ + LOAD_CHIMU(base); \ + LOAD_TABLE(PERMUTE_DIR); \ + PROJ; \ + MAYBEPERM(PERMUTE_DIR,perm); \ + } else { \ + LOAD_CHI(base); \ + } \ base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ - PREFETCH_CHIMU(base); \ - MULT_2SPIN_DIR_PF(Dir,basep); \ - RECON; \ + MULT_2SPIN_1(Dir); \ + PREFETCH_CHIMU(base); \ + PREFETCH_CHIMU_L2(basep); \ + /* PREFETCH_GAUGE_L1(NxtDir); */ \ + MULT_2SPIN_2; \ + if (s == 0) { \ + if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ + } \ + RECON; \ -#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ - base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ - PF_GAUGE(Xp); \ - PREFETCH1_CHIMU(base); \ +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + PREFETCH1_CHIMU(base); \ ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) #define RESULT(base,basep) SAVE_RESULT(base,basep); @@ -121,22 +126,28 @@ Author: Nils Meyer #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ basep = st.GetPFInfo(nent,plocal); nent++; \ - if ( local ) { \ - PROJ(base); \ - MAYBEPERM(PERMUTE_DIR,perm); \ - }else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \ - if ( local || st.same_node[Dir] ) { \ - MULT_2SPIN_DIR_PF(Dir,basep); \ - RECON; \ - } \ + if ( local ) { \ + LOAD_CHIMU(base); \ + LOAD_TABLE(PERMUTE_DIR); \ + PROJ; \ + MAYBEPERM(PERMUTE_DIR,perm); \ + }else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \ base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ - PREFETCH_CHIMU(base); \ + if ( local || st.same_node[Dir] ) { \ + MULT_2SPIN_1(Dir); \ + PREFETCH_CHIMU(base); \ + /* PREFETCH_GAUGE_L1(NxtDir); */ \ + MULT_2SPIN_2; \ + if (s == 0) { \ + if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ + } \ + RECON; \ + PREFETCH_CHIMU_L2(basep); \ + } else { PREFETCH_CHIMU(base); } \ #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ base = 
st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ - PF_GAUGE(Xp); \ PREFETCH1_CHIMU(base); \ - { ZERO_PSI; } \ ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) #define RESULT(base,basep) SAVE_RESULT(base,basep); @@ -149,23 +160,34 @@ Author: Nils Meyer #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ - base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ - if((!local)&&(!st.same_node[Dir]) ) { \ - LOAD_CHI(base); \ - MULT_2SPIN_DIR_PF(Dir,base); \ - RECON; \ - nmu++; \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + if((!local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI(base); \ + MULT_2SPIN_1(Dir); \ + PREFETCH_CHIMU(base); \ + /* PREFETCH_GAUGE_L1(NxtDir); */ \ + MULT_2SPIN_2; \ + if (s == 0) { \ + if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ + } \ + RECON; \ + nmu++; \ } -#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ - nmu=0; \ - { ZERO_PSI;} \ - base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ - if((!local)&&(!st.same_node[Dir]) ) { \ - LOAD_CHI(base); \ - MULT_2SPIN_DIR_PF(Dir,base); \ - RECON; \ - nmu++; \ +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + nmu=0; \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;\ + if((!local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI(base); \ + MULT_2SPIN_1(Dir); \ + PREFETCH_CHIMU(base); \ + /* PREFETCH_GAUGE_L1(NxtDir); */ \ + MULT_2SPIN_2; \ + if (s == 0) { \ + if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ + } \ + RECON; \ + nmu++; \ } #define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} @@ -201,7 +223,7 @@ Author: Nils Meyer uint64_t delta_base, delta_base_p; - ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJMEM,DIR0_RECON); + ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJ,DIR0_RECON); #ifdef SHOW float rescale = 64. * 12.; @@ -221,7 +243,7 @@ Author: Nils Meyer std::cout << "----------------------------------------------------" << std::endl; #endif - ASM_LEG(Yp,Zp,PERMUTE_DIR2,DIR1_PROJMEM,DIR1_RECON); + ASM_LEG(Yp,Zp,PERMUTE_DIR2,DIR1_PROJ,DIR1_RECON); #ifdef SHOW std::cout << "Dir = " << Yp << " " << WHERE<< std::endl; @@ -234,7 +256,7 @@ Author: Nils Meyer std::cout << "----------------------------------------------------" << std::endl; #endif - ASM_LEG(Zp,Tp,PERMUTE_DIR1,DIR2_PROJMEM,DIR2_RECON); + ASM_LEG(Zp,Tp,PERMUTE_DIR1,DIR2_PROJ,DIR2_RECON); #ifdef SHOW std::cout << "Dir = " << Zp << " " << WHERE<< std::endl; @@ -247,7 +269,7 @@ Author: Nils Meyer std::cout << "----------------------------------------------------" << std::endl; #endif - ASM_LEG(Tp,Xm,PERMUTE_DIR0,DIR3_PROJMEM,DIR3_RECON); + ASM_LEG(Tp,Xm,PERMUTE_DIR0,DIR3_PROJ,DIR3_RECON); #ifdef SHOW std::cout << "Dir = " << Tp << " " << WHERE<< std::endl; @@ -260,7 +282,7 @@ Author: Nils Meyer std::cout << "----------------------------------------------------" << std::endl; #endif - ASM_LEG(Xm,Ym,PERMUTE_DIR3,DIR4_PROJMEM,DIR4_RECON); + ASM_LEG(Xm,Ym,PERMUTE_DIR3,DIR4_PROJ,DIR4_RECON); #ifdef SHOW std::cout << "Dir = " << Xm << " " << WHERE<< std::endl; @@ -273,7 +295,7 @@ Author: Nils Meyer std::cout << "----------------------------------------------------" << std::endl; #endif - ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJMEM,DIR5_RECON); + ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON); #ifdef SHOW std::cout << "Dir = " << Ym << " " << WHERE<< std::endl; @@ -286,7 +308,7 @@ Author: Nils Meyer std::cout << "----------------------------------------------------" << std::endl; #endif - ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJMEM,DIR6_RECON); + ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON); #ifdef SHOW 
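    // debug aid, compiled in only with -DSHOW: dumps the result spinor
    // accumulated up to this leg; rescale = 64. * 12. defined above presumably
    // undoes the 64-byte SVE register size times the 12 result registers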
std::cout << "Dir = " << Zm << " " << WHERE<< std::endl; @@ -299,7 +321,7 @@ Author: Nils Meyer std::cout << "----------------------------------------------------" << std::endl; #endif - ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJMEM,DIR7_RECON); + ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON); #ifdef SHOW std::cout << "Dir = " << Tm << " " << WHERE<< std::endl; @@ -337,14 +359,14 @@ Author: Nils Meyer } } -#undef DIR0_PROJMEM -#undef DIR1_PROJMEM -#undef DIR2_PROJMEM -#undef DIR3_PROJMEM -#undef DIR4_PROJMEM -#undef DIR5_PROJMEM -#undef DIR6_PROJMEM -#undef DIR7_PROJMEM +#undef DIR0_PROJ +#undef DIR1_PROJ +#undef DIR2_PROJ +#undef DIR3_PROJ +#undef DIR4_PROJ +#undef DIR5_PROJ +#undef DIR6_PROJ +#undef DIR7_PROJ #undef DIR0_RECON #undef DIR1_RECON #undef DIR2_RECON diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h index 7931398f..4d9e8fd9 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_double.h +++ b/Grid/simd/Fujitsu_A64FX_asm_double.h @@ -25,7 +25,7 @@ Author: Nils Meyer See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ -#define LOAD_CHIMU_A64FXd(x) LOAD_CHIMU_INTERLEAVED_A64FXd(x) +#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXd(base) #define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A) #define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) #define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A) @@ -40,23 +40,19 @@ Author: Nils Meyer #define MASK_REGS DECLARATIONS_A64FXd #define COMPLEX_SIGNS(A) #define LOAD64(A,B) -#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B); -#define MULT_2SPIN_DIR_PF(A,B) \ - MULT_2SPIN_A64FXd(A); \ - PREFETCH_CHIMU_L2(B); \ - if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } } -#define MAYBEPERM(A,perm) { A ; } +#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B) +#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir) +#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd #define LOAD_CHI(base) LOAD_CHI_A64FXd(base) -#define ZERO_PSI -#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) -#define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd -#define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd -#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd -#define TP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TP_PROJ_A64FXd -#define XM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XM_PROJ_A64FXd -#define YM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YM_PROJ_A64FXd -#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZM_PROJ_A64FXd -#define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd +#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) +#define XP_PROJ XP_PROJ_A64FXd +#define YP_PROJ YP_PROJ_A64FXd +#define ZP_PROJ ZP_PROJ_A64FXd +#define TP_PROJ TP_PROJ_A64FXd +#define XM_PROJ XM_PROJ_A64FXd +#define YM_PROJ YM_PROJ_A64FXd +#define ZM_PROJ ZM_PROJ_A64FXd +#define TM_PROJ TM_PROJ_A64FXd #define XP_RECON XP_RECON_A64FXd #define XM_RECON XM_RECON_A64FXd #define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd @@ -67,10 +63,13 @@ Author: Nils Meyer #define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd #define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd #define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd -#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXd; } -#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXd; } -#define 
PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXd; } -#define PERMUTE_DIR3 +#define PERMUTE_DIR0 0 +#define PERMUTE_DIR1 1 +#define PERMUTE_DIR2 2 +#define PERMUTE_DIR3 3 +#define PERMUTE PERMUTE_A64FXd; +#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; } +#define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } } // DECLARATIONS #define DECLARATIONS_A64FXd \ const uint64_t lut[4][8] = { \ @@ -281,8 +280,8 @@ asm ( \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); -// PERM0 -#define PERM0_A64FXd \ +// PERMUTE +#define PERMUTE_A64FXd \ asm ( \ "tbl z12.d, { z12.d }, z30.d \n\t" \ "tbl z13.d, { z13.d }, z30.d \n\t" \ @@ -295,37 +294,6 @@ asm ( \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); -// PERM1 -#define PERM1_A64FXd \ -asm ( \ - "tbl z12.d, { z12.d }, z30.d \n\t" \ - "tbl z13.d, { z13.d }, z30.d \n\t" \ - "tbl z14.d, { z14.d }, z30.d \n\t" \ - "tbl z15.d, { z15.d }, z30.d \n\t" \ - "tbl z16.d, { z16.d }, z30.d \n\t" \ - "tbl z17.d, { z17.d }, z30.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PERM2 -#define PERM2_A64FXd \ -asm ( \ - "tbl z12.d, { z12.d }, z30.d \n\t" \ - "tbl z13.d, { z13.d }, z30.d \n\t" \ - "tbl z14.d, { z14.d }, z30.d \n\t" \ - "tbl z15.d, { z15.d }, z30.d \n\t" \ - "tbl z16.d, { z16.d }, z30.d \n\t" \ - "tbl z17.d, { z17.d }, z30.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PERM3 -#define PERM3_A64FXd - // LOAD_GAUGE #define LOAD_GAUGE \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ @@ -344,7 +312,7 @@ asm ( \ ); \ } // MULT_2SPIN -#define MULT_2SPIN_A64FXd(A) \ +#define MULT_2SPIN_1_A64FXd(A) \ { \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ asm ( \ @@ -375,6 +343,15 @@ asm ( \ "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \ "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \ "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (baseU + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// MULT_2SPIN_BACKEND +#define MULT_2SPIN_2_A64FXd \ +{ \ +asm ( \ "fcmla z18.d, p5/m, z27.d, z13.d, 0 \n\t" \ "fcmla z21.d, p5/m, z27.d, z16.d, 0 \n\t" \ "fcmla z19.d, p5/m, z28.d, z13.d, 0 \n\t" \ @@ -400,15 +377,14 @@ asm ( \ "fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \ "fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \ : \ - : [fetchptr] "r" (baseU + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ + : \ + : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); \ } // XP_PROJ #define XP_PROJ_A64FXd \ { \ asm ( \ - "ptrue p5.d \n\t" \ "fcadd z12.d, p5/m, z12.d, z21.d, 90 \n\t" \ "fcadd z13.d, p5/m, z13.d, z22.d, 90 \n\t" \ "fcadd z14.d, p5/m, z14.d, z23.d, 90 \n\t" \ diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h index 8b4442c8..e1532acb 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ b/Grid/simd/Fujitsu_A64FX_asm_single.h @@ -25,7 +25,7 @@ Author: Nils Meyer See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ -#define LOAD_CHIMU_A64FXf(x) LOAD_CHIMU_INTERLEAVED_A64FXf(x) +#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXf(base) #define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A) #define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) #define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A) @@ -40,23 +40,19 @@ Author: Nils Meyer #define MASK_REGS DECLARATIONS_A64FXf #define COMPLEX_SIGNS(A) #define LOAD64(A,B) -#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B); -#define MULT_2SPIN_DIR_PF(A,B) \ - MULT_2SPIN_A64FXf(A); \ - PREFETCH_CHIMU_L2(B); \ - if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } } -#define MAYBEPERM(A,perm) { A ; } +#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B) +#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir) +#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf #define LOAD_CHI(base) LOAD_CHI_A64FXf(base) -#define ZERO_PSI -#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) -#define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf -#define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf -#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf -#define TP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TP_PROJ_A64FXf -#define XM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XM_PROJ_A64FXf -#define YM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YM_PROJ_A64FXf -#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZM_PROJ_A64FXf -#define TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf +#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) +#define XP_PROJ XP_PROJ_A64FXf +#define YP_PROJ YP_PROJ_A64FXf +#define ZP_PROJ ZP_PROJ_A64FXf +#define TP_PROJ TP_PROJ_A64FXf +#define XM_PROJ XM_PROJ_A64FXf +#define YM_PROJ YM_PROJ_A64FXf +#define ZM_PROJ ZM_PROJ_A64FXf +#define TM_PROJ TM_PROJ_A64FXf #define XP_RECON XP_RECON_A64FXf #define XM_RECON XM_RECON_A64FXf #define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf @@ -67,10 +63,13 @@ Author: Nils Meyer #define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf #define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf #define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf -#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXf; } -#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXf; } -#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXf; } -#define PERMUTE_DIR3 LOAD_TABLE3; if (perm) { PERM3_A64FXf; } +#define PERMUTE_DIR0 0 +#define PERMUTE_DIR1 1 +#define PERMUTE_DIR2 2 +#define PERMUTE_DIR3 3 +#define PERMUTE PERMUTE_A64FXf; +#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1 } else if (Dir == 2) { LOAD_TABLE2; } 
else if (Dir == 3) { LOAD_TABLE3; } +#define MAYBEPERM(A,perm) if (perm) { PERMUTE; } // DECLARATIONS #define DECLARATIONS_A64FXf \ const uint32_t lut[4][16] = { \ @@ -281,50 +280,8 @@ asm ( \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); -// PERM0 -#define PERM0_A64FXf \ -asm ( \ - "tbl z12.s, { z12.s }, z30.s \n\t" \ - "tbl z13.s, { z13.s }, z30.s \n\t" \ - "tbl z14.s, { z14.s }, z30.s \n\t" \ - "tbl z15.s, { z15.s }, z30.s \n\t" \ - "tbl z16.s, { z16.s }, z30.s \n\t" \ - "tbl z17.s, { z17.s }, z30.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PERM1 -#define PERM1_A64FXf \ -asm ( \ - "tbl z12.s, { z12.s }, z30.s \n\t" \ - "tbl z13.s, { z13.s }, z30.s \n\t" \ - "tbl z14.s, { z14.s }, z30.s \n\t" \ - "tbl z15.s, { z15.s }, z30.s \n\t" \ - "tbl z16.s, { z16.s }, z30.s \n\t" \ - "tbl z17.s, { z17.s }, z30.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PERM2 -#define PERM2_A64FXf \ -asm ( \ - "tbl z12.s, { z12.s }, z30.s \n\t" \ - "tbl z13.s, { z13.s }, z30.s \n\t" \ - "tbl z14.s, { z14.s }, z30.s \n\t" \ - "tbl z15.s, { z15.s }, z30.s \n\t" \ - "tbl z16.s, { z16.s }, z30.s \n\t" \ - "tbl z17.s, { z17.s }, z30.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PERM3 -#define PERM3_A64FXf \ +// PERMUTE +#define PERMUTE_A64FXf \ asm ( \ "tbl z12.s, { z12.s }, z30.s \n\t" \ "tbl z13.s, { z13.s }, z30.s \n\t" \ @@ -355,7 +312,7 @@ asm ( \ ); \ } // MULT_2SPIN -#define MULT_2SPIN_A64FXf(A) \ +#define MULT_2SPIN_1_A64FXf(A) \ { \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ asm ( \ @@ -386,6 +343,15 @@ asm ( \ "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \ "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \ "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (baseU + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// MULT_2SPIN_BACKEND +#define MULT_2SPIN_2_A64FXf \ +{ \ +asm ( \ "fcmla z18.s, p5/m, z27.s, z13.s, 0 \n\t" \ "fcmla z21.s, p5/m, z27.s, z16.s, 0 \n\t" \ "fcmla z19.s, p5/m, z28.s, z13.s, 0 \n\t" \ @@ -411,15 +377,14 @@ asm ( \ "fcmla z20.s, p5/m, z26.s, z14.s, 90 \n\t" \ "fcmla z23.s, p5/m, z26.s, z17.s, 90 \n\t" \ : \ - : [fetchptr] "r" (baseU + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); \ } // XP_PROJ #define XP_PROJ_A64FXf \ { \ asm ( \ - "ptrue p5.s \n\t" \ "fcadd z12.s, p5/m, z12.s, z21.s, 90 \n\t" \ "fcadd 
z13.s, p5/m, z13.s, z22.s, 90 \n\t" \ "fcadd z14.s, p5/m, z14.s, z23.s, 90 \n\t" \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h index 2ddb33f1..4a792047 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_double.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -25,7 +25,7 @@ Author: Nils Meyer See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ -#define LOAD_CHIMU_A64FXd(x) LOAD_CHIMU_INTERLEAVED_A64FXd(x) +#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXd(base) #define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A) #define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) #define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A) @@ -40,23 +40,19 @@ Author: Nils Meyer #define MASK_REGS DECLARATIONS_A64FXd #define COMPLEX_SIGNS(A) #define LOAD64(A,B) -#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B); -#define MULT_2SPIN_DIR_PF(A,B) \ - MULT_2SPIN_A64FXd(A); \ - PREFETCH_CHIMU_L2(B); \ - if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } } -#define MAYBEPERM(A,perm) { A ; } +#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B) +#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir) +#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd #define LOAD_CHI(base) LOAD_CHI_A64FXd(base) -#define ZERO_PSI -#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) -#define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd -#define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd -#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd -#define TP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TP_PROJ_A64FXd -#define XM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XM_PROJ_A64FXd -#define YM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YM_PROJ_A64FXd -#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZM_PROJ_A64FXd -#define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd +#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) +#define XP_PROJ XP_PROJ_A64FXd +#define YP_PROJ YP_PROJ_A64FXd +#define ZP_PROJ ZP_PROJ_A64FXd +#define TP_PROJ TP_PROJ_A64FXd +#define XM_PROJ XM_PROJ_A64FXd +#define YM_PROJ YM_PROJ_A64FXd +#define ZM_PROJ ZM_PROJ_A64FXd +#define TM_PROJ TM_PROJ_A64FXd #define XP_RECON XP_RECON_A64FXd #define XM_RECON XM_RECON_A64FXd #define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd @@ -67,10 +63,13 @@ Author: Nils Meyer #define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd #define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd #define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd -#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXd; } -#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXd; } -#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXd; } -#define PERMUTE_DIR3 +#define PERMUTE_DIR0 0 +#define PERMUTE_DIR1 1 +#define PERMUTE_DIR2 2 +#define PERMUTE_DIR3 3 +#define PERMUTE PERMUTE_A64FXd; +#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; } +#define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } } // DECLARATIONS #define DECLARATIONS_A64FXd \ const uint64_t lut[4][8] = { \ @@ -254,8 +253,8 @@ Author: Nils Meyer #define LOAD_TABLE3 \ table0 = svld1(pg1, (uint64_t*)&lut[3]); -// PERM0 -#define PERM0_A64FXd \ +// PERMUTE +#define PERMUTE_A64FXd \ Chi_00 = svtbl(Chi_00, 
table0); \ Chi_01 = svtbl(Chi_01, table0); \ Chi_02 = svtbl(Chi_02, table0); \ @@ -263,27 +262,6 @@ Author: Nils Meyer Chi_11 = svtbl(Chi_11, table0); \ Chi_12 = svtbl(Chi_12, table0); -// PERM1 -#define PERM1_A64FXd \ - Chi_00 = svtbl(Chi_00, table0); \ - Chi_01 = svtbl(Chi_01, table0); \ - Chi_02 = svtbl(Chi_02, table0); \ - Chi_10 = svtbl(Chi_10, table0); \ - Chi_11 = svtbl(Chi_11, table0); \ - Chi_12 = svtbl(Chi_12, table0); - -// PERM2 -#define PERM2_A64FXd \ - Chi_00 = svtbl(Chi_00, table0); \ - Chi_01 = svtbl(Chi_01, table0); \ - Chi_02 = svtbl(Chi_02, table0); \ - Chi_10 = svtbl(Chi_10, table0); \ - Chi_11 = svtbl(Chi_11, table0); \ - Chi_12 = svtbl(Chi_12, table0); - -// PERM3 -#define PERM3_A64FXd - // LOAD_GAUGE #define LOAD_GAUGE \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ @@ -296,7 +274,7 @@ Author: Nils Meyer U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ } // MULT_2SPIN -#define MULT_2SPIN_A64FXd(A) \ +#define MULT_2SPIN_1_A64FXd(A) \ { \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ @@ -320,6 +298,10 @@ Author: Nils Meyer U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \ U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \ U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \ +} +// MULT_2SPIN_BACKEND +#define MULT_2SPIN_2_A64FXd \ +{ \ UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \ UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \ UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h index 3824aecf..0ba5df17 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_single.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -25,7 +25,7 @@ Author: Nils Meyer See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ -#define LOAD_CHIMU_A64FXf(x) LOAD_CHIMU_INTERLEAVED_A64FXf(x) +#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXf(base) #define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A) #define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) #define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A) @@ -40,23 +40,19 @@ Author: Nils Meyer #define MASK_REGS DECLARATIONS_A64FXf #define COMPLEX_SIGNS(A) #define LOAD64(A,B) -#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B); -#define MULT_2SPIN_DIR_PF(A,B) \ - MULT_2SPIN_A64FXf(A); \ - PREFETCH_CHIMU_L2(B); \ - if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } } -#define MAYBEPERM(A,perm) { A ; } +#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B) +#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir) +#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf #define LOAD_CHI(base) LOAD_CHI_A64FXf(base) -#define ZERO_PSI -#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) -#define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf -#define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf -#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf -#define TP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TP_PROJ_A64FXf -#define XM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XM_PROJ_A64FXf -#define YM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YM_PROJ_A64FXf -#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZM_PROJ_A64FXf -#define 
TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf +#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) +#define XP_PROJ XP_PROJ_A64FXf +#define YP_PROJ YP_PROJ_A64FXf +#define ZP_PROJ ZP_PROJ_A64FXf +#define TP_PROJ TP_PROJ_A64FXf +#define XM_PROJ XM_PROJ_A64FXf +#define YM_PROJ YM_PROJ_A64FXf +#define ZM_PROJ ZM_PROJ_A64FXf +#define TM_PROJ TM_PROJ_A64FXf #define XP_RECON XP_RECON_A64FXf #define XM_RECON XM_RECON_A64FXf #define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf @@ -67,10 +63,13 @@ Author: Nils Meyer #define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf #define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf #define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf -#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXf; } -#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXf; } -#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXf; } -#define PERMUTE_DIR3 LOAD_TABLE3; if (perm) { PERM3_A64FXf; } +#define PERMUTE_DIR0 0 +#define PERMUTE_DIR1 1 +#define PERMUTE_DIR2 2 +#define PERMUTE_DIR3 3 +#define PERMUTE PERMUTE_A64FXf; +#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1 } else if (Dir == 2) { LOAD_TABLE2; } else if (Dir == 3) { LOAD_TABLE3; } +#define MAYBEPERM(A,perm) if (perm) { PERMUTE; } // DECLARATIONS #define DECLARATIONS_A64FXf \ const uint32_t lut[4][16] = { \ @@ -254,35 +253,8 @@ Author: Nils Meyer #define LOAD_TABLE3 \ table0 = svld1(pg1, (uint32_t*)&lut[3]); -// PERM0 -#define PERM0_A64FXf \ - Chi_00 = svtbl(Chi_00, table0); \ - Chi_01 = svtbl(Chi_01, table0); \ - Chi_02 = svtbl(Chi_02, table0); \ - Chi_10 = svtbl(Chi_10, table0); \ - Chi_11 = svtbl(Chi_11, table0); \ - Chi_12 = svtbl(Chi_12, table0); - -// PERM1 -#define PERM1_A64FXf \ - Chi_00 = svtbl(Chi_00, table0); \ - Chi_01 = svtbl(Chi_01, table0); \ - Chi_02 = svtbl(Chi_02, table0); \ - Chi_10 = svtbl(Chi_10, table0); \ - Chi_11 = svtbl(Chi_11, table0); \ - Chi_12 = svtbl(Chi_12, table0); - -// PERM2 -#define PERM2_A64FXf \ - Chi_00 = svtbl(Chi_00, table0); \ - Chi_01 = svtbl(Chi_01, table0); \ - Chi_02 = svtbl(Chi_02, table0); \ - Chi_10 = svtbl(Chi_10, table0); \ - Chi_11 = svtbl(Chi_11, table0); \ - Chi_12 = svtbl(Chi_12, table0); - -// PERM3 -#define PERM3_A64FXf \ +// PERMUTE +#define PERMUTE_A64FXf \ Chi_00 = svtbl(Chi_00, table0); \ Chi_01 = svtbl(Chi_01, table0); \ Chi_02 = svtbl(Chi_02, table0); \ @@ -302,7 +274,7 @@ Author: Nils Meyer U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ } // MULT_2SPIN -#define MULT_2SPIN_A64FXf(A) \ +#define MULT_2SPIN_1_A64FXf(A) \ { \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ @@ -326,6 +298,10 @@ Author: Nils Meyer U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \ U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \ U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \ +} +// MULT_2SPIN_BACKEND +#define MULT_2SPIN_2_A64FXf \ +{ \ UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \ UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \ UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \ diff --git a/Grid/simd/Fujitsu_A64FX_undef.h b/Grid/simd/Fujitsu_A64FX_undef.h index 5c41a7c4..81eec37a 100644 --- a/Grid/simd/Fujitsu_A64FX_undef.h +++ b/Grid/simd/Fujitsu_A64FX_undef.h @@ -26,14 +26,12 @@ Author: Nils Meyer *************************************************************************************/ /* END LEGAL */ -#undef LOAD_CHIMU_A64FXd -#undef LOAD_CHIMU_A64FXf 
+#undef LOAD_CHIMU #undef PREFETCH_CHIMU_L1 #undef PREFETCH_GAUGE_L1 #undef PREFETCH_CHIMU_L2 #undef PREFETCH_GAUGE_L2 #undef PREFETCH_GAUGE_L1_INTERNAL -#undef PF_GAUGE #undef PREFETCH1_CHIMU #undef PREFETCH_CHIMU #undef PREFETCH_RESULT_L2_STORE @@ -42,22 +40,20 @@ Author: Nils Meyer #undef LOCK_GAUGE #undef UNLOCK_GAUGE #undef MASK_REGS -#undef COMPLEX_SIGNS -#undef LOAD64 #undef SAVE_RESULT #undef ADD_RESULT -#undef MULT_2SPIN_DIR_PF +#undef MULT_2SPIN_1 +#undef MULT_2SPIN_2 #undef MAYBEPERM #undef LOAD_CHI -#undef ZERO_PSI -#undef XP_PROJMEM -#undef YP_PROJMEM -#undef ZP_PROJMEM -#undef TP_PROJMEM -#undef XM_PROJMEM -#undef YM_PROJMEM -#undef ZM_PROJMEM -#undef TM_PROJMEM +#undef XP_PROJ +#undef YP_PROJ +#undef ZP_PROJ +#undef TP_PROJ +#undef XM_PROJ +#undef YM_PROJ +#undef ZM_PROJ +#undef TM_PROJ #undef XP_RECON #undef XM_RECON #undef XM_RECON_ACCUM @@ -68,10 +64,12 @@ Author: Nils Meyer #undef YP_RECON_ACCUM #undef ZP_RECON_ACCUM #undef TP_RECON_ACCUM +#undef PERMUTE #undef PERMUTE_DIR0 #undef PERMUTE_DIR1 #undef PERMUTE_DIR2 #undef PERMUTE_DIR3 +#undef LOAD_TABLE #undef LOAD_TABLE0 #undef LOAD_TABLE1 #undef LOAD_TABLE2 diff --git a/Grid/simd/gridverter.py b/Grid/simd/gridverter.py index 137471cd..7628159b 100755 --- a/Grid/simd/gridverter.py +++ b/Grid/simd/gridverter.py @@ -115,37 +115,9 @@ STORE_BASE_PTR_COLOR_OFFSET = 2 # 256 bytes * 2.2 GHz = 563.2 GB/s (base 10), 524 GB/s (base 2) OPT = """ -#ifdef INTERIOR - -#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ - basep = st.GetPFInfo(nent,plocal); nent++; \ - if ( local ) { \ --- LOAD64(%r10,isigns); \ - PROJ(base); \ -++ PF_GAUGE(Dir); \ - MAYBEPERM(PERMUTE_DIR,perm); \ - } else if ( st.same_node[Dir] ) { - LOAD_CHI(base); -++ PF_GAUGE(Dir); - } \ - base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ - if ( local || st.same_node[Dir] ) { \ - MULT_2SPIN_DIR_PF(Dir,basep); \ - PREFETCH_CHIMU(base); \ --- LOAD64(%r10,isigns); \ - RECON; \ - } else { PREFETCH_CHIMU(base); } - -#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ - base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ --- PF_GAUGE(Xp); \ - PREFETCH1_CHIMU(base); \ - { ZERO_PSI; } \ - ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) - -#define RESULT(base,basep) SAVE_RESULT(base,basep); - -#endif +* interleave prefetching and compute in MULT_2SPIN +* could test storing U's in MULT_2SPIN to L1d, might be beneficial for life time cache lines +* structure reordering: MAYBEPERM after MULT_2SPIN ? 
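+* for reference, the leg structure these notes refer to (see
+  WilsonKernelsAsmBodyA64FX.h in this patch):
+    LOAD_CHIMU(base); LOAD_TABLE(PERMUTE_DIR); PROJ; MAYBEPERM(PERMUTE_DIR,perm);
+    MULT_2SPIN_1(Dir); PREFETCH_CHIMU(base); MULT_2SPIN_2; RECON;
+  gauge loads (MULT_2SPIN_1) are split from the FCMLA block (MULT_2SPIN_2)
+  so the prefetches for the next leg can issue in between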
""" filename = 'XXX' @@ -905,7 +877,8 @@ if ALTERNATIVE_LOADS == True: define(F'LOAD_CHIMU_0312_PLUG LOAD_CHIMU_0312_{PRECSUFFIX}') define(F'LOAD_CHIMU(x)') else: - define(F'LOAD_CHIMU_{PRECSUFFIX}(x) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(x)') + #define(F'LOAD_CHIMU_{PRECSUFFIX}(x) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(x)') + define(F'LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(base)') if PREFETCH: define(F'PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_{PRECSUFFIX}(A)') @@ -935,39 +908,22 @@ define(F'UNLOCK_GAUGE(A)') define(F'MASK_REGS DECLARATIONS_{PRECSUFFIX}') define(F'COMPLEX_SIGNS(A)') define(F'LOAD64(A,B)') -# prefetch chimu here is useless, because already done in last leg -#define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A);') -define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_RESULT_L2_STORE(B);') -if PREFETCH: - definemultiline(F'MULT_2SPIN_DIR_PF(A,B) ') - write (F' MULT_2SPIN_{PRECSUFFIX}(A); \\') - write (F' PREFETCH_CHIMU_L2(B); \\') - write (F' if (s == 0) {{ if ((A == 0) || (A == 4)) {{ PREFETCH_GAUGE_L2(A); }} }}') - -# definemultiline(F'MULT_2SPIN_DIR_PF(A,B) PREFETCH_GAUGE_L1(A);') -# write (F' PREFETCH_CHIMU_L2(B); \\') -# write (F' MULT_2SPIN_{PRECSUFFIX}(A); \\') -# write (F' if (s == 0) {{ if ((A == 0) || (A == 4)) {{ PREFETCH_GAUGE_L2(A); }} }}') - newline() -else: - define(F'MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_{PRECSUFFIX}(A)') -# break out maybeperm in permutes -#define(F'MAYBEPERM(A,perm) if (perm) {{ A ; }}') -define(F'MAYBEPERM(A,perm) {{ A ; }}') +define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_RESULT_L2_STORE(B)') +define(F'MULT_2SPIN_1(Dir) MULT_2SPIN_1_{PRECSUFFIX}(Dir)') +define(F'MULT_2SPIN_2 MULT_2SPIN_2_{PRECSUFFIX}') define(F'LOAD_CHI(base) LOAD_CHI_{PRECSUFFIX}(base)') # don't need zero psi, everything is done in recons #define(F'ZERO_PSI ZERO_PSI_{PRECSUFFIX}') -define(F'ZERO_PSI') -define(F'ADD_RESULT(base,basep) LOAD_CHIMU_{PRECSUFFIX}(base); ADD_RESULT_INTERNAL_{PRECSUFFIX}; RESULT_{PRECSUFFIX}(base)') +define(F'ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_{PRECSUFFIX}; RESULT_{PRECSUFFIX}(base)') # loads projections -define(F'XP_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); XP_PROJ_{PRECSUFFIX}') -define(F'YP_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); YP_PROJ_{PRECSUFFIX}') -define(F'ZP_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); ZP_PROJ_{PRECSUFFIX}') -define(F'TP_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); TP_PROJ_{PRECSUFFIX}') -define(F'XM_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); XM_PROJ_{PRECSUFFIX}') -define(F'YM_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); YM_PROJ_{PRECSUFFIX}') -define(F'ZM_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); ZM_PROJ_{PRECSUFFIX}') -define(F'TM_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); TM_PROJ_{PRECSUFFIX}') +define(F'XP_PROJ XP_PROJ_{PRECSUFFIX}') +define(F'YP_PROJ YP_PROJ_{PRECSUFFIX}') +define(F'ZP_PROJ ZP_PROJ_{PRECSUFFIX}') +define(F'TP_PROJ TP_PROJ_{PRECSUFFIX}') +define(F'XM_PROJ XM_PROJ_{PRECSUFFIX}') +define(F'YM_PROJ YM_PROJ_{PRECSUFFIX}') +define(F'ZM_PROJ ZM_PROJ_{PRECSUFFIX}') +define(F'TM_PROJ TM_PROJ_{PRECSUFFIX}') # recons define(F'XP_RECON XP_RECON_{PRECSUFFIX}') define(F'XM_RECON XM_RECON_{PRECSUFFIX}') @@ -979,14 +935,21 @@ define(F'XP_RECON_ACCUM XP_RECON_ACCUM_{PRECSUFFIX}') define(F'YP_RECON_ACCUM YP_RECON_ACCUM_{PRECSUFFIX}') define(F'ZP_RECON_ACCUM ZP_RECON_ACCUM_{PRECSUFFIX}') define(F'TP_RECON_ACCUM TP_RECON_ACCUM_{PRECSUFFIX}') -# permutes -define(F'PERMUTE_DIR0 LOAD_TABLE0; if (perm) {{ PERM0_{PRECSUFFIX}; }}') -define(F'PERMUTE_DIR1 
LOAD_TABLE1; if (perm) {{ PERM1_{PRECSUFFIX}; }}') -define(F'PERMUTE_DIR2 LOAD_TABLE2; if (perm) {{ PERM2_{PRECSUFFIX}; }}') +# new permutes +define(F'PERMUTE_DIR0 0') +define(F'PERMUTE_DIR1 1') +define(F'PERMUTE_DIR2 2') +define(F'PERMUTE_DIR3 3') +define(F'PERMUTE PERMUTE_{PRECSUFFIX};') +# load table +#define(F'MAYBEPERM(A,perm) if (perm) {{ A ; }}') if PRECISION == 'double': - define(F'PERMUTE_DIR3') + define(F'LOAD_TABLE(Dir) if (Dir == 0) {{ LOAD_TABLE0; }} else if (Dir == 1) {{ LOAD_TABLE1; }} else if (Dir == 2) {{ LOAD_TABLE2; }}') + define(F'MAYBEPERM(Dir,perm) if (Dir != 3) {{ if (perm) {{ PERMUTE; }} }}') else: - define(F'PERMUTE_DIR3 LOAD_TABLE3; if (perm) {{ PERM3_{PRECSUFFIX}; }}') + define(F'LOAD_TABLE(Dir) if (Dir == 0) {{ LOAD_TABLE0; }} else if (Dir == 1) {{ LOAD_TABLE1 }} else if (Dir == 2) {{ LOAD_TABLE2; }} else if (Dir == 3) {{ LOAD_TABLE3; }}') + define(F'MAYBEPERM(A,perm) if (perm) {{ PERMUTE; }}') + write('// DECLARATIONS') @@ -1040,20 +1003,14 @@ U_01.declare() U_11.declare() U_21.declare() # 6 -> 30 regs -# all true +# all predications true pg1.declare() if PRECISION == 'double': pg1.movestr('svptrue_b64()') else: pg1.movestr('svptrue_b32()') -# even elements only -#pg2.declare() -#pg2.movestr('svzip1_b64(svptrue_b64(), svpfalse_b())') - -# preload tables -# 0: swap -# 1: permute 1 +# tables if PRECISION == 'double': write(' svuint64_t table0; \\', target='I') # -> 31 regs else: @@ -1061,10 +1018,10 @@ else: zero0.declare() +# zero register asmopen() zero0.zero(zeroreg=True) asmclose() - newline() define('Chimu_00 Chi_00', target='I') @@ -1087,7 +1044,6 @@ else: # wilson4.h define('Chimu_30 U_01', target='I') define('Chimu_31 U_11', target='I') define('Chimu_32 U_21', target='I') - newline() @@ -1380,47 +1336,11 @@ table0.loadtable(3) asmclose() newline() -# 8 directions = 6x permutations -d['factor'] = 2 # factor is 0 -d['cycles_PERM'] += 6 * d['factor'] -write('// PERM0') -definemultiline(F'PERM0_{PRECSUFFIX}') -debugall('PERM0 PRE', group='Chi') -asmopen() -#table0.loadtable(0) -Chi_00.permute(0, table0) -Chi_01.permute(0, table0) -Chi_02.permute(0, table0) -Chi_10.permute(0, table0) -Chi_11.permute(0, table0) -Chi_12.permute(0, table0) -asmclose() -debugall('PERM0 POST', group='Chi') -newline() - d['factor'] = 2 # factor is 2 d['cycles_PERM'] += 6 * d['factor'] -write('// PERM1') -definemultiline(F'PERM1_{PRECSUFFIX}') -debugall('PERM1 PRE', group='Chi') -asmopen() -#table0.loadtable(1) -Chi_00.permute(1, table0) -Chi_01.permute(1, table0) -Chi_02.permute(1, table0) -Chi_10.permute(1, table0) -Chi_11.permute(1, table0) -Chi_12.permute(1, table0) -asmclose() -debugall('PERM1 POST', group='Chi') -newline() - -d['factor'] = 2 # factor is 2 -# PERM2 = swap real and imaginary -d['cycles_PERM'] += 6 * d['factor'] -write('// PERM2') -definemultiline(F'PERM2_{PRECSUFFIX}') -debugall('PERM2 PRE', group='Chi') +write('// PERMUTE') +definemultiline(F'PERMUTE_{PRECSUFFIX}') +debugall('PERM PRE', group='Chi') asmopen() #table0.loadtable(2) Chi_00.permute(2, table0) @@ -1430,26 +1350,7 @@ Chi_10.permute(2, table0) Chi_11.permute(2, table0) Chi_12.permute(2, table0) asmclose() -debugall('PERM2 POST', group='Chi') -newline() - -# PERM3 = identity (DP), so exclude from counting -d['factor'] = 0 -d['cycles_PERM'] += 6 * d['factor'] -write('// PERM3') -definemultiline(F'PERM3_{PRECSUFFIX}') -if PRECISION == 'single': - debugall('PERM3 PRE', group='Chi') - asmopen() - #table0.loadtable(3) - Chi_00.permute(3, table0) - Chi_01.permute(3, table0) - Chi_02.permute(3, table0) - 
Chi_10.permute(3, table0) - Chi_11.permute(3, table0) - Chi_12.permute(3, table0) - asmclose() - debugall('PERM3 POST', group='Chi') +debugall('PERM POST', group='Chi') newline() write('// LOAD_GAUGE') @@ -1473,7 +1374,7 @@ if ASM_LOAD_GAUGE: asmclose() curlyclose() newline() -# XXXXXX remove loads + d['factor'] = 8 # MULT_2SPIN executes 1 time per direction = 8 times total # assume all U loads are hidden # FCMLA issue latency = 2 cycles @@ -1482,7 +1383,7 @@ d['factor'] = 8 # MULT_2SPIN executes 1 time per direction = 8 times total # 6 rounds of FCMLA, each with 6 FCMLA -> 21 - 6*2 = 9 d['cycles_MULT_2SPIN'] += 6 * 21 * d['factor'] write('// MULT_2SPIN') -definemultiline(F'MULT_2SPIN_{PRECSUFFIX}(A)') +definemultiline(F'MULT_2SPIN_1_{PRECSUFFIX}(A)') curlyopen() #write(' const auto & ref(U[sU][A]); \\') if GRIDBENCH: # referencing differs in Grid and GridBench @@ -1541,7 +1442,15 @@ if ASM_LOAD_GAUGE: U_00.load("ref[0][2]") # U_00, U_10, U_20 overloaded U_10.load("ref[1][2]") # early load U_20.load("ref[2][2]") # A --> +asmclose() +debugall('MULT_2SPIN_1', group='UChi') +curlyclose() +newline() +write('// MULT_2SPIN_BACKEND') +definemultiline(F'MULT_2SPIN_2_{PRECSUFFIX}') +curlyopen() +asmopen() # round 3 UChi_00.mac0(U_01, Chi_01) # armclang separates fcmla(..., 0) and UChi_10.mac0(U_01, Chi_11) # fcmla(..., 90) @@ -1571,7 +1480,7 @@ UChi_11.mac1(U_10, Chi_12) UChi_02.mac1(U_20, Chi_02) UChi_12.mac1(U_20, Chi_12) asmclose() -debugall('MULT_2SPIN', group='UChi') +debugall('MULT_2SPIN_2', group='UChi') curlyclose() newline() @@ -1587,7 +1496,7 @@ if ALTERNATIVE_LOADS == True: write(' LOAD_CHIMU_0312_PLUG \\') curlyopen() asmopen() -pg1.loadpredication() +#pg1.loadpredication() Chi_00.addTimesI(Chimu_00, Chimu_30) Chi_01.addTimesI(Chimu_01, Chimu_31) Chi_02.addTimesI(Chimu_02, Chimu_32) From 64b72fc17f42c17f6b96f14929638ea8c013df13 Mon Sep 17 00:00:00 2001 From: nils meyer Date: Sun, 19 Apr 2020 01:25:40 +0200 Subject: [PATCH 045/147] testing gcc 10.0.1: build errors in Exchange1 using -DA64FX and in Lattice_base.h building Dslash only --- .../implementation/WilsonKernelsAsmA64FX.h | 7 ++++- .../implementation/WilsonKernelsAsmBody.h | 2 ++ Grid/simd/Fujitsu_A64FX_asm_double.h | 2 -- Grid/simd/Fujitsu_A64FX_asm_single.h | 2 -- Grid/simd/Fujitsu_A64FX_intrin_double.h | 28 +++++++++---------- Grid/simd/Fujitsu_A64FX_intrin_single.h | 28 +++++++++---------- Grid/simd/Grid_a64fx-2.h | 2 +- Grid/simd/Grid_vector_types.h | 5 +++- benchmarks/Benchmark_dwf.cc | 8 ++++++ benchmarks/Benchmark_wilson.cc | 3 ++ benchmarks/Benchmark_wilson_sweep.cc | 3 ++ 11 files changed, 53 insertions(+), 37 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h index 2a88414a..9b9dba74 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h @@ -32,7 +32,12 @@ Author: paboyle #if defined(A64FXASM) -#pragma message("invoking A64FX Dslash") +// include here if A64FX was not defined +#ifndef A64FX +#include +#endif + +#pragma message("specialize A64FX Dslash") // undefine everything #include diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBody.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBody.h index 9f38bfc8..4452aabf 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBody.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBody.h @@ -34,6 +34,8 @@ #define DIR7_RECON 
TP_RECON_ACCUM #endif +#pragma message("this should not happen") + //////////////////////////////////////////////////////////////////////////////// // Comms then compute kernel //////////////////////////////////////////////////////////////////////////////// diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h index 4d9e8fd9..76c556d7 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_double.h +++ b/Grid/simd/Fujitsu_A64FX_asm_double.h @@ -38,8 +38,6 @@ Author: Nils Meyer #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXd -#define COMPLEX_SIGNS(A) -#define LOAD64(A,B) #define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B) #define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir) #define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h index e1532acb..d809f83b 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ b/Grid/simd/Fujitsu_A64FX_asm_single.h @@ -38,8 +38,6 @@ Author: Nils Meyer #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXf -#define COMPLEX_SIGNS(A) -#define LOAD64(A,B) #define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B) #define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir) #define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h index 4a792047..232610f2 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_double.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -38,8 +38,6 @@ Author: Nils Meyer #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXd -#define COMPLEX_SIGNS(A) -#define LOAD64(A,B) #define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B) #define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir) #define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd @@ -111,7 +109,7 @@ Author: Nils Meyer pg1 = svptrue_b64(); \ svuint64_t table0; \ svfloat64_t zero0; \ - zero0 = __svzero(zero0); + zero0 = svdup_f64(0.); #define Chimu_00 Chi_00 #define Chimu_01 Chi_01 @@ -559,18 +557,18 @@ Author: Nils Meyer // ZERO_PSI #define ZERO_PSI_A64FXd \ - result_00 = __svzero(result_00); \ - result_01 = __svzero(result_01); \ - result_02 = __svzero(result_02); \ - result_10 = __svzero(result_10); \ - result_11 = __svzero(result_11); \ - result_12 = __svzero(result_12); \ - result_20 = __svzero(result_20); \ - result_21 = __svzero(result_21); \ - result_22 = __svzero(result_22); \ - result_30 = __svzero(result_30); \ - result_31 = __svzero(result_31); \ - result_32 = __svzero(result_32); + result_00 = svdup_f64(0.); \ + result_01 = svdup_f64(0.); \ + result_02 = svdup_f64(0.); \ + result_10 = svdup_f64(0.); \ + result_11 = svdup_f64(0.); \ + result_12 = svdup_f64(0.); \ + result_20 = svdup_f64(0.); \ + result_21 = svdup_f64(0.); \ + result_22 = svdup_f64(0.); \ + result_30 = svdup_f64(0.); \ + result_31 = svdup_f64(0.); \ + result_32 = svdup_f64(0.); // PREFETCH_RESULT_L2_STORE (prefetch store to L2) #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h index 0ba5df17..180e5f4f 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_single.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -38,8 +38,6 @@ Author: Nils Meyer #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXf -#define COMPLEX_SIGNS(A) -#define LOAD64(A,B) #define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B) #define 
MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir) #define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf @@ -111,7 +109,7 @@ Author: Nils Meyer pg1 = svptrue_b32(); \ svuint32_t table0; \ svfloat32_t zero0; \ - zero0 = __svzero(zero0); + zero0 = svdup_f32(0.); #define Chimu_00 Chi_00 #define Chimu_01 Chi_01 @@ -559,18 +557,18 @@ Author: Nils Meyer // ZERO_PSI #define ZERO_PSI_A64FXf \ - result_00 = __svzero(result_00); \ - result_01 = __svzero(result_01); \ - result_02 = __svzero(result_02); \ - result_10 = __svzero(result_10); \ - result_11 = __svzero(result_11); \ - result_12 = __svzero(result_12); \ - result_20 = __svzero(result_20); \ - result_21 = __svzero(result_21); \ - result_22 = __svzero(result_22); \ - result_30 = __svzero(result_30); \ - result_31 = __svzero(result_31); \ - result_32 = __svzero(result_32); + result_00 = svdup_f32(0.); \ + result_01 = svdup_f32(0.); \ + result_02 = svdup_f32(0.); \ + result_10 = svdup_f32(0.); \ + result_11 = svdup_f32(0.); \ + result_12 = svdup_f32(0.); \ + result_20 = svdup_f32(0.); \ + result_21 = svdup_f32(0.); \ + result_22 = svdup_f32(0.); \ + result_30 = svdup_f32(0.); \ + result_31 = svdup_f32(0.); \ + result_32 = svdup_f32(0.); // PREFETCH_RESULT_L2_STORE (prefetch store to L2) #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \ diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h index e44d24c9..5dc6b6d2 100644 --- a/Grid/simd/Grid_a64fx-2.h +++ b/Grid/simd/Grid_a64fx-2.h @@ -385,7 +385,7 @@ struct MultComplex{ svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, a.v); typename acle::vt b_v = svld1(pg1, b.v); - typename acle::vt z_v = __svzero(z_v); + typename acle::vt z_v = acle::zero(); // using FCMLA typename acle::vt r_v = svcmla_x(pg1, z_v, a_v, b_v, 90); diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index 61f19a15..571e15cf 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -118,7 +118,10 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { #ifdef GEN #if defined(A64FX) // breakout A64FX SVE ACLE here //#pragma message("building for A64FX / SVE ACLE") - #define ARMCLANGHOTFIX + #if defined(clang) + #define ARMCLANGHOTFIX // armclang 20.0 compiles, but binaries give wrong results without hotfix + #endif + #include #include "Grid_a64fx-2.h" #else #include "Grid_generic.h" diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index 4d6b026f..37b33b0e 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -201,12 +201,20 @@ int main (int argc, char ** argv) double volume=Ls; for(int mu=0;mu Date: Mon, 20 Apr 2020 22:45:27 +0200 Subject: [PATCH 046/147] Exchange1 with generic version for now, should use svtbl2 in final version --- Grid/simd/Grid_a64fx-2.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h index 5dc6b6d2..0419df5b 100644 --- a/Grid/simd/Grid_a64fx-2.h +++ b/Grid/simd/Grid_a64fx-2.h @@ -592,6 +592,8 @@ struct Exchange{ } + +/* FIXME use svcreate etc. 
or switch to table lookup directly template <typename T> static inline void Exchange1(vec<T> &out1, vec<T> &out2, const vec<T> &in1, const vec<T> &in2){ @@ -611,6 +613,29 @@ struct Exchange{ svst4(pg4, (typename acle<T>::pt*)out1.v, out1_v4); svst4(pg4, (typename acle<T>::pt*)out2.v, out2_v4); } +*/ + + #define VECTOR_FOR(i, w, inc) \ + for (unsigned int i = 0; i < w; i += inc) + + template <typename T> + static inline void Exchange1(vec<T> &out1, vec<T> &out2, const vec<T> &in1, const vec<T> &in2){ + // FIXME + const int n = 1; + const int w = W<T>::r; + unsigned int mask = w >> (n + 1); + // std::cout << " Exchange "<<n<<" nsimd "<<w<<" mask 0x" <<std::hex<<mask<<std::dec<<std::endl; + VECTOR_FOR(i, w, 1) { + int j1 = i&(~mask); + if ( i&mask ) out1.v[i] = in2.v[j1]; + else out1.v[i] = in1.v[j1]; + int j2 = i|mask; + if ( i&mask ) out2.v[i] = in2.v[j2]; + else out2.v[i] = in1.v[j2]; + } + } template <typename T> static inline void Exchange2(vec<T> &out1, vec<T> &out2, const vec<T> &in1, const vec<T> &in2){ From 5893888f87648a83a9c402a4f8cab583b4d20e52 Mon Sep 17 00:00:00 2001 From: nils meyer Date: Wed, 22 Apr 2020 19:29:55 +0200 Subject: [PATCH 047/147] removed default no-strict-aliasing for gcc-10.0.1 exclusively --- configure.ac | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/configure.ac b/configure.ac index 3d4ee383..ec774b49 100644 --- a/configure.ac +++ b/configure.ac @@ -286,6 +286,10 @@ case ${CXX} in CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp" fi ;; + g++-10.0.1) + # removed -fno-strict-aliasing for gcc 10 dev build only + CXXFLAGS="$CXXFLAGS" + ;; *) CXXLD=${CXX} CXXFLAGS="$CXXFLAGS -fno-strict-aliasing" From 6f44e3c192155aa665214b2631f0187498800201 Mon Sep 17 00:00:00 2001 From: nils meyer Date: Thu, 23 Apr 2020 11:18:50 +0200 Subject: [PATCH 048/147] reverted changes in configure.ac ; included SVE configure readme --- SVE_README.txt | 10 ++++++++++ configure.ac | 8 +++----- 2 files changed, 13 insertions(+), 5 deletions(-) create mode 100644 SVE_README.txt diff --git a/SVE_README.txt b/SVE_README.txt new file mode 100644 index 00000000..b60d0427 --- /dev/null +++ b/SVE_README.txt @@ -0,0 +1,10 @@ +armclang 20.0 VLA + +../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static + + +gcc 10.0.1 VLA + +../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -fno-unroll-loops -mllvm -vectorizer-min-trip-count=2 -march=armv8-a+sve -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static + +should remove "-fno-strict-aliasing" for gcc 10 diff --git a/configure.ac b/configure.ac index ec774b49..2980a6d5 100644 --- a/configure.ac +++ b/configure.ac @@ -286,13 +286,11 @@ case ${CXX} in CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp" fi ;; - g++-10.0.1) - # removed -fno-strict-aliasing for gcc 10 dev build only - CXXFLAGS="$CXXFLAGS" - ;; *) CXXLD=${CXX} - CXXFLAGS="$CXXFLAGS -fno-strict-aliasing" + # + #CXXFLAGS="$CXXFLAGS -fno-strict-aliasing" + CXXFLAGS="$CXXFLAGS" ;; esac From 09f0963d1fed2f551e3a3de3cd7a33f73f617d29 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 23 Apr 2020 11:27:03 +0200 Subject: [PATCH 049/147] changes in configure.ac ; to be verified --- configure.ac | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/configure.ac b/configure.ac index 2980a6d5..e9e5c8ad 100644 --- a/configure.ac +++ b/configure.ac @@ -199,7 +199,7 @@ if test "${ac_MKL}x" != "nox"; then fi if test "${ac_IPP}x" != 
"nox"; then - AC_SEARCH_LIBS([ippsCRC32C_8u], [ippdc], + AC_SEARCH_LIBS([ippsCRC32C_8u], [ippdc], [LIBS="${LIBS} -lippdc -lippvm -lipps -lippcore"], [AC_MSG_ERROR("Intel IPP enabled but library not found")]) fi @@ -274,8 +274,8 @@ case ${ac_gen_scalar} in esac ##################### Compiler dependent choices -case ${CXX} in - nvcc) +case ${CXX} in + nvcc) # CXX="nvcc -keep -v -x cu " # CXXLD="nvcc -v -link" CXX="nvcc -x cu " @@ -286,11 +286,17 @@ case ${CXX} in CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp" fi ;; + g++-10.0.1) + CXXLD=${CXX} + # removed no-strict-aliasing flag here; only VLA tested + CXXFLAGS="$CXXFLAGS -msve-vector-bits=512" + armclang++) + CXXLD=${CXX} + # removed no-strict-aliasing flag here; no loop unrolling for VLA + CXXFLAGS="$CXXFLAGS -fno-unroll-loops -mllvm -vectorizer-min-trip-count=2" *) CXXLD=${CXX} - # - #CXXFLAGS="$CXXFLAGS -fno-strict-aliasing" - CXXFLAGS="$CXXFLAGS" + CXXFLAGS="$CXXFLAGS -fno-strict-aliasing" ;; esac @@ -632,4 +638,3 @@ AC_OUTPUT echo "" cat grid.configure.summary echo "" - From 6db68d6ecb73ae6ae60ab0757a1dab81f7cebec4 Mon Sep 17 00:00:00 2001 From: nils meyer Date: Fri, 24 Apr 2020 10:10:47 +0200 Subject: [PATCH 050/147] added SVE configure for armclang and gcc --- Grid/simd/Grid_vector_types.h | 5 ++++- SVE_README.txt | 17 ++++++++++------- configure.ac | 4 ++-- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index 571e15cf..e5c153df 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -118,8 +118,11 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { #ifdef GEN #if defined(A64FX) // breakout A64FX SVE ACLE here //#pragma message("building for A64FX / SVE ACLE") - #if defined(clang) + #if defined(HOTFIX) + #pragma message("applying armclang hotfix") #define ARMCLANGHOTFIX // armclang 20.0 compiles, but binaries give wrong results without hotfix + #else + #pragma message("not applying armclang hotfix") #endif #include #include "Grid_a64fx-2.h" diff --git a/SVE_README.txt b/SVE_README.txt index b60d0427..ab82d755 100644 --- a/SVE_README.txt +++ b/SVE_README.txt @@ -1,10 +1,13 @@ -armclang 20.0 VLA - -../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static - - gcc 10.0.1 VLA -../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -fno-unroll-loops -mllvm -vectorizer-min-trip-count=2 -march=armv8-a+sve -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static +../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -static -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static -should remove "-fno-strict-aliasing" for gcc 10 + +armclang 20.0 VLA + +../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN 
--enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -fno-unroll-loops -mllvm -vectorizer-min-trip-count=2 -march=armv8-a+sve -DHOTFIX -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static + +must use armclang 20.0 with HOTFIX applied, otherwise Benchmark_wilson gives wrong result + + +what about "-fno-strict-aliasing" ? diff --git a/configure.ac b/configure.ac index 2980a6d5..aed9e3b9 100644 --- a/configure.ac +++ b/configure.ac @@ -289,8 +289,8 @@ case ${CXX} in *) CXXLD=${CXX} # - #CXXFLAGS="$CXXFLAGS -fno-strict-aliasing" - CXXFLAGS="$CXXFLAGS" + CXXFLAGS="$CXXFLAGS -fno-strict-aliasing" + #CXXFLAGS="$CXXFLAGS" ;; esac From 3edb2dc2dac79cd892e5b79e5e8331343f5cdb0e Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 24 Apr 2020 13:04:34 +0200 Subject: [PATCH 051/147] removed -static from gcc CXXFLAGS --- SVE_README.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SVE_README.txt b/SVE_README.txt index ab82d755..2bbb708e 100644 --- a/SVE_README.txt +++ b/SVE_README.txt @@ -1,6 +1,6 @@ gcc 10.0.1 VLA -../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -static -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static +../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static armclang 20.0 VLA From d990e61be3e2f8398f52e6e40b9cb10e73d2d90c Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Sat, 25 Apr 2020 12:11:43 +0200 Subject: [PATCH 052/147] armclang 20.1 settings in SVE readme --- SVE_README.txt | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/SVE_README.txt b/SVE_README.txt index 2bbb708e..7462e3e6 100644 --- a/SVE_README.txt +++ b/SVE_README.txt @@ -10,4 +10,21 @@ armclang 20.0 VLA must use armclang 20.0 with HOTFIX applied, otherwise Benchmark_wilson gives wrong result -what about "-fno-strict-aliasing" ? +armclang 20.1 VLA + +../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static + +HOTFIX unknown + + + +Fujitsu FCC + +../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=FCC CC=fcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN" + + + + + +what about "-fno-strict-aliasing" in general? 
+ From 499edc0636731a68291a4481b404603be7980070 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Sat, 25 Apr 2020 13:41:24 +0200 Subject: [PATCH 053/147] updated SVE_README.txt; defined ARMCLANGCOMPAT macro --- SVE_README.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/SVE_README.txt b/SVE_README.txt index 7462e3e6..3286db84 100644 --- a/SVE_README.txt +++ b/SVE_README.txt @@ -5,16 +5,17 @@ gcc 10.0.1 VLA armclang 20.0 VLA -../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -fno-unroll-loops -mllvm -vectorizer-min-trip-count=2 -march=armv8-a+sve -DHOTFIX -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static +../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -fno-unroll-loops -mllvm -vectorizer-min-trip-count=2 -march=armv8-a+sve -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static -must use armclang 20.0 with HOTFIX applied, otherwise Benchmark_wilson gives wrong result +must use armclang 20.0 with ARMCLANGCOMPAT applied, otherwise Benchmark_wilson gives wrong result armclang 20.1 VLA ../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static -HOTFIX unknown +must use armclang 20.1 with ARMCLANGCOMPAT applied, otherwise Benchmark_wilson gives wrong result +Test_simd build error caused by -mcpu=a64fx ? 
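
The configure variants above handle the vector length differently: the gcc build pins it at compile time with -msve-vector-bits=512, while the armclang builds stay vector-length agnostic (the "VLA" in the headings) and rely on the hardware being 512-bit. A minimal standalone check (illustrative only, not part of the patch series; assumes an SVE toolchain) confirms at run time that the machine's vector length matches the 64-byte width selected by --enable-gen-simd-width=64:

    #include <arm_sve.h>
    #include <cstdio>

    int main() {
      // svcntb() returns the SVE vector length in bytes on the executing CPU
      unsigned long bytes = svcntb();
      std::printf("SVE vector length: %lu bytes\n", bytes);
      // A64FX implements 512-bit SVE, so Grid's GEN_SIMD_WIDTH of 64u must match
      return (bytes == 64u) ? 0 : 1;
    }
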
From 5abec5b8a90153594eb86355eda797f315c5eb5e Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Sat, 25 Apr 2020 13:48:26 +0200 Subject: [PATCH 054/147] SVE_readme update, update Grid_vector_types.h --- Grid/simd/Grid_vector_types.h | 11 +++++------ SVE_README.txt | 6 +++--- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index e5c153df..bd5b4704 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -118,11 +118,10 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { #ifdef GEN #if defined(A64FX) // breakout A64FX SVE ACLE here //#pragma message("building for A64FX / SVE ACLE") - #if defined(HOTFIX) - #pragma message("applying armclang hotfix") - #define ARMCLANGHOTFIX // armclang 20.0 compiles, but binaries give wrong results without hotfix - #else - #pragma message("not applying armclang hotfix") + #if defined(ARMCLANGCOMPAT) + #pragma message("applying armclang fix") + //#else + // #pragma message("not applying armclang fix") #endif #include #include "Grid_a64fx-2.h" @@ -241,7 +240,7 @@ public: return sizeof(Vector_type) / sizeof(Scalar_type); } -#ifdef ARMCLANGHOTFIX +#ifdef ARMCLANGCOMPAT accelerator_inline Grid_simd &operator=(const Grid_simd &&rhs) { svint8_t tmp = svld1(svptrue_b8(), (int8_t*)&(rhs.v)); svst1(svptrue_b8(), (int8_t*)this, tmp); diff --git a/SVE_README.txt b/SVE_README.txt index 3286db84..b8ee4e0f 100644 --- a/SVE_README.txt +++ b/SVE_README.txt @@ -7,14 +7,14 @@ armclang 20.0 VLA ../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -fno-unroll-loops -mllvm -vectorizer-min-trip-count=2 -march=armv8-a+sve -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static -must use armclang 20.0 with ARMCLANGCOMPAT applied, otherwise Benchmark_wilson gives wrong result +must use armclang 20.0 with ARMCLANGCOMPAT, otherwise Benchmark_wilson gives wrong result armclang 20.1 VLA -../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static +../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static -must use armclang 20.1 with ARMCLANGCOMPAT applied, otherwise Benchmark_wilson gives wrong result +must use armclang 20.1 with ARMCLANGCOMPAT, otherwise Benchmark_wilson gives wrong result Test_simd build error caused by -mcpu=a64fx ? 
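
The ARMCLANGCOMPAT assignment operator added above works around the wrong results seen with armclang 20.0/20.1 by copying Grid_simd through an explicit byte-wise SVE load/store instead of the compiler-generated member copy. A reduced sketch of the same pattern on a standalone 64-byte type (names here are illustrative, not Grid's; assumes a fixed 512-bit vector length as on A64FX):

    #include <arm_sve.h>
    #include <cstdint>

    struct alignas(64) payload {
      double v[8];  // 64-byte member, like Grid's vec<double>

      payload &operator=(const payload &rhs) {
        // all-true byte predicate: with 512-bit SVE this moves exactly 64 bytes
        svbool_t pg = svptrue_b8();
        svint8_t tmp = svld1(pg, (const int8_t *)rhs.v);  // load the raw bytes
        svst1(pg, (int8_t *)v, tmp);                      // store them into *this
        return *this;
      }
    };

Grid's version does the same through (int8_t*)&(rhs.v) and (int8_t*)this, covering the whole Grid_simd object.
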
From 7ef03c53683594c7f7debd14a07e85b103934972 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Wed, 6 May 2020 16:30:37 +0200 Subject: [PATCH 055/147] updated SVE readme --- SVE_README.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/SVE_README.txt b/SVE_README.txt index b8ee4e0f..22d01413 100644 --- a/SVE_README.txt +++ b/SVE_README.txt @@ -25,6 +25,11 @@ Fujitsu FCC +Fujitsu FCC w/ MPI + +../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=mpiFCC CC=mpifcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN" + + what about "-fno-strict-aliasing" in general? From a306a49788ffb253033d4f7421c9198283950a41 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 7 May 2020 19:07:49 +0200 Subject: [PATCH 056/147] first mods for fixed size; still incomplete --- Grid/simd/Grid_a64fx-fixedsize.h | 989 +++++++++++++++++++++++++++++++ 1 file changed, 989 insertions(+) create mode 100644 Grid/simd/Grid_a64fx-fixedsize.h diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h new file mode 100644 index 00000000..0566cf41 --- /dev/null +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -0,0 +1,989 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: Grid_a64fx-fixedsize.h + + Copyright (C) 2020 + + Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ + +///////////////////////////////////////////////////// +// Using SVE ACLE +///////////////////////////////////////////////////// + +#ifndef GEN_SIMD_WIDTH +#define GEN_SIMD_WIDTH 64u +#endif + +static_assert(GEN_SIMD_WIDTH % 64u == 0, "A64FX SIMD vector size is 64 bytes"); + +#ifdef __ARM_FEATURE_SVE + #ifdef __clang__ + //#pragma message("Using clang compiler") + #include + #endif +#else + #pragma error "Missing SVE feature" +#endif /* __ARM_FEATURE_SVE */ + +NAMESPACE_BEGIN(Grid); +NAMESPACE_BEGIN(Optimization); + + // type traits giving the number of elements for each vector type + template struct W; + template <> struct W { + constexpr static unsigned int c = GEN_SIMD_WIDTH/16u; + constexpr static unsigned int r = GEN_SIMD_WIDTH/8u; + }; + template <> struct W { + constexpr static unsigned int c = GEN_SIMD_WIDTH/8u; + constexpr static unsigned int r = GEN_SIMD_WIDTH/4u; + }; + template <> struct W { + constexpr static unsigned int r = GEN_SIMD_WIDTH/4u; + }; + template <> struct W { + constexpr static unsigned int c = GEN_SIMD_WIDTH/4u; + constexpr static unsigned int r = GEN_SIMD_WIDTH/2u; + }; + template <> struct W { + constexpr static unsigned int c = GEN_SIMD_WIDTH/16u; + constexpr static unsigned int r = GEN_SIMD_WIDTH/8u; + }; + + // SIMD vector types + template + struct vec { + alignas(GEN_SIMD_WIDTH) T v[W::r]; + }; + + typedef vec vecf; + typedef vec vecd; + typedef vec vech; // half precision comms + typedef vec veci; + +NAMESPACE_END(Optimization) +NAMESPACE_END(Grid) + +// low-level API +NAMESPACE_BEGIN(Grid); +NAMESPACE_BEGIN(Optimization); + +template +struct acle{}; + +template <> +struct acle{ + typedef svfloat64_t vt; + typedef svfloat64x2_t vt2; + typedef svfloat64x4_t vt4; + typedef float64_t pt; + typedef uint64_t uint; + typedef svuint64_t svuint; + + static inline svbool_t pg1(){return svptrue_b64();} + static inline svbool_t pg2(){return svptrue_pat_b64(SV_VL4);} + static inline svbool_t pg4(){return svptrue_pat_b64(SV_VL2);} + static inline vec tbl_swap(){ + const vec t = {1, 0, 3, 2, 5, 4, 7, 6}; + return t; + } + static inline vec tbl0(){ + const vec t = {4, 5, 6, 7, 0, 1, 2, 3}; + return t; + } + static inline vec tbl1(){ + const vec t = {2, 3, 0, 1, 6, 7, 4, 5}; + return t; + } + static inline svbool_t pg_even(){return svzip1_b64(svptrue_b64(), svpfalse_b());} + static inline svbool_t pg_odd() {return svzip1_b64(svpfalse_b(), svptrue_b64());} + static inline svfloat64_t zero(){return svdup_f64(0.);} +}; + +template <> +struct acle{ + typedef svfloat32_t vt; + typedef svfloat32x2_t vt2; + typedef float32_t pt; + typedef uint32_t uint; + typedef svuint32_t svuint; + + static inline svbool_t pg1(){return svptrue_b32();} + static inline svbool_t pg2(){return svptrue_pat_b32(SV_VL8);} + // exchange neighboring elements + static inline vec tbl_swap(){ + const vec t = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + return t; + } + static inline vec tbl0(){ + const vec t = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}; + return t; + } + static inline vec tbl1(){ + const vec t = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}; + return t; + } + static inline vec tbl2(){ + const vec t = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; + return t; + } + static inline svbool_t pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());} + static 
inline svbool_t pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());} + static inline svfloat32_t zero(){return svdup_f32(0.);} +}; + +template <> +struct acle{ + typedef svfloat16_t vt; + typedef float16_t pt; + typedef uint16_t uint; + typedef svuint16_t svuint; + + static inline svbool_t pg1(){return svptrue_b16();} + static inline svbool_t pg2(){return svptrue_pat_b16(SV_VL16);} + static inline svbool_t pg_even(){return svzip1_b16(svptrue_b16(), svpfalse_b());} + static inline svbool_t pg_odd() {return svzip1_b16(svpfalse_b(), svptrue_b16());} + static inline svfloat16_t zero(){return svdup_f16(0.);} +}; + +template <> +struct acle{ + typedef svuint32_t vt; + typedef svuint32x2_t vt2; + typedef Integer pt; + typedef uint32_t uint; + typedef svuint32_t svuint; + + //static inline svbool_t pg1(){return svptrue_b16();} + static inline svbool_t pg1(){return svptrue_b32();} + static inline svbool_t pg2(){return svptrue_pat_b32(SV_VL8);} + static inline svbool_t pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());} + static inline svbool_t pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());} +}; + +// --------------------------------------------------- + +struct Vsplat{ + // Complex float + inline vecf operator()(float a, float b){ + + typename acle::vt a_v = svdup_f32(a); + typename acle::vt b_v = svdup_f32(b); + typename acle::vt r_v = svzip1(a_v, b_v); + return r_v; + } + + // Real float + inline vecf operator()(float a){ + + typename acle::vt r_v = svdup_f32(a); + return r_v; + } + + // Complex double + inline vecd operator()(double a, double b){ + + typename acle::vt a_v = svdup_f64(a); + typename acle::vt b_v = svdup_f64(b); + typename acle::vt r_v = svzip1(a_v, b_v); + return r_v; + } + + // Real double + inline vecd operator()(double a){ + + vecd out; + typename acle::vt r_v = svdup_f64(a); + return r_v; + } + + // Integer + inline veci operator()(Integer a){ + + // Add check whether Integer is really a uint32_t??? 
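+ // acle<Integer>::vt is svuint32_t, so a 32-bit unsigned Integer is assumed; a static_assert(sizeof(Integer) == 4, "Integer must be 32-bit") would make the check explicit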
+ typename acle::vt r_v = svdup_u32(a); + return r_v; + } +}; + +struct Vstore{ + // Real float + inline void operator()(vecf a, float *D){ + + svbool_t pg1 = acle::pg1(); + //typename acle::vt a_v = svld1(pg1, (typename acle::pt*)&a.v); + svst1(pg1, D, a); + } + // Real double + inline void operator()(vecd a, double *D){ + + svbool_t pg1 = acle::pg1(); + //typename acle::vt a_v = svld1(pg1, (typename acle::pt*)&a.v); + svst1(pg1, D, a); + } + // Real float + inline void operator()(veci a, Integer *D){ + + svbool_t pg1 = acle::pg1(); + //typename acle::vt a_v = svld1(pg1, (typename acle::pt*)&a.v); + svst1(pg1, D, a); + } + +}; + +struct Vstream{ + // Real float + inline void operator()(float * a, vecf b){ + + svbool_t pg1 = acle::pg1(); + //typename acle::vt b_v = svld1(pg1, b.v); + svstnt1(pg1, a, b); + //svst1(pg1, a, b_v); + } + // Real double + inline void operator()(double * a, vecd b){ + + svbool_t pg1 = acle::pg1(); + //typename acle::vt b_v = svld1(pg1, b.v); + svstnt1(pg1, a, b); + //svst1(pg1, a, b_v); + } +}; + + struct Vset{ + // Complex float + inline vecf operator()(Grid::ComplexF *a){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, (float*)a); + + return a_v; + } + // Complex double + inline vecd operator()(Grid::ComplexD *a){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, (double*)a); + + return a_v; + } + // Real float + inline vecf operator()(float *a){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a); + + return a_v; + } + // Real double + inline vecd operator()(double *a){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a); + + return a_v; + } + // Integer + inline veci operator()(Integer *a){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a); + + return a_v; + } + }; + +///////////////////////////////////////////////////// +// Arithmetic operations +///////////////////////////////////////////////////// + +struct Sum{ + // Complex/real float + inline vecf operator()(vecf a, vecf b){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt r_v = svadd_x(pg1, a, b); + + return r_v; + } + // Complex/real double + inline vecd operator()(vecd a, vecd b){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt r_v = svadd_x(pg1, a, b); + + return r_v; + } + // Integer + inline veci operator()(veci a, veci b){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt r_v = svadd_x(pg1, a, b); + + return r_v; + } +}; + +struct Sub{ + // Complex/real float + inline vecf operator()(vecf a, vecf b){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt r_v = svsub_x(pg1, a, b); + + return r_v; + } + // Complex/real double + inline vecd operator()(vecd a, vecd b){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt r_v = svsub_x(pg1, a, b); + + return r_v; + } + // Integer + inline veci operator()(veci a, veci b){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt r_v = svsub_x(pg1, a, b); + + return r_v; + } + +}; + +struct Mult{ + // Real float + inline vecf operator()(vecf a, vecf b){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt r_v = svmul_x(pg1, a, b); + + return r_v; + } + // Real double + inline vecd operator()(vecd a, vecd b){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt r_v = svmul_x(pg1, a, b); + + return r_v; + } + // Integer + inline veci operator()(veci a, veci b){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt r_v = svmul_x(pg1, a, b); + + return r_v; + } +}; + +struct MultRealPart{ + // Complex float + inline vecf operator()(vecf a, vecf b){ + + svbool_t 
pg1 = acle::pg1(); + + // using FCMLA + typename acle::vt z_v = acle::zero(); + typename acle::vt r_v = svcmla_x(pg1, z_v, a, b, 0); + + return r_v; + } + // Complex double + inline vecd operator()(vecd a, vecd b){ + + svbool_t pg1 = acle::pg1(); + + // using FCMLA + typename acle::vt z_v = acle::zero(); + typename acle::vt r_v = svcmla_x(pg1, z_v, a, b, 0); + + return r_v; + } +}; + +struct MaddRealPart{ + // Complex float + inline vecf operator()(vecf a, vecf b, vecf c){ + + svbool_t pg1 = acle::pg1(); + + // using FCMLA + typename acle::vt r_v = svcmla_x(pg1, c, a, b, 0); + + return r_v; + } + // Complex double + inline vecd operator()(vecd a, vecd b, vecd c){ + + svbool_t pg1 = acle::pg1(); + + // using FCMLA + typename acle::vt r_v = svcmla_x(pg1, c, a, b, 0); + + return r_v; + } +}; + +struct MultComplex{ + // Complex a*b + // Complex float + inline vecf operator()(vecf a, vecf b){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt z = acle::zero(); + + // using FCMLA + typename acle::vt r_v = svcmla_x(pg1, z, a, b, 90); + r_v = svcmla_x(pg1, r_v, a, b, 0); + + return r_v; + } + // Complex double + inline vecd operator()(vecd a, vecd b){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt z = acle::zero(); + + // using FCMLA + typename acle::vt r_v = svcmla_x(pg1, z, a, b, 90); + r_v = svcmla_x(pg1, r_v, a, b, 0); + + return r_v; + } +}; + +struct Div{ + // Real float + inline vecf operator()(vecf a, vecf b){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt r_v = svdiv_x(pg1, a, b); + + return r_v; + } + // Real double + inline vecf operator()(vecf a, vecf b){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt r_v = svdiv_x(pg1, a, b); + + return r_v; + } +}; + +struct Conj{ + // Complex float + inline vecf operator()(vecf a){ + + svbool_t pg_odd = acle::pg_odd(); + typename acle::vt r_v = svneg_x(pg_odd, a); + + return r_v; + } + // Complex double + inline vecd operator()(vecd a){ + + svbool_t pg_odd = acle::pg_odd(); + typename acle::vt r_v = svneg_x(pg_odd, a); + + return r_v; + } +}; + +struct TimesMinusI{ + // Complex float + inline vecf operator()(vecf a, vecf b){ + + const vec::uint> tbl_swap = acle::tbl_swap(); + svbool_t pg1 = acle::pg1(); + svbool_t pg_odd = acle::pg_odd(); + + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt a_v = svld1(pg1, a.v); + a_v = svtbl(a_v, tbl_swap_v); + typename acle::vt r_v = svneg_x(pg_odd, a_v); + + return r_v; + } + // Complex double + inline vecd operator()(vecd a, vecd b){ + + const vec::uint> tbl_swap = acle::tbl_swap(); + svbool_t pg1 = acle::pg1(); + svbool_t pg_odd = acle::pg_odd(); + + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt a_v = svld1(pg1, a.v); + a_v = svtbl(a_v, tbl_swap_v); + typename acle::vt r_v = svneg_x(pg_odd, a_v); + + return r_v; + } +}; + +struct TimesI{ + // Complex float + inline vecf operator()(vecf a, vecf b){ + + const vec::uint> tbl_swap = acle::tbl_swap(); + svbool_t pg1 = acle::pg1(); + svbool_t pg_even = acle::pg_even(); + + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt a_v = svld1(pg1, a.v); + a_v = svtbl(a_v, tbl_swap_v); + typename acle::vt r_v = svneg_x(pg_even, a_v); + + return r_v; + } + // Complex double + inline vecd operator()(vecd a, vecd b){ + + const vec::uint> tbl_swap = acle::tbl_swap(); + svbool_t pg1 = acle::pg1(); + svbool_t pg_even = acle::pg_even(); + + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt a_v = svld1(pg1, a.v); + a_v = svtbl(a_v, tbl_swap_v); + 
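+ // after the neighbor swap, negating the even (real) lanes yields i*a: (re,im) -> (-im,re)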
typename acle::vt r_v = svneg_x(pg_even, a_v); + + return r_v; + } +}; + +struct PrecisionChange { + static inline vech StoH (vecf sa, vecf sb) { + + svbool_t pg1s = acle::pg1(); + typename acle::vt ha_v = svcvt_f16_x(pg1s, sa); + typename acle::vt hb_v = svcvt_f16_x(pg1s, sb); + typename acle::vt r_v = svuzp1(ha_v, hb_v); + + return r_v; + } + static inline void HtoS(vech h,vecf &sa,vecf &sb) { + + svbool_t pg1s = acle::pg1(); + typename acle::vt ha_v = svzip1(h, h); + typename acle::vt hb_v = svzip2(h, h); + sa = svcvt_f32_x(pg1s, ha); + sb = svcvt_f32_x(pg1s, hb); + } + static inline vecf DtoS (vecd a,vecd b) { + + svbool_t pg1d = acle::pg1(); + typename acle::vt sa_v = svcvt_f32_x(pg1d, a); + typename acle::vt sb_v = svcvt_f32_x(pg1d, b); + typename acle::vt r_v = svuzp1(sa_v, sb_v); + + return r_v; + } + static inline void StoD (vecf s,vecd &a,vecd &b) { + + svbool_t pg1d = acle::pg1(); + typename acle::vt sa_v = svzip1(s, s); + typename acle::vt sb_v = svzip2(s, s); + a = svcvt_f64_x(pg1d, sa_v); + b = svcvt_f64_x(pg1d, sb_v); + } + static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) { +/* + vech ret; + svbool_t pg1d = acle::pg1(); + svbool_t pg1h = acle::pg1(); + typename acle::vt a_v = svld1(pg1d, a.v); + typename acle::vt b_v = svld1(pg1d, b.v); + typename acle::vt c_v = svld1(pg1d, c.v); + typename acle::vt d_v = svld1(pg1d, d.v); + typename acle::vt ha_v = svcvt_f16_x(pg1d, a_v); + typename acle::vt hb_v = svcvt_f16_x(pg1d, b_v); + typename acle::vt hc_v = svcvt_f16_x(pg1d, c_v); + typename acle::vt hd_v = svcvt_f16_x(pg1d, d_v); + typename acle::vt hab_v = svuzp1(ha_v, hb_v); + typename acle::vt hcd_v = svuzp1(hc_v, hd_v); + typename acle::vt r_v = svuzp1(hab_v, hcd_v); + svst1(pg1h, (typename acle::pt*)&ret.v, r_v); + + return ret; +*/ + vecf sa,sb; + sa = DtoS(a,b); + sb = DtoS(c,d); + return StoH(sa,sb); + } + static inline void HtoD(vech h,vecd &a,vecd &b,vecd &c,vecd &d) { +/* + svbool_t pg1h = acle::pg1(); + svbool_t pg1d = acle::pg1(); + typename acle::vt h_v = svld1(pg1h, (typename acle::pt*)&h.v); + typename acle::vt sa_v = svzip1(h_v, h_v); + typename acle::vt sb_v = svzip2(h_v, h_v); + typename acle::vt da_v = svzip1(sa_v, sa_v); + typename acle::vt db_v = svzip2(sa_v, sa_v); + typename acle::vt dc_v = svzip1(sb_v, sb_v); + typename acle::vt dd_v = svzip2(sb_v, sb_v); + typename acle::vt a_v = svcvt_f64_x(pg1d, da_v); + typename acle::vt b_v = svcvt_f64_x(pg1d, db_v); + typename acle::vt c_v = svcvt_f64_x(pg1d, dc_v); + typename acle::vt d_v = svcvt_f64_x(pg1d, dd_v); + svst1(pg1d, a.v, a_v); + svst1(pg1d, b.v, b_v); + svst1(pg1d, c.v, c_v); + svst1(pg1d, d.v, d_v); +*/ + vecf sa,sb; + HtoS(h,sa,sb); + StoD(sa,a,b); + StoD(sb,c,d); + } +}; + +// %%%% TODO ----------------- + +struct Exchange{ + + // Exchange0 is valid for arbitrary SVE vector length + template + static inline void Exchange0(vec &out1, vec &out2, const vec &in1, const vec &in2){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt a1_v = svld1(pg1, in1.v); + typename acle::vt a2_v = svld1(pg1, in2.v); + typename acle::vt r1_v = svext(a1_v, a1_v, (uint64_t)W::c); + r1_v = svext(r1_v, a2_v, (uint64_t)W::c); + typename acle::vt r2_v = svext(a2_v, a2_v, (uint64_t)W::c); + r2_v = svext(a1_v, r2_v, (uint64_t)W::c); + svst1(pg1, out1.v, r1_v); + svst1(pg1, out2.v, r2_v); + } + + + +/* FIXME use svcreate etc. 
or switch to table lookup directly + template + static inline void Exchange1(vec &out1, vec &out2, const vec &in1, const vec &in2){ + + svbool_t pg4 = acle::pg4(); + typename acle::vt4 in1_v4 = svld4(pg4, (typename acle::pt*)in1.v); + typename acle::vt4 in2_v4 = svld4(pg4, (typename acle::pt*)in2.v); + typename acle::vt4 out1_v4; + typename acle::vt4 out2_v4; + out1_v4.v0 = in1_v4.v0; + out1_v4.v1 = in1_v4.v1; + out1_v4.v2 = in2_v4.v0; + out1_v4.v3 = in2_v4.v1; + out2_v4.v0 = in1_v4.v2; + out2_v4.v1 = in1_v4.v3; + out2_v4.v2 = in2_v4.v2; + out2_v4.v3 = in2_v4.v3; + svst4(pg4, (typename acle::pt*)out1.v, out1_v4); + svst4(pg4, (typename acle::pt*)out2.v, out2_v4); + } +*/ + + #define VECTOR_FOR(i, w, inc) \ + for (unsigned int i = 0; i < w; i += inc) + + template + static inline void Exchange1(vec &out1, vec &out2, const vec &in1, const vec &in2){ + // FIXME + const int n = 1; + const int w = W::r; + unsigned int mask = w >> (n + 1); + // std::cout << " Exchange "< + static inline void Exchange2(vec &out1, vec &out2, const vec &in1, const vec &in2){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt a1_v = svld1(pg1, (typename acle::pt*)in1.v); + typename acle::vt a2_v = svld1(pg1, (typename acle::pt*)in2.v); + typename acle::vt r1_v = svtrn1(a1_v, a2_v); + typename acle::vt r2_v = svtrn2(a1_v, a2_v); + svst1(pg1, (typename acle::pt*)out1.v, r1_v); + svst1(pg1, (typename acle::pt*)out2.v, r2_v); + } + + static inline void Exchange3(vecf &out1, vecf &out2, const vecf &in1, const vecf &in2){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt a1_v = svld1(pg1, in1.v); + typename acle::vt a2_v = svld1(pg1, in2.v); + typename acle::vt r1_v = svtrn1(a1_v, a2_v); + typename acle::vt r2_v = svtrn2(a1_v, a2_v); + svst1(pg1, out1.v, r1_v); + svst1(pg1, out2.v, r2_v); + } + + static inline void Exchange3(vecd &out1, vecd &out2, const vecd &in1, const vecd &in2){ + assert(0); + return; + } +}; + +struct Permute{ + // float + static inline vecf Permute0(vecf in) { + + typename acle::vt r_v = svext(a_v, a_v, (uint64_t)(W::r / 2u)); + + return r_v; + } + static inline vecf Permute1(vecf in) { + + const vec::uint> tbl_swap = acle::tbl1(); + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt r_v = svtbl(a_v, tbl_swap_v); + + return r_v; + } + static inline vecf Permute2(vecf in) { + + const vec::uint> tbl_swap = acle::tbl2(); + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt r_v = svtbl(a_v, tbl_swap_v); + + return r_v; + } + static inline vecf Permute3(vecf in) { + + const vec::uint> tbl_swap = acle::tbl_swap(); + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt r_v = svtbl(a_v, tbl_swap_v); + svst1(pg1, out.v, r_v); + + return r_v; + } + + // double + static inline vecd Permute0(vecd in) { + + typename acle::vt r_v = svext(a_v, a_v, (uint64_t)(W::r / 2u)); + + return r_v; + } + static inline vecd Permute1(vecd in) { + + const vec::uint> tbl_swap = acle::tbl1(); + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt r_v = svtbl(a_v, tbl_swap_v); + + return r_v; + } + static inline vecd Permute2(vecd in) { + + const vec::uint> tbl_swap = acle::tbl_swap(); + svbool_t pg1 = acle::pg1(); + typename 
acle::vt a_v = svld1(pg1, in.v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt r_v = svtbl(a_v, tbl_swap_v); + + return r_v; + } + static inline vecd Permute3(vecd in) { + return in; + } +}; + +struct Rotate{ + + static inline vecf rotate(vecf in, int n){ + switch(n){ + case 0: return tRotate<0>(in); break; + case 1: return tRotate<1>(in); break; + case 2: return tRotate<2>(in); break; + case 3: return tRotate<3>(in); break; + case 4: return tRotate<4>(in); break; + case 5: return tRotate<5>(in); break; + case 6: return tRotate<6>(in); break; + case 7: return tRotate<7>(in); break; + + case 8: return tRotate<8>(in); break; + case 9: return tRotate<9>(in); break; + case 10: return tRotate<10>(in); break; + case 11: return tRotate<11>(in); break; + case 12: return tRotate<12>(in); break; + case 13: return tRotate<13>(in); break; + case 14: return tRotate<14>(in); break; + case 15: return tRotate<15>(in); break; + default: assert(0); + } + } + static inline vecd rotate(vecd in, int n){ + switch(n){ + case 0: return tRotate<0>(in); break; + case 1: return tRotate<1>(in); break; + case 2: return tRotate<2>(in); break; + case 3: return tRotate<3>(in); break; + case 4: return tRotate<4>(in); break; + case 5: return tRotate<5>(in); break; + case 6: return tRotate<6>(in); break; + case 7: return tRotate<7>(in); break; + default: assert(0); + } + } + + template static inline vecf tRotate(vecf in){ + + typename acle::vt r_v = svext(in, in, (uint64_t)(n%W::r)); + return r_v; + } + template static inline vecd tRotate(vecd in){ + + typename acle::vt r_v = svext(in, in, (uint64_t)(n%W::r)); + return r_v; + } +}; + +// tree-based reduction +#define svred(pg, v)\ +svaddv(pg, v); + +// left-to-right reduction +// #define svred(pg, v)\ +// svadda(pg, 0, v) + +template +struct Reduce{ + //Need templated class to overload output type + //General form must generate error if compiled + inline Out_type operator()(In_type in){ + printf("Error, using wrong Reduce function\n"); + exit(1); + return 0; + } +}; + +//Complex float Reduce +template <> +inline Grid::ComplexF Reduce::operator()(vecf in){ + + svbool_t pg_even = acle::pg_even(); + svbool_t pg_odd = acle::pg_odd(); + float a = svred(pg_even, in); + float b = svred(pg_odd, in); + + return Grid::ComplexF(a, b); +} + +//Real float Reduce +template <> +inline Grid::RealF Reduce::operator()(vecf in){ + + svbool_t pg1 = acle::pg1(); + float a = svred(pg1, in); + + return a; +} + +//Complex double Reduce +template <> +inline Grid::ComplexD Reduce::operator()(vecd in){ + + svbool_t pg_even = acle::pg_even(); + svbool_t pg_odd = acle::pg_odd(); + double a = svred(pg_even, in); + double b = svred(pg_odd, in); + + return Grid::ComplexD(a, b); +} + +//Real double Reduce +template <> +inline Grid::RealD Reduce::operator()(vecd in){ + + svbool_t pg1 = acle::pg1(); + double a = svred(pg1, in); + + return a; +} + +//Integer Reduce +template <> +inline Integer Reduce::operator()(veci in){ + + svbool_t pg1 = acle::pg1(); + Integer a = svred(pg1, in); + + return a; +} + +#undef svred + +NAMESPACE_END(Optimization) + +////////////////////////////////////////////////////////////////////////////////////// +// Here assign types + +typedef Optimization::vech SIMD_Htype; // Reduced precision type +typedef Optimization::vecf SIMD_Ftype; // Single precision type +typedef Optimization::vecd SIMD_Dtype; // Double precision type +typedef Optimization::veci SIMD_Itype; // Integer type + +// prefetch utilities +inline void v_prefetch0(int size, const char 
*ptr){}; +inline void prefetch_HINT_T0(const char *ptr){}; + +// Function name aliases +typedef Optimization::Vsplat VsplatSIMD; +typedef Optimization::Vstore VstoreSIMD; +typedef Optimization::Vset VsetSIMD; +typedef Optimization::Vstream VstreamSIMD; +template using ReduceSIMD = Optimization::Reduce; + +// Arithmetic operations +typedef Optimization::Sum SumSIMD; +typedef Optimization::Sub SubSIMD; +typedef Optimization::Div DivSIMD; +typedef Optimization::Mult MultSIMD; +typedef Optimization::MultComplex MultComplexSIMD; +typedef Optimization::MultRealPart MultRealPartSIMD; +typedef Optimization::MaddRealPart MaddRealPartSIMD; +typedef Optimization::Conj ConjSIMD; +typedef Optimization::TimesMinusI TimesMinusISIMD; +typedef Optimization::TimesI TimesISIMD; + +NAMESPACE_END(Grid); From acff9d6ed239e031218d6b048d4fef820947cd45 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 7 May 2020 21:24:07 +0200 Subject: [PATCH 057/147] transition to fixed size data types almost done; still incomplete --- Grid/simd/Grid_a64fx-fixedsize.h | 511 +++++++++++-------------------- 1 file changed, 172 insertions(+), 339 deletions(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 0566cf41..a2c75d92 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -37,14 +37,22 @@ static_assert(GEN_SIMD_WIDTH % 64u == 0, "A64FX SIMD vector size is 64 bytes"); #ifdef __ARM_FEATURE_SVE - #ifdef __clang__ - //#pragma message("Using clang compiler") - #include - #endif + #include #else #pragma error "Missing SVE feature" #endif /* __ARM_FEATURE_SVE */ +// gcc 10 features +#if __ARM_FEATURE_SVE_BITS==512 +typedef svbool_t pred __attribute__((arm_sve_vector_bits(512))); +typedef svfloat16_t vech __attribute__((arm_sve_vector_bits(512))); +typedef svfloat32_t vecf __attribute__((arm_sve_vector_bits(512))); +typedef svfloat64_t vecd __attribute__((arm_sve_vector_bits(512))); +typedef svuint32_t veci __attribute__((arm_sve_vector_bits(512))); +#else +#pragma error("Oops. 
Wrong or undefined SVE vector size?") +#endif /* __ARM_FEATURE_SVE_BITS */ + NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Optimization); @@ -100,9 +108,7 @@ struct acle{ typedef uint64_t uint; typedef svuint64_t svuint; - static inline svbool_t pg1(){return svptrue_b64();} - static inline svbool_t pg2(){return svptrue_pat_b64(SV_VL4);} - static inline svbool_t pg4(){return svptrue_pat_b64(SV_VL2);} + static inline pred pg1(){return svptrue_b64();} static inline vec tbl_swap(){ const vec t = {1, 0, 3, 2, 5, 4, 7, 6}; return t; @@ -115,8 +121,8 @@ struct acle{ const vec t = {2, 3, 0, 1, 6, 7, 4, 5}; return t; } - static inline svbool_t pg_even(){return svzip1_b64(svptrue_b64(), svpfalse_b());} - static inline svbool_t pg_odd() {return svzip1_b64(svpfalse_b(), svptrue_b64());} + static inline pred pg_even(){return svzip1_b64(svptrue_b64(), svpfalse_b());} + static inline pred pg_odd() {return svzip1_b64(svpfalse_b(), svptrue_b64());} static inline svfloat64_t zero(){return svdup_f64(0.);} }; @@ -128,8 +134,7 @@ struct acle{ typedef uint32_t uint; typedef svuint32_t svuint; - static inline svbool_t pg1(){return svptrue_b32();} - static inline svbool_t pg2(){return svptrue_pat_b32(SV_VL8);} + static inline pred pg1(){return svptrue_b32();} // exchange neighboring elements static inline vec tbl_swap(){ const vec t = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; @@ -147,8 +152,8 @@ struct acle{ const vec t = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; return t; } - static inline svbool_t pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());} - static inline svbool_t pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());} + static inline pred pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());} + static inline pred pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());} static inline svfloat32_t zero(){return svdup_f32(0.);} }; @@ -159,10 +164,9 @@ struct acle{ typedef uint16_t uint; typedef svuint16_t svuint; - static inline svbool_t pg1(){return svptrue_b16();} - static inline svbool_t pg2(){return svptrue_pat_b16(SV_VL16);} - static inline svbool_t pg_even(){return svzip1_b16(svptrue_b16(), svpfalse_b());} - static inline svbool_t pg_odd() {return svzip1_b16(svpfalse_b(), svptrue_b16());} + static inline pred pg1(){return svptrue_b16();} + static inline pred pg_even(){return svzip1_b16(svptrue_b16(), svpfalse_b());} + static inline pred pg_odd() {return svzip1_b16(svpfalse_b(), svptrue_b16());} static inline svfloat16_t zero(){return svdup_f16(0.);} }; @@ -175,10 +179,10 @@ struct acle{ typedef svuint32_t svuint; //static inline svbool_t pg1(){return svptrue_b16();} - static inline svbool_t pg1(){return svptrue_b32();} - static inline svbool_t pg2(){return svptrue_pat_b32(SV_VL8);} - static inline svbool_t pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());} - static inline svbool_t pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());} + static inline pred pg1(){return svptrue_b32();} + static inline pred pg2(){return svptrue_pat_b32(SV_VL8);} + static inline pred pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());} + static inline pred pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());} }; // --------------------------------------------------- @@ -186,132 +190,91 @@ struct acle{ struct Vsplat{ // Complex float inline vecf operator()(float a, float b){ - - typename acle::vt a_v = svdup_f32(a); - typename acle::vt b_v = svdup_f32(b); - typename acle::vt r_v = svzip1(a_v, b_v); - return r_v; + vecf a_v = svdup_f32(a); + vecf b_v = 
svdup_f32(b); + return svzip1(a_v, b_v); } - // Real float inline vecf operator()(float a){ - - typename acle::vt r_v = svdup_f32(a); - return r_v; + return svdup_f32(a); } - - // Complex double + // Complex double inline vecd operator()(double a, double b){ - - typename acle::vt a_v = svdup_f64(a); - typename acle::vt b_v = svdup_f64(b); - typename acle::vt r_v = svzip1(a_v, b_v); - return r_v; + vecd a_v = svdup_f64(a); + vecd b_v = svdup_f64(b); + return svzip1(a_v, b_v); } - // Real double inline vecd operator()(double a){ - - vecd out; - typename acle::vt r_v = svdup_f64(a); - return r_v; + return svdup_f64(a); } - // Integer inline veci operator()(Integer a){ - // Add check whether Integer is really a uint32_t??? - typename acle::vt r_v = svdup_u32(a); - return r_v; + return svdup_u32(a); } }; struct Vstore{ // Real float inline void operator()(vecf a, float *D){ - - svbool_t pg1 = acle::pg1(); - //typename acle::vt a_v = svld1(pg1, (typename acle::pt*)&a.v); + pred pg1 = acle::pg1(); svst1(pg1, D, a); } // Real double inline void operator()(vecd a, double *D){ - - svbool_t pg1 = acle::pg1(); - //typename acle::vt a_v = svld1(pg1, (typename acle::pt*)&a.v); + pred pg1 = acle::pg1(); svst1(pg1, D, a); } // Real float inline void operator()(veci a, Integer *D){ - - svbool_t pg1 = acle::pg1(); - //typename acle::vt a_v = svld1(pg1, (typename acle::pt*)&a.v); + pred pg1 = acle::pg1(); svst1(pg1, D, a); } - }; struct Vstream{ // Real float inline void operator()(float * a, vecf b){ - - svbool_t pg1 = acle::pg1(); - //typename acle::vt b_v = svld1(pg1, b.v); + pred pg1 = acle::pg1(); svstnt1(pg1, a, b); - //svst1(pg1, a, b_v); + //svst1(pg1, a, b); } // Real double inline void operator()(double * a, vecd b){ - - svbool_t pg1 = acle::pg1(); - //typename acle::vt b_v = svld1(pg1, b.v); + pred pg1 = acle::pg1(); svstnt1(pg1, a, b); - //svst1(pg1, a, b_v); + //svst1(pg1, a, b); } }; - struct Vset{ - // Complex float - inline vecf operator()(Grid::ComplexF *a){ - - svbool_t pg1 = acle::pg1(); - typename acle::vt a_v = svld1(pg1, (float*)a); - - return a_v; - } - // Complex double - inline vecd operator()(Grid::ComplexD *a){ - - svbool_t pg1 = acle::pg1(); - typename acle::vt a_v = svld1(pg1, (double*)a); - - return a_v; - } - // Real float - inline vecf operator()(float *a){ - - svbool_t pg1 = acle::pg1(); - typename acle::vt a_v = svld1(pg1, a); - - return a_v; - } - // Real double - inline vecd operator()(double *a){ - - svbool_t pg1 = acle::pg1(); - typename acle::vt a_v = svld1(pg1, a); - - return a_v; - } - // Integer - inline veci operator()(Integer *a){ - - svbool_t pg1 = acle::pg1(); - typename acle::vt a_v = svld1(pg1, a); - - return a_v; - } - }; +struct Vset{ + // Complex float + inline vecf operator()(Grid::ComplexF *a){ + pred pg1 = acle::pg1(); + return svld1(pg1, (float*)a); + } + // Complex double + inline vecd operator()(Grid::ComplexD *a){ + pred pg1 = acle::pg1(); + return svld1(pg1, (double*)a); + } + // Real float + inline vecf operator()(float *a){ + pred pg1 = acle::pg1(); + return svld1(pg1, a); + } + // Real double + inline vecd operator()(double *a){ + pred pg1 = acle::pg1(); + return svld1(pg1, a); + } + // Integer + inline veci operator()(Integer *a){ + pred pg1 = acle::pg1(); + return svld1(pg1, a); + } +}; ///////////////////////////////////////////////////// // Arithmetic operations @@ -320,54 +283,36 @@ struct Vstream{ struct Sum{ // Complex/real float inline vecf operator()(vecf a, vecf b){ - - svbool_t pg1 = acle::pg1(); - typename acle::vt r_v = svadd_x(pg1, 
a, b); - - return r_v; + pred pg1 = acle::pg1(); + return svadd_x(pg1, a, b); } // Complex/real double inline vecd operator()(vecd a, vecd b){ - - svbool_t pg1 = acle::pg1(); - typename acle::vt r_v = svadd_x(pg1, a, b); - - return r_v; + pred pg1 = acle::pg1(); + return svadd_x(pg1, a, b); } // Integer inline veci operator()(veci a, veci b){ - - svbool_t pg1 = acle::pg1(); - typename acle::vt r_v = svadd_x(pg1, a, b); - - return r_v; + pred pg1 = acle::pg1(); + return svadd_x(pg1, a, b); } }; struct Sub{ // Complex/real float inline vecf operator()(vecf a, vecf b){ - - svbool_t pg1 = acle::pg1(); - typename acle::vt r_v = svsub_x(pg1, a, b); - - return r_v; + pred pg1 = acle::pg1(); + return svsub_x(pg1, a, b); } // Complex/real double inline vecd operator()(vecd a, vecd b){ - - svbool_t pg1 = acle::pg1(); - typename acle::vt r_v = svsub_x(pg1, a, b); - - return r_v; + pred pg1 = acle::pg1(); + return svsub_x(pg1, a, b); } // Integer inline veci operator()(veci a, veci b){ - - svbool_t pg1 = acle::pg1(); - typename acle::vt r_v = svsub_x(pg1, a, b); - - return r_v; + pred pg1 = acle::pg1(); + return svsub_x(pg1, a, b); } }; @@ -375,75 +320,50 @@ struct Sub{ struct Mult{ // Real float inline vecf operator()(vecf a, vecf b){ - - svbool_t pg1 = acle::pg1(); - typename acle::vt r_v = svmul_x(pg1, a, b); - - return r_v; + pred pg1 = acle::pg1(); + return svmul_x(pg1, a, b); } // Real double inline vecd operator()(vecd a, vecd b){ - - svbool_t pg1 = acle::pg1(); - typename acle::vt r_v = svmul_x(pg1, a, b); - - return r_v; + pred pg1 = acle::pg1(); + return svmul_x(pg1, a, b); } // Integer inline veci operator()(veci a, veci b){ - - svbool_t pg1 = acle::pg1(); - typename acle::vt r_v = svmul_x(pg1, a, b); - - return r_v; + pred pg1 = acle::pg1(); + return svmul_x(pg1, a, b); } }; struct MultRealPart{ // Complex float inline vecf operator()(vecf a, vecf b){ - - svbool_t pg1 = acle::pg1(); - + pred pg1 = acle::pg1(); // using FCMLA - typename acle::vt z_v = acle::zero(); - typename acle::vt r_v = svcmla_x(pg1, z_v, a, b, 0); - - return r_v; + vecf z_v = acle::zero(); + return svcmla_x(pg1, z_v, a, b, 0); } // Complex double inline vecd operator()(vecd a, vecd b){ - - svbool_t pg1 = acle::pg1(); - + pred pg1 = acle::pg1(); // using FCMLA - typename acle::vt z_v = acle::zero(); - typename acle::vt r_v = svcmla_x(pg1, z_v, a, b, 0); - - return r_v; + vecd z_v = acle::zero(); + return svcmla_x(pg1, z_v, a, b, 0); } }; struct MaddRealPart{ // Complex float inline vecf operator()(vecf a, vecf b, vecf c){ - - svbool_t pg1 = acle::pg1(); - + pred pg1 = acle::pg1(); // using FCMLA - typename acle::vt r_v = svcmla_x(pg1, c, a, b, 0); - - return r_v; + return svcmla_x(pg1, c, a, b, 0); } // Complex double inline vecd operator()(vecd a, vecd b, vecd c){ - - svbool_t pg1 = acle::pg1(); - + pred pg1 = acle::pg1(); // using FCMLA - typename acle::vt r_v = svcmla_x(pg1, c, a, b, 0); - - return r_v; + return svcmla_x(pg1, c, a, b, 0); } }; @@ -451,162 +371,122 @@ struct MultComplex{ // Complex a*b // Complex float inline vecf operator()(vecf a, vecf b){ - - svbool_t pg1 = acle::pg1(); - typename acle::vt z = acle::zero(); - + pred pg1 = acle::pg1(); + vecf z = acle::zero(); // using FCMLA - typename acle::vt r_v = svcmla_x(pg1, z, a, b, 90); - r_v = svcmla_x(pg1, r_v, a, b, 0); - - return r_v; + vecf r_v = svcmla_x(pg1, z, a, b, 0); + return svcmla_x(pg1, r_v, a, b, 90); } // Complex double inline vecd operator()(vecd a, vecd b){ - - svbool_t pg1 = acle::pg1(); - typename acle::vt z = acle::zero(); - + pred pg1 = 
acle::pg1(); + vecd z = acle::zero(); // using FCMLA - typename acle::vt r_v = svcmla_x(pg1, z, a, b, 90); - r_v = svcmla_x(pg1, r_v, a, b, 0); - - return r_v; + vecd r_v = svcmla_x(pg1, z, a, b, 90); + return svcmla_x(pg1, r_v, a, b, 0); } }; struct Div{ // Real float inline vecf operator()(vecf a, vecf b){ - - svbool_t pg1 = acle::pg1(); - typename acle::vt r_v = svdiv_x(pg1, a, b); - - return r_v; + pred pg1 = acle::pg1(); + return svdiv_x(pg1, a, b); } // Real double inline vecf operator()(vecf a, vecf b){ - - svbool_t pg1 = acle::pg1(); - typename acle::vt r_v = svdiv_x(pg1, a, b); - - return r_v; + pred pg1 = acle::pg1(); + return svdiv_x(pg1, a, b); } }; struct Conj{ // Complex float inline vecf operator()(vecf a){ - - svbool_t pg_odd = acle::pg_odd(); - typename acle::vt r_v = svneg_x(pg_odd, a); - - return r_v; + pred pg_odd = acle::pg_odd(); + return svneg_x(pg_odd, a); } // Complex double inline vecd operator()(vecd a){ - - svbool_t pg_odd = acle::pg_odd(); - typename acle::vt r_v = svneg_x(pg_odd, a); - - return r_v; + pred pg_odd = acle::pg_odd(); + return svneg_x(pg_odd, a); } }; struct TimesMinusI{ // Complex float inline vecf operator()(vecf a, vecf b){ - const vec::uint> tbl_swap = acle::tbl_swap(); - svbool_t pg1 = acle::pg1(); - svbool_t pg_odd = acle::pg_odd(); + pred pg1 = acle::pg1(); + pred pg_odd = acle::pg_odd(); typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); - typename acle::vt a_v = svld1(pg1, a.v); + vecf a_v = svld1(pg1, a.v); a_v = svtbl(a_v, tbl_swap_v); - typename acle::vt r_v = svneg_x(pg_odd, a_v); - - return r_v; + return svneg_x(pg_odd, a_v); } // Complex double inline vecd operator()(vecd a, vecd b){ - const vec::uint> tbl_swap = acle::tbl_swap(); - svbool_t pg1 = acle::pg1(); - svbool_t pg_odd = acle::pg_odd(); + pred pg1 = acle::pg1(); + pred pg_odd = acle::pg_odd(); typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); - typename acle::vt a_v = svld1(pg1, a.v); + vecd a_v = svld1(pg1, a.v); a_v = svtbl(a_v, tbl_swap_v); - typename acle::vt r_v = svneg_x(pg_odd, a_v); - - return r_v; + return svneg_x(pg_odd, a_v); } }; struct TimesI{ // Complex float inline vecf operator()(vecf a, vecf b){ - const vec::uint> tbl_swap = acle::tbl_swap(); - svbool_t pg1 = acle::pg1(); - svbool_t pg_even = acle::pg_even(); + pred pg1 = acle::pg1(); + pred pg_even = acle::pg_even(); typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); - typename acle::vt a_v = svld1(pg1, a.v); + vecf a_v = svld1(pg1, a.v); a_v = svtbl(a_v, tbl_swap_v); - typename acle::vt r_v = svneg_x(pg_even, a_v); - - return r_v; + return svneg_x(pg_even, a_v); } // Complex double inline vecd operator()(vecd a, vecd b){ - const vec::uint> tbl_swap = acle::tbl_swap(); - svbool_t pg1 = acle::pg1(); - svbool_t pg_even = acle::pg_even(); + pred pg1 = acle::pg1(); + pred pg_even = acle::pg_even(); typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); - typename acle::vt a_v = svld1(pg1, a.v); + vecd a_v = svld1(pg1, a.v); a_v = svtbl(a_v, tbl_swap_v); - typename acle::vt r_v = svneg_x(pg_even, a_v); - - return r_v; + return svneg_x(pg_even, a_v); } }; struct PrecisionChange { static inline vech StoH (vecf sa, vecf sb) { - - svbool_t pg1s = acle::pg1(); - typename acle::vt ha_v = svcvt_f16_x(pg1s, sa); - typename acle::vt hb_v = svcvt_f16_x(pg1s, sb); - typename acle::vt r_v = svuzp1(ha_v, hb_v); - - return r_v; + pred pg1s = acle::pg1(); + vech ha_v = svcvt_f16_x(pg1s, sa); + vech hb_v = svcvt_f16_x(pg1s, sb); + return svuzp1(ha_v, hb_v); } static inline void HtoS(vech h,vecf 
&sa,vecf &sb) { - - svbool_t pg1s = acle::pg1(); - typename acle::vt ha_v = svzip1(h, h); - typename acle::vt hb_v = svzip2(h, h); + pred pg1s = acle::pg1(); + vech ha_v = svzip1(h, h); + vech hb_v = svzip2(h, h); sa = svcvt_f32_x(pg1s, ha); sb = svcvt_f32_x(pg1s, hb); } static inline vecf DtoS (vecd a,vecd b) { - - svbool_t pg1d = acle::pg1(); - typename acle::vt sa_v = svcvt_f32_x(pg1d, a); - typename acle::vt sb_v = svcvt_f32_x(pg1d, b); - typename acle::vt r_v = svuzp1(sa_v, sb_v); - - return r_v; + pred pg1d = acle::pg1(); + vecf sa_v = svcvt_f32_x(pg1d, a); + vecf sb_v = svcvt_f32_x(pg1d, b); + return svuzp1(sa_v, sb_v); } static inline void StoD (vecf s,vecd &a,vecd &b) { - - svbool_t pg1d = acle::pg1(); - typename acle::vt sa_v = svzip1(s, s); - typename acle::vt sb_v = svzip2(s, s); + pred pg1d = acle::pg1(); + vecf sa_v = svzip1(s, s); + vecf sb_v = svzip2(s, s); a = svcvt_f64_x(pg1d, sa_v); b = svcvt_f64_x(pg1d, sb_v); } @@ -759,69 +639,43 @@ struct Exchange{ struct Permute{ // float static inline vecf Permute0(vecf in) { - - typename acle::vt r_v = svext(a_v, a_v, (uint64_t)(W::r / 2u)); - - return r_v; + return svext(in, in, (uint64_t)(16u / 2u)); } static inline vecf Permute1(vecf in) { - const vec::uint> tbl_swap = acle::tbl1(); - svbool_t pg1 = acle::pg1(); - typename acle::vt a_v = svld1(pg1, in.v); + pred pg1 = acle::pg1(); typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); - typename acle::vt r_v = svtbl(a_v, tbl_swap_v); - - return r_v; + return svtbl(in, tbl_swap_v); } static inline vecf Permute2(vecf in) { - const vec::uint> tbl_swap = acle::tbl2(); - svbool_t pg1 = acle::pg1(); - typename acle::vt a_v = svld1(pg1, in.v); + pred pg1 = acle::pg1(); typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); - typename acle::vt r_v = svtbl(a_v, tbl_swap_v); - - return r_v; + return svtbl(in, tbl_swap_v); } static inline vecf Permute3(vecf in) { - const vec::uint> tbl_swap = acle::tbl_swap(); - svbool_t pg1 = acle::pg1(); - typename acle::vt a_v = svld1(pg1, in.v); + pred pg1 = acle::pg1(); typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); - typename acle::vt r_v = svtbl(a_v, tbl_swap_v); - svst1(pg1, out.v, r_v); - - return r_v; + return svtbl(in, tbl_swap_v); } // double static inline vecd Permute0(vecd in) { - - typename acle::vt r_v = svext(a_v, a_v, (uint64_t)(W::r / 2u)); - - return r_v; + return svext(in, in, (uint64_t)(8u / 2u)); } static inline vecd Permute1(vecd in) { const vec::uint> tbl_swap = acle::tbl1(); - svbool_t pg1 = acle::pg1(); - typename acle::vt a_v = svld1(pg1, in.v); + pred pg1 = acle::pg1(); typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); - typename acle::vt r_v = svtbl(a_v, tbl_swap_v); - - return r_v; + return svtbl(in, tbl_swap_v); } static inline vecd Permute2(vecd in) { - const vec::uint> tbl_swap = acle::tbl_swap(); - svbool_t pg1 = acle::pg1(); - typename acle::vt a_v = svld1(pg1, in.v); + pred pg1 = acle::pg1(); typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); - typename acle::vt r_v = svtbl(a_v, tbl_swap_v); - - return r_v; + return svtbl(in, tbl_swap_v); } static inline vecd Permute3(vecd in) { return in; @@ -867,14 +721,10 @@ struct Rotate{ } template static inline vecf tRotate(vecf in){ - - typename acle::vt r_v = svext(in, in, (uint64_t)(n%W::r)); - return r_v; + return svext(in, in, (uint64_t)(n%16u)); } template static inline vecd tRotate(vecd in){ - - typename acle::vt r_v = svext(in, in, (uint64_t)(n%W::r)); - return r_v; + return svext(in, in, (uint64_t)(n%8u)); } }; @@ -896,72 +746,55 @@ 
struct Reduce{ return 0; } }; - //Complex float Reduce template <> inline Grid::ComplexF Reduce::operator()(vecf in){ - - svbool_t pg_even = acle::pg_even(); - svbool_t pg_odd = acle::pg_odd(); + pred pg_even = acle::pg_even(); + pred pg_odd = acle::pg_odd(); float a = svred(pg_even, in); float b = svred(pg_odd, in); - return Grid::ComplexF(a, b); } - //Real float Reduce template <> inline Grid::RealF Reduce::operator()(vecf in){ - - svbool_t pg1 = acle::pg1(); - float a = svred(pg1, in); - - return a; + pred pg1 = acle::pg1(); + return svred(pg1, in); } - //Complex double Reduce template <> inline Grid::ComplexD Reduce::operator()(vecd in){ - - svbool_t pg_even = acle::pg_even(); - svbool_t pg_odd = acle::pg_odd(); + pred pg_even = acle::pg_even(); + pred pg_odd = acle::pg_odd(); double a = svred(pg_even, in); double b = svred(pg_odd, in); - return Grid::ComplexD(a, b); } - //Real double Reduce template <> inline Grid::RealD Reduce::operator()(vecd in){ - - svbool_t pg1 = acle::pg1(); - double a = svred(pg1, in); - - return a; + pred pg1 = acle::pg1(); + return svred(pg1, in); } //Integer Reduce template <> inline Integer Reduce::operator()(veci in){ - - svbool_t pg1 = acle::pg1(); - Integer a = svred(pg1, in); - - return a; + pred pg1 = acle::pg1(); + return svred(pg1, in); } #undef svred -NAMESPACE_END(Optimization) +NAMESPACE_END(Optimization); ////////////////////////////////////////////////////////////////////////////////////// // Here assign types -typedef Optimization::vech SIMD_Htype; // Reduced precision type -typedef Optimization::vecf SIMD_Ftype; // Single precision type -typedef Optimization::vecd SIMD_Dtype; // Double precision type -typedef Optimization::veci SIMD_Itype; // Integer type +typedef vech SIMD_Htype; // Reduced precision type +typedef vecf SIMD_Ftype; // Single precision type +typedef vecd SIMD_Dtype; // Double precision type +typedef veci SIMD_Itype; // Integer type // prefetch utilities inline void v_prefetch0(int size, const char *ptr){}; From 2b81cbe2c2f601460a6127b88173b79e4eabe6d6 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 7 May 2020 22:01:19 +0200 Subject: [PATCH 058/147] first attempt to introduce tables using fixed-size; still incomplete --- Grid/simd/Grid_a64fx-fixedsize.h | 110 ++++++++++++++----------------- 1 file changed, 49 insertions(+), 61 deletions(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index a2c75d92..4080ad5d 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -49,6 +49,8 @@ typedef svfloat16_t vech __attribute__((arm_sve_vector_bits(512))); typedef svfloat32_t vecf __attribute__((arm_sve_vector_bits(512))); typedef svfloat64_t vecd __attribute__((arm_sve_vector_bits(512))); typedef svuint32_t veci __attribute__((arm_sve_vector_bits(512))); +typedef svuint32_t lutf __attribute__((arm_sve_vector_bits(512))); // LUTs for float +typedef svuint64_t lutd __attribute__((arm_sve_vector_bits(512))); // LUTs for double #else #pragma error("Oops. 
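// The lutf/lutd typedefs introduced above depend on the gcc 10 fixed-size
// SVE extension: with -msve-vector-bits=512 (which defines
// __ARM_FEATURE_SVE_BITS) the arm_sve_vector_bits attribute turns the
// sizeless ACLE types into ordinary 64-byte value types that can be struct
// members, function results and by-value arguments. A minimal standalone
// sketch (assumes gcc >= 10 and -march=armv8-a+sve -msve-vector-bits=512):
#include <arm_sve.h>
typedef svfloat64_t fixed_vecd __attribute__((arm_sve_vector_bits(512)));
static_assert(sizeof(fixed_vecd) == 64, "512-bit SVE vector is 64 bytes");
inline fixed_vecd scale(fixed_vecd x, double s) {
  // fixed-size vectors convert implicitly to/from the sizeless ACLE types
  return svmul_x(svptrue_b64(), x, svdup_f64(s));
}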
Wrong or undefined SVE vector size?") #endif /* __ARM_FEATURE_SVE_BITS */ @@ -109,21 +111,24 @@ struct acle{ typedef svuint64_t svuint; static inline pred pg1(){return svptrue_b64();} - static inline vec tbl_swap(){ - const vec t = {1, 0, 3, 2, 5, 4, 7, 6}; - return t; + static inline lutd tbl_swap(){ + const uint64_t t[8] = {1, 0, 3, 2, 5, 4, 7, 6}; + pred pg1 = svptrue_b64(); + return svld1(pg1, t); } - static inline vec tbl0(){ - const vec t = {4, 5, 6, 7, 0, 1, 2, 3}; - return t; + static inline lutd tbl0(){ + const uint64_t t[8] = {4, 5, 6, 7, 0, 1, 2, 3}; + pred pg1 = svptrue_b64(); + return svld1(pg1, t); } - static inline vec tbl1(){ - const vec t = {2, 3, 0, 1, 6, 7, 4, 5}; - return t; + static inline lutd tbl1(){ + const uint64_t t[8] = {2, 3, 0, 1, 6, 7, 4, 5}; + pred pg1 = svptrue_b64(); + return svld1(pg1, t); } static inline pred pg_even(){return svzip1_b64(svptrue_b64(), svpfalse_b());} static inline pred pg_odd() {return svzip1_b64(svpfalse_b(), svptrue_b64());} - static inline svfloat64_t zero(){return svdup_f64(0.);} + static inline vecd zero(){return svdup_f64(0.);} }; template <> @@ -136,25 +141,29 @@ struct acle{ static inline pred pg1(){return svptrue_b32();} // exchange neighboring elements - static inline vec tbl_swap(){ - const vec t = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; - return t; + static inline lutf tbl_swap(){ + const uint32_t t[16] = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + pred pg1 = svptrue_b32(); + return svld1(pg1, t); } - static inline vec tbl0(){ - const vec t = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}; - return t; + static inline lutf tbl0(){ + const uint32_t t[16] = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}; + pred pg1 = svptrue_b32(); + return svld1(pg1, t); } static inline vec tbl1(){ - const vec t = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}; - return t; + const lutf = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}; + pred pg1 = svptrue_b32(); + return svld1(pg1, t); } static inline vec tbl2(){ - const vec t = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; - return t; + const lutf = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; + pred pg1 = svptrue_b32(); + return svld1(pg1, t); } static inline pred pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());} static inline pred pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());} - static inline svfloat32_t zero(){return svdup_f32(0.);} + static inline vecf zero(){return svdup_f32(0.);} }; template <> @@ -167,7 +176,7 @@ struct acle{ static inline pred pg1(){return svptrue_b16();} static inline pred pg_even(){return svzip1_b16(svptrue_b16(), svpfalse_b());} static inline pred pg_odd() {return svzip1_b16(svpfalse_b(), svptrue_b16());} - static inline svfloat16_t zero(){return svdup_f16(0.);} + static inline vech zero(){return svdup_f16(0.);} }; template <> @@ -180,7 +189,6 @@ struct acle{ //static inline svbool_t pg1(){return svptrue_b16();} static inline pred pg1(){return svptrue_b32();} - static inline pred pg2(){return svptrue_pat_b32(SV_VL8);} static inline pred pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());} static inline pred pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());} }; @@ -416,24 +424,20 @@ struct Conj{ struct TimesMinusI{ // Complex float inline vecf operator()(vecf a, vecf b){ - const vec::uint> tbl_swap = acle::tbl_swap(); + lutf tbl_swap = acle::tbl_swap(); pred pg1 = acle::pg1(); pred pg_odd = acle::pg_odd(); - typename acle::svuint tbl_swap_v = 
svld1(pg1, tbl_swap.v); - vecf a_v = svld1(pg1, a.v); - a_v = svtbl(a_v, tbl_swap_v); + vecf a_v = svtbl(a, tbl_swap); return svneg_x(pg_odd, a_v); } // Complex double inline vecd operator()(vecd a, vecd b){ - const vec::uint> tbl_swap = acle::tbl_swap(); + lutd tbl_swap = acle::tbl_swap(); pred pg1 = acle::pg1(); pred pg_odd = acle::pg_odd(); - typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); - vecd a_v = svld1(pg1, a.v); - a_v = svtbl(a_v, tbl_swap_v); + vecd a_v = svtbl(a, tbl_swap); return svneg_x(pg_odd, a_v); } }; @@ -441,24 +445,20 @@ struct TimesMinusI{ struct TimesI{ // Complex float inline vecf operator()(vecf a, vecf b){ - const vec::uint> tbl_swap = acle::tbl_swap(); + lutf tbl_swap = acle::tbl_swap(); pred pg1 = acle::pg1(); pred pg_even = acle::pg_even(); - typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); - vecf a_v = svld1(pg1, a.v); - a_v = svtbl(a_v, tbl_swap_v); + vecf a_v = svtbl(a, tbl_swap); return svneg_x(pg_even, a_v); } // Complex double inline vecd operator()(vecd a, vecd b){ - const vec::uint> tbl_swap = acle::tbl_swap(); + lutd tbl_swap = acle::tbl_swap(); pred pg1 = acle::pg1(); pred pg_even = acle::pg_even(); - typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); - vecd a_v = svld1(pg1, a.v); - a_v = svtbl(a_v, tbl_swap_v); + vecd a_v = svtbl(a, tbl_swap); return svneg_x(pg_even, a_v); } }; @@ -642,22 +642,16 @@ struct Permute{ return svext(in, in, (uint64_t)(16u / 2u)); } static inline vecf Permute1(vecf in) { - const vec::uint> tbl_swap = acle::tbl1(); - pred pg1 = acle::pg1(); - typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); - return svtbl(in, tbl_swap_v); + lutf tbl_swap = acle::tbl1(); + return svtbl(in, tbl_swap); } static inline vecf Permute2(vecf in) { - const vec::uint> tbl_swap = acle::tbl2(); - pred pg1 = acle::pg1(); - typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); - return svtbl(in, tbl_swap_v); + lutf tbl_swap = acle::tbl2(); + return svtbl(in, tbl_swap); } static inline vecf Permute3(vecf in) { - const vec::uint> tbl_swap = acle::tbl_swap(); - pred pg1 = acle::pg1(); - typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); - return svtbl(in, tbl_swap_v); + lutf tbl_swap = acle::tbl_swap(); + return svtbl(in, tbl_swap); } // double @@ -665,17 +659,12 @@ struct Permute{ return svext(in, in, (uint64_t)(8u / 2u)); } static inline vecd Permute1(vecd in) { - - const vec::uint> tbl_swap = acle::tbl1(); - pred pg1 = acle::pg1(); - typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); - return svtbl(in, tbl_swap_v); + lutd tbl_swap = acle::tbl1(); + return svtbl(in, tbl_swap); } static inline vecd Permute2(vecd in) { - const vec::uint> tbl_swap = acle::tbl_swap(); - pred pg1 = acle::pg1(); - typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); - return svtbl(in, tbl_swap_v); + lutd tbl_swap = acle::tbl_swap(); + return svtbl(in, tbl_swap); } static inline vecd Permute3(vecd in) { return in; @@ -776,7 +765,6 @@ inline Grid::RealD Reduce::operator()(vecd in){ pred pg1 = acle::pg1(); return svred(pg1, in); } - //Integer Reduce template <> inline Integer Reduce::operator()(veci in){ From b338719bc88b6835bf4d558373df510040878803 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 7 May 2020 22:33:28 +0200 Subject: [PATCH 059/147] first transition to fixed-size done, excl. 
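// TimesI and TimesMinusI above rest on the identities (a + ib) * i = -b + ia
// and (a + ib) * (-i) = b - ia: svtbl with tbl_swap exchanges the re/im
// lanes, then svneg_x negates either the even (real) or the odd (imaginary)
// lanes under a predicate. Scalar check of the identities (sketch):
#include <complex>
inline std::complex<float> times_i_model(std::complex<float> z) {
  // swap -> (im, re); negate the new real lane -> (-im, re), i.e. z * i
  return {-z.imag(), z.real()};
}
inline std::complex<float> times_minus_i_model(std::complex<float> z) {
  // swap -> (im, re); negate the new imaginary lane -> (im, -re), i.e. z * -i
  return {z.imag(), -z.real()};
}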
Exch; next step: integration --- Grid/simd/Grid_a64fx-fixedsize.h | 119 ++++++++++++++----------------- 1 file changed, 52 insertions(+), 67 deletions(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 4080ad5d..944229e8 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -44,6 +44,7 @@ static_assert(GEN_SIMD_WIDTH % 64u == 0, "A64FX SIMD vector size is 64 bytes"); // gcc 10 features #if __ARM_FEATURE_SVE_BITS==512 +#pragma message("Fixed-size SVE ACLE") typedef svbool_t pred __attribute__((arm_sve_vector_bits(512))); typedef svfloat16_t vech __attribute__((arm_sve_vector_bits(512))); typedef svfloat32_t vecf __attribute__((arm_sve_vector_bits(512))); @@ -52,48 +53,9 @@ typedef svuint32_t veci __attribute__((arm_sve_vector_bits(512))); typedef svuint32_t lutf __attribute__((arm_sve_vector_bits(512))); // LUTs for float typedef svuint64_t lutd __attribute__((arm_sve_vector_bits(512))); // LUTs for double #else -#pragma error("Oops. Wrong or undefined SVE vector size?") +#pragma error("Oops. Illegal SVE vector size!?") #endif /* __ARM_FEATURE_SVE_BITS */ -NAMESPACE_BEGIN(Grid); -NAMESPACE_BEGIN(Optimization); - - // type traits giving the number of elements for each vector type - template struct W; - template <> struct W { - constexpr static unsigned int c = GEN_SIMD_WIDTH/16u; - constexpr static unsigned int r = GEN_SIMD_WIDTH/8u; - }; - template <> struct W { - constexpr static unsigned int c = GEN_SIMD_WIDTH/8u; - constexpr static unsigned int r = GEN_SIMD_WIDTH/4u; - }; - template <> struct W { - constexpr static unsigned int r = GEN_SIMD_WIDTH/4u; - }; - template <> struct W { - constexpr static unsigned int c = GEN_SIMD_WIDTH/4u; - constexpr static unsigned int r = GEN_SIMD_WIDTH/2u; - }; - template <> struct W { - constexpr static unsigned int c = GEN_SIMD_WIDTH/16u; - constexpr static unsigned int r = GEN_SIMD_WIDTH/8u; - }; - - // SIMD vector types - template - struct vec { - alignas(GEN_SIMD_WIDTH) T v[W::r]; - }; - - typedef vec vecf; - typedef vec vecd; - typedef vec vech; // half precision comms - typedef vec veci; - -NAMESPACE_END(Optimization) -NAMESPACE_END(Grid) - // low-level API NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Optimization); @@ -103,13 +65,6 @@ struct acle{}; template <> struct acle{ - typedef svfloat64_t vt; - typedef svfloat64x2_t vt2; - typedef svfloat64x4_t vt4; - typedef float64_t pt; - typedef uint64_t uint; - typedef svuint64_t svuint; - static inline pred pg1(){return svptrue_b64();} static inline lutd tbl_swap(){ const uint64_t t[8] = {1, 0, 3, 2, 5, 4, 7, 6}; @@ -133,12 +88,6 @@ struct acle{ template <> struct acle{ - typedef svfloat32_t vt; - typedef svfloat32x2_t vt2; - typedef float32_t pt; - typedef uint32_t uint; - typedef svuint32_t svuint; - static inline pred pg1(){return svptrue_b32();} // exchange neighboring elements static inline lutf tbl_swap(){ @@ -168,11 +117,6 @@ struct acle{ template <> struct acle{ - typedef svfloat16_t vt; - typedef float16_t pt; - typedef uint16_t uint; - typedef svuint16_t svuint; - static inline pred pg1(){return svptrue_b16();} static inline pred pg_even(){return svzip1_b16(svptrue_b16(), svpfalse_b());} static inline pred pg_odd() {return svzip1_b16(svpfalse_b(), svptrue_b16());} @@ -181,12 +125,6 @@ struct acle{ template <> struct acle{ - typedef svuint32_t vt; - typedef svuint32x2_t vt2; - typedef Integer pt; - typedef uint32_t uint; - typedef svuint32_t svuint; - //static inline svbool_t pg1(){return svptrue_b16();} static inline pred 
pg1(){return svptrue_b32();} static inline pred pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());} @@ -542,10 +480,56 @@ struct PrecisionChange { } }; -// %%%% TODO ----------------- +// %%%% FIXME ----------------- struct Exchange{ + // float + static inline void Exchange0(vecf &out1, vecf &out2, vecf in1, vecf in2){ + vecf r1_v = svext(in1, in1, (uint64_t)8u); + vecf r2_v = svext(in2, in2, (uint64_t)8u); + out1 = svext(r1_v, in2, (uint64_t)8u); + out2 = svext(a1_v, r2_v, (uint64_t)8u); + } + static inline void Exchange1(vecf &out1, vecf &out2, vecf in1, vecf in2){ + // FIXME + out1 = in1; + out2 = in2; + } + static inline void Exchange2(vecf &out1, vecf &out2, vecf in1, vecf in2){ + // FIXME + out1 = in1; + out2 = in2; + //out1 = (vecf)svtrn1((vecd)in1, (vecd)in2); + //out2 = (vecf)svtrn2((vecd)in1, (vecd)in2); + } + static inline void Exchange3(vecf &out1, vecf &out2, vecf in1, vecf in2){ + out1 = svtrn1(in1, in2); + out2 = svtrn2(in1, in2); + } + // double + static inline void Exchange0(vecd &out1, vecd &out2, vecd in1, vecd in2){ + vecd r1_v = svext(in1, in1, (uint64_t)4u); + vecd r2_v = svext(in2, in2, (uint64_t)4u); + out1 = svext(r1_v, in2, (uint64_t)4u); + out2 = svext(a1_v, r2_v, (uint64_t)4u); + } + static inline void Exchange1(vecd &out1, vecd &out2, vecd in1, vecd in2){ + // FIXME + out1 = in1; + out2 = in2; + } + static inline void Exchange2(vecd &out1, vecd &out2, vecd in1, vecd in2){ + out1 = svtrn1(in1, in2); + out2 = svtrn2(in1, in2); + } + static inline void Exchange3(vecd &out1, vecd &out2, vecd in1, vecd in2){ + assert(0); + return; + } + + // old +/* // Exchange0 is valid for arbitrary SVE vector length template static inline void Exchange0(vec &out1, vec &out2, const vec &in1, const vec &in2){ @@ -563,7 +547,7 @@ struct Exchange{ -/* FIXME use svcreate etc. or switch to table lookup directly +// FIXME use svcreate etc. 
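// Exchange0 above splices the inputs with three EXT instructions:
// svext(x, y, k) returns lanes k..w-1 of x followed by lanes 0..k-1 of y,
// so rotating each input by half a vector and splicing against the other
// input pairs the lower halves into out1 and the upper halves into out2.
// Scalar model of the intended result (sketch; w = 16 floats or 8 doubles):
template <typename T, int w>
inline void exchange0_model(T (&out1)[w], T (&out2)[w],
                            const T (&in1)[w], const T (&in2)[w]) {
  for (int i = 0; i < w / 2; i++) {
    out1[i]         = in1[i];          // lower half of in1
    out1[i + w / 2] = in2[i];          // lower half of in2
    out2[i]         = in1[i + w / 2];  // upper half of in1
    out2[i + w / 2] = in2[i + w / 2];  // upper half of in2
  }
}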
or switch to table lookup directly template static inline void Exchange1(vec &out1, vec &out2, const vec &in1, const vec &in2){ @@ -583,7 +567,7 @@ struct Exchange{ svst4(pg4, (typename acle::pt*)out1.v, out1_v4); svst4(pg4, (typename acle::pt*)out2.v, out2_v4); } -*/ + #define VECTOR_FOR(i, w, inc) \ for (unsigned int i = 0; i < w; i += inc) @@ -634,6 +618,7 @@ struct Exchange{ assert(0); return; } + */ }; struct Permute{ From 3417147b1150623a502568f5108121eb77af4e88 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 8 May 2020 10:20:19 +0200 Subject: [PATCH 060/147] added real fma, corrected typos in tbls; integrated, must supply A64FXGCC with GEN in configure --- Grid/simd/Grid_a64fx-fixedsize.h | 28 +++++++++++++++++++--------- Grid/simd/Grid_vector_types.h | 5 ++++- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 944229e8..7cfea2b3 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -2,7 +2,7 @@ Grid physics library, www.github.com/paboyle/Grid - Source file: Grid_a64fx-2.h + Source file: Grid_a64fx-fixedsize.h Copyright (C) 2020 @@ -30,11 +30,11 @@ // Using SVE ACLE ///////////////////////////////////////////////////// -#ifndef GEN_SIMD_WIDTH -#define GEN_SIMD_WIDTH 64u -#endif +//#ifndef GEN_SIMD_WIDTH +//#define GEN_SIMD_WIDTH 64u +//#endif -static_assert(GEN_SIMD_WIDTH % 64u == 0, "A64FX SIMD vector size is 64 bytes"); +//static_assert(GEN_SIMD_WIDTH % 64u == 0, "A64FX SIMD vector size is 64 bytes"); #ifdef __ARM_FEATURE_SVE #include @@ -100,13 +100,13 @@ struct acle{ pred pg1 = svptrue_b32(); return svld1(pg1, t); } - static inline vec tbl1(){ - const lutf = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}; + static inline lutf tbl1(){ + const uint32_t t[16] = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}; pred pg1 = svptrue_b32(); return svld1(pg1, t); } - static inline vec tbl2(){ - const lutf = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; + static inline lutf tbl2(){ + const uint32_t t[16] = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; pred pg1 = svptrue_b32(); return svld1(pg1, t); } @@ -264,6 +264,16 @@ struct Sub{ }; struct Mult{ + // Real float fma + inline void mac(vecf &a, vecf b, vecf c){ + pred pg1 = acle::pg1(); + a = svmad_x(pg1, b, c, a); + } + // Real double fma + inline void mac(vecd &a, vecd b, vecd c){ + pred pg1 = acle::pg1(); + a = svmad_x(pg1, b, c, a); + } // Real float inline vecf operator()(vecf a, vecf b){ pred pg1 = acle::pg1(); diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index bd5b4704..23a17cf4 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -129,7 +129,10 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { #include "Grid_generic.h" #endif #endif - +// A64FX with gcc 10 +#ifdef A64FXGCC +#include "Grid_a64fx-fixedsize.h" +#endif #ifdef SSE4 #include "Grid_sse4.h" #endif From 267cce66a193bdf3752c6954d6d742749d951d95 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 8 May 2020 10:29:28 +0200 Subject: [PATCH 061/147] added more debug output --- Grid/simd/Grid_a64fx-fixedsize.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 7cfea2b3..73848aab 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -37,6 +37,7 @@ //static_assert(GEN_SIMD_WIDTH % 64u == 0, "A64FX SIMD vector size is 64 bytes"); #ifdef 
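// The mac() overloads added above ("added real fma") map Grid's a += b * c
// onto one fused multiply-add: svmad_x(pg, b, c, a) evaluates b * c + a per
// lane with a single rounding. Scalar model (sketch):
#include <cmath>
inline void mac_model(double *a, const double *b, const double *c, int w) {
  for (int i = 0; i < w; i++)
    a[i] = std::fma(b[i], c[i], a[i]);  // fused: one rounding per element
}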
__ARM_FEATURE_SVE + #pragma message("Yes, we have SVE feature") #include #else #pragma error "Missing SVE feature" From 32d1a0bbea1a02ff4172bea2aff7e913ec67cb60 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 8 May 2020 10:39:26 +0200 Subject: [PATCH 062/147] added even more debug output --- Grid/simd/Grid_vector_types.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index 23a17cf4..383f72bf 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -117,7 +117,7 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { */ #ifdef GEN #if defined(A64FX) // breakout A64FX SVE ACLE here - //#pragma message("building for A64FX / SVE ACLE") + #pragma message("building for A64FX / SVE ACLE VLA") #if defined(ARMCLANGCOMPAT) #pragma message("applying armclang fix") //#else @@ -126,6 +126,7 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { #include #include "Grid_a64fx-2.h" #else + #pragma message("building for A64FX / GEN") #include "Grid_generic.h" #endif #endif From f45621109b553800863e0e7c61d0cd15e25194b3 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 8 May 2020 10:41:52 +0200 Subject: [PATCH 063/147] placed typedefs in Optimization --- Grid/simd/Grid_a64fx-fixedsize.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 73848aab..7e4972d0 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -43,6 +43,10 @@ #pragma error "Missing SVE feature" #endif /* __ARM_FEATURE_SVE */ +// low-level API +NAMESPACE_BEGIN(Grid); +NAMESPACE_BEGIN(Optimization); + // gcc 10 features #if __ARM_FEATURE_SVE_BITS==512 #pragma message("Fixed-size SVE ACLE") @@ -57,9 +61,6 @@ typedef svuint64_t lutd __attribute__((arm_sve_vector_bits(512))); // LUTs for #pragma error("Oops. 
Illegal SVE vector size!?") #endif /* __ARM_FEATURE_SVE_BITS */ -// low-level API -NAMESPACE_BEGIN(Grid); -NAMESPACE_BEGIN(Optimization); template struct acle{}; @@ -775,10 +776,10 @@ NAMESPACE_END(Optimization); ////////////////////////////////////////////////////////////////////////////////////// // Here assign types -typedef vech SIMD_Htype; // Reduced precision type -typedef vecf SIMD_Ftype; // Single precision type -typedef vecd SIMD_Dtype; // Double precision type -typedef veci SIMD_Itype; // Integer type +typedef Optimization::vech SIMD_Htype; // Reduced precision type +typedef Optimization::vecf SIMD_Ftype; // Single precision type +typedef Optimization::vecd SIMD_Dtype; // Double precision type +typedef Optimization::veci SIMD_Itype; // Integer type // prefetch utilities inline void v_prefetch0(int size, const char *ptr){}; From 48a340a9d1bc5d0e9b1641fff1c13a9c9195c395 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 8 May 2020 10:47:49 +0200 Subject: [PATCH 064/147] GEN seems to defined by default -> some fixes applied --- Grid/simd/Grid_a64fx-fixedsize.h | 3 +++ Grid/simd/Grid_vector_types.h | 36 ++++++++++++++++++-------------- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 7e4972d0..e32014bf 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -43,6 +43,9 @@ #pragma error "Missing SVE feature" #endif /* __ARM_FEATURE_SVE */ +// safety definition, not sure if it's important +#define GEN_SIMD_WIDTH 64u + // low-level API NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Optimization); diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index 383f72bf..55d521b3 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -115,25 +115,29 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { #include "Grid_generic.h" #endif */ -#ifdef GEN - #if defined(A64FX) // breakout A64FX SVE ACLE here - #pragma message("building for A64FX / SVE ACLE VLA") - #if defined(ARMCLANGCOMPAT) - #pragma message("applying armclang fix") - //#else - // #pragma message("not applying armclang fix") - #endif - #include - #include "Grid_a64fx-2.h" - #else - #pragma message("building for A64FX / GEN") - #include "Grid_generic.h" - #endif -#endif // A64FX with gcc 10 #ifdef A64FXGCC -#include "Grid_a64fx-fixedsize.h" + #include "Grid_a64fx-fixedsize.h" +#else + #ifdef GEN + #if defined(A64FX) // breakout A64FX SVE ACLE here + #pragma message("building for A64FX / SVE ACLE VLA") + #if defined(ARMCLANGCOMPAT) + #pragma message("applying armclang fix") + //#else + // #pragma message("not applying armclang fix") + #endif + #include + #include "Grid_a64fx-2.h" + #else + #pragma message("building for A64FX / GEN") + #include "Grid_generic.h" + #endif + #else + #pragma error("Undefined architecture") + #endif #endif + #ifdef SSE4 #include "Grid_sse4.h" #endif From 92f0f29670b3a4a6fe7bbaf334e60de8817d71bb Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 8 May 2020 10:57:23 +0200 Subject: [PATCH 065/147] fixed double overloading vecf in Div, corrected typos --- Grid/simd/Grid_a64fx-fixedsize.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index e32014bf..42e12a92 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -355,7 +355,7 @@ struct Div{ return svdiv_x(pg1, a, b); } // Real double - inline vecf operator()(vecf a, 
vecf b){ + inline vecd operator()(vecd a, vecd b){ pred pg1 = acle::pg1(); return svdiv_x(pg1, a, b); } @@ -369,7 +369,7 @@ struct Conj{ } // Complex double inline vecd operator()(vecd a){ - pred pg_odd = acle::pg_odd(); + pred pg_odd = acle::pg_odd(); return svneg_x(pg_odd, a); } }; @@ -398,7 +398,7 @@ struct TimesMinusI{ struct TimesI{ // Complex float inline vecf operator()(vecf a, vecf b){ - lutf tbl_swap = acle::tbl_swap(); + lutf tbl_swap = acle::tbl_swap(); pred pg1 = acle::pg1(); pred pg_even = acle::pg_even(); From 0893b4e552185e4c208cb12f784f6b8cd10e82d7 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 8 May 2020 10:59:07 +0200 Subject: [PATCH 066/147] fixed typos in PrecisionChange --- Grid/simd/Grid_a64fx-fixedsize.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 42e12a92..2039cc7d 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -427,8 +427,8 @@ struct PrecisionChange { pred pg1s = acle::pg1(); vech ha_v = svzip1(h, h); vech hb_v = svzip2(h, h); - sa = svcvt_f32_x(pg1s, ha); - sb = svcvt_f32_x(pg1s, hb); + sa = svcvt_f32_x(pg1s, ha_v); + sb = svcvt_f32_x(pg1s, hb_v); } static inline vecf DtoS (vecd a,vecd b) { pred pg1d = acle::pg1(); From e64bec8c8e12c5ace96f700b8150a9b1fa17da01 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 8 May 2020 11:04:21 +0200 Subject: [PATCH 067/147] pulled SVE typedefs out of Optimization --- Grid/simd/Grid_a64fx-fixedsize.h | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 2039cc7d..4b143eef 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -43,13 +43,6 @@ #pragma error "Missing SVE feature" #endif /* __ARM_FEATURE_SVE */ -// safety definition, not sure if it's important -#define GEN_SIMD_WIDTH 64u - -// low-level API -NAMESPACE_BEGIN(Grid); -NAMESPACE_BEGIN(Optimization); - // gcc 10 features #if __ARM_FEATURE_SVE_BITS==512 #pragma message("Fixed-size SVE ACLE") @@ -64,6 +57,12 @@ typedef svuint64_t lutd __attribute__((arm_sve_vector_bits(512))); // LUTs for #pragma error("Oops. 
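// The HtoS fix above completes the widening pattern used throughout
// PrecisionChange: svzip1/svzip2 duplicate every element of the low/high
// half next to itself, and the widening svcvt then converts the
// even-numbered source lanes, one per wider result lane. Net effect as a
// scalar model (sketch, using gcc's _Float16; 32 halves into 2 x 16 floats,
// StoD is analogous):
inline void htos_model(const _Float16 (&h)[32],
                       float (&sa)[16], float (&sb)[16]) {
  for (int i = 0; i < 16; i++) {
    sa[i] = (float)h[i];       // via svzip1(h, h) + svcvt_f32_x
    sb[i] = (float)h[i + 16];  // via svzip2(h, h) + svcvt_f32_x
  }
}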
Illegal SVE vector size!?") #endif /* __ARM_FEATURE_SVE_BITS */ +// safety definition, not sure if it's necessary +#define GEN_SIMD_WIDTH 64u + +// low-level API +NAMESPACE_BEGIN(Grid); +NAMESPACE_BEGIN(Optimization); template struct acle{}; @@ -503,7 +502,7 @@ struct Exchange{ vecf r1_v = svext(in1, in1, (uint64_t)8u); vecf r2_v = svext(in2, in2, (uint64_t)8u); out1 = svext(r1_v, in2, (uint64_t)8u); - out2 = svext(a1_v, r2_v, (uint64_t)8u); + out2 = svext(in1, r2_v, (uint64_t)8u); } static inline void Exchange1(vecf &out1, vecf &out2, vecf in1, vecf in2){ // FIXME @@ -779,10 +778,10 @@ NAMESPACE_END(Optimization); ////////////////////////////////////////////////////////////////////////////////////// // Here assign types -typedef Optimization::vech SIMD_Htype; // Reduced precision type -typedef Optimization::vecf SIMD_Ftype; // Single precision type -typedef Optimization::vecd SIMD_Dtype; // Double precision type -typedef Optimization::veci SIMD_Itype; // Integer type +typedef vech SIMD_Htype; // Reduced precision type +typedef vecf SIMD_Ftype; // Single precision type +typedef vecd SIMD_Dtype; // Double precision type +typedef veci SIMD_Itype; // Integer type // prefetch utilities inline void v_prefetch0(int size, const char *ptr){}; From 39f3ae5b1d07cbd5327f03042f98a049abd2049c Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 8 May 2020 11:07:14 +0200 Subject: [PATCH 068/147] corrected more types --- Grid/simd/Grid_a64fx-fixedsize.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 4b143eef..34ba955c 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -526,7 +526,7 @@ struct Exchange{ vecd r1_v = svext(in1, in1, (uint64_t)4u); vecd r2_v = svext(in2, in2, (uint64_t)4u); out1 = svext(r1_v, in2, (uint64_t)4u); - out2 = svext(a1_v, r2_v, (uint64_t)4u); + out2 = svext(in1, r2_v, (uint64_t)4u); } static inline void Exchange1(vecd &out1, vecd &out2, vecd in1, vecd in2){ // FIXME From fbed02690d8a3ab1889f769afe19954fb99698e1 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 8 May 2020 12:05:31 +0200 Subject: [PATCH 069/147] some changes in breaking out A64FX: use -DA64FXFIXEDSIZE for fixed size, but also define GEN --- Grid/simd/Grid_a64fx-fixedsize.h | 4 +++- Grid/simd/Grid_vector_types.h | 20 ++++++++++++-------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 34ba955c..901c85b8 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -36,6 +36,7 @@ //static_assert(GEN_SIMD_WIDTH % 64u == 0, "A64FX SIMD vector size is 64 bytes"); +/* #ifdef __ARM_FEATURE_SVE #pragma message("Yes, we have SVE feature") #include @@ -43,6 +44,7 @@ #pragma error "Missing SVE feature" #endif /* __ARM_FEATURE_SVE */ + // gcc 10 features #if __ARM_FEATURE_SVE_BITS==512 #pragma message("Fixed-size SVE ACLE") @@ -58,7 +60,7 @@ typedef svuint64_t lutd __attribute__((arm_sve_vector_bits(512))); // LUTs for #endif /* __ARM_FEATURE_SVE_BITS */ // safety definition, not sure if it's necessary -#define GEN_SIMD_WIDTH 64u +//#define GEN_SIMD_WIDTH 64u // low-level API NAMESPACE_BEGIN(Grid); diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index 55d521b3..f8223441 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -119,25 +119,29 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { #ifdef A64FXGCC #include 
"Grid_a64fx-fixedsize.h" #else - #ifdef GEN - #if defined(A64FX) // breakout A64FX SVE ACLE here + +#ifdef GEN + #if defined(A64FX) || defined(A64FXGCC) // breakout A64FX SVE ACLE here + #include + if defined(A64FX) // VLA #pragma message("building for A64FX / SVE ACLE VLA") #if defined(ARMCLANGCOMPAT) #pragma message("applying armclang fix") //#else - // #pragma message("not applying armclang fix") + //#pragma message("not applying armclang fix") #endif - #include #include "Grid_a64fx-2.h" - #else - #pragma message("building for A64FX / GEN") - #include "Grid_generic.h" + #endif + #if defined(A64FXFIXEDSIZE) // fixed size data types + #include "Grid_a64fx-fixedsize.h" #endif #else - #pragma error("Undefined architecture") + #pragma message("building for GEN") // generic + #include "Grid_generic.h" #endif #endif + #ifdef SSE4 #include "Grid_sse4.h" #endif From ed4d9d17f867e8355ee2527ceb67f18cddaa9f73 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 8 May 2020 12:09:22 +0200 Subject: [PATCH 070/147] corrected type --- Grid/simd/Grid_vector_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index f8223441..21f50892 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -121,7 +121,7 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { #else #ifdef GEN - #if defined(A64FX) || defined(A64FXGCC) // breakout A64FX SVE ACLE here + #if defined(A64FX) || defined(A64FXFIXEDSIZE) // breakout A64FX SVE ACLE here #include if defined(A64FX) // VLA #pragma message("building for A64FX / SVE ACLE VLA") From f9cb6b979fb8cef493e0027519be5be367dd324c Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 8 May 2020 12:11:01 +0200 Subject: [PATCH 071/147] corrected more typos --- Grid/simd/Grid_vector_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index 21f50892..8d1b8704 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -123,7 +123,7 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { #ifdef GEN #if defined(A64FX) || defined(A64FXFIXEDSIZE) // breakout A64FX SVE ACLE here #include - if defined(A64FX) // VLA + #if defined(A64FX) // VLA #pragma message("building for A64FX / SVE ACLE VLA") #if defined(ARMCLANGCOMPAT) #pragma message("applying armclang fix") From 6f79369955fa4ddf5bc4eecaab0adaeb561ae0bb Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 8 May 2020 12:19:24 +0200 Subject: [PATCH 072/147] trying to get rid of macro definition error --- Grid/simd/Grid_a64fx-fixedsize.h | 13 ++++++------- Grid/simd/Grid_vector_types.h | 4 ---- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 901c85b8..1ca84da9 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -36,13 +36,12 @@ //static_assert(GEN_SIMD_WIDTH % 64u == 0, "A64FX SIMD vector size is 64 bytes"); -/* -#ifdef __ARM_FEATURE_SVE - #pragma message("Yes, we have SVE feature") - #include -#else - #pragma error "Missing SVE feature" -#endif /* __ARM_FEATURE_SVE */ +//#ifdef __ARM_FEATURE_SVE +// #pragma message("Yes, we have SVE feature") +// #include +//#else +// #pragma error "Missing SVE feature" +//#endif /* __ARM_FEATURE_SVE */ // gcc 10 features diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index 8d1b8704..3a7db6eb 100644 --- a/Grid/simd/Grid_vector_types.h 
+++ b/Grid/simd/Grid_vector_types.h @@ -115,10 +115,6 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { #include "Grid_generic.h" #endif */ -// A64FX with gcc 10 -#ifdef A64FXGCC - #include "Grid_a64fx-fixedsize.h" -#else #ifdef GEN #if defined(A64FX) || defined(A64FXFIXEDSIZE) // breakout A64FX SVE ACLE here From 71a7350a8566a95a8f721d52231d7f1b9476d419 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 8 May 2020 12:26:51 +0200 Subject: [PATCH 073/147] changed 2nd argument in Reduce to native vector type --- Grid/simd/Grid_a64fx-fixedsize.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 1ca84da9..deeb5c78 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -737,7 +737,7 @@ struct Reduce{ }; //Complex float Reduce template <> -inline Grid::ComplexF Reduce::operator()(vecf in){ +inline Grid::ComplexF Reduce::operator()(svfloat32_t in){ pred pg_even = acle::pg_even(); pred pg_odd = acle::pg_odd(); float a = svred(pg_even, in); @@ -746,13 +746,13 @@ inline Grid::ComplexF Reduce::operator()(vecf in){ } //Real float Reduce template <> -inline Grid::RealF Reduce::operator()(vecf in){ +inline Grid::RealF Reduce::operator()(svfloat32_t in){ pred pg1 = acle::pg1(); return svred(pg1, in); } //Complex double Reduce template <> -inline Grid::ComplexD Reduce::operator()(vecd in){ +inline Grid::ComplexD Reduce::operator()(svfloat64_t in){ pred pg_even = acle::pg_even(); pred pg_odd = acle::pg_odd(); double a = svred(pg_even, in); @@ -761,13 +761,13 @@ inline Grid::ComplexD Reduce::operator()(vecd in){ } //Real double Reduce template <> -inline Grid::RealD Reduce::operator()(vecd in){ +inline Grid::RealD Reduce::operator()(svfloat64_t in){ pred pg1 = acle::pg1(); return svred(pg1, in); } //Integer Reduce template <> -inline Integer Reduce::operator()(veci in){ +inline Integer Reduce::operator()(svuint32_t in){ pred pg1 = acle::pg1(); return svred(pg1, in); } From b0ef2367f3b3330ace38c38bed60e2a80e08edf9 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 8 May 2020 14:22:44 +0200 Subject: [PATCH 074/147] testing alternate call to PrecisionChange --- Grid/simd/Grid_vector_types.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index 3a7db6eb..c38fdd9e 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -958,7 +958,16 @@ accelerator_inline void precisionChange(vRealD *out,vRealF *in,int nvec) assert((nvec&0x1)==0); for(int m=0;m*2 Date: Fri, 8 May 2020 14:34:59 +0200 Subject: [PATCH 075/147] retry changing StoD API --- Grid/simd/Grid_a64fx-fixedsize.h | 2 +- Grid/simd/Grid_vector_types.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index deeb5c78..a8c2b978 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -436,7 +436,7 @@ struct PrecisionChange { vecf sb_v = svcvt_f32_x(pg1d, b); return svuzp1(sa_v, sb_v); } - static inline void StoD (vecf s,vecd &a,vecd &b) { + static inline void StoD (svfloat32_t s,svfloat64_t &a,svfloat64_t &b) { pred pg1d = acle::pg1(); vecf sa_v = svzip1(s, s); vecf sb_v = svzip2(s, s); diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index c38fdd9e..03cda20d 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -967,7 +967,7 @@ 
accelerator_inline void precisionChange(vRealD *out,vRealF *in,int nvec) // 961 | Optimization::PrecisionChange::StoD(in[m].v,out[n].v,out[n+1].v); // | ~~~~~~~^ // Optimization::PrecisionChange::StoD(in[m].v,out[n].v,out[n+1].v); - Optimization::PrecisionChange::StoD(in[m].v,&out[n].v,&out[n+1].v); + Optimization::PrecisionChange::StoD(in[m].v,out[n].v,out[n+1].v); } } accelerator_inline void precisionChange(vRealD *out,vRealH *in,int nvec) From 4bc2ad2894c84accb1f0e6dea5d7ac94ee1c4af0 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 8 May 2020 15:00:25 +0200 Subject: [PATCH 076/147] API change v2 --- Grid/simd/Grid_a64fx-fixedsize.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index a8c2b978..232c2933 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -436,12 +436,12 @@ struct PrecisionChange { vecf sb_v = svcvt_f32_x(pg1d, b); return svuzp1(sa_v, sb_v); } - static inline void StoD (svfloat32_t s,svfloat64_t &a,svfloat64_t &b) { + static inline void StoD (vecf s,vecd *a,vecd *b) { pred pg1d = acle::pg1(); vecf sa_v = svzip1(s, s); vecf sb_v = svzip2(s, s); - a = svcvt_f64_x(pg1d, sa_v); - b = svcvt_f64_x(pg1d, sb_v); + *a = svcvt_f64_x(pg1d, sa_v); + *b = svcvt_f64_x(pg1d, sb_v); } static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) { /* From ceb8b374da83f274547e076d1edf0d49806395bc Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 8 May 2020 15:04:44 +0200 Subject: [PATCH 077/147] API change v3 --- Grid/simd/Grid_vector_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index 03cda20d..c38fdd9e 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -967,7 +967,7 @@ accelerator_inline void precisionChange(vRealD *out,vRealF *in,int nvec) // 961 | Optimization::PrecisionChange::StoD(in[m].v,out[n].v,out[n+1].v); // | ~~~~~~~^ // Optimization::PrecisionChange::StoD(in[m].v,out[n].v,out[n+1].v); - Optimization::PrecisionChange::StoD(in[m].v,out[n].v,out[n+1].v); + Optimization::PrecisionChange::StoD(in[m].v,&out[n].v,&out[n+1].v); } } accelerator_inline void precisionChange(vRealD *out,vRealH *in,int nvec) From 55a55660cbc1e53e24ad0f6c4fddb37c213adbc0 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Sat, 9 May 2020 12:48:42 +0200 Subject: [PATCH 078/147] reverted changes --- Grid/simd/Grid_a64fx-fixedsize.h | 11 ++++++++--- Grid/simd/Grid_vector_types.h | 3 +-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 232c2933..8c5999e3 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -30,6 +30,11 @@ // Using SVE ACLE ///////////////////////////////////////////////////// +/* TODO + * Exchange + * prefetching +*/ + //#ifndef GEN_SIMD_WIDTH //#define GEN_SIMD_WIDTH 64u //#endif @@ -436,12 +441,12 @@ struct PrecisionChange { vecf sb_v = svcvt_f32_x(pg1d, b); return svuzp1(sa_v, sb_v); } - static inline void StoD (vecf s,vecd *a,vecd *b) { + static inline void StoD (vecf s,vecd &a,vecd &b) { pred pg1d = acle::pg1(); vecf sa_v = svzip1(s, s); vecf sb_v = svzip2(s, s); - *a = svcvt_f64_x(pg1d, sa_v); - *b = svcvt_f64_x(pg1d, sb_v); + a = svcvt_f64_x(pg1d, sa_v); + b = svcvt_f64_x(pg1d, sb_v); } static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) { /* diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index 
c38fdd9e..a71817eb 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -958,6 +958,7 @@ accelerator_inline void precisionChange(vRealD *out,vRealF *in,int nvec) assert((nvec&0x1)==0); for(int m=0;m*2 Date: Sat, 9 May 2020 21:21:57 +0200 Subject: [PATCH 079/147] unions for tables eliminate explicit loads, gcc does not complain --- Grid/simd/Grid_a64fx-fixedsize.h | 39 ++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 8c5999e3..523d0d96 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -70,6 +70,17 @@ typedef svuint64_t lutd __attribute__((arm_sve_vector_bits(512))); // LUTs for NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Optimization); +// convenience union types for tables eliminate loads +union ulutf { + lutf v; + uint32_t s[16]; +}; + +union ulutd { + lutd v; + uint64_t s[8]; +}; + template struct acle{}; @@ -77,19 +88,31 @@ template <> struct acle{ static inline pred pg1(){return svptrue_b64();} static inline lutd tbl_swap(){ + /* const uint64_t t[8] = {1, 0, 3, 2, 5, 4, 7, 6}; pred pg1 = svptrue_b64(); return svld1(pg1, t); + */ + const ulutd t = { .s = {1, 0, 3, 2, 5, 4, 7, 6} }; + return t.v; } static inline lutd tbl0(){ + /* const uint64_t t[8] = {4, 5, 6, 7, 0, 1, 2, 3}; pred pg1 = svptrue_b64(); return svld1(pg1, t); + */ + const ulutd t = { .s = {4, 5, 6, 7, 0, 1, 2, 3} }; + return t.v; } static inline lutd tbl1(){ + /* const uint64_t t[8] = {2, 3, 0, 1, 6, 7, 4, 5}; pred pg1 = svptrue_b64(); return svld1(pg1, t); + */ + const ulutd t = { .s = {2, 3, 0, 1, 6, 7, 4, 5} }; + return t.v; } static inline pred pg_even(){return svzip1_b64(svptrue_b64(), svpfalse_b());} static inline pred pg_odd() {return svzip1_b64(svpfalse_b(), svptrue_b64());} @@ -101,24 +124,40 @@ struct acle{ static inline pred pg1(){return svptrue_b32();} // exchange neighboring elements static inline lutf tbl_swap(){ + /* const uint32_t t[16] = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; pred pg1 = svptrue_b32(); return svld1(pg1, t); + */ + const ulutf t = { .s = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; + return t.v; } static inline lutf tbl0(){ + /* const uint32_t t[16] = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}; pred pg1 = svptrue_b32(); return svld1(pg1, t); + */ + const ulutf t = { .s = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7} }; + return t.v; } static inline lutf tbl1(){ + /* const uint32_t t[16] = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}; pred pg1 = svptrue_b32(); return svld1(pg1, t); + */ + const ulutf t = { .s = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11} }; + return t.v; } static inline lutf tbl2(){ + /* const uint32_t t[16] = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; pred pg1 = svptrue_b32(); return svld1(pg1, t); + */ + const ulutf t = { .s = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13} }; + return t.v; } static inline pred pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());} static inline pred pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());} From 291ee8c3d01d763fabae44fb96aaf24cd65b1aed Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Sat, 9 May 2020 22:18:02 +0200 Subject: [PATCH 080/147] updated fixed-size implementation; only Exch1 and prefetches missing --- Grid/simd/Grid_a64fx-fixedsize.h | 170 +++++++++++-------------------- 1 file changed, 62 insertions(+), 108 deletions(-) diff --git 
a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 523d0d96..f16c50d4 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -6,7 +6,8 @@ Copyright (C) 2020 - Author: Nils Meyer + Authors: Nils Meyer Regensburg University + Richard Sandiford Arm This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -27,12 +28,12 @@ /* END LEGAL */ ///////////////////////////////////////////////////// -// Using SVE ACLE +// Using SVE ACLE with fixed-size data types ///////////////////////////////////////////////////// /* TODO - * Exchange - * prefetching + * Exchange1 + * prefetches */ //#ifndef GEN_SIMD_WIDTH @@ -81,6 +82,18 @@ union ulutd { uint64_t s[8]; }; +// FIXME convenience union types for Exchange1 +union uvecf { + vecf v; + float32_t s[16]; +}; + +union uvecd { + vecd v; + float64_t s[8]; +}; + + template struct acle{}; @@ -539,7 +552,8 @@ struct PrecisionChange { } }; -// %%%% FIXME ----------------- +#define VECTOR_FOR(i, w, inc) \ +for (unsigned int i = 0; i < w; i += inc) struct Exchange{ // float @@ -551,15 +565,28 @@ struct Exchange{ } static inline void Exchange1(vecf &out1, vecf &out2, vecf in1, vecf in2){ // FIXME - out1 = in1; - out2 = in2; + uvecf v1 = { .v = in1 }; + uvecf v2 = { .v = in2 }; + uvecf o1, o2; + + const int n = 1; + const int w = 16; // w = W::r + unsigned int mask = w >> (n + 1); + // std::cout << " Exchange "<::r + unsigned int mask = w >> (n + 1); + // std::cout << " Exchange "< - static inline void Exchange0(vec &out1, vec &out2, const vec &in1, const vec &in2){ - - svbool_t pg1 = acle::pg1(); - typename acle::vt a1_v = svld1(pg1, in1.v); - typename acle::vt a2_v = svld1(pg1, in2.v); - typename acle::vt r1_v = svext(a1_v, a1_v, (uint64_t)W::c); - r1_v = svext(r1_v, a2_v, (uint64_t)W::c); - typename acle::vt r2_v = svext(a2_v, a2_v, (uint64_t)W::c); - r2_v = svext(a1_v, r2_v, (uint64_t)W::c); - svst1(pg1, out1.v, r1_v); - svst1(pg1, out2.v, r2_v); - } - - - -// FIXME use svcreate etc. 
or switch to table lookup directly - template - static inline void Exchange1(vec &out1, vec &out2, const vec &in1, const vec &in2){ - - svbool_t pg4 = acle::pg4(); - typename acle::vt4 in1_v4 = svld4(pg4, (typename acle::pt*)in1.v); - typename acle::vt4 in2_v4 = svld4(pg4, (typename acle::pt*)in2.v); - typename acle::vt4 out1_v4; - typename acle::vt4 out2_v4; - out1_v4.v0 = in1_v4.v0; - out1_v4.v1 = in1_v4.v1; - out1_v4.v2 = in2_v4.v0; - out1_v4.v3 = in2_v4.v1; - out2_v4.v0 = in1_v4.v2; - out2_v4.v1 = in1_v4.v3; - out2_v4.v2 = in2_v4.v2; - out2_v4.v3 = in2_v4.v3; - svst4(pg4, (typename acle::pt*)out1.v, out1_v4); - svst4(pg4, (typename acle::pt*)out2.v, out2_v4); - } - - - #define VECTOR_FOR(i, w, inc) \ - for (unsigned int i = 0; i < w; i += inc) - - template - static inline void Exchange1(vec &out1, vec &out2, const vec &in1, const vec &in2){ - // FIXME - const int n = 1; - const int w = W::r; - unsigned int mask = w >> (n + 1); - // std::cout << " Exchange "< - static inline void Exchange2(vec &out1, vec &out2, const vec &in1, const vec &in2){ - - svbool_t pg1 = acle::pg1(); - typename acle::vt a1_v = svld1(pg1, (typename acle::pt*)in1.v); - typename acle::vt a2_v = svld1(pg1, (typename acle::pt*)in2.v); - typename acle::vt r1_v = svtrn1(a1_v, a2_v); - typename acle::vt r2_v = svtrn2(a1_v, a2_v); - svst1(pg1, (typename acle::pt*)out1.v, r1_v); - svst1(pg1, (typename acle::pt*)out2.v, r2_v); - } - - static inline void Exchange3(vecf &out1, vecf &out2, const vecf &in1, const vecf &in2){ - - svbool_t pg1 = acle::pg1(); - typename acle::vt a1_v = svld1(pg1, in1.v); - typename acle::vt a2_v = svld1(pg1, in2.v); - typename acle::vt r1_v = svtrn1(a1_v, a2_v); - typename acle::vt r2_v = svtrn2(a1_v, a2_v); - svst1(pg1, out1.v, r1_v); - svst1(pg1, out2.v, r2_v); - } - - static inline void Exchange3(vecd &out1, vecd &out2, const vecd &in1, const vecd &in2){ - assert(0); - return; - } - */ }; +#undef VECTOR_FOR + struct Permute{ // float static inline vecf Permute0(vecf in) { @@ -754,10 +708,10 @@ struct Rotate{ } template static inline vecf tRotate(vecf in){ - return svext(in, in, (uint64_t)(n%16u)); + return svext(in, in, (uint64_t)n); } template static inline vecd tRotate(vecd in){ - return svext(in, in, (uint64_t)(n%8u)); + return svext(in, in, (uint64_t)n); } }; From b2fd8b993a95291b50fc03831bee5c1366582686 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Sat, 9 May 2020 22:53:42 +0200 Subject: [PATCH 081/147] fixed-size clean up --- Grid/simd/Grid_a64fx-fixedsize.h | 43 +++----------------------------- 1 file changed, 4 insertions(+), 39 deletions(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index f16c50d4..4545379a 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -6,8 +6,8 @@ Copyright (C) 2020 - Authors: Nils Meyer Regensburg University - Richard Sandiford Arm + Author: Nils Meyer Regensburg University + Author: Richard Sandiford Arm This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -101,29 +101,14 @@ template <> struct acle{ static inline pred pg1(){return svptrue_b64();} static inline lutd tbl_swap(){ - /* - const uint64_t t[8] = {1, 0, 3, 2, 5, 4, 7, 6}; - pred pg1 = svptrue_b64(); - return svld1(pg1, t); - */ const ulutd t = { .s = {1, 0, 3, 2, 5, 4, 7, 6} }; return t.v; } static inline lutd tbl0(){ - /* - const uint64_t t[8] = {4, 5, 6, 7, 0, 1, 2, 3}; - pred pg1 = svptrue_b64(); - return svld1(pg1, t); - */ const ulutd 
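// tRotate above lowers to a single EXT instruction: svext(in, in, n)
// concatenates the vector with itself and extracts w lanes starting at lane
// n, i.e. a left rotation by n elements (the earlier n % w was dropped on
// the assumption that the rotation count stays below the lane count).
// Scalar model (sketch):
template <typename T, int w>
inline void rotate_model(T (&out)[w], const T (&in)[w], int n) {
  for (int i = 0; i < w; i++)
    out[i] = in[(i + n) % w];
}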
t = { .s = {4, 5, 6, 7, 0, 1, 2, 3} }; return t.v; } static inline lutd tbl1(){ - /* - const uint64_t t[8] = {2, 3, 0, 1, 6, 7, 4, 5}; - pred pg1 = svptrue_b64(); - return svld1(pg1, t); - */ const ulutd t = { .s = {2, 3, 0, 1, 6, 7, 4, 5} }; return t.v; } @@ -137,38 +122,18 @@ struct acle{ static inline pred pg1(){return svptrue_b32();} // exchange neighboring elements static inline lutf tbl_swap(){ - /* - const uint32_t t[16] = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; - pred pg1 = svptrue_b32(); - return svld1(pg1, t); - */ const ulutf t = { .s = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; return t.v; } static inline lutf tbl0(){ - /* - const uint32_t t[16] = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}; - pred pg1 = svptrue_b32(); - return svld1(pg1, t); - */ const ulutf t = { .s = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7} }; return t.v; } static inline lutf tbl1(){ - /* - const uint32_t t[16] = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}; - pred pg1 = svptrue_b32(); - return svld1(pg1, t); - */ const ulutf t = { .s = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11} }; return t.v; } static inline lutf tbl2(){ - /* - const uint32_t t[16] = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; - pred pg1 = svptrue_b32(); - return svld1(pg1, t); - */ const ulutf t = { .s = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13} }; return t.v; } @@ -400,8 +365,8 @@ struct MultComplex{ pred pg1 = acle::pg1(); vecd z = acle::zero(); // using FCMLA - vecd r_v = svcmla_x(pg1, z, a, b, 90); - return svcmla_x(pg1, r_v, a, b, 0); + vecd r_v = svcmla_x(pg1, z, a, b, 0); + return svcmla_x(pg1, r_v, a, b, 90); } }; From ffaaed679edccbd08fb64bd4785afd204953fbab Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Mon, 11 May 2020 13:21:39 +0200 Subject: [PATCH 082/147] MPI_THREAD_SINGLE hack for Fugaku, enabled by -DTOFU --- Grid/communicator/Communicator_mpi3.cc | 45 ++++++++++++++------------ 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 2576b1fa..dcd984d2 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -1,6 +1,6 @@ /************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid + Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/communicator/Communicator_mpi.cc @@ -35,7 +35,7 @@ Grid_MPI_Comm CartesianCommunicator::communicator_world; //////////////////////////////////////////// // First initialise of comms system //////////////////////////////////////////// -void CartesianCommunicator::Init(int *argc, char ***argv) +void CartesianCommunicator::Init(int *argc, char ***argv) { int flag; @@ -43,6 +43,10 @@ void CartesianCommunicator::Init(int *argc, char ***argv) MPI_Initialized(&flag); // needed to coexist with other libs apparently if ( !flag ) { +#if defined (TOFU) // hack for FUGAKU, credits go to Issaku Kanamori + nCommThreads=1; + MPI_Init(argc,argv); +#else MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided); //If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE @@ -53,6 +57,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) if( (nCommThreads > 1) && (provided != MPI_THREAD_MULTIPLE) ) { assert(0); } +#endif } // Never clean up as done once. 
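// The TOFU branch above (credited to Issaku Kanamori) works around the
// Fugaku MPI stack by not requesting threading support at all: with -DTOFU,
// Grid pins itself to one communications thread and calls plain MPI_Init
// instead of MPI_Init_thread. The control flow, reduced to a standalone
// sketch:
#include <mpi.h>
#include <cassert>
inline void init_comms_model(int *argc, char ***argv, int &nCommThreads) {
#if defined(TOFU)
  nCommThreads = 1;               // single comms thread only
  MPI_Init(argc, argv);           // implies MPI_THREAD_SINGLE
#else
  int provided;
  MPI_Init_thread(argc, argv, MPI_THREAD_MULTIPLE, &provided);
  // one comms thread needs anything above SINGLE, several need MULTIPLE
  if (nCommThreads == 1) assert(provided != MPI_THREAD_SINGLE);
  else                   assert(provided == MPI_THREAD_MULTIPLE);
#endif
}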
@@ -91,7 +96,7 @@ void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor) //////////////////////////////////////////////////////////////////////////////////////////////////////// // Initialises from communicator_world //////////////////////////////////////////////////////////////////////////////////////////////////////// -CartesianCommunicator::CartesianCommunicator(const Coordinate &processors) +CartesianCommunicator::CartesianCommunicator(const Coordinate &processors) { MPI_Comm optimal_comm; //////////////////////////////////////////////////// @@ -110,7 +115,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors) ////////////////////////////////// // Try to subdivide communicator ////////////////////////////////// -CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank) +CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank) { _ndimension = processors.size(); assert(_ndimension>=1); int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension); @@ -127,7 +132,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const ////////////////////////////////////////////////////////////////////////////////////////////////////// // split the communicator ////////////////////////////////////////////////////////////////////////////////////////////////////// - // int Nparent = parent._processors ; + // int Nparent = parent._processors ; int Nparent; MPI_Comm_size(parent.communicator,&Nparent); @@ -149,13 +154,13 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const } // rank within subcomm ; srank is rank of subcomm within blocks of subcomms - int crank; + int crank; // Mpi uses the reverse Lexico convention to us; so reversed routines called Lexicographic::IndexFromCoorReversed(ccoor,crank,processors); // processors is the split grid dimensions Lexicographic::IndexFromCoorReversed(scoor,srank,ssize); // ssize is the number of split grids MPI_Comm comm_split; - if ( Nchild > 1 ) { + if ( Nchild > 1 ) { //////////////////////////////////////////////////////////////// // Split the communicator @@ -180,11 +185,11 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const SetCommunicator(comm_split); /////////////////////////////////////////////// - // Free the temp communicator + // Free the temp communicator /////////////////////////////////////////////// MPI_Comm_free(&comm_split); - if(0){ + if(0){ std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl; for(int d=0;d &lis int myrank = _processor; int ierr; - if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { + if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { MPI_Request xrq; MPI_Request rrq; ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); - + assert(ierr==0); list.push_back(xrq); list.push_back(rrq); - } else { + } else { // Give the CPU to MPI immediately; can use threads to overlap optionally ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank, recv,bytes,MPI_CHAR,from, from, @@ -363,7 +368,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorStencilSendToRecvFromComplete(list,dir); } @@ -432,8 +437,8 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes) communicator); assert(ierr==0); } -int 
CartesianCommunicator::RankWorld(void){ - int r; +int CartesianCommunicator::RankWorld(void){ + int r; MPI_Comm_rank(communicator_world,&r); return r; } @@ -466,7 +471,7 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t // When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug. // (Turns up on 32^3 x 64 Gparity too) MPI_Datatype object; - int iwords; + int iwords; int ibytes; iwords = words; ibytes = bytes; @@ -479,5 +484,3 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t } NAMESPACE_END(Grid); - - From fc2e9850d3eac663b8a2b6d17c572e7f384a16d4 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Mon, 11 May 2020 13:25:02 +0200 Subject: [PATCH 083/147] temporarily enable TOFU by default when using A64FX or A64FXFIXEDSIZE --- Grid/communicator/Communicator_mpi3.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index dcd984d2..31566352 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -43,6 +43,12 @@ void CartesianCommunicator::Init(int *argc, char ***argv) MPI_Initialized(&flag); // needed to coexist with other libs apparently if ( !flag ) { + +// temporarily enable Fugaku/Tofu support by default +#if defined (A64FX) || defined (A64FXFIXEDSIZE) +#define TOFU +#endif + #if defined (TOFU) // hack for FUGAKU, credits go to Issaku Kanamori nCommThreads=1; MPI_Init(argc,argv); From 78b8e40f83219d3eac4356a42fb55938319ec80d Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Mon, 11 May 2020 18:11:23 +0200 Subject: [PATCH 084/147] switched to gcc's internal data types --- Grid/simd/Grid_a64fx-fixedsize.h | 29 +++++++++++++---------------- Grid/simd/Grid_vector_types.h | 2 +- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 4545379a..4f9cf206 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -7,7 +7,9 @@ Copyright (C) 2020 Author: Nils Meyer Regensburg University - Author: Richard Sandiford Arm + + with support from Arm + Richard Sandiford This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -33,26 +35,13 @@ /* TODO * Exchange1 - * prefetches */ -//#ifndef GEN_SIMD_WIDTH -//#define GEN_SIMD_WIDTH 64u -//#endif - -//static_assert(GEN_SIMD_WIDTH % 64u == 0, "A64FX SIMD vector size is 64 bytes"); - -//#ifdef __ARM_FEATURE_SVE -// #pragma message("Yes, we have SVE feature") -// #include -//#else -// #pragma error "Missing SVE feature" -//#endif /* __ARM_FEATURE_SVE */ - - // gcc 10 features #if __ARM_FEATURE_SVE_BITS==512 #pragma message("Fixed-size SVE ACLE") +/* gcc 10.0.1 and gcc 10.1 bug using ACLE data types CAS-159553-Y1K4C6 + workaround: use gcc's internal data types, bugfix expected for gcc 10.2 typedef svbool_t pred __attribute__((arm_sve_vector_bits(512))); typedef svfloat16_t vech __attribute__((arm_sve_vector_bits(512))); typedef svfloat32_t vecf __attribute__((arm_sve_vector_bits(512))); @@ -60,6 +49,14 @@ typedef svfloat64_t vecd __attribute__((arm_sve_vector_bits(512))); typedef svuint32_t veci __attribute__((arm_sve_vector_bits(512))); typedef svuint32_t lutf __attribute__((arm_sve_vector_bits(512))); // LUTs for float typedef svuint64_t lutd __attribute__((arm_sve_vector_bits(512))); // LUTs for double +*/ +typedef __SVBool_t pred __attribute__((arm_sve_vector_bits(512))); +typedef 
__SVFloat16_t vech __attribute__((arm_sve_vector_bits(512))); +typedef __SVFloat32_t vecf __attribute__((arm_sve_vector_bits(512))); +typedef __SVFloat64_t vecd __attribute__((arm_sve_vector_bits(512))); +typedef __SVUint32_t veci __attribute__((arm_sve_vector_bits(512))); +typedef __SVUint32_t lutf __attribute__((arm_sve_vector_bits(512))); // LUTs for float +typedef __SVUint64_t lutd __attribute__((arm_sve_vector_bits(512))); // LUTs for double #else #pragma error("Oops. Illegal SVE vector size!?") #endif /* __ARM_FEATURE_SVE_BITS */ diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index a71817eb..3a2e7228 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -959,7 +959,7 @@ accelerator_inline void precisionChange(vRealD *out,vRealF *in,int nvec) for(int m=0;m*2 Date: Tue, 12 May 2020 03:59:59 +0900 Subject: [PATCH 085/147] corrected typo --- Grid/simd/Grid_a64fx-fixedsize.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 4f9cf206..5c325b25 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -697,7 +697,8 @@ struct Reduce{ }; //Complex float Reduce template <> -inline Grid::ComplexF Reduce::operator()(svfloat32_t in){ +// inline Grid::ComplexF Reduce::operator()(svfloat32_t in){ +inline Grid::ComplexF Reduce::operator()(__SVFloat32_t in){ pred pg_even = acle::pg_even(); pred pg_odd = acle::pg_odd(); float a = svred(pg_even, in); @@ -706,13 +707,15 @@ inline Grid::ComplexF Reduce::operator()(svfloat32_ } //Real float Reduce template <> -inline Grid::RealF Reduce::operator()(svfloat32_t in){ +//inline Grid::RealF Reduce::operator()(svfloat32_t in){ +inline Grid::RealF Reduce::operator()(__SVFloat32_t in){ pred pg1 = acle::pg1(); return svred(pg1, in); } //Complex double Reduce template <> -inline Grid::ComplexD Reduce::operator()(svfloat64_t in){ +//inline Grid::ComplexD Reduce::operator()(svfloat64_t in){ +inline Grid::ComplexD Reduce::operator()(__SVFloat64_t in){ pred pg_even = acle::pg_even(); pred pg_odd = acle::pg_odd(); double a = svred(pg_even, in); @@ -721,13 +724,15 @@ inline Grid::ComplexD Reduce::operator()(svfloat64_ } //Real double Reduce template <> -inline Grid::RealD Reduce::operator()(svfloat64_t in){ +//inline Grid::RealD Reduce::operator()(svfloat64_t in){ +inline Grid::RealD Reduce::operator()(__SVFloat64_t in){ pred pg1 = acle::pg1(); return svred(pg1, in); } //Integer Reduce template <> -inline Integer Reduce::operator()(svuint32_t in){ +//inline Integer Reduce::operator()(svuint32_t in){ +inline Integer Reduce::operator()(__SVUint32_t in){ pred pg1 = acle::pg1(); return svred(pg1, in); } From b7c76ede29726411dbdf615fcedf138a14b71a37 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Mon, 11 May 2020 22:43:00 +0200 Subject: [PATCH 086/147] Removed some assertions in Test_simd and removed exit() in Reduce --- Grid/simd/Grid_a64fx-fixedsize.h | 2 +- tests/Test_simd.cc | 76 ++++++++++++++++++-------------- 2 files changed, 44 insertions(+), 34 deletions(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 5c325b25..0004d405 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -691,7 +691,7 @@ struct Reduce{ //General form must generate error if compiled inline Out_type operator()(In_type in){ printf("Error, using wrong Reduce function\n"); - exit(1); + //exit(1); return 0; } }; diff --git 
a/tests/Test_simd.cc b/tests/Test_simd.cc index 9b0fa02b..1d7b2dc8 100644 --- a/tests/Test_simd.cc +++ b/tests/Test_simd.cc @@ -101,14 +101,14 @@ public: // FIXME still to test: // // innerProduct, -// norm2, +// norm2, // Reduce, // // mac,mult,sub,add, vone,vzero,vcomplex_i, =Zero(), // vset,vsplat,vstore,vstream,vload, scalar*vec, vec*scalar // unary -, // *= , -=, += -// outerproduct, +// outerproduct, // zeroit // permute class funcReduce { @@ -119,12 +119,12 @@ template void sfunc(reduce &rr,scal &i1,scal &i2) con std::string name(void) const { return std::string("Reduce"); } }; -template +template void Tester(const functor &func) { GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - + int Nsimd = vec::Nsimd(); ExtractBuffer input1(Nsimd); @@ -172,8 +172,10 @@ void Tester(const functor &func) } if ( ok==0 ) { std::cout< @@ -229,17 +231,19 @@ void IntTester(const functor &func) } if ( ok==0 ) { std::cout< +template void ReductionTester(const functor &func) { GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - + int Nsimd = vec::Nsimd(); ExtractBuffer input1(Nsimd); @@ -278,12 +282,14 @@ void ReductionTester(const functor &func) } if ( ok==0 ) { std::cout< +template void IntReductionTester(const functor &func) { int Nsimd = vec::Nsimd(); @@ -323,8 +329,10 @@ void IntReductionTester(const functor &func) } if ( ok==0 ) { std::cout< void operator()(vec &rr,vec &i1,vec &i2) const { permute(rr,i1,n);} - template void apply(ExtractBuffer &rr,ExtractBuffer &in) const { + template void apply(ExtractBuffer &rr,ExtractBuffer &in) const { int sz=in.size(); int msk = sz>>(n+1); for(int i=0;i void apply(ExtractBuffer &r1, ExtractBuffer &r2, ExtractBuffer &in1, - ExtractBuffer &in2) const - { + ExtractBuffer &in2) const + { int sz=in1.size(); int msk = sz>>(n+1); @@ -364,7 +372,7 @@ public: if ( (i&msk) == 0 ) { r2[i]=in1[j2];} else { r2[i]=in2[j2];} - } + } } std::string name(void) const { return std::string("Exchange"); } }; @@ -374,7 +382,7 @@ public: int n; funcRotate(int _n) { n=_n;}; template void operator()(vec &rr,vec &i1,vec &i2) const { rr=rotate(i1,n);} - template void apply(ExtractBuffer &rr,ExtractBuffer &in) const { + template void apply(ExtractBuffer &rr,ExtractBuffer &in) const { int sz = in.size(); for(int i=0;i +template void PermTester(const functor &func) { GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - + int Nsimd = vec::Nsimd(); ExtractBuffer input1(Nsimd); @@ -425,37 +433,39 @@ void PermTester(const functor &func) for(int i=0;i1.0e-7){ - std::cout< +template void ExchangeTester(const functor &func) { GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - + int Nsimd = vec::Nsimd(); ExtractBuffer input1(Nsimd); @@ -566,7 +576,7 @@ int main (int argc, char ** argv) std::cout << " Test {1,2,3,4} " << Test < seeds({1,2,3,4}); @@ -742,7 +752,7 @@ int main (int argc, char ** argv) for(int r=0;r(funcRotate(r)); } - + std::cout< Date: Tue, 12 May 2020 19:01:12 +0200 Subject: [PATCH 087/147] enabled asm kernels for fixed-size A64FXFIXEDSIZE --- .../implementation/WilsonKernelsAsmA64FX.h | 22 ++++++++----------- .../WilsonKernelsAsmBodyA64FX.h | 2 +- ...ilsonKernelsInstantiationWilsonAdjImplD.cc | 8 ++++--- ...ilsonKernelsInstantiationWilsonAdjImplF.cc | 8 ++++--- .../WilsonKernelsInstantiationWilsonImplD.cc | 8 ++++--- .../WilsonKernelsInstantiationWilsonImplDF.cc | 8 ++++--- .../WilsonKernelsInstantiationWilsonImplF.cc | 8 ++++--- .../WilsonKernelsInstantiationWilsonImplFH.cc | 8 ++++--- 
.../WilsonKernelsInstantiation.cc.master | 8 ++++--- ...tiationWilsonTwoIndexAntiSymmetricImplD.cc | 8 ++++--- ...tiationWilsonTwoIndexAntiSymmetricImplF.cc | 8 ++++--- ...stantiationWilsonTwoIndexSymmetricImplD.cc | 8 ++++--- ...stantiationWilsonTwoIndexSymmetricImplF.cc | 8 ++++--- .../WilsonKernelsInstantiationZWilsonImplD.cc | 8 ++++--- ...WilsonKernelsInstantiationZWilsonImplDF.cc | 8 ++++--- .../WilsonKernelsInstantiationZWilsonImplF.cc | 8 ++++--- ...WilsonKernelsInstantiationZWilsonImplFH.cc | 8 ++++--- 17 files changed, 85 insertions(+), 59 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h index 9b9dba74..efb0746a 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h @@ -6,10 +6,9 @@ Source file: ./lib/qcd/action/fermion/WilsonKernelsAsmA64FX.h - Copyright (C) 2015 + Copyright (C) 2020 -Author: Peter Boyle -Author: paboyle +Author: Nils Meyer Regensburg University This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -32,27 +31,24 @@ Author: paboyle #if defined(A64FXASM) -// include here if A64FX was not defined -#ifndef A64FX +// safety include #include -#endif -#pragma message("specialize A64FX Dslash") - -// undefine everything +// undefine everything related to kernels #include +// enable A64FX body #define WILSONKERNELSASMBODYA64FX -#pragma message("invoking A64FX Dslash: WilsonKernelsAsmBodyA64FX.h") +#pragma message("Including A64FX Dslash: WilsonKernelsAsmBodyA64FX.h") /////////////////////////////////////////////////////////// // If we are A64FX specialise the single precision routine /////////////////////////////////////////////////////////// #if defined(DSLASHINTRIN) -#pragma message ("invoking A64FX Dslash: intrin") +#pragma message ("including A64FX Dslash: intrin") #include #else -#pragma message ("invoking A64FX Dslash: asm") +#pragma message ("including A64FX Dslash: asm") #include #endif @@ -574,4 +570,4 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFie #undef WILSONKERNELSASMBODYA64FX #include -#endif //A64FX +#endif //A64FXASM diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h index d77b4414..406e5c25 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h @@ -6,7 +6,7 @@ Copyright (C) 2020 -Author: Nils Meyer +Author: Nils Meyer Regensburg University This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc index f2c0f9d2..f0b15e3b 100644 --- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc +++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc @@ -4,11 +4,12 @@ Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/qcd/action/fermion/WilsonKernels.cc -Copyright (C) 2015 +Copyright (C) 2015, 2020 Author: Peter Boyle Author: Peter Boyle Author: paboyle +Author: Nils Meyer Regensburg 
University This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -35,15 +36,16 @@ directory #ifndef AVX512 #ifndef QPX #ifndef A64FX +#ifndef A64FXFIXEDSIZE #include #endif #endif #endif +#endif NAMESPACE_BEGIN(Grid); #include "impl.h" -template class WilsonKernels; +template class WilsonKernels; NAMESPACE_END(Grid); - diff --git a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc index f2c0f9d2..f0b15e3b 100644 --- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc +++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc @@ -4,11 +4,12 @@ Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/qcd/action/fermion/WilsonKernels.cc -Copyright (C) 2015 +Copyright (C) 2015, 2020 Author: Peter Boyle Author: Peter Boyle Author: paboyle +Author: Nils Meyer Regensburg University This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -35,15 +36,16 @@ directory #ifndef AVX512 #ifndef QPX #ifndef A64FX +#ifndef A64FXFIXEDSIZE #include #endif #endif #endif +#endif NAMESPACE_BEGIN(Grid); #include "impl.h" -template class WilsonKernels; +template class WilsonKernels; NAMESPACE_END(Grid); - diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc index f2c0f9d2..f0b15e3b 100644 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc +++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc @@ -4,11 +4,12 @@ Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/qcd/action/fermion/WilsonKernels.cc -Copyright (C) 2015 +Copyright (C) 2015, 2020 Author: Peter Boyle Author: Peter Boyle Author: paboyle +Author: Nils Meyer Regensburg University This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -35,15 +36,16 @@ directory #ifndef AVX512 #ifndef QPX #ifndef A64FX +#ifndef A64FXFIXEDSIZE #include #endif #endif #endif +#endif NAMESPACE_BEGIN(Grid); #include "impl.h" -template class WilsonKernels; +template class WilsonKernels; NAMESPACE_END(Grid); - diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonKernelsInstantiationWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonKernelsInstantiationWilsonImplDF.cc index f2c0f9d2..f0b15e3b 100644 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonKernelsInstantiationWilsonImplDF.cc +++ b/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonKernelsInstantiationWilsonImplDF.cc @@ -4,11 +4,12 @@ Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/qcd/action/fermion/WilsonKernels.cc -Copyright (C) 2015 +Copyright (C) 2015, 2020 Author: Peter Boyle Author: Peter Boyle Author: paboyle +Author: Nils Meyer Regensburg University This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -35,15 +36,16 @@ directory #ifndef AVX512 #ifndef QPX #ifndef A64FX +#ifndef A64FXFIXEDSIZE #include #endif 
#endif #endif +#endif NAMESPACE_BEGIN(Grid); #include "impl.h" -template class WilsonKernels; +template class WilsonKernels; NAMESPACE_END(Grid); - diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc index f2c0f9d2..f0b15e3b 100644 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc +++ b/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc @@ -4,11 +4,12 @@ Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/qcd/action/fermion/WilsonKernels.cc -Copyright (C) 2015 +Copyright (C) 2015, 2020 Author: Peter Boyle Author: Peter Boyle Author: paboyle +Author: Nils Meyer Regensburg University This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -35,15 +36,16 @@ directory #ifndef AVX512 #ifndef QPX #ifndef A64FX +#ifndef A64FXFIXEDSIZE #include #endif #endif #endif +#endif NAMESPACE_BEGIN(Grid); #include "impl.h" -template class WilsonKernels; +template class WilsonKernels; NAMESPACE_END(Grid); - diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonKernelsInstantiationWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonKernelsInstantiationWilsonImplFH.cc index f2c0f9d2..f0b15e3b 100644 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonKernelsInstantiationWilsonImplFH.cc +++ b/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonKernelsInstantiationWilsonImplFH.cc @@ -4,11 +4,12 @@ Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/qcd/action/fermion/WilsonKernels.cc -Copyright (C) 2015 +Copyright (C) 2015, 2020 Author: Peter Boyle Author: Peter Boyle Author: paboyle +Author: Nils Meyer Regensburg University This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -35,15 +36,16 @@ directory #ifndef AVX512 #ifndef QPX #ifndef A64FX +#ifndef A64FXFIXEDSIZE #include #endif #endif #endif +#endif NAMESPACE_BEGIN(Grid); #include "impl.h" -template class WilsonKernels; +template class WilsonKernels; NAMESPACE_END(Grid); - diff --git a/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiation.cc.master b/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiation.cc.master index f2c0f9d2..f0b15e3b 100644 --- a/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiation.cc.master +++ b/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiation.cc.master @@ -4,11 +4,12 @@ Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/qcd/action/fermion/WilsonKernels.cc -Copyright (C) 2015 +Copyright (C) 2015, 2020 Author: Peter Boyle Author: Peter Boyle Author: paboyle +Author: Nils Meyer Regensburg University This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -35,15 +36,16 @@ directory #ifndef AVX512 #ifndef QPX #ifndef A64FX +#ifndef A64FXFIXEDSIZE #include #endif #endif #endif +#endif NAMESPACE_BEGIN(Grid); #include "impl.h" -template class WilsonKernels; +template class WilsonKernels; NAMESPACE_END(Grid); - diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc 
b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc index f2c0f9d2..f0b15e3b 100644 --- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc +++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc @@ -4,11 +4,12 @@ Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/qcd/action/fermion/WilsonKernels.cc -Copyright (C) 2015 +Copyright (C) 2015, 2020 Author: Peter Boyle Author: Peter Boyle Author: paboyle +Author: Nils Meyer Regensburg University This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -35,15 +36,16 @@ directory #ifndef AVX512 #ifndef QPX #ifndef A64FX +#ifndef A64FXFIXEDSIZE #include #endif #endif #endif +#endif NAMESPACE_BEGIN(Grid); #include "impl.h" -template class WilsonKernels; +template class WilsonKernels; NAMESPACE_END(Grid); - diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc index f2c0f9d2..f0b15e3b 100644 --- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc +++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc @@ -4,11 +4,12 @@ Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/qcd/action/fermion/WilsonKernels.cc -Copyright (C) 2015 +Copyright (C) 2015, 2020 Author: Peter Boyle Author: Peter Boyle Author: paboyle +Author: Nils Meyer Regensburg University This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -35,15 +36,16 @@ directory #ifndef AVX512 #ifndef QPX #ifndef A64FX +#ifndef A64FXFIXEDSIZE #include #endif #endif #endif +#endif NAMESPACE_BEGIN(Grid); #include "impl.h" -template class WilsonKernels; +template class WilsonKernels; NAMESPACE_END(Grid); - diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc index f2c0f9d2..f0b15e3b 100644 --- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc +++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc @@ -4,11 +4,12 @@ Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/qcd/action/fermion/WilsonKernels.cc -Copyright (C) 2015 +Copyright (C) 2015, 2020 Author: Peter Boyle Author: Peter Boyle Author: paboyle +Author: Nils Meyer Regensburg University This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -35,15 +36,16 @@ directory #ifndef AVX512 #ifndef QPX #ifndef A64FX +#ifndef A64FXFIXEDSIZE #include #endif #endif #endif +#endif NAMESPACE_BEGIN(Grid); #include "impl.h" -template class WilsonKernels; +template class 
WilsonKernels; NAMESPACE_END(Grid); - diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc index f2c0f9d2..f0b15e3b 100644 --- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc +++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc @@ -4,11 +4,12 @@ Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/qcd/action/fermion/WilsonKernels.cc -Copyright (C) 2015 +Copyright (C) 2015, 2020 Author: Peter Boyle Author: Peter Boyle Author: paboyle +Author: Nils Meyer Regensburg University This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -35,15 +36,16 @@ directory #ifndef AVX512 #ifndef QPX #ifndef A64FX +#ifndef A64FXFIXEDSIZE #include #endif #endif #endif +#endif NAMESPACE_BEGIN(Grid); #include "impl.h" -template class WilsonKernels; +template class WilsonKernels; NAMESPACE_END(Grid); - diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc index f2c0f9d2..f0b15e3b 100644 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc +++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc @@ -4,11 +4,12 @@ Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/qcd/action/fermion/WilsonKernels.cc -Copyright (C) 2015 +Copyright (C) 2015, 2020 Author: Peter Boyle Author: Peter Boyle Author: paboyle +Author: Nils Meyer Regensburg University This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -35,15 +36,16 @@ directory #ifndef AVX512 #ifndef QPX #ifndef A64FX +#ifndef A64FXFIXEDSIZE #include #endif #endif #endif +#endif NAMESPACE_BEGIN(Grid); #include "impl.h" -template class WilsonKernels; +template class WilsonKernels; NAMESPACE_END(Grid); - diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonKernelsInstantiationZWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonKernelsInstantiationZWilsonImplDF.cc index f2c0f9d2..f0b15e3b 100644 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonKernelsInstantiationZWilsonImplDF.cc +++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonKernelsInstantiationZWilsonImplDF.cc @@ -4,11 +4,12 @@ Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/qcd/action/fermion/WilsonKernels.cc -Copyright (C) 2015 +Copyright (C) 2015, 2020 Author: Peter Boyle Author: Peter Boyle Author: paboyle +Author: Nils Meyer Regensburg University This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -35,15 +36,16 @@ directory #ifndef AVX512 #ifndef QPX #ifndef A64FX +#ifndef A64FXFIXEDSIZE #include #endif #endif #endif +#endif NAMESPACE_BEGIN(Grid); #include "impl.h" -template class WilsonKernels; +template class WilsonKernels; NAMESPACE_END(Grid); - diff --git 
a/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc index f2c0f9d2..f0b15e3b 100644 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc +++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc @@ -4,11 +4,12 @@ Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/qcd/action/fermion/WilsonKernels.cc -Copyright (C) 2015 +Copyright (C) 2015, 2020 Author: Peter Boyle Author: Peter Boyle Author: paboyle +Author: Nils Meyer Regensburg University This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -35,15 +36,16 @@ directory #ifndef AVX512 #ifndef QPX #ifndef A64FX +#ifndef A64FXFIXEDSIZE #include #endif #endif #endif +#endif NAMESPACE_BEGIN(Grid); #include "impl.h" -template class WilsonKernels; +template class WilsonKernels; NAMESPACE_END(Grid); - diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonKernelsInstantiationZWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonKernelsInstantiationZWilsonImplFH.cc index f2c0f9d2..f0b15e3b 100644 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonKernelsInstantiationZWilsonImplFH.cc +++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonKernelsInstantiationZWilsonImplFH.cc @@ -4,11 +4,12 @@ Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/qcd/action/fermion/WilsonKernels.cc -Copyright (C) 2015 +Copyright (C) 2015, 2020 Author: Peter Boyle Author: Peter Boyle Author: paboyle +Author: Nils Meyer Regensburg University This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -35,15 +36,16 @@ directory #ifndef AVX512 #ifndef QPX #ifndef A64FX +#ifndef A64FXFIXEDSIZE #include #endif #endif #endif +#endif NAMESPACE_BEGIN(Grid); #include "impl.h" -template class WilsonKernels; +template class WilsonKernels; NAMESPACE_END(Grid); - From 0009b5cee819753d09198fc061ff7af2bd3f2ec9 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Tue, 12 May 2020 19:02:33 +0200 Subject: [PATCH 088/147] updated SVE_README --- SVE_README.txt | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/SVE_README.txt b/SVE_README.txt index 22d01413..886732ca 100644 --- a/SVE_README.txt +++ b/SVE_README.txt @@ -1,36 +1,32 @@ -gcc 10.0.1 VLA +* gcc 10.0.1 VLA ../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static -armclang 20.0 VLA +* gcc 10.0.1 fixed-size ACLE + +../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-op enmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" + + +* armclang 20.0 VLA ../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none 
--enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -fno-unroll-loops -mllvm -vectorizer-min-trip-count=2 -march=armv8-a+sve -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static must use armclang 20.0 with ARMCLANGCOMPAT, otherwise Benchmark_wilson gives wrong result -armclang 20.1 VLA +* armclang 20.1 VLA ../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static must use armclang 20.1 with ARMCLANGCOMPAT, otherwise Benchmark_wilson gives wrong result -Test_simd build error caused by -mcpu=a64fx ? - -Fujitsu FCC +* Fujitsu FCC ../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=FCC CC=fcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN" +* Fujitsu FCC w/ MPI -Fujitsu FCC w/ MPI - -../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=mpiFCC CC=mpifcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN" - - - - -what about "-fno-strict-aliasing" in general? - +../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=mpiFCC CC=mpifcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU" From d15ccad8a7bcc8bd23e13d8405d6799a6c4e24c0 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Tue, 12 May 2020 20:41:14 +0200 Subject: [PATCH 089/147] switched to vec* in Reduce --- Grid/simd/Grid_a64fx-fixedsize.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 0004d405..9c306569 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -698,7 +698,8 @@ struct Reduce{ //Complex float Reduce template <> // inline Grid::ComplexF Reduce::operator()(svfloat32_t in){ -inline Grid::ComplexF Reduce::operator()(__SVFloat32_t in){ +//inline Grid::ComplexF Reduce::operator()(__SVFloat32_t in){ +inline Grid::ComplexF Reduce::operator()(vecf in){ pred pg_even = acle::pg_even(); pred pg_odd = acle::pg_odd(); float a = svred(pg_even, in); @@ -708,14 +709,16 @@ inline Grid::ComplexF Reduce::operator()(__SVFloa //Real float Reduce template <> //inline Grid::RealF Reduce::operator()(svfloat32_t in){ -inline Grid::RealF Reduce::operator()(__SVFloat32_t in){ +//inline Grid::RealF Reduce::operator()(__SVFloat32_t in){ +inline Grid::RealF Reduce::operator()(vecf in){ pred pg1 = acle::pg1(); return svred(pg1, in); } //Complex double Reduce template <> //inline Grid::ComplexD Reduce::operator()(svfloat64_t in){ -inline Grid::ComplexD Reduce::operator()(__SVFloat64_t in){ +//inline Grid::ComplexD Reduce::operator()(__SVFloat64_t in){ +inline Grid::ComplexD Reduce::operator()(vecd in){ pred pg_even = acle::pg_even(); pred pg_odd = acle::pg_odd(); double a = 
svred(pg_even, in); @@ -725,14 +728,16 @@ inline Grid::ComplexD Reduce::operator()(__SVFloa //Real double Reduce template <> //inline Grid::RealD Reduce::operator()(svfloat64_t in){ -inline Grid::RealD Reduce::operator()(__SVFloat64_t in){ +//inline Grid::RealD Reduce::operator()(__SVFloat64_t in){ +inline Grid::RealD Reduce::operator()(vecd in){ pred pg1 = acle::pg1(); return svred(pg1, in); } //Integer Reduce template <> //inline Integer Reduce::operator()(svuint32_t in){ -inline Integer Reduce::operator()(__SVUint32_t in){ +//inline Integer Reduce::operator()(__SVUint32_t in){ +inline Integer Reduce::operator()(veci in){ pred pg1 = acle::pg1(); return svred(pg1, in); } From db8c0e758439a2b3e99981c98701c080287d4829 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 14 May 2020 23:17:35 +0200 Subject: [PATCH 090/147] replaced _x form with _m form when using even/odd predication --- Grid/simd/Grid_a64fx-2.h | 18 ++++++++++-------- Grid/simd/Grid_a64fx-fixedsize.h | 18 ++++++++++++------ 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h index 0419df5b..67690c01 100644 --- a/Grid/simd/Grid_a64fx-2.h +++ b/Grid/simd/Grid_a64fx-2.h @@ -422,7 +422,8 @@ struct Conj{ svbool_t pg1 = acle::pg1(); svbool_t pg_odd = acle::pg_odd(); typename acle::vt a_v = svld1(pg1, a.v); - typename acle::vt r_v = svneg_x(pg_odd, a_v); + //typename acle::vt r_v = svneg_x(pg_odd, a_v); + typename acle::vt r_v = svneg_m(pg_odd, a_v); svst1(pg1, out.v, r_v); return out; @@ -442,7 +443,7 @@ struct TimesMinusI{ typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); typename acle::vt a_v = svld1(pg1, a.v); a_v = svtbl(a_v, tbl_swap_v); - typename acle::vt r_v = svneg_x(pg_odd, a_v); + typename acle::vt r_v = svneg_m(pg_odd, a_v); svst1(pg1, out.v, r_v); return out; @@ -462,7 +463,8 @@ struct TimesI{ typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); typename acle::vt a_v = svld1(pg1, a.v); a_v = svtbl(a_v, tbl_swap_v); - typename acle::vt r_v = svneg_x(pg_even, a_v); + //typename acle::vt r_v = svneg_x(pg_even, a_v); + typename acle::vt r_v = svneg_m(pg_even, a_v); svst1(pg1, out.v, r_v); return out; @@ -593,7 +595,7 @@ struct Exchange{ -/* FIXME use svcreate etc. or switch to table lookup directly +/* FIXME use svcreate etc. 
or switch to table lookup directly template static inline void Exchange1(vec &out1, vec &out2, const vec &in1, const vec &in2){ @@ -613,11 +615,11 @@ struct Exchange{ svst4(pg4, (typename acle::pt*)out1.v, out1_v4); svst4(pg4, (typename acle::pt*)out2.v, out2_v4); } -*/ +*/ #define VECTOR_FOR(i, w, inc) \ for (unsigned int i = 0; i < w; i += inc) - + template static inline void Exchange1(vec &out1, vec &out2, const vec &in1, const vec &in2){ // FIXME @@ -625,14 +627,14 @@ struct Exchange{ const int w = W::r; unsigned int mask = w >> (n + 1); // std::cout << " Exchange "<::pg_odd(); - return svneg_x(pg_odd, a); + //return svneg_x(pg_odd, a); + return svneg_m(pg_odd, a); } // Complex double inline vecd operator()(vecd a){ pred pg_odd = acle::pg_odd(); - return svneg_x(pg_odd, a); + //return svneg_x(pg_odd, a); + return svneg_m(pg_odd, a); } }; @@ -401,7 +403,8 @@ struct TimesMinusI{ pred pg_odd = acle::pg_odd(); vecf a_v = svtbl(a, tbl_swap); - return svneg_x(pg_odd, a_v); + //return svneg_x(pg_odd, a_v); + return svneg_m(pg_odd, a_v); } // Complex double inline vecd operator()(vecd a, vecd b){ @@ -410,7 +413,8 @@ struct TimesMinusI{ pred pg_odd = acle::pg_odd(); vecd a_v = svtbl(a, tbl_swap); - return svneg_x(pg_odd, a_v); + //return svneg_x(pg_odd, a_v); + return svneg_m(pg_odd, a_v); } }; @@ -422,7 +426,8 @@ struct TimesI{ pred pg_even = acle::pg_even(); vecf a_v = svtbl(a, tbl_swap); - return svneg_x(pg_even, a_v); + //return svneg_x(pg_even, a_v); + return svneg_m(pg_even, a_v); } // Complex double inline vecd operator()(vecd a, vecd b){ @@ -431,7 +436,8 @@ struct TimesI{ pred pg_even = acle::pg_even(); vecd a_v = svtbl(a, tbl_swap); - return svneg_x(pg_even, a_v); + //return svneg_x(pg_even, a_v); + return svneg_m(pg_even, a_v); } }; From 10a34312dce7eb5d02ea679a0c103ad4dd1f4eb2 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 14 May 2020 23:20:16 +0200 Subject: [PATCH 091/147] some fixed-size code clean up --- Grid/simd/Grid_a64fx-fixedsize.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 696acb76..cb486401 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -96,7 +96,6 @@ struct acle{}; template <> struct acle{ - static inline pred pg1(){return svptrue_b64();} static inline lutd tbl_swap(){ const ulutd t = { .s = {1, 0, 3, 2, 5, 4, 7, 6} }; return t.v; @@ -109,6 +108,7 @@ struct acle{ const ulutd t = { .s = {2, 3, 0, 1, 6, 7, 4, 5} }; return t.v; } + static inline pred pg1(){return svptrue_b64();} static inline pred pg_even(){return svzip1_b64(svptrue_b64(), svpfalse_b());} static inline pred pg_odd() {return svzip1_b64(svpfalse_b(), svptrue_b64());} static inline vecd zero(){return svdup_f64(0.);} @@ -116,7 +116,6 @@ struct acle{ template <> struct acle{ - static inline pred pg1(){return svptrue_b32();} // exchange neighboring elements static inline lutf tbl_swap(){ const ulutf t = { .s = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; @@ -134,6 +133,7 @@ struct acle{ const ulutf t = { .s = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13} }; return t.v; } + static inline pred pg1(){return svptrue_b32();} static inline pred pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());} static inline pred pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());} static inline vecf zero(){return svdup_f32(0.);} From 015d8bb38ae947d86df72ec1d3f14a0e8113fc43 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 15 May 2020 09:15:50 +0200 
Subject: [PATCH 092/147] introduced assertions in Benchmark_wilson, removed data output from Benchmark_dwf --- benchmarks/Benchmark_dwf.cc | 36 ++++++++++++++++++++-------------- benchmarks/Benchmark_wilson.cc | 27 +++++++++++++++---------- 2 files changed, 38 insertions(+), 25 deletions(-) diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index 37b33b0e..53630650 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -1,5 +1,5 @@ /************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid + Grid physics library, www.github.com/paboyle/Grid Source file: ./benchmarks/Benchmark_dwf.cc Copyright (C) 2015 @@ -77,7 +77,7 @@ int main (int argc, char ** argv) std::vector seeds4({1,2,3,4}); std::vector seeds5({5,6,7,8}); - + std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG")); std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; @@ -107,8 +107,8 @@ int main (int argc, char ** argv) LatticeFermion err(FGrid); std::cout << GridLogMessage << "Drawing gauge field" << std::endl; - LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4,Umu); + LatticeGaugeField Umu(UGrid); + SU3::HotConfiguration(RNG4,Umu); std::cout << GridLogMessage << "Random gauge initialised " << std::endl; #if 0 Umu=1.0; @@ -126,7 +126,7 @@ int main (int argc, char ** argv) // Naive wilson implementation //////////////////////////////////// // replicate across fifth dimension - LatticeGaugeField Umu5d(FGrid); + LatticeGaugeField Umu5d(FGrid); std::vector U(4,FGrid); { auto Umu5d_v = Umu5d.View(); @@ -197,13 +197,13 @@ int main (int argc, char ** argv) } double t1=usecond(); FGrid->Barrier(); - + double volume=Ls; for(int mu=0;mu1.0e-4) ) { + if(( norm2(err)>1.0e-4) ) { + /* std::cout << "RESULT\n " << result<Barrier(); exit(-1); } @@ -243,7 +246,7 @@ int main (int argc, char ** argv) } double t1=usecond(); FGrid->Barrier(); - + double volume=Ls; for(int mu=0;mu1.0e-4)){ +/* std::cout<< "DAG RESULT\n " <Barrier(); - + double volume=Ls; for(int mu=0;mu1.0e-4)){ + /* std::cout<< "Deo RESULT\n " <(Umu,mu); } - + { // Naive wilson implementation ref = Zero(); for(int mu=0;mu Date: Fri, 15 May 2020 10:01:05 +0200 Subject: [PATCH 093/147] implemented correct _m form (using 3 operands instead of 2) --- Grid/simd/Grid_a64fx-2.h | 6 +++--- Grid/simd/Grid_a64fx-fixedsize.h | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h index 67690c01..55bf1e72 100644 --- a/Grid/simd/Grid_a64fx-2.h +++ b/Grid/simd/Grid_a64fx-2.h @@ -423,7 +423,7 @@ struct Conj{ svbool_t pg_odd = acle::pg_odd(); typename acle::vt a_v = svld1(pg1, a.v); //typename acle::vt r_v = svneg_x(pg_odd, a_v); - typename acle::vt r_v = svneg_m(pg_odd, a_v); + typename acle::vt r_v = svneg_m(a_v, pg_odd, a_v); svst1(pg1, out.v, r_v); return out; @@ -443,7 +443,7 @@ struct TimesMinusI{ typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); typename acle::vt a_v = svld1(pg1, a.v); a_v = svtbl(a_v, tbl_swap_v); - typename acle::vt r_v = svneg_m(pg_odd, a_v); + typename acle::vt r_v = svneg_m(a_v, pg_odd, a_v); svst1(pg1, out.v, r_v); return out; @@ -464,7 +464,7 @@ struct TimesI{ typename acle::vt a_v = svld1(pg1, a.v); a_v = svtbl(a_v, tbl_swap_v); //typename acle::vt r_v = svneg_x(pg_even, a_v); - typename acle::vt r_v = svneg_m(pg_even, a_v); + typename acle::vt r_v = 
svneg_m(a_v, pg_even, a_v); svst1(pg1, out.v, r_v); return out; diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index cb486401..387a1f99 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -385,13 +385,13 @@ struct Conj{ inline vecf operator()(vecf a){ pred pg_odd = acle::pg_odd(); //return svneg_x(pg_odd, a); - return svneg_m(pg_odd, a); + return svneg_m(a, pg_odd, a); } // Complex double inline vecd operator()(vecd a){ pred pg_odd = acle::pg_odd(); //return svneg_x(pg_odd, a); - return svneg_m(pg_odd, a); + return svneg_m(a, pg_odd, a); } }; @@ -404,7 +404,7 @@ struct TimesMinusI{ vecf a_v = svtbl(a, tbl_swap); //return svneg_x(pg_odd, a_v); - return svneg_m(pg_odd, a_v); + return svneg_m(a_v, pg_odd, a_v); } // Complex double inline vecd operator()(vecd a, vecd b){ @@ -414,7 +414,7 @@ struct TimesMinusI{ vecd a_v = svtbl(a, tbl_swap); //return svneg_x(pg_odd, a_v); - return svneg_m(pg_odd, a_v); + return svneg_m(a_v, pg_odd, a_v); } }; @@ -427,7 +427,7 @@ struct TimesI{ vecf a_v = svtbl(a, tbl_swap); //return svneg_x(pg_even, a_v); - return svneg_m(pg_even, a_v); + return svneg_m(a_v, pg_even, a_v); } // Complex double inline vecd operator()(vecd a, vecd b){ @@ -437,7 +437,7 @@ struct TimesI{ vecd a_v = svtbl(a, tbl_swap); //return svneg_x(pg_even, a_v); - return svneg_m(pg_even, a_v); + return svneg_m(a_v, pg_even, a_v); } }; From 032f7dde1aa10c131c0f30ca97994b3b3577d0d0 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Mon, 18 May 2020 19:10:36 +0200 Subject: [PATCH 094/147] update SVE readme, asm generator --- Grid/simd/gridverter.py | 11 +++++++---- SVE_README.txt | 25 ++++++++++++++++--------- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/Grid/simd/gridverter.py b/Grid/simd/gridverter.py index 7628159b..f00a5019 100755 --- a/Grid/simd/gridverter.py +++ b/Grid/simd/gridverter.py @@ -116,7 +116,7 @@ STORE_BASE_PTR_COLOR_OFFSET = 2 OPT = """ * interleave prefetching and compute in MULT_2SPIN -* could test storing U's in MULT_2SPIN to L1d, might be beneficial for life time cache lines +* could test storing U's in MULT_2SPIN to L1d for cache line update * structure reordering: MAYBEPERM after MULT_2SPIN ? 
""" @@ -375,7 +375,12 @@ class Register: def zero(self, zeroreg=False): d['zero'] += d['factor'] d['C'] += F' {self.name} = 0; \\\n' - d['I'] += F' {self.name} = __svzero({self.name}); \\\n' + #d['I'] += F' {self.name} = __svzero({self.name}); \\\n' only armclang + + if PRECISION == 'double': + d['I'] += F' {self.name} = svdup_f64(0.); \\\n' + else: + d['I'] += F' {self.name} = svdup_f32(0.); \\\n' if zeroreg == True: d['A'] += F' "fmov {self.asmregwithsuffix} , 0 \\n\\t" \\\n' @@ -906,8 +911,6 @@ else: define(F'LOCK_GAUGE(A)') define(F'UNLOCK_GAUGE(A)') define(F'MASK_REGS DECLARATIONS_{PRECSUFFIX}') -define(F'COMPLEX_SIGNS(A)') -define(F'LOAD64(A,B)') define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_RESULT_L2_STORE(B)') define(F'MULT_2SPIN_1(Dir) MULT_2SPIN_1_{PRECSUFFIX}(Dir)') define(F'MULT_2SPIN_2 MULT_2SPIN_2_{PRECSUFFIX}') diff --git a/SVE_README.txt b/SVE_README.txt index 886732ca..609cf111 100644 --- a/SVE_README.txt +++ b/SVE_README.txt @@ -1,32 +1,39 @@ -* gcc 10.0.1 VLA +* gcc 10.0.1 VLA (merlin) ../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static -* gcc 10.0.1 fixed-size ACLE +* gcc 10.0.1 fixed-size ACLE (merlin) -../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-op enmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" +../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" -* armclang 20.0 VLA +* armclang 20.0 VLA (merlin) ../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -fno-unroll-loops -mllvm -vectorizer-min-trip-count=2 -march=armv8-a+sve -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static -must use armclang 20.0 with ARMCLANGCOMPAT, otherwise Benchmark_wilson gives wrong result +TODO check ARMCLANGCOMPAT -* armclang 20.1 VLA +* armclang 20.1 VLA (merlin) ../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static -must use armclang 20.1 with ARMCLANGCOMPAT, otherwise Benchmark_wilson gives wrong result +TODO check ARMCLANGCOMPAT -* Fujitsu FCC +* armclang 20.1 VLA (fjt cluster) + +../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" + +TODO check 
ARMCLANGCOMPAT + + +* Fujitsu fcc ../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=FCC CC=fcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN" -* Fujitsu FCC w/ MPI +* Fujitsu fcc w/ MPI ../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=mpiFCC CC=mpifcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU" From 9f212679f1ee86b66198a5239db852e24224db4d Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Mon, 18 May 2020 19:55:18 +0200 Subject: [PATCH 095/147] support fcmla in vector_types, untested --- Grid/simd/Grid_a64fx-2.h | 46 +++++++++++++++++++++++--------- Grid/simd/Grid_a64fx-fixedsize.h | 39 ++++++++++++++++++++------- Grid/simd/Grid_vector_types.h | 8 ++++++ 3 files changed, 71 insertions(+), 22 deletions(-) diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h index 55bf1e72..cc25faa3 100644 --- a/Grid/simd/Grid_a64fx-2.h +++ b/Grid/simd/Grid_a64fx-2.h @@ -388,8 +388,29 @@ struct MultComplex{ typename acle::vt z_v = acle::zero(); // using FCMLA - typename acle::vt r_v = svcmla_x(pg1, z_v, a_v, b_v, 90); - r_v = svcmla_x(pg1, r_v, a_v, b_v, 0); + typename acle::vt r_v = svcmla_x(pg1, z_v, a_v, b_v, 0); + r_v = svcmla_x(pg1, r_v, a_v, b_v, 90); + + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct MultAddComplex{ + // Complex a*b+c + template + inline vec mac(const vec &a, const vec &b, const vec &c){ + + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt c_v = svld1(pg1, c.v);; + + // using FCMLA + typename acle::vt r_v = svcmla_x(pg1, c_v, a_v, b_v, 0); + r_v = svcmla_x(pg1, r_v, a_v, b_v, 90); svst1(pg1, out.v, r_v); @@ -897,15 +918,16 @@ typedef Optimization::Vstream VstreamSIMD; template using ReduceSIMD = Optimization::Reduce; // Arithmetic operations -typedef Optimization::Sum SumSIMD; -typedef Optimization::Sub SubSIMD; -typedef Optimization::Div DivSIMD; -typedef Optimization::Mult MultSIMD; -typedef Optimization::MultComplex MultComplexSIMD; -typedef Optimization::MultRealPart MultRealPartSIMD; -typedef Optimization::MaddRealPart MaddRealPartSIMD; -typedef Optimization::Conj ConjSIMD; -typedef Optimization::TimesMinusI TimesMinusISIMD; -typedef Optimization::TimesI TimesISIMD; +typedef Optimization::Sum SumSIMD; +typedef Optimization::Sub SubSIMD; +typedef Optimization::Div DivSIMD; +typedef Optimization::Mult MultSIMD; +typedef Optimization::MultComplex MultComplexSIMD; +typedef Optimization::MultAddComplex MultAddComplexSIMD; +typedef Optimization::MultRealPart MultRealPartSIMD; +typedef Optimization::MaddRealPart MaddRealPartSIMD; +typedef Optimization::Conj ConjSIMD; +typedef Optimization::TimesMinusI TimesMinusISIMD; +typedef Optimization::TimesI TimesISIMD; NAMESPACE_END(Grid) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 387a1f99..e1770988 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -367,6 +367,24 @@ struct MultComplex{ } }; +struct MultAddComplex{ + // Complex a*b+c + // Complex float + inline vecf mac(vecf a, vecf b, vecf c){ + pred pg1 = acle::pg1(); + // using FCMLA 
+ vecf r_v = svcmla_x(pg1, c, a, b, 0); + return svcmla_x(pg1, r_v, a, b, 90); + } + // Complex double + inline vecd mac(vecd a, vecd b, vecd c){ + pred pg1 = acle::pg1(); + // using FCMLA + vecf r_v = svcmla_x(pg1, c, a, b, 0); + return svcmla_x(pg1, r_v, a, b, 90); + } +}; + struct Div{ // Real float inline vecf operator()(vecf a, vecf b){ @@ -772,15 +790,16 @@ typedef Optimization::Vstream VstreamSIMD; template using ReduceSIMD = Optimization::Reduce; // Arithmetic operations -typedef Optimization::Sum SumSIMD; -typedef Optimization::Sub SubSIMD; -typedef Optimization::Div DivSIMD; -typedef Optimization::Mult MultSIMD; -typedef Optimization::MultComplex MultComplexSIMD; -typedef Optimization::MultRealPart MultRealPartSIMD; -typedef Optimization::MaddRealPart MaddRealPartSIMD; -typedef Optimization::Conj ConjSIMD; -typedef Optimization::TimesMinusI TimesMinusISIMD; -typedef Optimization::TimesI TimesISIMD; +typedef Optimization::Sum SumSIMD; +typedef Optimization::Sub SubSIMD; +typedef Optimization::Div DivSIMD; +typedef Optimization::Mult MultSIMD; +typedef Optimization::MultComplex MultComplexSIMD; +typedef Optimization::MultAddComplex MultAddComplexSIMD; +typedef Optimization::MultRealPart MultRealPartSIMD; +typedef Optimization::MaddRealPart MaddRealPartSIMD; +typedef Optimization::Conj ConjSIMD; +typedef Optimization::TimesMinusI TimesMinusISIMD; +typedef Optimization::TimesI TimesISIMD; NAMESPACE_END(Grid); diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index 3a2e7228..1e1ae71e 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -298,11 +298,19 @@ public: /////////////////////////////////////////////// // FIXME -- alias this to an accelerator_inline MAC struct. + #if defined(A64FX) || defined(A64FXFIXEDSIZE) // on A64FX use FCMLA + friend accelerator_inline void mac(Grid_simd *__restrict__ y, + const Grid_simd *__restrict__ a, + const Grid_simd *__restrict__ x) { + y->v = Optimization::MultAddComplex::mac(a->v, x->v, y->v); + }; + #else friend accelerator_inline void mac(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ a, const Grid_simd *__restrict__ x) { *y = (*a) * (*x) + (*y); }; + #endif friend accelerator_inline void mult(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ l, From 323a651c71a14e02e1f9761e28c8b48888a4bfc6 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Mon, 18 May 2020 19:58:27 +0200 Subject: [PATCH 096/147] correct typo --- Grid/simd/Grid_a64fx-fixedsize.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index e1770988..46cb9354 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -380,7 +380,7 @@ struct MultAddComplex{ inline vecd mac(vecd a, vecd b, vecd c){ pred pg1 = acle::pg1(); // using FCMLA - vecf r_v = svcmla_x(pg1, c, a, b, 0); + vecd r_v = svcmla_x(pg1, c, a, b, 0); return svcmla_x(pg1, r_v, a, b, 90); } }; From 6b6bf537d33831ce25f9d87abb55bb1975558220 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Mon, 18 May 2020 20:31:44 +0200 Subject: [PATCH 097/147] comment out mac in vector types --- Grid/simd/Grid_vector_types.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index 1e1ae71e..199d1487 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -298,19 +298,24 @@ public: /////////////////////////////////////////////// // FIXME -- alias this to an 
accelerator_inline MAC struct. - #if defined(A64FX) || defined(A64FXFIXEDSIZE) // on A64FX use FCMLA + + // A64FX: use FCMLA + /* + #if defined(A64FX) || defined(A64FXFIXEDSIZE) // A64FX: use FCMLA friend accelerator_inline void mac(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ a, const Grid_simd *__restrict__ x) { y->v = Optimization::MultAddComplex::mac(a->v, x->v, y->v); }; #else + #endif + + */ friend accelerator_inline void mac(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ a, const Grid_simd *__restrict__ x) { *y = (*a) * (*x) + (*y); }; - #endif friend accelerator_inline void mult(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ l, From 9e085bd04edf96de76f56f0c90661b91a870f0db Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Wed, 20 May 2020 19:16:30 +0200 Subject: [PATCH 098/147] guard prevents multiple A64FX build messages --- .../implementation/WilsonKernelsAsmA64FX.h | 6 ++--- Grid/simd/Grid_a64fx-fixedsize.h | 2 +- Grid/simd/Grid_vector_types.h | 11 +++++++-- SVE_README.txt | 24 +++++++++++++++++++ 4 files changed, 37 insertions(+), 6 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h index efb0746a..34da3110 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h @@ -39,16 +39,16 @@ Author: Nils Meyer Regensburg University // enable A64FX body #define WILSONKERNELSASMBODYA64FX -#pragma message("Including A64FX Dslash: WilsonKernelsAsmBodyA64FX.h") +#pragma message("A64FX Dslash: WilsonKernelsAsmBodyA64FX.h") /////////////////////////////////////////////////////////// // If we are A64FX specialise the single precision routine /////////////////////////////////////////////////////////// #if defined(DSLASHINTRIN) -#pragma message ("including A64FX Dslash: intrin") +#pragma message ("A64FX Dslash: intrin") #include #else -#pragma message ("including A64FX Dslash: asm") +#pragma message ("A64FX Dslash: asm") #include #endif diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 46cb9354..edccfa82 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -39,7 +39,7 @@ // gcc 10 features #if __ARM_FEATURE_SVE_BITS==512 -#pragma message("Fixed-size SVE ACLE") +#pragma message("building for A64FX / fixed SVE data type size") /* gcc 10.0.1 and gcc 10.1 bug using ACLE data types CAS-159553-Y1K4C6 workaround: use gcc's internal data types, bugfix expected for gcc 10.2 typedef svbool_t pred __attribute__((arm_sve_vector_bits(512))); diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index 199d1487..ddf37ba9 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -120,11 +120,15 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { #if defined(A64FX) || defined(A64FXFIXEDSIZE) // breakout A64FX SVE ACLE here #include #if defined(A64FX) // VLA + #ifndef MSGVLA #pragma message("building for A64FX / SVE ACLE VLA") + #define MSGVLA + #endif #if defined(ARMCLANGCOMPAT) + #ifndef MSGCOMPAT #pragma message("applying armclang fix") - //#else - //#pragma message("not applying armclang fix") + #define MSGCOMPAT + #endif #endif #include "Grid_a64fx-2.h" #endif @@ -132,7 +136,10 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { #include "Grid_a64fx-fixedsize.h" #endif #else + #ifndef MSGGEN #pragma message("building for GEN") // generic + #define MSGGEN + 
#endif #include "Grid_generic.h" #endif #endif diff --git a/SVE_README.txt b/SVE_README.txt index 609cf111..be4f1baa 100644 --- a/SVE_README.txt +++ b/SVE_README.txt @@ -8,6 +8,22 @@ ../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" +* gcc 10.0.1 fixed-size ACLE (fjt) w/ MPI + +export OMPI_CC=gcc-10.0.1 +export OMPI_CXX=g++-10.0.1 +export MPICH_CC=gcc-10.0.1 +export MPICH_CXX=g++-10.0.1 + +$ ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.24/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.24/lib64 -lrt" + +works! but binaries do not finish when running via job scheduler. problem with MPI_finalize ? + +interactive login: mpirun -np 1 ./Benchmark_wilson_sweep --dslash-asm + [WARN] PLE 0610 plexec The process terminated with the signal.(rank=0)(nid=0xff010008)(sig=9) ? + +-------------------------------------------------------- + * armclang 20.0 VLA (merlin) ../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -fno-unroll-loops -mllvm -vectorizer-min-trip-count=2 -march=armv8-a+sve -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static @@ -29,6 +45,14 @@ TODO check ARMCLANGCOMPAT TODO check ARMCLANGCOMPAT +* armclang 20.1 VLA (fjt cluster) + +../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.24/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.24/lib64" + +No ARMCLANGCOMPAT -> still correct ? 
+ +-------------------------------------------------------- + * Fujitsu fcc ../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=FCC CC=fcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN" From b780b7b7a00f10ddc230f733aab7c8cf82e5ce10 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Wed, 20 May 2020 19:20:59 +0200 Subject: [PATCH 099/147] guard prevents multiple TOFU messages --- Grid/communicator/Communicator_mpi3.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 31566352..cb03ca46 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -44,12 +44,15 @@ void CartesianCommunicator::Init(int *argc, char ***argv) MPI_Initialized(&flag); // needed to coexist with other libs apparently if ( !flag ) { -// temporarily enable Fugaku/Tofu support by default +// Fugaku Tofu: enable by default #if defined (A64FX) || defined (A64FXFIXEDSIZE) +#ifndef TOFU #define TOFU +#pragma message ("MPI_THREAD_SINGLE") +#endif #endif -#if defined (TOFU) // hack for FUGAKU, credits go to Issaku Kanamori +#if defined (TOFU) // FUGAKU, credits go to Issaku Kanamori nCommThreads=1; MPI_Init(argc,argv); #else From 9a860597613f20ed2fdebb4cbee4adc8dcf57e4a Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Wed, 20 May 2020 20:05:42 +0200 Subject: [PATCH 100/147] symmetrize VLA and fixed size build messages --- Grid/communicator/Communicator_mpi3.cc | 2 +- Grid/simd/Grid_a64fx-fixedsize.h | 1 - Grid/simd/Grid_vector_types.h | 10 +--------- 3 files changed, 2 insertions(+), 11 deletions(-) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index cb03ca46..15b8cbfd 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -48,7 +48,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) #if defined (A64FX) || defined (A64FXFIXEDSIZE) #ifndef TOFU #define TOFU -#pragma message ("MPI_THREAD_SINGLE") +#pragma message ("TOFU network / MPI_THREAD_SINGLE") #endif #endif diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index edccfa82..3a0066d1 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -39,7 +39,6 @@ // gcc 10 features #if __ARM_FEATURE_SVE_BITS==512 -#pragma message("building for A64FX / fixed SVE data type size") /* gcc 10.0.1 and gcc 10.1 bug using ACLE data types CAS-159553-Y1K4C6 workaround: use gcc's internal data types, bugfix expected for gcc 10.2 typedef svbool_t pred __attribute__((arm_sve_vector_bits(512))); diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index ddf37ba9..e2624e15 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -120,26 +120,18 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { #if defined(A64FX) || defined(A64FXFIXEDSIZE) // breakout A64FX SVE ACLE here #include #if defined(A64FX) // VLA - #ifndef MSGVLA #pragma message("building for A64FX / SVE ACLE VLA") - #define MSGVLA - #endif #if defined(ARMCLANGCOMPAT) - #ifndef MSGCOMPAT #pragma message("applying armclang fix") - #define MSGCOMPAT - #endif #endif #include "Grid_a64fx-2.h" #endif #if defined(A64FXFIXEDSIZE) // fixed size data types + #pragma message("building for 
A64FX / SVE ACLE fixed size") #include "Grid_a64fx-fixedsize.h" #endif #else - #ifndef MSGGEN #pragma message("building for GEN") // generic - #define MSGGEN - #endif #include "Grid_generic.h" #endif #endif From 81484a47604e1041c437501c265e9fe02bf2e2e7 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Wed, 20 May 2020 22:36:45 +0200 Subject: [PATCH 101/147] symmetrize Mult and MultAddComplex --- Grid/simd/Grid_a64fx-2.h | 6 ++---- Grid/simd/Grid_a64fx-fixedsize.h | 8 ++++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h index cc25faa3..d92f8b40 100644 --- a/Grid/simd/Grid_a64fx-2.h +++ b/Grid/simd/Grid_a64fx-2.h @@ -400,7 +400,7 @@ struct MultComplex{ struct MultAddComplex{ // Complex a*b+c template - inline vec mac(const vec &a, const vec &b, const vec &c){ + inline vec mac(const vec &a, const vec b, const vec c){ vec out; svbool_t pg1 = acle::pg1(); @@ -412,9 +412,7 @@ struct MultAddComplex{ typename acle::vt r_v = svcmla_x(pg1, c_v, a_v, b_v, 0); r_v = svcmla_x(pg1, r_v, a_v, b_v, 90); - svst1(pg1, out.v, r_v); - - return out; + svst1(pg1, a.v, r_v); } }; diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 3a0066d1..9e0fc51e 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -369,18 +369,18 @@ struct MultComplex{ struct MultAddComplex{ // Complex a*b+c // Complex float - inline vecf mac(vecf a, vecf b, vecf c){ + inline vecf mac(vecf &a, vecf b, vecf c){ pred pg1 = acle::pg1(); // using FCMLA vecf r_v = svcmla_x(pg1, c, a, b, 0); - return svcmla_x(pg1, r_v, a, b, 90); + a = svcmla_x(pg1, r_v, a, b, 90); } // Complex double - inline vecd mac(vecd a, vecd b, vecd c){ + inline vecd mac(vecd &a, vecd b, vecd c){ pred pg1 = acle::pg1(); // using FCMLA vecd r_v = svcmla_x(pg1, c, a, b, 0); - return svcmla_x(pg1, r_v, a, b, 90); + a = svcmla_x(pg1, r_v, a, b, 90); } }; From 832485699f239cb77ec2303f6874bf81c8b5ddbc Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Wed, 20 May 2020 23:04:35 +0200 Subject: [PATCH 102/147] save some cycles in HtoD and DtoH by direct instead of multi-pass conversion --- Grid/simd/Grid_a64fx-fixedsize.h | 77 +++++++++++++++----------------- 1 file changed, 35 insertions(+), 42 deletions(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 9e0fc51e..28fafa27 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -401,13 +401,13 @@ struct Conj{ // Complex float inline vecf operator()(vecf a){ pred pg_odd = acle::pg_odd(); - //return svneg_x(pg_odd, a); + //return svneg_x(pg_odd, a); this is unsafe! return svneg_m(a, pg_odd, a); } // Complex double inline vecd operator()(vecd a){ pred pg_odd = acle::pg_odd(); - //return svneg_x(pg_odd, a); + //return svneg_x(pg_odd, a); this is unsafe! 
return svneg_m(a, pg_odd, a); } }; @@ -420,7 +420,7 @@ struct TimesMinusI{ pred pg_odd = acle::pg_odd(); vecf a_v = svtbl(a, tbl_swap); - //return svneg_x(pg_odd, a_v); + //return svneg_x(pg_odd, a_v); this is unsafe return svneg_m(a_v, pg_odd, a_v); } // Complex double @@ -430,7 +430,7 @@ struct TimesMinusI{ pred pg_odd = acle::pg_odd(); vecd a_v = svtbl(a, tbl_swap); - //return svneg_x(pg_odd, a_v); + //return svneg_x(pg_odd, a_v); this is unsafe return svneg_m(a_v, pg_odd, a_v); } }; @@ -443,7 +443,7 @@ struct TimesI{ pred pg_even = acle::pg_even(); vecf a_v = svtbl(a, tbl_swap); - //return svneg_x(pg_even, a_v); + //return svneg_x(pg_even, a_v); this is unsafe return svneg_m(a_v, pg_even, a_v); } // Complex double @@ -453,7 +453,7 @@ struct TimesI{ pred pg_even = acle::pg_even(); vecd a_v = svtbl(a, tbl_swap); - //return svneg_x(pg_even, a_v); + //return svneg_x(pg_even, a_v); this is unsafe return svneg_m(a_v, pg_even, a_v); } }; @@ -486,54 +486,47 @@ struct PrecisionChange { b = svcvt_f64_x(pg1d, sb_v); } static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) { -/* - vech ret; - svbool_t pg1d = acle::pg1(); - svbool_t pg1h = acle::pg1(); - typename acle::vt a_v = svld1(pg1d, a.v); - typename acle::vt b_v = svld1(pg1d, b.v); - typename acle::vt c_v = svld1(pg1d, c.v); - typename acle::vt d_v = svld1(pg1d, d.v); - typename acle::vt ha_v = svcvt_f16_x(pg1d, a_v); - typename acle::vt hb_v = svcvt_f16_x(pg1d, b_v); - typename acle::vt hc_v = svcvt_f16_x(pg1d, c_v); - typename acle::vt hd_v = svcvt_f16_x(pg1d, d_v); - typename acle::vt hab_v = svuzp1(ha_v, hb_v); - typename acle::vt hcd_v = svuzp1(hc_v, hd_v); - typename acle::vt r_v = svuzp1(hab_v, hcd_v); - svst1(pg1h, (typename acle::pt*)&ret.v, r_v); + pred pg1d = acle::pg1(); + pred pg1h = acle::pg1(); + vecd a_v = svld1(pg1d, a.v); + vecd b_v = svld1(pg1d, b.v); + vecd c_v = svld1(pg1d, c.v); + vecd d_v = svld1(pg1d, d.v); + vech ha_v = svcvt_f16_x(pg1d, a_v); + vech hb_v = svcvt_f16_x(pg1d, b_v); + vech hc_v = svcvt_f16_x(pg1d, c_v); + vech hd_v = svcvt_f16_x(pg1d, d_v); + vech hab_v = svuzp1(ha_v, hb_v); + vech hcd_v = svuzp1(hc_v, hd_v); + return r_v = svuzp1(hab_v, hcd_v); - return ret; -*/ +/* vecf sa,sb; sa = DtoS(a,b); sb = DtoS(c,d); return StoH(sa,sb); +*/ } static inline void HtoD(vech h,vecd &a,vecd &b,vecd &c,vecd &d) { + pred pg1h = acle::pg1(); + pred pg1d = acle::pg1(); + vech sa_v = svzip1(h_v, h_v); + vech sb_v = svzip2(h_v, h_v); + vech da_v = svzip1(sa_v, sa_v); + vech db_v = svzip2(sa_v, sa_v); + vech dc_v = svzip1(sb_v, sb_v); + vech dd_v = svzip2(sb_v, sb_v); + vecd a = svcvt_f64_x(pg1d, da_v); + vecd b = svcvt_f64_x(pg1d, db_v); + vecd c = svcvt_f64_x(pg1d, dc_v); + vecd d = svcvt_f64_x(pg1d, dd_v); + /* - svbool_t pg1h = acle::pg1(); - svbool_t pg1d = acle::pg1(); - typename acle::vt h_v = svld1(pg1h, (typename acle::pt*)&h.v); - typename acle::vt sa_v = svzip1(h_v, h_v); - typename acle::vt sb_v = svzip2(h_v, h_v); - typename acle::vt da_v = svzip1(sa_v, sa_v); - typename acle::vt db_v = svzip2(sa_v, sa_v); - typename acle::vt dc_v = svzip1(sb_v, sb_v); - typename acle::vt dd_v = svzip2(sb_v, sb_v); - typename acle::vt a_v = svcvt_f64_x(pg1d, da_v); - typename acle::vt b_v = svcvt_f64_x(pg1d, db_v); - typename acle::vt c_v = svcvt_f64_x(pg1d, dc_v); - typename acle::vt d_v = svcvt_f64_x(pg1d, dd_v); - svst1(pg1d, a.v, a_v); - svst1(pg1d, b.v, b_v); - svst1(pg1d, c.v, c_v); - svst1(pg1d, d.v, d_v); -*/ vecf sa,sb; HtoS(h,sa,sb); StoD(sa,a,b); StoD(sb,c,d); +*/ } }; From f8c0a59221cfea1dd674acdfeddaa1b615d0b848 
Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 21 May 2020 02:48:14 +0200 Subject: [PATCH 103/147] clean up; Exch1 dp integrate, tested, working --- Grid/simd/Grid_a64fx-2.h | 2 +- Grid/simd/Grid_a64fx-fixedsize.h | 87 ++++++++++++++------------------ 2 files changed, 39 insertions(+), 50 deletions(-) diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h index d92f8b40..c03184df 100644 --- a/Grid/simd/Grid_a64fx-2.h +++ b/Grid/simd/Grid_a64fx-2.h @@ -400,7 +400,7 @@ struct MultComplex{ struct MultAddComplex{ // Complex a*b+c template - inline vec mac(const vec &a, const vec b, const vec c){ + inline mac(const vec &a, const vec b, const vec c){ vec out; svbool_t pg1 = acle::pg1(); diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 28fafa27..eef28b83 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -107,6 +107,18 @@ struct acle{ const ulutd t = { .s = {2, 3, 0, 1, 6, 7, 4, 5} }; return t.v; } + static inline lutd tbl_exch1a(){ // Exchange1 + const ulutd t = { .s = {0, 1, 4, 5, 2, 3, 6, 7} }; + return t.v; + } + static inline lutd tbl_exch1b(){ // Exchange1 + const ulutd t = { .s = {2, 3, 6, 7, 0, 1, 4, 5} }; + return t.v; + } + static inline lutd tbl_exch1c(){ // Exchange1 + const ulutd t = { .s = {4, 5, 0, 1, 6, 7, 2, 3} }; + return t.v; + } static inline pred pg1(){return svptrue_b64();} static inline pred pg_even(){return svzip1_b64(svptrue_b64(), svpfalse_b());} static inline pred pg_odd() {return svzip1_b64(svpfalse_b(), svptrue_b64());} @@ -369,14 +381,14 @@ struct MultComplex{ struct MultAddComplex{ // Complex a*b+c // Complex float - inline vecf mac(vecf &a, vecf b, vecf c){ + inline void mac(vecf &a, vecf b, vecf c){ pred pg1 = acle::pg1(); // using FCMLA vecf r_v = svcmla_x(pg1, c, a, b, 0); a = svcmla_x(pg1, r_v, a, b, 90); } // Complex double - inline vecd mac(vecd &a, vecd b, vecd c){ + inline void mac(vecd &a, vecd b, vecd c){ pred pg1 = acle::pg1(); // using FCMLA vecd r_v = svcmla_x(pg1, c, a, b, 0); @@ -401,13 +413,13 @@ struct Conj{ // Complex float inline vecf operator()(vecf a){ pred pg_odd = acle::pg_odd(); - //return svneg_x(pg_odd, a); this is unsafe! + //return svneg_x(pg_odd, a); this is unsafe return svneg_m(a, pg_odd, a); } // Complex double inline vecd operator()(vecd a){ pred pg_odd = acle::pg_odd(); - //return svneg_x(pg_odd, a); this is unsafe! 
+ //return svneg_x(pg_odd, a); this is unsafe return svneg_m(a, pg_odd, a); } }; @@ -488,17 +500,13 @@ struct PrecisionChange { static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) { pred pg1d = acle::pg1(); pred pg1h = acle::pg1(); - vecd a_v = svld1(pg1d, a.v); - vecd b_v = svld1(pg1d, b.v); - vecd c_v = svld1(pg1d, c.v); - vecd d_v = svld1(pg1d, d.v); - vech ha_v = svcvt_f16_x(pg1d, a_v); - vech hb_v = svcvt_f16_x(pg1d, b_v); - vech hc_v = svcvt_f16_x(pg1d, c_v); - vech hd_v = svcvt_f16_x(pg1d, d_v); + vech ha_v = svcvt_f16_x(pg1d, a); + vech hb_v = svcvt_f16_x(pg1d, b); + vech hc_v = svcvt_f16_x(pg1d, c); + vech hd_v = svcvt_f16_x(pg1d, d); vech hab_v = svuzp1(ha_v, hb_v); vech hcd_v = svuzp1(hc_v, hd_v); - return r_v = svuzp1(hab_v, hcd_v); + return svuzp1(hab_v, hcd_v); /* vecf sa,sb; @@ -510,16 +518,16 @@ struct PrecisionChange { static inline void HtoD(vech h,vecd &a,vecd &b,vecd &c,vecd &d) { pred pg1h = acle::pg1(); pred pg1d = acle::pg1(); - vech sa_v = svzip1(h_v, h_v); - vech sb_v = svzip2(h_v, h_v); + vech sa_v = svzip1(h, h); + vech sb_v = svzip2(h, h); vech da_v = svzip1(sa_v, sa_v); vech db_v = svzip2(sa_v, sa_v); vech dc_v = svzip1(sb_v, sb_v); vech dd_v = svzip2(sb_v, sb_v); - vecd a = svcvt_f64_x(pg1d, da_v); - vecd b = svcvt_f64_x(pg1d, db_v); - vecd c = svcvt_f64_x(pg1d, dc_v); - vecd d = svcvt_f64_x(pg1d, dd_v); + a = svcvt_f64_x(pg1d, da_v); + b = svcvt_f64_x(pg1d, db_v); + c = svcvt_f64_x(pg1d, dc_v); + d = svcvt_f64_x(pg1d, dd_v); /* vecf sa,sb; @@ -579,26 +587,17 @@ struct Exchange{ out2 = svext(in1, r2_v, (uint64_t)4u); } static inline void Exchange1(vecd &out1, vecd &out2, vecd in1, vecd in2){ - // FIXME - uvecd v1 = { .v = in1 }; - uvecd v2 = { .v = in2 }; - uvecd o1, o2; + // this one is tricky; svtrn2q* from SVE2 fits best, but it is not available in SVE1 + lutd tbl_exch1a = acle::tbl_exch1a(); + lutd tbl_exch1b = acle::tbl_exch1b(); + lutd tbl_exch1c = acle::tbl_exch1c(); - const int n = 1; - const int w = 8; // w = W::r - unsigned int mask = w >> (n + 1); - // std::cout << " Exchange "<::tbl1(); @@ -713,8 +712,6 @@ struct Reduce{ }; //Complex float Reduce template <> -// inline Grid::ComplexF Reduce::operator()(svfloat32_t in){ -//inline Grid::ComplexF Reduce::operator()(__SVFloat32_t in){ inline Grid::ComplexF Reduce::operator()(vecf in){ pred pg_even = acle::pg_even(); pred pg_odd = acle::pg_odd(); @@ -724,16 +721,12 @@ inline Grid::ComplexF Reduce::operator()(vecf in){ } //Real float Reduce template <> -//inline Grid::RealF Reduce::operator()(svfloat32_t in){ -//inline Grid::RealF Reduce::operator()(__SVFloat32_t in){ inline Grid::RealF Reduce::operator()(vecf in){ pred pg1 = acle::pg1(); return svred(pg1, in); } //Complex double Reduce template <> -//inline Grid::ComplexD Reduce::operator()(svfloat64_t in){ -//inline Grid::ComplexD Reduce::operator()(__SVFloat64_t in){ inline Grid::ComplexD Reduce::operator()(vecd in){ pred pg_even = acle::pg_even(); pred pg_odd = acle::pg_odd(); @@ -743,16 +736,12 @@ inline Grid::ComplexD Reduce::operator()(vecd in){ } //Real double Reduce template <> -//inline Grid::RealD Reduce::operator()(svfloat64_t in){ -//inline Grid::RealD Reduce::operator()(__SVFloat64_t in){ inline Grid::RealD Reduce::operator()(vecd in){ pred pg1 = acle::pg1(); return svred(pg1, in); } //Integer Reduce template <> -//inline Integer Reduce::operator()(svuint32_t in){ -//inline Integer Reduce::operator()(__SVUint32_t in){ inline Integer Reduce::operator()(veci in){ pred pg1 = acle::pg1(); return svred(pg1, in); From 
cd27f1005d3bcee8e12ad8ac8c387996b6194def Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 21 May 2020 08:45:43 +0200 Subject: [PATCH 104/147] clean up; Exch1 sp integrate, tested, working --- Grid/simd/Grid_a64fx-fixedsize.h | 68 +++++++++++++------------------- 1 file changed, 27 insertions(+), 41 deletions(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index eef28b83..a07cbff3 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -60,36 +60,20 @@ typedef __SVUint64_t lutd __attribute__((arm_sve_vector_bits(512))); // LUTs fo #pragma error("Oops. Illegal SVE vector size!?") #endif /* __ARM_FEATURE_SVE_BITS */ -// safety definition, not sure if it's necessary -//#define GEN_SIMD_WIDTH 64u - // low-level API NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Optimization); -// convenience union types for tables eliminate loads +// convenience union types for tables eliminating loads union ulutf { lutf v; uint32_t s[16]; }; - union ulutd { lutd v; uint64_t s[8]; }; -// FIXME convenience union types for Exchange1 -union uvecf { - vecf v; - float32_t s[16]; -}; - -union uvecd { - vecd v; - float64_t s[8]; -}; - - template struct acle{}; @@ -144,6 +128,18 @@ struct acle{ const ulutf t = { .s = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13} }; return t.v; } + static inline lutf tbl_exch1a(){ // Exchange1 + const ulutf t = { .s = {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 } }; + return t.v; + } + static inline lutf tbl_exch1b(){ // Exchange1 + const ulutf t = { .s = {4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11 } }; + return t.v; + } + static inline lutf tbl_exch1c(){ // Exchange1 + const ulutf t = { .s = {8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7} }; + return t.v; + } static inline pred pg1(){return svptrue_b32();} static inline pred pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());} static inline pred pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());} @@ -191,7 +187,6 @@ struct Vsplat{ } // Integer inline veci operator()(Integer a){ - // Add check whether Integer is really a uint32_t??? 
return svdup_u32(a); } }; @@ -538,9 +533,6 @@ struct PrecisionChange { } }; -#define VECTOR_FOR(i, w, inc) \ -for (unsigned int i = 0; i < w; i += inc) - struct Exchange{ // float static inline void Exchange0(vecf &out1, vecf &out2, vecf in1, vecf in2){ @@ -550,25 +542,18 @@ struct Exchange{ out2 = svext(in1, r2_v, (uint64_t)8u); } static inline void Exchange1(vecf &out1, vecf &out2, vecf in1, vecf in2){ - // FIXME - uvecf v1 = { .v = in1 }; - uvecf v2 = { .v = in2 }; - uvecf o1, o2; + // this one is tricky; svtrn2q* from SVE2 fits best, but it is not available in SVE1 + // alternative: use 4-el structure; expect translation into 4x ldp + 4x stp -> SFI + lutf tbl_exch1a = acle::tbl_exch1a(); + lutf tbl_exch1b = acle::tbl_exch1b(); + lutf tbl_exch1c = acle::tbl_exch1c(); - const int n = 1; - const int w = 16; // w = W::r - unsigned int mask = w >> (n + 1); - // std::cout << " Exchange "< SFI lutd tbl_exch1a = acle::tbl_exch1a(); lutd tbl_exch1b = acle::tbl_exch1b(); lutd tbl_exch1c = acle::tbl_exch1c(); vecd a1_v = svtbl(in1, tbl_exch1a); vecd a2_v = svtbl(in2, tbl_exch1b); - vecd b1_v = svext(a2_v, a1_v, (uint64_t)4u); - vecd b2_v = svext(a1_v, a2_v, (uint64_t)4u); + vecd b1_v = svext(a2_v, a1_v, (uint64_t)4u); + vecd b2_v = svext(a1_v, a2_v, (uint64_t)4u); out1 = svtbl(b1_v, tbl_exch1c); out2 = svtbl(b2_v, tbl_exch1a); } From a65ce237c1b8feb62f96b42b2dc57ed0d5168348 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 21 May 2020 09:48:06 +0200 Subject: [PATCH 105/147] clean up; Exch1 VLA sp+dp integrate, tested, working --- Grid/simd/Grid_a64fx-2.h | 174 ++++++++++++++------------------------- 1 file changed, 64 insertions(+), 110 deletions(-) diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h index c03184df..0333299f 100644 --- a/Grid/simd/Grid_a64fx-2.h +++ b/Grid/simd/Grid_a64fx-2.h @@ -30,21 +30,8 @@ // Using SVE ACLE ///////////////////////////////////////////////////// -#ifndef GEN_SIMD_WIDTH -#define GEN_SIMD_WIDTH 64u -#endif - static_assert(GEN_SIMD_WIDTH % 64u == 0, "A64FX SIMD vector size is 64 bytes"); -#ifdef __ARM_FEATURE_SVE - #ifdef __clang__ - //#pragma message("Using clang compiler") - #include - #endif -#else - #pragma error "Missing SVE feature" -#endif /* __ARM_FEATURE_SVE */ - NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Optimization); @@ -104,16 +91,28 @@ struct acle{ static inline svbool_t pg2(){return svptrue_pat_b64(SV_VL4);} static inline svbool_t pg4(){return svptrue_pat_b64(SV_VL2);} static inline vec tbl_swap(){ - const vec t = {1, 0, 3, 2, 5, 4, 7, 6}; - return t; + const vec t = {1, 0, 3, 2, 5, 4, 7, 6}; + return t; } static inline vec tbl0(){ - const vec t = {4, 5, 6, 7, 0, 1, 2, 3}; - return t; + const vec t = {4, 5, 6, 7, 0, 1, 2, 3}; + return t; } static inline vec tbl1(){ - const vec t = {2, 3, 0, 1, 6, 7, 4, 5}; - return t; + const vec t = {2, 3, 0, 1, 6, 7, 4, 5}; + return t; + } + static inline vec tbl_exch1a(){ // Exchange1 + const vec t = {0, 1, 4, 5, 2, 3, 6, 7}; + return t; + } + static inline vec tbl_exch1b(){ // Exchange1 + const vec t = {2, 3, 6, 7, 0, 1, 4, 5}; + return t; + } + static inline vec tbl_exch1c(){ // Exchange1 + const vec t = {4, 5, 0, 1, 6, 7, 2, 3}; + return t; } static inline svbool_t pg_even(){return svzip1_b64(svptrue_b64(), svpfalse_b());} static inline svbool_t pg_odd() {return svzip1_b64(svpfalse_b(), svptrue_b64());} @@ -132,20 +131,32 @@ struct acle{ static inline svbool_t pg2(){return svptrue_pat_b32(SV_VL8);} // exchange neighboring elements static inline vec tbl_swap(){ - const vec t = {1, 0, 3, 2, 5, 4, 7, 
6, 9, 8, 11, 10, 13, 12, 15, 14}; - return t; + const vec t = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + return t; } static inline vec tbl0(){ - const vec t = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}; - return t; + const vec t = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}; + return t; } static inline vec tbl1(){ - const vec t = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}; - return t; + const vec t = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}; + return t; } static inline vec tbl2(){ - const vec t = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; - return t; + const vec t = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; + return t; + } + static inline vec tbl_exch1a(){ // Exchange1 + const vec t = {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 }; + return t; + } + static inline vec tbl_exch1b(){ // Exchange1 + const vec t = {4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11 }; + return t; + } + static inline vec tbl_exch1c(){ // Exchange1 + const vec t = {8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7}; + return t; } static inline svbool_t pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());} static inline svbool_t pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());} @@ -186,7 +197,6 @@ struct acle{ struct Vsplat{ // Complex float inline vecf operator()(float a, float b){ - vecf out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svdup_f32(a); @@ -198,7 +208,6 @@ struct Vsplat{ // Real float inline vecf operator()(float a){ - vecf out; svbool_t pg1 = acle::pg1(); typename acle::vt r_v = svdup_f32(a); @@ -208,7 +217,6 @@ struct Vsplat{ // Complex double inline vecd operator()(double a, double b){ - vecd out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svdup_f64(a); @@ -220,7 +228,6 @@ struct Vsplat{ // Real double inline vecd operator()(double a){ - vecd out; svbool_t pg1 = acle::pg1(); typename acle::vt r_v = svdup_f64(a); @@ -230,7 +237,6 @@ struct Vsplat{ // Integer inline vec operator()(Integer a){ - vec out; svbool_t pg1 = acle::pg1(); // Add check whether Integer is really a uint32_t??? 
@@ -244,7 +250,6 @@ struct Vstore{ // Real template inline void operator()(vec a, T *D){ - svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, (typename acle::pt*)&a.v); svst1(pg1, D, a_v); @@ -255,7 +260,6 @@ struct Vstream{ // Real template inline void operator()(T * a, vec b){ - svbool_t pg1 = acle::pg1(); typename acle::vt b_v = svld1(pg1, b.v); svstnt1(pg1, a, b_v); @@ -267,7 +271,6 @@ struct Vstream{ // Complex template inline vec operator()(std::complex *a){ - vec out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, (T*)a); @@ -279,7 +282,6 @@ struct Vstream{ // Real template inline vec operator()(T *a){ - vec out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, a); @@ -296,7 +298,6 @@ struct Vstream{ struct Sum{ template inline vec operator()(vec a, vec b){ - vec out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, a.v); @@ -311,7 +312,6 @@ struct Sum{ struct Sub{ template inline vec operator()(vec a, vec b){ - vec out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, a.v); @@ -326,7 +326,6 @@ struct Sub{ struct Mult{ template inline vec operator()(vec a, vec b){ - vec out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, a.v); @@ -341,7 +340,6 @@ struct Mult{ struct MultRealPart{ template inline vec operator()(vec a, vec b){ - vec out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, a.v); @@ -360,7 +358,6 @@ struct MultRealPart{ struct MaddRealPart{ template inline vec operator()(vec a, vec b, vec c){ - vec out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, a.v); @@ -380,7 +377,6 @@ struct MultComplex{ // Complex a*b template inline vec operator()(vec a, vec b){ - vec out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, a.v); @@ -400,8 +396,7 @@ struct MultComplex{ struct MultAddComplex{ // Complex a*b+c template - inline mac(const vec &a, const vec b, const vec c){ - + inline void mac(const vec &a, const vec b, const vec c){ vec out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, a.v); @@ -420,7 +415,6 @@ struct Div{ // Real template inline vec operator()(vec a, vec b){ - vec out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, a.v); @@ -436,7 +430,6 @@ struct Conj{ // Complex template inline vec operator()(vec a){ - vec out; svbool_t pg1 = acle::pg1(); svbool_t pg_odd = acle::pg_odd(); @@ -453,7 +446,6 @@ struct TimesMinusI{ // Complex template inline vec operator()(vec a, vec b){ - vec out; const vec::uint> tbl_swap = acle::tbl_swap(); svbool_t pg1 = acle::pg1(); @@ -473,7 +465,6 @@ struct TimesI{ // Complex template inline vec operator()(vec a, vec b){ - vec out; const vec::uint> tbl_swap = acle::tbl_swap(); svbool_t pg1 = acle::pg1(); @@ -492,7 +483,6 @@ struct TimesI{ struct PrecisionChange { static inline vech StoH (const vecf &sa,const vecf &sb) { - vech ret; svbool_t pg1s = acle::pg1(); svbool_t pg1h = acle::pg1(); @@ -502,10 +492,10 @@ struct PrecisionChange { typename acle::vt hb_v = svcvt_f16_x(pg1s, sb_v); typename acle::vt r_v = svuzp1(ha_v, hb_v); svst1(pg1h, (typename acle::pt*)&ret.v, r_v); + return ret; } static inline void HtoS(vech h,vecf &sa,vecf &sb) { - svbool_t pg1h = acle::pg1(); svbool_t pg1s = acle::pg1(); typename acle::vt h_v = svld1(pg1h, (typename acle::pt*)&h.v); @@ -517,7 +507,6 @@ struct PrecisionChange { svst1(pg1s, sb.v, sb_v); } static inline vecf DtoS (vecd a,vecd b) { - vecf ret; svbool_t pg1d = acle::pg1(); svbool_t pg1s = acle::pg1(); @@ -527,10 +516,10 @@ struct PrecisionChange { 
typename acle::vt sb_v = svcvt_f32_x(pg1d, b_v); typename acle::vt r_v = svuzp1(sa_v, sb_v); svst1(pg1s, ret.v, r_v); + return ret; } static inline void StoD (vecf s,vecd &a,vecd &b) { - svbool_t pg1s = acle::pg1(); svbool_t pg1d = acle::pg1(); typename acle::vt s_v = svld1(pg1s, s.v); @@ -542,7 +531,6 @@ struct PrecisionChange { svst1(pg1d, b.v, b_v); } static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) { - vech ret; svbool_t pg1d = acle::pg1(); svbool_t pg1h = acle::pg1(); @@ -568,7 +556,6 @@ struct PrecisionChange { */ } static inline void HtoD(vech h,vecd &a,vecd &b,vecd &c,vecd &d) { - svbool_t pg1h = acle::pg1(); svbool_t pg1d = acle::pg1(); typename acle::vt h_v = svld1(pg1h, (typename acle::pt*)&h.v); @@ -600,7 +587,6 @@ struct Exchange{ // Exchange0 is valid for arbitrary SVE vector length template static inline void Exchange0(vec &out1, vec &out2, const vec &in1, const vec &in2){ - svbool_t pg1 = acle::pg1(); typename acle::vt a1_v = svld1(pg1, in1.v); typename acle::vt a2_v = svld1(pg1, in2.v); @@ -612,55 +598,35 @@ struct Exchange{ svst1(pg1, out2.v, r2_v); } - - -/* FIXME use svcreate etc. or switch to table lookup directly template static inline void Exchange1(vec &out1, vec &out2, const vec &in1, const vec &in2){ + // this one is tricky; svtrn2q* from SVE2 fits best, but it is not available in SVE1 + // alternative: use 4-el structure; expect translation into ldp + stp -> SFI + svbool_t pg1 = acle::pg1(); + const vec::uint> tbl_exch1a = acle::tbl_exch1a(); + const vec::uint> tbl_exch1b = acle::tbl_exch1b(); + const vec::uint> tbl_exch1c = acle::tbl_exch1c(); - svbool_t pg4 = acle::pg4(); - typename acle::vt4 in1_v4 = svld4(pg4, (typename acle::pt*)in1.v); - typename acle::vt4 in2_v4 = svld4(pg4, (typename acle::pt*)in2.v); - typename acle::vt4 out1_v4; - typename acle::vt4 out2_v4; - out1_v4.v0 = in1_v4.v0; - out1_v4.v1 = in1_v4.v1; - out1_v4.v2 = in2_v4.v0; - out1_v4.v3 = in2_v4.v1; - out2_v4.v0 = in1_v4.v2; - out2_v4.v1 = in1_v4.v3; - out2_v4.v2 = in2_v4.v2; - out2_v4.v3 = in2_v4.v3; - svst4(pg4, (typename acle::pt*)out1.v, out1_v4); - svst4(pg4, (typename acle::pt*)out2.v, out2_v4); + typename acle::svuint tbl_exch1a_v = svld1(pg1, tbl_exch1a.v); + typename acle::svuint tbl_exch1b_v = svld1(pg1, tbl_exch1b.v); + typename acle::svuint tbl_exch1c_v = svld1(pg1, tbl_exch1c.v); + + typename acle::vt in1_v = svld1(pg1, in1.v); + typename acle::vt in2_v = svld1(pg1, in2.v); + + typename acle::vt a1_v = svtbl(in1_v, tbl_exch1a_v); + typename acle::vt a2_v = svtbl(in2_v, tbl_exch1b_v); + typename acle::vt b1_v = svext(a2_v, a1_v, (uint64_t)(W::r / 2u)); + typename acle::vt b2_v = svext(a1_v, a2_v, (uint64_t)(W::r / 2u)); + typename acle::vt out1_v = svtbl(b1_v, tbl_exch1c_v); + typename acle::vt out2_v = svtbl(b2_v, tbl_exch1a_v); + + svst1(pg1, out1.v, out1_v); + svst1(pg1, out2.v, out2_v); } -*/ - - #define VECTOR_FOR(i, w, inc) \ - for (unsigned int i = 0; i < w; i += inc) - - template - static inline void Exchange1(vec &out1, vec &out2, const vec &in1, const vec &in2){ - // FIXME - const int n = 1; - const int w = W::r; - unsigned int mask = w >> (n + 1); - // std::cout << " Exchange "< static inline void Exchange2(vec &out1, vec &out2, const vec &in1, const vec &in2){ - svbool_t pg1 = acle::pg1(); typename acle::vt a1_v = svld1(pg1, (typename acle::pt*)in1.v); typename acle::vt a2_v = svld1(pg1, (typename acle::pt*)in2.v); @@ -671,7 +637,6 @@ struct Exchange{ } static inline void Exchange3(vecf &out1, vecf &out2, const vecf &in1, const vecf &in2){ - svbool_t pg1 = 
acle::pg1(); typename acle::vt a1_v = svld1(pg1, in1.v); typename acle::vt a2_v = svld1(pg1, in2.v); @@ -692,17 +657,16 @@ struct Permute{ // Permute0 is valid for any SVE vector width template static inline vec Permute0(vec in) { - vec out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, in.v); typename acle::vt r_v = svext(a_v, a_v, (uint64_t)(W::r / 2u)); svst1(pg1, out.v, r_v); + return out; } static inline vecd Permute1(vecd in) { - vecd out; const vec::uint> tbl_swap = acle::tbl1(); svbool_t pg1 = acle::pg1(); @@ -715,7 +679,6 @@ struct Permute{ } static inline vecf Permute1(vecf in) { - vecf out; const vec::uint> tbl_swap = acle::tbl1(); svbool_t pg1 = acle::pg1(); @@ -728,7 +691,6 @@ struct Permute{ } static inline vecd Permute2(vecd in) { - vecd out; const vec::uint> tbl_swap = acle::tbl_swap(); svbool_t pg1 = acle::pg1(); @@ -741,7 +703,6 @@ struct Permute{ } static inline vecf Permute2(vecf in) { - vecf out; const vec::uint> tbl_swap = acle::tbl2(); svbool_t pg1 = acle::pg1(); @@ -754,7 +715,6 @@ struct Permute{ } static inline vecf Permute3(vecf in) { - vecf out; const vec::uint> tbl_swap = acle::tbl_swap(); svbool_t pg1 = acle::pg1(); @@ -775,7 +735,6 @@ struct Permute{ struct Rotate{ template static inline vec tRotate(vec in){ - vec out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, in.v); @@ -833,7 +792,6 @@ struct Reduce{ //Complex float Reduce template <> inline Grid::ComplexF Reduce::operator()(vecf in){ - svbool_t pg1 = acle::pg1(); svbool_t pg_even = acle::pg_even(); svbool_t pg_odd = acle::pg_odd(); @@ -848,7 +806,6 @@ inline Grid::ComplexF Reduce::operator()(vecf in){ //Real float Reduce template <> inline Grid::RealF Reduce::operator()(vecf in){ - svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, in.v); float a = svred(pg1, a_v); @@ -859,7 +816,6 @@ inline Grid::RealF Reduce::operator()(vecf in){ //Complex double Reduce template <> inline Grid::ComplexD Reduce::operator()(vecd in){ - svbool_t pg1 = acle::pg1(); svbool_t pg_even = acle::pg_even(); svbool_t pg_odd = acle::pg_odd(); @@ -873,7 +829,6 @@ inline Grid::ComplexD Reduce::operator()(vecd in){ //Real double Reduce template <> inline Grid::RealD Reduce::operator()(vecd in){ - svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, in.v); double a = svred(pg1, a_v); @@ -884,7 +839,6 @@ inline Grid::RealD Reduce::operator()(vecd in){ //Integer Reduce template <> inline Integer Reduce::operator()(veci in){ - svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, in.v); Integer a = svred(pg1, a_v); From 046b1cbbc0b771f33979b124ba0a878d8f1e5cf7 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 21 May 2020 19:39:07 +0200 Subject: [PATCH 106/147] enable fcmla in tensor arithmetics; fixed-size works, VLA does not compile --- Grid/simd/Grid_a64fx-2.h | 18 ++++++++++++++++-- Grid/simd/Grid_a64fx-fixedsize.h | 16 ++++++++-------- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h index 0333299f..65254e90 100644 --- a/Grid/simd/Grid_a64fx-2.h +++ b/Grid/simd/Grid_a64fx-2.h @@ -324,6 +324,18 @@ struct Sub{ }; struct Mult{ + template + inline vec operator()(vec a, vec b, vec c){ + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt c_v = svld1(pg1, c.v); + typename acle::vt r_v = svmad_x(pg1, b_v, c_v, a_v); + svst1(pg1, out.v, r_v); + + return out; + } template inline vec operator()(vec a, vec b){ vec out; @@ 
-396,7 +408,7 @@ struct MultComplex{ struct MultAddComplex{ // Complex a*b+c template - inline void mac(const vec &a, const vec b, const vec c){ + inline vec operator()(vec a, vec b, vec c){ vec out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, a.v); @@ -407,7 +419,9 @@ struct MultAddComplex{ typename acle::vt r_v = svcmla_x(pg1, c_v, a_v, b_v, 0); r_v = svcmla_x(pg1, r_v, a_v, b_v, 90); - svst1(pg1, a.v, r_v); + svst1(pg1, out.v, r_v); + + return out; } }; diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index a07cbff3..b3b93884 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -295,14 +295,14 @@ struct Sub{ struct Mult{ // Real float fma - inline void mac(vecf &a, vecf b, vecf c){ + inline void operator()(vecf a, vecf b, vecf c){ pred pg1 = acle::pg1(); - a = svmad_x(pg1, b, c, a); + return svmad_x(pg1, b, c, a); } // Real double fma - inline void mac(vecd &a, vecd b, vecd c){ + inline void operator()(vecd a, vecd b, vecd c){ pred pg1 = acle::pg1(); - a = svmad_x(pg1, b, c, a); + return svmad_x(pg1, b, c, a); } // Real float inline vecf operator()(vecf a, vecf b){ @@ -376,18 +376,18 @@ struct MultComplex{ struct MultAddComplex{ // Complex a*b+c // Complex float - inline void mac(vecf &a, vecf b, vecf c){ + inline vecf operator()(vecf a, vecf b, vecf c){ pred pg1 = acle::pg1(); // using FCMLA vecf r_v = svcmla_x(pg1, c, a, b, 0); - a = svcmla_x(pg1, r_v, a, b, 90); + return svcmla_x(pg1, r_v, a, b, 90); } // Complex double - inline void mac(vecd &a, vecd b, vecd c){ + inline vecd operator()(vecd a, vecd b, vecd c){ pred pg1 = acle::pg1(); // using FCMLA vecd r_v = svcmla_x(pg1, c, a, b, 0); - a = svcmla_x(pg1, r_v, a, b, 90); + return svcmla_x(pg1, r_v, a, b, 90); } }; From 8c5a5fdfceb69ce5caabbfa3e5f32b7b8bbbd4d6 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 21 May 2020 19:41:42 +0200 Subject: [PATCH 107/147] disable fcmla in vector type building for VLA --- Grid/simd/Grid_a64fx-2.h | 18 ++--------------- Grid/simd/Grid_a64fx-fixedsize.h | 4 ++-- Grid/simd/Grid_vector_types.h | 34 +++++++++++++++++++++++++------- 3 files changed, 31 insertions(+), 25 deletions(-) diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h index 65254e90..0333299f 100644 --- a/Grid/simd/Grid_a64fx-2.h +++ b/Grid/simd/Grid_a64fx-2.h @@ -324,18 +324,6 @@ struct Sub{ }; struct Mult{ - template - inline vec operator()(vec a, vec b, vec c){ - vec out; - svbool_t pg1 = acle::pg1(); - typename acle::vt a_v = svld1(pg1, a.v); - typename acle::vt b_v = svld1(pg1, b.v); - typename acle::vt c_v = svld1(pg1, c.v); - typename acle::vt r_v = svmad_x(pg1, b_v, c_v, a_v); - svst1(pg1, out.v, r_v); - - return out; - } template inline vec operator()(vec a, vec b){ vec out; @@ -408,7 +396,7 @@ struct MultComplex{ struct MultAddComplex{ // Complex a*b+c template - inline vec operator()(vec a, vec b, vec c){ + inline void mac(const vec &a, const vec b, const vec c){ vec out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, a.v); @@ -419,9 +407,7 @@ struct MultAddComplex{ typename acle::vt r_v = svcmla_x(pg1, c_v, a_v, b_v, 0); r_v = svcmla_x(pg1, r_v, a_v, b_v, 90); - svst1(pg1, out.v, r_v); - - return out; + svst1(pg1, a.v, r_v); } }; diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index b3b93884..2a6533fe 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -295,12 +295,12 @@ struct Sub{ struct Mult{ // Real float fma - inline void 
operator()(vecf a, vecf b, vecf c){ + inline vecf operator()(vecf a, vecf b, vecf c){ pred pg1 = acle::pg1(); return svmad_x(pg1, b, c, a); } // Real double fma - inline void operator()(vecd a, vecd b, vecd c){ + inline vecd operator()(vecd a, vecd b, vecd c){ pred pg1 = acle::pg1(); return svmad_x(pg1, b, c, a); } diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index e2624e15..f8de3d30 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -298,23 +298,21 @@ public: // FIXME -- alias this to an accelerator_inline MAC struct. - // A64FX: use FCMLA - /* - #if defined(A64FX) || defined(A64FXFIXEDSIZE) // A64FX: use FCMLA + // FIXME VLA build error + //#if defined(A64FX) || defined(A64FXFIXEDSIZE) // VLA only: build error + #if defined(A64FXFIXEDSIZE) friend accelerator_inline void mac(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ a, const Grid_simd *__restrict__ x) { - y->v = Optimization::MultAddComplex::mac(a->v, x->v, y->v); + *y = fxmac((*a), (*x), (*y)); }; #else - #endif - - */ friend accelerator_inline void mac(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ a, const Grid_simd *__restrict__ x) { *y = (*a) * (*x) + (*y); }; + #endif friend accelerator_inline void mult(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ l, @@ -793,6 +791,28 @@ accelerator_inline Grid_simd operator*(Grid_simd a, Grid_simd return ret; }; +// ----------------A64FX MAC --------------------- +// Distinguish between complex types and others +//#if defined(A64FX) || defined(A64FXFIXEDSIZE) // VLA only: build error +#if defined(A64FXFIXEDSIZE) +template = 0> +accelerator_inline Grid_simd fxmac(Grid_simd a, Grid_simd b, Grid_simd c) { + Grid_simd ret; + ret.v = trinary(a.v, b.v, c.v, MultAddComplexSIMD()); + return ret; +}; + +// Real/Integer types +template = 0> +accelerator_inline Grid_simd fxmac(Grid_simd a, Grid_simd b, Grid_simd c) { + Grid_simd ret; + ret.v = trinary(a.v, b.v, c.v, MultSIMD()); + return ret; +}; +#endif +// ------------------------------------- + + // Distinguish between complex types and others template = 0> accelerator_inline Grid_simd operator/(Grid_simd a, Grid_simd b) { From 6ddcef1bca97423ac0a77f096554518ef71fa34c Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 21 May 2020 21:21:03 +0200 Subject: [PATCH 108/147] fix build error enabling fcmla/mac in vector types for VLA --- Grid/simd/Grid_a64fx-2.h | 5 +++-- Grid/simd/Grid_vector_types.h | 10 ++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h index 0333299f..a0463a10 100644 --- a/Grid/simd/Grid_a64fx-2.h +++ b/Grid/simd/Grid_a64fx-2.h @@ -396,7 +396,7 @@ struct MultComplex{ struct MultAddComplex{ // Complex a*b+c template - inline void mac(const vec &a, const vec b, const vec c){ + inline vec operator()(vec a, vec b, vec c){ vec out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, a.v); @@ -406,8 +406,9 @@ struct MultAddComplex{ // using FCMLA typename acle::vt r_v = svcmla_x(pg1, c_v, a_v, b_v, 0); r_v = svcmla_x(pg1, r_v, a_v, b_v, 90); + svst1(pg1, out.v, r_v); - svst1(pg1, a.v, r_v); + return out; } }; diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index f8de3d30..e1eb330d 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -299,8 +299,7 @@ public: // FIXME -- alias this to an accelerator_inline MAC struct. 
// FIXME VLA build error - //#if defined(A64FX) || defined(A64FXFIXEDSIZE) // VLA only: build error - #if defined(A64FXFIXEDSIZE) + #if defined(A64FX) || defined(A64FXFIXEDSIZE) friend accelerator_inline void mac(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ a, const Grid_simd *__restrict__ x) { @@ -791,10 +790,9 @@ accelerator_inline Grid_simd operator*(Grid_simd a, Grid_simd return ret; }; -// ----------------A64FX MAC --------------------- +// ---------------- A64FX MAC ------------------- // Distinguish between complex types and others -//#if defined(A64FX) || defined(A64FXFIXEDSIZE) // VLA only: build error -#if defined(A64FXFIXEDSIZE) +#if defined(A64FX) || defined(A64FXFIXEDSIZE) template = 0> accelerator_inline Grid_simd fxmac(Grid_simd a, Grid_simd b, Grid_simd c) { Grid_simd ret; @@ -810,7 +808,7 @@ accelerator_inline Grid_simd fxmac(Grid_simd a, Grid_simd b, G return ret; }; #endif -// ------------------------------------- +// ---------------------------------------------- // Distinguish between complex types and others From 4fedd8d29f1c2b7eb56df73776839ba4873e1bbf Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Wed, 27 May 2020 14:08:34 +0200 Subject: [PATCH 109/147] switch to MPI_THREAD_SERIALIZED instead of SINGLE --- Grid/communicator/Communicator_mpi3.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 15b8cbfd..e7812178 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -54,10 +54,11 @@ void CartesianCommunicator::Init(int *argc, char ***argv) #if defined (TOFU) // FUGAKU, credits go to Issaku Kanamori nCommThreads=1; - MPI_Init(argc,argv); + //MPI_Init(argc,argv); + MPI_Init_thread(argc,argv,MPI_THREAD_SERIALIZED,&provided); #else MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided); - +#endif //If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) { assert(0); @@ -66,7 +67,6 @@ void CartesianCommunicator::Init(int *argc, char ***argv) if( (nCommThreads > 1) && (provided != MPI_THREAD_MULTIPLE) ) { assert(0); } -#endif } // Never clean up as done once. 
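
The patch above boils down to a small thread-level negotiation that is easy to get wrong. The following standalone sketch is illustrative only; it is not the actual Communicator_mpi3.cc, and the TOFU guard and nCommThreads name are simply borrowed from the diff. The point is that a single comms thread tolerates any provided level other than MPI_THREAD_SINGLE, so requesting MPI_THREAD_SERIALIZED keeps the Tofu path correct without paying for MPI_THREAD_MULTIPLE, while multiple comms threads strictly require MPI_THREAD_MULTIPLE.

    #include <mpi.h>
    #include <cassert>

    void init_comms(int *argc, char ***argv, int nCommThreads) {
      int provided;
    #if defined (TOFU)
      // Tofu path: one comms thread, so SERIALIZED is sufficient
      nCommThreads = 1;
      MPI_Init_thread(argc, argv, MPI_THREAD_SERIALIZED, &provided);
    #else
      MPI_Init_thread(argc, argv, MPI_THREAD_MULTIPLE, &provided);
    #endif
      // 1 comms thread: any threading mode other than SINGLE is acceptable;
      // >1 comms threads: MULTIPLE is mandatory
      if (nCommThreads == 1) assert(provided != MPI_THREAD_SINGLE);
      if (nCommThreads >  1) assert(provided == MPI_THREAD_MULTIPLE);
    }
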
From 250008372ffc8f54ec77c3ed3e27a433d0c99a25 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 29 May 2020 15:44:25 +0200 Subject: [PATCH 110/147] update SVE readme --- SVE_README.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/SVE_README.txt b/SVE_README.txt index be4f1baa..07c18b06 100644 --- a/SVE_README.txt +++ b/SVE_README.txt @@ -15,7 +15,7 @@ export OMPI_CXX=g++-10.0.1 export MPICH_CC=gcc-10.0.1 export MPICH_CXX=g++-10.0.1 -$ ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.24/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.24/lib64 -lrt" +$ ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64 -lrt" works! but binaries do not finish when running via job scheduler. problem with MPI_finalize ? @@ -45,9 +45,9 @@ TODO check ARMCLANGCOMPAT TODO check ARMCLANGCOMPAT -* armclang 20.1 VLA (fjt cluster) +* armclang 20.1 VLA w/MPI (fjt cluster) -../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.24/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.24/lib64" +../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64" No ARMCLANGCOMPAT -> still correct ? 
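
The MultComplex/MultAddComplex work in the patches above all rests on one FCMLA idiom: a complex product over interleaved re/im lanes is two chained svcmla_x calls, rotation 0 followed by rotation 90. A minimal standalone sketch (it assumes <arm_sve.h> and an SVE-enabled compiler; it mirrors the pattern in the diffs but is not the Grid source itself):

    #include <arm_sve.h>

    // computes c + a*b on (re, im) element pairs; the two FCMLA passes contribute
    //   rotation 0 :  re += a.re*b.re    im += a.re*b.im
    //   rotation 90:  re -= a.im*b.im    im += a.im*b.re
    svfloat64_t complex_madd(svfloat64_t a, svfloat64_t b, svfloat64_t c) {
      svbool_t pg = svptrue_b64();
      svfloat64_t r = svcmla_x(pg, c, a, b, 0);
      return svcmla_x(pg, r, a, b, 90);  // = c + a*b
    }

MultComplex is the same two-pass pattern seeded with a zero vector instead of c, so a plain complex multiply and the fused multiply-add cost the same two FCMLA instructions per vector.
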
From 5cb3530c34a85c0dd83ad28f9953e3e56a686188 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 29 May 2020 15:44:52 +0200 Subject: [PATCH 111/147] enable counters in Benchmark_wilson --- benchmarks/Benchmark_wilson.cc | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_wilson.cc b/benchmarks/Benchmark_wilson.cc index ac5caa34..1cd616cb 100644 --- a/benchmarks/Benchmark_wilson.cc +++ b/benchmarks/Benchmark_wilson.cc @@ -153,6 +153,11 @@ int main (int argc, char ** argv) std::cout<Barrier(); + double t0=usecond(); for(int i=0;iBarrier(); + Dw.report(); + + // guard double err0 = norm2(err); From e947b563ea6778e20695b0a8dcea400f5c5d7034 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 29 May 2020 17:11:17 +0200 Subject: [PATCH 112/147] add space in stencil output --- Grid/stencil/Stencil.h | 346 ++++++++++++++++++++--------------------- 1 file changed, 173 insertions(+), 173 deletions(-) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 37b866cb..a56d256d 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -1,6 +1,6 @@ /************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid + Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/Stencil.h @@ -41,13 +41,13 @@ // Stencil based code will exchange haloes and use a table lookup for neighbours. // This will be done with generality to allow easier efficient implementations. // Overlap of comms and compute is enabled by tabulating off-node connected, -// +// // Generic services // 0) Prebuild neighbour tables // 1) Compute sizes of all haloes/comms buffers; allocate them. // 2) Gather all faces, and communicate. // 3) Loop over result sites, giving nbr index/offnode info for each -// +// ////////////////////////////////////////////////////////////////////////////////////////// NAMESPACE_BEGIN(Grid); @@ -59,10 +59,10 @@ NAMESPACE_BEGIN(Grid); void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, int off,Vector > & table); -template +template void Gather_plane_simple_table (Vector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so) __attribute__((noinline)); -template +template void Gather_plane_simple_table (Vector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so) { int num=table.size(); @@ -92,13 +92,13 @@ void Gather_plane_exchange_table(Vector >& table,const Lattic { assert( (table.size()&0x1)==0); int num=table.size()/2; - int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane + int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane auto rhs_v = rhs.View(); auto p0=&pointers[0][0]; auto p1=&pointers[1][0]; auto tp=&table[0]; - accelerator_forNB(j, num, 1, { + accelerator_forNB(j, num, 1, { compress.CompressExchange(p0,p1, &rhs_v[0], j, so+tp[2*j ].second, so+tp[2*j+1].second, @@ -106,20 +106,20 @@ void Gather_plane_exchange_table(Vector >& table,const Lattic }); } -struct StencilEntry { +struct StencilEntry { #ifdef GRID_NVCC - uint64_t _byte_offset; // 8 bytes - uint32_t _offset; // 4 bytes + uint64_t _byte_offset; // 8 bytes + uint32_t _offset; // 4 bytes #else - uint64_t _byte_offset; // 8 bytes + uint64_t _byte_offset; // 8 bytes uint64_t _offset; // 8 bytes (8 ever required?) 
#endif - uint8_t _is_local; // 1 bytes + uint8_t _is_local; // 1 bytes uint8_t _permute; // 1 bytes uint8_t _around_the_world; // 1 bytes uint8_t _pad; // 1 bytes }; -// Could pack to 8 + 4 + 4 = 128 bit and use +// Could pack to 8 + 4 + 4 = 128 bit and use template class CartesianStencilView { @@ -145,18 +145,18 @@ class CartesianStencilView { accelerator_inline cobj *CommBuf(void) { return u_recv_buf_p; } - accelerator_inline int GetNodeLocal(int osite,int point) { + accelerator_inline int GetNodeLocal(int osite,int point) { return this->_entries_p[point+this->_npoints*osite]._is_local; } - accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) { - ptype = this->_permute_type[point]; return & this->_entries_p[point+this->_npoints*osite]; + accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) { + ptype = this->_permute_type[point]; return & this->_entries_p[point+this->_npoints*osite]; } accelerator_inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; local = this->_entries_p[ent]._is_local; perm = this->_entries_p[ent]._permute; - if (perm) ptype = this->_permute_type[point]; + if (perm) ptype = this->_permute_type[point]; if (local) { return base + this->_entries_p[ent]._byte_offset; } else { @@ -171,7 +171,7 @@ class CartesianStencilView { else return cbase + this->_entries_p[ent]._byte_offset; } - accelerator_inline void iCoorFromIindex(Coordinate &coor,int lane) + accelerator_inline void iCoorFromIindex(Coordinate &coor,int lane) { Lexicographic::CoorFromIndex(coor,lane,this->_simd_layout); } @@ -211,12 +211,12 @@ public: cobj * mpi_p; Integer buffer_size; }; - + protected: GridBase * _grid; -public: +public: GridBase *Grid(void) const { return _grid; } //////////////////////////////////////////////////////////////////////// @@ -230,7 +230,7 @@ public: View_type accessor(*( (View_type *) this)); return accessor; } - + int face_table_computed; std::vector > > face_table ; Vector surface_list; @@ -280,7 +280,7 @@ public: //////////////////////////////////////// // Stencil query //////////////////////////////////////// - inline int SameNode(int point) { + inline int SameNode(int point) { int dimension = this->_directions[point]; int displacement = this->_distances[point]; @@ -304,7 +304,7 @@ public: // FIXME this logic needs to be sorted for three link term // assert( (displacement==1) || (displacement==-1)); // Present hack only works for >= 4^4 subvol per node - _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); + _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_recv_buf_p); @@ -344,7 +344,7 @@ public: comm_time_thr[mythread] += comm_leave_thr[mythread] - comm_enter_thr[mythread]; } } - + void CollateThreads(void) { int nthreads = CartesianCommunicator::nCommThreads; @@ -368,7 +368,7 @@ public: if ( (t0 > 0.0) && ( t0 < first ) ) first = t0; // min time seen if ( t1 > last ) last = t1; // max time seen - + } commtime+= last-first; } @@ -430,30 +430,30 @@ public: this->CommunicateBegin(reqs); this->CommunicateComplete(reqs); } - } - - template void HaloExchange(const Lattice &source,compressor &compress) + } + + template void HaloExchange(const Lattice &source,compressor &compress) { Prepare(); HaloGather(source,compress); Communicate(); - CommsMergeSHM(compress); - CommsMerge(compress); + CommsMergeSHM(compress); + CommsMerge(compress); } - + template int 
HaloGatherDir(const Lattice &source,compressor &compress,int point,int & face_idx) { int dimension = this->_directions[point]; int displacement = this->_distances[point]; - + int fd = _grid->_fdimensions[dimension]; int rd = _grid->_rdimensions[dimension]; - + // Map to always positive shift modulo global full dimension. int shift = (displacement+fd)%fd; assert (source.Checkerboard()== this->_checkerboard); - + // the permute type int simd_layout = _grid->_simd_layout[dimension]; int comm_dim = _grid->_processors[dimension] >1 ; @@ -471,7 +471,7 @@ public: auto tmp = GatherSimd(source,dimension,shift,0x3,compress,face_idx); is_same_node = is_same_node && tmp; splicetime+=usecond(); - } else { + } else { nosplicetime-=usecond(); auto tmp = Gather(source,dimension,shift,0x3,compress,face_idx); is_same_node = is_same_node && tmp; @@ -497,7 +497,7 @@ public: } return is_same_node; } - + template void HaloGather(const Lattice &source,compressor &compress) { @@ -508,9 +508,9 @@ public: // conformable(source.Grid(),_grid); assert(source.Grid()==_grid); halogtime-=usecond(); - + u_comm_offset=0; - + // Gather all comms buffers int face_idx=0; for(int point = 0 ; point < this->_npoints; point++) { @@ -523,16 +523,16 @@ public: accelerator_barrier(); halogtime+=usecond(); } - + ///////////////////////// // Implementation ///////////////////////// void Prepare(void) { - Decompressions.resize(0); - DecompressionsSHM.resize(0); - Mergers.resize(0); - MergersSHM.resize(0); + Decompressions.resize(0); + DecompressionsSHM.resize(0); + Mergers.resize(0); + MergersSHM.resize(0); Packets.resize(0); calls++; } @@ -561,22 +561,22 @@ public: mv.push_back(m); } template void CommsMerge(decompressor decompress) { - CommsMerge(decompress,Mergers,Decompressions); + CommsMerge(decompress,Mergers,Decompressions); } template void CommsMergeSHM(decompressor decompress) { - mpi3synctime-=usecond(); + mpi3synctime-=usecond(); _grid->StencilBarrier();// Synch shared memory on a single nodes - mpi3synctime+=usecond(); - shmmergetime-=usecond(); + mpi3synctime+=usecond(); + shmmergetime-=usecond(); CommsMerge(decompress,MergersSHM,DecompressionsSHM); - shmmergetime+=usecond(); + shmmergetime+=usecond(); } template - void CommsMerge(decompressor decompress,std::vector &mm,std::vector &dd) { + void CommsMerge(decompressor decompress,std::vector &mm,std::vector &dd) { mergetime-=usecond(); - for(int i=0;i_npoints;point++){ this->same_node[point] = this->SameNode(point); } - + for(int site = 0 ;site< vol4;site++){ int local = 1; for(int point=0;point_npoints;point++){ - if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){ + if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){ local = 0; } } - if(local == 0) { + if(local == 0) { surface_list.push_back(site); } } @@ -638,11 +638,11 @@ public: int checkerboard, const std::vector &directions, const std::vector &distances, - Parameters p) - : shm_bytes_thr(npoints), - comm_bytes_thr(npoints), + Parameters p) + : shm_bytes_thr(npoints), + comm_bytes_thr(npoints), comm_enter_thr(npoints), - comm_leave_thr(npoints), + comm_leave_thr(npoints), comm_time_thr(npoints) { face_table_computed=0; @@ -653,7 +653,7 @@ public: ///////////////////////////////////// this->_npoints = npoints; this->_comm_buf_size.resize(npoints), - this->_permute_type.resize(npoints), + this->_permute_type.resize(npoints), this->_simd_layout = _grid->_simd_layout; // copy simd_layout to give access to Accelerator Kernels this->_directions = StencilVector(directions); 
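      // Consumer-side sketch (illustrative, not part of this patch): once the
      // entry tables below are filled in, a kernel resolves each neighbour
      // either to a local lattice site or to an offset in the unified receive
      // buffer, along the lines of
      //
      //   int ptype;
      //   StencilEntry *SE = st.GetEntry(ptype, point, osite);
      //   if ( SE->_is_local ) {
      //     // on-node neighbour; ptype/_permute select the SIMD lane shuffle
      //     auto chi = coalescedReadPermute(in_v[SE->_offset], ptype, SE->_permute);
      //   } else {
      //     // off-node neighbour, gathered into CommBuf() by HaloGather above
      //     auto chi = coalescedRead(st.CommBuf()[SE->_offset]);
      //   }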
this->_distances = StencilVector(distances); @@ -663,24 +663,24 @@ public: surface_list.resize(0); int osites = _grid->oSites(); - + _entries.resize(this->_npoints* osites); this->_entries_p = &_entries[0]; for(int ii=0;ii_fdimensions[dimension]; int rd = _grid->_rdimensions[dimension]; this->_permute_type[point]=_grid->PermuteType(dimension); - + this->_checkerboard = checkerboard; - + ////////////////////////// // the permute type ////////////////////////// @@ -690,25 +690,25 @@ public: int rotate_dim = _grid->_simd_layout[dimension]>2; assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported - + int sshift[2]; - + ////////////////////////// // Underlying approach. For each local site build - // up a table containing the npoint "neighbours" and whether they + // up a table containing the npoint "neighbours" and whether they // live in lattice or a comms buffer. ////////////////////////// if ( !comm_dim ) { sshift[0] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Even); sshift[1] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Odd); - + if ( sshift[0] == sshift[1] ) { Local(point,dimension,shift,0x3); } else { Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes Local(point,dimension,shift,0x2);// both with block stride loop iteration } - } else { + } else { // All permute extract done in comms phase prior to Stencil application // So tables are the same whether comm_dim or splice_dim sshift[0] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Even); @@ -750,23 +750,23 @@ public: int ld = _grid->_ldimensions[dimension]; int gd = _grid->_gdimensions[dimension]; int ly = _grid->_simd_layout[dimension]; - + // Map to always positive shift modulo global full dimension. int shift = (shiftpm+fd)%fd; // the permute type int permute_dim =_grid->PermuteDim(dimension); - - for(int x=0;x_ostride[dimension]; - + int cb= (cbmask==0x2)? Odd : Even; - + int sshift = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,cb); int sx = (x+sshift)%rd; - + int wraparound=0; if ( (shiftpm==-1) && (sx>x) ) { wraparound = 1; @@ -774,7 +774,7 @@ public: if ( (shiftpm== 1) && (sxNsimd(); - + int fd = _grid->_fdimensions[dimension]; int ld = _grid->_ldimensions[dimension]; int rd = _grid->_rdimensions[dimension]; int pd = _grid->_processors[dimension]; int simd_layout = _grid->_simd_layout[dimension]; int comm_dim = _grid->_processors[dimension] >1 ; - + assert(comm_dim==1); int shift = (shiftpm + fd) %fd; assert(shift>=0); assert(shift_slice_nblock[dimension]*_grid->_slice_block[dimension]; + int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; this->_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and // send to one or more remote nodes. - + int cb= (cbmask==0x2)? Odd : Even; int sshift= _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,cb); - - for(int x=0;xPermuteType(dimension); - + int sx = (x+sshift)%rd; - + int offnode = 0; if ( simd_layout > 1 ) { - + for(int i=0;i>(permute_type+1)); int ic= (i&inner_bit)? 
1:0; int my_coor = rd*ic + x; int nbr_coor = my_coor+sshift; int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors - - if ( nbr_proc ) { + + if ( nbr_proc ) { offnode =1; } } - - } else { + + } else { int comm_proc = ((x+sshift)/rd)%pd; offnode = (comm_proc!= 0); } - + int wraparound=0; if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) { wraparound = 1; @@ -850,24 +850,24 @@ public: wraparound = 1; } if (!offnode) { - + int permute_slice=0; - CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); - + CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); + } else { int words = buffer_size; if (cbmask != 0x3) words=words>>1; - + // int rank = grid->_processor; // int recv_from_rank; // int xmit_to_rank; - + int unified_buffer_offset = _unified_buffer_size; _unified_buffer_size += words; - + ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase - + } } } @@ -875,13 +875,13 @@ public: void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap) { int rd = _grid->_rdimensions[dimension]; - + if ( !_grid->CheckerBoarded(dimension) ) { - + int o = 0; // relative offset to base within plane - int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane + int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane int lo = lplane*_grid->_ostride[dimension]; // offset in buffer - + // Simple block stride gather of SIMD objects for(int n=0;n<_grid->_slice_nblock[dimension];n++){ for(int b=0;b<_grid->_slice_block[dimension];b++){ @@ -893,18 +893,18 @@ public: } o +=_grid->_slice_stride[dimension]; } - + } else { - - int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane - int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane + + int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane + int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane int o = 0; // relative offset to base within plane - + for(int n=0;n<_grid->_slice_nblock[dimension];n++){ for(int b=0;b<_grid->_slice_block[dimension];b++){ - + int ocb=1<<_grid->CheckerBoardFromOindex(o+b); - + if ( ocb&cbmask ) { int idx = point+(lo+o+b)*this->_npoints; _entries[idx]._offset =ro+o+b; @@ -912,24 +912,24 @@ public: _entries[idx]._permute=permute; _entries[idx]._around_the_world=wrap; } - + } o +=_grid->_slice_stride[dimension]; } - + } } // Routine builds up integer table for each site in _offsets, _is_local, _permute void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset, int wrap) { int rd = _grid->_rdimensions[dimension]; - + if ( !_grid->CheckerBoarded(dimension) ) { - - int so = plane*_grid->_ostride[dimension]; // base offset for start of plane + + int so = plane*_grid->_ostride[dimension]; // base offset for start of plane int o = 0; // relative offset to base within plane int bo = 0; // offset in buffer - + // Simple block stride gather of SIMD objects for(int n=0;n<_grid->_slice_nblock[dimension];n++){ for(int b=0;b<_grid->_slice_block[dimension];b++){ @@ -941,16 +941,16 @@ public: } o +=_grid->_slice_stride[dimension]; } - - } else { - - int so = plane*_grid->_ostride[dimension]; // base offset for start of plane + + } else { + + int so = plane*_grid->_ostride[dimension]; // base offset for start of plane int o = 0; // relative offset to base within plane int bo = 0; // offset in buffer - + for(int 
n=0;n<_grid->_slice_nblock[dimension];n++){ for(int b=0;b<_grid->_slice_block[dimension];b++){ - + int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup if ( ocb & cbmask ) { int idx = point+(so+o+b)*this->_npoints; @@ -964,16 +964,16 @@ public: } } } - + template int Gather(const Lattice &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx) { typedef typename cobj::vector_type vector_type; typedef typename cobj::scalar_type scalar_type; - + assert(rhs.Grid()==_grid); // conformable(_grid,rhs.Grid()); - + int fd = _grid->_fdimensions[dimension]; int rd = _grid->_rdimensions[dimension]; int pd = _grid->_processors[dimension]; @@ -985,37 +985,37 @@ public: assert(shift_slice_nblock[dimension]*_grid->_slice_block[dimension]; - + int cb= (cbmask==0x2)? Odd : Even; int sshift= _grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); - + int shm_receive_only = 1; - for(int x=0;x>1; - + int bytes = words * compress.CommDatumSize(); - - int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane + + int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane if ( !face_table_computed ) { face_table.resize(face_idx+1); Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table[face_idx]); } - + // int rank = _grid->_processor; int recv_from_rank; int xmit_to_rank; _grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); - + assert (xmit_to_rank != _grid->ThisRank()); assert (recv_from_rank != _grid->ThisRank()); - + ///////////////////////////////////////////////////////// // try the direct copy if possible ///////////////////////////////////////////////////////// @@ -1028,13 +1028,13 @@ public: } send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,recv_buf); - if ( send_buf==NULL ) { + if ( send_buf==NULL ) { send_buf = this->u_send_buf_p; - } - + } + // Find out if we get the direct copy. void *success = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_send_buf_p); - if (success==NULL) { + if (success==NULL) { // we found a packet that comes from MPI and contributes to this leg of stencil shm_receive_only = 0; } @@ -1043,9 +1043,9 @@ public: assert(send_buf!=NULL); Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); face_idx++; gathertime+=usecond(); - + if ( compress.DecompressionStep() ) { - + if ( shm_receive_only ) { // Early decompress before MPI is finished is possible AddDecompress(&this->u_recv_buf_p[u_comm_offset], &recv_buf[u_comm_offset], @@ -1074,7 +1074,7 @@ public: } return shm_receive_only; } - + template int GatherSimd(const Lattice &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx) { @@ -1102,7 +1102,7 @@ public: /////////////////////////////////////////////// int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; // int words = sizeof(cobj)/sizeof(vector_type); - + assert(cbmask==0x3); // Fixme think there is a latent bug if not true // This assert will trap it if ever hit. Not hit normally so far int reduced_buffer_size = buffer_size; @@ -1118,22 +1118,22 @@ public: /////////////////////////////////////////// // Work out what to send where /////////////////////////////////////////// - + int cb = (cbmask==0x2)? 
Odd : Even; int sshift= _grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); - + // loop over outer coord planes orthog to dim int shm_receive_only = 1; - for(int x=0;x= rd ); if ( any_offnode ) { - - for(int i=0;iShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - + + _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); + // shm == receive pointer if offnode // shm == Translate[send pointer] if on node -- my view of his send pointer cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp); - if (shm==NULL) { + if (shm==NULL) { shm = rp; // we found a packet that comes from MPI and contributes to this shift. // is_same_node is only used in the WilsonStencil, and gets set for this point in the stencil. @@ -1188,15 +1188,15 @@ public: AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); - - } else { - + + } else { + rpointers[i] = sp; - + } } - if ( shm_receive_only ) { + if ( shm_receive_only ) { AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,MergersSHM); } else { AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers); @@ -1231,9 +1231,9 @@ public: shm_bytes = 0.; calls = 0.; }; - + void Report(void) { -#define AVERAGE(A) +#define AVERAGE(A) #define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<_Nprocessors; RealD NN = _grid->NodeCount(); @@ -1250,7 +1250,7 @@ public: } } if (threaded) commtime += t; - + _grid->GlobalSum(commtime); commtime/=NP; if ( calls > 0. ) { std::cout << GridLogMessage << " Stencil calls "< Date: Fri, 29 May 2020 17:13:59 +0200 Subject: [PATCH 113/147] add counter support in WilsonFermion.h --- Grid/qcd/action/fermion/WilsonFermion.h | 32 +++++++++++++++++-------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/Grid/qcd/action/fermion/WilsonFermion.h b/Grid/qcd/action/fermion/WilsonFermion.h index a3f5d2d7..eb9efa41 100644 --- a/Grid/qcd/action/fermion/WilsonFermion.h +++ b/Grid/qcd/action/fermion/WilsonFermion.h @@ -50,14 +50,14 @@ public: double, nu); WilsonAnisotropyCoefficients(): - isAnisotropic(false), - t_direction(Nd-1), - xi_0(1.0), + isAnisotropic(false), + t_direction(Nd-1), + xi_0(1.0), nu(1.0){} }; template -class WilsonFermion : public WilsonKernels, public WilsonFermionStatic +class WilsonFermion : public WilsonKernels, public WilsonFermionStatic { public: INHERIT_IMPL_TYPES(Impl); @@ -74,6 +74,20 @@ public: FermionField _tmp; FermionField &tmp(void) { return _tmp; } + void Report(void); + void ZeroCounters(void); + double DhopCalls; + double DhopCommTime; + double DhopComputeTime; + double DhopComputeTime2; + double DhopFaceTime; + double DhopTotalTime; + + double DerivCalls; + double DerivCommTime; + double DerivComputeTime; + double DerivDhopComputeTime; + ////////////////////////////////////////////////////////////////// // override multiply; cut number routines if pass dagger argument // and also make interface more uniformly consistent @@ -138,7 +152,7 @@ public: // Constructor WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, - const ImplParams &p = ImplParams(), + const ImplParams &p = ImplParams(), const WilsonAnisotropyCoefficients &anis = WilsonAnisotropyCoefficients() ); // DoubleStore impl dependent @@ -170,9 +184,9 @@ public: LebesgueOrder Lebesgue; LebesgueOrder LebesgueEvenOdd; - + WilsonAnisotropyCoefficients anisotropyCoeff; - + 
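  // Timing idiom for the counters declared above (sketch; the actual
  // instrumentation lands in WilsonFermionImplementation.h in the next
  // patch). Each region is bracketed with usecond() so the counter
  // accumulates wall-clock microseconds across calls:
  //
  //   DhopCommTime -= usecond();
  //   st.HaloExchange(in, compressor);
  //   DhopCommTime += usecond();
  //
  // Report() then divides the accumulated totals by DhopCalls to print
  // per-call averages.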
/////////////////////////////////////////////////////////////// // Conserved current utilities /////////////////////////////////////////////////////////////// @@ -184,7 +198,7 @@ public: void SeqConservedCurrent(PropagatorField &q_in, PropagatorField &q_out, Current curr_type, - unsigned int mu, + unsigned int mu, unsigned int tmin, unsigned int tmax, ComplexField &lattice_cmplx); @@ -194,5 +208,3 @@ typedef WilsonFermion WilsonFermionF; typedef WilsonFermion WilsonFermionD; NAMESPACE_END(Grid); - - From 38164f8480a4c6732b01f7ed54ce724c884ea65d Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 29 May 2020 17:59:26 +0200 Subject: [PATCH 114/147] include counters in WilsonFermionImplementation.h --- .../WilsonFermionImplementation.h | 207 ++++++++++++++---- 1 file changed, 160 insertions(+), 47 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h index 76b904e9..951486f2 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h @@ -43,7 +43,7 @@ WilsonFermion::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, const ImplParams &p, const WilsonAnisotropyCoefficients &anis) - : + : Kernels(p), _grid(&Fgrid), _cbgrid(&Hgrid), @@ -70,8 +70,91 @@ WilsonFermion::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, } +template +void WilsonFermion::Report(void) +{ + RealD NP = _FourDimGrid->_Nprocessors; + RealD NN = _FourDimGrid->NodeCount(); + RealD volume = Ls; + Coordinate latt = _FourDimGrid->GlobalDimensions(); + for(int mu=0;mu 0 ) { + std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; + std::cout << GridLogMessage << "WilsonFermion Number of DhopEO Calls : " << DhopCalls << std::endl; + std::cout << GridLogMessage << "WilsonFermion TotalTime /Calls : " << DhopTotalTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion CommTime /Calls : " << DhopCommTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion FaceTime /Calls : " << DhopFaceTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion ComputeTime1/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion ComputeTime2/Calls : " << DhopComputeTime2/ DhopCalls << " us" << std::endl; + + // Average the compute time + _FourDimGrid->GlobalSum(DhopComputeTime); + DhopComputeTime/=NP; + RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl; + + RealD Fullmflops = 1344*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl; + + } + + if ( DerivCalls > 0 ) { + std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl; + std::cout << GridLogMessage << "WilsonFermion Number of Deriv 
Calls : " < 0 || DhopCalls > 0){ + std::cout << GridLogMessage << "WilsonFermion Stencil" < 0){ + std::cout << GridLogMessage << "WilsonFermion Stencil Reporti()" < +void WilsonFermion::ZeroCounters(void) { + DhopCalls = 0; // ok + DhopCommTime = 0; + DhopComputeTime = 0; + DhopComputeTime2= 0; + DhopFaceTime = 0; + DhopTotalTime = 0; + + DerivCalls = 0; // ok + DerivCommTime = 0; + DerivComputeTime = 0; + DerivDhopComputeTime = 0; + + Stencil.ZeroCounters(); + StencilEven.ZeroCounters(); + StencilOdd.ZeroCounters(); + Stencil.ZeroCountersi(); + StencilEven.ZeroCountersi(); + StencilOdd.ZeroCountersi(); +} + + template -void WilsonFermion::ImportGauge(const GaugeField &_Umu) +void WilsonFermion::ImportGauge(const GaugeField &_Umu) { GaugeField HUmu(_Umu.Grid()); @@ -132,7 +215,7 @@ void WilsonFermion::MeooeDag(const FermionField &in, FermionField &out) { DhopOE(in, out, DaggerYes); } } - + template void WilsonFermion::Mooee(const FermionField &in, FermionField &out) { out.Checkerboard() = in.Checkerboard(); @@ -151,7 +234,7 @@ void WilsonFermion::MooeeInv(const FermionField &in, FermionField &out) { out.Checkerboard() = in.Checkerboard(); out = (1.0/(diag_mass))*in; } - + template void WilsonFermion::MooeeInvDag(const FermionField &in, FermionField &out) { out.Checkerboard() = in.Checkerboard(); @@ -159,59 +242,59 @@ void WilsonFermion::MooeeInvDag(const FermionField &in, FermionField &out) } template void WilsonFermion::MomentumSpacePropagator(FermionField &out, const FermionField &in,RealD _m,std::vector twist) -{ +{ typedef typename FermionField::vector_type vector_type; typedef typename FermionField::scalar_type ScalComplex; typedef Lattice > LatComplex; - - // what type LatticeComplex + + // what type LatticeComplex conformable(_grid,out.Grid()); - + Gamma::Algebra Gmu [] = { Gamma::Algebra::GammaX, Gamma::Algebra::GammaY, Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT }; - + Coordinate latt_size = _grid->_fdimensions; - + FermionField num (_grid); num = Zero(); LatComplex wilson(_grid); wilson= Zero(); LatComplex one (_grid); one = ScalComplex(1.0,0.0); - + LatComplex denom(_grid); denom= Zero(); - LatComplex kmu(_grid); + LatComplex kmu(_grid); ScalComplex ci(0.0,1.0); // momphase = n * 2pi / L for(int mu=0;mu void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat, const FermionField &A, const FermionField &B, int dag) { + DerivCalls++; assert((dag == DaggerNo) || (dag == DaggerYes)); Compressor compressor(dag); @@ -229,8 +313,11 @@ void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, FermionField Atilde(B.Grid()); Atilde = A; + DerivCommTime-=usecond(); st.HaloExchange(B, compressor); + DerivCommTime+=usecond(); + DerivComputeTime-=usecond(); for (int mu = 0; mu < Nd; mu++) { //////////////////////////////////////////////////////////////////////// // Flip gamma (1+g)<->(1-g) if dag @@ -238,6 +325,7 @@ void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, int gamma = mu; if (!dag) gamma += Nd; + DerivDhopComputeTime -= usecond(); int Ls=1; Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, B.Grid()->oSites(), B, Btilde, mu, gamma); @@ -245,7 +333,9 @@ void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, // spin trace outer product ////////////////////////////////////////////////// Impl::InsertForce4D(mat, Btilde, Atilde, mu); + DerivDhopComputeTime += usecond(); } + DerivComputeTime += usecond(); } template @@ -265,7 +355,7 @@ void WilsonFermion::DhopDerivOE(GaugeField &mat, const FermionField 
&U, co conformable(U.Grid(), V.Grid()); //conformable(U.Grid(), mat.Grid()); not general, leaving as a comment (Guido) // Motivation: look at the SchurDiff operator - + assert(V.Checkerboard() == Even); assert(U.Checkerboard() == Odd); mat.Checkerboard() = Odd; @@ -288,6 +378,7 @@ void WilsonFermion::DhopDerivEO(GaugeField &mat, const FermionField &U, co template void WilsonFermion::Dhop(const FermionField &in, FermionField &out, int dag) { + DhopCalls+=2; conformable(in.Grid(), _grid); // verifies full grid conformable(in.Grid(), out.Grid()); @@ -298,6 +389,7 @@ void WilsonFermion::Dhop(const FermionField &in, FermionField &out, int da template void WilsonFermion::DhopOE(const FermionField &in, FermionField &out, int dag) { + DhopCalls+=1; conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), out.Grid()); // drops the cb check @@ -309,6 +401,7 @@ void WilsonFermion::DhopOE(const FermionField &in, FermionField &out, int template void WilsonFermion::DhopEO(const FermionField &in, FermionField &out,int dag) { + DhopCalls+=1; conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), out.Grid()); // drops the cb check @@ -319,18 +412,18 @@ void WilsonFermion::DhopEO(const FermionField &in, FermionField &out,int d } template -void WilsonFermion::Mdir(const FermionField &in, FermionField &out, int dir, int disp) +void WilsonFermion::Mdir(const FermionField &in, FermionField &out, int dir, int disp) { DhopDir(in, out, dir, disp); } template -void WilsonFermion::MdirAll(const FermionField &in, std::vector &out) +void WilsonFermion::MdirAll(const FermionField &in, std::vector &out) { DhopDirAll(in, out); } template -void WilsonFermion::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) +void WilsonFermion::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) { Compressor compressor(DaggerNo); Stencil.HaloExchange(in, compressor); @@ -342,12 +435,12 @@ void WilsonFermion::DhopDir(const FermionField &in, FermionField &out, int DhopDirCalc(in, out, dirdisp, gamma, DaggerNo); }; template -void WilsonFermion::DhopDirAll(const FermionField &in, std::vector &out) +void WilsonFermion::DhopDirAll(const FermionField &in, std::vector &out) { Compressor compressor(DaggerNo); Stencil.HaloExchange(in, compressor); - assert((out.size()==8)||(out.size()==9)); + assert((out.size()==8)||(out.size()==9)); for(int dir=0;dir::DhopDirAll(const FermionField &in, std::vector -void WilsonFermion::DhopDirCalc(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) +void WilsonFermion::DhopDirCalc(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) { int Ls=1; uint64_t Nsite=in.oSites(); @@ -371,15 +464,16 @@ template void WilsonFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, - FermionField &out, int dag) + FermionField &out, int dag) { + DhopTotalTime-=usecond(); #ifdef GRID_OMP if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) DhopInternalOverlappedComms(st,lo,U,in,out,dag); else -#endif +#endif DhopInternalSerial(st,lo,U,in,out,dag); - + DhopTotalTime+=usecond(); } template @@ -397,38 +491,53 @@ void WilsonFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO ///////////////////////////// std::vector > requests; st.Prepare(); + DhopFaceTime-=usecond(); st.HaloGather(in,compressor); + DhopFaceTime+=usecond(); + + DhopCommTime -=usecond(); st.CommunicateBegin(requests); ///////////////////////////// // Overlap with 
comms ///////////////////////////// + DhopFaceTime-=usecond(); st.CommsMergeSHM(compressor); + DhopFaceTime+=usecond(); ///////////////////////////// // do the compute interior ///////////////////////////// int Opt = WilsonKernelsStatic::Opt; + DhopComputeTime-=usecond(); if (dag == DaggerYes) { Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0); } else { Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0); - } + } + DhopComputeTime+=usecond(); ///////////////////////////// // Complete comms ///////////////////////////// st.CommunicateComplete(requests); + DhopCommTime +=usecond(); + + DhopFaceTime-=usecond(); st.CommsMerge(compressor); + DhopFaceTime+=usecond(); ///////////////////////////// // do the compute exterior ///////////////////////////// + + DhopComputeTime2-=usecond(); if (dag == DaggerYes) { Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1); } else { Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1); } + DhopComputeTime2+=usecond(); }; @@ -439,20 +548,24 @@ void WilsonFermion::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, FermionField &out, int dag) { assert((dag == DaggerNo) || (dag == DaggerYes)); Compressor compressor(dag); + DhopCommTime-=usecond(); st.HaloExchange(in, compressor); + DhopCommTime+=usecond(); + DhopComputeTime-=usecond(); int Opt = WilsonKernelsStatic::Opt; if (dag == DaggerYes) { Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out); } else { Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out); } + DhopComputeTime+=usecond(); }; /*Change ends */ /******************************************************************************* * Conserved current utilities for Wilson fermions, for contracting propagators - * to make a conserved current sink or inserting the conserved current + * to make a conserved current sink or inserting the conserved current * sequentially. ******************************************************************************/ template @@ -493,11 +606,11 @@ void WilsonFermion::ContractConservedCurrent(PropagatorField &q_in_1, template -void WilsonFermion::SeqConservedCurrent(PropagatorField &q_in, +void WilsonFermion::SeqConservedCurrent(PropagatorField &q_in, PropagatorField &q_out, Current curr_type, unsigned int mu, - unsigned int tmin, + unsigned int tmin, unsigned int tmax, ComplexField &lattice_cmplx) { @@ -535,24 +648,24 @@ void WilsonFermion::SeqConservedCurrent(PropagatorField &q_in, Integer timeSlices = Reduce(t_mask()); if (timeSlices > 0) { - Kernels::SeqConservedCurrentSiteFwd(tmpFwd_v[sU], - q_out_v[sU], + Kernels::SeqConservedCurrentSiteFwd(tmpFwd_v[sU], + q_out_v[sU], Umu_v, sU, mu, t_mask); } // Repeat for backward direction. 
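      // Worked example for the shifted window below (illustrative numbers:
      // global time extent LLt = 8, tmin = 0, tmax = 7, time-direction
      // current so tshift = 1). The backward mask selects coords in [1, 8]:
      //   t = 5: (5 >= 1) && (5 <= 8) -> true
      //   t = 0: (0 >= 1)             -> false
      // so the timeslice that wrapped around the boundary is only recovered
      // by the extra (coords_v[sU] == t0) clause guarded by tmax == LLt-1
      // and tshift == 1.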
- t_mask() = ((coords_v[sU] >= (tmin + tshift)) && + t_mask() = ((coords_v[sU] >= (tmin + tshift)) && (coords_v[sU] <= (tmax + tshift))); - - //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3) + + //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3) unsigned int t0 = 0; if((tmax==LLt-1) && (tshift==1)) t_mask() = (t_mask() || (coords_v[sU] == t0 )); - + timeSlices = Reduce(t_mask()); if (timeSlices > 0) { - Kernels::SeqConservedCurrentSiteBwd(tmpBwd_v[sU], - q_out_v[sU], + Kernels::SeqConservedCurrentSiteBwd(tmpBwd_v[sU], + q_out_v[sU], Umu_v, sU, mu, t_mask); } }); From 91c81cab30033d178b3f801258630b0fbb89c502 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 29 May 2020 18:19:22 +0200 Subject: [PATCH 115/147] some corrections; compiles on my laptop; untested --- .../WilsonFermionImplementation.h | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h index 951486f2..cb852a63 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h @@ -73,10 +73,10 @@ WilsonFermion::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, template void WilsonFermion::Report(void) { - RealD NP = _FourDimGrid->_Nprocessors; - RealD NN = _FourDimGrid->NodeCount(); - RealD volume = Ls; - Coordinate latt = _FourDimGrid->GlobalDimensions(); + RealD NP = _grid->_Nprocessors; + RealD NN = _grid->NodeCount(); + RealD volume = 1; + Coordinate latt = _grid->GlobalDimensions(); for(int mu=0;mu 0 ) { @@ -89,14 +89,14 @@ void WilsonFermion::Report(void) std::cout << GridLogMessage << "WilsonFermion ComputeTime2/Calls : " << DhopComputeTime2/ DhopCalls << " us" << std::endl; // Average the compute time - _FourDimGrid->GlobalSum(DhopComputeTime); + _grid->GlobalSum(DhopComputeTime); DhopComputeTime/=NP; - RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting + RealD mflops = 1320*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl; - RealD Fullmflops = 1344*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting + RealD Fullmflops = 1320*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl; @@ -110,13 +110,15 @@ void WilsonFermion::Report(void) std::cout << GridLogMessage << "WilsonFermion ComputeTime/Calls : " < 0 || DhopCalls > 0){ std::cout << GridLogMessage << "WilsonFermion Stencil" < Date: Fri, 29 May 2020 18:44:00 +0200 Subject: [PATCH 116/147] more mods; counters seem to work correctly --- benchmarks/Benchmark_wilson.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/Benchmark_wilson.cc b/benchmarks/Benchmark_wilson.cc index 1cd616cb..b1011b05 100644 --- a/benchmarks/Benchmark_wilson.cc +++ 
b/benchmarks/Benchmark_wilson.cc @@ -156,7 +156,7 @@ int main (int argc, char ** argv) // Counters Dw.ZeroCounters(); - FGrid->Barrier(); + Grid.Barrier(); double t0=usecond(); for(int i=0;iBarrier(); - Dw.report(); + Grid.Barrier(); + Dw.Report(); // guard From 936071773e2eef176b5a154ba4b44d9a6349ab69 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 29 May 2020 22:15:59 +0200 Subject: [PATCH 117/147] correct throughput in wilson and dwf --- benchmarks/Benchmark_dwf.cc | 15 +++++++++------ benchmarks/Benchmark_wilson.cc | 8 ++++++-- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index 53630650..728cfd76 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -201,11 +201,14 @@ int main (int argc, char ** argv) double volume=Ls; for(int mu=0;mu Date: Sat, 30 May 2020 10:55:17 +0200 Subject: [PATCH 118/147] update calculation of data --- benchmarks/Benchmark_dwf.cc | 12 ++++++------ benchmarks/Benchmark_wilson.cc | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index 728cfd76..b76e58c8 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -204,11 +204,11 @@ int main (int argc, char ** argv) auto nsimd = vComplex::Nsimd(); auto simdwidth = sizeof(vComplex); - // RF: Nd Wilson * Ls, Nc colors - double data_L1 = (volume * ((2*Nd+1)*Nd*Nc + (2*Nd)*Nc*Nc) * simdwidth / nsimd * ncall) / (1024.*1024.*1024.); + // RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors + double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.); - // mem: Nd+1 fermion, Nd gauge, Nc colors - double data_L2 = (volume * (2*(Nd+1)+1)*Nd*Nc * simdwidth / nsimd * ncall + (volume/Ls) *(2*Nd)*Nc*Nc * simdwidth / nsimd) / (1024.*1024.*1024.); + // mem: Nd Wilson * Ls, Nd gauge, Nc colors + double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.); std::cout< Date: Tue, 2 Jun 2020 10:32:44 +0200 Subject: [PATCH 119/147] use fcadd in TimesI and TimesMinusI instead of tbl and neg --- Grid/simd/Grid_a64fx-fixedsize.h | 46 ++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 2a6533fe..200fbe0f 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -419,6 +419,7 @@ struct Conj{ } }; +/* struct TimesMinusI{ // Complex float inline vecf operator()(vecf a, vecf b){ @@ -441,7 +442,29 @@ struct TimesMinusI{ return svneg_m(a_v, pg_odd, a_v); } }; +*/ +// alternative implementation using fcadd +// this is not optimal because we have op1 = op2 + TimesMinusI(op3) etc +// ideally we have AddTimesMinusI(op1,op2,op3) +struct TimesMinusI{ + // Complex float + inline vecf operator()(vecf a, vecf b){ + pred pg1 = acle::pg1(); + vecf z_v = acle::zero(); + + return svcadd_x(pred, zero, a, 270); + } + // Complex double + inline vecd operator()(vecd a, vecd b){ + pred pg1 = acle::pg1(); + vecd z_v = acle::zero(); + + return svcadd_x(pred, zero, a, 270); + } +}; + +/* struct TimesI{ // Complex float inline vecf operator()(vecf a, vecf b){ @@ -464,6 +487,29 @@ struct TimesI{ return svneg_m(a_v, pg_even, a_v); } }; +*/ + +// alternative implementation using fcadd +// this is not optimal because we have op1 = op2 + TimesI(op3) etc +// ideally we have AddTimesI(op1,op2,op3) +struct TimesI{ + // Complex float + inline vecf 
operator()(vecf a, vecf b){ + pred pg1 = acle::pg1(); + vecf z_v = acle::zero(); + + return svcadd_x(pred, zero, a, 90); + } + // Complex double + inline vecd operator()(vecd a, vecd b){ + pred pg1 = acle::pg1(); + vecd z_v = acle::zero(); + + return svcadd_x(pred, zero, a, 90); + } +}; + + struct PrecisionChange { static inline vech StoH (vecf sa, vecf sb) { From b4735c99041f16a64ed654aaa39da90facbddd47 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Tue, 2 Jun 2020 10:38:05 +0200 Subject: [PATCH 120/147] correct zero in svcadd --- Grid/simd/Grid_a64fx-fixedsize.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 200fbe0f..db26da46 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -453,14 +453,14 @@ struct TimesMinusI{ pred pg1 = acle::pg1(); vecf z_v = acle::zero(); - return svcadd_x(pred, zero, a, 270); + return svcadd_x(pred, z_v, a, 270); } // Complex double inline vecd operator()(vecd a, vecd b){ pred pg1 = acle::pg1(); vecd z_v = acle::zero(); - return svcadd_x(pred, zero, a, 270); + return svcadd_x(pred, z_v, a, 270); } }; @@ -498,14 +498,14 @@ struct TimesI{ pred pg1 = acle::pg1(); vecf z_v = acle::zero(); - return svcadd_x(pred, zero, a, 90); + return svcadd_x(pred, z_v, a, 90); } // Complex double inline vecd operator()(vecd a, vecd b){ pred pg1 = acle::pg1(); vecd z_v = acle::zero(); - return svcadd_x(pred, zero, a, 90); + return svcadd_x(pred, z_v, a, 90); } }; From 71cf9851e716b8312872c4407cc99618052b87b0 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Tue, 2 Jun 2020 10:44:15 +0200 Subject: [PATCH 121/147] correct type for vecd in TimesI and TimesMinusI --- Grid/simd/Grid_a64fx-fixedsize.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index db26da46..193e11da 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -457,8 +457,8 @@ struct TimesMinusI{ } // Complex double inline vecd operator()(vecd a, vecd b){ - pred pg1 = acle::pg1(); - vecd z_v = acle::zero(); + pred pg1 = acle::pg1(); + vecd z_v = acle::zero(); return svcadd_x(pred, z_v, a, 270); } @@ -502,8 +502,8 @@ struct TimesI{ } // Complex double inline vecd operator()(vecd a, vecd b){ - pred pg1 = acle::pg1(); - vecd z_v = acle::zero(); + pred pg1 = acle::pg1(); + vecd z_v = acle::zero(); return svcadd_x(pred, z_v, a, 90); } From 7bee4ebb54e6fc9a27fbe599a6df4b7035cd8947 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Tue, 2 Jun 2020 10:51:39 +0200 Subject: [PATCH 122/147] correct predication for svcadd --- Grid/simd/Grid_a64fx-fixedsize.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 193e11da..6f27a4ec 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -453,14 +453,14 @@ struct TimesMinusI{ pred pg1 = acle::pg1(); vecf z_v = acle::zero(); - return svcadd_x(pred, z_v, a, 270); + return svcadd_x(pg1, z_v, a, 270); } // Complex double inline vecd operator()(vecd a, vecd b){ pred pg1 = acle::pg1(); vecd z_v = acle::zero(); - return svcadd_x(pred, z_v, a, 270); + return svcadd_x(pg1, z_v, a, 270); } }; @@ -498,14 +498,14 @@ struct TimesI{ pred pg1 = acle::pg1(); vecf z_v = acle::zero(); - return svcadd_x(pred, z_v, a, 90); + return svcadd_x(pg1, z_v, a, 90); } // Complex double inline vecd operator()(vecd a, vecd b){ pred pg1 = 
acle::pg1(); vecd z_v = acle::zero(); - return svcadd_x(pred, z_v, a, 90); + return svcadd_x(pg1, z_v, a, 90); } }; From 5050833b42196629438ed674fee22b02e35d61fa Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Tue, 2 Jun 2020 13:08:57 +0200 Subject: [PATCH 123/147] revert changes due to performance penalty in Wilson using MPI --- Grid/simd/Grid_a64fx-fixedsize.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 6f27a4ec..602d56f6 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -419,7 +419,6 @@ struct Conj{ } }; -/* struct TimesMinusI{ // Complex float inline vecf operator()(vecf a, vecf b){ @@ -442,11 +441,14 @@ struct TimesMinusI{ return svneg_m(a_v, pg_odd, a_v); } }; -*/ // alternative implementation using fcadd // this is not optimal because we have op1 = op2 + TimesMinusI(op3) etc // ideally we have AddTimesMinusI(op1,op2,op3) +// +// makes performance worse in Benchmark_wilson using MPI +// increases halogtime and gathertime +/* struct TimesMinusI{ // Complex float inline vecf operator()(vecf a, vecf b){ @@ -463,8 +465,8 @@ struct TimesMinusI{ return svcadd_x(pg1, z_v, a, 270); } }; +*/ -/* struct TimesI{ // Complex float inline vecf operator()(vecf a, vecf b){ @@ -487,11 +489,15 @@ struct TimesI{ return svneg_m(a_v, pg_even, a_v); } }; -*/ + // alternative implementation using fcadd // this is not optimal because we have op1 = op2 + TimesI(op3) etc // ideally we have AddTimesI(op1,op2,op3) +// +// makes performance worse in Benchmark_wilson using MPI +// increases halogtime and gathertime +/* struct TimesI{ // Complex float inline vecf operator()(vecf a, vecf b){ @@ -508,7 +514,7 @@ struct TimesI{ return svcadd_x(pg1, z_v, a, 90); } }; - +*/ struct PrecisionChange { From 5ee3ea2144bafee061a883b9aa855e4a78848e8a Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Wed, 3 Jun 2020 11:58:20 +0200 Subject: [PATCH 124/147] round-up after testing of prefetches in stencil close --- .../WilsonKernelsAsmBodyA64FX.h | 10 ---------- .../WilsonKernelsImplementation.h | 3 +++ Grid/simd/Grid_a64fx-fixedsize.h | 19 +++++++++++++++++++ Grid/stencil/Stencil.h | 19 +++++++++++++++++++ 4 files changed, 41 insertions(+), 10 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h index 406e5c25..cebb4327 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h @@ -164,12 +164,7 @@ Author: Nils Meyer Regensburg University if((!local)&&(!st.same_node[Dir]) ) { \ LOAD_CHI(base); \ MULT_2SPIN_1(Dir); \ - PREFETCH_CHIMU(base); \ - /* PREFETCH_GAUGE_L1(NxtDir); */ \ MULT_2SPIN_2; \ - if (s == 0) { \ - if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ - } \ RECON; \ nmu++; \ } @@ -180,12 +175,7 @@ Author: Nils Meyer Regensburg University if((!local)&&(!st.same_node[Dir]) ) { \ LOAD_CHI(base); \ MULT_2SPIN_1(Dir); \ - PREFETCH_CHIMU(base); \ - /* PREFETCH_GAUGE_L1(NxtDir); */ \ MULT_2SPIN_2; \ - if (s == 0) { \ - if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ - } \ RECON; \ nmu++; \ } diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 81216e03..348f1425 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ 
b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -445,18 +445,21 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField #ifndef GRID_NVCC if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); /* printf("."); */ return;} + //if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); printf("."); return;} #endif } else if( interior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;} #ifndef GRID_NVCC if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); /* printf("-"); */ return;} + //if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); printf("-"); return;} #endif } else if( exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;} #ifndef GRID_NVCC if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); /* printf("+"); */ return;} + //if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); printf("+"); return;} #endif } assert(0 && " Kernel optimisation case not covered "); diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 602d56f6..95e45759 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -799,6 +799,25 @@ typedef veci SIMD_Itype; // Integer type // prefetch utilities inline void v_prefetch0(int size, const char *ptr){}; + +/* PF 256 worse than PF 64 +inline void prefetch_HINT_T0(const char *ptr){ + static int64_t last_ptr; + int64_t vptr = reinterpret_cast(ptr) & 0x7fffffffffffff00ll; + if (last_ptr != vptr) { + last_ptr = vptr; + pred pg1 = Optimization::acle::pg1(); + svprfd(pg1, reinterpret_cast(ptr), SV_PLDL1STRM); + svprfd(pg1, ptr, SV_PLDL1STRM); + } +}; +*/ +/* beneficial for operators? 
+inline void prefetch_HINT_T0(const char *ptr){ + pred pg1 = Optimization::acle::pg1(); + svprfd(pg1, ptr, SV_PLDL1STRM); +}; +*/ inline void prefetch_HINT_T0(const char *ptr){}; // Function name aliases diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index a56d256d..1f1ebbb2 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -68,8 +68,27 @@ void Gather_plane_simple_table (Vector >& table,const Lattice int num=table.size(); std::pair *table_v = & table[0]; auto rhs_v = rhs.View(); + + // main loop accelerator_forNB( i,num, vobj::Nsimd(), { typedef decltype(coalescedRead(buffer[0])) compressed_t; + // prefetching: + // +1% performance for Wilson on 32**4 + // -2% performance for DW on 24**4 x 12 + /* + const int dist = 2; + if (i+dist < num){ + svbool_t pg1 = svptrue_b64(); + + // prefetch input + auto in = rhs_v(so+table_v[i+dist].second); + svprfd(pg1, (char*)&in, SV_PLDL1STRM); + + // prefetch store buffer + uint64_t o = table_v[i+dist].first; + svprfd(pg1, (char*)&buffer[off+o], SV_PSTL1STRM); + } +*/ compressed_t tmp_c; uint64_t o = table_v[i].first; compress.Compress(&tmp_c,0,rhs_v(so+table_v[i].second)); From 9872c768250ea390c3e6a1353bbc838cf1cacee4 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Wed, 3 Jun 2020 15:20:13 +0200 Subject: [PATCH 125/147] introduce AddTimesI and SubTimesI; slight benefit in operators, but < 1%; breaks all other impls --- Grid/qcd/spin/TwoSpinor.h | 128 +++++++++++++++++-------------- Grid/simd/Grid_a64fx-fixedsize.h | 41 ++++++++-- Grid/simd/Grid_vector_types.h | 43 ++++++++++- Grid/tensors/Tensor_reality.h | 122 +++++++++++++++++++++++++---- 4 files changed, 256 insertions(+), 78 deletions(-) diff --git a/Grid/qcd/spin/TwoSpinor.h b/Grid/qcd/spin/TwoSpinor.h index 924594ab..24ca54af 100644 --- a/Grid/qcd/spin/TwoSpinor.h +++ b/Grid/qcd/spin/TwoSpinor.h @@ -1,6 +1,6 @@ /************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid + Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/qcd/spin/TwoSpinor.h @@ -33,7 +33,7 @@ NAMESPACE_BEGIN(Grid); ////////////////////////////////////////////////////////////////////////////////////////////////////// -// Normalisation alert; the g5 project is 1/2(1+-G5) +// Normalisation alert; the g5 project is 1/2(1+-G5) // the xyzt projects are (1+-Gxyzt) // // * xyzt project @@ -59,7 +59,7 @@ NAMESPACE_BEGIN(Grid); // // Both four spinor and two spinor result variants are provided. // -// The four spinor project will be recursively provided to Lattice wide routines, and likely used in +// The four spinor project will be recursively provided to Lattice wide routines, and likely used in // the domain wall and mobius implementations. 
// ////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -74,13 +74,17 @@ NAMESPACE_BEGIN(Grid); // To fail is not to err (Cryptic clue: suggest to Google SFINAE ;) ) template > = 0> accelerator_inline void spProjXp (iVector &hspin,const iVector &fspin) { - hspin(0)=fspin(0)+timesI(fspin(3)); - hspin(1)=fspin(1)+timesI(fspin(2)); + //hspin(0)=fspin(0)+timesI(fspin(3)); + //hspin(1)=fspin(1)+timesI(fspin(2)); + hspin(0)=addTimesI(fspin(0), fspin(3)); + hspin(1)=addTimesI(fspin(1), fspin(2)); } template > = 0> accelerator_inline void spProjXm (iVector &hspin,const iVector &fspin) { - hspin(0)=fspin(0)-timesI(fspin(3)); - hspin(1)=fspin(1)-timesI(fspin(2)); + //hspin(0)=fspin(0)-timesI(fspin(3)); + //hspin(1)=fspin(1)-timesI(fspin(2)); + hspin(0)=subTimesI(fspin(0), fspin(3)); + hspin(1)=subTimesI(fspin(1), fspin(2)); } // 0 0 0 -1 [0] -+ [3] @@ -105,14 +109,18 @@ template > = 0> accelerator_inline void s */ template > = 0> accelerator_inline void spProjZp (iVector &hspin,const iVector &fspin) { - hspin(0)=fspin(0)+timesI(fspin(2)); - hspin(1)=fspin(1)-timesI(fspin(3)); + //hspin(0)=fspin(0)+timesI(fspin(2)); + //hspin(1)=fspin(1)-timesI(fspin(3)); + hspin(0)=addTimesI(fspin(0), fspin(2)); + hspin(1)=subTimesI(fspin(1), fspin(3)); } template > = 0> accelerator_inline void spProjZm (iVector &hspin,const iVector &fspin) { //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - hspin(0)=fspin(0)-timesI(fspin(2)); - hspin(1)=fspin(1)+timesI(fspin(3)); + //hspin(0)=fspin(0)-timesI(fspin(2)); + //hspin(1)=fspin(1)+timesI(fspin(3)); + hspin(0)=subTimesI(fspin(0), fspin(2)); + hspin(1)=addTimesI(fspin(1), fspin(3)); } /*Gt * 0 0 1 0 [0]+-[2] @@ -133,8 +141,8 @@ template > = 0> accelerator_inline void s hspin(1)=fspin(1)-fspin(3); } /*G5 - * 1 0 0 0 - * 0 1 0 0 + * 1 0 0 0 + * 0 1 0 0 * 0 0 -1 0 * 0 0 0 -1 */ @@ -152,7 +160,7 @@ template > = 0> accelerator_inline void s hspin(0)=fspin(2); hspin(1)=fspin(3); } - + // template accelerator_inline void fspProj5p (iVector &rfspin,const iVector &fspin) template > = 0> accelerator_inline void spProj5p (iVector &rfspin,const iVector &fspin) { @@ -202,16 +210,20 @@ template > = 0> accelerator_inline void a //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); - fspin(2)-=timesI(hspin(1)); - fspin(3)-=timesI(hspin(0)); + //fspin(2)-=timesI(hspin(1)); + //fspin(3)-=timesI(hspin(0)); + fspin(2)=subTimesI(fspin(2), hspin(1)); + fspin(3)=subTimesI(fspin(3), hspin(0)); } template > = 0> accelerator_inline void accumReconXm (iVector &fspin,const iVector &hspin) { //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); - fspin(2)+=timesI(hspin(1)); - fspin(3)+=timesI(hspin(0)); + //fspin(2)+=timesI(hspin(1)); + //fspin(3)+=timesI(hspin(0)); + fspin(2)=addTimesI(fspin(2), hspin(1)); + fspin(3)=addTimesI(fspin(3), hspin(0)); } // 0 0 0 -1 [0] -+ [3] @@ -279,16 +291,20 @@ template > = 0> accelerator_inline void a //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); - fspin(2)-=timesI(hspin(0)); - fspin(3)+=timesI(hspin(1)); + //fspin(2)-=timesI(hspin(0)); + //fspin(3)+=timesI(hspin(1)); + fspin(2)=subTimesI(fspin(2), hspin(0)); + fspin(3)=addTimesI(fspin(3), hspin(1)); } template > = 0> accelerator_inline void accumReconZm (iVector &fspin,const iVector &hspin) { //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; 
fspin(0)+=hspin(0); fspin(1)+=hspin(1); - fspin(2)+=timesI(hspin(0)); - fspin(3)-=timesI(hspin(1)); + //fspin(2)+=timesI(hspin(0)); + //fspin(3)-=timesI(hspin(1)); + fspin(2)=addTimesI(fspin(2), hspin(0)); + fspin(3)=subTimesI(fspin(3), hspin(1)); } /*Gt * 0 0 1 0 [0]+-[2] @@ -329,8 +345,8 @@ template > = 0> accelerator_inline void a fspin(3)-=hspin(1); } /*G5 - * 1 0 0 0 - * 0 1 0 0 + * 1 0 0 0 + * 0 1 0 0 * 0 0 -1 0 * 0 0 0 -1 */ @@ -383,7 +399,7 @@ template accelerator_inline void spProjXp (iScalar accelerator_inline void spProjXp (iMatrix &hspin,const iMatrix &fspin) { - for(int i=0;i > = 0> accel } template accelerator_inline void spReconXp (iMatrix &hspin,const iMatrix &fspin) { - for(int i=0;i > = 0> accel } template accelerator_inline void accumReconXp (iMatrix &hspin,const iMatrix &fspin) { - for(int i=0;i > = 0> accel template accelerator_inline void spProjXm (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spReconXm (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void accumReconXm (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spProjYp (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spReconYp (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void accumReconYp (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spProjYm (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spReconYm (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void accumReconYm (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spProjZp (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spReconZp (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void accumReconZp (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spProjZm (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spReconZm (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void accumReconZm (iMatrix &hspin,const iMatrix &fspin) { //typename 
std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spProjTp (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spReconTp (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void accumReconTp (iMatrix &hspin, const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spProjTm (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spReconTm (iMatrix &hspin, const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void accumReconTm (iMatrix &hspin, const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spProj5p (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spRecon5p (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void accumRecon5p (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accelerator_inlin template accelerator_inline void spProj5p (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel } template accelerator_inline void spProj5m (iMatrix &hspin,const iMatrix &fspin) { - for(int i=0;i > = 0> accel template accelerator_inline void spRecon5m (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void accumRecon5m (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accelerator_inlin template accelerator_inline void spProj5m (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i::pg1(); + return svcadd_x(pg1, a, b, 90); + } + // Complex double + inline vecd operator()(vecd a, vecd b){ + pred pg1 = acle::pg1(); + return svcadd_x(pg1, a, b, 90); + } +}; +// a - i * b +struct SubTimesI{ + // Complex float + inline vecf operator()(vecf a, vecf b){ + pred pg1 = acle::pg1(); + return svcadd_x(pg1, a, b, 270); + } + // Complex double + inline vecd operator()(vecd a, vecd b){ + pred pg1 = acle::pg1(); + return svcadd_x(pg1, a, b, 270); + } +}; + struct TimesI{ // Complex float inline vecf operator()(vecf a, vecf b){ @@ -493,7 +521,7 @@ struct TimesI{ // alternative implementation using fcadd // this is not optimal because we have op1 = op2 + TimesI(op3) etc -// ideally we have AddTimesI(op1,op2,op3) +// ideally we have op1 = AddTimesI(op2,op3) // // makes performance worse in Benchmark_wilson using MPI // increases halogtime and gathertime @@ -800,7 +828,7 @@ typedef veci SIMD_Itype; // Integer type // 
prefetch utilities inline void v_prefetch0(int size, const char *ptr){}; -/* PF 256 worse than PF 64 +/* PF 256 inline void prefetch_HINT_T0(const char *ptr){ static int64_t last_ptr; int64_t vptr = reinterpret_cast(ptr) & 0x7fffffffffffff00ll; @@ -812,7 +840,7 @@ inline void prefetch_HINT_T0(const char *ptr){ } }; */ -/* beneficial for operators? +/* PF 64 inline void prefetch_HINT_T0(const char *ptr){ pred pg1 = Optimization::acle::pg1(); svprfd(pg1, ptr, SV_PLDL1STRM); @@ -839,5 +867,8 @@ typedef Optimization::MaddRealPart MaddRealPartSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesI TimesISIMD; +typedef Optimization::AddTimesI AddTimesISIMD; +typedef Optimization::SubTimesI SubTimesISIMD; + NAMESPACE_END(Grid); diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index e1eb330d..c1cb5770 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -298,7 +298,7 @@ public: // FIXME -- alias this to an accelerator_inline MAC struct. - // FIXME VLA build error +// specialize mac for A64FX #if defined(A64FX) || defined(A64FXFIXEDSIZE) friend accelerator_inline void mac(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ a, @@ -894,6 +894,47 @@ accelerator_inline Grid_simd timesI(const Grid_simd &in) { return in; } +// ----------------------------------------------------------------------------- + +// SVE only +/////////////////////// +// AddTimesI +/////////////////////// +template = 0> +accelerator_inline void addTimesI(Grid_simd &ret, const Grid_simd &in1, const Grid_simd &in2) { + ret.v = binary(in1.v, in2.v, AddTimesISIMD()); +} +template = 0> +accelerator_inline Grid_simd addTimesI(const Grid_simd &in1, const Grid_simd &in2) { + Grid_simd ret; + ret = addTimesI(in1, in2); + return ret; +} +template = 0> +accelerator_inline Grid_simd addTimesI(const Grid_simd &in1, const Grid_simd &in2) { + return in1; +} +/////////////////////// +// SubTimesI +/////////////////////// +template = 0> +accelerator_inline void subTimesI(Grid_simd &ret, const Grid_simd &in1, const Grid_simd &in2) { + ret.v = binary(in1.v, in2.v, SubTimesISIMD()); +} +template = 0> +accelerator_inline Grid_simd subTimesI(const Grid_simd &in1, const Grid_simd &in2) { + Grid_simd ret; + ret = subTimesI(in1, in2); + return ret; +} +template = 0> +accelerator_inline Grid_simd subTimesI(const Grid_simd &in1, const Grid_simd &in2) { + return in1; +} + +// end SVE +// ----------------------------------------------------------------------------- + ///////////////////// // Inner, outer ///////////////////// diff --git a/Grid/tensors/Tensor_reality.h b/Grid/tensors/Tensor_reality.h index ca1d52ca..5fc7a0f2 100644 --- a/Grid/tensors/Tensor_reality.h +++ b/Grid/tensors/Tensor_reality.h @@ -1,6 +1,6 @@ /************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid + Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/tensors/Tensor_reality.h @@ -31,16 +31,16 @@ Author: neo NAMESPACE_BEGIN(Grid); -/////////////////////////////////////////////// +/////////////////////////////////////////////// // multiply by I; make recursive. 
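// --- Editor's sketch (not part of the patch) --------------------------------
// Minimal, self-contained illustration of the fcadd semantics the new
// AddTimesI/SubTimesI functors and the addTimesI/subTimesI templates above
// rely on: svcadd with a 90-degree rotation computes a + i*b on interleaved
// re/im pairs in a single instruction, and 270 degrees gives a - i*b.
// Assumes arm_sve.h and an SVE-enabled compiler; all identifiers below are
// illustrative, not Grid API.
#include <arm_sve.h>

// out[k] = a[k] + i*b[k] on interleaved complex doubles; n2 counts doubles.
static void add_times_i(double* out, const double* a, const double* b, long n2) {
  for (long k = 0; k < n2; k += svcntd()) {
    svbool_t pg = svwhilelt_b64(k, n2);            // predicate handles the tail
    svfloat64_t av = svld1(pg, a + k);
    svfloat64_t bv = svld1(pg, b + k);
    svst1(pg, out + k, svcadd_x(pg, av, bv, 90));  // rotation 270 -> a - i*b
  }
}
// One fcadd replaces an add plus a real/imaginary shuffle per vector.
// -----------------------------------------------------------------------------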
-/////////////////////////////////////////////// -template accelerator_inline iScalar timesI(const iScalar&r) +/////////////////////////////////////////////// +template accelerator_inline iScalar timesI(const iScalar&r) { iScalar ret; timesI(ret._internal,r._internal); return ret; } -template accelerator_inline iVector timesI(const iVector&r) +template accelerator_inline iVector timesI(const iVector&r) { iVector ret; for(int i=0;i accelerator_inline iMatrix timesI(const iMa return ret; } -template accelerator_inline void timesI(iScalar &ret,const iScalar&r) +template accelerator_inline void timesI(iScalar &ret,const iScalar&r) { timesI(ret._internal,r._internal); } -template accelerator_inline void timesI(iVector &ret,const iVector&r) +template accelerator_inline void timesI(iVector &ret,const iVector&r) { for(int i=0;i accelerator_inline void timesI(iMatrix &re } -template accelerator_inline iScalar timesMinusI(const iScalar&r) +template accelerator_inline iScalar timesMinusI(const iScalar&r) { iScalar ret; timesMinusI(ret._internal,r._internal); return ret; } -template accelerator_inline iVector timesMinusI(const iVector&r) +template accelerator_inline iVector timesMinusI(const iVector&r) { iVector ret; for(int i=0;i accelerator_inline iMatrix timesMinusI(cons return ret; } -template accelerator_inline void timesMinusI(iScalar &ret,const iScalar&r) +template accelerator_inline void timesMinusI(iScalar &ret,const iScalar&r) { timesMinusI(ret._internal,r._internal); } -template accelerator_inline void timesMinusI(iVector &ret,const iVector&r) +template accelerator_inline void timesMinusI(iVector &ret,const iVector&r) { for(int i=0;i accelerator_inline void timesMinusI(iMatrix accelerator_inline iScalar addTimesI(const iScalar&r1, const iScalar&r2) +{ + iScalar ret; + addTimesI(ret._internal,r1._internal,r2._internal); + return ret; +} +template accelerator_inline iVector addTimesI(const iVector&r1, const iVector&r2) +{ + iVector ret; + for(int i=0;i accelerator_inline iMatrix addTimesI(const iMatrix&r1, const iMatrix&r2) +{ + iMatrix ret; + for(int i=0;i accelerator_inline void addTimesI(iScalar &ret,const iScalar&r1,const iScalar&r2) +{ + addTimesI(ret._internal,r1._internal,r2._internal); +} +template accelerator_inline void addTimesI(iVector &ret,const iVector&r1,const iVector&r2) +{ + for(int i=0;i accelerator_inline void addTimesI(iMatrix &ret,const iMatrix&r1,const iMatrix&r2) +{ + for(int i=0;i accelerator_inline iScalar subTimesI(const iScalar&r1, const iScalar&r2) +{ + iScalar ret; + subTimesI(ret._internal,r1._internal,r2._internal); + return ret; +} +template accelerator_inline iVector subTimesI(const iVector&r1, const iVector&r2) +{ + iVector ret; + for(int i=0;i accelerator_inline iMatrix subTimesI(const iMatrix&r1, const iMatrix&r2) +{ + iMatrix ret; + for(int i=0;i accelerator_inline void subTimesI(iScalar &ret,const iScalar&r1,const iScalar&r2) +{ + subTimesI(ret._internal,r1._internal,r2._internal); +} +template accelerator_inline void subTimesI(iVector &ret,const iVector&r1,const iVector&r2) +{ + for(int i=0;i accelerator_inline void subTimesI(iMatrix &ret,const iMatrix&r1,const iMatrix&r2) +{ + for(int i=0;i accelerator_inline iScalar conjugate(const iScalar&r) { iScalar ret; @@ -147,9 +237,9 @@ template accelerator_inline iMatrix conjugate(const return ret; } -/////////////////////////////////////////////// +/////////////////////////////////////////////// // Adj function for scalar, vector, matrix -/////////////////////////////////////////////// 
+/////////////////////////////////////////////// template accelerator_inline iScalar adj(const iScalar&r) { iScalar ret; @@ -206,7 +296,7 @@ template accelerator_inline auto real(const iVector } return ret; } - + template accelerator_inline auto imag(const iScalar &z) -> iScalar { iScalar ret; From 93a37c8f68cf89c8b8578e929d545ce6b05f75b4 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Mon, 8 Jun 2020 09:39:50 +0200 Subject: [PATCH 126/147] test prefetch to L2 in stencil --- Grid/stencil/Stencil.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 1f1ebbb2..0b2544f9 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -75,20 +75,20 @@ void Gather_plane_simple_table (Vector >& table,const Lattice // prefetching: // +1% performance for Wilson on 32**4 // -2% performance for DW on 24**4 x 12 - /* - const int dist = 2; + + const int dist = 7; if (i+dist < num){ svbool_t pg1 = svptrue_b64(); // prefetch input auto in = rhs_v(so+table_v[i+dist].second); - svprfd(pg1, (char*)&in, SV_PLDL1STRM); + svprfd(pg1, (char*)&in, SV_PLDL2STRM); // prefetch store buffer uint64_t o = table_v[i+dist].first; - svprfd(pg1, (char*)&buffer[off+o], SV_PSTL1STRM); + svprfd(pg1, (char*)&buffer[off+o], SV_PSTL2STRM); } -*/ + compressed_t tmp_c; uint64_t o = table_v[i].first; compress.Compress(&tmp_c,0,rhs_v(so+table_v[i].second)); From 433766ac62a72645c282642046d49d8889a3ef29 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Mon, 8 Jun 2020 12:02:53 +0200 Subject: [PATCH 127/147] revert Add/SubTimesI and prefetching in stencil This reverts commit 9b2699226c7a3ca8d45f843f4f8e4658fa082163. --- .../WilsonKernelsAsmBodyA64FX.h | 10 ++ .../WilsonKernelsImplementation.h | 3 - Grid/qcd/spin/TwoSpinor.h | 128 ++++++++---------- Grid/simd/Grid_a64fx-fixedsize.h | 102 -------------- Grid/simd/Grid_vector_types.h | 43 +----- Grid/stencil/Stencil.h | 19 --- Grid/tensors/Tensor_reality.h | 122 +++-------------- 7 files changed, 83 insertions(+), 344 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h index cebb4327..406e5c25 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h @@ -164,7 +164,12 @@ Author: Nils Meyer Regensburg University if((!local)&&(!st.same_node[Dir]) ) { \ LOAD_CHI(base); \ MULT_2SPIN_1(Dir); \ + PREFETCH_CHIMU(base); \ + /* PREFETCH_GAUGE_L1(NxtDir); */ \ MULT_2SPIN_2; \ + if (s == 0) { \ + if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ + } \ RECON; \ nmu++; \ } @@ -175,7 +180,12 @@ Author: Nils Meyer Regensburg University if((!local)&&(!st.same_node[Dir]) ) { \ LOAD_CHI(base); \ MULT_2SPIN_1(Dir); \ + PREFETCH_CHIMU(base); \ + /* PREFETCH_GAUGE_L1(NxtDir); */ \ MULT_2SPIN_2; \ + if (s == 0) { \ + if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ + } \ RECON; \ nmu++; \ } diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 348f1425..81216e03 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -445,21 +445,18 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField #ifndef GRID_NVCC if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); 
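// --- Editor's sketch (not part of the patch) --------------------------------
// Scalar model of the Xp projector/reconstructor whose convention the comment
// block above describes, with each spinor component reduced to one
// std::complex.  Grid runs the same component arithmetic per colour index on
// SIMD tensors; this sketch only makes the (1 + gamma_x) bookkeeping concrete.
#include <array>
#include <complex>
using Cplx = std::complex<double>;

static std::array<Cplx,2> projXp(const std::array<Cplx,4>& f) {
  const Cplx I(0.0, 1.0);
  return { f[0] + I*f[3], f[1] + I*f[2] };   // hspin(0..1) as in spProjXp
}
static void accumReconXp(std::array<Cplx,4>& f, const std::array<Cplx,2>& h) {
  const Cplx I(0.0, 1.0);
  f[0] += h[0]; f[1] += h[1];
  f[2] -= I*h[1]; f[3] -= I*h[0];            // matches accumReconXp above
}
// -----------------------------------------------------------------------------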
return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); /* printf("."); */ return;} - //if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); printf("."); return;} #endif } else if( interior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;} #ifndef GRID_NVCC if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); /* printf("-"); */ return;} - //if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); printf("-"); return;} #endif } else if( exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;} #ifndef GRID_NVCC if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); /* printf("+"); */ return;} - //if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); printf("+"); return;} #endif } assert(0 && " Kernel optimisation case not covered "); diff --git a/Grid/qcd/spin/TwoSpinor.h b/Grid/qcd/spin/TwoSpinor.h index 24ca54af..924594ab 100644 --- a/Grid/qcd/spin/TwoSpinor.h +++ b/Grid/qcd/spin/TwoSpinor.h @@ -1,6 +1,6 @@ /************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid + Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/qcd/spin/TwoSpinor.h @@ -33,7 +33,7 @@ NAMESPACE_BEGIN(Grid); ////////////////////////////////////////////////////////////////////////////////////////////////////// -// Normalisation alert; the g5 project is 1/2(1+-G5) +// Normalisation alert; the g5 project is 1/2(1+-G5) // the xyzt projects are (1+-Gxyzt) // // * xyzt project @@ -59,7 +59,7 @@ NAMESPACE_BEGIN(Grid); // // Both four spinor and two spinor result variants are provided. // -// The four spinor project will be recursively provided to Lattice wide routines, and likely used in +// The four spinor project will be recursively provided to Lattice wide routines, and likely used in // the domain wall and mobius implementations. 
// ////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -74,17 +74,13 @@ NAMESPACE_BEGIN(Grid); // To fail is not to err (Cryptic clue: suggest to Google SFINAE ;) ) template > = 0> accelerator_inline void spProjXp (iVector &hspin,const iVector &fspin) { - //hspin(0)=fspin(0)+timesI(fspin(3)); - //hspin(1)=fspin(1)+timesI(fspin(2)); - hspin(0)=addTimesI(fspin(0), fspin(3)); - hspin(1)=addTimesI(fspin(1), fspin(2)); + hspin(0)=fspin(0)+timesI(fspin(3)); + hspin(1)=fspin(1)+timesI(fspin(2)); } template > = 0> accelerator_inline void spProjXm (iVector &hspin,const iVector &fspin) { - //hspin(0)=fspin(0)-timesI(fspin(3)); - //hspin(1)=fspin(1)-timesI(fspin(2)); - hspin(0)=subTimesI(fspin(0), fspin(3)); - hspin(1)=subTimesI(fspin(1), fspin(2)); + hspin(0)=fspin(0)-timesI(fspin(3)); + hspin(1)=fspin(1)-timesI(fspin(2)); } // 0 0 0 -1 [0] -+ [3] @@ -109,18 +105,14 @@ template > = 0> accelerator_inline void s */ template > = 0> accelerator_inline void spProjZp (iVector &hspin,const iVector &fspin) { - //hspin(0)=fspin(0)+timesI(fspin(2)); - //hspin(1)=fspin(1)-timesI(fspin(3)); - hspin(0)=addTimesI(fspin(0), fspin(2)); - hspin(1)=subTimesI(fspin(1), fspin(3)); + hspin(0)=fspin(0)+timesI(fspin(2)); + hspin(1)=fspin(1)-timesI(fspin(3)); } template > = 0> accelerator_inline void spProjZm (iVector &hspin,const iVector &fspin) { //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; - //hspin(0)=fspin(0)-timesI(fspin(2)); - //hspin(1)=fspin(1)+timesI(fspin(3)); - hspin(0)=subTimesI(fspin(0), fspin(2)); - hspin(1)=addTimesI(fspin(1), fspin(3)); + hspin(0)=fspin(0)-timesI(fspin(2)); + hspin(1)=fspin(1)+timesI(fspin(3)); } /*Gt * 0 0 1 0 [0]+-[2] @@ -141,8 +133,8 @@ template > = 0> accelerator_inline void s hspin(1)=fspin(1)-fspin(3); } /*G5 - * 1 0 0 0 - * 0 1 0 0 + * 1 0 0 0 + * 0 1 0 0 * 0 0 -1 0 * 0 0 0 -1 */ @@ -160,7 +152,7 @@ template > = 0> accelerator_inline void s hspin(0)=fspin(2); hspin(1)=fspin(3); } - + // template accelerator_inline void fspProj5p (iVector &rfspin,const iVector &fspin) template > = 0> accelerator_inline void spProj5p (iVector &rfspin,const iVector &fspin) { @@ -210,20 +202,16 @@ template > = 0> accelerator_inline void a //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); - //fspin(2)-=timesI(hspin(1)); - //fspin(3)-=timesI(hspin(0)); - fspin(2)=subTimesI(fspin(2), hspin(1)); - fspin(3)=subTimesI(fspin(3), hspin(0)); + fspin(2)-=timesI(hspin(1)); + fspin(3)-=timesI(hspin(0)); } template > = 0> accelerator_inline void accumReconXm (iVector &fspin,const iVector &hspin) { //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); - //fspin(2)+=timesI(hspin(1)); - //fspin(3)+=timesI(hspin(0)); - fspin(2)=addTimesI(fspin(2), hspin(1)); - fspin(3)=addTimesI(fspin(3), hspin(0)); + fspin(2)+=timesI(hspin(1)); + fspin(3)+=timesI(hspin(0)); } // 0 0 0 -1 [0] -+ [3] @@ -291,20 +279,16 @@ template > = 0> accelerator_inline void a //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); - //fspin(2)-=timesI(hspin(0)); - //fspin(3)+=timesI(hspin(1)); - fspin(2)=subTimesI(fspin(2), hspin(0)); - fspin(3)=addTimesI(fspin(3), hspin(1)); + fspin(2)-=timesI(hspin(0)); + fspin(3)+=timesI(hspin(1)); } template > = 0> accelerator_inline void accumReconZm (iVector &fspin,const iVector &hspin) { //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; 
fspin(0)+=hspin(0); fspin(1)+=hspin(1); - //fspin(2)+=timesI(hspin(0)); - //fspin(3)-=timesI(hspin(1)); - fspin(2)=addTimesI(fspin(2), hspin(0)); - fspin(3)=subTimesI(fspin(3), hspin(1)); + fspin(2)+=timesI(hspin(0)); + fspin(3)-=timesI(hspin(1)); } /*Gt * 0 0 1 0 [0]+-[2] @@ -345,8 +329,8 @@ template > = 0> accelerator_inline void a fspin(3)-=hspin(1); } /*G5 - * 1 0 0 0 - * 0 1 0 0 + * 1 0 0 0 + * 0 1 0 0 * 0 0 -1 0 * 0 0 0 -1 */ @@ -399,7 +383,7 @@ template accelerator_inline void spProjXp (iScalar accelerator_inline void spProjXp (iMatrix &hspin,const iMatrix &fspin) { - for(int i=0;i > = 0> accel } template accelerator_inline void spReconXp (iMatrix &hspin,const iMatrix &fspin) { - for(int i=0;i > = 0> accel } template accelerator_inline void accumReconXp (iMatrix &hspin,const iMatrix &fspin) { - for(int i=0;i > = 0> accel template accelerator_inline void spProjXm (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spReconXm (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void accumReconXm (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spProjYp (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spReconYp (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void accumReconYp (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spProjYm (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spReconYm (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void accumReconYm (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spProjZp (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spReconZp (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void accumReconZp (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spProjZm (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spReconZm (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void accumReconZm (iMatrix &hspin,const iMatrix &fspin) { //typename 
std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spProjTp (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spReconTp (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void accumReconTp (iMatrix &hspin, const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spProjTm (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spReconTm (iMatrix &hspin, const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void accumReconTm (iMatrix &hspin, const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spProj5p (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void spRecon5p (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void accumRecon5p (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accelerator_inlin template accelerator_inline void spProj5p (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel } template accelerator_inline void spProj5m (iMatrix &hspin,const iMatrix &fspin) { - for(int i=0;i > = 0> accel template accelerator_inline void spRecon5m (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accel template accelerator_inline void accumRecon5m (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i > = 0> accelerator_inlin template accelerator_inline void spProj5m (iMatrix &hspin,const iMatrix &fspin) { //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; - for(int i=0;i::pg1(); - vecf z_v = acle::zero(); - - return svcadd_x(pg1, z_v, a, 270); - } - // Complex double - inline vecd operator()(vecd a, vecd b){ - pred pg1 = acle::pg1(); - vecd z_v = acle::zero(); - - return svcadd_x(pg1, z_v, a, 270); - } -}; -*/ - -// SVE only, fcadd returns a +- i*b -// a + i * b -struct AddTimesI{ - // Complex float - inline vecf operator()(vecf a, vecf b){ - pred pg1 = acle::pg1(); - return svcadd_x(pg1, a, b, 90); - } - // Complex double - inline vecd operator()(vecd a, vecd b){ - pred pg1 = acle::pg1(); - return svcadd_x(pg1, a, b, 90); - } -}; -// a - i * b -struct SubTimesI{ - // Complex float - inline vecf operator()(vecf a, vecf b){ - pred pg1 = acle::pg1(); - return svcadd_x(pg1, a, b, 270); - } - // Complex double - inline vecd operator()(vecd a, vecd b){ - pred pg1 = acle::pg1(); - return svcadd_x(pg1, a, b, 270); - } -}; - struct TimesI{ // Complex float inline vecf operator()(vecf a, vecf b){ @@ 
-518,33 +465,6 @@ struct TimesI{ } }; - -// alternative implementation using fcadd -// this is not optimal because we have op1 = op2 + TimesI(op3) etc -// ideally we have op1 = AddTimesI(op2,op3) -// -// makes performance worse in Benchmark_wilson using MPI -// increases halogtime and gathertime -/* -struct TimesI{ - // Complex float - inline vecf operator()(vecf a, vecf b){ - pred pg1 = acle::pg1(); - vecf z_v = acle::zero(); - - return svcadd_x(pg1, z_v, a, 90); - } - // Complex double - inline vecd operator()(vecd a, vecd b){ - pred pg1 = acle::pg1(); - vecd z_v = acle::zero(); - - return svcadd_x(pg1, z_v, a, 90); - } -}; -*/ - - struct PrecisionChange { static inline vech StoH (vecf sa, vecf sb) { pred pg1s = acle::pg1(); @@ -827,25 +747,6 @@ typedef veci SIMD_Itype; // Integer type // prefetch utilities inline void v_prefetch0(int size, const char *ptr){}; - -/* PF 256 -inline void prefetch_HINT_T0(const char *ptr){ - static int64_t last_ptr; - int64_t vptr = reinterpret_cast(ptr) & 0x7fffffffffffff00ll; - if (last_ptr != vptr) { - last_ptr = vptr; - pred pg1 = Optimization::acle::pg1(); - svprfd(pg1, reinterpret_cast(ptr), SV_PLDL1STRM); - svprfd(pg1, ptr, SV_PLDL1STRM); - } -}; -*/ -/* PF 64 -inline void prefetch_HINT_T0(const char *ptr){ - pred pg1 = Optimization::acle::pg1(); - svprfd(pg1, ptr, SV_PLDL1STRM); -}; -*/ inline void prefetch_HINT_T0(const char *ptr){}; // Function name aliases @@ -867,8 +768,5 @@ typedef Optimization::MaddRealPart MaddRealPartSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesI TimesISIMD; -typedef Optimization::AddTimesI AddTimesISIMD; -typedef Optimization::SubTimesI SubTimesISIMD; - NAMESPACE_END(Grid); diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index c1cb5770..e1eb330d 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -298,7 +298,7 @@ public: // FIXME -- alias this to an accelerator_inline MAC struct. 
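// --- Editor's sketch (not part of the patch) --------------------------------
// For real element types, the A64FX mac() specialisation touched in the hunk
// below boils down to one predicated fused multiply-add per vector,
// r = c + a*b via svmla.  A free-standing version over plain arrays, assuming
// arm_sve.h; the function name and array layout are illustrative only.
#include <arm_sve.h>

static void mac_f64(double* y, const double* a, const double* x, long n) {
  for (long k = 0; k < n; k += svcntd()) {
    svbool_t pg = svwhilelt_b64(k, n);
    svfloat64_t yv = svld1(pg, y + k);
    svfloat64_t av = svld1(pg, a + k);
    svfloat64_t xv = svld1(pg, x + k);
    svst1(pg, y + k, svmla_x(pg, yv, av, xv));   // y += a * x, fused
  }
}
// -----------------------------------------------------------------------------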
-// specialize mac for A64FX + // FIXME VLA build error #if defined(A64FX) || defined(A64FXFIXEDSIZE) friend accelerator_inline void mac(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ a, @@ -894,47 +894,6 @@ accelerator_inline Grid_simd timesI(const Grid_simd &in) { return in; } -// ----------------------------------------------------------------------------- - -// SVE only -/////////////////////// -// AddTimesI -/////////////////////// -template = 0> -accelerator_inline void addTimesI(Grid_simd &ret, const Grid_simd &in1, const Grid_simd &in2) { - ret.v = binary(in1.v, in2.v, AddTimesISIMD()); -} -template = 0> -accelerator_inline Grid_simd addTimesI(const Grid_simd &in1, const Grid_simd &in2) { - Grid_simd ret; - ret = addTimesI(in1, in2); - return ret; -} -template = 0> -accelerator_inline Grid_simd addTimesI(const Grid_simd &in1, const Grid_simd &in2) { - return in1; -} -/////////////////////// -// SubTimesI -/////////////////////// -template = 0> -accelerator_inline void subTimesI(Grid_simd &ret, const Grid_simd &in1, const Grid_simd &in2) { - ret.v = binary(in1.v, in2.v, SubTimesISIMD()); -} -template = 0> -accelerator_inline Grid_simd subTimesI(const Grid_simd &in1, const Grid_simd &in2) { - Grid_simd ret; - ret = subTimesI(in1, in2); - return ret; -} -template = 0> -accelerator_inline Grid_simd subTimesI(const Grid_simd &in1, const Grid_simd &in2) { - return in1; -} - -// end SVE -// ----------------------------------------------------------------------------- - ///////////////////// // Inner, outer ///////////////////// diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 0b2544f9..a56d256d 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -68,27 +68,8 @@ void Gather_plane_simple_table (Vector >& table,const Lattice int num=table.size(); std::pair *table_v = & table[0]; auto rhs_v = rhs.View(); - - // main loop accelerator_forNB( i,num, vobj::Nsimd(), { typedef decltype(coalescedRead(buffer[0])) compressed_t; - // prefetching: - // +1% performance for Wilson on 32**4 - // -2% performance for DW on 24**4 x 12 - - const int dist = 7; - if (i+dist < num){ - svbool_t pg1 = svptrue_b64(); - - // prefetch input - auto in = rhs_v(so+table_v[i+dist].second); - svprfd(pg1, (char*)&in, SV_PLDL2STRM); - - // prefetch store buffer - uint64_t o = table_v[i+dist].first; - svprfd(pg1, (char*)&buffer[off+o], SV_PSTL2STRM); - } - compressed_t tmp_c; uint64_t o = table_v[i].first; compress.Compress(&tmp_c,0,rhs_v(so+table_v[i].second)); diff --git a/Grid/tensors/Tensor_reality.h b/Grid/tensors/Tensor_reality.h index 5fc7a0f2..ca1d52ca 100644 --- a/Grid/tensors/Tensor_reality.h +++ b/Grid/tensors/Tensor_reality.h @@ -1,6 +1,6 @@ /************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid + Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/tensors/Tensor_reality.h @@ -31,16 +31,16 @@ Author: neo NAMESPACE_BEGIN(Grid); -/////////////////////////////////////////////// +/////////////////////////////////////////////// // multiply by I; make recursive. 
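// --- Editor's sketch (not part of the patch) --------------------------------
// Shape of the software-prefetch pattern this revert removes from the stencil
// gather above: look `dist` iterations ahead and issue non-temporal L2 hints
// for the next input and store-buffer targets.  As the removed comments note,
// the gain is workload dependent (+1% Wilson, -2% DW), which is why it was
// backed out.  Assumes arm_sve.h; `table` and `buffer` stand in for the real
// stencil structures.
#include <arm_sve.h>

static void gather_with_prefetch(const double* const* table,
                                 double* buffer, long num) {
  const long dist = 7;                             // distance tried in the experiment
  for (long i = 0; i < num; ++i) {
    if (i + dist < num) {
      svbool_t pg = svptrue_b64();
      svprfd(pg, table[i + dist], SV_PLDL2STRM);   // prefetch future input
      svprfd(pg, buffer + i + dist, SV_PSTL2STRM); // prefetch store target
    }
    buffer[i] = *table[i];                         // the actual gather step
  }
}
// -----------------------------------------------------------------------------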
-/////////////////////////////////////////////// -template accelerator_inline iScalar timesI(const iScalar&r) +/////////////////////////////////////////////// +template accelerator_inline iScalar timesI(const iScalar&r) { iScalar ret; timesI(ret._internal,r._internal); return ret; } -template accelerator_inline iVector timesI(const iVector&r) +template accelerator_inline iVector timesI(const iVector&r) { iVector ret; for(int i=0;i accelerator_inline iMatrix timesI(const iMa return ret; } -template accelerator_inline void timesI(iScalar &ret,const iScalar&r) +template accelerator_inline void timesI(iScalar &ret,const iScalar&r) { timesI(ret._internal,r._internal); } -template accelerator_inline void timesI(iVector &ret,const iVector&r) +template accelerator_inline void timesI(iVector &ret,const iVector&r) { for(int i=0;i accelerator_inline void timesI(iMatrix &re } -template accelerator_inline iScalar timesMinusI(const iScalar&r) +template accelerator_inline iScalar timesMinusI(const iScalar&r) { iScalar ret; timesMinusI(ret._internal,r._internal); return ret; } -template accelerator_inline iVector timesMinusI(const iVector&r) +template accelerator_inline iVector timesMinusI(const iVector&r) { iVector ret; for(int i=0;i accelerator_inline iMatrix timesMinusI(cons return ret; } -template accelerator_inline void timesMinusI(iScalar &ret,const iScalar&r) +template accelerator_inline void timesMinusI(iScalar &ret,const iScalar&r) { timesMinusI(ret._internal,r._internal); } -template accelerator_inline void timesMinusI(iVector &ret,const iVector&r) +template accelerator_inline void timesMinusI(iVector &ret,const iVector&r) { for(int i=0;i accelerator_inline void timesMinusI(iMatrix accelerator_inline iScalar addTimesI(const iScalar&r1, const iScalar&r2) -{ - iScalar ret; - addTimesI(ret._internal,r1._internal,r2._internal); - return ret; -} -template accelerator_inline iVector addTimesI(const iVector&r1, const iVector&r2) -{ - iVector ret; - for(int i=0;i accelerator_inline iMatrix addTimesI(const iMatrix&r1, const iMatrix&r2) -{ - iMatrix ret; - for(int i=0;i accelerator_inline void addTimesI(iScalar &ret,const iScalar&r1,const iScalar&r2) -{ - addTimesI(ret._internal,r1._internal,r2._internal); -} -template accelerator_inline void addTimesI(iVector &ret,const iVector&r1,const iVector&r2) -{ - for(int i=0;i accelerator_inline void addTimesI(iMatrix &ret,const iMatrix&r1,const iMatrix&r2) -{ - for(int i=0;i accelerator_inline iScalar subTimesI(const iScalar&r1, const iScalar&r2) -{ - iScalar ret; - subTimesI(ret._internal,r1._internal,r2._internal); - return ret; -} -template accelerator_inline iVector subTimesI(const iVector&r1, const iVector&r2) -{ - iVector ret; - for(int i=0;i accelerator_inline iMatrix subTimesI(const iMatrix&r1, const iMatrix&r2) -{ - iMatrix ret; - for(int i=0;i accelerator_inline void subTimesI(iScalar &ret,const iScalar&r1,const iScalar&r2) -{ - subTimesI(ret._internal,r1._internal,r2._internal); -} -template accelerator_inline void subTimesI(iVector &ret,const iVector&r1,const iVector&r2) -{ - for(int i=0;i accelerator_inline void subTimesI(iMatrix &ret,const iMatrix&r1,const iMatrix&r2) -{ - for(int i=0;i accelerator_inline iScalar conjugate(const iScalar&r) { iScalar ret; @@ -237,9 +147,9 @@ template accelerator_inline iMatrix conjugate(const return ret; } -/////////////////////////////////////////////// +/////////////////////////////////////////////// // Adj function for scalar, vector, matrix -/////////////////////////////////////////////// 
+/////////////////////////////////////////////// template accelerator_inline iScalar adj(const iScalar&r) { iScalar ret; @@ -296,7 +206,7 @@ template accelerator_inline auto real(const iVector } return ret; } - + template accelerator_inline auto imag(const iScalar &z) -> iScalar { iScalar ret; From 2111052fbe5b9623ca8033e9f04f9b5d5efd4461 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 12 Jun 2020 14:49:19 +0200 Subject: [PATCH 128/147] apply VLA patch for memcpy reduction suggested by Arm, CAS-162542-D6W7Z7 --- Grid/simd/Grid_a64fx-2.h | 46 +++++++++++++++++++++---- Grid/simd/Grid_vector_types.h | 65 +++++++++++++++++++++++++---------- 2 files changed, 87 insertions(+), 24 deletions(-) diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h index a0463a10..6f7229ec 100644 --- a/Grid/simd/Grid_a64fx-2.h +++ b/Grid/simd/Grid_a64fx-2.h @@ -57,11 +57,39 @@ NAMESPACE_BEGIN(Optimization); constexpr static unsigned int r = GEN_SIMD_WIDTH/8u; }; + #ifdef ARMCLANGCOMPAT + // SIMD vector immediate types + template + struct vec_imm { + alignas(GEN_SIMD_WIDTH) T v[W::r]; + }; + + // SIMD vector types + template + struct vec { + alignas(GEN_SIMD_WIDTH) T v[W::r]; + vec() = default; + vec(const vec &rhs) { this->operator=(rhs); } + vec(const vec_imm &rhs) { + // v = rhs.v + svst1(svptrue_b8(), (T*)this, svld1(svptrue_b8(), (T*)rhs.v)); + } + + inline vec &operator=(const vec &rhs) { + // v = rhs.v + svst1(svptrue_b8(), (T*)this, svld1(svptrue_b8(), (T*)rhs.v)); + return *this; + }; + }; + + #else // not defines ARMCLANGCOMPAT + #define vec_imm vec // SIMD vector types template struct vec { alignas(GEN_SIMD_WIDTH) T v[W::r]; }; + #endif typedef vec vecf; typedef vec vecd; @@ -91,27 +119,33 @@ struct acle{ static inline svbool_t pg2(){return svptrue_pat_b64(SV_VL4);} static inline svbool_t pg4(){return svptrue_pat_b64(SV_VL2);} static inline vec tbl_swap(){ - const vec t = {1, 0, 3, 2, 5, 4, 7, 6}; + //const vec t = {1, 0, 3, 2, 5, 4, 7, 6}; + const vec_imm t = {1, 0, 3, 2, 5, 4, 7, 6}; return t; } static inline vec tbl0(){ - const vec t = {4, 5, 6, 7, 0, 1, 2, 3}; + //const vec t = {4, 5, 6, 7, 0, 1, 2, 3}; + const vec_imm t = {4, 5, 6, 7, 0, 1, 2, 3}; return t; } static inline vec tbl1(){ - const vec t = {2, 3, 0, 1, 6, 7, 4, 5}; + //const vec t = {2, 3, 0, 1, 6, 7, 4, 5}; + const vec_imm t = {2, 3, 0, 1, 6, 7, 4, 5}; return t; } static inline vec tbl_exch1a(){ // Exchange1 - const vec t = {0, 1, 4, 5, 2, 3, 6, 7}; + //const vec t = {0, 1, 4, 5, 2, 3, 6, 7}; + const vec_imm t = {0, 1, 4, 5, 2, 3, 6, 7}; return t; } static inline vec tbl_exch1b(){ // Exchange1 - const vec t = {2, 3, 6, 7, 0, 1, 4, 5}; + //const vec t = {2, 3, 6, 7, 0, 1, 4, 5}; + const vec_imm t = {2, 3, 6, 7, 0, 1, 4, 5}; return t; } static inline vec tbl_exch1c(){ // Exchange1 - const vec t = {4, 5, 0, 1, 6, 7, 2, 3}; + //const vec t = {4, 5, 0, 1, 6, 7, 2, 3}; + const vec_imm t = {4, 5, 0, 1, 6, 7, 2, 3}; return t; } static inline svbool_t pg_even(){return svzip1_b64(svptrue_b64(), svpfalse_b());} diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index e1eb330d..527fde18 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -122,7 +122,7 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { #if defined(A64FX) // VLA #pragma message("building for A64FX / SVE ACLE VLA") #if defined(ARMCLANGCOMPAT) - #pragma message("applying armclang fix") + #pragma message("applying armclang patch") #endif #include "Grid_a64fx-2.h" #endif @@ -247,21 +247,37 @@ public: return 
sizeof(Vector_type) / sizeof(Scalar_type); } -#ifdef ARMCLANGCOMPAT - accelerator_inline Grid_simd &operator=(const Grid_simd &&rhs) { - svint8_t tmp = svld1(svptrue_b8(), (int8_t*)&(rhs.v)); - svst1(svptrue_b8(), (int8_t*)this, tmp); - //v = rhs.v; - return *this; - }; + #ifdef ARMCLANGCOMPAT + template + accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &&rhs) { + //v = rhs.v; + svst1(svptrue_b8(), (Scalar_type*)this, svld1(svptrue_b8(), (Scalar_type*)&(rhs.v))); + return *this; + }; + + template + accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &rhs) { + //v = rhs.v; + svst1(svptrue_b8(), (Scalar_type*)this, svld1(svptrue_b8(), (Scalar_type*)&(rhs.v))); + return *this; + }; + + template + accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &&rhs) { + //v = rhs.v; + svst1(svptrue_b8(), (int8_t*)this, svld1(svptrue_b8(), (int8_t*)&(rhs.v))); + return *this; + }; + + template + accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &rhs) { + //v = rhs.v; + svst1(svptrue_b8(), (int8_t*)this, svld1(svptrue_b8(), (int8_t*)&(rhs.v))); + return *this; + }; + + #else - accelerator_inline Grid_simd &operator=(const Grid_simd &rhs) { - svint8_t tmp = svld1(svptrue_b8(), (int8_t*)&(rhs.v)); - svst1(svptrue_b8(), (int8_t*)this, tmp); - //v = rhs.v; - return *this; - }; -#else accelerator_inline Grid_simd &operator=(const Grid_simd &&rhs) { v = rhs.v; return *this; @@ -270,11 +286,24 @@ public: v = rhs.v; return *this; }; // faster than not declaring it and leaving to the compiler -#endif + + #endif accelerator Grid_simd() = default; - accelerator_inline Grid_simd(const Grid_simd &rhs) : v(rhs.v){}; // compiles in movaps - accelerator_inline Grid_simd(const Grid_simd &&rhs) : v(rhs.v){}; + + #ifdef ARMCLANGCOMPAT + template + accelerator_inline Grid_simd(const Grid_simd::value, S>::type, Vector_type> &rhs) { this->operator=(rhs); } + template + accelerator_inline Grid_simd(const Grid_simd::value, S>::type, Vector_type> &&rhs) { this->operator=(rhs); } + template + accelerator_inline Grid_simd(const Grid_simd::value, S>::type, Vector_type> &rhs) { this->operator=(rhs); } + template + accelerator_inline Grid_simd(const Grid_simd::value, S>::type, Vector_type> &&rhs) { this->operator=(rhs); } + #else + accelerator_inline Grid_simd(const Grid_simd &rhs) : v(rhs.v){}; // compiles in movaps + accelerator_inline Grid_simd(const Grid_simd &&rhs) : v(rhs.v){}; + #endif accelerator_inline Grid_simd(const Real a) { vsplat(*this, Scalar_type(a)); }; // Enable if complex type template accelerator_inline From 2402b4940e4296cb7c50992d7ad1eac19307c75b Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 12 Jun 2020 15:17:38 +0200 Subject: [PATCH 129/147] vec_imm in float --- Grid/simd/Grid_a64fx-2.h | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h index 6f7229ec..51a6bfd0 100644 --- a/Grid/simd/Grid_a64fx-2.h +++ b/Grid/simd/Grid_a64fx-2.h @@ -165,31 +165,38 @@ struct acle{ static inline svbool_t pg2(){return svptrue_pat_b32(SV_VL8);} // exchange neighboring elements static inline vec tbl_swap(){ - const vec t = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + //const vec t = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + const vec_imm t = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; return t; } static inline vec tbl0(){ - const vec t = {8, 9, 10, 11, 12, 
13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}; + //const vec t = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}; + const vec_imm t = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}; return t; } static inline vec tbl1(){ - const vec t = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}; + //const vec t = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}; + const vec_imm t = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}; return t; } static inline vec tbl2(){ - const vec t = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; + //const vec t = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; + const vec_imm t = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}; return t; } static inline vec tbl_exch1a(){ // Exchange1 - const vec t = {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 }; + //const vec t = {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 }; + const vec_imm t = {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 }; return t; } static inline vec tbl_exch1b(){ // Exchange1 - const vec t = {4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11 }; + //const vec t = {4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11 }; + const vec_imm t = {4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11 }; return t; } static inline vec tbl_exch1c(){ // Exchange1 - const vec t = {8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7}; + //const vec t = {8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7}; + const vec_imm t = {8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7}; return t; } static inline svbool_t pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());} From 8dbf790f62aabded881b2c2bcf9e0a7c6558f514 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 12 Jun 2020 17:12:34 +0200 Subject: [PATCH 130/147] correct tbl2 for sp --- Grid/simd/Grid_a64fx-2.h | 5 +++-- Grid/simd/Grid_vector_types.h | 7 ++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h index 51a6bfd0..221562de 100644 --- a/Grid/simd/Grid_a64fx-2.h +++ b/Grid/simd/Grid_a64fx-2.h @@ -82,7 +82,7 @@ NAMESPACE_BEGIN(Optimization); }; }; - #else // not defines ARMCLANGCOMPAT + #else // no ARMCLANGCOMPAT #define vec_imm vec // SIMD vector types template @@ -181,7 +181,7 @@ struct acle{ } static inline vec tbl2(){ //const vec t = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; - const vec_imm t = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}; + const vec_imm t = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; return t; } static inline vec tbl_exch1a(){ // Exchange1 @@ -889,6 +889,7 @@ inline Integer Reduce::operator()(veci in){ } #undef svred +#undef vec_imm NAMESPACE_END(Optimization) diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index 527fde18..634cf470 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -122,7 +122,7 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { #if defined(A64FX) // VLA #pragma message("building for A64FX / SVE ACLE VLA") #if defined(ARMCLANGCOMPAT) - #pragma message("applying armclang patch") + #pragma message("applying data types patch") #endif #include "Grid_a64fx-2.h" #endif @@ -327,8 +327,9 @@ public: // FIXME -- alias this to an accelerator_inline MAC struct. - // FIXME VLA build error - #if defined(A64FX) || defined(A64FXFIXEDSIZE) + // safety exclude fxmac from VLA (causing wrong results?) 
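// --- Editor's sketch (not part of the patch) --------------------------------
// Why fxmac wants fcmla: a complex multiply-accumulate acc += a*b maps to two
// chained fcmla instructions, rotation 0 (real parts of a) followed by
// rotation 90 (imaginary parts of a).  Free-standing version on interleaved
// re/im doubles, assuming arm_sve.h; n2 counts doubles, not complex numbers.
#include <arm_sve.h>

static void cmac_f64(double* acc, const double* a, const double* b, long n2) {
  for (long k = 0; k < n2; k += svcntd()) {
    svbool_t pg = svwhilelt_b64(k, n2);
    svfloat64_t r  = svld1(pg, acc + k);
    svfloat64_t av = svld1(pg, a + k);
    svfloat64_t bv = svld1(pg, b + k);
    r = svcmla_x(pg, r, av, bv, 0);    // acc += re(a) * b
    r = svcmla_x(pg, r, av, bv, 90);   // acc += i * im(a) * b
    svst1(pg, acc + k, r);
  }
}
// -----------------------------------------------------------------------------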
+ //#if defined(A64FX) || defined(A64FXFIXEDSIZE) + #if defined(A64FXFIXEDSIZE) friend accelerator_inline void mac(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ a, const Grid_simd *__restrict__ x) { From 2a23f133e837c1ee20544a1aee197b0ec7b05325 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 12 Jun 2020 17:30:38 +0200 Subject: [PATCH 131/147] reenable fcmla for VLA --- Grid/simd/Grid_vector_types.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index 634cf470..246c6a6a 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -327,9 +327,7 @@ public: // FIXME -- alias this to an accelerator_inline MAC struct. - // safety exclude fxmac from VLA (causing wrong results?) - //#if defined(A64FX) || defined(A64FXFIXEDSIZE) - #if defined(A64FXFIXEDSIZE) + #if defined(A64FX) || defined(A64FXFIXEDSIZE) friend accelerator_inline void mac(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ a, const Grid_simd *__restrict__ x) { From 87266ce09948fefc74e30c71946927590f63ff09 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 12 Jun 2020 18:37:19 +0200 Subject: [PATCH 132/147] comment out fcmla in vector types: need also MultAddReal --- Grid/simd/Grid_vector_types.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index 246c6a6a..33ba8f75 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -327,7 +327,8 @@ public: // FIXME -- alias this to an accelerator_inline MAC struct. - #if defined(A64FX) || defined(A64FXFIXEDSIZE) + //#if defined(A64FX) || defined(A64FXFIXEDSIZE) + #if 0 friend accelerator_inline void mac(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ a, const Grid_simd *__restrict__ x) { @@ -820,7 +821,8 @@ accelerator_inline Grid_simd operator*(Grid_simd a, Grid_simd // ---------------- A64FX MAC ------------------- // Distinguish between complex types and others -#if defined(A64FX) || defined(A64FXFIXEDSIZE) +//#if defined(A64FX) || defined(A64FXFIXEDSIZE) +#if 0 template = 0> accelerator_inline Grid_simd fxmac(Grid_simd a, Grid_simd b, Grid_simd c) { Grid_simd ret; @@ -832,6 +834,7 @@ accelerator_inline Grid_simd fxmac(Grid_simd a, Grid_simd b, G template = 0> accelerator_inline Grid_simd fxmac(Grid_simd a, Grid_simd b, Grid_simd c) { Grid_simd ret; +// MultSIMD takes only 2 args -> need MultAddReal ret.v = trinary(a.v, b.v, c.v, MultSIMD()); return ret; }; From 92281ec22d3235c96d18bea6b02749394db80177 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 12 Jun 2020 18:49:05 +0200 Subject: [PATCH 133/147] add 3 op Mult for VLA --- Grid/simd/Grid_a64fx-2.h | 14 ++++++++++++++ Grid/simd/Grid_vector_types.h | 7 ++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h index 221562de..2ad8591c 100644 --- a/Grid/simd/Grid_a64fx-2.h +++ b/Grid/simd/Grid_a64fx-2.h @@ -8,6 +8,8 @@ Author: Nils Meyer + with support from Arm + This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or @@ -365,6 +367,18 @@ struct Sub{ }; struct Mult{ + template + inline vec operator()(vec a, vec b, vec c){ + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt c_v = svld1(pg1, 
c.v); + typename acle::vt r_v = svmla_x(pg1, c_v, a_v, b_v); + svst1(pg1, out.v, r_v); + + return out; + } template inline vec operator()(vec a, vec b){ vec out; diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index 33ba8f75..246c6a6a 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -327,8 +327,7 @@ public: // FIXME -- alias this to an accelerator_inline MAC struct. - //#if defined(A64FX) || defined(A64FXFIXEDSIZE) - #if 0 + #if defined(A64FX) || defined(A64FXFIXEDSIZE) friend accelerator_inline void mac(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ a, const Grid_simd *__restrict__ x) { @@ -821,8 +820,7 @@ accelerator_inline Grid_simd operator*(Grid_simd a, Grid_simd // ---------------- A64FX MAC ------------------- // Distinguish between complex types and others -//#if defined(A64FX) || defined(A64FXFIXEDSIZE) -#if 0 +#if defined(A64FX) || defined(A64FXFIXEDSIZE) template = 0> accelerator_inline Grid_simd fxmac(Grid_simd a, Grid_simd b, Grid_simd c) { Grid_simd ret; @@ -834,7 +832,6 @@ accelerator_inline Grid_simd fxmac(Grid_simd a, Grid_simd b, G template = 0> accelerator_inline Grid_simd fxmac(Grid_simd a, Grid_simd b, Grid_simd c) { Grid_simd ret; -// MultSIMD takes only 2 args -> need MultAddReal ret.v = trinary(a.v, b.v, c.v, MultSIMD()); return ret; }; From 36ea0e222a1f0a5654c8ef6cfb08f5c0a4aebaeb Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Sat, 13 Jun 2020 13:42:35 +0200 Subject: [PATCH 134/147] type traits for ComplexF/D in VLA patch; cosmetics in VLS intrinsics --- Grid/simd/Grid_a64fx-fixedsize.h | 3 --- Grid/simd/Grid_vector_types.h | 38 ++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 2a6533fe..6b450012 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -33,9 +33,6 @@ // Using SVE ACLE with fixed-size data types ///////////////////////////////////////////////////// -/* TODO - * Exchange1 -*/ // gcc 10 features #if __ARM_FEATURE_SVE_BITS==512 diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index 246c6a6a..d42396cb 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -187,6 +187,12 @@ template struct is_complex : public std::false_type {}; template <> struct is_complex : public std::true_type {}; template <> struct is_complex : public std::true_type {}; +template struct is_ComplexD : public std::false_type {}; +template <> struct is_ComplexD : public std::true_type {}; + +template struct is_ComplexF : public std::false_type {}; +template <> struct is_ComplexF : public std::true_type {}; + template struct is_real : public std::false_type {}; template struct is_real::value, void>::type> : public std::true_type {}; @@ -262,6 +268,7 @@ public: return *this; }; + /* template accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &&rhs) { //v = rhs.v; @@ -275,6 +282,37 @@ public: svst1(svptrue_b8(), (int8_t*)this, svld1(svptrue_b8(), (int8_t*)&(rhs.v))); return *this; }; + */ + + // ComplexF + template + accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &&rhs) { + //v = rhs.v; + svst1(svptrue_b8(), (float32_t*)this, svld1(svptrue_b8(), (float32_t*)&(rhs.v))); + return *this; + }; + + template + accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &rhs) { + //v = rhs.v; + svst1(svptrue_b8(), 
(float32_t*)this, svld1(svptrue_b8(), (float32_t*)&(rhs.v))); + return *this; + }; + + // ComplexD + template + accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &&rhs) { + //v = rhs.v; + svst1(svptrue_b8(), (float64_t*)this, svld1(svptrue_b8(), (float64_t*)&(rhs.v))); + return *this; + }; + + template + accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &rhs) { + //v = rhs.v; + svst1(svptrue_b8(), (float64_t*)this, svld1(svptrue_b8(), (float64_t*)&(rhs.v))); + return *this; + }; #else From d1210ca12ae0a3e65f1b72c9f0e963c9d12e8b90 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Sat, 13 Jun 2020 13:59:32 +0200 Subject: [PATCH 135/147] switch to double/float instead of float64_t/float32_t in VLA patch --- Grid/simd/Grid_vector_types.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index d42396cb..6d6e5103 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -288,14 +288,14 @@ public: template accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &&rhs) { //v = rhs.v; - svst1(svptrue_b8(), (float32_t*)this, svld1(svptrue_b8(), (float32_t*)&(rhs.v))); + svst1(svptrue_b8(), (float*)this, svld1(svptrue_b8(), (float*)&(rhs.v))); return *this; }; template accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &rhs) { //v = rhs.v; - svst1(svptrue_b8(), (float32_t*)this, svld1(svptrue_b8(), (float32_t*)&(rhs.v))); + svst1(svptrue_b8(), (float*)this, svld1(svptrue_b8(), (float*)&(rhs.v))); return *this; }; @@ -303,14 +303,14 @@ public: template accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &&rhs) { //v = rhs.v; - svst1(svptrue_b8(), (float64_t*)this, svld1(svptrue_b8(), (float64_t*)&(rhs.v))); + svst1(svptrue_b8(), (double*)this, svld1(svptrue_b8(), (double*)&(rhs.v))); return *this; }; template accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &rhs) { //v = rhs.v; - svst1(svptrue_b8(), (float64_t*)this, svld1(svptrue_b8(), (float64_t*)&(rhs.v))); + svst1(svptrue_b8(), (double*)this, svld1(svptrue_b8(), (double*)&(rhs.v))); return *this; }; From a25e4b3d0ce53c4d45df860d23581bd388a55991 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Sat, 13 Jun 2020 14:44:37 +0200 Subject: [PATCH 136/147] pred 32/64 for float/double instead of 8 in VLA patch --- Grid/simd/Grid_vector_types.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index 6d6e5103..727ed668 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -288,14 +288,14 @@ public: template accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &&rhs) { //v = rhs.v; - svst1(svptrue_b8(), (float*)this, svld1(svptrue_b8(), (float*)&(rhs.v))); + svst1(svptrue_b32(), (float*)this, svld1(svptrue_b32(), (float*)&(rhs.v))); return *this; }; template accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &rhs) { //v = rhs.v; - svst1(svptrue_b8(), (float*)this, svld1(svptrue_b8(), (float*)&(rhs.v))); + svst1(svptrue_b32(), (float*)this, svld1(svptrue_b32(), (float*)&(rhs.v))); return *this; }; @@ -303,14 +303,14 @@ public: template accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &&rhs) { //v = rhs.v; - svst1(svptrue_b8(), 
(double*)this, svld1(svptrue_b8(), (double*)&(rhs.v))); + svst1(svptrue_b64(), (double*)this, svld1(svptrue_b64(), (double*)&(rhs.v))); return *this; }; template accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &rhs) { //v = rhs.v; - svst1(svptrue_b8(), (double*)this, svld1(svptrue_b8(), (double*)&(rhs.v))); + svst1(svptrue_b64(), (double*)this, svld1(svptrue_b64(), (double*)&(rhs.v))); return *this; }; From cc958aa9ed7e46651cf624c56cc06bb4568ec784 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Mon, 15 Jun 2020 14:21:38 +0200 Subject: [PATCH 137/147] switch back to standard MPI_init due to wrong results in Benchmark_wilson using comms-overlap --- Grid/communicator/Communicator_mpi3.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index e7812178..c9990045 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -54,8 +54,11 @@ void CartesianCommunicator::Init(int *argc, char ***argv) #if defined (TOFU) // FUGAKU, credits go to Issaku Kanamori nCommThreads=1; - //MPI_Init(argc,argv); - MPI_Init_thread(argc,argv,MPI_THREAD_SERIALIZED,&provided); + MPI_Init(argc,argv); + + // comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs + // other comms schemes are ok + //MPI_Init_thread(argc,argv,MPI_THREAD_SERIALIZED,&provided); #else MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided); #endif From 465856331a39ec4d83ecb62f3ade29e96de0a47b Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Mon, 15 Jun 2020 15:39:39 +0200 Subject: [PATCH 138/147] switch back to serialized; wrong results on single too --- Grid/communicator/Communicator_mpi3.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index c9990045..71dd6ffe 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -48,17 +48,18 @@ void CartesianCommunicator::Init(int *argc, char ***argv) #if defined (A64FX) || defined (A64FXFIXEDSIZE) #ifndef TOFU #define TOFU -#pragma message ("TOFU network / MPI_THREAD_SINGLE") +#pragma message ("TOFU network / MPI_THREAD_SERIALIZED") #endif #endif #if defined (TOFU) // FUGAKU, credits go to Issaku Kanamori nCommThreads=1; - MPI_Init(argc,argv); + // wrong results here too + //MPI_Init(argc,argv); // comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs // other comms schemes are ok - //MPI_Init_thread(argc,argv,MPI_THREAD_SERIALIZED,&provided); + MPI_Init_thread(argc,argv,MPI_THREAD_SERIALIZED,&provided); #else MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided); #endif From a87e45ba2531de57fc268d6f1098adcf3834a903 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 18 Jun 2020 11:23:08 +0200 Subject: [PATCH 139/147] SVE readme update --- SVE_README.txt | 5 ----- benchmarks/Benchmark_wilson.cc | 3 +++ 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/SVE_README.txt b/SVE_README.txt index 07c18b06..ec84c5de 100644 --- a/SVE_README.txt +++ b/SVE_README.txt @@ -17,11 +17,6 @@ export MPICH_CXX=g++-10.0.1 $ ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN -DTOFU 
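// --- Editor's note (not part of the patch) -----------------------------------
// The predicate-width commit above switches the ARMCLANGCOMPAT copies from
// svptrue_b8() to svptrue_b32()/svptrue_b64().  SVE predicates are per-byte,
// and element-wise operations read one predicate bit per element, so the
// governing predicate should be generated at the element width it controls.
// An all-true predicate is all-true at any width, which is why b8 also
// worked, but the width-matched form is the type-correct one.  Minimal
// sketch, copying exactly one SVE vector (VL bytes); names are illustrative:
#include <arm_sve.h>

static inline void copy_f32(float* dst, const float* src) {
  svst1(svptrue_b32(), dst, svld1(svptrue_b32(), src));  // 32-bit lanes
}
static inline void copy_f64(double* dst, const double* src) {
  svst1(svptrue_b64(), dst, svld1(svptrue_b64(), src));  // 64-bit lanes
}
// -----------------------------------------------------------------------------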
-I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64 -lrt" -works! but binaries do not finish when running via job scheduler. problem with MPI_finalize ? - -interactive login: mpirun -np 1 ./Benchmark_wilson_sweep --dslash-asm - [WARN] PLE 0610 plexec The process terminated with the signal.(rank=0)(nid=0xff010008)(sig=9) ? - -------------------------------------------------------- * armclang 20.0 VLA (merlin) diff --git a/benchmarks/Benchmark_wilson.cc b/benchmarks/Benchmark_wilson.cc index 51d440c9..25d9402d 100644 --- a/benchmarks/Benchmark_wilson.cc +++ b/benchmarks/Benchmark_wilson.cc @@ -182,6 +182,9 @@ int main (int argc, char ** argv) auto nsimd = vComplex::Nsimd(); auto simdwidth = sizeof(vComplex); + std::cout< Date: Tue, 30 Jun 2020 19:27:08 +0200 Subject: [PATCH 140/147] disable TOFU by default --- Grid/communicator/Communicator_mpi3.cc | 2 ++ SVE_README.txt | 11 ++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 71dd6ffe..e9399ddc 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -45,12 +45,14 @@ void CartesianCommunicator::Init(int *argc, char ***argv) if ( !flag ) { // Fugaku Tofu: enable by default +/* #if defined (A64FX) || defined (A64FXFIXEDSIZE) #ifndef TOFU #define TOFU #pragma message ("TOFU network / MPI_THREAD_SERIALIZED") #endif #endif +*/ #if defined (TOFU) // FUGAKU, credits go to Issaku Kanamori nCommThreads=1; diff --git a/SVE_README.txt b/SVE_README.txt index ec84c5de..b8c7e03c 100644 --- a/SVE_README.txt +++ b/SVE_README.txt @@ -1,3 +1,12 @@ +* QPACE4 interactive login + +scl enable gcc-toolset-10 bash +module load mpi/openmpi-aarch64 + +../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi-auto --enable-openmp CXX=mpicxx CC=mpicc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" + +------------------------------------------------------------------------------ + * gcc 10.0.1 VLA (merlin) ../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static @@ -35,7 +44,7 @@ TODO check ARMCLANGCOMPAT * armclang 20.1 VLA (fjt cluster) -../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" +../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU" TODO check ARMCLANGCOMPAT From fd3c8b0e854fcc7ba5eb45141cd44a6b5682d158 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Wed, 1 Jul 2020 09:00:38 +0200 Subject: [PATCH 141/147] correct build instructions qp4 --- SVE_README.txt | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/SVE_README.txt b/SVE_README.txt index b8c7e03c..b8d44439 100644 --- 
From fd3c8b0e854fcc7ba5eb45141cd44a6b5682d158 Mon Sep 17 00:00:00 2001
From: nmeyer-ur
Date: Wed, 1 Jul 2020 09:00:38 +0200
Subject: [PATCH 141/147] correct build instructions qp4

---
 SVE_README.txt | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/SVE_README.txt b/SVE_README.txt
index b8c7e03c..b8d44439 100644
--- a/SVE_README.txt
+++ b/SVE_README.txt
@@ -1,9 +1,15 @@
-* QPACE4 interactive login
+* gcc 10.1 prebuild, QPACE4 interactive login
+
+scl enable gcc-toolset-10 bash
+
+../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++ CC=gcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN"
+
+* gcc 10.1 prebuild w/ MPI, QPACE4 interactive login
 
 scl enable gcc-toolset-10 bash
 module load mpi/openmpi-aarch64
 
-../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi-auto --enable-openmp CXX=mpicxx CC=mpicc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN"
+../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi-auto --enable-shm=shmget --enable-openmp CXX=mpicxx CC=mpicc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN"

From 67db4993c26013928abd30d3b80b3e9fd0c787af Mon Sep 17 00:00:00 2001
From: nmeyer-ur
Date: Tue, 7 Jul 2020 19:54:52 +0200
Subject: [PATCH 142/147] reset head, update SVE readme

---
 SVE_README.txt | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/SVE_README.txt b/SVE_README.txt
index b8d44439..e6edc329 100644
--- a/SVE_README.txt
+++ b/SVE_README.txt
@@ -13,6 +13,12 @@ module load mpi/openmpi-aarch64
 
 ------------------------------------------------------------------------------
 
+* armclang 20.2 (qp4)
+
+../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DARMCLANGCOMPAT -DA64FXASM -DDSLASHINTRIN"
+
+------------------------------------------------------------------------------
+
 * gcc 10.0.1 VLA (merlin)
 
 ../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static

From 337d9dc0431c6b4570f73425452f05ca61a6b0ad Mon Sep 17 00:00:00 2001
From: nmeyer-ur
Date: Wed, 8 Jul 2020 08:13:40 +0200
Subject: [PATCH 143/147] move barrier in Benchmark_wilson

---
 benchmarks/Benchmark_wilson.cc | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/benchmarks/Benchmark_wilson.cc b/benchmarks/Benchmark_wilson.cc
index 0456c3c3..b834e887 100644
--- a/benchmarks/Benchmark_wilson.cc
+++ b/benchmarks/Benchmark_wilson.cc
@@ -162,6 +162,10 @@ int main (int argc, char ** argv)
   for(int i=0;i
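[ Editor's note, illustration only, not part of the patch series. The
  Benchmark_wilson hunk above is truncated; the commit moves a barrier
  relative to the timed Dslash loop. The usual pattern, sketched under the
  assumption that the benchmark's usecond() timer, communicator Barrier()
  and ncall loop are used as elsewhere in Grid's benchmarks:

      Grid.Barrier();               // align all ranks before starting the clock
      double t0 = usecond();
      for (int i = 0; i < ncall; i++) {
        Dw.Dhop(src, result, 0);    // kernel under test
      }
      double t1 = usecond();
      Grid.Barrier();               // all ranks done before t1 - t0 is reported

  Without the leading barrier, skew between ranks leaks into the first
  iteration's communication time and inflates the measurement. ]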
Date: Wed, 8 Jul 2020 12:43:51 +0200
Subject: [PATCH 144/147] enable --enable-simd=A64FX in configure

---
 .../implementation/WilsonKernelsAsmA64FX.h |  7 ++--
 Grid/simd/Grid_vector_types.h              | 34 +++++++++++++++++--
 configure.ac                               |  9 +++++
 3 files changed, 45 insertions(+), 5 deletions(-)

diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h
index 34da3110..2e587dfa 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h
@@ -29,7 +29,8 @@ Author: Nils Meyer Regensburg University
 /* END LEGAL */
 #pragma once
 
-#if defined(A64FXASM)
+//#if defined(A64FXASM)
+#if defined(A64FX)
 
 // safety include
 #include
@@ -39,13 +40,13 @@ Author: Nils Meyer Regensburg University
 
 // enable A64FX body
 #define WILSONKERNELSASMBODYA64FX
-#pragma message("A64FX Dslash: WilsonKernelsAsmBodyA64FX.h")
+//#pragma message("A64FX Dslash: WilsonKernelsAsmBodyA64FX.h")
 
 ///////////////////////////////////////////////////////////
 // If we are A64FX specialise the single precision routine
 ///////////////////////////////////////////////////////////
 
 #if defined(DSLASHINTRIN)
-#pragma message ("A64FX Dslash: intrin")
+//#pragma message ("A64FX Dslash: intrin")
 #include
 #else
 #pragma message ("A64FX Dslash: asm")
diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h
index 727ed668..33ebe8c1 100644
--- a/Grid/simd/Grid_vector_types.h
+++ b/Grid/simd/Grid_vector_types.h
@@ -120,7 +120,7 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) {
 #if defined(A64FX) || defined(A64FXFIXEDSIZE) // breakout A64FX SVE ACLE here
   #include
   #if defined(A64FX) // VLA
-    #pragma message("building for A64FX / SVE ACLE VLA")
+    #pragma message("building A64FX / SVE ACLE VLA")
     #if defined(ARMCLANGCOMPAT)
       #pragma message("applying data types patch")
     #endif
@@ -131,11 +131,41 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) {
     #include "Grid_a64fx-fixedsize.h"
   #endif
 #else
-  #pragma message("building for GEN") // generic
+  #pragma message("building GEN") // generic
   #include "Grid_generic.h"
 #endif
 #endif
 
+#ifdef A64FX
+  #include
+  #ifdef __ARM_FEATURE_SVE_BITS
+    //#pragma message("building A64FX SVE VLS")
+    #include "Grid_a64fx-fixedsize.h"
+  #else
+    #pragma message("building A64FX SVE VLA")
+    #if defined(ARMCLANGCOMPAT)
+      #pragma message("applying data types patch")
+    #endif
+    #include "Grid_a64fx-2.h"
+  #endif
+#endif
+
+/*
+#ifdef A64FXVLA
+#pragma message("building A64FX VLA")
+#if defined(ARMCLANGCOMPAT)
+  #pragma message("applying data types patch")
+#endif
+#include
+#include "Grid_a64fx-2.h"
+#endif
+
+#ifdef A64FXVLS
+#pragma message("building A64FX VLS")
+#include
+#include "Grid_a64fx-fixedsize.h"
+#endif
+*/
 
 #ifdef SSE4
 #include "Grid_sse4.h"
diff --git a/configure.ac b/configure.ac
index 878d56e6..f0bea6a4 100644
--- a/configure.ac
+++ b/configure.ac
@@ -392,6 +392,15 @@ case ${ax_cv_cxx_compiler_vendor} in
       [generic SIMD vector width (in bytes)])
       SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
       SIMD_FLAGS='';;
+    A64FX)
+      case ${ax_cv_cxx_compiler_vendor} in
+        gnu)
+          AC_DEFINE([A64FX],[1],[A64FX / 512-bit SVE VLS])
+          SIMD_FLAGS='-march=armv8.2-a+sve -msve-vector-bits=512 -fno-gcse -DDSLASHINTRIN';;
+        clang)
+          AC_DEFINE([A64FX],[1],[A64FX / 512-bit SVE VLA])
+          SIMD_FLAGS='-mcpu=a64fx -DARMCLANGCOMPAT -DDSLASHINTRIN';;
+      esac;;
     NEONv8)
       AC_DEFINE([NEONV8],[1],[ARMv8 NEON])
       SIMD_FLAGS='-march=armv8-a';;
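[ Editor's note, illustration only, not part of the patch series. Patch 144
  lets configure define A64FX for both gcc and clang and leaves the VLS/VLA
  choice to the SIMD flags. Condensed from the Grid_vector_types.h hunk
  above; the include argument stripped from the flattened text is presumably
  <arm_sve.h>, the SVE ACLE header:

      #ifdef A64FX
        #include <arm_sve.h>
        #ifdef __ARM_FEATURE_SVE_BITS        // set by gcc's -msve-vector-bits=512
          #include "Grid_a64fx-fixedsize.h"  // fixed-size SVE (VLS)
        #else                                // e.g. armclang's -mcpu=a64fx
          #include "Grid_a64fx-2.h"          // vector-length-agnostic ACLE (VLA)
        #endif
      #endif

  Keying on __ARM_FEATURE_SVE_BITS rather than on the compiler keeps the
  source compiler-independent, which patch 145 then completes. ]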
From d9474c6cb6b378f78b7e47dc09283e3758f2a685 Mon Sep 17 00:00:00 2001
From: nmeyer-ur
Date: Thu, 9 Jul 2020 10:07:02 +0200
Subject: [PATCH 145/147] compiler-independent build using --enable-simd=A64FX

---
 Grid/simd/Grid_vector_types.h |  2 +-
 SVE_README.txt                | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h
index 33ebe8c1..c07077a3 100644
--- a/Grid/simd/Grid_vector_types.h
+++ b/Grid/simd/Grid_vector_types.h
@@ -131,7 +131,7 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) {
     #include "Grid_a64fx-fixedsize.h"
   #endif
 #else
-  #pragma message("building GEN") // generic
+  //#pragma message("building GEN") // generic
   #include "Grid_generic.h"
 #endif
 #endif
diff --git a/SVE_README.txt b/SVE_README.txt
index e6edc329..0c167c4a 100644
--- a/SVE_README.txt
+++ b/SVE_README.txt
@@ -1,3 +1,13 @@
+* gcc 10.1 prebuild, QPACE4 interactive login w/ MPI
+
+scl enable gcc-toolset-10 bash
+module load mpi/openmpi-aarch64
+
+../configure --enable-simd=A64FX --enable-comms=mpi3 --enable-shm=shmget CXX=mpicxx CC=mpicc
+
+
+================================== deprecated ================================================
+
 * gcc 10.1 prebuild, QPACE4 interactive login
 
 scl enable gcc-toolset-10 bash

From 906b78811b59bafe75249a1e458e0b4f618ef5c9 Mon Sep 17 00:00:00 2001
From: nmeyer-ur
Date: Wed, 22 Jul 2020 08:57:01 +0200
Subject: [PATCH 146/147] exit in Init when using --comms-overlap

---
 Grid/util/Init.cc | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc
index 656e29a9..a508f5a6 100644
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -318,6 +318,13 @@ void Grid_init(int *argc,char ***argv)
     Grid_debug_handler_init();
   }
 
+#if defined(A64FX)
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
+    std::cout << "Option --comms-overlap currently not supported on QPACE4. Exiting."
+    exit(EXIT_FAILURE);
+  }
+#endif
+
 //////////////////////////////////////////////////////////
 // Memory manager
 //////////////////////////////////////////////////////////

From ea7f8fda5ede7085e45e3e9335f17f6a215786c2 Mon Sep 17 00:00:00 2001
From: nmeyer-ur
Date: Wed, 22 Jul 2020 09:34:05 +0200
Subject: [PATCH 147/147] fix typo

---
 Grid/util/Init.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc
index a508f5a6..6a84a2f2 100644
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -320,7 +320,7 @@ void Grid_init(int *argc,char ***argv)
 
 #if defined(A64FX)
   if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
-    std::cout << "Option --comms-overlap currently not supported on QPACE4. Exiting."
+    std::cout << "Option --comms-overlap currently not supported on QPACE4. Exiting." << std::endl;
     exit(EXIT_FAILURE);
   }
 #endif
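[ Editor's note, illustration only, not part of the patch series. Patches
  146/147 make Grid_init refuse --comms-overlap on QPACE4 by scanning the
  command line before the option can take effect. For reference, an option
  scan of this kind can be as small as the following; Grid's actual
  GridCmdOptionExists may differ in detail:

      #include <algorithm>
      #include <string>

      // true if the exact option string appears anywhere in [begin, end)
      bool GridCmdOptionExists(char **begin, char **end, const std::string &option) {
        return std::find(begin, end, option) != end;
      }

  In the patch it is called as GridCmdOptionExists(*argv, *argv + *argc, ...)
  because Grid_init receives argc and argv by pointer. ]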