From 4dd9e39e0d465e7cad3aef001dc0edf5e65b0ea6 Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Sat, 19 Dec 2020 00:54:31 +0100 Subject: [PATCH 1/5] up to +36% performance gain for dslash/dwf on QPACE 4 using GCC 10.1.1 --- .../implementation/WilsonKernelsAsmA64FX.h | 268 +- .../WilsonKernelsAsmBodyA64FX.h | 105 +- Grid/simd/Fujitsu_A64FX_asm_double.h | 148 +- Grid/simd/Fujitsu_A64FX_asm_single.h | 148 +- Grid/simd/Fujitsu_A64FX_intrin_double.h | 160 +- Grid/simd/Fujitsu_A64FX_intrin_single.h | 160 +- Grid/simd/Fujitsu_A64FX_undef.h | 1 + Grid/simd/gridverter.py | 2377 ----------------- 8 files changed, 447 insertions(+), 2920 deletions(-) delete mode 100755 Grid/simd/gridverter.py diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h index 2e587dfa..ffec05a0 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h @@ -38,9 +38,6 @@ Author: Nils Meyer Regensburg University // undefine everything related to kernels #include -// enable A64FX body -#define WILSONKERNELSASMBODYA64FX -//#pragma message("A64FX Dslash: WilsonKernelsAsmBodyA64FX.h") /////////////////////////////////////////////////////////// // If we are A64FX specialise the single precision routine @@ -63,119 +60,89 @@ Author: Nils Meyer Regensburg University #define INTERIOR_AND_EXTERIOR #undef INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + ///////////////////////////////////////////////////////////////// @@ -185,119 +152,89 @@ WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldV #define INTERIOR_AND_EXTERIOR #undef INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + // undefine @@ -330,119 +267,89 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFie #define INTERIOR_AND_EXTERIOR #undef INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + ///////////////////////////////////////////////////////////////// // XYZT vectorised, dag Kernel, double @@ -451,124 +358,93 @@ WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldV #define INTERIOR_AND_EXTERIOR #undef INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + // undefs -#undef WILSONKERNELSASMBODYA64FX #include #endif //A64FXASM diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h index 406e5c25..83588a7d 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h @@ -25,6 +25,11 @@ Author: Nils Meyer Regensburg University See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ + +// GCC 10 messes up SVE instruction scheduling using -O3 only, +// using -O3 -fno-schedule-insns -fno-schedule-insns2 does wonders +// performance is better than armclang 20.2 + #ifdef KERNEL_DAG #define DIR0_PROJ XP_PROJ #define DIR1_PROJ YP_PROJ @@ -97,7 +102,7 @@ Author: Nils Meyer Regensburg University PROJ; \ MAYBEPERM(PERMUTE_DIR,perm); \ } else { \ - LOAD_CHI(base); \ + LOAD_CHI(base); \ } \ base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ MULT_2SPIN_1(Dir); \ @@ -110,6 +115,15 @@ Author: Nils Meyer Regensburg University } \ RECON; \ +/* +NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty + though I expected that it would improve on performance + + if (s == 0) { \ + if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ + } \ +*/ + #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ PREFETCH1_CHIMU(base); \ @@ -126,73 +140,63 @@ Author: Nils Meyer Regensburg University #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ basep = st.GetPFInfo(nent,plocal); nent++; \ - if ( local ) { \ - LOAD_CHIMU(base); \ - LOAD_TABLE(PERMUTE_DIR); \ - PROJ; \ - MAYBEPERM(PERMUTE_DIR,perm); \ - }else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \ - base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ - if ( local || st.same_node[Dir] ) { \ - MULT_2SPIN_1(Dir); \ - PREFETCH_CHIMU(base); \ - /* PREFETCH_GAUGE_L1(NxtDir); */ \ - MULT_2SPIN_2; \ - if (s == 0) { \ - if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ - } \ - RECON; \ - PREFETCH_CHIMU_L2(basep); \ - } else { PREFETCH_CHIMU(base); } \ + if ( local ) { \ + LOAD_CHIMU(base); \ + LOAD_TABLE(PERMUTE_DIR); \ + PROJ; \ + MAYBEPERM(PERMUTE_DIR,perm); \ + }else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \ + if ( local || st.same_node[Dir] ) { \ + MULT_2SPIN_1(Dir); \ + MULT_2SPIN_2; \ + RECON; \ + } \ + base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ + PREFETCH_CHIMU(base); \ + PREFETCH_CHIMU_L2(basep); \ #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ PREFETCH1_CHIMU(base); \ + { ZERO_PSI; } \ ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) #define RESULT(base,basep) SAVE_RESULT(base,basep); #endif + //////////////////////////////////////////////////////////////////////////////// // Post comms kernel //////////////////////////////////////////////////////////////////////////////// #ifdef EXTERIOR - #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ - base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ - if((!local)&&(!st.same_node[Dir]) ) { \ - LOAD_CHI(base); \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + if((!local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI(base); \ MULT_2SPIN_1(Dir); \ - PREFETCH_CHIMU(base); \ - /* PREFETCH_GAUGE_L1(NxtDir); */ \ MULT_2SPIN_2; \ - if (s == 0) { \ - if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ - } \ - RECON; \ - nmu++; \ + RECON; \ + nmu++; \ } -#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ - nmu=0; \ - base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;\ - if((!local)&&(!st.same_node[Dir]) ) { \ - LOAD_CHI(base); \ +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + nmu=0; \ + { ZERO_PSI;} \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + if((!local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI(base); \ MULT_2SPIN_1(Dir); \ - PREFETCH_CHIMU(base); \ - /* PREFETCH_GAUGE_L1(NxtDir); */ \ MULT_2SPIN_2; \ - if (s == 0) { \ - if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ - } \ - RECON; \ - nmu++; \ + RECON; \ + nmu++; \ } #define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} #endif + + { int nmu; int local,perm, ptype; @@ -209,7 +213,6 @@ Author: Nils Meyer Regensburg University int ssn=ssU+1; if(ssn>=nmax) ssn=0; // int sUn=lo.Reorder(ssn); int sUn=ssn; - LOCK_GAUGE(0); #else int sU =ssU; int ssn=ssU+1; if(ssn>=nmax) ssn=0; @@ -295,6 +298,11 @@ Author: Nils Meyer Regensburg University std::cout << "----------------------------------------------------" << std::endl; #endif + // DC ZVA test + // { uint64_t basestore = (uint64_t)&out[ss]; + // PREFETCH_RESULT_L2_STORE(basestore); } + + ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON); #ifdef SHOW @@ -308,6 +316,11 @@ Author: Nils Meyer Regensburg University std::cout << "----------------------------------------------------" << std::endl; #endif + // DC ZVA test + //{ uint64_t basestore = (uint64_t)&out[ss]; + // PREFETCH_RESULT_L2_STORE(basestore); } + + ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON); #ifdef SHOW @@ -321,6 +334,11 @@ Author: Nils Meyer Regensburg University std::cout << "----------------------------------------------------" << std::endl; #endif + // DC ZVA test + //{ uint64_t basestore = (uint64_t)&out[ss]; + // PREFETCH_RESULT_L2_STORE(basestore); + //} + ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON); #ifdef SHOW @@ -341,6 +359,7 @@ Author: Nils Meyer Regensburg University base = (uint64_t) &out[ss]; basep= st.GetPFInfo(nent,plocal); ent++; basep = (uint64_t) &out[ssn]; + //PREFETCH_RESULT_L1_STORE(base); RESULT(base,basep); #ifdef SHOW diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h index 76c556d7..bbc4efe7 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_double.h +++ b/Grid/simd/Fujitsu_A64FX_asm_double.h @@ -38,10 +38,11 @@ Author: Nils Meyer #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXd -#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B) +#define SAVE_RESULT(A,B) RESULT_A64FXd(A); #define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir) #define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd #define LOAD_CHI(base) LOAD_CHI_A64FXd(base) +#define ZERO_PSI ZERO_PSI_A64FXd #define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) #define XP_PROJ XP_PROJ_A64FXd #define YP_PROJ YP_PROJ_A64FXd @@ -70,11 +71,18 @@ Author: Nils Meyer #define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } } // DECLARATIONS #define DECLARATIONS_A64FXd \ + uint64_t baseU; \ const uint64_t lut[4][8] = { \ {4, 5, 6, 7, 0, 1, 2, 3}, \ {2, 3, 0, 1, 6, 7, 4, 5}, \ {1, 0, 3, 2, 5, 4, 7, 6}, \ {0, 1, 2, 4, 5, 6, 7, 8} };\ +asm ( \ + "ptrue p5.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ asm ( \ "fmov z31.d , 0 \n\t" \ : \ @@ -130,7 +138,7 @@ asm ( \ // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ { \ - const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ + const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ asm ( \ "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ @@ -149,7 +157,7 @@ asm ( \ // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ asm ( \ "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ @@ -163,12 +171,12 @@ asm ( \ #define LOAD_CHI_A64FXd(base) \ { \ asm ( \ - "ldr z12, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], 2, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1d { z12.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z13.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z14.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1d { z15.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z16.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z17.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ : \ : [fetchptr] "r" (base) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -178,19 +186,18 @@ asm ( \ #define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \ { \ asm ( \ - "ptrue p5.d \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z21.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z18.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z22.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z19.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z23.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z20.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (base + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -201,19 +208,18 @@ asm ( \ { \ const SiteSpinor & ref(in[offset]); \ asm ( \ - "ptrue p5.d \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z18.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z19.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z20.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z21.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z22.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z23.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ : \ : [fetchptr] "r" (&ref[2][0]) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -224,19 +230,18 @@ asm ( \ { \ const SiteSpinor & ref(in[offset]); \ asm ( \ - "ptrue p5.d \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z21.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z22.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z23.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z18.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z19.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z20.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (&ref[2][0]) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -293,17 +298,16 @@ asm ( \ ); // LOAD_GAUGE -#define LOAD_GAUGE \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +#define LOAD_GAUGE(A) \ { \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ asm ( \ - "ptrue p5.d \n\t" \ - "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z27.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ : \ : [fetchptr] "r" (baseU + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -312,14 +316,14 @@ asm ( \ // MULT_2SPIN #define MULT_2SPIN_1_A64FXd(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ asm ( \ - "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z27.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ "movprfx z18.d, p5/m, z31.d \n\t" \ "fcmla z18.d, p5/m, z24.d, z12.d, 0 \n\t" \ "movprfx z21.d, p5/m, z31.d \n\t" \ @@ -338,9 +342,9 @@ asm ( \ "fcmla z22.d, p5/m, z25.d, z15.d, 90 \n\t" \ "fcmla z20.d, p5/m, z26.d, z12.d, 90 \n\t" \ "fcmla z23.d, p5/m, z26.d, z15.d, 90 \n\t" \ - "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1d { z24.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z25.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (baseU + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -560,7 +564,6 @@ asm ( \ #define TM_PROJ_A64FXd \ { \ asm ( \ - "ptrue p5.d \n\t" \ "fsub z12.d, p5/m, z12.d, z18.d \n\t" \ "fsub z13.d, p5/m, z13.d, z19.d \n\t" \ "fsub z14.d, p5/m, z14.d, z20.d \n\t" \ @@ -715,7 +718,6 @@ asm ( \ // ZERO_PSI #define ZERO_PSI_A64FXd \ asm ( \ - "ptrue p5.d \n\t" \ "fmov z0.d , 0 \n\t" \ "fmov z1.d , 0 \n\t" \ "fmov z2.d , 0 \n\t" \ @@ -733,13 +735,13 @@ asm ( \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); -// PREFETCH_RESULT_L2_STORE (prefetch store to L2) +// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \ { \ asm ( \ - "prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + "dc zva, %[fetchptr]\n\t" \ + "dc zva, %[fetchptr]\n\t" \ + "dc zva, %[fetchptr]\n\t" \ : \ : [fetchptr] "r" (base) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h index d809f83b..e629f617 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ b/Grid/simd/Fujitsu_A64FX_asm_single.h @@ -38,10 +38,11 @@ Author: Nils Meyer #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXf -#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B) +#define SAVE_RESULT(A,B) RESULT_A64FXf(A); #define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir) #define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf #define LOAD_CHI(base) LOAD_CHI_A64FXf(base) +#define ZERO_PSI ZERO_PSI_A64FXf #define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) #define XP_PROJ XP_PROJ_A64FXf #define YP_PROJ YP_PROJ_A64FXf @@ -70,11 +71,18 @@ Author: Nils Meyer #define MAYBEPERM(A,perm) if (perm) { PERMUTE; } // DECLARATIONS #define DECLARATIONS_A64FXf \ + uint64_t baseU; \ const uint32_t lut[4][16] = { \ {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \ {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \ +asm ( \ + "ptrue p5.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ asm ( \ "fmov z31.s , 0 \n\t" \ : \ @@ -130,7 +138,7 @@ asm ( \ // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \ { \ - const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ + const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ asm ( \ "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ @@ -149,7 +157,7 @@ asm ( \ // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ asm ( \ "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ @@ -163,12 +171,12 @@ asm ( \ #define LOAD_CHI_A64FXf(base) \ { \ asm ( \ - "ldr z12, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], 2, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1w { z12.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z13.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z14.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1w { z15.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1w { z16.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1w { z17.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ : \ : [fetchptr] "r" (base) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -178,19 +186,18 @@ asm ( \ #define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \ { \ asm ( \ - "ptrue p5.s \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1w { z21.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1w { z18.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1w { z22.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1w { z19.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1w { z23.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1w { z20.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (base + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -201,19 +208,18 @@ asm ( \ { \ const SiteSpinor & ref(in[offset]); \ asm ( \ - "ptrue p5.s \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1w { z18.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1w { z19.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1w { z20.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1w { z21.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1w { z22.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1w { z23.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ : \ : [fetchptr] "r" (&ref[2][0]) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -224,19 +230,18 @@ asm ( \ { \ const SiteSpinor & ref(in[offset]); \ asm ( \ - "ptrue p5.s \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1w { z21.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1w { z22.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1w { z23.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1w { z18.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1w { z19.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1w { z20.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (&ref[2][0]) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -293,17 +298,16 @@ asm ( \ ); // LOAD_GAUGE -#define LOAD_GAUGE \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +#define LOAD_GAUGE(A) \ { \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ asm ( \ - "ptrue p5.s \n\t" \ - "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1w { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1w { z26.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z27.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1w { z28.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1w { z29.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ : \ : [fetchptr] "r" (baseU + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -312,14 +316,14 @@ asm ( \ // MULT_2SPIN #define MULT_2SPIN_1_A64FXf(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ asm ( \ - "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1w { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1w { z26.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z27.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1w { z28.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1w { z29.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ "movprfx z18.s, p5/m, z31.s \n\t" \ "fcmla z18.s, p5/m, z24.s, z12.s, 0 \n\t" \ "movprfx z21.s, p5/m, z31.s \n\t" \ @@ -338,9 +342,9 @@ asm ( \ "fcmla z22.s, p5/m, z25.s, z15.s, 90 \n\t" \ "fcmla z20.s, p5/m, z26.s, z12.s, 90 \n\t" \ "fcmla z23.s, p5/m, z26.s, z15.s, 90 \n\t" \ - "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1w { z24.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1w { z25.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1w { z26.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (baseU + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -560,7 +564,6 @@ asm ( \ #define TM_PROJ_A64FXf \ { \ asm ( \ - "ptrue p5.s \n\t" \ "fsub z12.s, p5/m, z12.s, z18.s \n\t" \ "fsub z13.s, p5/m, z13.s, z19.s \n\t" \ "fsub z14.s, p5/m, z14.s, z20.s \n\t" \ @@ -715,7 +718,6 @@ asm ( \ // ZERO_PSI #define ZERO_PSI_A64FXf \ asm ( \ - "ptrue p5.s \n\t" \ "fmov z0.s , 0 \n\t" \ "fmov z1.s , 0 \n\t" \ "fmov z2.s , 0 \n\t" \ @@ -733,13 +735,13 @@ asm ( \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); -// PREFETCH_RESULT_L2_STORE (prefetch store to L2) +// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \ { \ asm ( \ - "prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + "dc zva, %[fetchptr]\n\t" \ + "dc zva, %[fetchptr]\n\t" \ + "dc zva, %[fetchptr]\n\t" \ : \ : [fetchptr] "r" (base) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h index 232610f2..361246fc 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_double.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -38,10 +38,11 @@ Author: Nils Meyer #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXd -#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B) +#define SAVE_RESULT(A,B) RESULT_A64FXd(A); #define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir) #define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd #define LOAD_CHI(base) LOAD_CHI_A64FXd(base) +#define ZERO_PSI ZERO_PSI_A64FXd #define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) #define XP_PROJ XP_PROJ_A64FXd #define YP_PROJ YP_PROJ_A64FXd @@ -70,6 +71,7 @@ Author: Nils Meyer #define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } } // DECLARATIONS #define DECLARATIONS_A64FXd \ + uint64_t baseU; \ const uint64_t lut[4][8] = { \ {4, 5, 6, 7, 0, 1, 2, 3}, \ {2, 3, 0, 1, 6, 7, 4, 5}, \ @@ -126,18 +128,18 @@ Author: Nils Meyer // RESULT #define RESULT_A64FXd(base) \ { \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-6), result_00); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-5), result_01); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-4), result_02); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-3), result_10); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-2), result_11); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-1), result_12); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(0), result_20); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(1), result_21); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(2), result_22); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(3), result_30); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(4), result_31); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(5), result_32); \ } // PREFETCH_CHIMU_L2 (prefetch to L2) #define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ @@ -156,7 +158,7 @@ Author: Nils Meyer // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ { \ - const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ + const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ @@ -170,7 +172,7 @@ Author: Nils Meyer // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ @@ -178,62 +180,62 @@ Author: Nils Meyer // LOAD_CHI #define LOAD_CHI_A64FXd(base) \ { \ - Chi_00 = svld1(pg1, (float64_t*)(base + 0 * 64)); \ - Chi_01 = svld1(pg1, (float64_t*)(base + 1 * 64)); \ - Chi_02 = svld1(pg1, (float64_t*)(base + 2 * 64)); \ - Chi_10 = svld1(pg1, (float64_t*)(base + 3 * 64)); \ - Chi_11 = svld1(pg1, (float64_t*)(base + 4 * 64)); \ - Chi_12 = svld1(pg1, (float64_t*)(base + 5 * 64)); \ + Chi_00 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(0)); \ + Chi_01 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(1)); \ + Chi_02 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(2)); \ + Chi_10 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(3)); \ + Chi_11 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(4)); \ + Chi_12 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(5)); \ } // LOAD_CHIMU #define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \ { \ - Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \ - Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \ + Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ + Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ } // LOAD_CHIMU_0213 #define LOAD_CHIMU_0213_A64FXd \ { \ const SiteSpinor & ref(in[offset]); \ - Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ - Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \ + Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ + Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ } // LOAD_CHIMU_0312 #define LOAD_CHIMU_0312_A64FXd \ { \ const SiteSpinor & ref(in[offset]); \ - Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \ - Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \ + Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ + Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ } // LOAD_TABLE0 #define LOAD_TABLE0 \ @@ -261,26 +263,26 @@ Author: Nils Meyer Chi_12 = svtbl(Chi_12, table0); // LOAD_GAUGE -#define LOAD_GAUGE \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +#define LOAD_GAUGE(A) \ { \ - U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ - U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ - U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ - U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ - U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ - U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ + U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \ + U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \ + U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \ + U_01 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \ + U_11 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \ + U_21 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \ } // MULT_2SPIN #define MULT_2SPIN_1_A64FXd(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ - U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ - U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ - U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ - U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ - U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ - U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ + U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \ + U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \ + U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \ + U_01 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \ + U_11 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \ + U_21 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \ UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \ UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \ UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \ @@ -293,9 +295,9 @@ Author: Nils Meyer UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ - U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \ - U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \ - U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \ + U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-4)); \ + U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-1)); \ + U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(2)); \ } // MULT_2SPIN_BACKEND #define MULT_2SPIN_2_A64FXd \ @@ -570,12 +572,12 @@ Author: Nils Meyer result_31 = svdup_f64(0.); \ result_32 = svdup_f64(0.); -// PREFETCH_RESULT_L2_STORE (prefetch store to L2) +// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 0) : "memory" ); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 1) : "memory" ); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 2) : "memory" ); \ } // PREFETCH_RESULT_L1_STORE (prefetch store to L1) #define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base) \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h index 180e5f4f..30273b6e 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_single.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -38,10 +38,11 @@ Author: Nils Meyer #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXf -#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B) +#define SAVE_RESULT(A,B) RESULT_A64FXf(A); #define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir) #define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf #define LOAD_CHI(base) LOAD_CHI_A64FXf(base) +#define ZERO_PSI ZERO_PSI_A64FXf #define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) #define XP_PROJ XP_PROJ_A64FXf #define YP_PROJ YP_PROJ_A64FXf @@ -70,6 +71,7 @@ Author: Nils Meyer #define MAYBEPERM(A,perm) if (perm) { PERMUTE; } // DECLARATIONS #define DECLARATIONS_A64FXf \ + uint64_t baseU; \ const uint32_t lut[4][16] = { \ {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ @@ -126,18 +128,18 @@ Author: Nils Meyer // RESULT #define RESULT_A64FXf(base) \ { \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-6), result_00); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-5), result_01); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-4), result_02); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-3), result_10); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-2), result_11); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-1), result_12); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(0), result_20); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(1), result_21); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(2), result_22); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(3), result_30); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(4), result_31); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(5), result_32); \ } // PREFETCH_CHIMU_L2 (prefetch to L2) #define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \ @@ -156,7 +158,7 @@ Author: Nils Meyer // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \ { \ - const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ + const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ @@ -170,7 +172,7 @@ Author: Nils Meyer // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ @@ -178,62 +180,62 @@ Author: Nils Meyer // LOAD_CHI #define LOAD_CHI_A64FXf(base) \ { \ - Chi_00 = svld1(pg1, (float32_t*)(base + 0 * 64)); \ - Chi_01 = svld1(pg1, (float32_t*)(base + 1 * 64)); \ - Chi_02 = svld1(pg1, (float32_t*)(base + 2 * 64)); \ - Chi_10 = svld1(pg1, (float32_t*)(base + 3 * 64)); \ - Chi_11 = svld1(pg1, (float32_t*)(base + 4 * 64)); \ - Chi_12 = svld1(pg1, (float32_t*)(base + 5 * 64)); \ + Chi_00 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(0)); \ + Chi_01 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(1)); \ + Chi_02 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(2)); \ + Chi_10 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(3)); \ + Chi_11 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(4)); \ + Chi_12 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(5)); \ } // LOAD_CHIMU #define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \ { \ - Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \ - Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \ + Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ + Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ } // LOAD_CHIMU_0213 #define LOAD_CHIMU_0213_A64FXf \ { \ const SiteSpinor & ref(in[offset]); \ - Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ - Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \ + Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ + Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ } // LOAD_CHIMU_0312 #define LOAD_CHIMU_0312_A64FXf \ { \ const SiteSpinor & ref(in[offset]); \ - Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \ - Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \ + Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ + Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ } // LOAD_TABLE0 #define LOAD_TABLE0 \ @@ -261,26 +263,26 @@ Author: Nils Meyer Chi_12 = svtbl(Chi_12, table0); // LOAD_GAUGE -#define LOAD_GAUGE \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +#define LOAD_GAUGE(A) \ { \ - U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ - U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ - U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ - U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ - U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ - U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ + U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \ + U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \ + U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \ + U_01 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \ + U_11 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \ + U_21 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \ } // MULT_2SPIN #define MULT_2SPIN_1_A64FXf(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ - U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ - U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ - U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ - U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ - U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ - U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ + U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \ + U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \ + U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \ + U_01 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \ + U_11 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \ + U_21 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \ UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \ UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \ UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \ @@ -293,9 +295,9 @@ Author: Nils Meyer UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ - U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \ - U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \ - U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \ + U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-4)); \ + U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-1)); \ + U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(2)); \ } // MULT_2SPIN_BACKEND #define MULT_2SPIN_2_A64FXf \ @@ -570,12 +572,12 @@ Author: Nils Meyer result_31 = svdup_f32(0.); \ result_32 = svdup_f32(0.); -// PREFETCH_RESULT_L2_STORE (prefetch store to L2) +// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 0) : "memory" ); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 1) : "memory" ); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 2) : "memory" ); \ } // PREFETCH_RESULT_L1_STORE (prefetch store to L1) #define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base) \ diff --git a/Grid/simd/Fujitsu_A64FX_undef.h b/Grid/simd/Fujitsu_A64FX_undef.h index 81eec37a..51762a60 100644 --- a/Grid/simd/Fujitsu_A64FX_undef.h +++ b/Grid/simd/Fujitsu_A64FX_undef.h @@ -46,6 +46,7 @@ Author: Nils Meyer #undef MULT_2SPIN_2 #undef MAYBEPERM #undef LOAD_CHI +#undef ZERO_PSI #undef XP_PROJ #undef YP_PROJ #undef ZP_PROJ diff --git a/Grid/simd/gridverter.py b/Grid/simd/gridverter.py deleted file mode 100755 index f00a5019..00000000 --- a/Grid/simd/gridverter.py +++ /dev/null @@ -1,2377 +0,0 @@ -#!/usr/bin/python3 - -import re -import argparse -import sys - -# Grid for A64FX -# -# * should align std::vector to (multiples of) cache block size = 256 bytes - -# place benchmark runtime in cycles here ! -measured_cycles = 690 #1500 #775 #1500 - - -# command line parser -parser = argparse.ArgumentParser(description="Dslash generator.") -parser.add_argument("--single", action="store_true", default="False") -parser.add_argument("--double", action="store_true", default="True") -parser.add_argument("--debug", action="store_true", default="False") -parser.add_argument("--gridbench", action="store_true", default="False") -args = parser.parse_args() - -print(args) - -ASM_LOAD_CHIMU = True # load chimu -ASM_LOAD_GAUGE = True # load gauge -ASM_LOAD_TABLE = True # load table -ASM_STORE = True # store result - -# Disable all loads and stores in asm for benchmarking purposes -#DISABLE_ASM_LOAD_STORE = True -DISABLE_ASM_LOAD_STORE = False - -if DISABLE_ASM_LOAD_STORE: - ASM_LOAD_CHIMU = True # load chimu - ASM_LOAD_GAUGE = True # load gauge - ASM_LOAD_TABLE = True # load table - ASM_STORE = False # store result - -# Alternative implementation using PROJ specific loads works, -# but be careful with predication - -ALTERNATIVE_LOADS = False -#ALTERNATIVE_LOADS = not ALTERNATIVE_LOADS # True - -# Alternative register mapping, -# must use with my_wilson4.h and my_wilson4pf.h - -ALTERNATIVE_REGISTER_MAPPING = False -#ALTERNATIVE_REGISTER_MAPPING = not ALTERNATIVE_REGISTER_MAPPING - -if ALTERNATIVE_REGISTER_MAPPING == True: - ALTERNATIVE_LOADS = False - -# use movprfx -MOVPRFX = False -MOVPRFX = not MOVPRFX - - -PREFETCH = False -PREFETCH = not PREFETCH # True - -PRECISION = 'double' # DP by default -PRECSUFFIX = 'A64FXd' -if args.single == True: - PRECISION = 'single' - PRECSUFFIX = 'A64FXf' - -_DEBUG = False #True # insert debugging output -if args.debug == True: - _DEBUG = True - -GRIDBENCH = False -if args.gridbench == True: - GRIDBENCH = True - -print("PRECISION = ", PRECISION) -print("DEBUG = ", _DEBUG) -print("ALTERNATIVE_LOADS = ", ALTERNATIVE_LOADS) -print("ALTERNATIVE_REGISTER_MAPPING = ", ALTERNATIVE_REGISTER_MAPPING) -print("MOVPRFX = ", MOVPRFX) -print("DISABLE_ASM_LOAD_STORE = ", DISABLE_ASM_LOAD_STORE) -print("GRIDBENCH = ", GRIDBENCH) - -print("") - -#sys.exit(0) - - -#_DEBUG = True # insert debugging output - -FETCH_BASE_PTR_COLOR_OFFSET = 2 # offset for scalar plus signed immediate addressing -STORE_BASE_PTR_COLOR_OFFSET = 2 - -# 64-bit gp register usage !!! armclang 20.0 complains about the register choice !!! -# table address: x30 -# data address: x29 -# store address: x28 -# debug address: r8 - -# Max performance of complex FMA using FCMLA instruction -# is 25% peak. -# -# Issue latency of FCMLA is 2 cycles. -# Need 2 FCMLA instructions for complex FMA. -# Complete complex FMA takes 4 cycles. -# Peak throughput is 4 * 8 Flops DP = 32 Flops DP in 4 cycles. -# A64FX FMA throughput is 4 * 8 * 2 * 2 = 132 Flops DP in 4 cycles. -# -> 25% peak FMA -# -# In: 3x 512 bits = 192 bytes -# Out: 1x 512 bits = 64 bytes -# Tot: 4x 512 bits = 256 bytes -# -# 256 bytes * 2.2 GHz = 563.2 GB/s (base 10), 524 GB/s (base 2) - -OPT = """ -* interleave prefetching and compute in MULT_2SPIN -* could test storing U's in MULT_2SPIN to L1d for cache line update -* structure reordering: MAYBEPERM after MULT_2SPIN ? -""" - -filename = 'XXX' -LEGAL = """/************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: {} - - Copyright (C) 2020 - -Author: Nils Meyer - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory -*************************************************************************************/ -/* END LEGAL */ -""" - -class Register: - - def __init__(self, variable, asmreg='X', predication=False): - global d - x = 'Y' - if predication == False: - x = asmreg # + d['asmsuffix'] - else: - x = asmreg - self.asmreg = x - self.asmregwithsuffix = asmreg + d['asmsuffix'] - self.asmregbyte = asmreg + '.b' - self.name = variable - self.asmname = variable - self.asmnamebyte = variable + '.b' - self.predication = predication - - d['registers'] += 1 - - def define(self, statement): - global d - d['C'] += F'#define {self.name} {statement}' - #d['A'] += F'#define {self.name} {statement}' - - def declare(self, predication=False): - global d - - if self.predication == False: - d['C'] += F' Simd {self.name}; \\\n' - - predtype = 'svfloat64_t' - if PRECISION == 'single': - predtype = 'svfloat32_t' - - d['I'] += F' {predtype} {self.name}; \\\n' - else: - d['I'] += F' svbool_t {self.name}; \\\n' - #d['A'] += F'#define {self.name} {self.asmreg} \n' - - def loadpredication(self, target='A'): - global d - if (target == 'A'): - d['A'] += F' "ptrue {self.asmregwithsuffix} \\n\\t" \\\n' - d['asmclobber'].append(F'"{self.asmreg}"') - - def loadtable(self, t): - global d - d['load'] += d['factor'] - gpr = d['asmtableptr'] - - cast = 'uint64_t' - #asm_opcode = 'ld1d' - #if PRECISION == 'single': - # asm_opcode = 'ld1w' - # cast = 'uint32_t' - asm_opcode = 'ldr' - if PRECISION == 'single': - asm_opcode = 'ldr' - cast = 'uint32_t' - - d['I'] += F' {self.name} = svld1(pg1, ({cast}*)&lut[{t}]); \\\n' - - # using immediate index break-out works - if asm_opcode == 'ldr': - # ldr version - d['A'] += F' "{asm_opcode} {self.asmreg}, [%[tableptr], %[index], mul vl] \\n\\t" \\\n' - else: - # ld1 version - d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}/z, [%[tableptr], %[index], mul vl] \\n\\t" \\\n' - - d['asminput'].append(F'[tableptr] "r" (&lut[0])') - d['asminput'].append(F'[index] "i" ({t})') - d['asmclobber'].append(F'"memory"') - d['asmclobber'].append(F'"cc"') - - def load(self, address, target='ALL', cast='float64_t', colors=3, offset=FETCH_BASE_PTR_COLOR_OFFSET): - global d - d['load'] += d['factor'] - indices = re.findall(r'\d+', address) - index = (int(indices[0]) - offset) * colors + int(indices[1]) - - #asm_opcode = 'ld1d' - #if PRECISION == 'single': - #asm_opcode = 'ld1w' - # cast = 'float32_t' - - asm_opcode = 'ldr' - if PRECISION == 'single': - asm_opcode = 'ldr' - cast = 'float32_t' - - gpr = d['asmfetchbaseptr'] - intrinfetchbase = d['intrinfetchbase'] - if (target in ['ALL', 'C']): - d['C'] += F' {self.name} = {address}; \\\n' - if (target in ['ALL', 'I']): -# d['I'] += F' {self.name} = svldnt1(pg1, ({cast}*)({intrinfetchbase} + {index} * 64)); \\\n' - d['I'] += F' {self.name} = svld1(pg1, ({cast}*)({intrinfetchbase} + {index} * 64)); \\\n' - if (target in ['ALL', 'A']): - if asm_opcode == 'ldr': - d['A'] += F' "{asm_opcode} {self.asmreg}, [%[fetchptr], {index}, mul vl] \\n\\t" \\\n' - else: - d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}/z, [%[fetchptr], {index}, mul vl] \\n\\t" \\\n' - - def store(self, address, cast='float64_t', colors=3, offset=STORE_BASE_PTR_COLOR_OFFSET): - global d - d['store'] += d['factor'] - indices = re.findall(r'\d+', address) - index = (int(indices[0]) - offset) * colors + int(indices[1]) - - #asm_opcode = 'stnt1d' - #if PRECISION == 'single': - # asm_opcode = 'stnt1w' - # cast = 'float32_t' - asm_opcode = 'str' - if PRECISION == 'single': - asm_opcode = 'str' - cast = 'float32_t' - - intrinstorebase = d['intrinstorebase'] - - d['C'] += F' {address} = {self.name}; \\\n' - #d['I'] += F' svstnt1(pg1, ({cast}*)({intrinstorebase} + {index} * 64), {self.name}); \\\n' - d['I'] += F' svst1(pg1, ({cast}*)({intrinstorebase} + {index} * 64), {self.name}); \\\n' - if asm_opcode == 'str': - d['A'] += F' "{asm_opcode} {self.asmreg}, [%[storeptr], {index}, mul vl] \\n\\t" \\\n' - else: - d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}, [%[storeptr], {index}, mul vl] \\n\\t" \\\n' - - def movestr(self, str): - global d - #d['move'] += d['factor'] - d['I'] += F' {self.name} = {str}; \\\n' - - def move(self, op1): - global d - d['move'] += d['factor'] - d['C'] += F' {self.name} = {op1.name}; \\\n' - d['I'] += F' {self.name} = {op1.name}; \\\n' - d['A'] += F' "mov {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' - - # a = a + b , a = b + c - def add(self, op1, op2=None): - global d - d['add'] += d['factor'] - if op2 is None: - d['C'] += F' {self.name} = {self.name} + {op1.name}; \\\n' - d['I'] += F' {self.name} = svadd_x(pg1, {self.name}, {op1.name}); \\\n' - d['A'] += F' "fadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' - else: - d['C'] += F' {self.name} = {op1.name} + {op2.name}; \\\n' - d['I'] += F' {self.name} = svadd_x(pg1, {op1.name}, {op2.name}); \\\n' - d['A'] += F' "fadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix} \\n\\t" \\\n' - - # a = a -b , a = b - c - def sub(self, op1, op2=None): - global d - d['sub'] += d['factor'] - if op2 is None: - d['C'] += F' {self.name} = {self.name} - {op1.name}; \\\n' - d['I'] += F' {self.name} = svsub_x(pg1, {self.name}, {op1.name}); \\\n' - d['A'] += F' "fsub {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' - else: - d['C'] += F' {self.name} = {op1.name} - {op2.name}; \\\n' - d['I'] += F' {self.name} = svsub_x(pg1, {op1.name}, {op2.name}); \\\n' - d['A'] += F' "fsub {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix} \\n\\t" \\\n' - - # a = a * b , a = b * c - def mul(self, op1, op2): - global d - d['mul'] += 2 * d['factor'] - d['C'] += F' {self.name} = {op1.name} * {op2.name}; \\\n' - d['I'] += F' {self.name} = __svzero({self.name}); \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "mov {self.asmregwithsuffix} , 0 \\n\\t" \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - - def mul0(self, op1, op2, op3=None, constructive=False): - global d - d['mul'] += d['factor'] - - # no movprfx intrinsics support - if constructive == True: - d['movprfx'] += d['factor'] - d['I'] += F' {self.name} = svcmla_x(pg1, {op1.name}, {op2.name}, {op3.name}, 0); \\\n' - d['A'] += F' "movprfx {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op2.asmregwithsuffix}, {op3.asmregwithsuffix}, 0 \\n\\t" \\\n' - else: - d['C'] += F' {self.name} = {op1.name} * {op2.name}; \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' - - def mul1(self, op1, op2): - global d - d['mul'] += d['factor'] - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - - def mac(self, op1, op2): - global d - d['mac'] += 2 * d['factor'] - d['C'] += F' {self.name} = {self.name} + {op1.name} * {op2.name}; \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - - def mac0(self, op1, op2): - global d - d['mac'] += d['factor'] - d['C'] += F' {self.name} = {self.name} + {op1.name} * {op2.name}; \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' - - def mac1(self, op1, op2): - global d - d['mac'] += d['factor'] - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - - def zero(self, zeroreg=False): - d['zero'] += d['factor'] - d['C'] += F' {self.name} = 0; \\\n' - #d['I'] += F' {self.name} = __svzero({self.name}); \\\n' only armclang - - if PRECISION == 'double': - d['I'] += F' {self.name} = svdup_f64(0.); \\\n' - else: - d['I'] += F' {self.name} = svdup_f32(0.); \\\n' - - if zeroreg == True: - d['A'] += F' "fmov {self.asmregwithsuffix} , 0 \\n\\t" \\\n' - else: - #using mov z, zero0 issue 1c, FLA, latency 6c - #d['A'] += F' "mov {self.asmregwithsuffix} , {zero0.asmregwithsuffix} \\n\\t" \\\n' - - #using mov z, 0 issue 1c, FLA, latency 6c - d['A'] += F' "fmov {self.asmregwithsuffix} , 0 \\n\\t" \\\n' - - #using xor z, z, z issue 0.5c, FL*, latency 4c - #d['A'] += F' "eor {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {self.asmregwithsuffix} \\n\\t" \\\n' - - #using and z, z, zero0 issue 0.5c, FL*, latency 4c - #d['A'] += F' "and {self.asmregwithsuffix}, {self.asmregwithsuffix} , {zero0.asmregwithsuffix} \\n\\t" \\\n' - - #using sub z, z, z issue 0.5c, FL*, latency 9c - #d['A'] += F' "sub {self.asmregwithsuffix}, {self.asmregwithsuffix}, {self.asmregwithsuffix} \\n\\t" \\\n' - - # without table - def timesI(self, op1, tempreg=None, tablereg=None): - global d - d['timesI'] += d['factor'] - d['C'] += F' {self.name} = timesI({op1.name}); \\\n' - # correct if DEBUG enabled, wrong if DEBUG disabled; no idea what's causing this - #table.load('table2', target='I', cast='uint64_t') - #d['I'] += F' {self.name} = svtbl({op1.name}, {tablereg.name}); \\\n' - #d['I'] += F' {self.name} = svneg_x(pg2, {self.name}); \\\n' - # timesI using trn tested, works but tbl should be faster - d['I'] += F' {tempreg.name} = svtrn2({op1.name}, {op1.name}); \\\n' - d['I'] += F' {tempreg.name} = svneg_x(pg1, {tempreg.name}); \\\n' - d['I'] += F' {self.name} = svtrn1({tempreg.name}, {op1.name}); \\\n' - d['A'] += F' "trn2 {tempreg.asmregwithsuffix}, {op1.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' - d['A'] += F' "fneg {tempreg.asmregwithsuffix}, {pg1.asmreg}/m, {tempreg.asmregwithsuffix} \\n\\t" \\\n' - d['A'] += F' "trn1 {self.asmregwithsuffix}, {tempreg.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' - - def addTimesI(self, op1, op2=None, constructive=False): - global d - d['addTimesI'] += d['factor'] - - if op2 is None: - d['C'] += F' {self.name} = {self.name} + timesI({op1.name}); \\\n' - else: - d['C'] += F' {self.name} = {op1.name} + timesI({op2.name}); \\\n' - - # no movprfx intrinsics support - if constructive == True: - d['movprfx'] += d['factor'] - d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "movprfx {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - else: - if op2 is None: - d['C'] += F' {self.name} = {self.name} + timesI({op1.name}); \\\n' - d['I'] += F' {self.name} = svcadd_x(pg1, {self.name}, {op1.name}, 90); \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix}, 90 \\n\\t" \\\n' - else: - d['C'] += F' {self.name} = {op1.name} + timesI({op2.name}); \\\n' - d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - - def subTimesI(self, op1, op2=None, constructive=False): - global d - d['subTimesI'] += d['factor'] - - # no movprfx intrinsics support - if constructive == True: - d['movprfx'] += d['factor'] - d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 270); \\\n' - d['A'] += F' "movprfx {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op2.asmregwithsuffix}, 270 \\n\\t" \\\n' - else: - if op2 is None: - d['C'] += F' {self.name} = {self.name} - timesI({op1.name}); \\\n' - d['I'] += F' {self.name} = svcadd_x(pg1, {self.name}, {op1.name}, 270); \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix}, 270 \\n\\t" \\\n' - else: - d['C'] += F' {self.name} = {op1.name} - timesI({op2.name}); \\\n' - d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 270); \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 270 \\n\\t" \\\n' - - # timesMinusI is not used, def is probably wrong !!!! OPTIMIZATION with table - def timesMinusI(self, op1): - global d - d['timesMinusI'] += d['factor'] - d['C'] += F' {self.name} = timesMinusI({self.name}); \\\n' - d['I'] += F' {self.name} = svtrn1({op1.name}, {op1.name}); \\\n' - d['I'] += F' {self.name} = svneg_x(pg1, {self.name}); \\\n' - d['I'] += F' {self.name} = svtrn1({op1.name}, {self.name}); \\\n' - - def permute(self, dir, tablereg=None): - global d - d['permutes'] += d['factor'] - - d['C'] += F' permute{dir}({self.name}, {self.name}); \\\n' - - d['I'] += F' {self.name} = svtbl({self.name}, {tablereg.name}); \\\n' - d['A'] += F' "tbl {self.asmregwithsuffix}, {{ {self.asmregwithsuffix} }}, {tablereg.asmregwithsuffix} \\n\\t" \\\n' - - # if dir == 0: - # d['I'] += F' {self.name} = svext({self.name}, {self.name}, 4); \\\n' - # # this might not work, see intrinsics assembly - # # d['A'] += F' ext {self.name}, {self.name}, {self.name}, #4 \\\n' - # # use registers directly - # d['A'] += F' "ext {self.asmregbyte}, {self.asmregbyte}, {self.asmregbyte}, 32 \\n\\t" \\\n' - # - # elif dir in [1, 2]: - # d['I'] += F' {self.name} = svtbl({self.name}, {tablereg.name}); \\\n' - # d['A'] += F' "tbl {self.asmregwithsuffix}, {{ {self.asmregwithsuffix} }}, {tablereg.asmregwithsuffix} \\n\\t" \\\n' - - def debug(self): - global d - typecast = d['cfloat'] - gpr = d['asmdebugptr'] - vregs = d['asmclobberlist'] - if (d['debug'] == True): - d['C'] += F'std::cout << "{self.name} -- " << {self.name} << std::endl; \\\n' - - d['I'] += F'svst1(pg1, ({typecast}*)&debugreg.v, {self.name}); \\\n' - d['I'] += F'std::cout << "{self.name} -- " << debugreg << std::endl; \\\n' - #d['I'] += F'std::cout << "{self.name} -- " << {self.name} << std::endl; \\\n' - - d['A'] += F'asm ( \\\n' - d['A'] += F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\\n' # memory barrier - d['A'] += F' "str {self.asmreg}, [%[ptr]] \\n\\t" \\\n' - d['A'] += F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\\n' # memory barrier - d['A'] += F' : "=m" (debugreg.v) \\\n' - d['A'] += F' : [ptr] "r" (&debugreg.v) \\\n' - d['A'] += F' : "p5", "cc", "memory" \\\n' - d['A'] += F'); \\\n' - d['A'] += F'std::cout << "{self.name} -- " << debugreg << std::endl; \\\n' - # this form of addressing is not valid! - #d['A'] += F' "str {self.asmreg}, %[ptr] \\n\\t" \\\n' -# end Register - -def define(s, target='ALL'): - x = F'#define {s} \n' - global d - if (target in ['ALL', 'C']): - d['C'] += x - if (target in ['ALL', 'I']): - d['I'] += x - if (target in ['ALL', 'A']): - d['A'] += x - -def definemultiline(s): - x = F'#define {s} \\\n' - global d - d['C'] += x - d['I'] += x - d['A'] += x - -def write(s, target='ALL'): - x = F'{s}\n' - global d - if (target in ['ALL', 'C']): - d['C'] += x - if (target in ['ALL', 'I']): - d['I'] += x - if (target in ['ALL', 'A']): - d['A'] += x - -def curlyopen(): - write(F'{{ \\') - -def curlyclose(): - write(F'}}') - -def newline(target='ALL'): - global d - - if target == 'A': - if d['A'][-2:] == '\\\n': - d['A'] = d['A'][:-2] + '\n\n' - else: - if d['C'][-2:] == '\\\n': - d['C'] = d['C'][:-2] + '\n\n' - if d['I'][-2:] == '\\\n': - d['I'] = d['I'][:-2] + '\n\n' - if d['A'][-2:] == '\\\n': - d['A'] = d['A'][:-2] + '\n\n' - -# load the base pointer for fetches -def fetch_base_ptr(address, target='A'): - global d - #d['load'] += d['factor'] - - # DEBUG - #colors=3 - #indices = re.findall(r'\d+', address) - #index = (int(indices[0]) - FETCH_BASE_PTR_COLOR_OFFSET) * colors + int(indices[1]) - #print(F'{address} (base)') - - vregs = d['asmclobberlist'] - if target == 'A': - d['asminput'].append(F'[fetchptr] "r" ({address})') - d['asmclobber'].extend(vregs) - d['asmclobber'].append(F'"memory"') - d['asmclobber'].append(F'"cc"') - if target == 'I': - #print("intrinfetchbase = ", address) - d['intrinfetchbase'] = address - -# load the base pointer for stores -def store_base_ptr(address, target='A'): - global d - #d['load'] += d['factor'] - gpr = d['asmstorebaseptr'] - vregs = d['asmclobberlist'] - if target == 'A': - d['asminput'].append(F'[storeptr] "r" ({address})') - d['asmclobber'].extend(vregs) - d['asmclobber'].append(F'"memory"') - d['asmclobber'].append(F'"cc"') - if target == 'I': - d['intrinstorebase'] = address - -def prefetch_L1(address, offset): - global d - multiplier = 4 # offset in CL, have to multiply by 4 - policy = "PLDL1STRM" # weak - #policy = "PLDL1KEEP" # strong - - d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' - d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' - -def prefetch_L2(address, offset): - global d - multiplier = 4 # offset in CL, have to multiply by 4 - policy = "PLDL2STRM" # weak - #policy = "PLDL2KEEP" # strong - - d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' - d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' - #d['A'] += - -def prefetch_L2_store(address, offset): - global d - multiplier = 4 # offset in CL, have to multiply by 4 - policy = "PSTL2STRM" # weak - #policy = "PSTL2KEEP" # strong - - d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' - d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' - -def prefetch_L1_store(address, offset): - global d - multiplier = 4 # offset in CL, have to multiply by 4 - policy = "PSTL1STRM" # weak - #policy = "PSTL2KEEP" # strong - - d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' - d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' - - -def asmopen(): - #write('asm volatile ( \\', target='A') - write('asm ( \\', target='A') - - # DEBUG - #write(F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\', target='A') # memory barrier - #write('asm volatile ( \\', target='A') - -def asmclose(): - global d - - #print(d['asminput']) - - asmin = d['asminput'] - asmin_s = '' - if len(asmin) > 0: - asmin = list(dict.fromkeys(asmin)) # remove duplicates - #print(asmin) - for el in asmin: - asmin_s += el + ',' - asmin_s = asmin_s[:-1] - #print("-> ", asmin_s) - - d['asminput'] = [] - - asmout = d['asmoutput'] - asmout_s = '' - if len(asmout) > 0: - asmout = list(dict.fromkeys(asmout)) # remove duplicates - for el in asmout: - asmout_s += el + ',' - asmout_s = asmout_s[:-1] - - d['asmoutput'] = [] - - # DEBUG put all regs into clobber by default - d['asmclobber'].extend(d['asmclobberlist']) - - asmclobber = d['asmclobber'] - asmclobber_s = '' - #print(asmclobber) - if len(asmclobber) > 0: - asmclobber = list(dict.fromkeys(asmclobber)) # remove duplicates - for el in asmclobber: - asmclobber_s += el + ',' - asmclobber_s = asmclobber_s[:-1] - - d['asmclobber'] = [] - - # DEBUG - #write(F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\', target='A') # memory barrier - - - write(F' : {asmout_s} \\', target='A') - write(F' : {asmin_s} \\', target='A') - write(F' : {asmclobber_s} \\', target='A') - write('); \\', target='A') - -# -------------------------------------------------------------------------------- - -# string of vector registers to be used in clobber list -#clobberlist = ['"p0"'] -clobberlist = ['"p5"'] -clobberlist.append('"cc"') -for i in range(0, 32): - clobberlist.append(F'"z{i}"') - -d = { -'debug': _DEBUG, -'C': '', -'I': '', -'A': '', -'asmsuffix': '.d', # double precision by default -'cfloat': 'float64_t', -'registers': 0, -'load': 0, -'store': 0, -'move': 0, -'movprfx': 0, -'zero': 0, -'add': 0, -'sub': 0, -'mul': 0, -'mac': 0, -'permutes': 0, -'neg': 0, -'addTimesI': 0, -'subTimesI': 0, -'timesI': 0, -'timesMinusI': 0, -'flops': 0, -'factor': 1, # multiplicity -'asmtableptr': 'x30', -'asmfetchbaseptr': 'x29', -'asmstorebaseptr': 'x28', -'asmdebugptr': 'r12', -'asminput': [], -'asmoutput': [], -'asmclobber': [], -'asmclobberlist': clobberlist, -'intrinfetchbase': '', -'intrinstorebase': '', -'cycles_LOAD_CHIMU': 0, -'cycles_PROJ': 0, -'cycles_PERM': 0, -'cycles_MULT_2SPIN': 0, -'cycles_RECON': 0, -'cycles_RESULT': 0, -'cycles_ZERO_PSI': 0, -'cycles_PREFETCH_L1': 0, -'cycles_PREFETCH_L2': 0 -} - -if PRECISION == 'single': - d['asmsuffix'] = '.s' - d['cfloat'] = 'float32_t' - -# -------------------------------------------------------------------------------- -# Grid -# -------------------------------------------------------------------------------- - -# Variables / Registers -result_00 = Register('result_00', asmreg='z0') -result_01 = Register('result_01', asmreg='z1') -result_02 = Register('result_02', asmreg='z2') -result_10 = Register('result_10', asmreg='z3') -result_11 = Register('result_11', asmreg='z4') -result_12 = Register('result_12', asmreg='z5') -result_20 = Register('result_20', asmreg='z6') -result_21 = Register('result_21', asmreg='z7') -result_22 = Register('result_22', asmreg='z8') -result_30 = Register('result_30', asmreg='z9') -result_31 = Register('result_31', asmreg='z10') -result_32 = Register('result_32', asmreg='z11') # 12 Regs -Chi_00 = Register('Chi_00', asmreg='z12') -Chi_01 = Register('Chi_01', asmreg='z13') -Chi_02 = Register('Chi_02', asmreg='z14') -Chi_10 = Register('Chi_10', asmreg='z15') -Chi_11 = Register('Chi_11', asmreg='z16') -Chi_12 = Register('Chi_12', asmreg='z17') # 6 -UChi_00 = Register('UChi_00', asmreg='z18') -UChi_01 = Register('UChi_01', asmreg='z19') -UChi_02 = Register('UChi_02', asmreg='z20') -UChi_10 = Register('UChi_10', asmreg='z21') -UChi_11 = Register('UChi_11', asmreg='z22') -UChi_12 = Register('UChi_12', asmreg='z23') # 6 -U_00 = Register('U_00', asmreg='z24') -U_10 = Register('U_10', asmreg='z25') -U_20 = Register('U_20', asmreg='z26') -U_01 = Register('U_01', asmreg='z27') -U_11 = Register('U_11', asmreg='z28') -U_21 = Register('U_21', asmreg='z29') # 6 -> 30 Registers - -table0 = Register('table0', asmreg='z30') -zero0 = Register('zero0', asmreg='z31') # 2 -> 32 Registers -# can't overload temp1 / table due to type mismatch using intrinsics :( -# typecasting SVE intrinsics variables is not allowed - -pg1 = Register('pg1', predication=True, asmreg='p5') -#pg2 = Register('pg2', predication=True, asmreg='p1') - -# Overloaded with Chi_* and UChi_* -Chimu_00 = Register('Chimu_00', asmreg=Chi_00.asmreg) -Chimu_01 = Register('Chimu_01', asmreg=Chi_01.asmreg) -Chimu_02 = Register('Chimu_02', asmreg=Chi_02.asmreg) -Chimu_10 = Register('Chimu_10', asmreg=Chi_10.asmreg) -Chimu_11 = Register('Chimu_11', asmreg=Chi_11.asmreg) -Chimu_12 = Register('Chimu_12', asmreg=Chi_12.asmreg) -if ALTERNATIVE_REGISTER_MAPPING == False: - Chimu_20 = Register('Chimu_20', asmreg=UChi_00.asmreg) - Chimu_21 = Register('Chimu_21', asmreg=UChi_01.asmreg) - Chimu_22 = Register('Chimu_22', asmreg=UChi_02.asmreg) - Chimu_30 = Register('Chimu_30', asmreg=UChi_10.asmreg) - Chimu_31 = Register('Chimu_31', asmreg=UChi_11.asmreg) - Chimu_32 = Register('Chimu_32', asmreg=UChi_12.asmreg) # 12 Registers -else: # wilson4.h - Chimu_20 = Register('Chimu_20', asmreg=U_00.asmreg) - Chimu_21 = Register('Chimu_21', asmreg=U_10.asmreg) - Chimu_22 = Register('Chimu_22', asmreg=U_20.asmreg) - Chimu_30 = Register('Chimu_30', asmreg=U_01.asmreg) - Chimu_31 = Register('Chimu_31', asmreg=U_11.asmreg) - Chimu_32 = Register('Chimu_32', asmreg=U_21.asmreg) - -# debugging output -def debugall(msg=None, group='ALL'): - global d - if (d['debug'] == False): - return - write(F'std::cout << std::endl << "DEBUG -- {msg}" << std::endl; \\') - if (group in ['ALL', 'result']): - result_00.debug() - result_01.debug() - result_02.debug() - result_10.debug() - result_11.debug() - result_12.debug() - result_20.debug() - result_21.debug() - result_22.debug() - result_30.debug() - result_31.debug() - result_32.debug() - if (group in ['ALL', 'Chi']): - Chi_00.debug() - Chi_01.debug() - Chi_02.debug() - Chi_10.debug() - Chi_11.debug() - Chi_12.debug() - if (group in ['ALL', 'UChi']): - UChi_00.debug() - UChi_01.debug() - UChi_02.debug() - UChi_10.debug() - UChi_11.debug() - UChi_12.debug() - if (group in ['ALL', 'U']): - U_00.debug() - U_10.debug() - U_20.debug() - U_01.debug() - U_11.debug() - U_21.debug() - if (group in ['ALL', 'Chimu']): - Chimu_00.debug() - Chimu_01.debug() - Chimu_02.debug() - Chimu_10.debug() - Chimu_11.debug() - Chimu_12.debug() - Chimu_20.debug() - Chimu_21.debug() - Chimu_22.debug() - Chimu_30.debug() - Chimu_31.debug() - Chimu_32.debug() - -# -------------------------------------------------------------------------------- -# Output -# -------------------------------------------------------------------------------- - -if ALTERNATIVE_LOADS == True: - define(F'LOAD_CHIMU_0213_PLUG LOAD_CHIMU_0213_{PRECSUFFIX}') - define(F'LOAD_CHIMU_0312_PLUG LOAD_CHIMU_0312_{PRECSUFFIX}') - define(F'LOAD_CHIMU(x)') -else: - #define(F'LOAD_CHIMU_{PRECSUFFIX}(x) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(x)') - define(F'LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(base)') - -if PREFETCH: - define(F'PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_{PRECSUFFIX}(A)') - define(F'PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_{PRECSUFFIX}(A)') - define(F'PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_{PRECSUFFIX}(A)') - define(F'PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_{PRECSUFFIX}(A)') - define(F'PF_GAUGE(A)') - define(F'PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_{PRECSUFFIX}(A)') - define(F'PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_{PRECSUFFIX}(A)') - define(F'PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A)') -# define(F'PREFETCH1_CHIMU(A)') - define(F'PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A)') -# define(F'PREFETCH_CHIMU(A)') -else: - define(F'PREFETCH_CHIMU_L1(A)') - define(F'PREFETCH_GAUGE_L1(A)') - define(F'PREFETCH_CHIMU_L2(A)') - define(F'PREFETCH_GAUGE_L2(A)') - define(F'PF_GAUGE(A)') - define(F'PREFETCH1_CHIMU(A)') - define(F'PREFETCH_CHIMU(A)') - define(F'PREFETCH_RESULT_L2_STORE(A)') - -# standard defines -define(F'LOCK_GAUGE(A)') -define(F'UNLOCK_GAUGE(A)') -define(F'MASK_REGS DECLARATIONS_{PRECSUFFIX}') -define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_RESULT_L2_STORE(B)') -define(F'MULT_2SPIN_1(Dir) MULT_2SPIN_1_{PRECSUFFIX}(Dir)') -define(F'MULT_2SPIN_2 MULT_2SPIN_2_{PRECSUFFIX}') -define(F'LOAD_CHI(base) LOAD_CHI_{PRECSUFFIX}(base)') -# don't need zero psi, everything is done in recons -#define(F'ZERO_PSI ZERO_PSI_{PRECSUFFIX}') -define(F'ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_{PRECSUFFIX}; RESULT_{PRECSUFFIX}(base)') -# loads projections -define(F'XP_PROJ XP_PROJ_{PRECSUFFIX}') -define(F'YP_PROJ YP_PROJ_{PRECSUFFIX}') -define(F'ZP_PROJ ZP_PROJ_{PRECSUFFIX}') -define(F'TP_PROJ TP_PROJ_{PRECSUFFIX}') -define(F'XM_PROJ XM_PROJ_{PRECSUFFIX}') -define(F'YM_PROJ YM_PROJ_{PRECSUFFIX}') -define(F'ZM_PROJ ZM_PROJ_{PRECSUFFIX}') -define(F'TM_PROJ TM_PROJ_{PRECSUFFIX}') -# recons -define(F'XP_RECON XP_RECON_{PRECSUFFIX}') -define(F'XM_RECON XM_RECON_{PRECSUFFIX}') -define(F'XM_RECON_ACCUM XM_RECON_ACCUM_{PRECSUFFIX}') -define(F'YM_RECON_ACCUM YM_RECON_ACCUM_{PRECSUFFIX}') -define(F'ZM_RECON_ACCUM ZM_RECON_ACCUM_{PRECSUFFIX}') -define(F'TM_RECON_ACCUM TM_RECON_ACCUM_{PRECSUFFIX}') -define(F'XP_RECON_ACCUM XP_RECON_ACCUM_{PRECSUFFIX}') -define(F'YP_RECON_ACCUM YP_RECON_ACCUM_{PRECSUFFIX}') -define(F'ZP_RECON_ACCUM ZP_RECON_ACCUM_{PRECSUFFIX}') -define(F'TP_RECON_ACCUM TP_RECON_ACCUM_{PRECSUFFIX}') -# new permutes -define(F'PERMUTE_DIR0 0') -define(F'PERMUTE_DIR1 1') -define(F'PERMUTE_DIR2 2') -define(F'PERMUTE_DIR3 3') -define(F'PERMUTE PERMUTE_{PRECSUFFIX};') -# load table -#define(F'MAYBEPERM(A,perm) if (perm) {{ A ; }}') -if PRECISION == 'double': - define(F'LOAD_TABLE(Dir) if (Dir == 0) {{ LOAD_TABLE0; }} else if (Dir == 1) {{ LOAD_TABLE1; }} else if (Dir == 2) {{ LOAD_TABLE2; }}') - define(F'MAYBEPERM(Dir,perm) if (Dir != 3) {{ if (perm) {{ PERMUTE; }} }}') -else: - define(F'LOAD_TABLE(Dir) if (Dir == 0) {{ LOAD_TABLE0; }} else if (Dir == 1) {{ LOAD_TABLE1 }} else if (Dir == 2) {{ LOAD_TABLE2; }} else if (Dir == 3) {{ LOAD_TABLE3; }}') - define(F'MAYBEPERM(A,perm) if (perm) {{ PERMUTE; }}') - - - -write('// DECLARATIONS') -definemultiline(F'DECLARATIONS_{PRECSUFFIX}') -# debugging register -if d['debug'] == True: - write(' Simd debugreg; \\') -# perm tables -if PRECISION == 'double': - write(' const uint64_t lut[4][8] = { \\') - write(' {4, 5, 6, 7, 0, 1, 2, 3}, \\') #0 = swap register halves - write(' {2, 3, 0, 1, 6, 7, 4, 5}, \\') #1 = swap halves of halves - write(' {1, 0, 3, 2, 5, 4, 7, 6}, \\') #2 = swap re/im - write(' {0, 1, 2, 4, 5, 6, 7, 8} };\\') #3 = identity -else: - write(' const uint32_t lut[4][16] = { \\') - write(' {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \\') #0 = swap register halves - write(' {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \\') #1 = swap halves of halves - write(' {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \\') #2 = swap halves of halves of halves - write(' {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \\') #3 = swap re/im - -#newline(target='A') -result_00.declare() -result_01.declare() -result_02.declare() -result_10.declare() -result_11.declare() -result_12.declare() -result_20.declare() -result_21.declare() -result_22.declare() -result_30.declare() -result_31.declare() -result_32.declare() # 12 -Chi_00.declare() -Chi_01.declare() -Chi_02.declare() -Chi_10.declare() -Chi_11.declare() -Chi_12.declare() # 6 -UChi_00.declare() -UChi_01.declare() -UChi_02.declare() -UChi_10.declare() -UChi_11.declare() -UChi_12.declare() # 6 -U_00.declare() -U_10.declare() -U_20.declare() -U_01.declare() -U_11.declare() -U_21.declare() # 6 -> 30 regs - -# all predications true -pg1.declare() -if PRECISION == 'double': - pg1.movestr('svptrue_b64()') -else: - pg1.movestr('svptrue_b32()') - -# tables -if PRECISION == 'double': - write(' svuint64_t table0; \\', target='I') # -> 31 regs -else: - write(' svuint32_t table0; \\', target='I') # -> 31 regs - -zero0.declare() - -# zero register -asmopen() -zero0.zero(zeroreg=True) -asmclose() -newline() - -define('Chimu_00 Chi_00', target='I') -define('Chimu_01 Chi_01', target='I') -define('Chimu_02 Chi_02', target='I') -define('Chimu_10 Chi_10', target='I') -define('Chimu_11 Chi_11', target='I') -define('Chimu_12 Chi_12', target='I') -if ALTERNATIVE_REGISTER_MAPPING == False: - define('Chimu_20 UChi_00', target='I') - define('Chimu_21 UChi_01', target='I') - define('Chimu_22 UChi_02', target='I') - define('Chimu_30 UChi_10', target='I') - define('Chimu_31 UChi_11', target='I') - define('Chimu_32 UChi_12', target='I') -else: # wilson4.h - define('Chimu_20 U_00', target='I') - define('Chimu_21 U_10', target='I') - define('Chimu_22 U_20', target='I') - define('Chimu_30 U_01', target='I') - define('Chimu_31 U_11', target='I') - define('Chimu_32 U_21', target='I') -newline() - - -d['cycles_RESULT'] += 12 -write('// RESULT') -definemultiline(F'RESULT_{PRECSUFFIX}(base)') -if ASM_STORE: - curlyopen() - #write(' SiteSpinor & ref(out[ss]); \\') - asmopen() - #pg1.loadpredication() - #store_base_ptr("&ref[0][0]") - #store_base_ptr(F"&ref[{STORE_BASE_PTR_COLOR_OFFSET}][0]") - store_base_ptr(F"base + {STORE_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') - store_base_ptr(F"base + {STORE_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') - result_00.store("ref[0][0]") - result_01.store("ref[0][1]") - result_02.store("ref[0][2]") - result_10.store("ref[1][0]") - result_11.store("ref[1][1]") - result_12.store("ref[1][2]") - result_20.store("ref[2][0]") - result_21.store("ref[2][1]") - result_22.store("ref[2][2]") - result_30.store("ref[3][0]") - result_31.store("ref[3][1]") - result_32.store("ref[3][2]") - asmclose() - debugall('RESULT', group='result') - curlyclose() -newline() - -# prefetch spinors from memory into L2 cache -d['factor'] = 0 -d['cycles_PREFETCH_L2'] += 0 * d['factor'] -write('// PREFETCH_CHIMU_L2 (prefetch to L2)') -definemultiline(F'PREFETCH_CHIMU_L2_INTERNAL_{PRECSUFFIX}(base)') -curlyopen() -fetch_base_ptr(F"base") -asmopen() -#pg1.loadpredication() -#fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") -fetch_base_ptr(F"base", target='A') -prefetch_L2(F"base", 0) -prefetch_L2(F"base", 1) -prefetch_L2(F"base", 2) -asmclose() -curlyclose() -newline() - -# prefetch spinors from memory into L1 cache -d['factor'] = 0 -d['cycles_PREFETCH_L1'] += 0 * d['factor'] -write('// PREFETCH_CHIMU_L1 (prefetch to L1)') -definemultiline(F'PREFETCH_CHIMU_L1_INTERNAL_{PRECSUFFIX}(base)') -curlyopen() -fetch_base_ptr(F"base") -asmopen() -#pg1.loadpredication() -fetch_base_ptr(F"base", target='A') -prefetch_L1(F"base", 0) -prefetch_L1(F"base", 1) -prefetch_L1(F"base", 2) -asmclose() -curlyclose() -newline() - -# prefetch gauge from memory into L2 cache -d['factor'] = 0 -d['cycles_PREFETCH_L2'] += 0 * d['factor'] -write('// PREFETCH_GAUGE_L2 (prefetch to L2)') -definemultiline(F'PREFETCH_GAUGE_L2_INTERNAL_{PRECSUFFIX}(A)') -curlyopen() -if GRIDBENCH: # referencing differs in Grid and GridBench - write(' const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \\') -else: - write(' const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \\') -asmopen() -#pg1.loadpredication() -#fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") -fetch_base_ptr(F"baseU", target='A') -prefetch_L2(F"baseU", -1) -prefetch_L2(F"baseU", 0) -prefetch_L2(F"baseU", 1) -prefetch_L2(F"baseU", 2) -prefetch_L2(F"baseU", 3) -prefetch_L2(F"baseU", 4) -prefetch_L2(F"baseU", 5) -prefetch_L2(F"baseU", 6) -prefetch_L2(F"baseU", 7) -#prefetch_L2(F"baseU", 8) -asmclose() -curlyclose() -newline() - -# prefetch gauge from memory into L1 cache -d['factor'] = 0 -d['cycles_PREFETCH_L1'] += 0 * d['factor'] -write('// PREFETCH_GAUGE_L1 (prefetch to L1)') -definemultiline(F'PREFETCH_GAUGE_L1_INTERNAL_{PRECSUFFIX}(A)') -curlyopen() -if GRIDBENCH: # referencing differs in Grid and GridBench - write(' const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref; \\') -else: - write(' const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \\') -asmopen() -#pg1.loadpredication() -#fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") -fetch_base_ptr(F"baseU", target='A') -prefetch_L1(F"baseU", 0) -prefetch_L1(F"baseU", 1) -prefetch_L1(F"baseU", 2) -asmclose() -curlyclose() -newline() - -d['factor'] = 0 -write('// LOAD_CHI') -definemultiline(F'LOAD_CHI_{PRECSUFFIX}(base)') -if ASM_LOAD_CHIMU: - curlyopen() - #write(' const SiteSpinor & ref(in[offset]); \\') - asmopen() - #fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') - #fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') - fetch_base_ptr(F"base", target='I') - fetch_base_ptr(F"base", target='A') - - Chi_00.load("ref[0][0]", offset=0) - Chi_01.load("ref[0][1]", offset=0) - Chi_02.load("ref[0][2]", offset=0) - Chi_10.load("ref[1][0]", offset=0) - Chi_11.load("ref[1][1]", offset=0) - Chi_12.load("ref[1][2]", offset=0) - asmclose() - debugall('LOAD_CHI', group='Chi') - curlyclose() -newline() - - - -d['factor'] = 8 -# 12 loads = 12 issues, load latency = 8+1 cycles -# (not perfectly clear to me from docs) -d['cycles_LOAD_CHIMU'] += 11 * d['factor'] -write('// LOAD_CHIMU') -definemultiline(F'LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(base)') -if ASM_LOAD_CHIMU: - curlyopen() - #write(' const SiteSpinor & ref(in[offset]); \\') - asmopen() - pg1.loadpredication() - #fetch_base_ptr("&ref[0][0]") - #fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") - fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') - fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') - # Chimu_00.load("ref[0][0]") - # Chimu_01.load("ref[0][1]") - # Chimu_02.load("ref[0][2]") - # Chimu_10.load("ref[1][0]") - # Chimu_11.load("ref[1][1]") - # Chimu_12.load("ref[1][2]") - # Chimu_20.load("ref[2][0]") - # Chimu_21.load("ref[2][1]") - # Chimu_22.load("ref[2][2]") - # Chimu_30.load("ref[3][0]") - # Chimu_31.load("ref[3][1]") - # Chimu_32.load("ref[3][2]") - - Chimu_00.load("ref[0][0]") # minimum penalty for all directions - Chimu_30.load("ref[3][0]") - Chimu_10.load("ref[1][0]") - Chimu_20.load("ref[2][0]") - - Chimu_01.load("ref[0][1]") - Chimu_31.load("ref[3][1]") - Chimu_11.load("ref[1][1]") - Chimu_21.load("ref[2][1]") - - Chimu_02.load("ref[0][2]") - Chimu_32.load("ref[3][2]") - Chimu_12.load("ref[1][2]") - Chimu_22.load("ref[2][2]") - asmclose() - debugall('LOAD_CHIMU', group='Chimu') - curlyclose() -newline() - -# alternative load chimu: dirac order 0213 -# placed into asm (...) -d['factor'] = 0 -d['cycles_LOAD_CHIMU'] += 11 * d['factor'] -write('// LOAD_CHIMU_0213') -definemultiline(F'LOAD_CHIMU_0213_{PRECSUFFIX}') -if ASM_LOAD_CHIMU: - curlyopen() - write(' const SiteSpinor & ref(in[offset]); \\') - asmopen() - pg1.loadpredication() - fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") - Chimu_00.load("ref[0][0]") # reordered - Chimu_20.load("ref[2][0]") - - Chimu_01.load("ref[0][1]") - Chimu_21.load("ref[2][1]") - - Chimu_02.load("ref[0][2]") - Chimu_22.load("ref[2][2]") - - Chimu_10.load("ref[1][0]") - Chimu_30.load("ref[3][0]") - - Chimu_11.load("ref[1][1]") - Chimu_31.load("ref[3][1]") - - Chimu_12.load("ref[1][2]") - Chimu_32.load("ref[3][2]") - asmclose() - debugall('LOAD_CHIMU_0213', group='Chimu') - curlyclose() -newline() - -# alternative load chimu: dirac order 0312 -# placed into asm (...) -d['factor'] = 0 -d['cycles_LOAD_CHIMU'] += 11 * d['factor'] -write('// LOAD_CHIMU_0312') -definemultiline(F'LOAD_CHIMU_0312_{PRECSUFFIX}') -if ASM_LOAD_CHIMU: - curlyopen() - write(' const SiteSpinor & ref(in[offset]); \\') - asmopen() - pg1.loadpredication() - fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") - Chimu_00.load("ref[0][0]") # reordered - Chimu_30.load("ref[3][0]") - - Chimu_01.load("ref[0][1]") - Chimu_31.load("ref[3][1]") - - Chimu_02.load("ref[0][2]") - Chimu_32.load("ref[3][2]") - - Chimu_10.load("ref[1][0]") - Chimu_20.load("ref[2][0]") - - Chimu_11.load("ref[1][1]") - Chimu_21.load("ref[2][1]") - - Chimu_12.load("ref[1][2]") - Chimu_22.load("ref[2][2]") - asmclose() - debugall('LOAD_CHIMU_0312', group='Chimu') - curlyclose() -newline() - -d['factor'] = 2 -d['cycles_PERM'] += 1 * d['factor'] -write('// LOAD_TABLE0') -definemultiline(F'LOAD_TABLE0') -asmopen() -table0.loadtable(0) -asmclose() -newline() - -d['factor'] = 2 -d['cycles_PERM'] += 1 * d['factor'] -write('// LOAD_TABLE1') -definemultiline(F'LOAD_TABLE1') -asmopen() -table0.loadtable(1) -asmclose() -newline() - -d['factor'] = 2 -d['cycles_PERM'] += 1 * d['factor'] -write('// LOAD_TABLE2') -definemultiline(F'LOAD_TABLE2') -asmopen() -table0.loadtable(2) -asmclose() -newline() - -d['factor'] = 0 -d['cycles_PERM'] += 1 * d['factor'] -write('// LOAD_TABLE3') -definemultiline(F'LOAD_TABLE3') -asmopen() -table0.loadtable(3) -asmclose() -newline() - -d['factor'] = 2 # factor is 2 -d['cycles_PERM'] += 6 * d['factor'] -write('// PERMUTE') -definemultiline(F'PERMUTE_{PRECSUFFIX}') -debugall('PERM PRE', group='Chi') -asmopen() -#table0.loadtable(2) -Chi_00.permute(2, table0) -Chi_01.permute(2, table0) -Chi_02.permute(2, table0) -Chi_10.permute(2, table0) -Chi_11.permute(2, table0) -Chi_12.permute(2, table0) -asmclose() -debugall('PERM POST', group='Chi') -newline() - -write('// LOAD_GAUGE') -definemultiline(F'LOAD_GAUGE') -if GRIDBENCH: # referencing differs in Grid and GridBench - write(' const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref; \\') -else: - write(' const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \\') -curlyopen() -asmopen() -pg1.loadpredication() -fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') -if ASM_LOAD_GAUGE: - fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') - U_00.load("ref[0][0]") - U_10.load("ref[1][0]") - U_20.load("ref[2][0]") - U_01.load("ref[0][1]") - U_11.load("ref[1][1]") - U_21.load("ref[2][1]") -asmclose() -curlyclose() -newline() - -d['factor'] = 8 # MULT_2SPIN executes 1 time per direction = 8 times total -# assume all U loads are hidden -# FCMLA issue latency = 2 cycles -# measurement: latency = 16 cycles if FULLY pipelined !? -# spec says 6+6+9 cycles -# 6 rounds of FCMLA, each with 6 FCMLA -> 21 - 6*2 = 9 -d['cycles_MULT_2SPIN'] += 6 * 21 * d['factor'] -write('// MULT_2SPIN') -definemultiline(F'MULT_2SPIN_1_{PRECSUFFIX}(A)') -curlyopen() -#write(' const auto & ref(U[sU][A]); \\') -if GRIDBENCH: # referencing differs in Grid and GridBench - write(' const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref; \\') -else: - write(' const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \\') -asmopen() -#pg1.loadpredication() -#fetch_base_ptr("&ref[0][0]") -fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') -fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') -#fetch_base_ptr(F"(uint64_t)&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]", target='I') -#fetch_base_ptr(F"(uint64_t)&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]", target='A') -#fetch_base_ptr(F"&ref[0][{FETCH_BASE_PTR_COLOR_OFFSET}]") -if ASM_LOAD_GAUGE: - U_00.load("ref[0][0]") - U_10.load("ref[1][0]") - U_20.load("ref[2][0]") - U_01.load("ref[0][1]") - U_11.load("ref[1][1]") - U_21.load("ref[2][1]") - -if MOVPRFX == False: - UChi_00.zero() # implementation specific - UChi_10.zero() - UChi_01.zero() - UChi_11.zero() - UChi_02.zero() - UChi_12.zero() - - # round 1 - UChi_00.mul0(U_00, Chi_00) # FCMLA latency is 6+6+9 cycles - UChi_10.mul0(U_00, Chi_10) - UChi_01.mul0(U_10, Chi_00) - UChi_11.mul0(U_10, Chi_10) - UChi_02.mul0(U_20, Chi_00) - UChi_12.mul0(U_20, Chi_10) -else: - # round 1 - UChi_00.mul0(zero0, U_00, Chi_00, constructive=True) # FCMLA latency is 6+6+9 cycles - UChi_10.mul0(zero0, U_00, Chi_10, constructive=True) - UChi_01.mul0(zero0, U_10, Chi_00, constructive=True) - UChi_11.mul0(zero0, U_10, Chi_10, constructive=True) - UChi_02.mul0(zero0, U_20, Chi_00, constructive=True) - UChi_12.mul0(zero0, U_20, Chi_10, constructive=True) - -# round 2 -UChi_00.mul1(U_00, Chi_00) -UChi_10.mul1(U_00, Chi_10) -UChi_01.mul1(U_10, Chi_00) -UChi_11.mul1(U_10, Chi_10) -UChi_02.mul1(U_20, Chi_00) -UChi_12.mul1(U_20, Chi_10) # Chi_00 and Chi_10 available from here - -if ASM_LOAD_GAUGE: - U_00.load("ref[0][2]") # U_00, U_10, U_20 overloaded - U_10.load("ref[1][2]") # early load - U_20.load("ref[2][2]") # A --> -asmclose() -debugall('MULT_2SPIN_1', group='UChi') -curlyclose() -newline() - -write('// MULT_2SPIN_BACKEND') -definemultiline(F'MULT_2SPIN_2_{PRECSUFFIX}') -curlyopen() -asmopen() -# round 3 -UChi_00.mac0(U_01, Chi_01) # armclang separates fcmla(..., 0) and -UChi_10.mac0(U_01, Chi_11) # fcmla(..., 90) -UChi_01.mac0(U_11, Chi_01) # autonomously using intrinsics -UChi_11.mac0(U_11, Chi_11) -UChi_02.mac0(U_21, Chi_01) -UChi_12.mac0(U_21, Chi_11) -# round 4 -UChi_00.mac1(U_01, Chi_01) -UChi_10.mac1(U_01, Chi_11) -UChi_01.mac1(U_11, Chi_01) -UChi_11.mac1(U_11, Chi_11) -UChi_02.mac1(U_21, Chi_01) -UChi_12.mac1(U_21, Chi_11) -# round 5 -UChi_00.mac0(U_00, Chi_02) # <-- A -UChi_10.mac0(U_00, Chi_12) -UChi_01.mac0(U_10, Chi_02) -UChi_11.mac0(U_10, Chi_12) -UChi_02.mac0(U_20, Chi_02) -UChi_12.mac0(U_20, Chi_12) -# round 6 -UChi_00.mac1(U_00, Chi_02) -UChi_10.mac1(U_00, Chi_12) -UChi_01.mac1(U_10, Chi_02) -UChi_11.mac1(U_10, Chi_12) -UChi_02.mac1(U_20, Chi_02) -UChi_12.mac1(U_20, Chi_12) -asmclose() -debugall('MULT_2SPIN_2', group='UChi') -curlyclose() -newline() - - -#// hspin(0)=fspin(0)+timesI(fspin(3)); -#// hspin(1)=fspin(1)+timesI(fspin(2)); -d['factor'] = 1 -# FCADD issue latency = 1, latency is 6+9 -d['cycles_PROJ'] += 15 * d['factor'] -write('// XP_PROJ') -definemultiline(F'XP_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0312_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.addTimesI(Chimu_00, Chimu_30) -Chi_01.addTimesI(Chimu_01, Chimu_31) -Chi_02.addTimesI(Chimu_02, Chimu_32) -Chi_10.addTimesI(Chimu_10, Chimu_20) -Chi_11.addTimesI(Chimu_11, Chimu_21) -Chi_12.addTimesI(Chimu_12, Chimu_22) -asmclose() -debugall('XP_PROJ', group='Chi') -curlyclose() -newline() - -#// fspin(0)=hspin(0); -#// fspin(1)=hspin(1); -#// fspin(2)=timesMinusI(hspin(1)); -#// fspin(3)=timesMinusI(hspin(0)); -# does not occur in GridBench -d['factor'] = 0 -d['cycles_RECON'] += 15 * d['factor'] -write('// XP_RECON') -definemultiline(F'XP_RECON_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -if MOVPRFX == False: - result_20.zero() - result_21.zero() - result_22.zero() - result_30.zero() - result_31.zero() - result_32.zero() - - result_20.subTimesI(UChi_10) - result_21.subTimesI(UChi_11) - result_22.subTimesI(UChi_12) - result_30.subTimesI(UChi_00) - result_31.subTimesI(UChi_01) - result_32.subTimesI(UChi_02) -else: - result_20.subTimesI(zero0, UChi_10, constructive=True) - result_21.subTimesI(zero0, UChi_11, constructive=True) - result_22.subTimesI(zero0, UChi_12, constructive=True) - result_30.subTimesI(zero0, UChi_00, constructive=True) - result_31.subTimesI(zero0, UChi_01, constructive=True) - result_32.subTimesI(zero0, UChi_02, constructive=True) - -result_00.move(UChi_00) # don't reorder ! -result_01.move(UChi_01) -result_02.move(UChi_02) -result_10.move(UChi_10) -result_11.move(UChi_11) -result_12.move(UChi_12) - -# result_00.add(UChi_00) # faster than move? -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -asmclose() -debugall('XP_RECON', group='result') -newline() - - -d['factor'] = 1 -# FCADD issue latency = 1, latency is 6+9 -d['cycles_RECON'] += 15 * d['factor'] -write('// XP_RECON_ACCUM') -definemultiline(F'XP_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_20.subTimesI(UChi_10) -# result_21.subTimesI(UChi_11) -# result_22.subTimesI(UChi_12) -# result_30.subTimesI(UChi_00) -# result_31.subTimesI(UChi_01) -# result_32.subTimesI(UChi_02) -# -# result_00.add(UChi_00) # reordered -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) - -result_30.subTimesI(UChi_00) # reordered -result_00.add(UChi_00) - -result_31.subTimesI(UChi_01) -result_01.add(UChi_01) - -result_32.subTimesI(UChi_02) -result_02.add(UChi_02) - -result_20.subTimesI(UChi_10) -result_10.add(UChi_10) - -result_21.subTimesI(UChi_11) -result_11.add(UChi_11) - -result_22.subTimesI(UChi_12) -result_12.add(UChi_12) -asmclose() -debugall('XP_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -# add/sub issue latency = 1, latency is 9 -d['cycles_PROJ'] += 9 * d['factor'] -write('// YP_PROJ') -definemultiline(F'YP_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0312_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.sub(Chimu_00, Chimu_30) -Chi_01.sub(Chimu_01, Chimu_31) -Chi_02.sub(Chimu_02, Chimu_32) -Chi_10.add(Chimu_10, Chimu_20) -Chi_11.add(Chimu_11, Chimu_21) -Chi_12.add(Chimu_12, Chimu_22) -asmclose() -debugall('YP_PROJ', group='Chi') -curlyclose() -newline() - -d['factor'] = 1 -# FCADD issue latency = 1, latency is 6+9 -d['cycles_PROJ'] += 15 * d['factor'] -write('// ZP_PROJ') -definemultiline(F'ZP_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0213_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.addTimesI(Chimu_00, Chimu_20) -Chi_01.addTimesI(Chimu_01, Chimu_21) -Chi_02.addTimesI(Chimu_02, Chimu_22) -Chi_10.subTimesI(Chimu_10, Chimu_30) -Chi_11.subTimesI(Chimu_11, Chimu_31) -Chi_12.subTimesI(Chimu_12, Chimu_32) -asmclose() -debugall('ZP_PROJ', group='Chi') -curlyclose() -newline() - -d['factor'] = 1 -# add/sub issue latency = 1, latency is 9 -d['cycles_PROJ'] += 9 * d['factor'] -write('// TP_PROJ') -definemultiline(F'TP_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0213_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.add(Chimu_00, Chimu_20) -Chi_01.add(Chimu_01, Chimu_21) -Chi_02.add(Chimu_02, Chimu_22) -Chi_10.add(Chimu_10, Chimu_30) -Chi_11.add(Chimu_11, Chimu_31) -Chi_12.add(Chimu_12, Chimu_32) -asmclose() -debugall('TP_PROJ', group='Chi') -curlyclose() -newline() - -#// hspin(0)=fspin(0)-timesI(fspin(3)); -#// hspin(1)=fspin(1)-timesI(fspin(2)); - -d['factor'] = 1 -# FCADD issue latency = 1, latency is 6+9 -d['cycles_PROJ'] += 15 * d['factor'] -write('// XM_PROJ') -definemultiline(F'XM_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0312_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.subTimesI(Chimu_00, Chimu_30) -Chi_01.subTimesI(Chimu_01, Chimu_31) -Chi_02.subTimesI(Chimu_02, Chimu_32) -Chi_10.subTimesI(Chimu_10, Chimu_20) -Chi_11.subTimesI(Chimu_11, Chimu_21) -Chi_12.subTimesI(Chimu_12, Chimu_22) -asmclose() -debugall('XM_PROJ sub', group='Chi') -curlyclose() -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 15 * d['factor'] -write('// XM_RECON') -definemultiline(F'XM_RECON_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() - -# only necessary if not zeroed before -if MOVPRFX == False: - result_20.zero() - result_21.zero() - result_22.zero() - result_30.zero() - result_31.zero() - result_32.zero() - - result_20.addTimesI(UChi_10) # <-- - result_21.addTimesI(UChi_11) - result_22.addTimesI(UChi_12) - result_30.addTimesI(UChi_00) - result_31.addTimesI(UChi_01) - result_32.addTimesI(UChi_02) -else: - result_20.addTimesI(zero0, UChi_10, constructive=True) # <-- - result_21.addTimesI(zero0, UChi_11, constructive=True) - result_22.addTimesI(zero0, UChi_12, constructive=True) - result_30.addTimesI(zero0, UChi_00, constructive=True) - result_31.addTimesI(zero0, UChi_01, constructive=True) - result_32.addTimesI(zero0, UChi_02, constructive=True) - -result_00.move(UChi_00) -result_01.move(UChi_01) -result_02.move(UChi_02) -result_10.move(UChi_10) -result_11.move(UChi_11) -result_12.move(UChi_12) -asmclose() -debugall('XM_RECON result', group='result') -newline() - -d['factor'] = 1 -# add/sub issue latency = 1, latency is 9 -d['cycles_PROJ'] += 9 * d['factor'] -write('// YM_PROJ') -definemultiline(F'YM_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0312_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.add(Chimu_00, Chimu_30) -Chi_01.add(Chimu_01, Chimu_31) -Chi_02.add(Chimu_02, Chimu_32) -Chi_10.sub(Chimu_10, Chimu_20) -Chi_11.sub(Chimu_11, Chimu_21) -Chi_12.sub(Chimu_12, Chimu_22) -asmclose() -debugall('YM_PROJ', group='Chi') -curlyclose() -newline() - -d['factor'] = 1 -# FCADD issue latency = 1, latency is 6+9 -d['cycles_PROJ'] += 15 * d['factor'] -write('// ZM_PROJ') -definemultiline(F'ZM_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0213_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.subTimesI(Chimu_00, Chimu_20) -Chi_01.subTimesI(Chimu_01, Chimu_21) -Chi_02.subTimesI(Chimu_02, Chimu_22) -Chi_10.addTimesI(Chimu_10, Chimu_30) -Chi_11.addTimesI(Chimu_11, Chimu_31) -Chi_12.addTimesI(Chimu_12, Chimu_32) -asmclose() -debugall('ZM_PROJ', group='Chi') -curlyclose() -newline() - -d['factor'] = 1 -# add/sub issue latency = 1, latency is 9 -d['cycles_PROJ'] += 9 * d['factor'] -write('// TM_PROJ') -definemultiline(F'TM_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0213_PLUG \\') -curlyopen() -asmopen() -pg1.loadpredication() -Chi_00.sub(Chimu_00, Chimu_20) -Chi_01.sub(Chimu_01, Chimu_21) -Chi_02.sub(Chimu_02, Chimu_22) -Chi_10.sub(Chimu_10, Chimu_30) -Chi_11.sub(Chimu_11, Chimu_31) -Chi_12.sub(Chimu_12, Chimu_32) -asmclose() -debugall('TM_PROJ', group='Chi') -curlyclose() -newline() - -# does not occur in GridBench -d['factor'] = 0 -# add/sub issue latency = 1, latency is 9 -d['cycles_RECON'] += 15 * d['factor'] -write('// XM_RECON_ACCUM') -definemultiline(F'XM_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -# result_20.addTimesI(UChi_10) -# result_21.addTimesI(UChi_11) -# result_22.addTimesI(UChi_12) -# result_30.addTimesI(UChi_00) -# result_31.addTimesI(UChi_01) -# result_32.addTimesI(UChi_02) -# -# # result_00.move(UChi_00) -# # result_01.move(UChi_01) -# # result_02.move(UChi_02) -# # result_10.move(UChi_10) -# # result_11.move(UChi_11) -# # result_12.move(UChi_12) -# -# # faster than move ? -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) - -result_30.addTimesI(UChi_00) # reordered -result_31.addTimesI(UChi_01) -result_32.addTimesI(UChi_02) - -result_20.addTimesI(UChi_10) -result_21.addTimesI(UChi_11) -result_22.addTimesI(UChi_12) - -result_00.add(UChi_00) -result_01.add(UChi_01) -result_02.add(UChi_02) -result_10.add(UChi_10) -result_11.add(UChi_11) -result_12.add(UChi_12) -asmclose() -debugall('XM_RECON_ACCUM', group='result') -newline() - - - -d['factor'] = 1 -d['cycles_RECON'] += 9 * d['factor'] -write('// YP_RECON_ACCUM') -definemultiline(F'YP_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -# result_20.add(UChi_10) -# result_21.add(UChi_11) -# result_22.add(UChi_12) -# result_30.sub(UChi_00) -# result_31.sub(UChi_01) -# result_32.sub(UChi_02) - -result_00.add(UChi_00) # reordered -result_30.sub(UChi_00) - -result_01.add(UChi_01) -result_31.sub(UChi_01) - -result_02.add(UChi_02) -result_32.sub(UChi_02) - -result_10.add(UChi_10) -result_20.add(UChi_10) - -result_11.add(UChi_11) -result_21.add(UChi_11) - -result_12.add(UChi_12) -result_22.add(UChi_12) -asmclose() -debugall('YP_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 9 * d['factor'] -write('// YM_RECON_ACCUM') -definemultiline(F'YM_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -# result_20.sub(UChi_10) -# result_21.sub(UChi_11) -# result_22.sub(UChi_12) -# result_30.add(UChi_00) -# result_31.add(UChi_01) -# result_32.add(UChi_02) - -result_00.add(UChi_00) # reordered -result_30.add(UChi_00) - -result_01.add(UChi_01) -result_31.add(UChi_01) - -result_02.add(UChi_02) -result_32.add(UChi_02) - -result_10.add(UChi_10) -result_20.sub(UChi_10) - -result_11.add(UChi_11) -result_21.sub(UChi_11) - -result_12.add(UChi_12) -result_22.sub(UChi_12) -asmclose() -debugall('YM_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 15 * d['factor'] -write('// ZP_RECON_ACCUM') -definemultiline(F'ZP_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_20.subTimesI(UChi_00) -# result_21.subTimesI(UChi_01) -# result_22.subTimesI(UChi_02) -# result_30.addTimesI(UChi_10) -# result_31.addTimesI(UChi_11) -# result_32.addTimesI(UChi_12) -# -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -result_20.subTimesI(UChi_00) # reordered -result_00.add(UChi_00) - -result_21.subTimesI(UChi_01) -result_01.add(UChi_01) - -result_22.subTimesI(UChi_02) -result_02.add(UChi_02) - -result_30.addTimesI(UChi_10) -result_10.add(UChi_10) - -result_31.addTimesI(UChi_11) -result_11.add(UChi_11) - -result_32.addTimesI(UChi_12) -result_12.add(UChi_12) -asmclose() -debugall('ZP_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 15 * d['factor'] -write('// ZM_RECON_ACCUM') -definemultiline(F'ZM_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_20.addTimesI(UChi_00) -# result_21.addTimesI(UChi_01) -# result_22.addTimesI(UChi_02) -# result_30.subTimesI(UChi_10) -# result_31.subTimesI(UChi_11) -# result_32.subTimesI(UChi_12) -# -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -result_20.addTimesI(UChi_00) # reordered -result_00.add(UChi_00) - -result_21.addTimesI(UChi_01) -result_01.add(UChi_01) - -result_22.addTimesI(UChi_02) -result_02.add(UChi_02) - -result_30.subTimesI(UChi_10) -result_10.add(UChi_10) - -result_31.subTimesI(UChi_11) -result_11.add(UChi_11) - -result_32.subTimesI(UChi_12) -result_12.add(UChi_12) -asmclose() -debugall('ZM_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 9 * d['factor'] -write('// TP_RECON_ACCUM') -definemultiline(F'TP_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -# result_20.add(UChi_00) -# result_21.add(UChi_01) -# result_22.add(UChi_02) -# result_30.add(UChi_10) -# result_31.add(UChi_11) -# result_32.add(UChi_12) - -result_00.add(UChi_00) # reordered -result_20.add(UChi_00) - -result_01.add(UChi_01) -result_21.add(UChi_01) - -result_02.add(UChi_02) -result_22.add(UChi_02) - -result_10.add(UChi_10) -result_30.add(UChi_10) - -result_11.add(UChi_11) -result_31.add(UChi_11) - -result_12.add(UChi_12) -result_32.add(UChi_12) -asmclose() -debugall('TP_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 9 * d['factor'] -write('// TM_RECON_ACCUM') -definemultiline(F'TM_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -# result_20.sub(UChi_00) -# result_21.sub(UChi_01) -# result_22.sub(UChi_02) -# result_30.sub(UChi_10) -# result_31.sub(UChi_11) -# result_32.sub(UChi_12) - -result_00.add(UChi_00) # reordered -result_20.sub(UChi_00) - -result_01.add(UChi_01) -result_21.sub(UChi_01) - -result_02.add(UChi_02) -result_22.sub(UChi_02) - -result_10.add(UChi_10) -result_30.sub(UChi_10) - -result_11.add(UChi_11) -result_31.sub(UChi_11) - -result_12.add(UChi_12) -result_32.sub(UChi_12) -asmclose() -debugall('TM_RECON_ACCUM', group='result') -newline() - -d['factor'] = 0 -# have 12 instructions -# picking dual issue versions -d['cycles_ZERO_PSI'] += 6 * d['factor'] -write('// ZERO_PSI') -definemultiline(F'ZERO_PSI_{PRECSUFFIX}') -asmopen() -pg1.loadpredication() -result_00.zero() -result_01.zero() -result_02.zero() -result_10.zero() -result_11.zero() -result_12.zero() -result_20.zero() -result_21.zero() -result_22.zero() -result_30.zero() -result_31.zero() -result_32.zero() -asmclose() -#debugall('ZERO_PSI', group='result') -newline() - -# prefetch store spinors to L2 cache -d['factor'] = 0 -d['cycles_PREFETCH_L2'] += 0 * d['factor'] -write('// PREFETCH_RESULT_L2_STORE (prefetch store to L2)') -definemultiline(F'PREFETCH_RESULT_L2_STORE_INTERNAL_{PRECSUFFIX}(base)') -curlyopen() -fetch_base_ptr(F"base") -asmopen() -fetch_base_ptr(F"base", target='A') -prefetch_L2_store(F"base", 0) -prefetch_L2_store(F"base", 1) -prefetch_L2_store(F"base", 2) -asmclose() -curlyclose() -newline() - -# prefetch store spinors to L1 cache -d['factor'] = 0 -d['cycles_PREFETCH_L1'] += 0 * d['factor'] -write('// PREFETCH_RESULT_L1_STORE (prefetch store to L1)') -definemultiline(F'PREFETCH_RESULT_L1_STORE_INTERNAL_{PRECSUFFIX}(base)') -curlyopen() -fetch_base_ptr(F"base") -asmopen() -fetch_base_ptr(F"base", target='A') -prefetch_L1_store(F"base", 0) -prefetch_L1_store(F"base", 1) -prefetch_L1_store(F"base", 2) -asmclose() -curlyclose() -newline() - - -d['factor'] = 0 -write('// ADD_RESULT_INTERNAL') -definemultiline(F'ADD_RESULT_INTERNAL_{PRECSUFFIX}') -asmopen() -result_00.add(Chimu_00) -result_01.add(Chimu_01) -result_02.add(Chimu_02) -result_10.add(Chimu_10) -result_11.add(Chimu_11) -result_12.add(Chimu_12) -result_20.add(Chimu_20) -result_21.add(Chimu_21) -result_22.add(Chimu_22) -result_30.add(Chimu_30) -result_31.add(Chimu_31) -result_32.add(Chimu_32) -asmclose() -#debugall('ZERO_PSI', group='result') -newline() - -# -------------------------------------------------------------------------------- - -# C -f = open('w.h', 'w') -f.write(d['C']) -f.close() - -# intrin -f = open('wi.h', 'w') -f.write(d['I']) -f.close() - -filename = '' -if PRECISION == 'double': - filename = "Fujitsu_A64FX_intrin_double.h" -else: - filename = "Fujitsu_A64FX_intrin_single.h" -f = open(filename, 'w') -f.write(LEGAL.format(filename)) -f.write(d['I']) -f.close() - - -# asm -f = open('wa.h', 'w') -f.write(d['A']) -f.close() - -filename = '' -if PRECISION == 'double': - filename = "Fujitsu_A64FX_asm_double.h" -else: - filename = "Fujitsu_A64FX_asm_single.h" -f = open(filename, 'w') -f.write(LEGAL.format(filename)) -f.write(d['A']) -f.close() - - -# arithmetics instruction count, mul/mac = 2 instructions each -d['acount'] = d['add'] + d['sub'] + \ - d['mul'] + d['mac'] + d['addTimesI'] + d['subTimesI'] - -# permutations -d['permutes'] += 2*d['timesI'] + 1*d['timesMinusI'] -d['neg'] = 1*d['timesI'] + 1*d['timesMinusI'] - -# instruction count, mul/mac = 2 instructions each, +/- *i = 3 instructions each -d['icount'] = d['load'] + d['store'] + d['move'] + d['add'] + d['sub'] + \ - d['mul'] + d['mac'] + d['permutes'] + d['neg'] + \ - d['addTimesI'] + d['subTimesI'] + d['zero'] + d['movprfx'] - -# flops -d['flops'] = 4*d['mac'] + 3*d['mul'] + d['add'] + d['sub'] + \ - d['addTimesI'] + d['subTimesI'] - - - - - -print('Statistics') -print('') -print('Type Occurences Total / Arith instructions') -print('-------------------------------------------------------------------') -print('Variables {:4d}'.format(d['registers'])) -print('') -print('load {:4d}'.format(d['load'])) -print('store {:4d}'.format(d['store'])) -print('move {:4d}'.format(d['move'])) -print('movprfx {:4d}'.format(d['movprfx'])) -print('zero {:4d}'.format(d['zero'])) -print('negate {:4d}'.format(d['neg'])) - - -print('add {:4d} {:0.2f} / {:0.2f}'.\ - format(d['add'], d['add'] / d['icount'], d['add'] / d['acount'])) -print('sub {:4d} {:0.2f} / {:0.2f}'.\ - format(d['sub'], d['sub'] / d['icount'], d['sub'] / d['acount'])) -print('mul {:4d} {:0.2f} / {:0.2f}'.\ - format(d['mul'], 2*d['mul'] / d['icount'], 2*d['mul'] / d['acount'])) -print('mac {:4d} {:0.2f} / {:0.2f}'.\ - format(d['mac'], 2*d['mac'] / d['icount'], 2*d['mac'] / d['acount'])) -print('addTimesI {:4d} {:0.2f} / {:0.2f}'.\ - format(d['addTimesI'], 2*d['addTimesI'] / d['icount'], 2*d['addTimesI'] / d['acount'])) -print('subTimesI {:4d} {:0.2f} / {:0.2f}'.\ - format(d['subTimesI'], 2*d['subTimesI'] / d['icount'], 2*d['subTimesI'] / d['acount'])) - -print('timesI {:4d}'.format(d['timesI'])) -print('timesMinusI {:4d}'.format(d['timesMinusI'])) -print('permutes {:4d} {:0.2f}'.\ - format(d['permutes'], d['permutes'] / d['icount'])) -print('') -print('flops {:4d}'.format(d['flops'])) -print('instruction count {:4d}'.format(d['icount'])) -print('arith. instruction count {:4d} {:0.2f}'.\ - format(d['acount'], d['acount'] / d['icount'])) - - -# ---- static pipeline resources consumption ---- -FLA = 0 -FLA += 2 * d['mac'] + 2 * d['mul'] -FLA += 1 * d['addTimesI'] + 1 * d['subTimesI'] -FLA += 1 * d['move'] -FLA += 1 * d['permutes'] -FLA += 1 * d['store'] -FLA += 1 * d['zero'] - -FLB = 0 -FLB += 1 * d['addTimesI'] + 1 * d['subTimesI'] - -FLAB = 0 -FLAB += 1 * d['mac'] + 1 * d['mul'] -FLAB += 1 * d['add'] + 1 * d['sub'] -FLAB += 1 * d['neg'] + 1 * d['movprfx'] -#FLAB += 1 * d['zero'] - - -FL_slots = 2 * d['icount'] -FL_micro_ops = FLA + FLB + FLAB - -print('') -print('------------------------------------------------------------------') -print('') -print('Static FL slot usage') -print('') -print(' FLA {:4d}'.format(FLA)) -print(' FLB {:4d}'.format(FLB)) -print(' FLA/B {:4d}'.format(FLAB)) - -print('') -print('Static FL slot efficiency') -print('') -print(' Total FL slots {:4d}'.format(FL_slots)) -print(' FL slots occupied {:4d}'.format(FL_micro_ops)) -print(' FL slot efficiency {:0.2f}'.format(FL_micro_ops / FL_slots)) - -cycles_total = d['cycles_ZERO_PSI'] + d['cycles_LOAD_CHIMU'] + \ - d['cycles_PROJ'] + d['cycles_PERM'] + d['cycles_MULT_2SPIN'] + \ - d['cycles_RECON'] + d['cycles_RESULT'] -cycles_total_hidden = d['cycles_ZERO_PSI'] + \ - d['cycles_PROJ'] + d['cycles_MULT_2SPIN'] + \ - d['cycles_RECON'] - -# ---- dynamic estimate ---- - -print('') -print('Dynamic cycles estimate (incl. latencies)') -print('') -print(' ZERO_PSI {:4d}'.format(d['cycles_ZERO_PSI'])) -print(' LOAD_CHIMU {:4d}'.format(d['cycles_LOAD_CHIMU'])) -print(' PROJ {:4d}'.format(d['cycles_PROJ'])) -print(' PERM {:4d}'.format(d['cycles_PERM'])) -print(' MULT_2SPIN {:4d}'.format(d['cycles_MULT_2SPIN'])) -print(' RECON {:4d}'.format(d['cycles_RECON'])) -print(' STORE {:4d}'.format(d['cycles_RESULT'])) -print('') -print(' Sum {:4d}'.format(cycles_total)) -print('') -print(' Sum* {:4d}'.format(cycles_total_hidden)) -print(' Total FL slots* {:4d}'.format(cycles_total_hidden * 2)) -print(' FL slots occupied* {:4d}'.format(FL_micro_ops)) -print(' FL slot efficiency* {:0.2f}'.format(FL_micro_ops / (2*cycles_total_hidden))) -print('') -print(' *load/store/PERM hidden') - -estimated_cycles = cycles_total_hidden -# Estimate percent peak DP; dual issue, fma -pp = 100 * 4 * d['flops'] / (2*2*8*estimated_cycles) -print('') -print('Model prediction') -print('') -print(' Cycles* {:4d}'.format(estimated_cycles)) -print(' Percent peak* {:4.1f} %'.format(pp)) - -# estimated RF throughput in GB/s @ 2.2 GHz -tp10 = (d['load'] + d['store']) * 64 * 2.2 / estimated_cycles -tp2 = (d['load'] + d['store']) * 64 * 1000.**3 * 2.2 / 1024.**3 / estimated_cycles -print('') -print(' Estimated RF throughput* {:4.1f} GB/s'.\ - format(tp10)) -print(' Estimated RF throughput* {:4.1f} GiB/s'.\ - format(tp2)) - -# ---- dynamic pipeline resources consumption ---- - -runtime = measured_cycles # runtime in cycles -pp_runtime = 100 * 4 * d['flops'] / (2*2*8*runtime) -runtime_FL_slots = 2 * runtime -delta = runtime - estimated_cycles - - -print('') -print('------------------------------------------------------------------') -print('') -print('Dynamic runtime analysis (cycles from measurements)') -print('') -print(' Cycles {:4d}'.format(runtime)) -print(' Percent peak {:4.1f} %'.format(pp_runtime)) -print(' Deviation from estimate {:4d} {:4.2f} %'.\ - format(delta, 100. * abs(delta/runtime))) -print(' Deviation per direction {:4.1f}'.format(delta/8)) - -# estimated RF throughput in GB/s @ 2.2 GHz -tp10_rt = (d['load'] + d['store']) * 64 * 2.2 / runtime -tp2_rt = (d['load'] + d['store']) * 64 * 1000.**3 * 2.2 / 1024.**3 / runtime -print('') -print(' RF throughput {:4.1f} GB/s'.\ - format(tp10_rt)) -print(' RF throughput {:4.1f} GiB/s'.\ - format(tp2_rt)) -print('') -print(' Total FL slots {:4d}'.format(runtime_FL_slots)) -print(' FL slots occupied {:4d}'.format(FL_micro_ops)) -print(' FL slot efficiency {:0.2f}'.format(FL_micro_ops / runtime_FL_slots)) -print('') From 909acd55cd36c4b567cab30d311aab6b8674288d Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Sat, 19 Dec 2020 02:00:22 +0100 Subject: [PATCH 2/5] vnum variant for prefetches --- Grid/simd/Fujitsu_A64FX_intrin_double.h | 36 ++++++++++++------------- Grid/simd/Fujitsu_A64FX_intrin_single.h | 36 ++++++++++++------------- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h index 361246fc..f195e3c5 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_double.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -144,38 +144,38 @@ Author: Nils Meyer // PREFETCH_CHIMU_L2 (prefetch to L2) #define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)0), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)8), SV_PLDL2STRM); \ } // PREFETCH_CHIMU_L1 (prefetch to L1) #define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)0), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)4), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)8), SV_PLDL1STRM); \ } // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ { \ const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ - svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)-4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)0), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)8), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)12), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)16), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)20), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)24), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)28), SV_PLDL2STRM); \ } // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ { \ const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ - svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)0), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)4), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)8), SV_PLDL1STRM); \ } // LOAD_CHI #define LOAD_CHI_A64FXd(base) \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h index 30273b6e..0b874f02 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_single.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -144,38 +144,38 @@ Author: Nils Meyer // PREFETCH_CHIMU_L2 (prefetch to L2) #define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL2STRM); \ } // PREFETCH_CHIMU_L1 (prefetch to L1) #define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL1STRM); \ } // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \ { \ const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ - svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(-4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(12), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(16), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(20), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(24), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(28), SV_PLDL2STRM); \ } // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \ { \ const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ - svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL1STRM); \ } // LOAD_CHI #define LOAD_CHI_A64FXf(base) \ From 4b882e8056b2c9dd6dceab2729104e5e615835ae Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Sat, 19 Dec 2020 03:09:20 +0100 Subject: [PATCH 3/5] fixed lost bracket --- Grid/simd/Fujitsu_A64FX_intrin_double.h | 36 ++++++++++++------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h index f195e3c5..b645c365 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_double.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -144,38 +144,38 @@ Author: Nils Meyer // PREFETCH_CHIMU_L2 (prefetch to L2) #define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ { \ - svprfd_vnum(pg1, (void*)(base), (int64_t)0), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(base), (int64_t)4), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(base), (int64_t)8), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL2STRM); \ } // PREFETCH_CHIMU_L1 (prefetch to L1) #define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \ { \ - svprfd_vnum(pg1, (void*)(base), (int64_t)0), SV_PLDL1STRM); \ - svprfd_vnum(pg1, (void*)(base), (int64_t)4), SV_PLDL1STRM); \ - svprfd_vnum(pg1, (void*)(base), (int64_t)8), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL1STRM); \ } // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ { \ const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)-4), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)0), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)4), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)8), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)12), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)16), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)20), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)24), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)28), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(-4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(12), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(16), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(20), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(24), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(28), SV_PLDL2STRM); \ } // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ { \ const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)0), SV_PLDL1STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)4), SV_PLDL1STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)8), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL1STRM); \ } // LOAD_CHI #define LOAD_CHI_A64FXd(base) \ From 6013183361d88fe7179b4fcf6b8321c0621b09ba Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Sat, 19 Dec 2020 03:25:01 +0100 Subject: [PATCH 4/5] removed Asm impls --- Grid/simd/Fujitsu_A64FX_asm_double.h | 781 --------------------------- Grid/simd/Fujitsu_A64FX_asm_single.h | 781 --------------------------- 2 files changed, 1562 deletions(-) delete mode 100644 Grid/simd/Fujitsu_A64FX_asm_double.h delete mode 100644 Grid/simd/Fujitsu_A64FX_asm_single.h diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h deleted file mode 100644 index bbc4efe7..00000000 --- a/Grid/simd/Fujitsu_A64FX_asm_double.h +++ /dev/null @@ -1,781 +0,0 @@ -/************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: Fujitsu_A64FX_asm_double.h - - Copyright (C) 2020 - -Author: Nils Meyer - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory -*************************************************************************************/ -/* END LEGAL */ -#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXd(base) -#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A) -#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) -#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A) -#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) -#define PF_GAUGE(A) -#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(A) -#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(A) -#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) -#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) -#define LOCK_GAUGE(A) -#define UNLOCK_GAUGE(A) -#define MASK_REGS DECLARATIONS_A64FXd -#define SAVE_RESULT(A,B) RESULT_A64FXd(A); -#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir) -#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd -#define LOAD_CHI(base) LOAD_CHI_A64FXd(base) -#define ZERO_PSI ZERO_PSI_A64FXd -#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) -#define XP_PROJ XP_PROJ_A64FXd -#define YP_PROJ YP_PROJ_A64FXd -#define ZP_PROJ ZP_PROJ_A64FXd -#define TP_PROJ TP_PROJ_A64FXd -#define XM_PROJ XM_PROJ_A64FXd -#define YM_PROJ YM_PROJ_A64FXd -#define ZM_PROJ ZM_PROJ_A64FXd -#define TM_PROJ TM_PROJ_A64FXd -#define XP_RECON XP_RECON_A64FXd -#define XM_RECON XM_RECON_A64FXd -#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd -#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd -#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd -#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd -#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXd -#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd -#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd -#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd -#define PERMUTE_DIR0 0 -#define PERMUTE_DIR1 1 -#define PERMUTE_DIR2 2 -#define PERMUTE_DIR3 3 -#define PERMUTE PERMUTE_A64FXd; -#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; } -#define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } } -// DECLARATIONS -#define DECLARATIONS_A64FXd \ - uint64_t baseU; \ - const uint64_t lut[4][8] = { \ - {4, 5, 6, 7, 0, 1, 2, 3}, \ - {2, 3, 0, 1, 6, 7, 4, 5}, \ - {1, 0, 3, 2, 5, 4, 7, 6}, \ - {0, 1, 2, 4, 5, 6, 7, 8} };\ -asm ( \ - "ptrue p5.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -asm ( \ - "fmov z31.d , 0 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// RESULT -#define RESULT_A64FXd(base) \ -{ \ -asm ( \ - "str z0, [%[storeptr], -6, mul vl] \n\t" \ - "str z1, [%[storeptr], -5, mul vl] \n\t" \ - "str z2, [%[storeptr], -4, mul vl] \n\t" \ - "str z3, [%[storeptr], -3, mul vl] \n\t" \ - "str z4, [%[storeptr], -2, mul vl] \n\t" \ - "str z5, [%[storeptr], -1, mul vl] \n\t" \ - "str z6, [%[storeptr], 0, mul vl] \n\t" \ - "str z7, [%[storeptr], 1, mul vl] \n\t" \ - "str z8, [%[storeptr], 2, mul vl] \n\t" \ - "str z9, [%[storeptr], 3, mul vl] \n\t" \ - "str z10, [%[storeptr], 4, mul vl] \n\t" \ - "str z11, [%[storeptr], 5, mul vl] \n\t" \ - : \ - : [storeptr] "r" (base + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_CHIMU_L2 (prefetch to L2) -#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ -{ \ -asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_CHIMU_L1 (prefetch to L1) -#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \ -{ \ -asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_GAUGE_L2 (prefetch to L2) -#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ -{ \ - const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ -asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_GAUGE_L1 (prefetch to L1) -#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ -{ \ - const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ -asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHI -#define LOAD_CHI_A64FXd(base) \ -{ \ -asm ( \ - "ld1d { z12.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z13.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z14.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - "ld1d { z15.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z16.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z17.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU -#define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \ -{ \ -asm ( \ - "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z21.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z18.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z22.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z19.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z23.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z20.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU_0213 -#define LOAD_CHIMU_0213_A64FXd \ -{ \ - const SiteSpinor & ref(in[offset]); \ -asm ( \ - "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z18.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z19.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z20.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z21.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z22.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z23.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (&ref[2][0]) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU_0312 -#define LOAD_CHIMU_0312_A64FXd \ -{ \ - const SiteSpinor & ref(in[offset]); \ -asm ( \ - "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z21.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z22.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z23.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z18.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z19.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z20.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (&ref[2][0]) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_TABLE0 -#define LOAD_TABLE0 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (0) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE1 -#define LOAD_TABLE1 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (1) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE2 -#define LOAD_TABLE2 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (2) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE3 -#define LOAD_TABLE3 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (3) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PERMUTE -#define PERMUTE_A64FXd \ -asm ( \ - "tbl z12.d, { z12.d }, z30.d \n\t" \ - "tbl z13.d, { z13.d }, z30.d \n\t" \ - "tbl z14.d, { z14.d }, z30.d \n\t" \ - "tbl z15.d, { z15.d }, z30.d \n\t" \ - "tbl z16.d, { z16.d }, z30.d \n\t" \ - "tbl z17.d, { z17.d }, z30.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_GAUGE -#define LOAD_GAUGE(A) \ -{ \ - const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ -asm ( \ - "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z27.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// MULT_2SPIN -#define MULT_2SPIN_1_A64FXd(A) \ -{ \ - const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ -asm ( \ - "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z27.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "movprfx z18.d, p5/m, z31.d \n\t" \ - "fcmla z18.d, p5/m, z24.d, z12.d, 0 \n\t" \ - "movprfx z21.d, p5/m, z31.d \n\t" \ - "fcmla z21.d, p5/m, z24.d, z15.d, 0 \n\t" \ - "movprfx z19.d, p5/m, z31.d \n\t" \ - "fcmla z19.d, p5/m, z25.d, z12.d, 0 \n\t" \ - "movprfx z22.d, p5/m, z31.d \n\t" \ - "fcmla z22.d, p5/m, z25.d, z15.d, 0 \n\t" \ - "movprfx z20.d, p5/m, z31.d \n\t" \ - "fcmla z20.d, p5/m, z26.d, z12.d, 0 \n\t" \ - "movprfx z23.d, p5/m, z31.d \n\t" \ - "fcmla z23.d, p5/m, z26.d, z15.d, 0 \n\t" \ - "fcmla z18.d, p5/m, z24.d, z12.d, 90 \n\t" \ - "fcmla z21.d, p5/m, z24.d, z15.d, 90 \n\t" \ - "fcmla z19.d, p5/m, z25.d, z12.d, 90 \n\t" \ - "fcmla z22.d, p5/m, z25.d, z15.d, 90 \n\t" \ - "fcmla z20.d, p5/m, z26.d, z12.d, 90 \n\t" \ - "fcmla z23.d, p5/m, z26.d, z15.d, 90 \n\t" \ - "ld1d { z24.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z25.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// MULT_2SPIN_BACKEND -#define MULT_2SPIN_2_A64FXd \ -{ \ -asm ( \ - "fcmla z18.d, p5/m, z27.d, z13.d, 0 \n\t" \ - "fcmla z21.d, p5/m, z27.d, z16.d, 0 \n\t" \ - "fcmla z19.d, p5/m, z28.d, z13.d, 0 \n\t" \ - "fcmla z22.d, p5/m, z28.d, z16.d, 0 \n\t" \ - "fcmla z20.d, p5/m, z29.d, z13.d, 0 \n\t" \ - "fcmla z23.d, p5/m, z29.d, z16.d, 0 \n\t" \ - "fcmla z18.d, p5/m, z27.d, z13.d, 90 \n\t" \ - "fcmla z21.d, p5/m, z27.d, z16.d, 90 \n\t" \ - "fcmla z19.d, p5/m, z28.d, z13.d, 90 \n\t" \ - "fcmla z22.d, p5/m, z28.d, z16.d, 90 \n\t" \ - "fcmla z20.d, p5/m, z29.d, z13.d, 90 \n\t" \ - "fcmla z23.d, p5/m, z29.d, z16.d, 90 \n\t" \ - "fcmla z18.d, p5/m, z24.d, z14.d, 0 \n\t" \ - "fcmla z21.d, p5/m, z24.d, z17.d, 0 \n\t" \ - "fcmla z19.d, p5/m, z25.d, z14.d, 0 \n\t" \ - "fcmla z22.d, p5/m, z25.d, z17.d, 0 \n\t" \ - "fcmla z20.d, p5/m, z26.d, z14.d, 0 \n\t" \ - "fcmla z23.d, p5/m, z26.d, z17.d, 0 \n\t" \ - "fcmla z18.d, p5/m, z24.d, z14.d, 90 \n\t" \ - "fcmla z21.d, p5/m, z24.d, z17.d, 90 \n\t" \ - "fcmla z19.d, p5/m, z25.d, z14.d, 90 \n\t" \ - "fcmla z22.d, p5/m, z25.d, z17.d, 90 \n\t" \ - "fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \ - "fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XP_PROJ -#define XP_PROJ_A64FXd \ -{ \ -asm ( \ - "fcadd z12.d, p5/m, z12.d, z21.d, 90 \n\t" \ - "fcadd z13.d, p5/m, z13.d, z22.d, 90 \n\t" \ - "fcadd z14.d, p5/m, z14.d, z23.d, 90 \n\t" \ - "fcadd z15.d, p5/m, z15.d, z18.d, 90 \n\t" \ - "fcadd z16.d, p5/m, z16.d, z19.d, 90 \n\t" \ - "fcadd z17.d, p5/m, z17.d, z20.d, 90 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XP_RECON -#define XP_RECON_A64FXd \ -asm ( \ - "movprfx z6.d, p5/m, z31.d \n\t" \ - "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \ - "movprfx z7.d, p5/m, z31.d \n\t" \ - "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \ - "movprfx z8.d, p5/m, z31.d \n\t" \ - "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \ - "movprfx z9.d, p5/m, z31.d \n\t" \ - "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \ - "movprfx z10.d, p5/m, z31.d \n\t" \ - "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \ - "movprfx z11.d, p5/m, z31.d \n\t" \ - "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \ - "mov z0.d, p5/m, z18.d \n\t" \ - "mov z1.d, p5/m, z19.d \n\t" \ - "mov z2.d, p5/m, z20.d \n\t" \ - "mov z3.d, p5/m, z21.d \n\t" \ - "mov z4.d, p5/m, z22.d \n\t" \ - "mov z5.d, p5/m, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// XP_RECON_ACCUM -#define XP_RECON_ACCUM_A64FXd \ -asm ( \ - "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YP_PROJ -#define YP_PROJ_A64FXd \ -{ \ -asm ( \ - "fsub z12.d, p5/m, z12.d, z21.d \n\t" \ - "fsub z13.d, p5/m, z13.d, z22.d \n\t" \ - "fsub z14.d, p5/m, z14.d, z23.d \n\t" \ - "fadd z15.d, p5/m, z15.d, z18.d \n\t" \ - "fadd z16.d, p5/m, z16.d, z19.d \n\t" \ - "fadd z17.d, p5/m, z17.d, z20.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// ZP_PROJ -#define ZP_PROJ_A64FXd \ -{ \ -asm ( \ - "fcadd z12.d, p5/m, z12.d, z18.d, 90 \n\t" \ - "fcadd z13.d, p5/m, z13.d, z19.d, 90 \n\t" \ - "fcadd z14.d, p5/m, z14.d, z20.d, 90 \n\t" \ - "fcadd z15.d, p5/m, z15.d, z21.d, 270 \n\t" \ - "fcadd z16.d, p5/m, z16.d, z22.d, 270 \n\t" \ - "fcadd z17.d, p5/m, z17.d, z23.d, 270 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// TP_PROJ -#define TP_PROJ_A64FXd \ -{ \ -asm ( \ - "fadd z12.d, p5/m, z12.d, z18.d \n\t" \ - "fadd z13.d, p5/m, z13.d, z19.d \n\t" \ - "fadd z14.d, p5/m, z14.d, z20.d \n\t" \ - "fadd z15.d, p5/m, z15.d, z21.d \n\t" \ - "fadd z16.d, p5/m, z16.d, z22.d \n\t" \ - "fadd z17.d, p5/m, z17.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_PROJ -#define XM_PROJ_A64FXd \ -{ \ -asm ( \ - "fcadd z12.d, p5/m, z12.d, z21.d, 270 \n\t" \ - "fcadd z13.d, p5/m, z13.d, z22.d, 270 \n\t" \ - "fcadd z14.d, p5/m, z14.d, z23.d, 270 \n\t" \ - "fcadd z15.d, p5/m, z15.d, z18.d, 270 \n\t" \ - "fcadd z16.d, p5/m, z16.d, z19.d, 270 \n\t" \ - "fcadd z17.d, p5/m, z17.d, z20.d, 270 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_RECON -#define XM_RECON_A64FXd \ -asm ( \ - "movprfx z6.d, p5/m, z31.d \n\t" \ - "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \ - "movprfx z7.d, p5/m, z31.d \n\t" \ - "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \ - "movprfx z8.d, p5/m, z31.d \n\t" \ - "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \ - "movprfx z9.d, p5/m, z31.d \n\t" \ - "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \ - "movprfx z10.d, p5/m, z31.d \n\t" \ - "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \ - "movprfx z11.d, p5/m, z31.d \n\t" \ - "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \ - "mov z0.d, p5/m, z18.d \n\t" \ - "mov z1.d, p5/m, z19.d \n\t" \ - "mov z2.d, p5/m, z20.d \n\t" \ - "mov z3.d, p5/m, z21.d \n\t" \ - "mov z4.d, p5/m, z22.d \n\t" \ - "mov z5.d, p5/m, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YM_PROJ -#define YM_PROJ_A64FXd \ -{ \ -asm ( \ - "fadd z12.d, p5/m, z12.d, z21.d \n\t" \ - "fadd z13.d, p5/m, z13.d, z22.d \n\t" \ - "fadd z14.d, p5/m, z14.d, z23.d \n\t" \ - "fsub z15.d, p5/m, z15.d, z18.d \n\t" \ - "fsub z16.d, p5/m, z16.d, z19.d \n\t" \ - "fsub z17.d, p5/m, z17.d, z20.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// ZM_PROJ -#define ZM_PROJ_A64FXd \ -{ \ -asm ( \ - "fcadd z12.d, p5/m, z12.d, z18.d, 270 \n\t" \ - "fcadd z13.d, p5/m, z13.d, z19.d, 270 \n\t" \ - "fcadd z14.d, p5/m, z14.d, z20.d, 270 \n\t" \ - "fcadd z15.d, p5/m, z15.d, z21.d, 90 \n\t" \ - "fcadd z16.d, p5/m, z16.d, z22.d, 90 \n\t" \ - "fcadd z17.d, p5/m, z17.d, z23.d, 90 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// TM_PROJ -#define TM_PROJ_A64FXd \ -{ \ -asm ( \ - "fsub z12.d, p5/m, z12.d, z18.d \n\t" \ - "fsub z13.d, p5/m, z13.d, z19.d \n\t" \ - "fsub z14.d, p5/m, z14.d, z20.d \n\t" \ - "fsub z15.d, p5/m, z15.d, z21.d \n\t" \ - "fsub z16.d, p5/m, z16.d, z22.d \n\t" \ - "fsub z17.d, p5/m, z17.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_RECON_ACCUM -#define XM_RECON_ACCUM_A64FXd \ -asm ( \ - "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \ - "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \ - "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \ - "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \ - "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \ - "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YP_RECON_ACCUM -#define YP_RECON_ACCUM_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fsub z9.d, p5/m, z9.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fsub z10.d, p5/m, z10.d, z19.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fsub z11.d, p5/m, z11.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fadd z6.d, p5/m, z6.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fadd z7.d, p5/m, z7.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - "fadd z8.d, p5/m, z8.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YM_RECON_ACCUM -#define YM_RECON_ACCUM_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fadd z9.d, p5/m, z9.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fadd z10.d, p5/m, z10.d, z19.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fadd z11.d, p5/m, z11.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fsub z6.d, p5/m, z6.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fsub z7.d, p5/m, z7.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - "fsub z8.d, p5/m, z8.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZP_RECON_ACCUM -#define ZP_RECON_ACCUM_A64FXd \ -asm ( \ - "fcadd z6.d, p5/m, z6.d, z18.d, 270 \n\t" \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fcadd z7.d, p5/m, z7.d, z19.d, 270 \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fcadd z8.d, p5/m, z8.d, z20.d, 270 \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fcadd z9.d, p5/m, z9.d, z21.d, 90 \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fcadd z10.d, p5/m, z10.d, z22.d, 90 \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fcadd z11.d, p5/m, z11.d, z23.d, 90 \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZM_RECON_ACCUM -#define ZM_RECON_ACCUM_A64FXd \ -asm ( \ - "fcadd z6.d, p5/m, z6.d, z18.d, 90 \n\t" \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fcadd z7.d, p5/m, z7.d, z19.d, 90 \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fcadd z8.d, p5/m, z8.d, z20.d, 90 \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fcadd z9.d, p5/m, z9.d, z21.d, 270 \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fcadd z10.d, p5/m, z10.d, z22.d, 270 \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fcadd z11.d, p5/m, z11.d, z23.d, 270 \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// TP_RECON_ACCUM -#define TP_RECON_ACCUM_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fadd z6.d, p5/m, z6.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fadd z7.d, p5/m, z7.d, z19.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fadd z8.d, p5/m, z8.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fadd z9.d, p5/m, z9.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fadd z10.d, p5/m, z10.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - "fadd z11.d, p5/m, z11.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// TM_RECON_ACCUM -#define TM_RECON_ACCUM_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fsub z6.d, p5/m, z6.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fsub z7.d, p5/m, z7.d, z19.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fsub z8.d, p5/m, z8.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fsub z9.d, p5/m, z9.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fsub z10.d, p5/m, z10.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - "fsub z11.d, p5/m, z11.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZERO_PSI -#define ZERO_PSI_A64FXd \ -asm ( \ - "fmov z0.d , 0 \n\t" \ - "fmov z1.d , 0 \n\t" \ - "fmov z2.d , 0 \n\t" \ - "fmov z3.d , 0 \n\t" \ - "fmov z4.d , 0 \n\t" \ - "fmov z5.d , 0 \n\t" \ - "fmov z6.d , 0 \n\t" \ - "fmov z7.d , 0 \n\t" \ - "fmov z8.d , 0 \n\t" \ - "fmov z9.d , 0 \n\t" \ - "fmov z10.d , 0 \n\t" \ - "fmov z11.d , 0 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) -#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \ -{ \ -asm ( \ - "dc zva, %[fetchptr]\n\t" \ - "dc zva, %[fetchptr]\n\t" \ - "dc zva, %[fetchptr]\n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_RESULT_L1_STORE (prefetch store to L1) -#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base) \ -{ \ -asm ( \ - "prfd PSTL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PSTL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PSTL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// ADD_RESULT_INTERNAL -#define ADD_RESULT_INTERNAL_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z12.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z13.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z14.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z15.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z16.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z17.d \n\t" \ - "fadd z6.d, p5/m, z6.d, z18.d \n\t" \ - "fadd z7.d, p5/m, z7.d, z19.d \n\t" \ - "fadd z8.d, p5/m, z8.d, z20.d \n\t" \ - "fadd z9.d, p5/m, z9.d, z21.d \n\t" \ - "fadd z10.d, p5/m, z10.d, z22.d \n\t" \ - "fadd z11.d, p5/m, z11.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h deleted file mode 100644 index e629f617..00000000 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ /dev/null @@ -1,781 +0,0 @@ -/************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: Fujitsu_A64FX_asm_single.h - - Copyright (C) 2020 - -Author: Nils Meyer - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory -*************************************************************************************/ -/* END LEGAL */ -#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXf(base) -#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A) -#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) -#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A) -#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) -#define PF_GAUGE(A) -#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(A) -#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(A) -#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) -#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) -#define LOCK_GAUGE(A) -#define UNLOCK_GAUGE(A) -#define MASK_REGS DECLARATIONS_A64FXf -#define SAVE_RESULT(A,B) RESULT_A64FXf(A); -#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir) -#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf -#define LOAD_CHI(base) LOAD_CHI_A64FXf(base) -#define ZERO_PSI ZERO_PSI_A64FXf -#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) -#define XP_PROJ XP_PROJ_A64FXf -#define YP_PROJ YP_PROJ_A64FXf -#define ZP_PROJ ZP_PROJ_A64FXf -#define TP_PROJ TP_PROJ_A64FXf -#define XM_PROJ XM_PROJ_A64FXf -#define YM_PROJ YM_PROJ_A64FXf -#define ZM_PROJ ZM_PROJ_A64FXf -#define TM_PROJ TM_PROJ_A64FXf -#define XP_RECON XP_RECON_A64FXf -#define XM_RECON XM_RECON_A64FXf -#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf -#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf -#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf -#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf -#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXf -#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf -#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf -#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf -#define PERMUTE_DIR0 0 -#define PERMUTE_DIR1 1 -#define PERMUTE_DIR2 2 -#define PERMUTE_DIR3 3 -#define PERMUTE PERMUTE_A64FXf; -#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1 } else if (Dir == 2) { LOAD_TABLE2; } else if (Dir == 3) { LOAD_TABLE3; } -#define MAYBEPERM(A,perm) if (perm) { PERMUTE; } -// DECLARATIONS -#define DECLARATIONS_A64FXf \ - uint64_t baseU; \ - const uint32_t lut[4][16] = { \ - {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ - {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ - {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \ - {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \ -asm ( \ - "ptrue p5.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -asm ( \ - "fmov z31.s , 0 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// RESULT -#define RESULT_A64FXf(base) \ -{ \ -asm ( \ - "str z0, [%[storeptr], -6, mul vl] \n\t" \ - "str z1, [%[storeptr], -5, mul vl] \n\t" \ - "str z2, [%[storeptr], -4, mul vl] \n\t" \ - "str z3, [%[storeptr], -3, mul vl] \n\t" \ - "str z4, [%[storeptr], -2, mul vl] \n\t" \ - "str z5, [%[storeptr], -1, mul vl] \n\t" \ - "str z6, [%[storeptr], 0, mul vl] \n\t" \ - "str z7, [%[storeptr], 1, mul vl] \n\t" \ - "str z8, [%[storeptr], 2, mul vl] \n\t" \ - "str z9, [%[storeptr], 3, mul vl] \n\t" \ - "str z10, [%[storeptr], 4, mul vl] \n\t" \ - "str z11, [%[storeptr], 5, mul vl] \n\t" \ - : \ - : [storeptr] "r" (base + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_CHIMU_L2 (prefetch to L2) -#define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \ -{ \ -asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_CHIMU_L1 (prefetch to L1) -#define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \ -{ \ -asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_GAUGE_L2 (prefetch to L2) -#define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \ -{ \ - const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ -asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_GAUGE_L1 (prefetch to L1) -#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \ -{ \ - const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ -asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHI -#define LOAD_CHI_A64FXf(base) \ -{ \ -asm ( \ - "ld1w { z12.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z13.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1w { z14.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - "ld1w { z15.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1w { z16.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1w { z17.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU -#define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \ -{ \ -asm ( \ - "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1w { z21.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1w { z18.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1w { z22.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1w { z19.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1w { z23.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1w { z20.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU_0213 -#define LOAD_CHIMU_0213_A64FXf \ -{ \ - const SiteSpinor & ref(in[offset]); \ -asm ( \ - "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1w { z18.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1w { z19.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1w { z20.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1w { z21.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1w { z22.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1w { z23.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (&ref[2][0]) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU_0312 -#define LOAD_CHIMU_0312_A64FXf \ -{ \ - const SiteSpinor & ref(in[offset]); \ -asm ( \ - "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1w { z21.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1w { z22.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1w { z23.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1w { z18.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1w { z19.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1w { z20.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (&ref[2][0]) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_TABLE0 -#define LOAD_TABLE0 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (0) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE1 -#define LOAD_TABLE1 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (1) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE2 -#define LOAD_TABLE2 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (2) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE3 -#define LOAD_TABLE3 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (3) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PERMUTE -#define PERMUTE_A64FXf \ -asm ( \ - "tbl z12.s, { z12.s }, z30.s \n\t" \ - "tbl z13.s, { z13.s }, z30.s \n\t" \ - "tbl z14.s, { z14.s }, z30.s \n\t" \ - "tbl z15.s, { z15.s }, z30.s \n\t" \ - "tbl z16.s, { z16.s }, z30.s \n\t" \ - "tbl z17.s, { z17.s }, z30.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_GAUGE -#define LOAD_GAUGE(A) \ -{ \ - const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ -asm ( \ - "ld1w { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1w { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1w { z26.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z27.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1w { z28.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1w { z29.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// MULT_2SPIN -#define MULT_2SPIN_1_A64FXf(A) \ -{ \ - const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ -asm ( \ - "ld1w { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1w { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1w { z26.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z27.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1w { z28.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1w { z29.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "movprfx z18.s, p5/m, z31.s \n\t" \ - "fcmla z18.s, p5/m, z24.s, z12.s, 0 \n\t" \ - "movprfx z21.s, p5/m, z31.s \n\t" \ - "fcmla z21.s, p5/m, z24.s, z15.s, 0 \n\t" \ - "movprfx z19.s, p5/m, z31.s \n\t" \ - "fcmla z19.s, p5/m, z25.s, z12.s, 0 \n\t" \ - "movprfx z22.s, p5/m, z31.s \n\t" \ - "fcmla z22.s, p5/m, z25.s, z15.s, 0 \n\t" \ - "movprfx z20.s, p5/m, z31.s \n\t" \ - "fcmla z20.s, p5/m, z26.s, z12.s, 0 \n\t" \ - "movprfx z23.s, p5/m, z31.s \n\t" \ - "fcmla z23.s, p5/m, z26.s, z15.s, 0 \n\t" \ - "fcmla z18.s, p5/m, z24.s, z12.s, 90 \n\t" \ - "fcmla z21.s, p5/m, z24.s, z15.s, 90 \n\t" \ - "fcmla z19.s, p5/m, z25.s, z12.s, 90 \n\t" \ - "fcmla z22.s, p5/m, z25.s, z15.s, 90 \n\t" \ - "fcmla z20.s, p5/m, z26.s, z12.s, 90 \n\t" \ - "fcmla z23.s, p5/m, z26.s, z15.s, 90 \n\t" \ - "ld1w { z24.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1w { z25.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1w { z26.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// MULT_2SPIN_BACKEND -#define MULT_2SPIN_2_A64FXf \ -{ \ -asm ( \ - "fcmla z18.s, p5/m, z27.s, z13.s, 0 \n\t" \ - "fcmla z21.s, p5/m, z27.s, z16.s, 0 \n\t" \ - "fcmla z19.s, p5/m, z28.s, z13.s, 0 \n\t" \ - "fcmla z22.s, p5/m, z28.s, z16.s, 0 \n\t" \ - "fcmla z20.s, p5/m, z29.s, z13.s, 0 \n\t" \ - "fcmla z23.s, p5/m, z29.s, z16.s, 0 \n\t" \ - "fcmla z18.s, p5/m, z27.s, z13.s, 90 \n\t" \ - "fcmla z21.s, p5/m, z27.s, z16.s, 90 \n\t" \ - "fcmla z19.s, p5/m, z28.s, z13.s, 90 \n\t" \ - "fcmla z22.s, p5/m, z28.s, z16.s, 90 \n\t" \ - "fcmla z20.s, p5/m, z29.s, z13.s, 90 \n\t" \ - "fcmla z23.s, p5/m, z29.s, z16.s, 90 \n\t" \ - "fcmla z18.s, p5/m, z24.s, z14.s, 0 \n\t" \ - "fcmla z21.s, p5/m, z24.s, z17.s, 0 \n\t" \ - "fcmla z19.s, p5/m, z25.s, z14.s, 0 \n\t" \ - "fcmla z22.s, p5/m, z25.s, z17.s, 0 \n\t" \ - "fcmla z20.s, p5/m, z26.s, z14.s, 0 \n\t" \ - "fcmla z23.s, p5/m, z26.s, z17.s, 0 \n\t" \ - "fcmla z18.s, p5/m, z24.s, z14.s, 90 \n\t" \ - "fcmla z21.s, p5/m, z24.s, z17.s, 90 \n\t" \ - "fcmla z19.s, p5/m, z25.s, z14.s, 90 \n\t" \ - "fcmla z22.s, p5/m, z25.s, z17.s, 90 \n\t" \ - "fcmla z20.s, p5/m, z26.s, z14.s, 90 \n\t" \ - "fcmla z23.s, p5/m, z26.s, z17.s, 90 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XP_PROJ -#define XP_PROJ_A64FXf \ -{ \ -asm ( \ - "fcadd z12.s, p5/m, z12.s, z21.s, 90 \n\t" \ - "fcadd z13.s, p5/m, z13.s, z22.s, 90 \n\t" \ - "fcadd z14.s, p5/m, z14.s, z23.s, 90 \n\t" \ - "fcadd z15.s, p5/m, z15.s, z18.s, 90 \n\t" \ - "fcadd z16.s, p5/m, z16.s, z19.s, 90 \n\t" \ - "fcadd z17.s, p5/m, z17.s, z20.s, 90 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XP_RECON -#define XP_RECON_A64FXf \ -asm ( \ - "movprfx z6.s, p5/m, z31.s \n\t" \ - "fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \ - "movprfx z7.s, p5/m, z31.s \n\t" \ - "fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \ - "movprfx z8.s, p5/m, z31.s \n\t" \ - "fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \ - "movprfx z9.s, p5/m, z31.s \n\t" \ - "fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \ - "movprfx z10.s, p5/m, z31.s \n\t" \ - "fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \ - "movprfx z11.s, p5/m, z31.s \n\t" \ - "fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \ - "mov z0.s, p5/m, z18.s \n\t" \ - "mov z1.s, p5/m, z19.s \n\t" \ - "mov z2.s, p5/m, z20.s \n\t" \ - "mov z3.s, p5/m, z21.s \n\t" \ - "mov z4.s, p5/m, z22.s \n\t" \ - "mov z5.s, p5/m, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// XP_RECON_ACCUM -#define XP_RECON_ACCUM_A64FXf \ -asm ( \ - "fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YP_PROJ -#define YP_PROJ_A64FXf \ -{ \ -asm ( \ - "fsub z12.s, p5/m, z12.s, z21.s \n\t" \ - "fsub z13.s, p5/m, z13.s, z22.s \n\t" \ - "fsub z14.s, p5/m, z14.s, z23.s \n\t" \ - "fadd z15.s, p5/m, z15.s, z18.s \n\t" \ - "fadd z16.s, p5/m, z16.s, z19.s \n\t" \ - "fadd z17.s, p5/m, z17.s, z20.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// ZP_PROJ -#define ZP_PROJ_A64FXf \ -{ \ -asm ( \ - "fcadd z12.s, p5/m, z12.s, z18.s, 90 \n\t" \ - "fcadd z13.s, p5/m, z13.s, z19.s, 90 \n\t" \ - "fcadd z14.s, p5/m, z14.s, z20.s, 90 \n\t" \ - "fcadd z15.s, p5/m, z15.s, z21.s, 270 \n\t" \ - "fcadd z16.s, p5/m, z16.s, z22.s, 270 \n\t" \ - "fcadd z17.s, p5/m, z17.s, z23.s, 270 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// TP_PROJ -#define TP_PROJ_A64FXf \ -{ \ -asm ( \ - "fadd z12.s, p5/m, z12.s, z18.s \n\t" \ - "fadd z13.s, p5/m, z13.s, z19.s \n\t" \ - "fadd z14.s, p5/m, z14.s, z20.s \n\t" \ - "fadd z15.s, p5/m, z15.s, z21.s \n\t" \ - "fadd z16.s, p5/m, z16.s, z22.s \n\t" \ - "fadd z17.s, p5/m, z17.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_PROJ -#define XM_PROJ_A64FXf \ -{ \ -asm ( \ - "fcadd z12.s, p5/m, z12.s, z21.s, 270 \n\t" \ - "fcadd z13.s, p5/m, z13.s, z22.s, 270 \n\t" \ - "fcadd z14.s, p5/m, z14.s, z23.s, 270 \n\t" \ - "fcadd z15.s, p5/m, z15.s, z18.s, 270 \n\t" \ - "fcadd z16.s, p5/m, z16.s, z19.s, 270 \n\t" \ - "fcadd z17.s, p5/m, z17.s, z20.s, 270 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_RECON -#define XM_RECON_A64FXf \ -asm ( \ - "movprfx z6.s, p5/m, z31.s \n\t" \ - "fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \ - "movprfx z7.s, p5/m, z31.s \n\t" \ - "fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \ - "movprfx z8.s, p5/m, z31.s \n\t" \ - "fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \ - "movprfx z9.s, p5/m, z31.s \n\t" \ - "fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \ - "movprfx z10.s, p5/m, z31.s \n\t" \ - "fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \ - "movprfx z11.s, p5/m, z31.s \n\t" \ - "fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \ - "mov z0.s, p5/m, z18.s \n\t" \ - "mov z1.s, p5/m, z19.s \n\t" \ - "mov z2.s, p5/m, z20.s \n\t" \ - "mov z3.s, p5/m, z21.s \n\t" \ - "mov z4.s, p5/m, z22.s \n\t" \ - "mov z5.s, p5/m, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YM_PROJ -#define YM_PROJ_A64FXf \ -{ \ -asm ( \ - "fadd z12.s, p5/m, z12.s, z21.s \n\t" \ - "fadd z13.s, p5/m, z13.s, z22.s \n\t" \ - "fadd z14.s, p5/m, z14.s, z23.s \n\t" \ - "fsub z15.s, p5/m, z15.s, z18.s \n\t" \ - "fsub z16.s, p5/m, z16.s, z19.s \n\t" \ - "fsub z17.s, p5/m, z17.s, z20.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// ZM_PROJ -#define ZM_PROJ_A64FXf \ -{ \ -asm ( \ - "fcadd z12.s, p5/m, z12.s, z18.s, 270 \n\t" \ - "fcadd z13.s, p5/m, z13.s, z19.s, 270 \n\t" \ - "fcadd z14.s, p5/m, z14.s, z20.s, 270 \n\t" \ - "fcadd z15.s, p5/m, z15.s, z21.s, 90 \n\t" \ - "fcadd z16.s, p5/m, z16.s, z22.s, 90 \n\t" \ - "fcadd z17.s, p5/m, z17.s, z23.s, 90 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// TM_PROJ -#define TM_PROJ_A64FXf \ -{ \ -asm ( \ - "fsub z12.s, p5/m, z12.s, z18.s \n\t" \ - "fsub z13.s, p5/m, z13.s, z19.s \n\t" \ - "fsub z14.s, p5/m, z14.s, z20.s \n\t" \ - "fsub z15.s, p5/m, z15.s, z21.s \n\t" \ - "fsub z16.s, p5/m, z16.s, z22.s \n\t" \ - "fsub z17.s, p5/m, z17.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_RECON_ACCUM -#define XM_RECON_ACCUM_A64FXf \ -asm ( \ - "fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \ - "fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \ - "fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \ - "fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \ - "fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \ - "fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YP_RECON_ACCUM -#define YP_RECON_ACCUM_A64FXf \ -asm ( \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fsub z9.s, p5/m, z9.s, z18.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fsub z10.s, p5/m, z10.s, z19.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fsub z11.s, p5/m, z11.s, z20.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fadd z6.s, p5/m, z6.s, z21.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fadd z7.s, p5/m, z7.s, z22.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - "fadd z8.s, p5/m, z8.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YM_RECON_ACCUM -#define YM_RECON_ACCUM_A64FXf \ -asm ( \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fadd z9.s, p5/m, z9.s, z18.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fadd z10.s, p5/m, z10.s, z19.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fadd z11.s, p5/m, z11.s, z20.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fsub z6.s, p5/m, z6.s, z21.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fsub z7.s, p5/m, z7.s, z22.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - "fsub z8.s, p5/m, z8.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZP_RECON_ACCUM -#define ZP_RECON_ACCUM_A64FXf \ -asm ( \ - "fcadd z6.s, p5/m, z6.s, z18.s, 270 \n\t" \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fcadd z7.s, p5/m, z7.s, z19.s, 270 \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fcadd z8.s, p5/m, z8.s, z20.s, 270 \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fcadd z9.s, p5/m, z9.s, z21.s, 90 \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fcadd z10.s, p5/m, z10.s, z22.s, 90 \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fcadd z11.s, p5/m, z11.s, z23.s, 90 \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZM_RECON_ACCUM -#define ZM_RECON_ACCUM_A64FXf \ -asm ( \ - "fcadd z6.s, p5/m, z6.s, z18.s, 90 \n\t" \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fcadd z7.s, p5/m, z7.s, z19.s, 90 \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fcadd z8.s, p5/m, z8.s, z20.s, 90 \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fcadd z9.s, p5/m, z9.s, z21.s, 270 \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fcadd z10.s, p5/m, z10.s, z22.s, 270 \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fcadd z11.s, p5/m, z11.s, z23.s, 270 \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// TP_RECON_ACCUM -#define TP_RECON_ACCUM_A64FXf \ -asm ( \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fadd z6.s, p5/m, z6.s, z18.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fadd z7.s, p5/m, z7.s, z19.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fadd z8.s, p5/m, z8.s, z20.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fadd z9.s, p5/m, z9.s, z21.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fadd z10.s, p5/m, z10.s, z22.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - "fadd z11.s, p5/m, z11.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// TM_RECON_ACCUM -#define TM_RECON_ACCUM_A64FXf \ -asm ( \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fsub z6.s, p5/m, z6.s, z18.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fsub z7.s, p5/m, z7.s, z19.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fsub z8.s, p5/m, z8.s, z20.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fsub z9.s, p5/m, z9.s, z21.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fsub z10.s, p5/m, z10.s, z22.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - "fsub z11.s, p5/m, z11.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZERO_PSI -#define ZERO_PSI_A64FXf \ -asm ( \ - "fmov z0.s , 0 \n\t" \ - "fmov z1.s , 0 \n\t" \ - "fmov z2.s , 0 \n\t" \ - "fmov z3.s , 0 \n\t" \ - "fmov z4.s , 0 \n\t" \ - "fmov z5.s , 0 \n\t" \ - "fmov z6.s , 0 \n\t" \ - "fmov z7.s , 0 \n\t" \ - "fmov z8.s , 0 \n\t" \ - "fmov z9.s , 0 \n\t" \ - "fmov z10.s , 0 \n\t" \ - "fmov z11.s , 0 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) -#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \ -{ \ -asm ( \ - "dc zva, %[fetchptr]\n\t" \ - "dc zva, %[fetchptr]\n\t" \ - "dc zva, %[fetchptr]\n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_RESULT_L1_STORE (prefetch store to L1) -#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base) \ -{ \ -asm ( \ - "prfd PSTL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PSTL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PSTL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// ADD_RESULT_INTERNAL -#define ADD_RESULT_INTERNAL_A64FXf \ -asm ( \ - "fadd z0.s, p5/m, z0.s, z12.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z13.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z14.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z15.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z16.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z17.s \n\t" \ - "fadd z6.s, p5/m, z6.s, z18.s \n\t" \ - "fadd z7.s, p5/m, z7.s, z19.s \n\t" \ - "fadd z8.s, p5/m, z8.s, z20.s \n\t" \ - "fadd z9.s, p5/m, z9.s, z21.s \n\t" \ - "fadd z10.s, p5/m, z10.s, z22.s \n\t" \ - "fadd z11.s, p5/m, z11.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - From 45d49d86487427ea1e0b34c0d530d475f8e3e31a Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Sat, 19 Dec 2020 03:35:18 +0100 Subject: [PATCH 5/5] clean up --- .../implementation/WilsonKernelsAsmBodyA64FX.h | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h index 83588a7d..4e463438 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h @@ -26,9 +26,9 @@ Author: Nils Meyer Regensburg University *************************************************************************************/ /* END LEGAL */ -// GCC 10 messes up SVE instruction scheduling using -O3 only, -// using -O3 -fno-schedule-insns -fno-schedule-insns2 does wonders -// performance is better than armclang 20.2 +// GCC 10 messes up SVE instruction scheduling using -O3, but +// -O3 -fno-schedule-insns -fno-schedule-insns2 does wonders +// performance now is better than armclang 20.2 #ifdef KERNEL_DAG #define DIR0_PROJ XP_PROJ @@ -118,10 +118,6 @@ Author: Nils Meyer Regensburg University /* NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty though I expected that it would improve on performance - - if (s == 0) { \ - if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ - } \ */ #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ @@ -149,7 +145,7 @@ NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty if ( local || st.same_node[Dir] ) { \ MULT_2SPIN_1(Dir); \ MULT_2SPIN_2; \ - RECON; \ + RECON; \ } \ base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ PREFETCH_CHIMU(base); \ @@ -300,7 +296,7 @@ NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty // DC ZVA test // { uint64_t basestore = (uint64_t)&out[ss]; - // PREFETCH_RESULT_L2_STORE(basestore); } + // PREFETCH_RESULT_L2_STORE(basestore); } ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON); @@ -336,8 +332,8 @@ NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty // DC ZVA test //{ uint64_t basestore = (uint64_t)&out[ss]; - // PREFETCH_RESULT_L2_STORE(basestore); - //} + // PREFETCH_RESULT_L2_STORE(basestore); } + ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON);