mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-31 12:04:33 +00:00 
			
		
		
		
	Merge pull request #329 from nmeyer-ur/feature/a64fx-3
Revised dslash/dwf kernels for A64FX
This commit is contained in:
		| @@ -38,9 +38,6 @@ Author: Nils Meyer  <nils.meyer@ur.de>  Regensburg University | |||||||
| // undefine everything related to kernels | // undefine everything related to kernels | ||||||
| #include <simd/Fujitsu_A64FX_undef.h> | #include <simd/Fujitsu_A64FX_undef.h> | ||||||
|  |  | ||||||
| // enable A64FX body |  | ||||||
| #define WILSONKERNELSASMBODYA64FX |  | ||||||
| //#pragma message("A64FX Dslash: WilsonKernelsAsmBodyA64FX.h") |  | ||||||
|  |  | ||||||
|     /////////////////////////////////////////////////////////// |     /////////////////////////////////////////////////////////// | ||||||
|     // If we are A64FX specialise the single precision routine |     // If we are A64FX specialise the single precision routine | ||||||
| @@ -63,119 +60,89 @@ Author: Nils Meyer  <nils.meyer@ur.de>  Regensburg University | |||||||
| #define INTERIOR_AND_EXTERIOR | #define INTERIOR_AND_EXTERIOR | ||||||
| #undef INTERIOR | #undef INTERIOR | ||||||
| #undef EXTERIOR | #undef EXTERIOR | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| #undef INTERIOR_AND_EXTERIOR | #undef INTERIOR_AND_EXTERIOR | ||||||
| #define INTERIOR | #define INTERIOR | ||||||
| #undef EXTERIOR | #undef EXTERIOR | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| #undef INTERIOR_AND_EXTERIOR | #undef INTERIOR_AND_EXTERIOR | ||||||
| #undef INTERIOR | #undef INTERIOR | ||||||
| #define EXTERIOR | #define EXTERIOR | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  |  | ||||||
| ///////////////////////////////////////////////////////////////// | ///////////////////////////////////////////////////////////////// | ||||||
| @@ -185,119 +152,89 @@ WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldV | |||||||
| #define INTERIOR_AND_EXTERIOR | #define INTERIOR_AND_EXTERIOR | ||||||
| #undef INTERIOR | #undef INTERIOR | ||||||
| #undef EXTERIOR | #undef EXTERIOR | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| #undef INTERIOR_AND_EXTERIOR | #undef INTERIOR_AND_EXTERIOR | ||||||
| #define INTERIOR | #define INTERIOR | ||||||
| #undef EXTERIOR | #undef EXTERIOR | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| #undef INTERIOR_AND_EXTERIOR | #undef INTERIOR_AND_EXTERIOR | ||||||
| #undef INTERIOR | #undef INTERIOR | ||||||
| #define EXTERIOR | #define EXTERIOR | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  |  | ||||||
| // undefine | // undefine | ||||||
| @@ -330,119 +267,89 @@ WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFie | |||||||
| #define INTERIOR_AND_EXTERIOR | #define INTERIOR_AND_EXTERIOR | ||||||
| #undef INTERIOR | #undef INTERIOR | ||||||
| #undef EXTERIOR | #undef EXTERIOR | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| #undef INTERIOR_AND_EXTERIOR | #undef INTERIOR_AND_EXTERIOR | ||||||
| #define INTERIOR | #define INTERIOR | ||||||
| #undef EXTERIOR | #undef EXTERIOR | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| #undef INTERIOR_AND_EXTERIOR | #undef INTERIOR_AND_EXTERIOR | ||||||
| #undef INTERIOR | #undef INTERIOR | ||||||
| #define EXTERIOR | #define EXTERIOR | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| ///////////////////////////////////////////////////////////////// | ///////////////////////////////////////////////////////////////// | ||||||
| // XYZT vectorised, dag Kernel, double | // XYZT vectorised, dag Kernel, double | ||||||
| @@ -451,124 +358,93 @@ WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldV | |||||||
| #define INTERIOR_AND_EXTERIOR | #define INTERIOR_AND_EXTERIOR | ||||||
| #undef INTERIOR | #undef INTERIOR | ||||||
| #undef EXTERIOR | #undef EXTERIOR | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| #undef INTERIOR_AND_EXTERIOR | #undef INTERIOR_AND_EXTERIOR | ||||||
| #define INTERIOR | #define INTERIOR | ||||||
| #undef EXTERIOR | #undef EXTERIOR | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| #undef INTERIOR_AND_EXTERIOR | #undef INTERIOR_AND_EXTERIOR | ||||||
| #undef INTERIOR | #undef INTERIOR | ||||||
| #define EXTERIOR | #define EXTERIOR | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") | ||||||
| template<> void | template<> void | ||||||
| WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) | ||||||
| #if defined (WILSONKERNELSASMBODYA64FX) |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> | ||||||
| #else |  | ||||||
| #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| // undefs | // undefs | ||||||
| #undef WILSONKERNELSASMBODYA64FX |  | ||||||
| #include <simd/Fujitsu_A64FX_undef.h> | #include <simd/Fujitsu_A64FX_undef.h> | ||||||
|  |  | ||||||
| #endif //A64FXASM | #endif //A64FXASM | ||||||
|   | |||||||
| @@ -25,6 +25,11 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University | |||||||
|     See the full license in the file "LICENSE" in the top level distribution directory |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
| *************************************************************************************/ | *************************************************************************************/ | ||||||
| /*  END LEGAL */ | /*  END LEGAL */ | ||||||
|  |  | ||||||
|  | // GCC 10 messes up SVE instruction scheduling using -O3, but | ||||||
|  | // -O3 -fno-schedule-insns -fno-schedule-insns2 does wonders | ||||||
|  | // performance now is better than armclang 20.2 | ||||||
|  |  | ||||||
| #ifdef KERNEL_DAG | #ifdef KERNEL_DAG | ||||||
| #define DIR0_PROJ    XP_PROJ | #define DIR0_PROJ    XP_PROJ | ||||||
| #define DIR1_PROJ    YP_PROJ | #define DIR1_PROJ    YP_PROJ | ||||||
| @@ -97,7 +102,7 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University | |||||||
|     PROJ;							                        \ |     PROJ;							                        \ | ||||||
|     MAYBEPERM(PERMUTE_DIR,perm);					        \ |     MAYBEPERM(PERMUTE_DIR,perm);					        \ | ||||||
|       } else {								                \ |       } else {								                \ | ||||||
| 	LOAD_CHI(base);							                \ | 	  LOAD_CHI(base);							                \ | ||||||
|       }									                    \ |       }									                    \ | ||||||
|       base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++;	\ |       base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++;	\ | ||||||
|     MULT_2SPIN_1(Dir);					                    \ |     MULT_2SPIN_1(Dir);					                    \ | ||||||
| @@ -110,6 +115,11 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University | |||||||
|     }                                                       \ |     }                                                       \ | ||||||
|     RECON;								                    \ |     RECON;								                    \ | ||||||
|  |  | ||||||
|  | /* | ||||||
|  | NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty | ||||||
|  |     though I expected that it would improve on performance | ||||||
|  | */ | ||||||
|  |  | ||||||
| #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)	    \ | #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)	    \ | ||||||
|   base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ |   base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ | ||||||
|   PREFETCH1_CHIMU(base);						            \ |   PREFETCH1_CHIMU(base);						            \ | ||||||
| @@ -126,73 +136,63 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University | |||||||
|  |  | ||||||
| #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\ | #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\ | ||||||
|       basep = st.GetPFInfo(nent,plocal); nent++;			\ |       basep = st.GetPFInfo(nent,plocal); nent++;			\ | ||||||
|       if ( local ) {							            \ |       if ( local ) {							\ | ||||||
|     LOAD_CHIMU(base);                                       \ |   LOAD_CHIMU(base);                                       \ | ||||||
|     LOAD_TABLE(PERMUTE_DIR);                                \ |   LOAD_TABLE(PERMUTE_DIR);                                \ | ||||||
|     PROJ;							                        \ |   PROJ;							                        \ | ||||||
|     MAYBEPERM(PERMUTE_DIR,perm);					        \ |   MAYBEPERM(PERMUTE_DIR,perm);					        \ | ||||||
|       }else if ( st.same_node[Dir] ) {LOAD_CHI(base);}	    \ |       }else if ( st.same_node[Dir] ) {LOAD_CHI(base);}			\ | ||||||
|       base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++;	\ |       if ( local || st.same_node[Dir] ) {				\ | ||||||
|       if ( local || st.same_node[Dir] ) {				    \ |   MULT_2SPIN_1(Dir);					                    \ | ||||||
|     MULT_2SPIN_1(Dir);					                    \ |   MULT_2SPIN_2;					                        \ | ||||||
|     PREFETCH_CHIMU(base);                                   \ |   RECON;								\ | ||||||
|     /* PREFETCH_GAUGE_L1(NxtDir); */                        \ |       }									\ | ||||||
|     MULT_2SPIN_2;					                        \ |   base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++;	\ | ||||||
|     if (s == 0) {                                           \ |   PREFETCH_CHIMU(base);						\ | ||||||
|        if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ |   PREFETCH_CHIMU_L2(basep);                               \ | ||||||
|     }                                                       \ |  | ||||||
|     RECON;								                    \ |  | ||||||
|     PREFETCH_CHIMU_L2(basep);                               \ |  | ||||||
|       } else { PREFETCH_CHIMU(base); }								                    \ |  | ||||||
|  |  | ||||||
| #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\ | #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\ | ||||||
|   base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;		\ |   base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;		\ | ||||||
|   PREFETCH1_CHIMU(base);						\ |   PREFETCH1_CHIMU(base);						\ | ||||||
|  |   { ZERO_PSI; }								\ | ||||||
|   ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) |   ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) | ||||||
|  |  | ||||||
| #define RESULT(base,basep) SAVE_RESULT(base,basep); | #define RESULT(base,basep) SAVE_RESULT(base,basep); | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| //////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////// | ||||||
| // Post comms kernel | // Post comms kernel | ||||||
| //////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////// | ||||||
| #ifdef EXTERIOR | #ifdef EXTERIOR | ||||||
|  |  | ||||||
|  |  | ||||||
| #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\ | #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\ | ||||||
|   base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ |   base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;		\ | ||||||
|   if((!local)&&(!st.same_node[Dir]) ) {					    \ |   if((!local)&&(!st.same_node[Dir]) ) {					\ | ||||||
|     LOAD_CHI(base);							                \ |     LOAD_CHI(base);							\ | ||||||
|     MULT_2SPIN_1(Dir);					                    \ |     MULT_2SPIN_1(Dir);					                    \ | ||||||
|     PREFETCH_CHIMU(base);                                   \ |  | ||||||
|     /* PREFETCH_GAUGE_L1(NxtDir); */                        \ |  | ||||||
|     MULT_2SPIN_2;					                        \ |     MULT_2SPIN_2;					                        \ | ||||||
|     if (s == 0) {                                           \ |     RECON;								\ | ||||||
|       if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ |     nmu++;								\ | ||||||
|     }                                                       \ |  | ||||||
|     RECON;								                    \ |  | ||||||
|     nmu++;								                    \ |  | ||||||
|   } |   } | ||||||
|  |  | ||||||
| #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)	    \ | #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\ | ||||||
|   nmu=0;								                    \ |   nmu=0;								\ | ||||||
|   base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;\ |   { ZERO_PSI;}								\ | ||||||
|   if((!local)&&(!st.same_node[Dir]) ) {					    \ |   base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;		\ | ||||||
|     LOAD_CHI(base);							                \ |   if((!local)&&(!st.same_node[Dir]) ) {					\ | ||||||
|  |     LOAD_CHI(base);							\ | ||||||
|     MULT_2SPIN_1(Dir);					                    \ |     MULT_2SPIN_1(Dir);					                    \ | ||||||
|     PREFETCH_CHIMU(base);                                   \ |  | ||||||
|     /* PREFETCH_GAUGE_L1(NxtDir); */                        \ |  | ||||||
|     MULT_2SPIN_2;					                        \ |     MULT_2SPIN_2;					                        \ | ||||||
|     if (s == 0) {                                           \ |     RECON;								\ | ||||||
|       if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ |     nmu++;								\ | ||||||
|     }                                                       \ |  | ||||||
|     RECON;								                    \ |  | ||||||
|     nmu++;								                    \ |  | ||||||
|   } |   } | ||||||
|  |  | ||||||
| #define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} | #define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  |  | ||||||
| { | { | ||||||
|   int nmu; |   int nmu; | ||||||
|   int local,perm, ptype; |   int local,perm, ptype; | ||||||
| @@ -209,7 +209,6 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University | |||||||
|     int ssn=ssU+1;     if(ssn>=nmax) ssn=0; |     int ssn=ssU+1;     if(ssn>=nmax) ssn=0; | ||||||
|     //    int sUn=lo.Reorder(ssn); |     //    int sUn=lo.Reorder(ssn); | ||||||
|     int sUn=ssn; |     int sUn=ssn; | ||||||
|     LOCK_GAUGE(0); |  | ||||||
| #else | #else | ||||||
|     int sU =ssU; |     int sU =ssU; | ||||||
|     int ssn=ssU+1;     if(ssn>=nmax) ssn=0; |     int ssn=ssU+1;     if(ssn>=nmax) ssn=0; | ||||||
| @@ -295,6 +294,11 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University | |||||||
|       std::cout << "----------------------------------------------------" << std::endl; |       std::cout << "----------------------------------------------------" << std::endl; | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  |       // DC ZVA test | ||||||
|  |       // { uint64_t basestore = (uint64_t)&out[ss]; | ||||||
|  |       //   PREFETCH_RESULT_L2_STORE(basestore); } | ||||||
|  |  | ||||||
|  |  | ||||||
|       ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON); |       ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON); | ||||||
|  |  | ||||||
| #ifdef SHOW | #ifdef SHOW | ||||||
| @@ -308,6 +312,11 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University | |||||||
|       std::cout << "----------------------------------------------------" << std::endl; |       std::cout << "----------------------------------------------------" << std::endl; | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  |       // DC ZVA test | ||||||
|  |       //{ uint64_t basestore = (uint64_t)&out[ss]; | ||||||
|  |       //  PREFETCH_RESULT_L2_STORE(basestore); } | ||||||
|  |  | ||||||
|  |  | ||||||
|       ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON); |       ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON); | ||||||
|  |  | ||||||
| #ifdef SHOW | #ifdef SHOW | ||||||
| @@ -321,6 +330,11 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University | |||||||
|       std::cout << "----------------------------------------------------" << std::endl; |       std::cout << "----------------------------------------------------" << std::endl; | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  |       // DC ZVA test | ||||||
|  |       //{ uint64_t basestore = (uint64_t)&out[ss]; | ||||||
|  |       //  PREFETCH_RESULT_L2_STORE(basestore); } | ||||||
|  |  | ||||||
|  |  | ||||||
|       ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON); |       ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON); | ||||||
|  |  | ||||||
| #ifdef SHOW | #ifdef SHOW | ||||||
| @@ -341,6 +355,7 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University | |||||||
|       base = (uint64_t) &out[ss]; |       base = (uint64_t) &out[ss]; | ||||||
|       basep= st.GetPFInfo(nent,plocal); ent++; |       basep= st.GetPFInfo(nent,plocal); ent++; | ||||||
|       basep = (uint64_t) &out[ssn]; |       basep = (uint64_t) &out[ssn]; | ||||||
|  |       //PREFETCH_RESULT_L1_STORE(base); | ||||||
|       RESULT(base,basep); |       RESULT(base,basep); | ||||||
|  |  | ||||||
| #ifdef SHOW | #ifdef SHOW | ||||||
|   | |||||||
| @@ -1,779 +0,0 @@ | |||||||
| /************************************************************************************* |  | ||||||
|  |  | ||||||
|     Grid physics library, www.github.com/paboyle/Grid |  | ||||||
|  |  | ||||||
|     Source file: Fujitsu_A64FX_asm_double.h |  | ||||||
|  |  | ||||||
|     Copyright (C) 2020 |  | ||||||
|  |  | ||||||
| Author: Nils Meyer <nils.meyer@ur.de> |  | ||||||
|  |  | ||||||
|     This program is free software; you can redistribute it and/or modify |  | ||||||
|     it under the terms of the GNU General Public License as published by |  | ||||||
|     the Free Software Foundation; either version 2 of the License, or |  | ||||||
|     (at your option) any later version. |  | ||||||
|  |  | ||||||
|     This program is distributed in the hope that it will be useful, |  | ||||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
|     GNU General Public License for more details. |  | ||||||
|  |  | ||||||
|     You should have received a copy of the GNU General Public License along |  | ||||||
|     with this program; if not, write to the Free Software Foundation, Inc., |  | ||||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |  | ||||||
|  |  | ||||||
|     See the full license in the file "LICENSE" in the top level distribution directory |  | ||||||
| *************************************************************************************/ |  | ||||||
| /*  END LEGAL */ |  | ||||||
| #define LOAD_CHIMU(base)               LOAD_CHIMU_INTERLEAVED_A64FXd(base)   |  | ||||||
| #define PREFETCH_CHIMU_L1(A)           PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A)   |  | ||||||
| #define PREFETCH_GAUGE_L1(A)           PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)   |  | ||||||
| #define PREFETCH_CHIMU_L2(A)           PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A)   |  | ||||||
| #define PREFETCH_GAUGE_L2(A)           PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A)   |  | ||||||
| #define PF_GAUGE(A)   |  | ||||||
| #define PREFETCH_RESULT_L2_STORE(A)    PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(A)   |  | ||||||
| #define PREFETCH_RESULT_L1_STORE(A)    PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(A)   |  | ||||||
| #define PREFETCH1_CHIMU(A)             PREFETCH_CHIMU_L1(A)   /* the generic kernel macro names in this run are bound to the double-precision (A64FXd) variants; both CHIMU prefetch names forward to the L1 prefetch */ |  | ||||||
| #define PREFETCH_CHIMU(A)              PREFETCH_CHIMU_L1(A)   |  | ||||||
| #define LOCK_GAUGE(A)   /* expands to nothing */ |  | ||||||
| #define UNLOCK_GAUGE(A)   /* expands to nothing */ |  | ||||||
| #define MASK_REGS                      DECLARATIONS_A64FXd   /* declares the permute table lut and zeroes z31 */ |  | ||||||
| #define SAVE_RESULT(A,B)               RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B)   /* expands to two statements that are not wrapped in a common block */ |  | ||||||
| #define MULT_2SPIN_1(Dir)              MULT_2SPIN_1_A64FXd(Dir)   |  | ||||||
| #define MULT_2SPIN_2                   MULT_2SPIN_2_A64FXd   |  | ||||||
| #define LOAD_CHI(base)                 LOAD_CHI_A64FXd(base)   |  | ||||||
| #define ADD_RESULT(base,basep)         LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)   /* basep is not used in the expansion; expands to three statements */ |  | ||||||
| #define XP_PROJ                        XP_PROJ_A64FXd   |  | ||||||
| #define YP_PROJ                        YP_PROJ_A64FXd   |  | ||||||
| #define ZP_PROJ                        ZP_PROJ_A64FXd   |  | ||||||
| #define TP_PROJ                        TP_PROJ_A64FXd   |  | ||||||
| #define XM_PROJ                        XM_PROJ_A64FXd   |  | ||||||
| #define YM_PROJ                        YM_PROJ_A64FXd   |  | ||||||
| #define ZM_PROJ                        ZM_PROJ_A64FXd   |  | ||||||
| #define TM_PROJ                        TM_PROJ_A64FXd   |  | ||||||
| #define XP_RECON                       XP_RECON_A64FXd   |  | ||||||
| #define XM_RECON                       XM_RECON_A64FXd   |  | ||||||
| #define XM_RECON_ACCUM                 XM_RECON_ACCUM_A64FXd   |  | ||||||
| #define YM_RECON_ACCUM                 YM_RECON_ACCUM_A64FXd   |  | ||||||
| #define ZM_RECON_ACCUM                 ZM_RECON_ACCUM_A64FXd   |  | ||||||
| #define TM_RECON_ACCUM                 TM_RECON_ACCUM_A64FXd   |  | ||||||
| #define XP_RECON_ACCUM                 XP_RECON_ACCUM_A64FXd   |  | ||||||
| #define YP_RECON_ACCUM                 YP_RECON_ACCUM_A64FXd   |  | ||||||
| #define ZP_RECON_ACCUM                 ZP_RECON_ACCUM_A64FXd   |  | ||||||
| #define TP_RECON_ACCUM                 TP_RECON_ACCUM_A64FXd   |  | ||||||
| #define PERMUTE_DIR0                   0   /* PERMUTE_DIR0..3: integer direction tags; their values match the Dir tests in LOAD_TABLE and MAYBEPERM below */ |  | ||||||
| #define PERMUTE_DIR1                   1   |  | ||||||
| #define PERMUTE_DIR2                   2   |  | ||||||
| #define PERMUTE_DIR3                   3   |  | ||||||
| #define PERMUTE                        PERMUTE_A64FXd;   /* PERMUTE_A64FXd already ends in ';', so this adds an empty statement */ |  | ||||||
| #define LOAD_TABLE(Dir)                if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; }   /* loads the permute index vector for Dir 0..2 into z30; any other Dir (including 3) loads nothing */ |  | ||||||
| #define MAYBEPERM(Dir,perm)            if (Dir != 3) { if (perm) { PERMUTE; } }   /* permutes z12..z17 only when Dir != 3 and perm is non-zero */ |  | ||||||
| // DECLARATIONS: defines the permute index table lut[4][8] (one row of eight uint64_t indices per LOAD_TABLEn macro) and sets z31 to zero; z31 is the source of the movprfx instructions used by the multiply and reconstruct macros. |  | ||||||
| // Row 3 used to read {0, 1, 2, 4, 5, 6, 7, 8}: index 3 was missing and index 8 lies outside an eight-entry row. It is now the identity permutation. |  | ||||||
| // NOTE(review): LOAD_TABLE(Dir) never selects row 3, so this only matters if LOAD_TABLE3 is used directly - confirm identity is the intended table. |  | ||||||
| #define DECLARATIONS_A64FXd  \ |  | ||||||
|     const uint64_t lut[4][8] = { \ |  | ||||||
|         {4, 5, 6, 7, 0, 1, 2, 3}, \ |  | ||||||
|         {2, 3, 0, 1, 6, 7, 4, 5}, \ |  | ||||||
|         {1, 0, 3, 2, 5, 4, 7, 6}, \ |  | ||||||
|         {0, 1, 2, 3, 4, 5, 6, 7} };\ |  | ||||||
| asm ( \ |  | ||||||
|     "fmov z31.d , 0 \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // RESULT: stores registers z0..z11 to twelve consecutive vector slots addressed as -6..5 (mul vl) relative to base + 2*3*64. NOTE(review): the bias cancels the -6 slot offset only if base is a byte address and the vector length is 64 bytes - confirm. |  | ||||||
| #define RESULT_A64FXd(base)  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "str z0, [%[storeptr], -6, mul vl] \n\t" \ |  | ||||||
|     "str z1, [%[storeptr], -5, mul vl] \n\t" \ |  | ||||||
|     "str z2, [%[storeptr], -4, mul vl] \n\t" \ |  | ||||||
|     "str z3, [%[storeptr], -3, mul vl] \n\t" \ |  | ||||||
|     "str z4, [%[storeptr], -2, mul vl] \n\t" \ |  | ||||||
|     "str z5, [%[storeptr], -1, mul vl] \n\t" \ |  | ||||||
|     "str z6, [%[storeptr], 0, mul vl] \n\t" \ |  | ||||||
|     "str z7, [%[storeptr], 1, mul vl] \n\t" \ |  | ||||||
|     "str z8, [%[storeptr], 2, mul vl] \n\t" \ |  | ||||||
|     "str z9, [%[storeptr], 3, mul vl] \n\t" \ |  | ||||||
|     "str z10, [%[storeptr], 4, mul vl] \n\t" \ |  | ||||||
|     "str z11, [%[storeptr], 5, mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [storeptr] "r" (base + 2 * 3 * 64) \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // PREFETCH_CHIMU_L2 (prefetch to L2): issues three prfd PLDL2STRM prefetches, governed by predicate p5, at offsets 0, 4 and 8 (mul vl) from base. Loads no registers. |  | ||||||
| #define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base)  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ |  | ||||||
|     "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ |  | ||||||
|     "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [fetchptr] "r" (base) \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // PREFETCH_CHIMU_L1 (prefetch to L1): issues three prfd PLDL1STRM prefetches, governed by predicate p5, at offsets 0, 4 and 8 (mul vl) from base. Loads no registers. |  | ||||||
| #define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base)  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ |  | ||||||
|     "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ |  | ||||||
|     "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [fetchptr] "r" (base) \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // PREFETCH_GAUGE_L2 (prefetch to L2): takes the address of U[sUn](A), adds 3*3*64 bytes, and issues nine prfd PLDL2STRM prefetches at offsets -4, 0, 4, ..., 28 (mul vl). Requires U and sUn to be in scope at the expansion site; note it indexes with sUn, whereas the L1 variant uses sU. |  | ||||||
| #define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A)  \ |  | ||||||
| { \ |  | ||||||
|     const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ |  | ||||||
| asm ( \ |  | ||||||
|     "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ |  | ||||||
|     "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ |  | ||||||
|     "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ |  | ||||||
|     "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ |  | ||||||
|     "prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \ |  | ||||||
|     "prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \ |  | ||||||
|     "prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \ |  | ||||||
|     "prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \ |  | ||||||
|     "prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [fetchptr] "r" (baseU) \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // PREFETCH_GAUGE_L1 (prefetch to L1): takes the address of U[sU](A) and issues three prfd PLDL1STRM prefetches at offsets 0, 4 and 8 (mul vl). Requires U and sU to be in scope at the expansion site. |  | ||||||
| #define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)  \ |  | ||||||
| { \ |  | ||||||
|     const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ |  | ||||||
| asm ( \ |  | ||||||
|     "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ |  | ||||||
|     "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ |  | ||||||
|     "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [fetchptr] "r" (baseU) \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // LOAD_CHI: loads six consecutive vectors (slots 0..5, mul vl) starting at base into z12..z17. Unlike the LOAD_CHIMU macros it applies no address bias and does not set predicate p5. |  | ||||||
| #define LOAD_CHI_A64FXd(base)  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "ldr z12, [%[fetchptr], 0, mul vl] \n\t" \ |  | ||||||
|     "ldr z13, [%[fetchptr], 1, mul vl] \n\t" \ |  | ||||||
|     "ldr z14, [%[fetchptr], 2, mul vl] \n\t" \ |  | ||||||
|     "ldr z15, [%[fetchptr], 3, mul vl] \n\t" \ |  | ||||||
|     "ldr z16, [%[fetchptr], 4, mul vl] \n\t" \ |  | ||||||
|     "ldr z17, [%[fetchptr], 5, mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [fetchptr] "r" (base) \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // LOAD_CHIMU (interleaved issue order): sets p5 to all-true for 64-bit elements, then loads twelve vectors from slots -6..5 (mul vl) relative to base + 2*3*64 into z12..z23 (slot k goes to register z(18+k)). Only the order in which the loads are issued differs from the 0213 / 0312 variants. |  | ||||||
| #define LOAD_CHIMU_INTERLEAVED_A64FXd(base)  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "ptrue p5.d \n\t" \ |  | ||||||
|     "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ |  | ||||||
|     "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ |  | ||||||
|     "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ |  | ||||||
|     "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ |  | ||||||
|     "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ |  | ||||||
|     "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ |  | ||||||
|     "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ |  | ||||||
|     "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ |  | ||||||
|     "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ |  | ||||||
|     "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ |  | ||||||
|     "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ |  | ||||||
|     "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [fetchptr] "r" (base + 2 * 3 * 64) \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // LOAD_CHIMU_0213: binds ref to in[offset] and loads twelve vectors from slots -6..5 (mul vl) relative to &ref[2][0] into z12..z23 (slot k goes to z(18+k)), after setting p5 all-true. Loads alternate between the z12..z17 and z18..z23 halves. Requires in, offset and SiteSpinor at the expansion site. |  | ||||||
| #define LOAD_CHIMU_0213_A64FXd  \ |  | ||||||
| { \ |  | ||||||
|     const SiteSpinor & ref(in[offset]); \ |  | ||||||
| asm ( \ |  | ||||||
|     "ptrue p5.d \n\t" \ |  | ||||||
|     "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ |  | ||||||
|     "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ |  | ||||||
|     "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ |  | ||||||
|     "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ |  | ||||||
|     "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ |  | ||||||
|     "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ |  | ||||||
|     "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ |  | ||||||
|     "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ |  | ||||||
|     "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ |  | ||||||
|     "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ |  | ||||||
|     "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ |  | ||||||
|     "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [fetchptr] "r" (&ref[2][0]) \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // LOAD_CHIMU_0312: binds ref to in[offset] and loads twelve vectors from slots -6..5 (mul vl) relative to &ref[2][0] into z12..z23 (slot k goes to z(18+k)), after setting p5 all-true. Issue order pairs z12..z14 with z21..z23, then z15..z17 with z18..z20. Requires in, offset and SiteSpinor at the expansion site. |  | ||||||
| #define LOAD_CHIMU_0312_A64FXd  \ |  | ||||||
| { \ |  | ||||||
|     const SiteSpinor & ref(in[offset]); \ |  | ||||||
| asm ( \ |  | ||||||
|     "ptrue p5.d \n\t" \ |  | ||||||
|     "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ |  | ||||||
|     "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ |  | ||||||
|     "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ |  | ||||||
|     "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ |  | ||||||
|     "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ |  | ||||||
|     "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ |  | ||||||
|     "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ |  | ||||||
|     "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ |  | ||||||
|     "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ |  | ||||||
|     "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ |  | ||||||
|     "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ |  | ||||||
|     "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [fetchptr] "r" (&ref[2][0]) \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // LOAD_TABLE0: loads the vector at slot 0 (mul vl) from &lut[0] into z30, the index register read by PERMUTE_A64FXd. Requires lut in scope. NOTE(review): slot n coincides with row lut[n] only when the vector length equals the 64-byte row size - confirm. |  | ||||||
| #define LOAD_TABLE0  \ |  | ||||||
| asm ( \ |  | ||||||
|     "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [tableptr] "r" (&lut[0]),[index] "i" (0) \ |  | ||||||
|     : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // LOAD_TABLE1: loads the vector at slot 1 (mul vl) from &lut[0] into z30, the index register read by PERMUTE_A64FXd. Requires lut in scope. |  | ||||||
| #define LOAD_TABLE1  \ |  | ||||||
| asm ( \ |  | ||||||
|     "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [tableptr] "r" (&lut[0]),[index] "i" (1) \ |  | ||||||
|     : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // LOAD_TABLE2: loads the vector at slot 2 (mul vl) from &lut[0] into z30, the index register read by PERMUTE_A64FXd. Requires lut in scope. |  | ||||||
| #define LOAD_TABLE2  \ |  | ||||||
| asm ( \ |  | ||||||
|     "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [tableptr] "r" (&lut[0]),[index] "i" (2) \ |  | ||||||
|     : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // LOAD_TABLE3: loads the vector at slot 3 (mul vl) from &lut[0] into z30. Requires lut in scope. The LOAD_TABLE(Dir) dispatcher has no branch that expands this macro. |  | ||||||
| #define LOAD_TABLE3  \ |  | ||||||
| asm ( \ |  | ||||||
|     "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [tableptr] "r" (&lut[0]),[index] "i" (3) \ |  | ||||||
|     : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // PERMUTE: rearranges the 64-bit elements of each of z12..z17 in place with tbl, using the indices held in z30 (loaded by one of the LOAD_TABLEn macros). The expansion already ends in ';'. |  | ||||||
| #define PERMUTE_A64FXd  \ |  | ||||||
| asm ( \ |  | ||||||
|     "tbl z12.d, { z12.d }, z30.d \n\t"  \ |  | ||||||
|     "tbl z13.d, { z13.d }, z30.d \n\t"  \ |  | ||||||
|     "tbl z14.d, { z14.d }, z30.d \n\t"  \ |  | ||||||
|     "tbl z15.d, { z15.d }, z30.d \n\t"  \ |  | ||||||
|     "tbl z16.d, { z16.d }, z30.d \n\t"  \ |  | ||||||
|     "tbl z17.d, { z17.d }, z30.d \n\t"  \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // LOAD_GAUGE: sets p5 all-true, then loads six vectors from the address of U[sU](A), biased by 2*3*64, into z24..z29 (slots -6, -3, 0, -5, -2, 1, mul vl). |  | ||||||
| // The ref / baseU declarations now live inside the macro's own block (as in MULT_2SPIN_1_A64FXd) so they no longer leak into, or clash within, the expanding scope. |  | ||||||
| // NOTE(review): A is not a parameter of this macro; it must be supplied by the expansion site, along with U and sU - confirm whether this macro is still used. |  | ||||||
| #define LOAD_GAUGE  \ |  | ||||||
| { \ |  | ||||||
|     const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ |  | ||||||
| asm ( \ |  | ||||||
|     "ptrue p5.d \n\t" \ |  | ||||||
|     "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ |  | ||||||
|     "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ |  | ||||||
|     "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ |  | ||||||
|     "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ |  | ||||||
|     "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ |  | ||||||
|     "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [fetchptr] "r" (baseU + 2 * 3 * 64) \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // MULT_2SPIN (first half): loads z24..z29 from the address of U[sU](A) biased by 2*3*64 (slots -6, -3, 0, -5, -2, 1); seeds z18..z23 from z31 via movprfx and accumulates fcmla rotations 0 and 90 of z24/z25/z26 with z12 (into z18/z19/z20) and with z15 (into z21/z22/z23); finally reloads z24..z26 from slots -4, -1, 2 for MULT_2SPIN_2. Requires U and sU in scope and p5 already set. |  | ||||||
| #define MULT_2SPIN_1_A64FXd(A)  \ |  | ||||||
| { \ |  | ||||||
|     const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ |  | ||||||
| asm ( \ |  | ||||||
|     "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ |  | ||||||
|     "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ |  | ||||||
|     "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ |  | ||||||
|     "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ |  | ||||||
|     "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ |  | ||||||
|     "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ |  | ||||||
|     "movprfx z18.d, p5/m, z31.d \n\t" \ |  | ||||||
|     "fcmla z18.d, p5/m, z24.d, z12.d, 0 \n\t" \ |  | ||||||
|     "movprfx z21.d, p5/m, z31.d \n\t" \ |  | ||||||
|     "fcmla z21.d, p5/m, z24.d, z15.d, 0 \n\t" \ |  | ||||||
|     "movprfx z19.d, p5/m, z31.d \n\t" \ |  | ||||||
|     "fcmla z19.d, p5/m, z25.d, z12.d, 0 \n\t" \ |  | ||||||
|     "movprfx z22.d, p5/m, z31.d \n\t" \ |  | ||||||
|     "fcmla z22.d, p5/m, z25.d, z15.d, 0 \n\t" \ |  | ||||||
|     "movprfx z20.d, p5/m, z31.d \n\t" \ |  | ||||||
|     "fcmla z20.d, p5/m, z26.d, z12.d, 0 \n\t" \ |  | ||||||
|     "movprfx z23.d, p5/m, z31.d \n\t" \ |  | ||||||
|     "fcmla z23.d, p5/m, z26.d, z15.d, 0 \n\t" \ |  | ||||||
|     "fcmla z18.d, p5/m, z24.d, z12.d, 90 \n\t" \ |  | ||||||
|     "fcmla z21.d, p5/m, z24.d, z15.d, 90 \n\t" \ |  | ||||||
|     "fcmla z19.d, p5/m, z25.d, z12.d, 90 \n\t" \ |  | ||||||
|     "fcmla z22.d, p5/m, z25.d, z15.d, 90 \n\t" \ |  | ||||||
|     "fcmla z20.d, p5/m, z26.d, z12.d, 90 \n\t" \ |  | ||||||
|     "fcmla z23.d, p5/m, z26.d, z15.d, 90 \n\t" \ |  | ||||||
|     "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \ |  | ||||||
|     "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \ |  | ||||||
|     "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [fetchptr] "r" (baseU + 2 * 3 * 64) \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // MULT_2SPIN_BACKEND (second half): keeps accumulating into z18..z23 with fcmla rotations 0 and 90: first z27/z28/z29 with z13 (into z18/z19/z20) and z16 (into z21/z22/z23), then z24/z25/z26 with z14 and z17. Uses only register state left behind by MULT_2SPIN_1_A64FXd; no memory operands. |  | ||||||
| #define MULT_2SPIN_2_A64FXd  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "fcmla z18.d, p5/m, z27.d, z13.d, 0 \n\t" \ |  | ||||||
|     "fcmla z21.d, p5/m, z27.d, z16.d, 0 \n\t" \ |  | ||||||
|     "fcmla z19.d, p5/m, z28.d, z13.d, 0 \n\t" \ |  | ||||||
|     "fcmla z22.d, p5/m, z28.d, z16.d, 0 \n\t" \ |  | ||||||
|     "fcmla z20.d, p5/m, z29.d, z13.d, 0 \n\t" \ |  | ||||||
|     "fcmla z23.d, p5/m, z29.d, z16.d, 0 \n\t" \ |  | ||||||
|     "fcmla z18.d, p5/m, z27.d, z13.d, 90 \n\t" \ |  | ||||||
|     "fcmla z21.d, p5/m, z27.d, z16.d, 90 \n\t" \ |  | ||||||
|     "fcmla z19.d, p5/m, z28.d, z13.d, 90 \n\t" \ |  | ||||||
|     "fcmla z22.d, p5/m, z28.d, z16.d, 90 \n\t" \ |  | ||||||
|     "fcmla z20.d, p5/m, z29.d, z13.d, 90 \n\t" \ |  | ||||||
|     "fcmla z23.d, p5/m, z29.d, z16.d, 90 \n\t" \ |  | ||||||
|     "fcmla z18.d, p5/m, z24.d, z14.d, 0 \n\t" \ |  | ||||||
|     "fcmla z21.d, p5/m, z24.d, z17.d, 0 \n\t" \ |  | ||||||
|     "fcmla z19.d, p5/m, z25.d, z14.d, 0 \n\t" \ |  | ||||||
|     "fcmla z22.d, p5/m, z25.d, z17.d, 0 \n\t" \ |  | ||||||
|     "fcmla z20.d, p5/m, z26.d, z14.d, 0 \n\t" \ |  | ||||||
|     "fcmla z23.d, p5/m, z26.d, z17.d, 0 \n\t" \ |  | ||||||
|     "fcmla z18.d, p5/m, z24.d, z14.d, 90 \n\t" \ |  | ||||||
|     "fcmla z21.d, p5/m, z24.d, z17.d, 90 \n\t" \ |  | ||||||
|     "fcmla z19.d, p5/m, z25.d, z14.d, 90 \n\t" \ |  | ||||||
|     "fcmla z22.d, p5/m, z25.d, z17.d, 90 \n\t" \ |  | ||||||
|     "fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \ |  | ||||||
|     "fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // XP_PROJ: in-place fcadd with rotation 90 under p5: z12..z14 receive z21..z23 and z15..z17 receive z18..z20 (each added after a 90-degree complex rotation). Reads the registers filled by a LOAD_CHIMU macro; does not set p5 itself. |  | ||||||
| #define XP_PROJ_A64FXd  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "fcadd z12.d, p5/m, z12.d, z21.d, 90 \n\t" \ |  | ||||||
|     "fcadd z13.d, p5/m, z13.d, z22.d, 90 \n\t" \ |  | ||||||
|     "fcadd z14.d, p5/m, z14.d, z23.d, 90 \n\t" \ |  | ||||||
|     "fcadd z15.d, p5/m, z15.d, z18.d, 90 \n\t" \ |  | ||||||
|     "fcadd z16.d, p5/m, z16.d, z19.d, 90 \n\t" \ |  | ||||||
|     "fcadd z17.d, p5/m, z17.d, z20.d, 90 \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // XP_RECON: initialises the result registers (no accumulation): z6..z8 = z31 fcadd-270 z21..z23 and z9..z11 = z31 fcadd-270 z18..z20 (z31 is zeroed in DECLARATIONS_A64FXd), then copies z18..z23 into z0..z5 under p5. The expansion already ends in ';'. |  | ||||||
| #define XP_RECON_A64FXd  \ |  | ||||||
| asm ( \ |  | ||||||
|     "movprfx z6.d, p5/m, z31.d \n\t" \ |  | ||||||
|     "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \ |  | ||||||
|     "movprfx z7.d, p5/m, z31.d \n\t" \ |  | ||||||
|     "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \ |  | ||||||
|     "movprfx z8.d, p5/m, z31.d \n\t" \ |  | ||||||
|     "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \ |  | ||||||
|     "movprfx z9.d, p5/m, z31.d \n\t" \ |  | ||||||
|     "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \ |  | ||||||
|     "movprfx z10.d, p5/m, z31.d \n\t" \ |  | ||||||
|     "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \ |  | ||||||
|     "movprfx z11.d, p5/m, z31.d \n\t" \ |  | ||||||
|     "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \ |  | ||||||
|     "mov z0.d, p5/m, z18.d \n\t" \ |  | ||||||
|     "mov z1.d, p5/m, z19.d \n\t" \ |  | ||||||
|     "mov z2.d, p5/m, z20.d \n\t" \ |  | ||||||
|     "mov z3.d, p5/m, z21.d \n\t" \ |  | ||||||
|     "mov z4.d, p5/m, z22.d \n\t" \ |  | ||||||
|     "mov z5.d, p5/m, z23.d \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // XP_RECON_ACCUM: accumulates into the result registers under p5: z9..z11 fcadd-270 z18..z20, z6..z8 fcadd-270 z21..z23, and z0..z5 += z18..z23 (fcadd and fadd are interleaved). The expansion already ends in ';'. |  | ||||||
| #define XP_RECON_ACCUM_A64FXd  \ |  | ||||||
| asm ( \ |  | ||||||
|     "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \ |  | ||||||
|     "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \ |  | ||||||
|     "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \ |  | ||||||
|     "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \ |  | ||||||
|     "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \ |  | ||||||
|     "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \ |  | ||||||
|     "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \ |  | ||||||
|     "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \ |  | ||||||
|     "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \ |  | ||||||
|     "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \ |  | ||||||
|     "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \ |  | ||||||
|     "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // YP_PROJ: in-place under p5: z12..z14 -= z21..z23 and z15..z17 += z18..z20. Reads the registers filled by a LOAD_CHIMU macro; does not set p5 itself. |  | ||||||
| #define YP_PROJ_A64FXd  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "fsub z12.d, p5/m, z12.d, z21.d \n\t" \ |  | ||||||
|     "fsub z13.d, p5/m, z13.d, z22.d \n\t" \ |  | ||||||
|     "fsub z14.d, p5/m, z14.d, z23.d \n\t" \ |  | ||||||
|     "fadd z15.d, p5/m, z15.d, z18.d \n\t"  \ |  | ||||||
|     "fadd z16.d, p5/m, z16.d, z19.d \n\t"  \ |  | ||||||
|     "fadd z17.d, p5/m, z17.d, z20.d \n\t"  \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // ZP_PROJ: in-place under p5: z12..z14 fcadd-90 z18..z20 and z15..z17 fcadd-270 z21..z23. Reads the registers filled by a LOAD_CHIMU macro; does not set p5 itself. |  | ||||||
| #define ZP_PROJ_A64FXd  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "fcadd z12.d, p5/m, z12.d, z18.d, 90 \n\t" \ |  | ||||||
|     "fcadd z13.d, p5/m, z13.d, z19.d, 90 \n\t" \ |  | ||||||
|     "fcadd z14.d, p5/m, z14.d, z20.d, 90 \n\t" \ |  | ||||||
|     "fcadd z15.d, p5/m, z15.d, z21.d, 270 \n\t" \ |  | ||||||
|     "fcadd z16.d, p5/m, z16.d, z22.d, 270 \n\t" \ |  | ||||||
|     "fcadd z17.d, p5/m, z17.d, z23.d, 270 \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // TP_PROJ: in-place under p5: z12..z17 += z18..z23 (element-wise fadd, register k paired with register k+6). Does not set p5 itself. |  | ||||||
| #define TP_PROJ_A64FXd  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "fadd z12.d, p5/m, z12.d, z18.d \n\t"  \ |  | ||||||
|     "fadd z13.d, p5/m, z13.d, z19.d \n\t"  \ |  | ||||||
|     "fadd z14.d, p5/m, z14.d, z20.d \n\t"  \ |  | ||||||
|     "fadd z15.d, p5/m, z15.d, z21.d \n\t"  \ |  | ||||||
|     "fadd z16.d, p5/m, z16.d, z22.d \n\t"  \ |  | ||||||
|     "fadd z17.d, p5/m, z17.d, z23.d \n\t"  \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // XM_PROJ: same register pairing as XP_PROJ but with rotation 270: z12..z14 fcadd-270 z21..z23 and z15..z17 fcadd-270 z18..z20, in place under p5. Does not set p5 itself. |  | ||||||
| #define XM_PROJ_A64FXd  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "fcadd z12.d, p5/m, z12.d, z21.d, 270 \n\t" \ |  | ||||||
|     "fcadd z13.d, p5/m, z13.d, z22.d, 270 \n\t" \ |  | ||||||
|     "fcadd z14.d, p5/m, z14.d, z23.d, 270 \n\t" \ |  | ||||||
|     "fcadd z15.d, p5/m, z15.d, z18.d, 270 \n\t" \ |  | ||||||
|     "fcadd z16.d, p5/m, z16.d, z19.d, 270 \n\t" \ |  | ||||||
|     "fcadd z17.d, p5/m, z17.d, z20.d, 270 \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // XM_RECON: initialises the result registers (no accumulation), mirroring XP_RECON with rotation 90: z6..z8 = z31 fcadd-90 z21..z23 and z9..z11 = z31 fcadd-90 z18..z20 (z31 is zeroed in DECLARATIONS_A64FXd), then copies z18..z23 into z0..z5 under p5. The expansion already ends in ';'. |  | ||||||
| #define XM_RECON_A64FXd  \ |  | ||||||
| asm ( \ |  | ||||||
|     "movprfx z6.d, p5/m, z31.d \n\t" \ |  | ||||||
|     "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \ |  | ||||||
|     "movprfx z7.d, p5/m, z31.d \n\t" \ |  | ||||||
|     "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \ |  | ||||||
|     "movprfx z8.d, p5/m, z31.d \n\t" \ |  | ||||||
|     "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \ |  | ||||||
|     "movprfx z9.d, p5/m, z31.d \n\t" \ |  | ||||||
|     "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \ |  | ||||||
|     "movprfx z10.d, p5/m, z31.d \n\t" \ |  | ||||||
|     "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \ |  | ||||||
|     "movprfx z11.d, p5/m, z31.d \n\t" \ |  | ||||||
|     "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \ |  | ||||||
|     "mov z0.d, p5/m, z18.d \n\t" \ |  | ||||||
|     "mov z1.d, p5/m, z19.d \n\t" \ |  | ||||||
|     "mov z2.d, p5/m, z20.d \n\t" \ |  | ||||||
|     "mov z3.d, p5/m, z21.d \n\t" \ |  | ||||||
|     "mov z4.d, p5/m, z22.d \n\t" \ |  | ||||||
|     "mov z5.d, p5/m, z23.d \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // YM_PROJ: in-place under p5, with the signs of YP_PROJ swapped: z12..z14 += z21..z23 and z15..z17 -= z18..z20. Does not set p5 itself. |  | ||||||
| #define YM_PROJ_A64FXd  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "fadd z12.d, p5/m, z12.d, z21.d \n\t"  \ |  | ||||||
|     "fadd z13.d, p5/m, z13.d, z22.d \n\t"  \ |  | ||||||
|     "fadd z14.d, p5/m, z14.d, z23.d \n\t"  \ |  | ||||||
|     "fsub z15.d, p5/m, z15.d, z18.d \n\t" \ |  | ||||||
|     "fsub z16.d, p5/m, z16.d, z19.d \n\t" \ |  | ||||||
|     "fsub z17.d, p5/m, z17.d, z20.d \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // ZM_PROJ: in-place under p5, with the rotations of ZP_PROJ swapped: z12..z14 fcadd-270 z18..z20 and z15..z17 fcadd-90 z21..z23. Does not set p5 itself. |  | ||||||
| #define ZM_PROJ_A64FXd  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "fcadd z12.d, p5/m, z12.d, z18.d, 270 \n\t" \ |  | ||||||
|     "fcadd z13.d, p5/m, z13.d, z19.d, 270 \n\t" \ |  | ||||||
|     "fcadd z14.d, p5/m, z14.d, z20.d, 270 \n\t" \ |  | ||||||
|     "fcadd z15.d, p5/m, z15.d, z21.d, 90 \n\t" \ |  | ||||||
|     "fcadd z16.d, p5/m, z16.d, z22.d, 90 \n\t" \ |  | ||||||
|     "fcadd z17.d, p5/m, z17.d, z23.d, 90 \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // TM_PROJ: sets p5 all-true, then in place: z12..z17 -= z18..z23 (element-wise fsub, register k paired with register k+6). NOTE(review): this is the only *_PROJ macro in this section that re-issues ptrue p5.d - confirm whether that is intentional. |  | ||||||
| #define TM_PROJ_A64FXd  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "ptrue p5.d \n\t" \ |  | ||||||
|     "fsub z12.d, p5/m, z12.d, z18.d \n\t" \ |  | ||||||
|     "fsub z13.d, p5/m, z13.d, z19.d \n\t" \ |  | ||||||
|     "fsub z14.d, p5/m, z14.d, z20.d \n\t" \ |  | ||||||
|     "fsub z15.d, p5/m, z15.d, z21.d \n\t" \ |  | ||||||
|     "fsub z16.d, p5/m, z16.d, z22.d \n\t" \ |  | ||||||
|     "fsub z17.d, p5/m, z17.d, z23.d \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // XM_RECON_ACCUM: accumulates into the result registers under p5: z9..z11 fcadd-90 z18..z20, z6..z8 fcadd-90 z21..z23, then z0..z5 += z18..z23. The expansion already ends in ';'. |  | ||||||
| #define XM_RECON_ACCUM_A64FXd  \ |  | ||||||
| asm ( \ |  | ||||||
|     "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \ |  | ||||||
|     "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \ |  | ||||||
|     "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \ |  | ||||||
|     "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \ |  | ||||||
|     "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \ |  | ||||||
|     "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \ |  | ||||||
|     "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \ |  | ||||||
|     "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \ |  | ||||||
|     "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \ |  | ||||||
|     "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \ |  | ||||||
|     "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \ |  | ||||||
|     "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // YP_RECON_ACCUM: accumulates into the result registers under p5: z0..z5 += z18..z23, z9..z11 -= z18..z20, and z6..z8 += z21..z23. The expansion already ends in ';'. |  | ||||||
| #define YP_RECON_ACCUM_A64FXd  \ |  | ||||||
| asm ( \ |  | ||||||
|     "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \ |  | ||||||
|     "fsub z9.d, p5/m, z9.d, z18.d \n\t" \ |  | ||||||
|     "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \ |  | ||||||
|     "fsub z10.d, p5/m, z10.d, z19.d \n\t" \ |  | ||||||
|     "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \ |  | ||||||
|     "fsub z11.d, p5/m, z11.d, z20.d \n\t" \ |  | ||||||
|     "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \ |  | ||||||
|     "fadd z6.d, p5/m, z6.d, z21.d \n\t"  \ |  | ||||||
|     "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \ |  | ||||||
|     "fadd z7.d, p5/m, z7.d, z22.d \n\t"  \ |  | ||||||
|     "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \ |  | ||||||
|     "fadd z8.d, p5/m, z8.d, z23.d \n\t"  \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // YM_RECON_ACCUM: accumulates into the result registers under p5, with the signs of YP_RECON_ACCUM swapped: z0..z5 += z18..z23, z9..z11 += z18..z20, and z6..z8 -= z21..z23. The expansion already ends in ';'. |  | ||||||
| #define YM_RECON_ACCUM_A64FXd  \ |  | ||||||
| asm ( \ |  | ||||||
|     "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \ |  | ||||||
|     "fadd z9.d, p5/m, z9.d, z18.d \n\t"  \ |  | ||||||
|     "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \ |  | ||||||
|     "fadd z10.d, p5/m, z10.d, z19.d \n\t"  \ |  | ||||||
|     "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \ |  | ||||||
|     "fadd z11.d, p5/m, z11.d, z20.d \n\t"  \ |  | ||||||
|     "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \ |  | ||||||
|     "fsub z6.d, p5/m, z6.d, z21.d \n\t" \ |  | ||||||
|     "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \ |  | ||||||
|     "fsub z7.d, p5/m, z7.d, z22.d \n\t" \ |  | ||||||
|     "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \ |  | ||||||
|     "fsub z8.d, p5/m, z8.d, z23.d \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // ZP_RECON_ACCUM: accumulates into the result registers under p5: z6..z8 fcadd-270 z18..z20, z9..z11 fcadd-90 z21..z23, and z0..z5 += z18..z23 (fcadd and fadd are interleaved). The expansion already ends in ';'. |  | ||||||
| #define ZP_RECON_ACCUM_A64FXd  \ |  | ||||||
| asm ( \ |  | ||||||
|     "fcadd z6.d, p5/m, z6.d, z18.d, 270 \n\t" \ |  | ||||||
|     "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \ |  | ||||||
|     "fcadd z7.d, p5/m, z7.d, z19.d, 270 \n\t" \ |  | ||||||
|     "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \ |  | ||||||
|     "fcadd z8.d, p5/m, z8.d, z20.d, 270 \n\t" \ |  | ||||||
|     "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \ |  | ||||||
|     "fcadd z9.d, p5/m, z9.d, z21.d, 90 \n\t" \ |  | ||||||
|     "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \ |  | ||||||
|     "fcadd z10.d, p5/m, z10.d, z22.d, 90 \n\t" \ |  | ||||||
|     "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \ |  | ||||||
|     "fcadd z11.d, p5/m, z11.d, z23.d, 90 \n\t" \ |  | ||||||
|     "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // ZM_RECON_ACCUM (double precision, .d lanes): same structure as ZP_RECON_ACCUM_A64FXd with the fcadd rotations swapped - z18-z20 are complex-added into z6-z8 with rotation 90 and z21-z23 into z9-z11 with rotation 270, while z18-z23 are plain-added into z0-z5. All operations are predicated on p5, which is not set here and must come from an earlier ptrue p5.d. |  | ||||||
| #define ZM_RECON_ACCUM_A64FXd  \ |  | ||||||
| asm ( \ |  | ||||||
|     "fcadd z6.d, p5/m, z6.d, z18.d, 90 \n\t" \ |  | ||||||
|     "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \ |  | ||||||
|     "fcadd z7.d, p5/m, z7.d, z19.d, 90 \n\t" \ |  | ||||||
|     "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \ |  | ||||||
|     "fcadd z8.d, p5/m, z8.d, z20.d, 90 \n\t" \ |  | ||||||
|     "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \ |  | ||||||
|     "fcadd z9.d, p5/m, z9.d, z21.d, 270 \n\t" \ |  | ||||||
|     "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \ |  | ||||||
|     "fcadd z10.d, p5/m, z10.d, z22.d, 270 \n\t" \ |  | ||||||
|     "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \ |  | ||||||
|     "fcadd z11.d, p5/m, z11.d, z23.d, 270 \n\t" \ |  | ||||||
|     "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // TP_RECON_ACCUM (double precision, .d lanes): under predicate p5, adds z18-z20 into both z0-z2 and z6-z8, and adds z21-z23 into both z3-z5 and z9-z11. No complex rotation is involved. p5 is not set here and must come from an earlier ptrue p5.d. |  | ||||||
| #define TP_RECON_ACCUM_A64FXd  \ |  | ||||||
| asm ( \ |  | ||||||
|     "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \ |  | ||||||
|     "fadd z6.d, p5/m, z6.d, z18.d \n\t"  \ |  | ||||||
|     "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \ |  | ||||||
|     "fadd z7.d, p5/m, z7.d, z19.d \n\t"  \ |  | ||||||
|     "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \ |  | ||||||
|     "fadd z8.d, p5/m, z8.d, z20.d \n\t"  \ |  | ||||||
|     "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \ |  | ||||||
|     "fadd z9.d, p5/m, z9.d, z21.d \n\t"  \ |  | ||||||
|     "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \ |  | ||||||
|     "fadd z10.d, p5/m, z10.d, z22.d \n\t"  \ |  | ||||||
|     "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \ |  | ||||||
|     "fadd z11.d, p5/m, z11.d, z23.d \n\t"  \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // TM_RECON_ACCUM (double precision, .d lanes): under predicate p5, adds z18-z20 into z0-z2 and subtracts them from z6-z8, adds z21-z23 into z3-z5 and subtracts them from z9-z11 (the sign-flipped counterpart of TP_RECON_ACCUM_A64FXd). p5 is not set here and must come from an earlier ptrue p5.d. |  | ||||||
| #define TM_RECON_ACCUM_A64FXd  \ |  | ||||||
| asm ( \ |  | ||||||
|     "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \ |  | ||||||
|     "fsub z6.d, p5/m, z6.d, z18.d \n\t" \ |  | ||||||
|     "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \ |  | ||||||
|     "fsub z7.d, p5/m, z7.d, z19.d \n\t" \ |  | ||||||
|     "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \ |  | ||||||
|     "fsub z8.d, p5/m, z8.d, z20.d \n\t" \ |  | ||||||
|     "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \ |  | ||||||
|     "fsub z9.d, p5/m, z9.d, z21.d \n\t" \ |  | ||||||
|     "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \ |  | ||||||
|     "fsub z10.d, p5/m, z10.d, z22.d \n\t" \ |  | ||||||
|     "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \ |  | ||||||
|     "fsub z11.d, p5/m, z11.d, z23.d \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // ZERO_PSI (double precision): sets p5 to all-true for .d lanes, then writes floating-point zero to every lane of z0-z11. The *_RECON_ACCUM_A64FXd macros add into z0-z11, so this presumably clears the result accumulator before the first accumulation - confirm against the kernel body. |  | ||||||
| #define ZERO_PSI_A64FXd  \ |  | ||||||
| asm ( \ |  | ||||||
|     "ptrue p5.d \n\t" \ |  | ||||||
|     "fmov z0.d , 0 \n\t" \ |  | ||||||
|     "fmov z1.d , 0 \n\t" \ |  | ||||||
|     "fmov z2.d , 0 \n\t" \ |  | ||||||
|     "fmov z3.d , 0 \n\t" \ |  | ||||||
|     "fmov z4.d , 0 \n\t" \ |  | ||||||
|     "fmov z5.d , 0 \n\t" \ |  | ||||||
|     "fmov z6.d , 0 \n\t" \ |  | ||||||
|     "fmov z7.d , 0 \n\t" \ |  | ||||||
|     "fmov z8.d , 0 \n\t" \ |  | ||||||
|     "fmov z9.d , 0 \n\t" \ |  | ||||||
|     "fmov z10.d , 0 \n\t" \ |  | ||||||
|     "fmov z11.d , 0 \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // PREFETCH_RESULT_L2_STORE (prefetch store to L2): issues three prfd PSTL2STRM prefetches at base + 0, 4 and 8 vector lengths (0, 256 and 512 bytes if vectors are 64 bytes - TODO confirm the vector length assumed by the build). base must be usable as a general-register operand. The prefetches are governed by p5, which is not set here and must have been initialised by an earlier asm statement. |  | ||||||
| #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base)  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ |  | ||||||
|     "prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ |  | ||||||
|     "prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [fetchptr] "r" (base) \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // PREFETCH_RESULT_L1_STORE (prefetch store to L1): identical to the L2 variant except that the prefetch operation is PSTL1STRM - three prfd prefetches at base + 0, 4 and 8 vector lengths, governed by p5 (not set here; must have been initialised by an earlier asm statement). |  | ||||||
| #define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base)  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "prfd PSTL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ |  | ||||||
|     "prfd PSTL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ |  | ||||||
|     "prfd PSTL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [fetchptr] "r" (base) \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // ADD_RESULT_INTERNAL (double precision, .d lanes): element-wise z(k) += z(k+12) for k = 0..11, i.e. adds z12-z23 into z0-z11 under predicate p5. Presumably z12-z23 were just filled by a LOAD_CHIMU-style load, as in the single-precision ADD_RESULT alias - confirm against the double-precision alias list. p5 is not set here. |  | ||||||
| #define ADD_RESULT_INTERNAL_A64FXd  \ |  | ||||||
| asm ( \ |  | ||||||
|     "fadd z0.d, p5/m, z0.d, z12.d \n\t"  \ |  | ||||||
|     "fadd z1.d, p5/m, z1.d, z13.d \n\t"  \ |  | ||||||
|     "fadd z2.d, p5/m, z2.d, z14.d \n\t"  \ |  | ||||||
|     "fadd z3.d, p5/m, z3.d, z15.d \n\t"  \ |  | ||||||
|     "fadd z4.d, p5/m, z4.d, z16.d \n\t"  \ |  | ||||||
|     "fadd z5.d, p5/m, z5.d, z17.d \n\t"  \ |  | ||||||
|     "fadd z6.d, p5/m, z6.d, z18.d \n\t"  \ |  | ||||||
|     "fadd z7.d, p5/m, z7.d, z19.d \n\t"  \ |  | ||||||
|     "fadd z8.d, p5/m, z8.d, z20.d \n\t"  \ |  | ||||||
|     "fadd z9.d, p5/m, z9.d, z21.d \n\t"  \ |  | ||||||
|     "fadd z10.d, p5/m, z10.d, z22.d \n\t"  \ |  | ||||||
|     "fadd z11.d, p5/m, z11.d, z23.d \n\t"  \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| @@ -1,779 +0,0 @@ | |||||||
| /************************************************************************************* |  | ||||||
|  |  | ||||||
|     Grid physics library, www.github.com/paboyle/Grid |  | ||||||
|  |  | ||||||
|     Source file: Fujitsu_A64FX_asm_single.h |  | ||||||
|  |  | ||||||
|     Copyright (C) 2020 |  | ||||||
|  |  | ||||||
| Author: Nils Meyer <nils.meyer@ur.de> |  | ||||||
|  |  | ||||||
|     This program is free software; you can redistribute it and/or modify |  | ||||||
|     it under the terms of the GNU General Public License as published by |  | ||||||
|     the Free Software Foundation; either version 2 of the License, or |  | ||||||
|     (at your option) any later version. |  | ||||||
|  |  | ||||||
|     This program is distributed in the hope that it will be useful, |  | ||||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
|     GNU General Public License for more details. |  | ||||||
|  |  | ||||||
|     You should have received a copy of the GNU General Public License along |  | ||||||
|     with this program; if not, write to the Free Software Foundation, Inc., |  | ||||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |  | ||||||
|  |  | ||||||
|     See the full license in the file "LICENSE" in the top level distribution directory |  | ||||||
| *************************************************************************************/ |  | ||||||
| /*  END LEGAL */ |  | ||||||
| #define LOAD_CHIMU(base)               LOAD_CHIMU_INTERLEAVED_A64FXf(base)   /* Alias list: maps the generic kernel macro names onto the single-precision (_A64FXf) implementations defined below. */ |  | ||||||
| #define PREFETCH_CHIMU_L1(A)           PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A)   |  | ||||||
| #define PREFETCH_GAUGE_L1(A)           PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)   |  | ||||||
| #define PREFETCH_CHIMU_L2(A)           PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A)   |  | ||||||
| #define PREFETCH_GAUGE_L2(A)           PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A)   |  | ||||||
| #define PF_GAUGE(A)   /* expands to nothing */ |  | ||||||
| #define PREFETCH_RESULT_L2_STORE(A)    PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(A)   |  | ||||||
| #define PREFETCH_RESULT_L1_STORE(A)    PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(A)   |  | ||||||
| #define PREFETCH1_CHIMU(A)             PREFETCH_CHIMU_L1(A)   |  | ||||||
| #define PREFETCH_CHIMU(A)              PREFETCH_CHIMU_L1(A)   |  | ||||||
| #define LOCK_GAUGE(A)   /* expands to nothing */ |  | ||||||
| #define UNLOCK_GAUGE(A)   /* expands to nothing */ |  | ||||||
| #define MASK_REGS                      DECLARATIONS_A64FXf   |  | ||||||
| #define SAVE_RESULT(A,B)               RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B)   /* NOTE(review): expands to two statements (store to A, then L2 store-prefetch of B); only the first is guarded if used in an unbraced if */ |  | ||||||
| #define MULT_2SPIN_1(Dir)              MULT_2SPIN_1_A64FXf(Dir)   |  | ||||||
| #define MULT_2SPIN_2                   MULT_2SPIN_2_A64FXf   |  | ||||||
| #define LOAD_CHI(base)                 LOAD_CHI_A64FXf(base)   |  | ||||||
| #define ADD_RESULT(base,basep)         LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)   /* load from base, add, store back to base; basep is unused by this expansion; multi-statement like SAVE_RESULT */ |  | ||||||
| #define XP_PROJ                        XP_PROJ_A64FXf   |  | ||||||
| #define YP_PROJ                        YP_PROJ_A64FXf   |  | ||||||
| #define ZP_PROJ                        ZP_PROJ_A64FXf   |  | ||||||
| #define TP_PROJ                        TP_PROJ_A64FXf   |  | ||||||
| #define XM_PROJ                        XM_PROJ_A64FXf   |  | ||||||
| #define YM_PROJ                        YM_PROJ_A64FXf   |  | ||||||
| #define ZM_PROJ                        ZM_PROJ_A64FXf   |  | ||||||
| #define TM_PROJ                        TM_PROJ_A64FXf   |  | ||||||
| #define XP_RECON                       XP_RECON_A64FXf   |  | ||||||
| #define XM_RECON                       XM_RECON_A64FXf   |  | ||||||
| #define XM_RECON_ACCUM                 XM_RECON_ACCUM_A64FXf   |  | ||||||
| #define YM_RECON_ACCUM                 YM_RECON_ACCUM_A64FXf   |  | ||||||
| #define ZM_RECON_ACCUM                 ZM_RECON_ACCUM_A64FXf   |  | ||||||
| #define TM_RECON_ACCUM                 TM_RECON_ACCUM_A64FXf   |  | ||||||
| #define XP_RECON_ACCUM                 XP_RECON_ACCUM_A64FXf   |  | ||||||
| #define YP_RECON_ACCUM                 YP_RECON_ACCUM_A64FXf   |  | ||||||
| #define ZP_RECON_ACCUM                 ZP_RECON_ACCUM_A64FXf   |  | ||||||
| #define TP_RECON_ACCUM                 TP_RECON_ACCUM_A64FXf   |  | ||||||
| #define PERMUTE_DIR0                   0   /* PERMUTE_DIR0..3 are plain integers 0..3, matching the Dir values tested by LOAD_TABLE */ |  | ||||||
| #define PERMUTE_DIR1                   1   |  | ||||||
| #define PERMUTE_DIR2                   2   |  | ||||||
| #define PERMUTE_DIR3                   3   |  | ||||||
| #define PERMUTE                        PERMUTE_A64FXf;   /* the expansion carries its own trailing semicolon */ |  | ||||||
| #define LOAD_TABLE(Dir)                if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1 } else if (Dir == 2) { LOAD_TABLE2; } else if (Dir == 3) { LOAD_TABLE3; }   /* selects the lut row to load into z30; LOAD_TABLEn already ends in a semicolon, so the Dir == 1 branch is complete without one. NOTE(review): expands to a bare if/else chain - brace it at call sites */ |  | ||||||
| #define MAYBEPERM(A,perm)              if (perm) { PERMUTE; }   /* permutes only when perm is non-zero; argument A is unused */ |  | ||||||
| // DECLARATIONS (single precision): declares the local table lut[4][16] of 32-bit lane indices and sets every .s lane of z31 to zero. Each lut row is a permutation of 0..15: row 0 swaps the two 8-lane halves, row 1 swaps adjacent 4-lane groups, row 2 swaps adjacent 2-lane groups, row 3 swaps adjacent lanes. The rows are loaded into z30 by LOAD_TABLE0..3, and z31 is the zero source of the movprfx instructions in MULT_2SPIN_1_A64FXf and XP_RECON_A64FXf. Because this expands to a declaration it must appear where a declaration is allowed, and lut must stay in scope for LOAD_TABLE0..3. |  | ||||||
| #define DECLARATIONS_A64FXf  \ |  | ||||||
|     const uint32_t lut[4][16] = { \ |  | ||||||
|         {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ |  | ||||||
|         {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ |  | ||||||
|         {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \ |  | ||||||
|         {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \ |  | ||||||
| asm ( \ |  | ||||||
|     "fmov z31.s , 0 \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // RESULT (single precision): stores z0-z11 to twelve consecutive vector-length slots. The pointer operand is base + 2*3*64 = base + 384 and the store offsets run from -6 to +5 vector lengths, so slot 0 coincides with base only when a vector is 64 bytes (the 64 is hard-coded). base is used in integer arithmetic, so it is presumably an integer address rather than a typed pointer - confirm at call sites. NOTE(review): base is not parenthesised in the expansion. |  | ||||||
| #define RESULT_A64FXf(base)  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "str z0, [%[storeptr], -6, mul vl] \n\t" \ |  | ||||||
|     "str z1, [%[storeptr], -5, mul vl] \n\t" \ |  | ||||||
|     "str z2, [%[storeptr], -4, mul vl] \n\t" \ |  | ||||||
|     "str z3, [%[storeptr], -3, mul vl] \n\t" \ |  | ||||||
|     "str z4, [%[storeptr], -2, mul vl] \n\t" \ |  | ||||||
|     "str z5, [%[storeptr], -1, mul vl] \n\t" \ |  | ||||||
|     "str z6, [%[storeptr], 0, mul vl] \n\t" \ |  | ||||||
|     "str z7, [%[storeptr], 1, mul vl] \n\t" \ |  | ||||||
|     "str z8, [%[storeptr], 2, mul vl] \n\t" \ |  | ||||||
|     "str z9, [%[storeptr], 3, mul vl] \n\t" \ |  | ||||||
|     "str z10, [%[storeptr], 4, mul vl] \n\t" \ |  | ||||||
|     "str z11, [%[storeptr], 5, mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [storeptr] "r" (base + 2 * 3 * 64) \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // PREFETCH_CHIMU_L2 (prefetch to L2): issues three prfd PLDL2STRM load-prefetches at base + 0, 4 and 8 vector lengths (0, 256 and 512 bytes if vectors are 64 bytes - TODO confirm). Governed by p5, which is not set here and must have been initialised by an earlier asm statement. |  | ||||||
| #define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base)  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ |  | ||||||
|     "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ |  | ||||||
|     "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [fetchptr] "r" (base) \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // PREFETCH_CHIMU_L1 (prefetch to L1): identical to the L2 variant except that the prefetch operation is PLDL1STRM - three prfd load-prefetches at base + 0, 4 and 8 vector lengths, governed by p5 (not set here). |  | ||||||
| #define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base)  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ |  | ||||||
|     "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ |  | ||||||
|     "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [fetchptr] "r" (base) \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // PREFETCH_GAUGE_L2 (prefetch to L2): takes the address of U[sUn](A), adds 3*3*64 bytes, and issues nine prfd PLDL2STRM load-prefetches at offsets -4, 0, 4, ..., 28 vector lengths from that biased address. U and sUn are not macro parameters and must be in scope at the expansion site; sUn is presumably the index of an upcoming site (the L1 variant uses sU) - confirm. Governed by p5, which is not set here. |  | ||||||
| #define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A)  \ |  | ||||||
| { \ |  | ||||||
|     const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ |  | ||||||
| asm ( \ |  | ||||||
|     "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ |  | ||||||
|     "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ |  | ||||||
|     "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ |  | ||||||
|     "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ |  | ||||||
|     "prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \ |  | ||||||
|     "prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \ |  | ||||||
|     "prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \ |  | ||||||
|     "prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \ |  | ||||||
|     "prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [fetchptr] "r" (baseU) \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // PREFETCH_GAUGE_L1 (prefetch to L1): takes the address of U[sU](A) without any bias and issues three prfd PLDL1STRM load-prefetches at offsets 0, 4 and 8 vector lengths. U and sU are not macro parameters and must be in scope at the expansion site. Governed by p5, which is not set here. |  | ||||||
| #define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)  \ |  | ||||||
| { \ |  | ||||||
|     const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ |  | ||||||
| asm ( \ |  | ||||||
|     "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ |  | ||||||
|     "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ |  | ||||||
|     "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [fetchptr] "r" (baseU) \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // LOAD_CHI (single precision): loads six consecutive vector-length slots starting at base (offsets 0..5) into z12-z17, the registers the *_PROJ macros write and MULT_2SPIN_1/2 read as the half spinor. Unpredicated ldr, so p5 is not required. |  | ||||||
| #define LOAD_CHI_A64FXf(base)  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "ldr z12, [%[fetchptr], 0, mul vl] \n\t" \ |  | ||||||
|     "ldr z13, [%[fetchptr], 1, mul vl] \n\t" \ |  | ||||||
|     "ldr z14, [%[fetchptr], 2, mul vl] \n\t" \ |  | ||||||
|     "ldr z15, [%[fetchptr], 3, mul vl] \n\t" \ |  | ||||||
|     "ldr z16, [%[fetchptr], 4, mul vl] \n\t" \ |  | ||||||
|     "ldr z17, [%[fetchptr], 5, mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [fetchptr] "r" (base) \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // LOAD_CHIMU (interleaved order, single precision): sets p5 to all-true for .s lanes, then loads twelve vector-length slots into z12-z23. The pointer operand is base + 2*3*64 and the offsets span -6..+5, so slot k (k = 0..11, counted from base when vectors are 64 bytes) lands in z(12+k); only the issue order of the loads is interleaved. NOTE(review): base is not parenthesised in the expansion. |  | ||||||
| #define LOAD_CHIMU_INTERLEAVED_A64FXf(base)  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "ptrue p5.s \n\t" \ |  | ||||||
|     "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ |  | ||||||
|     "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ |  | ||||||
|     "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ |  | ||||||
|     "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ |  | ||||||
|     "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ |  | ||||||
|     "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ |  | ||||||
|     "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ |  | ||||||
|     "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ |  | ||||||
|     "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ |  | ||||||
|     "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ |  | ||||||
|     "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ |  | ||||||
|     "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [fetchptr] "r" (base + 2 * 3 * 64) \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // LOAD_CHIMU_0213 (single precision): sets p5 to all-true for .s lanes and loads twelve vector-length slots around &ref[2][0], where ref is in[offset], into z12-z23 (offset -6 goes to z12, ..., offset +5 goes to z23). Loads alternate between the -6..-1 group and the 0..5 group (z12, z18, z13, z19, ...). in and offset are not macro parameters and must be in scope at the expansion site. Presumably &ref[2][0] is the start of the third spin component so that offset -6 is the start of the spinor - confirm against the SiteSpinor layout. |  | ||||||
| #define LOAD_CHIMU_0213_A64FXf  \ |  | ||||||
| { \ |  | ||||||
|     const SiteSpinor & ref(in[offset]); \ |  | ||||||
| asm ( \ |  | ||||||
|     "ptrue p5.s \n\t" \ |  | ||||||
|     "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ |  | ||||||
|     "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ |  | ||||||
|     "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ |  | ||||||
|     "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ |  | ||||||
|     "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ |  | ||||||
|     "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ |  | ||||||
|     "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ |  | ||||||
|     "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ |  | ||||||
|     "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ |  | ||||||
|     "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ |  | ||||||
|     "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ |  | ||||||
|     "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [fetchptr] "r" (&ref[2][0]) \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // LOAD_CHIMU_0312 (single precision): same source address and same register destinations as LOAD_CHIMU_0213_A64FXf (offset -6..+5 around &ref[2][0] go to z12..z23), but the loads are issued in a different order: z12-z14 are paired with z21-z23 first, then z15-z17 with z18-z20. Sets p5 to all-true for .s lanes. in and offset must be in scope at the expansion site. |  | ||||||
| #define LOAD_CHIMU_0312_A64FXf  \ |  | ||||||
| { \ |  | ||||||
|     const SiteSpinor & ref(in[offset]); \ |  | ||||||
| asm ( \ |  | ||||||
|     "ptrue p5.s \n\t" \ |  | ||||||
|     "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ |  | ||||||
|     "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ |  | ||||||
|     "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ |  | ||||||
|     "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ |  | ||||||
|     "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ |  | ||||||
|     "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ |  | ||||||
|     "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ |  | ||||||
|     "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ |  | ||||||
|     "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ |  | ||||||
|     "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ |  | ||||||
|     "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ |  | ||||||
|     "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [fetchptr] "r" (&ref[2][0]) \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // LOAD_TABLE0: loads one vector from &lut[0] + 0 vector lengths into z30, i.e. lut row 0 when a vector holds exactly one 16-entry uint32_t row (64 bytes). lut is the table declared by DECLARATIONS_A64FXf and must be in scope. The index is passed as an immediate operand. The expansion already ends in a semicolon. |  | ||||||
| #define LOAD_TABLE0  \ |  | ||||||
| asm ( \ |  | ||||||
|     "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [tableptr] "r" (&lut[0]),[index] "i" (0) \ |  | ||||||
|     : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // LOAD_TABLE1: loads one vector from &lut[0] + 1 vector length into z30, i.e. lut row 1 when a vector is 64 bytes. lut is the table declared by DECLARATIONS_A64FXf and must be in scope. The expansion already ends in a semicolon. |  | ||||||
| #define LOAD_TABLE1  \ |  | ||||||
| asm ( \ |  | ||||||
|     "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [tableptr] "r" (&lut[0]),[index] "i" (1) \ |  | ||||||
|     : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // LOAD_TABLE2: loads one vector from &lut[0] + 2 vector lengths into z30, i.e. lut row 2 when a vector is 64 bytes. lut is the table declared by DECLARATIONS_A64FXf and must be in scope. The expansion already ends in a semicolon. |  | ||||||
| #define LOAD_TABLE2  \ |  | ||||||
| asm ( \ |  | ||||||
|     "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [tableptr] "r" (&lut[0]),[index] "i" (2) \ |  | ||||||
|     : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // LOAD_TABLE3: loads one vector from &lut[0] + 3 vector lengths into z30, i.e. lut row 3 when a vector is 64 bytes. lut is the table declared by DECLARATIONS_A64FXf and must be in scope. The expansion already ends in a semicolon. |  | ||||||
| #define LOAD_TABLE3  \ |  | ||||||
| asm ( \ |  | ||||||
|     "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [tableptr] "r" (&lut[0]),[index] "i" (3) \ |  | ||||||
|     : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // PERMUTE (single precision): rearranges the 32-bit lanes of each of z12-z17 in place with tbl, using the lane indices held in z30. z30 is not loaded here; it must contain a lut row from a preceding LOAD_TABLE0..3. Only z12-z17 (the half spinor registers) are permuted. |  | ||||||
| #define PERMUTE_A64FXf  \ |  | ||||||
| asm ( \ |  | ||||||
|     "tbl z12.s, { z12.s }, z30.s \n\t"  \ |  | ||||||
|     "tbl z13.s, { z13.s }, z30.s \n\t"  \ |  | ||||||
|     "tbl z14.s, { z14.s }, z30.s \n\t"  \ |  | ||||||
|     "tbl z15.s, { z15.s }, z30.s \n\t"  \ |  | ||||||
|     "tbl z16.s, { z16.s }, z30.s \n\t"  \ |  | ||||||
|     "tbl z17.s, { z17.s }, z30.s \n\t"  \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // LOAD_GAUGE (single precision): sets p5 to all-true for .s lanes and loads six vectors of U[sU](A) into z24-z29 from offsets -6, -3, 0, -5, -2, 1 relative to the address biased by 2*3*64 bytes - the same six loads that open MULT_2SPIN_1_A64FXf. NOTE(review): the macro takes no parameter yet its body references A, so it only compiles where a variable or macro named A is visible; the ref/baseU declarations are also emitted outside the braces, so they leak into the enclosing scope and a second expansion in the same scope would redeclare them. Looks unused/stale - confirm before relying on it. |  | ||||||
| #define LOAD_GAUGE  \ |  | ||||||
|     const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "ptrue p5.s \n\t" \ |  | ||||||
|     "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ |  | ||||||
|     "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ |  | ||||||
|     "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ |  | ||||||
|     "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ |  | ||||||
|     "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ |  | ||||||
|     "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [fetchptr] "r" (baseU + 2 * 3 * 64) \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // MULT_2SPIN, first part (single precision): for direction A, loads vectors of U[sU](A) into z24-z26 (slots 0, 3, 6) and z27-z29 (slots 1, 4, 7), slots counted in 64-byte vectors from the start of the link. It then initialises z18-z23 from the zero register z31 via movprfx and accumulates z24/z25/z26 times z12 into z18/z19/z20 and times z15 into z21/z22/z23; each product is issued as an fcmla rotation-0 / rotation-90 pair, which together form one complex multiply-accumulate. Finally z24-z26 are reloaded from slots 2, 5, 8 for MULT_2SPIN_2_A64FXf, which also consumes z27-z29. U and sU must be in scope; p5 and z31 are not set here and must come from LOAD_CHIMU/LOAD_GAUGE-style ptrue p5.s and DECLARATIONS_A64FXf. Each movprfx must stay immediately before its fcmla. |  | ||||||
| #define MULT_2SPIN_1_A64FXf(A)  \ |  | ||||||
| { \ |  | ||||||
|     const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ |  | ||||||
| asm ( \ |  | ||||||
|     "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ |  | ||||||
|     "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ |  | ||||||
|     "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ |  | ||||||
|     "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ |  | ||||||
|     "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ |  | ||||||
|     "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ |  | ||||||
|     "movprfx z18.s, p5/m, z31.s \n\t" \ |  | ||||||
|     "fcmla z18.s, p5/m, z24.s, z12.s, 0 \n\t" \ |  | ||||||
|     "movprfx z21.s, p5/m, z31.s \n\t" \ |  | ||||||
|     "fcmla z21.s, p5/m, z24.s, z15.s, 0 \n\t" \ |  | ||||||
|     "movprfx z19.s, p5/m, z31.s \n\t" \ |  | ||||||
|     "fcmla z19.s, p5/m, z25.s, z12.s, 0 \n\t" \ |  | ||||||
|     "movprfx z22.s, p5/m, z31.s \n\t" \ |  | ||||||
|     "fcmla z22.s, p5/m, z25.s, z15.s, 0 \n\t" \ |  | ||||||
|     "movprfx z20.s, p5/m, z31.s \n\t" \ |  | ||||||
|     "fcmla z20.s, p5/m, z26.s, z12.s, 0 \n\t" \ |  | ||||||
|     "movprfx z23.s, p5/m, z31.s \n\t" \ |  | ||||||
|     "fcmla z23.s, p5/m, z26.s, z15.s, 0 \n\t" \ |  | ||||||
|     "fcmla z18.s, p5/m, z24.s, z12.s, 90 \n\t" \ |  | ||||||
|     "fcmla z21.s, p5/m, z24.s, z15.s, 90 \n\t" \ |  | ||||||
|     "fcmla z19.s, p5/m, z25.s, z12.s, 90 \n\t" \ |  | ||||||
|     "fcmla z22.s, p5/m, z25.s, z15.s, 90 \n\t" \ |  | ||||||
|     "fcmla z20.s, p5/m, z26.s, z12.s, 90 \n\t" \ |  | ||||||
|     "fcmla z23.s, p5/m, z26.s, z15.s, 90 \n\t" \ |  | ||||||
|     "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \ |  | ||||||
|     "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \ |  | ||||||
|     "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [fetchptr] "r" (baseU + 2 * 3 * 64) \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // MULT_2SPIN_BACKEND, second part (single precision): continues the accumulation started by MULT_2SPIN_1_A64FXf. Adds z27/z28/z29 times z13 into z18/z19/z20 and times z16 into z21/z22/z23, then z24/z25/z26 times z14 into z18/z19/z20 and times z17 into z21/z22/z23, each as an fcmla rotation-0 / rotation-90 pair (one complex multiply-accumulate). It performs no loads: it depends on z24-z29, z12-z17, the partial sums in z18-z23 and predicate p5 left by MULT_2SPIN_1_A64FXf, so the two macros must run back to back. |  | ||||||
| #define MULT_2SPIN_2_A64FXf  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "fcmla z18.s, p5/m, z27.s, z13.s, 0 \n\t" \ |  | ||||||
|     "fcmla z21.s, p5/m, z27.s, z16.s, 0 \n\t" \ |  | ||||||
|     "fcmla z19.s, p5/m, z28.s, z13.s, 0 \n\t" \ |  | ||||||
|     "fcmla z22.s, p5/m, z28.s, z16.s, 0 \n\t" \ |  | ||||||
|     "fcmla z20.s, p5/m, z29.s, z13.s, 0 \n\t" \ |  | ||||||
|     "fcmla z23.s, p5/m, z29.s, z16.s, 0 \n\t" \ |  | ||||||
|     "fcmla z18.s, p5/m, z27.s, z13.s, 90 \n\t" \ |  | ||||||
|     "fcmla z21.s, p5/m, z27.s, z16.s, 90 \n\t" \ |  | ||||||
|     "fcmla z19.s, p5/m, z28.s, z13.s, 90 \n\t" \ |  | ||||||
|     "fcmla z22.s, p5/m, z28.s, z16.s, 90 \n\t" \ |  | ||||||
|     "fcmla z20.s, p5/m, z29.s, z13.s, 90 \n\t" \ |  | ||||||
|     "fcmla z23.s, p5/m, z29.s, z16.s, 90 \n\t" \ |  | ||||||
|     "fcmla z18.s, p5/m, z24.s, z14.s, 0 \n\t" \ |  | ||||||
|     "fcmla z21.s, p5/m, z24.s, z17.s, 0 \n\t" \ |  | ||||||
|     "fcmla z19.s, p5/m, z25.s, z14.s, 0 \n\t" \ |  | ||||||
|     "fcmla z22.s, p5/m, z25.s, z17.s, 0 \n\t" \ |  | ||||||
|     "fcmla z20.s, p5/m, z26.s, z14.s, 0 \n\t" \ |  | ||||||
|     "fcmla z23.s, p5/m, z26.s, z17.s, 0 \n\t" \ |  | ||||||
|     "fcmla z18.s, p5/m, z24.s, z14.s, 90 \n\t" \ |  | ||||||
|     "fcmla z21.s, p5/m, z24.s, z17.s, 90 \n\t" \ |  | ||||||
|     "fcmla z19.s, p5/m, z25.s, z14.s, 90 \n\t" \ |  | ||||||
|     "fcmla z22.s, p5/m, z25.s, z17.s, 90 \n\t" \ |  | ||||||
|     "fcmla z20.s, p5/m, z26.s, z14.s, 90 \n\t" \ |  | ||||||
|     "fcmla z23.s, p5/m, z26.s, z17.s, 90 \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // XP_PROJ (single precision): in-place projection of the twelve vectors loaded by LOAD_CHIMU (z12-z23) down to six (z12-z17): z12-z14 += i * z21-z23 and z15-z17 += i * z18-z20, implemented as fcadd with rotation 90 (SVE FCADD: rotation 90 adds i times the second operand). z18-z23 are read but not modified. Governed by p5, which is not set here and must hold the all-true .s predicate from the preceding load. |  | ||||||
| #define XP_PROJ_A64FXf  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "fcadd z12.s, p5/m, z12.s, z21.s, 90 \n\t" \ |  | ||||||
|     "fcadd z13.s, p5/m, z13.s, z22.s, 90 \n\t" \ |  | ||||||
|     "fcadd z14.s, p5/m, z14.s, z23.s, 90 \n\t" \ |  | ||||||
|     "fcadd z15.s, p5/m, z15.s, z18.s, 90 \n\t" \ |  | ||||||
|     "fcadd z16.s, p5/m, z16.s, z19.s, 90 \n\t" \ |  | ||||||
|     "fcadd z17.s, p5/m, z17.s, z20.s, 90 \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // XP_RECON (single precision): overwrites the result registers z0-z11 from z18-z23 (the registers MULT_2SPIN_1/2 accumulate into). z6-z8 are set to 0 + (z21-z23 with rotation 270) and z9-z11 to 0 + (z18-z20 with rotation 270), i.e. -i times the source, by preloading zero from z31 with movprfx and then applying fcadd; z0-z5 are plain copies of z18-z23. Unlike XP_RECON_ACCUM_A64FXf this does not add to the previous contents of z0-z11. Requires p5 and the zero in z31 from earlier asm statements; each movprfx must stay immediately before its fcadd. |  | ||||||
| #define XP_RECON_A64FXf  \ |  | ||||||
| asm ( \ |  | ||||||
|     "movprfx z6.s, p5/m, z31.s \n\t" \ |  | ||||||
|     "fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \ |  | ||||||
|     "movprfx z7.s, p5/m, z31.s \n\t" \ |  | ||||||
|     "fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \ |  | ||||||
|     "movprfx z8.s, p5/m, z31.s \n\t" \ |  | ||||||
|     "fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \ |  | ||||||
|     "movprfx z9.s, p5/m, z31.s \n\t" \ |  | ||||||
|     "fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \ |  | ||||||
|     "movprfx z10.s, p5/m, z31.s \n\t" \ |  | ||||||
|     "fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \ |  | ||||||
|     "movprfx z11.s, p5/m, z31.s \n\t" \ |  | ||||||
|     "fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \ |  | ||||||
|     "mov z0.s, p5/m, z18.s \n\t" \ |  | ||||||
|     "mov z1.s, p5/m, z19.s \n\t" \ |  | ||||||
|     "mov z2.s, p5/m, z20.s \n\t" \ |  | ||||||
|     "mov z3.s, p5/m, z21.s \n\t" \ |  | ||||||
|     "mov z4.s, p5/m, z22.s \n\t" \ |  | ||||||
|     "mov z5.s, p5/m, z23.s \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // XP_RECON_ACCUM |  | ||||||
| #define XP_RECON_ACCUM_A64FXf  \ |  | ||||||
| asm ( \ |  | ||||||
|     "fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \ |  | ||||||
|     "fadd z0.s, p5/m, z0.s, z18.s \n\t"  \ |  | ||||||
|     "fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \ |  | ||||||
|     "fadd z1.s, p5/m, z1.s, z19.s \n\t"  \ |  | ||||||
|     "fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \ |  | ||||||
|     "fadd z2.s, p5/m, z2.s, z20.s \n\t"  \ |  | ||||||
|     "fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \ |  | ||||||
|     "fadd z3.s, p5/m, z3.s, z21.s \n\t"  \ |  | ||||||
|     "fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \ |  | ||||||
|     "fadd z4.s, p5/m, z4.s, z22.s \n\t"  \ |  | ||||||
|     "fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \ |  | ||||||
|     "fadd z5.s, p5/m, z5.s, z23.s \n\t"  \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // YP_PROJ |  | ||||||
| #define YP_PROJ_A64FXf  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "fsub z12.s, p5/m, z12.s, z21.s \n\t" \ |  | ||||||
|     "fsub z13.s, p5/m, z13.s, z22.s \n\t" \ |  | ||||||
|     "fsub z14.s, p5/m, z14.s, z23.s \n\t" \ |  | ||||||
|     "fadd z15.s, p5/m, z15.s, z18.s \n\t"  \ |  | ||||||
|     "fadd z16.s, p5/m, z16.s, z19.s \n\t"  \ |  | ||||||
|     "fadd z17.s, p5/m, z17.s, z20.s \n\t"  \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // ZP_PROJ |  | ||||||
| #define ZP_PROJ_A64FXf  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "fcadd z12.s, p5/m, z12.s, z18.s, 90 \n\t" \ |  | ||||||
|     "fcadd z13.s, p5/m, z13.s, z19.s, 90 \n\t" \ |  | ||||||
|     "fcadd z14.s, p5/m, z14.s, z20.s, 90 \n\t" \ |  | ||||||
|     "fcadd z15.s, p5/m, z15.s, z21.s, 270 \n\t" \ |  | ||||||
|     "fcadd z16.s, p5/m, z16.s, z22.s, 270 \n\t" \ |  | ||||||
|     "fcadd z17.s, p5/m, z17.s, z23.s, 270 \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // TP_PROJ |  | ||||||
| #define TP_PROJ_A64FXf  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "fadd z12.s, p5/m, z12.s, z18.s \n\t"  \ |  | ||||||
|     "fadd z13.s, p5/m, z13.s, z19.s \n\t"  \ |  | ||||||
|     "fadd z14.s, p5/m, z14.s, z20.s \n\t"  \ |  | ||||||
|     "fadd z15.s, p5/m, z15.s, z21.s \n\t"  \ |  | ||||||
|     "fadd z16.s, p5/m, z16.s, z22.s \n\t"  \ |  | ||||||
|     "fadd z17.s, p5/m, z17.s, z23.s \n\t"  \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // XM_PROJ |  | ||||||
| #define XM_PROJ_A64FXf  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "fcadd z12.s, p5/m, z12.s, z21.s, 270 \n\t" \ |  | ||||||
|     "fcadd z13.s, p5/m, z13.s, z22.s, 270 \n\t" \ |  | ||||||
|     "fcadd z14.s, p5/m, z14.s, z23.s, 270 \n\t" \ |  | ||||||
|     "fcadd z15.s, p5/m, z15.s, z18.s, 270 \n\t" \ |  | ||||||
|     "fcadd z16.s, p5/m, z16.s, z19.s, 270 \n\t" \ |  | ||||||
|     "fcadd z17.s, p5/m, z17.s, z20.s, 270 \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // XM_RECON |  | ||||||
| #define XM_RECON_A64FXf  \ |  | ||||||
| asm ( \ |  | ||||||
|     "movprfx z6.s, p5/m, z31.s \n\t" \ |  | ||||||
|     "fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \ |  | ||||||
|     "movprfx z7.s, p5/m, z31.s \n\t" \ |  | ||||||
|     "fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \ |  | ||||||
|     "movprfx z8.s, p5/m, z31.s \n\t" \ |  | ||||||
|     "fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \ |  | ||||||
|     "movprfx z9.s, p5/m, z31.s \n\t" \ |  | ||||||
|     "fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \ |  | ||||||
|     "movprfx z10.s, p5/m, z31.s \n\t" \ |  | ||||||
|     "fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \ |  | ||||||
|     "movprfx z11.s, p5/m, z31.s \n\t" \ |  | ||||||
|     "fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \ |  | ||||||
|     "mov z0.s, p5/m, z18.s \n\t" \ |  | ||||||
|     "mov z1.s, p5/m, z19.s \n\t" \ |  | ||||||
|     "mov z2.s, p5/m, z20.s \n\t" \ |  | ||||||
|     "mov z3.s, p5/m, z21.s \n\t" \ |  | ||||||
|     "mov z4.s, p5/m, z22.s \n\t" \ |  | ||||||
|     "mov z5.s, p5/m, z23.s \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // YM_PROJ |  | ||||||
| #define YM_PROJ_A64FXf  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "fadd z12.s, p5/m, z12.s, z21.s \n\t"  \ |  | ||||||
|     "fadd z13.s, p5/m, z13.s, z22.s \n\t"  \ |  | ||||||
|     "fadd z14.s, p5/m, z14.s, z23.s \n\t"  \ |  | ||||||
|     "fsub z15.s, p5/m, z15.s, z18.s \n\t" \ |  | ||||||
|     "fsub z16.s, p5/m, z16.s, z19.s \n\t" \ |  | ||||||
|     "fsub z17.s, p5/m, z17.s, z20.s \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // ZM_PROJ |  | ||||||
| #define ZM_PROJ_A64FXf  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "fcadd z12.s, p5/m, z12.s, z18.s, 270 \n\t" \ |  | ||||||
|     "fcadd z13.s, p5/m, z13.s, z19.s, 270 \n\t" \ |  | ||||||
|     "fcadd z14.s, p5/m, z14.s, z20.s, 270 \n\t" \ |  | ||||||
|     "fcadd z15.s, p5/m, z15.s, z21.s, 90 \n\t" \ |  | ||||||
|     "fcadd z16.s, p5/m, z16.s, z22.s, 90 \n\t" \ |  | ||||||
|     "fcadd z17.s, p5/m, z17.s, z23.s, 90 \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // TM_PROJ |  | ||||||
| #define TM_PROJ_A64FXf  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "ptrue p5.s \n\t" \ |  | ||||||
|     "fsub z12.s, p5/m, z12.s, z18.s \n\t" \ |  | ||||||
|     "fsub z13.s, p5/m, z13.s, z19.s \n\t" \ |  | ||||||
|     "fsub z14.s, p5/m, z14.s, z20.s \n\t" \ |  | ||||||
|     "fsub z15.s, p5/m, z15.s, z21.s \n\t" \ |  | ||||||
|     "fsub z16.s, p5/m, z16.s, z22.s \n\t" \ |  | ||||||
|     "fsub z17.s, p5/m, z17.s, z23.s \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // XM_RECON_ACCUM |  | ||||||
| #define XM_RECON_ACCUM_A64FXf  \ |  | ||||||
| asm ( \ |  | ||||||
|     "fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \ |  | ||||||
|     "fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \ |  | ||||||
|     "fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \ |  | ||||||
|     "fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \ |  | ||||||
|     "fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \ |  | ||||||
|     "fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \ |  | ||||||
|     "fadd z0.s, p5/m, z0.s, z18.s \n\t"  \ |  | ||||||
|     "fadd z1.s, p5/m, z1.s, z19.s \n\t"  \ |  | ||||||
|     "fadd z2.s, p5/m, z2.s, z20.s \n\t"  \ |  | ||||||
|     "fadd z3.s, p5/m, z3.s, z21.s \n\t"  \ |  | ||||||
|     "fadd z4.s, p5/m, z4.s, z22.s \n\t"  \ |  | ||||||
|     "fadd z5.s, p5/m, z5.s, z23.s \n\t"  \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // YP_RECON_ACCUM |  | ||||||
| #define YP_RECON_ACCUM_A64FXf  \ |  | ||||||
| asm ( \ |  | ||||||
|     "fadd z0.s, p5/m, z0.s, z18.s \n\t"  \ |  | ||||||
|     "fsub z9.s, p5/m, z9.s, z18.s \n\t" \ |  | ||||||
|     "fadd z1.s, p5/m, z1.s, z19.s \n\t"  \ |  | ||||||
|     "fsub z10.s, p5/m, z10.s, z19.s \n\t" \ |  | ||||||
|     "fadd z2.s, p5/m, z2.s, z20.s \n\t"  \ |  | ||||||
|     "fsub z11.s, p5/m, z11.s, z20.s \n\t" \ |  | ||||||
|     "fadd z3.s, p5/m, z3.s, z21.s \n\t"  \ |  | ||||||
|     "fadd z6.s, p5/m, z6.s, z21.s \n\t"  \ |  | ||||||
|     "fadd z4.s, p5/m, z4.s, z22.s \n\t"  \ |  | ||||||
|     "fadd z7.s, p5/m, z7.s, z22.s \n\t"  \ |  | ||||||
|     "fadd z5.s, p5/m, z5.s, z23.s \n\t"  \ |  | ||||||
|     "fadd z8.s, p5/m, z8.s, z23.s \n\t"  \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // YM_RECON_ACCUM |  | ||||||
| #define YM_RECON_ACCUM_A64FXf  \ |  | ||||||
| asm ( \ |  | ||||||
|     "fadd z0.s, p5/m, z0.s, z18.s \n\t"  \ |  | ||||||
|     "fadd z9.s, p5/m, z9.s, z18.s \n\t"  \ |  | ||||||
|     "fadd z1.s, p5/m, z1.s, z19.s \n\t"  \ |  | ||||||
|     "fadd z10.s, p5/m, z10.s, z19.s \n\t"  \ |  | ||||||
|     "fadd z2.s, p5/m, z2.s, z20.s \n\t"  \ |  | ||||||
|     "fadd z11.s, p5/m, z11.s, z20.s \n\t"  \ |  | ||||||
|     "fadd z3.s, p5/m, z3.s, z21.s \n\t"  \ |  | ||||||
|     "fsub z6.s, p5/m, z6.s, z21.s \n\t" \ |  | ||||||
|     "fadd z4.s, p5/m, z4.s, z22.s \n\t"  \ |  | ||||||
|     "fsub z7.s, p5/m, z7.s, z22.s \n\t" \ |  | ||||||
|     "fadd z5.s, p5/m, z5.s, z23.s \n\t"  \ |  | ||||||
|     "fsub z8.s, p5/m, z8.s, z23.s \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // ZP_RECON_ACCUM |  | ||||||
| #define ZP_RECON_ACCUM_A64FXf  \ |  | ||||||
| asm ( \ |  | ||||||
|     "fcadd z6.s, p5/m, z6.s, z18.s, 270 \n\t" \ |  | ||||||
|     "fadd z0.s, p5/m, z0.s, z18.s \n\t"  \ |  | ||||||
|     "fcadd z7.s, p5/m, z7.s, z19.s, 270 \n\t" \ |  | ||||||
|     "fadd z1.s, p5/m, z1.s, z19.s \n\t"  \ |  | ||||||
|     "fcadd z8.s, p5/m, z8.s, z20.s, 270 \n\t" \ |  | ||||||
|     "fadd z2.s, p5/m, z2.s, z20.s \n\t"  \ |  | ||||||
|     "fcadd z9.s, p5/m, z9.s, z21.s, 90 \n\t" \ |  | ||||||
|     "fadd z3.s, p5/m, z3.s, z21.s \n\t"  \ |  | ||||||
|     "fcadd z10.s, p5/m, z10.s, z22.s, 90 \n\t" \ |  | ||||||
|     "fadd z4.s, p5/m, z4.s, z22.s \n\t"  \ |  | ||||||
|     "fcadd z11.s, p5/m, z11.s, z23.s, 90 \n\t" \ |  | ||||||
|     "fadd z5.s, p5/m, z5.s, z23.s \n\t"  \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // ZM_RECON_ACCUM |  | ||||||
| #define ZM_RECON_ACCUM_A64FXf  \ |  | ||||||
| asm ( \ |  | ||||||
|     "fcadd z6.s, p5/m, z6.s, z18.s, 90 \n\t" \ |  | ||||||
|     "fadd z0.s, p5/m, z0.s, z18.s \n\t"  \ |  | ||||||
|     "fcadd z7.s, p5/m, z7.s, z19.s, 90 \n\t" \ |  | ||||||
|     "fadd z1.s, p5/m, z1.s, z19.s \n\t"  \ |  | ||||||
|     "fcadd z8.s, p5/m, z8.s, z20.s, 90 \n\t" \ |  | ||||||
|     "fadd z2.s, p5/m, z2.s, z20.s \n\t"  \ |  | ||||||
|     "fcadd z9.s, p5/m, z9.s, z21.s, 270 \n\t" \ |  | ||||||
|     "fadd z3.s, p5/m, z3.s, z21.s \n\t"  \ |  | ||||||
|     "fcadd z10.s, p5/m, z10.s, z22.s, 270 \n\t" \ |  | ||||||
|     "fadd z4.s, p5/m, z4.s, z22.s \n\t"  \ |  | ||||||
|     "fcadd z11.s, p5/m, z11.s, z23.s, 270 \n\t" \ |  | ||||||
|     "fadd z5.s, p5/m, z5.s, z23.s \n\t"  \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // TP_RECON_ACCUM |  | ||||||
| #define TP_RECON_ACCUM_A64FXf  \ |  | ||||||
| asm ( \ |  | ||||||
|     "fadd z0.s, p5/m, z0.s, z18.s \n\t"  \ |  | ||||||
|     "fadd z6.s, p5/m, z6.s, z18.s \n\t"  \ |  | ||||||
|     "fadd z1.s, p5/m, z1.s, z19.s \n\t"  \ |  | ||||||
|     "fadd z7.s, p5/m, z7.s, z19.s \n\t"  \ |  | ||||||
|     "fadd z2.s, p5/m, z2.s, z20.s \n\t"  \ |  | ||||||
|     "fadd z8.s, p5/m, z8.s, z20.s \n\t"  \ |  | ||||||
|     "fadd z3.s, p5/m, z3.s, z21.s \n\t"  \ |  | ||||||
|     "fadd z9.s, p5/m, z9.s, z21.s \n\t"  \ |  | ||||||
|     "fadd z4.s, p5/m, z4.s, z22.s \n\t"  \ |  | ||||||
|     "fadd z10.s, p5/m, z10.s, z22.s \n\t"  \ |  | ||||||
|     "fadd z5.s, p5/m, z5.s, z23.s \n\t"  \ |  | ||||||
|     "fadd z11.s, p5/m, z11.s, z23.s \n\t"  \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // TM_RECON_ACCUM |  | ||||||
| #define TM_RECON_ACCUM_A64FXf  \ |  | ||||||
| asm ( \ |  | ||||||
|     "fadd z0.s, p5/m, z0.s, z18.s \n\t"  \ |  | ||||||
|     "fsub z6.s, p5/m, z6.s, z18.s \n\t" \ |  | ||||||
|     "fadd z1.s, p5/m, z1.s, z19.s \n\t"  \ |  | ||||||
|     "fsub z7.s, p5/m, z7.s, z19.s \n\t" \ |  | ||||||
|     "fadd z2.s, p5/m, z2.s, z20.s \n\t"  \ |  | ||||||
|     "fsub z8.s, p5/m, z8.s, z20.s \n\t" \ |  | ||||||
|     "fadd z3.s, p5/m, z3.s, z21.s \n\t"  \ |  | ||||||
|     "fsub z9.s, p5/m, z9.s, z21.s \n\t" \ |  | ||||||
|     "fadd z4.s, p5/m, z4.s, z22.s \n\t"  \ |  | ||||||
|     "fsub z10.s, p5/m, z10.s, z22.s \n\t" \ |  | ||||||
|     "fadd z5.s, p5/m, z5.s, z23.s \n\t"  \ |  | ||||||
|     "fsub z11.s, p5/m, z11.s, z23.s \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // ZERO_PSI |  | ||||||
| #define ZERO_PSI_A64FXf  \ |  | ||||||
| asm ( \ |  | ||||||
|     "ptrue p5.s \n\t" \ |  | ||||||
|     "fmov z0.s , 0 \n\t" \ |  | ||||||
|     "fmov z1.s , 0 \n\t" \ |  | ||||||
|     "fmov z2.s , 0 \n\t" \ |  | ||||||
|     "fmov z3.s , 0 \n\t" \ |  | ||||||
|     "fmov z4.s , 0 \n\t" \ |  | ||||||
|     "fmov z5.s , 0 \n\t" \ |  | ||||||
|     "fmov z6.s , 0 \n\t" \ |  | ||||||
|     "fmov z7.s , 0 \n\t" \ |  | ||||||
|     "fmov z8.s , 0 \n\t" \ |  | ||||||
|     "fmov z9.s , 0 \n\t" \ |  | ||||||
|     "fmov z10.s , 0 \n\t" \ |  | ||||||
|     "fmov z11.s , 0 \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| // PREFETCH_RESULT_L2_STORE (prefetch store to L2) |  | ||||||
| #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base)  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ |  | ||||||
|     "prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ |  | ||||||
|     "prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [fetchptr] "r" (base) \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // PREFETCH_RESULT_L1_STORE (prefetch store to L1) |  | ||||||
| #define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base)  \ |  | ||||||
| { \ |  | ||||||
| asm ( \ |  | ||||||
|     "prfd PSTL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ |  | ||||||
|     "prfd PSTL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ |  | ||||||
|     "prfd PSTL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ |  | ||||||
|     :  \ |  | ||||||
|     : [fetchptr] "r" (base) \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ |  | ||||||
| ); \ |  | ||||||
| } |  | ||||||
| // ADD_RESULT_INTERNAL |  | ||||||
| #define ADD_RESULT_INTERNAL_A64FXf  \ |  | ||||||
| asm ( \ |  | ||||||
|     "fadd z0.s, p5/m, z0.s, z12.s \n\t"  \ |  | ||||||
|     "fadd z1.s, p5/m, z1.s, z13.s \n\t"  \ |  | ||||||
|     "fadd z2.s, p5/m, z2.s, z14.s \n\t"  \ |  | ||||||
|     "fadd z3.s, p5/m, z3.s, z15.s \n\t"  \ |  | ||||||
|     "fadd z4.s, p5/m, z4.s, z16.s \n\t"  \ |  | ||||||
|     "fadd z5.s, p5/m, z5.s, z17.s \n\t"  \ |  | ||||||
|     "fadd z6.s, p5/m, z6.s, z18.s \n\t"  \ |  | ||||||
|     "fadd z7.s, p5/m, z7.s, z19.s \n\t"  \ |  | ||||||
|     "fadd z8.s, p5/m, z8.s, z20.s \n\t"  \ |  | ||||||
|     "fadd z9.s, p5/m, z9.s, z21.s \n\t"  \ |  | ||||||
|     "fadd z10.s, p5/m, z10.s, z22.s \n\t"  \ |  | ||||||
|     "fadd z11.s, p5/m, z11.s, z23.s \n\t"  \ |  | ||||||
|     :  \ |  | ||||||
|     :  \ |  | ||||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ |  | ||||||
| );  |  | ||||||
|  |  | ||||||
| @@ -38,10 +38,11 @@ Author: Nils Meyer <nils.meyer@ur.de> | |||||||
| #define LOCK_GAUGE(A)   | #define LOCK_GAUGE(A)   | ||||||
| #define UNLOCK_GAUGE(A)   | #define UNLOCK_GAUGE(A)   | ||||||
| #define MASK_REGS                      DECLARATIONS_A64FXd   | #define MASK_REGS                      DECLARATIONS_A64FXd   | ||||||
| #define SAVE_RESULT(A,B)               RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B)   | #define SAVE_RESULT(A,B)               RESULT_A64FXd(A);   | ||||||
| #define MULT_2SPIN_1(Dir)              MULT_2SPIN_1_A64FXd(Dir)   | #define MULT_2SPIN_1(Dir)              MULT_2SPIN_1_A64FXd(Dir)   | ||||||
| #define MULT_2SPIN_2                   MULT_2SPIN_2_A64FXd   | #define MULT_2SPIN_2                   MULT_2SPIN_2_A64FXd   | ||||||
| #define LOAD_CHI(base)                 LOAD_CHI_A64FXd(base)   | #define LOAD_CHI(base)                 LOAD_CHI_A64FXd(base)   | ||||||
|  | #define ZERO_PSI                       ZERO_PSI_A64FXd   | ||||||
| #define ADD_RESULT(base,basep)         LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)   | #define ADD_RESULT(base,basep)         LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)   | ||||||
| #define XP_PROJ                        XP_PROJ_A64FXd   | #define XP_PROJ                        XP_PROJ_A64FXd   | ||||||
| #define YP_PROJ                        YP_PROJ_A64FXd   | #define YP_PROJ                        YP_PROJ_A64FXd   | ||||||
| @@ -70,6 +71,7 @@ Author: Nils Meyer <nils.meyer@ur.de> | |||||||
| #define MAYBEPERM(Dir,perm)            if (Dir != 3) { if (perm) { PERMUTE; } }   | #define MAYBEPERM(Dir,perm)            if (Dir != 3) { if (perm) { PERMUTE; } }   | ||||||
| // DECLARATIONS | // DECLARATIONS | ||||||
| #define DECLARATIONS_A64FXd  \ | #define DECLARATIONS_A64FXd  \ | ||||||
|  |     uint64_t baseU; \ | ||||||
|     const uint64_t lut[4][8] = { \ |     const uint64_t lut[4][8] = { \ | ||||||
|         {4, 5, 6, 7, 0, 1, 2, 3}, \ |         {4, 5, 6, 7, 0, 1, 2, 3}, \ | ||||||
|         {2, 3, 0, 1, 6, 7, 4, 5}, \ |         {2, 3, 0, 1, 6, 7, 4, 5}, \ | ||||||
| @@ -126,114 +128,114 @@ Author: Nils Meyer <nils.meyer@ur.de> | |||||||
| // RESULT | // RESULT | ||||||
| #define RESULT_A64FXd(base)  \ | #define RESULT_A64FXd(base)  \ | ||||||
| { \ | { \ | ||||||
|     svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64), result_00);  \ |     svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-6), result_00);  \ | ||||||
|     svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64), result_01);  \ |     svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-5), result_01);  \ | ||||||
|     svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64), result_02);  \ |     svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-4), result_02);  \ | ||||||
|     svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64), result_10);  \ |     svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-3), result_10);  \ | ||||||
|     svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64), result_11);  \ |     svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-2), result_11);  \ | ||||||
|     svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64), result_12);  \ |     svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-1), result_12);  \ | ||||||
|     svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64), result_20);  \ |     svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(0), result_20);  \ | ||||||
|     svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64), result_21);  \ |     svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(1), result_21);  \ | ||||||
|     svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64), result_22);  \ |     svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(2), result_22);  \ | ||||||
|     svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64), result_30);  \ |     svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(3), result_30);  \ | ||||||
|     svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64), result_31);  \ |     svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(4), result_31);  \ | ||||||
|     svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64), result_32);  \ |     svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(5), result_32);  \ | ||||||
| } | } | ||||||
| // PREFETCH_CHIMU_L2 (prefetch to L2) | // PREFETCH_CHIMU_L2 (prefetch to L2) | ||||||
| #define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base)  \ | #define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base)  \ | ||||||
| { \ | { \ | ||||||
|     svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \ |     svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL2STRM); \ | ||||||
|     svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \ |     svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL2STRM); \ | ||||||
|     svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \ |     svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL2STRM); \ | ||||||
| } | } | ||||||
| // PREFETCH_CHIMU_L1 (prefetch to L1) | // PREFETCH_CHIMU_L1 (prefetch to L1) | ||||||
| #define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base)  \ | #define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base)  \ | ||||||
| { \ | { \ | ||||||
|     svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \ |     svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL1STRM); \ | ||||||
|     svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \ |     svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL1STRM); \ | ||||||
|     svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \ |     svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL1STRM); \ | ||||||
| } | } | ||||||
| // PREFETCH_GAUGE_L2 (prefetch to L2) | // PREFETCH_GAUGE_L2 (prefetch to L2) | ||||||
| #define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A)  \ | #define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A)  \ | ||||||
| { \ | { \ | ||||||
|     const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ |     const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ | ||||||
|     svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ |     svprfd_vnum(pg1, (void*)(baseU), (int64_t)(-4), SV_PLDL2STRM); \ | ||||||
|     svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ |     svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL2STRM); \ | ||||||
|     svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ |     svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL2STRM); \ | ||||||
|     svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \ |     svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL2STRM); \ | ||||||
|     svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \ |     svprfd_vnum(pg1, (void*)(baseU), (int64_t)(12), SV_PLDL2STRM); \ | ||||||
|     svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \ |     svprfd_vnum(pg1, (void*)(baseU), (int64_t)(16), SV_PLDL2STRM); \ | ||||||
|     svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \ |     svprfd_vnum(pg1, (void*)(baseU), (int64_t)(20), SV_PLDL2STRM); \ | ||||||
|     svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \ |     svprfd_vnum(pg1, (void*)(baseU), (int64_t)(24), SV_PLDL2STRM); \ | ||||||
|     svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \ |     svprfd_vnum(pg1, (void*)(baseU), (int64_t)(28), SV_PLDL2STRM); \ | ||||||
| } | } | ||||||
| // PREFETCH_GAUGE_L1 (prefetch to L1) | // PREFETCH_GAUGE_L1 (prefetch to L1) | ||||||
| #define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)  \ | #define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)  \ | ||||||
| { \ | { \ | ||||||
|     const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ |     const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ | ||||||
|     svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ |     svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL1STRM); \ | ||||||
|     svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ |     svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL1STRM); \ | ||||||
|     svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ |     svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL1STRM); \ | ||||||
| } | } | ||||||
| // LOAD_CHI | // LOAD_CHI | ||||||
| #define LOAD_CHI_A64FXd(base)  \ | #define LOAD_CHI_A64FXd(base)  \ | ||||||
| { \ | { \ | ||||||
|     Chi_00 = svld1(pg1, (float64_t*)(base + 0 * 64));  \ |     Chi_00 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(0));  \ | ||||||
|     Chi_01 = svld1(pg1, (float64_t*)(base + 1 * 64));  \ |     Chi_01 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(1));  \ | ||||||
|     Chi_02 = svld1(pg1, (float64_t*)(base + 2 * 64));  \ |     Chi_02 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(2));  \ | ||||||
|     Chi_10 = svld1(pg1, (float64_t*)(base + 3 * 64));  \ |     Chi_10 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(3));  \ | ||||||
|     Chi_11 = svld1(pg1, (float64_t*)(base + 4 * 64));  \ |     Chi_11 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(4));  \ | ||||||
|     Chi_12 = svld1(pg1, (float64_t*)(base + 5 * 64));  \ |     Chi_12 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(5));  \ | ||||||
| } | } | ||||||
| // LOAD_CHIMU | // LOAD_CHIMU | ||||||
| #define LOAD_CHIMU_INTERLEAVED_A64FXd(base)  \ | #define LOAD_CHIMU_INTERLEAVED_A64FXd(base)  \ | ||||||
| { \ | { \ | ||||||
|     Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64));  \ |     Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6));  \ | ||||||
|     Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64));  \ |     Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3));  \ | ||||||
|     Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64));  \ |     Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3));  \ | ||||||
|     Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64));  \ |     Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0));  \ | ||||||
|     Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64));  \ |     Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5));  \ | ||||||
|     Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64));  \ |     Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4));  \ | ||||||
|     Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64));  \ |     Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2));  \ | ||||||
|     Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64));  \ |     Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1));  \ | ||||||
|     Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64));  \ |     Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4));  \ | ||||||
|     Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64));  \ |     Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5));  \ | ||||||
|     Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64));  \ |     Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1));  \ | ||||||
|     Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64));  \ |     Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2));  \ | ||||||
| } | } | ||||||
| // LOAD_CHIMU_0213 | // LOAD_CHIMU_0213 | ||||||
| #define LOAD_CHIMU_0213_A64FXd  \ | #define LOAD_CHIMU_0213_A64FXd  \ | ||||||
| { \ | { \ | ||||||
|     const SiteSpinor & ref(in[offset]); \ |     const SiteSpinor & ref(in[offset]); \ | ||||||
|     Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64));  \ |     Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6));  \ | ||||||
|     Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64));  \ |     Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0));  \ | ||||||
|     Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64));  \ |     Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5));  \ | ||||||
|     Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64));  \ |     Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1));  \ | ||||||
|     Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64));  \ |     Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4));  \ | ||||||
|     Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64));  \ |     Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2));  \ | ||||||
|     Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64));  \ |     Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3));  \ | ||||||
|     Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64));  \ |     Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3));  \ | ||||||
|     Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64));  \ |     Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2));  \ | ||||||
|     Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64));  \ |     Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4));  \ | ||||||
|     Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64));  \ |     Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1));  \ | ||||||
|     Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64));  \ |     Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5));  \ | ||||||
| } | } | ||||||
| // LOAD_CHIMU_0312 | // LOAD_CHIMU_0312 | ||||||
| #define LOAD_CHIMU_0312_A64FXd  \ | #define LOAD_CHIMU_0312_A64FXd  \ | ||||||
| { \ | { \ | ||||||
|     const SiteSpinor & ref(in[offset]); \ |     const SiteSpinor & ref(in[offset]); \ | ||||||
|     Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64));  \ |     Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6));  \ | ||||||
|     Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64));  \ |     Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3));  \ | ||||||
|     Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64));  \ |     Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5));  \ | ||||||
|     Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64));  \ |     Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4));  \ | ||||||
|     Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64));  \ |     Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4));  \ | ||||||
|     Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64));  \ |     Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5));  \ | ||||||
|     Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64));  \ |     Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3));  \ | ||||||
|     Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64));  \ |     Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0));  \ | ||||||
|     Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64));  \ |     Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2));  \ | ||||||
|     Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64));  \ |     Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1));  \ | ||||||
|     Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64));  \ |     Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1));  \ | ||||||
|     Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64));  \ |     Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2));  \ | ||||||
| } | } | ||||||
| // LOAD_TABLE0 | // LOAD_TABLE0 | ||||||
| #define LOAD_TABLE0  \ | #define LOAD_TABLE0  \ | ||||||
| @@ -261,26 +263,26 @@ Author: Nils Meyer <nils.meyer@ur.de> | |||||||
|     Chi_12 = svtbl(Chi_12, table0);     |     Chi_12 = svtbl(Chi_12, table0);     | ||||||
|  |  | ||||||
| // LOAD_GAUGE | // LOAD_GAUGE | ||||||
| #define LOAD_GAUGE  \ | #define LOAD_GAUGE(A)  \ | ||||||
|     const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ |  | ||||||
| { \ | { \ | ||||||
|     U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64));  \ |     const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ | ||||||
|     U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64));  \ |     U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-6));  \ | ||||||
|     U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64));  \ |     U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-3));  \ | ||||||
|     U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64));  \ |     U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(0));  \ | ||||||
|     U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64));  \ |     U_01 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-5));  \ | ||||||
|     U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64));  \ |     U_11 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-2));  \ | ||||||
|  |     U_21 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(1));  \ | ||||||
| } | } | ||||||
| // MULT_2SPIN | // MULT_2SPIN | ||||||
| #define MULT_2SPIN_1_A64FXd(A)  \ | #define MULT_2SPIN_1_A64FXd(A)  \ | ||||||
| { \ | { \ | ||||||
|     const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ |     const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ | ||||||
|     U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64));  \ |     U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-6));  \ | ||||||
|     U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64));  \ |     U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-3));  \ | ||||||
|     U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64));  \ |     U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(0));  \ | ||||||
|     U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64));  \ |     U_01 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-5));  \ | ||||||
|     U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64));  \ |     U_11 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-2));  \ | ||||||
|     U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64));  \ |     U_21 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(1));  \ | ||||||
|     UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \ |     UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \ | ||||||
|     UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \ |     UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \ | ||||||
|     UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \ |     UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \ | ||||||
| @@ -293,9 +295,9 @@ Author: Nils Meyer <nils.meyer@ur.de> | |||||||
|     UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ |     UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ | ||||||
|     UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ |     UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ | ||||||
|     UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ |     UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ | ||||||
|     U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -4 * 64));  \ |     U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-4));  \ | ||||||
|     U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -1 * 64));  \ |     U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-1));  \ | ||||||
|     U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 2 * 64));  \ |     U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(2));  \ | ||||||
| } | } | ||||||
| // MULT_2SPIN_BACKEND | // MULT_2SPIN_BACKEND | ||||||
| #define MULT_2SPIN_2_A64FXd  \ | #define MULT_2SPIN_2_A64FXd  \ | ||||||
| @@ -570,12 +572,12 @@ Author: Nils Meyer <nils.meyer@ur.de> | |||||||
|     result_31 = svdup_f64(0.); \ |     result_31 = svdup_f64(0.); \ | ||||||
|     result_32 = svdup_f64(0.);  |     result_32 = svdup_f64(0.);  | ||||||
|  |  | ||||||
| // PREFETCH_RESULT_L2_STORE (prefetch store to L2) | // PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) | ||||||
| #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base)  \ | #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base)  \ | ||||||
| { \ | { \ | ||||||
|     svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \ |     asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 0) : "memory" ); \ | ||||||
|     svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \ |     asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 1) : "memory" ); \ | ||||||
|     svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \ |     asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 2) : "memory" ); \ | ||||||
| } | } | ||||||
| // PREFETCH_RESULT_L1_STORE (prefetch store to L1) | // PREFETCH_RESULT_L1_STORE (prefetch store to L1) | ||||||
| #define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base)  \ | #define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base)  \ | ||||||
|   | |||||||
| @@ -38,10 +38,11 @@ Author: Nils Meyer <nils.meyer@ur.de> | |||||||
| #define LOCK_GAUGE(A)   | #define LOCK_GAUGE(A)   | ||||||
| #define UNLOCK_GAUGE(A)   | #define UNLOCK_GAUGE(A)   | ||||||
| #define MASK_REGS                      DECLARATIONS_A64FXf   | #define MASK_REGS                      DECLARATIONS_A64FXf   | ||||||
| #define SAVE_RESULT(A,B)               RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B)   | #define SAVE_RESULT(A,B)               RESULT_A64FXf(A);   | ||||||
| #define MULT_2SPIN_1(Dir)              MULT_2SPIN_1_A64FXf(Dir)   | #define MULT_2SPIN_1(Dir)              MULT_2SPIN_1_A64FXf(Dir)   | ||||||
| #define MULT_2SPIN_2                   MULT_2SPIN_2_A64FXf   | #define MULT_2SPIN_2                   MULT_2SPIN_2_A64FXf   | ||||||
| #define LOAD_CHI(base)                 LOAD_CHI_A64FXf(base)   | #define LOAD_CHI(base)                 LOAD_CHI_A64FXf(base)   | ||||||
|  | #define ZERO_PSI                       ZERO_PSI_A64FXf   | ||||||
| #define ADD_RESULT(base,basep)         LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)   | #define ADD_RESULT(base,basep)         LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)   | ||||||
| #define XP_PROJ                        XP_PROJ_A64FXf   | #define XP_PROJ                        XP_PROJ_A64FXf   | ||||||
| #define YP_PROJ                        YP_PROJ_A64FXf   | #define YP_PROJ                        YP_PROJ_A64FXf   | ||||||
| @@ -70,6 +71,7 @@ Author: Nils Meyer <nils.meyer@ur.de> | |||||||
| #define MAYBEPERM(A,perm)              if (perm) { PERMUTE; }   | #define MAYBEPERM(A,perm)              if (perm) { PERMUTE; }   | ||||||
| // DECLARATIONS | // DECLARATIONS | ||||||
| #define DECLARATIONS_A64FXf  \ | #define DECLARATIONS_A64FXf  \ | ||||||
|  |     uint64_t baseU; \ | ||||||
|     const uint32_t lut[4][16] = { \ |     const uint32_t lut[4][16] = { \ | ||||||
|         {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ |         {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ | ||||||
|         {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ |         {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ | ||||||
| @@ -126,114 +128,114 @@ Author: Nils Meyer <nils.meyer@ur.de> | |||||||
| // RESULT | // RESULT | ||||||
| #define RESULT_A64FXf(base)  \ | #define RESULT_A64FXf(base)  \ | ||||||
| { \ | { \ | ||||||
|     svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64), result_00);  \ |     svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-6), result_00);  \ | ||||||
|     svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64), result_01);  \ |     svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-5), result_01);  \ | ||||||
|     svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64), result_02);  \ |     svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-4), result_02);  \ | ||||||
|     svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64), result_10);  \ |     svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-3), result_10);  \ | ||||||
|     svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64), result_11);  \ |     svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-2), result_11);  \ | ||||||
|     svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64), result_12);  \ |     svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-1), result_12);  \ | ||||||
|     svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64), result_20);  \ |     svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(0), result_20);  \ | ||||||
|     svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64), result_21);  \ |     svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(1), result_21);  \ | ||||||
|     svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64), result_22);  \ |     svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(2), result_22);  \ | ||||||
|     svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64), result_30);  \ |     svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(3), result_30);  \ | ||||||
|     svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64), result_31);  \ |     svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(4), result_31);  \ | ||||||
|     svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64), result_32);  \ |     svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(5), result_32);  \ | ||||||
| } | } | ||||||
| // PREFETCH_CHIMU_L2 (prefetch to L2) | // PREFETCH_CHIMU_L2 (prefetch to L2) | ||||||
| #define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base)  \ | #define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base)  \ | ||||||
| { \ | { \ | ||||||
|     svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \ |     svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL2STRM); \ | ||||||
|     svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \ |     svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL2STRM); \ | ||||||
|     svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \ |     svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL2STRM); \ | ||||||
| } | } | ||||||
| // PREFETCH_CHIMU_L1 (prefetch to L1) | // PREFETCH_CHIMU_L1 (prefetch to L1) | ||||||
| #define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base)  \ | #define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base)  \ | ||||||
| { \ | { \ | ||||||
|     svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \ |     svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL1STRM); \ | ||||||
|     svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \ |     svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL1STRM); \ | ||||||
|     svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \ |     svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL1STRM); \ | ||||||
| } | } | ||||||
| // PREFETCH_GAUGE_L2 (prefetch to L2) | // PREFETCH_GAUGE_L2 (prefetch to L2) | ||||||
| #define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A)  \ | #define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A)  \ | ||||||
| { \ | { \ | ||||||
|     const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ |     const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ | ||||||
|     svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ |     svprfd_vnum(pg1, (void*)(baseU), (int64_t)(-4), SV_PLDL2STRM); \ | ||||||
|     svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ |     svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL2STRM); \ | ||||||
|     svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ |     svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL2STRM); \ | ||||||
|     svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \ |     svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL2STRM); \ | ||||||
|     svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \ |     svprfd_vnum(pg1, (void*)(baseU), (int64_t)(12), SV_PLDL2STRM); \ | ||||||
|     svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \ |     svprfd_vnum(pg1, (void*)(baseU), (int64_t)(16), SV_PLDL2STRM); \ | ||||||
|     svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \ |     svprfd_vnum(pg1, (void*)(baseU), (int64_t)(20), SV_PLDL2STRM); \ | ||||||
|     svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \ |     svprfd_vnum(pg1, (void*)(baseU), (int64_t)(24), SV_PLDL2STRM); \ | ||||||
|     svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \ |     svprfd_vnum(pg1, (void*)(baseU), (int64_t)(28), SV_PLDL2STRM); \ | ||||||
| } | } | ||||||
| // PREFETCH_GAUGE_L1 (prefetch to L1) | // PREFETCH_GAUGE_L1 (prefetch to L1) | ||||||
| #define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)  \ | #define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)  \ | ||||||
| { \ | { \ | ||||||
|     const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ |     const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ | ||||||
|     svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ |     svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL1STRM); \ | ||||||
|     svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ |     svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL1STRM); \ | ||||||
|     svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ |     svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL1STRM); \ | ||||||
| } | } | ||||||
| // LOAD_CHI | // LOAD_CHI | ||||||
| #define LOAD_CHI_A64FXf(base)  \ | #define LOAD_CHI_A64FXf(base)  \ | ||||||
| { \ | { \ | ||||||
|     Chi_00 = svld1(pg1, (float32_t*)(base + 0 * 64));  \ |     Chi_00 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(0));  \ | ||||||
|     Chi_01 = svld1(pg1, (float32_t*)(base + 1 * 64));  \ |     Chi_01 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(1));  \ | ||||||
|     Chi_02 = svld1(pg1, (float32_t*)(base + 2 * 64));  \ |     Chi_02 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(2));  \ | ||||||
|     Chi_10 = svld1(pg1, (float32_t*)(base + 3 * 64));  \ |     Chi_10 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(3));  \ | ||||||
|     Chi_11 = svld1(pg1, (float32_t*)(base + 4 * 64));  \ |     Chi_11 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(4));  \ | ||||||
|     Chi_12 = svld1(pg1, (float32_t*)(base + 5 * 64));  \ |     Chi_12 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(5));  \ | ||||||
| } | } | ||||||
| // LOAD_CHIMU | // LOAD_CHIMU | ||||||
| #define LOAD_CHIMU_INTERLEAVED_A64FXf(base)  \ | #define LOAD_CHIMU_INTERLEAVED_A64FXf(base)  \ | ||||||
| { \ | { \ | ||||||
|     Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64));  \ |     Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6));  \ | ||||||
|     Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64));  \ |     Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3));  \ | ||||||
|     Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64));  \ |     Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3));  \ | ||||||
|     Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64));  \ |     Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0));  \ | ||||||
|     Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64));  \ |     Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5));  \ | ||||||
|     Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64));  \ |     Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4));  \ | ||||||
|     Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64));  \ |     Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2));  \ | ||||||
|     Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64));  \ |     Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1));  \ | ||||||
|     Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64));  \ |     Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4));  \ | ||||||
|     Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64));  \ |     Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5));  \ | ||||||
|     Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64));  \ |     Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1));  \ | ||||||
|     Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64));  \ |     Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2));  \ | ||||||
| } | } | ||||||
| // LOAD_CHIMU_0213 | // LOAD_CHIMU_0213 | ||||||
| #define LOAD_CHIMU_0213_A64FXf  \ | #define LOAD_CHIMU_0213_A64FXf  \ | ||||||
| { \ | { \ | ||||||
|     const SiteSpinor & ref(in[offset]); \ |     const SiteSpinor & ref(in[offset]); \ | ||||||
|     Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64));  \ |     Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6));  \ | ||||||
|     Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64));  \ |     Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0));  \ | ||||||
|     Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64));  \ |     Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5));  \ | ||||||
|     Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64));  \ |     Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1));  \ | ||||||
|     Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64));  \ |     Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4));  \ | ||||||
|     Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64));  \ |     Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2));  \ | ||||||
|     Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64));  \ |     Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3));  \ | ||||||
|     Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64));  \ |     Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3));  \ | ||||||
|     Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64));  \ |     Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2));  \ | ||||||
|     Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64));  \ |     Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4));  \ | ||||||
|     Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64));  \ |     Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1));  \ | ||||||
|     Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64));  \ |     Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5));  \ | ||||||
| } | } | ||||||
| // LOAD_CHIMU_0312 | // LOAD_CHIMU_0312 | ||||||
| #define LOAD_CHIMU_0312_A64FXf  \ | #define LOAD_CHIMU_0312_A64FXf  \ | ||||||
| { \ | { \ | ||||||
|     const SiteSpinor & ref(in[offset]); \ |     const SiteSpinor & ref(in[offset]); \ | ||||||
|     Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64));  \ |     Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6));  \ | ||||||
|     Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64));  \ |     Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3));  \ | ||||||
|     Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64));  \ |     Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5));  \ | ||||||
|     Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64));  \ |     Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4));  \ | ||||||
|     Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64));  \ |     Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4));  \ | ||||||
|     Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64));  \ |     Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5));  \ | ||||||
|     Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64));  \ |     Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3));  \ | ||||||
|     Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64));  \ |     Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0));  \ | ||||||
|     Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64));  \ |     Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2));  \ | ||||||
|     Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64));  \ |     Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1));  \ | ||||||
|     Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64));  \ |     Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1));  \ | ||||||
|     Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64));  \ |     Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2));  \ | ||||||
| } | } | ||||||
| // LOAD_TABLE0 | // LOAD_TABLE0 | ||||||
| #define LOAD_TABLE0  \ | #define LOAD_TABLE0  \ | ||||||
| @@ -261,26 +263,26 @@ Author: Nils Meyer <nils.meyer@ur.de> | |||||||
|     Chi_12 = svtbl(Chi_12, table0);     |     Chi_12 = svtbl(Chi_12, table0);     | ||||||
|  |  | ||||||
| // LOAD_GAUGE | // LOAD_GAUGE | ||||||
| #define LOAD_GAUGE  \ | #define LOAD_GAUGE(A)  \ | ||||||
|     const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ |  | ||||||
| { \ | { \ | ||||||
|     U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64));  \ |     const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ | ||||||
|     U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64));  \ |     U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-6));  \ | ||||||
|     U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64));  \ |     U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-3));  \ | ||||||
|     U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64));  \ |     U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(0));  \ | ||||||
|     U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64));  \ |     U_01 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-5));  \ | ||||||
|     U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64));  \ |     U_11 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-2));  \ | ||||||
|  |     U_21 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(1));  \ | ||||||
| } | } | ||||||
| // MULT_2SPIN | // MULT_2SPIN | ||||||
| #define MULT_2SPIN_1_A64FXf(A)  \ | #define MULT_2SPIN_1_A64FXf(A)  \ | ||||||
| { \ | { \ | ||||||
|     const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ |     const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ | ||||||
|     U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64));  \ |     U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-6));  \ | ||||||
|     U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64));  \ |     U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-3));  \ | ||||||
|     U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64));  \ |     U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(0));  \ | ||||||
|     U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64));  \ |     U_01 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-5));  \ | ||||||
|     U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64));  \ |     U_11 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-2));  \ | ||||||
|     U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64));  \ |     U_21 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(1));  \ | ||||||
|     UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \ |     UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \ | ||||||
|     UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \ |     UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \ | ||||||
|     UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \ |     UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \ | ||||||
| @@ -293,9 +295,9 @@ Author: Nils Meyer <nils.meyer@ur.de> | |||||||
|     UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ |     UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ | ||||||
|     UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ |     UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ | ||||||
|     UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ |     UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ | ||||||
|     U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -4 * 64));  \ |     U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-4));  \ | ||||||
|     U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -1 * 64));  \ |     U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-1));  \ | ||||||
|     U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 2 * 64));  \ |     U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(2));  \ | ||||||
| } | } | ||||||
| // MULT_2SPIN_BACKEND | // MULT_2SPIN_BACKEND | ||||||
| #define MULT_2SPIN_2_A64FXf  \ | #define MULT_2SPIN_2_A64FXf  \ | ||||||
| @@ -570,12 +572,12 @@ Author: Nils Meyer <nils.meyer@ur.de> | |||||||
|     result_31 = svdup_f32(0.); \ |     result_31 = svdup_f32(0.); \ | ||||||
|     result_32 = svdup_f32(0.);  |     result_32 = svdup_f32(0.);  | ||||||
|  |  | ||||||
| // PREFETCH_RESULT_L2_STORE (prefetch store to L2) | // PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) | ||||||
| #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base)  \ | #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base)  \ | ||||||
| { \ | { \ | ||||||
|     svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \ |     asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 0) : "memory" ); \ | ||||||
|     svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \ |     asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 1) : "memory" ); \ | ||||||
|     svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \ |     asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 2) : "memory" ); \ | ||||||
| } | } | ||||||
| // PREFETCH_RESULT_L1_STORE (prefetch store to L1) | // PREFETCH_RESULT_L1_STORE (prefetch store to L1) | ||||||
| #define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base)  \ | #define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base)  \ | ||||||
|   | |||||||
| @@ -46,6 +46,7 @@ Author: Nils Meyer <nils.meyer@ur.de> | |||||||
| #undef MULT_2SPIN_2 | #undef MULT_2SPIN_2 | ||||||
| #undef MAYBEPERM | #undef MAYBEPERM | ||||||
| #undef LOAD_CHI | #undef LOAD_CHI | ||||||
|  | #undef ZERO_PSI | ||||||
| #undef XP_PROJ | #undef XP_PROJ | ||||||
| #undef YP_PROJ | #undef YP_PROJ | ||||||
| #undef ZP_PROJ | #undef ZP_PROJ | ||||||
|   | |||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
		Reference in New Issue
	
	Block a user