mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-03 21:44:33 +00:00 
			
		
		
		
	Merge pull request #329 from nmeyer-ur/feature/a64fx-3
Revised dslash/dwf kernels for A64FX
This commit is contained in:
		@@ -38,9 +38,6 @@ Author: Nils Meyer  <nils.meyer@ur.de>  Regensburg University
 | 
			
		||||
// undefine everything related to kernels
 | 
			
		||||
#include <simd/Fujitsu_A64FX_undef.h>
 | 
			
		||||
 | 
			
		||||
// enable A64FX body
 | 
			
		||||
#define WILSONKERNELSASMBODYA64FX
 | 
			
		||||
//#pragma message("A64FX Dslash: WilsonKernelsAsmBodyA64FX.h")
 | 
			
		||||
 | 
			
		||||
    ///////////////////////////////////////////////////////////
 | 
			
		||||
    // If we are A64FX specialise the single precision routine
 | 
			
		||||
@@ -63,119 +60,89 @@ Author: Nils Meyer  <nils.meyer@ur.de>  Regensburg University
 | 
			
		||||
#define INTERIOR_AND_EXTERIOR
 | 
			
		||||
#undef INTERIOR
 | 
			
		||||
#undef EXTERIOR
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#undef INTERIOR_AND_EXTERIOR
 | 
			
		||||
#define INTERIOR
 | 
			
		||||
#undef EXTERIOR
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#undef INTERIOR_AND_EXTERIOR
 | 
			
		||||
#undef INTERIOR
 | 
			
		||||
#define EXTERIOR
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/////////////////////////////////////////////////////////////////
 | 
			
		||||
@@ -185,119 +152,89 @@ WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldV
 | 
			
		||||
#define INTERIOR_AND_EXTERIOR
 | 
			
		||||
#undef INTERIOR
 | 
			
		||||
#undef EXTERIOR
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#undef INTERIOR_AND_EXTERIOR
 | 
			
		||||
#define INTERIOR
 | 
			
		||||
#undef EXTERIOR
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#undef INTERIOR_AND_EXTERIOR
 | 
			
		||||
#undef INTERIOR
 | 
			
		||||
#define EXTERIOR
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
// undefine
 | 
			
		||||
@@ -330,119 +267,89 @@ WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFie
 | 
			
		||||
#define INTERIOR_AND_EXTERIOR
 | 
			
		||||
#undef INTERIOR
 | 
			
		||||
#undef EXTERIOR
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#undef INTERIOR_AND_EXTERIOR
 | 
			
		||||
#define INTERIOR
 | 
			
		||||
#undef EXTERIOR
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#undef INTERIOR_AND_EXTERIOR
 | 
			
		||||
#undef INTERIOR
 | 
			
		||||
#define EXTERIOR
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/////////////////////////////////////////////////////////////////
 | 
			
		||||
// XYZT vectorised, dag Kernel, double
 | 
			
		||||
@@ -451,124 +358,93 @@ WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldV
 | 
			
		||||
#define INTERIOR_AND_EXTERIOR
 | 
			
		||||
#undef INTERIOR
 | 
			
		||||
#undef EXTERIOR
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#undef INTERIOR_AND_EXTERIOR
 | 
			
		||||
#define INTERIOR
 | 
			
		||||
#undef EXTERIOR
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#undef INTERIOR_AND_EXTERIOR
 | 
			
		||||
#undef INTERIOR
 | 
			
		||||
#define EXTERIOR
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
			
		||||
template<> void
 | 
			
		||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
			
		||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
			
		||||
#if defined (WILSONKERNELSASMBODYA64FX)
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
// undefs
 | 
			
		||||
#undef WILSONKERNELSASMBODYA64FX
 | 
			
		||||
#include <simd/Fujitsu_A64FX_undef.h>
 | 
			
		||||
 | 
			
		||||
#endif //A64FXASM
 | 
			
		||||
 
 | 
			
		||||
@@ -25,6 +25,11 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
// GCC 10 messes up SVE instruction scheduling using -O3, but
 | 
			
		||||
// -O3 -fno-schedule-insns -fno-schedule-insns2 does wonders
 | 
			
		||||
// performance now is better than armclang 20.2
 | 
			
		||||
 | 
			
		||||
#ifdef KERNEL_DAG
 | 
			
		||||
#define DIR0_PROJ    XP_PROJ
 | 
			
		||||
#define DIR1_PROJ    YP_PROJ
 | 
			
		||||
@@ -97,7 +102,7 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University
 | 
			
		||||
    PROJ;							                        \
 | 
			
		||||
    MAYBEPERM(PERMUTE_DIR,perm);					        \
 | 
			
		||||
      } else {								                \
 | 
			
		||||
	LOAD_CHI(base);							                \
 | 
			
		||||
	  LOAD_CHI(base);							                \
 | 
			
		||||
      }									                    \
 | 
			
		||||
      base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++;	\
 | 
			
		||||
    MULT_2SPIN_1(Dir);					                    \
 | 
			
		||||
@@ -110,6 +115,11 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University
 | 
			
		||||
    }                                                       \
 | 
			
		||||
    RECON;								                    \
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
NB: using PREFETCH_GAUGE_L2(Dir+4); here results in a performance penalty,
 | 
			
		||||
    even though it was expected to improve performance
 | 
			
		||||
*/
 | 
			
		||||
 | 
			
		||||
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)	    \
 | 
			
		||||
  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
 | 
			
		||||
  PREFETCH1_CHIMU(base);						            \
 | 
			
		||||
@@ -126,73 +136,63 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University
 | 
			
		||||
 | 
			
		||||
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
 | 
			
		||||
      basep = st.GetPFInfo(nent,plocal); nent++;			\
 | 
			
		||||
      if ( local ) {							            \
 | 
			
		||||
    LOAD_CHIMU(base);                                       \
 | 
			
		||||
    LOAD_TABLE(PERMUTE_DIR);                                \
 | 
			
		||||
    PROJ;							                        \
 | 
			
		||||
    MAYBEPERM(PERMUTE_DIR,perm);					        \
 | 
			
		||||
      }else if ( st.same_node[Dir] ) {LOAD_CHI(base);}	    \
 | 
			
		||||
      base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++;	\
 | 
			
		||||
      if ( local || st.same_node[Dir] ) {				    \
 | 
			
		||||
    MULT_2SPIN_1(Dir);					                    \
 | 
			
		||||
    PREFETCH_CHIMU(base);                                   \
 | 
			
		||||
    /* PREFETCH_GAUGE_L1(NxtDir); */                        \
 | 
			
		||||
    MULT_2SPIN_2;					                        \
 | 
			
		||||
    if (s == 0) {                                           \
 | 
			
		||||
       if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
 | 
			
		||||
    }                                                       \
 | 
			
		||||
    RECON;								                    \
 | 
			
		||||
    PREFETCH_CHIMU_L2(basep);                               \
 | 
			
		||||
      } else { PREFETCH_CHIMU(base); }								                    \
 | 
			
		||||
      if ( local ) {							\
 | 
			
		||||
  LOAD_CHIMU(base);                                       \
 | 
			
		||||
  LOAD_TABLE(PERMUTE_DIR);                                \
 | 
			
		||||
  PROJ;							                        \
 | 
			
		||||
  MAYBEPERM(PERMUTE_DIR,perm);					        \
 | 
			
		||||
      }else if ( st.same_node[Dir] ) {LOAD_CHI(base);}			\
 | 
			
		||||
      if ( local || st.same_node[Dir] ) {				\
 | 
			
		||||
  MULT_2SPIN_1(Dir);					                    \
 | 
			
		||||
  MULT_2SPIN_2;					                        \
 | 
			
		||||
  RECON;								\
 | 
			
		||||
      }									\
 | 
			
		||||
  base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++;	\
 | 
			
		||||
  PREFETCH_CHIMU(base);						\
 | 
			
		||||
  PREFETCH_CHIMU_L2(basep);                               \
 | 
			
		||||
 | 
			
		||||
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
 | 
			
		||||
  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;		\
 | 
			
		||||
  PREFETCH1_CHIMU(base);						\
 | 
			
		||||
  { ZERO_PSI; }								\
 | 
			
		||||
  ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
 | 
			
		||||
 | 
			
		||||
#define RESULT(base,basep) SAVE_RESULT(base,basep);
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Post comms kernel
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
#ifdef EXTERIOR
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
 | 
			
		||||
  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
 | 
			
		||||
  if((!local)&&(!st.same_node[Dir]) ) {					    \
 | 
			
		||||
    LOAD_CHI(base);							                \
 | 
			
		||||
  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;		\
 | 
			
		||||
  if((!local)&&(!st.same_node[Dir]) ) {					\
 | 
			
		||||
    LOAD_CHI(base);							\
 | 
			
		||||
    MULT_2SPIN_1(Dir);					                    \
 | 
			
		||||
    PREFETCH_CHIMU(base);                                   \
 | 
			
		||||
    /* PREFETCH_GAUGE_L1(NxtDir); */                        \
 | 
			
		||||
    MULT_2SPIN_2;					                        \
 | 
			
		||||
    if (s == 0) {                                           \
 | 
			
		||||
      if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
 | 
			
		||||
    }                                                       \
 | 
			
		||||
    RECON;								                    \
 | 
			
		||||
    nmu++;								                    \
 | 
			
		||||
    RECON;								\
 | 
			
		||||
    nmu++;								\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)	    \
 | 
			
		||||
  nmu=0;								                    \
 | 
			
		||||
  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;\
 | 
			
		||||
  if((!local)&&(!st.same_node[Dir]) ) {					    \
 | 
			
		||||
    LOAD_CHI(base);							                \
 | 
			
		||||
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
 | 
			
		||||
  nmu=0;								\
 | 
			
		||||
  { ZERO_PSI;}								\
 | 
			
		||||
  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;		\
 | 
			
		||||
  if((!local)&&(!st.same_node[Dir]) ) {					\
 | 
			
		||||
    LOAD_CHI(base);							\
 | 
			
		||||
    MULT_2SPIN_1(Dir);					                    \
 | 
			
		||||
    PREFETCH_CHIMU(base);                                   \
 | 
			
		||||
    /* PREFETCH_GAUGE_L1(NxtDir); */                        \
 | 
			
		||||
    MULT_2SPIN_2;					                        \
 | 
			
		||||
    if (s == 0) {                                           \
 | 
			
		||||
      if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
 | 
			
		||||
    }                                                       \
 | 
			
		||||
    RECON;								                    \
 | 
			
		||||
    nmu++;								                    \
 | 
			
		||||
    RECON;								\
 | 
			
		||||
    nmu++;								\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);}
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
{
 | 
			
		||||
  int nmu;
 | 
			
		||||
  int local,perm, ptype;
 | 
			
		||||
@@ -209,7 +209,6 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University
 | 
			
		||||
    int ssn=ssU+1;     if(ssn>=nmax) ssn=0;
 | 
			
		||||
    //    int sUn=lo.Reorder(ssn);
 | 
			
		||||
    int sUn=ssn;
 | 
			
		||||
    LOCK_GAUGE(0);
 | 
			
		||||
#else
 | 
			
		||||
    int sU =ssU;
 | 
			
		||||
    int ssn=ssU+1;     if(ssn>=nmax) ssn=0;
 | 
			
		||||
@@ -295,6 +294,11 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University
 | 
			
		||||
      std::cout << "----------------------------------------------------" << std::endl;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
      // DC ZVA test
 | 
			
		||||
      // { uint64_t basestore = (uint64_t)&out[ss];
 | 
			
		||||
      //   PREFETCH_RESULT_L2_STORE(basestore); }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
      ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON);
 | 
			
		||||
 | 
			
		||||
#ifdef SHOW
 | 
			
		||||
@@ -308,6 +312,11 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University
 | 
			
		||||
      std::cout << "----------------------------------------------------" << std::endl;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
      // DC ZVA test
 | 
			
		||||
      //{ uint64_t basestore = (uint64_t)&out[ss];
 | 
			
		||||
      //  PREFETCH_RESULT_L2_STORE(basestore); }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
      ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON);
 | 
			
		||||
 | 
			
		||||
#ifdef SHOW
 | 
			
		||||
@@ -321,6 +330,11 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University
 | 
			
		||||
      std::cout << "----------------------------------------------------" << std::endl;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
      // DC ZVA test
 | 
			
		||||
      //{ uint64_t basestore = (uint64_t)&out[ss];
 | 
			
		||||
      //  PREFETCH_RESULT_L2_STORE(basestore); }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
      ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON);
 | 
			
		||||
 | 
			
		||||
#ifdef SHOW
 | 
			
		||||
@@ -341,6 +355,7 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University
 | 
			
		||||
      base = (uint64_t) &out[ss];
 | 
			
		||||
      basep= st.GetPFInfo(nent,plocal); ent++;
 | 
			
		||||
      basep = (uint64_t) &out[ssn];
 | 
			
		||||
      //PREFETCH_RESULT_L1_STORE(base);
 | 
			
		||||
      RESULT(base,basep);
 | 
			
		||||
 | 
			
		||||
#ifdef SHOW
 | 
			
		||||
 
 | 
			
		||||
@@ -1,779 +0,0 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
    Source file: Fujitsu_A64FX_asm_double.h
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2020
 | 
			
		||||
 | 
			
		||||
Author: Nils Meyer <nils.meyer@ur.de>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
#define LOAD_CHIMU(base)               LOAD_CHIMU_INTERLEAVED_A64FXd(base)  
 | 
			
		||||
#define PREFETCH_CHIMU_L1(A)           PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A)  
 | 
			
		||||
#define PREFETCH_GAUGE_L1(A)           PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)  
 | 
			
		||||
#define PREFETCH_CHIMU_L2(A)           PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A)  
 | 
			
		||||
#define PREFETCH_GAUGE_L2(A)           PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A)  
 | 
			
		||||
#define PF_GAUGE(A)  
 | 
			
		||||
#define PREFETCH_RESULT_L2_STORE(A)    PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(A)  
 | 
			
		||||
#define PREFETCH_RESULT_L1_STORE(A)    PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(A)  
 | 
			
		||||
#define PREFETCH1_CHIMU(A)             PREFETCH_CHIMU_L1(A)  
 | 
			
		||||
#define PREFETCH_CHIMU(A)              PREFETCH_CHIMU_L1(A)  
 | 
			
		||||
#define LOCK_GAUGE(A)  
 | 
			
		||||
#define UNLOCK_GAUGE(A)  
 | 
			
		||||
#define MASK_REGS                      DECLARATIONS_A64FXd  
 | 
			
		||||
#define SAVE_RESULT(A,B)               RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B)  
 | 
			
		||||
#define MULT_2SPIN_1(Dir)              MULT_2SPIN_1_A64FXd(Dir)  
 | 
			
		||||
#define MULT_2SPIN_2                   MULT_2SPIN_2_A64FXd  
 | 
			
		||||
#define LOAD_CHI(base)                 LOAD_CHI_A64FXd(base)  
 | 
			
		||||
#define ADD_RESULT(base,basep)         LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)  
 | 
			
		||||
#define XP_PROJ                        XP_PROJ_A64FXd  
 | 
			
		||||
#define YP_PROJ                        YP_PROJ_A64FXd  
 | 
			
		||||
#define ZP_PROJ                        ZP_PROJ_A64FXd  
 | 
			
		||||
#define TP_PROJ                        TP_PROJ_A64FXd  
 | 
			
		||||
#define XM_PROJ                        XM_PROJ_A64FXd  
 | 
			
		||||
#define YM_PROJ                        YM_PROJ_A64FXd  
 | 
			
		||||
#define ZM_PROJ                        ZM_PROJ_A64FXd  
 | 
			
		||||
#define TM_PROJ                        TM_PROJ_A64FXd  
 | 
			
		||||
#define XP_RECON                       XP_RECON_A64FXd  
 | 
			
		||||
#define XM_RECON                       XM_RECON_A64FXd  
 | 
			
		||||
#define XM_RECON_ACCUM                 XM_RECON_ACCUM_A64FXd  
 | 
			
		||||
#define YM_RECON_ACCUM                 YM_RECON_ACCUM_A64FXd  
 | 
			
		||||
#define ZM_RECON_ACCUM                 ZM_RECON_ACCUM_A64FXd  
 | 
			
		||||
#define TM_RECON_ACCUM                 TM_RECON_ACCUM_A64FXd  
 | 
			
		||||
#define XP_RECON_ACCUM                 XP_RECON_ACCUM_A64FXd  
 | 
			
		||||
#define YP_RECON_ACCUM                 YP_RECON_ACCUM_A64FXd  
 | 
			
		||||
#define ZP_RECON_ACCUM                 ZP_RECON_ACCUM_A64FXd  
 | 
			
		||||
#define TP_RECON_ACCUM                 TP_RECON_ACCUM_A64FXd  
 | 
			
		||||
#define PERMUTE_DIR0                   0  
 | 
			
		||||
#define PERMUTE_DIR1                   1  
 | 
			
		||||
#define PERMUTE_DIR2                   2  
 | 
			
		||||
#define PERMUTE_DIR3                   3  
 | 
			
		||||
#define PERMUTE                        PERMUTE_A64FXd;  
 | 
			
		||||
#define LOAD_TABLE(Dir)                if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; }  
 | 
			
		||||
#define MAYBEPERM(Dir,perm)            if (Dir != 3) { if (perm) { PERMUTE; } }  
 | 
			
		||||
// DECLARATIONS
 | 
			
		||||
// DECLARATIONS_A64FXd: per-kernel setup for the double-precision A64FX path.
//  - Defines `lut`, the table of 64-bit lane indices that LOAD_TABLE0..3 load
//    into z30 and that PERMUTE_A64FXd passes to `tbl`.
//  - Writes zero to z31; MULT_2SPIN_1_A64FXd uses z31 as the movprfx source
//    when it initialises its accumulators z18..z23.
// Every row of `lut` must be a permutation of 0..7: a `tbl` index outside the
// vector makes the destination lane zero. Rows 0..2 exchange lane blocks of
// width 4, 2 and 1; row 3 is the identity (it previously skipped lane 3 and
// contained the out-of-range index 8).
#define DECLARATIONS_A64FXd  \
    const uint64_t lut[4][8] = { \
        {4, 5, 6, 7, 0, 1, 2, 3}, \
        {2, 3, 0, 1, 6, 7, 4, 5}, \
        {1, 0, 3, 2, 5, 4, 7, 6}, \
        {0, 1, 2, 3, 4, 5, 6, 7} };\
asm ( \
    "fmov z31.d , 0 \n\t" \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); 
 | 
			
		||||
 | 
			
		||||
// RESULT: store z0-z11 to 12 consecutive vector slots (offsets -6..5 mul vl from base + 2*3*64)
 | 
			
		||||
#define RESULT_A64FXd(base)  \
 | 
			
		||||
{ \
 | 
			
		||||
asm ( \
 | 
			
		||||
    "str z0, [%[storeptr], -6, mul vl] \n\t" \
 | 
			
		||||
    "str z1, [%[storeptr], -5, mul vl] \n\t" \
 | 
			
		||||
    "str z2, [%[storeptr], -4, mul vl] \n\t" \
 | 
			
		||||
    "str z3, [%[storeptr], -3, mul vl] \n\t" \
 | 
			
		||||
    "str z4, [%[storeptr], -2, mul vl] \n\t" \
 | 
			
		||||
    "str z5, [%[storeptr], -1, mul vl] \n\t" \
 | 
			
		||||
    "str z6, [%[storeptr], 0, mul vl] \n\t" \
 | 
			
		||||
    "str z7, [%[storeptr], 1, mul vl] \n\t" \
 | 
			
		||||
    "str z8, [%[storeptr], 2, mul vl] \n\t" \
 | 
			
		||||
    "str z9, [%[storeptr], 3, mul vl] \n\t" \
 | 
			
		||||
    "str z10, [%[storeptr], 4, mul vl] \n\t" \
 | 
			
		||||
    "str z11, [%[storeptr], 5, mul vl] \n\t" \
 | 
			
		||||
    :  \
 | 
			
		||||
    : [storeptr] "r" (base + 2 * 3 * 64) \
 | 
			
		||||
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
 | 
			
		||||
); \
 | 
			
		||||
}
 | 
			
		||||
// PREFETCH_CHIMU_L2 (prefetch to L2)
 | 
			
		||||
#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base)  \
 | 
			
		||||
{ \
 | 
			
		||||
asm ( \
 | 
			
		||||
    "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
 | 
			
		||||
    "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
 | 
			
		||||
    "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
 | 
			
		||||
    :  \
 | 
			
		||||
    : [fetchptr] "r" (base) \
 | 
			
		||||
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
 | 
			
		||||
); \
 | 
			
		||||
}
 | 
			
		||||
// PREFETCH_CHIMU_L1 (prefetch to L1)
 | 
			
		||||
#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base)  \
 | 
			
		||||
{ \
 | 
			
		||||
asm ( \
 | 
			
		||||
    "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
 | 
			
		||||
    "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
 | 
			
		||||
    "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
 | 
			
		||||
    :  \
 | 
			
		||||
    : [fetchptr] "r" (base) \
 | 
			
		||||
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
 | 
			
		||||
); \
 | 
			
		||||
}
 | 
			
		||||
// PREFETCH_GAUGE_L2 (prefetch to L2)
 | 
			
		||||
#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A)  \
 | 
			
		||||
{ \
 | 
			
		||||
    const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \
 | 
			
		||||
asm ( \
 | 
			
		||||
    "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \
 | 
			
		||||
    "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
 | 
			
		||||
    "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
 | 
			
		||||
    "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
 | 
			
		||||
    "prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \
 | 
			
		||||
    "prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \
 | 
			
		||||
    "prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \
 | 
			
		||||
    "prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \
 | 
			
		||||
    "prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \
 | 
			
		||||
    :  \
 | 
			
		||||
    : [fetchptr] "r" (baseU) \
 | 
			
		||||
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
 | 
			
		||||
); \
 | 
			
		||||
}
 | 
			
		||||
// PREFETCH_GAUGE_L1 (prefetch to L1)
 | 
			
		||||
#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)  \
 | 
			
		||||
{ \
 | 
			
		||||
    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
 | 
			
		||||
asm ( \
 | 
			
		||||
    "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
 | 
			
		||||
    "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
 | 
			
		||||
    "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
 | 
			
		||||
    :  \
 | 
			
		||||
    : [fetchptr] "r" (baseU) \
 | 
			
		||||
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
 | 
			
		||||
); \
 | 
			
		||||
}
 | 
			
		||||
// LOAD_CHI: load six consecutive vectors starting at base into z12-z17
 | 
			
		||||
#define LOAD_CHI_A64FXd(base)  \
 | 
			
		||||
{ \
 | 
			
		||||
asm ( \
 | 
			
		||||
    "ldr z12, [%[fetchptr], 0, mul vl] \n\t" \
 | 
			
		||||
    "ldr z13, [%[fetchptr], 1, mul vl] \n\t" \
 | 
			
		||||
    "ldr z14, [%[fetchptr], 2, mul vl] \n\t" \
 | 
			
		||||
    "ldr z15, [%[fetchptr], 3, mul vl] \n\t" \
 | 
			
		||||
    "ldr z16, [%[fetchptr], 4, mul vl] \n\t" \
 | 
			
		||||
    "ldr z17, [%[fetchptr], 5, mul vl] \n\t" \
 | 
			
		||||
    :  \
 | 
			
		||||
    : [fetchptr] "r" (base) \
 | 
			
		||||
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
 | 
			
		||||
); \
 | 
			
		||||
}
 | 
			
		||||
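// LOAD_CHIMU (interleaved): set p5 all-true and load 12 vectors into z12-z23, issuing the loads in interleaved register order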
 | 
			
		||||
#define LOAD_CHIMU_INTERLEAVED_A64FXd(base)  \
 | 
			
		||||
{ \
 | 
			
		||||
asm ( \
 | 
			
		||||
    "ptrue p5.d \n\t" \
 | 
			
		||||
    "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
 | 
			
		||||
    "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
 | 
			
		||||
    "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
 | 
			
		||||
    "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
 | 
			
		||||
    "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
 | 
			
		||||
    "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
 | 
			
		||||
    "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
 | 
			
		||||
    "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
 | 
			
		||||
    "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
 | 
			
		||||
    "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
 | 
			
		||||
    "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
 | 
			
		||||
    "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
 | 
			
		||||
    :  \
 | 
			
		||||
    : [fetchptr] "r" (base + 2 * 3 * 64) \
 | 
			
		||||
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
 | 
			
		||||
); \
 | 
			
		||||
}
 | 
			
		||||
// LOAD_CHIMU_0213: load in[offset] into z12-z23, alternating between the z12-z17 and z18-z23 halves
 | 
			
		||||
#define LOAD_CHIMU_0213_A64FXd  \
 | 
			
		||||
{ \
 | 
			
		||||
    const SiteSpinor & ref(in[offset]); \
 | 
			
		||||
asm ( \
 | 
			
		||||
    "ptrue p5.d \n\t" \
 | 
			
		||||
    "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
 | 
			
		||||
    "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
 | 
			
		||||
    "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
 | 
			
		||||
    "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
 | 
			
		||||
    "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
 | 
			
		||||
    "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
 | 
			
		||||
    "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
 | 
			
		||||
    "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
 | 
			
		||||
    "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
 | 
			
		||||
    "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
 | 
			
		||||
    "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
 | 
			
		||||
    "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
 | 
			
		||||
    :  \
 | 
			
		||||
    : [fetchptr] "r" (&ref[2][0]) \
 | 
			
		||||
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
 | 
			
		||||
); \
 | 
			
		||||
}
 | 
			
		||||
// LOAD_CHIMU_0312: load in[offset] into z12-z23, pairing z12-z14 with z21-z23 and z15-z17 with z18-z20
 | 
			
		||||
#define LOAD_CHIMU_0312_A64FXd  \
 | 
			
		||||
{ \
 | 
			
		||||
    const SiteSpinor & ref(in[offset]); \
 | 
			
		||||
asm ( \
 | 
			
		||||
    "ptrue p5.d \n\t" \
 | 
			
		||||
    "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
 | 
			
		||||
    "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
 | 
			
		||||
    "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
 | 
			
		||||
    "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
 | 
			
		||||
    "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
 | 
			
		||||
    "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
 | 
			
		||||
    "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
 | 
			
		||||
    "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
 | 
			
		||||
    "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
 | 
			
		||||
    "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
 | 
			
		||||
    "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
 | 
			
		||||
    "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
 | 
			
		||||
    :  \
 | 
			
		||||
    : [fetchptr] "r" (&ref[2][0]) \
 | 
			
		||||
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
 | 
			
		||||
); \
 | 
			
		||||
}
 | 
			
		||||
// LOAD_TABLE0: load the lane-index vector at lut + 0 vector lengths into z30 (consumed by PERMUTE)
 | 
			
		||||
#define LOAD_TABLE0  \
 | 
			
		||||
asm ( \
 | 
			
		||||
    "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
 | 
			
		||||
    :  \
 | 
			
		||||
    : [tableptr] "r" (&lut[0]),[index] "i" (0) \
 | 
			
		||||
    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
 | 
			
		||||
); 
 | 
			
		||||
 | 
			
		||||
// LOAD_TABLE1: load the lane-index vector at lut + 1 vector length into z30 (consumed by PERMUTE)
 | 
			
		||||
#define LOAD_TABLE1  \
 | 
			
		||||
asm ( \
 | 
			
		||||
    "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
 | 
			
		||||
    :  \
 | 
			
		||||
    : [tableptr] "r" (&lut[0]),[index] "i" (1) \
 | 
			
		||||
    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
 | 
			
		||||
); 
 | 
			
		||||
 | 
			
		||||
// LOAD_TABLE2: load the lane-index vector at lut + 2 vector lengths into z30 (consumed by PERMUTE)
 | 
			
		||||
#define LOAD_TABLE2  \
 | 
			
		||||
asm ( \
 | 
			
		||||
    "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
 | 
			
		||||
    :  \
 | 
			
		||||
    : [tableptr] "r" (&lut[0]),[index] "i" (2) \
 | 
			
		||||
    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
 | 
			
		||||
); 
 | 
			
		||||
 | 
			
		||||
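// LOAD_TABLE3: load the lane-index vector at lut + 3 vector lengths into z30 (not selected by LOAD_TABLE(Dir), which only handles Dir 0..2)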
 | 
			
		||||
#define LOAD_TABLE3  \
 | 
			
		||||
asm ( \
 | 
			
		||||
    "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
 | 
			
		||||
    :  \
 | 
			
		||||
    : [tableptr] "r" (&lut[0]),[index] "i" (3) \
 | 
			
		||||
    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
 | 
			
		||||
); 
 | 
			
		||||
 | 
			
		||||
// PERMUTE: permute the 64-bit lanes of z12-z17 in place using the indices held in z30
 | 
			
		||||
#define PERMUTE_A64FXd  \
 | 
			
		||||
asm ( \
 | 
			
		||||
    "tbl z12.d, { z12.d }, z30.d \n\t"  \
 | 
			
		||||
    "tbl z13.d, { z13.d }, z30.d \n\t"  \
 | 
			
		||||
    "tbl z14.d, { z14.d }, z30.d \n\t"  \
 | 
			
		||||
    "tbl z15.d, { z15.d }, z30.d \n\t"  \
 | 
			
		||||
    "tbl z16.d, { z16.d }, z30.d \n\t"  \
 | 
			
		||||
    "tbl z17.d, { z17.d }, z30.d \n\t"  \
 | 
			
		||||
    :  \
 | 
			
		||||
    :  \
 | 
			
		||||
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
 | 
			
		||||
); 
 | 
			
		||||
 | 
			
		||||
// LOAD_GAUGE
 | 
			
		||||
// LOAD_GAUGE: set p5 all-true and load six vectors of the gauge link
// U[sU](A) into z24..z29 (offsets -6, -3, 0, -5, -2, 1 mul vl relative to
// baseU + 2*3*64).
// Requires `U`, `sU` and `A` to be visible at the expansion site.
// NOTE(review): `A` is not a parameter of this macro, whereas the sibling
// macros PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) and MULT_2SPIN_1_A64FXd(A) take
// it as an argument - confirm whether that is intended.
// The `ref`/`baseU` temporaries are scoped inside the braces, as in those
// sibling macros, so the macro can be expanded more than once per scope
// without redeclaring them.
#define LOAD_GAUGE  \
{ \
    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
asm ( \
    "ptrue p5.d \n\t" \
    "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \
    "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \
    "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \
    "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \
    "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \
    "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \
    :  \
    : [fetchptr] "r" (baseU + 2 * 3 * 64) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
 | 
			
		||||
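// MULT_2SPIN_1: first stage of the link multiply; loads U[sU](A), initialises z18-z23 from z31 and accumulates the z12/z15 terms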
 | 
			
		||||
#define MULT_2SPIN_1_A64FXd(A)  \
 | 
			
		||||
{ \
 | 
			
		||||
    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
 | 
			
		||||
asm ( \
 | 
			
		||||
    "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \
 | 
			
		||||
    "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \
 | 
			
		||||
    "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \
 | 
			
		||||
    "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \
 | 
			
		||||
    "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \
 | 
			
		||||
    "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \
 | 
			
		||||
    "movprfx z18.d, p5/m, z31.d \n\t" \
 | 
			
		||||
    "fcmla z18.d, p5/m, z24.d, z12.d, 0 \n\t" \
 | 
			
		||||
    "movprfx z21.d, p5/m, z31.d \n\t" \
 | 
			
		||||
    "fcmla z21.d, p5/m, z24.d, z15.d, 0 \n\t" \
 | 
			
		||||
    "movprfx z19.d, p5/m, z31.d \n\t" \
 | 
			
		||||
    "fcmla z19.d, p5/m, z25.d, z12.d, 0 \n\t" \
 | 
			
		||||
    "movprfx z22.d, p5/m, z31.d \n\t" \
 | 
			
		||||
    "fcmla z22.d, p5/m, z25.d, z15.d, 0 \n\t" \
 | 
			
		||||
    "movprfx z20.d, p5/m, z31.d \n\t" \
 | 
			
		||||
    "fcmla z20.d, p5/m, z26.d, z12.d, 0 \n\t" \
 | 
			
		||||
    "movprfx z23.d, p5/m, z31.d \n\t" \
 | 
			
		||||
    "fcmla z23.d, p5/m, z26.d, z15.d, 0 \n\t" \
 | 
			
		||||
    "fcmla z18.d, p5/m, z24.d, z12.d, 90 \n\t" \
 | 
			
		||||
    "fcmla z21.d, p5/m, z24.d, z15.d, 90 \n\t" \
 | 
			
		||||
    "fcmla z19.d, p5/m, z25.d, z12.d, 90 \n\t" \
 | 
			
		||||
    "fcmla z22.d, p5/m, z25.d, z15.d, 90 \n\t" \
 | 
			
		||||
    "fcmla z20.d, p5/m, z26.d, z12.d, 90 \n\t" \
 | 
			
		||||
    "fcmla z23.d, p5/m, z26.d, z15.d, 90 \n\t" \
 | 
			
		||||
    "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \
 | 
			
		||||
    "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \
 | 
			
		||||
    "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \
 | 
			
		||||
    :  \
 | 
			
		||||
    : [fetchptr] "r" (baseU + 2 * 3 * 64) \
 | 
			
		||||
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
 | 
			
		||||
); \
 | 
			
		||||
}
 | 
			
		||||
// MULT_2SPIN_BACKEND
// Second half of the MULT_2SPIN sequence: register-only arithmetic, no operands.
// Each accumulator in z18..z23 receives FCMLA pairs (rotation 0 followed by
// rotation 90, which together add one full complex product) under predicate p5:
//   z18/z19/z20 += z27/z28/z29 * z13  +  z24/z25/z26 * z14
//   z21/z22/z23 += z27/z28/z29 * z16  +  z24/z25/z26 * z17
// The accumulators, z24..z29 and p5 must still hold the values left by the
// preceding asm statement (MULT_2SPIN_1_A64FXd loads z24..z29 and starts the sums).
// The instruction order is kept exactly as scheduled by the author.
#define MULT_2SPIN_2_A64FXd  \
{ \
asm ( \
    "fcmla z18.d, p5/m, z27.d, z13.d, 0 \n\t" \
    "fcmla z21.d, p5/m, z27.d, z16.d, 0 \n\t" \
    "fcmla z19.d, p5/m, z28.d, z13.d, 0 \n\t" \
    "fcmla z22.d, p5/m, z28.d, z16.d, 0 \n\t" \
    "fcmla z20.d, p5/m, z29.d, z13.d, 0 \n\t" \
    "fcmla z23.d, p5/m, z29.d, z16.d, 0 \n\t" \
    "fcmla z18.d, p5/m, z27.d, z13.d, 90 \n\t" \
    "fcmla z21.d, p5/m, z27.d, z16.d, 90 \n\t" \
    "fcmla z19.d, p5/m, z28.d, z13.d, 90 \n\t" \
    "fcmla z22.d, p5/m, z28.d, z16.d, 90 \n\t" \
    "fcmla z20.d, p5/m, z29.d, z13.d, 90 \n\t" \
    "fcmla z23.d, p5/m, z29.d, z16.d, 90 \n\t" \
    "fcmla z18.d, p5/m, z24.d, z14.d, 0 \n\t" \
    "fcmla z21.d, p5/m, z24.d, z17.d, 0 \n\t" \
    "fcmla z19.d, p5/m, z25.d, z14.d, 0 \n\t" \
    "fcmla z22.d, p5/m, z25.d, z17.d, 0 \n\t" \
    "fcmla z20.d, p5/m, z26.d, z14.d, 0 \n\t" \
    "fcmla z23.d, p5/m, z26.d, z17.d, 0 \n\t" \
    "fcmla z18.d, p5/m, z24.d, z14.d, 90 \n\t" \
    "fcmla z21.d, p5/m, z24.d, z17.d, 90 \n\t" \
    "fcmla z19.d, p5/m, z25.d, z14.d, 90 \n\t" \
    "fcmla z22.d, p5/m, z25.d, z17.d, 90 \n\t" \
    "fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \
    "fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}

// XP_PROJ
// Register-only update under predicate p5 (no operands, no memory access):
//   z12..z14 = z12..z14 + i * z21..z23
//   z15..z17 = z15..z17 + i * z18..z20
// (FCADD with rotation 90 adds i times the second source.)
// All registers are expected to be live from earlier asm statements.
#define XP_PROJ_A64FXd  \
{ \
asm ( \
    "fcadd z12.d, p5/m, z12.d, z21.d, 90 \n\t" \
    "fcadd z13.d, p5/m, z13.d, z22.d, 90 \n\t" \
    "fcadd z14.d, p5/m, z14.d, z23.d, 90 \n\t" \
    "fcadd z15.d, p5/m, z15.d, z18.d, 90 \n\t" \
    "fcadd z16.d, p5/m, z16.d, z19.d, 90 \n\t" \
    "fcadd z17.d, p5/m, z17.d, z20.d, 90 \n\t" \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}

// XP_RECON
// Initialises z0..z11 from z18..z23 under predicate p5 (no operands):
//   z6..z8  = z31 - i * z21..z23      (movprfx copies z31, FCADD #270 subtracts i*src)
//   z9..z11 = z31 - i * z18..z20
//   z0..z5  = z18..z23
// NOTE(review): z31 is presumably the zero vector set up by the DECLARATIONS
// macro of this precision -- confirm. Each movprfx must stay immediately in
// front of the fcadd it prefixes.
#define XP_RECON_A64FXd  \
asm ( \
    "movprfx z6.d, p5/m, z31.d \n\t" \
    "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \
    "movprfx z7.d, p5/m, z31.d \n\t" \
    "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \
    "movprfx z8.d, p5/m, z31.d \n\t" \
    "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \
    "movprfx z9.d, p5/m, z31.d \n\t" \
    "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \
    "movprfx z10.d, p5/m, z31.d \n\t" \
    "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \
    "movprfx z11.d, p5/m, z31.d \n\t" \
    "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \
    "mov z0.d, p5/m, z18.d \n\t" \
    "mov z1.d, p5/m, z19.d \n\t" \
    "mov z2.d, p5/m, z20.d \n\t" \
    "mov z3.d, p5/m, z21.d \n\t" \
    "mov z4.d, p5/m, z22.d \n\t" \
    "mov z5.d, p5/m, z23.d \n\t" \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);

// XP_RECON_ACCUM
// Accumulates z18..z23 into z0..z11 under predicate p5 (no operands):
//   z0..z5  += z18..z23
//   z9..z11 -= i * z18..z20           (FCADD #270 subtracts i times the source)
//   z6..z8  -= i * z21..z23
#define XP_RECON_ACCUM_A64FXd  \
asm ( \
    "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \
    "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \
    "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \
    "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \
    "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \
    "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \
    "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \
    "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \
    "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \
    "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \
    "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \
    "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);

// YP_PROJ
// Register-only update under predicate p5 (no operands, no memory access):
//   z12..z14 -= z21..z23
//   z15..z17 += z18..z20
// All registers are expected to be live from earlier asm statements.
#define YP_PROJ_A64FXd  \
{ \
asm ( \
    "fsub z12.d, p5/m, z12.d, z21.d \n\t" \
    "fsub z13.d, p5/m, z13.d, z22.d \n\t" \
    "fsub z14.d, p5/m, z14.d, z23.d \n\t" \
    "fadd z15.d, p5/m, z15.d, z18.d \n\t"  \
    "fadd z16.d, p5/m, z16.d, z19.d \n\t"  \
    "fadd z17.d, p5/m, z17.d, z20.d \n\t"  \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}

// ZP_PROJ
// Register-only update under predicate p5 (no operands, no memory access):
//   z12..z14 = z12..z14 + i * z18..z20   (FCADD #90)
//   z15..z17 = z15..z17 - i * z21..z23   (FCADD #270)
// All registers are expected to be live from earlier asm statements.
#define ZP_PROJ_A64FXd  \
{ \
asm ( \
    "fcadd z12.d, p5/m, z12.d, z18.d, 90 \n\t" \
    "fcadd z13.d, p5/m, z13.d, z19.d, 90 \n\t" \
    "fcadd z14.d, p5/m, z14.d, z20.d, 90 \n\t" \
    "fcadd z15.d, p5/m, z15.d, z21.d, 270 \n\t" \
    "fcadd z16.d, p5/m, z16.d, z22.d, 270 \n\t" \
    "fcadd z17.d, p5/m, z17.d, z23.d, 270 \n\t" \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}

// TP_PROJ
// Register-only update under predicate p5 (no operands, no memory access):
//   z12..z17 += z18..z23   (element-wise, register k paired with register k+6)
// All registers are expected to be live from earlier asm statements.
#define TP_PROJ_A64FXd  \
{ \
asm ( \
    "fadd z12.d, p5/m, z12.d, z18.d \n\t"  \
    "fadd z13.d, p5/m, z13.d, z19.d \n\t"  \
    "fadd z14.d, p5/m, z14.d, z20.d \n\t"  \
    "fadd z15.d, p5/m, z15.d, z21.d \n\t"  \
    "fadd z16.d, p5/m, z16.d, z22.d \n\t"  \
    "fadd z17.d, p5/m, z17.d, z23.d \n\t"  \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}

// XM_PROJ
// Register-only update under predicate p5 (no operands, no memory access):
//   z12..z14 = z12..z14 - i * z21..z23
//   z15..z17 = z15..z17 - i * z18..z20
// (FCADD with rotation 270 subtracts i times the second source.)
// All registers are expected to be live from earlier asm statements.
#define XM_PROJ_A64FXd  \
{ \
asm ( \
    "fcadd z12.d, p5/m, z12.d, z21.d, 270 \n\t" \
    "fcadd z13.d, p5/m, z13.d, z22.d, 270 \n\t" \
    "fcadd z14.d, p5/m, z14.d, z23.d, 270 \n\t" \
    "fcadd z15.d, p5/m, z15.d, z18.d, 270 \n\t" \
    "fcadd z16.d, p5/m, z16.d, z19.d, 270 \n\t" \
    "fcadd z17.d, p5/m, z17.d, z20.d, 270 \n\t" \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}

// XM_RECON
// Initialises z0..z11 from z18..z23 under predicate p5 (no operands):
//   z6..z8  = z31 + i * z21..z23      (movprfx copies z31, FCADD #90 adds i*src)
//   z9..z11 = z31 + i * z18..z20
//   z0..z5  = z18..z23
// NOTE(review): z31 is presumably the zero vector set up by the DECLARATIONS
// macro of this precision -- confirm. Each movprfx must stay immediately in
// front of the fcadd it prefixes.
#define XM_RECON_A64FXd  \
asm ( \
    "movprfx z6.d, p5/m, z31.d \n\t" \
    "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \
    "movprfx z7.d, p5/m, z31.d \n\t" \
    "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \
    "movprfx z8.d, p5/m, z31.d \n\t" \
    "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \
    "movprfx z9.d, p5/m, z31.d \n\t" \
    "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \
    "movprfx z10.d, p5/m, z31.d \n\t" \
    "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \
    "movprfx z11.d, p5/m, z31.d \n\t" \
    "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \
    "mov z0.d, p5/m, z18.d \n\t" \
    "mov z1.d, p5/m, z19.d \n\t" \
    "mov z2.d, p5/m, z20.d \n\t" \
    "mov z3.d, p5/m, z21.d \n\t" \
    "mov z4.d, p5/m, z22.d \n\t" \
    "mov z5.d, p5/m, z23.d \n\t" \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);

// YM_PROJ
// Register-only update under predicate p5 (no operands, no memory access):
//   z12..z14 += z21..z23
//   z15..z17 -= z18..z20
// All registers are expected to be live from earlier asm statements.
#define YM_PROJ_A64FXd  \
{ \
asm ( \
    "fadd z12.d, p5/m, z12.d, z21.d \n\t"  \
    "fadd z13.d, p5/m, z13.d, z22.d \n\t"  \
    "fadd z14.d, p5/m, z14.d, z23.d \n\t"  \
    "fsub z15.d, p5/m, z15.d, z18.d \n\t" \
    "fsub z16.d, p5/m, z16.d, z19.d \n\t" \
    "fsub z17.d, p5/m, z17.d, z20.d \n\t" \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}

// ZM_PROJ
// Register-only update under predicate p5 (no operands, no memory access):
//   z12..z14 = z12..z14 - i * z18..z20   (FCADD #270)
//   z15..z17 = z15..z17 + i * z21..z23   (FCADD #90)
// All registers are expected to be live from earlier asm statements.
#define ZM_PROJ_A64FXd  \
{ \
asm ( \
    "fcadd z12.d, p5/m, z12.d, z18.d, 270 \n\t" \
    "fcadd z13.d, p5/m, z13.d, z19.d, 270 \n\t" \
    "fcadd z14.d, p5/m, z14.d, z20.d, 270 \n\t" \
    "fcadd z15.d, p5/m, z15.d, z21.d, 90 \n\t" \
    "fcadd z16.d, p5/m, z16.d, z22.d, 90 \n\t" \
    "fcadd z17.d, p5/m, z17.d, z23.d, 90 \n\t" \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}

// TM_PROJ
// Register-only update (no operands, no memory access):
//   p5 is first set all-true for 64-bit elements (ptrue p5.d), then
//   z12..z17 -= z18..z23   (element-wise, register k paired with register k+6)
// Unlike the other *_PROJ macros here, this one re-establishes p5 itself.
#define TM_PROJ_A64FXd  \
{ \
asm ( \
    "ptrue p5.d \n\t" \
    "fsub z12.d, p5/m, z12.d, z18.d \n\t" \
    "fsub z13.d, p5/m, z13.d, z19.d \n\t" \
    "fsub z14.d, p5/m, z14.d, z20.d \n\t" \
    "fsub z15.d, p5/m, z15.d, z21.d \n\t" \
    "fsub z16.d, p5/m, z16.d, z22.d \n\t" \
    "fsub z17.d, p5/m, z17.d, z23.d \n\t" \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}

// XM_RECON_ACCUM
// Accumulates z18..z23 into z0..z11 under predicate p5 (no operands):
//   z9..z11 += i * z18..z20           (FCADD #90 adds i times the source)
//   z6..z8  += i * z21..z23
//   z0..z5  += z18..z23
#define XM_RECON_ACCUM_A64FXd  \
asm ( \
    "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \
    "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \
    "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \
    "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \
    "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \
    "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \
    "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \
    "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \
    "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \
    "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \
    "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \
    "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);

// YP_RECON_ACCUM
// Accumulates z18..z23 into z0..z11 under predicate p5 (no operands):
//   z0..z5  += z18..z23
//   z9..z11 -= z18..z20
//   z6..z8  += z21..z23
#define YP_RECON_ACCUM_A64FXd  \
asm ( \
    "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \
    "fsub z9.d, p5/m, z9.d, z18.d \n\t" \
    "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \
    "fsub z10.d, p5/m, z10.d, z19.d \n\t" \
    "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \
    "fsub z11.d, p5/m, z11.d, z20.d \n\t" \
    "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \
    "fadd z6.d, p5/m, z6.d, z21.d \n\t"  \
    "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \
    "fadd z7.d, p5/m, z7.d, z22.d \n\t"  \
    "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \
    "fadd z8.d, p5/m, z8.d, z23.d \n\t"  \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);

// YM_RECON_ACCUM
// Accumulates z18..z23 into z0..z11 under predicate p5 (no operands):
//   z0..z5  += z18..z23
//   z9..z11 += z18..z20
//   z6..z8  -= z21..z23
#define YM_RECON_ACCUM_A64FXd  \
asm ( \
    "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \
    "fadd z9.d, p5/m, z9.d, z18.d \n\t"  \
    "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \
    "fadd z10.d, p5/m, z10.d, z19.d \n\t"  \
    "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \
    "fadd z11.d, p5/m, z11.d, z20.d \n\t"  \
    "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \
    "fsub z6.d, p5/m, z6.d, z21.d \n\t" \
    "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \
    "fsub z7.d, p5/m, z7.d, z22.d \n\t" \
    "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \
    "fsub z8.d, p5/m, z8.d, z23.d \n\t" \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);

// ZP_RECON_ACCUM
// Accumulates z18..z23 into z0..z11 under predicate p5 (no operands):
//   z0..z5  += z18..z23
//   z6..z8  -= i * z18..z20           (FCADD #270)
//   z9..z11 += i * z21..z23           (FCADD #90)
#define ZP_RECON_ACCUM_A64FXd  \
asm ( \
    "fcadd z6.d, p5/m, z6.d, z18.d, 270 \n\t" \
    "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \
    "fcadd z7.d, p5/m, z7.d, z19.d, 270 \n\t" \
    "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \
    "fcadd z8.d, p5/m, z8.d, z20.d, 270 \n\t" \
    "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \
    "fcadd z9.d, p5/m, z9.d, z21.d, 90 \n\t" \
    "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \
    "fcadd z10.d, p5/m, z10.d, z22.d, 90 \n\t" \
    "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \
    "fcadd z11.d, p5/m, z11.d, z23.d, 90 \n\t" \
    "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);

// ZM_RECON_ACCUM
// Accumulates z18..z23 into z0..z11 under predicate p5 (no operands):
//   z0..z5  += z18..z23
//   z6..z8  += i * z18..z20           (FCADD #90)
//   z9..z11 -= i * z21..z23           (FCADD #270)
#define ZM_RECON_ACCUM_A64FXd  \
asm ( \
    "fcadd z6.d, p5/m, z6.d, z18.d, 90 \n\t" \
    "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \
    "fcadd z7.d, p5/m, z7.d, z19.d, 90 \n\t" \
    "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \
    "fcadd z8.d, p5/m, z8.d, z20.d, 90 \n\t" \
    "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \
    "fcadd z9.d, p5/m, z9.d, z21.d, 270 \n\t" \
    "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \
    "fcadd z10.d, p5/m, z10.d, z22.d, 270 \n\t" \
    "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \
    "fcadd z11.d, p5/m, z11.d, z23.d, 270 \n\t" \
    "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);

// TP_RECON_ACCUM
// Accumulates z18..z23 into z0..z11 under predicate p5 (no operands):
//   z0..z5  += z18..z23
//   z6..z11 += z18..z23
#define TP_RECON_ACCUM_A64FXd  \
asm ( \
    "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \
    "fadd z6.d, p5/m, z6.d, z18.d \n\t"  \
    "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \
    "fadd z7.d, p5/m, z7.d, z19.d \n\t"  \
    "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \
    "fadd z8.d, p5/m, z8.d, z20.d \n\t"  \
    "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \
    "fadd z9.d, p5/m, z9.d, z21.d \n\t"  \
    "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \
    "fadd z10.d, p5/m, z10.d, z22.d \n\t"  \
    "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \
    "fadd z11.d, p5/m, z11.d, z23.d \n\t"  \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);

// TM_RECON_ACCUM
// Accumulates z18..z23 into z0..z11 under predicate p5 (no operands):
//   z0..z5  += z18..z23
//   z6..z11 -= z18..z23
#define TM_RECON_ACCUM_A64FXd  \
asm ( \
    "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \
    "fsub z6.d, p5/m, z6.d, z18.d \n\t" \
    "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \
    "fsub z7.d, p5/m, z7.d, z19.d \n\t" \
    "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \
    "fsub z8.d, p5/m, z8.d, z20.d \n\t" \
    "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \
    "fsub z9.d, p5/m, z9.d, z21.d \n\t" \
    "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \
    "fsub z10.d, p5/m, z10.d, z22.d \n\t" \
    "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \
    "fsub z11.d, p5/m, z11.d, z23.d \n\t" \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);

// ZERO_PSI
// Sets predicate p5 all-true for 64-bit elements, then writes floating-point
// zero to every element of z0..z11. Takes no operands and touches no memory.
#define ZERO_PSI_A64FXd  \
asm ( \
    "ptrue p5.d \n\t" \
    "fmov z0.d , 0 \n\t" \
    "fmov z1.d , 0 \n\t" \
    "fmov z2.d , 0 \n\t" \
    "fmov z3.d , 0 \n\t" \
    "fmov z4.d , 0 \n\t" \
    "fmov z5.d , 0 \n\t" \
    "fmov z6.d , 0 \n\t" \
    "fmov z7.d , 0 \n\t" \
    "fmov z8.d , 0 \n\t" \
    "fmov z9.d , 0 \n\t" \
    "fmov z10.d , 0 \n\t" \
    "fmov z11.d , 0 \n\t" \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);

// PREFETCH_RESULT_L2_STORE (prefetch store to L2)
// Issues three PRFD hints with operation PSTL2STRM, governed by p5, at
// offsets 0, 4 and 8 vector lengths from `base`.
//   base : address expression, passed to the asm in a general register.
// p5 must already hold a valid predicate from an earlier asm statement.
#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base)  \
{ \
asm ( \
    "prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
    "prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
    "prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
    :  \
    : [fetchptr] "r" (base) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}

// PREFETCH_RESULT_L1_STORE (prefetch store to L1)
// Issues three PRFD hints with operation PSTL1STRM, governed by p5, at
// offsets 0, 4 and 8 vector lengths from `base`.
//   base : address expression, passed to the asm in a general register.
// p5 must already hold a valid predicate from an earlier asm statement.
#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base)  \
{ \
asm ( \
    "prfd PSTL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
    "prfd PSTL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
    "prfd PSTL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
    :  \
    : [fetchptr] "r" (base) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}

// ADD_RESULT_INTERNAL
// Register-only accumulation under predicate p5 (no operands):
//   z0..z11 += z12..z23   (element-wise, register k paired with register k+12)
// All registers are expected to be live from earlier asm statements.
#define ADD_RESULT_INTERNAL_A64FXd  \
asm ( \
    "fadd z0.d, p5/m, z0.d, z12.d \n\t"  \
    "fadd z1.d, p5/m, z1.d, z13.d \n\t"  \
    "fadd z2.d, p5/m, z2.d, z14.d \n\t"  \
    "fadd z3.d, p5/m, z3.d, z15.d \n\t"  \
    "fadd z4.d, p5/m, z4.d, z16.d \n\t"  \
    "fadd z5.d, p5/m, z5.d, z17.d \n\t"  \
    "fadd z6.d, p5/m, z6.d, z18.d \n\t"  \
    "fadd z7.d, p5/m, z7.d, z19.d \n\t"  \
    "fadd z8.d, p5/m, z8.d, z20.d \n\t"  \
    "fadd z9.d, p5/m, z9.d, z21.d \n\t"  \
    "fadd z10.d, p5/m, z10.d, z22.d \n\t"  \
    "fadd z11.d, p5/m, z11.d, z23.d \n\t"  \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);

@@ -1,779 +0,0 @@
 | 
			
		||||
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: Fujitsu_A64FX_asm_single.h

    Copyright (C) 2020

Author: Nils Meyer <nils.meyer@ur.de>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */
// Generic kernel macro names mapped onto the single-precision (A64FXf)
// implementations. The kernel body is written against the names on the left.
#define LOAD_CHIMU(base)               LOAD_CHIMU_INTERLEAVED_A64FXf(base)
#define PREFETCH_CHIMU_L1(A)           PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A)
#define PREFETCH_GAUGE_L1(A)           PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)
#define PREFETCH_CHIMU_L2(A)           PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A)
#define PREFETCH_GAUGE_L2(A)           PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A)
// Intentionally expands to nothing.
#define PF_GAUGE(A)
#define PREFETCH_RESULT_L2_STORE(A)    PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(A)
#define PREFETCH_RESULT_L1_STORE(A)    PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(A)
#define PREFETCH1_CHIMU(A)             PREFETCH_CHIMU_L1(A)
#define PREFETCH_CHIMU(A)              PREFETCH_CHIMU_L1(A)
// Intentionally expand to nothing.
#define LOCK_GAUGE(A)
#define UNLOCK_GAUGE(A)
#define MASK_REGS                      DECLARATIONS_A64FXf
// NOTE: SAVE_RESULT and ADD_RESULT expand to several statements; use them only
// as complete statements, never as the unbraced body of an if/else or loop.
#define SAVE_RESULT(A,B)               RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B)
#define MULT_2SPIN_1(Dir)              MULT_2SPIN_1_A64FXf(Dir)
#define MULT_2SPIN_2                   MULT_2SPIN_2_A64FXf
#define LOAD_CHI(base)                 LOAD_CHI_A64FXf(base)
#define ADD_RESULT(base,basep)         LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
#define XP_PROJ                        XP_PROJ_A64FXf
#define YP_PROJ                        YP_PROJ_A64FXf
#define ZP_PROJ                        ZP_PROJ_A64FXf
#define TP_PROJ                        TP_PROJ_A64FXf
#define XM_PROJ                        XM_PROJ_A64FXf
#define YM_PROJ                        YM_PROJ_A64FXf
#define ZM_PROJ                        ZM_PROJ_A64FXf
#define TM_PROJ                        TM_PROJ_A64FXf
#define XP_RECON                       XP_RECON_A64FXf
#define XM_RECON                       XM_RECON_A64FXf
#define XM_RECON_ACCUM                 XM_RECON_ACCUM_A64FXf
#define YM_RECON_ACCUM                 YM_RECON_ACCUM_A64FXf
#define ZM_RECON_ACCUM                 ZM_RECON_ACCUM_A64FXf
#define TM_RECON_ACCUM                 TM_RECON_ACCUM_A64FXf
#define XP_RECON_ACCUM                 XP_RECON_ACCUM_A64FXf
#define YP_RECON_ACCUM                 YP_RECON_ACCUM_A64FXf
#define ZP_RECON_ACCUM                 ZP_RECON_ACCUM_A64FXf
#define TP_RECON_ACCUM                 TP_RECON_ACCUM_A64FXf
#define PERMUTE_DIR0                   0
#define PERMUTE_DIR1                   1
#define PERMUTE_DIR2                   2
#define PERMUTE_DIR3                   3
#define PERMUTE                        PERMUTE_A64FXf;
// Selects LOAD_TABLE0..LOAD_TABLE3 for Dir in 0..3; any other value does nothing.
// Dir is parenthesized so that expression arguments such as `d & 3` compare as
// intended, and every branch terminates its statement with ';'.
#define LOAD_TABLE(Dir)                if ((Dir) == 0) { LOAD_TABLE0; } else if ((Dir) == 1) { LOAD_TABLE1; } else if ((Dir) == 2) { LOAD_TABLE2; } else if ((Dir) == 3) { LOAD_TABLE3; }
// Runs PERMUTE when perm is non-zero; the first argument is not used.
#define MAYBEPERM(A,perm)              if (perm) { PERMUTE; }

// DECLARATIONS
// Declares the local table `lut` and zeroes z31.
//   lut : four rows of sixteen 32-bit lane indices. Row 0 swaps the two
//         halves of eight, row 1 swaps neighbouring groups of four, row 2
//         swaps neighbouring pairs, row 3 swaps neighbouring single lanes.
//         NOTE(review): presumably consumed by the LOAD_TABLE0..3 / PERMUTE
//         macros defined elsewhere -- confirm.
//   z31 : every 32-bit element is set to floating-point zero.
// Because it declares `lut`, the macro can be expanded once per scope.
#define DECLARATIONS_A64FXf  \
    const uint32_t lut[4][16] = { \
        {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \
        {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \
        {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \
        {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \
asm ( \
    "fmov z31.s , 0 \n\t" \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);

// RESULT
// Stores z0..z11 to twelve consecutive vector-length slots.
//   base : address expression; the asm receives (base) + 2*3*64 and stores at
//          offsets -6..+5 vector lengths from that biased pointer.
// NOTE(review): the 2*3*64 byte bias equals six vectors only for a 64-byte
// (512-bit) vector length, in which case the first store lands at `base`
// -- confirm the vector length assumption.
// `base` is parenthesized so that any expression argument is biased as a whole.
#define RESULT_A64FXf(base)  \
{ \
asm ( \
    "str z0, [%[storeptr], -6, mul vl] \n\t" \
    "str z1, [%[storeptr], -5, mul vl] \n\t" \
    "str z2, [%[storeptr], -4, mul vl] \n\t" \
    "str z3, [%[storeptr], -3, mul vl] \n\t" \
    "str z4, [%[storeptr], -2, mul vl] \n\t" \
    "str z5, [%[storeptr], -1, mul vl] \n\t" \
    "str z6, [%[storeptr], 0, mul vl] \n\t" \
    "str z7, [%[storeptr], 1, mul vl] \n\t" \
    "str z8, [%[storeptr], 2, mul vl] \n\t" \
    "str z9, [%[storeptr], 3, mul vl] \n\t" \
    "str z10, [%[storeptr], 4, mul vl] \n\t" \
    "str z11, [%[storeptr], 5, mul vl] \n\t" \
    :  \
    : [storeptr] "r" ((base) + 2 * 3 * 64) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}

// PREFETCH_CHIMU_L2 (prefetch to L2)
// Issues three PRFD hints with operation PLDL2STRM, governed by p5, at
// offsets 0, 4 and 8 vector lengths from `base`.
//   base : address expression, passed to the asm in a general register.
// p5 must already hold a valid predicate from an earlier asm statement.
#define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base)  \
{ \
asm ( \
    "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
    "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
    "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
    :  \
    : [fetchptr] "r" (base) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}

// PREFETCH_CHIMU_L1 (prefetch to L1)
// Issues three PRFD hints with operation PLDL1STRM, governed by p5, at
// offsets 0, 4 and 8 vector lengths from `base`.
//   base : address expression, passed to the asm in a general register.
// p5 must already hold a valid predicate from an earlier asm statement.
#define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base)  \
{ \
asm ( \
    "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
    "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
    "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
    :  \
    : [fetchptr] "r" (base) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}

// PREFETCH_GAUGE_L2 (prefetch to L2)
// Takes the address of U[sUn](A), adds 3*3*64 bytes, and issues nine PRFD
// hints (operation PLDL2STRM, governed by p5) at offsets -4, 0, 4, ... 28
// vector lengths from that biased address.
//   A : argument forwarded to U[sUn](...).
// Requires `U` and `sUn` to be in scope at the expansion site, and p5 to hold
// a valid predicate from an earlier asm statement.
#define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A)  \
{ \
    const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \
asm ( \
    "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \
    "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
    "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
    "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
    "prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \
    "prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \
    "prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \
    "prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \
    "prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \
    :  \
    : [fetchptr] "r" (baseU) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}

// PREFETCH_GAUGE_L1 (prefetch to L1)
// Takes the address of U[sU](A) and emits three `prfd PLDL1STRM` prefetches
// at offsets 0, 4 and 8 vector lengths from it.
// `U` and `sU` are not macro parameters; they must be visible at the
// expansion site. `ref` and `baseU` are local to the macro's braces.
// NOTE(review): p5 governs the prefetches but is not set here; other macros
// in this file set it with `ptrue p5.s` -- presumably one must run first.
#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)  \
{ \
    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
asm ( \
    "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
    "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
    "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
    :  \
    : [fetchptr] "r" (baseU) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_CHI
// Loads six consecutive SVE vectors from base + {0..5} vector lengths into
// z12..z17 (offset k goes to z(12+k)).
// `base` is handed to the asm in a general-purpose register ("r").
// The loaded values stay in fixed z registers after the asm statement ends;
// other macros in this file (e.g. PERMUTE_A64FXf) read z12..z17 directly.
#define LOAD_CHI_A64FXf(base)  \
{ \
asm ( \
    "ldr z12, [%[fetchptr], 0, mul vl] \n\t" \
    "ldr z13, [%[fetchptr], 1, mul vl] \n\t" \
    "ldr z14, [%[fetchptr], 2, mul vl] \n\t" \
    "ldr z15, [%[fetchptr], 3, mul vl] \n\t" \
    "ldr z16, [%[fetchptr], 4, mul vl] \n\t" \
    "ldr z17, [%[fetchptr], 5, mul vl] \n\t" \
    :  \
    : [fetchptr] "r" (base) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_CHIMU
// Sets p5 to all-true for .s elements, then loads twelve SVE vectors into
// z12..z23 from the address base + 2 * 3 * 64 bytes, using offsets -6..5
// vector lengths (offset k goes to z(18+k)).
// The loads are issued in the order z12, z21, z15, z18, z13, z22, z16, z19,
// z14, z23, z17, z20; the order is kept exactly as in the original.
#define LOAD_CHIMU_INTERLEAVED_A64FXf(base)  \
{ \
asm ( \
    "ptrue p5.s \n\t" \
    "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
    "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
    "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
    "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
    "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
    "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
    "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
    "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
    "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
    "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
    "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
    "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
    :  \
    : [fetchptr] "r" (base + 2 * 3 * 64) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_CHIMU_0213
// Binds `ref` to in[offset], sets p5 to all-true for .s elements and loads
// twelve SVE vectors into z12..z23 from &ref[2][0] at offsets -6..5 vector
// lengths (offset k goes to z(18+k)).
// Loads are issued alternating between the low and high halves:
// z12, z18, z13, z19, z14, z20, z15, z21, z16, z22, z17, z23.
// `in` and `offset` are not macro parameters; they must be visible at the
// expansion site.
#define LOAD_CHIMU_0213_A64FXf  \
{ \
    const SiteSpinor & ref(in[offset]); \
asm ( \
    "ptrue p5.s \n\t" \
    "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
    "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
    "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
    "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
    "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
    "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
    "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
    "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
    "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
    "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
    "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
    "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
    :  \
    : [fetchptr] "r" (&ref[2][0]) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_CHIMU_0312
// Binds `ref` to in[offset], sets p5 to all-true for .s elements and loads
// twelve SVE vectors into z12..z23 from &ref[2][0] at offsets -6..5 vector
// lengths (offset k goes to z(18+k)).
// Loads are issued in the order z12, z21, z13, z22, z14, z23, z15, z18, z16,
// z19, z17, z20.
// `in` and `offset` are not macro parameters; they must be visible at the
// expansion site.
#define LOAD_CHIMU_0312_A64FXf  \
{ \
    const SiteSpinor & ref(in[offset]); \
asm ( \
    "ptrue p5.s \n\t" \
    "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
    "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
    "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
    "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
    "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
    "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
    "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
    "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
    "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
    "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
    "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
    "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
    :  \
    : [fetchptr] "r" (&ref[2][0]) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_TABLE0
// Loads one SVE vector into z30 from &lut[0] + 0 vector lengths; the index is
// passed as an immediate ("i") operand.
// `lut` is not a macro parameter; it must be visible at the expansion site.
// z30 is the index register read by PERMUTE_A64FXf.
// The expansion is a bare asm statement (no braces) ending in `;`.
#define LOAD_TABLE0  \
asm ( \
    "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
    :  \
    : [tableptr] "r" (&lut[0]),[index] "i" (0) \
    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); 
// LOAD_TABLE1
// Loads one SVE vector into z30 from &lut[0] + 1 vector length; the index is
// passed as an immediate ("i") operand.
// `lut` is not a macro parameter; it must be visible at the expansion site.
// z30 is the index register read by PERMUTE_A64FXf.
// The expansion is a bare asm statement (no braces) ending in `;`.
#define LOAD_TABLE1  \
asm ( \
    "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
    :  \
    : [tableptr] "r" (&lut[0]),[index] "i" (1) \
    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); 
// LOAD_TABLE2
// Loads one SVE vector into z30 from &lut[0] + 2 vector lengths; the index is
// passed as an immediate ("i") operand.
// `lut` is not a macro parameter; it must be visible at the expansion site.
// z30 is the index register read by PERMUTE_A64FXf.
// The expansion is a bare asm statement (no braces) ending in `;`.
#define LOAD_TABLE2  \
asm ( \
    "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
    :  \
    : [tableptr] "r" (&lut[0]),[index] "i" (2) \
    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); 
// LOAD_TABLE3
// Loads one SVE vector into z30 from &lut[0] + 3 vector lengths; the index is
// passed as an immediate ("i") operand.
// `lut` is not a macro parameter; it must be visible at the expansion site.
// z30 is the index register read by PERMUTE_A64FXf.
// The expansion is a bare asm statement (no braces) ending in `;`.
#define LOAD_TABLE3  \
asm ( \
    "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
    :  \
    : [tableptr] "r" (&lut[0]),[index] "i" (3) \
    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); 
// PERMUTE
// Applies an in-place `tbl` table lookup to each of z12..z17, using the .s
// element indices held in z30 (loaded by the LOAD_TABLEn macros).
// No operands are passed: inputs and outputs live in fixed z registers.
// The expansion is a bare asm statement (no braces) ending in `;`.
#define PERMUTE_A64FXf  \
asm ( \
    "tbl z12.s, { z12.s }, z30.s \n\t"  \
    "tbl z13.s, { z13.s }, z30.s \n\t"  \
    "tbl z14.s, { z14.s }, z30.s \n\t"  \
    "tbl z15.s, { z15.s }, z30.s \n\t"  \
    "tbl z16.s, { z16.s }, z30.s \n\t"  \
    "tbl z17.s, { z17.s }, z30.s \n\t"  \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); 
// LOAD_GAUGE
// Takes the address of U[sU](A), sets p5 to all-true for .s elements and
// loads six SVE vectors relative to baseU + 2 * 3 * 64 bytes:
//   z24, z25, z26 from offsets -6, -3, 0 vector lengths
//   z27, z28, z29 from offsets -5, -2, 1 vector lengths
// `U`, `sU` and `A` must be visible at the expansion site.
// NOTE(review): `A` is used but this macro takes no parameter, and `ref` /
// `baseU` are declared before the opening brace, so they land in the caller's
// scope. MULT_2SPIN_1_A64FXf(A) performs the same six loads with a parameter
// and scoped locals; confirm whether this macro is still used.
#define LOAD_GAUGE  \
    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
{ \
asm ( \
    "ptrue p5.s \n\t" \
    "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \
    "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \
    "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \
    "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \
    "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \
    "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \
    :  \
    : [fetchptr] "r" (baseU + 2 * 3 * 64) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// MULT_2SPIN
// First stage of the multiply. With fetchptr = &U[sU](A) + 2 * 3 * 64 bytes:
//   1. loads z24, z25, z26 (offsets -6, -3, 0) and z27, z28, z29
//      (offsets -5, -2, 1);
//   2. initialises each of z18..z23 from z31 via `movprfx`, then accumulates
//      with `fcmla` rotation 0 followed by rotation 90:
//        z18/z19/z20 += z24/z25/z26 * z12,  z21/z22/z23 += z24/z25/z26 * z15;
//   3. reloads z24, z25, z26 from offsets -4, -1, 2 (consumed by
//      MULT_2SPIN_2_A64FXf, which also uses z27..z29 loaded here).
// `U` and `sU` must be visible at the expansion site.
// NOTE(review): z31 is presumably zero and p5 all-true on entry; neither is
// set in this macro -- confirm against the calling sequence.
#define MULT_2SPIN_1_A64FXf(A)  \
{ \
    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
asm ( \
    "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \
    "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \
    "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \
    "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \
    "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \
    "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \
    "movprfx z18.s, p5/m, z31.s \n\t" \
    "fcmla z18.s, p5/m, z24.s, z12.s, 0 \n\t" \
    "movprfx z21.s, p5/m, z31.s \n\t" \
    "fcmla z21.s, p5/m, z24.s, z15.s, 0 \n\t" \
    "movprfx z19.s, p5/m, z31.s \n\t" \
    "fcmla z19.s, p5/m, z25.s, z12.s, 0 \n\t" \
    "movprfx z22.s, p5/m, z31.s \n\t" \
    "fcmla z22.s, p5/m, z25.s, z15.s, 0 \n\t" \
    "movprfx z20.s, p5/m, z31.s \n\t" \
    "fcmla z20.s, p5/m, z26.s, z12.s, 0 \n\t" \
    "movprfx z23.s, p5/m, z31.s \n\t" \
    "fcmla z23.s, p5/m, z26.s, z15.s, 0 \n\t" \
    "fcmla z18.s, p5/m, z24.s, z12.s, 90 \n\t" \
    "fcmla z21.s, p5/m, z24.s, z15.s, 90 \n\t" \
    "fcmla z19.s, p5/m, z25.s, z12.s, 90 \n\t" \
    "fcmla z22.s, p5/m, z25.s, z15.s, 90 \n\t" \
    "fcmla z20.s, p5/m, z26.s, z12.s, 90 \n\t" \
    "fcmla z23.s, p5/m, z26.s, z15.s, 90 \n\t" \
    "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \
    "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \
    "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \
    :  \
    : [fetchptr] "r" (baseU + 2 * 3 * 64) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// MULT_2SPIN_BACKEND
// Second stage of the multiply: accumulates further `fcmla` products into
// z18..z23 (rotation 0 first, then rotation 90, for each group):
//   z18/z19/z20 += z27/z28/z29 * z13,  z21/z22/z23 += z27/z28/z29 * z16
//   z18/z19/z20 += z24/z25/z26 * z14,  z21/z22/z23 += z24/z25/z26 * z17
// No operands are passed; z24..z29 are expected to hold the values left by
// MULT_2SPIN_1_A64FXf, and z18..z23 its partial sums.
// NOTE(review): p5 is used as governing predicate but not set here.
#define MULT_2SPIN_2_A64FXf  \
{ \
asm ( \
    "fcmla z18.s, p5/m, z27.s, z13.s, 0 \n\t" \
    "fcmla z21.s, p5/m, z27.s, z16.s, 0 \n\t" \
    "fcmla z19.s, p5/m, z28.s, z13.s, 0 \n\t" \
    "fcmla z22.s, p5/m, z28.s, z16.s, 0 \n\t" \
    "fcmla z20.s, p5/m, z29.s, z13.s, 0 \n\t" \
    "fcmla z23.s, p5/m, z29.s, z16.s, 0 \n\t" \
    "fcmla z18.s, p5/m, z27.s, z13.s, 90 \n\t" \
    "fcmla z21.s, p5/m, z27.s, z16.s, 90 \n\t" \
    "fcmla z19.s, p5/m, z28.s, z13.s, 90 \n\t" \
    "fcmla z22.s, p5/m, z28.s, z16.s, 90 \n\t" \
    "fcmla z20.s, p5/m, z29.s, z13.s, 90 \n\t" \
    "fcmla z23.s, p5/m, z29.s, z16.s, 90 \n\t" \
    "fcmla z18.s, p5/m, z24.s, z14.s, 0 \n\t" \
    "fcmla z21.s, p5/m, z24.s, z17.s, 0 \n\t" \
    "fcmla z19.s, p5/m, z25.s, z14.s, 0 \n\t" \
    "fcmla z22.s, p5/m, z25.s, z17.s, 0 \n\t" \
    "fcmla z20.s, p5/m, z26.s, z14.s, 0 \n\t" \
    "fcmla z23.s, p5/m, z26.s, z17.s, 0 \n\t" \
    "fcmla z18.s, p5/m, z24.s, z14.s, 90 \n\t" \
    "fcmla z21.s, p5/m, z24.s, z17.s, 90 \n\t" \
    "fcmla z19.s, p5/m, z25.s, z14.s, 90 \n\t" \
    "fcmla z22.s, p5/m, z25.s, z17.s, 90 \n\t" \
    "fcmla z20.s, p5/m, z26.s, z14.s, 90 \n\t" \
    "fcmla z23.s, p5/m, z26.s, z17.s, 90 \n\t" \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XP_PROJ
// In-register update using `fcadd` with rotation 90 (complex add with the
// second operand rotated by 90 degrees):
//   z12, z13, z14 += rot90(z21, z22, z23)
//   z15, z16, z17 += rot90(z18, z19, z20)
// No operands are passed; all values live in fixed z registers.
// NOTE(review): p5 is used as governing predicate but not set here.
#define XP_PROJ_A64FXf  \
{ \
asm ( \
    "fcadd z12.s, p5/m, z12.s, z21.s, 90 \n\t" \
    "fcadd z13.s, p5/m, z13.s, z22.s, 90 \n\t" \
    "fcadd z14.s, p5/m, z14.s, z23.s, 90 \n\t" \
    "fcadd z15.s, p5/m, z15.s, z18.s, 90 \n\t" \
    "fcadd z16.s, p5/m, z16.s, z19.s, 90 \n\t" \
    "fcadd z17.s, p5/m, z17.s, z20.s, 90 \n\t" \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XP_RECON
// Initialises the result registers z0..z11 from z18..z23:
//   z6, z7, z8   = z31 then fcadd rotation 270 with z21, z22, z23
//   z9, z10, z11 = z31 then fcadd rotation 270 with z18, z19, z20
//   z0..z5       = predicated copies of z18..z23
// No operands are passed; all values live in fixed z registers.
// The expansion is a bare asm statement (no braces) ending in `;`.
// NOTE(review): z31 is presumably zero and p5 all-true on entry; neither is
// set in this macro -- confirm against the calling sequence.
#define XP_RECON_A64FXf  \
asm ( \
    "movprfx z6.s, p5/m, z31.s \n\t" \
    "fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \
    "movprfx z7.s, p5/m, z31.s \n\t" \
    "fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \
    "movprfx z8.s, p5/m, z31.s \n\t" \
    "fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \
    "movprfx z9.s, p5/m, z31.s \n\t" \
    "fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \
    "movprfx z10.s, p5/m, z31.s \n\t" \
    "fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \
    "movprfx z11.s, p5/m, z31.s \n\t" \
    "fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \
    "mov z0.s, p5/m, z18.s \n\t" \
    "mov z1.s, p5/m, z19.s \n\t" \
    "mov z2.s, p5/m, z20.s \n\t" \
    "mov z3.s, p5/m, z21.s \n\t" \
    "mov z4.s, p5/m, z22.s \n\t" \
    "mov z5.s, p5/m, z23.s \n\t" \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); 
// XP_RECON_ACCUM
// Accumulates z18..z23 into the result registers z0..z11:
//   z9, z10, z11 : fcadd rotation 270 with z18, z19, z20
//   z6, z7, z8   : fcadd rotation 270 with z21, z22, z23
//   z0..z5       += z18..z23 (fadd)
// The fcadd and fadd instructions are interleaved pairwise, as in the
// original. The expansion is a bare asm statement ending in `;`.
// NOTE(review): p5 is used as governing predicate but not set here.
#define XP_RECON_ACCUM_A64FXf  \
asm ( \
    "fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \
    "fadd z0.s, p5/m, z0.s, z18.s \n\t"  \
    "fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \
    "fadd z1.s, p5/m, z1.s, z19.s \n\t"  \
    "fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \
    "fadd z2.s, p5/m, z2.s, z20.s \n\t"  \
    "fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \
    "fadd z3.s, p5/m, z3.s, z21.s \n\t"  \
    "fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \
    "fadd z4.s, p5/m, z4.s, z22.s \n\t"  \
    "fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \
    "fadd z5.s, p5/m, z5.s, z23.s \n\t"  \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); 
// YP_PROJ
// In-register update:
//   z12, z13, z14 -= z21, z22, z23 (fsub)
//   z15, z16, z17 += z18, z19, z20 (fadd)
// No operands are passed; all values live in fixed z registers.
// NOTE(review): p5 is used as governing predicate but not set here.
#define YP_PROJ_A64FXf  \
{ \
asm ( \
    "fsub z12.s, p5/m, z12.s, z21.s \n\t" \
    "fsub z13.s, p5/m, z13.s, z22.s \n\t" \
    "fsub z14.s, p5/m, z14.s, z23.s \n\t" \
    "fadd z15.s, p5/m, z15.s, z18.s \n\t"  \
    "fadd z16.s, p5/m, z16.s, z19.s \n\t"  \
    "fadd z17.s, p5/m, z17.s, z20.s \n\t"  \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// ZP_PROJ
// In-register update using `fcadd`:
//   z12, z13, z14 : rotation 90  with z18, z19, z20
//   z15, z16, z17 : rotation 270 with z21, z22, z23
// No operands are passed; all values live in fixed z registers.
// NOTE(review): p5 is used as governing predicate but not set here.
#define ZP_PROJ_A64FXf  \
{ \
asm ( \
    "fcadd z12.s, p5/m, z12.s, z18.s, 90 \n\t" \
    "fcadd z13.s, p5/m, z13.s, z19.s, 90 \n\t" \
    "fcadd z14.s, p5/m, z14.s, z20.s, 90 \n\t" \
    "fcadd z15.s, p5/m, z15.s, z21.s, 270 \n\t" \
    "fcadd z16.s, p5/m, z16.s, z22.s, 270 \n\t" \
    "fcadd z17.s, p5/m, z17.s, z23.s, 270 \n\t" \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// TP_PROJ
// In-register update: z12..z17 += z18..z23 (fadd, element-wise, pairing
// z12 with z18 through z17 with z23).
// No operands are passed; all values live in fixed z registers.
// NOTE(review): p5 is used as governing predicate but not set here.
#define TP_PROJ_A64FXf  \
{ \
asm ( \
    "fadd z12.s, p5/m, z12.s, z18.s \n\t"  \
    "fadd z13.s, p5/m, z13.s, z19.s \n\t"  \
    "fadd z14.s, p5/m, z14.s, z20.s \n\t"  \
    "fadd z15.s, p5/m, z15.s, z21.s \n\t"  \
    "fadd z16.s, p5/m, z16.s, z22.s \n\t"  \
    "fadd z17.s, p5/m, z17.s, z23.s \n\t"  \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XM_PROJ
// In-register update using `fcadd` with rotation 270:
//   z12, z13, z14 with z21, z22, z23
//   z15, z16, z17 with z18, z19, z20
// Same register pairing as XP_PROJ_A64FXf, with rotation 270 instead of 90.
// NOTE(review): p5 is used as governing predicate but not set here.
#define XM_PROJ_A64FXf  \
{ \
asm ( \
    "fcadd z12.s, p5/m, z12.s, z21.s, 270 \n\t" \
    "fcadd z13.s, p5/m, z13.s, z22.s, 270 \n\t" \
    "fcadd z14.s, p5/m, z14.s, z23.s, 270 \n\t" \
    "fcadd z15.s, p5/m, z15.s, z18.s, 270 \n\t" \
    "fcadd z16.s, p5/m, z16.s, z19.s, 270 \n\t" \
    "fcadd z17.s, p5/m, z17.s, z20.s, 270 \n\t" \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XM_RECON
// Initialises the result registers z0..z11 from z18..z23:
//   z6, z7, z8   = z31 then fcadd rotation 90 with z21, z22, z23
//   z9, z10, z11 = z31 then fcadd rotation 90 with z18, z19, z20
//   z0..z5       = predicated copies of z18..z23
// Same structure as XP_RECON_A64FXf, with rotation 90 instead of 270.
// The expansion is a bare asm statement (no braces) ending in `;`.
// NOTE(review): z31 is presumably zero and p5 all-true on entry; neither is
// set in this macro -- confirm against the calling sequence.
#define XM_RECON_A64FXf  \
asm ( \
    "movprfx z6.s, p5/m, z31.s \n\t" \
    "fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \
    "movprfx z7.s, p5/m, z31.s \n\t" \
    "fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \
    "movprfx z8.s, p5/m, z31.s \n\t" \
    "fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \
    "movprfx z9.s, p5/m, z31.s \n\t" \
    "fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \
    "movprfx z10.s, p5/m, z31.s \n\t" \
    "fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \
    "movprfx z11.s, p5/m, z31.s \n\t" \
    "fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \
    "mov z0.s, p5/m, z18.s \n\t" \
    "mov z1.s, p5/m, z19.s \n\t" \
    "mov z2.s, p5/m, z20.s \n\t" \
    "mov z3.s, p5/m, z21.s \n\t" \
    "mov z4.s, p5/m, z22.s \n\t" \
    "mov z5.s, p5/m, z23.s \n\t" \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); 
// YM_PROJ
// In-register update:
//   z12, z13, z14 += z21, z22, z23 (fadd)
//   z15, z16, z17 -= z18, z19, z20 (fsub)
// Same register pairing as YP_PROJ_A64FXf with the add/subtract swapped.
// NOTE(review): p5 is used as governing predicate but not set here.
#define YM_PROJ_A64FXf  \
{ \
asm ( \
    "fadd z12.s, p5/m, z12.s, z21.s \n\t"  \
    "fadd z13.s, p5/m, z13.s, z22.s \n\t"  \
    "fadd z14.s, p5/m, z14.s, z23.s \n\t"  \
    "fsub z15.s, p5/m, z15.s, z18.s \n\t" \
    "fsub z16.s, p5/m, z16.s, z19.s \n\t" \
    "fsub z17.s, p5/m, z17.s, z20.s \n\t" \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// ZM_PROJ
// In-register update using `fcadd`:
//   z12, z13, z14 : rotation 270 with z18, z19, z20
//   z15, z16, z17 : rotation 90  with z21, z22, z23
// Same register pairing as ZP_PROJ_A64FXf with the rotations swapped.
// NOTE(review): p5 is used as governing predicate but not set here.
#define ZM_PROJ_A64FXf  \
{ \
asm ( \
    "fcadd z12.s, p5/m, z12.s, z18.s, 270 \n\t" \
    "fcadd z13.s, p5/m, z13.s, z19.s, 270 \n\t" \
    "fcadd z14.s, p5/m, z14.s, z20.s, 270 \n\t" \
    "fcadd z15.s, p5/m, z15.s, z21.s, 90 \n\t" \
    "fcadd z16.s, p5/m, z16.s, z22.s, 90 \n\t" \
    "fcadd z17.s, p5/m, z17.s, z23.s, 90 \n\t" \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// TM_PROJ
// Sets p5 to all-true for .s elements, then performs the in-register update
// z12..z17 -= z18..z23 (fsub, pairing z12 with z18 through z17 with z23).
// Unlike the other *_PROJ macros in view, this one sets p5 itself.
// No operands are passed; all values live in fixed z registers.
#define TM_PROJ_A64FXf  \
{ \
asm ( \
    "ptrue p5.s \n\t" \
    "fsub z12.s, p5/m, z12.s, z18.s \n\t" \
    "fsub z13.s, p5/m, z13.s, z19.s \n\t" \
    "fsub z14.s, p5/m, z14.s, z20.s \n\t" \
    "fsub z15.s, p5/m, z15.s, z21.s \n\t" \
    "fsub z16.s, p5/m, z16.s, z22.s \n\t" \
    "fsub z17.s, p5/m, z17.s, z23.s \n\t" \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XM_RECON_ACCUM
// Accumulates z18..z23 into the result registers z0..z11:
//   z9, z10, z11 : fcadd rotation 90 with z18, z19, z20
//   z6, z7, z8   : fcadd rotation 90 with z21, z22, z23
//   z0..z5       += z18..z23 (fadd)
// All six fcadd instructions are issued first, then the six fadd.
// The expansion is a bare asm statement (no braces) ending in `;`.
// NOTE(review): p5 is used as governing predicate but not set here.
#define XM_RECON_ACCUM_A64FXf  \
asm ( \
    "fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \
    "fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \
    "fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \
    "fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \
    "fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \
    "fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \
    "fadd z0.s, p5/m, z0.s, z18.s \n\t"  \
    "fadd z1.s, p5/m, z1.s, z19.s \n\t"  \
    "fadd z2.s, p5/m, z2.s, z20.s \n\t"  \
    "fadd z3.s, p5/m, z3.s, z21.s \n\t"  \
    "fadd z4.s, p5/m, z4.s, z22.s \n\t"  \
    "fadd z5.s, p5/m, z5.s, z23.s \n\t"  \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); 
// YP_RECON_ACCUM
// Accumulates z18..z23 into the result registers z0..z11:
//   z0..z5       += z18..z23        (fadd)
//   z9, z10, z11 -= z18, z19, z20   (fsub)
//   z6, z7, z8   += z21, z22, z23   (fadd)
// Instructions are interleaved pairwise per source register, as in the
// original. The expansion is a bare asm statement ending in `;`.
// NOTE(review): p5 is used as governing predicate but not set here.
#define YP_RECON_ACCUM_A64FXf  \
asm ( \
    "fadd z0.s, p5/m, z0.s, z18.s \n\t"  \
    "fsub z9.s, p5/m, z9.s, z18.s \n\t" \
    "fadd z1.s, p5/m, z1.s, z19.s \n\t"  \
    "fsub z10.s, p5/m, z10.s, z19.s \n\t" \
    "fadd z2.s, p5/m, z2.s, z20.s \n\t"  \
    "fsub z11.s, p5/m, z11.s, z20.s \n\t" \
    "fadd z3.s, p5/m, z3.s, z21.s \n\t"  \
    "fadd z6.s, p5/m, z6.s, z21.s \n\t"  \
    "fadd z4.s, p5/m, z4.s, z22.s \n\t"  \
    "fadd z7.s, p5/m, z7.s, z22.s \n\t"  \
    "fadd z5.s, p5/m, z5.s, z23.s \n\t"  \
    "fadd z8.s, p5/m, z8.s, z23.s \n\t"  \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); 
// YM_RECON_ACCUM
// Accumulates z18..z23 into the result registers z0..z11:
//   z0..z5       += z18..z23        (fadd)
//   z9, z10, z11 += z18, z19, z20   (fadd)
//   z6, z7, z8   -= z21, z22, z23   (fsub)
// Instructions are interleaved pairwise per source register, as in the
// original. The expansion is a bare asm statement ending in `;`.
// NOTE(review): p5 is used as governing predicate but not set here.
#define YM_RECON_ACCUM_A64FXf  \
asm ( \
    "fadd z0.s, p5/m, z0.s, z18.s \n\t"  \
    "fadd z9.s, p5/m, z9.s, z18.s \n\t"  \
    "fadd z1.s, p5/m, z1.s, z19.s \n\t"  \
    "fadd z10.s, p5/m, z10.s, z19.s \n\t"  \
    "fadd z2.s, p5/m, z2.s, z20.s \n\t"  \
    "fadd z11.s, p5/m, z11.s, z20.s \n\t"  \
    "fadd z3.s, p5/m, z3.s, z21.s \n\t"  \
    "fsub z6.s, p5/m, z6.s, z21.s \n\t" \
    "fadd z4.s, p5/m, z4.s, z22.s \n\t"  \
    "fsub z7.s, p5/m, z7.s, z22.s \n\t" \
    "fadd z5.s, p5/m, z5.s, z23.s \n\t"  \
    "fsub z8.s, p5/m, z8.s, z23.s \n\t" \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); 
// ZP_RECON_ACCUM
// Accumulates z18..z23 into the result registers z0..z11:
//   z6, z7, z8   : fcadd rotation 270 with z18, z19, z20
//   z9, z10, z11 : fcadd rotation 90  with z21, z22, z23
//   z0..z5       += z18..z23 (fadd)
// The fcadd and fadd instructions are interleaved pairwise, as in the
// original. The expansion is a bare asm statement ending in `;`.
// NOTE(review): p5 is used as governing predicate but not set here.
#define ZP_RECON_ACCUM_A64FXf  \
asm ( \
    "fcadd z6.s, p5/m, z6.s, z18.s, 270 \n\t" \
    "fadd z0.s, p5/m, z0.s, z18.s \n\t"  \
    "fcadd z7.s, p5/m, z7.s, z19.s, 270 \n\t" \
    "fadd z1.s, p5/m, z1.s, z19.s \n\t"  \
    "fcadd z8.s, p5/m, z8.s, z20.s, 270 \n\t" \
    "fadd z2.s, p5/m, z2.s, z20.s \n\t"  \
    "fcadd z9.s, p5/m, z9.s, z21.s, 90 \n\t" \
    "fadd z3.s, p5/m, z3.s, z21.s \n\t"  \
    "fcadd z10.s, p5/m, z10.s, z22.s, 90 \n\t" \
    "fadd z4.s, p5/m, z4.s, z22.s \n\t"  \
    "fcadd z11.s, p5/m, z11.s, z23.s, 90 \n\t" \
    "fadd z5.s, p5/m, z5.s, z23.s \n\t"  \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); 
// ZM_RECON_ACCUM
// Accumulates z18..z23 into the result registers z0..z11:
//   z6, z7, z8   : fcadd rotation 90  with z18, z19, z20
//   z9, z10, z11 : fcadd rotation 270 with z21, z22, z23
//   z0..z5       += z18..z23 (fadd)
// Same structure as ZP_RECON_ACCUM_A64FXf with the rotations swapped.
// The expansion is a bare asm statement (no braces) ending in `;`.
// NOTE(review): p5 is used as governing predicate but not set here.
#define ZM_RECON_ACCUM_A64FXf  \
asm ( \
    "fcadd z6.s, p5/m, z6.s, z18.s, 90 \n\t" \
    "fadd z0.s, p5/m, z0.s, z18.s \n\t"  \
    "fcadd z7.s, p5/m, z7.s, z19.s, 90 \n\t" \
    "fadd z1.s, p5/m, z1.s, z19.s \n\t"  \
    "fcadd z8.s, p5/m, z8.s, z20.s, 90 \n\t" \
    "fadd z2.s, p5/m, z2.s, z20.s \n\t"  \
    "fcadd z9.s, p5/m, z9.s, z21.s, 270 \n\t" \
    "fadd z3.s, p5/m, z3.s, z21.s \n\t"  \
    "fcadd z10.s, p5/m, z10.s, z22.s, 270 \n\t" \
    "fadd z4.s, p5/m, z4.s, z22.s \n\t"  \
    "fcadd z11.s, p5/m, z11.s, z23.s, 270 \n\t" \
    "fadd z5.s, p5/m, z5.s, z23.s \n\t"  \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); 
// TP_RECON_ACCUM
// Accumulates z18..z23 into the result registers z0..z11 with fadd:
//   z0..z5  += z18..z23
//   z6..z11 += z18..z23
// Instructions are interleaved pairwise per source register, as in the
// original. The expansion is a bare asm statement ending in `;`.
// NOTE(review): p5 is used as governing predicate but not set here.
#define TP_RECON_ACCUM_A64FXf  \
asm ( \
    "fadd z0.s, p5/m, z0.s, z18.s \n\t"  \
    "fadd z6.s, p5/m, z6.s, z18.s \n\t"  \
    "fadd z1.s, p5/m, z1.s, z19.s \n\t"  \
    "fadd z7.s, p5/m, z7.s, z19.s \n\t"  \
    "fadd z2.s, p5/m, z2.s, z20.s \n\t"  \
    "fadd z8.s, p5/m, z8.s, z20.s \n\t"  \
    "fadd z3.s, p5/m, z3.s, z21.s \n\t"  \
    "fadd z9.s, p5/m, z9.s, z21.s \n\t"  \
    "fadd z4.s, p5/m, z4.s, z22.s \n\t"  \
    "fadd z10.s, p5/m, z10.s, z22.s \n\t"  \
    "fadd z5.s, p5/m, z5.s, z23.s \n\t"  \
    "fadd z11.s, p5/m, z11.s, z23.s \n\t"  \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); 
// TM_RECON_ACCUM
// Accumulates z18..z23 into the result registers z0..z11:
//   z0..z5  += z18..z23 (fadd)
//   z6..z11 -= z18..z23 (fsub)
// Instructions are interleaved pairwise per source register, as in the
// original. The expansion is a bare asm statement ending in `;`.
// NOTE(review): p5 is used as governing predicate but not set here.
#define TM_RECON_ACCUM_A64FXf  \
asm ( \
    "fadd z0.s, p5/m, z0.s, z18.s \n\t"  \
    "fsub z6.s, p5/m, z6.s, z18.s \n\t" \
    "fadd z1.s, p5/m, z1.s, z19.s \n\t"  \
    "fsub z7.s, p5/m, z7.s, z19.s \n\t" \
    "fadd z2.s, p5/m, z2.s, z20.s \n\t"  \
    "fsub z8.s, p5/m, z8.s, z20.s \n\t" \
    "fadd z3.s, p5/m, z3.s, z21.s \n\t"  \
    "fsub z9.s, p5/m, z9.s, z21.s \n\t" \
    "fadd z4.s, p5/m, z4.s, z22.s \n\t"  \
    "fsub z10.s, p5/m, z10.s, z22.s \n\t" \
    "fadd z5.s, p5/m, z5.s, z23.s \n\t"  \
    "fsub z11.s, p5/m, z11.s, z23.s \n\t" \
    :  \
    :  \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); 
// ZERO_PSI
// Sets predicate p5 to all-true for 32-bit lanes (ptrue p5.s), then writes
// zero to z0..z11 with unpredicated fmov instructions.
// The asm statement has no input/output operands: its effect is left in the
// named registers, and the clobber list names p5, cc and z0..z31.
// NOTE(review): later asm macros in this file read p5 and z0..z11 without
// setting them, so they appear to depend on this state surviving between
// separate asm statements; confirm that assumption holds for the compiler
// and flags in use.
 | 
			
		||||
#define ZERO_PSI_A64FXf  \
 | 
			
		||||
asm ( \
 | 
			
		||||
    "ptrue p5.s \n\t" \
 | 
			
		||||
    "fmov z0.s , 0 \n\t" \
 | 
			
		||||
    "fmov z1.s , 0 \n\t" \
 | 
			
		||||
    "fmov z2.s , 0 \n\t" \
 | 
			
		||||
    "fmov z3.s , 0 \n\t" \
 | 
			
		||||
    "fmov z4.s , 0 \n\t" \
 | 
			
		||||
    "fmov z5.s , 0 \n\t" \
 | 
			
		||||
    "fmov z6.s , 0 \n\t" \
 | 
			
		||||
    "fmov z7.s , 0 \n\t" \
 | 
			
		||||
    "fmov z8.s , 0 \n\t" \
 | 
			
		||||
    "fmov z9.s , 0 \n\t" \
 | 
			
		||||
    "fmov z10.s , 0 \n\t" \
 | 
			
		||||
    "fmov z11.s , 0 \n\t" \
 | 
			
		||||
    :  \
 | 
			
		||||
    :  \
 | 
			
		||||
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
 | 
			
		||||
); 
 | 
			
		||||
 | 
			
		||||
// PREFETCH_RESULT_L2_STORE (prefetch store to L2)
// base: address handed to the asm as a general-register input ("r"), bound to
//       the named operand fetchptr.
// Issues three prfd hints with the PSTL2STRM operation at base plus 0, 4 and 8
// multiples of the vector length (mul vl), governed by predicate p5.
// p5 is read but not set here, so it must already hold the intended mask.
// NOTE(review): p5 is used as an input yet also appears in the clobber list,
// together with every z register and "memory"; confirm this is intentional.
 | 
			
		||||
#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base)  \
 | 
			
		||||
{ \
 | 
			
		||||
asm ( \
 | 
			
		||||
    "prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
 | 
			
		||||
    "prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
 | 
			
		||||
    "prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
 | 
			
		||||
    :  \
 | 
			
		||||
    : [fetchptr] "r" (base) \
 | 
			
		||||
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
 | 
			
		||||
); \
 | 
			
		||||
}
 | 
			
		||||
// PREFETCH_RESULT_L1_STORE (prefetch store to L1)
// base: address handed to the asm as a general-register input ("r"), bound to
//       the named operand fetchptr.
// Issues three prfd hints with the PSTL1STRM operation at base plus 0, 4 and 8
// multiples of the vector length (mul vl), governed by predicate p5.
// Identical in shape to the L2 variant except for the prefetch operation.
// p5 is read but not set here, so it must already hold the intended mask.
 | 
			
		||||
#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base)  \
 | 
			
		||||
{ \
 | 
			
		||||
asm ( \
 | 
			
		||||
    "prfd PSTL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
 | 
			
		||||
    "prfd PSTL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
 | 
			
		||||
    "prfd PSTL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
 | 
			
		||||
    :  \
 | 
			
		||||
    : [fetchptr] "r" (base) \
 | 
			
		||||
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
 | 
			
		||||
); \
 | 
			
		||||
}
 | 
			
		||||
// ADD_RESULT_INTERNAL
// Single-precision (.s) element-wise accumulation on fixed SVE registers:
//   z0..z11 += z12..z23   (register zN receives zN + z(N+12))
// Every fadd is predicated on p5 with merging (/m). The macro neither sets p5
// nor loads any register, and it has no asm operands, so both register groups
// must have been filled by earlier asm statements.
// NOTE(review): which group holds the freshly computed result and which holds
// the previously stored value is not visible in this block; confirm against
// the ADD_RESULT wrapper that expands it.
 | 
			
		||||
#define ADD_RESULT_INTERNAL_A64FXf  \
 | 
			
		||||
asm ( \
 | 
			
		||||
    "fadd z0.s, p5/m, z0.s, z12.s \n\t"  \
 | 
			
		||||
    "fadd z1.s, p5/m, z1.s, z13.s \n\t"  \
 | 
			
		||||
    "fadd z2.s, p5/m, z2.s, z14.s \n\t"  \
 | 
			
		||||
    "fadd z3.s, p5/m, z3.s, z15.s \n\t"  \
 | 
			
		||||
    "fadd z4.s, p5/m, z4.s, z16.s \n\t"  \
 | 
			
		||||
    "fadd z5.s, p5/m, z5.s, z17.s \n\t"  \
 | 
			
		||||
    "fadd z6.s, p5/m, z6.s, z18.s \n\t"  \
 | 
			
		||||
    "fadd z7.s, p5/m, z7.s, z19.s \n\t"  \
 | 
			
		||||
    "fadd z8.s, p5/m, z8.s, z20.s \n\t"  \
 | 
			
		||||
    "fadd z9.s, p5/m, z9.s, z21.s \n\t"  \
 | 
			
		||||
    "fadd z10.s, p5/m, z10.s, z22.s \n\t"  \
 | 
			
		||||
    "fadd z11.s, p5/m, z11.s, z23.s \n\t"  \
 | 
			
		||||
    :  \
 | 
			
		||||
    :  \
 | 
			
		||||
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
 | 
			
		||||
); 
 | 
			
		||||
 | 
			
		||||
@@ -38,10 +38,11 @@ Author: Nils Meyer <nils.meyer@ur.de>
 | 
			
		||||
#define LOCK_GAUGE(A)  
 | 
			
		||||
#define UNLOCK_GAUGE(A)  
 | 
			
		||||
#define MASK_REGS                      DECLARATIONS_A64FXd  
 | 
			
		||||
#define SAVE_RESULT(A,B)               RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B)  
 | 
			
		||||
#define SAVE_RESULT(A,B)               RESULT_A64FXd(A);  
 | 
			
		||||
#define MULT_2SPIN_1(Dir)              MULT_2SPIN_1_A64FXd(Dir)  
 | 
			
		||||
#define MULT_2SPIN_2                   MULT_2SPIN_2_A64FXd  
 | 
			
		||||
#define LOAD_CHI(base)                 LOAD_CHI_A64FXd(base)  
 | 
			
		||||
#define ZERO_PSI                       ZERO_PSI_A64FXd  
 | 
			
		||||
#define ADD_RESULT(base,basep)         LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)  
 | 
			
		||||
#define XP_PROJ                        XP_PROJ_A64FXd  
 | 
			
		||||
#define YP_PROJ                        YP_PROJ_A64FXd  
 | 
			
		||||
@@ -70,6 +71,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
 | 
			
		||||
#define MAYBEPERM(Dir,perm)            if (Dir != 3) { if (perm) { PERMUTE; } }  
 | 
			
		||||
// DECLARATIONS
 | 
			
		||||
#define DECLARATIONS_A64FXd  \
 | 
			
		||||
    uint64_t baseU; \
 | 
			
		||||
    const uint64_t lut[4][8] = { \
 | 
			
		||||
        {4, 5, 6, 7, 0, 1, 2, 3}, \
 | 
			
		||||
        {2, 3, 0, 1, 6, 7, 4, 5}, \
 | 
			
		||||
@@ -126,114 +128,114 @@ Author: Nils Meyer <nils.meyer@ur.de>
 | 
			
		||||
// RESULT
 | 
			
		||||
#define RESULT_A64FXd(base)  \
 | 
			
		||||
{ \
 | 
			
		||||
    svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64), result_00);  \
 | 
			
		||||
    svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64), result_01);  \
 | 
			
		||||
    svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64), result_02);  \
 | 
			
		||||
    svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64), result_10);  \
 | 
			
		||||
    svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64), result_11);  \
 | 
			
		||||
    svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64), result_12);  \
 | 
			
		||||
    svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64), result_20);  \
 | 
			
		||||
    svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64), result_21);  \
 | 
			
		||||
    svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64), result_22);  \
 | 
			
		||||
    svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64), result_30);  \
 | 
			
		||||
    svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64), result_31);  \
 | 
			
		||||
    svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64), result_32);  \
 | 
			
		||||
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-6), result_00);  \
 | 
			
		||||
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-5), result_01);  \
 | 
			
		||||
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-4), result_02);  \
 | 
			
		||||
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-3), result_10);  \
 | 
			
		||||
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-2), result_11);  \
 | 
			
		||||
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-1), result_12);  \
 | 
			
		||||
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(0), result_20);  \
 | 
			
		||||
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(1), result_21);  \
 | 
			
		||||
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(2), result_22);  \
 | 
			
		||||
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(3), result_30);  \
 | 
			
		||||
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(4), result_31);  \
 | 
			
		||||
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(5), result_32);  \
 | 
			
		||||
}
 | 
			
		||||
// PREFETCH_CHIMU_L2 (prefetch to L2)
 | 
			
		||||
#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base)  \
 | 
			
		||||
{ \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL2STRM); \
 | 
			
		||||
}
 | 
			
		||||
// PREFETCH_CHIMU_L1 (prefetch to L1)
 | 
			
		||||
#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base)  \
 | 
			
		||||
{ \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL1STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL1STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL1STRM); \
 | 
			
		||||
}
 | 
			
		||||
// PREFETCH_GAUGE_L2 (prefetch to L2)
 | 
			
		||||
#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A)  \
 | 
			
		||||
{ \
 | 
			
		||||
    const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \
 | 
			
		||||
    const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(-4), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(12), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(16), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(20), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(24), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(28), SV_PLDL2STRM); \
 | 
			
		||||
}
 | 
			
		||||
// PREFETCH_GAUGE_L1 (prefetch to L1)
 | 
			
		||||
#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)  \
 | 
			
		||||
{ \
 | 
			
		||||
    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \
 | 
			
		||||
    const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL1STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL1STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL1STRM); \
 | 
			
		||||
}
 | 
			
		||||
// LOAD_CHI
 | 
			
		||||
#define LOAD_CHI_A64FXd(base)  \
 | 
			
		||||
{ \
 | 
			
		||||
    Chi_00 = svld1(pg1, (float64_t*)(base + 0 * 64));  \
 | 
			
		||||
    Chi_01 = svld1(pg1, (float64_t*)(base + 1 * 64));  \
 | 
			
		||||
    Chi_02 = svld1(pg1, (float64_t*)(base + 2 * 64));  \
 | 
			
		||||
    Chi_10 = svld1(pg1, (float64_t*)(base + 3 * 64));  \
 | 
			
		||||
    Chi_11 = svld1(pg1, (float64_t*)(base + 4 * 64));  \
 | 
			
		||||
    Chi_12 = svld1(pg1, (float64_t*)(base + 5 * 64));  \
 | 
			
		||||
    Chi_00 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(0));  \
 | 
			
		||||
    Chi_01 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(1));  \
 | 
			
		||||
    Chi_02 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(2));  \
 | 
			
		||||
    Chi_10 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(3));  \
 | 
			
		||||
    Chi_11 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(4));  \
 | 
			
		||||
    Chi_12 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(5));  \
 | 
			
		||||
}
 | 
			
		||||
// LOAD_CHIMU
 | 
			
		||||
#define LOAD_CHIMU_INTERLEAVED_A64FXd(base)  \
 | 
			
		||||
{ \
 | 
			
		||||
    Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64));  \
 | 
			
		||||
    Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64));  \
 | 
			
		||||
    Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64));  \
 | 
			
		||||
    Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64));  \
 | 
			
		||||
    Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64));  \
 | 
			
		||||
    Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64));  \
 | 
			
		||||
    Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64));  \
 | 
			
		||||
    Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64));  \
 | 
			
		||||
    Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64));  \
 | 
			
		||||
    Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64));  \
 | 
			
		||||
    Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64));  \
 | 
			
		||||
    Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64));  \
 | 
			
		||||
    Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6));  \
 | 
			
		||||
    Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3));  \
 | 
			
		||||
    Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3));  \
 | 
			
		||||
    Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0));  \
 | 
			
		||||
    Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5));  \
 | 
			
		||||
    Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4));  \
 | 
			
		||||
    Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2));  \
 | 
			
		||||
    Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1));  \
 | 
			
		||||
    Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4));  \
 | 
			
		||||
    Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5));  \
 | 
			
		||||
    Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1));  \
 | 
			
		||||
    Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2));  \
 | 
			
		||||
}
 | 
			
		||||
// LOAD_CHIMU_0213
 | 
			
		||||
#define LOAD_CHIMU_0213_A64FXd  \
 | 
			
		||||
{ \
 | 
			
		||||
    const SiteSpinor & ref(in[offset]); \
 | 
			
		||||
    Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64));  \
 | 
			
		||||
    Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64));  \
 | 
			
		||||
    Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64));  \
 | 
			
		||||
    Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64));  \
 | 
			
		||||
    Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64));  \
 | 
			
		||||
    Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64));  \
 | 
			
		||||
    Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64));  \
 | 
			
		||||
    Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64));  \
 | 
			
		||||
    Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64));  \
 | 
			
		||||
    Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64));  \
 | 
			
		||||
    Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64));  \
 | 
			
		||||
    Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64));  \
 | 
			
		||||
    Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6));  \
 | 
			
		||||
    Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0));  \
 | 
			
		||||
    Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5));  \
 | 
			
		||||
    Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1));  \
 | 
			
		||||
    Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4));  \
 | 
			
		||||
    Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2));  \
 | 
			
		||||
    Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3));  \
 | 
			
		||||
    Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3));  \
 | 
			
		||||
    Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2));  \
 | 
			
		||||
    Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4));  \
 | 
			
		||||
    Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1));  \
 | 
			
		||||
    Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5));  \
 | 
			
		||||
}
 | 
			
		||||
// LOAD_CHIMU_0312
 | 
			
		||||
#define LOAD_CHIMU_0312_A64FXd  \
 | 
			
		||||
{ \
 | 
			
		||||
    const SiteSpinor & ref(in[offset]); \
 | 
			
		||||
    Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64));  \
 | 
			
		||||
    Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64));  \
 | 
			
		||||
    Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64));  \
 | 
			
		||||
    Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64));  \
 | 
			
		||||
    Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64));  \
 | 
			
		||||
    Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64));  \
 | 
			
		||||
    Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64));  \
 | 
			
		||||
    Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64));  \
 | 
			
		||||
    Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64));  \
 | 
			
		||||
    Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64));  \
 | 
			
		||||
    Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64));  \
 | 
			
		||||
    Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64));  \
 | 
			
		||||
    Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6));  \
 | 
			
		||||
    Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3));  \
 | 
			
		||||
    Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5));  \
 | 
			
		||||
    Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4));  \
 | 
			
		||||
    Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4));  \
 | 
			
		||||
    Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5));  \
 | 
			
		||||
    Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3));  \
 | 
			
		||||
    Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0));  \
 | 
			
		||||
    Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2));  \
 | 
			
		||||
    Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1));  \
 | 
			
		||||
    Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1));  \
 | 
			
		||||
    Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2));  \
 | 
			
		||||
}
 | 
			
		||||
// LOAD_TABLE0
 | 
			
		||||
#define LOAD_TABLE0  \
 | 
			
		||||
@@ -261,26 +263,26 @@ Author: Nils Meyer <nils.meyer@ur.de>
 | 
			
		||||
    Chi_12 = svtbl(Chi_12, table0);    
 | 
			
		||||
 | 
			
		||||
// LOAD_GAUGE
 | 
			
		||||
#define LOAD_GAUGE  \
 | 
			
		||||
    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
 | 
			
		||||
#define LOAD_GAUGE(A)  \
 | 
			
		||||
{ \
 | 
			
		||||
    U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64));  \
 | 
			
		||||
    U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64));  \
 | 
			
		||||
    U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64));  \
 | 
			
		||||
    U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64));  \
 | 
			
		||||
    U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64));  \
 | 
			
		||||
    U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64));  \
 | 
			
		||||
    const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
 | 
			
		||||
    U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-6));  \
 | 
			
		||||
    U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-3));  \
 | 
			
		||||
    U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(0));  \
 | 
			
		||||
    U_01 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-5));  \
 | 
			
		||||
    U_11 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-2));  \
 | 
			
		||||
    U_21 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(1));  \
 | 
			
		||||
}
 | 
			
		||||
// MULT_2SPIN
 | 
			
		||||
#define MULT_2SPIN_1_A64FXd(A)  \
 | 
			
		||||
{ \
 | 
			
		||||
    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
 | 
			
		||||
    U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64));  \
 | 
			
		||||
    U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64));  \
 | 
			
		||||
    U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64));  \
 | 
			
		||||
    U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64));  \
 | 
			
		||||
    U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64));  \
 | 
			
		||||
    U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64));  \
 | 
			
		||||
    const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
 | 
			
		||||
    U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-6));  \
 | 
			
		||||
    U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-3));  \
 | 
			
		||||
    U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(0));  \
 | 
			
		||||
    U_01 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-5));  \
 | 
			
		||||
    U_11 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-2));  \
 | 
			
		||||
    U_21 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(1));  \
 | 
			
		||||
    UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \
 | 
			
		||||
    UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \
 | 
			
		||||
    UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \
 | 
			
		||||
@@ -293,9 +295,9 @@ Author: Nils Meyer <nils.meyer@ur.de>
 | 
			
		||||
    UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \
 | 
			
		||||
    UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \
 | 
			
		||||
    UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \
 | 
			
		||||
    U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -4 * 64));  \
 | 
			
		||||
    U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -1 * 64));  \
 | 
			
		||||
    U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 2 * 64));  \
 | 
			
		||||
    U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-4));  \
 | 
			
		||||
    U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-1));  \
 | 
			
		||||
    U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(2));  \
 | 
			
		||||
}
 | 
			
		||||
// MULT_2SPIN_BACKEND
 | 
			
		||||
#define MULT_2SPIN_2_A64FXd  \
 | 
			
		||||
@@ -570,12 +572,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
 | 
			
		||||
    result_31 = svdup_f64(0.); \
 | 
			
		||||
    result_32 = svdup_f64(0.); 
 | 
			
		||||
 | 
			
		||||
// PREFETCH_RESULT_L2_STORE (prefetch store to L2)
 | 
			
		||||
// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing)
 | 
			
		||||
#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base)  \
 | 
			
		||||
{ \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \
 | 
			
		||||
    asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 0) : "memory" ); \
 | 
			
		||||
    asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 1) : "memory" ); \
 | 
			
		||||
    asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 2) : "memory" ); \
 | 
			
		||||
}
 | 
			
		||||
// PREFETCH_RESULT_L1_STORE (prefetch store to L1)
 | 
			
		||||
#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base)  \
 | 
			
		||||
 
 | 
			
		||||
@@ -38,10 +38,11 @@ Author: Nils Meyer <nils.meyer@ur.de>
 | 
			
		||||
#define LOCK_GAUGE(A)  
 | 
			
		||||
#define UNLOCK_GAUGE(A)  
 | 
			
		||||
#define MASK_REGS                      DECLARATIONS_A64FXf  
 | 
			
		||||
#define SAVE_RESULT(A,B)               RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B)  
 | 
			
		||||
#define SAVE_RESULT(A,B)               RESULT_A64FXf(A);  
 | 
			
		||||
#define MULT_2SPIN_1(Dir)              MULT_2SPIN_1_A64FXf(Dir)  
 | 
			
		||||
#define MULT_2SPIN_2                   MULT_2SPIN_2_A64FXf  
 | 
			
		||||
#define LOAD_CHI(base)                 LOAD_CHI_A64FXf(base)  
 | 
			
		||||
#define ZERO_PSI                       ZERO_PSI_A64FXf  
 | 
			
		||||
#define ADD_RESULT(base,basep)         LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)  
 | 
			
		||||
#define XP_PROJ                        XP_PROJ_A64FXf  
 | 
			
		||||
#define YP_PROJ                        YP_PROJ_A64FXf  
 | 
			
		||||
@@ -70,6 +71,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
 | 
			
		||||
#define MAYBEPERM(A,perm)              if (perm) { PERMUTE; }  
 | 
			
		||||
// DECLARATIONS
 | 
			
		||||
#define DECLARATIONS_A64FXf  \
 | 
			
		||||
    uint64_t baseU; \
 | 
			
		||||
    const uint32_t lut[4][16] = { \
 | 
			
		||||
        {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \
 | 
			
		||||
        {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \
 | 
			
		||||
@@ -126,114 +128,114 @@ Author: Nils Meyer <nils.meyer@ur.de>
 | 
			
		||||
// RESULT
 | 
			
		||||
#define RESULT_A64FXf(base)  \
 | 
			
		||||
{ \
 | 
			
		||||
    svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64), result_00);  \
 | 
			
		||||
    svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64), result_01);  \
 | 
			
		||||
    svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64), result_02);  \
 | 
			
		||||
    svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64), result_10);  \
 | 
			
		||||
    svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64), result_11);  \
 | 
			
		||||
    svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64), result_12);  \
 | 
			
		||||
    svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64), result_20);  \
 | 
			
		||||
    svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64), result_21);  \
 | 
			
		||||
    svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64), result_22);  \
 | 
			
		||||
    svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64), result_30);  \
 | 
			
		||||
    svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64), result_31);  \
 | 
			
		||||
    svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64), result_32);  \
 | 
			
		||||
    svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-6), result_00);  \
 | 
			
		||||
    svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-5), result_01);  \
 | 
			
		||||
    svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-4), result_02);  \
 | 
			
		||||
    svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-3), result_10);  \
 | 
			
		||||
    svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-2), result_11);  \
 | 
			
		||||
    svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-1), result_12);  \
 | 
			
		||||
    svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(0), result_20);  \
 | 
			
		||||
    svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(1), result_21);  \
 | 
			
		||||
    svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(2), result_22);  \
 | 
			
		||||
    svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(3), result_30);  \
 | 
			
		||||
    svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(4), result_31);  \
 | 
			
		||||
    svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(5), result_32);  \
 | 
			
		||||
}
 | 
			
		||||
// PREFETCH_CHIMU_L2 (prefetch to L2)
 | 
			
		||||
#define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base)  \
 | 
			
		||||
{ \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL2STRM); \
 | 
			
		||||
}
 | 
			
		||||
// PREFETCH_CHIMU_L1 (prefetch to L1)
 | 
			
		||||
#define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base)  \
 | 
			
		||||
{ \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL1STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL1STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL1STRM); \
 | 
			
		||||
}
 | 
			
		||||
// PREFETCH_GAUGE_L2 (prefetch to L2)
 | 
			
		||||
#define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A)  \
 | 
			
		||||
{ \
 | 
			
		||||
    const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \
 | 
			
		||||
    const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(-4), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(12), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(16), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(20), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(24), SV_PLDL2STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(28), SV_PLDL2STRM); \
 | 
			
		||||
}
 | 
			
		||||
// PREFETCH_GAUGE_L1 (prefetch to L1)
 | 
			
		||||
#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)  \
 | 
			
		||||
{ \
 | 
			
		||||
    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \
 | 
			
		||||
    const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL1STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL1STRM); \
 | 
			
		||||
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL1STRM); \
 | 
			
		||||
}
 | 
			
		||||
// LOAD_CHI
 | 
			
		||||
#define LOAD_CHI_A64FXf(base)  \
 | 
			
		||||
{ \
 | 
			
		||||
    Chi_00 = svld1(pg1, (float32_t*)(base + 0 * 64));  \
 | 
			
		||||
    Chi_01 = svld1(pg1, (float32_t*)(base + 1 * 64));  \
 | 
			
		||||
    Chi_02 = svld1(pg1, (float32_t*)(base + 2 * 64));  \
 | 
			
		||||
    Chi_10 = svld1(pg1, (float32_t*)(base + 3 * 64));  \
 | 
			
		||||
    Chi_11 = svld1(pg1, (float32_t*)(base + 4 * 64));  \
 | 
			
		||||
    Chi_12 = svld1(pg1, (float32_t*)(base + 5 * 64));  \
 | 
			
		||||
    Chi_00 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(0));  \
 | 
			
		||||
    Chi_01 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(1));  \
 | 
			
		||||
    Chi_02 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(2));  \
 | 
			
		||||
    Chi_10 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(3));  \
 | 
			
		||||
    Chi_11 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(4));  \
 | 
			
		||||
    Chi_12 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(5));  \
 | 
			
		||||
}
 | 
			
		||||
// LOAD_CHIMU
 | 
			
		||||
#define LOAD_CHIMU_INTERLEAVED_A64FXf(base)  \
 | 
			
		||||
{ \
 | 
			
		||||
    Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64));  \
 | 
			
		||||
    Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64));  \
 | 
			
		||||
    Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64));  \
 | 
			
		||||
    Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64));  \
 | 
			
		||||
    Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64));  \
 | 
			
		||||
    Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64));  \
 | 
			
		||||
    Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64));  \
 | 
			
		||||
    Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64));  \
 | 
			
		||||
    Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64));  \
 | 
			
		||||
    Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64));  \
 | 
			
		||||
    Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64));  \
 | 
			
		||||
    Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64));  \
 | 
			
		||||
    Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6));  \
 | 
			
		||||
    Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3));  \
 | 
			
		||||
    Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3));  \
 | 
			
		||||
    Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0));  \
 | 
			
		||||
    Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5));  \
 | 
			
		||||
    Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4));  \
 | 
			
		||||
    Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2));  \
 | 
			
		||||
    Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1));  \
 | 
			
		||||
    Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4));  \
 | 
			
		||||
    Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5));  \
 | 
			
		||||
    Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1));  \
 | 
			
		||||
    Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2));  \
 | 
			
		||||
}
 | 
			
		||||
// LOAD_CHIMU_0213
 | 
			
		||||
#define LOAD_CHIMU_0213_A64FXf  \
 | 
			
		||||
{ \
 | 
			
		||||
    const SiteSpinor & ref(in[offset]); \
 | 
			
		||||
    Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64));  \
 | 
			
		||||
    Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64));  \
 | 
			
		||||
    Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64));  \
 | 
			
		||||
    Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64));  \
 | 
			
		||||
    Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64));  \
 | 
			
		||||
    Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64));  \
 | 
			
		||||
    Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64));  \
 | 
			
		||||
    Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64));  \
 | 
			
		||||
    Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64));  \
 | 
			
		||||
    Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64));  \
 | 
			
		||||
    Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64));  \
 | 
			
		||||
    Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64));  \
 | 
			
		||||
    Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6));  \
 | 
			
		||||
    Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0));  \
 | 
			
		||||
    Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5));  \
 | 
			
		||||
    Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1));  \
 | 
			
		||||
    Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4));  \
 | 
			
		||||
    Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2));  \
 | 
			
		||||
    Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3));  \
 | 
			
		||||
    Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3));  \
 | 
			
		||||
    Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2));  \
 | 
			
		||||
    Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4));  \
 | 
			
		||||
    Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1));  \
 | 
			
		||||
    Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5));  \
 | 
			
		||||
}
 | 
			
		||||
// LOAD_CHIMU_0312
 | 
			
		||||
#define LOAD_CHIMU_0312_A64FXf  \
 | 
			
		||||
{ \
 | 
			
		||||
    const SiteSpinor & ref(in[offset]); \
 | 
			
		||||
    Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64));  \
 | 
			
		||||
    Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64));  \
 | 
			
		||||
    Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64));  \
 | 
			
		||||
    Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64));  \
 | 
			
		||||
    Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64));  \
 | 
			
		||||
    Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64));  \
 | 
			
		||||
    Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64));  \
 | 
			
		||||
    Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64));  \
 | 
			
		||||
    Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64));  \
 | 
			
		||||
    Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64));  \
 | 
			
		||||
    Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64));  \
 | 
			
		||||
    Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64));  \
 | 
			
		||||
    Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6));  \
 | 
			
		||||
    Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3));  \
 | 
			
		||||
    Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5));  \
 | 
			
		||||
    Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4));  \
 | 
			
		||||
    Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4));  \
 | 
			
		||||
    Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5));  \
 | 
			
		||||
    Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3));  \
 | 
			
		||||
    Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0));  \
 | 
			
		||||
    Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2));  \
 | 
			
		||||
    Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1));  \
 | 
			
		||||
    Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1));  \
 | 
			
		||||
    Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2));  \
 | 
			
		||||
}
 | 
			
		||||
// LOAD_TABLE0
 | 
			
		||||
#define LOAD_TABLE0  \
 | 
			
		||||
@@ -261,26 +263,26 @@ Author: Nils Meyer <nils.meyer@ur.de>
 | 
			
		||||
    Chi_12 = svtbl(Chi_12, table0);    
 | 
			
		||||
 | 
			
		||||
// LOAD_GAUGE
 | 
			
		||||
#define LOAD_GAUGE  \
 | 
			
		||||
    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
 | 
			
		||||
#define LOAD_GAUGE(A)  \
 | 
			
		||||
{ \
 | 
			
		||||
    U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64));  \
 | 
			
		||||
    U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64));  \
 | 
			
		||||
    U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64));  \
 | 
			
		||||
    U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64));  \
 | 
			
		||||
    U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64));  \
 | 
			
		||||
    U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64));  \
 | 
			
		||||
    const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
 | 
			
		||||
    U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-6));  \
 | 
			
		||||
    U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-3));  \
 | 
			
		||||
    U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(0));  \
 | 
			
		||||
    U_01 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-5));  \
 | 
			
		||||
    U_11 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-2));  \
 | 
			
		||||
    U_21 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(1));  \
 | 
			
		||||
}
 | 
			
		||||
// MULT_2SPIN
 | 
			
		||||
#define MULT_2SPIN_1_A64FXf(A)  \
 | 
			
		||||
{ \
 | 
			
		||||
    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
 | 
			
		||||
    U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64));  \
 | 
			
		||||
    U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64));  \
 | 
			
		||||
    U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64));  \
 | 
			
		||||
    U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64));  \
 | 
			
		||||
    U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64));  \
 | 
			
		||||
    U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64));  \
 | 
			
		||||
    const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
 | 
			
		||||
    U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-6));  \
 | 
			
		||||
    U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-3));  \
 | 
			
		||||
    U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(0));  \
 | 
			
		||||
    U_01 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-5));  \
 | 
			
		||||
    U_11 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-2));  \
 | 
			
		||||
    U_21 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(1));  \
 | 
			
		||||
    UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \
 | 
			
		||||
    UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \
 | 
			
		||||
    UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \
 | 
			
		||||
@@ -293,9 +295,9 @@ Author: Nils Meyer <nils.meyer@ur.de>
 | 
			
		||||
    UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \
 | 
			
		||||
    UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \
 | 
			
		||||
    UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \
 | 
			
		||||
    U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -4 * 64));  \
 | 
			
		||||
    U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -1 * 64));  \
 | 
			
		||||
    U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 2 * 64));  \
 | 
			
		||||
    U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-4));  \
 | 
			
		||||
    U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-1));  \
 | 
			
		||||
    U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(2));  \
 | 
			
		||||
}
 | 
			
		||||
// MULT_2SPIN_BACKEND
 | 
			
		||||
#define MULT_2SPIN_2_A64FXf  \
 | 
			
		||||
@@ -570,12 +572,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
 | 
			
		||||
    result_31 = svdup_f32(0.); \
 | 
			
		||||
    result_32 = svdup_f32(0.); 
 | 
			
		||||
 | 
			
		||||
// PREFETCH_RESULT_L2_STORE (prefetch store to L2)
 | 
			
		||||
// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing)
 | 
			
		||||
#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base)  \
 | 
			
		||||
{ \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \
 | 
			
		||||
    svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \
 | 
			
		||||
    asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 0) : "memory" ); \
 | 
			
		||||
    asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 1) : "memory" ); \
 | 
			
		||||
    asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 2) : "memory" ); \
 | 
			
		||||
}
 | 
			
		||||
// PREFETCH_RESULT_L1_STORE (prefetch store to L1)
 | 
			
		||||
#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base)  \
 | 
			
		||||
 
 | 
			
		||||
@@ -46,6 +46,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
 | 
			
		||||
#undef MULT_2SPIN_2
 | 
			
		||||
#undef MAYBEPERM
 | 
			
		||||
#undef LOAD_CHI
 | 
			
		||||
#undef ZERO_PSI
 | 
			
		||||
#undef XP_PROJ
 | 
			
		||||
#undef YP_PROJ
 | 
			
		||||
#undef ZP_PROJ
 | 
			
		||||
 
 | 
			
		||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
		Reference in New Issue
	
	Block a user