diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h
new file mode 100644
index 00000000..4e428097
--- /dev/null
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h
@@ -0,0 +1,660 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/qcd/action/fermion/WilsonKernelsAsmA64FX.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle
+Author: paboyle
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/* END LEGAL */
+#pragma once
+
+#if defined(A64FXINTRIN)
+#pragma message("A64FX Wilson kernels intrin")
+#else
+#pragma message("A64FX Wilson kernels asm")
+#endif
+
+#if defined(A64FX)
+  ///////////////////////////////////////////////////////////
+  // If we are A64FX specialise the single precision routine
+  ///////////////////////////////////////////////////////////
+#if defined(A64FXINTRIN)
+#include <simd/Fujitsu_A64FX_intrin_single.h>
+#else
+#include <simd/Fujitsu_A64FX_asm_single.h>
+#endif
+
+
+/// Switch off the 5d vectorised code optimisations
+#undef DWFVEC5D
+
+/////////////////////////////////////////////////////////////////
+// XYZT vectorised, undag Kernel, single
+/////////////////////////////////////////////////////////////////
+#undef KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+template<> void
+WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+/////////////////////////////////////////////////////////////////
+// XYZT vectorised, dag Kernel, single
+/////////////////////////////////////////////////////////////////
+#define KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+template<> void
+WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+#undef MAYBEPERM
+//#undef MULT_2SPIN
+#define MAYBEPERM(A,B)
+//#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
+
+/////////////////////////////////////////////////////////////////
+// Ls vectorised, undag Kernel, single
+/////////////////////////////////////////////////////////////////
+
+#ifdef DWFVEC5D
+
+#undef KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+#undef MULT_2SPIN
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf)
+template<> void
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+/////////////////////////////////////////////////////////////////
+// Ls vectorised, dag Kernel, single
+/////////////////////////////////////////////////////////////////
+#define KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+template<> void
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+#endif // VEC 5D
+
+//#undef COMPLEX_SIGNS
+//#undef MAYBEPERM
+//#undef MULT_2SPIN
+
+// undefine everything
+#include <simd/Fujitsu_A64FX_undef.h>
+
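That closes the single-precision half. A reading aid for the three macro states that recur above: each kernel body is generated once per state so communication can be overlapped with computation. AsmDhopSite handles all eight hop terms, AsmDhopSiteInt only terms whose neighbour is on-node, and AsmDhopSiteExt only the terms completed by the halo exchange. A schematic of the state-driven body, with hypothetical helper names (not the real body file):

    // One source body, compiled three times under different macro states.
    // do_hop() and is_local() are hypothetical stand-ins for the real stencil code.
    for (int mu = 0; mu < 8; mu++) {
    #if defined(INTERIOR_AND_EXTERIOR)
      do_hop(mu);                        // AsmDhopSite: every term
    #elif defined(INTERIOR)
      if (is_local(mu))  do_hop(mu);     // AsmDhopSiteInt: while comms run
    #elif defined(EXTERIOR)
      if (!is_local(mu)) do_hop(mu);     // AsmDhopSiteExt: after halo arrives
    #endif
    }
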
+///////////////////////////////////////////////////////////
+// If we are A64FX specialise the double precision routine
+///////////////////////////////////////////////////////////
+
+#if defined(A64FXINTRIN)
+#include <simd/Fujitsu_A64FX_intrin_double.h>
+#else
+#include <simd/Fujitsu_A64FX_asm_double.h>
+#endif
+
+// KNL stuff
+//#define MAYBEPERM(A,perm) if (perm) { A ; }
+//#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
+//#define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0];
+
+
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+
+/////////////////////////////////////////////////////////////////
+// XYZT vectorised, undag Kernel, double
+/////////////////////////////////////////////////////////////////
+#undef KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<WilsonImplDH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZWilsonImplDH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<WilsonImplDH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZWilsonImplDH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+template<> void
+WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<WilsonImplDH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZWilsonImplDH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+/////////////////////////////////////////////////////////////////
+// XYZT vectorised, dag Kernel, double
+/////////////////////////////////////////////////////////////////
+#define KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<WilsonImplDH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZWilsonImplDH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<WilsonImplDH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZWilsonImplDH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+template<> void
+WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<WilsonImplDH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZWilsonImplDH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+
+// KNL stuff
+//#undef MAYBEPERM
+//#undef MULT_2SPIN
+#define MAYBEPERM(A,B)
+//#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
+
+/////////////////////////////////////////////////////////////////
+// Ls vectorised, undag Kernel, double
+/////////////////////////////////////////////////////////////////
+#ifdef DWFVEC5D
+
+#undef KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<DomainWallVec5dImplDH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplDH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<DomainWallVec5dImplDH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplDH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+#undef MULT_2SPIN
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf)
+template<> void
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<DomainWallVec5dImplDH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplDH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+/////////////////////////////////////////////////////////////////
+// Ls vectorised, dag Kernel, double
+/////////////////////////////////////////////////////////////////
+#define KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<DomainWallVec5dImplDH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplDH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<DomainWallVec5dImplDH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplDH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+
+template<> void
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+template<> void
+WilsonKernels<DomainWallVec5dImplDH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplDH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+        int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+
+#endif // VEC 5D
+
+// undefs
+#include <simd/Fujitsu_A64FX_undef.h>
+
+#endif //A64FX
diff --git a/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiationAsm.cc b/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiationAsm.cc
index f6f235c8..a8e9e6d9 100644
--- a/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiationAsm.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiationAsm.cc
@@ -37,6 +37,7 @@ directory
 ////////////////////////////////////////////////////////////////////////
 NAMESPACE_BEGIN(Grid);
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h>
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmQPX.h>
 NAMESPACE_END(Grid);
 
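One idiom in the kernel header above deserves a note for reviewers: each explicit specialisation is declared with no function body, and the #include on the following line supplies the body as a bare compound statement, recompiled under whatever KERNEL_DAG / INTERIOR / EXTERIOR / MULT_2SPIN macro state is in force at that point. A minimal sketch with hypothetical names (the real body file is the WilsonKernelsAsmBodyA64FX.h referenced above):

    // body.h -- a bare compound statement, no function header:
    {
    #ifdef KERNEL_DAG
      out = -in;              // variant chosen by macro state at #include time
    #else
      out =  in;
    #endif
    }

    // kernels.cc -- every specialisation stamps out the same body:
    template<class Impl> struct Kernels { static void Site(int in, int &out); };

    #undef KERNEL_DAG
    template<> void Kernels<float>::Site(int in, int &out)
    #include "body.h"

    #define KERNEL_DAG
    template<> void Kernels<double>::Site(int in, int &out)
    #include "body.h"

One body file thus expands into the whole matrix of kernels (four implementations, interior/exterior/both, dag/undag) with no duplicated source.
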
diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h
new file mode 100644
index 00000000..57636961
--- /dev/null
+++ b/Grid/simd/Fujitsu_A64FX_asm_double.h
@@ -0,0 +1,691 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: XXX
+
+    Copyright (C) 2020
+
+Author: Nils Meyer
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/* END LEGAL */
+#define LOAD_CHIMU_A64FXd(x)  LOAD_CHIMU_INTERLEAVED_A64FXd(x)
+#define PREFETCH_CHIMU_L1(A)
+#define PREFETCH_GAUGE_L1(A)
+#define PREFETCH_CHIMU_L2(A)
+#define PREFETCH_GAUGE_L2(A)
+#define PF_GAUGE(A)
+#define PREFETCH1_CHIMU(A)
+#define PREFETCH_CHIMU(A)
+#define LOCK_GAUGE(A)
+#define UNLOCK_GAUGE(A)
+#define MASK_REGS  DECLARATIONS_A64FXd(A)
+#define COMPLEX_SIGNS(A)
+#define LOAD64(A,B)
+#define SAVE_RESULT(A,B)  RESULT_A64FXd(A)
+#define MULT_2SPIN_DIR_PF(A,B)  MULT_2SPIN_A64FXd(A)
+#define MAYBEPERM(A,perm)  if (perm) { A ; }
+#define LOAD_CHI(base)  LOAD_CHI_A64FXd(base)
+#define ZERO_PSI  ZERO_PSI_A64FXd
+#define XP_PROJMEM(base)  LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd
+#define YP_PROJMEM(base)  LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd
+#define ZP_PROJMEM(base)  LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd
+#define TP_PROJMEM(base)  LOAD_CHIMU_A64FXd(base); TP_PROJ_A64FXd
+#define XM_PROJMEM(base)  LOAD_CHIMU_A64FXd(base); XM_PROJ_A64FXd
+#define YM_PROJMEM(base)  LOAD_CHIMU_A64FXd(base); YM_PROJ_A64FXd
+#define ZM_PROJMEM(base)  LOAD_CHIMU_A64FXd(base); ZM_PROJ_A64FXd
+#define TM_PROJMEM(base)  LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd
+#define XP_RECON  XP_RECON_A64FXd
+#define XM_RECON  XM_RECON_A64FXd
+#define YM_RECON_ACCUM  YM_RECON_ACCUM_A64FXd
+#define ZM_RECON_ACCUM  ZM_RECON_ACCUM_A64FXd
+#define TM_RECON_ACCUM  TM_RECON_ACCUM_A64FXd
+#define XP_RECON_ACCUM  XP_RECON_ACCUM_A64FXd
+#define YP_RECON_ACCUM  YP_RECON_ACCUM_A64FXd
+#define ZP_RECON_ACCUM  ZP_RECON_ACCUM_A64FXd
+#define TP_RECON_ACCUM  TP_RECON_ACCUM_A64FXd
+#define PERMUTE_DIR0  PERM0_A64FXd
+#define PERMUTE_DIR1  PERM1_A64FXd
+#define PERMUTE_DIR2  PERM2_A64FXd
+#define PERMUTE_DIR3  PERM3_A64FXd
+// DECLARATIONS
+#define DECLARATIONS_A64FXd(x)  \
+    const uint64_t lut[4][8] = { \
+        {4, 5, 6, 7, 0, 1, 2, 3}, \
+        {2, 3, 0, 1, 6, 7, 4, 5}, \
+        {1, 0, 3, 2, 5, 4, 7, 6}, \
+        {0, 1, 2, 4, 5, 6, 7, 8} };\
+asm ( \
+    "fmov z31.d , 0 \n\t" \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+);
+
+// RESULT
+#define RESULT_A64FXd(base)  \
+{ \
+asm ( \
+    "stnt1d { z0.d }, p5, [%[storeptr], -6, mul vl] \n\t" \
+    "stnt1d { z1.d }, p5, [%[storeptr], -5, mul vl] \n\t" \
+    "stnt1d { z2.d }, p5, [%[storeptr], -4, mul vl] \n\t" \
+    "stnt1d { z3.d }, p5, [%[storeptr], -3, mul vl] \n\t" \
+    "stnt1d { z4.d }, p5, [%[storeptr], -2, mul vl] \n\t" \
+    "stnt1d { z5.d }, p5, [%[storeptr], -1, mul vl] \n\t" \
+    "stnt1d { z6.d }, p5, [%[storeptr], 0, mul vl] \n\t" \
+    "stnt1d { z7.d }, p5, [%[storeptr], 1, mul vl] \n\t" \
+    "stnt1d { z8.d }, p5, [%[storeptr], 2, mul vl] \n\t" \
+    "stnt1d { z9.d }, p5, [%[storeptr], 3, mul vl] \n\t" \
+    "stnt1d { z10.d }, p5, [%[storeptr], 4, mul vl] \n\t" \
+    "stnt1d { z11.d }, p5, [%[storeptr], 5, mul vl] \n\t" \
+    :  \
+    : [storeptr] "r" (base + 2 * 3 * 64) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// PREFETCH_CHIMU_L2 (prefetch to L2)
+#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base)  \
+{ \
+asm ( \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
+    :  \
+    : [fetchptr] "r" (base) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// PREFETCH_CHIMU_L1 (prefetch to L1)
+#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base)  \
+{ \
+asm ( \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
+    :  \
+    : [fetchptr] "r" (base) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// PREFETCH_GAUGE_L2 (prefetch to L2)
+#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A)  \
+{ \
+    const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
+asm ( \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
+    :  \
+    : [fetchptr] "r" (baseU) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// PREFETCH_GAUGE_L1 (prefetch to L1)
+#define PREFETCH_GAUGE_L1_INTERNAL(A)_A64FXd  \
+{ \
+    const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
+asm ( \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
+    :  \
+    : [fetchptr] "r" (baseU) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// LOAD_CHI
+#define LOAD_CHI_A64FXd(base)  \
+{ \
+asm ( \
+    "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
+    "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
+    "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \
+    "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
+    "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
+    "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \
+    :  \
+    : [fetchptr] "r" (base + 2 * 3 * 64) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// LOAD_CHIMU
+#define LOAD_CHIMU_INTERLEAVED_A64FXd(base)  \
+{ \
+asm ( \
+    "ptrue p5.d \n\t" \
+    "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
+    "ld1d { z27.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \
+    "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
+    "ld1d { z24.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \
+    "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
+    "ld1d { z28.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \
+    "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
+    "ld1d { z25.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \
+    "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \
+    "ld1d { z29.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \
+    "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \
+    "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \
+    :  \
+    : [fetchptr] "r" (base + 2 * 3 * 64) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// LOAD_CHIMU_0213
+#define LOAD_CHIMU_0213_A64FXd  \
+{ \
+    const SiteSpinor & ref(in[offset]); \
+asm ( \
+    "ptrue p5.d \n\t" \
+    "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
+    "ld1d { z24.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \
+    "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
+    "ld1d { z25.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \
+    "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \
+    "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \
+    "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
+    "ld1d { z27.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \
+    "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
+    "ld1d { z28.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \
+    "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \
+    "ld1d { z29.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \
+    :  \
+    : [fetchptr] "r" (&ref[2][0]) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// LOAD_CHIMU_0312
+#define LOAD_CHIMU_0312_A64FXd  \
+{ \
+    const SiteSpinor & ref(in[offset]); \
+asm ( \
+    "ptrue p5.d \n\t" \
+    "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
+    "ld1d { z27.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \
+    "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
+    "ld1d { z28.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \
+    "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \
+    "ld1d { z29.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \
+    "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
+    "ld1d { z24.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \
+    "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
+    "ld1d { z25.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \
+    "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \
+    "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \
+    :  \
+    : [fetchptr] "r" (&ref[2][0]) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// PERM0
+#define PERM0_A64FXd  \
+asm ( \
+    "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
+    "tbl z12.d, { z12.d }, z30.d \n\t" \
+    "tbl z13.d, { z13.d }, z30.d \n\t" \
+    "tbl z14.d, { z14.d }, z30.d \n\t" \
+    "tbl z15.d, { z15.d }, z30.d \n\t" \
+    "tbl z16.d, { z16.d }, z30.d \n\t" \
+    "tbl z17.d, { z17.d }, z30.d \n\t" \
+    :  \
+    : [tableptr] "r" (&lut[0]),[index] "i" (0) \
+    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+);
+
+// PERM1
+#define PERM1_A64FXd  \
+asm ( \
+    "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
+    "tbl z12.d, { z12.d }, z30.d \n\t" \
+    "tbl z13.d, { z13.d }, z30.d \n\t" \
+    "tbl z14.d, { z14.d }, z30.d \n\t" \
+    "tbl z15.d, { z15.d }, z30.d \n\t" \
+    "tbl z16.d, { z16.d }, z30.d \n\t" \
+    "tbl z17.d, { z17.d }, z30.d \n\t" \
+    :  \
+    : [tableptr] "r" (&lut[0]),[index] "i" (1) \
+    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+);
+
+// PERM2
+#define PERM2_A64FXd  \
+asm ( \
+    "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
+    "tbl z12.d, { z12.d }, z30.d \n\t" \
+    "tbl z13.d, { z13.d }, z30.d \n\t" \
+    "tbl z14.d, { z14.d }, z30.d \n\t" \
+    "tbl z15.d, { z15.d }, z30.d \n\t" \
+    "tbl z16.d, { z16.d }, z30.d \n\t" \
+    "tbl z17.d, { z17.d }, z30.d \n\t" \
+    :  \
+    : [tableptr] "r" (&lut[0]),[index] "i" (2) \
+    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+);
+
+// PERM3
+#define PERM3_A64FXd
+
+// MULT_2SPIN
+#define MULT_2SPIN_A64FXd(A)  \
+{ \
+    const auto & ref(U[sU][A]); \
+asm ( \
+    "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
+    "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
+    "ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \
+    "ld1d { z27.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
+    "ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
+    "ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \
+    "fmov z18.d , 0 \n\t" \
+    "fmov z21.d , 0 \n\t" \
+    "fmov z19.d , 0 \n\t" \
+    "fmov z22.d , 0 \n\t" \
+    "fmov z20.d , 0 \n\t" \
+    "fmov z23.d , 0 \n\t" \
+    "fcmla z18.d, p5/m, z24.d, z12.d, 0 \n\t" \
+    "fcmla z21.d, p5/m, z24.d, z15.d, 0 \n\t" \
+    "fcmla z19.d, p5/m, z25.d, z12.d, 0 \n\t" \
+    "fcmla z22.d, p5/m, z25.d, z15.d, 0 \n\t" \
+    "fcmla z20.d, p5/m, z26.d, z12.d, 0 \n\t" \
+    "fcmla z23.d, p5/m, z26.d, z15.d, 0 \n\t" \
+    "fcmla z18.d, p5/m, z24.d, z12.d, 90 \n\t" \
+    "fcmla z21.d, p5/m, z24.d, z15.d, 90 \n\t" \
+    "fcmla z19.d, p5/m, z25.d, z12.d, 90 \n\t" \
+    "fcmla z22.d, p5/m, z25.d, z15.d, 90 \n\t" \
+    "fcmla z20.d, p5/m, z26.d, z12.d, 90 \n\t" \
+    "fcmla z23.d, p5/m, z26.d, z15.d, 90 \n\t" \
+    "ld1d { z24.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \
+    "ld1d { z25.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \
+    "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \
+    "fcmla z18.d, p5/m, z27.d, z13.d, 0 \n\t" \
+    "fcmla z21.d, p5/m, z27.d, z16.d, 0 \n\t" \
+    "fcmla z19.d, p5/m, z28.d, z13.d, 0 \n\t" \
+    "fcmla z22.d, p5/m, z28.d, z16.d, 0 \n\t" \
+    "fcmla z20.d, p5/m, z29.d, z13.d, 0 \n\t" \
+    "fcmla z23.d, p5/m, z29.d, z16.d, 0 \n\t" \
+    "fcmla z18.d, p5/m, z27.d, z13.d, 90 \n\t" \
+    "fcmla z21.d, p5/m, z27.d, z16.d, 90 \n\t" \
+    "fcmla z19.d, p5/m, z28.d, z13.d, 90 \n\t" \
+    "fcmla z22.d, p5/m, z28.d, z16.d, 90 \n\t" \
+    "fcmla z20.d, p5/m, z29.d, z13.d, 90 \n\t" \
+    "fcmla z23.d, p5/m, z29.d, z16.d, 90 \n\t" \
+    "fcmla z18.d, p5/m, z24.d, z14.d, 0 \n\t" \
+    "fcmla z21.d, p5/m, z24.d, z17.d, 0 \n\t" \
+    "fcmla z19.d, p5/m, z25.d, z14.d, 0 \n\t" \
+    "fcmla z22.d, p5/m, z25.d, z17.d, 0 \n\t" \
+    "fcmla z20.d, p5/m, z26.d, z14.d, 0 \n\t" \
+    "fcmla z23.d, p5/m, z26.d, z17.d, 0 \n\t" \
+    "fcmla z18.d, p5/m, z24.d, z14.d, 90 \n\t" \
+    "fcmla z21.d, p5/m, z24.d, z17.d, 90 \n\t" \
+    "fcmla z19.d, p5/m, z25.d, z14.d, 90 \n\t" \
+    "fcmla z22.d, p5/m, z25.d, z17.d, 90 \n\t" \
+    "fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \
+    "fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \
+    :  \
+    : [fetchptr] "r" ((uint64_t)&ref[2][0]) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// XP_PROJ
+#define XP_PROJ_A64FXd  \
+{ \
+asm ( \
+    "fcadd z12.d, p5/m, z12.d, z27.d, 90 \n\t" \
+    "fcadd z13.d, p5/m, z13.d, z28.d, 90 \n\t" \
+    "fcadd z14.d, p5/m, z14.d, z29.d, 90 \n\t" \
+    "fcadd z15.d, p5/m, z15.d, z24.d, 90 \n\t" \
+    "fcadd z16.d, p5/m, z16.d, z25.d, 90 \n\t" \
+    "fcadd z17.d, p5/m, z17.d, z26.d, 90 \n\t" \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); \
+}
+// XP_RECON
+#define XP_RECON_A64FXd  \
+asm ( \
+    "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \
+    "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \
+    "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \
+    "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \
+    "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \
+    "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \
+    "mov z0.d, z18.d \n\t" \
+    "mov z1.d, z19.d \n\t" \
+    "mov z2.d, z20.d \n\t" \
+    "mov z3.d, z21.d \n\t" \
+    "mov z4.d, z22.d \n\t" \
+    "mov z5.d, z23.d \n\t" \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+);
+
+// XP_RECON_ACCUM
+#define XP_RECON_ACCUM_A64FXd  \
+asm ( \
+    "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \
+    "fadd z0.d, p5/m, z0.d, z18.d \n\t" \
+    "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \
+    "fadd z1.d, p5/m, z1.d, z19.d \n\t" \
+    "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \
+    "fadd z2.d, p5/m, z2.d, z20.d \n\t" \
+    "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \
+    "fadd z3.d, p5/m, z3.d, z21.d \n\t" \
+    "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \
+    "fadd z4.d, p5/m, z4.d, z22.d \n\t" \
+    "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \
+    "fadd z5.d, p5/m, z5.d, z23.d \n\t" \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+);
+
+// YP_PROJ
+#define YP_PROJ_A64FXd  \
+{ \
+asm ( \
+    "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
+    "fsub z12.d, p5/m, z12.d, z27.d \n\t" \
+    "fsub z13.d, p5/m, z13.d, z28.d \n\t" \
+    "fsub z14.d, p5/m, z14.d, z29.d \n\t" \
+    "fadd z15.d, p5/m, z15.d, z24.d \n\t" \
+    "fadd z16.d, p5/m, z16.d, z25.d \n\t" \
+    "fadd z17.d, p5/m, z17.d, z26.d \n\t" \
+    :  \
+    : [tableptr] "r" (&lut[0]),[index] "i" (2) \
+    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); \
+}
+// ZP_PROJ
+#define ZP_PROJ_A64FXd  \
+{ \
+asm ( \
+    "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
+    "fcadd z12.d, p5/m, z12.d, z24.d, 90 \n\t" \
+    "fcadd z13.d, p5/m, z13.d, z25.d, 90 \n\t" \
+    "fcadd z14.d, p5/m, z14.d, z26.d, 90 \n\t" \
+    "fcadd z15.d, p5/m, z15.d, z27.d, 270 \n\t" \
+    "fcadd z16.d, p5/m, z16.d, z28.d, 270 \n\t" \
+    "fcadd z17.d, p5/m, z17.d, z29.d, 270 \n\t" \
+    :  \
+    : [tableptr] "r" (&lut[0]),[index] "i" (1) \
+    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); \
+}
+// TP_PROJ
+#define TP_PROJ_A64FXd  \
+{ \
+asm ( \
+    "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
+    "fadd z12.d, p5/m, z12.d, z24.d \n\t" \
+    "fadd z13.d, p5/m, z13.d, z25.d \n\t" \
+    "fadd z14.d, p5/m, z14.d, z26.d \n\t" \
+    "fadd z15.d, p5/m, z15.d, z27.d \n\t" \
+    "fadd z16.d, p5/m, z16.d, z28.d \n\t" \
+    "fadd z17.d, p5/m, z17.d, z29.d \n\t" \
+    :  \
+    : [tableptr] "r" (&lut[0]),[index] "i" (0) \
+    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); \
+}
+// XM_PROJ
+#define XM_PROJ_A64FXd  \
+{ \
+asm ( \
+    "fcadd z12.d, p5/m, z12.d, z27.d, 270 \n\t" \
+    "fcadd z13.d, p5/m, z13.d, z28.d, 270 \n\t" \
+    "fcadd z14.d, p5/m, z14.d, z29.d, 270 \n\t" \
+    "fcadd z15.d, p5/m, z15.d, z24.d, 270 \n\t" \
+    "fcadd z16.d, p5/m, z16.d, z25.d, 270 \n\t" \
+    "fcadd z17.d, p5/m, z17.d, z26.d, 270 \n\t" \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); \
+}
+// XM_RECON
+#define XM_RECON_A64FXd  \
+asm ( \
+    "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \
+    "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \
+    "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \
+    "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \
+    "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \
+    "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \
+    "mov z0.d, z18.d \n\t" \
+    "mov z1.d, z19.d \n\t" \
+    "mov z2.d, z20.d \n\t" \
+    "mov z3.d, z21.d \n\t" \
+    "mov z4.d, z22.d \n\t" \
+    "mov z5.d, z23.d \n\t" \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+);
+
+// YM_PROJ
+#define YM_PROJ_A64FXd  \
+{ \
+asm ( \
+    "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
+    "fadd z12.d, p5/m, z12.d, z27.d \n\t" \
+    "fadd z13.d, p5/m, z13.d, z28.d \n\t" \
+    "fadd z14.d, p5/m, z14.d, z29.d \n\t" \
+    "fsub z15.d, p5/m, z15.d, z24.d \n\t" \
+    "fsub z16.d, p5/m, z16.d, z25.d \n\t" \
+    "fsub z17.d, p5/m, z17.d, z26.d \n\t" \
+    :  \
+    : [tableptr] "r" (&lut[0]),[index] "i" (2) \
+    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); \
+}
+// ZM_PROJ
+#define ZM_PROJ_A64FXd  \
+{ \
+asm ( \
+    "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
+    "fcadd z12.d, p5/m, z12.d, z24.d, 270 \n\t" \
+    "fcadd z13.d, p5/m, z13.d, z25.d, 270 \n\t" \
+    "fcadd z14.d, p5/m, z14.d, z26.d, 270 \n\t" \
+    "fcadd z15.d, p5/m, z15.d, z27.d, 90 \n\t" \
+    "fcadd z16.d, p5/m, z16.d, z28.d, 90 \n\t" \
+    "fcadd z17.d, p5/m, z17.d, z29.d, 90 \n\t" \
+    :  \
+    : [tableptr] "r" (&lut[0]),[index] "i" (1) \
+    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); \
+}
+// TM_PROJ
+#define TM_PROJ_A64FXd  \
+{ \
+asm ( \
+    "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
+    "fsub z12.d, p5/m, z12.d, z24.d \n\t" \
+    "fsub z13.d, p5/m, z13.d, z25.d \n\t" \
+    "fsub z14.d, p5/m, z14.d, z26.d \n\t" \
+    "fsub z15.d, p5/m, z15.d, z27.d \n\t" \
+    "fsub z16.d, p5/m, z16.d, z28.d \n\t" \
+    "fsub z17.d, p5/m, z17.d, z29.d \n\t" \
+    :  \
+    : [tableptr] "r" (&lut[0]),[index] "i" (0) \
+    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); \
+}
+// XM_RECON_ACCUM
+#define XM_RECON_ACCUM_A64FXd  \
+asm ( \
+    "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \
+    "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \
+    "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \
+    "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \
+    "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \
+    "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \
+    "mov z0.d, z18.d \n\t" \
+    "mov z1.d, z19.d \n\t" \
+    "mov z2.d, z20.d \n\t" \
+    "mov z3.d, z21.d \n\t" \
+    "mov z4.d, z22.d \n\t" \
+    "mov z5.d, z23.d \n\t" \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+);
+
+// YP_RECON_ACCUM
+#define YP_RECON_ACCUM_A64FXd  \
+asm ( \
+    "fadd z0.d, p5/m, z0.d, z18.d \n\t" \
+    "fsub z9.d, p5/m, z9.d, z18.d \n\t" \
+    "fadd z1.d, p5/m, z1.d, z19.d \n\t" \
+    "fsub z10.d, p5/m, z10.d, z19.d \n\t" \
+    "fadd z2.d, p5/m, z2.d, z20.d \n\t" \
+    "fsub z11.d, p5/m, z11.d, z20.d \n\t" \
+    "fadd z3.d, p5/m, z3.d, z21.d \n\t" \
+    "fadd z6.d, p5/m, z6.d, z21.d \n\t" \
+    "fadd z4.d, p5/m, z4.d, z22.d \n\t" \
+    "fadd z7.d, p5/m, z7.d, z22.d \n\t" \
+    "fadd z5.d, p5/m, z5.d, z23.d \n\t" \
+    "fadd z8.d, p5/m, z8.d, z23.d \n\t" \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+);
+
+// YM_RECON_ACCUM
+#define YM_RECON_ACCUM_A64FXd  \
+asm ( \
+    "fadd z0.d, p5/m, z0.d, z18.d \n\t" \
+    "fadd z9.d, p5/m, z9.d, z18.d \n\t" \
+    "fadd z1.d, p5/m, z1.d, z19.d \n\t" \
+    "fadd z10.d, p5/m, z10.d, z19.d \n\t" \
+    "fadd z2.d, p5/m, z2.d, z20.d \n\t" \
+    "fadd z11.d, p5/m, z11.d, z20.d \n\t" \
+    "fadd z3.d, p5/m, z3.d, z21.d \n\t" \
+    "fsub z6.d, p5/m, z6.d, z21.d \n\t" \
+    "fadd z4.d, p5/m, z4.d, z22.d \n\t" \
+    "fsub z7.d, p5/m, z7.d, z22.d \n\t" \
+    "fadd z5.d, p5/m, z5.d, z23.d \n\t" \
+    "fsub z8.d, p5/m, z8.d, z23.d \n\t" \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+);
+
+// ZP_RECON_ACCUM
+#define ZP_RECON_ACCUM_A64FXd  \
+asm ( \
+    "fcadd z6.d, p5/m, z6.d, z18.d, 270 \n\t" \
+    "fadd z0.d, p5/m, z0.d, z18.d \n\t" \
+    "fcadd z7.d, p5/m, z7.d, z19.d, 270 \n\t" \
+    "fadd z1.d, p5/m, z1.d, z19.d \n\t" \
+    "fcadd z8.d, p5/m, z8.d, z20.d, 270 \n\t" \
+    "fadd z2.d, p5/m, z2.d, z20.d \n\t" \
+    "fcadd z9.d, p5/m, z9.d, z21.d, 90 \n\t" \
+    "fadd z3.d, p5/m, z3.d, z21.d \n\t" \
+    "fcadd z10.d, p5/m, z10.d, z22.d, 90 \n\t" \
+    "fadd z4.d, p5/m, z4.d, z22.d \n\t" \
+    "fcadd z11.d, p5/m, z11.d, z23.d, 90 \n\t" \
+    "fadd z5.d, p5/m, z5.d, z23.d \n\t" \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+);
+
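A reading aid for the rotation immediates used throughout these macros: fcadd with rotation 90/270 adds plus or minus i times its second operand, which is exactly the psi +- i*psi structure of the spin projectors and reconstructions, and each fcmla 0/90 pair in MULT_2SPIN accumulates one full complex product (6 accumulators x 3 terms x 2 rotations = 36 fcmla for the 3x3 SU(3) link acting on two colour triplets). A scalar model of the per-lane semantics, assuming the broadcast real/imaginary part comes from operand a:

    /* Scalar sketch of the SVE complex-arithmetic rotations (per complex lane). */
    typedef struct { double re, im; } cplx;

    static void fcadd_90 (cplx *acc, cplx b) { acc->re -= b.im; acc->im += b.re; } /* acc += i*b  */
    static void fcadd_270(cplx *acc, cplx b) { acc->re += b.im; acc->im -= b.re; } /* acc -= i*b  */

    static void fcmla_0  (cplx *acc, cplx a, cplx b) { acc->re += a.re*b.re; acc->im += a.re*b.im; }
    static void fcmla_90 (cplx *acc, cplx a, cplx b) { acc->re -= a.im*b.im; acc->im += a.im*b.re; }
    /* fcmla_0 followed by fcmla_90 gives acc += a*b: one complex multiply-accumulate. */
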
+// ZM_RECON_ACCUM
+#define ZM_RECON_ACCUM_A64FXd  \
+asm ( \
+    "fcadd z6.d, p5/m, z6.d, z18.d, 90 \n\t" \
+    "fadd z0.d, p5/m, z0.d, z18.d \n\t" \
+    "fcadd z7.d, p5/m, z7.d, z19.d, 90 \n\t" \
+    "fadd z1.d, p5/m, z1.d, z19.d \n\t" \
+    "fcadd z8.d, p5/m, z8.d, z20.d, 90 \n\t" \
+    "fadd z2.d, p5/m, z2.d, z20.d \n\t" \
+    "fcadd z9.d, p5/m, z9.d, z21.d, 270 \n\t" \
+    "fadd z3.d, p5/m, z3.d, z21.d \n\t" \
+    "fcadd z10.d, p5/m, z10.d, z22.d, 270 \n\t" \
+    "fadd z4.d, p5/m, z4.d, z22.d \n\t" \
+    "fcadd z11.d, p5/m, z11.d, z23.d, 270 \n\t" \
+    "fadd z5.d, p5/m, z5.d, z23.d \n\t" \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+);
+
+// TP_RECON_ACCUM
+#define TP_RECON_ACCUM_A64FXd  \
+asm ( \
+    "ptrue p5.d \n\t" \
+    "fadd z0.d, p5/m, z0.d, z18.d \n\t" \
+    "fadd z6.d, p5/m, z6.d, z18.d \n\t" \
+    "fadd z1.d, p5/m, z1.d, z19.d \n\t" \
+    "fadd z7.d, p5/m, z7.d, z19.d \n\t" \
+    "fadd z2.d, p5/m, z2.d, z20.d \n\t" \
+    "fadd z8.d, p5/m, z8.d, z20.d \n\t" \
+    "fadd z3.d, p5/m, z3.d, z21.d \n\t" \
+    "fadd z9.d, p5/m, z9.d, z21.d \n\t" \
+    "fadd z4.d, p5/m, z4.d, z22.d \n\t" \
+    "fadd z10.d, p5/m, z10.d, z22.d \n\t" \
+    "fadd z5.d, p5/m, z5.d, z23.d \n\t" \
+    "fadd z11.d, p5/m, z11.d, z23.d \n\t" \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+);
+
+// TM_RECON_ACCUM
+#define TM_RECON_ACCUM_A64FXd  \
+asm ( \
+    "fadd z0.d, p5/m, z0.d, z18.d \n\t" \
+    "fsub z6.d, p5/m, z6.d, z18.d \n\t" \
+    "fadd z1.d, p5/m, z1.d, z19.d \n\t" \
+    "fsub z7.d, p5/m, z7.d, z19.d \n\t" \
+    "fadd z2.d, p5/m, z2.d, z20.d \n\t" \
+    "fsub z8.d, p5/m, z8.d, z20.d \n\t" \
+    "fadd z3.d, p5/m, z3.d, z21.d \n\t" \
+    "fsub z9.d, p5/m, z9.d, z21.d \n\t" \
+    "fadd z4.d, p5/m, z4.d, z22.d \n\t" \
+    "fsub z10.d, p5/m, z10.d, z22.d \n\t" \
+    "fadd z5.d, p5/m, z5.d, z23.d \n\t" \
+    "fsub z11.d, p5/m, z11.d, z23.d \n\t" \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+);
+
+// ZERO_PSI
+#define ZERO_PSI_A64FXd  \
+asm ( \
+    "ptrue p5.d \n\t" \
+    "fmov z0.d , 0 \n\t" \
+    "fmov z1.d , 0 \n\t" \
+    "fmov z2.d , 0 \n\t" \
+    "fmov z3.d , 0 \n\t" \
+    "fmov z4.d , 0 \n\t" \
+    "fmov z5.d , 0 \n\t" \
+    "fmov z6.d , 0 \n\t" \
+    "fmov z7.d , 0 \n\t" \
+    "fmov z8.d , 0 \n\t" \
+    "fmov z9.d , 0 \n\t" \
+    "fmov z10.d , 0 \n\t" \
+    "fmov z11.d , 0 \n\t" \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+);
+
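One more observation that applies to the asm header above and to the intrinsics header that follows: every load and store anchors its pointer at base + 2*3*64 and then addresses the twelve colour-spin vectors at offsets -6..+5 in units of the vector length. Assuming 512-bit SVE, one z register holds 64 bytes (8 doubles) and a SiteSpinor is 4 spins x 3 colours = 12 such vectors; since the vl-scaled immediate of ld1d/st1d/stnt1d only spans -8..+7, offsets 0..11 from the start of the spinor would not encode, while the mid-spinor anchor keeps all twelve in range:

    #include <stdint.h>

    /* Sketch, assuming 512-bit SVE: byte address of colour-spin vector v
       (v = -6..+5) relative to the mid-spinor anchor used by these macros. */
    static inline uint64_t vec_addr(uint64_t spinor_base, int v) {
      return spinor_base + 2 * 3 * 64 + (int64_t)v * 64;
    }
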
your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#define LOAD_CHIMU_A64FXd(x) LOAD_CHIMU_INTERLEAVED_A64FXd(x) +#define PREFETCH_CHIMU_L1(A) +#define PREFETCH_GAUGE_L1(A) +#define PREFETCH_CHIMU_L2(A) +#define PREFETCH_GAUGE_L2(A) +#define PF_GAUGE(A) +#define PREFETCH1_CHIMU(A) +#define PREFETCH_CHIMU(A) +#define LOCK_GAUGE(A) +#define UNLOCK_GAUGE(A) +#define MASK_REGS DECLARATIONS_A64FXd(A) +#define COMPLEX_SIGNS(A) +#define LOAD64(A,B) +#define SAVE_RESULT(A,B) RESULT_A64FXd(A) +#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXd(A) +#define MAYBEPERM(A,perm) if (perm) { A ; } +#define LOAD_CHI(base) LOAD_CHI_A64FXd(base) +#define ZERO_PSI ZERO_PSI_A64FXd +#define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd +#define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd +#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd +#define TP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TP_PROJ_A64FXd +#define XM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XM_PROJ_A64FXd +#define YM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YM_PROJ_A64FXd +#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZM_PROJ_A64FXd +#define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd +#define XP_RECON XP_RECON_A64FXd +#define XM_RECON XM_RECON_A64FXd +#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd +#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd +#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd +#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXd +#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd +#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd +#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd +#define PERMUTE_DIR0 PERM0_A64FXd +#define PERMUTE_DIR1 PERM1_A64FXd +#define PERMUTE_DIR2 PERM2_A64FXd +#define PERMUTE_DIR3 PERM3_A64FXd +// DECLARATIONS +#define DECLARATIONS_A64FXd(x) \ + const uint64_t lut[4][8] = { \ + {4, 5, 6, 7, 0, 1, 2, 3}, \ + {2, 3, 0, 1, 6, 7, 4, 5}, \ + {1, 0, 3, 2, 5, 4, 7, 6}, \ + {0, 1, 2, 4, 5, 6, 7, 8} };\ + svfloat64_t result_00; \ + svfloat64_t result_01; \ + svfloat64_t result_02; \ + svfloat64_t result_10; \ + svfloat64_t result_11; \ + svfloat64_t result_12; \ + svfloat64_t result_20; \ + svfloat64_t result_21; \ + svfloat64_t result_22; \ + svfloat64_t result_30; \ + svfloat64_t result_31; \ + svfloat64_t result_32; \ + svfloat64_t Chi_00; \ + svfloat64_t Chi_01; \ + svfloat64_t Chi_02; \ + svfloat64_t Chi_10; \ + svfloat64_t Chi_11; \ + svfloat64_t Chi_12; \ + svfloat64_t UChi_00; \ + svfloat64_t UChi_01; \ + svfloat64_t UChi_02; \ + svfloat64_t UChi_10; \ + svfloat64_t UChi_11; \ + svfloat64_t UChi_12; \ + svfloat64_t U_00; \ + svfloat64_t U_10; \ + svfloat64_t U_20; \ + svfloat64_t U_01; \ + svfloat64_t U_11; \ + svfloat64_t U_21; \ + svbool_t pg1; \ + pg1 = svptrue_b64(); \ + svuint64_t table0; \ + svfloat64_t zero0; \ + zero0 = __svzero(zero0); + +#define Chimu_00 Chi_00 +#define Chimu_01 Chi_01 +#define Chimu_02 Chi_02 +#define Chimu_10 Chi_10 +#define Chimu_11 Chi_11 
+#define Chimu_12 Chi_12
+#define Chimu_20 U_00
+#define Chimu_21 U_10
+#define Chimu_22 U_20
+#define Chimu_30 U_01
+#define Chimu_31 U_11
+#define Chimu_32 U_21
+// RESULT (non-temporal stores of the reconstructed spinor)
+#define RESULT_A64FXd(base) \
+{ \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \
+}
+// PREFETCH_CHIMU_L2 (prefetch to L2)
+#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \
+{ \
+    svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \
+}
+// PREFETCH_CHIMU_L1 (prefetch to L1)
+#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \
+{ \
+    svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \
+    svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \
+    svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \
+}
+// PREFETCH_GAUGE_L2 (prefetch to L2)
+#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \
+{ \
+    const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
+    svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \
+}
+// PREFETCH_GAUGE_L1 (prefetch to L1)
+#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \
+{ \
+    const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
+    svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \
+}
+// LOAD_CHI
+#define LOAD_CHI_A64FXd(base) \
+{ \
+    Chi_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \
+    Chi_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \
+    Chi_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \
+    Chi_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \
+    Chi_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \
+    Chi_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \
+}
+// LOAD_CHIMU
+#define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \
+{ \
+    Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \
+    Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \
+    Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \
+    Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \
+    Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \
+    Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \
+    Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \
+    Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \
+    Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \
+    Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \
+    Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \
+    Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \
+}
+// LOAD_CHIMU_0213 (alternative ordering; not selected by LOAD_CHIMU_A64FXd above)
+#define LOAD_CHIMU_0213_A64FXd \
+{ \
+    const SiteSpinor & ref(in[offset]); \
+    Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \
+    Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \
+    Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \
+    Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \
+    Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \
+    Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \
+    Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \
+    Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \
+    Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \
+    Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \
+    Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \
+    Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \
+}
+// LOAD_CHIMU_0312 (alternative ordering; not selected by LOAD_CHIMU_A64FXd above)
+#define LOAD_CHIMU_0312_A64FXd \
+{ \
+    const SiteSpinor & ref(in[offset]); \
+    Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \
+    Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \
+    Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \
+    Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \
+    Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \
+    Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \
+    Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \
+    Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \
+    Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \
+    Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \
+    Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \
+    Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \
+}
+// PERM0
+#define PERM0_A64FXd \
+    table0 = svld1(pg1, (uint64_t*)&lut[0]); \
+    Chi_00 = svtbl(Chi_00, table0); \
+    Chi_01 = svtbl(Chi_01, table0); \
+    Chi_02 = svtbl(Chi_02, table0); \
+    Chi_10 = svtbl(Chi_10, table0); \
+    Chi_11 = svtbl(Chi_11, table0); \
+    Chi_12 = svtbl(Chi_12, table0);
+
+// PERM1
+#define PERM1_A64FXd \
+    table0 = svld1(pg1, (uint64_t*)&lut[1]); \
+    Chi_00 = svtbl(Chi_00, table0); \
+    Chi_01 = svtbl(Chi_01, table0); \
+    Chi_02 = svtbl(Chi_02, table0); \
+    Chi_10 = svtbl(Chi_10, table0); \
+    Chi_11 = svtbl(Chi_11, table0); \
+    Chi_12 = svtbl(Chi_12, table0);
+
+// PERM2
+#define PERM2_A64FXd \
+    table0 = svld1(pg1, (uint64_t*)&lut[2]); \
+    Chi_00 = svtbl(Chi_00, table0); \
+    Chi_01 = svtbl(Chi_01, table0); \
+    Chi_02 = svtbl(Chi_02, table0); \
+    Chi_10 = svtbl(Chi_10, table0); \
+    Chi_11 = svtbl(Chi_11, table0); \
+    Chi_12 = svtbl(Chi_12, table0);
+
+// PERM3 (no-op for the double-precision layout)
+#define PERM3_A64FXd
+
+// MULT_2SPIN: UChi = U * Chi for both half-spinor rows. Columns 0 and 1 of
+// the gauge link are loaded up front; column 2 is streamed later into the
+// reused U_00/U_10/U_20 registers to overlap loads with the FMAs.
+#define MULT_2SPIN_A64FXd(A) \
+{ \
+    const auto & ref(U[sU][A]); \
+    U_00 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -6 * 64)); \
+    U_10 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -3 * 64)); \
+    U_20 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + 0 * 64)); \
+    U_01 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -5 * 64)); \
+    U_11 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -2 * 64)); \
+    U_21 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + 1 * 64)); \
+    UChi_00 = __svzero(UChi_00); \
+    UChi_10 = __svzero(UChi_10); \
+    UChi_01 = __svzero(UChi_01); \
+    UChi_11 = __svzero(UChi_11); \
+    UChi_02 = __svzero(UChi_02); \
+    UChi_12 = __svzero(UChi_12); \
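+    /* The svcmla calls below come in rotation-0 / rotation-90 pairs: each \
+       pair accumulates one full complex product, e.g. UChi_00 += U_00 *   \
+       Chi_00, so the three batches apply the three color columns of the   \
+       link to both half-spinor rows Chi_0x and Chi_1x. */                 \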
+    UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 0); \
+    UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 0); \
+    UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 0); \
+    UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 0); \
+    UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 0); \
+    UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 0); \
+    UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 90); \
+    UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 90); \
+    UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 90); \
+    UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \
+    UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \
+    UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \
+    U_00 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -4 * 64)); \
+    U_10 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -1 * 64)); \
+    U_20 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + 2 * 64)); \
+    UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \
+    UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \
+    UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \
+    UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 0); \
+    UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 0); \
+    UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 0); \
+    UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 90); \
+    UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 90); \
+    UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 90); \
+    UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 90); \
+    UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 90); \
+    UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 90); \
+    UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 0); \
+    UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 0); \
+    UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 0); \
+    UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 0); \
+    UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 0); \
+    UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 0); \
+    UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 90); \
+    UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 90); \
+    UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 90); \
+    UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 90); \
+    UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 90); \
+    UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 90); \
+}
+// XP_PROJ
+#define XP_PROJ_A64FXd \
+{ \
+    Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 90); \
+    Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 90); \
+    Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 90); \
+    Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 90); \
+    Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 90); \
+    Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 90); \
+}
+// XP_RECON (fresh reconstruction: the lower components are seeded from the
+// zeroed vector zero0, not from the stale result registers)
+#define XP_RECON_A64FXd \
+    result_20 = svcadd_x(pg1, zero0, UChi_10, 270); \
+    result_21 = svcadd_x(pg1, zero0, UChi_11, 270); \
+    result_22 = svcadd_x(pg1, zero0, UChi_12, 270); \
+    result_30 = svcadd_x(pg1, zero0, UChi_00, 270); \
+    result_31 = svcadd_x(pg1, zero0, UChi_01, 270); \
+    result_32 = svcadd_x(pg1, zero0, UChi_02, 270); \
+    result_00 = UChi_00; \
+    result_01 = UChi_01; \
+    result_02 = UChi_02; \
+    result_10 = UChi_10; \
+    result_11 = UChi_11; \
+    result_12 = UChi_12;
+
+// XP_RECON_ACCUM
+#define XP_RECON_ACCUM_A64FXd \
+    result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \
+    result_00 = svadd_x(pg1, result_00, UChi_00); \
+    result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \
+    result_01 = svadd_x(pg1, result_01, UChi_01); \
+    result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \
+    result_02 = svadd_x(pg1, result_02, UChi_02); \
+    result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \
+    result_10 = svadd_x(pg1, result_10, UChi_10); \
+    result_21 = svcadd_x(pg1, result_21, UChi_11, 270); \
+    result_11 = svadd_x(pg1, result_11, UChi_11); \
+    result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \
+    result_12 = svadd_x(pg1, result_12, UChi_12);
+
+// YP_PROJ
+#define YP_PROJ_A64FXd \
+{ \
+    table0 = svld1(pg1, (uint64_t*)&lut[2]); \
+    Chi_00 = svsub_x(pg1, Chimu_00, Chimu_30); \
+    Chi_01 = svsub_x(pg1, Chimu_01, Chimu_31); \
+    Chi_02 = svsub_x(pg1, Chimu_02, Chimu_32); \
+    Chi_10 = svadd_x(pg1, Chimu_10, Chimu_20); \
+    Chi_11 = svadd_x(pg1, Chimu_11, Chimu_21); \
+    Chi_12 = svadd_x(pg1, Chimu_12, Chimu_22); \
+}
+// ZP_PROJ
+#define ZP_PROJ_A64FXd \
+{ \
+    table0 = svld1(pg1, (uint64_t*)&lut[1]); \
+    Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 90); \
+    Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 90); \
+    Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 90); \
+    Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 270); \
+    Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 270); \
+    Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 270); \
+}
+// TP_PROJ
+#define TP_PROJ_A64FXd \
+{ \
+    table0 = svld1(pg1, (uint64_t*)&lut[0]); \
+    Chi_00 = svadd_x(pg1, Chimu_00, Chimu_20); \
+    Chi_01 = svadd_x(pg1, Chimu_01, Chimu_21); \
+    Chi_02 = svadd_x(pg1, Chimu_02, Chimu_22); \
+    Chi_10 = svadd_x(pg1, Chimu_10, Chimu_30); \
+    Chi_11 = svadd_x(pg1, Chimu_11, Chimu_31); \
+    Chi_12 = svadd_x(pg1, Chimu_12, Chimu_32); \
+}
+// XM_PROJ
+#define XM_PROJ_A64FXd \
+{ \
+    Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 270); \
+    Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 270); \
+    Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 270); \
+    Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 270); \
+    Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 270); \
+    Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 270); \
+}
+// XM_RECON (fresh reconstruction, seeded from zero0 as in XP_RECON)
+#define XM_RECON_A64FXd \
+    result_20 = svcadd_x(pg1, zero0, UChi_10, 90); \
+    result_21 = svcadd_x(pg1, zero0, UChi_11, 90); \
+    result_22 = svcadd_x(pg1, zero0, UChi_12, 90); \
+    result_30 = svcadd_x(pg1, zero0, UChi_00, 90); \
+    result_31 = svcadd_x(pg1, zero0, UChi_01, 90); \
+    result_32 = svcadd_x(pg1, zero0, UChi_02, 90); \
+    result_00 = UChi_00; \
+    result_01 = UChi_01; \
+    result_02 = UChi_02; \
+    result_10 = UChi_10; \
+    result_11 = UChi_11; \
+    result_12 = UChi_12;
+
+// YM_PROJ
+#define YM_PROJ_A64FXd \
+{ \
+    table0 = svld1(pg1, (uint64_t*)&lut[2]); \
+    Chi_00 = svadd_x(pg1, Chimu_00, Chimu_30); \
+    Chi_01 = svadd_x(pg1, Chimu_01, Chimu_31); \
+    Chi_02 = svadd_x(pg1, Chimu_02, Chimu_32); \
+    Chi_10 = svsub_x(pg1, Chimu_10, Chimu_20); \
+    Chi_11 = svsub_x(pg1, Chimu_11, Chimu_21); \
+    Chi_12 = svsub_x(pg1, Chimu_12, Chimu_22); \
+}
+// ZM_PROJ
+#define ZM_PROJ_A64FXd \
+{ \
+    table0 = svld1(pg1, (uint64_t*)&lut[1]); \
+    Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 270); \
+    Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 270); \
+    Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 270); \
+    Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 90); \
+    Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 90); \
+    Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 90); \
+}
+// TM_PROJ
+#define TM_PROJ_A64FXd \
+{ \
+    table0 = svld1(pg1, (uint64_t*)&lut[0]); \
+    Chi_00 = svsub_x(pg1, Chimu_00, Chimu_20); \
+    Chi_01 = svsub_x(pg1, Chimu_01, Chimu_21); \
+    Chi_02 = svsub_x(pg1, Chimu_02, Chimu_22); \
+    Chi_10 = svsub_x(pg1, Chimu_10, Chimu_30); \
+    Chi_11 = svsub_x(pg1, Chimu_11, Chimu_31); \
+    Chi_12 = svsub_x(pg1, Chimu_12, Chimu_32); \
+}
+// XM_RECON_ACCUM (accumulates into every result component, matching the
+// other *_RECON_ACCUM variants)
+#define XM_RECON_ACCUM_A64FXd \
+    result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \
+    result_31 = svcadd_x(pg1, result_31, UChi_01, 90); \
+    result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \
+    result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \
+    result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \
+    result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \
+    result_00 = svadd_x(pg1, result_00, UChi_00); \
+    result_01 = svadd_x(pg1, result_01, UChi_01); \
+    result_02 = svadd_x(pg1, result_02, UChi_02); \
+    result_10 = svadd_x(pg1, result_10, UChi_10); \
+    result_11 = svadd_x(pg1, result_11, UChi_11); \
+    result_12 = svadd_x(pg1, result_12, UChi_12);
+
+// YP_RECON_ACCUM
+#define YP_RECON_ACCUM_A64FXd \
+    result_00 = svadd_x(pg1, result_00, UChi_00); \
+    result_30 = svsub_x(pg1, result_30, UChi_00); \
+    result_01 = svadd_x(pg1, result_01, UChi_01); \
+    result_31 = svsub_x(pg1, result_31, UChi_01); \
+    result_02 = svadd_x(pg1, result_02, UChi_02); \
+    result_32 = svsub_x(pg1, result_32, UChi_02); \
+    result_10 = svadd_x(pg1, result_10, UChi_10); \
+    result_20 = svadd_x(pg1, result_20, UChi_10); \
+    result_11 = svadd_x(pg1, result_11, UChi_11); \
+    result_21 = svadd_x(pg1, result_21, UChi_11); \
+    result_12 = svadd_x(pg1, result_12, UChi_12); \
+    result_22 = svadd_x(pg1, result_22, UChi_12);
+
+// YM_RECON_ACCUM
+#define YM_RECON_ACCUM_A64FXd \
+    result_00 = svadd_x(pg1, result_00, UChi_00); \
+    result_30 = svadd_x(pg1, result_30, UChi_00); \
+    result_01 = svadd_x(pg1, result_01, UChi_01); \
+    result_31 = svadd_x(pg1, result_31, UChi_01); \
+    result_02 = svadd_x(pg1, result_02, UChi_02); \
+    result_32 = svadd_x(pg1, result_32, UChi_02); \
+    result_10 = svadd_x(pg1, result_10, UChi_10); \
+    result_20 = svsub_x(pg1, result_20, UChi_10); \
+    result_11 = svadd_x(pg1, result_11, UChi_11); \
+    result_21 = svsub_x(pg1, result_21, UChi_11); \
+    result_12 = svadd_x(pg1, result_12, UChi_12); \
+    result_22 = svsub_x(pg1, result_22, UChi_12);
+
+// ZP_RECON_ACCUM
+#define ZP_RECON_ACCUM_A64FXd \
+    result_20 = svcadd_x(pg1, result_20, UChi_00, 270); \
+    result_00 = svadd_x(pg1, result_00, UChi_00); \
+    result_21 = svcadd_x(pg1, result_21, UChi_01, 270); \
+    result_01 = svadd_x(pg1, result_01, UChi_01); \
+    result_22 = svcadd_x(pg1, result_22, UChi_02, 270); \
+    result_02 = svadd_x(pg1, result_02, UChi_02); \
+    result_30 = svcadd_x(pg1, result_30, UChi_10, 90); \
+    result_10 = svadd_x(pg1, result_10, UChi_10); \
+    result_31 = svcadd_x(pg1, result_31, UChi_11, 90); \
+    result_11 = svadd_x(pg1, result_11, UChi_11); \
+    result_32 = svcadd_x(pg1, result_32, UChi_12, 90); \
+    result_12 = svadd_x(pg1, result_12, UChi_12);
+
+// ZM_RECON_ACCUM
+#define ZM_RECON_ACCUM_A64FXd \
+    result_20 = svcadd_x(pg1, result_20, UChi_00, 90); \
+    result_00 = svadd_x(pg1, result_00, UChi_00); \
+    result_21 = svcadd_x(pg1, result_21, UChi_01, 90); \
+    result_01 = svadd_x(pg1, result_01, UChi_01); \
+    result_22 = svcadd_x(pg1, result_22, UChi_02, 90); \
+    result_02 = svadd_x(pg1, result_02, UChi_02); \
+    result_30 = svcadd_x(pg1, result_30, UChi_10, 270); \
+    result_10 = svadd_x(pg1, result_10, UChi_10); \
+    result_31 = svcadd_x(pg1, result_31, UChi_11, 270); \
+    result_11 = svadd_x(pg1, result_11, UChi_11); \
+    result_32 = svcadd_x(pg1, result_32, UChi_12, 270); \
+    result_12 = svadd_x(pg1, result_12, UChi_12);
+
+// TP_RECON_ACCUM
+#define TP_RECON_ACCUM_A64FXd \
+    result_00 = svadd_x(pg1, result_00, UChi_00); \
+    result_20 = svadd_x(pg1, result_20, UChi_00); \
+    result_01 = svadd_x(pg1, result_01, UChi_01); \
+    result_21 = svadd_x(pg1, result_21, UChi_01); \
+    result_02 = svadd_x(pg1, result_02, UChi_02); \
+    result_22 = svadd_x(pg1, result_22, UChi_02); \
+    result_10 = svadd_x(pg1, result_10, UChi_10); \
+    result_30 = svadd_x(pg1, result_30, UChi_10); \
+    result_11 = svadd_x(pg1,
result_11, UChi_11); \ + result_31 = svadd_x(pg1, result_31, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_32 = svadd_x(pg1, result_32, UChi_12); + +// TM_RECON_ACCUM +#define TM_RECON_ACCUM_A64FXd \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_20 = svsub_x(pg1, result_20, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_21 = svsub_x(pg1, result_21, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_22 = svsub_x(pg1, result_22, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_30 = svsub_x(pg1, result_30, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_31 = svsub_x(pg1, result_31, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_32 = svsub_x(pg1, result_32, UChi_12); + +// ZERO_PSI +#define ZERO_PSI_A64FXd \ + result_00 = __svzero(result_00); \ + result_01 = __svzero(result_01); \ + result_02 = __svzero(result_02); \ + result_10 = __svzero(result_10); \ + result_11 = __svzero(result_11); \ + result_12 = __svzero(result_12); \ + result_20 = __svzero(result_20); \ + result_21 = __svzero(result_21); \ + result_22 = __svzero(result_22); \ + result_30 = __svzero(result_30); \ + result_31 = __svzero(result_31); \ + result_32 = __svzero(result_32); + diff --git a/Grid/simd/Fujitsu_A64FX_undef.h b/Grid/simd/Fujitsu_A64FX_undef.h new file mode 100644 index 00000000..07939007 --- /dev/null +++ b/Grid/simd/Fujitsu_A64FX_undef.h @@ -0,0 +1,68 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: Fujitsu_A64FX_undef.h + + Copyright (C) 2020 + +Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ + +#undef LOAD_CHIMU_A64FXd +#undef LOAD_CHIMU_A64FXf +#undef PREFETCH_CHIMU_L1 +#undef PREFETCH_GAUGE_L1 +#undef PREFETCH_CHIMU_L2 +#undef PREFETCH_GAUGE_L2 +#undef PF_GAUGE +#undef PREFETCH1_CHIMU +#undef PREFETCH_CHIMU +#undef LOCK_GAUGE +#undef UNLOCK_GAUGE +#undef MASK_REGS +#undef COMPLEX_SIGNS +#undef LOAD64 +#undef SAVE_RESULT +#undef MULT_2SPIN_DIR_PF +#undef MAYBEPERM +#undef LOAD_CHI +#undef ZERO_PSI +#undef XP_PROJMEM +#undef YP_PROJMEM +#undef ZP_PROJMEM +#undef TP_PROJMEM +#undef XM_PROJMEM +#undef YM_PROJMEM +#undef ZM_PROJMEM +#undef TM_PROJMEM +#undef XP_RECON +#undef XM_RECON +#undef YM_RECON_ACCUM +#undef ZM_RECON_ACCUM +#undef TM_RECON_ACCUM +#undef XP_RECON_ACCUM +#undef YP_RECON_ACCUM +#undef ZP_RECON_ACCUM +#undef TP_RECON_ACCUM +#undef PERMUTE_DIR0 +#undef PERMUTE_DIR1 +#undef PERMUTE_DIR2 +#undef PERMUTE_DIR3
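+
+// Usage sketch (illustrative, not part of the generated interface): a kernel
+// translation unit first includes one of the Fujitsu_A64FX_{intrin,asm}
+// single/double headers to define the macro interface, expands the Wilson
+// dslash body with it, and then includes this header so the next precision
+// or ISA variant can redefine the same names cleanly.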