From 581392f2f2ad8b9856caa39507681d64465c0340 Mon Sep 17 00:00:00 2001 From: nils meyer Date: Sun, 12 Apr 2020 22:06:14 +0200 Subject: [PATCH] now with pf, best results so far using intrinsics+pf --- .../implementation/WilsonKernelsAsmA64FX.h | 104 ++- .../WilsonKernelsAsmBodyA64FX.h | 225 ++++++ .../WilsonKernelsHandImplementation.h.orig | 684 ------------------ Grid/simd/Fujitsu_A64FX_asm_double.h | 50 +- Grid/simd/Fujitsu_A64FX_asm_single.h | 50 +- Grid/simd/Fujitsu_A64FX_intrin_double.h | 26 +- Grid/simd/Fujitsu_A64FX_intrin_single.h | 26 +- 7 files changed, 414 insertions(+), 751 deletions(-) create mode 100644 Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h delete mode 100644 Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h.orig diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h index d14f4b9c..9d74dd15 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h @@ -37,12 +37,17 @@ Author: paboyle // undefine everything #include +#define WILSONKERNELSASMBODYA64FX +#pragma message("invoking A64FX Dslash: WilsonKernelsAsmBodyA64FX.h") + /////////////////////////////////////////////////////////// // If we are A64FX specialise the single precision routine /////////////////////////////////////////////////////////// #if defined(DSLASHINTRIN) +#pragma message ("invoking A64FX Dslash: intrin") #include #else +#pragma message ("invoking A64FX Dslash: asm") #include #endif @@ -59,12 +64,20 @@ Author: paboyle template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, @@ -74,7 +87,11 @@ WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif #undef INTERIOR_AND_EXTERIOR #define INTERIOR @@ -82,23 +99,38 @@ WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include - +#endif #undef INTERIOR_AND_EXTERIOR #undef INTERIOR @@ -106,22 +138,39 @@ WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldV template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif + ///////////////////////////////////////////////////////////////// // XYZT vectorised, dag Kernel, single @@ -133,22 +182,38 @@ WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldV template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif #undef INTERIOR_AND_EXTERIOR #define INTERIOR @@ -156,22 +221,38 @@ WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldV template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif #undef INTERIOR_AND_EXTERIOR #undef INTERIOR @@ -179,22 +260,38 @@ WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFie template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else #include +#endif //#undef MAYBEPERM //#undef MULT_2SPIN @@ -348,7 +445,7 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, Double //#undef MAYBEPERM //#undef MULT_2SPIN -// undefine +// undefine #include /////////////////////////////////////////////////////////// @@ -361,7 +458,7 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, Double #include #endif -// former KNL +// former KNL //#define MAYBEPERM(A,perm) if (perm) { A ; } //#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf) //#define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0]; @@ -654,6 +751,7 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, Double #endif // VEC 5D // undefs +#undef WILSONKERNELSASMBODYA64FX #include #endif //A64FX diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h new file mode 100644 index 00000000..44bf2005 --- /dev/null +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h @@ -0,0 +1,225 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: WilsonKernelsAsmBodyA64FX.h + + Copyright (C) 2020 + +Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#ifdef KERNEL_DAG +#define DIR0_PROJMEM(base) XP_PROJMEM(base); +#define DIR1_PROJMEM(base) YP_PROJMEM(base); +#define DIR2_PROJMEM(base) ZP_PROJMEM(base); +#define DIR3_PROJMEM(base) TP_PROJMEM(base); +#define DIR4_PROJMEM(base) XM_PROJMEM(base); +#define DIR5_PROJMEM(base) YM_PROJMEM(base); +#define DIR6_PROJMEM(base) ZM_PROJMEM(base); +#define DIR7_PROJMEM(base) TM_PROJMEM(base); +#define DIR0_RECON XP_RECON +#define DIR1_RECON YP_RECON_ACCUM +#define DIR2_RECON ZP_RECON_ACCUM +#define DIR3_RECON TP_RECON_ACCUM +#define DIR4_RECON XM_RECON_ACCUM +#define DIR5_RECON YM_RECON_ACCUM +#define DIR6_RECON ZM_RECON_ACCUM +#define DIR7_RECON TM_RECON_ACCUM +#else +#define DIR0_PROJMEM(base) XM_PROJMEM(base); +#define DIR1_PROJMEM(base) YM_PROJMEM(base); +#define DIR2_PROJMEM(base) ZM_PROJMEM(base); +#define DIR3_PROJMEM(base) TM_PROJMEM(base); +#define DIR4_PROJMEM(base) XP_PROJMEM(base); +#define DIR5_PROJMEM(base) YP_PROJMEM(base); +#define DIR6_PROJMEM(base) ZP_PROJMEM(base); +#define DIR7_PROJMEM(base) TP_PROJMEM(base); +#define DIR0_RECON XM_RECON +#define DIR1_RECON YM_RECON_ACCUM +#define DIR2_RECON ZM_RECON_ACCUM +#define DIR3_RECON TM_RECON_ACCUM +#define DIR4_RECON XP_RECON_ACCUM +#define DIR5_RECON YP_RECON_ACCUM +#define DIR6_RECON ZP_RECON_ACCUM +#define DIR7_RECON TP_RECON_ACCUM +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Comms then compute kernel +//////////////////////////////////////////////////////////////////////////////// +#ifdef INTERIOR_AND_EXTERIOR + +#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + basep = st.GetPFInfo(nent,plocal); nent++; \ + if ( local ) { \ + LOAD64(%r10,isigns); \ + PROJ(base); \ + MAYBEPERM(PERMUTE_DIR,perm); \ + } else { \ + LOAD_CHI(base); \ + } \ + MULT_2SPIN_DIR_PF(Dir,basep); \ + base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ + PREFETCH_CHIMU(base); \ + LOAD64(%r10,isigns); \ + RECON; \ + +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + PF_GAUGE(Xp); \ + PREFETCH1_CHIMU(base); \ + ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) + +#define RESULT(base,basep) SAVE_RESULT(base,basep); + +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Pre comms kernel -- prefetch like normal because it is mostly right +//////////////////////////////////////////////////////////////////////////////// +#ifdef INTERIOR + +#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + basep = st.GetPFInfo(nent,plocal); nent++; \ + if ( local ) { \ + LOAD64(%r10,isigns); \ + PROJ(base); \ + MAYBEPERM(PERMUTE_DIR,perm); \ + }else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \ + base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ + if ( local || st.same_node[Dir] ) { \ + MULT_2SPIN_DIR_PF(Dir,basep); \ + PREFETCH_CHIMU(base); \ + LOAD64(%r10,isigns); \ + RECON; \ + } else { PREFETCH_CHIMU(base); } + +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + PF_GAUGE(Xp); \ + PREFETCH1_CHIMU(base); \ + { ZERO_PSI; } \ + ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) + +#define RESULT(base,basep) SAVE_RESULT(base,basep); + +#endif +//////////////////////////////////////////////////////////////////////////////// +// Post comms kernel +//////////////////////////////////////////////////////////////////////////////// +#ifdef EXTERIOR + + +#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + if((!local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI(base); \ + MULT_2SPIN_DIR_PF(Dir,base); \ + LOAD64(%r10,isigns); \ + RECON; \ + nmu++; \ + } + +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + nmu=0; \ + { ZERO_PSI;} \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + if((!local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI(base); \ + MULT_2SPIN_DIR_PF(Dir,base); \ + LOAD64(%r10,isigns); \ + RECON; \ + nmu++; \ + } + +#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} + +#endif +{ + int nmu; + int local,perm, ptype; + uint64_t base; + uint64_t basep; + const uint64_t plocal =(uint64_t) & in[0]; + + COMPLEX_SIGNS(isigns); + MASK_REGS; + int nmax=U.oSites(); + for(int site=0;site=nmax) ssn=0; + // int sUn=lo.Reorder(ssn); + int sUn=ssn; + LOCK_GAUGE(0); +#else + int sU =ssU; + int ssn=ssU+1; if(ssn>=nmax) ssn=0; + int sUn=ssn; +#endif + for(int s=0;s -Author: paboyle - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ - -#pragma once - -#include - - -#undef LOAD_CHIMU -#undef LOAD_CHI -#undef MULT_2SPIN -#undef PERMUTE_DIR -#undef XP_PROJ -#undef YP_PROJ -#undef ZP_PROJ -#undef TP_PROJ -#undef XM_PROJ -#undef YM_PROJ -#undef ZM_PROJ -#undef TM_PROJ -#undef XP_RECON -#undef XP_RECON_ACCUM -#undef XM_RECON -#undef XM_RECON_ACCUM -#undef YP_RECON_ACCUM -#undef YM_RECON_ACCUM -#undef ZP_RECON_ACCUM -#undef ZM_RECON_ACCUM -#undef TP_RECON_ACCUM -#undef TM_RECON_ACCUM -#undef ZERO_RESULT -#undef Chimu_00 -#undef Chimu_01 -#undef Chimu_02 -#undef Chimu_10 -#undef Chimu_11 -#undef Chimu_12 -#undef Chimu_20 -#undef Chimu_21 -#undef Chimu_22 -#undef Chimu_30 -#undef Chimu_31 -#undef Chimu_32 -#undef HAND_STENCIL_LEG -#undef HAND_STENCIL_LEG_INT -#undef HAND_STENCIL_LEG_EXT -#undef HAND_RESULT -#undef HAND_RESULT_INT -#undef HAND_RESULT_EXT - -#define REGISTER - -#define LOAD_CHIMU \ - {const SiteSpinor & ref (in[offset]); \ - Chimu_00=ref()(0)(0);\ - Chimu_01=ref()(0)(1);\ - Chimu_02=ref()(0)(2);\ - Chimu_10=ref()(1)(0);\ - Chimu_11=ref()(1)(1);\ - Chimu_12=ref()(1)(2);\ - Chimu_20=ref()(2)(0);\ - Chimu_21=ref()(2)(1);\ - Chimu_22=ref()(2)(2);\ - Chimu_30=ref()(3)(0);\ - Chimu_31=ref()(3)(1);\ - Chimu_32=ref()(3)(2);} - -#define LOAD_CHI\ - {const SiteHalfSpinor &ref(buf[offset]); \ - Chi_00 = ref()(0)(0);\ - Chi_01 = ref()(0)(1);\ - Chi_02 = ref()(0)(2);\ - Chi_10 = ref()(1)(0);\ - Chi_11 = ref()(1)(1);\ - Chi_12 = ref()(1)(2);} - -// To splat or not to splat depends on the implementation -#define MULT_2SPIN(A)\ - {auto & ref(U[sU](A)); \ - Impl::loadLinkElement(U_00,ref()(0,0)); \ - Impl::loadLinkElement(U_10,ref()(1,0)); \ - Impl::loadLinkElement(U_20,ref()(2,0)); \ - Impl::loadLinkElement(U_01,ref()(0,1)); \ - Impl::loadLinkElement(U_11,ref()(1,1)); \ - Impl::loadLinkElement(U_21,ref()(2,1)); \ - UChi_00 = U_00*Chi_00;\ - UChi_10 = U_00*Chi_10;\ - UChi_01 = U_10*Chi_00;\ - UChi_11 = U_10*Chi_10;\ - UChi_02 = U_20*Chi_00;\ - UChi_12 = U_20*Chi_10;\ - UChi_00+= U_01*Chi_01;\ - UChi_10+= U_01*Chi_11;\ - UChi_01+= U_11*Chi_01;\ - UChi_11+= U_11*Chi_11;\ - UChi_02+= U_21*Chi_01;\ - UChi_12+= U_21*Chi_11;\ - Impl::loadLinkElement(U_00,ref()(0,2)); \ - Impl::loadLinkElement(U_10,ref()(1,2)); \ - Impl::loadLinkElement(U_20,ref()(2,2)); \ - UChi_00+= U_00*Chi_02;\ - UChi_10+= U_00*Chi_12;\ - UChi_01+= U_10*Chi_02;\ - UChi_11+= U_10*Chi_12;\ - UChi_02+= U_20*Chi_02;\ - UChi_12+= U_20*Chi_12;} - - -#define PERMUTE_DIR(dir) \ - permute##dir(Chi_00,Chi_00);\ - permute##dir(Chi_01,Chi_01);\ - permute##dir(Chi_02,Chi_02);\ - permute##dir(Chi_10,Chi_10);\ - permute##dir(Chi_11,Chi_11);\ - permute##dir(Chi_12,Chi_12); - -// hspin(0)=fspin(0)+timesI(fspin(3)); -// hspin(1)=fspin(1)+timesI(fspin(2)); -#define XP_PROJ \ - Chi_00 = Chimu_00+timesI(Chimu_30);\ - Chi_01 = Chimu_01+timesI(Chimu_31);\ - Chi_02 = Chimu_02+timesI(Chimu_32);\ - Chi_10 = Chimu_10+timesI(Chimu_20);\ - Chi_11 = Chimu_11+timesI(Chimu_21);\ - Chi_12 = Chimu_12+timesI(Chimu_22); - -#define YP_PROJ \ - Chi_00 = Chimu_00-Chimu_30;\ - Chi_01 = Chimu_01-Chimu_31;\ - Chi_02 = Chimu_02-Chimu_32;\ - Chi_10 = Chimu_10+Chimu_20;\ - Chi_11 = Chimu_11+Chimu_21;\ - Chi_12 = Chimu_12+Chimu_22; - -#define ZP_PROJ \ - Chi_00 = Chimu_00+timesI(Chimu_20); \ - Chi_01 = Chimu_01+timesI(Chimu_21); \ - Chi_02 = Chimu_02+timesI(Chimu_22); \ - Chi_10 = Chimu_10-timesI(Chimu_30); \ - Chi_11 = Chimu_11-timesI(Chimu_31); \ - Chi_12 = Chimu_12-timesI(Chimu_32); - -#define TP_PROJ \ - Chi_00 = Chimu_00+Chimu_20; \ - Chi_01 = Chimu_01+Chimu_21; \ - Chi_02 = Chimu_02+Chimu_22; \ - Chi_10 = Chimu_10+Chimu_30; \ - Chi_11 = Chimu_11+Chimu_31; \ - Chi_12 = Chimu_12+Chimu_32; - - -// hspin(0)=fspin(0)-timesI(fspin(3)); -// hspin(1)=fspin(1)-timesI(fspin(2)); -#define XM_PROJ \ - Chi_00 = Chimu_00-timesI(Chimu_30);\ - Chi_01 = Chimu_01-timesI(Chimu_31);\ - Chi_02 = Chimu_02-timesI(Chimu_32);\ - Chi_10 = Chimu_10-timesI(Chimu_20);\ - Chi_11 = Chimu_11-timesI(Chimu_21);\ - Chi_12 = Chimu_12-timesI(Chimu_22); - -#define YM_PROJ \ - Chi_00 = Chimu_00+Chimu_30;\ - Chi_01 = Chimu_01+Chimu_31;\ - Chi_02 = Chimu_02+Chimu_32;\ - Chi_10 = Chimu_10-Chimu_20;\ - Chi_11 = Chimu_11-Chimu_21;\ - Chi_12 = Chimu_12-Chimu_22; - -#define ZM_PROJ \ - Chi_00 = Chimu_00-timesI(Chimu_20); \ - Chi_01 = Chimu_01-timesI(Chimu_21); \ - Chi_02 = Chimu_02-timesI(Chimu_22); \ - Chi_10 = Chimu_10+timesI(Chimu_30); \ - Chi_11 = Chimu_11+timesI(Chimu_31); \ - Chi_12 = Chimu_12+timesI(Chimu_32); - -#define TM_PROJ \ - Chi_00 = Chimu_00-Chimu_20; \ - Chi_01 = Chimu_01-Chimu_21; \ - Chi_02 = Chimu_02-Chimu_22; \ - Chi_10 = Chimu_10-Chimu_30; \ - Chi_11 = Chimu_11-Chimu_31; \ - Chi_12 = Chimu_12-Chimu_32; - -// fspin(0)=hspin(0); -// fspin(1)=hspin(1); -// fspin(2)=timesMinusI(hspin(1)); -// fspin(3)=timesMinusI(hspin(0)); -#define XP_RECON\ - result_00 = UChi_00;\ - result_01 = UChi_01;\ - result_02 = UChi_02;\ - result_10 = UChi_10;\ - result_11 = UChi_11;\ - result_12 = UChi_12;\ - result_20 = timesMinusI(UChi_10);\ - result_21 = timesMinusI(UChi_11);\ - result_22 = timesMinusI(UChi_12);\ - result_30 = timesMinusI(UChi_00);\ - result_31 = timesMinusI(UChi_01);\ - result_32 = timesMinusI(UChi_02); - -#define XP_RECON_ACCUM\ - result_00+=UChi_00;\ - result_01+=UChi_01;\ - result_02+=UChi_02;\ - result_10+=UChi_10;\ - result_11+=UChi_11;\ - result_12+=UChi_12;\ - result_20-=timesI(UChi_10);\ - result_21-=timesI(UChi_11);\ - result_22-=timesI(UChi_12);\ - result_30-=timesI(UChi_00);\ - result_31-=timesI(UChi_01);\ - result_32-=timesI(UChi_02); - -#define XM_RECON\ - result_00 = UChi_00;\ - result_01 = UChi_01;\ - result_02 = UChi_02;\ - result_10 = UChi_10;\ - result_11 = UChi_11;\ - result_12 = UChi_12;\ - result_20 = timesI(UChi_10);\ - result_21 = timesI(UChi_11);\ - result_22 = timesI(UChi_12);\ - result_30 = timesI(UChi_00);\ - result_31 = timesI(UChi_01);\ - result_32 = timesI(UChi_02); - -#define XM_RECON_ACCUM\ - result_00+= UChi_00;\ - result_01+= UChi_01;\ - result_02+= UChi_02;\ - result_10+= UChi_10;\ - result_11+= UChi_11;\ - result_12+= UChi_12;\ - result_20+= timesI(UChi_10);\ - result_21+= timesI(UChi_11);\ - result_22+= timesI(UChi_12);\ - result_30+= timesI(UChi_00);\ - result_31+= timesI(UChi_01);\ - result_32+= timesI(UChi_02); - -#define YP_RECON_ACCUM\ - result_00+= UChi_00;\ - result_01+= UChi_01;\ - result_02+= UChi_02;\ - result_10+= UChi_10;\ - result_11+= UChi_11;\ - result_12+= UChi_12;\ - result_20+= UChi_10;\ - result_21+= UChi_11;\ - result_22+= UChi_12;\ - result_30-= UChi_00;\ - result_31-= UChi_01;\ - result_32-= UChi_02; - -#define YM_RECON_ACCUM\ - result_00+= UChi_00;\ - result_01+= UChi_01;\ - result_02+= UChi_02;\ - result_10+= UChi_10;\ - result_11+= UChi_11;\ - result_12+= UChi_12;\ - result_20-= UChi_10;\ - result_21-= UChi_11;\ - result_22-= UChi_12;\ - result_30+= UChi_00;\ - result_31+= UChi_01;\ - result_32+= UChi_02; - -#define ZP_RECON_ACCUM\ - result_00+= UChi_00;\ - result_01+= UChi_01;\ - result_02+= UChi_02;\ - result_10+= UChi_10;\ - result_11+= UChi_11;\ - result_12+= UChi_12;\ - result_20-= timesI(UChi_00); \ - result_21-= timesI(UChi_01); \ - result_22-= timesI(UChi_02); \ - result_30+= timesI(UChi_10); \ - result_31+= timesI(UChi_11); \ - result_32+= timesI(UChi_12); - -#define ZM_RECON_ACCUM\ - result_00+= UChi_00;\ - result_01+= UChi_01;\ - result_02+= UChi_02;\ - result_10+= UChi_10;\ - result_11+= UChi_11;\ - result_12+= UChi_12;\ - result_20+= timesI(UChi_00); \ - result_21+= timesI(UChi_01); \ - result_22+= timesI(UChi_02); \ - result_30-= timesI(UChi_10); \ - result_31-= timesI(UChi_11); \ - result_32-= timesI(UChi_12); - -#define TP_RECON_ACCUM\ - result_00+= UChi_00;\ - result_01+= UChi_01;\ - result_02+= UChi_02;\ - result_10+= UChi_10;\ - result_11+= UChi_11;\ - result_12+= UChi_12;\ - result_20+= UChi_00; \ - result_21+= UChi_01; \ - result_22+= UChi_02; \ - result_30+= UChi_10; \ - result_31+= UChi_11; \ - result_32+= UChi_12; - -#define TM_RECON_ACCUM\ - result_00+= UChi_00;\ - result_01+= UChi_01;\ - result_02+= UChi_02;\ - result_10+= UChi_10;\ - result_11+= UChi_11;\ - result_12+= UChi_12;\ - result_20-= UChi_00; \ - result_21-= UChi_01; \ - result_22-= UChi_02; \ - result_30-= UChi_10; \ - result_31-= UChi_11; \ - result_32-= UChi_12; - -#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \ - SE=st.GetEntry(ptype,DIR,ss); \ - offset = SE->_offset; \ - local = SE->_is_local; \ - perm = SE->_permute; \ - if ( local ) { \ - LOAD_CHIMU; \ - PROJ; \ - if ( perm) { \ - PERMUTE_DIR(PERM); \ - } \ - } else { \ - LOAD_CHI; \ - } \ - MULT_2SPIN(DIR); \ - RECON; - -#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \ - SE=st.GetEntry(ptype,DIR,ss); \ - offset = SE->_offset; \ - local = SE->_is_local; \ - perm = SE->_permute; \ - if ( local ) { \ - LOAD_CHIMU; \ - PROJ; \ - if ( perm) { \ - PERMUTE_DIR(PERM); \ - } \ - } else if ( st.same_node[DIR] ) { \ - LOAD_CHI; \ - } \ - if (local || st.same_node[DIR] ) { \ - MULT_2SPIN(DIR); \ - RECON; \ - } - -#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \ - SE=st.GetEntry(ptype,DIR,ss); \ - offset = SE->_offset; \ - if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \ - LOAD_CHI; \ - MULT_2SPIN(DIR); \ - RECON; \ - nmu++; \ - } - -#define HAND_RESULT(ss) \ - { \ - SiteSpinor & ref (out[ss]); \ - vstream(ref()(0)(0),result_00); \ - vstream(ref()(0)(1),result_01); \ - vstream(ref()(0)(2),result_02); \ - vstream(ref()(1)(0),result_10); \ - vstream(ref()(1)(1),result_11); \ - vstream(ref()(1)(2),result_12); \ - vstream(ref()(2)(0),result_20); \ - vstream(ref()(2)(1),result_21); \ - vstream(ref()(2)(2),result_22); \ - vstream(ref()(3)(0),result_30); \ - vstream(ref()(3)(1),result_31); \ - vstream(ref()(3)(2),result_32); \ - } - -#define HAND_RESULT_EXT(ss) \ - if (nmu){ \ - SiteSpinor & ref (out[ss]); \ - ref()(0)(0)+=result_00; \ - ref()(0)(1)+=result_01; \ - ref()(0)(2)+=result_02; \ - ref()(1)(0)+=result_10; \ - ref()(1)(1)+=result_11; \ - ref()(1)(2)+=result_12; \ - ref()(2)(0)+=result_20; \ - ref()(2)(1)+=result_21; \ - ref()(2)(2)+=result_22; \ - ref()(3)(0)+=result_30; \ - ref()(3)(1)+=result_31; \ - ref()(3)(2)+=result_32; \ - } - - -#define HAND_DECLARATIONS(a) \ - Simd result_00; \ - Simd result_01; \ - Simd result_02; \ - Simd result_10; \ - Simd result_11; \ - Simd result_12; \ - Simd result_20; \ - Simd result_21; \ - Simd result_22; \ - Simd result_30; \ - Simd result_31; \ - Simd result_32; \ - Simd Chi_00; \ - Simd Chi_01; \ - Simd Chi_02; \ - Simd Chi_10; \ - Simd Chi_11; \ - Simd Chi_12; \ - Simd UChi_00; \ - Simd UChi_01; \ - Simd UChi_02; \ - Simd UChi_10; \ - Simd UChi_11; \ - Simd UChi_12; \ - Simd U_00; \ - Simd U_10; \ - Simd U_20; \ - Simd U_01; \ - Simd U_11; \ - Simd U_21; - -#define ZERO_RESULT \ - result_00=Zero(); \ - result_01=Zero(); \ - result_02=Zero(); \ - result_10=Zero(); \ - result_11=Zero(); \ - result_12=Zero(); \ - result_20=Zero(); \ - result_21=Zero(); \ - result_22=Zero(); \ - result_30=Zero(); \ - result_31=Zero(); \ - result_32=Zero(); - -#define Chimu_00 Chi_00 -#define Chimu_01 Chi_01 -#define Chimu_02 Chi_02 -#define Chimu_10 Chi_10 -#define Chimu_11 Chi_11 -#define Chimu_12 Chi_12 -#define Chimu_20 UChi_00 -#define Chimu_21 UChi_01 -#define Chimu_22 UChi_02 -#define Chimu_30 UChi_10 -#define Chimu_31 UChi_11 -#define Chimu_32 UChi_12 - -NAMESPACE_BEGIN(Grid); - -template void -WilsonKernels::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionFieldView &in, FermionFieldView &out) -{ -// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - typedef typename Simd::scalar_type S; - typedef typename Simd::vector_type V; - - HAND_DECLARATIONS(ignore); - - int offset,local,perm, ptype; - StencilEntry *SE; - - HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON); - HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM); - HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); - HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM); - HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM); - HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM); - HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); - HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM); - HAND_RESULT(ss); -} - -template -void WilsonKernels::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionFieldView &in, FermionFieldView &out) -{ - typedef typename Simd::scalar_type S; - typedef typename Simd::vector_type V; - - HAND_DECLARATIONS(ignore); - - StencilEntry *SE; - int offset,local,perm, ptype; - - HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON); - HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM); - HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); - HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM); - HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM); - HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM); - HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); - HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM); - HAND_RESULT(ss); -} - -template void -WilsonKernels::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionFieldView &in, FermionFieldView &out) -{ -// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - typedef typename Simd::scalar_type S; - typedef typename Simd::vector_type V; - - HAND_DECLARATIONS(ignore); - - int offset,local,perm, ptype; - StencilEntry *SE; - ZERO_RESULT; - HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM); - HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM); - HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); - HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM); - HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM); - HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM); - HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); - HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM); - HAND_RESULT(ss); -} - -template -void WilsonKernels::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionFieldView &in, FermionFieldView &out) -{ - typedef typename Simd::scalar_type S; - typedef typename Simd::vector_type V; - - HAND_DECLARATIONS(ignore); - - StencilEntry *SE; - int offset,local,perm, ptype; - ZERO_RESULT; - HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM); - HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM); - HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); - HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM); - HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM); - HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM); - HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); - HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM); - HAND_RESULT(ss); -} - -template void -WilsonKernels::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionFieldView &in, FermionFieldView &out) -{ -// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - typedef typename Simd::scalar_type S; - typedef typename Simd::vector_type V; - - HAND_DECLARATIONS(ignore); - - int offset, ptype; - StencilEntry *SE; - int nmu=0; - ZERO_RESULT; - HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM); - HAND_RESULT_EXT(ss); -} - -template -void WilsonKernels::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionFieldView &in, FermionFieldView &out) -{ - typedef typename Simd::scalar_type S; - typedef typename Simd::vector_type V; - - HAND_DECLARATIONS(ignore); - - StencilEntry *SE; - int offset, ptype; - int nmu=0; - ZERO_RESULT; - HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM); - HAND_RESULT_EXT(ss); -} - -////////////// Wilson ; uses this implementation ///////////////////// - -NAMESPACE_END(Grid); -#undef LOAD_CHIMU -#undef LOAD_CHI -#undef MULT_2SPIN -#undef PERMUTE_DIR -#undef XP_PROJ -#undef YP_PROJ -#undef ZP_PROJ -#undef TP_PROJ -#undef XM_PROJ -#undef YM_PROJ -#undef ZM_PROJ -#undef TM_PROJ -#undef XP_RECON -#undef XP_RECON_ACCUM -#undef XM_RECON -#undef XM_RECON_ACCUM -#undef YP_RECON_ACCUM -#undef YM_RECON_ACCUM -#undef ZP_RECON_ACCUM -#undef ZM_RECON_ACCUM -#undef TP_RECON_ACCUM -#undef TM_RECON_ACCUM -#undef ZERO_RESULT -#undef Chimu_00 -#undef Chimu_01 -#undef Chimu_02 -#undef Chimu_10 -#undef Chimu_11 -#undef Chimu_12 -#undef Chimu_20 -#undef Chimu_21 -#undef Chimu_22 -#undef Chimu_30 -#undef Chimu_31 -#undef Chimu_32 -#undef HAND_STENCIL_LEG -#undef HAND_STENCIL_LEG_INT -#undef HAND_STENCIL_LEG_EXT -#undef HAND_RESULT -#undef HAND_RESULT_INT -#undef HAND_RESULT_EXT diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h index 52dd8320..b24fb3a8 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_double.h +++ b/Grid/simd/Fujitsu_A64FX_asm_double.h @@ -26,20 +26,20 @@ Author: Nils Meyer *************************************************************************************/ /* END LEGAL */ #define LOAD_CHIMU_A64FXd(x) LOAD_CHIMU_INTERLEAVED_A64FXd(x) -#define PREFETCH_CHIMU_L1(A) -#define PREFETCH_GAUGE_L1(A) -#define PREFETCH_CHIMU_L2(A) -#define PREFETCH_GAUGE_L2(A) +#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A) +#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) +#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A) +#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) #define PF_GAUGE(A) -#define PREFETCH1_CHIMU(A) -#define PREFETCH_CHIMU(A) +#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) +#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXd #define COMPLEX_SIGNS(A) #define LOAD64(A,B) #define SAVE_RESULT(A,B) RESULT_A64FXd(A) -#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXd(A) +#define MULT_2SPIN_DIR_PF(A,B) PREFETCH_GAUGE_L1(A); PREFETCH_CHIMU_L2(B); MULT_2SPIN_A64FXd(A); if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } #define MAYBEPERM(A,perm) { A ; } #define LOAD_CHI(base) LOAD_CHI_A64FXd(base) #define ZERO_PSI @@ -105,9 +105,9 @@ asm ( \ #define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ { \ asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ : \ : [fetchptr] "r" (base) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -117,9 +117,9 @@ asm ( \ #define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \ { \ asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ : \ : [fetchptr] "r" (base) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -128,24 +128,30 @@ asm ( \ // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ { \ - const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \ + const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \ : \ : [fetchptr] "r" (baseU) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ ); \ } // PREFETCH_GAUGE_L1 (prefetch to L1) -#define PREFETCH_GAUGE_L1_INTERNAL(A)_A64FXd \ +#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ { \ - const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ : \ : [fetchptr] "r" (baseU) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h index faa8249b..e60ab381 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ b/Grid/simd/Fujitsu_A64FX_asm_single.h @@ -26,20 +26,20 @@ Author: Nils Meyer *************************************************************************************/ /* END LEGAL */ #define LOAD_CHIMU_A64FXf(x) LOAD_CHIMU_INTERLEAVED_A64FXf(x) -#define PREFETCH_CHIMU_L1(A) -#define PREFETCH_GAUGE_L1(A) -#define PREFETCH_CHIMU_L2(A) -#define PREFETCH_GAUGE_L2(A) +#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A) +#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) +#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A) +#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) #define PF_GAUGE(A) -#define PREFETCH1_CHIMU(A) -#define PREFETCH_CHIMU(A) +#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) +#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXf #define COMPLEX_SIGNS(A) #define LOAD64(A,B) #define SAVE_RESULT(A,B) RESULT_A64FXf(A) -#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXf(A) +#define MULT_2SPIN_DIR_PF(A,B) PREFETCH_GAUGE_L1(A); PREFETCH_CHIMU_L2(B); MULT_2SPIN_A64FXf(A); if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } #define MAYBEPERM(A,perm) { A ; } #define LOAD_CHI(base) LOAD_CHI_A64FXf(base) #define ZERO_PSI @@ -105,9 +105,9 @@ asm ( \ #define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \ { \ asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ : \ : [fetchptr] "r" (base) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -117,9 +117,9 @@ asm ( \ #define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \ { \ asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ : \ : [fetchptr] "r" (base) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -128,24 +128,30 @@ asm ( \ // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \ { \ - const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \ + const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \ : \ : [fetchptr] "r" (baseU) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ ); \ } // PREFETCH_GAUGE_L1 (prefetch to L1) -#define PREFETCH_GAUGE_L1_INTERNAL(A)_A64FXf \ +#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \ { \ - const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ : \ : [fetchptr] "r" (baseU) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h index f94d4f47..9cf1c5db 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_double.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -26,20 +26,20 @@ Author: Nils Meyer *************************************************************************************/ /* END LEGAL */ #define LOAD_CHIMU_A64FXd(x) LOAD_CHIMU_INTERLEAVED_A64FXd(x) -#define PREFETCH_CHIMU_L1(A) -#define PREFETCH_GAUGE_L1(A) -#define PREFETCH_CHIMU_L2(A) -#define PREFETCH_GAUGE_L2(A) +#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A) +#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) +#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A) +#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) #define PF_GAUGE(A) -#define PREFETCH1_CHIMU(A) -#define PREFETCH_CHIMU(A) +#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) +#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXd #define COMPLEX_SIGNS(A) #define LOAD64(A,B) #define SAVE_RESULT(A,B) RESULT_A64FXd(A) -#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXd(A) +#define MULT_2SPIN_DIR_PF(A,B) PREFETCH_GAUGE_L1(A); PREFETCH_CHIMU_L2(B); MULT_2SPIN_A64FXd(A); if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } #define MAYBEPERM(A,perm) { A ; } #define LOAD_CHI(base) LOAD_CHI_A64FXd(base) #define ZERO_PSI @@ -154,15 +154,21 @@ Author: Nils Meyer // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ { \ - const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \ + const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ + svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \ } // PREFETCH_GAUGE_L1 (prefetch to L1) -#define PREFETCH_GAUGE_L1_INTERNAL(A)_A64FXd \ +#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ { \ - const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h index 7329e4dc..3d8b6bf5 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_single.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -26,20 +26,20 @@ Author: Nils Meyer *************************************************************************************/ /* END LEGAL */ #define LOAD_CHIMU_A64FXf(x) LOAD_CHIMU_INTERLEAVED_A64FXf(x) -#define PREFETCH_CHIMU_L1(A) -#define PREFETCH_GAUGE_L1(A) -#define PREFETCH_CHIMU_L2(A) -#define PREFETCH_GAUGE_L2(A) +#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A) +#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) +#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A) +#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) #define PF_GAUGE(A) -#define PREFETCH1_CHIMU(A) -#define PREFETCH_CHIMU(A) +#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) +#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXf #define COMPLEX_SIGNS(A) #define LOAD64(A,B) #define SAVE_RESULT(A,B) RESULT_A64FXf(A) -#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXf(A) +#define MULT_2SPIN_DIR_PF(A,B) PREFETCH_GAUGE_L1(A); PREFETCH_CHIMU_L2(B); MULT_2SPIN_A64FXf(A); if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } #define MAYBEPERM(A,perm) { A ; } #define LOAD_CHI(base) LOAD_CHI_A64FXf(base) #define ZERO_PSI @@ -154,15 +154,21 @@ Author: Nils Meyer // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \ { \ - const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \ + const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ + svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \ } // PREFETCH_GAUGE_L1 (prefetch to L1) -#define PREFETCH_GAUGE_L1_INTERNAL(A)_A64FXf \ +#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \ { \ - const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \