/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid Source file: WilsonKernelsAsmBodyA64FX.h Copyright (C) 2020 Author: Nils Meyer Regensburg University This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ // GCC 10 messes up SVE instruction scheduling using -O3, but // -O3 -fno-schedule-insns -fno-schedule-insns2 does wonders // performance now is better than armclang 20.2 #ifdef KERNEL_DAG #define DIR0_PROJ XP_PROJ #define DIR1_PROJ YP_PROJ #define DIR2_PROJ ZP_PROJ #define DIR3_PROJ TP_PROJ #define DIR4_PROJ XM_PROJ #define DIR5_PROJ YM_PROJ #define DIR6_PROJ ZM_PROJ #define DIR7_PROJ TM_PROJ #define DIR0_RECON XP_RECON #define DIR1_RECON YP_RECON_ACCUM #define DIR2_RECON ZP_RECON_ACCUM #define DIR3_RECON TP_RECON_ACCUM #define DIR4_RECON XM_RECON_ACCUM #define DIR5_RECON YM_RECON_ACCUM #define DIR6_RECON ZM_RECON_ACCUM #define DIR7_RECON TM_RECON_ACCUM #else #define DIR0_PROJ XM_PROJ #define DIR1_PROJ YM_PROJ #define DIR2_PROJ ZM_PROJ #define DIR3_PROJ TM_PROJ #define DIR4_PROJ XP_PROJ #define DIR5_PROJ YP_PROJ #define DIR6_PROJ ZP_PROJ #define DIR7_PROJ TP_PROJ #define DIR0_RECON XM_RECON #define DIR1_RECON YM_RECON_ACCUM #define DIR2_RECON ZM_RECON_ACCUM #define DIR3_RECON TM_RECON_ACCUM #define DIR4_RECON XP_RECON_ACCUM #define DIR5_RECON YP_RECON_ACCUM #define DIR6_RECON ZP_RECON_ACCUM #define DIR7_RECON TP_RECON_ACCUM #endif //using namespace std; #undef SHOW //#define SHOW #undef WHERE #ifdef INTERIOR_AND_EXTERIOR #define WHERE "INT_AND_EXT" #endif #ifdef INTERIOR #define WHERE "INT" #endif #ifdef EXTERIOR #define WHERE "EXT" #endif //#pragma message("here") //////////////////////////////////////////////////////////////////////////////// // Comms then compute kernel //////////////////////////////////////////////////////////////////////////////// #ifdef INTERIOR_AND_EXTERIOR #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ basep = st.GetPFInfo(nent,plocal); nent++; \ if ( local ) { \ LOAD_CHIMU(base); \ LOAD_TABLE(PERMUTE_DIR); \ PROJ; \ MAYBEPERM(PERMUTE_DIR,perm); \ } else { \ LOAD_CHI(base); \ } \ base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ MULT_2SPIN_1(Dir); \ PREFETCH_CHIMU(base); \ PREFETCH_CHIMU_L2(basep); \ /* PREFETCH_GAUGE_L1(NxtDir); */ \ MULT_2SPIN_2; \ if (s == 0) { \ if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ } \ RECON; \ /* NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty though I expected that it would improve on performance */ #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ PREFETCH1_CHIMU(base); \ ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) #define RESULT(base,basep) SAVE_RESULT(base,basep); #endif //////////////////////////////////////////////////////////////////////////////// // Pre comms kernel -- prefetch like normal because it is mostly right //////////////////////////////////////////////////////////////////////////////// #ifdef INTERIOR #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ basep = st.GetPFInfo(nent,plocal); nent++; \ if ( local ) { \ LOAD_CHIMU(base); \ LOAD_TABLE(PERMUTE_DIR); \ PROJ; \ MAYBEPERM(PERMUTE_DIR,perm); \ }else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \ if ( local || st.same_node[Dir] ) { \ MULT_2SPIN_1(Dir); \ MULT_2SPIN_2; \ RECON; \ } \ base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ PREFETCH_CHIMU(base); \ PREFETCH_CHIMU_L2(basep); \ #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ PREFETCH1_CHIMU(base); \ { ZERO_PSI; } \ ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) #define RESULT(base,basep) SAVE_RESULT(base,basep); #endif //////////////////////////////////////////////////////////////////////////////// // Post comms kernel //////////////////////////////////////////////////////////////////////////////// #ifdef EXTERIOR #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ if((!local)&&(!st.same_node[Dir]) ) { \ LOAD_CHI(base); \ MULT_2SPIN_1(Dir); \ MULT_2SPIN_2; \ RECON; \ nmu++; \ } #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ nmu=0; \ { ZERO_PSI;} \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ if((!local)&&(!st.same_node[Dir]) ) { \ LOAD_CHI(base); \ MULT_2SPIN_1(Dir); \ MULT_2SPIN_2; \ RECON; \ nmu++; \ } #define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} #endif { int nmu; int local,perm, ptype; uint64_t base; uint64_t basep; const uint64_t plocal =(uint64_t) & in[0]; MASK_REGS; int nmax=U.oSites(); for(int site=0;site=nmax) ssn=0; // int sUn=lo.Reorder(ssn); int sUn=ssn; #else int sU =ssU; int ssn=ssU+1; if(ssn>=nmax) ssn=0; int sUn=ssn; #endif for(int s=0;s