From 6fdce60492bd83688692111090dfe517bf1de08d Mon Sep 17 00:00:00 2001 From: nils meyer Date: Thu, 16 Apr 2020 22:43:32 +0200 Subject: [PATCH] revised BodyA64FX; 990 GiB/s Wilson, 687 GiB/s DW using intrinsics (armclang 20.0) --- .../WilsonKernelsAsmBodyA64FX.h | 168 +++++++++------- Grid/simd/Fujitsu_A64FX_asm_double.h | 92 ++++----- Grid/simd/Fujitsu_A64FX_asm_single.h | 103 ++++------ Grid/simd/Fujitsu_A64FX_intrin_double.h | 72 +++---- Grid/simd/Fujitsu_A64FX_intrin_single.h | 78 +++----- Grid/simd/Fujitsu_A64FX_undef.h | 28 ++- Grid/simd/gridverter.py | 189 +++++------------- 7 files changed, 279 insertions(+), 451 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h index 54a52468..d77b4414 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h @@ -26,14 +26,14 @@ Author: Nils Meyer *************************************************************************************/ /* END LEGAL */ #ifdef KERNEL_DAG -#define DIR0_PROJMEM(base) XP_PROJMEM(base); -#define DIR1_PROJMEM(base) YP_PROJMEM(base); -#define DIR2_PROJMEM(base) ZP_PROJMEM(base); -#define DIR3_PROJMEM(base) TP_PROJMEM(base); -#define DIR4_PROJMEM(base) XM_PROJMEM(base); -#define DIR5_PROJMEM(base) YM_PROJMEM(base); -#define DIR6_PROJMEM(base) ZM_PROJMEM(base); -#define DIR7_PROJMEM(base) TM_PROJMEM(base); +#define DIR0_PROJ XP_PROJ +#define DIR1_PROJ YP_PROJ +#define DIR2_PROJ ZP_PROJ +#define DIR3_PROJ TP_PROJ +#define DIR4_PROJ XM_PROJ +#define DIR5_PROJ YM_PROJ +#define DIR6_PROJ ZM_PROJ +#define DIR7_PROJ TM_PROJ #define DIR0_RECON XP_RECON #define DIR1_RECON YP_RECON_ACCUM #define DIR2_RECON ZP_RECON_ACCUM @@ -43,14 +43,14 @@ Author: Nils Meyer #define DIR6_RECON ZM_RECON_ACCUM #define DIR7_RECON TM_RECON_ACCUM #else -#define DIR0_PROJMEM(base) XM_PROJMEM(base); -#define DIR1_PROJMEM(base) 
YM_PROJMEM(base); -#define DIR2_PROJMEM(base) ZM_PROJMEM(base); -#define DIR3_PROJMEM(base) TM_PROJMEM(base); -#define DIR4_PROJMEM(base) XP_PROJMEM(base); -#define DIR5_PROJMEM(base) YP_PROJMEM(base); -#define DIR6_PROJMEM(base) ZP_PROJMEM(base); -#define DIR7_PROJMEM(base) TP_PROJMEM(base); +#define DIR0_PROJ XM_PROJ +#define DIR1_PROJ YM_PROJ +#define DIR2_PROJ ZM_PROJ +#define DIR3_PROJ TM_PROJ +#define DIR4_PROJ XP_PROJ +#define DIR5_PROJ YP_PROJ +#define DIR6_PROJ ZP_PROJ +#define DIR7_PROJ TP_PROJ #define DIR0_RECON XM_RECON #define DIR1_RECON YM_RECON_ACCUM #define DIR2_RECON ZM_RECON_ACCUM @@ -91,23 +91,28 @@ Author: Nils Meyer #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ basep = st.GetPFInfo(nent,plocal); nent++; \ - if ( local ) { \ - /* PREFETCH_GAUGE_L1(Dir); slightly worse performance */ \ - PROJ(base); \ - /* PREFETCH_GAUGE_L1(Dir); slightly worse performance */ \ - MAYBEPERM(PERMUTE_DIR,perm); \ - } else { \ - LOAD_CHI(base); \ - } \ + if ( local ) { \ + LOAD_CHIMU(base); \ + LOAD_TABLE(PERMUTE_DIR); \ + PROJ; \ + MAYBEPERM(PERMUTE_DIR,perm); \ + } else { \ + LOAD_CHI(base); \ + } \ base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ - PREFETCH_CHIMU(base); \ - MULT_2SPIN_DIR_PF(Dir,basep); \ - RECON; \ + MULT_2SPIN_1(Dir); \ + PREFETCH_CHIMU(base); \ + PREFETCH_CHIMU_L2(basep); \ + /* PREFETCH_GAUGE_L1(NxtDir); */ \ + MULT_2SPIN_2; \ + if (s == 0) { \ + if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ + } \ + RECON; \ -#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ - base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ - PF_GAUGE(Xp); \ - PREFETCH1_CHIMU(base); \ +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + PREFETCH1_CHIMU(base); \ ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) #define RESULT(base,basep) SAVE_RESULT(base,basep); @@ -121,22 +126,28 @@ Author: Nils Meyer #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ basep = 
st.GetPFInfo(nent,plocal); nent++; \ - if ( local ) { \ - PROJ(base); \ - MAYBEPERM(PERMUTE_DIR,perm); \ - }else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \ - if ( local || st.same_node[Dir] ) { \ - MULT_2SPIN_DIR_PF(Dir,basep); \ - RECON; \ - } \ + if ( local ) { \ + LOAD_CHIMU(base); \ + LOAD_TABLE(PERMUTE_DIR); \ + PROJ; \ + MAYBEPERM(PERMUTE_DIR,perm); \ + }else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \ base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ - PREFETCH_CHIMU(base); \ + if ( local || st.same_node[Dir] ) { \ + MULT_2SPIN_1(Dir); \ + PREFETCH_CHIMU(base); \ + /* PREFETCH_GAUGE_L1(NxtDir); */ \ + MULT_2SPIN_2; \ + if (s == 0) { \ + if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ + } \ + RECON; \ + PREFETCH_CHIMU_L2(basep); \ + } else { PREFETCH_CHIMU(base); } \ #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ - PF_GAUGE(Xp); \ PREFETCH1_CHIMU(base); \ - { ZERO_PSI; } \ ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) #define RESULT(base,basep) SAVE_RESULT(base,basep); @@ -149,23 +160,34 @@ Author: Nils Meyer #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ - base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ - if((!local)&&(!st.same_node[Dir]) ) { \ - LOAD_CHI(base); \ - MULT_2SPIN_DIR_PF(Dir,base); \ - RECON; \ - nmu++; \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + if((!local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI(base); \ + MULT_2SPIN_1(Dir); \ + PREFETCH_CHIMU(base); \ + /* PREFETCH_GAUGE_L1(NxtDir); */ \ + MULT_2SPIN_2; \ + if (s == 0) { \ + if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ + } \ + RECON; \ + nmu++; \ } -#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ - nmu=0; \ - { ZERO_PSI;} \ - base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ - if((!local)&&(!st.same_node[Dir]) ) { \ - LOAD_CHI(base); \ - MULT_2SPIN_DIR_PF(Dir,base); \ - RECON; \ - nmu++; \ +#define 
ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + nmu=0; \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;\ + if((!local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI(base); \ + MULT_2SPIN_1(Dir); \ + PREFETCH_CHIMU(base); \ + /* PREFETCH_GAUGE_L1(NxtDir); */ \ + MULT_2SPIN_2; \ + if (s == 0) { \ + if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ + } \ + RECON; \ + nmu++; \ } #define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} @@ -201,7 +223,7 @@ Author: Nils Meyer uint64_t delta_base, delta_base_p; - ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJMEM,DIR0_RECON); + ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJ,DIR0_RECON); #ifdef SHOW float rescale = 64. * 12.; @@ -221,7 +243,7 @@ Author: Nils Meyer std::cout << "----------------------------------------------------" << std::endl; #endif - ASM_LEG(Yp,Zp,PERMUTE_DIR2,DIR1_PROJMEM,DIR1_RECON); + ASM_LEG(Yp,Zp,PERMUTE_DIR2,DIR1_PROJ,DIR1_RECON); #ifdef SHOW std::cout << "Dir = " << Yp << " " << WHERE<< std::endl; @@ -234,7 +256,7 @@ Author: Nils Meyer std::cout << "----------------------------------------------------" << std::endl; #endif - ASM_LEG(Zp,Tp,PERMUTE_DIR1,DIR2_PROJMEM,DIR2_RECON); + ASM_LEG(Zp,Tp,PERMUTE_DIR1,DIR2_PROJ,DIR2_RECON); #ifdef SHOW std::cout << "Dir = " << Zp << " " << WHERE<< std::endl; @@ -247,7 +269,7 @@ Author: Nils Meyer std::cout << "----------------------------------------------------" << std::endl; #endif - ASM_LEG(Tp,Xm,PERMUTE_DIR0,DIR3_PROJMEM,DIR3_RECON); + ASM_LEG(Tp,Xm,PERMUTE_DIR0,DIR3_PROJ,DIR3_RECON); #ifdef SHOW std::cout << "Dir = " << Tp << " " << WHERE<< std::endl; @@ -260,7 +282,7 @@ Author: Nils Meyer std::cout << "----------------------------------------------------" << std::endl; #endif - ASM_LEG(Xm,Ym,PERMUTE_DIR3,DIR4_PROJMEM,DIR4_RECON); + ASM_LEG(Xm,Ym,PERMUTE_DIR3,DIR4_PROJ,DIR4_RECON); #ifdef SHOW std::cout << "Dir = " << Xm << " " << WHERE<< std::endl; @@ -273,7 +295,7 @@ Author: Nils Meyer std::cout << 
"----------------------------------------------------" << std::endl; #endif - ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJMEM,DIR5_RECON); + ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON); #ifdef SHOW std::cout << "Dir = " << Ym << " " << WHERE<< std::endl; @@ -286,7 +308,7 @@ Author: Nils Meyer std::cout << "----------------------------------------------------" << std::endl; #endif - ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJMEM,DIR6_RECON); + ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON); #ifdef SHOW std::cout << "Dir = " << Zm << " " << WHERE<< std::endl; @@ -299,7 +321,7 @@ Author: Nils Meyer std::cout << "----------------------------------------------------" << std::endl; #endif - ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJMEM,DIR7_RECON); + ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON); #ifdef SHOW std::cout << "Dir = " << Tm << " " << WHERE<< std::endl; @@ -337,14 +359,14 @@ Author: Nils Meyer } } -#undef DIR0_PROJMEM -#undef DIR1_PROJMEM -#undef DIR2_PROJMEM -#undef DIR3_PROJMEM -#undef DIR4_PROJMEM -#undef DIR5_PROJMEM -#undef DIR6_PROJMEM -#undef DIR7_PROJMEM +#undef DIR0_PROJ +#undef DIR1_PROJ +#undef DIR2_PROJ +#undef DIR3_PROJ +#undef DIR4_PROJ +#undef DIR5_PROJ +#undef DIR6_PROJ +#undef DIR7_PROJ #undef DIR0_RECON #undef DIR1_RECON #undef DIR2_RECON diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h index 7931398f..4d9e8fd9 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_double.h +++ b/Grid/simd/Fujitsu_A64FX_asm_double.h @@ -25,7 +25,7 @@ Author: Nils Meyer See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ -#define LOAD_CHIMU_A64FXd(x) LOAD_CHIMU_INTERLEAVED_A64FXd(x) +#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXd(base) #define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A) #define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) #define PREFETCH_CHIMU_L2(A) 
PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A) @@ -40,23 +40,19 @@ Author: Nils Meyer #define MASK_REGS DECLARATIONS_A64FXd #define COMPLEX_SIGNS(A) #define LOAD64(A,B) -#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B); -#define MULT_2SPIN_DIR_PF(A,B) \ - MULT_2SPIN_A64FXd(A); \ - PREFETCH_CHIMU_L2(B); \ - if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } } -#define MAYBEPERM(A,perm) { A ; } +#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B) +#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir) +#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd #define LOAD_CHI(base) LOAD_CHI_A64FXd(base) -#define ZERO_PSI -#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) -#define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd -#define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd -#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd -#define TP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TP_PROJ_A64FXd -#define XM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XM_PROJ_A64FXd -#define YM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YM_PROJ_A64FXd -#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZM_PROJ_A64FXd -#define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd +#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) +#define XP_PROJ XP_PROJ_A64FXd +#define YP_PROJ YP_PROJ_A64FXd +#define ZP_PROJ ZP_PROJ_A64FXd +#define TP_PROJ TP_PROJ_A64FXd +#define XM_PROJ XM_PROJ_A64FXd +#define YM_PROJ YM_PROJ_A64FXd +#define ZM_PROJ ZM_PROJ_A64FXd +#define TM_PROJ TM_PROJ_A64FXd #define XP_RECON XP_RECON_A64FXd #define XM_RECON XM_RECON_A64FXd #define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd @@ -67,10 +63,13 @@ Author: Nils Meyer #define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd #define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd #define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd -#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXd; } -#define 
PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXd; } -#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXd; } -#define PERMUTE_DIR3 +#define PERMUTE_DIR0 0 +#define PERMUTE_DIR1 1 +#define PERMUTE_DIR2 2 +#define PERMUTE_DIR3 3 +#define PERMUTE PERMUTE_A64FXd; +#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; } +#define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } } // DECLARATIONS #define DECLARATIONS_A64FXd \ const uint64_t lut[4][8] = { \ @@ -281,8 +280,8 @@ asm ( \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); -// PERM0 -#define PERM0_A64FXd \ +// PERMUTE +#define PERMUTE_A64FXd \ asm ( \ "tbl z12.d, { z12.d }, z30.d \n\t" \ "tbl z13.d, { z13.d }, z30.d \n\t" \ @@ -295,37 +294,6 @@ asm ( \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); -// PERM1 -#define PERM1_A64FXd \ -asm ( \ - "tbl z12.d, { z12.d }, z30.d \n\t" \ - "tbl z13.d, { z13.d }, z30.d \n\t" \ - "tbl z14.d, { z14.d }, z30.d \n\t" \ - "tbl z15.d, { z15.d }, z30.d \n\t" \ - "tbl z16.d, { z16.d }, z30.d \n\t" \ - "tbl z17.d, { z17.d }, z30.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PERM2 -#define PERM2_A64FXd \ -asm ( \ - "tbl z12.d, { z12.d }, z30.d \n\t" \ - "tbl z13.d, { z13.d }, z30.d \n\t" \ - "tbl z14.d, { z14.d }, z30.d \n\t" \ - "tbl z15.d, { z15.d }, z30.d \n\t" \ - "tbl z16.d, { z16.d }, z30.d \n\t" \ - "tbl z17.d, { z17.d }, z30.d \n\t" \ - : \ - : \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PERM3 -#define PERM3_A64FXd - // LOAD_GAUGE #define LOAD_GAUGE \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ @@ -344,7 +312,7 @@ asm ( \ ); \ } // MULT_2SPIN -#define MULT_2SPIN_A64FXd(A) \ +#define MULT_2SPIN_1_A64FXd(A) \ { \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ asm ( \ @@ -375,6 +343,15 @@ asm ( \ "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \ "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \ "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (baseU + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// MULT_2SPIN_BACKEND +#define MULT_2SPIN_2_A64FXd \ +{ \ +asm ( \ "fcmla z18.d, p5/m, z27.d, z13.d, 0 \n\t" \ "fcmla z21.d, p5/m, z27.d, z16.d, 0 \n\t" \ "fcmla z19.d, p5/m, z28.d, z13.d, 0 \n\t" \ @@ -400,15 +377,14 @@ asm ( \ "fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \ "fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \ : \ - : [fetchptr] "r" (baseU + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); \ } // XP_PROJ #define XP_PROJ_A64FXd \ { \ asm ( \ - "ptrue p5.d \n\t" \ "fcadd z12.d, p5/m, z12.d, z21.d, 90 \n\t" \ "fcadd z13.d, p5/m, z13.d, z22.d, 90 \n\t" \ "fcadd z14.d, p5/m, z14.d, z23.d, 90 \n\t" \ diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h 
b/Grid/simd/Fujitsu_A64FX_asm_single.h index 8b4442c8..e1532acb 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ b/Grid/simd/Fujitsu_A64FX_asm_single.h @@ -25,7 +25,7 @@ Author: Nils Meyer See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ -#define LOAD_CHIMU_A64FXf(x) LOAD_CHIMU_INTERLEAVED_A64FXf(x) +#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXf(base) #define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A) #define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) #define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A) @@ -40,23 +40,19 @@ Author: Nils Meyer #define MASK_REGS DECLARATIONS_A64FXf #define COMPLEX_SIGNS(A) #define LOAD64(A,B) -#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B); -#define MULT_2SPIN_DIR_PF(A,B) \ - MULT_2SPIN_A64FXf(A); \ - PREFETCH_CHIMU_L2(B); \ - if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } } -#define MAYBEPERM(A,perm) { A ; } +#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B) +#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir) +#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf #define LOAD_CHI(base) LOAD_CHI_A64FXf(base) -#define ZERO_PSI -#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) -#define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf -#define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf -#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf -#define TP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TP_PROJ_A64FXf -#define XM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XM_PROJ_A64FXf -#define YM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YM_PROJ_A64FXf -#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZM_PROJ_A64FXf -#define TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf +#define ADD_RESULT(base,basep) 
LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) +#define XP_PROJ XP_PROJ_A64FXf +#define YP_PROJ YP_PROJ_A64FXf +#define ZP_PROJ ZP_PROJ_A64FXf +#define TP_PROJ TP_PROJ_A64FXf +#define XM_PROJ XM_PROJ_A64FXf +#define YM_PROJ YM_PROJ_A64FXf +#define ZM_PROJ ZM_PROJ_A64FXf +#define TM_PROJ TM_PROJ_A64FXf #define XP_RECON XP_RECON_A64FXf #define XM_RECON XM_RECON_A64FXf #define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf @@ -67,10 +63,13 @@ Author: Nils Meyer #define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf #define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf #define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf -#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXf; } -#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXf; } -#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXf; } -#define PERMUTE_DIR3 LOAD_TABLE3; if (perm) { PERM3_A64FXf; } +#define PERMUTE_DIR0 0 +#define PERMUTE_DIR1 1 +#define PERMUTE_DIR2 2 +#define PERMUTE_DIR3 3 +#define PERMUTE PERMUTE_A64FXf; +#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1 } else if (Dir == 2) { LOAD_TABLE2; } else if (Dir == 3) { LOAD_TABLE3; } +#define MAYBEPERM(A,perm) if (perm) { PERMUTE; } // DECLARATIONS #define DECLARATIONS_A64FXf \ const uint32_t lut[4][16] = { \ @@ -281,50 +280,8 @@ asm ( \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); -// PERM0 -#define PERM0_A64FXf \ -asm ( \ - "tbl z12.s, { z12.s }, z30.s \n\t" \ - "tbl z13.s, { z13.s }, z30.s \n\t" \ - "tbl z14.s, { z14.s }, z30.s \n\t" \ - "tbl z15.s, { z15.s }, z30.s \n\t" \ - "tbl z16.s, { z16.s }, z30.s \n\t" \ - "tbl z17.s, { z17.s }, z30.s \n\t" \ - : \ - : \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PERM1 -#define PERM1_A64FXf \ -asm ( \ - "tbl z12.s, { z12.s }, z30.s \n\t" \ - "tbl z13.s, { z13.s }, z30.s \n\t" \ - "tbl z14.s, { z14.s }, z30.s \n\t" \ - "tbl z15.s, { z15.s }, z30.s \n\t" \ - "tbl z16.s, { z16.s }, z30.s \n\t" \ - "tbl z17.s, { z17.s }, z30.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PERM2 -#define PERM2_A64FXf \ -asm ( \ - "tbl z12.s, { z12.s }, z30.s \n\t" \ - "tbl z13.s, { z13.s }, z30.s \n\t" \ - "tbl z14.s, { z14.s }, z30.s \n\t" \ - "tbl z15.s, { z15.s }, z30.s \n\t" \ - "tbl z16.s, { z16.s }, z30.s \n\t" \ - "tbl z17.s, { z17.s }, z30.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PERM3 -#define PERM3_A64FXf \ +// PERMUTE +#define PERMUTE_A64FXf \ asm ( \ "tbl z12.s, { z12.s }, z30.s \n\t" \ "tbl z13.s, { z13.s }, z30.s \n\t" \ @@ -355,7 +312,7 @@ asm ( \ ); \ } // MULT_2SPIN -#define MULT_2SPIN_A64FXf(A) \ +#define MULT_2SPIN_1_A64FXf(A) \ { \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ asm ( \ @@ -386,6 +343,15 @@ asm ( \ "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \ "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \ "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (baseU + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// MULT_2SPIN_BACKEND +#define 
MULT_2SPIN_2_A64FXf \ +{ \ +asm ( \ "fcmla z18.s, p5/m, z27.s, z13.s, 0 \n\t" \ "fcmla z21.s, p5/m, z27.s, z16.s, 0 \n\t" \ "fcmla z19.s, p5/m, z28.s, z13.s, 0 \n\t" \ @@ -411,15 +377,14 @@ asm ( \ "fcmla z20.s, p5/m, z26.s, z14.s, 90 \n\t" \ "fcmla z23.s, p5/m, z26.s, z17.s, 90 \n\t" \ : \ - : [fetchptr] "r" (baseU + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); \ } // XP_PROJ #define XP_PROJ_A64FXf \ { \ asm ( \ - "ptrue p5.s \n\t" \ "fcadd z12.s, p5/m, z12.s, z21.s, 90 \n\t" \ "fcadd z13.s, p5/m, z13.s, z22.s, 90 \n\t" \ "fcadd z14.s, p5/m, z14.s, z23.s, 90 \n\t" \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h index 2ddb33f1..4a792047 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_double.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -25,7 +25,7 @@ Author: Nils Meyer See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ -#define LOAD_CHIMU_A64FXd(x) LOAD_CHIMU_INTERLEAVED_A64FXd(x) +#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXd(base) #define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A) #define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) #define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A) @@ -40,23 +40,19 @@ Author: Nils Meyer #define MASK_REGS DECLARATIONS_A64FXd #define COMPLEX_SIGNS(A) #define LOAD64(A,B) -#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B); -#define MULT_2SPIN_DIR_PF(A,B) \ - MULT_2SPIN_A64FXd(A); \ - 
PREFETCH_CHIMU_L2(B); \ - if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } } -#define MAYBEPERM(A,perm) { A ; } +#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B) +#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir) +#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd #define LOAD_CHI(base) LOAD_CHI_A64FXd(base) -#define ZERO_PSI -#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) -#define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd -#define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd -#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd -#define TP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TP_PROJ_A64FXd -#define XM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XM_PROJ_A64FXd -#define YM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YM_PROJ_A64FXd -#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZM_PROJ_A64FXd -#define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd +#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) +#define XP_PROJ XP_PROJ_A64FXd +#define YP_PROJ YP_PROJ_A64FXd +#define ZP_PROJ ZP_PROJ_A64FXd +#define TP_PROJ TP_PROJ_A64FXd +#define XM_PROJ XM_PROJ_A64FXd +#define YM_PROJ YM_PROJ_A64FXd +#define ZM_PROJ ZM_PROJ_A64FXd +#define TM_PROJ TM_PROJ_A64FXd #define XP_RECON XP_RECON_A64FXd #define XM_RECON XM_RECON_A64FXd #define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd @@ -67,10 +63,13 @@ Author: Nils Meyer #define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd #define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd #define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd -#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXd; } -#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXd; } -#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXd; } -#define PERMUTE_DIR3 +#define PERMUTE_DIR0 0 +#define PERMUTE_DIR1 1 +#define PERMUTE_DIR2 2 +#define PERMUTE_DIR3 3 +#define PERMUTE PERMUTE_A64FXd; +#define LOAD_TABLE(Dir) if 
(Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; } +#define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } } // DECLARATIONS #define DECLARATIONS_A64FXd \ const uint64_t lut[4][8] = { \ @@ -254,8 +253,8 @@ Author: Nils Meyer #define LOAD_TABLE3 \ table0 = svld1(pg1, (uint64_t*)&lut[3]); -// PERM0 -#define PERM0_A64FXd \ +// PERMUTE +#define PERMUTE_A64FXd \ Chi_00 = svtbl(Chi_00, table0); \ Chi_01 = svtbl(Chi_01, table0); \ Chi_02 = svtbl(Chi_02, table0); \ @@ -263,27 +262,6 @@ Author: Nils Meyer Chi_11 = svtbl(Chi_11, table0); \ Chi_12 = svtbl(Chi_12, table0); -// PERM1 -#define PERM1_A64FXd \ - Chi_00 = svtbl(Chi_00, table0); \ - Chi_01 = svtbl(Chi_01, table0); \ - Chi_02 = svtbl(Chi_02, table0); \ - Chi_10 = svtbl(Chi_10, table0); \ - Chi_11 = svtbl(Chi_11, table0); \ - Chi_12 = svtbl(Chi_12, table0); - -// PERM2 -#define PERM2_A64FXd \ - Chi_00 = svtbl(Chi_00, table0); \ - Chi_01 = svtbl(Chi_01, table0); \ - Chi_02 = svtbl(Chi_02, table0); \ - Chi_10 = svtbl(Chi_10, table0); \ - Chi_11 = svtbl(Chi_11, table0); \ - Chi_12 = svtbl(Chi_12, table0); - -// PERM3 -#define PERM3_A64FXd - // LOAD_GAUGE #define LOAD_GAUGE \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ @@ -296,7 +274,7 @@ Author: Nils Meyer U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ } // MULT_2SPIN -#define MULT_2SPIN_A64FXd(A) \ +#define MULT_2SPIN_1_A64FXd(A) \ { \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ @@ -320,6 +298,10 @@ Author: Nils Meyer U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \ U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \ U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \ +} +// MULT_2SPIN_BACKEND +#define MULT_2SPIN_2_A64FXd \ +{ \ UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \ UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \ UChi_01 = 
svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h index 3824aecf..0ba5df17 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_single.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -25,7 +25,7 @@ Author: Nils Meyer See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ -#define LOAD_CHIMU_A64FXf(x) LOAD_CHIMU_INTERLEAVED_A64FXf(x) +#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXf(base) #define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A) #define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) #define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A) @@ -40,23 +40,19 @@ Author: Nils Meyer #define MASK_REGS DECLARATIONS_A64FXf #define COMPLEX_SIGNS(A) #define LOAD64(A,B) -#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B); -#define MULT_2SPIN_DIR_PF(A,B) \ - MULT_2SPIN_A64FXf(A); \ - PREFETCH_CHIMU_L2(B); \ - if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } } -#define MAYBEPERM(A,perm) { A ; } +#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B) +#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir) +#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf #define LOAD_CHI(base) LOAD_CHI_A64FXf(base) -#define ZERO_PSI -#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) -#define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf -#define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf -#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf -#define TP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TP_PROJ_A64FXf -#define XM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XM_PROJ_A64FXf -#define YM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YM_PROJ_A64FXf -#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZM_PROJ_A64FXf -#define 
TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf +#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) +#define XP_PROJ XP_PROJ_A64FXf +#define YP_PROJ YP_PROJ_A64FXf +#define ZP_PROJ ZP_PROJ_A64FXf +#define TP_PROJ TP_PROJ_A64FXf +#define XM_PROJ XM_PROJ_A64FXf +#define YM_PROJ YM_PROJ_A64FXf +#define ZM_PROJ ZM_PROJ_A64FXf +#define TM_PROJ TM_PROJ_A64FXf #define XP_RECON XP_RECON_A64FXf #define XM_RECON XM_RECON_A64FXf #define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf @@ -67,10 +63,13 @@ Author: Nils Meyer #define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf #define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf #define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf -#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXf; } -#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXf; } -#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXf; } -#define PERMUTE_DIR3 LOAD_TABLE3; if (perm) { PERM3_A64FXf; } +#define PERMUTE_DIR0 0 +#define PERMUTE_DIR1 1 +#define PERMUTE_DIR2 2 +#define PERMUTE_DIR3 3 +#define PERMUTE PERMUTE_A64FXf; +#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1 } else if (Dir == 2) { LOAD_TABLE2; } else if (Dir == 3) { LOAD_TABLE3; } +#define MAYBEPERM(A,perm) if (perm) { PERMUTE; } // DECLARATIONS #define DECLARATIONS_A64FXf \ const uint32_t lut[4][16] = { \ @@ -254,35 +253,8 @@ Author: Nils Meyer #define LOAD_TABLE3 \ table0 = svld1(pg1, (uint32_t*)&lut[3]); -// PERM0 -#define PERM0_A64FXf \ - Chi_00 = svtbl(Chi_00, table0); \ - Chi_01 = svtbl(Chi_01, table0); \ - Chi_02 = svtbl(Chi_02, table0); \ - Chi_10 = svtbl(Chi_10, table0); \ - Chi_11 = svtbl(Chi_11, table0); \ - Chi_12 = svtbl(Chi_12, table0); - -// PERM1 -#define PERM1_A64FXf \ - Chi_00 = svtbl(Chi_00, table0); \ - Chi_01 = svtbl(Chi_01, table0); \ - Chi_02 = svtbl(Chi_02, table0); \ - Chi_10 = svtbl(Chi_10, table0); \ - Chi_11 = svtbl(Chi_11, table0); \ - Chi_12 = svtbl(Chi_12, table0); - -// PERM2 -#define 
PERM2_A64FXf \ - Chi_00 = svtbl(Chi_00, table0); \ - Chi_01 = svtbl(Chi_01, table0); \ - Chi_02 = svtbl(Chi_02, table0); \ - Chi_10 = svtbl(Chi_10, table0); \ - Chi_11 = svtbl(Chi_11, table0); \ - Chi_12 = svtbl(Chi_12, table0); - -// PERM3 -#define PERM3_A64FXf \ +// PERMUTE +#define PERMUTE_A64FXf \ Chi_00 = svtbl(Chi_00, table0); \ Chi_01 = svtbl(Chi_01, table0); \ Chi_02 = svtbl(Chi_02, table0); \ @@ -302,7 +274,7 @@ Author: Nils Meyer U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ } // MULT_2SPIN -#define MULT_2SPIN_A64FXf(A) \ +#define MULT_2SPIN_1_A64FXf(A) \ { \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ @@ -326,6 +298,10 @@ Author: Nils Meyer U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \ U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \ U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \ +} +// MULT_2SPIN_BACKEND +#define MULT_2SPIN_2_A64FXf \ +{ \ UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \ UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \ UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \ diff --git a/Grid/simd/Fujitsu_A64FX_undef.h b/Grid/simd/Fujitsu_A64FX_undef.h index 5c41a7c4..81eec37a 100644 --- a/Grid/simd/Fujitsu_A64FX_undef.h +++ b/Grid/simd/Fujitsu_A64FX_undef.h @@ -26,14 +26,12 @@ Author: Nils Meyer *************************************************************************************/ /* END LEGAL */ -#undef LOAD_CHIMU_A64FXd -#undef LOAD_CHIMU_A64FXf +#undef LOAD_CHIMU #undef PREFETCH_CHIMU_L1 #undef PREFETCH_GAUGE_L1 #undef PREFETCH_CHIMU_L2 #undef PREFETCH_GAUGE_L2 #undef PREFETCH_GAUGE_L1_INTERNAL -#undef PF_GAUGE #undef PREFETCH1_CHIMU #undef PREFETCH_CHIMU #undef PREFETCH_RESULT_L2_STORE @@ -42,22 +40,20 @@ Author: Nils Meyer #undef LOCK_GAUGE #undef UNLOCK_GAUGE #undef MASK_REGS -#undef COMPLEX_SIGNS -#undef LOAD64 #undef SAVE_RESULT #undef ADD_RESULT -#undef MULT_2SPIN_DIR_PF 
+#undef MULT_2SPIN_1 +#undef MULT_2SPIN_2 #undef MAYBEPERM #undef LOAD_CHI -#undef ZERO_PSI -#undef XP_PROJMEM -#undef YP_PROJMEM -#undef ZP_PROJMEM -#undef TP_PROJMEM -#undef XM_PROJMEM -#undef YM_PROJMEM -#undef ZM_PROJMEM -#undef TM_PROJMEM +#undef XP_PROJ +#undef YP_PROJ +#undef ZP_PROJ +#undef TP_PROJ +#undef XM_PROJ +#undef YM_PROJ +#undef ZM_PROJ +#undef TM_PROJ #undef XP_RECON #undef XM_RECON #undef XM_RECON_ACCUM @@ -68,10 +64,12 @@ Author: Nils Meyer #undef YP_RECON_ACCUM #undef ZP_RECON_ACCUM #undef TP_RECON_ACCUM +#undef PERMUTE #undef PERMUTE_DIR0 #undef PERMUTE_DIR1 #undef PERMUTE_DIR2 #undef PERMUTE_DIR3 +#undef LOAD_TABLE #undef LOAD_TABLE0 #undef LOAD_TABLE1 #undef LOAD_TABLE2 diff --git a/Grid/simd/gridverter.py b/Grid/simd/gridverter.py index 137471cd..7628159b 100755 --- a/Grid/simd/gridverter.py +++ b/Grid/simd/gridverter.py @@ -115,37 +115,9 @@ STORE_BASE_PTR_COLOR_OFFSET = 2 # 256 bytes * 2.2 GHz = 563.2 GB/s (base 10), 524 GB/s (base 2) OPT = """ -#ifdef INTERIOR - -#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ - basep = st.GetPFInfo(nent,plocal); nent++; \ - if ( local ) { \ --- LOAD64(%r10,isigns); \ - PROJ(base); \ -++ PF_GAUGE(Dir); \ - MAYBEPERM(PERMUTE_DIR,perm); \ - } else if ( st.same_node[Dir] ) { - LOAD_CHI(base); -++ PF_GAUGE(Dir); - } \ - base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ - if ( local || st.same_node[Dir] ) { \ - MULT_2SPIN_DIR_PF(Dir,basep); \ - PREFETCH_CHIMU(base); \ --- LOAD64(%r10,isigns); \ - RECON; \ - } else { PREFETCH_CHIMU(base); } - -#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ - base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ --- PF_GAUGE(Xp); \ - PREFETCH1_CHIMU(base); \ - { ZERO_PSI; } \ - ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) - -#define RESULT(base,basep) SAVE_RESULT(base,basep); - -#endif +* interleave prefetching and compute in MULT_2SPIN +* could test storing U's in MULT_2SPIN to L1d, might be beneficial for life time cache lines +* structure 
reordering: MAYBEPERM after MULT_2SPIN ? """ filename = 'XXX' @@ -905,7 +877,8 @@ if ALTERNATIVE_LOADS == True: define(F'LOAD_CHIMU_0312_PLUG LOAD_CHIMU_0312_{PRECSUFFIX}') define(F'LOAD_CHIMU(x)') else: - define(F'LOAD_CHIMU_{PRECSUFFIX}(x) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(x)') + #define(F'LOAD_CHIMU_{PRECSUFFIX}(x) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(x)') + define(F'LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(base)') if PREFETCH: define(F'PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_{PRECSUFFIX}(A)') @@ -935,39 +908,22 @@ define(F'UNLOCK_GAUGE(A)') define(F'MASK_REGS DECLARATIONS_{PRECSUFFIX}') define(F'COMPLEX_SIGNS(A)') define(F'LOAD64(A,B)') -# prefetch chimu here is useless, because already done in last leg -#define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A);') -define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_RESULT_L2_STORE(B);') -if PREFETCH: - definemultiline(F'MULT_2SPIN_DIR_PF(A,B) ') - write (F' MULT_2SPIN_{PRECSUFFIX}(A); \\') - write (F' PREFETCH_CHIMU_L2(B); \\') - write (F' if (s == 0) {{ if ((A == 0) || (A == 4)) {{ PREFETCH_GAUGE_L2(A); }} }}') - -# definemultiline(F'MULT_2SPIN_DIR_PF(A,B) PREFETCH_GAUGE_L1(A);') -# write (F' PREFETCH_CHIMU_L2(B); \\') -# write (F' MULT_2SPIN_{PRECSUFFIX}(A); \\') -# write (F' if (s == 0) {{ if ((A == 0) || (A == 4)) {{ PREFETCH_GAUGE_L2(A); }} }}') - newline() -else: - define(F'MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_{PRECSUFFIX}(A)') -# break out maybeperm in permutes -#define(F'MAYBEPERM(A,perm) if (perm) {{ A ; }}') -define(F'MAYBEPERM(A,perm) {{ A ; }}') +define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_RESULT_L2_STORE(B)') +define(F'MULT_2SPIN_1(Dir) MULT_2SPIN_1_{PRECSUFFIX}(Dir)') +define(F'MULT_2SPIN_2 MULT_2SPIN_2_{PRECSUFFIX}') define(F'LOAD_CHI(base) LOAD_CHI_{PRECSUFFIX}(base)') # don't need zero psi, everything is done in recons #define(F'ZERO_PSI ZERO_PSI_{PRECSUFFIX}') -define(F'ZERO_PSI') -define(F'ADD_RESULT(base,basep) LOAD_CHIMU_{PRECSUFFIX}(base); 
ADD_RESULT_INTERNAL_{PRECSUFFIX}; RESULT_{PRECSUFFIX}(base)') +define(F'ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_{PRECSUFFIX}; RESULT_{PRECSUFFIX}(base)') # loads projections -define(F'XP_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); XP_PROJ_{PRECSUFFIX}') -define(F'YP_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); YP_PROJ_{PRECSUFFIX}') -define(F'ZP_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); ZP_PROJ_{PRECSUFFIX}') -define(F'TP_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); TP_PROJ_{PRECSUFFIX}') -define(F'XM_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); XM_PROJ_{PRECSUFFIX}') -define(F'YM_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); YM_PROJ_{PRECSUFFIX}') -define(F'ZM_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); ZM_PROJ_{PRECSUFFIX}') -define(F'TM_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); TM_PROJ_{PRECSUFFIX}') +define(F'XP_PROJ XP_PROJ_{PRECSUFFIX}') +define(F'YP_PROJ YP_PROJ_{PRECSUFFIX}') +define(F'ZP_PROJ ZP_PROJ_{PRECSUFFIX}') +define(F'TP_PROJ TP_PROJ_{PRECSUFFIX}') +define(F'XM_PROJ XM_PROJ_{PRECSUFFIX}') +define(F'YM_PROJ YM_PROJ_{PRECSUFFIX}') +define(F'ZM_PROJ ZM_PROJ_{PRECSUFFIX}') +define(F'TM_PROJ TM_PROJ_{PRECSUFFIX}') # recons define(F'XP_RECON XP_RECON_{PRECSUFFIX}') define(F'XM_RECON XM_RECON_{PRECSUFFIX}') @@ -979,14 +935,21 @@ define(F'XP_RECON_ACCUM XP_RECON_ACCUM_{PRECSUFFIX}') define(F'YP_RECON_ACCUM YP_RECON_ACCUM_{PRECSUFFIX}') define(F'ZP_RECON_ACCUM ZP_RECON_ACCUM_{PRECSUFFIX}') define(F'TP_RECON_ACCUM TP_RECON_ACCUM_{PRECSUFFIX}') -# permutes -define(F'PERMUTE_DIR0 LOAD_TABLE0; if (perm) {{ PERM0_{PRECSUFFIX}; }}') -define(F'PERMUTE_DIR1 LOAD_TABLE1; if (perm) {{ PERM1_{PRECSUFFIX}; }}') -define(F'PERMUTE_DIR2 LOAD_TABLE2; if (perm) {{ PERM2_{PRECSUFFIX}; }}') +# new permutes +define(F'PERMUTE_DIR0 0') +define(F'PERMUTE_DIR1 1') +define(F'PERMUTE_DIR2 2') +define(F'PERMUTE_DIR3 3') +define(F'PERMUTE PERMUTE_{PRECSUFFIX};') +# load table +#define(F'MAYBEPERM(A,perm) if (perm) {{ A ; }}') if PRECISION == 'double': - 
define(F'PERMUTE_DIR3') + define(F'LOAD_TABLE(Dir) if (Dir == 0) {{ LOAD_TABLE0; }} else if (Dir == 1) {{ LOAD_TABLE1; }} else if (Dir == 2) {{ LOAD_TABLE2; }}') + define(F'MAYBEPERM(Dir,perm) if (Dir != 3) {{ if (perm) {{ PERMUTE; }} }}') else: - define(F'PERMUTE_DIR3 LOAD_TABLE3; if (perm) {{ PERM3_{PRECSUFFIX}; }}') + define(F'LOAD_TABLE(Dir) if (Dir == 0) {{ LOAD_TABLE0; }} else if (Dir == 1) {{ LOAD_TABLE1; }} else if (Dir == 2) {{ LOAD_TABLE2; }} else if (Dir == 3) {{ LOAD_TABLE3; }}') + define(F'MAYBEPERM(A,perm) if (perm) {{ PERMUTE; }}') + write('// DECLARATIONS') @@ -1040,20 +1003,14 @@ U_01.declare() U_11.declare() U_21.declare() # 6 -> 30 regs -# all true +# all predications true pg1.declare() if PRECISION == 'double': pg1.movestr('svptrue_b64()') else: pg1.movestr('svptrue_b32()') -# even elements only -#pg2.declare() -#pg2.movestr('svzip1_b64(svptrue_b64(), svpfalse_b())') - -# preload tables -# 0: swap -# 1: permute 1 +# tables if PRECISION == 'double': write(' svuint64_t table0; \\', target='I') # -> 31 regs else: @@ -1061,10 +1018,10 @@ else: zero0.declare() +# zero register asmopen() zero0.zero(zeroreg=True) asmclose() - newline() define('Chimu_00 Chi_00', target='I') @@ -1087,7 +1044,6 @@ else: # wilson4.h define('Chimu_30 U_01', target='I') define('Chimu_31 U_11', target='I') define('Chimu_32 U_21', target='I') - newline() @@ -1380,47 +1336,11 @@ table0.loadtable(3) asmclose() newline() -# 8 directions = 6x permutations -d['factor'] = 2 # factor is 0 -d['cycles_PERM'] += 6 * d['factor'] -write('// PERM0') -definemultiline(F'PERM0_{PRECSUFFIX}') -debugall('PERM0 PRE', group='Chi') -asmopen() -#table0.loadtable(0) -Chi_00.permute(0, table0) -Chi_01.permute(0, table0) -Chi_02.permute(0, table0) -Chi_10.permute(0, table0) -Chi_11.permute(0, table0) -Chi_12.permute(0, table0) -asmclose() -debugall('PERM0 POST', group='Chi') -newline() - d['factor'] = 2 # factor is 2 d['cycles_PERM'] += 6 * d['factor'] -write('// PERM1') 
-definemultiline(F'PERM1_{PRECSUFFIX}') -debugall('PERM1 PRE', group='Chi') -asmopen() -#table0.loadtable(1) -Chi_00.permute(1, table0) -Chi_01.permute(1, table0) -Chi_02.permute(1, table0) -Chi_10.permute(1, table0) -Chi_11.permute(1, table0) -Chi_12.permute(1, table0) -asmclose() -debugall('PERM1 POST', group='Chi') -newline() - -d['factor'] = 2 # factor is 2 -# PERM2 = swap real and imaginary -d['cycles_PERM'] += 6 * d['factor'] -write('// PERM2') -definemultiline(F'PERM2_{PRECSUFFIX}') -debugall('PERM2 PRE', group='Chi') +write('// PERMUTE') +definemultiline(F'PERMUTE_{PRECSUFFIX}') +debugall('PERM PRE', group='Chi') asmopen() #table0.loadtable(2) Chi_00.permute(2, table0) @@ -1430,26 +1350,7 @@ Chi_10.permute(2, table0) Chi_11.permute(2, table0) Chi_12.permute(2, table0) asmclose() -debugall('PERM2 POST', group='Chi') -newline() - -# PERM3 = identity (DP), so exclude from counting -d['factor'] = 0 -d['cycles_PERM'] += 6 * d['factor'] -write('// PERM3') -definemultiline(F'PERM3_{PRECSUFFIX}') -if PRECISION == 'single': - debugall('PERM3 PRE', group='Chi') - asmopen() - #table0.loadtable(3) - Chi_00.permute(3, table0) - Chi_01.permute(3, table0) - Chi_02.permute(3, table0) - Chi_10.permute(3, table0) - Chi_11.permute(3, table0) - Chi_12.permute(3, table0) - asmclose() - debugall('PERM3 POST', group='Chi') +debugall('PERM POST', group='Chi') newline() write('// LOAD_GAUGE') @@ -1473,7 +1374,7 @@ if ASM_LOAD_GAUGE: asmclose() curlyclose() newline() -# XXXXXX remove loads + d['factor'] = 8 # MULT_2SPIN executes 1 time per direction = 8 times total # assume all U loads are hidden # FCMLA issue latency = 2 cycles @@ -1482,7 +1383,7 @@ d['factor'] = 8 # MULT_2SPIN executes 1 time per direction = 8 times total # 6 rounds of FCMLA, each with 6 FCMLA -> 21 - 6*2 = 9 d['cycles_MULT_2SPIN'] += 6 * 21 * d['factor'] write('// MULT_2SPIN') -definemultiline(F'MULT_2SPIN_{PRECSUFFIX}(A)') +definemultiline(F'MULT_2SPIN_1_{PRECSUFFIX}(A)') curlyopen() #write(' const auto & 
ref(U[sU][A]); \\') if GRIDBENCH: # referencing differs in Grid and GridBench @@ -1541,7 +1442,15 @@ if ASM_LOAD_GAUGE: U_00.load("ref[0][2]") # U_00, U_10, U_20 overloaded U_10.load("ref[1][2]") # early load U_20.load("ref[2][2]") # A --> +asmclose() +debugall('MULT_2SPIN_1', group='UChi') +curlyclose() +newline() +write('// MULT_2SPIN_BACKEND') +definemultiline(F'MULT_2SPIN_2_{PRECSUFFIX}') +curlyopen() +asmopen() # round 3 UChi_00.mac0(U_01, Chi_01) # armclang separates fcmla(..., 0) and UChi_10.mac0(U_01, Chi_11) # fcmla(..., 90) @@ -1571,7 +1480,7 @@ UChi_11.mac1(U_10, Chi_12) UChi_02.mac1(U_20, Chi_02) UChi_12.mac1(U_20, Chi_12) asmclose() -debugall('MULT_2SPIN', group='UChi') +debugall('MULT_2SPIN_2', group='UChi') curlyclose() newline() @@ -1587,7 +1496,7 @@ if ALTERNATIVE_LOADS == True: write(' LOAD_CHIMU_0312_PLUG \\') curlyopen() asmopen() -pg1.loadpredication() +#pg1.loadpredication() Chi_00.addTimesI(Chimu_00, Chimu_30) Chi_01.addTimesI(Chimu_01, Chimu_31) Chi_02.addTimesI(Chimu_02, Chimu_32)