1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-09-20 09:15:38 +01:00

revised BodyA64FX; 990 GiB/s Wilson, 687 GiB/s DW using intrinsics (armclang 20.0)

This commit is contained in:
nils meyer 2020-04-16 22:43:32 +02:00
parent 852db4626a
commit 6fdce60492
7 changed files with 279 additions and 451 deletions

View File

@ -26,14 +26,14 @@ Author: Nils Meyer <nils.meyer@ur.de>
*************************************************************************************/
/* END LEGAL */
#ifdef KERNEL_DAG
#define DIR0_PROJMEM(base) XP_PROJMEM(base);
#define DIR1_PROJMEM(base) YP_PROJMEM(base);
#define DIR2_PROJMEM(base) ZP_PROJMEM(base);
#define DIR3_PROJMEM(base) TP_PROJMEM(base);
#define DIR4_PROJMEM(base) XM_PROJMEM(base);
#define DIR5_PROJMEM(base) YM_PROJMEM(base);
#define DIR6_PROJMEM(base) ZM_PROJMEM(base);
#define DIR7_PROJMEM(base) TM_PROJMEM(base);
#define DIR0_PROJ XP_PROJ
#define DIR1_PROJ YP_PROJ
#define DIR2_PROJ ZP_PROJ
#define DIR3_PROJ TP_PROJ
#define DIR4_PROJ XM_PROJ
#define DIR5_PROJ YM_PROJ
#define DIR6_PROJ ZM_PROJ
#define DIR7_PROJ TM_PROJ
#define DIR0_RECON XP_RECON
#define DIR1_RECON YP_RECON_ACCUM
#define DIR2_RECON ZP_RECON_ACCUM
@ -43,14 +43,14 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define DIR6_RECON ZM_RECON_ACCUM
#define DIR7_RECON TM_RECON_ACCUM
#else
#define DIR0_PROJMEM(base) XM_PROJMEM(base);
#define DIR1_PROJMEM(base) YM_PROJMEM(base);
#define DIR2_PROJMEM(base) ZM_PROJMEM(base);
#define DIR3_PROJMEM(base) TM_PROJMEM(base);
#define DIR4_PROJMEM(base) XP_PROJMEM(base);
#define DIR5_PROJMEM(base) YP_PROJMEM(base);
#define DIR6_PROJMEM(base) ZP_PROJMEM(base);
#define DIR7_PROJMEM(base) TP_PROJMEM(base);
#define DIR0_PROJ XM_PROJ
#define DIR1_PROJ YM_PROJ
#define DIR2_PROJ ZM_PROJ
#define DIR3_PROJ TM_PROJ
#define DIR4_PROJ XP_PROJ
#define DIR5_PROJ YP_PROJ
#define DIR6_PROJ ZP_PROJ
#define DIR7_PROJ TP_PROJ
#define DIR0_RECON XM_RECON
#define DIR1_RECON YM_RECON_ACCUM
#define DIR2_RECON ZM_RECON_ACCUM
@ -92,21 +92,26 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
basep = st.GetPFInfo(nent,plocal); nent++; \
if ( local ) { \
/* PREFETCH_GAUGE_L1(Dir); slightly worse performance */ \
PROJ(base); \
/* PREFETCH_GAUGE_L1(Dir); slightly worse performance */ \
LOAD_CHIMU(base); \
LOAD_TABLE(PERMUTE_DIR); \
PROJ; \
MAYBEPERM(PERMUTE_DIR,perm); \
} else { \
LOAD_CHI(base); \
} \
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
MULT_2SPIN_1(Dir); \
PREFETCH_CHIMU(base); \
MULT_2SPIN_DIR_PF(Dir,basep); \
PREFETCH_CHIMU_L2(basep); \
/* PREFETCH_GAUGE_L1(NxtDir); */ \
MULT_2SPIN_2; \
if (s == 0) { \
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
} \
RECON; \
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
PF_GAUGE(Xp); \
PREFETCH1_CHIMU(base); \
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
@ -122,21 +127,27 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
basep = st.GetPFInfo(nent,plocal); nent++; \
if ( local ) { \
PROJ(base); \
LOAD_CHIMU(base); \
LOAD_TABLE(PERMUTE_DIR); \
PROJ; \
MAYBEPERM(PERMUTE_DIR,perm); \
}else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \
if ( local || st.same_node[Dir] ) { \
MULT_2SPIN_DIR_PF(Dir,basep); \
RECON; \
} \
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
if ( local || st.same_node[Dir] ) { \
MULT_2SPIN_1(Dir); \
PREFETCH_CHIMU(base); \
/* PREFETCH_GAUGE_L1(NxtDir); */ \
MULT_2SPIN_2; \
if (s == 0) { \
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
} \
RECON; \
PREFETCH_CHIMU_L2(basep); \
} else { PREFETCH_CHIMU(base); } \
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
PF_GAUGE(Xp); \
PREFETCH1_CHIMU(base); \
{ ZERO_PSI; } \
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
#define RESULT(base,basep) SAVE_RESULT(base,basep);
@ -152,18 +163,29 @@ Author: Nils Meyer <nils.meyer@ur.de>
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
if((!local)&&(!st.same_node[Dir]) ) { \
LOAD_CHI(base); \
MULT_2SPIN_DIR_PF(Dir,base); \
MULT_2SPIN_1(Dir); \
PREFETCH_CHIMU(base); \
/* PREFETCH_GAUGE_L1(NxtDir); */ \
MULT_2SPIN_2; \
if (s == 0) { \
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
} \
RECON; \
nmu++; \
}
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
nmu=0; \
{ ZERO_PSI;} \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;\
if((!local)&&(!st.same_node[Dir]) ) { \
LOAD_CHI(base); \
MULT_2SPIN_DIR_PF(Dir,base); \
MULT_2SPIN_1(Dir); \
PREFETCH_CHIMU(base); \
/* PREFETCH_GAUGE_L1(NxtDir); */ \
MULT_2SPIN_2; \
if (s == 0) { \
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
} \
RECON; \
nmu++; \
}
@ -201,7 +223,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
uint64_t delta_base, delta_base_p;
ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJMEM,DIR0_RECON);
ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJ,DIR0_RECON);
#ifdef SHOW
float rescale = 64. * 12.;
@ -221,7 +243,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
std::cout << "----------------------------------------------------" << std::endl;
#endif
ASM_LEG(Yp,Zp,PERMUTE_DIR2,DIR1_PROJMEM,DIR1_RECON);
ASM_LEG(Yp,Zp,PERMUTE_DIR2,DIR1_PROJ,DIR1_RECON);
#ifdef SHOW
std::cout << "Dir = " << Yp << " " << WHERE<< std::endl;
@ -234,7 +256,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
std::cout << "----------------------------------------------------" << std::endl;
#endif
ASM_LEG(Zp,Tp,PERMUTE_DIR1,DIR2_PROJMEM,DIR2_RECON);
ASM_LEG(Zp,Tp,PERMUTE_DIR1,DIR2_PROJ,DIR2_RECON);
#ifdef SHOW
std::cout << "Dir = " << Zp << " " << WHERE<< std::endl;
@ -247,7 +269,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
std::cout << "----------------------------------------------------" << std::endl;
#endif
ASM_LEG(Tp,Xm,PERMUTE_DIR0,DIR3_PROJMEM,DIR3_RECON);
ASM_LEG(Tp,Xm,PERMUTE_DIR0,DIR3_PROJ,DIR3_RECON);
#ifdef SHOW
std::cout << "Dir = " << Tp << " " << WHERE<< std::endl;
@ -260,7 +282,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
std::cout << "----------------------------------------------------" << std::endl;
#endif
ASM_LEG(Xm,Ym,PERMUTE_DIR3,DIR4_PROJMEM,DIR4_RECON);
ASM_LEG(Xm,Ym,PERMUTE_DIR3,DIR4_PROJ,DIR4_RECON);
#ifdef SHOW
std::cout << "Dir = " << Xm << " " << WHERE<< std::endl;
@ -273,7 +295,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
std::cout << "----------------------------------------------------" << std::endl;
#endif
ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJMEM,DIR5_RECON);
ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON);
#ifdef SHOW
std::cout << "Dir = " << Ym << " " << WHERE<< std::endl;
@ -286,7 +308,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
std::cout << "----------------------------------------------------" << std::endl;
#endif
ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJMEM,DIR6_RECON);
ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON);
#ifdef SHOW
std::cout << "Dir = " << Zm << " " << WHERE<< std::endl;
@ -299,7 +321,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
std::cout << "----------------------------------------------------" << std::endl;
#endif
ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJMEM,DIR7_RECON);
ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON);
#ifdef SHOW
std::cout << "Dir = " << Tm << " " << WHERE<< std::endl;
@ -337,14 +359,14 @@ Author: Nils Meyer <nils.meyer@ur.de>
}
}
#undef DIR0_PROJMEM
#undef DIR1_PROJMEM
#undef DIR2_PROJMEM
#undef DIR3_PROJMEM
#undef DIR4_PROJMEM
#undef DIR5_PROJMEM
#undef DIR6_PROJMEM
#undef DIR7_PROJMEM
#undef DIR0_PROJ
#undef DIR1_PROJ
#undef DIR2_PROJ
#undef DIR3_PROJ
#undef DIR4_PROJ
#undef DIR5_PROJ
#undef DIR6_PROJ
#undef DIR7_PROJ
#undef DIR0_RECON
#undef DIR1_RECON
#undef DIR2_RECON

View File

@ -25,7 +25,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#define LOAD_CHIMU_A64FXd(x) LOAD_CHIMU_INTERLEAVED_A64FXd(x)
#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXd(base)
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A)
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A)
@ -40,23 +40,19 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define MASK_REGS DECLARATIONS_A64FXd
#define COMPLEX_SIGNS(A)
#define LOAD64(A,B)
#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B);
#define MULT_2SPIN_DIR_PF(A,B) \
MULT_2SPIN_A64FXd(A); \
PREFETCH_CHIMU_L2(B); \
if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } }
#define MAYBEPERM(A,perm) { A ; }
#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B)
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir)
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd
#define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
#define ZERO_PSI
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd
#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd
#define TP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TP_PROJ_A64FXd
#define XM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XM_PROJ_A64FXd
#define YM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YM_PROJ_A64FXd
#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZM_PROJ_A64FXd
#define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd
#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
#define XP_PROJ XP_PROJ_A64FXd
#define YP_PROJ YP_PROJ_A64FXd
#define ZP_PROJ ZP_PROJ_A64FXd
#define TP_PROJ TP_PROJ_A64FXd
#define XM_PROJ XM_PROJ_A64FXd
#define YM_PROJ YM_PROJ_A64FXd
#define ZM_PROJ ZM_PROJ_A64FXd
#define TM_PROJ TM_PROJ_A64FXd
#define XP_RECON XP_RECON_A64FXd
#define XM_RECON XM_RECON_A64FXd
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd
@ -67,10 +63,13 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd
#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXd; }
#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXd; }
#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXd; }
#define PERMUTE_DIR3
#define PERMUTE_DIR0 0
#define PERMUTE_DIR1 1
#define PERMUTE_DIR2 2
#define PERMUTE_DIR3 3
#define PERMUTE PERMUTE_A64FXd;
#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; }
#define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } }
// DECLARATIONS
#define DECLARATIONS_A64FXd \
const uint64_t lut[4][8] = { \
@ -281,8 +280,8 @@ asm ( \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM0
#define PERM0_A64FXd \
// PERMUTE
#define PERMUTE_A64FXd \
asm ( \
"tbl z12.d, { z12.d }, z30.d \n\t" \
"tbl z13.d, { z13.d }, z30.d \n\t" \
@ -295,37 +294,6 @@ asm ( \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM1
#define PERM1_A64FXd \
asm ( \
"tbl z12.d, { z12.d }, z30.d \n\t" \
"tbl z13.d, { z13.d }, z30.d \n\t" \
"tbl z14.d, { z14.d }, z30.d \n\t" \
"tbl z15.d, { z15.d }, z30.d \n\t" \
"tbl z16.d, { z16.d }, z30.d \n\t" \
"tbl z17.d, { z17.d }, z30.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM2
#define PERM2_A64FXd \
asm ( \
"tbl z12.d, { z12.d }, z30.d \n\t" \
"tbl z13.d, { z13.d }, z30.d \n\t" \
"tbl z14.d, { z14.d }, z30.d \n\t" \
"tbl z15.d, { z15.d }, z30.d \n\t" \
"tbl z16.d, { z16.d }, z30.d \n\t" \
"tbl z17.d, { z17.d }, z30.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM3
#define PERM3_A64FXd
// LOAD_GAUGE
#define LOAD_GAUGE \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
@ -344,7 +312,7 @@ asm ( \
); \
}
// MULT_2SPIN
#define MULT_2SPIN_A64FXd(A) \
#define MULT_2SPIN_1_A64FXd(A) \
{ \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
asm ( \
@ -375,6 +343,15 @@ asm ( \
"ldr z24, [%[fetchptr], -4, mul vl] \n\t" \
"ldr z25, [%[fetchptr], -1, mul vl] \n\t" \
"ldr z26, [%[fetchptr], 2, mul vl] \n\t" \
: \
: [fetchptr] "r" (baseU + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// MULT_2SPIN_BACKEND
#define MULT_2SPIN_2_A64FXd \
{ \
asm ( \
"fcmla z18.d, p5/m, z27.d, z13.d, 0 \n\t" \
"fcmla z21.d, p5/m, z27.d, z16.d, 0 \n\t" \
"fcmla z19.d, p5/m, z28.d, z13.d, 0 \n\t" \
@ -400,15 +377,14 @@ asm ( \
"fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \
"fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \
: \
: [fetchptr] "r" (baseU + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XP_PROJ
#define XP_PROJ_A64FXd \
{ \
asm ( \
"ptrue p5.d \n\t" \
"fcadd z12.d, p5/m, z12.d, z21.d, 90 \n\t" \
"fcadd z13.d, p5/m, z13.d, z22.d, 90 \n\t" \
"fcadd z14.d, p5/m, z14.d, z23.d, 90 \n\t" \

View File

@ -25,7 +25,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#define LOAD_CHIMU_A64FXf(x) LOAD_CHIMU_INTERLEAVED_A64FXf(x)
#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXf(base)
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A)
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A)
@ -40,23 +40,19 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define MASK_REGS DECLARATIONS_A64FXf
#define COMPLEX_SIGNS(A)
#define LOAD64(A,B)
#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B);
#define MULT_2SPIN_DIR_PF(A,B) \
MULT_2SPIN_A64FXf(A); \
PREFETCH_CHIMU_L2(B); \
if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } }
#define MAYBEPERM(A,perm) { A ; }
#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B)
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir)
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf
#define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
#define ZERO_PSI
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf
#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf
#define TP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TP_PROJ_A64FXf
#define XM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XM_PROJ_A64FXf
#define YM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YM_PROJ_A64FXf
#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZM_PROJ_A64FXf
#define TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf
#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
#define XP_PROJ XP_PROJ_A64FXf
#define YP_PROJ YP_PROJ_A64FXf
#define ZP_PROJ ZP_PROJ_A64FXf
#define TP_PROJ TP_PROJ_A64FXf
#define XM_PROJ XM_PROJ_A64FXf
#define YM_PROJ YM_PROJ_A64FXf
#define ZM_PROJ ZM_PROJ_A64FXf
#define TM_PROJ TM_PROJ_A64FXf
#define XP_RECON XP_RECON_A64FXf
#define XM_RECON XM_RECON_A64FXf
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf
@ -67,10 +63,13 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf
#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXf; }
#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXf; }
#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXf; }
#define PERMUTE_DIR3 LOAD_TABLE3; if (perm) { PERM3_A64FXf; }
#define PERMUTE_DIR0 0
#define PERMUTE_DIR1 1
#define PERMUTE_DIR2 2
#define PERMUTE_DIR3 3
#define PERMUTE PERMUTE_A64FXf;
#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1 } else if (Dir == 2) { LOAD_TABLE2; } else if (Dir == 3) { LOAD_TABLE3; }
#define MAYBEPERM(A,perm) if (perm) { PERMUTE; }
// DECLARATIONS
#define DECLARATIONS_A64FXf \
const uint32_t lut[4][16] = { \
@ -281,50 +280,8 @@ asm ( \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM0
#define PERM0_A64FXf \
asm ( \
"tbl z12.s, { z12.s }, z30.s \n\t" \
"tbl z13.s, { z13.s }, z30.s \n\t" \
"tbl z14.s, { z14.s }, z30.s \n\t" \
"tbl z15.s, { z15.s }, z30.s \n\t" \
"tbl z16.s, { z16.s }, z30.s \n\t" \
"tbl z17.s, { z17.s }, z30.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM1
#define PERM1_A64FXf \
asm ( \
"tbl z12.s, { z12.s }, z30.s \n\t" \
"tbl z13.s, { z13.s }, z30.s \n\t" \
"tbl z14.s, { z14.s }, z30.s \n\t" \
"tbl z15.s, { z15.s }, z30.s \n\t" \
"tbl z16.s, { z16.s }, z30.s \n\t" \
"tbl z17.s, { z17.s }, z30.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM2
#define PERM2_A64FXf \
asm ( \
"tbl z12.s, { z12.s }, z30.s \n\t" \
"tbl z13.s, { z13.s }, z30.s \n\t" \
"tbl z14.s, { z14.s }, z30.s \n\t" \
"tbl z15.s, { z15.s }, z30.s \n\t" \
"tbl z16.s, { z16.s }, z30.s \n\t" \
"tbl z17.s, { z17.s }, z30.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM3
#define PERM3_A64FXf \
// PERMUTE
#define PERMUTE_A64FXf \
asm ( \
"tbl z12.s, { z12.s }, z30.s \n\t" \
"tbl z13.s, { z13.s }, z30.s \n\t" \
@ -355,7 +312,7 @@ asm ( \
); \
}
// MULT_2SPIN
#define MULT_2SPIN_A64FXf(A) \
#define MULT_2SPIN_1_A64FXf(A) \
{ \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
asm ( \
@ -386,6 +343,15 @@ asm ( \
"ldr z24, [%[fetchptr], -4, mul vl] \n\t" \
"ldr z25, [%[fetchptr], -1, mul vl] \n\t" \
"ldr z26, [%[fetchptr], 2, mul vl] \n\t" \
: \
: [fetchptr] "r" (baseU + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// MULT_2SPIN_BACKEND
#define MULT_2SPIN_2_A64FXf \
{ \
asm ( \
"fcmla z18.s, p5/m, z27.s, z13.s, 0 \n\t" \
"fcmla z21.s, p5/m, z27.s, z16.s, 0 \n\t" \
"fcmla z19.s, p5/m, z28.s, z13.s, 0 \n\t" \
@ -411,15 +377,14 @@ asm ( \
"fcmla z20.s, p5/m, z26.s, z14.s, 90 \n\t" \
"fcmla z23.s, p5/m, z26.s, z17.s, 90 \n\t" \
: \
: [fetchptr] "r" (baseU + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XP_PROJ
#define XP_PROJ_A64FXf \
{ \
asm ( \
"ptrue p5.s \n\t" \
"fcadd z12.s, p5/m, z12.s, z21.s, 90 \n\t" \
"fcadd z13.s, p5/m, z13.s, z22.s, 90 \n\t" \
"fcadd z14.s, p5/m, z14.s, z23.s, 90 \n\t" \

View File

@ -25,7 +25,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#define LOAD_CHIMU_A64FXd(x) LOAD_CHIMU_INTERLEAVED_A64FXd(x)
#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXd(base)
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A)
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A)
@ -40,23 +40,19 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define MASK_REGS DECLARATIONS_A64FXd
#define COMPLEX_SIGNS(A)
#define LOAD64(A,B)
#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B);
#define MULT_2SPIN_DIR_PF(A,B) \
MULT_2SPIN_A64FXd(A); \
PREFETCH_CHIMU_L2(B); \
if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } }
#define MAYBEPERM(A,perm) { A ; }
#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B)
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir)
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd
#define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
#define ZERO_PSI
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd
#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd
#define TP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TP_PROJ_A64FXd
#define XM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XM_PROJ_A64FXd
#define YM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YM_PROJ_A64FXd
#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZM_PROJ_A64FXd
#define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd
#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
#define XP_PROJ XP_PROJ_A64FXd
#define YP_PROJ YP_PROJ_A64FXd
#define ZP_PROJ ZP_PROJ_A64FXd
#define TP_PROJ TP_PROJ_A64FXd
#define XM_PROJ XM_PROJ_A64FXd
#define YM_PROJ YM_PROJ_A64FXd
#define ZM_PROJ ZM_PROJ_A64FXd
#define TM_PROJ TM_PROJ_A64FXd
#define XP_RECON XP_RECON_A64FXd
#define XM_RECON XM_RECON_A64FXd
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd
@ -67,10 +63,13 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd
#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXd; }
#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXd; }
#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXd; }
#define PERMUTE_DIR3
#define PERMUTE_DIR0 0
#define PERMUTE_DIR1 1
#define PERMUTE_DIR2 2
#define PERMUTE_DIR3 3
#define PERMUTE PERMUTE_A64FXd;
#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; }
#define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } }
// DECLARATIONS
#define DECLARATIONS_A64FXd \
const uint64_t lut[4][8] = { \
@ -254,8 +253,8 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define LOAD_TABLE3 \
table0 = svld1(pg1, (uint64_t*)&lut[3]);
// PERM0
#define PERM0_A64FXd \
// PERMUTE
#define PERMUTE_A64FXd \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
@ -263,27 +262,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
Chi_11 = svtbl(Chi_11, table0); \
Chi_12 = svtbl(Chi_12, table0);
// PERM1
#define PERM1_A64FXd \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
Chi_10 = svtbl(Chi_10, table0); \
Chi_11 = svtbl(Chi_11, table0); \
Chi_12 = svtbl(Chi_12, table0);
// PERM2
#define PERM2_A64FXd \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
Chi_10 = svtbl(Chi_10, table0); \
Chi_11 = svtbl(Chi_11, table0); \
Chi_12 = svtbl(Chi_12, table0);
// PERM3
#define PERM3_A64FXd
// LOAD_GAUGE
#define LOAD_GAUGE \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
@ -296,7 +274,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
}
// MULT_2SPIN
#define MULT_2SPIN_A64FXd(A) \
#define MULT_2SPIN_1_A64FXd(A) \
{ \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \
@ -320,6 +298,10 @@ Author: Nils Meyer <nils.meyer@ur.de>
U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \
U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \
U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \
}
// MULT_2SPIN_BACKEND
#define MULT_2SPIN_2_A64FXd \
{ \
UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \
UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \
UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \

View File

@ -25,7 +25,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#define LOAD_CHIMU_A64FXf(x) LOAD_CHIMU_INTERLEAVED_A64FXf(x)
#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXf(base)
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A)
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A)
@ -40,23 +40,19 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define MASK_REGS DECLARATIONS_A64FXf
#define COMPLEX_SIGNS(A)
#define LOAD64(A,B)
#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B);
#define MULT_2SPIN_DIR_PF(A,B) \
MULT_2SPIN_A64FXf(A); \
PREFETCH_CHIMU_L2(B); \
if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } }
#define MAYBEPERM(A,perm) { A ; }
#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B)
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir)
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf
#define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
#define ZERO_PSI
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf
#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf
#define TP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TP_PROJ_A64FXf
#define XM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XM_PROJ_A64FXf
#define YM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YM_PROJ_A64FXf
#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZM_PROJ_A64FXf
#define TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf
#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
#define XP_PROJ XP_PROJ_A64FXf
#define YP_PROJ YP_PROJ_A64FXf
#define ZP_PROJ ZP_PROJ_A64FXf
#define TP_PROJ TP_PROJ_A64FXf
#define XM_PROJ XM_PROJ_A64FXf
#define YM_PROJ YM_PROJ_A64FXf
#define ZM_PROJ ZM_PROJ_A64FXf
#define TM_PROJ TM_PROJ_A64FXf
#define XP_RECON XP_RECON_A64FXf
#define XM_RECON XM_RECON_A64FXf
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf
@ -67,10 +63,13 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf
#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXf; }
#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXf; }
#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXf; }
#define PERMUTE_DIR3 LOAD_TABLE3; if (perm) { PERM3_A64FXf; }
#define PERMUTE_DIR0 0
#define PERMUTE_DIR1 1
#define PERMUTE_DIR2 2
#define PERMUTE_DIR3 3
#define PERMUTE PERMUTE_A64FXf;
#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1 } else if (Dir == 2) { LOAD_TABLE2; } else if (Dir == 3) { LOAD_TABLE3; }
#define MAYBEPERM(A,perm) if (perm) { PERMUTE; }
// DECLARATIONS
#define DECLARATIONS_A64FXf \
const uint32_t lut[4][16] = { \
@ -254,35 +253,8 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define LOAD_TABLE3 \
table0 = svld1(pg1, (uint32_t*)&lut[3]);
// PERM0
#define PERM0_A64FXf \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
Chi_10 = svtbl(Chi_10, table0); \
Chi_11 = svtbl(Chi_11, table0); \
Chi_12 = svtbl(Chi_12, table0);
// PERM1
#define PERM1_A64FXf \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
Chi_10 = svtbl(Chi_10, table0); \
Chi_11 = svtbl(Chi_11, table0); \
Chi_12 = svtbl(Chi_12, table0);
// PERM2
#define PERM2_A64FXf \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
Chi_10 = svtbl(Chi_10, table0); \
Chi_11 = svtbl(Chi_11, table0); \
Chi_12 = svtbl(Chi_12, table0);
// PERM3
#define PERM3_A64FXf \
// PERMUTE
#define PERMUTE_A64FXf \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
@ -302,7 +274,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
}
// MULT_2SPIN
#define MULT_2SPIN_A64FXf(A) \
#define MULT_2SPIN_1_A64FXf(A) \
{ \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \
@ -326,6 +298,10 @@ Author: Nils Meyer <nils.meyer@ur.de>
U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \
U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \
U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \
}
// MULT_2SPIN_BACKEND
#define MULT_2SPIN_2_A64FXf \
{ \
UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \
UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \
UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \

View File

@ -26,14 +26,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
*************************************************************************************/
/* END LEGAL */
#undef LOAD_CHIMU_A64FXd
#undef LOAD_CHIMU_A64FXf
#undef LOAD_CHIMU
#undef PREFETCH_CHIMU_L1
#undef PREFETCH_GAUGE_L1
#undef PREFETCH_CHIMU_L2
#undef PREFETCH_GAUGE_L2
#undef PREFETCH_GAUGE_L1_INTERNAL
#undef PF_GAUGE
#undef PREFETCH1_CHIMU
#undef PREFETCH_CHIMU
#undef PREFETCH_RESULT_L2_STORE
@ -42,22 +40,20 @@ Author: Nils Meyer <nils.meyer@ur.de>
#undef LOCK_GAUGE
#undef UNLOCK_GAUGE
#undef MASK_REGS
#undef COMPLEX_SIGNS
#undef LOAD64
#undef SAVE_RESULT
#undef ADD_RESULT
#undef MULT_2SPIN_DIR_PF
#undef MULT_2SPIN_1
#undef MULT_2SPIN_2
#undef MAYBEPERM
#undef LOAD_CHI
#undef ZERO_PSI
#undef XP_PROJMEM
#undef YP_PROJMEM
#undef ZP_PROJMEM
#undef TP_PROJMEM
#undef XM_PROJMEM
#undef YM_PROJMEM
#undef ZM_PROJMEM
#undef TM_PROJMEM
#undef XP_PROJ
#undef YP_PROJ
#undef ZP_PROJ
#undef TP_PROJ
#undef XM_PROJ
#undef YM_PROJ
#undef ZM_PROJ
#undef TM_PROJ
#undef XP_RECON
#undef XM_RECON
#undef XM_RECON_ACCUM
@ -68,10 +64,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
#undef YP_RECON_ACCUM
#undef ZP_RECON_ACCUM
#undef TP_RECON_ACCUM
#undef PERMUTE
#undef PERMUTE_DIR0
#undef PERMUTE_DIR1
#undef PERMUTE_DIR2
#undef PERMUTE_DIR3
#undef LOAD_TABLE
#undef LOAD_TABLE0
#undef LOAD_TABLE1
#undef LOAD_TABLE2

View File

@ -115,37 +115,9 @@ STORE_BASE_PTR_COLOR_OFFSET = 2
# 256 bytes * 2.2 GHz = 563.2 GB/s (base 10), 524 GB/s (base 2)
OPT = """
#ifdef INTERIOR
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
basep = st.GetPFInfo(nent,plocal); nent++; \
if ( local ) { \
-- LOAD64(%r10,isigns); \
PROJ(base); \
++ PF_GAUGE(Dir); \
MAYBEPERM(PERMUTE_DIR,perm); \
} else if ( st.same_node[Dir] ) {
LOAD_CHI(base);
++ PF_GAUGE(Dir);
} \
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
if ( local || st.same_node[Dir] ) { \
MULT_2SPIN_DIR_PF(Dir,basep); \
PREFETCH_CHIMU(base); \
-- LOAD64(%r10,isigns); \
RECON; \
} else { PREFETCH_CHIMU(base); }
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
-- PF_GAUGE(Xp); \
PREFETCH1_CHIMU(base); \
{ ZERO_PSI; } \
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
#define RESULT(base,basep) SAVE_RESULT(base,basep);
#endif
* interleave prefetching and compute in MULT_2SPIN
* could test storing U's in MULT_2SPIN to L1d, might be beneficial for life time cache lines
* structure reordering: MAYBEPERM after MULT_2SPIN ?
"""
filename = 'XXX'
@ -905,7 +877,8 @@ if ALTERNATIVE_LOADS == True:
define(F'LOAD_CHIMU_0312_PLUG LOAD_CHIMU_0312_{PRECSUFFIX}')
define(F'LOAD_CHIMU(x)')
else:
define(F'LOAD_CHIMU_{PRECSUFFIX}(x) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(x)')
#define(F'LOAD_CHIMU_{PRECSUFFIX}(x) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(x)')
define(F'LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(base)')
if PREFETCH:
define(F'PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_{PRECSUFFIX}(A)')
@ -935,39 +908,22 @@ define(F'UNLOCK_GAUGE(A)')
define(F'MASK_REGS DECLARATIONS_{PRECSUFFIX}')
define(F'COMPLEX_SIGNS(A)')
define(F'LOAD64(A,B)')
# prefetch chimu here is useless, because already done in last leg
#define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A);')
define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_RESULT_L2_STORE(B);')
if PREFETCH:
definemultiline(F'MULT_2SPIN_DIR_PF(A,B) ')
write (F' MULT_2SPIN_{PRECSUFFIX}(A); \\')
write (F' PREFETCH_CHIMU_L2(B); \\')
write (F' if (s == 0) {{ if ((A == 0) || (A == 4)) {{ PREFETCH_GAUGE_L2(A); }} }}')
# definemultiline(F'MULT_2SPIN_DIR_PF(A,B) PREFETCH_GAUGE_L1(A);')
# write (F' PREFETCH_CHIMU_L2(B); \\')
# write (F' MULT_2SPIN_{PRECSUFFIX}(A); \\')
# write (F' if (s == 0) {{ if ((A == 0) || (A == 4)) {{ PREFETCH_GAUGE_L2(A); }} }}')
newline()
else:
define(F'MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_{PRECSUFFIX}(A)')
# break out maybeperm in permutes
#define(F'MAYBEPERM(A,perm) if (perm) {{ A ; }}')
define(F'MAYBEPERM(A,perm) {{ A ; }}')
define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_RESULT_L2_STORE(B)')
define(F'MULT_2SPIN_1(Dir) MULT_2SPIN_1_{PRECSUFFIX}(Dir)')
define(F'MULT_2SPIN_2 MULT_2SPIN_2_{PRECSUFFIX}')
define(F'LOAD_CHI(base) LOAD_CHI_{PRECSUFFIX}(base)')
# don't need zero psi, everything is done in recons
#define(F'ZERO_PSI ZERO_PSI_{PRECSUFFIX}')
define(F'ZERO_PSI')
define(F'ADD_RESULT(base,basep) LOAD_CHIMU_{PRECSUFFIX}(base); ADD_RESULT_INTERNAL_{PRECSUFFIX}; RESULT_{PRECSUFFIX}(base)')
define(F'ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_{PRECSUFFIX}; RESULT_{PRECSUFFIX}(base)')
# loads projections
define(F'XP_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); XP_PROJ_{PRECSUFFIX}')
define(F'YP_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); YP_PROJ_{PRECSUFFIX}')
define(F'ZP_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); ZP_PROJ_{PRECSUFFIX}')
define(F'TP_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); TP_PROJ_{PRECSUFFIX}')
define(F'XM_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); XM_PROJ_{PRECSUFFIX}')
define(F'YM_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); YM_PROJ_{PRECSUFFIX}')
define(F'ZM_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); ZM_PROJ_{PRECSUFFIX}')
define(F'TM_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); TM_PROJ_{PRECSUFFIX}')
define(F'XP_PROJ XP_PROJ_{PRECSUFFIX}')
define(F'YP_PROJ YP_PROJ_{PRECSUFFIX}')
define(F'ZP_PROJ ZP_PROJ_{PRECSUFFIX}')
define(F'TP_PROJ TP_PROJ_{PRECSUFFIX}')
define(F'XM_PROJ XM_PROJ_{PRECSUFFIX}')
define(F'YM_PROJ YM_PROJ_{PRECSUFFIX}')
define(F'ZM_PROJ ZM_PROJ_{PRECSUFFIX}')
define(F'TM_PROJ TM_PROJ_{PRECSUFFIX}')
# recons
define(F'XP_RECON XP_RECON_{PRECSUFFIX}')
define(F'XM_RECON XM_RECON_{PRECSUFFIX}')
@ -979,14 +935,21 @@ define(F'XP_RECON_ACCUM XP_RECON_ACCUM_{PRECSUFFIX}')
define(F'YP_RECON_ACCUM YP_RECON_ACCUM_{PRECSUFFIX}')
define(F'ZP_RECON_ACCUM ZP_RECON_ACCUM_{PRECSUFFIX}')
define(F'TP_RECON_ACCUM TP_RECON_ACCUM_{PRECSUFFIX}')
# permutes
define(F'PERMUTE_DIR0 LOAD_TABLE0; if (perm) {{ PERM0_{PRECSUFFIX}; }}')
define(F'PERMUTE_DIR1 LOAD_TABLE1; if (perm) {{ PERM1_{PRECSUFFIX}; }}')
define(F'PERMUTE_DIR2 LOAD_TABLE2; if (perm) {{ PERM2_{PRECSUFFIX}; }}')
# new permutes
define(F'PERMUTE_DIR0 0')
define(F'PERMUTE_DIR1 1')
define(F'PERMUTE_DIR2 2')
define(F'PERMUTE_DIR3 3')
define(F'PERMUTE PERMUTE_{PRECSUFFIX};')
# load table
#define(F'MAYBEPERM(A,perm) if (perm) {{ A ; }}')
if PRECISION == 'double':
define(F'PERMUTE_DIR3')
define(F'LOAD_TABLE(Dir) if (Dir == 0) {{ LOAD_TABLE0; }} else if (Dir == 1) {{ LOAD_TABLE1; }} else if (Dir == 2) {{ LOAD_TABLE2; }}')
define(F'MAYBEPERM(Dir,perm) if (Dir != 3) {{ if (perm) {{ PERMUTE; }} }}')
else:
define(F'PERMUTE_DIR3 LOAD_TABLE3; if (perm) {{ PERM3_{PRECSUFFIX}; }}')
define(F'LOAD_TABLE(Dir) if (Dir == 0) {{ LOAD_TABLE0; }} else if (Dir == 1) {{ LOAD_TABLE1 }} else if (Dir == 2) {{ LOAD_TABLE2; }} else if (Dir == 3) {{ LOAD_TABLE3; }}')
define(F'MAYBEPERM(A,perm) if (perm) {{ PERMUTE; }}')
write('// DECLARATIONS')
@ -1040,20 +1003,14 @@ U_01.declare()
U_11.declare()
U_21.declare() # 6 -> 30 regs
# all true
# all predications true
pg1.declare()
if PRECISION == 'double':
pg1.movestr('svptrue_b64()')
else:
pg1.movestr('svptrue_b32()')
# even elements only
#pg2.declare()
#pg2.movestr('svzip1_b64(svptrue_b64(), svpfalse_b())')
# preload tables
# 0: swap
# 1: permute 1
# tables
if PRECISION == 'double':
write(' svuint64_t table0; \\', target='I') # -> 31 regs
else:
@ -1061,10 +1018,10 @@ else:
zero0.declare()
# zero register
asmopen()
zero0.zero(zeroreg=True)
asmclose()
newline()
define('Chimu_00 Chi_00', target='I')
@ -1087,7 +1044,6 @@ else: # wilson4.h
define('Chimu_30 U_01', target='I')
define('Chimu_31 U_11', target='I')
define('Chimu_32 U_21', target='I')
newline()
@ -1380,47 +1336,11 @@ table0.loadtable(3)
asmclose()
newline()
# 8 directions = 6x permutations
d['factor'] = 2 # factor is 0
d['cycles_PERM'] += 6 * d['factor']
write('// PERM0')
definemultiline(F'PERM0_{PRECSUFFIX}')
debugall('PERM0 PRE', group='Chi')
asmopen()
#table0.loadtable(0)
Chi_00.permute(0, table0)
Chi_01.permute(0, table0)
Chi_02.permute(0, table0)
Chi_10.permute(0, table0)
Chi_11.permute(0, table0)
Chi_12.permute(0, table0)
asmclose()
debugall('PERM0 POST', group='Chi')
newline()
d['factor'] = 2 # factor is 2
d['cycles_PERM'] += 6 * d['factor']
write('// PERM1')
definemultiline(F'PERM1_{PRECSUFFIX}')
debugall('PERM1 PRE', group='Chi')
asmopen()
#table0.loadtable(1)
Chi_00.permute(1, table0)
Chi_01.permute(1, table0)
Chi_02.permute(1, table0)
Chi_10.permute(1, table0)
Chi_11.permute(1, table0)
Chi_12.permute(1, table0)
asmclose()
debugall('PERM1 POST', group='Chi')
newline()
d['factor'] = 2 # factor is 2
# PERM2 = swap real and imaginary
d['cycles_PERM'] += 6 * d['factor']
write('// PERM2')
definemultiline(F'PERM2_{PRECSUFFIX}')
debugall('PERM2 PRE', group='Chi')
write('// PERMUTE')
definemultiline(F'PERMUTE_{PRECSUFFIX}')
debugall('PERM PRE', group='Chi')
asmopen()
#table0.loadtable(2)
Chi_00.permute(2, table0)
@ -1430,26 +1350,7 @@ Chi_10.permute(2, table0)
Chi_11.permute(2, table0)
Chi_12.permute(2, table0)
asmclose()
debugall('PERM2 POST', group='Chi')
newline()
# PERM3 = identity (DP), so exclude from counting
d['factor'] = 0
d['cycles_PERM'] += 6 * d['factor']
write('// PERM3')
definemultiline(F'PERM3_{PRECSUFFIX}')
if PRECISION == 'single':
debugall('PERM3 PRE', group='Chi')
asmopen()
#table0.loadtable(3)
Chi_00.permute(3, table0)
Chi_01.permute(3, table0)
Chi_02.permute(3, table0)
Chi_10.permute(3, table0)
Chi_11.permute(3, table0)
Chi_12.permute(3, table0)
asmclose()
debugall('PERM3 POST', group='Chi')
debugall('PERM POST', group='Chi')
newline()
write('// LOAD_GAUGE')
@ -1473,7 +1374,7 @@ if ASM_LOAD_GAUGE:
asmclose()
curlyclose()
newline()
# XXXXXX remove loads
d['factor'] = 8 # MULT_2SPIN executes 1 time per direction = 8 times total
# assume all U loads are hidden
# FCMLA issue latency = 2 cycles
@ -1482,7 +1383,7 @@ d['factor'] = 8 # MULT_2SPIN executes 1 time per direction = 8 times total
# 6 rounds of FCMLA, each with 6 FCMLA -> 21 - 6*2 = 9
d['cycles_MULT_2SPIN'] += 6 * 21 * d['factor']
write('// MULT_2SPIN')
definemultiline(F'MULT_2SPIN_{PRECSUFFIX}(A)')
definemultiline(F'MULT_2SPIN_1_{PRECSUFFIX}(A)')
curlyopen()
#write(' const auto & ref(U[sU][A]); \\')
if GRIDBENCH: # referencing differs in Grid and GridBench
@ -1541,7 +1442,15 @@ if ASM_LOAD_GAUGE:
U_00.load("ref[0][2]") # U_00, U_10, U_20 overloaded
U_10.load("ref[1][2]") # early load
U_20.load("ref[2][2]") # A -->
asmclose()
debugall('MULT_2SPIN_1', group='UChi')
curlyclose()
newline()
write('// MULT_2SPIN_BACKEND')
definemultiline(F'MULT_2SPIN_2_{PRECSUFFIX}')
curlyopen()
asmopen()
# round 3
UChi_00.mac0(U_01, Chi_01) # armclang separates fcmla(..., 0) and
UChi_10.mac0(U_01, Chi_11) # fcmla(..., 90)
@ -1571,7 +1480,7 @@ UChi_11.mac1(U_10, Chi_12)
UChi_02.mac1(U_20, Chi_02)
UChi_12.mac1(U_20, Chi_12)
asmclose()
debugall('MULT_2SPIN', group='UChi')
debugall('MULT_2SPIN_2', group='UChi')
curlyclose()
newline()
@ -1587,7 +1496,7 @@ if ALTERNATIVE_LOADS == True:
write(' LOAD_CHIMU_0312_PLUG \\')
curlyopen()
asmopen()
pg1.loadpredication()
#pg1.loadpredication()
Chi_00.addTimesI(Chimu_00, Chimu_30)
Chi_01.addTimesI(Chimu_01, Chimu_31)
Chi_02.addTimesI(Chimu_02, Chimu_32)