1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-09 21:50:45 +01:00

revised BodyA64FX; 990 GiB/s Wilson, 687 GiB/s DW using intrinsics (armclang 20.0)

This commit is contained in:
nils meyer 2020-04-16 22:43:32 +02:00
parent 852db4626a
commit 6fdce60492
7 changed files with 279 additions and 451 deletions

View File

@ -26,14 +26,14 @@ Author: Nils Meyer <nils.meyer@ur.de>
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifdef KERNEL_DAG #ifdef KERNEL_DAG
#define DIR0_PROJMEM(base) XP_PROJMEM(base); #define DIR0_PROJ XP_PROJ
#define DIR1_PROJMEM(base) YP_PROJMEM(base); #define DIR1_PROJ YP_PROJ
#define DIR2_PROJMEM(base) ZP_PROJMEM(base); #define DIR2_PROJ ZP_PROJ
#define DIR3_PROJMEM(base) TP_PROJMEM(base); #define DIR3_PROJ TP_PROJ
#define DIR4_PROJMEM(base) XM_PROJMEM(base); #define DIR4_PROJ XM_PROJ
#define DIR5_PROJMEM(base) YM_PROJMEM(base); #define DIR5_PROJ YM_PROJ
#define DIR6_PROJMEM(base) ZM_PROJMEM(base); #define DIR6_PROJ ZM_PROJ
#define DIR7_PROJMEM(base) TM_PROJMEM(base); #define DIR7_PROJ TM_PROJ
#define DIR0_RECON XP_RECON #define DIR0_RECON XP_RECON
#define DIR1_RECON YP_RECON_ACCUM #define DIR1_RECON YP_RECON_ACCUM
#define DIR2_RECON ZP_RECON_ACCUM #define DIR2_RECON ZP_RECON_ACCUM
@ -43,14 +43,14 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define DIR6_RECON ZM_RECON_ACCUM #define DIR6_RECON ZM_RECON_ACCUM
#define DIR7_RECON TM_RECON_ACCUM #define DIR7_RECON TM_RECON_ACCUM
#else #else
#define DIR0_PROJMEM(base) XM_PROJMEM(base); #define DIR0_PROJ XM_PROJ
#define DIR1_PROJMEM(base) YM_PROJMEM(base); #define DIR1_PROJ YM_PROJ
#define DIR2_PROJMEM(base) ZM_PROJMEM(base); #define DIR2_PROJ ZM_PROJ
#define DIR3_PROJMEM(base) TM_PROJMEM(base); #define DIR3_PROJ TM_PROJ
#define DIR4_PROJMEM(base) XP_PROJMEM(base); #define DIR4_PROJ XP_PROJ
#define DIR5_PROJMEM(base) YP_PROJMEM(base); #define DIR5_PROJ YP_PROJ
#define DIR6_PROJMEM(base) ZP_PROJMEM(base); #define DIR6_PROJ ZP_PROJ
#define DIR7_PROJMEM(base) TP_PROJMEM(base); #define DIR7_PROJ TP_PROJ
#define DIR0_RECON XM_RECON #define DIR0_RECON XM_RECON
#define DIR1_RECON YM_RECON_ACCUM #define DIR1_RECON YM_RECON_ACCUM
#define DIR2_RECON ZM_RECON_ACCUM #define DIR2_RECON ZM_RECON_ACCUM
@ -91,23 +91,28 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
basep = st.GetPFInfo(nent,plocal); nent++; \ basep = st.GetPFInfo(nent,plocal); nent++; \
if ( local ) { \ if ( local ) { \
/* PREFETCH_GAUGE_L1(Dir); slightly worse performance */ \ LOAD_CHIMU(base); \
PROJ(base); \ LOAD_TABLE(PERMUTE_DIR); \
/* PREFETCH_GAUGE_L1(Dir); slightly worse performance */ \ PROJ; \
MAYBEPERM(PERMUTE_DIR,perm); \ MAYBEPERM(PERMUTE_DIR,perm); \
} else { \ } else { \
LOAD_CHI(base); \ LOAD_CHI(base); \
} \ } \
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
PREFETCH_CHIMU(base); \ MULT_2SPIN_1(Dir); \
MULT_2SPIN_DIR_PF(Dir,basep); \ PREFETCH_CHIMU(base); \
RECON; \ PREFETCH_CHIMU_L2(basep); \
/* PREFETCH_GAUGE_L1(NxtDir); */ \
MULT_2SPIN_2; \
if (s == 0) { \
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
} \
RECON; \
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
PF_GAUGE(Xp); \ PREFETCH1_CHIMU(base); \
PREFETCH1_CHIMU(base); \
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
#define RESULT(base,basep) SAVE_RESULT(base,basep); #define RESULT(base,basep) SAVE_RESULT(base,basep);
@ -121,22 +126,28 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
basep = st.GetPFInfo(nent,plocal); nent++; \ basep = st.GetPFInfo(nent,plocal); nent++; \
if ( local ) { \ if ( local ) { \
PROJ(base); \ LOAD_CHIMU(base); \
MAYBEPERM(PERMUTE_DIR,perm); \ LOAD_TABLE(PERMUTE_DIR); \
}else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \ PROJ; \
if ( local || st.same_node[Dir] ) { \ MAYBEPERM(PERMUTE_DIR,perm); \
MULT_2SPIN_DIR_PF(Dir,basep); \ }else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \
RECON; \
} \
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
PREFETCH_CHIMU(base); \ if ( local || st.same_node[Dir] ) { \
MULT_2SPIN_1(Dir); \
PREFETCH_CHIMU(base); \
/* PREFETCH_GAUGE_L1(NxtDir); */ \
MULT_2SPIN_2; \
if (s == 0) { \
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
} \
RECON; \
PREFETCH_CHIMU_L2(basep); \
} else { PREFETCH_CHIMU(base); } \
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
PF_GAUGE(Xp); \
PREFETCH1_CHIMU(base); \ PREFETCH1_CHIMU(base); \
{ ZERO_PSI; } \
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
#define RESULT(base,basep) SAVE_RESULT(base,basep); #define RESULT(base,basep) SAVE_RESULT(base,basep);
@ -149,23 +160,34 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
if((!local)&&(!st.same_node[Dir]) ) { \ if((!local)&&(!st.same_node[Dir]) ) { \
LOAD_CHI(base); \ LOAD_CHI(base); \
MULT_2SPIN_DIR_PF(Dir,base); \ MULT_2SPIN_1(Dir); \
RECON; \ PREFETCH_CHIMU(base); \
nmu++; \ /* PREFETCH_GAUGE_L1(NxtDir); */ \
MULT_2SPIN_2; \
if (s == 0) { \
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
} \
RECON; \
nmu++; \
} }
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
nmu=0; \ nmu=0; \
{ ZERO_PSI;} \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;\
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ if((!local)&&(!st.same_node[Dir]) ) { \
if((!local)&&(!st.same_node[Dir]) ) { \ LOAD_CHI(base); \
LOAD_CHI(base); \ MULT_2SPIN_1(Dir); \
MULT_2SPIN_DIR_PF(Dir,base); \ PREFETCH_CHIMU(base); \
RECON; \ /* PREFETCH_GAUGE_L1(NxtDir); */ \
nmu++; \ MULT_2SPIN_2; \
if (s == 0) { \
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
} \
RECON; \
nmu++; \
} }
#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} #define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);}
@ -201,7 +223,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
uint64_t delta_base, delta_base_p; uint64_t delta_base, delta_base_p;
ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJMEM,DIR0_RECON); ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJ,DIR0_RECON);
#ifdef SHOW #ifdef SHOW
float rescale = 64. * 12.; float rescale = 64. * 12.;
@ -221,7 +243,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
std::cout << "----------------------------------------------------" << std::endl; std::cout << "----------------------------------------------------" << std::endl;
#endif #endif
ASM_LEG(Yp,Zp,PERMUTE_DIR2,DIR1_PROJMEM,DIR1_RECON); ASM_LEG(Yp,Zp,PERMUTE_DIR2,DIR1_PROJ,DIR1_RECON);
#ifdef SHOW #ifdef SHOW
std::cout << "Dir = " << Yp << " " << WHERE<< std::endl; std::cout << "Dir = " << Yp << " " << WHERE<< std::endl;
@ -234,7 +256,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
std::cout << "----------------------------------------------------" << std::endl; std::cout << "----------------------------------------------------" << std::endl;
#endif #endif
ASM_LEG(Zp,Tp,PERMUTE_DIR1,DIR2_PROJMEM,DIR2_RECON); ASM_LEG(Zp,Tp,PERMUTE_DIR1,DIR2_PROJ,DIR2_RECON);
#ifdef SHOW #ifdef SHOW
std::cout << "Dir = " << Zp << " " << WHERE<< std::endl; std::cout << "Dir = " << Zp << " " << WHERE<< std::endl;
@ -247,7 +269,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
std::cout << "----------------------------------------------------" << std::endl; std::cout << "----------------------------------------------------" << std::endl;
#endif #endif
ASM_LEG(Tp,Xm,PERMUTE_DIR0,DIR3_PROJMEM,DIR3_RECON); ASM_LEG(Tp,Xm,PERMUTE_DIR0,DIR3_PROJ,DIR3_RECON);
#ifdef SHOW #ifdef SHOW
std::cout << "Dir = " << Tp << " " << WHERE<< std::endl; std::cout << "Dir = " << Tp << " " << WHERE<< std::endl;
@ -260,7 +282,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
std::cout << "----------------------------------------------------" << std::endl; std::cout << "----------------------------------------------------" << std::endl;
#endif #endif
ASM_LEG(Xm,Ym,PERMUTE_DIR3,DIR4_PROJMEM,DIR4_RECON); ASM_LEG(Xm,Ym,PERMUTE_DIR3,DIR4_PROJ,DIR4_RECON);
#ifdef SHOW #ifdef SHOW
std::cout << "Dir = " << Xm << " " << WHERE<< std::endl; std::cout << "Dir = " << Xm << " " << WHERE<< std::endl;
@ -273,7 +295,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
std::cout << "----------------------------------------------------" << std::endl; std::cout << "----------------------------------------------------" << std::endl;
#endif #endif
ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJMEM,DIR5_RECON); ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON);
#ifdef SHOW #ifdef SHOW
std::cout << "Dir = " << Ym << " " << WHERE<< std::endl; std::cout << "Dir = " << Ym << " " << WHERE<< std::endl;
@ -286,7 +308,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
std::cout << "----------------------------------------------------" << std::endl; std::cout << "----------------------------------------------------" << std::endl;
#endif #endif
ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJMEM,DIR6_RECON); ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON);
#ifdef SHOW #ifdef SHOW
std::cout << "Dir = " << Zm << " " << WHERE<< std::endl; std::cout << "Dir = " << Zm << " " << WHERE<< std::endl;
@ -299,7 +321,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
std::cout << "----------------------------------------------------" << std::endl; std::cout << "----------------------------------------------------" << std::endl;
#endif #endif
ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJMEM,DIR7_RECON); ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON);
#ifdef SHOW #ifdef SHOW
std::cout << "Dir = " << Tm << " " << WHERE<< std::endl; std::cout << "Dir = " << Tm << " " << WHERE<< std::endl;
@ -337,14 +359,14 @@ Author: Nils Meyer <nils.meyer@ur.de>
} }
} }
#undef DIR0_PROJMEM #undef DIR0_PROJ
#undef DIR1_PROJMEM #undef DIR1_PROJ
#undef DIR2_PROJMEM #undef DIR2_PROJ
#undef DIR3_PROJMEM #undef DIR3_PROJ
#undef DIR4_PROJMEM #undef DIR4_PROJ
#undef DIR5_PROJMEM #undef DIR5_PROJ
#undef DIR6_PROJMEM #undef DIR6_PROJ
#undef DIR7_PROJMEM #undef DIR7_PROJ
#undef DIR0_RECON #undef DIR0_RECON
#undef DIR1_RECON #undef DIR1_RECON
#undef DIR2_RECON #undef DIR2_RECON

View File

@ -25,7 +25,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#define LOAD_CHIMU_A64FXd(x) LOAD_CHIMU_INTERLEAVED_A64FXd(x) #define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXd(base)
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A) #define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A)
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) #define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A) #define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A)
@ -40,23 +40,19 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define MASK_REGS DECLARATIONS_A64FXd #define MASK_REGS DECLARATIONS_A64FXd
#define COMPLEX_SIGNS(A) #define COMPLEX_SIGNS(A)
#define LOAD64(A,B) #define LOAD64(A,B)
#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B); #define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B)
#define MULT_2SPIN_DIR_PF(A,B) \ #define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir)
MULT_2SPIN_A64FXd(A); \ #define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd
PREFETCH_CHIMU_L2(B); \
if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } }
#define MAYBEPERM(A,perm) { A ; }
#define LOAD_CHI(base) LOAD_CHI_A64FXd(base) #define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
#define ZERO_PSI #define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) #define XP_PROJ XP_PROJ_A64FXd
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd #define YP_PROJ YP_PROJ_A64FXd
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd #define ZP_PROJ ZP_PROJ_A64FXd
#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd #define TP_PROJ TP_PROJ_A64FXd
#define TP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TP_PROJ_A64FXd #define XM_PROJ XM_PROJ_A64FXd
#define XM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XM_PROJ_A64FXd #define YM_PROJ YM_PROJ_A64FXd
#define YM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YM_PROJ_A64FXd #define ZM_PROJ ZM_PROJ_A64FXd
#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZM_PROJ_A64FXd #define TM_PROJ TM_PROJ_A64FXd
#define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd
#define XP_RECON XP_RECON_A64FXd #define XP_RECON XP_RECON_A64FXd
#define XM_RECON XM_RECON_A64FXd #define XM_RECON XM_RECON_A64FXd
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd #define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd
@ -67,10 +63,13 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd #define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd #define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd #define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd
#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXd; } #define PERMUTE_DIR0 0
#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXd; } #define PERMUTE_DIR1 1
#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXd; } #define PERMUTE_DIR2 2
#define PERMUTE_DIR3 #define PERMUTE_DIR3 3
#define PERMUTE PERMUTE_A64FXd;
#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; }
#define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } }
// DECLARATIONS // DECLARATIONS
#define DECLARATIONS_A64FXd \ #define DECLARATIONS_A64FXd \
const uint64_t lut[4][8] = { \ const uint64_t lut[4][8] = { \
@ -281,8 +280,8 @@ asm ( \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); );
// PERM0 // PERMUTE
#define PERM0_A64FXd \ #define PERMUTE_A64FXd \
asm ( \ asm ( \
"tbl z12.d, { z12.d }, z30.d \n\t" \ "tbl z12.d, { z12.d }, z30.d \n\t" \
"tbl z13.d, { z13.d }, z30.d \n\t" \ "tbl z13.d, { z13.d }, z30.d \n\t" \
@ -295,37 +294,6 @@ asm ( \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); );
// PERM1
#define PERM1_A64FXd \
asm ( \
"tbl z12.d, { z12.d }, z30.d \n\t" \
"tbl z13.d, { z13.d }, z30.d \n\t" \
"tbl z14.d, { z14.d }, z30.d \n\t" \
"tbl z15.d, { z15.d }, z30.d \n\t" \
"tbl z16.d, { z16.d }, z30.d \n\t" \
"tbl z17.d, { z17.d }, z30.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM2
#define PERM2_A64FXd \
asm ( \
"tbl z12.d, { z12.d }, z30.d \n\t" \
"tbl z13.d, { z13.d }, z30.d \n\t" \
"tbl z14.d, { z14.d }, z30.d \n\t" \
"tbl z15.d, { z15.d }, z30.d \n\t" \
"tbl z16.d, { z16.d }, z30.d \n\t" \
"tbl z17.d, { z17.d }, z30.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM3
#define PERM3_A64FXd
// LOAD_GAUGE // LOAD_GAUGE
#define LOAD_GAUGE \ #define LOAD_GAUGE \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
@ -344,7 +312,7 @@ asm ( \
); \ ); \
} }
// MULT_2SPIN // MULT_2SPIN
#define MULT_2SPIN_A64FXd(A) \ #define MULT_2SPIN_1_A64FXd(A) \
{ \ { \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
asm ( \ asm ( \
@ -375,6 +343,15 @@ asm ( \
"ldr z24, [%[fetchptr], -4, mul vl] \n\t" \ "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \
"ldr z25, [%[fetchptr], -1, mul vl] \n\t" \ "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \
"ldr z26, [%[fetchptr], 2, mul vl] \n\t" \ "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \
: \
: [fetchptr] "r" (baseU + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// MULT_2SPIN_BACKEND
#define MULT_2SPIN_2_A64FXd \
{ \
asm ( \
"fcmla z18.d, p5/m, z27.d, z13.d, 0 \n\t" \ "fcmla z18.d, p5/m, z27.d, z13.d, 0 \n\t" \
"fcmla z21.d, p5/m, z27.d, z16.d, 0 \n\t" \ "fcmla z21.d, p5/m, z27.d, z16.d, 0 \n\t" \
"fcmla z19.d, p5/m, z28.d, z13.d, 0 \n\t" \ "fcmla z19.d, p5/m, z28.d, z13.d, 0 \n\t" \
@ -400,15 +377,14 @@ asm ( \
"fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \ "fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \
"fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \ "fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \
: \ : \
: [fetchptr] "r" (baseU + 2 * 3 * 64) \ : \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \ ); \
} }
// XP_PROJ // XP_PROJ
#define XP_PROJ_A64FXd \ #define XP_PROJ_A64FXd \
{ \ { \
asm ( \ asm ( \
"ptrue p5.d \n\t" \
"fcadd z12.d, p5/m, z12.d, z21.d, 90 \n\t" \ "fcadd z12.d, p5/m, z12.d, z21.d, 90 \n\t" \
"fcadd z13.d, p5/m, z13.d, z22.d, 90 \n\t" \ "fcadd z13.d, p5/m, z13.d, z22.d, 90 \n\t" \
"fcadd z14.d, p5/m, z14.d, z23.d, 90 \n\t" \ "fcadd z14.d, p5/m, z14.d, z23.d, 90 \n\t" \

View File

@ -25,7 +25,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#define LOAD_CHIMU_A64FXf(x) LOAD_CHIMU_INTERLEAVED_A64FXf(x) #define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXf(base)
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A) #define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A)
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) #define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A) #define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A)
@ -40,23 +40,19 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define MASK_REGS DECLARATIONS_A64FXf #define MASK_REGS DECLARATIONS_A64FXf
#define COMPLEX_SIGNS(A) #define COMPLEX_SIGNS(A)
#define LOAD64(A,B) #define LOAD64(A,B)
#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B); #define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B)
#define MULT_2SPIN_DIR_PF(A,B) \ #define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir)
MULT_2SPIN_A64FXf(A); \ #define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf
PREFETCH_CHIMU_L2(B); \
if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } }
#define MAYBEPERM(A,perm) { A ; }
#define LOAD_CHI(base) LOAD_CHI_A64FXf(base) #define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
#define ZERO_PSI #define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) #define XP_PROJ XP_PROJ_A64FXf
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf #define YP_PROJ YP_PROJ_A64FXf
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf #define ZP_PROJ ZP_PROJ_A64FXf
#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf #define TP_PROJ TP_PROJ_A64FXf
#define TP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TP_PROJ_A64FXf #define XM_PROJ XM_PROJ_A64FXf
#define XM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XM_PROJ_A64FXf #define YM_PROJ YM_PROJ_A64FXf
#define YM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YM_PROJ_A64FXf #define ZM_PROJ ZM_PROJ_A64FXf
#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZM_PROJ_A64FXf #define TM_PROJ TM_PROJ_A64FXf
#define TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf
#define XP_RECON XP_RECON_A64FXf #define XP_RECON XP_RECON_A64FXf
#define XM_RECON XM_RECON_A64FXf #define XM_RECON XM_RECON_A64FXf
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf #define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf
@ -67,10 +63,13 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf #define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf #define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf #define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf
#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXf; } #define PERMUTE_DIR0 0
#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXf; } #define PERMUTE_DIR1 1
#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXf; } #define PERMUTE_DIR2 2
#define PERMUTE_DIR3 LOAD_TABLE3; if (perm) { PERM3_A64FXf; } #define PERMUTE_DIR3 3
#define PERMUTE PERMUTE_A64FXf;
#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1 } else if (Dir == 2) { LOAD_TABLE2; } else if (Dir == 3) { LOAD_TABLE3; }
#define MAYBEPERM(A,perm) if (perm) { PERMUTE; }
// DECLARATIONS // DECLARATIONS
#define DECLARATIONS_A64FXf \ #define DECLARATIONS_A64FXf \
const uint32_t lut[4][16] = { \ const uint32_t lut[4][16] = { \
@ -281,50 +280,8 @@ asm ( \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); );
// PERM0 // PERMUTE
#define PERM0_A64FXf \ #define PERMUTE_A64FXf \
asm ( \
"tbl z12.s, { z12.s }, z30.s \n\t" \
"tbl z13.s, { z13.s }, z30.s \n\t" \
"tbl z14.s, { z14.s }, z30.s \n\t" \
"tbl z15.s, { z15.s }, z30.s \n\t" \
"tbl z16.s, { z16.s }, z30.s \n\t" \
"tbl z17.s, { z17.s }, z30.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM1
#define PERM1_A64FXf \
asm ( \
"tbl z12.s, { z12.s }, z30.s \n\t" \
"tbl z13.s, { z13.s }, z30.s \n\t" \
"tbl z14.s, { z14.s }, z30.s \n\t" \
"tbl z15.s, { z15.s }, z30.s \n\t" \
"tbl z16.s, { z16.s }, z30.s \n\t" \
"tbl z17.s, { z17.s }, z30.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM2
#define PERM2_A64FXf \
asm ( \
"tbl z12.s, { z12.s }, z30.s \n\t" \
"tbl z13.s, { z13.s }, z30.s \n\t" \
"tbl z14.s, { z14.s }, z30.s \n\t" \
"tbl z15.s, { z15.s }, z30.s \n\t" \
"tbl z16.s, { z16.s }, z30.s \n\t" \
"tbl z17.s, { z17.s }, z30.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM3
#define PERM3_A64FXf \
asm ( \ asm ( \
"tbl z12.s, { z12.s }, z30.s \n\t" \ "tbl z12.s, { z12.s }, z30.s \n\t" \
"tbl z13.s, { z13.s }, z30.s \n\t" \ "tbl z13.s, { z13.s }, z30.s \n\t" \
@ -355,7 +312,7 @@ asm ( \
); \ ); \
} }
// MULT_2SPIN // MULT_2SPIN
#define MULT_2SPIN_A64FXf(A) \ #define MULT_2SPIN_1_A64FXf(A) \
{ \ { \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
asm ( \ asm ( \
@ -386,6 +343,15 @@ asm ( \
"ldr z24, [%[fetchptr], -4, mul vl] \n\t" \ "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \
"ldr z25, [%[fetchptr], -1, mul vl] \n\t" \ "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \
"ldr z26, [%[fetchptr], 2, mul vl] \n\t" \ "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \
: \
: [fetchptr] "r" (baseU + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// MULT_2SPIN_BACKEND
#define MULT_2SPIN_2_A64FXf \
{ \
asm ( \
"fcmla z18.s, p5/m, z27.s, z13.s, 0 \n\t" \ "fcmla z18.s, p5/m, z27.s, z13.s, 0 \n\t" \
"fcmla z21.s, p5/m, z27.s, z16.s, 0 \n\t" \ "fcmla z21.s, p5/m, z27.s, z16.s, 0 \n\t" \
"fcmla z19.s, p5/m, z28.s, z13.s, 0 \n\t" \ "fcmla z19.s, p5/m, z28.s, z13.s, 0 \n\t" \
@ -411,15 +377,14 @@ asm ( \
"fcmla z20.s, p5/m, z26.s, z14.s, 90 \n\t" \ "fcmla z20.s, p5/m, z26.s, z14.s, 90 \n\t" \
"fcmla z23.s, p5/m, z26.s, z17.s, 90 \n\t" \ "fcmla z23.s, p5/m, z26.s, z17.s, 90 \n\t" \
: \ : \
: [fetchptr] "r" (baseU + 2 * 3 * 64) \ : \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \ ); \
} }
// XP_PROJ // XP_PROJ
#define XP_PROJ_A64FXf \ #define XP_PROJ_A64FXf \
{ \ { \
asm ( \ asm ( \
"ptrue p5.s \n\t" \
"fcadd z12.s, p5/m, z12.s, z21.s, 90 \n\t" \ "fcadd z12.s, p5/m, z12.s, z21.s, 90 \n\t" \
"fcadd z13.s, p5/m, z13.s, z22.s, 90 \n\t" \ "fcadd z13.s, p5/m, z13.s, z22.s, 90 \n\t" \
"fcadd z14.s, p5/m, z14.s, z23.s, 90 \n\t" \ "fcadd z14.s, p5/m, z14.s, z23.s, 90 \n\t" \

View File

@ -25,7 +25,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#define LOAD_CHIMU_A64FXd(x) LOAD_CHIMU_INTERLEAVED_A64FXd(x) #define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXd(base)
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A) #define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A)
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) #define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A) #define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A)
@ -40,23 +40,19 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define MASK_REGS DECLARATIONS_A64FXd #define MASK_REGS DECLARATIONS_A64FXd
#define COMPLEX_SIGNS(A) #define COMPLEX_SIGNS(A)
#define LOAD64(A,B) #define LOAD64(A,B)
#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B); #define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B)
#define MULT_2SPIN_DIR_PF(A,B) \ #define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir)
MULT_2SPIN_A64FXd(A); \ #define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd
PREFETCH_CHIMU_L2(B); \
if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } }
#define MAYBEPERM(A,perm) { A ; }
#define LOAD_CHI(base) LOAD_CHI_A64FXd(base) #define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
#define ZERO_PSI #define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) #define XP_PROJ XP_PROJ_A64FXd
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd #define YP_PROJ YP_PROJ_A64FXd
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd #define ZP_PROJ ZP_PROJ_A64FXd
#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd #define TP_PROJ TP_PROJ_A64FXd
#define TP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TP_PROJ_A64FXd #define XM_PROJ XM_PROJ_A64FXd
#define XM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XM_PROJ_A64FXd #define YM_PROJ YM_PROJ_A64FXd
#define YM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YM_PROJ_A64FXd #define ZM_PROJ ZM_PROJ_A64FXd
#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZM_PROJ_A64FXd #define TM_PROJ TM_PROJ_A64FXd
#define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd
#define XP_RECON XP_RECON_A64FXd #define XP_RECON XP_RECON_A64FXd
#define XM_RECON XM_RECON_A64FXd #define XM_RECON XM_RECON_A64FXd
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd #define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd
@ -67,10 +63,13 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd #define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd #define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd #define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd
#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXd; } #define PERMUTE_DIR0 0
#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXd; } #define PERMUTE_DIR1 1
#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXd; } #define PERMUTE_DIR2 2
#define PERMUTE_DIR3 #define PERMUTE_DIR3 3
#define PERMUTE PERMUTE_A64FXd;
#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; }
#define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } }
// DECLARATIONS // DECLARATIONS
#define DECLARATIONS_A64FXd \ #define DECLARATIONS_A64FXd \
const uint64_t lut[4][8] = { \ const uint64_t lut[4][8] = { \
@ -254,8 +253,8 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define LOAD_TABLE3 \ #define LOAD_TABLE3 \
table0 = svld1(pg1, (uint64_t*)&lut[3]); table0 = svld1(pg1, (uint64_t*)&lut[3]);
// PERM0 // PERMUTE
#define PERM0_A64FXd \ #define PERMUTE_A64FXd \
Chi_00 = svtbl(Chi_00, table0); \ Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \ Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \ Chi_02 = svtbl(Chi_02, table0); \
@ -263,27 +262,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
Chi_11 = svtbl(Chi_11, table0); \ Chi_11 = svtbl(Chi_11, table0); \
Chi_12 = svtbl(Chi_12, table0); Chi_12 = svtbl(Chi_12, table0);
// PERM1
#define PERM1_A64FXd \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
Chi_10 = svtbl(Chi_10, table0); \
Chi_11 = svtbl(Chi_11, table0); \
Chi_12 = svtbl(Chi_12, table0);
// PERM2
#define PERM2_A64FXd \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
Chi_10 = svtbl(Chi_10, table0); \
Chi_11 = svtbl(Chi_11, table0); \
Chi_12 = svtbl(Chi_12, table0);
// PERM3
#define PERM3_A64FXd
// LOAD_GAUGE // LOAD_GAUGE
#define LOAD_GAUGE \ #define LOAD_GAUGE \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
@ -296,7 +274,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
} }
// MULT_2SPIN // MULT_2SPIN
#define MULT_2SPIN_A64FXd(A) \ #define MULT_2SPIN_1_A64FXd(A) \
{ \ { \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \
@ -320,6 +298,10 @@ Author: Nils Meyer <nils.meyer@ur.de>
U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \ U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \
U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \ U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \
U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \ U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \
}
// MULT_2SPIN_BACKEND
#define MULT_2SPIN_2_A64FXd \
{ \
UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \ UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \
UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \ UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \
UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \ UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \

View File

@ -25,7 +25,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#define LOAD_CHIMU_A64FXf(x) LOAD_CHIMU_INTERLEAVED_A64FXf(x) #define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXf(base)
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A) #define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A)
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) #define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A) #define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A)
@ -40,23 +40,19 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define MASK_REGS DECLARATIONS_A64FXf #define MASK_REGS DECLARATIONS_A64FXf
#define COMPLEX_SIGNS(A) #define COMPLEX_SIGNS(A)
#define LOAD64(A,B) #define LOAD64(A,B)
#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B); #define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B)
#define MULT_2SPIN_DIR_PF(A,B) \ #define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir)
MULT_2SPIN_A64FXf(A); \ #define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf
PREFETCH_CHIMU_L2(B); \
if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } }
#define MAYBEPERM(A,perm) { A ; }
#define LOAD_CHI(base) LOAD_CHI_A64FXf(base) #define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
#define ZERO_PSI #define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) #define XP_PROJ XP_PROJ_A64FXf
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf #define YP_PROJ YP_PROJ_A64FXf
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf #define ZP_PROJ ZP_PROJ_A64FXf
#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf #define TP_PROJ TP_PROJ_A64FXf
#define TP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TP_PROJ_A64FXf #define XM_PROJ XM_PROJ_A64FXf
#define XM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XM_PROJ_A64FXf #define YM_PROJ YM_PROJ_A64FXf
#define YM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YM_PROJ_A64FXf #define ZM_PROJ ZM_PROJ_A64FXf
#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZM_PROJ_A64FXf #define TM_PROJ TM_PROJ_A64FXf
#define TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf
#define XP_RECON XP_RECON_A64FXf #define XP_RECON XP_RECON_A64FXf
#define XM_RECON XM_RECON_A64FXf #define XM_RECON XM_RECON_A64FXf
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf #define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf
@ -67,10 +63,13 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf #define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf #define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf #define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf
#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXf; } #define PERMUTE_DIR0 0
#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXf; } #define PERMUTE_DIR1 1
#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXf; } #define PERMUTE_DIR2 2
#define PERMUTE_DIR3 LOAD_TABLE3; if (perm) { PERM3_A64FXf; } #define PERMUTE_DIR3 3
#define PERMUTE PERMUTE_A64FXf;
#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1 } else if (Dir == 2) { LOAD_TABLE2; } else if (Dir == 3) { LOAD_TABLE3; }
#define MAYBEPERM(A,perm) if (perm) { PERMUTE; }
// DECLARATIONS // DECLARATIONS
#define DECLARATIONS_A64FXf \ #define DECLARATIONS_A64FXf \
const uint32_t lut[4][16] = { \ const uint32_t lut[4][16] = { \
@ -254,35 +253,8 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define LOAD_TABLE3 \ #define LOAD_TABLE3 \
table0 = svld1(pg1, (uint32_t*)&lut[3]); table0 = svld1(pg1, (uint32_t*)&lut[3]);
// PERM0 // PERMUTE
#define PERM0_A64FXf \ #define PERMUTE_A64FXf \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
Chi_10 = svtbl(Chi_10, table0); \
Chi_11 = svtbl(Chi_11, table0); \
Chi_12 = svtbl(Chi_12, table0);
// PERM1
#define PERM1_A64FXf \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
Chi_10 = svtbl(Chi_10, table0); \
Chi_11 = svtbl(Chi_11, table0); \
Chi_12 = svtbl(Chi_12, table0);
// PERM2
#define PERM2_A64FXf \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
Chi_10 = svtbl(Chi_10, table0); \
Chi_11 = svtbl(Chi_11, table0); \
Chi_12 = svtbl(Chi_12, table0);
// PERM3
#define PERM3_A64FXf \
Chi_00 = svtbl(Chi_00, table0); \ Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \ Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \ Chi_02 = svtbl(Chi_02, table0); \
@ -302,7 +274,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
} }
// MULT_2SPIN // MULT_2SPIN
#define MULT_2SPIN_A64FXf(A) \ #define MULT_2SPIN_1_A64FXf(A) \
{ \ { \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \
@ -326,6 +298,10 @@ Author: Nils Meyer <nils.meyer@ur.de>
U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \ U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \
U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \ U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \
U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \ U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \
}
// MULT_2SPIN_BACKEND
#define MULT_2SPIN_2_A64FXf \
{ \
UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \ UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \
UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \ UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \
UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \ UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \

View File

@ -26,14 +26,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#undef LOAD_CHIMU_A64FXd #undef LOAD_CHIMU
#undef LOAD_CHIMU_A64FXf
#undef PREFETCH_CHIMU_L1 #undef PREFETCH_CHIMU_L1
#undef PREFETCH_GAUGE_L1 #undef PREFETCH_GAUGE_L1
#undef PREFETCH_CHIMU_L2 #undef PREFETCH_CHIMU_L2
#undef PREFETCH_GAUGE_L2 #undef PREFETCH_GAUGE_L2
#undef PREFETCH_GAUGE_L1_INTERNAL #undef PREFETCH_GAUGE_L1_INTERNAL
#undef PF_GAUGE
#undef PREFETCH1_CHIMU #undef PREFETCH1_CHIMU
#undef PREFETCH_CHIMU #undef PREFETCH_CHIMU
#undef PREFETCH_RESULT_L2_STORE #undef PREFETCH_RESULT_L2_STORE
@ -42,22 +40,20 @@ Author: Nils Meyer <nils.meyer@ur.de>
#undef LOCK_GAUGE #undef LOCK_GAUGE
#undef UNLOCK_GAUGE #undef UNLOCK_GAUGE
#undef MASK_REGS #undef MASK_REGS
#undef COMPLEX_SIGNS
#undef LOAD64
#undef SAVE_RESULT #undef SAVE_RESULT
#undef ADD_RESULT #undef ADD_RESULT
#undef MULT_2SPIN_DIR_PF #undef MULT_2SPIN_1
#undef MULT_2SPIN_2
#undef MAYBEPERM #undef MAYBEPERM
#undef LOAD_CHI #undef LOAD_CHI
#undef ZERO_PSI #undef XP_PROJ
#undef XP_PROJMEM #undef YP_PROJ
#undef YP_PROJMEM #undef ZP_PROJ
#undef ZP_PROJMEM #undef TP_PROJ
#undef TP_PROJMEM #undef XM_PROJ
#undef XM_PROJMEM #undef YM_PROJ
#undef YM_PROJMEM #undef ZM_PROJ
#undef ZM_PROJMEM #undef TM_PROJ
#undef TM_PROJMEM
#undef XP_RECON #undef XP_RECON
#undef XM_RECON #undef XM_RECON
#undef XM_RECON_ACCUM #undef XM_RECON_ACCUM
@ -68,10 +64,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
#undef YP_RECON_ACCUM #undef YP_RECON_ACCUM
#undef ZP_RECON_ACCUM #undef ZP_RECON_ACCUM
#undef TP_RECON_ACCUM #undef TP_RECON_ACCUM
#undef PERMUTE
#undef PERMUTE_DIR0 #undef PERMUTE_DIR0
#undef PERMUTE_DIR1 #undef PERMUTE_DIR1
#undef PERMUTE_DIR2 #undef PERMUTE_DIR2
#undef PERMUTE_DIR3 #undef PERMUTE_DIR3
#undef LOAD_TABLE
#undef LOAD_TABLE0 #undef LOAD_TABLE0
#undef LOAD_TABLE1 #undef LOAD_TABLE1
#undef LOAD_TABLE2 #undef LOAD_TABLE2

View File

@ -115,37 +115,9 @@ STORE_BASE_PTR_COLOR_OFFSET = 2
# 256 bytes * 2.2 GHz = 563.2 GB/s (base 10), 524 GB/s (base 2) # 256 bytes * 2.2 GHz = 563.2 GB/s (base 10), 524 GB/s (base 2)
OPT = """ OPT = """
#ifdef INTERIOR * interleave prefetching and compute in MULT_2SPIN
* could test storing U's in MULT_2SPIN to L1d, might be beneficial for life time cache lines
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ * structure reordering: MAYBEPERM after MULT_2SPIN ?
basep = st.GetPFInfo(nent,plocal); nent++; \
if ( local ) { \
-- LOAD64(%r10,isigns); \
PROJ(base); \
++ PF_GAUGE(Dir); \
MAYBEPERM(PERMUTE_DIR,perm); \
} else if ( st.same_node[Dir] ) {
LOAD_CHI(base);
++ PF_GAUGE(Dir);
} \
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
if ( local || st.same_node[Dir] ) { \
MULT_2SPIN_DIR_PF(Dir,basep); \
PREFETCH_CHIMU(base); \
-- LOAD64(%r10,isigns); \
RECON; \
} else { PREFETCH_CHIMU(base); }
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
-- PF_GAUGE(Xp); \
PREFETCH1_CHIMU(base); \
{ ZERO_PSI; } \
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
#define RESULT(base,basep) SAVE_RESULT(base,basep);
#endif
""" """
filename = 'XXX' filename = 'XXX'
@ -905,7 +877,8 @@ if ALTERNATIVE_LOADS == True:
define(F'LOAD_CHIMU_0312_PLUG LOAD_CHIMU_0312_{PRECSUFFIX}') define(F'LOAD_CHIMU_0312_PLUG LOAD_CHIMU_0312_{PRECSUFFIX}')
define(F'LOAD_CHIMU(x)') define(F'LOAD_CHIMU(x)')
else: else:
define(F'LOAD_CHIMU_{PRECSUFFIX}(x) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(x)') #define(F'LOAD_CHIMU_{PRECSUFFIX}(x) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(x)')
define(F'LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(base)')
if PREFETCH: if PREFETCH:
define(F'PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_{PRECSUFFIX}(A)') define(F'PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_{PRECSUFFIX}(A)')
@ -935,39 +908,22 @@ define(F'UNLOCK_GAUGE(A)')
define(F'MASK_REGS DECLARATIONS_{PRECSUFFIX}') define(F'MASK_REGS DECLARATIONS_{PRECSUFFIX}')
define(F'COMPLEX_SIGNS(A)') define(F'COMPLEX_SIGNS(A)')
define(F'LOAD64(A,B)') define(F'LOAD64(A,B)')
# prefetch chimu here is useless, because already done in last leg define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_RESULT_L2_STORE(B)')
#define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A);') define(F'MULT_2SPIN_1(Dir) MULT_2SPIN_1_{PRECSUFFIX}(Dir)')
define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_RESULT_L2_STORE(B);') define(F'MULT_2SPIN_2 MULT_2SPIN_2_{PRECSUFFIX}')
if PREFETCH:
definemultiline(F'MULT_2SPIN_DIR_PF(A,B) ')
write (F' MULT_2SPIN_{PRECSUFFIX}(A); \\')
write (F' PREFETCH_CHIMU_L2(B); \\')
write (F' if (s == 0) {{ if ((A == 0) || (A == 4)) {{ PREFETCH_GAUGE_L2(A); }} }}')
# definemultiline(F'MULT_2SPIN_DIR_PF(A,B) PREFETCH_GAUGE_L1(A);')
# write (F' PREFETCH_CHIMU_L2(B); \\')
# write (F' MULT_2SPIN_{PRECSUFFIX}(A); \\')
# write (F' if (s == 0) {{ if ((A == 0) || (A == 4)) {{ PREFETCH_GAUGE_L2(A); }} }}')
newline()
else:
define(F'MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_{PRECSUFFIX}(A)')
# break out maybeperm in permutes
#define(F'MAYBEPERM(A,perm) if (perm) {{ A ; }}')
define(F'MAYBEPERM(A,perm) {{ A ; }}')
define(F'LOAD_CHI(base) LOAD_CHI_{PRECSUFFIX}(base)') define(F'LOAD_CHI(base) LOAD_CHI_{PRECSUFFIX}(base)')
# don't need zero psi, everything is done in recons # don't need zero psi, everything is done in recons
#define(F'ZERO_PSI ZERO_PSI_{PRECSUFFIX}') #define(F'ZERO_PSI ZERO_PSI_{PRECSUFFIX}')
define(F'ZERO_PSI') define(F'ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_{PRECSUFFIX}; RESULT_{PRECSUFFIX}(base)')
define(F'ADD_RESULT(base,basep) LOAD_CHIMU_{PRECSUFFIX}(base); ADD_RESULT_INTERNAL_{PRECSUFFIX}; RESULT_{PRECSUFFIX}(base)')
# loads projections # loads projections
define(F'XP_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); XP_PROJ_{PRECSUFFIX}') define(F'XP_PROJ XP_PROJ_{PRECSUFFIX}')
define(F'YP_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); YP_PROJ_{PRECSUFFIX}') define(F'YP_PROJ YP_PROJ_{PRECSUFFIX}')
define(F'ZP_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); ZP_PROJ_{PRECSUFFIX}') define(F'ZP_PROJ ZP_PROJ_{PRECSUFFIX}')
define(F'TP_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); TP_PROJ_{PRECSUFFIX}') define(F'TP_PROJ TP_PROJ_{PRECSUFFIX}')
define(F'XM_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); XM_PROJ_{PRECSUFFIX}') define(F'XM_PROJ XM_PROJ_{PRECSUFFIX}')
define(F'YM_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); YM_PROJ_{PRECSUFFIX}') define(F'YM_PROJ YM_PROJ_{PRECSUFFIX}')
define(F'ZM_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); ZM_PROJ_{PRECSUFFIX}') define(F'ZM_PROJ ZM_PROJ_{PRECSUFFIX}')
define(F'TM_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); TM_PROJ_{PRECSUFFIX}') define(F'TM_PROJ TM_PROJ_{PRECSUFFIX}')
# recons # recons
define(F'XP_RECON XP_RECON_{PRECSUFFIX}') define(F'XP_RECON XP_RECON_{PRECSUFFIX}')
define(F'XM_RECON XM_RECON_{PRECSUFFIX}') define(F'XM_RECON XM_RECON_{PRECSUFFIX}')
@ -979,14 +935,21 @@ define(F'XP_RECON_ACCUM XP_RECON_ACCUM_{PRECSUFFIX}')
define(F'YP_RECON_ACCUM YP_RECON_ACCUM_{PRECSUFFIX}') define(F'YP_RECON_ACCUM YP_RECON_ACCUM_{PRECSUFFIX}')
define(F'ZP_RECON_ACCUM ZP_RECON_ACCUM_{PRECSUFFIX}') define(F'ZP_RECON_ACCUM ZP_RECON_ACCUM_{PRECSUFFIX}')
define(F'TP_RECON_ACCUM TP_RECON_ACCUM_{PRECSUFFIX}') define(F'TP_RECON_ACCUM TP_RECON_ACCUM_{PRECSUFFIX}')
# permutes # new permutes
define(F'PERMUTE_DIR0 LOAD_TABLE0; if (perm) {{ PERM0_{PRECSUFFIX}; }}') define(F'PERMUTE_DIR0 0')
define(F'PERMUTE_DIR1 LOAD_TABLE1; if (perm) {{ PERM1_{PRECSUFFIX}; }}') define(F'PERMUTE_DIR1 1')
define(F'PERMUTE_DIR2 LOAD_TABLE2; if (perm) {{ PERM2_{PRECSUFFIX}; }}') define(F'PERMUTE_DIR2 2')
define(F'PERMUTE_DIR3 3')
define(F'PERMUTE PERMUTE_{PRECSUFFIX};')
# load table
#define(F'MAYBEPERM(A,perm) if (perm) {{ A ; }}')
if PRECISION == 'double': if PRECISION == 'double':
define(F'PERMUTE_DIR3') define(F'LOAD_TABLE(Dir) if (Dir == 0) {{ LOAD_TABLE0; }} else if (Dir == 1) {{ LOAD_TABLE1; }} else if (Dir == 2) {{ LOAD_TABLE2; }}')
define(F'MAYBEPERM(Dir,perm) if (Dir != 3) {{ if (perm) {{ PERMUTE; }} }}')
else: else:
define(F'PERMUTE_DIR3 LOAD_TABLE3; if (perm) {{ PERM3_{PRECSUFFIX}; }}') define(F'LOAD_TABLE(Dir) if (Dir == 0) {{ LOAD_TABLE0; }} else if (Dir == 1) {{ LOAD_TABLE1 }} else if (Dir == 2) {{ LOAD_TABLE2; }} else if (Dir == 3) {{ LOAD_TABLE3; }}')
define(F'MAYBEPERM(A,perm) if (perm) {{ PERMUTE; }}')
write('// DECLARATIONS') write('// DECLARATIONS')
@ -1040,20 +1003,14 @@ U_01.declare()
U_11.declare() U_11.declare()
U_21.declare() # 6 -> 30 regs U_21.declare() # 6 -> 30 regs
# all true # all predications true
pg1.declare() pg1.declare()
if PRECISION == 'double': if PRECISION == 'double':
pg1.movestr('svptrue_b64()') pg1.movestr('svptrue_b64()')
else: else:
pg1.movestr('svptrue_b32()') pg1.movestr('svptrue_b32()')
# even elements only # tables
#pg2.declare()
#pg2.movestr('svzip1_b64(svptrue_b64(), svpfalse_b())')
# preload tables
# 0: swap
# 1: permute 1
if PRECISION == 'double': if PRECISION == 'double':
write(' svuint64_t table0; \\', target='I') # -> 31 regs write(' svuint64_t table0; \\', target='I') # -> 31 regs
else: else:
@ -1061,10 +1018,10 @@ else:
zero0.declare() zero0.declare()
# zero register
asmopen() asmopen()
zero0.zero(zeroreg=True) zero0.zero(zeroreg=True)
asmclose() asmclose()
newline() newline()
define('Chimu_00 Chi_00', target='I') define('Chimu_00 Chi_00', target='I')
@ -1087,7 +1044,6 @@ else: # wilson4.h
define('Chimu_30 U_01', target='I') define('Chimu_30 U_01', target='I')
define('Chimu_31 U_11', target='I') define('Chimu_31 U_11', target='I')
define('Chimu_32 U_21', target='I') define('Chimu_32 U_21', target='I')
newline() newline()
@ -1380,47 +1336,11 @@ table0.loadtable(3)
asmclose() asmclose()
newline() newline()
# 8 directions = 6x permutations
d['factor'] = 2 # factor is 0
d['cycles_PERM'] += 6 * d['factor']
write('// PERM0')
definemultiline(F'PERM0_{PRECSUFFIX}')
debugall('PERM0 PRE', group='Chi')
asmopen()
#table0.loadtable(0)
Chi_00.permute(0, table0)
Chi_01.permute(0, table0)
Chi_02.permute(0, table0)
Chi_10.permute(0, table0)
Chi_11.permute(0, table0)
Chi_12.permute(0, table0)
asmclose()
debugall('PERM0 POST', group='Chi')
newline()
d['factor'] = 2 # factor is 2 d['factor'] = 2 # factor is 2
d['cycles_PERM'] += 6 * d['factor'] d['cycles_PERM'] += 6 * d['factor']
write('// PERM1') write('// PERMUTE')
definemultiline(F'PERM1_{PRECSUFFIX}') definemultiline(F'PERMUTE_{PRECSUFFIX}')
debugall('PERM1 PRE', group='Chi') debugall('PERM PRE', group='Chi')
asmopen()
#table0.loadtable(1)
Chi_00.permute(1, table0)
Chi_01.permute(1, table0)
Chi_02.permute(1, table0)
Chi_10.permute(1, table0)
Chi_11.permute(1, table0)
Chi_12.permute(1, table0)
asmclose()
debugall('PERM1 POST', group='Chi')
newline()
d['factor'] = 2 # factor is 2
# PERM2 = swap real and imaginary
d['cycles_PERM'] += 6 * d['factor']
write('// PERM2')
definemultiline(F'PERM2_{PRECSUFFIX}')
debugall('PERM2 PRE', group='Chi')
asmopen() asmopen()
#table0.loadtable(2) #table0.loadtable(2)
Chi_00.permute(2, table0) Chi_00.permute(2, table0)
@ -1430,26 +1350,7 @@ Chi_10.permute(2, table0)
Chi_11.permute(2, table0) Chi_11.permute(2, table0)
Chi_12.permute(2, table0) Chi_12.permute(2, table0)
asmclose() asmclose()
debugall('PERM2 POST', group='Chi') debugall('PERM POST', group='Chi')
newline()
# PERM3 = identity (DP), so exclude from counting
d['factor'] = 0
d['cycles_PERM'] += 6 * d['factor']
write('// PERM3')
definemultiline(F'PERM3_{PRECSUFFIX}')
if PRECISION == 'single':
debugall('PERM3 PRE', group='Chi')
asmopen()
#table0.loadtable(3)
Chi_00.permute(3, table0)
Chi_01.permute(3, table0)
Chi_02.permute(3, table0)
Chi_10.permute(3, table0)
Chi_11.permute(3, table0)
Chi_12.permute(3, table0)
asmclose()
debugall('PERM3 POST', group='Chi')
newline() newline()
write('// LOAD_GAUGE') write('// LOAD_GAUGE')
@ -1473,7 +1374,7 @@ if ASM_LOAD_GAUGE:
asmclose() asmclose()
curlyclose() curlyclose()
newline() newline()
# XXXXXX remove loads
d['factor'] = 8 # MULT_2SPIN executes 1 time per direction = 8 times total d['factor'] = 8 # MULT_2SPIN executes 1 time per direction = 8 times total
# assume all U loads are hidden # assume all U loads are hidden
# FCMLA issue latency = 2 cycles # FCMLA issue latency = 2 cycles
@ -1482,7 +1383,7 @@ d['factor'] = 8 # MULT_2SPIN executes 1 time per direction = 8 times total
# 6 rounds of FCMLA, each with 6 FCMLA -> 21 - 6*2 = 9 # 6 rounds of FCMLA, each with 6 FCMLA -> 21 - 6*2 = 9
d['cycles_MULT_2SPIN'] += 6 * 21 * d['factor'] d['cycles_MULT_2SPIN'] += 6 * 21 * d['factor']
write('// MULT_2SPIN') write('// MULT_2SPIN')
definemultiline(F'MULT_2SPIN_{PRECSUFFIX}(A)') definemultiline(F'MULT_2SPIN_1_{PRECSUFFIX}(A)')
curlyopen() curlyopen()
#write(' const auto & ref(U[sU][A]); \\') #write(' const auto & ref(U[sU][A]); \\')
if GRIDBENCH: # referencing differs in Grid and GridBench if GRIDBENCH: # referencing differs in Grid and GridBench
@ -1541,7 +1442,15 @@ if ASM_LOAD_GAUGE:
U_00.load("ref[0][2]") # U_00, U_10, U_20 overloaded U_00.load("ref[0][2]") # U_00, U_10, U_20 overloaded
U_10.load("ref[1][2]") # early load U_10.load("ref[1][2]") # early load
U_20.load("ref[2][2]") # A --> U_20.load("ref[2][2]") # A -->
asmclose()
debugall('MULT_2SPIN_1', group='UChi')
curlyclose()
newline()
write('// MULT_2SPIN_BACKEND')
definemultiline(F'MULT_2SPIN_2_{PRECSUFFIX}')
curlyopen()
asmopen()
# round 3 # round 3
UChi_00.mac0(U_01, Chi_01) # armclang separates fcmla(..., 0) and UChi_00.mac0(U_01, Chi_01) # armclang separates fcmla(..., 0) and
UChi_10.mac0(U_01, Chi_11) # fcmla(..., 90) UChi_10.mac0(U_01, Chi_11) # fcmla(..., 90)
@ -1571,7 +1480,7 @@ UChi_11.mac1(U_10, Chi_12)
UChi_02.mac1(U_20, Chi_02) UChi_02.mac1(U_20, Chi_02)
UChi_12.mac1(U_20, Chi_12) UChi_12.mac1(U_20, Chi_12)
asmclose() asmclose()
debugall('MULT_2SPIN', group='UChi') debugall('MULT_2SPIN_2', group='UChi')
curlyclose() curlyclose()
newline() newline()
@ -1587,7 +1496,7 @@ if ALTERNATIVE_LOADS == True:
write(' LOAD_CHIMU_0312_PLUG \\') write(' LOAD_CHIMU_0312_PLUG \\')
curlyopen() curlyopen()
asmopen() asmopen()
pg1.loadpredication() #pg1.loadpredication()
Chi_00.addTimesI(Chimu_00, Chimu_30) Chi_00.addTimesI(Chimu_00, Chimu_30)
Chi_01.addTimesI(Chimu_01, Chimu_31) Chi_01.addTimesI(Chimu_01, Chimu_31)
Chi_02.addTimesI(Chimu_02, Chimu_32) Chi_02.addTimesI(Chimu_02, Chimu_32)