mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
revised BodyA64FX; 990 GiB/s Wilson, 687 GiB/s DW using intrinsics (armclang 20.0)
This commit is contained in:
parent
852db4626a
commit
6fdce60492
@ -26,14 +26,14 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#ifdef KERNEL_DAG
|
||||
#define DIR0_PROJMEM(base) XP_PROJMEM(base);
|
||||
#define DIR1_PROJMEM(base) YP_PROJMEM(base);
|
||||
#define DIR2_PROJMEM(base) ZP_PROJMEM(base);
|
||||
#define DIR3_PROJMEM(base) TP_PROJMEM(base);
|
||||
#define DIR4_PROJMEM(base) XM_PROJMEM(base);
|
||||
#define DIR5_PROJMEM(base) YM_PROJMEM(base);
|
||||
#define DIR6_PROJMEM(base) ZM_PROJMEM(base);
|
||||
#define DIR7_PROJMEM(base) TM_PROJMEM(base);
|
||||
#define DIR0_PROJ XP_PROJ
|
||||
#define DIR1_PROJ YP_PROJ
|
||||
#define DIR2_PROJ ZP_PROJ
|
||||
#define DIR3_PROJ TP_PROJ
|
||||
#define DIR4_PROJ XM_PROJ
|
||||
#define DIR5_PROJ YM_PROJ
|
||||
#define DIR6_PROJ ZM_PROJ
|
||||
#define DIR7_PROJ TM_PROJ
|
||||
#define DIR0_RECON XP_RECON
|
||||
#define DIR1_RECON YP_RECON_ACCUM
|
||||
#define DIR2_RECON ZP_RECON_ACCUM
|
||||
@ -43,14 +43,14 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
#define DIR6_RECON ZM_RECON_ACCUM
|
||||
#define DIR7_RECON TM_RECON_ACCUM
|
||||
#else
|
||||
#define DIR0_PROJMEM(base) XM_PROJMEM(base);
|
||||
#define DIR1_PROJMEM(base) YM_PROJMEM(base);
|
||||
#define DIR2_PROJMEM(base) ZM_PROJMEM(base);
|
||||
#define DIR3_PROJMEM(base) TM_PROJMEM(base);
|
||||
#define DIR4_PROJMEM(base) XP_PROJMEM(base);
|
||||
#define DIR5_PROJMEM(base) YP_PROJMEM(base);
|
||||
#define DIR6_PROJMEM(base) ZP_PROJMEM(base);
|
||||
#define DIR7_PROJMEM(base) TP_PROJMEM(base);
|
||||
#define DIR0_PROJ XM_PROJ
|
||||
#define DIR1_PROJ YM_PROJ
|
||||
#define DIR2_PROJ ZM_PROJ
|
||||
#define DIR3_PROJ TM_PROJ
|
||||
#define DIR4_PROJ XP_PROJ
|
||||
#define DIR5_PROJ YP_PROJ
|
||||
#define DIR6_PROJ ZP_PROJ
|
||||
#define DIR7_PROJ TP_PROJ
|
||||
#define DIR0_RECON XM_RECON
|
||||
#define DIR1_RECON YM_RECON_ACCUM
|
||||
#define DIR2_RECON ZM_RECON_ACCUM
|
||||
@ -91,23 +91,28 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
|
||||
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||
basep = st.GetPFInfo(nent,plocal); nent++; \
|
||||
if ( local ) { \
|
||||
/* PREFETCH_GAUGE_L1(Dir); slightly worse performance */ \
|
||||
PROJ(base); \
|
||||
/* PREFETCH_GAUGE_L1(Dir); slightly worse performance */ \
|
||||
MAYBEPERM(PERMUTE_DIR,perm); \
|
||||
} else { \
|
||||
LOAD_CHI(base); \
|
||||
} \
|
||||
if ( local ) { \
|
||||
LOAD_CHIMU(base); \
|
||||
LOAD_TABLE(PERMUTE_DIR); \
|
||||
PROJ; \
|
||||
MAYBEPERM(PERMUTE_DIR,perm); \
|
||||
} else { \
|
||||
LOAD_CHI(base); \
|
||||
} \
|
||||
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
|
||||
PREFETCH_CHIMU(base); \
|
||||
MULT_2SPIN_DIR_PF(Dir,basep); \
|
||||
RECON; \
|
||||
MULT_2SPIN_1(Dir); \
|
||||
PREFETCH_CHIMU(base); \
|
||||
PREFETCH_CHIMU_L2(basep); \
|
||||
/* PREFETCH_GAUGE_L1(NxtDir); */ \
|
||||
MULT_2SPIN_2; \
|
||||
if (s == 0) { \
|
||||
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
|
||||
} \
|
||||
RECON; \
|
||||
|
||||
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||
PF_GAUGE(Xp); \
|
||||
PREFETCH1_CHIMU(base); \
|
||||
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||
PREFETCH1_CHIMU(base); \
|
||||
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
|
||||
|
||||
#define RESULT(base,basep) SAVE_RESULT(base,basep);
|
||||
@ -121,22 +126,28 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
|
||||
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||
basep = st.GetPFInfo(nent,plocal); nent++; \
|
||||
if ( local ) { \
|
||||
PROJ(base); \
|
||||
MAYBEPERM(PERMUTE_DIR,perm); \
|
||||
}else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \
|
||||
if ( local || st.same_node[Dir] ) { \
|
||||
MULT_2SPIN_DIR_PF(Dir,basep); \
|
||||
RECON; \
|
||||
} \
|
||||
if ( local ) { \
|
||||
LOAD_CHIMU(base); \
|
||||
LOAD_TABLE(PERMUTE_DIR); \
|
||||
PROJ; \
|
||||
MAYBEPERM(PERMUTE_DIR,perm); \
|
||||
}else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \
|
||||
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
|
||||
PREFETCH_CHIMU(base); \
|
||||
if ( local || st.same_node[Dir] ) { \
|
||||
MULT_2SPIN_1(Dir); \
|
||||
PREFETCH_CHIMU(base); \
|
||||
/* PREFETCH_GAUGE_L1(NxtDir); */ \
|
||||
MULT_2SPIN_2; \
|
||||
if (s == 0) { \
|
||||
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
|
||||
} \
|
||||
RECON; \
|
||||
PREFETCH_CHIMU_L2(basep); \
|
||||
} else { PREFETCH_CHIMU(base); } \
|
||||
|
||||
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||
PF_GAUGE(Xp); \
|
||||
PREFETCH1_CHIMU(base); \
|
||||
{ ZERO_PSI; } \
|
||||
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
|
||||
|
||||
#define RESULT(base,basep) SAVE_RESULT(base,basep);
|
||||
@ -149,23 +160,34 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
|
||||
|
||||
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||
if((!local)&&(!st.same_node[Dir]) ) { \
|
||||
LOAD_CHI(base); \
|
||||
MULT_2SPIN_DIR_PF(Dir,base); \
|
||||
RECON; \
|
||||
nmu++; \
|
||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||
if((!local)&&(!st.same_node[Dir]) ) { \
|
||||
LOAD_CHI(base); \
|
||||
MULT_2SPIN_1(Dir); \
|
||||
PREFETCH_CHIMU(base); \
|
||||
/* PREFETCH_GAUGE_L1(NxtDir); */ \
|
||||
MULT_2SPIN_2; \
|
||||
if (s == 0) { \
|
||||
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
|
||||
} \
|
||||
RECON; \
|
||||
nmu++; \
|
||||
}
|
||||
|
||||
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||
nmu=0; \
|
||||
{ ZERO_PSI;} \
|
||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||
if((!local)&&(!st.same_node[Dir]) ) { \
|
||||
LOAD_CHI(base); \
|
||||
MULT_2SPIN_DIR_PF(Dir,base); \
|
||||
RECON; \
|
||||
nmu++; \
|
||||
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||
nmu=0; \
|
||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;\
|
||||
if((!local)&&(!st.same_node[Dir]) ) { \
|
||||
LOAD_CHI(base); \
|
||||
MULT_2SPIN_1(Dir); \
|
||||
PREFETCH_CHIMU(base); \
|
||||
/* PREFETCH_GAUGE_L1(NxtDir); */ \
|
||||
MULT_2SPIN_2; \
|
||||
if (s == 0) { \
|
||||
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
|
||||
} \
|
||||
RECON; \
|
||||
nmu++; \
|
||||
}
|
||||
|
||||
#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);}
|
||||
@ -201,7 +223,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
|
||||
uint64_t delta_base, delta_base_p;
|
||||
|
||||
ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJMEM,DIR0_RECON);
|
||||
ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJ,DIR0_RECON);
|
||||
|
||||
#ifdef SHOW
|
||||
float rescale = 64. * 12.;
|
||||
@ -221,7 +243,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
std::cout << "----------------------------------------------------" << std::endl;
|
||||
#endif
|
||||
|
||||
ASM_LEG(Yp,Zp,PERMUTE_DIR2,DIR1_PROJMEM,DIR1_RECON);
|
||||
ASM_LEG(Yp,Zp,PERMUTE_DIR2,DIR1_PROJ,DIR1_RECON);
|
||||
|
||||
#ifdef SHOW
|
||||
std::cout << "Dir = " << Yp << " " << WHERE<< std::endl;
|
||||
@ -234,7 +256,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
std::cout << "----------------------------------------------------" << std::endl;
|
||||
#endif
|
||||
|
||||
ASM_LEG(Zp,Tp,PERMUTE_DIR1,DIR2_PROJMEM,DIR2_RECON);
|
||||
ASM_LEG(Zp,Tp,PERMUTE_DIR1,DIR2_PROJ,DIR2_RECON);
|
||||
|
||||
#ifdef SHOW
|
||||
std::cout << "Dir = " << Zp << " " << WHERE<< std::endl;
|
||||
@ -247,7 +269,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
std::cout << "----------------------------------------------------" << std::endl;
|
||||
#endif
|
||||
|
||||
ASM_LEG(Tp,Xm,PERMUTE_DIR0,DIR3_PROJMEM,DIR3_RECON);
|
||||
ASM_LEG(Tp,Xm,PERMUTE_DIR0,DIR3_PROJ,DIR3_RECON);
|
||||
|
||||
#ifdef SHOW
|
||||
std::cout << "Dir = " << Tp << " " << WHERE<< std::endl;
|
||||
@ -260,7 +282,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
std::cout << "----------------------------------------------------" << std::endl;
|
||||
#endif
|
||||
|
||||
ASM_LEG(Xm,Ym,PERMUTE_DIR3,DIR4_PROJMEM,DIR4_RECON);
|
||||
ASM_LEG(Xm,Ym,PERMUTE_DIR3,DIR4_PROJ,DIR4_RECON);
|
||||
|
||||
#ifdef SHOW
|
||||
std::cout << "Dir = " << Xm << " " << WHERE<< std::endl;
|
||||
@ -273,7 +295,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
std::cout << "----------------------------------------------------" << std::endl;
|
||||
#endif
|
||||
|
||||
ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJMEM,DIR5_RECON);
|
||||
ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON);
|
||||
|
||||
#ifdef SHOW
|
||||
std::cout << "Dir = " << Ym << " " << WHERE<< std::endl;
|
||||
@ -286,7 +308,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
std::cout << "----------------------------------------------------" << std::endl;
|
||||
#endif
|
||||
|
||||
ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJMEM,DIR6_RECON);
|
||||
ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON);
|
||||
|
||||
#ifdef SHOW
|
||||
std::cout << "Dir = " << Zm << " " << WHERE<< std::endl;
|
||||
@ -299,7 +321,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
std::cout << "----------------------------------------------------" << std::endl;
|
||||
#endif
|
||||
|
||||
ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJMEM,DIR7_RECON);
|
||||
ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON);
|
||||
|
||||
#ifdef SHOW
|
||||
std::cout << "Dir = " << Tm << " " << WHERE<< std::endl;
|
||||
@ -337,14 +359,14 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
}
|
||||
}
|
||||
|
||||
#undef DIR0_PROJMEM
|
||||
#undef DIR1_PROJMEM
|
||||
#undef DIR2_PROJMEM
|
||||
#undef DIR3_PROJMEM
|
||||
#undef DIR4_PROJMEM
|
||||
#undef DIR5_PROJMEM
|
||||
#undef DIR6_PROJMEM
|
||||
#undef DIR7_PROJMEM
|
||||
#undef DIR0_PROJ
|
||||
#undef DIR1_PROJ
|
||||
#undef DIR2_PROJ
|
||||
#undef DIR3_PROJ
|
||||
#undef DIR4_PROJ
|
||||
#undef DIR5_PROJ
|
||||
#undef DIR6_PROJ
|
||||
#undef DIR7_PROJ
|
||||
#undef DIR0_RECON
|
||||
#undef DIR1_RECON
|
||||
#undef DIR2_RECON
|
||||
|
@ -25,7 +25,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#define LOAD_CHIMU_A64FXd(x) LOAD_CHIMU_INTERLEAVED_A64FXd(x)
|
||||
#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXd(base)
|
||||
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A)
|
||||
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)
|
||||
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A)
|
||||
@ -40,23 +40,19 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
#define MASK_REGS DECLARATIONS_A64FXd
|
||||
#define COMPLEX_SIGNS(A)
|
||||
#define LOAD64(A,B)
|
||||
#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B);
|
||||
#define MULT_2SPIN_DIR_PF(A,B) \
|
||||
MULT_2SPIN_A64FXd(A); \
|
||||
PREFETCH_CHIMU_L2(B); \
|
||||
if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } }
|
||||
#define MAYBEPERM(A,perm) { A ; }
|
||||
#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B)
|
||||
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir)
|
||||
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd
|
||||
#define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
|
||||
#define ZERO_PSI
|
||||
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
|
||||
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd
|
||||
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd
|
||||
#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd
|
||||
#define TP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TP_PROJ_A64FXd
|
||||
#define XM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XM_PROJ_A64FXd
|
||||
#define YM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YM_PROJ_A64FXd
|
||||
#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZM_PROJ_A64FXd
|
||||
#define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd
|
||||
#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
|
||||
#define XP_PROJ XP_PROJ_A64FXd
|
||||
#define YP_PROJ YP_PROJ_A64FXd
|
||||
#define ZP_PROJ ZP_PROJ_A64FXd
|
||||
#define TP_PROJ TP_PROJ_A64FXd
|
||||
#define XM_PROJ XM_PROJ_A64FXd
|
||||
#define YM_PROJ YM_PROJ_A64FXd
|
||||
#define ZM_PROJ ZM_PROJ_A64FXd
|
||||
#define TM_PROJ TM_PROJ_A64FXd
|
||||
#define XP_RECON XP_RECON_A64FXd
|
||||
#define XM_RECON XM_RECON_A64FXd
|
||||
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd
|
||||
@ -67,10 +63,13 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd
|
||||
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd
|
||||
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd
|
||||
#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXd; }
|
||||
#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXd; }
|
||||
#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXd; }
|
||||
#define PERMUTE_DIR3
|
||||
#define PERMUTE_DIR0 0
|
||||
#define PERMUTE_DIR1 1
|
||||
#define PERMUTE_DIR2 2
|
||||
#define PERMUTE_DIR3 3
|
||||
#define PERMUTE PERMUTE_A64FXd;
|
||||
#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; }
|
||||
#define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } }
|
||||
// DECLARATIONS
|
||||
#define DECLARATIONS_A64FXd \
|
||||
const uint64_t lut[4][8] = { \
|
||||
@ -281,8 +280,8 @@ asm ( \
|
||||
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// PERM0
|
||||
#define PERM0_A64FXd \
|
||||
// PERMUTE
|
||||
#define PERMUTE_A64FXd \
|
||||
asm ( \
|
||||
"tbl z12.d, { z12.d }, z30.d \n\t" \
|
||||
"tbl z13.d, { z13.d }, z30.d \n\t" \
|
||||
@ -295,37 +294,6 @@ asm ( \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// PERM1
|
||||
#define PERM1_A64FXd \
|
||||
asm ( \
|
||||
"tbl z12.d, { z12.d }, z30.d \n\t" \
|
||||
"tbl z13.d, { z13.d }, z30.d \n\t" \
|
||||
"tbl z14.d, { z14.d }, z30.d \n\t" \
|
||||
"tbl z15.d, { z15.d }, z30.d \n\t" \
|
||||
"tbl z16.d, { z16.d }, z30.d \n\t" \
|
||||
"tbl z17.d, { z17.d }, z30.d \n\t" \
|
||||
: \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// PERM2
|
||||
#define PERM2_A64FXd \
|
||||
asm ( \
|
||||
"tbl z12.d, { z12.d }, z30.d \n\t" \
|
||||
"tbl z13.d, { z13.d }, z30.d \n\t" \
|
||||
"tbl z14.d, { z14.d }, z30.d \n\t" \
|
||||
"tbl z15.d, { z15.d }, z30.d \n\t" \
|
||||
"tbl z16.d, { z16.d }, z30.d \n\t" \
|
||||
"tbl z17.d, { z17.d }, z30.d \n\t" \
|
||||
: \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// PERM3
|
||||
#define PERM3_A64FXd
|
||||
|
||||
// LOAD_GAUGE
|
||||
#define LOAD_GAUGE \
|
||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||
@ -344,7 +312,7 @@ asm ( \
|
||||
); \
|
||||
}
|
||||
// MULT_2SPIN
|
||||
#define MULT_2SPIN_A64FXd(A) \
|
||||
#define MULT_2SPIN_1_A64FXd(A) \
|
||||
{ \
|
||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||
asm ( \
|
||||
@ -375,6 +343,15 @@ asm ( \
|
||||
"ldr z24, [%[fetchptr], -4, mul vl] \n\t" \
|
||||
"ldr z25, [%[fetchptr], -1, mul vl] \n\t" \
|
||||
"ldr z26, [%[fetchptr], 2, mul vl] \n\t" \
|
||||
: \
|
||||
: [fetchptr] "r" (baseU + 2 * 3 * 64) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
); \
|
||||
}
|
||||
// MULT_2SPIN_BACKEND
|
||||
#define MULT_2SPIN_2_A64FXd \
|
||||
{ \
|
||||
asm ( \
|
||||
"fcmla z18.d, p5/m, z27.d, z13.d, 0 \n\t" \
|
||||
"fcmla z21.d, p5/m, z27.d, z16.d, 0 \n\t" \
|
||||
"fcmla z19.d, p5/m, z28.d, z13.d, 0 \n\t" \
|
||||
@ -400,15 +377,14 @@ asm ( \
|
||||
"fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \
|
||||
"fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \
|
||||
: \
|
||||
: [fetchptr] "r" (baseU + 2 * 3 * 64) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
); \
|
||||
}
|
||||
// XP_PROJ
|
||||
#define XP_PROJ_A64FXd \
|
||||
{ \
|
||||
asm ( \
|
||||
"ptrue p5.d \n\t" \
|
||||
"fcadd z12.d, p5/m, z12.d, z21.d, 90 \n\t" \
|
||||
"fcadd z13.d, p5/m, z13.d, z22.d, 90 \n\t" \
|
||||
"fcadd z14.d, p5/m, z14.d, z23.d, 90 \n\t" \
|
||||
|
@ -25,7 +25,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#define LOAD_CHIMU_A64FXf(x) LOAD_CHIMU_INTERLEAVED_A64FXf(x)
|
||||
#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXf(base)
|
||||
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A)
|
||||
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)
|
||||
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A)
|
||||
@ -40,23 +40,19 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
#define MASK_REGS DECLARATIONS_A64FXf
|
||||
#define COMPLEX_SIGNS(A)
|
||||
#define LOAD64(A,B)
|
||||
#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B);
|
||||
#define MULT_2SPIN_DIR_PF(A,B) \
|
||||
MULT_2SPIN_A64FXf(A); \
|
||||
PREFETCH_CHIMU_L2(B); \
|
||||
if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } }
|
||||
#define MAYBEPERM(A,perm) { A ; }
|
||||
#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B)
|
||||
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir)
|
||||
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf
|
||||
#define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
|
||||
#define ZERO_PSI
|
||||
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
|
||||
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf
|
||||
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf
|
||||
#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf
|
||||
#define TP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TP_PROJ_A64FXf
|
||||
#define XM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XM_PROJ_A64FXf
|
||||
#define YM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YM_PROJ_A64FXf
|
||||
#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZM_PROJ_A64FXf
|
||||
#define TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf
|
||||
#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
|
||||
#define XP_PROJ XP_PROJ_A64FXf
|
||||
#define YP_PROJ YP_PROJ_A64FXf
|
||||
#define ZP_PROJ ZP_PROJ_A64FXf
|
||||
#define TP_PROJ TP_PROJ_A64FXf
|
||||
#define XM_PROJ XM_PROJ_A64FXf
|
||||
#define YM_PROJ YM_PROJ_A64FXf
|
||||
#define ZM_PROJ ZM_PROJ_A64FXf
|
||||
#define TM_PROJ TM_PROJ_A64FXf
|
||||
#define XP_RECON XP_RECON_A64FXf
|
||||
#define XM_RECON XM_RECON_A64FXf
|
||||
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf
|
||||
@ -67,10 +63,13 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf
|
||||
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf
|
||||
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf
|
||||
#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXf; }
|
||||
#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXf; }
|
||||
#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXf; }
|
||||
#define PERMUTE_DIR3 LOAD_TABLE3; if (perm) { PERM3_A64FXf; }
|
||||
#define PERMUTE_DIR0 0
|
||||
#define PERMUTE_DIR1 1
|
||||
#define PERMUTE_DIR2 2
|
||||
#define PERMUTE_DIR3 3
|
||||
#define PERMUTE PERMUTE_A64FXf;
|
||||
#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1 } else if (Dir == 2) { LOAD_TABLE2; } else if (Dir == 3) { LOAD_TABLE3; }
|
||||
#define MAYBEPERM(A,perm) if (perm) { PERMUTE; }
|
||||
// DECLARATIONS
|
||||
#define DECLARATIONS_A64FXf \
|
||||
const uint32_t lut[4][16] = { \
|
||||
@ -281,50 +280,8 @@ asm ( \
|
||||
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// PERM0
|
||||
#define PERM0_A64FXf \
|
||||
asm ( \
|
||||
"tbl z12.s, { z12.s }, z30.s \n\t" \
|
||||
"tbl z13.s, { z13.s }, z30.s \n\t" \
|
||||
"tbl z14.s, { z14.s }, z30.s \n\t" \
|
||||
"tbl z15.s, { z15.s }, z30.s \n\t" \
|
||||
"tbl z16.s, { z16.s }, z30.s \n\t" \
|
||||
"tbl z17.s, { z17.s }, z30.s \n\t" \
|
||||
: \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// PERM1
|
||||
#define PERM1_A64FXf \
|
||||
asm ( \
|
||||
"tbl z12.s, { z12.s }, z30.s \n\t" \
|
||||
"tbl z13.s, { z13.s }, z30.s \n\t" \
|
||||
"tbl z14.s, { z14.s }, z30.s \n\t" \
|
||||
"tbl z15.s, { z15.s }, z30.s \n\t" \
|
||||
"tbl z16.s, { z16.s }, z30.s \n\t" \
|
||||
"tbl z17.s, { z17.s }, z30.s \n\t" \
|
||||
: \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// PERM2
|
||||
#define PERM2_A64FXf \
|
||||
asm ( \
|
||||
"tbl z12.s, { z12.s }, z30.s \n\t" \
|
||||
"tbl z13.s, { z13.s }, z30.s \n\t" \
|
||||
"tbl z14.s, { z14.s }, z30.s \n\t" \
|
||||
"tbl z15.s, { z15.s }, z30.s \n\t" \
|
||||
"tbl z16.s, { z16.s }, z30.s \n\t" \
|
||||
"tbl z17.s, { z17.s }, z30.s \n\t" \
|
||||
: \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// PERM3
|
||||
#define PERM3_A64FXf \
|
||||
// PERMUTE
|
||||
#define PERMUTE_A64FXf \
|
||||
asm ( \
|
||||
"tbl z12.s, { z12.s }, z30.s \n\t" \
|
||||
"tbl z13.s, { z13.s }, z30.s \n\t" \
|
||||
@ -355,7 +312,7 @@ asm ( \
|
||||
); \
|
||||
}
|
||||
// MULT_2SPIN
|
||||
#define MULT_2SPIN_A64FXf(A) \
|
||||
#define MULT_2SPIN_1_A64FXf(A) \
|
||||
{ \
|
||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||
asm ( \
|
||||
@ -386,6 +343,15 @@ asm ( \
|
||||
"ldr z24, [%[fetchptr], -4, mul vl] \n\t" \
|
||||
"ldr z25, [%[fetchptr], -1, mul vl] \n\t" \
|
||||
"ldr z26, [%[fetchptr], 2, mul vl] \n\t" \
|
||||
: \
|
||||
: [fetchptr] "r" (baseU + 2 * 3 * 64) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
); \
|
||||
}
|
||||
// MULT_2SPIN_BACKEND
|
||||
#define MULT_2SPIN_2_A64FXf \
|
||||
{ \
|
||||
asm ( \
|
||||
"fcmla z18.s, p5/m, z27.s, z13.s, 0 \n\t" \
|
||||
"fcmla z21.s, p5/m, z27.s, z16.s, 0 \n\t" \
|
||||
"fcmla z19.s, p5/m, z28.s, z13.s, 0 \n\t" \
|
||||
@ -411,15 +377,14 @@ asm ( \
|
||||
"fcmla z20.s, p5/m, z26.s, z14.s, 90 \n\t" \
|
||||
"fcmla z23.s, p5/m, z26.s, z17.s, 90 \n\t" \
|
||||
: \
|
||||
: [fetchptr] "r" (baseU + 2 * 3 * 64) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
); \
|
||||
}
|
||||
// XP_PROJ
|
||||
#define XP_PROJ_A64FXf \
|
||||
{ \
|
||||
asm ( \
|
||||
"ptrue p5.s \n\t" \
|
||||
"fcadd z12.s, p5/m, z12.s, z21.s, 90 \n\t" \
|
||||
"fcadd z13.s, p5/m, z13.s, z22.s, 90 \n\t" \
|
||||
"fcadd z14.s, p5/m, z14.s, z23.s, 90 \n\t" \
|
||||
|
@ -25,7 +25,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#define LOAD_CHIMU_A64FXd(x) LOAD_CHIMU_INTERLEAVED_A64FXd(x)
|
||||
#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXd(base)
|
||||
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A)
|
||||
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)
|
||||
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A)
|
||||
@ -40,23 +40,19 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
#define MASK_REGS DECLARATIONS_A64FXd
|
||||
#define COMPLEX_SIGNS(A)
|
||||
#define LOAD64(A,B)
|
||||
#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B);
|
||||
#define MULT_2SPIN_DIR_PF(A,B) \
|
||||
MULT_2SPIN_A64FXd(A); \
|
||||
PREFETCH_CHIMU_L2(B); \
|
||||
if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } }
|
||||
#define MAYBEPERM(A,perm) { A ; }
|
||||
#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B)
|
||||
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir)
|
||||
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd
|
||||
#define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
|
||||
#define ZERO_PSI
|
||||
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
|
||||
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd
|
||||
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd
|
||||
#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd
|
||||
#define TP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TP_PROJ_A64FXd
|
||||
#define XM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XM_PROJ_A64FXd
|
||||
#define YM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YM_PROJ_A64FXd
|
||||
#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZM_PROJ_A64FXd
|
||||
#define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd
|
||||
#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
|
||||
#define XP_PROJ XP_PROJ_A64FXd
|
||||
#define YP_PROJ YP_PROJ_A64FXd
|
||||
#define ZP_PROJ ZP_PROJ_A64FXd
|
||||
#define TP_PROJ TP_PROJ_A64FXd
|
||||
#define XM_PROJ XM_PROJ_A64FXd
|
||||
#define YM_PROJ YM_PROJ_A64FXd
|
||||
#define ZM_PROJ ZM_PROJ_A64FXd
|
||||
#define TM_PROJ TM_PROJ_A64FXd
|
||||
#define XP_RECON XP_RECON_A64FXd
|
||||
#define XM_RECON XM_RECON_A64FXd
|
||||
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd
|
||||
@ -67,10 +63,13 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd
|
||||
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd
|
||||
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd
|
||||
#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXd; }
|
||||
#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXd; }
|
||||
#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXd; }
|
||||
#define PERMUTE_DIR3
|
||||
#define PERMUTE_DIR0 0
|
||||
#define PERMUTE_DIR1 1
|
||||
#define PERMUTE_DIR2 2
|
||||
#define PERMUTE_DIR3 3
|
||||
#define PERMUTE PERMUTE_A64FXd;
|
||||
#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; }
|
||||
#define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } }
|
||||
// DECLARATIONS
|
||||
#define DECLARATIONS_A64FXd \
|
||||
const uint64_t lut[4][8] = { \
|
||||
@ -254,8 +253,8 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
#define LOAD_TABLE3 \
|
||||
table0 = svld1(pg1, (uint64_t*)&lut[3]);
|
||||
|
||||
// PERM0
|
||||
#define PERM0_A64FXd \
|
||||
// PERMUTE
|
||||
#define PERMUTE_A64FXd \
|
||||
Chi_00 = svtbl(Chi_00, table0); \
|
||||
Chi_01 = svtbl(Chi_01, table0); \
|
||||
Chi_02 = svtbl(Chi_02, table0); \
|
||||
@ -263,27 +262,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
Chi_11 = svtbl(Chi_11, table0); \
|
||||
Chi_12 = svtbl(Chi_12, table0);
|
||||
|
||||
// PERM1
|
||||
#define PERM1_A64FXd \
|
||||
Chi_00 = svtbl(Chi_00, table0); \
|
||||
Chi_01 = svtbl(Chi_01, table0); \
|
||||
Chi_02 = svtbl(Chi_02, table0); \
|
||||
Chi_10 = svtbl(Chi_10, table0); \
|
||||
Chi_11 = svtbl(Chi_11, table0); \
|
||||
Chi_12 = svtbl(Chi_12, table0);
|
||||
|
||||
// PERM2
|
||||
#define PERM2_A64FXd \
|
||||
Chi_00 = svtbl(Chi_00, table0); \
|
||||
Chi_01 = svtbl(Chi_01, table0); \
|
||||
Chi_02 = svtbl(Chi_02, table0); \
|
||||
Chi_10 = svtbl(Chi_10, table0); \
|
||||
Chi_11 = svtbl(Chi_11, table0); \
|
||||
Chi_12 = svtbl(Chi_12, table0);
|
||||
|
||||
// PERM3
|
||||
#define PERM3_A64FXd
|
||||
|
||||
// LOAD_GAUGE
|
||||
#define LOAD_GAUGE \
|
||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||
@ -296,7 +274,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
|
||||
}
|
||||
// MULT_2SPIN
|
||||
#define MULT_2SPIN_A64FXd(A) \
|
||||
#define MULT_2SPIN_1_A64FXd(A) \
|
||||
{ \
|
||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||
U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \
|
||||
@ -320,6 +298,10 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \
|
||||
U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \
|
||||
U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \
|
||||
}
|
||||
// MULT_2SPIN_BACKEND
|
||||
#define MULT_2SPIN_2_A64FXd \
|
||||
{ \
|
||||
UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \
|
||||
UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \
|
||||
UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \
|
||||
|
@ -25,7 +25,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#define LOAD_CHIMU_A64FXf(x) LOAD_CHIMU_INTERLEAVED_A64FXf(x)
|
||||
#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXf(base)
|
||||
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A)
|
||||
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)
|
||||
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A)
|
||||
@ -40,23 +40,19 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
#define MASK_REGS DECLARATIONS_A64FXf
|
||||
#define COMPLEX_SIGNS(A)
|
||||
#define LOAD64(A,B)
|
||||
#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B);
|
||||
#define MULT_2SPIN_DIR_PF(A,B) \
|
||||
MULT_2SPIN_A64FXf(A); \
|
||||
PREFETCH_CHIMU_L2(B); \
|
||||
if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } }
|
||||
#define MAYBEPERM(A,perm) { A ; }
|
||||
#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B)
|
||||
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir)
|
||||
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf
|
||||
#define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
|
||||
#define ZERO_PSI
|
||||
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
|
||||
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf
|
||||
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf
|
||||
#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf
|
||||
#define TP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TP_PROJ_A64FXf
|
||||
#define XM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XM_PROJ_A64FXf
|
||||
#define YM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YM_PROJ_A64FXf
|
||||
#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZM_PROJ_A64FXf
|
||||
#define TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf
|
||||
#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
|
||||
#define XP_PROJ XP_PROJ_A64FXf
|
||||
#define YP_PROJ YP_PROJ_A64FXf
|
||||
#define ZP_PROJ ZP_PROJ_A64FXf
|
||||
#define TP_PROJ TP_PROJ_A64FXf
|
||||
#define XM_PROJ XM_PROJ_A64FXf
|
||||
#define YM_PROJ YM_PROJ_A64FXf
|
||||
#define ZM_PROJ ZM_PROJ_A64FXf
|
||||
#define TM_PROJ TM_PROJ_A64FXf
|
||||
#define XP_RECON XP_RECON_A64FXf
|
||||
#define XM_RECON XM_RECON_A64FXf
|
||||
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf
|
||||
@ -67,10 +63,13 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf
|
||||
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf
|
||||
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf
|
||||
#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXf; }
|
||||
#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXf; }
|
||||
#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXf; }
|
||||
#define PERMUTE_DIR3 LOAD_TABLE3; if (perm) { PERM3_A64FXf; }
|
||||
#define PERMUTE_DIR0 0
|
||||
#define PERMUTE_DIR1 1
|
||||
#define PERMUTE_DIR2 2
|
||||
#define PERMUTE_DIR3 3
|
||||
#define PERMUTE PERMUTE_A64FXf;
|
||||
#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1 } else if (Dir == 2) { LOAD_TABLE2; } else if (Dir == 3) { LOAD_TABLE3; }
|
||||
#define MAYBEPERM(A,perm) if (perm) { PERMUTE; }
|
||||
// DECLARATIONS
|
||||
#define DECLARATIONS_A64FXf \
|
||||
const uint32_t lut[4][16] = { \
|
||||
@ -254,35 +253,8 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
#define LOAD_TABLE3 \
|
||||
table0 = svld1(pg1, (uint32_t*)&lut[3]);
|
||||
|
||||
// PERM0
|
||||
#define PERM0_A64FXf \
|
||||
Chi_00 = svtbl(Chi_00, table0); \
|
||||
Chi_01 = svtbl(Chi_01, table0); \
|
||||
Chi_02 = svtbl(Chi_02, table0); \
|
||||
Chi_10 = svtbl(Chi_10, table0); \
|
||||
Chi_11 = svtbl(Chi_11, table0); \
|
||||
Chi_12 = svtbl(Chi_12, table0);
|
||||
|
||||
// PERM1
|
||||
#define PERM1_A64FXf \
|
||||
Chi_00 = svtbl(Chi_00, table0); \
|
||||
Chi_01 = svtbl(Chi_01, table0); \
|
||||
Chi_02 = svtbl(Chi_02, table0); \
|
||||
Chi_10 = svtbl(Chi_10, table0); \
|
||||
Chi_11 = svtbl(Chi_11, table0); \
|
||||
Chi_12 = svtbl(Chi_12, table0);
|
||||
|
||||
// PERM2
|
||||
#define PERM2_A64FXf \
|
||||
Chi_00 = svtbl(Chi_00, table0); \
|
||||
Chi_01 = svtbl(Chi_01, table0); \
|
||||
Chi_02 = svtbl(Chi_02, table0); \
|
||||
Chi_10 = svtbl(Chi_10, table0); \
|
||||
Chi_11 = svtbl(Chi_11, table0); \
|
||||
Chi_12 = svtbl(Chi_12, table0);
|
||||
|
||||
// PERM3
|
||||
#define PERM3_A64FXf \
|
||||
// PERMUTE
|
||||
#define PERMUTE_A64FXf \
|
||||
Chi_00 = svtbl(Chi_00, table0); \
|
||||
Chi_01 = svtbl(Chi_01, table0); \
|
||||
Chi_02 = svtbl(Chi_02, table0); \
|
||||
@ -302,7 +274,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
|
||||
}
|
||||
// MULT_2SPIN
|
||||
#define MULT_2SPIN_A64FXf(A) \
|
||||
#define MULT_2SPIN_1_A64FXf(A) \
|
||||
{ \
|
||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||
U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \
|
||||
@ -326,6 +298,10 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \
|
||||
U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \
|
||||
U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \
|
||||
}
|
||||
// MULT_2SPIN_BACKEND
|
||||
#define MULT_2SPIN_2_A64FXf \
|
||||
{ \
|
||||
UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \
|
||||
UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \
|
||||
UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \
|
||||
|
@ -26,14 +26,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#undef LOAD_CHIMU_A64FXd
|
||||
#undef LOAD_CHIMU_A64FXf
|
||||
#undef LOAD_CHIMU
|
||||
#undef PREFETCH_CHIMU_L1
|
||||
#undef PREFETCH_GAUGE_L1
|
||||
#undef PREFETCH_CHIMU_L2
|
||||
#undef PREFETCH_GAUGE_L2
|
||||
#undef PREFETCH_GAUGE_L1_INTERNAL
|
||||
#undef PF_GAUGE
|
||||
#undef PREFETCH1_CHIMU
|
||||
#undef PREFETCH_CHIMU
|
||||
#undef PREFETCH_RESULT_L2_STORE
|
||||
@ -42,22 +40,20 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
#undef LOCK_GAUGE
|
||||
#undef UNLOCK_GAUGE
|
||||
#undef MASK_REGS
|
||||
#undef COMPLEX_SIGNS
|
||||
#undef LOAD64
|
||||
#undef SAVE_RESULT
|
||||
#undef ADD_RESULT
|
||||
#undef MULT_2SPIN_DIR_PF
|
||||
#undef MULT_2SPIN_1
|
||||
#undef MULT_2SPIN_2
|
||||
#undef MAYBEPERM
|
||||
#undef LOAD_CHI
|
||||
#undef ZERO_PSI
|
||||
#undef XP_PROJMEM
|
||||
#undef YP_PROJMEM
|
||||
#undef ZP_PROJMEM
|
||||
#undef TP_PROJMEM
|
||||
#undef XM_PROJMEM
|
||||
#undef YM_PROJMEM
|
||||
#undef ZM_PROJMEM
|
||||
#undef TM_PROJMEM
|
||||
#undef XP_PROJ
|
||||
#undef YP_PROJ
|
||||
#undef ZP_PROJ
|
||||
#undef TP_PROJ
|
||||
#undef XM_PROJ
|
||||
#undef YM_PROJ
|
||||
#undef ZM_PROJ
|
||||
#undef TM_PROJ
|
||||
#undef XP_RECON
|
||||
#undef XM_RECON
|
||||
#undef XM_RECON_ACCUM
|
||||
@ -68,10 +64,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
#undef YP_RECON_ACCUM
|
||||
#undef ZP_RECON_ACCUM
|
||||
#undef TP_RECON_ACCUM
|
||||
#undef PERMUTE
|
||||
#undef PERMUTE_DIR0
|
||||
#undef PERMUTE_DIR1
|
||||
#undef PERMUTE_DIR2
|
||||
#undef PERMUTE_DIR3
|
||||
#undef LOAD_TABLE
|
||||
#undef LOAD_TABLE0
|
||||
#undef LOAD_TABLE1
|
||||
#undef LOAD_TABLE2
|
||||
|
@ -115,37 +115,9 @@ STORE_BASE_PTR_COLOR_OFFSET = 2
|
||||
# 256 bytes * 2.2 GHz = 563.2 GB/s (base 10), 524 GB/s (base 2)
|
||||
|
||||
OPT = """
|
||||
#ifdef INTERIOR
|
||||
|
||||
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||
basep = st.GetPFInfo(nent,plocal); nent++; \
|
||||
if ( local ) { \
|
||||
-- LOAD64(%r10,isigns); \
|
||||
PROJ(base); \
|
||||
++ PF_GAUGE(Dir); \
|
||||
MAYBEPERM(PERMUTE_DIR,perm); \
|
||||
} else if ( st.same_node[Dir] ) {
|
||||
LOAD_CHI(base);
|
||||
++ PF_GAUGE(Dir);
|
||||
} \
|
||||
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
|
||||
if ( local || st.same_node[Dir] ) { \
|
||||
MULT_2SPIN_DIR_PF(Dir,basep); \
|
||||
PREFETCH_CHIMU(base); \
|
||||
-- LOAD64(%r10,isigns); \
|
||||
RECON; \
|
||||
} else { PREFETCH_CHIMU(base); }
|
||||
|
||||
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||
-- PF_GAUGE(Xp); \
|
||||
PREFETCH1_CHIMU(base); \
|
||||
{ ZERO_PSI; } \
|
||||
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
|
||||
|
||||
#define RESULT(base,basep) SAVE_RESULT(base,basep);
|
||||
|
||||
#endif
|
||||
* interleave prefetching and compute in MULT_2SPIN
|
||||
* could test storing U's in MULT_2SPIN to L1d, might be beneficial for life time cache lines
|
||||
* structure reordering: MAYBEPERM after MULT_2SPIN ?
|
||||
"""
|
||||
|
||||
filename = 'XXX'
|
||||
@ -905,7 +877,8 @@ if ALTERNATIVE_LOADS == True:
|
||||
define(F'LOAD_CHIMU_0312_PLUG LOAD_CHIMU_0312_{PRECSUFFIX}')
|
||||
define(F'LOAD_CHIMU(x)')
|
||||
else:
|
||||
define(F'LOAD_CHIMU_{PRECSUFFIX}(x) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(x)')
|
||||
#define(F'LOAD_CHIMU_{PRECSUFFIX}(x) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(x)')
|
||||
define(F'LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(base)')
|
||||
|
||||
if PREFETCH:
|
||||
define(F'PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_{PRECSUFFIX}(A)')
|
||||
@ -935,39 +908,22 @@ define(F'UNLOCK_GAUGE(A)')
|
||||
define(F'MASK_REGS DECLARATIONS_{PRECSUFFIX}')
|
||||
define(F'COMPLEX_SIGNS(A)')
|
||||
define(F'LOAD64(A,B)')
|
||||
# prefetch chimu here is useless, because already done in last leg
|
||||
#define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A);')
|
||||
define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_RESULT_L2_STORE(B);')
|
||||
if PREFETCH:
|
||||
definemultiline(F'MULT_2SPIN_DIR_PF(A,B) ')
|
||||
write (F' MULT_2SPIN_{PRECSUFFIX}(A); \\')
|
||||
write (F' PREFETCH_CHIMU_L2(B); \\')
|
||||
write (F' if (s == 0) {{ if ((A == 0) || (A == 4)) {{ PREFETCH_GAUGE_L2(A); }} }}')
|
||||
|
||||
# definemultiline(F'MULT_2SPIN_DIR_PF(A,B) PREFETCH_GAUGE_L1(A);')
|
||||
# write (F' PREFETCH_CHIMU_L2(B); \\')
|
||||
# write (F' MULT_2SPIN_{PRECSUFFIX}(A); \\')
|
||||
# write (F' if (s == 0) {{ if ((A == 0) || (A == 4)) {{ PREFETCH_GAUGE_L2(A); }} }}')
|
||||
newline()
|
||||
else:
|
||||
define(F'MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_{PRECSUFFIX}(A)')
|
||||
# break out maybeperm in permutes
|
||||
#define(F'MAYBEPERM(A,perm) if (perm) {{ A ; }}')
|
||||
define(F'MAYBEPERM(A,perm) {{ A ; }}')
|
||||
define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_RESULT_L2_STORE(B)')
|
||||
define(F'MULT_2SPIN_1(Dir) MULT_2SPIN_1_{PRECSUFFIX}(Dir)')
|
||||
define(F'MULT_2SPIN_2 MULT_2SPIN_2_{PRECSUFFIX}')
|
||||
define(F'LOAD_CHI(base) LOAD_CHI_{PRECSUFFIX}(base)')
|
||||
# don't need zero psi, everything is done in recons
|
||||
#define(F'ZERO_PSI ZERO_PSI_{PRECSUFFIX}')
|
||||
define(F'ZERO_PSI')
|
||||
define(F'ADD_RESULT(base,basep) LOAD_CHIMU_{PRECSUFFIX}(base); ADD_RESULT_INTERNAL_{PRECSUFFIX}; RESULT_{PRECSUFFIX}(base)')
|
||||
define(F'ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_{PRECSUFFIX}; RESULT_{PRECSUFFIX}(base)')
|
||||
# loads projections
|
||||
define(F'XP_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); XP_PROJ_{PRECSUFFIX}')
|
||||
define(F'YP_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); YP_PROJ_{PRECSUFFIX}')
|
||||
define(F'ZP_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); ZP_PROJ_{PRECSUFFIX}')
|
||||
define(F'TP_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); TP_PROJ_{PRECSUFFIX}')
|
||||
define(F'XM_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); XM_PROJ_{PRECSUFFIX}')
|
||||
define(F'YM_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); YM_PROJ_{PRECSUFFIX}')
|
||||
define(F'ZM_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); ZM_PROJ_{PRECSUFFIX}')
|
||||
define(F'TM_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); TM_PROJ_{PRECSUFFIX}')
|
||||
define(F'XP_PROJ XP_PROJ_{PRECSUFFIX}')
|
||||
define(F'YP_PROJ YP_PROJ_{PRECSUFFIX}')
|
||||
define(F'ZP_PROJ ZP_PROJ_{PRECSUFFIX}')
|
||||
define(F'TP_PROJ TP_PROJ_{PRECSUFFIX}')
|
||||
define(F'XM_PROJ XM_PROJ_{PRECSUFFIX}')
|
||||
define(F'YM_PROJ YM_PROJ_{PRECSUFFIX}')
|
||||
define(F'ZM_PROJ ZM_PROJ_{PRECSUFFIX}')
|
||||
define(F'TM_PROJ TM_PROJ_{PRECSUFFIX}')
|
||||
# recons
|
||||
define(F'XP_RECON XP_RECON_{PRECSUFFIX}')
|
||||
define(F'XM_RECON XM_RECON_{PRECSUFFIX}')
|
||||
@ -979,14 +935,21 @@ define(F'XP_RECON_ACCUM XP_RECON_ACCUM_{PRECSUFFIX}')
|
||||
define(F'YP_RECON_ACCUM YP_RECON_ACCUM_{PRECSUFFIX}')
|
||||
define(F'ZP_RECON_ACCUM ZP_RECON_ACCUM_{PRECSUFFIX}')
|
||||
define(F'TP_RECON_ACCUM TP_RECON_ACCUM_{PRECSUFFIX}')
|
||||
# permutes
|
||||
define(F'PERMUTE_DIR0 LOAD_TABLE0; if (perm) {{ PERM0_{PRECSUFFIX}; }}')
|
||||
define(F'PERMUTE_DIR1 LOAD_TABLE1; if (perm) {{ PERM1_{PRECSUFFIX}; }}')
|
||||
define(F'PERMUTE_DIR2 LOAD_TABLE2; if (perm) {{ PERM2_{PRECSUFFIX}; }}')
|
||||
# new permutes
|
||||
define(F'PERMUTE_DIR0 0')
|
||||
define(F'PERMUTE_DIR1 1')
|
||||
define(F'PERMUTE_DIR2 2')
|
||||
define(F'PERMUTE_DIR3 3')
|
||||
define(F'PERMUTE PERMUTE_{PRECSUFFIX};')
|
||||
# load table
|
||||
#define(F'MAYBEPERM(A,perm) if (perm) {{ A ; }}')
|
||||
if PRECISION == 'double':
|
||||
define(F'PERMUTE_DIR3')
|
||||
define(F'LOAD_TABLE(Dir) if (Dir == 0) {{ LOAD_TABLE0; }} else if (Dir == 1) {{ LOAD_TABLE1; }} else if (Dir == 2) {{ LOAD_TABLE2; }}')
|
||||
define(F'MAYBEPERM(Dir,perm) if (Dir != 3) {{ if (perm) {{ PERMUTE; }} }}')
|
||||
else:
|
||||
define(F'PERMUTE_DIR3 LOAD_TABLE3; if (perm) {{ PERM3_{PRECSUFFIX}; }}')
|
||||
define(F'LOAD_TABLE(Dir) if (Dir == 0) {{ LOAD_TABLE0; }} else if (Dir == 1) {{ LOAD_TABLE1 }} else if (Dir == 2) {{ LOAD_TABLE2; }} else if (Dir == 3) {{ LOAD_TABLE3; }}')
|
||||
define(F'MAYBEPERM(A,perm) if (perm) {{ PERMUTE; }}')
|
||||
|
||||
|
||||
|
||||
write('// DECLARATIONS')
|
||||
@ -1040,20 +1003,14 @@ U_01.declare()
|
||||
U_11.declare()
|
||||
U_21.declare() # 6 -> 30 regs
|
||||
|
||||
# all true
|
||||
# all predications true
|
||||
pg1.declare()
|
||||
if PRECISION == 'double':
|
||||
pg1.movestr('svptrue_b64()')
|
||||
else:
|
||||
pg1.movestr('svptrue_b32()')
|
||||
|
||||
# even elements only
|
||||
#pg2.declare()
|
||||
#pg2.movestr('svzip1_b64(svptrue_b64(), svpfalse_b())')
|
||||
|
||||
# preload tables
|
||||
# 0: swap
|
||||
# 1: permute 1
|
||||
# tables
|
||||
if PRECISION == 'double':
|
||||
write(' svuint64_t table0; \\', target='I') # -> 31 regs
|
||||
else:
|
||||
@ -1061,10 +1018,10 @@ else:
|
||||
|
||||
zero0.declare()
|
||||
|
||||
# zero register
|
||||
asmopen()
|
||||
zero0.zero(zeroreg=True)
|
||||
asmclose()
|
||||
|
||||
newline()
|
||||
|
||||
define('Chimu_00 Chi_00', target='I')
|
||||
@ -1087,7 +1044,6 @@ else: # wilson4.h
|
||||
define('Chimu_30 U_01', target='I')
|
||||
define('Chimu_31 U_11', target='I')
|
||||
define('Chimu_32 U_21', target='I')
|
||||
|
||||
newline()
|
||||
|
||||
|
||||
@ -1380,47 +1336,11 @@ table0.loadtable(3)
|
||||
asmclose()
|
||||
newline()
|
||||
|
||||
# 8 directions = 6x permutations
|
||||
d['factor'] = 2 # factor is 0
|
||||
d['cycles_PERM'] += 6 * d['factor']
|
||||
write('// PERM0')
|
||||
definemultiline(F'PERM0_{PRECSUFFIX}')
|
||||
debugall('PERM0 PRE', group='Chi')
|
||||
asmopen()
|
||||
#table0.loadtable(0)
|
||||
Chi_00.permute(0, table0)
|
||||
Chi_01.permute(0, table0)
|
||||
Chi_02.permute(0, table0)
|
||||
Chi_10.permute(0, table0)
|
||||
Chi_11.permute(0, table0)
|
||||
Chi_12.permute(0, table0)
|
||||
asmclose()
|
||||
debugall('PERM0 POST', group='Chi')
|
||||
newline()
|
||||
|
||||
d['factor'] = 2 # factor is 2
|
||||
d['cycles_PERM'] += 6 * d['factor']
|
||||
write('// PERM1')
|
||||
definemultiline(F'PERM1_{PRECSUFFIX}')
|
||||
debugall('PERM1 PRE', group='Chi')
|
||||
asmopen()
|
||||
#table0.loadtable(1)
|
||||
Chi_00.permute(1, table0)
|
||||
Chi_01.permute(1, table0)
|
||||
Chi_02.permute(1, table0)
|
||||
Chi_10.permute(1, table0)
|
||||
Chi_11.permute(1, table0)
|
||||
Chi_12.permute(1, table0)
|
||||
asmclose()
|
||||
debugall('PERM1 POST', group='Chi')
|
||||
newline()
|
||||
|
||||
d['factor'] = 2 # factor is 2
|
||||
# PERM2 = swap real and imaginary
|
||||
d['cycles_PERM'] += 6 * d['factor']
|
||||
write('// PERM2')
|
||||
definemultiline(F'PERM2_{PRECSUFFIX}')
|
||||
debugall('PERM2 PRE', group='Chi')
|
||||
write('// PERMUTE')
|
||||
definemultiline(F'PERMUTE_{PRECSUFFIX}')
|
||||
debugall('PERM PRE', group='Chi')
|
||||
asmopen()
|
||||
#table0.loadtable(2)
|
||||
Chi_00.permute(2, table0)
|
||||
@ -1430,26 +1350,7 @@ Chi_10.permute(2, table0)
|
||||
Chi_11.permute(2, table0)
|
||||
Chi_12.permute(2, table0)
|
||||
asmclose()
|
||||
debugall('PERM2 POST', group='Chi')
|
||||
newline()
|
||||
|
||||
# PERM3 = identity (DP), so exclude from counting
|
||||
d['factor'] = 0
|
||||
d['cycles_PERM'] += 6 * d['factor']
|
||||
write('// PERM3')
|
||||
definemultiline(F'PERM3_{PRECSUFFIX}')
|
||||
if PRECISION == 'single':
|
||||
debugall('PERM3 PRE', group='Chi')
|
||||
asmopen()
|
||||
#table0.loadtable(3)
|
||||
Chi_00.permute(3, table0)
|
||||
Chi_01.permute(3, table0)
|
||||
Chi_02.permute(3, table0)
|
||||
Chi_10.permute(3, table0)
|
||||
Chi_11.permute(3, table0)
|
||||
Chi_12.permute(3, table0)
|
||||
asmclose()
|
||||
debugall('PERM3 POST', group='Chi')
|
||||
debugall('PERM POST', group='Chi')
|
||||
newline()
|
||||
|
||||
write('// LOAD_GAUGE')
|
||||
@ -1473,7 +1374,7 @@ if ASM_LOAD_GAUGE:
|
||||
asmclose()
|
||||
curlyclose()
|
||||
newline()
|
||||
# XXXXXX remove loads
|
||||
|
||||
d['factor'] = 8 # MULT_2SPIN executes 1 time per direction = 8 times total
|
||||
# assume all U loads are hidden
|
||||
# FCMLA issue latency = 2 cycles
|
||||
@ -1482,7 +1383,7 @@ d['factor'] = 8 # MULT_2SPIN executes 1 time per direction = 8 times total
|
||||
# 6 rounds of FCMLA, each with 6 FCMLA -> 21 - 6*2 = 9
|
||||
d['cycles_MULT_2SPIN'] += 6 * 21 * d['factor']
|
||||
write('// MULT_2SPIN')
|
||||
definemultiline(F'MULT_2SPIN_{PRECSUFFIX}(A)')
|
||||
definemultiline(F'MULT_2SPIN_1_{PRECSUFFIX}(A)')
|
||||
curlyopen()
|
||||
#write(' const auto & ref(U[sU][A]); \\')
|
||||
if GRIDBENCH: # referencing differs in Grid and GridBench
|
||||
@ -1541,7 +1442,15 @@ if ASM_LOAD_GAUGE:
|
||||
U_00.load("ref[0][2]") # U_00, U_10, U_20 overloaded
|
||||
U_10.load("ref[1][2]") # early load
|
||||
U_20.load("ref[2][2]") # A -->
|
||||
asmclose()
|
||||
debugall('MULT_2SPIN_1', group='UChi')
|
||||
curlyclose()
|
||||
newline()
|
||||
|
||||
write('// MULT_2SPIN_BACKEND')
|
||||
definemultiline(F'MULT_2SPIN_2_{PRECSUFFIX}')
|
||||
curlyopen()
|
||||
asmopen()
|
||||
# round 3
|
||||
UChi_00.mac0(U_01, Chi_01) # armclang separates fcmla(..., 0) and
|
||||
UChi_10.mac0(U_01, Chi_11) # fcmla(..., 90)
|
||||
@ -1571,7 +1480,7 @@ UChi_11.mac1(U_10, Chi_12)
|
||||
UChi_02.mac1(U_20, Chi_02)
|
||||
UChi_12.mac1(U_20, Chi_12)
|
||||
asmclose()
|
||||
debugall('MULT_2SPIN', group='UChi')
|
||||
debugall('MULT_2SPIN_2', group='UChi')
|
||||
curlyclose()
|
||||
newline()
|
||||
|
||||
@ -1587,7 +1496,7 @@ if ALTERNATIVE_LOADS == True:
|
||||
write(' LOAD_CHIMU_0312_PLUG \\')
|
||||
curlyopen()
|
||||
asmopen()
|
||||
pg1.loadpredication()
|
||||
#pg1.loadpredication()
|
||||
Chi_00.addTimesI(Chimu_00, Chimu_30)
|
||||
Chi_01.addTimesI(Chimu_01, Chimu_31)
|
||||
Chi_02.addTimesI(Chimu_02, Chimu_32)
|
||||
|
Loading…
Reference in New Issue
Block a user