mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-09 21:50:45 +01:00
revised BodyA64FX; 990 GiB/s Wilson, 687 GiB/s DW using intrinsics (armclang 20.0)
This commit is contained in:
parent
852db4626a
commit
6fdce60492
@ -26,14 +26,14 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
#ifdef KERNEL_DAG
|
#ifdef KERNEL_DAG
|
||||||
#define DIR0_PROJMEM(base) XP_PROJMEM(base);
|
#define DIR0_PROJ XP_PROJ
|
||||||
#define DIR1_PROJMEM(base) YP_PROJMEM(base);
|
#define DIR1_PROJ YP_PROJ
|
||||||
#define DIR2_PROJMEM(base) ZP_PROJMEM(base);
|
#define DIR2_PROJ ZP_PROJ
|
||||||
#define DIR3_PROJMEM(base) TP_PROJMEM(base);
|
#define DIR3_PROJ TP_PROJ
|
||||||
#define DIR4_PROJMEM(base) XM_PROJMEM(base);
|
#define DIR4_PROJ XM_PROJ
|
||||||
#define DIR5_PROJMEM(base) YM_PROJMEM(base);
|
#define DIR5_PROJ YM_PROJ
|
||||||
#define DIR6_PROJMEM(base) ZM_PROJMEM(base);
|
#define DIR6_PROJ ZM_PROJ
|
||||||
#define DIR7_PROJMEM(base) TM_PROJMEM(base);
|
#define DIR7_PROJ TM_PROJ
|
||||||
#define DIR0_RECON XP_RECON
|
#define DIR0_RECON XP_RECON
|
||||||
#define DIR1_RECON YP_RECON_ACCUM
|
#define DIR1_RECON YP_RECON_ACCUM
|
||||||
#define DIR2_RECON ZP_RECON_ACCUM
|
#define DIR2_RECON ZP_RECON_ACCUM
|
||||||
@ -43,14 +43,14 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
#define DIR6_RECON ZM_RECON_ACCUM
|
#define DIR6_RECON ZM_RECON_ACCUM
|
||||||
#define DIR7_RECON TM_RECON_ACCUM
|
#define DIR7_RECON TM_RECON_ACCUM
|
||||||
#else
|
#else
|
||||||
#define DIR0_PROJMEM(base) XM_PROJMEM(base);
|
#define DIR0_PROJ XM_PROJ
|
||||||
#define DIR1_PROJMEM(base) YM_PROJMEM(base);
|
#define DIR1_PROJ YM_PROJ
|
||||||
#define DIR2_PROJMEM(base) ZM_PROJMEM(base);
|
#define DIR2_PROJ ZM_PROJ
|
||||||
#define DIR3_PROJMEM(base) TM_PROJMEM(base);
|
#define DIR3_PROJ TM_PROJ
|
||||||
#define DIR4_PROJMEM(base) XP_PROJMEM(base);
|
#define DIR4_PROJ XP_PROJ
|
||||||
#define DIR5_PROJMEM(base) YP_PROJMEM(base);
|
#define DIR5_PROJ YP_PROJ
|
||||||
#define DIR6_PROJMEM(base) ZP_PROJMEM(base);
|
#define DIR6_PROJ ZP_PROJ
|
||||||
#define DIR7_PROJMEM(base) TP_PROJMEM(base);
|
#define DIR7_PROJ TP_PROJ
|
||||||
#define DIR0_RECON XM_RECON
|
#define DIR0_RECON XM_RECON
|
||||||
#define DIR1_RECON YM_RECON_ACCUM
|
#define DIR1_RECON YM_RECON_ACCUM
|
||||||
#define DIR2_RECON ZM_RECON_ACCUM
|
#define DIR2_RECON ZM_RECON_ACCUM
|
||||||
@ -91,23 +91,28 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
|
|
||||||
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||||
basep = st.GetPFInfo(nent,plocal); nent++; \
|
basep = st.GetPFInfo(nent,plocal); nent++; \
|
||||||
if ( local ) { \
|
if ( local ) { \
|
||||||
/* PREFETCH_GAUGE_L1(Dir); slightly worse performance */ \
|
LOAD_CHIMU(base); \
|
||||||
PROJ(base); \
|
LOAD_TABLE(PERMUTE_DIR); \
|
||||||
/* PREFETCH_GAUGE_L1(Dir); slightly worse performance */ \
|
PROJ; \
|
||||||
MAYBEPERM(PERMUTE_DIR,perm); \
|
MAYBEPERM(PERMUTE_DIR,perm); \
|
||||||
} else { \
|
} else { \
|
||||||
LOAD_CHI(base); \
|
LOAD_CHI(base); \
|
||||||
} \
|
} \
|
||||||
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
|
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
|
||||||
PREFETCH_CHIMU(base); \
|
MULT_2SPIN_1(Dir); \
|
||||||
MULT_2SPIN_DIR_PF(Dir,basep); \
|
PREFETCH_CHIMU(base); \
|
||||||
RECON; \
|
PREFETCH_CHIMU_L2(basep); \
|
||||||
|
/* PREFETCH_GAUGE_L1(NxtDir); */ \
|
||||||
|
MULT_2SPIN_2; \
|
||||||
|
if (s == 0) { \
|
||||||
|
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
|
||||||
|
} \
|
||||||
|
RECON; \
|
||||||
|
|
||||||
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||||
PF_GAUGE(Xp); \
|
PREFETCH1_CHIMU(base); \
|
||||||
PREFETCH1_CHIMU(base); \
|
|
||||||
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
|
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
|
||||||
|
|
||||||
#define RESULT(base,basep) SAVE_RESULT(base,basep);
|
#define RESULT(base,basep) SAVE_RESULT(base,basep);
|
||||||
@ -121,22 +126,28 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
|
|
||||||
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||||
basep = st.GetPFInfo(nent,plocal); nent++; \
|
basep = st.GetPFInfo(nent,plocal); nent++; \
|
||||||
if ( local ) { \
|
if ( local ) { \
|
||||||
PROJ(base); \
|
LOAD_CHIMU(base); \
|
||||||
MAYBEPERM(PERMUTE_DIR,perm); \
|
LOAD_TABLE(PERMUTE_DIR); \
|
||||||
}else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \
|
PROJ; \
|
||||||
if ( local || st.same_node[Dir] ) { \
|
MAYBEPERM(PERMUTE_DIR,perm); \
|
||||||
MULT_2SPIN_DIR_PF(Dir,basep); \
|
}else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \
|
||||||
RECON; \
|
|
||||||
} \
|
|
||||||
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
|
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
|
||||||
PREFETCH_CHIMU(base); \
|
if ( local || st.same_node[Dir] ) { \
|
||||||
|
MULT_2SPIN_1(Dir); \
|
||||||
|
PREFETCH_CHIMU(base); \
|
||||||
|
/* PREFETCH_GAUGE_L1(NxtDir); */ \
|
||||||
|
MULT_2SPIN_2; \
|
||||||
|
if (s == 0) { \
|
||||||
|
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
|
||||||
|
} \
|
||||||
|
RECON; \
|
||||||
|
PREFETCH_CHIMU_L2(basep); \
|
||||||
|
} else { PREFETCH_CHIMU(base); } \
|
||||||
|
|
||||||
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||||
PF_GAUGE(Xp); \
|
|
||||||
PREFETCH1_CHIMU(base); \
|
PREFETCH1_CHIMU(base); \
|
||||||
{ ZERO_PSI; } \
|
|
||||||
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
|
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
|
||||||
|
|
||||||
#define RESULT(base,basep) SAVE_RESULT(base,basep);
|
#define RESULT(base,basep) SAVE_RESULT(base,basep);
|
||||||
@ -149,23 +160,34 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
|
|
||||||
|
|
||||||
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||||
if((!local)&&(!st.same_node[Dir]) ) { \
|
if((!local)&&(!st.same_node[Dir]) ) { \
|
||||||
LOAD_CHI(base); \
|
LOAD_CHI(base); \
|
||||||
MULT_2SPIN_DIR_PF(Dir,base); \
|
MULT_2SPIN_1(Dir); \
|
||||||
RECON; \
|
PREFETCH_CHIMU(base); \
|
||||||
nmu++; \
|
/* PREFETCH_GAUGE_L1(NxtDir); */ \
|
||||||
|
MULT_2SPIN_2; \
|
||||||
|
if (s == 0) { \
|
||||||
|
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
|
||||||
|
} \
|
||||||
|
RECON; \
|
||||||
|
nmu++; \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||||
nmu=0; \
|
nmu=0; \
|
||||||
{ ZERO_PSI;} \
|
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;\
|
||||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
if((!local)&&(!st.same_node[Dir]) ) { \
|
||||||
if((!local)&&(!st.same_node[Dir]) ) { \
|
LOAD_CHI(base); \
|
||||||
LOAD_CHI(base); \
|
MULT_2SPIN_1(Dir); \
|
||||||
MULT_2SPIN_DIR_PF(Dir,base); \
|
PREFETCH_CHIMU(base); \
|
||||||
RECON; \
|
/* PREFETCH_GAUGE_L1(NxtDir); */ \
|
||||||
nmu++; \
|
MULT_2SPIN_2; \
|
||||||
|
if (s == 0) { \
|
||||||
|
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
|
||||||
|
} \
|
||||||
|
RECON; \
|
||||||
|
nmu++; \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);}
|
#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);}
|
||||||
@ -201,7 +223,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
|
|
||||||
uint64_t delta_base, delta_base_p;
|
uint64_t delta_base, delta_base_p;
|
||||||
|
|
||||||
ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJMEM,DIR0_RECON);
|
ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJ,DIR0_RECON);
|
||||||
|
|
||||||
#ifdef SHOW
|
#ifdef SHOW
|
||||||
float rescale = 64. * 12.;
|
float rescale = 64. * 12.;
|
||||||
@ -221,7 +243,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
std::cout << "----------------------------------------------------" << std::endl;
|
std::cout << "----------------------------------------------------" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
ASM_LEG(Yp,Zp,PERMUTE_DIR2,DIR1_PROJMEM,DIR1_RECON);
|
ASM_LEG(Yp,Zp,PERMUTE_DIR2,DIR1_PROJ,DIR1_RECON);
|
||||||
|
|
||||||
#ifdef SHOW
|
#ifdef SHOW
|
||||||
std::cout << "Dir = " << Yp << " " << WHERE<< std::endl;
|
std::cout << "Dir = " << Yp << " " << WHERE<< std::endl;
|
||||||
@ -234,7 +256,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
std::cout << "----------------------------------------------------" << std::endl;
|
std::cout << "----------------------------------------------------" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
ASM_LEG(Zp,Tp,PERMUTE_DIR1,DIR2_PROJMEM,DIR2_RECON);
|
ASM_LEG(Zp,Tp,PERMUTE_DIR1,DIR2_PROJ,DIR2_RECON);
|
||||||
|
|
||||||
#ifdef SHOW
|
#ifdef SHOW
|
||||||
std::cout << "Dir = " << Zp << " " << WHERE<< std::endl;
|
std::cout << "Dir = " << Zp << " " << WHERE<< std::endl;
|
||||||
@ -247,7 +269,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
std::cout << "----------------------------------------------------" << std::endl;
|
std::cout << "----------------------------------------------------" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
ASM_LEG(Tp,Xm,PERMUTE_DIR0,DIR3_PROJMEM,DIR3_RECON);
|
ASM_LEG(Tp,Xm,PERMUTE_DIR0,DIR3_PROJ,DIR3_RECON);
|
||||||
|
|
||||||
#ifdef SHOW
|
#ifdef SHOW
|
||||||
std::cout << "Dir = " << Tp << " " << WHERE<< std::endl;
|
std::cout << "Dir = " << Tp << " " << WHERE<< std::endl;
|
||||||
@ -260,7 +282,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
std::cout << "----------------------------------------------------" << std::endl;
|
std::cout << "----------------------------------------------------" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
ASM_LEG(Xm,Ym,PERMUTE_DIR3,DIR4_PROJMEM,DIR4_RECON);
|
ASM_LEG(Xm,Ym,PERMUTE_DIR3,DIR4_PROJ,DIR4_RECON);
|
||||||
|
|
||||||
#ifdef SHOW
|
#ifdef SHOW
|
||||||
std::cout << "Dir = " << Xm << " " << WHERE<< std::endl;
|
std::cout << "Dir = " << Xm << " " << WHERE<< std::endl;
|
||||||
@ -273,7 +295,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
std::cout << "----------------------------------------------------" << std::endl;
|
std::cout << "----------------------------------------------------" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJMEM,DIR5_RECON);
|
ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON);
|
||||||
|
|
||||||
#ifdef SHOW
|
#ifdef SHOW
|
||||||
std::cout << "Dir = " << Ym << " " << WHERE<< std::endl;
|
std::cout << "Dir = " << Ym << " " << WHERE<< std::endl;
|
||||||
@ -286,7 +308,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
std::cout << "----------------------------------------------------" << std::endl;
|
std::cout << "----------------------------------------------------" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJMEM,DIR6_RECON);
|
ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON);
|
||||||
|
|
||||||
#ifdef SHOW
|
#ifdef SHOW
|
||||||
std::cout << "Dir = " << Zm << " " << WHERE<< std::endl;
|
std::cout << "Dir = " << Zm << " " << WHERE<< std::endl;
|
||||||
@ -299,7 +321,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
std::cout << "----------------------------------------------------" << std::endl;
|
std::cout << "----------------------------------------------------" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJMEM,DIR7_RECON);
|
ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON);
|
||||||
|
|
||||||
#ifdef SHOW
|
#ifdef SHOW
|
||||||
std::cout << "Dir = " << Tm << " " << WHERE<< std::endl;
|
std::cout << "Dir = " << Tm << " " << WHERE<< std::endl;
|
||||||
@ -337,14 +359,14 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#undef DIR0_PROJMEM
|
#undef DIR0_PROJ
|
||||||
#undef DIR1_PROJMEM
|
#undef DIR1_PROJ
|
||||||
#undef DIR2_PROJMEM
|
#undef DIR2_PROJ
|
||||||
#undef DIR3_PROJMEM
|
#undef DIR3_PROJ
|
||||||
#undef DIR4_PROJMEM
|
#undef DIR4_PROJ
|
||||||
#undef DIR5_PROJMEM
|
#undef DIR5_PROJ
|
||||||
#undef DIR6_PROJMEM
|
#undef DIR6_PROJ
|
||||||
#undef DIR7_PROJMEM
|
#undef DIR7_PROJ
|
||||||
#undef DIR0_RECON
|
#undef DIR0_RECON
|
||||||
#undef DIR1_RECON
|
#undef DIR1_RECON
|
||||||
#undef DIR2_RECON
|
#undef DIR2_RECON
|
||||||
|
@ -25,7 +25,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
#define LOAD_CHIMU_A64FXd(x) LOAD_CHIMU_INTERLEAVED_A64FXd(x)
|
#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXd(base)
|
||||||
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A)
|
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A)
|
||||||
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)
|
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)
|
||||||
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A)
|
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A)
|
||||||
@ -40,23 +40,19 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
#define MASK_REGS DECLARATIONS_A64FXd
|
#define MASK_REGS DECLARATIONS_A64FXd
|
||||||
#define COMPLEX_SIGNS(A)
|
#define COMPLEX_SIGNS(A)
|
||||||
#define LOAD64(A,B)
|
#define LOAD64(A,B)
|
||||||
#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B);
|
#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B)
|
||||||
#define MULT_2SPIN_DIR_PF(A,B) \
|
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir)
|
||||||
MULT_2SPIN_A64FXd(A); \
|
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd
|
||||||
PREFETCH_CHIMU_L2(B); \
|
|
||||||
if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } }
|
|
||||||
#define MAYBEPERM(A,perm) { A ; }
|
|
||||||
#define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
|
#define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
|
||||||
#define ZERO_PSI
|
#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
|
||||||
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
|
#define XP_PROJ XP_PROJ_A64FXd
|
||||||
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd
|
#define YP_PROJ YP_PROJ_A64FXd
|
||||||
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd
|
#define ZP_PROJ ZP_PROJ_A64FXd
|
||||||
#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd
|
#define TP_PROJ TP_PROJ_A64FXd
|
||||||
#define TP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TP_PROJ_A64FXd
|
#define XM_PROJ XM_PROJ_A64FXd
|
||||||
#define XM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XM_PROJ_A64FXd
|
#define YM_PROJ YM_PROJ_A64FXd
|
||||||
#define YM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YM_PROJ_A64FXd
|
#define ZM_PROJ ZM_PROJ_A64FXd
|
||||||
#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZM_PROJ_A64FXd
|
#define TM_PROJ TM_PROJ_A64FXd
|
||||||
#define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd
|
|
||||||
#define XP_RECON XP_RECON_A64FXd
|
#define XP_RECON XP_RECON_A64FXd
|
||||||
#define XM_RECON XM_RECON_A64FXd
|
#define XM_RECON XM_RECON_A64FXd
|
||||||
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd
|
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd
|
||||||
@ -67,10 +63,13 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd
|
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd
|
||||||
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd
|
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd
|
||||||
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd
|
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd
|
||||||
#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXd; }
|
#define PERMUTE_DIR0 0
|
||||||
#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXd; }
|
#define PERMUTE_DIR1 1
|
||||||
#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXd; }
|
#define PERMUTE_DIR2 2
|
||||||
#define PERMUTE_DIR3
|
#define PERMUTE_DIR3 3
|
||||||
|
#define PERMUTE PERMUTE_A64FXd;
|
||||||
|
// Dispatch on Dir to LOAD_TABLE0 / LOAD_TABLE1 / LOAD_TABLE2; any other value (including 3) runs none of them.
// NOTE: expands to an unbraced if / else-if chain, so invoke it only as a complete statement.
#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; }
|
||||||
|
// Run PERMUTE only when Dir is not 3 and perm is non-zero; for Dir == 3 nothing is executed regardless of perm.
// NOTE: expands to a bare if statement, so invoke it only as a complete statement.
#define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } }
|
||||||
// DECLARATIONS
|
// DECLARATIONS
|
||||||
#define DECLARATIONS_A64FXd \
|
#define DECLARATIONS_A64FXd \
|
||||||
const uint64_t lut[4][8] = { \
|
const uint64_t lut[4][8] = { \
|
||||||
@ -281,8 +280,8 @@ asm ( \
|
|||||||
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||||
);
|
);
|
||||||
|
|
||||||
// PERM0
|
// PERMUTE
|
||||||
#define PERM0_A64FXd \
|
#define PERMUTE_A64FXd \
|
||||||
asm ( \
|
asm ( \
|
||||||
"tbl z12.d, { z12.d }, z30.d \n\t" \
|
"tbl z12.d, { z12.d }, z30.d \n\t" \
|
||||||
"tbl z13.d, { z13.d }, z30.d \n\t" \
|
"tbl z13.d, { z13.d }, z30.d \n\t" \
|
||||||
@ -295,37 +294,6 @@ asm ( \
|
|||||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||||
);
|
);
|
||||||
|
|
||||||
// PERM1
|
|
||||||
#define PERM1_A64FXd \
|
|
||||||
asm ( \
|
|
||||||
"tbl z12.d, { z12.d }, z30.d \n\t" \
|
|
||||||
"tbl z13.d, { z13.d }, z30.d \n\t" \
|
|
||||||
"tbl z14.d, { z14.d }, z30.d \n\t" \
|
|
||||||
"tbl z15.d, { z15.d }, z30.d \n\t" \
|
|
||||||
"tbl z16.d, { z16.d }, z30.d \n\t" \
|
|
||||||
"tbl z17.d, { z17.d }, z30.d \n\t" \
|
|
||||||
: \
|
|
||||||
: \
|
|
||||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
|
||||||
);
|
|
||||||
|
|
||||||
// PERM2
|
|
||||||
#define PERM2_A64FXd \
|
|
||||||
asm ( \
|
|
||||||
"tbl z12.d, { z12.d }, z30.d \n\t" \
|
|
||||||
"tbl z13.d, { z13.d }, z30.d \n\t" \
|
|
||||||
"tbl z14.d, { z14.d }, z30.d \n\t" \
|
|
||||||
"tbl z15.d, { z15.d }, z30.d \n\t" \
|
|
||||||
"tbl z16.d, { z16.d }, z30.d \n\t" \
|
|
||||||
"tbl z17.d, { z17.d }, z30.d \n\t" \
|
|
||||||
: \
|
|
||||||
: \
|
|
||||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
|
||||||
);
|
|
||||||
|
|
||||||
// PERM3
|
|
||||||
#define PERM3_A64FXd
|
|
||||||
|
|
||||||
// LOAD_GAUGE
|
// LOAD_GAUGE
|
||||||
#define LOAD_GAUGE \
|
#define LOAD_GAUGE \
|
||||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||||
@ -344,7 +312,7 @@ asm ( \
|
|||||||
); \
|
); \
|
||||||
}
|
}
|
||||||
// MULT_2SPIN
|
// MULT_2SPIN
|
||||||
#define MULT_2SPIN_A64FXd(A) \
|
#define MULT_2SPIN_1_A64FXd(A) \
|
||||||
{ \
|
{ \
|
||||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||||
asm ( \
|
asm ( \
|
||||||
@ -375,6 +343,15 @@ asm ( \
|
|||||||
"ldr z24, [%[fetchptr], -4, mul vl] \n\t" \
|
"ldr z24, [%[fetchptr], -4, mul vl] \n\t" \
|
||||||
"ldr z25, [%[fetchptr], -1, mul vl] \n\t" \
|
"ldr z25, [%[fetchptr], -1, mul vl] \n\t" \
|
||||||
"ldr z26, [%[fetchptr], 2, mul vl] \n\t" \
|
"ldr z26, [%[fetchptr], 2, mul vl] \n\t" \
|
||||||
|
: \
|
||||||
|
: [fetchptr] "r" (baseU + 2 * 3 * 64) \
|
||||||
|
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||||
|
); \
|
||||||
|
}
|
||||||
|
// MULT_2SPIN_BACKEND
|
||||||
|
#define MULT_2SPIN_2_A64FXd \
|
||||||
|
{ \
|
||||||
|
asm ( \
|
||||||
"fcmla z18.d, p5/m, z27.d, z13.d, 0 \n\t" \
|
"fcmla z18.d, p5/m, z27.d, z13.d, 0 \n\t" \
|
||||||
"fcmla z21.d, p5/m, z27.d, z16.d, 0 \n\t" \
|
"fcmla z21.d, p5/m, z27.d, z16.d, 0 \n\t" \
|
||||||
"fcmla z19.d, p5/m, z28.d, z13.d, 0 \n\t" \
|
"fcmla z19.d, p5/m, z28.d, z13.d, 0 \n\t" \
|
||||||
@ -400,15 +377,14 @@ asm ( \
|
|||||||
"fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \
|
"fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \
|
||||||
"fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \
|
"fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \
|
||||||
: \
|
: \
|
||||||
: [fetchptr] "r" (baseU + 2 * 3 * 64) \
|
: \
|
||||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||||
); \
|
); \
|
||||||
}
|
}
|
||||||
// XP_PROJ
|
// XP_PROJ
|
||||||
#define XP_PROJ_A64FXd \
|
#define XP_PROJ_A64FXd \
|
||||||
{ \
|
{ \
|
||||||
asm ( \
|
asm ( \
|
||||||
"ptrue p5.d \n\t" \
|
|
||||||
"fcadd z12.d, p5/m, z12.d, z21.d, 90 \n\t" \
|
"fcadd z12.d, p5/m, z12.d, z21.d, 90 \n\t" \
|
||||||
"fcadd z13.d, p5/m, z13.d, z22.d, 90 \n\t" \
|
"fcadd z13.d, p5/m, z13.d, z22.d, 90 \n\t" \
|
||||||
"fcadd z14.d, p5/m, z14.d, z23.d, 90 \n\t" \
|
"fcadd z14.d, p5/m, z14.d, z23.d, 90 \n\t" \
|
||||||
|
@ -25,7 +25,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
#define LOAD_CHIMU_A64FXf(x) LOAD_CHIMU_INTERLEAVED_A64FXf(x)
|
#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXf(base)
|
||||||
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A)
|
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A)
|
||||||
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)
|
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)
|
||||||
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A)
|
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A)
|
||||||
@ -40,23 +40,19 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
#define MASK_REGS DECLARATIONS_A64FXf
|
#define MASK_REGS DECLARATIONS_A64FXf
|
||||||
#define COMPLEX_SIGNS(A)
|
#define COMPLEX_SIGNS(A)
|
||||||
#define LOAD64(A,B)
|
#define LOAD64(A,B)
|
||||||
#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B);
|
#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B)
|
||||||
#define MULT_2SPIN_DIR_PF(A,B) \
|
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir)
|
||||||
MULT_2SPIN_A64FXf(A); \
|
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf
|
||||||
PREFETCH_CHIMU_L2(B); \
|
|
||||||
if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } }
|
|
||||||
#define MAYBEPERM(A,perm) { A ; }
|
|
||||||
#define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
|
#define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
|
||||||
#define ZERO_PSI
|
#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
|
||||||
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
|
#define XP_PROJ XP_PROJ_A64FXf
|
||||||
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf
|
#define YP_PROJ YP_PROJ_A64FXf
|
||||||
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf
|
#define ZP_PROJ ZP_PROJ_A64FXf
|
||||||
#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf
|
#define TP_PROJ TP_PROJ_A64FXf
|
||||||
#define TP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TP_PROJ_A64FXf
|
#define XM_PROJ XM_PROJ_A64FXf
|
||||||
#define XM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XM_PROJ_A64FXf
|
#define YM_PROJ YM_PROJ_A64FXf
|
||||||
#define YM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YM_PROJ_A64FXf
|
#define ZM_PROJ ZM_PROJ_A64FXf
|
||||||
#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZM_PROJ_A64FXf
|
#define TM_PROJ TM_PROJ_A64FXf
|
||||||
#define TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf
|
|
||||||
#define XP_RECON XP_RECON_A64FXf
|
#define XP_RECON XP_RECON_A64FXf
|
||||||
#define XM_RECON XM_RECON_A64FXf
|
#define XM_RECON XM_RECON_A64FXf
|
||||||
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf
|
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf
|
||||||
@ -67,10 +63,13 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf
|
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf
|
||||||
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf
|
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf
|
||||||
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf
|
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf
|
||||||
#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXf; }
|
#define PERMUTE_DIR0 0
|
||||||
#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXf; }
|
#define PERMUTE_DIR1 1
|
||||||
#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXf; }
|
#define PERMUTE_DIR2 2
|
||||||
#define PERMUTE_DIR3 LOAD_TABLE3; if (perm) { PERM3_A64FXf; }
|
#define PERMUTE_DIR3 3
|
||||||
|
#define PERMUTE PERMUTE_A64FXf;
|
||||||
|
// Dispatch on Dir (0..3) to LOAD_TABLE0 .. LOAD_TABLE3; any other value runs none of them.
// Every branch terminates its statement explicitly, so the expansion no longer relies on
// LOAD_TABLE1 supplying its own trailing ';'. The argument is parenthesised so that an
// expression passed as Dir is compared as a whole.
// NOTE: expands to an unbraced if / else-if chain, so invoke it only as a complete statement.
#define LOAD_TABLE(Dir) if ((Dir) == 0) { LOAD_TABLE0; } else if ((Dir) == 1) { LOAD_TABLE1; } else if ((Dir) == 2) { LOAD_TABLE2; } else if ((Dir) == 3) { LOAD_TABLE3; }
|
||||||
|
// Run PERMUTE whenever perm is non-zero. The first argument (A) is accepted but not used in the expansion.
// NOTE: expands to a bare if statement, so invoke it only as a complete statement.
#define MAYBEPERM(A,perm) if (perm) { PERMUTE; }
|
||||||
// DECLARATIONS
|
// DECLARATIONS
|
||||||
#define DECLARATIONS_A64FXf \
|
#define DECLARATIONS_A64FXf \
|
||||||
const uint32_t lut[4][16] = { \
|
const uint32_t lut[4][16] = { \
|
||||||
@ -281,50 +280,8 @@ asm ( \
|
|||||||
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||||
);
|
);
|
||||||
|
|
||||||
// PERM0
|
// PERMUTE
|
||||||
#define PERM0_A64FXf \
|
#define PERMUTE_A64FXf \
|
||||||
asm ( \
|
|
||||||
"tbl z12.s, { z12.s }, z30.s \n\t" \
|
|
||||||
"tbl z13.s, { z13.s }, z30.s \n\t" \
|
|
||||||
"tbl z14.s, { z14.s }, z30.s \n\t" \
|
|
||||||
"tbl z15.s, { z15.s }, z30.s \n\t" \
|
|
||||||
"tbl z16.s, { z16.s }, z30.s \n\t" \
|
|
||||||
"tbl z17.s, { z17.s }, z30.s \n\t" \
|
|
||||||
: \
|
|
||||||
: \
|
|
||||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
|
||||||
);
|
|
||||||
|
|
||||||
// PERM1
|
|
||||||
#define PERM1_A64FXf \
|
|
||||||
asm ( \
|
|
||||||
"tbl z12.s, { z12.s }, z30.s \n\t" \
|
|
||||||
"tbl z13.s, { z13.s }, z30.s \n\t" \
|
|
||||||
"tbl z14.s, { z14.s }, z30.s \n\t" \
|
|
||||||
"tbl z15.s, { z15.s }, z30.s \n\t" \
|
|
||||||
"tbl z16.s, { z16.s }, z30.s \n\t" \
|
|
||||||
"tbl z17.s, { z17.s }, z30.s \n\t" \
|
|
||||||
: \
|
|
||||||
: \
|
|
||||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
|
||||||
);
|
|
||||||
|
|
||||||
// PERM2
|
|
||||||
#define PERM2_A64FXf \
|
|
||||||
asm ( \
|
|
||||||
"tbl z12.s, { z12.s }, z30.s \n\t" \
|
|
||||||
"tbl z13.s, { z13.s }, z30.s \n\t" \
|
|
||||||
"tbl z14.s, { z14.s }, z30.s \n\t" \
|
|
||||||
"tbl z15.s, { z15.s }, z30.s \n\t" \
|
|
||||||
"tbl z16.s, { z16.s }, z30.s \n\t" \
|
|
||||||
"tbl z17.s, { z17.s }, z30.s \n\t" \
|
|
||||||
: \
|
|
||||||
: \
|
|
||||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
|
||||||
);
|
|
||||||
|
|
||||||
// PERM3
|
|
||||||
#define PERM3_A64FXf \
|
|
||||||
asm ( \
|
asm ( \
|
||||||
"tbl z12.s, { z12.s }, z30.s \n\t" \
|
"tbl z12.s, { z12.s }, z30.s \n\t" \
|
||||||
"tbl z13.s, { z13.s }, z30.s \n\t" \
|
"tbl z13.s, { z13.s }, z30.s \n\t" \
|
||||||
@ -355,7 +312,7 @@ asm ( \
|
|||||||
); \
|
); \
|
||||||
}
|
}
|
||||||
// MULT_2SPIN
|
// MULT_2SPIN
|
||||||
#define MULT_2SPIN_A64FXf(A) \
|
#define MULT_2SPIN_1_A64FXf(A) \
|
||||||
{ \
|
{ \
|
||||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||||
asm ( \
|
asm ( \
|
||||||
@ -386,6 +343,15 @@ asm ( \
|
|||||||
"ldr z24, [%[fetchptr], -4, mul vl] \n\t" \
|
"ldr z24, [%[fetchptr], -4, mul vl] \n\t" \
|
||||||
"ldr z25, [%[fetchptr], -1, mul vl] \n\t" \
|
"ldr z25, [%[fetchptr], -1, mul vl] \n\t" \
|
||||||
"ldr z26, [%[fetchptr], 2, mul vl] \n\t" \
|
"ldr z26, [%[fetchptr], 2, mul vl] \n\t" \
|
||||||
|
: \
|
||||||
|
: [fetchptr] "r" (baseU + 2 * 3 * 64) \
|
||||||
|
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||||
|
); \
|
||||||
|
}
|
||||||
|
// MULT_2SPIN_BACKEND
|
||||||
|
#define MULT_2SPIN_2_A64FXf \
|
||||||
|
{ \
|
||||||
|
asm ( \
|
||||||
"fcmla z18.s, p5/m, z27.s, z13.s, 0 \n\t" \
|
"fcmla z18.s, p5/m, z27.s, z13.s, 0 \n\t" \
|
||||||
"fcmla z21.s, p5/m, z27.s, z16.s, 0 \n\t" \
|
"fcmla z21.s, p5/m, z27.s, z16.s, 0 \n\t" \
|
||||||
"fcmla z19.s, p5/m, z28.s, z13.s, 0 \n\t" \
|
"fcmla z19.s, p5/m, z28.s, z13.s, 0 \n\t" \
|
||||||
@ -411,15 +377,14 @@ asm ( \
|
|||||||
"fcmla z20.s, p5/m, z26.s, z14.s, 90 \n\t" \
|
"fcmla z20.s, p5/m, z26.s, z14.s, 90 \n\t" \
|
||||||
"fcmla z23.s, p5/m, z26.s, z17.s, 90 \n\t" \
|
"fcmla z23.s, p5/m, z26.s, z17.s, 90 \n\t" \
|
||||||
: \
|
: \
|
||||||
: [fetchptr] "r" (baseU + 2 * 3 * 64) \
|
: \
|
||||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||||
); \
|
); \
|
||||||
}
|
}
|
||||||
// XP_PROJ
|
// XP_PROJ
|
||||||
#define XP_PROJ_A64FXf \
|
#define XP_PROJ_A64FXf \
|
||||||
{ \
|
{ \
|
||||||
asm ( \
|
asm ( \
|
||||||
"ptrue p5.s \n\t" \
|
|
||||||
"fcadd z12.s, p5/m, z12.s, z21.s, 90 \n\t" \
|
"fcadd z12.s, p5/m, z12.s, z21.s, 90 \n\t" \
|
||||||
"fcadd z13.s, p5/m, z13.s, z22.s, 90 \n\t" \
|
"fcadd z13.s, p5/m, z13.s, z22.s, 90 \n\t" \
|
||||||
"fcadd z14.s, p5/m, z14.s, z23.s, 90 \n\t" \
|
"fcadd z14.s, p5/m, z14.s, z23.s, 90 \n\t" \
|
||||||
|
@ -25,7 +25,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
#define LOAD_CHIMU_A64FXd(x) LOAD_CHIMU_INTERLEAVED_A64FXd(x)
|
#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXd(base)
|
||||||
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A)
|
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A)
|
||||||
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)
|
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)
|
||||||
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A)
|
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A)
|
||||||
@ -40,23 +40,19 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
#define MASK_REGS DECLARATIONS_A64FXd
|
#define MASK_REGS DECLARATIONS_A64FXd
|
||||||
#define COMPLEX_SIGNS(A)
|
#define COMPLEX_SIGNS(A)
|
||||||
#define LOAD64(A,B)
|
#define LOAD64(A,B)
|
||||||
#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B);
|
#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B)
|
||||||
#define MULT_2SPIN_DIR_PF(A,B) \
|
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir)
|
||||||
MULT_2SPIN_A64FXd(A); \
|
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd
|
||||||
PREFETCH_CHIMU_L2(B); \
|
|
||||||
if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } }
|
|
||||||
#define MAYBEPERM(A,perm) { A ; }
|
|
||||||
#define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
|
#define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
|
||||||
#define ZERO_PSI
|
#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
|
||||||
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
|
#define XP_PROJ XP_PROJ_A64FXd
|
||||||
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd
|
#define YP_PROJ YP_PROJ_A64FXd
|
||||||
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd
|
#define ZP_PROJ ZP_PROJ_A64FXd
|
||||||
#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd
|
#define TP_PROJ TP_PROJ_A64FXd
|
||||||
#define TP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TP_PROJ_A64FXd
|
#define XM_PROJ XM_PROJ_A64FXd
|
||||||
#define XM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XM_PROJ_A64FXd
|
#define YM_PROJ YM_PROJ_A64FXd
|
||||||
#define YM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YM_PROJ_A64FXd
|
#define ZM_PROJ ZM_PROJ_A64FXd
|
||||||
#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZM_PROJ_A64FXd
|
#define TM_PROJ TM_PROJ_A64FXd
|
||||||
#define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd
|
|
||||||
#define XP_RECON XP_RECON_A64FXd
|
#define XP_RECON XP_RECON_A64FXd
|
||||||
#define XM_RECON XM_RECON_A64FXd
|
#define XM_RECON XM_RECON_A64FXd
|
||||||
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd
|
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd
|
||||||
@ -67,10 +63,13 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd
|
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd
|
||||||
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd
|
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd
|
||||||
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd
|
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd
|
||||||
#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXd; }
|
#define PERMUTE_DIR0 0
|
||||||
#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXd; }
|
#define PERMUTE_DIR1 1
|
||||||
#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXd; }
|
#define PERMUTE_DIR2 2
|
||||||
#define PERMUTE_DIR3
|
#define PERMUTE_DIR3 3
|
||||||
|
#define PERMUTE PERMUTE_A64FXd;
|
||||||
|
#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; }
|
||||||
|
#define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } }
|
||||||
// DECLARATIONS
|
// DECLARATIONS
|
||||||
#define DECLARATIONS_A64FXd \
|
#define DECLARATIONS_A64FXd \
|
||||||
const uint64_t lut[4][8] = { \
|
const uint64_t lut[4][8] = { \
|
||||||
@ -254,8 +253,8 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
#define LOAD_TABLE3 \
|
#define LOAD_TABLE3 \
|
||||||
table0 = svld1(pg1, (uint64_t*)&lut[3]);
|
table0 = svld1(pg1, (uint64_t*)&lut[3]);
|
||||||
|
|
||||||
// PERM0
|
// PERMUTE
|
||||||
#define PERM0_A64FXd \
|
#define PERMUTE_A64FXd \
|
||||||
Chi_00 = svtbl(Chi_00, table0); \
|
Chi_00 = svtbl(Chi_00, table0); \
|
||||||
Chi_01 = svtbl(Chi_01, table0); \
|
Chi_01 = svtbl(Chi_01, table0); \
|
||||||
Chi_02 = svtbl(Chi_02, table0); \
|
Chi_02 = svtbl(Chi_02, table0); \
|
||||||
@ -263,27 +262,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
Chi_11 = svtbl(Chi_11, table0); \
|
Chi_11 = svtbl(Chi_11, table0); \
|
||||||
Chi_12 = svtbl(Chi_12, table0);
|
Chi_12 = svtbl(Chi_12, table0);
|
||||||
|
|
||||||
// PERM1
|
|
||||||
#define PERM1_A64FXd \
|
|
||||||
Chi_00 = svtbl(Chi_00, table0); \
|
|
||||||
Chi_01 = svtbl(Chi_01, table0); \
|
|
||||||
Chi_02 = svtbl(Chi_02, table0); \
|
|
||||||
Chi_10 = svtbl(Chi_10, table0); \
|
|
||||||
Chi_11 = svtbl(Chi_11, table0); \
|
|
||||||
Chi_12 = svtbl(Chi_12, table0);
|
|
||||||
|
|
||||||
// PERM2
|
|
||||||
#define PERM2_A64FXd \
|
|
||||||
Chi_00 = svtbl(Chi_00, table0); \
|
|
||||||
Chi_01 = svtbl(Chi_01, table0); \
|
|
||||||
Chi_02 = svtbl(Chi_02, table0); \
|
|
||||||
Chi_10 = svtbl(Chi_10, table0); \
|
|
||||||
Chi_11 = svtbl(Chi_11, table0); \
|
|
||||||
Chi_12 = svtbl(Chi_12, table0);
|
|
||||||
|
|
||||||
// PERM3
|
|
||||||
#define PERM3_A64FXd
|
|
||||||
|
|
||||||
// LOAD_GAUGE
|
// LOAD_GAUGE
|
||||||
#define LOAD_GAUGE \
|
#define LOAD_GAUGE \
|
||||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||||
@ -296,7 +274,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
|
U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
|
||||||
}
|
}
|
||||||
// MULT_2SPIN
|
// MULT_2SPIN
|
||||||
#define MULT_2SPIN_A64FXd(A) \
|
#define MULT_2SPIN_1_A64FXd(A) \
|
||||||
{ \
|
{ \
|
||||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||||
U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \
|
U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \
|
||||||
@ -320,6 +298,10 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \
|
U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \
|
||||||
U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \
|
U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \
|
||||||
U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \
|
U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \
|
||||||
|
}
|
||||||
|
// MULT_2SPIN_BACKEND
|
||||||
|
#define MULT_2SPIN_2_A64FXd \
|
||||||
|
{ \
|
||||||
UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \
|
UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \
|
||||||
UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \
|
UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \
|
||||||
UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \
|
UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \
|
||||||
|
@ -25,7 +25,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
#define LOAD_CHIMU_A64FXf(x) LOAD_CHIMU_INTERLEAVED_A64FXf(x)
|
#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXf(base)
|
||||||
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A)
|
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A)
|
||||||
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)
|
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)
|
||||||
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A)
|
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A)
|
||||||
@ -40,23 +40,19 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
#define MASK_REGS DECLARATIONS_A64FXf
|
#define MASK_REGS DECLARATIONS_A64FXf
|
||||||
#define COMPLEX_SIGNS(A)
|
#define COMPLEX_SIGNS(A)
|
||||||
#define LOAD64(A,B)
|
#define LOAD64(A,B)
|
||||||
#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B);
|
#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B)
|
||||||
#define MULT_2SPIN_DIR_PF(A,B) \
|
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir)
|
||||||
MULT_2SPIN_A64FXf(A); \
|
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf
|
||||||
PREFETCH_CHIMU_L2(B); \
|
|
||||||
if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } }
|
|
||||||
#define MAYBEPERM(A,perm) { A ; }
|
|
||||||
#define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
|
#define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
|
||||||
#define ZERO_PSI
|
#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
|
||||||
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
|
#define XP_PROJ XP_PROJ_A64FXf
|
||||||
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf
|
#define YP_PROJ YP_PROJ_A64FXf
|
||||||
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf
|
#define ZP_PROJ ZP_PROJ_A64FXf
|
||||||
#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf
|
#define TP_PROJ TP_PROJ_A64FXf
|
||||||
#define TP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TP_PROJ_A64FXf
|
#define XM_PROJ XM_PROJ_A64FXf
|
||||||
#define XM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XM_PROJ_A64FXf
|
#define YM_PROJ YM_PROJ_A64FXf
|
||||||
#define YM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YM_PROJ_A64FXf
|
#define ZM_PROJ ZM_PROJ_A64FXf
|
||||||
#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZM_PROJ_A64FXf
|
#define TM_PROJ TM_PROJ_A64FXf
|
||||||
#define TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf
|
|
||||||
#define XP_RECON XP_RECON_A64FXf
|
#define XP_RECON XP_RECON_A64FXf
|
||||||
#define XM_RECON XM_RECON_A64FXf
|
#define XM_RECON XM_RECON_A64FXf
|
||||||
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf
|
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf
|
||||||
@ -67,10 +63,13 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf
|
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf
|
||||||
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf
|
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf
|
||||||
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf
|
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf
|
||||||
#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXf; }
|
#define PERMUTE_DIR0 0
|
||||||
#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXf; }
|
#define PERMUTE_DIR1 1
|
||||||
#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXf; }
|
#define PERMUTE_DIR2 2
|
||||||
#define PERMUTE_DIR3 LOAD_TABLE3; if (perm) { PERM3_A64FXf; }
|
#define PERMUTE_DIR3 3
|
||||||
|
#define PERMUTE PERMUTE_A64FXf;
|
||||||
|
#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1 } else if (Dir == 2) { LOAD_TABLE2; } else if (Dir == 3) { LOAD_TABLE3; }
|
||||||
|
#define MAYBEPERM(A,perm) if (perm) { PERMUTE; }
|
||||||
// DECLARATIONS
|
// DECLARATIONS
|
||||||
#define DECLARATIONS_A64FXf \
|
#define DECLARATIONS_A64FXf \
|
||||||
const uint32_t lut[4][16] = { \
|
const uint32_t lut[4][16] = { \
|
||||||
@ -254,35 +253,8 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
#define LOAD_TABLE3 \
|
#define LOAD_TABLE3 \
|
||||||
table0 = svld1(pg1, (uint32_t*)&lut[3]);
|
table0 = svld1(pg1, (uint32_t*)&lut[3]);
|
||||||
|
|
||||||
// PERM0
|
// PERMUTE
|
||||||
#define PERM0_A64FXf \
|
#define PERMUTE_A64FXf \
|
||||||
Chi_00 = svtbl(Chi_00, table0); \
|
|
||||||
Chi_01 = svtbl(Chi_01, table0); \
|
|
||||||
Chi_02 = svtbl(Chi_02, table0); \
|
|
||||||
Chi_10 = svtbl(Chi_10, table0); \
|
|
||||||
Chi_11 = svtbl(Chi_11, table0); \
|
|
||||||
Chi_12 = svtbl(Chi_12, table0);
|
|
||||||
|
|
||||||
// PERM1
|
|
||||||
#define PERM1_A64FXf \
|
|
||||||
Chi_00 = svtbl(Chi_00, table0); \
|
|
||||||
Chi_01 = svtbl(Chi_01, table0); \
|
|
||||||
Chi_02 = svtbl(Chi_02, table0); \
|
|
||||||
Chi_10 = svtbl(Chi_10, table0); \
|
|
||||||
Chi_11 = svtbl(Chi_11, table0); \
|
|
||||||
Chi_12 = svtbl(Chi_12, table0);
|
|
||||||
|
|
||||||
// PERM2
|
|
||||||
#define PERM2_A64FXf \
|
|
||||||
Chi_00 = svtbl(Chi_00, table0); \
|
|
||||||
Chi_01 = svtbl(Chi_01, table0); \
|
|
||||||
Chi_02 = svtbl(Chi_02, table0); \
|
|
||||||
Chi_10 = svtbl(Chi_10, table0); \
|
|
||||||
Chi_11 = svtbl(Chi_11, table0); \
|
|
||||||
Chi_12 = svtbl(Chi_12, table0);
|
|
||||||
|
|
||||||
// PERM3
|
|
||||||
#define PERM3_A64FXf \
|
|
||||||
Chi_00 = svtbl(Chi_00, table0); \
|
Chi_00 = svtbl(Chi_00, table0); \
|
||||||
Chi_01 = svtbl(Chi_01, table0); \
|
Chi_01 = svtbl(Chi_01, table0); \
|
||||||
Chi_02 = svtbl(Chi_02, table0); \
|
Chi_02 = svtbl(Chi_02, table0); \
|
||||||
@ -302,7 +274,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
|
U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
|
||||||
}
|
}
|
||||||
// MULT_2SPIN
|
// MULT_2SPIN
|
||||||
#define MULT_2SPIN_A64FXf(A) \
|
#define MULT_2SPIN_1_A64FXf(A) \
|
||||||
{ \
|
{ \
|
||||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||||
U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \
|
U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \
|
||||||
@ -326,6 +298,10 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \
|
U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \
|
||||||
U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \
|
U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \
|
||||||
U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \
|
U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \
|
||||||
|
}
|
||||||
|
// MULT_2SPIN_BACKEND
|
||||||
|
#define MULT_2SPIN_2_A64FXf \
|
||||||
|
{ \
|
||||||
UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \
|
UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \
|
||||||
UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \
|
UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \
|
||||||
UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \
|
UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \
|
||||||
|
@ -26,14 +26,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#undef LOAD_CHIMU_A64FXd
|
#undef LOAD_CHIMU
|
||||||
#undef LOAD_CHIMU_A64FXf
|
|
||||||
#undef PREFETCH_CHIMU_L1
|
#undef PREFETCH_CHIMU_L1
|
||||||
#undef PREFETCH_GAUGE_L1
|
#undef PREFETCH_GAUGE_L1
|
||||||
#undef PREFETCH_CHIMU_L2
|
#undef PREFETCH_CHIMU_L2
|
||||||
#undef PREFETCH_GAUGE_L2
|
#undef PREFETCH_GAUGE_L2
|
||||||
#undef PREFETCH_GAUGE_L1_INTERNAL
|
#undef PREFETCH_GAUGE_L1_INTERNAL
|
||||||
#undef PF_GAUGE
|
|
||||||
#undef PREFETCH1_CHIMU
|
#undef PREFETCH1_CHIMU
|
||||||
#undef PREFETCH_CHIMU
|
#undef PREFETCH_CHIMU
|
||||||
#undef PREFETCH_RESULT_L2_STORE
|
#undef PREFETCH_RESULT_L2_STORE
|
||||||
@ -42,22 +40,20 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
#undef LOCK_GAUGE
|
#undef LOCK_GAUGE
|
||||||
#undef UNLOCK_GAUGE
|
#undef UNLOCK_GAUGE
|
||||||
#undef MASK_REGS
|
#undef MASK_REGS
|
||||||
#undef COMPLEX_SIGNS
|
|
||||||
#undef LOAD64
|
|
||||||
#undef SAVE_RESULT
|
#undef SAVE_RESULT
|
||||||
#undef ADD_RESULT
|
#undef ADD_RESULT
|
||||||
#undef MULT_2SPIN_DIR_PF
|
#undef MULT_2SPIN_1
|
||||||
|
#undef MULT_2SPIN_2
|
||||||
#undef MAYBEPERM
|
#undef MAYBEPERM
|
||||||
#undef LOAD_CHI
|
#undef LOAD_CHI
|
||||||
#undef ZERO_PSI
|
#undef XP_PROJ
|
||||||
#undef XP_PROJMEM
|
#undef YP_PROJ
|
||||||
#undef YP_PROJMEM
|
#undef ZP_PROJ
|
||||||
#undef ZP_PROJMEM
|
#undef TP_PROJ
|
||||||
#undef TP_PROJMEM
|
#undef XM_PROJ
|
||||||
#undef XM_PROJMEM
|
#undef YM_PROJ
|
||||||
#undef YM_PROJMEM
|
#undef ZM_PROJ
|
||||||
#undef ZM_PROJMEM
|
#undef TM_PROJ
|
||||||
#undef TM_PROJMEM
|
|
||||||
#undef XP_RECON
|
#undef XP_RECON
|
||||||
#undef XM_RECON
|
#undef XM_RECON
|
||||||
#undef XM_RECON_ACCUM
|
#undef XM_RECON_ACCUM
|
||||||
@ -68,10 +64,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
#undef YP_RECON_ACCUM
|
#undef YP_RECON_ACCUM
|
||||||
#undef ZP_RECON_ACCUM
|
#undef ZP_RECON_ACCUM
|
||||||
#undef TP_RECON_ACCUM
|
#undef TP_RECON_ACCUM
|
||||||
|
#undef PERMUTE
|
||||||
#undef PERMUTE_DIR0
|
#undef PERMUTE_DIR0
|
||||||
#undef PERMUTE_DIR1
|
#undef PERMUTE_DIR1
|
||||||
#undef PERMUTE_DIR2
|
#undef PERMUTE_DIR2
|
||||||
#undef PERMUTE_DIR3
|
#undef PERMUTE_DIR3
|
||||||
|
#undef LOAD_TABLE
|
||||||
#undef LOAD_TABLE0
|
#undef LOAD_TABLE0
|
||||||
#undef LOAD_TABLE1
|
#undef LOAD_TABLE1
|
||||||
#undef LOAD_TABLE2
|
#undef LOAD_TABLE2
|
||||||
|
@ -115,37 +115,9 @@ STORE_BASE_PTR_COLOR_OFFSET = 2
|
|||||||
# 256 bytes * 2.2 GHz = 563.2 GB/s (base 10), 524 GB/s (base 2)
|
# 256 bytes * 2.2 GHz = 563.2 GB/s (base 10), 524 GB/s (base 2)
|
||||||
|
|
||||||
OPT = """
|
OPT = """
|
||||||
#ifdef INTERIOR
|
* interleave prefetching and compute in MULT_2SPIN
|
||||||
|
* could test storing U's in MULT_2SPIN to L1d, might be beneficial for life time cache lines
|
||||||
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
* structure reordering: MAYBEPERM after MULT_2SPIN ?
|
||||||
basep = st.GetPFInfo(nent,plocal); nent++; \
|
|
||||||
if ( local ) { \
|
|
||||||
-- LOAD64(%r10,isigns); \
|
|
||||||
PROJ(base); \
|
|
||||||
++ PF_GAUGE(Dir); \
|
|
||||||
MAYBEPERM(PERMUTE_DIR,perm); \
|
|
||||||
} else if ( st.same_node[Dir] ) {
|
|
||||||
LOAD_CHI(base);
|
|
||||||
++ PF_GAUGE(Dir);
|
|
||||||
} \
|
|
||||||
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
|
|
||||||
if ( local || st.same_node[Dir] ) { \
|
|
||||||
MULT_2SPIN_DIR_PF(Dir,basep); \
|
|
||||||
PREFETCH_CHIMU(base); \
|
|
||||||
-- LOAD64(%r10,isigns); \
|
|
||||||
RECON; \
|
|
||||||
} else { PREFETCH_CHIMU(base); }
|
|
||||||
|
|
||||||
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
|
||||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
|
||||||
-- PF_GAUGE(Xp); \
|
|
||||||
PREFETCH1_CHIMU(base); \
|
|
||||||
{ ZERO_PSI; } \
|
|
||||||
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
|
|
||||||
|
|
||||||
#define RESULT(base,basep) SAVE_RESULT(base,basep);
|
|
||||||
|
|
||||||
#endif
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
filename = 'XXX'
|
filename = 'XXX'
|
||||||
@ -905,7 +877,8 @@ if ALTERNATIVE_LOADS == True:
|
|||||||
define(F'LOAD_CHIMU_0312_PLUG LOAD_CHIMU_0312_{PRECSUFFIX}')
|
define(F'LOAD_CHIMU_0312_PLUG LOAD_CHIMU_0312_{PRECSUFFIX}')
|
||||||
define(F'LOAD_CHIMU(x)')
|
define(F'LOAD_CHIMU(x)')
|
||||||
else:
|
else:
|
||||||
define(F'LOAD_CHIMU_{PRECSUFFIX}(x) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(x)')
|
#define(F'LOAD_CHIMU_{PRECSUFFIX}(x) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(x)')
|
||||||
|
define(F'LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(base)')
|
||||||
|
|
||||||
if PREFETCH:
|
if PREFETCH:
|
||||||
define(F'PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_{PRECSUFFIX}(A)')
|
define(F'PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_{PRECSUFFIX}(A)')
|
||||||
@ -935,39 +908,22 @@ define(F'UNLOCK_GAUGE(A)')
|
|||||||
define(F'MASK_REGS DECLARATIONS_{PRECSUFFIX}')
|
define(F'MASK_REGS DECLARATIONS_{PRECSUFFIX}')
|
||||||
define(F'COMPLEX_SIGNS(A)')
|
define(F'COMPLEX_SIGNS(A)')
|
||||||
define(F'LOAD64(A,B)')
|
define(F'LOAD64(A,B)')
|
||||||
# prefetch chimu here is useless, because already done in last leg
|
define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_RESULT_L2_STORE(B)')
|
||||||
#define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A);')
|
define(F'MULT_2SPIN_1(Dir) MULT_2SPIN_1_{PRECSUFFIX}(Dir)')
|
||||||
define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_RESULT_L2_STORE(B);')
|
define(F'MULT_2SPIN_2 MULT_2SPIN_2_{PRECSUFFIX}')
|
||||||
if PREFETCH:
|
|
||||||
definemultiline(F'MULT_2SPIN_DIR_PF(A,B) ')
|
|
||||||
write (F' MULT_2SPIN_{PRECSUFFIX}(A); \\')
|
|
||||||
write (F' PREFETCH_CHIMU_L2(B); \\')
|
|
||||||
write (F' if (s == 0) {{ if ((A == 0) || (A == 4)) {{ PREFETCH_GAUGE_L2(A); }} }}')
|
|
||||||
|
|
||||||
# definemultiline(F'MULT_2SPIN_DIR_PF(A,B) PREFETCH_GAUGE_L1(A);')
|
|
||||||
# write (F' PREFETCH_CHIMU_L2(B); \\')
|
|
||||||
# write (F' MULT_2SPIN_{PRECSUFFIX}(A); \\')
|
|
||||||
# write (F' if (s == 0) {{ if ((A == 0) || (A == 4)) {{ PREFETCH_GAUGE_L2(A); }} }}')
|
|
||||||
newline()
|
|
||||||
else:
|
|
||||||
define(F'MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_{PRECSUFFIX}(A)')
|
|
||||||
# break out maybeperm in permutes
|
|
||||||
#define(F'MAYBEPERM(A,perm) if (perm) {{ A ; }}')
|
|
||||||
define(F'MAYBEPERM(A,perm) {{ A ; }}')
|
|
||||||
define(F'LOAD_CHI(base) LOAD_CHI_{PRECSUFFIX}(base)')
|
define(F'LOAD_CHI(base) LOAD_CHI_{PRECSUFFIX}(base)')
|
||||||
# don't need zero psi, everything is done in recons
|
# don't need zero psi, everything is done in recons
|
||||||
#define(F'ZERO_PSI ZERO_PSI_{PRECSUFFIX}')
|
#define(F'ZERO_PSI ZERO_PSI_{PRECSUFFIX}')
|
||||||
define(F'ZERO_PSI')
|
define(F'ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_{PRECSUFFIX}; RESULT_{PRECSUFFIX}(base)')
|
||||||
define(F'ADD_RESULT(base,basep) LOAD_CHIMU_{PRECSUFFIX}(base); ADD_RESULT_INTERNAL_{PRECSUFFIX}; RESULT_{PRECSUFFIX}(base)')
|
|
||||||
# loads projections
|
# loads projections
|
||||||
define(F'XP_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); XP_PROJ_{PRECSUFFIX}')
|
define(F'XP_PROJ XP_PROJ_{PRECSUFFIX}')
|
||||||
define(F'YP_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); YP_PROJ_{PRECSUFFIX}')
|
define(F'YP_PROJ YP_PROJ_{PRECSUFFIX}')
|
||||||
define(F'ZP_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); ZP_PROJ_{PRECSUFFIX}')
|
define(F'ZP_PROJ ZP_PROJ_{PRECSUFFIX}')
|
||||||
define(F'TP_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); TP_PROJ_{PRECSUFFIX}')
|
define(F'TP_PROJ TP_PROJ_{PRECSUFFIX}')
|
||||||
define(F'XM_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); XM_PROJ_{PRECSUFFIX}')
|
define(F'XM_PROJ XM_PROJ_{PRECSUFFIX}')
|
||||||
define(F'YM_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); YM_PROJ_{PRECSUFFIX}')
|
define(F'YM_PROJ YM_PROJ_{PRECSUFFIX}')
|
||||||
define(F'ZM_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); ZM_PROJ_{PRECSUFFIX}')
|
define(F'ZM_PROJ ZM_PROJ_{PRECSUFFIX}')
|
||||||
define(F'TM_PROJMEM(base) LOAD_CHIMU_{PRECSUFFIX}(base); TM_PROJ_{PRECSUFFIX}')
|
define(F'TM_PROJ TM_PROJ_{PRECSUFFIX}')
|
||||||
# recons
|
# recons
|
||||||
define(F'XP_RECON XP_RECON_{PRECSUFFIX}')
|
define(F'XP_RECON XP_RECON_{PRECSUFFIX}')
|
||||||
define(F'XM_RECON XM_RECON_{PRECSUFFIX}')
|
define(F'XM_RECON XM_RECON_{PRECSUFFIX}')
|
||||||
@ -979,14 +935,21 @@ define(F'XP_RECON_ACCUM XP_RECON_ACCUM_{PRECSUFFIX}')
|
|||||||
define(F'YP_RECON_ACCUM YP_RECON_ACCUM_{PRECSUFFIX}')
|
define(F'YP_RECON_ACCUM YP_RECON_ACCUM_{PRECSUFFIX}')
|
||||||
define(F'ZP_RECON_ACCUM ZP_RECON_ACCUM_{PRECSUFFIX}')
|
define(F'ZP_RECON_ACCUM ZP_RECON_ACCUM_{PRECSUFFIX}')
|
||||||
define(F'TP_RECON_ACCUM TP_RECON_ACCUM_{PRECSUFFIX}')
|
define(F'TP_RECON_ACCUM TP_RECON_ACCUM_{PRECSUFFIX}')
|
||||||
# permutes
|
# new permutes
|
||||||
define(F'PERMUTE_DIR0 LOAD_TABLE0; if (perm) {{ PERM0_{PRECSUFFIX}; }}')
|
define(F'PERMUTE_DIR0 0')
|
||||||
define(F'PERMUTE_DIR1 LOAD_TABLE1; if (perm) {{ PERM1_{PRECSUFFIX}; }}')
|
define(F'PERMUTE_DIR1 1')
|
||||||
define(F'PERMUTE_DIR2 LOAD_TABLE2; if (perm) {{ PERM2_{PRECSUFFIX}; }}')
|
define(F'PERMUTE_DIR2 2')
|
||||||
|
define(F'PERMUTE_DIR3 3')
|
||||||
|
define(F'PERMUTE PERMUTE_{PRECSUFFIX};')
|
||||||
|
# load table
|
||||||
|
#define(F'MAYBEPERM(A,perm) if (perm) {{ A ; }}')
|
||||||
if PRECISION == 'double':
|
if PRECISION == 'double':
|
||||||
define(F'PERMUTE_DIR3')
|
define(F'LOAD_TABLE(Dir) if (Dir == 0) {{ LOAD_TABLE0; }} else if (Dir == 1) {{ LOAD_TABLE1; }} else if (Dir == 2) {{ LOAD_TABLE2; }}')
|
||||||
|
define(F'MAYBEPERM(Dir,perm) if (Dir != 3) {{ if (perm) {{ PERMUTE; }} }}')
|
||||||
else:
|
else:
|
||||||
define(F'PERMUTE_DIR3 LOAD_TABLE3; if (perm) {{ PERM3_{PRECSUFFIX}; }}')
|
define(F'LOAD_TABLE(Dir) if (Dir == 0) {{ LOAD_TABLE0; }} else if (Dir == 1) {{ LOAD_TABLE1 }} else if (Dir == 2) {{ LOAD_TABLE2; }} else if (Dir == 3) {{ LOAD_TABLE3; }}')
|
||||||
|
define(F'MAYBEPERM(A,perm) if (perm) {{ PERMUTE; }}')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
write('// DECLARATIONS')
|
write('// DECLARATIONS')
|
||||||
@ -1040,20 +1003,14 @@ U_01.declare()
|
|||||||
U_11.declare()
|
U_11.declare()
|
||||||
U_21.declare() # 6 -> 30 regs
|
U_21.declare() # 6 -> 30 regs
|
||||||
|
|
||||||
# all true
|
# all predications true
|
||||||
pg1.declare()
|
pg1.declare()
|
||||||
if PRECISION == 'double':
|
if PRECISION == 'double':
|
||||||
pg1.movestr('svptrue_b64()')
|
pg1.movestr('svptrue_b64()')
|
||||||
else:
|
else:
|
||||||
pg1.movestr('svptrue_b32()')
|
pg1.movestr('svptrue_b32()')
|
||||||
|
|
||||||
# even elements only
|
# tables
|
||||||
#pg2.declare()
|
|
||||||
#pg2.movestr('svzip1_b64(svptrue_b64(), svpfalse_b())')
|
|
||||||
|
|
||||||
# preload tables
|
|
||||||
# 0: swap
|
|
||||||
# 1: permute 1
|
|
||||||
if PRECISION == 'double':
|
if PRECISION == 'double':
|
||||||
write(' svuint64_t table0; \\', target='I') # -> 31 regs
|
write(' svuint64_t table0; \\', target='I') # -> 31 regs
|
||||||
else:
|
else:
|
||||||
@ -1061,10 +1018,10 @@ else:
|
|||||||
|
|
||||||
zero0.declare()
|
zero0.declare()
|
||||||
|
|
||||||
|
# zero register
|
||||||
asmopen()
|
asmopen()
|
||||||
zero0.zero(zeroreg=True)
|
zero0.zero(zeroreg=True)
|
||||||
asmclose()
|
asmclose()
|
||||||
|
|
||||||
newline()
|
newline()
|
||||||
|
|
||||||
define('Chimu_00 Chi_00', target='I')
|
define('Chimu_00 Chi_00', target='I')
|
||||||
@ -1087,7 +1044,6 @@ else: # wilson4.h
|
|||||||
define('Chimu_30 U_01', target='I')
|
define('Chimu_30 U_01', target='I')
|
||||||
define('Chimu_31 U_11', target='I')
|
define('Chimu_31 U_11', target='I')
|
||||||
define('Chimu_32 U_21', target='I')
|
define('Chimu_32 U_21', target='I')
|
||||||
|
|
||||||
newline()
|
newline()
|
||||||
|
|
||||||
|
|
||||||
@ -1380,47 +1336,11 @@ table0.loadtable(3)
|
|||||||
asmclose()
|
asmclose()
|
||||||
newline()
|
newline()
|
||||||
|
|
||||||
# 8 directions = 6x permutations
|
|
||||||
d['factor'] = 2 # factor is 0
|
|
||||||
d['cycles_PERM'] += 6 * d['factor']
|
|
||||||
write('// PERM0')
|
|
||||||
definemultiline(F'PERM0_{PRECSUFFIX}')
|
|
||||||
debugall('PERM0 PRE', group='Chi')
|
|
||||||
asmopen()
|
|
||||||
#table0.loadtable(0)
|
|
||||||
Chi_00.permute(0, table0)
|
|
||||||
Chi_01.permute(0, table0)
|
|
||||||
Chi_02.permute(0, table0)
|
|
||||||
Chi_10.permute(0, table0)
|
|
||||||
Chi_11.permute(0, table0)
|
|
||||||
Chi_12.permute(0, table0)
|
|
||||||
asmclose()
|
|
||||||
debugall('PERM0 POST', group='Chi')
|
|
||||||
newline()
|
|
||||||
|
|
||||||
d['factor'] = 2 # factor is 2
|
d['factor'] = 2 # factor is 2
|
||||||
d['cycles_PERM'] += 6 * d['factor']
|
d['cycles_PERM'] += 6 * d['factor']
|
||||||
write('// PERM1')
|
write('// PERMUTE')
|
||||||
definemultiline(F'PERM1_{PRECSUFFIX}')
|
definemultiline(F'PERMUTE_{PRECSUFFIX}')
|
||||||
debugall('PERM1 PRE', group='Chi')
|
debugall('PERM PRE', group='Chi')
|
||||||
asmopen()
|
|
||||||
#table0.loadtable(1)
|
|
||||||
Chi_00.permute(1, table0)
|
|
||||||
Chi_01.permute(1, table0)
|
|
||||||
Chi_02.permute(1, table0)
|
|
||||||
Chi_10.permute(1, table0)
|
|
||||||
Chi_11.permute(1, table0)
|
|
||||||
Chi_12.permute(1, table0)
|
|
||||||
asmclose()
|
|
||||||
debugall('PERM1 POST', group='Chi')
|
|
||||||
newline()
|
|
||||||
|
|
||||||
d['factor'] = 2 # factor is 2
|
|
||||||
# PERM2 = swap real and imaginary
|
|
||||||
d['cycles_PERM'] += 6 * d['factor']
|
|
||||||
write('// PERM2')
|
|
||||||
definemultiline(F'PERM2_{PRECSUFFIX}')
|
|
||||||
debugall('PERM2 PRE', group='Chi')
|
|
||||||
asmopen()
|
asmopen()
|
||||||
#table0.loadtable(2)
|
#table0.loadtable(2)
|
||||||
Chi_00.permute(2, table0)
|
Chi_00.permute(2, table0)
|
||||||
@ -1430,26 +1350,7 @@ Chi_10.permute(2, table0)
|
|||||||
Chi_11.permute(2, table0)
|
Chi_11.permute(2, table0)
|
||||||
Chi_12.permute(2, table0)
|
Chi_12.permute(2, table0)
|
||||||
asmclose()
|
asmclose()
|
||||||
debugall('PERM2 POST', group='Chi')
|
debugall('PERM POST', group='Chi')
|
||||||
newline()
|
|
||||||
|
|
||||||
# PERM3 = identity (DP), so exclude from counting
|
|
||||||
d['factor'] = 0
|
|
||||||
d['cycles_PERM'] += 6 * d['factor']
|
|
||||||
write('// PERM3')
|
|
||||||
definemultiline(F'PERM3_{PRECSUFFIX}')
|
|
||||||
if PRECISION == 'single':
|
|
||||||
debugall('PERM3 PRE', group='Chi')
|
|
||||||
asmopen()
|
|
||||||
#table0.loadtable(3)
|
|
||||||
Chi_00.permute(3, table0)
|
|
||||||
Chi_01.permute(3, table0)
|
|
||||||
Chi_02.permute(3, table0)
|
|
||||||
Chi_10.permute(3, table0)
|
|
||||||
Chi_11.permute(3, table0)
|
|
||||||
Chi_12.permute(3, table0)
|
|
||||||
asmclose()
|
|
||||||
debugall('PERM3 POST', group='Chi')
|
|
||||||
newline()
|
newline()
|
||||||
|
|
||||||
write('// LOAD_GAUGE')
|
write('// LOAD_GAUGE')
|
||||||
@ -1473,7 +1374,7 @@ if ASM_LOAD_GAUGE:
|
|||||||
asmclose()
|
asmclose()
|
||||||
curlyclose()
|
curlyclose()
|
||||||
newline()
|
newline()
|
||||||
# XXXXXX remove loads
|
|
||||||
d['factor'] = 8 # MULT_2SPIN executes 1 time per direction = 8 times total
|
d['factor'] = 8 # MULT_2SPIN executes 1 time per direction = 8 times total
|
||||||
# assume all U loads are hidden
|
# assume all U loads are hidden
|
||||||
# FCMLA issue latency = 2 cycles
|
# FCMLA issue latency = 2 cycles
|
||||||
@ -1482,7 +1383,7 @@ d['factor'] = 8 # MULT_2SPIN executes 1 time per direction = 8 times total
|
|||||||
# 6 rounds of FCMLA, each with 6 FCMLA -> 21 - 6*2 = 9
|
# 6 rounds of FCMLA, each with 6 FCMLA -> 21 - 6*2 = 9
|
||||||
d['cycles_MULT_2SPIN'] += 6 * 21 * d['factor']
|
d['cycles_MULT_2SPIN'] += 6 * 21 * d['factor']
|
||||||
write('// MULT_2SPIN')
|
write('// MULT_2SPIN')
|
||||||
definemultiline(F'MULT_2SPIN_{PRECSUFFIX}(A)')
|
definemultiline(F'MULT_2SPIN_1_{PRECSUFFIX}(A)')
|
||||||
curlyopen()
|
curlyopen()
|
||||||
#write(' const auto & ref(U[sU][A]); \\')
|
#write(' const auto & ref(U[sU][A]); \\')
|
||||||
if GRIDBENCH: # referencing differs in Grid and GridBench
|
if GRIDBENCH: # referencing differs in Grid and GridBench
|
||||||
@ -1541,7 +1442,15 @@ if ASM_LOAD_GAUGE:
|
|||||||
U_00.load("ref[0][2]") # U_00, U_10, U_20 overloaded
|
U_00.load("ref[0][2]") # U_00, U_10, U_20 overloaded
|
||||||
U_10.load("ref[1][2]") # early load
|
U_10.load("ref[1][2]") # early load
|
||||||
U_20.load("ref[2][2]") # A -->
|
U_20.load("ref[2][2]") # A -->
|
||||||
|
asmclose()
|
||||||
|
debugall('MULT_2SPIN_1', group='UChi')
|
||||||
|
curlyclose()
|
||||||
|
newline()
|
||||||
|
|
||||||
|
write('// MULT_2SPIN_BACKEND')
|
||||||
|
definemultiline(F'MULT_2SPIN_2_{PRECSUFFIX}')
|
||||||
|
curlyopen()
|
||||||
|
asmopen()
|
||||||
# round 3
|
# round 3
|
||||||
UChi_00.mac0(U_01, Chi_01) # armclang separates fcmla(..., 0) and
|
UChi_00.mac0(U_01, Chi_01) # armclang separates fcmla(..., 0) and
|
||||||
UChi_10.mac0(U_01, Chi_11) # fcmla(..., 90)
|
UChi_10.mac0(U_01, Chi_11) # fcmla(..., 90)
|
||||||
@ -1571,7 +1480,7 @@ UChi_11.mac1(U_10, Chi_12)
|
|||||||
UChi_02.mac1(U_20, Chi_02)
|
UChi_02.mac1(U_20, Chi_02)
|
||||||
UChi_12.mac1(U_20, Chi_12)
|
UChi_12.mac1(U_20, Chi_12)
|
||||||
asmclose()
|
asmclose()
|
||||||
debugall('MULT_2SPIN', group='UChi')
|
debugall('MULT_2SPIN_2', group='UChi')
|
||||||
curlyclose()
|
curlyclose()
|
||||||
newline()
|
newline()
|
||||||
|
|
||||||
@ -1587,7 +1496,7 @@ if ALTERNATIVE_LOADS == True:
|
|||||||
write(' LOAD_CHIMU_0312_PLUG \\')
|
write(' LOAD_CHIMU_0312_PLUG \\')
|
||||||
curlyopen()
|
curlyopen()
|
||||||
asmopen()
|
asmopen()
|
||||||
pg1.loadpredication()
|
#pg1.loadpredication()
|
||||||
Chi_00.addTimesI(Chimu_00, Chimu_30)
|
Chi_00.addTimesI(Chimu_00, Chimu_30)
|
||||||
Chi_01.addTimesI(Chimu_01, Chimu_31)
|
Chi_01.addTimesI(Chimu_01, Chimu_31)
|
||||||
Chi_02.addTimesI(Chimu_02, Chimu_32)
|
Chi_02.addTimesI(Chimu_02, Chimu_32)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user