1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-17 15:27:06 +01:00

revised BodyA64FX; 990 GiB/s Wilson, 687 GiB/s DW using intrinsics (armclang 20.0)

This commit is contained in:
nils meyer
2020-04-16 22:43:32 +02:00
parent 852db4626a
commit 6fdce60492
7 changed files with 279 additions and 451 deletions

View File

@ -25,7 +25,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#define LOAD_CHIMU_A64FXf(x) LOAD_CHIMU_INTERLEAVED_A64FXf(x)
#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXf(base)
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A)
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A)
@ -40,23 +40,19 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define MASK_REGS DECLARATIONS_A64FXf
#define COMPLEX_SIGNS(A)
#define LOAD64(A,B)
#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B);
#define MULT_2SPIN_DIR_PF(A,B) \
MULT_2SPIN_A64FXf(A); \
PREFETCH_CHIMU_L2(B); \
if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } }
#define MAYBEPERM(A,perm) { A ; }
#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B)
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir)
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf
#define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
#define ZERO_PSI
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf
#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf
#define TP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TP_PROJ_A64FXf
#define XM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XM_PROJ_A64FXf
#define YM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YM_PROJ_A64FXf
#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZM_PROJ_A64FXf
#define TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf
#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
#define XP_PROJ XP_PROJ_A64FXf
#define YP_PROJ YP_PROJ_A64FXf
#define ZP_PROJ ZP_PROJ_A64FXf
#define TP_PROJ TP_PROJ_A64FXf
#define XM_PROJ XM_PROJ_A64FXf
#define YM_PROJ YM_PROJ_A64FXf
#define ZM_PROJ ZM_PROJ_A64FXf
#define TM_PROJ TM_PROJ_A64FXf
#define XP_RECON XP_RECON_A64FXf
#define XM_RECON XM_RECON_A64FXf
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf
@ -67,10 +63,13 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf
#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXf; }
#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXf; }
#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXf; }
#define PERMUTE_DIR3 LOAD_TABLE3; if (perm) { PERM3_A64FXf; }
#define PERMUTE_DIR0 0
#define PERMUTE_DIR1 1
#define PERMUTE_DIR2 2
#define PERMUTE_DIR3 3
#define PERMUTE PERMUTE_A64FXf;
#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1 } else if (Dir == 2) { LOAD_TABLE2; } else if (Dir == 3) { LOAD_TABLE3; }
#define MAYBEPERM(A,perm) if (perm) { PERMUTE; }
// DECLARATIONS
#define DECLARATIONS_A64FXf \
const uint32_t lut[4][16] = { \
@ -254,35 +253,8 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define LOAD_TABLE3 \
table0 = svld1(pg1, (uint32_t*)&lut[3]);
// PERM0
#define PERM0_A64FXf \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
Chi_10 = svtbl(Chi_10, table0); \
Chi_11 = svtbl(Chi_11, table0); \
Chi_12 = svtbl(Chi_12, table0);
// PERM1
#define PERM1_A64FXf \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
Chi_10 = svtbl(Chi_10, table0); \
Chi_11 = svtbl(Chi_11, table0); \
Chi_12 = svtbl(Chi_12, table0);
// PERM2
#define PERM2_A64FXf \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
Chi_10 = svtbl(Chi_10, table0); \
Chi_11 = svtbl(Chi_11, table0); \
Chi_12 = svtbl(Chi_12, table0);
// PERM3
#define PERM3_A64FXf \
// PERMUTE
#define PERMUTE_A64FXf \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
@ -302,7 +274,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
}
// MULT_2SPIN
#define MULT_2SPIN_A64FXf(A) \
#define MULT_2SPIN_1_A64FXf(A) \
{ \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \
@ -326,6 +298,10 @@ Author: Nils Meyer <nils.meyer@ur.de>
U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \
U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \
U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \
}
// MULT_2SPIN_BACKEND
#define MULT_2SPIN_2_A64FXf \
{ \
UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \
UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \
UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \