mirror of
https://github.com/paboyle/Grid.git
synced 2026-01-03 19:29:29 +00:00
980 GiB/s Wilson; 680 GiB/s DW (DP)
This commit is contained in:
@@ -31,6 +31,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A)
|
||||
#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A)
|
||||
#define PF_GAUGE(A)
|
||||
#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(A)
|
||||
#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A)
|
||||
#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A)
|
||||
#define LOCK_GAUGE(A)
|
||||
@@ -38,8 +39,11 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
#define MASK_REGS DECLARATIONS_A64FXd
|
||||
#define COMPLEX_SIGNS(A)
|
||||
#define LOAD64(A,B)
|
||||
#define SAVE_RESULT(A,B) RESULT_A64FXd(A)
|
||||
#define MULT_2SPIN_DIR_PF(A,B) PREFETCH_GAUGE_L1(A); PREFETCH_CHIMU_L2(B); MULT_2SPIN_A64FXd(A); if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); }
|
||||
#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_CHIMU_L1(B);
|
||||
#define MULT_2SPIN_DIR_PF(A,B) \
|
||||
MULT_2SPIN_A64FXd(A); \
|
||||
PREFETCH_CHIMU_L2(B); \
|
||||
if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } }
|
||||
#define MAYBEPERM(A,perm) { A ; }
|
||||
#define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
|
||||
#define ZERO_PSI
|
||||
@@ -321,12 +325,28 @@ asm ( \
|
||||
// PERM3
|
||||
#define PERM3_A64FXd
|
||||
|
||||
// LOAD_GAUGE
|
||||
#define LOAD_GAUGE \
|
||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||
{ \
|
||||
asm ( \
|
||||
"ptrue p5.d \n\t" \
|
||||
"ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
|
||||
"ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
|
||||
"ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \
|
||||
"ld1d { z27.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
|
||||
"ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
|
||||
"ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \
|
||||
: \
|
||||
: [fetchptr] "r" (baseU + 2 * 3 * 64) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
); \
|
||||
}
|
||||
// MULT_2SPIN
|
||||
#define MULT_2SPIN_A64FXd(A) \
|
||||
{ \
|
||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||
asm ( \
|
||||
"ptrue p5.d \n\t" \
|
||||
"ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
|
||||
"ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
|
||||
"ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \
|
||||
@@ -738,6 +758,18 @@ asm ( \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// PREFETCH_RESULT_L2_STORE (prefetch store to L2)
|
||||
#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \
|
||||
{ \
|
||||
asm ( \
|
||||
"prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
|
||||
"prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
|
||||
"prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
|
||||
: \
|
||||
: [fetchptr] "r" (base) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
); \
|
||||
}
|
||||
// ADD_RESULT_INTERNAL
|
||||
#define ADD_RESULT_INTERNAL_A64FXd \
|
||||
asm ( \
|
||||
|
||||
@@ -31,6 +31,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A)
|
||||
#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A)
|
||||
#define PF_GAUGE(A)
|
||||
#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(A)
|
||||
#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A)
|
||||
#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A)
|
||||
#define LOCK_GAUGE(A)
|
||||
@@ -38,8 +39,11 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
#define MASK_REGS DECLARATIONS_A64FXf
|
||||
#define COMPLEX_SIGNS(A)
|
||||
#define LOAD64(A,B)
|
||||
#define SAVE_RESULT(A,B) RESULT_A64FXf(A)
|
||||
#define MULT_2SPIN_DIR_PF(A,B) PREFETCH_GAUGE_L1(A); PREFETCH_CHIMU_L2(B); MULT_2SPIN_A64FXf(A); if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); }
|
||||
#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_CHIMU_L1(B);
|
||||
#define MULT_2SPIN_DIR_PF(A,B) \
|
||||
MULT_2SPIN_A64FXf(A); \
|
||||
PREFETCH_CHIMU_L2(B); \
|
||||
if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } }
|
||||
#define MAYBEPERM(A,perm) { A ; }
|
||||
#define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
|
||||
#define ZERO_PSI
|
||||
@@ -332,12 +336,28 @@ asm ( \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// LOAD_GAUGE
|
||||
#define LOAD_GAUGE \
|
||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||
{ \
|
||||
asm ( \
|
||||
"ptrue p5.s \n\t" \
|
||||
"ld1w { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
|
||||
"ld1w { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
|
||||
"ld1w { z26.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \
|
||||
"ld1w { z27.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
|
||||
"ld1w { z28.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
|
||||
"ld1w { z29.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \
|
||||
: \
|
||||
: [fetchptr] "r" (baseU + 2 * 3 * 64) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
); \
|
||||
}
|
||||
// MULT_2SPIN
|
||||
#define MULT_2SPIN_A64FXf(A) \
|
||||
{ \
|
||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||
asm ( \
|
||||
"ptrue p5.s \n\t" \
|
||||
"ld1w { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
|
||||
"ld1w { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
|
||||
"ld1w { z26.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \
|
||||
@@ -749,6 +769,18 @@ asm ( \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// PREFETCH_RESULT_L2_STORE (prefetch store to L2)
|
||||
#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \
|
||||
{ \
|
||||
asm ( \
|
||||
"prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
|
||||
"prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
|
||||
"prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
|
||||
: \
|
||||
: [fetchptr] "r" (base) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
); \
|
||||
}
|
||||
// ADD_RESULT_INTERNAL
|
||||
#define ADD_RESULT_INTERNAL_A64FXf \
|
||||
asm ( \
|
||||
|
||||
@@ -31,6 +31,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A)
|
||||
#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A)
|
||||
#define PF_GAUGE(A)
|
||||
#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(A)
|
||||
#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A)
|
||||
#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A)
|
||||
#define LOCK_GAUGE(A)
|
||||
@@ -38,8 +39,11 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
#define MASK_REGS DECLARATIONS_A64FXd
|
||||
#define COMPLEX_SIGNS(A)
|
||||
#define LOAD64(A,B)
|
||||
#define SAVE_RESULT(A,B) RESULT_A64FXd(A)
|
||||
#define MULT_2SPIN_DIR_PF(A,B) PREFETCH_GAUGE_L1(A); PREFETCH_CHIMU_L2(B); MULT_2SPIN_A64FXd(A); if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); }
|
||||
#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_CHIMU_L1(B);
|
||||
#define MULT_2SPIN_DIR_PF(A,B) \
|
||||
MULT_2SPIN_A64FXd(A); \
|
||||
PREFETCH_CHIMU_L2(B); \
|
||||
if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } }
|
||||
#define MAYBEPERM(A,perm) { A ; }
|
||||
#define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
|
||||
#define ZERO_PSI
|
||||
@@ -279,6 +283,17 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
// PERM3
|
||||
#define PERM3_A64FXd
|
||||
|
||||
// LOAD_GAUGE
|
||||
#define LOAD_GAUGE \
|
||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||
{ \
|
||||
U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \
|
||||
U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \
|
||||
U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \
|
||||
U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \
|
||||
U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \
|
||||
U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
|
||||
}
|
||||
// MULT_2SPIN
|
||||
#define MULT_2SPIN_A64FXd(A) \
|
||||
{ \
|
||||
@@ -574,6 +589,13 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
result_31 = __svzero(result_31); \
|
||||
result_32 = __svzero(result_32);
|
||||
|
||||
// PREFETCH_RESULT_L2_STORE (prefetch store to L2)
|
||||
#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \
|
||||
{ \
|
||||
svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \
|
||||
svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \
|
||||
svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \
|
||||
}
|
||||
// ADD_RESULT_INTERNAL
|
||||
#define ADD_RESULT_INTERNAL_A64FXd \
|
||||
result_00 = svadd_x(pg1, result_00, Chimu_00); \
|
||||
|
||||
@@ -31,6 +31,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A)
|
||||
#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A)
|
||||
#define PF_GAUGE(A)
|
||||
#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(A)
|
||||
#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A)
|
||||
#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A)
|
||||
#define LOCK_GAUGE(A)
|
||||
@@ -38,8 +39,11 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
#define MASK_REGS DECLARATIONS_A64FXf
|
||||
#define COMPLEX_SIGNS(A)
|
||||
#define LOAD64(A,B)
|
||||
#define SAVE_RESULT(A,B) RESULT_A64FXf(A)
|
||||
#define MULT_2SPIN_DIR_PF(A,B) PREFETCH_GAUGE_L1(A); PREFETCH_CHIMU_L2(B); MULT_2SPIN_A64FXf(A); if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); }
|
||||
#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_CHIMU_L1(B);
|
||||
#define MULT_2SPIN_DIR_PF(A,B) \
|
||||
MULT_2SPIN_A64FXf(A); \
|
||||
PREFETCH_CHIMU_L2(B); \
|
||||
if (s == 0) { if ((A == 0) || (A == 4)) { PREFETCH_GAUGE_L2(A); } }
|
||||
#define MAYBEPERM(A,perm) { A ; }
|
||||
#define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
|
||||
#define ZERO_PSI
|
||||
@@ -285,6 +289,17 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
Chi_11 = svtbl(Chi_11, table0); \
|
||||
Chi_12 = svtbl(Chi_12, table0);
|
||||
|
||||
// LOAD_GAUGE
|
||||
#define LOAD_GAUGE \
|
||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||
{ \
|
||||
U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \
|
||||
U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \
|
||||
U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \
|
||||
U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \
|
||||
U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \
|
||||
U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
|
||||
}
|
||||
// MULT_2SPIN
|
||||
#define MULT_2SPIN_A64FXf(A) \
|
||||
{ \
|
||||
@@ -580,6 +595,13 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
result_31 = __svzero(result_31); \
|
||||
result_32 = __svzero(result_32);
|
||||
|
||||
// PREFETCH_RESULT_L2_STORE (prefetch store to L2)
|
||||
#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \
|
||||
{ \
|
||||
svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \
|
||||
svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \
|
||||
svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \
|
||||
}
|
||||
// ADD_RESULT_INTERNAL
|
||||
#define ADD_RESULT_INTERNAL_A64FXf \
|
||||
result_00 = svadd_x(pg1, result_00, Chimu_00); \
|
||||
|
||||
@@ -36,6 +36,8 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
#undef PF_GAUGE
|
||||
#undef PREFETCH1_CHIMU
|
||||
#undef PREFETCH_CHIMU
|
||||
#undef PREFETCH_RESULT_L2_STORE
|
||||
#undef LOAD_GAUGE
|
||||
#undef LOCK_GAUGE
|
||||
#undef UNLOCK_GAUGE
|
||||
#undef MASK_REGS
|
||||
|
||||
2410
Grid/simd/gridverter.py
Executable file
2410
Grid/simd/gridverter.py
Executable file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user