Mirror of https://github.com/paboyle/Grid.git (synced 2025-10-27 18:19:34 +00:00)
			
		
		
		
	Up to +36% performance gain for dslash/dwf on QPACE 4 using GCC 10.1.1
This commit is contained in:
		| @@ -38,10 +38,11 @@ Author: Nils Meyer <nils.meyer@ur.de> | ||||
| #define LOCK_GAUGE(A)   | ||||
| #define UNLOCK_GAUGE(A)   | ||||
| #define MASK_REGS                      DECLARATIONS_A64FXd   | ||||
| #define SAVE_RESULT(A,B)               RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B)   | ||||
| #define SAVE_RESULT(A,B)               RESULT_A64FXd(A);   | ||||
| #define MULT_2SPIN_1(Dir)              MULT_2SPIN_1_A64FXd(Dir)   | ||||
| #define MULT_2SPIN_2                   MULT_2SPIN_2_A64FXd   | ||||
| #define LOAD_CHI(base)                 LOAD_CHI_A64FXd(base)   | ||||
| #define ZERO_PSI                       ZERO_PSI_A64FXd   | ||||
| #define ADD_RESULT(base,basep)         LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)   | ||||
| #define XP_PROJ                        XP_PROJ_A64FXd   | ||||
| #define YP_PROJ                        YP_PROJ_A64FXd   | ||||
| @@ -70,11 +71,18 @@ Author: Nils Meyer <nils.meyer@ur.de> | ||||
| #define MAYBEPERM(Dir,perm)            if (Dir != 3) { if (perm) { PERMUTE; } }   | ||||
| // DECLARATIONS | ||||
| #define DECLARATIONS_A64FXd  \ | ||||
|     uint64_t baseU; \ | ||||
|     const uint64_t lut[4][8] = { \ | ||||
|         {4, 5, 6, 7, 0, 1, 2, 3}, \ | ||||
|         {2, 3, 0, 1, 6, 7, 4, 5}, \ | ||||
|         {1, 0, 3, 2, 5, 4, 7, 6}, \ | ||||
|         {0, 1, 2, 4, 5, 6, 7, 8} };\ | ||||
| asm ( \ | ||||
|     "ptrue p5.d \n\t" \ | ||||
|     :  \ | ||||
|     :  \ | ||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ | ||||
| ); \ | ||||
| asm ( \ | ||||
|     "fmov z31.d , 0 \n\t" \ | ||||
|     :  \ | ||||
| @@ -130,7 +138,7 @@ asm ( \ | ||||
| // PREFETCH_GAUGE_L2 (prefetch to L2) | ||||
| #define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A)  \ | ||||
| { \ | ||||
|     const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ | ||||
|     const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ | ||||
| asm ( \ | ||||
|     "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ | ||||
|     "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
| @@ -149,7 +157,7 @@ asm ( \ | ||||
| // PREFETCH_GAUGE_L1 (prefetch to L1) | ||||
| #define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)  \ | ||||
| { \ | ||||
|     const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ | ||||
|     const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ | ||||
| asm ( \ | ||||
|     "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
|     "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ | ||||
| @@ -163,12 +171,12 @@ asm ( \ | ||||
| #define LOAD_CHI_A64FXd(base)  \ | ||||
| { \ | ||||
| asm ( \ | ||||
|     "ldr z12, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
|     "ldr z13, [%[fetchptr], 1, mul vl] \n\t" \ | ||||
|     "ldr z14, [%[fetchptr], 2, mul vl] \n\t" \ | ||||
|     "ldr z15, [%[fetchptr], 3, mul vl] \n\t" \ | ||||
|     "ldr z16, [%[fetchptr], 4, mul vl] \n\t" \ | ||||
|     "ldr z17, [%[fetchptr], 5, mul vl] \n\t" \ | ||||
|     "ld1d { z12.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
|     "ld1d { z13.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ | ||||
|     "ld1d { z14.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ | ||||
|     "ld1d { z15.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ | ||||
|     "ld1d { z16.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ | ||||
|     "ld1d { z17.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ | ||||
|     :  \ | ||||
|     : [fetchptr] "r" (base) \ | ||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ | ||||
| @@ -178,19 +186,18 @@ asm ( \ | ||||
| #define LOAD_CHIMU_INTERLEAVED_A64FXd(base)  \ | ||||
| { \ | ||||
| asm ( \ | ||||
|     "ptrue p5.d \n\t" \ | ||||
|     "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ | ||||
|     "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ | ||||
|     "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ | ||||
|     "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
|     "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ | ||||
|     "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ | ||||
|     "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ | ||||
|     "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ | ||||
|     "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ | ||||
|     "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ | ||||
|     "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ | ||||
|     "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ | ||||
|     "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ | ||||
|     "ld1d { z21.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ | ||||
|     "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ | ||||
|     "ld1d { z18.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
|     "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ | ||||
|     "ld1d { z22.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ | ||||
|     "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ | ||||
|     "ld1d { z19.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ | ||||
|     "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ | ||||
|     "ld1d { z23.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ | ||||
|     "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ | ||||
|     "ld1d { z20.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ | ||||
|     :  \ | ||||
|     : [fetchptr] "r" (base + 2 * 3 * 64) \ | ||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ | ||||
| @@ -201,19 +208,18 @@ asm ( \ | ||||
| { \ | ||||
|     const SiteSpinor & ref(in[offset]); \ | ||||
| asm ( \ | ||||
|     "ptrue p5.d \n\t" \ | ||||
|     "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ | ||||
|     "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
|     "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ | ||||
|     "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ | ||||
|     "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ | ||||
|     "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ | ||||
|     "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ | ||||
|     "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ | ||||
|     "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ | ||||
|     "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ | ||||
|     "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ | ||||
|     "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ | ||||
|     "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ | ||||
|     "ld1d { z18.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
|     "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ | ||||
|     "ld1d { z19.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ | ||||
|     "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ | ||||
|     "ld1d { z20.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ | ||||
|     "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ | ||||
|     "ld1d { z21.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ | ||||
|     "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ | ||||
|     "ld1d { z22.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ | ||||
|     "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ | ||||
|     "ld1d { z23.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ | ||||
|     :  \ | ||||
|     : [fetchptr] "r" (&ref[2][0]) \ | ||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ | ||||
| @@ -224,19 +230,18 @@ asm ( \ | ||||
| { \ | ||||
|     const SiteSpinor & ref(in[offset]); \ | ||||
| asm ( \ | ||||
|     "ptrue p5.d \n\t" \ | ||||
|     "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ | ||||
|     "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ | ||||
|     "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ | ||||
|     "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ | ||||
|     "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ | ||||
|     "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ | ||||
|     "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ | ||||
|     "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
|     "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ | ||||
|     "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ | ||||
|     "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ | ||||
|     "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ | ||||
|     "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ | ||||
|     "ld1d { z21.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ | ||||
|     "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ | ||||
|     "ld1d { z22.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ | ||||
|     "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ | ||||
|     "ld1d { z23.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ | ||||
|     "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ | ||||
|     "ld1d { z18.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
|     "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ | ||||
|     "ld1d { z19.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ | ||||
|     "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ | ||||
|     "ld1d { z20.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ | ||||
|     :  \ | ||||
|     : [fetchptr] "r" (&ref[2][0]) \ | ||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ | ||||
| @@ -293,17 +298,16 @@ asm ( \ | ||||
| );  | ||||
|  | ||||
| // LOAD_GAUGE | ||||
| #define LOAD_GAUGE  \ | ||||
|     const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ | ||||
| #define LOAD_GAUGE(A)  \ | ||||
| { \ | ||||
|     const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ | ||||
| asm ( \ | ||||
|     "ptrue p5.d \n\t" \ | ||||
|     "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ | ||||
|     "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ | ||||
|     "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
|     "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ | ||||
|     "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ | ||||
|     "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ | ||||
|     "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ | ||||
|     "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ | ||||
|     "ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
|     "ld1d { z27.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ | ||||
|     "ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ | ||||
|     "ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ | ||||
|     :  \ | ||||
|     : [fetchptr] "r" (baseU + 2 * 3 * 64) \ | ||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ | ||||
| @@ -312,14 +316,14 @@ asm ( \ | ||||
| // MULT_2SPIN | ||||
| #define MULT_2SPIN_1_A64FXd(A)  \ | ||||
| { \ | ||||
|     const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ | ||||
|     const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ | ||||
| asm ( \ | ||||
|     "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ | ||||
|     "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ | ||||
|     "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
|     "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ | ||||
|     "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ | ||||
|     "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ | ||||
|     "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ | ||||
|     "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ | ||||
|     "ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
|     "ld1d { z27.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ | ||||
|     "ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ | ||||
|     "ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ | ||||
|     "movprfx z18.d, p5/m, z31.d \n\t" \ | ||||
|     "fcmla z18.d, p5/m, z24.d, z12.d, 0 \n\t" \ | ||||
|     "movprfx z21.d, p5/m, z31.d \n\t" \ | ||||
| @@ -338,9 +342,9 @@ asm ( \ | ||||
|     "fcmla z22.d, p5/m, z25.d, z15.d, 90 \n\t" \ | ||||
|     "fcmla z20.d, p5/m, z26.d, z12.d, 90 \n\t" \ | ||||
|     "fcmla z23.d, p5/m, z26.d, z15.d, 90 \n\t" \ | ||||
|     "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \ | ||||
|     "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \ | ||||
|     "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \ | ||||
|     "ld1d { z24.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ | ||||
|     "ld1d { z25.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ | ||||
|     "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ | ||||
|     :  \ | ||||
|     : [fetchptr] "r" (baseU + 2 * 3 * 64) \ | ||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ | ||||
| @@ -560,7 +564,6 @@ asm ( \ | ||||
| #define TM_PROJ_A64FXd  \ | ||||
| { \ | ||||
| asm ( \ | ||||
|     "ptrue p5.d \n\t" \ | ||||
|     "fsub z12.d, p5/m, z12.d, z18.d \n\t" \ | ||||
|     "fsub z13.d, p5/m, z13.d, z19.d \n\t" \ | ||||
|     "fsub z14.d, p5/m, z14.d, z20.d \n\t" \ | ||||
| @@ -715,7 +718,6 @@ asm ( \ | ||||
| // ZERO_PSI | ||||
| #define ZERO_PSI_A64FXd  \ | ||||
| asm ( \ | ||||
|     "ptrue p5.d \n\t" \ | ||||
|     "fmov z0.d , 0 \n\t" \ | ||||
|     "fmov z1.d , 0 \n\t" \ | ||||
|     "fmov z2.d , 0 \n\t" \ | ||||
| @@ -733,13 +735,13 @@ asm ( \ | ||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ | ||||
| );  | ||||
|  | ||||
| // PREFETCH_RESULT_L2_STORE (prefetch store to L2) | ||||
| // PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) | ||||
| #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base)  \ | ||||
| { \ | ||||
| asm ( \ | ||||
|     "prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
|     "prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ | ||||
|     "prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ | ||||
|     "dc zva, %[fetchptr]\n\t" \ | ||||
|     "dc zva, %[fetchptr]\n\t" \ | ||||
|     "dc zva, %[fetchptr]\n\t" \ | ||||
|     :  \ | ||||
|     : [fetchptr] "r" (base) \ | ||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ | ||||
|   | ||||
| @@ -38,10 +38,11 @@ Author: Nils Meyer <nils.meyer@ur.de> | ||||
| #define LOCK_GAUGE(A)   | ||||
| #define UNLOCK_GAUGE(A)   | ||||
| #define MASK_REGS                      DECLARATIONS_A64FXf   | ||||
| #define SAVE_RESULT(A,B)               RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B)   | ||||
| #define SAVE_RESULT(A,B)               RESULT_A64FXf(A);   | ||||
| #define MULT_2SPIN_1(Dir)              MULT_2SPIN_1_A64FXf(Dir)   | ||||
| #define MULT_2SPIN_2                   MULT_2SPIN_2_A64FXf   | ||||
| #define LOAD_CHI(base)                 LOAD_CHI_A64FXf(base)   | ||||
| #define ZERO_PSI                       ZERO_PSI_A64FXf   | ||||
| #define ADD_RESULT(base,basep)         LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)   | ||||
| #define XP_PROJ                        XP_PROJ_A64FXf   | ||||
| #define YP_PROJ                        YP_PROJ_A64FXf   | ||||
| @@ -70,11 +71,18 @@ Author: Nils Meyer <nils.meyer@ur.de> | ||||
| #define MAYBEPERM(A,perm)              if (perm) { PERMUTE; }   | ||||
| // DECLARATIONS | ||||
| #define DECLARATIONS_A64FXf  \ | ||||
|     uint64_t baseU; \ | ||||
|     const uint32_t lut[4][16] = { \ | ||||
|         {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ | ||||
|         {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ | ||||
|         {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \ | ||||
|         {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \ | ||||
| asm ( \ | ||||
|     "ptrue p5.s \n\t" \ | ||||
|     :  \ | ||||
|     :  \ | ||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ | ||||
| ); \ | ||||
| asm ( \ | ||||
|     "fmov z31.s , 0 \n\t" \ | ||||
|     :  \ | ||||
| @@ -130,7 +138,7 @@ asm ( \ | ||||
| // PREFETCH_GAUGE_L2 (prefetch to L2) | ||||
| #define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A)  \ | ||||
| { \ | ||||
|     const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ | ||||
|     const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ | ||||
| asm ( \ | ||||
|     "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ | ||||
|     "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
| @@ -149,7 +157,7 @@ asm ( \ | ||||
| // PREFETCH_GAUGE_L1 (prefetch to L1) | ||||
| #define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)  \ | ||||
| { \ | ||||
|     const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ | ||||
|     const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ | ||||
| asm ( \ | ||||
|     "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
|     "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ | ||||
| @@ -163,12 +171,12 @@ asm ( \ | ||||
| #define LOAD_CHI_A64FXf(base)  \ | ||||
| { \ | ||||
| asm ( \ | ||||
|     "ldr z12, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
|     "ldr z13, [%[fetchptr], 1, mul vl] \n\t" \ | ||||
|     "ldr z14, [%[fetchptr], 2, mul vl] \n\t" \ | ||||
|     "ldr z15, [%[fetchptr], 3, mul vl] \n\t" \ | ||||
|     "ldr z16, [%[fetchptr], 4, mul vl] \n\t" \ | ||||
|     "ldr z17, [%[fetchptr], 5, mul vl] \n\t" \ | ||||
|     "ld1w { z12.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
|     "ld1w { z13.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ | ||||
|     "ld1w { z14.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ | ||||
|     "ld1w { z15.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ | ||||
|     "ld1w { z16.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ | ||||
|     "ld1w { z17.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ | ||||
|     :  \ | ||||
|     : [fetchptr] "r" (base) \ | ||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ | ||||
| @@ -178,19 +186,18 @@ asm ( \ | ||||
| #define LOAD_CHIMU_INTERLEAVED_A64FXf(base)  \ | ||||
| { \ | ||||
| asm ( \ | ||||
|     "ptrue p5.s \n\t" \ | ||||
|     "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ | ||||
|     "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ | ||||
|     "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ | ||||
|     "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
|     "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ | ||||
|     "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ | ||||
|     "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ | ||||
|     "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ | ||||
|     "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ | ||||
|     "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ | ||||
|     "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ | ||||
|     "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ | ||||
|     "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ | ||||
|     "ld1w { z21.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ | ||||
|     "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ | ||||
|     "ld1w { z18.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
|     "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ | ||||
|     "ld1w { z22.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ | ||||
|     "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ | ||||
|     "ld1w { z19.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ | ||||
|     "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ | ||||
|     "ld1w { z23.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ | ||||
|     "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ | ||||
|     "ld1w { z20.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ | ||||
|     :  \ | ||||
|     : [fetchptr] "r" (base + 2 * 3 * 64) \ | ||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ | ||||
| @@ -201,19 +208,18 @@ asm ( \ | ||||
| { \ | ||||
|     const SiteSpinor & ref(in[offset]); \ | ||||
| asm ( \ | ||||
|     "ptrue p5.s \n\t" \ | ||||
|     "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ | ||||
|     "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
|     "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ | ||||
|     "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ | ||||
|     "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ | ||||
|     "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ | ||||
|     "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ | ||||
|     "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ | ||||
|     "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ | ||||
|     "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ | ||||
|     "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ | ||||
|     "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ | ||||
|     "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ | ||||
|     "ld1w { z18.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
|     "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ | ||||
|     "ld1w { z19.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ | ||||
|     "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ | ||||
|     "ld1w { z20.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ | ||||
|     "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ | ||||
|     "ld1w { z21.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ | ||||
|     "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ | ||||
|     "ld1w { z22.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ | ||||
|     "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ | ||||
|     "ld1w { z23.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ | ||||
|     :  \ | ||||
|     : [fetchptr] "r" (&ref[2][0]) \ | ||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ | ||||
| @@ -224,19 +230,18 @@ asm ( \ | ||||
| { \ | ||||
|     const SiteSpinor & ref(in[offset]); \ | ||||
| asm ( \ | ||||
|     "ptrue p5.s \n\t" \ | ||||
|     "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ | ||||
|     "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ | ||||
|     "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ | ||||
|     "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ | ||||
|     "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ | ||||
|     "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ | ||||
|     "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ | ||||
|     "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
|     "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ | ||||
|     "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ | ||||
|     "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ | ||||
|     "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ | ||||
|     "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ | ||||
|     "ld1w { z21.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ | ||||
|     "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ | ||||
|     "ld1w { z22.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ | ||||
|     "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ | ||||
|     "ld1w { z23.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ | ||||
|     "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ | ||||
|     "ld1w { z18.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
|     "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ | ||||
|     "ld1w { z19.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ | ||||
|     "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ | ||||
|     "ld1w { z20.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ | ||||
|     :  \ | ||||
|     : [fetchptr] "r" (&ref[2][0]) \ | ||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ | ||||
| @@ -293,17 +298,16 @@ asm ( \ | ||||
| );  | ||||
|  | ||||
| // LOAD_GAUGE | ||||
| #define LOAD_GAUGE  \ | ||||
|     const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ | ||||
| #define LOAD_GAUGE(A)  \ | ||||
| { \ | ||||
|     const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ | ||||
| asm ( \ | ||||
|     "ptrue p5.s \n\t" \ | ||||
|     "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ | ||||
|     "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ | ||||
|     "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
|     "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ | ||||
|     "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ | ||||
|     "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ | ||||
|     "ld1w { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ | ||||
|     "ld1w { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ | ||||
|     "ld1w { z26.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
|     "ld1w { z27.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ | ||||
|     "ld1w { z28.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ | ||||
|     "ld1w { z29.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ | ||||
|     :  \ | ||||
|     : [fetchptr] "r" (baseU + 2 * 3 * 64) \ | ||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ | ||||
| @@ -312,14 +316,14 @@ asm ( \ | ||||
| // MULT_2SPIN | ||||
| #define MULT_2SPIN_1_A64FXf(A)  \ | ||||
| { \ | ||||
|     const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ | ||||
|     const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ | ||||
| asm ( \ | ||||
|     "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ | ||||
|     "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ | ||||
|     "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
|     "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ | ||||
|     "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ | ||||
|     "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ | ||||
|     "ld1w { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ | ||||
|     "ld1w { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ | ||||
|     "ld1w { z26.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
|     "ld1w { z27.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ | ||||
|     "ld1w { z28.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ | ||||
|     "ld1w { z29.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ | ||||
|     "movprfx z18.s, p5/m, z31.s \n\t" \ | ||||
|     "fcmla z18.s, p5/m, z24.s, z12.s, 0 \n\t" \ | ||||
|     "movprfx z21.s, p5/m, z31.s \n\t" \ | ||||
| @@ -338,9 +342,9 @@ asm ( \ | ||||
|     "fcmla z22.s, p5/m, z25.s, z15.s, 90 \n\t" \ | ||||
|     "fcmla z20.s, p5/m, z26.s, z12.s, 90 \n\t" \ | ||||
|     "fcmla z23.s, p5/m, z26.s, z15.s, 90 \n\t" \ | ||||
|     "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \ | ||||
|     "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \ | ||||
|     "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \ | ||||
|     "ld1w { z24.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ | ||||
|     "ld1w { z25.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ | ||||
|     "ld1w { z26.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ | ||||
|     :  \ | ||||
|     : [fetchptr] "r" (baseU + 2 * 3 * 64) \ | ||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ | ||||
| @@ -560,7 +564,6 @@ asm ( \ | ||||
| #define TM_PROJ_A64FXf  \ | ||||
| { \ | ||||
| asm ( \ | ||||
|     "ptrue p5.s \n\t" \ | ||||
|     "fsub z12.s, p5/m, z12.s, z18.s \n\t" \ | ||||
|     "fsub z13.s, p5/m, z13.s, z19.s \n\t" \ | ||||
|     "fsub z14.s, p5/m, z14.s, z20.s \n\t" \ | ||||
| @@ -715,7 +718,6 @@ asm ( \ | ||||
| // ZERO_PSI | ||||
| #define ZERO_PSI_A64FXf  \ | ||||
| asm ( \ | ||||
|     "ptrue p5.s \n\t" \ | ||||
|     "fmov z0.s , 0 \n\t" \ | ||||
|     "fmov z1.s , 0 \n\t" \ | ||||
|     "fmov z2.s , 0 \n\t" \ | ||||
| @@ -733,13 +735,13 @@ asm ( \ | ||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ | ||||
| );  | ||||
|  | ||||
| // PREFETCH_RESULT_L2_STORE (prefetch store to L2) | ||||
| // PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) | ||||
| #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base)  \ | ||||
| { \ | ||||
| asm ( \ | ||||
|     "prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ | ||||
|     "prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ | ||||
|     "prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ | ||||
|     "dc zva, %[fetchptr]\n\t" \ | ||||
|     "dc zva, %[fetchptr]\n\t" \ | ||||
|     "dc zva, %[fetchptr]\n\t" \ | ||||
|     :  \ | ||||
|     : [fetchptr] "r" (base) \ | ||||
|     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ | ||||
|   | ||||
| @@ -38,10 +38,11 @@ Author: Nils Meyer <nils.meyer@ur.de> | ||||
| #define LOCK_GAUGE(A)   | ||||
| #define UNLOCK_GAUGE(A)   | ||||
| #define MASK_REGS                      DECLARATIONS_A64FXd   | ||||
| #define SAVE_RESULT(A,B)               RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B)   | ||||
| #define SAVE_RESULT(A,B)               RESULT_A64FXd(A);   | ||||
| #define MULT_2SPIN_1(Dir)              MULT_2SPIN_1_A64FXd(Dir)   | ||||
| #define MULT_2SPIN_2                   MULT_2SPIN_2_A64FXd   | ||||
| #define LOAD_CHI(base)                 LOAD_CHI_A64FXd(base)   | ||||
| #define ZERO_PSI                       ZERO_PSI_A64FXd   | ||||
| #define ADD_RESULT(base,basep)         LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)   | ||||
| #define XP_PROJ                        XP_PROJ_A64FXd   | ||||
| #define YP_PROJ                        YP_PROJ_A64FXd   | ||||
| @@ -70,6 +71,7 @@ Author: Nils Meyer <nils.meyer@ur.de> | ||||
| #define MAYBEPERM(Dir,perm)            if (Dir != 3) { if (perm) { PERMUTE; } }   | ||||
| // DECLARATIONS | ||||
| #define DECLARATIONS_A64FXd  \ | ||||
|     uint64_t baseU; \ | ||||
|     const uint64_t lut[4][8] = { \ | ||||
|         {4, 5, 6, 7, 0, 1, 2, 3}, \ | ||||
|         {2, 3, 0, 1, 6, 7, 4, 5}, \ | ||||
| @@ -126,18 +128,18 @@ Author: Nils Meyer <nils.meyer@ur.de> | ||||
| // RESULT | ||||
| #define RESULT_A64FXd(base)  \ | ||||
| { \ | ||||
|     svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64), result_00);  \ | ||||
|     svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64), result_01);  \ | ||||
|     svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64), result_02);  \ | ||||
|     svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64), result_10);  \ | ||||
|     svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64), result_11);  \ | ||||
|     svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64), result_12);  \ | ||||
|     svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64), result_20);  \ | ||||
|     svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64), result_21);  \ | ||||
|     svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64), result_22);  \ | ||||
|     svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64), result_30);  \ | ||||
|     svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64), result_31);  \ | ||||
|     svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64), result_32);  \ | ||||
|     svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-6), result_00);  \ | ||||
|     svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-5), result_01);  \ | ||||
|     svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-4), result_02);  \ | ||||
|     svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-3), result_10);  \ | ||||
|     svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-2), result_11);  \ | ||||
|     svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-1), result_12);  \ | ||||
|     svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(0), result_20);  \ | ||||
|     svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(1), result_21);  \ | ||||
|     svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(2), result_22);  \ | ||||
|     svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(3), result_30);  \ | ||||
|     svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(4), result_31);  \ | ||||
|     svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(5), result_32);  \ | ||||
| } | ||||
| // PREFETCH_CHIMU_L2 (prefetch to L2) | ||||
| #define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base)  \ | ||||
| @@ -156,7 +158,7 @@ Author: Nils Meyer <nils.meyer@ur.de> | ||||
| // PREFETCH_GAUGE_L2 (prefetch to L2) | ||||
| #define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A)  \ | ||||
| { \ | ||||
|     const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ | ||||
|     const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ | ||||
|     svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ | ||||
|     svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ | ||||
|     svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ | ||||
| @@ -170,7 +172,7 @@ Author: Nils Meyer <nils.meyer@ur.de> | ||||
| // PREFETCH_GAUGE_L1 (prefetch to L1) | ||||
| #define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)  \ | ||||
| { \ | ||||
|     const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ | ||||
|     const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ | ||||
|     svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ | ||||
|     svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ | ||||
|     svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ | ||||
| @@ -178,62 +180,62 @@ Author: Nils Meyer <nils.meyer@ur.de> | ||||
| // LOAD_CHI | ||||
| #define LOAD_CHI_A64FXd(base)  \ | ||||
| { \ | ||||
|     Chi_00 = svld1(pg1, (float64_t*)(base + 0 * 64));  \ | ||||
|     Chi_01 = svld1(pg1, (float64_t*)(base + 1 * 64));  \ | ||||
|     Chi_02 = svld1(pg1, (float64_t*)(base + 2 * 64));  \ | ||||
|     Chi_10 = svld1(pg1, (float64_t*)(base + 3 * 64));  \ | ||||
|     Chi_11 = svld1(pg1, (float64_t*)(base + 4 * 64));  \ | ||||
|     Chi_12 = svld1(pg1, (float64_t*)(base + 5 * 64));  \ | ||||
|     Chi_00 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(0));  \ | ||||
|     Chi_01 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(1));  \ | ||||
|     Chi_02 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(2));  \ | ||||
|     Chi_10 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(3));  \ | ||||
|     Chi_11 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(4));  \ | ||||
|     Chi_12 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(5));  \ | ||||
| } | ||||
| // LOAD_CHIMU | ||||
| #define LOAD_CHIMU_INTERLEAVED_A64FXd(base)  \ | ||||
| { \ | ||||
|     Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64));  \ | ||||
|     Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64));  \ | ||||
|     Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64));  \ | ||||
|     Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64));  \ | ||||
|     Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64));  \ | ||||
|     Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64));  \ | ||||
|     Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64));  \ | ||||
|     Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64));  \ | ||||
|     Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64));  \ | ||||
|     Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64));  \ | ||||
|     Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64));  \ | ||||
|     Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64));  \ | ||||
|     Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6));  \ | ||||
|     Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3));  \ | ||||
|     Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3));  \ | ||||
|     Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0));  \ | ||||
|     Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5));  \ | ||||
|     Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4));  \ | ||||
|     Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2));  \ | ||||
|     Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1));  \ | ||||
|     Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4));  \ | ||||
|     Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5));  \ | ||||
|     Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1));  \ | ||||
|     Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2));  \ | ||||
| } | ||||
| // LOAD_CHIMU_0213 | ||||
| #define LOAD_CHIMU_0213_A64FXd  \ | ||||
| { \ | ||||
|     const SiteSpinor & ref(in[offset]); \ | ||||
|     Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64));  \ | ||||
|     Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64));  \ | ||||
|     Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64));  \ | ||||
|     Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64));  \ | ||||
|     Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64));  \ | ||||
|     Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64));  \ | ||||
|     Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64));  \ | ||||
|     Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64));  \ | ||||
|     Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64));  \ | ||||
|     Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64));  \ | ||||
|     Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64));  \ | ||||
|     Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64));  \ | ||||
|     Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6));  \ | ||||
|     Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0));  \ | ||||
|     Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5));  \ | ||||
|     Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1));  \ | ||||
|     Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4));  \ | ||||
|     Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2));  \ | ||||
|     Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3));  \ | ||||
|     Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3));  \ | ||||
|     Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2));  \ | ||||
|     Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4));  \ | ||||
|     Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1));  \ | ||||
|     Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5));  \ | ||||
| } | ||||
| // LOAD_CHIMU_0312 | ||||
| #define LOAD_CHIMU_0312_A64FXd  \ | ||||
| { \ | ||||
|     const SiteSpinor & ref(in[offset]); \ | ||||
|     Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64));  \ | ||||
|     Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64));  \ | ||||
|     Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64));  \ | ||||
|     Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64));  \ | ||||
|     Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64));  \ | ||||
|     Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64));  \ | ||||
|     Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64));  \ | ||||
|     Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64));  \ | ||||
|     Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64));  \ | ||||
|     Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64));  \ | ||||
|     Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64));  \ | ||||
|     Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64));  \ | ||||
|     Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6));  \ | ||||
|     Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3));  \ | ||||
|     Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5));  \ | ||||
|     Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4));  \ | ||||
|     Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4));  \ | ||||
|     Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5));  \ | ||||
|     Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3));  \ | ||||
|     Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0));  \ | ||||
|     Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2));  \ | ||||
|     Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1));  \ | ||||
|     Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1));  \ | ||||
|     Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2));  \ | ||||
| } | ||||
| // LOAD_TABLE0 | ||||
| #define LOAD_TABLE0  \ | ||||
| @@ -261,26 +263,26 @@ Author: Nils Meyer <nils.meyer@ur.de> | ||||
|     Chi_12 = svtbl(Chi_12, table0);     | ||||
|  | ||||
| // LOAD_GAUGE | ||||
| #define LOAD_GAUGE  \ | ||||
|     const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ | ||||
| #define LOAD_GAUGE(A)  \ | ||||
| { \ | ||||
|     U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64));  \ | ||||
|     U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64));  \ | ||||
|     U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64));  \ | ||||
|     U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64));  \ | ||||
|     U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64));  \ | ||||
|     U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64));  \ | ||||
|     const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ | ||||
|     U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-6));  \ | ||||
|     U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-3));  \ | ||||
|     U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(0));  \ | ||||
|     U_01 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-5));  \ | ||||
|     U_11 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-2));  \ | ||||
|     U_21 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(1));  \ | ||||
| } | ||||
| // MULT_2SPIN | ||||
| #define MULT_2SPIN_1_A64FXd(A)  \ | ||||
| { \ | ||||
|     const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ | ||||
|     U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64));  \ | ||||
|     U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64));  \ | ||||
|     U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64));  \ | ||||
|     U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64));  \ | ||||
|     U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64));  \ | ||||
|     U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64));  \ | ||||
|     const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ | ||||
|     U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-6));  \ | ||||
|     U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-3));  \ | ||||
|     U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(0));  \ | ||||
|     U_01 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-5));  \ | ||||
|     U_11 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-2));  \ | ||||
|     U_21 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(1));  \ | ||||
|     UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \ | ||||
|     UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \ | ||||
|     UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \ | ||||
| @@ -293,9 +295,9 @@ Author: Nils Meyer <nils.meyer@ur.de> | ||||
|     UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ | ||||
|     UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ | ||||
|     UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ | ||||
|     U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -4 * 64));  \ | ||||
|     U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -1 * 64));  \ | ||||
|     U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 2 * 64));  \ | ||||
|     U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-4));  \ | ||||
|     U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-1));  \ | ||||
|     U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(2));  \ | ||||
| } | ||||
| // MULT_2SPIN_BACKEND | ||||
| #define MULT_2SPIN_2_A64FXd  \ | ||||
| @@ -570,12 +572,12 @@ Author: Nils Meyer <nils.meyer@ur.de> | ||||
|     result_31 = svdup_f64(0.); \ | ||||
|     result_32 = svdup_f64(0.);  | ||||
|  | ||||
| // PREFETCH_RESULT_L2_STORE (prefetch store to L2) | ||||
| // PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) | ||||
| #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base)  \ | ||||
| { \ | ||||
|     svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \ | ||||
|     svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \ | ||||
|     svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \ | ||||
|     asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 0) : "memory" ); \ | ||||
|     asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 1) : "memory" ); \ | ||||
|     asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 2) : "memory" ); \ | ||||
| } | ||||
| // PREFETCH_RESULT_L1_STORE (prefetch store to L1) | ||||
| #define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base)  \ | ||||
|   | ||||
| @@ -38,10 +38,11 @@ Author: Nils Meyer <nils.meyer@ur.de> | ||||
| #define LOCK_GAUGE(A)   | ||||
| #define UNLOCK_GAUGE(A)   | ||||
| #define MASK_REGS                      DECLARATIONS_A64FXf   | ||||
| #define SAVE_RESULT(A,B)               RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B)   | ||||
| #define SAVE_RESULT(A,B)               RESULT_A64FXf(A);   | ||||
| #define MULT_2SPIN_1(Dir)              MULT_2SPIN_1_A64FXf(Dir)   | ||||
| #define MULT_2SPIN_2                   MULT_2SPIN_2_A64FXf   | ||||
| #define LOAD_CHI(base)                 LOAD_CHI_A64FXf(base)   | ||||
| #define ZERO_PSI                       ZERO_PSI_A64FXf   | ||||
| #define ADD_RESULT(base,basep)         LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)   | ||||
| #define XP_PROJ                        XP_PROJ_A64FXf   | ||||
| #define YP_PROJ                        YP_PROJ_A64FXf   | ||||
| @@ -70,6 +71,7 @@ Author: Nils Meyer <nils.meyer@ur.de> | ||||
| #define MAYBEPERM(A,perm)              if (perm) { PERMUTE; }   | ||||
| // DECLARATIONS | ||||
| #define DECLARATIONS_A64FXf  \ | ||||
|     uint64_t baseU; \ | ||||
|     const uint32_t lut[4][16] = { \ | ||||
|         {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ | ||||
|         {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ | ||||
| @@ -126,18 +128,18 @@ Author: Nils Meyer <nils.meyer@ur.de> | ||||
| // RESULT | ||||
| #define RESULT_A64FXf(base)  \ | ||||
| { \ | ||||
|     svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64), result_00);  \ | ||||
|     svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64), result_01);  \ | ||||
|     svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64), result_02);  \ | ||||
|     svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64), result_10);  \ | ||||
|     svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64), result_11);  \ | ||||
|     svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64), result_12);  \ | ||||
|     svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64), result_20);  \ | ||||
|     svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64), result_21);  \ | ||||
|     svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64), result_22);  \ | ||||
|     svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64), result_30);  \ | ||||
|     svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64), result_31);  \ | ||||
|     svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64), result_32);  \ | ||||
|     svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-6), result_00);  \ | ||||
|     svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-5), result_01);  \ | ||||
|     svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-4), result_02);  \ | ||||
|     svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-3), result_10);  \ | ||||
|     svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-2), result_11);  \ | ||||
|     svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-1), result_12);  \ | ||||
|     svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(0), result_20);  \ | ||||
|     svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(1), result_21);  \ | ||||
|     svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(2), result_22);  \ | ||||
|     svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(3), result_30);  \ | ||||
|     svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(4), result_31);  \ | ||||
|     svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(5), result_32);  \ | ||||
| } | ||||
| // PREFETCH_CHIMU_L2 (prefetch to L2) | ||||
| #define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base)  \ | ||||
| @@ -156,7 +158,7 @@ Author: Nils Meyer <nils.meyer@ur.de> | ||||
| // PREFETCH_GAUGE_L2 (prefetch to L2) | ||||
| #define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A)  \ | ||||
| { \ | ||||
|     const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ | ||||
|     const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ | ||||
|     svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ | ||||
|     svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ | ||||
|     svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ | ||||
| @@ -170,7 +172,7 @@ Author: Nils Meyer <nils.meyer@ur.de> | ||||
| // PREFETCH_GAUGE_L1 (prefetch to L1) | ||||
| #define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)  \ | ||||
| { \ | ||||
|     const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ | ||||
|     const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ | ||||
|     svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ | ||||
|     svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ | ||||
|     svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ | ||||
| @@ -178,62 +180,62 @@ Author: Nils Meyer <nils.meyer@ur.de> | ||||
| // LOAD_CHI | ||||
| #define LOAD_CHI_A64FXf(base)  \ | ||||
| { \ | ||||
|     Chi_00 = svld1(pg1, (float32_t*)(base + 0 * 64));  \ | ||||
|     Chi_01 = svld1(pg1, (float32_t*)(base + 1 * 64));  \ | ||||
|     Chi_02 = svld1(pg1, (float32_t*)(base + 2 * 64));  \ | ||||
|     Chi_10 = svld1(pg1, (float32_t*)(base + 3 * 64));  \ | ||||
|     Chi_11 = svld1(pg1, (float32_t*)(base + 4 * 64));  \ | ||||
|     Chi_12 = svld1(pg1, (float32_t*)(base + 5 * 64));  \ | ||||
|     Chi_00 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(0));  \ | ||||
|     Chi_01 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(1));  \ | ||||
|     Chi_02 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(2));  \ | ||||
|     Chi_10 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(3));  \ | ||||
|     Chi_11 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(4));  \ | ||||
|     Chi_12 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(5));  \ | ||||
| } | ||||
| // LOAD_CHIMU | ||||
| #define LOAD_CHIMU_INTERLEAVED_A64FXf(base)  \ | ||||
| { \ | ||||
|     Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64));  \ | ||||
|     Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64));  \ | ||||
|     Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64));  \ | ||||
|     Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64));  \ | ||||
|     Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64));  \ | ||||
|     Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64));  \ | ||||
|     Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64));  \ | ||||
|     Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64));  \ | ||||
|     Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64));  \ | ||||
|     Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64));  \ | ||||
|     Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64));  \ | ||||
|     Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64));  \ | ||||
|     Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6));  \ | ||||
|     Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3));  \ | ||||
|     Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3));  \ | ||||
|     Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0));  \ | ||||
|     Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5));  \ | ||||
|     Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4));  \ | ||||
|     Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2));  \ | ||||
|     Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1));  \ | ||||
|     Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4));  \ | ||||
|     Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5));  \ | ||||
|     Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1));  \ | ||||
|     Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2));  \ | ||||
| } | ||||
| // LOAD_CHIMU_0213 | ||||
| #define LOAD_CHIMU_0213_A64FXf  \ | ||||
| { \ | ||||
|     const SiteSpinor & ref(in[offset]); \ | ||||
|     Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64));  \ | ||||
|     Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64));  \ | ||||
|     Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64));  \ | ||||
|     Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64));  \ | ||||
|     Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64));  \ | ||||
|     Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64));  \ | ||||
|     Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64));  \ | ||||
|     Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64));  \ | ||||
|     Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64));  \ | ||||
|     Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64));  \ | ||||
|     Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64));  \ | ||||
|     Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64));  \ | ||||
|     Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6));  \ | ||||
|     Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0));  \ | ||||
|     Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5));  \ | ||||
|     Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1));  \ | ||||
|     Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4));  \ | ||||
|     Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2));  \ | ||||
|     Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3));  \ | ||||
|     Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3));  \ | ||||
|     Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2));  \ | ||||
|     Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4));  \ | ||||
|     Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1));  \ | ||||
|     Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5));  \ | ||||
| } | ||||
| // LOAD_CHIMU_0312 | ||||
| #define LOAD_CHIMU_0312_A64FXf  \ | ||||
| { \ | ||||
|     const SiteSpinor & ref(in[offset]); \ | ||||
|     Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64));  \ | ||||
|     Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64));  \ | ||||
|     Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64));  \ | ||||
|     Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64));  \ | ||||
|     Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64));  \ | ||||
|     Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64));  \ | ||||
|     Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64));  \ | ||||
|     Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64));  \ | ||||
|     Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64));  \ | ||||
|     Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64));  \ | ||||
|     Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64));  \ | ||||
|     Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64));  \ | ||||
|     Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6));  \ | ||||
|     Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3));  \ | ||||
|     Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5));  \ | ||||
|     Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4));  \ | ||||
|     Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4));  \ | ||||
|     Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5));  \ | ||||
|     Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3));  \ | ||||
|     Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0));  \ | ||||
|     Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2));  \ | ||||
|     Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1));  \ | ||||
|     Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1));  \ | ||||
|     Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2));  \ | ||||
| } | ||||
| // LOAD_TABLE0 | ||||
| #define LOAD_TABLE0  \ | ||||
| @@ -261,26 +263,26 @@ Author: Nils Meyer <nils.meyer@ur.de> | ||||
|     Chi_12 = svtbl(Chi_12, table0);     | ||||
|  | ||||
| // LOAD_GAUGE | ||||
| #define LOAD_GAUGE  \ | ||||
|     const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ | ||||
| #define LOAD_GAUGE(A)  \ | ||||
| { \ | ||||
|     U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64));  \ | ||||
|     U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64));  \ | ||||
|     U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64));  \ | ||||
|     U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64));  \ | ||||
|     U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64));  \ | ||||
|     U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64));  \ | ||||
|     const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ | ||||
|     U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-6));  \ | ||||
|     U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-3));  \ | ||||
|     U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(0));  \ | ||||
|     U_01 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-5));  \ | ||||
|     U_11 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-2));  \ | ||||
|     U_21 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(1));  \ | ||||
| } | ||||
| // MULT_2SPIN | ||||
| #define MULT_2SPIN_1_A64FXf(A)  \ | ||||
| { \ | ||||
|     const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ | ||||
|     U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64));  \ | ||||
|     U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64));  \ | ||||
|     U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64));  \ | ||||
|     U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64));  \ | ||||
|     U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64));  \ | ||||
|     U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64));  \ | ||||
|     const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ | ||||
|     U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-6));  \ | ||||
|     U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-3));  \ | ||||
|     U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(0));  \ | ||||
|     U_01 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-5));  \ | ||||
|     U_11 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-2));  \ | ||||
|     U_21 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(1));  \ | ||||
|     UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \ | ||||
|     UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \ | ||||
|     UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \ | ||||
| @@ -293,9 +295,9 @@ Author: Nils Meyer <nils.meyer@ur.de> | ||||
|     UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ | ||||
|     UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ | ||||
|     UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ | ||||
|     U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -4 * 64));  \ | ||||
|     U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -1 * 64));  \ | ||||
|     U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 2 * 64));  \ | ||||
|     U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-4));  \ | ||||
|     U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-1));  \ | ||||
|     U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(2));  \ | ||||
| } | ||||
| // MULT_2SPIN_BACKEND | ||||
| #define MULT_2SPIN_2_A64FXf  \ | ||||
| @@ -570,12 +572,12 @@ Author: Nils Meyer <nils.meyer@ur.de> | ||||
|     result_31 = svdup_f32(0.); \ | ||||
|     result_32 = svdup_f32(0.);  | ||||
|  | ||||
| // PREFETCH_RESULT_L2_STORE (prefetch store to L2) | ||||
| // PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) | ||||
| #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base)  \ | ||||
| { \ | ||||
|     svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \ | ||||
|     svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \ | ||||
|     svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \ | ||||
|     asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 0) : "memory" ); \ | ||||
|     asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 1) : "memory" ); \ | ||||
|     asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 2) : "memory" ); \ | ||||
| } | ||||
| // PREFETCH_RESULT_L1_STORE (prefetch store to L1) | ||||
| #define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base)  \ | ||||
|   | ||||
| @@ -46,6 +46,7 @@ Author: Nils Meyer <nils.meyer@ur.de> | ||||
| #undef MULT_2SPIN_2 | ||||
| #undef MAYBEPERM | ||||
| #undef LOAD_CHI | ||||
| #undef ZERO_PSI | ||||
| #undef XP_PROJ | ||||
| #undef YP_PROJ | ||||
| #undef ZP_PROJ | ||||
|   | ||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
		Reference in New Issue
	
	Block a user