diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h index 0a65294d..bd9ebe5d 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_double.h +++ b/Grid/simd/Fujitsu_A64FX_asm_double.h @@ -337,7 +337,7 @@ asm ( \ "fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \ "fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \ : \ - : [fetchptr] "r" (base + 2 * 3 * 64) \ + : [fetchptr] "r" (baseU + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ ); \ } diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h index 7e58a9d3..2ece4299 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ b/Grid/simd/Fujitsu_A64FX_asm_single.h @@ -349,7 +349,7 @@ asm ( \ "fcmla z20.f, p5/m, z26.f, z14.f, 90 \n\t" \ "fcmla z23.f, p5/m, z26.f, z17.f, 90 \n\t" \ : \ - : [fetchptr] "r" (base + 2 * 3 * 64) \ + : [fetchptr] "r" (baseU + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ ); \ } diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h index 9cf33c23..4b85563c 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_double.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -262,12 +262,12 @@ Author: Nils Meyer #define MULT_2SPIN_A64FXd(A) \ { \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ - U_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - U_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - U_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - U_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - U_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - U_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ + U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ + U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ + U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ + U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ + U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ + U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ UChi_00 = __svzero(UChi_00); \ UChi_10 = __svzero(UChi_10); \ UChi_01 = __svzero(UChi_01); \ @@ -286,9 +286,9 @@ Author: Nils Meyer UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ - U_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - U_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - U_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \ + U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \ + U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \ UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \ UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \ UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h index 2728f507..7f8132e8 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_single.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -269,12 +269,12 @@ Author: Nils Meyer #define MULT_2SPIN_A64FXf(A) \ { \ const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ - U_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - U_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - U_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - U_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - U_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - U_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ + U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ + U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ + U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ + U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ + U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ + U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ UChi_00 = __svzero(UChi_00); \ UChi_10 = __svzero(UChi_10); \ UChi_01 = __svzero(UChi_01); \ @@ -293,9 +293,9 @@ Author: Nils Meyer UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ - U_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - U_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - U_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \ + U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \ + U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \ UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \ UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \ UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \