mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
addressing
This commit is contained in:
parent
326de36467
commit
b140c6a4f9
@ -283,7 +283,7 @@ asm ( \
|
||||
// MULT_2SPIN
|
||||
#define MULT_2SPIN_A64FXd(A) \
|
||||
{ \
|
||||
const auto & ref(U[sU](A)); \
|
||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||
asm ( \
|
||||
"ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
|
||||
"ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
|
||||
@ -337,7 +337,7 @@ asm ( \
|
||||
"fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \
|
||||
"fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \
|
||||
: \
|
||||
: [fetchptr] "r" ((uint64_t)&ref[2][0]) \
|
||||
: [fetchptr] "r" (base + 2 * 3 * 64) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
); \
|
||||
}
|
||||
|
@ -295,7 +295,7 @@ asm ( \
|
||||
// MULT_2SPIN
|
||||
#define MULT_2SPIN_A64FXf(A) \
|
||||
{ \
|
||||
const auto & ref(U[sU](A)); \
|
||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||
asm ( \
|
||||
"ld1d { z24.f }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
|
||||
"ld1d { z25.f }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
|
||||
@ -349,7 +349,7 @@ asm ( \
|
||||
"fcmla z20.f, p5/m, z26.f, z14.f, 90 \n\t" \
|
||||
"fcmla z23.f, p5/m, z26.f, z17.f, 90 \n\t" \
|
||||
: \
|
||||
: [fetchptr] "r" ((uint64_t)&ref[2][0]) \
|
||||
: [fetchptr] "r" (base + 2 * 3 * 64) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
); \
|
||||
}
|
||||
|
@ -261,13 +261,13 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
// MULT_2SPIN
|
||||
#define MULT_2SPIN_A64FXd(A) \
|
||||
{ \
|
||||
const auto & ref(U[sU](A)); \
|
||||
U_00 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -6 * 64)); \
|
||||
U_10 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -3 * 64)); \
|
||||
U_20 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + 0 * 64)); \
|
||||
U_01 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -5 * 64)); \
|
||||
U_11 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -2 * 64)); \
|
||||
U_21 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + 1 * 64)); \
|
||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||
U_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \
|
||||
U_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \
|
||||
U_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \
|
||||
U_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \
|
||||
U_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \
|
||||
U_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \
|
||||
UChi_00 = __svzero(UChi_00); \
|
||||
UChi_10 = __svzero(UChi_10); \
|
||||
UChi_01 = __svzero(UChi_01); \
|
||||
@ -286,9 +286,9 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \
|
||||
UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \
|
||||
UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \
|
||||
U_00 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -4 * 64)); \
|
||||
U_10 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -1 * 64)); \
|
||||
U_20 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + 2 * 64)); \
|
||||
U_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \
|
||||
U_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \
|
||||
U_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \
|
||||
UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \
|
||||
UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \
|
||||
UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \
|
||||
|
@ -268,13 +268,13 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
// MULT_2SPIN
|
||||
#define MULT_2SPIN_A64FXf(A) \
|
||||
{ \
|
||||
const auto & ref(U[sU](A)); \
|
||||
U_00 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + -6 * 64)); \
|
||||
U_10 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + -3 * 64)); \
|
||||
U_20 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + 0 * 64)); \
|
||||
U_01 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + -5 * 64)); \
|
||||
U_11 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + -2 * 64)); \
|
||||
U_21 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + 1 * 64)); \
|
||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||
U_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \
|
||||
U_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \
|
||||
U_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \
|
||||
U_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \
|
||||
U_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \
|
||||
U_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \
|
||||
UChi_00 = __svzero(UChi_00); \
|
||||
UChi_10 = __svzero(UChi_10); \
|
||||
UChi_01 = __svzero(UChi_01); \
|
||||
@ -293,9 +293,9 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \
|
||||
UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \
|
||||
UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \
|
||||
U_00 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + -4 * 64)); \
|
||||
U_10 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + -1 * 64)); \
|
||||
U_20 = svld1(pg1, (float32_t*)((uint64_t)&ref[2][0] + 2 * 64)); \
|
||||
U_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \
|
||||
U_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \
|
||||
U_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \
|
||||
UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \
|
||||
UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \
|
||||
UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \
|
||||
|
Loading…
Reference in New Issue
Block a user