mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00
nmeyer-ur 2020-04-09 16:32:31 +02:00
parent 304762e7ac
commit bd310932f7
11 changed files with 84 additions and 2694 deletions


@@ -1,713 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: XXX
Copyright (C) 2020
Author: Nils Meyer <nils.meyer@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#define LOAD_CHIMU_A64FXd(x) LOAD_CHIMU_INTERLEAVED_A64FXd(x)
#define PREFETCH_CHIMU_L1(A)
#define PREFETCH_GAUGE_L1(A)
#define PREFETCH_CHIMU_L2(A)
#define PREFETCH_GAUGE_L2(A)
#define PF_GAUGE(A)
#define PREFETCH1_CHIMU(A)
#define PREFETCH_CHIMU(A)
#define LOCK_GAUGE(A)
#define UNLOCK_GAUGE(A)
#define MASK_REGS DECLARATIONS_A64FXd
#define COMPLEX_SIGNS(A)
#define LOAD64(A,B)
#define SAVE_RESULT(A,B) RESULT_A64FXd(A)
#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXd(A)
#define MAYBEPERM(A,perm) if (perm) { A ; }
#define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
#define ZERO_PSI ZERO_PSI_A64FXd
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd
#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd
#define TP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TP_PROJ_A64FXd
#define XM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XM_PROJ_A64FXd
#define YM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YM_PROJ_A64FXd
#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZM_PROJ_A64FXd
#define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd
#define XP_RECON XP_RECON_A64FXd
#define XM_RECON XM_RECON_A64FXd
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd
#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd
#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd
#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd
#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXd
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd
#define PERMUTE_DIR0 PERM0_A64FXd
#define PERMUTE_DIR1 PERM1_A64FXd
#define PERMUTE_DIR2 PERM2_A64FXd
#define PERMUTE_DIR3 PERM3_A64FXd
// DECLARATIONS
#define DECLARATIONS_A64FXd \
const uint64_t lut[4][8] = { \
{4, 5, 6, 7, 0, 1, 2, 3}, \
{2, 3, 0, 1, 6, 7, 4, 5}, \
{1, 0, 3, 2, 5, 4, 7, 6}, \
{0, 1, 2, 4, 5, 6, 7, 8} };\
asm ( \
"fmov z31.d , 0 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// RESULT
#define RESULT_A64FXd(base) \
{ \
asm ( \
"stnt1d { z0.d }, p5, [%[storeptr], -6, mul vl] \n\t" \
"stnt1d { z1.d }, p5, [%[storeptr], -5, mul vl] \n\t" \
"stnt1d { z2.d }, p5, [%[storeptr], -4, mul vl] \n\t" \
"stnt1d { z3.d }, p5, [%[storeptr], -3, mul vl] \n\t" \
"stnt1d { z4.d }, p5, [%[storeptr], -2, mul vl] \n\t" \
"stnt1d { z5.d }, p5, [%[storeptr], -1, mul vl] \n\t" \
"stnt1d { z6.d }, p5, [%[storeptr], 0, mul vl] \n\t" \
"stnt1d { z7.d }, p5, [%[storeptr], 1, mul vl] \n\t" \
"stnt1d { z8.d }, p5, [%[storeptr], 2, mul vl] \n\t" \
"stnt1d { z9.d }, p5, [%[storeptr], 3, mul vl] \n\t" \
"stnt1d { z10.d }, p5, [%[storeptr], 4, mul vl] \n\t" \
"stnt1d { z11.d }, p5, [%[storeptr], 5, mul vl] \n\t" \
: \
: [storeptr] "r" (base + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PREFETCH_CHIMU_L2 (prefetch to L2)
#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \
{ \
asm ( \
"prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
: \
: [fetchptr] "r" (base) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PREFETCH_CHIMU_L1 (prefetch to L1)
#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \
{ \
asm ( \
"prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
"prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
"prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
: \
: [fetchptr] "r" (base) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PREFETCH_GAUGE_L2 (prefetch to L2)
#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \
{ \
const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
asm ( \
"prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
: \
: [fetchptr] "r" (baseU) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PREFETCH_GAUGE_L1 (prefetch to L1)
#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \
{ \
const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
asm ( \
"prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
"prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
"prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
: \
: [fetchptr] "r" (baseU) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_CHI
#define LOAD_CHI_A64FXd(base) \
{ \
asm ( \
"ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
"ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
"ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \
"ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
"ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
"ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \
: \
: [fetchptr] "r" (base + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_CHIMU
#define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \
{ \
asm ( \
"ptrue p5.d \n\t" \
"ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
"ld1d { z27.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \
"ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
"ld1d { z24.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \
"ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
"ld1d { z28.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \
"ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
"ld1d { z25.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \
"ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \
"ld1d { z29.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \
"ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \
"ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \
: \
: [fetchptr] "r" (base + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_CHIMU_0213
#define LOAD_CHIMU_0213_A64FXd \
{ \
const SiteSpinor & ref(in[offset]); \
asm ( \
"ptrue p5.d \n\t" \
"ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
"ld1d { z24.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \
"ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
"ld1d { z25.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \
"ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \
"ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \
"ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
"ld1d { z27.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \
"ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
"ld1d { z28.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \
"ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \
"ld1d { z29.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \
: \
: [fetchptr] "r" (&ref[2][0]) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_CHIMU_0312
#define LOAD_CHIMU_0312_A64FXd \
{ \
const SiteSpinor & ref(in[offset]); \
asm ( \
"ptrue p5.d \n\t" \
"ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
"ld1d { z27.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \
"ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
"ld1d { z28.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \
"ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \
"ld1d { z29.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \
"ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
"ld1d { z24.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \
"ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
"ld1d { z25.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \
"ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \
"ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \
: \
: [fetchptr] "r" (&ref[2][0]) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PERM0
#define PERM0_A64FXd \
asm ( \
"ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"tbl z12.d, { z12.d }, z30.d \n\t" \
"tbl z13.d, { z13.d }, z30.d \n\t" \
"tbl z14.d, { z14.d }, z30.d \n\t" \
"tbl z15.d, { z15.d }, z30.d \n\t" \
"tbl z16.d, { z16.d }, z30.d \n\t" \
"tbl z17.d, { z17.d }, z30.d \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (0) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM1
#define PERM1_A64FXd \
asm ( \
"ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"tbl z12.d, { z12.d }, z30.d \n\t" \
"tbl z13.d, { z13.d }, z30.d \n\t" \
"tbl z14.d, { z14.d }, z30.d \n\t" \
"tbl z15.d, { z15.d }, z30.d \n\t" \
"tbl z16.d, { z16.d }, z30.d \n\t" \
"tbl z17.d, { z17.d }, z30.d \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (1) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM2
#define PERM2_A64FXd \
asm ( \
"ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"tbl z12.d, { z12.d }, z30.d \n\t" \
"tbl z13.d, { z13.d }, z30.d \n\t" \
"tbl z14.d, { z14.d }, z30.d \n\t" \
"tbl z15.d, { z15.d }, z30.d \n\t" \
"tbl z16.d, { z16.d }, z30.d \n\t" \
"tbl z17.d, { z17.d }, z30.d \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (2) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM3
#define PERM3_A64FXd
// MULT_2SPIN
#define MULT_2SPIN_A64FXd(A) \
{ \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
asm ( \
"ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
"ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
"ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \
"ld1d { z27.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
"ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
"ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \
"fmov z18.d , 0 \n\t" \
"fmov z21.d , 0 \n\t" \
"fmov z19.d , 0 \n\t" \
"fmov z22.d , 0 \n\t" \
"fmov z20.d , 0 \n\t" \
"fmov z23.d , 0 \n\t" \
"fcmla z18.d, p5/m, z24.d, z12.d, 0 \n\t" \
"fcmla z21.d, p5/m, z24.d, z15.d, 0 \n\t" \
"fcmla z19.d, p5/m, z25.d, z12.d, 0 \n\t" \
"fcmla z22.d, p5/m, z25.d, z15.d, 0 \n\t" \
"fcmla z20.d, p5/m, z26.d, z12.d, 0 \n\t" \
"fcmla z23.d, p5/m, z26.d, z15.d, 0 \n\t" \
"fcmla z18.d, p5/m, z24.d, z12.d, 90 \n\t" \
"fcmla z21.d, p5/m, z24.d, z15.d, 90 \n\t" \
"fcmla z19.d, p5/m, z25.d, z12.d, 90 \n\t" \
"fcmla z22.d, p5/m, z25.d, z15.d, 90 \n\t" \
"fcmla z20.d, p5/m, z26.d, z12.d, 90 \n\t" \
"fcmla z23.d, p5/m, z26.d, z15.d, 90 \n\t" \
"ld1d { z24.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \
"ld1d { z25.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \
"ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \
"fcmla z18.d, p5/m, z27.d, z13.d, 0 \n\t" \
"fcmla z21.d, p5/m, z27.d, z16.d, 0 \n\t" \
"fcmla z19.d, p5/m, z28.d, z13.d, 0 \n\t" \
"fcmla z22.d, p5/m, z28.d, z16.d, 0 \n\t" \
"fcmla z20.d, p5/m, z29.d, z13.d, 0 \n\t" \
"fcmla z23.d, p5/m, z29.d, z16.d, 0 \n\t" \
"fcmla z18.d, p5/m, z27.d, z13.d, 90 \n\t" \
"fcmla z21.d, p5/m, z27.d, z16.d, 90 \n\t" \
"fcmla z19.d, p5/m, z28.d, z13.d, 90 \n\t" \
"fcmla z22.d, p5/m, z28.d, z16.d, 90 \n\t" \
"fcmla z20.d, p5/m, z29.d, z13.d, 90 \n\t" \
"fcmla z23.d, p5/m, z29.d, z16.d, 90 \n\t" \
"fcmla z18.d, p5/m, z24.d, z14.d, 0 \n\t" \
"fcmla z21.d, p5/m, z24.d, z17.d, 0 \n\t" \
"fcmla z19.d, p5/m, z25.d, z14.d, 0 \n\t" \
"fcmla z22.d, p5/m, z25.d, z17.d, 0 \n\t" \
"fcmla z20.d, p5/m, z26.d, z14.d, 0 \n\t" \
"fcmla z23.d, p5/m, z26.d, z17.d, 0 \n\t" \
"fcmla z18.d, p5/m, z24.d, z14.d, 90 \n\t" \
"fcmla z21.d, p5/m, z24.d, z17.d, 90 \n\t" \
"fcmla z19.d, p5/m, z25.d, z14.d, 90 \n\t" \
"fcmla z22.d, p5/m, z25.d, z17.d, 90 \n\t" \
"fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \
"fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \
: \
: [fetchptr] "r" (baseU + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// XP_PROJ
#define XP_PROJ_A64FXd \
{ \
asm ( \
"fcadd z12.d, p5/m, z12.d, z27.d, 90 \n\t" \
"fcadd z13.d, p5/m, z13.d, z28.d, 90 \n\t" \
"fcadd z14.d, p5/m, z14.d, z29.d, 90 \n\t" \
"fcadd z15.d, p5/m, z15.d, z24.d, 90 \n\t" \
"fcadd z16.d, p5/m, z16.d, z25.d, 90 \n\t" \
"fcadd z17.d, p5/m, z17.d, z26.d, 90 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XP_RECON
#define XP_RECON_A64FXd \
asm ( \
"fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \
"fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \
"fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \
"fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \
"fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \
"fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \
"mov z0.d, z18.d \n\t" \
"mov z1.d, z19.d \n\t" \
"mov z2.d, z20.d \n\t" \
"mov z3.d, z21.d \n\t" \
"mov z4.d, z22.d \n\t" \
"mov z5.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// XP_RECON_ACCUM
#define XP_RECON_ACCUM_A64FXd \
asm ( \
"fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
"fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
"fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
"fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
"fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
"fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// YP_PROJ
#define YP_PROJ_A64FXd \
{ \
asm ( \
"ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"fsub z12.d, p5/m, z12.d, z27.d \n\t" \
"fsub z13.d, p5/m, z13.d, z28.d \n\t" \
"fsub z14.d, p5/m, z14.d, z29.d \n\t" \
"fadd z15.d, p5/m, z15.d, z24.d \n\t" \
"fadd z16.d, p5/m, z16.d, z25.d \n\t" \
"fadd z17.d, p5/m, z17.d, z26.d \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (2) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// ZP_PROJ
#define ZP_PROJ_A64FXd \
{ \
asm ( \
"ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"fcadd z12.d, p5/m, z12.d, z24.d, 90 \n\t" \
"fcadd z13.d, p5/m, z13.d, z25.d, 90 \n\t" \
"fcadd z14.d, p5/m, z14.d, z26.d, 90 \n\t" \
"fcadd z15.d, p5/m, z15.d, z27.d, 270 \n\t" \
"fcadd z16.d, p5/m, z16.d, z28.d, 270 \n\t" \
"fcadd z17.d, p5/m, z17.d, z29.d, 270 \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (1) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// TP_PROJ
#define TP_PROJ_A64FXd \
{ \
asm ( \
"ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"fadd z12.d, p5/m, z12.d, z24.d \n\t" \
"fadd z13.d, p5/m, z13.d, z25.d \n\t" \
"fadd z14.d, p5/m, z14.d, z26.d \n\t" \
"fadd z15.d, p5/m, z15.d, z27.d \n\t" \
"fadd z16.d, p5/m, z16.d, z28.d \n\t" \
"fadd z17.d, p5/m, z17.d, z29.d \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (0) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XM_PROJ
#define XM_PROJ_A64FXd \
{ \
asm ( \
"fcadd z12.d, p5/m, z12.d, z27.d, 270 \n\t" \
"fcadd z13.d, p5/m, z13.d, z28.d, 270 \n\t" \
"fcadd z14.d, p5/m, z14.d, z29.d, 270 \n\t" \
"fcadd z15.d, p5/m, z15.d, z24.d, 270 \n\t" \
"fcadd z16.d, p5/m, z16.d, z25.d, 270 \n\t" \
"fcadd z17.d, p5/m, z17.d, z26.d, 270 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XM_RECON
#define XM_RECON_A64FXd \
asm ( \
"fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \
"fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \
"fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \
"fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \
"fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \
"fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \
"mov z0.d, z18.d \n\t" \
"mov z1.d, z19.d \n\t" \
"mov z2.d, z20.d \n\t" \
"mov z3.d, z21.d \n\t" \
"mov z4.d, z22.d \n\t" \
"mov z5.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// YM_PROJ
#define YM_PROJ_A64FXd \
{ \
asm ( \
"ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"fadd z12.d, p5/m, z12.d, z27.d \n\t" \
"fadd z13.d, p5/m, z13.d, z28.d \n\t" \
"fadd z14.d, p5/m, z14.d, z29.d \n\t" \
"fsub z15.d, p5/m, z15.d, z24.d \n\t" \
"fsub z16.d, p5/m, z16.d, z25.d \n\t" \
"fsub z17.d, p5/m, z17.d, z26.d \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (2) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// ZM_PROJ
#define ZM_PROJ_A64FXd \
{ \
asm ( \
"ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"fcadd z12.d, p5/m, z12.d, z24.d, 270 \n\t" \
"fcadd z13.d, p5/m, z13.d, z25.d, 270 \n\t" \
"fcadd z14.d, p5/m, z14.d, z26.d, 270 \n\t" \
"fcadd z15.d, p5/m, z15.d, z27.d, 90 \n\t" \
"fcadd z16.d, p5/m, z16.d, z28.d, 90 \n\t" \
"fcadd z17.d, p5/m, z17.d, z29.d, 90 \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (1) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// TM_PROJ
#define TM_PROJ_A64FXd \
{ \
asm ( \
"ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"fsub z12.d, p5/m, z12.d, z24.d \n\t" \
"fsub z13.d, p5/m, z13.d, z25.d \n\t" \
"fsub z14.d, p5/m, z14.d, z26.d \n\t" \
"fsub z15.d, p5/m, z15.d, z27.d \n\t" \
"fsub z16.d, p5/m, z16.d, z28.d \n\t" \
"fsub z17.d, p5/m, z17.d, z29.d \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (0) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XM_RECON_ACCUM
#define XM_RECON_ACCUM_A64FXd \
asm ( \
"fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \
"fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \
"fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \
"fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \
"fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \
"fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \
"mov z0.d, z18.d \n\t" \
"mov z1.d, z19.d \n\t" \
"mov z2.d, z20.d \n\t" \
"mov z3.d, z21.d \n\t" \
"mov z4.d, z22.d \n\t" \
"mov z5.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// YP_RECON_ACCUM
#define YP_RECON_ACCUM_A64FXd \
asm ( \
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
"fsub z9.d, p5/m, z9.d, z18.d \n\t" \
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
"fsub z10.d, p5/m, z10.d, z19.d \n\t" \
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
"fsub z11.d, p5/m, z11.d, z20.d \n\t" \
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
"fadd z6.d, p5/m, z6.d, z21.d \n\t" \
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
"fadd z7.d, p5/m, z7.d, z22.d \n\t" \
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
"fadd z8.d, p5/m, z8.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// YM_RECON_ACCUM
#define YM_RECON_ACCUM_A64FXd \
asm ( \
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
"fadd z9.d, p5/m, z9.d, z18.d \n\t" \
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
"fadd z10.d, p5/m, z10.d, z19.d \n\t" \
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
"fadd z11.d, p5/m, z11.d, z20.d \n\t" \
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
"fsub z6.d, p5/m, z6.d, z21.d \n\t" \
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
"fsub z7.d, p5/m, z7.d, z22.d \n\t" \
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
"fsub z8.d, p5/m, z8.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// ZP_RECON_ACCUM
#define ZP_RECON_ACCUM_A64FXd \
asm ( \
"fcadd z6.d, p5/m, z6.d, z18.d, 270 \n\t" \
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
"fcadd z7.d, p5/m, z7.d, z19.d, 270 \n\t" \
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
"fcadd z8.d, p5/m, z8.d, z20.d, 270 \n\t" \
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
"fcadd z9.d, p5/m, z9.d, z21.d, 90 \n\t" \
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
"fcadd z10.d, p5/m, z10.d, z22.d, 90 \n\t" \
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
"fcadd z11.d, p5/m, z11.d, z23.d, 90 \n\t" \
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// ZM_RECON_ACCUM
#define ZM_RECON_ACCUM_A64FXd \
asm ( \
"fcadd z6.d, p5/m, z6.d, z18.d, 90 \n\t" \
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
"fcadd z7.d, p5/m, z7.d, z19.d, 90 \n\t" \
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
"fcadd z8.d, p5/m, z8.d, z20.d, 90 \n\t" \
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
"fcadd z9.d, p5/m, z9.d, z21.d, 270 \n\t" \
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
"fcadd z10.d, p5/m, z10.d, z22.d, 270 \n\t" \
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
"fcadd z11.d, p5/m, z11.d, z23.d, 270 \n\t" \
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// TP_RECON_ACCUM
#define TP_RECON_ACCUM_A64FXd \
asm ( \
"ptrue p5.d \n\t" \
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
"fadd z6.d, p5/m, z6.d, z18.d \n\t" \
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
"fadd z7.d, p5/m, z7.d, z19.d \n\t" \
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
"fadd z8.d, p5/m, z8.d, z20.d \n\t" \
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
"fadd z9.d, p5/m, z9.d, z21.d \n\t" \
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
"fadd z10.d, p5/m, z10.d, z22.d \n\t" \
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
"fadd z11.d, p5/m, z11.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// TM_RECON_ACCUM
#define TM_RECON_ACCUM_A64FXd \
asm ( \
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
"fsub z6.d, p5/m, z6.d, z18.d \n\t" \
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
"fsub z7.d, p5/m, z7.d, z19.d \n\t" \
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
"fsub z8.d, p5/m, z8.d, z20.d \n\t" \
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
"fsub z9.d, p5/m, z9.d, z21.d \n\t" \
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
"fsub z10.d, p5/m, z10.d, z22.d \n\t" \
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
"fsub z11.d, p5/m, z11.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// ZERO_PSI
#define ZERO_PSI_A64FXd \
asm ( \
"ptrue p5.d \n\t" \
"fmov z0.d , 0 \n\t" \
"fmov z1.d , 0 \n\t" \
"fmov z2.d , 0 \n\t" \
"fmov z3.d , 0 \n\t" \
"fmov z4.d , 0 \n\t" \
"fmov z5.d , 0 \n\t" \
"fmov z6.d , 0 \n\t" \
"fmov z7.d , 0 \n\t" \
"fmov z8.d , 0 \n\t" \
"fmov z9.d , 0 \n\t" \
"fmov z10.d , 0 \n\t" \
"fmov z11.d , 0 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// ADD_RESULT_INTERNAL
#define ADD_RESULT_INTERNAL_A64FXd \
asm ( \
"fadd z0.d, p5/m, z0.d, z12.d \n\t" \
"fadd z1.d, p5/m, z1.d, z13.d \n\t" \
"fadd z2.d, p5/m, z2.d, z14.d \n\t" \
"fadd z3.d, p5/m, z3.d, z15.d \n\t" \
"fadd z4.d, p5/m, z4.d, z16.d \n\t" \
"fadd z5.d, p5/m, z5.d, z17.d \n\t" \
"fadd z6.d, p5/m, z6.d, z24.d \n\t" \
"fadd z7.d, p5/m, z7.d, z25.d \n\t" \
"fadd z8.d, p5/m, z8.d, z26.d \n\t" \
"fadd z9.d, p5/m, z9.d, z27.d \n\t" \
"fadd z10.d, p5/m, z10.d, z28.d \n\t" \
"fadd z11.d, p5/m, z11.d, z29.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);


@@ -1,727 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: XXX
Copyright (C) 2020
Author: Nils Meyer <nils.meyer@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#define LOAD_CHIMU_A64FXf(x) LOAD_CHIMU_INTERLEAVED_A64FXf(x)
#define PREFETCH_CHIMU_L1(A)
#define PREFETCH_GAUGE_L1(A)
#define PREFETCH_CHIMU_L2(A)
#define PREFETCH_GAUGE_L2(A)
#define PF_GAUGE(A)
#define PREFETCH1_CHIMU(A)
#define PREFETCH_CHIMU(A)
#define LOCK_GAUGE(A)
#define UNLOCK_GAUGE(A)
#define MASK_REGS DECLARATIONS_A64FXf
#define COMPLEX_SIGNS(A)
#define LOAD64(A,B)
#define SAVE_RESULT(A,B) RESULT_A64FXf(A)
#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXf(A)
#define MAYBEPERM(A,perm) if (perm) { A ; }
#define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
#define ZERO_PSI ZERO_PSI_A64FXf
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf
#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf
#define TP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TP_PROJ_A64FXf
#define XM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XM_PROJ_A64FXf
#define YM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YM_PROJ_A64FXf
#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZM_PROJ_A64FXf
#define TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf
#define XP_RECON XP_RECON_A64FXf
#define XM_RECON XM_RECON_A64FXf
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf
#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf
#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf
#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf
#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXf
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf
#define PERMUTE_DIR0 PERM0_A64FXf
#define PERMUTE_DIR1 PERM1_A64FXf
#define PERMUTE_DIR2 PERM2_A64FXf
#define PERMUTE_DIR3 PERM3_A64FXf
// DECLARATIONS
#define DECLARATIONS_A64FXf \
const uint32_t lut[4][16] = { \
{8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \
{4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \
{2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \
{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \
asm ( \
"fmov z31.f , 0 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// RESULT
#define RESULT_A64FXf(base) \
{ \
asm ( \
"stnt1d { z0.f }, p5, [%[storeptr], -6, mul vl] \n\t" \
"stnt1d { z1.f }, p5, [%[storeptr], -5, mul vl] \n\t" \
"stnt1d { z2.f }, p5, [%[storeptr], -4, mul vl] \n\t" \
"stnt1d { z3.f }, p5, [%[storeptr], -3, mul vl] \n\t" \
"stnt1d { z4.f }, p5, [%[storeptr], -2, mul vl] \n\t" \
"stnt1d { z5.f }, p5, [%[storeptr], -1, mul vl] \n\t" \
"stnt1d { z6.f }, p5, [%[storeptr], 0, mul vl] \n\t" \
"stnt1d { z7.f }, p5, [%[storeptr], 1, mul vl] \n\t" \
"stnt1d { z8.f }, p5, [%[storeptr], 2, mul vl] \n\t" \
"stnt1d { z9.f }, p5, [%[storeptr], 3, mul vl] \n\t" \
"stnt1d { z10.f }, p5, [%[storeptr], 4, mul vl] \n\t" \
"stnt1d { z11.f }, p5, [%[storeptr], 5, mul vl] \n\t" \
: \
: [storeptr] "r" (base + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PREFETCH_CHIMU_L2 (prefetch to L2)
#define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \
{ \
asm ( \
"prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
: \
: [fetchptr] "r" (base) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PREFETCH_CHIMU_L1 (prefetch to L1)
#define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \
{ \
asm ( \
"prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
"prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
"prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
: \
: [fetchptr] "r" (base) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PREFETCH_GAUGE_L2 (prefetch to L2)
#define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \
{ \
const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
asm ( \
"prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
: \
: [fetchptr] "r" (baseU) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PREFETCH_GAUGE_L1 (prefetch to L1)
#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \
{ \
const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
asm ( \
"prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
"prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
"prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
: \
: [fetchptr] "r" (baseU) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_CHI
#define LOAD_CHI_A64FXf(base) \
{ \
asm ( \
"ld1d { z12.f }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
"ld1d { z13.f }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
"ld1d { z14.f }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \
"ld1d { z15.f }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
"ld1d { z16.f }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
"ld1d { z17.f }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \
: \
: [fetchptr] "r" (base + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_CHIMU
#define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \
{ \
asm ( \
"ptrue p5.f \n\t" \
"ld1d { z12.f }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
"ld1d { z27.f }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \
"ld1d { z15.f }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
"ld1d { z24.f }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \
"ld1d { z13.f }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
"ld1d { z28.f }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \
"ld1d { z16.f }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
"ld1d { z25.f }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \
"ld1d { z14.f }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \
"ld1d { z29.f }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \
"ld1d { z17.f }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \
"ld1d { z26.f }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \
: \
: [fetchptr] "r" (base + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_CHIMU_0213
#define LOAD_CHIMU_0213_A64FXf \
{ \
const SiteSpinor & ref(in[offset]); \
asm ( \
"ptrue p5.f \n\t" \
"ld1d { z12.f }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
"ld1d { z24.f }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \
"ld1d { z13.f }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
"ld1d { z25.f }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \
"ld1d { z14.f }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \
"ld1d { z26.f }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \
"ld1d { z15.f }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
"ld1d { z27.f }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \
"ld1d { z16.f }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
"ld1d { z28.f }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \
"ld1d { z17.f }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \
"ld1d { z29.f }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \
: \
: [fetchptr] "r" (&ref[2][0]) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_CHIMU_0312
#define LOAD_CHIMU_0312_A64FXf \
{ \
const SiteSpinor & ref(in[offset]); \
asm ( \
"ptrue p5.f \n\t" \
"ld1d { z12.f }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
"ld1d { z27.f }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \
"ld1d { z13.f }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
"ld1d { z28.f }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \
"ld1d { z14.f }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \
"ld1d { z29.f }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \
"ld1d { z15.f }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
"ld1d { z24.f }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \
"ld1d { z16.f }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
"ld1d { z25.f }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \
"ld1d { z17.f }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \
"ld1d { z26.f }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \
: \
: [fetchptr] "r" (&ref[2][0]) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PERM0
#define PERM0_A64FXf \
asm ( \
"ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"tbl z12.f, { z12.f }, z30.f \n\t" \
"tbl z13.f, { z13.f }, z30.f \n\t" \
"tbl z14.f, { z14.f }, z30.f \n\t" \
"tbl z15.f, { z15.f }, z30.f \n\t" \
"tbl z16.f, { z16.f }, z30.f \n\t" \
"tbl z17.f, { z17.f }, z30.f \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (0) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM1
#define PERM1_A64FXf \
asm ( \
"ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"tbl z12.f, { z12.f }, z30.f \n\t" \
"tbl z13.f, { z13.f }, z30.f \n\t" \
"tbl z14.f, { z14.f }, z30.f \n\t" \
"tbl z15.f, { z15.f }, z30.f \n\t" \
"tbl z16.f, { z16.f }, z30.f \n\t" \
"tbl z17.f, { z17.f }, z30.f \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (1) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM2
#define PERM2_A64FXf \
asm ( \
"ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"tbl z12.f, { z12.f }, z30.f \n\t" \
"tbl z13.f, { z13.f }, z30.f \n\t" \
"tbl z14.f, { z14.f }, z30.f \n\t" \
"tbl z15.f, { z15.f }, z30.f \n\t" \
"tbl z16.f, { z16.f }, z30.f \n\t" \
"tbl z17.f, { z17.f }, z30.f \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (2) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM3
#define PERM3_A64FXf \
asm ( \
"ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"tbl z12.f, { z12.f }, z30.f \n\t" \
"tbl z13.f, { z13.f }, z30.f \n\t" \
"tbl z14.f, { z14.f }, z30.f \n\t" \
"tbl z15.f, { z15.f }, z30.f \n\t" \
"tbl z16.f, { z16.f }, z30.f \n\t" \
"tbl z17.f, { z17.f }, z30.f \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (3) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// MULT_2SPIN
#define MULT_2SPIN_A64FXf(A) \
{ \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
asm ( \
"ld1d { z24.f }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
"ld1d { z25.f }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
"ld1d { z26.f }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \
"ld1d { z27.f }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
"ld1d { z28.f }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
"ld1d { z29.f }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \
"fmov z18.f , 0 \n\t" \
"fmov z21.f , 0 \n\t" \
"fmov z19.f , 0 \n\t" \
"fmov z22.f , 0 \n\t" \
"fmov z20.f , 0 \n\t" \
"fmov z23.f , 0 \n\t" \
"fcmla z18.f, p5/m, z24.f, z12.f, 0 \n\t" \
"fcmla z21.f, p5/m, z24.f, z15.f, 0 \n\t" \
"fcmla z19.f, p5/m, z25.f, z12.f, 0 \n\t" \
"fcmla z22.f, p5/m, z25.f, z15.f, 0 \n\t" \
"fcmla z20.f, p5/m, z26.f, z12.f, 0 \n\t" \
"fcmla z23.f, p5/m, z26.f, z15.f, 0 \n\t" \
"fcmla z18.f, p5/m, z24.f, z12.f, 90 \n\t" \
"fcmla z21.f, p5/m, z24.f, z15.f, 90 \n\t" \
"fcmla z19.f, p5/m, z25.f, z12.f, 90 \n\t" \
"fcmla z22.f, p5/m, z25.f, z15.f, 90 \n\t" \
"fcmla z20.f, p5/m, z26.f, z12.f, 90 \n\t" \
"fcmla z23.f, p5/m, z26.f, z15.f, 90 \n\t" \
"ld1d { z24.f }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \
"ld1d { z25.f }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \
"ld1d { z26.f }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \
"fcmla z18.f, p5/m, z27.f, z13.f, 0 \n\t" \
"fcmla z21.f, p5/m, z27.f, z16.f, 0 \n\t" \
"fcmla z19.f, p5/m, z28.f, z13.f, 0 \n\t" \
"fcmla z22.f, p5/m, z28.f, z16.f, 0 \n\t" \
"fcmla z20.f, p5/m, z29.f, z13.f, 0 \n\t" \
"fcmla z23.f, p5/m, z29.f, z16.f, 0 \n\t" \
"fcmla z18.f, p5/m, z27.f, z13.f, 90 \n\t" \
"fcmla z21.f, p5/m, z27.f, z16.f, 90 \n\t" \
"fcmla z19.f, p5/m, z28.f, z13.f, 90 \n\t" \
"fcmla z22.f, p5/m, z28.f, z16.f, 90 \n\t" \
"fcmla z20.f, p5/m, z29.f, z13.f, 90 \n\t" \
"fcmla z23.f, p5/m, z29.f, z16.f, 90 \n\t" \
"fcmla z18.f, p5/m, z24.f, z14.f, 0 \n\t" \
"fcmla z21.f, p5/m, z24.f, z17.f, 0 \n\t" \
"fcmla z19.f, p5/m, z25.f, z14.f, 0 \n\t" \
"fcmla z22.f, p5/m, z25.f, z17.f, 0 \n\t" \
"fcmla z20.f, p5/m, z26.f, z14.f, 0 \n\t" \
"fcmla z23.f, p5/m, z26.f, z17.f, 0 \n\t" \
"fcmla z18.f, p5/m, z24.f, z14.f, 90 \n\t" \
"fcmla z21.f, p5/m, z24.f, z17.f, 90 \n\t" \
"fcmla z19.f, p5/m, z25.f, z14.f, 90 \n\t" \
"fcmla z22.f, p5/m, z25.f, z17.f, 90 \n\t" \
"fcmla z20.f, p5/m, z26.f, z14.f, 90 \n\t" \
"fcmla z23.f, p5/m, z26.f, z17.f, 90 \n\t" \
: \
: [fetchptr] "r" (baseU + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// XP_PROJ
#define XP_PROJ_A64FXf \
{ \
asm ( \
"ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"fcadd z12.f, p5/m, z12.f, z27.f, 90 \n\t" \
"fcadd z13.f, p5/m, z13.f, z28.f, 90 \n\t" \
"fcadd z14.f, p5/m, z14.f, z29.f, 90 \n\t" \
"fcadd z15.f, p5/m, z15.f, z24.f, 90 \n\t" \
"fcadd z16.f, p5/m, z16.f, z25.f, 90 \n\t" \
"fcadd z17.f, p5/m, z17.f, z26.f, 90 \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (3) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XP_RECON
#define XP_RECON_A64FXf \
asm ( \
"fcadd z6.f, p5/m, z6.f, z21.f, 270 \n\t" \
"fcadd z7.f, p5/m, z7.f, z22.f, 270 \n\t" \
"fcadd z8.f, p5/m, z8.f, z23.f, 270 \n\t" \
"fcadd z9.f, p5/m, z9.f, z18.f, 270 \n\t" \
"fcadd z10.f, p5/m, z10.f, z19.f, 270 \n\t" \
"fcadd z11.f, p5/m, z11.f, z20.f, 270 \n\t" \
"mov z0.f, z18.f \n\t" \
"mov z1.f, z19.f \n\t" \
"mov z2.f, z20.f \n\t" \
"mov z3.f, z21.f \n\t" \
"mov z4.f, z22.f \n\t" \
"mov z5.f, z23.f \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// XP_RECON_ACCUM
#define XP_RECON_ACCUM_A64FXf \
asm ( \
"fcadd z9.f, p5/m, z9.f, z18.f, 270 \n\t" \
"fadd z0.f, p5/m, z0.f, z18.f \n\t" \
"fcadd z10.f, p5/m, z10.f, z19.f, 270 \n\t" \
"fadd z1.f, p5/m, z1.f, z19.f \n\t" \
"fcadd z11.f, p5/m, z11.f, z20.f, 270 \n\t" \
"fadd z2.f, p5/m, z2.f, z20.f \n\t" \
"fcadd z6.f, p5/m, z6.f, z21.f, 270 \n\t" \
"fadd z3.f, p5/m, z3.f, z21.f \n\t" \
"fcadd z7.f, p5/m, z7.f, z22.f, 270 \n\t" \
"fadd z4.f, p5/m, z4.f, z22.f \n\t" \
"fcadd z8.f, p5/m, z8.f, z23.f, 270 \n\t" \
"fadd z5.f, p5/m, z5.f, z23.f \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// YP_PROJ
#define YP_PROJ_A64FXf \
{ \
asm ( \
"ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"fsub z12.f, p5/m, z12.f, z27.f \n\t" \
"fsub z13.f, p5/m, z13.f, z28.f \n\t" \
"fsub z14.f, p5/m, z14.f, z29.f \n\t" \
"fadd z15.f, p5/m, z15.f, z24.f \n\t" \
"fadd z16.f, p5/m, z16.f, z25.f \n\t" \
"fadd z17.f, p5/m, z17.f, z26.f \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (2) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// ZP_PROJ
#define ZP_PROJ_A64FXf \
{ \
asm ( \
"ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"fcadd z12.f, p5/m, z12.f, z24.f, 90 \n\t" \
"fcadd z13.f, p5/m, z13.f, z25.f, 90 \n\t" \
"fcadd z14.f, p5/m, z14.f, z26.f, 90 \n\t" \
"fcadd z15.f, p5/m, z15.f, z27.f, 270 \n\t" \
"fcadd z16.f, p5/m, z16.f, z28.f, 270 \n\t" \
"fcadd z17.f, p5/m, z17.f, z29.f, 270 \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (1) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// TP_PROJ
#define TP_PROJ_A64FXf \
{ \
asm ( \
"ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"fadd z12.f, p5/m, z12.f, z24.f \n\t" \
"fadd z13.f, p5/m, z13.f, z25.f \n\t" \
"fadd z14.f, p5/m, z14.f, z26.f \n\t" \
"fadd z15.f, p5/m, z15.f, z27.f \n\t" \
"fadd z16.f, p5/m, z16.f, z28.f \n\t" \
"fadd z17.f, p5/m, z17.f, z29.f \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (0) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XM_PROJ
#define XM_PROJ_A64FXf \
{ \
asm ( \
"ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"fcadd z12.f, p5/m, z12.f, z27.f, 270 \n\t" \
"fcadd z13.f, p5/m, z13.f, z28.f, 270 \n\t" \
"fcadd z14.f, p5/m, z14.f, z29.f, 270 \n\t" \
"fcadd z15.f, p5/m, z15.f, z24.f, 270 \n\t" \
"fcadd z16.f, p5/m, z16.f, z25.f, 270 \n\t" \
"fcadd z17.f, p5/m, z17.f, z26.f, 270 \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (3) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XM_RECON
#define XM_RECON_A64FXf \
asm ( \
"fcadd z6.f, p5/m, z6.f, z21.f, 90 \n\t" \
"fcadd z7.f, p5/m, z7.f, z22.f, 90 \n\t" \
"fcadd z8.f, p5/m, z8.f, z23.f, 90 \n\t" \
"fcadd z9.f, p5/m, z9.f, z18.f, 90 \n\t" \
"fcadd z10.f, p5/m, z10.f, z19.f, 90 \n\t" \
"fcadd z11.f, p5/m, z11.f, z20.f, 90 \n\t" \
"mov z0.f, z18.f \n\t" \
"mov z1.f, z19.f \n\t" \
"mov z2.f, z20.f \n\t" \
"mov z3.f, z21.f \n\t" \
"mov z4.f, z22.f \n\t" \
"mov z5.f, z23.f \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// YM_PROJ
#define YM_PROJ_A64FXf \
{ \
asm ( \
"ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"fadd z12.f, p5/m, z12.f, z27.f \n\t" \
"fadd z13.f, p5/m, z13.f, z28.f \n\t" \
"fadd z14.f, p5/m, z14.f, z29.f \n\t" \
"fsub z15.f, p5/m, z15.f, z24.f \n\t" \
"fsub z16.f, p5/m, z16.f, z25.f \n\t" \
"fsub z17.f, p5/m, z17.f, z26.f \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (2) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// ZM_PROJ
#define ZM_PROJ_A64FXf \
{ \
asm ( \
"ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"fcadd z12.f, p5/m, z12.f, z24.f, 270 \n\t" \
"fcadd z13.f, p5/m, z13.f, z25.f, 270 \n\t" \
"fcadd z14.f, p5/m, z14.f, z26.f, 270 \n\t" \
"fcadd z15.f, p5/m, z15.f, z27.f, 90 \n\t" \
"fcadd z16.f, p5/m, z16.f, z28.f, 90 \n\t" \
"fcadd z17.f, p5/m, z17.f, z29.f, 90 \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (1) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// TM_PROJ
#define TM_PROJ_A64FXf \
{ \
asm ( \
"ld1d { z30.f }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"fsub z12.f, p5/m, z12.f, z24.f \n\t" \
"fsub z13.f, p5/m, z13.f, z25.f \n\t" \
"fsub z14.f, p5/m, z14.f, z26.f \n\t" \
"fsub z15.f, p5/m, z15.f, z27.f \n\t" \
"fsub z16.f, p5/m, z16.f, z28.f \n\t" \
"fsub z17.f, p5/m, z17.f, z29.f \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (0) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XM_RECON_ACCUM
#define XM_RECON_ACCUM_A64FXf \
asm ( \
"fcadd z9.f, p5/m, z9.f, z18.f, 90 \n\t" \
"fcadd z10.f, p5/m, z10.f, z19.f, 90 \n\t" \
"fcadd z11.f, p5/m, z11.f, z20.f, 90 \n\t" \
"fcadd z6.f, p5/m, z6.f, z21.f, 90 \n\t" \
"fcadd z7.f, p5/m, z7.f, z22.f, 90 \n\t" \
"fcadd z8.f, p5/m, z8.f, z23.f, 90 \n\t" \
"mov z0.f, z18.f \n\t" \
"mov z1.f, z19.f \n\t" \
"mov z2.f, z20.f \n\t" \
"mov z3.f, z21.f \n\t" \
"mov z4.f, z22.f \n\t" \
"mov z5.f, z23.f \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// YP_RECON_ACCUM
#define YP_RECON_ACCUM_A64FXf \
asm ( \
"fadd z0.f, p5/m, z0.f, z18.f \n\t" \
"fsub z9.f, p5/m, z9.f, z18.f \n\t" \
"fadd z1.f, p5/m, z1.f, z19.f \n\t" \
"fsub z10.f, p5/m, z10.f, z19.f \n\t" \
"fadd z2.f, p5/m, z2.f, z20.f \n\t" \
"fsub z11.f, p5/m, z11.f, z20.f \n\t" \
"fadd z3.f, p5/m, z3.f, z21.f \n\t" \
"fadd z6.f, p5/m, z6.f, z21.f \n\t" \
"fadd z4.f, p5/m, z4.f, z22.f \n\t" \
"fadd z7.f, p5/m, z7.f, z22.f \n\t" \
"fadd z5.f, p5/m, z5.f, z23.f \n\t" \
"fadd z8.f, p5/m, z8.f, z23.f \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// YM_RECON_ACCUM
#define YM_RECON_ACCUM_A64FXf \
asm ( \
"fadd z0.f, p5/m, z0.f, z18.f \n\t" \
"fadd z9.f, p5/m, z9.f, z18.f \n\t" \
"fadd z1.f, p5/m, z1.f, z19.f \n\t" \
"fadd z10.f, p5/m, z10.f, z19.f \n\t" \
"fadd z2.f, p5/m, z2.f, z20.f \n\t" \
"fadd z11.f, p5/m, z11.f, z20.f \n\t" \
"fadd z3.f, p5/m, z3.f, z21.f \n\t" \
"fsub z6.f, p5/m, z6.f, z21.f \n\t" \
"fadd z4.f, p5/m, z4.f, z22.f \n\t" \
"fsub z7.f, p5/m, z7.f, z22.f \n\t" \
"fadd z5.f, p5/m, z5.f, z23.f \n\t" \
"fsub z8.f, p5/m, z8.f, z23.f \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// ZP_RECON_ACCUM
#define ZP_RECON_ACCUM_A64FXf \
asm ( \
"fcadd z6.f, p5/m, z6.f, z18.f, 270 \n\t" \
"fadd z0.f, p5/m, z0.f, z18.f \n\t" \
"fcadd z7.f, p5/m, z7.f, z19.f, 270 \n\t" \
"fadd z1.f, p5/m, z1.f, z19.f \n\t" \
"fcadd z8.f, p5/m, z8.f, z20.f, 270 \n\t" \
"fadd z2.f, p5/m, z2.f, z20.f \n\t" \
"fcadd z9.f, p5/m, z9.f, z21.f, 90 \n\t" \
"fadd z3.f, p5/m, z3.f, z21.f \n\t" \
"fcadd z10.f, p5/m, z10.f, z22.f, 90 \n\t" \
"fadd z4.f, p5/m, z4.f, z22.f \n\t" \
"fcadd z11.f, p5/m, z11.f, z23.f, 90 \n\t" \
"fadd z5.f, p5/m, z5.f, z23.f \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// ZM_RECON_ACCUM
#define ZM_RECON_ACCUM_A64FXf \
asm ( \
"fcadd z6.f, p5/m, z6.f, z18.f, 90 \n\t" \
"fadd z0.f, p5/m, z0.f, z18.f \n\t" \
"fcadd z7.f, p5/m, z7.f, z19.f, 90 \n\t" \
"fadd z1.f, p5/m, z1.f, z19.f \n\t" \
"fcadd z8.f, p5/m, z8.f, z20.f, 90 \n\t" \
"fadd z2.f, p5/m, z2.f, z20.f \n\t" \
"fcadd z9.f, p5/m, z9.f, z21.f, 270 \n\t" \
"fadd z3.f, p5/m, z3.f, z21.f \n\t" \
"fcadd z10.f, p5/m, z10.f, z22.f, 270 \n\t" \
"fadd z4.f, p5/m, z4.f, z22.f \n\t" \
"fcadd z11.f, p5/m, z11.f, z23.f, 270 \n\t" \
"fadd z5.f, p5/m, z5.f, z23.f \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// TP_RECON_ACCUM
#define TP_RECON_ACCUM_A64FXf \
asm ( \
"ptrue p5.f \n\t" \
"fadd z0.f, p5/m, z0.f, z18.f \n\t" \
"fadd z6.f, p5/m, z6.f, z18.f \n\t" \
"fadd z1.f, p5/m, z1.f, z19.f \n\t" \
"fadd z7.f, p5/m, z7.f, z19.f \n\t" \
"fadd z2.f, p5/m, z2.f, z20.f \n\t" \
"fadd z8.f, p5/m, z8.f, z20.f \n\t" \
"fadd z3.f, p5/m, z3.f, z21.f \n\t" \
"fadd z9.f, p5/m, z9.f, z21.f \n\t" \
"fadd z4.f, p5/m, z4.f, z22.f \n\t" \
"fadd z10.f, p5/m, z10.f, z22.f \n\t" \
"fadd z5.f, p5/m, z5.f, z23.f \n\t" \
"fadd z11.f, p5/m, z11.f, z23.f \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// TM_RECON_ACCUM
#define TM_RECON_ACCUM_A64FXf \
asm ( \
"fadd z0.f, p5/m, z0.f, z18.f \n\t" \
"fsub z6.f, p5/m, z6.f, z18.f \n\t" \
"fadd z1.f, p5/m, z1.f, z19.f \n\t" \
"fsub z7.f, p5/m, z7.f, z19.f \n\t" \
"fadd z2.f, p5/m, z2.f, z20.f \n\t" \
"fsub z8.f, p5/m, z8.f, z20.f \n\t" \
"fadd z3.f, p5/m, z3.f, z21.f \n\t" \
"fsub z9.f, p5/m, z9.f, z21.f \n\t" \
"fadd z4.f, p5/m, z4.f, z22.f \n\t" \
"fsub z10.f, p5/m, z10.f, z22.f \n\t" \
"fadd z5.f, p5/m, z5.f, z23.f \n\t" \
"fsub z11.f, p5/m, z11.f, z23.f \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// ZERO_PSI
#define ZERO_PSI_A64FXf \
asm ( \
"ptrue p5.f \n\t" \
"fmov z0.f , 0 \n\t" \
"fmov z1.f , 0 \n\t" \
"fmov z2.f , 0 \n\t" \
"fmov z3.f , 0 \n\t" \
"fmov z4.f , 0 \n\t" \
"fmov z5.f , 0 \n\t" \
"fmov z6.f , 0 \n\t" \
"fmov z7.f , 0 \n\t" \
"fmov z8.f , 0 \n\t" \
"fmov z9.f , 0 \n\t" \
"fmov z10.f , 0 \n\t" \
"fmov z11.f , 0 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// ADD_RESULT_INTERNAL
#define ADD_RESULT_INTERNAL_A64FXf \
asm ( \
"fadd z0.f, p5/m, z0.f, z12.f \n\t" \
"fadd z1.f, p5/m, z1.f, z13.f \n\t" \
"fadd z2.f, p5/m, z2.f, z14.f \n\t" \
"fadd z3.f, p5/m, z3.f, z15.f \n\t" \
"fadd z4.f, p5/m, z4.f, z16.f \n\t" \
"fadd z5.f, p5/m, z5.f, z17.f \n\t" \
"fadd z6.f, p5/m, z6.f, z24.f \n\t" \
"fadd z7.f, p5/m, z7.f, z25.f \n\t" \
"fadd z8.f, p5/m, z8.f, z26.f \n\t" \
"fadd z9.f, p5/m, z9.f, z27.f \n\t" \
"fadd z10.f, p5/m, z10.f, z28.f \n\t" \
"fadd z11.f, p5/m, z11.f, z29.f \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
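The 90/270 immediates in the fcadd instructions above select +i and -i on interleaved (re,im) pairs. A minimal standalone sketch of that operation with ACLE intrinsics follows; it assumes an SVE-enabled compiler with <arm_sve.h>, and the function and variable names are illustrative only, not kernel code.
#include <arm_sve.h>
#include <stdint.h>
// out = a + i*b per interleaved (re,im) float pair when the rotation is 90 --
// the operation the *_PROJ / *_RECON fcadd lines apply to whole spinor rows;
// rotation 270 would give a - i*b. n must be even (whole complex pairs).
static inline void cadd_i_times(const float *a, const float *b, float *out, int64_t n)
{
    for (int64_t k = 0; k < n; k += (int64_t)svcntw()) {
        svbool_t pg = svwhilelt_b32(k, n);
        svfloat32_t va = svld1(pg, a + k);
        svfloat32_t vb = svld1(pg, b + k);
        svst1(pg, out + k, svcadd_x(pg, va, vb, 90));
    }
}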

View File

@ -1,584 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: XXX
Copyright (C) 2020
Author: Nils Meyer <nils.meyer@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#define LOAD_CHIMU_A64FXd(x) LOAD_CHIMU_INTERLEAVED_A64FXd(x)
#define PREFETCH_CHIMU_L1(A)
#define PREFETCH_GAUGE_L1(A)
#define PREFETCH_CHIMU_L2(A)
#define PREFETCH_GAUGE_L2(A)
#define PF_GAUGE(A)
#define PREFETCH1_CHIMU(A)
#define PREFETCH_CHIMU(A)
#define LOCK_GAUGE(A)
#define UNLOCK_GAUGE(A)
#define MASK_REGS DECLARATIONS_A64FXd
#define COMPLEX_SIGNS(A)
#define LOAD64(A,B)
#define SAVE_RESULT(A,B) RESULT_A64FXd(A)
#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXd(A)
#define MAYBEPERM(A,perm) if (perm) { A ; }
#define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
#define ZERO_PSI ZERO_PSI_A64FXd
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd
#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd
#define TP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TP_PROJ_A64FXd
#define XM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XM_PROJ_A64FXd
#define YM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YM_PROJ_A64FXd
#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZM_PROJ_A64FXd
#define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd
#define XP_RECON XP_RECON_A64FXd
#define XM_RECON XM_RECON_A64FXd
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd
#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd
#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd
#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd
#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXd
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd
#define PERMUTE_DIR0 PERM0_A64FXd
#define PERMUTE_DIR1 PERM1_A64FXd
#define PERMUTE_DIR2 PERM2_A64FXd
#define PERMUTE_DIR3 PERM3_A64FXd
// DECLARATIONS
#define DECLARATIONS_A64FXd \
const uint64_t lut[4][8] = { \
{4, 5, 6, 7, 0, 1, 2, 3}, \
{2, 3, 0, 1, 6, 7, 4, 5}, \
{1, 0, 3, 2, 5, 4, 7, 6}, \
{0, 1, 2, 3, 4, 5, 6, 7} };\
svfloat64_t result_00; \
svfloat64_t result_01; \
svfloat64_t result_02; \
svfloat64_t result_10; \
svfloat64_t result_11; \
svfloat64_t result_12; \
svfloat64_t result_20; \
svfloat64_t result_21; \
svfloat64_t result_22; \
svfloat64_t result_30; \
svfloat64_t result_31; \
svfloat64_t result_32; \
svfloat64_t Chi_00; \
svfloat64_t Chi_01; \
svfloat64_t Chi_02; \
svfloat64_t Chi_10; \
svfloat64_t Chi_11; \
svfloat64_t Chi_12; \
svfloat64_t UChi_00; \
svfloat64_t UChi_01; \
svfloat64_t UChi_02; \
svfloat64_t UChi_10; \
svfloat64_t UChi_11; \
svfloat64_t UChi_12; \
svfloat64_t U_00; \
svfloat64_t U_10; \
svfloat64_t U_20; \
svfloat64_t U_01; \
svfloat64_t U_11; \
svfloat64_t U_21; \
svbool_t pg1; \
pg1 = svptrue_b64(); \
svuint64_t table0; \
svfloat64_t zero0; \
zero0 = __svzero(zero0);
#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 U_00
#define Chimu_21 U_10
#define Chimu_22 U_20
#define Chimu_30 U_01
#define Chimu_31 U_11
#define Chimu_32 U_21
// RESULT
#define RESULT_A64FXd(base) \
{ \
svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \
svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \
svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \
svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \
svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \
svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \
svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \
svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \
svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \
svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \
svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \
svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \
}
// PREFETCH_CHIMU_L2 (prefetch to L2)
#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \
{ \
svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \
}
// PREFETCH_CHIMU_L1 (prefetch to L1)
#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \
{ \
svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \
svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \
svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \
}
// PREFETCH_GAUGE_L2 (prefetch to L2)
#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \
{ \
const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \
}
// PREFETCH_GAUGE_L1 (prefetch to L1)
#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \
{ \
const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \
svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \
svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \
}
// LOAD_CHI
#define LOAD_CHI_A64FXd(base) \
{ \
Chi_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \
Chi_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \
Chi_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \
Chi_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \
Chi_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \
Chi_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \
}
// LOAD_CHIMU
#define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \
{ \
Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \
Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \
Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \
Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \
Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \
Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \
Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \
Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \
Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \
Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \
Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \
Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \
}
// LOAD_CHIMU_0213
#define LOAD_CHIMU_0213_A64FXd \
{ \
const SiteSpinor & ref(in[offset]); uint64_t base = (uint64_t)&ref; \
Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \
Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \
Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \
Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \
Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \
Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \
Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \
Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \
Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \
Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \
Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \
Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \
}
// LOAD_CHIMU_0312
#define LOAD_CHIMU_0312_A64FXd \
{ \
const SiteSpinor & ref(in[offset]); uint64_t base = (uint64_t)&ref; \
Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \
Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \
Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \
Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \
Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \
Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \
Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \
Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \
Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \
Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \
Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \
Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \
}
// PERM0
#define PERM0_A64FXd \
table0 = svld1(pg1, (uint64_t*)&lut[0]); \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
Chi_10 = svtbl(Chi_10, table0); \
Chi_11 = svtbl(Chi_11, table0); \
Chi_12 = svtbl(Chi_12, table0);
// PERM1
#define PERM1_A64FXd \
table0 = svld1(pg1, (uint64_t*)&lut[1]); \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
Chi_10 = svtbl(Chi_10, table0); \
Chi_11 = svtbl(Chi_11, table0); \
Chi_12 = svtbl(Chi_12, table0);
// PERM2
#define PERM2_A64FXd \
table0 = svld1(pg1, (uint64_t*)&lut[2]); \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
Chi_10 = svtbl(Chi_10, table0); \
Chi_11 = svtbl(Chi_11, table0); \
Chi_12 = svtbl(Chi_12, table0);
// PERM3
#define PERM3_A64FXd
// MULT_2SPIN
#define MULT_2SPIN_A64FXd(A) \
{ \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \
U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \
U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \
U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \
U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \
U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
UChi_00 = __svzero(UChi_00); \
UChi_10 = __svzero(UChi_10); \
UChi_01 = __svzero(UChi_01); \
UChi_11 = __svzero(UChi_11); \
UChi_02 = __svzero(UChi_02); \
UChi_12 = __svzero(UChi_12); \
UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 0); \
UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 0); \
UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 0); \
UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 0); \
UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 0); \
UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 0); \
UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 90); \
UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 90); \
UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 90); \
UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \
UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \
UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \
U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \
U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \
U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \
UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \
UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \
UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \
UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 0); \
UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 0); \
UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 0); \
UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 90); \
UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 90); \
UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 90); \
UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 90); \
UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 90); \
UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 90); \
UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 0); \
UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 0); \
UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 0); \
UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 0); \
UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 0); \
UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 0); \
UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 90); \
UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 90); \
UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 90); \
UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 90); \
UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 90); \
UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 90); \
}
// XP_PROJ
#define XP_PROJ_A64FXd \
{ \
Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 90); \
Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 90); \
Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 90); \
Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 90); \
Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 90); \
Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 90); \
}
// XP_RECON
#define XP_RECON_A64FXd \
result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \
result_21 = svcadd_x(pg1, result_21, UChi_11, 270); \
result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \
result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \
result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \
result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \
result_00 = UChi_00; \
result_01 = UChi_01; \
result_02 = UChi_02; \
result_10 = UChi_10; \
result_11 = UChi_11; \
result_12 = UChi_12;
// XP_RECON_ACCUM
#define XP_RECON_ACCUM_A64FXd \
result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_21 = svcadd_x(pg1, result_21, UChi_11, 270); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \
result_12 = svadd_x(pg1, result_12, UChi_12);
// YP_PROJ
#define YP_PROJ_A64FXd \
{ \
table0 = svld1(pg1, (uint64_t*)&lut[2]); \
Chi_00 = svsub_x(pg1, Chimu_00, Chimu_30); \
Chi_01 = svsub_x(pg1, Chimu_01, Chimu_31); \
Chi_02 = svsub_x(pg1, Chimu_02, Chimu_32); \
Chi_10 = svadd_x(pg1, Chimu_10, Chimu_20); \
Chi_11 = svadd_x(pg1, Chimu_11, Chimu_21); \
Chi_12 = svadd_x(pg1, Chimu_12, Chimu_22); \
}
// ZP_PROJ
#define ZP_PROJ_A64FXd \
{ \
table0 = svld1(pg1, (uint64_t*)&lut[1]); \
Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 90); \
Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 90); \
Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 90); \
Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 270); \
Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 270); \
Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 270); \
}
// TP_PROJ
#define TP_PROJ_A64FXd \
{ \
table0 = svld1(pg1, (uint64_t*)&lut[0]); \
Chi_00 = svadd_x(pg1, Chimu_00, Chimu_20); \
Chi_01 = svadd_x(pg1, Chimu_01, Chimu_21); \
Chi_02 = svadd_x(pg1, Chimu_02, Chimu_22); \
Chi_10 = svadd_x(pg1, Chimu_10, Chimu_30); \
Chi_11 = svadd_x(pg1, Chimu_11, Chimu_31); \
Chi_12 = svadd_x(pg1, Chimu_12, Chimu_32); \
}
// XM_PROJ
#define XM_PROJ_A64FXd \
{ \
Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 270); \
Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 270); \
Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 270); \
Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 270); \
Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 270); \
Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 270); \
}
// XM_RECON
#define XM_RECON_A64FXd \
result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \
result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \
result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \
result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \
result_31 = svcadd_x(pg1, result_31, UChi_01, 90); \
result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \
result_00 = UChi_00; \
result_01 = UChi_01; \
result_02 = UChi_02; \
result_10 = UChi_10; \
result_11 = UChi_11; \
result_12 = UChi_12;
// YM_PROJ
#define YM_PROJ_A64FXd \
{ \
table0 = svld1(pg1, (uint64_t*)&lut[2]); \
Chi_00 = svadd_x(pg1, Chimu_00, Chimu_30); \
Chi_01 = svadd_x(pg1, Chimu_01, Chimu_31); \
Chi_02 = svadd_x(pg1, Chimu_02, Chimu_32); \
Chi_10 = svsub_x(pg1, Chimu_10, Chimu_20); \
Chi_11 = svsub_x(pg1, Chimu_11, Chimu_21); \
Chi_12 = svsub_x(pg1, Chimu_12, Chimu_22); \
}
// ZM_PROJ
#define ZM_PROJ_A64FXd \
{ \
table0 = svld1(pg1, (uint64_t*)&lut[1]); \
Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 270); \
Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 270); \
Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 270); \
Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 90); \
Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 90); \
Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 90); \
}
// TM_PROJ
#define TM_PROJ_A64FXd \
{ \
table0 = svld1(pg1, (uint64_t*)&lut[0]); \
Chi_00 = svsub_x(pg1, Chimu_00, Chimu_20); \
Chi_01 = svsub_x(pg1, Chimu_01, Chimu_21); \
Chi_02 = svsub_x(pg1, Chimu_02, Chimu_22); \
Chi_10 = svsub_x(pg1, Chimu_10, Chimu_30); \
Chi_11 = svsub_x(pg1, Chimu_11, Chimu_31); \
Chi_12 = svsub_x(pg1, Chimu_12, Chimu_32); \
}
// XM_RECON_ACCUM
#define XM_RECON_ACCUM_A64FXd \
result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \
result_31 = svcadd_x(pg1, result_31, UChi_01, 90); \
result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \
result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \
result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \
result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \
result_00 = UChi_00; \
result_01 = UChi_01; \
result_02 = UChi_02; \
result_10 = UChi_10; \
result_11 = UChi_11; \
result_12 = UChi_12;
// YP_RECON_ACCUM
#define YP_RECON_ACCUM_A64FXd \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_30 = svsub_x(pg1, result_30, UChi_00); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_31 = svsub_x(pg1, result_31, UChi_01); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_32 = svsub_x(pg1, result_32, UChi_02); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_20 = svadd_x(pg1, result_20, UChi_10); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_21 = svadd_x(pg1, result_21, UChi_11); \
result_12 = svadd_x(pg1, result_12, UChi_12); \
result_22 = svadd_x(pg1, result_22, UChi_12);
// YM_RECON_ACCUM
#define YM_RECON_ACCUM_A64FXd \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_30 = svadd_x(pg1, result_30, UChi_00); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_31 = svadd_x(pg1, result_31, UChi_01); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_32 = svadd_x(pg1, result_32, UChi_02); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_20 = svsub_x(pg1, result_20, UChi_10); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_21 = svsub_x(pg1, result_21, UChi_11); \
result_12 = svadd_x(pg1, result_12, UChi_12); \
result_22 = svsub_x(pg1, result_22, UChi_12);
// ZP_RECON_ACCUM
#define ZP_RECON_ACCUM_A64FXd \
result_20 = svcadd_x(pg1, result_20, UChi_00, 270); \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_21 = svcadd_x(pg1, result_21, UChi_01, 270); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_22 = svcadd_x(pg1, result_22, UChi_02, 270); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_30 = svcadd_x(pg1, result_30, UChi_10, 90); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_31 = svcadd_x(pg1, result_31, UChi_11, 90); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_32 = svcadd_x(pg1, result_32, UChi_12, 90); \
result_12 = svadd_x(pg1, result_12, UChi_12);
// ZM_RECON_ACCUM
#define ZM_RECON_ACCUM_A64FXd \
result_20 = svcadd_x(pg1, result_20, UChi_00, 90); \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_21 = svcadd_x(pg1, result_21, UChi_01, 90); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_22 = svcadd_x(pg1, result_22, UChi_02, 90); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_30 = svcadd_x(pg1, result_30, UChi_10, 270); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_31 = svcadd_x(pg1, result_31, UChi_11, 270); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_32 = svcadd_x(pg1, result_32, UChi_12, 270); \
result_12 = svadd_x(pg1, result_12, UChi_12);
// TP_RECON_ACCUM
#define TP_RECON_ACCUM_A64FXd \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_20 = svadd_x(pg1, result_20, UChi_00); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_21 = svadd_x(pg1, result_21, UChi_01); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_22 = svadd_x(pg1, result_22, UChi_02); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_30 = svadd_x(pg1, result_30, UChi_10); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_31 = svadd_x(pg1, result_31, UChi_11); \
result_12 = svadd_x(pg1, result_12, UChi_12); \
result_32 = svadd_x(pg1, result_32, UChi_12);
// TM_RECON_ACCUM
#define TM_RECON_ACCUM_A64FXd \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_20 = svsub_x(pg1, result_20, UChi_00); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_21 = svsub_x(pg1, result_21, UChi_01); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_22 = svsub_x(pg1, result_22, UChi_02); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_30 = svsub_x(pg1, result_30, UChi_10); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_31 = svsub_x(pg1, result_31, UChi_11); \
result_12 = svadd_x(pg1, result_12, UChi_12); \
result_32 = svsub_x(pg1, result_32, UChi_12);
// ZERO_PSI
#define ZERO_PSI_A64FXd \
result_00 = __svzero(result_00); \
result_01 = __svzero(result_01); \
result_02 = __svzero(result_02); \
result_10 = __svzero(result_10); \
result_11 = __svzero(result_11); \
result_12 = __svzero(result_12); \
result_20 = __svzero(result_20); \
result_21 = __svzero(result_21); \
result_22 = __svzero(result_22); \
result_30 = __svzero(result_30); \
result_31 = __svzero(result_31); \
result_32 = __svzero(result_32);
// ADD_RESULT_INTERNAL
#define ADD_RESULT_INTERNAL_A64FXd \
result_00 = svadd_x(pg1, result_00, Chimu_00); \
result_01 = svadd_x(pg1, result_01, Chimu_01); \
result_02 = svadd_x(pg1, result_02, Chimu_02); \
result_10 = svadd_x(pg1, result_10, Chimu_10); \
result_11 = svadd_x(pg1, result_11, Chimu_11); \
result_12 = svadd_x(pg1, result_12, Chimu_12); \
result_20 = svadd_x(pg1, result_20, Chimu_20); \
result_21 = svadd_x(pg1, result_21, Chimu_21); \
result_22 = svadd_x(pg1, result_22, Chimu_22); \
result_30 = svadd_x(pg1, result_30, Chimu_30); \
result_31 = svadd_x(pg1, result_31, Chimu_31); \
result_32 = svadd_x(pg1, result_32, Chimu_32);
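As a clarifying aside, the paired svcmla_x calls with rotations 0 and 90 in MULT_2SPIN above accumulate one full complex product per pair of calls. A minimal sketch under the same interleaved (re,im) layout; the function name is illustrative only.
#include <arm_sve.h>
// acc += u * x on interleaved complex doubles: rotation 0 accumulates the
// contributions of one factor's real parts, rotation 90 those of its
// imaginary parts (times i); together they form the full complex product.
static inline svfloat64_t cmla_pair(svbool_t pg, svfloat64_t acc,
                                    svfloat64_t u, svfloat64_t x)
{
    acc = svcmla_x(pg, acc, u, x, 0);
    acc = svcmla_x(pg, acc, u, x, 90);
    return acc;
}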

View File

@ -1,593 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: XXX
Copyright (C) 2020
Author: Nils Meyer <nils.meyer@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#define LOAD_CHIMU_A64FXf(x) LOAD_CHIMU_INTERLEAVED_A64FXf(x)
#define PREFETCH_CHIMU_L1(A)
#define PREFETCH_GAUGE_L1(A)
#define PREFETCH_CHIMU_L2(A)
#define PREFETCH_GAUGE_L2(A)
#define PF_GAUGE(A)
#define PREFETCH1_CHIMU(A)
#define PREFETCH_CHIMU(A)
#define LOCK_GAUGE(A)
#define UNLOCK_GAUGE(A)
#define MASK_REGS DECLARATIONS_A64FXf
#define COMPLEX_SIGNS(A)
#define LOAD64(A,B)
#define SAVE_RESULT(A,B) RESULT_A64FXf(A)
#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXf(A)
#define MAYBEPERM(A,perm) if (perm) { A ; }
#define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
#define ZERO_PSI ZERO_PSI_A64FXf
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf
#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf
#define TP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TP_PROJ_A64FXf
#define XM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XM_PROJ_A64FXf
#define YM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YM_PROJ_A64FXf
#define ZM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZM_PROJ_A64FXf
#define TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf
#define XP_RECON XP_RECON_A64FXf
#define XM_RECON XM_RECON_A64FXf
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf
#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf
#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf
#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf
#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXf
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf
#define PERMUTE_DIR0 PERM0_A64FXf
#define PERMUTE_DIR1 PERM1_A64FXf
#define PERMUTE_DIR2 PERM2_A64FXf
#define PERMUTE_DIR3 PERM3_A64FXf
// DECLARATIONS
#define DECLARATIONS_A64FXf \
const uint32_t lut[4][16] = { \
{8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \
{4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \
{2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \
{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \
svfloat32_t result_00; \
svfloat32_t result_01; \
svfloat32_t result_02; \
svfloat32_t result_10; \
svfloat32_t result_11; \
svfloat32_t result_12; \
svfloat32_t result_20; \
svfloat32_t result_21; \
svfloat32_t result_22; \
svfloat32_t result_30; \
svfloat32_t result_31; \
svfloat32_t result_32; \
svfloat32_t Chi_00; \
svfloat32_t Chi_01; \
svfloat32_t Chi_02; \
svfloat32_t Chi_10; \
svfloat32_t Chi_11; \
svfloat32_t Chi_12; \
svfloat32_t UChi_00; \
svfloat32_t UChi_01; \
svfloat32_t UChi_02; \
svfloat32_t UChi_10; \
svfloat32_t UChi_11; \
svfloat32_t UChi_12; \
svfloat32_t U_00; \
svfloat32_t U_10; \
svfloat32_t U_20; \
svfloat32_t U_01; \
svfloat32_t U_11; \
svfloat32_t U_21; \
svbool_t pg1; \
pg1 = svptrue_b32(); \
svuint32_t table0; \
svfloat32_t zero0; \
zero0 = __svzero(zero0);
#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 U_00
#define Chimu_21 U_10
#define Chimu_22 U_20
#define Chimu_30 U_01
#define Chimu_31 U_11
#define Chimu_32 U_21
// RESULT
#define RESULT_A64FXf(base) \
{ \
svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \
svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \
svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \
svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \
svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \
svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \
svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \
svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \
svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \
svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \
svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \
svstnt1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \
}
// PREFETCH_CHIMU_L2 (prefetch to L2)
#define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \
{ \
svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \
}
// PREFETCH_CHIMU_L1 (prefetch to L1)
#define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \
{ \
svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \
svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \
svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \
}
// PREFETCH_GAUGE_L2 (prefetch to L2)
#define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \
{ \
const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \
}
// PREFETCH_GAUGE_L1 (prefetch to L1)
#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \
{ \
const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \
svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \
svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \
}
// LOAD_CHI
#define LOAD_CHI_A64FXf(base) \
{ \
Chi_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \
Chi_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \
Chi_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \
Chi_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \
Chi_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \
Chi_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \
}
// LOAD_CHIMU
#define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \
{ \
Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \
Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \
Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \
Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \
Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \
Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \
Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \
Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \
Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \
Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \
Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \
Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \
}
// LOAD_CHIMU_0213
#define LOAD_CHIMU_0213_A64FXf \
{ \
const SiteSpinor & ref(in[offset]); uint64_t base = (uint64_t)&ref; \
Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \
Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \
Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \
Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \
Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \
Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \
Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \
Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \
Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \
Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \
Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \
Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \
}
// LOAD_CHIMU_0312
#define LOAD_CHIMU_0312_A64FXf \
{ \
const SiteSpinor & ref(in[offset]); uint64_t base = (uint64_t)&ref; \
Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \
Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \
Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \
Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \
Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \
Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \
Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \
Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \
Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \
Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \
Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \
Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \
}
// PERM0
#define PERM0_A64FXf \
table0 = svld1(pg1, (uint32_t*)&lut[0]); \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
Chi_10 = svtbl(Chi_10, table0); \
Chi_11 = svtbl(Chi_11, table0); \
Chi_12 = svtbl(Chi_12, table0);
// PERM1
#define PERM1_A64FXf \
table0 = svld1(pg1, (uint32_t*)&lut[1]); \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
Chi_10 = svtbl(Chi_10, table0); \
Chi_11 = svtbl(Chi_11, table0); \
Chi_12 = svtbl(Chi_12, table0);
// PERM2
#define PERM2_A64FXf \
table0 = svld1(pg1, (uint32_t*)&lut[2]); \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
Chi_10 = svtbl(Chi_10, table0); \
Chi_11 = svtbl(Chi_11, table0); \
Chi_12 = svtbl(Chi_12, table0);
// PERM3
#define PERM3_A64FXf \
table0 = svld1(pg1, (uint32_t*)&lut[3]); \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
Chi_10 = svtbl(Chi_10, table0); \
Chi_11 = svtbl(Chi_11, table0); \
Chi_12 = svtbl(Chi_12, table0);
// MULT_2SPIN
#define MULT_2SPIN_A64FXf(A) \
{ \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \
U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \
U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \
U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \
U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \
U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
UChi_00 = __svzero(UChi_00); \
UChi_10 = __svzero(UChi_10); \
UChi_01 = __svzero(UChi_01); \
UChi_11 = __svzero(UChi_11); \
UChi_02 = __svzero(UChi_02); \
UChi_12 = __svzero(UChi_12); \
UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 0); \
UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 0); \
UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 0); \
UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 0); \
UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 0); \
UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 0); \
UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 90); \
UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 90); \
UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 90); \
UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \
UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \
UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \
U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \
U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \
U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \
UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \
UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \
UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \
UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 0); \
UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 0); \
UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 0); \
UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 90); \
UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 90); \
UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 90); \
UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 90); \
UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 90); \
UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 90); \
UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 0); \
UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 0); \
UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 0); \
UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 0); \
UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 0); \
UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 0); \
UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 90); \
UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 90); \
UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 90); \
UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 90); \
UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 90); \
UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 90); \
}
// XP_PROJ
#define XP_PROJ_A64FXf \
{ \
table0 = svld1(pg1, (uint32_t*)&lut[3]); \
Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 90); \
Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 90); \
Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 90); \
Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 90); \
Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 90); \
Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 90); \
}
// XP_RECON
#define XP_RECON_A64FXf \
result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \
result_21 = svcadd_x(pg1, result_21, UChi_11, 270); \
result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \
result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \
result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \
result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \
result_00 = UChi_00; \
result_01 = UChi_01; \
result_02 = UChi_02; \
result_10 = UChi_10; \
result_11 = UChi_11; \
result_12 = UChi_12;
// XP_RECON_ACCUM
#define XP_RECON_ACCUM_A64FXf \
result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_21 = svcadd_x(pg1, result_21, UChi_11, 270); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \
result_12 = svadd_x(pg1, result_12, UChi_12);
// YP_PROJ
#define YP_PROJ_A64FXf \
{ \
table0 = svld1(pg1, (uint32_t*)&lut[2]); \
Chi_00 = svsub_x(pg1, Chimu_00, Chimu_30); \
Chi_01 = svsub_x(pg1, Chimu_01, Chimu_31); \
Chi_02 = svsub_x(pg1, Chimu_02, Chimu_32); \
Chi_10 = svadd_x(pg1, Chimu_10, Chimu_20); \
Chi_11 = svadd_x(pg1, Chimu_11, Chimu_21); \
Chi_12 = svadd_x(pg1, Chimu_12, Chimu_22); \
}
// ZP_PROJ
#define ZP_PROJ_A64FXf \
{ \
table0 = svld1(pg1, (uint32_t*)&lut[1]); \
Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 90); \
Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 90); \
Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 90); \
Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 270); \
Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 270); \
Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 270); \
}
// TP_PROJ
#define TP_PROJ_A64FXf \
{ \
table0 = svld1(pg1, (uint32_t*)&lut[0]); \
Chi_00 = svadd_x(pg1, Chimu_00, Chimu_20); \
Chi_01 = svadd_x(pg1, Chimu_01, Chimu_21); \
Chi_02 = svadd_x(pg1, Chimu_02, Chimu_22); \
Chi_10 = svadd_x(pg1, Chimu_10, Chimu_30); \
Chi_11 = svadd_x(pg1, Chimu_11, Chimu_31); \
Chi_12 = svadd_x(pg1, Chimu_12, Chimu_32); \
}
// XM_PROJ
#define XM_PROJ_A64FXf \
{ \
table0 = svld1(pg1, (uint32_t*)&lut[3]); \
Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 270); \
Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 270); \
Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 270); \
Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 270); \
Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 270); \
Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 270); \
}
// XM_RECON
#define XM_RECON_A64FXf \
result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \
result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \
result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \
result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \
result_31 = svcadd_x(pg1, result_31, UChi_01, 90); \
result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \
result_00 = UChi_00; \
result_01 = UChi_01; \
result_02 = UChi_02; \
result_10 = UChi_10; \
result_11 = UChi_11; \
result_12 = UChi_12;
// YM_PROJ
#define YM_PROJ_A64FXf \
{ \
table0 = svld1(pg1, (uint32_t*)&lut[2]); \
Chi_00 = svadd_x(pg1, Chimu_00, Chimu_30); \
Chi_01 = svadd_x(pg1, Chimu_01, Chimu_31); \
Chi_02 = svadd_x(pg1, Chimu_02, Chimu_32); \
Chi_10 = svsub_x(pg1, Chimu_10, Chimu_20); \
Chi_11 = svsub_x(pg1, Chimu_11, Chimu_21); \
Chi_12 = svsub_x(pg1, Chimu_12, Chimu_22); \
}
// ZM_PROJ
#define ZM_PROJ_A64FXf \
{ \
table0 = svld1(pg1, (uint32_t*)&lut[1]); \
Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 270); \
Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 270); \
Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 270); \
Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 90); \
Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 90); \
Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 90); \
}
// TM_PROJ
#define TM_PROJ_A64FXf \
{ \
table0 = svld1(pg1, (uint32_t*)&lut[0]); \
Chi_00 = svsub_x(pg1, Chimu_00, Chimu_20); \
Chi_01 = svsub_x(pg1, Chimu_01, Chimu_21); \
Chi_02 = svsub_x(pg1, Chimu_02, Chimu_22); \
Chi_10 = svsub_x(pg1, Chimu_10, Chimu_30); \
Chi_11 = svsub_x(pg1, Chimu_11, Chimu_31); \
Chi_12 = svsub_x(pg1, Chimu_12, Chimu_32); \
}
// XM_RECON_ACCUM
#define XM_RECON_ACCUM_A64FXf \
result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \
result_31 = svcadd_x(pg1, result_31, UChi_01, 90); \
result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \
result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \
result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \
result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \
result_00 = UChi_00; \
result_01 = UChi_01; \
result_02 = UChi_02; \
result_10 = UChi_10; \
result_11 = UChi_11; \
result_12 = UChi_12;
// YP_RECON_ACCUM
#define YP_RECON_ACCUM_A64FXf \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_30 = svsub_x(pg1, result_30, UChi_00); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_31 = svsub_x(pg1, result_31, UChi_01); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_32 = svsub_x(pg1, result_32, UChi_02); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_20 = svadd_x(pg1, result_20, UChi_10); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_21 = svadd_x(pg1, result_21, UChi_11); \
result_12 = svadd_x(pg1, result_12, UChi_12); \
result_22 = svadd_x(pg1, result_22, UChi_12);
// YM_RECON_ACCUM
#define YM_RECON_ACCUM_A64FXf \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_30 = svadd_x(pg1, result_30, UChi_00); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_31 = svadd_x(pg1, result_31, UChi_01); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_32 = svadd_x(pg1, result_32, UChi_02); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_20 = svsub_x(pg1, result_20, UChi_10); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_21 = svsub_x(pg1, result_21, UChi_11); \
result_12 = svadd_x(pg1, result_12, UChi_12); \
result_22 = svsub_x(pg1, result_22, UChi_12);
// ZP_RECON_ACCUM
#define ZP_RECON_ACCUM_A64FXf \
result_20 = svcadd_x(pg1, result_20, UChi_00, 270); \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_21 = svcadd_x(pg1, result_21, UChi_01, 270); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_22 = svcadd_x(pg1, result_22, UChi_02, 270); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_30 = svcadd_x(pg1, result_30, UChi_10, 90); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_31 = svcadd_x(pg1, result_31, UChi_11, 90); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_32 = svcadd_x(pg1, result_32, UChi_12, 90); \
result_12 = svadd_x(pg1, result_12, UChi_12);
// ZM_RECON_ACCUM
#define ZM_RECON_ACCUM_A64FXf \
result_20 = svcadd_x(pg1, result_20, UChi_00, 90); \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_21 = svcadd_x(pg1, result_21, UChi_01, 90); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_22 = svcadd_x(pg1, result_22, UChi_02, 90); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_30 = svcadd_x(pg1, result_30, UChi_10, 270); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_31 = svcadd_x(pg1, result_31, UChi_11, 270); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_32 = svcadd_x(pg1, result_32, UChi_12, 270); \
result_12 = svadd_x(pg1, result_12, UChi_12);
// TP_RECON_ACCUM
#define TP_RECON_ACCUM_A64FXf \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_20 = svadd_x(pg1, result_20, UChi_00); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_21 = svadd_x(pg1, result_21, UChi_01); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_22 = svadd_x(pg1, result_22, UChi_02); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_30 = svadd_x(pg1, result_30, UChi_10); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_31 = svadd_x(pg1, result_31, UChi_11); \
result_12 = svadd_x(pg1, result_12, UChi_12); \
result_32 = svadd_x(pg1, result_32, UChi_12);
// TM_RECON_ACCUM
#define TM_RECON_ACCUM_A64FXf \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_20 = svsub_x(pg1, result_20, UChi_00); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_21 = svsub_x(pg1, result_21, UChi_01); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_22 = svsub_x(pg1, result_22, UChi_02); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_30 = svsub_x(pg1, result_30, UChi_10); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_31 = svsub_x(pg1, result_31, UChi_11); \
result_12 = svadd_x(pg1, result_12, UChi_12); \
result_32 = svsub_x(pg1, result_32, UChi_12);
// ZERO_PSI
#define ZERO_PSI_A64FXf \
result_00 = __svzero(result_00); \
result_01 = __svzero(result_01); \
result_02 = __svzero(result_02); \
result_10 = __svzero(result_10); \
result_11 = __svzero(result_11); \
result_12 = __svzero(result_12); \
result_20 = __svzero(result_20); \
result_21 = __svzero(result_21); \
result_22 = __svzero(result_22); \
result_30 = __svzero(result_30); \
result_31 = __svzero(result_31); \
result_32 = __svzero(result_32);
// ADD_RESULT_INTERNAL
#define ADD_RESULT_INTERNAL_A64FXf \
result_00 = svadd_x(pg1, result_00, Chimu_00); \
result_01 = svadd_x(pg1, result_01, Chimu_01); \
result_02 = svadd_x(pg1, result_02, Chimu_02); \
result_10 = svadd_x(pg1, result_10, Chimu_10); \
result_11 = svadd_x(pg1, result_11, Chimu_11); \
result_12 = svadd_x(pg1, result_12, Chimu_12); \
result_20 = svadd_x(pg1, result_20, Chimu_20); \
result_21 = svadd_x(pg1, result_21, Chimu_21); \
result_22 = svadd_x(pg1, result_22, Chimu_22); \
result_30 = svadd_x(pg1, result_30, Chimu_30); \
result_31 = svadd_x(pg1, result_31, Chimu_31); \
result_32 = svadd_x(pg1, result_32, Chimu_32);
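The PERM* macros above shuffle lanes by loading one row of the hard-coded index table and applying svtbl. A minimal sketch of the same idea, assuming the 512-bit vector length of the A64FX that the tables are written for; names are illustrative only.
#include <arm_sve.h>
#include <stdint.h>
// Swap the two 256-bit halves of a 512-bit vector of floats, using lut row 0
// from DECLARATIONS_A64FXf -- the permutation PERM0_A64FXf applies to every
// Chi register.
static inline svfloat32_t permute_half_swap(svfloat32_t v)
{
    static const uint32_t idx[16] =
        {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
    svbool_t pg = svptrue_b32();
    svuint32_t table = svld1(pg, idx);
    return svtbl(v, table);
}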

View File

@ -1,70 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Fujitsu_A64FX_undef.h
Copyright (C) 2020
Author: Nils Meyer <nils.meyer@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#undef LOAD_CHIMU_A64FXd
#undef LOAD_CHIMU_A64FXf
#undef PREFETCH_CHIMU_L1
#undef PREFETCH_GAUGE_L1
#undef PREFETCH_CHIMU_L2
#undef PREFETCH_GAUGE_L2
#undef PF_GAUGE
#undef PREFETCH1_CHIMU
#undef PREFETCH_CHIMU
#undef LOCK_GAUGE
#undef UNLOCK_GAUGE
#undef MASK_REGS
#undef COMPLEX_SIGNS
#undef LOAD64
#undef SAVE_RESULT
#undef ADD_RESULT
#undef MULT_2SPIN_DIR_PF
#undef MAYBEPERM
#undef LOAD_CHI
#undef ZERO_PSI
#undef XP_PROJMEM
#undef YP_PROJMEM
#undef ZP_PROJMEM
#undef TP_PROJMEM
#undef XM_PROJMEM
#undef YM_PROJMEM
#undef ZM_PROJMEM
#undef TM_PROJMEM
#undef XP_RECON
#undef XM_RECON
#undef XM_RECON_ACCUM
#undef YM_RECON_ACCUM
#undef ZM_RECON_ACCUM
#undef TM_RECON_ACCUM
#undef XP_RECON_ACCUM
#undef YP_RECON_ACCUM
#undef ZP_RECON_ACCUM
#undef TP_RECON_ACCUM
#undef PERMUTE_DIR0
#undef PERMUTE_DIR1
#undef PERMUTE_DIR2
#undef PERMUTE_DIR3

View File

@ -30,13 +30,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#pragma once
#if defined(A64FXINTRIN)
#pragma message("A64FX Wilson kernels intrin")
#else
#pragma message("A64FX Wilson kernels asm")
#endif
#if defined(A64FX)
// undefine everything
#include <simd/Fujitsu_A64FX_undef.h>
///////////////////////////////////////////////////////////
// If we are A64FX specialise the single precision routine
///////////////////////////////////////////////////////////
@ -46,7 +44,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <simd/Fujitsu_A64FX_asm_single.h>
#endif
/// Switch off the 5d vectorised code optimisations
#undef DWFVEC5D
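A hedged sketch of the header-selection pattern this hunk implies; the intrinsics header name and the guards hidden in the elided lines are assumptions, not the file's literal text.
#if defined(A64FX)
#include <simd/Fujitsu_A64FX_undef.h>            // reset every kernel macro first
#if defined(A64FXINTRIN)
#include <simd/Fujitsu_A64FX_intrin_single.h>    // ACLE-intrinsics macro set (assumed name)
#else
#include <simd/Fujitsu_A64FX_asm_single.h>       // inline-assembly macro set
#endif
#undef DWFVEC5D                                  // 5d-vectorised optimisations switched off
#endif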

View File

@ -43,6 +43,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define MAYBEPERM(A,perm) if (perm) { A ; }
#define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
#define ZERO_PSI ZERO_PSI_A64FXd
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd
#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd
@ -53,6 +54,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd
#define XP_RECON XP_RECON_A64FXd
#define XM_RECON XM_RECON_A64FXd
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd
#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd
#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd
#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd
@ -689,3 +691,23 @@ asm ( \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// ADD_RESULT_INTERNAL
#define ADD_RESULT_INTERNAL_A64FXd \
asm ( \
"fadd z0.d, p5/m, z0.d, z12.d \n\t" \
"fadd z1.d, p5/m, z1.d, z13.d \n\t" \
"fadd z2.d, p5/m, z2.d, z14.d \n\t" \
"fadd z3.d, p5/m, z3.d, z15.d \n\t" \
"fadd z4.d, p5/m, z4.d, z16.d \n\t" \
"fadd z5.d, p5/m, z5.d, z17.d \n\t" \
"fadd z6.d, p5/m, z6.d, z24.d \n\t" \
"fadd z7.d, p5/m, z7.d, z25.d \n\t" \
"fadd z8.d, p5/m, z8.d, z26.d \n\t" \
"fadd z9.d, p5/m, z9.d, z27.d \n\t" \
"fadd z10.d, p5/m, z10.d, z28.d \n\t" \
"fadd z11.d, p5/m, z11.d, z29.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
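For reference, the ADD_RESULT macro introduced above expands to LOAD_CHIMU, the fadd block just shown, and RESULT, i.e. it accumulates the freshly computed spinor into the value already stored at the destination. A scalar sketch of that net effect; illustrative only, not kernel code.
// base[i] += psi[i] over the 4x3 complex components (24 reals) of one site's
// spinor: load the stored value, add the computed result, store it back.
static void add_result_reference(double *base, const double *psi, int n)
{
    for (int i = 0; i < n; ++i)
        base[i] += psi[i];
}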

View File

@ -43,6 +43,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define MAYBEPERM(A,perm) if (perm) { A ; }
#define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
#define ZERO_PSI ZERO_PSI_A64FXf
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf
#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf
@ -53,6 +54,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf
#define XP_RECON XP_RECON_A64FXf
#define XM_RECON XM_RECON_A64FXf
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf
#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf
#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf
#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf
@ -703,3 +705,23 @@ asm ( \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// ADD_RESULT_INTERNAL
#define ADD_RESULT_INTERNAL_A64FXf \
asm ( \
"fadd z0.f, p5/m, z0.f, z12.f \n\t" \
"fadd z1.f, p5/m, z1.f, z13.f \n\t" \
"fadd z2.f, p5/m, z2.f, z14.f \n\t" \
"fadd z3.f, p5/m, z3.f, z15.f \n\t" \
"fadd z4.f, p5/m, z4.f, z16.f \n\t" \
"fadd z5.f, p5/m, z5.f, z17.f \n\t" \
"fadd z6.f, p5/m, z6.f, z24.f \n\t" \
"fadd z7.f, p5/m, z7.f, z25.f \n\t" \
"fadd z8.f, p5/m, z8.f, z26.f \n\t" \
"fadd z9.f, p5/m, z9.f, z27.f \n\t" \
"fadd z10.f, p5/m, z10.f, z28.f \n\t" \
"fadd z11.f, p5/m, z11.f, z29.f \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);

View File

@ -43,6 +43,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define MAYBEPERM(A,perm) if (perm) { A ; }
#define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
#define ZERO_PSI ZERO_PSI_A64FXd
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd
#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd
@ -53,6 +54,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define TM_PROJMEM(base) LOAD_CHIMU_A64FXd(base); TM_PROJ_A64FXd
#define XP_RECON XP_RECON_A64FXd
#define XM_RECON XM_RECON_A64FXd
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd
#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd
#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd
#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd
@ -565,3 +567,18 @@ Author: Nils Meyer <nils.meyer@ur.de>
result_31 = __svzero(result_31); \
result_32 = __svzero(result_32);
// ADD_RESULT_INTERNAL
#define ADD_RESULT_INTERNAL_A64FXd \
result_00 = svadd_x(pg1, result_00, Chimu_00); \
result_01 = svadd_x(pg1, result_01, Chimu_01); \
result_02 = svadd_x(pg1, result_02, Chimu_02); \
result_10 = svadd_x(pg1, result_10, Chimu_10); \
result_11 = svadd_x(pg1, result_11, Chimu_11); \
result_12 = svadd_x(pg1, result_12, Chimu_12); \
result_20 = svadd_x(pg1, result_20, Chimu_20); \
result_21 = svadd_x(pg1, result_21, Chimu_21); \
result_22 = svadd_x(pg1, result_22, Chimu_22); \
result_30 = svadd_x(pg1, result_30, Chimu_30); \
result_31 = svadd_x(pg1, result_31, Chimu_31); \
result_32 = svadd_x(pg1, result_32, Chimu_32);

View File

@ -43,6 +43,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define MAYBEPERM(A,perm) if (perm) { A ; }
#define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
#define ZERO_PSI ZERO_PSI_A64FXf
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf
#define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf
@ -53,6 +54,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define TM_PROJMEM(base) LOAD_CHIMU_A64FXf(base); TM_PROJ_A64FXf
#define XP_RECON XP_RECON_A64FXf
#define XM_RECON XM_RECON_A64FXf
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf
#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf
#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf
#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf
@ -574,3 +576,18 @@ Author: Nils Meyer <nils.meyer@ur.de>
result_31 = __svzero(result_31); \
result_32 = __svzero(result_32);
// ADD_RESULT_INTERNAL
#define ADD_RESULT_INTERNAL_A64FXf \
result_00 = svadd_x(pg1, result_00, Chimu_00); \
result_01 = svadd_x(pg1, result_01, Chimu_01); \
result_02 = svadd_x(pg1, result_02, Chimu_02); \
result_10 = svadd_x(pg1, result_10, Chimu_10); \
result_11 = svadd_x(pg1, result_11, Chimu_11); \
result_12 = svadd_x(pg1, result_12, Chimu_12); \
result_20 = svadd_x(pg1, result_20, Chimu_20); \
result_21 = svadd_x(pg1, result_21, Chimu_21); \
result_22 = svadd_x(pg1, result_22, Chimu_22); \
result_30 = svadd_x(pg1, result_30, Chimu_30); \
result_31 = svadd_x(pg1, result_31, Chimu_31); \
result_32 = svadd_x(pg1, result_32, Chimu_32);

View File

@ -41,6 +41,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
#undef COMPLEX_SIGNS
#undef LOAD64
#undef SAVE_RESULT
#undef ADD_RESULT
#undef MULT_2SPIN_DIR_PF
#undef MAYBEPERM
#undef LOAD_CHI
@ -55,6 +56,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
#undef TM_PROJMEM
#undef XP_RECON
#undef XM_RECON
#undef XM_RECON_ACCUM
#undef YM_RECON_ACCUM
#undef ZM_RECON_ACCUM
#undef TM_RECON_ACCUM