1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

Dslash finally works; cleaned up; uses MOVPRFX in assembly

This commit is contained in:
nils meyer 2020-04-10 22:26:40 +02:00
parent 160f78c1e4
commit 974586bedc
5 changed files with 344 additions and 252 deletions

View File

@ -372,19 +372,19 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
#ifndef GRID_NVCC
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); printf("."); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); /* printf("."); */ return;}
#endif
} else if( interior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
#ifndef GRID_NVCC
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); printf("-"); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); /* printf("-"); */ return;}
#endif
} else if( exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
#ifndef GRID_NVCC
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); printf("+"); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); /* printf("+"); */ return;}
#endif
}
assert(0 && " Kernel optimisation case not covered ");

View File

@ -2,7 +2,7 @@
Grid physics library, www.github.com/paboyle/Grid
Source file: XXX
Source file: Fujitsu_A64FX_asm_double.h
Copyright (C) 2020
@ -40,9 +40,9 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define LOAD64(A,B)
#define SAVE_RESULT(A,B) RESULT_A64FXd(A)
#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXd(A)
#define MAYBEPERM(A,perm) if (perm) { A ; }
#define MAYBEPERM(A,perm) { A ; }
#define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
#define ZERO_PSI ZERO_PSI_A64FXd
#define ZERO_PSI
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd
@ -62,10 +62,10 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd
#define PERMUTE_DIR0 PERM0_A64FXd
#define PERMUTE_DIR1 PERM1_A64FXd
#define PERMUTE_DIR2 PERM2_A64FXd
#define PERMUTE_DIR3 PERM3_A64FXd
#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXd; }
#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXd; }
#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXd; }
#define PERMUTE_DIR3
// DECLARATIONS
#define DECLARATIONS_A64FXd \
const uint64_t lut[4][8] = { \
@ -155,14 +155,14 @@ asm ( \
#define LOAD_CHI_A64FXd(base) \
{ \
asm ( \
"ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
"ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
"ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \
"ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
"ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
"ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \
"ld1d { z12.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \
"ld1d { z13.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \
"ld1d { z14.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \
"ld1d { z15.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \
"ld1d { z16.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \
"ld1d { z17.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \
: \
: [fetchptr] "r" (base + 2 * 3 * 64) \
: [fetchptr] "r" (base) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
@ -234,40 +234,45 @@ asm ( \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PERM0
#define PERM0_A64FXd \
// LOAD_TABLE0
#define LOAD_TABLE0 \
asm ( \
"ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"tbl z12.d, { z12.d }, z30.d \n\t" \
"tbl z13.d, { z13.d }, z30.d \n\t" \
"tbl z14.d, { z14.d }, z30.d \n\t" \
"tbl z15.d, { z15.d }, z30.d \n\t" \
"tbl z16.d, { z16.d }, z30.d \n\t" \
"tbl z17.d, { z17.d }, z30.d \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (0) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM1
#define PERM1_A64FXd \
// LOAD_TABLE1
#define LOAD_TABLE1 \
asm ( \
"ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"tbl z12.d, { z12.d }, z30.d \n\t" \
"tbl z13.d, { z13.d }, z30.d \n\t" \
"tbl z14.d, { z14.d }, z30.d \n\t" \
"tbl z15.d, { z15.d }, z30.d \n\t" \
"tbl z16.d, { z16.d }, z30.d \n\t" \
"tbl z17.d, { z17.d }, z30.d \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (1) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM2
#define PERM2_A64FXd \
// LOAD_TABLE2
#define LOAD_TABLE2 \
asm ( \
"ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (2) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// LOAD_TABLE3
// Loads one SVE vector of 64-bit elements into z30 from the address
// &lut[0] + 3 * VL bytes ("mul vl" scales the immediate index by the vector
// length). The load is governed by p5 with zeroing predication.
// NOTE(review): this only lands on lut[3] if one row of lut is exactly one
// vector long; lut is declared as uint64_t[4][8] (64 bytes per row), so a
// 512-bit vector length is presumably assumed -- TODO confirm.
// NOTE(review): p5 is read here but not set in this statement; it is expected
// to have been initialised by an earlier asm statement (other macros in this
// file issue "ptrue p5.d") -- confirm ordering at the call site.
// The clobber list names "memory", "cc", p5 and z0-z31; no output operands are
// declared, so z30 is handed to later asm statements implicitly.
#define LOAD_TABLE3 \
asm ( \
"ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (3) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM0
#define PERM0_A64FXd \
asm ( \
"tbl z12.d, { z12.d }, z30.d \n\t" \
"tbl z13.d, { z13.d }, z30.d \n\t" \
"tbl z14.d, { z14.d }, z30.d \n\t" \
@ -275,8 +280,36 @@ asm ( \
"tbl z16.d, { z16.d }, z30.d \n\t" \
"tbl z17.d, { z17.d }, z30.d \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (2) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM1
// Applies an in-place SVE TBL (table lookup) to each of z12..z17, using the
// 64-bit element indices currently held in z30.
// This macro does not load z30 itself: PERMUTE_DIR1 issues LOAD_TABLE1 first
// and then runs this macro only when `perm` is set.
// The instruction sequence is the same as in PERM2_A64FXd; which permutation
// is applied depends solely on the table row previously loaded into z30.
// No operands are declared, so register contents are carried implicitly
// between separate asm statements. Only registers are touched, and the clobber
// list (p5, cc, z0-z31) carries no "memory" entry.
#define PERM1_A64FXd \
asm ( \
"tbl z12.d, { z12.d }, z30.d \n\t" \
"tbl z13.d, { z13.d }, z30.d \n\t" \
"tbl z14.d, { z14.d }, z30.d \n\t" \
"tbl z15.d, { z15.d }, z30.d \n\t" \
"tbl z16.d, { z16.d }, z30.d \n\t" \
"tbl z17.d, { z17.d }, z30.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM2
// Applies an in-place SVE TBL (table lookup) to each of z12..z17, using the
// 64-bit element indices currently held in z30.
// This macro does not load z30 itself: PERMUTE_DIR2 issues LOAD_TABLE2 first
// and then runs this macro only when `perm` is set.
// The instruction sequence is the same as in PERM1_A64FXd; which permutation
// is applied depends solely on the table row previously loaded into z30.
// No operands are declared, so register contents are carried implicitly
// between separate asm statements. Only registers are touched, and the clobber
// list (p5, cc, z0-z31) carries no "memory" entry.
#define PERM2_A64FXd \
asm ( \
"tbl z12.d, { z12.d }, z30.d \n\t" \
"tbl z13.d, { z13.d }, z30.d \n\t" \
"tbl z14.d, { z14.d }, z30.d \n\t" \
"tbl z15.d, { z15.d }, z30.d \n\t" \
"tbl z16.d, { z16.d }, z30.d \n\t" \
"tbl z17.d, { z17.d }, z30.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM3
@ -287,23 +320,24 @@ asm ( \
{ \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
asm ( \
"ptrue p5.d \n\t" \
"ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
"ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
"ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \
"ld1d { z27.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
"ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
"ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \
"fmov z18.d , 0 \n\t" \
"fmov z21.d , 0 \n\t" \
"fmov z19.d , 0 \n\t" \
"fmov z22.d , 0 \n\t" \
"fmov z20.d , 0 \n\t" \
"fmov z23.d , 0 \n\t" \
"movprfx z18.d, p5/m, z31.d \n\t" \
"fcmla z18.d, p5/m, z24.d, z12.d, 0 \n\t" \
"movprfx z21.d, p5/m, z31.d \n\t" \
"fcmla z21.d, p5/m, z24.d, z15.d, 0 \n\t" \
"movprfx z19.d, p5/m, z31.d \n\t" \
"fcmla z19.d, p5/m, z25.d, z12.d, 0 \n\t" \
"movprfx z22.d, p5/m, z31.d \n\t" \
"fcmla z22.d, p5/m, z25.d, z15.d, 0 \n\t" \
"movprfx z20.d, p5/m, z31.d \n\t" \
"fcmla z20.d, p5/m, z26.d, z12.d, 0 \n\t" \
"movprfx z23.d, p5/m, z31.d \n\t" \
"fcmla z23.d, p5/m, z26.d, z15.d, 0 \n\t" \
"fcmla z18.d, p5/m, z24.d, z12.d, 90 \n\t" \
"fcmla z21.d, p5/m, z24.d, z15.d, 90 \n\t" \
@ -347,6 +381,7 @@ asm ( \
#define XP_PROJ_A64FXd \
{ \
asm ( \
"ptrue p5.d \n\t" \
"fcadd z12.d, p5/m, z12.d, z27.d, 90 \n\t" \
"fcadd z13.d, p5/m, z13.d, z28.d, 90 \n\t" \
"fcadd z14.d, p5/m, z14.d, z29.d, 90 \n\t" \
@ -361,11 +396,17 @@ asm ( \
// XP_RECON
#define XP_RECON_A64FXd \
asm ( \
"movprfx z6.d, p5/m, z31.d \n\t" \
"fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \
"movprfx z7.d, p5/m, z31.d \n\t" \
"fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \
"movprfx z8.d, p5/m, z31.d \n\t" \
"fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \
"movprfx z9.d, p5/m, z31.d \n\t" \
"fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \
"movprfx z10.d, p5/m, z31.d \n\t" \
"fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \
"movprfx z11.d, p5/m, z31.d \n\t" \
"fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \
"mov z0.d, p5/m, z18.d \n\t" \
"mov z1.d, p5/m, z19.d \n\t" \
@ -402,7 +443,7 @@ asm ( \
#define YP_PROJ_A64FXd \
{ \
asm ( \
"ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"ptrue p5.d \n\t" \
"fsub z12.d, p5/m, z12.d, z27.d \n\t" \
"fsub z13.d, p5/m, z13.d, z28.d \n\t" \
"fsub z14.d, p5/m, z14.d, z29.d \n\t" \
@ -410,15 +451,15 @@ asm ( \
"fadd z16.d, p5/m, z16.d, z25.d \n\t" \
"fadd z17.d, p5/m, z17.d, z26.d \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (2) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// ZP_PROJ
#define ZP_PROJ_A64FXd \
{ \
asm ( \
"ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"ptrue p5.d \n\t" \
"fcadd z12.d, p5/m, z12.d, z24.d, 90 \n\t" \
"fcadd z13.d, p5/m, z13.d, z25.d, 90 \n\t" \
"fcadd z14.d, p5/m, z14.d, z26.d, 90 \n\t" \
@ -426,15 +467,15 @@ asm ( \
"fcadd z16.d, p5/m, z16.d, z28.d, 270 \n\t" \
"fcadd z17.d, p5/m, z17.d, z29.d, 270 \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (1) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// TP_PROJ
#define TP_PROJ_A64FXd \
{ \
asm ( \
"ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"ptrue p5.d \n\t" \
"fadd z12.d, p5/m, z12.d, z24.d \n\t" \
"fadd z13.d, p5/m, z13.d, z25.d \n\t" \
"fadd z14.d, p5/m, z14.d, z26.d \n\t" \
@ -442,14 +483,15 @@ asm ( \
"fadd z16.d, p5/m, z16.d, z28.d \n\t" \
"fadd z17.d, p5/m, z17.d, z29.d \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (0) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XM_PROJ
#define XM_PROJ_A64FXd \
{ \
asm ( \
"ptrue p5.d \n\t" \
"fcadd z12.d, p5/m, z12.d, z27.d, 270 \n\t" \
"fcadd z13.d, p5/m, z13.d, z28.d, 270 \n\t" \
"fcadd z14.d, p5/m, z14.d, z29.d, 270 \n\t" \
@ -464,11 +506,17 @@ asm ( \
// XM_RECON
#define XM_RECON_A64FXd \
asm ( \
"movprfx z6.d, p5/m, z31.d \n\t" \
"fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \
"movprfx z7.d, p5/m, z31.d \n\t" \
"fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \
"movprfx z8.d, p5/m, z31.d \n\t" \
"fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \
"movprfx z9.d, p5/m, z31.d \n\t" \
"fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \
"movprfx z10.d, p5/m, z31.d \n\t" \
"fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \
"movprfx z11.d, p5/m, z31.d \n\t" \
"fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \
"mov z0.d, p5/m, z18.d \n\t" \
"mov z1.d, p5/m, z19.d \n\t" \
@ -485,7 +533,7 @@ asm ( \
#define YM_PROJ_A64FXd \
{ \
asm ( \
"ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"ptrue p5.d \n\t" \
"fadd z12.d, p5/m, z12.d, z27.d \n\t" \
"fadd z13.d, p5/m, z13.d, z28.d \n\t" \
"fadd z14.d, p5/m, z14.d, z29.d \n\t" \
@ -493,15 +541,15 @@ asm ( \
"fsub z16.d, p5/m, z16.d, z25.d \n\t" \
"fsub z17.d, p5/m, z17.d, z26.d \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (2) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// ZM_PROJ
#define ZM_PROJ_A64FXd \
{ \
asm ( \
"ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"ptrue p5.d \n\t" \
"fcadd z12.d, p5/m, z12.d, z24.d, 270 \n\t" \
"fcadd z13.d, p5/m, z13.d, z25.d, 270 \n\t" \
"fcadd z14.d, p5/m, z14.d, z26.d, 270 \n\t" \
@ -509,15 +557,15 @@ asm ( \
"fcadd z16.d, p5/m, z16.d, z28.d, 90 \n\t" \
"fcadd z17.d, p5/m, z17.d, z29.d, 90 \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (1) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// TM_PROJ
#define TM_PROJ_A64FXd \
{ \
asm ( \
"ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"ptrue p5.d \n\t" \
"fsub z12.d, p5/m, z12.d, z24.d \n\t" \
"fsub z13.d, p5/m, z13.d, z25.d \n\t" \
"fsub z14.d, p5/m, z14.d, z26.d \n\t" \
@ -525,8 +573,8 @@ asm ( \
"fsub z16.d, p5/m, z16.d, z28.d \n\t" \
"fsub z17.d, p5/m, z17.d, z29.d \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (0) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XM_RECON_ACCUM
@ -538,12 +586,12 @@ asm ( \
"fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \
"fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \
"fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \
"mov z0.d, p5/m, z18.d \n\t" \
"mov z1.d, p5/m, z19.d \n\t" \
"mov z2.d, p5/m, z20.d \n\t" \
"mov z3.d, p5/m, z21.d \n\t" \
"mov z4.d, p5/m, z22.d \n\t" \
"mov z5.d, p5/m, z23.d \n\t" \
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \

View File

@ -2,7 +2,7 @@
Grid physics library, www.github.com/paboyle/Grid
Source file: XXX
Source file: Fujitsu_A64FX_asm_single.h
Copyright (C) 2020
@ -40,9 +40,9 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define LOAD64(A,B)
#define SAVE_RESULT(A,B) RESULT_A64FXf(A)
#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXf(A)
#define MAYBEPERM(A,perm) if (perm) { A ; }
#define MAYBEPERM(A,perm) { A ; }
#define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
#define ZERO_PSI ZERO_PSI_A64FXf
#define ZERO_PSI
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf
@ -62,10 +62,10 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf
#define PERMUTE_DIR0 PERM0_A64FXf
#define PERMUTE_DIR1 PERM1_A64FXf
#define PERMUTE_DIR2 PERM2_A64FXf
#define PERMUTE_DIR3 PERM3_A64FXf
#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXf; }
#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXf; }
#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXf; }
#define PERMUTE_DIR3 LOAD_TABLE3; if (perm) { PERM3_A64FXf; }
// DECLARATIONS
#define DECLARATIONS_A64FXf \
const uint32_t lut[4][16] = { \
@ -155,14 +155,14 @@ asm ( \
#define LOAD_CHI_A64FXf(base) \
{ \
asm ( \
"ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
"ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
"ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \
"ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
"ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
"ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \
"ld1w { z12.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \
"ld1w { z13.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \
"ld1w { z14.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \
"ld1w { z15.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \
"ld1w { z16.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \
"ld1w { z17.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \
: \
: [fetchptr] "r" (base + 2 * 3 * 64) \
: [fetchptr] "r" (base) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
@ -234,55 +234,45 @@ asm ( \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PERM0
#define PERM0_A64FXf \
// LOAD_TABLE0
#define LOAD_TABLE0 \
asm ( \
"ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"tbl z12.s, { z12.s }, z30.s \n\t" \
"tbl z13.s, { z13.s }, z30.s \n\t" \
"tbl z14.s, { z14.s }, z30.s \n\t" \
"tbl z15.s, { z15.s }, z30.s \n\t" \
"tbl z16.s, { z16.s }, z30.s \n\t" \
"tbl z17.s, { z17.s }, z30.s \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (0) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM1
#define PERM1_A64FXf \
// LOAD_TABLE1
#define LOAD_TABLE1 \
asm ( \
"ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"tbl z12.s, { z12.s }, z30.s \n\t" \
"tbl z13.s, { z13.s }, z30.s \n\t" \
"tbl z14.s, { z14.s }, z30.s \n\t" \
"tbl z15.s, { z15.s }, z30.s \n\t" \
"tbl z16.s, { z16.s }, z30.s \n\t" \
"tbl z17.s, { z17.s }, z30.s \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (1) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM2
#define PERM2_A64FXf \
// LOAD_TABLE2
#define LOAD_TABLE2 \
asm ( \
"ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"tbl z12.s, { z12.s }, z30.s \n\t" \
"tbl z13.s, { z13.s }, z30.s \n\t" \
"tbl z14.s, { z14.s }, z30.s \n\t" \
"tbl z15.s, { z15.s }, z30.s \n\t" \
"tbl z16.s, { z16.s }, z30.s \n\t" \
"tbl z17.s, { z17.s }, z30.s \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (2) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM3
#define PERM3_A64FXf \
// LOAD_TABLE3
#define LOAD_TABLE3 \
asm ( \
"ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (3) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM0
#define PERM0_A64FXf \
asm ( \
"tbl z12.s, { z12.s }, z30.s \n\t" \
"tbl z13.s, { z13.s }, z30.s \n\t" \
"tbl z14.s, { z14.s }, z30.s \n\t" \
@ -290,8 +280,50 @@ asm ( \
"tbl z16.s, { z16.s }, z30.s \n\t" \
"tbl z17.s, { z17.s }, z30.s \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (3) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM1
// Single-precision variant: applies an in-place SVE TBL (table lookup) to each
// of z12..z17, using the 32-bit element indices currently held in z30.
// This macro does not load z30 itself: PERMUTE_DIR1 issues LOAD_TABLE1 first
// and then runs this macro only when `perm` is set.
// The instruction sequence is the same as in PERM2_A64FXf and PERM3_A64FXf;
// which permutation is applied depends solely on the table row previously
// loaded into z30.
// No operands are declared, so register contents are carried implicitly
// between separate asm statements. Only registers are touched, and the clobber
// list (p5, cc, z0-z31) carries no "memory" entry.
#define PERM1_A64FXf \
asm ( \
"tbl z12.s, { z12.s }, z30.s \n\t" \
"tbl z13.s, { z13.s }, z30.s \n\t" \
"tbl z14.s, { z14.s }, z30.s \n\t" \
"tbl z15.s, { z15.s }, z30.s \n\t" \
"tbl z16.s, { z16.s }, z30.s \n\t" \
"tbl z17.s, { z17.s }, z30.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM2
// Single-precision variant: applies an in-place SVE TBL (table lookup) to each
// of z12..z17, using the 32-bit element indices currently held in z30.
// This macro does not load z30 itself: PERMUTE_DIR2 issues LOAD_TABLE2 first
// and then runs this macro only when `perm` is set.
// The instruction sequence is the same as in PERM1_A64FXf and PERM3_A64FXf;
// which permutation is applied depends solely on the table row previously
// loaded into z30.
// No operands are declared, so register contents are carried implicitly
// between separate asm statements. Only registers are touched, and the clobber
// list (p5, cc, z0-z31) carries no "memory" entry.
#define PERM2_A64FXf \
asm ( \
"tbl z12.s, { z12.s }, z30.s \n\t" \
"tbl z13.s, { z13.s }, z30.s \n\t" \
"tbl z14.s, { z14.s }, z30.s \n\t" \
"tbl z15.s, { z15.s }, z30.s \n\t" \
"tbl z16.s, { z16.s }, z30.s \n\t" \
"tbl z17.s, { z17.s }, z30.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERM3
// Single-precision variant: applies an in-place SVE TBL (table lookup) to each
// of z12..z17, using the 32-bit element indices currently held in z30.
// This macro does not load z30 itself: PERMUTE_DIR3 issues LOAD_TABLE3 first
// and then runs this macro only when `perm` is set.
// The instruction sequence is the same as in PERM1_A64FXf and PERM2_A64FXf;
// which permutation is applied depends solely on the table row previously
// loaded into z30.
// No operands are declared, so register contents are carried implicitly
// between separate asm statements. Only registers are touched, and the clobber
// list (p5, cc, z0-z31) carries no "memory" entry.
#define PERM3_A64FXf \
asm ( \
"tbl z12.s, { z12.s }, z30.s \n\t" \
"tbl z13.s, { z13.s }, z30.s \n\t" \
"tbl z14.s, { z14.s }, z30.s \n\t" \
"tbl z15.s, { z15.s }, z30.s \n\t" \
"tbl z16.s, { z16.s }, z30.s \n\t" \
"tbl z17.s, { z17.s }, z30.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// MULT_2SPIN
@ -299,23 +331,24 @@ asm ( \
{ \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
asm ( \
"ptrue p5.s \n\t" \
"ld1w { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
"ld1w { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
"ld1w { z26.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \
"ld1w { z27.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
"ld1w { z28.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
"ld1w { z29.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \
"fmov z18.s , 0 \n\t" \
"fmov z21.s , 0 \n\t" \
"fmov z19.s , 0 \n\t" \
"fmov z22.s , 0 \n\t" \
"fmov z20.s , 0 \n\t" \
"fmov z23.s , 0 \n\t" \
"movprfx z18.s, p5/m, z31.s \n\t" \
"fcmla z18.s, p5/m, z24.s, z12.s, 0 \n\t" \
"movprfx z21.s, p5/m, z31.s \n\t" \
"fcmla z21.s, p5/m, z24.s, z15.s, 0 \n\t" \
"movprfx z19.s, p5/m, z31.s \n\t" \
"fcmla z19.s, p5/m, z25.s, z12.s, 0 \n\t" \
"movprfx z22.s, p5/m, z31.s \n\t" \
"fcmla z22.s, p5/m, z25.s, z15.s, 0 \n\t" \
"movprfx z20.s, p5/m, z31.s \n\t" \
"fcmla z20.s, p5/m, z26.s, z12.s, 0 \n\t" \
"movprfx z23.s, p5/m, z31.s \n\t" \
"fcmla z23.s, p5/m, z26.s, z15.s, 0 \n\t" \
"fcmla z18.s, p5/m, z24.s, z12.s, 90 \n\t" \
"fcmla z21.s, p5/m, z24.s, z15.s, 90 \n\t" \
@ -359,7 +392,7 @@ asm ( \
#define XP_PROJ_A64FXf \
{ \
asm ( \
"ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"ptrue p5.s \n\t" \
"fcadd z12.s, p5/m, z12.s, z27.s, 90 \n\t" \
"fcadd z13.s, p5/m, z13.s, z28.s, 90 \n\t" \
"fcadd z14.s, p5/m, z14.s, z29.s, 90 \n\t" \
@ -367,18 +400,24 @@ asm ( \
"fcadd z16.s, p5/m, z16.s, z25.s, 90 \n\t" \
"fcadd z17.s, p5/m, z17.s, z26.s, 90 \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (3) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XP_RECON
#define XP_RECON_A64FXf \
asm ( \
"movprfx z6.s, p5/m, z31.s \n\t" \
"fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \
"movprfx z7.s, p5/m, z31.s \n\t" \
"fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \
"movprfx z8.s, p5/m, z31.s \n\t" \
"fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \
"movprfx z9.s, p5/m, z31.s \n\t" \
"fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \
"movprfx z10.s, p5/m, z31.s \n\t" \
"fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \
"movprfx z11.s, p5/m, z31.s \n\t" \
"fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \
"mov z0.s, p5/m, z18.s \n\t" \
"mov z1.s, p5/m, z19.s \n\t" \
@ -415,7 +454,7 @@ asm ( \
#define YP_PROJ_A64FXf \
{ \
asm ( \
"ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"ptrue p5.s \n\t" \
"fsub z12.s, p5/m, z12.s, z27.s \n\t" \
"fsub z13.s, p5/m, z13.s, z28.s \n\t" \
"fsub z14.s, p5/m, z14.s, z29.s \n\t" \
@ -423,15 +462,15 @@ asm ( \
"fadd z16.s, p5/m, z16.s, z25.s \n\t" \
"fadd z17.s, p5/m, z17.s, z26.s \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (2) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// ZP_PROJ
#define ZP_PROJ_A64FXf \
{ \
asm ( \
"ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"ptrue p5.s \n\t" \
"fcadd z12.s, p5/m, z12.s, z24.s, 90 \n\t" \
"fcadd z13.s, p5/m, z13.s, z25.s, 90 \n\t" \
"fcadd z14.s, p5/m, z14.s, z26.s, 90 \n\t" \
@ -439,15 +478,15 @@ asm ( \
"fcadd z16.s, p5/m, z16.s, z28.s, 270 \n\t" \
"fcadd z17.s, p5/m, z17.s, z29.s, 270 \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (1) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// TP_PROJ
#define TP_PROJ_A64FXf \
{ \
asm ( \
"ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"ptrue p5.s \n\t" \
"fadd z12.s, p5/m, z12.s, z24.s \n\t" \
"fadd z13.s, p5/m, z13.s, z25.s \n\t" \
"fadd z14.s, p5/m, z14.s, z26.s \n\t" \
@ -455,15 +494,15 @@ asm ( \
"fadd z16.s, p5/m, z16.s, z28.s \n\t" \
"fadd z17.s, p5/m, z17.s, z29.s \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (0) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XM_PROJ
#define XM_PROJ_A64FXf \
{ \
asm ( \
"ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"ptrue p5.s \n\t" \
"fcadd z12.s, p5/m, z12.s, z27.s, 270 \n\t" \
"fcadd z13.s, p5/m, z13.s, z28.s, 270 \n\t" \
"fcadd z14.s, p5/m, z14.s, z29.s, 270 \n\t" \
@ -471,18 +510,24 @@ asm ( \
"fcadd z16.s, p5/m, z16.s, z25.s, 270 \n\t" \
"fcadd z17.s, p5/m, z17.s, z26.s, 270 \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (3) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XM_RECON
#define XM_RECON_A64FXf \
asm ( \
"movprfx z6.s, p5/m, z31.s \n\t" \
"fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \
"movprfx z7.s, p5/m, z31.s \n\t" \
"fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \
"movprfx z8.s, p5/m, z31.s \n\t" \
"fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \
"movprfx z9.s, p5/m, z31.s \n\t" \
"fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \
"movprfx z10.s, p5/m, z31.s \n\t" \
"fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \
"movprfx z11.s, p5/m, z31.s \n\t" \
"fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \
"mov z0.s, p5/m, z18.s \n\t" \
"mov z1.s, p5/m, z19.s \n\t" \
@ -499,7 +544,7 @@ asm ( \
#define YM_PROJ_A64FXf \
{ \
asm ( \
"ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"ptrue p5.s \n\t" \
"fadd z12.s, p5/m, z12.s, z27.s \n\t" \
"fadd z13.s, p5/m, z13.s, z28.s \n\t" \
"fadd z14.s, p5/m, z14.s, z29.s \n\t" \
@ -507,15 +552,15 @@ asm ( \
"fsub z16.s, p5/m, z16.s, z25.s \n\t" \
"fsub z17.s, p5/m, z17.s, z26.s \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (2) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// ZM_PROJ
#define ZM_PROJ_A64FXf \
{ \
asm ( \
"ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"ptrue p5.s \n\t" \
"fcadd z12.s, p5/m, z12.s, z24.s, 270 \n\t" \
"fcadd z13.s, p5/m, z13.s, z25.s, 270 \n\t" \
"fcadd z14.s, p5/m, z14.s, z26.s, 270 \n\t" \
@ -523,15 +568,15 @@ asm ( \
"fcadd z16.s, p5/m, z16.s, z28.s, 90 \n\t" \
"fcadd z17.s, p5/m, z17.s, z29.s, 90 \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (1) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// TM_PROJ
#define TM_PROJ_A64FXf \
{ \
asm ( \
"ld1w { z30.s }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
"ptrue p5.s \n\t" \
"fsub z12.s, p5/m, z12.s, z24.s \n\t" \
"fsub z13.s, p5/m, z13.s, z25.s \n\t" \
"fsub z14.s, p5/m, z14.s, z26.s \n\t" \
@ -539,8 +584,8 @@ asm ( \
"fsub z16.s, p5/m, z16.s, z28.s \n\t" \
"fsub z17.s, p5/m, z17.s, z29.s \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (0) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XM_RECON_ACCUM
@ -552,12 +597,12 @@ asm ( \
"fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \
"fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \
"fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \
"mov z0.s, p5/m, z18.s \n\t" \
"mov z1.s, p5/m, z19.s \n\t" \
"mov z2.s, p5/m, z20.s \n\t" \
"mov z3.s, p5/m, z21.s \n\t" \
"mov z4.s, p5/m, z22.s \n\t" \
"mov z5.s, p5/m, z23.s \n\t" \
"fadd z0.s, p5/m, z0.s, z18.s \n\t" \
"fadd z1.s, p5/m, z1.s, z19.s \n\t" \
"fadd z2.s, p5/m, z2.s, z20.s \n\t" \
"fadd z3.s, p5/m, z3.s, z21.s \n\t" \
"fadd z4.s, p5/m, z4.s, z22.s \n\t" \
"fadd z5.s, p5/m, z5.s, z23.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \

View File

@ -2,7 +2,7 @@
Grid physics library, www.github.com/paboyle/Grid
Source file: XXX
Source file: Fujitsu_A64FX_intrin_double.h
Copyright (C) 2020
@ -40,9 +40,9 @@ Author: Nils Meyer <nils.meyer@ur.de>
// Generic macro names mapped onto their double-precision (_A64FXd) implementations.
// LOAD64 expands to nothing; SAVE_RESULT and MULT_2SPIN_DIR_PF drop their second argument.
#define LOAD64(A,B)
#define SAVE_RESULT(A,B) RESULT_A64FXd(A)
#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXd(A)
// NOTE(review): MAYBEPERM is defined twice in a row with different bodies.
// The first form runs A only when `perm` is non-zero; the second always runs A.
// These look like the before/after lines of one change -- confirm which should remain.
#define MAYBEPERM(A,perm) if (perm) { A ; }
#define MAYBEPERM(A,perm) { A ; }
#define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
// NOTE(review): ZERO_PSI is also defined twice: first forwarding to ZERO_PSI_A64FXd,
// then as an empty macro -- confirm which should remain.
#define ZERO_PSI ZERO_PSI_A64FXd
#define ZERO_PSI
// ADD_RESULT loads Chimu from `base`, runs ADD_RESULT_INTERNAL_A64FXd, then RESULT_A64FXd
// on the same `base`; `basep` is not used.
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
// *_PROJMEM: load Chimu from `base`, then run the matching *_PROJ_A64FXd macro.
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd
@ -62,10 +62,10 @@ Author: Nils Meyer <nils.meyer@ur.de>
// *_RECON_ACCUM: plain aliases for the _A64FXd implementations.
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd
// NOTE(review): PERMUTE_DIR0..3 are defined twice below.
// The first set forwards straight to PERMn_A64FXd.
// The second set always runs LOAD_TABLEn and applies PERMn_A64FXd only when `perm`
// (a name that must be in scope at the expansion site) is non-zero; its PERMUTE_DIR3
// expands to nothing. These look like before/after lines of one change -- confirm
// which set should remain.
#define PERMUTE_DIR0 PERM0_A64FXd
#define PERMUTE_DIR1 PERM1_A64FXd
#define PERMUTE_DIR2 PERM2_A64FXd
#define PERMUTE_DIR3 PERM3_A64FXd
#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXd; }
#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXd; }
#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXd; }
#define PERMUTE_DIR3
// DECLARATIONS
#define DECLARATIONS_A64FXd \
const uint64_t lut[4][8] = { \
@ -170,12 +170,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
// LOAD_CHI
// Fills Chi_00, Chi_01, Chi_02, Chi_10, Chi_11, Chi_12 with svld1 loads (predicate pg1)
// from `base` at offsets 0, 64, ..., 5 * 64.
// NOTE(review): every Chi register is assigned twice. The first six loads spell the
// offsets as 2 * 3 * 64 - k * 64 (k = 6..1), which evaluates to the same 0..5 * 64
// offsets used by the second six, so the two groups look like the before/after forms
// of one edit -- confirm that only one group is meant to stay.
// The unit of the offsets depends on the type of `base`, which is not visible here.
#define LOAD_CHI_A64FXd(base) \
{ \
Chi_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \
Chi_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \
Chi_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \
Chi_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \
Chi_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \
Chi_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \
Chi_00 = svld1(pg1, (float64_t*)(base + 0 * 64)); \
Chi_01 = svld1(pg1, (float64_t*)(base + 1 * 64)); \
Chi_02 = svld1(pg1, (float64_t*)(base + 2 * 64)); \
Chi_10 = svld1(pg1, (float64_t*)(base + 3 * 64)); \
Chi_11 = svld1(pg1, (float64_t*)(base + 4 * 64)); \
Chi_12 = svld1(pg1, (float64_t*)(base + 5 * 64)); \
}
// LOAD_CHIMU
#define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \
@ -227,9 +227,24 @@ Author: Nils Meyer <nils.meyer@ur.de>
Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \
Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \
}
// LOAD_TABLE0: load row 0 of lut (viewed as uint64_t) into table0 under predicate pg1
#define LOAD_TABLE0 table0 = svld1(pg1, (uint64_t*)&lut[0]);
// LOAD_TABLE1: load row 1 of lut (viewed as uint64_t) into table0 under predicate pg1
#define LOAD_TABLE1 table0 = svld1(pg1, (uint64_t*)&lut[1]);
// LOAD_TABLE2: load row 2 of lut (viewed as uint64_t) into table0 under predicate pg1
#define LOAD_TABLE2 table0 = svld1(pg1, (uint64_t*)&lut[2]);
// LOAD_TABLE3: load row 3 of lut (viewed as uint64_t) into table0 under predicate pg1
#define LOAD_TABLE3 table0 = svld1(pg1, (uint64_t*)&lut[3]);
// PERM0
#define PERM0_A64FXd \
table0 = svld1(pg1, (uint64_t*)&lut[0]); \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
@ -239,7 +254,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// PERM1
#define PERM1_A64FXd \
table0 = svld1(pg1, (uint64_t*)&lut[1]); \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
@ -249,7 +263,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// PERM2
#define PERM2_A64FXd \
table0 = svld1(pg1, (uint64_t*)&lut[2]); \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
@ -270,18 +283,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \
U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \
U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
UChi_00 = __svzero(UChi_00); \
UChi_10 = __svzero(UChi_10); \
UChi_01 = __svzero(UChi_01); \
UChi_11 = __svzero(UChi_11); \
UChi_02 = __svzero(UChi_02); \
UChi_12 = __svzero(UChi_12); \
UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 0); \
UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 0); \
UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 0); \
UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 0); \
UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 0); \
UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 0); \
UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \
UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \
UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \
UChi_11 = svcmla_x(pg1, zero0, U_10, Chi_10, 0); \
UChi_02 = svcmla_x(pg1, zero0, U_20, Chi_00, 0); \
UChi_12 = svcmla_x(pg1, zero0, U_20, Chi_10, 0); \
UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 90); \
UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 90); \
UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 90); \
@ -328,12 +335,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
}
// XP_RECON
#define XP_RECON_A64FXd \
result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \
result_21 = svcadd_x(pg1, result_21, UChi_11, 270); \
result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \
result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \
result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \
result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \
result_20 = svcadd_x(pg1, zero0, UChi_10, 270); \
result_21 = svcadd_x(pg1, zero0, UChi_11, 270); \
result_22 = svcadd_x(pg1, zero0, UChi_12, 270); \
result_30 = svcadd_x(pg1, zero0, UChi_00, 270); \
result_31 = svcadd_x(pg1, zero0, UChi_01, 270); \
result_32 = svcadd_x(pg1, zero0, UChi_02, 270); \
result_00 = UChi_00; \
result_01 = UChi_01; \
result_02 = UChi_02; \
@ -359,7 +366,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// YP_PROJ
#define YP_PROJ_A64FXd \
{ \
table0 = svld1(pg1, (uint64_t*)&lut[2]); \
Chi_00 = svsub_x(pg1, Chimu_00, Chimu_30); \
Chi_01 = svsub_x(pg1, Chimu_01, Chimu_31); \
Chi_02 = svsub_x(pg1, Chimu_02, Chimu_32); \
@ -370,7 +376,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// ZP_PROJ
#define ZP_PROJ_A64FXd \
{ \
table0 = svld1(pg1, (uint64_t*)&lut[1]); \
Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 90); \
Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 90); \
Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 90); \
@ -381,7 +386,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// TP_PROJ
#define TP_PROJ_A64FXd \
{ \
table0 = svld1(pg1, (uint64_t*)&lut[0]); \
Chi_00 = svadd_x(pg1, Chimu_00, Chimu_20); \
Chi_01 = svadd_x(pg1, Chimu_01, Chimu_21); \
Chi_02 = svadd_x(pg1, Chimu_02, Chimu_22); \
@ -401,12 +405,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
}
// XM_RECON
#define XM_RECON_A64FXd \
result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \
result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \
result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \
result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \
result_31 = svcadd_x(pg1, result_31, UChi_01, 90); \
result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \
result_20 = svcadd_x(pg1, zero0, UChi_10, 90); \
result_21 = svcadd_x(pg1, zero0, UChi_11, 90); \
result_22 = svcadd_x(pg1, zero0, UChi_12, 90); \
result_30 = svcadd_x(pg1, zero0, UChi_00, 90); \
result_31 = svcadd_x(pg1, zero0, UChi_01, 90); \
result_32 = svcadd_x(pg1, zero0, UChi_02, 90); \
result_00 = UChi_00; \
result_01 = UChi_01; \
result_02 = UChi_02; \
@ -417,7 +421,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// YM_PROJ
#define YM_PROJ_A64FXd \
{ \
table0 = svld1(pg1, (uint64_t*)&lut[2]); \
Chi_00 = svadd_x(pg1, Chimu_00, Chimu_30); \
Chi_01 = svadd_x(pg1, Chimu_01, Chimu_31); \
Chi_02 = svadd_x(pg1, Chimu_02, Chimu_32); \
@ -428,7 +431,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// ZM_PROJ
#define ZM_PROJ_A64FXd \
{ \
table0 = svld1(pg1, (uint64_t*)&lut[1]); \
Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 270); \
Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 270); \
Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 270); \
@ -439,7 +441,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// TM_PROJ
#define TM_PROJ_A64FXd \
{ \
table0 = svld1(pg1, (uint64_t*)&lut[0]); \
Chi_00 = svsub_x(pg1, Chimu_00, Chimu_20); \
Chi_01 = svsub_x(pg1, Chimu_01, Chimu_21); \
Chi_02 = svsub_x(pg1, Chimu_02, Chimu_22); \
@ -455,12 +456,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \
result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \
result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \
result_00 = UChi_00; \
result_01 = UChi_01; \
result_02 = UChi_02; \
result_10 = UChi_10; \
result_11 = UChi_11; \
result_12 = UChi_12;
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_12 = svadd_x(pg1, result_12, UChi_12);
// YP_RECON_ACCUM
#define YP_RECON_ACCUM_A64FXd \

View File

@ -2,7 +2,7 @@
Grid physics library, www.github.com/paboyle/Grid
Source file: XXX
Source file: Fujitsu_A64FX_intrin_single.h
Copyright (C) 2020
@ -40,9 +40,9 @@ Author: Nils Meyer <nils.meyer@ur.de>
// Generic macro names mapped onto their single-precision (_A64FXf) implementations.
// LOAD64 expands to nothing; SAVE_RESULT and MULT_2SPIN_DIR_PF drop their second argument.
#define LOAD64(A,B)
#define SAVE_RESULT(A,B) RESULT_A64FXf(A)
#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXf(A)
// NOTE(review): MAYBEPERM is defined twice in a row with different bodies.
// The first form runs A only when `perm` is non-zero; the second always runs A.
// These look like the before/after lines of one change -- confirm which should remain.
#define MAYBEPERM(A,perm) if (perm) { A ; }
#define MAYBEPERM(A,perm) { A ; }
#define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
// NOTE(review): ZERO_PSI is also defined twice: first forwarding to ZERO_PSI_A64FXf,
// then as an empty macro -- confirm which should remain.
#define ZERO_PSI ZERO_PSI_A64FXf
#define ZERO_PSI
// ADD_RESULT loads Chimu from `base`, runs ADD_RESULT_INTERNAL_A64FXf, then RESULT_A64FXf
// on the same `base`; `basep` is not used.
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
// *_PROJMEM: load Chimu from `base`, then run the matching *_PROJ_A64FXf macro.
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf
@ -62,10 +62,10 @@ Author: Nils Meyer <nils.meyer@ur.de>
// *_RECON_ACCUM: plain aliases for the _A64FXf implementations.
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf
// NOTE(review): PERMUTE_DIR0..3 are defined twice below.
// The first set forwards straight to PERMn_A64FXf.
// The second set always runs LOAD_TABLEn and applies PERMn_A64FXf only when `perm`
// (a name that must be in scope at the expansion site) is non-zero.
// These look like before/after lines of one change -- confirm which set should remain.
#define PERMUTE_DIR0 PERM0_A64FXf
#define PERMUTE_DIR1 PERM1_A64FXf
#define PERMUTE_DIR2 PERM2_A64FXf
#define PERMUTE_DIR3 PERM3_A64FXf
#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXf; }
#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXf; }
#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXf; }
#define PERMUTE_DIR3 LOAD_TABLE3; if (perm) { PERM3_A64FXf; }
// DECLARATIONS
#define DECLARATIONS_A64FXf \
const uint32_t lut[4][16] = { \
@ -170,12 +170,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
// LOAD_CHI
// Fills Chi_00, Chi_01, Chi_02, Chi_10, Chi_11, Chi_12 with svld1 loads (predicate pg1)
// from `base` at offsets 0, 64, ..., 5 * 64.
// NOTE(review): every Chi register is assigned twice. The first six loads spell the
// offsets as 2 * 3 * 64 - k * 64 (k = 6..1), which evaluates to the same 0..5 * 64
// offsets used by the second six, so the two groups look like the before/after forms
// of one edit -- confirm that only one group is meant to stay.
// The unit of the offsets depends on the type of `base`, which is not visible here.
#define LOAD_CHI_A64FXf(base) \
{ \
Chi_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \
Chi_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \
Chi_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \
Chi_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \
Chi_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \
Chi_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \
Chi_00 = svld1(pg1, (float32_t*)(base + 0 * 64)); \
Chi_01 = svld1(pg1, (float32_t*)(base + 1 * 64)); \
Chi_02 = svld1(pg1, (float32_t*)(base + 2 * 64)); \
Chi_10 = svld1(pg1, (float32_t*)(base + 3 * 64)); \
Chi_11 = svld1(pg1, (float32_t*)(base + 4 * 64)); \
Chi_12 = svld1(pg1, (float32_t*)(base + 5 * 64)); \
}
// LOAD_CHIMU
#define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \
@ -227,9 +227,24 @@ Author: Nils Meyer <nils.meyer@ur.de>
Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \
Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \
}
// LOAD_TABLE0: load row 0 of lut (viewed as uint32_t) into table0 under predicate pg1
#define LOAD_TABLE0 table0 = svld1(pg1, (uint32_t*)&lut[0]);
// LOAD_TABLE1: load row 1 of lut (viewed as uint32_t) into table0 under predicate pg1
#define LOAD_TABLE1 table0 = svld1(pg1, (uint32_t*)&lut[1]);
// LOAD_TABLE2: load row 2 of lut (viewed as uint32_t) into table0 under predicate pg1
#define LOAD_TABLE2 table0 = svld1(pg1, (uint32_t*)&lut[2]);
// LOAD_TABLE3: load row 3 of lut (viewed as uint32_t) into table0 under predicate pg1
#define LOAD_TABLE3 table0 = svld1(pg1, (uint32_t*)&lut[3]);
// PERM0
#define PERM0_A64FXf \
table0 = svld1(pg1, (float32_t*)&lut[0]); \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
@ -239,7 +254,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// PERM1
#define PERM1_A64FXf \
table0 = svld1(pg1, (float32_t*)&lut[1]); \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
@ -249,7 +263,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// PERM2
#define PERM2_A64FXf \
table0 = svld1(pg1, (float32_t*)&lut[2]); \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
@ -259,7 +272,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// PERM3
#define PERM3_A64FXf \
table0 = svld1(pg1, (float32_t*)&lut[3]); \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
@ -277,18 +289,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \
U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \
U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
UChi_00 = __svzero(UChi_00); \
UChi_10 = __svzero(UChi_10); \
UChi_01 = __svzero(UChi_01); \
UChi_11 = __svzero(UChi_11); \
UChi_02 = __svzero(UChi_02); \
UChi_12 = __svzero(UChi_12); \
UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 0); \
UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 0); \
UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 0); \
UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 0); \
UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 0); \
UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 0); \
UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \
UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \
UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \
UChi_11 = svcmla_x(pg1, zero0, U_10, Chi_10, 0); \
UChi_02 = svcmla_x(pg1, zero0, U_20, Chi_00, 0); \
UChi_12 = svcmla_x(pg1, zero0, U_20, Chi_10, 0); \
UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 90); \
UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 90); \
UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 90); \
@ -326,7 +332,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// XP_PROJ
#define XP_PROJ_A64FXf \
{ \
table0 = svld1(pg1, (float32_t*)&lut[3]); \
Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 90); \
Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 90); \
Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 90); \
@ -336,12 +341,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
}
// XP_RECON
#define XP_RECON_A64FXf \
result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \
result_21 = svcadd_x(pg1, result_21, UChi_11, 270); \
result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \
result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \
result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \
result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \
result_20 = svcadd_x(pg1, zero0, UChi_10, 270); \
result_21 = svcadd_x(pg1, zero0, UChi_11, 270); \
result_22 = svcadd_x(pg1, zero0, UChi_12, 270); \
result_30 = svcadd_x(pg1, zero0, UChi_00, 270); \
result_31 = svcadd_x(pg1, zero0, UChi_01, 270); \
result_32 = svcadd_x(pg1, zero0, UChi_02, 270); \
result_00 = UChi_00; \
result_01 = UChi_01; \
result_02 = UChi_02; \
@ -367,7 +372,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// YP_PROJ
#define YP_PROJ_A64FXf \
{ \
table0 = svld1(pg1, (float32_t*)&lut[2]); \
Chi_00 = svsub_x(pg1, Chimu_00, Chimu_30); \
Chi_01 = svsub_x(pg1, Chimu_01, Chimu_31); \
Chi_02 = svsub_x(pg1, Chimu_02, Chimu_32); \
@ -378,7 +382,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// ZP_PROJ
#define ZP_PROJ_A64FXf \
{ \
table0 = svld1(pg1, (float32_t*)&lut[1]); \
Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 90); \
Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 90); \
Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 90); \
@ -389,7 +392,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// TP_PROJ
#define TP_PROJ_A64FXf \
{ \
table0 = svld1(pg1, (float32_t*)&lut[0]); \
Chi_00 = svadd_x(pg1, Chimu_00, Chimu_20); \
Chi_01 = svadd_x(pg1, Chimu_01, Chimu_21); \
Chi_02 = svadd_x(pg1, Chimu_02, Chimu_22); \
@ -400,7 +402,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// XM_PROJ
#define XM_PROJ_A64FXf \
{ \
table0 = svld1(pg1, (float32_t*)&lut[3]); \
Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 270); \
Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 270); \
Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 270); \
@ -410,12 +411,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
}
// XM_RECON
#define XM_RECON_A64FXf \
result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \
result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \
result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \
result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \
result_31 = svcadd_x(pg1, result_31, UChi_01, 90); \
result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \
result_20 = svcadd_x(pg1, zero0, UChi_10, 90); \
result_21 = svcadd_x(pg1, zero0, UChi_11, 90); \
result_22 = svcadd_x(pg1, zero0, UChi_12, 90); \
result_30 = svcadd_x(pg1, zero0, UChi_00, 90); \
result_31 = svcadd_x(pg1, zero0, UChi_01, 90); \
result_32 = svcadd_x(pg1, zero0, UChi_02, 90); \
result_00 = UChi_00; \
result_01 = UChi_01; \
result_02 = UChi_02; \
@ -426,7 +427,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// YM_PROJ
#define YM_PROJ_A64FXf \
{ \
table0 = svld1(pg1, (float32_t*)&lut[2]); \
Chi_00 = svadd_x(pg1, Chimu_00, Chimu_30); \
Chi_01 = svadd_x(pg1, Chimu_01, Chimu_31); \
Chi_02 = svadd_x(pg1, Chimu_02, Chimu_32); \
@ -437,7 +437,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// ZM_PROJ
#define ZM_PROJ_A64FXf \
{ \
table0 = svld1(pg1, (float32_t*)&lut[1]); \
Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 270); \
Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 270); \
Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 270); \
@ -448,7 +447,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// TM_PROJ
#define TM_PROJ_A64FXf \
{ \
table0 = svld1(pg1, (float32_t*)&lut[0]); \
Chi_00 = svsub_x(pg1, Chimu_00, Chimu_20); \
Chi_01 = svsub_x(pg1, Chimu_01, Chimu_21); \
Chi_02 = svsub_x(pg1, Chimu_02, Chimu_22); \
@ -464,12 +462,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \
result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \
result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \
result_00 = UChi_00; \
result_01 = UChi_01; \
result_02 = UChi_02; \
result_10 = UChi_10; \
result_11 = UChi_11; \
result_12 = UChi_12;
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_12 = svadd_x(pg1, result_12, UChi_12);
// YP_RECON_ACCUM
#define YP_RECON_ACCUM_A64FXf \