1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-17 15:27:06 +01:00

enable dslash asm using -DA64FXASM; additionally use -DDSLASHINTRIN for the intrinsics implementation

This commit is contained in:
nils meyer
2020-04-11 04:55:01 +02:00
parent 974586bedc
commit 113f277b6a
6 changed files with 1020 additions and 346 deletions

View File

@ -443,7 +443,6 @@ asm ( \
#define YP_PROJ_A64FXd \
{ \
asm ( \
"ptrue p5.d \n\t" \
"fsub z12.d, p5/m, z12.d, z27.d \n\t" \
"fsub z13.d, p5/m, z13.d, z28.d \n\t" \
"fsub z14.d, p5/m, z14.d, z29.d \n\t" \
@ -459,7 +458,6 @@ asm ( \
#define ZP_PROJ_A64FXd \
{ \
asm ( \
"ptrue p5.d \n\t" \
"fcadd z12.d, p5/m, z12.d, z24.d, 90 \n\t" \
"fcadd z13.d, p5/m, z13.d, z25.d, 90 \n\t" \
"fcadd z14.d, p5/m, z14.d, z26.d, 90 \n\t" \
@ -475,7 +473,6 @@ asm ( \
#define TP_PROJ_A64FXd \
{ \
asm ( \
"ptrue p5.d \n\t" \
"fadd z12.d, p5/m, z12.d, z24.d \n\t" \
"fadd z13.d, p5/m, z13.d, z25.d \n\t" \
"fadd z14.d, p5/m, z14.d, z26.d \n\t" \
@ -491,7 +488,6 @@ asm ( \
#define XM_PROJ_A64FXd \
{ \
asm ( \
"ptrue p5.d \n\t" \
"fcadd z12.d, p5/m, z12.d, z27.d, 270 \n\t" \
"fcadd z13.d, p5/m, z13.d, z28.d, 270 \n\t" \
"fcadd z14.d, p5/m, z14.d, z29.d, 270 \n\t" \
@ -533,7 +529,6 @@ asm ( \
#define YM_PROJ_A64FXd \
{ \
asm ( \
"ptrue p5.d \n\t" \
"fadd z12.d, p5/m, z12.d, z27.d \n\t" \
"fadd z13.d, p5/m, z13.d, z28.d \n\t" \
"fadd z14.d, p5/m, z14.d, z29.d \n\t" \
@ -549,7 +544,6 @@ asm ( \
#define ZM_PROJ_A64FXd \
{ \
asm ( \
"ptrue p5.d \n\t" \
"fcadd z12.d, p5/m, z12.d, z24.d, 270 \n\t" \
"fcadd z13.d, p5/m, z13.d, z25.d, 270 \n\t" \
"fcadd z14.d, p5/m, z14.d, z26.d, 270 \n\t" \
@ -680,7 +674,6 @@ asm ( \
// TP_RECON_ACCUM
#define TP_RECON_ACCUM_A64FXd \
asm ( \
"ptrue p5.d \n\t" \
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
"fadd z6.d, p5/m, z6.d, z18.d \n\t" \
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \

View File

@ -454,7 +454,6 @@ asm ( \
#define YP_PROJ_A64FXf \
{ \
asm ( \
"ptrue p5.s \n\t" \
"fsub z12.s, p5/m, z12.s, z27.s \n\t" \
"fsub z13.s, p5/m, z13.s, z28.s \n\t" \
"fsub z14.s, p5/m, z14.s, z29.s \n\t" \
@ -470,7 +469,6 @@ asm ( \
#define ZP_PROJ_A64FXf \
{ \
asm ( \
"ptrue p5.s \n\t" \
"fcadd z12.s, p5/m, z12.s, z24.s, 90 \n\t" \
"fcadd z13.s, p5/m, z13.s, z25.s, 90 \n\t" \
"fcadd z14.s, p5/m, z14.s, z26.s, 90 \n\t" \
@ -486,7 +484,6 @@ asm ( \
#define TP_PROJ_A64FXf \
{ \
asm ( \
"ptrue p5.s \n\t" \
"fadd z12.s, p5/m, z12.s, z24.s \n\t" \
"fadd z13.s, p5/m, z13.s, z25.s \n\t" \
"fadd z14.s, p5/m, z14.s, z26.s \n\t" \
@ -502,7 +499,6 @@ asm ( \
#define XM_PROJ_A64FXf \
{ \
asm ( \
"ptrue p5.s \n\t" \
"fcadd z12.s, p5/m, z12.s, z27.s, 270 \n\t" \
"fcadd z13.s, p5/m, z13.s, z28.s, 270 \n\t" \
"fcadd z14.s, p5/m, z14.s, z29.s, 270 \n\t" \
@ -544,7 +540,6 @@ asm ( \
#define YM_PROJ_A64FXf \
{ \
asm ( \
"ptrue p5.s \n\t" \
"fadd z12.s, p5/m, z12.s, z27.s \n\t" \
"fadd z13.s, p5/m, z13.s, z28.s \n\t" \
"fadd z14.s, p5/m, z14.s, z29.s \n\t" \
@ -560,7 +555,6 @@ asm ( \
#define ZM_PROJ_A64FXf \
{ \
asm ( \
"ptrue p5.s \n\t" \
"fcadd z12.s, p5/m, z12.s, z24.s, 270 \n\t" \
"fcadd z13.s, p5/m, z13.s, z25.s, 270 \n\t" \
"fcadd z14.s, p5/m, z14.s, z26.s, 270 \n\t" \
@ -691,7 +685,6 @@ asm ( \
// TP_RECON_ACCUM
#define TP_RECON_ACCUM_A64FXf \
asm ( \
"ptrue p5.s \n\t" \
"fadd z0.s, p5/m, z0.s, z18.s \n\t" \
"fadd z6.s, p5/m, z6.s, z18.s \n\t" \
"fadd z1.s, p5/m, z1.s, z19.s \n\t" \

View File

@ -69,3 +69,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
#undef PERMUTE_DIR1
#undef PERMUTE_DIR2
#undef PERMUTE_DIR3
#undef LOAD_TABLE0
#undef LOAD_TABLE1
#undef LOAD_TABLE2
#undef LOAD_TABLE3