// Review note (text before the first "diff --git" header is ignored by patch tools):
// ADD_RESULT(base,basep) loads with LOAD_CHIMU_A64FX{d,f}(base) rather than
// LOAD_CHI_A64FX{d,f}(base). ADD_RESULT_INTERNAL_* adds twelve source operands
// (Chimu_00..Chimu_32 in the intrinsics headers; z12-z17 and z24-z29 in the asm
// headers) into result_00..result_32 / z0-z11, and LOAD_CHIMU_* is the loader the
// *_PROJMEM macros in these same headers use.
// NOTE(review): assumes LOAD_CHI_* fills only the six-component Chi_* set; the
// loader definitions are outside this patch - TODO confirm before merging.
// The basep argument is accepted by ADD_RESULT but does not appear in its expansion.
diff --git a/Grid/Fujitsu_A64FX_asm_double.h b/Grid/Fujitsu_A64FX_asm_double.h
index 48d07297..4af5a91f 100644
--- a/Grid/Fujitsu_A64FX_asm_double.h
+++ b/Grid/Fujitsu_A64FX_asm_double.h
@@ -43,7 +43,7 @@ Author: Nils Meyer
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
 #define ZERO_PSI ZERO_PSI_A64FXd
-#define ADD_RESULT(A,B)
+#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
 #define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd
 #define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd
 #define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd
@@ -691,3 +691,23 @@ asm ( \
     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
 );
 
+// ADD_RESULT_INTERNAL: z0..z11 += (z12..z17, z24..z29), double lanes, merging under predicate p5
+#define ADD_RESULT_INTERNAL_A64FXd \
+asm ( \
+    "fadd z0.d, p5/m, z0.d, z12.d \n\t" \
+    "fadd z1.d, p5/m, z1.d, z13.d \n\t" \
+    "fadd z2.d, p5/m, z2.d, z14.d \n\t" \
+    "fadd z3.d, p5/m, z3.d, z15.d \n\t" \
+    "fadd z4.d, p5/m, z4.d, z16.d \n\t" \
+    "fadd z5.d, p5/m, z5.d, z17.d \n\t" \
+    "fadd z6.d, p5/m, z6.d, z24.d \n\t" \
+    "fadd z7.d, p5/m, z7.d, z25.d \n\t" \
+    "fadd z8.d, p5/m, z8.d, z26.d \n\t" \
+    "fadd z9.d, p5/m, z9.d, z27.d \n\t" \
+    "fadd z10.d, p5/m, z10.d, z28.d \n\t" \
+    "fadd z11.d, p5/m, z11.d, z29.d \n\t" \
+    : \
+    : \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+);
+
diff --git a/Grid/Fujitsu_A64FX_asm_single.h b/Grid/Fujitsu_A64FX_asm_single.h
index 588fc9c4..08d2fc53 100644
--- a/Grid/Fujitsu_A64FX_asm_single.h
+++ b/Grid/Fujitsu_A64FX_asm_single.h
@@ -43,7 +43,7 @@ Author: Nils Meyer
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
 #define ZERO_PSI ZERO_PSI_A64FXf
-#define ADD_RESULT(A,B)
+#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
 #define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf
 #define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf
 #define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf
@@ -705,3 +705,23 @@ asm ( \
     : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
 );
 
+// ADD_RESULT_INTERNAL: z0..z11 += (z12..z17, z24..z29), float lanes, merging under predicate p5
+#define ADD_RESULT_INTERNAL_A64FXf \
+asm ( \
+    "fadd z0.f, p5/m, z0.f, z12.f \n\t" \
+    "fadd z1.f, p5/m, z1.f, z13.f \n\t" \
+    "fadd z2.f, p5/m, z2.f, z14.f \n\t" \
+    "fadd z3.f, p5/m, z3.f, z15.f \n\t" \
+    "fadd z4.f, p5/m, z4.f, z16.f \n\t" \
+    "fadd z5.f, p5/m, z5.f, z17.f \n\t" \
+    "fadd z6.f, p5/m, z6.f, z24.f \n\t" \
+    "fadd z7.f, p5/m, z7.f, z25.f \n\t" \
+    "fadd z8.f, p5/m, z8.f, z26.f \n\t" \
+    "fadd z9.f, p5/m, z9.f, z27.f \n\t" \
+    "fadd z10.f, p5/m, z10.f, z28.f \n\t" \
+    "fadd z11.f, p5/m, z11.f, z29.f \n\t" \
+    : \
+    : \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+);
+
diff --git a/Grid/Fujitsu_A64FX_intrin_double.h b/Grid/Fujitsu_A64FX_intrin_double.h
index e7469ffb..a0167ca9 100644
--- a/Grid/Fujitsu_A64FX_intrin_double.h
+++ b/Grid/Fujitsu_A64FX_intrin_double.h
@@ -43,7 +43,7 @@ Author: Nils Meyer
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
 #define ZERO_PSI ZERO_PSI_A64FXd
-#define ADD_RESULT(A,B)
+#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXd(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
 #define XP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); XP_PROJ_A64FXd
 #define YP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); YP_PROJ_A64FXd
 #define ZP_PROJMEM(base) LOAD_CHIMU_A64FXd(base); ZP_PROJ_A64FXd
@@ -567,3 +567,18 @@ Author: Nils Meyer
     result_31 = __svzero(result_31); \
     result_32 = __svzero(result_32);
 
+// ADD_RESULT_INTERNAL: result_XY = svadd_x(pg1, result_XY, Chimu_XY) for all twelve XY components
+#define ADD_RESULT_INTERNAL_A64FXd \
+    result_00 = svadd_x(pg1, result_00, Chimu_00); \
+    result_01 = svadd_x(pg1, result_01, Chimu_01); \
+    result_02 = svadd_x(pg1, result_02, Chimu_02); \
+    result_10 = svadd_x(pg1, result_10, Chimu_10); \
+    result_11 = svadd_x(pg1, result_11, Chimu_11); \
+    result_12 = svadd_x(pg1, result_12, Chimu_12); \
+    result_20 = svadd_x(pg1, result_20, Chimu_20); \
+    result_21 = svadd_x(pg1, result_21, Chimu_21); \
+    result_22 = svadd_x(pg1, result_22, Chimu_22); \
+    result_30 = svadd_x(pg1, result_30, Chimu_30); \
+    result_31 = svadd_x(pg1, result_31, Chimu_31); \
+    result_32 = svadd_x(pg1, result_32, Chimu_32);
+
diff --git a/Grid/Fujitsu_A64FX_intrin_single.h b/Grid/Fujitsu_A64FX_intrin_single.h
index e44374f0..6849506b 100644
--- a/Grid/Fujitsu_A64FX_intrin_single.h
+++ b/Grid/Fujitsu_A64FX_intrin_single.h
@@ -43,7 +43,7 @@ Author: Nils Meyer
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
 #define ZERO_PSI ZERO_PSI_A64FXf
-#define ADD_RESULT(A,B)
+#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
 #define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf
 #define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf
 #define ZP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); ZP_PROJ_A64FXf
@@ -576,3 +576,18 @@ Author: Nils Meyer
     result_31 = __svzero(result_31); \
     result_32 = __svzero(result_32);
 
+// ADD_RESULT_INTERNAL: result_XY = svadd_x(pg1, result_XY, Chimu_XY) for all twelve XY components
+#define ADD_RESULT_INTERNAL_A64FXf \
+    result_00 = svadd_x(pg1, result_00, Chimu_00); \
+    result_01 = svadd_x(pg1, result_01, Chimu_01); \
+    result_02 = svadd_x(pg1, result_02, Chimu_02); \
+    result_10 = svadd_x(pg1, result_10, Chimu_10); \
+    result_11 = svadd_x(pg1, result_11, Chimu_11); \
+    result_12 = svadd_x(pg1, result_12, Chimu_12); \
+    result_20 = svadd_x(pg1, result_20, Chimu_20); \
+    result_21 = svadd_x(pg1, result_21, Chimu_21); \
+    result_22 = svadd_x(pg1, result_22, Chimu_22); \
+    result_30 = svadd_x(pg1, result_30, Chimu_30); \
+    result_31 = svadd_x(pg1, result_31, Chimu_31); \
+    result_32 = svadd_x(pg1, result_32, Chimu_32);
+