1
0
Mirror of https://github.com/paboyle/Grid.git — synced 2025-06-17 15:27:06 +01:00

Dslash finally works; cleaned up; uses MOVPRFX in assembly

This commit is contained in:
nils meyer
2020-04-10 22:26:40 +02:00
parent 160f78c1e4
commit 974586bedc
5 changed files with 344 additions and 252 deletions

View File

@ -2,7 +2,7 @@
Grid physics library, www.github.com/paboyle/Grid
Source file: XXX
Source file: Fujitsu_A64FX_intrin_single.h
Copyright (C) 2020
@ -40,9 +40,9 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define LOAD64(A,B)
#define SAVE_RESULT(A,B) RESULT_A64FXf(A)
#define MULT_2SPIN_DIR_PF(A,B) MULT_2SPIN_A64FXf(A)
#define MAYBEPERM(A,perm) if (perm) { A ; }
#define MAYBEPERM(A,perm) { A ; }
#define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
#define ZERO_PSI ZERO_PSI_A64FXf
#define ZERO_PSI
#define ADD_RESULT(base,basep) LOAD_CHIMU_A64FXf(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
#define XP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); XP_PROJ_A64FXf
#define YP_PROJMEM(base) LOAD_CHIMU_A64FXf(base); YP_PROJ_A64FXf
@ -62,10 +62,10 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf
#define PERMUTE_DIR0 PERM0_A64FXf
#define PERMUTE_DIR1 PERM1_A64FXf
#define PERMUTE_DIR2 PERM2_A64FXf
#define PERMUTE_DIR3 PERM3_A64FXf
// PERMUTE_DIRn: unconditionally load the direction-n permutation index table,
// then apply the lane permutation only when `perm` is non-zero. The if(perm)
// guard lives here (it was previously in MAYBEPERM); `perm` is a free
// identifier resolved at the macro expansion site.
#define PERMUTE_DIR0 LOAD_TABLE0; if (perm) { PERM0_A64FXf; }
#define PERMUTE_DIR1 LOAD_TABLE1; if (perm) { PERM1_A64FXf; }
#define PERMUTE_DIR2 LOAD_TABLE2; if (perm) { PERM2_A64FXf; }
#define PERMUTE_DIR3 LOAD_TABLE3; if (perm) { PERM3_A64FXf; }
// DECLARATIONS
#define DECLARATIONS_A64FXf \
const uint32_t lut[4][16] = { \
@ -170,12 +170,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
// LOAD_CHI
#define LOAD_CHI_A64FXf(base) \
{ \
Chi_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \
Chi_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \
Chi_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \
Chi_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \
Chi_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \
Chi_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \
Chi_00 = svld1(pg1, (float32_t*)(base + 0 * 64)); \
Chi_01 = svld1(pg1, (float32_t*)(base + 1 * 64)); \
Chi_02 = svld1(pg1, (float32_t*)(base + 2 * 64)); \
Chi_10 = svld1(pg1, (float32_t*)(base + 3 * 64)); \
Chi_11 = svld1(pg1, (float32_t*)(base + 4 * 64)); \
Chi_12 = svld1(pg1, (float32_t*)(base + 5 * 64)); \
}
// LOAD_CHIMU
#define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \
@ -227,9 +227,24 @@ Author: Nils Meyer <nils.meyer@ur.de>
Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \
Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \
}
// LOAD_TABLE0: load permutation-index row lut[0] into the table0 vector
#define LOAD_TABLE0 \
table0 = svld1(pg1, (uint32_t*)&lut[0]);
// LOAD_TABLE1: load lut[1] — note it also writes table0: the PERM*_A64FXf
// macros all permute via table0, so only the most recent table load is live
#define LOAD_TABLE1 \
table0 = svld1(pg1, (uint32_t*)&lut[1]);
// LOAD_TABLE2: load lut[2] into table0
#define LOAD_TABLE2 \
table0 = svld1(pg1, (uint32_t*)&lut[2]);
// LOAD_TABLE3: load lut[3] into table0
#define LOAD_TABLE3 \
table0 = svld1(pg1, (uint32_t*)&lut[3]);
// PERM0
#define PERM0_A64FXf \
table0 = svld1(pg1, (float32_t*)&lut[0]); \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
@ -239,7 +254,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// PERM1
#define PERM1_A64FXf \
table0 = svld1(pg1, (float32_t*)&lut[1]); \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
@ -249,7 +263,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// PERM2
#define PERM2_A64FXf \
table0 = svld1(pg1, (float32_t*)&lut[2]); \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
@ -259,7 +272,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// PERM3
#define PERM3_A64FXf \
table0 = svld1(pg1, (float32_t*)&lut[3]); \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
@ -277,18 +289,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \
U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \
U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
UChi_00 = __svzero(UChi_00); \
UChi_10 = __svzero(UChi_10); \
UChi_01 = __svzero(UChi_01); \
UChi_11 = __svzero(UChi_11); \
UChi_02 = __svzero(UChi_02); \
UChi_12 = __svzero(UChi_12); \
UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 0); \
UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 0); \
UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 0); \
UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 0); \
UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 0); \
UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 0); \
UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \
UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \
UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \
UChi_11 = svcmla_x(pg1, zero0, U_10, Chi_10, 0); \
UChi_02 = svcmla_x(pg1, zero0, U_20, Chi_00, 0); \
UChi_12 = svcmla_x(pg1, zero0, U_20, Chi_10, 0); \
UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 90); \
UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 90); \
UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 90); \
@ -326,7 +332,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// XP_PROJ
#define XP_PROJ_A64FXf \
{ \
table0 = svld1(pg1, (float32_t*)&lut[3]); \
Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 90); \
Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 90); \
Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 90); \
@ -336,12 +341,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
}
// XP_RECON
#define XP_RECON_A64FXf \
result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \
result_21 = svcadd_x(pg1, result_21, UChi_11, 270); \
result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \
result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \
result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \
result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \
result_20 = svcadd_x(pg1, zero0, UChi_10, 270); \
result_21 = svcadd_x(pg1, zero0, UChi_11, 270); \
result_22 = svcadd_x(pg1, zero0, UChi_12, 270); \
result_30 = svcadd_x(pg1, zero0, UChi_00, 270); \
result_31 = svcadd_x(pg1, zero0, UChi_01, 270); \
result_32 = svcadd_x(pg1, zero0, UChi_02, 270); \
result_00 = UChi_00; \
result_01 = UChi_01; \
result_02 = UChi_02; \
@ -367,7 +372,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// YP_PROJ
#define YP_PROJ_A64FXf \
{ \
table0 = svld1(pg1, (float32_t*)&lut[2]); \
Chi_00 = svsub_x(pg1, Chimu_00, Chimu_30); \
Chi_01 = svsub_x(pg1, Chimu_01, Chimu_31); \
Chi_02 = svsub_x(pg1, Chimu_02, Chimu_32); \
@ -378,7 +382,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// ZP_PROJ
#define ZP_PROJ_A64FXf \
{ \
table0 = svld1(pg1, (float32_t*)&lut[1]); \
Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 90); \
Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 90); \
Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 90); \
@ -389,7 +392,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// TP_PROJ
#define TP_PROJ_A64FXf \
{ \
table0 = svld1(pg1, (float32_t*)&lut[0]); \
Chi_00 = svadd_x(pg1, Chimu_00, Chimu_20); \
Chi_01 = svadd_x(pg1, Chimu_01, Chimu_21); \
Chi_02 = svadd_x(pg1, Chimu_02, Chimu_22); \
@ -400,7 +402,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// XM_PROJ
#define XM_PROJ_A64FXf \
{ \
table0 = svld1(pg1, (float32_t*)&lut[3]); \
Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 270); \
Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 270); \
Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 270); \
@ -410,12 +411,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
}
// XM_RECON
#define XM_RECON_A64FXf \
result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \
result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \
result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \
result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \
result_31 = svcadd_x(pg1, result_31, UChi_01, 90); \
result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \
result_20 = svcadd_x(pg1, zero0, UChi_10, 90); \
result_21 = svcadd_x(pg1, zero0, UChi_11, 90); \
result_22 = svcadd_x(pg1, zero0, UChi_12, 90); \
result_30 = svcadd_x(pg1, zero0, UChi_00, 90); \
result_31 = svcadd_x(pg1, zero0, UChi_01, 90); \
result_32 = svcadd_x(pg1, zero0, UChi_02, 90); \
result_00 = UChi_00; \
result_01 = UChi_01; \
result_02 = UChi_02; \
@ -426,7 +427,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// YM_PROJ
#define YM_PROJ_A64FXf \
{ \
table0 = svld1(pg1, (float32_t*)&lut[2]); \
Chi_00 = svadd_x(pg1, Chimu_00, Chimu_30); \
Chi_01 = svadd_x(pg1, Chimu_01, Chimu_31); \
Chi_02 = svadd_x(pg1, Chimu_02, Chimu_32); \
@ -437,7 +437,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// ZM_PROJ
#define ZM_PROJ_A64FXf \
{ \
table0 = svld1(pg1, (float32_t*)&lut[1]); \
Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 270); \
Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 270); \
Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 270); \
@ -448,7 +447,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
// TM_PROJ
#define TM_PROJ_A64FXf \
{ \
table0 = svld1(pg1, (float32_t*)&lut[0]); \
Chi_00 = svsub_x(pg1, Chimu_00, Chimu_20); \
Chi_01 = svsub_x(pg1, Chimu_01, Chimu_21); \
Chi_02 = svsub_x(pg1, Chimu_02, Chimu_22); \
@ -464,12 +462,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \
result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \
result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \
result_00 = UChi_00; \
result_01 = UChi_01; \
result_02 = UChi_02; \
result_10 = UChi_10; \
result_11 = UChi_11; \
result_12 = UChi_12;
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_12 = svadd_x(pg1, result_12, UChi_12);
// YP_RECON_ACCUM
#define YP_RECON_ACCUM_A64FXf \