From 0883d6a7cef58803ecb621ba0287bdc398d6340c Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 7 Feb 2017 00:59:32 -0500 Subject: [PATCH] Overlap comms compute support; make reg naming consistent with bgq aasm --- lib/simd/Intel512wilson.h | 557 ++++++++++++++++++++------------------ 1 file changed, 293 insertions(+), 264 deletions(-) diff --git a/lib/simd/Intel512wilson.h b/lib/simd/Intel512wilson.h index 3ca0b648..64142a2e 100644 --- a/lib/simd/Intel512wilson.h +++ b/lib/simd/Intel512wilson.h @@ -31,21 +31,21 @@ Author: paboyle ////////////////////////////////////////////////////////////////////////////////////////// // Register allocations for Wilson Kernel are precision indept ////////////////////////////////////////////////////////////////////////////////////////// -#define result_00 %zmm0 -#define result_01 %zmm1 -#define result_02 %zmm2 +#define psi_00 %zmm0 +#define psi_01 %zmm1 +#define psi_02 %zmm2 -#define result_10 %zmm3 -#define result_11 %zmm4 -#define result_12 %zmm5 +#define psi_10 %zmm3 +#define psi_11 %zmm4 +#define psi_12 %zmm5 -#define result_20 %zmm6 -#define result_21 %zmm7 -#define result_22 %zmm8 +#define psi_20 %zmm6 +#define psi_21 %zmm7 +#define psi_22 %zmm8 -#define result_30 %zmm9 -#define result_31 %zmm10 -#define result_32 %zmm11 +#define psi_30 %zmm9 +#define psi_31 %zmm10 +#define psi_32 %zmm11 #define Chi_00 %zmm12 #define Chi_01 %zmm13 @@ -102,32 +102,46 @@ Author: paboyle #define UNLOCK_GAUGE(dir) // const SiteSpinor * ptr = & in._odata[offset]; -#define LOAD_CHIMU(PTR) LOAD_CHIMUi(PTR) +#define LOAD_CHIMU(PTR) LOAD64(%r8,PTR) __asm__ ( LOAD_CHIMUi ); #define LOAD_CHI(PTR) LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi ); #define SAVE_UCHI(PTR) SAVE_UCHIi(PTR) #define SAVE_CHI(PTR) SAVE_CHIi(PTR) #define SAVE_RESULT(PT,R) SAVE_RESULTi(PT,R) +#define ADD_RESULT(PT,R) ADD_RESULTi(PT,R) -#define LOAD_CHIMUi \ - LOAD_CHIMU01i \ - LOAD_CHIMU23i ); +#define ZERO_PSI \ + asm( VZERO(psi_00) \ + VZERO(psi_01) \ + VZERO(psi_02) \ + VZERO(psi_10) \ + VZERO(psi_11) \ + VZERO(psi_12) \ + VZERO(psi_20) \ + VZERO(psi_21) \ + VZERO(psi_22) \ + VZERO(psi_30) \ + VZERO(psi_31) \ + VZERO(psi_32)); +#define LOAD_CHIMUi \ + LOAD_CHIMU01i \ + LOAD_CHIMU23i -#define LOAD_CHIMU01i\ - VLOAD(0,%r8,Chimu_00) \ - VLOAD(1,%r8,Chimu_01) \ - VLOAD(2,%r8,Chimu_02) \ - VLOAD(3,%r8,Chimu_10) \ - VLOAD(4,%r8,Chimu_11) \ - VLOAD(5,%r8,Chimu_12) +#define LOAD_CHIMU01i \ + VLOAD(0,%r8,Chimu_00) \ + VLOAD(1,%r8,Chimu_01) \ + VLOAD(2,%r8,Chimu_02) \ + VLOAD(3,%r8,Chimu_10) \ + VLOAD(4,%r8,Chimu_11) \ + VLOAD(5,%r8,Chimu_12) -#define LOAD_CHIMU23i\ - VLOAD(6,%r8,Chimu_20) \ - VLOAD(7,%r8,Chimu_21) \ - VLOAD(8,%r8,Chimu_22) \ - VLOAD(9,%r8,Chimu_30) \ - VLOAD(10,%r8,Chimu_31) \ - VLOAD(11,%r8,Chimu_32) +#define LOAD_CHIMU23i \ + VLOAD(6,%r8,Chimu_20) \ + VLOAD(7,%r8,Chimu_21) \ + VLOAD(8,%r8,Chimu_22) \ + VLOAD(9,%r8,Chimu_30) \ + VLOAD(10,%r8,Chimu_31) \ + VLOAD(11,%r8,Chimu_32) #define SHUF_CHIMU23i\ VSHUFMEM(6,%r8,Chimu_20) \ @@ -137,9 +151,6 @@ Author: paboyle VSHUFMEM(10,%r8,Chimu_31) \ VSHUFMEM(11,%r8,Chimu_32) - -// const SiteHalfSpinor *ptr = &buf[offset]; - #define LOAD_CHIi \ VLOAD(0,%r8,Chi_00) \ VLOAD(1,%r8,Chi_01) \ @@ -147,7 +158,6 @@ Author: paboyle VLOAD(3,%r8,Chi_10) \ VLOAD(4,%r8,Chi_11) \ VLOAD(5,%r8,Chi_12) - #define SAVE_UCHIi(PTR) \ LOAD64(%r8,PTR) \ @@ -157,8 +167,7 @@ Author: paboyle VSTORE(2,%r8,UChi_02) \ VSTORE(3,%r8,UChi_10) \ VSTORE(4,%r8,UChi_11) \ - VSTORE(5,%r8,UChi_12) \ - ); + VSTORE(5,%r8,UChi_12) ); #define SAVE_CHIi(PTR) \ LOAD64(%r8,PTR) \ @@ -168,33 +177,14 @@ Author: paboyle VSTORE(2,%r8,Chi_02) \ VSTORE(3,%r8,Chi_10) \ VSTORE(4,%r8,Chi_11) \ - VSTORE(5,%r8,Chi_12) \ - ); + VSTORE(5,%r8,Chi_12) ); - -#define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p) -#define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN_PFYP(&U._odata[sU](A),p) -#define MULT_2SPIN_DIR_PFZP(A,p) MULT_2SPIN_PFZP(&U._odata[sU](A),p) -#define MULT_2SPIN_DIR_PFTP(A,p) MULT_2SPIN_PFTP(&U._odata[sU](A),p) - -#define MULT_2SPIN_DIR_PFXM(A,p) MULT_2SPIN_PFXM(&U._odata[sU](A),p) -#define MULT_2SPIN_DIR_PFYM(A,p) MULT_2SPIN_PFYM(&U._odata[sU](A),p) -#define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN_PFZM(&U._odata[sU](A),p) -#define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN_PFTM(&U._odata[sU](A),p) - -#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr,pf) -#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr,pf) -#define MULT_2SPIN_PFZM(ptr,pf) MULT_2SPIN(ptr,pf) -#define MULT_2SPIN_PFTM(ptr,pf) MULT_2SPIN(ptr,pf) -#define MULT_2SPIN_PFTP(ptr,pf) MULT_2SPIN(ptr,pf) -#define MULT_2SPIN_PFZP(ptr,pf) MULT_2SPIN(ptr,pf) -#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr,pf) -#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr,pf) +#define MULT_2SPIN_DIR_PF(A,p) MULT_2SPIN_PF(&U._odata[sU](A),p) +#define MULT_2SPIN_PF(ptr,pf) MULT_2SPIN(ptr,pf) ////////////////////////////////////////////////////////////////// // Dirac algebra ////////////////////////////////////////////////////////////////// - // hspin(0)=fspin(0)+timesI(fspin(3)); // hspin(1)=fspin(1)+timesI(fspin(2)); #define XP_PROJMEM(PTR) \ @@ -259,7 +249,6 @@ Author: paboyle // hspin(0)=fspin(0)-timesI(fspin(3)) // hspin(1)=fspin(1)-timesI(fspin(2)) - #define XM_PROJMEM(PTR) \ LOAD64(%r8,PTR)\ __asm__ ( \ @@ -324,226 +313,226 @@ Author: paboyle // fspin(3)=timesMinusI(hspin(0)) #define XP_RECON __asm__ ( \ VZERO(TMP) \ - VTIMESMINUSI0(UChi_00,result_30,TMP) \ - VTIMESMINUSI0(UChi_10,result_20,TMP) \ - VTIMESMINUSI0(UChi_01,result_31,TMP) \ - VTIMESMINUSI0(UChi_11,result_21,TMP) \ - VTIMESMINUSI0(UChi_02,result_32,TMP) \ - VTIMESMINUSI0(UChi_12,result_22,TMP) \ - VMOV(UChi_00,result_00) \ - VMOV(UChi_10,result_10) \ - VMOV(UChi_01,result_01) \ - VMOV(UChi_11,result_11) \ - VMOV(UChi_02,result_02) \ - VMOV(UChi_12,result_12) \ - VTIMESMINUSI1(UChi_10,result_20,TMP) \ - VTIMESMINUSI1(UChi_11,result_21,TMP) \ - VTIMESMINUSI1(UChi_12,result_22,TMP) \ - VTIMESMINUSI1(UChi_00,result_30,TMP) \ - VTIMESMINUSI1(UChi_01,result_31,TMP) \ - VTIMESMINUSI1(UChi_02,result_32,TMP) \ - VTIMESMINUSI2(UChi_10,result_20,TMP) \ - VTIMESMINUSI2(UChi_11,result_21,TMP) \ - VTIMESMINUSI2(UChi_12,result_22,TMP) \ - VTIMESMINUSI2(UChi_00,result_30,TMP) \ - VTIMESMINUSI2(UChi_01,result_31,TMP) \ - VTIMESMINUSI2(UChi_02,result_32,TMP) \ + VTIMESMINUSI0(UChi_00,psi_30,TMP) \ + VTIMESMINUSI0(UChi_10,psi_20,TMP) \ + VTIMESMINUSI0(UChi_01,psi_31,TMP) \ + VTIMESMINUSI0(UChi_11,psi_21,TMP) \ + VTIMESMINUSI0(UChi_02,psi_32,TMP) \ + VTIMESMINUSI0(UChi_12,psi_22,TMP) \ + VMOV(UChi_00,psi_00) \ + VMOV(UChi_10,psi_10) \ + VMOV(UChi_01,psi_01) \ + VMOV(UChi_11,psi_11) \ + VMOV(UChi_02,psi_02) \ + VMOV(UChi_12,psi_12) \ + VTIMESMINUSI1(UChi_10,psi_20,TMP) \ + VTIMESMINUSI1(UChi_11,psi_21,TMP) \ + VTIMESMINUSI1(UChi_12,psi_22,TMP) \ + VTIMESMINUSI1(UChi_00,psi_30,TMP) \ + VTIMESMINUSI1(UChi_01,psi_31,TMP) \ + VTIMESMINUSI1(UChi_02,psi_32,TMP) \ + VTIMESMINUSI2(UChi_10,psi_20,TMP) \ + VTIMESMINUSI2(UChi_11,psi_21,TMP) \ + VTIMESMINUSI2(UChi_12,psi_22,TMP) \ + VTIMESMINUSI2(UChi_00,psi_30,TMP) \ + VTIMESMINUSI2(UChi_01,psi_31,TMP) \ + VTIMESMINUSI2(UChi_02,psi_32,TMP) \ ); // NB could save 6 ops using addsub => 12 cycles #define XP_RECON_ACCUM __asm__ ( \ VZERO(TMP)\ - VACCTIMESMINUSI0(UChi_00,result_30,Z3)\ - VACCTIMESMINUSI0(UChi_10,result_20,Z0)\ - VACCTIMESMINUSI0(UChi_01,result_31,Z4)\ - VACCTIMESMINUSI0(UChi_11,result_21,Z1)\ - VACCTIMESMINUSI0(UChi_02,result_32,Z5)\ - VACCTIMESMINUSI0(UChi_12,result_22,Z2)\ - VADD(UChi_00,result_00,result_00)\ - VADD(UChi_10,result_10,result_10)\ - VADD(UChi_01,result_01,result_01)\ - VADD(UChi_11,result_11,result_11)\ - VADD(UChi_02,result_02,result_02)\ - VADD(UChi_12,result_12,result_12)\ - VACCTIMESMINUSI1(UChi_00,result_30,Z3)\ - VACCTIMESMINUSI1(UChi_10,result_20,Z0)\ - VACCTIMESMINUSI1(UChi_01,result_31,Z4)\ - VACCTIMESMINUSI1(UChi_11,result_21,Z1)\ - VACCTIMESMINUSI1(UChi_02,result_32,Z5)\ - VACCTIMESMINUSI1(UChi_12,result_22,Z2)\ - VACCTIMESMINUSI2(UChi_10,result_20,Z0)\ - VACCTIMESMINUSI2(UChi_11,result_21,Z1)\ - VACCTIMESMINUSI2(UChi_12,result_22,Z2)\ - VACCTIMESMINUSI2(UChi_00,result_30,Z3)\ - VACCTIMESMINUSI2(UChi_01,result_31,Z4)\ - VACCTIMESMINUSI2(UChi_02,result_32,Z5)\ + VACCTIMESMINUSI0(UChi_00,psi_30,Z3)\ + VACCTIMESMINUSI0(UChi_10,psi_20,Z0)\ + VACCTIMESMINUSI0(UChi_01,psi_31,Z4)\ + VACCTIMESMINUSI0(UChi_11,psi_21,Z1)\ + VACCTIMESMINUSI0(UChi_02,psi_32,Z5)\ + VACCTIMESMINUSI0(UChi_12,psi_22,Z2)\ + VADD(UChi_00,psi_00,psi_00)\ + VADD(UChi_10,psi_10,psi_10)\ + VADD(UChi_01,psi_01,psi_01)\ + VADD(UChi_11,psi_11,psi_11)\ + VADD(UChi_02,psi_02,psi_02)\ + VADD(UChi_12,psi_12,psi_12)\ + VACCTIMESMINUSI1(UChi_00,psi_30,Z3)\ + VACCTIMESMINUSI1(UChi_10,psi_20,Z0)\ + VACCTIMESMINUSI1(UChi_01,psi_31,Z4)\ + VACCTIMESMINUSI1(UChi_11,psi_21,Z1)\ + VACCTIMESMINUSI1(UChi_02,psi_32,Z5)\ + VACCTIMESMINUSI1(UChi_12,psi_22,Z2)\ + VACCTIMESMINUSI2(UChi_10,psi_20,Z0)\ + VACCTIMESMINUSI2(UChi_11,psi_21,Z1)\ + VACCTIMESMINUSI2(UChi_12,psi_22,Z2)\ + VACCTIMESMINUSI2(UChi_00,psi_30,Z3)\ + VACCTIMESMINUSI2(UChi_01,psi_31,Z4)\ + VACCTIMESMINUSI2(UChi_02,psi_32,Z5)\ ); #define XM_RECON __asm__ ( \ VZERO(TMP)\ - VTIMESI0(UChi_00,result_30,TMP)\ - VTIMESI0(UChi_10,result_20,TMP)\ - VTIMESI0(UChi_01,result_31,TMP)\ - VTIMESI0(UChi_11,result_21,TMP)\ - VTIMESI0(UChi_02,result_32,TMP)\ - VTIMESI0(UChi_12,result_22,TMP)\ - VMOV(UChi_00,result_00)\ - VMOV(UChi_10,result_10)\ - VMOV(UChi_01,result_01)\ - VMOV(UChi_11,result_11)\ - VMOV(UChi_02,result_02)\ - VMOV(UChi_12,result_12)\ - VTIMESI1(UChi_00,result_30,TMP)\ - VTIMESI1(UChi_10,result_20,TMP)\ - VTIMESI1(UChi_01,result_31,TMP)\ - VTIMESI1(UChi_11,result_21,TMP)\ - VTIMESI1(UChi_02,result_32,TMP)\ - VTIMESI1(UChi_12,result_22,TMP)\ - VTIMESI2(UChi_10,result_20,TMP)\ - VTIMESI2(UChi_11,result_21,TMP)\ - VTIMESI2(UChi_12,result_22,TMP)\ - VTIMESI2(UChi_00,result_30,TMP)\ - VTIMESI2(UChi_01,result_31,TMP)\ - VTIMESI2(UChi_02,result_32,TMP)\ + VTIMESI0(UChi_00,psi_30,TMP)\ + VTIMESI0(UChi_10,psi_20,TMP)\ + VTIMESI0(UChi_01,psi_31,TMP)\ + VTIMESI0(UChi_11,psi_21,TMP)\ + VTIMESI0(UChi_02,psi_32,TMP)\ + VTIMESI0(UChi_12,psi_22,TMP)\ + VMOV(UChi_00,psi_00)\ + VMOV(UChi_10,psi_10)\ + VMOV(UChi_01,psi_01)\ + VMOV(UChi_11,psi_11)\ + VMOV(UChi_02,psi_02)\ + VMOV(UChi_12,psi_12)\ + VTIMESI1(UChi_00,psi_30,TMP)\ + VTIMESI1(UChi_10,psi_20,TMP)\ + VTIMESI1(UChi_01,psi_31,TMP)\ + VTIMESI1(UChi_11,psi_21,TMP)\ + VTIMESI1(UChi_02,psi_32,TMP)\ + VTIMESI1(UChi_12,psi_22,TMP)\ + VTIMESI2(UChi_10,psi_20,TMP)\ + VTIMESI2(UChi_11,psi_21,TMP)\ + VTIMESI2(UChi_12,psi_22,TMP)\ + VTIMESI2(UChi_00,psi_30,TMP)\ + VTIMESI2(UChi_01,psi_31,TMP)\ + VTIMESI2(UChi_02,psi_32,TMP)\ ); #define XM_RECON_ACCUM __asm__ ( \ - VACCTIMESI0(UChi_10,result_20,Z0)\ - VACCTIMESI0(UChi_00,result_30,Z3)\ - VACCTIMESI0(UChi_11,result_21,Z1)\ - VACCTIMESI0(UChi_01,result_31,Z4)\ - VACCTIMESI0(UChi_12,result_22,Z2)\ - VACCTIMESI0(UChi_02,result_32,Z5)\ + VACCTIMESI0(UChi_10,psi_20,Z0)\ + VACCTIMESI0(UChi_00,psi_30,Z3)\ + VACCTIMESI0(UChi_11,psi_21,Z1)\ + VACCTIMESI0(UChi_01,psi_31,Z4)\ + VACCTIMESI0(UChi_12,psi_22,Z2)\ + VACCTIMESI0(UChi_02,psi_32,Z5)\ \ - VADD(UChi_10,result_10,result_10)\ - VADD(UChi_00,result_00,result_00)\ - VADD(UChi_11,result_11,result_11)\ - VADD(UChi_01,result_01,result_01)\ - VADD(UChi_12,result_12,result_12)\ - VADD(UChi_02,result_02,result_02)\ + VADD(UChi_10,psi_10,psi_10)\ + VADD(UChi_00,psi_00,psi_00)\ + VADD(UChi_11,psi_11,psi_11)\ + VADD(UChi_01,psi_01,psi_01)\ + VADD(UChi_12,psi_12,psi_12)\ + VADD(UChi_02,psi_02,psi_02)\ \ - VACCTIMESI1(UChi_10,result_20,Z0)\ - VACCTIMESI1(UChi_00,result_30,Z3)\ - VACCTIMESI1(UChi_11,result_21,Z1)\ - VACCTIMESI1(UChi_01,result_31,Z4)\ - VACCTIMESI1(UChi_12,result_22,Z2)\ - VACCTIMESI1(UChi_02,result_32,Z5)\ - VACCTIMESI2(UChi_10,result_20,Z0)\ - VACCTIMESI2(UChi_11,result_21,Z1)\ - VACCTIMESI2(UChi_12,result_22,Z2)\ - VACCTIMESI2(UChi_00,result_30,Z3)\ - VACCTIMESI2(UChi_01,result_31,Z4)\ - VACCTIMESI2(UChi_02,result_32,Z5)\ + VACCTIMESI1(UChi_10,psi_20,Z0)\ + VACCTIMESI1(UChi_00,psi_30,Z3)\ + VACCTIMESI1(UChi_11,psi_21,Z1)\ + VACCTIMESI1(UChi_01,psi_31,Z4)\ + VACCTIMESI1(UChi_12,psi_22,Z2)\ + VACCTIMESI1(UChi_02,psi_32,Z5)\ + VACCTIMESI2(UChi_10,psi_20,Z0)\ + VACCTIMESI2(UChi_11,psi_21,Z1)\ + VACCTIMESI2(UChi_12,psi_22,Z2)\ + VACCTIMESI2(UChi_00,psi_30,Z3)\ + VACCTIMESI2(UChi_01,psi_31,Z4)\ + VACCTIMESI2(UChi_02,psi_32,Z5)\ ); #define YP_RECON_ACCUM __asm__ ( \ - VADD(UChi_00,result_00,result_00)\ - VADD(UChi_10,result_10,result_10)\ - VADD(UChi_01,result_01,result_01)\ - VADD(UChi_11,result_11,result_11)\ - VADD(UChi_02,result_02,result_02)\ - VADD(UChi_12,result_12,result_12)\ - VADD(UChi_10,result_20,result_20)\ - VADD(UChi_11,result_21,result_21)\ - VADD(UChi_12,result_22,result_22)\ - VSUB(UChi_00,result_30,result_30)\ - VSUB(UChi_01,result_31,result_31)\ - VSUB(UChi_02,result_32,result_32) ); + VADD(UChi_00,psi_00,psi_00)\ + VADD(UChi_10,psi_10,psi_10)\ + VADD(UChi_01,psi_01,psi_01)\ + VADD(UChi_11,psi_11,psi_11)\ + VADD(UChi_02,psi_02,psi_02)\ + VADD(UChi_12,psi_12,psi_12)\ + VADD(UChi_10,psi_20,psi_20)\ + VADD(UChi_11,psi_21,psi_21)\ + VADD(UChi_12,psi_22,psi_22)\ + VSUB(UChi_00,psi_30,psi_30)\ + VSUB(UChi_01,psi_31,psi_31)\ + VSUB(UChi_02,psi_32,psi_32) ); #define YM_RECON_ACCUM __asm__ ( \ - VADD(UChi_00,result_00,result_00)\ - VADD(UChi_10,result_10,result_10)\ - VADD(UChi_01,result_01,result_01)\ - VADD(UChi_11,result_11,result_11)\ - VADD(UChi_02,result_02,result_02)\ - VADD(UChi_12,result_12,result_12)\ - VSUB(UChi_10,result_20,result_20)\ - VSUB(UChi_11,result_21,result_21)\ - VSUB(UChi_12,result_22,result_22)\ - VADD(UChi_00,result_30,result_30)\ - VADD(UChi_01,result_31,result_31)\ - VADD(UChi_02,result_32,result_32) ); + VADD(UChi_00,psi_00,psi_00)\ + VADD(UChi_10,psi_10,psi_10)\ + VADD(UChi_01,psi_01,psi_01)\ + VADD(UChi_11,psi_11,psi_11)\ + VADD(UChi_02,psi_02,psi_02)\ + VADD(UChi_12,psi_12,psi_12)\ + VSUB(UChi_10,psi_20,psi_20)\ + VSUB(UChi_11,psi_21,psi_21)\ + VSUB(UChi_12,psi_22,psi_22)\ + VADD(UChi_00,psi_30,psi_30)\ + VADD(UChi_01,psi_31,psi_31)\ + VADD(UChi_02,psi_32,psi_32) ); #define ZP_RECON_ACCUM __asm__ ( \ - VACCTIMESMINUSI0(UChi_00,result_20,Z0)\ - VACCTIMESI0(UChi_10,result_30,Z3)\ - VACCTIMESMINUSI0(UChi_01,result_21,Z1)\ - VACCTIMESI0(UChi_11,result_31,Z4)\ - VACCTIMESMINUSI0(UChi_02,result_22,Z2)\ - VACCTIMESI0(UChi_12,result_32,Z5)\ - VADD(UChi_00,result_00,result_00)\ - VADD(UChi_10,result_10,result_10)\ - VADD(UChi_01,result_01,result_01)\ - VADD(UChi_11,result_11,result_11)\ - VADD(UChi_02,result_02,result_02)\ - VADD(UChi_12,result_12,result_12)\ - VACCTIMESMINUSI1(UChi_00,result_20,Z0)\ - VACCTIMESI1(UChi_10,result_30,Z3)\ - VACCTIMESMINUSI1(UChi_01,result_21,Z1)\ - VACCTIMESI1(UChi_11,result_31,Z4)\ - VACCTIMESMINUSI1(UChi_02,result_22,Z2)\ - VACCTIMESI1(UChi_12,result_32,Z5)\ - VACCTIMESMINUSI2(UChi_00,result_20,Z0)\ - VACCTIMESMINUSI2(UChi_01,result_21,Z1)\ - VACCTIMESMINUSI2(UChi_02,result_22,Z2)\ - VACCTIMESI2(UChi_10,result_30,Z3)\ - VACCTIMESI2(UChi_11,result_31,Z4)\ - VACCTIMESI2(UChi_12,result_32,Z5)\ + VACCTIMESMINUSI0(UChi_00,psi_20,Z0)\ + VACCTIMESI0(UChi_10,psi_30,Z3)\ + VACCTIMESMINUSI0(UChi_01,psi_21,Z1)\ + VACCTIMESI0(UChi_11,psi_31,Z4)\ + VACCTIMESMINUSI0(UChi_02,psi_22,Z2)\ + VACCTIMESI0(UChi_12,psi_32,Z5)\ + VADD(UChi_00,psi_00,psi_00)\ + VADD(UChi_10,psi_10,psi_10)\ + VADD(UChi_01,psi_01,psi_01)\ + VADD(UChi_11,psi_11,psi_11)\ + VADD(UChi_02,psi_02,psi_02)\ + VADD(UChi_12,psi_12,psi_12)\ + VACCTIMESMINUSI1(UChi_00,psi_20,Z0)\ + VACCTIMESI1(UChi_10,psi_30,Z3)\ + VACCTIMESMINUSI1(UChi_01,psi_21,Z1)\ + VACCTIMESI1(UChi_11,psi_31,Z4)\ + VACCTIMESMINUSI1(UChi_02,psi_22,Z2)\ + VACCTIMESI1(UChi_12,psi_32,Z5)\ + VACCTIMESMINUSI2(UChi_00,psi_20,Z0)\ + VACCTIMESMINUSI2(UChi_01,psi_21,Z1)\ + VACCTIMESMINUSI2(UChi_02,psi_22,Z2)\ + VACCTIMESI2(UChi_10,psi_30,Z3)\ + VACCTIMESI2(UChi_11,psi_31,Z4)\ + VACCTIMESI2(UChi_12,psi_32,Z5)\ ); #define ZM_RECON_ACCUM __asm__ ( \ - VACCTIMESI0(UChi_00,result_20,Z0)\ - VACCTIMESMINUSI0(UChi_10,result_30,Z3)\ - VACCTIMESI0(UChi_01,result_21,Z1)\ - VACCTIMESMINUSI0(UChi_11,result_31,Z4)\ - VACCTIMESI0(UChi_02,result_22,Z2)\ - VACCTIMESMINUSI0(UChi_12,result_32,Z5)\ - VADD(UChi_00,result_00,result_00)\ - VADD(UChi_10,result_10,result_10)\ - VADD(UChi_01,result_01,result_01)\ - VADD(UChi_11,result_11,result_11)\ - VADD(UChi_02,result_02,result_02)\ - VADD(UChi_12,result_12,result_12)\ - VACCTIMESI1(UChi_00,result_20,Z0)\ - VACCTIMESMINUSI1(UChi_10,result_30,Z3)\ - VACCTIMESI1(UChi_01,result_21,Z1)\ - VACCTIMESMINUSI1(UChi_11,result_31,Z4)\ - VACCTIMESI1(UChi_02,result_22,Z2)\ - VACCTIMESMINUSI1(UChi_12,result_32,Z5)\ - VACCTIMESI2(UChi_00,result_20,Z0)\ - VACCTIMESI2(UChi_01,result_21,Z1)\ - VACCTIMESI2(UChi_02,result_22,Z2)\ - VACCTIMESMINUSI2(UChi_10,result_30,Z3)\ - VACCTIMESMINUSI2(UChi_11,result_31,Z4)\ - VACCTIMESMINUSI2(UChi_12,result_32,Z5)\ + VACCTIMESI0(UChi_00,psi_20,Z0)\ + VACCTIMESMINUSI0(UChi_10,psi_30,Z3)\ + VACCTIMESI0(UChi_01,psi_21,Z1)\ + VACCTIMESMINUSI0(UChi_11,psi_31,Z4)\ + VACCTIMESI0(UChi_02,psi_22,Z2)\ + VACCTIMESMINUSI0(UChi_12,psi_32,Z5)\ + VADD(UChi_00,psi_00,psi_00)\ + VADD(UChi_10,psi_10,psi_10)\ + VADD(UChi_01,psi_01,psi_01)\ + VADD(UChi_11,psi_11,psi_11)\ + VADD(UChi_02,psi_02,psi_02)\ + VADD(UChi_12,psi_12,psi_12)\ + VACCTIMESI1(UChi_00,psi_20,Z0)\ + VACCTIMESMINUSI1(UChi_10,psi_30,Z3)\ + VACCTIMESI1(UChi_01,psi_21,Z1)\ + VACCTIMESMINUSI1(UChi_11,psi_31,Z4)\ + VACCTIMESI1(UChi_02,psi_22,Z2)\ + VACCTIMESMINUSI1(UChi_12,psi_32,Z5)\ + VACCTIMESI2(UChi_00,psi_20,Z0)\ + VACCTIMESI2(UChi_01,psi_21,Z1)\ + VACCTIMESI2(UChi_02,psi_22,Z2)\ + VACCTIMESMINUSI2(UChi_10,psi_30,Z3)\ + VACCTIMESMINUSI2(UChi_11,psi_31,Z4)\ + VACCTIMESMINUSI2(UChi_12,psi_32,Z5)\ ); #define TP_RECON_ACCUM __asm__ ( \ - VADD(UChi_00,result_00,result_00)\ - VADD(UChi_10,result_10,result_10)\ - VADD(UChi_01,result_01,result_01)\ - VADD(UChi_11,result_11,result_11)\ - VADD(UChi_02,result_02,result_02)\ - VADD(UChi_12,result_12,result_12)\ - VADD(UChi_00,result_20,result_20)\ - VADD(UChi_10,result_30,result_30)\ - VADD(UChi_01,result_21,result_21)\ - VADD(UChi_11,result_31,result_31)\ - VADD(UChi_02,result_22,result_22)\ - VADD(UChi_12,result_32,result_32) ); + VADD(UChi_00,psi_00,psi_00)\ + VADD(UChi_10,psi_10,psi_10)\ + VADD(UChi_01,psi_01,psi_01)\ + VADD(UChi_11,psi_11,psi_11)\ + VADD(UChi_02,psi_02,psi_02)\ + VADD(UChi_12,psi_12,psi_12)\ + VADD(UChi_00,psi_20,psi_20)\ + VADD(UChi_10,psi_30,psi_30)\ + VADD(UChi_01,psi_21,psi_21)\ + VADD(UChi_11,psi_31,psi_31)\ + VADD(UChi_02,psi_22,psi_22)\ + VADD(UChi_12,psi_32,psi_32) ); #define TM_RECON_ACCUM __asm__ ( \ - VADD(UChi_00,result_00,result_00)\ - VADD(UChi_10,result_10,result_10)\ - VADD(UChi_01,result_01,result_01)\ - VADD(UChi_11,result_11,result_11)\ - VADD(UChi_02,result_02,result_02)\ - VADD(UChi_12,result_12,result_12)\ - VSUB(UChi_00,result_20,result_20)\ - VSUB(UChi_10,result_30,result_30)\ - VSUB(UChi_01,result_21,result_21)\ - VSUB(UChi_11,result_31,result_31)\ - VSUB(UChi_02,result_22,result_22)\ - VSUB(UChi_12,result_32,result_32) ); + VADD(UChi_00,psi_00,psi_00)\ + VADD(UChi_10,psi_10,psi_10)\ + VADD(UChi_01,psi_01,psi_01)\ + VADD(UChi_11,psi_11,psi_11)\ + VADD(UChi_02,psi_02,psi_02)\ + VADD(UChi_12,psi_12,psi_12)\ + VSUB(UChi_00,psi_20,psi_20)\ + VSUB(UChi_10,psi_30,psi_30)\ + VSUB(UChi_01,psi_21,psi_21)\ + VSUB(UChi_11,psi_31,psi_31)\ + VSUB(UChi_02,psi_22,psi_22)\ + VSUB(UChi_12,psi_32,psi_32) ); #define AVX512_PF_L1 #define AVX512_PF_L2_GAUGE @@ -582,22 +571,62 @@ Author: paboyle LOAD64(%r8,PTR) \ LOAD64(%r9,pf) \ __asm__ ( \ - VSTORE(0,%r8,result_00) VPREFETCH_M1(0,%r9) \ - VSTORE(1,%r8,result_01) VPREFETCH_M1(1,%r9) \ - VSTORE(2,%r8,result_02) VPREFETCH_M1(2,%r9) \ - VSTORE(3,%r8,result_10) VPREFETCH_M1(3,%r9) \ - VSTORE(4,%r8,result_11) VPREFETCH_M1(4,%r9) \ - VSTORE(5,%r8,result_12) VPREFETCH_M1(5,%r9) \ - VSTORE(6,%r8,result_20) VPREFETCH_M1(6,%r9) \ - VSTORE(7,%r8,result_21) VPREFETCH_M1(7,%r9) \ - VSTORE(8,%r8,result_22) VPREFETCH_M1(8,%r9) \ - VSTORE(9,%r8,result_30) VPREFETCH_M1(9,%r9) \ - VSTORE(10,%r8,result_31) VPREFETCH_M1(10,%r9) \ - VSTORE(11,%r8,result_32) VPREFETCH_M1(11,%r9) \ + VSTORE(0,%r8,psi_00) VPREFETCH_M1(0,%r9) \ + VSTORE(1,%r8,psi_01) VPREFETCH_M1(1,%r9) \ + VSTORE(2,%r8,psi_02) VPREFETCH_M1(2,%r9) \ + VSTORE(3,%r8,psi_10) VPREFETCH_M1(3,%r9) \ + VSTORE(4,%r8,psi_11) VPREFETCH_M1(4,%r9) \ + VSTORE(5,%r8,psi_12) VPREFETCH_M1(5,%r9) \ + VSTORE(6,%r8,psi_20) VPREFETCH_M1(6,%r9) \ + VSTORE(7,%r8,psi_21) VPREFETCH_M1(7,%r9) \ + VSTORE(8,%r8,psi_22) VPREFETCH_M1(8,%r9) \ + VSTORE(9,%r8,psi_30) VPREFETCH_M1(9,%r9) \ + VSTORE(10,%r8,psi_31) VPREFETCH_M1(10,%r9) \ + VSTORE(11,%r8,psi_32) VPREFETCH_M1(11,%r9) \ ); +#define ADD_RESULTi(PTR,pf) \ + LOAD_CHIMU(PTR); \ + asm(VADD(psi_00,Chimu_00,psi_00) VADD(psi_01,Chimu_01,psi_01) VADD(psi_02,Chimu_02,psi_02) \ + VADD(psi_10,Chimu_10,psi_10) VADD(psi_11,Chimu_11,psi_11) VADD(psi_12,Chimu_12,psi_12) \ + VADD(psi_20,Chimu_20,psi_20) VADD(psi_21,Chimu_21,psi_21) VADD(psi_22,Chimu_22,psi_22) \ + VADD(psi_30,Chimu_30,psi_30) VADD(psi_31,Chimu_31,psi_31) VADD(psi_32,Chimu_32,psi_32) ); \ + SAVE_RESULT(PTR,pf); + + + +#define ADD_RESULTia(PTR,pf) \ + LOAD64(%r8,PTR) \ + __asm__ ( \ + VADDMEM(0,%r8,psi_00,psi_00) \ + VADDMEM(1,%r8,psi_01,psi_01) \ + VADDMEM(2,%r8,psi_02,psi_02) \ + VADDMEM(3,%r8,psi_10,psi_10) \ + VADDMEM(4,%r8,psi_11,psi_11) \ + VADDMEM(5,%r8,psi_12,psi_12) \ + VADDMEM(6,%r8,psi_20,psi_20) \ + VADDMEM(7,%r8,psi_21,psi_21) \ + VADDMEM(8,%r8,psi_22,psi_22) \ + VADDMEM(9,%r8,psi_30,psi_30) \ + VADDMEM(10,%r8,psi_31,psi_31) \ + VADDMEM(11,%r8,psi_32,psi_32) \ + VSTORE(0,%r8,psi_00) \ + VSTORE(1,%r8,psi_01) \ + VSTORE(2,%r8,psi_02) \ + VSTORE(3,%r8,psi_10) \ + VSTORE(4,%r8,psi_11) \ + VSTORE(5,%r8,psi_12) \ + VSTORE(6,%r8,psi_20) \ + VSTORE(7,%r8,psi_21) \ + VSTORE(8,%r8,psi_22) \ + VSTORE(9,%r8,psi_30) \ + VSTORE(10,%r8,psi_31) \ + VSTORE(11,%r8,psi_32) \ + ); + + #ifdef AVX512_PF_L2_TABLE -#define PREFETCH_CHIMU(A) \ +#define PREFETCH_CHIMU(A) \ LOAD64(%r9,A) \ __asm__ ( \ VPREFETCH_P1(0,%r9) \