mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
Overlap comms compute support; make reg naming consistent with bgq asm
This commit is contained in:
parent
9ff97b4711
commit
0883d6a7ce
@ -31,21 +31,21 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
//////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Register allocations for Wilson Kernel are precision independent
|
// Register allocations for Wilson Kernel are precision independent
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////
|
||||||
#define result_00 %zmm0
|
#define psi_00 %zmm0
|
||||||
#define result_01 %zmm1
|
#define psi_01 %zmm1
|
||||||
#define result_02 %zmm2
|
#define psi_02 %zmm2
|
||||||
|
|
||||||
#define result_10 %zmm3
|
#define psi_10 %zmm3
|
||||||
#define result_11 %zmm4
|
#define psi_11 %zmm4
|
||||||
#define result_12 %zmm5
|
#define psi_12 %zmm5
|
||||||
|
|
||||||
#define result_20 %zmm6
|
#define psi_20 %zmm6
|
||||||
#define result_21 %zmm7
|
#define psi_21 %zmm7
|
||||||
#define result_22 %zmm8
|
#define psi_22 %zmm8
|
||||||
|
|
||||||
#define result_30 %zmm9
|
#define psi_30 %zmm9
|
||||||
#define result_31 %zmm10
|
#define psi_31 %zmm10
|
||||||
#define result_32 %zmm11
|
#define psi_32 %zmm11
|
||||||
|
|
||||||
#define Chi_00 %zmm12
|
#define Chi_00 %zmm12
|
||||||
#define Chi_01 %zmm13
|
#define Chi_01 %zmm13
|
||||||
@ -102,18 +102,32 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#define UNLOCK_GAUGE(dir)
|
#define UNLOCK_GAUGE(dir)
|
||||||
|
|
||||||
// const SiteSpinor * ptr = & in._odata[offset];
|
// const SiteSpinor * ptr = & in._odata[offset];
|
||||||
#define LOAD_CHIMU(PTR) LOAD_CHIMUi(PTR)
|
#define LOAD_CHIMU(PTR) LOAD64(%r8,PTR) __asm__ ( LOAD_CHIMUi );
|
||||||
#define LOAD_CHI(PTR) LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi );
|
#define LOAD_CHI(PTR) LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi );
|
||||||
#define SAVE_UCHI(PTR) SAVE_UCHIi(PTR)
|
#define SAVE_UCHI(PTR) SAVE_UCHIi(PTR)
|
||||||
#define SAVE_CHI(PTR) SAVE_CHIi(PTR)
|
#define SAVE_CHI(PTR) SAVE_CHIi(PTR)
|
||||||
#define SAVE_RESULT(PT,R) SAVE_RESULTi(PT,R)
|
#define SAVE_RESULT(PT,R) SAVE_RESULTi(PT,R)
|
||||||
|
#define ADD_RESULT(PT,R) ADD_RESULTi(PT,R)
|
||||||
|
|
||||||
|
#define ZERO_PSI \
|
||||||
|
asm( VZERO(psi_00) \
|
||||||
|
VZERO(psi_01) \
|
||||||
|
VZERO(psi_02) \
|
||||||
|
VZERO(psi_10) \
|
||||||
|
VZERO(psi_11) \
|
||||||
|
VZERO(psi_12) \
|
||||||
|
VZERO(psi_20) \
|
||||||
|
VZERO(psi_21) \
|
||||||
|
VZERO(psi_22) \
|
||||||
|
VZERO(psi_30) \
|
||||||
|
VZERO(psi_31) \
|
||||||
|
VZERO(psi_32));
|
||||||
|
|
||||||
#define LOAD_CHIMUi \
|
#define LOAD_CHIMUi \
|
||||||
LOAD_CHIMU01i \
|
LOAD_CHIMU01i \
|
||||||
LOAD_CHIMU23i );
|
LOAD_CHIMU23i
|
||||||
|
|
||||||
|
#define LOAD_CHIMU01i \
|
||||||
#define LOAD_CHIMU01i\
|
|
||||||
VLOAD(0,%r8,Chimu_00) \
|
VLOAD(0,%r8,Chimu_00) \
|
||||||
VLOAD(1,%r8,Chimu_01) \
|
VLOAD(1,%r8,Chimu_01) \
|
||||||
VLOAD(2,%r8,Chimu_02) \
|
VLOAD(2,%r8,Chimu_02) \
|
||||||
@ -121,7 +135,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
VLOAD(4,%r8,Chimu_11) \
|
VLOAD(4,%r8,Chimu_11) \
|
||||||
VLOAD(5,%r8,Chimu_12)
|
VLOAD(5,%r8,Chimu_12)
|
||||||
|
|
||||||
#define LOAD_CHIMU23i\
|
#define LOAD_CHIMU23i \
|
||||||
VLOAD(6,%r8,Chimu_20) \
|
VLOAD(6,%r8,Chimu_20) \
|
||||||
VLOAD(7,%r8,Chimu_21) \
|
VLOAD(7,%r8,Chimu_21) \
|
||||||
VLOAD(8,%r8,Chimu_22) \
|
VLOAD(8,%r8,Chimu_22) \
|
||||||
@ -137,9 +151,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
VSHUFMEM(10,%r8,Chimu_31) \
|
VSHUFMEM(10,%r8,Chimu_31) \
|
||||||
VSHUFMEM(11,%r8,Chimu_32)
|
VSHUFMEM(11,%r8,Chimu_32)
|
||||||
|
|
||||||
|
|
||||||
// const SiteHalfSpinor *ptr = &buf[offset];
|
|
||||||
|
|
||||||
#define LOAD_CHIi \
|
#define LOAD_CHIi \
|
||||||
VLOAD(0,%r8,Chi_00) \
|
VLOAD(0,%r8,Chi_00) \
|
||||||
VLOAD(1,%r8,Chi_01) \
|
VLOAD(1,%r8,Chi_01) \
|
||||||
@ -148,7 +159,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
VLOAD(4,%r8,Chi_11) \
|
VLOAD(4,%r8,Chi_11) \
|
||||||
VLOAD(5,%r8,Chi_12)
|
VLOAD(5,%r8,Chi_12)
|
||||||
|
|
||||||
|
|
||||||
#define SAVE_UCHIi(PTR) \
|
#define SAVE_UCHIi(PTR) \
|
||||||
LOAD64(%r8,PTR) \
|
LOAD64(%r8,PTR) \
|
||||||
__asm__ ( \
|
__asm__ ( \
|
||||||
@ -157,8 +167,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
VSTORE(2,%r8,UChi_02) \
|
VSTORE(2,%r8,UChi_02) \
|
||||||
VSTORE(3,%r8,UChi_10) \
|
VSTORE(3,%r8,UChi_10) \
|
||||||
VSTORE(4,%r8,UChi_11) \
|
VSTORE(4,%r8,UChi_11) \
|
||||||
VSTORE(5,%r8,UChi_12) \
|
VSTORE(5,%r8,UChi_12) );
|
||||||
);
|
|
||||||
|
|
||||||
#define SAVE_CHIi(PTR) \
|
#define SAVE_CHIi(PTR) \
|
||||||
LOAD64(%r8,PTR) \
|
LOAD64(%r8,PTR) \
|
||||||
@ -168,33 +177,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
VSTORE(2,%r8,Chi_02) \
|
VSTORE(2,%r8,Chi_02) \
|
||||||
VSTORE(3,%r8,Chi_10) \
|
VSTORE(3,%r8,Chi_10) \
|
||||||
VSTORE(4,%r8,Chi_11) \
|
VSTORE(4,%r8,Chi_11) \
|
||||||
VSTORE(5,%r8,Chi_12) \
|
VSTORE(5,%r8,Chi_12) );
|
||||||
);
|
|
||||||
|
|
||||||
|
#define MULT_2SPIN_DIR_PF(A,p) MULT_2SPIN_PF(&U._odata[sU](A),p)
|
||||||
#define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p)
|
#define MULT_2SPIN_PF(ptr,pf) MULT_2SPIN(ptr,pf)
|
||||||
#define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN_PFYP(&U._odata[sU](A),p)
|
|
||||||
#define MULT_2SPIN_DIR_PFZP(A,p) MULT_2SPIN_PFZP(&U._odata[sU](A),p)
|
|
||||||
#define MULT_2SPIN_DIR_PFTP(A,p) MULT_2SPIN_PFTP(&U._odata[sU](A),p)
|
|
||||||
|
|
||||||
#define MULT_2SPIN_DIR_PFXM(A,p) MULT_2SPIN_PFXM(&U._odata[sU](A),p)
|
|
||||||
#define MULT_2SPIN_DIR_PFYM(A,p) MULT_2SPIN_PFYM(&U._odata[sU](A),p)
|
|
||||||
#define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN_PFZM(&U._odata[sU](A),p)
|
|
||||||
#define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN_PFTM(&U._odata[sU](A),p)
|
|
||||||
|
|
||||||
#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr,pf)
|
|
||||||
#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr,pf)
|
|
||||||
#define MULT_2SPIN_PFZM(ptr,pf) MULT_2SPIN(ptr,pf)
|
|
||||||
#define MULT_2SPIN_PFTM(ptr,pf) MULT_2SPIN(ptr,pf)
|
|
||||||
#define MULT_2SPIN_PFTP(ptr,pf) MULT_2SPIN(ptr,pf)
|
|
||||||
#define MULT_2SPIN_PFZP(ptr,pf) MULT_2SPIN(ptr,pf)
|
|
||||||
#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr,pf)
|
|
||||||
#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr,pf)
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////
|
||||||
// Dirac algebra
|
// Dirac algebra
|
||||||
//////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
// hspin(0)=fspin(0)+timesI(fspin(3));
|
// hspin(0)=fspin(0)+timesI(fspin(3));
|
||||||
// hspin(1)=fspin(1)+timesI(fspin(2));
|
// hspin(1)=fspin(1)+timesI(fspin(2));
|
||||||
#define XP_PROJMEM(PTR) \
|
#define XP_PROJMEM(PTR) \
|
||||||
@ -259,7 +249,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
// hspin(0)=fspin(0)-timesI(fspin(3))
|
// hspin(0)=fspin(0)-timesI(fspin(3))
|
||||||
// hspin(1)=fspin(1)-timesI(fspin(2))
|
// hspin(1)=fspin(1)-timesI(fspin(2))
|
||||||
|
|
||||||
#define XM_PROJMEM(PTR) \
|
#define XM_PROJMEM(PTR) \
|
||||||
LOAD64(%r8,PTR)\
|
LOAD64(%r8,PTR)\
|
||||||
__asm__ ( \
|
__asm__ ( \
|
||||||
@ -324,226 +313,226 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
// fspin(3)=timesMinusI(hspin(0))
|
// fspin(3)=timesMinusI(hspin(0))
|
||||||
#define XP_RECON __asm__ ( \
|
#define XP_RECON __asm__ ( \
|
||||||
VZERO(TMP) \
|
VZERO(TMP) \
|
||||||
VTIMESMINUSI0(UChi_00,result_30,TMP) \
|
VTIMESMINUSI0(UChi_00,psi_30,TMP) \
|
||||||
VTIMESMINUSI0(UChi_10,result_20,TMP) \
|
VTIMESMINUSI0(UChi_10,psi_20,TMP) \
|
||||||
VTIMESMINUSI0(UChi_01,result_31,TMP) \
|
VTIMESMINUSI0(UChi_01,psi_31,TMP) \
|
||||||
VTIMESMINUSI0(UChi_11,result_21,TMP) \
|
VTIMESMINUSI0(UChi_11,psi_21,TMP) \
|
||||||
VTIMESMINUSI0(UChi_02,result_32,TMP) \
|
VTIMESMINUSI0(UChi_02,psi_32,TMP) \
|
||||||
VTIMESMINUSI0(UChi_12,result_22,TMP) \
|
VTIMESMINUSI0(UChi_12,psi_22,TMP) \
|
||||||
VMOV(UChi_00,result_00) \
|
VMOV(UChi_00,psi_00) \
|
||||||
VMOV(UChi_10,result_10) \
|
VMOV(UChi_10,psi_10) \
|
||||||
VMOV(UChi_01,result_01) \
|
VMOV(UChi_01,psi_01) \
|
||||||
VMOV(UChi_11,result_11) \
|
VMOV(UChi_11,psi_11) \
|
||||||
VMOV(UChi_02,result_02) \
|
VMOV(UChi_02,psi_02) \
|
||||||
VMOV(UChi_12,result_12) \
|
VMOV(UChi_12,psi_12) \
|
||||||
VTIMESMINUSI1(UChi_10,result_20,TMP) \
|
VTIMESMINUSI1(UChi_10,psi_20,TMP) \
|
||||||
VTIMESMINUSI1(UChi_11,result_21,TMP) \
|
VTIMESMINUSI1(UChi_11,psi_21,TMP) \
|
||||||
VTIMESMINUSI1(UChi_12,result_22,TMP) \
|
VTIMESMINUSI1(UChi_12,psi_22,TMP) \
|
||||||
VTIMESMINUSI1(UChi_00,result_30,TMP) \
|
VTIMESMINUSI1(UChi_00,psi_30,TMP) \
|
||||||
VTIMESMINUSI1(UChi_01,result_31,TMP) \
|
VTIMESMINUSI1(UChi_01,psi_31,TMP) \
|
||||||
VTIMESMINUSI1(UChi_02,result_32,TMP) \
|
VTIMESMINUSI1(UChi_02,psi_32,TMP) \
|
||||||
VTIMESMINUSI2(UChi_10,result_20,TMP) \
|
VTIMESMINUSI2(UChi_10,psi_20,TMP) \
|
||||||
VTIMESMINUSI2(UChi_11,result_21,TMP) \
|
VTIMESMINUSI2(UChi_11,psi_21,TMP) \
|
||||||
VTIMESMINUSI2(UChi_12,result_22,TMP) \
|
VTIMESMINUSI2(UChi_12,psi_22,TMP) \
|
||||||
VTIMESMINUSI2(UChi_00,result_30,TMP) \
|
VTIMESMINUSI2(UChi_00,psi_30,TMP) \
|
||||||
VTIMESMINUSI2(UChi_01,result_31,TMP) \
|
VTIMESMINUSI2(UChi_01,psi_31,TMP) \
|
||||||
VTIMESMINUSI2(UChi_02,result_32,TMP) \
|
VTIMESMINUSI2(UChi_02,psi_32,TMP) \
|
||||||
);
|
);
|
||||||
// NB could save 6 ops using addsub => 12 cycles
|
// NB could save 6 ops using addsub => 12 cycles
|
||||||
#define XP_RECON_ACCUM __asm__ ( \
|
#define XP_RECON_ACCUM __asm__ ( \
|
||||||
VZERO(TMP)\
|
VZERO(TMP)\
|
||||||
VACCTIMESMINUSI0(UChi_00,result_30,Z3)\
|
VACCTIMESMINUSI0(UChi_00,psi_30,Z3)\
|
||||||
VACCTIMESMINUSI0(UChi_10,result_20,Z0)\
|
VACCTIMESMINUSI0(UChi_10,psi_20,Z0)\
|
||||||
VACCTIMESMINUSI0(UChi_01,result_31,Z4)\
|
VACCTIMESMINUSI0(UChi_01,psi_31,Z4)\
|
||||||
VACCTIMESMINUSI0(UChi_11,result_21,Z1)\
|
VACCTIMESMINUSI0(UChi_11,psi_21,Z1)\
|
||||||
VACCTIMESMINUSI0(UChi_02,result_32,Z5)\
|
VACCTIMESMINUSI0(UChi_02,psi_32,Z5)\
|
||||||
VACCTIMESMINUSI0(UChi_12,result_22,Z2)\
|
VACCTIMESMINUSI0(UChi_12,psi_22,Z2)\
|
||||||
VADD(UChi_00,result_00,result_00)\
|
VADD(UChi_00,psi_00,psi_00)\
|
||||||
VADD(UChi_10,result_10,result_10)\
|
VADD(UChi_10,psi_10,psi_10)\
|
||||||
VADD(UChi_01,result_01,result_01)\
|
VADD(UChi_01,psi_01,psi_01)\
|
||||||
VADD(UChi_11,result_11,result_11)\
|
VADD(UChi_11,psi_11,psi_11)\
|
||||||
VADD(UChi_02,result_02,result_02)\
|
VADD(UChi_02,psi_02,psi_02)\
|
||||||
VADD(UChi_12,result_12,result_12)\
|
VADD(UChi_12,psi_12,psi_12)\
|
||||||
VACCTIMESMINUSI1(UChi_00,result_30,Z3)\
|
VACCTIMESMINUSI1(UChi_00,psi_30,Z3)\
|
||||||
VACCTIMESMINUSI1(UChi_10,result_20,Z0)\
|
VACCTIMESMINUSI1(UChi_10,psi_20,Z0)\
|
||||||
VACCTIMESMINUSI1(UChi_01,result_31,Z4)\
|
VACCTIMESMINUSI1(UChi_01,psi_31,Z4)\
|
||||||
VACCTIMESMINUSI1(UChi_11,result_21,Z1)\
|
VACCTIMESMINUSI1(UChi_11,psi_21,Z1)\
|
||||||
VACCTIMESMINUSI1(UChi_02,result_32,Z5)\
|
VACCTIMESMINUSI1(UChi_02,psi_32,Z5)\
|
||||||
VACCTIMESMINUSI1(UChi_12,result_22,Z2)\
|
VACCTIMESMINUSI1(UChi_12,psi_22,Z2)\
|
||||||
VACCTIMESMINUSI2(UChi_10,result_20,Z0)\
|
VACCTIMESMINUSI2(UChi_10,psi_20,Z0)\
|
||||||
VACCTIMESMINUSI2(UChi_11,result_21,Z1)\
|
VACCTIMESMINUSI2(UChi_11,psi_21,Z1)\
|
||||||
VACCTIMESMINUSI2(UChi_12,result_22,Z2)\
|
VACCTIMESMINUSI2(UChi_12,psi_22,Z2)\
|
||||||
VACCTIMESMINUSI2(UChi_00,result_30,Z3)\
|
VACCTIMESMINUSI2(UChi_00,psi_30,Z3)\
|
||||||
VACCTIMESMINUSI2(UChi_01,result_31,Z4)\
|
VACCTIMESMINUSI2(UChi_01,psi_31,Z4)\
|
||||||
VACCTIMESMINUSI2(UChi_02,result_32,Z5)\
|
VACCTIMESMINUSI2(UChi_02,psi_32,Z5)\
|
||||||
);
|
);
|
||||||
|
|
||||||
#define XM_RECON __asm__ ( \
|
#define XM_RECON __asm__ ( \
|
||||||
VZERO(TMP)\
|
VZERO(TMP)\
|
||||||
VTIMESI0(UChi_00,result_30,TMP)\
|
VTIMESI0(UChi_00,psi_30,TMP)\
|
||||||
VTIMESI0(UChi_10,result_20,TMP)\
|
VTIMESI0(UChi_10,psi_20,TMP)\
|
||||||
VTIMESI0(UChi_01,result_31,TMP)\
|
VTIMESI0(UChi_01,psi_31,TMP)\
|
||||||
VTIMESI0(UChi_11,result_21,TMP)\
|
VTIMESI0(UChi_11,psi_21,TMP)\
|
||||||
VTIMESI0(UChi_02,result_32,TMP)\
|
VTIMESI0(UChi_02,psi_32,TMP)\
|
||||||
VTIMESI0(UChi_12,result_22,TMP)\
|
VTIMESI0(UChi_12,psi_22,TMP)\
|
||||||
VMOV(UChi_00,result_00)\
|
VMOV(UChi_00,psi_00)\
|
||||||
VMOV(UChi_10,result_10)\
|
VMOV(UChi_10,psi_10)\
|
||||||
VMOV(UChi_01,result_01)\
|
VMOV(UChi_01,psi_01)\
|
||||||
VMOV(UChi_11,result_11)\
|
VMOV(UChi_11,psi_11)\
|
||||||
VMOV(UChi_02,result_02)\
|
VMOV(UChi_02,psi_02)\
|
||||||
VMOV(UChi_12,result_12)\
|
VMOV(UChi_12,psi_12)\
|
||||||
VTIMESI1(UChi_00,result_30,TMP)\
|
VTIMESI1(UChi_00,psi_30,TMP)\
|
||||||
VTIMESI1(UChi_10,result_20,TMP)\
|
VTIMESI1(UChi_10,psi_20,TMP)\
|
||||||
VTIMESI1(UChi_01,result_31,TMP)\
|
VTIMESI1(UChi_01,psi_31,TMP)\
|
||||||
VTIMESI1(UChi_11,result_21,TMP)\
|
VTIMESI1(UChi_11,psi_21,TMP)\
|
||||||
VTIMESI1(UChi_02,result_32,TMP)\
|
VTIMESI1(UChi_02,psi_32,TMP)\
|
||||||
VTIMESI1(UChi_12,result_22,TMP)\
|
VTIMESI1(UChi_12,psi_22,TMP)\
|
||||||
VTIMESI2(UChi_10,result_20,TMP)\
|
VTIMESI2(UChi_10,psi_20,TMP)\
|
||||||
VTIMESI2(UChi_11,result_21,TMP)\
|
VTIMESI2(UChi_11,psi_21,TMP)\
|
||||||
VTIMESI2(UChi_12,result_22,TMP)\
|
VTIMESI2(UChi_12,psi_22,TMP)\
|
||||||
VTIMESI2(UChi_00,result_30,TMP)\
|
VTIMESI2(UChi_00,psi_30,TMP)\
|
||||||
VTIMESI2(UChi_01,result_31,TMP)\
|
VTIMESI2(UChi_01,psi_31,TMP)\
|
||||||
VTIMESI2(UChi_02,result_32,TMP)\
|
VTIMESI2(UChi_02,psi_32,TMP)\
|
||||||
);
|
);
|
||||||
|
|
||||||
#define XM_RECON_ACCUM __asm__ ( \
|
#define XM_RECON_ACCUM __asm__ ( \
|
||||||
VACCTIMESI0(UChi_10,result_20,Z0)\
|
VACCTIMESI0(UChi_10,psi_20,Z0)\
|
||||||
VACCTIMESI0(UChi_00,result_30,Z3)\
|
VACCTIMESI0(UChi_00,psi_30,Z3)\
|
||||||
VACCTIMESI0(UChi_11,result_21,Z1)\
|
VACCTIMESI0(UChi_11,psi_21,Z1)\
|
||||||
VACCTIMESI0(UChi_01,result_31,Z4)\
|
VACCTIMESI0(UChi_01,psi_31,Z4)\
|
||||||
VACCTIMESI0(UChi_12,result_22,Z2)\
|
VACCTIMESI0(UChi_12,psi_22,Z2)\
|
||||||
VACCTIMESI0(UChi_02,result_32,Z5)\
|
VACCTIMESI0(UChi_02,psi_32,Z5)\
|
||||||
\
|
\
|
||||||
VADD(UChi_10,result_10,result_10)\
|
VADD(UChi_10,psi_10,psi_10)\
|
||||||
VADD(UChi_00,result_00,result_00)\
|
VADD(UChi_00,psi_00,psi_00)\
|
||||||
VADD(UChi_11,result_11,result_11)\
|
VADD(UChi_11,psi_11,psi_11)\
|
||||||
VADD(UChi_01,result_01,result_01)\
|
VADD(UChi_01,psi_01,psi_01)\
|
||||||
VADD(UChi_12,result_12,result_12)\
|
VADD(UChi_12,psi_12,psi_12)\
|
||||||
VADD(UChi_02,result_02,result_02)\
|
VADD(UChi_02,psi_02,psi_02)\
|
||||||
\
|
\
|
||||||
VACCTIMESI1(UChi_10,result_20,Z0)\
|
VACCTIMESI1(UChi_10,psi_20,Z0)\
|
||||||
VACCTIMESI1(UChi_00,result_30,Z3)\
|
VACCTIMESI1(UChi_00,psi_30,Z3)\
|
||||||
VACCTIMESI1(UChi_11,result_21,Z1)\
|
VACCTIMESI1(UChi_11,psi_21,Z1)\
|
||||||
VACCTIMESI1(UChi_01,result_31,Z4)\
|
VACCTIMESI1(UChi_01,psi_31,Z4)\
|
||||||
VACCTIMESI1(UChi_12,result_22,Z2)\
|
VACCTIMESI1(UChi_12,psi_22,Z2)\
|
||||||
VACCTIMESI1(UChi_02,result_32,Z5)\
|
VACCTIMESI1(UChi_02,psi_32,Z5)\
|
||||||
VACCTIMESI2(UChi_10,result_20,Z0)\
|
VACCTIMESI2(UChi_10,psi_20,Z0)\
|
||||||
VACCTIMESI2(UChi_11,result_21,Z1)\
|
VACCTIMESI2(UChi_11,psi_21,Z1)\
|
||||||
VACCTIMESI2(UChi_12,result_22,Z2)\
|
VACCTIMESI2(UChi_12,psi_22,Z2)\
|
||||||
VACCTIMESI2(UChi_00,result_30,Z3)\
|
VACCTIMESI2(UChi_00,psi_30,Z3)\
|
||||||
VACCTIMESI2(UChi_01,result_31,Z4)\
|
VACCTIMESI2(UChi_01,psi_31,Z4)\
|
||||||
VACCTIMESI2(UChi_02,result_32,Z5)\
|
VACCTIMESI2(UChi_02,psi_32,Z5)\
|
||||||
);
|
);
|
||||||
|
|
||||||
#define YP_RECON_ACCUM __asm__ ( \
|
#define YP_RECON_ACCUM __asm__ ( \
|
||||||
VADD(UChi_00,result_00,result_00)\
|
VADD(UChi_00,psi_00,psi_00)\
|
||||||
VADD(UChi_10,result_10,result_10)\
|
VADD(UChi_10,psi_10,psi_10)\
|
||||||
VADD(UChi_01,result_01,result_01)\
|
VADD(UChi_01,psi_01,psi_01)\
|
||||||
VADD(UChi_11,result_11,result_11)\
|
VADD(UChi_11,psi_11,psi_11)\
|
||||||
VADD(UChi_02,result_02,result_02)\
|
VADD(UChi_02,psi_02,psi_02)\
|
||||||
VADD(UChi_12,result_12,result_12)\
|
VADD(UChi_12,psi_12,psi_12)\
|
||||||
VADD(UChi_10,result_20,result_20)\
|
VADD(UChi_10,psi_20,psi_20)\
|
||||||
VADD(UChi_11,result_21,result_21)\
|
VADD(UChi_11,psi_21,psi_21)\
|
||||||
VADD(UChi_12,result_22,result_22)\
|
VADD(UChi_12,psi_22,psi_22)\
|
||||||
VSUB(UChi_00,result_30,result_30)\
|
VSUB(UChi_00,psi_30,psi_30)\
|
||||||
VSUB(UChi_01,result_31,result_31)\
|
VSUB(UChi_01,psi_31,psi_31)\
|
||||||
VSUB(UChi_02,result_32,result_32) );
|
VSUB(UChi_02,psi_32,psi_32) );
|
||||||
|
|
||||||
#define YM_RECON_ACCUM __asm__ ( \
|
#define YM_RECON_ACCUM __asm__ ( \
|
||||||
VADD(UChi_00,result_00,result_00)\
|
VADD(UChi_00,psi_00,psi_00)\
|
||||||
VADD(UChi_10,result_10,result_10)\
|
VADD(UChi_10,psi_10,psi_10)\
|
||||||
VADD(UChi_01,result_01,result_01)\
|
VADD(UChi_01,psi_01,psi_01)\
|
||||||
VADD(UChi_11,result_11,result_11)\
|
VADD(UChi_11,psi_11,psi_11)\
|
||||||
VADD(UChi_02,result_02,result_02)\
|
VADD(UChi_02,psi_02,psi_02)\
|
||||||
VADD(UChi_12,result_12,result_12)\
|
VADD(UChi_12,psi_12,psi_12)\
|
||||||
VSUB(UChi_10,result_20,result_20)\
|
VSUB(UChi_10,psi_20,psi_20)\
|
||||||
VSUB(UChi_11,result_21,result_21)\
|
VSUB(UChi_11,psi_21,psi_21)\
|
||||||
VSUB(UChi_12,result_22,result_22)\
|
VSUB(UChi_12,psi_22,psi_22)\
|
||||||
VADD(UChi_00,result_30,result_30)\
|
VADD(UChi_00,psi_30,psi_30)\
|
||||||
VADD(UChi_01,result_31,result_31)\
|
VADD(UChi_01,psi_31,psi_31)\
|
||||||
VADD(UChi_02,result_32,result_32) );
|
VADD(UChi_02,psi_32,psi_32) );
|
||||||
|
|
||||||
#define ZP_RECON_ACCUM __asm__ ( \
|
#define ZP_RECON_ACCUM __asm__ ( \
|
||||||
VACCTIMESMINUSI0(UChi_00,result_20,Z0)\
|
VACCTIMESMINUSI0(UChi_00,psi_20,Z0)\
|
||||||
VACCTIMESI0(UChi_10,result_30,Z3)\
|
VACCTIMESI0(UChi_10,psi_30,Z3)\
|
||||||
VACCTIMESMINUSI0(UChi_01,result_21,Z1)\
|
VACCTIMESMINUSI0(UChi_01,psi_21,Z1)\
|
||||||
VACCTIMESI0(UChi_11,result_31,Z4)\
|
VACCTIMESI0(UChi_11,psi_31,Z4)\
|
||||||
VACCTIMESMINUSI0(UChi_02,result_22,Z2)\
|
VACCTIMESMINUSI0(UChi_02,psi_22,Z2)\
|
||||||
VACCTIMESI0(UChi_12,result_32,Z5)\
|
VACCTIMESI0(UChi_12,psi_32,Z5)\
|
||||||
VADD(UChi_00,result_00,result_00)\
|
VADD(UChi_00,psi_00,psi_00)\
|
||||||
VADD(UChi_10,result_10,result_10)\
|
VADD(UChi_10,psi_10,psi_10)\
|
||||||
VADD(UChi_01,result_01,result_01)\
|
VADD(UChi_01,psi_01,psi_01)\
|
||||||
VADD(UChi_11,result_11,result_11)\
|
VADD(UChi_11,psi_11,psi_11)\
|
||||||
VADD(UChi_02,result_02,result_02)\
|
VADD(UChi_02,psi_02,psi_02)\
|
||||||
VADD(UChi_12,result_12,result_12)\
|
VADD(UChi_12,psi_12,psi_12)\
|
||||||
VACCTIMESMINUSI1(UChi_00,result_20,Z0)\
|
VACCTIMESMINUSI1(UChi_00,psi_20,Z0)\
|
||||||
VACCTIMESI1(UChi_10,result_30,Z3)\
|
VACCTIMESI1(UChi_10,psi_30,Z3)\
|
||||||
VACCTIMESMINUSI1(UChi_01,result_21,Z1)\
|
VACCTIMESMINUSI1(UChi_01,psi_21,Z1)\
|
||||||
VACCTIMESI1(UChi_11,result_31,Z4)\
|
VACCTIMESI1(UChi_11,psi_31,Z4)\
|
||||||
VACCTIMESMINUSI1(UChi_02,result_22,Z2)\
|
VACCTIMESMINUSI1(UChi_02,psi_22,Z2)\
|
||||||
VACCTIMESI1(UChi_12,result_32,Z5)\
|
VACCTIMESI1(UChi_12,psi_32,Z5)\
|
||||||
VACCTIMESMINUSI2(UChi_00,result_20,Z0)\
|
VACCTIMESMINUSI2(UChi_00,psi_20,Z0)\
|
||||||
VACCTIMESMINUSI2(UChi_01,result_21,Z1)\
|
VACCTIMESMINUSI2(UChi_01,psi_21,Z1)\
|
||||||
VACCTIMESMINUSI2(UChi_02,result_22,Z2)\
|
VACCTIMESMINUSI2(UChi_02,psi_22,Z2)\
|
||||||
VACCTIMESI2(UChi_10,result_30,Z3)\
|
VACCTIMESI2(UChi_10,psi_30,Z3)\
|
||||||
VACCTIMESI2(UChi_11,result_31,Z4)\
|
VACCTIMESI2(UChi_11,psi_31,Z4)\
|
||||||
VACCTIMESI2(UChi_12,result_32,Z5)\
|
VACCTIMESI2(UChi_12,psi_32,Z5)\
|
||||||
);
|
);
|
||||||
|
|
||||||
#define ZM_RECON_ACCUM __asm__ ( \
|
#define ZM_RECON_ACCUM __asm__ ( \
|
||||||
VACCTIMESI0(UChi_00,result_20,Z0)\
|
VACCTIMESI0(UChi_00,psi_20,Z0)\
|
||||||
VACCTIMESMINUSI0(UChi_10,result_30,Z3)\
|
VACCTIMESMINUSI0(UChi_10,psi_30,Z3)\
|
||||||
VACCTIMESI0(UChi_01,result_21,Z1)\
|
VACCTIMESI0(UChi_01,psi_21,Z1)\
|
||||||
VACCTIMESMINUSI0(UChi_11,result_31,Z4)\
|
VACCTIMESMINUSI0(UChi_11,psi_31,Z4)\
|
||||||
VACCTIMESI0(UChi_02,result_22,Z2)\
|
VACCTIMESI0(UChi_02,psi_22,Z2)\
|
||||||
VACCTIMESMINUSI0(UChi_12,result_32,Z5)\
|
VACCTIMESMINUSI0(UChi_12,psi_32,Z5)\
|
||||||
VADD(UChi_00,result_00,result_00)\
|
VADD(UChi_00,psi_00,psi_00)\
|
||||||
VADD(UChi_10,result_10,result_10)\
|
VADD(UChi_10,psi_10,psi_10)\
|
||||||
VADD(UChi_01,result_01,result_01)\
|
VADD(UChi_01,psi_01,psi_01)\
|
||||||
VADD(UChi_11,result_11,result_11)\
|
VADD(UChi_11,psi_11,psi_11)\
|
||||||
VADD(UChi_02,result_02,result_02)\
|
VADD(UChi_02,psi_02,psi_02)\
|
||||||
VADD(UChi_12,result_12,result_12)\
|
VADD(UChi_12,psi_12,psi_12)\
|
||||||
VACCTIMESI1(UChi_00,result_20,Z0)\
|
VACCTIMESI1(UChi_00,psi_20,Z0)\
|
||||||
VACCTIMESMINUSI1(UChi_10,result_30,Z3)\
|
VACCTIMESMINUSI1(UChi_10,psi_30,Z3)\
|
||||||
VACCTIMESI1(UChi_01,result_21,Z1)\
|
VACCTIMESI1(UChi_01,psi_21,Z1)\
|
||||||
VACCTIMESMINUSI1(UChi_11,result_31,Z4)\
|
VACCTIMESMINUSI1(UChi_11,psi_31,Z4)\
|
||||||
VACCTIMESI1(UChi_02,result_22,Z2)\
|
VACCTIMESI1(UChi_02,psi_22,Z2)\
|
||||||
VACCTIMESMINUSI1(UChi_12,result_32,Z5)\
|
VACCTIMESMINUSI1(UChi_12,psi_32,Z5)\
|
||||||
VACCTIMESI2(UChi_00,result_20,Z0)\
|
VACCTIMESI2(UChi_00,psi_20,Z0)\
|
||||||
VACCTIMESI2(UChi_01,result_21,Z1)\
|
VACCTIMESI2(UChi_01,psi_21,Z1)\
|
||||||
VACCTIMESI2(UChi_02,result_22,Z2)\
|
VACCTIMESI2(UChi_02,psi_22,Z2)\
|
||||||
VACCTIMESMINUSI2(UChi_10,result_30,Z3)\
|
VACCTIMESMINUSI2(UChi_10,psi_30,Z3)\
|
||||||
VACCTIMESMINUSI2(UChi_11,result_31,Z4)\
|
VACCTIMESMINUSI2(UChi_11,psi_31,Z4)\
|
||||||
VACCTIMESMINUSI2(UChi_12,result_32,Z5)\
|
VACCTIMESMINUSI2(UChi_12,psi_32,Z5)\
|
||||||
);
|
);
|
||||||
|
|
||||||
#define TP_RECON_ACCUM __asm__ ( \
|
#define TP_RECON_ACCUM __asm__ ( \
|
||||||
VADD(UChi_00,result_00,result_00)\
|
VADD(UChi_00,psi_00,psi_00)\
|
||||||
VADD(UChi_10,result_10,result_10)\
|
VADD(UChi_10,psi_10,psi_10)\
|
||||||
VADD(UChi_01,result_01,result_01)\
|
VADD(UChi_01,psi_01,psi_01)\
|
||||||
VADD(UChi_11,result_11,result_11)\
|
VADD(UChi_11,psi_11,psi_11)\
|
||||||
VADD(UChi_02,result_02,result_02)\
|
VADD(UChi_02,psi_02,psi_02)\
|
||||||
VADD(UChi_12,result_12,result_12)\
|
VADD(UChi_12,psi_12,psi_12)\
|
||||||
VADD(UChi_00,result_20,result_20)\
|
VADD(UChi_00,psi_20,psi_20)\
|
||||||
VADD(UChi_10,result_30,result_30)\
|
VADD(UChi_10,psi_30,psi_30)\
|
||||||
VADD(UChi_01,result_21,result_21)\
|
VADD(UChi_01,psi_21,psi_21)\
|
||||||
VADD(UChi_11,result_31,result_31)\
|
VADD(UChi_11,psi_31,psi_31)\
|
||||||
VADD(UChi_02,result_22,result_22)\
|
VADD(UChi_02,psi_22,psi_22)\
|
||||||
VADD(UChi_12,result_32,result_32) );
|
VADD(UChi_12,psi_32,psi_32) );
|
||||||
|
|
||||||
#define TM_RECON_ACCUM __asm__ ( \
|
#define TM_RECON_ACCUM __asm__ ( \
|
||||||
VADD(UChi_00,result_00,result_00)\
|
VADD(UChi_00,psi_00,psi_00)\
|
||||||
VADD(UChi_10,result_10,result_10)\
|
VADD(UChi_10,psi_10,psi_10)\
|
||||||
VADD(UChi_01,result_01,result_01)\
|
VADD(UChi_01,psi_01,psi_01)\
|
||||||
VADD(UChi_11,result_11,result_11)\
|
VADD(UChi_11,psi_11,psi_11)\
|
||||||
VADD(UChi_02,result_02,result_02)\
|
VADD(UChi_02,psi_02,psi_02)\
|
||||||
VADD(UChi_12,result_12,result_12)\
|
VADD(UChi_12,psi_12,psi_12)\
|
||||||
VSUB(UChi_00,result_20,result_20)\
|
VSUB(UChi_00,psi_20,psi_20)\
|
||||||
VSUB(UChi_10,result_30,result_30)\
|
VSUB(UChi_10,psi_30,psi_30)\
|
||||||
VSUB(UChi_01,result_21,result_21)\
|
VSUB(UChi_01,psi_21,psi_21)\
|
||||||
VSUB(UChi_11,result_31,result_31)\
|
VSUB(UChi_11,psi_31,psi_31)\
|
||||||
VSUB(UChi_02,result_22,result_22)\
|
VSUB(UChi_02,psi_22,psi_22)\
|
||||||
VSUB(UChi_12,result_32,result_32) );
|
VSUB(UChi_12,psi_32,psi_32) );
|
||||||
|
|
||||||
#define AVX512_PF_L1
|
#define AVX512_PF_L1
|
||||||
#define AVX512_PF_L2_GAUGE
|
#define AVX512_PF_L2_GAUGE
|
||||||
@ -582,20 +571,60 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
LOAD64(%r8,PTR) \
|
LOAD64(%r8,PTR) \
|
||||||
LOAD64(%r9,pf) \
|
LOAD64(%r9,pf) \
|
||||||
__asm__ ( \
|
__asm__ ( \
|
||||||
VSTORE(0,%r8,result_00) VPREFETCH_M1(0,%r9) \
|
VSTORE(0,%r8,psi_00) VPREFETCH_M1(0,%r9) \
|
||||||
VSTORE(1,%r8,result_01) VPREFETCH_M1(1,%r9) \
|
VSTORE(1,%r8,psi_01) VPREFETCH_M1(1,%r9) \
|
||||||
VSTORE(2,%r8,result_02) VPREFETCH_M1(2,%r9) \
|
VSTORE(2,%r8,psi_02) VPREFETCH_M1(2,%r9) \
|
||||||
VSTORE(3,%r8,result_10) VPREFETCH_M1(3,%r9) \
|
VSTORE(3,%r8,psi_10) VPREFETCH_M1(3,%r9) \
|
||||||
VSTORE(4,%r8,result_11) VPREFETCH_M1(4,%r9) \
|
VSTORE(4,%r8,psi_11) VPREFETCH_M1(4,%r9) \
|
||||||
VSTORE(5,%r8,result_12) VPREFETCH_M1(5,%r9) \
|
VSTORE(5,%r8,psi_12) VPREFETCH_M1(5,%r9) \
|
||||||
VSTORE(6,%r8,result_20) VPREFETCH_M1(6,%r9) \
|
VSTORE(6,%r8,psi_20) VPREFETCH_M1(6,%r9) \
|
||||||
VSTORE(7,%r8,result_21) VPREFETCH_M1(7,%r9) \
|
VSTORE(7,%r8,psi_21) VPREFETCH_M1(7,%r9) \
|
||||||
VSTORE(8,%r8,result_22) VPREFETCH_M1(8,%r9) \
|
VSTORE(8,%r8,psi_22) VPREFETCH_M1(8,%r9) \
|
||||||
VSTORE(9,%r8,result_30) VPREFETCH_M1(9,%r9) \
|
VSTORE(9,%r8,psi_30) VPREFETCH_M1(9,%r9) \
|
||||||
VSTORE(10,%r8,result_31) VPREFETCH_M1(10,%r9) \
|
VSTORE(10,%r8,psi_31) VPREFETCH_M1(10,%r9) \
|
||||||
VSTORE(11,%r8,result_32) VPREFETCH_M1(11,%r9) \
|
VSTORE(11,%r8,psi_32) VPREFETCH_M1(11,%r9) \
|
||||||
);
|
);
|
||||||
|
|
||||||
|
#define ADD_RESULTi(PTR,pf) \
|
||||||
|
LOAD_CHIMU(PTR); \
|
||||||
|
asm(VADD(psi_00,Chimu_00,psi_00) VADD(psi_01,Chimu_01,psi_01) VADD(psi_02,Chimu_02,psi_02) \
|
||||||
|
VADD(psi_10,Chimu_10,psi_10) VADD(psi_11,Chimu_11,psi_11) VADD(psi_12,Chimu_12,psi_12) \
|
||||||
|
VADD(psi_20,Chimu_20,psi_20) VADD(psi_21,Chimu_21,psi_21) VADD(psi_22,Chimu_22,psi_22) \
|
||||||
|
VADD(psi_30,Chimu_30,psi_30) VADD(psi_31,Chimu_31,psi_31) VADD(psi_32,Chimu_32,psi_32) ); \
|
||||||
|
SAVE_RESULT(PTR,pf);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#define ADD_RESULTia(PTR,pf) \
|
||||||
|
LOAD64(%r8,PTR) \
|
||||||
|
__asm__ ( \
|
||||||
|
VADDMEM(0,%r8,psi_00,psi_00) \
|
||||||
|
VADDMEM(1,%r8,psi_01,psi_01) \
|
||||||
|
VADDMEM(2,%r8,psi_02,psi_02) \
|
||||||
|
VADDMEM(3,%r8,psi_10,psi_10) \
|
||||||
|
VADDMEM(4,%r8,psi_11,psi_11) \
|
||||||
|
VADDMEM(5,%r8,psi_12,psi_12) \
|
||||||
|
VADDMEM(6,%r8,psi_20,psi_20) \
|
||||||
|
VADDMEM(7,%r8,psi_21,psi_21) \
|
||||||
|
VADDMEM(8,%r8,psi_22,psi_22) \
|
||||||
|
VADDMEM(9,%r8,psi_30,psi_30) \
|
||||||
|
VADDMEM(10,%r8,psi_31,psi_31) \
|
||||||
|
VADDMEM(11,%r8,psi_32,psi_32) \
|
||||||
|
VSTORE(0,%r8,psi_00) \
|
||||||
|
VSTORE(1,%r8,psi_01) \
|
||||||
|
VSTORE(2,%r8,psi_02) \
|
||||||
|
VSTORE(3,%r8,psi_10) \
|
||||||
|
VSTORE(4,%r8,psi_11) \
|
||||||
|
VSTORE(5,%r8,psi_12) \
|
||||||
|
VSTORE(6,%r8,psi_20) \
|
||||||
|
VSTORE(7,%r8,psi_21) \
|
||||||
|
VSTORE(8,%r8,psi_22) \
|
||||||
|
VSTORE(9,%r8,psi_30) \
|
||||||
|
VSTORE(10,%r8,psi_31) \
|
||||||
|
VSTORE(11,%r8,psi_32) \
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
#ifdef AVX512_PF_L2_TABLE
|
#ifdef AVX512_PF_L2_TABLE
|
||||||
#define PREFETCH_CHIMU(A) \
|
#define PREFETCH_CHIMU(A) \
|
||||||
LOAD64(%r9,A) \
|
LOAD64(%r9,A) \
|
||||||
|
Loading…
Reference in New Issue
Block a user