/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/Avx512Asm.h

    Copyright (C) 2015

    Author: paboyle

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */
#ifndef GRID_ASM_INTEL_512_QCD_H
#define GRID_ASM_INTEL_512_QCD_H

//////////////////////////////////////////////////////////////////////////////////////////
// Register allocations for the Wilson kernel are precision independent
//////////////////////////////////////////////////////////////////////////////////////////
#define psi_00 %zmm0
#define psi_01 %zmm1
#define psi_02 %zmm2

#define psi_10 %zmm3
#define psi_11 %zmm4
#define psi_12 %zmm5

#define psi_20 %zmm6
#define psi_21 %zmm7
#define psi_22 %zmm8

#define psi_30 %zmm9
#define psi_31 %zmm10
#define psi_32 %zmm11

#define Chi_00 %zmm12
#define Chi_01 %zmm13
#define Chi_02 %zmm14

#define Chi_10 %zmm15
#define Chi_11 %zmm16
#define Chi_12 %zmm17

#define UChi_00 %zmm18
#define UChi_01 %zmm19
#define UChi_02 %zmm20

#define UChi_10 %zmm21
#define UChi_11 %zmm22
#define UChi_12 %zmm23

#define Uir %zmm24
#define Uri %zmm25
#define T1  %zmm24
#define T2  %zmm25

#define Z0 %zmm26
#define Z1 %zmm27
#define Z2 %zmm28
#define Z3 %zmm29
#define Z4 %zmm30
#define Z5 %zmm31

#define TMP Chi_00

#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12

#include "Intel512common.h"
#include "Intel512avx.h"

//////////////////////////////////////////////////////////////////
// Macros used to build the Wilson kernel -- these could be rationalised and
// simplified a little, as some duplication crept in while trying different
// variants during optimisation. Could be cut back to only those actually used.
//////////////////////////////////////////////////////////////////
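//////////////////////////////////////////////////////////////////
// Scalar sketch (illustration only, compiled out) of how the kernel composes
// the macro families below for one direction: project the 4-spinor down to a
// half spinor, apply the SU(3) link to both spin rows, then reconstruct and
// accumulate. The helper names (spProjXp, accumReconXp, operator*) are
// assumptions used for exposition; the real macros act on whole SIMD vectors
// of sites held in the registers defined above.
//////////////////////////////////////////////////////////////////
#if 0
template<class Spinor,class HalfSpinor,class GaugeLink>
inline void wilson_dir_xp_sketch(Spinor &psi,const Spinor &chimu,const GaugeLink &U)
{
  HalfSpinor chi  = spProjXp(chimu); // XP_PROJMEM : hspin(0)=fspin(0)+i fspin(3), hspin(1)=fspin(1)+i fspin(2)
  HalfSpinor Uchi = U*chi;           // MULT_2SPIN : 3x3 colour matrix applied to both spin rows
  accumReconXp(psi,Uchi);            // XP_RECON_ACCUM : lift back to a 4-spinor and add into psi
}
#endif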
#define LOCK_GAUGE(dir)
#define UNLOCK_GAUGE(dir)

// const SiteSpinor * ptr = & in[offset];
#define LOAD_CHIMU(PTR)   LOAD64(%r8,PTR)  __asm__ ( LOAD_CHIMUi );
#define LOAD_CHI(PTR)     LOAD64(%r8,PTR)  __asm__ ( LOAD_CHIi );
#define SAVE_UCHI(PTR)    SAVE_UCHIi(PTR)
#define SAVE_CHI(PTR)     SAVE_CHIi(PTR)
#define SAVE_RESULT(PT,R) SAVE_RESULTi(PT,R)
#define ADD_RESULT(PT,R)  ADD_RESULTi(PT,R)

#define ZERO_PSI       \
  asm( VZERO(psi_00)   \
       VZERO(psi_01)   \
       VZERO(psi_02)   \
       VZERO(psi_10)   \
       VZERO(psi_11)   \
       VZERO(psi_12)   \
       VZERO(psi_20)   \
       VZERO(psi_21)   \
       VZERO(psi_22)   \
       VZERO(psi_30)   \
       VZERO(psi_31)   \
       VZERO(psi_32));

#define LOAD_CHIMUi \
  LOAD_CHIMU01i     \
  LOAD_CHIMU23i

#define LOAD_CHIMU01i     \
  VLOAD(0,%r8,Chimu_00)   \
  VLOAD(1,%r8,Chimu_01)   \
  VLOAD(2,%r8,Chimu_02)   \
  VLOAD(3,%r8,Chimu_10)   \
  VLOAD(4,%r8,Chimu_11)   \
  VLOAD(5,%r8,Chimu_12)

#define LOAD_CHIMU23i     \
  VLOAD(6,%r8,Chimu_20)   \
  VLOAD(7,%r8,Chimu_21)   \
  VLOAD(8,%r8,Chimu_22)   \
  VLOAD(9,%r8,Chimu_30)   \
  VLOAD(10,%r8,Chimu_31)  \
  VLOAD(11,%r8,Chimu_32)

#define SHUF_CHIMU23i       \
  VSHUFMEM(6,%r8,Chimu_20)  \
  VSHUFMEM(7,%r8,Chimu_21)  \
  VSHUFMEM(8,%r8,Chimu_22)  \
  VSHUFMEM(9,%r8,Chimu_30)  \
  VSHUFMEM(10,%r8,Chimu_31) \
  VSHUFMEM(11,%r8,Chimu_32)

#define LOAD_CHIi        \
  VLOAD(0,%r8,Chi_00)    \
  VLOAD(1,%r8,Chi_01)    \
  VLOAD(2,%r8,Chi_02)    \
  VLOAD(3,%r8,Chi_10)    \
  VLOAD(4,%r8,Chi_11)    \
  VLOAD(5,%r8,Chi_12)

#define SAVE_UCHIi(PTR)    \
  LOAD64(%r8,PTR)          \
  __asm__ (                \
  VSTORE(0,%r8,UChi_00)    \
  VSTORE(1,%r8,UChi_01)    \
  VSTORE(2,%r8,UChi_02)    \
  VSTORE(3,%r8,UChi_10)    \
  VSTORE(4,%r8,UChi_11)    \
  VSTORE(5,%r8,UChi_12)    );

#define SAVE_CHIi(PTR)     \
  LOAD64(%r8,PTR)          \
  __asm__ (                \
  VSTORE(0,%r8,Chi_00)     \
  VSTORE(1,%r8,Chi_01)     \
  VSTORE(2,%r8,Chi_02)     \
  VSTORE(3,%r8,Chi_10)     \
  VSTORE(4,%r8,Chi_11)     \
  VSTORE(5,%r8,Chi_12)     );

#define MULT_2SPIN_DIR_PF(A,p) MULT_2SPIN_PF(&U[sU](A),p)
#define MULT_2SPIN_PF(ptr,pf)  MULT_2SPIN(ptr,pf)

//////////////////////////////////////////////////////////////////
// Dirac algebra
//////////////////////////////////////////////////////////////////
//      hspin(0)=fspin(0)+timesI(fspin(3));
//      hspin(1)=fspin(1)+timesI(fspin(2));
#define XP_PROJMEM(PTR)                       \
  LOAD64(%r8,PTR)                             \
  __asm__ (                                   \
    LOAD_CHIi                                 \
    SHUF_CHIMU23i                             \
    VACCTIMESI1(Chi_00,Chi_00,Chimu_30)       \
    VACCTIMESI1(Chi_01,Chi_01,Chimu_31)       \
    VACCTIMESI1(Chi_02,Chi_02,Chimu_32)       \
    VACCTIMESI1(Chi_10,Chi_10,Chimu_20)       \
    VACCTIMESI1(Chi_11,Chi_11,Chimu_21)       \
    VACCTIMESI1(Chi_12,Chi_12,Chimu_22)       \
    VACCTIMESI2(Chi_00,Chi_00,Chimu_30)       \
    VACCTIMESI2(Chi_01,Chi_01,Chimu_31)       \
    VACCTIMESI2(Chi_02,Chi_02,Chimu_32)       \
    VACCTIMESI2(Chi_10,Chi_10,Chimu_20)       \
    VACCTIMESI2(Chi_11,Chi_11,Chimu_21)       \
    VACCTIMESI2(Chi_12,Chi_12,Chimu_22)       );

#define YP_PROJMEM(ptr)                \
  LOAD64(%r8,ptr)                      \
  __asm__ (                            \
    LOAD_CHIMU01i                      \
    VSUBMEM(9,%r8,Chimu_00,Chi_00)     \
    VSUBMEM(10,%r8,Chimu_01,Chi_01)    \
    VSUBMEM(11,%r8,Chimu_02,Chi_02)    \
    VADDMEM(6,%r8,Chimu_10,Chi_10)     \
    VADDMEM(7,%r8,Chimu_11,Chi_11)     \
    VADDMEM(8,%r8,Chimu_12,Chi_12)     );

#define ZP_PROJMEM(PTR)                         \
  LOAD64(%r8,PTR)                               \
  __asm__ (                                     \
    LOAD_CHIi                                   \
    SHUF_CHIMU23i                               \
    VACCTIMESI1(Chi_00,Chi_00,Chimu_20)         \
    VACCTIMESI1(Chi_01,Chi_01,Chimu_21)         \
    VACCTIMESI1(Chi_02,Chi_02,Chimu_22)         \
    VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_30)    \
    VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_31)    \
    VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_32)    \
    VACCTIMESI2(Chi_00,Chi_00,Chimu_20)         \
    VACCTIMESI2(Chi_01,Chi_01,Chimu_21)         \
    VACCTIMESI2(Chi_02,Chi_02,Chimu_22)         \
    VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_30)    \
    VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_31)    \
    VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_32)    );
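//////////////////////////////////////////////////////////////////
// Scalar reference for the spin projections above and below (illustration
// only, compiled out). Each *_PROJMEM macro does this per colour component,
// on whole SIMD sites; std::complex is an assumption used here for clarity.
// X-,Z+,Z- differ only in the sign of i and in which lower spin component
// pairs with which upper one; Y and T use +/-1 instead of +/-i.
//////////////////////////////////////////////////////////////////
#if 0
#include <complex>
typedef std::complex<double> cplx;
inline void XpProjRef(cplx hspin[2][3],const cplx fspin[4][3])
{
  const cplx I(0.0,1.0);
  for(int c=0;c<3;c++){
    hspin[0][c] = fspin[0][c] + I*fspin[3][c];  // hspin(0)=fspin(0)+timesI(fspin(3))
    hspin[1][c] = fspin[1][c] + I*fspin[2][c];  // hspin(1)=fspin(1)+timesI(fspin(2))
  }
}
#endif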
#define TP_PROJMEM(ptr)                \
  LOAD64(%r8,ptr)                      \
  __asm__ (                            \
    LOAD_CHIMU01i                      \
    VADDMEM(6,%r8,Chimu_00,Chi_00)     \
    VADDMEM(7,%r8,Chimu_01,Chi_01)     \
    VADDMEM(8,%r8,Chimu_02,Chi_02)     \
    VADDMEM(9,%r8,Chimu_10,Chi_10)     \
    VADDMEM(10,%r8,Chimu_11,Chi_11)    \
    VADDMEM(11,%r8,Chimu_12,Chi_12)    );

//      hspin(0)=fspin(0)-timesI(fspin(3))
//      hspin(1)=fspin(1)-timesI(fspin(2))
#define XM_PROJMEM(PTR)                         \
  LOAD64(%r8,PTR)                               \
  __asm__ (                                     \
    LOAD_CHIi                                   \
    SHUF_CHIMU23i                               \
    VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30)    \
    VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31)    \
    VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32)    \
    VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_20)    \
    VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_21)    \
    VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_22)    \
    VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_30)    \
    VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_31)    \
    VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_32)    \
    VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_20)    \
    VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_21)    \
    VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_22)    );

#define YM_PROJMEM(ptr)                \
  LOAD64(%r8,ptr)                      \
  __asm__ (                            \
    LOAD_CHIMU01i                      \
    VADDMEM(9,%r8,Chimu_00,Chi_00)     \
    VADDMEM(10,%r8,Chimu_01,Chi_01)    \
    VADDMEM(11,%r8,Chimu_02,Chi_02)    \
    VSUBMEM(6,%r8,Chimu_10,Chi_10)     \
    VSUBMEM(7,%r8,Chimu_11,Chi_11)     \
    VSUBMEM(8,%r8,Chimu_12,Chi_12)     );

#define ZM_PROJMEM(PTR)                         \
  LOAD64(%r8,PTR)                               \
  __asm__ (                                     \
    LOAD_CHIi                                   \
    SHUF_CHIMU23i                               \
    VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20)    \
    VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21)    \
    VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22)    \
    VACCTIMESI1(Chi_10,Chi_10,Chimu_30)         \
    VACCTIMESI1(Chi_11,Chi_11,Chimu_31)         \
    VACCTIMESI1(Chi_12,Chi_12,Chimu_32)         \
    VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_20)    \
    VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_21)    \
    VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_22)    \
    VACCTIMESI2(Chi_10,Chi_10,Chimu_30)         \
    VACCTIMESI2(Chi_11,Chi_11,Chimu_31)         \
    VACCTIMESI2(Chi_12,Chi_12,Chimu_32)         );

#define TM_PROJMEM(ptr)                \
  LOAD64(%r8,ptr)                      \
  __asm__ (                            \
    LOAD_CHIMU01i                      \
    VSUBMEM(6,%r8,Chimu_00,Chi_00)     \
    VSUBMEM(7,%r8,Chimu_01,Chi_01)     \
    VSUBMEM(8,%r8,Chimu_02,Chi_02)     \
    VSUBMEM(9,%r8,Chimu_10,Chi_10)     \
    VSUBMEM(10,%r8,Chimu_11,Chi_11)    \
    VSUBMEM(11,%r8,Chimu_12,Chi_12)    );

//      fspin(0)=hspin(0)
//      fspin(1)=hspin(1)
//      fspin(2)=timesMinusI(hspin(1))
//      fspin(3)=timesMinusI(hspin(0))
#define XP_RECON __asm__ (              \
  VZERO(TMP)                            \
  VTIMESMINUSI0(UChi_00,psi_30,TMP)     \
  VTIMESMINUSI0(UChi_10,psi_20,TMP)     \
  VTIMESMINUSI0(UChi_01,psi_31,TMP)     \
  VTIMESMINUSI0(UChi_11,psi_21,TMP)     \
  VTIMESMINUSI0(UChi_02,psi_32,TMP)     \
  VTIMESMINUSI0(UChi_12,psi_22,TMP)     \
  VMOV(UChi_00,psi_00)                  \
  VMOV(UChi_10,psi_10)                  \
  VMOV(UChi_01,psi_01)                  \
  VMOV(UChi_11,psi_11)                  \
  VMOV(UChi_02,psi_02)                  \
  VMOV(UChi_12,psi_12)                  \
  VTIMESMINUSI1(UChi_10,psi_20,TMP)     \
  VTIMESMINUSI1(UChi_11,psi_21,TMP)     \
  VTIMESMINUSI1(UChi_12,psi_22,TMP)     \
  VTIMESMINUSI1(UChi_00,psi_30,TMP)     \
  VTIMESMINUSI1(UChi_01,psi_31,TMP)     \
  VTIMESMINUSI1(UChi_02,psi_32,TMP)     \
  VTIMESMINUSI2(UChi_10,psi_20,TMP)     \
  VTIMESMINUSI2(UChi_11,psi_21,TMP)     \
  VTIMESMINUSI2(UChi_12,psi_22,TMP)     \
  VTIMESMINUSI2(UChi_00,psi_30,TMP)     \
  VTIMESMINUSI2(UChi_01,psi_31,TMP)     \
  VTIMESMINUSI2(UChi_02,psi_32,TMP)     \
  );
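//////////////////////////////////////////////////////////////////
// Scalar reference for the reconstruction step (illustration only, compiled
// out). XP_RECON does this per colour component; the *_RECON_ACCUM variants
// add into psi instead of overwriting it, and the Y/T directions use +/-1 in
// place of +/-i on the lower two spin components. std::complex is assumed.
//////////////////////////////////////////////////////////////////
#if 0
#include <complex>
typedef std::complex<double> cplx;
inline void XpReconRef(cplx fspin[4][3],const cplx hspin[2][3])
{
  const cplx mI(0.0,-1.0);
  for(int c=0;c<3;c++){
    fspin[0][c] = hspin[0][c];        // fspin(0)=hspin(0)
    fspin[1][c] = hspin[1][c];        // fspin(1)=hspin(1)
    fspin[2][c] = mI*hspin[1][c];     // fspin(2)=timesMinusI(hspin(1))
    fspin[3][c] = mI*hspin[0][c];     // fspin(3)=timesMinusI(hspin(0))
  }
}
#endif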
// NB could save 6 ops using addsub => 12 cycles
#define XP_RECON_ACCUM __asm__ (        \
  VZERO(TMP)                            \
  VACCTIMESMINUSI0(UChi_00,psi_30,Z3)   \
  VACCTIMESMINUSI0(UChi_10,psi_20,Z0)   \
  VACCTIMESMINUSI0(UChi_01,psi_31,Z4)   \
  VACCTIMESMINUSI0(UChi_11,psi_21,Z1)   \
  VACCTIMESMINUSI0(UChi_02,psi_32,Z5)   \
  VACCTIMESMINUSI0(UChi_12,psi_22,Z2)   \
  VADD(UChi_00,psi_00,psi_00)           \
  VADD(UChi_10,psi_10,psi_10)           \
  VADD(UChi_01,psi_01,psi_01)           \
  VADD(UChi_11,psi_11,psi_11)           \
  VADD(UChi_02,psi_02,psi_02)           \
  VADD(UChi_12,psi_12,psi_12)           \
  VACCTIMESMINUSI1(UChi_00,psi_30,Z3)   \
  VACCTIMESMINUSI1(UChi_10,psi_20,Z0)   \
  VACCTIMESMINUSI1(UChi_01,psi_31,Z4)   \
  VACCTIMESMINUSI1(UChi_11,psi_21,Z1)   \
  VACCTIMESMINUSI1(UChi_02,psi_32,Z5)   \
  VACCTIMESMINUSI1(UChi_12,psi_22,Z2)   \
  VACCTIMESMINUSI2(UChi_10,psi_20,Z0)   \
  VACCTIMESMINUSI2(UChi_11,psi_21,Z1)   \
  VACCTIMESMINUSI2(UChi_12,psi_22,Z2)   \
  VACCTIMESMINUSI2(UChi_00,psi_30,Z3)   \
  VACCTIMESMINUSI2(UChi_01,psi_31,Z4)   \
  VACCTIMESMINUSI2(UChi_02,psi_32,Z5)   \
  );

#define XM_RECON __asm__ (              \
  VZERO(TMP)                            \
  VTIMESI0(UChi_00,psi_30,TMP)          \
  VTIMESI0(UChi_10,psi_20,TMP)          \
  VTIMESI0(UChi_01,psi_31,TMP)          \
  VTIMESI0(UChi_11,psi_21,TMP)          \
  VTIMESI0(UChi_02,psi_32,TMP)          \
  VTIMESI0(UChi_12,psi_22,TMP)          \
  VMOV(UChi_00,psi_00)                  \
  VMOV(UChi_10,psi_10)                  \
  VMOV(UChi_01,psi_01)                  \
  VMOV(UChi_11,psi_11)                  \
  VMOV(UChi_02,psi_02)                  \
  VMOV(UChi_12,psi_12)                  \
  VTIMESI1(UChi_00,psi_30,TMP)          \
  VTIMESI1(UChi_10,psi_20,TMP)          \
  VTIMESI1(UChi_01,psi_31,TMP)          \
  VTIMESI1(UChi_11,psi_21,TMP)          \
  VTIMESI1(UChi_02,psi_32,TMP)          \
  VTIMESI1(UChi_12,psi_22,TMP)          \
  VTIMESI2(UChi_10,psi_20,TMP)          \
  VTIMESI2(UChi_11,psi_21,TMP)          \
  VTIMESI2(UChi_12,psi_22,TMP)          \
  VTIMESI2(UChi_00,psi_30,TMP)          \
  VTIMESI2(UChi_01,psi_31,TMP)          \
  VTIMESI2(UChi_02,psi_32,TMP)          \
  );

#define XM_RECON_ACCUM __asm__ (        \
  VACCTIMESI0(UChi_10,psi_20,Z0)        \
  VACCTIMESI0(UChi_00,psi_30,Z3)        \
  VACCTIMESI0(UChi_11,psi_21,Z1)        \
  VACCTIMESI0(UChi_01,psi_31,Z4)        \
  VACCTIMESI0(UChi_12,psi_22,Z2)        \
  VACCTIMESI0(UChi_02,psi_32,Z5)        \
                                        \
  VADD(UChi_10,psi_10,psi_10)           \
  VADD(UChi_00,psi_00,psi_00)           \
  VADD(UChi_11,psi_11,psi_11)           \
  VADD(UChi_01,psi_01,psi_01)           \
  VADD(UChi_12,psi_12,psi_12)           \
  VADD(UChi_02,psi_02,psi_02)           \
                                        \
  VACCTIMESI1(UChi_10,psi_20,Z0)        \
  VACCTIMESI1(UChi_00,psi_30,Z3)        \
  VACCTIMESI1(UChi_11,psi_21,Z1)        \
  VACCTIMESI1(UChi_01,psi_31,Z4)        \
  VACCTIMESI1(UChi_12,psi_22,Z2)        \
  VACCTIMESI1(UChi_02,psi_32,Z5)        \
  VACCTIMESI2(UChi_10,psi_20,Z0)        \
  VACCTIMESI2(UChi_11,psi_21,Z1)        \
  VACCTIMESI2(UChi_12,psi_22,Z2)        \
  VACCTIMESI2(UChi_00,psi_30,Z3)        \
  VACCTIMESI2(UChi_01,psi_31,Z4)        \
  VACCTIMESI2(UChi_02,psi_32,Z5)        \
  );

#define YP_RECON_ACCUM __asm__ (        \
  VADD(UChi_00,psi_00,psi_00)           \
  VADD(UChi_10,psi_10,psi_10)           \
  VADD(UChi_01,psi_01,psi_01)           \
  VADD(UChi_11,psi_11,psi_11)           \
  VADD(UChi_02,psi_02,psi_02)           \
  VADD(UChi_12,psi_12,psi_12)           \
  VADD(UChi_10,psi_20,psi_20)           \
  VADD(UChi_11,psi_21,psi_21)           \
  VADD(UChi_12,psi_22,psi_22)           \
  VSUB(UChi_00,psi_30,psi_30)           \
  VSUB(UChi_01,psi_31,psi_31)           \
  VSUB(UChi_02,psi_32,psi_32)           );

#define YM_RECON_ACCUM __asm__ (        \
  VADD(UChi_00,psi_00,psi_00)           \
  VADD(UChi_10,psi_10,psi_10)           \
  VADD(UChi_01,psi_01,psi_01)           \
  VADD(UChi_11,psi_11,psi_11)           \
  VADD(UChi_02,psi_02,psi_02)           \
  VADD(UChi_12,psi_12,psi_12)           \
  VSUB(UChi_10,psi_20,psi_20)           \
  VSUB(UChi_11,psi_21,psi_21)           \
  VSUB(UChi_12,psi_22,psi_22)           \
  VADD(UChi_00,psi_30,psi_30)           \
  VADD(UChi_01,psi_31,psi_31)           \
  VADD(UChi_02,psi_32,psi_32)           );

#define ZP_RECON_ACCUM __asm__ (        \
  VACCTIMESMINUSI0(UChi_00,psi_20,Z0)   \
  VACCTIMESI0(UChi_10,psi_30,Z3)        \
  VACCTIMESMINUSI0(UChi_01,psi_21,Z1)   \
  VACCTIMESI0(UChi_11,psi_31,Z4)        \
  VACCTIMESMINUSI0(UChi_02,psi_22,Z2)   \
  VACCTIMESI0(UChi_12,psi_32,Z5)        \
  VADD(UChi_00,psi_00,psi_00)           \
  VADD(UChi_10,psi_10,psi_10)           \
  VADD(UChi_01,psi_01,psi_01)           \
  VADD(UChi_11,psi_11,psi_11)           \
  VADD(UChi_02,psi_02,psi_02)           \
  VADD(UChi_12,psi_12,psi_12)           \
  VACCTIMESMINUSI1(UChi_00,psi_20,Z0)   \
  VACCTIMESI1(UChi_10,psi_30,Z3)        \
  VACCTIMESMINUSI1(UChi_01,psi_21,Z1)   \
  VACCTIMESI1(UChi_11,psi_31,Z4)        \
  VACCTIMESMINUSI1(UChi_02,psi_22,Z2)   \
  VACCTIMESI1(UChi_12,psi_32,Z5)        \
  VACCTIMESMINUSI2(UChi_00,psi_20,Z0)   \
  VACCTIMESMINUSI2(UChi_01,psi_21,Z1)   \
  VACCTIMESMINUSI2(UChi_02,psi_22,Z2)   \
  VACCTIMESI2(UChi_10,psi_30,Z3)        \
  VACCTIMESI2(UChi_11,psi_31,Z4)        \
  VACCTIMESI2(UChi_12,psi_32,Z5)        \
  );

#define ZM_RECON_ACCUM __asm__ (        \
  VACCTIMESI0(UChi_00,psi_20,Z0)        \
  VACCTIMESMINUSI0(UChi_10,psi_30,Z3)   \
  VACCTIMESI0(UChi_01,psi_21,Z1)        \
  VACCTIMESMINUSI0(UChi_11,psi_31,Z4)   \
  VACCTIMESI0(UChi_02,psi_22,Z2)        \
  VACCTIMESMINUSI0(UChi_12,psi_32,Z5)   \
  VADD(UChi_00,psi_00,psi_00)           \
  VADD(UChi_10,psi_10,psi_10)           \
  VADD(UChi_01,psi_01,psi_01)           \
  VADD(UChi_11,psi_11,psi_11)           \
  VADD(UChi_02,psi_02,psi_02)           \
  VADD(UChi_12,psi_12,psi_12)           \
  VACCTIMESI1(UChi_00,psi_20,Z0)        \
  VACCTIMESMINUSI1(UChi_10,psi_30,Z3)   \
  VACCTIMESI1(UChi_01,psi_21,Z1)        \
  VACCTIMESMINUSI1(UChi_11,psi_31,Z4)   \
  VACCTIMESI1(UChi_02,psi_22,Z2)        \
  VACCTIMESMINUSI1(UChi_12,psi_32,Z5)   \
  VACCTIMESI2(UChi_00,psi_20,Z0)        \
  VACCTIMESI2(UChi_01,psi_21,Z1)        \
  VACCTIMESI2(UChi_02,psi_22,Z2)        \
  VACCTIMESMINUSI2(UChi_10,psi_30,Z3)   \
  VACCTIMESMINUSI2(UChi_11,psi_31,Z4)   \
  VACCTIMESMINUSI2(UChi_12,psi_32,Z5)   \
  );

#define TP_RECON_ACCUM __asm__ (        \
  VADD(UChi_00,psi_00,psi_00)           \
  VADD(UChi_10,psi_10,psi_10)           \
  VADD(UChi_01,psi_01,psi_01)           \
  VADD(UChi_11,psi_11,psi_11)           \
  VADD(UChi_02,psi_02,psi_02)           \
  VADD(UChi_12,psi_12,psi_12)           \
  VADD(UChi_00,psi_20,psi_20)           \
  VADD(UChi_10,psi_30,psi_30)           \
  VADD(UChi_01,psi_21,psi_21)           \
  VADD(UChi_11,psi_31,psi_31)           \
  VADD(UChi_02,psi_22,psi_22)           \
  VADD(UChi_12,psi_32,psi_32)           );

#define TM_RECON_ACCUM __asm__ (        \
  VADD(UChi_00,psi_00,psi_00)           \
  VADD(UChi_10,psi_10,psi_10)           \
  VADD(UChi_01,psi_01,psi_01)           \
  VADD(UChi_11,psi_11,psi_11)           \
  VADD(UChi_02,psi_02,psi_02)           \
  VADD(UChi_12,psi_12,psi_12)           \
  VSUB(UChi_00,psi_20,psi_20)           \
  VSUB(UChi_10,psi_30,psi_30)           \
  VSUB(UChi_01,psi_21,psi_21)           \
  VSUB(UChi_11,psi_31,psi_31)           \
  VSUB(UChi_02,psi_22,psi_22)           \
  VSUB(UChi_12,psi_32,psi_32)           );

#define AVX512_PF_L1
#define AVX512_PF_L2_GAUGE
#define AVX512_PF_L2_TABLE
#undef  AVX512_PF_L2_LINEAR

#ifdef AVX512_PF_L2_TABLE
// P1 prefetches the base pointer for the next link into L1;
// M1 prefetches the next site pointer into L2.
#define VPREFETCH_P1(A,B) VPREFETCH1(A,B)
#define VPREFETCH_P2(A,B)
#define VPREFETCH_M1(A,B) VPREFETCH2(A,B)
#define VPREFETCH_M2(A,B)
#endif

#ifdef AVX512_PF_L2_LINEAR
#define VPREFETCH_M1(A,B) VPREFETCH1(A,B)
#define VPREFETCH_M2(A,B) VPREFETCH2(A,B)
#define VPREFETCH_P1(A,B)
#define VPREFETCH_P2(A,B)
#endif

#ifdef AVX512_PF_L2_GAUGE
#define VPREFETCH_G1(A,B) VPREFETCH1(A,B)
#define VPREFETCH_G2(A,B) VPREFETCH2(A,B)
#endif

#define PF_GAUGE(A)                             \
  LOAD64(%r8,&U[sU](A))                         \
  __asm__ (                                     \
  VPREFETCH_G1(0,%r8) VPREFETCH_G1(1,%r8)       \
  VPREFETCH_G1(2,%r8) VPREFETCH_G1(3,%r8)       \
  );

#define SAVE_RESULTi(PTR,pf)                    \
  LOAD64(%r8,PTR)                               \
  LOAD64(%r9,pf)                                \
  __asm__ (                                     \
  VSTORE(0,%r8,psi_00)   VPREFETCH_M1(0,%r9)    \
  VSTORE(1,%r8,psi_01)   VPREFETCH_M1(1,%r9)    \
  VSTORE(2,%r8,psi_02)   VPREFETCH_M1(2,%r9)    \
  VSTORE(3,%r8,psi_10)   VPREFETCH_M1(3,%r9)    \
  VSTORE(4,%r8,psi_11)   VPREFETCH_M1(4,%r9)    \
  VSTORE(5,%r8,psi_12)   VPREFETCH_M1(5,%r9)    \
  VSTORE(6,%r8,psi_20)   VPREFETCH_M1(6,%r9)    \
  VSTORE(7,%r8,psi_21)   VPREFETCH_M1(7,%r9)    \
  VSTORE(8,%r8,psi_22)   VPREFETCH_M1(8,%r9)    \
  VSTORE(9,%r8,psi_30)   VPREFETCH_M1(9,%r9)    \
  VSTORE(10,%r8,psi_31)  VPREFETCH_M1(10,%r9)   \
  VSTORE(11,%r8,psi_32)  VPREFETCH_M1(11,%r9)   \
  );

#define ADD_RESULTi(PTR,pf)                                                                      \
  LOAD_CHIMU(PTR);                                                                               \
  asm(VADD(psi_00,Chimu_00,psi_00) VADD(psi_01,Chimu_01,psi_01) VADD(psi_02,Chimu_02,psi_02)     \
      VADD(psi_10,Chimu_10,psi_10) VADD(psi_11,Chimu_11,psi_11) VADD(psi_12,Chimu_12,psi_12)     \
      VADD(psi_20,Chimu_20,psi_20) VADD(psi_21,Chimu_21,psi_21) VADD(psi_22,Chimu_22,psi_22)     \
      VADD(psi_30,Chimu_30,psi_30) VADD(psi_31,Chimu_31,psi_31) VADD(psi_32,Chimu_32,psi_32) );  \
  SAVE_RESULT(PTR,pf);
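//////////////////////////////////////////////////////////////////
// Scalar picture of the two store paths above (illustration only, compiled
// out; the flat 12-complex site layout and cplx type are assumptions).
// SAVE_RESULT overwrites the output site while prefetching the next site
// through pf into L2; ADD_RESULT first reloads the site and accumulates
// it into psi before storing.
//////////////////////////////////////////////////////////////////
#if 0
#include <complex>
typedef std::complex<double> cplx;
inline void save_result_ref(cplx *out,const cplx psi[12],const cplx *pf)
{
  for(int n=0;n<12;n++) out[n] = psi[n];   // VSTORE interleaved with VPREFETCH_M1(pf)
  (void)pf;                                // pf is a prefetch hint only
}
inline void add_result_ref(cplx *out,cplx psi[12],const cplx *pf)
{
  for(int n=0;n<12;n++) psi[n] += out[n];  // LOAD_CHIMU + VADD
  save_result_ref(out,psi,pf);             // then SAVE_RESULT
}
#endif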
#define ADD_RESULTia(PTR,pf)            \
  LOAD64(%r8,PTR)                       \
  __asm__ (                             \
  VADDMEM(0,%r8,psi_00,psi_00)          \
  VADDMEM(1,%r8,psi_01,psi_01)          \
  VADDMEM(2,%r8,psi_02,psi_02)          \
  VADDMEM(3,%r8,psi_10,psi_10)          \
  VADDMEM(4,%r8,psi_11,psi_11)          \
  VADDMEM(5,%r8,psi_12,psi_12)          \
  VADDMEM(6,%r8,psi_20,psi_20)          \
  VADDMEM(7,%r8,psi_21,psi_21)          \
  VADDMEM(8,%r8,psi_22,psi_22)          \
  VADDMEM(9,%r8,psi_30,psi_30)          \
  VADDMEM(10,%r8,psi_31,psi_31)         \
  VADDMEM(11,%r8,psi_32,psi_32)         \
  VSTORE(0,%r8,psi_00)                  \
  VSTORE(1,%r8,psi_01)                  \
  VSTORE(2,%r8,psi_02)                  \
  VSTORE(3,%r8,psi_10)                  \
  VSTORE(4,%r8,psi_11)                  \
  VSTORE(5,%r8,psi_12)                  \
  VSTORE(6,%r8,psi_20)                  \
  VSTORE(7,%r8,psi_21)                  \
  VSTORE(8,%r8,psi_22)                  \
  VSTORE(9,%r8,psi_30)                  \
  VSTORE(10,%r8,psi_31)                 \
  VSTORE(11,%r8,psi_32)                 \
  );

#ifdef AVX512_PF_L2_TABLE
#define PREFETCH_CHIMU(A)       \
  LOAD64(%r9,A)                 \
  __asm__ (                     \
  VPREFETCH_P1(0,%r9)           \
  VPREFETCH_P1(1,%r9)           \
  VPREFETCH_P1(2,%r9)           \
  VPREFETCH_P1(3,%r9)           \
  VPREFETCH_P1(4,%r9)           \
  VPREFETCH_P1(5,%r9)           \
  VPREFETCH_P1(6,%r9)           \
  VPREFETCH_P1(7,%r9)           \
  VPREFETCH_P1(8,%r9)           \
  VPREFETCH_P1(9,%r9)           \
  VPREFETCH_P1(10,%r9)          \
  VPREFETCH_P1(11,%r9));
#else
#define PREFETCH_CHIMU(A)
#endif

#define PREFETCH1_CHIMU(A)      \
  LOAD64(%r9,A)                 \
  __asm__ (                     \
  VPREFETCH_P1(0,%r9)           \
  VPREFETCH_P1(1,%r9)           \
  VPREFETCH_P1(2,%r9)           \
  VPREFETCH_P1(3,%r9)           \
  VPREFETCH_P1(4,%r9)           \
  VPREFETCH_P1(5,%r9)           \
  VPREFETCH_P1(6,%r9)           \
  VPREFETCH_P1(7,%r9)           \
  VPREFETCH_P1(8,%r9)           \
  VPREFETCH_P1(9,%r9)           \
  VPREFETCH_P1(10,%r9)          \
  VPREFETCH_P1(11,%r9));

#define PERMUTE_DIR0 __asm__ (  \
  VPERM0(Chi_00,Chi_00)         \
  VPERM0(Chi_01,Chi_01)         \
  VPERM0(Chi_02,Chi_02)         \
  VPERM0(Chi_10,Chi_10)         \
  VPERM0(Chi_11,Chi_11)         \
  VPERM0(Chi_12,Chi_12)         );

#define PERMUTE_DIR1 __asm__ (  \
  VPERM1(Chi_00,Chi_00)         \
  VPERM1(Chi_01,Chi_01)         \
  VPERM1(Chi_02,Chi_02)         \
  VPERM1(Chi_10,Chi_10)         \
  VPERM1(Chi_11,Chi_11)         \
  VPERM1(Chi_12,Chi_12));

#define PERMUTE_DIR2 __asm__ (  \
  VPERM2(Chi_00,Chi_00)         \
  VPERM2(Chi_01,Chi_01)         \
  VPERM2(Chi_02,Chi_02)         \
  VPERM2(Chi_10,Chi_10)         \
  VPERM2(Chi_11,Chi_11)         \
  VPERM2(Chi_12,Chi_12)         );

#define PERMUTE_DIR3 __asm__ (  \
  VPERM3(Chi_00,Chi_00)         \
  VPERM3(Chi_01,Chi_01)         \
  VPERM3(Chi_02,Chi_02)         \
  VPERM3(Chi_10,Chi_10)         \
  VPERM3(Chi_11,Chi_11)         \
  VPERM3(Chi_12,Chi_12)         );
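//////////////////////////////////////////////////////////////////
// Scalar reference for the MULT_2SPIN variants below (illustration only,
// compiled out): the 3x3 colour matrix of the link multiplies both spin rows
// of the half spinor, UChi[s][c] = sum_b U[c][b]*Chi[s][b], assuming the nine
// complex link elements sit row-major at offsets 0..8 as used by the macros.
// The assembly versions build each complex multiply from real/imaginary
// splats, a shuffle and fused multiply-add/sub, interleaved with gauge and
// neighbour prefetches.
//////////////////////////////////////////////////////////////////
#if 0
#include <complex>
typedef std::complex<double> cplx;
inline void mult_2spin_ref(cplx UChi[2][3],const cplx U[3][3],const cplx Chi[2][3])
{
  for(int s=0;s<2;s++){       // two spin rows of the half spinor
    for(int c=0;c<3;c++){     // result colour index
      UChi[s][c] = U[c][0]*Chi[s][0] + U[c][1]*Chi[s][1] + U[c][2]*Chi[s][2];
    }
  }
}
#endif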
#define MULT_ADDSUB_2SPIN(ptr,pf)       \
  LOAD64(%r8,ptr)                       \
  LOAD64(%r9,pf)                        \
  __asm__ (                             \
  VPREFETCH_G2(9,%r8)                   \
  VPREFETCH_G2(10,%r8)                  \
  VPREFETCH_G2(11,%r8)                  \
  VPREFETCH_G2(12,%r8)                  \
  VPREFETCH_G2(13,%r8)                  \
  VPREFETCH_G2(14,%r8)                  \
  VPREFETCH_G2(15,%r8)                  \
  VPREFETCH_G2(16,%r8)                  \
  VPREFETCH_G2(17,%r8)                  \
  VSHUF(Chi_00,T1)                      \
  VMOVIDUP(0,%r8,Z0 )                   \
  VMOVIDUP(3,%r8,Z1 )                   \
  VMOVIDUP(6,%r8,Z2 )  VSHUF(Chi_10,T2) \
  /*6*/                                 \
  VMUL(Z0,T1,UChi_00)  VMOVRDUP(0,%r8,Z3 )      \
  VMUL(Z0,T2,UChi_10)  VMOVRDUP(3,%r8,Z4 )      \
  VMUL(Z1,T1,UChi_01)  VMOVRDUP(6,%r8,Z5 )      \
  VMUL(Z1,T2,UChi_11)  VMOVIDUP(1,%r8,Z0 )      \
  VMUL(Z2,T1,UChi_02)  VMOVIDUP(4,%r8,Z1 )      \
  VMUL(Z2,T2,UChi_12)  VMOVIDUP(7,%r8,Z2 )      \
  VPREFETCH_M1(0,%r9)                   \
  VPREFETCH_M1(1,%r9)                   \
  VPREFETCH_M1(2,%r9)                   \
  VPREFETCH_M1(3,%r9)                   \
  /*18*/                                \
  VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1)          \
  VMADDSUB(Z3,Chi_10,UChi_10)                           \
  VMADDSUB(Z4,Chi_00,UChi_01) VMOVRDUP(1,%r8,Z3 )       \
  VMADDSUB(Z4,Chi_10,UChi_11) VSHUF(Chi_11,T2)          \
  VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 )       \
  VMADDSUB(Z5,Chi_10,UChi_12)                           \
  VPREFETCH_M1(4,%r9)                   \
  VPREFETCH_M1(5,%r9)                   \
  VPREFETCH_M1(6,%r9)                   \
  VPREFETCH_M1(7,%r9)                   \
  /*28*/                                \
  VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 )   \
  VMADDSUB(Z0,T2,UChi_10)                       \
  VMADDSUB(Z1,T1,UChi_01) VMOVIDUP(2,%r8,Z0 )   \
  VMADDSUB(Z1,T2,UChi_11)                       \
  VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1 )   \
  VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2 )   \
  VPREFETCH2(12,%r9)                    \
  VPREFETCH2(13,%r9)                    \
  VPREFETCH2(14,%r9)                    \
  VPREFETCH2(15,%r9)                    \
  VPREFETCH2(16,%r9)                    \
  VPREFETCH2(17,%r9)                    \
  VPREFETCH2(18,%r9)                    \
  VPREFETCH2(19,%r9)                    \
  VPREFETCH2(20,%r9)                    \
  VPREFETCH2(21,%r9)                    \
  VPREFETCH2(22,%r9)                    \
  VPREFETCH2(23,%r9)                    \
  /*38*/                                \
  VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1)          \
  VMADDSUB(Z3,Chi_11,UChi_10)                           \
  VMADDSUB(Z4,Chi_01,UChi_01) VMOVRDUP(2,%r8,Z3 )       \
  VMADDSUB(Z4,Chi_11,UChi_11) VSHUF(Chi_12,T2)          \
  VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 )       \
  VMADDSUB(Z5,Chi_11,UChi_12)                           \
  VPREFETCH_M1(9,%r8)                   \
  VPREFETCH_M1(10,%r8)                  \
  VPREFETCH_M1(11,%r8)                  \
  VPREFETCH_M1(12,%r8)                  \
  VPREFETCH_M1(13,%r8)                  \
  VPREFETCH_M1(14,%r8)                  \
  VPREFETCH_M1(15,%r8)                  \
  VPREFETCH_M1(16,%r8)                  \
  VPREFETCH_M1(17,%r8)                  \
  /*48*/                                \
  VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 )   \
  VMADDSUB(Z0,T2,UChi_10)               \
  VMADDSUB(Z1,T1,UChi_01)               \
  VMADDSUB(Z1,T2,UChi_11)               \
  VMADDSUB(Z2,T1,UChi_02)               \
  VMADDSUB(Z2,T2,UChi_12)               \
  VPREFETCH_M1(8,%r9)                   \
  VPREFETCH_M1(9,%r9)                   \
  VPREFETCH_M1(10,%r9)                  \
  VPREFETCH_M1(11,%r9)                  \
  /*55*/                                \
  VMADDSUB(Z3,Chi_02,UChi_00)           \
  VMADDSUB(Z3,Chi_12,UChi_10)           \
  VMADDSUB(Z4,Chi_02,UChi_01)           \
  VMADDSUB(Z4,Chi_12,UChi_11)           \
  VMADDSUB(Z5,Chi_02,UChi_02)           \
  VMADDSUB(Z5,Chi_12,UChi_12)           \
  /*61 insns*/                          );

#define MULT_ADDSUB_2SPIN_LS(ptr,pf)    \
  LOAD64(%r8,ptr)                       \
  LOAD64(%r9,pf)                        \
  __asm__ (                             \
  VSHUF(Chi_00,T1)  VSHUF(Chi_10,T2)                            \
  VMULIDUP(0,%r8,T1,UChi_00)  VMULIDUP(0,%r8,T2,UChi_10)        \
  VMULIDUP(3,%r8,T1,UChi_01)  VMULIDUP(3,%r8,T2,UChi_11)        \
  VMULIDUP(6,%r8,T1,UChi_02)  VMULIDUP(6,%r8,T2,UChi_12)        \
  VPREFETCH_M1(0,%r9)                   \
  VPREFETCH_M1(1,%r9)                   \
  VPREFETCH_M1(2,%r9)                   \
  VPREFETCH_M1(3,%r9)                   \
  /*8*/                                 \
  VSHUF(Chi_01,T1)  VSHUF(Chi_11,T2)                                    \
  VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
  VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
  VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
  VPREFETCH_M1(4,%r9)                   \
  VPREFETCH_M1(5,%r9)                   \
  VPREFETCH_M1(6,%r9)                   \
  VPREFETCH_M1(7,%r9)                   \
  /*16*/                                \
  VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10)         \
  VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11)         \
  VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12)         \
  VPREFETCH_M1(8,%r9)                   \
  VPREFETCH_M1(9,%r9)                   \
  VPREFETCH_M1(10,%r9)                  \
  VPREFETCH_M1(11,%r9)                  \
  /*22*/                                \
  VSHUF(Chi_02,T1)  VSHUF(Chi_12,T2)                                    \
  VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
  VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
  VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
  VPREFETCH_M2(12,%r9)                  \
  VPREFETCH_M2(13,%r9)                  \
  VPREFETCH_M2(14,%r9)                  \
  VPREFETCH_M2(15,%r9)                  \
  /*30*/                                \
  VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10)         \
  VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11)         \
  VPREFETCH_M2(16,%r9)                  \
  VPREFETCH_M2(17,%r9)                  \
  VPREFETCH_M2(18,%r9)                  \
  VPREFETCH_M2(19,%r9)                  \
  VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12)         \
  /*36*/                                \
  VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
  VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
  VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
  VPREFETCH_M2(20,%r9)                  \
  VPREFETCH_M2(21,%r9)                  \
  VPREFETCH_M2(22,%r9)                  \
  VPREFETCH_M2(23,%r9)                  \
  VPREFETCH_G1(2,%r8)                   \
  VPREFETCH_G1(3,%r8)                   \
  VPREFETCH_G2(4,%r8)                   \
  VPREFETCH_G2(5,%r8)                   \
  VPREFETCH_G2(6,%r8)                   \
  VPREFETCH_G2(7,%r8)                   \
  /*42 insns*/                          );

#define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf)        \
  LOAD64(%r8,ptr)                               \
  LOAD64(%r9,pf)                                \
  __asm__ (                                     \
  VSHUF(Chi_00,T1)  VSHUF(Chi_10,T2)                            \
  VMULIDUP(0,%r8,T1,UChi_00)  VMULIDUP(0,%r8,T2,UChi_10)        \
  VMULIDUP(3,%r8,T1,UChi_01)  VMULIDUP(3,%r8,T2,UChi_11)        \
  VMULIDUP(6,%r8,T1,UChi_02)  VMULIDUP(6,%r8,T2,UChi_12)        \
  /*8*/                                 \
  VSHUF(Chi_01,T1)  VSHUF(Chi_11,T2)                                    \
  VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
  VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
  VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
  /*16*/                                \
  VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10)         \
  VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11)         \
  VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12)         \
  /*22*/                                \
  VSHUF(Chi_02,T1)  VSHUF(Chi_12,T2)                                    \
  VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
  VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
  VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
  /*30*/                                \
  VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10)         \
  VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11)         \
  VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12)         \
  /*36*/                                \
  VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
  VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
  VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
  /*  VPREFETCH1(2,%r8)*/               \
  /*  VPREFETCH1(3,%r8)*/               \
  /*42 insns*/                          );

#define Z6 Chi_00

#define MULT_ADDSUB_2SPIN_NEW(ptr,pf)           \
  LOAD64(%r8,ptr)                               \
  __asm__ (                                     \
  VSHUFMEM(0,%r8,Z0)                            \
  VRDUP(Chi_00,T1)  VIDUP(Chi_00,Chi_00)        \
  VRDUP(Chi_10,T2)  VIDUP(Chi_10,Chi_10)        \
  VMUL(Z0,Chi_00,Z1)  VMUL(Z0,Chi_10,Z2)        \
  VSHUFMEM(3,%r8,Z0)                            \
  VMUL(Z0,Chi_00,Z3)  VMUL(Z0,Chi_10,Z4)        \
  VSHUFMEM(6,%r8,Z0)                            \
  VMUL(Z0,Chi_00,Z5)  VMUL(Z0,Chi_10,Z6)        \
  VMULMEM(0,%r8,T1,UChi_00) VMULMEM(0,%r8,T2,UChi_10)   \
  VMULMEM(3,%r8,T1,UChi_01) VMULMEM(3,%r8,T2,UChi_11)   \
  VMULMEM(6,%r8,T1,UChi_02) VMULMEM(6,%r8,T2,UChi_12)   \
  /*11 cycles*/                                 \
  VSHUFMEM(1,%r8,Z0)                            \
  VRDUP(Chi_01,T1)  VIDUP(Chi_01,Chi_01)        \
  VRDUP(Chi_11,T2)  VIDUP(Chi_11,Chi_11)        \
  VMADD(Z0,Chi_01,Z1) VMADD(Z0,Chi_11,Z2)       \
  VSHUFMEM(4,%r8,Z0)                            \
  VMADD(Z0,Chi_01,Z3) VMADD(Z0,Chi_11,Z4)       \
  VSHUFMEM(7,%r8,Z0)                            \
  VMADD(Z0,Chi_01,Z5) VMADD(Z0,Chi_11,Z6)       \
  VMADDMEM(1,%r8,T1,UChi_00) VMADDMEM(1,%r8,T2,UChi_10) \
  VMADDMEM(4,%r8,T1,UChi_01) VMADDMEM(4,%r8,T2,UChi_11) \
  VMADDMEM(7,%r8,T1,UChi_02) VMADDMEM(7,%r8,T2,UChi_12) \
  /*22 cycles*/                                 \
  VSHUFMEM(2,%r8,Z0)                            \
  VRDUP(Chi_02,T1)  VIDUP(Chi_02,Chi_02)        \
  VRDUP(Chi_12,T2)  VIDUP(Chi_12,Chi_12)        \
  VMADD(Z0,Chi_02,Z1) VMADD(Z0,Chi_12,Z2)       \
  VSHUFMEM(5,%r8,Z0)                            \
  VMADD(Z0,Chi_02,Z3) VMADD(Z0,Chi_12,Z4)       \
  VSHUFMEM(8,%r8,Z0)                            \
  VMADD(Z0,Chi_02,Z5) VMADD(Z0,Chi_12,Z6)       \
  /*33 cycles*/                                 \
  VMADDSUBMEM(2,%r8,T1,Z1) VMADDSUBMEM(2,%r8,T2,Z2)     \
  VMADDSUBMEM(5,%r8,T1,Z3) VMADDSUBMEM(5,%r8,T2,Z4)     \
  VMADDSUBMEM(8,%r8,T1,Z5) VMADDSUBMEM(8,%r8,T2,Z6)     \
  /*stall*/                                     \
  /*stall*/                                     \
  /*stall*/                                     \
  VADD(Z1,UChi_00,UChi_00) VADD(Z2,UChi_10,UChi_10)     \
  VADD(Z3,UChi_01,UChi_01) VADD(Z4,UChi_11,UChi_11)     \
  VADD(Z5,UChi_02,UChi_02) VADD(Z6,UChi_12,UChi_12)     )

#endif