From bd15c38ae8bd617c9a17b3ab034083faf85552a3 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 12 Jan 2018 23:25:02 +0000 Subject: [PATCH] Formatting emacs compliant --- lib/simd/Intel512avx.h | 66 +- lib/simd/Intel512common.h | 66 +- lib/simd/Intel512double.h | 6 +- lib/simd/Intel512imci.h | 46 +- lib/simd/Intel512single.h | 6 +- lib/simd/Intel512wilson.h | 1270 ++++++++++++++++++------------------- 6 files changed, 730 insertions(+), 730 deletions(-) diff --git a/lib/simd/Intel512avx.h b/lib/simd/Intel512avx.h index 7b5964ad..9cded194 100644 --- a/lib/simd/Intel512avx.h +++ b/lib/simd/Intel512avx.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -23,8 +23,8 @@ Author: paboyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_ASM_AV512_H #define GRID_ASM_AV512_H @@ -44,46 +44,46 @@ Author: paboyle #define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp) #define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp) -#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\ - VSHUFMEMf(O,P,tmp) \ - VMULMEMf(O,P,B,Biirr) \ - VMULMEMf(O,P,C,Ciirr) \ - VMULf(tmp,B,Briir) \ +#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \ + VSHUFMEMf(O,P,tmp) \ + VMULMEMf(O,P,B,Biirr) \ + VMULMEMf(O,P,C,Ciirr) \ + VMULf(tmp,B,Briir) \ VMULf(tmp,C,Criir) -#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\ - VSHUFMEMd(O,P,tmp) \ - VMULMEMd(O,P,B,Biirr) \ - VMULMEMd(O,P,C,Ciirr) \ - VMULd(tmp,B,Briir) \ +#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \ + VSHUFMEMd(O,P,tmp) \ + VMULMEMd(O,P,B,Biirr) \ + VMULMEMd(O,P,C,Ciirr) \ + VMULd(tmp,B,Briir) \ VMULd(tmp,C,Criir) -#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\ - VSHUFMEMf(O,P,tmp) \ - VMADDMEMf(O,P,B,Biirr) \ - VMADDMEMf(O,P,C,Ciirr) \ - VMADDf(tmp,B,Briir) \ +#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \ + VSHUFMEMf(O,P,tmp) \ + VMADDMEMf(O,P,B,Biirr) \ + VMADDMEMf(O,P,C,Ciirr) \ + VMADDf(tmp,B,Briir) \ VMADDf(tmp,C,Criir) #define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \ - VSHUFMEMd(O,P,tmp) \ - VMADDMEMd(O,P,B,Biirr) \ - VMADDMEMd(O,P,C,Ciirr) \ - VMADDd(tmp,B,Briir) \ + VSHUFMEMd(O,P,tmp) \ + VMADDMEMd(O,P,B,Biirr) \ + VMADDMEMd(O,P,C,Ciirr) \ + VMADDd(tmp,B,Briir) \ VMADDd(tmp,C,Criir) // Merges accumulation for complex dot chain; less efficient under avx512 -#define ZEND1f(Criir,Ciirr, tmp) "vshufps $0xb1," #Criir "," #Criir "," #tmp ";\n"\ - "vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n" +#define ZEND1f(Criir,Ciirr, tmp) "vshufps $0xb1," #Criir "," #Criir "," #tmp ";\n" \ + "vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n" -#define ZEND2f(Criir,Ciirr, tmp) "vshufps $0xb1," #Ciirr "," #Ciirr "," #tmp ";\n"\ - "vsubps " #tmp "," #Ciirr "," #Criir"{%k7}" ";\n" +#define ZEND2f(Criir,Ciirr, tmp) "vshufps $0xb1," #Ciirr "," #Ciirr "," #tmp ";\n" \ + "vsubps " #tmp "," #Ciirr "," #Criir"{%k7}" ";\n" #define ZEND1d(Criir,Ciirr, tmp) "vshufpd $0x55," #Criir "," #Criir "," #tmp ";\n"\ - "vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n" +"vaddps " 
#tmp "," #Criir "," #Criir"{%k6}" ";\n" -#define ZEND2d(Criir,Ciirr, tmp) "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp ";\n"\ - "vsubpd " #tmp "," #Ciirr "," #Criir"{%k7};\n" // ri+ir ; ri+ir,rr-ii +#define ZEND2d(Criir,Ciirr, tmp) "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp ";\n" \ + "vsubpd " #tmp "," #Ciirr "," #Criir"{%k7};\n" // ri+ir ; ri+ir,rr-ii #define VMOVRDUPd(OFF,A,DEST) "vpshufd $0x44," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: 1,0,3,2 #define VMOVIDUPd(OFF,A,DEST) "vpshufd $0xee," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: 3,2,3,2 @@ -123,10 +123,10 @@ Author: paboyle #define VMADDSUBIDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n" #define VMULRDUPd(O,P,B,accum) "vmulpd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n" #define VMULIDUPd(O,P,B,accum) "vmulpd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n" - /* - * TimesI is used only in the XP recon - * Could zero the regs and use RECON_ACCUM - */ +/* + * TimesI is used only in the XP recon + * Could zero the regs and use RECON_ACCUM + */ #define VTIMESI0f(A,DEST, Z) VSHUFf(A,DEST) #define VTIMESI1f(A,DEST, Z) "vaddps " #DEST "," #Z "," #DEST"{%k6}" ";\n" diff --git a/lib/simd/Intel512common.h b/lib/simd/Intel512common.h index e69e541c..c9472918 100644 --- a/lib/simd/Intel512common.h +++ b/lib/simd/Intel512common.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -23,8 +23,8 @@ Author: paboyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_ASM_INTEL_COMMON_512_H #define GRID_ASM_INTEL_COMMON_512_H @@ -36,10 +36,10 @@ Author: paboyle //////////////////////////////////////////////////////////////////////////////////////////////////// // Opcodes common //////////////////////////////////////////////////////////////////////////////////////////////////// -#define MASK_REGS \ - __asm__ ("mov $0xAAAA, %%eax \n"\ - "kmovw %%eax, %%k6 \n"\ - "mov $0x5555, %%eax \n"\ +#define MASK_REGS \ + __asm__ ("mov $0xAAAA, %%eax \n" \ + "kmovw %%eax, %%k6 \n" \ + "mov $0x5555, %%eax \n" \ "kmovw %%eax, %%k7 \n" : : : "%eax"); //#define label(B) __asm__ ( __func__ _LINE__ #B ":\n" ); @@ -47,44 +47,44 @@ Author: paboyle #define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n" #define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n" -#define VTIMESIf(A,DEST, Z) \ - VTIMESI0f(A,DEST, Z) \ - VTIMESI1f(A,DEST, Z) \ +#define VTIMESIf(A,DEST, Z) \ + VTIMESI0f(A,DEST, Z) \ + VTIMESI1f(A,DEST, Z) \ VTIMESI2f(A,DEST, Z) -#define VTIMESId(A,DEST, Z) \ - VTIMESI0d(A,DEST, Z) \ - VTIMESI1d(A,DEST, Z) \ +#define VTIMESId(A,DEST, Z) \ + VTIMESI0d(A,DEST, Z) \ + VTIMESI1d(A,DEST, Z) \ VTIMESI2d(A,DEST, Z) -#define VTIMESMINUSIf(A,DEST, Z) \ - VTIMESMINUSI0f(A,DEST, Z) \ - VTIMESMINUSI1f(A,DEST, Z) \ - VTIMESMINUSI2f(A,DEST, Z) +#define VTIMESMINUSIf(A,DEST, Z) \ + VTIMESMINUSI0f(A,DEST, Z) \ + VTIMESMINUSI1f(A,DEST, Z) \ + VTIMESMINUSI2f(A,DEST, Z) -#define VTIMESMINUSId(A,DEST, Z) \ - VTIMESMINUSI0d(A,DEST, Z) \ - VTIMESMINUSI1d(A,DEST, Z) \ - VTIMESMINUSI2d(A,DEST, Z) +#define VTIMESMINUSId(A,DEST, Z) \ + 
VTIMESMINUSI0d(A,DEST, Z) \ + VTIMESMINUSI1d(A,DEST, Z) \ + VTIMESMINUSI2d(A,DEST, Z) #define VACCTIMESIf(A,ACC,tmp) \ - VACCTIMESI0f(A,ACC,tmp) \ - VACCTIMESI1f(A,ACC,tmp) \ - VACCTIMESI2f(A,ACC,tmp) + VACCTIMESI0f(A,ACC,tmp) \ + VACCTIMESI1f(A,ACC,tmp) \ + VACCTIMESI2f(A,ACC,tmp) #define VACCTIMESId(A,ACC,tmp) \ - VACCTIMESI0d(A,ACC,tmp) \ - VACCTIMESI1d(A,ACC,tmp) \ - VACCTIMESI2d(A,ACC,tmp) + VACCTIMESI0d(A,ACC,tmp) \ + VACCTIMESI1d(A,ACC,tmp) \ + VACCTIMESI2d(A,ACC,tmp) -#define VACCTIMESMINUSIf(A,ACC,tmp) \ - VACCTIMESMINUSI0f(A,ACC,tmp) \ - VACCTIMESMINUSI1f(A,ACC,tmp) \ +#define VACCTIMESMINUSIf(A,ACC,tmp) \ + VACCTIMESMINUSI0f(A,ACC,tmp) \ + VACCTIMESMINUSI1f(A,ACC,tmp) \ VACCTIMESMINUSI2f(A,ACC,tmp) -#define VACCTIMESMINUSId(A,ACC,tmp) \ - VACCTIMESMINUSI0d(A,ACC,tmp) \ - VACCTIMESMINUSI1d(A,ACC,tmp) \ +#define VACCTIMESMINUSId(A,ACC,tmp) \ + VACCTIMESMINUSI0d(A,ACC,tmp) \ + VACCTIMESMINUSI1d(A,ACC,tmp) \ VACCTIMESMINUSI2d(A,ACC,tmp) #define LOAD64a(A,ptr) "movq %0, %" #A : : "r"(ptr) : #A diff --git a/lib/simd/Intel512double.h b/lib/simd/Intel512double.h index 632b5639..f0b7fa0d 100644 --- a/lib/simd/Intel512double.h +++ b/lib/simd/Intel512double.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -23,8 +23,8 @@ Author: paboyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ // No guard can be multiply included as undef clearage #undef VZERO #undef VMOV diff --git a/lib/simd/Intel512imci.h b/lib/simd/Intel512imci.h index 7176890f..655c001d 100644 --- a/lib/simd/Intel512imci.h +++ b/lib/simd/Intel512imci.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -23,8 +23,8 @@ Author: paboyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_ASM_AV512_H #define GRID_ASM_AV512_H @@ -44,32 +44,32 @@ Author: paboyle #define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp) #define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp) -#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\ - VSHUFMEMf(O,P,tmp) \ - VMULMEMf(O,P,B,Biirr) \ - VMULMEMf(O,P,C,Ciirr) \ - VMULf(tmp,B,Briir) \ +#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \ + VSHUFMEMf(O,P,tmp) \ + VMULMEMf(O,P,B,Biirr) \ + VMULMEMf(O,P,C,Ciirr) \ + VMULf(tmp,B,Briir) \ VMULf(tmp,C,Criir) -#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\ - VSHUFMEMd(O,P,tmp) \ +#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \ + VSHUFMEMd(O,P,tmp) \ VMULMEMd(O,P,B,Biirr) \ - VMULMEMd(O,P,C,Ciirr) \ - VMULd(tmp,B,Briir) \ - VMULd(tmp,C,Criir) +VMULMEMd(O,P,C,Ciirr) \ +VMULd(tmp,B,Briir) \ +VMULd(tmp,C,Criir) -#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\ - VSHUFMEMf(O,P,tmp) \ - VMADDMEMf(O,P,B,Biirr) \ - VMADDMEMf(O,P,C,Ciirr) \ - VMADDf(tmp,B,Briir) \ +#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \ + VSHUFMEMf(O,P,tmp) \ + VMADDMEMf(O,P,B,Biirr) \ + VMADDMEMf(O,P,C,Ciirr) \ + VMADDf(tmp,B,Briir) \ VMADDf(tmp,C,Criir) #define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \ - VSHUFMEMd(O,P,tmp) \ - VMADDMEMd(O,P,B,Biirr) \ - VMADDMEMd(O,P,C,Ciirr) \ - VMADDd(tmp,B,Briir) \ + VSHUFMEMd(O,P,tmp) \ + VMADDMEMd(O,P,B,Biirr) \ + VMADDMEMd(O,P,C,Ciirr) \ + VMADDd(tmp,B,Briir) \ VMADDd(tmp,C,Criir) #define ZEND1d(Criir,Ciirr, tmp) "vaddpd " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n" @@ -106,7 +106,7 @@ Author: paboyle #define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n" #define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n" - // Acc = Acc - i A +// Acc = Acc - i A #define VACCTIMESMINUSI0d(A,ACC,tmp) #define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n" #define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n" diff --git a/lib/simd/Intel512single.h b/lib/simd/Intel512single.h index ed135651..5c1e4135 100644 --- a/lib/simd/Intel512single.h +++ b/lib/simd/Intel512single.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -23,8 +23,8 @@ Author: paboyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ // No guard can be multiply included as undef clearge of macros #undef VZERO #undef VMOV diff --git a/lib/simd/Intel512wilson.h b/lib/simd/Intel512wilson.h index 64142a2e..66781387 100644 --- a/lib/simd/Intel512wilson.h +++ b/lib/simd/Intel512wilson.h @@ -23,8 +23,8 @@ Author: paboyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_ASM_INTEL_512_QCD_H #define GRID_ASM_INTEL_512_QCD_H @@ -109,18 +109,18 @@ Author: paboyle #define SAVE_RESULT(PT,R) SAVE_RESULTi(PT,R) #define ADD_RESULT(PT,R) ADD_RESULTi(PT,R) -#define ZERO_PSI \ - asm( VZERO(psi_00) \ - VZERO(psi_01) \ - VZERO(psi_02) \ - VZERO(psi_10) \ - VZERO(psi_11) \ - VZERO(psi_12) \ - VZERO(psi_20) \ - VZERO(psi_21) \ - VZERO(psi_22) \ - VZERO(psi_30) \ - VZERO(psi_31) \ +#define ZERO_PSI \ + asm( VZERO(psi_00) \ + VZERO(psi_01) \ + VZERO(psi_02) \ + VZERO(psi_10) \ + VZERO(psi_11) \ + VZERO(psi_12) \ + VZERO(psi_20) \ + VZERO(psi_21) \ + VZERO(psi_22) \ + VZERO(psi_30) \ + VZERO(psi_31) \ VZERO(psi_32)); #define LOAD_CHIMUi \ @@ -143,41 +143,41 @@ Author: paboyle VLOAD(10,%r8,Chimu_31) \ VLOAD(11,%r8,Chimu_32) -#define SHUF_CHIMU23i\ - VSHUFMEM(6,%r8,Chimu_20) \ - VSHUFMEM(7,%r8,Chimu_21) \ - VSHUFMEM(8,%r8,Chimu_22) \ - VSHUFMEM(9,%r8,Chimu_30) \ - VSHUFMEM(10,%r8,Chimu_31) \ - VSHUFMEM(11,%r8,Chimu_32) +#define SHUF_CHIMU23i \ + VSHUFMEM(6,%r8,Chimu_20) \ + VSHUFMEM(7,%r8,Chimu_21) \ + VSHUFMEM(8,%r8,Chimu_22) \ + VSHUFMEM(9,%r8,Chimu_30) \ + VSHUFMEM(10,%r8,Chimu_31) \ + VSHUFMEM(11,%r8,Chimu_32) #define LOAD_CHIi \ - VLOAD(0,%r8,Chi_00) \ - VLOAD(1,%r8,Chi_01) \ - VLOAD(2,%r8,Chi_02) \ - VLOAD(3,%r8,Chi_10) \ - VLOAD(4,%r8,Chi_11) \ + VLOAD(0,%r8,Chi_00) \ + VLOAD(1,%r8,Chi_01) \ + VLOAD(2,%r8,Chi_02) \ + VLOAD(3,%r8,Chi_10) \ + VLOAD(4,%r8,Chi_11) \ VLOAD(5,%r8,Chi_12) -#define SAVE_UCHIi(PTR) \ - LOAD64(%r8,PTR) \ - __asm__ ( \ - VSTORE(0,%r8,UChi_00) \ - VSTORE(1,%r8,UChi_01) \ - VSTORE(2,%r8,UChi_02) \ - VSTORE(3,%r8,UChi_10) \ - VSTORE(4,%r8,UChi_11) \ - VSTORE(5,%r8,UChi_12) ); +#define SAVE_UCHIi(PTR) \ + LOAD64(%r8,PTR) \ + __asm__ ( \ + VSTORE(0,%r8,UChi_00) \ + VSTORE(1,%r8,UChi_01) \ + VSTORE(2,%r8,UChi_02) \ + VSTORE(3,%r8,UChi_10) \ + VSTORE(4,%r8,UChi_11) \ + VSTORE(5,%r8,UChi_12) ); -#define SAVE_CHIi(PTR) \ - LOAD64(%r8,PTR) \ - __asm__ ( \ - VSTORE(0,%r8,Chi_00) \ - VSTORE(1,%r8,Chi_01) \ - VSTORE(2,%r8,Chi_02) \ - VSTORE(3,%r8,Chi_10) \ - VSTORE(4,%r8,Chi_11) \ - VSTORE(5,%r8,Chi_12) ); +#define SAVE_CHIi(PTR) \ + LOAD64(%r8,PTR) \ + __asm__ ( \ + VSTORE(0,%r8,Chi_00) \ + VSTORE(1,%r8,Chi_01) \ + VSTORE(2,%r8,Chi_02) \ + VSTORE(3,%r8,Chi_10) \ + VSTORE(4,%r8,Chi_11) \ + VSTORE(5,%r8,Chi_12) ); #define MULT_2SPIN_DIR_PF(A,p) MULT_2SPIN_PF(&U._odata[sU](A),p) #define MULT_2SPIN_PF(ptr,pf) MULT_2SPIN(ptr,pf) @@ -187,352 +187,352 @@ Author: paboyle ////////////////////////////////////////////////////////////////// // hspin(0)=fspin(0)+timesI(fspin(3)); // hspin(1)=fspin(1)+timesI(fspin(2)); -#define XP_PROJMEM(PTR) \ - LOAD64(%r8,PTR) \ - __asm__ ( \ +#define XP_PROJMEM(PTR) \ + LOAD64(%r8,PTR) \ + __asm__ ( \ LOAD_CHIi \ - SHUF_CHIMU23i \ - VACCTIMESI1(Chi_00,Chi_00,Chimu_30) \ - VACCTIMESI1(Chi_01,Chi_01,Chimu_31) \ - VACCTIMESI1(Chi_02,Chi_02,Chimu_32) \ - VACCTIMESI1(Chi_10,Chi_10,Chimu_20) \ - VACCTIMESI1(Chi_11,Chi_11,Chimu_21) \ - VACCTIMESI1(Chi_12,Chi_12,Chimu_22) \ - VACCTIMESI2(Chi_00,Chi_00,Chimu_30) \ - VACCTIMESI2(Chi_01,Chi_01,Chimu_31) \ - VACCTIMESI2(Chi_02,Chi_02,Chimu_32) \ - VACCTIMESI2(Chi_10,Chi_10,Chimu_20) \ - VACCTIMESI2(Chi_11,Chi_11,Chimu_21) \ + SHUF_CHIMU23i \ + VACCTIMESI1(Chi_00,Chi_00,Chimu_30) \ + 
VACCTIMESI1(Chi_01,Chi_01,Chimu_31) \ + VACCTIMESI1(Chi_02,Chi_02,Chimu_32) \ + VACCTIMESI1(Chi_10,Chi_10,Chimu_20) \ + VACCTIMESI1(Chi_11,Chi_11,Chimu_21) \ + VACCTIMESI1(Chi_12,Chi_12,Chimu_22) \ + VACCTIMESI2(Chi_00,Chi_00,Chimu_30) \ + VACCTIMESI2(Chi_01,Chi_01,Chimu_31) \ + VACCTIMESI2(Chi_02,Chi_02,Chimu_32) \ + VACCTIMESI2(Chi_10,Chi_10,Chimu_20) \ + VACCTIMESI2(Chi_11,Chi_11,Chimu_21) \ VACCTIMESI2(Chi_12,Chi_12,Chimu_22) ); -#define YP_PROJMEM(ptr) \ - LOAD64(%r8,ptr) \ - __asm__ ( \ - LOAD_CHIMU01i \ - VSUBMEM(9,%r8 ,Chimu_00,Chi_00) \ - VSUBMEM(10,%r8,Chimu_01,Chi_01) \ - VSUBMEM(11,%r8,Chimu_02,Chi_02) \ - VADDMEM(6,%r8,Chimu_10,Chi_10) \ - VADDMEM(7,%r8,Chimu_11,Chi_11) \ - VADDMEM(8,%r8,Chimu_12,Chi_12) ); +#define YP_PROJMEM(ptr) \ + LOAD64(%r8,ptr) \ + __asm__ ( \ + LOAD_CHIMU01i \ + VSUBMEM(9,%r8 ,Chimu_00,Chi_00) \ + VSUBMEM(10,%r8,Chimu_01,Chi_01) \ + VSUBMEM(11,%r8,Chimu_02,Chi_02) \ + VADDMEM(6,%r8,Chimu_10,Chi_10) \ + VADDMEM(7,%r8,Chimu_11,Chi_11) \ + VADDMEM(8,%r8,Chimu_12,Chi_12) ); -#define ZP_PROJMEM(PTR) \ - LOAD64(%r8,PTR) \ - __asm__ ( \ +#define ZP_PROJMEM(PTR) \ + LOAD64(%r8,PTR) \ + __asm__ ( \ LOAD_CHIi \ - SHUF_CHIMU23i \ - VACCTIMESI1(Chi_00,Chi_00,Chimu_20) \ - VACCTIMESI1(Chi_01,Chi_01,Chimu_21) \ - VACCTIMESI1(Chi_02,Chi_02,Chimu_22) \ - VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_30) \ - VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_31) \ - VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_32) \ - VACCTIMESI2(Chi_00,Chi_00,Chimu_20) \ - VACCTIMESI2(Chi_01,Chi_01,Chimu_21) \ - VACCTIMESI2(Chi_02,Chi_02,Chimu_22) \ + SHUF_CHIMU23i \ + VACCTIMESI1(Chi_00,Chi_00,Chimu_20) \ + VACCTIMESI1(Chi_01,Chi_01,Chimu_21) \ + VACCTIMESI1(Chi_02,Chi_02,Chimu_22) \ + VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_30) \ + VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_31) \ + VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_32) \ + VACCTIMESI2(Chi_00,Chi_00,Chimu_20) \ + VACCTIMESI2(Chi_01,Chi_01,Chimu_21) \ + VACCTIMESI2(Chi_02,Chi_02,Chimu_22) \ VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_30) \ VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_31) \ VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_32) ); -#define TP_PROJMEM(ptr) \ - LOAD64(%r8,ptr) \ - __asm__ ( \ - LOAD_CHIMU01i \ - VADDMEM(6,%r8 ,Chimu_00,Chi_00) \ - VADDMEM(7,%r8,Chimu_01,Chi_01) \ - VADDMEM(8,%r8,Chimu_02,Chi_02) \ - VADDMEM(9,%r8,Chimu_10,Chi_10) \ - VADDMEM(10,%r8,Chimu_11,Chi_11) \ +#define TP_PROJMEM(ptr) \ + LOAD64(%r8,ptr) \ + __asm__ ( \ + LOAD_CHIMU01i \ + VADDMEM(6,%r8 ,Chimu_00,Chi_00) \ + VADDMEM(7,%r8,Chimu_01,Chi_01) \ + VADDMEM(8,%r8,Chimu_02,Chi_02) \ + VADDMEM(9,%r8,Chimu_10,Chi_10) \ + VADDMEM(10,%r8,Chimu_11,Chi_11) \ VADDMEM(11,%r8,Chimu_12,Chi_12) ); // hspin(0)=fspin(0)-timesI(fspin(3)) // hspin(1)=fspin(1)-timesI(fspin(2)) -#define XM_PROJMEM(PTR) \ - LOAD64(%r8,PTR)\ - __asm__ ( \ - LOAD_CHIi \ - SHUF_CHIMU23i \ - VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30)\ - VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31)\ - VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32)\ - VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_20)\ - VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_21)\ - VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_22)\ - VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_30)\ - VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_31)\ - VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_32)\ - VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_20)\ - VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_21)\ +#define XM_PROJMEM(PTR) \ + LOAD64(%r8,PTR) \ + __asm__ ( \ + LOAD_CHIi \ + SHUF_CHIMU23i \ + VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30) \ + VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31) \ + VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32) \ + VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_20) \ + 
VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_21) \ + VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_22) \ + VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_30) \ + VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_31) \ + VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_32) \ + VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_20) \ + VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_21) \ VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_22) ); -#define YM_PROJMEM(ptr) \ - LOAD64(%r8,ptr) \ - __asm__ ( \ - LOAD_CHIMU01i \ - VADDMEM(9,%r8 ,Chimu_00,Chi_00) \ - VADDMEM(10,%r8,Chimu_01,Chi_01) \ - VADDMEM(11,%r8,Chimu_02,Chi_02) \ - VSUBMEM(6,%r8,Chimu_10,Chi_10) \ - VSUBMEM(7,%r8,Chimu_11,Chi_11) \ - VSUBMEM(8,%r8,Chimu_12,Chi_12) ); - -#define ZM_PROJMEM(PTR) \ - LOAD64(%r8,PTR) \ +#define YM_PROJMEM(ptr) \ + LOAD64(%r8,ptr) \ __asm__ ( \ - LOAD_CHIi \ - SHUF_CHIMU23i \ - VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20)\ - VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21)\ - VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22)\ - VACCTIMESI1(Chi_10,Chi_10,Chimu_30)\ - VACCTIMESI1(Chi_11,Chi_11,Chimu_31)\ - VACCTIMESI1(Chi_12,Chi_12,Chimu_32)\ - VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_20)\ - VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_21)\ - VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_22)\ - VACCTIMESI2(Chi_10,Chi_10,Chimu_30)\ - VACCTIMESI2(Chi_11,Chi_11,Chimu_31)\ + LOAD_CHIMU01i \ + VADDMEM(9,%r8 ,Chimu_00,Chi_00) \ + VADDMEM(10,%r8,Chimu_01,Chi_01) \ + VADDMEM(11,%r8,Chimu_02,Chi_02) \ + VSUBMEM(6,%r8,Chimu_10,Chi_10) \ + VSUBMEM(7,%r8,Chimu_11,Chi_11) \ + VSUBMEM(8,%r8,Chimu_12,Chi_12) ); + +#define ZM_PROJMEM(PTR) \ + LOAD64(%r8,PTR) \ + __asm__ ( \ + LOAD_CHIi \ + SHUF_CHIMU23i \ + VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20) \ + VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21) \ + VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22) \ + VACCTIMESI1(Chi_10,Chi_10,Chimu_30) \ + VACCTIMESI1(Chi_11,Chi_11,Chimu_31) \ + VACCTIMESI1(Chi_12,Chi_12,Chimu_32) \ + VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_20) \ + VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_21) \ + VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_22) \ + VACCTIMESI2(Chi_10,Chi_10,Chimu_30) \ + VACCTIMESI2(Chi_11,Chi_11,Chimu_31) \ VACCTIMESI2(Chi_12,Chi_12,Chimu_32) ); -#define TM_PROJMEM(ptr) \ - LOAD64(%r8,ptr) \ - __asm__ ( \ - LOAD_CHIMU01i \ - VSUBMEM(6,%r8,Chimu_00,Chi_00) \ - VSUBMEM(7,%r8,Chimu_01,Chi_01) \ - VSUBMEM(8,%r8,Chimu_02,Chi_02) \ - VSUBMEM(9,%r8,Chimu_10,Chi_10) \ - VSUBMEM(10,%r8,Chimu_11,Chi_11) \ - VSUBMEM(11,%r8,Chimu_12,Chi_12) ); +#define TM_PROJMEM(ptr) \ + LOAD64(%r8,ptr) \ + __asm__ ( \ + LOAD_CHIMU01i \ + VSUBMEM(6,%r8,Chimu_00,Chi_00) \ + VSUBMEM(7,%r8,Chimu_01,Chi_01) \ + VSUBMEM(8,%r8,Chimu_02,Chi_02) \ + VSUBMEM(9,%r8,Chimu_10,Chi_10) \ + VSUBMEM(10,%r8,Chimu_11,Chi_11) \ + VSUBMEM(11,%r8,Chimu_12,Chi_12) ); // fspin(0)=hspin(0) // fspin(1)=hspin(1) // fspin(2)=timesMinusI(hspin(1)) // fspin(3)=timesMinusI(hspin(0)) -#define XP_RECON __asm__ ( \ - VZERO(TMP) \ +#define XP_RECON __asm__ ( \ + VZERO(TMP) \ VTIMESMINUSI0(UChi_00,psi_30,TMP) \ VTIMESMINUSI0(UChi_10,psi_20,TMP) \ VTIMESMINUSI0(UChi_01,psi_31,TMP) \ VTIMESMINUSI0(UChi_11,psi_21,TMP) \ - VTIMESMINUSI0(UChi_02,psi_32,TMP) \ + VTIMESMINUSI0(UChi_02,psi_32,TMP) \ VTIMESMINUSI0(UChi_12,psi_22,TMP) \ - VMOV(UChi_00,psi_00) \ - VMOV(UChi_10,psi_10) \ - VMOV(UChi_01,psi_01) \ - VMOV(UChi_11,psi_11) \ - VMOV(UChi_02,psi_02) \ - VMOV(UChi_12,psi_12) \ + VMOV(UChi_00,psi_00) \ + VMOV(UChi_10,psi_10) \ + VMOV(UChi_01,psi_01) \ + VMOV(UChi_11,psi_11) \ + VMOV(UChi_02,psi_02) \ + VMOV(UChi_12,psi_12) \ VTIMESMINUSI1(UChi_10,psi_20,TMP) \ VTIMESMINUSI1(UChi_11,psi_21,TMP) \ VTIMESMINUSI1(UChi_12,psi_22,TMP) \ 
VTIMESMINUSI1(UChi_00,psi_30,TMP) \ VTIMESMINUSI1(UChi_01,psi_31,TMP) \ - VTIMESMINUSI1(UChi_02,psi_32,TMP) \ + VTIMESMINUSI1(UChi_02,psi_32,TMP) \ VTIMESMINUSI2(UChi_10,psi_20,TMP) \ VTIMESMINUSI2(UChi_11,psi_21,TMP) \ VTIMESMINUSI2(UChi_12,psi_22,TMP) \ VTIMESMINUSI2(UChi_00,psi_30,TMP) \ VTIMESMINUSI2(UChi_01,psi_31,TMP) \ - VTIMESMINUSI2(UChi_02,psi_32,TMP) \ + VTIMESMINUSI2(UChi_02,psi_32,TMP) \ ); - // NB could save 6 ops using addsub => 12 cycles -#define XP_RECON_ACCUM __asm__ ( \ - VZERO(TMP)\ - VACCTIMESMINUSI0(UChi_00,psi_30,Z3)\ - VACCTIMESMINUSI0(UChi_10,psi_20,Z0)\ - VACCTIMESMINUSI0(UChi_01,psi_31,Z4)\ - VACCTIMESMINUSI0(UChi_11,psi_21,Z1)\ - VACCTIMESMINUSI0(UChi_02,psi_32,Z5)\ - VACCTIMESMINUSI0(UChi_12,psi_22,Z2)\ - VADD(UChi_00,psi_00,psi_00)\ - VADD(UChi_10,psi_10,psi_10)\ - VADD(UChi_01,psi_01,psi_01)\ - VADD(UChi_11,psi_11,psi_11)\ - VADD(UChi_02,psi_02,psi_02)\ - VADD(UChi_12,psi_12,psi_12)\ - VACCTIMESMINUSI1(UChi_00,psi_30,Z3)\ - VACCTIMESMINUSI1(UChi_10,psi_20,Z0)\ - VACCTIMESMINUSI1(UChi_01,psi_31,Z4)\ - VACCTIMESMINUSI1(UChi_11,psi_21,Z1)\ - VACCTIMESMINUSI1(UChi_02,psi_32,Z5)\ - VACCTIMESMINUSI1(UChi_12,psi_22,Z2)\ - VACCTIMESMINUSI2(UChi_10,psi_20,Z0)\ - VACCTIMESMINUSI2(UChi_11,psi_21,Z1)\ - VACCTIMESMINUSI2(UChi_12,psi_22,Z2)\ - VACCTIMESMINUSI2(UChi_00,psi_30,Z3)\ - VACCTIMESMINUSI2(UChi_01,psi_31,Z4)\ - VACCTIMESMINUSI2(UChi_02,psi_32,Z5)\ +// NB could save 6 ops using addsub => 12 cycles +#define XP_RECON_ACCUM __asm__ ( \ + VZERO(TMP) \ + VACCTIMESMINUSI0(UChi_00,psi_30,Z3) \ + VACCTIMESMINUSI0(UChi_10,psi_20,Z0) \ + VACCTIMESMINUSI0(UChi_01,psi_31,Z4) \ + VACCTIMESMINUSI0(UChi_11,psi_21,Z1) \ + VACCTIMESMINUSI0(UChi_02,psi_32,Z5) \ + VACCTIMESMINUSI0(UChi_12,psi_22,Z2) \ + VADD(UChi_00,psi_00,psi_00) \ + VADD(UChi_10,psi_10,psi_10) \ + VADD(UChi_01,psi_01,psi_01) \ + VADD(UChi_11,psi_11,psi_11) \ + VADD(UChi_02,psi_02,psi_02) \ + VADD(UChi_12,psi_12,psi_12) \ + VACCTIMESMINUSI1(UChi_00,psi_30,Z3) \ + VACCTIMESMINUSI1(UChi_10,psi_20,Z0) \ + VACCTIMESMINUSI1(UChi_01,psi_31,Z4) \ + VACCTIMESMINUSI1(UChi_11,psi_21,Z1) \ + VACCTIMESMINUSI1(UChi_02,psi_32,Z5) \ + VACCTIMESMINUSI1(UChi_12,psi_22,Z2) \ + VACCTIMESMINUSI2(UChi_10,psi_20,Z0) \ + VACCTIMESMINUSI2(UChi_11,psi_21,Z1) \ + VACCTIMESMINUSI2(UChi_12,psi_22,Z2) \ + VACCTIMESMINUSI2(UChi_00,psi_30,Z3) \ + VACCTIMESMINUSI2(UChi_01,psi_31,Z4) \ + VACCTIMESMINUSI2(UChi_02,psi_32,Z5) \ ); -#define XM_RECON __asm__ ( \ - VZERO(TMP)\ - VTIMESI0(UChi_00,psi_30,TMP)\ - VTIMESI0(UChi_10,psi_20,TMP)\ - VTIMESI0(UChi_01,psi_31,TMP)\ - VTIMESI0(UChi_11,psi_21,TMP)\ - VTIMESI0(UChi_02,psi_32,TMP)\ - VTIMESI0(UChi_12,psi_22,TMP)\ - VMOV(UChi_00,psi_00)\ - VMOV(UChi_10,psi_10)\ - VMOV(UChi_01,psi_01)\ - VMOV(UChi_11,psi_11)\ - VMOV(UChi_02,psi_02)\ - VMOV(UChi_12,psi_12)\ - VTIMESI1(UChi_00,psi_30,TMP)\ - VTIMESI1(UChi_10,psi_20,TMP)\ - VTIMESI1(UChi_01,psi_31,TMP)\ - VTIMESI1(UChi_11,psi_21,TMP)\ - VTIMESI1(UChi_02,psi_32,TMP)\ - VTIMESI1(UChi_12,psi_22,TMP)\ - VTIMESI2(UChi_10,psi_20,TMP)\ - VTIMESI2(UChi_11,psi_21,TMP)\ - VTIMESI2(UChi_12,psi_22,TMP)\ - VTIMESI2(UChi_00,psi_30,TMP)\ - VTIMESI2(UChi_01,psi_31,TMP)\ - VTIMESI2(UChi_02,psi_32,TMP)\ +#define XM_RECON __asm__ ( \ + VZERO(TMP) \ + VTIMESI0(UChi_00,psi_30,TMP) \ + VTIMESI0(UChi_10,psi_20,TMP) \ + VTIMESI0(UChi_01,psi_31,TMP) \ + VTIMESI0(UChi_11,psi_21,TMP) \ + VTIMESI0(UChi_02,psi_32,TMP) \ + VTIMESI0(UChi_12,psi_22,TMP) \ + VMOV(UChi_00,psi_00) \ + VMOV(UChi_10,psi_10) \ + VMOV(UChi_01,psi_01) \ + VMOV(UChi_11,psi_11) \ + VMOV(UChi_02,psi_02) \ + 
VMOV(UChi_12,psi_12) \ + VTIMESI1(UChi_00,psi_30,TMP) \ + VTIMESI1(UChi_10,psi_20,TMP) \ + VTIMESI1(UChi_01,psi_31,TMP) \ + VTIMESI1(UChi_11,psi_21,TMP) \ + VTIMESI1(UChi_02,psi_32,TMP) \ + VTIMESI1(UChi_12,psi_22,TMP) \ + VTIMESI2(UChi_10,psi_20,TMP) \ + VTIMESI2(UChi_11,psi_21,TMP) \ + VTIMESI2(UChi_12,psi_22,TMP) \ + VTIMESI2(UChi_00,psi_30,TMP) \ + VTIMESI2(UChi_01,psi_31,TMP) \ + VTIMESI2(UChi_02,psi_32,TMP) \ ); -#define XM_RECON_ACCUM __asm__ ( \ - VACCTIMESI0(UChi_10,psi_20,Z0)\ - VACCTIMESI0(UChi_00,psi_30,Z3)\ - VACCTIMESI0(UChi_11,psi_21,Z1)\ - VACCTIMESI0(UChi_01,psi_31,Z4)\ - VACCTIMESI0(UChi_12,psi_22,Z2)\ - VACCTIMESI0(UChi_02,psi_32,Z5)\ - \ - VADD(UChi_10,psi_10,psi_10)\ - VADD(UChi_00,psi_00,psi_00)\ - VADD(UChi_11,psi_11,psi_11)\ - VADD(UChi_01,psi_01,psi_01)\ - VADD(UChi_12,psi_12,psi_12)\ - VADD(UChi_02,psi_02,psi_02)\ - \ - VACCTIMESI1(UChi_10,psi_20,Z0)\ - VACCTIMESI1(UChi_00,psi_30,Z3)\ - VACCTIMESI1(UChi_11,psi_21,Z1)\ - VACCTIMESI1(UChi_01,psi_31,Z4)\ - VACCTIMESI1(UChi_12,psi_22,Z2)\ - VACCTIMESI1(UChi_02,psi_32,Z5)\ - VACCTIMESI2(UChi_10,psi_20,Z0)\ - VACCTIMESI2(UChi_11,psi_21,Z1)\ - VACCTIMESI2(UChi_12,psi_22,Z2)\ - VACCTIMESI2(UChi_00,psi_30,Z3)\ - VACCTIMESI2(UChi_01,psi_31,Z4)\ - VACCTIMESI2(UChi_02,psi_32,Z5)\ +#define XM_RECON_ACCUM __asm__ ( \ + VACCTIMESI0(UChi_10,psi_20,Z0) \ + VACCTIMESI0(UChi_00,psi_30,Z3) \ + VACCTIMESI0(UChi_11,psi_21,Z1) \ + VACCTIMESI0(UChi_01,psi_31,Z4) \ + VACCTIMESI0(UChi_12,psi_22,Z2) \ + VACCTIMESI0(UChi_02,psi_32,Z5) \ + \ + VADD(UChi_10,psi_10,psi_10) \ + VADD(UChi_00,psi_00,psi_00) \ + VADD(UChi_11,psi_11,psi_11) \ + VADD(UChi_01,psi_01,psi_01) \ + VADD(UChi_12,psi_12,psi_12) \ + VADD(UChi_02,psi_02,psi_02) \ + \ + VACCTIMESI1(UChi_10,psi_20,Z0) \ + VACCTIMESI1(UChi_00,psi_30,Z3) \ + VACCTIMESI1(UChi_11,psi_21,Z1) \ + VACCTIMESI1(UChi_01,psi_31,Z4) \ + VACCTIMESI1(UChi_12,psi_22,Z2) \ + VACCTIMESI1(UChi_02,psi_32,Z5) \ + VACCTIMESI2(UChi_10,psi_20,Z0) \ + VACCTIMESI2(UChi_11,psi_21,Z1) \ + VACCTIMESI2(UChi_12,psi_22,Z2) \ + VACCTIMESI2(UChi_00,psi_30,Z3) \ + VACCTIMESI2(UChi_01,psi_31,Z4) \ + VACCTIMESI2(UChi_02,psi_32,Z5) \ ); -#define YP_RECON_ACCUM __asm__ ( \ - VADD(UChi_00,psi_00,psi_00)\ - VADD(UChi_10,psi_10,psi_10)\ - VADD(UChi_01,psi_01,psi_01)\ - VADD(UChi_11,psi_11,psi_11)\ - VADD(UChi_02,psi_02,psi_02)\ - VADD(UChi_12,psi_12,psi_12)\ - VADD(UChi_10,psi_20,psi_20)\ - VADD(UChi_11,psi_21,psi_21)\ - VADD(UChi_12,psi_22,psi_22)\ - VSUB(UChi_00,psi_30,psi_30)\ - VSUB(UChi_01,psi_31,psi_31)\ - VSUB(UChi_02,psi_32,psi_32) ); +#define YP_RECON_ACCUM __asm__ ( \ + VADD(UChi_00,psi_00,psi_00) \ + VADD(UChi_10,psi_10,psi_10) \ + VADD(UChi_01,psi_01,psi_01) \ + VADD(UChi_11,psi_11,psi_11) \ + VADD(UChi_02,psi_02,psi_02) \ + VADD(UChi_12,psi_12,psi_12) \ + VADD(UChi_10,psi_20,psi_20) \ + VADD(UChi_11,psi_21,psi_21) \ + VADD(UChi_12,psi_22,psi_22) \ + VSUB(UChi_00,psi_30,psi_30) \ + VSUB(UChi_01,psi_31,psi_31) \ + VSUB(UChi_02,psi_32,psi_32) ); -#define YM_RECON_ACCUM __asm__ ( \ - VADD(UChi_00,psi_00,psi_00)\ - VADD(UChi_10,psi_10,psi_10)\ - VADD(UChi_01,psi_01,psi_01)\ - VADD(UChi_11,psi_11,psi_11)\ - VADD(UChi_02,psi_02,psi_02)\ - VADD(UChi_12,psi_12,psi_12)\ - VSUB(UChi_10,psi_20,psi_20)\ - VSUB(UChi_11,psi_21,psi_21)\ - VSUB(UChi_12,psi_22,psi_22)\ - VADD(UChi_00,psi_30,psi_30)\ - VADD(UChi_01,psi_31,psi_31)\ - VADD(UChi_02,psi_32,psi_32) ); +#define YM_RECON_ACCUM __asm__ ( \ + VADD(UChi_00,psi_00,psi_00) \ + VADD(UChi_10,psi_10,psi_10) \ + VADD(UChi_01,psi_01,psi_01) \ + VADD(UChi_11,psi_11,psi_11) \ + 
VADD(UChi_02,psi_02,psi_02) \ + VADD(UChi_12,psi_12,psi_12) \ + VSUB(UChi_10,psi_20,psi_20) \ + VSUB(UChi_11,psi_21,psi_21) \ + VSUB(UChi_12,psi_22,psi_22) \ + VADD(UChi_00,psi_30,psi_30) \ + VADD(UChi_01,psi_31,psi_31) \ + VADD(UChi_02,psi_32,psi_32) ); -#define ZP_RECON_ACCUM __asm__ ( \ - VACCTIMESMINUSI0(UChi_00,psi_20,Z0)\ - VACCTIMESI0(UChi_10,psi_30,Z3)\ - VACCTIMESMINUSI0(UChi_01,psi_21,Z1)\ - VACCTIMESI0(UChi_11,psi_31,Z4)\ - VACCTIMESMINUSI0(UChi_02,psi_22,Z2)\ - VACCTIMESI0(UChi_12,psi_32,Z5)\ - VADD(UChi_00,psi_00,psi_00)\ - VADD(UChi_10,psi_10,psi_10)\ - VADD(UChi_01,psi_01,psi_01)\ - VADD(UChi_11,psi_11,psi_11)\ - VADD(UChi_02,psi_02,psi_02)\ - VADD(UChi_12,psi_12,psi_12)\ - VACCTIMESMINUSI1(UChi_00,psi_20,Z0)\ - VACCTIMESI1(UChi_10,psi_30,Z3)\ - VACCTIMESMINUSI1(UChi_01,psi_21,Z1)\ - VACCTIMESI1(UChi_11,psi_31,Z4)\ - VACCTIMESMINUSI1(UChi_02,psi_22,Z2)\ - VACCTIMESI1(UChi_12,psi_32,Z5)\ - VACCTIMESMINUSI2(UChi_00,psi_20,Z0)\ - VACCTIMESMINUSI2(UChi_01,psi_21,Z1)\ - VACCTIMESMINUSI2(UChi_02,psi_22,Z2)\ - VACCTIMESI2(UChi_10,psi_30,Z3)\ - VACCTIMESI2(UChi_11,psi_31,Z4)\ - VACCTIMESI2(UChi_12,psi_32,Z5)\ +#define ZP_RECON_ACCUM __asm__ ( \ + VACCTIMESMINUSI0(UChi_00,psi_20,Z0) \ + VACCTIMESI0(UChi_10,psi_30,Z3) \ + VACCTIMESMINUSI0(UChi_01,psi_21,Z1) \ + VACCTIMESI0(UChi_11,psi_31,Z4) \ + VACCTIMESMINUSI0(UChi_02,psi_22,Z2) \ + VACCTIMESI0(UChi_12,psi_32,Z5) \ + VADD(UChi_00,psi_00,psi_00) \ + VADD(UChi_10,psi_10,psi_10) \ + VADD(UChi_01,psi_01,psi_01) \ + VADD(UChi_11,psi_11,psi_11) \ + VADD(UChi_02,psi_02,psi_02) \ + VADD(UChi_12,psi_12,psi_12) \ + VACCTIMESMINUSI1(UChi_00,psi_20,Z0) \ + VACCTIMESI1(UChi_10,psi_30,Z3) \ + VACCTIMESMINUSI1(UChi_01,psi_21,Z1) \ + VACCTIMESI1(UChi_11,psi_31,Z4) \ + VACCTIMESMINUSI1(UChi_02,psi_22,Z2) \ + VACCTIMESI1(UChi_12,psi_32,Z5) \ + VACCTIMESMINUSI2(UChi_00,psi_20,Z0) \ + VACCTIMESMINUSI2(UChi_01,psi_21,Z1) \ + VACCTIMESMINUSI2(UChi_02,psi_22,Z2) \ + VACCTIMESI2(UChi_10,psi_30,Z3) \ + VACCTIMESI2(UChi_11,psi_31,Z4) \ + VACCTIMESI2(UChi_12,psi_32,Z5) \ ); -#define ZM_RECON_ACCUM __asm__ ( \ - VACCTIMESI0(UChi_00,psi_20,Z0)\ - VACCTIMESMINUSI0(UChi_10,psi_30,Z3)\ - VACCTIMESI0(UChi_01,psi_21,Z1)\ - VACCTIMESMINUSI0(UChi_11,psi_31,Z4)\ - VACCTIMESI0(UChi_02,psi_22,Z2)\ - VACCTIMESMINUSI0(UChi_12,psi_32,Z5)\ - VADD(UChi_00,psi_00,psi_00)\ - VADD(UChi_10,psi_10,psi_10)\ - VADD(UChi_01,psi_01,psi_01)\ - VADD(UChi_11,psi_11,psi_11)\ - VADD(UChi_02,psi_02,psi_02)\ - VADD(UChi_12,psi_12,psi_12)\ - VACCTIMESI1(UChi_00,psi_20,Z0)\ - VACCTIMESMINUSI1(UChi_10,psi_30,Z3)\ - VACCTIMESI1(UChi_01,psi_21,Z1)\ - VACCTIMESMINUSI1(UChi_11,psi_31,Z4)\ - VACCTIMESI1(UChi_02,psi_22,Z2)\ - VACCTIMESMINUSI1(UChi_12,psi_32,Z5)\ - VACCTIMESI2(UChi_00,psi_20,Z0)\ - VACCTIMESI2(UChi_01,psi_21,Z1)\ - VACCTIMESI2(UChi_02,psi_22,Z2)\ - VACCTIMESMINUSI2(UChi_10,psi_30,Z3)\ - VACCTIMESMINUSI2(UChi_11,psi_31,Z4)\ - VACCTIMESMINUSI2(UChi_12,psi_32,Z5)\ +#define ZM_RECON_ACCUM __asm__ ( \ + VACCTIMESI0(UChi_00,psi_20,Z0) \ + VACCTIMESMINUSI0(UChi_10,psi_30,Z3) \ + VACCTIMESI0(UChi_01,psi_21,Z1) \ + VACCTIMESMINUSI0(UChi_11,psi_31,Z4) \ + VACCTIMESI0(UChi_02,psi_22,Z2) \ + VACCTIMESMINUSI0(UChi_12,psi_32,Z5) \ + VADD(UChi_00,psi_00,psi_00) \ + VADD(UChi_10,psi_10,psi_10) \ + VADD(UChi_01,psi_01,psi_01) \ + VADD(UChi_11,psi_11,psi_11) \ + VADD(UChi_02,psi_02,psi_02) \ + VADD(UChi_12,psi_12,psi_12) \ + VACCTIMESI1(UChi_00,psi_20,Z0) \ + VACCTIMESMINUSI1(UChi_10,psi_30,Z3) \ + VACCTIMESI1(UChi_01,psi_21,Z1) \ + VACCTIMESMINUSI1(UChi_11,psi_31,Z4) \ + VACCTIMESI1(UChi_02,psi_22,Z2) 
\ + VACCTIMESMINUSI1(UChi_12,psi_32,Z5) \ + VACCTIMESI2(UChi_00,psi_20,Z0) \ + VACCTIMESI2(UChi_01,psi_21,Z1) \ + VACCTIMESI2(UChi_02,psi_22,Z2) \ + VACCTIMESMINUSI2(UChi_10,psi_30,Z3) \ + VACCTIMESMINUSI2(UChi_11,psi_31,Z4) \ + VACCTIMESMINUSI2(UChi_12,psi_32,Z5) \ ); -#define TP_RECON_ACCUM __asm__ ( \ - VADD(UChi_00,psi_00,psi_00)\ - VADD(UChi_10,psi_10,psi_10)\ - VADD(UChi_01,psi_01,psi_01)\ - VADD(UChi_11,psi_11,psi_11)\ - VADD(UChi_02,psi_02,psi_02)\ - VADD(UChi_12,psi_12,psi_12)\ - VADD(UChi_00,psi_20,psi_20)\ - VADD(UChi_10,psi_30,psi_30)\ - VADD(UChi_01,psi_21,psi_21)\ - VADD(UChi_11,psi_31,psi_31)\ - VADD(UChi_02,psi_22,psi_22)\ - VADD(UChi_12,psi_32,psi_32) ); +#define TP_RECON_ACCUM __asm__ ( \ + VADD(UChi_00,psi_00,psi_00) \ + VADD(UChi_10,psi_10,psi_10) \ + VADD(UChi_01,psi_01,psi_01) \ + VADD(UChi_11,psi_11,psi_11) \ + VADD(UChi_02,psi_02,psi_02) \ + VADD(UChi_12,psi_12,psi_12) \ + VADD(UChi_00,psi_20,psi_20) \ + VADD(UChi_10,psi_30,psi_30) \ + VADD(UChi_01,psi_21,psi_21) \ + VADD(UChi_11,psi_31,psi_31) \ + VADD(UChi_02,psi_22,psi_22) \ + VADD(UChi_12,psi_32,psi_32) ); -#define TM_RECON_ACCUM __asm__ ( \ - VADD(UChi_00,psi_00,psi_00)\ - VADD(UChi_10,psi_10,psi_10)\ - VADD(UChi_01,psi_01,psi_01)\ - VADD(UChi_11,psi_11,psi_11)\ - VADD(UChi_02,psi_02,psi_02)\ - VADD(UChi_12,psi_12,psi_12)\ - VSUB(UChi_00,psi_20,psi_20)\ - VSUB(UChi_10,psi_30,psi_30)\ - VSUB(UChi_01,psi_21,psi_21)\ - VSUB(UChi_11,psi_31,psi_31)\ - VSUB(UChi_02,psi_22,psi_22)\ - VSUB(UChi_12,psi_32,psi_32) ); +#define TM_RECON_ACCUM __asm__ ( \ + VADD(UChi_00,psi_00,psi_00) \ + VADD(UChi_10,psi_10,psi_10) \ + VADD(UChi_01,psi_01,psi_01) \ + VADD(UChi_11,psi_11,psi_11) \ + VADD(UChi_02,psi_02,psi_02) \ + VADD(UChi_12,psi_12,psi_12) \ + VSUB(UChi_00,psi_20,psi_20) \ + VSUB(UChi_10,psi_30,psi_30) \ + VSUB(UChi_01,psi_21,psi_21) \ + VSUB(UChi_11,psi_31,psi_31) \ + VSUB(UChi_02,psi_22,psi_22) \ + VSUB(UChi_12,psi_32,psi_32) ); #define AVX512_PF_L1 #define AVX512_PF_L2_GAUGE @@ -560,27 +560,27 @@ Author: paboyle #define VPREFETCH_G2(A,B) VPREFETCH2(A,B) #endif -#define PF_GAUGE(A) \ +#define PF_GAUGE(A) \ LOAD64(%r8,&U._odata[sU](A)) \ __asm__ ( \ VPREFETCH_G1(0,%r8) VPREFETCH_G1(1,%r8) \ VPREFETCH_G1(2,%r8) VPREFETCH_G1(3,%r8) \ ); -#define SAVE_RESULTi(PTR,pf) \ - LOAD64(%r8,PTR) \ - LOAD64(%r9,pf) \ - __asm__ ( \ - VSTORE(0,%r8,psi_00) VPREFETCH_M1(0,%r9) \ - VSTORE(1,%r8,psi_01) VPREFETCH_M1(1,%r9) \ - VSTORE(2,%r8,psi_02) VPREFETCH_M1(2,%r9) \ - VSTORE(3,%r8,psi_10) VPREFETCH_M1(3,%r9) \ - VSTORE(4,%r8,psi_11) VPREFETCH_M1(4,%r9) \ - VSTORE(5,%r8,psi_12) VPREFETCH_M1(5,%r9) \ - VSTORE(6,%r8,psi_20) VPREFETCH_M1(6,%r9) \ - VSTORE(7,%r8,psi_21) VPREFETCH_M1(7,%r9) \ - VSTORE(8,%r8,psi_22) VPREFETCH_M1(8,%r9) \ - VSTORE(9,%r8,psi_30) VPREFETCH_M1(9,%r9) \ +#define SAVE_RESULTi(PTR,pf) \ + LOAD64(%r8,PTR) \ + LOAD64(%r9,pf) \ + __asm__ ( \ + VSTORE(0,%r8,psi_00) VPREFETCH_M1(0,%r9) \ + VSTORE(1,%r8,psi_01) VPREFETCH_M1(1,%r9) \ + VSTORE(2,%r8,psi_02) VPREFETCH_M1(2,%r9) \ + VSTORE(3,%r8,psi_10) VPREFETCH_M1(3,%r9) \ + VSTORE(4,%r8,psi_11) VPREFETCH_M1(4,%r9) \ + VSTORE(5,%r8,psi_12) VPREFETCH_M1(5,%r9) \ + VSTORE(6,%r8,psi_20) VPREFETCH_M1(6,%r9) \ + VSTORE(7,%r8,psi_21) VPREFETCH_M1(7,%r9) \ + VSTORE(8,%r8,psi_22) VPREFETCH_M1(8,%r9) \ + VSTORE(9,%r8,psi_30) VPREFETCH_M1(9,%r9) \ VSTORE(10,%r8,psi_31) VPREFETCH_M1(10,%r9) \ VSTORE(11,%r8,psi_32) VPREFETCH_M1(11,%r9) \ ); @@ -596,343 +596,343 @@ Author: paboyle #define ADD_RESULTia(PTR,pf) \ - LOAD64(%r8,PTR) \ + LOAD64(%r8,PTR) \ __asm__ ( \ - 
VADDMEM(0,%r8,psi_00,psi_00) \ - VADDMEM(1,%r8,psi_01,psi_01) \ - VADDMEM(2,%r8,psi_02,psi_02) \ - VADDMEM(3,%r8,psi_10,psi_10) \ - VADDMEM(4,%r8,psi_11,psi_11) \ - VADDMEM(5,%r8,psi_12,psi_12) \ - VADDMEM(6,%r8,psi_20,psi_20) \ - VADDMEM(7,%r8,psi_21,psi_21) \ - VADDMEM(8,%r8,psi_22,psi_22) \ - VADDMEM(9,%r8,psi_30,psi_30) \ - VADDMEM(10,%r8,psi_31,psi_31) \ + VADDMEM(0,%r8,psi_00,psi_00) \ + VADDMEM(1,%r8,psi_01,psi_01) \ + VADDMEM(2,%r8,psi_02,psi_02) \ + VADDMEM(3,%r8,psi_10,psi_10) \ + VADDMEM(4,%r8,psi_11,psi_11) \ + VADDMEM(5,%r8,psi_12,psi_12) \ + VADDMEM(6,%r8,psi_20,psi_20) \ + VADDMEM(7,%r8,psi_21,psi_21) \ + VADDMEM(8,%r8,psi_22,psi_22) \ + VADDMEM(9,%r8,psi_30,psi_30) \ + VADDMEM(10,%r8,psi_31,psi_31) \ VADDMEM(11,%r8,psi_32,psi_32) \ - VSTORE(0,%r8,psi_00) \ - VSTORE(1,%r8,psi_01) \ - VSTORE(2,%r8,psi_02) \ - VSTORE(3,%r8,psi_10) \ - VSTORE(4,%r8,psi_11) \ - VSTORE(5,%r8,psi_12) \ - VSTORE(6,%r8,psi_20) \ - VSTORE(7,%r8,psi_21) \ - VSTORE(8,%r8,psi_22) \ - VSTORE(9,%r8,psi_30) \ - VSTORE(10,%r8,psi_31) \ - VSTORE(11,%r8,psi_32) \ + VSTORE(0,%r8,psi_00) \ + VSTORE(1,%r8,psi_01) \ + VSTORE(2,%r8,psi_02) \ + VSTORE(3,%r8,psi_10) \ + VSTORE(4,%r8,psi_11) \ + VSTORE(5,%r8,psi_12) \ + VSTORE(6,%r8,psi_20) \ + VSTORE(7,%r8,psi_21) \ + VSTORE(8,%r8,psi_22) \ + VSTORE(9,%r8,psi_30) \ + VSTORE(10,%r8,psi_31) \ + VSTORE(11,%r8,psi_32) \ ); #ifdef AVX512_PF_L2_TABLE -#define PREFETCH_CHIMU(A) \ - LOAD64(%r9,A) \ - __asm__ ( \ - VPREFETCH_P1(0,%r9) \ - VPREFETCH_P1(1,%r9) \ - VPREFETCH_P1(2,%r9) \ - VPREFETCH_P1(3,%r9) \ - VPREFETCH_P1(4,%r9) \ - VPREFETCH_P1(5,%r9) \ - VPREFETCH_P1(6,%r9) \ - VPREFETCH_P1(7,%r9) \ - VPREFETCH_P1(8,%r9) \ - VPREFETCH_P1(9,%r9) \ - VPREFETCH_P1(10,%r9) \ +#define PREFETCH_CHIMU(A) \ + LOAD64(%r9,A) \ + __asm__ ( \ + VPREFETCH_P1(0,%r9) \ + VPREFETCH_P1(1,%r9) \ + VPREFETCH_P1(2,%r9) \ + VPREFETCH_P1(3,%r9) \ + VPREFETCH_P1(4,%r9) \ + VPREFETCH_P1(5,%r9) \ + VPREFETCH_P1(6,%r9) \ + VPREFETCH_P1(7,%r9) \ + VPREFETCH_P1(8,%r9) \ + VPREFETCH_P1(9,%r9) \ + VPREFETCH_P1(10,%r9) \ VPREFETCH_P1(11,%r9)); #else #define PREFETCH_CHIMU(A) #endif -#define PREFETCH1_CHIMU(A) \ - LOAD64(%r9,A) \ - __asm__ ( \ - VPREFETCH_P1(0,%r9) \ - VPREFETCH_P1(1,%r9) \ - VPREFETCH_P1(2,%r9) \ - VPREFETCH_P1(3,%r9) \ - VPREFETCH_P1(4,%r9) \ - VPREFETCH_P1(5,%r9) \ - VPREFETCH_P1(6,%r9) \ - VPREFETCH_P1(7,%r9) \ - VPREFETCH_P1(8,%r9) \ - VPREFETCH_P1(9,%r9) \ - VPREFETCH_P1(10,%r9) \ +#define PREFETCH1_CHIMU(A) \ + LOAD64(%r9,A) \ + __asm__ ( \ + VPREFETCH_P1(0,%r9) \ + VPREFETCH_P1(1,%r9) \ + VPREFETCH_P1(2,%r9) \ + VPREFETCH_P1(3,%r9) \ + VPREFETCH_P1(4,%r9) \ + VPREFETCH_P1(5,%r9) \ + VPREFETCH_P1(6,%r9) \ + VPREFETCH_P1(7,%r9) \ + VPREFETCH_P1(8,%r9) \ + VPREFETCH_P1(9,%r9) \ + VPREFETCH_P1(10,%r9) \ VPREFETCH_P1(11,%r9)); -#define PERMUTE_DIR0 __asm__ ( \ - VPERM0(Chi_00,Chi_00) \ - VPERM0(Chi_01,Chi_01) \ - VPERM0(Chi_02,Chi_02) \ - VPERM0(Chi_10,Chi_10) \ - VPERM0(Chi_11,Chi_11) \ - VPERM0(Chi_12,Chi_12) ); +#define PERMUTE_DIR0 __asm__ ( \ + VPERM0(Chi_00,Chi_00) \ + VPERM0(Chi_01,Chi_01) \ + VPERM0(Chi_02,Chi_02) \ + VPERM0(Chi_10,Chi_10) \ + VPERM0(Chi_11,Chi_11) \ + VPERM0(Chi_12,Chi_12) ); -#define PERMUTE_DIR1 __asm__ ( \ - VPERM1(Chi_00,Chi_00) \ - VPERM1(Chi_01,Chi_01) \ - VPERM1(Chi_02,Chi_02) \ - VPERM1(Chi_10,Chi_10) \ - VPERM1(Chi_11,Chi_11) \ - VPERM1(Chi_12,Chi_12)); +#define PERMUTE_DIR1 __asm__ ( \ + VPERM1(Chi_00,Chi_00) \ + VPERM1(Chi_01,Chi_01) \ + VPERM1(Chi_02,Chi_02) \ + VPERM1(Chi_10,Chi_10) \ + VPERM1(Chi_11,Chi_11) \ + VPERM1(Chi_12,Chi_12)); -#define 
PERMUTE_DIR2 __asm__ ( \ - VPERM2(Chi_00,Chi_00) \ - VPERM2(Chi_01,Chi_01) \ - VPERM2(Chi_02,Chi_02) \ - VPERM2(Chi_10,Chi_10) \ - VPERM2(Chi_11,Chi_11) \ - VPERM2(Chi_12,Chi_12) ); +#define PERMUTE_DIR2 __asm__ ( \ + VPERM2(Chi_00,Chi_00) \ + VPERM2(Chi_01,Chi_01) \ + VPERM2(Chi_02,Chi_02) \ + VPERM2(Chi_10,Chi_10) \ + VPERM2(Chi_11,Chi_11) \ + VPERM2(Chi_12,Chi_12) ); -#define PERMUTE_DIR3 __asm__ ( \ - VPERM3(Chi_00,Chi_00) \ - VPERM3(Chi_01,Chi_01) \ - VPERM3(Chi_02,Chi_02) \ - VPERM3(Chi_10,Chi_10) \ - VPERM3(Chi_11,Chi_11) \ - VPERM3(Chi_12,Chi_12) ); +#define PERMUTE_DIR3 __asm__ ( \ + VPERM3(Chi_00,Chi_00) \ + VPERM3(Chi_01,Chi_01) \ + VPERM3(Chi_02,Chi_02) \ + VPERM3(Chi_10,Chi_10) \ + VPERM3(Chi_11,Chi_11) \ + VPERM3(Chi_12,Chi_12) ); #define MULT_ADDSUB_2SPIN(ptr,pf) \ - LOAD64(%r8,ptr) \ - LOAD64(%r9,pf) \ - __asm__ ( \ - VPREFETCH_G2(9,%r8) \ - VPREFETCH_G2(10,%r8) \ - VPREFETCH_G2(11,%r8) \ - VPREFETCH_G2(12,%r8) \ - VPREFETCH_G2(13,%r8) \ - VPREFETCH_G2(14,%r8) \ - VPREFETCH_G2(15,%r8) \ - VPREFETCH_G2(16,%r8) \ - VPREFETCH_G2(17,%r8) \ - VSHUF(Chi_00,T1) \ - VMOVIDUP(0,%r8,Z0 ) \ - VMOVIDUP(3,%r8,Z1 ) \ + LOAD64(%r8,ptr) \ + LOAD64(%r9,pf) \ + __asm__ ( \ + VPREFETCH_G2(9,%r8) \ + VPREFETCH_G2(10,%r8) \ + VPREFETCH_G2(11,%r8) \ + VPREFETCH_G2(12,%r8) \ + VPREFETCH_G2(13,%r8) \ + VPREFETCH_G2(14,%r8) \ + VPREFETCH_G2(15,%r8) \ + VPREFETCH_G2(16,%r8) \ + VPREFETCH_G2(17,%r8) \ + VSHUF(Chi_00,T1) \ + VMOVIDUP(0,%r8,Z0 ) \ + VMOVIDUP(3,%r8,Z1 ) \ VMOVIDUP(6,%r8,Z2 ) VSHUF(Chi_10,T2) \ /*6*/ \ - VMUL(Z0,T1,UChi_00) VMOVRDUP(0,%r8,Z3 ) \ - VMUL(Z0,T2,UChi_10) VMOVRDUP(3,%r8,Z4 ) \ - VMUL(Z1,T1,UChi_01) VMOVRDUP(6,%r8,Z5 ) \ - VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0 ) \ - VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1 ) \ - VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2 ) \ - VPREFETCH_M1(0,%r9) \ - VPREFETCH_M1(1,%r9) \ - VPREFETCH_M1(2,%r9) \ - VPREFETCH_M1(3,%r9) \ - /*18*/ \ - VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \ - VMADDSUB(Z3,Chi_10,UChi_10) \ - VMADDSUB(Z4,Chi_00,UChi_01) VMOVRDUP(1,%r8,Z3 ) \ - VMADDSUB(Z4,Chi_10,UChi_11) VSHUF(Chi_11,T2) \ - VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 ) \ - VMADDSUB(Z5,Chi_10,UChi_12) \ - VPREFETCH_M1(4,%r9) \ - VPREFETCH_M1(5,%r9) \ - VPREFETCH_M1(6,%r9) \ - VPREFETCH_M1(7,%r9) \ - /*28*/ \ - VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 ) \ - VMADDSUB(Z0,T2,UChi_10) \ - VMADDSUB(Z1,T1,UChi_01) VMOVIDUP(2,%r8,Z0 ) \ - VMADDSUB(Z1,T2,UChi_11) \ - VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1 ) \ - VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2 ) \ - VPREFETCH2(12,%r9) \ - VPREFETCH2(13,%r9) \ - VPREFETCH2(14,%r9) \ - VPREFETCH2(15,%r9) \ - VPREFETCH2(16,%r9) \ - VPREFETCH2(17,%r9) \ - VPREFETCH2(18,%r9) \ - VPREFETCH2(19,%r9) \ - VPREFETCH2(20,%r9) \ - VPREFETCH2(21,%r9) \ - VPREFETCH2(22,%r9) \ - VPREFETCH2(23,%r9) \ - /*38*/ \ - VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1) \ - VMADDSUB(Z3,Chi_11,UChi_10) \ - VMADDSUB(Z4,Chi_01,UChi_01) VMOVRDUP(2,%r8,Z3 ) \ - VMADDSUB(Z4,Chi_11,UChi_11) VSHUF(Chi_12,T2) \ - VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \ - VMADDSUB(Z5,Chi_11,UChi_12) \ - VPREFETCH_M1(9,%r8) \ - VPREFETCH_M1(10,%r8) \ - VPREFETCH_M1(11,%r8) \ - VPREFETCH_M1(12,%r8) \ - VPREFETCH_M1(13,%r8) \ - VPREFETCH_M1(14,%r8) \ - VPREFETCH_M1(15,%r8) \ - VPREFETCH_M1(16,%r8) \ - VPREFETCH_M1(17,%r8) \ - /*48*/ \ - VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \ - VMADDSUB(Z0,T2,UChi_10) \ - VMADDSUB(Z1,T1,UChi_01) \ - VMADDSUB(Z1,T2,UChi_11) \ - VMADDSUB(Z2,T1,UChi_02) \ - VMADDSUB(Z2,T2,UChi_12) \ - VPREFETCH_M1(8,%r9) \ - VPREFETCH_M1(9,%r9) \ - 
VPREFETCH_M1(10,%r9) \ - VPREFETCH_M1(11,%r9) \ - /*55*/ \ - VMADDSUB(Z3,Chi_02,UChi_00) \ - VMADDSUB(Z3,Chi_12,UChi_10) \ - VMADDSUB(Z4,Chi_02,UChi_01) \ - VMADDSUB(Z4,Chi_12,UChi_11) \ - VMADDSUB(Z5,Chi_02,UChi_02) \ - VMADDSUB(Z5,Chi_12,UChi_12) \ + VMUL(Z0,T1,UChi_00) VMOVRDUP(0,%r8,Z3 ) \ + VMUL(Z0,T2,UChi_10) VMOVRDUP(3,%r8,Z4 ) \ + VMUL(Z1,T1,UChi_01) VMOVRDUP(6,%r8,Z5 ) \ + VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0 ) \ + VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1 ) \ + VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2 ) \ + VPREFETCH_M1(0,%r9) \ + VPREFETCH_M1(1,%r9) \ + VPREFETCH_M1(2,%r9) \ + VPREFETCH_M1(3,%r9) \ + /*18*/ \ + VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \ + VMADDSUB(Z3,Chi_10,UChi_10) \ + VMADDSUB(Z4,Chi_00,UChi_01) VMOVRDUP(1,%r8,Z3 ) \ + VMADDSUB(Z4,Chi_10,UChi_11) VSHUF(Chi_11,T2) \ + VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 ) \ + VMADDSUB(Z5,Chi_10,UChi_12) \ + VPREFETCH_M1(4,%r9) \ + VPREFETCH_M1(5,%r9) \ + VPREFETCH_M1(6,%r9) \ + VPREFETCH_M1(7,%r9) \ + /*28*/ \ + VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 ) \ + VMADDSUB(Z0,T2,UChi_10) \ + VMADDSUB(Z1,T1,UChi_01) VMOVIDUP(2,%r8,Z0 ) \ + VMADDSUB(Z1,T2,UChi_11) \ + VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1 ) \ + VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2 ) \ + VPREFETCH2(12,%r9) \ + VPREFETCH2(13,%r9) \ + VPREFETCH2(14,%r9) \ + VPREFETCH2(15,%r9) \ + VPREFETCH2(16,%r9) \ + VPREFETCH2(17,%r9) \ + VPREFETCH2(18,%r9) \ + VPREFETCH2(19,%r9) \ + VPREFETCH2(20,%r9) \ + VPREFETCH2(21,%r9) \ + VPREFETCH2(22,%r9) \ + VPREFETCH2(23,%r9) \ + /*38*/ \ + VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1) \ + VMADDSUB(Z3,Chi_11,UChi_10) \ + VMADDSUB(Z4,Chi_01,UChi_01) VMOVRDUP(2,%r8,Z3 ) \ + VMADDSUB(Z4,Chi_11,UChi_11) VSHUF(Chi_12,T2) \ + VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \ + VMADDSUB(Z5,Chi_11,UChi_12) \ + VPREFETCH_M1(9,%r8) \ + VPREFETCH_M1(10,%r8) \ + VPREFETCH_M1(11,%r8) \ + VPREFETCH_M1(12,%r8) \ + VPREFETCH_M1(13,%r8) \ + VPREFETCH_M1(14,%r8) \ + VPREFETCH_M1(15,%r8) \ + VPREFETCH_M1(16,%r8) \ + VPREFETCH_M1(17,%r8) \ + /*48*/ \ + VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \ + VMADDSUB(Z0,T2,UChi_10) \ + VMADDSUB(Z1,T1,UChi_01) \ + VMADDSUB(Z1,T2,UChi_11) \ + VMADDSUB(Z2,T1,UChi_02) \ + VMADDSUB(Z2,T2,UChi_12) \ + VPREFETCH_M1(8,%r9) \ + VPREFETCH_M1(9,%r9) \ + VPREFETCH_M1(10,%r9) \ + VPREFETCH_M1(11,%r9) \ + /*55*/ \ + VMADDSUB(Z3,Chi_02,UChi_00) \ + VMADDSUB(Z3,Chi_12,UChi_10) \ + VMADDSUB(Z4,Chi_02,UChi_01) \ + VMADDSUB(Z4,Chi_12,UChi_11) \ + VMADDSUB(Z5,Chi_02,UChi_02) \ + VMADDSUB(Z5,Chi_12,UChi_12) \ /*61 insns*/ ); -#define MULT_ADDSUB_2SPIN_LS(ptr,pf) \ - LOAD64(%r8,ptr) \ - LOAD64(%r9,pf) \ - __asm__ ( \ - VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \ - VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \ - VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \ - VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \ - VPREFETCH_M1(0,%r9) \ - VPREFETCH_M1(1,%r9) \ - VPREFETCH_M1(2,%r9) \ - VPREFETCH_M1(3,%r9) \ - /*8*/ \ - VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \ +#define MULT_ADDSUB_2SPIN_LS(ptr,pf) \ + LOAD64(%r8,ptr) \ + LOAD64(%r9,pf) \ + __asm__ ( \ + VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \ + VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \ + VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \ + VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \ + VPREFETCH_M1(0,%r9) \ + VPREFETCH_M1(1,%r9) \ + VPREFETCH_M1(2,%r9) \ + VPREFETCH_M1(3,%r9) \ + /*8*/ \ + VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \ VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \ VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) 
VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \ VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \ - VPREFETCH_M1(4,%r9) \ - VPREFETCH_M1(5,%r9) \ - VPREFETCH_M1(6,%r9) \ - VPREFETCH_M1(7,%r9) \ - /*16*/ \ - VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \ + VPREFETCH_M1(4,%r9) \ + VPREFETCH_M1(5,%r9) \ + VPREFETCH_M1(6,%r9) \ + VPREFETCH_M1(7,%r9) \ + /*16*/ \ + VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \ VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \ VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \ - VPREFETCH_M1(8,%r9) \ - VPREFETCH_M1(9,%r9) \ - VPREFETCH_M1(10,%r9) \ - VPREFETCH_M1(11,%r9) \ - /*22*/ \ - VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \ + VPREFETCH_M1(8,%r9) \ + VPREFETCH_M1(9,%r9) \ + VPREFETCH_M1(10,%r9) \ + VPREFETCH_M1(11,%r9) \ + /*22*/ \ + VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \ VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \ VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \ VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \ - VPREFETCH_M2(12,%r9) \ - VPREFETCH_M2(13,%r9) \ - VPREFETCH_M2(14,%r9) \ - VPREFETCH_M2(15,%r9) \ - /*30*/ \ - VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \ - VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \ - VPREFETCH_M2(16,%r9) \ - VPREFETCH_M2(17,%r9) \ - VPREFETCH_M2(18,%r9) \ - VPREFETCH_M2(19,%r9) \ - VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \ - /*36*/ \ + VPREFETCH_M2(12,%r9) \ + VPREFETCH_M2(13,%r9) \ + VPREFETCH_M2(14,%r9) \ + VPREFETCH_M2(15,%r9) \ + /*30*/ \ + VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \ + VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \ + VPREFETCH_M2(16,%r9) \ + VPREFETCH_M2(17,%r9) \ + VPREFETCH_M2(18,%r9) \ + VPREFETCH_M2(19,%r9) \ + VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \ + /*36*/ \ VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \ VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \ VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \ - VPREFETCH_M2(20,%r9) \ - VPREFETCH_M2(21,%r9) \ - VPREFETCH_M2(22,%r9) \ - VPREFETCH_M2(23,%r9) \ - VPREFETCH_G1(2,%r8) \ - VPREFETCH_G1(3,%r8) \ - VPREFETCH_G2(4,%r8) \ - VPREFETCH_G2(5,%r8) \ - VPREFETCH_G2(6,%r8) \ - VPREFETCH_G2(7,%r8) \ + VPREFETCH_M2(20,%r9) \ + VPREFETCH_M2(21,%r9) \ + VPREFETCH_M2(22,%r9) \ + VPREFETCH_M2(23,%r9) \ + VPREFETCH_G1(2,%r8) \ + VPREFETCH_G1(3,%r8) \ + VPREFETCH_G2(4,%r8) \ + VPREFETCH_G2(5,%r8) \ + VPREFETCH_G2(6,%r8) \ + VPREFETCH_G2(7,%r8) \ /*42 insns*/ ); -#define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) \ - LOAD64(%r8,ptr) \ - LOAD64(%r9,pf) \ - __asm__ ( \ - VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \ - VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \ - VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \ - VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \ - /*8*/ \ - VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \ +#define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) \ + LOAD64(%r8,ptr) \ + LOAD64(%r9,pf) \ + __asm__ ( \ + VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \ + VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \ + VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \ + VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \ + /*8*/ \ + VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \ VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \ VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \ VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) 
VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \ - /*16*/ \ - VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \ + /*16*/ \ + VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \ VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \ VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \ - /*22*/ \ - VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \ + /*22*/ \ + VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \ VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \ VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \ VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \ - /*30*/ \ - VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \ - VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \ - VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \ - /*36*/ \ + /*30*/ \ + VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \ + VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \ + VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \ + /*36*/ \ VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \ VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \ VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \ - /* VPREFETCH1(2,%r8)*/ \ - /* VPREFETCH1(3,%r8)*/ \ + /* VPREFETCH1(2,%r8)*/ \ + /* VPREFETCH1(3,%r8)*/ \ /*42 insns*/ ); #define Z6 Chi_00 -#define MULT_ADDSUB_2SPIN_NEW(ptr,pf) \ - LOAD64(%r8,ptr) \ - __asm__ ( \ - VSHUFMEM(0,%r8,Z0) \ - VRDUP(Chi_00,T1) VIDUP(Chi_00,Chi_00) \ - VRDUP(Chi_10,T2) VIDUP(Chi_10,Chi_10) \ - VMUL(Z0,Chi_00,Z1) VMUL(Z0,Chi_10,Z2) \ - VSHUFMEM(3,%r8,Z0) \ - VMUL(Z0,Chi_00,Z3) VMUL(Z0,Chi_10,Z4) \ - VSHUFMEM(6,%r8,Z0) \ - VMUL(Z0,Chi_00,Z5) VMUL(Z0,Chi_10,Z6) \ - VMULMEM(0,%r8,T1,UChi_00) VMULMEM(0,%r8,T2,UChi_10) \ - VMULMEM(3,%r8,T1,UChi_01) VMULMEM(3,%r8,T2,UChi_11) \ - VMULMEM(6,%r8,T1,UChi_02) VMULMEM(6,%r8,T2,UChi_12) \ - /*11 cycles*/ \ - VSHUFMEM(1,%r8,Z0) \ - VRDUP(Chi_01,T1) VIDUP(Chi_01,Chi_01) \ - VRDUP(Chi_11,T2) VIDUP(Chi_11,Chi_11) \ - VMADD(Z0,Chi_01,Z1) VMADD(Z0,Chi_11,Z2) \ - VSHUFMEM(4,%r8,Z0) \ - VMADD(Z0,Chi_01,Z3) VMADD(Z0,Chi_11,Z4) \ - VSHUFMEM(7,%r8,Z0) \ - VMADD(Z0,Chi_01,Z5) VMADD(Z0,Chi_11,Z6) \ - VMADDMEM(1,%r8,T1,UChi_00) VMADDMEM(1,%r8,T2,UChi_10) \ - VMADDMEM(4,%r8,T1,UChi_01) VMADDMEM(4,%r8,T2,UChi_11) \ - VMADDMEM(7,%r8,T1,UChi_02) VMADDMEM(7,%r8,T2,UChi_12) \ - /*22 cycles*/ \ - VSHUFMEM(2,%r8,Z0) \ - VRDUP(Chi_02,T1) VIDUP(Chi_02,Chi_02) \ - VRDUP(Chi_12,T2) VIDUP(Chi_12,Chi_12) \ - VMADD(Z0,Chi_02,Z1) VMADD(Z0,Chi_12,Z2) \ - VSHUFMEM(5,%r8,Z0) \ - VMADD(Z0,Chi_02,Z3) VMADD(Z0,Chi_12,Z4) \ - VSHUFMEM(8,%r8,Z0) \ - VMADD(Z0,Chi_02,Z5) VMADD(Z0,Chi_12,Z6) \ - /*33 cycles*/ \ - VMADDSUBMEM(2,%r8,T1,Z1) VMADDSUBMEM(2,%r8,T2,Z2) \ - VMADDSUBMEM(5,%r8,T1,Z3) VMADDSUBMEM(5,%r8,T2,Z4) \ - VMADDSUBMEM(8,%r8,T1,Z5) VMADDSUBMEM(8,%r8,T2,Z6) \ - /*stall*/ \ - /*stall*/ \ - /*stall*/ \ - VADD(Z1,UChi_00,UChi_00) VADD(Z2,UChi_10,UChi_10) \ - VADD(Z3,UChi_01,UChi_01) VADD(Z4,UChi_11,UChi_11) \ - VADD(Z5,UChi_02,UChi_02) VADD(Z6,UChi_12,UChi_12) ) +#define MULT_ADDSUB_2SPIN_NEW(ptr,pf) \ + LOAD64(%r8,ptr) \ + __asm__ ( \ + VSHUFMEM(0,%r8,Z0) \ + VRDUP(Chi_00,T1) VIDUP(Chi_00,Chi_00) \ + VRDUP(Chi_10,T2) VIDUP(Chi_10,Chi_10) \ + VMUL(Z0,Chi_00,Z1) VMUL(Z0,Chi_10,Z2) \ + VSHUFMEM(3,%r8,Z0) \ + VMUL(Z0,Chi_00,Z3) VMUL(Z0,Chi_10,Z4) \ + VSHUFMEM(6,%r8,Z0) \ + VMUL(Z0,Chi_00,Z5) VMUL(Z0,Chi_10,Z6) \ + VMULMEM(0,%r8,T1,UChi_00) VMULMEM(0,%r8,T2,UChi_10) \ + VMULMEM(3,%r8,T1,UChi_01) 
VMULMEM(3,%r8,T2,UChi_11) \ + VMULMEM(6,%r8,T1,UChi_02) VMULMEM(6,%r8,T2,UChi_12) \ + /*11 cycles*/ \ + VSHUFMEM(1,%r8,Z0) \ + VRDUP(Chi_01,T1) VIDUP(Chi_01,Chi_01) \ + VRDUP(Chi_11,T2) VIDUP(Chi_11,Chi_11) \ + VMADD(Z0,Chi_01,Z1) VMADD(Z0,Chi_11,Z2) \ + VSHUFMEM(4,%r8,Z0) \ + VMADD(Z0,Chi_01,Z3) VMADD(Z0,Chi_11,Z4) \ + VSHUFMEM(7,%r8,Z0) \ + VMADD(Z0,Chi_01,Z5) VMADD(Z0,Chi_11,Z6) \ + VMADDMEM(1,%r8,T1,UChi_00) VMADDMEM(1,%r8,T2,UChi_10) \ + VMADDMEM(4,%r8,T1,UChi_01) VMADDMEM(4,%r8,T2,UChi_11) \ + VMADDMEM(7,%r8,T1,UChi_02) VMADDMEM(7,%r8,T2,UChi_12) \ + /*22 cycles*/ \ + VSHUFMEM(2,%r8,Z0) \ + VRDUP(Chi_02,T1) VIDUP(Chi_02,Chi_02) \ + VRDUP(Chi_12,T2) VIDUP(Chi_12,Chi_12) \ + VMADD(Z0,Chi_02,Z1) VMADD(Z0,Chi_12,Z2) \ + VSHUFMEM(5,%r8,Z0) \ + VMADD(Z0,Chi_02,Z3) VMADD(Z0,Chi_12,Z4) \ + VSHUFMEM(8,%r8,Z0) \ + VMADD(Z0,Chi_02,Z5) VMADD(Z0,Chi_12,Z6) \ + /*33 cycles*/ \ + VMADDSUBMEM(2,%r8,T1,Z1) VMADDSUBMEM(2,%r8,T2,Z2) \ + VMADDSUBMEM(5,%r8,T1,Z3) VMADDSUBMEM(5,%r8,T2,Z4) \ + VMADDSUBMEM(8,%r8,T1,Z5) VMADDSUBMEM(8,%r8,T2,Z6) \ + /*stall*/ \ + /*stall*/ \ + /*stall*/ \ + VADD(Z1,UChi_00,UChi_00) VADD(Z2,UChi_10,UChi_10) \ + VADD(Z3,UChi_01,UChi_01) VADD(Z4,UChi_11,UChi_11) \ + VADD(Z5,UChi_02,UChi_02) VADD(Z6,UChi_12,UChi_12) ) #endif
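
For orientation only (not part of the patch): the macros reformatted above implement the Wilson spin projection and reconstruction spelled out in the comments, e.g. "hspin(0)=fspin(0)+timesI(fspin(3))" for XP_PROJMEM and "fspin(2)=timesMinusI(hspin(1))" for XP_RECON. Below is a minimal scalar C sketch of that arithmetic, written directly from those comments; the array layout, type name and function names are illustrative assumptions and do not correspond to Grid's actual data structures.

    #include <complex.h>

    /* Hypothetical scalar layout: a colour vector is 3 complex numbers,
     * a full spinor is 4 such vectors, a half spinor is 2. */
    typedef double complex vcolor[3];

    /* XP projection: hspin(0)=fspin(0)+i*fspin(3), hspin(1)=fspin(1)+i*fspin(2) */
    void xp_proj(const vcolor fspin[4], vcolor hspin[2])
    {
      for (int c = 0; c < 3; c++) {
        hspin[0][c] = fspin[0][c] + I * fspin[3][c];
        hspin[1][c] = fspin[1][c] + I * fspin[2][c];
      }
    }

    /* XP reconstruction: fspin(0)=hspin(0), fspin(1)=hspin(1),
     * fspin(2)=-i*hspin(1), fspin(3)=-i*hspin(0) */
    void xp_recon(const vcolor hspin[2], vcolor fspin[4])
    {
      for (int c = 0; c < 3; c++) {
        fspin[0][c] = hspin[0][c];
        fspin[1][c] = hspin[1][c];
        fspin[2][c] = -I * hspin[1][c];
        fspin[3][c] = -I * hspin[0][c];
      }
    }

The AVX-512 macros in Intel512wilson.h vectorise exactly this pattern across lattice sites, with the VACCTIMESI*/VACCTIMESMINUSI* sequences performing the +/- i multiplications via shuffles and masked adds instead of explicit complex arithmetic.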