1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-27 22:25:56 +01:00

Formatting emacs compliant

This commit is contained in:
paboyle 2018-01-12 23:25:02 +00:00
parent b815f5f764
commit bd15c38ae8
6 changed files with 730 additions and 730 deletions

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -23,8 +23,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_ASM_AV512_H #ifndef GRID_ASM_AV512_H
#define GRID_ASM_AV512_H #define GRID_ASM_AV512_H
@ -44,21 +44,21 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp) #define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp) #define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)
#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\ #define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \
VSHUFMEMf(O,P,tmp) \ VSHUFMEMf(O,P,tmp) \
VMULMEMf(O,P,B,Biirr) \ VMULMEMf(O,P,B,Biirr) \
VMULMEMf(O,P,C,Ciirr) \ VMULMEMf(O,P,C,Ciirr) \
VMULf(tmp,B,Briir) \ VMULf(tmp,B,Briir) \
VMULf(tmp,C,Criir) VMULf(tmp,C,Criir)
#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\ #define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \
VSHUFMEMd(O,P,tmp) \ VSHUFMEMd(O,P,tmp) \
VMULMEMd(O,P,B,Biirr) \ VMULMEMd(O,P,B,Biirr) \
VMULMEMd(O,P,C,Ciirr) \ VMULMEMd(O,P,C,Ciirr) \
VMULd(tmp,B,Briir) \ VMULd(tmp,B,Briir) \
VMULd(tmp,C,Criir) VMULd(tmp,C,Criir)
#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\ #define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \
VSHUFMEMf(O,P,tmp) \ VSHUFMEMf(O,P,tmp) \
VMADDMEMf(O,P,B,Biirr) \ VMADDMEMf(O,P,B,Biirr) \
VMADDMEMf(O,P,C,Ciirr) \ VMADDMEMf(O,P,C,Ciirr) \
@ -73,16 +73,16 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
VMADDd(tmp,C,Criir) VMADDd(tmp,C,Criir)
// Merges accumulation for complex dot chain; less efficient under avx512 // Merges accumulation for complex dot chain; less efficient under avx512
#define ZEND1f(Criir,Ciirr, tmp) "vshufps $0xb1," #Criir "," #Criir "," #tmp ";\n"\ #define ZEND1f(Criir,Ciirr, tmp) "vshufps $0xb1," #Criir "," #Criir "," #tmp ";\n" \
"vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n" "vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n"
#define ZEND2f(Criir,Ciirr, tmp) "vshufps $0xb1," #Ciirr "," #Ciirr "," #tmp ";\n"\ #define ZEND2f(Criir,Ciirr, tmp) "vshufps $0xb1," #Ciirr "," #Ciirr "," #tmp ";\n" \
"vsubps " #tmp "," #Ciirr "," #Criir"{%k7}" ";\n" "vsubps " #tmp "," #Ciirr "," #Criir"{%k7}" ";\n"
/* ZEND1d: double-precision counterpart of ZEND1f. Emits vshufpd $0x55 to swap the two doubles of every pair of Criir into tmp, then adds tmp to Criir under write-mask %k6. The add is vaddpd (previously vaddps) so that it operates on 64-bit lanes like the vshufpd above and the vsubpd in ZEND2d. Ciirr is accepted but unused here. */
#define ZEND1d(Criir,Ciirr, tmp) "vshufpd $0x55," #Criir "," #Criir "," #tmp ";\n"\ #define ZEND1d(Criir,Ciirr, tmp) "vshufpd $0x55," #Criir "," #Criir "," #tmp ";\n"\
"vaddpd " #tmp "," #Criir "," #Criir"{%k6}" ";\n" "vaddpd " #tmp "," #Criir "," #Criir"{%k6}" ";\n"
#define ZEND2d(Criir,Ciirr, tmp) "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp ";\n"\ #define ZEND2d(Criir,Ciirr, tmp) "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp ";\n" \
"vsubpd " #tmp "," #Ciirr "," #Criir"{%k7};\n" // ri+ir ; ri+ir,rr-ii "vsubpd " #tmp "," #Ciirr "," #Criir"{%k7};\n" // ri+ir ; ri+ir,rr-ii
#define VMOVRDUPd(OFF,A,DEST) "vpshufd $0x44," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: imm 0x44 selects elements 0,1,0,1, i.e. duplicates the low 64 bits of each 128-bit group #define VMOVRDUPd(OFF,A,DEST) "vpshufd $0x44," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: imm 0x44 selects elements 0,1,0,1, i.e. duplicates the low 64 bits of each 128-bit group
@ -123,7 +123,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define VMADDSUBIDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n" #define VMADDSUBIDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
#define VMULRDUPd(O,P,B,accum) "vmulpd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n" #define VMULRDUPd(O,P,B,accum) "vmulpd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
#define VMULIDUPd(O,P,B,accum) "vmulpd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n" #define VMULIDUPd(O,P,B,accum) "vmulpd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
/* /*
* TimesI is used only in the XP recon * TimesI is used only in the XP recon
* Could zero the regs and use RECON_ACCUM * Could zero the regs and use RECON_ACCUM
*/ */

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -23,8 +23,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_ASM_INTEL_COMMON_512_H #ifndef GRID_ASM_INTEL_COMMON_512_H
#define GRID_ASM_INTEL_COMMON_512_H #define GRID_ASM_INTEL_COMMON_512_H
@ -37,9 +37,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
// Opcodes common // Opcodes common
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// MASK_REGS: a single inline-asm statement that sets up two AVX-512 mask registers.
//   %k6 <- 0xAAAA (odd-numbered bits set)
//   %k7 <- 0x5555 (even-numbered bits set)
// Each constant is staged through %eax and moved with kmovw; %eax is the only
// register named in the clobber list (the mask registers themselves are not listed).
#define MASK_REGS \ #define MASK_REGS \
__asm__ ("mov $0xAAAA, %%eax \n"\ __asm__ ("mov $0xAAAA, %%eax \n" \
"kmovw %%eax, %%k6 \n"\ "kmovw %%eax, %%k6 \n" \
"mov $0x5555, %%eax \n"\ "mov $0x5555, %%eax \n" \
"kmovw %%eax, %%k7 \n" : : : "%eax"); "kmovw %%eax, %%k7 \n" : : : "%eax");
//#define label(B) __asm__ ( __func__ _LINE__ #B ":\n" ); //#define label(B) __asm__ ( __func__ _LINE__ #B ":\n" );

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -23,8 +23,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
// No include guard: this file may be included multiple times, as it is an #undef clean-up of macros // No include guard: this file may be included multiple times, as it is an #undef clean-up of macros
#undef VZERO #undef VZERO
#undef VMOV #undef VMOV

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -23,8 +23,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_ASM_AV512_H #ifndef GRID_ASM_AV512_H
#define GRID_ASM_AV512_H #define GRID_ASM_AV512_H
@ -44,21 +44,21 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp) #define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp) #define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)
#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\ #define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \
VSHUFMEMf(O,P,tmp) \ VSHUFMEMf(O,P,tmp) \
VMULMEMf(O,P,B,Biirr) \ VMULMEMf(O,P,B,Biirr) \
VMULMEMf(O,P,C,Ciirr) \ VMULMEMf(O,P,C,Ciirr) \
VMULf(tmp,B,Briir) \ VMULf(tmp,B,Briir) \
VMULf(tmp,C,Criir) VMULf(tmp,C,Criir)
#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\ #define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \
VSHUFMEMd(O,P,tmp) \ VSHUFMEMd(O,P,tmp) \
VMULMEMd(O,P,B,Biirr) \ VMULMEMd(O,P,B,Biirr) \
VMULMEMd(O,P,C,Ciirr) \ VMULMEMd(O,P,C,Ciirr) \
VMULd(tmp,B,Briir) \ VMULd(tmp,B,Briir) \
VMULd(tmp,C,Criir) VMULd(tmp,C,Criir)
#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\ #define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \
VSHUFMEMf(O,P,tmp) \ VSHUFMEMf(O,P,tmp) \
VMADDMEMf(O,P,B,Biirr) \ VMADDMEMf(O,P,B,Biirr) \
VMADDMEMf(O,P,C,Ciirr) \ VMADDMEMf(O,P,C,Ciirr) \
@ -106,7 +106,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n" #define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n" #define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
// Acc = Acc - i A // Acc = Acc - i A
#define VACCTIMESMINUSI0d(A,ACC,tmp) #define VACCTIMESMINUSI0d(A,ACC,tmp)
#define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n" #define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n" #define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -23,8 +23,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
// No include guard: this file may be included multiple times, as it is an #undef clean-up of macros // No include guard: this file may be included multiple times, as it is an #undef clean-up of macros
#undef VZERO #undef VZERO
#undef VMOV #undef VMOV

View File

@ -23,8 +23,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_ASM_INTEL_512_QCD_H #ifndef GRID_ASM_INTEL_512_QCD_H
#define GRID_ASM_INTEL_512_QCD_H #define GRID_ASM_INTEL_512_QCD_H
@ -143,7 +143,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
VLOAD(10,%r8,Chimu_31) \ VLOAD(10,%r8,Chimu_31) \
VLOAD(11,%r8,Chimu_32) VLOAD(11,%r8,Chimu_32)
#define SHUF_CHIMU23i\ #define SHUF_CHIMU23i \
VSHUFMEM(6,%r8,Chimu_20) \ VSHUFMEM(6,%r8,Chimu_20) \
VSHUFMEM(7,%r8,Chimu_21) \ VSHUFMEM(7,%r8,Chimu_21) \
VSHUFMEM(8,%r8,Chimu_22) \ VSHUFMEM(8,%r8,Chimu_22) \
@ -250,21 +250,21 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
// hspin(0)=fspin(0)-timesI(fspin(3)) // hspin(0)=fspin(0)-timesI(fspin(3))
// hspin(1)=fspin(1)-timesI(fspin(2)) // hspin(1)=fspin(1)-timesI(fspin(2))
#define XM_PROJMEM(PTR) \ #define XM_PROJMEM(PTR) \
LOAD64(%r8,PTR)\ LOAD64(%r8,PTR) \
__asm__ ( \ __asm__ ( \
LOAD_CHIi \ LOAD_CHIi \
SHUF_CHIMU23i \ SHUF_CHIMU23i \
VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30)\ VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30) \
VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31)\ VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31) \
VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32)\ VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32) \
VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_20)\ VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_20) \
VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_21)\ VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_21) \
VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_22)\ VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_22) \
VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_30)\ VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_30) \
VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_31)\ VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_31) \
VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_32)\ VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_32) \
VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_20)\ VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_20) \
VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_21)\ VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_21) \
VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_22) ); VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_22) );
#define YM_PROJMEM(ptr) \ #define YM_PROJMEM(ptr) \
@ -283,17 +283,17 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
__asm__ ( \ __asm__ ( \
LOAD_CHIi \ LOAD_CHIi \
SHUF_CHIMU23i \ SHUF_CHIMU23i \
VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20)\ VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20) \
VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21)\ VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21) \
VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22)\ VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22) \
VACCTIMESI1(Chi_10,Chi_10,Chimu_30)\ VACCTIMESI1(Chi_10,Chi_10,Chimu_30) \
VACCTIMESI1(Chi_11,Chi_11,Chimu_31)\ VACCTIMESI1(Chi_11,Chi_11,Chimu_31) \
VACCTIMESI1(Chi_12,Chi_12,Chimu_32)\ VACCTIMESI1(Chi_12,Chi_12,Chimu_32) \
VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_20)\ VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_20) \
VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_21)\ VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_21) \
VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_22)\ VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_22) \
VACCTIMESI2(Chi_10,Chi_10,Chimu_30)\ VACCTIMESI2(Chi_10,Chi_10,Chimu_30) \
VACCTIMESI2(Chi_11,Chi_11,Chimu_31)\ VACCTIMESI2(Chi_11,Chi_11,Chimu_31) \
VACCTIMESI2(Chi_12,Chi_12,Chimu_32) ); VACCTIMESI2(Chi_12,Chi_12,Chimu_32) );
#define TM_PROJMEM(ptr) \ #define TM_PROJMEM(ptr) \
@ -338,200 +338,200 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
VTIMESMINUSI2(UChi_01,psi_31,TMP) \ VTIMESMINUSI2(UChi_01,psi_31,TMP) \
VTIMESMINUSI2(UChi_02,psi_32,TMP) \ VTIMESMINUSI2(UChi_02,psi_32,TMP) \
); );
// NB could save 6 ops using addsub => 12 cycles // NB could save 6 ops using addsub => 12 cycles
#define XP_RECON_ACCUM __asm__ ( \ #define XP_RECON_ACCUM __asm__ ( \
VZERO(TMP)\ VZERO(TMP) \
VACCTIMESMINUSI0(UChi_00,psi_30,Z3)\ VACCTIMESMINUSI0(UChi_00,psi_30,Z3) \
VACCTIMESMINUSI0(UChi_10,psi_20,Z0)\ VACCTIMESMINUSI0(UChi_10,psi_20,Z0) \
VACCTIMESMINUSI0(UChi_01,psi_31,Z4)\ VACCTIMESMINUSI0(UChi_01,psi_31,Z4) \
VACCTIMESMINUSI0(UChi_11,psi_21,Z1)\ VACCTIMESMINUSI0(UChi_11,psi_21,Z1) \
VACCTIMESMINUSI0(UChi_02,psi_32,Z5)\ VACCTIMESMINUSI0(UChi_02,psi_32,Z5) \
VACCTIMESMINUSI0(UChi_12,psi_22,Z2)\ VACCTIMESMINUSI0(UChi_12,psi_22,Z2) \
VADD(UChi_00,psi_00,psi_00)\ VADD(UChi_00,psi_00,psi_00) \
VADD(UChi_10,psi_10,psi_10)\ VADD(UChi_10,psi_10,psi_10) \
VADD(UChi_01,psi_01,psi_01)\ VADD(UChi_01,psi_01,psi_01) \
VADD(UChi_11,psi_11,psi_11)\ VADD(UChi_11,psi_11,psi_11) \
VADD(UChi_02,psi_02,psi_02)\ VADD(UChi_02,psi_02,psi_02) \
VADD(UChi_12,psi_12,psi_12)\ VADD(UChi_12,psi_12,psi_12) \
VACCTIMESMINUSI1(UChi_00,psi_30,Z3)\ VACCTIMESMINUSI1(UChi_00,psi_30,Z3) \
VACCTIMESMINUSI1(UChi_10,psi_20,Z0)\ VACCTIMESMINUSI1(UChi_10,psi_20,Z0) \
VACCTIMESMINUSI1(UChi_01,psi_31,Z4)\ VACCTIMESMINUSI1(UChi_01,psi_31,Z4) \
VACCTIMESMINUSI1(UChi_11,psi_21,Z1)\ VACCTIMESMINUSI1(UChi_11,psi_21,Z1) \
VACCTIMESMINUSI1(UChi_02,psi_32,Z5)\ VACCTIMESMINUSI1(UChi_02,psi_32,Z5) \
VACCTIMESMINUSI1(UChi_12,psi_22,Z2)\ VACCTIMESMINUSI1(UChi_12,psi_22,Z2) \
VACCTIMESMINUSI2(UChi_10,psi_20,Z0)\ VACCTIMESMINUSI2(UChi_10,psi_20,Z0) \
VACCTIMESMINUSI2(UChi_11,psi_21,Z1)\ VACCTIMESMINUSI2(UChi_11,psi_21,Z1) \
VACCTIMESMINUSI2(UChi_12,psi_22,Z2)\ VACCTIMESMINUSI2(UChi_12,psi_22,Z2) \
VACCTIMESMINUSI2(UChi_00,psi_30,Z3)\ VACCTIMESMINUSI2(UChi_00,psi_30,Z3) \
VACCTIMESMINUSI2(UChi_01,psi_31,Z4)\ VACCTIMESMINUSI2(UChi_01,psi_31,Z4) \
VACCTIMESMINUSI2(UChi_02,psi_32,Z5)\ VACCTIMESMINUSI2(UChi_02,psi_32,Z5) \
); );
#define XM_RECON __asm__ ( \ #define XM_RECON __asm__ ( \
VZERO(TMP)\ VZERO(TMP) \
VTIMESI0(UChi_00,psi_30,TMP)\ VTIMESI0(UChi_00,psi_30,TMP) \
VTIMESI0(UChi_10,psi_20,TMP)\ VTIMESI0(UChi_10,psi_20,TMP) \
VTIMESI0(UChi_01,psi_31,TMP)\ VTIMESI0(UChi_01,psi_31,TMP) \
VTIMESI0(UChi_11,psi_21,TMP)\ VTIMESI0(UChi_11,psi_21,TMP) \
VTIMESI0(UChi_02,psi_32,TMP)\ VTIMESI0(UChi_02,psi_32,TMP) \
VTIMESI0(UChi_12,psi_22,TMP)\ VTIMESI0(UChi_12,psi_22,TMP) \
VMOV(UChi_00,psi_00)\ VMOV(UChi_00,psi_00) \
VMOV(UChi_10,psi_10)\ VMOV(UChi_10,psi_10) \
VMOV(UChi_01,psi_01)\ VMOV(UChi_01,psi_01) \
VMOV(UChi_11,psi_11)\ VMOV(UChi_11,psi_11) \
VMOV(UChi_02,psi_02)\ VMOV(UChi_02,psi_02) \
VMOV(UChi_12,psi_12)\ VMOV(UChi_12,psi_12) \
VTIMESI1(UChi_00,psi_30,TMP)\ VTIMESI1(UChi_00,psi_30,TMP) \
VTIMESI1(UChi_10,psi_20,TMP)\ VTIMESI1(UChi_10,psi_20,TMP) \
VTIMESI1(UChi_01,psi_31,TMP)\ VTIMESI1(UChi_01,psi_31,TMP) \
VTIMESI1(UChi_11,psi_21,TMP)\ VTIMESI1(UChi_11,psi_21,TMP) \
VTIMESI1(UChi_02,psi_32,TMP)\ VTIMESI1(UChi_02,psi_32,TMP) \
VTIMESI1(UChi_12,psi_22,TMP)\ VTIMESI1(UChi_12,psi_22,TMP) \
VTIMESI2(UChi_10,psi_20,TMP)\ VTIMESI2(UChi_10,psi_20,TMP) \
VTIMESI2(UChi_11,psi_21,TMP)\ VTIMESI2(UChi_11,psi_21,TMP) \
VTIMESI2(UChi_12,psi_22,TMP)\ VTIMESI2(UChi_12,psi_22,TMP) \
VTIMESI2(UChi_00,psi_30,TMP)\ VTIMESI2(UChi_00,psi_30,TMP) \
VTIMESI2(UChi_01,psi_31,TMP)\ VTIMESI2(UChi_01,psi_31,TMP) \
VTIMESI2(UChi_02,psi_32,TMP)\ VTIMESI2(UChi_02,psi_32,TMP) \
); );
#define XM_RECON_ACCUM __asm__ ( \ #define XM_RECON_ACCUM __asm__ ( \
VACCTIMESI0(UChi_10,psi_20,Z0)\ VACCTIMESI0(UChi_10,psi_20,Z0) \
VACCTIMESI0(UChi_00,psi_30,Z3)\ VACCTIMESI0(UChi_00,psi_30,Z3) \
VACCTIMESI0(UChi_11,psi_21,Z1)\ VACCTIMESI0(UChi_11,psi_21,Z1) \
VACCTIMESI0(UChi_01,psi_31,Z4)\ VACCTIMESI0(UChi_01,psi_31,Z4) \
VACCTIMESI0(UChi_12,psi_22,Z2)\ VACCTIMESI0(UChi_12,psi_22,Z2) \
VACCTIMESI0(UChi_02,psi_32,Z5)\ VACCTIMESI0(UChi_02,psi_32,Z5) \
\ \
VADD(UChi_10,psi_10,psi_10)\ VADD(UChi_10,psi_10,psi_10) \
VADD(UChi_00,psi_00,psi_00)\ VADD(UChi_00,psi_00,psi_00) \
VADD(UChi_11,psi_11,psi_11)\ VADD(UChi_11,psi_11,psi_11) \
VADD(UChi_01,psi_01,psi_01)\ VADD(UChi_01,psi_01,psi_01) \
VADD(UChi_12,psi_12,psi_12)\ VADD(UChi_12,psi_12,psi_12) \
VADD(UChi_02,psi_02,psi_02)\ VADD(UChi_02,psi_02,psi_02) \
\ \
VACCTIMESI1(UChi_10,psi_20,Z0)\ VACCTIMESI1(UChi_10,psi_20,Z0) \
VACCTIMESI1(UChi_00,psi_30,Z3)\ VACCTIMESI1(UChi_00,psi_30,Z3) \
VACCTIMESI1(UChi_11,psi_21,Z1)\ VACCTIMESI1(UChi_11,psi_21,Z1) \
VACCTIMESI1(UChi_01,psi_31,Z4)\ VACCTIMESI1(UChi_01,psi_31,Z4) \
VACCTIMESI1(UChi_12,psi_22,Z2)\ VACCTIMESI1(UChi_12,psi_22,Z2) \
VACCTIMESI1(UChi_02,psi_32,Z5)\ VACCTIMESI1(UChi_02,psi_32,Z5) \
VACCTIMESI2(UChi_10,psi_20,Z0)\ VACCTIMESI2(UChi_10,psi_20,Z0) \
VACCTIMESI2(UChi_11,psi_21,Z1)\ VACCTIMESI2(UChi_11,psi_21,Z1) \
VACCTIMESI2(UChi_12,psi_22,Z2)\ VACCTIMESI2(UChi_12,psi_22,Z2) \
VACCTIMESI2(UChi_00,psi_30,Z3)\ VACCTIMESI2(UChi_00,psi_30,Z3) \
VACCTIMESI2(UChi_01,psi_31,Z4)\ VACCTIMESI2(UChi_01,psi_31,Z4) \
VACCTIMESI2(UChi_02,psi_32,Z5)\ VACCTIMESI2(UChi_02,psi_32,Z5) \
); );
#define YP_RECON_ACCUM __asm__ ( \ #define YP_RECON_ACCUM __asm__ ( \
VADD(UChi_00,psi_00,psi_00)\ VADD(UChi_00,psi_00,psi_00) \
VADD(UChi_10,psi_10,psi_10)\ VADD(UChi_10,psi_10,psi_10) \
VADD(UChi_01,psi_01,psi_01)\ VADD(UChi_01,psi_01,psi_01) \
VADD(UChi_11,psi_11,psi_11)\ VADD(UChi_11,psi_11,psi_11) \
VADD(UChi_02,psi_02,psi_02)\ VADD(UChi_02,psi_02,psi_02) \
VADD(UChi_12,psi_12,psi_12)\ VADD(UChi_12,psi_12,psi_12) \
VADD(UChi_10,psi_20,psi_20)\ VADD(UChi_10,psi_20,psi_20) \
VADD(UChi_11,psi_21,psi_21)\ VADD(UChi_11,psi_21,psi_21) \
VADD(UChi_12,psi_22,psi_22)\ VADD(UChi_12,psi_22,psi_22) \
VSUB(UChi_00,psi_30,psi_30)\ VSUB(UChi_00,psi_30,psi_30) \
VSUB(UChi_01,psi_31,psi_31)\ VSUB(UChi_01,psi_31,psi_31) \
VSUB(UChi_02,psi_32,psi_32) ); VSUB(UChi_02,psi_32,psi_32) );
#define YM_RECON_ACCUM __asm__ ( \ #define YM_RECON_ACCUM __asm__ ( \
VADD(UChi_00,psi_00,psi_00)\ VADD(UChi_00,psi_00,psi_00) \
VADD(UChi_10,psi_10,psi_10)\ VADD(UChi_10,psi_10,psi_10) \
VADD(UChi_01,psi_01,psi_01)\ VADD(UChi_01,psi_01,psi_01) \
VADD(UChi_11,psi_11,psi_11)\ VADD(UChi_11,psi_11,psi_11) \
VADD(UChi_02,psi_02,psi_02)\ VADD(UChi_02,psi_02,psi_02) \
VADD(UChi_12,psi_12,psi_12)\ VADD(UChi_12,psi_12,psi_12) \
VSUB(UChi_10,psi_20,psi_20)\ VSUB(UChi_10,psi_20,psi_20) \
VSUB(UChi_11,psi_21,psi_21)\ VSUB(UChi_11,psi_21,psi_21) \
VSUB(UChi_12,psi_22,psi_22)\ VSUB(UChi_12,psi_22,psi_22) \
VADD(UChi_00,psi_30,psi_30)\ VADD(UChi_00,psi_30,psi_30) \
VADD(UChi_01,psi_31,psi_31)\ VADD(UChi_01,psi_31,psi_31) \
VADD(UChi_02,psi_32,psi_32) ); VADD(UChi_02,psi_32,psi_32) );
#define ZP_RECON_ACCUM __asm__ ( \ #define ZP_RECON_ACCUM __asm__ ( \
VACCTIMESMINUSI0(UChi_00,psi_20,Z0)\ VACCTIMESMINUSI0(UChi_00,psi_20,Z0) \
VACCTIMESI0(UChi_10,psi_30,Z3)\ VACCTIMESI0(UChi_10,psi_30,Z3) \
VACCTIMESMINUSI0(UChi_01,psi_21,Z1)\ VACCTIMESMINUSI0(UChi_01,psi_21,Z1) \
VACCTIMESI0(UChi_11,psi_31,Z4)\ VACCTIMESI0(UChi_11,psi_31,Z4) \
VACCTIMESMINUSI0(UChi_02,psi_22,Z2)\ VACCTIMESMINUSI0(UChi_02,psi_22,Z2) \
VACCTIMESI0(UChi_12,psi_32,Z5)\ VACCTIMESI0(UChi_12,psi_32,Z5) \
VADD(UChi_00,psi_00,psi_00)\ VADD(UChi_00,psi_00,psi_00) \
VADD(UChi_10,psi_10,psi_10)\ VADD(UChi_10,psi_10,psi_10) \
VADD(UChi_01,psi_01,psi_01)\ VADD(UChi_01,psi_01,psi_01) \
VADD(UChi_11,psi_11,psi_11)\ VADD(UChi_11,psi_11,psi_11) \
VADD(UChi_02,psi_02,psi_02)\ VADD(UChi_02,psi_02,psi_02) \
VADD(UChi_12,psi_12,psi_12)\ VADD(UChi_12,psi_12,psi_12) \
VACCTIMESMINUSI1(UChi_00,psi_20,Z0)\ VACCTIMESMINUSI1(UChi_00,psi_20,Z0) \
VACCTIMESI1(UChi_10,psi_30,Z3)\ VACCTIMESI1(UChi_10,psi_30,Z3) \
VACCTIMESMINUSI1(UChi_01,psi_21,Z1)\ VACCTIMESMINUSI1(UChi_01,psi_21,Z1) \
VACCTIMESI1(UChi_11,psi_31,Z4)\ VACCTIMESI1(UChi_11,psi_31,Z4) \
VACCTIMESMINUSI1(UChi_02,psi_22,Z2)\ VACCTIMESMINUSI1(UChi_02,psi_22,Z2) \
VACCTIMESI1(UChi_12,psi_32,Z5)\ VACCTIMESI1(UChi_12,psi_32,Z5) \
VACCTIMESMINUSI2(UChi_00,psi_20,Z0)\ VACCTIMESMINUSI2(UChi_00,psi_20,Z0) \
VACCTIMESMINUSI2(UChi_01,psi_21,Z1)\ VACCTIMESMINUSI2(UChi_01,psi_21,Z1) \
VACCTIMESMINUSI2(UChi_02,psi_22,Z2)\ VACCTIMESMINUSI2(UChi_02,psi_22,Z2) \
VACCTIMESI2(UChi_10,psi_30,Z3)\ VACCTIMESI2(UChi_10,psi_30,Z3) \
VACCTIMESI2(UChi_11,psi_31,Z4)\ VACCTIMESI2(UChi_11,psi_31,Z4) \
VACCTIMESI2(UChi_12,psi_32,Z5)\ VACCTIMESI2(UChi_12,psi_32,Z5) \
); );
#define ZM_RECON_ACCUM __asm__ ( \ #define ZM_RECON_ACCUM __asm__ ( \
VACCTIMESI0(UChi_00,psi_20,Z0)\ VACCTIMESI0(UChi_00,psi_20,Z0) \
VACCTIMESMINUSI0(UChi_10,psi_30,Z3)\ VACCTIMESMINUSI0(UChi_10,psi_30,Z3) \
VACCTIMESI0(UChi_01,psi_21,Z1)\ VACCTIMESI0(UChi_01,psi_21,Z1) \
VACCTIMESMINUSI0(UChi_11,psi_31,Z4)\ VACCTIMESMINUSI0(UChi_11,psi_31,Z4) \
VACCTIMESI0(UChi_02,psi_22,Z2)\ VACCTIMESI0(UChi_02,psi_22,Z2) \
VACCTIMESMINUSI0(UChi_12,psi_32,Z5)\ VACCTIMESMINUSI0(UChi_12,psi_32,Z5) \
VADD(UChi_00,psi_00,psi_00)\ VADD(UChi_00,psi_00,psi_00) \
VADD(UChi_10,psi_10,psi_10)\ VADD(UChi_10,psi_10,psi_10) \
VADD(UChi_01,psi_01,psi_01)\ VADD(UChi_01,psi_01,psi_01) \
VADD(UChi_11,psi_11,psi_11)\ VADD(UChi_11,psi_11,psi_11) \
VADD(UChi_02,psi_02,psi_02)\ VADD(UChi_02,psi_02,psi_02) \
VADD(UChi_12,psi_12,psi_12)\ VADD(UChi_12,psi_12,psi_12) \
VACCTIMESI1(UChi_00,psi_20,Z0)\ VACCTIMESI1(UChi_00,psi_20,Z0) \
VACCTIMESMINUSI1(UChi_10,psi_30,Z3)\ VACCTIMESMINUSI1(UChi_10,psi_30,Z3) \
VACCTIMESI1(UChi_01,psi_21,Z1)\ VACCTIMESI1(UChi_01,psi_21,Z1) \
VACCTIMESMINUSI1(UChi_11,psi_31,Z4)\ VACCTIMESMINUSI1(UChi_11,psi_31,Z4) \
VACCTIMESI1(UChi_02,psi_22,Z2)\ VACCTIMESI1(UChi_02,psi_22,Z2) \
VACCTIMESMINUSI1(UChi_12,psi_32,Z5)\ VACCTIMESMINUSI1(UChi_12,psi_32,Z5) \
VACCTIMESI2(UChi_00,psi_20,Z0)\ VACCTIMESI2(UChi_00,psi_20,Z0) \
VACCTIMESI2(UChi_01,psi_21,Z1)\ VACCTIMESI2(UChi_01,psi_21,Z1) \
VACCTIMESI2(UChi_02,psi_22,Z2)\ VACCTIMESI2(UChi_02,psi_22,Z2) \
VACCTIMESMINUSI2(UChi_10,psi_30,Z3)\ VACCTIMESMINUSI2(UChi_10,psi_30,Z3) \
VACCTIMESMINUSI2(UChi_11,psi_31,Z4)\ VACCTIMESMINUSI2(UChi_11,psi_31,Z4) \
VACCTIMESMINUSI2(UChi_12,psi_32,Z5)\ VACCTIMESMINUSI2(UChi_12,psi_32,Z5) \
); );
#define TP_RECON_ACCUM __asm__ ( \ #define TP_RECON_ACCUM __asm__ ( \
VADD(UChi_00,psi_00,psi_00)\ VADD(UChi_00,psi_00,psi_00) \
VADD(UChi_10,psi_10,psi_10)\ VADD(UChi_10,psi_10,psi_10) \
VADD(UChi_01,psi_01,psi_01)\ VADD(UChi_01,psi_01,psi_01) \
VADD(UChi_11,psi_11,psi_11)\ VADD(UChi_11,psi_11,psi_11) \
VADD(UChi_02,psi_02,psi_02)\ VADD(UChi_02,psi_02,psi_02) \
VADD(UChi_12,psi_12,psi_12)\ VADD(UChi_12,psi_12,psi_12) \
VADD(UChi_00,psi_20,psi_20)\ VADD(UChi_00,psi_20,psi_20) \
VADD(UChi_10,psi_30,psi_30)\ VADD(UChi_10,psi_30,psi_30) \
VADD(UChi_01,psi_21,psi_21)\ VADD(UChi_01,psi_21,psi_21) \
VADD(UChi_11,psi_31,psi_31)\ VADD(UChi_11,psi_31,psi_31) \
VADD(UChi_02,psi_22,psi_22)\ VADD(UChi_02,psi_22,psi_22) \
VADD(UChi_12,psi_32,psi_32) ); VADD(UChi_12,psi_32,psi_32) );
#define TM_RECON_ACCUM __asm__ ( \ #define TM_RECON_ACCUM __asm__ ( \
VADD(UChi_00,psi_00,psi_00)\ VADD(UChi_00,psi_00,psi_00) \
VADD(UChi_10,psi_10,psi_10)\ VADD(UChi_10,psi_10,psi_10) \
VADD(UChi_01,psi_01,psi_01)\ VADD(UChi_01,psi_01,psi_01) \
VADD(UChi_11,psi_11,psi_11)\ VADD(UChi_11,psi_11,psi_11) \
VADD(UChi_02,psi_02,psi_02)\ VADD(UChi_02,psi_02,psi_02) \
VADD(UChi_12,psi_12,psi_12)\ VADD(UChi_12,psi_12,psi_12) \
VSUB(UChi_00,psi_20,psi_20)\ VSUB(UChi_00,psi_20,psi_20) \
VSUB(UChi_10,psi_30,psi_30)\ VSUB(UChi_10,psi_30,psi_30) \
VSUB(UChi_01,psi_21,psi_21)\ VSUB(UChi_01,psi_21,psi_21) \
VSUB(UChi_11,psi_31,psi_31)\ VSUB(UChi_11,psi_31,psi_31) \
VSUB(UChi_02,psi_22,psi_22)\ VSUB(UChi_02,psi_22,psi_22) \
VSUB(UChi_12,psi_32,psi_32) ); VSUB(UChi_12,psi_32,psi_32) );
#define AVX512_PF_L1 #define AVX512_PF_L1