From 4bc08ed9956b2a461ec227b4144dc93c2b134968 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 26 Jun 2016 12:54:14 -0700 Subject: [PATCH] Improved the prefetching when using cache blocking codes --- lib/Stencil.h | 5 +- lib/qcd/action/fermion/WilsonKernelsAsmBody.h | 86 ++++--- .../action/fermion/WilsonKernelsAsmBody.h.ab | 18 +- lib/simd/Intel512common.h | 24 +- lib/simd/Intel512wilson.h | 237 +++++++++++------- 5 files changed, 208 insertions(+), 162 deletions(-) diff --git a/lib/Stencil.h b/lib/Stencil.h index 8019e3f9..bc015370 100644 --- a/lib/Stencil.h +++ b/lib/Stencil.h @@ -1,4 +1,4 @@ - /************************************************************************************* + /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -261,6 +261,9 @@ } }; + inline uint64_t Touch(int ent) { + // _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0); + } inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { _mm_prefetch((char *)&_entries[ent+1],_MM_HINT_T0); local = _entries[ent]._is_local; diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h index d50999f6..7373d2eb 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h @@ -1,7 +1,9 @@ { int locala,perma, ptypea; int localb,permb, ptypeb; - uint64_t basea, baseb; + int localc,permc, ptypec; + uint64_t basea, baseb, basec; + const uint64_t plocal =(uint64_t) & in._odata[0]; // vComplexF isigns[2] = { signs[0], signs[1] }; @@ -10,15 +12,22 @@ MASK_REGS; for(int site=0;site shuffle and xor the real part sign bit YM_PROJMEM(baseb); @@ -45,7 +55,7 @@ LOAD_CHI(baseb); } { - MULT_2SPIN_DIR_PFYP(Yp,basea); + MULT_2SPIN_DIR_PFYP(Yp,basec); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit YM_RECON_ACCUM; @@ -53,16 +63,17 @@ //////////////////////////////// // Zp //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Tp,ent,plocal); ent++; - if ( locala ) { + baseb = st.GetInfo(ptypeb,localb,permb,Xm,ent,plocal); ent++; + PREFETCH_CHIMU(baseb); + if ( localc ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - ZM_PROJMEM(basea); - MAYBEPERM(PERMUTE_DIR1,perma); + ZM_PROJMEM(basec); + MAYBEPERM(PERMUTE_DIR1,permc); } else { - LOAD_CHI(basea); + LOAD_CHI(basec); } { - MULT_2SPIN_DIR_PFZP(Zp,baseb); + MULT_2SPIN_DIR_PFZP(Zp,basea); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit ZM_RECON_ACCUM; @@ -70,16 +81,17 @@ //////////////////////////////// // Tp //////////////////////////////// - basea = st.GetInfo(ptypea,locala,perma,Xm,ent,plocal); ent++; - if ( localb ) { + basec = st.GetInfo(ptypec,localc,permc,Ym,ent,plocal); ent++; + PREFETCH_CHIMU(basec); + if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - TM_PROJMEM(baseb); - MAYBEPERM(PERMUTE_DIR0,permb); + TM_PROJMEM(basea); + MAYBEPERM(PERMUTE_DIR0,perma); } else { - LOAD_CHI(baseb); + LOAD_CHI(basea); } { - MULT_2SPIN_DIR_PFTP(Tp,basea); + MULT_2SPIN_DIR_PFTP(Tp,baseb); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TM_RECON_ACCUM; @@ -87,16 +99,17 @@ //////////////////////////////// // Xm //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Ym,ent,plocal); ent++; - if ( locala ) { + basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++; + PREFETCH_CHIMU(basea); + if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - XP_PROJMEM(basea); - MAYBEPERM(PERMUTE_DIR3,perma); + XP_PROJMEM(baseb); + MAYBEPERM(PERMUTE_DIR3,permb); } else { - LOAD_CHI(basea); + LOAD_CHI(baseb); } { - MULT_2SPIN_DIR_PFXM(Xm,baseb); + MULT_2SPIN_DIR_PFXM(Xm,basec); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit XP_RECON_ACCUM; @@ -104,13 +117,14 @@ //////////////////////////////// // Ym //////////////////////////////// - basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++; - if ( localb ) { + baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++; + PREFETCH_CHIMU(baseb); + if ( localc ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - YP_PROJMEM(baseb); - MAYBEPERM(PERMUTE_DIR2,permb); + YP_PROJMEM(basec); + MAYBEPERM(PERMUTE_DIR2,permc); } else { - LOAD_CHI(baseb); + LOAD_CHI(basec); } { MULT_2SPIN_DIR_PFYM(Ym,basea); @@ -121,7 +135,8 @@ //////////////////////////////// // Zm //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++; + basec = (uint64_t)&out._odata[ss]; + PREFETCH_CHIMU(basec); if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit ZP_PROJMEM(basea); @@ -138,7 +153,8 @@ //////////////////////////////// // Tm //////////////////////////////// - basea = (uint64_t)&out._odata[ss]; + // basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; + // PREFETCH_CHIMU(basea); if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TP_PROJMEM(baseb); @@ -146,16 +162,16 @@ } else { LOAD_CHI(baseb); } - baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); { - MULT_2SPIN_DIR_PFTM(Tm,basea); + MULT_2SPIN_DIR_PFTM(Tm,basec); } + // baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TP_RECON_ACCUM; - SAVE_RESULT(&out._odata[ss],baseb); - - } + SAVE_RESULT(&out._odata[ss],basec); + + } ssU++; } } diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab index 3ba9eec6..d50999f6 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab @@ -2,7 +2,6 @@ int locala,perma, ptypea; int localb,permb, ptypeb; uint64_t basea, baseb; - uint64_t basex; const uint64_t plocal =(uint64_t) & in._odata[0]; // vComplexF isigns[2] = { signs[0], signs[1] }; @@ -19,9 +18,7 @@ //////////////////////////////// int ent=ss*8;// 2*Ndim basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; - PREFETCH_CHIMU(basea); baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; - basex = basea; if ( locala ) { LOAD64(%r10,isigns); @@ -39,7 +36,7 @@ //////////////////////////////// // Yp //////////////////////////////// - basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; + basea = st.GetInfo(ptypea,locala,perma,Zp,ent,plocal); ent++; if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit YM_PROJMEM(baseb); @@ -56,7 +53,7 @@ //////////////////////////////// // Zp //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; + baseb = st.GetInfo(ptypeb,localb,permb,Tp,ent,plocal); ent++; if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit ZM_PROJMEM(basea); @@ -73,7 +70,7 @@ //////////////////////////////// // Tp //////////////////////////////// - basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; + basea = st.GetInfo(ptypea,locala,perma,Xm,ent,plocal); ent++; if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TM_PROJMEM(baseb); @@ -90,7 +87,7 @@ //////////////////////////////// // Xm //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; + baseb = st.GetInfo(ptypeb,localb,permb,Ym,ent,plocal); ent++; if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit XP_PROJMEM(basea); @@ -107,7 +104,7 @@ //////////////////////////////// // Ym //////////////////////////////// - basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; + basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++; if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit YP_PROJMEM(baseb); @@ -124,7 +121,7 @@ //////////////////////////////// // Zm //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; + baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++; if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit ZP_PROJMEM(basea); @@ -149,13 +146,14 @@ } else { LOAD_CHI(baseb); } + baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); { MULT_2SPIN_DIR_PFTM(Tm,basea); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TP_RECON_ACCUM; - SAVE_RESULT(&out._odata[ss]); + SAVE_RESULT(&out._odata[ss],baseb); } ssU++; diff --git a/lib/simd/Intel512common.h b/lib/simd/Intel512common.h index a05f978c..dabbf6d8 100644 --- a/lib/simd/Intel512common.h +++ b/lib/simd/Intel512common.h @@ -31,9 +31,6 @@ Author: paboyle //////////////////////////////////////////////////////////////////////////////////////////////////// // Peformance options //////////////////////////////////////////////////////////////////////////////////////////////////// -#define AVX512_PF_L1 -#undef AVX512_PF_L2_LINEAR -#undef AVX512_PF_L2_TABLE #undef AVX512_PF_L2_WRITE //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -45,7 +42,7 @@ Author: paboyle "mov $0x5555, %%eax \n"\ "kmovw %%eax, %%k7 \n" : : : "%eax"); -//#define label(B) __asm__ ( __func__ __LINE__ #B ":\n" ); +//#define label(B) __asm__ ( __func__ _LINE__ #B ":\n" ); #define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n" #define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n" @@ -96,30 +93,13 @@ Author: paboyle #define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n" #define VMOVd(A,DEST) "vmovapd " #A ", " #DEST ";\n" -#ifdef AVX512_PF_L1 -#define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n" -#else -#define VPREFETCHG(O,A) -#endif - -#ifdef AVX512_PF_L2_LINEAR +#define VPREFETCH1(O,A) "prefetcht0 "#O"*64("#A");\n" #define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n" -#else -#define VPREFETCH2(O,A) -#endif - -#ifdef AVX512_PF_L2_TABLE -#define VPREFETCHP(O,A) "prefetcht1 "#O"*64("#A");\n" -#else -#define VPREFETCHP(O,A) -#endif - #ifdef AVX512_PF_L2_WRITE #define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n" #else #define VPREFETCHW(O,A) #endif - #define VPREFETCHNTA(O,A) #define VPREFETCH(O,A) diff --git a/lib/simd/Intel512wilson.h b/lib/simd/Intel512wilson.h index 207d9db8..9deffd80 100644 --- a/lib/simd/Intel512wilson.h +++ b/lib/simd/Intel512wilson.h @@ -169,23 +169,6 @@ Author: paboyle VSTORE(5,%r8,Chi_12) \ ); -#define SAVE_RESULTi(PTR,pf) \ - LOAD64(%r8,PTR) \ - LOAD64(%r9,pf) \ - __asm__ ( \ - VSTORE(0,%r8,result_00) VPREFETCHG(0,%r9) \ - VSTORE(1,%r8,result_01) VPREFETCHG(1,%r9) \ - VSTORE(2,%r8,result_02) VPREFETCHG(2,%r9) \ - VSTORE(3,%r8,result_10) VPREFETCHG(3,%r9) \ - VSTORE(4,%r8,result_11) VPREFETCHG(4,%r9) \ - VSTORE(5,%r8,result_12) VPREFETCHG(5,%r9) \ - VSTORE(6,%r8,result_20) VPREFETCHG(6,%r9) \ - VSTORE(7,%r8,result_21) VPREFETCHG(7,%r9) \ - VSTORE(8,%r8,result_22) VPREFETCHG(8,%r9) \ - VSTORE(9,%r8,result_30) VPREFETCHG(9,%r9) \ - VSTORE(10,%r8,result_31) VPREFETCHG(10,%r9) \ - VSTORE(11,%r8,result_32) VPREFETCHG(11,%r9) \ - ); #define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p) #define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN_PFYP(&U._odata[sU](A),p) @@ -560,24 +543,89 @@ Author: paboyle VSUB(UChi_02,result_22,result_22)\ VSUB(UChi_12,result_32,result_32) ); -#define PREFETCH_CHIMU(A) -/* - LOAD64(%r9,A) \ - __asm__ ( \ - VPREFETCHG(0,%r9)\ - VPREFETCHG(1,%r9)\ - VPREFETCHG(2,%r9)\ - VPREFETCHG(3,%r9)\ - VPREFETCHG(4,%r9)\ - VPREFETCHG(5,%r9)\ - VPREFETCHG(6,%r9)\ - VPREFETCHG(7,%r9)\ - VPREFETCHG(8,%r9)\ - VPREFETCHG(9,%r9)\ - VPREFETCHG(10,%r9)\ - VPREFETCHG(11,%r9)); -*/ -#define PERMUTE_DIR0 __asm__ ( \ +#define AVX512_PF_L1 +#define AVX512_PF_L2_GAUGE +#define AVX512_PF_L2_TABLE +#undef AVX512_PF_L2_LINEAR + +#ifdef AVX512_PF_L2_TABLE +#define VPREFETCH_P1(A,B) VPREFETCH1(A,B) +#define VPREFETCH_P2(A,B) VPREFETCH1(A,B) +#else +#define VPREFETCH_P1(A,B) +#define VPREFETCH_P2(A,B) +#endif +#ifdef AVX512_PF_L2_LINEAR +#define VPREFETCH_M1(A,B) +#define VPREFETCH_M2(A,B) +#else +#define VPREFETCH_M1(A,B) VPREFETCH1(A,B) +#define VPREFETCH_M2(A,B) VPREFETCH2(A,B) +#endif +#ifdef AVX512_PF_L2_GAUGE +#define VPREFETCH_G1(A,B) VPREFETCH1(A,B) +#define VPREFETCH_G2(A,B) VPREFETCH2(A,B) +#else +#endif + +#define PF_GAUGE(A) \ + LOAD64(%r8,&U._odata[sU](A)) \ + __asm__ ( \ + VPREFETCH_G1(0,%r8) VPREFETCH_G1(1,%r8) \ + VPREFETCH_G1(2,%r8) VPREFETCH_G1(3,%r8) \ + ); + +#define SAVE_RESULTi(PTR,pf) \ + LOAD64(%r8,PTR) \ + LOAD64(%r9,pf) \ + __asm__ ( \ + VSTORE(0,%r8,result_00) VPREFETCH_M1(0,%r9) \ + VSTORE(1,%r8,result_01) VPREFETCH_M1(1,%r9) \ + VSTORE(2,%r8,result_02) VPREFETCH_M1(2,%r9) \ + VSTORE(3,%r8,result_10) VPREFETCH_M1(3,%r9) \ + VSTORE(4,%r8,result_11) VPREFETCH_M1(4,%r9) \ + VSTORE(5,%r8,result_12) VPREFETCH_M1(5,%r9) \ + VSTORE(6,%r8,result_20) VPREFETCH_M1(6,%r9) \ + VSTORE(7,%r8,result_21) VPREFETCH_M1(7,%r9) \ + VSTORE(8,%r8,result_22) VPREFETCH_M1(8,%r9) \ + VSTORE(9,%r8,result_30) VPREFETCH_M1(9,%r9) \ + VSTORE(10,%r8,result_31) VPREFETCH_M1(10,%r9) \ + VSTORE(11,%r8,result_32) VPREFETCH_M1(11,%r9) \ + ); + +#define PREFETCH_CHIMU(A) \ + LOAD64(%r9,A) \ + __asm__ ( \ + VPREFETCH_P2(0,%r9) \ + VPREFETCH_P2(1,%r9) \ + VPREFETCH_P2(2,%r9) \ + VPREFETCH_P2(3,%r9) \ + VPREFETCH_P2(4,%r9) \ + VPREFETCH_P2(5,%r9) \ + VPREFETCH_P2(6,%r9) \ + VPREFETCH_P2(7,%r9) \ + VPREFETCH_P2(8,%r9) \ + VPREFETCH_P2(9,%r9) \ + VPREFETCH_P2(10,%r9) \ + VPREFETCH_P2(11,%r9)); + +#define PREFETCH1_CHIMU(A) \ + LOAD64(%r9,A) \ + __asm__ ( \ + VPREFETCH_P1(0,%r9) \ + VPREFETCH_P1(1,%r9) \ + VPREFETCH_P1(2,%r9) \ + VPREFETCH_P1(3,%r9) \ + VPREFETCH_P1(4,%r9) \ + VPREFETCH_P1(5,%r9) \ + VPREFETCH_P1(6,%r9) \ + VPREFETCH_P1(7,%r9) \ + VPREFETCH_P1(8,%r9) \ + VPREFETCH_P1(9,%r9) \ + VPREFETCH_P1(10,%r9) \ + VPREFETCH_P1(11,%r9)); + +#define PERMUTE_DIR0 __asm__ ( \ VPERM0(Chi_00,Chi_00) \ VPERM0(Chi_01,Chi_01) \ VPERM0(Chi_02,Chi_02) \ @@ -614,14 +662,15 @@ Author: paboyle LOAD64(%r8,ptr) \ LOAD64(%r9,pf) \ __asm__ ( \ - VPREFETCH2(9,%r8) VPREFETCH2(10,%r8) \ - VPREFETCH2(11,%r8) \ - VPREFETCH2(12,%r8) \ - VPREFETCH2(13,%r8) \ - VPREFETCH2(14,%r8) \ - VPREFETCH2(15,%r8) \ - VPREFETCH2(16,%r8) \ - VPREFETCH2(17,%r8) \ + VPREFETCH_G2(9,%r8) \ + VPREFETCH_G2(10,%r8) \ + VPREFETCH_G2(11,%r8) \ + VPREFETCH_G2(12,%r8) \ + VPREFETCH_G2(13,%r8) \ + VPREFETCH_G2(14,%r8) \ + VPREFETCH_G2(15,%r8) \ + VPREFETCH_G2(16,%r8) \ + VPREFETCH_G2(17,%r8) \ VSHUF(Chi_00,T1) \ VMOVIDUP(0,%r8,Z0 ) \ VMOVIDUP(3,%r8,Z1 ) \ @@ -633,10 +682,10 @@ Author: paboyle VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0 ) \ VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1 ) \ VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2 ) \ - VPREFETCHG(0,%r9) \ - VPREFETCHG(1,%r9) \ - VPREFETCHG(2,%r9) \ - VPREFETCHG(3,%r9) \ + VPREFETCH_M1(0,%r9) \ + VPREFETCH_M1(1,%r9) \ + VPREFETCH_M1(2,%r9) \ + VPREFETCH_M1(3,%r9) \ /*18*/ \ VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \ VMADDSUB(Z3,Chi_10,UChi_10) \ @@ -644,10 +693,10 @@ Author: paboyle VMADDSUB(Z4,Chi_10,UChi_11) VSHUF(Chi_11,T2) \ VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 ) \ VMADDSUB(Z5,Chi_10,UChi_12) \ - VPREFETCHG(4,%r9) \ - VPREFETCHG(5,%r9) \ - VPREFETCHG(6,%r9) \ - VPREFETCHG(7,%r9) \ + VPREFETCH_M1(4,%r9) \ + VPREFETCH_M1(5,%r9) \ + VPREFETCH_M1(6,%r9) \ + VPREFETCH_M1(7,%r9) \ /*28*/ \ VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 ) \ VMADDSUB(Z0,T2,UChi_10) \ @@ -674,15 +723,15 @@ Author: paboyle VMADDSUB(Z4,Chi_11,UChi_11) VSHUF(Chi_12,T2) \ VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \ VMADDSUB(Z5,Chi_11,UChi_12) \ - VPREFETCHG(9,%r8) \ - VPREFETCHG(10,%r8) \ - VPREFETCHG(11,%r8) \ - VPREFETCHG(12,%r8) \ - VPREFETCHG(13,%r8) \ - VPREFETCHG(14,%r8) \ - VPREFETCHG(15,%r8) \ - VPREFETCHG(16,%r8) \ - VPREFETCHG(17,%r8) \ + VPREFETCH_M1(9,%r8) \ + VPREFETCH_M1(10,%r8) \ + VPREFETCH_M1(11,%r8) \ + VPREFETCH_M1(12,%r8) \ + VPREFETCH_M1(13,%r8) \ + VPREFETCH_M1(14,%r8) \ + VPREFETCH_M1(15,%r8) \ + VPREFETCH_M1(16,%r8) \ + VPREFETCH_M1(17,%r8) \ /*48*/ \ VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \ VMADDSUB(Z0,T2,UChi_10) \ @@ -690,10 +739,10 @@ Author: paboyle VMADDSUB(Z1,T2,UChi_11) \ VMADDSUB(Z2,T1,UChi_02) \ VMADDSUB(Z2,T2,UChi_12) \ - VPREFETCHG(8,%r9) \ - VPREFETCHG(9,%r9) \ - VPREFETCHG(10,%r9) \ - VPREFETCHG(11,%r9) \ + VPREFETCH_M1(8,%r9) \ + VPREFETCH_M1(9,%r9) \ + VPREFETCH_M1(10,%r9) \ + VPREFETCH_M1(11,%r9) \ /*55*/ \ VMADDSUB(Z3,Chi_02,UChi_00) \ VMADDSUB(Z3,Chi_12,UChi_10) \ @@ -712,56 +761,56 @@ Author: paboyle VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \ VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \ VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \ - VPREFETCHG(0,%r9) \ - VPREFETCHG(1,%r9) \ - VPREFETCHG(2,%r9) \ - VPREFETCHG(3,%r9) \ + VPREFETCH_M1(0,%r9) \ + VPREFETCH_M1(1,%r9) \ + VPREFETCH_M1(2,%r9) \ + VPREFETCH_M1(3,%r9) \ /*8*/ \ VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \ VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \ VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \ VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \ - VPREFETCHG(4,%r9) \ - VPREFETCHG(5,%r9) \ - VPREFETCHG(6,%r9) \ - VPREFETCHG(7,%r9) \ + VPREFETCH_M1(4,%r9) \ + VPREFETCH_M1(5,%r9) \ + VPREFETCH_M1(6,%r9) \ + VPREFETCH_M1(7,%r9) \ /*16*/ \ VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \ VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \ VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \ - VPREFETCHG(8,%r9) \ - VPREFETCHG(9,%r9) \ - VPREFETCHG(10,%r9) \ - VPREFETCHG(11,%r9) \ + VPREFETCH_M1(8,%r9) \ + VPREFETCH_M1(9,%r9) \ + VPREFETCH_M1(10,%r9) \ + VPREFETCH_M1(11,%r9) \ /*22*/ \ VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \ VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \ VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \ VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \ - VPREFETCH2(12,%r9) \ - VPREFETCH2(13,%r9) \ - VPREFETCH2(14,%r9) \ - VPREFETCH2(15,%r9) \ + VPREFETCH_M2(12,%r9) \ + VPREFETCH_M2(13,%r9) \ + VPREFETCH_M2(14,%r9) \ + VPREFETCH_M2(15,%r9) \ /*30*/ \ VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \ VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \ - VPREFETCH2(16,%r9) \ - VPREFETCH2(17,%r9) \ - VPREFETCH2(18,%r9) \ - VPREFETCH2(19,%r9) \ + VPREFETCH_M2(16,%r9) \ + VPREFETCH_M2(17,%r9) \ + VPREFETCH_M2(18,%r9) \ + VPREFETCH_M2(19,%r9) \ VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \ /*36*/ \ VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \ VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \ VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \ - VPREFETCH2(20,%r9) \ - VPREFETCH2(21,%r9) \ - VPREFETCH2(22,%r9) \ - VPREFETCH2(23,%r9) \ - VPREFETCHG(2,%r8) \ - VPREFETCHG(3,%r8) \ - VPREFETCH2(4,%r8) \ - VPREFETCH2(5,%r8) \ + VPREFETCH_M2(20,%r9) \ + VPREFETCH_M2(21,%r9) \ + VPREFETCH_M2(22,%r9) \ + VPREFETCH_M2(23,%r9) \ + VPREFETCH_G1(2,%r8) \ + VPREFETCH_G1(3,%r8) \ + VPREFETCH_G2(4,%r8) \ + VPREFETCH_G2(5,%r8) \ /*42 insns*/ ); #define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) \ @@ -794,8 +843,8 @@ Author: paboyle VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \ VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \ VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \ - /* VPREFETCHG(2,%r8)*/ \ - /* VPREFETCHG(3,%r8)*/ \ + /* VPREFETCH1(2,%r8)*/ \ + /* VPREFETCH1(3,%r8)*/ \ /*42 insns*/ );