From e5657510b0f9e068fb3142dec6b47c4cc5e29818 Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 19 Apr 2016 22:24:18 +0100 Subject: [PATCH] Rotate support for Ls simd-ized --- lib/simd/Grid_avx512.h | 116 +++++++++++++++++++++++++++++++++++ lib/simd/Grid_sse4.h | 26 ++++++++ lib/simd/Grid_vector_types.h | 40 ++++++++++-- 3 files changed, 176 insertions(+), 6 deletions(-) diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index 5d014137..9e601971 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -308,6 +308,122 @@ namespace Optimization { }; + struct Rotate{ + + static inline __m512 rotate(__m512 in, int n){ + return = _mm512_alignr_epi32(in,in,n); + }; + + static inline __m512d rotate(__m512d in, int n){ + return = _mm512_alignr_epi64(tmp,in,n); + }; + + +#if 0 + // 16 x 32 bit = 512 bits; 0-15 rotates + static inline __m512 rotateR(__m512 in, int n){ + + // 0 : D3210 C3210 B3210 A3210 -> D3 C3 B3 A3 D2 C2 B2 A2 D1 C1 B1 A1 D0 C0 B0 A0 + // 1 : A0321 D3210 C3210 B3210 -> A0 D3 C3 B3 A3 D2 C2 B2 A2 D1 C1 B1 A1 D0 C0 B0 + // 2 : B0321 A0321 D3210 C3210 -> B0 A0 D3 C3 B3 A3 D2 C2 B2 A2 D1 C1 B1 A1 D0 C0 + // 3 : C0321 B0321 A0321 D3210 -> C0 B0 A0 D3 C3 B3 A3 D2 C2 B2 A2 D1 C1 B1 A1 D0 + // 4 : D0321 C0321 B0321 A0321 -> D0 C0 B0 A0 D3 C3 B3 A3 D2 C2 B2 A2 D1 C1 B1 A1 + // 5 : A1032 D0321 C0321 B0321 -> A1 D0 C0 B0 A0 D3 C3 B3 A3 D2 C2 B2 A2 D1 C1 B1 + // 6 : B1032 A1032 D0321 C0321 -> B1 A1 D0 C0 B0 A0 D3 C3 B3 A3 D2 C2 B2 A2 D1 C1 + // 7 : C1032 B1032 A1032 D0321 -> C1 B1 A1 D0 C0 B0 A0 D3 C3 B3 A3 D2 C2 B2 A2 D1 + // 8 : D1032 C1032 B1032 A1032 -> D1 C1 B1 A1 D0 C0 B0 A0 D3 C3 B3 A3 D2 C2 B2 A2 + //... + //15 : C3210 B3210 A3210 D2103 -> C3 B3 A3 D2 C2 B2 A2 D1 C1 B1 A1 D0 C0 B0 A0 D3 + + int shuf_l = ( (n+3)/4 ) % 4; // shuf = 0,1,1,1,1,2,2,2,2,3,3,3,3,0,0,0 + int shuf_r = ( (n)/4 ) % 4; // shuf = 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 + + int peri = n%4; + __m512 left,right; + switch(shuf_l){ // In = D3210 C3210 B3210 A3210 + case 0: left = in; break; // tmp = D3210 C3210 B3210 A3210 + case 1: left = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(0,3,2,1)); break; // tmp = D0321 C0321 B0321 A0321 + case 2: left = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); break; // tmp = D1032... + case 3: left = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,1,0,3)); break; // tmp = D2103... + } + + switch(shuf_r){ // In = D3210 C3210 B3210 A3210 + case 0: right = in; break; // tmp = D3210 C3210 B3210 A3210 + case 1: right = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(0,3,2,1)); break; // tmp = D0321 C0321 B0321 A0321 + case 2: right = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); break; // tmp = D1032... + case 3: right = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,1,0,3)); break; // tmp = D2103... + } + return = _mm512_alignr_epi32(left,right,peri*4); + + }; + + // 8 x 64 bit = 512 bits; 0-7 rotates + static inline __m512 RotateZ(__m512 in, int n){ + + // 0 : D10 C10 B10 A10 -> D1 C1 B1 A1 D0 C0 B0 A0 + // 1 : A01 D10 C10 B10 -> A0 D1 C1 B1 A1 D0 C0 B0 + // 2 : B01 A01 D10 C10 -> B0 A0 D1 C1 B1 A1 D0 C0 + // 3 : C01 B01 A01 D10 -> C0 B0 A0 D1 C1 B1 A1 D0 + // 4 : D01 C01 B01 A01 -> D0 C0 B0 A0 D1 C1 B1 A1 + // 5 : A10 D01 C01 B01 -> A1 D0 C0 B0 A0 D1 C1 B1 + // 6 : B10 A10 D01 C01 -> B1 A1 D0 C0 B0 A0 D1 C1 + // 7 : C10 B10 A10 D01 -> C1 B1 A1 D0 C0 B0 A0 D1 + + int shuf_l = ((n+3)/4) % 2;// 0,1,1,1,1,0,0,0 + int shuf_r = (n/4) % 2; + int peri = n%4; + + __m512 left, right; + switch(shuf_l){ // In = D3210 C3210 B3210 A3210 + case 0: left = in; break; // tmp = D3210 C3210 B3210 A3210 + case 1: left = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(0,1,3,2)); break; // tmp = D0132... + } + switch(shuf_r){ // In = D3210 C3210 B3210 A3210 + case 0: right = in; break; // tmp = D3210 C3210 B3210 A3210 + case 1: right = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(0,1,3,2)); break; // tmp = D0132... + } + return = _mm512_alignr_epi32(tmp,in,peri*4); + }; + + + + // 8 x 64 bit = 512 bits; 0-7 rotates + static inline __m512d RotateR(__m512d in, int n){ + + // 0 : D10 C10 B10 A10 -> D1 C1 B1 A1 D0 C0 B0 A0 + // 1 : A01 D10 C10 B10 -> A0 D1 C1 B1 A1 D0 C0 B0 + // 2 : B01 A01 D10 C10 -> B0 A0 D1 C1 B1 A1 D0 C0 + // 3 : C01 B01 A01 D10 -> C0 B0 A0 D1 C1 B1 A1 D0 + // 4 : D01 C01 B01 A01 -> D0 C0 B0 A0 D1 C1 B1 A1 + // 5 : A10 D01 C01 B01 -> A1 D0 C0 B0 A0 D1 C1 B1 + // 6 : B10 A10 D01 C01 -> B1 A1 D0 C0 B0 A0 D1 C1 + // 7 : C10 B10 A10 D01 -> C1 B1 A1 D0 C0 B0 A0 D1 + int shuf_l = ((n+3)/4) % 2;// 0,1,1,1,1,0,0,0 + int shuf_r = (n/4) % 2; + int peri = n%4; + + __m512 left, right; + switch(shuf_l){ + case 0: left = in; break; + case 1: left = _mm512_shuffle_pd(in,in,0x55); + } + switch(shuf_r){ + case 0: right = in; break; + case 1: right = _mm512_shuffle_pd(in,in,0x55); + } + return = _mm512_alignr_epi64(tmp,in,peri*2); + + }; + + // 4 x 128 bit = 512 bits; 0-4 rotates + static inline __m512 RotateZ(__m512 in, int n){ + int peri = n%4; + return = _mm512_alignr_epi32(in,in,peri*2); + }; +#endif + + }; + ////////////////////////////////////////////// // Some Template specialization diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index b88ad4c9..8abd4581 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -294,6 +294,32 @@ namespace Optimization { }; + struct Rotate{ + + static inline __m128 rotate(__m128 in,int n){ + switch(n){ + case 0: return tRotate<0>(in);break; + case 1: return tRotate<1>(in);break; + case 2: return tRotate<2>(in);break; + case 3: return tRotate<3>(in);break; + default: assert(0); + } + } + static inline __m128d rotate(__m128d in,int n){ + switch(n){ + case 0: return tRotate<0>(in);break; + case 1: return tRotate<1>(in);break; + default: assert(0); + } + } + +#define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16) +#define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16) + + template static inline __m128 tRotate(__m128 in){ return _mm_alignr_epi32(in,in,n); }; + template static inline __m128d tRotate(__m128d in){ return _mm_alignr_epi64(in,in,n); }; + + }; ////////////////////////////////////////////// // Some Template specialization diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index c247f15d..b485a47a 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -299,16 +299,44 @@ namespace Grid { } friend inline void permute(Grid_simd &y,Grid_simd b,int perm) { - if (perm==3) permute3(y,b); - else if (perm==2) permute2(y,b); - else if (perm==1) permute1(y,b); - else if (perm==0) permute0(y,b); + if ( perm & RotateBit ) { + int dist = perm&0xF; + y=rotate(b,dist); + return; + } + switch(perm){ + case 3: permute3(y,b); break; + case 2: permute2(y,b); break; + case 1: permute1(y,b); break; + case 0: permute0(y,b); break; + default: assert(0); + } } - - };// end of Grid_simd class definition + //////////////////////////////////////////////////////////////////// + // General rotate + //////////////////////////////////////////////////////////////////// + template =0> + inline Grid_simd rotate(Grid_simd b,int nrot) + { + nrot = nrot % Grid_simd::Nsimd(); + Grid_simd ret; + // std::cout << "Rotate Real by "< =0> + inline Grid_simd rotate(Grid_simd b,int nrot) + { + nrot = nrot % Grid_simd::Nsimd(); + Grid_simd ret; + // std::cout << "Rotate Complex by "<