diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index b836e757..f7292add 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -437,6 +437,111 @@ namespace Optimization { }; +#if defined (AVX2) || defined (AVXFMA4) +#define _mm256_alignr_epi32(ret,a,b,n) ret= _mm256_alignr_epi8(a,b,(n*4)%16) +#define _mm256_alignr_epi64(ret,a,b,n) ret= _mm256_alignr_epi8(a,b,(n*8)%16) +#endif + +#if defined (AVX1) + +#define _mm256_alignr_epi32(ret,a,b,n) { \ + __m128 aa, bb; \ + \ + aa = _mm256_extractf128_ps(a,1); \ + bb = _mm256_extractf128_ps(b,1); \ + aa = _mm_alignr_epi8(aa,bb,(n*4)%16); \ + ret = _mm256_insertf128_ps(ret,aa,1); \ + \ + aa = _mm256_extractf128_ps(a,0); \ + bb = _mm256_extractf128_ps(b,0); \ + aa = _mm_alignr_epi8(aa,bb,(n*4)%16); \ + ret = _mm256_insertf128_ps(ret,aa,0); \ + } + +#define _mm256_alignr_epi64(ret,a,b,n) { \ + __m128 aa, bb; \ + \ + aa = _mm256_extractf128_pd(a,1); \ + bb = _mm256_extractf128_pd(b,1); \ + aa = _mm_alignr_epi8(aa,bb,(n*8)%16); \ + ret = _mm256_insertf128_pd(ret,aa,1); \ + \ + aa = _mm256_extractf128_pd(a,0); \ + bb = _mm256_extractf128_pd(b,0); \ + aa = _mm_alignr_epi8(aa,bb,(n*8)%16); \ + ret = _mm256_insertf128_pd(ret,aa,0); \ + } + +#endif + + inline std::ostream & operator << (std::ostream& stream, const __m256 a) + { + const float *p=(const float *)&a; + stream<< "{"<
(in);break;
+ case 1: return tRotate<1>(in);break;
+ case 2: return tRotate<2>(in);break;
+ case 3: return tRotate<3>(in);break;
+ case 4: return tRotate<4>(in);break;
+ case 5: return tRotate<5>(in);break;
+ case 6: return tRotate<6>(in);break;
+ case 7: return tRotate<7>(in);break;
+ default: assert(0);
+ }
+ }
+ // Runtime-count front end for tRotate on a __m256d (four packed doubles).
+ // tRotate takes its count as a template argument, so the runtime value n is
+ // mapped onto the matching instantiation with a switch.
+ //   in : vector handed to tRotate
+ //   n  : rotation count; only 0..3 are valid (one case per value below)
+ // Returns tRotate<n>(in).
+ // An out-of-range n is a caller bug: it trips the assert in debug builds.
+ // When asserts are compiled out (NDEBUG) the input is returned unchanged,
+ // so control never flows off the end of this value-returning function
+ // (which would be undefined behaviour).
+ static inline __m256d rotate(__m256d in,int n){
+   switch(n){
+   case 0: return tRotate<0>(in);
+   case 1: return tRotate<1>(in);
+   case 2: return tRotate<2>(in);
+   case 3: return tRotate<3>(in);
+   default: assert(0); return in; // defined fallback when assert is a no-op
+   }
+ }
+
+
+ template