1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-09-20 09:15:38 +01:00

Simd revert to Guido's commit. I edited concurrently and things went bad.

This commit is contained in:
Peter Boyle 2015-05-26 22:20:09 +01:00
parent 48bb3ab4e7
commit ccd47011b9

View File

@ -95,138 +95,6 @@ namespace Grid {
// zeroit specialisations for the scalar real types: each simply assigns 0 to
// its argument. RealF / RealD are presumably the single / double precision
// real typedefs -- declared outside this view, TODO confirm.
// NOTE(review): every definition below appears twice on its line; this looks
// like a side-by-side diff rendering artifact rather than intended code, as a
// repeated explicit specialisation would be a redefinition -- confirm against
// the upstream file.
template<> inline void zeroit(RealF &arg){ arg=0; }; template<> inline void zeroit(RealF &arg){ arg=0; };
template<> inline void zeroit(RealD &arg){ arg=0; }; template<> inline void zeroit(RealD &arg){ arg=0; };
// Native SIMD register typedefs, selected by the instruction-set macro the
// build defines (SSE4, AVX1/AVX2, AVX512 or QPX).
//   fvec / dvec : vector of float / vector of double
//   cvec / zvec : same underlying register types as fvec / dvec; presumably
//                 used for single / double precision complex data -- TODO confirm
//   ivec        : integer vector (no ivec is declared in the QPX branch)
// NOTE(review): the branches are independent #if blocks, so defining more than
// one of these macros at once would produce conflicting typedefs.

// 128-bit registers.
#if defined (SSE4)
typedef __m128 fvec;
typedef __m128d dvec;
typedef __m128 cvec;
typedef __m128d zvec;
typedef __m128i ivec;
#endif
// 256-bit registers.
#if defined (AVX1) || defined (AVX2)
typedef __m256 fvec;
typedef __m256d dvec;
typedef __m256 cvec;
typedef __m256d zvec;
typedef __m256i ivec;
#endif
// 512-bit registers.
#if defined (AVX512)
typedef __m512 fvec;
typedef __m512d dvec;
typedef __m512 cvec;
typedef __m512d zvec;
typedef __m512i ivec;
#endif
// QPX: single precision uses a 16-byte compiler vector of float, double
// precision uses the vector4double type.
#if defined (QPX)
typedef float fvec __attribute__ ((vector_size (16))); // QPX has same SIMD width irrespective of precision
typedef float cvec __attribute__ ((vector_size (16)));
typedef vector4double dvec;
typedef vector4double zvec;
#endif
#if defined (AVX1) || defined (AVX2) || defined (AVX512)
// v_prefetch0: issue software prefetch hints ahead of a buffer.
//   size : number of bytes to step over, starting at ptr
//   ptr  : start of the buffer
// For every 64-byte step through [ptr, ptr+size) two hints are issued: one
// 4096 bytes ahead with _MM_HINT_T1 and one 512 bytes ahead with _MM_HINT_T0.
// Nothing is read or written through ptr by this function itself.
inline void v_prefetch0(int size, const char *ptr){
  const int step_bytes = 64;   // Define L1 linesize above// What about SSE?
  const int far_ahead  = 4096; // look-ahead used with the T1 hint
  const int near_ahead = 512;  // look-ahead used with the T0 hint
  for(int offset=0; offset<size; offset+=step_bytes){
    const char *here = ptr+offset;
    _mm_prefetch(here+far_ahead ,_MM_HINT_T1);
    _mm_prefetch(here+near_ahead,_MM_HINT_T0);
  }
}
#else
// No prefetch support selected for this build: v_prefetch0 does nothing.
inline void v_prefetch0(int size, const char *ptr){
  (void)size;
  (void)ptr;
}
#endif
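//////////////////////////////////////////////////////////
// Permute
// As implemented below the permutes run from coarsest to finest. Viewing a
// 256-bit register as eight 32-bit lanes ABCDEFGH (lowest lane first):
// Permute 0 every ABCDEFGH -> EFGH ABCD
// Permute 1 every ABCDEFGH -> CD AB GH EF
// Permute 2 every ABCDEFGH -> BA DC FE HG
// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
// Permute 4 possible on half precision @512bit vectors.
// NOTE(review): this banner previously listed permutes 0 and 2 the other way
// round, which did not match the shuffles used by Gpermute0/Gpermute2.
//////////////////////////////////////////////////////////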
// Gpermute0: write into y.v a lane-permuted copy of b.v.
// b.v is reinterpreted as an fvec through a union so that the single-precision
// shuffle intrinsics can be applied regardless of the type of vsimd::v.
// Effect on the 32-bit lanes (lowest lane first):
//   SSE4      : ABCD     -> CD AB      (the two halves are exchanged)
//   AVX1/AVX2 : ABCDEFGH -> EFGH ABCD  (the two 128-bit halves are exchanged)
//   AVX512    : 128-bit blocks reordered by _MM_SHUFFLE(1,0,3,2)
// When none of those macros is defined no shuffle is applied and y.v receives
// b.v unchanged.
template<class vsimd>
inline void Gpermute0(vsimd &y,const vsimd &b) {
  union {
    fvec as_fvec;
    decltype(vsimd::v) native;
  } pun;

  pun.native = b.v;

#if defined(SSE4)
  pun.as_fvec = _mm_shuffle_ps(pun.as_fvec,pun.as_fvec,_MM_SHUFFLE(1,0,3,2));
#endif
#if defined(AVX1) || defined(AVX2)
  pun.as_fvec = _mm256_permute2f128_ps(pun.as_fvec,pun.as_fvec,0x01);
#endif
#if defined(AVX512)
  pun.as_fvec = _mm512_permute4f128_ps(pun.as_fvec,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2));
#endif

  y.v = pun.native;
}
// Gpermute1: write into y.v a lane-permuted copy of b.v, one level finer than
// Gpermute0. The data is viewed as an fvec through a union.
// Effect on the 32-bit lanes (lowest lane first):
//   SSE4      : ABCD     -> BA DC        (adjacent lanes exchanged)
//   AVX1/AVX2 : ABCDEFGH -> CD AB GH EF  (halves of each 128-bit block exchanged)
//   AVX512    : 128-bit blocks reordered by _MM_SHUFFLE(2,3,0,1)
// When none of those macros is defined y.v receives b.v unchanged.
template<class vsimd>
inline void Gpermute1(vsimd &y,const vsimd &b) {
  union {
    fvec as_fvec;
    decltype(vsimd::v) native;
  } pun;

  pun.native = b.v;

#if defined(SSE4)
  pun.as_fvec = _mm_shuffle_ps(pun.as_fvec,pun.as_fvec,_MM_SHUFFLE(2,3,0,1));
#endif
#if defined(AVX1) || defined(AVX2)
  pun.as_fvec = _mm256_shuffle_ps(pun.as_fvec,pun.as_fvec,_MM_SHUFFLE(1,0,3,2));
#endif
#if defined(AVX512)
  pun.as_fvec = _mm512_permute4f128_ps(pun.as_fvec,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1));
#endif

  y.v = pun.native;
}
// Gpermute2: write into y.v a lane-permuted copy of b.v, one level finer than
// Gpermute1. The data is viewed as an fvec through a union.
// Effect on the 32-bit lanes (lowest lane first):
//   SSE4      : no shuffle is provided; y.v receives b.v unchanged
//   AVX1/AVX2 : ABCDEFGH -> BA DC FE HG  (adjacent lanes exchanged)
//   AVX512    : register swizzle _MM_SWIZ_REG_BADC
// When none of AVX1/AVX2/AVX512 is defined y.v receives b.v unchanged.
template<class vsimd>
inline void Gpermute2(vsimd &y,const vsimd &b) {
  union {
    fvec as_fvec;
    decltype(vsimd::v) native;
  } pun;

  pun.native = b.v;

  // SSE4: intentionally nothing to do at this level.
#if defined(AVX1) || defined(AVX2)
  pun.as_fvec = _mm256_shuffle_ps(pun.as_fvec,pun.as_fvec,_MM_SHUFFLE(2,3,0,1));
#endif
#if defined(AVX512)
  pun.as_fvec = _mm512_swizzle_ps(pun.as_fvec,_MM_SWIZ_REG_BADC);
#endif

  y.v = pun.native;
}
// Gpermute3: write into y.v a lane-permuted copy of b.v at the finest level.
// Only the AVX512 build applies a shuffle (register swizzle
// _MM_SWIZ_REG_CDAB on the data viewed as an fvec); for every other build
// y.v receives b.v unchanged.
template<class vsimd>
inline void Gpermute3(vsimd &y,const vsimd &b) {
  union {
    fvec as_fvec;
    decltype(vsimd::v) native;
  } pun;

  pun.native = b.v;

#if defined(AVX512)
  pun.as_fvec = _mm512_swizzle_ps(pun.as_fvec,_MM_SWIZ_REG_CDAB);
#endif

  y.v = pun.native;
}
// Gpermute: run-time dispatch to one of the fixed permutes.
//   y    : destination; its v member is overwritten by the selected permute
//   b    : source vector
//   perm : permute level, 0..3, forwarded to Gpermute0..Gpermute3
// Any other value of perm trips assert(0). In builds where assert is compiled
// out (NDEBUG) y is then left untouched.
template<class vsimd>
inline void Gpermute(vsimd &y,const vsimd &b,int perm){
  // The dispatcher needs no scratch copy of b: each GpermuteN makes its own.
  switch (perm){
  case 3: Gpermute3(y,b); break;
  case 2: Gpermute2(y,b); break;
  case 1: Gpermute1(y,b); break;
  case 0: Gpermute0(y,b); break;
  default: assert(0); break;
  }
}
}; };
#include <simd/Grid_vector_types.h> #include <simd/Grid_vector_types.h>