diff --git a/lib/Grid_simd.h b/lib/Grid_simd.h index cccc82e0..26edb4c9 100644 --- a/lib/Grid_simd.h +++ b/lib/Grid_simd.h @@ -95,6 +95,20 @@ namespace Grid { template<> inline void zeroit(RealF &arg){ arg=0; }; template<> inline void zeroit(RealD &arg){ arg=0; }; + + ////////////////////////////////////////////////////////// + // Permute (layouts below are for 8 x single precision, as implemented by the AVX Gpermute) + // Permute 0 every ABCDEFGH -> EFGH ABCD + // Permute 1 every ABCDEFGH -> CD AB GH EF + // Permute 2 every ABCDEFGH -> BA DC FE HG + // Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single) + // Permute 4 possible on half precision @512bit vectors. + // + // Defined inside SIMD specialization files + ////////////////////////////////////////////////////////// + template + inline void Gpermute(VectorSIMD &y,const VectorSIMD &b,int perm); + }; #include diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index f9fa1f85..ec2df44b 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -4,7 +4,7 @@ Using intrinsics */ -// Time-stamp: <2015-05-22 18:58:27 neo> +// Time-stamp: <2015-05-27 12:07:15 neo> //---------------------------------------------------------------------- #include @@ -383,6 +383,30 @@ namespace Grid { typedef __m256d SIMD_Dtype; // Double precision type typedef __m256i SIMD_Itype; // Integer type + // prefetching + inline void v_prefetch0(int size, const char *ptr){ + for(int i=0;i + inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) { + union { + __m256 f; + decltype(VectorSIMD::v) v; + } conv; + conv.v = b.v; + switch(perm){ + case 3: break; //empty for AVX1/2 + case 2: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break; + case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break; + case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break; + default: assert(0); break; + } + y.v=conv.v; + }; // Function name aliases typedef Optimization::Vsplat VsplatSIMD; diff --git a/lib/simd/Grid_knc.h b/lib/simd/Grid_knc.h index 
daec3973..bb914270 100644 --- a/lib/simd/Grid_knc.h +++ b/lib/simd/Grid_knc.h @@ -4,7 +4,7 @@ Using intrinsics */ -// Time-stamp: <2015-05-22 17:12:44 neo> +// Time-stamp: <2015-05-27 12:08:50 neo> //---------------------------------------------------------------------- #include @@ -302,7 +302,32 @@ namespace Grid { typedef __m512d SIMD_Dtype; // Double precision type typedef __m512i SIMD_Itype; // Integer type + // prefetch + inline void v_prefetch0(int size, const char *ptr){ + for(int i=0;i + inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) { + union { + __m512 f; + decltype(VectorSIMD::v) v; + } conv; + conv.v = b.v; + switch(perm){ + case 3: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); break; + case 2: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); break; + case 1 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break; + case 0 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break; + default: assert(0); break; + } + y.v=conv.v; // store into the wrapped register member, not the wrapper object + }; + // Function name aliases typedef Optimization::Vsplat VsplatSIMD; typedef Optimization::Vstore VstoreSIMD; diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h index dc0251f8..2bd5a20e 100644 --- a/lib/simd/Grid_qpx.h +++ b/lib/simd/Grid_qpx.h @@ -4,7 +4,7 @@ Using intrinsics */ -// Time-stamp: <2015-05-22 17:29:26 neo> +// Time-stamp: <2015-05-27 11:30:21 neo> //---------------------------------------------------------------------- // lot of undefined functions @@ -251,6 +251,7 @@ namespace Grid { typedef vector4double SIMD_Dtype; // Double precision type typedef int SIMD_Itype; // Integer type + inline void v_prefetch0(int size, const char *ptr){}; // Function name aliases typedef Optimization::Vsplat VsplatSIMD; diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index eb930003..ce2737d5 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -4,7 +4,7 @@ Using intrinsics */ -// Time-stamp: <2015-05-21 18:06:30 neo> +// 
Time-stamp: <2015-05-27 12:02:07 neo> //---------------------------------------------------------------------- #include @@ -221,7 +221,7 @@ namespace Optimization { }; - + ////////////////////////////////////////////// @@ -277,6 +277,10 @@ namespace Optimization { assert(0); } + + + + } @@ -289,6 +293,28 @@ namespace Grid { typedef __m128i SIMD_Itype; // Integer type + inline void v_prefetch0(int size, const char *ptr){}; // prefetch utilities: empty stub in this SSE4 backend + + // Gpermute function: copies b.v, reorders its floats according to perm, stores the result in y.v + template < typename VectorSIMD > + inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) { + union { + __m128 f; + decltype(VectorSIMD::v) v; + } conv; + conv.v = b.v; + switch(perm){ + case 3: break; // no-op for SSE4: y receives an unmodified copy of b + case 2: break; // no-op for SSE4: y receives an unmodified copy of b + case 1: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break; // ABCD -> BADC + case 0: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break; // ABCD -> CDAB + default: assert(0); break; + } + y.v=conv.v; + }; + + + // Function name aliases typedef Optimization::Vsplat VsplatSIMD; typedef Optimization::Vstore VstoreSIMD; @@ -296,6 +322,8 @@ namespace Grid { typedef Optimization::Vstream VstreamSIMD; template using ReduceSIMD = Optimization::Reduce; + + // Arithmetic operations typedef Optimization::Sum SumSIMD; diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 41706906..0bedf32f 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -2,7 +2,7 @@ /*! 
@file Grid_vector_types.h @brief Defines templated class Grid_simd to deal with inner vector types */ -// Time-stamp: <2015-05-26 14:08:13 neo> +// Time-stamp: <2015-05-27 12:04:06 neo> //--------------------------------------------------------------------------- #ifndef GRID_VECTOR_TYPES #define GRID_VECTOR_TYPES @@ -58,56 +58,7 @@ namespace Grid { /////////////////////////////////////////////// - // Move to the simd files -////////////////////////////////////////////////////////// -// Permute -// Permute 0 every ABCDEFGH -> BA DC FE HG -// Permute 1 every ABCDEFGH -> CD AB GH EF -// Permute 2 every ABCDEFGH -> EFGH ABCD -// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single) -// Permute 4 possible on half precision @512bit vectors. -////////////////////////////////////////////////////////// -template -inline void Gpermute(vsimd &y,const vsimd &b,int perm){ - union { - SIMD_Ftype f; - decltype(vsimd::v) v; - } conv; - conv.v = b.v; - switch (perm){ -#if defined(AVX1)||defined(AVX2) - // 8x32 bits=>3 permutes - case 2: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break; - case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break; - case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break; -#endif -#ifdef SSE4 - case 1: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break; - case 0: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2));break; -#endif -#ifdef AVX512 - // 16 floats=> permutes - // Permute 0 every abcd efgh ijkl mnop -> badc fehg jilk nmpo - // Permute 1 every abcd efgh ijkl mnop -> cdab ghef jkij opmn - // Permute 2 every abcd efgh ijkl mnop -> efgh abcd mnop ijkl - // Permute 3 every abcd efgh ijkl mnop -> ijkl mnop abcd efgh - case 3: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); break; - case 2: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); break; - case 1: conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break; 
- case 0: conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break; -#endif -#ifdef QPX -#error not implemented -#endif - default: assert(0); break; - } - y.v=conv.v; - - }; - -/////////////////////////////////////// - - + /* @brief Grid_simd class for the SIMD vector type operations