diff --git a/TODO b/TODO index ed4dedd4..670c53e3 100644 --- a/TODO +++ b/TODO @@ -1,8 +1,8 @@ ================================================================ *** Hacks and bug fixes to clean up and Audits ================================================================ -* Base class to share common code between vRealF, VComplexF etc... - - Performance check on Guido's reimplementation strategy +* Base class to share common code between vRealF, VComplexF etc... done + - Performance check on Guido's reimplementation strategy - (GUIDO) tested and no difference was found, merged * FIXME audit diff --git a/lib/Grid_simd.h b/lib/Grid_simd.h index 19b30f03..39eb4654 100644 --- a/lib/Grid_simd.h +++ b/lib/Grid_simd.h @@ -95,100 +95,10 @@ namespace Grid { template<> inline void zeroit(RealF &arg){ arg=0; }; template<> inline void zeroit(RealD &arg){ arg=0; }; - // Eventually delete this part -#if defined (SSE4) - typedef __m128 fvec; - typedef __m128d dvec; - typedef __m128 cvec; - typedef __m128d zvec; - typedef __m128i ivec; -#endif -#if defined (AVX1) || defined (AVX2) - typedef __m256 fvec; - typedef __m256d dvec; - typedef __m256 cvec; - typedef __m256d zvec; - typedef __m256i ivec; -#endif -#if defined (AVX512) - typedef __m512 fvec; - typedef __m512d dvec; - typedef __m512 cvec; - typedef __m512d zvec; - typedef __m512i ivec; -#endif -#if defined (QPX) - typedef float fvec __attribute__ ((vector_size (16))); // QPX has same SIMD width irrespective of precision - typedef float cvec __attribute__ ((vector_size (16))); - - typedef vector4double dvec; - typedef vector4double zvec; -#endif - -#if defined (AVX1) || defined (AVX2) || defined (AVX512) - inline void v_prefetch0(int size, const char *ptr){ - for(int i=0;i BA DC FE HG -// Permute 1 every ABCDEFGH -> CD AB GH EF -// Permute 2 every ABCDEFGH -> EFGH ABCD -// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single) -// Permute 4 possible on half precision @512bit vectors. 
-////////////////////////////////////////////////////////// -template -inline void Gpermute(vsimd &y,const vsimd &b,int perm){ - union { - fvec f; - decltype(vsimd::v) v; - } conv; - conv.v = b.v; - switch (perm){ -#if defined(AVX1)||defined(AVX2) - // 8x32 bits=>3 permutes - case 2: - conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); - break; - case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break; - case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break; -#endif -#ifdef SSE4 - case 1: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break; - case 0: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2));break; -#endif -#ifdef AVX512 - // 16 floats=> permutes - // Permute 0 every abcd efgh ijkl mnop -> badc fehg jilk nmpo - // Permute 1 every abcd efgh ijkl mnop -> cdab ghef jkij opmn - // Permute 2 every abcd efgh ijkl mnop -> efgh abcd mnop ijkl - // Permute 3 every abcd efgh ijkl mnop -> ijkl mnop abcd efgh - case 3: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); break; - case 2: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); break; - case 1: conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break; - case 0: conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break; -#endif -#ifdef QPX -#error not implemented -#endif - default: assert(0); break; - } - y.v=conv.v; - - }; }; #include - namespace Grid { // NB: Template the following on "type Complex" and then implement *,+,- for diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index a77cf963..3664e0f7 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -2,7 +2,7 @@ /*! 
@file Grid_vector_types.h @brief Defines templated class Grid_simd to deal with inner vector types */ -// Time-stamp: <2015-05-26 12:05:39 neo> +// Time-stamp: <2015-05-26 13:22:36 neo> //--------------------------------------------------------------------------- #ifndef GRID_VECTOR_TYPES #define GRID_VECTOR_TYPES @@ -16,7 +16,9 @@ #if defined AVX512 #include "Grid_knc.h" #endif - +#if defined QPX +#include "Grid_qpx.h" +#endif namespace Grid { @@ -33,8 +35,6 @@ namespace Grid { template using EnableIf = Invoke>; template using NotEnableIf= Invoke>; - - //////////////////////////////////////////////////////// // Check for complexity with type traits template struct is_complex : std::false_type {}; @@ -57,6 +57,58 @@ namespace Grid { /////////////////////////////////////////////// +////////////////////////////////////////////////////////// +// Permute +// Permute 0 every ABCDEFGH -> BA DC FE HG +// Permute 1 every ABCDEFGH -> CD AB GH EF +// Permute 2 every ABCDEFGH -> EFGH ABCD +// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single) +// Permute 4 possible on half precision @512bit vectors. 
+////////////////////////////////////////////////////////// +template +inline void Gpermute(vsimd &y,const vsimd &b,int perm){ + union { + SIMD_Ftype f; + decltype(vsimd::v) v; + } conv; + conv.v = b.v; + switch (perm){ +#if defined(AVX1)||defined(AVX2) + // 8x32 bits=>3 permutes + case 2: + conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); + break; + case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break; + case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break; +#endif +#ifdef SSE4 + case 1: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break; + case 0: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2));break; +#endif +#ifdef AVX512 + // 16 floats=> 4 permutes + // Permute 0 every abcd efgh ijkl mnop -> badc fehg jilk nmpo + // Permute 1 every abcd efgh ijkl mnop -> cdab ghef klij opmn + // Permute 2 every abcd efgh ijkl mnop -> efgh abcd mnop ijkl + // Permute 3 every abcd efgh ijkl mnop -> ijkl mnop abcd efgh + case 3: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); break; + case 2: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); break; + case 1: conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break; + case 0: conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break; +#endif +#ifdef QPX +#error not implemented +#endif + default: assert(0); break; + } + y.v=conv.v; + + }; + +/////////////////////////////////////// + + + /* @brief Grid_simd class for the SIMD vector type operations */ @@ -380,6 +432,12 @@ namespace Grid { typedef Grid_simd< std::complex< double >, SIMD_Dtype > vComplexD; typedef Grid_simd< Integer , SIMD_Itype > vInteger; + + + + + + } #endif