1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-13 04:37:05 +01:00

Corrected AVX regression error. Tested.

This commit is contained in:
Guido Cossu
2015-05-27 10:49:33 +09:00
parent b99f2279c3
commit 8abf6403d5
14 changed files with 3634 additions and 4381 deletions

View File

@ -2,7 +2,7 @@
/* lib/Grid_config.h.in. Generated from configure.ac by autoheader. */
/* AVX */
/* #undef AVX1 */
#define AVX1 1
/* AVX2 */
/* #undef AVX2 */
@ -93,14 +93,11 @@
/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "grid"
/* Define to the home page for this package. */
#define PACKAGE_URL ""
/* Define to the version of this package. */
#define PACKAGE_VERSION "1.0"
/* SSE4 */
#define SSE4 1
/* #undef SSE4 */
/* Define to 1 if you have the ANSI C header files. */
#define STDC_HEADERS 1

View File

@ -92,9 +92,6 @@
/* Define to the one symbol short name of this package. */
#undef PACKAGE_TARNAME
/* Define to the home page for this package. */
#undef PACKAGE_URL
/* Define to the version of this package. */
#undef PACKAGE_VERSION

View File

@ -299,7 +299,7 @@ namespace Optimization {
//////////////////////////////////////////////
// Some Template specialization
template < typename vtype >
void permute(vtype a, vtype b, int perm) {
void permute(vtype &a, vtype &b, int perm) {
union {
__m256 f;
vtype v;
@ -320,11 +320,16 @@ namespace Optimization {
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, __m256>::operator()(__m256 in){
  // Complex single-precision reduction of one 256-bit register.
  //
  // `in` is combined with two permuted copies of itself through
  // _mm256_add_ps, and the result is built from the first two float lanes
  // (real, imaginary) of the accumulated register.
  //
  // NOTE(review): what the two permutes exchange is decided by
  // Optimization::permute, whose body is not visible here; the per-call
  // comments below are the original author's description.
  __m256 v1,v2;

  // Lane access goes through a union with a float array rather than
  // subscripting the __m256 directly: operator[] on a vector type is a
  // compiler extension and is not accepted by every toolchain.
  union {
    __m256 v;
    float f[8];
  } conv;

  Optimization::permute(v1,in,0); // sse 128; paired complex single
  v1 = _mm256_add_ps(v1,in);
  Optimization::permute(v2,v1,1); // avx 256; quad complex single
  v1 = _mm256_add_ps(v1,v2);

  // Single exit point: the earlier `return Grid::ComplexF(v1[0],v1[1]);`
  // has been dropped so that this union-based extraction is what runs.
  conv.v = v1;
  return Grid::ComplexF(conv.f[0],conv.f[1]);
}
//Real float Reduce
template<>

View File

@ -77,9 +77,7 @@ inline void Gpermute(vsimd &y,const vsimd &b,int perm){
switch (perm){
#if defined(AVX1)||defined(AVX2)
// 8x32 bits=>3 permutes
case 2:
conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1));
break;
case 2: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break;
#endif