diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h
index 52be9c05..57d9064d 100644
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -701,9 +701,28 @@ namespace Optimization {
   //Integer Reduce
   template<>
   inline Integer Reduce::operator()(__m256i in){
-    // FIXME unimplemented
-    printf("Reduce : Missing integer implementation -> FIX\n");
-    assert(0);
+    __m128i ret;
+#if defined (AVX2)
+    // AVX2 horizontal adds only operate within each 128-bit half of the
+    // register; use an SSE add to combine the two halves for the result.
+    __m256i v1, v2;
+    __m128i u1, u2;
+    v1  = _mm256_hadd_epi32(in, in);
+    v2  = _mm256_hadd_epi32(v1, v1);
+    u1  = _mm256_castsi256_si128(v2);      // lower half
+    u2  = _mm256_extracti128_si256(v2, 1); // upper half
+    ret = _mm_add_epi32(u1, u2);           // 128-bit add: operands are __m128i
+#else
+    // No AVX horizontal add; extract lower and upper halves of register & use
+    // SSE intrinsics to sum them and reduce the 128-bit result.
+    __m128i u1, u2, u3;
+    u1  = _mm256_extractf128_si256(in, 0); // lower half
+    u2  = _mm256_extractf128_si256(in, 1); // upper half
+    u3  = _mm_add_epi32(u1, u2);
+    u1  = _mm_hadd_epi32(u3, u3);
+    ret = _mm_hadd_epi32(u1, u1);
+#endif
+    return _mm_cvtsi128_si32(ret);
   }
 
 }
diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h
index ba054665..458a8f7c 100644
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -543,6 +543,24 @@ namespace Optimization {
     u512d conv; conv.v = v1;
     return conv.f[0];
   }
+
+  //Integer Reduce
+  template<>
+  inline Integer Reduce::operator()(__m512i in){
+    // No full vector reduce: add the lower and upper 256-bit halves of the
+    // register, then perform the AVX2 reduction on the 256-bit sum.
+    __m256i v1, v2, v3;
+    __m128i u1, u2, ret;
+    v1  = _mm512_castsi512_si256(in);       // lower half
+    v2  = _mm512_extracti64x4_epi64(in, 1); // upper half (AVX512F; no AVX512DQ needed)
+    v3  = _mm256_add_epi32(v1, v2);
+    v1  = _mm256_hadd_epi32(v3, v3);
+    v2  = _mm256_hadd_epi32(v1, v1);
+    u1  = _mm256_castsi256_si128(v2);       // lower half
+    u2  = _mm256_extracti128_si256(v2, 1);  // upper half
+    ret = _mm_add_epi32(u1, u2);            // 128-bit add: operands are __m128i
+    return _mm_cvtsi128_si32(ret);
+  }
 #else
   //Complex float Reduce
   template<>
@@ -570,9 +588,7 @@ namespace Optimization {
   //Integer Reduce
   template<>
   inline Integer Reduce::operator()(__m512i in){
-    // FIXME unimplemented
-    printf("Reduce : Missing integer implementation -> FIX\n");
-    assert(0);
+    return _mm512_reduce_add_epi32(in);
   }
 #endif
 
diff --git a/lib/simd/Grid_imci.h b/lib/simd/Grid_imci.h
index 173e57d8..a1dae565 100644
--- a/lib/simd/Grid_imci.h
+++ b/lib/simd/Grid_imci.h
@@ -401,9 +401,7 @@ namespace Optimization {
   //Integer Reduce
   template<>
   inline Integer Reduce::operator()(__m512i in){
-    // FIXME unimplemented
-    printf("Reduce : Missing integer implementation -> FIX\n");
-    assert(0);
+    return _mm512_reduce_add_epi32(in);
   }
 
 
diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h
index 2fb2df76..0b1f9ffb 100644
--- a/lib/simd/Grid_sse4.h
+++ b/lib/simd/Grid_sse4.h
@@ -570,9 +570,9 @@ namespace Optimization {
   //Integer Reduce
   template<>
   inline Integer Reduce::operator()(__m128i in){
-    // FIXME unimplemented
-    printf("Reduce : Missing integer implementation -> FIX\n");
-    assert(0);
+    __m128i v1 = _mm_hadd_epi32(in, in);
+    __m128i v2 = _mm_hadd_epi32(v1, v1);
+    return _mm_cvtsi128_si32(v2);
   }
 
 }