mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-25 21:25:56 +01:00
Added missing SIMD integer reduction implementation for AVX, AVX-512, SSE4, IMCI
This commit is contained in:
parent
07b2c1b253
commit
a833f88c32
@ -701,9 +701,28 @@ namespace Optimization {
|
|||||||
//Integer Reduce
|
//Integer Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Integer Reduce<Integer, __m256i>::operator()(__m256i in){
|
inline Integer Reduce<Integer, __m256i>::operator()(__m256i in){
|
||||||
// FIXME unimplemented
|
__m128i ret;
|
||||||
printf("Reduce : Missing integer implementation -> FIX\n");
|
#if defined (AVX2)
|
||||||
assert(0);
|
// AVX2 horizontal adds within upper and lower halves of register; use
|
||||||
|
// SSE to add upper and lower halves for result.
|
||||||
|
__m256i v1, v2;
|
||||||
|
__m128i u1, u2;
|
||||||
|
v1 = _mm256_hadd_epi32(in, in);
|
||||||
|
v2 = _mm256_hadd_epi32(v1, v1);
|
||||||
|
u1 = _mm256_castsi256_si128(v2); // upper half
|
||||||
|
u2 = _mm256_extracti128_si256(v2, 1); // lower half
|
||||||
|
ret = _mm256_add_epi32(u1, u2);
|
||||||
|
#else
|
||||||
|
// No AVX horizontal add; extract upper and lower halves of register & use
|
||||||
|
// SSE intrinsics.
|
||||||
|
__m128i u1, u2, u3;
|
||||||
|
u1 = _mm256_extractf128_si256(in, 0); // upper half
|
||||||
|
u2 = _mm256_extractf128_si256(in, 1); // lower half
|
||||||
|
u3 = _mm_add_epi32(u1, u2);
|
||||||
|
u1 = _mm_hadd_epi32(u3, u3);
|
||||||
|
ret = _mm_hadd_epi32(u1, u1);
|
||||||
|
#endif
|
||||||
|
return _mm_cvtsi128_si32(ret);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -543,6 +543,24 @@ namespace Optimization {
|
|||||||
u512d conv; conv.v = v1;
|
u512d conv; conv.v = v1;
|
||||||
return conv.f[0];
|
return conv.f[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//Integer Reduce
|
||||||
|
template<>
|
||||||
|
inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
|
||||||
|
// No full vector reduce, use AVX to add upper and lower halves of register
|
||||||
|
// and perform AVX reduction.
|
||||||
|
__m256i v1, v2, v3;
|
||||||
|
__m128i u1, u2, ret;
|
||||||
|
v1 = _mm512_castsi512_si256(in); // upper half
|
||||||
|
v2 = _mm512_extracti32x8_epi32(in, 1); // lower half
|
||||||
|
v3 = _mm256_add_epi32(v1, v2);
|
||||||
|
v1 = _mm256_hadd_epi32(v3, v3);
|
||||||
|
v2 = _mm256_hadd_epi32(v1, v1);
|
||||||
|
u1 = _mm256_castsi256_si128(v2) // upper half
|
||||||
|
u2 = _mm256_extracti128_si256(v2, 1); // lower half
|
||||||
|
ret = _mm256_add_epi32(u1, u2);
|
||||||
|
return _mm_cvtsi128_si32(ret);
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
//Complex float Reduce
|
//Complex float Reduce
|
||||||
template<>
|
template<>
|
||||||
@ -570,9 +588,7 @@ namespace Optimization {
|
|||||||
//Integer Reduce
|
//Integer Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
|
inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
|
||||||
// FIXME unimplemented
|
return _mm512_reduce_add_epi32(in);
|
||||||
printf("Reduce : Missing integer implementation -> FIX\n");
|
|
||||||
assert(0);
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -401,9 +401,7 @@ namespace Optimization {
|
|||||||
//Integer Reduce
|
//Integer Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
|
inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
|
||||||
// FIXME unimplemented
|
return _mm512_reduce_add_epi32(in);
|
||||||
printf("Reduce : Missing integer implementation -> FIX\n");
|
|
||||||
assert(0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -570,9 +570,9 @@ namespace Optimization {
|
|||||||
//Integer Reduce
|
//Integer Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Integer Reduce<Integer, __m128i>::operator()(__m128i in){
|
inline Integer Reduce<Integer, __m128i>::operator()(__m128i in){
|
||||||
// FIXME unimplemented
|
__m128i v1 = _mm_hadd_epi32(in, in);
|
||||||
printf("Reduce : Missing integer implementation -> FIX\n");
|
__m128i v2 = _mm_hadd_epi32(v1, v1);
|
||||||
assert(0);
|
return _mm_cvtsi128_si32(v2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user