diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index 52be9c05..f4634432 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -701,9 +701,28 @@ namespace Optimization { //Integer Reduce template<> inline Integer Reduce::operator()(__m256i in){ - // FIXME unimplemented - printf("Reduce : Missing integer implementation -> FIX\n"); - assert(0); + __m128i ret; +#if defined (AVX2) + // AVX2 horizontal adds within upper and lower halves of register; use + // SSE to add upper and lower halves for result. + __m256i v1, v2; + __m128i u1, u2; + v1 = _mm256_hadd_epi32(in, in); + v2 = _mm256_hadd_epi32(v1, v1); + u1 = _mm256_castsi256_si128(v2); // upper half + u2 = _mm256_extracti128_si256(v2, 1); // lower half + ret = _mm_add_epi32(u1, u2); +#else + // No AVX horizontal add; extract upper and lower halves of register & use + // SSE intrinsics. + __m128i u1, u2, u3; + u1 = _mm256_extractf128_si256(in, 0); // upper half + u2 = _mm256_extractf128_si256(in, 1); // lower half + u3 = _mm_add_epi32(u1, u2); + u1 = _mm_hadd_epi32(u3, u3); + ret = _mm_hadd_epi32(u1, u1); +#endif + return _mm_cvtsi128_si32(ret); } } diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index ba054665..85d27421 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -543,6 +543,24 @@ namespace Optimization { u512d conv; conv.v = v1; return conv.f[0]; } + + //Integer Reduce + template<> + inline Integer Reduce::operator()(__m512i in){ + // No full vector reduce, use AVX to add upper and lower halves of register + // and perform AVX reduction. + __m256i v1, v2, v3; + __m128i u1, u2, ret; + v1 = _mm512_castsi512_si256(in); // upper half + v2 = _mm512_extracti32x8_epi32(in, 1); // lower half + v3 = _mm256_add_epi32(v1, v2); + v1 = _mm256_hadd_epi32(v3, v3); + v2 = _mm256_hadd_epi32(v1, v1); + u1 = _mm256_castsi256_si128(v2) // upper half + u2 = _mm256_extracti128_si256(v2, 1); // lower half + ret = _mm_add_epi32(u1, u2); + return _mm_cvtsi128_si32(ret); + } #else //Complex float Reduce template<> @@ -570,9 +588,7 @@ namespace Optimization { //Integer Reduce template<> inline Integer Reduce::operator()(__m512i in){ - // FIXME unimplemented - printf("Reduce : Missing integer implementation -> FIX\n"); - assert(0); + return _mm512_reduce_add_epi32(in); } #endif diff --git a/lib/simd/Grid_imci.h b/lib/simd/Grid_imci.h index 173e57d8..a1dae565 100644 --- a/lib/simd/Grid_imci.h +++ b/lib/simd/Grid_imci.h @@ -401,9 +401,7 @@ namespace Optimization { //Integer Reduce template<> inline Integer Reduce::operator()(__m512i in){ - // FIXME unimplemented - printf("Reduce : Missing integer implementation -> FIX\n"); - assert(0); + return _mm512_reduce_add_epi32(in); } diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h index cbca9118..8de7bde8 100644 --- a/lib/simd/Grid_qpx.h +++ b/lib/simd/Grid_qpx.h @@ -374,6 +374,84 @@ namespace Optimization { // Complex float FLOAT_WRAP_2(operator(), inline) }; +#define USE_FP16 + struct PrecisionChange { + static inline vech StoH (const vector4float &a, const vector4float &b) { + vech ret; + std::cout << GridLogError << "QPX single to half precision conversion not yet supported." << std::endl; + assert(0); + return ret; + } + static inline void HtoS (vech h, vector4float &sa, vector4float &sb) { + std::cout << GridLogError << "QPX half to single precision conversion not yet supported." << std::endl; + assert(0); + } + static inline vector4float DtoS (vector4double a, vector4double b) { + vector4float ret; + std::cout << GridLogError << "QPX double to single precision conversion not yet supported." << std::endl; + assert(0); + return ret; + } + static inline void StoD (vector4float s, vector4double &a, vector4double &b) { + std::cout << GridLogError << "QPX single to double precision conversion not yet supported." << std::endl; + assert(0); + } + static inline vech DtoH (vector4double a, vector4double b, + vector4double c, vector4double d) { + vech ret; + std::cout << GridLogError << "QPX double to half precision conversion not yet supported." << std::endl; + assert(0); + return ret; + } + static inline void HtoD (vech h, vector4double &a, vector4double &b, + vector4double &c, vector4double &d) { + std::cout << GridLogError << "QPX half to double precision conversion not yet supported." << std::endl; + assert(0); + } + }; + + ////////////////////////////////////////////// + // Exchange support +#define FLOAT_WRAP_EXCHANGE(fn) \ + static inline void fn(vector4float &out1, vector4float &out2, \ + vector4float in1, vector4float in2) \ + { \ + vector4double out1d, out2d, in1d, in2d; \ + in1d = Vset()(in1); \ + in2d = Vset()(in2); \ + fn(out1d, out2d, in1d, in2d); \ + Vstore()(out1d, out1); \ + Vstore()(out2d, out2); \ + } + + struct Exchange{ + + // double precision + static inline void Exchange0(vector4double &out1, vector4double &out2, + vector4double in1, vector4double in2) { + out1 = vec_perm(in1, in2, vec_gpci(0145)); + out2 = vec_perm(in1, in2, vec_gpci(02367)); + } + static inline void Exchange1(vector4double &out1, vector4double &out2, + vector4double in1, vector4double in2) { + out1 = vec_perm(in1, in2, vec_gpci(0426)); + out2 = vec_perm(in1, in2, vec_gpci(01537)); + } + static inline void Exchange2(vector4double &out1, vector4double &out2, + vector4double in1, vector4double in2) { + assert(0); + } + static inline void Exchange3(vector4double &out1, vector4double &out2, + vector4double in1, vector4double in2) { + assert(0); + } + + // single precision + FLOAT_WRAP_EXCHANGE(Exchange0); + FLOAT_WRAP_EXCHANGE(Exchange1); + FLOAT_WRAP_EXCHANGE(Exchange2); + FLOAT_WRAP_EXCHANGE(Exchange3); + }; struct Permute{ //Complex double @@ -497,15 +575,19 @@ namespace Optimization { //Integer Reduce template<> - inline Integer Reduce::operator()(int in){ - // FIXME unimplemented - printf("Reduce : Missing integer implementation -> FIX\n"); - assert(0); + inline Integer Reduce::operator()(veci in){ + Integer a = 0; + for (unsigned int i = 0; i < W::r; ++i) + { + a += in.v[i]; + } + return a; } } //////////////////////////////////////////////////////////////////////////////// // Here assign types +typedef Optimization::vech SIMD_Htype; // Half precision type typedef Optimization::vector4float SIMD_Ftype; // Single precision type typedef vector4double SIMD_Dtype; // Double precision type typedef Optimization::veci SIMD_Itype; // Integer type diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index 2fb2df76..0b1f9ffb 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -570,9 +570,9 @@ namespace Optimization { //Integer Reduce template<> inline Integer Reduce::operator()(__m128i in){ - // FIXME unimplemented - printf("Reduce : Missing integer implementation -> FIX\n"); - assert(0); + __m128i v1 = _mm_hadd_epi32(in, in); + __m128i v2 = _mm_hadd_epi32(v1, v1); + return _mm_cvtsi128_si32(v2); } } diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc index c0bbef1d..b2e8d68e 100644 --- a/tests/Test_simd.cc +++ b/tests/Test_simd.cc @@ -183,8 +183,6 @@ void IntTester(const functor &func) { typedef Integer scal; typedef vInteger vec; - GridSerialRNG sRNG; - sRNG.SeedFixedIntegers(std::vector({45,12,81,9})); int Nsimd = vec::Nsimd(); @@ -287,6 +285,50 @@ void ReductionTester(const functor &func) } +template +void IntReductionTester(const functor &func) +{ + int Nsimd = vec::Nsimd(); + + std::vector input1(Nsimd); + std::vector input2(Nsimd); + reduced result(0); + reduced reference(0); + reduced tmp; + + std::vector > buf(3); + vec & v_input1 = buf[0]; + vec & v_input2 = buf[1]; + + for(int i=0;i(v_input1,input1); + merge(v_input2,input2); + + func.template vfunc(result,v_input1,v_input2); + + for(int i=0;i(tmp,input1[i],input2[i]); + reference+=tmp; + } + + std::cout<(funcReduce()); std::cout<