mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-02 21:14:32 +00:00 
			
		
		
		
	Added missing SIMD integer reduction implementation for AVX, AVX-512, SSE4, IMCI
This commit is contained in:
		@@ -701,9 +701,28 @@ namespace Optimization {
 | 
			
		||||
  //Integer Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
  inline Integer Reduce<Integer, __m256i>::operator()(__m256i in){
 | 
			
		||||
    // FIXME unimplemented
 | 
			
		||||
    printf("Reduce : Missing integer implementation -> FIX\n");
 | 
			
		||||
    assert(0);
 | 
			
		||||
    __m128i ret;
 | 
			
		||||
#if defined (AVX2)
 | 
			
		||||
    // AVX2 horizontal adds within upper and lower halves of register; use
 | 
			
		||||
    // SSE to add upper and lower halves for result.
 | 
			
		||||
    __m256i v1, v2;
 | 
			
		||||
    __m128i u1, u2;
 | 
			
		||||
    v1  = _mm256_hadd_epi32(in, in);
 | 
			
		||||
    v2  = _mm256_hadd_epi32(v1, v1);
 | 
			
		||||
    u1  = _mm256_castsi256_si128(v2);      // upper half
 | 
			
		||||
    u2  = _mm256_extracti128_si256(v2, 1); // lower half
 | 
			
		||||
    ret = _mm256_add_epi32(u1, u2);
 | 
			
		||||
#else
 | 
			
		||||
    // No AVX horizontal add; extract upper and lower halves of register & use
 | 
			
		||||
    // SSE intrinsics.
 | 
			
		||||
    __m128i u1, u2, u3;
 | 
			
		||||
    u1  = _mm256_extractf128_si256(in, 0); // upper half
 | 
			
		||||
    u2  = _mm256_extractf128_si256(in, 1); // lower half
 | 
			
		||||
    u3  = _mm_add_epi32(u1, u2);
 | 
			
		||||
    u1  = _mm_hadd_epi32(u3, u3);
 | 
			
		||||
    ret = _mm_hadd_epi32(u1, u1);
 | 
			
		||||
#endif
 | 
			
		||||
    return _mm_cvtsi128_si32(ret);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -543,6 +543,24 @@ namespace Optimization {
 | 
			
		||||
     u512d conv; conv.v = v1;
 | 
			
		||||
     return conv.f[0];
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  //Integer Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
  inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
 | 
			
		||||
    // No full vector reduce, use AVX to add upper and lower halves of register
 | 
			
		||||
    // and perform AVX reduction.
 | 
			
		||||
    __m256i v1, v2, v3;
 | 
			
		||||
    __m128i u1, u2, ret;
 | 
			
		||||
    v1  = _mm512_castsi512_si256(in);       // upper half
 | 
			
		||||
    v2  = _mm512_extracti32x8_epi32(in, 1); // lower half
 | 
			
		||||
    v3  = _mm256_add_epi32(v1, v2);
 | 
			
		||||
    v1  = _mm256_hadd_epi32(v3, v3);
 | 
			
		||||
    v2  = _mm256_hadd_epi32(v1, v1);
 | 
			
		||||
    u1  = _mm256_castsi256_si128(v2)        // upper half
 | 
			
		||||
    u2  = _mm256_extracti128_si256(v2, 1);  // lower half
 | 
			
		||||
    ret = _mm256_add_epi32(u1, u2);
 | 
			
		||||
    return _mm_cvtsi128_si32(ret);
 | 
			
		||||
  }
 | 
			
		||||
#else
 | 
			
		||||
  //Complex float Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
@@ -570,9 +588,7 @@ namespace Optimization {
 | 
			
		||||
  //Integer Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
  inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
 | 
			
		||||
    // FIXME unimplemented
 | 
			
		||||
    printf("Reduce : Missing integer implementation -> FIX\n");
 | 
			
		||||
    assert(0);
 | 
			
		||||
    return _mm512_reduce_add_epi32(in);
 | 
			
		||||
  }
 | 
			
		||||
#endif
 | 
			
		||||
  
 | 
			
		||||
 
 | 
			
		||||
@@ -401,9 +401,7 @@ namespace Optimization {
 | 
			
		||||
  //Integer Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
  inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
 | 
			
		||||
    // FIXME unimplemented
 | 
			
		||||
    printf("Reduce : Missing integer implementation -> FIX\n");
 | 
			
		||||
    assert(0);
 | 
			
		||||
    return _mm512_reduce_add_epi32(in);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  
 | 
			
		||||
 
 | 
			
		||||
@@ -570,9 +570,9 @@ namespace Optimization {
 | 
			
		||||
  //Integer Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
  inline Integer Reduce<Integer, __m128i>::operator()(__m128i in){
 | 
			
		||||
    // FIXME unimplemented
 | 
			
		||||
   printf("Reduce : Missing integer implementation -> FIX\n");
 | 
			
		||||
    assert(0);
 | 
			
		||||
    __m128i v1 = _mm_hadd_epi32(in, in);
 | 
			
		||||
    __m128i v2 = _mm_hadd_epi32(v1, v1);
 | 
			
		||||
    return _mm_cvtsi128_si32(v2);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user