mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 05:54:32 +00:00 
			
		
		
		
	Merge branch 'develop' of https://github.com/paboyle/Grid into feature/hadrons
This commit is contained in:
		@@ -701,9 +701,28 @@ namespace Optimization {
 | 
			
		||||
  //Integer Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
  inline Integer Reduce<Integer, __m256i>::operator()(__m256i in){
 | 
			
		||||
    // FIXME unimplemented
 | 
			
		||||
    printf("Reduce : Missing integer implementation -> FIX\n");
 | 
			
		||||
    assert(0);
 | 
			
		||||
    __m128i ret;
 | 
			
		||||
#if defined (AVX2)
 | 
			
		||||
    // AVX2 horizontal adds within upper and lower halves of register; use
 | 
			
		||||
    // SSE to add upper and lower halves for result.
 | 
			
		||||
    __m256i v1, v2;
 | 
			
		||||
    __m128i u1, u2;
 | 
			
		||||
    v1  = _mm256_hadd_epi32(in, in);
 | 
			
		||||
    v2  = _mm256_hadd_epi32(v1, v1);
 | 
			
		||||
    u1  = _mm256_castsi256_si128(v2);      // upper half
 | 
			
		||||
    u2  = _mm256_extracti128_si256(v2, 1); // lower half
 | 
			
		||||
    ret = _mm_add_epi32(u1, u2);
 | 
			
		||||
#else
 | 
			
		||||
    // No AVX horizontal add; extract upper and lower halves of register & use
 | 
			
		||||
    // SSE intrinsics.
 | 
			
		||||
    __m128i u1, u2, u3;
 | 
			
		||||
    u1  = _mm256_extractf128_si256(in, 0); // upper half
 | 
			
		||||
    u2  = _mm256_extractf128_si256(in, 1); // lower half
 | 
			
		||||
    u3  = _mm_add_epi32(u1, u2);
 | 
			
		||||
    u1  = _mm_hadd_epi32(u3, u3);
 | 
			
		||||
    ret = _mm_hadd_epi32(u1, u1);
 | 
			
		||||
#endif
 | 
			
		||||
    return _mm_cvtsi128_si32(ret);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -543,6 +543,24 @@ namespace Optimization {
 | 
			
		||||
     u512d conv; conv.v = v1;
 | 
			
		||||
     return conv.f[0];
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  //Integer Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
  inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
 | 
			
		||||
    // No full vector reduce, use AVX to add upper and lower halves of register
 | 
			
		||||
    // and perform AVX reduction.
 | 
			
		||||
    __m256i v1, v2, v3;
 | 
			
		||||
    __m128i u1, u2, ret;
 | 
			
		||||
    v1  = _mm512_castsi512_si256(in);       // upper half
 | 
			
		||||
    v2  = _mm512_extracti32x8_epi32(in, 1); // lower half
 | 
			
		||||
    v3  = _mm256_add_epi32(v1, v2);
 | 
			
		||||
    v1  = _mm256_hadd_epi32(v3, v3);
 | 
			
		||||
    v2  = _mm256_hadd_epi32(v1, v1);
 | 
			
		||||
    u1  = _mm256_castsi256_si128(v2)        // upper half
 | 
			
		||||
    u2  = _mm256_extracti128_si256(v2, 1);  // lower half
 | 
			
		||||
    ret = _mm_add_epi32(u1, u2);
 | 
			
		||||
    return _mm_cvtsi128_si32(ret);
 | 
			
		||||
  }
 | 
			
		||||
#else
 | 
			
		||||
  //Complex float Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
@@ -570,9 +588,7 @@ namespace Optimization {
 | 
			
		||||
  //Integer Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
  inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
 | 
			
		||||
    // FIXME unimplemented
 | 
			
		||||
    printf("Reduce : Missing integer implementation -> FIX\n");
 | 
			
		||||
    assert(0);
 | 
			
		||||
    return _mm512_reduce_add_epi32(in);
 | 
			
		||||
  }
 | 
			
		||||
#endif
 | 
			
		||||
  
 | 
			
		||||
 
 | 
			
		||||
@@ -401,9 +401,7 @@ namespace Optimization {
 | 
			
		||||
  //Integer Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
  inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
 | 
			
		||||
    // FIXME unimplemented
 | 
			
		||||
    printf("Reduce : Missing integer implementation -> FIX\n");
 | 
			
		||||
    assert(0);
 | 
			
		||||
    return _mm512_reduce_add_epi32(in);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  
 | 
			
		||||
 
 | 
			
		||||
@@ -374,6 +374,84 @@ namespace Optimization {
 | 
			
		||||
    // Complex float
 | 
			
		||||
    FLOAT_WRAP_2(operator(), inline)
 | 
			
		||||
  };
 | 
			
		||||
#define USE_FP16
 | 
			
		||||
  struct PrecisionChange {
 | 
			
		||||
    static inline vech StoH (const vector4float &a, const vector4float &b) {
 | 
			
		||||
      vech ret;
 | 
			
		||||
      std::cout << GridLogError << "QPX single to half precision conversion not yet supported." << std::endl;
 | 
			
		||||
      assert(0);
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
    static inline void  HtoS (vech h, vector4float &sa, vector4float &sb) {
 | 
			
		||||
      std::cout << GridLogError << "QPX half to single precision conversion not yet supported." << std::endl;
 | 
			
		||||
      assert(0);
 | 
			
		||||
    }
 | 
			
		||||
    static inline vector4float DtoS (vector4double a, vector4double b) {
 | 
			
		||||
      vector4float ret;
 | 
			
		||||
      std::cout << GridLogError << "QPX double to single precision conversion not yet supported." << std::endl;
 | 
			
		||||
      assert(0);
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
    static inline void StoD (vector4float s, vector4double &a, vector4double &b) {
 | 
			
		||||
      std::cout << GridLogError << "QPX single to double precision conversion not yet supported." << std::endl;
 | 
			
		||||
      assert(0);
 | 
			
		||||
    }
 | 
			
		||||
    static inline vech DtoH (vector4double a, vector4double b, 
 | 
			
		||||
                             vector4double c, vector4double d) {
 | 
			
		||||
      vech ret;
 | 
			
		||||
      std::cout << GridLogError << "QPX double to half precision conversion not yet supported." << std::endl;
 | 
			
		||||
      assert(0);
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
    static inline void HtoD (vech h, vector4double &a, vector4double &b, 
 | 
			
		||||
                                     vector4double &c, vector4double &d) {
 | 
			
		||||
      std::cout << GridLogError << "QPX half to double precision conversion not yet supported." << std::endl;
 | 
			
		||||
      assert(0);
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////
 | 
			
		||||
  // Exchange support
 | 
			
		||||
#define FLOAT_WRAP_EXCHANGE(fn) \
 | 
			
		||||
  static inline void fn(vector4float &out1, vector4float &out2, \
 | 
			
		||||
                        vector4float in1,  vector4float in2) \
 | 
			
		||||
  { \
 | 
			
		||||
    vector4double out1d, out2d, in1d, in2d; \
 | 
			
		||||
    in1d  = Vset()(in1);   \
 | 
			
		||||
    in2d  = Vset()(in2);   \
 | 
			
		||||
    fn(out1d, out2d, in1d, in2d); \
 | 
			
		||||
    Vstore()(out1d, out1); \
 | 
			
		||||
    Vstore()(out2d, out2); \
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  struct Exchange{
 | 
			
		||||
 | 
			
		||||
    // double precision
 | 
			
		||||
    static inline void Exchange0(vector4double &out1, vector4double &out2,
 | 
			
		||||
                                 vector4double in1,  vector4double in2) {
 | 
			
		||||
      out1 = vec_perm(in1, in2, vec_gpci(0145));
 | 
			
		||||
      out2 = vec_perm(in1, in2, vec_gpci(02367));
 | 
			
		||||
    }
 | 
			
		||||
    static inline void Exchange1(vector4double &out1, vector4double &out2,
 | 
			
		||||
                                 vector4double in1,  vector4double in2) {
 | 
			
		||||
      out1 = vec_perm(in1, in2, vec_gpci(0426));
 | 
			
		||||
      out2 = vec_perm(in1, in2, vec_gpci(01537));
 | 
			
		||||
    }
 | 
			
		||||
    static inline void Exchange2(vector4double &out1, vector4double &out2,
 | 
			
		||||
                                 vector4double in1,  vector4double in2) {
 | 
			
		||||
      assert(0);
 | 
			
		||||
    }
 | 
			
		||||
    static inline void Exchange3(vector4double &out1, vector4double &out2,
 | 
			
		||||
                                 vector4double in1,  vector4double in2) {
 | 
			
		||||
      assert(0);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // single precision
 | 
			
		||||
    FLOAT_WRAP_EXCHANGE(Exchange0);
 | 
			
		||||
    FLOAT_WRAP_EXCHANGE(Exchange1);
 | 
			
		||||
    FLOAT_WRAP_EXCHANGE(Exchange2);
 | 
			
		||||
    FLOAT_WRAP_EXCHANGE(Exchange3);
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Permute{
 | 
			
		||||
    //Complex double
 | 
			
		||||
@@ -497,15 +575,19 @@ namespace Optimization {
 | 
			
		||||
  
 | 
			
		||||
  //Integer Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
  inline Integer Reduce<Integer, int>::operator()(int in){
 | 
			
		||||
    // FIXME unimplemented
 | 
			
		||||
    printf("Reduce : Missing integer implementation -> FIX\n");
 | 
			
		||||
    assert(0);
 | 
			
		||||
  inline Integer Reduce<Integer, veci>::operator()(veci in){
 | 
			
		||||
    Integer a = 0;
 | 
			
		||||
    for (unsigned int i = 0; i < W<Integer>::r; ++i)
 | 
			
		||||
    {
 | 
			
		||||
        a += in.v[i];
 | 
			
		||||
    }
 | 
			
		||||
    return a;
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Here assign types
 | 
			
		||||
typedef Optimization::vech         SIMD_Htype;  // Half precision type
 | 
			
		||||
typedef Optimization::vector4float SIMD_Ftype;  // Single precision type
 | 
			
		||||
typedef vector4double              SIMD_Dtype; // Double precision type
 | 
			
		||||
typedef Optimization::veci         SIMD_Itype; // Integer type
 | 
			
		||||
 
 | 
			
		||||
@@ -570,9 +570,9 @@ namespace Optimization {
 | 
			
		||||
  //Integer Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
  inline Integer Reduce<Integer, __m128i>::operator()(__m128i in){
 | 
			
		||||
    // FIXME unimplemented
 | 
			
		||||
   printf("Reduce : Missing integer implementation -> FIX\n");
 | 
			
		||||
    assert(0);
 | 
			
		||||
    __m128i v1 = _mm_hadd_epi32(in, in);
 | 
			
		||||
    __m128i v2 = _mm_hadd_epi32(v1, v1);
 | 
			
		||||
    return _mm_cvtsi128_si32(v2);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user