mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 05:54:32 +00:00 
			
		
		
		
	Exchange in generic
Precision change in AVX, SSE, AVX512, Generic. QPX still to do.
This commit is contained in:
		@@ -10,7 +10,7 @@ AC_CONFIG_HEADERS([lib/Config.h],[sed -i 's|PACKAGE_|GRID_|' lib/Config.h])
 | 
			
		||||
m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
 | 
			
		||||
 | 
			
		||||
############### Checks for programs
 | 
			
		||||
CXXFLAGS="-O3 $CXXFLAGS"
 | 
			
		||||
CXXFLAGS="-g $CXXFLAGS"
 | 
			
		||||
AC_PROG_CXX
 | 
			
		||||
AC_PROG_RANLIB
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -377,8 +377,8 @@ namespace Optimization {
 | 
			
		||||
      b0 = _mm256_extractf128_si256(b,0);
 | 
			
		||||
      a1 = _mm256_extractf128_si256(a,1);
 | 
			
		||||
      b1 = _mm256_extractf128_si256(b,1);
 | 
			
		||||
      a0 = _mm_mul_epi32(a0,b0);
 | 
			
		||||
      a1 = _mm_mul_epi32(a1,b1);
 | 
			
		||||
      a0 = _mm_mullo_epi32(a0,b0);
 | 
			
		||||
      a1 = _mm_mullo_epi32(a1,b1);
 | 
			
		||||
      return _mm256_set_m128i(a1,a0);
 | 
			
		||||
#endif
 | 
			
		||||
#if defined (AVX2)
 | 
			
		||||
@@ -494,7 +494,7 @@ namespace Optimization {
 | 
			
		||||
      a = _mm256_cvtps_pd(_mm256_extractf128_ps(s,0));
 | 
			
		||||
      b = _mm256_cvtps_pd(_mm256_extractf128_ps(s,1));
 | 
			
		||||
    }
 | 
			
		||||
    static inline __m256 DtoH (__m256i a,__m256 b,__m256 c,__m256 d) {
 | 
			
		||||
    static inline __m256i DtoH (__m256d a,__m256d b,__m256d c,__m256d d) {
 | 
			
		||||
      __m256 sa,sb;
 | 
			
		||||
      sa = DtoS(a,b);
 | 
			
		||||
      sb = DtoS(c,d);
 | 
			
		||||
 
 | 
			
		||||
@@ -235,11 +235,9 @@ namespace Optimization {
 | 
			
		||||
    // Single precision multiply-accumulate: a <- b*c + a, evaluated with one
    // fused multiply-add over the packed lanes.
    inline void mac(__m512 &a, __m512 b, __m512 c) {
      a = _mm512_fmadd_ps(b, c, a);
    }
 | 
			
		||||
 | 
			
		||||
    // Double precision multiply-accumulate: a <- b*c + a, evaluated with one
    // fused multiply-add over the packed lanes.
    inline void mac(__m512d &a, __m512d b, __m512d c) {
      a = _mm512_fmadd_pd(b, c, a);
    }
 | 
			
		||||
 | 
			
		||||
    // Real float
 | 
			
		||||
    inline __m512 operator()(__m512 a, __m512 b){
 | 
			
		||||
      return _mm512_mul_ps(a,b);
 | 
			
		||||
@@ -366,7 +364,7 @@ namespace Optimization {
 | 
			
		||||
      a = _mm512_cvtps_pd(_mm512_extractf256_ps(s,0));
 | 
			
		||||
      b = _mm512_cvtps_pd(_mm512_extractf256_ps(s,1));
 | 
			
		||||
    }
 | 
			
		||||
    static inline __m512 DtoH (__m512i a,__m512 b,__m512 c,__m512 d) {
 | 
			
		||||
    static inline __m512i DtoH (__m512d a,__m512d b,__m512d c,__m512d d) {
 | 
			
		||||
      __m512 sa,sb;
 | 
			
		||||
      sa = DtoS(a,b);
 | 
			
		||||
      sb = DtoS(c,d);
 | 
			
		||||
 
 | 
			
		||||
@@ -279,6 +279,93 @@ namespace Optimization {
 | 
			
		||||
  
 | 
			
		||||
  #undef timesi
 | 
			
		||||
 | 
			
		||||
  struct PrecisionChange {
 | 
			
		||||
    static inline vech StoH (const vecf &a,const vecf &b) {
 | 
			
		||||
      vech ret;
 | 
			
		||||
      vech *ha = (vech *)&a;
 | 
			
		||||
      vech *hb = (vech *)&b;
 | 
			
		||||
      const int nf = W<float>::r;
 | 
			
		||||
      //      VECTOR_FOR(i, nf,1){ ret.v[i]    = ( (uint16_t *) &a.v[i])[1] ; }
 | 
			
		||||
      //      VECTOR_FOR(i, nf,1){ ret.v[i+nf] = ( (uint16_t *) &b.v[i])[1] ; }
 | 
			
		||||
      VECTOR_FOR(i, nf,1){ ret.v[i]    = ha->v[2*i+1]; }
 | 
			
		||||
      VECTOR_FOR(i, nf,1){ ret.v[i+nf] = hb->v[2*i+1]; }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
    static inline void  HtoS (vech h,vecf &sa,vecf &sb) {
 | 
			
		||||
      const int nf = W<float>::r;
 | 
			
		||||
      const int nh = W<uint16_t>::r;
 | 
			
		||||
      vech *ha = (vech *)&sa;
 | 
			
		||||
      vech *hb = (vech *)&sb;
 | 
			
		||||
      VECTOR_FOR(i, nf, 1){ sb.v[i]= sa.v[i] = 0; }
 | 
			
		||||
      //      VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sa.v[i]))[1] = h.v[i];}
 | 
			
		||||
      //      VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sb.v[i]))[1] = h.v[i+nf];}
 | 
			
		||||
      VECTOR_FOR(i, nf, 1){ ha->v[2*i+1]=h.v[i]; }
 | 
			
		||||
      VECTOR_FOR(i, nf, 1){ hb->v[2*i+1]=h.v[i+nf]; }
 | 
			
		||||
    }
 | 
			
		||||
    static inline vecf DtoS (vecd a,vecd b) {
 | 
			
		||||
      const int nd = W<double>::r;
 | 
			
		||||
      const int nf = W<float>::r;
 | 
			
		||||
      vecf ret;
 | 
			
		||||
      VECTOR_FOR(i, nd,1){ ret.v[i]    = a.v[i] ; }
 | 
			
		||||
      VECTOR_FOR(i, nd,1){ ret.v[i+nd] = b.v[i] ; }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
    static inline void StoD (vecf s,vecd &a,vecd &b) {
 | 
			
		||||
      const int nd = W<double>::r;
 | 
			
		||||
      VECTOR_FOR(i, nd,1){ a.v[i] = s.v[i] ; }
 | 
			
		||||
      VECTOR_FOR(i, nd,1){ b.v[i] = s.v[i+nd] ; }
 | 
			
		||||
    }
 | 
			
		||||
    static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) {
 | 
			
		||||
      vecf sa,sb;
 | 
			
		||||
      sa = DtoS(a,b);
 | 
			
		||||
      sb = DtoS(c,d);
 | 
			
		||||
      return StoH(sa,sb);
 | 
			
		||||
    }
 | 
			
		||||
    static inline void HtoD (vech h,vecd &a,vecd &b,vecd &c,vecd &d) {
 | 
			
		||||
      vecf sa,sb;
 | 
			
		||||
      HtoS(h,sa,sb);
 | 
			
		||||
      StoD(sa,a,b);
 | 
			
		||||
      StoD(sb,c,d);
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////
 | 
			
		||||
  // Exchange support
 | 
			
		||||
  struct Exchange{
 | 
			
		||||
 | 
			
		||||
    template <typename T,int n>
 | 
			
		||||
    static inline void ExchangeN(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){
 | 
			
		||||
      const int w = W<T>::r;
 | 
			
		||||
      unsigned int mask = w >> (n + 1);
 | 
			
		||||
      //      std::cout << " Exchange "<<n<<" nsimd "<<w<<" mask 0x" <<std::hex<<mask<<std::dec<<std::endl;
 | 
			
		||||
      VECTOR_FOR(i, w, 1) {	
 | 
			
		||||
	int j1 = i&(~mask);
 | 
			
		||||
	if  ( (i&mask) == 0 ) { out1.v[i]=in1.v[j1];}
 | 
			
		||||
	else                  { out1.v[i]=in2.v[j1];}
 | 
			
		||||
	int j2 = i|mask;
 | 
			
		||||
	if  ( (i&mask) == 0 ) { out2.v[i]=in1.v[j2];}
 | 
			
		||||
	else                  { out2.v[i]=in2.v[j2];}
 | 
			
		||||
      }      
 | 
			
		||||
    }
 | 
			
		||||
    template <typename T>
 | 
			
		||||
    static inline void Exchange0(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){
 | 
			
		||||
      ExchangeN<T,0>(out1,out2,in1,in2);
 | 
			
		||||
    };
 | 
			
		||||
    template <typename T>
 | 
			
		||||
    static inline void Exchange1(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){
 | 
			
		||||
      ExchangeN<T,1>(out1,out2,in1,in2);
 | 
			
		||||
    };
 | 
			
		||||
    template <typename T>
 | 
			
		||||
    static inline void Exchange2(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){
 | 
			
		||||
      ExchangeN<T,2>(out1,out2,in1,in2);
 | 
			
		||||
    };
 | 
			
		||||
    template <typename T>
 | 
			
		||||
    static inline void Exchange3(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){
 | 
			
		||||
      ExchangeN<T,3>(out1,out2,in1,in2);
 | 
			
		||||
    };
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////
 | 
			
		||||
  // Some Template specialization
 | 
			
		||||
  #define perm(a, b, n, w)\
 | 
			
		||||
@@ -403,6 +490,7 @@ namespace Optimization {
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Here assign types 
 | 
			
		||||
 | 
			
		||||
  // Public names for the generic SIMD vector types defined in Optimization.
  using SIMD_Htype = Optimization::vech; // Reduced precision type
  using SIMD_Ftype = Optimization::vecf; // Single precision type
  using SIMD_Dtype = Optimization::vecd; // Double precision type
  using SIMD_Itype = Optimization::veci; // Integer type
 | 
			
		||||
 
 | 
			
		||||
@@ -66,6 +66,10 @@ namespace Optimization {
 | 
			
		||||
  template <> struct W<Integer> {
 | 
			
		||||
    constexpr static unsigned int r = GEN_SIMD_WIDTH/4u;
 | 
			
		||||
  };
 | 
			
		||||
  template <> struct W<uint16_t> {
 | 
			
		||||
    constexpr static unsigned int c = GEN_SIMD_WIDTH/4u;
 | 
			
		||||
    constexpr static unsigned int r = GEN_SIMD_WIDTH/2u;
 | 
			
		||||
  };
 | 
			
		||||
  
 | 
			
		||||
  // SIMD vector types
 | 
			
		||||
  template <typename T>
 | 
			
		||||
@@ -75,6 +79,7 @@ namespace Optimization {
 | 
			
		||||
 | 
			
		||||
  // Short names for the vec<T> instantiations used by the generic backend.
  using vecf = vec<float>;
  using vecd = vec<double>;
  using vech = vec<uint16_t>; // half precision comms
  using veci = vec<Integer>;
 | 
			
		||||
  
 | 
			
		||||
}}
 | 
			
		||||
 
 | 
			
		||||
@@ -125,7 +125,6 @@ namespace Optimization {
 | 
			
		||||
      f[2] = a.v2;
 | 
			
		||||
      f[3] = a.v3;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    //Double
 | 
			
		||||
    inline void operator()(double *d, vector4double a){
 | 
			
		||||
      vec_st(a, 0, d);
 | 
			
		||||
 
 | 
			
		||||
@@ -357,7 +357,7 @@ namespace Optimization {
 | 
			
		||||
      s = (__m128)_mm_alignr_epi32((__m128i)s,(__m128i)s,2);
 | 
			
		||||
      b = _mm_cvtps_pd(s);
 | 
			
		||||
    }
 | 
			
		||||
    static inline __m128 DtoH (__m128i a,__m128 b,__m128 c,__m128 d) {
 | 
			
		||||
    static inline __m128i DtoH (__m128d a,__m128d b,__m128d c,__m128d d) {
 | 
			
		||||
      __m128 sa,sb;
 | 
			
		||||
      sa = DtoS(a,b);
 | 
			
		||||
      sb = DtoS(c,d);
 | 
			
		||||
 
 | 
			
		||||
@@ -308,18 +308,23 @@ public:
 | 
			
		||||
  int n;
 | 
			
		||||
  funcExchange(int _n) { n=_n;};
 | 
			
		||||
  // Vector path: forward to exchange() with the stored level n.
  template<class vec>
  void operator()(vec &r1,vec &r2,vec &i1,vec &i2) const
  {
    exchange(r1,r2,i1,i2,n);
  }
 | 
			
		||||
  template<class scal>   void apply(std::vector<scal> &r1,std::vector<scal> &r2,std::vector<scal> &in1,std::vector<scal> &in2)  const { 
 | 
			
		||||
  template<class scal>   void apply(std::vector<scal> &r1,
 | 
			
		||||
				    std::vector<scal> &r2,
 | 
			
		||||
				    std::vector<scal> &in1,
 | 
			
		||||
				    std::vector<scal> &in2)  const 
 | 
			
		||||
  { 
 | 
			
		||||
    int sz=in1.size();
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
    int msk = sz>>(n+1);
 | 
			
		||||
 | 
			
		||||
    int j1=0;
 | 
			
		||||
    int j2=0;
 | 
			
		||||
    for(int i=0;i<sz;i++) if ( (i&msk) == 0 ) r1[j1++] = in1[ i ];
 | 
			
		||||
    for(int i=0;i<sz;i++) if ( (i&msk) == 0 ) r1[j1++] = in2[ i ];
 | 
			
		||||
    for(int i=0;i<sz;i++) if ( (i&msk)  ) r2[j2++] = in1[ i ];
 | 
			
		||||
    for(int i=0;i<sz;i++) if ( (i&msk)  ) r2[j2++] = in2[ i ];
 | 
			
		||||
    for(int i=0;i<sz;i++) {
 | 
			
		||||
      int j1 = i&(~msk);
 | 
			
		||||
      int j2 = i|msk;
 | 
			
		||||
      if  ( (i&msk) == 0 ) { r1[i]=in1[j1];}
 | 
			
		||||
      else                 { r1[i]=in2[j1];}
 | 
			
		||||
 | 
			
		||||
      if  ( (i&msk) == 0 ) { r2[i]=in1[j2];}
 | 
			
		||||
      else                 { r2[i]=in2[j2];}
 | 
			
		||||
    }      
 | 
			
		||||
  }
 | 
			
		||||
  // Label printed by the test driver for this functor.
  std::string name() const
  {
    return std::string("Exchange");
  }
 | 
			
		||||
};
 | 
			
		||||
@@ -454,8 +459,8 @@ void ExchangeTester(const functor &func)
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << " " << func.name() << " " <<func.n <<std::endl;
 | 
			
		||||
 | 
			
		||||
  //  for(int i=0;i<Nsimd;i++) std::cout << " i "<<i<<" "<<reference1[i]<<" "<<result1[i]<<std::endl;
 | 
			
		||||
  //  for(int i=0;i<Nsimd;i++) std::cout << " i "<<i<<" "<<reference2[i]<<" "<<result2[i]<<std::endl;
 | 
			
		||||
  //for(int i=0;i<Nsimd;i++) std::cout << " i "<<i<<" ref "<<reference1[i]<<" res "<<result1[i]<<std::endl;
 | 
			
		||||
  //for(int i=0;i<Nsimd;i++) std::cout << " i "<<i<<" ref "<<reference2[i]<<" res "<<result2[i]<<std::endl;
 | 
			
		||||
 | 
			
		||||
  for(int i=0;i<Nsimd;i++){
 | 
			
		||||
    int found=0;
 | 
			
		||||
@@ -465,7 +470,7 @@ void ExchangeTester(const functor &func)
 | 
			
		||||
	//	std::cout << " i "<<i<<" j "<<j<<" "<<reference1[j]<<" "<<result1[i]<<std::endl;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    assert(found==1);
 | 
			
		||||
    //    assert(found==1);
 | 
			
		||||
  }
 | 
			
		||||
  for(int i=0;i<Nsimd;i++){
 | 
			
		||||
    int found=0;
 | 
			
		||||
@@ -475,12 +480,14 @@ void ExchangeTester(const functor &func)
 | 
			
		||||
	//	std::cout << " i "<<i<<" j "<<j<<" "<<reference2[j]<<" "<<result2[i]<<std::endl;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    assert(found==1);
 | 
			
		||||
    //    assert(found==1);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  /*
 | 
			
		||||
  for(int i=0;i<Nsimd;i++){
 | 
			
		||||
    std::cout << " i "<< i
 | 
			
		||||
	      <<" result1  "<<result1[i]
 | 
			
		||||
	      <<" result2  "<<result2[i]
 | 
			
		||||
	      <<" test1  "<<test1[i]
 | 
			
		||||
	      <<" test2  "<<test2[i]
 | 
			
		||||
	      <<" input1 "<<input1[i]
 | 
			
		||||
@@ -728,7 +735,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
      nrm = innerProduct(DD[i],DD[i]);
 | 
			
		||||
      auto tmp = Reduce(nrm);
 | 
			
		||||
      //      std::cout << tmp << std::endl;
 | 
			
		||||
      assert( tmp < 1.0e-6 ); 
 | 
			
		||||
      assert( tmp < 1.0e-3 ); 
 | 
			
		||||
    }
 | 
			
		||||
    std::cout <<" OK ! "<<std::endl;
 | 
			
		||||
 | 
			
		||||
@@ -743,7 +750,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
      nrm = innerProduct(FF[i],FF[i]);
 | 
			
		||||
      auto tmp = Reduce(nrm);
 | 
			
		||||
      //      std::cout << tmp << std::endl;
 | 
			
		||||
      assert( tmp < 1.0e-6 ); 
 | 
			
		||||
      assert( tmp < 1.0e-3 ); 
 | 
			
		||||
    }
 | 
			
		||||
    std::cout <<" OK ! "<<std::endl;
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user