
FP16 optional compile time

paboyle 2017-04-13 11:55:24 +01:00
parent 73cdf0fffe
commit 1d502e4ed6
5 changed files with 50 additions and 6 deletions

configure.ac

@@ -83,6 +83,19 @@ case ${ac_LAPACK} in
    AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
esac
############### FP16 conversions
AC_ARG_ENABLE([fp16],
    [AC_HELP_STRING([--enable-fp16=yes|no], [enable fp16 comms])],
    [ac_FP16=${enable_fp16}], [ac_FP16=no])
case ${ac_FP16} in
    no)
      ;;
    yes)
      AC_DEFINE([USE_FP16],[1],[conversion to fp16]);;
    *)
      ;;
esac
############### MKL
AC_ARG_ENABLE([mkl],
[AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])],

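With this change, ./configure --enable-fp16 defines USE_FP16 in the generated config header; the default is no. When the flag is off, the new StoH/HtoS half-precision conversions in the SIMD files below compile to assert(0), so fp16 comms must be selected at compile time.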
lib/simd/Grid_avx.h

@@ -473,15 +473,23 @@ namespace Optimization {
  struct PrecisionChange {
    static inline __m256i StoH (__m256 a,__m256 b) {
      __m256i h;
#ifdef USE_FP16
      // pack eight + eight floats into sixteen fp16 values in one __m256i
      __m128i ha = _mm256_cvtps_ph(a,0);
      __m128i hb = _mm256_cvtps_ph(b,0);
      h = _mm256_castsi128_si256(ha);
      h = _mm256_insertf128_si256(h,hb,1);
#else
      assert(0);
#endif
      return h;
    }
    static inline void HtoS (__m256i h,__m256 &sa,__m256 &sb) {
#ifdef USE_FP16
      // unpack the two fp16 half-vectors back to single precision
      sa = _mm256_cvtph_ps(_mm256_extractf128_si256(h,0));
      sb = _mm256_cvtph_ps(_mm256_extractf128_si256(h,1));
#else
      assert(0);
#endif
    }
static inline __m256 DtoS (__m256d a,__m256d b) {
__m128 sa = _mm256_cvtpd_ps(a);

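For reference, a minimal standalone sketch of the AVX round trip above (not part of the commit; assumes an F16C-capable CPU and compilation with -mavx -mf16c; the immediate 0 passed to the conversions selects round-to-nearest-even):

#include <immintrin.h>
#include <cassert>
#include <cstdio>

int main() {
  // two vectors of eight floats, both exactly representable in fp16
  __m256 a = _mm256_set1_ps(1.5f);
  __m256 b = _mm256_set1_ps(-2.25f);

  // StoH: pack into one __m256i holding sixteen fp16 values
  __m128i ha = _mm256_cvtps_ph(a, 0);
  __m128i hb = _mm256_cvtps_ph(b, 0);
  __m256i h  = _mm256_castsi128_si256(ha);
  h = _mm256_insertf128_si256(h, hb, 1);

  // HtoS: unpack back to two __m256 vectors of floats
  __m256 sa = _mm256_cvtph_ps(_mm256_extractf128_si256(h, 0));
  __m256 sb = _mm256_cvtph_ps(_mm256_extractf128_si256(h, 1));

  float out[8];
  _mm256_storeu_ps(out, sa);
  for (int i = 0; i < 8; i++) assert(out[i] == 1.5f);
  _mm256_storeu_ps(out, sb);
  for (int i = 0; i < 8; i++) assert(out[i] == -2.25f);
  printf("fp16 round trip ok\n");
  return 0;
}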
lib/simd/Grid_avx512.h

@@ -343,15 +343,23 @@ namespace Optimization {
  struct PrecisionChange {
    static inline __m512i StoH (__m512 a,__m512 b) {
      __m512i h;
#ifdef USE_FP16
      // pack sixteen + sixteen floats into thirty-two fp16 values in one __m512i
      __m256i ha = _mm512_cvtps_ph(a,0);
      __m256i hb = _mm512_cvtps_ph(b,0);
      h = _mm512_castsi256_si512(ha);
      h = _mm512_inserti64x4(h,hb,1);
#else
      assert(0);
#endif
      return h;
    }
    static inline void HtoS (__m512i h,__m512 &sa,__m512 &sb) {
#ifdef USE_FP16
      // unpack the two fp16 half-vectors back to single precision
      sa = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(h,0));
      sb = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(h,1));
#else
      assert(0);
#endif
    }
static inline __m512 DtoS (__m512d a,__m512d b) {
__m256 sa = _mm512_cvtpd_ps(a);

lib/simd/Grid_generic.h

@@ -281,6 +281,7 @@ namespace Optimization {
  struct PrecisionChange {
    static inline vech StoH (const vecf &a,const vecf &b) {
      vech ret;
#ifdef USE_FP16
      vech *ha = (vech *)&a;
      vech *hb = (vech *)&b;
@@ -289,9 +290,13 @@ namespace Optimization {
      //      VECTOR_FOR(i, nf,1){ ret.v[i+nf] = ( (uint16_t *) &b.v[i])[1] ; }
      VECTOR_FOR(i, nf,1){ ret.v[i]    = ha->v[2*i+1]; }   // upper 16 bits of each float (little-endian)
      VECTOR_FOR(i, nf,1){ ret.v[i+nf] = hb->v[2*i+1]; }
#else
      assert(0);
#endif
      return ret;
    }
    static inline void HtoS (vech h,vecf &sa,vecf &sb) {
#ifdef USE_FP16
      const int nf = W<float>::r;
      const int nh = W<uint16_t>::r;
      vech *ha = (vech *)&sa;
@@ -301,6 +306,9 @@ namespace Optimization {
      //      VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sb.v[i]))[1] = h.v[i+nf];}
      VECTOR_FOR(i, nf, 1){ ha->v[2*i+1] = h.v[i];    }    // restore the upper 16 bits
      VECTOR_FOR(i, nf, 1){ hb->v[2*i+1] = h.v[i+nf]; }
#else
      assert(0);
#endif
    }
static inline vecf DtoS (vecd a,vecd b) {
const int nd = W<double>::r;

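Note that the generic fallback does not produce IEEE fp16 at all: on a little-endian machine v[2*i+1] is the upper 16 bits of each float, so StoH keeps the sign bit, the full 8-bit exponent, and the top 7 mantissa bits, which is the bfloat16 layout. A standalone scalar sketch of the same bit trick (hypothetical helper names, little-endian assumed; unlike the vector code above, the low mantissa bits are zeroed explicitly here):

#include <cstdint>
#include <cstring>
#include <cassert>

// keep only the upper 16 bits of an IEEE-754 float (bfloat16-style truncation)
static uint16_t float_to_h(float f) {
  uint32_t u; std::memcpy(&u, &f, sizeof u);
  return (uint16_t)(u >> 16);          // sign, exponent, top 7 mantissa bits
}
static float h_to_float(uint16_t h) {
  uint32_t u = (uint32_t)h << 16;      // low mantissa bits are zeroed
  float f; std::memcpy(&f, &u, sizeof f);
  return f;
}

int main() {
  assert(h_to_float(float_to_h(1.5f)) == 1.5f);   // exactly representable values survive
  assert(h_to_float(float_to_h(-2.0f)) == -2.0f);
  return 0;
}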
lib/simd/Grid_sse4.h

@@ -334,20 +334,27 @@ namespace Optimization {
#define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16)
#define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16)
#endif
  struct PrecisionChange {
    static inline __m128i StoH (__m128 a,__m128 b) {
#ifdef USE_FP16
      // pack four + four floats into eight fp16 values in one __m128i
      __m128i ha = _mm_cvtps_ph(a,0);
      __m128i hb = _mm_cvtps_ph(b,0);
      __m128i h  = (__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0));
#else
      __m128i h = (__m128i)a;  // unused; keeps the compiler quiet before the assert
      assert(0);
#endif
      return h;
    }
    static inline void HtoS (__m128i h,__m128 &sa,__m128 &sb) {
#ifdef USE_FP16
      // low four halves -> sa, then rotate the high four down and convert -> sb
      sa = _mm_cvtph_ps(h);
      h  = _mm_alignr_epi32(h,h,2);
      sb = _mm_cvtph_ps(h);
#else
      assert(0);
#endif
    }
static inline __m128 DtoS (__m128d a,__m128d b) {
__m128 sa = _mm_cvtpd_ps(a);
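All three x86 variants lean on the F16C conversion instructions (vcvtps2ph / vcvtph2ps, and their 512-bit forms), so --enable-fp16 builds need hardware and compiler support for them; only the generic implementation works everywhere.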