From 3844bcf80065f878c35c900acea2d78bec3bc511 Mon Sep 17 00:00:00 2001
From: paboyle
Date: Thu, 20 Apr 2017 15:30:52 +0100
Subject: [PATCH] If no f16c instructions are supported, we must use software
 half precision conversion.

This will also become useful on BG/Q, so it will move out of SSE4 into a
general area. Lifted the Eigen half precision conversion from the web. It
looks sensible, but has not yet been extensively regression tested against
the intrinsics implementation.
---
 configure.ac         |  26 ++++++----
 lib/simd/Grid_sse4.h | 111 +++++++++++++++++++++++++++++++++++++------
 2 files changed, 113 insertions(+), 24 deletions(-)

diff --git a/configure.ac b/configure.ac
index f18beb32..81d38ae9 100644
--- a/configure.ac
+++ b/configure.ac
@@ -84,16 +84,15 @@ case ${ac_LAPACK} in
 esac
 
 ############### FP16 conversions
-AC_ARG_ENABLE([fp16],
-              [AC_HELP_STRING([--enable-fp16=yes|no], [enable fp16 comms])],
-              [ac_FP16=${enable_fp16}], [ac_FP16=no])
-case ${ac_FP16} in
-  no)
-    ;;
+AC_ARG_ENABLE([sfw-fp16],
+              [AC_HELP_STRING([--enable-sfw-fp16=yes|no], [enable software fp16 comms])],
+              [ac_SFW_FP16=${enable_sfw_fp16}], [ac_SFW_FP16=yes])
+case ${ac_SFW_FP16} in
   yes)
-    AC_DEFINE([USE_FP16],[1],[conversion to fp16]);;
+    AC_DEFINE([SFW_FP16],[1],[software conversion to fp16]);;
+  no);;
   *)
-    ;;
+    AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);;
 esac
 
 ############### MKL
@@ -189,7 +188,14 @@ case ${ax_cv_cxx_compiler_vendor} in
      case ${ac_SIMD} in
        SSE4)
          AC_DEFINE([SSE4],[1],[SSE4 intrinsics])
-         SIMD_FLAGS='-msse4.2';;
+         case ${ac_SFW_FP16} in
+           yes)
+             SIMD_FLAGS='-msse4.2';;
+           no)
+             SIMD_FLAGS='-msse4.2 -mf16c';;
+           *)
+             AC_MSG_ERROR(["SFW_FP16 must be either yes or no value ${ac_SFW_FP16} "]);;
+         esac;;
        AVX)
          AC_DEFINE([AVX1],[1],[AVX intrinsics])
          SIMD_FLAGS='-mavx -mf16c';;
@@ -423,7 +429,6 @@ AC_OUTPUT
 echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Summary of configuration for $PACKAGE v$VERSION
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
 ----- PLATFORM ----------------------------------------
 architecture (build) : $build_cpu
 os (build)           : $build_os
@@ -436,6 +441,7 @@ SIMD                        : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
 Threading                   : ${ac_openmp}
 Communications type         : ${comms_type}
 Default precision           : ${ac_PRECISION}
+Software FP16 conversion    : ${ac_SFW_FP16}
 RNG choice                  : ${ac_RNG}
 GMP                         : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi`
 LAPACK                      : ${ac_LAPACK}
diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h
index 8a4537c2..59ad996a 100644
--- a/lib/simd/Grid_sse4.h
+++ b/lib/simd/Grid_sse4.h
@@ -38,7 +38,6 @@ Author: neo
 
 #include <pmmintrin.h>
 
-
 namespace Grid {
 namespace Optimization {
@@ -333,26 +332,110 @@
 #define _my_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16)
 #define _my_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16)
 
+#ifdef SFW_FP16
+
+  struct Grid_half {
+    Grid_half(){}
+    Grid_half(uint16_t raw) : x(raw) {}
+    uint16_t x;
+  };
+  union FP32 {
+    unsigned int u;
+    float f;
+  };
+
+  // PAB - Lifted and adapted from Eigen, which is GPL V2
+  inline float sfw_half_to_float(Grid_half h) {
+    const FP32 magic = { 113 << 23 };
+    const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
+    FP32 o;
+    o.u = (h.x & 0x7fff) << 13;           // exponent/mantissa bits
+    unsigned int exp = shifted_exp & o.u; // just the exponent
+    o.u += (127 - 15) << 23;              // exponent adjust
+    // handle exponent special cases
+    if (exp == shifted_exp) {             // Inf/NaN?
+      o.u += (128 - 16) << 23;            // extra exp adjust
+    } else if (exp == 0) {                // Zero/Denormal?
+      o.u += 1 << 23;                     // extra exp adjust
+      o.f -= magic.f;                     // renormalize
+    }
+    o.u |= (h.x & 0x8000) << 16;          // sign bit
+    return o.f;
+  }
+  inline Grid_half sfw_float_to_half(float ff) {
+    FP32 f; f.f = ff;
+    const FP32 f32infty     = { 255 << 23 };
+    const FP32 f16max       = { (127 + 16) << 23 };
+    const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
+    unsigned int sign_mask  = 0x80000000u;
+    Grid_half o;
+
+    o.x = static_cast<uint16_t>(0x0u);
+    unsigned int sign = f.u & sign_mask;
+    f.u ^= sign;
+    // NOTE all the integer compares in this function can be safely
+    // compiled into signed compares since all operands are below
+    // 0x80000000. Important if you want fast straight SSE2 code
+    // (since there's no unsigned PCMPGTD).
+    if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set)
+      o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
+    } else { // (De)normalized number or zero
+      if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero
+        // use a magic value to align our 10 mantissa bits at the bottom of
+        // the float. as long as FP addition is round-to-nearest-even this
+        // just works.
+        f.f += denorm_magic.f;
+        // and one integer subtract of the bias later, we have our final float!
+        o.x = static_cast<uint16_t>(f.u - denorm_magic.u);
+      } else {
+        unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
+
+        // update exponent, rounding bias part 1
+        f.u += ((unsigned int)(15 - 127) << 23) + 0xfff;
+        // rounding bias part 2
+        f.u += mant_odd;
+        // take the bits!
+        o.x = static_cast<uint16_t>(f.u >> 13);
+      }
+    }
+    o.x |= static_cast<uint16_t>(sign >> 16);
+    return o;
+  }
+  static inline __m128i Grid_mm_cvtps_ph(__m128 f,int discard) {
+    __m128i ret=(__m128i)_mm_setzero_ps();
+    float *fp     = (float *)&f;
+    Grid_half *hp = (Grid_half *)&ret;
+    hp[0] = sfw_float_to_half(fp[0]);
+    hp[1] = sfw_float_to_half(fp[1]);
+    hp[2] = sfw_float_to_half(fp[2]);
+    hp[3] = sfw_float_to_half(fp[3]);
+    return ret;
+  }
+  static inline __m128 Grid_mm_cvtph_ps(__m128i h,int discard) {
+    __m128 ret=_mm_setzero_ps();
+    float *fp     = (float *)&ret;
+    Grid_half *hp = (Grid_half *)&h;
+    fp[0] = sfw_half_to_float(hp[0]);
+    fp[1] = sfw_half_to_float(hp[1]);
+    fp[2] = sfw_half_to_float(hp[2]);
+    fp[3] = sfw_half_to_float(hp[3]);
+    return ret;
+  }
+#else
+#define Grid_mm_cvtps_ph(f,discard) _mm_cvtps_ph(f,discard)
+#define Grid_mm_cvtph_ps(h,discard) _mm_cvtph_ps(h)
+#endif
 
   struct PrecisionChange {
     static inline __m128i StoH (__m128 a,__m128 b) {
-#ifdef USE_FP16
-      __m128i ha = _mm_cvtps_ph(a,0);
-      __m128i hb = _mm_cvtps_ph(b,0);
+      __m128i ha = Grid_mm_cvtps_ph(a,0);
+      __m128i hb = Grid_mm_cvtps_ph(b,0);
       __m128i h =(__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0));
-#else
-      __m128i h = (__m128i)a;
-      assert(0);
-#endif
      return h;
    }
    static inline void HtoS (__m128i h,__m128 &sa,__m128 &sb) {
-#ifdef USE_FP16
-      sa = _mm_cvtph_ps(h);
+      sa = Grid_mm_cvtph_ps(h,0);
       h  = (__m128i)_my_alignr_epi32((__m128i)h,(__m128i)h,2);
-      sb = _mm_cvtph_ps(h);
-#else
-      assert(0);
-#endif
+      sb = Grid_mm_cvtph_ps(h,0);
    }
    static inline __m128 DtoS (__m128d a,__m128d b) {
      __m128 sa = _mm_cvtpd_ps(a);
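[Editor's note] The commit message flags that the software conversion has not been regression tested against the f16c intrinsics yet. Below is a minimal sketch of such a check; it is not part of the patch. It assumes the patch is applied, that the test translation unit is compiled with -msse4.2 -mf16c -DSFW_FP16 so the software path and the hardware intrinsics coexist in one binary, and the include path and file name are illustrative. For ordinary (non-NaN) inputs the software path is expected to agree bit-for-bit with the hardware round-to-nearest conversion.

// test_sfw_fp16.cc -- hypothetical regression check, not part of the patch.
// Build sketch: g++ -std=c++11 -msse4.2 -mf16c -DSFW_FP16 -I. test_sfw_fp16.cc
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <immintrin.h>
#include "lib/simd/Grid_sse4.h"   // illustrative include path

using namespace Grid::Optimization;

int main(void) {
  const float samples[] = { 0.0f, -0.0f, 1.0f, -2.5f, 0.125f,
                            0.1f, 65504.0f, 1e-8f, 1e9f };
  for (float x : samples) {
    Grid_half h = sfw_float_to_half(x);
#ifdef __F16C__
    // Compare bit-for-bit against the hardware round-to-nearest conversion.
    __m128i hw = _mm_cvtps_ph(_mm_set1_ps(x), 0);
    assert(h.x == (uint16_t)_mm_extract_epi16(hw, 0));
#endif
    // Values exactly representable in fp16 must survive the round trip.
    if (x == 0.0f || x == 1.0f || x == -2.5f || x == 0.125f || x == 65504.0f)
      assert(sfw_half_to_float(h) == x);
  }
  // 0.1f is not representable; the nearest fp16 value is 0.0999755859375.
  assert(sfw_half_to_float(sfw_float_to_half(0.1f)) == 0.0999755859375f);
  printf("software fp16 conversion checks passed\n");
  return 0;
}

Note that with --enable-sfw-fp16=yes (the default) an SSE4 build of Grid itself omits -mf16c, so the intrinsics are unavailable inside the library; the bit-for-bit comparison above therefore only runs when the test unit itself is built with -mf16c as sketched.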
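[Editor's note] The StoH/HtoS pair can be checked the same way in either build mode: StoH packs a's four halves into the low 64 bits of the result and b's into the high 64 bits (via the _MM_SELECT_FOUR_FOUR shuffle), and HtoS rotates them back out with _my_alignr_epi32. A sketch under the same illustrative scaffolding as above, using only lane values that are exact in fp16 so the round trip must be lossless:

// Hypothetical round-trip check for PrecisionChange::StoH / HtoS
// (not part of the patch; same illustrative include as above).
#include <cassert>
#include <cstdio>
#include "lib/simd/Grid_sse4.h"

using namespace Grid::Optimization;

int main(void) {
  // Eight lane values, all exactly representable in half precision.
  __m128 a = _mm_setr_ps( 1.0f, -2.5f, 0.125f,  4096.0f);
  __m128 b = _mm_setr_ps(-0.5f,  3.0f, 0.25f,  -1024.0f);

  __m128i h = PrecisionChange::StoH(a, b); // eight halves in one vector
  __m128 sa, sb;
  PrecisionChange::HtoS(h, sa, sb);        // unpack back to singles

  float ra[4], ea[4], rb[4], eb[4];
  _mm_storeu_ps(ra, sa); _mm_storeu_ps(ea, a);
  _mm_storeu_ps(rb, sb); _mm_storeu_ps(eb, b);
  for (int i = 0; i < 4; i++) {
    assert(ra[i] == ea[i]); // a's lanes must come back in sa
    assert(rb[i] == eb[i]); // b's lanes must come back in sb
  }
  printf("StoH/HtoS round trip OK\n");
  return 0;
}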