From 3844bcf80065f878c35c900acea2d78bec3bc511 Mon Sep 17 00:00:00 2001
From: paboyle
Date: Thu, 20 Apr 2017 15:30:52 +0100
Subject: [PATCH] If no f16c instructions are supported, we must use software
 half precision conversion.

This will also become useful on BG/Q, so it will move out of SSE4 into a
general area. Lifted the Eigen half precision conversion from the web. It
looks sensible, but has not yet been extensively regression tested against
the intrinsics implementation.
---
 configure.ac         |  26 ++++++----
 lib/simd/Grid_sse4.h | 111 +++++++++++++++++++++++++++++++++++++------
 2 files changed, 113 insertions(+), 24 deletions(-)

diff --git a/configure.ac b/configure.ac
index f18beb32..81d38ae9 100644
--- a/configure.ac
+++ b/configure.ac
@@ -84,16 +84,15 @@ case ${ac_LAPACK} in
 esac
 
 ############### FP16 conversions
-AC_ARG_ENABLE([fp16],
-              [AC_HELP_STRING([--enable-fp16=yes|no], [enable fp16 comms])],
-              [ac_FP16=${enable_fp16}], [ac_FP16=no])
-case ${ac_FP16} in
-  no)
-    ;;
+AC_ARG_ENABLE([sfw-fp16],
+              [AC_HELP_STRING([--enable-sfw-fp16=yes|no], [enable software fp16 comms])],
+              [ac_SFW_FP16=${enable_sfw_fp16}], [ac_SFW_FP16=yes])
+case ${ac_SFW_FP16} in
   yes)
-    AC_DEFINE([USE_FP16],[1],[conversion to fp16]);;
+    AC_DEFINE([SFW_FP16],[1],[software conversion to fp16]);;
+  no);;
   *)
-    ;;
+    AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);;
 esac
 
 ############### MKL
@@ -189,7 +188,14 @@ case ${ax_cv_cxx_compiler_vendor} in
      case ${ac_SIMD} in
        SSE4)
          AC_DEFINE([SSE4],[1],[SSE4 intrinsics])
-         SIMD_FLAGS='-msse4.2';;
+         case ${ac_SFW_FP16} in
+           yes)
+             SIMD_FLAGS='-msse4.2';;
+           no)
+             SIMD_FLAGS='-msse4.2 -mf16c';;
+           *)
+             AC_MSG_ERROR(["SFW_FP16 must be either yes or no value ${ac_SFW_FP16} "]);;
+         esac;;
        AVX)
          AC_DEFINE([AVX1],[1],[AVX intrinsics])
          SIMD_FLAGS='-mavx -mf16c';;
@@ -423,7 +429,6 @@ AC_OUTPUT
 echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Summary of configuration for $PACKAGE v$VERSION
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
 ----- PLATFORM ----------------------------------------
 architecture (build) : $build_cpu
 os (build)           : $build_os
@@ -436,6 +441,7 @@ SIMD                        : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
 Threading                   : ${ac_openmp}
 Communications type         : ${comms_type}
 Default precision           : ${ac_PRECISION}
+Software FP16 conversion    : ${ac_SFW_FP16}
 RNG choice                  : ${ac_RNG}
 GMP                         : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi`
 LAPACK                      : ${ac_LAPACK}
diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h
index 8a4537c2..59ad996a 100644
--- a/lib/simd/Grid_sse4.h
+++ b/lib/simd/Grid_sse4.h
@@ -38,7 +38,6 @@ Author: neo
 
 #include <pmmintrin.h>
 
-
 namespace Grid {
 namespace Optimization {
@@ -333,26 +332,110 @@
 #define _my_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16)
 #define _my_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16)
 
+#ifdef SFW_FP16
+
+  struct Grid_half {
+    Grid_half(){}
+    Grid_half(uint16_t raw) : x(raw) {}
+    uint16_t x;
+  };
+  union FP32 {
+    unsigned int u;
+    float f;
+  };
+
+  // PAB - Lifted and adapted from Eigen, which is GPL V2
+  inline float sfw_half_to_float(Grid_half h) {
+    const FP32 magic = { 113 << 23 };
+    const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
+    FP32 o;
+    o.u = (h.x & 0x7fff) << 13;           // exponent/mantissa bits
+    unsigned int exp = shifted_exp & o.u; // just the exponent
+    o.u += (127 - 15) << 23;              // exponent adjust
+    // handle exponent special cases
+    if (exp == shifted_exp) {             // Inf/NaN?
+      o.u += (128 - 16) << 23;            // extra exp adjust
+    } else if (exp == 0) {                // Zero/Denormal?
+      o.u += 1 << 23;                     // extra exp adjust
+      o.f -= magic.f;                     // renormalize
+    }
+    o.u |= (h.x & 0x8000) << 16;          // sign bit
+    return o.f;
+  }
+  inline Grid_half sfw_float_to_half(float ff) {
+    FP32 f; f.f = ff;
+    const FP32 f32infty     = { 255 << 23 };
+    const FP32 f16max       = { (127 + 16) << 23 };
+    const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
+    unsigned int sign_mask  = 0x80000000u;
+    Grid_half o;
+
+    o.x = static_cast<uint16_t>(0x0u);
+    unsigned int sign = f.u & sign_mask;
+    f.u ^= sign;
+    // NOTE all the integer compares in this function can be safely
+    // compiled into signed compares since all operands are below
+    // 0x80000000. Important if you want fast straight SSE2 code
+    // (since there's no unsigned PCMPGTD).
+    if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set)
+      o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
+    } else { // (De)normalized number or zero
+      if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero
+        // use a magic value to align our 10 mantissa bits at the bottom of
+        // the float. as long as FP addition is round-to-nearest-even this
+        // just works.
+        f.f += denorm_magic.f;
+        // and one integer subtract of the bias later, we have our final float!
+        o.x = static_cast<uint16_t>(f.u - denorm_magic.u);
+      } else {
+        unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
+
+        // update exponent, rounding bias part 1
+        f.u += ((unsigned int)(15 - 127) << 23) + 0xfff;
+        // rounding bias part 2
+        f.u += mant_odd;
+        // take the bits!
+        o.x = static_cast<uint16_t>(f.u >> 13);
+      }
+    }
+    o.x |= static_cast<uint16_t>(sign >> 16);
+    return o;
+  }
+  static inline __m128i Grid_mm_cvtps_ph(__m128 f,int discard) {
+    __m128i ret=(__m128i)_mm_setzero_ps();
+    float *fp     = (float *)&f;
+    Grid_half *hp = (Grid_half *)&ret;
+    hp[0] = sfw_float_to_half(fp[0]);
+    hp[1] = sfw_float_to_half(fp[1]);
+    hp[2] = sfw_float_to_half(fp[2]);
+    hp[3] = sfw_float_to_half(fp[3]);
+    return ret;
+  }
+  static inline __m128 Grid_mm_cvtph_ps(__m128i h,int discard) {
+    __m128 ret=_mm_setzero_ps();
+    float *fp     = (float *)&ret;
+    Grid_half *hp = (Grid_half *)&h;
+    fp[0] = sfw_half_to_float(hp[0]);
+    fp[1] = sfw_half_to_float(hp[1]);
+    fp[2] = sfw_half_to_float(hp[2]);
+    fp[3] = sfw_half_to_float(hp[3]);
+    return ret;
+  }
+#else
+#define Grid_mm_cvtps_ph(f,discard) _mm_cvtps_ph(f,discard)
+#define Grid_mm_cvtph_ps(h,discard) _mm_cvtph_ps(h)
+#endif
 
   struct PrecisionChange {
     static inline __m128i StoH (__m128 a,__m128 b) {
-#ifdef USE_FP16
-      __m128i ha = _mm_cvtps_ph(a,0);
-      __m128i hb = _mm_cvtps_ph(b,0);
+      __m128i ha = Grid_mm_cvtps_ph(a,0);
+      __m128i hb = Grid_mm_cvtps_ph(b,0);
       __m128i h =(__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0));
-#else
-      __m128i h = (__m128i)a;
-      assert(0);
-#endif
      return h;
    }
    static inline void HtoS (__m128i h,__m128 &sa,__m128 &sb) {
-#ifdef USE_FP16
-      sa = _mm_cvtph_ps(h);
+      sa = Grid_mm_cvtph_ps(h,0);
       h  = (__m128i)_my_alignr_epi32((__m128i)h,(__m128i)h,2);
-      sb = _mm_cvtph_ps(h);
-#else
-      assert(0);
-#endif
+      sb = Grid_mm_cvtph_ps(h,0);
    }
    static inline __m128 DtoS (__m128d a,__m128d b) {
      __m128 sa = _mm_cvtpd_ps(a);
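[Editor's note] The commit message flags that the software conversion has not been regression tested against the f16c intrinsics yet. Below is a minimal sketch of such a check; it is not part of the patch. It assumes the patch is applied, that the test translation unit is compiled with -msse4.2 -mf16c -DSFW_FP16 so the software path and the hardware intrinsics coexist in one binary, and the include path and file name are illustrative. For ordinary (non-NaN) inputs the software path is expected to agree bit-for-bit with the hardware round-to-nearest conversion.

// test_sfw_fp16.cc -- hypothetical regression check, not part of the patch.
// Build sketch: g++ -std=c++11 -msse4.2 -mf16c -DSFW_FP16 -I. test_sfw_fp16.cc
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <immintrin.h>
#include "lib/simd/Grid_sse4.h"   // illustrative include path

using namespace Grid::Optimization;

int main(void) {
  const float samples[] = { 0.0f, -0.0f, 1.0f, -2.5f, 0.125f,
                            0.1f, 65504.0f, 1e-8f, 1e9f };
  for (float x : samples) {
    Grid_half h = sfw_float_to_half(x);
#ifdef __F16C__
    // Compare bit-for-bit against the hardware round-to-nearest conversion.
    __m128i hw = _mm_cvtps_ph(_mm_set1_ps(x), 0);
    assert(h.x == (uint16_t)_mm_extract_epi16(hw, 0));
#endif
    // Values exactly representable in fp16 must survive the round trip.
    if (x == 0.0f || x == 1.0f || x == -2.5f || x == 0.125f || x == 65504.0f)
      assert(sfw_half_to_float(h) == x);
  }
  // 0.1f is not representable; the nearest fp16 value is 0.0999755859375.
  assert(sfw_half_to_float(sfw_float_to_half(0.1f)) == 0.0999755859375f);
  printf("software fp16 conversion checks passed\n");
  return 0;
}

Note that with --enable-sfw-fp16=yes (the default) an SSE4 build of Grid itself omits -mf16c, so the intrinsics are unavailable inside the library; the bit-for-bit comparison above therefore only runs when the test unit itself is built with -mf16c as sketched.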
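[Editor's note] The StoH/HtoS pair can be checked the same way in either build mode: StoH packs a's four halves into the low 64 bits of the result and b's into the high 64 bits (via the _MM_SELECT_FOUR_FOUR shuffle), and HtoS rotates them back out with _my_alignr_epi32. A sketch under the same illustrative scaffolding as above, using only lane values that are exact in fp16 so the round trip must be lossless:

// Hypothetical round-trip check for PrecisionChange::StoH / HtoS
// (not part of the patch; same illustrative include as above).
#include <cassert>
#include <cstdio>
#include "lib/simd/Grid_sse4.h"

using namespace Grid::Optimization;

int main(void) {
  // Eight lane values, all exactly representable in half precision.
  __m128 a = _mm_setr_ps( 1.0f, -2.5f, 0.125f,  4096.0f);
  __m128 b = _mm_setr_ps(-0.5f,  3.0f, 0.25f,  -1024.0f);

  __m128i h = PrecisionChange::StoH(a, b); // eight halves in one vector
  __m128 sa, sb;
  PrecisionChange::HtoS(h, sa, sb);        // unpack back to singles

  float ra[4], ea[4], rb[4], eb[4];
  _mm_storeu_ps(ra, sa); _mm_storeu_ps(ea, a);
  _mm_storeu_ps(rb, sb); _mm_storeu_ps(eb, b);
  for (int i = 0; i < 4; i++) {
    assert(ra[i] == ea[i]); // a's lanes must come back in sa
    assert(rb[i] == eb[i]); // b's lanes must come back in sb
  }
  printf("StoH/HtoS round trip OK\n");
  return 0;
}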