From 1d502e4ed6e481647702f2f1fe39fa9bb8c6084f Mon Sep 17 00:00:00 2001
From: paboyle
Date: Thu, 13 Apr 2017 11:55:24 +0100
Subject: [PATCH] FP16 optional compile time

---
 configure.ac            | 13 +++++++++++++
 lib/simd/Grid_avx.h     |  8 ++++++++
 lib/simd/Grid_avx512.h  |  8 ++++++++
 lib/simd/Grid_generic.h |  8 ++++++++
 lib/simd/Grid_sse4.h    | 19 +++++++++++++------
 5 files changed, 50 insertions(+), 6 deletions(-)

diff --git a/configure.ac b/configure.ac
index c45693c5..e0820d79 100644
--- a/configure.ac
+++ b/configure.ac
@@ -83,6 +83,19 @@ case ${ac_LAPACK} in
         AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
 esac
 
+############### FP16 conversions
+AC_ARG_ENABLE([fp16],
+              [AC_HELP_STRING([--enable-fp16=yes|no], [enable fp16 comms])],
+              [ac_FP16=${enable_fp16}], [ac_FP16=no])
+case ${ac_FP16} in
+    no)
+      ;;
+    yes)
+        AC_DEFINE([USE_FP16],[1],[conversion to fp16]);;
+    *)
+      ;;
+esac
+
 ############### MKL
 AC_ARG_ENABLE([mkl],
             [AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])],
diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h
index c2e02cb2..73792f84 100644
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -473,15 +473,23 @@ namespace Optimization {
 
   struct PrecisionChange {
     static inline __m256i StoH (__m256 a,__m256 b) {
+#ifdef USE_FP16
       __m128i ha = _mm256_cvtps_ph(a,0);
       __m128i hb = _mm256_cvtps_ph(b,0);
       __m256 h = _mm256_castps128_ps256(ha);
       h = _mm256_insertf128_ps(h,hb,1);
+#else
+      assert(0);
+#endif
       return h;
     }
     static inline void HtoS (__m256i h,__m256 &sa,__m256 &sb) {
+#ifdef USE_FP16
       sa = _mm256_cvtph_ps(_mm256_extractf128_ps(h,0));
       sb = _mm256_cvtph_ps(_mm256_extractf128_ps(h,1));
+#else
+      assert(0);
+#endif
     }
     static inline __m256 DtoS (__m256d a,__m256d b) {
       __m128 sa = _mm256_cvtpd_ps(a);
diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h
index 87798637..dae3c1c7 100644
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -343,15 +343,23 @@ namespace Optimization {
 
   struct PrecisionChange {
     static inline __m512i StoH (__m512 a,__m512 b) {
+#ifdef USE_FP16
       __m256i ha = _mm512_cvtps_ph(a,0);
       __m256i hb = _mm512_cvtps_ph(b,0);
       __m512 h = _mm512_castps256_ps512(ha);
       h = _mm512_insertf256_ps(h,hb,1);
+#else
+      assert(0);
+#endif
       return h;
     }
     static inline void HtoS (__m512i h,__m512 &sa,__m512 &sb) {
+#ifdef USE_FP16
       sa = _mm512_cvtph_ps(_mm512_extractf256_ps(h,0));
       sb = _mm512_cvtph_ps(_mm512_extractf256_ps(h,1));
+#else
+      assert(0);
+#endif
     }
     static inline __m512 DtoS (__m512d a,__m512d b) {
       __m256 sa = _mm512_cvtpd_ps(a);
diff --git a/lib/simd/Grid_generic.h b/lib/simd/Grid_generic.h
index 466aa2f7..bdf5aa01 100644
--- a/lib/simd/Grid_generic.h
+++ b/lib/simd/Grid_generic.h
@@ -281,6 +281,7 @@ namespace Optimization {
 
   struct PrecisionChange {
     static inline vech StoH (const vecf &a,const vecf &b) {
+#ifdef USE_FP16
       vech ret;
       vech *ha = (vech *)&a;
       vech *hb = (vech *)&b;
@@ -289,9 +290,13 @@ namespace Optimization {
       //      VECTOR_FOR(i, nf,1){ ret.v[i+nf] = ( (uint16_t *) &b.v[i])[1] ; }
       VECTOR_FOR(i, nf,1){ ret.v[i]    = ha->v[2*i+1]; }
       VECTOR_FOR(i, nf,1){ ret.v[i+nf] = hb->v[2*i+1]; }
+#else
+      assert(0);
+#endif
       return ret;
     }
     static inline void HtoS (vech h,vecf &sa,vecf &sb) {
+#ifdef USE_FP16
       const int nf = W<float>::r;
       const int nh = W<uint16_t>::r;
       vech *ha = (vech *)&sa;
@@ -301,6 +306,9 @@ namespace Optimization {
       //      VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sb.v[i]))[1] = h.v[i+nf];}
       VECTOR_FOR(i, nf, 1){ ha->v[2*i+1]=h.v[i]; }
       VECTOR_FOR(i, nf, 1){ hb->v[2*i+1]=h.v[i+nf]; }
+#else
+      assert(0);
+#endif
     }
     static inline vecf DtoS (vecd a,vecd b) {
       const int nd = W<double>::r;
diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h
index a6ab369d..969ba3ed 100644
--- a/lib/simd/Grid_sse4.h
+++ b/lib/simd/Grid_sse4.h
@@ -334,20 +334,27 @@ namespace Optimization {
 #define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16)
 #define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16)
 #endif
+
   struct PrecisionChange {
     static inline __m128i StoH (__m128 a,__m128 b) {
-      //      __m128i ha = _mm_cvtps_ph(a,0);
-      //      __m128i hb = _mm_cvtps_ph(b,0);
-      //      __m128i h =(__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0));
+#ifdef USE_FP16
+      __m128i ha = _mm_cvtps_ph(a,0);
+      __m128i hb = _mm_cvtps_ph(b,0);
+      __m128i h =(__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0));
+#else
       __m128i h = (__m128i)a;
       assert(0);
+#endif
       return h;
     }
     static inline void HtoS (__m128i h,__m128 &sa,__m128 &sb) {
-      //      sa = _mm_cvtph_ps(h);
-      //      h = (__m128i)_mm_alignr_epi32((__m128i)h,(__m128i)h,2);
-      //      sb = _mm_cvtph_ps(h);
+#ifdef USE_FP16
+      sa = _mm_cvtph_ps(h);
+      h = (__m128i)_mm_alignr_epi32((__m128i)h,(__m128i)h,2);
+      sb = _mm_cvtph_ps(h);
+#else
       assert(0);
+#endif
     }
     static inline __m128 DtoS (__m128d a,__m128d b) {
       __m128 sa = _mm_cvtpd_ps(a);
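
Usage note (not part of the patch itself; it follows from the configure.ac hunk above):
the new option defaults to off (ac_FP16=no). Configuring with ./configure --enable-fp16
defines USE_FP16, so the PrecisionChange::StoH / HtoS routines compile the real
single <-> half conversions; with --disable-fp16, or with the option omitted, USE_FP16
stays undefined and those routines fall through to assert(0) at run time.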
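
Aside, not from the commit: with USE_FP16 defined, the SSE4 StoH/HtoS above amount to a
pack/unpack round trip through the F16C conversion instructions. Below is a minimal
standalone sketch of that round trip; the file name, the test values, and the use of the
standard _MM_SHUFFLE and _mm_unpackhi_epi64 in place of Grid's _MM_SELECT_FOUR_FOUR and
_mm_alignr_epi32 helpers are assumptions of this sketch, not Grid code.

// fp16_roundtrip.cc -- build with, e.g.:  g++ -msse4.1 -mf16c fp16_roundtrip.cc
#include <immintrin.h>
#include <cassert>
#include <cmath>
#include <cstdio>

int main(void) {
  // Eight single-precision inputs, all exactly representable in fp16.
  float in[8]  = {1.0f, -2.5f, 3.25f, 0.125f, 4.0f, -0.5f, 6.75f, 8.0f};
  float out[8];

  __m128 a = _mm_loadu_ps(in);      // first four singles
  __m128 b = _mm_loadu_ps(in + 4);  // second four singles

  // "StoH": convert each group of four singles to four halves (low 64 bits of ha/hb),
  // then pack both groups into a single 128-bit register holding eight halves.
  __m128i ha = _mm_cvtps_ph(a, 0);
  __m128i hb = _mm_cvtps_ph(b, 0);
  __m128i h  = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(ha),
                                               _mm_castsi128_ps(hb),
                                               _MM_SHUFFLE(1, 0, 1, 0)));

  // "HtoS": convert the low four halves back to singles, then the high four.
  __m128 sa = _mm_cvtph_ps(h);
  __m128 sb = _mm_cvtph_ps(_mm_unpackhi_epi64(h, h));

  _mm_storeu_ps(out,     sa);
  _mm_storeu_ps(out + 4, sb);

  for (int i = 0; i < 8; i++) {
    printf("% .3f -> % .3f\n", in[i], out[i]);
    assert(std::fabs(in[i] - out[i]) < 1e-3f);  // exact for these values; fp16 is lossy in general
  }
  return 0;
}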