diff --git a/configure.ac b/configure.ac
index c45693c5..e0820d79 100644
--- a/configure.ac
+++ b/configure.ac
@@ -83,6 +83,19 @@ case ${ac_LAPACK} in
     AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
 esac
 
+############### FP16 conversions
+AC_ARG_ENABLE([fp16],
+  [AS_HELP_STRING([--enable-fp16=yes|no], [enable fp16 comms])],
+  [ac_FP16=${enable_fp16}], [ac_FP16=no])
+case ${ac_FP16} in
+  no)
+    ;;
+  yes)
+    AC_DEFINE([USE_FP16],[1],[conversion to fp16]);;
+  *)
+    AC_MSG_ERROR([${ac_FP16} unsupported --enable-fp16 option]);;
+esac
+
 ############### MKL
 AC_ARG_ENABLE([mkl],
     [AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])],
diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h
index c2e02cb2..73792f84 100644
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -473,15 +473,24 @@ namespace Optimization {
 
   struct PrecisionChange {
     static inline __m256i StoH (__m256 a,__m256 b) {
+      // Declared outside the #ifdef so the !USE_FP16 path still compiles.
+      __m256i h;
+#ifdef USE_FP16
       __m128i ha = _mm256_cvtps_ph(a,0);
       __m128i hb = _mm256_cvtps_ph(b,0);
-      __m256 h = _mm256_castps128_ps256(ha);
-      h = _mm256_insertf128_ps(h,hb,1);
+      // Integer-domain cast/insert: the fp16 payload is __m128i, not __m128.
+      h = _mm256_castsi128_si256(ha);
+      h = _mm256_insertf128_si256(h,hb,1);
+#else
+      assert(0);
+#endif
       return h;
     }
     static inline void HtoS (__m256i h,__m256 &sa,__m256 &sb) {
-      sa = _mm256_cvtph_ps(_mm256_extractf128_ps(h,0));
-      sb = _mm256_cvtph_ps(_mm256_extractf128_ps(h,1));
+#ifdef USE_FP16
+      // extractf128_si256: h is __m256i and _mm256_cvtph_ps wants __m128i.
+      sa = _mm256_cvtph_ps(_mm256_extractf128_si256(h,0));
+      sb = _mm256_cvtph_ps(_mm256_extractf128_si256(h,1));
+#else
+      assert(0);
+#endif
     }
     static inline __m256 DtoS (__m256d a,__m256d b) {
       __m128 sa = _mm256_cvtpd_ps(a);
diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h
index 87798637..dae3c1c7 100644
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -343,15 +343,24 @@ namespace Optimization {
 
   struct PrecisionChange {
     static inline __m512i StoH (__m512 a,__m512 b) {
+      // Declared outside the #ifdef so the !USE_FP16 path still compiles.
+      __m512i h;
+#ifdef USE_FP16
       __m256i ha = _mm512_cvtps_ph(a,0);
       __m256i hb = _mm512_cvtps_ph(b,0);
-      __m512 h = _mm512_castps256_ps512(ha);
-      h = _mm512_insertf256_ps(h,hb,1);
+      // _mm512_insertf256_ps does not exist; use the AVX-512F integer forms.
+      h = _mm512_castsi256_si512(ha);
+      h = _mm512_inserti64x4(h,hb,1);
+#else
+      assert(0);
+#endif
       return h;
     }
     static inline void HtoS (__m512i h,__m512 &sa,__m512 &sb) {
-      sa = _mm512_cvtph_ps(_mm512_extractf256_ps(h,0));
-      sb = _mm512_cvtph_ps(_mm512_extractf256_ps(h,1));
+#ifdef USE_FP16
+      // _mm512_extractf256_ps does not exist; extracti64x4 yields the __m256i
+      // operand _mm512_cvtph_ps expects.
+      sa = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(h,0));
+      sb = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(h,1));
+#else
+      assert(0);
+#endif
     }
     static inline __m512 DtoS (__m512d a,__m512d b) {
       __m256 sa = _mm512_cvtpd_ps(a);
diff --git a/lib/simd/Grid_generic.h b/lib/simd/Grid_generic.h
index 466aa2f7..bdf5aa01 100644
--- a/lib/simd/Grid_generic.h
+++ b/lib/simd/Grid_generic.h
@@ -281,6 +281,7 @@ namespace Optimization {
   struct PrecisionChange {
     static inline vech StoH (const vecf &a,const vecf &b) {
       vech ret;
+#ifdef USE_FP16
       vech *ha = (vech *)&a;
       vech *hb = (vech *)&b;
       const int nf = W::r;
@@ -289,9 +290,12 @@ namespace Optimization {
       //      VECTOR_FOR(i, nf,1){ ret.v[i+nf] = ( (uint16_t *) &b.v[i])[1] ; }
       VECTOR_FOR(i, nf,1){ ret.v[i]    = ha->v[2*i+1]; }
       VECTOR_FOR(i, nf,1){ ret.v[i+nf] = hb->v[2*i+1]; }
+#else
+      assert(0);
+#endif
       return ret;
     }
     static inline void HtoS (vech h,vecf &sa,vecf &sb) {
+#ifdef USE_FP16
       const int nf = W::r;
-      const int nh = W::r;
       vech *ha = (vech *)&sa;
@@ -301,6 +305,11 @@ namespace Optimization {
       //      VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sb.v[i]))[1] = h.v[i+nf];}
       VECTOR_FOR(i, nf, 1){ ha->v[2*i+1]=h.v[i];    }
       VECTOR_FOR(i, nf, 1){ hb->v[2*i+1]=h.v[i+nf]; }
+      // Zero the low half-words so the reconstructed floats are well defined.
+      VECTOR_FOR(i, nf, 1){ ha->v[2*i]=0; hb->v[2*i]=0; }
+#else
+      assert(0);
+#endif
     }
     static inline vecf DtoS (vecd a,vecd b) {
       const int nd = W::r;
diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h
index a6ab369d..969ba3ed 100644
--- a/lib/simd/Grid_sse4.h
+++ b/lib/simd/Grid_sse4.h
@@ -334,20 +334,27 @@ namespace Optimization {
 #define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16)
 #define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16)
 #endif
+
   struct PrecisionChange {
     static inline __m128i StoH (__m128 a,__m128 b) {
-      //      __m128i ha = _mm_cvtps_ph(a,0);
-      //      __m128i hb = _mm_cvtps_ph(b,0);
-      //      __m128i h =(__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0));
+#ifdef USE_FP16
+      __m128i ha = _mm_cvtps_ph(a,0);
+      __m128i hb = _mm_cvtps_ph(b,0);
+      __m128i h =(__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0));
+#else
       __m128i h = (__m128i)a;
       assert(0);
+#endif
       return h;
     }
     static inline void HtoS (__m128i h,__m128 &sa,__m128 &sb) {
-      //      sa = _mm_cvtph_ps(h);
-      //      h =  (__m128i)_mm_alignr_epi32((__m128i)h,(__m128i)h,2);
-      //      sb = _mm_cvtph_ps(h);
+#ifdef USE_FP16
+      sa = _mm_cvtph_ps(h);
+      h =  (__m128i)_mm_alignr_epi32((__m128i)h,(__m128i)h,2);
+      sb = _mm_cvtph_ps(h);
+#else
       assert(0);
+#endif
     }
     static inline __m128 DtoS (__m128d a,__m128d b) {
       __m128 sa = _mm_cvtpd_ps(a);