
FP16 optional compile time

paboyle 2017-04-13 11:55:24 +01:00
parent 73cdf0fffe
commit 1d502e4ed6
5 changed files with 50 additions and 6 deletions

configure.ac

@@ -83,6 +83,19 @@ case ${ac_LAPACK} in
    AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
esac
############### FP16 conversions
AC_ARG_ENABLE([fp16],
    [AC_HELP_STRING([--enable-fp16=yes|no], [enable fp16 comms])],
    [ac_FP16=${enable_fp16}], [ac_FP16=no])
case ${ac_FP16} in
    no)
      ;;
    yes)
      AC_DEFINE([USE_FP16],[1],[conversion to fp16]);;
    *)
      ;;
esac
############### MKL
AC_ARG_ENABLE([mkl],
[AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])],

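With this change, ./configure --enable-fp16 defines USE_FP16 in the generated config header; the default is no. When the flag is off, the new StoH/HtoS half-precision conversions in the SIMD files below compile to assert(0), so fp16 comms must be selected at compile time.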
lib/simd/Grid_avx.h

@@ -473,15 +473,23 @@ namespace Optimization {
  struct PrecisionChange {
    static inline __m256i StoH (__m256 a,__m256 b) {
      __m256i h;
#ifdef USE_FP16
      // pack eight + eight floats into sixteen fp16 values in one __m256i
      __m128i ha = _mm256_cvtps_ph(a,0);
      __m128i hb = _mm256_cvtps_ph(b,0);
      h = _mm256_castsi128_si256(ha);
      h = _mm256_insertf128_si256(h,hb,1);
#else
      assert(0);
#endif
      return h;
    }
    static inline void HtoS (__m256i h,__m256 &sa,__m256 &sb) {
#ifdef USE_FP16
      // unpack the two fp16 half-vectors back to single precision
      sa = _mm256_cvtph_ps(_mm256_extractf128_si256(h,0));
      sb = _mm256_cvtph_ps(_mm256_extractf128_si256(h,1));
#else
      assert(0);
#endif
    }
static inline __m256 DtoS (__m256d a,__m256d b) {
__m128 sa = _mm256_cvtpd_ps(a);

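For reference, a minimal standalone sketch of the AVX round trip above (not part of the commit; assumes an F16C-capable CPU and compilation with -mavx -mf16c; the immediate 0 passed to the conversions selects round-to-nearest-even):

#include <immintrin.h>
#include <cassert>
#include <cstdio>

int main() {
  // two vectors of eight floats, both exactly representable in fp16
  __m256 a = _mm256_set1_ps(1.5f);
  __m256 b = _mm256_set1_ps(-2.25f);

  // StoH: pack into one __m256i holding sixteen fp16 values
  __m128i ha = _mm256_cvtps_ph(a, 0);
  __m128i hb = _mm256_cvtps_ph(b, 0);
  __m256i h  = _mm256_castsi128_si256(ha);
  h = _mm256_insertf128_si256(h, hb, 1);

  // HtoS: unpack back to two __m256 vectors of floats
  __m256 sa = _mm256_cvtph_ps(_mm256_extractf128_si256(h, 0));
  __m256 sb = _mm256_cvtph_ps(_mm256_extractf128_si256(h, 1));

  float out[8];
  _mm256_storeu_ps(out, sa);
  for (int i = 0; i < 8; i++) assert(out[i] == 1.5f);
  _mm256_storeu_ps(out, sb);
  for (int i = 0; i < 8; i++) assert(out[i] == -2.25f);
  printf("fp16 round trip ok\n");
  return 0;
}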
lib/simd/Grid_avx512.h

@@ -343,15 +343,23 @@ namespace Optimization {
  struct PrecisionChange {
    static inline __m512i StoH (__m512 a,__m512 b) {
      __m512i h;
#ifdef USE_FP16
      // pack sixteen + sixteen floats into thirty-two fp16 values in one __m512i
      __m256i ha = _mm512_cvtps_ph(a,0);
      __m256i hb = _mm512_cvtps_ph(b,0);
      h = _mm512_castsi256_si512(ha);
      h = _mm512_inserti64x4(h,hb,1);
#else
      assert(0);
#endif
      return h;
    }
    static inline void HtoS (__m512i h,__m512 &sa,__m512 &sb) {
#ifdef USE_FP16
      // unpack the two fp16 half-vectors back to single precision
      sa = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(h,0));
      sb = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(h,1));
#else
      assert(0);
#endif
    }
static inline __m512 DtoS (__m512d a,__m512d b) {
__m256 sa = _mm512_cvtpd_ps(a);

lib/simd/Grid_generic.h

@@ -281,6 +281,7 @@ namespace Optimization {
  struct PrecisionChange {
    static inline vech StoH (const vecf &a,const vecf &b) {
      vech ret;
#ifdef USE_FP16
      vech *ha = (vech *)&a;
      vech *hb = (vech *)&b;
@@ -289,9 +290,13 @@ namespace Optimization {
      //      VECTOR_FOR(i, nf,1){ ret.v[i+nf] = ( (uint16_t *) &b.v[i])[1] ; }
      VECTOR_FOR(i, nf,1){ ret.v[i]    = ha->v[2*i+1]; }   // upper 16 bits of each float (little-endian)
      VECTOR_FOR(i, nf,1){ ret.v[i+nf] = hb->v[2*i+1]; }
#else
      assert(0);
#endif
      return ret;
    }
    static inline void HtoS (vech h,vecf &sa,vecf &sb) {
#ifdef USE_FP16
      const int nf = W<float>::r;
      const int nh = W<uint16_t>::r;
      vech *ha = (vech *)&sa;
@@ -301,6 +306,9 @@ namespace Optimization {
      //      VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sb.v[i]))[1] = h.v[i+nf];}
      VECTOR_FOR(i, nf, 1){ ha->v[2*i+1] = h.v[i];    }    // restore the upper 16 bits
      VECTOR_FOR(i, nf, 1){ hb->v[2*i+1] = h.v[i+nf]; }
#else
      assert(0);
#endif
    }
static inline vecf DtoS (vecd a,vecd b) {
const int nd = W<double>::r;

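Note that the generic fallback does not produce IEEE fp16 at all: on a little-endian machine v[2*i+1] is the upper 16 bits of each float, so StoH keeps the sign bit, the full 8-bit exponent, and the top 7 mantissa bits, which is the bfloat16 layout. A standalone scalar sketch of the same bit trick (hypothetical helper names, little-endian assumed; unlike the vector code above, the low mantissa bits are zeroed explicitly here):

#include <cstdint>
#include <cstring>
#include <cassert>

// keep only the upper 16 bits of an IEEE-754 float (bfloat16-style truncation)
static uint16_t float_to_h(float f) {
  uint32_t u; std::memcpy(&u, &f, sizeof u);
  return (uint16_t)(u >> 16);          // sign, exponent, top 7 mantissa bits
}
static float h_to_float(uint16_t h) {
  uint32_t u = (uint32_t)h << 16;      // low mantissa bits are zeroed
  float f; std::memcpy(&f, &u, sizeof f);
  return f;
}

int main() {
  assert(h_to_float(float_to_h(1.5f)) == 1.5f);   // exactly representable values survive
  assert(h_to_float(float_to_h(-2.0f)) == -2.0f);
  return 0;
}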
lib/simd/Grid_sse4.h

@@ -334,20 +334,27 @@ namespace Optimization {
#define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16)
#define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16)
#endif
  struct PrecisionChange {
    static inline __m128i StoH (__m128 a,__m128 b) {
#ifdef USE_FP16
      // pack four + four floats into eight fp16 values in one __m128i
      __m128i ha = _mm_cvtps_ph(a,0);
      __m128i hb = _mm_cvtps_ph(b,0);
      __m128i h  = (__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0));
#else
      __m128i h = (__m128i)a;  // unused; keeps the compiler quiet before the assert
      assert(0);
#endif
      return h;
    }
    static inline void HtoS (__m128i h,__m128 &sa,__m128 &sb) {
#ifdef USE_FP16
      // low four halves -> sa, then rotate the high four down and convert -> sb
      sa = _mm_cvtph_ps(h);
      h  = _mm_alignr_epi32(h,h,2);
      sb = _mm_cvtph_ps(h);
#else
      assert(0);
#endif
    }
static inline __m128 DtoS (__m128d a,__m128d b) {
__m128 sa = _mm_cvtpd_ps(a);
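All three x86 variants lean on the F16C conversion instructions (vcvtps2ph / vcvtph2ps, and their 512-bit forms), so --enable-fp16 builds need hardware and compiler support for them; only the generic implementation works everywhere.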