mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
FP16 optional compile time
This commit is contained in:
parent
73cdf0fffe
commit
1d502e4ed6
13
configure.ac
13
configure.ac
@ -83,6 +83,19 @@ case ${ac_LAPACK} in
|
||||
AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
|
||||
esac
|
||||
|
||||
############### FP16 conversions
|
||||
# --enable-fp16=yes defines the USE_FP16 preprocessor symbol; when the option
# is omitted the default is no. Any value other than yes or no is rejected at
# configure time instead of being silently treated as disabled.
AC_ARG_ENABLE([fp16],
    [AC_HELP_STRING([--enable-fp16=yes|no], [enable fp16 comms])],
    [ac_FP16=${enable_fp16}], [ac_FP16=no])
case ${ac_FP16} in
    no)
      ;;
    yes)
      AC_DEFINE([USE_FP16],[1],[conversion to fp16]);;
    *)
      AC_MSG_ERROR([invalid value '${ac_FP16}' for --enable-fp16; expected yes or no]);;
esac
|
||||
|
||||
############### MKL
|
||||
AC_ARG_ENABLE([mkl],
|
||||
[AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])],
|
||||
|
@ -473,15 +473,23 @@ namespace Optimization {
|
||||
|
||||
struct PrecisionChange {
|
||||
// Convert two vectors of 8 single-precision values into one 256-bit vector
// of 16 half-precision values: the halves of `a` occupy the low 128 bits and
// the halves of `b` the high 128 bits.
//
// The result is declared outside the #ifdef so the function also compiles
// when USE_FP16 is not defined. In that configuration the conversion is not
// available: assert(0) fires, and if assertions are compiled out a zero
// vector is returned rather than an indeterminate value.
static inline __m256i StoH (__m256 a,__m256 b) {
  __m256i h = _mm256_setzero_si256();
#ifdef USE_FP16
  // Rounding-control immediate 0, as in the original code.
  __m128i ha = _mm256_cvtps_ph(a,0);
  __m128i hb = _mm256_cvtps_ph(b,0);
  // Integer-typed cast/insert: ha and hb are __m128i, not __m128.
  h = _mm256_castsi128_si256(ha);      // low lane  <- ha (high lane undefined here)
  h = _mm256_insertf128_si256(h,hb,1); // high lane <- hb
#else
  (void)a;
  (void)b;
  assert(0);
#endif
  return h;
}
|
||||
// Expand a 256-bit vector of 16 half-precision values into two vectors of 8
// single-precision values: `sa` receives the low 128 bits of `h`, `sb` the
// high 128 bits.
//
// When USE_FP16 is not defined the conversion is not available: assert(0)
// fires and `sa` / `sb` are left untouched.
static inline void HtoS (__m256i h,__m256 &sa,__m256 &sb) {
#ifdef USE_FP16
  // Use the integer-typed extract: h is __m256i and _mm256_cvtph_ps expects
  // an __m128i operand.
  sa = _mm256_cvtph_ps(_mm256_extractf128_si256(h,0));
  sb = _mm256_cvtph_ps(_mm256_extractf128_si256(h,1));
#else
  (void)h;
  (void)sa;
  (void)sb;
  assert(0);
#endif
}
|
||||
static inline __m256 DtoS (__m256d a,__m256d b) {
|
||||
__m128 sa = _mm256_cvtpd_ps(a);
|
||||
|
@ -343,15 +343,23 @@ namespace Optimization {
|
||||
|
||||
struct PrecisionChange {
|
||||
// Convert two vectors of 16 single-precision values into one 512-bit vector
// of 32 half-precision values: the halves of `a` occupy the low 256 bits and
// the halves of `b` the high 256 bits.
//
// The result is declared outside the #ifdef so the function also compiles
// when USE_FP16 is not defined. In that configuration the conversion is not
// available: assert(0) fires, and if assertions are compiled out a zero
// vector is returned rather than an indeterminate value.
static inline __m512i StoH (__m512 a,__m512 b) {
  __m512i h = _mm512_setzero_si512();
#ifdef USE_FP16
  // Rounding-control immediate 0, as in the original code.
  __m256i ha = _mm512_cvtps_ph(a,0);
  __m256i hb = _mm512_cvtps_ph(b,0);
  // Integer-typed cast/insert (AVX-512F): ha and hb are __m256i.
  h = _mm512_castsi256_si512(ha);   // low half  <- ha (high half undefined here)
  h = _mm512_inserti64x4(h,hb,1);   // high half <- hb
#else
  (void)a;
  (void)b;
  assert(0);
#endif
  return h;
}
|
||||
// Expand a 512-bit vector of 32 half-precision values into two vectors of 16
// single-precision values: `sa` receives the low 256 bits of `h`, `sb` the
// high 256 bits.
//
// When USE_FP16 is not defined the conversion is not available: assert(0)
// fires and `sa` / `sb` are left untouched.
static inline void HtoS (__m512i h,__m512 &sa,__m512 &sb) {
#ifdef USE_FP16
  // 256-bit integer extract (AVX-512F); yields the __m256i operand that
  // _mm512_cvtph_ps expects.
  sa = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(h,0));
  sb = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(h,1));
#else
  (void)h;
  (void)sa;
  (void)sb;
  assert(0);
#endif
}
|
||||
static inline __m512 DtoS (__m512d a,__m512d b) {
|
||||
__m256 sa = _mm512_cvtpd_ps(a);
|
||||
|
@ -281,6 +281,7 @@ namespace Optimization {
|
||||
|
||||
struct PrecisionChange {
|
||||
static inline vech StoH (const vecf &a,const vecf &b) {
|
||||
#ifdef USE_FP16
|
||||
vech ret;
|
||||
vech *ha = (vech *)&a;
|
||||
vech *hb = (vech *)&b;
|
||||
@ -289,9 +290,13 @@ namespace Optimization {
|
||||
// VECTOR_FOR(i, nf,1){ ret.v[i+nf] = ( (uint16_t *) &b.v[i])[1] ; }
|
||||
VECTOR_FOR(i, nf,1){ ret.v[i] = ha->v[2*i+1]; }
|
||||
VECTOR_FOR(i, nf,1){ ret.v[i+nf] = hb->v[2*i+1]; }
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
static inline void HtoS (vech h,vecf &sa,vecf &sb) {
|
||||
#ifdef USE_FP16
|
||||
const int nf = W<float>::r;
|
||||
const int nh = W<uint16_t>::r;
|
||||
vech *ha = (vech *)&sa;
|
||||
@ -301,6 +306,9 @@ namespace Optimization {
|
||||
// VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sb.v[i]))[1] = h.v[i+nf];}
|
||||
VECTOR_FOR(i, nf, 1){ ha->v[2*i+1]=h.v[i]; }
|
||||
VECTOR_FOR(i, nf, 1){ hb->v[2*i+1]=h.v[i+nf]; }
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
}
|
||||
static inline vecf DtoS (vecd a,vecd b) {
|
||||
const int nd = W<double>::r;
|
||||
|
@ -334,20 +334,27 @@ namespace Optimization {
|
||||
// Element-granular alignr expressed through the byte-granular
// _mm_alignr_epi8: the element count n is scaled to a byte shift (4 bytes per
// 32-bit element, 8 bytes per 64-bit element) and wrapped modulo 16.
// Arguments are parenthesized so an expression such as `k+1` is scaled as a
// whole rather than only its last term.
#define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8((a),(b),((n)*4)%16)
#define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8((a),(b),((n)*8)%16)
|
||||
#endif
|
||||
|
||||
struct PrecisionChange {
|
||||
// Convert two vectors of 4 single-precision values into one 128-bit vector
// of half-precision values.
//
// Without USE_FP16 the conversion is not available: assert(0) fires, and if
// assertions are compiled out the bit pattern of `a` is returned unchanged.
static inline __m128i StoH (__m128 a,__m128 b) {
#ifdef USE_FP16
  const __m128i half_a = _mm_cvtps_ph(a,0);
  const __m128i half_b = _mm_cvtps_ph(b,0);
  // Combine the two conversions with a 32-bit lane shuffle.
  // NOTE(review): _MM_SELECT_FOUR_FOUR(1,0,1,0) presumably picks the two low
  // lanes of each operand (half_a low, half_b high) - confirm against the
  // macro definition.
  const __m128 packed = _mm_shuffle_ps(_mm_castsi128_ps(half_a),
                                       _mm_castsi128_ps(half_b),
                                       _MM_SELECT_FOUR_FOUR(1,0,1,0));
  return _mm_castps_si128(packed);
#else
  const __m128i passthrough = _mm_castps_si128(a);
  assert(0);
  return passthrough;
#endif
}
|
||||
// Expand a 128-bit vector of half-precision values into two vectors of 4
// single-precision values. `sa` is converted from `h` as passed in; `sb` is
// converted from `h` after rotating it by two 32-bit elements, which brings
// the upper 64 bits into the low position read by _mm_cvtph_ps.
//
// Without USE_FP16 the conversion is not available: assert(0) fires and
// `sa` / `sb` are left untouched.
static inline void HtoS (__m128i h,__m128 &sa,__m128 &sb) {
#ifdef USE_FP16
  sa = _mm_cvtph_ps(h);
  const __m128i rotated = _mm_alignr_epi32(h,h,2);
  sb = _mm_cvtph_ps(rotated);
#else
  assert(0);
#endif
}
|
||||
static inline __m128 DtoS (__m128d a,__m128d b) {
|
||||
__m128 sa = _mm_cvtpd_ps(a);
|
||||
|
Loading…
Reference in New Issue
Block a user