1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-09-20 09:15:38 +01:00

Updates for supporting Mobius better

This commit is contained in:
Peter Boyle 2016-12-08 16:43:28 +00:00
parent e27c6b217c
commit 69ae817d1c
10 changed files with 154 additions and 12 deletions

View File

@ -213,6 +213,29 @@ namespace Optimization {
}
};
struct MultRealPart{
inline __m256 operator()(__m256 a, __m256 b){
__m256 ymm0;
ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
return _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
}
inline __m256d operator()(__m256d a, __m256d b){
__m256d ymm0;
ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
return _mm256_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
}
};
struct MaddRealPart{
inline __m256 operator()(__m256 a, __m256 b, __m256 c){
__m256 ymm0 = _mm256_moveldup_ps(a); // ymm0 <- ar ar,
_mm256_add_ps(_mm256_mul_ps( ymm0, b),c);
}
inline __m256d operator()(__m256d a, __m256d b, __m256d c){
__m256d ymm0 = _mm256_shuffle_pd( a, a, 0x0 );
return _mm256_add_pd(_mm256_mul_pd( ymm0, b),c);
}
};
struct MultComplex{
// Complex float
inline __m256 operator()(__m256 a, __m256 b){
@ -627,7 +650,9 @@ namespace Optimization {
typedef Optimization::Sub SubSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;

View File

@ -189,6 +189,29 @@ namespace Optimization {
// 2mul,4 mac +add+sub = 8 flop type insns
// 3shuf + 2 (+shuf) = 5/6 simd perm and 1/2 the load.
struct MultRealPart{
inline __m512 operator()(__m512 a, __m512 b){
__m512 ymm0;
ymm0 = _mm512_moveldup_ps(a); // ymm0 <- ar ar,
return _mm512_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
}
inline __m512d operator()(__m512d a, __m512d b){
__m512d ymm0;
ymm0 = _mm512_shuffle_pd(a,a,0x00); // ymm0 <- ar ar, ar,ar b'00,00
return _mm512_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
}
};
struct MaddRealPart{
inline __m512 operator()(__m512 a, __m512 b, __m512 c){
__m512 ymm0 = _mm512_moveldup_ps(a); // ymm0 <- ar ar,
return _mm512_fmadd_ps( ymm0, b, c);
}
inline __m512d operator()(__m512d a, __m512d b, __m512d c){
__m512d ymm0 = _mm512_shuffle_pd( a, a, 0x00 );
return _mm512_fmadd_pd( ymm0, b, c);
}
};
struct MultComplex{
// Complex float
inline __m512 operator()(__m512 a, __m512 b){
@ -501,6 +524,8 @@ namespace Optimization {
typedef Optimization::Mult MultSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;

View File

@ -224,6 +224,21 @@ namespace Optimization {
#define cmul(a, b, c, i)\
c[i] = a[i]*b[i] - a[i+1]*b[i+1];\
c[i+1] = a[i]*b[i+1] + a[i+1]*b[i];
struct MultRealPart{
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
VECTOR_FOR(i, W<T>::c, 1)
{
out.v[2*i] = a[2*i]*b[2*i];
out.v[2*i+1] = a[2*i]*b[2*i+1];
}
return out;
};
};
struct MultComplex{
// Complex
@ -456,6 +471,7 @@ namespace Optimization {
typedef Optimization::Div DivSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;

View File

@ -220,6 +220,14 @@ namespace Optimization {
}
};
struct MultRealPart{
// Complex double
inline vector4double operator()(vector4double a, vector4double b){
// return vec_xmul(b, a);
return vec_xmul(a, b);
}
FLOAT_WRAP_2(operator(), inline)
};
struct MultComplex{
// Complex double
inline vector4double operator()(vector4double a, vector4double b){
@ -430,6 +438,7 @@ typedef Optimization::Sub SubSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;

View File

@ -177,6 +177,29 @@ namespace Optimization {
}
};
struct MultRealPart{
inline __m128 operator()(__m128 a, __m128 b){
__m128 ymm0;
ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
return _mm_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
}
inline __m128d operator()(__m128d a, __m128d b){
__m128d ymm0;
ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
return _mm_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
}
};
struct MaddRealPart{
inline __m128 operator()(__m128 a, __m128 b, __m128 c){
__m128 ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
_mm_add_ps(_mm_mul_ps( ymm0, b),c);
}
inline __m128d operator()(__m128d a, __m128d b, __m128 c){
__m128d ymm0 = _mm_shuffle_pd( a, a, 0x0 );
return _mm_add_pd(_mm_mul_pd( ymm0, b),c);
}
};
struct MultComplex{
// Complex float
inline __m128 operator()(__m128 a, __m128 b){
@ -415,6 +438,8 @@ namespace Optimization {
typedef Optimization::Div DivSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;

View File

@ -101,6 +101,11 @@ template <typename T> using IfNotInteger = Invoke<std::enable_if<!std::is_integr
// general forms to allow for vsplat syntax
// need explicit declaration of types when used since
// clang cannot automatically determine the output type sometimes
template <class Out, class Input1, class Input2, class Input3, class Operation>
Out trinary(Input1 src_1, Input2 src_2, Input3 src_3, Operation op) {
return op(src_1, src_2, src_3);
}
template <class Out, class Input1, class Input2, class Operation>
Out binary(Input1 src_1, Input2 src_2, Operation op) {
return op(src_1, src_2);
@ -178,6 +183,7 @@ class Grid_simd {
const Grid_simd *__restrict__ r) {
*y = (*l) * (*r);
}
friend inline void sub(Grid_simd *__restrict__ y,
const Grid_simd *__restrict__ l,
const Grid_simd *__restrict__ r) {
@ -188,7 +194,6 @@ class Grid_simd {
const Grid_simd *__restrict__ r) {
*y = (*l) + (*r);
}
friend inline void mac(Grid_simd *__restrict__ y,
const Scalar_type *__restrict__ a,
const Grid_simd *__restrict__ x) {
@ -260,7 +265,7 @@ class Grid_simd {
}
////////////////////////////
// opreator scalar * simd
// operator scalar * simd
////////////////////////////
friend inline Grid_simd operator*(const Scalar_type &a, Grid_simd b) {
Grid_simd va;
@ -433,6 +438,11 @@ inline void vbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
S* typepun =(S*) &src;
vsplat(ret,typepun[lane]);
}
template <class S, class V, IfComplex<S> =0>
inline void rbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
S* typepun =(S*) &src;
ret.v = unary<V>(real(typepun[lane]), VsplatSIMD());
}
///////////////////////
// Splat
@ -449,6 +459,10 @@ template <class S, class V>
inline void vsplat(Grid_simd<S, V> &ret, EnableIf<is_complex<S>, S> c) {
vsplat(ret, real(c), imag(c));
}
template <class S, class V>
inline void rsplat(Grid_simd<S, V> &ret, EnableIf<is_complex<S>, S> c) {
vsplat(ret, real(c), real(c));
}
// if real fill with a, if complex fill with a in the real part (first function
// above)
@ -550,6 +564,21 @@ inline Grid_simd<S, V> operator-(Grid_simd<S, V> a, Grid_simd<S, V> b) {
return ret;
};
// Distinguish between complex types and others
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> real_mult(Grid_simd<S, V> a, Grid_simd<S, V> b) {
Grid_simd<S, V> ret;
ret.v = binary<V>(a.v, b.v, MultRealPartSIMD());
return ret;
};
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> real_madd(Grid_simd<S, V> a, Grid_simd<S, V> b, Grid_simd<S,V> c) {
Grid_simd<S, V> ret;
ret.v = trinary<V>(a.v, b.v, c.v, MaddRealPartSIMD());
return ret;
};
// Distinguish between complex types and others
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> operator*(Grid_simd<S, V> a, Grid_simd<S, V> b) {

View File

@ -95,10 +95,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define VIDUPd(SRC,DEST) "vpshufd $0xee," #SRC"," #DEST ";\n" // 32 bit level: 3,2,3,2
#define VIDUPf(SRC,DEST) "vmovshdup " #SRC ", " #DEST ";\n"
#define VBCASTRDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST ";\n"
#define VBCASTIDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST ";\n"
#define VBCASTRDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST ";\n"
#define VBCASTIDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST ";\n"
#define VBCASTRDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST ";\n"
#define VBCASTIDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST ";\n"
#define VBCASTRDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST ";\n"
#define VBCASTIDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST ";\n"
#define VBCASTCDUPf(OFF,A,DEST) "vbroadcastsd (" #OFF "*64 )(" #A ")," #DEST ";\n"
#define VBCASTZDUPf(OFF,A,DEST) "vbroadcastf32x4 (" #OFF "*64 )(" #A ")," #DEST ";\n"
#define VBCASTCDUP(OFF,A,DEST) VBCASTCDUPf(OFF,A,DEST)
#define VBCASTZDUP(OFF,A,DEST) VBCASTZDUPf(OFF,A,DEST)
#define VMADDSUBf(A,B,accum) "vfmaddsub231ps " #A "," #B "," #accum ";\n"
#define VMADDSUBd(A,B,accum) "vfmaddsub231pd " #A "," #B "," #accum ";\n"
@ -106,11 +110,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define VMADDSUBMEMd(O,P,B,accum) "vfmaddsub231pd " #O"*64("#P "),"#B "," #accum ";\n"
#define VMADDRDUPf(O,P,B,accum) "vfmadd231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
#define VMADDIDUPf(O,P,B,accum) "vfmadd231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
#define VMADDSUBRDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
#define VMADDSUBIDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
#define VMULRDUPf(O,P,B,accum) "vmulps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
#define VMULIDUPf(O,P,B,accum) "vmulps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
#define VMADDRDUPd(O,P,B,accum) "vfmadd231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
#define VMADDIDUPd(O,P,B,accum) "vfmadd231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
#define VMADDSUBRDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
#define VMADDSUBIDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
#define VMULRDUPd(O,P,B,accum) "vmulpd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"

View File

@ -87,7 +87,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
VACCTIMESMINUSI1d(A,ACC,tmp) \
VACCTIMESMINUSI2d(A,ACC,tmp)
#define LOAD64i(A,ptr) __asm__ ( "movq %0, %" #A : : "r"(ptr) : #A );
#define LOAD64a(A,ptr) "movq %0, %" #A : : "r"(ptr) : #A
#define LOAD64i(A,ptr) __asm__ ( LOAD64a(A,ptr));
#define LOAD64(A,ptr) LOAD64i(A,ptr)
#define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n"
@ -108,8 +109,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
// "clevict0 "#O"*64("#A");\n"
#define VLOADf(OFF,PTR,DEST) "vmovaps " #OFF "*64(" #PTR "), " #DEST ";\n"
#define VLOADd(OFF,PTR,DEST) "vmovapd " #OFF "*64(" #PTR "), " #DEST ";\n"
#define VLOADf(OFF,PTR,DEST) "vmovups " #OFF "*64(" #PTR "), " #DEST ";\n"
#define VLOADd(OFF,PTR,DEST) "vmovupd " #OFF "*64(" #PTR "), " #DEST ";\n"
#define VADDf(A,B,DEST) "vaddps " #A "," #B "," #DEST ";\n"
#define VADDd(A,B,DEST) "vaddpd " #A "," #B "," #DEST ";\n"
@ -143,8 +144,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define VSTOREf(OFF,PTR,SRC) "vmovntps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#define VSTOREd(OFF,PTR,SRC) "vmovntpd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#else
#define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#define VSTOREf(OFF,PTR,SRC) "vmovups " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#define VSTOREd(OFF,PTR,SRC) "vmovupd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#endif
// Swaps Re/Im ; could unify this with IMCI

View File

@ -144,10 +144,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMd(O,P,B,accum)
#define VMADDMEM(O,P,B,accum) VMADDMEMd(O,P,B,accum)
#define VMULMEM(O,P,B,accum) VMULMEMd(O,P,B,accum)
#undef VMADDRDUP
#undef VMADDSUBRDUP
#undef VMADDSUBIDUP
#undef VMULRDUP
#undef VMULIDUP
#define VMADDRDUP(O,P,B,accum) VMADDRDUPd(O,P,B,accum)
#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPd(O,P,B,accum)
#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPd(O,P,B,accum)
#define VMULRDUP(O,P,B,accum) VMULRDUPd(O,P,B,accum)

View File

@ -144,10 +144,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define VMADDMEM(O,P,B,accum) VMADDMEMf(O,P,B,accum)
#define VMULMEM(O,P,B,accum) VMULMEMf(O,P,B,accum)
#undef VMADDRDUP
#undef VMADDSUBRDUP
#undef VMADDSUBIDUP
#undef VMULRDUP
#undef VMULIDUP
#define VMADDRDUP(O,P,B,accum) VMADDRDUPf(O,P,B,accum)
#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPf(O,P,B,accum)
#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPf(O,P,B,accum)
#define VMULRDUP(O,P,B,accum) VMULRDUPf(O,P,B,accum)