mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 05:54:32 +00:00 
			
		
		
		
	Updates for supporting Mobius better
This commit is contained in:
		@@ -213,6 +213,29 @@ namespace Optimization {
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct MultRealPart{
 | 
			
		||||
    inline __m256 operator()(__m256 a, __m256 b){
 | 
			
		||||
      __m256 ymm0;
 | 
			
		||||
      ymm0  = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
 | 
			
		||||
      return  _mm256_mul_ps(ymm0,b);                       // ymm0 <- ar bi, ar br
 | 
			
		||||
    }
 | 
			
		||||
    inline __m256d operator()(__m256d a, __m256d b){
 | 
			
		||||
      __m256d ymm0;
 | 
			
		||||
      ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
 | 
			
		||||
      return _mm256_mul_pd(ymm0,b);      // ymm0 <- ar bi, ar br
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
  struct MaddRealPart{
 | 
			
		||||
    inline __m256 operator()(__m256 a, __m256 b, __m256 c){
 | 
			
		||||
      __m256 ymm0 =  _mm256_moveldup_ps(a); // ymm0 <- ar ar,
 | 
			
		||||
      _mm256_add_ps(_mm256_mul_ps( ymm0, b),c);                         
 | 
			
		||||
    }
 | 
			
		||||
    inline __m256d operator()(__m256d a, __m256d b, __m256d c){
 | 
			
		||||
      __m256d ymm0 = _mm256_shuffle_pd( a, a, 0x0 );
 | 
			
		||||
      return _mm256_add_pd(_mm256_mul_pd( ymm0, b),c);                         
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct MultComplex{
 | 
			
		||||
    // Complex float
 | 
			
		||||
    inline __m256 operator()(__m256 a, __m256 b){
 | 
			
		||||
@@ -628,6 +651,8 @@ namespace Optimization {
 | 
			
		||||
  typedef Optimization::Div         DivSIMD;
 | 
			
		||||
  typedef Optimization::Mult        MultSIMD;
 | 
			
		||||
  typedef Optimization::MultComplex  MultComplexSIMD;
 | 
			
		||||
  typedef Optimization::MultRealPart MultRealPartSIMD;
 | 
			
		||||
  typedef Optimization::MaddRealPart MaddRealPartSIMD;
 | 
			
		||||
  typedef Optimization::Conj        ConjSIMD;
 | 
			
		||||
  typedef Optimization::TimesMinusI TimesMinusISIMD;
 | 
			
		||||
  typedef Optimization::TimesI      TimesISIMD;
 | 
			
		||||
 
 | 
			
		||||
@@ -189,6 +189,29 @@ namespace Optimization {
 | 
			
		||||
  //  2mul,4 mac +add+sub = 8 flop type insns
 | 
			
		||||
  //  3shuf + 2 (+shuf)   = 5/6 simd perm and 1/2 the load.
 | 
			
		||||
 | 
			
		||||
  struct MultRealPart{
 | 
			
		||||
    inline __m512 operator()(__m512 a, __m512 b){
 | 
			
		||||
      __m512 ymm0;
 | 
			
		||||
      ymm0 = _mm512_moveldup_ps(a); // ymm0 <- ar ar,
 | 
			
		||||
      return _mm512_mul_ps(ymm0,b);                       // ymm0 <- ar bi, ar br
 | 
			
		||||
    }
 | 
			
		||||
    inline __m512d operator()(__m512d a, __m512d b){
 | 
			
		||||
      __m512d ymm0;
 | 
			
		||||
      ymm0 = _mm512_shuffle_pd(a,a,0x00); // ymm0 <- ar ar, ar,ar b'00,00
 | 
			
		||||
      return _mm512_mul_pd(ymm0,b);      // ymm0 <- ar bi, ar br
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
  struct MaddRealPart{
 | 
			
		||||
    inline __m512 operator()(__m512 a, __m512 b, __m512 c){
 | 
			
		||||
      __m512 ymm0 =  _mm512_moveldup_ps(a); // ymm0 <- ar ar,
 | 
			
		||||
      return _mm512_fmadd_ps( ymm0, b, c);                         
 | 
			
		||||
    }
 | 
			
		||||
    inline __m512d operator()(__m512d a, __m512d b, __m512d c){
 | 
			
		||||
      __m512d ymm0 = _mm512_shuffle_pd( a, a, 0x00 );
 | 
			
		||||
      return _mm512_fmadd_pd( ymm0, b, c);                         
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct MultComplex{
 | 
			
		||||
    // Complex float
 | 
			
		||||
    inline __m512 operator()(__m512 a, __m512 b){
 | 
			
		||||
@@ -501,6 +524,8 @@ namespace Optimization {
 | 
			
		||||
  typedef Optimization::Mult        MultSIMD;
 | 
			
		||||
  typedef Optimization::Div         DivSIMD;
 | 
			
		||||
  typedef Optimization::MultComplex MultComplexSIMD;
 | 
			
		||||
  typedef Optimization::MultRealPart MultRealPartSIMD;
 | 
			
		||||
  typedef Optimization::MaddRealPart MaddRealPartSIMD;
 | 
			
		||||
  typedef Optimization::Conj        ConjSIMD;
 | 
			
		||||
  typedef Optimization::TimesMinusI TimesMinusISIMD;
 | 
			
		||||
  typedef Optimization::TimesI      TimesISIMD;
 | 
			
		||||
 
 | 
			
		||||
@@ -225,6 +225,21 @@ namespace Optimization {
 | 
			
		||||
  c[i]   = a[i]*b[i]   - a[i+1]*b[i+1];\
 | 
			
		||||
  c[i+1] = a[i]*b[i+1] + a[i+1]*b[i];
 | 
			
		||||
 | 
			
		||||
  struct MultRealPart{
 | 
			
		||||
    template <typename T>
 | 
			
		||||
    inline vec<T> operator()(vec<T> a, vec<T> b){
 | 
			
		||||
      vec<T> out;
 | 
			
		||||
      
 | 
			
		||||
      VECTOR_FOR(i, W<T>::c, 1)
 | 
			
		||||
      {
 | 
			
		||||
         out.v[2*i]   = a[2*i]*b[2*i];
 | 
			
		||||
         out.v[2*i+1] = a[2*i]*b[2*i+1];
 | 
			
		||||
      }      
 | 
			
		||||
      return out;
 | 
			
		||||
    };
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
  struct MultComplex{
 | 
			
		||||
    // Complex
 | 
			
		||||
    template <typename T>
 | 
			
		||||
@@ -456,6 +471,7 @@ namespace Optimization {
 | 
			
		||||
  typedef Optimization::Div         DivSIMD;
 | 
			
		||||
  typedef Optimization::Mult        MultSIMD;
 | 
			
		||||
  typedef Optimization::MultComplex MultComplexSIMD;
 | 
			
		||||
  typedef Optimization::MultRealPart MultRealPartSIMD;
 | 
			
		||||
  typedef Optimization::Conj        ConjSIMD;
 | 
			
		||||
  typedef Optimization::TimesMinusI TimesMinusISIMD;
 | 
			
		||||
  typedef Optimization::TimesI      TimesISIMD;
 | 
			
		||||
 
 | 
			
		||||
@@ -220,6 +220,14 @@ namespace Optimization {
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
  
 | 
			
		||||
  struct MultRealPart{
 | 
			
		||||
    // Complex double
 | 
			
		||||
    inline vector4double operator()(vector4double a, vector4double b){
 | 
			
		||||
  //      return vec_xmul(b, a);
 | 
			
		||||
        return vec_xmul(a, b);
 | 
			
		||||
    }
 | 
			
		||||
    FLOAT_WRAP_2(operator(), inline)
 | 
			
		||||
  };
 | 
			
		||||
  struct MultComplex{
 | 
			
		||||
    // Complex double
 | 
			
		||||
    inline vector4double operator()(vector4double a, vector4double b){
 | 
			
		||||
@@ -430,6 +438,7 @@ typedef Optimization::Sub         SubSIMD;
 | 
			
		||||
typedef Optimization::Mult        MultSIMD;
 | 
			
		||||
typedef Optimization::Div         DivSIMD;
 | 
			
		||||
typedef Optimization::MultComplex MultComplexSIMD;
 | 
			
		||||
typedef Optimization::MultRealPart MultRealPartSIMD;
 | 
			
		||||
typedef Optimization::Conj        ConjSIMD;
 | 
			
		||||
typedef Optimization::TimesMinusI TimesMinusISIMD;
 | 
			
		||||
typedef Optimization::TimesI      TimesISIMD;
 | 
			
		||||
 
 | 
			
		||||
@@ -177,6 +177,29 @@ namespace Optimization {
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct MultRealPart{
 | 
			
		||||
    inline __m128 operator()(__m128 a, __m128 b){
 | 
			
		||||
      __m128 ymm0;
 | 
			
		||||
      ymm0  = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
 | 
			
		||||
      return  _mm_mul_ps(ymm0,b);                       // ymm0 <- ar bi, ar br
 | 
			
		||||
    }
 | 
			
		||||
    inline __m128d operator()(__m128d a, __m128d b){
 | 
			
		||||
      __m128d ymm0;
 | 
			
		||||
      ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
 | 
			
		||||
      return _mm_mul_pd(ymm0,b);      // ymm0 <- ar bi, ar br
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
  struct MaddRealPart{
 | 
			
		||||
    inline __m128 operator()(__m128 a, __m128 b, __m128 c){
 | 
			
		||||
      __m128 ymm0 =  _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
 | 
			
		||||
      _mm_add_ps(_mm_mul_ps( ymm0, b),c);                         
 | 
			
		||||
    }
 | 
			
		||||
    inline __m128d operator()(__m128d a, __m128d b, __m128 c){
 | 
			
		||||
      __m128d ymm0 = _mm_shuffle_pd( a, a, 0x0 );
 | 
			
		||||
      return _mm_add_pd(_mm_mul_pd( ymm0, b),c);                         
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct MultComplex{
 | 
			
		||||
    // Complex float
 | 
			
		||||
    inline __m128 operator()(__m128 a, __m128 b){
 | 
			
		||||
@@ -415,6 +438,8 @@ namespace Optimization {
 | 
			
		||||
  typedef Optimization::Div         DivSIMD;
 | 
			
		||||
  typedef Optimization::Mult        MultSIMD;
 | 
			
		||||
  typedef Optimization::MultComplex MultComplexSIMD;
 | 
			
		||||
  typedef Optimization::MultRealPart MultRealPartSIMD;
 | 
			
		||||
  typedef Optimization::MaddRealPart MaddRealPartSIMD;
 | 
			
		||||
  typedef Optimization::Conj        ConjSIMD;
 | 
			
		||||
  typedef Optimization::TimesMinusI TimesMinusISIMD;
 | 
			
		||||
  typedef Optimization::TimesI      TimesISIMD;
 | 
			
		||||
 
 | 
			
		||||
@@ -101,6 +101,11 @@ template <typename T> using IfNotInteger = Invoke<std::enable_if<!std::is_integr
 | 
			
		||||
// general forms to allow for vsplat syntax
 | 
			
		||||
// need explicit declaration of types when used since
 | 
			
		||||
// clang cannot automatically determine the output type sometimes
 | 
			
		||||
template <class Out, class Input1, class Input2, class Input3, class Operation>
 | 
			
		||||
Out trinary(Input1 src_1, Input2 src_2, Input3 src_3, Operation op) {
 | 
			
		||||
  return op(src_1, src_2, src_3);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Out, class Input1, class Input2, class Operation>
 | 
			
		||||
Out binary(Input1 src_1, Input2 src_2, Operation op) {
 | 
			
		||||
  return op(src_1, src_2);
 | 
			
		||||
@@ -178,6 +183,7 @@ class Grid_simd {
 | 
			
		||||
                          const Grid_simd *__restrict__ r) {
 | 
			
		||||
    *y = (*l) * (*r);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  friend inline void sub(Grid_simd *__restrict__ y,
 | 
			
		||||
                         const Grid_simd *__restrict__ l,
 | 
			
		||||
                         const Grid_simd *__restrict__ r) {
 | 
			
		||||
@@ -188,7 +194,6 @@ class Grid_simd {
 | 
			
		||||
                         const Grid_simd *__restrict__ r) {
 | 
			
		||||
    *y = (*l) + (*r);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  friend inline void mac(Grid_simd *__restrict__ y,
 | 
			
		||||
                         const Scalar_type *__restrict__ a,
 | 
			
		||||
                         const Grid_simd *__restrict__ x) {
 | 
			
		||||
@@ -260,7 +265,7 @@ class Grid_simd {
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////
 | 
			
		||||
  // opreator scalar * simd
 | 
			
		||||
  // operator scalar * simd
 | 
			
		||||
  ////////////////////////////
 | 
			
		||||
  friend inline Grid_simd operator*(const Scalar_type &a, Grid_simd b) {
 | 
			
		||||
    Grid_simd va;
 | 
			
		||||
@@ -433,6 +438,11 @@ inline void vbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
 | 
			
		||||
  S* typepun =(S*) &src;
 | 
			
		||||
  vsplat(ret,typepun[lane]);
 | 
			
		||||
}    
 | 
			
		||||
template <class S, class V, IfComplex<S> =0> 
 | 
			
		||||
inline void rbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
 | 
			
		||||
  S* typepun =(S*) &src;
 | 
			
		||||
  ret.v = unary<V>(real(typepun[lane]), VsplatSIMD());
 | 
			
		||||
}    
 | 
			
		||||
 | 
			
		||||
///////////////////////
 | 
			
		||||
// Splat
 | 
			
		||||
@@ -449,6 +459,10 @@ template <class S, class V>
 | 
			
		||||
inline void vsplat(Grid_simd<S, V> &ret, EnableIf<is_complex<S>, S> c) {
 | 
			
		||||
  vsplat(ret, real(c), imag(c));
 | 
			
		||||
}
 | 
			
		||||
template <class S, class V>
 | 
			
		||||
inline void rsplat(Grid_simd<S, V> &ret, EnableIf<is_complex<S>, S> c) {
 | 
			
		||||
  vsplat(ret, real(c), real(c));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// if real fill with a, if complex fill with a in the real part (first function
 | 
			
		||||
// above)
 | 
			
		||||
@@ -550,6 +564,21 @@ inline Grid_simd<S, V> operator-(Grid_simd<S, V> a, Grid_simd<S, V> b) {
 | 
			
		||||
  return ret;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
// Distinguish between complex types and others
 | 
			
		||||
template <class S, class V, IfComplex<S> = 0>
 | 
			
		||||
inline Grid_simd<S, V> real_mult(Grid_simd<S, V> a, Grid_simd<S, V> b) {
 | 
			
		||||
  Grid_simd<S, V> ret;
 | 
			
		||||
  ret.v = binary<V>(a.v, b.v, MultRealPartSIMD());
 | 
			
		||||
  return ret;
 | 
			
		||||
};
 | 
			
		||||
template <class S, class V, IfComplex<S> = 0>
 | 
			
		||||
inline Grid_simd<S, V> real_madd(Grid_simd<S, V> a, Grid_simd<S, V> b, Grid_simd<S,V> c) {
 | 
			
		||||
  Grid_simd<S, V> ret;
 | 
			
		||||
  ret.v = trinary<V>(a.v, b.v, c.v, MaddRealPartSIMD());
 | 
			
		||||
  return ret;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
// Distinguish between complex types and others
 | 
			
		||||
template <class S, class V, IfComplex<S> = 0>
 | 
			
		||||
inline Grid_simd<S, V> operator*(Grid_simd<S, V> a, Grid_simd<S, V> b) {
 | 
			
		||||
 
 | 
			
		||||
@@ -99,6 +99,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#define VBCASTIDUPd(OFF,A,DEST)           "vbroadcastsd    (" #OFF "*16+8)(" #A ")," #DEST  ";\n" 
 | 
			
		||||
#define VBCASTRDUPf(OFF,PTR,DEST)         "vbroadcastss    (" #OFF "*8 +0)(" #PTR "), " #DEST  ";\n"
 | 
			
		||||
#define VBCASTIDUPf(OFF,PTR,DEST)         "vbroadcastss    (" #OFF "*8 +4)(" #PTR "), " #DEST  ";\n"
 | 
			
		||||
#define VBCASTCDUPf(OFF,A,DEST)           "vbroadcastsd    (" #OFF "*64  )(" #A ")," #DEST  ";\n" 
 | 
			
		||||
#define VBCASTZDUPf(OFF,A,DEST)           "vbroadcastf32x4 (" #OFF "*64  )(" #A ")," #DEST  ";\n" 
 | 
			
		||||
#define VBCASTCDUP(OFF,A,DEST) VBCASTCDUPf(OFF,A,DEST) 
 | 
			
		||||
#define VBCASTZDUP(OFF,A,DEST) VBCASTZDUPf(OFF,A,DEST) 
 | 
			
		||||
 | 
			
		||||
#define VMADDSUBf(A,B,accum) "vfmaddsub231ps   " #A "," #B "," #accum  ";\n"
 | 
			
		||||
#define VMADDSUBd(A,B,accum) "vfmaddsub231pd   " #A "," #B "," #accum  ";\n"
 | 
			
		||||
@@ -106,11 +110,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#define VMADDSUBMEMd(O,P,B,accum) "vfmaddsub231pd   " #O"*64("#P "),"#B "," #accum  ";\n"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define VMADDRDUPf(O,P,B,accum) "vfmadd231ps   (" #O"*8+0)("#P "){1to16},"#B "," #accum  ";\n"
 | 
			
		||||
#define VMADDIDUPf(O,P,B,accum) "vfmadd231ps   (" #O"*8+4)("#P "){1to16},"#B "," #accum  ";\n"
 | 
			
		||||
#define VMADDSUBRDUPf(O,P,B,accum) "vfmaddsub231ps   (" #O"*8+0)("#P "){1to16},"#B "," #accum  ";\n"
 | 
			
		||||
#define VMADDSUBIDUPf(O,P,B,accum) "vfmaddsub231ps   (" #O"*8+4)("#P "){1to16},"#B "," #accum  ";\n"
 | 
			
		||||
#define VMULRDUPf(O,P,B,accum) "vmulps   (" #O"*8+0)("#P "){1to16},"#B "," #accum  ";\n"
 | 
			
		||||
#define VMULIDUPf(O,P,B,accum) "vmulps   (" #O"*8+4)("#P "){1to16},"#B "," #accum  ";\n"
 | 
			
		||||
 | 
			
		||||
#define VMADDRDUPd(O,P,B,accum) "vfmadd231pd   (" #O"*16+0)("#P "){1to8},"#B "," #accum  ";\n"
 | 
			
		||||
#define VMADDIDUPd(O,P,B,accum) "vfmadd231pd   (" #O"*16+8)("#P "){1to8},"#B "," #accum  ";\n"
 | 
			
		||||
#define VMADDSUBRDUPd(O,P,B,accum) "vfmaddsub231pd   (" #O"*16+0)("#P "){1to8},"#B "," #accum  ";\n"
 | 
			
		||||
#define VMADDSUBIDUPd(O,P,B,accum) "vfmaddsub231pd   (" #O"*16+8)("#P "){1to8},"#B "," #accum  ";\n"
 | 
			
		||||
#define VMULRDUPd(O,P,B,accum) "vmulpd   (" #O"*16+0)("#P "){1to8},"#B "," #accum  ";\n"
 | 
			
		||||
 
 | 
			
		||||
@@ -87,7 +87,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
  VACCTIMESMINUSI1d(A,ACC,tmp)				\
 | 
			
		||||
  VACCTIMESMINUSI2d(A,ACC,tmp)			
 | 
			
		||||
 | 
			
		||||
#define LOAD64i(A,ptr)  __asm__ ( "movq %0, %" #A :  : "r"(ptr)  : #A  );
 | 
			
		||||
#define LOAD64a(A,ptr)  "movq %0, %" #A :  : "r"(ptr)  : #A  
 | 
			
		||||
#define LOAD64i(A,ptr)  __asm__ ( LOAD64a(A,ptr));
 | 
			
		||||
#define LOAD64(A,ptr)  LOAD64i(A,ptr)
 | 
			
		||||
 | 
			
		||||
#define VMOVf(A,DEST)   "vmovaps  " #A ", " #DEST  ";\n"
 | 
			
		||||
@@ -108,8 +109,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
 | 
			
		||||
//  "clevict0 "#O"*64("#A");\n" 
 | 
			
		||||
 | 
			
		||||
#define VLOADf(OFF,PTR,DEST)   "vmovaps  " #OFF "*64(" #PTR "), " #DEST  ";\n"
 | 
			
		||||
#define VLOADd(OFF,PTR,DEST)   "vmovapd  " #OFF "*64(" #PTR "), " #DEST  ";\n"
 | 
			
		||||
#define VLOADf(OFF,PTR,DEST)   "vmovups  " #OFF "*64(" #PTR "), " #DEST  ";\n"
 | 
			
		||||
#define VLOADd(OFF,PTR,DEST)   "vmovupd  " #OFF "*64(" #PTR "), " #DEST  ";\n"
 | 
			
		||||
 | 
			
		||||
#define VADDf(A,B,DEST)        "vaddps   " #A "," #B "," #DEST  ";\n"
 | 
			
		||||
#define VADDd(A,B,DEST)        "vaddpd   " #A "," #B "," #DEST  ";\n"
 | 
			
		||||
@@ -143,8 +144,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#define VSTOREf(OFF,PTR,SRC)   "vmovntps " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
 | 
			
		||||
#define VSTOREd(OFF,PTR,SRC)   "vmovntpd " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
 | 
			
		||||
#else
 | 
			
		||||
#define VSTOREf(OFF,PTR,SRC)   "vmovaps " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
 | 
			
		||||
#define VSTOREd(OFF,PTR,SRC)   "vmovapd " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
 | 
			
		||||
#define VSTOREf(OFF,PTR,SRC)   "vmovups " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
 | 
			
		||||
#define VSTOREd(OFF,PTR,SRC)   "vmovupd " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
// Swaps Re/Im ; could unify this with IMCI
 | 
			
		||||
 
 | 
			
		||||
@@ -144,10 +144,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMd(O,P,B,accum)
 | 
			
		||||
#define VMADDMEM(O,P,B,accum)    VMADDMEMd(O,P,B,accum)
 | 
			
		||||
#define VMULMEM(O,P,B,accum)     VMULMEMd(O,P,B,accum)
 | 
			
		||||
#undef VMADDRDUP   
 | 
			
		||||
#undef VMADDSUBRDUP   
 | 
			
		||||
#undef VMADDSUBIDUP   
 | 
			
		||||
#undef VMULRDUP   
 | 
			
		||||
#undef VMULIDUP   
 | 
			
		||||
#define VMADDRDUP(O,P,B,accum)    VMADDRDUPd(O,P,B,accum) 
 | 
			
		||||
#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPd(O,P,B,accum) 
 | 
			
		||||
#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPd(O,P,B,accum) 
 | 
			
		||||
#define VMULRDUP(O,P,B,accum)     VMULRDUPd(O,P,B,accum)      
 | 
			
		||||
 
 | 
			
		||||
@@ -144,10 +144,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#define VMADDMEM(O,P,B,accum) VMADDMEMf(O,P,B,accum)
 | 
			
		||||
#define VMULMEM(O,P,B,accum) VMULMEMf(O,P,B,accum)
 | 
			
		||||
 | 
			
		||||
#undef VMADDRDUP   
 | 
			
		||||
#undef VMADDSUBRDUP   
 | 
			
		||||
#undef VMADDSUBIDUP   
 | 
			
		||||
#undef VMULRDUP   
 | 
			
		||||
#undef VMULIDUP   
 | 
			
		||||
#define VMADDRDUP(O,P,B,accum)    VMADDRDUPf(O,P,B,accum) 
 | 
			
		||||
#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPf(O,P,B,accum) 
 | 
			
		||||
#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPf(O,P,B,accum) 
 | 
			
		||||
#define VMULRDUP(O,P,B,accum)     VMULRDUPf(O,P,B,accum)      
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user