diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index b836e757..f33bdf9c 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -410,22 +410,22 @@ namespace Optimization { struct Permute{ static inline __m256 Permute0(__m256 in){ - return _mm256_permute2f128_ps(in,in,0x01); + return _mm256_permute2f128_ps(in,in,0x01); //ABCD EFGH -> EFGH ABCD }; static inline __m256 Permute1(__m256 in){ - return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); + return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //ABCD EFGH -> CDAB GHEF }; static inline __m256 Permute2(__m256 in){ - return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); + return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //ABCD EFGH -> BADC FEHG }; static inline __m256 Permute3(__m256 in){ return in; }; static inline __m256d Permute0(__m256d in){ - return _mm256_permute2f128_pd(in,in,0x01); + return _mm256_permute2f128_pd(in,in,0x01); //AB CD -> CD AB }; - static inline __m256d Permute1(__m256d in){ + static inline __m256d Permute1(__m256d in){ //AB CD -> BA DC return _mm256_shuffle_pd(in,in,0x5); }; static inline __m256d Permute2(__m256d in){ diff --git a/lib/simd/Grid_empty.h b/lib/simd/Grid_empty.h index 8858624e..5ab75de7 100644 --- a/lib/simd/Grid_empty.h +++ b/lib/simd/Grid_empty.h @@ -55,51 +55,67 @@ namespace Optimization { struct Vsplat{ //Complex float - inline float operator()(float a, float b){ - return 0; + inline u128f operator()(float a, float b){ + u128f out; + out.f[0] = a; + out.f[1] = b; + out.f[2] = a; + out.f[3] = b; + return out; } // Real float - inline float operator()(float a){ - return 0; + inline u128f operator()(float a){ + u128f out; + out.f[0] = a; + out.f[1] = a; + out.f[2] = a; + out.f[3] = a; + return out; } //Complex double - inline double operator()(double a, double b){ - return 0; + inline u128d operator()(double a, double b){ + u128d out; + out.f[0] = a; + out.f[1] = b; + return out; } //Real double - inline double operator()(double a){ - return 0; + inline u128d operator()(double a){ + u128d out; + out.f[0] = a; + out.f[1] = a; + return out; } //Integer inline int operator()(Integer a){ - return 0; + return a; } }; struct Vstore{ //Float - inline void operator()(float a, float* F){ - + inline void operator()(u128f a, float* F){ + memcpy(F,a.f,4*sizeof(float)); } //Double - inline void operator()(double a, double* D){ - + inline void operator()(u128d a, double* D){ + memcpy(D,a.f,2*sizeof(double)); } //Integer inline void operator()(int a, Integer* I){ - + I[0] = a; } }; struct Vstream{ //Float - inline void operator()(float * a, float b){ - + inline void operator()(float * a, u128f b){ + memcpy(a,b.f,4*sizeof(float)); } //Double - inline void operator()(double * a, double b){ - + inline void operator()(double * a, u128d b){ + memcpy(a,b.f,2*sizeof(double)); } @@ -107,24 +123,40 @@ namespace Optimization { struct Vset{ // Complex float - inline float operator()(Grid::ComplexF *a){ - return 0; + inline u128f operator()(Grid::ComplexF *a){ + u128f out; + out.f[0] = a[0].real(); + out.f[1] = a[0].imag(); + out.f[2] = a[1].real(); + out.f[3] = a[1].imag(); + return out; } // Complex double - inline double operator()(Grid::ComplexD *a){ - return 0; + inline u128d operator()(Grid::ComplexD *a){ + u128d out; + out.f[0] = a[0].real(); + out.f[1] = a[0].imag(); + return out; } // Real float - inline float operator()(float *a){ - return 0; + inline u128f operator()(float *a){ + u128f out; + out.f[0] = a[0]; + out.f[1] = a[1]; + out.f[2] = a[2]; + out.f[3] = a[3]; + return out; } // Real double - inline double operator()(double *a){ - return 0; + inline u128d operator()(double *a){ + u128d out; + out.f[0] = a[0]; + out.f[1] = a[1]; + return out; } // Integer inline int operator()(Integer *a){ - return 0; + return a[0]; } @@ -146,129 +178,198 @@ namespace Optimization { ///////////////////////////////////////////////////// struct Sum{ //Complex/Real float - inline float operator()(float a, float b){ - return 0; + inline u128f operator()(u128f a, u128f b){ + u128f out; + out.f[0] = a.f[0] + b.f[0]; + out.f[1] = a.f[1] + b.f[1]; + out.f[2] = a.f[2] + b.f[2]; + out.f[3] = a.f[3] + b.f[3]; + return out; } //Complex/Real double - inline double operator()(double a, double b){ - return 0; + inline u128d operator()(u128d a, u128d b){ + u128d out; + out.f[0] = a.f[0] + b.f[0]; + out.f[1] = a.f[1] + b.f[1]; + return out; } //Integer inline int operator()(int a, int b){ - return 0; + return a + b; } }; struct Sub{ //Complex/Real float - inline float operator()(float a, float b){ - return 0; + inline u128f operator()(u128f a, u128f b){ + u128f out; + out.f[0] = a.f[0] - b.f[0]; + out.f[1] = a.f[1] - b.f[1]; + out.f[2] = a.f[2] - b.f[2]; + out.f[3] = a.f[3] - b.f[3]; + return out; } //Complex/Real double - inline double operator()(double a, double b){ - return 0; + inline u128d operator()(u128d a, u128d b){ + u128d out; + out.f[0] = a.f[0] - b.f[0]; + out.f[1] = a.f[1] - b.f[1]; + return out; } //Integer inline int operator()(int a, int b){ - return 0; + return a-b; } }; struct MultComplex{ // Complex float - inline float operator()(float a, float b){ - return 0; + inline u128f operator()(u128f a, u128f b){ + u128f out; + out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1]; + out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0]; + out.f[2] = a.f[2]*b.f[2] - a.f[3]*b.f[3]; + out.f[3] = a.f[2]*b.f[3] + a.f[3]*b.f[2]; + return out; } // Complex double - inline double operator()(double a, double b){ - return 0; + inline u128d operator()(u128d a, u128d b){ + u128d out; + out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1]; + out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0]; + return out; } }; struct Mult{ - inline float mac(float a, float b,double c){ - return 0; - } - inline double mac(double a, double b,double c){ - return 0; - } + //CK: Appear unneeded + // inline float mac(float a, float b,double c){ + // return 0; + // } + // inline double mac(double a, double b,double c){ + // return 0; + // } + // Real float - inline float operator()(float a, float b){ - return 0; + inline u128f operator()(u128f a, u128f b){ + u128f out; + out.f[0] = a.f[0]*b.f[0]; + out.f[1] = a.f[1]*b.f[1]; + out.f[2] = a.f[2]*b.f[2]; + out.f[3] = a.f[3]*b.f[3]; + return out; } // Real double - inline double operator()(double a, double b){ - return 0; + inline u128d operator()(u128d a, u128d b){ + u128d out; + out.f[0] = a.f[0]*b.f[0]; + out.f[1] = a.f[1]*b.f[1]; + return out; } // Integer inline int operator()(int a, int b){ - return 0; + return a*b; } }; struct Conj{ // Complex single - inline float operator()(float in){ - return 0; + inline u128f operator()(u128f in){ + u128f out; + out.f[0] = in.f[0]; + out.f[1] = -in.f[1]; + out.f[2] = in.f[2]; + out.f[3] = -in.f[3]; + return out; } // Complex double - inline double operator()(double in){ - return 0; + inline u128d operator()(u128d in){ + u128d out; + out.f[0] = in.f[0]; + out.f[1] = -in.f[1]; + return out; } // do not define for integer input }; struct TimesMinusI{ //Complex single - inline float operator()(float in, float ret){ - return 0; + inline u128f operator()(u128f in, u128f ret){ //note ret is ignored + u128f out; + out.f[0] = in.f[1]; + out.f[1] = -in.f[0]; + out.f[2] = in.f[3]; + out.f[3] = -in.f[2]; + return out; } //Complex double - inline double operator()(double in, double ret){ - return 0; + inline u128d operator()(u128d in, u128d ret){ + u128d out; + out.f[0] = in.f[1]; + out.f[1] = -in.f[0]; + return out; } - - }; struct TimesI{ //Complex single - inline float operator()(float in, float ret){ - return 0; + inline u128f operator()(u128f in, u128f ret){ //note ret is ignored + u128f out; + out.f[0] = -in.f[1]; + out.f[1] = in.f[0]; + out.f[2] = -in.f[3]; + out.f[3] = in.f[2]; + return out; } //Complex double - inline double operator()(double in, double ret){ - return 0; + inline u128d operator()(u128d in, u128d ret){ + u128d out; + out.f[0] = -in.f[1]; + out.f[1] = in.f[0]; + return out; } }; ////////////////////////////////////////////// // Some Template specialization struct Permute{ - - static inline float Permute0(float in){ + //We just have to mirror the permutes of Grid_sse4.h + static inline u128f Permute0(u128f in){ //AB CD -> CD AB + u128f out; + out.f[0] = in.f[2]; + out.f[1] = in.f[3]; + out.f[2] = in.f[0]; + out.f[3] = in.f[1]; + return out; + }; + static inline u128f Permute1(u128f in){ //AB CD -> BA DC + u128f out; + out.f[0] = in.f[1]; + out.f[1] = in.f[0]; + out.f[2] = in.f[3]; + out.f[3] = in.f[2]; + return out; + }; + static inline u128f Permute2(u128f in){ return in; }; - static inline float Permute1(float in){ - return in; - }; - static inline float Permute2(float in){ - return in; - }; - static inline float Permute3(float in){ + static inline u128f Permute3(u128f in){ return in; }; - static inline double Permute0(double in){ + static inline u128d Permute0(u128d in){ //AB -> BA + u128d out; + out.f[0] = in.f[1]; + out.f[1] = in.f[0]; + return out; + }; + static inline u128d Permute1(u128d in){ return in; }; - static inline double Permute1(double in){ + static inline u128d Permute2(u128d in){ return in; }; - static inline double Permute2(double in){ - return in; - }; - static inline double Permute3(double in){ + static inline u128d Permute3(u128d in){ return in; }; @@ -280,26 +381,26 @@ namespace Optimization { //Complex float Reduce template<> - inline Grid::ComplexF Reduce::operator()(float in){ - return 0; + inline Grid::ComplexF Reduce::operator()(u128f in){ //2 complex + return Grid::ComplexF(in.f[0] + in.f[2], in.f[1] + in.f[3]); } //Real float Reduce template<> - inline Grid::RealF Reduce::operator()(float in){ - return 0; + inline Grid::RealF Reduce::operator()(u128f in){ //4 floats + return in.f[0] + in.f[1] + in.f[2] + in.f[3]; } //Complex double Reduce template<> - inline Grid::ComplexD Reduce::operator()(double in){ - return 0; + inline Grid::ComplexD Reduce::operator()(u128d in){ //1 complex + return Grid::ComplexD(in.f[0],in.f[1]); } //Real double Reduce template<> - inline Grid::RealD Reduce::operator()(double in){ - return 0; + inline Grid::RealD Reduce::operator()(u128d in){ //2 doubles + return in.f[0] + in.f[1]; } //Integer Reduce @@ -314,8 +415,8 @@ namespace Optimization { ////////////////////////////////////////////////////////////////////////////////////// // Here assign types - typedef float SIMD_Ftype; // Single precision type - typedef double SIMD_Dtype; // Double precision type + typedef Optimization::u128f SIMD_Ftype; // Single precision type + typedef Optimization::u128d SIMD_Dtype; // Double precision type typedef int SIMD_Itype; // Integer type // prefetch utilities diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index b88ad4c9..8f4a9c93 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -267,10 +267,10 @@ namespace Optimization { struct Permute{ static inline __m128 Permute0(__m128 in){ - return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); + return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //AB CD -> CD AB }; static inline __m128 Permute1(__m128 in){ - return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); + return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //AB CD -> BA DC }; static inline __m128 Permute2(__m128 in){ return in; @@ -279,7 +279,7 @@ namespace Optimization { return in; }; - static inline __m128d Permute0(__m128d in){ + static inline __m128d Permute0(__m128d in){ //AB -> BA return _mm_shuffle_pd(in,in,0x1); }; static inline __m128d Permute1(__m128d in){