Mirror of https://github.com/paboyle/Grid.git

Merge remote-tracking branch 'origin/develop' into temporary-smearing

1139  lib/simd/Avx512Asm.h
File diff suppressed because it is too large
@@ -410,22 +410,22 @@ namespace Optimization {
struct Permute{
static inline __m256 Permute0(__m256 in){
return _mm256_permute2f128_ps(in,in,0x01);
return _mm256_permute2f128_ps(in,in,0x01); //ABCD EFGH -> EFGH ABCD
};
static inline __m256 Permute1(__m256 in){
return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //ABCD EFGH -> CDAB GHEF
};
static inline __m256 Permute2(__m256 in){
return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //ABCD EFGH -> BADC FEHG
};
static inline __m256 Permute3(__m256 in){
return in;
};

static inline __m256d Permute0(__m256d in){
return _mm256_permute2f128_pd(in,in,0x01);
return _mm256_permute2f128_pd(in,in,0x01); //AB CD -> CD AB
};
static inline __m256d Permute1(__m256d in){
static inline __m256d Permute1(__m256d in){ //AB CD -> BA DC
return _mm256_shuffle_pd(in,in,0x5);
};
static inline __m256d Permute2(__m256d in){
@@ -437,6 +437,111 @@ namespace Optimization {
};

#if defined (AVX2) || defined (AVXFMA4)
#define _mm256_alignr_epi32(ret,a,b,n) ret=(__m256) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*4)%16)
#define _mm256_alignr_epi64(ret,a,b,n) ret=(__m256d) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*8)%16)
#endif

#if defined (AVX1)
#define _mm256_alignr_epi32(ret,a,b,n) { \
__m128 aa, bb; \
\
aa = _mm256_extractf128_ps(a,1); \
bb = _mm256_extractf128_ps(b,1); \
aa = (__m128)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*4)%16); \
ret = _mm256_insertf128_ps(ret,aa,1); \
\
aa = _mm256_extractf128_ps(a,0); \
bb = _mm256_extractf128_ps(b,0); \
aa = (__m128)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*4)%16); \
ret = _mm256_insertf128_ps(ret,aa,0); \
}

#define _mm256_alignr_epi64(ret,a,b,n) { \
__m128d aa, bb; \
\
aa = _mm256_extractf128_pd(a,1); \
bb = _mm256_extractf128_pd(b,1); \
aa = (__m128d)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*8)%16); \
ret = _mm256_insertf128_pd(ret,aa,1); \
\
aa = _mm256_extractf128_pd(a,0); \
bb = _mm256_extractf128_pd(b,0); \
aa = (__m128d)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*8)%16); \
ret = _mm256_insertf128_pd(ret,aa,0); \
}

#endif

inline std::ostream & operator << (std::ostream& stream, const __m256 a)
{
const float *p=(const float *)&a;
stream<< "{"<<p[0]<<","<<p[1]<<","<<p[2]<<","<<p[3]<<","<<p[4]<<","<<p[5]<<","<<p[6]<<","<<p[7]<<"}";
return stream;
};
inline std::ostream & operator<< (std::ostream& stream, const __m256d a)
{
const double *p=(const double *)&a;
stream<< "{"<<p[0]<<","<<p[1]<<","<<p[2]<<","<<p[3]<<"}";
return stream;
};

struct Rotate{

static inline __m256 rotate(__m256 in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
case 2: return tRotate<2>(in);break;
case 3: return tRotate<3>(in);break;
case 4: return tRotate<4>(in);break;
case 5: return tRotate<5>(in);break;
case 6: return tRotate<6>(in);break;
case 7: return tRotate<7>(in);break;
default: assert(0);
}
}
static inline __m256d rotate(__m256d in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
case 2: return tRotate<2>(in);break;
case 3: return tRotate<3>(in);break;
default: assert(0);
}
}

template<int n>
static inline __m256 tRotate(__m256 in){
__m256 tmp = Permute::Permute0(in);
__m256 ret;
if ( n > 3 ) {
_mm256_alignr_epi32(ret,in,tmp,n);
} else {
_mm256_alignr_epi32(ret,tmp,in,n);
}
// std::cout << " align epi32 n=" <<n<<" in "<<tmp<<in<<" -> "<< ret <<std::endl;
return ret;
};

template<int n>
static inline __m256d tRotate(__m256d in){
__m256d tmp = Permute::Permute0(in);
__m256d ret;
if ( n > 1 ) {
_mm256_alignr_epi64(ret,in,tmp,n);
} else {
_mm256_alignr_epi64(ret,tmp,in,n);
}
// std::cout << " align epi64 n=" <<n<<" in "<<tmp<<in<<" -> "<< ret <<std::endl;
return ret;
};

};

//Complex float Reduce
template<>
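As an aside, a minimal scalar model of what the AVX tRotate<n> above computes (this sketch is not part of the patch; it assumes the usual Grid convention that lane i of the result picks up lane (i+n) mod 8 of the input):

// Scalar model of the AVX rotate-by-n trick: swap the 128-bit halves (Permute0) and
// take an aligned 8-element window with alignr, which amounts to a cyclic rotation.
#include <array>
#include <cassert>

static std::array<float,8> rotate_model(const std::array<float,8>& in, int n) {
  assert(n >= 0 && n < 8);
  std::array<float,8> out;
  for (int i = 0; i < 8; i++) out[i] = in[(i + n) % 8]; // element i takes element i+n
  return out;
}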
@@ -39,7 +39,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <immintrin.h>

namespace Grid{
namespace Optimization {

struct Vsplat{
@@ -246,26 +246,30 @@ namespace Optimization {
struct TimesMinusI{
//Complex single
inline __m512 operator()(__m512 in, __m512 ret){
__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2)); // 0x4E??
//__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
//return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,1,0)); // 0x4E??
__m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
}
//Complex double
inline __m512d operator()(__m512d in, __m512d ret){
__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
return _mm512_shuffle_pd(tmp,tmp,0x55);
//__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
//return _mm512_shuffle_pd(tmp,tmp,0x55);
__m512d tmp = _mm512_shuffle_pd(in,in,0x55);
return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
}
};

struct TimesI{
//Complex single
inline __m512 operator()(__m512 in, __m512 ret){
__m512 tmp = _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2));
return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
__m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
return _mm512_mask_sub_ps(tmp,0x5555,_mm512_setzero_ps(),tmp);
}
//Complex double
inline __m512d operator()(__m512d in, __m512d ret){
__m512d tmp = _mm512_shuffle_pd(tmp,tmp,0x55);
return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
__m512d tmp = _mm512_shuffle_pd(in,in,0x55);
return _mm512_mask_sub_pd(tmp,0x55,_mm512_setzero_pd(),tmp);
}

@@ -305,6 +309,54 @@ namespace Optimization {
};

struct Rotate{

static inline __m512 rotate(__m512 in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
case 2: return tRotate<2>(in);break;
case 3: return tRotate<3>(in);break;
case 4: return tRotate<4>(in);break;
case 5: return tRotate<5>(in);break;
case 6: return tRotate<6>(in);break;
case 7: return tRotate<7>(in);break;

case 8 : return tRotate<8>(in);break;
case 9 : return tRotate<9>(in);break;
case 10: return tRotate<10>(in);break;
case 11: return tRotate<11>(in);break;
case 12: return tRotate<12>(in);break;
case 13: return tRotate<13>(in);break;
case 14: return tRotate<14>(in);break;
case 15: return tRotate<15>(in);break;
default: assert(0);
}
}
static inline __m512d rotate(__m512d in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
case 2: return tRotate<2>(in);break;
case 3: return tRotate<3>(in);break;
case 4: return tRotate<4>(in);break;
case 5: return tRotate<5>(in);break;
case 6: return tRotate<6>(in);break;
case 7: return tRotate<7>(in);break;
default: assert(0);
}
}

template<int n> static inline __m512 tRotate(__m512 in){
return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n);
};

template<int n> static inline __m512d tRotate(__m512d in){
return (__m512d)_mm512_alignr_epi64((__m512i)in,(__m512i)in,n);
};

};

//////////////////////////////////////////////
// Some Template specialization
@@ -345,7 +397,7 @@ namespace Optimization {

//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
namespace Grid {

typedef __m512 SIMD_Ftype; // Single precision type
typedef __m512d SIMD_Dtype; // Double precision type
typedef __m512i SIMD_Itype; // Integer type
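For reference, a scalar model of the reworked TimesI / TimesMinusI above (illustration only, not part of the patch). For z = re + i*im packed as (re, im), the identities are i*z = (-im, re) and -i*z = (im, -re); the vector code gets there by swapping the two lanes of each pair (shuffle) and then negating under a write mask, where 0x5555 selects the real lanes and 0xaaaa the imaginary lanes:

#include <cassert>
#include <complex>

int main() {
  std::complex<float> z(3.f, 4.f);
  std::complex<float> iz(-z.imag(), z.real());   // what TimesI produces per pair
  std::complex<float> miz(z.imag(), -z.real());  // what TimesMinusI produces per pair
  assert(iz  == std::complex<float>(0.f, 1.f)  * z);
  assert(miz == std::complex<float>(0.f, -1.f) * z);
  return 0;
}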
@@ -35,6 +35,7 @@ Author: neo <cossu@post.kek.jp>
// Time-stamp: <2015-06-09 14:28:02 neo>
//----------------------------------------------------------------------

namespace Grid {
namespace Optimization {

template<class vtype>
@@ -54,51 +55,67 @@ namespace Optimization {

struct Vsplat{
//Complex float
inline float operator()(float a, float b){
return 0;
inline u128f operator()(float a, float b){
u128f out;
out.f[0] = a;
out.f[1] = b;
out.f[2] = a;
out.f[3] = b;
return out;
}
// Real float
inline float operator()(float a){
return 0;
inline u128f operator()(float a){
u128f out;
out.f[0] = a;
out.f[1] = a;
out.f[2] = a;
out.f[3] = a;
return out;
}
//Complex double
inline double operator()(double a, double b){
return 0;
inline u128d operator()(double a, double b){
u128d out;
out.f[0] = a;
out.f[1] = b;
return out;
}
//Real double
inline double operator()(double a){
return 0;
inline u128d operator()(double a){
u128d out;
out.f[0] = a;
out.f[1] = a;
return out;
}
//Integer
inline int operator()(Integer a){
return 0;
return a;
}
};

struct Vstore{
//Float
inline void operator()(float a, float* F){
inline void operator()(u128f a, float* F){
memcpy(F,a.f,4*sizeof(float));
}
//Double
inline void operator()(double a, double* D){
inline void operator()(u128d a, double* D){
memcpy(D,a.f,2*sizeof(double));
}
//Integer
inline void operator()(int a, Integer* I){
I[0] = a;
}

};

struct Vstream{
//Float
inline void operator()(float * a, float b){
inline void operator()(float * a, u128f b){
memcpy(a,b.f,4*sizeof(float));
}
//Double
inline void operator()(double * a, double b){
inline void operator()(double * a, u128d b){
memcpy(a,b.f,2*sizeof(double));
}

@@ -106,24 +123,40 @@ namespace Optimization {

struct Vset{
// Complex float
inline float operator()(Grid::ComplexF *a){
return 0;
inline u128f operator()(Grid::ComplexF *a){
u128f out;
out.f[0] = a[0].real();
out.f[1] = a[0].imag();
out.f[2] = a[1].real();
out.f[3] = a[1].imag();
return out;
}
// Complex double
inline double operator()(Grid::ComplexD *a){
return 0;
inline u128d operator()(Grid::ComplexD *a){
u128d out;
out.f[0] = a[0].real();
out.f[1] = a[0].imag();
return out;
}
// Real float
inline float operator()(float *a){
return 0;
inline u128f operator()(float *a){
u128f out;
out.f[0] = a[0];
out.f[1] = a[1];
out.f[2] = a[2];
out.f[3] = a[3];
return out;
}
// Real double
inline double operator()(double *a){
return 0;
inline u128d operator()(double *a){
u128d out;
out.f[0] = a[0];
out.f[1] = a[1];
return out;
}
// Integer
inline int operator()(Integer *a){
return 0;
return a[0];
}

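The u128f and u128d containers used throughout this generic implementation are not defined in the hunks shown here; a plausible sketch of what they look like (an illustrative assumption, not copied from the patch) is simply a 128-bit-wide aggregate of scalars:

// Sketch of the u128f / u128d containers assumed by the generic code above
// (their real definitions are not part of these hunks; this is an illustrative guess).
struct u128f { float  f[4]; };   // 2 complex or 4 real single-precision numbers
struct u128d { double f[2]; };   // 1 complex or 2 real double-precision numbers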
@@ -145,130 +178,279 @@ namespace Optimization {
/////////////////////////////////////////////////////
struct Sum{
//Complex/Real float
inline float operator()(float a, float b){
return 0;
inline u128f operator()(u128f a, u128f b){
u128f out;
out.f[0] = a.f[0] + b.f[0];
out.f[1] = a.f[1] + b.f[1];
out.f[2] = a.f[2] + b.f[2];
out.f[3] = a.f[3] + b.f[3];
return out;
}
//Complex/Real double
inline double operator()(double a, double b){
return 0;
inline u128d operator()(u128d a, u128d b){
u128d out;
out.f[0] = a.f[0] + b.f[0];
out.f[1] = a.f[1] + b.f[1];
return out;
}
//Integer
inline int operator()(int a, int b){
return 0;
return a + b;
}
};

struct Sub{
//Complex/Real float
inline float operator()(float a, float b){
return 0;
inline u128f operator()(u128f a, u128f b){
u128f out;
out.f[0] = a.f[0] - b.f[0];
out.f[1] = a.f[1] - b.f[1];
out.f[2] = a.f[2] - b.f[2];
out.f[3] = a.f[3] - b.f[3];
return out;
}
//Complex/Real double
inline double operator()(double a, double b){
return 0;
inline u128d operator()(u128d a, u128d b){
u128d out;
out.f[0] = a.f[0] - b.f[0];
out.f[1] = a.f[1] - b.f[1];
return out;
}
//Integer
inline int operator()(int a, int b){
return 0;
return a-b;
}
};

struct MultComplex{
// Complex float
inline float operator()(float a, float b){
return 0;
inline u128f operator()(u128f a, u128f b){
u128f out;
out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];
out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];
out.f[2] = a.f[2]*b.f[2] - a.f[3]*b.f[3];
out.f[3] = a.f[2]*b.f[3] + a.f[3]*b.f[2];
return out;
}
// Complex double
inline double operator()(double a, double b){
return 0;
inline u128d operator()(u128d a, u128d b){
u128d out;
out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];
out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];
return out;
}
};

struct Mult{
inline float mac(float a, float b,double c){
return 0;
}
inline double mac(double a, double b,double c){
return 0;
}
//CK: Appear unneeded
// inline float mac(float a, float b,double c){
// return 0;
// }
// inline double mac(double a, double b,double c){
// return 0;
// }

// Real float
inline float operator()(float a, float b){
return 0;
inline u128f operator()(u128f a, u128f b){
u128f out;
out.f[0] = a.f[0]*b.f[0];
out.f[1] = a.f[1]*b.f[1];
out.f[2] = a.f[2]*b.f[2];
out.f[3] = a.f[3]*b.f[3];
return out;
}
// Real double
inline double operator()(double a, double b){
return 0;
inline u128d operator()(u128d a, u128d b){
u128d out;
out.f[0] = a.f[0]*b.f[0];
out.f[1] = a.f[1]*b.f[1];
return out;
}
// Integer
inline int operator()(int a, int b){
return 0;
return a*b;
}
};

struct Conj{
// Complex single
inline float operator()(float in){
return 0;
inline u128f operator()(u128f in){
u128f out;
out.f[0] = in.f[0];
out.f[1] = -in.f[1];
out.f[2] = in.f[2];
out.f[3] = -in.f[3];
return out;
}
// Complex double
inline double operator()(double in){
return 0;
inline u128d operator()(u128d in){
u128d out;
out.f[0] = in.f[0];
out.f[1] = -in.f[1];
return out;
}
// do not define for integer input
};

struct TimesMinusI{
//Complex single
inline float operator()(float in, float ret){
return 0;
inline u128f operator()(u128f in, u128f ret){ //note ret is ignored
u128f out;
out.f[0] = in.f[1];
out.f[1] = -in.f[0];
out.f[2] = in.f[3];
out.f[3] = -in.f[2];
return out;
}
//Complex double
inline double operator()(double in, double ret){
return 0;
inline u128d operator()(u128d in, u128d ret){
u128d out;
out.f[0] = in.f[1];
out.f[1] = -in.f[0];
return out;
}

};

struct TimesI{
//Complex single
inline float operator()(float in, float ret){
return 0;
inline u128f operator()(u128f in, u128f ret){ //note ret is ignored
u128f out;
out.f[0] = -in.f[1];
out.f[1] = in.f[0];
out.f[2] = -in.f[3];
out.f[3] = in.f[2];
return out;
}
//Complex double
inline double operator()(double in, double ret){
return 0;
inline u128d operator()(u128d in, u128d ret){
u128d out;
out.f[0] = -in.f[1];
out.f[1] = in.f[0];
return out;
}
};

//////////////////////////////////////////////
// Some Template specialization
struct Permute{
//We just have to mirror the permutes of Grid_sse4.h
static inline u128f Permute0(u128f in){ //AB CD -> CD AB
u128f out;
out.f[0] = in.f[2];
out.f[1] = in.f[3];
out.f[2] = in.f[0];
out.f[3] = in.f[1];
return out;
};
static inline u128f Permute1(u128f in){ //AB CD -> BA DC
u128f out;
out.f[0] = in.f[1];
out.f[1] = in.f[0];
out.f[2] = in.f[3];
out.f[3] = in.f[2];
return out;
};
static inline u128f Permute2(u128f in){
return in;
};
static inline u128f Permute3(u128f in){
return in;
};

static inline u128d Permute0(u128d in){ //AB -> BA
u128d out;
out.f[0] = in.f[1];
out.f[1] = in.f[0];
return out;
};
static inline u128d Permute1(u128d in){
return in;
};
static inline u128d Permute2(u128d in){
return in;
};
static inline u128d Permute3(u128d in){
return in;
};

};

template < typename vtype >
void permute(vtype &a, vtype b, int perm) {
};
};

struct Rotate{

static inline u128f rotate(u128f in,int n){
u128f out;
switch(n){
case 0:
out.f[0] = in.f[0];
out.f[1] = in.f[1];
out.f[2] = in.f[2];
out.f[3] = in.f[3];
break;
case 1:
out.f[0] = in.f[1];
out.f[1] = in.f[2];
out.f[2] = in.f[3];
out.f[3] = in.f[0];
break;
case 2:
out.f[0] = in.f[2];
out.f[1] = in.f[3];
out.f[2] = in.f[0];
out.f[3] = in.f[1];
break;
case 3:
out.f[0] = in.f[3];
out.f[1] = in.f[0];
out.f[2] = in.f[1];
out.f[3] = in.f[2];
break;
default: assert(0);
}
return out;
}
static inline u128d rotate(u128d in,int n){
u128d out;
switch(n){
case 0:
out.f[0] = in.f[0];
out.f[1] = in.f[1];
break;
case 1:
out.f[0] = in.f[1];
out.f[1] = in.f[0];
break;
default: assert(0);
}
return out;
}
};

//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, float>::operator()(float in){
return 0;
inline Grid::ComplexF Reduce<Grid::ComplexF, u128f>::operator()(u128f in){ //2 complex
return Grid::ComplexF(in.f[0] + in.f[2], in.f[1] + in.f[3]);
}
//Real float Reduce
template<>
inline Grid::RealF Reduce<Grid::RealF, float>::operator()(float in){
return 0;
inline Grid::RealF Reduce<Grid::RealF, u128f>::operator()(u128f in){ //4 floats
return in.f[0] + in.f[1] + in.f[2] + in.f[3];
}

//Complex double Reduce
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, double>::operator()(double in){
return 0;
inline Grid::ComplexD Reduce<Grid::ComplexD, u128d>::operator()(u128d in){ //1 complex
return Grid::ComplexD(in.f[0],in.f[1]);
}

//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, double>::operator()(double in){
return 0;
inline Grid::RealD Reduce<Grid::RealD, u128d>::operator()(u128d in){ //2 doubles
return in.f[0] + in.f[1];
}

//Integer Reduce
@@ -282,10 +464,9 @@ namespace Optimization {

//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
namespace Grid {

typedef float SIMD_Ftype; // Single precision type
typedef double SIMD_Dtype; // Double precision type
typedef Optimization::u128f SIMD_Ftype; // Single precision type
typedef Optimization::u128d SIMD_Dtype; // Double precision type
typedef int SIMD_Itype; // Integer type

// prefetch utilities
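A self-contained check of the complex-multiply formula used by MultComplex above (illustration only; the u128-style packing is (re0, im0, re1, im1), and the product follows (a0 + i a1)(b0 + i b1) = (a0 b0 - a1 b1) + i (a0 b1 + a1 b0)):

#include <cassert>

int main() {
  float a[4] = {1, 2, 5, 6};          // (1+2i, 5+6i)
  float b[4] = {3, 4, 7, 8};          // (3+4i, 7+8i)
  float out[4];
  out[0] = a[0]*b[0] - a[1]*b[1];     // -5
  out[1] = a[0]*b[1] + a[1]*b[0];     // 10
  out[2] = a[2]*b[2] - a[3]*b[3];     // -13
  out[3] = a[2]*b[3] + a[3]*b[2];     // 82
  assert(out[0] == -5 && out[1] == 10 && out[2] == -13 && out[3] == 82);
  return 0;
}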
@@ -36,7 +36,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
//----------------------------------------------------------------------

#include <immintrin.h>
#include <zmmintrin.h>

namespace Grid{
namespace Optimization {

struct Vsplat{
@@ -316,6 +318,54 @@ namespace Optimization {

};

struct Rotate{

static inline __m512 rotate(__m512 in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
case 2: return tRotate<2>(in);break;
case 3: return tRotate<3>(in);break;
case 4: return tRotate<4>(in);break;
case 5: return tRotate<5>(in);break;
case 6: return tRotate<6>(in);break;
case 7: return tRotate<7>(in);break;

case 8 : return tRotate<8>(in);break;
case 9 : return tRotate<9>(in);break;
case 10: return tRotate<10>(in);break;
case 11: return tRotate<11>(in);break;
case 12: return tRotate<12>(in);break;
case 13: return tRotate<13>(in);break;
case 14: return tRotate<14>(in);break;
case 15: return tRotate<15>(in);break;
default: assert(0);
}
}
static inline __m512d rotate(__m512d in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
case 2: return tRotate<2>(in);break;
case 3: return tRotate<3>(in);break;
case 4: return tRotate<4>(in);break;
case 5: return tRotate<5>(in);break;
case 6: return tRotate<6>(in);break;
case 7: return tRotate<7>(in);break;
default: assert(0);
}
}

template<int n> static inline __m512 tRotate(__m512 in){
return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n);
};

template<int n> static inline __m512d tRotate(__m512d in){
return (__m512d)_mm512_alignr_epi32((__m512i)in,(__m512i)in,2*n);
};

};

//////////////////////////////////////////////
@@ -358,7 +408,7 @@ namespace Optimization {

//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
namespace Grid {

typedef __m512 SIMD_Ftype; // Single precision type
typedef __m512d SIMD_Dtype; // Double precision type
typedef __m512i SIMD_Itype; // Integer type
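A small check (illustration only, not part of the patch) of why the double-precision tRotate above passes 2*n to a 32-bit-lane alignr: each double occupies two 32-bit lanes, so rotating by n doubles is the same as rotating by 2*n lanes:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  double d[8];     for (int i = 0; i < 8; i++)  d[i] = i;
  uint32_t w[16];  std::memcpy(w, d, sizeof(d));                    // view as 16 32-bit lanes
  const int n = 3;
  double   dr[8];  for (int i = 0; i < 8; i++)  dr[i] = d[(i + n) % 8];      // rotate doubles by n
  uint32_t wr[16]; for (int i = 0; i < 16; i++) wr[i] = w[(i + 2*n) % 16];   // rotate lanes by 2n
  assert(std::memcmp(dr, wr, sizeof(dr)) == 0);
  return 0;
}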
@@ -267,10 +267,10 @@ namespace Optimization {
struct Permute{

static inline __m128 Permute0(__m128 in){
return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //AB CD -> CD AB
};
static inline __m128 Permute1(__m128 in){
return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //AB CD -> BA DC
};
static inline __m128 Permute2(__m128 in){
return in;
@@ -279,7 +279,7 @@ namespace Optimization {
return in;
};

static inline __m128d Permute0(__m128d in){
static inline __m128d Permute0(__m128d in){ //AB -> BA
return _mm_shuffle_pd(in,in,0x1);
};
static inline __m128d Permute1(__m128d in){
@@ -294,6 +294,32 @@ namespace Optimization {

};

struct Rotate{

static inline __m128 rotate(__m128 in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
case 2: return tRotate<2>(in);break;
case 3: return tRotate<3>(in);break;
default: assert(0);
}
}
static inline __m128d rotate(__m128d in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
default: assert(0);
}
}

#define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16)
#define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16)

template<int n> static inline __m128 tRotate(__m128 in){ return (__m128)_mm_alignr_epi32((__m128i)in,(__m128i)in,n); };
template<int n> static inline __m128d tRotate(__m128d in){ return (__m128d)_mm_alignr_epi64((__m128i)in,(__m128i)in,n); };

};
//////////////////////////////////////////////
// Some Template specialization
@@ -299,16 +299,44 @@ namespace Grid {
}
friend inline void permute(Grid_simd &y,Grid_simd b,int perm)
{
if (perm==3) permute3(y,b);
else if (perm==2) permute2(y,b);
else if (perm==1) permute1(y,b);
else if (perm==0) permute0(y,b);
if ( perm & RotateBit ) {
int dist = perm&0xF;
y=rotate(b,dist);
return;
}
switch(perm){
case 3: permute3(y,b); break;
case 2: permute2(y,b); break;
case 1: permute1(y,b); break;
case 0: permute0(y,b); break;
default: assert(0);
}
}

};// end of Grid_simd class definition

////////////////////////////////////////////////////////////////////
// General rotate
////////////////////////////////////////////////////////////////////
template <class S, class V, IfNotComplex<S> =0>
inline Grid_simd<S,V> rotate(Grid_simd<S,V> b,int nrot)
{
nrot = nrot % Grid_simd<S,V>::Nsimd();
Grid_simd<S,V> ret;
// std::cout << "Rotate Real by "<<nrot<<std::endl;
ret.v = Optimization::Rotate::rotate(b.v,nrot);
return ret;
}
template <class S, class V, IfComplex<S> =0>
inline Grid_simd<S,V> rotate(Grid_simd<S,V> b,int nrot)
{
nrot = nrot % Grid_simd<S,V>::Nsimd();
Grid_simd<S,V> ret;
// std::cout << "Rotate Complex by "<<nrot<<std::endl;
ret.v = Optimization::Rotate::rotate(b.v,2*nrot);
return ret;
}

///////////////////////
// Splat
///////////////////////
@@ -339,6 +367,9 @@ namespace Grid {
template <class S,class V, IfComplex<S> = 0 > inline void vzero(Grid_simd<S,V> &ret) { vsplat(ret,S(0.0,0.0)); }// use xor?
template <class S,class V, IfComplex<S> = 0 > inline void vcomplex_i(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0,1.0));}

template <class S,class V, IfComplex<S> = 0 > inline void visign(Grid_simd<S,V> &ret){ vsplat(ret,S(1.0,-1.0));}
template <class S,class V, IfComplex<S> = 0 > inline void vrsign(Grid_simd<S,V> &ret){ vsplat(ret,S(-1.0,1.0));}

// if not complex overload here
template <class S,class V, IfReal<S> = 0 > inline void vone (Grid_simd<S,V> &ret){ vsplat(ret,S(1.0)); }
template <class S,class V, IfReal<S> = 0 > inline void vzero(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0)); }
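A usage sketch of the rotate() overloads introduced above (illustration only; it assumes the Grid headers and a configured SIMD target, so it is not a self-contained program). Inside permute(), a perm value with RotateBit set is decoded as a rotation distance (perm & 0xF), while small values 0..3 still dispatch to the permute0..permute3 shuffles:

#include <Grid.h>

void rotate_demo(void) {
  Grid::vComplexF a, b;
  vsplat(a, Grid::ComplexF(1.0, 2.0)); // fill every complex lane with 1+2i
  b = Grid::rotate(a, 1);              // lane i of b receives lane (i+1) % Nsimd() of a
}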
197
lib/simd/Intel512avx.h
Normal file
197
lib/simd/Intel512avx.h
Normal file
@ -0,0 +1,197 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/simd/Avx512Asm.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#ifndef GRID_ASM_AV512_H
|
||||
#define GRID_ASM_AV512_H
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
// Knights Landing specials
|
||||
////////////////////////////////////////////////////////////
|
||||
|
||||
#define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
|
||||
#define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)
|
||||
|
||||
#define ZMULf(Ari,Air,B,Criir,Ciirr) VMULf(Ari,B,Criir) VMULf(Air,B,Ciirr)
|
||||
#define ZMULd(Ari,Air,B,Criir,Ciirr) VMULd(Ari,B,Criir) VMULd(Air,B,Ciirr)
|
||||
|
||||
#define ZMADDf(Ari,Air,B,Criir,Ciirr) VMADDf(Ari,B,Criir) VMADDf(Air,B,Ciirr)
|
||||
#define ZMADDd(Ari,Air,B,Criir,Ciirr) VMADDd(Ari,B,Criir) VMADDd(Air,B,Ciirr)
|
||||
|
||||
#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
|
||||
#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)
|
||||
|
||||
#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
|
||||
VSHUFMEMf(O,P,tmp) \
|
||||
VMULMEMf(O,P,B,Biirr) \
|
||||
VMULMEMf(O,P,C,Ciirr) \
|
||||
VMULf(tmp,B,Briir) \
|
||||
VMULf(tmp,C,Criir)
|
||||
|
||||
#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
|
||||
VSHUFMEMd(O,P,tmp) \
|
||||
VMULMEMd(O,P,B,Biirr) \
|
||||
VMULMEMd(O,P,C,Ciirr) \
|
||||
VMULd(tmp,B,Briir) \
|
||||
VMULd(tmp,C,Criir)
|
||||
|
||||
#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
|
||||
VSHUFMEMf(O,P,tmp) \
|
||||
VMADDMEMf(O,P,B,Biirr) \
|
||||
VMADDMEMf(O,P,C,Ciirr) \
|
||||
VMADDf(tmp,B,Briir) \
|
||||
VMADDf(tmp,C,Criir)
|
||||
|
||||
#define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \
|
||||
VSHUFMEMd(O,P,tmp) \
|
||||
VMADDMEMd(O,P,B,Biirr) \
|
||||
VMADDMEMd(O,P,C,Ciirr) \
|
||||
VMADDd(tmp,B,Briir) \
|
||||
VMADDd(tmp,C,Criir)
|
||||
|
||||
// Merges accumulation for complex dot chain; less efficient under avx512
|
||||
#define ZEND1f(Criir,Ciirr, tmp) "vshufps $0xb1," #Criir "," #Criir "," #tmp ";\n"\
|
||||
"vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n"
|
||||
|
||||
#define ZEND2f(Criir,Ciirr, tmp) "vshufps $0xb1," #Ciirr "," #Ciirr "," #tmp ";\n"\
|
||||
"vsubps " #tmp "," #Ciirr "," #Criir"{%k7}" ";\n"
|
||||
|
||||
#define ZEND1d(Criir,Ciirr, tmp) "vshufpd $0x55," #Criir "," #Criir "," #tmp ";\n"\
|
||||
"vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n"
|
||||
|
||||
#define ZEND2d(Criir,Ciirr, tmp) "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp ";\n"\
|
||||
"vsubpd " #tmp "," #Ciirr "," #Criir"{%k7};\n" // ri+ir ; ri+ir,rr-ii
|
||||
|
||||
#define VMOVRDUPd(OFF,A,DEST) "vpshufd $0x44," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: 1,0,3,2
|
||||
#define VMOVIDUPd(OFF,A,DEST) "vpshufd $0xee," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: 3,2,3,2
|
||||
#define VMOVRDUPf(OFF,PTR,DEST) "vmovsldup " #OFF "*64(" #PTR "), " #DEST ";\n"
|
||||
#define VMOVIDUPf(OFF,PTR,DEST) "vmovshdup " #OFF "*64(" #PTR "), " #DEST ";\n"
|
||||
|
||||
#define VRDUPd(SRC,DEST) "vpshufd $0x44," #SRC"," #DEST ";\n" // 32 bit level: 1,0,3,2
|
||||
#define VRDUPf(SRC,DEST) "vmovsldup " #SRC ", " #DEST ";\n"
|
||||
#define VIDUPd(SRC,DEST) "vpshufd $0xee," #SRC"," #DEST ";\n" // 32 bit level: 3,2,3,2
|
||||
#define VIDUPf(SRC,DEST) "vmovshdup " #SRC ", " #DEST ";\n"
|
||||
|
||||
#define VBCASTRDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST ";\n"
|
||||
#define VBCASTIDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST ";\n"
|
||||
#define VBCASTRDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST ";\n"
|
||||
#define VBCASTIDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST ";\n"
|
||||
|
||||
#define VMADDSUBf(A,B,accum) "vfmaddsub231ps " #A "," #B "," #accum ";\n"
|
||||
#define VMADDSUBd(A,B,accum) "vfmaddsub231pd " #A "," #B "," #accum ";\n"
|
||||
#define VMADDSUBMEMf(O,P,B,accum) "vfmaddsub231ps " #O"*64("#P "),"#B "," #accum ";\n"
|
||||
#define VMADDSUBMEMd(O,P,B,accum) "vfmaddsub231pd " #O"*64("#P "),"#B "," #accum ";\n"
|
||||
|
||||
|
||||
#define VMADDSUBRDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
|
||||
#define VMADDSUBIDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
|
||||
#define VMULRDUPf(O,P,B,accum) "vmulps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
|
||||
#define VMULIDUPf(O,P,B,accum) "vmulps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
|
||||
|
||||
#define VMADDSUBRDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
|
||||
#define VMADDSUBIDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
|
||||
#define VMULRDUPd(O,P,B,accum) "vmulpd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
|
||||
#define VMULIDUPd(O,P,B,accum) "vmulpd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
|
||||
/*
|
||||
* TimesI is used only in the XP recon
|
||||
* Could zero the regs and use RECON_ACCUM
|
||||
*/
|
||||
|
||||
#define VTIMESI0f(A,DEST, Z) VSHUFf(A,DEST)
|
||||
#define VTIMESI1f(A,DEST, Z) "vaddps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
|
||||
#define VTIMESI2f(A,DEST, Z) "vsubps " #DEST "," #Z "," #DEST"{%k7}" ";\n"
|
||||
|
||||
#define VTIMESI0d(A,DEST, Z) VSHUFd(A,DEST)
|
||||
#define VTIMESI1d(A,DEST, Z) "vaddpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
|
||||
#define VTIMESI2d(A,DEST, Z) "vsubpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"
|
||||
|
||||
#define VTIMESMINUSI0f(A,DEST,Z) VSHUFf(A,DEST)
|
||||
#define VTIMESMINUSI1f(A,DEST,Z) "vsubps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
|
||||
#define VTIMESMINUSI2f(A,DEST,Z) "vaddps " #DEST "," #Z "," #DEST"{%k7}" ";\n"
|
||||
|
||||
#define VTIMESMINUSI0d(A,DEST,Z) VSHUFd(A,DEST)
|
||||
#define VTIMESMINUSI1d(A,DEST,Z) "vsubpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
|
||||
#define VTIMESMINUSI2d(A,DEST,Z) "vaddpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"
|
||||
|
||||
#if 0
|
||||
|
||||
#define VACCTIMESMINUSI0f(A,ACC,tmp) VSHUFf(A,tmp)
|
||||
#define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
|
||||
#define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
|
||||
|
||||
#define VACCTIMESMINUSI0d(A,ACC,tmp) VSHUFd(A,tmp)
|
||||
#define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
|
||||
#define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
|
||||
|
||||
#define VACCTIMESI0f(A,ACC,tmp) VSHUFf(A,tmp)
|
||||
#define VACCTIMESI1f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
|
||||
#define VACCTIMESI2f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
|
||||
|
||||
#define VACCTIMESI0d(A,ACC,tmp) VSHUFd(A,tmp)
|
||||
#define VACCTIMESI1d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
|
||||
#define VACCTIMESI2d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
|
||||
|
||||
#else
|
||||
|
||||
// o_p must point to floating 1.0f/d
|
||||
//
|
||||
// Ai, Ar -> tmp (r i)
|
||||
// tmp *1.0
|
||||
// ACC i - Ar ; ACC r + Ai
|
||||
#define VACCTIMESMINUSI0f(A,ACC,tmp) VSHUFf(A,tmp)
|
||||
#define VACCTIMESMINUSI1f(A,ACC,tmp) VMADDMEMf(1,%r10,tmp,ACC)
|
||||
#define VACCTIMESMINUSI2f(A,ACC,tmp)
|
||||
|
||||
|
||||
#define VACCTIMESMINUSI0d(A,ACC,tmp) VSHUFd(A,tmp)
|
||||
#define VACCTIMESMINUSI1d(A,ACC,tmp) VMADDMEMd(1,%r10,tmp,ACC)
|
||||
#define VACCTIMESMINUSI2d(A,ACC,tmp)
|
||||
|
||||
// Ai, Ar -> tmp (r i)
|
||||
// tmp *1.0
|
||||
// ACC i + Ar ; ACC r - Ai
|
||||
#define VACCTIMESI0f(A,ACC,tmp) VSHUFf(A,tmp)
|
||||
#define VACCTIMESI1f(A,ACC,tmp) VMADDMEMf(0,%r10,tmp,ACC)
|
||||
#define VACCTIMESI2f(A,ACC,tmp)
|
||||
|
||||
#define VACCTIMESI0d(A,ACC,tmp) VSHUFd(A,tmp)
|
||||
#define VACCTIMESI1d(A,ACC,tmp) VMADDMEMd(0,%r10,tmp,ACC)
|
||||
#define VACCTIMESI2d(A,ACC,tmp)
|
||||
|
||||
#endif
|
||||
|
||||
#define VPERM0f(A,B) "vshuff32x4 $0x4e," #A "," #B "," #B ";\n"
|
||||
#define VPERM1f(A,B) "vshuff32x4 $0xb1," #A "," #B "," #B ";\n"
|
||||
#define VPERM2f(A,B) "vshufps $0x4e," #A "," #B "," #B ";\n"
|
||||
#define VPERM3f(A,B) "vshufps $0xb1," #A "," #B "," #B ";\n"
|
||||
|
||||
#define VPERM0d(A,B) "vshuff64x2 $0x4e," #A "," #B "," #B ";\n"
|
||||
#define VPERM1d(A,B) "vshuff64x2 $0xb1," #A "," #B "," #B ";\n"
|
||||
#define VPERM2d(A,B) "vshufpd $0x55," #A "," #B "," #B ";\n"
|
||||
#define VPERM3d(A,B) VMOVd(A,B)
|
||||
|
||||
|
||||
#endif
|
141
lib/simd/Intel512common.h
Normal file
@ -0,0 +1,141 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/simd/Avx512Asm.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#ifndef GRID_ASM_INTEL_COMMON_512_H
|
||||
#define GRID_ASM_INTEL_COMMON_512_H
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Opcodes common
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#define MASK_REGS \
|
||||
__asm__ ("mov $0xAAAA, %%eax \n"\
|
||||
"kmovw %%eax, %%k6 \n"\
|
||||
"mov $0x5555, %%eax \n"\
|
||||
"kmovw %%eax, %%k7 \n" : : : "%eax");
|
||||
|
||||
#define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n"
|
||||
#define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n"
|
||||
|
||||
#define VTIMESIf(A,DEST, Z) \
|
||||
VTIMESI0f(A,DEST, Z) \
|
||||
VTIMESI1f(A,DEST, Z) \
|
||||
VTIMESI2f(A,DEST, Z)
|
||||
|
||||
#define VTIMESId(A,DEST, Z) \
|
||||
VTIMESI0d(A,DEST, Z) \
|
||||
VTIMESI1d(A,DEST, Z) \
|
||||
VTIMESI2d(A,DEST, Z)
|
||||
|
||||
#define VTIMESMINUSIf(A,DEST, Z) \
|
||||
VTIMESMINUSI0f(A,DEST, Z) \
|
||||
VTIMESMINUSI1f(A,DEST, Z) \
|
||||
VTIMESMINUSI2f(A,DEST, Z)
|
||||
|
||||
#define VTIMESMINUSId(A,DEST, Z) \
|
||||
VTIMESMINUSI0d(A,DEST, Z) \
|
||||
VTIMESMINUSI1d(A,DEST, Z) \
|
||||
VTIMESMINUSI2d(A,DEST, Z)
|
||||
|
||||
#define VACCTIMESIf(A,ACC,tmp) \
|
||||
VACCTIMESI0f(A,ACC,tmp) \
|
||||
VACCTIMESI1f(A,ACC,tmp) \
|
||||
VACCTIMESI2f(A,ACC,tmp)
|
||||
|
||||
#define VACCTIMESId(A,ACC,tmp) \
|
||||
VACCTIMESI0d(A,ACC,tmp) \
|
||||
VACCTIMESI1d(A,ACC,tmp) \
|
||||
VACCTIMESI2d(A,ACC,tmp)
|
||||
|
||||
#define VACCTIMESMINUSIf(A,ACC,tmp) \
|
||||
VACCTIMESMINUSI0f(A,ACC,tmp) \
|
||||
VACCTIMESMINUSI1f(A,ACC,tmp) \
|
||||
VACCTIMESMINUSI2f(A,ACC,tmp)
|
||||
|
||||
#define VACCTIMESMINUSId(A,ACC,tmp) \
|
||||
VACCTIMESMINUSI0d(A,ACC,tmp) \
|
||||
VACCTIMESMINUSI1d(A,ACC,tmp) \
|
||||
VACCTIMESMINUSI2d(A,ACC,tmp)
|
||||
|
||||
#define LOAD64i(A,ptr) __asm__ ( "movq %0, %" #A : : "r"(ptr) : #A );
|
||||
#define LOAD64(A,ptr) LOAD64i(A,ptr)
|
||||
|
||||
#define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n"
|
||||
#define VMOVd(A,DEST) "vmovapd " #A ", " #DEST ";\n"
|
||||
|
||||
#define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n"
|
||||
#define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n"
|
||||
#define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n"
|
||||
#define VEVICT(O,A)
|
||||
|
||||
//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
|
||||
// "clevict0 "#O"*64("#A");\n"
|
||||
|
||||
#define VLOADf(OFF,PTR,DEST) "vmovaps " #OFF "*64(" #PTR "), " #DEST ";\n"
|
||||
#define VLOADd(OFF,PTR,DEST) "vmovapd " #OFF "*64(" #PTR "), " #DEST ";\n"
|
||||
|
||||
#define VADDf(A,B,DEST) "vaddps " #A "," #B "," #DEST ";\n"
|
||||
#define VADDd(A,B,DEST) "vaddpd " #A "," #B "," #DEST ";\n"
|
||||
|
||||
#define VSUBf(A,B,DEST) "vsubps " #A "," #B "," #DEST ";\n"
|
||||
#define VSUBd(A,B,DEST) "vsubpd " #A "," #B "," #DEST ";\n"
|
||||
|
||||
#define VADDMEMf(O,A,B,DEST) "vaddps "#O"*64("#A ")," #B "," #DEST ";\n"
|
||||
#define VADDMEMd(O,A,B,DEST) "vaddpd "#O"*64("#A ")," #B "," #DEST ";\n"
|
||||
|
||||
#define VSUBMEMf(O,A,B,DEST) "vsubps "#O"*64("#A ")," #B "," #DEST ";\n"
|
||||
#define VSUBMEMd(O,A,B,DEST) "vsubpd "#O"*64("#A ")," #B "," #DEST ";\n"
|
||||
|
||||
#define VMULf(A,B,DEST) "vmulps " #A "," #B "," #DEST ";\n"
|
||||
#define VMULd(A,B,DEST) "vmulpd " #A "," #B "," #DEST ";\n"
|
||||
|
||||
#define VMADDf(A,B,DEST) "vfmadd231ps " #A "," #B "," #DEST ";\n"
|
||||
#define VMADDd(A,B,DEST) "vfmadd231pd " #A "," #B "," #DEST ";\n"
|
||||
|
||||
#define VMULMEMf(O,A,B,DEST) "vmulps " #O"*64("#A ")," #B "," #DEST ";\n"
|
||||
#define VMULMEMd(O,A,B,DEST) "vmulpd " #O"*64("#A ")," #B "," #DEST ";\n"
|
||||
|
||||
#define VMADDMEMf(O,A,B,DEST) "vfmadd231ps " #O"*64("#A "),"#B "," #DEST ";\n"
|
||||
#define VMADDMEMd(O,A,B,DEST) "vfmadd231pd " #O"*64("#A "),"#B "," #DEST ";\n"
|
||||
|
||||
#define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
|
||||
#define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)
|
||||
|
||||
#define VPREFETCHNTA(O,A)
|
||||
#define VPREFETCH(O,A)
|
||||
|
||||
#define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
|
||||
#define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
|
||||
|
||||
// Swaps Re/Im ; could unify this with IMCI
|
||||
#define VSHUFd(A,DEST) "vpshufd $0x4e," #A "," #DEST ";\n"
|
||||
#define VSHUFf(A,DEST) "vpshufd $0xb1," #A "," #DEST ";\n"
|
||||
#define VSHUFMEMd(OFF,A,DEST) "vpshufd $0x4e, " #OFF"*64("#A ")," #DEST ";\n" // 32 bit level: 1,0,3,2
|
||||
#define VSHUFMEMf(OFF,A,DEST) "vpshufd $0xb1, " #OFF"*64("#A ")," #DEST ";\n" // 32 bit level: 2,3,0,1
|
||||
|
||||
#define TRAP " int3 ;\n"
|
||||
|
||||
#endif
|
154
lib/simd/Intel512double.h
Normal file
@ -0,0 +1,154 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/simd/Avx512Asm.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
// No guard can be multiply included as undef clearage
|
||||
#undef VZERO
|
||||
#undef VMOV
|
||||
#undef VLOAD
|
||||
#undef VSTORE
|
||||
#define VZERO(A) VZEROd(A)
|
||||
#define VMOV(A,B) VMOVd(A,B)
|
||||
#define VLOAD(OFF,PTR,DEST) VLOADd(OFF,PTR,DEST)
|
||||
#define VSTORE(OFF,PTR,SRC) VSTOREd(OFF,PTR,SRC)
|
||||
|
||||
#undef VADD
|
||||
#undef VSUB
|
||||
#undef VMUL
|
||||
#undef VMADD
|
||||
#define VADD(A,B,C) VADDd(A,B,C)
|
||||
#define VSUB(A,B,C) VSUBd(A,B,C)
|
||||
#define VMUL(Uri,Uir,Chi) VMULd(Uri,Uir,Chi)
|
||||
#define VMADD(Uri,Uir,Chi) VMADDd(Uri,Uir,Chi)
|
||||
|
||||
|
||||
#undef VTIMESI
|
||||
#undef VTIMESI0
|
||||
#undef VTIMESI1
|
||||
#undef VTIMESI2
|
||||
#define VTIMESI(A,B,C) VTIMESId(A,B,C)
|
||||
#define VTIMESI0(A,B,C) VTIMESI0d(A,B,C)
|
||||
#define VTIMESI1(A,B,C) VTIMESI1d(A,B,C)
|
||||
#define VTIMESI2(A,B,C) VTIMESI2d(A,B,C)
|
||||
|
||||
#undef VTIMESMINUSI
|
||||
#undef VTIMESMINUSI0
|
||||
#undef VTIMESMINUSI1
|
||||
#undef VTIMESMINUSI2
|
||||
#define VTIMESMINUSI(A,B,C) VTIMESMINUSId(A,B,C)
|
||||
#define VTIMESMINUSI0(A,B,C) VTIMESMINUSI0d(A,B,C)
|
||||
#define VTIMESMINUSI1(A,B,C) VTIMESMINUSI1d(A,B,C)
|
||||
#define VTIMESMINUSI2(A,B,C) VTIMESMINUSI2d(A,B,C)
|
||||
|
||||
#undef VACCTIMESI
|
||||
#undef VACCTIMESI0
|
||||
#undef VACCTIMESI1
|
||||
#undef VACCTIMESI2
|
||||
#define VACCTIMESI(A,B,C) VACCTIMESId(A,B,C)
|
||||
#define VACCTIMESI0(A,B,C) VACCTIMESI0d(A,B,C)
|
||||
#define VACCTIMESI1(A,B,C) VACCTIMESI1d(A,B,C)
|
||||
#define VACCTIMESI2(A,B,C) VACCTIMESI2d(A,B,C)
|
||||
|
||||
#undef VACCTIMESMINUSI
|
||||
#undef VACCTIMESMINUSI0
|
||||
#undef VACCTIMESMINUSI1
|
||||
#undef VACCTIMESMINUSI2
|
||||
#define VACCTIMESMINUSI(A,B,C) VACCTIMESMINUSId(A,B,C)
|
||||
#define VACCTIMESMINUSI0(A,B,C) VACCTIMESMINUSI0d(A,B,C)
|
||||
#define VACCTIMESMINUSI1(A,B,C) VACCTIMESMINUSI1d(A,B,C)
|
||||
#define VACCTIMESMINUSI2(A,B,C) VACCTIMESMINUSI2d(A,B,C)
|
||||
|
||||
#undef VACCTIMESI1MEM
|
||||
#undef VACCTIMESI2MEM
|
||||
#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMd(A,ACC,O,P)
|
||||
#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMd(A,ACC,O,P)
|
||||
|
||||
#undef VACCTIMESMINUSI1MEM
|
||||
#undef VACCTIMESMINUSI2MEM
|
||||
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMd(A,ACC,O,P)
|
||||
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMd(A,ACC,O,P)
|
||||
|
||||
#undef VPERM0
|
||||
#undef VPERM1
|
||||
#undef VPERM2
|
||||
#undef VPERM3
|
||||
#define VPERM0(A,B) VPERM0d(A,B)
|
||||
#define VPERM1(A,B) VPERM1d(A,B)
|
||||
#define VPERM2(A,B) VPERM2d(A,B)
|
||||
#define VPERM3(A,B) VPERM3d(A,B)
|
||||
|
||||
#undef VSHUFMEM
|
||||
#undef VADDMEM
|
||||
#undef VSUBMEM
|
||||
#define VSHUFMEM(OFF,A,DEST) VSHUFMEMd(OFF,A,DEST)
|
||||
#define VADDMEM(O,A,B,C) VADDMEMd(O,A,B,C)
|
||||
#define VSUBMEM(O,A,B,C) VSUBMEMd(O,A,B,C)
|
||||
|
||||
#undef VMOVIDUP
|
||||
#undef VMOVRDUP
|
||||
#undef VMADDSUB
|
||||
#undef VSHUF
|
||||
#define VMOVIDUP(A,B,C) VMOVIDUPd(A,B,C)
|
||||
#define VMOVRDUP(A,B,C) VMOVRDUPd(A,B,C)
|
||||
#define VMADDSUB(A,B,accum) VMADDSUBd(A,B,accum)
|
||||
#define VSHUF(A,B) VSHUFd(A,B)
|
||||
|
||||
|
||||
#undef ZEND1
|
||||
#undef ZEND2
|
||||
#undef ZLOAD
|
||||
#undef ZMUL
|
||||
#undef ZMADD
|
||||
#undef ZMULMEM2SP
|
||||
#undef ZMADDMEM2SP
|
||||
|
||||
#define ZEND1(A,B,C) ZEND1d(A,B,C)
|
||||
#define ZEND2(A,B,C) ZEND2d(A,B,C)
|
||||
#define ZLOAD(A,B,C,D) ZLOADd(A,B,C,D)
|
||||
#define ZMUL(A,B,C,D,E) ZMULd(A,B,C,D,E)
|
||||
#define ZMADD(A,B,C,D,E) ZMADDd(A,B,C,D,E)
|
||||
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
|
||||
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
|
||||
|
||||
|
||||
#undef VRDUP
|
||||
#undef VIDUP
|
||||
#undef VMADDSUBMEM
|
||||
#undef VMADDMEM
|
||||
#undef VMULMEM
|
||||
#define VRDUP(SRC,DEST) VRDUPd(SRC,DEST)
|
||||
#define VIDUP(SRC,DEST) VIDUPd(SRC,DEST)
|
||||
#define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMd(O,P,B,accum)
|
||||
#define VMADDMEM(O,P,B,accum) VMADDMEMd(O,P,B,accum)
|
||||
#define VMULMEM(O,P,B,accum) VMULMEMd(O,P,B,accum)
|
||||
#undef VMADDSUBRDUP
|
||||
#undef VMADDSUBIDUP
|
||||
#undef VMULRDUP
|
||||
#undef VMULIDUP
|
||||
#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPd(O,P,B,accum)
|
||||
#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPd(O,P,B,accum)
|
||||
#define VMULRDUP(O,P,B,accum) VMULRDUPd(O,P,B,accum)
|
||||
#define VMULIDUP(O,P,B,accum) VMULIDUPd(O,P,B,accum)
|
127
lib/simd/Intel512imci.h
Normal file
@ -0,0 +1,127 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/simd/Avx512Asm.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#ifndef GRID_ASM_AV512_H
|
||||
#define GRID_ASM_AV512_H
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
// Knights Corner specials
|
||||
////////////////////////////////////////////////////////////
|
||||
|
||||
#define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
|
||||
#define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)
|
||||
|
||||
#define ZMULf(Ari,Air,B,Criir,Ciirr) VMULf(Ari,B,Criir) VMULf(Air,B,Ciirr)
|
||||
#define ZMULd(Ari,Air,B,Criir,Ciirr) VMULd(Ari,B,Criir) VMULd(Air,B,Ciirr)
|
||||
|
||||
#define ZMADDf(Ari,Air,B,Criir,Ciirr) VMADDf(Ari,B,Criir) VMADDf(Air,B,Ciirr)
|
||||
#define ZMADDd(Ari,Air,B,Criir,Ciirr) VMADDd(Ari,B,Criir) VMADDd(Air,B,Ciirr)
|
||||
|
||||
#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
|
||||
#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)
|
||||
|
||||
#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
|
||||
VSHUFMEMf(O,P,tmp) \
|
||||
VMULMEMf(O,P,B,Biirr) \
|
||||
VMULMEMf(O,P,C,Ciirr) \
|
||||
VMULf(tmp,B,Briir) \
|
||||
VMULf(tmp,C,Criir)
|
||||
|
||||
#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
|
||||
VSHUFMEMd(O,P,tmp) \
|
||||
VMULMEMd(O,P,B,Biirr) \
|
||||
VMULMEMd(O,P,C,Ciirr) \
|
||||
VMULd(tmp,B,Briir) \
|
||||
VMULd(tmp,C,Criir)
|
||||
|
||||
#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
|
||||
VSHUFMEMf(O,P,tmp) \
|
||||
VMADDMEMf(O,P,B,Biirr) \
|
||||
VMADDMEMf(O,P,C,Ciirr) \
|
||||
VMADDf(tmp,B,Briir) \
|
||||
VMADDf(tmp,C,Criir)
|
||||
|
||||
#define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \
|
||||
VSHUFMEMd(O,P,tmp) \
|
||||
VMADDMEMd(O,P,B,Biirr) \
|
||||
VMADDMEMd(O,P,C,Ciirr) \
|
||||
VMADDd(tmp,B,Briir) \
|
||||
VMADDd(tmp,C,Criir)
|
||||
|
||||
#define ZEND1d(Criir,Ciirr, tmp) "vaddpd " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n"
|
||||
#define ZEND2d(Criir,Ciirr, tmp) "vsubpd " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}" ";\n"
|
||||
|
||||
#define ZEND1f(Criir,Ciirr, tmp) "vaddps " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n"
|
||||
#define ZEND2f(Criir,Ciirr, tmp) "vsubps " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}" ";\n"
|
||||
|
||||
#define VTIMESI0f(A,DEST, Z)
|
||||
#define VTIMESI1f(A,DEST, Z) "vaddps " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
|
||||
#define VTIMESI2f(A,DEST, Z) "vsubps " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"
|
||||
|
||||
#define VTIMESI0d(A,DEST, Z)
|
||||
#define VTIMESI1d(A,DEST, Z) "vaddpd " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
|
||||
#define VTIMESI2d(A,DEST, Z) "vsubpd " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"
|
||||
|
||||
#define VTIMESMINUSI0f(A,DEST,Z)
|
||||
#define VTIMESMINUSI1f(A,DEST,Z) "vsubps " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
|
||||
#define VTIMESMINUSI2f(A,DEST,Z) "vaddps " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"
|
||||
|
||||
#define VTIMESMINUSI0d(A,DEST,Z)
|
||||
#define VTIMESMINUSI1d(A,DEST,Z) "vsubpd " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
|
||||
#define VTIMESMINUSI2d(A,DEST,Z) "vaddpd " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"
|
||||
|
||||
#define VACCTIMESI0f(A,ACC,tmp)
|
||||
#define VACCTIMESI1f(A,ACC,tmp) "vaddps " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
|
||||
#define VACCTIMESI2f(A,ACC,tmp) "vsubps " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
|
||||
|
||||
#define VACCTIMESI0d(A,ACC,tmp)
|
||||
#define VACCTIMESI1d(A,ACC,tmp) "vaddpd " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
|
||||
#define VACCTIMESI2d(A,ACC,tmp) "vsubpd " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
|
||||
|
||||
#define VACCTIMESMINUSI0f(A,ACC,tmp)
|
||||
#define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
|
||||
#define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
|
||||
|
||||
// Acc = Acc - i A
|
||||
#define VACCTIMESMINUSI0d(A,ACC,tmp)
|
||||
#define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
|
||||
#define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
|
||||
|
||||
//((1<<6)|(0<<4)|(3<<2)|(2)) == 0100,1110 = 0x4e
|
||||
//((2<<6)|(3<<4)|(0<<2)|(1)) == 1011,0001 = 0xb1
|
||||
|
||||
#define VPERM0f(A,B) "vpermf32x4 $0x4e," #A "," #B ";\n"
#define VPERM1f(A,B) "vpermf32x4 $0xb1," #A "," #B ";\n"
#define VPERM2f(A,B) "vmovaps " #A "{badc}," #B ";\n"
#define VPERM3f(A,B) "vmovaps " #A "{cdab}," #B ";\n"

#define VPERM0d(A,B) "vpermf32x4 $0x4e," #A "," #B ";\n"
#define VPERM1d(A,B) "vmovapd " #A "{badc}," #B ";\n"
#define VPERM2d(A,B) "vmovapd " #A "{cdab}," #B ";\n"
#define VPERM3d(A,B) VMOVd(A,B)

#endif
155
lib/simd/Intel512single.h
Normal file
@@ -0,0 +1,155 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/Avx512Asm.h

    Copyright (C) 2015

Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// No include guard: this header may be included multiple times, since it
// first #undef's and then rebinds the generic macro names.
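The intent (an assumption based on this comment and on the headers referenced elsewhere in this commit, not something stated explicitly in the diff) is that a kernel source includes this header to bind the generic names to the single-precision ...f macros, and a matching double-precision header to rebind the same names later. A rough sketch of that pattern; "Intel512double.h" is assumed to be the double-precision counterpart and is not part of this diff:

// Sketch only -- lives inside a Grid translation unit, not a standalone program.
#include <simd/Intel512common.h>
#include <simd/Intel512avx.h>

#include <simd/Intel512single.h>   // VLOAD -> VLOADf, VMUL -> VMULf, ...
//   ... emit the single-precision kernel body here ...

#include <simd/Intel512double.h>   // assumed counterpart: rebinds the same names to the ...d variants
//   ... emit the double-precision kernel body here ...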
#undef VZERO
#undef VMOV
#undef VLOAD
#undef VSTORE
#define VZERO(A)            VZEROf(A)
#define VMOV(A,B)           VMOVf(A,B)
#define VLOAD(OFF,PTR,DEST) VLOADf(OFF,PTR,DEST)
#define VSTORE(OFF,PTR,SRC) VSTOREf(OFF,PTR,SRC)

#undef VADD
#undef VSUB
#undef VMUL
#undef VMADD
#define VADD(A,B,C)         VADDf(A,B,C)
#define VSUB(A,B,C)         VSUBf(A,B,C)
#define VMUL(Uri,Uir,Chi)   VMULf(Uri,Uir,Chi)
#define VMADD(Uri,Uir,Chi)  VMADDf(Uri,Uir,Chi)

#undef VTIMESI
#undef VTIMESI0
#undef VTIMESI1
#undef VTIMESI2
#define VTIMESI(A,B,C)      VTIMESIf(A,B,C)
#define VTIMESI0(A,B,C)     VTIMESI0f(A,B,C)
#define VTIMESI1(A,B,C)     VTIMESI1f(A,B,C)
#define VTIMESI2(A,B,C)     VTIMESI2f(A,B,C)

#undef VTIMESMINUSI
#undef VTIMESMINUSI0
#undef VTIMESMINUSI1
#undef VTIMESMINUSI2
#define VTIMESMINUSI(A,B,C)  VTIMESMINUSIf(A,B,C)
#define VTIMESMINUSI0(A,B,C) VTIMESMINUSI0f(A,B,C)
#define VTIMESMINUSI1(A,B,C) VTIMESMINUSI1f(A,B,C)
#define VTIMESMINUSI2(A,B,C) VTIMESMINUSI2f(A,B,C)

#undef VACCTIMESI
#undef VACCTIMESI0
#undef VACCTIMESI1
#undef VACCTIMESI2
#define VACCTIMESI(A,B,C)   VACCTIMESIf(A,B,C)
#define VACCTIMESI0(A,B,C)  VACCTIMESI0f(A,B,C)
#define VACCTIMESI1(A,B,C)  VACCTIMESI1f(A,B,C)
#define VACCTIMESI2(A,B,C)  VACCTIMESI2f(A,B,C)

#undef VACCTIMESMINUSI
#undef VACCTIMESMINUSI0
#undef VACCTIMESMINUSI1
#undef VACCTIMESMINUSI2
#define VACCTIMESMINUSI(A,B,C)  VACCTIMESMINUSIf(A,B,C)
#define VACCTIMESMINUSI0(A,B,C) VACCTIMESMINUSI0f(A,B,C)
#define VACCTIMESMINUSI1(A,B,C) VACCTIMESMINUSI1f(A,B,C)
#define VACCTIMESMINUSI2(A,B,C) VACCTIMESMINUSI2f(A,B,C)

#undef VACCTIMESI1MEM
#undef VACCTIMESI2MEM
#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMf(A,ACC,O,P)
#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMf(A,ACC,O,P)

#undef VACCTIMESMINUSI1MEM
#undef VACCTIMESMINUSI2MEM
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)

#undef VPERM0
#undef VPERM1
#undef VPERM2
#undef VPERM3
#define VPERM0(A,B) VPERM0f(A,B)
#define VPERM1(A,B) VPERM1f(A,B)
#define VPERM2(A,B) VPERM2f(A,B)
#define VPERM3(A,B) VPERM3f(A,B)

#undef VSHUFMEM
#undef VADDMEM
#undef VSUBMEM
#define VSHUFMEM(OFF,A,DEST) VSHUFMEMf(OFF,A,DEST)
#define VADDMEM(O,A,B,C)     VADDMEMf(O,A,B,C)
#define VSUBMEM(O,A,B,C)     VSUBMEMf(O,A,B,C)

#undef VMOVIDUP
#undef VMOVRDUP
#undef VMADDSUB
#undef VSHUF
#define VMOVIDUP(A,B,C)     VMOVIDUPf(A,B,C)
#define VMOVRDUP(A,B,C)     VMOVRDUPf(A,B,C)
#define VMADDSUB(A,B,accum) VMADDSUBf(A,B,accum)
#define VSHUF(A,B)          VSHUFf(A,B)

#undef ZEND1
#undef ZEND2
#undef ZLOAD
#undef ZMUL
#undef ZMADD
#undef ZMULMEM2SP
#undef ZMADDMEM2SP

#define ZEND1(A,B,C)        ZEND1f(A,B,C)
#define ZEND2(A,B,C)        ZEND2f(A,B,C)
#define ZLOAD(A,B,C,D)      ZLOADf(A,B,C,D)
#define ZMUL(A,B,C,D,E)     ZMULf(A,B,C,D,E)
#define ZMADD(A,B,C,D,E)    ZMADDf(A,B,C,D,E)
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)

#undef VRDUP
#undef VIDUP
#undef VMADDSUBMEM
#undef VMADDMEM
#undef VMULMEM

#define VRDUP(SRC,DEST)          VRDUPf(SRC,DEST)
#define VIDUP(SRC,DEST)          VIDUPf(SRC,DEST)
#define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMf(O,P,B,accum)
#define VMADDMEM(O,P,B,accum)    VMADDMEMf(O,P,B,accum)
#define VMULMEM(O,P,B,accum)     VMULMEMf(O,P,B,accum)

#undef VMADDSUBRDUP
#undef VMADDSUBIDUP
#undef VMULRDUP
#undef VMULIDUP
#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPf(O,P,B,accum)
#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPf(O,P,B,accum)
#define VMULRDUP(O,P,B,accum)     VMULRDUPf(O,P,B,accum)
#define VMULIDUP(O,P,B,accum)     VMULIDUPf(O,P,B,accum)
849
lib/simd/Intel512wilson.h
Normal file
@@ -0,0 +1,849 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/simd/Avx512Asm.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#ifndef GRID_ASM_INTEL_512_QCD_H
|
||||
#define GRID_ASM_INTEL_512_QCD_H
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Register allocations for the Wilson kernel are precision independent
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
#define result_00 %zmm0
|
||||
#define result_01 %zmm1
|
||||
#define result_02 %zmm2
|
||||
|
||||
#define result_10 %zmm3
|
||||
#define result_11 %zmm4
|
||||
#define result_12 %zmm5
|
||||
|
||||
#define result_20 %zmm6
|
||||
#define result_21 %zmm7
|
||||
#define result_22 %zmm8
|
||||
|
||||
#define result_30 %zmm9
|
||||
#define result_31 %zmm10
|
||||
#define result_32 %zmm11
|
||||
|
||||
#define Chi_00 %zmm12
|
||||
#define Chi_01 %zmm13
|
||||
#define Chi_02 %zmm14
|
||||
|
||||
#define Chi_10 %zmm15
|
||||
#define Chi_11 %zmm16
|
||||
#define Chi_12 %zmm17
|
||||
|
||||
#define UChi_00 %zmm18
|
||||
#define UChi_01 %zmm19
|
||||
#define UChi_02 %zmm20
|
||||
|
||||
#define UChi_10 %zmm21
|
||||
#define UChi_11 %zmm22
|
||||
#define UChi_12 %zmm23
|
||||
|
||||
#define Uir %zmm24
|
||||
#define Uri %zmm25
|
||||
#define T1 %zmm24
|
||||
#define T2 %zmm25
|
||||
|
||||
#define Z0 %zmm26
|
||||
#define Z1 %zmm27
|
||||
#define Z2 %zmm28
|
||||
#define Z3 %zmm29
|
||||
#define Z4 %zmm30
|
||||
#define Z5 %zmm31
|
||||
|
||||
#define TMP Chi_00
|
||||
|
||||
#define Chimu_00 Chi_00
|
||||
#define Chimu_01 Chi_01
|
||||
#define Chimu_02 Chi_02
|
||||
#define Chimu_10 Chi_10
|
||||
#define Chimu_11 Chi_11
|
||||
#define Chimu_12 Chi_12
|
||||
#define Chimu_20 UChi_00
|
||||
#define Chimu_21 UChi_01
|
||||
#define Chimu_22 UChi_02
|
||||
#define Chimu_30 UChi_10
|
||||
#define Chimu_31 UChi_11
|
||||
#define Chimu_32 UChi_12
|
||||
|
||||
#include <simd/Intel512common.h>
|
||||
#include <simd/Intel512avx.h>
|
||||
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// Macros used to build the Wilson kernel -- these could be rationalised and
// simplified a little, as some duplication crept in while trying different
// variants during optimisation; the set could be cut back to only those used.
|
||||
//////////////////////////////////////////////////////////////////
|
||||
|
||||
// const SiteSpinor * ptr = & in._odata[offset];
|
||||
#define LOAD_CHIMU(PTR) LOAD_CHIMUi(PTR)
|
||||
#define LOAD_CHI(PTR) LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi );
|
||||
#define SAVE_UCHI(PTR) SAVE_UCHIi(PTR)
|
||||
#define SAVE_CHI(PTR) SAVE_CHIi(PTR)
|
||||
#define SAVE_RESULT(PTR) SAVE_RESULTi(PTR)
|
||||
|
||||
#define LOAD_CHIMUi \
|
||||
LOAD_CHIMU01i \
|
||||
LOAD_CHIMU23i );
|
||||
|
||||
|
||||
#define LOAD_CHIMU01i\
|
||||
VLOAD(0,%r8,Chimu_00) \
|
||||
VLOAD(1,%r8,Chimu_01) \
|
||||
VLOAD(2,%r8,Chimu_02) \
|
||||
VLOAD(3,%r8,Chimu_10) \
|
||||
VLOAD(4,%r8,Chimu_11) \
|
||||
VLOAD(5,%r8,Chimu_12)
|
||||
|
||||
#define LOAD_CHIMU23i\
|
||||
VLOAD(6,%r8,Chimu_20) \
|
||||
VLOAD(7,%r8,Chimu_21) \
|
||||
VLOAD(8,%r8,Chimu_22) \
|
||||
VLOAD(9,%r8,Chimu_30) \
|
||||
VLOAD(10,%r8,Chimu_31) \
|
||||
VLOAD(11,%r8,Chimu_32)
|
||||
|
||||
#define SHUF_CHIMU23i\
|
||||
VSHUFMEM(6,%r8,Chimu_20) \
|
||||
VSHUFMEM(7,%r8,Chimu_21) \
|
||||
VSHUFMEM(8,%r8,Chimu_22) \
|
||||
VSHUFMEM(9,%r8,Chimu_30) \
|
||||
VSHUFMEM(10,%r8,Chimu_31) \
|
||||
VSHUFMEM(11,%r8,Chimu_32)
|
||||
|
||||
|
||||
// const SiteHalfSpinor *ptr = &buf[offset];
|
||||
|
||||
#define LOAD_CHIi \
|
||||
VLOAD(0,%r8,Chi_00) \
|
||||
VLOAD(1,%r8,Chi_01) \
|
||||
VLOAD(2,%r8,Chi_02) \
|
||||
VLOAD(3,%r8,Chi_10) \
|
||||
VLOAD(4,%r8,Chi_11) \
|
||||
VLOAD(5,%r8,Chi_12)
|
||||
|
||||
|
||||
#define SAVE_UCHIi(PTR) \
|
||||
LOAD64(%r8,PTR) \
|
||||
__asm__ ( \
|
||||
VSTORE(0,%r8,UChi_00) \
|
||||
VSTORE(1,%r8,UChi_01) \
|
||||
VSTORE(2,%r8,UChi_02) \
|
||||
VSTORE(3,%r8,UChi_10) \
|
||||
VSTORE(4,%r8,UChi_11) \
|
||||
VSTORE(5,%r8,UChi_12) \
|
||||
);
|
||||
|
||||
#define SAVE_CHIi(PTR) \
|
||||
LOAD64(%r8,PTR) \
|
||||
__asm__ ( \
|
||||
VSTORE(0,%r8,Chi_00) \
|
||||
VSTORE(1,%r8,Chi_01) \
|
||||
VSTORE(2,%r8,Chi_02) \
|
||||
VSTORE(3,%r8,Chi_10) \
|
||||
VSTORE(4,%r8,Chi_11) \
|
||||
VSTORE(5,%r8,Chi_12) \
|
||||
);
|
||||
|
||||
#define SAVE_RESULTi(PTR)\
|
||||
LOAD64(%r8,PTR) \
|
||||
__asm__ ( \
|
||||
VSTORE(0,%r8,result_00) \
|
||||
VSTORE(1,%r8,result_01) \
|
||||
VSTORE(2,%r8,result_02) \
|
||||
VSTORE(3,%r8,result_10) \
|
||||
VSTORE(4,%r8,result_11) \
|
||||
VSTORE(5,%r8,result_12) \
|
||||
VSTORE(6,%r8,result_20) \
|
||||
VSTORE(7,%r8,result_21) \
|
||||
VSTORE(8,%r8,result_22) \
|
||||
VSTORE(9,%r8,result_30) \
|
||||
VSTORE(10,%r8,result_31) \
|
||||
VSTORE(11,%r8,result_32) \
|
||||
);
|
||||
|
||||
#define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p)
|
||||
#define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN_PFYP(&U._odata[sU](A),p)
|
||||
#define MULT_2SPIN_DIR_PFZP(A,p) MULT_2SPIN_PFZP(&U._odata[sU](A),p)
|
||||
#define MULT_2SPIN_DIR_PFTP(A,p) MULT_2SPIN_PFTP(&U._odata[sU](A),p)
|
||||
|
||||
#define MULT_2SPIN_DIR_PFXM(A,p) MULT_2SPIN_PFXM(&U._odata[sU](A),p)
|
||||
#define MULT_2SPIN_DIR_PFYM(A,p) MULT_2SPIN_PFYM(&U._odata[sU](A),p)
|
||||
#define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN_PFZM(&U._odata[sU](A),p)
|
||||
#define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN_PFTM(&U._odata[sU](A),p)
|
||||
|
||||
#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr,pf)
|
||||
#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr,pf)
|
||||
#define MULT_2SPIN_PFZM(ptr,pf) MULT_2SPIN(ptr,pf)
|
||||
#define MULT_2SPIN_PFTM(ptr,pf) MULT_2SPIN(ptr,pf)
|
||||
#define MULT_2SPIN_PFTP(ptr,pf) MULT_2SPIN(ptr,pf)
|
||||
#define MULT_2SPIN_PFZP(ptr,pf) MULT_2SPIN(ptr,pf)
|
||||
#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr,pf)
|
||||
#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr,pf)
|
||||
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// Dirac algebra
|
||||
//////////////////////////////////////////////////////////////////
|
||||
|
||||
// hspin(0)=fspin(0)+timesI(fspin(3));
|
||||
// hspin(1)=fspin(1)+timesI(fspin(2));
|
||||
#define XP_PROJMEM(PTR) \
|
||||
LOAD64(%r8,PTR) \
|
||||
__asm__ ( \
|
||||
LOAD_CHIi \
|
||||
SHUF_CHIMU23i \
|
||||
VACCTIMESI1(Chi_00,Chi_00,Chimu_30) \
|
||||
VACCTIMESI1(Chi_01,Chi_01,Chimu_31) \
|
||||
VACCTIMESI1(Chi_02,Chi_02,Chimu_32) \
|
||||
VACCTIMESI1(Chi_10,Chi_10,Chimu_20) \
|
||||
VACCTIMESI1(Chi_11,Chi_11,Chimu_21) \
|
||||
VACCTIMESI1(Chi_12,Chi_12,Chimu_22) \
|
||||
VACCTIMESI2(Chi_00,Chi_00,Chimu_30) \
|
||||
VACCTIMESI2(Chi_01,Chi_01,Chimu_31) \
|
||||
VACCTIMESI2(Chi_02,Chi_02,Chimu_32) \
|
||||
VACCTIMESI2(Chi_10,Chi_10,Chimu_20) \
|
||||
VACCTIMESI2(Chi_11,Chi_11,Chimu_21) \
|
||||
VACCTIMESI2(Chi_12,Chi_12,Chimu_22) );
|
||||
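As a cross-check of the spin algebra quoted in the comment above XP_PROJMEM, here is a minimal scalar sketch of the X+ projection on one colour component (illustrative only; the names and the timesI helper are not part of this header):

#include <array>
#include <complex>
#include <iostream>

using cplx = std::complex<double>;
static cplx timesI(cplx z) { return {-z.imag(), z.real()}; }   // i*z

int main() {
  // one colour component of a 4-spinor: fspin[0..3]
  std::array<cplx,4> fspin = {cplx(1,0), cplx(0,1), cplx(2,1), cplx(1,2)};
  // X+ projection: hspin(0)=fspin(0)+i*fspin(3), hspin(1)=fspin(1)+i*fspin(2)
  cplx hspin0 = fspin[0] + timesI(fspin[3]);
  cplx hspin1 = fspin[1] + timesI(fspin[2]);
  std::cout << hspin0 << " " << hspin1 << "\n";
}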
|
||||
|
||||
#define YP_PROJMEM(ptr) \
|
||||
LOAD64(%r8,ptr) \
|
||||
__asm__ ( \
|
||||
LOAD_CHIMU01i \
|
||||
VSUBMEM(9,%r8 ,Chimu_00,Chi_00) \
|
||||
VSUBMEM(10,%r8,Chimu_01,Chi_01) \
|
||||
VSUBMEM(11,%r8,Chimu_02,Chi_02) \
|
||||
VADDMEM(6,%r8,Chimu_10,Chi_10) \
|
||||
VADDMEM(7,%r8,Chimu_11,Chi_11) \
|
||||
VADDMEM(8,%r8,Chimu_12,Chi_12) );
|
||||
|
||||
#define ZP_PROJMEM(PTR) \
|
||||
LOAD64(%r8,PTR) \
|
||||
__asm__ ( \
|
||||
LOAD_CHIi \
|
||||
SHUF_CHIMU23i \
|
||||
VACCTIMESI1(Chi_00,Chi_00,Chimu_20) \
|
||||
VACCTIMESI1(Chi_01,Chi_01,Chimu_21) \
|
||||
VACCTIMESI1(Chi_02,Chi_02,Chimu_22) \
|
||||
VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_30) \
|
||||
VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_31) \
|
||||
VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_32) \
|
||||
VACCTIMESI2(Chi_00,Chi_00,Chimu_20) \
|
||||
VACCTIMESI2(Chi_01,Chi_01,Chimu_21) \
|
||||
VACCTIMESI2(Chi_02,Chi_02,Chimu_22) \
|
||||
VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_30) \
|
||||
VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_31) \
|
||||
VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_32) );
|
||||
|
||||
|
||||
#define TP_PROJMEM(ptr) \
|
||||
LOAD64(%r8,ptr) \
|
||||
__asm__ ( \
|
||||
LOAD_CHIMU01i \
|
||||
VADDMEM(6,%r8 ,Chimu_00,Chi_00) \
|
||||
VADDMEM(7,%r8,Chimu_01,Chi_01) \
|
||||
VADDMEM(8,%r8,Chimu_02,Chi_02) \
|
||||
VADDMEM(9,%r8,Chimu_10,Chi_10) \
|
||||
VADDMEM(10,%r8,Chimu_11,Chi_11) \
|
||||
VADDMEM(11,%r8,Chimu_12,Chi_12) );
|
||||
|
||||
// hspin(0)=fspin(0)-timesI(fspin(3))
|
||||
// hspin(1)=fspin(1)-timesI(fspin(2))
|
||||
|
||||
#define XM_PROJMEM(PTR) \
|
||||
LOAD64(%r8,PTR)\
|
||||
__asm__ ( \
|
||||
SHUF_CHIMU23i \
|
||||
LOAD_CHIi \
|
||||
VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30)\
|
||||
VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31)\
|
||||
VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32)\
|
||||
VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_20)\
|
||||
VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_21)\
|
||||
VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_22)\
|
||||
VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_30)\
|
||||
VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_31)\
|
||||
VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_32)\
|
||||
VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_20)\
|
||||
VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_21)\
|
||||
VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_22) );
|
||||
|
||||
#define YM_PROJMEM(ptr) \
|
||||
LOAD64(%r8,ptr) \
|
||||
__asm__ ( \
|
||||
LOAD_CHIMU01i \
|
||||
VADDMEM(9,%r8 ,Chimu_00,Chi_00) \
|
||||
VADDMEM(10,%r8,Chimu_01,Chi_01) \
|
||||
VADDMEM(11,%r8,Chimu_02,Chi_02) \
|
||||
VSUBMEM(6,%r8,Chimu_10,Chi_10) \
|
||||
VSUBMEM(7,%r8,Chimu_11,Chi_11) \
|
||||
VSUBMEM(8,%r8,Chimu_12,Chi_12) );
|
||||
|
||||
#define ZM_PROJMEM(PTR) \
|
||||
LOAD64(%r8,PTR) \
|
||||
__asm__ ( \
|
||||
SHUF_CHIMU23i \
|
||||
LOAD_CHIi \
|
||||
VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20)\
|
||||
VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21)\
|
||||
VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22)\
|
||||
VACCTIMESI1(Chi_10,Chi_10,Chimu_30)\
|
||||
VACCTIMESI1(Chi_11,Chi_11,Chimu_31)\
|
||||
VACCTIMESI1(Chi_12,Chi_12,Chimu_32)\
|
||||
VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_20)\
|
||||
VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_21)\
|
||||
VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_22)\
|
||||
VACCTIMESI2(Chi_10,Chi_10,Chimu_30)\
|
||||
VACCTIMESI2(Chi_11,Chi_11,Chimu_31)\
|
||||
VACCTIMESI2(Chi_12,Chi_12,Chimu_32) );
|
||||
|
||||
#define TM_PROJMEM(ptr) \
|
||||
LOAD64(%r8,ptr) \
|
||||
__asm__ ( \
|
||||
LOAD_CHIMU01i \
|
||||
VSUBMEM(6,%r8,Chimu_00,Chi_00) \
|
||||
VSUBMEM(7,%r8,Chimu_01,Chi_01) \
|
||||
VSUBMEM(8,%r8,Chimu_02,Chi_02) \
|
||||
VSUBMEM(9,%r8,Chimu_10,Chi_10) \
|
||||
VSUBMEM(10,%r8,Chimu_11,Chi_11) \
|
||||
VSUBMEM(11,%r8,Chimu_12,Chi_12) );
|
||||
|
||||
// fspin(0)=hspin(0)
|
||||
// fspin(1)=hspin(1)
|
||||
// fspin(2)=timesMinusI(hspin(1))
|
||||
// fspin(3)=timesMinusI(hspin(0))
|
||||
#define XP_RECON __asm__ ( \
|
||||
VZERO(TMP) \
|
||||
VTIMESMINUSI0(UChi_00,result_30,TMP) \
|
||||
VTIMESMINUSI0(UChi_10,result_20,TMP) \
|
||||
VTIMESMINUSI0(UChi_01,result_31,TMP) \
|
||||
VTIMESMINUSI0(UChi_11,result_21,TMP) \
|
||||
VTIMESMINUSI0(UChi_02,result_32,TMP) \
|
||||
VTIMESMINUSI0(UChi_12,result_22,TMP) \
|
||||
VMOV(UChi_00,result_00) \
|
||||
VMOV(UChi_10,result_10) \
|
||||
VMOV(UChi_01,result_01) \
|
||||
VMOV(UChi_11,result_11) \
|
||||
VMOV(UChi_02,result_02) \
|
||||
VMOV(UChi_12,result_12) \
|
||||
VTIMESMINUSI1(UChi_10,result_20,TMP) \
|
||||
VTIMESMINUSI1(UChi_11,result_21,TMP) \
|
||||
VTIMESMINUSI1(UChi_12,result_22,TMP) \
|
||||
VTIMESMINUSI1(UChi_00,result_30,TMP) \
|
||||
VTIMESMINUSI1(UChi_01,result_31,TMP) \
|
||||
VTIMESMINUSI1(UChi_02,result_32,TMP) \
|
||||
VTIMESMINUSI2(UChi_10,result_20,TMP) \
|
||||
VTIMESMINUSI2(UChi_11,result_21,TMP) \
|
||||
VTIMESMINUSI2(UChi_12,result_22,TMP) \
|
||||
VTIMESMINUSI2(UChi_00,result_30,TMP) \
|
||||
VTIMESMINUSI2(UChi_01,result_31,TMP) \
|
||||
VTIMESMINUSI2(UChi_02,result_32,TMP) \
|
||||
);
|
||||
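The matching reconstruction, again as a scalar sketch of what XP_RECON stages per colour component (the interleaving of the I0/I1/I2 register steps is omitted; names are illustrative):

#include <array>
#include <complex>
#include <iostream>

using cplx = std::complex<double>;
static cplx timesMinusI(cplx z) { return {z.imag(), -z.real()}; }   // -i*z

int main() {
  std::array<cplx,2> hspin = {cplx(1,2), cplx(3,-1)};   // stand-ins for UChi_0x, UChi_1x
  std::array<cplx,4> result;
  result[0] = hspin[0];                 // VMOV(UChi_0x,result_0x)
  result[1] = hspin[1];                 // VMOV(UChi_1x,result_1x)
  result[2] = timesMinusI(hspin[1]);    // result_2x = -i * UChi_1x
  result[3] = timesMinusI(hspin[0]);    // result_3x = -i * UChi_0x
  for (auto &r : result) std::cout << r << " ";
  std::cout << "\n";
}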
// NB could save 6 ops using addsub => 12 cycles
|
||||
#define XP_RECON_ACCUM __asm__ ( \
|
||||
VZERO(TMP)\
|
||||
VACCTIMESMINUSI0(UChi_00,result_30,Z3)\
|
||||
VACCTIMESMINUSI0(UChi_10,result_20,Z0)\
|
||||
VACCTIMESMINUSI0(UChi_01,result_31,Z4)\
|
||||
VACCTIMESMINUSI0(UChi_11,result_21,Z1)\
|
||||
VACCTIMESMINUSI0(UChi_02,result_32,Z5)\
|
||||
VACCTIMESMINUSI0(UChi_12,result_22,Z2)\
|
||||
VADD(UChi_00,result_00,result_00)\
|
||||
VADD(UChi_10,result_10,result_10)\
|
||||
VADD(UChi_01,result_01,result_01)\
|
||||
VADD(UChi_11,result_11,result_11)\
|
||||
VADD(UChi_02,result_02,result_02)\
|
||||
VADD(UChi_12,result_12,result_12)\
|
||||
VACCTIMESMINUSI1(UChi_00,result_30,Z3)\
|
||||
VACCTIMESMINUSI1(UChi_10,result_20,Z0)\
|
||||
VACCTIMESMINUSI1(UChi_01,result_31,Z4)\
|
||||
VACCTIMESMINUSI1(UChi_11,result_21,Z1)\
|
||||
VACCTIMESMINUSI1(UChi_02,result_32,Z5)\
|
||||
VACCTIMESMINUSI1(UChi_12,result_22,Z2)\
|
||||
VACCTIMESMINUSI2(UChi_10,result_20,Z0)\
|
||||
VACCTIMESMINUSI2(UChi_11,result_21,Z1)\
|
||||
VACCTIMESMINUSI2(UChi_12,result_22,Z2)\
|
||||
VACCTIMESMINUSI2(UChi_00,result_30,Z3)\
|
||||
VACCTIMESMINUSI2(UChi_01,result_31,Z4)\
|
||||
VACCTIMESMINUSI2(UChi_02,result_32,Z5)\
|
||||
);
|
||||
|
||||
#define XM_RECON __asm__ ( \
|
||||
VZERO(TMP)\
|
||||
VTIMESI0(UChi_00,result_30,TMP)\
|
||||
VTIMESI0(UChi_10,result_20,TMP)\
|
||||
VTIMESI0(UChi_01,result_31,TMP)\
|
||||
VTIMESI0(UChi_11,result_21,TMP)\
|
||||
VTIMESI0(UChi_02,result_32,TMP)\
|
||||
VTIMESI0(UChi_12,result_22,TMP)\
|
||||
VMOV(UChi_00,result_00)\
|
||||
VMOV(UChi_10,result_10)\
|
||||
VMOV(UChi_01,result_01)\
|
||||
VMOV(UChi_11,result_11)\
|
||||
VMOV(UChi_02,result_02)\
|
||||
VMOV(UChi_12,result_12)\
|
||||
VTIMESI1(UChi_00,result_30,TMP)\
|
||||
VTIMESI1(UChi_10,result_20,TMP)\
|
||||
VTIMESI1(UChi_01,result_31,TMP)\
|
||||
VTIMESI1(UChi_11,result_21,TMP)\
|
||||
VTIMESI1(UChi_02,result_32,TMP)\
|
||||
VTIMESI1(UChi_12,result_22,TMP)\
|
||||
VTIMESI2(UChi_10,result_20,TMP)\
|
||||
VTIMESI2(UChi_11,result_21,TMP)\
|
||||
VTIMESI2(UChi_12,result_22,TMP)\
|
||||
VTIMESI2(UChi_00,result_30,TMP)\
|
||||
VTIMESI2(UChi_01,result_31,TMP)\
|
||||
VTIMESI2(UChi_02,result_32,TMP)\
|
||||
);
|
||||
|
||||
#define XM_RECON_ACCUM __asm__ ( \
|
||||
VACCTIMESI0(UChi_10,result_20,Z0)\
|
||||
VACCTIMESI0(UChi_00,result_30,Z3)\
|
||||
VACCTIMESI0(UChi_11,result_21,Z1)\
|
||||
VACCTIMESI0(UChi_01,result_31,Z4)\
|
||||
VACCTIMESI0(UChi_12,result_22,Z2)\
|
||||
VACCTIMESI0(UChi_02,result_32,Z5)\
|
||||
\
|
||||
VADD(UChi_10,result_10,result_10)\
|
||||
VADD(UChi_00,result_00,result_00)\
|
||||
VADD(UChi_11,result_11,result_11)\
|
||||
VADD(UChi_01,result_01,result_01)\
|
||||
VADD(UChi_12,result_12,result_12)\
|
||||
VADD(UChi_02,result_02,result_02)\
|
||||
\
|
||||
VACCTIMESI1(UChi_10,result_20,Z0)\
|
||||
VACCTIMESI1(UChi_00,result_30,Z3)\
|
||||
VACCTIMESI1(UChi_11,result_21,Z1)\
|
||||
VACCTIMESI1(UChi_01,result_31,Z4)\
|
||||
VACCTIMESI1(UChi_12,result_22,Z2)\
|
||||
VACCTIMESI1(UChi_02,result_32,Z5)\
|
||||
VACCTIMESI2(UChi_10,result_20,Z0)\
|
||||
VACCTIMESI2(UChi_11,result_21,Z1)\
|
||||
VACCTIMESI2(UChi_12,result_22,Z2)\
|
||||
VACCTIMESI2(UChi_00,result_30,Z3)\
|
||||
VACCTIMESI2(UChi_01,result_31,Z4)\
|
||||
VACCTIMESI2(UChi_02,result_32,Z5)\
|
||||
);
|
||||
|
||||
#define YP_RECON_ACCUM __asm__ ( \
|
||||
VADD(UChi_00,result_00,result_00)\
|
||||
VADD(UChi_10,result_10,result_10)\
|
||||
VADD(UChi_01,result_01,result_01)\
|
||||
VADD(UChi_11,result_11,result_11)\
|
||||
VADD(UChi_02,result_02,result_02)\
|
||||
VADD(UChi_12,result_12,result_12)\
|
||||
VADD(UChi_10,result_20,result_20)\
|
||||
VADD(UChi_11,result_21,result_21)\
|
||||
VADD(UChi_12,result_22,result_22)\
|
||||
VSUB(UChi_00,result_30,result_30)\
|
||||
VSUB(UChi_01,result_31,result_31)\
|
||||
VSUB(UChi_02,result_32,result_32) );
|
||||
|
||||
#define YM_RECON_ACCUM __asm__ ( \
|
||||
VADD(UChi_00,result_00,result_00)\
|
||||
VADD(UChi_10,result_10,result_10)\
|
||||
VADD(UChi_01,result_01,result_01)\
|
||||
VADD(UChi_11,result_11,result_11)\
|
||||
VADD(UChi_02,result_02,result_02)\
|
||||
VADD(UChi_12,result_12,result_12)\
|
||||
VSUB(UChi_10,result_20,result_20)\
|
||||
VSUB(UChi_11,result_21,result_21)\
|
||||
VSUB(UChi_12,result_22,result_22)\
|
||||
VADD(UChi_00,result_30,result_30)\
|
||||
VADD(UChi_01,result_31,result_31)\
|
||||
VADD(UChi_02,result_32,result_32) );
|
||||
|
||||
#define ZP_RECON_ACCUM __asm__ ( \
|
||||
VACCTIMESMINUSI0(UChi_00,result_20,Z0)\
|
||||
VACCTIMESI0(UChi_10,result_30,Z3)\
|
||||
VACCTIMESMINUSI0(UChi_01,result_21,Z1)\
|
||||
VACCTIMESI0(UChi_11,result_31,Z4)\
|
||||
VACCTIMESMINUSI0(UChi_02,result_22,Z2)\
|
||||
VACCTIMESI0(UChi_12,result_32,Z5)\
|
||||
VADD(UChi_00,result_00,result_00)\
|
||||
VADD(UChi_10,result_10,result_10)\
|
||||
VADD(UChi_01,result_01,result_01)\
|
||||
VADD(UChi_11,result_11,result_11)\
|
||||
VADD(UChi_02,result_02,result_02)\
|
||||
VADD(UChi_12,result_12,result_12)\
|
||||
VACCTIMESMINUSI1(UChi_00,result_20,Z0)\
|
||||
VACCTIMESI1(UChi_10,result_30,Z3)\
|
||||
VACCTIMESMINUSI1(UChi_01,result_21,Z1)\
|
||||
VACCTIMESI1(UChi_11,result_31,Z4)\
|
||||
VACCTIMESMINUSI1(UChi_02,result_22,Z2)\
|
||||
VACCTIMESI1(UChi_12,result_32,Z5)\
|
||||
VACCTIMESMINUSI2(UChi_00,result_20,Z0)\
|
||||
VACCTIMESMINUSI2(UChi_01,result_21,Z1)\
|
||||
VACCTIMESMINUSI2(UChi_02,result_22,Z2)\
|
||||
VACCTIMESI2(UChi_10,result_30,Z3)\
|
||||
VACCTIMESI2(UChi_11,result_31,Z4)\
|
||||
VACCTIMESI2(UChi_12,result_32,Z5)\
|
||||
);
|
||||
|
||||
#define ZM_RECON_ACCUM __asm__ ( \
|
||||
VACCTIMESI0(UChi_00,result_20,Z0)\
|
||||
VACCTIMESMINUSI0(UChi_10,result_30,Z3)\
|
||||
VACCTIMESI0(UChi_01,result_21,Z1)\
|
||||
VACCTIMESMINUSI0(UChi_11,result_31,Z4)\
|
||||
VACCTIMESI0(UChi_02,result_22,Z2)\
|
||||
VACCTIMESMINUSI0(UChi_12,result_32,Z5)\
|
||||
VADD(UChi_00,result_00,result_00)\
|
||||
VADD(UChi_10,result_10,result_10)\
|
||||
VADD(UChi_01,result_01,result_01)\
|
||||
VADD(UChi_11,result_11,result_11)\
|
||||
VADD(UChi_02,result_02,result_02)\
|
||||
VADD(UChi_12,result_12,result_12)\
|
||||
VACCTIMESI1(UChi_00,result_20,Z0)\
|
||||
VACCTIMESMINUSI1(UChi_10,result_30,Z3)\
|
||||
VACCTIMESI1(UChi_01,result_21,Z1)\
|
||||
VACCTIMESMINUSI1(UChi_11,result_31,Z4)\
|
||||
VACCTIMESI1(UChi_02,result_22,Z2)\
|
||||
VACCTIMESMINUSI1(UChi_12,result_32,Z5)\
|
||||
VACCTIMESI2(UChi_00,result_20,Z0)\
|
||||
VACCTIMESI2(UChi_01,result_21,Z1)\
|
||||
VACCTIMESI2(UChi_02,result_22,Z2)\
|
||||
VACCTIMESMINUSI2(UChi_10,result_30,Z3)\
|
||||
VACCTIMESMINUSI2(UChi_11,result_31,Z4)\
|
||||
VACCTIMESMINUSI2(UChi_12,result_32,Z5)\
|
||||
);
|
||||
|
||||
#define TP_RECON_ACCUM __asm__ ( \
|
||||
VADD(UChi_00,result_00,result_00)\
|
||||
VADD(UChi_10,result_10,result_10)\
|
||||
VADD(UChi_01,result_01,result_01)\
|
||||
VADD(UChi_11,result_11,result_11)\
|
||||
VADD(UChi_02,result_02,result_02)\
|
||||
VADD(UChi_12,result_12,result_12)\
|
||||
VADD(UChi_00,result_20,result_20)\
|
||||
VADD(UChi_10,result_30,result_30)\
|
||||
VADD(UChi_01,result_21,result_21)\
|
||||
VADD(UChi_11,result_31,result_31)\
|
||||
VADD(UChi_02,result_22,result_22)\
|
||||
VADD(UChi_12,result_32,result_32) );
|
||||
|
||||
#define TM_RECON_ACCUM __asm__ ( \
|
||||
VADD(UChi_00,result_00,result_00)\
|
||||
VADD(UChi_10,result_10,result_10)\
|
||||
VADD(UChi_01,result_01,result_01)\
|
||||
VADD(UChi_11,result_11,result_11)\
|
||||
VADD(UChi_02,result_02,result_02)\
|
||||
VADD(UChi_12,result_12,result_12)\
|
||||
VSUB(UChi_00,result_20,result_20)\
|
||||
VSUB(UChi_10,result_30,result_30)\
|
||||
VSUB(UChi_01,result_21,result_21)\
|
||||
VSUB(UChi_11,result_31,result_31)\
|
||||
VSUB(UChi_02,result_22,result_22)\
|
||||
VSUB(UChi_12,result_32,result_32) );
|
||||
|
||||
#define PREFETCH_CHIMU(A) \
|
||||
LOAD64(%r9,A) \
|
||||
__asm__ ( \
|
||||
VPREFETCHG(12,%r9)\
|
||||
VPREFETCHG(13,%r9)\
|
||||
VPREFETCHG(14,%r9)\
|
||||
VPREFETCHG(15,%r9)\
|
||||
VPREFETCHG(16,%r9)\
|
||||
VPREFETCHG(17,%r9)\
|
||||
VPREFETCHG(18,%r9)\
|
||||
VPREFETCHG(19,%r9)\
|
||||
VPREFETCHG(20,%r9)\
|
||||
VPREFETCHG(21,%r9)\
|
||||
VPREFETCHG(22,%r9)\
|
||||
VPREFETCHG(23,%r9));
|
||||
|
||||
#define PERMUTE_DIR0 __asm__ ( \
|
||||
VPERM0(Chi_00,Chi_00) \
|
||||
VPERM0(Chi_01,Chi_01) \
|
||||
VPERM0(Chi_02,Chi_02) \
|
||||
VPERM0(Chi_10,Chi_10) \
|
||||
VPERM0(Chi_11,Chi_11) \
|
||||
VPERM0(Chi_12,Chi_12) );
|
||||
|
||||
#define PERMUTE_DIR1 __asm__ ( \
|
||||
VPERM1(Chi_00,Chi_00) \
|
||||
VPERM1(Chi_01,Chi_01) \
|
||||
VPERM1(Chi_02,Chi_02) \
|
||||
VPERM1(Chi_10,Chi_10) \
|
||||
VPERM1(Chi_11,Chi_11) \
|
||||
VPERM1(Chi_12,Chi_12));
|
||||
|
||||
#define PERMUTE_DIR2 __asm__ ( \
|
||||
VPERM2(Chi_00,Chi_00) \
|
||||
VPERM2(Chi_01,Chi_01) \
|
||||
VPERM2(Chi_02,Chi_02) \
|
||||
VPERM2(Chi_10,Chi_10) \
|
||||
VPERM2(Chi_11,Chi_11) \
|
||||
VPERM2(Chi_12,Chi_12) );
|
||||
|
||||
#define PERMUTE_DIR3 __asm__ ( \
|
||||
VPERM3(Chi_00,Chi_00) \
|
||||
VPERM3(Chi_01,Chi_01) \
|
||||
VPERM3(Chi_02,Chi_02) \
|
||||
VPERM3(Chi_10,Chi_10) \
|
||||
VPERM3(Chi_11,Chi_11) \
|
||||
VPERM3(Chi_12,Chi_12) );
|
||||
|
||||
|
||||
#define MULT_ADDSUB_2SPIN(ptr,pf) \
|
||||
LOAD64(%r8,ptr) \
|
||||
LOAD64(%r9,pf) \
|
||||
__asm__ ( \
|
||||
VPREFETCH2(9,%r8) \
|
||||
VPREFETCH2(10,%r8) \
|
||||
VPREFETCH2(11,%r8) \
|
||||
VPREFETCH2(12,%r8) \
|
||||
VPREFETCH2(13,%r8) \
|
||||
VPREFETCH2(14,%r8) \
|
||||
VPREFETCH2(15,%r8) \
|
||||
VPREFETCH2(16,%r8) \
|
||||
VPREFETCH2(17,%r8) \
|
||||
VSHUF(Chi_00,T1) \
|
||||
VMOVIDUP(0,%r8,Z0 ) \
|
||||
VMOVIDUP(3,%r8,Z1 ) \
|
||||
VMOVIDUP(6,%r8,Z2 ) VSHUF(Chi_10,T2) \
|
||||
/*6*/ \
|
||||
VMUL(Z0,T1,UChi_00) VMOVRDUP(0,%r8,Z3 ) \
|
||||
VMUL(Z0,T2,UChi_10) VMOVRDUP(3,%r8,Z4 ) \
|
||||
VMUL(Z1,T1,UChi_01) VMOVRDUP(6,%r8,Z5 ) \
|
||||
VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0 ) \
|
||||
VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1 ) \
|
||||
VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2 ) \
|
||||
VPREFETCHG(0,%r9) \
|
||||
VPREFETCHG(1,%r9) \
|
||||
VPREFETCHG(2,%r9) \
|
||||
VPREFETCHG(3,%r9) \
|
||||
/*18*/ \
|
||||
VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \
|
||||
VMADDSUB(Z3,Chi_10,UChi_10) \
|
||||
VMADDSUB(Z4,Chi_00,UChi_01) VMOVRDUP(1,%r8,Z3 ) \
|
||||
VMADDSUB(Z4,Chi_10,UChi_11) VSHUF(Chi_11,T2) \
|
||||
VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 ) \
|
||||
VMADDSUB(Z5,Chi_10,UChi_12) \
|
||||
VPREFETCHG(4,%r9) \
|
||||
VPREFETCHG(5,%r9) \
|
||||
VPREFETCHG(6,%r9) \
|
||||
VPREFETCHG(7,%r9) \
|
||||
/*28*/ \
|
||||
VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 ) \
|
||||
VMADDSUB(Z0,T2,UChi_10) \
|
||||
VMADDSUB(Z1,T1,UChi_01) VMOVIDUP(2,%r8,Z0 ) \
|
||||
VMADDSUB(Z1,T2,UChi_11) \
|
||||
VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1 ) \
|
||||
VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2 ) \
|
||||
VPREFETCH2(12,%r9) \
|
||||
VPREFETCH2(13,%r9) \
|
||||
VPREFETCH2(14,%r9) \
|
||||
VPREFETCH2(15,%r9) \
|
||||
VPREFETCH2(16,%r9) \
|
||||
VPREFETCH2(17,%r9) \
|
||||
VPREFETCH2(18,%r9) \
|
||||
VPREFETCH2(19,%r9) \
|
||||
VPREFETCH2(20,%r9) \
|
||||
VPREFETCH2(21,%r9) \
|
||||
VPREFETCH2(22,%r9) \
|
||||
VPREFETCH2(23,%r9) \
|
||||
/*38*/ \
|
||||
VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1) \
|
||||
VMADDSUB(Z3,Chi_11,UChi_10) \
|
||||
VMADDSUB(Z4,Chi_01,UChi_01) VMOVRDUP(2,%r8,Z3 ) \
|
||||
VMADDSUB(Z4,Chi_11,UChi_11) VSHUF(Chi_12,T2) \
|
||||
VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \
|
||||
VMADDSUB(Z5,Chi_11,UChi_12) \
|
||||
VPREFETCHG(9,%r8) \
|
||||
VPREFETCHG(10,%r8) \
|
||||
VPREFETCHG(11,%r8) \
|
||||
VPREFETCHG(12,%r8) \
|
||||
VPREFETCHG(13,%r8) \
|
||||
VPREFETCHG(14,%r8) \
|
||||
VPREFETCHG(15,%r8) \
|
||||
VPREFETCHG(16,%r8) \
|
||||
VPREFETCHG(17,%r8) \
|
||||
/*48*/ \
|
||||
VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \
|
||||
VMADDSUB(Z0,T2,UChi_10) \
|
||||
VMADDSUB(Z1,T1,UChi_01) \
|
||||
VMADDSUB(Z1,T2,UChi_11) \
|
||||
VMADDSUB(Z2,T1,UChi_02) \
|
||||
VMADDSUB(Z2,T2,UChi_12) \
|
||||
VPREFETCHG(8,%r9) \
|
||||
VPREFETCHG(9,%r9) \
|
||||
VPREFETCHG(10,%r9) \
|
||||
VPREFETCHG(11,%r9) \
|
||||
/*55*/ \
|
||||
VMADDSUB(Z3,Chi_02,UChi_00) \
|
||||
VMADDSUB(Z3,Chi_12,UChi_10) \
|
||||
VMADDSUB(Z4,Chi_02,UChi_01) \
|
||||
VMADDSUB(Z4,Chi_12,UChi_11) \
|
||||
VMADDSUB(Z5,Chi_02,UChi_02) \
|
||||
VMADDSUB(Z5,Chi_12,UChi_12) \
|
||||
/*61 insns*/ );
|
||||
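What the 61-instruction sequence above computes is, per site and direction, one 3x3 colour (link) matrix applied to both spin rows of the half spinor; the VMUL/VMADDSUB chains build the complex sums from the separate real-dup and imag-dup loads. A scalar reference of that net effect (illustrative only; the matrix would come from U._odata[sU], and whether it is the link or its adjoint depends on the direction):

#include <array>
#include <complex>
#include <iostream>

using cplx = std::complex<double>;

int main() {
  std::array<std::array<cplx,3>,3> U{};                      // colour matrix
  U[0][0] = {1,0}; U[1][1] = {1,0}; U[2][2] = {1,0};         // identity link for the demo
  std::array<cplx,3> Chi0 = {cplx(1,1), cplx(2,0), cplx(0,3)};   // Chi_00..Chi_02
  std::array<cplx,3> Chi1 = {cplx(0,1), cplx(1,2), cplx(3,0)};   // Chi_10..Chi_12
  std::array<cplx,3> UChi0{}, UChi1{};
  for (int j=0; j<3; ++j) {
    for (int k=0; k<3; ++k) {
      UChi0[j] += U[j][k]*Chi0[k];   // the VMUL/VMADDSUB chains accumulate exactly these sums
      UChi1[j] += U[j][k]*Chi1[k];
    }
  }
  std::cout << UChi0[0] << " " << UChi1[0] << "\n";
}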
|
||||
|
||||
#define MULT_ADDSUB_2SPIN_LS(ptr,pf) \
|
||||
LOAD64(%r8,ptr) \
|
||||
LOAD64(%r9,pf) \
|
||||
__asm__ ( \
|
||||
VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \
|
||||
VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \
|
||||
VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \
|
||||
VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \
|
||||
VPREFETCHG(0,%r9) \
|
||||
VPREFETCHG(1,%r9) \
|
||||
VPREFETCHG(2,%r9) \
|
||||
VPREFETCHG(3,%r9) \
|
||||
/*8*/ \
|
||||
VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \
|
||||
VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
|
||||
VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
|
||||
VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
|
||||
VPREFETCHG(4,%r9) \
|
||||
VPREFETCHG(5,%r9) \
|
||||
VPREFETCHG(6,%r9) \
|
||||
VPREFETCHG(7,%r9) \
|
||||
/*16*/ \
|
||||
VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \
|
||||
VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \
|
||||
VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \
|
||||
VPREFETCHG(8,%r9) \
|
||||
VPREFETCHG(9,%r9) \
|
||||
VPREFETCHG(10,%r9) \
|
||||
VPREFETCHG(11,%r9) \
|
||||
/*22*/ \
|
||||
VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \
|
||||
VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
|
||||
VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
|
||||
VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
|
||||
VPREFETCH2(12,%r9) \
|
||||
VPREFETCH2(13,%r9) \
|
||||
VPREFETCH2(14,%r9) \
|
||||
VPREFETCH2(15,%r9) \
|
||||
/*30*/ \
|
||||
VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \
|
||||
VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \
|
||||
VPREFETCH2(16,%r9) \
|
||||
VPREFETCH2(17,%r9) \
|
||||
VPREFETCH2(18,%r9) \
|
||||
VPREFETCH2(19,%r9) \
|
||||
VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \
|
||||
/*36*/ \
|
||||
VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
|
||||
VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
|
||||
VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
|
||||
VPREFETCH2(20,%r9) \
|
||||
VPREFETCH2(21,%r9) \
|
||||
VPREFETCH2(22,%r9) \
|
||||
VPREFETCH2(23,%r9) \
|
||||
VPREFETCHG(2,%r8) \
|
||||
VPREFETCHG(3,%r8) \
|
||||
VPREFETCH2(4,%r8) \
|
||||
VPREFETCH2(5,%r8) \
|
||||
/*42 insns*/ );
|
||||
|
||||
#define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) \
|
||||
LOAD64(%r8,ptr) \
|
||||
LOAD64(%r9,pf) \
|
||||
__asm__ ( \
|
||||
VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \
|
||||
VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \
|
||||
VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \
|
||||
VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \
|
||||
/*8*/ \
|
||||
VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \
|
||||
VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
|
||||
VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
|
||||
VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
|
||||
/*16*/ \
|
||||
VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \
|
||||
VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \
|
||||
VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \
|
||||
/*22*/ \
|
||||
VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \
|
||||
VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
|
||||
VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
|
||||
VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
|
||||
/*30*/ \
|
||||
VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \
|
||||
VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \
|
||||
VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \
|
||||
/*36*/ \
|
||||
VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
|
||||
VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
|
||||
VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
|
||||
/* VPREFETCHG(2,%r8)*/ \
|
||||
/* VPREFETCHG(3,%r8)*/ \
|
||||
/*42 insns*/ );
|
||||
|
||||
|
||||
#define Z6 Chi_00
|
||||
#define MULT_ADDSUB_2SPIN_NEW(ptr,pf) \
|
||||
LOAD64(%r8,ptr) \
|
||||
__asm__ ( \
|
||||
VSHUFMEM(0,%r8,Z0) \
|
||||
VRDUP(Chi_00,T1) VIDUP(Chi_00,Chi_00) \
|
||||
VRDUP(Chi_10,T2) VIDUP(Chi_10,Chi_10) \
|
||||
VMUL(Z0,Chi_00,Z1) VMUL(Z0,Chi_10,Z2) \
|
||||
VSHUFMEM(3,%r8,Z0) \
|
||||
VMUL(Z0,Chi_00,Z3) VMUL(Z0,Chi_10,Z4) \
|
||||
VSHUFMEM(6,%r8,Z0) \
|
||||
VMUL(Z0,Chi_00,Z5) VMUL(Z0,Chi_10,Z6) \
|
||||
VMULMEM(0,%r8,T1,UChi_00) VMULMEM(0,%r8,T2,UChi_10) \
|
||||
VMULMEM(3,%r8,T1,UChi_01) VMULMEM(3,%r8,T2,UChi_11) \
|
||||
VMULMEM(6,%r8,T1,UChi_02) VMULMEM(6,%r8,T2,UChi_12) \
|
||||
/*11 cycles*/ \
|
||||
VSHUFMEM(1,%r8,Z0) \
|
||||
VRDUP(Chi_01,T1) VIDUP(Chi_01,Chi_01) \
|
||||
VRDUP(Chi_11,T2) VIDUP(Chi_11,Chi_11) \
|
||||
VMADD(Z0,Chi_01,Z1) VMADD(Z0,Chi_11,Z2) \
|
||||
VSHUFMEM(4,%r8,Z0) \
|
||||
VMADD(Z0,Chi_01,Z3) VMADD(Z0,Chi_11,Z4) \
|
||||
VSHUFMEM(7,%r8,Z0) \
|
||||
VMADD(Z0,Chi_01,Z5) VMADD(Z0,Chi_11,Z6) \
|
||||
VMADDMEM(1,%r8,T1,UChi_00) VMADDMEM(1,%r8,T2,UChi_10) \
|
||||
VMADDMEM(4,%r8,T1,UChi_01) VMADDMEM(4,%r8,T2,UChi_11) \
|
||||
VMADDMEM(7,%r8,T1,UChi_02) VMADDMEM(7,%r8,T2,UChi_12) \
|
||||
/*22 cycles*/ \
|
||||
VSHUFMEM(2,%r8,Z0) \
|
||||
VRDUP(Chi_02,T1) VIDUP(Chi_02,Chi_02) \
|
||||
VRDUP(Chi_12,T2) VIDUP(Chi_12,Chi_12) \
|
||||
VMADD(Z0,Chi_02,Z1) VMADD(Z0,Chi_12,Z2) \
|
||||
VSHUFMEM(5,%r8,Z0) \
|
||||
VMADD(Z0,Chi_02,Z3) VMADD(Z0,Chi_12,Z4) \
|
||||
VSHUFMEM(8,%r8,Z0) \
|
||||
VMADD(Z0,Chi_02,Z5) VMADD(Z0,Chi_12,Z6) \
|
||||
/*33 cycles*/ \
|
||||
VMADDSUBMEM(2,%r8,T1,Z1) VMADDSUBMEM(2,%r8,T2,Z2) \
|
||||
VMADDSUBMEM(5,%r8,T1,Z3) VMADDSUBMEM(5,%r8,T2,Z4) \
|
||||
VMADDSUBMEM(8,%r8,T1,Z5) VMADDSUBMEM(8,%r8,T2,Z6) \
|
||||
/*stall*/ \
|
||||
/*stall*/ \
|
||||
/*stall*/ \
|
||||
VADD(Z1,UChi_00,UChi_00) VADD(Z2,UChi_10,UChi_10) \
|
||||
VADD(Z3,UChi_01,UChi_01) VADD(Z4,UChi_11,UChi_11) \
|
||||
VADD(Z5,UChi_02,UChi_02) VADD(Z6,UChi_12,UChi_12) )
|
||||
|
||||
|
||||
#endif