1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-12 20:27:06 +01:00

Implemented all SSE4 functions.

A test code Grid_simd_new.cc has been created to test the new class.
Tests are all OK.
This commit is contained in:
neo
2015-05-20 17:22:40 +09:00
parent b29caead32
commit 3a3f54932a
5 changed files with 458 additions and 92 deletions

View File

@ -1,8 +1,10 @@
//----------------------------------------------------------------------
/*! @file Grid_sse4.h
@brief Optimization libraries
@brief Optimization libraries for SSE4 instructions set
Using intrinsics
*/
// Time-stamp: <2015-05-19 17:06:51 neo>
// Time-stamp: <2015-05-20 16:45:39 neo>
//----------------------------------------------------------------------
#include <pmmintrin.h>
@ -49,6 +51,20 @@ namespace Optimization {
};
struct Vstream{
//Float
inline void operator()(__m128 a, __m128 b){
_mm_stream_ps((float *)&a,b);
}
//Double
inline void operator()(__m128d a, __m128d b){
_mm_stream_pd((double *)&a,b);
}
};
struct Vset{
// Complex float
@ -75,27 +91,20 @@ namespace Optimization {
};
template <typename Out_type, typename In_type>
struct Reduce{
//Complex float
inline Grid::ComplexF operator()(__m128 in){
union {
__m128 v1;
float f[4];
} u128;
u128.v1 = _mm_add_ps(in, _mm_shuffle_ps(in,in, 0b01001110)); // FIXME Prefer to use _MM_SHUFFLE macros
return Grid::ComplexF(u128.f[0], u128.f[1]);
//Need templated class to overload output type
//General form must generate error if compiled
inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n");
exit(1);
return 0;
}
//Complex double
inline Grid::ComplexD operator()(__m128d in){
printf("Missing complex double implementation -> FIX\n");
return Grid::ComplexD(0,0); // FIXME wrong
}
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
@ -129,25 +138,26 @@ namespace Optimization {
}
};
struct MultComplex{
// Complex float
inline __m128 operator()(__m128 a, __m128 b){
__m128 ymm0,ymm1,ymm2;
ymm0 = _mm_shuffle_ps(a,a,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
ymm0 = _mm_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
ymm0 = _mm_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
ymm1 = _mm_shuffle_ps(b,b,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
ymm2 = _mm_shuffle_ps(a,a,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
ymm1 = _mm_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
ymm1 = _mm_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
return _mm_addsub_ps(ymm0,ymm1);
}
// Complex double
inline __m128d operator()(__m128d a, __m128d b){
__m128d ymm0,ymm1,ymm2;
ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar,
ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar,
ymm0 = _mm_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
ymm1 = _mm_shuffle_pd(b,b,0x1); // ymm1 <- br,bi b01
ymm2 = _mm_shuffle_pd(a,a,0x3); // ymm2 <- ai,ai b11
ymm1 = _mm_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi
ymm1 = _mm_shuffle_pd(b,b,0x1); // ymm1 <- br,bi b01
ymm2 = _mm_shuffle_pd(a,a,0x3); // ymm2 <- ai,ai b11
ymm1 = _mm_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi
return _mm_addsub_pd(ymm0,ymm1);
}
};
@ -165,14 +175,112 @@ namespace Optimization {
inline __m128i operator()(__m128i a, __m128i b){
return _mm_mul_epi32(a,b);
}
};
struct Conj{
// Complex single
inline __m128 operator()(__m128 in){
return _mm_xor_ps(_mm_addsub_ps(_mm_setzero_ps(),in), _mm_set1_ps(-0.f));
}
// Complex double
inline __m128d operator()(__m128d in){
return _mm_xor_pd(_mm_addsub_pd(_mm_setzero_pd(),in), _mm_set1_pd(-0.f));//untested
}
// do not define for integer input
};
struct TimesMinusI{
//Complex single
inline __m128 operator()(__m128 in, __m128 ret){
__m128 tmp =_mm_addsub_ps(_mm_setzero_ps(),in); // r,-i
return _mm_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
}
//Complex double
inline __m128d operator()(__m128d in, __m128d ret){
__m128d tmp =_mm_addsub_pd(_mm_setzero_pd(),in); // r,-i
return _mm_shuffle_pd(tmp,tmp,0x1);
}
};
struct TimesI{
//Complex single
inline __m128 operator()(__m128 in, __m128 ret){
__m128 tmp =_mm_shuffle_ps(in,in,_MM_SHUFFLE(2,3,0,1));
return _mm_addsub_ps(_mm_setzero_ps(),tmp); // r,-i
}
//Complex double
inline __m128d operator()(__m128d in, __m128d ret){
__m128d tmp = _mm_shuffle_pd(in,in,0x1);
return _mm_addsub_pd(_mm_setzero_pd(),tmp); // r,-i
}
};
//////////////////////////////////////////////
// Some Template specialization
//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, __m128>::operator()(__m128 in){
union {
__m128 v1;
float f[4];
} u128;
u128.v1 = _mm_add_ps(in, _mm_shuffle_ps(in,in, 0b01001110)); // FIXME Prefer to use _MM_SHUFFLE macros
return Grid::ComplexF(u128.f[0], u128.f[1]);
}
//Real float Reduce
template<>
inline Grid::RealF Reduce<Grid::RealF, __m128>::operator()(__m128 in){
// FIXME Hack
const Grid::RealF * ptr = (const Grid::RealF *) &in;
Grid::RealF ret = 0;
for(int i=0;i< 4 ;i++){ // 4 number of simd lanes for float
ret = ret+ptr[i];
}
return ret;
}
//Complex double Reduce
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, __m128d>::operator()(__m128d in){
printf("Reduce : Missing good complex double implementation -> FIX\n");
return Grid::ComplexD(in[0], in[1]); // inefficient
}
//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, __m128d>::operator()(__m128d in){
// FIXME Hack
const Grid::RealD * ptr =(const Grid::RealD *) &in;
Grid::RealD ret = 0;
for(int i=0;i< 2 ;i++){// 2 number of simd lanes for float
ret = ret+ptr[i];
}
return ret;
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, __m128i>::operator()(__m128i in){
// FIXME unimplemented
printf("Reduce : Missing integer implementation -> FIX\n");
assert(0);
}
}
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
namespace Grid {
typedef __m128 SIMD_Ftype; // Single precision type
@ -180,15 +288,21 @@ namespace Grid {
typedef __m128i SIMD_Itype; // Integer type
// Function names
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
typedef Optimization::Vset VsetSIMD;
typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::Vset VsetSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
}

View File

@ -2,7 +2,7 @@
/*! @file Grid_vector_types.h
@brief Defines templated class Grid_simd to deal with inner vector types
*/
// Time-stamp: <2015-05-19 17:20:36 neo>
// Time-stamp: <2015-05-20 17:21:52 neo>
//---------------------------------------------------------------------------
#ifndef GRID_VECTOR_TYPES
#define GRID_VECTOR_TYPES
@ -22,6 +22,16 @@ namespace Grid {
typedef T type;
};
// type alias used to simplify the syntax of std::enable_if
template <typename T> using Invoke =
typename T::type;
template <typename Condition, typename ReturnType> using EnableIf =
Invoke<std::enable_if<Condition::value, ReturnType>>;
template <typename Condition, typename ReturnType> using NotEnableIf =
Invoke<std::enable_if<!Condition::value, ReturnType>>;
////////////////////////////////////////////////////////
// Check for complexity with type traits
template <typename T>
@ -93,31 +103,32 @@ namespace Grid {
// Initialise to 1,0,i for the correct types
///////////////////////////////////////////////
// if not complex overload here
template < class S = Scalar_type,typename std::enable_if < !is_complex < S >::value, int >::type = 0 >
template < class S = Scalar_type, NotEnableIf<is_complex < S >,int> = 0 >
friend inline void vone(Grid_simd &ret) { vsplat(ret,1.0); }
template < class S = Scalar_type,typename std::enable_if < !is_complex < S >::value, int >::type = 0 >
template < class S = Scalar_type, NotEnableIf<is_complex < S >,int> = 0 >
friend inline void vzero(Grid_simd &ret) { vsplat(ret,0.0); }
// overload for complex type
template < class S = Scalar_type,typename std::enable_if < is_complex < S >::value, int >::type = 0 >
// For complex types
template < class S = Scalar_type, EnableIf<is_complex < S >, int> = 0 >
friend inline void vone(Grid_simd &ret) { vsplat(ret,1.0,0.0); }
template < class S = Scalar_type,typename std::enable_if < is_complex < S >::value, int >::type = 0 >
template < class S = Scalar_type, EnableIf<is_complex < S >, int> = 0 >
friend inline void vzero(Grid_simd &ret) { vsplat(ret,0.0,0.0); }// use xor?
// For integral type
template < class S = Scalar_type,typename std::enable_if < std::is_integral < S >::value, int >::type = 0 >
template < class S = Scalar_type, EnableIf<is_complex < S >, int> = 0 >
friend inline void vcomplex_i(Grid_simd &ret){ vsplat(ret,0.0,1.0);}
// For integral types
template < class S = Scalar_type, EnableIf<std::is_integral < S >, int> = 0 >
friend inline void vone(Grid_simd &ret) { vsplat(ret,1); }
template < class S = Scalar_type,typename std::enable_if < std::is_integral < S >::value, int >::type = 0 >
template < class S = Scalar_type, EnableIf<std::is_integral < S >, int> = 0 >
friend inline void vzero(Grid_simd &ret) { vsplat(ret,0); }
template < class S = Scalar_type,typename std::enable_if < std::is_integral < S >::value, int >::type = 0 >
template < class S = Scalar_type, EnableIf<std::is_integral < S >, int> = 0 >
friend inline void vtrue (Grid_simd &ret){vsplat(ret,0xFFFFFFFF);}
template < class S = Scalar_type,typename std::enable_if < std::is_integral < S >::value, int >::type = 0 >
template < class S = Scalar_type, EnableIf<std::is_integral < S >, int> = 0 >
friend inline void vfalse(vInteger &ret){vsplat(ret,0);}
// do not compile if real or integer, send an error message from the compiler
template < class S = Scalar_type,typename std::enable_if < is_complex < S >::value, int >::type = 0 >
friend inline void vcomplex_i(Grid_simd &ret){ vsplat(ret,0.0,1.0);}
////////////////////////////////////
// Arithmetic operator overloads +,-,*
@ -137,7 +148,7 @@ namespace Grid {
};
// Distinguish between complex types and others
template < class S = Scalar_type, typename std::enable_if < is_complex < S >::value, int >::type = 0 >
template < class S = Scalar_type, EnableIf<is_complex < S >, int> = 0 >
friend inline Grid_simd operator * (Grid_simd a, Grid_simd b)
{
Grid_simd ret;
@ -146,7 +157,7 @@ namespace Grid {
};
// Real/Integer types
template < class S = Scalar_type,typename std::enable_if < !is_complex < S >::value, int >::type = 0 >
template < class S = Scalar_type, NotEnableIf<is_complex < S >, int> = 0 >
friend inline Grid_simd operator * (Grid_simd a, Grid_simd b)
{
Grid_simd ret;
@ -155,8 +166,6 @@ namespace Grid {
};
////////////////////////////////////////////////////////////////////////
// FIXME: gonna remove these load/store, get, set, prefetch
////////////////////////////////////////////////////////////////////////
@ -169,14 +178,14 @@ namespace Grid {
///////////////////////
// overload if complex
template < class S = Scalar_type >
friend inline void vsplat(Grid_simd &ret, typename std::enable_if< is_complex < S >::value, S>::type c){
friend inline void vsplat(Grid_simd &ret, EnableIf<is_complex < S >, S> c){
Real a = real(c);
Real b = imag(c);
vsplat(ret,a,b);
}
// this only for the complex version
template < class S = Scalar_type, typename std::enable_if < is_complex < S >::value, int >::type = 0 >
// this is only for the complex version
template < class S = Scalar_type, EnableIf<is_complex < S >, int> = 0 >
friend inline void vsplat(Grid_simd &ret,Real a, Real b){
ret.v = binary<Vector_type>(a, b, VsplatSIMD());
}
@ -186,22 +195,45 @@ namespace Grid {
ret.v = unary<Vector_type>(a, VsplatSIMD());
}
///////////////////////
// Vstore
///////////////////////
friend inline void vstore(const Grid_simd &ret, Scalar_type *a){
binary<void>(ret.v, (Real*)a, VstoreSIMD());
}
///////////////////////
// Vstream
///////////////////////
friend inline void vstream(Grid_simd &out,const Grid_simd &in){
binary<void>(out.v, in.v, VstreamSIMD());
}
template < class S = Scalar_type, EnableIf<std::is_integral < S >, int> = 0 >
friend inline void vstream(Grid_simd &out,const Grid_simd &in){
out=in;
}
///////////////////////
// Vprefetch
///////////////////////
friend inline void vprefetch(const Grid_simd &v)
{
_mm_prefetch((const char*)&v.v,_MM_HINT_T0);
}
///////////////////////
// Reduce
///////////////////////
friend inline Scalar_type Reduce(const Grid_simd & in)
{
// FIXME add operator
return unary<Scalar_type>(in.v, ReduceSIMD<Scalar_type, Vector_type>());
}
////////////////////////////
// opreator scalar * simd
////////////////////////////
friend inline Grid_simd operator * (const Scalar_type &a, Grid_simd b){
Grid_simd va;
vsplat(va,a);
@ -214,25 +246,63 @@ namespace Grid {
///////////////////////
// Conjugate
///////////////////////
template < class S = Scalar_type, EnableIf<is_complex < S >, int> = 0 >
friend inline Grid_simd conj(const Grid_simd &in){
Grid_simd ret ; vzero(ret);
// FIXME add operator
Grid_simd ret ;
ret.v = unary<Vector_type>(in.v, ConjSIMD());
return ret;
}
template < class S = Scalar_type, NotEnableIf<is_complex < S >, int> = 0 >
friend inline Grid_simd conj(const Grid_simd &in){
return in; // for real objects
}
///////////////////////
// timesMinusI
///////////////////////
template < class S = Scalar_type, EnableIf<is_complex < S >, int> = 0 >
friend inline void timesMinusI( Grid_simd &ret,const Grid_simd &in){
ret.v = binary<Vector_type>(in.v, ret.v, TimesMinusISIMD());
}
template < class S = Scalar_type, EnableIf<is_complex < S >, int> = 0 >
friend inline Grid_simd timesMinusI(const Grid_simd &in){
Grid_simd ret;
vzero(ret);
// FIXME add operator
timesMinusI(ret,in);
return ret;
}
friend inline Grid_simd timesI(const Grid_simd &in){
Grid_simd ret; vzero(ret);
// FIXME add operator
return ret;
template < class S = Scalar_type, NotEnableIf<is_complex < S >, int> = 0 >
friend inline Grid_simd timesMinusI(const Grid_simd &in){
return in;
}
///////////////////////
// timesI
///////////////////////
template < class S = Scalar_type, EnableIf<is_complex < S >, int> = 0 >
friend inline void timesI(Grid_simd &ret,const Grid_simd &in){
ret.v = binary<Vector_type>(in.v, ret.v, TimesISIMD());
}
template < class S = Scalar_type, EnableIf<is_complex < S >, int> = 0 >
friend inline Grid_simd timesI(const Grid_simd &in){
Grid_simd ret;
timesI(ret,in);
return ret;
}
template < class S = Scalar_type, NotEnableIf<is_complex < S >, int> = 0 >
friend inline Grid_simd timesI(const Grid_simd &in){
return in;
}
///////////////////////
// Unary negation
///////////////////////
friend inline Grid_simd operator -(const Grid_simd &r) {
vComplexF ret;
vzero(ret);
@ -256,41 +326,22 @@ namespace Grid {
friend inline void permute(Grid_simd &y,Grid_simd b,int perm)
{
Gpermute<Grid_simd>(y,b,perm);
}
/*
////////////////////////////////////////////////////////////////////
// General permute; assumes vector length is same across
// all subtypes; may not be a good assumption, but could
// add the vector width as a template param for BG/Q for example
////////////////////////////////////////////////////////////////////
friend inline void permute(Grid_simd &y,Grid_simd b,int perm)
{
Gpermute<Grid_simd>(y,b,perm);
}
friend inline void merge(Grid_simd &y,std::vector<Scalar_type *> &extracted)
{
Gmerge<Grid_simd,Scalar_type >(y,extracted);
}
friend inline void extract(const Grid_simd &y,std::vector<Scalar_type *> &extracted)
{
Gextract<Grid_simd,Scalar_type>(y,extracted);
}
friend inline void merge(Grid_simd &y,std::vector<Scalar_type > &extracted)
{
Gmerge<Grid_simd,Scalar_type >(y,extracted);
}
friend inline void extract(const Grid_simd &y,std::vector<Scalar_type > &extracted)
{
Gextract<Grid_simd,Scalar_type>(y,extracted);
}
*/
};// end of Grid_simd class definition
template<class scalar_type, class vector_type >
inline Grid_simd< scalar_type, vector_type> innerProduct(const Grid_simd< scalar_type, vector_type> & l, const Grid_simd< scalar_type, vector_type> & r)
{
@ -314,7 +365,7 @@ namespace Grid {
}
// Define available types (now change names to avoid clashing)
// Define available types (now change names to avoid clashing with the rest of the code)
typedef Grid_simd< float , SIMD_Ftype > MyRealF;
typedef Grid_simd< double , SIMD_Dtype > MyRealD;
@ -323,6 +374,29 @@ namespace Grid {
////////////////////////////////////////////////////////////////////
// Temporary hack to keep independent from the rest of the code
template<> struct isGridTensor<MyRealD > {
static const bool value = false;
static const bool notvalue = true;
};
template<> struct isGridTensor<MyRealF > {
static const bool value = false;
static const bool notvalue = true;
};
template<> struct isGridTensor<MyComplexD > {
static const bool value = false;
static const bool notvalue = true;
};
template<> struct isGridTensor<MyComplexF > {
static const bool value = false;
static const bool notvalue = true;
};
}
#endif