From 933c54d9c419183c934b0b23b819c3a083d8af90 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 16 Apr 2015 14:47:28 +0100 Subject: [PATCH] Improving the trace support to support any index tracing and simplifying implmentation in some ways --- Grid_QCD.h | 10 +- Grid_config.h | 4 +- Grid_main.cc | 21 +++- Grid_math_type_mapper.h | 41 +++++--- Grid_math_types.h | 224 ++++++++++++++++++++++++++++++++++++++-- Grid_simd.h | 24 +++-- Grid_vComplexD.h | 18 ++-- Grid_vComplexF.h | 20 ++-- Grid_vInteger.h | 14 +-- Grid_vRealD.h | 14 +-- Grid_vRealF.h | 14 +-- test_Grid_stencil.cc | 4 +- 12 files changed, 321 insertions(+), 87 deletions(-) diff --git a/Grid_QCD.h b/Grid_QCD.h index 72a8ee67..2f411f16 100644 --- a/Grid_QCD.h +++ b/Grid_QCD.h @@ -7,12 +7,11 @@ namespace QCD { static const int Ns=4; static const int CbRed =0; static const int CbBlack=1; - + // QCD iMatrix types template using iSinglet = iScalar > ; template using iSpinMatrix = iMatrix, Ns>; template using iSpinColourMatrix = iMatrix, Ns>; - template using iColourMatrix = iScalar> ; template using iSpinVector = iVector, Ns>; @@ -20,10 +19,8 @@ namespace QCD { template using iSpinColourVector = iVector, Ns>; typedef iSinglet TComplex; // This is painful. Tensor singlet complex type. - typedef iSinglet vTComplex; - typedef iSinglet TReal; // This is painful. Tensor singlet complex type. - - + typedef iSinglet vTComplex; // what if we don't know the tensor structure + typedef iSinglet TReal; // Shouldn't need these. typedef iSinglet vTInteger; typedef iSpinMatrix SpinMatrix; @@ -44,7 +41,6 @@ namespace QCD { typedef iSpinColourVector vSpinColourVector; typedef Lattice LatticeComplex; - typedef Lattice LatticeInteger; // Predicates for "where" typedef Lattice LatticeColourMatrix; diff --git a/Grid_config.h b/Grid_config.h index bb885708..5faa2157 100644 --- a/Grid_config.h +++ b/Grid_config.h @@ -73,8 +73,8 @@ /* Define to the version of this package. */ #define PACKAGE_VERSION "1.0" -/* SSE2 */ -/* #undef SSE2 */ +/* SSE4 */ +/* #undef SSE4 */ /* Define to 1 if you have the ANSI C header files. */ #define STDC_HEADERS 1 diff --git a/Grid_main.cc b/Grid_main.cc index 8cac55f5..e122fcbe 100644 --- a/Grid_main.cc +++ b/Grid_main.cc @@ -152,6 +152,8 @@ int main (int argc, char ** argv) cMat= adj(cMat); cm=adj(cm); scm=adj(scm); + scm=transpose(scm); + scm=transposeIndex<1>(scm); // Foo = Foo+scalar; // LatticeColourMatrix+Scalar // Foo = Foo*scalar; // LatticeColourMatrix*Scalar @@ -162,6 +164,23 @@ int main (int argc, char ** argv) LatticeComplex trscMat(&Fine); trscMat = trace(scMat); // Trace + + { + TComplex c; + ColourMatrix c_m; + SpinMatrix s_m; + SpinColourMatrix sc_m; + + s_m = traceIndex<1>(sc_m); + c_m = traceIndex<2>(sc_m); + + c = traceIndex<2>(s_m); + c = traceIndex<1>(c_m); + + printf("c. Level %d\n",c_m.TensorLevel); + printf("c. Level %d\n",c_m._internal.TensorLevel); + + } FooBar = Bar; @@ -203,7 +222,7 @@ int main (int argc, char ** argv) peekSite(bar,Bar,coor); for(int r=0;r<3;r++){ for(int c=0;c<3;c++){ - cout<<"bar "< >::scalar_type == ComplexD. +// Use of a helper class like this allows us to template specialise and "dress" +// other classes such as RealD == double, ComplexD == std::complex with these +// traits. +// +// It is possible that we could do this more elegantly if I introduced a +// queryable trait in iScalar, iMatrix and iVector and used the query on vtype in +// place of the type mapper? +// +// Not sure how to do this, but probably could be done with a research effort +// to study C++11's type_traits.h file. (std::enable_if >) +// ////////////////////////////////////////////////////////////////////////////////// template class GridTypeMapper { @@ -10,7 +24,7 @@ namespace Grid { typedef typename T::scalar_type scalar_type; typedef typename T::vector_type vector_type; typedef typename T::tensor_reduced tensor_reduced; - // enum { TensorLevel = T::TensorLevel }; + enum { TensorLevel = T::TensorLevel }; }; ////////////////////////////////////////////////////////////////////////////////// @@ -21,28 +35,28 @@ namespace Grid { typedef RealF scalar_type; typedef RealF vector_type; typedef RealF tensor_reduced ; - // enum { TensorLevel = 0 }; + enum { TensorLevel = 0 }; }; template<> class GridTypeMapper { public: typedef RealD scalar_type; typedef RealD vector_type; typedef RealD tensor_reduced; - // enum { TensorLevel = 0 }; + enum { TensorLevel = 0 }; }; template<> class GridTypeMapper { public: typedef ComplexF scalar_type; typedef ComplexF vector_type; typedef ComplexF tensor_reduced; - // enum { TensorLevel = 0 }; + enum { TensorLevel = 0 }; }; template<> class GridTypeMapper { public: typedef ComplexD scalar_type; typedef ComplexD vector_type; typedef ComplexD tensor_reduced; - // enum { TensorLevel = 0 }; + enum { TensorLevel = 0 }; }; template<> class GridTypeMapper { @@ -50,44 +64,37 @@ namespace Grid { typedef RealF scalar_type; typedef vRealF vector_type; typedef vRealF tensor_reduced; - // enum { TensorLevel = 0 }; + enum { TensorLevel = 0 }; }; template<> class GridTypeMapper { public: typedef RealD scalar_type; typedef vRealD vector_type; typedef vRealD tensor_reduced; - //enum { TensorLevel = 0 }; + enum { TensorLevel = 0 }; }; template<> class GridTypeMapper { public: typedef ComplexF scalar_type; typedef vComplexF vector_type; typedef vComplexF tensor_reduced; - //enum { TensorLevel = 0 }; + enum { TensorLevel = 0 }; }; template<> class GridTypeMapper { public: typedef ComplexD scalar_type; typedef vComplexD vector_type; typedef vComplexD tensor_reduced; - //enum { TensorLevel = 0 }; + enum { TensorLevel = 0 }; }; template<> class GridTypeMapper { public: typedef Integer scalar_type; typedef vInteger vector_type; typedef vInteger tensor_reduced; - //enum { TensorLevel = 0 }; + enum { TensorLevel = 0 }; }; - // Again terminate the recursion. - inline vRealD TensorRemove(vRealD arg){ return arg;} - inline vRealF TensorRemove(vRealF arg){ return arg;} - inline vComplexF TensorRemove(vComplexF arg){ return arg;} - inline vComplexD TensorRemove(vComplexD arg){ return arg;} - inline vInteger TensorRemove(vInteger arg){ return arg;} - } #endif diff --git a/Grid_math_types.h b/Grid_math_types.h index db90bda7..58de9c15 100644 --- a/Grid_math_types.h +++ b/Grid_math_types.h @@ -2,15 +2,76 @@ #define GRID_MATH_TYPES_H #include +#include namespace Grid { + // First some of my own traits + template struct isGridTensor { + static const bool value = true; + static const bool notvalue = false; + }; + + template<> struct isGridTensor { + static const bool value = false; + static const bool notvalue = true; + }; + template<> struct isGridTensor { + static const bool value = false; + static const bool notvalue = true; + }; + template<> struct isGridTensor { + static const bool value = false; + static const bool notvalue = true; + }; + template<> struct isGridTensor { + static const bool value = false; + static const bool notvalue = true; + }; + template<> struct isGridTensor { + static const bool value = false; + static const bool notvalue = true; + }; + template<> struct isGridTensor { + static const bool value = false; + static const bool notvalue = true; + }; + template<> struct isGridTensor { + static const bool value = false; + static const bool notvalue = true; + }; + template<> struct isGridTensor { + static const bool value = false; + static const bool notvalue = true; + }; + template<> struct isGridTensor { + static const bool value = false; + static const bool notvalue = true; + }; + template<> struct isGridTensor { + static const bool value = false; + static const bool notvalue = true; + }; + + // Match the index + template struct matchGridTensorIndex { + static const bool value = (Level==T::TensorLevel); + static const bool notvalue = (Level!=T::TensorLevel); + }; + /////////////////////////////////////////////////// // Scalar, Vector, Matrix objects. // These can be composed to form tensor products of internal indices. /////////////////////////////////////////////////// - + + // Terminates the recursion for temoval of all Grids tensors + inline vRealD TensorRemove(vRealD arg){ return arg;} + inline vRealF TensorRemove(vRealF arg){ return arg;} + inline vComplexF TensorRemove(vComplexF arg){ return arg;} + inline vComplexD TensorRemove(vComplexD arg){ return arg;} + inline vInteger TensorRemove(vInteger arg){ return arg;} + template class iScalar { public: @@ -19,9 +80,12 @@ public: typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; - //enum { TensorLevel = GridTypeMapper::TensorLevel + 1}; typedef iScalar tensor_reduced; + enum { TensorLevel = GridTypeMapper::TensorLevel + 1}; + + // Scalar no action + // template using tensor_reduce_level = typename iScalar::tensor_reduce_level >; iScalar(){}; @@ -79,9 +143,9 @@ public: typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; - // enum { TensorLevel = GridTypeMapper::TensorLevel + 1}; - typedef iScalar tensor_reduced; + enum { TensorLevel = GridTypeMapper::TensorLevel + 1}; + typedef iScalar tensor_reduced; iVector(Zero &z){ *this = zero; }; iVector() {}; @@ -137,8 +201,10 @@ public: typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; + typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; - // enum { TensorLevel = GridTypeMapper::TensorLevel + 1}; + + enum { TensorLevel = GridTypeMapper::TensorLevel + 1}; typedef iScalar tensor_reduced; iMatrix(Zero &z){ *this = zero; }; @@ -1018,6 +1084,137 @@ template inline iMatrix adj(const iMatrix & }} return ret; } +///////////////////////////////////////////////////////////////// +// Transpose +///////////////////////////////////////////////////////////////// +template + inline typename std::enable_if::value, iMatrix >::type + transpose(iMatrix arg) + { + iMatrix ret; + for(int i=0;i + inline typename std::enable_if::notvalue, iMatrix >::type + transpose(iMatrix arg) + { + iMatrix ret; + for(int i=0;i + inline typename std::enable_if::value, iScalar >::type + transpose(iScalar arg) + { + iScalar ret; + ret._internal = transpose(arg._internal); // NB recurses + return ret; + } + +template + inline typename std::enable_if::notvalue, iScalar >::type + transpose(iScalar arg) + { + iScalar ret; + ret._internal = arg._internal; // NB recursion stops + return ret; + } + + + +/* + * No need to implement transpose on the primitive types + * Not sure that this idiom is any more elegant that the trace idiom below however! +inline ComplexD transpose(ComplexD &rhs){ return rhs;} +inline ComplexF transpose(ComplexF &rhs){ return rhs;} +inline RealD transpose(RealD &rhs){ return rhs;} +inline RealF transpose(RealF &rhs){ return rhs;} + */ + +//////////////////////////////////////////////////////////////////////////////////////////// +// Transpose a specific index +//////////////////////////////////////////////////////////////////////////////////////////// +template inline + typename std::enable_if,Level>::value, iMatrix >::type +transposeIndex (const iMatrix &arg) +{ + iMatrix ret; + for(int i=0;i inline +typename std::enable_if,Level>::notvalue, iMatrix >::type +transposeIndex (const iMatrix &arg) +{ + iMatrix ret; + for(int i=0;i(arg._internal[i][j]); + }} + return ret; +} +template inline +typename std::enable_if,Level>::notvalue, iScalar >::type +transposeIndex (const iScalar &arg) +{ + return transposeIndex(arg._internal); +} +//////////////////////////////// +// Trace a specific index +//////////////////////////////// + +template inline ComplexF traceIndex(const ComplexF arg) { return arg;} +template inline ComplexD traceIndex(const ComplexD arg) { return arg;} +template inline RealF traceIndex(const RealF arg) { return arg;} +template inline RealD traceIndex(const RealD arg) { return arg;} + +template inline +auto traceIndex(const iScalar &arg) -> iScalar(arg._internal)) > +{ + iScalar(arg._internal))> ret; + ret._internal = traceIndex(arg._internal); + return ret; +} + +// If we hit the right index, return scalar and trace it with no further recursion +template inline +auto traceIndex(const iMatrix &arg) -> + typename std::enable_if,Level>::value, // Index matches + iScalar >::type // return scalar +{ + iScalar ret; + zeroit(ret._internal); + for(int i=0;i inline +auto traceIndex(const iMatrix &arg) -> + typename std::enable_if,Level>::notvalue,// No index match + iMatrix(arg._internal[0][0])),N> >::type // return matrix +{ + iMatrix(arg._internal[0][0])),N> ret; + for(int i=0;i(arg._internal[i][j]); + }} + return ret; +} ///////////////////////////////////////////////////////////////// // Can only take the real/imag part of scalar objects, since @@ -1075,17 +1272,24 @@ template inline auto imag(const iVector &z) -> iVect // Trace of scalar and matrix ///////////////////////////////// -inline Complex trace( const Complex &arg){ +inline ComplexF trace( const ComplexF &arg){ return arg; } -//inline vComplex trace(const vComplex &arg){ -// return arg; -//} +inline ComplexD trace( const ComplexD &arg){ + return arg; +} +inline RealF trace( const RealF &arg){ + return arg; +} +inline RealD trace( const RealD &arg){ + return arg; +} + template inline auto trace(const iMatrix &arg) -> iScalar { iScalar ret; - ZeroIt(ret._internal); + zeroit(ret._internal); for(int i=0;i #endif #if defined(AVX1) || defined (AVX2) @@ -33,6 +33,12 @@ namespace Grid { inline RealF adj(const RealF & r){ return r; } inline RealF conj(const RealF & r){ return r; } + inline RealF real(const RealF & r){ return r; } + + inline RealD adj(const RealD & r){ return r; } + inline RealD conj(const RealD & r){ return r; } + inline RealD real(const RealD & r){ return r; } + inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; } inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; } inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; } @@ -60,8 +66,6 @@ namespace Grid { inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);} inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);} inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);} - inline RealD adj(const RealD & r){ return r; } // No-op for real - inline RealD conj(const RealD & r){ return r; } inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){ *y = (*a) * (*x)+(*y); } inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); } @@ -72,14 +76,14 @@ namespace Grid { class Zero{}; static Zero zero; - template inline void ZeroIt(itype &arg){ arg=zero;}; - template<> inline void ZeroIt(ComplexF &arg){ arg=0; }; - template<> inline void ZeroIt(ComplexD &arg){ arg=0; }; - template<> inline void ZeroIt(RealF &arg){ arg=0; }; - template<> inline void ZeroIt(RealD &arg){ arg=0; }; + template inline void zeroit(itype &arg){ arg=zero;}; + template<> inline void zeroit(ComplexF &arg){ arg=0; }; + template<> inline void zeroit(ComplexD &arg){ arg=0; }; + template<> inline void zeroit(RealF &arg){ arg=0; }; + template<> inline void zeroit(RealD &arg){ arg=0; }; -#if defined (SSE2) +#if defined (SSE4) typedef __m128 fvec; typedef __m128d dvec; typedef __m128 cvec; @@ -202,7 +206,7 @@ inline void Gpermute(vsimd &y,const vsimd &b,int perm){ case 1: y.v = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2)); break; case 0: y.v = _mm256_permute2f128_ps(b.v,b.v,0x01); break; #endif -#ifdef SSE2 +#ifdef SSE4 case 1: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); break; case 0: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2));break; #endif diff --git a/Grid_vComplexD.h b/Grid_vComplexD.h index 133e354b..ffcff3a9 100644 --- a/Grid_vComplexD.h +++ b/Grid_vComplexD.h @@ -49,7 +49,7 @@ namespace Grid { #if defined (AVX1)|| defined (AVX2) ret.v = _mm256_add_pd(a.v,b.v); #endif -#ifdef SSE2 +#ifdef SSE4 ret.v = _mm_add_pd(a.v,b.v); #endif #ifdef AVX512 @@ -67,7 +67,7 @@ namespace Grid { #if defined (AVX1)|| defined (AVX2) ret.v = _mm256_sub_pd(a.v,b.v); #endif -#ifdef SSE2 +#ifdef SSE4 ret.v = _mm_sub_pd(a.v,b.v); #endif #ifdef AVX512 @@ -114,7 +114,7 @@ namespace Grid { ymm1 = _mm256_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi ret.v= _mm256_addsub_pd(ymm0,ymm1); #endif -#ifdef SSE2 +#ifdef SSE4 zvec ymm0,ymm1,ymm2; ymm0 = _mm_shuffle_pd(a.v,a.v,0x0); // ymm0 <- ar ar, ymm0 = _mm_mul_pd(ymm0,b.v); // ymm0 <- ar bi, ar br @@ -200,14 +200,14 @@ namespace Grid { #if defined (AVX1)|| defined (AVX2) ret.v = _mm256_set_pd(ig,rl,ig,rl); #endif -#ifdef SSE2 - ret.v = _mm_set_pd(a,b); +#ifdef SSE4 + ret.v = _mm_set_pd(ig,rl); #endif #ifdef AVX512 ret.v = _mm512_set_pd(ig,rl,ig,rl,ig,rl,ig,rl); #endif #ifdef QPX - ret.v = {a,b,a,b}; + ret.v = {ig,rl,ig,rl}; #endif } @@ -215,7 +215,7 @@ namespace Grid { #if defined (AVX1)|| defined (AVX2) ret.v = _mm256_set_pd(a[1].imag(),a[1].real(),a[0].imag(),a[0].real()); #endif -#ifdef SSE2 +#ifdef SSE4 ret.v = _mm_set_pd(a[0].imag(),a[0].real()); #endif #ifdef AVX512 @@ -231,7 +231,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){ #if defined (AVX1)|| defined (AVX2) _mm256_store_pd((double *)a,ret.v); #endif -#ifdef SSE2 +#ifdef SSE4 _mm_store_pd((double *)a,ret.v); #endif #ifdef AVX512 @@ -257,7 +257,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){ __m256d tmp = _mm256_addsub_pd(ret.v,_mm256_shuffle_pd(in.v,in.v,0x5)); ret.v=_mm256_shuffle_pd(tmp,tmp,0x5); #endif -#ifdef SSE2 +#ifdef SSE4 ret.v = _mm_addsub_pd(ret.v,in.v); #endif #ifdef AVX512 diff --git a/Grid_vComplexF.h b/Grid_vComplexF.h index 75c47248..42fe5163 100644 --- a/Grid_vComplexF.h +++ b/Grid_vComplexF.h @@ -63,7 +63,7 @@ namespace Grid { #if defined (AVX1)|| defined (AVX2) ret.v = _mm256_add_ps(a.v,b.v); #endif -#ifdef SSE2 +#ifdef SSE4 ret.v = _mm_add_ps(a.v,b.v); #endif #ifdef AVX512 @@ -81,7 +81,7 @@ namespace Grid { #if defined (AVX1)|| defined (AVX2) ret.v = _mm256_sub_ps(a.v,b.v); #endif -#ifdef SSE2 +#ifdef SSE4 ret.v = _mm_sub_ps(a.v,b.v); #endif #ifdef AVX512 @@ -120,7 +120,7 @@ namespace Grid { ymm1 = _mm256_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi ret.v= _mm256_addsub_ps(ymm0,ymm1); #endif -#ifdef SSE2 +#ifdef SSE4 cvec ymm0,ymm1,ymm2; ymm0 = _mm_shuffle_ps(a.v,a.v,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar, ymm0 = _mm_mul_ps(ymm0,b.v); // ymm0 <- ar bi, ar br @@ -156,8 +156,8 @@ namespace Grid { #if defined (AVX1)|| defined (AVX2) ret.v = _mm256_set_ps(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real()); #endif -#ifdef SSE2 - ret.v = _mm_set_ps(a[1].imag, a[1].real(),a[0].imag(),a[0].real()); +#ifdef SSE4 + ret.v = _mm_set_ps(a[1].imag(), a[1].real(),a[0].imag(),a[0].real()); #endif #ifdef AVX512 ret.v = _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(),a[5].imag(),a[5].real(),a[4].imag(),a[4].real(),a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real()); @@ -181,7 +181,7 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){ #if defined (AVX1)|| defined (AVX2) _mm256_store_ps((float *)a,ret.v); #endif -#ifdef SSE2 +#ifdef SSE4 _mm_store_ps((float *)a,ret.v); #endif #ifdef AVX512 @@ -201,7 +201,7 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){ #if defined (AVX1)|| defined (AVX2) ret.v = _mm256_set_ps(b,a,b,a,b,a,b,a); #endif -#ifdef SSE2 +#ifdef SSE4 ret.v = _mm_set_ps(a,b,a,b); #endif #ifdef AVX512 @@ -213,7 +213,11 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){ } friend inline ComplexF Reduce(const vComplexF & in) { +#ifdef SSE4 +#error +#endif #if defined (AVX1) || defined(AVX2) + // FIXME this is inefficient and use __attribute__ ((aligned(32))) float c_[8]; _mm256_store_ps(c_,in.v); return ComplexF(c_[0]+c_[2]+c_[4]+c_[6],c_[1]+c_[3]+c_[5]+c_[7]); @@ -305,7 +309,7 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){ tmp = _mm256_addsub_ps(ret.v,_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1))); // ymm1 <- br,bi ret.v=_mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1)); #endif -#ifdef SSE2 +#ifdef SSE4 ret.v = _mm_addsub_ps(ret.v,in.v); #endif #ifdef AVX512 diff --git a/Grid_vInteger.h b/Grid_vInteger.h index 82adbd8e..672d4938 100644 --- a/Grid_vInteger.h +++ b/Grid_vInteger.h @@ -48,7 +48,7 @@ namespace Grid { #if defined (AVX2) ret.v = _mm256_add_epi32(a.v,b.v); #endif -#ifdef SSE2 +#ifdef SSE4 ret.v = _mm_add_epi32(a.v,b.v); #endif #ifdef AVX512 @@ -78,7 +78,7 @@ namespace Grid { #if defined (AVX2) ret.v = _mm256_sub_epi32(a.v,b.v); #endif -#ifdef SSE2 +#ifdef SSE4 ret.v = _mm_sub_epi32(a.v,b.v); #endif #ifdef AVX512 @@ -108,7 +108,7 @@ namespace Grid { #if defined (AVX2) ret.v = _mm256_mul_epi32(a.v,b.v); #endif -#ifdef SSE2 +#ifdef SSE4 ret.v = _mm_mul_epi32(a.v,b.v); #endif #ifdef AVX512 @@ -147,7 +147,7 @@ namespace Grid { #if defined (AVX1)|| defined (AVX2) ret.v = _mm256_set1_epi32(a); #endif -#ifdef SSE2 +#ifdef SSE4 ret.v = _mm_set1_epi32(a); #endif #ifdef AVX512 @@ -161,7 +161,7 @@ namespace Grid { #if defined (AVX1)|| defined (AVX2) ret.v = _mm256_set_epi32(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); #endif -#ifdef SSE2 +#ifdef SSE4 ret.v = _mm_set_epi32(a[0],a[1],a[2],a[3]); #endif #ifdef AVX512 @@ -177,8 +177,8 @@ namespace Grid { #if defined (AVX1)|| defined (AVX2) _mm256_store_si256((__m256i*)a,ret.v); #endif -#ifdef SSE2 - _mm_store_si128(a,ret.v); +#ifdef SSE4 + _mm_store_si128((__m128i *)a,ret.v); #endif #ifdef AVX512 _mm512_store_si512(a,ret.v); diff --git a/Grid_vRealD.h b/Grid_vRealD.h index 86f738f3..0f88f7da 100644 --- a/Grid_vRealD.h +++ b/Grid_vRealD.h @@ -24,7 +24,7 @@ namespace Grid { friend inline vRealD conj(const vRealD &in){ return in; } friend inline void mac (vRealD &y,const vRealD a,const vRealD x){ -#if defined (AVX1) || defined (SSE2) +#if defined (AVX1) || defined (SSE4) y = a*x+y; #endif #ifdef AVX2 // AVX 2 introduced FMA support. FMA4 eliminates a copy, but AVX only has FMA3 @@ -55,7 +55,7 @@ namespace Grid { #if defined (AVX1)|| defined (AVX2) ret.v = _mm256_add_pd(a.v,b.v); #endif -#ifdef SSE2 +#ifdef SSE4 ret.v = _mm_add_pd(a.v,b.v); #endif #ifdef AVX512 @@ -72,7 +72,7 @@ namespace Grid { #if defined (AVX1)|| defined (AVX2) ret.v = _mm256_sub_pd(a.v,b.v); #endif -#ifdef SSE2 +#ifdef SSE4 ret.v = _mm_sub_pd(a.v,b.v); #endif #ifdef AVX512 @@ -90,7 +90,7 @@ namespace Grid { #if defined (AVX1)|| defined (AVX2) ret.v = _mm256_mul_pd(a.v,b.v); #endif -#ifdef SSE2 +#ifdef SSE4 ret.v = _mm_mul_pd(a.v,b.v); #endif #ifdef AVX512 @@ -133,7 +133,7 @@ namespace Grid { #if defined (AVX1)|| defined (AVX2) ret.v = _mm256_set_pd(a,a,a,a); #endif -#ifdef SSE2 +#ifdef SSE4 ret.v = _mm_set_pd(a,a); #endif #ifdef AVX512 @@ -147,7 +147,7 @@ namespace Grid { #if defined (AVX1)|| defined (AVX2) ret.v = _mm256_set_pd(a[3],a[2],a[1],a[0]); #endif -#ifdef SSE2 +#ifdef SSE4 ret.v = _mm_set_pd(a[0],a[1]); #endif #ifdef AVX512 @@ -163,7 +163,7 @@ namespace Grid { #if defined (AVX1)|| defined (AVX2) _mm256_store_pd(a,ret.v); #endif -#ifdef SSE2 +#ifdef SSE4 _mm_store_pd(a,ret.v); #endif #ifdef AVX512 diff --git a/Grid_vRealF.h b/Grid_vRealF.h index b7c127ce..78c37177 100644 --- a/Grid_vRealF.h +++ b/Grid_vRealF.h @@ -26,7 +26,7 @@ namespace Grid { #if defined (AVX1)|| defined (AVX2) ret.v = _mm256_add_ps(a.v,b.v); #endif -#ifdef SSE2 +#ifdef SSE4 ret.v = _mm_add_ps(a.v,b.v); #endif #ifdef AVX512 @@ -48,7 +48,7 @@ namespace Grid { #if defined (AVX1)|| defined (AVX2) ret.v = _mm256_sub_ps(a.v,b.v); #endif -#ifdef SSE2 +#ifdef SSE4 ret.v = _mm_sub_ps(a.v,b.v); #endif #ifdef AVX512 @@ -70,7 +70,7 @@ namespace Grid { #if defined (AVX1)|| defined (AVX2) ret.v = _mm256_mul_ps(a.v,b.v); #endif -#ifdef SSE2 +#ifdef SSE4 ret.v = _mm_mul_ps(a.v,b.v); #endif #ifdef AVX512 @@ -96,7 +96,7 @@ namespace Grid { friend inline vRealF conj(const vRealF &in){ return in; } friend inline void mac (vRealF &y,const vRealF a,const vRealF x){ -#if defined (AVX1) || defined (SSE2) +#if defined (AVX1) || defined (SSE4) y = a*x+y; #endif #ifdef AVX2 // AVX 2 introduced FMA support. FMA4 eliminates a copy, but AVX only has FMA3 @@ -158,7 +158,7 @@ namespace Grid { #if defined (AVX1)|| defined (AVX2) ret.v = _mm256_set_ps(a,a,a,a,a,a,a,a); #endif -#ifdef SSE2 +#ifdef SSE4 ret.v = _mm_set_ps(a,a,a,a); #endif #ifdef AVX512 @@ -175,7 +175,7 @@ namespace Grid { #if defined (AVX1)|| defined (AVX2) ret.v = _mm256_set_ps(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); #endif -#ifdef SSE2 +#ifdef SSE4 ret.v = _mm_set_ps(a[0],a[1],a[2],a[3]); #endif #ifdef AVX512 @@ -195,7 +195,7 @@ friend inline void vstore(const vRealF &ret, float *a){ #if defined (AVX1)|| defined (AVX2) _mm256_store_ps(a,ret.v); #endif -#ifdef SSE2 +#ifdef SSE4 _mm_store_ps(a,ret.v); #endif #ifdef AVX512 diff --git a/test_Grid_stencil.cc b/test_Grid_stencil.cc index 8077562f..fd2a9503 100644 --- a/test_Grid_stencil.cc +++ b/test_Grid_stencil.cc @@ -29,14 +29,14 @@ int main (int argc, char ** argv) #ifdef AVX512 simd_layout[0] = 1; - simd_layout[1] = 1; + simd_layout[1] = 2; simd_layout[2] = 2; simd_layout[3] = 2; #endif #if defined (AVX1)|| defined (AVX2) simd_layout[0] = 1; simd_layout[1] = 1; - simd_layout[2] = 1; + simd_layout[2] = 2; simd_layout[3] = 2; #endif #if defined (SSE2)