From e80012896a1aa983879e282d352767f228a84f5d Mon Sep 17 00:00:00 2001 From: neo Date: Tue, 9 Jun 2015 18:59:45 +0900 Subject: [PATCH] Adding support for iMatrix exponentiation --- lib/simd/Old/Grid_vComplexD.h | 430 ------------------------------- lib/simd/Old/Grid_vComplexF.h | 463 ---------------------------------- lib/simd/Old/Grid_vInteger.h | 259 ------------------- lib/simd/Old/Grid_vRealD.h | 276 -------------------- lib/simd/Old/Grid_vRealF.h | 313 ----------------------- lib/tensors/Tensor_Ta.h | 84 +++++- tests/InvSqrt.gnu | 0 tests/Sqrt.gnu | 2 + tests/Test_main.cc | 3 + 9 files changed, 83 insertions(+), 1747 deletions(-) delete mode 100644 lib/simd/Old/Grid_vComplexD.h delete mode 100644 lib/simd/Old/Grid_vComplexF.h delete mode 100644 lib/simd/Old/Grid_vInteger.h delete mode 100644 lib/simd/Old/Grid_vRealD.h delete mode 100644 lib/simd/Old/Grid_vRealF.h create mode 100644 tests/InvSqrt.gnu create mode 100644 tests/Sqrt.gnu diff --git a/lib/simd/Old/Grid_vComplexD.h b/lib/simd/Old/Grid_vComplexD.h deleted file mode 100644 index 116c2866..00000000 --- a/lib/simd/Old/Grid_vComplexD.h +++ /dev/null @@ -1,430 +0,0 @@ -#ifndef GRID_VCOMPLEXD_H -#define GRID_VCOMPLEXD_H - -namespace Grid { - class vComplexD { - public: - zvec v; - public: - typedef zvec vector_type; - typedef ComplexD scalar_type; - - vComplexD & operator = ( Zero & z){ - vzero(*this); - return (*this); - } - vComplexD( Zero & z){ - vzero(*this); - } - vComplexD()=default; - vComplexD(ComplexD a){ - vsplat(*this,a); - }; - vComplexD(double a){ - vsplat(*this,ComplexD(a)); - }; - - /////////////////////////////////////////////// - // mac, mult, sub, add, adj - // Should do an AVX2 version with mac. - /////////////////////////////////////////////// - friend inline void mac (vComplexD * __restrict__ y,const vComplexD * __restrict__ a,const vComplexD *__restrict__ x) {*y = (*a)*(*x)+(*y);}; - friend inline void mult(vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) * (*r);} - friend inline void sub (vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) - (*r);} - friend inline void add (vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) + (*r);} - friend inline vComplexD adj(const vComplexD &in){ return conjugate(in); } - - ////////////////////////////////// - // Initialise to 1,0,i - ////////////////////////////////// - friend inline void vone (vComplexD &ret){ vsplat(ret,1.0,0.0);} - friend inline void vzero (vComplexD &ret){ vsplat(ret,0.0,0.0);} - friend inline void vcomplex_i(vComplexD &ret){ vsplat(ret,0.0,1.0);} - - //////////////////////////////////// - // Arithmetic operator overloads +,-,* - //////////////////////////////////// - friend inline vComplexD operator + (vComplexD a, vComplexD b) - { - vComplexD ret; -#if defined (AVX1)|| defined (AVX2) - ret.v = _mm256_add_pd(a.v,b.v); -#endif -#ifdef SSE4 - ret.v = _mm_add_pd(a.v,b.v); -#endif -#ifdef AVX512 - ret.v = _mm512_add_pd(a.v,b.v); -#endif -#ifdef QPX - ret.v = vec_add(a.v,b.v); -#endif - return ret; - }; - - friend inline vComplexD operator - (vComplexD a, vComplexD b) - { - vComplexD ret; -#if defined (AVX1)|| defined (AVX2) - ret.v = _mm256_sub_pd(a.v,b.v); -#endif -#ifdef SSE4 - ret.v = _mm_sub_pd(a.v,b.v); -#endif -#ifdef AVX512 - ret.v = _mm512_sub_pd(a.v,b.v); -#endif -#ifdef QPX - ret.v = vec_sub(a.v,b.v); -#endif - return ret; - }; - - friend inline vComplexD operator * (vComplexD a, vComplexD b) - { - vComplexD ret; - - //Multiplicationof (ak+ibk)*(ck+idk) - // a + i b can be stored as a data structure - //From intel optimisation reference guide - /* - movsldup xmm0, Src1; load real parts into the destination, - ; a1, a1, a0, a0 - movaps xmm1, src2; load the 2nd pair of complex values, ; i.e. d1, c1, d0, c0 - mulps xmm0, xmm1; temporary results, a1d1, a1c1, a0d0, ; a0c0 - shufps xmm1, xmm1, b1; reorder the real and imaginary ; parts, c1, d1, c0, d0 - movshdup xmm2, Src1; load the imaginary parts into the ; destination, b1, b1, b0, b0 - mulps xmm2, xmm1; temporary results, b1c1, b1d1, b0c0, ; b0d0 - addsubps xmm0, xmm2; b1c1+a1d1, a1c1 -b1d1, b0c0+a0d - VSHUFPD (VEX.256 encoded version) - IF IMM0[0] = 0 - THEN DEST[63:0]=SRC1[63:0] ELSE DEST[63:0]=SRC1[127:64] FI; - IF IMM0[1] = 0 - THEN DEST[127:64]=SRC2[63:0] ELSE DEST[127:64]=SRC2[127:64] FI; - IF IMM0[2] = 0 - THEN DEST[191:128]=SRC1[191:128] ELSE DEST[191:128]=SRC1[255:192] FI; - IF IMM0[3] = 0 - THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // Ox5 r<->i ; 0xC unchanged - */ -#if defined (AVX1)|| defined (AVX2) - zvec ymm0,ymm1,ymm2; - ymm0 = _mm256_shuffle_pd(a.v,a.v,0x0); // ymm0 <- ar ar, ar,ar b'00,00 - ymm0 = _mm256_mul_pd(ymm0,b.v); // ymm0 <- ar bi, ar br - ymm1 = _mm256_shuffle_pd(b.v,b.v,0x5); // ymm1 <- br,bi b'01,01 - ymm2 = _mm256_shuffle_pd(a.v,a.v,0xF); // ymm2 <- ai,ai b'11,11 - ymm1 = _mm256_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi - ret.v= _mm256_addsub_pd(ymm0,ymm1); -#endif -#ifdef SSE4 - zvec ymm0,ymm1,ymm2; - ymm0 = _mm_shuffle_pd(a.v,a.v,0x0); // ymm0 <- ar ar, - ymm0 = _mm_mul_pd(ymm0,b.v); // ymm0 <- ar bi, ar br - ymm1 = _mm_shuffle_pd(b.v,b.v,0x1); // ymm1 <- br,bi b01 - ymm2 = _mm_shuffle_pd(a.v,a.v,0x3); // ymm2 <- ai,ai b11 - ymm1 = _mm_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi - ret.v= _mm_addsub_pd(ymm0,ymm1); -#endif -#ifdef AVX512 - /* This is from - * Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets - * @inproceedings{McFarlin:2011:ASV:1995896.1995938, - * author = {McFarlin, Daniel S. and Arbatov, Volodymyr and Franchetti, Franz and P\"{u}schel, Markus}, - * title = {Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets}, - * booktitle = {Proceedings of the International Conference on Supercomputing}, - * series = {ICS '11}, - * year = {2011}, - * isbn = {978-1-4503-0102-2}, - * location = {Tucson, Arizona, USA}, - * pages = {265--274}, - * numpages = {10}, - * url = {http://doi.acm.org/10.1145/1995896.1995938}, - * doi = {10.1145/1995896.1995938}, - * acmid = {1995938}, - * publisher = {ACM}, - * address = {New York, NY, USA}, - * keywords = {autovectorization, fourier transform, program generation, simd, super-optimization}, - * } - */ - zvec vzero,ymm0,ymm1,real,imag; - vzero =(zvec)_mm512_setzero(); - ymm0 = _mm512_swizzle_pd(a.v, _MM_SWIZ_REG_CDAB); // - real =(zvec)_mm512_mask_or_epi64((__m512i)a.v, 0xAA,(__m512i)vzero,(__m512i) ymm0); - imag = _mm512_mask_sub_pd(a.v, 0x55,vzero, ymm0); - ymm1 = _mm512_mul_pd(real, b.v); - ymm0 = _mm512_swizzle_pd(b.v, _MM_SWIZ_REG_CDAB); // OK - ret.v= _mm512_fmadd_pd(ymm0,imag,ymm1); - /* Imag OK */ -#endif -#ifdef QPX - ret.v = vec_mul(a.v,b.v); -#endif - return ret; - }; - - //////////////////////////////////////////////////////////////////// - // General permute; assumes vector length is same across - // all subtypes; may not be a good assumption, but could - // add the vector width as a template param for BG/Q for example - //////////////////////////////////////////////////////////////////// - friend inline void permute(vComplexD &y,vComplexD b,int perm) - { - Gpermute(y,b,perm); - } - /* - friend inline void merge(vComplexD &y,std::vector &extracted) - { - Gmerge(y,extracted); - } - friend inline void extract(const vComplexD &y,std::vector &extracted) - { - Gextract(y,extracted); - } - friend inline void merge(vComplexD &y,std::vector &extracted) - { - Gmerge(y,extracted); - } - friend inline void extract(const vComplexD &y,std::vector &extracted) - { - Gextract(y,extracted); - } - */ - - /////////////////////// - // Splat - /////////////////////// - friend inline void vsplat(vComplexD &ret,ComplexD c){ - float a= real(c); - float b= imag(c); - vsplat(ret,a,b); - } - - - friend inline void vsplat(vComplexD &ret,double rl,double ig){ -#if defined (AVX1)|| defined (AVX2) - ret.v = _mm256_set_pd(ig,rl,ig,rl); -#endif -#ifdef SSE4 - ret.v = _mm_set_pd(ig,rl); -#endif -#ifdef AVX512 - ret.v = _mm512_set_pd(ig,rl,ig,rl,ig,rl,ig,rl); -#endif -#ifdef QPX - ret.v = {ig,rl,ig,rl}; -#endif - } - - friend inline void vset(vComplexD &ret,ComplexD *a){ -#if defined (AVX1)|| defined (AVX2) - ret.v = _mm256_set_pd(a[1].imag(),a[1].real(),a[0].imag(),a[0].real()); -#endif -#ifdef SSE4 - ret.v = _mm_set_pd(a[0].imag(),a[0].real()); -#endif -#ifdef AVX512 - ret.v = _mm512_set_pd(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real()); - // Note v has a0 a1 a2 a3 -#endif -#ifdef QPX - ret.v = {a[0].real(),a[0].imag(),a[1].real(),a[3].imag()}; -#endif - } - -friend inline void vstore(const vComplexD &ret, ComplexD *a){ -#if defined (AVX1)|| defined (AVX2) - _mm256_store_pd((double *)a,ret.v); -#endif -#ifdef SSE4 - _mm_store_pd((double *)a,ret.v); -#endif -#ifdef AVX512 - _mm512_store_pd((double *)a,ret.v); - //Note v has a3 a2 a1 a0 -#endif -#ifdef QPX - assert(0); -#endif - } - friend inline void vstream(vComplexD &out,const vComplexD &in){ -#if defined (AVX1)|| defined (AVX2) - _mm256_stream_pd((double *)&out.v,in.v); -#endif -#ifdef SSE4 - _mm_stream_pd((double *)&out.v,in.v); -#endif -#ifdef AVX512 - _mm512_storenrngo_pd((double *)&out.v,in.v); - // _mm512_stream_pd((double *)&out.v,in.v); - //Note v has a3 a2 a1 a0 -#endif -#ifdef QPX - assert(0); -#endif - } - friend inline void prefetch(const vComplexD &v) - { - _mm_prefetch((const char*)&v.v,_MM_HINT_T0); - } - - //////////////////////// - // Conjugate - //////////////////////// - friend inline vComplexD conjugate(const vComplexD &in){ - vComplexD ret ; vzero(ret); -#if defined (AVX1)|| defined (AVX2) - // addsubps 0, inv=>0+in.v[3] 0-in.v[2], 0+in.v[1], 0-in.v[0], ... - zvec tmp = _mm256_addsub_pd(ret.v,_mm256_shuffle_pd(in.v,in.v,0x5)); - ret.v =_mm256_shuffle_pd(tmp,tmp,0x5); -#endif -#ifdef SSE4 - zvec tmp = _mm_addsub_pd(ret.v,_mm_shuffle_pd(in.v,in.v,0x1)); - ret.v = _mm_shuffle_pd(tmp,tmp,0x1); -#endif -#ifdef AVX512 - ret.v = _mm512_mask_sub_pd(in.v, 0xaa,ret.v, in.v); -#endif -#ifdef QPX - assert(0); -#endif - return ret; - } - - friend inline void timesMinusI(vComplexD &ret,const vComplexD &in){ - vzero(ret); - vComplexD tmp; -#if defined (AVX1)|| defined (AVX2) - tmp.v =_mm256_addsub_pd(ret.v,in.v); // r,-i - ret.v =_mm256_shuffle_pd(tmp.v,tmp.v,0x5); -#endif -#ifdef SSE4 - tmp.v =_mm_addsub_pd(ret.v,in.v); // r,-i - ret.v =_mm_shuffle_pd(tmp.v,tmp.v,0x1); -#endif -#ifdef AVX512 - ret.v = _mm512_mask_sub_pd(in.v,0xaa,ret.v,in.v); // real -imag - ret.v = _mm512_swizzle_pd(ret.v, _MM_SWIZ_REG_CDAB);// OK -#endif -#ifdef QPX - assert(0); -#endif - } - - friend inline void timesI(vComplexD &ret, const vComplexD &in){ - vzero(ret); - vComplexD tmp; -#if defined (AVX1)|| defined (AVX2) - tmp.v =_mm256_shuffle_pd(in.v,in.v,0x5); - ret.v =_mm256_addsub_pd(ret.v,tmp.v); // i,-r -#endif -#ifdef SSE4 - tmp.v =_mm_shuffle_pd(in.v,in.v,0x1); - ret.v =_mm_addsub_pd(ret.v,tmp.v); // r,-i -#endif -#ifdef AVX512 - tmp.v = _mm512_swizzle_pd(in.v, _MM_SWIZ_REG_CDAB);// OK - ret.v = _mm512_mask_sub_pd(tmp.v,0xaa,ret.v,tmp.v); // real -imag -#endif -#ifdef QPX - assert(0); -#endif - } - - friend inline vComplexD timesMinusI(const vComplexD &in){ - vComplexD ret; - timesMinusI(ret,in); - return ret; - } - - friend inline vComplexD timesI(const vComplexD &in){ - vComplexD ret; - timesI(ret,in); - return ret; - } - - -// REDUCE FIXME must be a cleaner implementation - friend inline ComplexD Reduce(const vComplexD & in) - { - vComplexD v1,v2; - union { - zvec v; - double f[sizeof(zvec)/sizeof(double)]; - } conv; - -#ifdef SSE4 - v1=in; -#endif -#if defined(AVX1) || defined (AVX2) - permute(v1,in,0); // sse 128; paired complex single - v1=v1+in; -#endif -#ifdef AVX512 - permute(v1,in,0); // sse 128; paired complex single - v1=v1+in; - permute(v2,v1,1); // avx 256; quad complex single - v1=v1+v2; -#endif -#ifdef QPX -#error -#endif - conv.v = v1.v; - return ComplexD(conv.f[0],conv.f[1]); - } - - // Unary negation - friend inline vComplexD operator -(const vComplexD &r) { - vComplexD ret; - vzero(ret); - ret = ret - r; - return ret; - } - // *=,+=,-= operators - inline vComplexD &operator *=(const vComplexD &r) { - *this = (*this)*r; - return *this; - } - inline vComplexD &operator +=(const vComplexD &r) { - *this = *this+r; - return *this; - } - inline vComplexD &operator -=(const vComplexD &r) { - *this = *this-r; - return *this; - } - - public: - static int Nsimd(void) { return sizeof(zvec)/sizeof(double)/2;} - }; - - - inline vComplexD innerProduct(const vComplexD & l, const vComplexD & r) { return conjugate(l)*r; } - - - typedef vComplexD vDComplex; - inline void zeroit(vComplexD &z){ vzero(z);} - - inline vComplexD outerProduct(const vComplexD &l, const vComplexD& r) - { - return l*r; - } - inline vComplexD trace(const vComplexD &arg){ - return arg; - } -///////////////////////////////////////////////////////////////////////// -//// Generic routine to promote object -> object -//// Supports the array reordering transformation that gives me SIMD utilisation -/////////////////////////////////////////////////////////////////////////// -/* -template class object> -inline object splat(objects){ - object ret; - vComplex * v_ptr = (vComplex *)& ret; - Complex * s_ptr = (Complex *) &s; - for(int i=0;i(y,b,perm); - } - friend inline ComplexF Reduce(const vComplexF & in) - { - vComplexF v1,v2; - union { - cvec v; - float f[sizeof(cvec)/sizeof(float)]; - } conv; -#ifdef SSE4 - permute(v1,in,0); // sse 128; paired complex single - v1=v1+in; -#endif -#if defined(AVX1) || defined (AVX2) - permute(v1,in,0); // sse 128; paired complex single - v1=v1+in; - permute(v2,v1,1); // avx 256; quad complex single - v1=v1+v2; -#endif -#ifdef AVX512 - permute(v1,in,0); // avx512 octo-complex single - v1=v1+in; - permute(v2,v1,1); - v1=v1+v2; - permute(v2,v1,2); - v1=v1+v2; -#endif -#ifdef QPX -#error -#endif - conv.v = v1.v; - return ComplexF(conv.f[0],conv.f[1]); - } - - friend inline vComplexF operator * (const ComplexF &a, vComplexF b){ - vComplexF va; - vsplat(va,a); - return va*b; - } - friend inline vComplexF operator * (vComplexF b,const ComplexF &a){ - return a*b; - } - - /* - template - friend inline vComplexF operator * (vComplexF b,const real &a){ - vComplexF va; - Complex ca(a,0); - vsplat(va,ca); - return va*b; - } - template - friend inline vComplexF operator * (const real &a,vComplexF b){ - return a*b; - } - - friend inline vComplexF operator + (const Complex &a, vComplexF b){ - vComplexF va; - vsplat(va,a); - return va+b; - } - friend inline vComplexF operator + (vComplexF b,const Complex &a){ - return a+b; - } - template - friend inline vComplexF operator + (vComplexF b,const real &a){ - vComplexF va; - Complex ca(a,0); - vsplat(va,ca); - return va+b; - } - template - friend inline vComplexF operator + (const real &a,vComplexF b){ - return a+b; - } - friend inline vComplexF operator - (const Complex &a, vComplexF b){ - vComplexF va; - vsplat(va,a); - return va-b; - } - friend inline vComplexF operator - (vComplexF b,const Complex &a){ - vComplexF va; - vsplat(va,a); - return b-va; - } - template - friend inline vComplexF operator - (vComplexF b,const real &a){ - vComplexF va; - Complex ca(a,0); - vsplat(va,ca); - return b-va; - } - template - friend inline vComplexF operator - (const real &a,vComplexF b){ - vComplexF va; - Complex ca(a,0); - vsplat(va,ca); - return va-b; - } - */ - - - /////////////////////// - // Conjugate - /////////////////////// - - friend inline vComplexF conjugate(const vComplexF &in){ - vComplexF ret ; vzero(ret); -#if defined (AVX1)|| defined (AVX2) - ret.v = _mm256_xor_ps(_mm256_addsub_ps(ret.v,in.v), _mm256_set1_ps(-0.f)); -#endif -#ifdef SSE4 - ret.v = _mm_xor_ps(_mm_addsub_ps(ret.v,in.v), _mm_set1_ps(-0.f)); -#endif -#ifdef AVX512 - ret.v = _mm512_mask_sub_ps(in.v,0xaaaa,ret.v,in.v); // Zero out 0+real 0-imag -#endif -#ifdef QPX - assert(0); -#endif - return ret; - } - friend inline void timesMinusI( vComplexF &ret,const vComplexF &in){ - vzero(ret); -#if defined (AVX1)|| defined (AVX2) - cvec tmp =_mm256_addsub_ps(ret.v,in.v); // r,-i - ret.v = _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1)); //-i,r -#endif -#ifdef SSE4 - cvec tmp =_mm_addsub_ps(ret.v,in.v); // r,-i - ret.v = _mm_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1)); -#endif -#ifdef AVX512 - ret.v = _mm512_mask_sub_ps(in.v,0xaaaa,ret.v,in.v); // real -imag - ret.v = _mm512_swizzle_ps(ret.v, _MM_SWIZ_REG_CDAB);// OK -#endif -#ifdef QPX - assert(0); -#endif - } - friend inline void timesI(vComplexF &ret,const vComplexF &in){ - vzero(ret); -#if defined (AVX1)|| defined (AVX2) - cvec tmp =_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1));//i,r - ret.v =_mm256_addsub_ps(ret.v,tmp); //i,-r -#endif -#ifdef SSE4 - cvec tmp =_mm_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1)); - ret.v = _mm_addsub_ps(ret.v,tmp); // r,-i -#endif -#ifdef AVX512 - cvec tmp = _mm512_swizzle_ps(in.v, _MM_SWIZ_REG_CDAB);// OK - ret.v = _mm512_mask_sub_ps(tmp,0xaaaa,ret.v,tmp); // real -imag -#endif -#ifdef QPX - assert(0); -#endif - } - friend inline vComplexF timesMinusI(const vComplexF &in){ - vComplexF ret; - timesMinusI(ret,in); - return ret; - } - friend inline vComplexF timesI(const vComplexF &in){ - vComplexF ret; - timesI(ret,in); - return ret; - } - - - // Unary negation - friend inline vComplexF operator -(const vComplexF &r) { - vComplexF ret; - vzero(ret); - ret = ret - r; - return ret; - } - // *=,+=,-= operators - inline vComplexF &operator *=(const vComplexF &r) { - *this = (*this)*r; - return *this; - } - inline vComplexF &operator +=(const vComplexF &r) { - *this = *this+r; - return *this; - } - inline vComplexF &operator -=(const vComplexF &r) { - *this = *this-r; - return *this; - } - - /* - friend inline void merge(vComplexF &y,std::vector &extracted) - { - Gmerge(y,extracted); - } - friend inline void extract(const vComplexF &y,std::vector &extracted) - { - Gextract(y,extracted); - } - friend inline void merge(vComplexF &y,std::vector &extracted) - { - Gmerge(y,extracted); - } - friend inline void extract(const vComplexF &y,std::vector &extracted) - { - Gextract(y,extracted); - } - */ - - }; - - inline vComplexF innerProduct(const vComplexF & l, const vComplexF & r) - { - return conjugate(l)*r; - } - - inline void zeroit(vComplexF &z){ vzero(z);} - - inline vComplexF outerProduct(const vComplexF &l, const vComplexF& r) - { - return l*r; - } - inline vComplexF trace(const vComplexF &arg){ - return arg; - } - - -} - -#endif diff --git a/lib/simd/Old/Grid_vInteger.h b/lib/simd/Old/Grid_vInteger.h deleted file mode 100644 index ee4a3633..00000000 --- a/lib/simd/Old/Grid_vInteger.h +++ /dev/null @@ -1,259 +0,0 @@ -#ifndef GRID_VINTEGER_H -#define GRID_VINTEGER_H - -namespace Grid { - - - class vInteger { - protected: - - public: - - ivec v; - - typedef ivec vector_type; - typedef Integer scalar_type; - - vInteger(){}; - vInteger & operator = (const Zero & z){ - vzero(*this); - return (*this); - } - vInteger(Integer a){ - vsplat(*this,a); - }; - //////////////////////////////////// - // Arithmetic operator overloads +,-,* - //////////////////////////////////// - friend inline vInteger operator + ( vInteger a, vInteger b) - { - vInteger ret; -#if defined (AVX1) - __m128i a0,a1; - __m128i b0,b1; - a0 = _mm256_extractf128_si256(a.v,0); - b0 = _mm256_extractf128_si256(b.v,0); - a1 = _mm256_extractf128_si256(a.v,1); - b1 = _mm256_extractf128_si256(b.v,1); - a0 = _mm_add_epi32(a0,b0); - a1 = _mm_add_epi32(a1,b1); - ret.v = _mm256_set_m128i(a1,a0); -#endif -#if defined (AVX2) - ret.v = _mm256_add_epi32(a.v,b.v); -#endif -#ifdef SSE4 - ret.v = _mm_add_epi32(a.v,b.v); -#endif -#ifdef AVX512 - ret.v = _mm512_add_epi32(a.v,b.v); -#endif -#ifdef QPX - // Implement as array of ints is only option -#error -#endif - return ret; - }; - - friend inline vInteger operator - ( vInteger a, vInteger b) - { - vInteger ret; -#if defined (AVX1) - __m128i a0,a1; - __m128i b0,b1; - a0 = _mm256_extractf128_si256(a.v,0); - b0 = _mm256_extractf128_si256(b.v,0); - a1 = _mm256_extractf128_si256(a.v,1); - b1 = _mm256_extractf128_si256(b.v,1); - a0 = _mm_sub_epi32(a0,b0); - a1 = _mm_sub_epi32(a1,b1); - ret.v = _mm256_set_m128i(a1,a0); -#endif -#if defined (AVX2) - ret.v = _mm256_sub_epi32(a.v,b.v); -#endif -#ifdef SSE4 - ret.v = _mm_sub_epi32(a.v,b.v); -#endif -#ifdef AVX512 - ret.v = _mm512_sub_epi32(a.v,b.v); -#endif -#ifdef QPX - // Implement as array of ints is only option -#error -#endif - return ret; - }; - - friend inline vInteger operator * ( vInteger a, vInteger b) - { - vInteger ret; -#if defined (AVX1) - __m128i a0,a1; - __m128i b0,b1; - a0 = _mm256_extractf128_si256(a.v,0); - b0 = _mm256_extractf128_si256(b.v,0); - a1 = _mm256_extractf128_si256(a.v,1); - b1 = _mm256_extractf128_si256(b.v,1); - a0 = _mm_mul_epi32(a0,b0); - a1 = _mm_mul_epi32(a1,b1); - ret.v = _mm256_set_m128i(a1,a0); -#endif -#if defined (AVX2) - ret.v = _mm256_mul_epi32(a.v,b.v); -#endif -#ifdef SSE4 - ret.v = _mm_mul_epi32(a.v,b.v); -#endif -#ifdef AVX512 - // ret.v = _mm512_mul_epi32(a.v,b.v); - ret.v = _mm512_mullo_epi32(a.v,b.v); -#endif -#ifdef QPX - // Implement as array of ints is only option -#error -#endif - return ret; - }; - - /////////////////////////////////////////////// - // mult, sub, add, adj,conjugate, mac functions - /////////////////////////////////////////////// - friend inline void mult(vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) * (*r);} - friend inline void sub (vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) - (*r);} - friend inline void add (vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) + (*r);} - friend inline void mac (vInteger &y,const vInteger a,const vInteger x){ - y = a*x+y; - } - - ////////////////////////////////// - // Initialise to 1,0,i - ////////////////////////////////// - friend inline void vone (vInteger &ret){vsplat(ret,1);} - friend inline void vzero(vInteger &ret){vsplat(ret,0);} - friend inline void vtrue (vInteger &ret){vsplat(ret,0xFFFFFFFF);} - friend inline void vfalse(vInteger &ret){vsplat(ret,0);} - - - ///////////////////////////////////////////////////// - // Broadcast a value across Nsimd copies. - ///////////////////////////////////////////////////// - friend inline void vsplat(vInteger &ret,scalar_type a){ -#if defined (AVX1)|| defined (AVX2) - ret.v = _mm256_set1_epi32(a); -#endif -#ifdef SSE4 - ret.v = _mm_set1_epi32(a); -#endif -#ifdef AVX512 - ret.v = _mm512_set1_epi32(a); -#endif -#ifdef QPX -#error -#endif - } - friend inline void vset(vInteger &ret,scalar_type *a){ -#if defined (AVX1)|| defined (AVX2) - ret.v = _mm256_set_epi32(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); -#endif -#ifdef SSE4 - ret.v = _mm_set_epi32(a[0],a[1],a[2],a[3]); -#endif -#ifdef AVX512 - ret.v = _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8], - a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); -#endif -#ifdef QPX -#error -#endif - } - - friend inline void vstore(const vInteger &ret, Integer *a){ -#if defined (AVX1)|| defined (AVX2) - _mm256_store_si256((__m256i*)a,ret.v); -#endif -#ifdef SSE4 - _mm_store_si128((__m128i *)a,ret.v); -#endif -#ifdef AVX512 - _mm512_store_si512(a,ret.v); -#endif -#ifdef QPX - assert(0); -#endif - } - - friend inline void vstream(vInteger & out,const vInteger &in){ - out=in; - } - - friend inline void prefetch(const vInteger &v) - { - _mm_prefetch((const char*)&v.v,_MM_HINT_T0); - } - - // Unary negation - friend inline vInteger operator -(const vInteger &r) { - vInteger ret; - vzero(ret); - ret = ret - r; - return ret; - } - friend inline Integer Reduce(const vInteger & in) - { - // unimplemented - assert(0); - } - // *=,+=,-= operators - inline vInteger &operator *=(const vInteger &r) { - *this = (*this)*r; - return *this; - } - inline vInteger &operator +=(const vInteger &r) { - *this = *this+r; - return *this; - } - inline vInteger &operator -=(const vInteger &r) { - *this = *this-r; - return *this; - } - - friend inline void permute(vInteger &y,const vInteger b,int perm) - { - Gpermute(y,b,perm); - } - /* - friend inline void merge(vInteger &y,std::vector &extracted) - { - Gmerge(y,extracted); - } - friend inline void extract(const vInteger &y,std::vector &extracted) - { - Gextract(y,extracted); - } - friend inline void merge(vInteger &y,std::vector &extracted) - { - Gmerge(y,extracted); - } - friend inline void extract(const vInteger &y,std::vector &extracted) - { - Gextract(y,extracted); - } - */ - - public: - static inline int Nsimd(void) { return sizeof(ivec)/sizeof(Integer);} - }; - - inline vInteger localInnerProduct(const vInteger & l, const vInteger & r) { return l*r; } - - inline void zeroit(vInteger &z){ vzero(z);} - - inline vInteger outerProduct(const vInteger &l, const vInteger& r) - { - return l*r; - } - -} - -#endif diff --git a/lib/simd/Old/Grid_vRealD.h b/lib/simd/Old/Grid_vRealD.h deleted file mode 100644 index a32e579e..00000000 --- a/lib/simd/Old/Grid_vRealD.h +++ /dev/null @@ -1,276 +0,0 @@ -#ifndef GRID_VREALD_H -#define GRID_VREALD_H - -namespace Grid { - class vRealD { - public: - dvec v; // dvec is double precision vector - - public: - typedef dvec vector_type; - typedef RealD scalar_type; - - vRealD()=default; - vRealD(RealD a){ - vsplat(*this,a); - }; - vRealD(Zero &zero){ - zeroit(*this); - } - vRealD & operator = ( Zero & z){ - vzero(*this); - return (*this); - } - - friend inline void mult(vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) * (*r);} - friend inline void sub (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) - (*r);} - friend inline void add (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) + (*r);} - friend inline vRealD adj(const vRealD &in) { return in; } - friend inline vRealD conjugate(const vRealD &in){ return in; } - - friend inline void mac (vRealD &y,const vRealD a,const vRealD x){ -#if defined (AVX1) || defined (SSE4) - y = a*x+y; -#endif -#ifdef AVX2 // AVX 2 introduced FMA support. FMA4 eliminates a copy, but AVX only has FMA3 - // accelerates multiply accumulate, but not general multiply add - y.v = _mm256_fmadd_pd(a.v,x.v,y.v); -#endif -#ifdef AVX512 - // here precision of vector are still single - y.v = _mm512_fmadd_pd(a.v,x.v,y.v); -#endif -#ifdef QPX - y.v = vec_madd(a.v,x.v,y.v); -#endif - } - ////////////////////////////////// - // Initialise to 1,0 - ////////////////////////////////// - friend inline void vone (vRealD &ret){ vsplat(ret,1.0);} - friend inline void vzero(vRealD &ret){ vsplat(ret,0.0);} - - - //////////////////////////////////// - // Arithmetic operator overloads +,-,* - //////////////////////////////////// - friend inline vRealD operator + (vRealD a, vRealD b) - { - vRealD ret; -#if defined (AVX1)|| defined (AVX2) - ret.v = _mm256_add_pd(a.v,b.v); -#endif -#ifdef SSE4 - ret.v = _mm_add_pd(a.v,b.v); -#endif -#ifdef AVX512 - ret.v = _mm512_add_pd(a.v,b.v); -#endif -#ifdef QPX - ret.v = vec_add(a.v,b.v); -#endif - return ret; - }; - friend inline vRealD operator - (vRealD a, vRealD b) - { - vRealD ret; -#if defined (AVX1)|| defined (AVX2) - ret.v = _mm256_sub_pd(a.v,b.v); -#endif -#ifdef SSE4 - ret.v = _mm_sub_pd(a.v,b.v); -#endif -#ifdef AVX512 - ret.v = _mm512_sub_pd(a.v,b.v); -#endif -#ifdef QPX - ret.v = vec_sub(a.v,b.v); -#endif - return ret; - }; - - friend inline vRealD operator * (vRealD a, vRealD b) - { - vRealD ret; -#if defined (AVX1)|| defined (AVX2) - ret.v = _mm256_mul_pd(a.v,b.v); -#endif -#ifdef SSE4 - ret.v = _mm_mul_pd(a.v,b.v); -#endif -#ifdef AVX512 - ret.v = _mm512_mul_pd(a.v,b.v); -#endif -#ifdef QPX - ret.v = vec_mul(a.v,b.v); -#endif - return ret; - }; - - //////////////////////////////////////////////////////////////////// - // General permute; assumes vector length is same across - // all subtypes; may not be a good assumption, but could - // add the vector width as a template param for BG/Q for example - //////////////////////////////////////////////////////////////////// - - friend inline void permute(vRealD &y,vRealD b,int perm) - { - Gpermute(y,b,perm); - } - /* - friend inline void merge(vRealD &y,std::vector &extracted) - { - Gmerge(y,extracted); - } - friend inline void extract(const vRealD &y,std::vector &extracted) - { - Gextract(y,extracted); - } - friend inline void merge(vRealD &y,std::vector &extracted) - { - Gmerge(y,extracted); - } - friend inline void extract(const vRealD &y,std::vector &extracted) - { - Gextract(y,extracted); - } - */ - - friend inline void vsplat(vRealD &ret,double a){ -#if defined (AVX1)|| defined (AVX2) - ret.v = _mm256_set_pd(a,a,a,a); -#endif -#ifdef SSE4 - ret.v = _mm_set_pd(a,a); -#endif -#ifdef AVX512 - ret.v = _mm512_set1_pd(a); -#endif -#ifdef QPX - ret.v = {a,a,a,a}; -#endif - } - friend inline void vset(vRealD &ret, double *a){ -#if defined (AVX1)|| defined (AVX2) - ret.v = _mm256_set_pd(a[3],a[2],a[1],a[0]); -#endif -#ifdef SSE4 - ret.v = _mm_set_pd(a[1],a[0]); -#endif -#ifdef AVX512 - ret.v = _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); - // Note v has a0 a1 a2 a3 a4 a5 a6 a7 -#endif -#ifdef QPX - ret.v = {a[0],a[1],a[2],a[3]}; -#endif - } - - friend inline void vstore(const vRealD &ret, double *a){ -#if defined (AVX1)|| defined (AVX2) - _mm256_store_pd(a,ret.v); -#endif -#ifdef SSE4 - _mm_store_pd(a,ret.v); -#endif -#ifdef AVX512 - _mm512_store_pd(a,ret.v); - // Note v has a7 a6 a5ba4 a3 a2 a1 a0 -#endif -#ifdef QPX - assert(0); -#endif - } - friend inline void vstream(vRealD &out,const vRealD &in){ -#if defined (AVX1)|| defined (AVX2) - _mm256_stream_pd((double *)&out.v,in.v); -#endif -#ifdef SSE4 - _mm_stream_pd((double *)&out.v,in.v); -#endif -#ifdef AVX512 - _mm512_storenrngo_pd((double *)&out.v,in.v); - //Note v has a3 a2 a1 a0 -#endif -#ifdef QPX - assert(0); -#endif - } - friend inline void prefetch(const vRealD &v) - { - _mm_prefetch((const char*)&v.v,_MM_HINT_T0); - } - // Unary negation - friend inline vRealD operator -(const vRealD &r) { - vRealD ret; - vzero(ret); - ret = ret - r; - return ret; - } - - friend inline RealD Reduce(const vRealD & in) - { - vRealD v1,v2; - union { - dvec v; - double f[sizeof(dvec)/sizeof(double)]; - } conv; -#ifdef SSE4 - permute(v1,in,0); // sse 128; paired real double - v1=v1+in; -#endif -#if defined(AVX1) || defined (AVX2) - permute(v1,in,0); // avx 256; quad double - v1=v1+in; - permute(v2,v1,1); - v1=v1+v2; -#endif -#ifdef AVX512 - permute(v1,in,0); // avx 512; octo-double - v1=v1+in; - permute(v2,v1,1); - v1=v1+v2; - permute(v2,v1,2); - v1=v1+v2; -#endif -#ifdef QPX -#endif - conv.v=v1.v; - return conv.f[0]; - } - - // *=,+=,-= operators - inline vRealD &operator *=(const vRealD &r) { - *this = (*this)*r; - return *this; - } - inline vRealD &operator +=(const vRealD &r) { - *this = *this+r; - return *this; - } - inline vRealD &operator -=(const vRealD &r) { - *this = *this-r; - return *this; - } - - public: - static int Nsimd(void) { return sizeof(dvec)/sizeof(double);} - }; - - inline vRealD innerProduct(const vRealD & l, const vRealD & r) { return conjugate(l)*r; } - inline void zeroit(vRealD &z){ vzero(z);} - - inline vRealD outerProduct(const vRealD &l, const vRealD& r) - { - return l*r; - } - inline vRealD trace(const vRealD &arg){ - return arg; - } - inline vRealD real(const vRealD &arg){ - return arg; - } - - -} -#endif diff --git a/lib/simd/Old/Grid_vRealF.h b/lib/simd/Old/Grid_vRealF.h deleted file mode 100644 index 05bfa2f5..00000000 --- a/lib/simd/Old/Grid_vRealF.h +++ /dev/null @@ -1,313 +0,0 @@ -#ifndef GRID_VREALF_H -#define GRID_VREALF_H - - -namespace Grid { - class vRealF { - public: - fvec v; - - public: - typedef fvec vector_type; - typedef RealF scalar_type; - - vRealF()=default; - vRealF(RealF a){ - vsplat(*this,a); - }; - vRealF(Zero &zero){ - zeroit(*this); - } - vRealF & operator = ( Zero & z){ - vzero(*this); - return (*this); - } - //////////////////////////////////// - // Arithmetic operator overloads +,-,* - //////////////////////////////////// - friend inline vRealF operator + ( vRealF a, vRealF b) - { - vRealF ret; -#if defined (AVX1)|| defined (AVX2) - ret.v = _mm256_add_ps(a.v,b.v); -#endif -#ifdef SSE4 - ret.v = _mm_add_ps(a.v,b.v); -#endif -#ifdef AVX512 - ret.v = _mm512_add_ps(a.v,b.v); -#endif -#ifdef QPX - vector4double aa,bb,cc; - aa = vec_lda(0,(float *)&a); - bb = vec_lda(0,(float *)&b); - cc = vec_add(aa,bb); - vec_sta(cc,0,(float *)&ret.v); -#endif - return ret; - }; - - friend inline vRealF operator - ( vRealF a, vRealF b) - { - vRealF ret; -#if defined (AVX1)|| defined (AVX2) - ret.v = _mm256_sub_ps(a.v,b.v); -#endif -#ifdef SSE4 - ret.v = _mm_sub_ps(a.v,b.v); -#endif -#ifdef AVX512 - ret.v = _mm512_sub_ps(a.v,b.v); -#endif -#ifdef QPX - vector4double aa,bb,cc; - aa = vec_lda(0,(float *)&a); - bb = vec_lda(0,(float *)&b); - cc = vec_sub(aa,bb); - vec_sta(cc,0,(float *)&ret.v); -#endif - return ret; - }; - - friend inline vRealF operator * ( vRealF a, vRealF b) - { - vRealF ret; -#if defined (AVX1)|| defined (AVX2) - ret.v = _mm256_mul_ps(a.v,b.v); -#endif -#ifdef SSE4 - ret.v = _mm_mul_ps(a.v,b.v); -#endif -#ifdef AVX512 - ret.v = _mm512_mul_ps(a.v,b.v); -#endif -#ifdef QPX - vector4double aa,bb,cc; // QPX single we are forced to load as this promotes single mem->double regs. - aa = vec_lda(0,(float *)&a); - bb = vec_lda(0,(float *)&b); - cc = vec_mul(aa,bb); - vec_sta(cc,0,(float *)&ret.v); -#endif - return ret; - }; - - /////////////////////////////////////////////// - // mult, sub, add, adj,conjugate, mac functions - /////////////////////////////////////////////// - friend inline void mult(vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) * (*r);} - friend inline void sub (vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) - (*r);} - friend inline void add (vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) + (*r);} - friend inline vRealF adj(const vRealF &in) { return in; } - friend inline vRealF conjugate(const vRealF &in){ return in; } - - friend inline void mac (vRealF &y,const vRealF a,const vRealF x){ -#if defined (AVX1) || defined (SSE4) - y = a*x+y; -#endif -#ifdef AVX2 // AVX 2 introduced FMA support. FMA4 eliminates a copy, but AVX only has FMA3 - // accelerates multiply accumulate, but not general multiply add - y.v = _mm256_fmadd_ps(a.v,x.v,y.v); -#endif -#ifdef AVX512 - y.v = _mm512_fmadd_ps(a.v,x.v,y.v); -#endif -#ifdef QPX - vector4double aa,xx,yy; // QPX single we are forced to load as this promotes single mem->double regs. - aa = vec_lda(0,(float *)&a.v); - xx = vec_lda(0,(float *)&x.v); - yy = vec_lda(0,(float *)&y.v); - yy = vec_madd(aa,xx,yy); - vec_sta(yy,0,(float *)&y.v); -#endif - } - - ////////////////////////////////// - // Initialise to 1,0,i - ////////////////////////////////// - friend inline void vone (vRealF &ret){vsplat(ret,1.0);} - friend inline void vzero(vRealF &ret){vsplat(ret,0.0);} - - - //////////////////////////////////////////////////////////////////// - // General permute; assumes vector length is same across - // all subtypes; may not be a good assumption, but could - // add the vector width as a template param for BG/Q for example - //////////////////////////////////////////////////////////////////// - - friend inline void permute(vRealF &y,vRealF b,int perm) - { - Gpermute(y,b,perm); - } - /* - friend inline void merge(vRealF &y,std::vector &extracted) - { - Gmerge(y,extracted); - } - friend inline void extract(const vRealF &y,std::vector &extracted) - { - Gextract(y,extracted); - } - friend inline void merge(vRealF &y,std::vector &extracted) - { - Gmerge(y,extracted); - } - friend inline void extract(const vRealF &y,std::vector &extracted) - { - Gextract(y,extracted); - } - */ - - ///////////////////////////////////////////////////// - // Broadcast a value across Nsimd copies. - ///////////////////////////////////////////////////// - friend inline void vsplat(vRealF &ret,float a){ -#if defined (AVX1)|| defined (AVX2) - ret.v = _mm256_set_ps(a,a,a,a,a,a,a,a); -#endif -#ifdef SSE4 - ret.v = _mm_set_ps(a,a,a,a); -#endif -#ifdef AVX512 - //ret.v = _mm512_set_ps(a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a); - ret.v = _mm512_set1_ps(a); -#endif -#ifdef QPX - ret.v = {a,a,a,a}; -#endif - } - - - friend inline void vset(vRealF &ret, float *a){ -#if defined (AVX1)|| defined (AVX2) - ret.v = _mm256_set_ps(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); -#endif -#ifdef SSE4 - ret.v = _mm_set_ps(a[3],a[2],a[1],a[0]); -#endif -#ifdef AVX512 - ret.v = _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8], - a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); - // Note v has a0 a1 a2 a3 a4 a5 a6 a7 -#endif -#ifdef QPX - ret.v = {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]}; -#endif - } - - //////////////////////////////////////////////////////////////////////// - // FIXME: gonna remove these load/store, get, set, prefetch - //////////////////////////////////////////////////////////////////////// -friend inline void vstore(const vRealF &ret, float *a){ -#if defined (AVX1)|| defined (AVX2) - _mm256_store_ps(a,ret.v); -#endif -#ifdef SSE4 - _mm_store_ps(a,ret.v); -#endif -#ifdef AVX512 - _mm512_store_ps(a,ret.v); - // Note v has a7 a6 a5ba4 a3 a2 a1 a0 -#endif -#ifdef QPX - assert(0); -#endif - } - friend inline void vstream(vRealF &out,const vRealF &in){ -#if defined (AVX1)|| defined (AVX2) - _mm256_stream_ps((float *)&out.v,in.v); -#endif -#ifdef SSE4 - _mm_stream_ps((float *)&out.v,in.v); -#endif -#ifdef AVX512 - _mm512_storenrngo_ps((float *)&out.v,in.v); - // _mm512_stream_ps((float *)&out.v,in.v); - //Note v has a3 a2 a1 a0 -#endif -#ifdef QPX - assert(0); -#endif - } - - - friend inline void prefetch(const vRealF &v) - { - _mm_prefetch((const char*)&v.v,_MM_HINT_T0); - } - // Unary negation - friend inline vRealF operator -(const vRealF &r) { - vRealF ret; - vzero(ret); - ret = ret - r; - return ret; - } - friend inline RealF Reduce(const vRealF & in) - { - vRealF v1,v2; - union { - fvec v; - float f[sizeof(fvec)/sizeof(double)]; - } conv; -#ifdef SSE4 - permute(v1,in,0); // sse 128; quad single - v1=v1+in; - permute(v2,v1,1); - v1=v1+v2; -#endif -#if defined(AVX1) || defined (AVX2) - permute(v1,in,0); // avx 256; octo-double - v1=v1+in; - permute(v2,v1,1); - v1=v1+v2; - permute(v2,v1,2); - v1=v1+v2; -#endif -#ifdef AVX512 - permute(v1,in,0); // avx 256; octo-double - v1=v1+in; - permute(v2,v1,1); - v1=v1+v2; - permute(v2,v1,2); - v1=v1+v2; - permute(v2,v1,3); - v1=v1+v2; -#endif -#ifdef QPX -#endif - conv.v=v1.v; - return conv.f[0]; - } - - // *=,+=,-= operators - inline vRealF &operator *=(const vRealF &r) { - *this = (*this)*r; - return *this; - } - inline vRealF &operator +=(const vRealF &r) { - *this = *this+r; - return *this; - } - inline vRealF &operator -=(const vRealF &r) { - *this = *this-r; - return *this; - } - public: - static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);} - }; - inline vRealF innerProduct(const vRealF & l, const vRealF & r) { return conjugate(l)*r; } - inline void zeroit(vRealF &z){ vzero(z);} - - inline vRealF outerProduct(const vRealF &l, const vRealF& r) - { - return l*r; - } - inline vRealF trace(const vRealF &arg){ - return arg; - } - inline vRealF real(const vRealF &arg){ - return arg; - } - - -} -#endif diff --git a/lib/tensors/Tensor_Ta.h b/lib/tensors/Tensor_Ta.h index 3a903364..f06dbf04 100644 --- a/lib/tensors/Tensor_Ta.h +++ b/lib/tensors/Tensor_Ta.h @@ -29,15 +29,87 @@ namespace Grid { { iMatrix ret(arg); double factor = (1/(double)N); - for(int c1=0;c1 inline iScalar ProjectOnGroup(const iScalar&r) + { + iScalar ret; + ret._internal = ProjectOnGroup(r._internal); + return ret; + } + template inline iVector ProjectOnGroup(const iVector&r) + { + iVector ret; + for(int i=0;i::TensorLevel == 0 >::type * =nullptr> + inline iMatrix ProjectOnGroup(const iMatrix &arg) + { + // need a check for the group type? + iMatrix ret(arg); + double nrm; + for(int c1=0;c1 inline iScalar Exponentiate(const iScalar&r, double alpha, int Nexp) + { + iScalar ret; + ret._internal = Exponentiate(r._internal, alpha, Nexp); + return ret; + } + + + template::TensorLevel == 0 >::type * =nullptr> + inline iMatrix Exponentiate(const iMatrix &arg, double alpha, int Nexp) + { + iMatrix unit(1.0); + iMatrix temp(unit); + + for(int i=Nexp; i>=1;--i){ + temp *= alpha/double(i); + temp = unit + temp*arg; + } + return ProjectOnGroup(temp); + } + + + } #endif diff --git a/tests/InvSqrt.gnu b/tests/InvSqrt.gnu new file mode 100644 index 00000000..e69de29b diff --git a/tests/Sqrt.gnu b/tests/Sqrt.gnu new file mode 100644 index 00000000..ae56ab97 --- /dev/null +++ b/tests/Sqrt.gnu @@ -0,0 +1,2 @@ +f(x) = 6.81384+(-2.34645e-06/(x+0.000228091))+(-1.51593e-05/(x+0.00112084))+(-6.89254e-05/(x+0.003496))+(-0.000288983/(x+0.00954309))+(-0.00119277/(x+0.024928))+(-0.0050183/(x+0.0646627))+(-0.0226449/(x+0.171576))+(-0.123767/(x+0.491792))+(-1.1705/(x+1.78667))+(-102.992/(x+18.4866)); +f(x) = 0.14676+(0.00952992/(x+5.40933e-05))+(0.0115952/(x+0.000559699))+(0.0161824/(x+0.00203338))+(0.0243252/(x+0.00582831))+(0.0379533/(x+0.0154649))+(0.060699/(x+0.0401156))+(0.100345/(x+0.104788))+(0.178335/(x+0.286042))+(0.381586/(x+0.892189))+(1.42625/(x+4.38422)); diff --git a/tests/Test_main.cc b/tests/Test_main.cc index cef2d848..88d69192 100644 --- a/tests/Test_main.cc +++ b/tests/Test_main.cc @@ -223,6 +223,9 @@ int main (int argc, char ** argv) //TComplex tracecm= trace(cm); //std::cout << cm << " "<< tracecm << std::endl; + cm = ProjectOnGroup(cm); + + cm = Exponentiate(cm, 1.0, 10); // Foo = Foo+scalar; // LatticeColourMatrix+Scalar // Foo = Foo*scalar; // LatticeColourMatrix*Scalar