From 9e29ac65491bc560516297ef1b2750b6e6356bfe Mon Sep 17 00:00:00 2001
From: neo
Date: Fri, 22 May 2015 17:33:15 +0900
Subject: [PATCH] Completed implementation of new Grid_simd classes

Tested performance for SSE4: OK. AVX1/2 and AVX512 are as yet untested.
---
 configure.ac                        |   4 +-
 lib/Grid_simd.h                     |   6 +-
 lib/Makefile.am                     |  10 +-
 lib/cshift/Grid_cshift_common.h     |   3 +-
 lib/simd/Grid_avx.h                 | 401 ++++++++++++++++++++++++++++
 lib/simd/Grid_knc.h                 | 323 ++++++++++++++++++++++
 lib/simd/Grid_qpx.h                 | 272 +++++++++++++++++++
 lib/simd/Grid_sse4.h                |  10 +-
 lib/simd/Grid_vector_types.h        |  94 ++++---
 lib/simd/{ => Old}/Grid_vComplexD.h |   0
 lib/simd/{ => Old}/Grid_vComplexF.h |   2 +-
 lib/simd/{ => Old}/Grid_vInteger.h  |   0
 lib/simd/{ => Old}/Grid_vRealD.h    |   0
 lib/simd/{ => Old}/Grid_vRealF.h    |   0
 tests/Grid_main.cc                  |  42 ++-
 tests/Makefile.am                   |   6 +-
 16 files changed, 1091 insertions(+), 82 deletions(-)
 create mode 100644 lib/simd/Grid_avx.h
 create mode 100644 lib/simd/Grid_knc.h
 create mode 100644 lib/simd/Grid_qpx.h
 rename lib/simd/{ => Old}/Grid_vComplexD.h (100%)
 rename lib/simd/{ => Old}/Grid_vComplexF.h (99%)
 rename lib/simd/{ => Old}/Grid_vInteger.h (100%)
 rename lib/simd/{ => Old}/Grid_vRealD.h (100%)
 rename lib/simd/{ => Old}/Grid_vRealF.h (100%)

diff --git a/configure.ac b/configure.ac
index 93e2b574..bfcbfcef 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3,9 +3,9 @@
 #
 # Project Grid package
 #
-# Time-stamp: <2015-05-19 13:51:08 neo>
+# Time-stamp: <2015-05-22 15:46:09 neo>
 
-AC_PREREQ([2.69])
+AC_PREREQ([2.63])
 AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk])
 AC_CANONICAL_SYSTEM
 AM_INIT_AUTOMAKE(subdir-objects)
diff --git a/lib/Grid_simd.h b/lib/Grid_simd.h
index 6d7411d3..9484c0d6 100644
--- a/lib/Grid_simd.h
+++ b/lib/Grid_simd.h
@@ -203,11 +203,7 @@ inline void Gpermute(vsimd &y,const vsimd &b,int perm){
 };
 };
 
-#include <simd/Grid_vRealF.h>
-#include <simd/Grid_vRealD.h>
-#include <simd/Grid_vComplexF.h>
-#include <simd/Grid_vComplexD.h>
-#include <simd/Grid_vInteger.h>
+#include <simd/Grid_vector_types.h>
 
 namespace Grid {
 
diff --git a/lib/Makefile.am b/lib/Makefile.am
index 82459763..cdb36c71 100644
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -91,12 +91,10 @@ nobase_include_HEADERS = algorithms/approx/bigfloat.h \
 	qcd/Grid_qcd_2spinor.h \
 	qcd/Grid_qcd_dirac.h \
 	qcd/Grid_qcd_wilson_dop.h \
-	simd/Grid_vComplexD.h \
-	simd/Grid_vComplexF.h \
-	simd/Grid_vInteger.h \
-	simd/Grid_vRealD.h \
-	simd/Grid_vRealF.h \
 	simd/Grid_vector_types.h \
-	simd/Grid_sse4.h
+	simd/Grid_sse4.h \
+	simd/Grid_avx.h \
+	simd/Grid_knc.h
+
diff --git a/lib/cshift/Grid_cshift_common.h b/lib/cshift/Grid_cshift_common.h
index cf1f88af..65c0cb87 100644
--- a/lib/cshift/Grid_cshift_common.h
+++ b/lib/cshift/Grid_cshift_common.h
@@ -154,7 +154,6 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int
     cbmask=0x3;
   }
 
-
   int ro  = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
   int lo  = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
 
@@ -170,10 +169,12 @@ PARALLEL_NESTED_LOOP2
     }
   }
 
+
 }
 
 template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
 {
+
   int rd = rhs._grid->_rdimensions[dimension];
 
   if ( !rhs._grid->CheckerBoarded(dimension) ) {
diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h
new file mode 100644
index 00000000..ea796122
--- /dev/null
+++ b/lib/simd/Grid_avx.h
@@ -0,0 +1,401 @@
+//----------------------------------------------------------------------
+/*! @file Grid_avx.h
+  @brief Optimization libraries for the AVX1/2 instruction sets
+
+  Using intrinsics
+*/
+// Time-stamp: <2015-05-22 15:51:24 neo>
+//----------------------------------------------------------------------
+
+#include <immintrin.h>
+// _mm256_set_m128i(hi,lo); // not defined in all versions of immintrin.h
+#ifndef _mm256_set_m128i
+#define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1)
+#endif
+
+namespace Optimization {
+
+  struct Vsplat{
+    //Complex float
+    inline __m256 operator()(float a, float b){
+      return _mm256_set_ps(b,a,b,a,b,a,b,a);
+    }
+    // Real float
+    inline __m256 operator()(float a){
+      return _mm256_set_ps(a,a,a,a,a,a,a,a);
+    }
+    //Complex double
+    inline __m256d operator()(double a, double b){
+      return _mm256_set_pd(b,a,b,a);
+    }
+    //Real double
+    inline __m256d operator()(double a){
+      return _mm256_set_pd(a,a,a,a);
+    }
+    //Integer
+    inline __m256i operator()(Integer a){
+      return _mm256_set1_epi32(a);
+    }
+  };
+
+  struct Vstore{
+    //Float
+    inline void operator()(__m256 a, float* F){
+      _mm256_store_ps(F,a);
+    }
+    //Double
+    inline void operator()(__m256d a, double* D){
+      _mm256_store_pd(D,a);
+    }
+    //Integer
+    inline void operator()(__m256i a, Integer* I){
+      _mm256_store_si256((__m256i*)I,a);
+    }
+
+  };
+
+
+  struct Vstream{
+    //Float
+    inline void operator()(float * a, __m256 b){
+      _mm256_stream_ps(a,b);
+    }
+    //Double
+    inline void operator()(double * a, __m256d b){
+      _mm256_stream_pd(a,b);
+    }
+
+
+  };
+
+
+
+  struct Vset{
+    // Complex float
+    inline __m256 operator()(Grid::ComplexF *a){
+      return _mm256_set_ps(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
+    }
+    // Complex double
+    inline __m256d operator()(Grid::ComplexD *a){
+      return _mm256_set_pd(a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
+    }
+    // Real float
+    inline __m256 operator()(float *a){
+      return _mm256_set_ps(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
+    }
+    // Real double
+    inline __m256d operator()(double *a){
+      return _mm256_set_pd(a[3],a[2],a[1],a[0]);
+    }
+    // Integer
+    inline __m256i operator()(Integer *a){
+      return _mm256_set_epi32(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
+    }
+
+
+  };
+
+  template <typename Out_type, typename In_type>
+  struct Reduce{
+    //Need templated class to overload output type
+    //General form must generate error if compiled
+    inline Out_type operator()(In_type in){
+      printf("Error, using wrong Reduce function\n");
+      exit(1);
+      return 0;
+    }
+  };
+
+
+
+
+  /////////////////////////////////////////////////////
+  // Arithmetic operations
+  /////////////////////////////////////////////////////
+  struct Sum{
+    //Complex/Real float
+    inline __m256 operator()(__m256 a, __m256 b){
+      return _mm256_add_ps(a,b);
+    }
+    //Complex/Real double
+    inline __m256d operator()(__m256d a, __m256d b){
+      return _mm256_add_pd(a,b);
+    }
+    //Integer
+    inline __m256i operator()(__m256i a, __m256i b){
+#if defined (AVX1)
+      __m128i a0,a1;
+      __m128i b0,b1;
+      a0 = _mm256_extractf128_si256(a,0);
+      b0 = _mm256_extractf128_si256(b,0);
+      a1 = _mm256_extractf128_si256(a,1);
+      b1 = _mm256_extractf128_si256(b,1);
+      a0 = _mm_add_epi32(a0,b0);
+      a1 = _mm_add_epi32(a1,b1);
+      return _mm256_set_m128i(a1,a0);
+#endif
+#if defined (AVX2)
+      return _mm256_add_epi32(a,b);
+#endif
+
+    }
+  };
+
+  struct Sub{
+    //Complex/Real float
+    inline __m256 operator()(__m256 a, __m256 b){
+      return _mm256_sub_ps(a,b);
+    }
+    //Complex/Real double
+    inline __m256d operator()(__m256d a, __m256d b){
+      return _mm256_sub_pd(a,b);
+    }
+    //Integer
+    inline __m256i operator()(__m256i a, __m256i b){
+#if defined (AVX1)
+      __m128i a0,a1;
+      __m128i b0,b1;
+      a0 = _mm256_extractf128_si256(a,0);
+      b0 = _mm256_extractf128_si256(b,0);
+      a1 = _mm256_extractf128_si256(a,1);
+      b1 = _mm256_extractf128_si256(b,1);
+      a0 = _mm_sub_epi32(a0,b0);
+      a1 = _mm_sub_epi32(a1,b1);
+      return _mm256_set_m128i(a1,a0);
+#endif
+#if defined (AVX2)
+      return _mm256_sub_epi32(a,b);
+#endif
+
+    }
+  };
+
+
+  struct MultComplex{
+    // Complex float
+    inline __m256 operator()(__m256 a, __m256 b){
+      __m256 ymm0,ymm1,ymm2;
+      ymm0 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
+      ymm0 = _mm256_mul_ps(ymm0,b);                       // ymm0 <- ar bi, ar br
+      // FIXME AVX2 could MAC
+      ymm1 = _mm256_shuffle_ps(b,b,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
+      ymm2 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
+      ymm1 = _mm256_mul_ps(ymm1,ymm2);                    // ymm1 <- br ai, ai bi
+      return _mm256_addsub_ps(ymm0,ymm1);
+    }
+    // Complex double
+    inline __m256d operator()(__m256d a, __m256d b){
+      //Multiplication of (ak+ibk)*(ck+idk)
+      // a + i b can be stored as a data structure
+      //From intel optimisation reference guide
+      /*
+	movsldup xmm0, Src1; load real parts into the destination,
+	; a1, a1, a0, a0
+	movaps xmm1, src2; load the 2nd pair of complex values, ; i.e. d1, c1, d0, c0
+	mulps xmm0, xmm1; temporary results, a1d1, a1c1, a0d0, ; a0c0
+	shufps xmm1, xmm1, b1; reorder the real and imaginary ; parts, c1, d1, c0, d0
+	movshdup xmm2, Src1; load the imaginary parts into the ; destination, b1, b1, b0, b0
+	mulps xmm2, xmm1; temporary results, b1c1, b1d1, b0c0, ; b0d0
+	addsubps xmm0, xmm2; b1c1+a1d1, a1c1 -b1d1, b0c0+a0d
+	VSHUFPD (VEX.256 encoded version)
+	IF IMM0[0] = 0
+	THEN DEST[63:0]=SRC1[63:0] ELSE DEST[63:0]=SRC1[127:64] FI;
+	IF IMM0[1] = 0
+	THEN DEST[127:64]=SRC2[63:0] ELSE DEST[127:64]=SRC2[127:64] FI;
+	IF IMM0[2] = 0
+	THEN DEST[191:128]=SRC1[191:128] ELSE DEST[191:128]=SRC1[255:192] FI;
+	IF IMM0[3] = 0
+	THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // Ox5 r<->i ; 0xC unchanged
+      */
+      __m256d ymm0,ymm1,ymm2;
+      ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
+      ymm0 = _mm256_mul_pd(ymm0,b);      // ymm0 <- ar bi, ar br
+      ymm1 = _mm256_shuffle_pd(b,b,0x5); // ymm1 <- br,bi    b'01,01
+      ymm2 = _mm256_shuffle_pd(a,a,0xF); // ymm2 <- ai,ai    b'11,11
+      ymm1 = _mm256_mul_pd(ymm1,ymm2);   // ymm1 <- br ai, ai bi
+      return _mm256_addsub_pd(ymm0,ymm1);
+    }
+  };
+
+  struct Mult{
+    // Real float
+    inline __m256 operator()(__m256 a, __m256 b){
+      return _mm256_mul_ps(a,b);
+    }
+    // Real double
+    inline __m256d operator()(__m256d a, __m256d b){
+      return _mm256_mul_pd(a,b);
+    }
+    // Integer
+    inline __m256i operator()(__m256i a, __m256i b){
+#if defined (AVX1)
+      __m128i a0,a1;
+      __m128i b0,b1;
+      a0 = _mm256_extractf128_si256(a,0);
+      b0 = _mm256_extractf128_si256(b,0);
+      a1 = _mm256_extractf128_si256(a,1);
+      b1 = _mm256_extractf128_si256(b,1);
+      a0 = _mm_mul_epi32(a0,b0);
+      a1 = _mm_mul_epi32(a1,b1);
+      return _mm256_set_m128i(a1,a0);
+#endif
+#if defined (AVX2)
+      return _mm256_mul_epi32(a,b);
+#endif
+
+    }
+  };
+
+
+  struct Conj{
+    // Complex single
+    inline __m256 operator()(__m256 in){
+      return _mm256_xor_ps(_mm256_addsub_ps(_mm256_setzero_ps(),in), _mm256_set1_ps(-0.f));
+    }
+    // Complex double
+    inline __m256d operator()(__m256d in){
+      return _mm256_xor_pd(_mm256_addsub_pd(_mm256_setzero_pd(),in), _mm256_set1_pd(-0.f));//untested
+      /*
+	// original
+	// addsubps 0, inv=>0+in.v[3] 0-in.v[2], 0+in.v[1], 0-in.v[0], ...
+	__m256d tmp = _mm256_addsub_pd(_mm256_setzero_pd(),_mm256_shuffle_pd(in,in,0x5));
+	return _mm256_shuffle_pd(tmp,tmp,0x5);
+      */
+    }
+    // do not define for integer input
+  };
+
+  struct TimesMinusI{
+    //Complex single
+    inline __m256 operator()(__m256 in, __m256 ret){
+      __m256 tmp =_mm256_addsub_ps(_mm256_setzero_ps(),in);   // -r,i
+      return _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1)); //  i,-r
+    }
+    //Complex double
+    inline __m256d operator()(__m256d in, __m256d ret){
+      __m256d tmp = _mm256_addsub_pd(_mm256_setzero_pd(),in); // -r,i
+      return _mm256_shuffle_pd(tmp,tmp,0x5);                  //  i,-r
+    }
+  };
+
+  struct TimesI{
+    //Complex single
+    inline __m256 operator()(__m256 in, __m256 ret){
+      __m256 tmp =_mm256_shuffle_ps(in,in,_MM_SHUFFLE(2,3,0,1)); // i,r
+      return _mm256_addsub_ps(_mm256_setzero_ps(),tmp);          // -i,r
+    }
+    //Complex double
+    inline __m256d operator()(__m256d in, __m256d ret){
+      __m256d tmp = _mm256_shuffle_pd(in,in,0x5);                // i,r
+      return _mm256_addsub_pd(_mm256_setzero_pd(),tmp);          // -i,r
+    }
+  };
+
+
+
+
+
+  //////////////////////////////////////////////
+  // Some Template specialization
+  template < typename vtype >
+  void permute(vtype &a, vtype b, int perm) {   // a by reference: the permuted value is written back
+    union {
+      __m256 f;
+      vtype v;
+    } conv;
+    conv.v = b;
+    switch (perm){
+      // 8x32 bits=>3 permutes
+    case 2: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
+    case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
+    case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break;
+    default: assert(0); break;
+    }
+    a = conv.v;
+
+  }
+
+  //Complex float Reduce
+  template<>
+  inline Grid::ComplexF Reduce<Grid::ComplexF, __m256>::operator()(__m256 in){
+    __m256 v1,v2;
+    Optimization::permute(v1,in,0); // sse 128; paired complex single
+    v1 = _mm256_add_ps(v1,in);
+    Optimization::permute(v2,v1,1); // avx 256; quad complex single
+    v1 = _mm256_add_ps(v1,v2);
+    return Grid::ComplexF(v1[0],v1[1]);
+  }
+  //Real float Reduce
+  template<>
+  inline Grid::RealF Reduce<Grid::RealF, __m256>::operator()(__m256 in){
+    __m256 v1,v2;
+    Optimization::permute(v1,in,0); // avx 256; octo-float
+    v1 = _mm256_add_ps(v1,in);
+    Optimization::permute(v2,v1,1);
+    v1 = _mm256_add_ps(v1,v2);
+    Optimization::permute(v2,v1,2);
+    v1 = _mm256_add_ps(v1,v2);
+    return v1[0];
+  }
+
+
+  //Complex double Reduce
+  template<>
+  inline Grid::ComplexD Reduce<Grid::ComplexD, __m256d>::operator()(__m256d in){
+    __m256d v1;
+    Optimization::permute(v1,in,0); // sse 128; paired complex double
+    v1 = _mm256_add_pd(v1,in);
+    return Grid::ComplexD(v1[0],v1[1]);
+  }
+
+  //Real double Reduce
+  template<>
+  inline Grid::RealD Reduce<Grid::RealD, __m256d>::operator()(__m256d in){
+    __m256d v1,v2;
+    Optimization::permute(v1,in,0); // avx 256; quad double
+    v1 = _mm256_add_pd(v1,in);
+    Optimization::permute(v2,v1,1);
+    v1 = _mm256_add_pd(v1,v2);
+    return v1[0];
+  }
+
+  //Integer Reduce
+  template<>
+  inline Integer Reduce<Integer, __m256i>::operator()(__m256i in){
+    // FIXME unimplemented
+    printf("Reduce : Missing integer implementation -> FIX\n");
+    assert(0);
+  }
+
+
+}
+
+//////////////////////////////////////////////////////////////////////////////////////
+// Here assign types
+namespace Grid {
+  typedef __m256  SIMD_Ftype; // Single precision type
+  typedef __m256d SIMD_Dtype; // Double precision type
+  typedef __m256i SIMD_Itype; // Integer type
+
+
+  // Function name aliases
+  typedef Optimization::Vsplat   VsplatSIMD;
+  typedef Optimization::Vstore   VstoreSIMD;
+  typedef Optimization::Vset     VsetSIMD;
+  typedef Optimization::Vstream  VstreamSIMD;
+  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
+
+
+  // Arithmetic operations
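+  // A usage sketch (an illustration, not part of the API; the names assume
+  // the Grid_simd wrapper in Grid_vector_types.h): the wrapper never touches
+  // intrinsics directly, it dispatches through the functors aliased here,
+  // e.g. for two vRealF values a and b on AVX:
+  //   a.v = binary<__m256>(a.v, b.v, SumSIMD());  // resolves to _mm256_add_ps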
+  typedef Optimization::Sum         SumSIMD;
+  typedef Optimization::Sub         SubSIMD;
+  typedef Optimization::Mult        MultSIMD;
+  typedef Optimization::MultComplex MultComplexSIMD;
+  typedef Optimization::Conj        ConjSIMD;
+  typedef Optimization::TimesMinusI TimesMinusISIMD;
+  typedef Optimization::TimesI      TimesISIMD;
+
+}
diff --git a/lib/simd/Grid_knc.h b/lib/simd/Grid_knc.h
new file mode 100644
index 00000000..daec3973
--- /dev/null
+++ b/lib/simd/Grid_knc.h
@@ -0,0 +1,323 @@
+//----------------------------------------------------------------------
+/*! @file Grid_knc.h
+  @brief Optimization libraries for the AVX512 instruction set for KNC
+
+  Using intrinsics
+*/
+// Time-stamp: <2015-05-22 17:12:44 neo>
+//----------------------------------------------------------------------
+
+#include <immintrin.h>
+#ifndef KNC_ONLY_STORES
+#define _mm512_storenrngo_ps _mm512_store_ps  // not present in AVX512
+#define _mm512_storenrngo_pd _mm512_store_pd  // not present in AVX512
+#endif
+
+
+namespace Optimization {
+
+  struct Vsplat{
+    //Complex float
+    inline __m512 operator()(float a, float b){
+      return _mm512_set_ps(b,a,b,a,b,a,b,a,b,a,b,a,b,a,b,a);
+    }
+    // Real float
+    inline __m512 operator()(float a){
+      return _mm512_set1_ps(a);
+    }
+    //Complex double
+    inline __m512d operator()(double a, double b){
+      return _mm512_set_pd(b,a,b,a,b,a,b,a);
+    }
+    //Real double
+    inline __m512d operator()(double a){
+      return _mm512_set1_pd(a);
+    }
+    //Integer
+    inline __m512i operator()(Integer a){
+      return _mm512_set1_epi32(a);
+    }
+  };
+
+  struct Vstore{
+    //Float
+    inline void operator()(__m512 a, float* F){
+      _mm512_store_ps(F,a);
+    }
+    //Double
+    inline void operator()(__m512d a, double* D){
+      _mm512_store_pd(D,a);
+    }
+    //Integer
+    inline void operator()(__m512i a, Integer* I){
+      _mm512_store_si512((__m512i *)I,a);
+    }
+
+  };
+
+
+  struct Vstream{
+    //Float
+    inline void operator()(float * a, __m512 b){
+      _mm512_storenrngo_ps(a,b);
+    }
+    //Double
+    inline void operator()(double * a, __m512d b){
+      _mm512_storenrngo_pd(a,b);
+    }
+
+
+  };
+
+
+
+  struct Vset{
+    // Complex float
+    inline __m512 operator()(Grid::ComplexF *a){
+      return _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(),
+			   a[5].imag(),a[5].real(),a[4].imag(),a[4].real(),
+			   a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
+			   a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
+    }
+    // Complex double
+    inline __m512d operator()(Grid::ComplexD *a){
+      return _mm512_set_pd(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
+			   a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
+    }
+    // Real float
+    inline __m512 operator()(float *a){
+      return _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
+			    a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
+    }
+    // Real double
+    inline __m512d operator()(double *a){
+      return _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
+    }
+    // Integer
+    inline __m512i operator()(Integer *a){
+      return _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
+			       a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
+    }
+
+
+  };
+
+  template <typename Out_type, typename In_type>
+  struct Reduce{
+    //Need templated class to overload output type
+    //General form must generate error if compiled
+    inline Out_type operator()(In_type in){
+      printf("Error, using wrong Reduce function\n");
+      exit(1);
+      return 0;
+    }
+  };
+
+
+
+
+  /////////////////////////////////////////////////////
+  // Arithmetic operations
+  /////////////////////////////////////////////////////
+  struct Sum{
+    //Complex/Real float
+    inline __m512 operator()(__m512 a, __m512 b){
+      return _mm512_add_ps(a,b);
+    }
+    //Complex/Real double
+    inline __m512d operator()(__m512d a, __m512d b){
+      return _mm512_add_pd(a,b);
+    }
+    //Integer
+    inline __m512i operator()(__m512i a, __m512i b){
+      return _mm512_add_epi32(a,b);
+    }
+  };
+
+  struct Sub{
+    //Complex/Real float
+    inline __m512 operator()(__m512 a, __m512 b){
+      return _mm512_sub_ps(a,b);
+    }
+    //Complex/Real double
+    inline __m512d operator()(__m512d a, __m512d b){
+      return _mm512_sub_pd(a,b);
+    }
+    //Integer
+    inline __m512i operator()(__m512i a, __m512i b){
+      return _mm512_sub_epi32(a,b);
+    }
+  };
+
+
+  struct MultComplex{
+    // Complex float
+    inline __m512 operator()(__m512 a, __m512 b){
+      __m512 vzero,ymm0,ymm1,real, imag;
+      vzero = _mm512_setzero_ps();
+      ymm0  = _mm512_swizzle_ps(a, _MM_SWIZ_REG_CDAB); //
+      real  = (__m512)_mm512_mask_or_epi32((__m512i)a, 0xAAAA,(__m512i)vzero,(__m512i)ymm0);
+      imag  = _mm512_mask_sub_ps(a, 0x5555,vzero, ymm0);
+      ymm1  = _mm512_mul_ps(real, b);
+      ymm0  = _mm512_swizzle_ps(b, _MM_SWIZ_REG_CDAB); // OK
+      return _mm512_fmadd_ps(ymm0,imag,ymm1);
+    }
+    // Complex double
+    inline __m512d operator()(__m512d a, __m512d b){
+      /* This is from
+       * Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets
+       * @inproceedings{McFarlin:2011:ASV:1995896.1995938,
+       * author = {McFarlin, Daniel S. and Arbatov, Volodymyr and Franchetti, Franz and P\"{u}schel, Markus},
+       * title = {Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets},
+       * booktitle = {Proceedings of the International Conference on Supercomputing},
+       * series = {ICS '11},
+       * year = {2011},
+       * isbn = {978-1-4503-0102-2},
+       * location = {Tucson, Arizona, USA},
+       * pages = {265--274},
+       * numpages = {10},
+       * url = {http://doi.acm.org/10.1145/1995896.1995938},
+       * doi = {10.1145/1995896.1995938},
+       * acmid = {1995938},
+       * publisher = {ACM},
+       * address = {New York, NY, USA},
+       * keywords = {autovectorization, fourier transform, program generation, simd, super-optimization},
+       * }
+       */
+      __m512d vzero,ymm0,ymm1,real,imag;
+      vzero = _mm512_setzero_pd();
+      ymm0  = _mm512_swizzle_pd(a, _MM_SWIZ_REG_CDAB); //
+      real  = (__m512d)_mm512_mask_or_epi64((__m512i)a, 0xAA,(__m512i)vzero,(__m512i) ymm0);
+      imag  = _mm512_mask_sub_pd(a, 0x55,vzero, ymm0);
+      ymm1  = _mm512_mul_pd(real, b);
+      ymm0  = _mm512_swizzle_pd(b, _MM_SWIZ_REG_CDAB); // OK
+      return _mm512_fmadd_pd(ymm0,imag,ymm1);
+    }
+  };
+
+  struct Mult{
+    // Real float
+    inline __m512 operator()(__m512 a, __m512 b){
+      return _mm512_mul_ps(a,b);
+    }
+    // Real double
+    inline __m512d operator()(__m512d a, __m512d b){
+      return _mm512_mul_pd(a,b);
+    }
+    // Integer
+    inline __m512i operator()(__m512i a, __m512i b){
+      return _mm512_mullo_epi32(a,b);
+    }
+  };
+
+
+  struct Conj{
+    // Complex single
+    inline __m512 operator()(__m512 in){
+      return _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // Zero out 0+real 0-imag
+    }
+    // Complex double
+    inline __m512d operator()(__m512d in){
+      return _mm512_mask_sub_pd(in, 0xaa,_mm512_setzero_pd(), in);
+    }
+    // do not define for integer input
+  };
+
+  struct TimesMinusI{
+    //Complex single
+    inline __m512 operator()(__m512 in, __m512 ret){
+      __m512 tmp = _mm512_swizzle_ps(in, _MM_SWIZ_REG_CDAB);          // imag, real
+      return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);  // imag,-real
+    }
+    //Complex double
+    inline __m512d operator()(__m512d in, __m512d ret){
+      __m512d tmp = _mm512_swizzle_pd(in, _MM_SWIZ_REG_CDAB);         // imag, real
+      return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);    // imag,-real
+    }
+
+
+  };
+
+  struct TimesI{
+    //Complex single
+    inline __m512 operator()(__m512 in, __m512 ret){
+      __m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real,-imag
+      return _mm512_swizzle_ps(tmp, _MM_SWIZ_REG_CDAB);                  // -imag, real
+    }
+    //Complex double
+    inline __m512d operator()(__m512d in, __m512d ret){
+      __m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in);  // real,-imag
+      return _mm512_swizzle_pd(tmp, _MM_SWIZ_REG_CDAB);                  // -imag, real
+    }
+
+
+  };
+
+
+
+
+
+  //////////////////////////////////////////////
+  // Some Template specialization
+
+  //Complex float Reduce
+  template<>
+  inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
+    return Grid::ComplexF(_mm512_mask_reduce_add_ps(0x5555, in),_mm512_mask_reduce_add_ps(0xAAAA, in));
+  }
+  //Real float Reduce
+  template<>
+  inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
+    return _mm512_reduce_add_ps(in);
+  }
+
+
+  //Complex double Reduce
+  template<>
+  inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
+    return Grid::ComplexD(_mm512_mask_reduce_add_pd(0x55, in),_mm512_mask_reduce_add_pd(0xAA, in));
+  }
+
+  //Real double Reduce
+  template<>
+  inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
+    return _mm512_reduce_add_pd(in);
+  }
+
+  //Integer Reduce
+  template<>
+  inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
+    // FIXME unimplemented
+    printf("Reduce : Missing integer implementation -> FIX\n");
+    assert(0);
+  }
+
+
+}
+
+//////////////////////////////////////////////////////////////////////////////////////
+// Here assign types
+namespace Grid {
+  typedef __m512  SIMD_Ftype; // Single precision type
+  typedef __m512d SIMD_Dtype; // Double precision type
+  typedef __m512i SIMD_Itype; // Integer type
+
+
+  // Function name aliases
+  typedef Optimization::Vsplat   VsplatSIMD;
+  typedef Optimization::Vstore   VstoreSIMD;
+  typedef Optimization::Vset     VsetSIMD;
+  typedef Optimization::Vstream  VstreamSIMD;
+  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
+
+
+  // Arithmetic operations
+  typedef Optimization::Sum         SumSIMD;
+  typedef Optimization::Sub         SubSIMD;
+  typedef Optimization::Mult        MultSIMD;
+  typedef Optimization::MultComplex MultComplexSIMD;
+  typedef Optimization::Conj        ConjSIMD;
+  typedef Optimization::TimesMinusI TimesMinusISIMD;
+  typedef Optimization::TimesI      TimesISIMD;
+
+}
diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h
new file mode 100644
index 00000000..dc0251f8
--- /dev/null
+++ b/lib/simd/Grid_qpx.h
@@ -0,0 +1,272 @@
+//----------------------------------------------------------------------
+/*! @file Grid_qpx.h
+  @brief Optimization libraries for the QPX instruction set for BG/Q
+
+  Using intrinsics
+*/
+// Time-stamp: <2015-05-22 17:29:26 neo>
+//----------------------------------------------------------------------
+
+// lot of undefined functions
+
+namespace Optimization {
+
+  struct Vsplat{
+    //Complex float
+    inline float operator()(float a, float b){
+      return {a,b,a,b};
+    }
+    // Real float
+    inline float operator()(float a){
+      return {a,a,a,a};
+    }
+    //Complex double
+    inline vector4double operator()(double a, double b){
+      return {a,b,a,b};
+    }
+    //Real double
+    inline vector4double operator()(double a){
+      return {a,a,a,a};
+    }
+    //Integer
+    inline int operator()(Integer a){
+#error
+    }
+  };
+
+  struct Vstore{
+    //Float
+    inline void operator()(float a, float* F){
+      assert(0);
+    }
+    //Double
+    inline void operator()(vector4double a, double* D){
+      assert(0);
+    }
+    //Integer
+    inline void operator()(int a, Integer* I){
+      assert(0);
+    }
+
+  };
+
+
+  struct Vstream{
+    //Float
+    inline void operator()(float * a, float b){
+      assert(0);
+    }
+    //Double
+    inline void operator()(double * a, vector4double b){
+      assert(0);
+    }
+
+
+  };
+
+
+
+  struct Vset{
+    // Complex float
+    inline float operator()(Grid::ComplexF *a){
+      return {a[0].real(),a[0].imag(),a[1].real(),a[1].imag(),a[2].real(),a[2].imag(),a[3].real(),a[3].imag()};
+    }
+    // Complex double
+    inline vector4double operator()(Grid::ComplexD *a){
+      return {a[0].real(),a[0].imag(),a[1].real(),a[1].imag(),a[2].real(),a[2].imag(),a[3].real(),a[3].imag()};
+    }
+    // Real float
+    inline float operator()(float *a){
+      return {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]};
+    }
+    // Real double
+    inline vector4double operator()(double *a){
+      return {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]};
+    }
+    // Integer
+    inline int operator()(Integer *a){
+#error
+    }
+
+
+  };
+
+  template <typename Out_type, typename In_type>
+  struct Reduce{
+    //Need templated class to overload output type
+    //General form must generate error if compiled
+    inline Out_type operator()(In_type in){
+      printf("Error, using wrong Reduce function\n");
+      exit(1);
+      return 0;
+    }
+  };
+
+
+
+
+  /////////////////////////////////////////////////////
+  // Arithmetic operations
+  /////////////////////////////////////////////////////
+  struct Sum{
+    //Complex/Real float
+    inline float operator()(float a, float b){
+#error
+    }
+    //Complex/Real double
+    inline vector4double operator()(vector4double a, vector4double b){
+      return vec_add(a,b);
+    }
+    //Integer
+    inline int operator()(int a, int b){
+#error
+    }
+  };
+
+  struct Sub{
+    //Complex/Real float
+    inline float operator()(float a, float b){
+#error
+    }
+    //Complex/Real double
+    inline vector4double operator()(vector4double a, vector4double b){
+#error
+    }
+    //Integer
+    inline int operator()(int a, int b){
+#error
+    }
+  };
+
+
+  struct MultComplex{
+    // Complex float
+    inline float operator()(float a, float b){
+#error
+    }
+    // Complex double
+    inline vector4double operator()(vector4double a, vector4double b){
+#error
+    }
+  };
+
+  struct Mult{
+    // Real float
+    inline float operator()(float a, float b){
+#error
+    }
+    // Real double
+    inline vector4double operator()(vector4double a, vector4double b){
+#error
+    }
+    // Integer
+    inline int operator()(int a, int b){
+#error
+    }
+  };
+
+
+  struct Conj{
+    // Complex single
+    inline float operator()(float in){
+      assert(0);
+    }
+    // Complex double
+    inline vector4double operator()(vector4double in){
+      assert(0);
+    }
+    // do not define for integer input
+  };
+
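+  // A sketch of one way the float paths could eventually be filled in (an
+  // assumption, not part of this stub): QPX is a 4-wide double register file
+  // (vector4double), so single precision would go through the widening
+  // loads/stores of the IBM XL QPX builtins, e.g. for Sum:
+  //   vector4double va = vec_lda(0, (float *)a);  // loads 4 floats, widened
+  //   vector4double vb = vec_lda(0, (float *)b);
+  //   vec_sta(vec_add(va,vb), 0, (float *)y);     // store narrows back
+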
+  struct TimesMinusI{
+    //Complex single
+    inline float operator()(float in, float ret){
+      assert(0);
+    }
+    //Complex double
+    inline vector4double operator()(vector4double in, vector4double ret){
+      assert(0);
+    }
+
+
+  };
+
+  struct TimesI{
+    //Complex single
+    inline float operator()(float in, float ret){
+      assert(0);
+    }
+    //Complex double
+    inline vector4double operator()(vector4double in, vector4double ret){
+      assert(0);
+    }
+
+
+  };
+
+
+
+
+
+  //////////////////////////////////////////////
+  // Some Template specialization
+
+  //Complex float Reduce
+  template<>
+  inline Grid::ComplexF Reduce<Grid::ComplexF, float>::operator()(float in){
+    assert(0);
+  }
+  //Real float Reduce
+  template<>
+  inline Grid::RealF Reduce<Grid::RealF, float>::operator()(float in){
+    assert(0);
+  }
+
+
+  //Complex double Reduce
+  template<>
+  inline Grid::ComplexD Reduce<Grid::ComplexD, vector4double>::operator()(vector4double in){
+    assert(0);
+  }
+
+  //Real double Reduce
+  template<>
+  inline Grid::RealD Reduce<Grid::RealD, vector4double>::operator()(vector4double in){
+    assert(0);
+  }
+
+  //Integer Reduce
+  template<>
+  inline Integer Reduce<Integer, float>::operator()(float in){
+    assert(0);
+  }
+
+
+}
+
+//////////////////////////////////////////////////////////////////////////////////////
+// Here assign types
+namespace Grid {
+  typedef float SIMD_Ftype __attribute__ ((vector_size (16))); // Single precision type
+  typedef vector4double SIMD_Dtype; // Double precision type
+  typedef int SIMD_Itype; // Integer type
+
+
+  // Function name aliases
+  typedef Optimization::Vsplat   VsplatSIMD;
+  typedef Optimization::Vstore   VstoreSIMD;
+  typedef Optimization::Vset     VsetSIMD;
+  typedef Optimization::Vstream  VstreamSIMD;
+  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
+
+
+  // Arithmetic operations
+  typedef Optimization::Sum         SumSIMD;
+  typedef Optimization::Sub         SubSIMD;
+  typedef Optimization::Mult        MultSIMD;
+  typedef Optimization::MultComplex MultComplexSIMD;
+  typedef Optimization::Conj        ConjSIMD;
+  typedef Optimization::TimesMinusI TimesMinusISIMD;
+  typedef Optimization::TimesI      TimesISIMD;
+
+}
diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h
index ddc3490b..fa0e3ec3 100644
--- a/lib/simd/Grid_sse4.h
+++ b/lib/simd/Grid_sse4.h
@@ -4,7 +4,7 @@
 
   Using intrinsics
 */
-// Time-stamp: <2015-05-20 16:45:39 neo>
+// Time-stamp: <2015-05-21 18:06:30 neo>
 //----------------------------------------------------------------------
 
 #include <pmmintrin.h>
 
@@ -53,12 +53,12 @@ namespace Optimization {
 
   struct Vstream{
     //Float
-    inline void operator()(__m128 a, __m128 b){
-      _mm_stream_ps((float *)&a,b);
+    inline void operator()(float * a, __m128 b){
+      _mm_stream_ps(a,b);
     }
     //Double
-    inline void operator()(__m128d a, __m128d b){
-      _mm_stream_pd((double *)&a,b);
+    inline void operator()(double * a, __m128d b){
+      _mm_stream_pd(a,b);
     }
 
 
diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index 57480621..34c63964 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -2,12 +2,20 @@
 /*! @file Grid_vector_types.h
   @brief Defines templated class Grid_simd to deal with inner vector types
 */
-// Time-stamp: <2015-05-20 17:31:55 neo>
+// Time-stamp: <2015-05-22 17:08:19 neo>
 //---------------------------------------------------------------------------
 #ifndef GRID_VECTOR_TYPES
 #define GRID_VECTOR_TYPES
 
+#ifdef SSE4
 #include "Grid_sse4.h"
+#endif
+#if defined (AVX1)|| defined (AVX2)
+#include "Grid_avx.h"
+#endif
+#if defined AVX512
+#include "Grid_knc.h"
+#endif
 
 namespace Grid {
 
@@ -43,13 +51,14 @@ namespace Grid {
   // general forms to allow for vsplat syntax
   // need explicit declaration of types when used since
   // clang cannot automatically determine the output type sometimes
+  // use decltype?
   template < class Out, class Input1, class Input2, class Operation >
   Out binary(Input1 src_1, Input2 src_2, Operation op){
     return op(src_1, src_2);
   }
 
-  template < class SIMDout, class Input, class Operation >
-  SIMDout unary(Input src, Operation op){
+  template < class Out, class Input, class Operation >
+  Out unary(Input src, Operation op){
     return op(src);
   }
 
@@ -63,27 +72,34 @@ namespace Grid {
     public:
       typedef typename RealPart < Scalar_type >::type Real;
+      typedef Vector_type vector_type;
+      typedef Scalar_type scalar_type;
+
       Vector_type v;
-
+
       static inline int Nsimd(void) { return sizeof(Vector_type)/sizeof(Scalar_type);}
-
+
       // Constructors
       Grid_simd & operator = ( Zero & z){ vzero(*this); return (*this); }
-      Grid_simd(){};
-
+      Grid_simd& operator=(const Grid_simd&& rhs){v=rhs.v;return *this;};
+      Grid_simd& operator=(const Grid_simd& rhs){v=rhs.v;return *this;}; //faster than not declaring it and leaving to the compiler
+
+      Grid_simd()=default;
+      Grid_simd(const Grid_simd& rhs):v(rhs.v){}; //compiles in movaps
+      Grid_simd(const Grid_simd&& rhs):v(rhs.v){};
+
+
       //Enable if complex type
       template < class S = Scalar_type >
-      Grid_simd(typename std::enable_if< is_complex < S >::value, S>::type a){
+      Grid_simd(const typename std::enable_if< is_complex < S >::value, S>::type a){
 	vsplat(*this,a);
       };
 
-      Grid_simd(Real a){
+      Grid_simd(const Real a){
 	vsplat(*this,Scalar_type(a));
       };
 
@@ -97,18 +113,13 @@ namespace Grid {
       friend inline void sub (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); }
       friend inline void add (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); }
 
-      //not for integer types... FIXME
+      //not for integer types...
+      template < class S = Scalar_type, NotEnableIf<is_integral<S>, int> = 0 >
       friend inline Grid_simd adj(const Grid_simd &in){ return conjugate(in); }
 
       ///////////////////////////////////////////////
       // Initialise to 1,0,i for the correct types
       ///////////////////////////////////////////////
-      // if not complex overload here
-      template < class S = Scalar_type, NotEnableIf<is_complex<S>,int> = 0 >
-      friend inline void vone(Grid_simd &ret)  { vsplat(ret,1.0); }
-      template < class S = Scalar_type, NotEnableIf<is_complex<S>,int> = 0 >
-      friend inline void vzero(Grid_simd &ret) { vsplat(ret,0.0); }
-
-      // For complex types
+      // For complex types
       template < class S = Scalar_type, EnableIf<is_complex<S>, int> = 0 >
       friend inline void vone(Grid_simd &ret)      { vsplat(ret,1.0,0.0); }
@@ -116,6 +127,14 @@
       template < class S = Scalar_type, EnableIf<is_complex<S>, int> = 0 >
       friend inline void vzero(Grid_simd &ret)     { vsplat(ret,0.0,0.0); }// use xor?
       template < class S = Scalar_type, EnableIf<is_complex<S>, int> = 0 >
      friend inline void vcomplex_i(Grid_simd &ret){ vsplat(ret,0.0,1.0);}
+
+      // if not complex overload here
+      template < class S = Scalar_type, EnableIf<is_real<S>,int> = 0 >
+      friend inline void vone(Grid_simd &ret)      { vsplat(ret,1.0); }
+      template < class S = Scalar_type, EnableIf<is_real<S>,int> = 0 >
+      friend inline void vzero(Grid_simd &ret)     { vsplat(ret,0.0); }
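+      // A usage sketch (an illustration only; it assumes the vRealF and
+      // vComplexF typedefs defined at the bottom of this file): the EnableIf
+      // guards pick the right overload purely from the scalar type, e.g.
+      //   vComplexF cone; vone(cone);   // every lane becomes (1.0,0.0)
+      //   vRealF    rone; vone(rone);   // every lane becomes 1.0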
+
+
 
       // For integral types
       template < class S = Scalar_type, EnableIf<is_integral<S>, int> = 0 >
@@ -125,7 +144,7 @@
       template < class S = Scalar_type, EnableIf<is_integral<S>, int> = 0 >
      friend inline void vtrue (Grid_simd &ret){vsplat(ret,0xFFFFFFFF);}
       template < class S = Scalar_type, EnableIf<is_integral<S>, int> = 0 >
-      friend inline void vfalse(vInteger &ret){vsplat(ret,0);}
+      friend inline void vfalse(Grid_simd &ret){vsplat(ret,0);}
 
 
 
@@ -206,8 +225,9 @@
       ///////////////////////
       // Vstream
       ///////////////////////
+      template < class S = Scalar_type, NotEnableIf<is_complex<S>, int> = 0 >
       friend inline void vstream(Grid_simd &out,const Grid_simd &in){
-	binary<void>(out.v, in.v, VstreamSIMD());
+	binary<void>((Real*)&out.v, in.v, VstreamSIMD());
       }
 
       template < class S = Scalar_type, EnableIf<is_complex<S>, int> = 0 >
@@ -305,7 +325,7 @@
       // Unary negation
       ///////////////////////
       friend inline Grid_simd operator -(const Grid_simd &r) {
-	vComplexF ret;
+	Grid_simd ret;
 	vzero(ret);
 	ret = ret - r;
 	return ret;
@@ -350,7 +370,7 @@
   }
 
   template<class scalar_type, class vector_type>
-  inline void zeroit(Grid_simd< scalar_type, vector_type> &z){ vzero(z);}
+  inline void zeroit(Grid_simd< scalar_type, vector_type> &z){ vzero(z);}
 
 
   template<class scalar_type, class vector_type>
@@ -368,35 +388,11 @@
 
 
   // Define available types (now change names to avoid clashing with the rest of the code)
-  typedef Grid_simd< float                 , SIMD_Ftype > MyRealF;
-  typedef Grid_simd< double                , SIMD_Dtype > MyRealD;
-  typedef Grid_simd< std::complex< float > , SIMD_Ftype > MyComplexF;
-  typedef Grid_simd< std::complex< double >, SIMD_Dtype > MyComplexD;
-
-
-
-
-  ////////////////////////////////////////////////////////////////////
-  // Temporary hack to keep independent from the rest of the code
-  template<> struct isGridTensor<MyRealF > {
-    static const bool value = false;
-    static const bool notvalue = true;
-  };
-  template<> struct isGridTensor<MyRealD > {
-    static const bool value = false;
-    static const bool notvalue = true;
-  };
-  template<> struct isGridTensor<MyComplexF > {
-    static const bool value = false;
-    static const bool notvalue = true;
-  };
-  template<> struct isGridTensor<MyComplexD > {
-    static const bool value = false;
-    static const bool notvalue = true;
-  };
-
-
-
+  typedef Grid_simd< float                 , SIMD_Ftype > vRealF;
+  typedef Grid_simd< double                , SIMD_Dtype > vRealD;
+  typedef Grid_simd< std::complex< float > , SIMD_Ftype > vComplexF;
+  typedef Grid_simd< std::complex< double >, SIMD_Dtype > vComplexD;
+  typedef Grid_simd< Integer               , SIMD_Itype > vInteger;
 
 }
diff --git a/lib/simd/Grid_vComplexD.h b/lib/simd/Old/Grid_vComplexD.h
similarity index 100%
rename from lib/simd/Grid_vComplexD.h
rename to lib/simd/Old/Grid_vComplexD.h
diff --git a/lib/simd/Grid_vComplexF.h b/lib/simd/Old/Grid_vComplexF.h
similarity index 99%
rename from lib/simd/Grid_vComplexF.h
rename to lib/simd/Old/Grid_vComplexF.h
index 713bbdd8..37760b98 100644
--- a/lib/simd/Grid_vComplexF.h
+++ b/lib/simd/Old/Grid_vComplexF.h
@@ -54,7 +54,7 @@ namespace Grid {
     //////////////////////////////////
     friend inline void vone (vComplexF &ret)     { vsplat(ret,1.0,0.0); }
     friend inline void vzero(vComplexF &ret)     { vsplat(ret,0.0,0.0); }
-    friend inline void vcomplex_i(vComplexF &ret){ vsplat(ret,0.0,1.0);}
+    friend inline void vcomplex_i(vComplexF &ret){ vsplat(ret,0.0,1.0); }
 
     ////////////////////////////////////
     // Arithmetic operator overloads +,-,*
diff --git a/lib/simd/Grid_vInteger.h b/lib/simd/Old/Grid_vInteger.h
similarity index 100%
rename from lib/simd/Grid_vInteger.h
rename to lib/simd/Old/Grid_vInteger.h
diff --git a/lib/simd/Grid_vRealD.h b/lib/simd/Old/Grid_vRealD.h
similarity index 100%
rename from lib/simd/Grid_vRealD.h
rename to lib/simd/Old/Grid_vRealD.h
diff --git a/lib/simd/Grid_vRealF.h b/lib/simd/Old/Grid_vRealF.h
similarity index 100%
rename from lib/simd/Grid_vRealF.h
rename to lib/simd/Old/Grid_vRealF.h
diff --git a/tests/Grid_main.cc b/tests/Grid_main.cc
index 10e74099..c1f1c5d0 100644
--- a/tests/Grid_main.cc
+++ b/tests/Grid_main.cc
@@ -103,6 +103,9 @@ int main (int argc, char ** argv)
   random(FineRNG,scVec);
 
   fflush(stdout);
+
+
+  /*
   cVec = cMat * cVec;  // LatticeColourVector     = LatticeColourMatrix * LatticeColourVector
   sVec = sMat * sVec;  // LatticeSpinVector       = LatticeSpinMatrix * LatticeSpinVector
   scVec= scMat * scVec;// LatticeSpinColourVector = LatticeSpinColourMatrix * LatticeSpinColourVector
@@ -112,12 +115,14 @@ int main (int argc, char ** argv)
   cMat = outerProduct(cVec,cVec);
   scalar = localInnerProduct(cVec,cVec);
+
   scalar += scalar;
   scalar -= scalar;
   scalar *= scalar;
   add(scalar,scalar,scalar);
   sub(scalar,scalar,scalar);
   mult(scalar,scalar,scalar);
+  mac(scalar,scalar,scalar);
 
   scalar = scalar+scalar;
   scalar = scalar-scalar;
@@ -141,7 +146,7 @@ int main (int argc, char ** argv)
   scalar=trace(scalar);
   scalar=localInnerProduct(cVec,cVec);
   scalar=localNorm2(cVec);
-
+  */
   // -=,+=,*=,()
   // add,+,sub,-,mult,mac,*
   // adj,conjugate
   // real,imag
   // transpose,transposeIndex
   // trace,traceIndex
   // peekIndex
   // innerProduct,outerProduct,
   // localNorm2
   // localInnerProduct
 
+
   scMat = sMat*scMat;// LatticeSpinColourMatrix = LatticeSpinMatrix * LatticeSpinColourMatrix
 
-
-
+
+  /*
 #ifdef SSE4
   ///////// Tests the new class Grid_simd
   std::complex<float> ctest(3.0,2.0);
  std::vector<std::complex<float> > vec(4);
 
  std::cout << sum<< std::endl;
 #endif
+  */
  ///////////////////////
-
+  /*
+  printf("DEBUG: calling 3.5 \n");
   // Non-lattice (const objects) * Lattice
   ColourMatrix cm;
   SpinColourMatrix scm;
   vSpinColourMatrix vscm;
   Complex cplx(1.0);
   Integer myint=1;
   double mydouble=1.0;
 
   scm = cplx*scm;
   vscm = vscm*cplx;
   scMat = scMat*cplx;
 
+  printf("DEBUG: calling 3.7 \n");
   scm = cplx*scm;
   vscm = cplx*vscm;
   scMat = cplx*scMat;
 
   vscm = myint*vscm;
   scMat = scMat*myint;
 
+  printf("DEBUG: calling 3.9 \n");
   scm = scm*mydouble;
   vscm = vscm*mydouble;
   scMat = scMat*mydouble;
   scMat = mydouble*scMat;
   cMat = mydouble*cMat;
-
+
+  printf("DEBUG: calling 4 \n");
   sMat = adj(sMat);       // LatticeSpinMatrix adjoint
   sMat = iGammaFive*sMat; // SpinMatrix * LatticeSpinMatrix
   sMat = GammaFive*sMat;  // SpinMatrix * LatticeSpinMatrix
   scMat = adj(scMat);
   cMat = adj(cMat);
   scm=transpose(scm);
   scm=transposeIndex<1>(scm);
+
+
+
   //    Foo = Foo+scalar; // LatticeColourMatrix+Scalar
   //    Foo = Foo*scalar; // LatticeColourMatrix*Scalar
   //    Foo = Foo-scalar; // LatticeColourMatrix-Scalar
 
   //    Foo = scalar*Foo; // Scalar*LatticeColourMatrix
   //    Foo = scalar+Foo; // Scalar+LatticeColourMatrix
   //    Foo = scalar-Foo; // Scalar-LatticeColourMatrix
 
    pokeIndex<1> (c_m,c,0,0);
  }
-
+  */
+
  FooBar = Bar;
 
  /*
 
  */
 
  // Lattice SU(3) x SU(3)
  Fine.Barrier();
  FooBar = Foo * Bar;
-
+
  // Lattice 12x12 GEMM
  scFooBar = scFoo * scBar;
-
+
  // Benchmark some simple operations LatticeSU3 * Lattice SU3.
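  // (A sketch of the cost model for the SU(3) benchmark below, stated here as
  // an assumption rather than taken from this patch: a complex Nc x Nc matrix
  // multiply per site costs ~8*Nc^3 real flops, so with 'volume' the local
  // site count one expects something of the form
  //    flops = ncall * 1.0 * volume * 8*Nc*Nc*Nc;
  // which the per-call printf below then normalises by ncall.)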
  double t0,t1,flops;
  double bytes;
-  int ncall=100;
+  int ncall=5000;
  int Nc = Grid::QCD::Nc;
 
  LatticeGaugeField U(&Fine);
 
  if ( Fine.IsBoss() ) {
    printf("%f flop and %f bytes\n",flops,bytes/ncall);
  }
-  FooBar = Foo * Bar;
+  FooBar = Foo * Bar;
  Fine.Barrier();
  t0=usecond();
  for(int i=0;i