From 9e29ac65491bc560516297ef1b2750b6e6356bfe Mon Sep 17 00:00:00 2001 From: neo Date: Fri, 22 May 2015 17:33:15 +0900 Subject: [PATCH 1/4] Completed implementation of new Grid_simd classes Tested performance for SSE4, Ok. AVX1/2, AVX512 yet untested --- configure.ac | 4 +- lib/Grid_simd.h | 6 +- lib/Makefile.am | 10 +- lib/cshift/Grid_cshift_common.h | 3 +- lib/simd/Grid_avx.h | 401 ++++++++++++++++++++++++++++ lib/simd/Grid_knc.h | 323 ++++++++++++++++++++++ lib/simd/Grid_qpx.h | 272 +++++++++++++++++++ lib/simd/Grid_sse4.h | 10 +- lib/simd/Grid_vector_types.h | 94 ++++--- lib/simd/{ => Old}/Grid_vComplexD.h | 0 lib/simd/{ => Old}/Grid_vComplexF.h | 2 +- lib/simd/{ => Old}/Grid_vInteger.h | 0 lib/simd/{ => Old}/Grid_vRealD.h | 0 lib/simd/{ => Old}/Grid_vRealF.h | 0 tests/Grid_main.cc | 42 ++- tests/Makefile.am | 6 +- 16 files changed, 1091 insertions(+), 82 deletions(-) create mode 100644 lib/simd/Grid_avx.h create mode 100644 lib/simd/Grid_knc.h create mode 100644 lib/simd/Grid_qpx.h rename lib/simd/{ => Old}/Grid_vComplexD.h (100%) rename lib/simd/{ => Old}/Grid_vComplexF.h (99%) rename lib/simd/{ => Old}/Grid_vInteger.h (100%) rename lib/simd/{ => Old}/Grid_vRealD.h (100%) rename lib/simd/{ => Old}/Grid_vRealF.h (100%) diff --git a/configure.ac b/configure.ac index 93e2b574..bfcbfcef 100644 --- a/configure.ac +++ b/configure.ac @@ -3,9 +3,9 @@ # # Project Grid package # -# Time-stamp: <2015-05-19 13:51:08 neo> +# Time-stamp: <2015-05-22 15:46:09 neo> -AC_PREREQ([2.69]) +AC_PREREQ([2.63]) AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk]) AC_CANONICAL_SYSTEM AM_INIT_AUTOMAKE(subdir-objects) diff --git a/lib/Grid_simd.h b/lib/Grid_simd.h index 6d7411d3..9484c0d6 100644 --- a/lib/Grid_simd.h +++ b/lib/Grid_simd.h @@ -203,11 +203,7 @@ inline void Gpermute(vsimd &y,const vsimd &b,int perm){ }; }; -#include -#include -#include -#include -#include +#include namespace Grid { diff --git a/lib/Makefile.am b/lib/Makefile.am index 82459763..cdb36c71 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -91,12 +91,10 @@ nobase_include_HEADERS = algorithms/approx/bigfloat.h \ qcd/Grid_qcd_2spinor.h \ qcd/Grid_qcd_dirac.h \ qcd/Grid_qcd_wilson_dop.h \ - simd/Grid_vComplexD.h \ - simd/Grid_vComplexF.h \ - simd/Grid_vInteger.h \ - simd/Grid_vRealD.h \ - simd/Grid_vRealF.h \ simd/Grid_vector_types.h \ - simd/Grid_sse4.h + simd/Grid_sse4.h \ + simd/Grid_avx.h \ + simd/Grid_knc.h + diff --git a/lib/cshift/Grid_cshift_common.h b/lib/cshift/Grid_cshift_common.h index cf1f88af..65c0cb87 100644 --- a/lib/cshift/Grid_cshift_common.h +++ b/lib/cshift/Grid_cshift_common.h @@ -154,7 +154,6 @@ template void Copy_plane(Lattice& lhs,Lattice &rhs, int cbmask=0x3; } - int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane @@ -170,10 +169,12 @@ PARALLEL_NESTED_LOOP2 } } + } template void Copy_plane_permute(Lattice& lhs,Lattice &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type) { + int rd = rhs._grid->_rdimensions[dimension]; if ( !rhs._grid->CheckerBoarded(dimension) ) { diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h new file mode 100644 index 00000000..ea796122 --- /dev/null +++ b/lib/simd/Grid_avx.h @@ -0,0 +1,401 @@ +//---------------------------------------------------------------------- +/*! 
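(A design note inferred from this patch, not stated in it: each new
   architecture header -- Grid_avx.h, Grid_knc.h, Grid_qpx.h -- implements the
   same family of functor structs (Vsplat, Vstore, Vset, Vstream, Sum, Sub,
   Mult, MultComplex, Conj, TimesMinusI, TimesI, Reduce) over the native
   vector registers, and Grid_vector_types.h binds exactly one of them through
   the *SIMD typedefs at the bottom of each file, selected by the
   SSE4/AVX1/AVX2/AVX512 preprocessor flags; the QPX stub is not yet wired in.)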
@file Grid_avx.h + @brief Optimization libraries for AVX1/2 instructions set + + Using intrinsics +*/ +// Time-stamp: <2015-05-22 15:51:24 neo> +//---------------------------------------------------------------------- + +#include +// _mm256_set_m128i(hi,lo); // not defined in all versions of immintrin.h +#ifndef _mm256_set_m128i +#define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1) +#endif + +namespace Optimization { + + struct Vsplat{ + //Complex float + inline __m256 operator()(float a, float b){ + return _mm256_set_ps(b,a,b,a,b,a,b,a); + } + // Real float + inline __m256 operator()(float a){ + return _mm256_set_ps(a,a,a,a,a,a,a,a); + } + //Complex double + inline __m256d operator()(double a, double b){ + return _mm256_set_pd(b,a,b,a); + } + //Real double + inline __m256d operator()(double a){ + return _mm256_set_pd(a,a,a,a); + } + //Integer + inline __m256i operator()(Integer a){ + return _mm256_set1_epi32(a); + } + }; + + struct Vstore{ + //Float + inline void operator()(__m256 a, float* F){ + _mm256_store_ps(F,a); + } + //Double + inline void operator()(__m256d a, double* D){ + _mm256_store_pd(D,a); + } + //Integer + inline void operator()(__m256i a, Integer* I){ + _mm256_store_si256((__m256i*)I,a); + } + + }; + + + struct Vstream{ + //Float + inline void operator()(float * a, __m256 b){ + _mm256_stream_ps(a,b); + } + //Double + inline void operator()(double * a, __m256d b){ + _mm256_stream_pd(a,b); + } + + + }; + + + + struct Vset{ + // Complex float + inline __m256 operator()(Grid::ComplexF *a){ + return _mm256_set_ps(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real()); + } + // Complex double + inline __m256d operator()(Grid::ComplexD *a){ + return _mm256_set_pd(a[1].imag(),a[1].real(),a[0].imag(),a[0].real()); + } + // Real float + inline __m256 operator()(float *a){ + return _mm256_set_ps(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); + } + // Real double + inline __m256d operator()(double *a){ + return _mm256_set_pd(a[3],a[2],a[1],a[0]); + } + // Integer + inline __m256i operator()(Integer *a){ + return _mm256_set_epi32(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); + } + + + }; + + template + struct Reduce{ + //Need templated class to overload output type + //General form must generate error if compiled + inline Out_type operator()(In_type in){ + printf("Error, using wrong Reduce function\n"); + exit(1); + return 0; + } + }; + + + + + ///////////////////////////////////////////////////// + // Arithmetic operations + ///////////////////////////////////////////////////// + struct Sum{ + //Complex/Real float + inline __m256 operator()(__m256 a, __m256 b){ + return _mm256_add_ps(a,b); + } + //Complex/Real double + inline __m256d operator()(__m256d a, __m256d b){ + return _mm256_add_pd(a,b); + } + //Integer + inline __m256i operator()(__m256i a, __m256i b){ +#if defined (AVX1) + __m128i a0,a1; + __m128i b0,b1; + a0 = _mm256_extractf128_si256(a,0); + b0 = _mm256_extractf128_si256(b,0); + a1 = _mm256_extractf128_si256(a,1); + b1 = _mm256_extractf128_si256(b,1); + a0 = _mm_add_epi32(a0,b0); + a1 = _mm_add_epi32(a1,b1); + return _mm256_set_m128i(a1,a0); +#endif +#if defined (AVX2) + return _mm256_add_epi32(a,b); +#endif + + } + }; + + struct Sub{ + //Complex/Real float + inline __m256 operator()(__m256 a, __m256 b){ + return _mm256_sub_ps(a,b); + } + //Complex/Real double + inline __m256d operator()(__m256d a, __m256d b){ + return _mm256_sub_pd(a,b); + } + //Integer + inline __m256i operator()(__m256i a, __m256i 
b){ +#if defined (AVX1) + __m128i a0,a1; + __m128i b0,b1; + a0 = _mm256_extractf128_si256(a,0); + b0 = _mm256_extractf128_si256(b,0); + a1 = _mm256_extractf128_si256(a,1); + b1 = _mm256_extractf128_si256(b,1); + a0 = _mm_sub_epi32(a0,b0); + a1 = _mm_sub_epi32(a1,b1); + return _mm256_set_m128i(a1,a0); +#endif +#if defined (AVX2) + return _mm256_sub_epi32(a,b); +#endif + + } + }; + + + struct MultComplex{ + // Complex float + inline __m256 operator()(__m256 a, __m256 b){ + __m256 ymm0,ymm1,ymm2; + ymm0 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar, + ymm0 = _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br + // FIXME AVX2 could MAC + ymm1 = _mm256_shuffle_ps(b,b,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi + ymm2 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai + ymm1 = _mm256_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi + return _mm256_addsub_ps(ymm0,ymm1); + } + // Complex double + inline __m256d operator()(__m256d a, __m256d b){ + //Multiplication of (ak+ibk)*(ck+idk) + // a + i b can be stored as a data structure + //From intel optimisation reference guide + /* + movsldup xmm0, Src1; load real parts into the destination, + ; a1, a1, a0, a0 + movaps xmm1, src2; load the 2nd pair of complex values, ; i.e. d1, c1, d0, c0 + mulps xmm0, xmm1; temporary results, a1d1, a1c1, a0d0, ; a0c0 + shufps xmm1, xmm1, b1; reorder the real and imaginary ; parts, c1, d1, c0, d0 + movshdup xmm2, Src1; load the imaginary parts into the ; destination, b1, b1, b0, b0 + mulps xmm2, xmm1; temporary results, b1c1, b1d1, b0c0, ; b0d0 + addsubps xmm0, xmm2; b1c1+a1d1, a1c1 -b1d1, b0c0+a0d + VSHUFPD (VEX.256 encoded version) + IF IMM0[0] = 0 + THEN DEST[63:0]=SRC1[63:0] ELSE DEST[63:0]=SRC1[127:64] FI; + IF IMM0[1] = 0 + THEN DEST[127:64]=SRC2[63:0] ELSE DEST[127:64]=SRC2[127:64] FI; + IF IMM0[2] = 0 + THEN DEST[191:128]=SRC1[191:128] ELSE DEST[191:128]=SRC1[255:192] FI; + IF IMM0[3] = 0 + THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // Ox5 r<->i ; 0xC unchanged + */ + + __m256d ymm0,ymm1,ymm2; + ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00 + ymm0 = _mm256_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br + ymm1 = _mm256_shuffle_pd(b,b,0x5); // ymm1 <- br,bi b'01,01 + ymm2 = _mm256_shuffle_pd(a,a,0xF); // ymm2 <- ai,ai b'11,11 + ymm1 = _mm256_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi + return _mm256_addsub_pd(ymm0,ymm1); + } + }; + + struct Mult{ + // Real float + inline __m256 operator()(__m256 a, __m256 b){ + return _mm256_mul_ps(a,b); + } + // Real double + inline __m256d operator()(__m256d a, __m256d b){ + return _mm256_mul_pd(a,b); + } + // Integer + inline __m256i operator()(__m256i a, __m256i b){ +#if defined (AVX1) + __m128i a0,a1; + __m128i b0,b1; + a0 = _mm256_extractf128_si256(a,0); + b0 = _mm256_extractf128_si256(b,0); + a1 = _mm256_extractf128_si256(a,1); + b1 = _mm256_extractf128_si256(b,1); + a0 = _mm_mul_epi32(a0,b0); + a1 = _mm_mul_epi32(a1,b1); + return _mm256_set_m128i(a1,a0); +#endif +#if defined (AVX2) + return _mm256_mul_epi32(a,b); +#endif + + } + }; + + + struct Conj{ + // Complex single + inline __m256 operator()(__m256 in){ + return _mm256_xor_ps(_mm256_addsub_ps(_mm256_setzero_ps(),in), _mm256_set1_ps(-0.f)); + } + // Complex double + inline __m256d operator()(__m256d in){ + return _mm256_xor_pd(_mm256_addsub_pd(_mm256_setzero_pd(),in), _mm256_set1_pd(-0.f));//untested + /* + // original + // addsubps 0, inv=>0+in.v[3] 0-in.v[2], 0+in.v[1], 0-in.v[0], ... 
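      A scalar sketch of why both routes here conjugate (our reading of the
      intrinsics, not a claim made by the patch): with a register holding lane
      pairs (re,im), addsub against zero negates the even lanes, giving
      (-re,+im), and xor with the broadcast sign mask -0.0 then flips every
      sign bit, leaving (re,-im) = conj; the float literal -0.f above is
      harmless since it promotes exactly to the double sign mask. The
      commented-out original below instead swaps re<->im with shuffle(0x5),
      addsubs so that the imaginary parts (now sitting in the even lanes) are
      negated, and swaps back.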
+ __m256d tmp = _mm256_addsub_pd(_mm256_setzero_pd(),_mm256_shuffle_pd(in,in,0x5)); + return _mm256_shuffle_pd(tmp,tmp,0x5); + */ + } + // do not define for integer input + }; + + struct TimesMinusI{ + //Complex single + inline __m256 operator()(__m256 in, __m256 ret){ + __m256 tmp =_mm256_addsub_ps(_mm256_setzero_ps(),in); // r,-i + return _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1)); //-i,r + } + //Complex double + inline __m256d operator()(__m256d in, __m256d ret){ + __m256d tmp = _mm256_addsub_pd(_mm256_setzero_pd(),in); // r,-i + return _mm256_shuffle_pd(tmp,tmp,0x5); + } + }; + + struct TimesI{ + //Complex single + inline __m256 operator()(__m256 in, __m256 ret){ + __m256 tmp =_mm256_shuffle_ps(in,in,_MM_SHUFFLE(2,3,0,1)); // i,r + return _mm256_addsub_ps(_mm256_setzero_ps(),tmp); // i,-r + } + //Complex double + inline __m256d operator()(__m256d in, __m256d ret){ + __m256d tmp = _mm256_shuffle_pd(in,in,0x5); + return _mm256_addsub_pd(_mm256_setzero_pd(),tmp); // i,-r + } + }; + + + + + + ////////////////////////////////////////////// + // Some Template specialization + template < typename vtype > + void permute(vtype a, vtype b, int perm) { + union { + __m256 f; + vtype v; + } conv; + conv.v = b; + switch (perm){ + // 8x32 bits=>3 permutes + case 2: + conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); + break; + case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break; + case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break; + default: assert(0); break; + } + a = conv.v; + + } + + //Complex float Reduce + template<> + inline Grid::ComplexF Reduce::operator()(__m256 in){ + __m256 v1,v2; + Optimization::permute(v1,in,0); // sse 128; paired complex single + v1 = _mm256_add_ps(v1,in); + Optimization::permute(v2,v1,1); // avx 256; quad complex single + v1 = _mm256_add_ps(v1,v2); + return Grid::ComplexF(v1[0],v1[1]); + } + //Real float Reduce + template<> + inline Grid::RealF Reduce::operator()(__m256 in){ + __m256 v1,v2; + Optimization::permute(v1,in,0); // avx 256; octo-double + v1 = _mm256_add_ps(v1,in); + Optimization::permute(v2,v1,1); + v1 = _mm256_add_ps(v1,v2); + Optimization::permute(v2,v1,2); + v1 = _mm256_add_ps(v1,v2); + return v1[0]; + } + + + //Complex double Reduce + template<> + inline Grid::ComplexD Reduce::operator()(__m256d in){ + __m256d v1; + Optimization::permute(v1,in,0); // sse 128; paired complex single + v1 = _mm256_add_pd(v1,in); + return Grid::ComplexD(v1[0],v1[1]); + } + + //Real double Reduce + template<> + inline Grid::RealD Reduce::operator()(__m256d in){ + __m256d v1,v2; + Optimization::permute(v1,in,0); // avx 256; quad double + v1 = _mm256_add_pd(v1,in); + Optimization::permute(v2,v1,1); + v1 = _mm256_add_pd(v1,v2); + return v1[0]; + } + + //Integer Reduce + template<> + inline Integer Reduce::operator()(__m256i in){ + // FIXME unimplemented + printf("Reduce : Missing integer implementation -> FIX\n"); + assert(0); + } + + +} + +////////////////////////////////////////////////////////////////////////////////////// +// Here assign types +namespace Grid { + typedef __m256 SIMD_Ftype; // Single precision type + typedef __m256d SIMD_Dtype; // Double precision type + typedef __m256i SIMD_Itype; // Integer type + + + // Function name aliases + typedef Optimization::Vsplat VsplatSIMD; + typedef Optimization::Vstore VstoreSIMD; + typedef Optimization::Vset VsetSIMD; + typedef Optimization::Vstream VstreamSIMD; + template using ReduceSIMD = Optimization::Reduce; + + + // Arithmetic operations + typedef 
Optimization::Sum SumSIMD; + typedef Optimization::Sub SubSIMD; + typedef Optimization::Mult MultSIMD; + typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::Conj ConjSIMD; + typedef Optimization::TimesMinusI TimesMinusISIMD; + typedef Optimization::TimesI TimesISIMD; + +} diff --git a/lib/simd/Grid_knc.h b/lib/simd/Grid_knc.h new file mode 100644 index 00000000..daec3973 --- /dev/null +++ b/lib/simd/Grid_knc.h @@ -0,0 +1,323 @@ +//---------------------------------------------------------------------- +/*! @file Grid_knc.h + @brief Optimization libraries for AVX512 instructions set for KNC + + Using intrinsics +*/ +// Time-stamp: <2015-05-22 17:12:44 neo> +//---------------------------------------------------------------------- + +#include +#ifndef KNC_ONLY_STORES +#define _mm512_storenrngo_ps _mm512_store_ps // not present in AVX512 +#define _mm512_storenrngo_pd _mm512_store_pd // not present in AVX512 +#endif + + +namespace Optimization { + + struct Vsplat{ + //Complex float + inline __m512 operator()(float a, float b){ + return _mm512_set_ps(b,a,b,a,b,a,b,a,b,a,b,a,b,a,b,a); + } + // Real float + inline __m512 operator()(float a){ + return _mm512_set1_ps(a); + } + //Complex double + inline __m512d operator()(double a, double b){ + return _mm512_set_pd(b,a,b,a,b,a,b,a); + } + //Real double + inline __m512d operator()(double a){ + return _mm512_set1_pd(a); + } + //Integer + inline __m512i operator()(Integer a){ + return _mm512_set1_epi32(a); + } + }; + + struct Vstore{ + //Float + inline void operator()(__m512 a, float* F){ + _mm512_store_ps(F,a); + } + //Double + inline void operator()(__m512d a, double* D){ + _mm512_store_pd(D,a); + } + //Integer + inline void operator()(__m512i a, Integer* I){ + _mm512_store_si512((__m512i *)I,a); + } + + }; + + + struct Vstream{ + //Float + inline void operator()(float * a, __m512 b){ + _mm512_storenrngo_ps(a,b); + } + //Double + inline void operator()(double * a, __m512d b){ + _mm512_storenrngo_pd(a,b); + } + + + }; + + + + struct Vset{ + // Complex float + inline __m512 operator()(Grid::ComplexF *a){ + return _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(), + a[5].imag(),a[5].real(),a[4].imag(),a[4].real(), + a[3].imag(),a[3].real(),a[2].imag(),a[2].real(), + a[1].imag(),a[1].real(),a[0].imag(),a[0].real()); + } + // Complex double + inline __m512d operator()(Grid::ComplexD *a){ + return _mm512_set_pd(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(), + a[1].imag(),a[1].real(),a[0].imag(),a[0].real()); + } + // Real float + inline __m512 operator()(float *a){ + return _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8], + a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); + } + // Real double + inline __m512d operator()(double *a){ + return _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); + } + // Integer + inline __m512i operator()(Integer *a){ + return _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8], + a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]); + } + + + }; + + template + struct Reduce{ + //Need templated class to overload output type + //General form must generate error if compiled + inline Out_type operator()(In_type in){ + printf("Error, using wrong Reduce function\n"); + exit(1); + return 0; + } + }; + + + + + ///////////////////////////////////////////////////// + // Arithmetic operations + ///////////////////////////////////////////////////// + struct Sum{ + //Complex/Real float + inline __m512 operator()(__m512 a, __m512 b){ + return _mm512_add_ps(a,b); + } + //Complex/Real double + 
inline __m512d operator()(__m512d a, __m512d b){ + return _mm512_add_pd(a,b); + } + //Integer + inline __m512i operator()(__m512i a, __m512i b){ + return _mm512_add_epi32(a,b); + } + }; + + struct Sub{ + //Complex/Real float + inline __m512 operator()(__m512 a, __m512 b){ + return _mm512_sub_ps(a,b); + } + //Complex/Real double + inline __m512d operator()(__m512d a, __m512d b){ + return _mm512_sub_pd(a,b); + } + //Integer + inline __m512i operator()(__m512i a, __m512i b){ + return _mm512_sub_epi32(a,b); + } + }; + + + struct MultComplex{ + // Complex float + inline __m512 operator()(__m512 a, __m512 b){ + __m512 vzero,ymm0,ymm1,real, imag; + vzero = _mm512_setzero_ps(); + ymm0 = _mm512_swizzle_ps(a, _MM_SWIZ_REG_CDAB); // + real = (__m512)_mm512_mask_or_epi32((__m512i)a, 0xAAAA,(__m512i)vzero,(__m512i)ymm0); + imag = _mm512_mask_sub_ps(a, 0x5555,vzero, ymm0); + ymm1 = _mm512_mul_ps(real, b); + ymm0 = _mm512_swizzle_ps(b, _MM_SWIZ_REG_CDAB); // OK + return _mm512_fmadd_ps(ymm0,imag,ymm1); + } + // Complex double + inline __m512d operator()(__m512d a, __m512d b){ + /* This is from + * Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets + * @inproceedings{McFarlin:2011:ASV:1995896.1995938, + * author = {McFarlin, Daniel S. and Arbatov, Volodymyr and Franchetti, Franz and P\"{u}schel, Markus}, + * title = {Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets}, + * booktitle = {Proceedings of the International Conference on Supercomputing}, + * series = {ICS '11}, + * year = {2011}, + * isbn = {978-1-4503-0102-2}, + * location = {Tucson, Arizona, USA}, + * pages = {265--274}, + * numpages = {10}, + * url = {http://doi.acm.org/10.1145/1995896.1995938}, + * doi = {10.1145/1995896.1995938}, + * acmid = {1995938}, + * publisher = {ACM}, + * address = {New York, NY, USA}, + * keywords = {autovectorization, fourier transform, program generation, simd, super-optimization}, + * } + */ + __m512d vzero,ymm0,ymm1,real,imag; + vzero =_mm512_setzero_pd(); + ymm0 = _mm512_swizzle_pd(a, _MM_SWIZ_REG_CDAB); // + real =(__m512d)_mm512_mask_or_epi64((__m512i)a, 0xAA,(__m512i)vzero,(__m512i) ymm0); + imag = _mm512_mask_sub_pd(a, 0x55,vzero, ymm0); + ymm1 = _mm512_mul_pd(real, b); + ymm0 = _mm512_swizzle_pd(b, _MM_SWIZ_REG_CDAB); // OK + return _mm512_fmadd_pd(ymm0,imag,ymm1); + } + }; + + struct Mult{ + // Real float + inline __m512 operator()(__m512 a, __m512 b){ + return _mm512_mul_ps(a,b); + } + // Real double + inline __m512d operator()(__m512d a, __m512d b){ + return _mm512_mul_pd(a,b); + } + // Integer + inline __m512i operator()(__m512i a, __m512i b){ + return _mm512_mullo_epi32(a,b); + } + }; + + + struct Conj{ + // Complex single + inline __m512 operator()(__m512 in){ + return _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // Zero out 0+real 0-imag + } + // Complex double + inline __m512d operator()(__m512d in){ + return _mm512_mask_sub_pd(in, 0xaa,_mm512_setzero_pd(), in); + } + // do not define for integer input + }; + + struct TimesMinusI{ + //Complex single + inline __m512 operator()(__m512 in, __m512 ret){ + __m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag + return _mm512_swizzle_ps(tmp, _MM_SWIZ_REG_CDAB);// OK + } + //Complex double + inline __m512d operator()(__m512d in, __m512d ret){ + __m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag + return _mm512_swizzle_pd(tmp, _MM_SWIZ_REG_CDAB);// OK + } + + + }; + + struct TimesI{ + //Complex 
single + inline __m512 operator()(__m512 in, __m512 ret){ + __m512 tmp = _mm512_swizzle_ps(in, _MM_SWIZ_REG_CDAB);// OK + return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp); // real -imag + } + //Complex double + inline __m512d operator()(__m512d in, __m512d ret){ + __m512d tmp = _mm512_swizzle_pd(in, _MM_SWIZ_REG_CDAB);// OK + return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp); // real -imag + } + + + }; + + + + + + ////////////////////////////////////////////// + // Some Template specialization + + //Complex float Reduce + template<> + inline Grid::ComplexF Reduce::operator()(__m512 in){ + return Grid::ComplexF(_mm512_mask_reduce_add_ps(0x5555, in),_mm512_mask_reduce_add_ps(0xAAAA, in)); + } + //Real float Reduce + template<> + inline Grid::RealF Reduce::operator()(__m512 in){ + return _mm512_reduce_add_ps(in); + } + + + //Complex double Reduce + template<> + inline Grid::ComplexD Reduce::operator()(__m512d in){ + return Grid::ComplexD(_mm512_mask_reduce_add_pd(0x55, in),_mm512_mask_reduce_add_pd(0xAA, in)); + } + + //Real double Reduce + template<> + inline Grid::RealD Reduce::operator()(__m512d in){ + return _mm512_reduce_add_pd(in); + } + + //Integer Reduce + template<> + inline Integer Reduce::operator()(__m512i in){ + // FIXME unimplemented + printf("Reduce : Missing integer implementation -> FIX\n"); + assert(0); + } + + +} + +////////////////////////////////////////////////////////////////////////////////////// +// Here assign types +namespace Grid { + typedef __m512 SIMD_Ftype; // Single precision type + typedef __m512d SIMD_Dtype; // Double precision type + typedef __m512i SIMD_Itype; // Integer type + + + // Function name aliases + typedef Optimization::Vsplat VsplatSIMD; + typedef Optimization::Vstore VstoreSIMD; + typedef Optimization::Vset VsetSIMD; + typedef Optimization::Vstream VstreamSIMD; + template using ReduceSIMD = Optimization::Reduce; + + + // Arithmetic operations + typedef Optimization::Sum SumSIMD; + typedef Optimization::Sub SubSIMD; + typedef Optimization::Mult MultSIMD; + typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::Conj ConjSIMD; + typedef Optimization::TimesMinusI TimesMinusISIMD; + typedef Optimization::TimesI TimesISIMD; + +} diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h new file mode 100644 index 00000000..dc0251f8 --- /dev/null +++ b/lib/simd/Grid_qpx.h @@ -0,0 +1,272 @@ +//---------------------------------------------------------------------- +/*! 
@file Grid_qpx.h + @brief Optimization libraries for QPX instructions set for BG/Q + + Using intrinsics +*/ +// Time-stamp: <2015-05-22 17:29:26 neo> +//---------------------------------------------------------------------- + +// lot of undefined functions + +namespace Optimization { + + struct Vsplat{ + //Complex float + inline float operator()(float a, float b){ + return {a,b,a,b}; + } + // Real float + inline float operator()(float a){ + return {a,a,a,a}; + } + //Complex double + inline vector4double operator()(double a, double b){ + return {a,b,a,b}; + } + //Real double + inline vector4double operator()(double a){ + return {a,a,a,a}; + } + //Integer + inline int operator()(Integer a){ +#error + } + }; + + struct Vstore{ + //Float + inline void operator()(float a, float* F){ + assert(0); + } + //Double + inline void operator()(vector4double a, double* D){ + assert(0); + } + //Integer + inline void operator()(int a, Integer* I){ + assert(0); + } + + }; + + + struct Vstream{ + //Float + inline void operator()(float * a, float b){ + assert(0); + } + //Double + inline void operator()(double * a, vector4double b){ + assert(0); + } + + + }; + + + + struct Vset{ + // Complex float + inline float operator()(Grid::ComplexF *a){ + return {a[0].real(),a[0].imag(),a[1].real(),a[1].imag(),a[2].real(),a[2].imag(),a[3].real(),a[3].imag()}; + } + // Complex double + inline vector4double operator()(Grid::ComplexD *a){ + return {a[0].real(),a[0].imag(),a[1].real(),a[1].imag(),a[2].real(),a[2].imag(),a[3].real(),a[3].imag()}; + } + // Real float + inline float operator()(float *a){ + return {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]}; + } + // Real double + inline vector4double operator()(double *a){ + return {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]}; + } + // Integer + inline int operator()(Integer *a){ +#error + } + + + }; + + template + struct Reduce{ + //Need templated class to overload output type + //General form must generate error if compiled + inline Out_type operator()(In_type in){ + printf("Error, using wrong Reduce function\n"); + exit(1); + return 0; + } + }; + + + + + ///////////////////////////////////////////////////// + // Arithmetic operations + ///////////////////////////////////////////////////// + struct Sum{ + //Complex/Real float + inline float operator()(float a, float b){ +#error + } + //Complex/Real double + inline vector4double operator()(vector4double a, vector4double b){ + return vec_add(a,b); + } + //Integer + inline int operator()(int a, int b){ +#error + } + }; + + struct Sub{ + //Complex/Real float + inline float operator()(float a, float b){ +#error + } + //Complex/Real double + inline vector4double operator()(vector4double a, vector4double b){ +#error + } + //Integer + inline floati operator()(int a, int b){ +#error + } + }; + + + struct MultComplex{ + // Complex float + inline float operator()(float a, float b){ +#error + } + // Complex double + inline vector4double operator()(vector4double a, vector4double b){ +#error + } + }; + + struct Mult{ + // Real float + inline float operator()(float a, float b){ +#error + } + // Real double + inline vector4double operator()(vector4double a, vector4double b){ +#error + } + // Integer + inline int operator()(int a, int b){ +#error + } + }; + + + struct Conj{ + // Complex single + inline float operator()(float in){ + assert(0); + } + // Complex double + inline vector4double operator()(vector4double in){ + assert(0); + } + // do not define for integer input + }; + + struct TimesMinusI{ + //Complex single + inline float 
operator()(float in, float ret){ + assert(0); + } + //Complex double + inline vector4double operator()(vector4double in, vector4double ret){ + assert(0); + } + + + }; + + struct TimesI{ + //Complex single + inline float operator()(float in, float ret){ + + } + //Complex double + inline vector4double operator()(vector4double in, vector4double ret){ + + } + + + }; + + + + + + ////////////////////////////////////////////// + // Some Template specialization + + //Complex float Reduce + template<> + inline Grid::ComplexF Reduce::operator()(float in){ + assert(0); + } + //Real float Reduce + template<> + inline Grid::RealF Reduce::operator()(float in){ + assert(0); + } + + + //Complex double Reduce + template<> + inline Grid::ComplexD Reduce::operator()(vector4double in){ + assert(0); + } + + //Real double Reduce + template<> + inline Grid::RealD Reduce::operator()(vector4double in){ + assert(0); + } + + //Integer Reduce + template<> + inline Integer Reduce::operator()(float in){ + assert(0); + } + + +} + +////////////////////////////////////////////////////////////////////////////////////// +// Here assign types +namespace Grid { + typedef float SIMD_Ftype __attribute__ ((vector_size (16))); // Single precision type + typedef vector4double SIMD_Dtype; // Double precision type + typedef int SIMD_Itype; // Integer type + + + // Function name aliases + typedef Optimization::Vsplat VsplatSIMD; + typedef Optimization::Vstore VstoreSIMD; + typedef Optimization::Vset VsetSIMD; + typedef Optimization::Vstream VstreamSIMD; + template using ReduceSIMD = Optimization::Reduce; + + + // Arithmetic operations + typedef Optimization::Sum SumSIMD; + typedef Optimization::Sub SubSIMD; + typedef Optimization::Mult MultSIMD; + typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::Conj ConjSIMD; + typedef Optimization::TimesMinusI TimesMinusISIMD; + typedef Optimization::TimesI TimesISIMD; + +} diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index ddc3490b..fa0e3ec3 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -4,7 +4,7 @@ Using intrinsics */ -// Time-stamp: <2015-05-20 16:45:39 neo> +// Time-stamp: <2015-05-21 18:06:30 neo> //---------------------------------------------------------------------- #include @@ -53,12 +53,12 @@ namespace Optimization { struct Vstream{ //Float - inline void operator()(__m128 a, __m128 b){ - _mm_stream_ps((float *)&a,b); + inline void operator()(float * a, __m128 b){ + _mm_stream_ps(a,b); } //Double - inline void operator()(__m128d a, __m128d b){ - _mm_stream_pd((double *)&a,b); + inline void operator()(double * a, __m128d b){ + _mm_stream_pd(a,b); } diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 57480621..34c63964 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -2,12 +2,20 @@ /*! 
@file Grid_vector_types.h @brief Defines templated class Grid_simd to deal with inner vector types */ -// Time-stamp: <2015-05-20 17:31:55 neo> +// Time-stamp: <2015-05-22 17:08:19 neo> //--------------------------------------------------------------------------- #ifndef GRID_VECTOR_TYPES #define GRID_VECTOR_TYPES +#ifdef SSE4 #include "Grid_sse4.h" +#endif +#if defined (AVX1)|| defined (AVX2) +#include "Grid_avx.h" +#endif +#if defined AVX512 +#include "Grid_knc.h" +#endif namespace Grid { @@ -43,13 +51,14 @@ namespace Grid { // general forms to allow for vsplat syntax // need explicit declaration of types when used since // clang cannot automatically determine the output type sometimes + // use decltype? template < class Out, class Input1, class Input2, class Operation > Out binary(Input1 src_1, Input2 src_2, Operation op){ return op(src_1, src_2); } - template < class SIMDout, class Input, class Operation > - SIMDout unary(Input src, Operation op){ + template < class Out, class Input, class Operation > + Out unary(Input src, Operation op){ return op(src); } @@ -63,27 +72,34 @@ namespace Grid { public: typedef typename RealPart < Scalar_type >::type Real; + typedef Vector_type vector_type; + typedef Scalar_type scalar_type; + Vector_type v; - + static inline int Nsimd(void) { return sizeof(Vector_type)/sizeof(Scalar_type);} - + // Constructors Grid_simd & operator = ( Zero & z){ vzero(*this); return (*this); } - Grid_simd(){}; - + Grid_simd& operator=(const Grid_simd&& rhs){v=rhs.v;return *this;}; + Grid_simd& operator=(const Grid_simd& rhs){v=rhs.v;return *this;}; //faster than not declaring it and leaving to the compiler + Grid_simd()=default; + Grid_simd(const Grid_simd& rhs):v(rhs.v){}; //compiles in movaps + Grid_simd(const Grid_simd&& rhs):v(rhs.v){}; + //Enable if complex type template < class S = Scalar_type > - Grid_simd(typename std::enable_if< is_complex < S >::value, S>::type a){ + Grid_simd(const typename std::enable_if< is_complex < S >::value, S>::type a){ vsplat(*this,a); }; - Grid_simd(Real a){ + Grid_simd(const Real a){ vsplat(*this,Scalar_type(a)); }; @@ -97,18 +113,13 @@ namespace Grid { friend inline void sub (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); } friend inline void add (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); } - //not for integer types... FIXME + //not for integer types... + template < class S = Scalar_type, NotEnableIf, int> = 0 > friend inline Grid_simd adj(const Grid_simd &in){ return conjugate(in); } /////////////////////////////////////////////// // Initialise to 1,0,i for the correct types /////////////////////////////////////////////// - // if not complex overload here - template < class S = Scalar_type, NotEnableIf,int> = 0 > - friend inline void vone(Grid_simd &ret) { vsplat(ret,1.0); } - template < class S = Scalar_type, NotEnableIf,int> = 0 > - friend inline void vzero(Grid_simd &ret) { vsplat(ret,0.0); } - // For complex types template < class S = Scalar_type, EnableIf, int> = 0 > friend inline void vone(Grid_simd &ret) { vsplat(ret,1.0,0.0); } @@ -116,6 +127,14 @@ namespace Grid { friend inline void vzero(Grid_simd &ret) { vsplat(ret,0.0,0.0); }// use xor? 
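      // The SFINAE helpers used in these overloads are assumed to be defined
      // earlier in this header; a minimal sketch consistent with their use as
      // a defaulted non-type template parameter (EnableIf<Pred,int> = 0):
      //
      //   template <typename T> struct is_complex : public std::false_type {};
      //   template <> struct is_complex<std::complex<float> >  : public std::true_type {};
      //   template <> struct is_complex<std::complex<double> > : public std::true_type {};
      //
      //   template <typename Condition, typename ReturnType>
      //   using EnableIf    = typename std::enable_if<Condition::value, ReturnType>::type;
      //   template <typename Condition, typename ReturnType>
      //   using NotEnableIf = typename std::enable_if<!Condition::value, ReturnType>::type;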
template < class S = Scalar_type, EnableIf, int> = 0 > friend inline void vcomplex_i(Grid_simd &ret){ vsplat(ret,0.0,1.0);} + + // if not complex overload here + template < class S = Scalar_type, EnableIf,int> = 0 > + friend inline void vone(Grid_simd &ret) { vsplat(ret,1.0); } + template < class S = Scalar_type, EnableIf,int> = 0 > + friend inline void vzero(Grid_simd &ret) { vsplat(ret,0.0); } + + // For integral types template < class S = Scalar_type, EnableIf, int> = 0 > @@ -125,7 +144,7 @@ namespace Grid { template < class S = Scalar_type, EnableIf, int> = 0 > friend inline void vtrue (Grid_simd &ret){vsplat(ret,0xFFFFFFFF);} template < class S = Scalar_type, EnableIf, int> = 0 > - friend inline void vfalse(vInteger &ret){vsplat(ret,0);} + friend inline void vfalse(Grid_simd &ret){vsplat(ret,0);} @@ -206,8 +225,9 @@ namespace Grid { /////////////////////// // Vstream /////////////////////// + template < class S = Scalar_type, NotEnableIf, int> = 0 > friend inline void vstream(Grid_simd &out,const Grid_simd &in){ - binary(out.v, in.v, VstreamSIMD()); + binary((Real*)&out.v, in.v, VstreamSIMD()); } template < class S = Scalar_type, EnableIf, int> = 0 > @@ -305,7 +325,7 @@ namespace Grid { // Unary negation /////////////////////// friend inline Grid_simd operator -(const Grid_simd &r) { - vComplexF ret; + Grid_simd ret; vzero(ret); ret = ret - r; return ret; @@ -350,7 +370,7 @@ namespace Grid { } template - inline void zeroit(Grid_simd< scalar_type, vector_type> &z){ vzero(z);} + inline void zeroit(Grid_simd< scalar_type, vector_type> &z){ vzero(z);} template @@ -368,35 +388,11 @@ namespace Grid { // Define available types (now change names to avoid clashing with the rest of the code) - typedef Grid_simd< float , SIMD_Ftype > MyRealF; - typedef Grid_simd< double , SIMD_Dtype > MyRealD; - typedef Grid_simd< std::complex< float > , SIMD_Ftype > MyComplexF; - typedef Grid_simd< std::complex< double >, SIMD_Dtype > MyComplexD; - - - - - //////////////////////////////////////////////////////////////////// - // Temporary hack to keep independent from the rest of the code - template<> struct isGridTensor { - static const bool value = false; - static const bool notvalue = true; - }; - template<> struct isGridTensor { - static const bool value = false; - static const bool notvalue = true; - }; - template<> struct isGridTensor { - static const bool value = false; - static const bool notvalue = true; - }; - template<> struct isGridTensor { - static const bool value = false; - static const bool notvalue = true; - }; - - - + typedef Grid_simd< float , SIMD_Ftype > vRealF; + typedef Grid_simd< double , SIMD_Dtype > vRealD; + typedef Grid_simd< std::complex< float > , SIMD_Ftype > vComplexF; + typedef Grid_simd< std::complex< double >, SIMD_Dtype > vComplexD; + typedef Grid_simd< Integer , SIMD_Itype > vInteger; } diff --git a/lib/simd/Grid_vComplexD.h b/lib/simd/Old/Grid_vComplexD.h similarity index 100% rename from lib/simd/Grid_vComplexD.h rename to lib/simd/Old/Grid_vComplexD.h diff --git a/lib/simd/Grid_vComplexF.h b/lib/simd/Old/Grid_vComplexF.h similarity index 99% rename from lib/simd/Grid_vComplexF.h rename to lib/simd/Old/Grid_vComplexF.h index 713bbdd8..37760b98 100644 --- a/lib/simd/Grid_vComplexF.h +++ b/lib/simd/Old/Grid_vComplexF.h @@ -54,7 +54,7 @@ namespace Grid { ////////////////////////////////// friend inline void vone(vComplexF &ret) { vsplat(ret,1.0,0.0); } friend inline void vzero(vComplexF &ret) { vsplat(ret,0.0,0.0); } - friend inline void vcomplex_i(vComplexF &ret){ 
vsplat(ret,0.0,1.0);} + friend inline void vcomplex_i(vComplexF &ret){ vsplat(ret,0.0,1.0); } //////////////////////////////////// // Arithmetic operator overloads +,-,* diff --git a/lib/simd/Grid_vInteger.h b/lib/simd/Old/Grid_vInteger.h similarity index 100% rename from lib/simd/Grid_vInteger.h rename to lib/simd/Old/Grid_vInteger.h diff --git a/lib/simd/Grid_vRealD.h b/lib/simd/Old/Grid_vRealD.h similarity index 100% rename from lib/simd/Grid_vRealD.h rename to lib/simd/Old/Grid_vRealD.h diff --git a/lib/simd/Grid_vRealF.h b/lib/simd/Old/Grid_vRealF.h similarity index 100% rename from lib/simd/Grid_vRealF.h rename to lib/simd/Old/Grid_vRealF.h diff --git a/tests/Grid_main.cc b/tests/Grid_main.cc index 10e74099..c1f1c5d0 100644 --- a/tests/Grid_main.cc +++ b/tests/Grid_main.cc @@ -103,6 +103,9 @@ int main (int argc, char ** argv) random(FineRNG,scVec); fflush(stdout); + + + /* cVec = cMat * cVec; // LatticeColourVector = LatticeColourMatrix * LatticeColourVector sVec = sMat * sVec; // LatticeSpinVector = LatticeSpinMatrix * LatticeSpinVector scVec= scMat * scVec;// LatticeSpinColourVector = LatticeSpinColourMatrix * LatticeSpinColourVector @@ -112,12 +115,14 @@ int main (int argc, char ** argv) cMat = outerProduct(cVec,cVec); scalar = localInnerProduct(cVec,cVec); + scalar += scalar; scalar -= scalar; scalar *= scalar; add(scalar,scalar,scalar); sub(scalar,scalar,scalar); mult(scalar,scalar,scalar); + mac(scalar,scalar,scalar); scalar = scalar+scalar; scalar = scalar-scalar; @@ -141,7 +146,7 @@ int main (int argc, char ** argv) scalar=trace(scalar); scalar=localInnerProduct(cVec,cVec); scalar=localNorm2(cVec); - + */ // -=,+=,*=,() // add,+,sub,-,mult,mac,* // adj,conjugate @@ -153,10 +158,11 @@ int main (int argc, char ** argv) // localNorm2 // localInnerProduct + scMat = sMat*scMat; // LatticeSpinColourMatrix = LatticeSpinMatrix * LatticeSpinColourMatrix - - + + /* #ifdef SSE4 ///////// Tests the new class Grid_simd std::complex ctest(3.0,2.0); @@ -196,8 +202,10 @@ int main (int argc, char ** argv) std::cout << sum<< std::endl; #endif + */ /////////////////////// - + /* + printf("DEBUG: calling 3.5 \n"); // Non-lattice (const objects) * Lattice ColourMatrix cm; SpinColourMatrix scm; @@ -217,6 +225,7 @@ int main (int argc, char ** argv) vscm = vscm*cplx; scMat = scMat*cplx; + printf("DEBUG: calling 3.7 \n"); scm = cplx*scm; vscm = cplx*vscm; scMat = cplx*scMat; @@ -224,12 +233,14 @@ int main (int argc, char ** argv) vscm = myint*vscm; scMat = scMat*myint; + printf("DEBUG: calling 3.9 \n"); scm = scm*mydouble; vscm = vscm*mydouble; scMat = scMat*mydouble; scMat = mydouble*scMat; cMat = mydouble*cMat; - + + printf("DEBUG: calling 4 \n"); sMat = adj(sMat); // LatticeSpinMatrix adjoint sMat = iGammaFive*sMat; // SpinMatrix * LatticeSpinMatrix sMat = GammaFive*sMat; // SpinMatrix * LatticeSpinMatrix @@ -240,6 +251,9 @@ int main (int argc, char ** argv) scm=transpose(scm); scm=transposeIndex<1>(scm); + + + // Foo = Foo+scalar; // LatticeColourMatrix+Scalar // Foo = Foo*scalar; // LatticeColourMatrix*Scalar // Foo = Foo-scalar; // LatticeColourMatrix-Scalar @@ -279,7 +293,8 @@ int main (int argc, char ** argv) pokeIndex<1> (c_m,c,0,0); } - + */ + FooBar = Bar; /* @@ -332,14 +347,14 @@ int main (int argc, char ** argv) // Lattice SU(3) x SU(3) Fine.Barrier(); FooBar = Foo * Bar; - + // Lattice 12x12 GEMM scFooBar = scFoo * scBar; - + // Benchmark some simple operations LatticeSU3 * Lattice SU3. 
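  // A sketch of where the flop count below can come from (our arithmetic,
  // not a formula quoted from this file): a complex multiply costs 6 real
  // flops (4 mul + 2 add) and a complex add costs 2, so an Nc x Nc complex
  // matrix-matrix product costs, per lattice site,
  //   6*Nc*Nc*Nc + 2*Nc*Nc*(Nc-1)  real flops,
  // i.e. 198 flops/site for Nc=3, times volume, times ncall calls.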
double t0,t1,flops; double bytes; - int ncall=100; + int ncall=5000; int Nc = Grid::QCD::Nc; LatticeGaugeField U(&Fine); @@ -351,19 +366,21 @@ int main (int argc, char ** argv) if ( Fine.IsBoss() ) { printf("%f flop and %f bytes\n",flops,bytes/ncall); } - FooBar = Foo * Bar; + FooBar = Foo * Bar; Fine.Barrier(); t0=usecond(); for(int i=0;i Date: Tue, 26 May 2015 12:02:54 +0900 Subject: [PATCH 2/4] checked performance of new vector libaries. Added check for c++11 support on the configure.ac --- Makefile.in | 4 +- aclocal.m4 | 1 + configure | 186 ++++++++++++++++++++++++++++++++ configure.ac | 4 +- lib/Grid_config.h | 3 + lib/Grid_config.h.in | 3 + lib/Grid_simd.h | 4 + lib/cshift/Grid_cshift_common.h | 12 ++- lib/math/Grid_math_tensors.h | 14 +++ lib/simd/Grid_avx.h | 6 +- m4/ax_cxx_compile_stdcxx_11.m4 | 167 ++++++++++++++++++++++++++++ tests/Grid_main.cc | 53 +-------- 12 files changed, 398 insertions(+), 59 deletions(-) create mode 100644 m4/ax_cxx_compile_stdcxx_11.m4 diff --git a/Makefile.in b/Makefile.in index b6894ef6..d473c2df 100644 --- a/Makefile.in +++ b/Makefile.in @@ -84,7 +84,8 @@ DIST_COMMON = INSTALL NEWS README AUTHORS ChangeLog \ $(top_srcdir)/configure $(am__configure_deps) COPYING TODO \ compile config.guess config.sub depcomp install-sh missing ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/configure.ac +am__aclocal_m4_deps = $(top_srcdir)/m4/ax_cxx_compile_stdcxx_11.m4 \ + $(top_srcdir)/configure.ac am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ $(ACLOCAL_M4) am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \ @@ -212,6 +213,7 @@ ECHO_T = @ECHO_T@ EGREP = @EGREP@ EXEEXT = @EXEEXT@ GREP = @GREP@ +HAVE_CXX11 = @HAVE_CXX11@ INSTALL = @INSTALL@ INSTALL_DATA = @INSTALL_DATA@ INSTALL_PROGRAM = @INSTALL_PROGRAM@ diff --git a/aclocal.m4 b/aclocal.m4 index 389763bf..a3d1bc9c 100644 --- a/aclocal.m4 +++ b/aclocal.m4 @@ -1083,3 +1083,4 @@ AC_SUBST([am__tar]) AC_SUBST([am__untar]) ]) # _AM_PROG_TAR +m4_include([m4/ax_cxx_compile_stdcxx_11.m4]) diff --git a/configure b/configure index 8c9e8c59..b7bd49f0 100755 --- a/configure +++ b/configure @@ -633,6 +633,7 @@ BUILD_COMMS_MPI_TRUE EGREP GREP CXXCPP +HAVE_CXX11 RANLIB OPENMP_CXXFLAGS am__fastdepCXX_FALSE @@ -3965,6 +3966,191 @@ else RANLIB="$ac_cv_prog_RANLIB" fi + ax_cxx_compile_cxx11_required=true + ac_ext=cpp +ac_cpp='$CXXCPP $CPPFLAGS' +ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu + ac_success=no + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX supports C++11 features by default" >&5 +$as_echo_n "checking whether $CXX supports C++11 features by default... " >&6; } +if ${ax_cv_cxx_compile_cxx11+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + + template + struct check + { + static_assert(sizeof(int) <= sizeof(T), "not big enough"); + }; + + struct Base { + virtual void f() {} + }; + struct Child : public Base { + virtual void f() override {} + }; + + typedef check> right_angle_brackets; + + int a; + decltype(a) b; + + typedef check check_type; + check_type c; + check_type&& cr = static_cast(c); + + auto d = a; + auto l = [](){}; + // Prevent Clang error: unused variable 'l' [-Werror,-Wunused-variable] + struct use_l { use_l() { l(); } }; + + // http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae + // Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function because of this + namespace test_template_alias_sfinae { + struct foo {}; + + template + using member = typename T::member_type; + + template + void func(...) {} + + template + void func(member*) {} + + void test(); + + void test() { + func(0); + } + } + +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + ax_cv_cxx_compile_cxx11=yes +else + ax_cv_cxx_compile_cxx11=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_cxx_compile_cxx11" >&5 +$as_echo "$ax_cv_cxx_compile_cxx11" >&6; } + if test x$ax_cv_cxx_compile_cxx11 = xyes; then + ac_success=yes + fi + + + + if test x$ac_success = xno; then + for switch in -std=c++11 -std=c++0x +std=c++11; do + cachevar=`$as_echo "ax_cv_cxx_compile_cxx11_$switch" | $as_tr_sh` + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX supports C++11 features with $switch" >&5 +$as_echo_n "checking whether $CXX supports C++11 features with $switch... " >&6; } +if eval \${$cachevar+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_save_CXXFLAGS="$CXXFLAGS" + CXXFLAGS="$CXXFLAGS $switch" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + + template + struct check + { + static_assert(sizeof(int) <= sizeof(T), "not big enough"); + }; + + struct Base { + virtual void f() {} + }; + struct Child : public Base { + virtual void f() override {} + }; + + typedef check> right_angle_brackets; + + int a; + decltype(a) b; + + typedef check check_type; + check_type c; + check_type&& cr = static_cast(c); + + auto d = a; + auto l = [](){}; + // Prevent Clang error: unused variable 'l' [-Werror,-Wunused-variable] + struct use_l { use_l() { l(); } }; + + // http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae + // Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function because of this + namespace test_template_alias_sfinae { + struct foo {}; + + template + using member = typename T::member_type; + + template + void func(...) 
{} + + template + void func(member*) {} + + void test(); + + void test() { + func(0); + } + } + +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + eval $cachevar=yes +else + eval $cachevar=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + CXXFLAGS="$ac_save_CXXFLAGS" +fi +eval ac_res=\$$cachevar + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } + if eval test x\$$cachevar = xyes; then + CXXFLAGS="$CXXFLAGS $switch" + ac_success=yes + break + fi + done + fi + ac_ext=cpp +ac_cpp='$CXXCPP $CPPFLAGS' +ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu + + if test x$ax_cxx_compile_cxx11_required = xtrue; then + if test x$ac_success = xno; then + as_fn_error $? "*** A compiler with support for C++11 language features is required." "$LINENO" 5 + fi + else + if test x$ac_success = xno; then + HAVE_CXX11=0 + { $as_echo "$as_me:${as_lineno-$LINENO}: No compiler with C++11 support was found" >&5 +$as_echo "$as_me: No compiler with C++11 support was found" >&6;} + else + HAVE_CXX11=1 + +$as_echo "#define HAVE_CXX11 1" >>confdefs.h + + fi + + + fi + + # Checks for libraries. #AX_GCC_VAR_ATTRIBUTE(aligned) diff --git a/configure.ac b/configure.ac index bfcbfcef..54a36eb1 100644 --- a/configure.ac +++ b/configure.ac @@ -3,7 +3,7 @@ # # Project Grid package # -# Time-stamp: <2015-05-22 15:46:09 neo> +# Time-stamp: <2015-05-25 14:54:34 neo> AC_PREREQ([2.63]) AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk]) @@ -26,6 +26,8 @@ AC_LANG(C++) AC_PROG_CXX AC_OPENMP AC_PROG_RANLIB +AX_CXX_COMPILE_STDCXX_11(noext, mandatory) + # Checks for libraries. #AX_GCC_VAR_ATTRIBUTE(aligned) diff --git a/lib/Grid_config.h b/lib/Grid_config.h index 78582f3e..2397894f 100644 --- a/lib/Grid_config.h +++ b/lib/Grid_config.h @@ -16,6 +16,9 @@ /* GRID_COMMS_NONE */ #define GRID_COMMS_NONE 1 +/* define if the compiler supports basic C++11 syntax */ +/* #undef HAVE_CXX11 */ + /* Define to 1 if you have the declaration of `be64toh', and to 0 if you don't. */ #define HAVE_DECL_BE64TOH 1 diff --git a/lib/Grid_config.h.in b/lib/Grid_config.h.in index b7f56d5b..6f05d6cb 100644 --- a/lib/Grid_config.h.in +++ b/lib/Grid_config.h.in @@ -15,6 +15,9 @@ /* GRID_COMMS_NONE */ #undef GRID_COMMS_NONE +/* define if the compiler supports basic C++11 syntax */ +#undef HAVE_CXX11 + /* Define to 1 if you have the declaration of `be64toh', and to 0 if you don't. 
*/ #undef HAVE_DECL_BE64TOH diff --git a/lib/Grid_simd.h b/lib/Grid_simd.h index 9484c0d6..bdcb2d1b 100644 --- a/lib/Grid_simd.h +++ b/lib/Grid_simd.h @@ -103,6 +103,10 @@ namespace Grid { inline void sub (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); } inline void add (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); } + inline void vstream(ComplexF &l, const ComplexF &r){ l=r;} + inline void vstream(ComplexD &l, const ComplexD &r){ l=r;} + inline void vstream(RealF &l, const RealF &r){ l=r;} + inline void vstream(RealD &l, const RealD &r){ l=r;} class Zero{}; diff --git a/lib/cshift/Grid_cshift_common.h b/lib/cshift/Grid_cshift_common.h index 65c0cb87..90fc10b7 100644 --- a/lib/cshift/Grid_cshift_common.h +++ b/lib/cshift/Grid_cshift_common.h @@ -160,13 +160,21 @@ template void Copy_plane(Lattice& lhs,Lattice &rhs, int PARALLEL_NESTED_LOOP2 for(int n=0;n_slice_nblock[dimension];n++){ for(int b=0;b_slice_block[dimension];b++){ - + /* int o =n*rhs._grid->_slice_stride[dimension]; int ocb=1<CheckerBoardFromOindex(o+b); if ( ocb&cbmask ) { lhs._odata[lo+o+b]=rhs._odata[ro+o+b]; } - + */ + + int o =n*rhs._grid->_slice_stride[dimension]+b; + int ocb=1<CheckerBoardFromOindex(o); + if ( ocb&cbmask ) { + //lhs._odata[lo+o]=rhs._odata[ro+o]; + vstream(lhs._odata[lo+o],rhs._odata[ro+o]); + } + } } diff --git a/lib/math/Grid_math_tensors.h b/lib/math/Grid_math_tensors.h index a0424576..94cd3d58 100644 --- a/lib/math/Grid_math_tensors.h +++ b/lib/math/Grid_math_tensors.h @@ -38,6 +38,10 @@ public: iScalar(scalar_type s) : _internal(s) {};// recurse down and hit the constructor for vector_type iScalar(const Zero &z){ *this = zero; }; + + + + iScalar & operator= (const Zero &hero){ zeroit(*this); return *this; @@ -206,6 +210,16 @@ public: iMatrix(const Zero &z){ *this = zero; }; iMatrix() =default; + // No copy constructor... + + iMatrix& operator=(const iMatrix& rhs){ + for(int i=0;i & operator= (const Zero &hero){ zeroit(*this); diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index ea796122..303f30c5 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -4,7 +4,7 @@ Using intrinsics */ -// Time-stamp: <2015-05-22 15:51:24 neo> +// Time-stamp: <2015-05-22 18:58:27 neo> //---------------------------------------------------------------------- #include @@ -307,9 +307,7 @@ namespace Optimization { conv.v = b; switch (perm){ // 8x32 bits=>3 permutes - case 2: - conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); - break; + case 2: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break; case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break; case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break; default: assert(0); break; diff --git a/m4/ax_cxx_compile_stdcxx_11.m4 b/m4/ax_cxx_compile_stdcxx_11.m4 new file mode 100644 index 00000000..395b13d2 --- /dev/null +++ b/m4/ax_cxx_compile_stdcxx_11.m4 @@ -0,0 +1,167 @@ +# ============================================================================ +# http://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx_11.html +# ============================================================================ +# +# SYNOPSIS +# +# AX_CXX_COMPILE_STDCXX_11([ext|noext],[mandatory|optional]) +# +# DESCRIPTION +# +# Check for baseline language coverage in the compiler for the C++11 +# standard; if necessary, add switches to CXXFLAGS to enable support. 
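# For example, the invocation this patch adds to configure.ac is
#
#   AX_CXX_COMPILE_STDCXX_11(noext, mandatory)
#
# i.e. (per the argument descriptions below) insist on a strict -std=c++11
# style mode and abort configure if the compiler cannot provide one.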
+# +# The first argument, if specified, indicates whether you insist on an +# extended mode (e.g. -std=gnu++11) or a strict conformance mode (e.g. +# -std=c++11). If neither is specified, you get whatever works, with +# preference for an extended mode. +# +# The second argument, if specified 'mandatory' or if left unspecified, +# indicates that baseline C++11 support is required and that the macro +# should error out if no mode with that support is found. If specified +# 'optional', then configuration proceeds regardless, after defining +# HAVE_CXX11 if and only if a supporting mode is found. +# +# LICENSE +# +# Copyright (c) 2008 Benjamin Kosnik +# Copyright (c) 2012 Zack Weinberg +# Copyright (c) 2013 Roy Stogner +# Copyright (c) 2014, 2015 Google Inc.; contributed by Alexey Sokolov +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. + +#serial 11 + +m4_define([_AX_CXX_COMPILE_STDCXX_11_testbody], [[ + template + struct check + { + static_assert(sizeof(int) <= sizeof(T), "not big enough"); + }; + + struct Base { + virtual void f() {} + }; + struct Child : public Base { + virtual void f() override {} + }; + + typedef check> right_angle_brackets; + + int a; + decltype(a) b; + + typedef check check_type; + check_type c; + check_type&& cr = static_cast(c); + + auto d = a; + auto l = [](){}; + // Prevent Clang error: unused variable 'l' [-Werror,-Wunused-variable] + struct use_l { use_l() { l(); } }; + + // http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae + // Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function because of this + namespace test_template_alias_sfinae { + struct foo {}; + + template + using member = typename T::member_type; + + template + void func(...) 
{} + + template + void func(member*) {} + + void test(); + + void test() { + func(0); + } + } +]]) + +AC_DEFUN([AX_CXX_COMPILE_STDCXX_11], [dnl + m4_if([$1], [], [], + [$1], [ext], [], + [$1], [noext], [], + [m4_fatal([invalid argument `$1' to AX_CXX_COMPILE_STDCXX_11])])dnl + m4_if([$2], [], [ax_cxx_compile_cxx11_required=true], + [$2], [mandatory], [ax_cxx_compile_cxx11_required=true], + [$2], [optional], [ax_cxx_compile_cxx11_required=false], + [m4_fatal([invalid second argument `$2' to AX_CXX_COMPILE_STDCXX_11])]) + AC_LANG_PUSH([C++])dnl + ac_success=no + AC_CACHE_CHECK(whether $CXX supports C++11 features by default, + ax_cv_cxx_compile_cxx11, + [AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])], + [ax_cv_cxx_compile_cxx11=yes], + [ax_cv_cxx_compile_cxx11=no])]) + if test x$ax_cv_cxx_compile_cxx11 = xyes; then + ac_success=yes + fi + + m4_if([$1], [noext], [], [dnl + if test x$ac_success = xno; then + for switch in -std=gnu++11 -std=gnu++0x; do + cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx11_$switch]) + AC_CACHE_CHECK(whether $CXX supports C++11 features with $switch, + $cachevar, + [ac_save_CXXFLAGS="$CXXFLAGS" + CXXFLAGS="$CXXFLAGS $switch" + AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])], + [eval $cachevar=yes], + [eval $cachevar=no]) + CXXFLAGS="$ac_save_CXXFLAGS"]) + if eval test x\$$cachevar = xyes; then + CXXFLAGS="$CXXFLAGS $switch" + ac_success=yes + break + fi + done + fi]) + + m4_if([$1], [ext], [], [dnl + if test x$ac_success = xno; then + dnl HP's aCC needs +std=c++11 according to: + dnl http://h21007.www2.hp.com/portal/download/files/unprot/aCxx/PDF_Release_Notes/769149-001.pdf + for switch in -std=c++11 -std=c++0x +std=c++11; do + cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx11_$switch]) + AC_CACHE_CHECK(whether $CXX supports C++11 features with $switch, + $cachevar, + [ac_save_CXXFLAGS="$CXXFLAGS" + CXXFLAGS="$CXXFLAGS $switch" + AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])], + [eval $cachevar=yes], + [eval $cachevar=no]) + CXXFLAGS="$ac_save_CXXFLAGS"]) + if eval test x\$$cachevar = xyes; then + CXXFLAGS="$CXXFLAGS $switch" + ac_success=yes + break + fi + done + fi]) + AC_LANG_POP([C++]) + if test x$ax_cxx_compile_cxx11_required = xtrue; then + if test x$ac_success = xno; then + AC_MSG_ERROR([*** A compiler with support for C++11 language features is required.]) + fi + else + if test x$ac_success = xno; then + HAVE_CXX11=0 + AC_MSG_NOTICE([No compiler with C++11 support was found]) + else + HAVE_CXX11=1 + AC_DEFINE(HAVE_CXX11,1, + [define if the compiler supports basic C++11 syntax]) + fi + + AC_SUBST(HAVE_CXX11) + fi +]) diff --git a/tests/Grid_main.cc b/tests/Grid_main.cc index c1f1c5d0..fd198f30 100644 --- a/tests/Grid_main.cc +++ b/tests/Grid_main.cc @@ -105,7 +105,7 @@ int main (int argc, char ** argv) fflush(stdout); - /* + cVec = cMat * cVec; // LatticeColourVector = LatticeColourMatrix * LatticeColourVector sVec = sMat * sVec; // LatticeSpinVector = LatticeSpinMatrix * LatticeSpinVector scVec= scMat * scVec;// LatticeSpinColourVector = LatticeSpinColourMatrix * LatticeSpinColourVector @@ -146,7 +146,7 @@ int main (int argc, char ** argv) scalar=trace(scalar); scalar=localInnerProduct(cVec,cVec); scalar=localNorm2(cVec); - */ + // -=,+=,*=,() // add,+,sub,-,mult,mac,* // adj,conjugate @@ -162,50 +162,7 @@ int main (int argc, char ** argv) scMat = sMat*scMat; // LatticeSpinColourMatrix = LatticeSpinMatrix * LatticeSpinColourMatrix - /* -#ifdef SSE4 - ///////// Tests the new class Grid_simd - 
diff --git a/tests/Grid_main.cc b/tests/Grid_main.cc
index c1f1c5d0..fd198f30 100644
--- a/tests/Grid_main.cc
+++ b/tests/Grid_main.cc
@@ -105,7 +105,7 @@ int main (int argc, char ** argv)
   fflush(stdout);
 
-  /*
+
   cVec = cMat * cVec;  // LatticeColourVector     = LatticeColourMatrix     * LatticeColourVector
   sVec = sMat * sVec;  // LatticeSpinVector       = LatticeSpinMatrix       * LatticeSpinVector
   scVec= scMat * scVec;// LatticeSpinColourVector = LatticeSpinColourMatrix * LatticeSpinColourVector
@@ -146,7 +146,7 @@ int main (int argc, char ** argv)
   scalar=trace(scalar);
   scalar=localInnerProduct(cVec,cVec);
   scalar=localNorm2(cVec);
-  */
+
   //  -=,+=,*=,()
   //  add,+,sub,-,mult,mac,*
   //  adj,conjugate
@@ -162,50 +162,7 @@ int main (int argc, char ** argv)
   scMat = sMat*scMat;       // LatticeSpinColourMatrix = LatticeSpinMatrix * LatticeSpinColourMatrix
 
-  /*
-#ifdef SSE4
-  ///////// Tests the new class Grid_simd
-  std::complex<double> ctest(3.0,2.0);
-  std::complex<float> ctestf(3.0,2.0);
-  MyComplexF TestMe1(1.0); // fills only real part
-  MyComplexD TestMe2(ctest);
-  MyComplexD TestMe3(ctest);// compiler generate conversion of basic types
-  //MyRealF TestMe5(ctest);// Must generate compiler error
-  MyRealD TestRe1(2.0);
-  MyRealF TestRe2(3.0);
-  vone(TestRe2);
-
-  MyComplexF TestMe6(ctestf);
-  MyComplexF TestMe7(ctestf);
-
-  MyComplexD TheSum= TestMe2*TestMe3;
-  MyComplexF TheSumF= TestMe6*TestMe7;
-
-  double dsum[2];
-  _mm_store_pd(dsum, TheSum.v);
-  for (int i =0; i< 2; i++)
-    printf("%f\n", dsum[i]);
-
-  MyComplexD TheSumI  = timesMinusI(TheSum);
-  MyComplexF TheSumIF = timesMinusI(TheSumF);
-
-  float fsum[4];
-  _mm_store_ps(fsum, TheSumF.v);
-  for (int i =0; i< 4; i++)
-    printf("%f\n", fsum[i]);
-
-  vstore(TheSumI, &ctest);
-  std::complex<float> sum = Reduce(TheSumF);
-  std::cout << ctest<< std::endl;
-  std::cout << sum<< std::endl;
-
-#endif
-  */
   ///////////////////////
-  /*
-  printf("DEBUG: calling 3.5 \n");
   // Non-lattice (const objects) * Lattice
   ColourMatrix cm;
   SpinColourMatrix scm;
@@ -225,7 +182,6 @@ int main (int argc, char ** argv)
   vscm = vscm*cplx;
   scMat = scMat*cplx;
 
-  printf("DEBUG: calling 3.7 \n");
   scm = cplx*scm;
   vscm = cplx*vscm;
   scMat = cplx*scMat;
@@ -233,14 +189,12 @@ int main (int argc, char ** argv)
   vscm = myint*vscm;
   scMat = scMat*myint;
 
-  printf("DEBUG: calling 3.9 \n");
   scm = scm*mydouble;
   vscm = vscm*mydouble;
   scMat = scMat*mydouble;
   scMat = mydouble*scMat;
   cMat = mydouble*cMat;
 
-  printf("DEBUG: calling 4 \n");
   sMat = adj(sMat);       // LatticeSpinMatrix adjoint
   sMat = iGammaFive*sMat; // SpinMatrix * LatticeSpinMatrix
   sMat = GammaFive*sMat;  // SpinMatrix * LatticeSpinMatrix
@@ -293,8 +247,6 @@ int main (int argc, char ** argv)
     pokeIndex<1> (c_m,c,0,0);
   }
-  */
-
   FooBar = Bar;
 
   /*
@@ -392,7 +344,6 @@ int main (int argc, char ** argv)
     t0=usecond();
     for(int i=0;i<Nloop;i++){

From: neo
Date: Tue, 26 May 2015 13:31:10 +0900
Subject: [PATCH 3/4] Cleaning up simd files

---
 TODO                         |  4 +-
 lib/Grid_simd.h              | 90 ------------------------------
 lib/simd/Grid_vector_types.h | 66 ++++++++++++++++++++++++--
 3 files changed, 64 insertions(+), 96 deletions(-)

diff --git a/TODO b/TODO
index ed4dedd4..670c53e3 100644
--- a/TODO
+++ b/TODO
@@ -1,8 +1,8 @@
 ================================================================
 *** Hacks and bug fixes to clean up and Audits
 ================================================================
-* Base class to share common code between vRealF, VComplexF etc...
-  - Performance check on Guido's reimplementation strategy
+* Base class to share common code between vRealF, VComplexF etc... done
+  - Performance check on Guido's reimplementation strategy
   - (GUIDO) tested and no difference was found, merged
 
 * FIXME audit
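For context, the "base class to share common code" item ticked off above is
the single Grid_simd template that the earlier patch introduced in
lib/simd/Grid_vector_types.h: one class parameterised on the scalar type and
on the architecture's register type, from which the old hand-written vector
classes become plain typedefs. A minimal sketch of the shape (illustrative
only, not the full Grid class; the SIMD_*type/Integer typedefs here stand in
for the per-architecture definitions Grid selects at configure time):

    #include <immintrin.h>
    #include <complex>

    // Sketch: assume an AVX build for concreteness.
    typedef __m256       SIMD_Ftype;
    typedef __m256d      SIMD_Dtype;
    typedef __m256i      SIMD_Itype;
    typedef unsigned int Integer;     // stand-in for Grid's integer scalar

    template <class Scalar_type, class Vector_type>
    class Grid_simd {
    public:
      Vector_type v;  // one hardware register
      // Number of scalars packed into one register.
      static inline int Nsimd(void) { return sizeof(Vector_type)/sizeof(Scalar_type); }
    };

    // The five hand-written classes collapse to typedefs over one implementation.
    typedef Grid_simd< float                , SIMD_Ftype > vRealF;
    typedef Grid_simd< double               , SIMD_Dtype > vRealD;
    typedef Grid_simd< std::complex<float>  , SIMD_Ftype > vComplexF;
    typedef Grid_simd< std::complex<double> , SIMD_Dtype > vComplexD;
    typedef Grid_simd< Integer              , SIMD_Itype > vInteger;

The next diff deletes the now-redundant per-architecture typedefs and the
Gpermute helper from Grid_simd.h, moving the latter next to the class it
serves.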
diff --git a/lib/Grid_simd.h b/lib/Grid_simd.h
index 19b30f03..39eb4654 100644
--- a/lib/Grid_simd.h
+++ b/lib/Grid_simd.h
@@ -95,100 +95,10 @@ namespace Grid {
   template<> inline void zeroit(RealF &arg){ arg=0; };
   template<> inline void zeroit(RealD &arg){ arg=0; };
 
-  // Eventually delete this part
-#if defined (SSE4)
-  typedef __m128  fvec;
-  typedef __m128d dvec;
-  typedef __m128  cvec;
-  typedef __m128d zvec;
-  typedef __m128i ivec;
-#endif
-#if defined (AVX1) || defined (AVX2)
-  typedef __m256  fvec;
-  typedef __m256d dvec;
-  typedef __m256  cvec;
-  typedef __m256d zvec;
-  typedef __m256i ivec;
-#endif
-#if defined (AVX512)
-  typedef __m512  fvec;
-  typedef __m512d dvec;
-  typedef __m512  cvec;
-  typedef __m512d zvec;
-  typedef __m512i ivec;
-#endif
-#if defined (QPX)
-  typedef float  fvec __attribute__ ((vector_size (16))); // QPX has same SIMD width irrespective of precision
-  typedef float  cvec __attribute__ ((vector_size (16)));
-
-  typedef vector4double dvec;
-  typedef vector4double zvec;
-#endif
-
-#if defined (AVX1) || defined (AVX2) || defined (AVX512)
-  inline void v_prefetch0(int size, const char *ptr){
-    for(int i=0;i<size;i+=64){ //  Define L1 linesize above
-      _mm_prefetch(ptr+i+4096,_MM_HINT_T1);
-      _mm_prefetch(ptr+i+512,_MM_HINT_T0);
-    }
-  }
-#endif
-
-//////////////////////////////////////////////////////////
-// Permute
-// Permute 0 every ABCDEFGH -> BA DC FE HG
-// Permute 1 every ABCDEFGH -> CD AB GH EF
-// Permute 2 every ABCDEFGH -> EFGH ABCD
-// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
-// Permute 4 possible on half precision @512bit vectors.
-//////////////////////////////////////////////////////////
-template<class vsimd>
-inline void Gpermute(vsimd &y,const vsimd &b,int perm){
-  union {
-    fvec f;
-    decltype(vsimd::v) v;
-  } conv;
-  conv.v = b.v;
-  switch (perm){
-#if defined(AVX1)||defined(AVX2)
-    // 8x32 bits=>3 permutes
-  case 2:
-    conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1));
-    break;
-  case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
-  case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break;
-#endif
-#ifdef SSE4
-  case 1: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
-  case 0: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
-#endif
-#ifdef AVX512
-    // 16 floats=> permutes
-    // Permute 0 every abcd efgh ijkl mnop -> badc fehg jilk nmpo
-    // Permute 1 every abcd efgh ijkl mnop -> cdab ghef jkij opmn
-    // Permute 2 every abcd efgh ijkl mnop -> efgh abcd mnop ijkl
-    // Permute 3 every abcd efgh ijkl mnop -> ijkl mnop abcd efgh
-  case 3: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); break;
-  case 2: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); break;
-  case 1: conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
-  case 0: conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
-#endif
-#ifdef QPX
-#error not implemented
-#endif
-  default: assert(0); break;
-  }
-  y.v=conv.v;
-
-};
 };
 
 #include <simd/Grid_vector_types.h>
 
-
 namespace Grid {
 
   // NB: Template the following on "type Complex" and then implement *,+,- for
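A note on Gpermute, which the next diff moves verbatim into
Grid_vector_types.h: although it is templated on the whole SIMD class, every
case does its work through a union that reinterprets the register as the
single-precision vector type (the old global fvec above; SIMD_Ftype after the
move, defined per architecture in Grid_vector_types.h). One set of float
shuffles per permute level therefore serves single, double, integer and
complex alike, since the permutes only rearrange 32-bit lanes and the pun
relies on all register types sharing the same width. A stripped-down
illustration of the same type-punning idea (a sketch assuming an AVX build,
not the Grid code itself):

    #include <immintrin.h>

    // View any 256-bit register as 8 float lanes, shuffle, view back.
    template <class vtype>
    inline vtype swap_halves(vtype in) {
      union { __m256 f; vtype v; } conv;  // pure reinterpretation, no conversion
      conv.v = in;
      conv.f = _mm256_permute2f128_ps(conv.f, conv.f, 0x01); // ABCDEFGH -> EFGHABCD
      return conv.v;
    }

    // Works unchanged for doubles:
    //   __m256d x = _mm256_set_pd(3.,2.,1.,0.);  // lanes {0,1,2,3}
    //   __m256d y = swap_halves(x);              // lanes {2,3,0,1}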
diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index a77cf963..3664e0f7 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -2,7 +2,7 @@
 /*! @file Grid_vector_types.h
   @brief Defines templated class Grid_simd to deal with inner vector types
 */
-// Time-stamp: <2015-05-26 12:05:39 neo>
+// Time-stamp: <2015-05-26 13:22:36 neo>
 //---------------------------------------------------------------------------
 #ifndef GRID_VECTOR_TYPES
 #define GRID_VECTOR_TYPES
@@ -16,7 +16,9 @@
 #if defined AVX512
 #include "Grid_knc.h"
 #endif
-
+#if defined QPX
+#include "Grid_qpx.h"
+#endif
 
 namespace Grid {
 
@@ -33,8 +35,6 @@ namespace Grid {
   template <typename Condition, typename ReturnType> using EnableIf    = Invoke<std::enable_if<Condition::value, ReturnType>>;
   template <typename Condition, typename ReturnType> using NotEnableIf = Invoke<std::enable_if<!Condition::value, ReturnType>>;
 
-
-
   ////////////////////////////////////////////////////////
   // Check for complexity with type traits
   template <typename T> struct is_complex : std::false_type {};
@@ -57,6 +57,58 @@ namespace Grid {
 ///////////////////////////////////////////////
 
+//////////////////////////////////////////////////////////
+// Permute
+// Permute 0 every ABCDEFGH -> BA DC FE HG
+// Permute 1 every ABCDEFGH -> CD AB GH EF
+// Permute 2 every ABCDEFGH -> EFGH ABCD
+// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
+// Permute 4 possible on half precision @512bit vectors.
+//////////////////////////////////////////////////////////
+template<class vsimd>
+inline void Gpermute(vsimd &y,const vsimd &b,int perm){
+  union {
+    SIMD_Ftype f;
+    decltype(vsimd::v) v;
+  } conv;
+  conv.v = b.v;
+  switch (perm){
+#if defined(AVX1)||defined(AVX2)
+    // 8x32 bits=>3 permutes
+  case 2:
+    conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1));
+    break;
+  case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
+  case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break;
+#endif
+#ifdef SSE4
+  case 1: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
+  case 0: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
+#endif
+#ifdef AVX512
+    // 16 floats=> permutes
+    // Permute 0 every abcd efgh ijkl mnop -> badc fehg jilk nmpo
+    // Permute 1 every abcd efgh ijkl mnop -> cdab ghef jkij opmn
+    // Permute 2 every abcd efgh ijkl mnop -> efgh abcd mnop ijkl
+    // Permute 3 every abcd efgh ijkl mnop -> ijkl mnop abcd efgh
+  case 3: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); break;
+  case 2: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); break;
+  case 1: conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
+  case 0: conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
+#endif
+#ifdef QPX
+#error not implemented
+#endif
+  default: assert(0); break;
+  }
+  y.v=conv.v;
+
+};
+
+///////////////////////////////////////
+
+
 /*
   @brief Grid_simd class for the SIMD vector type operations
 */
@@ -380,6 +432,12 @@ namespace Grid {
   typedef Grid_simd< std::complex< double >, SIMD_Dtype > vComplexD;
   typedef Grid_simd< Integer               , SIMD_Itype > vInteger;
 
+
+
+
+
+
+
 }
 #endif

From 500f6ed0c5eb627c3abb265c595b67eb876ef11a Mon Sep 17 00:00:00 2001
From: neo
Date: Tue, 26 May 2015 13:54:34 +0900
Subject: [PATCH 4/4] More cleanup of Grid_simd.h

---
 lib/Grid_simd.h              | 128 ++++++++++++-----------------------
 lib/simd/Grid_vector_types.h |  14 +++-
 2 files changed, 57 insertions(+), 85 deletions(-)

diff --git a/lib/Grid_simd.h b/lib/Grid_simd.h
index 39eb4654..cccc82e0 100644
--- a/lib/Grid_simd.h
+++ b/lib/Grid_simd.h
@@ -44,49 +44,49 @@ namespace Grid {
   inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conjugate(l)*r; }
   inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; }
   inline RealF innerProduct(const RealF & l,
const RealF & r) { return l*r; } - - //////////////////////////////////////////////////////////////////////////////// - //Provide support functions for basic real and complex data types required by Grid - //Single and double precision versions. Should be able to template this once only. - //////////////////////////////////////////////////////////////////////////////// - inline void mac (ComplexD * __restrict__ y,const ComplexD * __restrict__ a,const ComplexD *__restrict__ x){ *y = (*a) * (*x)+(*y); }; - inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);} - inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);} - inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);} - // conjugate already supported for complex - - inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); } - inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); } - inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); } - inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); } - - //conjugate already supported for complex - - inline ComplexF timesI(const ComplexF &r) { return(r*ComplexF(0.0,1.0));} - inline ComplexD timesI(const ComplexD &r) { return(r*ComplexD(0.0,1.0));} - inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));} - inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));} - inline void timesI(ComplexF &ret,const ComplexF &r) { ret = timesI(r);} - inline void timesI(ComplexD &ret,const ComplexD &r) { ret = timesI(r);} - inline void timesMinusI(ComplexF &ret,const ComplexF &r){ ret = timesMinusI(r);} - inline void timesMinusI(ComplexD &ret,const ComplexD &r){ ret = timesMinusI(r);} - - inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);} - inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);} - inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);} - inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);} - - inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){ *y = (*a) * (*x)+(*y); } - inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); } - inline void sub (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); } - inline void add (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); } - - inline void vstream(ComplexF &l, const ComplexF &r){ l=r;} - inline void vstream(ComplexD &l, const ComplexD &r){ l=r;} - inline void vstream(RealF &l, const RealF &r){ l=r;} - inline void vstream(RealD &l, const RealD &r){ l=r;} - - + + //////////////////////////////////////////////////////////////////////////////// + //Provide support functions for basic real and complex data types required by Grid + //Single and double 
precision versions. Should be able to template this once only.
+  ////////////////////////////////////////////////////////////////////////////////
+  inline void mac (ComplexD * __restrict__ y,const ComplexD * __restrict__ a,const ComplexD *__restrict__ x){ *y = (*a) * (*x)+(*y); };
+  inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);}
+  inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);}
+  inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);}
+  // conjugate already supported for complex
+
+  inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
+  inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
+  inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
+  inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
+
+  //conjugate already supported for complex
+
+  inline ComplexF timesI(const ComplexF &r)     { return(r*ComplexF(0.0,1.0));}
+  inline ComplexD timesI(const ComplexD &r)     { return(r*ComplexD(0.0,1.0));}
+  inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
+  inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}
+  inline void timesI(ComplexF &ret,const ComplexF &r)     { ret = timesI(r);}
+  inline void timesI(ComplexD &ret,const ComplexD &r)     { ret = timesI(r);}
+  inline void timesMinusI(ComplexF &ret,const ComplexF &r){ ret = timesMinusI(r);}
+  inline void timesMinusI(ComplexD &ret,const ComplexD &r){ ret = timesMinusI(r);}
+
+  inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);}
+  inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);}
+  inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);}
+  inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);}
+
+  inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
+  inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
+  inline void sub (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
+  inline void add (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
+
+  inline void vstream(ComplexF &l, const ComplexF &r){ l=r;}
+  inline void vstream(ComplexD &l, const ComplexD &r){ l=r;}
+  inline void vstream(RealF &l, const RealF &r){ l=r;}
+  inline void vstream(RealD &l, const RealD &r){ l=r;}
+
+
   class Zero{};
   static Zero zero;
   template<class itype> inline void zeroit(itype &arg){ arg=zero;};
@@ -94,52 +94,12 @@ namespace Grid {
   template<> inline void zeroit(ComplexD &arg){ arg=0; };
   template<> inline void zeroit(RealF &arg){ arg=0; };
   template<> inline void zeroit(RealD &arg){ arg=0; };
-
+
 };
 
 #include <simd/Grid_vector_types.h>
 
 namespace Grid {
-
-  // NB: Template the following on "type Complex" and then implement *,+,- for
-  // ComplexF, 
ComplexD, RealF, RealD above to - // get full generality of binops with scalars. - inline void mac (vComplexF *__restrict__ y,const ComplexF *__restrict__ a,const vComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); }; - inline void mult(vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); } - inline void sub (vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); } - inline void add (vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); } - inline void mac (vComplexF *__restrict__ y,const vComplexF *__restrict__ a,const ComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); }; - inline void mult(vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); } - inline void sub (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); } - inline void add (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); } - - inline void mac (vComplexD *__restrict__ y,const ComplexD *__restrict__ a,const vComplexD *__restrict__ x){ *y = (*a)*(*x)+(*y); }; - inline void mult(vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) * (*r); } - inline void sub (vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) - (*r); } - inline void add (vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) + (*r); } - inline void mac (vComplexD *__restrict__ y,const vComplexD *__restrict__ a,const ComplexD *__restrict__ x){ *y = (*a)*(*x)+(*y); }; - inline void mult(vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r); } - inline void sub (vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r); } - inline void add (vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r); } - - inline void mac (vRealF *__restrict__ y,const RealF *__restrict__ a,const vRealF *__restrict__ x){ *y = (*a)*(*x)+(*y); }; - inline void mult(vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) * (*r); } - inline void sub (vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) - (*r); } - inline void add (vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) + (*r); } - inline void mac (vRealF *__restrict__ y,const vRealF *__restrict__ a,const RealF *__restrict__ x){ *y = (*a)*(*x)+(*y); }; - inline void mult(vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); } - inline void sub (vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); } - inline void add (vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); } - - inline void mac (vRealD *__restrict__ y,const RealD *__restrict__ a,const vRealD *__restrict__ x){ *y = (*a)*(*x)+(*y); }; - inline void mult(vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) * (*r); } - inline void sub (vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) - (*r); } - inline void add (vRealD 
*__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) + (*r); }
-  inline void mac (vRealD *__restrict__ y,const vRealD *__restrict__ a,const RealD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
-  inline void mult(vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r); }
-  inline void sub (vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r); }
-  inline void add (vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r); }
-
   // Default precision
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
   typedef vRealD vReal;

diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index 3664e0f7..ae01269f 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -2,7 +2,7 @@
 /*! @file Grid_vector_types.h
   @brief Defines templated class Grid_simd to deal with inner vector types
 */
-// Time-stamp: <2015-05-26 13:22:36 neo>
+// Time-stamp: <2015-05-26 13:44:54 neo>
 //---------------------------------------------------------------------------
 #ifndef GRID_VECTOR_TYPES
 #define GRID_VECTOR_TYPES
@@ -156,6 +156,18 @@ inline void Gpermute(vsimd &y,const vsimd &b,int perm){
     friend inline void sub (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); }
     friend inline void add (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); }
 
+
+    friend inline void mac (Grid_simd *__restrict__ y,const Scalar_type *__restrict__ a,const Grid_simd *__restrict__ x){ *y = (*a)*(*x)+(*y); };
+    friend inline void mult(Grid_simd *__restrict__ y,const Scalar_type *__restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) * (*r); }
+    friend inline void sub (Grid_simd *__restrict__ y,const Scalar_type *__restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); }
+    friend inline void add (Grid_simd *__restrict__ y,const Scalar_type *__restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); }
+    friend inline void mac (Grid_simd *__restrict__ y,const Grid_simd *__restrict__ a,const Scalar_type *__restrict__ x){ *y = (*a)*(*x)+(*y); };
+    friend inline void mult(Grid_simd *__restrict__ y,const Grid_simd *__restrict__ l,const Scalar_type *__restrict__ r){ *y = (*l) * (*r); }
+    friend inline void sub (Grid_simd *__restrict__ y,const Grid_simd *__restrict__ l,const Scalar_type *__restrict__ r){ *y = (*l) - (*r); }
+    friend inline void add (Grid_simd *__restrict__ y,const Grid_simd *__restrict__ l,const Scalar_type *__restrict__ r){ *y = (*l) + (*r); }
+
+
     //not for integer types...
     template < class S = Scalar_type, NotEnableIf<is_complex<S>, int> = 0 >
     friend inline Grid_simd adj(const Grid_simd &in){ return conjugate(in); }
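These friend templates are what retire the thirty-two hand-written
scalar/vector mac/mult/sub/add routines deleted from Grid_simd.h above:
inside the class template, the injected name Grid_simd denotes the current
instantiation and Scalar_type its matching scalar, so each mixed overload is
stamped out automatically for every vector type and found at the call site by
argument-dependent lookup. A minimal standalone model of the mechanism (an
illustrative sketch with a degenerate one-lane "vector", not the Grid class,
whose real bodies use SIMD intrinsics):

    #include <cstdio>

    template <class Scalar_type, class Vector_type>
    class Grid_simd {
    public:
      Vector_type v;
      // One definition, instantiated per Scalar_type/Vector_type pair.
      friend inline void mult(Grid_simd *y, const Scalar_type *l, const Grid_simd *r) {
        y->v = (*l) * r->v;   // scalar * vector
      }
    };

    int main(void) {
      Grid_simd<double, double> x{2.0}, r{0.0};
      double a = 3.0;
      mult(&r, &a, &x);              // resolved by ADL on Grid_simd
      std::printf("%f\n", r.v);      // prints 6.000000
      return 0;
    }

Call sites in Grid are unchanged by the patch; expressions such as
mult(&r,&a,&x) that previously bound to a vComplexF or vRealD overload now
bind to the corresponding friend instantiation.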