From 3d04dc33c6c64f2ed761ef62094519a81668aaee Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Tue, 13 Jun 2017 13:26:59 +0200 Subject: [PATCH 1/8] ARM neon intrinsics support --- configure.ac | 3 + lib/simd/Grid_generic_types.h | 2 +- lib/simd/Grid_neon.h | 460 +++++++++++++++++++++++++++------- lib/simd/Grid_vector_types.h | 44 ++-- 4 files changed, 396 insertions(+), 113 deletions(-) diff --git a/configure.ac b/configure.ac index 62b7545b..20f71128 100644 --- a/configure.ac +++ b/configure.ac @@ -244,6 +244,9 @@ case ${ax_cv_cxx_compiler_vendor} in [generic SIMD vector width (in bytes)]) SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)" SIMD_FLAGS='';; + NEONv8) + AC_DEFINE([NEONV8],[1],[ARMv8 NEON]) + SIMD_FLAGS='';; QPX|BGQ) AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q]) SIMD_FLAGS='';; diff --git a/lib/simd/Grid_generic_types.h b/lib/simd/Grid_generic_types.h index 642f6ffe..eac65e09 100644 --- a/lib/simd/Grid_generic_types.h +++ b/lib/simd/Grid_generic_types.h @@ -26,7 +26,7 @@ Author: Antonin Portelli See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ - +#define GEN_SIMD_WIDTH 16 static_assert(GEN_SIMD_WIDTH % 16u == 0, "SIMD vector size is not an integer multiple of 16 bytes"); //#define VECTOR_LOOPS diff --git a/lib/simd/Grid_neon.h b/lib/simd/Grid_neon.h index 7c1ad443..f3f802e7 100644 --- a/lib/simd/Grid_neon.h +++ b/lib/simd/Grid_neon.h @@ -1,11 +1,12 @@ - /************************************************************************************* +/************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid + Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/simd/Grid_neon.h Copyright (C) 2015 +Author: Nils Meyer Author: Peter Boyle Author: neo @@ -27,18 +28,23 @@ Author: neo *************************************************************************************/ /* END LEGAL */ //---------------------------------------------------------------------- -/*! @file Grid_sse4.h - @brief Optimization libraries for NEON (ARM) instructions set ARMv8 +/* + + ARMv8 NEON intrinsics layer by + + Nils Meyer , + University of Regensburg, Germany + SFB/TRR55 - Experimental - Using intrinsics - DEVELOPING! */ -// Time-stamp: <2015-07-10 17:45:09 neo> //---------------------------------------------------------------------- +//#ifndef ARM_NEON +//#define ARM_NEON +#include "Grid_generic_types.h" #include -// ARMv8 supports double precision - +namespace Grid { namespace Optimization { template @@ -46,16 +52,20 @@ namespace Optimization { float32x4_t f; vtype v; }; - union u128f { float32x4_t v; float f[4]; }; union u128d { float64x2_t v; - double f[4]; + double f[2]; }; - + // half precision + union u128h { + float16x8_t v; + uint16_t f[8]; + }; + struct Vsplat{ //Complex float inline float32x4_t operator()(float a, float b){ @@ -64,31 +74,31 @@ namespace Optimization { } // Real float inline float32x4_t operator()(float a){ - return vld1q_dup_f32(&a); + return vdupq_n_f32(a); } //Complex double - inline float32x4_t operator()(double a, double b){ - float tmp[4]={(float)a,(float)b,(float)a,(float)b}; - return vld1q_f32(tmp); + inline float64x2_t operator()(double a, double b){ + double tmp[2]={a,b}; + return vld1q_f64(tmp); } - //Real double - inline float32x4_t operator()(double a){ - return vld1q_dup_f32(&a); + //Real double // N:tbc + inline float64x2_t operator()(double a){ + return vdupq_n_f64(a); } - //Integer + //Integer // N:tbc inline uint32x4_t operator()(Integer a){ - return vld1q_dup_u32(&a); + return vdupq_n_u32(a); } }; struct Vstore{ - //Float + //Float inline void operator()(float32x4_t a, float* F){ vst1q_f32(F, a); } //Double - inline void operator()(float32x4_t a, double* D){ - vst1q_f32((float*)D, a); + inline void operator()(float64x2_t a, double* D){ + vst1q_f64(D, a); } //Integer inline void operator()(uint32x4_t a, Integer* I){ @@ -97,54 +107,54 @@ namespace Optimization { }; - struct Vstream{ - //Float + struct Vstream{ // N:equivalents to _mm_stream_p* in NEON? + //Float // N:generic inline void operator()(float * a, float32x4_t b){ - + memcpy(a,&b,4*sizeof(float)); } - //Double - inline void operator()(double * a, float32x4_t b){ - + //Double // N:generic + inline void operator()(double * a, float64x2_t b){ + memcpy(a,&b,2*sizeof(double)); } }; + // Nils: Vset untested; not used currently in Grid at all; + // git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b struct Vset{ - // Complex float + // Complex float // N:ok inline float32x4_t operator()(Grid::ComplexF *a){ - float32x4_t foo; - return foo; + float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()}; + return vld1q_f32(tmp); } - // Complex double - inline float32x4_t operator()(Grid::ComplexD *a){ - float32x4_t foo; - return foo; + // Complex double // N:ok + inline float64x2_t operator()(Grid::ComplexD *a){ + double tmp[2]={a[0].imag(),a[0].real()}; + return vld1q_f64(tmp); } - // Real float + // Real float // N:ok inline float32x4_t operator()(float *a){ - float32x4_t foo; - return foo; + float tmp[4]={a[3],a[2],a[1],a[0]}; + return vld1q_f32(tmp); } - // Real double - inline float32x4_t operator()(double *a){ - float32x4_t foo; - return foo; + // Real double // N:ok + inline float64x2_t operator()(double *a){ + double tmp[2]={a[1],a[0]}; + return vld1q_f64(tmp); } - // Integer + // Integer // N:ok inline uint32x4_t operator()(Integer *a){ - uint32x4_t foo; - return foo; + return vld1q_dup_u32(a); } - - }; + // N:leaving as is template struct Reduce{ //Need templated class to overload output type //General form must generate error if compiled - inline Out_type operator()(In_type in){ + inline Out_type operator()(In_type in){ printf("Error, using wrong Reduce function\n"); exit(1); return 0; @@ -184,26 +194,98 @@ namespace Optimization { } }; + struct MultRealPart{ + inline float32x4_t operator()(float32x4_t a, float32x4_t b){ + float32x4_t re = vtrn1q_f32(a, a); + return vmulq_f32(re, b); + } + inline float64x2_t operator()(float64x2_t a, float64x2_t b){ + float64x2_t re = vzip1q_f64(a, a); + return vmulq_f64(re, b); + } + }; + + struct MaddRealPart{ + inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){ + float32x4_t re = vtrn1q_f32(a, a); + return vfmaq_f32(c, re, b); + } + inline float64x2_t operator()(float64x2_t a, float64x2_t b, float64x2_t c){ + float64x2_t re = vzip1q_f64(a, a); + return vfmaq_f64(c, re, b); + } + }; + + struct Div{ + // Real float + inline float32x4_t operator()(float32x4_t a, float32x4_t b){ + return vdivq_f32(a, b); + } + // Real double + inline float64x2_t operator()(float64x2_t a, float64x2_t b){ + return vdivq_f64(a, b); + } + }; + struct MultComplex{ // Complex float inline float32x4_t operator()(float32x4_t a, float32x4_t b){ - float32x4_t foo; - return foo; + + float32x4_t r0, r1, r2, r3, r4; + + // a = ar ai Ar Ai + // b = br bi Br Bi + // collect real/imag part, negate bi and Bi + r0 = vtrn1q_f32(b, b); // br br Br Br + r1 = vnegq_f32(b); // -br -bi -Br -Bi + r2 = vtrn2q_f32(b, r1); // bi -bi Bi -Bi + + // the fun part + r3 = vmulq_f32(r2, a); // bi*ar -bi*ai ... + r4 = vrev64q_f32(r3); // -bi*ai bi*ar ... + + // fma(a,b,c) = a+b*c + return vfmaq_f32(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi ... + + // no fma, use mul and add + //float32x4_t r5; + //r5 = vmulq_f32(r0, a); + //return vaddq_f32(r4, r5); } // Complex double inline float64x2_t operator()(float64x2_t a, float64x2_t b){ - float32x4_t foo; - return foo; + + float64x2_t r0, r1, r2, r3, r4; + + // b = br bi + // collect real/imag part, negate bi + r0 = vtrn1q_f64(b, b); // br br + r1 = vnegq_f64(b); // -br -bi + r2 = vtrn2q_f64(b, r1); // bi -bi + + // the fun part + r3 = vmulq_f64(r2, a); // bi*ar -bi*ai + r4 = vextq_f64(r3,r3,1); // -bi*ai bi*ar + + // fma(a,b,c) = a+b*c + return vfmaq_f64(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi + + // no fma, use mul and add + //float64x2_t r5; + //r5 = vmulq_f64(r0, a); + //return vaddq_f64(r4, r5); } }; struct Mult{ // Real float inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){ - return vaddq_f32(vmulq_f32(b,c),a); + //return vaddq_f32(vmulq_f32(b,c),a); + return vfmaq_f32(a, b, c); } inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){ - return vaddq_f64(vmulq_f64(b,c),a); + //return vaddq_f64(vmulq_f64(b,c),a); + return vfmaq_f64(a, b, c); } inline float32x4_t operator()(float32x4_t a, float32x4_t b){ return vmulq_f32(a,b); @@ -221,89 +303,275 @@ namespace Optimization { struct Conj{ // Complex single inline float32x4_t operator()(float32x4_t in){ - return in; + // ar ai br bi -> ar -ai br -bi + float32x4_t r0, r1; + r0 = vnegq_f32(in); // -ar -ai -br -bi + r1 = vrev64q_f32(r0); // -ai -ar -bi -br + return vtrn1q_f32(in, r1); // ar -ai br -bi } // Complex double - //inline float32x4_t operator()(float32x4_t in){ - // return 0; - //} + inline float64x2_t operator()(float64x2_t in){ + + float64x2_t r0, r1; + r0 = vextq_f64(in, in, 1); // ai ar + r1 = vnegq_f64(r0); // -ai -ar + return vextq_f64(r0, r1, 1); // ar -ai + } // do not define for integer input }; struct TimesMinusI{ //Complex single inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ - return in; + // ar ai br bi -> ai -ar ai -br + float32x4_t r0, r1; + r0 = vnegq_f32(in); // -ar -ai -br -bi + r1 = vrev64q_f32(in); // ai ar bi br + return vtrn1q_f32(r1, r0); // ar -ai br -bi } //Complex double - //inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ - // return in; - //} - - + inline float64x2_t operator()(float64x2_t in, float64x2_t ret){ + // a ib -> b -ia + float64x2_t tmp; + tmp = vnegq_f64(in); + return vextq_f64(in, tmp, 1); + } }; struct TimesI{ //Complex single inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ - //need shuffle - return in; + // ar ai br bi -> -ai ar -bi br + float32x4_t r0, r1; + r0 = vnegq_f32(in); // -ar -ai -br -bi + r1 = vrev64q_f32(r0); // -ai -ar -bi -br + return vtrn1q_f32(r1, in); // -ai ar -bi br } //Complex double - //inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ - // return 0; - //} + inline float64x2_t operator()(float64x2_t in, float64x2_t ret){ + // a ib -> -b ia + float64x2_t tmp; + tmp = vnegq_f64(in); + return vextq_f64(tmp, in, 1); + } + }; + + struct Permute{ + + static inline float32x4_t Permute0(float32x4_t in){ // N:ok + // AB CD -> CD AB + return vextq_f32(in, in, 2); + }; + static inline float32x4_t Permute1(float32x4_t in){ // N:ok + // AB CD -> BA DC + return vrev64q_f32(in); + }; + static inline float32x4_t Permute2(float32x4_t in){ // N:not used by Boyle + return in; + }; + static inline float32x4_t Permute3(float32x4_t in){ // N:not used by Boyle + return in; + }; + + static inline float64x2_t Permute0(float64x2_t in){ // N:ok + // AB -> BA + return vextq_f64(in, in, 1); + }; + static inline float64x2_t Permute1(float64x2_t in){ // N:not used by Boyle + return in; + }; + static inline float64x2_t Permute2(float64x2_t in){ // N:not used by Boyle + return in; + }; + static inline float64x2_t Permute3(float64x2_t in){ // N:not used by Boyle + return in; + }; + + }; + + struct Rotate{ + + static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok + switch(n){ + case 0: // AB CD -> AB CD + return tRotate<0>(in); + break; + case 1: // AB CD -> BC DA + return tRotate<1>(in); + break; + case 2: // AB CD -> CD AB + return tRotate<2>(in); + break; + case 3: // AB CD -> DA BC + return tRotate<3>(in); + break; + default: assert(0); + } + } + static inline float64x2_t rotate(float64x2_t in,int n){ // N:ok + switch(n){ + case 0: // AB -> AB + return tRotate<0>(in); + break; + case 1: // AB -> BA + return tRotate<1>(in); + break; + default: assert(0); + } + } + +// working, but no restriction on n +// template static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n); }; +// template static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n); }; + +// restriction on n + template static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); }; + template static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); }; + + }; + + struct PrecisionChange { + + static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) { + float16x4_t h = vcvt_f16_f32(a); + return vcvt_high_f16_f32(h, b); + } + static inline void HtoS (float16x8_t h,float32x4_t &sa,float32x4_t &sb) { + sb = vcvt_high_f32_f16(h); + // there is no direct conversion from lower float32x4_t to float64x2_t + // vextq_f16 not supported by clang 3.8 / 4.0 / arm clang + //float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang + // workaround for clang + uint32x4_t h1u = reinterpret_cast(h); + float16x8_t h1 = reinterpret_cast(vextq_u32(h1u, h1u, 2)); + sa = vcvt_high_f32_f16(h1); + } + static inline float32x4_t DtoS (float64x2_t a,float64x2_t b) { + float32x2_t s = vcvt_f32_f64(a); + return vcvt_high_f32_f64(s, b); + + } + static inline void StoD (float32x4_t s,float64x2_t &a,float64x2_t &b) { + b = vcvt_high_f64_f32(s); + // there is no direct conversion from lower float32x4_t to float64x2_t + float32x4_t s1 = vextq_f32(s, s, 2); + a = vcvt_high_f64_f32(s1); + + } + static inline float16x8_t DtoH (float64x2_t a,float64x2_t b,float64x2_t c,float64x2_t d) { + float32x4_t s1 = DtoS(a, b); + float32x4_t s2 = DtoS(c, d); + return StoH(s1, s2); + } + static inline void HtoD (float16x8_t h,float64x2_t &a,float64x2_t &b,float64x2_t &c,float64x2_t &d) { + float32x4_t s1, s2; + HtoS(h, s1, s2); + StoD(s1, a, b); + StoD(s2, c, d); + } + }; + + ////////////////////////////////////////////// + // Exchange support + + struct Exchange{ + static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ + // in1: ABCD -> out1: ABEF + // in2: EFGH -> out2: CDGH + + // z: CDAB + float32x4_t z = vextq_f32(in1, in1, 2); + // out1: ABEF + out1 = vextq_f32(z, in2, 2); + + // z: GHEF + z = vextq_f32(in2, in2, 2); + // out2: CDGH + out2 = vextq_f32(in1, z, 2); + }; + + static inline void Exchange1(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ + // in1: ABCD -> out1: AECG + // in2: EFGH -> out2: BFDH + out1 = vtrn1q_f32(in1, in2); + out2 = vtrn2q_f32(in1, in2); + }; + static inline void Exchange2(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ + assert(0); + return; + }; + static inline void Exchange3(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ + assert(0); + return; + }; + // double precision + static inline void Exchange0(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){ + // in1: AB -> out1: AC + // in2: CD -> out2: BD + out1 = vzip1q_f64(in1, in2); + out2 = vzip2q_f64(in1, in2); + }; + static inline void Exchange1(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){ + assert(0); + return; + }; + static inline void Exchange2(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){ + assert(0); + return; + }; + static inline void Exchange3(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){ + assert(0); + return; + }; }; ////////////////////////////////////////////// // Some Template specialization - template < typename vtype > - void permute(vtype &a, vtype b, int perm) { - }; //Complex float Reduce template<> inline Grid::ComplexF Reduce::operator()(float32x4_t in){ - return 0; + float32x4_t v1; // two complex + v1 = Optimization::Permute::Permute0(in); + v1 = vaddq_f32(v1,in); + u128f conv; conv.v=v1; + return Grid::ComplexF(conv.f[0],conv.f[1]); } //Real float Reduce template<> inline Grid::RealF Reduce::operator()(float32x4_t in){ - float32x2_t high = vget_high_f32(in); - float32x2_t low = vget_low_f32(in); - float32x2_t tmp = vadd_f32(low, high); - float32x2_t sum = vpadd_f32(tmp, tmp); - return vget_lane_f32(sum,0); + return vaddvq_f32(in); } - - + + //Complex double Reduce - template<> + template<> // N:by Boyle inline Grid::ComplexD Reduce::operator()(float64x2_t in){ - return 0; + u128d conv; conv.v = in; + return Grid::ComplexD(conv.f[0],conv.f[1]); } - + //Real double Reduce template<> inline Grid::RealD Reduce::operator()(float64x2_t in){ - float64x2_t sum = vpaddq_f64(in, in); - return vgetq_lane_f64(sum,0); + return vaddvq_f64(in); } //Integer Reduce template<> inline Integer Reduce::operator()(uint32x4_t in){ // FIXME unimplemented - printf("Reduce : Missing integer implementation -> FIX\n"); + printf("Reduce : Missing integer implementation -> FIX\n"); assert(0); } } ////////////////////////////////////////////////////////////////////////////////////// -// Here assign types -namespace Grid { +// Here assign types +// typedef Optimization::vech SIMD_Htype; // Reduced precision type + typedef float16x8_t SIMD_Htype; // Half precision type typedef float32x4_t SIMD_Ftype; // Single precision type typedef float64x2_t SIMD_Dtype; // Double precision type typedef uint32x4_t SIMD_Itype; // Integer type @@ -312,13 +580,6 @@ namespace Grid { inline void prefetch_HINT_T0(const char *ptr){}; - // Gpermute function - template < typename VectorSIMD > - inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) { - Optimization::permute(y.v,b.v,perm); - } - - // Function name aliases typedef Optimization::Vsplat VsplatSIMD; typedef Optimization::Vstore VstoreSIMD; @@ -326,16 +587,21 @@ namespace Grid { typedef Optimization::Vstream VstreamSIMD; template using ReduceSIMD = Optimization::Reduce; - + // Arithmetic operations typedef Optimization::Sum SumSIMD; typedef Optimization::Sub SubSIMD; + typedef Optimization::Div DivSIMD; typedef Optimization::Mult MultSIMD; typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::MultRealPart MultRealPartSIMD; + typedef Optimization::MaddRealPart MaddRealPartSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesI TimesISIMD; } + +//#endif // ARM_NEON diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 0048382f..424b5573 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -328,15 +328,15 @@ class Grid_simd { /////////////////////////////////////// //#if (__GNUC__ == 5 ) || ( ( __GNUC__ == 6 ) && __GNUC_MINOR__ < 3 ) - //#pragma GCC push_options - //#pragma GCC optimize ("O0") + //#pragma GCC push_options + //#pragma GCC optimize ("O0") //#endif template friend inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) { Grid_simd ret; Grid_simd::conv_t conv; Grid_simd::scalar_type s; - + conv.v = v.v; for (int i = 0; i < Nsimd(); i++) { s = conv.s[i]; @@ -368,7 +368,7 @@ class Grid_simd { //#pragma GCC pop_options //#endif /////////////////////// - // Exchange + // Exchange // Al Ah , Bl Bh -> Al Bl Ah,Bh /////////////////////// friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n) @@ -379,7 +379,7 @@ class Grid_simd { Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v); } else if(n==1) { Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v); - } else if(n==0) { + } else if(n==0) { Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v); } } @@ -406,7 +406,7 @@ class Grid_simd { int dist = perm & 0xF; y = rotate(b, dist); return; - } + } else if(perm==3) permute3(y, b); else if(perm==2) permute2(y, b); else if(perm==1) permute1(y, b); @@ -425,7 +425,7 @@ class Grid_simd { } - + }; // end of Grid_simd class definition @@ -451,29 +451,29 @@ inline Grid_simd rotate(Grid_simd b, int nrot) { ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot); return ret; } -template =0> +template =0> inline void rotate( Grid_simd &ret,Grid_simd b,int nrot) { nrot = nrot % Grid_simd::Nsimd(); ret.v = Optimization::Rotate::rotate(b.v,nrot); } -template =0> +template =0> inline void rotate(Grid_simd &ret,Grid_simd b,int nrot) { nrot = nrot % Grid_simd::Nsimd(); ret.v = Optimization::Rotate::rotate(b.v,2*nrot); } -template +template inline void vbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ S* typepun =(S*) &src; vsplat(ret,typepun[lane]); -} -template =0> +} +template =0> inline void rbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ S* typepun =(S*) &src; ret.v = unary(real(typepun[lane]), VsplatSIMD()); -} +} @@ -604,13 +604,27 @@ inline Grid_simd real_mult(Grid_simd a, Grid_simd b) { ret.v = binary(a.v, b.v, MultRealPartSIMD()); return ret; }; +// TEST for Test_simd +template = 0> +inline Grid_simd real_mult(std::complex a, std::complex b) { + Grid_simd ret; + //ret.v = binary(a.v, b.v, MultRealPartSIMD()); + return ret; +}; + template = 0> inline Grid_simd real_madd(Grid_simd a, Grid_simd b, Grid_simd c) { Grid_simd ret; ret.v = trinary(a.v, b.v, c.v, MaddRealPartSIMD()); return ret; }; - +// TEST for Test_simd +template = 0> +inline Grid_simd real_madd(std::complex a, std::complex b) { + Grid_simd ret; + //ret.v = binary(a.v, b.v, MultRealPartSIMD()); + return ret; +}; // Distinguish between complex types and others template = 0> @@ -640,7 +654,7 @@ inline Grid_simd operator/(Grid_simd a, Grid_simd b) { ret = a * conjugate(b) ; den = b * conjugate(b) ; - + auto real_den = toReal(den); ret.v=binary(ret.v, real_den.v, DivSIMD()); From bf729766ddd36c9ebe8c6be35d7527353aff2963 Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Tue, 27 Jun 2017 20:32:24 +0200 Subject: [PATCH 2/8] removed collision with QPX implementation --- lib/simd/Grid_generic_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/simd/Grid_generic_types.h b/lib/simd/Grid_generic_types.h index eac65e09..642f6ffe 100644 --- a/lib/simd/Grid_generic_types.h +++ b/lib/simd/Grid_generic_types.h @@ -26,7 +26,7 @@ Author: Antonin Portelli See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ -#define GEN_SIMD_WIDTH 16 + static_assert(GEN_SIMD_WIDTH % 16u == 0, "SIMD vector size is not an integer multiple of 16 bytes"); //#define VECTOR_LOOPS From e43a8b6b8ad52b88ed8e9ae1c140dc34a3da18d4 Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Tue, 27 Jun 2017 20:58:48 +0200 Subject: [PATCH 3/8] removed comments --- Grid_vector_types.h | 857 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 857 insertions(+) create mode 100644 Grid_vector_types.h diff --git a/Grid_vector_types.h b/Grid_vector_types.h new file mode 100644 index 00000000..e05fecc4 --- /dev/null +++ b/Grid_vector_types.h @@ -0,0 +1,857 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/simd/Grid_vector_type.h + +Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Guido Cossu +Author: Peter Boyle +Author: neo + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +//--------------------------------------------------------------------------- +/*! @file Grid_vector_types.h + @brief Defines templated class Grid_simd to deal with inner vector types +*/ +// Time-stamp: <2015-07-10 17:45:33 neo> +//--------------------------------------------------------------------------- +#ifndef GRID_VECTOR_TYPES +#define GRID_VECTOR_TYPES + +#ifdef GEN +#include "Grid_generic.h" +#endif +#ifdef SSE4 +#include "Grid_sse4.h" +#endif +#if defined(AVX1) || defined (AVXFMA) || defined(AVX2) || defined(AVXFMA4) +#include "Grid_avx.h" +#endif +#if defined AVX512 +#include "Grid_avx512.h" +#endif +#if defined IMCI +#include "Grid_imci.h" +#endif +#ifdef NEONv8 +#include "Grid_neon.h" +#endif +#if defined QPX +#include "Grid_qpx.h" +#endif + +#include "l1p.h" + +namespace Grid { + +////////////////////////////////////// +// To take the floating point type of real/complex type +////////////////////////////////////// +template +struct RealPart { + typedef T type; +}; +template +struct RealPart > { + typedef T type; +}; + +#include + +////////////////////////////////////// +// demote a vector to real type +////////////////////////////////////// +// type alias used to simplify the syntax of std::enable_if +template using Invoke = typename T::type; +template using EnableIf = Invoke >; +template using NotEnableIf = Invoke >; + +//////////////////////////////////////////////////////// +// Check for complexity with type traits +template struct is_complex : public std::false_type {}; +template <> struct is_complex > : public std::true_type {}; +template <> struct is_complex > : public std::true_type {}; + +template using IfReal = Invoke::value, int> >; +template using IfComplex = Invoke::value, int> >; +template using IfInteger = Invoke::value, int> >; +template using IfSame = Invoke::value, int> >; + +template using IfNotReal = Invoke::value, int> >; +template using IfNotComplex = Invoke::value, int> >; +template using IfNotInteger = Invoke::value, int> >; +template using IfNotSame = Invoke::value, int> >; + +//////////////////////////////////////////////////////// +// Define the operation templates functors +// general forms to allow for vsplat syntax +// need explicit declaration of types when used since +// clang cannot automatically determine the output type sometimes +template +Out trinary(Input1 src_1, Input2 src_2, Input3 src_3, Operation op) { + return op(src_1, src_2, src_3); +} + +template +Out binary(Input1 src_1, Input2 src_2, Operation op) { + return op(src_1, src_2); +} + +template +Out unary(Input src, Operation op) { + return op(src); +} +/////////////////////////////////////////////// + +/* + @brief Grid_simd class for the SIMD vector type operations + */ +template +class Grid_simd { + public: + typedef typename RealPart::type Real; + typedef Vector_type vector_type; + typedef Scalar_type scalar_type; + + typedef union conv_t_union { + Vector_type v; + Scalar_type s[sizeof(Vector_type) / sizeof(Scalar_type)]; + conv_t_union(){}; + } conv_t; + + Vector_type v; + + static inline constexpr int Nsimd(void) { + return sizeof(Vector_type) / sizeof(Scalar_type); + } + + Grid_simd &operator=(const Grid_simd &&rhs) { + v = rhs.v; + return *this; + }; + Grid_simd &operator=(const Grid_simd &rhs) { + v = rhs.v; + return *this; + }; // faster than not declaring it and leaving to the compiler + Grid_simd() = default; + Grid_simd(const Grid_simd &rhs) : v(rhs.v){}; // compiles in movaps + Grid_simd(const Grid_simd &&rhs) : v(rhs.v){}; + + ///////////////////////////// + // Constructors + ///////////////////////////// + Grid_simd &operator=(Zero &z) { + vzero(*this); + return (*this); + } + + // Enable if complex type + template + Grid_simd(const typename std::enable_if::value, S>::type a) { + vsplat(*this, a); + }; + + Grid_simd(const Real a) { vsplat(*this, Scalar_type(a)); }; + + /////////////////////////////////////////////// + // mac, mult, sub, add, adj + /////////////////////////////////////////////// + + // FIXME -- alias this to an inline MAC struct. + friend inline void mac(Grid_simd *__restrict__ y, + const Grid_simd *__restrict__ a, + const Grid_simd *__restrict__ x) { + *y = (*a) * (*x) + (*y); + }; + + friend inline void mult(Grid_simd *__restrict__ y, + const Grid_simd *__restrict__ l, + const Grid_simd *__restrict__ r) { + *y = (*l) * (*r); + } + + friend inline void sub(Grid_simd *__restrict__ y, + const Grid_simd *__restrict__ l, + const Grid_simd *__restrict__ r) { + *y = (*l) - (*r); + } + friend inline void add(Grid_simd *__restrict__ y, + const Grid_simd *__restrict__ l, + const Grid_simd *__restrict__ r) { + *y = (*l) + (*r); + } + friend inline void mac(Grid_simd *__restrict__ y, + const Scalar_type *__restrict__ a, + const Grid_simd *__restrict__ x) { + *y = (*a) * (*x) + (*y); + }; + friend inline void mult(Grid_simd *__restrict__ y, + const Scalar_type *__restrict__ l, + const Grid_simd *__restrict__ r) { + *y = (*l) * (*r); + } + friend inline void sub(Grid_simd *__restrict__ y, + const Scalar_type *__restrict__ l, + const Grid_simd *__restrict__ r) { + *y = (*l) - (*r); + } + friend inline void add(Grid_simd *__restrict__ y, + const Scalar_type *__restrict__ l, + const Grid_simd *__restrict__ r) { + *y = (*l) + (*r); + } + + friend inline void mac(Grid_simd *__restrict__ y, + const Grid_simd *__restrict__ a, + const Scalar_type *__restrict__ x) { + *y = (*a) * (*x) + (*y); + }; + friend inline void mult(Grid_simd *__restrict__ y, + const Grid_simd *__restrict__ l, + const Scalar_type *__restrict__ r) { + *y = (*l) * (*r); + } + friend inline void sub(Grid_simd *__restrict__ y, + const Grid_simd *__restrict__ l, + const Scalar_type *__restrict__ r) { + *y = (*l) - (*r); + } + friend inline void add(Grid_simd *__restrict__ y, + const Grid_simd *__restrict__ l, + const Scalar_type *__restrict__ r) { + *y = (*l) + (*r); + } + + //////////////////////////////////////////////////////////////////////// + // FIXME: gonna remove these load/store, get, set, prefetch + //////////////////////////////////////////////////////////////////////// + friend inline void vset(Grid_simd &ret, Scalar_type *a) { + ret.v = unary(a, VsetSIMD()); + } + + /////////////////////// + // Vstore + /////////////////////// + friend inline void vstore(const Grid_simd &ret, Scalar_type *a) { + binary(ret.v, (Real *)a, VstoreSIMD()); + } + + /////////////////////// + // Vprefetch + /////////////////////// + friend inline void vprefetch(const Grid_simd &v) { + prefetch_HINT_T0((const char *)&v.v); + } + + /////////////////////// + // Reduce + /////////////////////// + friend inline Scalar_type Reduce(const Grid_simd &in) { + return unary(in.v, ReduceSIMD()); + } + + //////////////////////////// + // operator scalar * simd + //////////////////////////// + friend inline Grid_simd operator*(const Scalar_type &a, Grid_simd b) { + Grid_simd va; + vsplat(va, a); + return va * b; + } + friend inline Grid_simd operator*(Grid_simd b, const Scalar_type &a) { + return a * b; + } + + ////////////////////////////////// + // Divides + ////////////////////////////////// + friend inline Grid_simd operator/(const Scalar_type &a, Grid_simd b) { + Grid_simd va; + vsplat(va, a); + return va / b; + } + friend inline Grid_simd operator/(Grid_simd b, const Scalar_type &a) { + Grid_simd va; + vsplat(va, a); + return b / a; + } + + /////////////////////// + // Unary negation + /////////////////////// + friend inline Grid_simd operator-(const Grid_simd &r) { + Grid_simd ret; + vzero(ret); + ret = ret - r; + return ret; + } + // *=,+=,-= operators + inline Grid_simd &operator*=(const Grid_simd &r) { + *this = (*this) * r; + return *this; + // return (*this)*r; ? + } + inline Grid_simd &operator+=(const Grid_simd &r) { + *this = *this + r; + return *this; + } + inline Grid_simd &operator-=(const Grid_simd &r) { + *this = *this - r; + return *this; + } + + /////////////////////////////////////// + // Not all functions are supported + // through SIMD and must breakout to + // scalar type and back again. This + // provides support + /////////////////////////////////////// + + template + friend inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) { + Grid_simd ret; + Grid_simd::conv_t conv; + Grid_simd::scalar_type s; + + conv.v = v.v; + for (int i = 0; i < Nsimd(); i++) { + s = conv.s[i]; + conv.s[i] = func(s); + } + ret.v = conv.v; + return ret; + } + template + friend inline Grid_simd SimdApplyBinop(const functor &func, + const Grid_simd &x, + const Grid_simd &y) { + Grid_simd ret; + Grid_simd::conv_t cx; + Grid_simd::conv_t cy; + Grid_simd::scalar_type sx,sy; + + cx.v = x.v; + cy.v = y.v; + for (int i = 0; i < Nsimd(); i++) { + sx = cx.s[i]; + sy = cy.s[i]; + cx.s[i] = func(sx,sy); + } + ret.v = cx.v; + return ret; + } + /////////////////////// + // Exchange + // Al Ah , Bl Bh -> Al Bl Ah,Bh + /////////////////////// + friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n) + { + if (n==3) { + Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v); + } else if(n==2) { + Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v); + } else if(n==1) { + Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v); + } else if(n==0) { + Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v); + } + } + + //////////////////////////////////////////////////////////////////// + // General permute; assumes vector length is same across + // all subtypes; may not be a good assumption, but could + // add the vector width as a template param for BG/Q for example + //////////////////////////////////////////////////////////////////// + friend inline void permute0(Grid_simd &y, Grid_simd b) { + y.v = Optimization::Permute::Permute0(b.v); + } + friend inline void permute1(Grid_simd &y, Grid_simd b) { + y.v = Optimization::Permute::Permute1(b.v); + } + friend inline void permute2(Grid_simd &y, Grid_simd b) { + y.v = Optimization::Permute::Permute2(b.v); + } + friend inline void permute3(Grid_simd &y, Grid_simd b) { + y.v = Optimization::Permute::Permute3(b.v); + } + friend inline void permute(Grid_simd &y, Grid_simd b, int perm) { + if (perm & RotateBit) { + int dist = perm & 0xF; + y = rotate(b, dist); + return; + } + else if(perm==3) permute3(y, b); + else if(perm==2) permute2(y, b); + else if(perm==1) permute1(y, b); + else if(perm==0) permute0(y, b); + } + + /////////////////////////////// + // Getting single lanes + /////////////////////////////// + inline Scalar_type getlane(int lane) { + return ((Scalar_type*)&v)[lane]; + } + + inline void putlane(const Scalar_type &S, int lane){ + ((Scalar_type*)&v)[lane] = S; + } + + + +}; // end of Grid_simd class definition + +inline void permute(ComplexD &y,ComplexD b, int perm) { y=b; } +inline void permute(ComplexF &y,ComplexF b, int perm) { y=b; } +inline void permute(RealD &y,RealD b, int perm) { y=b; } +inline void permute(RealF &y,RealF b, int perm) { y=b; } + +//////////////////////////////////////////////////////////////////// +// General rotate +//////////////////////////////////////////////////////////////////// +template = 0> +inline Grid_simd rotate(Grid_simd b, int nrot) { + nrot = nrot % Grid_simd::Nsimd(); + Grid_simd ret; + ret.v = Optimization::Rotate::rotate(b.v, nrot); + return ret; +} +template = 0> +inline Grid_simd rotate(Grid_simd b, int nrot) { + nrot = nrot % Grid_simd::Nsimd(); + Grid_simd ret; + ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot); + return ret; +} +template =0> +inline void rotate( Grid_simd &ret,Grid_simd b,int nrot) +{ + nrot = nrot % Grid_simd::Nsimd(); + ret.v = Optimization::Rotate::rotate(b.v,nrot); +} +template =0> +inline void rotate(Grid_simd &ret,Grid_simd b,int nrot) +{ + nrot = nrot % Grid_simd::Nsimd(); + ret.v = Optimization::Rotate::rotate(b.v,2*nrot); +} + +template +inline void vbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ + S* typepun =(S*) &src; + vsplat(ret,typepun[lane]); +} +template =0> +inline void rbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ + S* typepun =(S*) &src; + ret.v = unary(real(typepun[lane]), VsplatSIMD()); +} + + + +/////////////////////// +// Splat +/////////////////////// + +// this is only for the complex version +template = 0, class ABtype> +inline void vsplat(Grid_simd &ret, ABtype a, ABtype b) { + ret.v = binary(a, b, VsplatSIMD()); +} + +// overload if complex +template +inline void vsplat(Grid_simd &ret, EnableIf, S> c) { + vsplat(ret, real(c), imag(c)); +} +template +inline void rsplat(Grid_simd &ret, EnableIf, S> c) { + vsplat(ret, real(c), real(c)); +} + +// if real fill with a, if complex fill with a in the real part (first function +// above) +template +inline void vsplat(Grid_simd &ret, NotEnableIf, S> a) { + ret.v = unary(a, VsplatSIMD()); +} +////////////////////////// + +/////////////////////////////////////////////// +// Initialise to 1,0,i for the correct types +/////////////////////////////////////////////// +// For complex types +template = 0> +inline void vone(Grid_simd &ret) { + vsplat(ret, S(1.0, 0.0)); +} +template = 0> +inline void vzero(Grid_simd &ret) { + vsplat(ret, S(0.0, 0.0)); +} // use xor? +template = 0> +inline void vcomplex_i(Grid_simd &ret) { + vsplat(ret, S(0.0, 1.0)); +} + +template = 0> +inline void visign(Grid_simd &ret) { + vsplat(ret, S(1.0, -1.0)); +} +template = 0> +inline void vrsign(Grid_simd &ret) { + vsplat(ret, S(-1.0, 1.0)); +} + +// if not complex overload here +template = 0> +inline void vone(Grid_simd &ret) { + vsplat(ret, S(1.0)); +} +template = 0> +inline void vzero(Grid_simd &ret) { + vsplat(ret, S(0.0)); +} + +// For integral types +template = 0> +inline void vone(Grid_simd &ret) { + vsplat(ret, 1); +} +template = 0> +inline void vzero(Grid_simd &ret) { + vsplat(ret, 0); +} +template = 0> +inline void vtrue(Grid_simd &ret) { + vsplat(ret, 0xFFFFFFFF); +} +template = 0> +inline void vfalse(Grid_simd &ret) { + vsplat(ret, 0); +} +template +inline void zeroit(Grid_simd &z) { + vzero(z); +} + +/////////////////////// +// Vstream +/////////////////////// +template = 0> +inline void vstream(Grid_simd &out, const Grid_simd &in) { + binary((S *)&out.v, in.v, VstreamSIMD()); +} +template = 0> +inline void vstream(Grid_simd &out, const Grid_simd &in) { + typedef typename S::value_type T; + binary((T *)&out.v, in.v, VstreamSIMD()); +} +template = 0> +inline void vstream(Grid_simd &out, const Grid_simd &in) { + out = in; +} + +//////////////////////////////////// +// Arithmetic operator overloads +,-,* +//////////////////////////////////// +template +inline Grid_simd operator+(Grid_simd a, Grid_simd b) { + Grid_simd ret; + ret.v = binary(a.v, b.v, SumSIMD()); + return ret; +}; + +template +inline Grid_simd operator-(Grid_simd a, Grid_simd b) { + Grid_simd ret; + ret.v = binary(a.v, b.v, SubSIMD()); + return ret; +}; + +// Distinguish between complex types and others +template = 0> +inline Grid_simd real_mult(Grid_simd a, Grid_simd b) { + Grid_simd ret; + ret.v = binary(a.v, b.v, MultRealPartSIMD()); + return ret; +}; +template = 0> +inline Grid_simd real_madd(Grid_simd a, Grid_simd b, Grid_simd c) { + Grid_simd ret; + ret.v = trinary(a.v, b.v, c.v, MaddRealPartSIMD()); + return ret; +}; + + +// Distinguish between complex types and others +template = 0> +inline Grid_simd operator*(Grid_simd a, Grid_simd b) { + Grid_simd ret; + ret.v = binary(a.v, b.v, MultComplexSIMD()); + return ret; +}; + +// Real/Integer types +template = 0> +inline Grid_simd operator*(Grid_simd a, Grid_simd b) { + Grid_simd ret; + ret.v = binary(a.v, b.v, MultSIMD()); + return ret; +}; + +// Distinguish between complex types and others +template = 0> +inline Grid_simd operator/(Grid_simd a, Grid_simd b) { + typedef Grid_simd simd; + + simd ret; + simd den; + typename simd::conv_t conv; + + ret = a * conjugate(b) ; + den = b * conjugate(b) ; + + + auto real_den = toReal(den); + + ret.v=binary(ret.v, real_den.v, DivSIMD()); + + return ret; +}; + +// Real/Integer types +template = 0> +inline Grid_simd operator/(Grid_simd a, Grid_simd b) { + Grid_simd ret; + ret.v = binary(a.v, b.v, DivSIMD()); + return ret; +}; + +/////////////////////// +// Conjugate +/////////////////////// +template = 0> +inline Grid_simd conjugate(const Grid_simd &in) { + Grid_simd ret; + ret.v = unary(in.v, ConjSIMD()); + return ret; +} +template = 0> +inline Grid_simd conjugate(const Grid_simd &in) { + return in; // for real objects +} +// Suppress adj for integer types... // odd; why conjugate above but not adj?? +template = 0> +inline Grid_simd adj(const Grid_simd &in) { + return conjugate(in); +} + +/////////////////////// +// timesMinusI +/////////////////////// +template = 0> +inline void timesMinusI(Grid_simd &ret, const Grid_simd &in) { + ret.v = binary(in.v, ret.v, TimesMinusISIMD()); +} +template = 0> +inline Grid_simd timesMinusI(const Grid_simd &in) { + Grid_simd ret; + timesMinusI(ret, in); + return ret; +} +template = 0> +inline Grid_simd timesMinusI(const Grid_simd &in) { + return in; +} + +/////////////////////// +// timesI +/////////////////////// +template = 0> +inline void timesI(Grid_simd &ret, const Grid_simd &in) { + ret.v = binary(in.v, ret.v, TimesISIMD()); +} +template = 0> +inline Grid_simd timesI(const Grid_simd &in) { + Grid_simd ret; + timesI(ret, in); + return ret; +} +template = 0> +inline Grid_simd timesI(const Grid_simd &in) { + return in; +} + +///////////////////// +// Inner, outer +///////////////////// + +template +inline Grid_simd innerProduct(const Grid_simd &l, + const Grid_simd &r) { + return conjugate(l) * r; +} +template +inline Grid_simd outerProduct(const Grid_simd &l, + const Grid_simd &r) { + return l * conjugate(r); +} + +template +inline Grid_simd trace(const Grid_simd &arg) { + return arg; +} + +//////////////////////////////////////////////////////////// +// copy/splat complex real parts into real; +// insert real into complex and zero imag; +//////////////////////////////////////////////////////////// + +// real = toReal( complex ) +template = 0> +inline Grid_simd toReal(const Grid_simd, V> &in) { + typedef Grid_simd simd; + simd ret; + typename simd::conv_t conv; + conv.v = in.v; // copy the vector content (bytewise) + for (int i = 0; i < simd::Nsimd(); i += 2) { + conv.s[i + 1] = conv.s[i]; // duplicate (r,r);(r,r);(r,r); etc... + } + ret.v = conv.v; + return ret; +} + +// complex = toComplex( real ) +template = 0> // must be a real arg +inline Grid_simd, V> toComplex(const Grid_simd &in) { + typedef Grid_simd Rsimd; + typedef Grid_simd, V> Csimd; + typename Rsimd::conv_t conv; // address as real + + conv.v = in.v; + for (int i = 0; i < Rsimd::Nsimd(); i += 2) { + assert(conv.s[i + 1] == conv.s[i]); + // trap any cases where real was not duplicated + // indicating the SIMD grids of real and imag assignment did not correctly + // match + conv.s[i + 1] = 0.0; // zero imaginary parts + } + Csimd ret; + ret.v = conv.v; + return ret; +} + +/////////////////////////////// +// Define available types +/////////////////////////////// +typedef Grid_simd vRealF; +typedef Grid_simd vRealD; +typedef Grid_simd, SIMD_Ftype> vComplexF; +typedef Grid_simd, SIMD_Dtype> vComplexD; +typedef Grid_simd vInteger; + +// Half precision; no arithmetic support +typedef Grid_simd vRealH; +typedef Grid_simd, SIMD_Htype> vComplexH; + +inline void precisionChange(vRealF *out,vRealD *in,int nvec) +{ + assert((nvec&0x1)==0); + for(int m=0;m*2 +struct is_simd : public std::false_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; + +template using IfSimd = Invoke::value, int> >; +template using IfNotSimd = Invoke::value, unsigned> >; +} + +#endif From a9c816a26883056c367ea5972bca41fc1b24b3a3 Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Tue, 27 Jun 2017 21:39:15 +0200 Subject: [PATCH 4/8] moved file to correct folder --- Grid_vector_types.h | 857 ----------------------------------- lib/simd/Grid_vector_types.h | 75 +-- 2 files changed, 22 insertions(+), 910 deletions(-) delete mode 100644 Grid_vector_types.h diff --git a/Grid_vector_types.h b/Grid_vector_types.h deleted file mode 100644 index e05fecc4..00000000 --- a/Grid_vector_types.h +++ /dev/null @@ -1,857 +0,0 @@ -/************************************************************************************* - -Grid physics library, www.github.com/paboyle/Grid - -Source file: ./lib/simd/Grid_vector_type.h - -Copyright (C) 2015 - -Author: Azusa Yamaguchi -Author: Guido Cossu -Author: Peter Boyle -Author: neo - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along -with this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - -See the full license in the file "LICENSE" in the top level distribution -directory -*************************************************************************************/ -/* END LEGAL */ -//--------------------------------------------------------------------------- -/*! @file Grid_vector_types.h - @brief Defines templated class Grid_simd to deal with inner vector types -*/ -// Time-stamp: <2015-07-10 17:45:33 neo> -//--------------------------------------------------------------------------- -#ifndef GRID_VECTOR_TYPES -#define GRID_VECTOR_TYPES - -#ifdef GEN -#include "Grid_generic.h" -#endif -#ifdef SSE4 -#include "Grid_sse4.h" -#endif -#if defined(AVX1) || defined (AVXFMA) || defined(AVX2) || defined(AVXFMA4) -#include "Grid_avx.h" -#endif -#if defined AVX512 -#include "Grid_avx512.h" -#endif -#if defined IMCI -#include "Grid_imci.h" -#endif -#ifdef NEONv8 -#include "Grid_neon.h" -#endif -#if defined QPX -#include "Grid_qpx.h" -#endif - -#include "l1p.h" - -namespace Grid { - -////////////////////////////////////// -// To take the floating point type of real/complex type -////////////////////////////////////// -template -struct RealPart { - typedef T type; -}; -template -struct RealPart > { - typedef T type; -}; - -#include - -////////////////////////////////////// -// demote a vector to real type -////////////////////////////////////// -// type alias used to simplify the syntax of std::enable_if -template using Invoke = typename T::type; -template using EnableIf = Invoke >; -template using NotEnableIf = Invoke >; - -//////////////////////////////////////////////////////// -// Check for complexity with type traits -template struct is_complex : public std::false_type {}; -template <> struct is_complex > : public std::true_type {}; -template <> struct is_complex > : public std::true_type {}; - -template using IfReal = Invoke::value, int> >; -template using IfComplex = Invoke::value, int> >; -template using IfInteger = Invoke::value, int> >; -template using IfSame = Invoke::value, int> >; - -template using IfNotReal = Invoke::value, int> >; -template using IfNotComplex = Invoke::value, int> >; -template using IfNotInteger = Invoke::value, int> >; -template using IfNotSame = Invoke::value, int> >; - -//////////////////////////////////////////////////////// -// Define the operation templates functors -// general forms to allow for vsplat syntax -// need explicit declaration of types when used since -// clang cannot automatically determine the output type sometimes -template -Out trinary(Input1 src_1, Input2 src_2, Input3 src_3, Operation op) { - return op(src_1, src_2, src_3); -} - -template -Out binary(Input1 src_1, Input2 src_2, Operation op) { - return op(src_1, src_2); -} - -template -Out unary(Input src, Operation op) { - return op(src); -} -/////////////////////////////////////////////// - -/* - @brief Grid_simd class for the SIMD vector type operations - */ -template -class Grid_simd { - public: - typedef typename RealPart::type Real; - typedef Vector_type vector_type; - typedef Scalar_type scalar_type; - - typedef union conv_t_union { - Vector_type v; - Scalar_type s[sizeof(Vector_type) / sizeof(Scalar_type)]; - conv_t_union(){}; - } conv_t; - - Vector_type v; - - static inline constexpr int Nsimd(void) { - return sizeof(Vector_type) / sizeof(Scalar_type); - } - - Grid_simd &operator=(const Grid_simd &&rhs) { - v = rhs.v; - return *this; - }; - Grid_simd &operator=(const Grid_simd &rhs) { - v = rhs.v; - return *this; - }; // faster than not declaring it and leaving to the compiler - Grid_simd() = default; - Grid_simd(const Grid_simd &rhs) : v(rhs.v){}; // compiles in movaps - Grid_simd(const Grid_simd &&rhs) : v(rhs.v){}; - - ///////////////////////////// - // Constructors - ///////////////////////////// - Grid_simd &operator=(Zero &z) { - vzero(*this); - return (*this); - } - - // Enable if complex type - template - Grid_simd(const typename std::enable_if::value, S>::type a) { - vsplat(*this, a); - }; - - Grid_simd(const Real a) { vsplat(*this, Scalar_type(a)); }; - - /////////////////////////////////////////////// - // mac, mult, sub, add, adj - /////////////////////////////////////////////// - - // FIXME -- alias this to an inline MAC struct. - friend inline void mac(Grid_simd *__restrict__ y, - const Grid_simd *__restrict__ a, - const Grid_simd *__restrict__ x) { - *y = (*a) * (*x) + (*y); - }; - - friend inline void mult(Grid_simd *__restrict__ y, - const Grid_simd *__restrict__ l, - const Grid_simd *__restrict__ r) { - *y = (*l) * (*r); - } - - friend inline void sub(Grid_simd *__restrict__ y, - const Grid_simd *__restrict__ l, - const Grid_simd *__restrict__ r) { - *y = (*l) - (*r); - } - friend inline void add(Grid_simd *__restrict__ y, - const Grid_simd *__restrict__ l, - const Grid_simd *__restrict__ r) { - *y = (*l) + (*r); - } - friend inline void mac(Grid_simd *__restrict__ y, - const Scalar_type *__restrict__ a, - const Grid_simd *__restrict__ x) { - *y = (*a) * (*x) + (*y); - }; - friend inline void mult(Grid_simd *__restrict__ y, - const Scalar_type *__restrict__ l, - const Grid_simd *__restrict__ r) { - *y = (*l) * (*r); - } - friend inline void sub(Grid_simd *__restrict__ y, - const Scalar_type *__restrict__ l, - const Grid_simd *__restrict__ r) { - *y = (*l) - (*r); - } - friend inline void add(Grid_simd *__restrict__ y, - const Scalar_type *__restrict__ l, - const Grid_simd *__restrict__ r) { - *y = (*l) + (*r); - } - - friend inline void mac(Grid_simd *__restrict__ y, - const Grid_simd *__restrict__ a, - const Scalar_type *__restrict__ x) { - *y = (*a) * (*x) + (*y); - }; - friend inline void mult(Grid_simd *__restrict__ y, - const Grid_simd *__restrict__ l, - const Scalar_type *__restrict__ r) { - *y = (*l) * (*r); - } - friend inline void sub(Grid_simd *__restrict__ y, - const Grid_simd *__restrict__ l, - const Scalar_type *__restrict__ r) { - *y = (*l) - (*r); - } - friend inline void add(Grid_simd *__restrict__ y, - const Grid_simd *__restrict__ l, - const Scalar_type *__restrict__ r) { - *y = (*l) + (*r); - } - - //////////////////////////////////////////////////////////////////////// - // FIXME: gonna remove these load/store, get, set, prefetch - //////////////////////////////////////////////////////////////////////// - friend inline void vset(Grid_simd &ret, Scalar_type *a) { - ret.v = unary(a, VsetSIMD()); - } - - /////////////////////// - // Vstore - /////////////////////// - friend inline void vstore(const Grid_simd &ret, Scalar_type *a) { - binary(ret.v, (Real *)a, VstoreSIMD()); - } - - /////////////////////// - // Vprefetch - /////////////////////// - friend inline void vprefetch(const Grid_simd &v) { - prefetch_HINT_T0((const char *)&v.v); - } - - /////////////////////// - // Reduce - /////////////////////// - friend inline Scalar_type Reduce(const Grid_simd &in) { - return unary(in.v, ReduceSIMD()); - } - - //////////////////////////// - // operator scalar * simd - //////////////////////////// - friend inline Grid_simd operator*(const Scalar_type &a, Grid_simd b) { - Grid_simd va; - vsplat(va, a); - return va * b; - } - friend inline Grid_simd operator*(Grid_simd b, const Scalar_type &a) { - return a * b; - } - - ////////////////////////////////// - // Divides - ////////////////////////////////// - friend inline Grid_simd operator/(const Scalar_type &a, Grid_simd b) { - Grid_simd va; - vsplat(va, a); - return va / b; - } - friend inline Grid_simd operator/(Grid_simd b, const Scalar_type &a) { - Grid_simd va; - vsplat(va, a); - return b / a; - } - - /////////////////////// - // Unary negation - /////////////////////// - friend inline Grid_simd operator-(const Grid_simd &r) { - Grid_simd ret; - vzero(ret); - ret = ret - r; - return ret; - } - // *=,+=,-= operators - inline Grid_simd &operator*=(const Grid_simd &r) { - *this = (*this) * r; - return *this; - // return (*this)*r; ? - } - inline Grid_simd &operator+=(const Grid_simd &r) { - *this = *this + r; - return *this; - } - inline Grid_simd &operator-=(const Grid_simd &r) { - *this = *this - r; - return *this; - } - - /////////////////////////////////////// - // Not all functions are supported - // through SIMD and must breakout to - // scalar type and back again. This - // provides support - /////////////////////////////////////// - - template - friend inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) { - Grid_simd ret; - Grid_simd::conv_t conv; - Grid_simd::scalar_type s; - - conv.v = v.v; - for (int i = 0; i < Nsimd(); i++) { - s = conv.s[i]; - conv.s[i] = func(s); - } - ret.v = conv.v; - return ret; - } - template - friend inline Grid_simd SimdApplyBinop(const functor &func, - const Grid_simd &x, - const Grid_simd &y) { - Grid_simd ret; - Grid_simd::conv_t cx; - Grid_simd::conv_t cy; - Grid_simd::scalar_type sx,sy; - - cx.v = x.v; - cy.v = y.v; - for (int i = 0; i < Nsimd(); i++) { - sx = cx.s[i]; - sy = cy.s[i]; - cx.s[i] = func(sx,sy); - } - ret.v = cx.v; - return ret; - } - /////////////////////// - // Exchange - // Al Ah , Bl Bh -> Al Bl Ah,Bh - /////////////////////// - friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n) - { - if (n==3) { - Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v); - } else if(n==2) { - Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v); - } else if(n==1) { - Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v); - } else if(n==0) { - Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v); - } - } - - //////////////////////////////////////////////////////////////////// - // General permute; assumes vector length is same across - // all subtypes; may not be a good assumption, but could - // add the vector width as a template param for BG/Q for example - //////////////////////////////////////////////////////////////////// - friend inline void permute0(Grid_simd &y, Grid_simd b) { - y.v = Optimization::Permute::Permute0(b.v); - } - friend inline void permute1(Grid_simd &y, Grid_simd b) { - y.v = Optimization::Permute::Permute1(b.v); - } - friend inline void permute2(Grid_simd &y, Grid_simd b) { - y.v = Optimization::Permute::Permute2(b.v); - } - friend inline void permute3(Grid_simd &y, Grid_simd b) { - y.v = Optimization::Permute::Permute3(b.v); - } - friend inline void permute(Grid_simd &y, Grid_simd b, int perm) { - if (perm & RotateBit) { - int dist = perm & 0xF; - y = rotate(b, dist); - return; - } - else if(perm==3) permute3(y, b); - else if(perm==2) permute2(y, b); - else if(perm==1) permute1(y, b); - else if(perm==0) permute0(y, b); - } - - /////////////////////////////// - // Getting single lanes - /////////////////////////////// - inline Scalar_type getlane(int lane) { - return ((Scalar_type*)&v)[lane]; - } - - inline void putlane(const Scalar_type &S, int lane){ - ((Scalar_type*)&v)[lane] = S; - } - - - -}; // end of Grid_simd class definition - -inline void permute(ComplexD &y,ComplexD b, int perm) { y=b; } -inline void permute(ComplexF &y,ComplexF b, int perm) { y=b; } -inline void permute(RealD &y,RealD b, int perm) { y=b; } -inline void permute(RealF &y,RealF b, int perm) { y=b; } - -//////////////////////////////////////////////////////////////////// -// General rotate -//////////////////////////////////////////////////////////////////// -template = 0> -inline Grid_simd rotate(Grid_simd b, int nrot) { - nrot = nrot % Grid_simd::Nsimd(); - Grid_simd ret; - ret.v = Optimization::Rotate::rotate(b.v, nrot); - return ret; -} -template = 0> -inline Grid_simd rotate(Grid_simd b, int nrot) { - nrot = nrot % Grid_simd::Nsimd(); - Grid_simd ret; - ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot); - return ret; -} -template =0> -inline void rotate( Grid_simd &ret,Grid_simd b,int nrot) -{ - nrot = nrot % Grid_simd::Nsimd(); - ret.v = Optimization::Rotate::rotate(b.v,nrot); -} -template =0> -inline void rotate(Grid_simd &ret,Grid_simd b,int nrot) -{ - nrot = nrot % Grid_simd::Nsimd(); - ret.v = Optimization::Rotate::rotate(b.v,2*nrot); -} - -template -inline void vbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ - S* typepun =(S*) &src; - vsplat(ret,typepun[lane]); -} -template =0> -inline void rbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ - S* typepun =(S*) &src; - ret.v = unary(real(typepun[lane]), VsplatSIMD()); -} - - - -/////////////////////// -// Splat -/////////////////////// - -// this is only for the complex version -template = 0, class ABtype> -inline void vsplat(Grid_simd &ret, ABtype a, ABtype b) { - ret.v = binary(a, b, VsplatSIMD()); -} - -// overload if complex -template -inline void vsplat(Grid_simd &ret, EnableIf, S> c) { - vsplat(ret, real(c), imag(c)); -} -template -inline void rsplat(Grid_simd &ret, EnableIf, S> c) { - vsplat(ret, real(c), real(c)); -} - -// if real fill with a, if complex fill with a in the real part (first function -// above) -template -inline void vsplat(Grid_simd &ret, NotEnableIf, S> a) { - ret.v = unary(a, VsplatSIMD()); -} -////////////////////////// - -/////////////////////////////////////////////// -// Initialise to 1,0,i for the correct types -/////////////////////////////////////////////// -// For complex types -template = 0> -inline void vone(Grid_simd &ret) { - vsplat(ret, S(1.0, 0.0)); -} -template = 0> -inline void vzero(Grid_simd &ret) { - vsplat(ret, S(0.0, 0.0)); -} // use xor? -template = 0> -inline void vcomplex_i(Grid_simd &ret) { - vsplat(ret, S(0.0, 1.0)); -} - -template = 0> -inline void visign(Grid_simd &ret) { - vsplat(ret, S(1.0, -1.0)); -} -template = 0> -inline void vrsign(Grid_simd &ret) { - vsplat(ret, S(-1.0, 1.0)); -} - -// if not complex overload here -template = 0> -inline void vone(Grid_simd &ret) { - vsplat(ret, S(1.0)); -} -template = 0> -inline void vzero(Grid_simd &ret) { - vsplat(ret, S(0.0)); -} - -// For integral types -template = 0> -inline void vone(Grid_simd &ret) { - vsplat(ret, 1); -} -template = 0> -inline void vzero(Grid_simd &ret) { - vsplat(ret, 0); -} -template = 0> -inline void vtrue(Grid_simd &ret) { - vsplat(ret, 0xFFFFFFFF); -} -template = 0> -inline void vfalse(Grid_simd &ret) { - vsplat(ret, 0); -} -template -inline void zeroit(Grid_simd &z) { - vzero(z); -} - -/////////////////////// -// Vstream -/////////////////////// -template = 0> -inline void vstream(Grid_simd &out, const Grid_simd &in) { - binary((S *)&out.v, in.v, VstreamSIMD()); -} -template = 0> -inline void vstream(Grid_simd &out, const Grid_simd &in) { - typedef typename S::value_type T; - binary((T *)&out.v, in.v, VstreamSIMD()); -} -template = 0> -inline void vstream(Grid_simd &out, const Grid_simd &in) { - out = in; -} - -//////////////////////////////////// -// Arithmetic operator overloads +,-,* -//////////////////////////////////// -template -inline Grid_simd operator+(Grid_simd a, Grid_simd b) { - Grid_simd ret; - ret.v = binary(a.v, b.v, SumSIMD()); - return ret; -}; - -template -inline Grid_simd operator-(Grid_simd a, Grid_simd b) { - Grid_simd ret; - ret.v = binary(a.v, b.v, SubSIMD()); - return ret; -}; - -// Distinguish between complex types and others -template = 0> -inline Grid_simd real_mult(Grid_simd a, Grid_simd b) { - Grid_simd ret; - ret.v = binary(a.v, b.v, MultRealPartSIMD()); - return ret; -}; -template = 0> -inline Grid_simd real_madd(Grid_simd a, Grid_simd b, Grid_simd c) { - Grid_simd ret; - ret.v = trinary(a.v, b.v, c.v, MaddRealPartSIMD()); - return ret; -}; - - -// Distinguish between complex types and others -template = 0> -inline Grid_simd operator*(Grid_simd a, Grid_simd b) { - Grid_simd ret; - ret.v = binary(a.v, b.v, MultComplexSIMD()); - return ret; -}; - -// Real/Integer types -template = 0> -inline Grid_simd operator*(Grid_simd a, Grid_simd b) { - Grid_simd ret; - ret.v = binary(a.v, b.v, MultSIMD()); - return ret; -}; - -// Distinguish between complex types and others -template = 0> -inline Grid_simd operator/(Grid_simd a, Grid_simd b) { - typedef Grid_simd simd; - - simd ret; - simd den; - typename simd::conv_t conv; - - ret = a * conjugate(b) ; - den = b * conjugate(b) ; - - - auto real_den = toReal(den); - - ret.v=binary(ret.v, real_den.v, DivSIMD()); - - return ret; -}; - -// Real/Integer types -template = 0> -inline Grid_simd operator/(Grid_simd a, Grid_simd b) { - Grid_simd ret; - ret.v = binary(a.v, b.v, DivSIMD()); - return ret; -}; - -/////////////////////// -// Conjugate -/////////////////////// -template = 0> -inline Grid_simd conjugate(const Grid_simd &in) { - Grid_simd ret; - ret.v = unary(in.v, ConjSIMD()); - return ret; -} -template = 0> -inline Grid_simd conjugate(const Grid_simd &in) { - return in; // for real objects -} -// Suppress adj for integer types... // odd; why conjugate above but not adj?? -template = 0> -inline Grid_simd adj(const Grid_simd &in) { - return conjugate(in); -} - -/////////////////////// -// timesMinusI -/////////////////////// -template = 0> -inline void timesMinusI(Grid_simd &ret, const Grid_simd &in) { - ret.v = binary(in.v, ret.v, TimesMinusISIMD()); -} -template = 0> -inline Grid_simd timesMinusI(const Grid_simd &in) { - Grid_simd ret; - timesMinusI(ret, in); - return ret; -} -template = 0> -inline Grid_simd timesMinusI(const Grid_simd &in) { - return in; -} - -/////////////////////// -// timesI -/////////////////////// -template = 0> -inline void timesI(Grid_simd &ret, const Grid_simd &in) { - ret.v = binary(in.v, ret.v, TimesISIMD()); -} -template = 0> -inline Grid_simd timesI(const Grid_simd &in) { - Grid_simd ret; - timesI(ret, in); - return ret; -} -template = 0> -inline Grid_simd timesI(const Grid_simd &in) { - return in; -} - -///////////////////// -// Inner, outer -///////////////////// - -template -inline Grid_simd innerProduct(const Grid_simd &l, - const Grid_simd &r) { - return conjugate(l) * r; -} -template -inline Grid_simd outerProduct(const Grid_simd &l, - const Grid_simd &r) { - return l * conjugate(r); -} - -template -inline Grid_simd trace(const Grid_simd &arg) { - return arg; -} - -//////////////////////////////////////////////////////////// -// copy/splat complex real parts into real; -// insert real into complex and zero imag; -//////////////////////////////////////////////////////////// - -// real = toReal( complex ) -template = 0> -inline Grid_simd toReal(const Grid_simd, V> &in) { - typedef Grid_simd simd; - simd ret; - typename simd::conv_t conv; - conv.v = in.v; // copy the vector content (bytewise) - for (int i = 0; i < simd::Nsimd(); i += 2) { - conv.s[i + 1] = conv.s[i]; // duplicate (r,r);(r,r);(r,r); etc... - } - ret.v = conv.v; - return ret; -} - -// complex = toComplex( real ) -template = 0> // must be a real arg -inline Grid_simd, V> toComplex(const Grid_simd &in) { - typedef Grid_simd Rsimd; - typedef Grid_simd, V> Csimd; - typename Rsimd::conv_t conv; // address as real - - conv.v = in.v; - for (int i = 0; i < Rsimd::Nsimd(); i += 2) { - assert(conv.s[i + 1] == conv.s[i]); - // trap any cases where real was not duplicated - // indicating the SIMD grids of real and imag assignment did not correctly - // match - conv.s[i + 1] = 0.0; // zero imaginary parts - } - Csimd ret; - ret.v = conv.v; - return ret; -} - -/////////////////////////////// -// Define available types -/////////////////////////////// -typedef Grid_simd vRealF; -typedef Grid_simd vRealD; -typedef Grid_simd, SIMD_Ftype> vComplexF; -typedef Grid_simd, SIMD_Dtype> vComplexD; -typedef Grid_simd vInteger; - -// Half precision; no arithmetic support -typedef Grid_simd vRealH; -typedef Grid_simd, SIMD_Htype> vComplexH; - -inline void precisionChange(vRealF *out,vRealD *in,int nvec) -{ - assert((nvec&0x1)==0); - for(int m=0;m*2 -struct is_simd : public std::false_type {}; -template <> struct is_simd : public std::true_type {}; -template <> struct is_simd : public std::true_type {}; -template <> struct is_simd : public std::true_type {}; -template <> struct is_simd : public std::true_type {}; -template <> struct is_simd : public std::true_type {}; - -template using IfSimd = Invoke::value, int> >; -template using IfNotSimd = Invoke::value, unsigned> >; -} - -#endif diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 424b5573..e05fecc4 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -327,16 +327,12 @@ class Grid_simd { // provides support /////////////////////////////////////// - //#if (__GNUC__ == 5 ) || ( ( __GNUC__ == 6 ) && __GNUC_MINOR__ < 3 ) - //#pragma GCC push_options - //#pragma GCC optimize ("O0") - //#endif template friend inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) { Grid_simd ret; Grid_simd::conv_t conv; Grid_simd::scalar_type s; - + conv.v = v.v; for (int i = 0; i < Nsimd(); i++) { s = conv.s[i]; @@ -364,11 +360,8 @@ class Grid_simd { ret.v = cx.v; return ret; } - //#if (__GNUC__ == 5 ) || ( ( __GNUC__ == 6 ) && __GNUC_MINOR__ < 3 ) - //#pragma GCC pop_options - //#endif /////////////////////// - // Exchange + // Exchange // Al Ah , Bl Bh -> Al Bl Ah,Bh /////////////////////// friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n) @@ -379,7 +372,7 @@ class Grid_simd { Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v); } else if(n==1) { Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v); - } else if(n==0) { + } else if(n==0) { Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v); } } @@ -406,7 +399,7 @@ class Grid_simd { int dist = perm & 0xF; y = rotate(b, dist); return; - } + } else if(perm==3) permute3(y, b); else if(perm==2) permute2(y, b); else if(perm==1) permute1(y, b); @@ -425,10 +418,9 @@ class Grid_simd { } - + }; // end of Grid_simd class definition - inline void permute(ComplexD &y,ComplexD b, int perm) { y=b; } inline void permute(ComplexF &y,ComplexF b, int perm) { y=b; } inline void permute(RealD &y,RealD b, int perm) { y=b; } @@ -451,29 +443,29 @@ inline Grid_simd rotate(Grid_simd b, int nrot) { ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot); return ret; } -template =0> +template =0> inline void rotate( Grid_simd &ret,Grid_simd b,int nrot) { nrot = nrot % Grid_simd::Nsimd(); ret.v = Optimization::Rotate::rotate(b.v,nrot); } -template =0> +template =0> inline void rotate(Grid_simd &ret,Grid_simd b,int nrot) { nrot = nrot % Grid_simd::Nsimd(); ret.v = Optimization::Rotate::rotate(b.v,2*nrot); } -template +template inline void vbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ S* typepun =(S*) &src; vsplat(ret,typepun[lane]); -} -template =0> +} +template =0> inline void rbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ S* typepun =(S*) &src; ret.v = unary(real(typepun[lane]), VsplatSIMD()); -} +} @@ -604,27 +596,13 @@ inline Grid_simd real_mult(Grid_simd a, Grid_simd b) { ret.v = binary(a.v, b.v, MultRealPartSIMD()); return ret; }; -// TEST for Test_simd -template = 0> -inline Grid_simd real_mult(std::complex a, std::complex b) { - Grid_simd ret; - //ret.v = binary(a.v, b.v, MultRealPartSIMD()); - return ret; -}; - template = 0> inline Grid_simd real_madd(Grid_simd a, Grid_simd b, Grid_simd c) { Grid_simd ret; ret.v = trinary(a.v, b.v, c.v, MaddRealPartSIMD()); return ret; }; -// TEST for Test_simd -template = 0> -inline Grid_simd real_madd(std::complex a, std::complex b) { - Grid_simd ret; - //ret.v = binary(a.v, b.v, MultRealPartSIMD()); - return ret; -}; + // Distinguish between complex types and others template = 0> @@ -654,7 +632,7 @@ inline Grid_simd operator/(Grid_simd a, Grid_simd b) { ret = a * conjugate(b) ; den = b * conjugate(b) ; - + auto real_den = toReal(den); ret.v=binary(ret.v, real_den.v, DivSIMD()); @@ -773,8 +751,8 @@ inline Grid_simd, V> toComplex(const Grid_simd &in) { conv.v = in.v; for (int i = 0; i < Rsimd::Nsimd(); i += 2) { - assert(conv.s[i + 1] == - conv.s[i]); // trap any cases where real was not duplicated + assert(conv.s[i + 1] == conv.s[i]); + // trap any cases where real was not duplicated // indicating the SIMD grids of real and imag assignment did not correctly // match conv.s[i + 1] = 0.0; // zero imaginary parts @@ -852,8 +830,6 @@ inline void precisionChange(vComplexD *out,vComplexF *in,int nvec){ precisionCha inline void precisionChange(vComplexD *out,vComplexH *in,int nvec){ precisionChange((vRealD *)out,(vRealH *)in,nvec);} inline void precisionChange(vComplexF *out,vComplexH *in,int nvec){ precisionChange((vRealF *)out,(vRealH *)in,nvec);} - - // Check our vector types are of an appropriate size. #if defined QPX static_assert(2*sizeof(SIMD_Ftype) == sizeof(SIMD_Dtype), "SIMD vector lengths incorrect"); @@ -868,21 +844,14 @@ static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths inc ///////////////////////////////////////// template struct is_simd : public std::false_type {}; -template <> -struct is_simd : public std::true_type {}; -template <> -struct is_simd : public std::true_type {}; -template <> -struct is_simd : public std::true_type {}; -template <> -struct is_simd : public std::true_type {}; -template <> -struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; -template -using IfSimd = Invoke::value, int> >; -template -using IfNotSimd = Invoke::value, unsigned> >; +template using IfSimd = Invoke::value, int> >; +template using IfNotSimd = Invoke::value, unsigned> >; } #endif From 0933aeefd4ad053015c4e6bf232df235c546047a Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Wed, 28 Jun 2017 20:22:22 +0200 Subject: [PATCH 5/8] corrected Grid_neon.h --- lib/simd/Grid_neon.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/simd/Grid_neon.h b/lib/simd/Grid_neon.h index f3f802e7..38815389 100644 --- a/lib/simd/Grid_neon.h +++ b/lib/simd/Grid_neon.h @@ -41,6 +41,10 @@ Author: neo //#ifndef ARM_NEON //#define ARM_NEON +#ifndef GEN_SIMD_WIDTH +#define GEN_SIMD_WIDTH 16u +#endif + #include "Grid_generic_types.h" #include From 8859a151cc844b8170729285dc2a272e4ffa4940 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Thu, 29 Jun 2017 11:30:29 +0100 Subject: [PATCH 6/8] Small corrections to the NEON port --- configure.ac | 2 +- lib/qcd/smearing/WilsonFlow.h | 9 ++++----- lib/simd/Grid_neon.h | 15 +++++---------- lib/simd/Grid_vector_types.h | 2 +- 4 files changed, 11 insertions(+), 17 deletions(-) diff --git a/configure.ac b/configure.ac index a69b97e3..75cf7891 100644 --- a/configure.ac +++ b/configure.ac @@ -250,7 +250,7 @@ case ${ax_cv_cxx_compiler_vendor} in SIMD_FLAGS='';; NEONv8) AC_DEFINE([NEONV8],[1],[ARMv8 NEON]) - SIMD_FLAGS='';; + SIMD_FLAGS='-march=armv8-a';; QPX|BGQ) AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q]) SIMD_FLAGS='';; diff --git a/lib/qcd/smearing/WilsonFlow.h b/lib/qcd/smearing/WilsonFlow.h index 5e9f2d95..4f5c0d43 100644 --- a/lib/qcd/smearing/WilsonFlow.h +++ b/lib/qcd/smearing/WilsonFlow.h @@ -108,7 +108,7 @@ void WilsonFlow::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real if (maxTau - taus < epsilon){ epsilon = maxTau-taus; } - std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl; + //std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl; GaugeField Z(U._grid); GaugeField Zprime(U._grid); GaugeField tmp(U._grid), Uprime(U._grid); @@ -138,10 +138,10 @@ void WilsonFlow::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real // adjust integration step taus += epsilon; - std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl; + //std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl; epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.); - std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl; + //std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl; } @@ -166,7 +166,6 @@ void WilsonFlow::smear(GaugeField& out, const GaugeField& in) const { out = in; for (unsigned int step = 1; step <= Nstep; step++) { auto start = std::chrono::high_resolution_clock::now(); - std::cout << GridLogMessage << "Evolution time :"<< tau(step) << std::endl; evolve_step(out); auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration diff = end - start; @@ -191,7 +190,7 @@ void WilsonFlow::smear_adaptive(GaugeField& out, const GaugeField& in, Re unsigned int step = 0; do{ step++; - std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl; + //std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl; evolve_step_adaptive(out, maxTau); std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " << step << " " diff --git a/lib/simd/Grid_neon.h b/lib/simd/Grid_neon.h index 38815389..d6eb9c5a 100644 --- a/lib/simd/Grid_neon.h +++ b/lib/simd/Grid_neon.h @@ -6,9 +6,9 @@ Copyright (C) 2015 -Author: Nils Meyer -Author: Peter Boyle -Author: neo + Author: Nils Meyer + Author: Peter Boyle + Author: neo This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -27,7 +27,7 @@ Author: neo See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ -//---------------------------------------------------------------------- + /* ARMv8 NEON intrinsics layer by @@ -37,9 +37,6 @@ Author: neo SFB/TRR55 */ -//---------------------------------------------------------------------- -//#ifndef ARM_NEON -//#define ARM_NEON #ifndef GEN_SIMD_WIDTH #define GEN_SIMD_WIDTH 16u @@ -606,6 +603,4 @@ namespace Optimization { typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesI TimesISIMD; -} - -//#endif // ARM_NEON +} \ No newline at end of file diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index e05fecc4..27585547 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -53,7 +53,7 @@ directory #if defined IMCI #include "Grid_imci.h" #endif -#ifdef NEONv8 +#ifdef NEONV8 #include "Grid_neon.h" #endif #if defined QPX From bf630a6821ea8923fc9690a03f621f6d69b31f4e Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Thu, 29 Jun 2017 11:42:25 +0100 Subject: [PATCH 7/8] README file update --- README.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 9432abe1..5d168298 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ License: GPL v2. -Last update Nov 2016. +Last update June 2017. _Please do not send pull requests to the `master` branch which is reserved for releases._ @@ -78,14 +78,17 @@ optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a signifi for most programmers. The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture. -Presently SSE4 (128 bit) AVX, AVX2, QPX (256 bit), IMCI, and AVX512 (512 bit) targets are supported (ARM NEON on the way). +Presently SSE4, ARM NEON (128 bits) AVX, AVX2, QPX (256 bits), IMCI and AVX512 (512 bits) targets are supported. -These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. These may be useful in themselves for other programmers. +These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`. MPI, OpenMP, and SIMD parallelism are present in the library. Please see https://arxiv.org/abs/1512.03487 for more detail. +### Required libraries +Grid requires [GMP](https://gmplib.org/), [MPFR](http://www.mpfr.org/) and optionally [HDF5](https://support.hdfgroup.org/HDF5/) and [LIME](http://usqcd-software.github.io/c-lime/) (for ILDG file format support) to be installed. + ### Quick start First, start by cloning the repository: @@ -173,7 +176,8 @@ The following options can be use with the `--enable-simd=` option to target diff | `AVXFMA4` | AVX (256 bit) + FMA4 | | `AVX2` | AVX 2 (256 bit) | | `AVX512` | AVX 512 bit | -| `QPX` | QPX (256 bit) | +| `NEONv8` | ARM NEON (128 bit) | +| `QPX` | IBM QPX (256 bit) | Alternatively, some CPU codenames can be directly used: From 09d09d0fe5bce853e1b42115371cd935a4e29cc0 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Thu, 29 Jun 2017 11:48:11 +0100 Subject: [PATCH 8/8] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 5d168298..1f0b450c 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`. MPI, OpenMP, and SIMD parallelism are present in the library. -Please see https://arxiv.org/abs/1512.03487 for more detail. +Please see [this paper](https://arxiv.org/abs/1512.03487) for more detail. ### Required libraries Grid requires [GMP](https://gmplib.org/), [MPFR](http://www.mpfr.org/) and optionally [HDF5](https://support.hdfgroup.org/HDF5/) and [LIME](http://usqcd-software.github.io/c-lime/) (for ILDG file format support) to be installed. @@ -176,7 +176,7 @@ The following options can be use with the `--enable-simd=` option to target diff | `AVXFMA4` | AVX (256 bit) + FMA4 | | `AVX2` | AVX 2 (256 bit) | | `AVX512` | AVX 512 bit | -| `NEONv8` | ARM NEON (128 bit) | +| `NEONv8` | [ARM NEON](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0024a/ch07s03.html) (128 bit) | | `QPX` | IBM QPX (256 bit) | Alternatively, some CPU codenames can be directly used: @@ -216,4 +216,4 @@ where `` is the UNIX prefix where GMP and MPFR are installed. If you are w --with-mpfr= \ --enable-mkl \ CXX=CC CC=cc -``` \ No newline at end of file +```