diff --git a/lib/simd/Grid_generic.h b/lib/simd/Grid_generic.h index 91e9cda2..7972da55 100644 --- a/lib/simd/Grid_generic.h +++ b/lib/simd/Grid_generic.h @@ -5,8 +5,10 @@ Source file: ./lib/simd/Grid_generic.h Copyright (C) 2015 + Copyright (C) 2017 Author: Antonin Portelli + Andrew Lawson This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -26,51 +28,10 @@ Author: Antonin Portelli *************************************************************************************/ /* END LEGAL */ -static_assert(GEN_SIMD_WIDTH % 16u == 0, "SIMD vector size is not an integer multiple of 16 bytes"); - -//#define VECTOR_LOOPS - -// playing with compiler pragmas -#ifdef VECTOR_LOOPS -#ifdef __clang__ -#define VECTOR_FOR(i, w, inc)\ -_Pragma("clang loop unroll(full) vectorize(enable) interleave(enable) vectorize_width(w)")\ -for (unsigned int i = 0; i < w; i += inc) -#elif defined __INTEL_COMPILER -#define VECTOR_FOR(i, w, inc)\ -_Pragma("simd vectorlength(w*8)")\ -for (unsigned int i = 0; i < w; i += inc) -#else -#define VECTOR_FOR(i, w, inc)\ -for (unsigned int i = 0; i < w; i += inc) -#endif -#else -#define VECTOR_FOR(i, w, inc)\ -for (unsigned int i = 0; i < w; i += inc) -#endif +#include "Grid_generic_types.h" namespace Grid { namespace Optimization { - - // type traits giving the number of elements for each vector type - template struct W; - template <> struct W { - constexpr static unsigned int c = GEN_SIMD_WIDTH/16u; - constexpr static unsigned int r = GEN_SIMD_WIDTH/8u; - }; - template <> struct W { - constexpr static unsigned int c = GEN_SIMD_WIDTH/8u; - constexpr static unsigned int r = GEN_SIMD_WIDTH/4u; - }; - - // SIMD vector types - template - struct vec { - alignas(GEN_SIMD_WIDTH) T v[W::r]; - }; - - typedef vec vecf; - typedef vec vecd; struct Vsplat{ // Complex @@ -99,11 +60,6 @@ namespace Optimization { return out; } - - // Integer - inline int operator()(Integer a){ - return a; - } }; struct Vstore{ @@ -112,11 +68,6 @@ namespace Optimization { inline void operator()(vec a, T *D){ *((vec *)D) = a; } - //Integer - inline void operator()(int a, Integer *I){ - *I = a; - } - }; struct Vstream{ @@ -151,11 +102,6 @@ namespace Optimization { return out; } - - // Integer - inline int operator()(Integer *a){ - return *a; - } }; ///////////////////////////////////////////////////// @@ -174,11 +120,6 @@ namespace Optimization { return out; } - - //I nteger - inline int operator()(int a, int b){ - return a + b; - } }; struct Sub{ @@ -194,11 +135,6 @@ namespace Optimization { return out; } - - //Integer - inline int operator()(int a, int b){ - return a-b; - } }; struct Mult{ @@ -214,11 +150,6 @@ namespace Optimization { return out; } - - // Integer - inline int operator()(int a, int b){ - return a*b; - } }; #define cmul(a, b, c, i)\ @@ -232,13 +163,26 @@ namespace Optimization { VECTOR_FOR(i, W::c, 1) { - out.v[2*i] = a[2*i]*b[2*i]; - out.v[2*i+1] = a[2*i]*b[2*i+1]; + out.v[2*i] = a.v[2*i]*b.v[2*i]; + out.v[2*i+1] = a.v[2*i]*b.v[2*i+1]; } return out; - }; + } }; + struct MaddRealPart{ + template + inline vec operator()(vec a, vec b, vec c){ + vec out; + + VECTOR_FOR(i, W::c, 1) + { + out.v[2*i] = a.v[2*i]*b.v[2*i] + c.v[2*i]; + out.v[2*i+1] = a.v[2*i]*b.v[2*i+1] + c.v[2*i+1]; + } + return out; + } + }; struct MultComplex{ // Complex @@ -369,6 +313,11 @@ namespace Optimization { } struct Rotate{ + + template static inline vec tRotate(vec in){ + return rotate(in, n); + } + template static inline vec rotate(vec in, int n){ vec out; @@ -442,8 +391,12 @@ namespace Optimization { //Integer Reduce template<> - inline Integer Reduce::operator()(int in){ - return in; + inline Integer Reduce::operator()(veci in){ + Integer a = 0; + + acc(in.v, a, 0, 1, W::r); + + return a; } } @@ -452,7 +405,7 @@ namespace Optimization { typedef Optimization::vecf SIMD_Ftype; // Single precision type typedef Optimization::vecd SIMD_Dtype; // Double precision type - typedef int SIMD_Itype; // Integer type + typedef Optimization::veci SIMD_Itype; // Integer type // prefetch utilities inline void v_prefetch0(int size, const char *ptr){}; @@ -472,6 +425,7 @@ namespace Optimization { typedef Optimization::Mult MultSIMD; typedef Optimization::MultComplex MultComplexSIMD; typedef Optimization::MultRealPart MultRealPartSIMD; + typedef Optimization::MaddRealPart MaddRealPartSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesI TimesISIMD; diff --git a/lib/simd/Grid_generic_types.h b/lib/simd/Grid_generic_types.h new file mode 100644 index 00000000..2142bc8e --- /dev/null +++ b/lib/simd/Grid_generic_types.h @@ -0,0 +1,80 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/simd/Grid_generic_types.h + + Copyright (C) 2017 + +Author: Antonin Portelli + Andrew Lawson + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ + +static_assert(GEN_SIMD_WIDTH % 16u == 0, "SIMD vector size is not an integer multiple of 16 bytes"); + +//#define VECTOR_LOOPS + +// playing with compiler pragmas +#ifdef VECTOR_LOOPS +#ifdef __clang__ +#define VECTOR_FOR(i, w, inc)\ +_Pragma("clang loop unroll(full) vectorize(enable) interleave(enable) vectorize_width(w)")\ +for (unsigned int i = 0; i < w; i += inc) +#elif defined __INTEL_COMPILER +#define VECTOR_FOR(i, w, inc)\ +_Pragma("simd vectorlength(w*8)")\ +for (unsigned int i = 0; i < w; i += inc) +#else +#define VECTOR_FOR(i, w, inc)\ +for (unsigned int i = 0; i < w; i += inc) +#endif +#else +#define VECTOR_FOR(i, w, inc)\ +for (unsigned int i = 0; i < w; i += inc) +#endif + +namespace Grid { +namespace Optimization { + + // type traits giving the number of elements for each vector type + template struct W; + template <> struct W { + constexpr static unsigned int c = GEN_SIMD_WIDTH/16u; + constexpr static unsigned int r = GEN_SIMD_WIDTH/8u; + }; + template <> struct W { + constexpr static unsigned int c = GEN_SIMD_WIDTH/8u; + constexpr static unsigned int r = GEN_SIMD_WIDTH/4u; + }; + template <> struct W { + constexpr static unsigned int r = GEN_SIMD_WIDTH/4u; + }; + + // SIMD vector types + template + struct vec { + alignas(GEN_SIMD_WIDTH) T v[W::r]; + }; + + typedef vec vecf; + typedef vec vecd; + typedef vec veci; + +}} diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h index 99a9ea68..44403d87 100644 --- a/lib/simd/Grid_qpx.h +++ b/lib/simd/Grid_qpx.h @@ -5,8 +5,10 @@ Source file: ./lib/simd/Grid_qpx.h Copyright (C) 2016 + Copyright (C) 2017 Author: Antonin Portelli + Andrew Lawson This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -25,6 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution directory ******************************************************************************/ +#ifndef GEN_SIMD_WIDTH +#define GEN_SIMD_WIDTH 32u +#endif +#include "Grid_generic_types.h" // Definitions for simulated integer SIMD. + namespace Grid { namespace Optimization { typedef struct @@ -62,8 +69,15 @@ namespace Optimization { return (vector4double){a, a, a, a}; } //Integer - inline int operator()(Integer a){ - return a; + inline veci operator()(Integer a){ + veci out; + + VECTOR_FOR(i, W::r, 1) + { + out.v[i] = a; + } + + return out; } }; @@ -88,9 +102,10 @@ namespace Optimization { inline void operator()(vector4double a, double *d){ vec_st(a, 0, d); } + //Integer - inline void operator()(int a, Integer *i){ - i[0] = a; + inline void operator()(veci a, Integer *i){ + *((veci *)i) = a; } }; @@ -142,11 +157,13 @@ namespace Optimization { return vec_ld(0, a); } // Integer - inline int operator()(Integer *a){ - return a[0]; - } - - + inline veci operator()(Integer *a){ + veci out; + + out = *((veci *)a); + + return out; + } }; template @@ -200,8 +217,15 @@ namespace Optimization { FLOAT_WRAP_2(operator(), inline) //Integer - inline int operator()(int a, int b){ - return a + b; + inline veci operator()(veci a, veci b){ + veci out; + + VECTOR_FOR(i, W::r, 1) + { + out.v[i] = a.v[i] + b.v[i]; + } + + return out; } }; @@ -215,8 +239,15 @@ namespace Optimization { FLOAT_WRAP_2(operator(), inline) //Integer - inline int operator()(int a, int b){ - return a - b; + inline veci operator()(veci a, veci b){ + veci out; + + VECTOR_FOR(i, W::r, 1) + { + out.v[i] = a.v[i] - b.v[i]; + } + + return out; } }; @@ -248,8 +279,15 @@ namespace Optimization { FLOAT_WRAP_2(operator(), inline) // Integer - inline int operator()(int a, int b){ - return a*b; + inline veci operator()(veci a, veci b){ + veci out; + + VECTOR_FOR(i, W::r, 1) + { + out.v[i] = a.v[i]*b.v[i]; + } + + return out; } }; @@ -263,8 +301,15 @@ namespace Optimization { FLOAT_WRAP_2(operator(), inline) // Integer - inline int operator()(int a, int b){ - return a/b; + inline veci operator()(veci a, veci b){ + veci out; + + VECTOR_FOR(i, W::r, 1) + { + out.v[i] = a.v[i]/b.v[i]; + } + + return out; } }; @@ -418,7 +463,7 @@ namespace Optimization { // Here assign types typedef Optimization::vector4float SIMD_Ftype; // Single precision type typedef vector4double SIMD_Dtype; // Double precision type -typedef int SIMD_Itype; // Integer type +typedef Optimization::veci SIMD_Itype; // Integer type // prefetch utilities inline void v_prefetch0(int size, const char *ptr){}; diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 8a6ab2e7..3f8a6f3a 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -747,6 +747,15 @@ typedef Grid_simd, SIMD_Ftype> vComplexF; typedef Grid_simd, SIMD_Dtype> vComplexD; typedef Grid_simd vInteger; +// Check our vector types are of an appropriate size. +#if defined QPX +static_assert(2*sizeof(SIMD_Ftype) == sizeof(SIMD_Dtype), "SIMD vector lengths incorrect"); +static_assert(2*sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths incorrect"); +#else +static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Dtype), "SIMD vector lengths incorrect"); +static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths incorrect"); +#endif + ///////////////////////////////////////// // Some traits to recognise the types ///////////////////////////////////////// diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc index 92f9bcd8..cd485cce 100644 --- a/tests/Test_simd.cc +++ b/tests/Test_simd.cc @@ -180,6 +180,65 @@ void Tester(const functor &func) assert(ok==0); } +template +void IntTester(const functor &func) +{ + typedef Integer scal; + typedef vInteger vec; + GridSerialRNG sRNG; + sRNG.SeedRandomDevice(); + + int Nsimd = vec::Nsimd(); + + std::vector input1(Nsimd); + std::vector input2(Nsimd); + std::vector result(Nsimd); + std::vector reference(Nsimd); + + std::vector > buf(3); + vec & v_input1 = buf[0]; + vec & v_input2 = buf[1]; + vec & v_result = buf[2]; + + + for(int i=0;i(v_input1,input1); + merge(v_input2,input2); + merge(v_result,result); + + func(v_result,v_input1,v_input2); + + for(int i=0;i(v_result,result); + + std::cout << GridLogMessage << " " << func.name() << std::endl; + + std::cout << GridLogDebug << v_input1 << std::endl; + std::cout << GridLogDebug << v_input2 << std::endl; + std::cout << GridLogDebug << v_result << std::endl; + + int ok=0; + for(int i=0;i void ReductionTester(const functor &func) @@ -472,6 +531,13 @@ int main (int argc, char ** argv) for(int r=0;r(funcRotate(r)); } + + std::cout<