diff --git a/configure.ac b/configure.ac
index fb80b1ae..7c81e5b5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -149,8 +149,14 @@ CXXFLAGS=$CXXFLAGS_CPY
 LDFLAGS=$LDFLAGS_CPY
 
 ############### SIMD instruction selection
-AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=],
-              [select SIMD target (cf. README.md)])], [ac_SIMD=${enable_simd}], [ac_SIMD=GEN])
+AC_ARG_ENABLE([simd],[AS_HELP_STRING([--enable-simd=code],
+              [select SIMD target (cf. README.md)])], [ac_SIMD=${enable_simd}], [ac_SIMD=GEN])
+
+AC_ARG_ENABLE([gen-simd-width],
+            [AS_HELP_STRING([--enable-gen-simd-width=size],
+            [size (in bytes) of the generic SIMD vectors (default: 32)])],
+            [ac_gen_simd_width=$enable_gen_simd_width],
+            [ac_gen_simd_width=32])
 
 case ${ax_cv_cxx_compiler_vendor} in
   clang|gnu)
@@ -179,8 +185,11 @@ case ${ax_cv_cxx_compiler_vendor} in
       KNL)
         AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
         SIMD_FLAGS='-march=knl';;
-      GEN256)
-        AC_DEFINE([GEN256],[1],[generic vector code])
+      GEN)
+        AC_DEFINE([GEN],[1],[generic vector code])
+        AC_DEFINE_UNQUOTED([GEN_SIMD_WIDTH],[$ac_gen_simd_width],
+                           [generic SIMD vector width (in bytes)])
+        SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
         SIMD_FLAGS='';;
       QPX|BGQ)
         AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
@@ -211,8 +220,11 @@ case ${ax_cv_cxx_compiler_vendor} in
       KNL)
         AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing])
         SIMD_FLAGS='-xmic-avx512';;
-      GEN256)
-        AC_DEFINE([GEN256],[1],[generic vector code])
+      GEN)
+        AC_DEFINE([GEN],[1],[generic vector code])
+        AC_DEFINE_UNQUOTED([GEN_SIMD_WIDTH],[$ac_gen_simd_width],
+                           [generic SIMD vector width (in bytes)])
+        SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
         SIMD_FLAGS='';;
       *)
         AC_MSG_ERROR(["SIMD option ${ac_SIMD} not supported by the Intel compiler"]);;
@@ -382,7 +394,7 @@ os (target) : $target_os
 compiler vendor : ${ax_cv_cxx_compiler_vendor}
 compiler version : ${ax_cv_gxx_version}
 ----- BUILD OPTIONS -----------------------------------
-SIMD : ${ac_SIMD}
+SIMD : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
Threading : ${ac_openmp} Communications type : ${comms_type} Default precision : ${ac_PRECISION} diff --git a/lib/simd/Grid_generic.h b/lib/simd/Grid_generic.h index e69de29b..62c78afb 100644 --- a/lib/simd/Grid_generic.h +++ b/lib/simd/Grid_generic.h @@ -0,0 +1,462 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/simd/Grid_generic.h + + Copyright (C) 2015 + +Author: Antonin Portelli + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /* END LEGAL */
+
+static_assert(GEN_SIMD_WIDTH % 16u == 0, "SIMD vector size is not an integer multiple of 16 bytes");
+
+//#define VECTOR_LOOPS
+
+// playing with compiler pragmas (only when VECTOR_LOOPS is defined; macro arguments are not expanded inside the _Pragma string literals)
+#ifdef VECTOR_LOOPS
+#ifdef __clang__
+#define VECTOR_FOR(i, w, inc)\
+_Pragma("clang loop unroll(full) vectorize(enable) interleave(enable) vectorize_width(w)")\
+for (unsigned int i = 0; i < w; i += inc)
+#elif defined __INTEL_COMPILER
+#define VECTOR_FOR(i, w, inc)\
+_Pragma("simd vectorlength(w*8)")\
+for (unsigned int i = 0; i < w; i += inc)
+#else
+#define VECTOR_FOR(i, w, inc)\
+for (unsigned int i = 0; i < w; i += inc)
+#endif
+#else
+#define VECTOR_FOR(i, w, inc)\
+for (unsigned int i = 0; i < w; i += inc)
+#endif
+
+namespace Grid {
+namespace Optimization {
+
+  // type traits giving the number of elements for each vector type (c: complex numbers, r: real numbers)
+  template <typename T> struct W;
+  template <> struct W<double> {
+    constexpr static unsigned int c = GEN_SIMD_WIDTH/16u;
+    constexpr static unsigned int r = GEN_SIMD_WIDTH/8u;
+  };
+  template <> struct W<float> {
+    constexpr static unsigned int c = GEN_SIMD_WIDTH/8u;
+    constexpr static unsigned int r = GEN_SIMD_WIDTH/4u;
+  };
+
+  // SIMD vector types: a plain array of W<T>::r elements aligned on GEN_SIMD_WIDTH bytes
+  template <typename T>
+  struct vec {
+    alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
+  };
+
+  typedef vec<float>  vecf;
+  typedef vec<double> vecd;
+
+  struct Vsplat{
+    // Complex: fill every (even, odd) element pair with (a, b)
+    template <typename T>
+    inline vec<T> operator()(T a, T b){
+      vec<T> out;
+
+      VECTOR_FOR(i, W<T>::r, 2)
+      {
+        out.v[i]   = a;
+        out.v[i+1] = b;
+      }
+
+      return out;
+    }
+
+    // Real: fill every element with a
+    template <typename T>
+    inline vec<T> operator()(T a){
+      vec<T> out;
+
+      VECTOR_FOR(i, W<T>::r, 1)
+      {
+        out.v[i] = a;
+      }
+
+      return out;
+    }
+
+    // Integer: scalar pass-through
+    inline int operator()(Integer a){
+      return a;
+    }
+  };
+
+  struct Vstore{
+    // Real: write a to D through a vec<T>* cast (assumes D is aligned on GEN_SIMD_WIDTH bytes -- TODO confirm at call sites)
+    template <typename T>
+    inline void operator()(vec<T> a, T *D){
+      *((vec<T> *)D) = a;
+    }
+    // Integer: store the scalar
+    inline void operator()(int a, Integer *I){
+      *I = a;
+    }
+
+  };
+
+  struct Vstream{
+    // Real: same cast-and-assign as Vstore, with the (destination, value) argument order
+    template <typename T>
+    inline void operator()(T * a, vec<T> b){
+      *((vec<T> *)a) = b;
+    }
+  };
+
+  struct Vset{
+    // Complex: interleave the real/imaginary parts of W<T>::c complex numbers read from a
+    template <typename T>
+    inline vec<T> operator()(std::complex<T> *a){
+      vec<T> out;
+
+      VECTOR_FOR(i, W<T>::c, 1)
+      {
+        out.v[2*i]   = a[i].real();
+        out.v[2*i+1] = a[i].imag();
+      }
+
+      return out;
+    }
+
+    // Real: read a whole vector from a through a vec<T>* cast
+    template <typename T>
+    inline vec<T> operator()(T *a){
+      vec<T> out;
+
+      out = *((vec<T> *)a);
+
+      return out;
+    }
+
+    // Integer: load the scalar
+    inline int operator()(Integer *a){
+      return *a;
+    }
+  };
+
+  /////////////////////////////////////////////////////
+  // Arithmetic operations
+  /////////////////////////////////////////////////////
+  struct Sum{
+    // Complex/Real: element-wise sum
+    template <typename T>
+    inline vec<T> operator()(vec<T> a, vec<T> b){
+      vec<T> out;
+
+      VECTOR_FOR(i, W<T>::r, 1)
+      {
+        out.v[i] = a.v[i] + b.v[i];
+      }
+
+      return out;
+    }
+
+    // Integer
+    inline int operator()(int a, int b){
+      return a + b;
+    }
+  };
+
+  struct Sub{
+    // Complex/Real: element-wise difference
+    template <typename T>
+    inline vec<T> operator()(vec<T> a, vec<T> b){
+      vec<T> out;
+
+      VECTOR_FOR(i, W<T>::r, 1)
+      {
+        out.v[i] = a.v[i] - b.v[i];
+      }
+
+      return out;
+    }
+
+    // Integer
+    inline int operator()(int a, int b){
+      return a-b;
+    }
+  };
+
+  struct Mult{
+    // Real: element-wise product
+    template <typename T>
+    inline vec<T> operator()(vec<T> a, vec<T> b){
+      vec<T> out;
+
+      VECTOR_FOR(i, W<T>::r, 1)
+      {
+        out.v[i] = a.v[i]*b.v[i];
+      }
+
+      return out;
+    }
+
+    // Integer
+    inline int operator()(int a, int b){
+      return a*b;
+    }
+  };
+
+  #define cmul(a, b, c, i)\
+  c[i]   = a[i]*b[i]   - a[i+1]*b[i+1];\
+  c[i+1] = a[i]*b[i+1] + a[i+1]*b[i];
+
+  struct MultComplex{
+    // Complex: product of each (re, im) pair of a with the matching pair of b
+    template <typename T>
+    inline vec<T> operator()(vec<T> a, vec<T> b){
+      vec<T> out;
+
+      VECTOR_FOR(i, W<T>::c, 1)
+      {
+        cmul(a.v, b.v, out.v, 2*i);
+      }
+
+      return out;
+    }
+  };
+
+  #undef cmul
+
+  struct Div{
+    // Real: element-wise quotient
+    template <typename T>
+    inline vec<T> operator()(vec<T> a, vec<T> b){
+      vec<T> out;
+
+      VECTOR_FOR(i, W<T>::r, 1)
+      {
+        out.v[i] = a.v[i]/b.v[i];
+      }
+
+      return out;
+    }
+  };
+
+  #define conj(a, b, i)\
+  b[i]   = a[i];\
+  b[i+1] = -a[i+1];
+
+  struct Conj{
+    // Complex: negate the imaginary (odd) element of every pair
+    template <typename T>
+    inline vec<T> operator()(vec<T> a){
+      vec<T> out;
+
+      VECTOR_FOR(i, W<T>::c, 1)
+      {
+        conj(a.v, out.v, 2*i);
+      }
+
+      return out;
+    }
+  };
+
+  #undef conj
+
+  #define timesmi(a, b, i)\
+  b[i]   = a[i+1];\
+  b[i+1] = -a[i];
+
+  struct TimesMinusI{
+    // Complex: (re, im) -> (im, -re), i.e. multiplication by -i; the second argument b is not read
+    template <typename T>
+    inline vec<T> operator()(vec<T> a, vec<T> b){
+      vec<T> out;
+
+      VECTOR_FOR(i, W<T>::c, 1)
+      {
+        timesmi(a.v, out.v, 2*i);
+      }
+
+      return out;
+    }
+  };
+
+  #undef timesmi
+
+  #define timesi(a, b, i)\
+  b[i]   = -a[i+1];\
+  b[i+1] = a[i];
+
+  struct TimesI{
+    // Complex: (re, im) -> (-im, re), i.e. multiplication by +i; the second argument b is not read
+    template <typename T>
+    inline vec<T> operator()(vec<T> a, vec<T> b){
+      vec<T> out;
+
+      VECTOR_FOR(i, W<T>::c, 1)
+      {
+        timesi(a.v, out.v, 2*i);
+      }
+
+      return out;
+    }
+  };
+
+  #undef timesi
+
+  //////////////////////////////////////////////
+  // Some Template specialization: Permute<n> swaps elements whose indices differ by the bit (W<T>::r >> (n + 1))
+  #define perm(a, b, n, w)\
+  unsigned int _mask = w >> (n + 1);\
+  VECTOR_FOR(i, w, 1)\
+  {\
+    b[i] = a[i^_mask];\
+  }
+
+  #define DECL_PERMUTE_N(n)\
+  template <typename T>\
+  static inline vec<T> Permute##n(vec<T> in) {\
+    vec<T> out;\
+    perm(in.v, out.v, n, W<T>::r);\
+    return out;\
+  }
+
+  struct Permute{
+    DECL_PERMUTE_N(0);
+    DECL_PERMUTE_N(1);
+    DECL_PERMUTE_N(2);
+    DECL_PERMUTE_N(3);
+  };
+
+  #undef perm
+  #undef DECL_PERMUTE_N
+
+  #define rot(a, b, n, w)\
+  VECTOR_FOR(i, w, 1)\
+  {\
+    b[i] = a[(i + n)%w];\
+  }
+
+  struct Rotate{ // NOTE(review): n is converted to unsigned in (i + n); a negative n is only exact when W<T>::r is a power of two
+    template <typename T>
+    static inline vec<T> rotate(vec<T> in, int n){
+      vec<T> out;
+
+      rot(in.v, out.v, n, W<T>::r);
+
+      return out;
+    }
+  };
+
+  #undef rot
+
+  #define acc(v, a, off, step, n)\
+  for (unsigned int i = off; i < n; i += step)\
+  {\
+    a += v[i];\
+  }
+
+  template <typename Out_type, typename In_type>
+  struct Reduce{
+    //Need templated class to overload output type
+    //General form must generate error if compiled
+    inline Out_type operator()(In_type in){
+      printf("Error, using wrong Reduce function\n");
+      exit(1);
+      return 0;
+    }
+  };
+
+  //Complex float Reduce: sums even elements into the real part and odd elements into the imaginary part
+  template <>
+  inline Grid::ComplexF Reduce<Grid::ComplexF, vecf>::operator()(vecf in){
+    float a = 0.f, b = 0.f;
+
+    acc(in.v, a, 0, 2, W<float>::r);
+    acc(in.v, b, 1, 2, W<float>::r);
+
+    return Grid::ComplexF(a, b);
+  }
+
+  //Real float Reduce: sums all elements
+  template<>
+  inline Grid::RealF Reduce<Grid::RealF, vecf>::operator()(vecf in){
+    float a = 0.f;
+
+    acc(in.v, a, 0, 1, W<float>::r);
+
+    return a;
+  }
+
+  //Complex double Reduce: sums even elements into the real part and odd elements into the imaginary part
+  template<>
+  inline Grid::ComplexD Reduce<Grid::ComplexD, vecd>::operator()(vecd in){
+    double a = 0., b = 0.;
+
+    acc(in.v, a, 0, 2, W<double>::r);
+    acc(in.v, b, 1, 2, W<double>::r);
+
+    return Grid::ComplexD(a, b);
+  }
+
+  //Real double Reduce: sums all elements
+  template<>
+  inline Grid::RealD Reduce<Grid::RealD, vecd>::operator()(vecd in){
+    double a = 0.;
+
+    acc(in.v, a, 0, 1, W<double>::r);
+
+    return a;
+  }
+
+  //Integer Reduce: scalar pass-through
+  template<>
+  inline Integer Reduce<Integer, int>::operator()(int in){
+    return in;
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////////////////
+// Here assign types
+
+  typedef Optimization::vecf SIMD_Ftype; // Single precision type
+  typedef Optimization::vecd SIMD_Dtype; // Double precision type
+  typedef int SIMD_Itype; // Integer type
+
+  // prefetch utilities (empty bodies in the generic implementation)
+  inline void v_prefetch0(int size, const char *ptr){};
+  inline void prefetch_HINT_T0(const char *ptr){};
+
+  // Function name aliases
+  typedef Optimization::Vsplat   VsplatSIMD;
+  typedef Optimization::Vstore   VstoreSIMD;
+  typedef Optimization::Vset     VsetSIMD;
+  typedef Optimization::Vstream  VstreamSIMD;
+  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S, T>;
+
+  // Arithmetic operations
+  typedef Optimization::Sum         SumSIMD;
+  typedef Optimization::Sub         SubSIMD;
+  typedef Optimization::Div         DivSIMD;
+  typedef Optimization::Mult        MultSIMD;
+  typedef Optimization::MultComplex MultComplexSIMD;
+  typedef Optimization::Conj        ConjSIMD;
+  typedef Optimization::TimesMinusI TimesMinusISIMD;
+  typedef Optimization::TimesI      TimesISIMD;
+}
diff --git a/lib/simd/Grid_generic_256.h b/lib/simd/Grid_generic_256.h
deleted file mode 100644
index 42df6cf3..00000000
--- a/lib/simd/Grid_generic_256.h
+++ /dev/null
@@ -1,644 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source
file: ./lib/simd/Grid_generic.h - - Copyright (C) 2015 - -Author: Peter Boyle -Author: neo - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ - -#ifndef GEN_SIMD_WIDTH -#define GEN_SIMD_DCOMPLEX_WIDTH 2 -#endif - -#include "Grid_generic.h" - -namespace Grid { -namespace Optimization { - - constexpr unsigned int dcw = GEN_SIMD_DCOMPLEX_WIDTH; - constexpr unsigned int fcw = 2*dcw; - constexpr unsigned int dw = 2*dcw; - constexpr unsigned int fw = 2*fcw; - - struct vecf { - float v[fw]; - }; - - struct vecd { - double v[dw]; - }; - - struct Vsplat{ - //Complex float - inline vecf operator()(float a, float b){ - vecf out; - - for (unsigned int i = 0; i < fw; i += 2) - { - out.v[i] = a; - out.v[i+1] = b; - } - - return out; - } - - // Real float - inline vecf operator()(float a){ - vecf out; - - for (unsigned int i = 0; i < fw; ++i) - { - out.v[i] = a; - } - - return out; - } - - //Complex double - inline vecd operator()(double a, double b){ - vecd out; - - for (unsigned int i = 0; i < dw; i += 2) - { - out.v[i] = a; - out.v[i+1] = b; - } - - return out; - } - - //Real double - inline vecd operator()(double a){ - vecd out; - - for 
(unsigned int i = 0; i < dw; ++i) - { - out.v[i] = a; - } - - return out; - } - - //Integer - inline int operator()(Integer a){ - return a; - } - }; - - struct Vstore{ - //Float - inline void operator()(vecf a, float* F){ - memcpy(F,a.v,fw*sizeof(float)); - } - //Double - inline void operator()(vecd a, double* D){ - memcpy(D,a.v,dw*sizeof(double)); - } - //Integer - inline void operator()(int a, Integer* I){ - I[0] = a; - } - - }; - - struct Vstream{ - //Float - inline void operator()(float * a, vecf b){ - memcpy(a,b.v,fw*sizeof(float)); - } - //Double - inline void operator()(double * a, vecd b){ - memcpy(a,b.v,dw*sizeof(double)); - } - - - }; - - struct Vset{ - // Complex float - inline vecf operator()(Grid::ComplexF *a){ - vecf out; - - for (unsigned int i = 0; i < fcw; ++i) - { - out.v[2*i] = a[i].real(); - out.v[2*i+1] = a[i].imag(); - } - - return out; - } - - // Complex double - inline vecd operator()(Grid::ComplexD *a){ - vecd out; - - for (unsigned int i = 0; i < dcw; ++i) - { - out.v[2*i] = a[i].real(); - out.v[2*i+1] = a[i].imag(); - } - - return out; - } - - // Real float - inline vecf operator()(float *a){ - vecf out; - - memcpy(out.v,a,fw*sizeof(float)); - - return out; - } - // Real double - inline vecd operator()(double *a){ - vecd out; - - memcpy(out.v,a,dw*sizeof(float)); - - return out; - } - // Integer - inline int operator()(Integer *a){ - return a[0]; - } - - - }; - - ///////////////////////////////////////////////////// - // Arithmetic operations - ///////////////////////////////////////////////////// - struct Sum{ - //Complex/Real float - inline vecf operator()(vecf a, vecf b){ - vecf out; - - for (unsigned int i = 0; i < fw; ++i) - { - out.v[i] = a.v[i] + b.v[i]; - } - - return out; - } - - //Complex/Real double - inline vecd operator()(vecd a, vecd b){ - vecd out; - - for (unsigned int i = 0; i < dw; ++i) - { - out.v[i] = a.v[i] + b.v[i]; - } - - return out; - } - - //Integer - inline int operator()(int a, int b){ - return a + b; - } - }; 
- - struct Sub{ - //Complex/Real float - inline vecf operator()(vecf a, vecf b){ - vecf out; - - for (unsigned int i = 0; i < fw; ++i) - { - out.v[i] = a.v[i] - b.v[i]; - } - - return out; - } - - //Complex/Real double - inline vecd operator()(vecd a, vecd b){ - vecd out; - - for (unsigned int i = 0; i < dw; ++i) - { - out.v[i] = a.v[i] - b.v[i]; - } - - return out; - } - - //Integer - inline int operator()(int a, int b){ - return a-b; - } - }; - - #define cmul(a, b, c, i)\ - c[i] = a[i]*b[i] - a[i+1]*b[i+1];\ - c[i+1] = a[i]*b[i+1] + a[i+1]*b[i]; - - struct MultComplex{ - // Complex float - inline vecf operator()(vecf a, vecf b){ - vecf out; - - for (unsigned int i = 0; i < fcw; ++i) - { - cmul(a.v, b.v, out.v, 2*i); - } - - return out; - } - - // Complex double - inline vecd operator()(vecd a, vecd b){ - vecd out; - - for (unsigned int i = 0; i < dcw; ++i) - { - cmul(a.v, b.v, out.v, 2*i); - } - - return out; - } - }; - - #undef cmul - - struct Mult{ - // Real float - inline vecf operator()(vecf a, vecf b){ - vecf out; - - for (unsigned int i = 0; i < fw; ++i) - { - out.v[i] = a.v[i]*b.v[i]; - } - - return out; - } - - // Real double - inline vecd operator()(vecd a, vecd b){ - vecd out; - - for (unsigned int i = 0; i < dw; ++i) - { - out.v[i] = a.v[i]*b.v[i]; - } - - return out; - } - - // Integer - inline int operator()(int a, int b){ - return a*b; - } - }; - - struct Div{ - // Real float - inline vecf operator()(vecf a, vecf b){ - vecf out; - - for (unsigned int i = 0; i < fw; ++i) - { - out.v[i] = a.v[i]/b.v[i]; - } - - return out; - } - // Real double - inline vecd operator()(vecd a, vecd b){ - vecd out; - - for (unsigned int i = 0; i < dw; ++i) - { - out.v[i] = a.v[i]/b.v[i]; - } - - return out; - } - }; - - #define conj(a, b, i)\ - b[i] = a[i];\ - b[i+1] = -a[i+1]; - - struct Conj{ - // Complex single - inline vecf operator()(vecf in){ - vecf out; - - for (unsigned int i = 0; i < fcw; ++i) - { - conj(in.v, out.v, 2*i); - } - - return out; - } - - // Complex 
double - inline vecd operator()(vecd in){ - vecd out; - - for (unsigned int i = 0; i < dcw; ++i) - { - conj(in.v, out.v, 2*i); - } - - return out; - } - }; - - #undef conj - - #define timesmi(a, b, i)\ - b[i] = a[i+1];\ - b[i+1] = -a[i]; - - struct TimesMinusI{ - // Complex single - inline vecf operator()(vecf in, vecf ret){ - vecf out; - - for (unsigned int i = 0; i < fcw; ++i) - { - timesmi(in.v, out.v, 2*i); - } - - return out; - } - - // Complex double - inline vecd operator()(vecd in, vecd ret){ - vecd out; - - for (unsigned int i = 0; i < dcw; ++i) - { - timesmi(in.v, out.v, 2*i); - } - - return out; - } - }; - - #undef timesmi - - #define timespi(a, b, i)\ - b[i] = -a[i+1];\ - b[i+1] = a[i]; - - struct TimesI{ - // Complex single - inline vecf operator()(vecf in, vecf ret){ - vecf out; - - for (unsigned int i = 0; i < fcw; ++i) - { - timespi(in.v, out.v, 2*i); - } - - return out; - } - - // Complex double - inline vecd operator()(vecd in, vecd ret){ - vecd out; - - for (unsigned int i = 0; i < dcw; ++i) - { - timespi(in.v, out.v, 2*i); - } - - return out; - } - }; - - #undef timespi - - ////////////////////////////////////////////// - // Some Template specialization - struct Permute{ - static inline vecf Permute0(vecf in){ //AB CD -> CD AB - vecf out; - - out.v[0] = in.v[4]; - out.v[1] = in.v[5]; - out.v[2] = in.v[6]; - out.v[3] = in.v[7]; - out.v[4] = in.v[0]; - out.v[5] = in.v[1]; - out.v[6] = in.v[2]; - out.v[7] = in.v[3]; - - return out; - }; - - static inline vecf Permute1(vecf in){ //AB CD -> BA DC - vecf out; - - out.v[0] = in.v[2]; - out.v[1] = in.v[3]; - out.v[2] = in.v[0]; - out.v[3] = in.v[1]; - out.v[4] = in.v[6]; - out.v[5] = in.v[7]; - out.v[6] = in.v[4]; - out.v[7] = in.v[5]; - - return out; - }; - - static inline vecf Permute2(vecf in){ - vecf out; - - out.v[0] = in.v[1]; - out.v[1] = in.v[0]; - out.v[2] = in.v[3]; - out.v[3] = in.v[2]; - out.v[4] = in.v[5]; - out.v[5] = in.v[4]; - out.v[6] = in.v[7]; - out.v[7] = in.v[6]; - - return out; - 
}; - - static inline vecf Permute3(vecf in){ - return in; - }; - - static inline vecd Permute0(vecd in){ //AB -> BA - vecd out; - - out.v[0] = in.v[2]; - out.v[1] = in.v[3]; - out.v[2] = in.v[0]; - out.v[3] = in.v[1]; - - return out; - }; - - static inline vecd Permute1(vecd in){ - vecd out; - - out.v[0] = in.v[1]; - out.v[1] = in.v[0]; - out.v[2] = in.v[3]; - out.v[3] = in.v[2]; - - return out; - }; - - static inline vecd Permute2(vecd in){ - return in; - }; - - static inline vecd Permute3(vecd in){ - return in; - }; - - }; - - #define rot(a, b, n, w)\ - for (unsigned int i = 0; i < w; ++i)\ - {\ - b[i] = a[(i + n)%w];\ - } - - struct Rotate{ - - static inline vecf rotate(vecf in, int n){ - vecf out; - - rot(in.v, out.v, n, fw); - - return out; - } - - static inline vecd rotate(vecd in,int n){ - vecd out; - - rot(in.v, out.v, n, dw); - - return out; - } - }; - - #undef rot - - #define acc(v, a, off, step, n)\ - for (unsigned int i = off; i < n; i += step)\ - {\ - a += v[i];\ - } - - template - struct Reduce{ - //Need templated class to overload output type - //General form must generate error if compiled - inline Out_type operator()(In_type in){ - printf("Error, using wrong Reduce function\n"); - exit(1); - return 0; - } - }; - - //Complex float Reduce - template<> - inline Grid::ComplexF Reduce::operator()(vecf in){ - float a = 0.f, b = 0.f; - - acc(in.v, a, 0, 2, fw); - acc(in.v, b, 1, 2, fw); - - return Grid::ComplexF(a, b); - } - - //Real float Reduce - template<> - inline Grid::RealF Reduce::operator()(vecf in){ - float a = 0.; - - acc(in.v, a, 0, 1, fw); - - return a; - } - - //Complex double Reduce - template<> - inline Grid::ComplexD Reduce::operator()(vecd in){ - double a = 0., b = 0.; - - acc(in.v, a, 0, 2, dw); - acc(in.v, b, 1, 2, dw); - - return Grid::ComplexD(a, b); - } - - //Real double Reduce - template<> - inline Grid::RealD Reduce::operator()(vecd in){ - double a = 0.f; - - acc(in.v, a, 0, 1, dw); - - return a; - } - - //Integer Reduce - 
template<> - inline Integer Reduce::operator()(int in){ - return in; - } -} - -////////////////////////////////////////////////////////////////////////////////////// -// Here assign types - - typedef Optimization::vecf SIMD_Ftype; // Single precision type - typedef Optimization::vecd SIMD_Dtype; // Double precision type - typedef int SIMD_Itype; // Integer type - - // prefetch utilities - inline void v_prefetch0(int size, const char *ptr){}; - inline void prefetch_HINT_T0(const char *ptr){}; - - - // Function name aliases - typedef Optimization::Vsplat VsplatSIMD; - typedef Optimization::Vstore VstoreSIMD; - typedef Optimization::Vset VsetSIMD; - typedef Optimization::Vstream VstreamSIMD; - template using ReduceSIMD = Optimization::Reduce; - - // Arithmetic operations - typedef Optimization::Sum SumSIMD; - typedef Optimization::Sub SubSIMD; - typedef Optimization::Div DivSIMD; - typedef Optimization::Mult MultSIMD; - typedef Optimization::MultComplex MultComplexSIMD; - typedef Optimization::Conj ConjSIMD; - typedef Optimization::TimesMinusI TimesMinusISIMD; - typedef Optimization::TimesI TimesISIMD; - -} diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 9b9ad18b..184baad9 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -38,8 +38,8 @@ directory #ifndef GRID_VECTOR_TYPES #define GRID_VECTOR_TYPES -#ifdef GEN256 -#include "Grid_generic_256.h" +#ifdef GEN +#include "Grid_generic.h" #endif #ifdef SSE4 #include "Grid_sse4.h"