From 042ae5b87c0534348984e87a55c3cdbced132405 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Tue, 15 Nov 2016 12:15:33 +0000 Subject: [PATCH] generic 256bits SIMD --- benchmarks/simple_su3_test.cc | 2 +- configure.ac | 8 +- lib/simd/Grid_generic.h | 496 -------------------------- lib/simd/Grid_generic_256.h | 644 ++++++++++++++++++++++++++++++++++ lib/simd/Grid_vector_types.h | 4 +- 5 files changed, 651 insertions(+), 503 deletions(-) create mode 100644 lib/simd/Grid_generic_256.h diff --git a/benchmarks/simple_su3_test.cc b/benchmarks/simple_su3_test.cc index e5cca98d..79437c23 100644 --- a/benchmarks/simple_su3_test.cc +++ b/benchmarks/simple_su3_test.cc @@ -25,7 +25,7 @@ Author: Peter Boyle See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ -#include +#include using namespace std; using namespace Grid; diff --git a/configure.ac b/configure.ac index 9178a11d..fb80b1ae 100644 --- a/configure.ac +++ b/configure.ac @@ -179,8 +179,8 @@ case ${ax_cv_cxx_compiler_vendor} in KNL) AC_DEFINE([AVX512],[1],[AVX512 intrinsics]) SIMD_FLAGS='-march=knl';; - GEN) - AC_DEFINE([GENERIC_VEC],[1],[generic vector code]) + GEN256) + AC_DEFINE([GEN256],[1],[generic vector code]) SIMD_FLAGS='';; QPX|BGQ) AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q]) @@ -211,8 +211,8 @@ case ${ax_cv_cxx_compiler_vendor} in KNL) AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing]) SIMD_FLAGS='-xmic-avx512';; - GEN) - AC_DEFINE([GENERIC_VEC],[1],[generic vector code]) + GEN256) + AC_DEFINE([GEN256],[1],[generic vector code]) SIMD_FLAGS='';; *) AC_MSG_ERROR(["SIMD option ${ac_SIMD} not supported by the Intel compiler"]);; diff --git a/lib/simd/Grid_generic.h b/lib/simd/Grid_generic.h index ca0029ef..e69de29b 100644 --- a/lib/simd/Grid_generic.h +++ b/lib/simd/Grid_generic.h @@ -1,496 +0,0 @@ - /************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: ./lib/simd/Grid_generic.h - - Copyright (C) 2015 - -Author: Peter Boyle -Author: neo - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ - -namespace Grid { -namespace Optimization { - - template - union uconv { - float f; - vtype v; - }; - - union u128f { - float v; - float f[4]; - }; - union u128d { - double v; - double f[2]; - }; - - struct Vsplat{ - //Complex float - inline u128f operator()(float a, float b){ - u128f out; - out.f[0] = a; - out.f[1] = b; - out.f[2] = a; - out.f[3] = b; - return out; - } - // Real float - inline u128f operator()(float a){ - u128f out; - out.f[0] = a; - out.f[1] = a; - out.f[2] = a; - out.f[3] = a; - return out; - } - //Complex double - inline u128d operator()(double a, double b){ - u128d out; - out.f[0] = a; - out.f[1] = b; - return out; - } - //Real double - inline u128d operator()(double a){ - u128d out; - out.f[0] = a; - out.f[1] = a; - return out; - } - //Integer - inline int operator()(Integer a){ - return a; - } - }; - - struct Vstore{ - //Float - inline void operator()(u128f a, float* F){ - memcpy(F,a.f,4*sizeof(float)); - } - //Double - inline void operator()(u128d a, double* D){ - memcpy(D,a.f,2*sizeof(double)); - } - //Integer - inline void operator()(int a, Integer* I){ - I[0] = a; - } - - }; - - struct Vstream{ - //Float - inline void operator()(float * a, u128f b){ - memcpy(a,b.f,4*sizeof(float)); - } - //Double - inline void operator()(double * a, u128d b){ - memcpy(a,b.f,2*sizeof(double)); - } - - - }; - - struct Vset{ - // Complex float - inline u128f operator()(Grid::ComplexF *a){ - u128f out; - out.f[0] = a[0].real(); - out.f[1] = a[0].imag(); - out.f[2] = a[1].real(); - out.f[3] = a[1].imag(); - return out; - } - // Complex double - inline u128d operator()(Grid::ComplexD *a){ - u128d out; - out.f[0] = a[0].real(); - out.f[1] = a[0].imag(); - return out; - } - // Real float - inline u128f operator()(float *a){ - u128f out; - out.f[0] = a[0]; - out.f[1] = a[1]; - out.f[2] = a[2]; - out.f[3] = a[3]; - return out; - } - // Real double - inline u128d operator()(double *a){ - u128d out; - out.f[0] = a[0]; - out.f[1] = a[1]; - return out; - } - // Integer - inline int operator()(Integer *a){ - return a[0]; - } - - - }; - - template - struct Reduce{ - //Need templated class to overload output type - //General form must generate error if compiled - inline Out_type operator()(In_type in){ - printf("Error, using wrong Reduce function\n"); - exit(1); - return 0; - } - }; - - ///////////////////////////////////////////////////// - // Arithmetic operations - ///////////////////////////////////////////////////// - struct Sum{ - //Complex/Real float - inline u128f operator()(u128f a, u128f b){ - u128f out; - out.f[0] = a.f[0] + b.f[0]; - out.f[1] = a.f[1] + b.f[1]; - out.f[2] = a.f[2] + b.f[2]; - out.f[3] = a.f[3] + b.f[3]; - return out; - } - //Complex/Real double - inline u128d operator()(u128d a, u128d b){ - u128d out; - out.f[0] = a.f[0] + b.f[0]; - out.f[1] = a.f[1] + b.f[1]; - return out; - } - //Integer - inline int operator()(int a, int b){ - return a + b; - } - }; - - struct Sub{ - //Complex/Real float - inline u128f operator()(u128f a, u128f b){ - u128f out; - out.f[0] = a.f[0] - b.f[0]; - out.f[1] = a.f[1] - b.f[1]; - out.f[2] = a.f[2] - b.f[2]; - out.f[3] = a.f[3] - b.f[3]; - return out; - } - //Complex/Real double - inline u128d operator()(u128d a, u128d b){ - u128d out; - out.f[0] = a.f[0] - b.f[0]; - out.f[1] = a.f[1] - b.f[1]; - return out; - } - //Integer - inline int operator()(int a, int b){ - return a-b; - } - }; - - struct MultComplex{ - // Complex float - inline u128f operator()(u128f a, u128f b){ - u128f out; - out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1]; - out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0]; - out.f[2] = a.f[2]*b.f[2] - a.f[3]*b.f[3]; - out.f[3] = a.f[2]*b.f[3] + a.f[3]*b.f[2]; - return out; - } - // Complex double - inline u128d operator()(u128d a, u128d b){ - u128d out; - out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1]; - out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0]; - return out; - } - }; - - struct Mult{ - //CK: Appear unneeded - // inline float mac(float a, float b,double c){ - // return 0; - // } - // inline double mac(double a, double b,double c){ - // return 0; - // } - - // Real float - inline u128f operator()(u128f a, u128f b){ - u128f out; - out.f[0] = a.f[0]*b.f[0]; - out.f[1] = a.f[1]*b.f[1]; - out.f[2] = a.f[2]*b.f[2]; - out.f[3] = a.f[3]*b.f[3]; - return out; - } - // Real double - inline u128d operator()(u128d a, u128d b){ - u128d out; - out.f[0] = a.f[0]*b.f[0]; - out.f[1] = a.f[1]*b.f[1]; - return out; - } - // Integer - inline int operator()(int a, int b){ - return a*b; - } - }; - - struct Conj{ - // Complex single - inline u128f operator()(u128f in){ - u128f out; - out.f[0] = in.f[0]; - out.f[1] = -in.f[1]; - out.f[2] = in.f[2]; - out.f[3] = -in.f[3]; - return out; - } - // Complex double - inline u128d operator()(u128d in){ - u128d out; - out.f[0] = in.f[0]; - out.f[1] = -in.f[1]; - return out; - } - // do not define for integer input - }; - - struct TimesMinusI{ - //Complex single - inline u128f operator()(u128f in, u128f ret){ //note ret is ignored - u128f out; - out.f[0] = in.f[1]; - out.f[1] = -in.f[0]; - out.f[2] = in.f[3]; - out.f[3] = -in.f[2]; - return out; - } - //Complex double - inline u128d operator()(u128d in, u128d ret){ - u128d out; - out.f[0] = in.f[1]; - out.f[1] = -in.f[0]; - return out; - } - }; - - struct TimesI{ - //Complex single - inline u128f operator()(u128f in, u128f ret){ //note ret is ignored - u128f out; - out.f[0] = -in.f[1]; - out.f[1] = in.f[0]; - out.f[2] = -in.f[3]; - out.f[3] = in.f[2]; - return out; - } - //Complex double - inline u128d operator()(u128d in, u128d ret){ - u128d out; - out.f[0] = -in.f[1]; - out.f[1] = in.f[0]; - return out; - } - }; - - ////////////////////////////////////////////// - // Some Template specialization - struct Permute{ - //We just have to mirror the permutes of Grid_sse4.h - static inline u128f Permute0(u128f in){ //AB CD -> CD AB - u128f out; - out.f[0] = in.f[2]; - out.f[1] = in.f[3]; - out.f[2] = in.f[0]; - out.f[3] = in.f[1]; - return out; - }; - static inline u128f Permute1(u128f in){ //AB CD -> BA DC - u128f out; - out.f[0] = in.f[1]; - out.f[1] = in.f[0]; - out.f[2] = in.f[3]; - out.f[3] = in.f[2]; - return out; - }; - static inline u128f Permute2(u128f in){ - return in; - }; - static inline u128f Permute3(u128f in){ - return in; - }; - - static inline u128d Permute0(u128d in){ //AB -> BA - u128d out; - out.f[0] = in.f[1]; - out.f[1] = in.f[0]; - return out; - }; - static inline u128d Permute1(u128d in){ - return in; - }; - static inline u128d Permute2(u128d in){ - return in; - }; - static inline u128d Permute3(u128d in){ - return in; - }; - - }; - - template < typename vtype > - void permute(vtype &a, vtype b, int perm) { - }; - - struct Rotate{ - - static inline u128f rotate(u128f in,int n){ - u128f out; - switch(n){ - case 0: - out.f[0] = in.f[0]; - out.f[1] = in.f[1]; - out.f[2] = in.f[2]; - out.f[3] = in.f[3]; - break; - case 1: - out.f[0] = in.f[1]; - out.f[1] = in.f[2]; - out.f[2] = in.f[3]; - out.f[3] = in.f[0]; - break; - case 2: - out.f[0] = in.f[2]; - out.f[1] = in.f[3]; - out.f[2] = in.f[0]; - out.f[3] = in.f[1]; - break; - case 3: - out.f[0] = in.f[3]; - out.f[1] = in.f[0]; - out.f[2] = in.f[1]; - out.f[3] = in.f[2]; - break; - default: assert(0); - } - return out; - } - static inline u128d rotate(u128d in,int n){ - u128d out; - switch(n){ - case 0: - out.f[0] = in.f[0]; - out.f[1] = in.f[1]; - break; - case 1: - out.f[0] = in.f[1]; - out.f[1] = in.f[0]; - break; - default: assert(0); - } - return out; - } - }; - - //Complex float Reduce - template<> - inline Grid::ComplexF Reduce::operator()(u128f in){ //2 complex - return Grid::ComplexF(in.f[0] + in.f[2], in.f[1] + in.f[3]); - } - //Real float Reduce - template<> - inline Grid::RealF Reduce::operator()(u128f in){ //4 floats - return in.f[0] + in.f[1] + in.f[2] + in.f[3]; - } - - - //Complex double Reduce - template<> - inline Grid::ComplexD Reduce::operator()(u128d in){ //1 complex - return Grid::ComplexD(in.f[0],in.f[1]); - } - - //Real double Reduce - template<> - inline Grid::RealD Reduce::operator()(u128d in){ //2 doubles - return in.f[0] + in.f[1]; - } - - //Integer Reduce - template<> - inline Integer Reduce::operator()(int in){ - // FIXME unimplemented - printf("Reduce : Missing integer implementation -> FIX\n"); - assert(0); - } -} - -////////////////////////////////////////////////////////////////////////////////////// -// Here assign types - - typedef Optimization::u128f SIMD_Ftype; // Single precision type - typedef Optimization::u128d SIMD_Dtype; // Double precision type - typedef int SIMD_Itype; // Integer type - - // prefetch utilities - inline void v_prefetch0(int size, const char *ptr){}; - inline void prefetch_HINT_T0(const char *ptr){}; - - - - // Gpermute function - template < typename VectorSIMD > - inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) { - Optimization::permute(y.v,b.v,perm); - } - - - // Function name aliases - typedef Optimization::Vsplat VsplatSIMD; - typedef Optimization::Vstore VstoreSIMD; - typedef Optimization::Vset VsetSIMD; - typedef Optimization::Vstream VstreamSIMD; - template using ReduceSIMD = Optimization::Reduce; - - - - - // Arithmetic operations - typedef Optimization::Sum SumSIMD; - typedef Optimization::Sub SubSIMD; - typedef Optimization::Mult MultSIMD; - typedef Optimization::MultComplex MultComplexSIMD; - typedef Optimization::Conj ConjSIMD; - typedef Optimization::TimesMinusI TimesMinusISIMD; - typedef Optimization::TimesI TimesISIMD; - -} diff --git a/lib/simd/Grid_generic_256.h b/lib/simd/Grid_generic_256.h new file mode 100644 index 00000000..42df6cf3 --- /dev/null +++ b/lib/simd/Grid_generic_256.h @@ -0,0 +1,644 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/simd/Grid_generic.h + + Copyright (C) 2015 + +Author: Peter Boyle +Author: neo + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ + +#ifndef GEN_SIMD_WIDTH +#define GEN_SIMD_DCOMPLEX_WIDTH 2 +#endif + +#include "Grid_generic.h" + +namespace Grid { +namespace Optimization { + + constexpr unsigned int dcw = GEN_SIMD_DCOMPLEX_WIDTH; + constexpr unsigned int fcw = 2*dcw; + constexpr unsigned int dw = 2*dcw; + constexpr unsigned int fw = 2*fcw; + + struct vecf { + float v[fw]; + }; + + struct vecd { + double v[dw]; + }; + + struct Vsplat{ + //Complex float + inline vecf operator()(float a, float b){ + vecf out; + + for (unsigned int i = 0; i < fw; i += 2) + { + out.v[i] = a; + out.v[i+1] = b; + } + + return out; + } + + // Real float + inline vecf operator()(float a){ + vecf out; + + for (unsigned int i = 0; i < fw; ++i) + { + out.v[i] = a; + } + + return out; + } + + //Complex double + inline vecd operator()(double a, double b){ + vecd out; + + for (unsigned int i = 0; i < dw; i += 2) + { + out.v[i] = a; + out.v[i+1] = b; + } + + return out; + } + + //Real double + inline vecd operator()(double a){ + vecd out; + + for (unsigned int i = 0; i < dw; ++i) + { + out.v[i] = a; + } + + return out; + } + + //Integer + inline int operator()(Integer a){ + return a; + } + }; + + struct Vstore{ + //Float + inline void operator()(vecf a, float* F){ + memcpy(F,a.v,fw*sizeof(float)); + } + //Double + inline void operator()(vecd a, double* D){ + memcpy(D,a.v,dw*sizeof(double)); + } + //Integer + inline void operator()(int a, Integer* I){ + I[0] = a; + } + + }; + + struct Vstream{ + //Float + inline void operator()(float * a, vecf b){ + memcpy(a,b.v,fw*sizeof(float)); + } + //Double + inline void operator()(double * a, vecd b){ + memcpy(a,b.v,dw*sizeof(double)); + } + + + }; + + struct Vset{ + // Complex float + inline vecf operator()(Grid::ComplexF *a){ + vecf out; + + for (unsigned int i = 0; i < fcw; ++i) + { + out.v[2*i] = a[i].real(); + out.v[2*i+1] = a[i].imag(); + } + + return out; + } + + // Complex double + inline vecd operator()(Grid::ComplexD *a){ + vecd out; + + for (unsigned int i = 0; i < dcw; ++i) + { + out.v[2*i] = a[i].real(); + out.v[2*i+1] = a[i].imag(); + } + + return out; + } + + // Real float + inline vecf operator()(float *a){ + vecf out; + + memcpy(out.v,a,fw*sizeof(float)); + + return out; + } + // Real double + inline vecd operator()(double *a){ + vecd out; + + memcpy(out.v,a,dw*sizeof(float)); + + return out; + } + // Integer + inline int operator()(Integer *a){ + return a[0]; + } + + + }; + + ///////////////////////////////////////////////////// + // Arithmetic operations + ///////////////////////////////////////////////////// + struct Sum{ + //Complex/Real float + inline vecf operator()(vecf a, vecf b){ + vecf out; + + for (unsigned int i = 0; i < fw; ++i) + { + out.v[i] = a.v[i] + b.v[i]; + } + + return out; + } + + //Complex/Real double + inline vecd operator()(vecd a, vecd b){ + vecd out; + + for (unsigned int i = 0; i < dw; ++i) + { + out.v[i] = a.v[i] + b.v[i]; + } + + return out; + } + + //Integer + inline int operator()(int a, int b){ + return a + b; + } + }; + + struct Sub{ + //Complex/Real float + inline vecf operator()(vecf a, vecf b){ + vecf out; + + for (unsigned int i = 0; i < fw; ++i) + { + out.v[i] = a.v[i] - b.v[i]; + } + + return out; + } + + //Complex/Real double + inline vecd operator()(vecd a, vecd b){ + vecd out; + + for (unsigned int i = 0; i < dw; ++i) + { + out.v[i] = a.v[i] - b.v[i]; + } + + return out; + } + + //Integer + inline int operator()(int a, int b){ + return a-b; + } + }; + + #define cmul(a, b, c, i)\ + c[i] = a[i]*b[i] - a[i+1]*b[i+1];\ + c[i+1] = a[i]*b[i+1] + a[i+1]*b[i]; + + struct MultComplex{ + // Complex float + inline vecf operator()(vecf a, vecf b){ + vecf out; + + for (unsigned int i = 0; i < fcw; ++i) + { + cmul(a.v, b.v, out.v, 2*i); + } + + return out; + } + + // Complex double + inline vecd operator()(vecd a, vecd b){ + vecd out; + + for (unsigned int i = 0; i < dcw; ++i) + { + cmul(a.v, b.v, out.v, 2*i); + } + + return out; + } + }; + + #undef cmul + + struct Mult{ + // Real float + inline vecf operator()(vecf a, vecf b){ + vecf out; + + for (unsigned int i = 0; i < fw; ++i) + { + out.v[i] = a.v[i]*b.v[i]; + } + + return out; + } + + // Real double + inline vecd operator()(vecd a, vecd b){ + vecd out; + + for (unsigned int i = 0; i < dw; ++i) + { + out.v[i] = a.v[i]*b.v[i]; + } + + return out; + } + + // Integer + inline int operator()(int a, int b){ + return a*b; + } + }; + + struct Div{ + // Real float + inline vecf operator()(vecf a, vecf b){ + vecf out; + + for (unsigned int i = 0; i < fw; ++i) + { + out.v[i] = a.v[i]/b.v[i]; + } + + return out; + } + // Real double + inline vecd operator()(vecd a, vecd b){ + vecd out; + + for (unsigned int i = 0; i < dw; ++i) + { + out.v[i] = a.v[i]/b.v[i]; + } + + return out; + } + }; + + #define conj(a, b, i)\ + b[i] = a[i];\ + b[i+1] = -a[i+1]; + + struct Conj{ + // Complex single + inline vecf operator()(vecf in){ + vecf out; + + for (unsigned int i = 0; i < fcw; ++i) + { + conj(in.v, out.v, 2*i); + } + + return out; + } + + // Complex double + inline vecd operator()(vecd in){ + vecd out; + + for (unsigned int i = 0; i < dcw; ++i) + { + conj(in.v, out.v, 2*i); + } + + return out; + } + }; + + #undef conj + + #define timesmi(a, b, i)\ + b[i] = a[i+1];\ + b[i+1] = -a[i]; + + struct TimesMinusI{ + // Complex single + inline vecf operator()(vecf in, vecf ret){ + vecf out; + + for (unsigned int i = 0; i < fcw; ++i) + { + timesmi(in.v, out.v, 2*i); + } + + return out; + } + + // Complex double + inline vecd operator()(vecd in, vecd ret){ + vecd out; + + for (unsigned int i = 0; i < dcw; ++i) + { + timesmi(in.v, out.v, 2*i); + } + + return out; + } + }; + + #undef timesmi + + #define timespi(a, b, i)\ + b[i] = -a[i+1];\ + b[i+1] = a[i]; + + struct TimesI{ + // Complex single + inline vecf operator()(vecf in, vecf ret){ + vecf out; + + for (unsigned int i = 0; i < fcw; ++i) + { + timespi(in.v, out.v, 2*i); + } + + return out; + } + + // Complex double + inline vecd operator()(vecd in, vecd ret){ + vecd out; + + for (unsigned int i = 0; i < dcw; ++i) + { + timespi(in.v, out.v, 2*i); + } + + return out; + } + }; + + #undef timespi + + ////////////////////////////////////////////// + // Some Template specialization + struct Permute{ + static inline vecf Permute0(vecf in){ //AB CD -> CD AB + vecf out; + + out.v[0] = in.v[4]; + out.v[1] = in.v[5]; + out.v[2] = in.v[6]; + out.v[3] = in.v[7]; + out.v[4] = in.v[0]; + out.v[5] = in.v[1]; + out.v[6] = in.v[2]; + out.v[7] = in.v[3]; + + return out; + }; + + static inline vecf Permute1(vecf in){ //AB CD -> BA DC + vecf out; + + out.v[0] = in.v[2]; + out.v[1] = in.v[3]; + out.v[2] = in.v[0]; + out.v[3] = in.v[1]; + out.v[4] = in.v[6]; + out.v[5] = in.v[7]; + out.v[6] = in.v[4]; + out.v[7] = in.v[5]; + + return out; + }; + + static inline vecf Permute2(vecf in){ + vecf out; + + out.v[0] = in.v[1]; + out.v[1] = in.v[0]; + out.v[2] = in.v[3]; + out.v[3] = in.v[2]; + out.v[4] = in.v[5]; + out.v[5] = in.v[4]; + out.v[6] = in.v[7]; + out.v[7] = in.v[6]; + + return out; + }; + + static inline vecf Permute3(vecf in){ + return in; + }; + + static inline vecd Permute0(vecd in){ //AB -> BA + vecd out; + + out.v[0] = in.v[2]; + out.v[1] = in.v[3]; + out.v[2] = in.v[0]; + out.v[3] = in.v[1]; + + return out; + }; + + static inline vecd Permute1(vecd in){ + vecd out; + + out.v[0] = in.v[1]; + out.v[1] = in.v[0]; + out.v[2] = in.v[3]; + out.v[3] = in.v[2]; + + return out; + }; + + static inline vecd Permute2(vecd in){ + return in; + }; + + static inline vecd Permute3(vecd in){ + return in; + }; + + }; + + #define rot(a, b, n, w)\ + for (unsigned int i = 0; i < w; ++i)\ + {\ + b[i] = a[(i + n)%w];\ + } + + struct Rotate{ + + static inline vecf rotate(vecf in, int n){ + vecf out; + + rot(in.v, out.v, n, fw); + + return out; + } + + static inline vecd rotate(vecd in,int n){ + vecd out; + + rot(in.v, out.v, n, dw); + + return out; + } + }; + + #undef rot + + #define acc(v, a, off, step, n)\ + for (unsigned int i = off; i < n; i += step)\ + {\ + a += v[i];\ + } + + template + struct Reduce{ + //Need templated class to overload output type + //General form must generate error if compiled + inline Out_type operator()(In_type in){ + printf("Error, using wrong Reduce function\n"); + exit(1); + return 0; + } + }; + + //Complex float Reduce + template<> + inline Grid::ComplexF Reduce::operator()(vecf in){ + float a = 0.f, b = 0.f; + + acc(in.v, a, 0, 2, fw); + acc(in.v, b, 1, 2, fw); + + return Grid::ComplexF(a, b); + } + + //Real float Reduce + template<> + inline Grid::RealF Reduce::operator()(vecf in){ + float a = 0.; + + acc(in.v, a, 0, 1, fw); + + return a; + } + + //Complex double Reduce + template<> + inline Grid::ComplexD Reduce::operator()(vecd in){ + double a = 0., b = 0.; + + acc(in.v, a, 0, 2, dw); + acc(in.v, b, 1, 2, dw); + + return Grid::ComplexD(a, b); + } + + //Real double Reduce + template<> + inline Grid::RealD Reduce::operator()(vecd in){ + double a = 0.f; + + acc(in.v, a, 0, 1, dw); + + return a; + } + + //Integer Reduce + template<> + inline Integer Reduce::operator()(int in){ + return in; + } +} + +////////////////////////////////////////////////////////////////////////////////////// +// Here assign types + + typedef Optimization::vecf SIMD_Ftype; // Single precision type + typedef Optimization::vecd SIMD_Dtype; // Double precision type + typedef int SIMD_Itype; // Integer type + + // prefetch utilities + inline void v_prefetch0(int size, const char *ptr){}; + inline void prefetch_HINT_T0(const char *ptr){}; + + + // Function name aliases + typedef Optimization::Vsplat VsplatSIMD; + typedef Optimization::Vstore VstoreSIMD; + typedef Optimization::Vset VsetSIMD; + typedef Optimization::Vstream VstreamSIMD; + template using ReduceSIMD = Optimization::Reduce; + + // Arithmetic operations + typedef Optimization::Sum SumSIMD; + typedef Optimization::Sub SubSIMD; + typedef Optimization::Div DivSIMD; + typedef Optimization::Mult MultSIMD; + typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::Conj ConjSIMD; + typedef Optimization::TimesMinusI TimesMinusISIMD; + typedef Optimization::TimesI TimesISIMD; + +} diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index e1592eef..9b9ad18b 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -38,8 +38,8 @@ directory #ifndef GRID_VECTOR_TYPES #define GRID_VECTOR_TYPES -#ifdef GENERIC_VEC -#include "Grid_generic.h" +#ifdef GEN256 +#include "Grid_generic_256.h" #endif #ifdef SSE4 #include "Grid_sse4.h"