diff --git a/Grid/simd/Grid_doubled_vector.h b/Grid/simd/Grid_doubled_vector.h
new file mode 100644
index 00000000..ee604750
--- /dev/null
+++ b/Grid/simd/Grid_doubled_vector.h
@@ -0,0 +1,666 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/simd/Grid_vector_types.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+template <class Scalar_type, class Vector_type>
+class Grid_simd2 {
+public:
+  typedef typename RealPart<Scalar_type>::type Real;
+  typedef Vector_type vector_type;
+  typedef Scalar_type scalar_type;
+
+  typedef union conv_t_union {
+    Vector_type v;
+    Scalar_type s[sizeof(Vector_type) / sizeof(Scalar_type)];
+    accelerator_inline conv_t_union(){};
+  } conv_t;
+
+  static constexpr int nvec=2;
+  Vector_type v[nvec];
+
+  static accelerator_inline constexpr int Nsimd(void) {
+    static_assert( (sizeof(Vector_type) / sizeof(Scalar_type) >= 1), " size mismatch " );
+
+    return nvec*sizeof(Vector_type) / sizeof(Scalar_type);
+  }
+
+  accelerator_inline Grid_simd2 &operator=(const Grid_simd2 &&rhs) {
+    for(int n=0;n<nvec;n++) v[n] = rhs.v[n];
+    return *this;
+  };
+  accelerator_inline Grid_simd2 &operator=(const Grid_simd2 &rhs) {
+    for(int n=0;n<nvec;n++) v[n] = rhs.v[n];
+    return *this;
+  };
+
+  accelerator Grid_simd2() = default;
+  accelerator_inline Grid_simd2(const Grid_simd2 &rhs)  { for(int n=0;n<nvec;n++) v[n] = rhs.v[n]; };
+  accelerator_inline Grid_simd2(const Grid_simd2 &&rhs) { for(int n=0;n<nvec;n++) v[n] = rhs.v[n]; };
+  accelerator_inline Grid_simd2(const Real a) { vsplat(*this, Scalar_type(a)); };
+  // Enable if complex type
+  template <typename S = Scalar_type> accelerator_inline
+  Grid_simd2(const typename std::enable_if<is_complex<S>::value, S>::type a) {
+    vsplat(*this, a);
+  };
+
+  /////////////////////////////
+  // Constructors
+  /////////////////////////////
+  accelerator_inline Grid_simd2 & operator=(const Zero &z) {
+    vzero(*this);
+    return (*this);
+  }
+
+  ///////////////////////////////////////////////
+  // mac, mult, sub, add, adj
+  ///////////////////////////////////////////////
+
+  friend accelerator_inline void mac(Grid_simd2 *__restrict__ y,
+                                     const Grid_simd2 *__restrict__ a,
+                                     const Grid_simd2 *__restrict__ x) {
+    *y = (*a) * (*x) + (*y);
+  };
+
+  friend accelerator_inline void mult(Grid_simd2 *__restrict__ y,
+                                      const Grid_simd2 *__restrict__ l,
+                                      const Grid_simd2 *__restrict__ r) {
+    *y = (*l) * (*r);
+  }
+
+  friend accelerator_inline void sub(Grid_simd2 *__restrict__ y,
+                                     const Grid_simd2 *__restrict__ l,
+                                     const Grid_simd2 *__restrict__ r) {
+    *y = (*l) - (*r);
+  }
+  friend accelerator_inline void add(Grid_simd2 *__restrict__ y,
+                                     const Grid_simd2 *__restrict__ l,
+                                     const Grid_simd2 *__restrict__ r) {
+    *y = (*l) + (*r);
+  }
+  friend accelerator_inline void mac(Grid_simd2 *__restrict__ y,
+                                     const Scalar_type *__restrict__ a,
+                                     const Grid_simd2 *__restrict__ x) {
+    *y = (*a) * (*x) + (*y);
+  };
+  friend accelerator_inline void mult(Grid_simd2 *__restrict__ y,
+                                      const Scalar_type *__restrict__ l,
+                                      const Grid_simd2 *__restrict__ r) {
+    *y = (*l) * (*r);
+  }
+  friend accelerator_inline void sub(Grid_simd2 *__restrict__ y,
+                                     const Scalar_type *__restrict__ l,
+                                     const Grid_simd2 *__restrict__ r) {
+    *y = (*l) - (*r);
+  }
+  friend accelerator_inline void add(Grid_simd2 *__restrict__ y,
+                                     const Scalar_type *__restrict__ l,
+                                     const Grid_simd2 *__restrict__ r) {
+    *y = (*l) + (*r);
+  }
+
+  friend accelerator_inline void mac(Grid_simd2 *__restrict__ y,
+                                     const Grid_simd2 *__restrict__ a,
+                                     const Scalar_type *__restrict__ x) {
+    *y = (*a) * (*x) + (*y);
+  };
+  friend accelerator_inline void mult(Grid_simd2 *__restrict__ y,
+                                      const Grid_simd2 *__restrict__ l,
+                                      const Scalar_type *__restrict__ r) {
+    *y = (*l) * (*r);
+  }
+  friend accelerator_inline void sub(Grid_simd2 *__restrict__ y,
+                                     const Grid_simd2 *__restrict__ l,
+                                     const Scalar_type *__restrict__ r) {
+    *y = (*l) - (*r);
+  }
+  friend accelerator_inline void add(Grid_simd2 *__restrict__ y,
+                                     const Grid_simd2 *__restrict__ l,
+                                     const Scalar_type *__restrict__ r) {
+    *y = (*l) + (*r);
+  }
+
+  ////////////////////////////////////////////////////////////////////////
+  // FIXME:  gonna remove these load/store, get, set, prefetch
+  ////////////////////////////////////////////////////////////////////////
+  friend accelerator_inline void vset(Grid_simd2 &ret, Scalar_type *a) {
+    for(int n=0;n<nvec;n++) vset(ret.v[n],a+n*vector_type::Nsimd());
+  }
+
+  template <class functor>
+  friend accelerator_inline Grid_simd2 SimdApply(const functor &func, const Grid_simd2 &v) {
+    Grid_simd2 ret;
+    for(int n=0;n<nvec;n++) ret.v[n] = SimdApply(func,v.v[n]);
+    return ret;
+  }
+  template <class functor>
+  friend accelerator_inline Grid_simd2 SimdApplyBinop(const functor &func,
+                                                      const Grid_simd2 &x,
+                                                      const Grid_simd2 &y) {
+    Grid_simd2 ret;
+    for(int n=0;n<nvec;n++) ret.v[n] = SimdApplyBinop(func,x.v[n],y.v[n]);
+    return ret;
+  }
+  ///////////////////////
+  // Exchange
+  // Al Ah , Bl Bh -> Al Bl Ah,Bh
+  ///////////////////////
+  friend accelerator_inline void exchange0(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2){
+    out1.v[0] = in1.v[0];
+    out1.v[1] = in2.v[0];
+    out2.v[0] = in1.v[1];
+    out2.v[1] = in2.v[1];
+  }
+  friend accelerator_inline void exchange1(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2){
+    exchange0(out1.v[0],out2.v[0],in1.v[0],in2.v[0]);
+    exchange0(out1.v[1],out2.v[1],in1.v[1],in2.v[1]);
+  }
+  friend accelerator_inline void exchange2(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2){
+    exchange1(out1.v[0],out2.v[0],in1.v[0],in2.v[0]);
+    exchange1(out1.v[1],out2.v[1],in1.v[1],in2.v[1]);
+  }
+  friend accelerator_inline void exchange3(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2){
+    exchange2(out1.v[0],out2.v[0],in1.v[0],in2.v[0]);
+    exchange2(out1.v[1],out2.v[1],in1.v[1],in2.v[1]);
+  }
+  friend accelerator_inline void exchange4(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2){
+    exchange3(out1.v[0],out2.v[0],in1.v[0],in2.v[0]);
+    exchange3(out1.v[1],out2.v[1],in1.v[1],in2.v[1]);
+  }
+  friend accelerator_inline void exchange(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2,int n)
+  {
+    if (n==3) {
+      exchange3(out1,out2,in1,in2);
+    } else if(n==2) {
+      exchange2(out1,out2,in1,in2);
+    } else if(n==1) {
+      exchange1(out1,out2,in1,in2);
+    } else if(n==0) {
+      exchange0(out1,out2,in1,in2);
+    }
+  }
+  ////////////////////////////////////////////////////////////////////
+  // General permute; assumes vector length is same across
+  // all subtypes; may not be a good assumption, but could
+  // add the vector width as a template param for BG/Q for example
+  ////////////////////////////////////////////////////////////////////
+  friend accelerator_inline void permute0(Grid_simd2 &y, Grid_simd2 b) {
+    y.v[0]=b.v[1];
+    y.v[1]=b.v[0];
+  }
+  friend accelerator_inline void permute1(Grid_simd2 &y, Grid_simd2 b) {
+    permute0(y.v[0],b.v[0]);
+    permute0(y.v[1],b.v[1]);
+  }
+  friend accelerator_inline void permute2(Grid_simd2 &y, Grid_simd2 b) {
+    permute1(y.v[0],b.v[0]);
+    permute1(y.v[1],b.v[1]);
+  }
+  friend accelerator_inline void permute3(Grid_simd2 &y, Grid_simd2 b) {
+    permute2(y.v[0],b.v[0]);
+    permute2(y.v[1],b.v[1]);
+  }
+  friend accelerator_inline void permute4(Grid_simd2 &y, Grid_simd2 b) {
+    permute3(y.v[0],b.v[0]);
+    permute3(y.v[1],b.v[1]);
+  }
+  friend accelerator_inline void permute(Grid_simd2 &y, Grid_simd2 b, int perm) {
+    if(perm==3) permute3(y, b);
+    else if(perm==2) permute2(y, b);
+    else if(perm==1) permute1(y, b);
+    else if(perm==0) permute0(y, b);
+  }
+
+  ///////////////////////////////
+  // Getting single lanes
+  ///////////////////////////////
+  accelerator_inline Scalar_type getlane(int lane) const {
+    if(lane < vector_type::Nsimd() ) return v[0].getlane(lane);
+    else return v[1].getlane(lane%vector_type::Nsimd());
+  }
+
+  accelerator_inline void putlane(const Scalar_type &S, int lane){
+    if(lane < vector_type::Nsimd() ) v[0].putlane(S,lane);
+    else v[1].putlane(S,lane%vector_type::Nsimd());
+  }
+}; // end of Grid_simd2 class definition
+
+///////////////////////////////
+// Define available types
+///////////////////////////////
+
+typedef Grid_simd2<complex<double> , vComplexD> vComplexD2;
+typedef Grid_simd2<double , vRealD>             vRealD2;
+
+
+
+/////////////////////////////////////////
+// Some traits to recognise the types
+/////////////////////////////////////////
+template <class T>
+struct is_simd : public std::false_type {};
+template <> struct is_simd<vRealF>     : public std::true_type {};
+template <> struct is_simd<vRealD>     : public std::true_type {};
+template <> struct is_simd<vRealD2>    : public std::true_type {};
+template <> struct is_simd<vRealH>     : public std::true_type {};
+template <> struct is_simd<vComplexF>  : public std::true_type {};
+template <> struct is_simd<vComplexD>  : public std::true_type {};
+template <> struct is_simd<vComplexD2> : public std::true_type {};
+template <> struct is_simd<vComplexH>  : public std::true_type {};
+template <> struct is_simd<vInteger>   : public std::true_type {};
+
+template <typename T> using IfSimd    = Invoke<std::enable_if<is_simd<T>::value, int> >;
+template <typename T> using IfNotSimd = Invoke<std::enable_if<!is_simd<T>::value, unsigned> >;
+
+///////////////////////////////////////////////
+// insert / extract with complex support
+///////////////////////////////////////////////
+template <class S, class V>
+accelerator_inline S getlane(const Grid_simd<S, V> &in,int lane) {
+  return in.getlane(lane);
+}
+template <class S, class V>
+accelerator_inline void putlane(Grid_simd<S, V> &vec,const S &_S, int lane){
+  vec.putlane(_S,lane);
+}
+template <class S, IfNotSimd<S> = 0 >
+accelerator_inline S getlane(const S &in,int lane) {
+  return in;
+}
+template <class S, IfNotSimd<S> = 0 >
+accelerator_inline void putlane(S &vec,const S &_S, int lane){
+  vec = _S;
+}
+template <class S, class V>
+accelerator_inline S getlane(const Grid_simd2<S, V> &in,int lane) {
+  return in.getlane(lane);
+}
+template <class S, class V>
+accelerator_inline void putlane(Grid_simd2<S, V> &vec,const S &_S, int lane){
+  vec.putlane(_S,lane);
+}
+
+
+////////////////////////////////////////////////////////////////////
+// General rotate
+////////////////////////////////////////////////////////////////////
+
+template <class S, class V>
+accelerator_inline void vbroadcast(Grid_simd2<S, V> &ret,const Grid_simd2<S, V> &src,int lane){
+  S* typepun =(S*) &src;
+  vsplat(ret,typepun[lane]);
+}
+template <class S, class V, IfComplex<S> =0>
+accelerator_inline void rbroadcast(Grid_simd2<S, V> &ret,const Grid_simd2<S, V> &src,int lane){
+  typedef typename V::vector_type vector_type;
+  S* typepun =(S*) &src;
+  ret.v[0].v = unary<vector_type>(real(typepun[lane]), VsplatSIMD());
+  ret.v[1].v = unary<vector_type>(real(typepun[lane]), VsplatSIMD());
+}
+
+
+///////////////////////
+// Splat
+///////////////////////
+
+// this is only for the complex version
+template <class S, class V, IfComplex<S> = 0, class ABtype>
+accelerator_inline void vsplat(Grid_simd2<S, V> &ret, ABtype a, ABtype b) {
+  vsplat(ret.v[0],a,b);
+  vsplat(ret.v[1],a,b);
+}
+
+// overload if complex
+template <class S, class V>
+accelerator_inline void vsplat(Grid_simd2<S, V> &ret, EnableIf<is_complex<S>, S> c) {
+  vsplat(ret, real(c), imag(c));
+}
+template <class S, class V>
+accelerator_inline void rsplat(Grid_simd2<S, V> &ret, EnableIf<is_complex<S>, S> c) {
+  vsplat(ret, real(c), real(c));
+}
+
+// if real fill with a, if complex fill with a in the real part (first function
+// above)
+template <class S, class V>
+accelerator_inline void vsplat(Grid_simd2<S, V> &ret, NotEnableIf<is_complex<S>, S> a)
+{
+  vsplat(ret.v[0],a);
+  vsplat(ret.v[1],a);
+}
+//////////////////////////
+
+///////////////////////////////////////////////
+// Initialise to 1,0,i for the correct types
+///////////////////////////////////////////////
+// For complex types
+template <class S, class V, IfComplex<S> = 0>
+accelerator_inline void vone(Grid_simd2<S, V> &ret) {
+  vsplat(ret, S(1.0, 0.0));
+}
+template <class S, class V, IfComplex<S> = 0>
+accelerator_inline void vzero(Grid_simd2<S, V> &ret) {
+  vsplat(ret, S(0.0, 0.0));
+}  // use xor?
+template <class S, class V, IfComplex<S> = 0>
+accelerator_inline void vcomplex_i(Grid_simd2<S, V> &ret) {
+  vsplat(ret, S(0.0, 1.0));
+}
+
+template <class S, class V, IfComplex<S> = 0>
+accelerator_inline void visign(Grid_simd2<S, V> &ret) {
+  vsplat(ret, S(1.0, -1.0));
+}
+template <class S, class V, IfComplex<S> = 0>
+accelerator_inline void vrsign(Grid_simd2<S, V> &ret) {
+  vsplat(ret, S(-1.0, 1.0));
+}
+
+// if not complex overload here
+template <class S, class V, IfReal<S> = 0>
+accelerator_inline void vone(Grid_simd2<S, V> &ret) {
+  vsplat(ret, S(1.0));
+}
+template <class S, class V, IfReal<S> = 0>
+accelerator_inline void vzero(Grid_simd2<S, V> &ret) {
+  vsplat(ret, S(0.0));
+}
+
+// For integral types
+template <class S, class V, IfInteger<S> = 0>
+accelerator_inline void vone(Grid_simd2<S, V> &ret) {
+  vsplat(ret, 1);
+}
+template <class S, class V, IfInteger<S> = 0>
+accelerator_inline void vzero(Grid_simd2<S, V> &ret) {
+  vsplat(ret, 0);
+}
+template <class S, class V, IfInteger<S> = 0>
+accelerator_inline void vtrue(Grid_simd2<S, V> &ret) {
+  vsplat(ret, 0xFFFFFFFF);
+}
+template <class S, class V, IfInteger<S> = 0>
+accelerator_inline void vfalse(Grid_simd2<S, V> &ret) {
+  vsplat(ret, 0);
+}
+template <class S, class V>
+accelerator_inline void zeroit(Grid_simd2<S, V> &z) {
+  vzero(z);
+}
+
+///////////////////////
+// Vstream
+///////////////////////
+template <class S, class V, IfReal<S> = 0>
+accelerator_inline void vstream(Grid_simd2<S, V> &out, const Grid_simd2<S, V> &in) {
+  vstream(out.v[0],in.v[0]);
+  vstream(out.v[1],in.v[1]);
+}
+template <class S, class V, IfComplex<S> = 0>
+accelerator_inline void vstream(Grid_simd2<S, V> &out, const Grid_simd2<S, V> &in) {
+  vstream(out.v[0],in.v[0]);
+  vstream(out.v[1],in.v[1]);
+}
+template <class S, class V, IfInteger<S> = 0>
+accelerator_inline void vstream(Grid_simd2<S, V> &out, const Grid_simd2<S, V> &in) {
+  vstream(out.v[0],in.v[0]);
+  vstream(out.v[1],in.v[1]);
+}
+
+////////////////////////////////////
+// Arithmetic operator overloads +,-,*
+////////////////////////////////////
+template <class S, class V>
+accelerator_inline Grid_simd2<S, V> operator+(Grid_simd2<S, V> a, Grid_simd2<S, V> b) {
+  Grid_simd2<S, V> ret;
+  ret.v[0] = a.v[0]+b.v[0];
+  ret.v[1] = a.v[1]+b.v[1];
+  return ret;
+};
+
+template <class S, class V>
+accelerator_inline Grid_simd2<S, V> operator-(Grid_simd2<S, V> a, Grid_simd2<S, V> b) {
+  Grid_simd2<S, V> ret;
+  ret.v[0] = a.v[0]-b.v[0];
+  ret.v[1] = a.v[1]-b.v[1];
+  return ret;
+};
+
+// Distinguish between complex types and others
+template <class S, class V, IfComplex<S> = 0>
+accelerator_inline Grid_simd2<S, V> real_mult(Grid_simd2<S, V> a, Grid_simd2<S, V> b) {
+  Grid_simd2<S, V> ret;
+  ret.v[0] =real_mult(a.v[0],b.v[0]);
+  ret.v[1] =real_mult(a.v[1],b.v[1]);
+  return ret;
+};
+template <class S, class V, IfComplex<S> = 0>
+accelerator_inline Grid_simd2<S, V> real_madd(Grid_simd2<S, V> a, Grid_simd2<S, V> b, Grid_simd2<S, V> c) {
+  Grid_simd2<S, V> ret;
+  ret.v[0] =real_madd(a.v[0],b.v[0],c.v[0]);
+  ret.v[1] =real_madd(a.v[1],b.v[1],c.v[1]);
+  return ret;
+};
+
+
+// Distinguish between complex types and others
+template <class S, class V>
+accelerator_inline Grid_simd2<S, V> operator*(Grid_simd2<S, V> a, Grid_simd2<S, V> b) {
+  Grid_simd2<S, V> ret;
+  ret.v[0] = a.v[0]*b.v[0];
+  ret.v[1] = a.v[1]*b.v[1];
+  return ret;
+};
+
+// Distinguish between complex types and others
+template <class S, class V>
+accelerator_inline Grid_simd2<S, V> operator/(Grid_simd2<S, V> a, Grid_simd2<S, V> b) {
+  Grid_simd2<S, V> ret;
+  ret.v[0] = a.v[0]/b.v[0];
+  ret.v[1] = a.v[1]/b.v[1];
+  return ret;
+};
+
+///////////////////////
+// Conjugate
+///////////////////////
+template <class S, class V>
+accelerator_inline Grid_simd2<S, V> conjugate(const Grid_simd2<S, V> &in) {
+  Grid_simd2<S, V> ret;
+  ret.v[0] = conjugate(in.v[0]);
+  ret.v[1] = conjugate(in.v[1]);
+  return ret;
+}
+template <class S, class V, IfNotInteger<S> = 0>
+accelerator_inline Grid_simd2<S, V> adj(const Grid_simd2<S, V> &in) {
+  return conjugate(in);
+}
+
+///////////////////////
+// timesMinusI
+///////////////////////
+template <class S, class V>
+accelerator_inline void timesMinusI(Grid_simd2<S, V> &ret, const Grid_simd2<S, V> &in) {
+  timesMinusI(ret.v[0],in.v[0]);
+  timesMinusI(ret.v[1],in.v[1]);
+}
+template <class S, class V>
+accelerator_inline Grid_simd2<S, V> timesMinusI(const Grid_simd2<S, V> &in) {
+  Grid_simd2<S, V> ret;
+  timesMinusI(ret.v[0],in.v[0]);
+  timesMinusI(ret.v[1],in.v[1]);
+  return ret;
+}
+
+///////////////////////
+// timesI
+///////////////////////
+template <class S, class V>
+accelerator_inline void timesI(Grid_simd2<S, V> &ret, const Grid_simd2<S, V> &in) {
+  timesI(ret.v[0],in.v[0]);
+  timesI(ret.v[1],in.v[1]);
+}
+template <class S, class V>
+accelerator_inline Grid_simd2<S, V> timesI(const Grid_simd2<S, V> &in) {
+  Grid_simd2<S, V> ret;
+  timesI(ret.v[0],in.v[0]);
+  timesI(ret.v[1],in.v[1]);
+  return ret;
+}
+
+/////////////////////
+// Inner, outer
+/////////////////////
+template <class S, class V>
+accelerator_inline Grid_simd2<S, V> innerProduct(const Grid_simd2<S, V> &l,const Grid_simd2<S, V> &r) {
+  return conjugate(l) * r;
+}
+template <class S, class V>
+accelerator_inline Grid_simd2<S, V> outerProduct(const Grid_simd2<S, V> &l,const Grid_simd2<S, V> &r) {
+  return l * conjugate(r);
+}
+
+template <class S, class V>
+accelerator_inline Grid_simd2<S, V> trace(const Grid_simd2<S, V> &arg) {
+  return arg;
+}
+
+////////////////////////////////////////////////////////////
+// copy/splat complex real parts into real;
+// insert real into complex and zero imag;
+////////////////////////////////////////////////////////////
+accelerator_inline void precisionChange(vComplexD2 &out,const vComplexF &in){
+  Optimization::PrecisionChange::StoD(in.v,out.v[0].v,out.v[1].v);
+}
+accelerator_inline void precisionChange(vComplexF &out,const vComplexD2 &in){
+  out.v=Optimization::PrecisionChange::DtoS(in.v[0].v,in.v[1].v);
+}
+accelerator_inline void precisionChange(vComplexD2 *out,const vComplexF *in,int nvec){
+  for(int m=0;m<nvec;m++) precisionChange(out[m],in[m]);
+}
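
As an illustration of what the new header provides (this sketch is not part of the patch): the doubled type glues two vComplexD together so that double precision exposes the same number of complex lanes as vComplexF, which is what makes the vComplexF <-> vComplexD2 precisionChange calls and the flat lane indexing of getlane/putlane work. The code below assumes a Grid build that pulls in this header and a SIMD target where vComplexF carries twice the complex lanes of vComplexD (e.g. AVX2 or AVX-512); the file name and the use of Grid_init/Grid_finalize are illustrative only.

// simd_doubled_check.cc -- hypothetical standalone check, see caveats above
#include <Grid/Grid.h>
#include <iostream>

using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  // vComplexD2 pairs two vComplexD, so it reports the same lane count as
  // vComplexF on matching compile targets.
  std::cout << "vComplexF::Nsimd()  = " << vComplexF::Nsimd()  << std::endl;
  std::cout << "vComplexD2::Nsimd() = " << vComplexD2::Nsimd() << std::endl;

  // Fill a single-precision vector with distinct per-lane values.
  vComplexF f;
  for (int l = 0; l < vComplexF::Nsimd(); l++) {
    putlane(f, ComplexF(1.0f + l, -1.0f - l), l);
  }

  // Promote to the doubled double-precision type and back again.
  vComplexD2 d2;
  precisionChange(d2, f);   // StoD into the two vComplexD halves
  vComplexF f2;
  precisionChange(f2, d2);  // DtoS back into a single vComplexF

  // Lanes 0..vComplexD::Nsimd()-1 live in d2.v[0], the remainder in d2.v[1];
  // getlane hides that split behind a flat lane index.
  for (int l = 0; l < vComplexD2::Nsimd(); l++) {
    std::cout << " lane " << l
              << "  f="  << getlane(f,  l)
              << "  d2=" << getlane(d2, l)
              << "  f2=" << getlane(f2, l) << std::endl;
  }

  Grid_finalize();
  return 0;
}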