/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/simd/Grid_generic.h Copyright (C) 2015 Copyright (C) 2017 Author: Antonin Portelli Andrew Lawson This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ #include "Grid_generic_types.h" NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Optimization); struct Vsplat{ // Complex template accelerator_inline vec operator()(T a, T b){ vec out; VECTOR_FOR(i, W::r, 2) { out.v[i] = a; out.v[i+1] = b; } return out; } // Real template accelerator_inline vec operator()(T a){ vec out; VECTOR_FOR(i, W::r, 1) { out.v[i] = a; } return out; } }; struct Vstore{ // Real template accelerator_inline void operator()(vec a, T *D){ *((vec *)D) = a; } }; struct Vstream{ // Real template accelerator_inline void operator()(T * a, vec b){ *((vec *)a) = b; } }; struct Vset{ // Complex template accelerator_inline vec operator()(std::complex *a){ vec out; VECTOR_FOR(i, W::c, 1) { out.v[2*i] = a[i].real(); out.v[2*i+1] = a[i].imag(); } return out; } // Real template accelerator_inline vec operator()(T *a){ vec out; out = *((vec *)a); return out; } }; ///////////////////////////////////////////////////// // Arithmetic operations ///////////////////////////////////////////////////// struct Sum{ // Complex/Real template accelerator_inline vec operator()(vec a, vec b){ vec out; VECTOR_FOR(i, W::r, 1) { out.v[i] = a.v[i] + b.v[i]; } return out; } }; struct Sub{ // Complex/Real template accelerator_inline vec operator()(vec a, vec b){ vec out; VECTOR_FOR(i, W::r, 1) { out.v[i] = a.v[i] - b.v[i]; } return out; } }; struct Mult{ // Real template accelerator_inline vec operator()(vec a, vec b){ vec out; VECTOR_FOR(i, W::r, 1) { out.v[i] = a.v[i]*b.v[i]; } return out; } }; #define cmul(a, b, c, i) \ c[i] = a[i]*b[i] - a[i+1]*b[i+1]; \ c[i+1] = a[i]*b[i+1] + a[i+1]*b[i]; struct MultRealPart{ template accelerator_inline vec operator()(vec a, vec b){ vec out; VECTOR_FOR(i, W::c, 1) { out.v[2*i] = a.v[2*i]*b.v[2*i]; out.v[2*i+1] = a.v[2*i]*b.v[2*i+1]; } return out; } }; struct MaddRealPart{ template accelerator_inline vec operator()(vec a, vec b, vec c){ vec out; VECTOR_FOR(i, W::c, 1) { out.v[2*i] = a.v[2*i]*b.v[2*i] + c.v[2*i]; out.v[2*i+1] = a.v[2*i]*b.v[2*i+1] + c.v[2*i+1]; } return out; } }; struct MultComplex{ // Complex template accelerator_inline vec operator()(vec a, vec b){ vec out; VECTOR_FOR(i, W::c, 1) { cmul(a.v, b.v, out.v, 2*i); } return out; } }; #undef cmul struct Div{ // Real template accelerator_inline vec operator()(vec a, vec b){ vec out; VECTOR_FOR(i, W::r, 1) { out.v[i] = a.v[i]/b.v[i]; } return out; } }; #define conj(a, b, i) \ b[i] = a[i]; \ b[i+1] = -a[i+1]; struct Conj{ // Complex template accelerator_inline vec operator()(vec a){ vec out; VECTOR_FOR(i, W::c, 1) { conj(a.v, out.v, 2*i); } return out; } }; #undef conj #define timesmi(a, b, i) \ b[i] = a[i+1]; \ b[i+1] = -a[i]; struct TimesMinusI{ // Complex template accelerator_inline vec operator()(vec a, vec b){ vec out; VECTOR_FOR(i, W::c, 1) { timesmi(a.v, out.v, 2*i); } return out; } }; #undef timesmi #define timesi(a, b, i) \ b[i] = -a[i+1]; \ b[i+1] = a[i]; struct TimesI{ // Complex template accelerator_inline vec operator()(vec a, vec b){ vec out; VECTOR_FOR(i, W::c, 1) { timesi(a.v, out.v, 2*i); } return out; } }; #undef timesi struct PrecisionChange { static accelerator_inline vech StoH (const vecf &a,const vecf &b) { vech ret; const int nf = W::r; #ifdef USE_FP16 vech *ha = (vech *)&a; vech *hb = (vech *)&b; // VECTOR_FOR(i, nf,1){ ret.v[i] = ( (uint16_t *) &a.v[i])[1] ; } // VECTOR_FOR(i, nf,1){ ret.v[i+nf] = ( (uint16_t *) &b.v[i])[1] ; } VECTOR_FOR(i, nf,1){ ret.v[i] = ha->v[2*i+1]; } VECTOR_FOR(i, nf,1){ ret.v[i+nf] = hb->v[2*i+1]; } #else VECTOR_FOR(i, nf,1){ ret.v[i]=0; } assert(0); #endif return ret; } static accelerator_inline void HtoS (vech h,vecf &sa,vecf &sb) { #ifdef USE_FP16 const int nf = W::r; const int nh = W::r; vech *ha = (vech *)&sa; vech *hb = (vech *)&sb; VECTOR_FOR(i, nf, 1){ sb.v[i]= sa.v[i] = 0; } // VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sa.v[i]))[1] = h.v[i];} // VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sb.v[i]))[1] = h.v[i+nf];} VECTOR_FOR(i, nf, 1){ ha->v[2*i+1]=h.v[i]; } VECTOR_FOR(i, nf, 1){ hb->v[2*i+1]=h.v[i+nf]; } #else assert(0); #endif } static accelerator_inline vecf DtoS (vecd a,vecd b) { const int nd = W::r; vecf ret; VECTOR_FOR(i, nd,1){ ret.v[i] = a.v[i] ; } VECTOR_FOR(i, nd,1){ ret.v[i+nd] = b.v[i] ; } return ret; } static accelerator_inline void StoD (vecf s,vecd &a,vecd &b) { const int nd = W::r; VECTOR_FOR(i, nd,1){ a.v[i] = s.v[i] ; } VECTOR_FOR(i, nd,1){ b.v[i] = s.v[i+nd] ; } } static accelerator_inline vech DtoH (vecd a,vecd b,vecd c,vecd d) { vecf sa,sb; sa = DtoS(a,b); sb = DtoS(c,d); return StoH(sa,sb); } static accelerator_inline void HtoD (vech h,vecd &a,vecd &b,vecd &c,vecd &d) { vecf sa,sb; HtoS(h,sa,sb); StoD(sa,a,b); StoD(sb,c,d); } }; ////////////////////////////////////////////// // Exchange support struct Exchange{ template static accelerator_inline void ExchangeN(vec &out1,vec &out2,vec &in1,vec &in2){ const int w = W::r; unsigned int mask = w >> (n + 1); // std::cout << " Exchange "< static accelerator_inline void Exchange0(vec &out1,vec &out2,vec &in1,vec &in2){ ExchangeN(out1,out2,in1,in2); }; template static accelerator_inline void Exchange1(vec &out1,vec &out2,vec &in1,vec &in2){ ExchangeN(out1,out2,in1,in2); }; template static accelerator_inline void Exchange2(vec &out1,vec &out2,vec &in1,vec &in2){ ExchangeN(out1,out2,in1,in2); }; template static accelerator_inline void Exchange3(vec &out1,vec &out2,vec &in1,vec &in2){ ExchangeN(out1,out2,in1,in2); }; }; ////////////////////////////////////////////// // Some Template specialization #define perm(a, b, n, w) \ unsigned int _mask = w >> (n + 1); \ VECTOR_FOR(i, w, 1) \ { \ b[i] = a[i^_mask]; \ } #define DECL_PERMUTE_N(n) \ template \ static accelerator_inline vec Permute##n(vec in) { \ vec out; \ perm(in.v, out.v, n, W::r); \ return out; \ } struct Permute{ DECL_PERMUTE_N(0); DECL_PERMUTE_N(1); DECL_PERMUTE_N(2); DECL_PERMUTE_N(3); }; #undef perm #undef DECL_PERMUTE_N #define rot(a, b, n, w) \ VECTOR_FOR(i, w, 1) \ { \ b[i] = a[(i + n)%w]; \ } struct Rotate{ template static accelerator_inline vec tRotate(vec in){ return rotate(in, n); } template static accelerator_inline vec rotate(vec in, int n){ vec out; rot(in.v, out.v, n, W::r); return out; } }; #undef rot #define acc(v, a, off, step, n) \ for (unsigned int i = off; i < n; i += step) \ { \ a += v[i]; \ } template struct Reduce{ //Need templated class to overload output type //General form must generate error if compiled accelerator_inline Out_type operator()(In_type in){ printf("Error, using wrong Reduce function\n"); exit(1); return 0; } }; //Complex float Reduce template <> accelerator_inline Grid::ComplexF Reduce::operator()(vecf in){ float a = 0.f, b = 0.f; acc(in.v, a, 0, 2, W::r); acc(in.v, b, 1, 2, W::r); return Grid::ComplexF(a, b); } //Real float Reduce template<> accelerator_inline Grid::RealF Reduce::operator()(vecf in){ float a = 0.; acc(in.v, a, 0, 1, W::r); return a; } //Complex double Reduce template<> accelerator_inline Grid::ComplexD Reduce::operator()(vecd in){ double a = 0., b = 0.; acc(in.v, a, 0, 2, W::r); acc(in.v, b, 1, 2, W::r); return Grid::ComplexD(a, b); } //Real double Reduce template<> accelerator_inline Grid::RealD Reduce::operator()(vecd in){ double a = 0.f; acc(in.v, a, 0, 1, W::r); return a; } //Integer Reduce template<> accelerator_inline Integer Reduce::operator()(veci in){ Integer a = 0; acc(in.v, a, 0, 1, W::r); return a; } #undef acc // EIGEN compatibility NAMESPACE_END(Optimization) ////////////////////////////////////////////////////////////////////////////////////// // Here assign types typedef Optimization::vech SIMD_Htype; // Reduced precision type typedef Optimization::vecf SIMD_Ftype; // Single precision type typedef Optimization::vecd SIMD_Dtype; // Double precision type typedef Optimization::veci SIMD_Itype; // Integer type // prefetch utilities accelerator_inline void v_prefetch0(int size, const char *ptr){}; accelerator_inline void prefetch_HINT_T0(const char *ptr){}; // Function name aliases typedef Optimization::Vsplat VsplatSIMD; typedef Optimization::Vstore VstoreSIMD; typedef Optimization::Vset VsetSIMD; typedef Optimization::Vstream VstreamSIMD; template using ReduceSIMD = Optimization::Reduce; // Arithmetic operations typedef Optimization::Sum SumSIMD; typedef Optimization::Sub SubSIMD; typedef Optimization::Div DivSIMD; typedef Optimization::Mult MultSIMD; typedef Optimization::MultComplex MultComplexSIMD; typedef Optimization::MultRealPart MultRealPartSIMD; typedef Optimization::MaddRealPart MaddRealPartSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesI TimesISIMD; NAMESPACE_END(Grid)