/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/simd/Grid_gpu.h Copyright (C) 2018 Author: Peter Boyle This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ //---------------------------------------------------------------------- /*! 
// NOTE(review): this copy of the header looks extraction-damaged -- TODO confirm
// against the upstream file before building. Symptoms visible below:
//   - the file is collapsed onto a few physical lines, so every `//` comment
//     now runs to the end of its physical line and hides the code after it;
//   - the text that would sit between '<' and '>' appears to be missing:
//     `#include` has no target, `template` has no parameter list, and loops
//     stop at `for(int i=0;i` with their condition and body gone.
//
// What can still be read from the text in namespace Grid:
//   - GpuComplex: holds `pair z`; `real` is decltype(z.x). operator+ and
//     operator- work component-wise on z.x and z.y, operator* forms
//     (rr-ii, ri+ir), real_mult scales both components of r by l.z.x, and
//     operator<< writes "(x,y)".
//   - GpuVector: array `_datum v[_N]` with N = _N and datum = _datum. Each of
//     the four arithmetic overloads declares a `ret` and starts a loop; the
//     loop bodies are not visible here.
//   - GpuComplexH, GpuComplexF, GpuComplexD and the GpuVectorRH .. GpuVectorI
//     typedefs: their template arguments are not visible here.
//   - half2float and float2half: under __CUDA_ARCH__ they call __half2float
//     or __float2half; otherwise they go through __half_raw and Grid_half
//     using sfw_half_to_float or sfw_float_to_half.
//   - namespace Optimization: functors Vsplat, Vstream, Vset, Reduce, Sum,
//     Sub, MultRealPart plus static helpers PermuteN, Permute0..3, StoH,
//     ExchangeN, Exchange0..3, tRotate and rotate_template. PermuteN and
//     ExchangeN both compute a mask as vec::N >> (n + 1). The store-style
//     functors cast the destination pointer to a GpuVector pointer and assign
//     through it. The general Reduce::operator() prints an error and calls
//     exit(1); specialisations follow for GpuVectorCF, CD, RF, RD and I.
//   - The closing aliases export Optimization functors under *SIMD names.
//     Div, Mult, MultComplex, MaddRealPart, Conj, TimesMinusI and TimesI are
//     referenced there, but their definitions are not visible in this text.
@file Grid_gpu.h @brief Optimization libraries for GPU Use float4, double2 */ //---------------------------------------------------------------------- #include namespace Grid { template class GpuComplex { public: pair z; typedef decltype(z.x) real; public: accelerator_inline GpuComplex() = default; accelerator_inline GpuComplex(real re,real im) { z.x=re; z.y=im; }; accelerator_inline GpuComplex(const GpuComplex &zz) { z = zz.z;}; friend accelerator_inline GpuComplex operator+(const GpuComplex &lhs,const GpuComplex &rhs) { GpuComplex r ; r.z.x = lhs.z.x + rhs.z.x; r.z.y = lhs.z.y + rhs.z.y; return r; } friend accelerator_inline GpuComplex operator-(const GpuComplex &lhs,const GpuComplex &rhs) { GpuComplex r ; r.z.x = lhs.z.x - rhs.z.x; r.z.y = lhs.z.y - rhs.z.y; return r; } friend accelerator_inline GpuComplex operator*(const GpuComplex &lhs,const GpuComplex &rhs) { GpuComplex r ; r.z.x= lhs.z.x*rhs.z.x - lhs.z.y*rhs.z.y; // rr-ii r.z.y= lhs.z.x*rhs.z.y + lhs.z.y*rhs.z.x; // ri+ir return r; } friend accelerator_inline GpuComplex real_mult(const GpuComplex &l,const GpuComplex &r) { GpuComplex ret; ret.z.x = l.z.x*r.z.x; ret.z.y = l.z.x*r.z.y; return ret; } friend std::ostream& operator<< (std::ostream& stream, const GpuComplex o){ stream << "("<< o.z.x << ","<< o.z.y <<")"; return stream; } }; template struct GpuVector { _datum v[_N]; static const int N = _N; typedef _datum datum; }; template inline accelerator GpuVector operator*(const GpuVector l,const GpuVector r) { GpuVector ret; for(int i=0;i inline accelerator GpuVector operator-(const GpuVector l,const GpuVector r) { GpuVector ret; for(int i=0;i inline accelerator GpuVector operator+(const GpuVector l,const GpuVector r) { GpuVector ret; for(int i=0;i inline accelerator GpuVector operator/(const GpuVector l,const GpuVector r) { GpuVector ret; for(int i=0;i GpuComplexH; typedef GpuComplex GpuComplexF; typedef GpuComplex GpuComplexD; typedef GpuVector GpuVectorRH; typedef GpuVector GpuVectorCH; typedef GpuVector 
GpuVectorRF; typedef GpuVector GpuVectorCF; typedef GpuVector GpuVectorRD; typedef GpuVector GpuVectorCD; typedef GpuVector GpuVectorI; accelerator_inline float half2float(half h) { float f; #ifdef __CUDA_ARCH__ f = __half2float(h); #else //f = __half2float(h); __half_raw hr(h); Grid_half hh; hh.x = hr.x; f= sfw_half_to_float(hh); #endif return f; } accelerator_inline half float2half(float f) { half h; #ifdef __CUDA_ARCH__ h = __float2half(f); #else Grid_half hh = sfw_float_to_half(f); __half_raw hr; hr.x = hh.x; h = __half(hr); #endif return h; } namespace Optimization { struct Vsplat{ //Complex float accelerator_inline GpuVectorCF operator()(float a, float b){ GpuVectorCF ret; for(int i=0;i accelerator_inline void operator()(GpuVector a, P* Fp){ GpuVector *vF = (GpuVector *)Fp; *vF = a; } }; struct Vstream{ template accelerator_inline void operator()(P* F,GpuVector a){ GpuVector *vF = (GpuVector *)F; *vF = a; } }; struct Vset{ // Complex float accelerator_inline GpuVectorCF operator()(Grid::ComplexF *a){ typedef GpuVectorCF vec; vec ret; for(int i=0;i struct Reduce{ //Need templated class to overload output type //General form must generate error if compiled accelerator_inline Out_type operator()(In_type in){ printf("Error, using wrong Reduce function\n"); exit(1); return 0; } }; ///////////////////////////////////////////////////// // Arithmetic operations ///////////////////////////////////////////////////// struct Sum{ //Real float accelerator_inline GpuVectorRF operator()(GpuVectorRF a,GpuVectorRF b){ return a+b; } accelerator_inline GpuVectorRD operator()(GpuVectorRD a,GpuVectorRD b){ return a+b; } accelerator_inline GpuVectorCF operator()(GpuVectorCF a,GpuVectorCF b){ return a+b; } accelerator_inline GpuVectorCD operator()(GpuVectorCD a,GpuVectorCD b){ return a+b; } accelerator_inline GpuVectorI operator()(GpuVectorI a,GpuVectorI b){ return a+b; } }; struct Sub{ accelerator_inline GpuVectorRF operator()(GpuVectorRF a,GpuVectorRF b){ return a-b; } 
accelerator_inline GpuVectorRD operator()(GpuVectorRD a,GpuVectorRD b){ return a-b; } accelerator_inline GpuVectorCF operator()(GpuVectorCF a,GpuVectorCF b){ return a-b; } accelerator_inline GpuVectorCD operator()(GpuVectorCD a,GpuVectorCD b){ return a-b; } accelerator_inline GpuVectorI operator()(GpuVectorI a,GpuVectorI b){ return a-b; } }; struct MultRealPart{ accelerator_inline GpuVectorCF operator()(GpuVectorCF a,GpuVectorCF b){ typedef GpuVectorCF vec; vec ret; for(int i=0;i static accelerator_inline vec PermuteN(vec in) { vec out; unsigned int _mask = vec::N >> (n + 1); for(int i=0;i static accelerator_inline vec Permute0(vec in) { return PermuteN<0,vec>(in); } template static accelerator_inline vec Permute1(vec in) { return PermuteN<1,vec>(in); } template static accelerator_inline vec Permute2(vec in) { return PermuteN<2,vec>(in); } template static accelerator_inline vec Permute3(vec in) { return PermuteN<3,vec>(in); } }; struct PrecisionChange { //////////////////////////////////////////////////////////////////////////////////// // Single / Half //////////////////////////////////////////////////////////////////////////////////// static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) { int N = GpuVectorCF::N; GpuVectorCH h; for(int i=0;i static accelerator_inline void ExchangeN(vec &out1,vec &out2,vec &in1,vec &in2){ unsigned int mask = vec::N >> (n + 1); for(int i=0;i static accelerator_inline void Exchange0(vec &out1,vec &out2,vec &in1,vec &in2){ ExchangeN(out1,out2,in1,in2); }; template static accelerator_inline void Exchange1(vec &out1,vec &out2,vec &in1,vec &in2){ ExchangeN(out1,out2,in1,in2); }; template static accelerator_inline void Exchange2(vec &out1,vec &out2,vec &in1,vec &in2){ ExchangeN(out1,out2,in1,in2); }; template static accelerator_inline void Exchange3(vec &out1,vec &out2,vec &in1,vec &in2){ ExchangeN(out1,out2,in1,in2); }; }; struct Rotate{ template static accelerator_inline vec tRotate(vec in){ return rotate(in, n); } 
template static accelerator_inline vec rotate_template(vec in, int n){ vec out; for(int i=0;i accelerator_inline Grid::ComplexF Reduce::operator()(GpuVectorCF in) { GpuComplexF greduce = in.v[0]; for(int i=1;i accelerator_inline Grid::ComplexD Reduce::operator()(GpuVectorCD in) { GpuComplexD greduce = in.v[0]; for(int i=1;i accelerator_inline Grid::RealF Reduce::operator()(GpuVectorRF in) { RealF ret = in.v[0]; for(int i=1;i accelerator_inline Grid::RealD Reduce::operator()(GpuVectorRD in) { RealD ret = in.v[0]; for(int i=1;i accelerator_inline Integer Reduce::operator()(GpuVectorI in) { Integer ret = in.v[0]; for(int i=1;i using ReduceSIMD = Optimization::Reduce; // Arithmetic operations typedef Optimization::Sum SumSIMD; typedef Optimization::Sub SubSIMD; typedef Optimization::Div DivSIMD; typedef Optimization::Mult MultSIMD; typedef Optimization::MultComplex MultComplexSIMD; typedef Optimization::MultRealPart MultRealPartSIMD; typedef Optimization::MaddRealPart MaddRealPartSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesI TimesISIMD; }