From 408b8684754119992c3ed03ce922d71ff8b191a5 Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 24 Jan 2018 13:49:12 +0000 Subject: [PATCH] Generic for GPU needs accelerator markup of functions --- lib/simd/Grid_generic.h | 82 ++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/lib/simd/Grid_generic.h b/lib/simd/Grid_generic.h index 6fef3e2b..35cb3bc3 100644 --- a/lib/simd/Grid_generic.h +++ b/lib/simd/Grid_generic.h @@ -36,7 +36,7 @@ NAMESPACE_BEGIN(Optimization); struct Vsplat{ // Complex template - inline vec operator()(T a, T b){ + accelerator_inline vec operator()(T a, T b){ vec out; VECTOR_FOR(i, W::r, 2) @@ -50,7 +50,7 @@ struct Vsplat{ // Real template - inline vec operator()(T a){ + accelerator_inline vec operator()(T a){ vec out; VECTOR_FOR(i, W::r, 1) @@ -65,7 +65,7 @@ struct Vsplat{ struct Vstore{ // Real template - inline void operator()(vec a, T *D){ + accelerator_inline void operator()(vec a, T *D){ *((vec *)D) = a; } }; @@ -73,7 +73,7 @@ struct Vstore{ struct Vstream{ // Real template - inline void operator()(T * a, vec b){ + accelerator_inline void operator()(T * a, vec b){ *((vec *)a) = b; } }; @@ -81,7 +81,7 @@ struct Vstream{ struct Vset{ // Complex template - inline vec operator()(std::complex *a){ + accelerator_inline vec operator()(std::complex *a){ vec out; VECTOR_FOR(i, W::c, 1) @@ -95,7 +95,7 @@ struct Vset{ // Real template - inline vec operator()(T *a){ + accelerator_inline vec operator()(T *a){ vec out; out = *((vec *)a); @@ -110,7 +110,7 @@ struct Vset{ struct Sum{ // Complex/Real template - inline vec operator()(vec a, vec b){ + accelerator_inline vec operator()(vec a, vec b){ vec out; VECTOR_FOR(i, W::r, 1) @@ -125,7 +125,7 @@ struct Sum{ struct Sub{ // Complex/Real template - inline vec operator()(vec a, vec b){ + accelerator_inline vec operator()(vec a, vec b){ vec out; VECTOR_FOR(i, W::r, 1) @@ -140,7 +140,7 @@ struct Sub{ struct Mult{ // Real template - inline vec operator()(vec a, vec b){ + accelerator_inline vec operator()(vec a, vec b){ vec out; VECTOR_FOR(i, W::r, 1) @@ -158,7 +158,7 @@ struct Mult{ struct MultRealPart{ template - inline vec operator()(vec a, vec b){ + accelerator_inline vec operator()(vec a, vec b){ vec out; VECTOR_FOR(i, W::c, 1) @@ -172,7 +172,7 @@ struct MultRealPart{ struct MaddRealPart{ template - inline vec operator()(vec a, vec b, vec c){ + accelerator_inline vec operator()(vec a, vec b, vec c){ vec out; VECTOR_FOR(i, W::c, 1) @@ -187,7 +187,7 @@ struct MaddRealPart{ struct MultComplex{ // Complex template - inline vec operator()(vec a, vec b){ + accelerator_inline vec operator()(vec a, vec b){ vec out; VECTOR_FOR(i, W::c, 1) @@ -204,7 +204,7 @@ struct MultComplex{ struct Div{ // Real template - inline vec operator()(vec a, vec b){ + accelerator_inline vec operator()(vec a, vec b){ vec out; VECTOR_FOR(i, W::r, 1) @@ -223,7 +223,7 @@ struct Div{ struct Conj{ // Complex template - inline vec operator()(vec a){ + accelerator_inline vec operator()(vec a){ vec out; VECTOR_FOR(i, W::c, 1) @@ -244,7 +244,7 @@ struct Conj{ struct TimesMinusI{ // Complex template - inline vec operator()(vec a, vec b){ + accelerator_inline vec operator()(vec a, vec b){ vec out; VECTOR_FOR(i, W::c, 1) @@ -265,7 +265,7 @@ struct TimesMinusI{ struct TimesI{ // Complex template - inline vec operator()(vec a, vec b){ + accelerator_inline vec operator()(vec a, vec b){ vec out; VECTOR_FOR(i, W::c, 1) @@ -280,22 +280,23 @@ struct TimesI{ #undef timesi struct PrecisionChange { - static inline vech StoH (const vecf &a,const vecf &b) { - vech ret; + static accelerator_inline vech StoH (const vecf &a,const vecf &b) { + vech ret; + const int nf = W::r; #ifdef USE_FP16 vech *ha = (vech *)&a; vech *hb = (vech *)&b; - const int nf = W::r; // VECTOR_FOR(i, nf,1){ ret.v[i] = ( (uint16_t *) &a.v[i])[1] ; } // VECTOR_FOR(i, nf,1){ ret.v[i+nf] = ( (uint16_t *) &b.v[i])[1] ; } VECTOR_FOR(i, nf,1){ ret.v[i] = ha->v[2*i+1]; } VECTOR_FOR(i, nf,1){ ret.v[i+nf] = hb->v[2*i+1]; } #else + VECTOR_FOR(i, nf,1){ ret.v[i]=0; } assert(0); #endif return ret; } - static inline void HtoS (vech h,vecf &sa,vecf &sb) { + static accelerator_inline void HtoS (vech h,vecf &sa,vecf &sb) { #ifdef USE_FP16 const int nf = W::r; const int nh = W::r; @@ -310,26 +311,25 @@ struct PrecisionChange { assert(0); #endif } - static inline vecf DtoS (vecd a,vecd b) { + static accelerator_inline vecf DtoS (vecd a,vecd b) { const int nd = W::r; - const int nf = W::r; vecf ret; VECTOR_FOR(i, nd,1){ ret.v[i] = a.v[i] ; } VECTOR_FOR(i, nd,1){ ret.v[i+nd] = b.v[i] ; } return ret; } - static inline void StoD (vecf s,vecd &a,vecd &b) { + static accelerator_inline void StoD (vecf s,vecd &a,vecd &b) { const int nd = W::r; VECTOR_FOR(i, nd,1){ a.v[i] = s.v[i] ; } VECTOR_FOR(i, nd,1){ b.v[i] = s.v[i+nd] ; } } - static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) { + static accelerator_inline vech DtoH (vecd a,vecd b,vecd c,vecd d) { vecf sa,sb; sa = DtoS(a,b); sb = DtoS(c,d); return StoH(sa,sb); } - static inline void HtoD (vech h,vecd &a,vecd &b,vecd &c,vecd &d) { + static accelerator_inline void HtoD (vech h,vecd &a,vecd &b,vecd &c,vecd &d) { vecf sa,sb; HtoS(h,sa,sb); StoD(sa,a,b); @@ -342,7 +342,7 @@ struct PrecisionChange { struct Exchange{ template - static inline void ExchangeN(vec &out1,vec &out2,vec &in1,vec &in2){ + static accelerator_inline void ExchangeN(vec &out1,vec &out2,vec &in1,vec &in2){ const int w = W::r; unsigned int mask = w >> (n + 1); // std::cout << " Exchange "< - static inline void Exchange0(vec &out1,vec &out2,vec &in1,vec &in2){ + static accelerator_inline void Exchange0(vec &out1,vec &out2,vec &in1,vec &in2){ ExchangeN(out1,out2,in1,in2); }; template - static inline void Exchange1(vec &out1,vec &out2,vec &in1,vec &in2){ + static accelerator_inline void Exchange1(vec &out1,vec &out2,vec &in1,vec &in2){ ExchangeN(out1,out2,in1,in2); }; template - static inline void Exchange2(vec &out1,vec &out2,vec &in1,vec &in2){ + static accelerator_inline void Exchange2(vec &out1,vec &out2,vec &in1,vec &in2){ ExchangeN(out1,out2,in1,in2); }; template - static inline void Exchange3(vec &out1,vec &out2,vec &in1,vec &in2){ + static accelerator_inline void Exchange3(vec &out1,vec &out2,vec &in1,vec &in2){ ExchangeN(out1,out2,in1,in2); }; }; @@ -385,7 +385,7 @@ struct Exchange{ #define DECL_PERMUTE_N(n) \ template \ - static inline vec Permute##n(vec in) { \ + static accelerator_inline vec Permute##n(vec in) { \ vec out; \ perm(in.v, out.v, n, W::r); \ return out; \ @@ -409,12 +409,12 @@ struct Permute{ struct Rotate{ - template static inline vec tRotate(vec in){ + template static accelerator_inline vec tRotate(vec in){ return rotate(in, n); } template - static inline vec rotate(vec in, int n){ + static accelerator_inline vec rotate(vec in, int n){ vec out; rot(in.v, out.v, n, W::r); @@ -435,7 +435,7 @@ template struct Reduce{ //Need templated class to overload output type //General form must generate error if compiled - inline Out_type operator()(In_type in){ + accelerator_inline Out_type operator()(In_type in){ printf("Error, using wrong Reduce function\n"); exit(1); return 0; @@ -444,7 +444,7 @@ struct Reduce{ //Complex float Reduce template <> -inline Grid::ComplexF Reduce::operator()(vecf in){ +accelerator_inline Grid::ComplexF Reduce::operator()(vecf in){ float a = 0.f, b = 0.f; acc(in.v, a, 0, 2, W::r); @@ -455,7 +455,7 @@ inline Grid::ComplexF Reduce::operator()(vecf in){ //Real float Reduce template<> -inline Grid::RealF Reduce::operator()(vecf in){ +accelerator_inline Grid::RealF Reduce::operator()(vecf in){ float a = 0.; acc(in.v, a, 0, 1, W::r); @@ -465,7 +465,7 @@ inline Grid::RealF Reduce::operator()(vecf in){ //Complex double Reduce template<> -inline Grid::ComplexD Reduce::operator()(vecd in){ +accelerator_inline Grid::ComplexD Reduce::operator()(vecd in){ double a = 0., b = 0.; acc(in.v, a, 0, 2, W::r); @@ -476,7 +476,7 @@ inline Grid::ComplexD Reduce::operator()(vecd in){ //Real double Reduce template<> -inline Grid::RealD Reduce::operator()(vecd in){ +accelerator_inline Grid::RealD Reduce::operator()(vecd in){ double a = 0.f; acc(in.v, a, 0, 1, W::r); @@ -486,7 +486,7 @@ inline Grid::RealD Reduce::operator()(vecd in){ //Integer Reduce template<> -inline Integer Reduce::operator()(veci in){ +accelerator_inline Integer Reduce::operator()(veci in){ Integer a = 0; acc(in.v, a, 0, 1, W::r); @@ -506,8 +506,8 @@ typedef Optimization::vecd SIMD_Dtype; // Double precision type typedef Optimization::veci SIMD_Itype; // Integer type // prefetch utilities -inline void v_prefetch0(int size, const char *ptr){}; -inline void prefetch_HINT_T0(const char *ptr){}; +accelerator_inline void v_prefetch0(int size, const char *ptr){}; +accelerator_inline void prefetch_HINT_T0(const char *ptr){}; // Function name aliases typedef Optimization::Vsplat VsplatSIMD;