diff --git a/Grid/simd/Grid_gpu_vec.h b/Grid/simd/Grid_gpu_vec.h index 8e55ce2f..2c1a38e7 100644 --- a/Grid/simd/Grid_gpu_vec.h +++ b/Grid/simd/Grid_gpu_vec.h @@ -60,11 +60,25 @@ template class GpuComplex { public: pair z; - typedef decltype(z.x) real; + typedef decltype(z.x) Real; public: accelerator_inline GpuComplex() = default; - accelerator_inline GpuComplex(real re,real im) { z.x=re; z.y=im; }; + accelerator_inline GpuComplex(Real re,Real im) { z.x=re; z.y=im; }; accelerator_inline GpuComplex(const GpuComplex &zz) { z = zz.z;}; + accelerator_inline Real real(void) const { return z.x; }; + accelerator_inline Real imag(void) const { return z.y; }; + accelerator_inline GpuComplex &operator*=(const GpuComplex &r) { + *this = (*this) * r; + return *this; + } + accelerator_inline GpuComplex &operator+=(const GpuComplex &r) { + *this = (*this) + r; + return *this; + } + accelerator_inline GpuComplex &operator-=(const GpuComplex &r) { + *this = (*this) - r; + return *this; + } friend accelerator_inline GpuComplex operator+(const GpuComplex &lhs,const GpuComplex &rhs) { GpuComplex r ; r.z.x = lhs.z.x + rhs.z.x; @@ -157,6 +171,11 @@ typedef GpuVector GpuVectorRD; typedef GpuVector GpuVectorCD; typedef GpuVector GpuVectorI; +accelerator_inline GpuComplexF timesI(const GpuComplexF &r) { return(GpuComplexF(-r.imag(),r.real()));} +accelerator_inline GpuComplexD timesI(const GpuComplexD &r) { return(GpuComplexD(-r.imag(),r.real()));} +accelerator_inline GpuComplexF timesMinusI(const GpuComplexF &r){ return(GpuComplexF(r.imag(),-r.real()));} +accelerator_inline GpuComplexD timesMinusI(const GpuComplexD &r){ return(GpuComplexD(r.imag(),-r.real()));} + accelerator_inline float half2float(half h) { float f; diff --git a/Grid/tensors/Tensor_SIMT.h b/Grid/tensors/Tensor_SIMT.h index b2213c4e..8f046997 100644 --- a/Grid/tensors/Tensor_SIMT.h +++ b/Grid/tensors/Tensor_SIMT.h @@ -65,22 +65,20 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__ #else +#if 0 +// Use the scalar as our own complex on GPU template = 0> accelerator_inline -//typename vsimd::vector_type::datum typename vsimd::scalar_type coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd())) { - // typedef typename vsimd::vector_type::datum S; typedef typename vsimd::scalar_type S; S * __restrict__ p=(S *)&vec; return p[lane]; } template = 0> accelerator_inline -//typename vsimd::vector_type::datum typename vsimd::scalar_type coalescedReadPermute(const vsimd & __restrict__ vec,int doperm,int lane=acceleratorSIMTlane(vsimd::Nsimd())) { - // typedef typename vsimd::vector_type::datum S; typedef typename vsimd::scalar_type S; S * __restrict__ p=(S *)&vec; @@ -90,16 +88,43 @@ coalescedReadPermute(const vsimd & __restrict__ vec,int doperm,int lane=accelera } template = 0> accelerator_inline void coalescedWrite(vsimd & __restrict__ vec, - // const typename vsimd::vector_type::datum & __restrict__ extracted, const typename vsimd::scalar_type & __restrict__ extracted, int lane=acceleratorSIMTlane(vsimd::Nsimd())) { - // typedef typename vsimd::vector_type::datum S; typedef typename vsimd::scalar_type S; S * __restrict__ p=(S *)&vec; p[lane]=extracted; } +#else +template = 0> accelerator_inline +typename vsimd::vector_type::datum +coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd())) +{ + typedef typename vsimd::vector_type::datum S; + S * __restrict__ p=(S *)&vec; + return p[lane]; +} +template = 0> accelerator_inline +typename vsimd::vector_type::datum +coalescedReadPermute(const vsimd & __restrict__ vec,int doperm,int lane=acceleratorSIMTlane(vsimd::Nsimd())) +{ + typedef typename vsimd::vector_type::datum S; + S * __restrict__ p=(S *)&vec; + int mask = vsimd::Nsimd() >> (ptype + 1); + int plane= doperm ? lane ^ mask : lane; + return p[plane]; +} +template = 0> accelerator_inline +void coalescedWrite(vsimd & __restrict__ vec, + const typename vsimd::vector_type::datum & __restrict__ extracted, + int lane=acceleratorSIMTlane(vsimd::Nsimd())) +{ + typedef typename vsimd::vector_type::datum S; + S * __restrict__ p=(S *)&vec; + p[lane]=extracted; +} +#endif ////////////////////////////////////////// // Extract and insert slices on the GPU