From f9b1f240f6af103ff437b459bce2937026c0f4d3 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Fri, 26 Feb 2021 17:51:41 +0100
Subject: [PATCH] Better SIMD usage/coalescence

---
 Grid/lattice/Lattice_view.h |  7 ++++-
 Grid/simd/Grid_gpu_vec.h    | 23 ++++++++++++--
 Grid/simd/Simd.h            | 12 ++++---
 Grid/tensors/Tensor_SIMT.h  | 62 +++++++++++++++++++++++++++++++++++++
 Grid/threads/Accelerator.h  | 53 ++++++++++++++++++++++++++-----
 5 files changed, 143 insertions(+), 14 deletions(-)

diff --git a/Grid/lattice/Lattice_view.h b/Grid/lattice/Lattice_view.h
index 3b76b921..cb568abd 100644
--- a/Grid/lattice/Lattice_view.h
+++ b/Grid/lattice/Lattice_view.h
@@ -67,9 +67,14 @@ public:
   accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
 #endif
 
+#if 1
+  //  accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
+  accelerator_inline vobj & operator[](size_t i) const { return this->_odata[i]; };
+#else
   accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
   accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
-
+#endif
+
   accelerator_inline uint64_t begin(void) const { return 0;};
   accelerator_inline uint64_t end(void)   const { return this->_odata_size; };
   accelerator_inline uint64_t size(void)  const { return this->_odata_size; };
diff --git a/Grid/simd/Grid_gpu_vec.h b/Grid/simd/Grid_gpu_vec.h
index 8e55ce2f..2c1a38e7 100644
--- a/Grid/simd/Grid_gpu_vec.h
+++ b/Grid/simd/Grid_gpu_vec.h
@@ -60,11 +60,25 @@ template<class pair> class GpuComplex {
 public:
   pair z;
-  typedef decltype(z.x) real;
+  typedef decltype(z.x) Real;
 public:
   accelerator_inline GpuComplex() = default;
-  accelerator_inline GpuComplex(real re,real im) { z.x=re; z.y=im; };
+  accelerator_inline GpuComplex(Real re,Real im) { z.x=re; z.y=im; };
   accelerator_inline GpuComplex(const GpuComplex &zz) { z = zz.z;};
+  accelerator_inline Real real(void) const { return z.x; };
+  accelerator_inline Real imag(void) const { return z.y; };
+  accelerator_inline GpuComplex &operator*=(const GpuComplex &r) {
+    *this = (*this) * r;
+    return *this;
+  }
+  accelerator_inline GpuComplex &operator+=(const GpuComplex &r) {
+    *this = (*this) + r;
+    return *this;
+  }
+  accelerator_inline GpuComplex &operator-=(const GpuComplex &r) {
+    *this = (*this) - r;
+    return *this;
+  }
   friend accelerator_inline GpuComplex operator+(const GpuComplex &lhs,const GpuComplex &rhs) {
     GpuComplex r ;
     r.z.x = lhs.z.x + rhs.z.x;
@@ -157,6 +171,11 @@ typedef GpuVector<NSIMD_RealD, double > GpuVectorRD;
 typedef GpuVector<NSIMD_ComplexD, GpuComplexD > GpuVectorCD;
 typedef GpuVector<NSIMD_Integer, Integer > GpuVectorI;
 
+accelerator_inline GpuComplexF timesI(const GpuComplexF &r) { return(GpuComplexF(-r.imag(),r.real()));}
+accelerator_inline GpuComplexD timesI(const GpuComplexD &r) { return(GpuComplexD(-r.imag(),r.real()));}
+accelerator_inline GpuComplexF timesMinusI(const GpuComplexF &r){ return(GpuComplexF(r.imag(),-r.real()));}
+accelerator_inline GpuComplexD timesMinusI(const GpuComplexD &r){ return(GpuComplexD(r.imag(),-r.real()));}
+
 accelerator_inline float half2float(half h)
 {
   float f;
diff --git a/Grid/simd/Simd.h b/Grid/simd/Simd.h
index 1dc86c1b..76ca3bef 100644
--- a/Grid/simd/Simd.h
+++ b/Grid/simd/Simd.h
@@ -148,10 +148,14 @@ accelerator_inline void sub (ComplexF * __restrict__ y,const ComplexF * __restri
 accelerator_inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
 
 //conjugate already supported for complex
-accelerator_inline ComplexF timesI(const ComplexF &r) { return(r*ComplexF(0.0,1.0));}
-accelerator_inline ComplexD timesI(const ComplexD &r) { return(r*ComplexD(0.0,1.0));}
-accelerator_inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
-accelerator_inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}
+accelerator_inline ComplexF timesI(const ComplexF &r) { return(ComplexF(-r.imag(),r.real()));}
+accelerator_inline ComplexD timesI(const ComplexD &r) { return(ComplexD(-r.imag(),r.real()));}
+accelerator_inline ComplexF timesMinusI(const ComplexF &r){ return(ComplexF(r.imag(),-r.real()));}
+accelerator_inline ComplexD timesMinusI(const ComplexD &r){ return(ComplexD(r.imag(),-r.real()));}
+//accelerator_inline ComplexF timesI(const ComplexF &r) { return(r*ComplexF(0.0,1.0));}
+//accelerator_inline ComplexD timesI(const ComplexD &r) { return(r*ComplexD(0.0,1.0));}
+//accelerator_inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
+//accelerator_inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}
 
 // define projections to real and imaginay parts
 accelerator_inline ComplexF projReal(const ComplexF &r){return( ComplexF(r.real(), 0.0));}
diff --git a/Grid/tensors/Tensor_SIMT.h b/Grid/tensors/Tensor_SIMT.h
index ec57a679..ede24fbe 100644
--- a/Grid/tensors/Tensor_SIMT.h
+++ b/Grid/tensors/Tensor_SIMT.h
@@ -64,6 +64,68 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__
 }
 #else
+
+#ifndef GRID_SYCL
+// Use the scalar as our own complex on GPU
+template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+typename vsimd::scalar_type
+coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{
+  typedef typename vsimd::scalar_type S;
+  S * __restrict__ p=(S *)&vec;
+  return p[lane];
+}
+template<int ptype,class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+typename vsimd::scalar_type
+coalescedReadPermute(const vsimd & __restrict__ vec,int doperm,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{
+  typedef typename vsimd::scalar_type S;
+
+  S * __restrict__ p=(S *)&vec;
+  int mask = vsimd::Nsimd() >> (ptype + 1);
+  int plane= doperm ? lane ^ mask : lane;
+  return p[plane];
+}
+template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+void coalescedWrite(vsimd & __restrict__ vec,
+                    const typename vsimd::scalar_type & __restrict__ extracted,
+                    int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{
+  typedef typename vsimd::scalar_type S;
+  S * __restrict__ p=(S *)&vec;
+  p[lane]=extracted;
+}
+#else
+template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+typename vsimd::vector_type::datum
+coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{
+  typedef typename vsimd::vector_type::datum S;
+  S * __restrict__ p=(S *)&vec;
+  return p[lane];
+}
+template<int ptype,class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+typename vsimd::vector_type::datum
+coalescedReadPermute(const vsimd & __restrict__ vec,int doperm,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{
+  typedef typename vsimd::vector_type::datum S;
+
+  S * __restrict__ p=(S *)&vec;
+  int mask = vsimd::Nsimd() >> (ptype + 1);
+  int plane= doperm ? lane ^ mask : lane;
+  return p[plane];
+}
+template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+void coalescedWrite(vsimd & __restrict__ vec,
+                    const typename vsimd::vector_type::datum & __restrict__ extracted,
+                    int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{
+  typedef typename vsimd::vector_type::datum S;
+  S * __restrict__ p=(S *)&vec;
+  p[lane]=extracted;
+}
+#endif
+
 //////////////////////////////////////////
 // Extract and insert slices on the GPU
 //////////////////////////////////////////
diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 59645546..2b7bf53a 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -104,7 +104,7 @@ extern int acceleratorAbortOnGpuError;
 accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #ifdef GRID_SIMT
-  return threadIdx.z;
+  return threadIdx.x;
 #else
  return 0;
 #endif
 }
@@ -112,28 +112,67 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 
 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
   { \
+    int nt=acceleratorThreads(); \
     typedef uint64_t Iterator; \
     auto lambda = [=] accelerator \
       (Iterator iter1,Iterator iter2,Iterator lane) mutable { \
       __VA_ARGS__; \
     }; \
-    int nt=acceleratorThreads(); \
-    dim3 cu_threads(acceleratorThreads(),1,nsimd); \
+    dim3 cu_threads(nsimd,acceleratorThreads(),1); \
     dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
     LambdaApply<<<cu_blocks,cu_threads>>>(num1,num2,nsimd,lambda); \
   }
 
+#define accelerator_for6dNB(iter1, num1, \
+                            iter2, num2, \
+                            iter3, num3, \
+                            iter4, num4, \
+                            iter5, num5, \
+                            iter6, num6, ... ) \
+  { \
+    typedef uint64_t Iterator; \
+    auto lambda = [=] accelerator \
+      (Iterator iter1,Iterator iter2, \
+       Iterator iter3,Iterator iter4, \
+       Iterator iter5,Iterator iter6) mutable { \
+      __VA_ARGS__; \
+    }; \
+    dim3 cu_blocks (num1,num2,num3); \
+    dim3 cu_threads(num4,num5,num6); \
+    Lambda6Apply<<<cu_blocks,cu_threads>>>(num1,num2,num3,num4,num5,num6,lambda); \
+  }
+
 template<typename lambda>  __global__
 void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
 {
-  uint64_t x = threadIdx.x + blockDim.x*blockIdx.x;
-  uint64_t y = threadIdx.y + blockDim.y*blockIdx.y;
-  uint64_t z = threadIdx.z;
+  // Weird permute is to make lane coalesce for large blocks
+  uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
+  uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
+  uint64_t z = threadIdx.x;
   if ( (x < num1) && (y<num2) && (z<num3) ) {
     Lambda(x,y,z);
   }
 }
 
+template<typename lambda>  __global__
+void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
+                  uint64_t num4, uint64_t num5, uint64_t num6,
+                  lambda Lambda)
+{
+  uint64_t iter1 = blockIdx.x;
+  uint64_t iter2 = blockIdx.y;
+  uint64_t iter3 = blockIdx.z;
+  uint64_t iter4 = threadIdx.x;
+  uint64_t iter5 = threadIdx.y;
+  uint64_t iter6 = threadIdx.z;
+
+  if ( (iter1 < num1) && (iter2<num2) && (iter3<num3)
+    && (iter4<num4) && (iter5<num5) && (iter6<num6) )
+  {
+    Lambda(iter1,iter2,iter3,iter4,iter5,iter6);
+  }
+}
@@ ... @@
     cl::sycl::range<3> global{unum1,unum2,nsimd}; \
     cgh.parallel_for( \
       cl::sycl::nd_range<3>(global,local), \
-      [=] (cl::sycl::nd_item<3> item) mutable { \
+      [=] (cl::sycl::nd_item<3> item) /*mutable*/ { \
       auto iter1    = item.get_global_id(0); \
       auto iter2    = item.get_global_id(1); \
       auto lane     = item.get_global_id(2); \