mirror of https://github.com/paboyle/Grid.git (synced 2024-11-10 07:55:35 +00:00)

Better SIMD usage/coalescence

commit f9b1f240f6, parent 1ac13ec3a7
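For context, "coalescence" here means arranging threads so that adjacent SIMT lanes touch adjacent words, letting the GPU merge a warp's loads into a few wide transactions. A minimal sketch of the idea (my own illustrative kernels, not from this commit, assuming a site-major layout with nlane SIMD lanes per site):

#include <cstdint>

// Lane in threadIdx.x: adjacent threads of a warp read adjacent words,
// which the hardware merges into a few wide transactions.
__global__ void readLaneFastest(const double *in, double *out,
                                uint64_t nsite, int nlane)
{
  int lane      = threadIdx.x;
  uint64_t site = threadIdx.y + (uint64_t)blockDim.y * blockIdx.x;
  if (site < nsite && lane < nlane)
    out[site*nlane + lane] = in[site*nlane + lane];
}

// Lane in threadIdx.z (the old scheme): adjacent threads step through
// memory with stride nlane, so each warp touches scattered cache lines.
__global__ void readLaneSlowest(const double *in, double *out,
                                uint64_t nsite, int nlane)
{
  uint64_t site = threadIdx.x + (uint64_t)blockDim.x * blockIdx.x;
  int lane      = threadIdx.z;
  if (site < nsite && lane < nlane)
    out[site*nlane + lane] = in[site*nlane + lane];
}
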
@@ -67,9 +67,14 @@ public:
   accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
 #endif
 
+#if 1
+  // accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
+  accelerator_inline vobj & operator[](size_t i) const { return this->_odata[i]; };
+#else
   accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
   accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
 
+#endif
 
   accelerator_inline uint64_t begin(void) const { return 0;};
   accelerator_inline uint64_t end(void)   const { return this->_odata_size; };
   accelerator_inline uint64_t size(void)  const { return this->_odata_size; };
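The #if 1 branch replaces the const/non-const overload pair with a single accessor that is const-qualified yet returns a mutable reference. That is legal because the view stores only a raw pointer, so the const-ness of the view does not propagate to the pointed-to data; one accessor can then serve both read and write sites inside accelerator lambdas, where the captured view is const. A sketch of the pattern with simplified names (not the real Grid class):

#include <cstddef>

template<class vobj>
struct ViewSketch {
  vobj  *_odata;
  size_t _odata_size;
  // const method, non-const result: const applies to the pointer,
  // not to the data it points at
  vobj & operator[](size_t i) const { return _odata[i]; }
  size_t size() const { return _odata_size; }
};
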
@@ -60,11 +60,25 @@ template<class pair>
 class GpuComplex {
 public:
   pair z;
-  typedef decltype(z.x) real;
+  typedef decltype(z.x) Real;
 public:
   accelerator_inline GpuComplex() = default;
-  accelerator_inline GpuComplex(real re,real im) { z.x=re; z.y=im; };
+  accelerator_inline GpuComplex(Real re,Real im) { z.x=re; z.y=im; };
+  accelerator_inline GpuComplex(const GpuComplex &zz) { z = zz.z;};
+  accelerator_inline Real real(void) const { return z.x; };
+  accelerator_inline Real imag(void) const { return z.y; };
+  accelerator_inline GpuComplex &operator*=(const GpuComplex &r) {
+    *this = (*this) * r;
+    return *this;
+  }
+  accelerator_inline GpuComplex &operator+=(const GpuComplex &r) {
+    *this = (*this) + r;
+    return *this;
+  }
+  accelerator_inline GpuComplex &operator-=(const GpuComplex &r) {
+    *this = (*this) - r;
+    return *this;
+  }
   friend accelerator_inline GpuComplex operator+(const GpuComplex &lhs,const GpuComplex &rhs) {
     GpuComplex r ;
     r.z.x = lhs.z.x + rhs.z.x;
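The new compound assignments delegate to the friend operators so each piece of arithmetic is written once. A host-side analogue of the pattern (a simplified stand-in for the CUDA pair types such as double2, not the real Grid class):

#include <cstdio>

struct pairD { double x, y; };   // stand-in for e.g. CUDA's double2

struct GpuComplexSketch {
  pairD z;
  GpuComplexSketch() = default;
  GpuComplexSketch(double re, double im) { z.x = re; z.y = im; }
  double real() const { return z.x; }
  double imag() const { return z.y; }
  friend GpuComplexSketch operator+(const GpuComplexSketch &l,
                                    const GpuComplexSketch &r) {
    return GpuComplexSketch(l.z.x + r.z.x, l.z.y + r.z.y);
  }
  // compound assignment delegates to the binary operator, as in the diff
  GpuComplexSketch &operator+=(const GpuComplexSketch &r) {
    *this = *this + r;
    return *this;
  }
};

int main() {
  GpuComplexSketch a(1, 2), b(3, 4);
  a += b;
  std::printf("%g %g\n", a.real(), a.imag()); // prints: 4 6
}
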
@@ -157,6 +171,11 @@ typedef GpuVector<NSIMD_RealD, double > GpuVectorRD;
 typedef GpuVector<NSIMD_ComplexD, GpuComplexD > GpuVectorCD;
 typedef GpuVector<NSIMD_Integer, Integer > GpuVectorI;
 
+accelerator_inline GpuComplexF timesI(const GpuComplexF &r)     { return(GpuComplexF(-r.imag(),r.real()));}
+accelerator_inline GpuComplexD timesI(const GpuComplexD &r)     { return(GpuComplexD(-r.imag(),r.real()));}
+accelerator_inline GpuComplexF timesMinusI(const GpuComplexF &r){ return(GpuComplexF(r.imag(),-r.real()));}
+accelerator_inline GpuComplexD timesMinusI(const GpuComplexD &r){ return(GpuComplexD(r.imag(),-r.real()));}
+
 accelerator_inline float half2float(half h)
 {
   float f;
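These bodies are just multiplication by plus or minus i written out: for r = a + ib,

  i*(a + ib)  = ia + i^2*b = -b + ia   ->  GpuComplex(-r.imag(),  r.real())
  -i*(a + ib) = -ia + b    =  b - ia   ->  GpuComplex( r.imag(), -r.real())

so timesI and timesMinusI need no multiplies at all, only a component swap and a sign flip.
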
@@ -148,10 +148,14 @@ accelerator_inline void sub (ComplexF * __restrict__ y,const ComplexF * __restri
 accelerator_inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
 
 //conjugate already supported for complex
-accelerator_inline ComplexF timesI(const ComplexF &r)     { return(r*ComplexF(0.0,1.0));}
-accelerator_inline ComplexD timesI(const ComplexD &r)     { return(r*ComplexD(0.0,1.0));}
-accelerator_inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
-accelerator_inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}
+accelerator_inline ComplexF timesI(const ComplexF &r)     { return(ComplexF(-r.imag(),r.real()));}
+accelerator_inline ComplexD timesI(const ComplexD &r)     { return(ComplexD(-r.imag(),r.real()));}
+accelerator_inline ComplexF timesMinusI(const ComplexF &r){ return(ComplexF(r.imag(),-r.real()));}
+accelerator_inline ComplexD timesMinusI(const ComplexD &r){ return(ComplexD(r.imag(),-r.real()));}
+//accelerator_inline ComplexF timesI(const ComplexF &r)     { return(r*ComplexF(0.0,1.0));}
+//accelerator_inline ComplexD timesI(const ComplexD &r)     { return(r*ComplexD(0.0,1.0));}
+//accelerator_inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
+//accelerator_inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}
 
 // define projections to real and imaginary parts
 accelerator_inline ComplexF projReal(const ComplexF &r){return( ComplexF(r.real(), 0.0));}
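As derived above for GpuComplex, the new bodies are algebraically identical to the old ones but replace a full complex multiply (four real multiplies plus two adds in a typical std::complex implementation) with a component swap and a sign flip. A quick host check that both forms agree (my own test, not part of the commit; the comparison is exact in IEEE arithmetic for these finite inputs):

#include <complex>
#include <cassert>

int main() {
  using C = std::complex<float>;
  C r(1.5f, -2.25f);
  // old form: multiply by (0, +/-1); new form: swap + negate
  assert(r * C(0.0, 1.0)  == C(-r.imag(),  r.real()));  // timesI
  assert(r * C(0.0, -1.0) == C( r.imag(), -r.real()));  // timesMinusI
}
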
@@ -64,6 +64,68 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__
 }
 #else
 
+#ifndef GRID_SYCL
+// Use the scalar as our own complex on GPU
+template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+typename vsimd::scalar_type
+coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{
+  typedef typename vsimd::scalar_type S;
+  S * __restrict__ p=(S *)&vec;
+  return p[lane];
+}
+template<int ptype,class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+typename vsimd::scalar_type
+coalescedReadPermute(const vsimd & __restrict__ vec,int doperm,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{
+  typedef typename vsimd::scalar_type S;
+
+  S * __restrict__ p=(S *)&vec;
+  int mask = vsimd::Nsimd() >> (ptype + 1);
+  int plane= doperm ? lane ^ mask : lane;
+  return p[plane];
+}
+template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+void coalescedWrite(vsimd & __restrict__ vec,
+                    const typename vsimd::scalar_type & __restrict__ extracted,
+                    int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{
+  typedef typename vsimd::scalar_type S;
+  S * __restrict__ p=(S *)&vec;
+  p[lane]=extracted;
+}
+#else
+template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+typename vsimd::vector_type::datum
+coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{
+  typedef typename vsimd::vector_type::datum S;
+  S * __restrict__ p=(S *)&vec;
+  return p[lane];
+}
+template<int ptype,class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+typename vsimd::vector_type::datum
+coalescedReadPermute(const vsimd & __restrict__ vec,int doperm,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{
+  typedef typename vsimd::vector_type::datum S;
+
+  S * __restrict__ p=(S *)&vec;
+  int mask = vsimd::Nsimd() >> (ptype + 1);
+  int plane= doperm ? lane ^ mask : lane;
+  return p[plane];
+}
+template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+void coalescedWrite(vsimd & __restrict__ vec,
+                    const typename vsimd::vector_type::datum & __restrict__ extracted,
+                    int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{
+  typedef typename vsimd::vector_type::datum S;
+  S * __restrict__ p=(S *)&vec;
+  p[lane]=extracted;
+}
+#endif
 
 //////////////////////////////////////////
 // Extract and insert slices on the GPU
 //////////////////////////////////////////
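The XOR in coalescedReadPermute reproduces, per lane, the exchange a CPU vector permute performs: mask = Nsimd >> (ptype+1) selects the block size to exchange, and lane ^ mask is the partner lane. A host-side table of the resulting pairings (my own illustration):

#include <cstdio>

int main() {
  const int Nsimd = 8;
  for (int ptype = 0; ptype < 3; ptype++) {
    int mask = Nsimd >> (ptype + 1);   // 4, 2, 1
    std::printf("ptype=%d mask=%d :", ptype, mask);
    for (int lane = 0; lane < Nsimd; lane++)
      std::printf(" %d->%d", lane, lane ^ mask);
    std::printf("\n");
  }
}
// ptype=0 swaps halves (0<->4, 1<->5, ...), ptype=1 swaps within halves,
// ptype=2 swaps neighbours -- the same exchanges the vector permute
// instructions perform on CPU, here done one lane at a time on GPU.
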
@@ -104,7 +104,7 @@ extern int acceleratorAbortOnGpuError;
 
 accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #ifdef GRID_SIMT
-  return threadIdx.z;
+  return threadIdx.x;
 #else
   return 0;
 #endif
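Moving the SIMT lane from threadIdx.z to threadIdx.x makes the lane the fastest-varying index within a warp, so the p[lane] accesses in coalescedRead/coalescedWrite above land on consecutive words. A small host model of one warp's addresses (my sketch, assuming nsimd = 8 lanes and a lane-fastest layout):

#include <cstdio>

int main() {
  const int nsimd = 8, warp = 32;
  for (int t = 0; t < warp; t++) {
    int lane = t % nsimd;          // threadIdx.x component
    int site = t / nsimd;          // threadIdx.y component within the block
    std::printf("thread %2d -> word %2d\n", t, site * nsimd + lane);
  }
  // Words 0..31 in order: one fully coalesced transaction per warp.
  // With lane = threadIdx.z, all 32 threads would instead hit lane 0 of
  // 32 different sites: stride-nsimd accesses.
}
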
@@ -112,28 +112,67 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 
 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
   { \
-    int nt=acceleratorThreads(); \
     typedef uint64_t Iterator; \
     auto lambda = [=] accelerator \
       (Iterator iter1,Iterator iter2,Iterator lane) mutable { \
       __VA_ARGS__; \
     }; \
+    int nt=acceleratorThreads(); \
-    dim3 cu_threads(acceleratorThreads(),1,nsimd); \
+    dim3 cu_threads(nsimd,acceleratorThreads(),1); \
     dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
     LambdaApply<<<cu_blocks,cu_threads>>>(num1,num2,nsimd,lambda); \
   }
 
+#define accelerator_for6dNB(iter1, num1, \
+                            iter2, num2, \
+                            iter3, num3, \
+                            iter4, num4, \
+                            iter5, num5, \
+                            iter6, num6, ... ) \
+  { \
+    typedef uint64_t Iterator; \
+    auto lambda = [=] accelerator \
+      (Iterator iter1,Iterator iter2, \
+       Iterator iter3,Iterator iter4, \
+       Iterator iter5,Iterator iter6) mutable { \
+      __VA_ARGS__; \
+    }; \
+    dim3 cu_blocks (num1,num2,num3); \
+    dim3 cu_threads(num4,num5,num6); \
+    Lambda6Apply<<<cu_blocks,cu_threads>>>(num1,num2,num3,num4,num5,num6,lambda); \
+  }
+
 template<typename lambda> __global__
 void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
 {
-  uint64_t x = threadIdx.x + blockDim.x*blockIdx.x;
-  uint64_t y = threadIdx.y + blockDim.y*blockIdx.y;
-  uint64_t z = threadIdx.z;
+  // Weird permute is to make lane coalesce for large blocks
+  uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
+  uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
+  uint64_t z = threadIdx.x;
   if ( (x < num1) && (y<num2) && (z<num3) ) {
     Lambda(x,y,z);
   }
 }
 
+template<typename lambda> __global__
+void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
+                  uint64_t num4, uint64_t num5, uint64_t num6,
+                  lambda Lambda)
+{
+  uint64_t iter1 = blockIdx.x;
+  uint64_t iter2 = blockIdx.y;
+  uint64_t iter3 = blockIdx.z;
+  uint64_t iter4 = threadIdx.x;
+  uint64_t iter5 = threadIdx.y;
+  uint64_t iter6 = threadIdx.z;
+
+  if ( (iter1 < num1) && (iter2<num2) && (iter3<num3)
+    && (iter4 < num4) && (iter5<num5) && (iter6<num6) )
+  {
+    Lambda(iter1,iter2,iter3,iter4,iter5,iter6);
+  }
+}
+
 #define accelerator_barrier(dummy) \
   { \
     cudaDeviceSynchronize(); \
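The permuted launch geometry in accelerator_for2dNB and the permuted index recovery in LambdaApply have to agree. A host-side re-enactment (my own check, small sizes) that every (iter1, iter2, lane) triple is generated exactly once:

#include <cstdio>
#include <vector>

// Geometry: cu_threads = (nsimd, nt, 1), cu_blocks = ((num1+nt-1)/nt, num2, 1).
// Kernel-side recovery: x = threadIdx.y + blockDim.y*blockIdx.x,
//                       y = threadIdx.z + blockDim.z*blockIdx.y,
//                       z = threadIdx.x (the lane, fastest varying).
int main() {
  const int num1 = 10, num2 = 3, nsimd = 4, nt = 8;
  std::vector<int> hits(num1 * num2 * nsimd, 0);
  int bx = (num1 + nt - 1) / nt, by = num2;      // cu_blocks
  for (int blockIdx_x = 0; blockIdx_x < bx; blockIdx_x++)
  for (int blockIdx_y = 0; blockIdx_y < by; blockIdx_y++)
  for (int tz = 0; tz < 1; tz++)                 // blockDim.z = 1
  for (int ty = 0; ty < nt; ty++)                // blockDim.y = nt
  for (int tx = 0; tx < nsimd; tx++) {           // blockDim.x = nsimd
    int x = ty + nt * blockIdx_x;                // iter1
    int y = tz + 1 * blockIdx_y;                 // iter2
    int z = tx;                                  // lane
    if (x < num1 && y < num2 && z < nsimd)
      hits[(x * num2 + y) * nsimd + z]++;
  }
  for (int h : hits) if (h != 1) { std::puts("BAD"); return 1; }
  std::puts("every (iter1,iter2,lane) covered exactly once");
}
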
@@ -221,7 +260,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
     cl::sycl::range<3> global{unum1,unum2,nsimd}; \
     cgh.parallel_for<class dslash>( \
          cl::sycl::nd_range<3>(global,local), \
-         [=] (cl::sycl::nd_item<3> item) mutable { \
+         [=] (cl::sycl::nd_item<3> item) /*mutable*/ { \
          auto iter1 = item.get_global_id(0); \
          auto iter2 = item.get_global_id(1); \
          auto lane  = item.get_global_id(2); \
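The /*mutable*/ matters because a SYCL-style runtime invokes the kernel functor through a const reference, while a mutable lambda only provides a non-const call operator (my reading of why the keyword had to go; the failure mode illustrated in plain C++, not SYCL code):

int main() {
  auto f = [n = 0]() mutable { return ++n; }; // non-const operator()
  const auto &kernel = f;                     // runtime sees a const functor
  // kernel();  // error: the const functor has no callable operator()
  (void)kernel;
}
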