Mirror of https://github.com/paboyle/Grid.git (synced 2024-11-10 07:55:35 +00:00)

Commit 1059a81a3c: Merge branch 'develop' into bugfix/LatTransfer

* develop:
  Better SIMD usage/coalescence
@@ -67,9 +67,14 @@ public:
   accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
 #endif
 
+#if 1
+  // accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
+  accelerator_inline vobj & operator[](size_t i) const { return this->_odata[i]; };
+#else
   accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
   accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
+#endif
 
   accelerator_inline uint64_t begin(void) const { return 0;};
   accelerator_inline uint64_t end(void) const { return this->_odata_size; };
   accelerator_inline uint64_t size(void) const { return this->_odata_size; };
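For orientation, a minimal standalone sketch of the view idiom behind the #if 1 branch above: the view object can be const because it only holds a pointer, so a const-qualified operator[] can still hand out a mutable reference to the element. ToyView is an illustrative name, not Grid's accessor class.

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Illustrative only: a "view" owns no storage, just a pointer, so a
    // const view can still expose mutable element access, mirroring the
    // operator[] signature chosen in the #if 1 branch.
    template<class vobj>
    struct ToyView {
      vobj  *_odata;
      size_t _odata_size;
      vobj & operator[](size_t i) const { return _odata[i]; } // const view, mutable element
      size_t size(void) const { return _odata_size; }
    };

    int main(void) {
      std::vector<double> data(8, 1.0);
      const ToyView<double> v{data.data(), data.size()};
      v[3] = 42.0;                 // allowed: constness of the view, not of the data
      assert(data[3] == 42.0);
      return 0;
    }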
@@ -60,11 +60,25 @@ template<class pair>
 class GpuComplex {
 public:
   pair z;
-  typedef decltype(z.x) real;
+  typedef decltype(z.x) Real;
 public:
   accelerator_inline GpuComplex() = default;
-  accelerator_inline GpuComplex(real re,real im) { z.x=re; z.y=im; };
+  accelerator_inline GpuComplex(Real re,Real im) { z.x=re; z.y=im; };
   accelerator_inline GpuComplex(const GpuComplex &zz) { z = zz.z;};
+  accelerator_inline Real real(void) const { return z.x; };
+  accelerator_inline Real imag(void) const { return z.y; };
+  accelerator_inline GpuComplex &operator*=(const GpuComplex &r) {
+    *this = (*this) * r;
+    return *this;
+  }
+  accelerator_inline GpuComplex &operator+=(const GpuComplex &r) {
+    *this = (*this) + r;
+    return *this;
+  }
+  accelerator_inline GpuComplex &operator-=(const GpuComplex &r) {
+    *this = (*this) - r;
+    return *this;
+  }
   friend accelerator_inline GpuComplex operator+(const GpuComplex &lhs,const GpuComplex &rhs) {
     GpuComplex r ;
     r.z.x = lhs.z.x + rhs.z.x;
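The added real()/imag() accessors and compound assignments all delegate to the corresponding binary operator. A host-side sketch with made-up names (MyGpuComplex, pairD); operator* is defined locally only so the sketch is self-contained, the Grid header defines the full operator set elsewhere:

    #include <cstdio>

    struct pairD { double x, y; };           // stand-in for the 'pair' template parameter

    struct MyGpuComplex {
      pairD z;
      typedef decltype(z.x) Real;
      MyGpuComplex() = default;
      MyGpuComplex(Real re, Real im) { z.x = re; z.y = im; }
      Real real(void) const { return z.x; }
      Real imag(void) const { return z.y; }
      friend MyGpuComplex operator*(const MyGpuComplex &l, const MyGpuComplex &r) {
        return MyGpuComplex(l.z.x*r.z.x - l.z.y*r.z.y,
                            l.z.x*r.z.y + l.z.y*r.z.x);
      }
      // Same pattern as the commit: compound assignment delegates to the binary op.
      MyGpuComplex &operator*=(const MyGpuComplex &r) { *this = (*this) * r; return *this; }
    };

    int main(void) {
      MyGpuComplex a(1.0, 2.0), i(0.0, 1.0);
      a *= i;                                          // (1+2i)*i = -2+i
      std::printf("%g %g\n", a.real(), a.imag());      // prints: -2 1
      return 0;
    }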
@@ -157,6 +171,11 @@ typedef GpuVector<NSIMD_RealD, double > GpuVectorRD;
 typedef GpuVector<NSIMD_ComplexD, GpuComplexD > GpuVectorCD;
 typedef GpuVector<NSIMD_Integer, Integer > GpuVectorI;
 
+accelerator_inline GpuComplexF timesI(const GpuComplexF &r)     { return(GpuComplexF(-r.imag(),r.real()));}
+accelerator_inline GpuComplexD timesI(const GpuComplexD &r)     { return(GpuComplexD(-r.imag(),r.real()));}
+accelerator_inline GpuComplexF timesMinusI(const GpuComplexF &r){ return(GpuComplexF(r.imag(),-r.real()));}
+accelerator_inline GpuComplexD timesMinusI(const GpuComplexD &r){ return(GpuComplexD(r.imag(),-r.real()));}
+
 accelerator_inline float half2float(half h)
 {
   float f;
@@ -148,10 +148,14 @@ accelerator_inline void sub (ComplexF * __restrict__ y,const ComplexF * __restri
 accelerator_inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
 
 //conjugate already supported for complex
-accelerator_inline ComplexF timesI(const ComplexF &r)     { return(r*ComplexF(0.0,1.0));}
-accelerator_inline ComplexD timesI(const ComplexD &r)     { return(r*ComplexD(0.0,1.0));}
-accelerator_inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
-accelerator_inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}
+accelerator_inline ComplexF timesI(const ComplexF &r)     { return(ComplexF(-r.imag(),r.real()));}
+accelerator_inline ComplexD timesI(const ComplexD &r)     { return(ComplexD(-r.imag(),r.real()));}
+accelerator_inline ComplexF timesMinusI(const ComplexF &r){ return(ComplexF(r.imag(),-r.real()));}
+accelerator_inline ComplexD timesMinusI(const ComplexD &r){ return(ComplexD(r.imag(),-r.real()));}
+//accelerator_inline ComplexF timesI(const ComplexF &r)     { return(r*ComplexF(0.0,1.0));}
+//accelerator_inline ComplexD timesI(const ComplexD &r)     { return(r*ComplexD(0.0,1.0));}
+//accelerator_inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
+//accelerator_inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}
 
 // define projections to real and imaginay parts
 accelerator_inline ComplexF projReal(const ComplexF &r){return( ComplexF(r.real(), 0.0));}
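Both the GpuComplex overloads in the previous hunk and the scalar overloads here replace a general complex multiply with a component swap plus sign flip, using i*(a+bi) = -b+ai and -i*(a+bi) = b-ai. A quick host-side check of that identity; std::complex<double> stands in for the ComplexF/ComplexD typedefs:

    #include <cassert>
    #include <complex>

    int main(void) {
      using C = std::complex<double>;
      const C z(3.0, -4.0);

      const C mul_i   = z * C(0.0, 1.0);           // old form: full complex multiply
      const C swap_i  = C(-z.imag(), z.real());    // new form: timesI
      const C mul_mi  = z * C(0.0, -1.0);
      const C swap_mi = C(z.imag(), -z.real());    // new form: timesMinusI

      assert(mul_i  == swap_i);    //  i*(a+bi) == -b + ai
      assert(mul_mi == swap_mi);   // -i*(a+bi) ==  b - ai
      return 0;
    }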
@@ -64,6 +64,68 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__
 }
 #else
 
+
+#ifndef GRID_SYCL
+// Use the scalar as our own complex on GPU
+template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+typename vsimd::scalar_type
+coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{
+  typedef typename vsimd::scalar_type S;
+  S * __restrict__ p=(S *)&vec;
+  return p[lane];
+}
+template<int ptype,class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+typename vsimd::scalar_type
+coalescedReadPermute(const vsimd & __restrict__ vec,int doperm,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{
+  typedef typename vsimd::scalar_type S;
+
+  S * __restrict__ p=(S *)&vec;
+  int mask = vsimd::Nsimd() >> (ptype + 1);
+  int plane= doperm ? lane ^ mask : lane;
+  return p[plane];
+}
+template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+void coalescedWrite(vsimd & __restrict__ vec,
+                    const typename vsimd::scalar_type & __restrict__ extracted,
+                    int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{
+  typedef typename vsimd::scalar_type S;
+  S * __restrict__ p=(S *)&vec;
+  p[lane]=extracted;
+}
+#else
+template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+typename vsimd::vector_type::datum
+coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{
+  typedef typename vsimd::vector_type::datum S;
+  S * __restrict__ p=(S *)&vec;
+  return p[lane];
+}
+template<int ptype,class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+typename vsimd::vector_type::datum
+coalescedReadPermute(const vsimd & __restrict__ vec,int doperm,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{
+  typedef typename vsimd::vector_type::datum S;
+
+  S * __restrict__ p=(S *)&vec;
+  int mask = vsimd::Nsimd() >> (ptype + 1);
+  int plane= doperm ? lane ^ mask : lane;
+  return p[plane];
+}
+template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+void coalescedWrite(vsimd & __restrict__ vec,
+                    const typename vsimd::vector_type::datum & __restrict__ extracted,
+                    int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{
+  typedef typename vsimd::vector_type::datum S;
+  S * __restrict__ p=(S *)&vec;
+  p[lane]=extracted;
+}
+#endif
+
 //////////////////////////////////////////
 // Extract and insert slices on the GPU
 //////////////////////////////////////////
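A self-contained CUDA sketch of what the non-SYCL branch above does, with the Grid machinery stripped out (ToySimd, Nsimd and scaleKernel are illustrative, not Grid API): each thread reads and writes one scalar lane of a packed SIMD word, so neighbouring threads in a warp touch neighbouring addresses.

    #include <cuda_runtime.h>

    // Stand-in for a Grid SIMD word: Nsimd packed scalars.
    constexpr int Nsimd = 32;
    struct ToySimd { float v[Nsimd]; };

    __device__ inline float toyCoalescedRead(const ToySimd &vec, int lane) {
      const float *p = (const float *)&vec;   // view the SIMD word as scalars
      return p[lane];                         // adjacent lanes -> adjacent addresses
    }
    __device__ inline void toyCoalescedWrite(ToySimd &vec, float x, int lane) {
      float *p = (float *)&vec;
      p[lane] = x;
    }

    // One block per SIMD word; threadIdx.x plays the role of acceleratorSIMTlane().
    __global__ void scaleKernel(ToySimd *data, float s) {
      int lane = threadIdx.x;
      float x  = toyCoalescedRead(data[blockIdx.x], lane);
      toyCoalescedWrite(data[blockIdx.x], s * x, lane);
    }

    int main(void) {
      ToySimd *d;
      cudaMalloc((void **)&d, 4 * sizeof(ToySimd));
      cudaMemset(d, 0, 4 * sizeof(ToySimd));
      scaleKernel<<<4, Nsimd>>>(d, 2.0f);
      cudaDeviceSynchronize();
      cudaFree(d);
      return 0;
    }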
@@ -104,7 +104,7 @@ extern int acceleratorAbortOnGpuError;
 
 accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #ifdef GRID_SIMT
-  return threadIdx.z;
+  return threadIdx.x;
 #else
   return 0;
 #endif
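threadIdx.x is the fastest-varying thread coordinate within a warp, so using it as the SIMD lane (together with the lane-indexed reads above) makes the addresses of neighbouring threads consecutive. A small host-side illustration of the two address patterns; warp and SIMD widths are assumed values, and the old pattern assumes the site index advanced with consecutive threads, as in the previous launch shape:

    #include <cstdio>

    int main(void) {
      const int warp  = 32;   // consecutive threads issued together
      const int Nsimd = 32;   // scalars per SIMD word
      for (int t = 0; t < warp; t++) {
        // lane = threadIdx.x (new): neighbouring threads read neighbouring lanes
        // of ONE SIMD word -> offsets 0,4,8,... one contiguous transaction.
        int newOffset = t * (int)sizeof(float);
        // lane = threadIdx.z (old): neighbouring threads read the SAME lane of
        // DIFFERENT SIMD words -> offsets strided by the word size, many transactions.
        int oldOffset = t * Nsimd * (int)sizeof(float);
        std::printf("thread %2d   new: %4d B   old: %5d B\n", t, newOffset, oldOffset);
      }
      return 0;
    }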
@@ -112,28 +112,67 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 
 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
   {                                                                 \
+    int nt=acceleratorThreads();                                    \
     typedef uint64_t Iterator;                                      \
     auto lambda = [=] accelerator                                   \
       (Iterator iter1,Iterator iter2,Iterator lane) mutable {       \
       __VA_ARGS__;                                                  \
     };                                                              \
-    int nt=acceleratorThreads();                                    \
-    dim3 cu_threads(acceleratorThreads(),1,nsimd);                  \
+    dim3 cu_threads(nsimd,acceleratorThreads(),1);                  \
     dim3 cu_blocks ((num1+nt-1)/nt,num2,1);                         \
     LambdaApply<<<cu_blocks,cu_threads>>>(num1,num2,nsimd,lambda);  \
   }
 
+#define accelerator_for6dNB(iter1, num1,                            \
+                            iter2, num2,                            \
+                            iter3, num3,                            \
+                            iter4, num4,                            \
+                            iter5, num5,                            \
+                            iter6, num6, ... )                      \
+  {                                                                 \
+    typedef uint64_t Iterator;                                      \
+    auto lambda = [=] accelerator                                   \
+      (Iterator iter1,Iterator iter2,                               \
+       Iterator iter3,Iterator iter4,                               \
+       Iterator iter5,Iterator iter6) mutable {                     \
+      __VA_ARGS__;                                                  \
+    };                                                              \
+    dim3 cu_blocks (num1,num2,num3);                                \
+    dim3 cu_threads(num4,num5,num6);                                \
+    Lambda6Apply<<<cu_blocks,cu_threads>>>(num1,num2,num3,num4,num5,num6,lambda); \
+  }
+
 template<typename lambda>  __global__
 void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
 {
-  uint64_t x = threadIdx.x + blockDim.x*blockIdx.x;
-  uint64_t y = threadIdx.y + blockDim.y*blockIdx.y;
-  uint64_t z = threadIdx.z;
+  // Weird permute is to make lane coalesce for large blocks
+  uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
+  uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
+  uint64_t z = threadIdx.x;
   if ( (x < num1) && (y<num2) && (z<num3) ) {
     Lambda(x,y,z);
   }
 }
 
+template<typename lambda>  __global__
+void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
+                  uint64_t num4, uint64_t num5, uint64_t num6,
+                  lambda Lambda)
+{
+  uint64_t iter1 = blockIdx.x;
+  uint64_t iter2 = blockIdx.y;
+  uint64_t iter3 = blockIdx.z;
+  uint64_t iter4 = threadIdx.x;
+  uint64_t iter5 = threadIdx.y;
+  uint64_t iter6 = threadIdx.z;
+
+  if ( (iter1 < num1) && (iter2<num2) && (iter3<num3)
+    && (iter4 < num4) && (iter5<num5) && (iter6<num6) )
+  {
+    Lambda(iter1,iter2,iter3,iter4,iter5,iter6);
+  }
+}
+
 #define accelerator_barrier(dummy)                                  \
   {                                                                 \
     cudaDeviceSynchronize();                                        \
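A standalone CUDA sketch of the new launch geometry and the "weird permute" in LambdaApply; nt and nsimd values are made up for the sketch. The block is shaped (nsimd, nt, 1) so that threadIdx.x is the SIMD lane, matching acceleratorSIMTlane(), and the outer iterators are rebuilt from threadIdx.y/.z and blockIdx:

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void mappingKernel(uint64_t num1, uint64_t num2, uint64_t num3) {
      uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;  // outer (site) iterator
      uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;  // second iterator
      uint64_t z = threadIdx.x;                          // SIMD lane: fastest varying
      if ((x < num1) && (y < num2) && (z < num3) && (x == 0) && (y == 0))
        printf("site 0: lane %llu is threadIdx.x=%u\n",
               (unsigned long long)z, threadIdx.x);
    }

    int main(void) {
      const unsigned nsimd = 8, nt = 4;                  // assumed toy values
      uint64_t num1 = 16, num2 = 1, num3 = nsimd;
      dim3 cu_threads(nsimd, nt, 1);                     // new shape: lane innermost
      dim3 cu_blocks((num1 + nt - 1)/nt, num2, 1);
      mappingKernel<<<cu_blocks, cu_threads>>>(num1, num2, num3);
      cudaDeviceSynchronize();
      return 0;
    }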
@@ -221,7 +260,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
      cl::sycl::range<3> global{unum1,unum2,nsimd};                  \
      cgh.parallel_for<class dslash>(                                \
          cl::sycl::nd_range<3>(global,local),                       \
-         [=] (cl::sycl::nd_item<3> item) mutable {                  \
+         [=] (cl::sycl::nd_item<3> item) /*mutable*/ {              \
          auto iter1 = item.get_global_id(0);                        \
          auto iter2 = item.get_global_id(1);                        \
          auto lane  = item.get_global_id(2);                        \