1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-04 19:25:56 +01:00

Coalescing version of the kernel

This commit is contained in:
paboyle 2018-06-13 20:52:29 +01:00
parent 73bb2d5128
commit 6c97a6a071

View File

@ -31,59 +31,150 @@ directory
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////
// Gpu implementation; view code at a premium and less unroll // Gpu implementation; thread loop is implicit
////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////
__host__ __device__ inline void synchronise(void)
#define GPU_STENCIL_LEG_PROJ(Dir,spProj) \ {
#ifdef __CUDA_ARCH__
__syncthreads();
#endif
return;
}
#define GPU_DSLASH_COALESCE
#ifdef GPU_DSLASH_COALESCE
__host__ __device__ inline int get_my_lanes(int Nsimd)
{
#ifdef __CUDA_ARCH__
return 1;
#else
return Nsimd;
#endif
}
__host__ __device__ inline int get_my_lane_offset(int Nsimd)
{
#ifdef __CUDA_ARCH__
return ( (threadIdx.x) % Nsimd);
#else
return 0;
#endif
}
////////////////////////////////////////////////////////////////////////
// Extract/Insert a single lane; do this locally in this file.
// Don't need a global version really.
////////////////////////////////////////////////////////////////////////
template<class vobj> accelerator_inline
typename vobj::scalar_object extractLaneGpu(int lane, const vobj & __restrict__ vec)
{
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
constexpr int words=sizeof(vobj)/sizeof(vector_type);
constexpr int Nsimd=vector_type::Nsimd();
scalar_object extracted;
scalar_type * __restrict__ sp = (scalar_type *)&extracted; // Type pun
scalar_type * __restrict__ vp = (scalar_type *)&vec;
for(int w=0;w<words;w++){
sp[w]=vp[w*Nsimd+lane];
}
return extracted;
}
template<class vobj> accelerator_inline
void insertLaneFloat2(int lane, vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted)
{
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
constexpr int words=sizeof(vobj)/sizeof(vector_type);
constexpr int Nsimd=vector_type::Nsimd();
float2 * __restrict__ sp = (float2 *)&extracted;
float2 * __restrict__ vp = (float2 *)&vec;
for(int w=0;w<words;w++){
vp[w*Nsimd+lane]=sp[w];
}
}
template<class vobj> accelerator_inline
typename vobj::scalar_object extractLaneFloat2(int lane, const vobj & __restrict__ vec)
{
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
constexpr int words=sizeof(vobj)/sizeof(vector_type);
constexpr int Nsimd=vector_type::Nsimd();
scalar_object extracted;
float2 * __restrict__ sp = (float2 *)&extracted; // Type pun
float2 * __restrict__ vp = (float2 *)&vec;
for(int w=0;w<words;w++){
sp[w]=vp[w*Nsimd+lane];
}
return extracted;
}
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj) \
if (SE->_is_local) { \ if (SE->_is_local) { \
spProj(chi, in[SE->_offset]); \ auto in_l = extractLaneGpu(lane,in[SE->_offset]); \
if (SE->_permute) { \ spProj(chi,in_l); \
permute(tmp, chi, ptype); \
chi = tmp; \
} \
} else { \ } else { \
chi = buf[SE->_offset]; \ chi = extractLaneGpu(lane,buf[SE->_offset]); \
} }
#define GPU_STENCIL_LEG_RECON(Recon) Recon(result, Uchi);
// Xp is mu= 0
template <class Impl> template <class Impl>
accelerator void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, accelerator void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
SiteHalfSpinor *buf, int sF, SiteHalfSpinor *buf, int sF,int LLs,
int sU, const FermionFieldView &in, FermionFieldView &out) int sU, const FermionFieldView &in, FermionFieldView &out)
{ {
SiteHalfSpinor tmp; typename SiteHalfSpinor::scalar_object tmp;
SiteHalfSpinor chi; typename SiteHalfSpinor::scalar_object chi;
SiteHalfSpinor Uchi; typename SiteHalfSpinor::scalar_object Uchi;
SiteSpinor result; typename SiteSpinor::scalar_object result;
typedef typename SiteSpinor::scalar_type scalar_type;
typedef typename SiteSpinor::vector_type vector_type;
constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
uint64_t lane_offset= get_my_lane_offset(Nsimd);
uint64_t lanes = get_my_lanes (Nsimd);
StencilEntry *SE; StencilEntry *SE;
int ptype; int ptype;
for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
for(int s=0;s<LLs;s++){
for(int mu=0;mu<2*Nd;mu++) { for(int mu=0;mu<2*Nd;mu++) {
SE = st.GetEntry(ptype, mu, sF); SE = st.GetEntry(ptype, mu, sF);
switch(mu){ switch(mu){
case Xp: case Xp:
GPU_STENCIL_LEG_PROJ(Xp,spProjXp); break; GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXp); break;
case Yp: case Yp:
GPU_STENCIL_LEG_PROJ(Yp,spProjYp); break; GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYp); break;
case Zp: case Zp:
GPU_STENCIL_LEG_PROJ(Zp,spProjZp); break; GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZp); break;
case Tp: case Tp:
GPU_STENCIL_LEG_PROJ(Tp,spProjTp); break; GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTp); break;
case Xm: case Xm:
GPU_STENCIL_LEG_PROJ(Xm,spProjXm); break; GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXm); break;
case Ym: case Ym:
GPU_STENCIL_LEG_PROJ(Ym,spProjYm); break; GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYm); break;
case Zm: case Zm:
GPU_STENCIL_LEG_PROJ(Zm,spProjZm); break; GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZm); break;
case Tm: case Tm:
default: default:
GPU_STENCIL_LEG_PROJ(Tm,spProjTm); break; GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTm); break;
} }
synchronise();
Impl::multLink(Uchi, U[sU], chi, mu, SE, st); auto U_l = extractLaneGpu(lane,U[sU](mu));
Uchi()=U_l*chi();
switch(mu){ switch(mu){
case Xp: case Xp:
@ -104,47 +195,110 @@ accelerator void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGau
default: default:
accumReconTm(result, Uchi); break; accumReconTm(result, Uchi); break;
} }
synchronise();
} }
vstream(out[sF], result); insertLane (lane,out[sF],result);
sF++;
}}
}; };
template <class Impl> template <class Impl>
accelerator void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeFieldView &U, accelerator void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeFieldView &U,
SiteHalfSpinor *buf, int sF, SiteHalfSpinor *buf, int sF,int LLs,
int sU, const FermionFieldView &in, FermionFieldView &out) int sU, const FermionFieldView &in, FermionFieldView &out)
{ {
SiteHalfSpinor tmp; typename SiteHalfSpinor::scalar_object tmp;
SiteHalfSpinor chi; typename SiteHalfSpinor::scalar_object chi;
SiteHalfSpinor Uchi; typename SiteHalfSpinor::scalar_object Uchi;
SiteSpinor result; typename SiteSpinor::scalar_object result;
typedef typename SiteSpinor::scalar_type scalar_type;
typedef typename SiteSpinor::vector_type vector_type;
constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
uint64_t lane_offset= get_my_lane_offset(Nsimd);
uint64_t lanes = get_my_lanes(Nsimd);
// printf("Evaluating site %d Nsimd %d : lanes %ld %ld - %ld\n",sF,Nsimd,lanes,lane_offset,lane_offset+lanes-1);
StencilEntry *SE; StencilEntry *SE;
int ptype; int ptype;
for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
for(int s=0;s<LLs;s++){
#if 1
int mu=0;
SE = st.GetEntry(ptype, mu, sF);
GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXm);
{ auto U_l = extractLaneFloat2(lane,U[sU](mu)); Uchi() = U_l * chi();}
spReconXm(result, Uchi);
mu++; SE = st.GetEntry(ptype, mu, sF);
GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYm);
{ auto U_l = extractLaneFloat2(lane,U[sU](mu)); Uchi() = U_l * chi();}
accumReconYm(result, Uchi);
mu++; SE = st.GetEntry(ptype, mu, sF);
GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZm);
{ auto U_l = extractLaneFloat2(lane,U[sU](mu)); Uchi() = U_l * chi();}
accumReconZm(result, Uchi);
mu++; SE = st.GetEntry(ptype, mu, sF);
GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTm);
{ auto U_l = extractLaneFloat2(lane,U[sU](mu)); Uchi() = U_l * chi();}
accumReconTm(result, Uchi);
mu++; SE = st.GetEntry(ptype, mu, sF);
GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXp);
{ auto U_l = extractLaneFloat2(lane,U[sU](mu)); Uchi() = U_l * chi();}
accumReconXp(result, Uchi);
mu++; SE = st.GetEntry(ptype, mu, sF);
GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYp);
{ auto U_l = extractLaneFloat2(lane,U[sU](mu)); Uchi() = U_l * chi();}
accumReconYp(result, Uchi);
mu++; SE = st.GetEntry(ptype, mu, sF);
GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZp);
{ auto U_l = extractLaneFloat2(lane,U[sU](mu)); Uchi() = U_l * chi();}
accumReconZp(result, Uchi);
mu++; SE = st.GetEntry(ptype, mu, sF);
GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTp);
{ auto U_l = extractLaneFloat2(lane,U[sU](mu)); Uchi() = U_l * chi();}
accumReconTp(result, Uchi);
#else
for(int mu=0;mu<2*Nd;mu++) { for(int mu=0;mu<2*Nd;mu++) {
SE = st.GetEntry(ptype, mu, sF); SE = st.GetEntry(ptype, mu, sF);
switch(mu){ switch(mu){
case Xp: case Xp:
GPU_STENCIL_LEG_PROJ(Xp,spProjXm); break; GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXm); break;
case Yp: case Yp:
GPU_STENCIL_LEG_PROJ(Yp,spProjYm); break; GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYm); break;
case Zp: case Zp:
GPU_STENCIL_LEG_PROJ(Zp,spProjZm); break; GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZm); break;
case Tp: case Tp:
GPU_STENCIL_LEG_PROJ(Tp,spProjTm); break; GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTm); break;
case Xm: case Xm:
GPU_STENCIL_LEG_PROJ(Xm,spProjXp); break; GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXp); break;
case Ym: case Ym:
GPU_STENCIL_LEG_PROJ(Ym,spProjYp); break; GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYp); break;
case Zm: case Zm:
GPU_STENCIL_LEG_PROJ(Zm,spProjZp); break; GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZp); break;
case Tm: case Tm:
default: default:
GPU_STENCIL_LEG_PROJ(Tm,spProjTp); break; GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTp); break;
} }
Impl::multLink(Uchi, U[sU], chi, mu, SE, st); auto U_l = extractLaneGpu(lane,U[sU](mu));
auto tmp = U_l * chi();
Uchi() = tmp;
switch(mu){ switch(mu){
case Xp: case Xp:
@ -166,10 +320,44 @@ accelerator void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeF
accumReconTp(result, Uchi); break; accumReconTp(result, Uchi); break;
} }
} }
vstream(out[sF], result); #endif
insertLaneFloat2 (lane,out[sF],result);
sF++;
}}
}; };
// Template specialise Gparity to empty for now
#define GPU_EMPTY(A) \
template <> \
accelerator void \
WilsonKernels<A>::GpuDhopSite(StencilView &st, \
DoubledGaugeFieldView &U, \
SiteHalfSpinor *buf, int sF, int LLs, \
int sU, \
const FermionFieldView &in, \
FermionFieldView &out) {}; \
template <> \
accelerator void \
WilsonKernels<A>::GpuDhopSiteDag(StencilView &st, \
DoubledGaugeFieldView &U, \
SiteHalfSpinor *buf, int sF,int LLs, \
int sU, \
const FermionFieldView &in, \
FermionFieldView &out) {};
GPU_EMPTY(GparityWilsonImplF);
GPU_EMPTY(GparityWilsonImplFH);
GPU_EMPTY(GparityWilsonImplD);
GPU_EMPTY(GparityWilsonImplDF);
GPU_EMPTY(DomainWallVec5dImplF);
GPU_EMPTY(DomainWallVec5dImplFH);
GPU_EMPTY(DomainWallVec5dImplD);
GPU_EMPTY(DomainWallVec5dImplDF);
GPU_EMPTY(ZDomainWallVec5dImplF);
GPU_EMPTY(ZDomainWallVec5dImplFH);
GPU_EMPTY(ZDomainWallVec5dImplD);
GPU_EMPTY(ZDomainWallVec5dImplDF);
FermOpTemplateInstantiate(WilsonKernels); FermOpTemplateInstantiate(WilsonKernels);
NAMESPACE_END(Grid); NAMESPACE_END(Grid);