mirror of https://github.com/paboyle/Grid.git synced 2025-06-25 19:22:03 +01:00

Remove Gpu only kernels.

commit 3e41b1055c (parent 9fbcfe612c)
Author: Peter Boyle
Date: 2019-06-09 11:20:01 +01:00
8 changed files with 64 additions and 321 deletions

View File

@@ -33,233 +33,6 @@ directory
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////
// Gpu implementation; thread loop is implicit; move to header
//////////////////////////////////////////////////////////////
accelerator_inline int get_my_lanes(int Nsimd)
{
#ifdef __CUDA_ARCH__
  return 1;
#else
  return Nsimd;
#endif
}
accelerator_inline int get_my_lane_offset(int Nsimd)
{
#ifdef __CUDA_ARCH__
  return ( (threadIdx.x) % Nsimd);
#else
  return 0;
#endif
}
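Together, these two helpers let one loop body serve both targets: under __CUDA_ARCH__ each GPU thread owns exactly one SIMD lane, while on the host a single thread sweeps all Nsimd lanes. A minimal sketch of the resulting loop shape (process_lane is a hypothetical stand-in for the per-lane work):

  int offset = get_my_lane_offset(Nsimd); // GPU: threadIdx.x % Nsimd;  CPU: 0
  int nlanes = get_my_lanes(Nsimd);       // GPU: 1;                    CPU: Nsimd
  for (int lane = offset; lane < offset + nlanes; lane++) {
    process_lane(lane);                   // hypothetical per-lane body
  }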
accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
{
#ifdef __CUDA_ARCH__
  static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size");
  uint4 * mem_pun  = (uint4 *)mem;   // force 128 bit loads
  uint4 * chip_pun = (uint4 *)&chip;
  *chip_pun = *mem_pun;
#else
  chip = *mem;
#endif
  return;
}
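The uint4 punning above is a common CUDA idiom for turning a 16-byte struct copy into a single 128-bit vector load rather than four scalar loads. A self-contained sketch of the same trick, assuming any 16-byte POD payload (Entry is illustrative, not a Grid type):

  struct Entry { int a, b, c, d; };       // 16 bytes, like StencilEntry
  static_assert(sizeof(Entry)==sizeof(uint4),"need a 16-byte payload");
  __device__ inline void load128(const Entry *mem, Entry &reg)
  {
    const uint4 *src = (const uint4 *)mem; // one 128-bit load ...
    uint4 *dst = (uint4 *)&reg;            // ... not four 32-bit loads
    *dst = *src;
  }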
#if 1
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)    \
  if (SE._is_local) {                                 \
    int mask = Nsimd >> (ptype + 1);                  \
    int plane= SE._permute ? (lane ^ mask) : lane;    \
    auto in_l = extractLane(plane,in[SE._offset+s]);  \
    spProj(chi,in_l);                                 \
  } else {                                            \
    chi = extractLane(lane,buf[SE._offset+s]);        \
  }                                                   \
  synchronise();
#else
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)    \
  if (SE._is_local) {                                 \
    auto in_t = in[SE._offset+s];                     \
    decltype(chi) tmp;                                \
    if (SE._permute) {                                \
      spProj(tmp, in_t);                              \
      permute(chi, tmp, ptype);                       \
    } else {                                          \
      spProj(chi, in_t);                              \
    }                                                 \
  } else {                                            \
    chi = buf[SE._offset+s];                          \
  }                                                   \
  synchronise();
#endif
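In the enabled (#if 1) variant a stencil permute costs nothing extra: rather than permuting the whole SIMD vector, each lane reads from its partner lane, whose index differs by a single bit. The index arithmetic, with illustrative values and assuming Nsimd is a power of two:

  int Nsimd = 8, ptype = 1, lane = 3;  // illustrative values
  int mask  = Nsimd >> (ptype + 1);    // distance between paired lanes: 2
  int plane = lane ^ mask;             // partner lane: 3 ^ 2 = 1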
template <class Impl>
accelerator_inline void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, SiteDoubledGaugeField &U,
                                                            SiteHalfSpinor *buf, int Ls, int s,
                                                            int sU, const FermionFieldView &in, FermionFieldView &out)
{
  typename SiteHalfSpinor::scalar_object chi;
  typename SiteHalfSpinor::scalar_object Uchi;
  typename SiteSpinor::scalar_object result;
  typedef typename SiteSpinor::scalar_type scalar_type;
  typedef typename SiteSpinor::vector_type vector_type;
  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);

  uint64_t lane_offset = get_my_lane_offset(Nsimd);
  uint64_t lanes       = get_my_lanes(Nsimd);

  StencilEntry *SE_mem;
  StencilEntry SE;
  int ptype;

  uint64_t ssF = Ls * sU;
  uint64_t sF  = ssF + s;

#ifndef __CUDA_ARCH__
  for(int lane = lane_offset; lane < lane_offset + lanes; lane++){
#else
  int lane = lane_offset; {
#endif
    SE_mem = st.GetEntry(ptype, Xp, ssF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXp);
    Impl::multLinkGpu(lane,Uchi,U,chi,Xp);
    spReconXp(result, Uchi);

    SE_mem = st.GetEntry(ptype, Yp, ssF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYp);
    Impl::multLinkGpu(lane,Uchi,U,chi,Yp);
    accumReconYp(result, Uchi);

    SE_mem = st.GetEntry(ptype, Zp, ssF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZp);
    Impl::multLinkGpu(lane,Uchi,U,chi,Zp);
    accumReconZp(result, Uchi);

    SE_mem = st.GetEntry(ptype, Tp, ssF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTp);
    Impl::multLinkGpu(lane,Uchi,U,chi,Tp);
    accumReconTp(result, Uchi);

    SE_mem = st.GetEntry(ptype, Xm, ssF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXm);
    Impl::multLinkGpu(lane,Uchi,U,chi,Xm);
    accumReconXm(result, Uchi);

    SE_mem = st.GetEntry(ptype, Ym, ssF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYm);
    Impl::multLinkGpu(lane,Uchi,U,chi,Ym);
    accumReconYm(result, Uchi);

    SE_mem = st.GetEntry(ptype, Zm, ssF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZm);
    Impl::multLinkGpu(lane,Uchi,U,chi,Zm);
    accumReconZm(result, Uchi);

    SE_mem = st.GetEntry(ptype, Tm, ssF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTm);
    Impl::multLinkGpu(lane,Uchi,U,chi,Tm);
    accumReconTm(result, Uchi);

    insertLane (lane,out[sF],result);
  }
}
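For reference, each of the eight legs above is the same five-step sequence; only the direction and spin projector change (Dir, spProjDir and accumReconDir below are placeholders for the concrete direction):

  SE_mem = st.GetEntry(ptype, Dir, ssF);          // locate the Dir neighbour
  get_stencil(SE_mem, SE);                        // 128-bit copy of the entry
  GPU_COALESCED_STENCIL_LEG_PROJ(Dir, spProjDir); // fetch + spin project -> chi
  Impl::multLinkGpu(lane, Uchi, U, chi, Dir);     // multiply by the gauge link
  accumReconDir(result, Uchi);                    // reconstruct and accumulate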
template <class Impl>
accelerator_inline void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, SiteDoubledGaugeField &U,
                                                         SiteHalfSpinor *buf, int Ls, int s,
                                                         int sU, const FermionFieldView &in, FermionFieldView &out)
{
  typename SiteHalfSpinor::scalar_object chi;
  typename SiteHalfSpinor::scalar_object Uchi;
  typename SiteSpinor::scalar_object result;
  typedef typename SiteSpinor::scalar_type scalar_type;
  typedef typename SiteSpinor::vector_type vector_type;
  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);

  uint64_t lane_offset = get_my_lane_offset(Nsimd);
  uint64_t lanes       = get_my_lanes(Nsimd);

  StencilEntry *SE_mem;
  StencilEntry SE;
  int ptype;

  // Forces some degree of coalescing on the table look-ups
  // Could also use wide load instructions on the data structure
  uint64_t ssF = Ls * sU;
  uint64_t sF  = ssF + s;

#ifndef __CUDA_ARCH__
  for(int lane = lane_offset; lane < lane_offset + lanes; lane++){
#else
  int lane = lane_offset; {
#endif
    SE_mem = st.GetEntry(ptype, Xp, ssF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXm);
    Impl::multLinkGpu(lane,Uchi,U,chi,Xp);
    spReconXm(result, Uchi);

    SE_mem = st.GetEntry(ptype, Yp, ssF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYm);
    Impl::multLinkGpu(lane,Uchi,U,chi,Yp);
    accumReconYm(result, Uchi);

    SE_mem = st.GetEntry(ptype, Zp, ssF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZm);
    Impl::multLinkGpu(lane,Uchi,U,chi,Zp);
    accumReconZm(result, Uchi);

    SE_mem = st.GetEntry(ptype, Tp, ssF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTm);
    Impl::multLinkGpu(lane,Uchi,U,chi,Tp);
    accumReconTm(result, Uchi);

    SE_mem = st.GetEntry(ptype, Xm, ssF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXp);
    Impl::multLinkGpu(lane,Uchi,U,chi,Xm);
    accumReconXp(result, Uchi);

    SE_mem = st.GetEntry(ptype, Ym, ssF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYp);
    Impl::multLinkGpu(lane,Uchi,U,chi,Ym);
    accumReconYp(result, Uchi);

    SE_mem = st.GetEntry(ptype, Zm, ssF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZp);
    Impl::multLinkGpu(lane,Uchi,U,chi,Zm);
    accumReconZp(result, Uchi);

    SE_mem = st.GetEntry(ptype, Tm, ssF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTp);
    Impl::multLinkGpu(lane,Uchi,U,chi,Tm);
    accumReconTp(result, Uchi);

    insertLane (lane,out[sF],result);
  }
}
// Template specialise Gparity to empty for now
#define GPU_EMPTY(A)                                                    \
  template <>                                                           \
  accelerator_inline void                                               \
  WilsonKernels<A>::GpuDhopSite(StencilView &st,                        \
                                SiteDoubledGaugeField &U,               \
                                SiteHalfSpinor *buf, int Ls, int sF,    \
                                int sU,                                 \
                                const FermionFieldView &in,             \
                                FermionFieldView &out) { assert(0); };  \
  template <>                                                           \
  accelerator_inline void                                               \
  WilsonKernels<A>::GpuDhopSiteDag(StencilView &st,                     \
                                   SiteDoubledGaugeField &U,            \
                                   SiteHalfSpinor *buf, int Ls, int sF, \
                                   int sU,                              \
                                   const FermionFieldView &in,          \
                                   FermionFieldView &out) { assert(0); };

GPU_EMPTY(GparityWilsonImplF);
GPU_EMPTY(GparityWilsonImplFH);
GPU_EMPTY(GparityWilsonImplD);
GPU_EMPTY(GparityWilsonImplDF);
#define KERNEL_CALL(A)                          \
  const uint64_t nsimd = Simd::Nsimd();         \
@@ -282,6 +55,13 @@ GPU_EMPTY(GparityWilsonImplDF);
    WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);        \
  });

#define ASM_CALL(A)                                               \
  SIMT_loop( ss, Nsite, {                                         \
    int sU = ss;                                                  \
    int sF = ss*Ls;                                               \
    WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v);   \
  });
template <class Impl>
void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
                                     int Ls, int Nsite, const FermionField &in, FermionField &out,
@@ -293,17 +73,25 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
  auto st_v = st.View();
  if( interior && exterior ) {
    if (Opt == WilsonKernelsStatic::OptGpu) {
      KERNEL_CALL(GpuDhopSite);
    } else {
      HOST_CALL(GenericDhopSite);
    }
    if (Opt == WilsonKernelsStatic::OptGeneric   ) { HOST_CALL(GenericDhopSite); return;}
#ifndef GRID_NVCC
    if (Opt == WilsonKernelsStatic::OptHandUnroll) { HOST_CALL(HandDhopSite); return;}
    if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite);   return;}
#endif
  } else if( interior ) {
    HOST_CALL(GenericDhopSiteInt);
    if (Opt == WilsonKernelsStatic::OptGeneric   ) { HOST_CALL(GenericDhopSiteInt); return;}
#ifndef GRID_NVCC
    if (Opt == WilsonKernelsStatic::OptHandUnroll) { HOST_CALL(HandDhopSiteInt); return;}
    if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt);   return;}
#endif
  } else if( exterior ) {
    HOST_CALL(GenericDhopSiteExt);
    if (Opt == WilsonKernelsStatic::OptGeneric   ) { HOST_CALL(GenericDhopSiteExt); return;}
#ifndef GRID_NVCC
    if (Opt == WilsonKernelsStatic::OptHandUnroll) { HOST_CALL(HandDhopSiteExt); return;}
    if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt);   return;}
#endif
  }
  assert(0 && " Kernel optimisation case not covered ");
}
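A hedged usage sketch of the resulting dispatch (argument list as shown in the diff; the trailing interior/exterior flags and the surrounding field setup are assumptions, not confirmed by this hunk):

  // Run the hand-unrolled host kernel over the full volume.
  WilsonKernels<Impl>::DhopKernel(WilsonKernelsStatic::OptHandUnroll,
                                  st, U, buf, Ls, Nsite, in, out,
                                  1 /*interior*/, 1 /*exterior*/);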
template <class Impl>
void WilsonKernels<Impl>::DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
@@ -315,17 +103,26 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
  auto out_v = out.View();
  auto st_v = st.View();
  if( interior && exterior ) {
    if (Opt == WilsonKernelsStatic::OptGpu) {
      KERNEL_CALL(GpuDhopSiteDag);
    } else {
      HOST_CALL(GenericDhopSiteDag);
    }
  } else if( interior ) {
    HOST_CALL(GenericDhopSiteDagInt);
  } else if( exterior ) {
    HOST_CALL(GenericDhopSiteDagExt);
  }
  if( interior && exterior ) {
    if (Opt == WilsonKernelsStatic::OptGeneric   ) { HOST_CALL(GenericDhopSiteDag); return;}
#ifndef GRID_NVCC
    if (Opt == WilsonKernelsStatic::OptHandUnroll) { HOST_CALL(HandDhopSiteDag); return;}
    if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag);   return;}
#endif
  } else if( interior ) {
    if (Opt == WilsonKernelsStatic::OptGeneric   ) { HOST_CALL(GenericDhopSiteDagInt); return;}
#ifndef GRID_NVCC
    if (Opt == WilsonKernelsStatic::OptHandUnroll) { HOST_CALL(HandDhopSiteDagInt); return;}
    if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt);   return;}
#endif
  } else if( exterior ) {
    if (Opt == WilsonKernelsStatic::OptGeneric   ) { HOST_CALL(GenericDhopSiteDagExt); return;}
#ifndef GRID_NVCC
    if (Opt == WilsonKernelsStatic::OptHandUnroll) { HOST_CALL(HandDhopSiteDagExt); return;}
    if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt);   return;}
#endif
  }
  assert(0 && " Kernel optimisation case not covered ");
}
NAMESPACE_END(Grid);

View File

@@ -38,6 +38,19 @@ NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////
// Generic implementation; move to different file?
////////////////////////////////////////////
accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
{
#ifdef __CUDA_ARCH__
  static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size");
  uint4 * mem_pun  = (uint4 *)mem;   // force 128 bit loads
  uint4 * chip_pun = (uint4 *)&chip;
  *chip_pun = *mem_pun;
#else
  chip = *mem;
#endif
  return;
}
#define GENERIC_STENCIL_LEG(Dir,spProj,Recon)   \
  SE = st.GetEntry(ptype, Dir, sF);             \