1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 15:55:37 +00:00

Finally starting to get decent performance on Volta

This commit is contained in:
Peter Boyle 2018-07-13 12:06:18 -04:00
parent 2cc07450f4
commit b2b5137d28
5 changed files with 203 additions and 98 deletions

View File

@ -215,6 +215,14 @@ public:
mult(&phi(), &U(mu), &chi()); mult(&phi(), &U(mu), &chi());
} }
#ifdef GPU_VEC
// GPU_VEC build: transfer one SIMD lane of the doubled gauge links.
// Extracts lane `lane` from the source site object U and inserts it into
// the same lane of the destination site object UU.
static accelerator_inline void copyLinkGpu(int lane,
                                           SiteDoubledGaugeField &UU,
                                           const SiteDoubledGaugeField &U)
{
  auto lane_links = extractLane(lane, U);
  insertLane(lane, UU, lane_links);
}
static accelerator_inline void multLinkGpu(int lane, static accelerator_inline void multLinkGpu(int lane,
typename SiteHalfSpinor::scalar_object &phi, typename SiteHalfSpinor::scalar_object &phi,
const SiteDoubledGaugeField &U, const SiteDoubledGaugeField &U,
@ -224,6 +232,17 @@ public:
auto U_l = extractLane(lane,U(mu)); auto U_l = extractLane(lane,U(mu));
phi() = U_l * chi(); phi() = U_l * chi();
} }
#else
// Non-GPU_VEC build: apply the gauge link in direction `mu` to the half
// spinor `chi` and store the product in `phi`, working on whole site
// objects. `lane` is not used in this variant; it is part of the signature
// so that callers can use the same call in both builds.
static accelerator_inline void multLinkGpu(int lane,
                                           SiteHalfSpinor &phi,
                                           const SiteDoubledGaugeField &U,
                                           const SiteHalfSpinor &chi,
                                           int mu)
{
  phi() = U(mu) * chi();
}
#endif
static accelerator_inline void multLinkProp(SitePropagator &phi, static accelerator_inline void multLinkProp(SitePropagator &phi,
const SiteDoubledGaugeField &U, const SiteDoubledGaugeField &U,
@ -364,7 +383,13 @@ public:
} }
mult(&phi(), &UU(), &chi()); mult(&phi(), &UU(), &chi());
} }
#ifdef GPU_VEC
// GPU_VEC build, this implementation: copy the doubled gauge link site
// object from U into UU with a plain assignment of the whole object.
// `lane` is accepted but not used here.
static accelerator_inline void copyLinkGpu(int lane,
SiteDoubledGaugeField & UU,
const SiteDoubledGaugeField &U)
{
UU = U;
}
static accelerator_inline void multLinkGpu(int lane, static accelerator_inline void multLinkGpu(int lane,
typename SiteHalfSpinor::scalar_object &phi, typename SiteHalfSpinor::scalar_object &phi,
const SiteDoubledGaugeField &U, const SiteDoubledGaugeField &U,
@ -374,6 +399,17 @@ public:
auto U_l = U(mu); auto U_l = U(mu);
phi() = U_l * chi(); phi() = U_l * chi();
} }
#else
// Non-GPU_VEC build: apply the gauge link in direction `mu` to the half
// spinor `chi` and store the product in `phi`, working on whole site
// objects. `lane` is not used in this variant; it is part of the signature
// so that callers can use the same call in both builds.
static accelerator_inline void multLinkGpu(int lane,
                                           SiteHalfSpinor &phi,
                                           const SiteDoubledGaugeField &U,
                                           const SiteHalfSpinor &chi,
                                           int mu)
{
  phi() = U(mu) * chi();
}
#endif
static accelerator_inline void multLinkProp(SitePropagator &phi, static accelerator_inline void multLinkProp(SitePropagator &phi,
const SiteDoubledGaugeField &U, const SiteDoubledGaugeField &U,

View File

@ -104,6 +104,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, Double
SiteHalfSpinor *buf, int sF, SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out) int sU, const FermionFieldView &in, FermionFieldView &out)
{ {
#ifndef GRID_NVCC
SiteHalfSpinor tmp; SiteHalfSpinor tmp;
SiteHalfSpinor chi; SiteHalfSpinor chi;
SiteHalfSpinor *chi_p; SiteHalfSpinor *chi_p;
@ -121,6 +122,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, Double
GENERIC_STENCIL_LEG(Zm,spProjZm,accumReconZm); GENERIC_STENCIL_LEG(Zm,spProjZm,accumReconZm);
GENERIC_STENCIL_LEG(Tm,spProjTm,accumReconTm); GENERIC_STENCIL_LEG(Tm,spProjTm,accumReconTm);
vstream(out[sF], result); vstream(out[sF], result);
#endif
}; };
template <class Impl> template <class Impl>
@ -128,6 +130,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGa
SiteHalfSpinor *buf, int sF, SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out) int sU, const FermionFieldView &in, FermionFieldView &out)
{ {
#ifndef GRID_NVCC
SiteHalfSpinor tmp; SiteHalfSpinor tmp;
SiteHalfSpinor chi; SiteHalfSpinor chi;
SiteHalfSpinor *chi_p; SiteHalfSpinor *chi_p;
@ -145,6 +148,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGa
GENERIC_STENCIL_LEG(Zp,spProjZm,accumReconZm); GENERIC_STENCIL_LEG(Zp,spProjZm,accumReconZm);
GENERIC_STENCIL_LEG(Tp,spProjTm,accumReconTm); GENERIC_STENCIL_LEG(Tp,spProjTm,accumReconTm);
vstream(out[sF], result); vstream(out[sF], result);
#endif
}; };
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
// Interior kernels // Interior kernels
@ -154,6 +158,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, Do
SiteHalfSpinor *buf, int sF, SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out) int sU, const FermionFieldView &in, FermionFieldView &out)
{ {
#ifndef GRID_NVCC
SiteHalfSpinor tmp; SiteHalfSpinor tmp;
SiteHalfSpinor chi; SiteHalfSpinor chi;
SiteHalfSpinor *chi_p; SiteHalfSpinor *chi_p;
@ -172,6 +177,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, Do
GENERIC_STENCIL_LEG_INT(Zm,spProjZm,accumReconZm); GENERIC_STENCIL_LEG_INT(Zm,spProjZm,accumReconZm);
GENERIC_STENCIL_LEG_INT(Tm,spProjTm,accumReconTm); GENERIC_STENCIL_LEG_INT(Tm,spProjTm,accumReconTm);
vstream(out[sF], result); vstream(out[sF], result);
#endif
}; };
template <class Impl> template <class Impl>
@ -179,6 +185,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, Doubl
SiteHalfSpinor *buf, int sF, SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out) int sU, const FermionFieldView &in, FermionFieldView &out)
{ {
#ifndef GRID_NVCC
SiteHalfSpinor tmp; SiteHalfSpinor tmp;
SiteHalfSpinor chi; SiteHalfSpinor chi;
SiteHalfSpinor *chi_p; SiteHalfSpinor *chi_p;
@ -196,6 +203,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, Doubl
GENERIC_STENCIL_LEG_INT(Zp,spProjZm,accumReconZm); GENERIC_STENCIL_LEG_INT(Zp,spProjZm,accumReconZm);
GENERIC_STENCIL_LEG_INT(Tp,spProjTm,accumReconTm); GENERIC_STENCIL_LEG_INT(Tp,spProjTm,accumReconTm);
vstream(out[sF], result); vstream(out[sF], result);
#endif
}; };
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
// Exterior kernels // Exterior kernels
@ -205,6 +213,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, Do
SiteHalfSpinor *buf, int sF, SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out) int sU, const FermionFieldView &in, FermionFieldView &out)
{ {
#ifndef GRID_NVCC
// SiteHalfSpinor tmp; // SiteHalfSpinor tmp;
// SiteHalfSpinor chi; // SiteHalfSpinor chi;
SiteHalfSpinor *chi_p; SiteHalfSpinor *chi_p;
@ -225,6 +234,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, Do
if ( nmu ) { if ( nmu ) {
out[sF] = out[sF] + result; out[sF] = out[sF] + result;
} }
#endif
}; };
template <class Impl> template <class Impl>
@ -232,6 +242,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, Doubl
SiteHalfSpinor *buf, int sF, SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out) int sU, const FermionFieldView &in, FermionFieldView &out)
{ {
#ifndef GRID_NVCC
// SiteHalfSpinor tmp; // SiteHalfSpinor tmp;
// SiteHalfSpinor chi; // SiteHalfSpinor chi;
SiteHalfSpinor *chi_p; SiteHalfSpinor *chi_p;
@ -252,12 +263,14 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, Doubl
if ( nmu ) { if ( nmu ) {
out[sF] = out[sF] + result; out[sF] = out[sF] + result;
} }
#endif
}; };
template <class Impl> template <class Impl>
accelerator void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, accelerator void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma) int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
{ {
#ifndef GRID_NVCC
SiteHalfSpinor tmp; SiteHalfSpinor tmp;
SiteHalfSpinor chi; SiteHalfSpinor chi;
SiteSpinor result; SiteSpinor result;
@ -275,6 +288,7 @@ accelerator void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFie
GENERIC_DHOPDIR_LEG(Zm,spProjZm,spReconZm); GENERIC_DHOPDIR_LEG(Zm,spProjZm,spReconZm);
GENERIC_DHOPDIR_LEG(Tm,spProjTm,spReconTm); GENERIC_DHOPDIR_LEG(Tm,spProjTm,spReconTm);
vstream(out[sF], result); vstream(out[sF], result);
#endif
} }
/******************************************************************************* /*******************************************************************************

View File

@ -54,59 +54,11 @@ public:
static void Dhop(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, static void Dhop(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out, int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior=1,int exterior=1) int interior=1,int exterior=1) ;
{
auto U_v = U.View();
auto in_v = in.View();
auto out_v = out.View();
auto st_v = st.View();
if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) {
const uint64_t nsimd = Simd::Nsimd();
const uint64_t NN = Nsite*Ls*nsimd;
accelerator_loopN( sss, NN, {
uint64_t cur = sss;
// uint64_t lane = cur % nsimd;
cur = cur / nsimd;
uint64_t sF = cur; cur = cur / Ls;
uint64_t sU = cur;
WilsonKernels<Impl>::GpuDhopSite(st_v,U_v,buf,sF,sU,in_v,out_v);
});
} else {
accelerator_loop( ss, U_v, {
int sU = ss;
int sF = Ls * sU;
DhopSite(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
});
}
}
static void DhopDag(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, static void DhopDag(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out, int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior=1,int exterior=1) int interior=1,int exterior=1) ;
{
auto U_v = U.View();
auto in_v = in.View();
auto out_v = out.View();
auto st_v = st.View();
if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) {
const uint64_t nsimd = Simd::Nsimd();
const uint64_t NN = Nsite*Ls*nsimd;
accelerator_loopN( sss, NN, {
uint64_t cur = sss;
// uint64_t lane = cur % nsimd;
cur = cur / nsimd;
uint64_t sF = cur; cur = cur / Ls;
uint64_t sU = cur;
WilsonKernels<Impl>::GpuDhopSiteDag(st_v,U_v,buf,sF,sU,in_v,out_v);
});
} else {
accelerator_loop( ss, U_v, {
int sU = ss;
int sF = Ls * sU;
DhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
});
}
}
template <bool EnableBool = true> static accelerator template <bool EnableBool = true> static accelerator
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
@ -275,11 +227,11 @@ public:
private: private:
// Specialised variants // Specialised variants
static accelerator void GpuDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, static accelerator_inline void GpuDhopSite(StencilView &st, SiteDoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionFieldView &in, FermionFieldView &out); int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
static accelerator void GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, static accelerator_inline void GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionFieldView &in, FermionFieldView &out); int Ls,int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
static accelerator void GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, static accelerator void GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionFieldView &in, FermionFieldView &out); int sF, int sU, const FermionFieldView &in, FermionFieldView &out);

View File

@ -57,30 +57,51 @@ accelerator_inline int get_my_lane_offset(int Nsimd)
#endif #endif
} }
#ifdef GPU_VEC
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj) \ #define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj) \
synchronise(); \ synchronise(); \
if (SE->_is_local) { \ if (SE->_is_local) { \
int mask = Nsimd >> (ptype + 1); \ int mask = Nsimd >> (ptype + 1); \
int plane= SE->_permute ? (lane ^ mask) : lane; \ int plane= SE->_permute ? (lane ^ mask) : lane; \
auto in_l = extractLane(plane,in[SE->_offset]); \ auto in_l = extractLane(plane,in[SE->_offset+s]); \
spProj(chi,in_l); \ spProj(chi,in_l); \
} else { \ } else { \
chi = extractLane(lane,buf[SE->_offset]); \ chi = extractLane(lane,buf[SE->_offset+s]); \
} \ } \
synchronise(); synchronise();
#else
// Non-GPU_VEC variant of the stencil-leg projection.
// Fills `chi` with the projected neighbour half spinor for one stencil leg:
//  - local entry  : reads in[SE->_offset+s] and applies spProj; when
//                   SE->_permute is set the projection goes into `tmp` and
//                   is then permuted into `chi` using `ptype`.
//  - non-local    : reads the half spinor directly from buf[SE->_offset+s].
// Ends with a synchronise() call.
// The expansion site must provide SE, ptype, s, in, buf, chi and tmp.
// The `Dir` argument is not referenced in the macro body.
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj) \
if (SE->_is_local) { \
auto in_t = in[SE->_offset+s]; \
if (SE->_permute) { \
spProj(tmp, in_t); \
permute(chi, tmp, ptype); \
} else { \
spProj(chi, in_t); \
} \
} else { \
chi = buf[SE->_offset+s]; \
} \
synchronise();
#endif
template <class Impl> template <class Impl>
accelerator void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, accelerator_inline void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
SiteHalfSpinor *buf, int sF, SiteHalfSpinor *buf, int Ls, int s,
int sU, const FermionFieldView &in, FermionFieldView &out) int sU, const FermionFieldView &in, FermionFieldView &out)
{ {
#ifdef GPU_VEC
typename SiteHalfSpinor::scalar_object chi; typename SiteHalfSpinor::scalar_object chi;
typename SiteHalfSpinor::scalar_object Uchi; typename SiteHalfSpinor::scalar_object Uchi;
typename SiteSpinor::scalar_object result; typename SiteSpinor::scalar_object result;
#else
SiteHalfSpinor chi;
SiteHalfSpinor Uchi;
SiteHalfSpinor tmp;
SiteSpinor result;
#endif
typedef typename SiteSpinor::scalar_type scalar_type; typedef typename SiteSpinor::scalar_type scalar_type;
typedef typename SiteSpinor::vector_type vector_type; typedef typename SiteSpinor::vector_type vector_type;
constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type); constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
uint64_t lane_offset= get_my_lane_offset(Nsimd); uint64_t lane_offset= get_my_lane_offset(Nsimd);
@ -88,69 +109,80 @@ accelerator void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGau
StencilEntry *SE; StencilEntry *SE;
int ptype; int ptype;
uint64_t ssF = Ls * sU;
uint64_t sF = ssF + s;
#ifndef __CUDA_ARCH__ #ifndef __CUDA_ARCH__
for(int lane = lane_offset;lane<lane_offset+lanes;lane++){ for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
#else #else
int lane = lane_offset; { int lane = lane_offset; {
#endif #endif
SE = st.GetEntry(ptype, Xp, sF); SE = st.GetEntry(ptype, Xp, ssF);
GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXp); GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXp);
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xp); Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xp);
spReconXp(result, Uchi); spReconXp(result, Uchi);
SE = st.GetEntry(ptype, Yp, sF); SE = st.GetEntry(ptype, Yp, ssF);
GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYp); GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYp);
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Yp); Impl::multLinkGpu(lane,Uchi,U[sU],chi,Yp);
accumReconYp(result, Uchi); accumReconYp(result, Uchi);
SE = st.GetEntry(ptype, Zp, sF); SE = st.GetEntry(ptype, Zp, ssF);
GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZp); GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZp);
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zp); Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zp);
accumReconZp(result, Uchi); accumReconZp(result, Uchi);
SE = st.GetEntry(ptype, Tp, sF); SE = st.GetEntry(ptype, Tp, ssF);
GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTp); GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTp);
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tp); Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tp);
accumReconTp(result, Uchi); accumReconTp(result, Uchi);
SE = st.GetEntry(ptype, Xm, sF); SE = st.GetEntry(ptype, Xm, ssF);
GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXm); GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXm);
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xm); Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xm);
accumReconXm(result, Uchi); accumReconXm(result, Uchi);
SE = st.GetEntry(ptype, Ym, sF); SE = st.GetEntry(ptype, Ym, ssF);
GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYm); GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYm);
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Ym); Impl::multLinkGpu(lane,Uchi,U[sU],chi,Ym);
accumReconYm(result, Uchi); accumReconYm(result, Uchi);
SE = st.GetEntry(ptype, Zm, sF); SE = st.GetEntry(ptype, Zm, ssF);
GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZm); GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZm);
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zm); Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zm);
accumReconZm(result, Uchi); accumReconZm(result, Uchi);
SE = st.GetEntry(ptype, Tm, sF); SE = st.GetEntry(ptype, Tm, ssF);
GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTm); GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTm);
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tm); Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tm);
accumReconTm(result, Uchi); accumReconTm(result, Uchi);
synchronise(); synchronise();
#ifdef GPU_VEC
insertLane (lane,out[sF],result); insertLane (lane,out[sF],result);
#else
vstream(out[sF], result);
#endif
} }
} }
template <class Impl> template <class Impl>
accelerator void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeFieldView &U, accelerator_inline void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, SiteDoubledGaugeField &U,
SiteHalfSpinor *buf, int sF, SiteHalfSpinor *buf, int Ls, int s,
int sU, const FermionFieldView &in, FermionFieldView &out) int sU, const FermionFieldView &in, FermionFieldView &out)
{ {
#ifdef GPU_VEC
typename SiteHalfSpinor::scalar_object chi; typename SiteHalfSpinor::scalar_object chi;
typename SiteHalfSpinor::scalar_object Uchi; typename SiteHalfSpinor::scalar_object Uchi;
typename SiteSpinor::scalar_object result; typename SiteSpinor::scalar_object result;
#else
SiteHalfSpinor chi;
SiteHalfSpinor Uchi;
SiteHalfSpinor tmp;
SiteSpinor result;
#endif
typedef typename SiteSpinor::scalar_type scalar_type; typedef typename SiteSpinor::scalar_type scalar_type;
typedef typename SiteSpinor::vector_type vector_type; typedef typename SiteSpinor::vector_type vector_type;
constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type); constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
uint64_t lane_offset= get_my_lane_offset(Nsimd); uint64_t lane_offset= get_my_lane_offset(Nsimd);
@ -158,54 +190,62 @@ accelerator void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeF
StencilEntry *SE; StencilEntry *SE;
int ptype; int ptype;
// Forces some degree of coalesce on the table look ups
// Could also use wide load instructions on the data structure
uint64_t ssF = Ls * sU;
uint64_t sF = ssF + s;
#ifndef __CUDA_ARCH__ #ifndef __CUDA_ARCH__
for(int lane = lane_offset;lane<lane_offset+lanes;lane++){ for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
#else #else
int lane = lane_offset; { int lane = lane_offset; {
#endif #endif
SE = st.GetEntry(ptype, Xp, sF); SE = st.GetEntry(ptype, Xp, ssF);
GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXm); GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXm);
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xp); Impl::multLinkGpu(lane,Uchi,U,chi,Xp);
spReconXm(result, Uchi); spReconXm(result, Uchi);
SE = st.GetEntry(ptype, Yp, sF); SE = st.GetEntry(ptype, Yp, ssF);
GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYm); GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYm);
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Yp); Impl::multLinkGpu(lane,Uchi,U,chi,Yp);
accumReconYm(result, Uchi); accumReconYm(result, Uchi);
SE = st.GetEntry(ptype, Zp, sF); SE = st.GetEntry(ptype, Zp, ssF);
GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZm); GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZm);
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zp); Impl::multLinkGpu(lane,Uchi,U,chi,Zp);
accumReconZm(result, Uchi); accumReconZm(result, Uchi);
SE = st.GetEntry(ptype, Tp, sF); SE = st.GetEntry(ptype, Tp, ssF);
GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTm); GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTm);
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tp); Impl::multLinkGpu(lane,Uchi,U,chi,Tp);
accumReconTm(result, Uchi); accumReconTm(result, Uchi);
SE = st.GetEntry(ptype, Xm, sF); SE = st.GetEntry(ptype, Xm, ssF);
GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXp); GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXp);
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xm); Impl::multLinkGpu(lane,Uchi,U,chi,Xm);
accumReconXp(result, Uchi); accumReconXp(result, Uchi);
SE = st.GetEntry(ptype, Ym, sF); SE = st.GetEntry(ptype, Ym, ssF);
GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYp); GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYp);
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Ym); Impl::multLinkGpu(lane,Uchi,U,chi,Ym);
accumReconYp(result, Uchi); accumReconYp(result, Uchi);
SE = st.GetEntry(ptype, Zm, sF); SE = st.GetEntry(ptype, Zm, ssF);
GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZp); GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZp);
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zm); Impl::multLinkGpu(lane,Uchi,U,chi,Zm);
accumReconZp(result, Uchi); accumReconZp(result, Uchi);
SE = st.GetEntry(ptype, Tm, sF); SE = st.GetEntry(ptype, Tm, ssF);
GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTp); GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTp);
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tm); Impl::multLinkGpu(lane,Uchi,U,chi,Tm);
accumReconTp(result, Uchi); accumReconTp(result, Uchi);
synchronise(); synchronise();
#ifdef GPU_VEC
insertLane (lane,out[sF],result); insertLane (lane,out[sF],result);
#else
vstream(out[sF], result);
#endif
} }
}; };
@ -213,20 +253,20 @@ accelerator void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeF
// Template specialise Gparity to empty for now // Template specialise Gparity to empty for now
#define GPU_EMPTY(A) \ #define GPU_EMPTY(A) \
template <> \ template <> \
accelerator void \ accelerator_inline void \
WilsonKernels<A>::GpuDhopSite(StencilView &st, \ WilsonKernels<A>::GpuDhopSite(StencilView &st, \
DoubledGaugeFieldView &U, \ SiteDoubledGaugeField &U, \
SiteHalfSpinor *buf, int sF, \ SiteHalfSpinor *buf, int Ls, int sF, \
int sU, \ int sU, \
const FermionFieldView &in, \ const FermionFieldView &in, \
FermionFieldView &out) { assert(0);}; \ FermionFieldView &out) { assert(0);}; \
template <> \ template <> \
accelerator void \ accelerator_inline void \
WilsonKernels<A>::GpuDhopSiteDag(StencilView &st, \ WilsonKernels<A>::GpuDhopSiteDag(StencilView &st, \
DoubledGaugeFieldView &U, \ DoubledGaugeFieldView &U, \
SiteHalfSpinor *buf, int sF, \ SiteHalfSpinor *buf, int Ls,int sF, \
int sU, \ int sU, \
const FermionFieldView &in, \ const FermionFieldView &in, \
FermionFieldView &out) { assert(0);}; FermionFieldView &out) { assert(0);};
GPU_EMPTY(GparityWilsonImplF); GPU_EMPTY(GparityWilsonImplF);
@ -234,6 +274,67 @@ GPU_EMPTY(GparityWilsonImplFH);
GPU_EMPTY(GparityWilsonImplD); GPU_EMPTY(GparityWilsonImplD);
GPU_EMPTY(GparityWilsonImplDF); GPU_EMPTY(GparityWilsonImplDF);
// Host-side driver for the hopping term.
//
// Opt               : kernel selector; WilsonKernelsStatic::OptGpu picks the
//                     GpuDhopSite path when both interior and exterior are set.
// st, U             : stencil and doubled gauge field; views are taken here.
// buf               : half-spinor buffer handed to the GPU kernel.
// Ls, Nsite         : the GPU path iterates over Nsite*Ls*Nsimd work items.
// in, out           : source and result fermion fields.
// interior/exterior : both must be non-zero for the GPU path to be taken;
//                     otherwise the DhopSite fallback loop is used.
template <class Impl>
void WilsonKernels<Impl>::Dhop(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
                               int Ls, int Nsite, const FermionField &in, FermionField &out,
                               int interior,int exterior)
{
  auto U_v   = U.View();
  auto in_v  = in.View();
  auto out_v = out.View();
  auto st_v  = st.View();

  if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) {
    const uint64_t nsimd = Simd::Nsimd();
    // nsimd leads the product so the whole multiplication is done in 64 bits
    // rather than overflowing a 32-bit int in Nsite*Ls first.
    const uint64_t NN = nsimd*Nsite*Ls;
    accelerator_loopN( sss, NN, {
      // Decompose the flat index: lane is fastest then s then sU.
      // The lane itself is not needed here and is discarded.
      uint64_t cur = sss;
      cur = cur / nsimd;
      uint64_t s  = cur%Ls;
      cur = cur / Ls;
      uint64_t sU = cur;
      WilsonKernels<Impl>::GpuDhopSite(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v);
    });
  } else {
    accelerator_loop( ss, U_v, {
      int sU = ss;
      int sF = Ls * sU;
      DhopSite(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
    });
  }
}
// Host-side driver for the daggered hopping term.
//
// Opt               : kernel selector; WilsonKernelsStatic::OptGpu picks the
//                     GpuDhopSiteDag path when both interior and exterior are set.
// st, U             : stencil and doubled gauge field; views are taken here.
// buf               : half-spinor buffer handed to the GPU kernel.
// Ls, Nsite         : the GPU path iterates over Nsite*Ls*Nsimd work items.
// in, out           : source and result fermion fields.
// interior/exterior : both must be non-zero for the GPU path to be taken;
//                     otherwise the DhopSiteDag fallback loop is used.
template <class Impl>
void WilsonKernels<Impl>::DhopDag(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
                                  int Ls, int Nsite, const FermionField &in, FermionField &out,
                                  int interior,int exterior)
{
  auto U_v   = U.View();
  auto in_v  = in.View();
  auto out_v = out.View();
  auto st_v  = st.View();

  if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) {
    const uint64_t nsimd = Simd::Nsimd();
    // nsimd leads the product so the whole multiplication is done in 64 bits
    // rather than overflowing a 32-bit int in Nsite*Ls first.
    const uint64_t NN = nsimd*Nsite*Ls;
    accelerator_loopN( sss, NN, {
      // Decompose the flat index: lane is fastest then s then sU.
      // The lane itself is not needed here and is discarded.
      uint64_t cur = sss;
      cur = cur / nsimd;
      uint64_t s  = cur%Ls;
      cur = cur / Ls;
      uint64_t sU = cur;
      WilsonKernels<Impl>::GpuDhopSiteDag(st_v,U_v,buf,Ls,s,sU,in_v,out_v);
    });
  } else {
    accelerator_loop( ss, U_v, {
      int sU = ss;
      int sF = Ls * sU;
      // Pass the stencil view (st_v) as Dhop does for DhopSite; the host
      // stencil object was being handed to the per-site kernel here.
      // NOTE(review): st.CommBuf() still references the host stencil inside
      // the loop body (same as in Dhop) - confirm whether `buf` is intended.
      DhopSiteDag(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
    });
  }
}
/* /*
GPU_EMPTY(DomainWallVec5dImplF); GPU_EMPTY(DomainWallVec5dImplF);
GPU_EMPTY(DomainWallVec5dImplFH); GPU_EMPTY(DomainWallVec5dImplFH);

View File

@ -36,7 +36,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
namespace Grid { namespace Grid {
#define COALESCE_GRANULARITY (16) // bytes for coalesce granularity of target: Pascal, Volta //#define COALESCE_GRANULARITY (64) // bytes for coalesce granularity of target: Pascal, Volta
//#define COALESCE_GRANULARITY (32) // bytes for coalesce granularity of target: Pascal, Volta
#define COALESCE_GRANULARITY (16) // bytes for coalesce granularity of target: Pascal, Volta
template<class pair> template<class pair>
class GpuComplex { class GpuComplex {