mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 15:55:37 +00:00
Finally starting to get decent performance on Volta
This commit is contained in:
parent
2cc07450f4
commit
b2b5137d28
@ -215,6 +215,14 @@ public:
|
|||||||
mult(&phi(), &U(mu), &chi());
|
mult(&phi(), &U(mu), &chi());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef GPU_VEC
|
||||||
|
// Copy a single SIMD lane of the doubled gauge field site U into the same
// lane of UU: the lane is pulled out with extractLane and written back with
// insertLane. Only lane `lane` of UU is written by this call.
static accelerator_inline void copyLinkGpu(int lane,
                                           SiteDoubledGaugeField & UU,
                                           const SiteDoubledGaugeField &U)
{
  insertLane(lane, UU, extractLane(lane, U));
}
|
||||||
static accelerator_inline void multLinkGpu(int lane,
|
static accelerator_inline void multLinkGpu(int lane,
|
||||||
typename SiteHalfSpinor::scalar_object &phi,
|
typename SiteHalfSpinor::scalar_object &phi,
|
||||||
const SiteDoubledGaugeField &U,
|
const SiteDoubledGaugeField &U,
|
||||||
@ -224,6 +232,17 @@ public:
|
|||||||
auto U_l = extractLane(lane,U(mu));
|
auto U_l = extractLane(lane,U(mu));
|
||||||
phi() = U_l * chi();
|
phi() = U_l * chi();
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
// Variant compiled when GPU_VEC is not defined: operates on whole
// SiteHalfSpinor objects instead of per-lane scalar objects.
// Computes phi() = U(mu) * chi(). `lane` is part of the signature but is
// not read by this variant.
static accelerator_inline void multLinkGpu(int lane,
                                           SiteHalfSpinor &phi,
                                           const SiteDoubledGaugeField &U,
                                           const SiteHalfSpinor &chi,
                                           int mu)
{
  phi() = U(mu) * chi();
}
|
||||||
|
#endif
|
||||||
|
|
||||||
static accelerator_inline void multLinkProp(SitePropagator &phi,
|
static accelerator_inline void multLinkProp(SitePropagator &phi,
|
||||||
const SiteDoubledGaugeField &U,
|
const SiteDoubledGaugeField &U,
|
||||||
@ -364,7 +383,13 @@ public:
|
|||||||
}
|
}
|
||||||
mult(&phi(), &UU(), &chi());
|
mult(&phi(), &UU(), &chi());
|
||||||
}
|
}
|
||||||
|
#ifdef GPU_VEC
|
||||||
|
// Copy the doubled gauge field site U into UU with a plain assignment of the
// whole object. `lane` is part of the signature but is not read here.
static accelerator_inline void copyLinkGpu(int lane,
                                           SiteDoubledGaugeField & UU,
                                           const SiteDoubledGaugeField &U)
{
  UU = U;
}
|
||||||
static accelerator_inline void multLinkGpu(int lane,
|
static accelerator_inline void multLinkGpu(int lane,
|
||||||
typename SiteHalfSpinor::scalar_object &phi,
|
typename SiteHalfSpinor::scalar_object &phi,
|
||||||
const SiteDoubledGaugeField &U,
|
const SiteDoubledGaugeField &U,
|
||||||
@ -374,6 +399,17 @@ public:
|
|||||||
auto U_l = U(mu);
|
auto U_l = U(mu);
|
||||||
phi() = U_l * chi();
|
phi() = U_l * chi();
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
// Variant compiled when GPU_VEC is not defined: operates on whole
// SiteHalfSpinor objects instead of per-lane scalar objects.
// Computes phi() = U(mu) * chi(). `lane` is part of the signature but is
// not read by this variant.
static accelerator_inline void multLinkGpu(int lane,
                                           SiteHalfSpinor &phi,
                                           const SiteDoubledGaugeField &U,
                                           const SiteHalfSpinor &chi,
                                           int mu)
{
  phi() = U(mu) * chi();
}
|
||||||
|
#endif
|
||||||
|
|
||||||
static accelerator_inline void multLinkProp(SitePropagator &phi,
|
static accelerator_inline void multLinkProp(SitePropagator &phi,
|
||||||
const SiteDoubledGaugeField &U,
|
const SiteDoubledGaugeField &U,
|
||||||
|
@ -104,6 +104,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, Double
|
|||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
|
#ifndef GRID_NVCC
|
||||||
SiteHalfSpinor tmp;
|
SiteHalfSpinor tmp;
|
||||||
SiteHalfSpinor chi;
|
SiteHalfSpinor chi;
|
||||||
SiteHalfSpinor *chi_p;
|
SiteHalfSpinor *chi_p;
|
||||||
@ -121,6 +122,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, Double
|
|||||||
GENERIC_STENCIL_LEG(Zm,spProjZm,accumReconZm);
|
GENERIC_STENCIL_LEG(Zm,spProjZm,accumReconZm);
|
||||||
GENERIC_STENCIL_LEG(Tm,spProjTm,accumReconTm);
|
GENERIC_STENCIL_LEG(Tm,spProjTm,accumReconTm);
|
||||||
vstream(out[sF], result);
|
vstream(out[sF], result);
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@ -128,6 +130,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGa
|
|||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
|
#ifndef GRID_NVCC
|
||||||
SiteHalfSpinor tmp;
|
SiteHalfSpinor tmp;
|
||||||
SiteHalfSpinor chi;
|
SiteHalfSpinor chi;
|
||||||
SiteHalfSpinor *chi_p;
|
SiteHalfSpinor *chi_p;
|
||||||
@ -145,6 +148,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGa
|
|||||||
GENERIC_STENCIL_LEG(Zp,spProjZm,accumReconZm);
|
GENERIC_STENCIL_LEG(Zp,spProjZm,accumReconZm);
|
||||||
GENERIC_STENCIL_LEG(Tp,spProjTm,accumReconTm);
|
GENERIC_STENCIL_LEG(Tp,spProjTm,accumReconTm);
|
||||||
vstream(out[sF], result);
|
vstream(out[sF], result);
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
// Interior kernels
|
// Interior kernels
|
||||||
@ -154,6 +158,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, Do
|
|||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
|
#ifndef GRID_NVCC
|
||||||
SiteHalfSpinor tmp;
|
SiteHalfSpinor tmp;
|
||||||
SiteHalfSpinor chi;
|
SiteHalfSpinor chi;
|
||||||
SiteHalfSpinor *chi_p;
|
SiteHalfSpinor *chi_p;
|
||||||
@ -172,6 +177,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, Do
|
|||||||
GENERIC_STENCIL_LEG_INT(Zm,spProjZm,accumReconZm);
|
GENERIC_STENCIL_LEG_INT(Zm,spProjZm,accumReconZm);
|
||||||
GENERIC_STENCIL_LEG_INT(Tm,spProjTm,accumReconTm);
|
GENERIC_STENCIL_LEG_INT(Tm,spProjTm,accumReconTm);
|
||||||
vstream(out[sF], result);
|
vstream(out[sF], result);
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@ -179,6 +185,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, Doubl
|
|||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
|
#ifndef GRID_NVCC
|
||||||
SiteHalfSpinor tmp;
|
SiteHalfSpinor tmp;
|
||||||
SiteHalfSpinor chi;
|
SiteHalfSpinor chi;
|
||||||
SiteHalfSpinor *chi_p;
|
SiteHalfSpinor *chi_p;
|
||||||
@ -196,6 +203,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, Doubl
|
|||||||
GENERIC_STENCIL_LEG_INT(Zp,spProjZm,accumReconZm);
|
GENERIC_STENCIL_LEG_INT(Zp,spProjZm,accumReconZm);
|
||||||
GENERIC_STENCIL_LEG_INT(Tp,spProjTm,accumReconTm);
|
GENERIC_STENCIL_LEG_INT(Tp,spProjTm,accumReconTm);
|
||||||
vstream(out[sF], result);
|
vstream(out[sF], result);
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
// Exterior kernels
|
// Exterior kernels
|
||||||
@ -205,6 +213,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, Do
|
|||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
|
#ifndef GRID_NVCC
|
||||||
// SiteHalfSpinor tmp;
|
// SiteHalfSpinor tmp;
|
||||||
// SiteHalfSpinor chi;
|
// SiteHalfSpinor chi;
|
||||||
SiteHalfSpinor *chi_p;
|
SiteHalfSpinor *chi_p;
|
||||||
@ -225,6 +234,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, Do
|
|||||||
if ( nmu ) {
|
if ( nmu ) {
|
||||||
out[sF] = out[sF] + result;
|
out[sF] = out[sF] + result;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@ -232,6 +242,7 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, Doubl
|
|||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
|
#ifndef GRID_NVCC
|
||||||
// SiteHalfSpinor tmp;
|
// SiteHalfSpinor tmp;
|
||||||
// SiteHalfSpinor chi;
|
// SiteHalfSpinor chi;
|
||||||
SiteHalfSpinor *chi_p;
|
SiteHalfSpinor *chi_p;
|
||||||
@ -252,12 +263,14 @@ accelerator void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, Doubl
|
|||||||
if ( nmu ) {
|
if ( nmu ) {
|
||||||
out[sF] = out[sF] + result;
|
out[sF] = out[sF] + result;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
accelerator void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
|
accelerator void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
|
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
|
||||||
{
|
{
|
||||||
|
#ifndef GRID_NVCC
|
||||||
SiteHalfSpinor tmp;
|
SiteHalfSpinor tmp;
|
||||||
SiteHalfSpinor chi;
|
SiteHalfSpinor chi;
|
||||||
SiteSpinor result;
|
SiteSpinor result;
|
||||||
@ -275,6 +288,7 @@ accelerator void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFie
|
|||||||
GENERIC_DHOPDIR_LEG(Zm,spProjZm,spReconZm);
|
GENERIC_DHOPDIR_LEG(Zm,spProjZm,spReconZm);
|
||||||
GENERIC_DHOPDIR_LEG(Tm,spProjTm,spReconTm);
|
GENERIC_DHOPDIR_LEG(Tm,spProjTm,spReconTm);
|
||||||
vstream(out[sF], result);
|
vstream(out[sF], result);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*******************************************************************************
|
/*******************************************************************************
|
||||||
|
@ -54,59 +54,11 @@ public:
|
|||||||
|
|
||||||
static void Dhop(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
static void Dhop(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||||
int interior=1,int exterior=1)
|
int interior=1,int exterior=1) ;
|
||||||
{
|
|
||||||
auto U_v = U.View();
|
|
||||||
auto in_v = in.View();
|
|
||||||
auto out_v = out.View();
|
|
||||||
auto st_v = st.View();
|
|
||||||
if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) {
|
|
||||||
const uint64_t nsimd = Simd::Nsimd();
|
|
||||||
const uint64_t NN = Nsite*Ls*nsimd;
|
|
||||||
accelerator_loopN( sss, NN, {
|
|
||||||
uint64_t cur = sss;
|
|
||||||
// uint64_t lane = cur % nsimd;
|
|
||||||
cur = cur / nsimd;
|
|
||||||
uint64_t sF = cur; cur = cur / Ls;
|
|
||||||
uint64_t sU = cur;
|
|
||||||
WilsonKernels<Impl>::GpuDhopSite(st_v,U_v,buf,sF,sU,in_v,out_v);
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
accelerator_loop( ss, U_v, {
|
|
||||||
int sU = ss;
|
|
||||||
int sF = Ls * sU;
|
|
||||||
DhopSite(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
static void DhopDag(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
static void DhopDag(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||||
int interior=1,int exterior=1)
|
int interior=1,int exterior=1) ;
|
||||||
{
|
|
||||||
auto U_v = U.View();
|
|
||||||
auto in_v = in.View();
|
|
||||||
auto out_v = out.View();
|
|
||||||
auto st_v = st.View();
|
|
||||||
|
|
||||||
if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) {
|
|
||||||
const uint64_t nsimd = Simd::Nsimd();
|
|
||||||
const uint64_t NN = Nsite*Ls*nsimd;
|
|
||||||
accelerator_loopN( sss, NN, {
|
|
||||||
uint64_t cur = sss;
|
|
||||||
// uint64_t lane = cur % nsimd;
|
|
||||||
cur = cur / nsimd;
|
|
||||||
uint64_t sF = cur; cur = cur / Ls;
|
|
||||||
uint64_t sU = cur;
|
|
||||||
WilsonKernels<Impl>::GpuDhopSiteDag(st_v,U_v,buf,sF,sU,in_v,out_v);
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
accelerator_loop( ss, U_v, {
|
|
||||||
int sU = ss;
|
|
||||||
int sF = Ls * sU;
|
|
||||||
DhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template <bool EnableBool = true> static accelerator
|
template <bool EnableBool = true> static accelerator
|
||||||
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
|
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
|
||||||
@ -275,11 +227,11 @@ public:
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
// Specialised variants
|
// Specialised variants
|
||||||
static accelerator void GpuDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
|
static accelerator_inline void GpuDhopSite(StencilView &st, SiteDoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||||
int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
|
int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
|
||||||
|
|
||||||
static accelerator void GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
|
static accelerator_inline void GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
|
||||||
int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
|
int Ls,int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
|
||||||
|
|
||||||
static accelerator void GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
|
static accelerator void GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
|
||||||
int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
|
int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
|
||||||
|
@ -57,30 +57,51 @@ accelerator_inline int get_my_lane_offset(int Nsimd)
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef GPU_VEC
|
||||||
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj) \
|
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj) \
|
||||||
synchronise(); \
|
synchronise(); \
|
||||||
if (SE->_is_local) { \
|
if (SE->_is_local) { \
|
||||||
int mask = Nsimd >> (ptype + 1); \
|
int mask = Nsimd >> (ptype + 1); \
|
||||||
int plane= SE->_permute ? (lane ^ mask) : lane; \
|
int plane= SE->_permute ? (lane ^ mask) : lane; \
|
||||||
auto in_l = extractLane(plane,in[SE->_offset]); \
|
auto in_l = extractLane(plane,in[SE->_offset+s]); \
|
||||||
spProj(chi,in_l); \
|
spProj(chi,in_l); \
|
||||||
} else { \
|
} else { \
|
||||||
chi = extractLane(lane,buf[SE->_offset]); \
|
chi = extractLane(lane,buf[SE->_offset+s]); \
|
||||||
} \
|
} \
|
||||||
synchronise();
|
synchronise();
|
||||||
|
#else
|
||||||
|
/*
 * Non-GPU_VEC form of the stencil-leg projection.
 *
 * Loads the half spinor `chi` for the stencil entry SE:
 *   - non-local entry: chi is read from buf[SE->_offset+s];
 *   - local entry:     in[SE->_offset+s] is projected with spProj, going
 *                      through `tmp` and permute(chi,tmp,ptype) when
 *                      SE->_permute is set.
 * synchronise() is called once at the end in every case.
 *
 * The expansion site must provide SE, in, buf, s, ptype, chi and tmp.
 * The Dir argument is not used in the expansion.
 */
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)      \
  if (!SE->_is_local) {                                 \
    chi = buf[SE->_offset+s];                           \
  } else {                                              \
    auto in_t = in[SE->_offset+s];                      \
    if (SE->_permute) {                                 \
      spProj(tmp, in_t);                                \
      permute(chi, tmp, ptype);                         \
    } else {                                            \
      spProj(chi, in_t);                                \
    }                                                   \
  }                                                     \
  synchronise();
|
||||||
|
#endif
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
accelerator void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
|
accelerator_inline void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
|
||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int Ls, int s,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
|
#ifdef GPU_VEC
|
||||||
typename SiteHalfSpinor::scalar_object chi;
|
typename SiteHalfSpinor::scalar_object chi;
|
||||||
typename SiteHalfSpinor::scalar_object Uchi;
|
typename SiteHalfSpinor::scalar_object Uchi;
|
||||||
typename SiteSpinor::scalar_object result;
|
typename SiteSpinor::scalar_object result;
|
||||||
|
#else
|
||||||
|
SiteHalfSpinor chi;
|
||||||
|
SiteHalfSpinor Uchi;
|
||||||
|
SiteHalfSpinor tmp;
|
||||||
|
SiteSpinor result;
|
||||||
|
#endif
|
||||||
typedef typename SiteSpinor::scalar_type scalar_type;
|
typedef typename SiteSpinor::scalar_type scalar_type;
|
||||||
typedef typename SiteSpinor::vector_type vector_type;
|
typedef typename SiteSpinor::vector_type vector_type;
|
||||||
|
|
||||||
constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
|
constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
|
||||||
|
|
||||||
uint64_t lane_offset= get_my_lane_offset(Nsimd);
|
uint64_t lane_offset= get_my_lane_offset(Nsimd);
|
||||||
@ -88,69 +109,80 @@ accelerator void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGau
|
|||||||
|
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int ptype;
|
int ptype;
|
||||||
|
uint64_t ssF = Ls * sU;
|
||||||
|
uint64_t sF = ssF + s;
|
||||||
#ifndef __CUDA_ARCH__
|
#ifndef __CUDA_ARCH__
|
||||||
for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
|
for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
|
||||||
#else
|
#else
|
||||||
int lane = lane_offset; {
|
int lane = lane_offset; {
|
||||||
#endif
|
#endif
|
||||||
SE = st.GetEntry(ptype, Xp, sF);
|
SE = st.GetEntry(ptype, Xp, ssF);
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXp);
|
GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXp);
|
||||||
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xp);
|
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xp);
|
||||||
spReconXp(result, Uchi);
|
spReconXp(result, Uchi);
|
||||||
|
|
||||||
SE = st.GetEntry(ptype, Yp, sF);
|
SE = st.GetEntry(ptype, Yp, ssF);
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYp);
|
GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYp);
|
||||||
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Yp);
|
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Yp);
|
||||||
accumReconYp(result, Uchi);
|
accumReconYp(result, Uchi);
|
||||||
|
|
||||||
SE = st.GetEntry(ptype, Zp, sF);
|
SE = st.GetEntry(ptype, Zp, ssF);
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZp);
|
GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZp);
|
||||||
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zp);
|
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zp);
|
||||||
accumReconZp(result, Uchi);
|
accumReconZp(result, Uchi);
|
||||||
|
|
||||||
SE = st.GetEntry(ptype, Tp, sF);
|
SE = st.GetEntry(ptype, Tp, ssF);
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTp);
|
GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTp);
|
||||||
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tp);
|
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tp);
|
||||||
accumReconTp(result, Uchi);
|
accumReconTp(result, Uchi);
|
||||||
|
|
||||||
SE = st.GetEntry(ptype, Xm, sF);
|
SE = st.GetEntry(ptype, Xm, ssF);
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXm);
|
GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXm);
|
||||||
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xm);
|
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xm);
|
||||||
accumReconXm(result, Uchi);
|
accumReconXm(result, Uchi);
|
||||||
|
|
||||||
SE = st.GetEntry(ptype, Ym, sF);
|
SE = st.GetEntry(ptype, Ym, ssF);
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYm);
|
GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYm);
|
||||||
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Ym);
|
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Ym);
|
||||||
accumReconYm(result, Uchi);
|
accumReconYm(result, Uchi);
|
||||||
|
|
||||||
|
|
||||||
SE = st.GetEntry(ptype, Zm, sF);
|
SE = st.GetEntry(ptype, Zm, ssF);
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZm);
|
GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZm);
|
||||||
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zm);
|
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zm);
|
||||||
accumReconZm(result, Uchi);
|
accumReconZm(result, Uchi);
|
||||||
|
|
||||||
SE = st.GetEntry(ptype, Tm, sF);
|
SE = st.GetEntry(ptype, Tm, ssF);
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTm);
|
GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTm);
|
||||||
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tm);
|
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tm);
|
||||||
accumReconTm(result, Uchi);
|
accumReconTm(result, Uchi);
|
||||||
|
|
||||||
synchronise();
|
synchronise();
|
||||||
|
#ifdef GPU_VEC
|
||||||
insertLane (lane,out[sF],result);
|
insertLane (lane,out[sF],result);
|
||||||
|
#else
|
||||||
|
vstream(out[sF], result);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
accelerator void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeFieldView &U,
|
accelerator_inline void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, SiteDoubledGaugeField &U,
|
||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int Ls, int s,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
|
#ifdef GPU_VEC
|
||||||
typename SiteHalfSpinor::scalar_object chi;
|
typename SiteHalfSpinor::scalar_object chi;
|
||||||
typename SiteHalfSpinor::scalar_object Uchi;
|
typename SiteHalfSpinor::scalar_object Uchi;
|
||||||
typename SiteSpinor::scalar_object result;
|
typename SiteSpinor::scalar_object result;
|
||||||
|
#else
|
||||||
|
SiteHalfSpinor chi;
|
||||||
|
SiteHalfSpinor Uchi;
|
||||||
|
SiteHalfSpinor tmp;
|
||||||
|
SiteSpinor result;
|
||||||
|
#endif
|
||||||
typedef typename SiteSpinor::scalar_type scalar_type;
|
typedef typename SiteSpinor::scalar_type scalar_type;
|
||||||
typedef typename SiteSpinor::vector_type vector_type;
|
typedef typename SiteSpinor::vector_type vector_type;
|
||||||
|
|
||||||
constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
|
constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
|
||||||
|
|
||||||
uint64_t lane_offset= get_my_lane_offset(Nsimd);
|
uint64_t lane_offset= get_my_lane_offset(Nsimd);
|
||||||
@ -158,54 +190,62 @@ accelerator void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeF
|
|||||||
|
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int ptype;
|
int ptype;
|
||||||
|
// Forces some degree of coalesce on the table look ups
|
||||||
|
// Could also use wide load instructions on the data structure
|
||||||
|
uint64_t ssF = Ls * sU;
|
||||||
|
uint64_t sF = ssF + s;
|
||||||
|
|
||||||
#ifndef __CUDA_ARCH__
|
#ifndef __CUDA_ARCH__
|
||||||
for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
|
for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
|
||||||
#else
|
#else
|
||||||
int lane = lane_offset; {
|
int lane = lane_offset; {
|
||||||
#endif
|
#endif
|
||||||
SE = st.GetEntry(ptype, Xp, sF);
|
SE = st.GetEntry(ptype, Xp, ssF);
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXm);
|
GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXm);
|
||||||
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xp);
|
Impl::multLinkGpu(lane,Uchi,U,chi,Xp);
|
||||||
spReconXm(result, Uchi);
|
spReconXm(result, Uchi);
|
||||||
|
|
||||||
SE = st.GetEntry(ptype, Yp, sF);
|
SE = st.GetEntry(ptype, Yp, ssF);
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYm);
|
GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYm);
|
||||||
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Yp);
|
Impl::multLinkGpu(lane,Uchi,U,chi,Yp);
|
||||||
accumReconYm(result, Uchi);
|
accumReconYm(result, Uchi);
|
||||||
|
|
||||||
SE = st.GetEntry(ptype, Zp, sF);
|
SE = st.GetEntry(ptype, Zp, ssF);
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZm);
|
GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZm);
|
||||||
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zp);
|
Impl::multLinkGpu(lane,Uchi,U,chi,Zp);
|
||||||
accumReconZm(result, Uchi);
|
accumReconZm(result, Uchi);
|
||||||
|
|
||||||
SE = st.GetEntry(ptype, Tp, sF);
|
SE = st.GetEntry(ptype, Tp, ssF);
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTm);
|
GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTm);
|
||||||
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tp);
|
Impl::multLinkGpu(lane,Uchi,U,chi,Tp);
|
||||||
accumReconTm(result, Uchi);
|
accumReconTm(result, Uchi);
|
||||||
|
|
||||||
SE = st.GetEntry(ptype, Xm, sF);
|
SE = st.GetEntry(ptype, Xm, ssF);
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXp);
|
GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXp);
|
||||||
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xm);
|
Impl::multLinkGpu(lane,Uchi,U,chi,Xm);
|
||||||
accumReconXp(result, Uchi);
|
accumReconXp(result, Uchi);
|
||||||
|
|
||||||
SE = st.GetEntry(ptype, Ym, sF);
|
SE = st.GetEntry(ptype, Ym, ssF);
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYp);
|
GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYp);
|
||||||
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Ym);
|
Impl::multLinkGpu(lane,Uchi,U,chi,Ym);
|
||||||
accumReconYp(result, Uchi);
|
accumReconYp(result, Uchi);
|
||||||
|
|
||||||
SE = st.GetEntry(ptype, Zm, sF);
|
SE = st.GetEntry(ptype, Zm, ssF);
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZp);
|
GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZp);
|
||||||
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zm);
|
Impl::multLinkGpu(lane,Uchi,U,chi,Zm);
|
||||||
accumReconZp(result, Uchi);
|
accumReconZp(result, Uchi);
|
||||||
|
|
||||||
SE = st.GetEntry(ptype, Tm, sF);
|
SE = st.GetEntry(ptype, Tm, ssF);
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTp);
|
GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTp);
|
||||||
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tm);
|
Impl::multLinkGpu(lane,Uchi,U,chi,Tm);
|
||||||
accumReconTp(result, Uchi);
|
accumReconTp(result, Uchi);
|
||||||
|
|
||||||
synchronise();
|
synchronise();
|
||||||
|
#ifdef GPU_VEC
|
||||||
insertLane (lane,out[sF],result);
|
insertLane (lane,out[sF],result);
|
||||||
|
#else
|
||||||
|
vstream(out[sF], result);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
@ -213,20 +253,20 @@ accelerator void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeF
|
|||||||
// Template specialise Gparity to empty for now
|
// Template specialise Gparity to empty for now
|
||||||
#define GPU_EMPTY(A) \
|
#define GPU_EMPTY(A) \
|
||||||
template <> \
|
template <> \
|
||||||
accelerator void \
|
accelerator_inline void \
|
||||||
WilsonKernels<A>::GpuDhopSite(StencilView &st, \
|
WilsonKernels<A>::GpuDhopSite(StencilView &st, \
|
||||||
DoubledGaugeFieldView &U, \
|
SiteDoubledGaugeField &U, \
|
||||||
SiteHalfSpinor *buf, int sF, \
|
SiteHalfSpinor *buf, int Ls, int sF, \
|
||||||
int sU, \
|
int sU, \
|
||||||
const FermionFieldView &in, \
|
const FermionFieldView &in, \
|
||||||
FermionFieldView &out) { assert(0);}; \
|
FermionFieldView &out) { assert(0);}; \
|
||||||
template <> \
|
template <> \
|
||||||
accelerator void \
|
accelerator_inline void \
|
||||||
WilsonKernels<A>::GpuDhopSiteDag(StencilView &st, \
|
WilsonKernels<A>::GpuDhopSiteDag(StencilView &st, \
|
||||||
DoubledGaugeFieldView &U, \
|
DoubledGaugeFieldView &U, \
|
||||||
SiteHalfSpinor *buf, int sF, \
|
SiteHalfSpinor *buf, int Ls,int sF, \
|
||||||
int sU, \
|
int sU, \
|
||||||
const FermionFieldView &in, \
|
const FermionFieldView &in, \
|
||||||
FermionFieldView &out) { assert(0);};
|
FermionFieldView &out) { assert(0);};
|
||||||
|
|
||||||
GPU_EMPTY(GparityWilsonImplF);
|
GPU_EMPTY(GparityWilsonImplF);
|
||||||
@ -234,6 +274,67 @@ GPU_EMPTY(GparityWilsonImplFH);
|
|||||||
GPU_EMPTY(GparityWilsonImplD);
|
GPU_EMPTY(GparityWilsonImplD);
|
||||||
GPU_EMPTY(GparityWilsonImplDF);
|
GPU_EMPTY(GparityWilsonImplDF);
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void WilsonKernels<Impl>::Dhop(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||||
|
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||||
|
int interior,int exterior)
|
||||||
|
{
|
||||||
|
auto U_v = U.View();
|
||||||
|
auto in_v = in.View();
|
||||||
|
auto out_v = out.View();
|
||||||
|
auto st_v = st.View();
|
||||||
|
if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) {
|
||||||
|
const uint64_t nsimd = Simd::Nsimd();
|
||||||
|
const uint64_t NN = Nsite*Ls*nsimd;
|
||||||
|
accelerator_loopN( sss, NN, {
|
||||||
|
uint64_t cur = sss;
|
||||||
|
// uint64_t lane = cur % nsimd;
|
||||||
|
cur = cur / nsimd;
|
||||||
|
uint64_t s = cur%Ls;
|
||||||
|
uint64_t sF = cur; cur = cur / Ls;
|
||||||
|
uint64_t sU = cur;
|
||||||
|
WilsonKernels<Impl>::GpuDhopSite(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v);
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
accelerator_loop( ss, U_v, {
|
||||||
|
int sU = ss;
|
||||||
|
int sF = Ls * sU;
|
||||||
|
DhopSite(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
template <class Impl>
|
||||||
|
void WilsonKernels<Impl>::DhopDag(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||||
|
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||||
|
int interior,int exterior)
|
||||||
|
{
|
||||||
|
auto U_v = U.View();
|
||||||
|
auto in_v = in.View();
|
||||||
|
auto out_v = out.View();
|
||||||
|
auto st_v = st.View();
|
||||||
|
|
||||||
|
if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) {
|
||||||
|
const uint64_t nsimd = Simd::Nsimd();
|
||||||
|
const uint64_t NN = Nsite*Ls*nsimd;
|
||||||
|
accelerator_loopN( sss, NN, {
|
||||||
|
uint64_t cur = sss;
|
||||||
|
// uint64_t lane = cur % nsimd;
|
||||||
|
cur = cur / nsimd;
|
||||||
|
uint64_t s = cur%Ls;
|
||||||
|
uint64_t sF = cur; cur = cur / Ls;
|
||||||
|
uint64_t sU = cur;
|
||||||
|
WilsonKernels<Impl>::GpuDhopSiteDag(st_v,U_v,buf,Ls,s,sU,in_v,out_v);
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
accelerator_loop( ss, U_v, {
|
||||||
|
int sU = ss;
|
||||||
|
int sF = Ls * sU;
|
||||||
|
DhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
GPU_EMPTY(DomainWallVec5dImplF);
|
GPU_EMPTY(DomainWallVec5dImplF);
|
||||||
GPU_EMPTY(DomainWallVec5dImplFH);
|
GPU_EMPTY(DomainWallVec5dImplFH);
|
||||||
|
@ -36,7 +36,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
#define COALESCE_GRANULARITY (16) // bytes for coalesce granularity of target: Pascal, Volta
|
//#define COALESCE_GRANULARITY (64) // bytes for coalesce granularity of target: Pascal, Volta
|
||||||
|
//#define COALESCE_GRANULARITY (32) // bytes for coalesce granularity of target: Pascal, Volta
|
||||||
|
#define COALESCE_GRANULARITY (16) // bytes for coalesce granularity of target: Pascal, Volta
|
||||||
|
|
||||||
template<class pair>
|
template<class pair>
|
||||||
class GpuComplex {
|
class GpuComplex {
|
||||||
|
Loading…
Reference in New Issue
Block a user