mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-09 21:50:45 +01:00
Remove Gpu only kernels.
This commit is contained in:
parent
9fbcfe612c
commit
3e41b1055c
@ -101,47 +101,6 @@ public:
|
|||||||
mult(&phi(), &UU(), &chi());
|
mult(&phi(), &UU(), &chi());
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
#ifdef GPU_VEC
|
|
||||||
static accelerator_inline void copyLinkGpu(int lane,
|
|
||||||
SiteDoubledGaugeField & UU,
|
|
||||||
const SiteDoubledGaugeField &U)
|
|
||||||
{
|
|
||||||
UU = U;
|
|
||||||
}
|
|
||||||
static accelerator_inline void multLinkGpu(int lane,
|
|
||||||
typename SiteHalfSpinor::scalar_object &phi,
|
|
||||||
const SiteDoubledGaugeField &U,
|
|
||||||
const typename SiteHalfSpinor::scalar_object &chi,
|
|
||||||
int mu)
|
|
||||||
{
|
|
||||||
#if 1
|
|
||||||
typedef typename ExtractTypeMap<typename Simd::scalar_type>::extract_type extract_type;
|
|
||||||
|
|
||||||
SiteScalarGaugeLink U_l;
|
|
||||||
|
|
||||||
extract_type * U_mem = (extract_type *) &U(mu);
|
|
||||||
extract_type * U_stack= (extract_type *) &U_l;
|
|
||||||
|
|
||||||
for(int w=0;w<(sizeof(U_l)/sizeof(extract_type)) ;w++) U_stack[w] = U_mem[w];
|
|
||||||
|
|
||||||
phi() = U_l() * chi();
|
|
||||||
#else
|
|
||||||
auto U_l = U(mu);
|
|
||||||
|
|
||||||
phi() = U_l * chi();
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
static accelerator_inline void multLinkGpu(int lane,
|
|
||||||
SiteHalfSpinor &phi,
|
|
||||||
const SiteDoubledGaugeField &U,
|
|
||||||
const SiteHalfSpinor &chi,
|
|
||||||
int mu)
|
|
||||||
{
|
|
||||||
auto U_l = U(mu);
|
|
||||||
phi() = U_l * chi();
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static accelerator_inline void multLinkProp(SitePropagator &phi,
|
static accelerator_inline void multLinkProp(SitePropagator &phi,
|
||||||
const SiteDoubledGaugeField &U,
|
const SiteDoubledGaugeField &U,
|
||||||
|
@ -90,16 +90,6 @@ public:
|
|||||||
auto UU = coalescedRead(U(mu));
|
auto UU = coalescedRead(U(mu));
|
||||||
mult(&phi(), &UU, &chi());
|
mult(&phi(), &UU, &chi());
|
||||||
}
|
}
|
||||||
|
|
||||||
static accelerator_inline void multLinkGpu(int lane,
|
|
||||||
typename SiteHalfSpinor::scalar_object &phi,
|
|
||||||
const SiteDoubledGaugeField &U,
|
|
||||||
const typename SiteHalfSpinor::scalar_object &chi,
|
|
||||||
int mu)
|
|
||||||
{
|
|
||||||
auto U_l = extractLane(lane,U(mu));
|
|
||||||
phi() = U_l * chi();
|
|
||||||
}
|
|
||||||
|
|
||||||
static accelerator_inline void multLinkProp(SitePropagator &phi,
|
static accelerator_inline void multLinkProp(SitePropagator &phi,
|
||||||
const SiteDoubledGaugeField &U,
|
const SiteDoubledGaugeField &U,
|
||||||
|
@ -38,7 +38,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
class WilsonKernelsStatic {
|
class WilsonKernelsStatic {
|
||||||
public:
|
public:
|
||||||
enum { OptGeneric, OptHandUnroll, OptInlineAsm, OptGpu };
|
enum { OptGeneric, OptHandUnroll, OptInlineAsm };
|
||||||
enum { CommsAndCompute, CommsThenCompute };
|
enum { CommsAndCompute, CommsThenCompute };
|
||||||
static int Opt;
|
static int Opt;
|
||||||
static int Comms;
|
static int Comms;
|
||||||
@ -100,12 +100,6 @@ public:
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
// Specialised variants
|
// Specialised variants
|
||||||
static accelerator void GpuDhopSite(StencilView &st, SiteDoubledGaugeField &U, SiteHalfSpinor * buf,
|
|
||||||
int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
|
|
||||||
|
|
||||||
static accelerator void GpuDhopSiteDag(StencilView &st, SiteDoubledGaugeField &U, SiteHalfSpinor * buf,
|
|
||||||
int Ls,int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
|
|
||||||
|
|
||||||
static accelerator void GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
|
static accelerator void GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
|
||||||
int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
|
int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
|
||||||
|
|
||||||
@ -143,25 +137,23 @@ private:
|
|||||||
int sF, int sU, int Ls, int Nsite, const FermionFieldView &in, FermionFieldView &out);
|
int sF, int sU, int Ls, int Nsite, const FermionFieldView &in, FermionFieldView &out);
|
||||||
|
|
||||||
// Keep Hand unrolled temporarily
|
// Keep Hand unrolled temporarily
|
||||||
#if 1
|
|
||||||
static accelerator void HandDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
|
static accelerator void HandDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
|
||||||
int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
|
int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
|
||||||
|
|
||||||
static accelerator void HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
|
static accelerator void HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
|
||||||
int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
|
int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
|
||||||
|
|
||||||
static accelerator void HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
|
static accelerator void HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
|
||||||
int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
|
int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
|
||||||
|
|
||||||
static accelerator void HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
|
static accelerator void HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
|
||||||
int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
|
int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
|
||||||
|
|
||||||
static accelerator void HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
|
static accelerator void HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
|
||||||
int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
|
int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
|
||||||
|
|
||||||
static accelerator void HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
|
static accelerator void HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
|
||||||
int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
|
int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
|
||||||
#endif
|
|
||||||
public:
|
public:
|
||||||
WilsonKernels(const ImplParams &p = ImplParams()) : Base(p){};
|
WilsonKernels(const ImplParams &p = ImplParams()) : Base(p){};
|
||||||
};
|
};
|
||||||
|
@ -33,233 +33,6 @@ directory
|
|||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////
|
|
||||||
// Gpu implementation; thread loop is implicit ; move to header
|
|
||||||
//////////////////////////////////////////////////////////////
|
|
||||||
accelerator_inline int get_my_lanes(int Nsimd)
|
|
||||||
{
|
|
||||||
#ifdef __CUDA_ARCH__
|
|
||||||
return 1;
|
|
||||||
#else
|
|
||||||
return Nsimd;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
accelerator_inline int get_my_lane_offset(int Nsimd)
|
|
||||||
{
|
|
||||||
#ifdef __CUDA_ARCH__
|
|
||||||
return ( (threadIdx.x) % Nsimd);
|
|
||||||
#else
|
|
||||||
return 0;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
|
||||||
{
|
|
||||||
#ifdef __CUDA_ARCH__
|
|
||||||
static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size");
|
|
||||||
uint4 * mem_pun = (uint4 *)mem; // force 128 bit loads
|
|
||||||
uint4 * chip_pun = (uint4 *)&chip;
|
|
||||||
* chip_pun = * mem_pun;
|
|
||||||
#else
|
|
||||||
chip = *mem;
|
|
||||||
#endif
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
#if 1
|
|
||||||
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj) \
|
|
||||||
if (SE._is_local) { \
|
|
||||||
int mask = Nsimd >> (ptype + 1); \
|
|
||||||
int plane= SE._permute ? (lane ^ mask) : lane; \
|
|
||||||
auto in_l = extractLane(plane,in[SE._offset+s]); \
|
|
||||||
spProj(chi,in_l); \
|
|
||||||
} else { \
|
|
||||||
chi = extractLane(lane,buf[SE._offset+s]); \
|
|
||||||
} \
|
|
||||||
synchronise();
|
|
||||||
#else
|
|
||||||
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj) \
|
|
||||||
if (SE._is_local) { \
|
|
||||||
auto in_t = in[SE._offset+s]; \
|
|
||||||
decltype(chi) tmp; \
|
|
||||||
if (SE._permute) { \
|
|
||||||
spProj(tmp, in_t); \
|
|
||||||
permute(chi, tmp, ptype); \
|
|
||||||
} else { \
|
|
||||||
spProj(chi, in_t); \
|
|
||||||
} \
|
|
||||||
} else { \
|
|
||||||
chi = (buf[SE._offset+s]; \
|
|
||||||
} \
|
|
||||||
synchronise();
|
|
||||||
#endif
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
accelerator_inline void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, SiteDoubledGaugeField &U,
|
|
||||||
SiteHalfSpinor *buf, int Ls, int s,
|
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
{
|
|
||||||
typename SiteHalfSpinor::scalar_object chi;
|
|
||||||
typename SiteHalfSpinor::scalar_object Uchi;
|
|
||||||
typename SiteSpinor::scalar_object result;
|
|
||||||
|
|
||||||
typedef typename SiteSpinor::scalar_type scalar_type;
|
|
||||||
typedef typename SiteSpinor::vector_type vector_type;
|
|
||||||
constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
|
|
||||||
|
|
||||||
uint64_t lane_offset= get_my_lane_offset(Nsimd);
|
|
||||||
uint64_t lanes = get_my_lanes(Nsimd);
|
|
||||||
|
|
||||||
StencilEntry *SE_mem;
|
|
||||||
StencilEntry SE;
|
|
||||||
|
|
||||||
int ptype;
|
|
||||||
uint64_t ssF = Ls * sU;
|
|
||||||
uint64_t sF = ssF + s;
|
|
||||||
#ifndef __CUDA_ARCH__
|
|
||||||
for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
|
|
||||||
#else
|
|
||||||
int lane = lane_offset; {
|
|
||||||
#endif
|
|
||||||
SE_mem = st.GetEntry(ptype, Xp, ssF); get_stencil(SE_mem,SE);
|
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXp);
|
|
||||||
Impl::multLinkGpu(lane,Uchi,U,chi,Xp);
|
|
||||||
spReconXp(result, Uchi);
|
|
||||||
|
|
||||||
SE_mem = st.GetEntry(ptype, Yp, ssF); get_stencil(SE_mem,SE);
|
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYp);
|
|
||||||
Impl::multLinkGpu(lane,Uchi,U,chi,Yp);
|
|
||||||
accumReconYp(result, Uchi);
|
|
||||||
|
|
||||||
SE_mem = st.GetEntry(ptype, Zp, ssF); get_stencil(SE_mem,SE);
|
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZp);
|
|
||||||
Impl::multLinkGpu(lane,Uchi,U,chi,Zp);
|
|
||||||
accumReconZp(result, Uchi);
|
|
||||||
|
|
||||||
SE_mem = st.GetEntry(ptype, Tp, ssF); get_stencil(SE_mem,SE);
|
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTp);
|
|
||||||
Impl::multLinkGpu(lane,Uchi,U,chi,Tp);
|
|
||||||
accumReconTp(result, Uchi);
|
|
||||||
|
|
||||||
SE_mem = st.GetEntry(ptype, Xm, ssF); get_stencil(SE_mem,SE);
|
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXm);
|
|
||||||
Impl::multLinkGpu(lane,Uchi,U,chi,Xm);
|
|
||||||
accumReconXm(result, Uchi);
|
|
||||||
|
|
||||||
SE_mem = st.GetEntry(ptype, Ym, ssF); get_stencil(SE_mem,SE);
|
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYm);
|
|
||||||
Impl::multLinkGpu(lane,Uchi,U,chi,Ym);
|
|
||||||
accumReconYm(result, Uchi);
|
|
||||||
|
|
||||||
SE_mem = st.GetEntry(ptype, Zm, ssF); get_stencil(SE_mem,SE);
|
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZm);
|
|
||||||
Impl::multLinkGpu(lane,Uchi,U,chi,Zm);
|
|
||||||
accumReconZm(result, Uchi);
|
|
||||||
|
|
||||||
SE_mem = st.GetEntry(ptype, Tm, ssF); get_stencil(SE_mem,SE);
|
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTm);
|
|
||||||
Impl::multLinkGpu(lane,Uchi,U,chi,Tm);
|
|
||||||
accumReconTm(result, Uchi);
|
|
||||||
insertLane (lane,out[sF],result);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
accelerator_inline void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, SiteDoubledGaugeField &U,
|
|
||||||
SiteHalfSpinor *buf, int Ls, int s,
|
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
{
|
|
||||||
typename SiteHalfSpinor::scalar_object chi;
|
|
||||||
typename SiteHalfSpinor::scalar_object Uchi;
|
|
||||||
typename SiteSpinor::scalar_object result;
|
|
||||||
|
|
||||||
typedef typename SiteSpinor::scalar_type scalar_type;
|
|
||||||
typedef typename SiteSpinor::vector_type vector_type;
|
|
||||||
constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
|
|
||||||
|
|
||||||
uint64_t lane_offset= get_my_lane_offset(Nsimd);
|
|
||||||
uint64_t lanes = get_my_lanes(Nsimd);
|
|
||||||
|
|
||||||
StencilEntry *SE_mem;
|
|
||||||
StencilEntry SE;
|
|
||||||
int ptype;
|
|
||||||
// Forces some degree of coalesce on the table look ups
|
|
||||||
// Could also use wide load instructions on the data structure
|
|
||||||
uint64_t ssF = Ls * sU;
|
|
||||||
uint64_t sF = ssF + s;
|
|
||||||
|
|
||||||
#ifndef __CUDA_ARCH__
|
|
||||||
for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
|
|
||||||
#else
|
|
||||||
int lane = lane_offset; {
|
|
||||||
#endif
|
|
||||||
SE_mem = st.GetEntry(ptype, Xp, ssF); get_stencil(SE_mem,SE);
|
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXm);
|
|
||||||
Impl::multLinkGpu(lane,Uchi,U,chi,Xp);
|
|
||||||
spReconXm(result, Uchi);
|
|
||||||
|
|
||||||
SE_mem = st.GetEntry(ptype, Yp, ssF); get_stencil(SE_mem,SE);
|
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYm);
|
|
||||||
Impl::multLinkGpu(lane,Uchi,U,chi,Yp);
|
|
||||||
accumReconYm(result, Uchi);
|
|
||||||
|
|
||||||
SE_mem = st.GetEntry(ptype, Zp, ssF); get_stencil(SE_mem,SE);
|
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZm);
|
|
||||||
Impl::multLinkGpu(lane,Uchi,U,chi,Zp);
|
|
||||||
accumReconZm(result, Uchi);
|
|
||||||
|
|
||||||
SE_mem = st.GetEntry(ptype, Tp, ssF); get_stencil(SE_mem,SE);
|
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTm);
|
|
||||||
Impl::multLinkGpu(lane,Uchi,U,chi,Tp);
|
|
||||||
accumReconTm(result, Uchi);
|
|
||||||
|
|
||||||
SE_mem = st.GetEntry(ptype, Xm, ssF); get_stencil(SE_mem,SE);
|
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXp);
|
|
||||||
Impl::multLinkGpu(lane,Uchi,U,chi,Xm);
|
|
||||||
accumReconXp(result, Uchi);
|
|
||||||
|
|
||||||
SE_mem = st.GetEntry(ptype, Ym, ssF); get_stencil(SE_mem,SE);
|
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYp);
|
|
||||||
Impl::multLinkGpu(lane,Uchi,U,chi,Ym);
|
|
||||||
accumReconYp(result, Uchi);
|
|
||||||
|
|
||||||
SE_mem = st.GetEntry(ptype, Zm, ssF); get_stencil(SE_mem,SE);
|
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZp);
|
|
||||||
Impl::multLinkGpu(lane,Uchi,U,chi,Zm);
|
|
||||||
accumReconZp(result, Uchi);
|
|
||||||
|
|
||||||
SE_mem = st.GetEntry(ptype, Tm, ssF); get_stencil(SE_mem,SE);
|
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTp);
|
|
||||||
Impl::multLinkGpu(lane,Uchi,U,chi,Tm);
|
|
||||||
accumReconTp(result, Uchi);
|
|
||||||
|
|
||||||
insertLane (lane,out[sF],result);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// Template specialise Gparity to empty for now
|
|
||||||
#define GPU_EMPTY(A) \
|
|
||||||
template <> \
|
|
||||||
accelerator_inline void \
|
|
||||||
WilsonKernels<A>::GpuDhopSite(StencilView &st, \
|
|
||||||
SiteDoubledGaugeField &U, \
|
|
||||||
SiteHalfSpinor *buf, int Ls, int sF, \
|
|
||||||
int sU, \
|
|
||||||
const FermionFieldView &in, \
|
|
||||||
FermionFieldView &out) { assert(0);}; \
|
|
||||||
template <> \
|
|
||||||
accelerator_inline void \
|
|
||||||
WilsonKernels<A>::GpuDhopSiteDag(StencilView &st, \
|
|
||||||
SiteDoubledGaugeField &U, \
|
|
||||||
SiteHalfSpinor *buf, int Ls,int sF, \
|
|
||||||
int sU, \
|
|
||||||
const FermionFieldView &in, \
|
|
||||||
FermionFieldView &out) { assert(0);};
|
|
||||||
|
|
||||||
GPU_EMPTY(GparityWilsonImplF);
|
|
||||||
GPU_EMPTY(GparityWilsonImplFH);
|
|
||||||
GPU_EMPTY(GparityWilsonImplD);
|
|
||||||
GPU_EMPTY(GparityWilsonImplDF);
|
|
||||||
|
|
||||||
#define KERNEL_CALL(A) \
|
#define KERNEL_CALL(A) \
|
||||||
const uint64_t nsimd = Simd::Nsimd(); \
|
const uint64_t nsimd = Simd::Nsimd(); \
|
||||||
@ -282,6 +55,13 @@ GPU_EMPTY(GparityWilsonImplDF);
|
|||||||
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
|
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
|
||||||
});
|
});
|
||||||
|
|
||||||
|
#define ASM_CALL(A) \
|
||||||
|
SIMT_loop( ss, Nsite, { \
|
||||||
|
int sU = ss; \
|
||||||
|
int sF = ss*Ls; \
|
||||||
|
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
|
||||||
|
});
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||||
@ -293,17 +73,25 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
|||||||
auto st_v = st.View();
|
auto st_v = st.View();
|
||||||
|
|
||||||
if( interior && exterior ) {
|
if( interior && exterior ) {
|
||||||
if (Opt == WilsonKernelsStatic::OptGpu) {
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSite); return;}
|
||||||
KERNEL_CALL(GpuDhopSite);
|
#ifndef GRID_NVCC
|
||||||
} else {
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSite); return;}
|
||||||
HOST_CALL(GenericDhopSite);
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;}
|
||||||
}
|
#endif
|
||||||
} else if( interior ) {
|
} else if( interior ) {
|
||||||
HOST_CALL(GenericDhopSiteInt);
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSiteInt); return;}
|
||||||
|
#ifndef GRID_NVCC
|
||||||
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSiteInt); return;}
|
||||||
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
|
||||||
|
#endif
|
||||||
} else if( exterior ) {
|
} else if( exterior ) {
|
||||||
HOST_CALL(GenericDhopSiteExt);
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSiteExt); return;}
|
||||||
|
#ifndef GRID_NVCC
|
||||||
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSiteExt); return;}
|
||||||
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
assert(0 && " Kernel optimisation case not covered ");
|
||||||
}
|
}
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonKernels<Impl>::DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
void WilsonKernels<Impl>::DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||||
@ -315,17 +103,26 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
|||||||
auto out_v = out.View();
|
auto out_v = out.View();
|
||||||
auto st_v = st.View();
|
auto st_v = st.View();
|
||||||
|
|
||||||
if( interior && exterior ) {
|
if( interior && exterior ) {
|
||||||
if (Opt == WilsonKernelsStatic::OptGpu) {
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSiteDag); return;}
|
||||||
KERNEL_CALL(GpuDhopSiteDag);
|
#ifndef GRID_NVCC
|
||||||
} else {
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSiteDag); return;}
|
||||||
HOST_CALL(GenericDhopSiteDag);
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
|
||||||
}
|
#endif
|
||||||
} else if( interior ) {
|
} else if( interior ) {
|
||||||
HOST_CALL(GenericDhopSiteDagInt);
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSiteDagInt); return;}
|
||||||
} else if( exterior ) {
|
#ifndef GRID_NVCC
|
||||||
HOST_CALL(GenericDhopSiteDagExt);
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSiteDagInt); return;}
|
||||||
}
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
|
||||||
|
#endif
|
||||||
|
} else if( exterior ) {
|
||||||
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSiteDagExt); return;}
|
||||||
|
#ifndef GRID_NVCC
|
||||||
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSiteDagExt); return;}
|
||||||
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
assert(0 && " Kernel optimisation case not covered ");
|
||||||
}
|
}
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
@ -38,6 +38,19 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
// Generic implementation; move to different file?
|
// Generic implementation; move to different file?
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
|
|
||||||
|
accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
||||||
|
{
|
||||||
|
#ifdef __CUDA_ARCH__
|
||||||
|
static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size");
|
||||||
|
uint4 * mem_pun = (uint4 *)mem; // force 128 bit loads
|
||||||
|
uint4 * chip_pun = (uint4 *)&chip;
|
||||||
|
* chip_pun = * mem_pun;
|
||||||
|
#else
|
||||||
|
chip = *mem;
|
||||||
|
#endif
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
#define GENERIC_STENCIL_LEG(Dir,spProj,Recon) \
|
#define GENERIC_STENCIL_LEG(Dir,spProj,Recon) \
|
||||||
SE = st.GetEntry(ptype, Dir, sF); \
|
SE = st.GetEntry(ptype, Dir, sF); \
|
||||||
|
@ -433,9 +433,6 @@ void Grid_init(int *argc,char ***argv)
|
|||||||
WilsonKernelsStatic::Opt=WilsonKernelsStatic::OptHandUnroll;
|
WilsonKernelsStatic::Opt=WilsonKernelsStatic::OptHandUnroll;
|
||||||
StaggeredKernelsStatic::Opt=StaggeredKernelsStatic::OptHandUnroll;
|
StaggeredKernelsStatic::Opt=StaggeredKernelsStatic::OptHandUnroll;
|
||||||
}
|
}
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-gpu") ){
|
|
||||||
WilsonKernelsStatic::Opt=WilsonKernelsStatic::OptGpu;
|
|
||||||
}
|
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-asm") ){
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-asm") ){
|
||||||
WilsonKernelsStatic::Opt=WilsonKernelsStatic::OptInlineAsm;
|
WilsonKernelsStatic::Opt=WilsonKernelsStatic::OptInlineAsm;
|
||||||
StaggeredKernelsStatic::Opt=StaggeredKernelsStatic::OptInlineAsm;
|
StaggeredKernelsStatic::Opt=StaggeredKernelsStatic::OptInlineAsm;
|
||||||
|
@ -177,7 +177,6 @@ int main (int argc, char ** argv)
|
|||||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
|
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
|
||||||
#endif
|
#endif
|
||||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
|
||||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGpu ) std::cout << GridLogMessage<< "* Using Gpu WilsonKernels" <<std::endl;
|
|
||||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
|
||||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
||||||
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||||
@ -317,7 +316,6 @@ int main (int argc, char ** argv)
|
|||||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
|
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
|
||||||
#endif
|
#endif
|
||||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
|
||||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGpu ) std::cout << GridLogMessage<< "* Using Gpu WilsonKernels" <<std::endl;
|
|
||||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
|
||||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
||||||
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
||||||
|
@ -358,10 +358,7 @@ case ${ax_cv_cxx_compiler_vendor} in
|
|||||||
esac;;
|
esac;;
|
||||||
intel)
|
intel)
|
||||||
case ${ac_SIMD} in
|
case ${ac_SIMD} in
|
||||||
GPU)
|
GPU|VGPU)
|
||||||
AC_DEFINE([GPU],[1],[GPU float4 vectors])
|
|
||||||
SIMD_FLAGS='';;
|
|
||||||
VGPU)
|
|
||||||
AC_DEFINE([GPU_VEC],[1],[GPU vectorised ])
|
AC_DEFINE([GPU_VEC],[1],[GPU vectorised ])
|
||||||
SIMD_FLAGS='';;
|
SIMD_FLAGS='';;
|
||||||
SSE4)
|
SSE4)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user