mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-09 21:50:45 +01:00
GPU dslash updates
This commit is contained in:
parent
f8e880b445
commit
3a50afe7e7
@ -41,9 +41,6 @@ __host__ __device__ inline void synchronise(void)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
#define GPU_DSLASH_COALESCE
|
|
||||||
#ifdef GPU_DSLASH_COALESCE
|
|
||||||
|
|
||||||
__host__ __device__ inline int get_my_lanes(int Nsimd)
|
__host__ __device__ inline int get_my_lanes(int Nsimd)
|
||||||
{
|
{
|
||||||
#ifdef __CUDA_ARCH__
|
#ifdef __CUDA_ARCH__
|
||||||
@ -118,40 +115,42 @@ typename vobj::scalar_object extractLaneFloat2(int lane, const vobj & __restrict
|
|||||||
return extracted;
|
return extracted;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj) \
|
||||||
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj) \
|
if (SE->_is_local) { \
|
||||||
if (SE->_is_local) { \
|
int mask = Nsimd >> (ptype + 1); \
|
||||||
auto in_l = extractLaneGpu(lane,in[SE->_offset]); \
|
int plane= lane; \
|
||||||
spProj(chi,in_l); \
|
if (SE->_permute) plane = (lane ^ mask); \
|
||||||
} else { \
|
auto in_l = extractLaneGpu(plane,in[SE->_offset]); \
|
||||||
chi = extractLaneGpu(lane,buf[SE->_offset]); \
|
spProj(chi,in_l); \
|
||||||
|
} else { \
|
||||||
|
chi = extractLaneGpu(lane,buf[SE->_offset]); \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
accelerator void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
|
accelerator void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
|
||||||
SiteHalfSpinor *buf, int sF,int LLs,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
typename SiteHalfSpinor::scalar_object tmp;
|
|
||||||
typename SiteHalfSpinor::scalar_object chi;
|
typename SiteHalfSpinor::scalar_object chi;
|
||||||
typename SiteHalfSpinor::scalar_object Uchi;
|
typename SiteHalfSpinor::scalar_object Uchi;
|
||||||
typename SiteSpinor::scalar_object result;
|
typename SiteSpinor::scalar_object result;
|
||||||
|
|
||||||
typedef typename SiteSpinor::scalar_type scalar_type;
|
typedef typename SiteSpinor::scalar_type scalar_type;
|
||||||
typedef typename SiteSpinor::vector_type vector_type;
|
typedef typename SiteSpinor::vector_type vector_type;
|
||||||
|
|
||||||
constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
|
constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
|
||||||
|
|
||||||
uint64_t lane_offset= get_my_lane_offset(Nsimd);
|
uint64_t lane_offset= get_my_lane_offset(Nsimd);
|
||||||
uint64_t lanes = get_my_lanes (Nsimd);
|
uint64_t lanes = get_my_lanes(Nsimd);
|
||||||
|
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int ptype;
|
int ptype;
|
||||||
|
|
||||||
for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
|
for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
|
||||||
for(int s=0;s<LLs;s++){
|
|
||||||
for(int mu=0;mu<2*Nd;mu++) {
|
for(int mu=0;mu<2*Nd;mu++) {
|
||||||
|
|
||||||
SE = st.GetEntry(ptype, mu, sF);
|
SE = st.GetEntry(ptype, mu, sF);
|
||||||
|
|
||||||
switch(mu){
|
switch(mu){
|
||||||
case Xp:
|
case Xp:
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXp); break;
|
GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXp); break;
|
||||||
@ -171,14 +170,12 @@ accelerator void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGau
|
|||||||
default:
|
default:
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTm); break;
|
GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTm); break;
|
||||||
}
|
}
|
||||||
synchronise();
|
|
||||||
|
|
||||||
auto U_l = extractLaneGpu(lane,U[sU](mu));
|
Impl::multLinkGpu(lane,Uchi,U[sU],chi,mu);
|
||||||
Uchi()=U_l*chi();
|
|
||||||
|
|
||||||
switch(mu){
|
switch(mu){
|
||||||
case Xp:
|
case Xp:
|
||||||
spReconXp(result, Uchi); break;
|
spReconXp(result, Uchi); break;
|
||||||
case Yp:
|
case Yp:
|
||||||
accumReconYp(result, Uchi); break;
|
accumReconYp(result, Uchi); break;
|
||||||
case Zp:
|
case Zp:
|
||||||
@ -195,19 +192,16 @@ accelerator void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGau
|
|||||||
default:
|
default:
|
||||||
accumReconTm(result, Uchi); break;
|
accumReconTm(result, Uchi); break;
|
||||||
}
|
}
|
||||||
synchronise();
|
|
||||||
}
|
}
|
||||||
insertLane (lane,out[sF],result);
|
insertLaneFloat2 (lane,out[sF],result);
|
||||||
sF++;
|
}
|
||||||
}}
|
}
|
||||||
};
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
accelerator void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeFieldView &U,
|
accelerator void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeFieldView &U,
|
||||||
SiteHalfSpinor *buf, int sF,int LLs,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
typename SiteHalfSpinor::scalar_object tmp;
|
|
||||||
typename SiteHalfSpinor::scalar_object chi;
|
typename SiteHalfSpinor::scalar_object chi;
|
||||||
typename SiteHalfSpinor::scalar_object Uchi;
|
typename SiteHalfSpinor::scalar_object Uchi;
|
||||||
typename SiteSpinor::scalar_object result;
|
typename SiteSpinor::scalar_object result;
|
||||||
@ -219,14 +213,11 @@ accelerator void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeF
|
|||||||
uint64_t lane_offset= get_my_lane_offset(Nsimd);
|
uint64_t lane_offset= get_my_lane_offset(Nsimd);
|
||||||
uint64_t lanes = get_my_lanes(Nsimd);
|
uint64_t lanes = get_my_lanes(Nsimd);
|
||||||
|
|
||||||
// printf("Evaluating site %d Nsimd %d : lanes %ld %ld - %ld\n",sF,Nsimd,lanes,lane_offset,lane_offset+lanes-1);
|
|
||||||
|
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int ptype;
|
int ptype;
|
||||||
|
|
||||||
for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
|
for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
|
||||||
for(int s=0;s<LLs;s++){
|
#if 0
|
||||||
#if 1
|
|
||||||
int mu=0;
|
int mu=0;
|
||||||
SE = st.GetEntry(ptype, mu, sF);
|
SE = st.GetEntry(ptype, mu, sF);
|
||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXm);
|
GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXm);
|
||||||
@ -294,11 +285,7 @@ accelerator void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeF
|
|||||||
GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTp); break;
|
GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTp); break;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto U_l = extractLaneGpu(lane,U[sU](mu));
|
Impl::multLinkGpu(lane,Uchi,U[sU],chi,mu);
|
||||||
|
|
||||||
auto tmp = U_l * chi();
|
|
||||||
|
|
||||||
Uchi() = tmp;
|
|
||||||
|
|
||||||
switch(mu){
|
switch(mu){
|
||||||
case Xp:
|
case Xp:
|
||||||
@ -322,8 +309,7 @@ accelerator void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeF
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
insertLaneFloat2 (lane,out[sF],result);
|
insertLaneFloat2 (lane,out[sF],result);
|
||||||
sF++;
|
}
|
||||||
}}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// Template specialise Gparity to empty for now
|
// Template specialise Gparity to empty for now
|
||||||
@ -332,23 +318,25 @@ accelerator void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeF
|
|||||||
accelerator void \
|
accelerator void \
|
||||||
WilsonKernels<A>::GpuDhopSite(StencilView &st, \
|
WilsonKernels<A>::GpuDhopSite(StencilView &st, \
|
||||||
DoubledGaugeFieldView &U, \
|
DoubledGaugeFieldView &U, \
|
||||||
SiteHalfSpinor *buf, int sF, int LLs, \
|
SiteHalfSpinor *buf, int sF, \
|
||||||
int sU, \
|
int sU, \
|
||||||
const FermionFieldView &in, \
|
const FermionFieldView &in, \
|
||||||
FermionFieldView &out) {}; \
|
FermionFieldView &out) { assert(0);}; \
|
||||||
template <> \
|
template <> \
|
||||||
accelerator void \
|
accelerator void \
|
||||||
WilsonKernels<A>::GpuDhopSiteDag(StencilView &st, \
|
WilsonKernels<A>::GpuDhopSiteDag(StencilView &st, \
|
||||||
DoubledGaugeFieldView &U, \
|
DoubledGaugeFieldView &U, \
|
||||||
SiteHalfSpinor *buf, int sF,int LLs, \
|
SiteHalfSpinor *buf, int sF, \
|
||||||
int sU, \
|
int sU, \
|
||||||
const FermionFieldView &in, \
|
const FermionFieldView &in, \
|
||||||
FermionFieldView &out) {};
|
FermionFieldView &out) { assert(0);};
|
||||||
|
|
||||||
GPU_EMPTY(GparityWilsonImplF);
|
GPU_EMPTY(GparityWilsonImplF);
|
||||||
GPU_EMPTY(GparityWilsonImplFH);
|
GPU_EMPTY(GparityWilsonImplFH);
|
||||||
GPU_EMPTY(GparityWilsonImplD);
|
GPU_EMPTY(GparityWilsonImplD);
|
||||||
GPU_EMPTY(GparityWilsonImplDF);
|
GPU_EMPTY(GparityWilsonImplDF);
|
||||||
|
|
||||||
|
/*
|
||||||
GPU_EMPTY(DomainWallVec5dImplF);
|
GPU_EMPTY(DomainWallVec5dImplF);
|
||||||
GPU_EMPTY(DomainWallVec5dImplFH);
|
GPU_EMPTY(DomainWallVec5dImplFH);
|
||||||
GPU_EMPTY(DomainWallVec5dImplD);
|
GPU_EMPTY(DomainWallVec5dImplD);
|
||||||
@ -357,8 +345,11 @@ GPU_EMPTY(ZDomainWallVec5dImplF);
|
|||||||
GPU_EMPTY(ZDomainWallVec5dImplFH);
|
GPU_EMPTY(ZDomainWallVec5dImplFH);
|
||||||
GPU_EMPTY(ZDomainWallVec5dImplD);
|
GPU_EMPTY(ZDomainWallVec5dImplD);
|
||||||
GPU_EMPTY(ZDomainWallVec5dImplDF);
|
GPU_EMPTY(ZDomainWallVec5dImplDF);
|
||||||
|
*/
|
||||||
|
|
||||||
FermOpTemplateInstantiate(WilsonKernels);
|
FermOpTemplateInstantiate(WilsonKernels);
|
||||||
|
AdjointFermOpTemplateInstantiate(WilsonKernels);
|
||||||
|
TwoIndexFermOpTemplateInstantiate(WilsonKernels);
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user