/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernelsGpu.cc
Copyright (C) 2018
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////
// Gpu implementation; thread loop is implicit; move to header
//////////////////////////////////////////////////////////////
accelerator_inline void synchronise(void)
{
#ifdef __CUDA_ARCH__
  __syncthreads();
#endif
  return;
}
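// Note: __syncthreads() is a barrier across all threads in the CUDA
// thread block; in a host compile __CUDA_ARCH__ is undefined and
// synchronise() reduces to a no-op.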
accelerator_inline int get_my_lanes(int Nsimd)
{
#ifdef __CUDA_ARCH__
  return 1;
#else
  return Nsimd;
#endif
}
accelerator_inline int get_my_lane_offset(int Nsimd)
{
#ifdef __CUDA_ARCH__
  return ( (threadIdx.x) % Nsimd);
#else
  return 0;
#endif
}
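// Execution model encoded by the two helpers above: on the GPU each
// CUDA thread owns exactly one SIMD lane (lanes == 1, offset given by
// threadIdx.x % Nsimd), so the per-lane loop in the kernels below
// collapses to a single iteration. On the host a single thread sweeps
// all Nsimd lanes (offset 0, lanes == Nsimd).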
#ifdef GPU_VEC
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)         \
  synchronise();                                           \
  if (SE->_is_local) {                                     \
    int mask = Nsimd >> (ptype + 1);                       \
    int plane= SE->_permute ? (lane ^ mask) : lane;        \
    auto in_l = extractLane(plane,in[SE->_offset+s]);      \
    spProj(chi,in_l);                                      \
  } else {                                                 \
    chi = extractLane(lane,buf[SE->_offset+s]);            \
  }                                                        \
  synchronise();
#else
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)         \
  if (SE->_is_local) {                                     \
    auto in_t = in[SE->_offset+s];                         \
    if (SE->_permute) {                                    \
      spProj(tmp, in_t);                                   \
      permute(chi, tmp, ptype);                            \
    } else {                                               \
      spProj(chi, in_t);                                   \
    }                                                      \
  } else {                                                 \
    chi = buf[SE->_offset+s];                              \
  }                                                        \
  synchronise();
#endif
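// In the GPU_VEC (coalesced) variant a stencil permutation is not a
// vector-wide shuffle: each lane reads its partner lane, lane ^ mask
// with mask = Nsimd >> (ptype+1), from the neighbour site, which
// reproduces the permute while keeping the loads coalesced across the
// thread block.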
template <class Impl>
accelerator_inline void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
                                                            SiteHalfSpinor *buf, int Ls, int s,
                                                            int sU, const FermionFieldView &in, FermionFieldView &out)
{
#ifdef GPU_VEC
  typename SiteHalfSpinor::scalar_object chi;
  typename SiteHalfSpinor::scalar_object Uchi;
  typename SiteSpinor::scalar_object result;
#else
  SiteHalfSpinor chi;
  SiteHalfSpinor Uchi;
  SiteHalfSpinor tmp;
  SiteSpinor result;
#endif
  typedef typename SiteSpinor::scalar_type scalar_type;
  typedef typename SiteSpinor::vector_type vector_type;
  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);

  uint64_t lane_offset = get_my_lane_offset(Nsimd);
  uint64_t lanes       = get_my_lanes(Nsimd);

  StencilEntry *SE;
  int ptype;
  uint64_t ssF = Ls * sU;
  uint64_t sF  = ssF + s;
#ifndef __CUDA_ARCH__
  for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
#else
  int lane = lane_offset; {
#endif
    SE = st.GetEntry(ptype, Xp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xp);
    spReconXp(result, Uchi);

    SE = st.GetEntry(ptype, Yp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Yp);
    accumReconYp(result, Uchi);

    SE = st.GetEntry(ptype, Zp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zp);
    accumReconZp(result, Uchi);

    SE = st.GetEntry(ptype, Tp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tp);
    accumReconTp(result, Uchi);

    SE = st.GetEntry(ptype, Xm, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xm);
    accumReconXm(result, Uchi);

    SE = st.GetEntry(ptype, Ym, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Ym);
    accumReconYm(result, Uchi);

    SE = st.GetEntry(ptype, Zm, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zm);
    accumReconZm(result, Uchi);

    SE = st.GetEntry(ptype, Tm, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tm);
    accumReconTm(result, Uchi);

    synchronise();
#ifdef GPU_VEC
    insertLane (lane,out[sF],result);
#else
    vstream(out[sF], result);
#endif
  }
}
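// The undaggered kernel below is structurally identical to
// GpuDhopSiteDag above; the only difference is that every leg pairs
// the opposite spin projector with its direction (the Xp leg uses
// spProjXm/spReconXm, and so on), which is how the daggered and
// undaggered Wilson hopping terms differ.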
template <class Impl>
accelerator_inline void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, SiteDoubledGaugeField &U,
                                                         SiteHalfSpinor *buf, int Ls, int s,
                                                         int sU, const FermionFieldView &in, FermionFieldView &out)
{
#ifdef GPU_VEC
  typename SiteHalfSpinor::scalar_object chi;
  typename SiteHalfSpinor::scalar_object Uchi;
  typename SiteSpinor::scalar_object result;
#else
  SiteHalfSpinor chi;
  SiteHalfSpinor Uchi;
  SiteHalfSpinor tmp;
  SiteSpinor result;
#endif
  typedef typename SiteSpinor::scalar_type scalar_type;
  typedef typename SiteSpinor::vector_type vector_type;
  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);

  uint64_t lane_offset = get_my_lane_offset(Nsimd);
  uint64_t lanes       = get_my_lanes(Nsimd);

  StencilEntry *SE;
  int ptype;
  // Forces some degree of coalescence on the table look-ups;
  // could also use wide load instructions on the data structure
  uint64_t ssF = Ls * sU;
  uint64_t sF  = ssF + s;
#ifndef __CUDA_ARCH__
  for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
#else
  int lane = lane_offset; {
#endif
    SE = st.GetEntry(ptype, Xp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXm);
    Impl::multLinkGpu(lane,Uchi,U,chi,Xp);
    spReconXm(result, Uchi);

    SE = st.GetEntry(ptype, Yp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYm);
    Impl::multLinkGpu(lane,Uchi,U,chi,Yp);
    accumReconYm(result, Uchi);

    SE = st.GetEntry(ptype, Zp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZm);
    Impl::multLinkGpu(lane,Uchi,U,chi,Zp);
    accumReconZm(result, Uchi);

    SE = st.GetEntry(ptype, Tp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTm);
    Impl::multLinkGpu(lane,Uchi,U,chi,Tp);
    accumReconTm(result, Uchi);

    SE = st.GetEntry(ptype, Xm, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXp);
    Impl::multLinkGpu(lane,Uchi,U,chi,Xm);
    accumReconXp(result, Uchi);

    SE = st.GetEntry(ptype, Ym, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYp);
    Impl::multLinkGpu(lane,Uchi,U,chi,Ym);
    accumReconYp(result, Uchi);

    SE = st.GetEntry(ptype, Zm, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZp);
    Impl::multLinkGpu(lane,Uchi,U,chi,Zm);
    accumReconZp(result, Uchi);

    SE = st.GetEntry(ptype, Tm, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTp);
    Impl::multLinkGpu(lane,Uchi,U,chi,Tm);
    accumReconTp(result, Uchi);

    synchronise();
#ifdef GPU_VEC
    insertLane (lane,out[sF],result);
#else
    vstream(out[sF], result);
#endif
  }
}
// Template specialise Gparity to empty for now
#define GPU_EMPTY(A)                                                  \
  template <>                                                         \
  accelerator_inline void                                             \
  WilsonKernels<A>::GpuDhopSite(StencilView &st,                      \
                                SiteDoubledGaugeField &U,             \
                                SiteHalfSpinor *buf, int Ls, int sF,  \
                                int sU,                               \
                                const FermionFieldView &in,           \
                                FermionFieldView &out) { assert(0);}; \
  template <>                                                         \
  accelerator_inline void                                             \
  WilsonKernels<A>::GpuDhopSiteDag(StencilView &st,                   \
                                   DoubledGaugeFieldView &U,          \
                                   SiteHalfSpinor *buf, int Ls,int sF,\
                                   int sU,                            \
                                   const FermionFieldView &in,        \
                                   FermionFieldView &out) { assert(0);};

GPU_EMPTY(GparityWilsonImplF);
GPU_EMPTY(GparityWilsonImplFH);
GPU_EMPTY(GparityWilsonImplD);
GPU_EMPTY(GparityWilsonImplDF);
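// Dhop/DhopDag below flatten the 4d volume, the fifth dimension, and
// the SIMD width into one accelerator loop of NN = Nsite*Ls*Nsimd work
// items. A flattened index sss decomposes as: lane = sss % Nsimd
// (recovered on the GPU from threadIdx inside the kernel, hence the
// commented-out line below), then cur = sss / Nsimd gives s = cur % Ls
// and sU = cur / Ls. E.g. with Ls=16 and Nsimd=8, sss=1000 gives
// cur=125, s=13, sU=7.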
template <class Impl>
void WilsonKernels<Impl>::Dhop(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
                               int Ls, int Nsite, const FermionField &in, FermionField &out,
                               int interior,int exterior)
{
  auto U_v   = U.View();
  auto in_v  = in.View();
  auto out_v = out.View();
  auto st_v  = st.View();

  if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) {
    const uint64_t nsimd = Simd::Nsimd();
    const uint64_t NN = Nsite*Ls*nsimd;
    accelerator_loopN( sss, NN, {
      uint64_t cur  = sss;
      //      uint64_t lane = cur % nsimd;
      cur = cur / nsimd;
      uint64_t   s  = cur%Ls;
      uint64_t  sF  = cur;  cur = cur / Ls;
      uint64_t  sU  = cur;
      WilsonKernels<Impl>::GpuDhopSite(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v);
    });
  } else {
    accelerator_loop( ss, U_v, {
      int sU = ss;
      int sF = Ls * sU;
      DhopSite(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
    });
  }
}
template <class Impl>
void WilsonKernels<Impl>::DhopDag(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
                                  int Ls, int Nsite, const FermionField &in, FermionField &out,
                                  int interior,int exterior)
{
  auto U_v   = U.View();
  auto in_v  = in.View();
  auto out_v = out.View();
  auto st_v  = st.View();

  if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) {
    const uint64_t nsimd = Simd::Nsimd();
    const uint64_t NN = Nsite*Ls*nsimd;
    accelerator_loopN( sss, NN, {
      uint64_t cur  = sss;
      //      uint64_t lane = cur % nsimd;
      cur = cur / nsimd;
      uint64_t   s  = cur%Ls;
      uint64_t  sF  = cur;  cur = cur / Ls;
      uint64_t  sU  = cur;
      WilsonKernels<Impl>::GpuDhopSiteDag(st_v,U_v,buf,Ls,s,sU,in_v,out_v);
    });
  } else {
    accelerator_loop( ss, U_v, {
      int sU = ss;
      int sF = Ls * sU;
      DhopSiteDag(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
    });
  }
}
/*
GPU_EMPTY(DomainWallVec5dImplF);
GPU_EMPTY(DomainWallVec5dImplFH);
GPU_EMPTY(DomainWallVec5dImplD);
GPU_EMPTY(DomainWallVec5dImplDF);
GPU_EMPTY(ZDomainWallVec5dImplF);
GPU_EMPTY(ZDomainWallVec5dImplFH);
GPU_EMPTY(ZDomainWallVec5dImplD);
GPU_EMPTY(ZDomainWallVec5dImplDF);
*/
FermOpTemplateInstantiate(WilsonKernels);
AdjointFermOpTemplateInstantiate(WilsonKernels);
TwoIndexFermOpTemplateInstantiate(WilsonKernels);
NAMESPACE_END(Grid);