1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-15 02:05:37 +00:00
Grid/lib/qcd/action/fermion/WilsonKernelsGpu.cc

254 lines
7.6 KiB
C++

/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernelsGpu.cc
Copyright (C) 2018
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////
// GPU implementation; the thread loop is implicit. TODO: move to a header.
//////////////////////////////////////////////////////////////
// Barrier helper: when compiling device code (__CUDA_ARCH__ defined) this
// issues __syncthreads(); in every other compilation pass it does nothing.
accelerator_inline void synchronise(void)
{
#ifdef __CUDA_ARCH__
  __syncthreads();
#endif
}
// Number of SIMD lanes the caller has to process itself:
// exactly one when compiling device code (__CUDA_ARCH__ defined),
// all Nsimd lanes otherwise.
accelerator_inline int get_my_lanes(int Nsimd)
{
#ifdef __CUDA_ARCH__
  const int my_lanes = 1;
#else
  const int my_lanes = Nsimd;
#endif
  return my_lanes;
}
// Index of the first SIMD lane the caller has to process:
// threadIdx.x modulo Nsimd when compiling device code (__CUDA_ARCH__
// defined), lane 0 otherwise.
accelerator_inline int get_my_lane_offset(int Nsimd)
{
#ifdef __CUDA_ARCH__
  const int first_lane = ( (threadIdx.x) % Nsimd);
#else
  const int first_lane = 0;
#endif
  return first_lane;
}
// Load the half spinor `chi` for one stencil leg and one SIMD lane.
//
// The macro relies on these names being visible at the point of use:
//   SE (stencil entry for the leg), ptype, Nsimd, lane, in, buf, chi.
//
//  * SE->_is_local set : extract one lane of in[SE->_offset] and apply the
//    projector `spProj` to it, writing chi. When SE->_permute is set the lane
//    read is (lane ^ mask) with mask = Nsimd >> (ptype + 1); otherwise it is
//    `lane` itself.
//  * otherwise         : chi is lane `lane` of buf[SE->_offset]; no projector
//    is applied on this path.
//
// synchronise() is called before and after the load, outside the branch.
//
// `Dir` is not used by the expansion; it is kept so call sites name the leg.
//
// The body is wrapped in do { } while(0) so that the macro expands to a
// single statement and is safe under an unbraced if/else; every use must be
// followed by a semicolon.
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)		\
  do {								\
    synchronise();						\
    if (SE->_is_local) {					\
      int mask = Nsimd >> (ptype + 1);				\
      int plane= SE->_permute ? (lane ^ mask) : lane;		\
      auto in_l = extractLane(plane,in[SE->_offset]);		\
      spProj(chi,in_l);						\
    } else {							\
      chi = extractLane(lane,buf[SE->_offset]);			\
    }								\
    synchronise();						\
  } while(0)
//////////////////////////////////////////////////////////////
// GpuDhopSiteDag
//
// Builds out[sF] one SIMD lane at a time from the eight stencil legs
// (Xp,Yp,Zp,Tp,Xm,Ym,Zm,Tm). For every leg:
//   1. look up the stencil entry for (direction, sF), which also sets ptype;
//   2. load the half spinor chi via GPU_COALESCED_STENCIL_LEG_PROJ
//      (from `in` with a projector when the entry is local, else from `buf`);
//   3. multiply by the gauge link U[sU] with Impl::multLinkGpu;
//   4. reconstruct into `result` (the first leg uses spRecon*, the others
//      accumRecon* -- presumably the first call initialises `result`).
// In this variant a leg in direction D uses the projector/reconstruct pair
// with the same sign as D (Xp -> spProjXp/spReconXp, Xm -> spProjXm/...),
// the opposite pairing to GpuDhopSite.
//
// st  : stencil view used to look up entries
// U   : doubled gauge field view, indexed with sU
// buf : half-spinor buffer read for non-local entries
// sF  : site index used for the stencil lookup and for the write to out
// sU  : site index into U
// in  : input fermion field view
// out : output fermion field view; only lane(s) handled here are written
//
// Host compilation loops over all Nsimd lanes; device compilation
// (__CUDA_ARCH__) handles the single lane given by get_my_lane_offset().
//////////////////////////////////////////////////////////////
template <class Impl>
accelerator void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
                                                     SiteHalfSpinor *buf, int sF,
                                                     int sU, const FermionFieldView &in, FermionFieldView &out)
{
  typename SiteHalfSpinor::scalar_object chi;
  typename SiteHalfSpinor::scalar_object Uchi;
  typename SiteSpinor::scalar_object   result;
  typedef typename SiteSpinor::scalar_type scalar_type;
  typedef typename SiteSpinor::vector_type vector_type;
  // Number of scalar elements packed in one vector element.
  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);

  // Kept as int: the helpers return int and the lane index is an int, so
  // this avoids a signed/unsigned comparison and an implicit narrowing.
  const int lane_offset = get_my_lane_offset(Nsimd);
  const int lanes       = get_my_lanes(Nsimd);

  StencilEntry *SE;
  int ptype;
#ifndef __CUDA_ARCH__
  for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
#else
  int lane = lane_offset; {
#endif
    // Forward legs
    SE = st.GetEntry(ptype, Xp, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xp);
    spReconXp(result, Uchi);

    SE = st.GetEntry(ptype, Yp, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Yp);
    accumReconYp(result, Uchi);

    SE = st.GetEntry(ptype, Zp, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zp);
    accumReconZp(result, Uchi);

    SE = st.GetEntry(ptype, Tp, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tp);
    accumReconTp(result, Uchi);

    // Backward legs
    SE = st.GetEntry(ptype, Xm, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xm);
    accumReconXm(result, Uchi);

    SE = st.GetEntry(ptype, Ym, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Ym);
    accumReconYm(result, Uchi);

    SE = st.GetEntry(ptype, Zm, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zm);
    accumReconZm(result, Uchi);

    SE = st.GetEntry(ptype, Tm, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tm);
    accumReconTm(result, Uchi);

    // Barrier (device only) before this lane's result is written back.
    synchronise();
    insertLane (lane,out[sF],result);
  }
}
//////////////////////////////////////////////////////////////
// GpuDhopSite
//
// Builds out[sF] one SIMD lane at a time from the eight stencil legs
// (Xp,Yp,Zp,Tp,Xm,Ym,Zm,Tm). For every leg:
//   1. look up the stencil entry for (direction, sF), which also sets ptype;
//   2. load the half spinor chi via GPU_COALESCED_STENCIL_LEG_PROJ
//      (from `in` with a projector when the entry is local, else from `buf`);
//   3. multiply by the gauge link U[sU] with Impl::multLinkGpu;
//   4. reconstruct into `result` (the first leg uses spRecon*, the others
//      accumRecon* -- presumably the first call initialises `result`).
// In this variant a leg in direction D uses the projector/reconstruct pair
// with the opposite sign to D (Xp -> spProjXm/spReconXm, Xm -> spProjXp/...),
// the opposite pairing to GpuDhopSiteDag.
//
// st  : stencil view used to look up entries
// U   : doubled gauge field view, indexed with sU
// buf : half-spinor buffer read for non-local entries
// sF  : site index used for the stencil lookup and for the write to out
// sU  : site index into U
// in  : input fermion field view
// out : output fermion field view; only lane(s) handled here are written
//
// Host compilation loops over all Nsimd lanes; device compilation
// (__CUDA_ARCH__) handles the single lane given by get_my_lane_offset().
//////////////////////////////////////////////////////////////
template <class Impl>
accelerator void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeFieldView &U,
                                                  SiteHalfSpinor *buf, int sF,
                                                  int sU, const FermionFieldView &in, FermionFieldView &out)
{
  typename SiteHalfSpinor::scalar_object chi;
  typename SiteHalfSpinor::scalar_object Uchi;
  typename SiteSpinor::scalar_object   result;
  typedef typename SiteSpinor::scalar_type scalar_type;
  typedef typename SiteSpinor::vector_type vector_type;
  // Number of scalar elements packed in one vector element.
  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);

  // Kept as int: the helpers return int and the lane index is an int, so
  // this avoids a signed/unsigned comparison and an implicit narrowing.
  const int lane_offset = get_my_lane_offset(Nsimd);
  const int lanes       = get_my_lanes(Nsimd);

  StencilEntry *SE;
  int ptype;
#ifndef __CUDA_ARCH__
  for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
#else
  int lane = lane_offset; {
#endif
    // Forward legs
    SE = st.GetEntry(ptype, Xp, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xp);
    spReconXm(result, Uchi);

    SE = st.GetEntry(ptype, Yp, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Yp);
    accumReconYm(result, Uchi);

    SE = st.GetEntry(ptype, Zp, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zp);
    accumReconZm(result, Uchi);

    SE = st.GetEntry(ptype, Tp, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tp);
    accumReconTm(result, Uchi);

    // Backward legs
    SE = st.GetEntry(ptype, Xm, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xm);
    accumReconXp(result, Uchi);

    SE = st.GetEntry(ptype, Ym, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Ym);
    accumReconYp(result, Uchi);

    SE = st.GetEntry(ptype, Zm, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zm);
    accumReconZp(result, Uchi);

    SE = st.GetEntry(ptype, Tm, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tm);
    accumReconTp(result, Uchi);

    // Barrier (device only) before this lane's result is written back.
    synchronise();
    insertLane (lane,out[sF],result);
  }
}
// The G-parity implementations get no GPU kernels for now: both GPU entry
// points are explicitly specialised to stubs whose body is assert(0), so
// reaching one trips the assertion (and does nothing when asserts are
// compiled out).
//
// GPU_EMPTY_KERNEL(A,Kernel) emits the stub specialisation of one kernel
// for implementation A; GPU_EMPTY(A) emits the stubs for GpuDhopSite and
// GpuDhopSiteDag, in that order.
#define GPU_EMPTY_KERNEL(A,Kernel)				\
  template <>							\
  accelerator void						\
  WilsonKernels<A>::Kernel(StencilView &st,			\
                           DoubledGaugeFieldView &U,		\
                           SiteHalfSpinor *buf, int sF,		\
                           int sU,				\
                           const FermionFieldView &in,		\
                           FermionFieldView &out) { assert(0); }

#define GPU_EMPTY(A)				\
  GPU_EMPTY_KERNEL(A,GpuDhopSite)		\
  GPU_EMPTY_KERNEL(A,GpuDhopSiteDag)

GPU_EMPTY(GparityWilsonImplF);
GPU_EMPTY(GparityWilsonImplFH);
GPU_EMPTY(GparityWilsonImplD);
GPU_EMPTY(GparityWilsonImplDF);
/*
GPU_EMPTY(DomainWallVec5dImplF);
GPU_EMPTY(DomainWallVec5dImplFH);
GPU_EMPTY(DomainWallVec5dImplD);
GPU_EMPTY(DomainWallVec5dImplDF);
GPU_EMPTY(ZDomainWallVec5dImplF);
GPU_EMPTY(ZDomainWallVec5dImplFH);
GPU_EMPTY(ZDomainWallVec5dImplD);
GPU_EMPTY(ZDomainWallVec5dImplDF);
*/
// Instantiation macros applied to WilsonKernels. The macros are defined
// outside this file; presumably each expands to explicit instantiations of
// WilsonKernels<Impl> for one family of implementations (fermion, adjoint,
// two-index) -- confirm against their definitions.
FermOpTemplateInstantiate(WilsonKernels);
AdjointFermOpTemplateInstantiate(WilsonKernels);
TwoIndexFermOpTemplateInstantiate(WilsonKernels);
NAMESPACE_END(Grid);