diff --git a/lib/qcd/action/fermion/WilsonKernelsGpu.cc b/lib/qcd/action/fermion/WilsonKernelsGpu.cc
index 6bc01386..6a478a31 100644
--- a/lib/qcd/action/fermion/WilsonKernelsGpu.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsGpu.cc
@@ -31,59 +31,150 @@ directory
 NAMESPACE_BEGIN(Grid);
 
 //////////////////////////////////////////////////////////////
-// Gpu implementation; view code at a premium and less unroll
+// Gpu implementation; thread loop is implicit
 //////////////////////////////////////////////////////////////
-
-#define GPU_STENCIL_LEG_PROJ(Dir,spProj)		\
+__host__ __device__ inline void synchronise(void)
+{
+#ifdef __CUDA_ARCH__
+  __syncthreads();
+#endif
+  return;
+}
+
+#define GPU_DSLASH_COALESCE
+#ifdef GPU_DSLASH_COALESCE
+
+__host__ __device__ inline int get_my_lanes(int Nsimd)
+{
+#ifdef __CUDA_ARCH__
+  return 1;
+#else
+  return Nsimd;
+#endif
+}
+__host__ __device__ inline int get_my_lane_offset(int Nsimd)
+{
+#ifdef __CUDA_ARCH__
+  return ( (threadIdx.x) % Nsimd);
+#else
+  return 0;
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////
+// Extract/Insert a single lane; do this locally in this file.
+// Don't need a global version really.
+////////////////////////////////////////////////////////////////////////
+template <class vobj> accelerator_inline
+typename vobj::scalar_object extractLaneGpu(int lane, const vobj & __restrict__ vec)
+{
+  typedef typename vobj::scalar_object scalar_object;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
+  constexpr int words=sizeof(vobj)/sizeof(vector_type);
+  constexpr int Nsimd=vector_type::Nsimd();
+
+  scalar_object extracted;
+  scalar_type * __restrict__ sp = (scalar_type *)&extracted; // Type pun
+  scalar_type * __restrict__ vp = (scalar_type *)&vec;
+  for(int w=0;w<words;w++){
+    sp[w] = vp[w*Nsimd+lane];
+  }
+  return extracted;
+}
+
+template <class vobj> accelerator_inline
+void insertLaneFloat2(int lane, vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted)
+{
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
+  constexpr int words=sizeof(vobj)/sizeof(vector_type);
+  constexpr int Nsimd=vector_type::Nsimd();
+
+  float2 * __restrict__ sp = (float2 *)&extracted;
+  float2 * __restrict__ vp = (float2 *)&vec;
+  for(int w=0;w<words;w++){
+    vp[w*Nsimd+lane] = sp[w];
+  }
+}
+
+template <class vobj> accelerator_inline
+typename vobj::scalar_object extractLaneFloat2(int lane, const vobj & __restrict__ vec)
+{
+  typedef typename vobj::scalar_object scalar_object;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
+  constexpr int words=sizeof(vobj)/sizeof(vector_type);
+  constexpr int Nsimd=vector_type::Nsimd();
+
+  scalar_object extracted;
+  float2 * __restrict__ sp = (float2 *)&extracted; // Type pun
+  float2 * __restrict__ vp = (float2 *)&vec;
+  for(int w=0;w<words;w++){
+    sp[w] = vp[w*Nsimd+lane];
+  }
+  return extracted;
+}
+
+#define GPU_STENCIL_LEG_PROJ(Dir,spProj)                \
   SE = st.GetEntry(ptype, Dir, sF);                     \
   if (SE->_is_local) {                                  \
-    spProj(chi, in[SE->_offset]);                       \
-    if (SE->_permute) {                                 \
-      permute(tmp, chi, ptype);                         \
-      chi = tmp;                                        \
-    }                                                   \
+    auto in_l = extractLaneGpu(lane,in[SE->_offset]);   \
+    spProj(chi,in_l);                                   \
   } else {                                              \
-    chi = buf[SE->_offset];                             \
+    chi = extractLaneGpu(lane,buf[SE->_offset]);        \
   }
 
-#define GPU_STENCIL_LEG_RECON(Recon) Recon(result, Uchi);
-// Xp is mu= 0
 template <class Impl> accelerator
 void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
-					 SiteHalfSpinor *buf, int sF,
+					 SiteHalfSpinor *buf, int sF,int LLs,
 					 int sU, const FermionFieldView &in, FermionFieldView &out)
 {
-  SiteHalfSpinor tmp;
-  SiteHalfSpinor chi;
-  SiteHalfSpinor Uchi;
-  SiteSpinor result;
+  typename SiteHalfSpinor::scalar_object tmp;
+  typename SiteHalfSpinor::scalar_object chi;
+  typename SiteHalfSpinor::scalar_object Uchi;
+  typename SiteSpinor::scalar_object result;
+
+  typedef typename SiteSpinor::scalar_type scalar_type;
+  typedef typename SiteSpinor::vector_type vector_type;
+  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
+
+  uint64_t lane_offset= get_my_lane_offset(Nsimd);
+  uint64_t lanes      = get_my_lanes (Nsimd);
+
   StencilEntry *SE; int ptype;
+  for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
@@ ... @@ void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGau
     default: accumReconTm(result, Uchi); break;
     }
+    synchronise();
   }
-  vstream(out[sF], result);
+  insertLane (lane,out[sF],result);
+  sF++;
+  }}
 };
 
 template <class Impl> accelerator
 void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeFieldView &U,
-				      SiteHalfSpinor *buf, int sF,
+				      SiteHalfSpinor *buf, int sF,int LLs,
 				      int sU, const FermionFieldView &in, FermionFieldView &out)
 {
-  SiteHalfSpinor tmp;
-  SiteHalfSpinor chi;
-  SiteHalfSpinor Uchi;
-  SiteSpinor result;
+  typename SiteHalfSpinor::scalar_object tmp;
+  typename SiteHalfSpinor::scalar_object chi;
+  typename SiteHalfSpinor::scalar_object Uchi;
+  typename SiteSpinor::scalar_object result;
+  typedef typename SiteSpinor::scalar_type scalar_type;
+  typedef typename SiteSpinor::vector_type vector_type;
+
+  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
+
+  uint64_t lane_offset= get_my_lane_offset(Nsimd);
+  uint64_t lanes      = get_my_lanes(Nsimd);
+
+  // printf("Evaluating site %d Nsimd %d : lanes %ld %ld - %ld\n",sF,Nsimd,lanes,lane_offset,lane_offset+lanes-1);
+
  StencilEntry *SE; int ptype;
+  for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
@@ ... @@ void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeF
       accumReconTp(result, Uchi); break;
     }
   }
-  vstream(out[sF], result);
-
+#endif
+  insertLaneFloat2 (lane,out[sF],result);
+  sF++;
+  }}
 };
+// Template specialise Gparity to empty for now
+#define GPU_EMPTY(A)                                            \
+  template <>                                                   \
+accelerator void                                                \
+WilsonKernels<A>::GpuDhopSite(StencilView &st,                  \
+                              DoubledGaugeFieldView &U,         \
+                              SiteHalfSpinor *buf, int sF, int LLs, \
+                              int sU,                           \
+                              const FermionFieldView &in,       \
+                              FermionFieldView &out) {};        \
+  template <>                                                   \
+  accelerator void                                              \
+  WilsonKernels<A>::GpuDhopSiteDag(StencilView &st,             \
+                                   DoubledGaugeFieldView &U,    \
+                                   SiteHalfSpinor *buf, int sF,int LLs, \
+                                   int sU,                      \
+                                   const FermionFieldView &in,  \
+                                   FermionFieldView &out) {};
+
+GPU_EMPTY(GparityWilsonImplF);
+GPU_EMPTY(GparityWilsonImplFH);
+GPU_EMPTY(GparityWilsonImplD);
+GPU_EMPTY(GparityWilsonImplDF);
+GPU_EMPTY(DomainWallVec5dImplF);
+GPU_EMPTY(DomainWallVec5dImplFH);
+GPU_EMPTY(DomainWallVec5dImplD);
+GPU_EMPTY(DomainWallVec5dImplDF);
+GPU_EMPTY(ZDomainWallVec5dImplF);
+GPU_EMPTY(ZDomainWallVec5dImplFH);
+GPU_EMPTY(ZDomainWallVec5dImplD);
+GPU_EMPTY(ZDomainWallVec5dImplDF);
+
 FermOpTemplateInstantiate(WilsonKernels);
 
 NAMESPACE_END(Grid);
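For reference, the lane-coalescing idea the patch introduces can be illustrated outside of Grid with a minimal, self-contained CUDA sketch. All names here (`my_lanes`, `my_lane_offset`, `process_site`, `Nsimd = 8`, `Words = 12`, plain `float` data in place of `SiteSpinor`) are hypothetical and chosen only for the illustration; this is a sketch of the technique, not the library's implementation. A vectorised object of `Words` words is stored word-major as `Words * Nsimd` scalars, so element `(w, lane)` lives at `v[w*Nsimd + lane]`; on the device each thread extracts one lane, on the host a single thread loops over all lanes, mirroring the `get_my_lanes` / `get_my_lane_offset` split in the diff above.

```cpp
// Minimal sketch of the lane-coalescing layout (hypothetical names, not Grid code).
#include <cstdio>
#include <cuda_runtime.h>

constexpr int Nsimd = 8;   // SIMD lanes per vectorised object
constexpr int Words = 12;  // scalar words per lane

__host__ __device__ inline int my_lanes(void)
{
#ifdef __CUDA_ARCH__
  return 1;        // device: one lane per GPU thread
#else
  return Nsimd;    // host: a single thread walks all lanes
#endif
}
__host__ __device__ inline int my_lane_offset(void)
{
#ifdef __CUDA_ARCH__
  return threadIdx.x % Nsimd;
#else
  return 0;
#endif
}

// Extract one lane into a per-thread scalar buffer, compute, insert it back.
__host__ __device__ inline void process_site(const float *in, float *out)
{
  int offset = my_lane_offset();
  int lanes  = my_lanes();
  for (int lane = offset; lane < offset + lanes; lane++) {
    float tmp[Words];                                                // "scalar_object" for this lane
    for (int w = 0; w < Words; w++) tmp[w] = in[w * Nsimd + lane];   // extract
    for (int w = 0; w < Words; w++) tmp[w] *= 2.0f;                  // compute
    for (int w = 0; w < Words; w++) out[w * Nsimd + lane] = tmp[w];  // insert
  }
}

__global__ void site_kernel(const float *in, float *out, int nsite)
{
  int site = blockIdx.x;                     // one block per vectorised site
  if (site < nsite && threadIdx.x < Nsimd)   // Nsimd threads share a site
    process_site(in + site * Words * Nsimd, out + site * Words * Nsimd);
}

int main(void)
{
  const int nsite = 4, n = nsite * Words * Nsimd;
  float *in, *out;
  cudaMallocManaged(&in, n * sizeof(float));
  cudaMallocManaged(&out, n * sizeof(float));
  for (int i = 0; i < n; i++) in[i] = (float)i;

  site_kernel<<<nsite, Nsimd>>>(in, out, nsite);
  cudaDeviceSynchronize();

  process_site(in, out);   // host path: the same routine loops over all Nsimd lanes
  printf("out[0]=%f (expect %f)\n", out[0], 2.0f * in[0]);

  cudaFree(in); cudaFree(out);
  return 0;
}
```

Under `__CUDA_ARCH__` the lane loop degenerates to a single iteration, so in the kernel neighbouring `threadIdx.x` values read neighbouring addresses for every word `w` of the `w*Nsimd + lane` indexing; that is the access pattern the patch's `extractLaneGpu` / `insertLaneFloat2` helpers aim to expose to the hardware as coalesced loads and stores.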