/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/qcd/action/fermion/WilsonKernelsGpu.cc

    Copyright (C) 2018

    Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */

#include <Grid/qcd/action/fermion/FermionCore.h>

NAMESPACE_BEGIN(Grid);

//////////////////////////////////////////////////////////////
// Gpu implementation; thread loop is implicit; move to header
//////////////////////////////////////////////////////////////
accelerator_inline void synchronise(void)
{
#ifdef __CUDA_ARCH__
  __syncthreads();
#endif
  return;
}

// On the GPU each thread owns exactly one SIMD lane; on the host a
// single thread sweeps all Nsimd lanes itself.
accelerator_inline int get_my_lanes(int Nsimd)
{
#ifdef __CUDA_ARCH__
  return 1;
#else
  return Nsimd;
#endif
}
accelerator_inline int get_my_lane_offset(int Nsimd)
{
#ifdef __CUDA_ARCH__
  return ( (threadIdx.x) % Nsimd);
#else
  return 0;
#endif
}

accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
{
#if 0
  chip = *mem;
#else
  // Type-pun the 16 byte stencil entry through uint4 so the copy is a
  // single 128 bit wide load rather than several narrow loads.
  assert(sizeof(StencilEntry)==sizeof(uint4));
  uint4 * mem_pun  = (uint4 *)mem;
  uint4 * chip_pun = (uint4 *)&chip;
  * chip_pun = * mem_pun;
#endif
  return;
}

#ifdef GPU_VEC
#if 1
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)      \
  synchronise();                                        \
  if (SE._is_local) {                                   \
    int mask = Nsimd >> (ptype + 1);                    \
    int plane= SE._permute ? (lane ^ mask) : lane;      \
    auto in_l = extractLane(plane,in[SE._offset+s]);    \
    spProj(chi,in_l);                                   \
  } else {                                              \
    chi = extractLane(lane,buf[SE._offset+s]);          \
  }                                                     \
  synchronise();
#else
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)      \
  { int mask = Nsimd >> (ptype + 1);                    \
    int plane= SE._permute ? (lane ^ mask) : lane;      \
    synchronise();                                      \
    auto in_l = extractLane(plane,in[SE._offset+s]);    \
    synchronise();                                      \
    spProj(chi,in_l); }
#endif
#else
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)      \
  synchronise();                                        \
  if (SE._is_local) {                                   \
    auto in_t = in[SE._offset+s];                       \
    if (SE._permute) {                                  \
      spProj(tmp, in_t);                                \
      permute(chi, tmp, ptype);                         \
    } else {                                            \
      spProj(chi, in_t);                                \
    }                                                   \
  } else {                                              \
    chi = buf[SE._offset+s];                            \
  }                                                     \
  synchronise();
#endif
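//////////////////////////////////////////////////////////////
// Illustrative sketch only, not used by the kernels below: with one
// GPU thread per SIMD lane, a stencil permute of type "ptype" becomes
// a read from a partner lane rather than a vector-wide shuffle. This
// helper (our name, not part of Grid) just exposes the mask/plane
// arithmetic used inside GPU_COALESCED_STENCIL_LEG_PROJ above.
//////////////////////////////////////////////////////////////
accelerator_inline int permute_partner_lane(int lane,int ptype,int Nsimd)
{
  int mask = Nsimd >> (ptype + 1); // exchange distance halves as ptype grows
  return lane ^ mask;              // e.g. Nsimd=8, ptype=0: lane 0 <-> lane 4
}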
template <class Impl>
accelerator_inline void
WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
                                    SiteHalfSpinor *buf, int Ls, int s,
                                    int sU, const FermionFieldView &in,
                                    FermionFieldView &out)
{
#ifdef GPU_VEC
  typename SiteHalfSpinor::scalar_object chi;
  typename SiteHalfSpinor::scalar_object Uchi;
  typename SiteSpinor::scalar_object result;
#else
  SiteHalfSpinor chi;
  SiteHalfSpinor Uchi;
  SiteHalfSpinor tmp;
  SiteSpinor result;
#endif
  typedef typename SiteSpinor::scalar_type scalar_type;
  typedef typename SiteSpinor::vector_type vector_type;
  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);

  uint64_t lane_offset= get_my_lane_offset(Nsimd);
  uint64_t lanes      = get_my_lanes(Nsimd);

  StencilEntry *SE_mem;
  StencilEntry SE;
  int ptype;
  uint64_t ssF = Ls * sU;
  uint64_t sF  = ssF + s;
#ifndef __CUDA_ARCH__
  for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
#else
  int lane = lane_offset; {
#endif
    // Eight stencil legs: fetch the (possibly permuted) neighbour,
    // spin project, multiply by the gauge link, spin reconstruct.
    SE_mem = st.GetEntry(ptype,Xp,sF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xp);
    spReconXp(result,Uchi);

    SE_mem = st.GetEntry(ptype,Yp,sF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Yp);
    accumReconYp(result,Uchi);

    SE_mem = st.GetEntry(ptype,Zp,sF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zp);
    accumReconZp(result,Uchi);

    SE_mem = st.GetEntry(ptype,Tp,sF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tp);
    accumReconTp(result,Uchi);

    SE_mem = st.GetEntry(ptype,Xm,sF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xm);
    accumReconXm(result,Uchi);

    SE_mem = st.GetEntry(ptype,Ym,sF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Ym);
    accumReconYm(result,Uchi);

    SE_mem = st.GetEntry(ptype,Zm,sF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zm);
    accumReconZm(result,Uchi);

    SE_mem = st.GetEntry(ptype,Tm,sF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tm);
    accumReconTm(result,Uchi);

#ifdef GPU_VEC
    insertLane(lane,out[sF],result);
#else
    vstream(out[sF],result);
#endif
  }
}

template <class Impl>
accelerator_inline void
WilsonKernels<Impl>::GpuDhopSite(StencilView &st, SiteDoubledGaugeField &U,
                                 SiteHalfSpinor *buf, int Ls, int s,
                                 int sU, const FermionFieldView &in,
                                 FermionFieldView &out)
{
#ifdef GPU_VEC
  typename SiteHalfSpinor::scalar_object chi;
  typename SiteHalfSpinor::scalar_object Uchi;
  typename SiteSpinor::scalar_object result;
#else
  SiteHalfSpinor chi;
  SiteHalfSpinor Uchi;
  SiteHalfSpinor tmp;
  SiteSpinor result;
#endif
  typedef typename SiteSpinor::scalar_type scalar_type;
  typedef typename SiteSpinor::vector_type vector_type;
  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);

  uint64_t lane_offset= get_my_lane_offset(Nsimd);
  uint64_t lanes      = get_my_lanes(Nsimd);

  //  printf (" sU %d s %d Nsimd %d lanes %ld lane_off %ld\n",sU, s, Nsimd, lanes, lane_offset);
  StencilEntry *SE_mem;
  StencilEntry SE;
  int ptype;
  // Forces some degree of coalescing on the table look-ups;
  // could also use wide load instructions on the data structure.
  uint64_t ssF = Ls * sU;
  uint64_t sF  = ssF + s;
#ifndef __CUDA_ARCH__
  for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
#else
  int lane = lane_offset; {
#endif
    // Non-dagger kernel: each leg uses the opposite-sign spin projector.
    SE_mem = st.GetEntry(ptype,Xp,sF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXm);
    Impl::multLinkGpu(lane,Uchi,U,chi,Xp);
    spReconXm(result,Uchi);

    SE_mem = st.GetEntry(ptype,Yp,sF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYm);
    Impl::multLinkGpu(lane,Uchi,U,chi,Yp);
    accumReconYm(result,Uchi);

    SE_mem = st.GetEntry(ptype,Zp,sF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZm);
    Impl::multLinkGpu(lane,Uchi,U,chi,Zp);
    accumReconZm(result,Uchi);

    SE_mem = st.GetEntry(ptype,Tp,sF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTm);
    Impl::multLinkGpu(lane,Uchi,U,chi,Tp);
    accumReconTm(result,Uchi);

    SE_mem = st.GetEntry(ptype,Xm,sF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXp);
    Impl::multLinkGpu(lane,Uchi,U,chi,Xm);
    accumReconXp(result,Uchi);

    SE_mem = st.GetEntry(ptype,Ym,sF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYp);
    Impl::multLinkGpu(lane,Uchi,U,chi,Ym);
    accumReconYp(result,Uchi);

    SE_mem = st.GetEntry(ptype,Zm,sF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZp);
    Impl::multLinkGpu(lane,Uchi,U,chi,Zm);
    accumReconZp(result,Uchi);

    SE_mem = st.GetEntry(ptype,Tm,sF); get_stencil(SE_mem,SE);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTp);
    Impl::multLinkGpu(lane,Uchi,U,chi,Tm);
    accumReconTp(result,Uchi);

#ifdef GPU_VEC
    insertLane(lane,out[sF],result);
#else
    vstream(out[sF],result);
#endif
  }
}

// Gparity has no Gpu kernel yet; stub it out with an assert.
#define GPU_EMPTY(A)                                                    \
  template <>                                                           \
  accelerator_inline void                                               \
  WilsonKernels<A>::GpuDhopSite(StencilView &st,                        \
                                SiteDoubledGaugeField &U,               \
                                SiteHalfSpinor *buf, int Ls, int sF,    \
                                int sU,                                 \
                                const FermionFieldView &in,             \
                                FermionFieldView &out) { assert(0);};   \
  template <>                                                           \
  accelerator_inline void                                               \
  WilsonKernels<A>::GpuDhopSiteDag(StencilView &st,                     \
                                   DoubledGaugeFieldView &U,            \
                                   SiteHalfSpinor *buf, int Ls,int sF,  \
                                   int sU,                              \
                                   const FermionFieldView &in,          \
                                   FermionFieldView &out) { assert(0);};

GPU_EMPTY(GparityWilsonImplF);
GPU_EMPTY(GparityWilsonImplFH);
GPU_EMPTY(GparityWilsonImplD);
GPU_EMPTY(GparityWilsonImplDF);
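//////////////////////////////////////////////////////////////
// Illustrative sketch, not part of the original interface: the GPU
// loops in Dhop/DhopDag below flatten (lane,s,sU) into a single index,
//   sss = lane + nsimd*(s + Ls*sU),
// so consecutive threads touch consecutive SIMD lanes of the same
// site, giving coalesced loads. This helper (our name) just inverts
// that map, mirroring the arithmetic at the top of each
// accelerator_loopN body.
//////////////////////////////////////////////////////////////
accelerator_inline void decompose_gpu_index(uint64_t sss,uint64_t nsimd,uint64_t Ls,
                                            uint64_t &lane,uint64_t &s,uint64_t &sU)
{
  uint64_t cur = sss;
  lane = cur % nsimd; cur = cur / nsimd; // innermost: SIMD lane
  s    = cur % Ls;    cur = cur / Ls;    // then the fifth dimension
  sU   = cur;                            // outermost: 4d site index
}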
template <class Impl>
void WilsonKernels<Impl>::Dhop(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
                               int Ls, int Nsite, const FermionField &in, FermionField &out,
                               int interior,int exterior)
{
  auto U_v   = U.View();
  auto in_v  = in.View();
  auto out_v = out.View();
  auto st_v  = st.View();

  if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) {
    const uint64_t nsimd = Simd::Nsimd();
    const uint64_t NN = Nsite*Ls*nsimd;
    accelerator_loopN( sss, NN, {
      uint64_t cur  = sss;
      //  uint64_t lane = cur % nsimd;
      cur = cur / nsimd;
      uint64_t s  = cur%Ls;
      uint64_t sF = cur;  cur = cur / Ls;
      uint64_t sU = cur;
      WilsonKernels<Impl>::GpuDhopSite(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v);
    });
  } else {
    accelerator_loop( ss, U_v, {
      int sU = ss;
      int sF = Ls * sU;
      DhopSite(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
    });
  }
}

template <class Impl>
void WilsonKernels<Impl>::DhopDag(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
                                  int Ls, int Nsite, const FermionField &in, FermionField &out,
                                  int interior,int exterior)
{
  auto U_v   = U.View();
  auto in_v  = in.View();
  auto out_v = out.View();
  auto st_v  = st.View();

  if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) {
    const uint64_t nsimd = Simd::Nsimd();
    const uint64_t NN = Nsite*Ls*nsimd;
    accelerator_loopN( sss, NN, {
      uint64_t cur  = sss;
      //  uint64_t lane = cur % nsimd;
      cur = cur / nsimd;
      uint64_t s  = cur%Ls;
      uint64_t sF = cur;  cur = cur / Ls;
      uint64_t sU = cur;
      WilsonKernels<Impl>::GpuDhopSiteDag(st_v,U_v,buf,Ls,s,sU,in_v,out_v);
    });
  } else {
    accelerator_loop( ss, U_v, {
      int sU = ss;
      int sF = Ls * sU;
      DhopSiteDag(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
    });
  }
}

/*
GPU_EMPTY(DomainWallVec5dImplF);
GPU_EMPTY(DomainWallVec5dImplFH);
GPU_EMPTY(DomainWallVec5dImplD);
GPU_EMPTY(DomainWallVec5dImplDF);
GPU_EMPTY(ZDomainWallVec5dImplF);
GPU_EMPTY(ZDomainWallVec5dImplFH);
GPU_EMPTY(ZDomainWallVec5dImplD);
GPU_EMPTY(ZDomainWallVec5dImplDF);
*/

FermOpTemplateInstantiate(WilsonKernels);
AdjointFermOpTemplateInstantiate(WilsonKernels);
TwoIndexFermOpTemplateInstantiate(WilsonKernels);

NAMESPACE_END(Grid);
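//////////////////////////////////////////////////////////////
// Usage sketch (hypothetical; the stencil/gauge/fermion setup is
// elided). The Gpu path is taken only when OptGpu is selected and
// both the interior and exterior halves of the stencil are applied:
//
//   WilsonKernels<WilsonImplF>::Dhop(WilsonKernelsStatic::OptGpu,
//                                    st, U, st.CommBuf(),
//                                    Ls, Nsite, in, out,
//                                    1/*interior*/, 1/*exterior*/);
//
// Any other Opt/interior/exterior combination falls back to the
// generic per-site DhopSite loop above.
//////////////////////////////////////////////////////////////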