/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/qcd/action/fermion/WilsonKernelsGpu.cc Copyright (C) 2018 Author: Peter Boyle This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ #include NAMESPACE_BEGIN(Grid); ////////////////////////////////////////////////////////////// // Gpu implementation; thread loop is implicit ////////////////////////////////////////////////////////////// __host__ __device__ inline void synchronise(void) { #ifdef __CUDA_ARCH__ __syncthreads(); #endif return; } #define GPU_DSLASH_COALESCE #ifdef GPU_DSLASH_COALESCE __host__ __device__ inline int get_my_lanes(int Nsimd) { #ifdef __CUDA_ARCH__ return 1; #else return Nsimd; #endif } __host__ __device__ inline int get_my_lane_offset(int Nsimd) { #ifdef __CUDA_ARCH__ return ( (threadIdx.x) % Nsimd); #else return 0; #endif } //////////////////////////////////////////////////////////////////////// // Extract/Insert a single lane; do this locally in this file. // Don't need a global version really. //////////////////////////////////////////////////////////////////////// template accelerator_inline typename vobj::scalar_object extractLaneGpu(int lane, const vobj & __restrict__ vec) { typedef typename vobj::scalar_object scalar_object; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; constexpr int words=sizeof(vobj)/sizeof(vector_type); constexpr int Nsimd=vector_type::Nsimd(); scalar_object extracted; scalar_type * __restrict__ sp = (scalar_type *)&extracted; // Type pun scalar_type * __restrict__ vp = (scalar_type *)&vec; for(int w=0;w accelerator_inline void insertLaneFloat2(int lane, vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted) { typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; constexpr int words=sizeof(vobj)/sizeof(vector_type); constexpr int Nsimd=vector_type::Nsimd(); float2 * __restrict__ sp = (float2 *)&extracted; float2 * __restrict__ vp = (float2 *)&vec; for(int w=0;w accelerator_inline typename vobj::scalar_object extractLaneFloat2(int lane, const vobj & __restrict__ vec) { typedef typename vobj::scalar_object scalar_object; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; constexpr int words=sizeof(vobj)/sizeof(vector_type); constexpr int Nsimd=vector_type::Nsimd(); scalar_object extracted; float2 * __restrict__ sp = (float2 *)&extracted; // Type pun float2 * __restrict__ vp = (float2 *)&vec; for(int w=0;w_is_local) { \ auto in_l = extractLaneGpu(lane,in[SE->_offset]); \ spProj(chi,in_l); \ } else { \ chi = extractLaneGpu(lane,buf[SE->_offset]); \ } template accelerator void WilsonKernels::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF,int LLs, int sU, const FermionFieldView &in, FermionFieldView &out) { typename SiteHalfSpinor::scalar_object tmp; typename SiteHalfSpinor::scalar_object chi; typename SiteHalfSpinor::scalar_object Uchi; typename SiteSpinor::scalar_object result; typedef typename SiteSpinor::scalar_type scalar_type; typedef typename SiteSpinor::vector_type vector_type; constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type); uint64_t lane_offset= get_my_lane_offset(Nsimd); uint64_t lanes = get_my_lanes (Nsimd); StencilEntry *SE; int ptype; for(int lane = lane_offset;lane accelerator void WilsonKernels::GpuDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF,int LLs, int sU, const FermionFieldView &in, FermionFieldView &out) { typename SiteHalfSpinor::scalar_object tmp; typename SiteHalfSpinor::scalar_object chi; typename SiteHalfSpinor::scalar_object Uchi; typename SiteSpinor::scalar_object result; typedef typename SiteSpinor::scalar_type scalar_type; typedef typename SiteSpinor::vector_type vector_type; constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type); uint64_t lane_offset= get_my_lane_offset(Nsimd); uint64_t lanes = get_my_lanes(Nsimd); // printf("Evaluating site %d Nsimd %d : lanes %ld %ld - %ld\n",sF,Nsimd,lanes,lane_offset,lane_offset+lanes-1); StencilEntry *SE; int ptype; for(int lane = lane_offset;lane \ accelerator void \ WilsonKernels::GpuDhopSite(StencilView &st, \ DoubledGaugeFieldView &U, \ SiteHalfSpinor *buf, int sF, int LLs, \ int sU, \ const FermionFieldView &in, \ FermionFieldView &out) {}; \ template <> \ accelerator void \ WilsonKernels ::GpuDhopSiteDag(StencilView &st, \ DoubledGaugeFieldView &U, \ SiteHalfSpinor *buf, int sF,int LLs, \ int sU, \ const FermionFieldView &in, \ FermionFieldView &out) {}; GPU_EMPTY(GparityWilsonImplF); GPU_EMPTY(GparityWilsonImplFH); GPU_EMPTY(GparityWilsonImplD); GPU_EMPTY(GparityWilsonImplDF); GPU_EMPTY(DomainWallVec5dImplF); GPU_EMPTY(DomainWallVec5dImplFH); GPU_EMPTY(DomainWallVec5dImplD); GPU_EMPTY(DomainWallVec5dImplDF); GPU_EMPTY(ZDomainWallVec5dImplF); GPU_EMPTY(ZDomainWallVec5dImplFH); GPU_EMPTY(ZDomainWallVec5dImplD); GPU_EMPTY(ZDomainWallVec5dImplDF); FermOpTemplateInstantiate(WilsonKernels); NAMESPACE_END(Grid);