/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: ./lib/qcd/action/fermion/WilsonKernelsGpu.cc

Copyright (C) 2018

Author: Peter Boyle

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */

// NOTE(review): the header name that should follow this #include is not
// present in this view of the file -- restore it from the repository copy.
#include
NAMESPACE_BEGIN(Grid);

//////////////////////////////////////////////////////////////
// Gpu implementation; thread loop is implicit ; move to header
//////////////////////////////////////////////////////////////

// Barrier helper.
// When compiled for device code (__CUDA_ARCH__ defined) this calls
// __syncthreads(); in any other compilation pass it does nothing.
accelerator_inline void synchronise(void)
{
#ifdef __CUDA_ARCH__
  __syncthreads();
#endif
  return;
}

// Number of SIMD lanes the caller should iterate over.
//   Nsimd : lane count of the vector type.
// Returns 1 when compiled for device code, otherwise Nsimd.
accelerator_inline int get_my_lanes(int Nsimd)
{
#ifdef __CUDA_ARCH__
  return 1;
#else
  return Nsimd;
#endif
}

// First lane index the caller should process.
//   Nsimd : lane count of the vector type.
// Returns threadIdx.x % Nsimd when compiled for device code, otherwise 0.
accelerator_inline int get_my_lane_offset(int Nsimd)
{
#ifdef __CUDA_ARCH__
  return ( (threadIdx.x) % Nsimd);
#else
  return 0;
#endif
}

// Fetch and spin-project one stencil leg for a single lane.
//
// Expects these names in the enclosing scope: SE, ptype, lane, Nsimd, chi,
// in, buf.
//   Dir    : not referenced anywhere in the macro body.
//   spProj : callable invoked as spProj(chi, in_l).
//
// Steps, in order:
//   1. synchronise()
//   2. if SE->_is_local: pick the source lane (lane ^ mask when SE->_permute
//      is set, with mask = Nsimd >> (ptype + 1); otherwise lane itself),
//      extract that lane from in[SE->_offset] and apply spProj into chi.
//      Otherwise: extract `lane` from buf[SE->_offset] directly into chi.
//   3. synchronise()
//
// Both synchronise() calls sit outside the if/else, so every thread that
// expands this macro reaches both of them whichever branch it takes.
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)	\
  synchronise();					\
  if (SE->_is_local) {					\
    int mask = Nsimd >> (ptype + 1);			\
    int plane= SE->_permute ? (lane ^ mask) : lane;	\
    auto in_l = extractLane(plane,in[SE->_offset]);	\
    spProj(chi,in_l);					\
  } else {						\
    chi = extractLane(lane,buf[SE->_offset]);		\
  }							\
  synchronise();

// NOTE(review): from here to the GPU_EMPTY(...) invocations the text looks
// truncated in this view. The template parameter lists, the rest of each
// lane loop, the closing braces of both kernels and the "#define GPU_EMPTY"
// line are not visible. The code tokens below are reproduced exactly as they
// appear; restore the missing text from the repository copy before building.
template accelerator void WilsonKernels::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
							SiteHalfSpinor *buf, int sF,
							int sU, const FermionFieldView &in, FermionFieldView &out)
{
  // Per-lane scalar temporaries.
  typename SiteHalfSpinor::scalar_object chi;
  typename SiteHalfSpinor::scalar_object Uchi;
  typename SiteSpinor::scalar_object result;
  typedef typename SiteSpinor::scalar_type scalar_type;
  typedef typename SiteSpinor::vector_type vector_type;
  // Lane count: how many scalar_type elements fit in one vector_type.
  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);

  // Lane range for this caller, as reported by the helpers above.
  uint64_t lane_offset= get_my_lane_offset(Nsimd);
  uint64_t lanes = get_my_lanes(Nsimd);

  StencilEntry *SE;
  int ptype;

#ifndef __CUDA_ARCH__
  for(int lane = lane_offset;lane accelerator void WilsonKernels::GpuDhopSite(StencilView &st, DoubledGaugeFieldView &U,
						     SiteHalfSpinor *buf, int sF,
						     int sU, const FermionFieldView &in, FermionFieldView &out)
{
  // Per-lane scalar temporaries.
  typename SiteHalfSpinor::scalar_object chi;
  typename SiteHalfSpinor::scalar_object Uchi;
  typename SiteSpinor::scalar_object result;
  typedef typename SiteSpinor::scalar_type scalar_type;
  typedef typename SiteSpinor::vector_type vector_type;
  // Lane count: how many scalar_type elements fit in one vector_type.
  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);

  // Lane range for this caller, as reported by the helpers above.
  uint64_t lane_offset= get_my_lane_offset(Nsimd);
  uint64_t lanes = get_my_lanes(Nsimd);

  StencilEntry *SE;
  int ptype;

#ifndef __CUDA_ARCH__
  for(int lane = lane_offset;lane \
accelerator void \
WilsonKernels::GpuDhopSite(StencilView &st, \
			   DoubledGaugeFieldView &U, \
			   SiteHalfSpinor *buf, int sF, \
			   int sU, \
			   const FermionFieldView &in, \
			   FermionFieldView &out) { assert(0);}; \
  template <> \
accelerator void \
WilsonKernels::GpuDhopSiteDag(StencilView &st, \
			      DoubledGaugeFieldView &U, \
			      SiteHalfSpinor *buf, int sF, \
			      int sU, \
			      const FermionFieldView &in, \
			      FermionFieldView &out) { assert(0);};

// Stub the GPU kernels for the G-parity implementations. The visible stub
// text above gives both GpuDhopSite and GpuDhopSiteDag a body of assert(0).
GPU_EMPTY(GparityWilsonImplF);
GPU_EMPTY(GparityWilsonImplFH);
GPU_EMPTY(GparityWilsonImplD);
// Last of the G-parity GPU_EMPTY invocations.
// NOTE(review): the "#define GPU_EMPTY" line is not visible in this view;
// presumably each invocation supplies assert(0) stub specialisations of
// GpuDhopSite / GpuDhopSiteDag for the named implementation -- confirm.
GPU_EMPTY(GparityWilsonImplDF);

// The DomainWallVec5d / ZDomainWallVec5d stubs below are commented out in
// the source, so no GPU_EMPTY expansion is emitted for those names here.
/*
GPU_EMPTY(DomainWallVec5dImplF);
GPU_EMPTY(DomainWallVec5dImplFH);
GPU_EMPTY(DomainWallVec5dImplD);
GPU_EMPTY(DomainWallVec5dImplDF);
GPU_EMPTY(ZDomainWallVec5dImplF);
GPU_EMPTY(ZDomainWallVec5dImplFH);
GPU_EMPTY(ZDomainWallVec5dImplD);
GPU_EMPTY(ZDomainWallVec5dImplDF);
*/

// Instantiation macros applied to WilsonKernels. Their definitions are not
// in this file; presumably each one explicitly instantiates the template for
// one family of fermion implementations -- confirm against the header that
// defines them.
FermOpTemplateInstantiate(WilsonKernels);
AdjointFermOpTemplateInstantiate(WilsonKernels);
TwoIndexFermOpTemplateInstantiate(WilsonKernels);

NAMESPACE_END(Grid);