From 7b59ab5bd74f94ef1fb4fef1872119bea357d196 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 3 Jun 2019 15:46:26 +0100
Subject: [PATCH] Compiling after reorganisation

---
 .../WilsonKernelsGpuImplementation.h          | 341 ++++++++++++++++++
 1 file changed, 341 insertions(+)
 create mode 100644 Grid/qcd/action/fermion/implementation/WilsonKernelsGpuImplementation.h

diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsGpuImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsGpuImplementation.h
new file mode 100644
index 00000000..f3bdb54f
--- /dev/null
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsGpuImplementation.h
@@ -0,0 +1,341 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernelsGpu.cc
+
+Copyright (C) 2018
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/* END LEGAL */
+
+#pragma once
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+NAMESPACE_BEGIN(Grid);
+
+//////////////////////////////////////////////////////////////
+// Gpu implementation; thread loop is implicit ; move to header
+//////////////////////////////////////////////////////////////
+accelerator_inline void synchronise(void)
+{
+#ifdef __CUDA_ARCH__
+  __syncthreads();
+#endif
+  return;
+}
+accelerator_inline int get_my_lanes(int Nsimd)
+{
+#ifdef __CUDA_ARCH__
+  return 1;
+#else
+  return Nsimd;
+#endif
+}
+accelerator_inline int get_my_lane_offset(int Nsimd)
+{
+#ifdef __CUDA_ARCH__
+  return ( (threadIdx.x) % Nsimd);
+#else
+  return 0;
+#endif
+}
+
+accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
+{
+#ifdef __CUDA_ARCH__
+  static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size");
+  uint4 * mem_pun  = (uint4 *)mem;   // force 128 bit loads
+  uint4 * chip_pun = (uint4 *)&chip;
+  * chip_pun = * mem_pun;
+#else
+  chip = *mem;
+#endif
+  return;
+}
+
+#ifdef GPU_VEC
+#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)      \
+  if (SE._is_local) {                                   \
+    int mask = Nsimd >> (ptype + 1);                    \
+    int plane= SE._permute ? (lane ^ mask) : lane;      \
+    auto in_l = extractLane(plane,in[SE._offset+s]);    \
+    spProj(chi,in_l);                                   \
+  } else {                                              \
+    chi = extractLane(lane,buf[SE._offset+s]);          \
+  }                                                     \
+  synchronise();
+#else
+#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)      \
+  if (SE._is_local) {                                   \
+    auto in_t = in[SE._offset+s];                       \
+    if (SE._permute) {                                  \
+      spProj(tmp, in_t);                                \
+      permute(chi, tmp, ptype);                         \
+    } else {                                            \
+      spProj(chi, in_t);                                \
+    }                                                   \
+  } else {                                              \
+    chi = buf[SE._offset+s];                            \
+  }                                                     \
+  synchronise();
+#endif
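+
+//////////////////////////////////////////////////////////////
+// Dslash kernels: one {site,s} worth of work per invocation.
+// On the device each thread handles a single SIMD lane
+// (get_my_lanes()==1); on the host one thread loops over all
+// Nsimd lanes. Each stencil leg spin-projects the neighbour,
+// multiplies by the gauge link, and accumulates the spin
+// reconstruction into "result".
+//////////////////////////////////////////////////////////////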
+
+template <class Impl>
+accelerator_inline void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, SiteDoubledGaugeField &U,
+                                                            SiteHalfSpinor *buf, int Ls, int s,
+                                                            int sU, const FermionFieldView &in, FermionFieldView &out)
+{
+  typename SiteHalfSpinor::scalar_object chi;
+  typename SiteHalfSpinor::scalar_object Uchi;
+  typename SiteSpinor::scalar_object result;
+  typedef typename SiteSpinor::scalar_type scalar_type;
+  typedef typename SiteSpinor::vector_type vector_type;
+  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
+
+  uint64_t lane_offset= get_my_lane_offset(Nsimd);
+  uint64_t lanes      = get_my_lanes(Nsimd);
+
+  StencilEntry *SE_mem;
+  StencilEntry SE;
+
+  int ptype;
+  uint64_t ssF = Ls * sU;
+  uint64_t sF  = ssF + s;
+#ifndef __CUDA_ARCH__
+  for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
+#else
+  int lane = lane_offset; {
+#endif
+    // Xp leg initialises "result"; the remaining legs accumulate into it
+    SE_mem = st.GetEntry(ptype, Xp, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXp);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Xp);   spReconXp(result, Uchi);
+
+    SE_mem = st.GetEntry(ptype, Yp, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYp);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Yp);   accumReconYp(result, Uchi);
+
+    SE_mem = st.GetEntry(ptype, Zp, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZp);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Zp);   accumReconZp(result, Uchi);
+
+    SE_mem = st.GetEntry(ptype, Tp, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTp);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Tp);   accumReconTp(result, Uchi);
+
+    SE_mem = st.GetEntry(ptype, Xm, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXm);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Xm);   accumReconXm(result, Uchi);
+
+    SE_mem = st.GetEntry(ptype, Ym, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYm);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Ym);   accumReconYm(result, Uchi);
+
+    SE_mem = st.GetEntry(ptype, Zm, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZm);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Zm);   accumReconZm(result, Uchi);
+
+    SE_mem = st.GetEntry(ptype, Tm, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTm);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Tm);   accumReconTm(result, Uchi);
+
+    insertLane (lane,out[sF],result);
+  }
+}
+
+template <class Impl>
+accelerator_inline void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, SiteDoubledGaugeField &U,
+                                                         SiteHalfSpinor *buf, int Ls, int s,
+                                                         int sU, const FermionFieldView &in, FermionFieldView &out)
+{
+  typename SiteHalfSpinor::scalar_object chi;
+  typename SiteHalfSpinor::scalar_object Uchi;
+  typename SiteSpinor::scalar_object result;
+  typedef typename SiteSpinor::scalar_type scalar_type;
+  typedef typename SiteSpinor::vector_type vector_type;
+  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
+
+  uint64_t lane_offset= get_my_lane_offset(Nsimd);
+  uint64_t lanes      = get_my_lanes(Nsimd);
+
+  StencilEntry *SE_mem;
+  StencilEntry SE;
+  int ptype;
+  // Forces some degree of coalesce on the table look ups
+  // Could also use wide load instructions on the data structure
+  uint64_t ssF = Ls * sU;
+  uint64_t sF  = ssF + s;
+
+#ifndef __CUDA_ARCH__
+  for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
+#else
+  int lane = lane_offset; {
+#endif
+    // Non-dag kernel: the opposite-sign projector accompanies each direction
+    SE_mem = st.GetEntry(ptype, Xp, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXm);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Xp);   spReconXm(result, Uchi);
+
+    SE_mem = st.GetEntry(ptype, Yp, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYm);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Yp);   accumReconYm(result, Uchi);
+
+    SE_mem = st.GetEntry(ptype, Zp, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZm);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Zp);   accumReconZm(result, Uchi);
+
+    SE_mem = st.GetEntry(ptype, Tp, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTm);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Tp);   accumReconTm(result, Uchi);
+
+    SE_mem = st.GetEntry(ptype, Xm, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXp);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Xm);   accumReconXp(result, Uchi);
+
+    SE_mem = st.GetEntry(ptype, Ym, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYp);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Ym);   accumReconYp(result, Uchi);
+
+    SE_mem = st.GetEntry(ptype, Zm, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZp);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Zm);   accumReconZp(result, Uchi);
+
+    SE_mem = st.GetEntry(ptype, Tm, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTp);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Tm);   accumReconTp(result, Uchi);
+
+    insertLane (lane,out[sF],result);
+  }
+}
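+
+//////////////////////////////////////////////////////////////
+// No Gpu implementation of the two-flavour G-parity kernels
+// yet; specialise them to an assert so a mistaken attempt to
+// run the Gpu path on a G-parity action fails loudly.
+//////////////////////////////////////////////////////////////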
+
+#define GPU_EMPTY(A)                                                    \
+template <>                                                             \
+accelerator_inline void                                                 \
+WilsonKernels<A>::GpuDhopSite(StencilView &st,                          \
+                              SiteDoubledGaugeField &U,                 \
+                              SiteHalfSpinor *buf, int Ls, int sF,      \
+                              int sU,                                   \
+                              const FermionFieldView &in,               \
+                              FermionFieldView &out) { assert(0);};     \
+template <>                                                             \
+accelerator_inline void                                                 \
+WilsonKernels<A>::GpuDhopSiteDag(StencilView &st,                       \
+                                 SiteDoubledGaugeField &U,              \
+                                 SiteHalfSpinor *buf, int Ls,int sF,    \
+                                 int sU,                                \
+                                 const FermionFieldView &in,            \
+                                 FermionFieldView &out) { assert(0);};
+
+GPU_EMPTY(GparityWilsonImplF);
+GPU_EMPTY(GparityWilsonImplFH);
+GPU_EMPTY(GparityWilsonImplD);
+GPU_EMPTY(GparityWilsonImplDF);
+
+template <class Impl>
+void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
+                                     int Ls, int Nsite, const FermionField &in, FermionField &out,
+                                     int interior,int exterior)
+{
+  auto U_v   =   U.View();
+  auto in_v  =  in.View();
+  auto out_v = out.View();
+  auto st_v  =  st.View();
+  if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) {
+    const uint64_t nsimd = Simd::Nsimd();
+    const uint64_t    NN = Nsite*Ls*nsimd;
+    accelerator_loopN( sss, NN, {
+      uint64_t cur  = sss;
+      //      uint64_t lane = cur % nsimd;
+      cur = cur / nsimd;
+      uint64_t   s  = cur%Ls;
+      //      uint64_t   sF = cur;
+      cur = cur / Ls;
+      uint64_t   sU = cur;
+      WilsonKernels<Impl>::GpuDhopSite(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v);
+    });
+  } else {
+    /*
+    accelerator_loop( ss, U_v, {
+      int sU = ss;
+      int sF = Ls * sU;
+      WilsonKernels<Impl>::DhopSite(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
+    });
+    */
+  }
+}
+
+template <class Impl>
+void WilsonKernels<Impl>::DhopDagKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
+                                        int Ls, int Nsite, const FermionField &in, FermionField &out,
+                                        int interior,int exterior)
+{
+  auto U_v   =   U.View();
+  auto in_v  =  in.View();
+  auto out_v = out.View();
+  auto st_v  =  st.View();
+
+  if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) {
+    const uint64_t nsimd = Simd::Nsimd();
+    const uint64_t    NN = Nsite*Ls*nsimd;
+    accelerator_loopN( sss, NN, {
+      uint64_t cur  = sss;
+      //      uint64_t lane = cur % nsimd;
+      cur = cur / nsimd;
+      uint64_t   s  = cur%Ls;
+      //      uint64_t   sF = cur;
+      cur = cur / Ls;
+      uint64_t   sU = cur;
+      WilsonKernels<Impl>::GpuDhopSiteDag(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v);
+    });
+  } else {
+    //    accelerator_loop( ss, U_v, {
+    //      int sU = ss;
+    //      int sF = Ls * sU;
+    //      WilsonKernels<Impl>::DhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
+    //    });
+  }
+}
+
+NAMESPACE_END(Grid);
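
Note on the flat index used in DhopKernel/DhopDagKernel above: accelerator_loopN
runs over NN = Nsite*Ls*nsimd with the SIMD lane varying fastest, then the fifth
dimension s, then the 4d site sU, i.e. sss = lane + nsimd*(s + Ls*sU).
Consecutive device threads therefore work on consecutive lanes of the same
site, which is what makes the spinor and stencil-table loads coalesce. A
minimal standalone sketch of that decomposition (SiteIndex/decode are
illustrative names, not Grid API):

    #include <cstdint>
    #include <cassert>

    // Decode a flat {lane,s,sU} index in the order DhopKernel uses:
    // lane varies fastest, then s in [0,Ls), then the 4d site sU.
    struct SiteIndex { uint64_t lane, s, sU; };

    inline SiteIndex decode(uint64_t sss, uint64_t nsimd, uint64_t Ls) {
      SiteIndex idx;
      idx.lane = sss % nsimd;  sss /= nsimd;
      idx.s    = sss % Ls;     sss /= Ls;
      idx.sU   = sss;          // whatever remains is the 4d site index
      return idx;
    }

    int main() {
      const uint64_t nsimd = 8, Ls = 16;
      SiteIndex i = decode(3 + nsimd*(5 + Ls*7), nsimd, Ls);
      assert(i.lane == 3 && i.s == 5 && i.sU == 7);
      return 0;
    }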