diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h index e7a20abd..4bb8ce33 100644 --- a/lib/qcd/action/fermion/FermionOperatorImpl.h +++ b/lib/qcd/action/fermion/FermionOperatorImpl.h @@ -215,6 +215,14 @@ public: mult(&phi(), &U(mu), &chi()); } +#ifdef GPU_VEC + static accelerator_inline void copyLinkGpu(int lane, + SiteDoubledGaugeField & UU, + const SiteDoubledGaugeField &U) + { + auto U_l = extractLane(lane,U); + insertLane(lane,UU,U_l); + } static accelerator_inline void multLinkGpu(int lane, typename SiteHalfSpinor::scalar_object &phi, const SiteDoubledGaugeField &U, @@ -224,6 +232,17 @@ public: auto U_l = extractLane(lane,U(mu)); phi() = U_l * chi(); } +#else + static accelerator_inline void multLinkGpu(int lane, + SiteHalfSpinor &phi, + const SiteDoubledGaugeField &U, + const SiteHalfSpinor &chi, + int mu) + { + auto U_l = U(mu); + phi() = U_l * chi(); + } +#endif static accelerator_inline void multLinkProp(SitePropagator &phi, const SiteDoubledGaugeField &U, @@ -364,7 +383,13 @@ public: } mult(&phi(), &UU(), &chi()); } - +#ifdef GPU_VEC + static accelerator_inline void copyLinkGpu(int lane, + SiteDoubledGaugeField & UU, + const SiteDoubledGaugeField &U) + { + UU = U; + } static accelerator_inline void multLinkGpu(int lane, typename SiteHalfSpinor::scalar_object &phi, const SiteDoubledGaugeField &U, @@ -374,6 +399,17 @@ public: auto U_l = U(mu); phi() = U_l * chi(); } +#else + static accelerator_inline void multLinkGpu(int lane, + SiteHalfSpinor &phi, + const SiteDoubledGaugeField &U, + const SiteHalfSpinor &chi, + int mu) + { + auto U_l = U(mu); + phi() = U_l * chi(); + } +#endif static accelerator_inline void multLinkProp(SitePropagator &phi, const SiteDoubledGaugeField &U, diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/lib/qcd/action/fermion/WilsonKernels.cc index 0d01263c..893aee3e 100644 --- a/lib/qcd/action/fermion/WilsonKernels.cc +++ b/lib/qcd/action/fermion/WilsonKernels.cc @@ -104,6 +104,7 @@ accelerator void WilsonKernels::GenericDhopSiteDag(StencilView &st, Double SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) { +#ifndef GRID_NVCC SiteHalfSpinor tmp; SiteHalfSpinor chi; SiteHalfSpinor *chi_p; @@ -121,6 +122,7 @@ accelerator void WilsonKernels::GenericDhopSiteDag(StencilView &st, Double GENERIC_STENCIL_LEG(Zm,spProjZm,accumReconZm); GENERIC_STENCIL_LEG(Tm,spProjTm,accumReconTm); vstream(out[sF], result); +#endif }; template @@ -128,6 +130,7 @@ accelerator void WilsonKernels::GenericDhopSite(StencilView &st, DoubledGa SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) { +#ifndef GRID_NVCC SiteHalfSpinor tmp; SiteHalfSpinor chi; SiteHalfSpinor *chi_p; @@ -145,6 +148,7 @@ accelerator void WilsonKernels::GenericDhopSite(StencilView &st, DoubledGa GENERIC_STENCIL_LEG(Zp,spProjZm,accumReconZm); GENERIC_STENCIL_LEG(Tp,spProjTm,accumReconTm); vstream(out[sF], result); +#endif }; //////////////////////////////////////////////////////////////////// // Interior kernels @@ -154,6 +158,7 @@ accelerator void WilsonKernels::GenericDhopSiteDagInt(StencilView &st, Do SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) { +#ifndef GRID_NVCC SiteHalfSpinor tmp; SiteHalfSpinor chi; SiteHalfSpinor *chi_p; @@ -172,6 +177,7 @@ accelerator void WilsonKernels::GenericDhopSiteDagInt(StencilView &st, Do GENERIC_STENCIL_LEG_INT(Zm,spProjZm,accumReconZm); GENERIC_STENCIL_LEG_INT(Tm,spProjTm,accumReconTm); vstream(out[sF], result); +#endif }; template @@ -179,6 +185,7 @@ accelerator void WilsonKernels::GenericDhopSiteInt(StencilView &st, Doubl SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) { +#ifndef GRID_NVCC SiteHalfSpinor tmp; SiteHalfSpinor chi; SiteHalfSpinor *chi_p; @@ -196,6 +203,7 @@ accelerator void WilsonKernels::GenericDhopSiteInt(StencilView &st, Doubl GENERIC_STENCIL_LEG_INT(Zp,spProjZm,accumReconZm); GENERIC_STENCIL_LEG_INT(Tp,spProjTm,accumReconTm); vstream(out[sF], result); +#endif }; //////////////////////////////////////////////////////////////////// // Exterior kernels @@ -205,6 +213,7 @@ accelerator void WilsonKernels::GenericDhopSiteDagExt(StencilView &st, Do SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) { +#ifndef GRID_NVCC // SiteHalfSpinor tmp; // SiteHalfSpinor chi; SiteHalfSpinor *chi_p; @@ -225,6 +234,7 @@ accelerator void WilsonKernels::GenericDhopSiteDagExt(StencilView &st, Do if ( nmu ) { out[sF] = out[sF] + result; } +#endif }; template @@ -232,6 +242,7 @@ accelerator void WilsonKernels::GenericDhopSiteExt(StencilView &st, Doubl SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) { +#ifndef GRID_NVCC // SiteHalfSpinor tmp; // SiteHalfSpinor chi; SiteHalfSpinor *chi_p; @@ -252,12 +263,14 @@ accelerator void WilsonKernels::GenericDhopSiteExt(StencilView &st, Doubl if ( nmu ) { out[sF] = out[sF] + result; } +#endif }; template accelerator void WilsonKernels::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma) { +#ifndef GRID_NVCC SiteHalfSpinor tmp; SiteHalfSpinor chi; SiteSpinor result; @@ -275,6 +288,7 @@ accelerator void WilsonKernels::DhopDirK( StencilView &st, DoubledGaugeFie GENERIC_DHOPDIR_LEG(Zm,spProjZm,spReconZm); GENERIC_DHOPDIR_LEG(Tm,spProjTm,spReconTm); vstream(out[sF], result); +#endif } /******************************************************************************* diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h index 09c086d8..a0922934 100644 --- a/lib/qcd/action/fermion/WilsonKernels.h +++ b/lib/qcd/action/fermion/WilsonKernels.h @@ -54,59 +54,11 @@ public: static void Dhop(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, int Ls, int Nsite, const FermionField &in, FermionField &out, - int interior=1,int exterior=1) - { - auto U_v = U.View(); - auto in_v = in.View(); - auto out_v = out.View(); - auto st_v = st.View(); - if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) { - const uint64_t nsimd = Simd::Nsimd(); - const uint64_t NN = Nsite*Ls*nsimd; - accelerator_loopN( sss, NN, { - uint64_t cur = sss; - // uint64_t lane = cur % nsimd; - cur = cur / nsimd; - uint64_t sF = cur; cur = cur / Ls; - uint64_t sU = cur; - WilsonKernels::GpuDhopSite(st_v,U_v,buf,sF,sU,in_v,out_v); - }); - } else { - accelerator_loop( ss, U_v, { - int sU = ss; - int sF = Ls * sU; - DhopSite(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v); - }); - } - } + int interior=1,int exterior=1) ; static void DhopDag(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, int Ls, int Nsite, const FermionField &in, FermionField &out, - int interior=1,int exterior=1) - { - auto U_v = U.View(); - auto in_v = in.View(); - auto out_v = out.View(); - auto st_v = st.View(); + int interior=1,int exterior=1) ; - if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) { - const uint64_t nsimd = Simd::Nsimd(); - const uint64_t NN = Nsite*Ls*nsimd; - accelerator_loopN( sss, NN, { - uint64_t cur = sss; - // uint64_t lane = cur % nsimd; - cur = cur / nsimd; - uint64_t sF = cur; cur = cur / Ls; - uint64_t sU = cur; - WilsonKernels::GpuDhopSiteDag(st_v,U_v,buf,sF,sU,in_v,out_v); - }); - } else { - accelerator_loop( ss, U_v, { - int sU = ss; - int sF = Ls * sU; - DhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v); - }); - } - } template static accelerator typename std::enable_if::type @@ -275,11 +227,11 @@ public: private: // Specialised variants - static accelerator void GpuDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionFieldView &in, FermionFieldView &out); + static accelerator_inline void GpuDhopSite(StencilView &st, SiteDoubledGaugeField &U, SiteHalfSpinor * buf, + int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out); - static accelerator void GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionFieldView &in, FermionFieldView &out); + static accelerator_inline void GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, + int Ls,int sF, int sU, const FermionFieldView &in, FermionFieldView &out); static accelerator void GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out); diff --git a/lib/qcd/action/fermion/WilsonKernelsGpu.cc b/lib/qcd/action/fermion/WilsonKernelsGpu.cc index ac0d3ffa..8ac5e55b 100644 --- a/lib/qcd/action/fermion/WilsonKernelsGpu.cc +++ b/lib/qcd/action/fermion/WilsonKernelsGpu.cc @@ -57,30 +57,51 @@ accelerator_inline int get_my_lane_offset(int Nsimd) #endif } - +#ifdef GPU_VEC #define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj) \ synchronise(); \ if (SE->_is_local) { \ int mask = Nsimd >> (ptype + 1); \ int plane= SE->_permute ? (lane ^ mask) : lane; \ - auto in_l = extractLane(plane,in[SE->_offset]); \ + auto in_l = extractLane(plane,in[SE->_offset+s]); \ spProj(chi,in_l); \ } else { \ - chi = extractLane(lane,buf[SE->_offset]); \ + chi = extractLane(lane,buf[SE->_offset+s]); \ } \ synchronise(); +#else +#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj) \ + if (SE->_is_local) { \ + auto in_t = in[SE->_offset+s]; \ + if (SE->_permute) { \ + spProj(tmp, in_t); \ + permute(chi, tmp, ptype); \ + } else { \ + spProj(chi, in_t); \ + } \ + } else { \ + chi = buf[SE->_offset+s]; \ + } \ + synchronise(); +#endif template -accelerator void WilsonKernels::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, - SiteHalfSpinor *buf, int sF, +accelerator_inline void WilsonKernels::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, + SiteHalfSpinor *buf, int Ls, int s, int sU, const FermionFieldView &in, FermionFieldView &out) { +#ifdef GPU_VEC typename SiteHalfSpinor::scalar_object chi; typename SiteHalfSpinor::scalar_object Uchi; typename SiteSpinor::scalar_object result; +#else + SiteHalfSpinor chi; + SiteHalfSpinor Uchi; + SiteHalfSpinor tmp; + SiteSpinor result; +#endif typedef typename SiteSpinor::scalar_type scalar_type; typedef typename SiteSpinor::vector_type vector_type; - constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type); uint64_t lane_offset= get_my_lane_offset(Nsimd); @@ -88,69 +109,80 @@ accelerator void WilsonKernels::GpuDhopSiteDag(StencilView &st, DoubledGau StencilEntry *SE; int ptype; - + uint64_t ssF = Ls * sU; + uint64_t sF = ssF + s; #ifndef __CUDA_ARCH__ for(int lane = lane_offset;lane -accelerator void WilsonKernels::GpuDhopSite(StencilView &st, DoubledGaugeFieldView &U, - SiteHalfSpinor *buf, int sF, +accelerator_inline void WilsonKernels::GpuDhopSite(StencilView &st, SiteDoubledGaugeField &U, + SiteHalfSpinor *buf, int Ls, int s, int sU, const FermionFieldView &in, FermionFieldView &out) { +#ifdef GPU_VEC typename SiteHalfSpinor::scalar_object chi; typename SiteHalfSpinor::scalar_object Uchi; typename SiteSpinor::scalar_object result; +#else + SiteHalfSpinor chi; + SiteHalfSpinor Uchi; + SiteHalfSpinor tmp; + SiteSpinor result; +#endif typedef typename SiteSpinor::scalar_type scalar_type; typedef typename SiteSpinor::vector_type vector_type; - constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type); uint64_t lane_offset= get_my_lane_offset(Nsimd); @@ -158,54 +190,62 @@ accelerator void WilsonKernels::GpuDhopSite(StencilView &st, DoubledGaugeF StencilEntry *SE; int ptype; + // Forces some degree of coalesce on the table look ups + // Could also use wide load instructions on the data structure + uint64_t ssF = Ls * sU; + uint64_t sF = ssF + s; #ifndef __CUDA_ARCH__ for(int lane = lane_offset;lane::GpuDhopSite(StencilView &st, DoubledGaugeF // Template specialise Gparity to empty for now #define GPU_EMPTY(A) \ template <> \ -accelerator void \ +accelerator_inline void \ WilsonKernels::GpuDhopSite(StencilView &st, \ - DoubledGaugeFieldView &U, \ - SiteHalfSpinor *buf, int sF, \ + SiteDoubledGaugeField &U, \ + SiteHalfSpinor *buf, int Ls, int sF, \ int sU, \ const FermionFieldView &in, \ FermionFieldView &out) { assert(0);}; \ template <> \ - accelerator void \ + accelerator_inline void \ WilsonKernels::GpuDhopSiteDag(StencilView &st, \ - DoubledGaugeFieldView &U, \ - SiteHalfSpinor *buf, int sF, \ - int sU, \ - const FermionFieldView &in, \ + DoubledGaugeFieldView &U, \ + SiteHalfSpinor *buf, int Ls,int sF, \ + int sU, \ + const FermionFieldView &in, \ FermionFieldView &out) { assert(0);}; GPU_EMPTY(GparityWilsonImplF); @@ -234,6 +274,67 @@ GPU_EMPTY(GparityWilsonImplFH); GPU_EMPTY(GparityWilsonImplD); GPU_EMPTY(GparityWilsonImplDF); +template +void WilsonKernels::Dhop(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, + int Ls, int Nsite, const FermionField &in, FermionField &out, + int interior,int exterior) +{ + auto U_v = U.View(); + auto in_v = in.View(); + auto out_v = out.View(); + auto st_v = st.View(); + if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) { + const uint64_t nsimd = Simd::Nsimd(); + const uint64_t NN = Nsite*Ls*nsimd; + accelerator_loopN( sss, NN, { + uint64_t cur = sss; + // uint64_t lane = cur % nsimd; + cur = cur / nsimd; + uint64_t s = cur%Ls; + uint64_t sF = cur; cur = cur / Ls; + uint64_t sU = cur; + WilsonKernels::GpuDhopSite(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v); + }); + } else { + accelerator_loop( ss, U_v, { + int sU = ss; + int sF = Ls * sU; + DhopSite(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v); + }); + } + } + template + void WilsonKernels::DhopDag(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, + int Ls, int Nsite, const FermionField &in, FermionField &out, + int interior,int exterior) + { + auto U_v = U.View(); + auto in_v = in.View(); + auto out_v = out.View(); + auto st_v = st.View(); + + if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) { + const uint64_t nsimd = Simd::Nsimd(); + const uint64_t NN = Nsite*Ls*nsimd; + accelerator_loopN( sss, NN, { + uint64_t cur = sss; + // uint64_t lane = cur % nsimd; + cur = cur / nsimd; + uint64_t s = cur%Ls; + uint64_t sF = cur; cur = cur / Ls; + uint64_t sU = cur; + WilsonKernels::GpuDhopSiteDag(st_v,U_v,buf,Ls,s,sU,in_v,out_v); + }); + } else { + accelerator_loop( ss, U_v, { + int sU = ss; + int sF = Ls * sU; + DhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v); + }); + } + } + + /* GPU_EMPTY(DomainWallVec5dImplF); GPU_EMPTY(DomainWallVec5dImplFH); diff --git a/lib/simd/Grid_gpu_vec.h b/lib/simd/Grid_gpu_vec.h index 3850e403..0d30c68d 100644 --- a/lib/simd/Grid_gpu_vec.h +++ b/lib/simd/Grid_gpu_vec.h @@ -36,7 +36,9 @@ Author: Peter Boyle namespace Grid { -#define COALESCE_GRANULARITY (16) // bytes for coalesce granularity of target: Pascal, Volta + //#define COALESCE_GRANULARITY (64) // bytes for coalesce granularity of target: Pascal, Volta + //#define COALESCE_GRANULARITY (32) // bytes for coalesce granularity of target: Pascal, Volta + #define COALESCE_GRANULARITY (16) // bytes for coalesce granularity of target: Pascal, Volta template class GpuComplex {