From 3e41b1055cbd624b1ed1120a06ce288a4876adc1 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sun, 9 Jun 2019 11:20:01 +0100 Subject: [PATCH] Remove Gpu only kernels. --- Grid/qcd/action/fermion/DomainWallVec5dImpl.h | 41 --- Grid/qcd/action/fermion/WilsonImpl.h | 10 - Grid/qcd/action/fermion/WilsonKernels.h | 22 +- .../WilsonKernelsGpuImplementation.h | 289 +++--------------- .../WilsonKernelsImplementation.h | 13 + Grid/util/Init.cc | 3 - benchmarks/Benchmark_dwf.cc | 2 - configure.ac | 5 +- 8 files changed, 64 insertions(+), 321 deletions(-) diff --git a/Grid/qcd/action/fermion/DomainWallVec5dImpl.h b/Grid/qcd/action/fermion/DomainWallVec5dImpl.h index e5b25557..1ffb01f2 100644 --- a/Grid/qcd/action/fermion/DomainWallVec5dImpl.h +++ b/Grid/qcd/action/fermion/DomainWallVec5dImpl.h @@ -101,47 +101,6 @@ public: mult(&phi(), &UU(), &chi()); #endif } -#ifdef GPU_VEC - static accelerator_inline void copyLinkGpu(int lane, - SiteDoubledGaugeField & UU, - const SiteDoubledGaugeField &U) - { - UU = U; - } - static accelerator_inline void multLinkGpu(int lane, - typename SiteHalfSpinor::scalar_object &phi, - const SiteDoubledGaugeField &U, - const typename SiteHalfSpinor::scalar_object &chi, - int mu) - { -#if 1 - typedef typename ExtractTypeMap::extract_type extract_type; - - SiteScalarGaugeLink U_l; - - extract_type * U_mem = (extract_type *) &U(mu); - extract_type * U_stack= (extract_type *) &U_l; - - for(int w=0;w<(sizeof(U_l)/sizeof(extract_type)) ;w++) U_stack[w] = U_mem[w]; - - phi() = U_l() * chi(); -#else - auto U_l = U(mu); - - phi() = U_l * chi(); -#endif - } -#else - static accelerator_inline void multLinkGpu(int lane, - SiteHalfSpinor &phi, - const SiteDoubledGaugeField &U, - const SiteHalfSpinor &chi, - int mu) - { - auto U_l = U(mu); - phi() = U_l * chi(); - } -#endif static accelerator_inline void multLinkProp(SitePropagator &phi, const SiteDoubledGaugeField &U, diff --git a/Grid/qcd/action/fermion/WilsonImpl.h b/Grid/qcd/action/fermion/WilsonImpl.h index ff40557f..454ae0c7 100644 --- a/Grid/qcd/action/fermion/WilsonImpl.h +++ b/Grid/qcd/action/fermion/WilsonImpl.h @@ -90,16 +90,6 @@ public: auto UU = coalescedRead(U(mu)); mult(&phi(), &UU, &chi()); } - - static accelerator_inline void multLinkGpu(int lane, - typename SiteHalfSpinor::scalar_object &phi, - const SiteDoubledGaugeField &U, - const typename SiteHalfSpinor::scalar_object &chi, - int mu) - { - auto U_l = extractLane(lane,U(mu)); - phi() = U_l * chi(); - } static accelerator_inline void multLinkProp(SitePropagator &phi, const SiteDoubledGaugeField &U, diff --git a/Grid/qcd/action/fermion/WilsonKernels.h b/Grid/qcd/action/fermion/WilsonKernels.h index 43a382d1..a3494347 100644 --- a/Grid/qcd/action/fermion/WilsonKernels.h +++ b/Grid/qcd/action/fermion/WilsonKernels.h @@ -38,7 +38,7 @@ NAMESPACE_BEGIN(Grid); //////////////////////////////////////////////////////////////////////////////////////////////////////////////// class WilsonKernelsStatic { public: - enum { OptGeneric, OptHandUnroll, OptInlineAsm, OptGpu }; + enum { OptGeneric, OptHandUnroll, OptInlineAsm }; enum { CommsAndCompute, CommsThenCompute }; static int Opt; static int Comms; @@ -100,12 +100,6 @@ public: private: // Specialised variants - static accelerator void GpuDhopSite(StencilView &st, SiteDoubledGaugeField &U, SiteHalfSpinor * buf, - int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out); - - static accelerator void GpuDhopSiteDag(StencilView &st, SiteDoubledGaugeField &U, SiteHalfSpinor * buf, - int Ls,int sF, int sU, const FermionFieldView &in, FermionFieldView &out); - static accelerator void GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out); @@ -143,25 +137,23 @@ private: int sF, int sU, int Ls, int Nsite, const FermionFieldView &in, FermionFieldView &out); // Keep Hand unrolled temporarily -#if 1 static accelerator void HandDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionFieldView &in, FermionFieldView &out); + int sF, int sU, const FermionFieldView &in, FermionFieldView &out); static accelerator void HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionFieldView &in, FermionFieldView &out); + int sF, int sU, const FermionFieldView &in, FermionFieldView &out); static accelerator void HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionFieldView &in, FermionFieldView &out); + int sF, int sU, const FermionFieldView &in, FermionFieldView &out); static accelerator void HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionFieldView &in, FermionFieldView &out); + int sF, int sU, const FermionFieldView &in, FermionFieldView &out); static accelerator void HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionFieldView &in, FermionFieldView &out); + int sF, int sU, const FermionFieldView &in, FermionFieldView &out); static accelerator void HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionFieldView &in, FermionFieldView &out); -#endif + int sF, int sU, const FermionFieldView &in, FermionFieldView &out); public: WilsonKernels(const ImplParams &p = ImplParams()) : Base(p){}; }; diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsGpuImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsGpuImplementation.h index 6c7ab36d..ee501341 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsGpuImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsGpuImplementation.h @@ -33,233 +33,6 @@ directory NAMESPACE_BEGIN(Grid); -////////////////////////////////////////////////////////////// -// Gpu implementation; thread loop is implicit ; move to header -////////////////////////////////////////////////////////////// -accelerator_inline int get_my_lanes(int Nsimd) -{ -#ifdef __CUDA_ARCH__ - return 1; -#else - return Nsimd; -#endif -} -accelerator_inline int get_my_lane_offset(int Nsimd) -{ -#ifdef __CUDA_ARCH__ - return ( (threadIdx.x) % Nsimd); -#else - return 0; -#endif -} - -accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) -{ -#ifdef __CUDA_ARCH__ - static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size"); - uint4 * mem_pun = (uint4 *)mem; // force 128 bit loads - uint4 * chip_pun = (uint4 *)&chip; - * chip_pun = * mem_pun; -#else - chip = *mem; -#endif - return; -} - -#if 1 -#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj) \ - if (SE._is_local) { \ - int mask = Nsimd >> (ptype + 1); \ - int plane= SE._permute ? (lane ^ mask) : lane; \ - auto in_l = extractLane(plane,in[SE._offset+s]); \ - spProj(chi,in_l); \ - } else { \ - chi = extractLane(lane,buf[SE._offset+s]); \ - } \ - synchronise(); -#else -#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj) \ - if (SE._is_local) { \ - auto in_t = in[SE._offset+s]; \ - decltype(chi) tmp; \ - if (SE._permute) { \ - spProj(tmp, in_t); \ - permute(chi, tmp, ptype); \ - } else { \ - spProj(chi, in_t); \ - } \ - } else { \ - chi = (buf[SE._offset+s]; \ - } \ - synchronise(); -#endif - -template -accelerator_inline void WilsonKernels::GpuDhopSiteDag(StencilView &st, SiteDoubledGaugeField &U, - SiteHalfSpinor *buf, int Ls, int s, - int sU, const FermionFieldView &in, FermionFieldView &out) -{ - typename SiteHalfSpinor::scalar_object chi; - typename SiteHalfSpinor::scalar_object Uchi; - typename SiteSpinor::scalar_object result; - - typedef typename SiteSpinor::scalar_type scalar_type; - typedef typename SiteSpinor::vector_type vector_type; - constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type); - - uint64_t lane_offset= get_my_lane_offset(Nsimd); - uint64_t lanes = get_my_lanes(Nsimd); - - StencilEntry *SE_mem; - StencilEntry SE; - - int ptype; - uint64_t ssF = Ls * sU; - uint64_t sF = ssF + s; -#ifndef __CUDA_ARCH__ - for(int lane = lane_offset;lane -accelerator_inline void WilsonKernels::GpuDhopSite(StencilView &st, SiteDoubledGaugeField &U, - SiteHalfSpinor *buf, int Ls, int s, - int sU, const FermionFieldView &in, FermionFieldView &out) -{ - typename SiteHalfSpinor::scalar_object chi; - typename SiteHalfSpinor::scalar_object Uchi; - typename SiteSpinor::scalar_object result; - - typedef typename SiteSpinor::scalar_type scalar_type; - typedef typename SiteSpinor::vector_type vector_type; - constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type); - - uint64_t lane_offset= get_my_lane_offset(Nsimd); - uint64_t lanes = get_my_lanes(Nsimd); - - StencilEntry *SE_mem; - StencilEntry SE; - int ptype; - // Forces some degree of coalesce on the table look ups - // Could also use wide load instructions on the data structure - uint64_t ssF = Ls * sU; - uint64_t sF = ssF + s; - -#ifndef __CUDA_ARCH__ - for(int lane = lane_offset;lane \ -accelerator_inline void \ -WilsonKernels::GpuDhopSite(StencilView &st, \ - SiteDoubledGaugeField &U, \ - SiteHalfSpinor *buf, int Ls, int sF, \ - int sU, \ - const FermionFieldView &in, \ - FermionFieldView &out) { assert(0);}; \ -template <> \ -accelerator_inline void \ -WilsonKernels::GpuDhopSiteDag(StencilView &st, \ - SiteDoubledGaugeField &U, \ - SiteHalfSpinor *buf, int Ls,int sF, \ - int sU, \ - const FermionFieldView &in, \ - FermionFieldView &out) { assert(0);}; - -GPU_EMPTY(GparityWilsonImplF); -GPU_EMPTY(GparityWilsonImplFH); -GPU_EMPTY(GparityWilsonImplD); -GPU_EMPTY(GparityWilsonImplDF); #define KERNEL_CALL(A) \ const uint64_t nsimd = Simd::Nsimd(); \ @@ -282,6 +55,13 @@ GPU_EMPTY(GparityWilsonImplDF); WilsonKernels::A(st_v,U_v,buf,sF,sU,in_v,out_v); \ }); +#define ASM_CALL(A) \ + SIMT_loop( ss, Nsite, { \ + int sU = ss; \ + int sF = ss*Ls; \ + WilsonKernels::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \ + }); + template void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, int Ls, int Nsite, const FermionField &in, FermionField &out, @@ -293,17 +73,25 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField auto st_v = st.View(); if( interior && exterior ) { - if (Opt == WilsonKernelsStatic::OptGpu) { - KERNEL_CALL(GpuDhopSite); - } else { - HOST_CALL(GenericDhopSite); - } + if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSite); return;} +#ifndef GRID_NVCC + if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSite); return;} + if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;} +#endif } else if( interior ) { - HOST_CALL(GenericDhopSiteInt); + if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSiteInt); return;} +#ifndef GRID_NVCC + if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSiteInt); return;} + if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;} +#endif } else if( exterior ) { - HOST_CALL(GenericDhopSiteExt); + if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSiteExt); return;} +#ifndef GRID_NVCC + if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSiteExt); return;} + if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;} +#endif } - + assert(0 && " Kernel optimisation case not covered "); } template void WilsonKernels::DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, @@ -315,17 +103,26 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField auto out_v = out.View(); auto st_v = st.View(); - if( interior && exterior ) { - if (Opt == WilsonKernelsStatic::OptGpu) { - KERNEL_CALL(GpuDhopSiteDag); - } else { - HOST_CALL(GenericDhopSiteDag); - } - } else if( interior ) { - HOST_CALL(GenericDhopSiteDagInt); - } else if( exterior ) { - HOST_CALL(GenericDhopSiteDagExt); - } + if( interior && exterior ) { + if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSiteDag); return;} +#ifndef GRID_NVCC + if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSiteDag); return;} + if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;} +#endif + } else if( interior ) { + if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSiteDagInt); return;} +#ifndef GRID_NVCC + if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSiteDagInt); return;} + if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;} +#endif + } else if( exterior ) { + if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSiteDagExt); return;} +#ifndef GRID_NVCC + if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSiteDagExt); return;} + if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;} +#endif + } + assert(0 && " Kernel optimisation case not covered "); } NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index c31a5bdf..bd8b31ac 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -38,6 +38,19 @@ NAMESPACE_BEGIN(Grid); //////////////////////////////////////////// // Generic implementation; move to different file? //////////////////////////////////////////// + +accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) +{ +#ifdef __CUDA_ARCH__ + static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size"); + uint4 * mem_pun = (uint4 *)mem; // force 128 bit loads + uint4 * chip_pun = (uint4 *)&chip; + * chip_pun = * mem_pun; +#else + chip = *mem; +#endif + return; +} #define GENERIC_STENCIL_LEG(Dir,spProj,Recon) \ SE = st.GetEntry(ptype, Dir, sF); \ diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 3aac20a0..550a3e20 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -433,9 +433,6 @@ void Grid_init(int *argc,char ***argv) WilsonKernelsStatic::Opt=WilsonKernelsStatic::OptHandUnroll; StaggeredKernelsStatic::Opt=StaggeredKernelsStatic::OptHandUnroll; } - if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-gpu") ){ - WilsonKernelsStatic::Opt=WilsonKernelsStatic::OptGpu; - } if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-asm") ){ WilsonKernelsStatic::Opt=WilsonKernelsStatic::OptInlineAsm; StaggeredKernelsStatic::Opt=StaggeredKernelsStatic::OptInlineAsm; diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index 207a17f7..4d6b026f 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -177,7 +177,6 @@ int main (int argc, char ** argv) if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <