diff --git a/Grid/qcd/action/fermion/DomainWallVec5dImpl.h b/Grid/qcd/action/fermion/DomainWallVec5dImpl.h index 1ffb01f2..890c680b 100644 --- a/Grid/qcd/action/fermion/DomainWallVec5dImpl.h +++ b/Grid/qcd/action/fermion/DomainWallVec5dImpl.h @@ -102,19 +102,6 @@ public: #endif } - static accelerator_inline void multLinkProp(SitePropagator &phi, - const SiteDoubledGaugeField &U, - const SitePropagator &chi,int mu) - { - SiteGaugeLink UU; - for (int i = 0; i < Dimension; i++) { - for (int j = 0; j < Dimension; j++) { - vsplat(UU()()(i, j), U(mu)()(i, j)); - } - } - mult(&phi(), &UU(), &chi()); - } - inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu) { SiteScalarGaugeField ScalarUmu; diff --git a/Grid/qcd/action/fermion/GparityWilsonImpl.h b/Grid/qcd/action/fermion/GparityWilsonImpl.h index 1e89a68a..a44eefbe 100644 --- a/Grid/qcd/action/fermion/GparityWilsonImpl.h +++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h @@ -73,6 +73,14 @@ public: // provide the multiply by link that is differentiated between Gparity (with // flavour index) and non-Gparity template + static accelerator_inline void multLink(_Spinor &phi, + const SiteDoubledGaugeField &U, + const _Spinor &chi, + int mu) + { + assert(0); + } + template static accelerator_inline void multLink(_Spinor &phi, const SiteDoubledGaugeField &U, const _Spinor &chi, @@ -182,14 +190,6 @@ public: } #endif } - // Fixme: Gparity prop * link - static accelerator_inline void multLinkProp(SitePropagator &phi, - const SiteDoubledGaugeField &U, - const SitePropagator &chi, - int mu) - { - assert(0); - } template static accelerator_inline void loadLinkElement(Simd ®, ref &memory) diff --git a/Grid/qcd/action/fermion/WilsonImpl.h b/Grid/qcd/action/fermion/WilsonImpl.h index 454ae0c7..6a41288d 100644 --- a/Grid/qcd/action/fermion/WilsonImpl.h +++ b/Grid/qcd/action/fermion/WilsonImpl.h @@ -83,21 +83,22 @@ public: static accelerator_inline void multLink(_Spinor &phi, const SiteDoubledGaugeField &U, const _Spinor &chi, - int mu, - StencilEntry *SE, - StencilView &St) + int mu) { auto UU = coalescedRead(U(mu)); mult(&phi(), &UU, &chi()); } - - static accelerator_inline void multLinkProp(SitePropagator &phi, - const SiteDoubledGaugeField &U, - const SitePropagator &chi, - int mu) + template + static accelerator_inline void multLink(_Spinor &phi, + const SiteDoubledGaugeField &U, + const _Spinor &chi, + int mu, + StencilEntry *SE, + StencilView &St) { - mult(&phi(), &U(mu), &chi()); + multLink(phi,U,chi,mu); } + template static accelerator_inline void loadLinkElement(Simd ®, ref &memory) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 0d0181b2..d99653c9 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -364,19 +364,19 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSite); return;} #ifndef GRID_NVCC if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSite); return;} - if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;} + if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); printf("."); return;} #endif } else if( interior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSiteInt); return;} #ifndef GRID_NVCC if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSiteInt); return;} - if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;} + if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); printf("-"); return;} #endif } else if( exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSiteExt); return;} #ifndef GRID_NVCC if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSiteExt); return;} - if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;} + if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); printf("+"); return;} #endif } assert(0 && " Kernel optimisation case not covered "); @@ -440,7 +440,7 @@ void WilsonKernels::ContractConservedCurrentSiteFwd(const SitePropagator & SitePropagator result, tmp; Gamma g5(Gamma::Algebra::Gamma5); - Impl::multLinkProp(tmp, U[sU], q_in_1, mu); + Impl::multLink(tmp, U[sU], q_in_1, mu); result = g5 * adj(q_in_2) * g5 * WilsonCurrentFwd(tmp, mu); @@ -469,7 +469,7 @@ void WilsonKernels::ContractConservedCurrentSiteBwd(const SitePropagator & SitePropagator result, tmp; Gamma g5(Gamma::Algebra::Gamma5); - Impl::multLinkProp(tmp, U[sU], q_in_1, mu + Nd); + Impl::multLink(tmp, U[sU], q_in_1, mu + Nd); result = g5 * adj(q_in_2) * g5 * WilsonCurrentBwd(tmp, mu); if (switch_sign) { @@ -496,7 +496,7 @@ void WilsonKernels::SeqConservedCurrentSiteFwd(const SitePropagator &q_in, { SitePropagator result; - Impl::multLinkProp(result, U[sU], q_in, mu); + Impl::multLink(result, U[sU], q_in, mu); result = WilsonCurrentFwd(result, mu); // Zero any unwanted timeslice entries. @@ -525,7 +525,7 @@ void WilsonKernels::SeqConservedCurrentSiteBwd(const SitePropagator &q_in, bool switch_sign) { SitePropagator result; - Impl::multLinkProp(result, U[sU], q_in, mu + Nd); + Impl::multLink(result, U[sU], q_in, mu + Nd); result = WilsonCurrentBwd(result, mu); // Zero any unwanted timeslice entries. diff --git a/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiationGparity.cc.master b/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiationGparity.cc.master index 7fdd3b71..75f143cb 100644 --- a/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiationGparity.cc.master +++ b/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiationGparity.cc.master @@ -66,6 +66,7 @@ void WilsonKernels::ContractConservedCurrentSiteBwd( const SiteP HAND_SPECIALISE_GPARITY(IMPLEMENTATION); + template class WilsonKernels; diff --git a/TODO b/TODO index df79b37a..ca20470c 100644 --- a/TODO +++ b/TODO @@ -6,17 +6,20 @@ GPU branch code item work list * 0) Single GPU - 128 bit integer table load in GPU code. - coalescedRead <- threadIdx.x +- Gianluca's changes to Cayley into gpu-port +- GPU accelerate EOFA - Clean up PRAGMAS, and SIMT_loop thread_loop interface revisit. for_n for +- Staggered kernels -> GPU coalesced loop +- Staggered kernels inline for GPU -- DONE -* 2) 5D terms +* 2) 5D terms & Gianluca - Cayley coefficients -> GPU retention or prefetch - - Gianluca's changes to Cayley into gpu-port - - GPU accelerate EOFA - Mobius kernel fusion. -- Gianluca? + - Make GPU offload reductions optionally deterministic -- Gianluca * 3) Comms/NVlink - OpenMP tasks to run comms threads. @@ -30,12 +33,9 @@ GPU branch code item work list * 5) Misc -- SIMD dirs in stencil - Conserved current clean up. - multLinkProp eliminate -- Staggered kernels -> GPU coalesced loop -- Staggered kernels inline for GPU -- DONE -- Make GPU offload reductions optionally deterministic -- Gianluca +- SIMD dirs in stencil 7) Accelerate the cshift