1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-09 23:45:36 +00:00

Simplify Impl

This commit is contained in:
Peter Boyle 2019-06-09 22:26:27 +01:00
parent d6c0e0756d
commit 36f06555a2
6 changed files with 33 additions and 44 deletions

View File

@ -102,19 +102,6 @@ public:
#endif #endif
} }
static accelerator_inline void multLinkProp(SitePropagator &phi,
const SiteDoubledGaugeField &U,
const SitePropagator &chi,int mu)
{
SiteGaugeLink UU;
for (int i = 0; i < Dimension; i++) {
for (int j = 0; j < Dimension; j++) {
vsplat(UU()()(i, j), U(mu)()(i, j));
}
}
mult(&phi(), &UU(), &chi());
}
inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu) inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu)
{ {
SiteScalarGaugeField ScalarUmu; SiteScalarGaugeField ScalarUmu;

View File

@ -73,6 +73,14 @@ public:
// provide the multiply by link that is differentiated between Gparity (with // provide the multiply by link that is differentiated between Gparity (with
// flavour index) and non-Gparity // flavour index) and non-Gparity
template<class _Spinor> template<class _Spinor>
static accelerator_inline void multLink(_Spinor &phi,
const SiteDoubledGaugeField &U,
const _Spinor &chi,
int mu)
{
assert(0);
}
template<class _Spinor>
static accelerator_inline void multLink(_Spinor &phi, static accelerator_inline void multLink(_Spinor &phi,
const SiteDoubledGaugeField &U, const SiteDoubledGaugeField &U,
const _Spinor &chi, const _Spinor &chi,
@ -182,14 +190,6 @@ public:
} }
#endif #endif
} }
// Fixme: Gparity prop * link
static accelerator_inline void multLinkProp(SitePropagator &phi,
const SiteDoubledGaugeField &U,
const SitePropagator &chi,
int mu)
{
assert(0);
}
template <class ref> template <class ref>
static accelerator_inline void loadLinkElement(Simd &reg, ref &memory) static accelerator_inline void loadLinkElement(Simd &reg, ref &memory)

View File

@ -83,22 +83,23 @@ public:
static accelerator_inline void multLink(_Spinor &phi, static accelerator_inline void multLink(_Spinor &phi,
const SiteDoubledGaugeField &U, const SiteDoubledGaugeField &U,
const _Spinor &chi, const _Spinor &chi,
int mu, int mu)
StencilEntry *SE,
StencilView &St)
{ {
auto UU = coalescedRead(U(mu)); auto UU = coalescedRead(U(mu));
mult(&phi(), &UU, &chi()); mult(&phi(), &UU, &chi());
} }
template<class _Spinor>
static accelerator_inline void multLinkProp(SitePropagator &phi, static accelerator_inline void multLink(_Spinor &phi,
const SiteDoubledGaugeField &U, const SiteDoubledGaugeField &U,
const SitePropagator &chi, const _Spinor &chi,
int mu) int mu,
StencilEntry *SE,
StencilView &St)
{ {
mult(&phi(), &U(mu), &chi()); multLink(phi,U,chi,mu);
} }
template <class ref> template <class ref>
static accelerator_inline void loadLinkElement(Simd &reg, ref &memory) static accelerator_inline void loadLinkElement(Simd &reg, ref &memory)
{ {

View File

@ -364,19 +364,19 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSite); return;} if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSite); return;}
#ifndef GRID_NVCC #ifndef GRID_NVCC
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSite); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSite); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); printf("."); return;}
#endif #endif
} else if( interior ) { } else if( interior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSiteInt); return;} if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSiteInt); return;}
#ifndef GRID_NVCC #ifndef GRID_NVCC
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSiteInt); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSiteInt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); printf("-"); return;}
#endif #endif
} else if( exterior ) { } else if( exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSiteExt); return;} if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSiteExt); return;}
#ifndef GRID_NVCC #ifndef GRID_NVCC
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSiteExt); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSiteExt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); printf("+"); return;}
#endif #endif
} }
assert(0 && " Kernel optimisation case not covered "); assert(0 && " Kernel optimisation case not covered ");
@ -440,7 +440,7 @@ void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd(const SitePropagator &
SitePropagator result, tmp; SitePropagator result, tmp;
Gamma g5(Gamma::Algebra::Gamma5); Gamma g5(Gamma::Algebra::Gamma5);
Impl::multLinkProp(tmp, U[sU], q_in_1, mu); Impl::multLink(tmp, U[sU], q_in_1, mu);
result = g5 * adj(q_in_2) * g5 * WilsonCurrentFwd(tmp, mu); result = g5 * adj(q_in_2) * g5 * WilsonCurrentFwd(tmp, mu);
@ -469,7 +469,7 @@ void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd(const SitePropagator &
SitePropagator result, tmp; SitePropagator result, tmp;
Gamma g5(Gamma::Algebra::Gamma5); Gamma g5(Gamma::Algebra::Gamma5);
Impl::multLinkProp(tmp, U[sU], q_in_1, mu + Nd); Impl::multLink(tmp, U[sU], q_in_1, mu + Nd);
result = g5 * adj(q_in_2) * g5 * WilsonCurrentBwd(tmp, mu); result = g5 * adj(q_in_2) * g5 * WilsonCurrentBwd(tmp, mu);
if (switch_sign) { if (switch_sign) {
@ -496,7 +496,7 @@ void WilsonKernels<Impl>::SeqConservedCurrentSiteFwd(const SitePropagator &q_in,
{ {
SitePropagator result; SitePropagator result;
Impl::multLinkProp(result, U[sU], q_in, mu); Impl::multLink(result, U[sU], q_in, mu);
result = WilsonCurrentFwd(result, mu); result = WilsonCurrentFwd(result, mu);
// Zero any unwanted timeslice entries. // Zero any unwanted timeslice entries.
@ -525,7 +525,7 @@ void WilsonKernels<Impl>::SeqConservedCurrentSiteBwd(const SitePropagator &q_in,
bool switch_sign) bool switch_sign)
{ {
SitePropagator result; SitePropagator result;
Impl::multLinkProp(result, U[sU], q_in, mu + Nd); Impl::multLink(result, U[sU], q_in, mu + Nd);
result = WilsonCurrentBwd(result, mu); result = WilsonCurrentBwd(result, mu);
// Zero any unwanted timeslice entries. // Zero any unwanted timeslice entries.

View File

@ -66,6 +66,7 @@ void WilsonKernels<IMPLEMENTATION>::ContractConservedCurrentSiteBwd( const SiteP
HAND_SPECIALISE_GPARITY(IMPLEMENTATION); HAND_SPECIALISE_GPARITY(IMPLEMENTATION);
template class WilsonKernels<IMPLEMENTATION>; template class WilsonKernels<IMPLEMENTATION>;

14
TODO
View File

@ -6,17 +6,20 @@ GPU branch code item work list
* 0) Single GPU * 0) Single GPU
- 128 bit integer table load in GPU code. - 128 bit integer table load in GPU code.
- coalescedRead <- threadIdx.x - coalescedRead <- threadIdx.x
- Gianluca's changes to Cayley into gpu-port
- GPU accelerate EOFA
- Clean up PRAGMAS, and SIMT_loop - Clean up PRAGMAS, and SIMT_loop
thread_loop interface revisit. thread_loop interface revisit.
for_n for_n
for for
- Staggered kernels -> GPU coalesced loop
- Staggered kernels inline for GPU -- DONE
* 2) 5D terms * 2) 5D terms & Gianluca
- Cayley coefficients -> GPU retention or prefetch - Cayley coefficients -> GPU retention or prefetch
- Gianluca's changes to Cayley into gpu-port
- GPU accelerate EOFA
- Mobius kernel fusion. -- Gianluca? - Mobius kernel fusion. -- Gianluca?
- Make GPU offload reductions optionally deterministic -- Gianluca
* 3) Comms/NVlink * 3) Comms/NVlink
- OpenMP tasks to run comms threads. - OpenMP tasks to run comms threads.
@ -30,12 +33,9 @@ GPU branch code item work list
* 5) Misc * 5) Misc
- SIMD dirs in stencil
- Conserved current clean up. - Conserved current clean up.
- multLinkProp eliminate - multLinkProp eliminate
- Staggered kernels -> GPU coalesced loop - SIMD dirs in stencil
- Staggered kernels inline for GPU -- DONE
- Make GPU offload reductions optionally deterministic -- Gianluca
7) Accelerate the cshift 7) Accelerate the cshift