mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
Simplify Impl
This commit is contained in:
parent
d6c0e0756d
commit
36f06555a2
@ -102,19 +102,6 @@ public:
|
||||
#endif
|
||||
}
|
||||
|
||||
static accelerator_inline void multLinkProp(SitePropagator &phi,
|
||||
const SiteDoubledGaugeField &U,
|
||||
const SitePropagator &chi,int mu)
|
||||
{
|
||||
SiteGaugeLink UU;
|
||||
for (int i = 0; i < Dimension; i++) {
|
||||
for (int j = 0; j < Dimension; j++) {
|
||||
vsplat(UU()()(i, j), U(mu)()(i, j));
|
||||
}
|
||||
}
|
||||
mult(&phi(), &UU(), &chi());
|
||||
}
|
||||
|
||||
inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu)
|
||||
{
|
||||
SiteScalarGaugeField ScalarUmu;
|
||||
|
@ -73,6 +73,14 @@ public:
|
||||
// provide the multiply by link that is differentiated between Gparity (with
|
||||
// flavour index) and non-Gparity
|
||||
template<class _Spinor>
|
||||
static accelerator_inline void multLink(_Spinor &phi,
|
||||
const SiteDoubledGaugeField &U,
|
||||
const _Spinor &chi,
|
||||
int mu)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
template<class _Spinor>
|
||||
static accelerator_inline void multLink(_Spinor &phi,
|
||||
const SiteDoubledGaugeField &U,
|
||||
const _Spinor &chi,
|
||||
@ -182,14 +190,6 @@ public:
|
||||
}
|
||||
#endif
|
||||
}
|
||||
// Fixme: Gparity prop * link
|
||||
static accelerator_inline void multLinkProp(SitePropagator &phi,
|
||||
const SiteDoubledGaugeField &U,
|
||||
const SitePropagator &chi,
|
||||
int mu)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
template <class ref>
|
||||
static accelerator_inline void loadLinkElement(Simd ®, ref &memory)
|
||||
|
@ -83,22 +83,23 @@ public:
|
||||
static accelerator_inline void multLink(_Spinor &phi,
|
||||
const SiteDoubledGaugeField &U,
|
||||
const _Spinor &chi,
|
||||
int mu,
|
||||
StencilEntry *SE,
|
||||
StencilView &St)
|
||||
int mu)
|
||||
{
|
||||
auto UU = coalescedRead(U(mu));
|
||||
mult(&phi(), &UU, &chi());
|
||||
}
|
||||
|
||||
static accelerator_inline void multLinkProp(SitePropagator &phi,
|
||||
template<class _Spinor>
|
||||
static accelerator_inline void multLink(_Spinor &phi,
|
||||
const SiteDoubledGaugeField &U,
|
||||
const SitePropagator &chi,
|
||||
int mu)
|
||||
const _Spinor &chi,
|
||||
int mu,
|
||||
StencilEntry *SE,
|
||||
StencilView &St)
|
||||
{
|
||||
mult(&phi(), &U(mu), &chi());
|
||||
multLink(phi,U,chi,mu);
|
||||
}
|
||||
|
||||
|
||||
template <class ref>
|
||||
static accelerator_inline void loadLinkElement(Simd ®, ref &memory)
|
||||
{
|
||||
|
@ -364,19 +364,19 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSite); return;}
|
||||
#ifndef GRID_NVCC
|
||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSite); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); printf("."); return;}
|
||||
#endif
|
||||
} else if( interior ) {
|
||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSiteInt); return;}
|
||||
#ifndef GRID_NVCC
|
||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSiteInt); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); printf("-"); return;}
|
||||
#endif
|
||||
} else if( exterior ) {
|
||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSiteExt); return;}
|
||||
#ifndef GRID_NVCC
|
||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSiteExt); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); printf("+"); return;}
|
||||
#endif
|
||||
}
|
||||
assert(0 && " Kernel optimisation case not covered ");
|
||||
@ -440,7 +440,7 @@ void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd(const SitePropagator &
|
||||
SitePropagator result, tmp;
|
||||
Gamma g5(Gamma::Algebra::Gamma5);
|
||||
|
||||
Impl::multLinkProp(tmp, U[sU], q_in_1, mu);
|
||||
Impl::multLink(tmp, U[sU], q_in_1, mu);
|
||||
|
||||
result = g5 * adj(q_in_2) * g5 * WilsonCurrentFwd(tmp, mu);
|
||||
|
||||
@ -469,7 +469,7 @@ void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd(const SitePropagator &
|
||||
SitePropagator result, tmp;
|
||||
Gamma g5(Gamma::Algebra::Gamma5);
|
||||
|
||||
Impl::multLinkProp(tmp, U[sU], q_in_1, mu + Nd);
|
||||
Impl::multLink(tmp, U[sU], q_in_1, mu + Nd);
|
||||
|
||||
result = g5 * adj(q_in_2) * g5 * WilsonCurrentBwd(tmp, mu);
|
||||
if (switch_sign) {
|
||||
@ -496,7 +496,7 @@ void WilsonKernels<Impl>::SeqConservedCurrentSiteFwd(const SitePropagator &q_in,
|
||||
{
|
||||
SitePropagator result;
|
||||
|
||||
Impl::multLinkProp(result, U[sU], q_in, mu);
|
||||
Impl::multLink(result, U[sU], q_in, mu);
|
||||
result = WilsonCurrentFwd(result, mu);
|
||||
|
||||
// Zero any unwanted timeslice entries.
|
||||
@ -525,7 +525,7 @@ void WilsonKernels<Impl>::SeqConservedCurrentSiteBwd(const SitePropagator &q_in,
|
||||
bool switch_sign)
|
||||
{
|
||||
SitePropagator result;
|
||||
Impl::multLinkProp(result, U[sU], q_in, mu + Nd);
|
||||
Impl::multLink(result, U[sU], q_in, mu + Nd);
|
||||
result = WilsonCurrentBwd(result, mu);
|
||||
|
||||
// Zero any unwanted timeslice entries.
|
||||
|
@ -66,6 +66,7 @@ void WilsonKernels<IMPLEMENTATION>::ContractConservedCurrentSiteBwd( const SiteP
|
||||
|
||||
HAND_SPECIALISE_GPARITY(IMPLEMENTATION);
|
||||
|
||||
|
||||
template class WilsonKernels<IMPLEMENTATION>;
|
||||
|
||||
|
||||
|
14
TODO
14
TODO
@ -6,17 +6,20 @@ GPU branch code item work list
|
||||
* 0) Single GPU
|
||||
- 128 bit integer table load in GPU code.
|
||||
- coalescedRead <- threadIdx.x
|
||||
- Gianluca's changes to Cayley into gpu-port
|
||||
- GPU accelerate EOFA
|
||||
- Clean up PRAGMAS, and SIMT_loop
|
||||
thread_loop interface revisit.
|
||||
for_n
|
||||
for
|
||||
- Staggered kernels -> GPU coalesced loop
|
||||
- Staggered kernels inline for GPU -- DONE
|
||||
|
||||
|
||||
* 2) 5D terms
|
||||
* 2) 5D terms & Gianluca
|
||||
- Cayley coefficients -> GPU retention or prefetch
|
||||
- Gianluca's changes to Cayley into gpu-port
|
||||
- GPU accelerate EOFA
|
||||
- Mobius kernel fusion. -- Gianluca?
|
||||
- Make GPU offload reductions optionally deterministic -- Gianluca
|
||||
|
||||
* 3) Comms/NVlink
|
||||
- OpenMP tasks to run comms threads.
|
||||
@ -30,12 +33,9 @@ GPU branch code item work list
|
||||
|
||||
* 5) Misc
|
||||
|
||||
- SIMD dirs in stencil
|
||||
- Conserved current clean up.
|
||||
- multLinkProp eliminate
|
||||
- Staggered kernels -> GPU coalesced loop
|
||||
- Staggered kernels inline for GPU -- DONE
|
||||
- Make GPU offload reductions optionally deterministic -- Gianluca
|
||||
- SIMD dirs in stencil
|
||||
|
||||
7) Accelerate the cshift
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user