mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
Simplify Impl
This commit is contained in:
parent
d6c0e0756d
commit
36f06555a2
@ -102,19 +102,6 @@ public:
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static accelerator_inline void multLinkProp(SitePropagator &phi,
|
|
||||||
const SiteDoubledGaugeField &U,
|
|
||||||
const SitePropagator &chi,int mu)
|
|
||||||
{
|
|
||||||
SiteGaugeLink UU;
|
|
||||||
for (int i = 0; i < Dimension; i++) {
|
|
||||||
for (int j = 0; j < Dimension; j++) {
|
|
||||||
vsplat(UU()()(i, j), U(mu)()(i, j));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
mult(&phi(), &UU(), &chi());
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu)
|
inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu)
|
||||||
{
|
{
|
||||||
SiteScalarGaugeField ScalarUmu;
|
SiteScalarGaugeField ScalarUmu;
|
||||||
|
@ -73,6 +73,14 @@ public:
|
|||||||
// provide the multiply by link that is differentiated between Gparity (with
|
// provide the multiply by link that is differentiated between Gparity (with
|
||||||
// flavour index) and non-Gparity
|
// flavour index) and non-Gparity
|
||||||
template<class _Spinor>
|
template<class _Spinor>
|
||||||
|
static accelerator_inline void multLink(_Spinor &phi,
|
||||||
|
const SiteDoubledGaugeField &U,
|
||||||
|
const _Spinor &chi,
|
||||||
|
int mu)
|
||||||
|
{
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
template<class _Spinor>
|
||||||
static accelerator_inline void multLink(_Spinor &phi,
|
static accelerator_inline void multLink(_Spinor &phi,
|
||||||
const SiteDoubledGaugeField &U,
|
const SiteDoubledGaugeField &U,
|
||||||
const _Spinor &chi,
|
const _Spinor &chi,
|
||||||
@ -182,14 +190,6 @@ public:
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
// Fixme: Gparity prop * link
|
|
||||||
static accelerator_inline void multLinkProp(SitePropagator &phi,
|
|
||||||
const SiteDoubledGaugeField &U,
|
|
||||||
const SitePropagator &chi,
|
|
||||||
int mu)
|
|
||||||
{
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class ref>
|
template <class ref>
|
||||||
static accelerator_inline void loadLinkElement(Simd ®, ref &memory)
|
static accelerator_inline void loadLinkElement(Simd ®, ref &memory)
|
||||||
|
@ -83,21 +83,22 @@ public:
|
|||||||
static accelerator_inline void multLink(_Spinor &phi,
|
static accelerator_inline void multLink(_Spinor &phi,
|
||||||
const SiteDoubledGaugeField &U,
|
const SiteDoubledGaugeField &U,
|
||||||
const _Spinor &chi,
|
const _Spinor &chi,
|
||||||
int mu,
|
int mu)
|
||||||
StencilEntry *SE,
|
|
||||||
StencilView &St)
|
|
||||||
{
|
{
|
||||||
auto UU = coalescedRead(U(mu));
|
auto UU = coalescedRead(U(mu));
|
||||||
mult(&phi(), &UU, &chi());
|
mult(&phi(), &UU, &chi());
|
||||||
}
|
}
|
||||||
|
template<class _Spinor>
|
||||||
static accelerator_inline void multLinkProp(SitePropagator &phi,
|
static accelerator_inline void multLink(_Spinor &phi,
|
||||||
const SiteDoubledGaugeField &U,
|
const SiteDoubledGaugeField &U,
|
||||||
const SitePropagator &chi,
|
const _Spinor &chi,
|
||||||
int mu)
|
int mu,
|
||||||
|
StencilEntry *SE,
|
||||||
|
StencilView &St)
|
||||||
{
|
{
|
||||||
mult(&phi(), &U(mu), &chi());
|
multLink(phi,U,chi,mu);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template <class ref>
|
template <class ref>
|
||||||
static accelerator_inline void loadLinkElement(Simd ®, ref &memory)
|
static accelerator_inline void loadLinkElement(Simd ®, ref &memory)
|
||||||
|
@ -364,19 +364,19 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
|||||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSite); return;}
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSite); return;}
|
||||||
#ifndef GRID_NVCC
|
#ifndef GRID_NVCC
|
||||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSite); return;}
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSite); return;}
|
||||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;}
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); printf("."); return;}
|
||||||
#endif
|
#endif
|
||||||
} else if( interior ) {
|
} else if( interior ) {
|
||||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSiteInt); return;}
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSiteInt); return;}
|
||||||
#ifndef GRID_NVCC
|
#ifndef GRID_NVCC
|
||||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSiteInt); return;}
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSiteInt); return;}
|
||||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); printf("-"); return;}
|
||||||
#endif
|
#endif
|
||||||
} else if( exterior ) {
|
} else if( exterior ) {
|
||||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSiteExt); return;}
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { HOST_CALL(GenericDhopSiteExt); return;}
|
||||||
#ifndef GRID_NVCC
|
#ifndef GRID_NVCC
|
||||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSiteExt); return;}
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSiteExt); return;}
|
||||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;}
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); printf("+"); return;}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
assert(0 && " Kernel optimisation case not covered ");
|
assert(0 && " Kernel optimisation case not covered ");
|
||||||
@ -440,7 +440,7 @@ void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd(const SitePropagator &
|
|||||||
SitePropagator result, tmp;
|
SitePropagator result, tmp;
|
||||||
Gamma g5(Gamma::Algebra::Gamma5);
|
Gamma g5(Gamma::Algebra::Gamma5);
|
||||||
|
|
||||||
Impl::multLinkProp(tmp, U[sU], q_in_1, mu);
|
Impl::multLink(tmp, U[sU], q_in_1, mu);
|
||||||
|
|
||||||
result = g5 * adj(q_in_2) * g5 * WilsonCurrentFwd(tmp, mu);
|
result = g5 * adj(q_in_2) * g5 * WilsonCurrentFwd(tmp, mu);
|
||||||
|
|
||||||
@ -469,7 +469,7 @@ void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd(const SitePropagator &
|
|||||||
SitePropagator result, tmp;
|
SitePropagator result, tmp;
|
||||||
Gamma g5(Gamma::Algebra::Gamma5);
|
Gamma g5(Gamma::Algebra::Gamma5);
|
||||||
|
|
||||||
Impl::multLinkProp(tmp, U[sU], q_in_1, mu + Nd);
|
Impl::multLink(tmp, U[sU], q_in_1, mu + Nd);
|
||||||
|
|
||||||
result = g5 * adj(q_in_2) * g5 * WilsonCurrentBwd(tmp, mu);
|
result = g5 * adj(q_in_2) * g5 * WilsonCurrentBwd(tmp, mu);
|
||||||
if (switch_sign) {
|
if (switch_sign) {
|
||||||
@ -496,7 +496,7 @@ void WilsonKernels<Impl>::SeqConservedCurrentSiteFwd(const SitePropagator &q_in,
|
|||||||
{
|
{
|
||||||
SitePropagator result;
|
SitePropagator result;
|
||||||
|
|
||||||
Impl::multLinkProp(result, U[sU], q_in, mu);
|
Impl::multLink(result, U[sU], q_in, mu);
|
||||||
result = WilsonCurrentFwd(result, mu);
|
result = WilsonCurrentFwd(result, mu);
|
||||||
|
|
||||||
// Zero any unwanted timeslice entries.
|
// Zero any unwanted timeslice entries.
|
||||||
@ -525,7 +525,7 @@ void WilsonKernels<Impl>::SeqConservedCurrentSiteBwd(const SitePropagator &q_in,
|
|||||||
bool switch_sign)
|
bool switch_sign)
|
||||||
{
|
{
|
||||||
SitePropagator result;
|
SitePropagator result;
|
||||||
Impl::multLinkProp(result, U[sU], q_in, mu + Nd);
|
Impl::multLink(result, U[sU], q_in, mu + Nd);
|
||||||
result = WilsonCurrentBwd(result, mu);
|
result = WilsonCurrentBwd(result, mu);
|
||||||
|
|
||||||
// Zero any unwanted timeslice entries.
|
// Zero any unwanted timeslice entries.
|
||||||
|
@ -66,6 +66,7 @@ void WilsonKernels<IMPLEMENTATION>::ContractConservedCurrentSiteBwd( const SiteP
|
|||||||
|
|
||||||
HAND_SPECIALISE_GPARITY(IMPLEMENTATION);
|
HAND_SPECIALISE_GPARITY(IMPLEMENTATION);
|
||||||
|
|
||||||
|
|
||||||
template class WilsonKernels<IMPLEMENTATION>;
|
template class WilsonKernels<IMPLEMENTATION>;
|
||||||
|
|
||||||
|
|
||||||
|
14
TODO
14
TODO
@ -6,17 +6,20 @@ GPU branch code item work list
|
|||||||
* 0) Single GPU
|
* 0) Single GPU
|
||||||
- 128 bit integer table load in GPU code.
|
- 128 bit integer table load in GPU code.
|
||||||
- coalescedRead <- threadIdx.x
|
- coalescedRead <- threadIdx.x
|
||||||
|
- Gianluca's changes to Cayley into gpu-port
|
||||||
|
- GPU accelerate EOFA
|
||||||
- Clean up PRAGMAS, and SIMT_loop
|
- Clean up PRAGMAS, and SIMT_loop
|
||||||
thread_loop interface revisit.
|
thread_loop interface revisit.
|
||||||
for_n
|
for_n
|
||||||
for
|
for
|
||||||
|
- Staggered kernels -> GPU coalesced loop
|
||||||
|
- Staggered kernels inline for GPU -- DONE
|
||||||
|
|
||||||
|
|
||||||
* 2) 5D terms
|
* 2) 5D terms & Gianluca
|
||||||
- Cayley coefficients -> GPU retention or prefetch
|
- Cayley coefficients -> GPU retention or prefetch
|
||||||
- Gianluca's changes to Cayley into gpu-port
|
|
||||||
- GPU accelerate EOFA
|
|
||||||
- Mobius kernel fusion. -- Gianluca?
|
- Mobius kernel fusion. -- Gianluca?
|
||||||
|
- Make GPU offload reductions optionally deterministic -- Gianluca
|
||||||
|
|
||||||
* 3) Comms/NVlink
|
* 3) Comms/NVlink
|
||||||
- OpenMP tasks to run comms threads.
|
- OpenMP tasks to run comms threads.
|
||||||
@ -30,12 +33,9 @@ GPU branch code item work list
|
|||||||
|
|
||||||
* 5) Misc
|
* 5) Misc
|
||||||
|
|
||||||
- SIMD dirs in stencil
|
|
||||||
- Conserved current clean up.
|
- Conserved current clean up.
|
||||||
- multLinkProp eliminate
|
- multLinkProp eliminate
|
||||||
- Staggered kernels -> GPU coalesced loop
|
- SIMD dirs in stencil
|
||||||
- Staggered kernels inline for GPU -- DONE
|
|
||||||
- Make GPU offload reductions optionally deterministic -- Gianluca
|
|
||||||
|
|
||||||
7) Accelerate the cshift
|
7) Accelerate the cshift
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user