From 3c3d6a94f3ad1df6a38fb3549f016f8c7dc979e6 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 4 Jan 2020 03:16:23 -0500 Subject: [PATCH] OPtimising the force term a bit --- .../WilsonKernelsImplementation.h | 84 +++++++++++++------ 1 file changed, 60 insertions(+), 24 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index a787fa79..f13bfdde 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -91,8 +91,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) } \ synchronise(); -#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon) \ - if (gamma == Dir) { \ +#define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon) \ if (SE->_is_local ) { \ int perm= SE->_permute; \ auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \ @@ -102,10 +101,14 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) } \ synchronise(); \ Impl::multLink(Uchi, U[sU], chi, dir, SE, st); \ - Recon(result, Uchi); \ - synchronise(); \ + Recon(result, Uchi); + +#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon) \ + if (gamma == Dir) { \ + GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon); \ } + //////////////////////////////////////////////////////////////////// // All legs kernels ; comms then compute //////////////////////////////////////////////////////////////////// @@ -284,7 +287,36 @@ void WilsonKernels::GenericDhopSiteExt(StencilView &st, DoubledGaugeField } }; -template +#define DhopDirMacro(Dir,spProj,spRecon) \ + template \ + void WilsonKernels::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \ + int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \ + { \ + typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; \ + typedef decltype(coalescedRead(in[0])) calcSpinor; \ + calcHalfSpinor chi; \ + calcSpinor result; \ + calcHalfSpinor Uchi; \ + StencilEntry *SE; \ + int ptype; \ + const int Nsimd = SiteHalfSpinor::Nsimd(); \ + const int lane=SIMTlane(Nsimd); \ + \ + SE = st.GetEntry(ptype, dir, sF); \ + GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon); \ + coalescedWrite(out[sF], result,lane); \ + } + +DhopDirMacro(Xp,spProjXp,spReconXp); +DhopDirMacro(Yp,spProjYp,spReconYp); +DhopDirMacro(Zp,spProjZp,spReconZp); +DhopDirMacro(Tp,spProjTp,spReconTp); +DhopDirMacro(Xm,spProjXm,spReconXm); +DhopDirMacro(Ym,spProjYm,spReconYm); +DhopDirMacro(Zm,spProjZm,spReconZm); +DhopDirMacro(Tm,spProjTm,spReconTm); + +template void WilsonKernels::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma) { @@ -299,18 +331,7 @@ void WilsonKernels::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si const int lane=SIMTlane(Nsimd); SE = st.GetEntry(ptype, dir, sF); - if (gamma == Xp) { - if (SE->_is_local ) { - int perm= SE->_permute; - auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); - spProjXp(chi,tmp); - } else { - chi = coalescedRead(buf[SE->_offset],lane); - } - Impl::multLink(Uchi, U[sU], chi, dir, SE, st); - spReconXp(result, Uchi); - } - + GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp); GENERIC_DHOPDIR_LEG(Yp,spProjYp,spReconYp); GENERIC_DHOPDIR_LEG(Zp,spProjZp,spReconZp); GENERIC_DHOPDIR_LEG(Tp,spProjTp,spReconTp); @@ -332,13 +353,28 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S auto in_v = in.View(); auto out_v = out.View(); auto st_v = st.View(); - accelerator_for(ss,Nsite,Simd::Nsimd(),{ - for(int s=0;s