diff --git a/Grid/qcd/action/fermion/WilsonKernels.h b/Grid/qcd/action/fermion/WilsonKernels.h index 68422f28..c5d48095 100644 --- a/Grid/qcd/action/fermion/WilsonKernels.h +++ b/Grid/qcd/action/fermion/WilsonKernels.h @@ -90,22 +90,22 @@ private: // Specialised variants static accelerator void GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionFieldView &in, FermionFieldView &out); + int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out); static accelerator void GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionFieldView &in, FermionFieldView &out); + int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out); static accelerator void GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionFieldView &in, FermionFieldView &out); + int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out); static accelerator void GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionFieldView &in, FermionFieldView &out); + int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out); static accelerator void GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionFieldView &in, FermionFieldView &out); + int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out); static accelerator void GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionFieldView &in, FermionFieldView &out); + int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out); static void AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, int sF, int sU, int Ls, int Nsite, const FermionFieldView &in,FermionFieldView &out); diff --git 
a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index bdba7cb2..000b5445 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -61,9 +61,13 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \ spProj(chi,tmp); \ } else { \ - chi = coalescedRead(buf[SE->_offset],lane); \ + int s = sF % Ls; /* offset of sF inside its block of Ls sites (callers pass sF = sU*Ls + s); sU may be 0, so it must not be the divisor */ \ + chi = Zero(); \ + if ( (s==0)||(s==Ls-1)) { \ + chi = coalescedRead(buf[SE->_offset],lane); \ + } \ } \ - acceleratorSynchronise(); \ + acceleratorSynchronise(); \ Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \ Recon(result, Uchi); @@ -80,11 +84,14 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) #define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon) \ SE = st.GetEntry(ptype, Dir, sF); \ - if (!SE->_is_local ) { \ - auto chi = coalescedRead(buf[SE->_offset],lane); \ - Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \ - Recon(result, Uchi); \ - nmu++; \ + if (!SE->_is_local ) { \ + int s = sF % Ls; /* offset of sF inside its block of Ls sites (callers pass sF = sU*Ls + s); sU may be 0, so it must not be the divisor */ \ + if ( (s==0)||(s==Ls-1)) { \ + auto chi = coalescedRead(buf[SE->_offset],lane); \ + Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \ + Recon(result, Uchi); \ + nmu++; \ + } \ } \ acceleratorSynchronise(); @@ -111,7 +118,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) //////////////////////////////////////////////////////////////////// template accelerator_inline void WilsonKernels::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, - SiteHalfSpinor *buf, int sF, + SiteHalfSpinor *buf, int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) { typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; @@ -137,7 +144,7 @@ void WilsonKernels::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV template 
accelerator_inline void WilsonKernels::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, - SiteHalfSpinor *buf, int sF, + SiteHalfSpinor *buf, int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) { typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; @@ -166,7 +173,7 @@ void WilsonKernels::GenericDhopSite(StencilView &st, DoubledGaugeFieldView //////////////////////////////////////////////////////////////////// template accelerator_inline void WilsonKernels::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, - SiteHalfSpinor *buf, int sF, + SiteHalfSpinor *buf, int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) { typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; @@ -194,8 +201,8 @@ void WilsonKernels::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFi template accelerator_inline void WilsonKernels::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, - SiteHalfSpinor *buf, int sF, - int sU, const FermionFieldView &in, FermionFieldView &out) + SiteHalfSpinor *buf, int Ls, int sF, + int sU, const FermionFieldView &in, FermionFieldView &out) { typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; typedef decltype(coalescedRead(in[0])) calcSpinor; @@ -224,7 +231,7 @@ void WilsonKernels::GenericDhopSiteInt(StencilView &st, DoubledGaugeField //////////////////////////////////////////////////////////////////// template accelerator_inline void WilsonKernels::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, - SiteHalfSpinor *buf, int sF, + SiteHalfSpinor *buf, int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) { typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; @@ -255,7 +262,7 @@ void WilsonKernels::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFi template accelerator_inline void WilsonKernels::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, - SiteHalfSpinor *buf, int sF, + SiteHalfSpinor *buf, int Ls, int sF, int sU, 
const FermionFieldView &in, FermionFieldView &out) { typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; @@ -420,6 +427,15 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S WilsonKernels::A(st_v,U_v,buf,sF,sU,in_v,out_v); \ }); +#define KERNEL_CALLG(A) \ + const uint64_t NN = Nsite*Ls; \ + accelerator_forNB( ss, NN, Simd::Nsimd(), { \ + int sF = ss; \ + int sU = ss/Ls; \ + WilsonKernels::A(st_v,U_v,buf,Ls,sF,sU,in_v,out_v); \ + }); \ + accelerator_barrier(); + #define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier(); #define KERNEL_CALL_EXT(A) \ @@ -450,7 +466,7 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField autoView(st_v , st,AcceleratorRead); if( interior && exterior ) { - if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} + if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLG(GenericDhopSite); return;} #ifdef SYCL_HACK if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteSycl); return; } #else @@ -460,13 +476,13 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;} #endif } else if( interior ) { - if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;} + if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLG(GenericDhopSiteInt); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;} #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;} #endif } else if( exterior ) { - if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;} + if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLG(GenericDhopSiteExt); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;} #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { 
ASM_CALL(AsmDhopSiteExt); return;} @@ -485,21 +501,21 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField autoView(st_v ,st,AcceleratorRead); if( interior && exterior ) { - if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;} + if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLG(GenericDhopSiteDag); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;} #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;} #endif acceleratorFenceComputeStream(); } else if( interior ) { - if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;} + if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLG(GenericDhopSiteDagInt); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;} #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;} #endif } else if( exterior ) { acceleratorFenceComputeStream(); - if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;} + if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLG(GenericDhopSiteDagExt); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;} #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;}