From 5ee3ea2144bafee061a883b9aa855e4a78848e8a Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Wed, 3 Jun 2020 11:58:20 +0200 Subject: [PATCH] round-up after testing of prefetches in stencil close --- .../WilsonKernelsAsmBodyA64FX.h | 10 ---------- .../WilsonKernelsImplementation.h | 3 +++ Grid/simd/Grid_a64fx-fixedsize.h | 19 +++++++++++++++++++ Grid/stencil/Stencil.h | 19 +++++++++++++++++++ 4 files changed, 41 insertions(+), 10 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h index 406e5c25..cebb4327 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h @@ -164,12 +164,7 @@ Author: Nils Meyer Regensburg University if((!local)&&(!st.same_node[Dir]) ) { \ LOAD_CHI(base); \ MULT_2SPIN_1(Dir); \ - PREFETCH_CHIMU(base); \ - /* PREFETCH_GAUGE_L1(NxtDir); */ \ MULT_2SPIN_2; \ - if (s == 0) { \ - if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ - } \ RECON; \ nmu++; \ } @@ -180,12 +175,7 @@ Author: Nils Meyer Regensburg University if((!local)&&(!st.same_node[Dir]) ) { \ LOAD_CHI(base); \ MULT_2SPIN_1(Dir); \ - PREFETCH_CHIMU(base); \ - /* PREFETCH_GAUGE_L1(NxtDir); */ \ MULT_2SPIN_2; \ - if (s == 0) { \ - if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ - } \ RECON; \ nmu++; \ } diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 81216e03..348f1425 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -445,18 +445,21 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField #ifndef GRID_NVCC if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); /* printf("."); */ return;} + //if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); printf("."); return;} #endif } else if( interior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;} #ifndef GRID_NVCC if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); /* printf("-"); */ return;} + //if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); printf("-"); return;} #endif } else if( exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;} #ifndef GRID_NVCC if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); /* printf("+"); */ return;} + //if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); printf("+"); return;} #endif } assert(0 && " Kernel optimisation case not covered "); diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 602d56f6..95e45759 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -799,6 +799,25 @@ typedef veci SIMD_Itype; // Integer type // prefetch utilities inline void v_prefetch0(int size, const char *ptr){}; + +/* PF 256 worse than PF 64 +inline void prefetch_HINT_T0(const char *ptr){ + static int64_t last_ptr; + int64_t vptr = reinterpret_cast(ptr) & 0x7fffffffffffff00ll; + if (last_ptr != vptr) { + last_ptr = vptr; + pred pg1 = Optimization::acle::pg1(); + svprfd(pg1, reinterpret_cast(ptr), SV_PLDL1STRM); + svprfd(pg1, ptr, SV_PLDL1STRM); + } +}; +*/ +/* beneficial for operators? +inline void prefetch_HINT_T0(const char *ptr){ + pred pg1 = Optimization::acle::pg1(); + svprfd(pg1, ptr, SV_PLDL1STRM); +}; +*/ inline void prefetch_HINT_T0(const char *ptr){}; // Function name aliases diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index a56d256d..1f1ebbb2 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -68,8 +68,27 @@ void Gather_plane_simple_table (Vector >& table,const Lattice int num=table.size(); std::pair *table_v = & table[0]; auto rhs_v = rhs.View(); + + // main loop accelerator_forNB( i,num, vobj::Nsimd(), { typedef decltype(coalescedRead(buffer[0])) compressed_t; + // prefetching: + // +1% performance for Wilson on 32**4 + // -2% performance for DW on 24**4 x 12 + /* + const int dist = 2; + if (i+dist < num){ + svbool_t pg1 = svptrue_b64(); + + // prefetch input + auto in = rhs_v(so+table_v[i+dist].second); + svprfd(pg1, (char*)&in, SV_PLDL1STRM); + + // prefetch store buffer + uint64_t o = table_v[i+dist].first; + svprfd(pg1, (char*)&buffer[off+o], SV_PSTL1STRM); + } +*/ compressed_t tmp_c; uint64_t o = table_v[i].first; compress.Compress(&tmp_c,0,rhs_v(so+table_v[i].second));