From 81f2aeaece16b3ac4ab15ba92e0390095d96ab3c Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Wed, 12 Oct 2016 11:45:22 +0100 Subject: [PATCH 1/6] KNL streaming stores, and KNL performance coutners --- benchmarks/Benchmark_dwf.cc | 2 + benchmarks/Benchmark_dwf_sweep.cc | 2 +- configure.ac | 14 ++ lib/AlignedAllocator.h | 5 +- lib/Stat.cc | 233 ++++++++++++++++++ lib/Stat.h | 100 ++++++++ lib/Threads.h | 6 +- lib/cartesian/Cartesian_red_black.h | 2 +- lib/qcd/action/fermion/WilsonFermion5D.cc | 22 ++ lib/qcd/action/fermion/WilsonFermion5D.h | 3 + lib/qcd/action/fermion/WilsonKernelsAsmBody.h | 4 + lib/simd/Intel512common.h | 7 +- 12 files changed, 393 insertions(+), 7 deletions(-) create mode 100644 lib/Stat.cc create mode 100644 lib/Stat.h diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index bc9ab708..6a283085 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -251,11 +251,13 @@ int main (int argc, char ** argv) sr_o = zero; sDw.ZeroCounters(); + sDw.stat.init("DhopEO"); double t0=usecond(); for (int i = 0; i < ncall; i++) { sDw.DhopEO(ssrc_o, sr_e, DaggerNo); } double t1=usecond(); + sDw.stat.print(); double volume=Ls; for(int mu=0;mu +#include +#include + + +namespace Grid { + + +bool PmuStat::pmu_initialized=false; + + +void PmuStat::init(const char *regname) +{ + name = regname; + if (!pmu_initialized) + { + std::cout<<"initialising pmu"< #ifdef GRID_OMP #include -#define PARALLEL_FOR_LOOP _Pragma("omp parallel for ") +#ifdef GRID_NUMA +#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(static)") +#else +#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(runtime)") +#endif #define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)") #else #define PARALLEL_FOR_LOOP diff --git a/lib/cartesian/Cartesian_red_black.h b/lib/cartesian/Cartesian_red_black.h index 3c10403f..db0508d5 100644 --- a/lib/cartesian/Cartesian_red_black.h +++ b/lib/cartesian/Cartesian_red_black.h @@ -178,7 +178,7 @@ public: // all elements of a simd vector must have same checkerboard. // If Ls vectorised, this must still be the case; e.g. dwf rb5d if ( _simd_layout[d]>1 ) { - if ( d != _checker_dim ) { + if ( checker_dim_mask[d] ) { assert( (_rdimensions[d]&0x1) == 0 ); } } diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 3ced3443..a96b6fca 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -416,6 +416,28 @@ void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in, out); } +#ifdef AVX512 + } else if (stat.is_init() ) { + + int nthreads; + stat.start(); + #pragma omp parallel + { + #pragma omp master + nthreads = omp_get_num_threads(); + int mythread = omp_get_thread_num(); + stat.enter(mythread); + #pragma omp for nowait + for(int ss=0;ssoSites();ss++) + { + int sU=ss; + int sF=LLs*sU; + Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out); + } + stat.exit(mythread); + } + stat.accum(nthreads); +#endif } else { PARALLEL_FOR_LOOP for (int ss = 0; ss < U._grid->oSites(); ss++) { diff --git a/lib/qcd/action/fermion/WilsonFermion5D.h b/lib/qcd/action/fermion/WilsonFermion5D.h index b9c35b7c..bc7ec543 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.h +++ b/lib/qcd/action/fermion/WilsonFermion5D.h @@ -31,6 +31,8 @@ Author: paboyle #ifndef GRID_QCD_WILSON_FERMION_5D_H #define GRID_QCD_WILSON_FERMION_5D_H +#include + namespace Grid { namespace QCD { @@ -60,6 +62,7 @@ namespace Grid { public: INHERIT_IMPL_TYPES(Impl); typedef WilsonKernels Kernels; + PmuStat stat; void Report(void); void ZeroCounters(void); diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h index d236a774..12579d8c 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h @@ -134,7 +134,9 @@ //////////////////////////////// // Xm //////////////////////////////// +#ifndef STREAM_STORE basep= (uint64_t) &out._odata[ss]; +#endif // basep= st.GetPFInfo(nent,plocal); nent++; if ( local ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit @@ -229,7 +231,9 @@ LOAD_CHI(base); } base= (uint64_t) &out._odata[ss]; +#ifndef STREAM_STORE PREFETCH_CHIMU(base); +#endif { MULT_2SPIN_DIR_PFTM(Tm,basep); } diff --git a/lib/simd/Intel512common.h b/lib/simd/Intel512common.h index dabbf6d8..dbfb30c2 100644 --- a/lib/simd/Intel512common.h +++ b/lib/simd/Intel512common.h @@ -138,9 +138,14 @@ Author: paboyle #define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri) #define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri) - +#define STREAM_STORE +#ifdef STREAM_STORE +#define VSTOREf(OFF,PTR,SRC) "vmovntps " #SRC "," #OFF "*64(" #PTR ")" ";\n" +#define VSTOREd(OFF,PTR,SRC) "vmovntpd " #SRC "," #OFF "*64(" #PTR ")" ";\n" +#else #define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n" #define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n" +#endif // Swaps Re/Im ; could unify this with IMCI #define VSHUFd(A,DEST) "vpshufd $0x4e," #A "," #DEST ";\n" From 9b63e97108f6716795a1f63cc43b5a813dbb3e3e Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Wed, 12 Oct 2016 11:51:21 +0100 Subject: [PATCH 2/6] align not absolutely required and confuses clang++ --- lib/Stat.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Stat.h b/lib/Stat.h index e93e1538..f3c0722e 100644 --- a/lib/Stat.h +++ b/lib/Stat.h @@ -40,11 +40,11 @@ struct knl_gbl_ class PmuStat { - const char *name; - __declspec(align(64)) uint64_t counters[8][256]; + uint64_t counters[8][256]; #ifdef _KNIGHTS_LANDING_ static struct knl_gbl_ gbl; #endif + const char *name; uint64_t reads; // memory reads uint64_t writes; // memory writes From 496beffa883ca419e5b20c9a04ea40c85655a5be Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Wed, 12 Oct 2016 12:06:08 +0100 Subject: [PATCH 3/6] Fix non-KNL build --- lib/Stat.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/lib/Stat.h b/lib/Stat.h index f3c0722e..553c4f8a 100644 --- a/lib/Stat.h +++ b/lib/Stat.h @@ -5,11 +5,14 @@ #define _KNIGHTS_LANDING_ #endif -#ifdef _KNIGHTS_LANDING_ +namespace Grid { +/////////////////////////////////////////////////////////////////////////////// +// Extra KNL counters from MCDRAM +/////////////////////////////////////////////////////////////////////////////// +#ifdef _KNIGHTS_LANDING_ #define NMC 6 #define NEDC 8 -namespace Grid { struct ctrs { uint64_t mcrd[NMC]; @@ -37,6 +40,7 @@ struct knl_gbl_ int edc_missm[NEDC]; }; #endif +/////////////////////////////////////////////////////////////////////////////// class PmuStat { @@ -95,6 +99,6 @@ public: }; } - #endif + From bd205a3293535f7be117ce04ca61a6e80223f174 Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Wed, 12 Oct 2016 12:09:15 +0100 Subject: [PATCH 4/6] Fixing for non x86 and non KNL --- lib/Stat.cc | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/lib/Stat.cc b/lib/Stat.cc index 9a025225..ed568517 100644 --- a/lib/Stat.cc +++ b/lib/Stat.cc @@ -11,6 +11,7 @@ bool PmuStat::pmu_initialized=false; void PmuStat::init(const char *regname) { +#ifdef __x86_64__ name = regname; if (!pmu_initialized) { @@ -19,9 +20,11 @@ void PmuStat::init(const char *regname) pmu_init(); } clear(); +#endif } void PmuStat::clear(void) { +#ifdef __x86_64__ count = 0; tregion = 0; pmc0 = 0; @@ -32,9 +35,11 @@ void PmuStat::clear(void) tcycles = 0; reads = 0; writes = 0; +#endif } void PmuStat::print(void) { +#ifdef __x86_64__ std::cout <<"Reg "< Date: Wed, 12 Oct 2016 12:29:08 +0100 Subject: [PATCH 5/6] __rdpmc needed for gcc, clang++ --- lib/Stat.cc | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/lib/Stat.cc b/lib/Stat.cc index ed568517..7f2e4086 100644 --- a/lib/Stat.cc +++ b/lib/Stat.cc @@ -59,35 +59,35 @@ void PmuStat::start(void) pmu_start(); ++count; xmemctrs(&mrstart, &mwstart); - tstart = _rdtsc(); + tstart = __rdtsc(); #endif } void PmuStat::enter(int t) { #ifdef __x86_64__ - counters[0][t] = _rdpmc(0); - counters[1][t] = _rdpmc(1); - counters[2][t] = _rdpmc((1<<30)|0); - counters[3][t] = _rdpmc((1<<30)|1); - counters[4][t] = _rdpmc((1<<30)|2); - counters[5][t] = _rdtsc(); + counters[0][t] = __rdpmc(0); + counters[1][t] = __rdpmc(1); + counters[2][t] = __rdpmc((1<<30)|0); + counters[3][t] = __rdpmc((1<<30)|1); + counters[4][t] = __rdpmc((1<<30)|2); + counters[5][t] = __rdtsc(); #endif } void PmuStat::exit(int t) { #ifdef __x86_64__ - counters[0][t] = _rdpmc(0) - counters[0][t]; - counters[1][t] = _rdpmc(1) - counters[1][t]; - counters[2][t] = _rdpmc((1<<30)|0) - counters[2][t]; - counters[3][t] = _rdpmc((1<<30)|1) - counters[3][t]; - counters[4][t] = _rdpmc((1<<30)|2) - counters[4][t]; - counters[5][t] = _rdtsc() - counters[5][t]; + counters[0][t] = __rdpmc(0) - counters[0][t]; + counters[1][t] = __rdpmc(1) - counters[1][t]; + counters[2][t] = __rdpmc((1<<30)|0) - counters[2][t]; + counters[3][t] = __rdpmc((1<<30)|1) - counters[3][t]; + counters[4][t] = __rdpmc((1<<30)|2) - counters[4][t]; + counters[5][t] = __rdtsc() - counters[5][t]; #endif } void PmuStat::accum(int nthreads) { #ifdef __x86_64__ - tend = _rdtsc(); + tend = __rdtsc(); xmemctrs(&mrend, &mwend); pmu_stop(); for (int t = 0; t < nthreads; ++t) { From 8bbd9ebc270acb0aa277187d743644a0a5d818b0 Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Wed, 12 Oct 2016 13:47:20 +0100 Subject: [PATCH 6/6] Reversing changes to Stencil class --- lib/Stat.h | 2 +- lib/Stencil.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/Stat.h b/lib/Stat.h index 553c4f8a..96bd594a 100644 --- a/lib/Stat.h +++ b/lib/Stat.h @@ -2,7 +2,7 @@ #define _GRID_STAT_H #ifdef AVX512 -#define _KNIGHTS_LANDING_ +#define _KNIGHTS_LANDING_ROOTONLY #endif namespace Grid { diff --git a/lib/Stencil.h b/lib/Stencil.h index 83aace54..72c55d0a 100644 --- a/lib/Stencil.h +++ b/lib/Stencil.h @@ -265,7 +265,7 @@ } inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { //_mm_prefetch((char *)&_entries[ent+1],_MM_HINT_T0); - uint64_t cbase = (uint64_t) & comm_buf[0]; + uint64_t cbase = (uint64_t) 0; local = _entries[ent]._is_local; perm = _entries[ent]._permute; if (perm) ptype = _permute_type[point]; @@ -273,7 +273,7 @@ else return cbase + _entries[ent]._byte_offset; } inline uint64_t GetPFInfo(int ent,uint64_t base) { - uint64_t cbase = (uint64_t)& comm_buf[0]; + uint64_t cbase = (uint64_t) 0; int local = _entries[ent]._is_local; if (local) return base + _entries[ent]._byte_offset; else return cbase + _entries[ent]._byte_offset;