From 81f2aeaece16b3ac4ab15ba92e0390095d96ab3c Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Wed, 12 Oct 2016 11:45:22 +0100 Subject: [PATCH] KNL streaming stores, and KNL performance coutners --- benchmarks/Benchmark_dwf.cc | 2 + benchmarks/Benchmark_dwf_sweep.cc | 2 +- configure.ac | 14 ++ lib/AlignedAllocator.h | 5 +- lib/Stat.cc | 233 ++++++++++++++++++ lib/Stat.h | 100 ++++++++ lib/Threads.h | 6 +- lib/cartesian/Cartesian_red_black.h | 2 +- lib/qcd/action/fermion/WilsonFermion5D.cc | 22 ++ lib/qcd/action/fermion/WilsonFermion5D.h | 3 + lib/qcd/action/fermion/WilsonKernelsAsmBody.h | 4 + lib/simd/Intel512common.h | 7 +- 12 files changed, 393 insertions(+), 7 deletions(-) create mode 100644 lib/Stat.cc create mode 100644 lib/Stat.h diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index bc9ab708..6a283085 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -251,11 +251,13 @@ int main (int argc, char ** argv) sr_o = zero; sDw.ZeroCounters(); + sDw.stat.init("DhopEO"); double t0=usecond(); for (int i = 0; i < ncall; i++) { sDw.DhopEO(ssrc_o, sr_e, DaggerNo); } double t1=usecond(); + sDw.stat.print(); double volume=Ls; for(int mu=0;mu +#include +#include + + +namespace Grid { + + +bool PmuStat::pmu_initialized=false; + + +void PmuStat::init(const char *regname) +{ + name = regname; + if (!pmu_initialized) + { + std::cout<<"initialising pmu"< #ifdef GRID_OMP #include -#define PARALLEL_FOR_LOOP _Pragma("omp parallel for ") +#ifdef GRID_NUMA +#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(static)") +#else +#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(runtime)") +#endif #define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)") #else #define PARALLEL_FOR_LOOP diff --git a/lib/cartesian/Cartesian_red_black.h b/lib/cartesian/Cartesian_red_black.h index 3c10403f..db0508d5 100644 --- a/lib/cartesian/Cartesian_red_black.h +++ b/lib/cartesian/Cartesian_red_black.h @@ -178,7 +178,7 @@ public: // all elements of a simd vector must have same checkerboard. // If Ls vectorised, this must still be the case; e.g. dwf rb5d if ( _simd_layout[d]>1 ) { - if ( d != _checker_dim ) { + if ( checker_dim_mask[d] ) { assert( (_rdimensions[d]&0x1) == 0 ); } } diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 3ced3443..a96b6fca 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -416,6 +416,28 @@ void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in, out); } +#ifdef AVX512 + } else if (stat.is_init() ) { + + int nthreads; + stat.start(); + #pragma omp parallel + { + #pragma omp master + nthreads = omp_get_num_threads(); + int mythread = omp_get_thread_num(); + stat.enter(mythread); + #pragma omp for nowait + for(int ss=0;ssoSites();ss++) + { + int sU=ss; + int sF=LLs*sU; + Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out); + } + stat.exit(mythread); + } + stat.accum(nthreads); +#endif } else { PARALLEL_FOR_LOOP for (int ss = 0; ss < U._grid->oSites(); ss++) { diff --git a/lib/qcd/action/fermion/WilsonFermion5D.h b/lib/qcd/action/fermion/WilsonFermion5D.h index b9c35b7c..bc7ec543 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.h +++ b/lib/qcd/action/fermion/WilsonFermion5D.h @@ -31,6 +31,8 @@ Author: paboyle #ifndef GRID_QCD_WILSON_FERMION_5D_H #define GRID_QCD_WILSON_FERMION_5D_H +#include + namespace Grid { namespace QCD { @@ -60,6 +62,7 @@ namespace Grid { public: INHERIT_IMPL_TYPES(Impl); typedef WilsonKernels Kernels; + PmuStat stat; void Report(void); void ZeroCounters(void); diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h index d236a774..12579d8c 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h @@ -134,7 +134,9 @@ //////////////////////////////// // Xm //////////////////////////////// +#ifndef STREAM_STORE basep= (uint64_t) &out._odata[ss]; +#endif // basep= st.GetPFInfo(nent,plocal); nent++; if ( local ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit @@ -229,7 +231,9 @@ LOAD_CHI(base); } base= (uint64_t) &out._odata[ss]; +#ifndef STREAM_STORE PREFETCH_CHIMU(base); +#endif { MULT_2SPIN_DIR_PFTM(Tm,basep); } diff --git a/lib/simd/Intel512common.h b/lib/simd/Intel512common.h index dabbf6d8..dbfb30c2 100644 --- a/lib/simd/Intel512common.h +++ b/lib/simd/Intel512common.h @@ -138,9 +138,14 @@ Author: paboyle #define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri) #define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri) - +#define STREAM_STORE +#ifdef STREAM_STORE +#define VSTOREf(OFF,PTR,SRC) "vmovntps " #SRC "," #OFF "*64(" #PTR ")" ";\n" +#define VSTOREd(OFF,PTR,SRC) "vmovntpd " #SRC "," #OFF "*64(" #PTR ")" ";\n" +#else #define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n" #define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n" +#endif // Swaps Re/Im ; could unify this with IMCI #define VSHUFd(A,DEST) "vpshufd $0x4e," #A "," #DEST ";\n"