KNL streaming stores, and KNL performance counters

2025-07-13 03:27:07 +01:00 · 2016-10-12 11:45:22 +01:00
parent 2d4a45c758
commit 81f2aeaece
12 changed files with 393 additions and 7 deletions
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@ -251,11 +251,13 @@ int main (int argc, char ** argv)
      sr_o = zero;

      sDw.ZeroCounters();
+      sDw.stat.init("DhopEO");
      double t0=usecond();
      for (int i = 0; i < ncall; i++) {
        sDw.DhopEO(ssrc_o, sr_e, DaggerNo);
      }
      double t1=usecond();
+      sDw.stat.print();

      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
      double flops=(1344.0*volume*ncall)/2;
--- a/benchmarks/Benchmark_dwf_sweep.cc
+++ b/benchmarks/Benchmark_dwf_sweep.cc
@ -51,7 +51,7 @@ int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);

-  const int Ls=16;
+  const int Ls=8;
  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

--- a/configure.ac
+++ b/configure.ac
@ -70,6 +70,20 @@ case ${ac_LAPACK} in
        AC_DEFINE([USE_LAPACK],[1],[use LAPACK])
 esac

+################## first-touch ####################
+# --enable-numa switches on first-touch page placement by defining GRID_NUMA.
+# The value given by the user arrives in the lower-case shell variable
+# enable_numa; shell variable names are case-sensitive.
+AC_ARG_ENABLE([numa],
+    [AS_HELP_STRING([--enable-numa=yes|no|prefix], [enable first touch numa opt])],
+    [ac_NUMA=${enable_numa}],[ac_NUMA=no])
+
+case ${ac_NUMA} in
+    no)
+        ;;
+    yes)
+        AC_DEFINE([GRID_NUMA],[1],[First touch numa locality]);;
+    *)
+        AC_DEFINE([GRID_NUMA],[1],[First touch numa locality]);;
+esac
+
 ################## FFTW3 ####################
 AC_ARG_WITH([fftw],    
            [AS_HELP_STRING([--with-fftw=prefix],
--- a/lib/AlignedAllocator.h
+++ b/lib/AlignedAllocator.h
@ -113,9 +113,8 @@ public:

 #endif
    _Tp tmp;
-#undef FIRST_TOUCH_OPTIMISE
-#ifdef FIRST_TOUCH_OPTIMISE
-#pragma omp parallel for 
+#ifdef GRID_NUMA
+#pragma omp parallel for schedule(static)
  for(int i=0;i<__n;i++){
    ptr[i]=tmp;
  }
--- a/lib/Stat.cc
+++ b/lib/Stat.cc
@ -0,0 +1,233 @@
+#include <Grid.h>
+#include <PerfCount.h>
+#include <Stat.h>
+
+
+namespace Grid { 
+
+
+bool PmuStat::pmu_initialized=false;
+
+
+// Attach a region name to this object, bring the PMU up the first time any
+// PmuStat is initialised (the flag is a static member), and zero the totals.
+void PmuStat::init(const char *regname)
+{
+  name = regname;
+  const bool first_use = !pmu_initialized;
+  if (first_use) {
+    std::cout<<"initialising pmu"<<std::endl;
+    pmu_initialized = true;
+    pmu_init();
+  }
+  clear();
+}
+// Zero the cumulative totals.  The per-thread counters[][] scratch array and
+// the start/end snapshots are not touched here.
+void PmuStat::clear(void)
+{
+  count = tregion = tcycles = 0;
+  pmc0 = pmc1 = 0;
+  inst = cyc = ref = 0;
+  reads = writes = 0;
+}
+// Write the accumulated totals for this region to std::cout: a "Reg <name>:"
+// heading followed by one labelled value per line.
+void PmuStat::print(void)
+{
+  const struct { const char *label; uint64_t value; } rows[] = {
+    { "  region ", tregion },
+    { "  cycles ", tcycles },
+    { "  inst   ", inst    },
+    { "  cyc    ", cyc     },
+    { "  ref    ", ref     },
+    { "  pmc0   ", pmc0    },
+    { "  pmc1   ", pmc1    },
+    { "  count  ", count   },
+    { "  reads  ", reads   },
+    { "  writes ", writes  },
+  };
+  std::cout <<"Reg "<<std::string(name)<<":\n";
+  for (unsigned r = 0; r < sizeof(rows)/sizeof(rows[0]); ++r) {
+    std::cout << rows[r].label << rows[r].value << std::endl;
+  }
+}
+void PmuStat::start(void)          // Open a measured region; accum() later closes it.
+{
+  pmu_start();
+  ++count;                         // one more invocation of this region
+  xmemctrs(&mrstart, &mwstart);    // memory read/write totals at region start
+  tstart = _rdtsc();               // time stamp taken last, after the other bookkeeping
+}
+void PmuStat::enter(int t)         // Store thread t's starting counter values; t indexes counters[][256] and is not range-checked.
+{
+  counters[0][t] = _rdpmc(0);            // row 0: summed into pmc0 by accum()
+  counters[1][t] = _rdpmc(1);            // row 1: summed into pmc1
+  counters[2][t] = _rdpmc((1<<30)|0);    // row 2: summed into inst; NOTE(review): bit 30 presumably selects the fixed-function counters - confirm against the RDPMC documentation
+  counters[3][t] = _rdpmc((1<<30)|1);    // row 3: summed into cyc
+  counters[4][t] = _rdpmc((1<<30)|2);    // row 4: summed into ref
+  counters[5][t] = _rdtsc();             // row 5: summed into tcycles
+}
+void PmuStat::exit(int t)          // Overwrite each start value stored by enter(t) with the elapsed delta for thread t.
+{
+  counters[0][t] = _rdpmc(0) - counters[0][t];            // same counter selectors, same order as enter()
+  counters[1][t] = _rdpmc(1) - counters[1][t];
+  counters[2][t] = _rdpmc((1<<30)|0) - counters[2][t];
+  counters[3][t] = _rdpmc((1<<30)|1) - counters[3][t];
+  counters[4][t] = _rdpmc((1<<30)|2) - counters[4][t];
+  counters[5][t] = _rdtsc() - counters[5][t];             // time-stamp ticks this thread spent between enter() and exit()
+}
+// Close the region opened by start(): take the end snapshots, then add the
+// per-thread deltas left by exit() for threads 0..nthreads-1, the elapsed
+// time stamp and the memory traffic onto the running totals.
+void PmuStat::accum(int nthreads)
+{
+  tend = _rdtsc();
+  xmemctrs(&mrend, &mwend);
+  pmu_stop();
+
+  for (int thr = 0; thr < nthreads; thr++) {
+    pmc0    += counters[0][thr];
+    pmc1    += counters[1][thr];
+    inst    += counters[2][thr];
+    cyc     += counters[3][thr];
+    ref     += counters[4][thr];
+    tcycles += counters[5][thr];
+  }
+
+  tregion += tend  - tstart;
+  reads   += mrend - mrstart;
+  writes  += mwend - mwstart;
+}
+
+
+// Tear-down, arm and disarm hooks; all three currently have empty bodies.
+void PmuStat::pmu_fini(void) {}
+void PmuStat::pmu_start(void) {}
+void PmuStat::pmu_stop(void) {}
+void PmuStat::pmu_init(void)       // PMU bring-up; the body is empty unless _KNIGHTS_LANDING_ is defined.
+{
+#ifdef _KNIGHTS_LANDING_
+  KNLsetup();                      // opens the uncore counter descriptors kept in gbl
+#endif
+}
+// Report the current memory traffic counts through *mr (reads) and *mw
+// (writes).  On Knights Landing builds these are the sums of the EDC read and
+// write counters over all NEDC units; on every other build both are zero.
+void PmuStat::xmemctrs(uint64_t *mr, uint64_t *mw)
+{
+#ifdef _KNIGHTS_LANDING_
+  ctrs snapshot;
+  KNLreadctrs(snapshot);
+  uint64_t rd_total = 0;
+  uint64_t wr_total = 0;
+  for (int unit = 0; unit < NEDC; unit++) {
+    rd_total += snapshot.edcrd[unit];
+    wr_total += snapshot.edcwr[unit];
+  }
+  *mr = rd_total;
+  *mw = wr_total;
+#else
+  *mw = 0;
+  *mr = 0;
+#endif
+}
+
+#ifdef _KNIGHTS_LANDING_
+
+struct knl_gbl_ PmuStat::gbl;
+
+#define PMU_MEM
+
+// Open one uncore performance counter and return its descriptor through fd.
+//   ename : sysfs directory of the PMU box (KNLsetup passes paths such as
+//           /sys/devices/uncore_imc_<n>); its "type" file supplies the PMU type
+//   fd    : receives the descriptor returned by perf_event_open
+//   event : event code, placed in the low bits of perf_event_attr.config
+//   umask : unit mask, shifted into config bits 8 and up
+// Every failure is fatal: a diagnostic is written to stderr and the process
+// exits with a non-zero status so that callers and scripts can see the error.
+void PmuStat::KNLevsetup(const char *ename, int &fd, int event, int umask)
+{
+  char fname[1024];
+  snprintf(fname, sizeof(fname), "%s/type", ename);
+  FILE *fp = fopen(fname, "r");
+  if (fp == 0) {
+    ::fprintf(stderr, "open %s\n", fname);
+    ::exit(EXIT_FAILURE);
+  }
+  int type;
+  int ret = fscanf(fp, "%d", &type);
+  fclose(fp);
+  // Checked explicitly instead of with assert(): under NDEBUG an assert
+  // vanishes and an unparsed, uninitialised type would be handed to the kernel.
+  if (ret != 1) {
+    ::fprintf(stderr, "cannot parse PMU type from %s\n", fname);
+    ::exit(EXIT_FAILURE);
+  }
+  //  std::cout << "Using PMU type "<<type<<" from " << std::string(ename) <<std::endl;
+
+  struct perf_event_attr hw = {};
+  hw.size = sizeof(hw);
+  hw.type = type;
+  // see /sys/devices/uncore_*/format/*
+  // All of the events we are interested in are configured the same way, but
+  // that isn't always true. Proper code would parse the format files
+  hw.config = event | (umask << 8);
+  //hw.read_format = PERF_FORMAT_GROUP;
+  // unfortunately the above only works within a single PMU; might
+  // as well just read them one at a time
+  int cpu = 0;
+  fd = perf_event_open(&hw, -1, cpu, -1, 0);
+  if (fd == -1) {
+    // config is a 64-bit field; cast so the format matches on every ABI
+    ::fprintf(stderr, "CPU %d, box %s, event 0x%llx\n", cpu, ename, (unsigned long long) hw.config);
+    ::exit(EXIT_FAILURE);
+  }
+  //    std::cout << "event "<<std::string(ename)<<" set up for fd "<<fd<<" hw.config "<<hw.config <<std::endl;
+}
+
+
+// Open every uncore counter that KNLreadctrs() reads and keep the descriptors
+// in gbl: read/write queue inserts for each of the NMC memory controllers and
+// each of the NEDC EDC units, plus the four EDC hit/miss events.  Each counter
+// is opened through KNLevsetup().
+void PmuStat::KNLsetup(void)
+{
+  char fname[1024];
+
+  // MC RPQ inserts and WPQ inserts (reads & writes)
+  for (int mc = 0; mc < NMC; ++mc)
+    {
+      ::snprintf(fname, sizeof(fname), "/sys/devices/uncore_imc_%d",mc);
+      // RPQ Inserts
+      KNLevsetup(fname, gbl.mc_rd[mc], 0x1, 0x1);
+      // WPQ Inserts
+      KNLevsetup(fname, gbl.mc_wr[mc], 0x2, 0x1);
+    }
+  // EDC RPQ inserts and WPQ inserts
+  for (int edc=0; edc < NEDC; ++edc)
+    {
+      ::snprintf(fname, sizeof(fname), "/sys/devices/uncore_edc_eclk_%d",edc);
+      // RPQ inserts
+      KNLevsetup(fname, gbl.edc_rd[edc], 0x1, 0x1);
+      // WPQ inserts
+      KNLevsetup(fname, gbl.edc_wr[edc], 0x2, 0x1);
+    }
+  // EDC HitE, HitM, MissE, MissM (same event code, one umask bit each)
+  for (int edc=0; edc < NEDC; ++edc)
+    {
+      ::snprintf(fname, sizeof(fname), "/sys/devices/uncore_edc_uclk_%d", edc);
+      KNLevsetup(fname, gbl.edc_hite[edc], 0x2, 0x1);
+      KNLevsetup(fname, gbl.edc_hitm[edc], 0x2, 0x2);
+      KNLevsetup(fname, gbl.edc_misse[edc], 0x2, 0x4);
+      KNLevsetup(fname, gbl.edc_missm[edc], 0x2, 0x8);
+    }
+}
+
+// Read the current 64-bit value of the counter behind descriptor fd.
+// A failed or short read is fatal: the byte count returned by read() is
+// reported on stderr and the process exits with a non-zero status.
+uint64_t PmuStat::KNLreadctr(int fd)
+{
+  uint64_t data;
+  // read() returns ssize_t; keeping it signed lets a -1 error be reported as -1
+  ssize_t s = ::read(fd, &data, sizeof(data));
+  if (s != (ssize_t) sizeof(data)) {
+    ::fprintf(stderr, "read counter %ld\n", (long) s);
+    ::exit(EXIT_FAILURE);
+  }
+  return data;
+}
+
+void PmuStat::KNLreadctrs(ctrs &c)   // Fill c with the current value of every counter whose descriptor is held in gbl.
+{
+  for (int i = 0; i < NMC; ++i)      // memory controllers: read and write counters
+    {
+      c.mcrd[i] = KNLreadctr(gbl.mc_rd[i]);
+      c.mcwr[i] = KNLreadctr(gbl.mc_wr[i]);
+    }
+  for (int i = 0; i < NEDC; ++i)     // EDC units: read and write counters
+    {
+      c.edcrd[i] = KNLreadctr(gbl.edc_rd[i]);
+      c.edcwr[i] = KNLreadctr(gbl.edc_wr[i]);
+    }
+  for (int i = 0; i < NEDC; ++i)     // EDC units: hit/miss counters
+    {
+      c.edchite[i] = KNLreadctr(gbl.edc_hite[i]);
+      c.edchitm[i] = KNLreadctr(gbl.edc_hitm[i]);
+      c.edcmisse[i] = KNLreadctr(gbl.edc_misse[i]);
+      c.edcmissm[i] = KNLreadctr(gbl.edc_missm[i]);
+    }
+}
+
+#endif
+}
--- a/lib/Stat.h
+++ b/lib/Stat.h
@ -0,0 +1,100 @@
+#ifndef _GRID_STAT_H
+#define _GRID_STAT_H
+
+#ifdef AVX512
+#define _KNIGHTS_LANDING_
+#endif
+
+// The namespace is opened unconditionally: PmuStat is declared for every
+// build and the closing brace at the end of this header is unconditional, so
+// opening it inside the KNL-only section left non-KNL builds unbalanced.
+namespace Grid { 
+
+#ifdef _KNIGHTS_LANDING_
+
+#define NMC 6
+#define NEDC 8
+// One snapshot of the uncore counter values, filled by PmuStat::KNLreadctrs:
+// NMC memory-controller entries and NEDC EDC entries per counter kind.
+struct ctrs
+{
+    uint64_t mcrd[NMC];
+    uint64_t mcwr[NMC];
+    uint64_t edcrd[NEDC]; 
+    uint64_t edcwr[NEDC];
+    uint64_t edchite[NEDC];
+    uint64_t edchitm[NEDC];
+    uint64_t edcmisse[NEDC];
+    uint64_t edcmissm[NEDC];
+};
+// Peter/Azusa:
+// Our modification of a code provided by Larry Meadows from Intel
+// Verified by email exchange non-NDA, ok for github. Should be as uses /sys/devices/ FS
+// so is already public and in the linux kernel for KNL.
+//
+// File descriptors of the opened counters, one per entry of struct ctrs.
+struct knl_gbl_
+{
+  int mc_rd[NMC];
+  int mc_wr[NMC];
+  int edc_rd[NEDC];
+  int edc_wr[NEDC];
+  int edc_hite[NEDC];
+  int edc_hitm[NEDC];
+  int edc_misse[NEDC];
+  int edc_missm[NEDC];
+};
+#endif
+
+// Accumulates performance-counter statistics for one named code region.
+// Usage as seen in this commit: init(name) once, then per invocation
+// start(), enter(t)/exit(t) on each thread t, accum(nthreads); print() dumps
+// the running totals.
+class PmuStat
+{
+    const char *name;
+    // Per-thread scratch, indexed [row][thread]; up to 256 threads.
+    // alignas is the standard spelling of the 64-byte alignment request.
+    alignas(64) uint64_t counters[8][256];
+#ifdef _KNIGHTS_LANDING_
+    static struct knl_gbl_ gbl;
+#endif
+
+    uint64_t reads;     // memory reads
+    uint64_t writes;    // memory writes
+    uint64_t mrstart;   // memory read counter at start of parallel region
+    uint64_t mrend;     // memory read counter at end of parallel region
+    uint64_t mwstart;   // memory write counter at start of parallel region
+    uint64_t mwend;     // memory write counter at end of parallel region
+
+    // cumulative counters
+    uint64_t count;     // number of invocations
+    uint64_t tregion;   // total time in parallel region (from thread 0)
+    uint64_t tcycles;   // total cycles inside parallel region
+    uint64_t inst, ref, cyc;   // fixed counters
+    uint64_t pmc0, pmc1;// pmu
+    // add memory counters here
+    // temp variables
+    uint64_t tstart;    // tsc at start of parallel region
+    uint64_t tend;      // tsc at end of parallel region
+    // map for counters[row][thread], as written by enter()/exit() and summed
+    // by accum(); each row holds the start value, then the delta after exit()
+    // 0 pmc0
+    // 1 pmc1
+    // 2 inst
+    // 3 cyc
+    // 4 ref
+    // 5 tsc (summed into tcycles)
+    // rows 6 and 7 are not used by enter()/exit()/accum()
+    static bool pmu_initialized;
+public:
+    static bool is_init(void){ return pmu_initialized;}
+    static void pmu_init(void);
+    static void pmu_fini(void);
+    static void pmu_start(void);
+    static void pmu_stop(void);
+    void accum(int nthreads);
+    static void xmemctrs(uint64_t *mr, uint64_t *mw);
+    void start(void);
+    void enter(int t);
+    void exit(int t);
+    void print(void);
+    void init(const char *regname);
+    void clear(void);
+#ifdef _KNIGHTS_LANDING_
+    static void     KNLsetup(void);
+    static uint64_t KNLreadctr(int fd);
+    static void     KNLreadctrs(ctrs &c);
+    static void     KNLevsetup(const char *ename, int &fd, int event, int umask);
+#endif
+    
+  };
+
+}
+
+#endif
+
--- a/lib/Threads.h
+++ b/lib/Threads.h
@ -37,7 +37,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 #ifdef GRID_OMP
 #include <omp.h>
-#define PARALLEL_FOR_LOOP _Pragma("omp parallel for ")
+#ifdef GRID_NUMA
+#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(static)")
+#else
+#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(runtime)")
+#endif
 #define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
 #else
 #define PARALLEL_FOR_LOOP 
--- a/lib/cartesian/Cartesian_red_black.h
+++ b/lib/cartesian/Cartesian_red_black.h
@ -178,7 +178,7 @@ public:
 	// all elements of a simd vector must have same checkerboard.
 	// If Ls vectorised, this must still be the case; e.g. dwf rb5d
 	if ( _simd_layout[d]>1 ) {
-	  if ( d != _checker_dim ) { 
+	  if ( checker_dim_mask[d] ) { 
 	    assert( (_rdimensions[d]&0x1) == 0 );
 	  }
 	}
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@ -416,6 +416,28 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
      Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in,
                                   out);
    }
+#ifdef AVX512
+  } else if (stat.is_init() ) {
+
+    int nthreads;
+    stat.start();
+    #pragma omp parallel
+    {
+    #pragma omp master
+    nthreads = omp_get_num_threads();
+    int mythread = omp_get_thread_num();
+    stat.enter(mythread);
+    #pragma omp for nowait
+   for(int ss=0;ss<U._grid->oSites();ss++)
+    {
+       int sU=ss;
+       int sF=LLs*sU;
+       Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
+     }
+    stat.exit(mythread);
+    }
+    stat.accum(nthreads);
+#endif
  } else {
    PARALLEL_FOR_LOOP
    for (int ss = 0; ss < U._grid->oSites(); ss++) {
--- a/lib/qcd/action/fermion/WilsonFermion5D.h
+++ b/lib/qcd/action/fermion/WilsonFermion5D.h
@ -31,6 +31,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_WILSON_FERMION_5D_H
 #define  GRID_QCD_WILSON_FERMION_5D_H

+#include <Grid/Stat.h>
+
 namespace Grid {

  namespace QCD {
@ -60,6 +62,7 @@ namespace Grid {
    public:
     INHERIT_IMPL_TYPES(Impl);
     typedef WilsonKernels<Impl> Kernels;
+     PmuStat stat;

     void Report(void);
     void ZeroCounters(void);
--- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
+++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
@ -134,7 +134,9 @@
  ////////////////////////////////
  // Xm
  ////////////////////////////////
+#ifndef STREAM_STORE
  basep= (uint64_t) &out._odata[ss];
+#endif
  //  basep= st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
@ -229,7 +231,9 @@
    LOAD_CHI(base);
  }
  base= (uint64_t) &out._odata[ss];
+#ifndef STREAM_STORE
  PREFETCH_CHIMU(base);
+#endif
  {
    MULT_2SPIN_DIR_PFTM(Tm,basep);
  }
--- a/lib/simd/Intel512common.h
+++ b/lib/simd/Intel512common.h
@ -138,9 +138,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define ZLOADf(OFF,PTR,ri,ir)  VLOADf(OFF,PTR,ir)  VSHUFf(ir,ri)
 #define ZLOADd(OFF,PTR,ri,ir)  VLOADd(OFF,PTR,ir)  VSHUFd(ir,ri)

-
+#define STREAM_STORE
+#ifdef STREAM_STORE
+#define VSTOREf(OFF,PTR,SRC)   "vmovntps " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
+#define VSTOREd(OFF,PTR,SRC)   "vmovntpd " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
+#else
 #define VSTOREf(OFF,PTR,SRC)   "vmovaps " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
 #define VSTOREd(OFF,PTR,SRC)   "vmovapd " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
+#endif

 // Swaps Re/Im ; could unify this with IMCI
 #define VSHUFd(A,DEST)         "vpshufd  $0x4e," #A "," #DEST  ";\n"