Commit 977b0a6dd9
Merge branch 'develop' into feature/hmc_generalise
@@ -9,10 +9,6 @@ matrix:
- os: osx
osx_image: xcode7.2
compiler: clang
- os: osx
osx_image: xcode7.2
compiler: gcc
env: VERSION=-5
- compiler: gcc
addons:
apt:
@@ -86,18 +86,6 @@ int main (int argc, char ** argv)
LatticeFermion tmp(FGrid);
LatticeFermion err(FGrid);

/* src=zero;
std::vector<int> origin(5,0);
SpinColourVector f=zero;
for(int sp=0;sp<4;sp++){
for(int co=0;co<3;co++){
f()(sp)(co)=Complex(1.0,0.0);
}}
pokeSite(f,src,origin);
*/

ColourMatrix cm = Complex(1.0,0.0);

LatticeGaugeField Umu(UGrid);
random(RNG4,Umu);

@@ -144,10 +132,12 @@ int main (int argc, char ** argv)

DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);

std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
std::cout<<GridLogMessage << "Naive wilson implementation "<<std::endl;
std::cout << GridLogMessage<< "Calling Dw"<<std::endl;
int ncall =100;
if (1) {

Dw.ZeroCounters();
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;

@@ -166,7 +156,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
// Dw.Report();
Dw.Report();
}

if (1)

@@ -188,8 +178,9 @@ int main (int argc, char ** argv)
peekSite(tmp,src,site);
pokeSite(tmp,ssrc,site);
}}}}}
std::cout<<"src norms "<< norm2(src)<<" " <<norm2(ssrc)<<std::endl;
std::cout<<GridLogMessage<< "src norms "<< norm2(src)<<" " <<norm2(ssrc)<<std::endl;
double t0=usecond();
sDw.ZeroCounters();
for(int i=0;i<ncall;i++){
__SSC_START;
sDw.Dhop(ssrc,sresult,0);

@@ -199,23 +190,23 @@ int main (int argc, char ** argv)
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall;

std::cout<<GridLogMessage << "Called Dw sinner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl;
// sDw.Report();
sDw.Report();

if(0){
for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
sDw.Dhop(ssrc,sresult,0);
PerformanceCounter Counter(i);
Counter.Start();
sDw.Dhop(ssrc,sresult,0);
Counter.Stop();
Counter.Report();
}
}

std::cout<<"res norms "<< norm2(result)<<" " <<norm2(sresult)<<std::endl;
std::cout<<GridLogMessage<< "res norms "<< norm2(result)<<" " <<norm2(sresult)<<std::endl;

RealF sum=0;
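Note on the performance figure above: the benchmark assigns 1344 floating-point operations to one Dhop site update, and usecond() is assumed to return wall-clock time in microseconds, so the arithmetic works out as

\mathrm{flops} = 1344 \times L_s \times \prod_{\mu=0}^{3} \mathrm{latt4}[\mu] \times n_{\mathrm{call}},
\qquad
\frac{\mathrm{flops}}{(t_1 - t_0)\,[\mu\mathrm{s}]} = \frac{10^{-6}\,\mathrm{flops}}{(t_1 - t_0)\times 10^{-6}\,[\mathrm{s}]} = \mathrm{Mflop/s},

i.e. dividing a raw flop count by a time in microseconds already yields Mflop/s, which is why no unit conversion appears in the code.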
@@ -230,12 +221,12 @@ int main (int argc, char ** argv)
peekSite(simd,sresult,site);
sum=sum+norm2(normal-simd);
if (norm2(normal-simd) > 1.0e-6 ) {
std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl;
std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" normal "<<normal<<std::endl;
std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" simd "<<simd<<std::endl;
}
}}}}}
std::cout<<" difference between normal and simd is "<<sum<<std::endl;
std::cout<<GridLogMessage<<" difference between normal and simd is "<<sum<<std::endl;

if (1) {

@@ -259,17 +250,21 @@ int main (int argc, char ** argv)
sr_e = zero;
sr_o = zero;

sDw.ZeroCounters();
sDw.stat.init("DhopEO");
double t0=usecond();
for(int i=0;i<ncall;i++){
sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
for (int i = 0; i < ncall; i++) {
sDw.DhopEO(ssrc_o, sr_e, DaggerNo);
}
double t1=usecond();
sDw.stat.print();

double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume*ncall)/2;

std::cout<<GridLogMessage << "sDeo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "sDeo mflop/s per node "<< flops/(t1-t0)/NP<<std::endl;
sDw.Report();

sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
sDw.DhopOE(ssrc_e,sr_o,DaggerNo);

@@ -294,18 +289,19 @@ int main (int argc, char ** argv)
// ref = src - Gamma(Gamma::GammaX)* src ; // 1+gamma_x
tmp = U[mu]*Cshift(src,mu+1,1);
for(int i=0;i<ref._odata.size();i++){
ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
}

tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1);
for(int i=0;i<ref._odata.size();i++){
ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
}
}
ref = -0.5*ref;
}
Dw.Dhop(src,result,1);
std::cout << GridLogMessage << "Naive wilson implementation Dag" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;

@@ -327,6 +323,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;

{
Dw.ZeroCounters();
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.DhopEO(src_o,r_e,DaggerNo);

@@ -338,6 +335,7 @@ int main (int argc, char ** argv)

std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NP<<std::endl;
Dw.Report();
}
Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo);
@@ -51,7 +51,7 @@ int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);

const int Ls=16;
const int Ls=8;
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
configure.ac (26 changes)
@@ -47,7 +47,7 @@ AC_ARG_WITH([gmp],
[AS_HELP_STRING([--with-gmp=prefix],
[try this for a non-standard install prefix of the GMP library])],
[AM_CXXFLAGS="-I$with_gmp/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_gmp/lib" $AM_LDFLAGS])
[AM_LDFLAGS="-L$with_gmp/lib $AM_LDFLAGS"])
AC_ARG_WITH([mpfr],
[AS_HELP_STRING([--with-mpfr=prefix],
[try this for a non-standard install prefix of the MPFR library])],

@@ -70,6 +70,20 @@ case ${ac_LAPACK} in
AC_DEFINE([USE_LAPACK],[1],[use LAPACK])
esac

################## first-touch ####################
AC_ARG_ENABLE([numa],
[AC_HELP_STRING([--enable-numa=yes|no|prefix], [enable first touch numa opt])],
[ac_NUMA=${enable_NUMA}],[ac_NUMA=no])

case ${ac_NUMA} in
no)
;;
yes)
AC_DEFINE([GRID_NUMA],[1],[First touch numa locality]);;
*)
AC_DEFINE([GRID_NUMA],[1],[First touch numa locality]);;
esac

################## FFTW3 ####################
AC_ARG_WITH([fftw],
[AS_HELP_STRING([--with-fftw=prefix],

@@ -117,7 +131,7 @@ CXXFLAGS=$CXXFLAGS_CPY
LDFLAGS=$LDFLAGS_CPY

############### SIMD instruction selection
AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVXFMA4|AVX2|AVX512|AVX512MIC|IMCI|KNL|KNC],\
AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVXFMA4|AVXFMA|AVX2|AVX512|AVX512MIC|IMCI|KNL|KNC],\
[Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, IMCI])],\
[ac_SIMD=${enable_simd}],[ac_SIMD=GEN])

@@ -133,6 +147,9 @@ case ${ax_cv_cxx_compiler_vendor} in
AVXFMA4)
AC_DEFINE([AVXFMA4],[1],[AVX intrinsics with FMA4])
SIMD_FLAGS='-mavx -mfma4';;
AVXFMA)
AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3])
SIMD_FLAGS='-mavx -mfma';;
AVX2)
AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
SIMD_FLAGS='-mavx2 -mfma';;

@@ -161,7 +178,10 @@ case ${ax_cv_cxx_compiler_vendor} in
SIMD_FLAGS='-mavx -xavx';;
AVXFMA4)
AC_DEFINE([AVXFMA4],[1],[AVX intrinsics with FMA4])
SIMD_FLAGS='-mavx -xavx -mfma';;
SIMD_FLAGS='-mavx -mfma';;
AVXFMA)
AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA4])
SIMD_FLAGS='-mavx -mfma';;
AVX2)
AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
SIMD_FLAGS='-march=core-avx2 -xcore-avx2';;
@@ -113,9 +113,8 @@ public:

#endif
_Tp tmp;
#undef FIRST_TOUCH_OPTIMISE
#ifdef FIRST_TOUCH_OPTIMISE
#pragma omp parallel for
#ifdef GRID_NUMA
#pragma omp parallel for schedule(static)
for(int i=0;i<__n;i++){
ptr[i]=tmp;
}
@@ -246,15 +246,15 @@ void Grid_init(int *argc,char ***argv)
std::cout <<std::endl;
std::cout <<COL_RED << "__|__|__|__|__"<< "|__|__|_"<<COL_PURPLE<<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
std::cout <<COL_RED << "__|__| | | "<< "| | | "<<COL_PURPLE<<" | | |"<< " | | | _|__"<<std::endl;
std::cout <<COL_RED << "__|__ "<< " "<<COL_PURPLE<<" "<< " _|__"<<std::endl;
std::cout <<COL_RED << "__|_ | | | "<< "| | | "<<COL_PURPLE<<" | | |"<< " | | | _|__"<<std::endl;
std::cout <<COL_RED << "__|_ "<< " "<<COL_PURPLE<<" "<< " _|__"<<std::endl;
std::cout <<COL_RED << "__|_ "<<COL_GREEN<<" GGGG "<<COL_RED<<" RRRR "<<COL_BLUE <<" III "<<COL_PURPLE<<"DDDD "<<COL_PURPLE<<" _|__"<<std::endl;
std::cout <<COL_RED << "__|_ "<<COL_GREEN<<"G "<<COL_RED<<" R R "<<COL_BLUE <<" I "<<COL_PURPLE<<"D D "<<COL_PURPLE<<" _|__"<<std::endl;
std::cout <<COL_RED << "__|_ "<<COL_GREEN<<"G "<<COL_RED<<" R R "<<COL_BLUE <<" I "<<COL_PURPLE<<"D D"<<COL_PURPLE<<" _|__"<<std::endl;
std::cout <<COL_BLUE << "__|_ "<<COL_GREEN<<"G GG "<<COL_RED<<" RRRR "<<COL_BLUE <<" I "<<COL_PURPLE<<"D D"<<COL_GREEN <<" _|__"<<std::endl;
std::cout <<COL_BLUE << "__|_ "<<COL_GREEN<<"G G "<<COL_RED<<" R R "<<COL_BLUE <<" I "<<COL_PURPLE<<"D D "<<COL_GREEN <<" _|__"<<std::endl;
std::cout <<COL_BLUE << "__|_ "<<COL_GREEN<<" GGGG "<<COL_RED<<" R R "<<COL_BLUE <<" III "<<COL_PURPLE<<"DDDD "<<COL_GREEN <<" _|__"<<std::endl;
std::cout <<COL_BLUE << "__|__ "<< " "<<COL_GREEN <<" "<< " _|__"<<std::endl;
std::cout <<COL_BLUE << "__|_ "<< " "<<COL_GREEN <<" "<< " _|__"<<std::endl;
std::cout <<COL_BLUE << "__|__|__|__|__"<< "|__|__|_"<<COL_GREEN <<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
std::cout <<COL_BLUE << " | | | | "<< "| | | "<<COL_GREEN <<" | | |"<< " | | | | "<<std::endl;
lib/Stat.cc (new file, 247 lines)
@@ -0,0 +1,247 @@
#include <Grid.h>
#include <PerfCount.h>
#include <Stat.h>

namespace Grid {

bool PmuStat::pmu_initialized=false;

void PmuStat::init(const char *regname)
{
#ifdef __x86_64__
name = regname;
if (!pmu_initialized)
{
std::cout<<"initialising pmu"<<std::endl;
pmu_initialized = true;
pmu_init();
}
clear();
#endif
}
void PmuStat::clear(void)
{
#ifdef __x86_64__
count = 0;
tregion = 0;
pmc0 = 0;
pmc1 = 0;
inst = 0;
cyc = 0;
ref = 0;
tcycles = 0;
reads = 0;
writes = 0;
#endif
}
void PmuStat::print(void)
{
#ifdef __x86_64__
std::cout <<"Reg "<<std::string(name)<<":\n";
std::cout <<" region "<<tregion<<std::endl;
std::cout <<" cycles "<<tcycles<<std::endl;
std::cout <<" inst "<<inst <<std::endl;
std::cout <<" cyc "<<cyc <<std::endl;
std::cout <<" ref "<<ref <<std::endl;
std::cout <<" pmc0 "<<pmc0 <<std::endl;
std::cout <<" pmc1 "<<pmc1 <<std::endl;
std::cout <<" count "<<count <<std::endl;
std::cout <<" reads "<<reads <<std::endl;
std::cout <<" writes "<<writes <<std::endl;
#endif
}
void PmuStat::start(void)
{
#ifdef __x86_64__
pmu_start();
++count;
xmemctrs(&mrstart, &mwstart);
tstart = __rdtsc();
#endif
}
void PmuStat::enter(int t)
{
#ifdef __x86_64__
counters[0][t] = __rdpmc(0);
counters[1][t] = __rdpmc(1);
counters[2][t] = __rdpmc((1<<30)|0);
counters[3][t] = __rdpmc((1<<30)|1);
counters[4][t] = __rdpmc((1<<30)|2);
counters[5][t] = __rdtsc();
#endif
}
void PmuStat::exit(int t)
{
#ifdef __x86_64__
counters[0][t] = __rdpmc(0) - counters[0][t];
counters[1][t] = __rdpmc(1) - counters[1][t];
counters[2][t] = __rdpmc((1<<30)|0) - counters[2][t];
counters[3][t] = __rdpmc((1<<30)|1) - counters[3][t];
counters[4][t] = __rdpmc((1<<30)|2) - counters[4][t];
counters[5][t] = __rdtsc() - counters[5][t];
#endif
}
void PmuStat::accum(int nthreads)
{
#ifdef __x86_64__
tend = __rdtsc();
xmemctrs(&mrend, &mwend);
pmu_stop();
for (int t = 0; t < nthreads; ++t) {
pmc0 += counters[0][t];
pmc1 += counters[1][t];
inst += counters[2][t];
cyc += counters[3][t];
ref += counters[4][t];
tcycles += counters[5][t];
}
uint64_t region = tend - tstart;
tregion += region;
uint64_t mreads = mrend - mrstart;
reads += mreads;
uint64_t mwrites = mwend - mwstart;
writes += mwrites;
#endif
}

void PmuStat::pmu_fini(void) {}
void PmuStat::pmu_start(void) {};
void PmuStat::pmu_stop(void) {};
void PmuStat::pmu_init(void)
{
#ifdef _KNIGHTS_LANDING_
KNLsetup();
#endif
}
void PmuStat::xmemctrs(uint64_t *mr, uint64_t *mw)
{
#ifdef _KNIGHTS_LANDING_
ctrs c;
KNLreadctrs(c);
uint64_t emr = 0, emw = 0;
for (int i = 0; i < NEDC; ++i)
{
emr += c.edcrd[i];
emw += c.edcwr[i];
}
*mr = emr;
*mw = emw;
#else
*mr = *mw = 0;
#endif
}

#ifdef _KNIGHTS_LANDING_

struct knl_gbl_ PmuStat::gbl;

#define PMU_MEM

void PmuStat::KNLevsetup(const char *ename, int &fd, int event, int umask)
{
char fname[1024];
snprintf(fname, sizeof(fname), "%s/type", ename);
FILE *fp = fopen(fname, "r");
if (fp == 0) {
::printf("open %s", fname);
::exit(0);
}
int type;
int ret = fscanf(fp, "%d", &type);
assert(ret == 1);
fclose(fp);
// std::cout << "Using PMU type "<<type<<" from " << std::string(ename) <<std::endl;

struct perf_event_attr hw = {};
hw.size = sizeof(hw);
hw.type = type;
// see /sys/devices/uncore_*/format/*
// All of the events we are interested in are configured the same way, but
// that isn't always true. Proper code would parse the format files
hw.config = event | (umask << 8);
//hw.read_format = PERF_FORMAT_GROUP;
// unfortunately the above only works within a single PMU; might
// as well just read them one at a time
int cpu = 0;
fd = perf_event_open(&hw, -1, cpu, -1, 0);
if (fd == -1) {
::printf("CPU %d, box %s, event 0x%lx", cpu, ename, hw.config);
::exit(0);
} else {
// std::cout << "event "<<std::string(ename)<<" set up for fd "<<fd<<" hw.config "<<hw.config <<std::endl;
}
}

void PmuStat::KNLsetup(void){

int ret;
char fname[1024];

// MC RPQ inserts and WPQ inserts (reads & writes)
for (int mc = 0; mc < NMC; ++mc)
{
::snprintf(fname, sizeof(fname), "/sys/devices/uncore_imc_%d",mc);
// RPQ Inserts
KNLevsetup(fname, gbl.mc_rd[mc], 0x1, 0x1);
// WPQ Inserts
KNLevsetup(fname, gbl.mc_wr[mc], 0x2, 0x1);
}
// EDC RPQ inserts and WPQ inserts
for (int edc=0; edc < NEDC; ++edc)
{
::snprintf(fname, sizeof(fname), "/sys/devices/uncore_edc_eclk_%d",edc);
// RPQ inserts
KNLevsetup(fname, gbl.edc_rd[edc], 0x1, 0x1);
// WPQ inserts
KNLevsetup(fname, gbl.edc_wr[edc], 0x2, 0x1);
}
// EDC HitE, HitM, MissE, MissM
for (int edc=0; edc < NEDC; ++edc)
{
::snprintf(fname, sizeof(fname), "/sys/devices/uncore_edc_uclk_%d", edc);
KNLevsetup(fname, gbl.edc_hite[edc], 0x2, 0x1);
KNLevsetup(fname, gbl.edc_hitm[edc], 0x2, 0x2);
KNLevsetup(fname, gbl.edc_misse[edc], 0x2, 0x4);
KNLevsetup(fname, gbl.edc_missm[edc], 0x2, 0x8);
}
}

uint64_t PmuStat::KNLreadctr(int fd)
{
uint64_t data;
size_t s = ::read(fd, &data, sizeof(data));
if (s != sizeof(uint64_t)){
::printf("read counter %lu", s);
::exit(0);
}
return data;
}

void PmuStat::KNLreadctrs(ctrs &c)
{
for (int i = 0; i < NMC; ++i)
{
c.mcrd[i] = KNLreadctr(gbl.mc_rd[i]);
c.mcwr[i] = KNLreadctr(gbl.mc_wr[i]);
}
for (int i = 0; i < NEDC; ++i)
{
c.edcrd[i] = KNLreadctr(gbl.edc_rd[i]);
c.edcwr[i] = KNLreadctr(gbl.edc_wr[i]);
}
for (int i = 0; i < NEDC; ++i)
{
c.edchite[i] = KNLreadctr(gbl.edc_hite[i]);
c.edchitm[i] = KNLreadctr(gbl.edc_hitm[i]);
c.edcmisse[i] = KNLreadctr(gbl.edc_misse[i]);
c.edcmissm[i] = KNLreadctr(gbl.edc_missm[i]);
}
}

#endif
}
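Note: the intended usage pattern for the new PmuStat counters can be read off the call sites in this same commit (sDw.stat.init("DhopEO") and sDw.stat.print() in the benchmark; stat.start()/enter()/exit()/accum() in WilsonFermion5D::DhopInternal below). A minimal sketch of that pattern, with the site kernel elided and the function name purely illustrative:

#include <omp.h>

// One start()/accum() pair brackets the whole parallel region; each
// thread records its own counter snapshot with enter() and its deltas
// with exit(), indexed by thread number.
void instrumented_region(Grid::PmuStat &stat, int nsites) {
  int nthreads;
  stat.start();                        // snapshot TSC and memory counters
#pragma omp parallel
  {
#pragma omp master
    nthreads = omp_get_num_threads();
    int mythread = omp_get_thread_num();
    stat.enter(mythread);              // per-thread counter snapshot
#pragma omp for nowait
    for (int ss = 0; ss < nsites; ss++) {
      /* ... per-site kernel ... */
    }
    stat.exit(mythread);               // per-thread counter deltas
  }
  stat.accum(nthreads);                // fold per-thread deltas into totals
}

Call stat.init("region") once beforehand and stat.print() afterwards, as the DhopEO benchmark does.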
lib/Stat.h (new file, 104 lines)
@@ -0,0 +1,104 @@
#ifndef _GRID_STAT_H
#define _GRID_STAT_H

#ifdef AVX512
#define _KNIGHTS_LANDING_ROOTONLY
#endif

namespace Grid {

///////////////////////////////////////////////////////////////////////////////
// Extra KNL counters from MCDRAM
///////////////////////////////////////////////////////////////////////////////
#ifdef _KNIGHTS_LANDING_
#define NMC 6
#define NEDC 8
struct ctrs
{
uint64_t mcrd[NMC];
uint64_t mcwr[NMC];
uint64_t edcrd[NEDC];
uint64_t edcwr[NEDC];
uint64_t edchite[NEDC];
uint64_t edchitm[NEDC];
uint64_t edcmisse[NEDC];
uint64_t edcmissm[NEDC];
};
// Peter/Azusa:
// Our modification of a code provided by Larry Meadows from Intel
// Verified by email exchange non-NDA, ok for github. Should be as uses /sys/devices/ FS
// so is already public and in the linux kernel for KNL.
struct knl_gbl_
{
int mc_rd[NMC];
int mc_wr[NMC];
int edc_rd[NEDC];
int edc_wr[NEDC];
int edc_hite[NEDC];
int edc_hitm[NEDC];
int edc_misse[NEDC];
int edc_missm[NEDC];
};
#endif
///////////////////////////////////////////////////////////////////////////////

class PmuStat
{
uint64_t counters[8][256];
#ifdef _KNIGHTS_LANDING_
static struct knl_gbl_ gbl;
#endif
const char *name;

uint64_t reads; // memory reads
uint64_t writes; // memory writes
uint64_t mrstart; // memory read counter at start of parallel region
uint64_t mrend; // memory read counter at end of parallel region
uint64_t mwstart; // memory write counter at start of parallel region
uint64_t mwend; // memory write counter at end of parallel region

// cumulative counters
uint64_t count; // number of invocations
uint64_t tregion; // total time in parallel region (from thread 0)
uint64_t tcycles; // total cycles inside parallel region
uint64_t inst, ref, cyc; // fixed counters
uint64_t pmc0, pmc1;// pmu
// add memory counters here
// temp variables
uint64_t tstart; // tsc at start of parallel region
uint64_t tend; // tsc at end of parallel region
// map for ctrs values
// 0 pmc0 start
// 1 pmc0 end
// 2 pmc1 start
// 3 pmc1 end
// 4 tsc start
// 5 tsc end
static bool pmu_initialized;
public:
static bool is_init(void){ return pmu_initialized;}
static void pmu_init(void);
static void pmu_fini(void);
static void pmu_start(void);
static void pmu_stop(void);
void accum(int nthreads);
static void xmemctrs(uint64_t *mr, uint64_t *mw);
void start(void);
void enter(int t);
void exit(int t);
void print(void);
void init(const char *regname);
void clear(void);
#ifdef _KNIGHTS_LANDING_
static void KNLsetup(void);
static uint64_t KNLreadctr(int fd);
static void KNLreadctrs(ctrs &c);
static void KNLevsetup(const char *ename, int &fd, int event, int umask);
#endif

};

}
#endif
lib/Stencil.h (182 changes)
@@ -70,9 +70,70 @@

namespace Grid {

template<class vobj,class cobj,class compressor> void
Gather_plane_simple_table_compute (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,std::vector<std::pair<int,int> >& table)
{
table.resize(0);
int rd = rhs._grid->_rdimensions[dimension];

if ( !rhs._grid->CheckerBoarded(dimension) ) {
cbmask = 0x3;
}
int so= plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];

int stride=rhs._grid->_slice_stride[dimension];
if ( cbmask == 0x3 ) {
table.resize(e1*e2);
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*stride;
int bo = n*e2;
table[bo+b]=std::pair<int,int>(bo+b,o+b);
}
}
} else {
int bo=0;
table.resize(e1*e2/2);
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*stride;
int ocb=1<<rhs._grid->CheckerBoardFromOindexTable(o+b);
if ( ocb &cbmask ) {
table[bo]=std::pair<int,int>(bo,o+b); bo++;
}
}
}
}
}

template<class vobj,class cobj,class compressor> void
Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,
compressor &compress, int off,int so)
{
PARALLEL_FOR_LOOP
for(int i=0;i<table.size();i++){
buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
}
}

template<class vobj,class cobj,class compressor> void
Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,
double &t_table ,double & t_data )
{
std::vector<std::pair<int,int> > table;
Gather_plane_simple_table_compute (rhs, buffer,dimension,plane,cbmask,compress,off,table);
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
Gather_plane_simple_table (table,rhs,buffer,compress,off,so);
}

struct StencilEntry {
uint32_t _offset;
uint32_t _byte_offset;
uint64_t _offset;
uint64_t _byte_offset;
uint16_t _is_local;
uint16_t _permute;
uint32_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline
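Note: the point of the table split introduced above is that the (buffer index, lattice offset) pairs for a face depend only on geometry, so they can be computed once per face and cached; the gather itself then becomes a flat, branch-free, trivially parallel copy. A minimal sketch of the pattern with generic types (these are not the Grid signatures):

#include <utility>
#include <vector>

// Phase 1 (once per face): record which source sites survive the
// checkerboard predicate and where they land in the send buffer.
template <class Pred>
std::vector<std::pair<int,int> > compute_table(int n, Pred keep) {
  std::vector<std::pair<int,int> > table;
  int bo = 0;
  for (int o = 0; o < n; o++)
    if (keep(o)) table.push_back(std::pair<int,int>(bo++, o));
  return table;
}

// Phase 2 (every halo exchange): replay the table with no branches.
template <class T>
void replay_gather(const std::vector<std::pair<int,int> > &table,
                   const T *src, T *buf) {
#pragma omp parallel for
  for (long i = 0; i < (long)table.size(); i++)
    buf[table[i].first] = src[table[i].second];
}

The timers t_table and t_data added further down separate the one-off table build from the per-call data movement.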
@@ -101,12 +162,14 @@
};

std::vector<Packet> Packets;

int face_table_computed;
std::vector<std::vector<std::pair<int,int> > > face_table ;

#define SEND_IMMEDIATE
#define SERIAL_SENDS

void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){
comms_bytes+=2.0*bytes;
#ifdef SEND_IMMEDIATE
commtime-=usecond();
_grid->SendToRecvFrom(xmit,to,rcv,from,bytes);

@@ -256,7 +319,8 @@
if( _entries[i]._is_local ) {
_entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj);
} else {
_entries[i]._byte_offset =(uint64_t)&comm_buf[0]+ _entries[i]._offset*sizeof(cobj);
// PrecomputeByteOffsets [5] 16384/32768 140735768678528 140735781261056 2581581952
_entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj);
}
}
};

@@ -265,17 +329,21 @@
// _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0);
}
inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
//_mm_prefetch((char *)&_entries[ent+1],_MM_HINT_T0);
uint64_t cbase = (uint64_t)&comm_buf[0];
local = _entries[ent]._is_local;
perm = _entries[ent]._permute;
if (perm) ptype = _permute_type[point];
if (local) return base + _entries[ent]._byte_offset;
else return _entries[ent]._byte_offset;
if (local) {
return base + _entries[ent]._byte_offset;
} else {
return cbase + _entries[ent]._byte_offset;
}
}
inline uint64_t GetPFInfo(int ent,uint64_t base) {
uint64_t cbase = (uint64_t)&comm_buf[0];
int local = _entries[ent]._is_local;
if (local) return base + _entries[ent]._byte_offset;
else return _entries[ent]._byte_offset;
if (local) return base + _entries[ent]._byte_offset;
else return cbase + _entries[ent]._byte_offset;
}

// Comms buffers

@@ -301,6 +369,48 @@
double gathermtime;
double splicetime;
double nosplicetime;
double t_data;
double t_table;
double calls;

void ZeroCounters(void) {
gathertime = 0.;
jointime = 0.;
commtime = 0.;
halogtime = 0.;
mergetime = 0.;
spintime = 0.;
gathermtime = 0.;
splicetime = 0.;
nosplicetime = 0.;
t_data = 0.0;
t_table= 0.0;
comms_bytes = 0.;
calls = 0.;
};

void Report(void) {
#define PRINTIT(A) \
std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
if ( calls > 0. ) {
std::cout << GridLogMessage << " Stencil calls "<<calls<<std::endl;
PRINTIT(halogtime);
PRINTIT(gathertime);
PRINTIT(gathermtime);
PRINTIT(mergetime);
if(comms_bytes>1.0){
PRINTIT(comms_bytes);
PRINTIT(commtime);
std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s "<<std::endl;
}
PRINTIT(jointime);
PRINTIT(spintime);
PRINTIT(splicetime);
PRINTIT(nosplicetime);
PRINTIT(t_table);
PRINTIT(t_data);
}
};
#endif

CartesianStencil(GridBase *grid,

@@ -310,18 +420,7 @@
const std::vector<int> &distances)
: _permute_type(npoints), _comm_buf_size(npoints)
{
#ifdef TIMING_HACK
gathertime=0;
jointime=0;
commtime=0;
halogtime=0;
mergetime=0;
spintime=0;
gathermtime=0;
splicetime=0;
nosplicetime=0;
comms_bytes=0;
#endif
face_table_computed=0;
_npoints = npoints;
_grid = grid;
_directions = directions;

@@ -623,6 +722,7 @@
template<class compressor>
void HaloExchange(const Lattice<vobj> &source,compressor &compress)
{
calls++;
Mergers.resize(0);
Packets.resize(0);
HaloGather(source,compress);

@@ -648,7 +748,7 @@
}
#endif
template<class compressor>
void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point)
void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
{
int dimension = _directions[point];
int displacement = _distances[point];

@@ -676,23 +776,23 @@
if ( sshift[0] == sshift[1] ) {
if (splice_dim) {
splicetime-=usecond();
GatherSimd(source,dimension,shift,0x3,compress);
GatherSimd(source,dimension,shift,0x3,compress,face_idx);
splicetime+=usecond();
} else {
nosplicetime-=usecond();
Gather(source,dimension,shift,0x3,compress);
Gather(source,dimension,shift,0x3,compress,face_idx);
nosplicetime+=usecond();
}
} else {
if(splice_dim){
splicetime-=usecond();
GatherSimd(source,dimension,shift,0x1,compress);// if checkerboard is unfavourable take two passes
GatherSimd(source,dimension,shift,0x2,compress);// both with block stride loop iteration
GatherSimd(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes
GatherSimd(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration
splicetime+=usecond();
} else {
nosplicetime-=usecond();
Gather(source,dimension,shift,0x1,compress);
Gather(source,dimension,shift,0x2,compress);
Gather(source,dimension,shift,0x1,compress,face_idx);
Gather(source,dimension,shift,0x2,compress,face_idx);
nosplicetime+=usecond();
}
}

@@ -710,17 +810,19 @@
u_comm_offset=0;

// Gather all comms buffers
int face_idx=0;
for(int point = 0 ; point < _npoints; point++) {
compress.Point(point);
HaloGatherDir(source,compress,point);
HaloGatherDir(source,compress,point,face_idx);
}
face_table_computed=1;

assert(u_comm_offset==_unified_buffer_size);
halogtime+=usecond();
}

template<class compressor>
void Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress)
void Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx)
{
typedef typename cobj::vector_type vector_type;
typedef typename cobj::scalar_type scalar_type;

@@ -757,8 +859,20 @@
int bytes = words * sizeof(cobj);

gathertime-=usecond();
Gather_plane_simple (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset);
int so = sx*rhs._grid->_ostride[dimension]; // base offset for start of plane
if ( !face_table_computed ) {
t_table-=usecond();
face_table.resize(face_idx+1);
Gather_plane_simple_table_compute (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset,face_table[face_idx]);
t_table+=usecond();
}
t_data-=usecond();
Gather_plane_simple_table (face_table[face_idx],rhs,u_send_buf,compress,u_comm_offset,so);
face_idx++;
t_data+=usecond();
gathertime+=usecond();

// Gather_plane_simple_stencil (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset,t_table,t_data);

int rank = _grid->_processor;
int recv_from_rank;

@@ -781,7 +895,7 @@

template<class compressor>
void GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress)
void GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx)
{
const int Nsimd = _grid->Nsimd();
@@ -37,7 +37,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

#ifdef GRID_OMP
#include <omp.h>
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for ")
#ifdef GRID_NUMA
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(static)")
#else
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(runtime)")
#endif
#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
#else
#define PARALLEL_FOR_LOOP
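Note: this macro change is meant to work together with the allocator's schedule(static) first-touch loop earlier in this commit. Under a Linux-style first-touch page placement policy, the thread that first writes a page decides which NUMA node backs it, so initialising and computing with the same static partition keeps each thread on local memory. A minimal illustration of the idea (the function names here are illustrative only):

// First-touch placement: the initialising thread pins the page.
void first_touch_init(double *ptr, long n) {
#pragma omp parallel for schedule(static)
  for (long i = 0; i < n; i++) ptr[i] = 0.0;   // touch == placement
}

// Later compute loops reuse the identical schedule(static) partition,
// so thread t revisits exactly the pages thread t first touched.
void compute(double *ptr, long n) {
#pragma omp parallel for schedule(static)
  for (long i = 0; i < n; i++) ptr[i] *= 2.0;
}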
@@ -1,153 +1,168 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: ./lib/algorithms/iterative/ConjugateGradient.h

Copyright (C) 2015

Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_H
#define GRID_CONJUGATE_GRADIENT_H

namespace Grid {

/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////

template<class Field>
class ConjugateGradient : public OperatorFunction<Field> {
public:
bool ErrorOnNoConverge; //throw an assert when the CG fails to converge. Defaults true.
RealD Tolerance;
Integer MaxIterations;
ConjugateGradient(RealD tol,Integer maxit, bool err_on_no_conv = true) : Tolerance(tol), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv){
};
template <class Field>
class ConjugateGradient : public OperatorFunction<Field> {
public:
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
: Tolerance(tol),
MaxIterations(maxit),
ErrorOnNoConverge(err_on_no_conv){};

void operator()(LinearOperatorBase<Field> &Linop, const Field &src,
Field &psi) {
psi.checkerboard = src.checkerboard;
conformable(psi, src);

void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
RealD cp, c, a, d, b, ssq, qq, b_pred;

psi.checkerboard = src.checkerboard;
conformable(psi,src);
Field p(src);
Field mmp(src);
Field r(src);

RealD cp,c,a,d,b,ssq,qq,b_pred;

Field p(src);
Field mmp(src);
Field r(src);

//Initial residual computation & set up
RealD guess = norm2(psi);
assert(std::isnan(guess)==0);
// Initial residual computation & set up
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);

Linop.HermOpAndNorm(psi,mmp,d,b);

r= src-mmp;
p= r;

a =norm2(p);
cp =a;
ssq=norm2(src);

Linop.HermOpAndNorm(psi, mmp, d, b);

std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: guess "<<guess<<std::endl;
std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: src "<<ssq <<std::endl;
std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: mp "<<d <<std::endl;
std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: mmp "<<b <<std::endl;
std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: cp,r "<<cp <<std::endl;
std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: p "<<a <<std::endl;
r = src - mmp;
p = r;

RealD rsq = Tolerance* Tolerance*ssq;

//Check if guess is really REALLY good :)
if ( cp <= rsq ) {
return;
}

std::cout<<GridLogIterative << std::setprecision(4)<< "ConjugateGradient: k=0 residual "<<cp<<" target "<<rsq<<std::endl;
a = norm2(p);
cp = a;
ssq = norm2(src);

GridStopWatch LinalgTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: guess " << guess << std::endl;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: src " << ssq << std::endl;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: mp " << d << std::endl;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: mmp " << b << std::endl;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: cp,r " << cp << std::endl;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: p " << a << std::endl;

SolverTimer.Start();
int k;
for (k=1;k<=MaxIterations;k++){

c=cp;
RealD rsq = Tolerance * Tolerance * ssq;

MatrixTimer.Start();
Linop.HermOpAndNorm(p,mmp,d,qq);
MatrixTimer.Stop();

LinalgTimer.Start();
// RealD qqck = norm2(mmp);
// ComplexD dck = innerProduct(p,mmp);

a = c/d;
b_pred = a*(a*qq-d)/c;

cp = axpy_norm(r,-a,mmp,r);
b = cp/c;

// Fuse these loops ; should be really easy
psi= a*p+psi;
p = p*b+r;

LinalgTimer.Stop();
std::cout<<GridLogIterative<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target "<< rsq<<std::endl;

// Stopping condition
if ( cp <= rsq ) {

SolverTimer.Stop();
Linop.HermOpAndNorm(psi,mmp,d,qq);
p=mmp-src;

RealD mmpnorm = sqrt(norm2(mmp));
RealD psinorm = sqrt(norm2(psi));
RealD srcnorm = sqrt(norm2(src));
RealD resnorm = sqrt(norm2(p));
RealD true_residual = resnorm/srcnorm;

std::cout<<GridLogMessage<<"ConjugateGradient: Converged on iteration " <<k
<<" computed residual "<<sqrt(cp/ssq)
<<" true residual " <<true_residual
<<" target "<<Tolerance<<std::endl;
std::cout<<GridLogMessage<<"Time elapsed: Total "<< SolverTimer.Elapsed() << " Matrix "<<MatrixTimer.Elapsed() << " Linalg "<<LinalgTimer.Elapsed();
std::cout<<std::endl;

if(ErrorOnNoConverge)
assert(true_residual/Tolerance < 1000.0);

return;
}
}
std::cout<<GridLogMessage<<"ConjugateGradient did NOT converge"<<std::endl;
if(ErrorOnNoConverge)
assert(0);
// Check if guess is really REALLY good :)
if (cp <= rsq) {
return;
}
};

std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq
<< std::endl;

GridStopWatch LinalgTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;

SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
c = cp;

MatrixTimer.Start();
Linop.HermOpAndNorm(p, mmp, d, qq);
MatrixTimer.Stop();

LinalgTimer.Start();
// RealD qqck = norm2(mmp);
// ComplexD dck = innerProduct(p,mmp);

a = c / d;
b_pred = a * (a * qq - d) / c;

cp = axpy_norm(r, -a, mmp, r);
b = cp / c;

// Fuse these loops ; should be really easy
psi = a * p + psi;
p = p * b + r;

LinalgTimer.Stop();
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
<< " residual " << cp << " target " << rsq << std::endl;

// Stopping condition
if (cp <= rsq) {
SolverTimer.Stop();
Linop.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src;

RealD mmpnorm = sqrt(norm2(mmp));
RealD psinorm = sqrt(norm2(psi));
RealD srcnorm = sqrt(norm2(src));
RealD resnorm = sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm;

std::cout << GridLogMessage
<< "ConjugateGradient: Converged on iteration " << k << std::endl;
std::cout << GridLogMessage << "Computed residual " << sqrt(cp / ssq)
<< " true residual " << true_residual << " target "
<< Tolerance << std::endl;
std::cout << GridLogMessage << "Time elapsed: Iterations "
<< SolverTimer.Elapsed() << " Matrix "
<< MatrixTimer.Elapsed() << " Linalg "
<< LinalgTimer.Elapsed();
std::cout << std::endl;

if (ErrorOnNoConverge) assert(true_residual / Tolerance < 1000.0);

return;
}
}
std::cout << GridLogMessage << "ConjugateGradient did NOT converge"
<< std::endl;
if (ErrorOnNoConverge) assert(0);
}
};
}
#endif
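Note: the old and reformatted solver bodies above implement the same standard conjugate-gradient recurrence; the edit is cosmetic apart from the restructured convergence printout. Mapping the code's variables (c = r_k^\dagger r_k, d = p_k^\dagger A p_k from HermOpAndNorm, a = \alpha_k, b = \beta_k, cp = the new residual norm returned by axpy_norm):

\alpha_k = \frac{r_k^\dagger r_k}{p_k^\dagger A p_k}, \qquad
r_{k+1} = r_k - \alpha_k A p_k, \qquad
\beta_k = \frac{r_{k+1}^\dagger r_{k+1}}{r_k^\dagger r_k},

\psi_{k+1} = \psi_k + \alpha_k p_k, \qquad
p_{k+1} = r_{k+1} + \beta_k p_k,

with the stopping test \; r_{k+1}^\dagger r_{k+1} \le \mathrm{Tolerance}^2 \, \|src\|^2 \; (the rsq in the code), and a final true-residual check \|A\psi - src\|/\|src\| against Tolerance.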
@@ -81,11 +81,8 @@ public:
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;
int CheckerBoardFromOindex (int Oindex){
std::vector<int> ocoor;
oCoorFromOindex(ocoor,Oindex);
return CheckerBoard(ocoor);
}
virtual int CheckerBoardFromOindex (int Oindex)=0;
virtual int CheckerBoardFromOindexTable (int Oindex)=0;

//////////////////////////////////////////////////////////////////////////////////////////////
// Local layout calculations
@@ -39,6 +39,13 @@ class GridCartesian: public GridBase {

public:

virtual int CheckerBoardFromOindexTable (int Oindex) {
return 0;
}
virtual int CheckerBoardFromOindex (int Oindex)
{
return 0;
}
virtual int CheckerBoarded(int dim){
return 0;
}
@@ -43,6 +43,7 @@ class GridRedBlackCartesian : public GridBase
public:
std::vector<int> _checker_dim_mask;
int _checker_dim;
std::vector<int> _checker_board;

virtual int CheckerBoarded(int dim){
if( dim==_checker_dim) return 1;

@@ -72,12 +73,20 @@ public:
// or by looping over x,y,z and multiply rather than computing checkerboard.

if ( (source_cb+ocb)&1 ) {

return (shift)/2;
} else {
return (shift+1)/2;
}
}
virtual int CheckerBoardFromOindexTable (int Oindex) {
return _checker_board[Oindex];
}
virtual int CheckerBoardFromOindex (int Oindex)
{
std::vector<int> ocoor;
oCoorFromOindex(ocoor,Oindex);
return CheckerBoard(ocoor);
}
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite){

if(dim != _checker_dim) return shift;

@@ -169,7 +178,7 @@ public:
// all elements of a simd vector must have same checkerboard.
// If Ls vectorised, this must still be the case; e.g. dwf rb5d
if ( _simd_layout[d]>1 ) {
if ( d != _checker_dim ) {
if ( checker_dim_mask[d] ) {
assert( (_rdimensions[d]&0x1) == 0 );
}
}

@@ -185,6 +194,8 @@ public:
_ostride[d] = _ostride[d-1]*_rdimensions[d-1];
_istride[d] = _istride[d-1]*_simd_layout[d-1];
}

}

////////////////////////////////////////////////////////////////////////////////////////////

@@ -205,6 +216,18 @@ public:
_slice_nblock[d]=nblock;
block = block*_rdimensions[d];
}

////////////////////////////////////////////////
// Create a checkerboard lookup table
////////////////////////////////////////////////
int rvol = 1;
for(int d=0;d<_ndimension;d++){
rvol=rvol * _rdimensions[d];
}
_checker_board.resize(rvol);
for(int osite=0;osite<_osites;osite++){
_checker_board[osite] = CheckerBoardFromOindex (osite);
}

};
protected:
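Note: the new _checker_board member trades a per-access coordinate decode (oCoorFromOindex plus a parity computation in CheckerBoard) for a single int load per site; the table is filled once at grid construction over the reduced volume rvol, and CheckerBoardFromOindexTable is what the hot gather path above now calls. A small sketch of the trade, with a hypothetical parity function standing in for the real CheckerBoard(ocoor):

#include <vector>

// Hypothetical decode: parity of the sum of reduced coordinates.
static int parity_from_oindex(long oindex, const std::vector<int> &rdims) {
  int cb = 0;
  for (int d = 0; d < (int)rdims.size(); d++) {
    cb += oindex % rdims[d];   // coordinate in dimension d
    oindex /= rdims[d];
  }
  return cb & 1;
}

// Build once at construction; afterwards parity lookup is one load.
std::vector<int> build_checker_board(long osites, const std::vector<int> &rdims) {
  std::vector<int> board(osites);
  for (long o = 0; o < osites; o++) board[o] = parity_from_oindex(o, rdims);
  return board;
}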
|
@ -1,3 +1,4 @@
|
||||
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
@ -56,6 +57,7 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
|
||||
|
||||
int e1=rhs._grid->_slice_nblock[dimension];
|
||||
int e2=rhs._grid->_slice_block[dimension];
|
||||
|
||||
int stride=rhs._grid->_slice_stride[dimension];
|
||||
if ( cbmask == 0x3 ) {
|
||||
PARALLEL_NESTED_LOOP2
|
||||
@ -68,15 +70,20 @@ PARALLEL_NESTED_LOOP2
|
||||
}
|
||||
} else {
|
||||
int bo=0;
|
||||
std::vector<std::pair<int,int> > table;
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
int o = n*stride;
|
||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||
int ocb=1<<rhs._grid->CheckerBoardFromOindexTable(o+b);
|
||||
if ( ocb &cbmask ) {
|
||||
buffer[off+bo++]=compress(rhs._odata[so+o+b]);
|
||||
table.push_back(std::pair<int,int> (bo++,o+b));
|
||||
}
|
||||
}
|
||||
}
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int i=0;i<table.size();i++){
|
||||
buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -186,10 +186,10 @@ namespace Grid {
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){

int Ls=Btilde._grid->_fdimensions[0];

GaugeLinkField tmp(mat._grid);
tmp = zero;
PARALLEL_FOR_LOOP

PARALLEL_FOR_LOOP
for(int sss=0;sss<tmp._grid->oSites();sss++){
int sU=sss;
for(int s=0;s<Ls;s++){

@@ -198,7 +198,7 @@ namespace Grid {
}
}
PokeIndex<LorentzIndex>(mat,tmp,mu);

}
};
@ -42,11 +42,11 @@ const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1
|
||||
// 5d lattice for DWF.
|
||||
template<class Impl>
|
||||
WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
RealD _M5,const ImplParams &p) :
|
||||
GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
RealD _M5,const ImplParams &p) :
|
||||
Kernels(p),
|
||||
_FiveDimGrid (&FiveDimGrid),
|
||||
_FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
|
||||
@ -135,10 +135,10 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
|
||||
/*
|
||||
template<class Impl>
|
||||
WilsonFermion5D<Impl>::WilsonFermion5D(int simd,GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
RealD _M5,const ImplParams &p) :
|
||||
GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
RealD _M5,const ImplParams &p) :
|
||||
{
|
||||
int nsimd = Simd::Nsimd();
|
||||
|
||||
@ -175,6 +175,73 @@ WilsonFermion5D<Impl>::WilsonFermion5D(int simd,GaugeField &_Umu,
|
||||
}
|
||||
*/
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::Report(void)
|
||||
{
|
||||
std::vector<int> latt = GridDefaultLatt();
|
||||
RealD volume = Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
|
||||
RealD NP = _FourDimGrid->_Nprocessors;
|
||||
|
||||
if ( DhopCalls > 0 ) {
|
||||
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion5D Number of Dhop Calls : " << DhopCalls << std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime
|
||||
<< " us" << std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls : "
|
||||
<< DhopCommTime / DhopCalls << " us" << std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion5D Total Compute time : "
|
||||
<< DhopComputeTime << " us" << std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : "
|
||||
<< DhopComputeTime / DhopCalls << " us" << std::endl;
|
||||
|
||||
RealD mflops = 1344*volume*DhopCalls/DhopComputeTime;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl;
|
||||
|
||||
}
|
||||
|
||||
if ( DerivCalls > 0 ) {
|
||||
std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion5D Number of Deriv Calls : " <<DerivCalls <<std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " <<DerivCommTime <<" us"<<std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion5D Total Compute time : " <<DerivComputeTime <<" us"<<std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion5D Total Dhop Compute time : " <<DerivDhopComputeTime <<" us"<<std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion5D Dhop ComputeTime/Calls : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
|
||||
|
||||
|
||||
|
||||
RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl;
|
||||
|
||||
}
|
||||
|
||||
if (DerivCalls > 0 || DhopCalls > 0){
|
||||
std::cout << GridLogMessage << "WilsonFermion5D Stencil"<<std::endl; Stencil.Report();
|
||||
std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl; StencilEven.Report();
|
||||
std::cout << GridLogMessage << "WilsonFermion5D StencilOdd"<<std::endl; StencilOdd.Report();
|
||||
}
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::ZeroCounters(void) {
|
||||
DhopCalls = 0;
|
||||
DhopCommTime = 0;
|
||||
DhopComputeTime = 0;
|
||||
|
||||
DerivCalls = 0;
|
||||
DerivCommTime = 0;
|
||||
DerivComputeTime = 0;
|
||||
DerivDhopComputeTime = 0;
|
||||
|
||||
Stencil.ZeroCounters();
|
||||
StencilEven.ZeroCounters();
|
||||
StencilOdd.ZeroCounters();
|
||||
}
|
||||
|
||||
|
||||
template<class Impl>
void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
{
@ -215,12 +282,13 @@ PARALLEL_FOR_LOOP

template<class Impl>
void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
                                          DoubledGaugeField & U,
                                          GaugeField &mat,
                                          const FermionField &A,
                                          const FermionField &B,
                                          int dag)
{
  DerivCalls++;
  assert((dag==DaggerNo) || (dag==DaggerYes));

  conformable(st._grid,A._grid);
@ -231,51 +299,53 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
  FermionField Btilde(B._grid);
  FermionField Atilde(B._grid);

  DerivCommTime -= usecond();
  st.HaloExchange(B,compressor);
  DerivCommTime += usecond();

  Atilde = A;

  DerivComputeTime -= usecond();
  for (int mu = 0; mu < Nd; mu++) {
    ////////////////////////////////////////////////////////////////////////
    // Flip gamma if dag
    ////////////////////////////////////////////////////////////////////////
    int gamma = mu;
    if (!dag) gamma += Nd;

    ////////////////////////
    // Call the single hop
    ////////////////////////
    DerivDhopComputeTime -= usecond();
    PARALLEL_FOR_LOOP
    for (int sss = 0; sss < U._grid->oSites(); sss++) {
      for (int s = 0; s < Ls; s++) {
        int sU = sss;
        int sF = s + Ls * sU;

        assert(sF < B._grid->oSites());
        assert(sU < U._grid->oSites());

        Kernels::DiracOptDhopDir(st, U, st.comm_buf, sF, sU, B, Btilde, mu, gamma);

        ////////////////////////////
        // spin trace outer product
        ////////////////////////////
      }
    }
    DerivDhopComputeTime += usecond();
    Impl::InsertForce5D(mat, Btilde, Atilde, mu);
  }
  DerivComputeTime += usecond();
}
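A note on the index arithmetic in DerivInternal above: sF = s + Ls*sU flattens the pair (fifth-dimension coordinate s, 4d gauge site sU) into a single 5d fermion site index, with the fifth dimension running fastest. A standalone sketch of the layout and its round-trip (plain C++; the sizes are made up for illustration):

#include <cassert>
#include <cstdio>

int main() {
  const int Ls = 8;         // fifth-dimension extent
  const int oSites4d = 16;  // number of 4d (gauge-field) outer sites
  // The 5d fermion field holds Ls * oSites4d sites; s runs fastest,
  // so all fifth-dimension slices of one 4d site are contiguous.
  for (int sU = 0; sU < oSites4d; sU++) {
    for (int s = 0; s < Ls; s++) {
      int sF = s + Ls * sU;         // the mapping used in DerivInternal
      assert(sF < Ls * oSites4d);   // same bound the asserts above check
      assert(sF % Ls == s);         // the mapping round-trips
      assert(sF / Ls == sU);
    }
  }
  std::printf("layout check passed\n");
  return 0;
}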
template<class Impl>
void WilsonFermion5D<Impl>::DhopDeriv(GaugeField &mat,
                                      const FermionField &A,
                                      const FermionField &B,
                                      int dag)
{
  conformable(A._grid,FermionGrid());
  conformable(A._grid,B._grid);
@ -288,9 +358,9 @@ void WilsonFermion5D<Impl>::DhopDeriv( GaugeField &mat,

template<class Impl>
void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
                                        const FermionField &A,
                                        const FermionField &B,
                                        int dag)
{
  conformable(A._grid,FermionRedBlackGrid());
  conformable(GaugeRedBlackGrid(),mat._grid);
@ -306,9 +376,9 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,

template<class Impl>
void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
                                        const FermionField &A,
                                        const FermionField &B,
                                        int dag)
{
  conformable(A._grid,FermionRedBlackGrid());
  conformable(GaugeRedBlackGrid(),mat._grid);
@ -323,32 +393,61 @@ void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
template<class Impl>
void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
                                         DoubledGaugeField & U,
                                         const FermionField &in, FermionField &out, int dag)
{
  DhopCalls++;
  // assert((dag==DaggerNo) || (dag==DaggerYes));
  Compressor compressor(dag);

  int LLs = in._grid->_rdimensions[0];

  DhopCommTime -= usecond();
  st.HaloExchange(in,compressor);
  DhopCommTime += usecond();

  DhopComputeTime -= usecond();
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
  if (dag == DaggerYes) {
    PARALLEL_FOR_LOOP
    for (int ss = 0; ss < U._grid->oSites(); ss++) {
      int sU = ss;
      int sF = LLs * sU;
      Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in, out);
    }
#ifdef AVX512
  } else if (stat.is_init()) {

    int nthreads;
    stat.start();
#pragma omp parallel
    {
#pragma omp master
      nthreads = omp_get_num_threads();
      int mythread = omp_get_thread_num();
      stat.enter(mythread);
#pragma omp for nowait
      for (int ss = 0; ss < U._grid->oSites(); ss++) {
        int sU = ss;
        int sF = LLs * sU;
        Kernels::DiracOptDhopSite(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in, out);
      }
      stat.exit(mythread);
    }
    stat.accum(nthreads);
#endif
  } else {
    PARALLEL_FOR_LOOP
    for (int ss = 0; ss < U._grid->oSites(); ss++) {
      int sU = ss;
      int sF = LLs * sU;
      Kernels::DiracOptDhopSite(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in, out);
    }
  }
  DhopComputeTime += usecond();
}
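The stat.enter/stat.exit pattern in the AVX512 branch above gives every OpenMP thread its own statistics slot, and the `nowait` clause drops the implicit barrier at the end of the work-shared loop because each thread records its exit independently. A minimal self-contained sketch of the same structure (plain OpenMP, compile with -fopenmp; per_thread is an illustrative stand-in for PmuStat, not Grid API):

#include <omp.h>
#include <cstdio>
#include <vector>

int main() {
  std::vector<long> per_thread(omp_get_max_threads(), 0);
  int nthreads = 1;

#pragma omp parallel
  {
#pragma omp master
    nthreads = omp_get_num_threads();  // recorded once, as in the Grid code
    int me = omp_get_thread_num();     // "stat.enter(me)": claim a private slot
#pragma omp for nowait
    for (int ss = 0; ss < 1000; ss++)
      per_thread[me] += ss;            // work accounted per thread, no sharing
    // "stat.exit(me)" would run here, per thread, with no barrier (nowait)
  }

  long total = 0;                      // "stat.accum(nthreads)"
  for (int t = 0; t < nthreads; t++) total += per_thread[t];
  std::printf("threads=%d sum=%ld (expect %d)\n", nthreads, total, 999 * 1000 / 2);
}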
@ -31,6 +31,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_QCD_WILSON_FERMION_5D_H
#define GRID_QCD_WILSON_FERMION_5D_H

#include <Grid/Stat.h>

namespace Grid {

  namespace QCD {
@ -60,6 +62,18 @@ namespace Grid {
    public:
      INHERIT_IMPL_TYPES(Impl);
      typedef WilsonKernels<Impl> Kernels;
      PmuStat stat;

      void Report(void);
      void ZeroCounters(void);
      double DhopCalls;
      double DhopCommTime;
      double DhopComputeTime;

      double DerivCalls;
      double DerivCommTime;
      double DerivComputeTime;
      double DerivDhopComputeTime;

      ///////////////////////////////////////////////////////////////
      // Implement the abstract base
@ -196,24 +196,11 @@ namespace Grid {
      WilsonKernels(const ImplParams &p = ImplParams());
    };

    ///////////////////////////////////////////////////////////
    // Default to no assembler implementation
    ///////////////////////////////////////////////////////////
    template<class Impl>
    void WilsonKernels<Impl>::DiracOptAsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
                                                  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
                                                  int ss, int ssU, int Ls, int Ns, const FermionField &in, FermionField &out)
    {
      assert(0);
    }
    template<class Impl>
    void WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
                                                     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
                                                     int ss, int ssU, int Ls, int Ns, const FermionField &in, FermionField &out)
    {
      assert(0);
    }

  }
}

#endif
@ -31,9 +31,30 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

#include <Grid.h>

namespace Grid {
namespace QCD {

///////////////////////////////////////////////////////////
// Default to no assembler implementation
///////////////////////////////////////////////////////////
template<class Impl>
void WilsonKernels<Impl>::DiracOptAsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
                                              std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
                                              int ss, int ssU, int Ls, int Ns, const FermionField &in, FermionField &out)
{
  assert(0);
}
template<class Impl>
void WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
                                                 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
                                                 int ss, int ssU, int Ls, int Ns, const FermionField &in, FermionField &out)
{
  assert(0);
}

#if defined(AVX512)

@ -102,6 +123,27 @@ namespace Grid {
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>

#endif

#define INSTANTIATE_ASM(A)\
template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,\
  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,\
  int ss, int ssU, int Ls, int Ns, const FermionField &in, FermionField &out);\
template void WilsonKernels<A>::DiracOptAsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,\
  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,\
  int ss, int ssU, int Ls, int Ns, const FermionField &in, FermionField &out);

INSTANTIATE_ASM(WilsonImplF);
INSTANTIATE_ASM(WilsonImplD);
INSTANTIATE_ASM(ZWilsonImplF);
INSTANTIATE_ASM(ZWilsonImplD);
INSTANTIATE_ASM(GparityWilsonImplF);
INSTANTIATE_ASM(GparityWilsonImplD);
INSTANTIATE_ASM(DomainWallVec5dImplF);
INSTANTIATE_ASM(DomainWallVec5dImplD);
INSTANTIATE_ASM(ZDomainWallVec5dImplF);
INSTANTIATE_ASM(ZDomainWallVec5dImplD);
}
}
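INSTANTIATE_ASM is the standard explicit-template-instantiation idiom: the member definitions live in this one translation unit, and each macro call forces the compiler to emit object code for a concrete Impl so other translation units can link against it. A minimal standalone sketch of the same idiom (illustrative names only, not Grid API):

#include <cstdio>

template <class T>
struct Kernel {
  static void run(T x);   // declared for all T, defined below in this TU only
};

template <class T>
void Kernel<T>::run(T x) { std::printf("run(%g)\n", double(x)); }

// Force code generation for the concrete types users may link against.
#define INSTANTIATE_KERNEL(T) template void Kernel<T>::run(T);
INSTANTIATE_KERNEL(float)
INSTANTIATE_KERNEL(double)

int main() { Kernel<float>::run(1.5f); Kernel<double>::run(2.5); }

Without the explicit instantiations, the definitions in this .cc would be invisible to other translation units and the build would fail at link time.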
@ -134,7 +134,9 @@
  ////////////////////////////////
  // Xm
  ////////////////////////////////
#ifndef STREAM_STORE
  basep = (uint64_t) &out._odata[ss];
#endif
  // basep = st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
@ -229,7 +231,9 @@
    LOAD_CHI(base);
  }
  base = (uint64_t) &out._odata[ss];
#ifndef STREAM_STORE
  PREFETCH_CHIMU(base);
#endif
  {
    MULT_2SPIN_DIR_PFTM(Tm,basep);
  }
@ -131,9 +131,11 @@ namespace Grid{
      Vpc.MpcDag(PhiOdd,Y);       // Y = Vdag phi
      X = zero;
      ActionSolver(Mpc,Y,X);      // X = (MdagM)^-1 Vdag phi
      //Mpc.Mpc(X,Y);             // Y = Mdag^-1 Vdag phi
      // Multiply by Ydag
      RealD action = real(innerProduct(Y,X));

      //RealD action = norm2(Y);

      // The EE factorised block; normally can replace with zero if det is constant (gauge field indept)
      // Only really clover term that creates this. Leave the EE portion as a future to do to make most
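Why real(innerProduct(Y,X)) is the action in the hunk above: the comments give Y = V^dag phi and, from the solve, X = (M^dag M)^{-1} Y, so the pseudofermion action reduces to an inner product. Spelled out (a standard manipulation, stated here for clarity):

    S = phi^dag V (M^dag M)^{-1} V^dag phi
      = Y^dag (M^dag M)^{-1} Y
      = Y^dag X
      = Re < Y, X > ,

which is real because (M^dag M)^{-1} is Hermitian and positive definite; taking the real part merely discards solver round-off in the imaginary part. The commented-out norm2(Y) is the superseded estimate, kept for reference.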
@ -442,7 +442,7 @@ namespace Optimization {
#define _mm256_alignr_epi64(ret,a,b,n) ret=(__m256d) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*8)%16)
#endif

#if defined (AVX1) || defined (AVXFMA)   // was: #if defined (AVX1)

#define _mm256_alignr_epi32(ret,a,b,n) {    \
    __m128 aa, bb;                          \
@ -359,7 +359,67 @@ namespace Optimization {

  //////////////////////////////////////////////
  // Some Template specialization

  // Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases
#undef GNU_CLANG_COMPILER
#ifdef GNU_CLANG_COMPILER
  //Complex float Reduce
  template<>
  inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
    __m512 v1,v2;
    v1 = Optimization::Permute::Permute0(in); // avx 512; eight complex singles
    v1 = _mm512_add_ps(v1,in);
    v2 = Optimization::Permute::Permute1(v1);
    v1 = _mm512_add_ps(v1,v2);
    v2 = Optimization::Permute::Permute2(v1);
    v1 = _mm512_add_ps(v1,v2);
    u512f conv; conv.v = v1;
    return Grid::ComplexF(conv.f[0],conv.f[1]);
  }

  //Real float Reduce
  template<>
  inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
    __m512 v1,v2;
    v1 = Optimization::Permute::Permute0(in); // avx 512; sixteen singles
    v1 = _mm512_add_ps(v1,in);
    v2 = Optimization::Permute::Permute1(v1);
    v1 = _mm512_add_ps(v1,v2);
    v2 = Optimization::Permute::Permute2(v1);
    v1 = _mm512_add_ps(v1,v2);
    v2 = Optimization::Permute::Permute3(v1);
    v1 = _mm512_add_ps(v1,v2);
    u512f conv; conv.v = v1;
    return conv.f[0];
  }

  //Complex double Reduce
  template<>
  inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
    __m512d v1,v2;
    v1 = Optimization::Permute::Permute0(in); // avx 512; four complex doubles
    v1 = _mm512_add_pd(v1,in);
    v2 = Optimization::Permute::Permute1(v1); // fold the accumulated value, not the raw input
    v1 = _mm512_add_pd(v1,v2);
    u512d conv; conv.v = v1;
    return Grid::ComplexD(conv.f[0],conv.f[1]);
  }

  //Real double Reduce
  template<>
  inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
    __m512d v1,v2;
    v1 = Optimization::Permute::Permute0(in); // avx 512; eight doubles
    v1 = _mm512_add_pd(v1,in);
    v2 = Optimization::Permute::Permute1(v1);
    v1 = _mm512_add_pd(v1,v2);
    v2 = Optimization::Permute::Permute2(v1);
    v1 = _mm512_add_pd(v1,v2);
    u512d conv; conv.v = v1;
    return conv.f[0];
  }
#else
  //Complex float Reduce
  template<>
  inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
@ -371,7 +431,6 @@ namespace Optimization {
    return _mm512_reduce_add_ps(in);
  }

  //Complex double Reduce
  template<>
  inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
@ -391,6 +450,7 @@ namespace Optimization {
    printf("Reduce : Missing integer implementation -> FIX\n");
    assert(0);
  }
#endif

}
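The specializations above all follow the same log2(N) pattern: permute the far half of the register next to the near half, add, and repeat on ever-smaller strides until lane 0 holds the total (the complex variants stop one stride early so lanes 0 and 1 keep real and imaginary parts separate). A portable sketch of the same butterfly reduction on a plain array (standard C++, no intrinsics; the explicit stride swap plays the role of Permute0/1/2/3):

#include <cstdio>

int main() {
  float v[16];
  for (int i = 0; i < 16; i++) v[i] = float(i + 1);  // sum should be 136

  // Butterfly: at stride s, lane i accumulates lane i^s (its "permuted"
  // partner), mirroring each Permute + add stage over an AVX-512 register.
  for (int s = 8; s >= 1; s >>= 1) {
    float p[16];
    for (int i = 0; i < 16; i++) p[i] = v[i ^ s];    // permuted copy
    for (int i = 0; i < 16; i++) v[i] += p[i];       // vector add
  }
  std::printf("sum = %g (expect 136)\n", v[0]);      // every lane now holds 136
}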
@ -138,9 +138,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define ZLOADf(OFF,PTR,ri,ir)  VLOADf(OFF,PTR,ir)  VSHUFf(ir,ri)
#define ZLOADd(OFF,PTR,ri,ir)  VLOADd(OFF,PTR,ir)  VSHUFd(ir,ri)

#define STREAM_STORE
#ifdef STREAM_STORE
#define VSTOREf(OFF,PTR,SRC)   "vmovntps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#define VSTOREd(OFF,PTR,SRC)   "vmovntpd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#else
#define VSTOREf(OFF,PTR,SRC)   "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#define VSTOREd(OFF,PTR,SRC)   "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#endif

// Swaps Re/Im ; could unify this with IMCI
#define VSHUFd(A,DEST)   "vpshufd $0x4e," #A "," #DEST ";\n"
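vmovntps/vmovntpd are non-temporal (streaming) stores: they bypass the cache hierarchy, which wins when the results will not be re-read soon, at the price of the usual alignment requirement and a fence before the data is consumed. STREAM_STORE is switched on unconditionally just above, presumably because the output spinors are not re-read before the next sweep. A small intrinsics sketch contrasting the two store flavours the macros select between (SSE intrinsics, available on any x86-64 compiler; the performance claim is workload-dependent):

#include <xmmintrin.h>   // _mm_store_ps / _mm_stream_ps / _mm_sfence
#include <cstdio>

int main() {
  alignas(16) float dst_cached[4], dst_streamed[4];
  __m128 v = _mm_set_ps(4.f, 3.f, 2.f, 1.f);

  _mm_store_ps(dst_cached, v);     // normal store: line lands in cache (movaps)
  _mm_stream_ps(dst_streamed, v);  // non-temporal: write-combined, bypasses cache (movntps)
  _mm_sfence();                    // make streamed data globally visible before reading

  std::printf("%g %g\n", dst_cached[0], dst_streamed[3]);
}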
@ -75,7 +75,15 @@ public:
    Level1.push_back(&Waction);
    TheAction.push_back(Level1);

    NumOp.ZeroCounters();
    DenOp.ZeroCounters();
    Run(argc,argv);

    std::cout << GridLogMessage << "Numerator report, Pauli-Villars term : " << std::endl;
    NumOp.Report();
    std::cout << GridLogMessage << "Denominator report, Dw(m) term (includes CG) : " << std::endl;
    DenOp.Report();

  };

};
@ -1,87 +1,105 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./tests/Test_dwf_cg_prec.cc

    Copyright (C) 2015

    Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution
    directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>

using namespace std;
using namespace Grid;
using namespace Grid::QCD;

template <class d>
struct scal {
  d internal;
};

Gamma::GammaMatrix Gmu[] = {Gamma::GammaX, Gamma::GammaY, Gamma::GammaZ,
                            Gamma::GammaT};

int main(int argc, char** argv) {
  Grid_init(&argc, &argv);

  const int Ls = 16;  // was: const int Ls=8;

  GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
      GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
      GridDefaultMpi());
  GridRedBlackCartesian* UrbGrid =
      SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian* FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
  GridRedBlackCartesian* FrbGrid =
      SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);

  std::vector<int> seeds4({1, 2, 3, 4});
  std::vector<int> seeds5({5, 6, 7, 8});
  GridParallelRNG RNG5(FGrid);
  RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG RNG4(UGrid);
  RNG4.SeedFixedIntegers(seeds4);

  LatticeFermion src(FGrid);
  random(RNG5, src);
  LatticeFermion result(FGrid);
  result = zero;
  LatticeGaugeField Umu(UGrid);

  SU3::HotConfiguration(RNG4, Umu);

  std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt()
            << " Ls: " << Ls << std::endl;

  std::vector<LatticeColourMatrix> U(4, UGrid);
  for (int mu = 0; mu < Nd; mu++) {
    U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
  }

  RealD mass = 0.01;  // was: RealD mass=0.1;
  RealD M5 = 1.8;
  DomainWallFermionR Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5);

  LatticeFermion src_o(FrbGrid);
  LatticeFermion result_o(FrbGrid);
  pickCheckerboard(Odd, src_o, src);
  result_o = zero;

  GridStopWatch CGTimer;

  SchurDiagMooeeOperator<DomainWallFermionR, LatticeFermion> HermOpEO(Ddwf);
  ConjugateGradient<LatticeFermion> CG(1.0e-8, 10000, 0);  // switch off the assert (was CG(1.0e-8,10000))

  CGTimer.Start();
  CG(HermOpEO, src_o, result_o);
  CGTimer.Stop();

  std::cout << GridLogMessage << "Total CG time : " << CGTimer.Elapsed()
            << std::endl;

  std::cout << GridLogMessage << "######## Dhop calls summary" << std::endl;
  Ddwf.Report();

  Grid_finalize();
}
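For context on what CG(HermOpEO, src_o, result_o) above is doing: conjugate gradient solves A x = b for a Hermitian positive-definite A (here the even-odd preconditioned M^dag M) down to the requested residual. A self-contained toy CG on a small SPD matrix (plain C++; the 3x3 system is made up for illustration):

#include <cmath>
#include <cstdio>

// Toy conjugate gradient for a fixed 3x3 symmetric positive-definite A.
int main() {
  const double A[3][3] = {{4, 1, 0}, {1, 3, 1}, {0, 1, 2}};
  const double b[3] = {1, 2, 3};
  double x[3] = {0, 0, 0};                                   // initial guess
  double r[3], p[3], Ap[3];

  for (int i = 0; i < 3; i++) { r[i] = b[i]; p[i] = r[i]; }  // r = b - A*0
  double rsq = r[0]*r[0] + r[1]*r[1] + r[2]*r[2];

  for (int it = 0; it < 100 && rsq > 1e-16; it++) {
    for (int i = 0; i < 3; i++)
      Ap[i] = A[i][0]*p[0] + A[i][1]*p[1] + A[i][2]*p[2];
    double pAp = p[0]*Ap[0] + p[1]*Ap[1] + p[2]*Ap[2];
    double alpha = rsq / pAp;                                // step length
    for (int i = 0; i < 3; i++) { x[i] += alpha*p[i]; r[i] -= alpha*Ap[i]; }
    double rsq_new = r[0]*r[0] + r[1]*r[1] + r[2]*r[2];
    double beta = rsq_new / rsq;                             // direction update
    for (int i = 0; i < 3; i++) p[i] = r[i] + beta*p[i];
    rsq = rsq_new;
  }
  std::printf("x = (%g, %g, %g), |r| = %g\n", x[0], x[1], x[2], std::sqrt(rsq));
}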
@ -83,6 +83,7 @@ int main (int argc, char ** argv)
  SchurDiagMooeeOperator<WilsonFermionR,LatticeFermion> HermOpEO(Dw);
  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
  CG(HermOpEO,src_o,result_o);

  Grid_finalize();
}