Mirror of https://github.com/paboyle/Grid.git
Synced 2025-06-14 22:07:05 +01:00

Compare commits: chulwoo-de ... v0.5.1
45 commits
SHA1
446c768cd3
bfe14000a9
680645f849
3fc6e03ad1
2d6614f3a1
4e041b5103
712b9a3489
bdaa5b1767
8fcefc021a
1445189361
05c884a62a
a25bec87d9
2d8bb4c594
51cb2d4328
6d58cb2a68
c8b35d960c
532f41dd61
661b0ab45d
4bc08ed995
b2933a0557
db057cc276
22e88eaf54
09fe3caebd
5e02392f9c
17a8f51a9b
1b7f88dd00
d6737e4bd8
d539888e57
86187d7cca
87418e7df1
55f65b81b5
d9408893b3
05acc22920
8ac021de73
e503ef5590
a7682b0060
d4c9d71fc8
786ca52c43
048ac04abc
f78d89bcbe
53d06046b0
5d3a1a025d
139cc5f1ae
1c0e922585
9d5f693cbe
1	.gitignore (vendored)

@@ -62,6 +62,7 @@ stamp-h1
 config.sub
 config.guess
 INSTALL
+.dirstamp
 
 # Packages #
 ############
27	.travis.yml

@@ -1,5 +1,9 @@
 language: cpp
 
+cache:
+  directories:
+    - clang
+
 matrix:
   include:
     - os: osx
@@ -38,29 +42,31 @@ matrix:
       apt:
         sources:
          - ubuntu-toolchain-r-test
-         - llvm-toolchain-precise-3.7
         packages:
-         - clang-3.7
+         - g++-4.8
         - libmpfr-dev
         - libgmp-dev
        - libmpc-dev
        - binutils-dev
-      env: VERSION=-3.7
+      env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
    - compiler: clang
      addons:
       apt:
        sources:
         - ubuntu-toolchain-r-test
-        - llvm-toolchain-precise-3.8
       packages:
-        - clang-3.8
+        - g++-4.8
        - libmpfr-dev
        - libgmp-dev
        - libmpc-dev
        - binutils-dev
-      env: VERSION=-3.8
+      env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
 
 before_install:
+    - export GRIDDIR=`pwd`
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]] && [ ! -e clang/bin ]; then wget $CLANG_LINK; tar -xf `basename $CLANG_LINK`; mkdir clang; mv clang+*/* clang/; fi
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
     - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
     - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
     - if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi
@@ -68,6 +74,11 @@ before_install:
 install:
     - export CC=$CC$VERSION
     - export CXX=$CXX$VERSION
+    - echo $PATH
+    - which $CC
+    - $CC  --version
+    - which $CXX
+    - $CXX --version
     - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
 
 script:
@@ -77,3 +88,7 @@ script:
     - ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=single --enable-simd=SSE4 --enable-comms=none
     - make -j4
     - ./benchmarks/Benchmark_dwf --threads 1
+    - make clean
+    - ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=double --enable-simd=SSE4 --enable-comms=none
+    - make -j4
+    - ./benchmarks/Benchmark_dwf --threads 1
4	VERSION (new file)

@@ -0,0 +1,4 @@
+Version : 0.5.0
+
+- AVX512, AVX2, AVX, SSE good
+- Clang 3.5 and above, ICPC v16 and above, GCC 4.9 and above
@@ -27,6 +27,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 *************************************************************************************/
 /* END LEGAL */
 #include <Grid.h>
+#include <PerfCount.h>
 
 using namespace std;
 using namespace Grid;
@@ -45,6 +46,10 @@ struct scal {
 };
 
 bool overlapComms = false;
+typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
+typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
+typedef WilsonFermion5D<DomainWallRedBlack5dImplD> WilsonFermion5DD;
+
 
 int main (int argc, char ** argv)
 {
@@ -58,12 +63,18 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 
   std::vector<int> latt4 = GridDefaultLatt();
-  const int Ls=8;
+  const int Ls=16;
   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 
+  std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
+  GridCartesian         * sUGrid  = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
+  GridCartesian         * sFGrid  = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
+  std::cout << GridLogMessage << "Making s innermost rb grids"<<std::endl;
+  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
+
   std::vector<int> seeds4({1,2,3,4});
   std::vector<int> seeds5({5,6,7,8});
 
@@ -78,7 +89,9 @@ int main (int argc, char ** argv)
 
   ColourMatrix cm = Complex(1.0,0.0);
 
-  LatticeGaugeField Umu(UGrid); random(RNG4,Umu);
+  LatticeGaugeField Umu(UGrid);
+  random(RNG4,Umu);
+
   LatticeGaugeField Umu5d(FGrid);
 
   // replicate across fifth dimension
@@ -119,14 +132,21 @@ int main (int argc, char ** argv)
 
   RealD NP = UGrid->_Nprocessors;
 
+  for(int doasm=1;doasm<2;doasm++){
+
+    QCD::WilsonKernelsStatic::AsmOpt=doasm;
+
   DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
 
   std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
-  int ncall=100;
-  {
+  int ncall =10;
+  if (1) {
+
     double t0=usecond();
     for(int i=0;i<ncall;i++){
+      __SSC_START;
       Dw.Dhop(src,result,0);
+      __SSC_STOP;
     }
     double t1=usecond();
 
@@ -140,10 +160,121 @@ int main (int argc, char ** argv)
     std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
     err = ref-result;
     std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
-    Dw.Report();
+    //  Dw.Report();
   }
 
-  exit(0);
+  if (1)
+  {
+    typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
+    LatticeFermion ssrc(sFGrid);
+    LatticeFermion sref(sFGrid);
+    LatticeFermion sresult(sFGrid);
+    WilsonFermion5DR sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5,params);
+
+    for(int x=0;x<latt4[0];x++){
+    for(int y=0;y<latt4[1];y++){
+    for(int z=0;z<latt4[2];z++){
+    for(int t=0;t<latt4[3];t++){
+    for(int s=0;s<Ls;s++){
+      std::vector<int> site({s,x,y,z,t});
+      SpinColourVector tmp;
+      peekSite(tmp,src,site);
+      pokeSite(tmp,ssrc,site);
+    }}}}}
+
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+      __SSC_START;
+      sDw.Dhop(ssrc,sresult,0);
+      __SSC_STOP;
+    }
+    double t1=usecond();
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=1344*volume*ncall;
+
+    std::cout<<GridLogMessage << "Called Dw sinner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
+    //  sDw.Report();
+
+    if(0){
+      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
+        sDw.Dhop(ssrc,sresult,0);
+        PerformanceCounter Counter(i);
+        Counter.Start();
+        sDw.Dhop(ssrc,sresult,0);
+        Counter.Stop();
+        Counter.Report();
+      }
+    }
+
+
+
+    RealF sum=0;
+    for(int x=0;x<latt4[0];x++){
+    for(int y=0;y<latt4[1];y++){
+    for(int z=0;z<latt4[2];z++){
+    for(int t=0;t<latt4[3];t++){
+    for(int s=0;s<Ls;s++){
+      std::vector<int> site({s,x,y,z,t});
+      SpinColourVector normal, simd;
+      peekSite(normal,result,site);
+      peekSite(simd,sresult,site);
+      sum=sum+norm2(normal-simd);
+      //  std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl;
+      //  std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<normal<<std::endl;
+      //  std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<simd<<std::endl;
+    }}}}}
+    std::cout<<" difference between normal and simd is "<<sum<<std::endl;
+
+
+    if (1) {
+
+      LatticeFermion sr_eo(sFGrid);
+      LatticeFermion serr(sFGrid);
+
+      LatticeFermion ssrc_e (sFrbGrid);
+      LatticeFermion ssrc_o (sFrbGrid);
+      LatticeFermion sr_e   (sFrbGrid);
+      LatticeFermion sr_o   (sFrbGrid);
+
+      pickCheckerboard(Even,ssrc_e,ssrc);
+      pickCheckerboard(Odd,ssrc_o,ssrc);
+
+      setCheckerboard(sr_eo,ssrc_o);
+      setCheckerboard(sr_eo,ssrc_e);
+      serr = sr_eo-ssrc;
+      std::cout<<GridLogMessage << "EO src norm diff "<< norm2(serr)<<std::endl;
+
+      sr_e = zero;
+      sr_o = zero;
+
+      double t0=usecond();
+      for(int i=0;i<ncall;i++){
+        sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
+      }
+      double t1=usecond();
+
+      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+      double flops=(1344.0*volume*ncall)/2;
+
+      std::cout<<GridLogMessage << "sDeo mflop/s =   "<< flops/(t1-t0)<<std::endl;
+      std::cout<<GridLogMessage << "sDeo mflop/s per node "<< flops/(t1-t0)/NP<<std::endl;
+
+      sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
+      sDw.DhopOE(ssrc_e,sr_o,DaggerNo);
+      sDw.Dhop  (ssrc  ,sresult,DaggerNo);
+
+      pickCheckerboard(Even,ssrc_e,sresult);
+      pickCheckerboard(Odd ,ssrc_o,sresult);
+      ssrc_e = ssrc_e - sr_e;
+      std::cout<<GridLogMessage << "sE norm diff "<< norm2(ssrc_e)<<std::endl;
+      ssrc_o = ssrc_o - sr_o;
+      std::cout<<GridLogMessage << "sO norm diff "<< norm2(ssrc_o)<<std::endl;
+    }
+
+
+  }
+
   if (1)
   { // Naive wilson dag implementation
@@ -217,5 +348,8 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
   std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
 
+
+  }
+
   Grid_finalize();
 }
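The Mflop/s figures printed by the benchmark above come from a fixed nominal flop count, not from hardware counters. A minimal sketch of that bookkeeping, using only the 1344 flops-per-site constant and the microsecond timestamps the benchmark itself uses (the helper name is illustrative, not part of Grid):

  #include <vector>

  // Hedged sketch: reproduces the benchmark's figure of merit.
  double dhop_mflops(const std::vector<int> &latt4, int Ls, int ncall,
                     double t0_us, double t1_us) {
    double volume = Ls;                                  // fifth dimension
    for (int mu = 0; mu < 4; mu++) volume *= latt4[mu];  // 4d volume
    double flops = 1344.0 * volume * ncall;              // total over the timed loop
    return flops / (t1_us - t0_us);                      // Mflop/s, time in microseconds
  }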
154	benchmarks/Benchmark_dwf_ntpf.cc (new file)

@@ -0,0 +1,154 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./benchmarks/Benchmark_dwf.cc

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
#include <PerfCount.h>

using namespace std;
using namespace Grid;
using namespace Grid::QCD;

template<class d>
struct scal {
  d internal;
};

Gamma::GammaMatrix Gmu [] = {
  Gamma::GammaX,
  Gamma::GammaY,
  Gamma::GammaZ,
  Gamma::GammaT
};

bool overlapComms = false;


int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
    overlapComms = true;
  }

  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  std::vector<int> latt4 = GridDefaultLatt();
  const int Ls=16;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});

  GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);

  LatticeFermion src (FGrid); random(RNG5,src);
  LatticeFermion result(FGrid); result=zero;
  LatticeFermion ref(FGrid); ref=zero;
  LatticeFermion tmp(FGrid);
  LatticeFermion err(FGrid);

  ColourMatrix cm = Complex(1.0,0.0);

  LatticeGaugeField Umu(UGrid);
  random(RNG4,Umu);

  LatticeGaugeField Umu5d(FGrid);

  // replicate across fifth dimension
  for(int ss=0;ss<Umu._grid->oSites();ss++){
    for(int s=0;s<Ls;s++){
      Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
    }
  }

  ////////////////////////////////////
  // Naive wilson implementation
  ////////////////////////////////////
  std::vector<LatticeColourMatrix> U(4,FGrid);
  for(int mu=0;mu<Nd;mu++){
    U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
  }

  if (1)
  {
    ref = zero;
    for(int mu=0;mu<Nd;mu++){

      tmp = U[mu]*Cshift(src,mu+1,1);
      ref=ref + tmp - Gamma(Gmu[mu])*tmp;

      tmp =adj(U[mu])*src;
      tmp =Cshift(tmp,mu+1,-1);
      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
    }
    ref = -0.5*ref;
  }

  RealD mass=0.1;
  RealD M5  =1.8;

  typename DomainWallFermionR::ImplParams params;
  params.overlapCommsCompute = overlapComms;

  RealD NP = UGrid->_Nprocessors;


  QCD::WilsonKernelsStatic::AsmOpt=1;

  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);

  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
  int ncall =50;
  if (1) {

    double t0=usecond();
    for(int i=0;i<ncall;i++){
      Dw.Dhop(src,result,0);
    }
    double t1=usecond();

    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    double flops=1344*volume*ncall;

    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
    err = ref-result;
    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
    //  Dw.Report();
  }
  Grid_finalize();
}
369	benchmarks/Benchmark_dwf_sweep.cc (new file)

@@ -0,0 +1,369 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./benchmarks/Benchmark_dwf.cc

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
#include <PerfCount.h>

using namespace std;
using namespace Grid;
using namespace Grid::QCD;

template<class d>
struct scal {
  d internal;
};

Gamma::GammaMatrix Gmu [] = {
  Gamma::GammaX,
  Gamma::GammaY,
  Gamma::GammaZ,
  Gamma::GammaT
};

void benchDw(std::vector<int> & L, int Ls, int threads, int report =0 );
void benchsDw(std::vector<int> & L, int Ls, int threads, int report=0 );

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  const int Ls=16;
  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  if ( getenv("ASMOPT") ) {
    QCD::WilsonKernelsStatic::AsmOpt=1;
  } else {
    QCD::WilsonKernelsStatic::AsmOpt=0;
  }

  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
  std::cout<<GridLogMessage << "Volume \t\t\tProcs \t Dw \t eoDw \t sDw \t eosDw (Mflop/s) "<<std::endl;
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;

  int Lmax=32;
  int dmin=0;
  if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX"));
  if ( getenv("DMIN") ) dmin=atoi(getenv("DMIN"));
  for (int L=8;L<=Lmax;L*=2){
    std::vector<int> latt4(4,L);
    for(int d=4;d>dmin;d--){
      if ( d<=3 ) latt4[d]*=2;
      std::cout << GridLogMessage <<"\t";
      for(int d=0;d<Nd;d++){
        std::cout<<latt4[d]<<"x";
      }
      std::cout <<Ls<<"\t" ;
      benchDw (latt4,Ls,threads,0);
      benchsDw(latt4,Ls,threads,0);
      std::cout<<std::endl;
    }
  }
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
  {
    std::vector<int> latt4(4,16);
    std::cout<<GridLogMessage << "16^4 Dw miss rate"<<std::endl;
    benchDw (latt4,Ls,threads,1);
    std::cout<<GridLogMessage << "16^4 sDw miss rate"<<std::endl;
    benchsDw(latt4,Ls,threads,1);
  }

  Grid_finalize();
}

#undef CHECK

void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
{
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});

#ifdef CHECK
  GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
  LatticeFermion src (FGrid); random(RNG5,src);
  LatticeGaugeField Umu(UGrid);
  random(RNG4,Umu);
#else
  LatticeFermion src (FGrid); src=zero;
  LatticeGaugeField Umu(UGrid); Umu=zero;
#endif

  LatticeFermion result(FGrid); result=zero;
  LatticeFermion ref(FGrid); ref=zero;
  LatticeFermion tmp(FGrid);
  LatticeFermion err(FGrid);

  ColourMatrix cm = Complex(1.0,0.0);


  LatticeGaugeField Umu5d(FGrid);

  // replicate across fifth dimension
  for(int ss=0;ss<Umu._grid->oSites();ss++){
    for(int s=0;s<Ls;s++){
      Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
    }
  }

  ////////////////////////////////////
  // Naive wilson implementation
  ////////////////////////////////////
  std::vector<LatticeColourMatrix> U(4,FGrid);
  for(int mu=0;mu<Nd;mu++){
    U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
  }

#ifdef CHECK
  if (1)
  {
    ref = zero;
    for(int mu=0;mu<Nd;mu++){

      tmp = U[mu]*Cshift(src,mu+1,1);
      ref=ref + tmp - Gamma(Gmu[mu])*tmp;

      tmp =adj(U[mu])*src;
      tmp =Cshift(tmp,mu+1,-1);
      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
    }
    ref = -0.5*ref;
  }
#endif

  RealD mass=0.1;
  RealD M5  =1.8;
  RealD NP = UGrid->_Nprocessors;

  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);

  double t0=usecond();
  Dw.Dhop(src,result,0);
  double t1=usecond();

#ifdef TIMERS_OFF
  int ncall =10;
#else
  int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
#endif

  if (ncall < 5 ) exit(0);

  Dw.Dhop(src,result,0);

  PerformanceCounter Counter(8);
  Counter.Start();
  t0=usecond();
  for(int i=0;i<ncall;i++){
    Dw.Dhop(src,result,0);
  }
  t1=usecond();
  Counter.Stop();
  if ( report ) {
    Counter.Report();
  }

  if ( ! report )
  {
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    double flops=1344*volume*ncall;
    std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
  }

#ifdef CHECK
  err = ref-result;
  RealD errd = norm2(err);
  if ( errd> 1.0e-4 ) {
    std::cout<<GridLogMessage << "oops !!! norm diff   "<< norm2(err)<<std::endl;
    exit(-1);
  }
#endif

  LatticeFermion src_e (FrbGrid);
  LatticeFermion src_o (FrbGrid);
  LatticeFermion r_e   (FrbGrid);
  LatticeFermion r_o   (FrbGrid);
  LatticeFermion r_eo  (FGrid);

  pickCheckerboard(Even,src_e,src);
  pickCheckerboard(Odd,src_o,src);

  {
    Dw.DhopEO(src_o,r_e,DaggerNo);
    double t0=usecond();
    for(int i=0;i<ncall;i++){
      Dw.DhopEO(src_o,r_e,DaggerNo);
    }
    double t1=usecond();

    if(!report){
      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
      double flops=(1344.0*volume*ncall)/2;
      std::cout<< flops/(t1-t0);
    }
  }

}

#undef CHECK_SDW
void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
{

  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  GridCartesian         * sUGrid  = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
  GridCartesian         * sFGrid  = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);

  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});

#ifdef CHECK_SDW
  GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
  LatticeFermion src (FGrid); random(RNG5,src);
  LatticeGaugeField Umu(UGrid);
  random(RNG4,Umu);
#else
  LatticeFermion src (FGrid); src=zero;
  LatticeGaugeField Umu(UGrid); Umu=zero;
#endif

  LatticeFermion result(FGrid); result=zero;
  LatticeFermion ref(FGrid); ref=zero;
  LatticeFermion tmp(FGrid);
  LatticeFermion err(FGrid);

  ColourMatrix cm = Complex(1.0,0.0);

  LatticeGaugeField Umu5d(FGrid);

  // replicate across fifth dimension
  for(int ss=0;ss<Umu._grid->oSites();ss++){
    for(int s=0;s<Ls;s++){
      Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
    }
  }


  RealD mass=0.1;
  RealD M5  =1.8;

  typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
  LatticeFermion ssrc(sFGrid);
  LatticeFermion sref(sFGrid);
  LatticeFermion sresult(sFGrid);
  WilsonFermion5DR sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5);

  for(int x=0;x<latt4[0];x++){
  for(int y=0;y<latt4[1];y++){
  for(int z=0;z<latt4[2];z++){
  for(int t=0;t<latt4[3];t++){
  for(int s=0;s<Ls;s++){
    std::vector<int> site({s,x,y,z,t});
    SpinColourVector tmp;
    peekSite(tmp,src,site);
    pokeSite(tmp,ssrc,site);
  }}}}}

  double t0=usecond();
  sDw.Dhop(ssrc,sresult,0);
  double t1=usecond();

#ifdef TIMERS_OFF
  int ncall =10;
#else
  int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
#endif

  PerformanceCounter Counter(8);
  Counter.Start();
  t0=usecond();
  for(int i=0;i<ncall;i++){
    sDw.Dhop(ssrc,sresult,0);
  }
  t1=usecond();
  Counter.Stop();

  if ( report ) {
    Counter.Report();
  } else {

    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    double flops=1344*volume*ncall;
    std::cout<<"\t"<< flops/(t1-t0);
  }


  LatticeFermion sr_eo(sFGrid);
  LatticeFermion serr(sFGrid);

  LatticeFermion ssrc_e (sFrbGrid);
  LatticeFermion ssrc_o (sFrbGrid);
  LatticeFermion sr_e   (sFrbGrid);
  LatticeFermion sr_o   (sFrbGrid);

  pickCheckerboard(Even,ssrc_e,ssrc);
  pickCheckerboard(Odd,ssrc_o,ssrc);

  setCheckerboard(sr_eo,ssrc_o);
  setCheckerboard(sr_eo,ssrc_e);

  sr_e = zero;
  sr_o = zero;

  sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
  PerformanceCounter CounterSdw(8);
  CounterSdw.Start();
  t0=usecond();
  for(int i=0;i<ncall;i++){
    __SSC_START;
    sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
    __SSC_STOP;
  }
  t1=usecond();
  CounterSdw.Stop();

  if ( report ) {
    CounterSdw.Report();
  } else {

    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    double flops=(1344.0*volume*ncall)/2;
    std::cout<<"\t"<< flops/(t1-t0);
  }
}
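Benchmark_dwf_sweep sizes its timing loops at run time: it times a single Dhop call, then picks ncall so the measured region lasts a few seconds. A minimal sketch of that calibration, assuming TIMERS_OFF is the macro defined by the new --enable-timers=no configure option (the helper name is illustrative, not part of Grid):

  // Hedged sketch of the ncall calibration used by benchDw/benchsDw above.
  inline int calibrate_ncall(double t0_us, double t1_us) {
  #ifdef TIMERS_OFF
    return 10;                      // timers compiled out: keep the run short and fixed
  #else
    // aim for roughly five seconds of measured work
    return 1 + (int)((5.0 * 1000 * 1000) / (t1_us - t0_us));
  #endif
  }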
@@ -119,7 +119,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
   mfc = flops*ncall/(t1-t0);
   std::cout<<GridLogMessage << "Called C++ Dw"<< " mflop/s =   "<< mfc<<std::endl;
 
-  QCD::WilsonFermion5DStatic::AsmOptDslash=1;
+  QCD::WilsonKernelsStatic::AsmOpt=1;
   t0=usecond();
   for(int i=0;i<ncall;i++){
     Dw.DhopOE(srce,resulta,0);
@@ -1,5 +1,5 @@
 
-bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm
+bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_dwf_ntpf Benchmark_dwf_sweep Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm
 
 
 Benchmark_comms_SOURCES=Benchmark_comms.cc
@@ -10,6 +10,14 @@ Benchmark_dwf_SOURCES=Benchmark_dwf.cc
 Benchmark_dwf_LDADD=-lGrid
 
 
+Benchmark_dwf_ntpf_SOURCES=Benchmark_dwf_ntpf.cc
+Benchmark_dwf_ntpf_LDADD=-lGrid
+
+
+Benchmark_dwf_sweep_SOURCES=Benchmark_dwf_sweep.cc
+Benchmark_dwf_sweep_LDADD=-lGrid
+
+
 Benchmark_memory_asynch_SOURCES=Benchmark_memory_asynch.cc
 Benchmark_memory_asynch_LDADD=-lGrid
 
28	configure.ac

@@ -55,6 +55,15 @@ echo :::::::::::::::::::::::::::::::::::::::::::
 
 AC_CHECK_FUNCS([gettimeofday])
 
+#AC_CHECK_LIB([gmp],[__gmpf_init],,
+#    [AC_MSG_ERROR(GNU Multiple Precision GMP library was not found in your system.
+#Please install or provide the correct path to your installation
+#Info at: http://www.gmplib.org)])
+
+#AC_CHECK_LIB([mpfr],[mpfr_init],,
+#    [AC_MSG_ERROR(GNU Multiple Precision MPFR library was not found in your system.
+#Please install or provide the correct path to your installation
+#Info at: http://www.mpfr.org/)])
 
 #
 # SIMD instructions selection
@@ -199,6 +208,25 @@ case ${ac_RNG} in
      AC_MSG_ERROR([${ac_RNG} unsupported --enable-rng option]);
      ;;
 esac
 
+#
+# SDE timing mode
+#
+AC_ARG_ENABLE([timers],[AC_HELP_STRING([--enable-timers=yes|no],\
+	[Enable system dependent high res timers])],\
+	[ac_TIMERS=${enable_timers}],[ac_TIMERS=yes])
+case ${ac_TIMERS} in
+     yes)
+       AC_DEFINE([TIMERS_ON],[1],[TIMERS_ON] )
+     ;;
+     no)
+       AC_DEFINE([TIMERS_OFF],[1],[TIMERS_OFF] )
+     ;;
+     *)
+       AC_MSG_ERROR([${ac_TIMERS} unsupported --enable-timers option]);
+     ;;
+esac
+
 #
 # Chroma regression tests
 #
0	lib/.dirstamp (new empty file)
@@ -211,8 +211,7 @@ void Grid_init(int *argc,char ***argv)
     Grid_quiesce_nodes();
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
-    QCD::WilsonFermionStatic::HandOptDslash=1;
-    QCD::WilsonFermion5DStatic::HandOptDslash=1;
+    QCD::WilsonKernelsStatic::HandOpt=1;
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
     LebesgueOrder::UseLebesgueOrder=1;
@@ -276,11 +275,6 @@ void Grid_finalize(void)
   Grid_unquiesce_nodes();
 #endif
 }
-double usecond(void) {
-  struct timeval tv;
-  gettimeofday(&tv,NULL);
-  return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
-}
 
 void * Grid_backtrace_buffer[_NBACKTRACE];
 
File diff suppressed because one or more lines are too long
@@ -32,28 +32,44 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 namespace Grid {
 
 #define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
+#define RawConfig(A,B) (A<<8|B)
 const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = {
 #ifdef __linux__
-  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES,            "CPUCYCLES.........." },
-  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS,          "INSTRUCTIONS......." },
-  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES,      "CACHE_REFERENCES..." },
-  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES,          "CACHE_MISSES......." },
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS),         "L1D_READ_MISS......"},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS),       "L1D_READ_ACCESS...."},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,MISS),        "L1D_WRITE_MISS....."},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,ACCESS),      "L1D_WRITE_ACCESS..."},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS),     "L1D_PREFETCH_MISS.."},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS),   "L1D_PREFETCH_ACCESS"},
-  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS),          "LL_READ_MISS......."},
-  //  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS),    "LL_READ_ACCESS....."},
-  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS),         "LL_WRITE_MISS......"},
-  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS),       "LL_WRITE_ACCESS...."},
-  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS),      "LL_PREFETCH_MISS..."},
-  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS),    "LL_PREFETCH_ACCESS."},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS),         "L1I_READ_MISS......"},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS),       "L1I_READ_ACCESS...."}
-#endif
+  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES,      "CACHE_REFERENCES...", INSTRUCTIONS},
+  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES,          "CACHE_MISSES.......", CACHE_REFERENCES},
+  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES,            "CPUCYCLES..........", INSTRUCTIONS},
+  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS,          "INSTRUCTIONS.......", CPUCYCLES },
+  //  4
+#ifdef AVX512
+  { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES },
+  { PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS },
+  { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS },
+  { PERF_TYPE_RAW, RawConfig(0x02,0x04), "L2_HIT_LOADS.......", L1D_READ_ACCESS },
+  { PERF_TYPE_RAW, RawConfig(0x04,0x04), "L2_MISS_LOADS......", L1D_READ_ACCESS },
+  { PERF_TYPE_RAW, RawConfig(0x10,0x04), "UTLB_MISS_LOADS....", L1D_READ_ACCESS },
+  { PERF_TYPE_RAW, RawConfig(0x08,0x04), "DTLB_MISS_LOADS....", L1D_READ_ACCESS },
+  //  11
+#else
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS),       "L1D_READ_ACCESS....", INSTRUCTIONS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS),         "L1D_READ_MISS......", L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,MISS),        "L1D_WRITE_MISS.....", L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,ACCESS),      "L1D_WRITE_ACCESS...", L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS),     "L1D_PREFETCH_MISS..", L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS),   "L1D_PREFETCH_ACCESS", L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS),   "L1D_PREFETCH_ACCESS", L1D_READ_ACCESS},
+  //  11
+#endif
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS),          "LL_READ_MISS.......", L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS),        "LL_READ_ACCESS.....", L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS),         "LL_WRITE_MISS......", L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS),       "LL_WRITE_ACCESS....", L1D_READ_ACCESS},
+  //15
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS),      "LL_PREFETCH_MISS...", L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS),    "LL_PREFETCH_ACCESS.", L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS),         "L1I_READ_MISS......", INSTRUCTIONS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS),       "L1I_READ_ACCESS....", INSTRUCTIONS}
+  //19
   //  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" },
+#endif
 };
 }
@@ -58,6 +58,27 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
 }
 #endif
 
+#ifdef TIMERS_OFF
+
+
+inline uint64_t cyclecount(void){
+  return 0;
+}
+#define __SSC_MARK(mark) __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(mark):"%ebx")
+#define __SSC_STOP  __SSC_MARK(0x110)
+#define __SSC_START __SSC_MARK(0x111)
+
+
+#else
+
+#define __SSC_MARK(mark)
+#define __SSC_STOP
+#define __SSC_START
+
+/*
+ * cycle counters arch dependent
+ */
+
 #ifdef __bgq__
 inline uint64_t cyclecount(void){
   uint64_t tmp;
@@ -65,18 +86,20 @@ inline uint64_t cyclecount(void){
   return tmp;
 }
 #elif defined __x86_64__
-#include <immintrin.h>
-#ifndef __INTEL_COMPILER
 #include <x86intrin.h>
-#endif
 inline uint64_t cyclecount(void){
   return __rdtsc();
+  //  unsigned int dummy;
+  //  return __rdtscp(&dummy);
 }
 #else
-#warning No cycle counter implemented for this architecture
 inline uint64_t cyclecount(void){
   return 0;
 }
+
+#endif
+
 #endif
 
 class PerformanceCounter {
@@ -87,6 +110,7 @@ private:
     uint32_t type;
     uint64_t config;
     const char *name;
+    int normalisation;
   } PerformanceCounterConfig;
 
   static const PerformanceCounterConfig PerformanceCounterConfigs [];
@@ -94,26 +118,12 @@ private:
 public:
 
   enum PerformanceCounterType {
-    CPUCYCLES=0,
-    INSTRUCTIONS,
-    //    STALL_CYCLES,
-    CACHE_REFERENCES,
-    CACHE_MISSES,
-    L1D_READ_MISS,
-    L1D_READ_ACCESS,
-    L1D_WRITE_MISS,
-    L1D_WRITE_ACCESS,
-    L1D_PREFETCH_MISS,
-    L1D_PREFETCH_ACCESS,
-    LL_READ_MISS,
-    //    LL_READ_ACCESS,
-    LL_WRITE_MISS,
-    LL_WRITE_ACCESS,
-    LL_PREFETCH_MISS,
-    LL_PREFETCH_ACCESS,
-    L1I_READ_MISS,
-    L1I_READ_ACCESS,
-    PERFORMANCE_COUNTER_NUM_TYPES
+    CACHE_REFERENCES=0,
+    CACHE_MISSES=1,
+    CPUCYCLES=2,
+    INSTRUCTIONS=3,
+    L1D_READ_ACCESS=4,
+    PERFORMANCE_COUNTER_NUM_TYPES=19
   };
 
 public:
@@ -121,7 +131,9 @@ public:
   int PCT;
 
   long long count;
+  long long cycles;
   int fd;
+  int cyclefd;
   unsigned long long elapsed;
   uint64_t begin;
 
@@ -134,7 +146,9 @@ public:
     assert(_pct>=0);
     assert(_pct<PERFORMANCE_COUNTER_NUM_TYPES);
     fd=-1;
+    cyclefd=-1;
     count=0;
+    cycles=0;
     PCT =_pct;
     Open();
 #endif
@@ -159,6 +173,15 @@ public:
       fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
       perror("Error is");
     }
+    int norm = PerformanceCounterConfigs[PCT].normalisation;
+    pe.type  = PerformanceCounterConfigs[norm].type;
+    pe.config= PerformanceCounterConfigs[norm].config;
+    name     = PerformanceCounterConfigs[norm].name;
+    cyclefd = perf_event_open(&pe, 0, -1, -1, 0); // pid 0, cpu -1 current process any cpu. group -1
+    if (cyclefd == -1) {
+      fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
+      perror("Error is");
+    }
 #endif
   }
 
@@ -168,6 +191,8 @@ public:
     if ( fd!= -1) {
       ::ioctl(fd, PERF_EVENT_IOC_RESET, 0);
       ::ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
+      ::ioctl(cyclefd, PERF_EVENT_IOC_RESET, 0);
+      ::ioctl(cyclefd, PERF_EVENT_IOC_ENABLE, 0);
     }
     begin  =cyclecount();
 #else
@@ -177,10 +202,13 @@ public:
 
   void Stop(void) {
     count=0;
+    cycles=0;
 #ifdef __linux__
     if ( fd!= -1) {
       ::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
+      ::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0);
       ::read(fd, &count, sizeof(long long));
+      ::read(cyclefd, &cycles, sizeof(long long));
     }
     elapsed = cyclecount() - begin;
 #else
@@ -190,7 +218,11 @@ public:
   }
   void Report(void) {
 #ifdef __linux__
-    std::printf("%llu cycles %s = %20llu\n", elapsed , PerformanceCounterConfigs[PCT].name, count);
+    int N = PerformanceCounterConfigs[PCT].normalisation;
+    const char * sn = PerformanceCounterConfigs[N].name ;
+    const char * sc = PerformanceCounterConfigs[PCT].name;
+    std::printf("tsc = %llu %s = %llu %s = %20llu\n (%s/%s) rate = %lf\n", elapsed,sn ,cycles,
+                sc, count, sc,sn, (double)count/(double)cycles);
 #else
     std::printf("%llu cycles \n", elapsed );
 #endif
@@ -199,7 +231,7 @@ public:
   ~PerformanceCounter()
   {
 #ifdef __linux__
-    ::close(fd);
+    ::close(fd);  ::close(cyclefd);
 #endif
   }
 
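Each entry in the counter table now carries a normalisation partner event, read through a second perf_event file descriptor, and Report() prints the raw count, the partner count and their ratio. A hedged usage sketch, with kernel() standing in for any region of interest such as Dw.Dhop(src,result,0):

  // Hedged sketch: cycle through every configured counter pair for the same kernel.
  template<class Kernel>
  void profile_all_counters(Kernel &&kernel) {
    for (int i = 0; i < PerformanceCounter::NumTypes(); i++) {
      kernel();                       // warm caches and TLBs before counting
      PerformanceCounter Counter(i);  // opens the event and its normalisation partner
      Counter.Start();
      kernel();
      Counter.Stop();
      Counter.Report();               // e.g. misses per access, instructions per cycle
    }
  }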
1704	lib/Stencil.h
(File diff suppressed because it is too large.)
14	lib/Timer.h

@@ -39,7 +39,13 @@ namespace Grid {
   // Dress the output; use std::chrono
 
   // C++11 time facilities better?
-  double usecond(void);
+  inline double usecond(void) {
+    struct timeval tv;
+#ifdef TIMERS_ON
+    gettimeofday(&tv,NULL);
+#endif
+    return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
+  }
 
   typedef std::chrono::system_clock          GridClock;
   typedef std::chrono::time_point<GridClock> GridTimePoint;
@@ -63,17 +69,23 @@ public:
   }
   void Start(void) {
     assert(running == false);
+#ifdef TIMERS_ON
     start = GridClock::now();
+#endif
     running = true;
   }
   void Stop(void) {
     assert(running == true);
+#ifdef TIMERS_ON
     accumulator+= std::chrono::duration_cast<GridUsecs>(GridClock::now()-start);
+#endif
     running = false;
   };
   void Reset(void){
     running = false;
+#ifdef TIMERS_ON
     start = GridClock::now();
+#endif
     accumulator = std::chrono::duration_cast<GridUsecs>(start-start);
   }
   GridTime Elapsed(void) {
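With usecond() now inlined and guarded by TIMERS_ON, a build configured with --enable-timers=no skips the gettimeofday() call entirely, so a timed difference is meaningless (tv is left uninitialised) but costs nothing. A hedged sketch of the usual caller-side pattern, which is unchanged (the helper is illustrative, not part of Grid):

  // Hedged sketch: time an arbitrary region with the inlined usecond().
  template<class Body>
  double time_region_us(Body &&body) {
    double t0 = usecond();   // real timestamp only when TIMERS_ON is defined
    body();
    double t1 = usecond();
    return t1 - t0;          // meaningful only in an --enable-timers=yes build
  }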
0	lib/algorithms/approx/.dirstamp (new empty file)
@@ -170,9 +170,15 @@ public:
         // Use a reduced simd grid
         _simd_layout[d] = simd_layout[d];
         _rdimensions[d]= _ldimensions[d]/_simd_layout[d];
+        assert(_rdimensions[d]>0);
 
         // all elements of a simd vector must have same checkerboard.
-        if ( simd_layout[d]>1 ) assert((_rdimensions[d]&0x1)==0);
+        // If Ls vectorised, this must still be the case; e.g. dwf rb5d
+        if ( _simd_layout[d]>1 ) {
+          if ( d != _checker_dim ) {
+            assert( (_rdimensions[d]&0x1) == 0 );
+          }
+        }
 
         _osites *= _rdimensions[d];
         _isites *= _simd_layout[d];
0	lib/communicator/.dirstamp (new empty file)

@@ -53,7 +53,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
   _Nprocessors=1;
   _processors = processors;
   _processor_coor.resize(_ndimension);
-  std::cout << processors << std::endl;
 
   MPI_Cart_create(MPI_COMM_WORLD, _ndimension,&_processors[0],&periodic[0],1,&communicator);
   MPI_Comm_rank(communicator,&_processor);
0	lib/qcd/action/fermion/.dirstamp (new empty file)

@@ -63,7 +63,7 @@ namespace Grid {
       Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
       assert(zdata->n==this->Ls);
 
-      std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
+      // std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
       // Call base setter
       this->SetCoefficientsTanh(zdata,1.0,0.0);
 
@ -53,6 +53,8 @@ namespace QCD {
 StencilEven(&Hgrid,npoint,Even,directions,displacements), // source is Even
 StencilOdd (&Hgrid,npoint,Odd ,directions,displacements), // source is Odd
 mass(_mass),
+Lebesgue(_grid),
+LebesgueEvenOdd(_cbgrid),
 Umu(&Fgrid),
 UmuEven(&Hgrid),
 UmuOdd (&Hgrid)
@ -228,7 +230,7 @@ PARALLEL_FOR_LOOP

 out.checkerboard = in.checkerboard;

-DhopInternal(Stencil,Umu,in,out,dag);
+DhopInternal(Stencil,Lebesgue,Umu,in,out,dag);
 }

 template<class Impl>
@ -239,7 +241,7 @@ PARALLEL_FOR_LOOP
 assert(in.checkerboard==Even);
 out.checkerboard = Odd;

-DhopInternal(StencilEven,UmuOdd,in,out,dag);
+DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag);
 }

 template<class Impl>
@ -250,7 +252,7 @@ PARALLEL_FOR_LOOP
 assert(in.checkerboard==Odd);
 out.checkerboard = Even;

-DhopInternal(StencilOdd,UmuEven,in,out,dag);
+DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag);
 }

 template<class Impl>
@ -285,43 +287,23 @@ PARALLEL_FOR_LOOP
 };

 template<class Impl>
-void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
+void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,LebesgueOrder& lo,DoubledGaugeField & U,
 const FermionField &in, FermionField &out,int dag)
 {
-DhopInternalCommsThenCompute(st,U,in,out,dag);
-}
-template<class Impl>
-void WilsonFermion<Impl>::DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
-const FermionField &in, FermionField &out,int dag) {
-
 assert((dag==DaggerNo) ||(dag==DaggerYes));

 Compressor compressor(dag);
 st.HaloExchange(in,compressor);

 if ( dag == DaggerYes ) {
-if( HandOptDslash ) {
 PARALLEL_FOR_LOOP
 for(int sss=0;sss<in._grid->oSites();sss++){
-Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
+Kernels::DiracOptDhopSiteDag(st,lo,U,st.comm_buf,sss,sss,1,1,in,out);
 }
-} else {
-PARALLEL_FOR_LOOP
-for(int sss=0;sss<in._grid->oSites();sss++){
-Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
-}
-}
 } else {
-if( HandOptDslash ) {
 PARALLEL_FOR_LOOP
 for(int sss=0;sss<in._grid->oSites();sss++){
-Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out);
+Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sss,sss,1,1,in,out);
 }
-} else {
-PARALLEL_FOR_LOOP
-for(int sss=0;sss<in._grid->oSites();sss++){
-Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out);
-}
-}
 }
 }
 };
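The hunks above thread a LebesgueOrder argument from WilsonFermion::DhopInternal down into the per-site kernel calls. As a hedged illustration of the underlying idea only (a reordering table that decides in which order the outer sites are visited before a per-site stencil is applied), here is a small self-contained sketch; SiteOrder and applyStencil are invented names for this example and are not the Grid API.

// Illustrative sketch: a site-reordering table driving a per-site kernel,
// in the spirit of the LebesgueOrder argument passed through DhopInternal.
#include <cstdio>
#include <numeric>
#include <vector>

struct SiteOrder {
  std::vector<int> map;                       // visit order over outer sites
  explicit SiteOrder(int sites) : map(sites) {
    std::iota(map.begin(), map.end(), 0);     // identity order as a stand-in
  }
  int Reorder(int ss) const { return map[ss]; }
};

static void applyStencil(int site, std::vector<double> &out,
                         const std::vector<double> &in) {
  out[site] = 2.0 * in[site];                 // placeholder for the Dirac stencil
}

int main() {
  const int osites = 8;
  SiteOrder lo(osites);
  std::vector<double> in(osites, 1.0), out(osites, 0.0);
  for (int ss = 0; ss < osites; ss++) {
    applyStencil(lo.Reorder(ss), out, in);    // kernel sees the reordered site
  }
  std::printf("out[0]=%g\n", out[0]);
  return 0;
}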
@ -111,12 +111,9 @@ namespace Grid {
 const FermionField &B,
 int dag);

-void DhopInternal(StencilImpl & st,DoubledGaugeField & U,
+void DhopInternal(StencilImpl & st,LebesgueOrder & lo,DoubledGaugeField & U,
 const FermionField &in, FermionField &out,int dag) ;

-void DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
-const FermionField &in, FermionField &out,int dag) ;
-
 // Constructor
 WilsonFermion(GaugeField &_Umu,
 GridCartesian &Fgrid,
@ -149,6 +146,10 @@ namespace Grid {
 DoubledGaugeField Umu;
 DoubledGaugeField UmuEven;
 DoubledGaugeField UmuOdd;

+LebesgueOrder Lebesgue;
+LebesgueOrder LebesgueEvenOdd;
+
+
 };

@ -1,5 +1,4 @@
-
 /*************************************************************************************

 Grid physics library, www.github.com/paboyle/Grid

@ -39,8 +38,6 @@ namespace QCD {
 // S-direction is INNERMOST and takes no part in the parity.
 const std::vector<int> WilsonFermion5DStatic::directions ({1,2,3,4, 1, 2, 3, 4});
 const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1});
-int WilsonFermion5DStatic::HandOptDslash;
-int WilsonFermion5DStatic::AsmOptDslash;

 // 5d lattice for DWF.
 template<class Impl>
@ -98,34 +95,27 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,

 // Allocate the required comms buffer
 ImportGauge(_Umu);
-alltime=0;
-commtime=0;
-jointime=0;
-dslashtime=0;
-dslash1time=0;
 }

 template<class Impl>
-WilsonFermion5D<Impl>::WilsonFermion5D(int simd, GaugeField &_Umu,
+WilsonFermion5D<Impl>::WilsonFermion5D(int simd,GaugeField &_Umu,
 GridCartesian &FiveDimGrid,
 GridRedBlackCartesian &FiveDimRedBlackGrid,
 GridCartesian &FourDimGrid,
-GridRedBlackCartesian &FourDimRedBlackGrid,
 RealD _M5,const ImplParams &p) :
 Kernels(p),
 _FiveDimGrid (&FiveDimGrid),
 _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
 _FourDimGrid (&FourDimGrid),
-_FourDimRedBlackGrid(&FourDimRedBlackGrid),
 Stencil (_FiveDimGrid,npoint,Even,directions,displacements),
 StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
 StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
 M5(_M5),
 Umu(_FourDimGrid),
-UmuEven(_FourDimRedBlackGrid),
-UmuOdd (_FourDimRedBlackGrid),
+UmuEven(_FourDimGrid),
+UmuOdd (_FourDimGrid),
 Lebesgue(_FourDimGrid),
-LebesgueEvenOdd(_FourDimRedBlackGrid)
+LebesgueEvenOdd(_FourDimGrid)
 {
 int nsimd = Simd::Nsimd();

@ -134,7 +124,6 @@ WilsonFermion5D<Impl>::WilsonFermion5D(int simd, GaugeField &_Umu,
 assert(FiveDimRedBlackGrid._ndimension==5);
 assert(FiveDimRedBlackGrid._checker_dim==0); // Checkerboard the s-direction
 assert(FourDimGrid._ndimension==4);
-assert(FourDimRedBlackGrid._ndimension==4);

 // Dimension zero of the five-d is the Ls direction
 Ls=FiveDimGrid._fdimensions[0];
@ -147,15 +136,10 @@ WilsonFermion5D<Impl>::WilsonFermion5D(int simd, GaugeField &_Umu,

 // Other dimensions must match the decomposition of the four-D fields
 for(int d=0;d<4;d++){
-assert(FourDimRedBlackGrid._fdimensions[d] ==FourDimGrid._fdimensions[d]);
 assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);

-assert(FourDimRedBlackGrid._processors[d] ==FourDimGrid._processors[d]);
 assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);

 assert(FourDimGrid._simd_layout[d]=1);
-assert(FourDimRedBlackGrid._simd_layout[d] ==1);
-assert(FourDimRedBlackGrid._simd_layout[d] ==1);
 assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);

 assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]);
@ -163,8 +147,13 @@ WilsonFermion5D<Impl>::WilsonFermion5D(int simd, GaugeField &_Umu,
 assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]);
 }

-// Allocate the required comms buffer
-ImportGauge(_Umu);
+{
+GaugeField HUmu(_Umu._grid);
+HUmu = _Umu*(-0.5);
+Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
+UmuEven=Umu;// Really want a reference.
+UmuOdd =Umu;
+}
 }

@ -297,30 +286,6 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
 }


-template<class Impl>
-void WilsonFermion5D<Impl>::Report(void)
-{
-std::cout<<GridLogMessage << "******************** WilsonFermion"<<std::endl;
-std::cout<<GridLogMessage << "Wilson5d time "<<alltime <<" us"<<std::endl;
-std::cout<<GridLogMessage << "HaloBegin time "<<commtime <<" us"<<std::endl;
-std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
-std::cout<<GridLogMessage << "Dslash1 time "<<dslash1time<<" us"<<std::endl;
-std::cout<<GridLogMessage << "HaloComplete time "<<jointime<<" us"<<std::endl;
-std::cout<<GridLogMessage << "******************** Stencil"<<std::endl;
-std::cout<<GridLogMessage << "Stencil all gather time "<<Stencil.halogtime<<" us"<<std::endl;
-std::cout<<GridLogMessage << "Stencil nosplice gather time "<<Stencil.nosplicetime<<" us"<<std::endl;
-std::cout<<GridLogMessage << "Stencil splice gather time "<<Stencil.splicetime<<" us"<<std::endl;
-std::cout<<GridLogMessage << "********************"<<std::endl;
-std::cout<<GridLogMessage << "Stencil gather "<<Stencil.gathertime<<" us"<<std::endl;
-std::cout<<GridLogMessage << "Stencil gather simd "<<Stencil.gathermtime<<" us"<<std::endl;
-std::cout<<GridLogMessage << "Stencil merge simd "<<Stencil.mergetime<<" us"<<std::endl;
-std::cout<<GridLogMessage << "Stencil spin simd "<<Stencil.spintime<<" us"<<std::endl;
-std::cout<<GridLogMessage << "********************"<<std::endl;
-std::cout<<GridLogMessage << "Stencil MB/s "<<(double)Stencil.comms_bytes/Stencil.commtime<<std::endl;
-std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commtime<<" us"<<std::endl;
-std::cout<<GridLogMessage << "Stencil join time "<<Stencil.jointime<<" us"<<std::endl;
-std::cout<<GridLogMessage << "********************"<<std::endl;
-}
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
 const FermionField &A,
@ -342,90 +307,30 @@ template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
 DoubledGaugeField & U,
 const FermionField &in, FermionField &out,int dag)
-{
-DhopInternalCommsThenCompute(st,lo,U,in,out,dag);
-}
-
-template<class Impl>
-void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, LebesgueOrder &lo,
-DoubledGaugeField & U,
-const FermionField &in, FermionField &out,int dag)
 {
 // assert((dag==DaggerNo) ||(dag==DaggerYes));
-alltime-=usecond();
 Compressor compressor(dag);

-// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
 int LLs = in._grid->_rdimensions[0];

-commtime -=usecond();
-// auto handle = st.HaloExchangeBegin(in,compressor);
-// st.HaloExchangeComplete(handle);
 st.HaloExchange(in,compressor);
-commtime +=usecond();

-jointime -=usecond();
-jointime +=usecond();

 // Dhop takes the 4d grid from U, and makes a 5d index for fermion
-// Not loop ordering and data layout.
-// Designed to create
-// - per thread reuse in L1 cache for U
-// - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
-dslashtime -=usecond();
 if ( dag == DaggerYes ) {
-if( this->HandOptDslash ) {
 PARALLEL_FOR_LOOP
 for(int ss=0;ss<U._grid->oSites();ss++){
-for(int s=0;s<LLs;s++){
-int sU=ss;
-int sF = s+LLs*sU;
-Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
-}
-}
-} else {
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<U._grid->oSites();ss++){
-for(int s=0;s<LLs;s++){
-int sU=ss;
-int sF = s+LLs*sU;
-Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
-}
-}
-}
+int sU=ss;
+int sF=LLs*sU;
+Kernels::DiracOptDhopSiteDag(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
 }
 } else {
-if( this->AsmOptDslash ) {
 PARALLEL_FOR_LOOP
 for(int ss=0;ss<U._grid->oSites();ss++){
-for(int s=0;s<LLs;s++){
-int sU=ss;
-int sF = s+LLs*sU;
-Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
-}
-}
-} else if( this->HandOptDslash ) {
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<U._grid->oSites();ss++){
-for(int s=0;s<LLs;s++){
-int sU=ss;
-int sF = s+LLs*sU;
-Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
-}
-}
-} else {
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<U._grid->oSites();ss++){
-for(int s=0;s<LLs;s++){
-int sU=ss;
-int sF = s+LLs*sU;
-Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
-}
-}
-}
+int sU=ss;
+int sF=LLs*sU;
+Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
 }
 }
-dslashtime +=usecond();
-alltime+=usecond();
 }


@ -473,7 +378,7 @@ FermOpTemplateInstantiate(WilsonFermion5D);
 GparityFermOpTemplateInstantiate(WilsonFermion5D);
 template class WilsonFermion5D<DomainWallRedBlack5dImplF>;
 template class WilsonFermion5D<DomainWallRedBlack5dImplD>;

 }}

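In the 5d constructor hunk above, the call to ImportGauge is replaced by an inline block that scales the incoming gauge field by -1/2 (HUmu = _Umu*(-0.5)) and double-stores it into the operator's own Umu. A minimal sketch of that scale-and-store idea, with plain arrays standing in for Grid's GaugeField/DoubledGaugeField; the names below are illustrative only.

// Illustrative only: fold the -1/2 Wilson factor into an operator-owned,
// "doubled" copy of the gauge links (forward and backward slots per site).
#include <cstdio>
#include <vector>

int main() {
  const int sites = 4;
  std::vector<double> Umu_in(sites, 1.0);   // incoming links
  std::vector<double> Udoubled(2 * sites);  // forward + backward slot per site

  for (int s = 0; s < sites; s++) {
    double h = -0.5 * Umu_in[s];            // HUmu = _Umu * (-0.5)
    Udoubled[2 * s + 0] = h;                // forward direction
    Udoubled[2 * s + 1] = h;                // backward direction (sketch only)
  }
  std::printf("Udoubled[0]=%g\n", Udoubled[0]);
  return 0;
}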
@ -49,8 +49,6 @@ namespace Grid {
 class WilsonFermion5DStatic {
 public:
 // S-direction is INNERMOST and takes no part in the parity.
-static int AsmOptDslash; // these are a temporary hack
-static int HandOptDslash; // these are a temporary hack
 static const std::vector<int> directions;
 static const std::vector<int> displacements;
 const int npoint = 8;
@ -62,11 +60,7 @@ namespace Grid {
 public:
 INHERIT_IMPL_TYPES(Impl);
 typedef WilsonKernels<Impl> Kernels;
-double alltime;
-double jointime;
-double commtime;
-double dslashtime;
-double dslash1time;
 ///////////////////////////////////////////////////////////////
 // Implement the abstract base
 ///////////////////////////////////////////////////////////////
@ -122,13 +116,6 @@ namespace Grid {
 FermionField &out,
 int dag);

-void DhopInternalCommsThenCompute(StencilImpl & st,
-LebesgueOrder &lo,
-DoubledGaugeField &U,
-const FermionField &in,
-FermionField &out,
-int dag);
-
 // Constructors
 WilsonFermion5D(GaugeField &_Umu,
 GridCartesian &FiveDimGrid,
@ -143,13 +130,11 @@ namespace Grid {
 GridCartesian &FiveDimGrid,
 GridRedBlackCartesian &FiveDimRedBlackGrid,
 GridCartesian &FourDimGrid,
-GridRedBlackCartesian &FourDimRedBlackGrid,
 double _M5,const ImplParams &p= ImplParams());

 // DoubleStore
 void ImportGauge(const GaugeField &_Umu);

-void Report(void);
 ///////////////////////////////////////////////////////////////
 // Data members require to support the functionality
 ///////////////////////////////////////////////////////////////
@ -31,12 +31,63 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 namespace Grid {
 namespace QCD {

+int WilsonKernelsStatic::HandOpt;
+int WilsonKernelsStatic::AsmOpt;

 template<class Impl>
 WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};

-// Need controls to do interior, exterior, or both
 template<class Impl>
-void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
+int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
+{
+#ifdef AVX512
+if ( AsmOpt ) {
+WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
+} else {
+#else
+{
+#endif
+for(int site=0;site<Ns;site++) {
+for(int s=0;s<Ls;s++) {
+if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);
+else WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out);
+sF++;
+}
+sU++;
+}
+}
+}
+
+template<class Impl>
+void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
+int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
+{
+// No asm implementation yet.
+// if ( AsmOpt ) WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
+// else
+for(int site=0;site<Ns;site++) {
+for(int s=0;s<Ls;s++) {
+if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
+else WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
+sF++;
+}
+sU++;
+}
+}
+
+////////////////////////////////////////////
+// Generic implementation; move to different file?
+////////////////////////////////////////////
+
+template<class Impl>
+void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int sF,int sU,const FermionField &in, FermionField &out)
 {
@ -214,9 +265,9 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField

 // Need controls to do interior, exterior, or both
 template<class Impl>
-void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int sF,int sU,const FermionField &in, FermionField &out)
 {
 SiteHalfSpinor tmp;
 SiteHalfSpinor chi;
@ -518,17 +569,9 @@ void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
 vstream(out._odata[sF],result);
 }

-#if ( ! defined(AVX512) )
-template<class Impl>
-void WilsonKernels<Impl>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
-std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
-int sF,int sU,const FermionField &in, FermionField &out)
-{
-DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
-}
-#endif

 FermOpTemplateInstantiate(WilsonKernels);

 template class WilsonKernels<DomainWallRedBlack5dImplF>;
 template class WilsonKernels<DomainWallRedBlack5dImplD>;

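The new DiracOptDhopSite/DiracOptDhopSiteDag entry points shown above select an assembler, hand-unrolled, or generic per-site kernel at run time via the static AsmOpt/HandOpt flags, looping over Ns outer sites and Ls fifth-dimension slices. Below is a self-contained sketch of that dispatch pattern only; the flag and kernel names are stand-ins and not the Grid API.

// Sketch of the run-time kernel dispatch pattern; not Grid code.
#include <cstdio>

struct Kernels {
  static int AsmOpt;   // prefer an assembler kernel when available
  static int HandOpt;  // otherwise a hand-unrolled kernel, else generic

  static void AsmBlock(int sF, int sU, int Ls, int Ns) {
    std::printf("asm block: sF=%d sU=%d Ls=%d Ns=%d\n", sF, sU, Ls, Ns);
  }
  static void HandSite(int sF, int sU)    { std::printf("hand sF=%d sU=%d\n", sF, sU); }
  static void GenericSite(int sF, int sU) { std::printf("gen  sF=%d sU=%d\n", sF, sU); }

  // Entry point: Ns outer (4d) sites, each with Ls fifth-dimension slices.
  static void DhopSite(int sF, int sU, int Ls, int Ns) {
    if (AsmOpt) { AsmBlock(sF, sU, Ls, Ns); return; }  // whole block at once
    for (int site = 0; site < Ns; site++) {
      for (int s = 0; s < Ls; s++) {
        if (HandOpt) HandSite(sF, sU);
        else         GenericSite(sF, sU);
        sF++;
      }
      sU++;
    }
  }
};
int Kernels::AsmOpt  = 0;
int Kernels::HandOpt = 1;

int main() { Kernels::DhopSite(0, 0, 2, 2); return 0; }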
@ -38,37 +38,56 @@ namespace Grid {
 // Helper routines that implement Wilson stencil for a single site.
 // Common to both the WilsonFermion and WilsonFermion5D
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+class WilsonKernelsStatic {
+public:
+// S-direction is INNERMOST and takes no part in the parity.
+static int AsmOpt; // these are a temporary hack
+static int HandOpt; // these are a temporary hack
+};
+
-template<class Impl> class WilsonKernels : public FermionOperator<Impl> {
+template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic {
 public:

 INHERIT_IMPL_TYPES(Impl);
 typedef FermionOperator<Impl> Base;

 public:
-void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void DiracOptDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
-int sF,int sU,const FermionField &in, FermionField &out);
+int sF, int sU,int Ls, int Ns, const FermionField &in, FermionField &out);

-void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
-int sF,int sU,const FermionField &in,FermionField &out);
+int sF,int sU,int Ls, int Ns, const FermionField &in,FermionField &out);

 void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);

-void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
+private:
+// Specialised variants
+void DiracOptGenericDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
+int sF,int sU, const FermionField &in, FermionField &out);
+
+void DiracOptGenericDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
-int sF,int sU,const FermionField &in, FermionField &out);
+int sF,int sU,const FermionField &in,FermionField &out);

-int DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
+int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
+
+void DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int sF,int sU,const FermionField &in, FermionField &out);

-int DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int sF,int sU,const FermionField &in, FermionField &out);
+public:

 WilsonKernels(const ImplParams &p= ImplParams());

@ -2,6 +2,8 @@

 Grid physics library, www.github.com/paboyle/Grid

+
+
 Source file: ./lib/qcd/action/fermion/WilsonKernelsAsm.cc

 Copyright (C) 2015
||||||
@ -26,237 +28,93 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#include <Grid.h>
|
#include <Grid.h>
|
||||||
#if defined(AVX512)
|
|
||||||
//#if defined (IMCI)
|
|
||||||
|
|
||||||
#include <simd/Intel512wilson.h>
|
|
||||||
|
|
||||||
#include <simd/Intel512single.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
namespace QCD {
|
namespace QCD {
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////
|
||||||
|
// Default to no assembler implementation
|
||||||
|
///////////////////////////////////////////////////////////
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out)
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
uint64_t now;
|
assert(0);
|
||||||
uint64_t first ;
|
|
||||||
int offset,local,perm, ptype;
|
|
||||||
const SiteHalfSpinor *pbuf = & buf[0];
|
|
||||||
const SiteSpinor *plocal = & in._odata[0];
|
|
||||||
void *pf;
|
|
||||||
int osites = in._grid->oSites();
|
|
||||||
|
|
||||||
|
|
||||||
StencilEntry *SE;
|
|
||||||
|
|
||||||
//#define STAMP(i) timers[i] = cyclecount() ;
|
|
||||||
#define STAMP(i) //timers[i] = cyclecount() ;
|
|
||||||
|
|
||||||
MASK_REGS;
|
|
||||||
|
|
||||||
first = cyclecount();
|
|
||||||
|
|
||||||
SE=st.GetEntry(ptype,Xm,ss);
|
|
||||||
|
|
||||||
// Xm
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
// Prefetch
|
|
||||||
SE=st.GetEntry(ptype,Ym,ss);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
XP_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR3; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFXM(Xm,pf);
|
|
||||||
}
|
|
||||||
XP_RECON;
|
|
||||||
|
|
||||||
// Ym
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
// Prefetch
|
|
||||||
SE=st.GetEntry(ptype,Zm,ss);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
YP_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR2; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFYM(Ym,pf);
|
|
||||||
}
|
|
||||||
YP_RECON_ACCUM;
|
|
||||||
|
|
||||||
// Zm
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
// Prefetch
|
|
||||||
SE=st.GetEntry(ptype,Tm,ss);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
ZP_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR1; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFZM(Zm,pf);
|
|
||||||
}
|
|
||||||
ZP_RECON_ACCUM;
|
|
||||||
|
|
||||||
// Tm
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
SE=st.GetEntry(ptype,Tp,ss);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
TP_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR0; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFTM(Tm,pf);
|
|
||||||
}
|
|
||||||
TP_RECON_ACCUM;
|
|
||||||
|
|
||||||
// Tp
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
// Prefetch
|
|
||||||
SE=st.GetEntry(ptype,Zp,ss);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
TM_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR0; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFTP(Tp,pf);
|
|
||||||
}
|
|
||||||
TM_RECON_ACCUM;
|
|
||||||
|
|
||||||
// Zp
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
// Prefetch
|
|
||||||
SE=st.GetEntry(ptype,Yp,ss);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
ZM_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR1; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFZP(Zp,pf);
|
|
||||||
}
|
|
||||||
ZM_RECON_ACCUM;
|
|
||||||
|
|
||||||
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
// Prefetch
|
|
||||||
SE=st.GetEntry(ptype,Xp,ss);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
YM_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR2; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFYP(Yp,pf);
|
|
||||||
}
|
|
||||||
YM_RECON_ACCUM;
|
|
||||||
|
|
||||||
// Xp
|
|
||||||
perm = SE->_permute;
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
|
|
||||||
// Prefetch
|
|
||||||
SE=st.GetEntry(ptype,Xm,(ss+1)%osites);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
XM_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR3; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFXP(Xp,pf);
|
|
||||||
}
|
|
||||||
XM_RECON_ACCUM;
|
|
||||||
|
|
||||||
debug:
|
|
||||||
SAVE_RESULT(&out._odata[ss]);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template class WilsonKernels<WilsonImplF>;
|
#if defined(AVX512)
|
||||||
template class WilsonKernels<WilsonImplD>;
|
|
||||||
template class WilsonKernels<GparityWilsonImplF>;
|
|
||||||
template class WilsonKernels<GparityWilsonImplD>;
|
///////////////////////////////////////////////////////////
|
||||||
template class WilsonKernels<DomainWallRedBlack5dImplF>;
|
// If we are AVX512 specialise the single precision routine
|
||||||
template class WilsonKernels<DomainWallRedBlack5dImplD>;
|
///////////////////////////////////////////////////////////
|
||||||
}}
|
|
||||||
|
#include <simd/Intel512wilson.h>
|
||||||
|
#include <simd/Intel512single.h>
|
||||||
|
|
||||||
|
static Vector<vComplexF> signs;
|
||||||
|
|
||||||
|
int setupSigns(void ){
|
||||||
|
Vector<vComplexF> bother(2);
|
||||||
|
signs = bother;
|
||||||
|
vrsign(signs[0]);
|
||||||
|
visign(signs[1]);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
static int signInit = setupSigns();
|
||||||
|
|
||||||
|
#define label(A) ilabel(A)
|
||||||
|
#define ilabel(A) ".globl\n" #A ":\n"
|
||||||
|
|
||||||
|
#define MAYBEPERM(A,perm) if (perm) { A ; }
|
||||||
|
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
|
||||||
|
#define FX(A) WILSONASM_ ##A
|
||||||
|
template<>
|
||||||
|
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||||
|
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||||
|
|
||||||
|
#undef VMOVIDUP
|
||||||
|
#undef VMOVRDUP
|
||||||
|
#undef MAYBEPERM
|
||||||
|
#undef MULT_2SPIN
|
||||||
|
#undef FX
|
||||||
|
#define FX(A) DWFASM_ ## A
|
||||||
|
#define MAYBEPERM(A,B)
|
||||||
|
#define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C)
|
||||||
|
#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C)
|
||||||
|
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
|
||||||
|
template<>
|
||||||
|
void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||||
|
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
|
template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
|
template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
|
template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
|
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
|
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
|
}}
|
||||||
|
|
||||||
|
lib/qcd/action/fermion/WilsonKernelsAsmBody.h (new file, 186 lines)
@ -0,0 +1,186 @@
|
|||||||
|
{
|
||||||
|
int local,perm, ptype;
|
||||||
|
uint64_t base;
|
||||||
|
uint64_t basep;
|
||||||
|
const uint64_t plocal =(uint64_t) & in._odata[0];
|
||||||
|
|
||||||
|
// vComplexF isigns[2] = { signs[0], signs[1] };
|
||||||
|
vComplexF *isigns = &signs[0];
|
||||||
|
|
||||||
|
MASK_REGS;
|
||||||
|
int nmax=U._grid->oSites();
|
||||||
|
for(int site=0;site<Ns;site++) {
|
||||||
|
int sU =lo.Reorder(ssU);
|
||||||
|
int ssn=ssU+1;
|
||||||
|
if(ssn>=nmax) ssn=0;
|
||||||
|
int sUn=lo.Reorder(ssn);
|
||||||
|
for(int s=0;s<Ls;s++) {
|
||||||
|
ss =sU*Ls+s;
|
||||||
|
ssn=sUn*Ls+s;
|
||||||
|
////////////////////////////////
|
||||||
|
// Xp
|
||||||
|
////////////////////////////////
|
||||||
|
int ent=ss*8;// 2*Ndim
|
||||||
|
int nent=ssn*8;
|
||||||
|
|
||||||
|
PF_GAUGE(Xp);
|
||||||
|
base = st.GetInfo(ptype,local,perm,Xp,ent,plocal); ent++;
|
||||||
|
PREFETCH1_CHIMU(base);
|
||||||
|
|
||||||
|
basep = st.GetPFInfo(nent,plocal); nent++;
|
||||||
|
if ( local ) {
|
||||||
|
LOAD64(%r10,isigns);
|
||||||
|
XM_PROJMEM(base);
|
||||||
|
MAYBEPERM(PERMUTE_DIR3,perm);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(base);
|
||||||
|
}
|
||||||
|
base = st.GetInfo(ptype,local,perm,Yp,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(base);
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFXP(Xp,basep);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns);
|
||||||
|
XM_RECON;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Yp
|
||||||
|
////////////////////////////////
|
||||||
|
basep = st.GetPFInfo(nent,plocal); nent++;
|
||||||
|
if ( local ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YM_PROJMEM(base);
|
||||||
|
MAYBEPERM(PERMUTE_DIR2,perm);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(base);
|
||||||
|
}
|
||||||
|
base = st.GetInfo(ptype,local,perm,Zp,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(base);
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFYP(Yp,basep);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YM_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Zp
|
||||||
|
////////////////////////////////
|
||||||
|
basep = st.GetPFInfo(nent,plocal); nent++;
|
||||||
|
if ( local ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZM_PROJMEM(base);
|
||||||
|
MAYBEPERM(PERMUTE_DIR1,perm);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(base);
|
||||||
|
}
|
||||||
|
base = st.GetInfo(ptype,local,perm,Tp,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(base);
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFZP(Zp,basep);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZM_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Tp
|
||||||
|
////////////////////////////////
|
||||||
|
basep = st.GetPFInfo(nent,plocal); nent++;
|
||||||
|
if ( local ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TM_PROJMEM(base);
|
||||||
|
MAYBEPERM(PERMUTE_DIR0,perm);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(base);
|
||||||
|
}
|
||||||
|
base = st.GetInfo(ptype,local,perm,Xm,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(base);
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFTP(Tp,basep);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TM_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Xm
|
||||||
|
////////////////////////////////
|
||||||
|
basep= (uint64_t) &out._odata[ss];
|
||||||
|
// basep= st.GetPFInfo(nent,plocal); nent++;
|
||||||
|
if ( local ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
XP_PROJMEM(base);
|
||||||
|
MAYBEPERM(PERMUTE_DIR3,perm);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(base);
|
||||||
|
}
|
||||||
|
base = st.GetInfo(ptype,local,perm,Ym,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(base);
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFXM(Xm,basep);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
XP_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Ym
|
||||||
|
////////////////////////////////
|
||||||
|
basep= st.GetPFInfo(nent,plocal); nent++;
|
||||||
|
if ( local ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YP_PROJMEM(base);
|
||||||
|
MAYBEPERM(PERMUTE_DIR2,perm);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(base);
|
||||||
|
}
|
||||||
|
base = st.GetInfo(ptype,local,perm,Zm,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(base);
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFYM(Ym,basep);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YP_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Zm
|
||||||
|
////////////////////////////////
|
||||||
|
basep= st.GetPFInfo(nent,plocal); nent++;
|
||||||
|
if ( local ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZP_PROJMEM(base);
|
||||||
|
MAYBEPERM(PERMUTE_DIR1,perm);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(base);
|
||||||
|
}
|
||||||
|
base = st.GetInfo(ptype,local,perm,Tm,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(base);
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFZM(Zm,basep);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZP_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Tm
|
||||||
|
////////////////////////////////
|
||||||
|
basep= st.GetPFInfo(nent,plocal); nent++;
|
||||||
|
if ( local ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TP_PROJMEM(base);
|
||||||
|
MAYBEPERM(PERMUTE_DIR0,perm);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(base);
|
||||||
|
}
|
||||||
|
base= (uint64_t) &out._odata[ss];
|
||||||
|
PREFETCH_CHIMU(base);
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFTM(Tm,basep);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TP_RECON_ACCUM;
|
||||||
|
|
||||||
|
basep= st.GetPFInfo(nent,plocal); nent++;
|
||||||
|
SAVE_RESULT(base,basep);
|
||||||
|
|
||||||
|
}
|
||||||
|
ssU++;
|
||||||
|
}
|
||||||
|
}
|
lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab (new file, 161 lines)
@ -0,0 +1,161 @@
|
|||||||
|
{
|
||||||
|
int locala,perma, ptypea;
|
||||||
|
int localb,permb, ptypeb;
|
||||||
|
uint64_t basea, baseb;
|
||||||
|
const uint64_t plocal =(uint64_t) & in._odata[0];
|
||||||
|
|
||||||
|
// vComplexF isigns[2] = { signs[0], signs[1] };
|
||||||
|
vComplexF *isigns = &signs[0];
|
||||||
|
|
||||||
|
MASK_REGS;
|
||||||
|
|
||||||
|
for(int site=0;site<Ns;site++) {
|
||||||
|
int sU=lo.Reorder(ssU);
|
||||||
|
for(int s=0;s<Ls;s++) {
|
||||||
|
ss=sU*Ls+s;
|
||||||
|
////////////////////////////////
|
||||||
|
// Xp
|
||||||
|
////////////////////////////////
|
||||||
|
int ent=ss*8;// 2*Ndim
|
||||||
|
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||||
|
|
||||||
|
if ( locala ) {
|
||||||
|
LOAD64(%r10,isigns);
|
||||||
|
XM_PROJMEM(basea);
|
||||||
|
MAYBEPERM(PERMUTE_DIR3,perma);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basea);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFXP(Xp,baseb);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns);
|
||||||
|
XM_RECON;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Yp
|
||||||
|
////////////////////////////////
|
||||||
|
basea = st.GetInfo(ptypea,locala,perma,Zp,ent,plocal); ent++;
|
||||||
|
if ( localb ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YM_PROJMEM(baseb);
|
||||||
|
MAYBEPERM(PERMUTE_DIR2,permb);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(baseb);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFYP(Yp,basea);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YM_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Zp
|
||||||
|
////////////////////////////////
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Tp,ent,plocal); ent++;
|
||||||
|
if ( locala ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZM_PROJMEM(basea);
|
||||||
|
MAYBEPERM(PERMUTE_DIR1,perma);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basea);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFZP(Zp,baseb);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZM_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Tp
|
||||||
|
////////////////////////////////
|
||||||
|
basea = st.GetInfo(ptypea,locala,perma,Xm,ent,plocal); ent++;
|
||||||
|
if ( localb ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TM_PROJMEM(baseb);
|
||||||
|
MAYBEPERM(PERMUTE_DIR0,permb);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(baseb);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFTP(Tp,basea);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TM_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Xm
|
||||||
|
////////////////////////////////
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Ym,ent,plocal); ent++;
|
||||||
|
if ( locala ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
XP_PROJMEM(basea);
|
||||||
|
MAYBEPERM(PERMUTE_DIR3,perma);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basea);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFXM(Xm,baseb);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
XP_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Ym
|
||||||
|
////////////////////////////////
|
||||||
|
basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++;
|
||||||
|
if ( localb ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YP_PROJMEM(baseb);
|
||||||
|
MAYBEPERM(PERMUTE_DIR2,permb);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(baseb);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFYM(Ym,basea);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YP_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Zm
|
||||||
|
////////////////////////////////
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++;
|
||||||
|
if ( locala ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZP_PROJMEM(basea);
|
||||||
|
MAYBEPERM(PERMUTE_DIR1,perma);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basea);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFZM(Zm,baseb);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZP_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Tm
|
||||||
|
////////////////////////////////
|
||||||
|
basea = (uint64_t)&out._odata[ss];
|
||||||
|
if ( localb ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TP_PROJMEM(baseb);
|
||||||
|
MAYBEPERM(PERMUTE_DIR0,permb);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(baseb);
|
||||||
|
}
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal);
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFTM(Tm,basea);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TP_RECON_ACCUM;
|
||||||
|
|
||||||
|
SAVE_RESULT(&out._odata[ss],baseb);
|
||||||
|
|
||||||
|
}
|
||||||
|
ssU++;
|
||||||
|
}
|
||||||
|
}
|
lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc (new file, 187 lines)
@ -0,0 +1,187 @@
|
|||||||
|
{
|
||||||
|
int locala,perma, ptypea;
|
||||||
|
int localb,permb, ptypeb;
|
||||||
|
int localc,permc, ptypec;
|
||||||
|
uint64_t basea, baseb, basec;
|
||||||
|
uint64_t basex;
|
||||||
|
const uint64_t plocal =(uint64_t) & in._odata[0];
|
||||||
|
|
||||||
|
// vComplexF isigns[2] = { signs[0], signs[1] };
|
||||||
|
vComplexF *isigns = &signs[0];
|
||||||
|
|
||||||
|
MASK_REGS;
|
||||||
|
|
||||||
|
  for(int site=0;site<Ns;site++) {
    int sU=lo.Reorder(ssU);

    for(int s=0;s<Ls;s++) {
      ss =sU*Ls+s;

      ////////////////////////////////
      // Xp
      ////////////////////////////////
      int ent=ss*8;// 2*Ndim
      basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
      PREFETCH_CHIMU(basea);
      baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
      PREFETCH_CHIMU(baseb);
      basec = st.GetInfo(ptypec,localc,permc,Zp,ent,plocal); ent++;
      PREFETCH_CHIMU(basec);

      basex = basea;

      label(FX(XP) );
      if ( locala ) {
        LOAD64(%r10,isigns);
        XM_PROJMEM(basea);
        MAYBEPERM(PERMUTE_DIR3,perma);
      } else {
        LOAD_CHI(basea);
      }
      {
        MULT_2SPIN_DIR_PFXP(Xp,baseb);
      }
      LOAD64(%r10,isigns);
      XM_RECON;

      ////////////////////////////////
      // Yp
      ////////////////////////////////
      basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
      PREFETCH_CHIMU(basea);
      label(FX(YP) );
      if ( localb ) {
        LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
        YM_PROJMEM(baseb);
        MAYBEPERM(PERMUTE_DIR2,permb);
      } else {
        LOAD_CHI(baseb);
      }
      {
        MULT_2SPIN_DIR_PFYP(Yp,basec);
      }
      LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
      YM_RECON_ACCUM;

      ////////////////////////////////
      // Zp
      ////////////////////////////////
      baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
      PREFETCH_CHIMU(baseb);
      label(FX(ZP) );
      if ( localc ) {
        LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
        ZM_PROJMEM(basec);
        MAYBEPERM(PERMUTE_DIR1,permc);
      } else {
        LOAD_CHI(basec);
      }
      {
        MULT_2SPIN_DIR_PFZP(Zp,basea);
      }
      LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
      ZM_RECON_ACCUM;

      ////////////////////////////////
      // Tp
      ////////////////////////////////
      basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++;
      PREFETCH_CHIMU(basec);
      label(FX(TP) );
      if ( locala ) {
        LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
        TM_PROJMEM(basea);
        MAYBEPERM(PERMUTE_DIR0,perma);
      } else {
        LOAD_CHI(basea);
      }
      {
        MULT_2SPIN_DIR_PFTP(Tp,baseb);
      }
      LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
      TM_RECON_ACCUM;

      ////////////////////////////////
      // Xm
      ////////////////////////////////
      basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++;
      PREFETCH_CHIMU(basea);
      label(FX(XM) );
      if ( localb ) {
        LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
        XP_PROJMEM(baseb);
        MAYBEPERM(PERMUTE_DIR3,permb);
      } else {
        LOAD_CHI(baseb);
      }
      {
        MULT_2SPIN_DIR_PFXM(Xm,basec);
      }
      LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
      XP_RECON_ACCUM;

      ////////////////////////////////
      // Ym
      ////////////////////////////////
      baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++;
      PREFETCH_CHIMU(baseb);
      label(FX(YM) );
      if ( localc ) {
        LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
        YP_PROJMEM(basec);
        MAYBEPERM(PERMUTE_DIR2,permc);
      } else {
        LOAD_CHI(basec);
      }
      {
        MULT_2SPIN_DIR_PFYM(Ym,basea);
      }
      LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
      YP_RECON_ACCUM;

      ////////////////////////////////
      // Zm
      ////////////////////////////////
      basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++;
      PREFETCH_CHIMU(basec);
      label(FX(ZM) );
      if ( locala ) {
        LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
        ZP_PROJMEM(basea);
        MAYBEPERM(PERMUTE_DIR1,perma);
      } else {
        LOAD_CHI(basea);
      }
      {
        MULT_2SPIN_DIR_PFZM(Zm,baseb);
      }
      LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
      ZP_RECON_ACCUM;

      ////////////////////////////////
      // Tm
      ////////////////////////////////
      basea = (uint64_t)&out._odata[ss];
      PREFETCH_CHIMU(basea);
      label(FX(TM) );
      if ( localb ) {
        LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
        TP_PROJMEM(baseb);
        MAYBEPERM(PERMUTE_DIR0,permb);
      } else {
        LOAD_CHI(baseb);
      }
      {
        MULT_2SPIN_DIR_PFTM(Tm,basec);
      }
      LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
      TP_RECON_ACCUM;

      // PREFETCH_CHIMU(basex);
      label(FX(SAV) );
      SAVE_RESULT(&out._odata[ss]);

    }
    ssU++;
  }
}
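The unrolled body above repeats one stencil leg per direction: spin-project the neighbour (or fetch the pre-projected half spinor from the comms buffer), permute SIMD lanes if the hop wraps the node-local volume, multiply by the gauge link, then reconstruct and accumulate the four-spinor. A minimal C++ sketch of that per-leg pattern follows; the types and the helper are illustrative stand-ins only, not Grid's API beyond the macro names quoted above.

#include <array>
#include <cstdio>

struct HalfSpinor {};   // stand-in for SiteHalfSpinor
struct Spinor     {};   // stand-in for the site result

// One stencil leg, mirroring the PROJ / MAYBEPERM / MULT_2SPIN / RECON steps above.
void stencil_leg(bool local, bool perm, int dir, Spinor &result) {
  HalfSpinor chi;
  if (local) {
    // spin-project the neighbour straight from the local field (XM_PROJMEM, ...)
    if (perm) {
      // permute SIMD lanes when the hop wraps the node-local volume (MAYBEPERM)
    }
  } else {
    // neighbour already projected on the sending node; just load it (LOAD_CHI)
  }
  // multiply the half spinor by the gauge link for this direction (MULT_2SPIN_DIR_*)
  // reconstruct the four-spinor and accumulate into the result (*_RECON_ACCUM)
  (void)chi; (void)dir; (void)result;
}

int main() {
  Spinor result;
  std::array<bool, 8> local{}, perm{};
  for (int dir = 0; dir < 8; ++dir)   // Xp,Yp,Zp,Tp,Xm,Ym,Zm,Tm, as unrolled above
    stencil_leg(local[dir], perm[dir], dir, result);
  std::puts("eight stencil legs applied");
  return 0;
}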
@@ -312,7 +312,7 @@ namespace QCD {

 template<class Impl>
-int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int ss,int sU,const FermionField &in, FermionField &out)
 {
@@ -552,12 +552,10 @@ int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField
   vstream(ref()(3)(1),result_31);
   vstream(ref()(3)(2),result_32);
  }
-  return 0;
 }

 template<class Impl>
-int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int ss,int sU,const FermionField &in, FermionField &out)
 {
@@ -798,7 +796,6 @@ int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeFi
   vstream(ref()(3)(1),result_31);
   vstream(ref()(3)(2),result_32);
  }
-  return 0;
 }

@@ -806,125 +803,80 @@ int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeFi
 // Specialise Gparity to simple implementation
 ////////////////////////////////////////////////
 template<>
-int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int sF,int sU,const FermionField &in, FermionField &out)
 {
-  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
-  return 0;
+  assert(0);
 }

 template<>
-int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int sF,int sU,const FermionField &in, FermionField &out)
 {
-  DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
-  return 0;
+  assert(0);
 }

 template<>
-int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int sF,int sU,const FermionField &in, FermionField &out)
 {
-  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
-  return 0;
+  assert(0);
 }

 template<>
-int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int sF,int sU,const FermionField &in, FermionField &out)
 {
-  DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
-  return 0;
+  assert(0);
 }

-//////////////
-/*
-template<>
-int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
-std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
-int sF,int sU,const FermionField &in, FermionField &out)
-{
-  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
-  return 0;
-}
-
-template<>
-int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
-std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
-int sF,int sU,const FermionField &in, FermionField &out)
-{
-  DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
-  return 0;
-}
-
-template<>
-int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
-std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
-int sF,int sU,const FermionField &in, FermionField &out)
-{
-  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
-  return 0;
-}
-
-template<>
-int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
-std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
-int sF,int sU,const FermionField &in, FermionField &out)
-{
-  DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
-  return 0;
-}
-
-*/
-
-template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+////////////// Wilson ; uses this implementation /////////////////////
+// Need Nc=3 though //
+template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int ss,int sU,const FermionField &in, FermionField &out);
-template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int ss,int sU,const FermionField &in, FermionField &out);
-template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int ss,int sU,const FermionField &in, FermionField &out);
-template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int ss,int sU,const FermionField &in, FermionField &out);

-template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int ss,int sU,const FermionField &in, FermionField &out);
-template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int ss,int sU,const FermionField &in, FermionField &out);
-template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int ss,int sU,const FermionField &in, FermionField &out);
-template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int ss,int sU,const FermionField &in, FermionField &out);

-template int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int ss,int sU,const FermionField &in, FermionField &out);
-template int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int ss,int sU,const FermionField &in, FermionField &out);
-template int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int ss,int sU,const FermionField &in, FermionField &out);
-template int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int ss,int sU,const FermionField &in, FermionField &out);

0  lib/qcd/hmc/.dirstamp  Normal file
0  lib/qcd/spin/.dirstamp  Normal file
0  lib/qcd/utils/.dirstamp  Normal file
@@ -410,6 +410,7 @@ namespace Optimization {
       break;
     default: assert(0);
     }
+    return out;
   }
   static inline u128d rotate(u128d in,int n){
     u128d out;
@@ -424,6 +425,7 @@ namespace Optimization {
       break;
     default: assert(0);
     }
+    return out;
   }
 };

@@ -367,6 +367,9 @@ namespace Grid {
 template <class S,class V, IfComplex<S> = 0 > inline void vzero(Grid_simd<S,V> &ret) { vsplat(ret,S(0.0,0.0)); }// use xor?
 template <class S,class V, IfComplex<S> = 0 > inline void vcomplex_i(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0,1.0));}

+template <class S,class V, IfComplex<S> = 0 > inline void visign(Grid_simd<S,V> &ret){ vsplat(ret,S(1.0,-1.0));}
+template <class S,class V, IfComplex<S> = 0 > inline void vrsign(Grid_simd<S,V> &ret){ vsplat(ret,S(-1.0,1.0));}
+
 // if not complex overload here
 template <class S,class V, IfReal<S> = 0 > inline void vone (Grid_simd<S,V> &ret){ vsplat(ret,S(1.0)); }
 template <class S,class V, IfReal<S> = 0 > inline void vzero(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0)); }
@@ -87,14 +87,39 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 #define VMOVRDUPd(OFF,A,DEST) "vpshufd $0x44," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: 1,0,3,2
 #define VMOVIDUPd(OFF,A,DEST) "vpshufd $0xee," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: 3,2,3,2

 #define VMOVRDUPf(OFF,PTR,DEST) "vmovsldup " #OFF "*64(" #PTR "), " #DEST ";\n"
 #define VMOVIDUPf(OFF,PTR,DEST) "vmovshdup " #OFF "*64(" #PTR "), " #DEST ";\n"

+#define VRDUPd(SRC,DEST) "vpshufd $0x44," #SRC"," #DEST ";\n" // 32 bit level: 1,0,3,2
+#define VRDUPf(SRC,DEST) "vmovsldup " #SRC ", " #DEST ";\n"
+#define VIDUPd(SRC,DEST) "vpshufd $0xee," #SRC"," #DEST ";\n" // 32 bit level: 3,2,3,2
+#define VIDUPf(SRC,DEST) "vmovshdup " #SRC ", " #DEST ";\n"
+
+#define VBCASTRDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST ";\n"
+#define VBCASTIDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST ";\n"
+#define VBCASTRDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST ";\n"
+#define VBCASTIDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST ";\n"
+
 #define VMADDSUBf(A,B,accum) "vfmaddsub231ps " #A "," #B "," #accum ";\n"
 #define VMADDSUBd(A,B,accum) "vfmaddsub231pd " #A "," #B "," #accum ";\n"
+#define VMADDSUBMEMf(O,P,B,accum) "vfmaddsub231ps " #O"*64("#P "),"#B "," #accum ";\n"
+#define VMADDSUBMEMd(O,P,B,accum) "vfmaddsub231pd " #O"*64("#P "),"#B "," #accum ";\n"
+
+#define VMADDSUBRDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
+#define VMADDSUBIDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
+#define VMULRDUPf(O,P,B,accum) "vmulps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
+#define VMULIDUPf(O,P,B,accum) "vmulps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
+
+#define VMADDSUBRDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
+#define VMADDSUBIDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
+#define VMULRDUPd(O,P,B,accum) "vmulpd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
+#define VMULIDUPd(O,P,B,accum) "vmulpd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
+
+/*
+ * TimesI is used only in the XP recon
+ * Could zero the regs and use RECON_ACCUM
+ */
+
 #define VTIMESI0f(A,DEST, Z) VSHUFf(A,DEST)
 #define VTIMESI1f(A,DEST, Z) "vaddps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
 #define VTIMESI2f(A,DEST, Z) "vsubps " #DEST "," #Z "," #DEST"{%k7}" ";\n"
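These broadcast and fmaddsub macros assemble a packed complex multiply: the imaginary-part duplicate multiplies a real/imaginary-swapped operand, then the real-part duplicate is fused in with vfmaddsub, which subtracts on even (real) lanes and adds on odd (imaginary) lanes. A scalar C++ check of that arithmetic, with one lane pair modelled by two doubles; nothing below is Grid code, only the identity being exploited.

#include <cassert>
#include <cmath>
#include <complex>
#include <cstdio>

int main() {
  std::complex<double> u(0.3, -1.2), chi(0.7, 2.5);

  // VSHUF: swap (re,im) of the operand
  const double shuf_re = chi.imag(), shuf_im = chi.real();

  // VMULIDUP: broadcast Im(u) against the shuffled operand
  const double tmp_re = u.imag() * shuf_re;   // ui*ci
  const double tmp_im = u.imag() * shuf_im;   // ui*cr

  // VMADDSUBRDUP: broadcast Re(u), fused multiply with chi;
  // the even (real) lane subtracts the accumulator, the odd (imag) lane adds it
  const double res_re = u.real() * chi.real() - tmp_re;  // ur*cr - ui*ci
  const double res_im = u.real() * chi.imag() + tmp_im;  // ur*ci + ui*cr

  const std::complex<double> ref = u * chi;
  assert(std::abs(res_re - ref.real()) < 1e-12);
  assert(std::abs(res_im - ref.imag()) < 1e-12);
  std::printf("u*chi = (%g, %g)\n", res_re, res_im);
  return 0;
}

One shuffle, one multiply and one fmaddsub per complex product is exactly the budget the SU(3) multiply macros below are built around.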
@@ -111,6 +136,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define VTIMESMINUSI1d(A,DEST,Z) "vsubpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
 #define VTIMESMINUSI2d(A,DEST,Z) "vaddpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"

+#if 0
+
 #define VACCTIMESMINUSI0f(A,ACC,tmp) VSHUFf(A,tmp)
 #define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
 #define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
@@ -127,6 +154,35 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define VACCTIMESI1d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
 #define VACCTIMESI2d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"

+#else
+
+// o_p must point to floating 1.0f/d
+//
+// Ai, Ar -> tmp (r i)
+// tmp *1.0
+// ACC i - Ar ; ACC r + Ai
+#define VACCTIMESMINUSI0f(A,ACC,tmp) VSHUFf(A,tmp)
+#define VACCTIMESMINUSI1f(A,ACC,tmp) VMADDMEMf(1,%r10,tmp,ACC)
+#define VACCTIMESMINUSI2f(A,ACC,tmp)
+
+#define VACCTIMESMINUSI0d(A,ACC,tmp) VSHUFd(A,tmp)
+#define VACCTIMESMINUSI1d(A,ACC,tmp) VMADDMEMd(1,%r10,tmp,ACC)
+#define VACCTIMESMINUSI2d(A,ACC,tmp)
+
+// Ai, Ar -> tmp (r i)
+// tmp *1.0
+// ACC i + Ar ; ACC r - Ai
+#define VACCTIMESI0f(A,ACC,tmp) VSHUFf(A,tmp)
+#define VACCTIMESI1f(A,ACC,tmp) VMADDMEMf(0,%r10,tmp,ACC)
+#define VACCTIMESI2f(A,ACC,tmp)
+
+#define VACCTIMESI0d(A,ACC,tmp) VSHUFd(A,tmp)
+#define VACCTIMESI1d(A,ACC,tmp) VMADDMEMd(0,%r10,tmp,ACC)
+#define VACCTIMESI2d(A,ACC,tmp)
+
+#endif
+
 #define VPERM0f(A,B) "vshuff32x4 $0x4e," #A "," #B "," #B ";\n"
 #define VPERM1f(A,B) "vshuff32x4 $0xb1," #A "," #B "," #B ";\n"
 #define VPERM2f(A,B) "vshufps $0x4e," #A "," #B "," #B ";\n"
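The alternative macros above replace the masked add/sub with one shuffle plus one fused multiply-add against a small sign table in memory (the pointer the kernels park in %r10 via LOAD64(%r10,isigns); compare the visign/vrsign splats added earlier). A scalar C++ check of the "accumulate A times plus/minus i" arithmetic follows; the exact table layout is assumed only for illustration.

#include <cassert>
#include <cmath>
#include <complex>

int main() {
  // assumed sign table: entry 0 ~ (-1,+1) for "accumulate +i*A",
  //                     entry 1 ~ (+1,-1) for "accumulate -i*A"
  const double signs[2][2] = { {-1.0, +1.0}, {+1.0, -1.0} };

  std::complex<double> A(0.4, -2.0), ACC(1.5, 0.25);
  const std::complex<double> ref_plus  = ACC + std::complex<double>(0.0,  1.0) * A;
  const std::complex<double> ref_minus = ACC + std::complex<double>(0.0, -1.0) * A;

  // VSHUF: tmp = (Ai, Ar) -- the imaginary part lands in the real slot and vice versa
  const double tmp_r = A.imag(), tmp_i = A.real();

  // VMADDMEM(0,%r10,tmp,ACC): ACC += signs[0] * tmp  => ACC r - Ai ; ACC i + Ar
  const double accp_r = ACC.real() + signs[0][0] * tmp_r;
  const double accp_i = ACC.imag() + signs[0][1] * tmp_i;
  assert(std::abs(accp_r - ref_plus.real()) < 1e-12);
  assert(std::abs(accp_i - ref_plus.imag()) < 1e-12);

  // VMADDMEM(1,%r10,tmp,ACC): ACC += signs[1] * tmp  => ACC r + Ai ; ACC i - Ar
  const double accm_r = ACC.real() + signs[1][0] * tmp_r;
  const double accm_i = ACC.imag() + signs[1][1] * tmp_i;
  assert(std::abs(accm_r - ref_minus.real()) < 1e-12);
  assert(std::abs(accm_i - ref_minus.imag()) < 1e-12);
  return 0;
}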
@@ -1,92 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/simd/Avx512Asm.h
-
-    Copyright (C) 2015
-
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef GRID_ASM_AV512_ADDSUB_H
-#define GRID_ASM_AV512_ADDSUB_H
-
-////////////////////////////////////////////////////////////////
-// Building blocks for SU3 x 2spinor
-// Load columns of U
-// 18 U DUP's rr/ii
-// 6 Chi shuffles ir,ri
-// 6muls, 30 fmaddsubs
-////////////////////////////////////////////////////////////////
-#define MULT_ADDSUB_2SPIN(ptr) \
-  LOAD64(%r8,ptr) \
-  __asm__ ( \
-  VMOVIDUPf(0,%r8,Z0 ) \
-  VMOVIDUPf(3,%r8,Z1 )\
-  VMOVIDUPf(6,%r8,Z2 )\
-  VSHUFf(Chi_00,T1) \
-  VSHUFf(Chi_10,T2) \
-  \
-  VMULf(Z0,T1,UChi_00) VMOVRDUPf(0,%r8,Z3 ) \
-  VMULf(Z0,T2,UChi_10) VMOVRDUPf(3,%r8,Z4 ) \
-  VMULf(Z1,T1,UChi_01) VMOVRDUPf(6,%r8,Z5 ) \
-  VMULf(Z1,T2,UChi_11) VMOVIDUPf(1,%r8,Z0 ) \
-  VMULf(Z2,T1,UChi_02) VMOVIDUPf(4,%r8,Z1 ) \
-  VMULf(Z2,T2,UChi_12) VMOVIDUPf(7,%r8,Z2 ) \
-  \
-  VMADDSUBf(Z3,Chi_00,UChi_00) VSHUFf(Chi_01,T1) \
-  VMADDSUBf(Z3,Chi_10,UChi_10) VSHUFf(Chi_11,T2) \
-  VMADDSUBf(Z4,Chi_00,UChi_01) VMOVRDUPf(1,%r8,Z3 ) \
-  VMADDSUBf(Z4,Chi_10,UChi_11)\
-  VMADDSUBf(Z5,Chi_00,UChi_02) VMOVRDUPf(4,%r8,Z4 ) \
-  VMADDSUBf(Z5,Chi_10,UChi_12)\
-  \
-  VMADDSUBf(Z0,T1,UChi_00) VMOVRDUPf(7,%r8,Z5 ) \
-  VMADDSUBf(Z0,T2,UChi_10)\
-  VMADDSUBf(Z1,T1,UChi_01) VMOVIDUPf(2,%r8,Z0 ) \
-  VMADDSUBf(Z1,T2,UChi_11)\
-  VMADDSUBf(Z2,T1,UChi_02) VMOVIDUPf(5,%r8,Z1 ) \
-  VMADDSUBf(Z2,T2,UChi_12) VMOVIDUPf(8,%r8,Z2 ) \
-  \
-  VMADDSUBf(Z3,Chi_01,UChi_00) VSHUFf(Chi_02,T1) \
-  VMADDSUBf(Z3,Chi_11,UChi_10) VSHUFf(Chi_12,T2) \
-  VMADDSUBf(Z4,Chi_01,UChi_01) VMOVRDUPf(2,%r8,Z3 ) \
-  VMADDSUBf(Z4,Chi_11,UChi_11)\
-  VMADDSUBf(Z5,Chi_01,UChi_02) VMOVRDUPf(5,%r8,Z4 ) \
-  VMADDSUBf(Z5,Chi_11,UChi_12)\
-  \
-  VMADDSUBf(Z0,T1,UChi_00) VMOVRDUPf(8,%r8,Z5 ) \
-  VMADDSUBf(Z0,T2,UChi_10)\
-  VMADDSUBf(Z1,T1,UChi_01)\
-  VMADDSUBf(Z1,T2,UChi_11)\
-  VMADDSUBf(Z2,T1,UChi_02)\
-  VMADDSUBf(Z2,T2,UChi_12)\
-  \
-  VMADDSUBf(Z3,Chi_02,UChi_00)\
-  VMADDSUBf(Z3,Chi_12,UChi_10)\
-  VMADDSUBf(Z4,Chi_02,UChi_01)\
-  VMADDSUBf(Z4,Chi_12,UChi_11)\
-  VMADDSUBf(Z5,Chi_02,UChi_02)\
-  VMADDSUBf(Z5,Chi_12,UChi_12)\
-  );
-
-#endif
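The op counts quoted in the deleted header's comment (18 link duplicates, 6 chi shuffles, 6 muls, 30 fmaddsubs) follow from a 3x3 complex link acting on two half-spinor colour columns. A scalar C++ model of that multiply which reproduces the tally; the counting rule is only a reading of the macro above, not Grid code, and the test values are arbitrary.

#include <complex>
#include <cstdio>

int main() {
  std::complex<double> U[3][3], chi[2][3], uchi[2][3];
  for (int a = 0; a < 3; a++)
    for (int b = 0; b < 3; b++) U[a][b] = {0.1 * (a + 1), 0.2 * (b + 1)};
  for (int s = 0; s < 2; s++)
    for (int b = 0; b < 3; b++) chi[s][b] = {1.0 + b, 0.5 - s};

  int dups = 0, shuffles = 0, muls = 0, fmaddsubs = 0;
  for (int a = 0; a < 3; a++)
    for (int b = 0; b < 3; b++) dups += 2;      // VMOVRDUP + VMOVIDUP per link entry
  for (int s = 0; s < 2; s++)
    for (int b = 0; b < 3; b++) shuffles += 1;  // one VSHUF per chi component

  for (int s = 0; s < 2; s++)
    for (int a = 0; a < 3; a++) {
      uchi[s][a] = 0.0;
      for (int b = 0; b < 3; b++) {
        uchi[s][a] += U[a][b] * chi[s][b];
        // per complex MAC: one packed op off the imag dup, one off the real dup;
        // only the first imag-dup product per output has nothing to accumulate into
        if (b == 0) { muls += 1; fmaddsubs += 1; } else { fmaddsubs += 2; }
      }
    }
  std::printf("dups=%d shuffles=%d muls=%d fmaddsubs=%d\n",
              dups, shuffles, muls, fmaddsubs);   // prints 18 6 6 30
  return 0;
}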
@@ -1,4 +1,4 @@
 /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

@@ -28,6 +28,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_ASM_INTEL_COMMON_512_H
 #define GRID_ASM_INTEL_COMMON_512_H

+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Peformance options
+////////////////////////////////////////////////////////////////////////////////////////////////////
+#undef AVX512_PF_L2_WRITE
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Opcodes common
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -37,6 +42,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 "mov $0x5555, %%eax \n"\
 "kmovw %%eax, %%k7 \n" : : : "%eax");

+//#define label(B) __asm__ ( __func__ _LINE__ #B ":\n" );
+
 #define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n"
 #define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n"

@@ -86,8 +93,16 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n"
 #define VMOVd(A,DEST) "vmovapd " #A ", " #DEST ";\n"

-#define VPREFETCHG(O,A)
+#define VPREFETCH1(O,A) "prefetcht0 "#O"*64("#A");\n"
+#define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n"
+#ifdef AVX512_PF_L2_WRITE
+#define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n"
+#else
 #define VPREFETCHW(O,A)
+#endif
+#define VPREFETCHNTA(O,A)
+#define VPREFETCH(O,A)
+
 #define VEVICT(O,A)

 //"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
@@ -123,8 +138,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
 #define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)

-#define VPREFETCHNTA(O,A)
-#define VPREFETCH(O,A)
-
 #define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
 #define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
@@ -133,3 +133,22 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
 #define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)

+#undef VRDUP
+#undef VIDUP
+#undef VMADDSUBMEM
+#undef VMADDMEM
+#undef VMULMEM
+#define VRDUP(SRC,DEST) VRDUPd(SRC,DEST)
+#define VIDUP(SRC,DEST) VIDUPd(SRC,DEST)
+#define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMd(O,P,B,accum)
+#define VMADDMEM(O,P,B,accum) VMADDMEMd(O,P,B,accum)
+#define VMULMEM(O,P,B,accum) VMULMEMd(O,P,B,accum)
+#undef VMADDSUBRDUP
+#undef VMADDSUBIDUP
+#undef VMULRDUP
+#undef VMULIDUP
+#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPd(O,P,B,accum)
+#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPd(O,P,B,accum)
+#define VMULRDUP(O,P,B,accum) VMULRDUPd(O,P,B,accum)
+#define VMULIDUP(O,P,B,accum) VMULIDUPd(O,P,B,accum)
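The prefetch macros above emit prefetcht0 / prefetcht1 / prefetchwt1 on 64-byte lines, with the write prefetch gated behind AVX512_PF_L2_WRITE. For experimenting with the same hints outside the asm kernels, __builtin_prefetch (GCC/Clang) is a rough portable stand-in; the loop and the prefetch distance below are illustrative only, not taken from Grid.

#include <cstddef>
#include <vector>

double sum_with_prefetch(const std::vector<double> &v) {
  double s = 0.0;
  const std::size_t lines_ahead = 8;                  // tunable distance, in 64-byte lines
  for (std::size_t i = 0; i < v.size(); ++i) {
    const std::size_t p = i + lines_ahead * 8;        // 8 doubles per cache line
    if (p < v.size())
      __builtin_prefetch(&v[p], /*rw=*/0, /*locality=*/3);  // roughly prefetcht0
    s += v[i];
  }
  return s;
}

int main() {
  std::vector<double> v(1 << 16, 1.0);
  return sum_with_prefetch(v) > 0.0 ? 0 : 1;
}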
@@ -116,7 +116,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define VMADDSUB(A,B,accum) VMADDSUBf(A,B,accum)
 #define VSHUF(A,B) VSHUFf(A,B)

 #undef ZEND1
 #undef ZEND2
 #undef ZLOAD
@@ -133,3 +132,24 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
 #define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)

+#undef VRDUP
+#undef VIDUP
+#undef VMADDSUBMEM
+#undef VMADDMEM
+#undef VMULMEM
+
+#define VRDUP(SRC,DEST) VRDUPf(SRC,DEST)
+#define VIDUP(SRC,DEST) VIDUPf(SRC,DEST)
+#define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMf(O,P,B,accum)
+#define VMADDMEM(O,P,B,accum) VMADDMEMf(O,P,B,accum)
+#define VMULMEM(O,P,B,accum) VMULMEMf(O,P,B,accum)
+
+#undef VMADDSUBRDUP
+#undef VMADDSUBIDUP
+#undef VMULRDUP
+#undef VMULIDUP
+#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPf(O,P,B,accum)
+#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPf(O,P,B,accum)
+#define VMULRDUP(O,P,B,accum) VMULRDUPf(O,P,B,accum)
+#define VMULIDUP(O,P,B,accum) VMULIDUPf(O,P,B,accum)
@@ -1,4 +1,4 @@
 /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

@@ -27,9 +27,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /* END LEGAL */
 #ifndef GRID_ASM_INTEL_512_QCD_H
 #define GRID_ASM_INTEL_512_QCD_H

 //////////////////////////////////////////////////////////////////////////////////////////
-// Register allocations for Wilson Kernel are precision and IMCI/AVX512 indept
+// Register allocations for Wilson Kernel are precision indept
 //////////////////////////////////////////////////////////////////////////////////////////
 #define result_00 %zmm0
 #define result_01 %zmm1
@@ -64,7 +64,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define UChi_12 %zmm23

 #define Uir %zmm24
-//#define ONE %zmm24
 #define Uri %zmm25
 #define T1 %zmm24
 #define T2 %zmm25
@@ -92,13 +91,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define Chimu_32 UChi_12

 #include <simd/Intel512common.h>
-#ifdef AVX512
 #include <simd/Intel512avx.h>
-//#include <simd/Intel512avxAddsub.h> // Alternate implementation
-#endif
-#ifdef IMCI
-#include <simd/Intel512imci.h>
-#endif

 //////////////////////////////////////////////////////////////////
 // Macros used to build wilson kernel -- can rationalise and simplify
@@ -111,7 +104,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define LOAD_CHI(PTR)  LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi );
 #define SAVE_UCHI(PTR) SAVE_UCHIi(PTR)
 #define SAVE_CHI(PTR)  SAVE_CHIi(PTR)
-#define SAVE_RESULT(PTR)  SAVE_RESULTi(PTR)
+#define SAVE_RESULT(PT,R) SAVE_RESULTi(PT,R)

 #define LOAD_CHIMUi \
   LOAD_CHIMU01i \
@@ -176,63 +169,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   VSTORE(5,%r8,Chi_12) \
   );

-#define SAVE_RESULTi(PTR)\
-  LOAD64(%r8,PTR) \
-  __asm__ ( \
-  VSTORE(0,%r8,result_00) \
-  VSTORE(1,%r8,result_01) \
-  VSTORE(2,%r8,result_02) \
-  VSTORE(3,%r8,result_10) \
-  VSTORE(4,%r8,result_11) \
-  VSTORE(5,%r8,result_12) \
-  VSTORE(6,%r8,result_20) \
-  VSTORE(7,%r8,result_21) \
-  VSTORE(8,%r8,result_22) \
-  VSTORE(9,%r8,result_30) \
-  VSTORE(10,%r8,result_31) \
-  VSTORE(11,%r8,result_32) \
-  );
-
-// auto ptr = &U._odata[sU](A);
-// A plan for lifting loads
-//  can use Z2/3/4/5/U/U for U field in first step.
-//  can use Chi_00, Chi_10, U U for U field in second step
-//  can use Chi_00, Chi_10, Chi_01,11, U U for U field in third step
-// Enables to lift ALL loads earlier by a few cycles and alleviate OoO pressure if needed.
-// KNL is DUAL issue for FP, and lifting these loads is potentially important.
-// Need detailed profile data to be sure.
-#if 0
-#define PREFETCH_U(A) \
-  LOAD64(%r8,&U._odata[sU](A)) \
-  __asm__ ( \
-  VPREFETCHG(0,%r8) \
-  VPREFETCHG(1,%r8) \
-  VPREFETCHG(2,%r8) \
-  VPREFETCHG(3,%r8) \
-  VPREFETCHG(4,%r8) \
-  VPREFETCHG(5,%r8) \
-  VPREFETCHG(6,%r8) \
-  VPREFETCHG(7,%r8) \
-  VPREFETCHG(8,%r8) );
-
-#define PREFETCH_R(A) \
-  LOAD64(%r8,&out._odata[ss]) \
-  __asm__ ( \
-  VPREFETCHW(0,%r8) \
-  VPREFETCHW(1,%r8) \
-  VPREFETCHW(2,%r8) \
-  VPREFETCHW(3,%r8) \
-  VPREFETCHW(4,%r8) \
-  VPREFETCHW(5,%r8) \
-  VPREFETCHW(6,%r8) \
-  VPREFETCHW(7,%r8) \
-  VPREFETCHW(8,%r8) \
-  VPREFETCHW(9,%r8) \
-  VPREFETCHW(10,%r8) \
-  VPREFETCHW(11,%r8) );
-#endif
-
-#define MULT_2SPIN_DIR(A) MULT_2SPIN(&U._odata[sU](A))
-
 #define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p)
 #define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN_PFYP(&U._odata[sU](A),p)
@@ -244,131 +180,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN_PFZM(&U._odata[sU](A),p)
 #define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN_PFTM(&U._odata[sU](A),p)

-#if 0
-#define MULT_2SPIN_UNOPT(ptr) \
-  LOAD64(%r8,ptr) \
-  __asm__ ( \
-  ZLOAD (0,%r8,UChi_01,UChi_11) \
-  ZLOAD (3,%r8,UChi_02,UChi_12) \
-  ZLOAD (6,%r8,Uri,Uir) \
-  ZMUL (UChi_01,UChi_11,Chi_00,UChi_00,Z0) \
-  ZMUL (UChi_01,UChi_11,Chi_10,UChi_10,Z1) \
-  ZMUL (UChi_02,UChi_12,Chi_00,UChi_01,Z2) \
-  ZMUL (UChi_02,UChi_12,Chi_10,UChi_11,Z3) \
-  ZMUL (Uri,Uir, Chi_00,UChi_02,Z4) \
-  ZMUL (Uri,Uir, Chi_10,UChi_12,Z5) \
-  \
-  ZLOAD (1,%r8,Uri,Uir) \
-  ZLOAD (4,%r8,Chi_00, Chi_10) \
-  ZMADD (Uri,Uir, Chi_01,UChi_00,Z0) \
-  ZMADD (Uri,Uir, Chi_11,UChi_10,Z1) \
-  ZLOAD (7,%r8,Uri,Uir) \
-  ZMADD (Chi_00, Chi_10,Chi_01,UChi_01,Z2) \
-  ZMADD (Chi_00, Chi_10,Chi_11,UChi_11,Z3) \
-  ZLOAD (2,%r8,Chi_00,Chi_10) \
-  ZMADD(Uri,Uir, Chi_01,UChi_02,Z4) \
-  ZMADD(Uri,Uir, Chi_11,UChi_12,Z5) \
-  \
-  ZLOAD (5,%r8,Uri,Uir) \
-  ZMADD (Chi_00,Chi_10, Chi_02,UChi_00,Z0) \
-  ZMADD (Chi_00,Chi_10, Chi_12,UChi_10,Z1) \
-  ZLOAD (8,%r8,Chi_00,Chi_10) \
-  ZMADD (Uri,Uir, Chi_02,UChi_01,Z2) \
-  ZMADD (Uri,Uir, Chi_12,UChi_11,Z3) \
-  ZMADD(Chi_00,Chi_10, Chi_02,UChi_02,Z4) \
-  ZMADD(Chi_00,Chi_10, Chi_12,UChi_12,Z5) \
-  \
-  ZEND1(UChi_00,Z0,Chi_01) \
-  ZEND1(UChi_10,Z1,Chi_11) \
-  ZEND1(UChi_01,Z2,Chi_00) \
-  ZEND1(UChi_11,Z3,Chi_10) \
-  ZEND1(UChi_02,Z4,Chi_02) \
-  ZEND1(UChi_12,Z5,Chi_12) \
-  ZEND2(UChi_00,Z0,Chi_01) \
-  ZEND2(UChi_10,Z1,Chi_11) \
-  ZEND2(UChi_01,Z2,Chi_00) \
-  ZEND2(UChi_11,Z3,Chi_10) \
-  ZEND2(UChi_02,Z4,Chi_02) \
-  ZEND2(UChi_12,Z5,Chi_12) );
-#endif
-
-#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr)
-#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr)
-#define MULT_2SPIN_PFZM(ptr,pf) MULT_2SPIN(ptr)
-#define MULT_2SPIN_PFTM(ptr,pf) MULT_2SPIN(ptr)
-#define MULT_2SPIN_PFTP(ptr,pf) MULT_2SPIN(ptr)
-#define MULT_2SPIN_PFZP(ptr,pf) MULT_2SPIN(ptr)
-#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr)
-#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr)
-
-// MULT_2SPINa(ptr) MULT_2SPIN_PF(ptr,ptr,VPREFETCHG);
-
-#if 0
-#define MULT_2SPIN_PF(ptr,pf,VPF) \
-  LOAD64(%r8,ptr) \
-  LOAD64(%r9,pf) \
-  __asm__ ( \
-  ZMULMEM2SP(0,%r8,Uri,Chi_00,Chi_10,UChi_00,Z0,UChi_10,Z1) \
-  VPF(0,%r9) \
-  ZMULMEM2SP(3,%r8,Uri,Chi_00,Chi_10,UChi_01,Z2,UChi_11,Z3) \
-  VPF(1,%r9) \
-  ZMULMEM2SP(6,%r8,Uri,Chi_00,Chi_10,UChi_02,Z4,UChi_12,Z5) \
-  VPF(2,%r9) \
-  \
-  ZMADDMEM2SP(1,%r8,Uri,Chi_01,Chi_11,UChi_00,Z0,UChi_10,Z1) \
-  VPF(3,%r9) \
-  ZMADDMEM2SP(4,%r8,Uri,Chi_01,Chi_11,UChi_01,Z2,UChi_11,Z3) \
-  VPF(4,%r9) \
-  ZMADDMEM2SP(7,%r8,Uri,Chi_01,Chi_11,UChi_02,Z4,UChi_12,Z5) \
-  VPF(5,%r9) \
-  \
-  ZMADDMEM2SP(2,%r8,Uri,Chi_02,Chi_12,UChi_00,Z0,UChi_10,Z1) \
-  VPF(6,%r9) \
-  ZMADDMEM2SP(5,%r8,Uri,Chi_02,Chi_12,UChi_01,Z2,UChi_11,Z3) \
-  VPF(7,%r9) \
-  ZMADDMEM2SP(8,%r8,Uri,Chi_02,Chi_12,UChi_02,Z4,UChi_12,Z5) \
-  VPF(8,%r9) \
-  \
-  ZEND1(UChi_00,Z0,Chi_01) \
-  ZEND1(UChi_10,Z1,Chi_11) \
-  ZEND1(UChi_01,Z2,Chi_00) \
-  ZEND1(UChi_11,Z3,Chi_10) \
-  VPF(9,%r9) \
-  ZEND1(UChi_02,Z4,Chi_02) \
-  ZEND1(UChi_12,Z5,Chi_12) \
-  ZEND2(UChi_00,Z0,Chi_01) \
-  ZEND2(UChi_10,Z1,Chi_11) \
-  VPF(10,%r9) \
-  ZEND2(UChi_01,Z2,Chi_00) \
-  ZEND2(UChi_11,Z3,Chi_10) \
-  ZEND2(UChi_02,Z4,Chi_02) \
-  VPF(11,%r9) \
-  ZEND2(UChi_12,Z5,Chi_12) );
-#endif
-
-#if 0
-#define MULT_2SPIN_PFNONE(ptr,pf,VPF) \
-  LOAD64(%r8,ptr) \
-  LOAD64(%r9,pf) \
-  __asm__ ( \
-  VPF(0,%r9) \
-  VPF(1,%r9) \
-  VPF(2,%r9) \
-  \
-  VPF(3,%r9) \
-  VPF(4,%r9) \
-  VPF(5,%r9) \
-  \
-  VPF(6,%r9) \
-  VPF(7,%r9) \
-  VPF(8,%r9) \
-  \
-  VPF(9,%r9) \
-  VPF(10,%r9) \
-  VPF(11,%r9) );
-#endif
-
-// Pretty much Perfectly Pipelined
+#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr,pf)
+#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr,pf)
+#define MULT_2SPIN_PFZM(ptr,pf) MULT_2SPIN(ptr,pf)
+#define MULT_2SPIN_PFTM(ptr,pf) MULT_2SPIN(ptr,pf)
+#define MULT_2SPIN_PFTP(ptr,pf) MULT_2SPIN(ptr,pf)
+#define MULT_2SPIN_PFZP(ptr,pf) MULT_2SPIN(ptr,pf)
+#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr,pf)
+#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr,pf)

 //////////////////////////////////////////////////////////////////
 // Dirac algebra
@@ -442,8 +261,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define XM_PROJMEM(PTR) \
   LOAD64(%r8,PTR)\
   __asm__ ( \
-  SHUF_CHIMU23i \
   LOAD_CHIi \
+  SHUF_CHIMU23i \
   VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30)\
   VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31)\
   VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32)\
@@ -471,8 +290,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define ZM_PROJMEM(PTR) \
   LOAD64(%r8,PTR) \
   __asm__ ( \
-  SHUF_CHIMU23i \
   LOAD_CHIi \
+  SHUF_CHIMU23i \
   VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20)\
   VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21)\
   VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22)\
@@ -490,7 +309,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   LOAD64(%r8,ptr) \
   __asm__ ( \
   LOAD_CHIMU01i \
-  VSUBMEM(6,%r8 ,Chimu_00,Chi_00) \
+  VSUBMEM(6,%r8,Chimu_00,Chi_00) \
   VSUBMEM(7,%r8,Chimu_01,Chi_01) \
   VSUBMEM(8,%r8,Chimu_02,Chi_02) \
   VSUBMEM(9,%r8,Chimu_10,Chi_10) \
@@ -503,18 +322,18 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 // fspin(3)=timesMinusI(hspin(0))
 #define XP_RECON __asm__ ( \
   VZERO(TMP) \
-  VMOV(UChi_00,result_00) \
-  VMOV(UChi_01,result_01) \
-  VMOV(UChi_02,result_02) \
-  VMOV(UChi_10,result_10) \
-  VMOV(UChi_11,result_11) \
-  VMOV(UChi_12,result_12) \
-  VTIMESMINUSI0(UChi_10,result_20,TMP) \
-  VTIMESMINUSI0(UChi_11,result_21,TMP) \
-  VTIMESMINUSI0(UChi_12,result_22,TMP) \
   VTIMESMINUSI0(UChi_00,result_30,TMP) \
+  VTIMESMINUSI0(UChi_10,result_20,TMP) \
   VTIMESMINUSI0(UChi_01,result_31,TMP) \
+  VTIMESMINUSI0(UChi_11,result_21,TMP) \
   VTIMESMINUSI0(UChi_02,result_32,TMP) \
+  VTIMESMINUSI0(UChi_12,result_22,TMP) \
+  VMOV(UChi_00,result_00) \
+  VMOV(UChi_10,result_10) \
+  VMOV(UChi_01,result_01) \
+  VMOV(UChi_11,result_11) \
+  VMOV(UChi_02,result_02) \
+  VMOV(UChi_12,result_12) \
   VTIMESMINUSI1(UChi_10,result_20,TMP) \
   VTIMESMINUSI1(UChi_11,result_21,TMP) \
   VTIMESMINUSI1(UChi_12,result_22,TMP) \
@ -531,24 +350,24 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
// NB could save 6 ops using addsub => 12 cycles
|
// NB could save 6 ops using addsub => 12 cycles
|
||||||
#define XP_RECON_ACCUM __asm__ ( \
|
#define XP_RECON_ACCUM __asm__ ( \
|
||||||
VZERO(TMP)\
|
VZERO(TMP)\
|
||||||
VADD(UChi_00,result_00,result_00)\
|
|
||||||
VADD(UChi_01,result_01,result_01)\
|
|
||||||
VADD(UChi_02,result_02,result_02)\
|
|
||||||
VADD(UChi_10,result_10,result_10)\
|
|
||||||
VADD(UChi_11,result_11,result_11)\
|
|
||||||
VADD(UChi_12,result_12,result_12)\
|
|
||||||
VACCTIMESMINUSI0(UChi_10,result_20,Z0)\
|
|
||||||
VACCTIMESMINUSI0(UChi_11,result_21,Z1)\
|
|
||||||
VACCTIMESMINUSI0(UChi_12,result_22,Z2)\
|
|
||||||
VACCTIMESMINUSI0(UChi_00,result_30,Z3)\
|
VACCTIMESMINUSI0(UChi_00,result_30,Z3)\
|
||||||
|
VACCTIMESMINUSI0(UChi_10,result_20,Z0)\
|
||||||
VACCTIMESMINUSI0(UChi_01,result_31,Z4)\
|
VACCTIMESMINUSI0(UChi_01,result_31,Z4)\
|
||||||
|
VACCTIMESMINUSI0(UChi_11,result_21,Z1)\
|
||||||
VACCTIMESMINUSI0(UChi_02,result_32,Z5)\
|
VACCTIMESMINUSI0(UChi_02,result_32,Z5)\
|
||||||
VACCTIMESMINUSI1(UChi_10,result_20,Z0)\
|
VACCTIMESMINUSI0(UChi_12,result_22,Z2)\
|
||||||
VACCTIMESMINUSI1(UChi_11,result_21,Z1)\
|
VADD(UChi_00,result_00,result_00)\
|
||||||
VACCTIMESMINUSI1(UChi_12,result_22,Z2)\
|
VADD(UChi_10,result_10,result_10)\
|
||||||
|
VADD(UChi_01,result_01,result_01)\
|
||||||
|
VADD(UChi_11,result_11,result_11)\
|
||||||
|
VADD(UChi_02,result_02,result_02)\
|
||||||
|
VADD(UChi_12,result_12,result_12)\
|
||||||
VACCTIMESMINUSI1(UChi_00,result_30,Z3)\
|
VACCTIMESMINUSI1(UChi_00,result_30,Z3)\
|
||||||
|
VACCTIMESMINUSI1(UChi_10,result_20,Z0)\
|
||||||
VACCTIMESMINUSI1(UChi_01,result_31,Z4)\
|
VACCTIMESMINUSI1(UChi_01,result_31,Z4)\
|
||||||
|
VACCTIMESMINUSI1(UChi_11,result_21,Z1)\
|
||||||
VACCTIMESMINUSI1(UChi_02,result_32,Z5)\
|
VACCTIMESMINUSI1(UChi_02,result_32,Z5)\
|
||||||
|
VACCTIMESMINUSI1(UChi_12,result_22,Z2)\
|
||||||
VACCTIMESMINUSI2(UChi_10,result_20,Z0)\
|
VACCTIMESMINUSI2(UChi_10,result_20,Z0)\
|
||||||
VACCTIMESMINUSI2(UChi_11,result_21,Z1)\
|
VACCTIMESMINUSI2(UChi_11,result_21,Z1)\
|
||||||
VACCTIMESMINUSI2(UChi_12,result_22,Z2)\
|
VACCTIMESMINUSI2(UChi_12,result_22,Z2)\
|
||||||
@ -559,24 +378,24 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
#define XM_RECON __asm__ ( \
|
#define XM_RECON __asm__ ( \
|
||||||
VZERO(TMP)\
|
VZERO(TMP)\
|
||||||
VMOV(UChi_00,result_00)\
|
|
||||||
VMOV(UChi_01,result_01)\
|
|
||||||
VMOV(UChi_02,result_02)\
|
|
||||||
VMOV(UChi_10,result_10)\
|
|
||||||
VMOV(UChi_11,result_11)\
|
|
||||||
VMOV(UChi_12,result_12)\
|
|
||||||
VTIMESI0(UChi_10,result_20,TMP)\
|
|
||||||
VTIMESI0(UChi_11,result_21,TMP)\
|
|
||||||
VTIMESI0(UChi_12,result_22,TMP)\
|
|
||||||
VTIMESI0(UChi_00,result_30,TMP)\
|
VTIMESI0(UChi_00,result_30,TMP)\
|
||||||
|
VTIMESI0(UChi_10,result_20,TMP)\
|
||||||
VTIMESI0(UChi_01,result_31,TMP)\
|
VTIMESI0(UChi_01,result_31,TMP)\
|
||||||
|
VTIMESI0(UChi_11,result_21,TMP)\
|
||||||
VTIMESI0(UChi_02,result_32,TMP)\
|
VTIMESI0(UChi_02,result_32,TMP)\
|
||||||
VTIMESI1(UChi_10,result_20,TMP)\
|
VTIMESI0(UChi_12,result_22,TMP)\
|
||||||
VTIMESI1(UChi_11,result_21,TMP)\
|
VMOV(UChi_00,result_00)\
|
||||||
VTIMESI1(UChi_12,result_22,TMP)\
|
VMOV(UChi_10,result_10)\
|
||||||
|
VMOV(UChi_01,result_01)\
|
||||||
|
VMOV(UChi_11,result_11)\
|
||||||
|
VMOV(UChi_02,result_02)\
|
||||||
|
VMOV(UChi_12,result_12)\
|
||||||
VTIMESI1(UChi_00,result_30,TMP)\
|
VTIMESI1(UChi_00,result_30,TMP)\
|
||||||
|
VTIMESI1(UChi_10,result_20,TMP)\
|
||||||
VTIMESI1(UChi_01,result_31,TMP)\
|
VTIMESI1(UChi_01,result_31,TMP)\
|
||||||
|
VTIMESI1(UChi_11,result_21,TMP)\
|
||||||
VTIMESI1(UChi_02,result_32,TMP)\
|
VTIMESI1(UChi_02,result_32,TMP)\
|
||||||
|
VTIMESI1(UChi_12,result_22,TMP)\
|
||||||
VTIMESI2(UChi_10,result_20,TMP)\
|
VTIMESI2(UChi_10,result_20,TMP)\
|
||||||
VTIMESI2(UChi_11,result_21,TMP)\
|
VTIMESI2(UChi_11,result_21,TMP)\
|
||||||
VTIMESI2(UChi_12,result_22,TMP)\
|
VTIMESI2(UChi_12,result_22,TMP)\
|
||||||
@ -586,23 +405,25 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
);
|
);
|
||||||
|
|
||||||
#define XM_RECON_ACCUM __asm__ ( \
|
#define XM_RECON_ACCUM __asm__ ( \
|
||||||
VADD(UChi_00,result_00,result_00)\
|
|
||||||
VADD(UChi_01,result_01,result_01)\
|
|
||||||
VADD(UChi_02,result_02,result_02)\
|
|
||||||
VADD(UChi_10,result_10,result_10)\
|
|
||||||
VADD(UChi_11,result_11,result_11)\
|
|
||||||
VADD(UChi_12,result_12,result_12)\
|
|
||||||
VACCTIMESI0(UChi_10,result_20,Z0)\
|
VACCTIMESI0(UChi_10,result_20,Z0)\
|
||||||
VACCTIMESI0(UChi_11,result_21,Z1)\
|
|
||||||
VACCTIMESI0(UChi_12,result_22,Z2)\
|
|
||||||
VACCTIMESI0(UChi_00,result_30,Z3)\
|
VACCTIMESI0(UChi_00,result_30,Z3)\
|
||||||
|
VACCTIMESI0(UChi_11,result_21,Z1)\
|
||||||
VACCTIMESI0(UChi_01,result_31,Z4)\
|
VACCTIMESI0(UChi_01,result_31,Z4)\
|
||||||
|
VACCTIMESI0(UChi_12,result_22,Z2)\
|
||||||
VACCTIMESI0(UChi_02,result_32,Z5)\
|
VACCTIMESI0(UChi_02,result_32,Z5)\
|
||||||
|
\
|
||||||
|
VADD(UChi_10,result_10,result_10)\
|
||||||
|
VADD(UChi_00,result_00,result_00)\
|
||||||
|
VADD(UChi_11,result_11,result_11)\
|
||||||
|
VADD(UChi_01,result_01,result_01)\
|
||||||
|
VADD(UChi_12,result_12,result_12)\
|
||||||
|
VADD(UChi_02,result_02,result_02)\
|
||||||
|
\
|
||||||
VACCTIMESI1(UChi_10,result_20,Z0)\
|
VACCTIMESI1(UChi_10,result_20,Z0)\
|
||||||
VACCTIMESI1(UChi_11,result_21,Z1)\
|
|
||||||
VACCTIMESI1(UChi_12,result_22,Z2)\
|
|
||||||
VACCTIMESI1(UChi_00,result_30,Z3)\
|
VACCTIMESI1(UChi_00,result_30,Z3)\
|
||||||
|
VACCTIMESI1(UChi_11,result_21,Z1)\
|
||||||
VACCTIMESI1(UChi_01,result_31,Z4)\
|
VACCTIMESI1(UChi_01,result_31,Z4)\
|
||||||
|
VACCTIMESI1(UChi_12,result_22,Z2)\
|
||||||
VACCTIMESI1(UChi_02,result_32,Z5)\
|
VACCTIMESI1(UChi_02,result_32,Z5)\
|
||||||
VACCTIMESI2(UChi_10,result_20,Z0)\
|
VACCTIMESI2(UChi_10,result_20,Z0)\
|
||||||
VACCTIMESI2(UChi_11,result_21,Z1)\
|
VACCTIMESI2(UChi_11,result_21,Z1)\
|
||||||
@ -614,10 +435,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
#define YP_RECON_ACCUM __asm__ ( \
|
#define YP_RECON_ACCUM __asm__ ( \
|
||||||
VADD(UChi_00,result_00,result_00)\
|
VADD(UChi_00,result_00,result_00)\
|
||||||
VADD(UChi_01,result_01,result_01)\
|
|
||||||
VADD(UChi_02,result_02,result_02)\
|
|
||||||
VADD(UChi_10,result_10,result_10)\
|
VADD(UChi_10,result_10,result_10)\
|
||||||
|
VADD(UChi_01,result_01,result_01)\
|
||||||
VADD(UChi_11,result_11,result_11)\
|
VADD(UChi_11,result_11,result_11)\
|
||||||
|
VADD(UChi_02,result_02,result_02)\
|
||||||
VADD(UChi_12,result_12,result_12)\
|
VADD(UChi_12,result_12,result_12)\
|
||||||
VADD(UChi_10,result_20,result_20)\
|
VADD(UChi_10,result_20,result_20)\
|
||||||
VADD(UChi_11,result_21,result_21)\
|
VADD(UChi_11,result_21,result_21)\
|
||||||
@ -628,10 +449,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
#define YM_RECON_ACCUM __asm__ ( \
|
#define YM_RECON_ACCUM __asm__ ( \
|
||||||
VADD(UChi_00,result_00,result_00)\
|
VADD(UChi_00,result_00,result_00)\
|
||||||
VADD(UChi_01,result_01,result_01)\
|
|
||||||
VADD(UChi_02,result_02,result_02)\
|
|
||||||
VADD(UChi_10,result_10,result_10)\
|
VADD(UChi_10,result_10,result_10)\
|
||||||
|
VADD(UChi_01,result_01,result_01)\
|
||||||
VADD(UChi_11,result_11,result_11)\
|
VADD(UChi_11,result_11,result_11)\
|
||||||
|
VADD(UChi_02,result_02,result_02)\
|
||||||
VADD(UChi_12,result_12,result_12)\
|
VADD(UChi_12,result_12,result_12)\
|
||||||
VSUB(UChi_10,result_20,result_20)\
|
VSUB(UChi_10,result_20,result_20)\
|
||||||
VSUB(UChi_11,result_21,result_21)\
|
VSUB(UChi_11,result_21,result_21)\
|
||||||
@ -641,23 +462,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
VADD(UChi_02,result_32,result_32) );
|
VADD(UChi_02,result_32,result_32) );
|
||||||
|
|
||||||
#define ZP_RECON_ACCUM __asm__ ( \
|
#define ZP_RECON_ACCUM __asm__ ( \
|
||||||
VADD(UChi_00,result_00,result_00)\
|
|
||||||
VADD(UChi_01,result_01,result_01)\
|
|
||||||
VADD(UChi_02,result_02,result_02)\
|
|
||||||
VADD(UChi_10,result_10,result_10)\
|
|
||||||
VADD(UChi_11,result_11,result_11)\
|
|
||||||
VADD(UChi_12,result_12,result_12)\
|
|
||||||
VACCTIMESMINUSI0(UChi_00,result_20,Z0)\
|
VACCTIMESMINUSI0(UChi_00,result_20,Z0)\
|
||||||
VACCTIMESMINUSI0(UChi_01,result_21,Z1)\
|
|
||||||
VACCTIMESMINUSI0(UChi_02,result_22,Z2)\
|
|
||||||
VACCTIMESI0(UChi_10,result_30,Z3)\
|
VACCTIMESI0(UChi_10,result_30,Z3)\
|
||||||
|
VACCTIMESMINUSI0(UChi_01,result_21,Z1)\
|
||||||
VACCTIMESI0(UChi_11,result_31,Z4)\
|
VACCTIMESI0(UChi_11,result_31,Z4)\
|
||||||
|
VACCTIMESMINUSI0(UChi_02,result_22,Z2)\
|
||||||
VACCTIMESI0(UChi_12,result_32,Z5)\
|
VACCTIMESI0(UChi_12,result_32,Z5)\
|
||||||
|
VADD(UChi_00,result_00,result_00)\
|
||||||
|
VADD(UChi_10,result_10,result_10)\
|
||||||
|
VADD(UChi_01,result_01,result_01)\
|
||||||
|
VADD(UChi_11,result_11,result_11)\
|
||||||
|
VADD(UChi_02,result_02,result_02)\
|
||||||
|
VADD(UChi_12,result_12,result_12)\
|
||||||
VACCTIMESMINUSI1(UChi_00,result_20,Z0)\
|
VACCTIMESMINUSI1(UChi_00,result_20,Z0)\
|
||||||
VACCTIMESMINUSI1(UChi_01,result_21,Z1)\
|
|
||||||
VACCTIMESMINUSI1(UChi_02,result_22,Z2)\
|
|
||||||
VACCTIMESI1(UChi_10,result_30,Z3)\
|
VACCTIMESI1(UChi_10,result_30,Z3)\
|
||||||
|
VACCTIMESMINUSI1(UChi_01,result_21,Z1)\
|
||||||
VACCTIMESI1(UChi_11,result_31,Z4)\
|
VACCTIMESI1(UChi_11,result_31,Z4)\
|
||||||
|
VACCTIMESMINUSI1(UChi_02,result_22,Z2)\
|
||||||
VACCTIMESI1(UChi_12,result_32,Z5)\
|
VACCTIMESI1(UChi_12,result_32,Z5)\
|
||||||
VACCTIMESMINUSI2(UChi_00,result_20,Z0)\
|
VACCTIMESMINUSI2(UChi_00,result_20,Z0)\
|
||||||
VACCTIMESMINUSI2(UChi_01,result_21,Z1)\
|
VACCTIMESMINUSI2(UChi_01,result_21,Z1)\
|
||||||
@ -668,23 +489,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
);
|
);
|
||||||
|
|
||||||
#define ZM_RECON_ACCUM __asm__ ( \
|
#define ZM_RECON_ACCUM __asm__ ( \
|
||||||
VADD(UChi_00,result_00,result_00)\
|
|
||||||
VADD(UChi_01,result_01,result_01)\
|
|
||||||
VADD(UChi_02,result_02,result_02)\
|
|
||||||
VADD(UChi_10,result_10,result_10)\
|
|
||||||
VADD(UChi_11,result_11,result_11)\
|
|
||||||
VADD(UChi_12,result_12,result_12)\
|
|
||||||
VACCTIMESI0(UChi_00,result_20,Z0)\
|
VACCTIMESI0(UChi_00,result_20,Z0)\
|
||||||
VACCTIMESI0(UChi_01,result_21,Z1)\
|
|
||||||
VACCTIMESI0(UChi_02,result_22,Z2)\
|
|
||||||
VACCTIMESMINUSI0(UChi_10,result_30,Z3)\
|
VACCTIMESMINUSI0(UChi_10,result_30,Z3)\
|
||||||
|
VACCTIMESI0(UChi_01,result_21,Z1)\
|
||||||
VACCTIMESMINUSI0(UChi_11,result_31,Z4)\
|
VACCTIMESMINUSI0(UChi_11,result_31,Z4)\
|
||||||
|
VACCTIMESI0(UChi_02,result_22,Z2)\
|
||||||
VACCTIMESMINUSI0(UChi_12,result_32,Z5)\
|
VACCTIMESMINUSI0(UChi_12,result_32,Z5)\
|
||||||
|
VADD(UChi_00,result_00,result_00)\
|
||||||
|
VADD(UChi_10,result_10,result_10)\
|
||||||
|
VADD(UChi_01,result_01,result_01)\
|
||||||
|
VADD(UChi_11,result_11,result_11)\
|
||||||
|
VADD(UChi_02,result_02,result_02)\
|
||||||
|
VADD(UChi_12,result_12,result_12)\
|
||||||
VACCTIMESI1(UChi_00,result_20,Z0)\
|
VACCTIMESI1(UChi_00,result_20,Z0)\
|
||||||
VACCTIMESI1(UChi_01,result_21,Z1)\
|
|
||||||
VACCTIMESI1(UChi_02,result_22,Z2)\
|
|
||||||
VACCTIMESMINUSI1(UChi_10,result_30,Z3)\
|
VACCTIMESMINUSI1(UChi_10,result_30,Z3)\
|
||||||
|
VACCTIMESI1(UChi_01,result_21,Z1)\
|
||||||
VACCTIMESMINUSI1(UChi_11,result_31,Z4)\
|
VACCTIMESMINUSI1(UChi_11,result_31,Z4)\
|
||||||
|
VACCTIMESI1(UChi_02,result_22,Z2)\
|
||||||
VACCTIMESMINUSI1(UChi_12,result_32,Z5)\
|
VACCTIMESMINUSI1(UChi_12,result_32,Z5)\
|
||||||
VACCTIMESI2(UChi_00,result_20,Z0)\
|
VACCTIMESI2(UChi_00,result_20,Z0)\
|
||||||
VACCTIMESI2(UChi_01,result_21,Z1)\
|
VACCTIMESI2(UChi_01,result_21,Z1)\
|
||||||
@ -696,35 +517,121 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
#define TP_RECON_ACCUM __asm__ ( \
|
#define TP_RECON_ACCUM __asm__ ( \
|
||||||
VADD(UChi_00,result_00,result_00)\
|
VADD(UChi_00,result_00,result_00)\
|
||||||
VADD(UChi_01,result_01,result_01)\
|
|
||||||
VADD(UChi_02,result_02,result_02)\
|
|
||||||
VADD(UChi_10,result_10,result_10)\
|
VADD(UChi_10,result_10,result_10)\
|
||||||
|
VADD(UChi_01,result_01,result_01)\
|
||||||
VADD(UChi_11,result_11,result_11)\
|
VADD(UChi_11,result_11,result_11)\
|
||||||
|
VADD(UChi_02,result_02,result_02)\
|
||||||
VADD(UChi_12,result_12,result_12)\
|
VADD(UChi_12,result_12,result_12)\
|
||||||
VADD(UChi_00,result_20,result_20)\
|
VADD(UChi_00,result_20,result_20)\
|
||||||
VADD(UChi_01,result_21,result_21)\
|
|
||||||
VADD(UChi_02,result_22,result_22)\
|
|
||||||
VADD(UChi_10,result_30,result_30)\
|
VADD(UChi_10,result_30,result_30)\
|
||||||
|
VADD(UChi_01,result_21,result_21)\
|
||||||
VADD(UChi_11,result_31,result_31)\
|
VADD(UChi_11,result_31,result_31)\
|
||||||
|
VADD(UChi_02,result_22,result_22)\
|
||||||
VADD(UChi_12,result_32,result_32) );
|
VADD(UChi_12,result_32,result_32) );
|
||||||
|
|
||||||
#define TM_RECON_ACCUM __asm__ ( \
|
#define TM_RECON_ACCUM __asm__ ( \
|
||||||
VADD(UChi_00,result_00,result_00)\
|
VADD(UChi_00,result_00,result_00)\
|
||||||
VADD(UChi_01,result_01,result_01)\
|
|
||||||
VADD(UChi_02,result_02,result_02)\
|
|
||||||
VADD(UChi_10,result_10,result_10)\
|
VADD(UChi_10,result_10,result_10)\
|
||||||
|
VADD(UChi_01,result_01,result_01)\
|
||||||
VADD(UChi_11,result_11,result_11)\
|
VADD(UChi_11,result_11,result_11)\
|
||||||
|
VADD(UChi_02,result_02,result_02)\
|
||||||
VADD(UChi_12,result_12,result_12)\
|
VADD(UChi_12,result_12,result_12)\
|
||||||
VSUB(UChi_00,result_20,result_20)\
|
VSUB(UChi_00,result_20,result_20)\
|
||||||
VSUB(UChi_01,result_21,result_21)\
|
|
||||||
VSUB(UChi_02,result_22,result_22)\
|
|
||||||
VSUB(UChi_10,result_30,result_30)\
|
VSUB(UChi_10,result_30,result_30)\
|
||||||
|
VSUB(UChi_01,result_21,result_21)\
|
||||||
VSUB(UChi_11,result_31,result_31)\
|
VSUB(UChi_11,result_31,result_31)\
|
||||||
|
VSUB(UChi_02,result_22,result_22)\
|
||||||
VSUB(UChi_12,result_32,result_32) );
|
VSUB(UChi_12,result_32,result_32) );
|
||||||
|
|
||||||
//define PREFETCH_CHIMU(A)
|
#define AVX512_PF_L1
|
||||||
|
#define AVX512_PF_L2_GAUGE
|
||||||
|
#define AVX512_PF_L2_TABLE
|
||||||
|
#undef AVX512_PF_L2_LINEAR
|
||||||
|
|
||||||
#define PERMUTE_DIR0 __asm__ ( \
|
#ifdef AVX512_PF_L2_TABLE
|
||||||
|
// P1 Fetches the base pointer for next link into L1 with P1
|
||||||
|
// M1 Fetches the next site pointer into L2
|
||||||
|
#define VPREFETCH_P1(A,B) VPREFETCH1(A,B)
|
||||||
|
#define VPREFETCH_P2(A,B)
|
||||||
|
#define VPREFETCH_M1(A,B) VPREFETCH2(A,B)
|
||||||
|
#define VPREFETCH_M2(A,B)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef AVX512_PF_L2_LINEAR
|
||||||
|
#define VPREFETCH_M1(A,B) VPREFETCH1(A,B)
|
||||||
|
#define VPREFETCH_M2(A,B) VPREFETCH2(A,B)
|
||||||
|
#define VPREFETCH_P1(A,B)
|
||||||
|
#define VPREFETCH_P2(A,B)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef AVX512_PF_L2_GAUGE
|
||||||
|
#define VPREFETCH_G1(A,B) VPREFETCH1(A,B)
|
||||||
|
#define VPREFETCH_G2(A,B) VPREFETCH2(A,B)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define PF_GAUGE(A) \
|
||||||
|
LOAD64(%r8,&U._odata[sU](A)) \
|
||||||
|
__asm__ ( \
|
||||||
|
VPREFETCH_G1(0,%r8) VPREFETCH_G1(1,%r8) \
|
||||||
|
VPREFETCH_G1(2,%r8) VPREFETCH_G1(3,%r8) \
|
||||||
|
);
|
||||||
|
|
||||||
|
#define SAVE_RESULTi(PTR,pf) \
|
||||||
|
LOAD64(%r8,PTR) \
|
||||||
|
LOAD64(%r9,pf) \
|
||||||
|
__asm__ ( \
|
||||||
|
VSTORE(0,%r8,result_00) VPREFETCH_M1(0,%r9) \
|
||||||
|
VSTORE(1,%r8,result_01) VPREFETCH_M1(1,%r9) \
|
||||||
|
VSTORE(2,%r8,result_02) VPREFETCH_M1(2,%r9) \
|
||||||
|
VSTORE(3,%r8,result_10) VPREFETCH_M1(3,%r9) \
|
||||||
|
VSTORE(4,%r8,result_11) VPREFETCH_M1(4,%r9) \
|
||||||
|
VSTORE(5,%r8,result_12) VPREFETCH_M1(5,%r9) \
|
||||||
|
VSTORE(6,%r8,result_20) VPREFETCH_M1(6,%r9) \
|
||||||
|
VSTORE(7,%r8,result_21) VPREFETCH_M1(7,%r9) \
|
||||||
|
VSTORE(8,%r8,result_22) VPREFETCH_M1(8,%r9) \
|
||||||
|
VSTORE(9,%r8,result_30) VPREFETCH_M1(9,%r9) \
|
||||||
|
VSTORE(10,%r8,result_31) VPREFETCH_M1(10,%r9) \
|
||||||
|
VSTORE(11,%r8,result_32) VPREFETCH_M1(11,%r9) \
|
||||||
|
);
|
||||||
|
|
||||||
|
#ifdef AVX512_PF_L2_TABLE
|
||||||
|
#define PREFETCH_CHIMU(A) \
|
||||||
|
LOAD64(%r9,A) \
|
||||||
|
__asm__ ( \
|
||||||
|
VPREFETCH_P1(0,%r9) \
|
||||||
|
VPREFETCH_P1(1,%r9) \
|
||||||
|
VPREFETCH_P1(2,%r9) \
|
||||||
|
VPREFETCH_P1(3,%r9) \
|
||||||
|
VPREFETCH_P1(4,%r9) \
|
||||||
|
VPREFETCH_P1(5,%r9) \
|
||||||
|
VPREFETCH_P1(6,%r9) \
|
||||||
|
VPREFETCH_P1(7,%r9) \
|
||||||
|
VPREFETCH_P1(8,%r9) \
|
||||||
|
VPREFETCH_P1(9,%r9) \
|
||||||
|
VPREFETCH_P1(10,%r9) \
|
||||||
|
VPREFETCH_P1(11,%r9));
|
||||||
|
|
||||||
|
#else
|
||||||
|
#define PREFETCH_CHIMU(A)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define PREFETCH1_CHIMU(A) \
|
||||||
|
LOAD64(%r9,A) \
|
||||||
|
__asm__ ( \
|
||||||
|
VPREFETCH_P1(0,%r9) \
|
||||||
|
VPREFETCH_P1(1,%r9) \
|
||||||
|
VPREFETCH_P1(2,%r9) \
|
||||||
|
VPREFETCH_P1(3,%r9) \
|
||||||
|
VPREFETCH_P1(4,%r9) \
|
||||||
|
VPREFETCH_P1(5,%r9) \
|
||||||
|
VPREFETCH_P1(6,%r9) \
|
||||||
|
VPREFETCH_P1(7,%r9) \
|
||||||
|
VPREFETCH_P1(8,%r9) \
|
||||||
|
VPREFETCH_P1(9,%r9) \
|
||||||
|
VPREFETCH_P1(10,%r9) \
|
||||||
|
VPREFETCH_P1(11,%r9));
|
||||||
|
|
||||||
|
#define PERMUTE_DIR0 __asm__ ( \
|
||||||
VPERM0(Chi_00,Chi_00) \
|
VPERM0(Chi_00,Chi_00) \
|
||||||
VPERM0(Chi_01,Chi_01) \
|
VPERM0(Chi_01,Chi_01) \
|
||||||
VPERM0(Chi_02,Chi_02) \
|
VPERM0(Chi_02,Chi_02) \
|
||||||
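The AVX512_PF_* switches above choose, at preprocessor time, which of the VPREFETCH_P*/VPREFETCH_M*/VPREFETCH_G* helpers expand to real prefetch instructions and which compile away to nothing. A minimal stand-alone sketch of the same pattern in plain C intrinsics (the PF_L2_TABLE toggle and PREFETCH_NEXT_SITE name are illustrative, not Grid's actual macros):

/* Minimal sketch, assuming a single PF_L2_TABLE policy toggle: a #define decides
   whether a prefetch helper emits a hint or emits nothing at all. */
#include <xmmintrin.h>

#define PF_L2_TABLE 1

#ifdef PF_L2_TABLE
#define PREFETCH_NEXT_SITE(p) _mm_prefetch((const char *)(p), _MM_HINT_T1) /* pull into L2 */
#else
#define PREFETCH_NEXT_SITE(p) /* compiled away: no instruction emitted */
#endif

void warm_next_site(const double *next_site)
{
  PREFETCH_NEXT_SITE(next_site); /* pure hint: correctness never depends on it */
}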
@@ -756,65 +663,245 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 VPERM3(Chi_11,Chi_11) \
 VPERM3(Chi_12,Chi_12) );

-#define MULT_ADDSUB_2SPIN1(ptr) \
-LOAD64(%r8,ptr)
-/*
- * __asm__ ( \
-  );
-  VMUL(Z0,%zmm2,%zmm3) \
-*/
-#define MULT_ADDSUB_2SPIN(ptr) \
-LOAD64(%r8,ptr) \
-__asm__ ( \
-VMOVIDUP(0,%r8,Z0 ) \
-VMOVIDUP(3,%r8,Z1 )\
-VMOVIDUP(6,%r8,Z2 )\
-VSHUF(Chi_00,T1) \
-VSHUF(Chi_10,T2) \
-\
-VMUL(Z0,T1,UChi_00) VMOVRDUP(0,%r8,Z3 ) \
-VMUL(Z0,T2,UChi_10) VMOVRDUP(3,%r8,Z4 ) \
-VMUL(Z1,T1,UChi_01) VMOVRDUP(6,%r8,Z5 ) \
-VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0 ) \
-VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1 ) \
-VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2 ) \
-\
-VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \
-VMADDSUB(Z3,Chi_10,UChi_10) VSHUF(Chi_11,T2) \
-VMADDSUB(Z4,Chi_00,UChi_01) VMOVRDUP(1,%r8,Z3 ) \
-VMADDSUB(Z4,Chi_10,UChi_11)\
-VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 ) \
-VMADDSUB(Z5,Chi_10,UChi_12)\
-\
-VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 ) \
-VMADDSUB(Z0,T2,UChi_10)\
-VMADDSUB(Z1,T1,UChi_01) VMOVIDUP(2,%r8,Z0 ) \
-VMADDSUB(Z1,T2,UChi_11)\
-VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1 ) \
-VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2 ) \
-\
-VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1) \
-VMADDSUB(Z3,Chi_11,UChi_10) VSHUF(Chi_12,T2) \
-VMADDSUB(Z4,Chi_01,UChi_01) VMOVRDUP(2,%r8,Z3 ) \
-VMADDSUB(Z4,Chi_11,UChi_11)\
-VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \
-VMADDSUB(Z5,Chi_11,UChi_12)\
-\
-VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \
-VMADDSUB(Z0,T2,UChi_10)\
-VMADDSUB(Z1,T1,UChi_01)\
-VMADDSUB(Z1,T2,UChi_11)\
-VMADDSUB(Z2,T1,UChi_02)\
-VMADDSUB(Z2,T2,UChi_12)\
-\
-VMADDSUB(Z3,Chi_02,UChi_00)\
-VMADDSUB(Z3,Chi_12,UChi_10)\
-VMADDSUB(Z4,Chi_02,UChi_01)\
-VMADDSUB(Z4,Chi_12,UChi_11)\
-VMADDSUB(Z5,Chi_02,UChi_02)\
-VMADDSUB(Z5,Chi_12,UChi_12)\
-);
-
-#define MULT_2SPIN(ptr) MULT_ADDSUB_2SPIN(ptr)
+#define MULT_ADDSUB_2SPIN(ptr,pf) \
+LOAD64(%r8,ptr) \
+LOAD64(%r9,pf) \
+__asm__ ( \
+VPREFETCH_G2(9,%r8) \
+VPREFETCH_G2(10,%r8) \
+VPREFETCH_G2(11,%r8) \
+VPREFETCH_G2(12,%r8) \
+VPREFETCH_G2(13,%r8) \
+VPREFETCH_G2(14,%r8) \
+VPREFETCH_G2(15,%r8) \
+VPREFETCH_G2(16,%r8) \
+VPREFETCH_G2(17,%r8) \
+VSHUF(Chi_00,T1) \
+VMOVIDUP(0,%r8,Z0 ) \
+VMOVIDUP(3,%r8,Z1 ) \
+VMOVIDUP(6,%r8,Z2 ) VSHUF(Chi_10,T2) \
+/*6*/ \
+VMUL(Z0,T1,UChi_00) VMOVRDUP(0,%r8,Z3 ) \
+VMUL(Z0,T2,UChi_10) VMOVRDUP(3,%r8,Z4 ) \
+VMUL(Z1,T1,UChi_01) VMOVRDUP(6,%r8,Z5 ) \
+VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0 ) \
+VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1 ) \
+VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2 ) \
+VPREFETCH_M1(0,%r9) \
+VPREFETCH_M1(1,%r9) \
+VPREFETCH_M1(2,%r9) \
+VPREFETCH_M1(3,%r9) \
+/*18*/ \
+VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \
+VMADDSUB(Z3,Chi_10,UChi_10) \
+VMADDSUB(Z4,Chi_00,UChi_01) VMOVRDUP(1,%r8,Z3 ) \
+VMADDSUB(Z4,Chi_10,UChi_11) VSHUF(Chi_11,T2) \
+VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 ) \
+VMADDSUB(Z5,Chi_10,UChi_12) \
+VPREFETCH_M1(4,%r9) \
+VPREFETCH_M1(5,%r9) \
+VPREFETCH_M1(6,%r9) \
+VPREFETCH_M1(7,%r9) \
+/*28*/ \
+VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 ) \
+VMADDSUB(Z0,T2,UChi_10) \
+VMADDSUB(Z1,T1,UChi_01) VMOVIDUP(2,%r8,Z0 ) \
+VMADDSUB(Z1,T2,UChi_11) \
+VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1 ) \
+VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2 ) \
+VPREFETCH2(12,%r9) \
+VPREFETCH2(13,%r9) \
+VPREFETCH2(14,%r9) \
+VPREFETCH2(15,%r9) \
+VPREFETCH2(16,%r9) \
+VPREFETCH2(17,%r9) \
+VPREFETCH2(18,%r9) \
+VPREFETCH2(19,%r9) \
+VPREFETCH2(20,%r9) \
+VPREFETCH2(21,%r9) \
+VPREFETCH2(22,%r9) \
+VPREFETCH2(23,%r9) \
+/*38*/ \
+VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1) \
+VMADDSUB(Z3,Chi_11,UChi_10) \
+VMADDSUB(Z4,Chi_01,UChi_01) VMOVRDUP(2,%r8,Z3 ) \
+VMADDSUB(Z4,Chi_11,UChi_11) VSHUF(Chi_12,T2) \
+VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \
+VMADDSUB(Z5,Chi_11,UChi_12) \
+VPREFETCH_M1(9,%r8) \
+VPREFETCH_M1(10,%r8) \
+VPREFETCH_M1(11,%r8) \
+VPREFETCH_M1(12,%r8) \
+VPREFETCH_M1(13,%r8) \
+VPREFETCH_M1(14,%r8) \
+VPREFETCH_M1(15,%r8) \
+VPREFETCH_M1(16,%r8) \
+VPREFETCH_M1(17,%r8) \
+/*48*/ \
+VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \
+VMADDSUB(Z0,T2,UChi_10) \
+VMADDSUB(Z1,T1,UChi_01) \
+VMADDSUB(Z1,T2,UChi_11) \
+VMADDSUB(Z2,T1,UChi_02) \
+VMADDSUB(Z2,T2,UChi_12) \
+VPREFETCH_M1(8,%r9) \
+VPREFETCH_M1(9,%r9) \
+VPREFETCH_M1(10,%r9) \
+VPREFETCH_M1(11,%r9) \
+/*55*/ \
+VMADDSUB(Z3,Chi_02,UChi_00) \
+VMADDSUB(Z3,Chi_12,UChi_10) \
+VMADDSUB(Z4,Chi_02,UChi_01) \
+VMADDSUB(Z4,Chi_12,UChi_11) \
+VMADDSUB(Z5,Chi_02,UChi_02) \
+VMADDSUB(Z5,Chi_12,UChi_12) \
+/*61 insns*/ );
+
+#define MULT_ADDSUB_2SPIN_LS(ptr,pf) \
+LOAD64(%r8,ptr) \
+LOAD64(%r9,pf) \
+__asm__ ( \
+VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \
+VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \
+VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \
+VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \
+VPREFETCH_M1(0,%r9) \
+VPREFETCH_M1(1,%r9) \
+VPREFETCH_M1(2,%r9) \
+VPREFETCH_M1(3,%r9) \
+/*8*/ \
+VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \
+VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
+VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
+VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
+VPREFETCH_M1(4,%r9) \
+VPREFETCH_M1(5,%r9) \
+VPREFETCH_M1(6,%r9) \
+VPREFETCH_M1(7,%r9) \
+/*16*/ \
+VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \
+VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \
+VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \
+VPREFETCH_M1(8,%r9) \
+VPREFETCH_M1(9,%r9) \
+VPREFETCH_M1(10,%r9) \
+VPREFETCH_M1(11,%r9) \
+/*22*/ \
+VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \
+VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
+VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
+VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
+VPREFETCH_M2(12,%r9) \
+VPREFETCH_M2(13,%r9) \
+VPREFETCH_M2(14,%r9) \
+VPREFETCH_M2(15,%r9) \
+/*30*/ \
+VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \
+VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \
+VPREFETCH_M2(16,%r9) \
+VPREFETCH_M2(17,%r9) \
+VPREFETCH_M2(18,%r9) \
+VPREFETCH_M2(19,%r9) \
+VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \
+/*36*/ \
+VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
+VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
+VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
+VPREFETCH_M2(20,%r9) \
+VPREFETCH_M2(21,%r9) \
+VPREFETCH_M2(22,%r9) \
+VPREFETCH_M2(23,%r9) \
+VPREFETCH_G1(2,%r8) \
+VPREFETCH_G1(3,%r8) \
+VPREFETCH_G2(4,%r8) \
+VPREFETCH_G2(5,%r8) \
+VPREFETCH_G2(6,%r8) \
+VPREFETCH_G2(7,%r8) \
+/*42 insns*/ );
+
+#define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) \
+LOAD64(%r8,ptr) \
+LOAD64(%r9,pf) \
+__asm__ ( \
+VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \
+VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \
+VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \
+VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \
+/*8*/ \
+VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \
+VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
+VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
+VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
+/*16*/ \
+VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \
+VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \
+VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \
+/*22*/ \
+VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \
+VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
+VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
+VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
+/*30*/ \
+VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \
+VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \
+VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \
+/*36*/ \
+VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
+VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
+VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
+/* VPREFETCH1(2,%r8)*/ \
+/* VPREFETCH1(3,%r8)*/ \
+/*42 insns*/ );
+
+#define Z6 Chi_00
+#define MULT_ADDSUB_2SPIN_NEW(ptr,pf) \
+LOAD64(%r8,ptr) \
+__asm__ ( \
+VSHUFMEM(0,%r8,Z0) \
+VRDUP(Chi_00,T1) VIDUP(Chi_00,Chi_00) \
+VRDUP(Chi_10,T2) VIDUP(Chi_10,Chi_10) \
+VMUL(Z0,Chi_00,Z1) VMUL(Z0,Chi_10,Z2) \
+VSHUFMEM(3,%r8,Z0) \
+VMUL(Z0,Chi_00,Z3) VMUL(Z0,Chi_10,Z4) \
+VSHUFMEM(6,%r8,Z0) \
+VMUL(Z0,Chi_00,Z5) VMUL(Z0,Chi_10,Z6) \
+VMULMEM(0,%r8,T1,UChi_00) VMULMEM(0,%r8,T2,UChi_10) \
+VMULMEM(3,%r8,T1,UChi_01) VMULMEM(3,%r8,T2,UChi_11) \
+VMULMEM(6,%r8,T1,UChi_02) VMULMEM(6,%r8,T2,UChi_12) \
+/*11 cycles*/ \
+VSHUFMEM(1,%r8,Z0) \
+VRDUP(Chi_01,T1) VIDUP(Chi_01,Chi_01) \
+VRDUP(Chi_11,T2) VIDUP(Chi_11,Chi_11) \
+VMADD(Z0,Chi_01,Z1) VMADD(Z0,Chi_11,Z2) \
+VSHUFMEM(4,%r8,Z0) \
+VMADD(Z0,Chi_01,Z3) VMADD(Z0,Chi_11,Z4) \
+VSHUFMEM(7,%r8,Z0) \
+VMADD(Z0,Chi_01,Z5) VMADD(Z0,Chi_11,Z6) \
+VMADDMEM(1,%r8,T1,UChi_00) VMADDMEM(1,%r8,T2,UChi_10) \
+VMADDMEM(4,%r8,T1,UChi_01) VMADDMEM(4,%r8,T2,UChi_11) \
+VMADDMEM(7,%r8,T1,UChi_02) VMADDMEM(7,%r8,T2,UChi_12) \
+/*22 cycles*/ \
+VSHUFMEM(2,%r8,Z0) \
+VRDUP(Chi_02,T1) VIDUP(Chi_02,Chi_02) \
+VRDUP(Chi_12,T2) VIDUP(Chi_12,Chi_12) \
+VMADD(Z0,Chi_02,Z1) VMADD(Z0,Chi_12,Z2) \
+VSHUFMEM(5,%r8,Z0) \
+VMADD(Z0,Chi_02,Z3) VMADD(Z0,Chi_12,Z4) \
+VSHUFMEM(8,%r8,Z0) \
+VMADD(Z0,Chi_02,Z5) VMADD(Z0,Chi_12,Z6) \
+/*33 cycles*/ \
+VMADDSUBMEM(2,%r8,T1,Z1) VMADDSUBMEM(2,%r8,T2,Z2) \
+VMADDSUBMEM(5,%r8,T1,Z3) VMADDSUBMEM(5,%r8,T2,Z4) \
+VMADDSUBMEM(8,%r8,T1,Z5) VMADDSUBMEM(8,%r8,T2,Z6) \
+/*stall*/ \
+/*stall*/ \
+/*stall*/ \
+VADD(Z1,UChi_00,UChi_00) VADD(Z2,UChi_10,UChi_10) \
+VADD(Z3,UChi_01,UChi_01) VADD(Z4,UChi_11,UChi_11) \
+VADD(Z5,UChi_02,UChi_02) VADD(Z6,UChi_12,UChi_12) )

 #endif
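The VSHUF / VMULIDUP / VMADDSUBRDUP pairs in the 2-spin multiply macros above perform complex multiplication on interleaved (re,im) lanes using only real-part and imaginary-part broadcasts plus a fused multiply-add/subtract. A stand-alone sketch of that trick with AVX-512 intrinsics (the cmul helper and its data layout are illustrative, not the macros' exact expansion):

/* Sketch: multiply a vector of interleaved complex doubles by one complex
   coefficient (u_re + i*u_im) with a single permute, one multiply and one
   fmaddsub, mirroring the shuffle/duplicate pattern used above. */
#include <immintrin.h>

static inline __m512d cmul(__m512d x, double u_re, double u_im)
{
  __m512d x_swapped = _mm512_permute_pd(x, 0x55);               /* (im,re) in each pair      */
  __m512d t = _mm512_mul_pd(_mm512_set1_pd(u_im), x_swapped);   /* (x.im*u.im, x.re*u.im)    */
  /* even lanes: x.re*u.re - x.im*u.im ; odd lanes: x.im*u.re + x.re*u.im */
  return _mm512_fmaddsub_pd(_mm512_set1_pd(u_re), x, t);
}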
0 lib/stencil/.dirstamp (new file)
@@ -49,16 +49,25 @@ LebesgueOrder::LebesgueOrder(GridBase *_grid)
 {
   grid = _grid;
   if ( Block[0]==0) ZGraph();
+  else if ( Block[1]==0) NoBlocking();
   else CartesianBlocking();
 }

+void LebesgueOrder::NoBlocking(void)
+{
+  std::cout<<GridLogDebug<<"Lexicographic : no cache blocking"<<std::endl;
+  _LebesgueReorder.resize(0);
+  for ( int s = 0 ; s!= grid->oSites();s++){
+    _LebesgueReorder.push_back(s);
+  }
+}
 void LebesgueOrder::CartesianBlocking(void)
 {
   _LebesgueReorder.resize(0);

-  std::cout << GridLogMessage << " CartesianBlocking ";
-  for(int d=0;d<Block.size();d++) std::cout <<Block[d]<<" ";
-  std::cout<<std::endl;
+  std::cout << GridLogDebug << " CartesianBlocking ";
+  // for(int d=0;d<Block.size();d++) std::cout <<Block[d]<<" ";
+  // std::cout<<std::endl;

   IndexInteger ND = grid->_ndimension;

@@ -103,7 +112,9 @@ void LebesgueOrder::IterateI(int ND,
   } else {
     for(int d=0;d<ND;d++){
       x[d]=xi[d]+xo[d];
+      // std::cout << x[d]<<" ";
     }
+    // std::cout << "\n";
     IndexInteger index;
     Lexicographic::IndexFromCoor(x,index,grid->_rdimensions);
     _LebesgueReorder.push_back(index);
@@ -114,7 +125,8 @@ void LebesgueOrder::IterateI(int ND,
 void LebesgueOrder::ZGraph(void)
 {
   _LebesgueReorder.resize(0);

+  std::cout << GridLogDebug << " Lebesgue order "<<std::endl;
   // Align up dimensions to power of two.
   const IndexInteger one=1;

@@ -188,6 +200,7 @@ void LebesgueOrder::ZGraph(void)
   }
   assert( _LebesgueReorder.size() == vol );

+  /*
   std::vector<int> coor(4);
   for(IndexInteger asite=0;asite<vol;asite++){
     grid->oCoorFromOindex (coor,_LebesgueReorder[asite]);
@@ -198,5 +211,6 @@ void LebesgueOrder::ZGraph(void)
       << coor[3]<<"]"
       <<std::endl;
   }
+  */
 }
 }
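ZGraph builds the Lebesgue (Morton/Z-curve) visit order by interleaving the bits of the power-of-two-padded site coordinates, which tends to keep spatially neighbouring sites close together in the traversal. A minimal 2-D illustration of the bit-interleaving step (stand-alone sketch only; Grid's version is N-dimensional and handles the padding shown above):

/* Illustrative 2-D Morton index: spread the bits of each coordinate apart,
   then interleave them. */
#include <stdint.h>

static uint64_t part1by1(uint32_t x)            /* spread the low 16 bits of x apart */
{
  uint64_t v = x & 0xFFFF;
  v = (v | (v << 8)) & 0x00FF00FF;
  v = (v | (v << 4)) & 0x0F0F0F0F;
  v = (v | (v << 2)) & 0x33333333;
  v = (v | (v << 1)) & 0x55555555;
  return v;
}

static uint64_t morton2d(uint32_t x, uint32_t y)
{
  return part1by1(x) | (part1by1(y) << 1);      /* interleave x and y bits */
}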
@@ -59,6 +59,7 @@ namespace Grid {
   // Cartesian stencil blocking strategy
   /////////////////////////////////
   static std::vector<int> Block;
+  void NoBlocking(void);
   void CartesianBlocking(void);
   void IterateO(int ND,int dim,
                 std::vector<IndexInteger> & xo,
@@ -97,7 +97,7 @@ int main (int argc, char ** argv)
   RealD M5 =1.8;
   typename WilsonFermion5DR::ImplParams params;

-  WilsonFermion5DR Dw(1,Umu,*FGrid,*FrbGrid,*sUGrid,*sUrbGrid,M5,params);
+  WilsonFermion5DR Dw(1,Umu,*FGrid,*FrbGrid,*sUGrid,M5,params);

   Dw.Dhop(src,result,0);

@@ -27,9 +27,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /* END LEGAL */
 #include <Grid.h>
 #include <PerfCount.h>

+int main(int argc,char **argv)
+{
+  return 0;
+}
+#if 0
 #include <simd/Intel512wilson.h>


 using namespace Grid;
 using namespace Grid::QCD;

@@ -478,3 +482,4 @@ void WilsonDslashAvx512F(void *ptr1,void *ptr2,void *ptr3)
   return;
 }

+#endif