From 86187d7cca3b29b7050c096938d4c89aed51194e Mon Sep 17 00:00:00 2001
From: Richard Rollins <rprollins@users.noreply.github.com>
Date: Tue, 14 Jun 2016 15:34:20 +0100
Subject: [PATCH 01/21] Removed write to stdout in constructor for MPI
 CartesianCommunicator

---
 lib/communicator/Communicator_mpi.cc | 1 -
 1 file changed, 1 deletion(-)
diff --git a/lib/communicator/Communicator_mpi.cc b/lib/communicator/Communicator_mpi.cc
index f804e8fe..dff9811a 100644
--- a/lib/communicator/Communicator_mpi.cc
+++ b/lib/communicator/Communicator_mpi.cc
@@ -53,7 +53,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
   _Nprocessors=1;
   _processors = processors;
   _processor_coor.resize(_ndimension);
-  std::cout << processors << std::endl;
   
   MPI_Cart_create(MPI_COMM_WORLD, _ndimension,&_processors[0],&periodic[0],1,&communicator);
   MPI_Comm_rank(communicator,&_processor);

From d6737e4bd8eefed1923dcbeef868a0c48134f6bb Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Tue, 14 Jun 2016 19:07:01 +0100
Subject: [PATCH 02/21] Travis fix for Linux clang builds

---
 .travis.yml | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index cd73fbac..82066d87 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,5 +1,9 @@
 language: cpp
 
+cache:
+  directories:
+    - clang
+
 matrix:
   include:
     - os:        osx
@@ -38,29 +42,31 @@ matrix:
         apt:
           sources:
             - ubuntu-toolchain-r-test
-            - llvm-toolchain-precise-3.7
           packages:
-            - clang-3.7
+            - g++-4.8
             - libmpfr-dev
             - libgmp-dev
             - libmpc-dev
             - binutils-dev
-      env: VERSION=-3.7
+      env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
     - compiler: clang
       addons:
         apt:
           sources:
             - ubuntu-toolchain-r-test
-            - llvm-toolchain-precise-3.8
           packages:
-            - clang-3.8
+            - g++-4.8
             - libmpfr-dev
             - libgmp-dev
             - libmpc-dev
             - binutils-dev
-      env: VERSION=-3.8
+      env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
       
 before_install:
+    - export GRIDDIR=`pwd`
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]] && [ ! -e clang/bin ]; then wget $CLANG_LINK; tar -xf `basename $CLANG_LINK`; mkdir clang; mv clang+*/* clang/; fi
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
     - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
     - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
     - if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi
@@ -68,6 +74,11 @@ before_install:
 install:
     - export CC=$CC$VERSION
     - export CXX=$CXX$VERSION
+    - echo $PATH
+    - which $CC
+    - $CC  --version
+    - which $CXX
+    - $CXX --version
     - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
     
 script:

From 1b7f88dd003eb5d863e39ea95ceb6564f19de791 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 19 Jun 2016 11:45:58 -0700
Subject: [PATCH 03/21] Enable reordering of the loops in the assembler for
 cache friendliness. This gets in the way of L2 prefetching, however. Do
 next-next link in stencil prefetching.

---
 benchmarks/Benchmark_dwf_sweep.cc             | 358 ++++++++++++++++++
 lib/qcd/action/fermion/DomainWallFermion.h    |   2 +-
 lib/qcd/action/fermion/WilsonFermion.cc       |  14 +-
 lib/qcd/action/fermion/WilsonFermion.h        |   6 +-
 lib/qcd/action/fermion/WilsonFermion5D.cc     |   4 +-
 lib/qcd/action/fermion/WilsonKernels.cc       |  20 +-
 lib/qcd/action/fermion/WilsonKernels.h        |  14 +-
 lib/qcd/action/fermion/WilsonKernelsAsm.cc    |  36 +-
 lib/qcd/action/fermion/WilsonKernelsAsmBody.h |  76 ++--
 .../action/fermion/WilsonKernelsAsmBody.h.ab  | 163 ++++++++
 lib/qcd/action/fermion/WilsonKernelsHand.cc   |  36 +-
 lib/simd/Intel512common.h                     |   6 +-
 lib/simd/Intel512wilson.h                     |  32 +-
 lib/stencil/Lebesgue.cc                       |  18 +-
 lib/stencil/Lebesgue.h                        |   1 +
 15 files changed, 670 insertions(+), 116 deletions(-)
 create mode 100644 benchmarks/Benchmark_dwf_sweep.cc
 create mode 100644 lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab

diff --git a/benchmarks/Benchmark_dwf_sweep.cc b/benchmarks/Benchmark_dwf_sweep.cc
new file mode 100644
index 00000000..302059a4
--- /dev/null
+++ b/benchmarks/Benchmark_dwf_sweep.cc
@@ -0,0 +1,358 @@
+
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./benchmarks/Benchmark_dwf_sweep.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid.h>
+#include <PerfCount.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template<class d>   // d: type of the single wrapped value
+struct scal {       // Trivial wrapper around one value of type d; not referenced anywhere else in this file.
+  d internal;
+};
+
+  Gamma::GammaMatrix Gmu [] = {   // Gamma matrix per direction, indexed by mu = 0..3 in X,Y,Z,T order; only read by the CHECK reference code in benchDw.
+    Gamma::GammaX,
+    Gamma::GammaY,
+    Gamma::GammaZ,
+    Gamma::GammaT
+  };
+
+void benchDw(std::vector<int> & L, int Ls, int threads, int report =0 );   // report==0: print throughput columns; report!=0: print the PerformanceCounter report
+void benchsDw(std::vector<int> & L, int Ls, int threads, int report=0 );   // same calling convention, for the WilsonFermion5D<DomainWallRedBlack5dImplF> operator
+
+int main (int argc, char ** argv)   // Entry point: sweeps lattice sizes and prints a table of Dw / sDw benchmark figures.
+{
+  Grid_init(&argc,&argv);
+  // Ls is the fifth-dimension extent handed to every benchmark call below.
+  const int Ls=16;
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+  // The mere presence of ASMOPT in the environment (any value) selects AsmOpt=1; otherwise AsmOpt=0.
+  if ( getenv("ASMOPT") )  {
+    QCD::WilsonKernelsStatic::AsmOpt=1;
+  } else { 
+    QCD::WilsonKernelsStatic::AsmOpt=0;
+  }
+  // Table header; benchDw and benchsDw each append their own columns to the current row.
+  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "Volume \t\t\tProcs \t Dw \t eoDw \t sDw \t eosDw (Mflop/s)  "<<std::endl;
+  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
+  // L doubles from 8 while L<Lmax; LMAX in the environment overrides the default bound of 32.
+  int Lmax=32;
+  if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX"));
+  for (int L=8;L<Lmax;L*=2){
+    std::vector<int> latt4(4,L);
+    for(int d=4;d>0;d--){   // d==4 benchmarks L^4; d==3,2,1 each double latt4[d] cumulatively (latt4[0] stays at L)
+      if ( d<=3 ) latt4[d]*=2;
+      std::cout << GridLogMessage <<"\t";
+      for(int d=0;d<Nd;d++){   // note: this d shadows the outer loop variable; it only indexes the printout
+	std::cout<<latt4[d]<<"x";
+      }
+      std::cout <<Ls<<"\t" ;
+      benchDw (latt4,Ls,threads,0);
+      benchsDw(latt4,Ls,threads,0);
+      std::cout<<std::endl;
+    }
+  }
+  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
+  {   // Second pass on a fixed 16^4 lattice with report=1, which prints the performance-counter reports instead of throughput.
+    std::vector<int> latt4(4,16);
+    std::cout<<GridLogMessage << "16^4 Dw miss rate"<<std::endl;
+    benchDw (latt4,Ls,threads,1);
+    std::cout<<GridLogMessage << "16^4 sDw miss rate"<<std::endl;
+    benchsDw(latt4,Ls,threads,1);
+  }
+
+  Grid_finalize();
+}
+
+#undef CHECK
+
+void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )   // Times Dw.Dhop and Dw.DhopEO of a DomainWallFermionR built on a latt4 x Ls lattice.
+{   // report==0: print _Nprocessors and the flops/(t1-t0) figures; report!=0: print the PerformanceCounter report only. `threads` is not used here.
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+  // NOTE(review): the four grid pointers above are never released in this function — confirm who owns them.
+  std::vector<int> seeds4({1,2,3,4});   // only consumed when CHECK is defined
+  std::vector<int> seeds5({5,6,7,8});   // only consumed when CHECK is defined
+
+#ifdef CHECK 
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
+  LatticeFermion src   (FGrid); random(RNG5,src);
+  LatticeGaugeField Umu(UGrid); 
+  random(RNG4,Umu);
+#else 
+  LatticeFermion src   (FGrid); src=zero;
+  LatticeGaugeField Umu(UGrid); Umu=zero;
+#endif
+  // CHECK is #undef'd immediately before this function, so the zero-initialised branch above is the one compiled.
+  LatticeFermion result(FGrid); result=zero;
+  LatticeFermion    ref(FGrid);    ref=zero;
+  LatticeFermion    tmp(FGrid);
+  LatticeFermion    err(FGrid);
+
+  ColourMatrix cm = Complex(1.0,0.0);   // not referenced again in this function
+
+
+  LatticeGaugeField Umu5d(FGrid); 
+
+  // replicate across fifth dimension
+  for(int ss=0;ss<Umu._grid->oSites();ss++){
+    for(int s=0;s<Ls;s++){
+      Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
+    }
+  }
+
+  ////////////////////////////////////
+  // Naive wilson implementation
+  ////////////////////////////////////
+  std::vector<LatticeColourMatrix> U(4,FGrid);
+  for(int mu=0;mu<Nd;mu++){
+    U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
+  }
+
+#ifdef CHECK
+  if (1)
+  {
+    ref = zero;
+    for(int mu=0;mu<Nd;mu++){
+
+      tmp = U[mu]*Cshift(src,mu+1,1);
+      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
+
+      tmp =adj(U[mu])*src;
+      tmp =Cshift(tmp,mu+1,-1);
+      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
+    }
+    ref = -0.5*ref;
+  }
+#endif
+
+  RealD mass=0.1;
+  RealD M5  =1.8;
+  RealD NP = UGrid->_Nprocessors;
+
+  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+  // Time a single Dhop call; its duration sizes the measurement loop below.
+  double t0=usecond();
+  Dw.Dhop(src,result,0);
+  double t1=usecond();
+  // ncall targets a total of about 5e6 usecond() units (presumably 5 s if usecond() returns microseconds — confirm).
+  int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
+  // NOTE(review): exits the whole program with status 0, skipping Grid_finalize, once one call exceeds 1.25e6 usecond() units.
+  if (ncall < 5 ) exit(0);
+
+  Dw.Dhop(src,result,0);   // one more untimed call before the measured loop
+
+  PerformanceCounter Counter(8);
+  Counter.Start();
+  t0=usecond();
+  for(int i=0;i<ncall;i++){
+    Dw.Dhop(src,result,0);
+  }
+  t1=usecond();
+  Counter.Stop();
+  if ( report ) {
+    Counter.Report();
+  }
+  
+  if ( ! report ) 
+    {
+      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];   // volume = Ls * product of the four lattice extents
+      double flops=1344*volume*ncall;   // 1344 is presumably the per-site flop count for Dhop — TODO confirm
+      std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
+    }
+  
+#ifdef CHECK
+    err = ref-result; 
+    RealD errd = norm2(err);
+    if ( errd> 1.0e-4 ) {
+      std::cout<<GridLogMessage << "oops !!! norm diff   "<< norm2(err)<<std::endl;
+      exit(-1);
+    }
+#endif
+    
+  LatticeFermion src_e (FrbGrid);
+  LatticeFermion src_o (FrbGrid);
+  LatticeFermion r_e   (FrbGrid);
+  LatticeFermion r_o   (FrbGrid);   // not referenced again in this function
+  LatticeFermion r_eo  (FGrid);     // not referenced again in this function
+  
+  pickCheckerboard(Even,src_e,src);
+  pickCheckerboard(Odd,src_o,src);
+  
+  {   // Same timing scheme for DhopEO; reuses ncall from the Dhop calibration and has no performance counter of its own.
+    Dw.DhopEO(src_o,r_e,DaggerNo);
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+      Dw.DhopEO(src_o,r_e,DaggerNo);
+    }
+    double t1=usecond();
+    
+    if(!report){
+      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+      double flops=(1344.0*volume*ncall)/2;   // halved, presumably because DhopEO works on one checkerboard — confirm
+      std::cout<< flops/(t1-t0);
+    }
+  }
+  
+}
+
+#undef CHECK_SDW
+void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )   // Times sDw.Dhop and sDw.DhopEO of a WilsonFermion5D<DomainWallRedBlack5dImplF> on a latt4 x Ls lattice.
+{
+  // report==0: print the flops/(t1-t0) figures; report!=0: print the PerformanceCounter reports instead. `threads` is not used here.
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
+  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
+  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
+  // NOTE(review): none of the grid pointers above are released in this function — confirm who owns them.
+  std::vector<int> seeds4({1,2,3,4});   // only consumed when CHECK_SDW is defined
+  std::vector<int> seeds5({5,6,7,8});   // only consumed when CHECK_SDW is defined
+
+#ifdef CHECK_SDW
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
+  LatticeFermion src   (FGrid); random(RNG5,src);
+  LatticeGaugeField Umu(UGrid); 
+  random(RNG4,Umu);
+#else 
+  LatticeFermion src   (FGrid); src=zero;
+  LatticeGaugeField Umu(UGrid); Umu=zero;
+#endif
+  // CHECK_SDW is #undef'd immediately before this function, so the zero-initialised branch above is the one compiled.
+  LatticeFermion result(FGrid); result=zero;
+  LatticeFermion    ref(FGrid);    ref=zero;
+  LatticeFermion    tmp(FGrid);
+  LatticeFermion    err(FGrid);
+
+  ColourMatrix cm = Complex(1.0,0.0);   // not referenced again in this function
+
+  LatticeGaugeField Umu5d(FGrid); 
+
+  // replicate across fifth dimension
+  for(int ss=0;ss<Umu._grid->oSites();ss++){
+    for(int s=0;s<Ls;s++){
+      Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
+    }
+  }
+  // NOTE(review): Umu5d is filled above but never read afterwards in this function (sDw is constructed from Umu).
+
+  RealD mass=0.1;   // not referenced again in this function
+  RealD M5  =1.8;
+
+    typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
+    LatticeFermionF ssrc(sFGrid);
+    LatticeFermionF sref(sFGrid);
+    LatticeFermionF sresult(sFGrid);
+    WilsonFermion5DF sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5);
+    // Copy src into ssrc one site at a time; coordinates are ordered {s,x,y,z,t}.
+    for(int x=0;x<latt4[0];x++){
+    for(int y=0;y<latt4[1];y++){
+    for(int z=0;z<latt4[2];z++){
+    for(int t=0;t<latt4[3];t++){
+    for(int s=0;s<Ls;s++){
+      std::vector<int> site({s,x,y,z,t});
+      SpinColourVectorF tmp;   // shadows the LatticeFermion tmp declared earlier in this function
+      peekSite(tmp,src,site);
+      pokeSite(tmp,ssrc,site);
+    }}}}}
+
+    double t0=usecond();
+    sDw.Dhop(ssrc,sresult,0);
+    double t1=usecond();
+    // ncall targets a total of about 5e6 usecond() units; unlike benchDw there is no early exit for small ncall.
+    int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
+
+    PerformanceCounter Counter(8);
+    Counter.Start();
+    t0=usecond();
+    for(int i=0;i<ncall;i++){
+      sDw.Dhop(ssrc,sresult,0);
+    }
+    t1=usecond();
+    Counter.Stop();
+
+    if ( report ) {
+      Counter.Report();
+    } else { 
+
+      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];   // volume = Ls * product of the four lattice extents
+      double flops=1344*volume*ncall;   // 1344 is presumably the per-site flop count for Dhop — TODO confirm
+      std::cout<<"\t"<< flops/(t1-t0);
+    }
+
+
+    LatticeFermionF sr_eo(sFGrid);
+    LatticeFermionF serr(sFGrid);   // not referenced again in this function
+    // NOTE(review): ssrc is LatticeFermionF while the checkerboard fields below are LatticeFermion — confirm the types agree.
+    LatticeFermion ssrc_e (sFrbGrid);
+    LatticeFermion ssrc_o (sFrbGrid);
+    LatticeFermion sr_e   (sFrbGrid);
+    LatticeFermion sr_o   (sFrbGrid);
+      
+    pickCheckerboard(Even,ssrc_e,ssrc);
+    pickCheckerboard(Odd,ssrc_o,ssrc);
+
+    setCheckerboard(sr_eo,ssrc_o);
+    setCheckerboard(sr_eo,ssrc_e);
+    
+    sr_e = zero;
+    sr_o = zero;
+    
+    sDw.DhopEO(ssrc_o,sr_e,DaggerNo);   // one untimed call before the measured loop
+    PerformanceCounter CounterSdw(8);
+    CounterSdw.Start();
+    t0=usecond();
+    for(int i=0;i<ncall;i++){
+      sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
+    }
+    t1=usecond();
+    CounterSdw.Stop();
+
+    if ( report ) { 
+      CounterSdw.Report();
+    } else {
+
+      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+      double flops=(1344.0*volume*ncall)/2;   // halved, presumably because DhopEO works on one checkerboard — confirm
+      std::cout<<"\t"<< flops/(t1-t0);
+    }
+}
+
+
diff --git a/lib/qcd/action/fermion/DomainWallFermion.h b/lib/qcd/action/fermion/DomainWallFermion.h
index b05733aa..8e41aa63 100644
--- a/lib/qcd/action/fermion/DomainWallFermion.h
+++ b/lib/qcd/action/fermion/DomainWallFermion.h
@@ -63,7 +63,7 @@ namespace Grid {
 	Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
 	assert(zdata->n==this->Ls);
 	
-	std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
+	//	std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
 	// Call base setter
 	this->SetCoefficientsTanh(zdata,1.0,0.0);
 
diff --git a/lib/qcd/action/fermion/WilsonFermion.cc b/lib/qcd/action/fermion/WilsonFermion.cc
index 2618286e..59632409 100644
--- a/lib/qcd/action/fermion/WilsonFermion.cc
+++ b/lib/qcd/action/fermion/WilsonFermion.cc
@@ -53,6 +53,8 @@ namespace QCD {
 	StencilEven(&Hgrid,npoint,Even,directions,displacements), // source is Even
 	StencilOdd (&Hgrid,npoint,Odd ,directions,displacements), // source is Odd
 	mass(_mass),
+	Lebesgue(_grid),
+	LebesgueEvenOdd(_cbgrid),
 	Umu(&Fgrid),
 	UmuEven(&Hgrid),
 	UmuOdd (&Hgrid) 
@@ -228,7 +230,7 @@ PARALLEL_FOR_LOOP
     
     out.checkerboard = in.checkerboard;
     
-    DhopInternal(Stencil,Umu,in,out,dag);
+    DhopInternal(Stencil,Lebesgue,Umu,in,out,dag);
   }
   
   template<class Impl>
@@ -239,7 +241,7 @@ PARALLEL_FOR_LOOP
     assert(in.checkerboard==Even);
     out.checkerboard = Odd;
     
-    DhopInternal(StencilEven,UmuOdd,in,out,dag);
+    DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag);
   }
   
   template<class Impl>
@@ -250,7 +252,7 @@ PARALLEL_FOR_LOOP
     assert(in.checkerboard==Odd);
     out.checkerboard = Even;
     
-    DhopInternal(StencilOdd,UmuEven,in,out,dag);
+    DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag);
   }
   
   template<class Impl>
@@ -285,7 +287,7 @@ PARALLEL_FOR_LOOP
   };
 
   template<class Impl>
-  void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
+  void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,LebesgueOrder& lo,DoubledGaugeField & U,
 					 const FermionField &in, FermionField &out,int dag) 
   {
     assert((dag==DaggerNo) ||(dag==DaggerYes));
@@ -296,12 +298,12 @@ PARALLEL_FOR_LOOP
     if ( dag == DaggerYes ) {
 PARALLEL_FOR_LOOP
       for(int sss=0;sss<in._grid->oSites();sss++){
-	Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,1,1,in,out);
+	Kernels::DiracOptDhopSiteDag(st,lo,U,st.comm_buf,sss,sss,1,1,in,out);
       }
     } else {
 PARALLEL_FOR_LOOP
       for(int sss=0;sss<in._grid->oSites();sss++){
-	Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,1,1,in,out);
+	Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sss,sss,1,1,in,out);
       }
     }
   };
diff --git a/lib/qcd/action/fermion/WilsonFermion.h b/lib/qcd/action/fermion/WilsonFermion.h
index c1f4d682..3de2cac4 100644
--- a/lib/qcd/action/fermion/WilsonFermion.h
+++ b/lib/qcd/action/fermion/WilsonFermion.h
@@ -111,7 +111,7 @@ namespace Grid {
 			 const FermionField &B,
 			 int dag);
 
-      void DhopInternal(StencilImpl & st,DoubledGaugeField & U,
+      void DhopInternal(StencilImpl & st,LebesgueOrder & lo,DoubledGaugeField & U,
 			const FermionField &in, FermionField &out,int dag) ;
 
       // Constructor
@@ -146,6 +146,10 @@ namespace Grid {
       DoubledGaugeField Umu;
       DoubledGaugeField UmuEven;
       DoubledGaugeField UmuOdd;
+
+      LebesgueOrder Lebesgue;
+      LebesgueOrder LebesgueEvenOdd;
+
       
     };
 
diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc
index 66047b72..08069bed 100644
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -321,14 +321,14 @@ PARALLEL_FOR_LOOP
     for(int ss=0;ss<U._grid->oSites();ss++){
 	int sU=ss;
 	int sF=LLs*sU;
-	Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
+	Kernels::DiracOptDhopSiteDag(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
     }
   } else {
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<U._grid->oSites();ss++){
       int sU=ss;
       int sF=LLs*sU;
-      Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
+      Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
     }
   }
 }
diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/lib/qcd/action/fermion/WilsonKernels.cc
index 63ba553d..672c23d6 100644
--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
@@ -38,20 +38,20 @@ template<class Impl>
 WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};
 
 template<class Impl> 
-void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 						  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 						  int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
 {
   if ( AsmOpt ) {
 
-    WilsonKernels<Impl>::DiracOptAsmDhopSite(st,U,buf,sF,sU,Ls,Ns,in,out);
+    WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
 
   } else {
 
     for(int site=0;site<Ns;site++) {
       for(int s=0;s<Ls;s++) {
-	if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,U,buf,sF,sU,in,out);
-	else         WilsonKernels<Impl>::DiracOptGenericDhopSite(st,U,buf,sF,sU,in,out);
+	if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);
+	else         WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out);
 	sF++;
       }
       sU++;
@@ -61,17 +61,17 @@ void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
 }
 
 template<class Impl> 
-void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 					   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					   int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
 {
   // No asm implementation yet.
-  //  if ( AsmOpt )     WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,U,buf,sF,sU,in,out);
+  //  if ( AsmOpt )     WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
   //  else
   for(int site=0;site<Ns;site++) {
     for(int s=0;s<Ls;s++) {
-      if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,U,buf,sF,sU,in,out);
-      else         WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,U,buf,sF,sU,in,out);
+      if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
+      else         WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
       sF++;
     }
     sU++;
@@ -84,7 +84,7 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField
   ////////////////////////////////////////////
 
 template<class Impl> 
-void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 					   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					   int sF,int sU,const FermionField &in, FermionField &out)
 {
@@ -262,7 +262,7 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaug
 
   // Need controls to do interior, exterior, or both
 template<class Impl> 
-void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 						  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 						  int sF,int sU,const FermionField &in, FermionField &out)
 {
diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h
index da6751dd..231fa293 100644
--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/lib/qcd/action/fermion/WilsonKernels.h
@@ -53,11 +53,11 @@ namespace Grid {
      
     public:
 
-     void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
+     void DiracOptDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 			   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			   int sF, int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
       
-     void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+     void DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,int Ls, int Ns, const FermionField &in,FermionField &out);
 
@@ -67,24 +67,24 @@ namespace Grid {
 
     private:
      // Specialised variants
-     void DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
+     void DiracOptGenericDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 			   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			   int sF,int sU, const FermionField &in, FermionField &out);
       
-     void DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+     void DiracOptGenericDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,const FermionField &in,FermionField &out);
 
-     void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
+     void DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
 
 
-     void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+     void DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,const FermionField &in, FermionField &out);
      
-     void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+     void DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 				 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 				 int sF,int sU,const FermionField &in, FermionField &out);
     public:
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
index bccf72c7..cdbe2c8e 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -39,9 +39,9 @@ namespace QCD {
   // Default to no assembler implementation
   ///////////////////////////////////////////////////////////
 template<class Impl>
-void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-					       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
+					       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 {
   assert(0);
 }
@@ -71,9 +71,9 @@ static int signInit = setupSigns();
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 
 template<>
-void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 						     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-						     int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
+						     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 
 #undef VMOVIDUP
@@ -85,31 +85,31 @@ void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaug
 #define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
 template<>
-void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								   int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
+								   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 
 #endif
 
-template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							      int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+							      int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 
-template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
+template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
+							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
+							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
+							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
+							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 }}
 
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
index aae049e2..bd96b7d5 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
+++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
@@ -1,7 +1,8 @@
 {
   int locala,perma, ptypea;
   int localb,permb, ptypeb;
-  uint64_t basea, baseb;
+  int localc,permc, ptypec;
+  uint64_t basea, baseb, basec;
   uint64_t basex;
   const uint64_t plocal =(uint64_t) & in._odata[0];
 
@@ -11,14 +12,22 @@
   MASK_REGS;
 
   for(int site=0;site<Ns;site++) {
+  int sU=lo.Reorder(ssU);
+
   for(int s=0;s<Ls;s++) {
+  ss     =sU*Ls+s;
 
   ////////////////////////////////
   // Xp
   ////////////////////////////////
   int ent=ss*8;// 2*Ndim
   basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basea);
   baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  PREFETCH_CHIMU(baseb);
+  basec = st.GetInfo(ptypec,localc,permc,Zp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basec);
+
   basex = basea;
 
   if ( locala ) {
@@ -38,6 +47,7 @@
   // Yp
   ////////////////////////////////
   basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basea);
   if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     YM_PROJMEM(baseb);
@@ -46,7 +56,7 @@
     LOAD_CHI(baseb);
   }
   {
-    MULT_2SPIN_DIR_PFYP(Yp,basea);
+    MULT_2SPIN_DIR_PFYP(Yp,basec);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   YM_RECON_ACCUM;
@@ -55,15 +65,16 @@
   // Zp
   ////////////////////////////////
   baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
-  if ( locala ) {
+  PREFETCH_CHIMU(baseb);
+  if ( localc ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    ZM_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR1,perma);
+    ZM_PROJMEM(basec);
+    MAYBEPERM(PERMUTE_DIR1,permc);
   } else { 
-    LOAD_CHI(basea);
+    LOAD_CHI(basec);
   }
   {
-    MULT_2SPIN_DIR_PFZP(Zp,baseb);
+    MULT_2SPIN_DIR_PFZP(Zp,basea);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   ZM_RECON_ACCUM;
@@ -71,16 +82,17 @@
   ////////////////////////////////
   // Tp
   ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
-  if ( localb ) {
+  basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basec);
+  if ( locala ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    TM_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR0,permb);
+    TM_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR0,perma);
   } else { 
-    LOAD_CHI(baseb);
+    LOAD_CHI(basea);
   }
   {
-    MULT_2SPIN_DIR_PFTP(Tp,basea);
+    MULT_2SPIN_DIR_PFTP(Tp,baseb);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   TM_RECON_ACCUM;
@@ -88,16 +100,17 @@
   ////////////////////////////////
   // Xm
   ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
-  if ( locala ) {
+  basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basea);
+  if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    XP_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR3,perma);
+    XP_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR3,permb);
   } else { 
-    LOAD_CHI(basea);
+    LOAD_CHI(baseb);
   }
   {
-    MULT_2SPIN_DIR_PFXM(Xm,baseb);
+    MULT_2SPIN_DIR_PFXM(Xm,basec);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   XP_RECON_ACCUM;
@@ -105,13 +118,14 @@
   ////////////////////////////////
   // Ym
   ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
-  if ( localb ) {
+  baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++;
+  PREFETCH_CHIMU(baseb);
+  if ( localc ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    YP_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR2,permb);
+    YP_PROJMEM(basec);
+    MAYBEPERM(PERMUTE_DIR2,permc);
   } else { 
-    LOAD_CHI(baseb);
+    LOAD_CHI(basec);
   }
   {
     MULT_2SPIN_DIR_PFYM(Ym,basea);
@@ -122,7 +136,8 @@
   ////////////////////////////////
   // Zm
   ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basec);
   if ( locala ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     ZP_PROJMEM(basea);
@@ -140,6 +155,7 @@
   // Tm
   ////////////////////////////////
   basea = (uint64_t)&out._odata[ss];
+  PREFETCH_CHIMU(basea);
   if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     TP_PROJMEM(baseb);
@@ -148,17 +164,15 @@
     LOAD_CHI(baseb);
   }
   {
-    MULT_2SPIN_DIR_PFTM(Tm,basea);
+    MULT_2SPIN_DIR_PFTM(Tm,basec);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   TP_RECON_ACCUM;
 
-  PREFETCH_CHIMU(basex);
+  //  PREFETCH_CHIMU(basex);
   SAVE_RESULT(&out._odata[ss]);
-
   
-  ss++;
-  } 
-  sU++;
+  }
+  ssU++;
   }
 }
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab
new file mode 100644
index 00000000..3ba9eec6
--- /dev/null
+++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab
@@ -0,0 +1,163 @@
+{
+  int locala,perma, ptypea; // 'a' slot: filled in by st.GetInfo
+  int localb,permb, ptypeb; // 'b' slot: filled in by st.GetInfo
+  uint64_t basea, baseb; // values returned by st.GetInfo; the two slots alternate between consecutive sections
+  uint64_t basex; // copy of the Xp base (assigned below, not read again in this body)
+  const uint64_t plocal =(uint64_t) & in._odata[0]; // address of the first input element, passed to every st.GetInfo call
+
+  //  vComplexF isigns[2] = { signs[0], signs[1] };
+  vComplexF *isigns = &signs[0]; // loaded into %r10 via LOAD64 before the projection/reconstruction macros
+
+  MASK_REGS;
+
+  for(int site=0;site<Ns;site++) { // Ns outer iterations, one per value of ssU
+  int sU=lo.Reorder(ssU);  // index obtained from lo.Reorder for the running counter ssU
+  for(int s=0;s<Ls;s++) { // Ls inner iterations sharing this sU
+  ss=sU*Ls+s; // flattened index used for the stencil entry and for out._odata
+  ////////////////////////////////
+  // Xp
+  ////////////////////////////////
+  int ent=ss*8;// 2*Ndim
+  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basea);
+  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; // fetched one section ahead: consumed in the Yp section
+  basex = basea;
+
+  if ( locala ) {
+    LOAD64(%r10,isigns);
+    XM_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR3,perma);
+  } else { 
+    LOAD_CHI(basea);
+  }
+  {
+    MULT_2SPIN_DIR_PFXP(Xp,baseb); // second argument is the base for the following section (presumably a prefetch target - TODO confirm against the macro)
+  }
+  LOAD64(%r10,isigns);
+  XM_RECON;
+
+  ////////////////////////////////
+  // Yp
+  ////////////////////////////////
+  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  if ( localb ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    YM_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR2,permb);
+  } else { 
+    LOAD_CHI(baseb);
+  }
+  {
+    MULT_2SPIN_DIR_PFYP(Yp,basea);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  YM_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Zp
+  ////////////////////////////////
+  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  if ( locala ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    ZM_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR1,perma);
+  } else { 
+    LOAD_CHI(basea);
+  }
+  {
+    MULT_2SPIN_DIR_PFZP(Zp,baseb);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  ZM_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Tp
+  ////////////////////////////////
+  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  if ( localb ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    TM_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR0,permb);
+  } else { 
+    LOAD_CHI(baseb);
+  }
+  {
+    MULT_2SPIN_DIR_PFTP(Tp,basea);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  TM_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Xm
+  ////////////////////////////////
+  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  if ( locala ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    XP_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR3,perma);
+  } else { 
+    LOAD_CHI(basea);
+  }
+  {
+    MULT_2SPIN_DIR_PFXM(Xm,baseb);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  XP_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Ym
+  ////////////////////////////////
+  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  if ( localb ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    YP_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR2,permb);
+  } else { 
+    LOAD_CHI(baseb);
+  }
+  {
+    MULT_2SPIN_DIR_PFYM(Ym,basea);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  YP_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Zm
+  ////////////////////////////////
+  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  if ( locala ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    ZP_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR1,perma);
+  } else { 
+    LOAD_CHI(basea);
+  }
+  {
+    MULT_2SPIN_DIR_PFZM(Zm,baseb);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  ZP_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Tm
+  ////////////////////////////////
+  basea = (uint64_t)&out._odata[ss]; // eighth GetInfo result (baseb) is already in hand; the last section hands the output address to the MULT macro instead
+  if ( localb ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    TP_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR0,permb);
+  } else { 
+    LOAD_CHI(baseb);
+  }
+  {
+    MULT_2SPIN_DIR_PFTM(Tm,basea);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  TP_RECON_ACCUM;
+
+  SAVE_RESULT(&out._odata[ss]); // macro is handed the address of out._odata[ss]
+
+  } 
+  ssU++; // advance the counter fed to lo.Reorder; ss is recomputed from it on the next outer iteration
+  }
+}
diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc
index 757778d3..cb6c01a1 100644
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -312,7 +312,7 @@ namespace QCD {
 
 
 template<class Impl>
-void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					       int ss,int sU,const FermionField &in, FermionField &out)
 {
@@ -555,7 +555,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel
 }
 
 template<class Impl>
-void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					       int ss,int sU,const FermionField &in, FermionField &out)
 {
@@ -803,7 +803,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
   // Specialise Gparity to simple implementation
   ////////////////////////////////////////////////
 template<>
-void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							     int sF,int sU,const FermionField &in, FermionField &out)
 {
@@ -811,7 +811,7 @@ void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,Dou
 }
 
 template<>
-void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								int sF,int sU,const FermionField &in, FermionField &out)
 {
@@ -819,7 +819,7 @@ void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,
 }
 
 template<>
-void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							     int sF,int sU,const FermionField &in, FermionField &out)
 {
@@ -827,7 +827,7 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,Dou
 }
 
 template<>
-void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								int sF,int sU,const FermionField &in, FermionField &out)
 {
@@ -839,44 +839,44 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,
 ////////////// Wilson ; uses this implementation /////////////////////
 // Need Nc=3 though //
 
-template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								  int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								  int ss,int sU,const FermionField &in, FermionField &out);
 
 
-template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);
 
 
-template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);
 
diff --git a/lib/simd/Intel512common.h b/lib/simd/Intel512common.h
index 2fea9235..a3cd980d 100644
--- a/lib/simd/Intel512common.h
+++ b/lib/simd/Intel512common.h
@@ -88,7 +88,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 
 #define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n" 
 #define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n" 
+#define VPREFETCHP(O,A) "prefetcht1 "#O"*64("#A");\n" // same expansion as VPREFETCH2 above
 #define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n" 
+#define VPREFETCHNTA(O,A) // expands to nothing
+#define VPREFETCH(O,A)    // expands to nothing
+
 #define VEVICT(O,A)   
 
 //"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
@@ -124,8 +128,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define ZLOADf(OFF,PTR,ri,ir)  VLOADf(OFF,PTR,ir)  VSHUFf(ir,ri)
 #define ZLOADd(OFF,PTR,ri,ir)  VLOADd(OFF,PTR,ir)  VSHUFd(ir,ri)
 
-#define VPREFETCHNTA(O,A) 
-#define VPREFETCH(O,A)    
 
 #define VSTOREf(OFF,PTR,SRC)   "vmovaps " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
 #define VSTOREd(OFF,PTR,SRC)   "vmovapd " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
diff --git a/lib/simd/Intel512wilson.h b/lib/simd/Intel512wilson.h
index 1955cc6d..2bc0545d 100644
--- a/lib/simd/Intel512wilson.h
+++ b/lib/simd/Intel512wilson.h
@@ -559,22 +559,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   VSUB(UChi_02,result_22,result_22)\
   VSUB(UChi_12,result_32,result_32) );
 
-#define PREFETCH_CHIMU(A) \
+#define PREFETCH_CHIMU(A) // currently expands to nothing; the earlier body is kept in the block comment that follows
+/*
   LOAD64(%r9,A)						\
 	   __asm__ (						\
-  VPREFETCHG(12,%r9)\
-  VPREFETCHG(13,%r9)\
-  VPREFETCHG(14,%r9)\
-  VPREFETCHG(15,%r9)\
-  VPREFETCHG(16,%r9)\
-  VPREFETCHG(17,%r9)\
-  VPREFETCHG(18,%r9)\
-  VPREFETCHG(19,%r9)\
-  VPREFETCHG(20,%r9)\
-  VPREFETCHG(21,%r9)\
-  VPREFETCHG(22,%r9)\
-  VPREFETCHG(23,%r9));
-
+  VPREFETCHG(0,%r9)\
+  VPREFETCHG(1,%r9)\
+  VPREFETCHG(2,%r9)\
+  VPREFETCHG(3,%r9)\
+  VPREFETCHG(4,%r9)\
+  VPREFETCHG(5,%r9)\
+  VPREFETCHG(6,%r9)\
+  VPREFETCHG(7,%r9)\
+  VPREFETCHG(8,%r9)\
+  VPREFETCHG(9,%r9)\
+  VPREFETCHG(10,%r9)\
+  VPREFETCHG(11,%r9));
+*/
 #define PERMUTE_DIR0 __asm__ ( 	\
   VPERM0(Chi_00,Chi_00)	\
   VPERM0(Chi_01,Chi_01)	\
@@ -612,8 +613,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   LOAD64(%r8,ptr)						\
   LOAD64(%r9,pf)						\
 	   __asm__ (						\
-	   VPREFETCH2(9,%r8)				   \
-	   VPREFETCH2(10,%r8)					   \
+	   VPREFETCH2(9,%r8)				   VPREFETCH2(10,%r8)					   \
 	   VPREFETCH2(11,%r8)					   \
 	   VPREFETCH2(12,%r8)					   \
 	   VPREFETCH2(13,%r8)					   \
diff --git a/lib/stencil/Lebesgue.cc b/lib/stencil/Lebesgue.cc
index 7704e08f..c34b5c96 100644
--- a/lib/stencil/Lebesgue.cc
+++ b/lib/stencil/Lebesgue.cc
@@ -49,16 +49,25 @@ LebesgueOrder::LebesgueOrder(GridBase *_grid)
 {
   grid = _grid;
   if ( Block[0]==0) ZGraph();
+  else if ( Block[1]==0) NoBlocking();
   else CartesianBlocking();
 }
 
+void LebesgueOrder::NoBlocking(void) // fill _LebesgueReorder with the identity ordering 0 .. oSites()-1
+{
+  std::cout<<GridLogDebug<<"Lexicographic : no cache blocking"<<std::endl;
+  _LebesgueReorder.resize(0); // drop any previous contents before refilling
+  for ( int s = 0 ; s!= grid->oSites();s++){
+    _LebesgueReorder.push_back(s); // entry s holds s itself
+  }
+}
 void LebesgueOrder::CartesianBlocking(void) 
 {
   _LebesgueReorder.resize(0);
 
-  std::cout << GridLogMessage << " CartesianBlocking ";
-  for(int d=0;d<Block.size();d++) std::cout <<Block[d]<<" ";
-  std::cout<<std::endl; 
+  std::cout << GridLogDebug << " CartesianBlocking ";
+  //    for(int d=0;d<Block.size();d++) std::cout <<Block[d]<<" ";
+  //    std::cout<<std::endl; 
 
   IndexInteger ND = grid->_ndimension;
 
@@ -116,7 +125,8 @@ void LebesgueOrder::IterateI(int ND,
 void LebesgueOrder::ZGraph(void) 
 {
   _LebesgueReorder.resize(0);
-  
+
+  std::cout << GridLogDebug << " Lebesgue order "<<std::endl;
   // Align up dimensions to power of two.
   const IndexInteger one=1;
 
diff --git a/lib/stencil/Lebesgue.h b/lib/stencil/Lebesgue.h
index f69b089d..1675d16c 100644
--- a/lib/stencil/Lebesgue.h
+++ b/lib/stencil/Lebesgue.h
@@ -59,6 +59,7 @@ namespace Grid {
     // Cartesian stencil blocking strategy
     /////////////////////////////////
     static std::vector<int> Block;
+    void NoBlocking(void);
     void CartesianBlocking(void);
     void IterateO(int ND,int dim,
 		  std::vector<IndexInteger> & xo,

From 17a8f51a9b5ada82ec5ecf5739c04847a5f7ed3f Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 19 Jun 2016 11:59:10 -0700
Subject: [PATCH 04/21] update file lists

---
 benchmarks/Make.inc | 6 +++++-
 lib/Make.inc        | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/benchmarks/Make.inc b/benchmarks/Make.inc
index b60ec835..8d0721a4 100644
--- a/benchmarks/Make.inc
+++ b/benchmarks/Make.inc
@@ -1,5 +1,5 @@
 
-bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_dwf_ntpf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm
+bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_dwf_ntpf Benchmark_dwf_sweep Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm
 
 
 Benchmark_comms_SOURCES=Benchmark_comms.cc
@@ -14,6 +14,10 @@ Benchmark_dwf_ntpf_SOURCES=Benchmark_dwf_ntpf.cc
 Benchmark_dwf_ntpf_LDADD=-lGrid
 
 
+Benchmark_dwf_sweep_SOURCES=Benchmark_dwf_sweep.cc
+Benchmark_dwf_sweep_LDADD=-lGrid
+
+
 Benchmark_memory_asynch_SOURCES=Benchmark_memory_asynch.cc
 Benchmark_memory_asynch_LDADD=-lGrid
 
diff --git a/lib/Make.inc b/lib/Make.inc
index 900da916..8763692a 100644
--- a/lib/Make.inc
+++ b/lib/Make.inc
@@ -1,4 +1,4 @@
 
-HFILES=./Algorithms.h ./AlignedAllocator.h ./Cartesian.h ./Communicator.h ./Cshift.h ./Grid.h ./Init.h ./Lattice.h ./Lexicographic.h ./Log.h ./Old/Tensor_peek.h ./Old/Tensor_poke.h ./PerfCount.h ./Simd.h ./Stencil.h ./Tensors.h ./Threads.h ./Timer.h ./algorithms/CoarsenedMatrix.h ./algorithms/LinearOperator.h ./algorithms/Preconditioner.h ./algorithms/SparseMatrix.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/iterative/AdefGeneric.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/DenseMatrix.h ./algorithms/iterative/EigenSort.h ./algorithms/iterative/Francis.h ./algorithms/iterative/Householder.h ./algorithms/iterative/ImplicitlyRestartedLanczos.h ./algorithms/iterative/Matrix.h ./algorithms/iterative/MatrixUtils.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/PrecConjugateResidual.h ./algorithms/iterative/PrecGeneralisedConjugateResidual.h ./algorithms/iterative/SchurRedBlack.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./communicator/Communicator_base.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./lattice/Lattice_ET.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./parallelIO/BinaryIO.h ./parallelIO/NerscIO.h ./pugixml/pugixml.h ./qcd/QCD.h 
./qcd/action/ActionBase.h ./qcd/action/ActionParams.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/FermionOperatorImpl.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/fermion/WilsonTMFermion.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/WilsonKernelsAsmBody.h ./qcd/action/gauge/GaugeImpl.h ./qcd/action/gauge/PlaqPlusRectangleAction.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/pseudofermion/EvenOddSchurDifferentiable.h ./qcd/action/pseudofermion/OneFlavourEvenOddRational.h ./qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h ./qcd/action/pseudofermion/OneFlavourRational.h ./qcd/action/pseudofermion/OneFlavourRationalRatio.h ./qcd/action/pseudofermion/TwoFlavour.h ./qcd/action/pseudofermion/TwoFlavourEvenOdd.h ./qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h ./qcd/action/pseudofermion/TwoFlavourRatio.h ./qcd/hmc/HMC.h ./qcd/hmc/HmcRunner.h ./qcd/hmc/NerscCheckpointer.h ./qcd/hmc/integrators/Integrator.h ./qcd/hmc/integrators/Integrator_algorithm.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SUn.h 
./qcd/utils/SpaceTimeGrid.h ./qcd/utils/WilsonLoops.h ./serialisation/BaseIO.h ./serialisation/BinaryIO.h ./serialisation/MacroMagic.h ./serialisation/Serialisation.h ./serialisation/TextIO.h ./serialisation/XmlIO.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_imci.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./simd/Intel512avx.h ./simd/Intel512common.h ./simd/Intel512double.h ./simd/Intel512imci.h ./simd/Intel512single.h ./simd/Intel512wilson.h ./stencil/Lebesgue.h ./tensors/Tensor_Ta.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_determinant.h ./tensors/Tensor_exp.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_index.h ./tensors/Tensor_inner.h ./tensors/Tensor_logical.h ./tensors/Tensor_outer.h ./tensors/Tensor_reality.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h
+HFILES=./Algorithms.h ./AlignedAllocator.h ./Cartesian.h ./Communicator.h ./Cshift.h ./Grid.h ./Init.h ./Lattice.h ./Lexicographic.h ./Log.h ./Old/Tensor_peek.h ./Old/Tensor_poke.h ./PerfCount.h ./Simd.h ./Stencil.h ./Tensors.h ./Threads.h ./Timer.h ./algorithms/CoarsenedMatrix.h ./algorithms/LinearOperator.h ./algorithms/Preconditioner.h ./algorithms/SparseMatrix.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/iterative/AdefGeneric.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/DenseMatrix.h ./algorithms/iterative/EigenSort.h ./algorithms/iterative/Francis.h ./algorithms/iterative/Householder.h ./algorithms/iterative/ImplicitlyRestartedLanczos.h ./algorithms/iterative/Matrix.h ./algorithms/iterative/MatrixUtils.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/PrecConjugateResidual.h ./algorithms/iterative/PrecGeneralisedConjugateResidual.h ./algorithms/iterative/SchurRedBlack.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./communicator/Communicator_base.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./lattice/Lattice_ET.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./parallelIO/BinaryIO.h ./parallelIO/NerscIO.h ./pugixml/pugixml.h ./qcd/QCD.h 
./qcd/action/ActionBase.h ./qcd/action/ActionParams.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/FermionOperatorImpl.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/fermion/WilsonTMFermion.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/WilsonKernelsAsmBody.h ./qcd/action/gauge/GaugeImpl.h ./qcd/action/gauge/PlaqPlusRectangleAction.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/pseudofermion/EvenOddSchurDifferentiable.h ./qcd/action/pseudofermion/OneFlavourEvenOddRational.h ./qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h ./qcd/action/pseudofermion/OneFlavourRational.h ./qcd/action/pseudofermion/OneFlavourRationalRatio.h ./qcd/action/pseudofermion/TwoFlavour.h ./qcd/action/pseudofermion/TwoFlavourEvenOdd.h ./qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h ./qcd/action/pseudofermion/TwoFlavourRatio.h ./qcd/hmc/HMC.h ./qcd/hmc/HmcRunner.h ./qcd/hmc/NerscCheckpointer.h ./qcd/hmc/integrators/Integrator.h ./qcd/hmc/integrators/Integrator_algorithm.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SUn.h 
./qcd/utils/SpaceTimeGrid.h ./qcd/utils/WilsonLoops.h ./serialisation/BaseIO.h ./serialisation/BinaryIO.h ./serialisation/MacroMagic.h ./serialisation/Serialisation.h ./serialisation/TextIO.h ./serialisation/XmlIO.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_imci.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./simd/Intel512avx.h ./simd/Intel512wilson.h ./simd/Intel512common.h ./simd/Intel512double.h ./simd/Intel512imci.h ./simd/Intel512single.h ./stencil/Lebesgue.h ./tensors/Tensor_Ta.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_determinant.h ./tensors/Tensor_exp.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_index.h ./tensors/Tensor_inner.h ./tensors/Tensor_logical.h ./tensors/Tensor_outer.h ./tensors/Tensor_reality.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h
 
 CCFILES=./Init.cc ./Log.cc ./PerfCount.cc ./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./pugixml/pugixml.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsAsm.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/action/fermion/WilsonTMFermion.cc ./qcd/hmc/HMC.cc ./qcd/spin/Dirac.cc ./qcd/utils/SpaceTimeGrid.cc ./serialisation/BinaryIO.cc ./serialisation/TextIO.cc ./serialisation/XmlIO.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc

From 09fe3caebd985fe8e2db1ff85164e3779cdaf17d Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sat, 25 Jun 2016 11:08:05 -0700
Subject: [PATCH 05/21] Tweaks

---
 benchmarks/Benchmark_dwf_sweep.cc             |  16 +-
 lib/qcd/action/fermion/WilsonKernelsAsm.cc    |   7 +-
 lib/qcd/action/fermion/WilsonKernelsAsmBody.h |  79 +++-----
 .../action/fermion/WilsonKernelsAsmBody.h.abc | 187 ++++++++++++++++++
 lib/simd/Intel512common.h                     |   4 +-
 5 files changed, 242 insertions(+), 51 deletions(-)
 create mode 100644 lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc

diff --git a/benchmarks/Benchmark_dwf_sweep.cc b/benchmarks/Benchmark_dwf_sweep.cc
index 302059a4..94a00903 100644
--- a/benchmarks/Benchmark_dwf_sweep.cc
+++ b/benchmarks/Benchmark_dwf_sweep.cc
@@ -68,10 +68,12 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
 
   int Lmax=32;
+  int dmin=0;
   if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX"));
-  for (int L=8;L<Lmax;L*=2){
+  if ( getenv("DMIN") ) dmin=atoi(getenv("DMIN"));
+  for (int L=8;L<=Lmax;L*=2){
     std::vector<int> latt4(4,L);
-    for(int d=4;d>0;d--){
+    for(int d=4;d>dmin;d--){
       if ( d<=3 ) latt4[d]*=2;
       std::cout << GridLogMessage <<"\t";
       for(int d=0;d<Nd;d++){
@@ -170,7 +172,11 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
   Dw.Dhop(src,result,0);
   double t1=usecond();
 
+#ifdef TIMERS_OFF
+    int ncall =10;
+#else
   int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
+#endif
 
   if (ncall < 5 ) exit(0);
 
@@ -297,7 +303,11 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
     sDw.Dhop(ssrc,sresult,0);
     double t1=usecond();
 
+#ifdef TIMERS_OFF
+    int ncall =10;
+#else 
     int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
+#endif
 
     PerformanceCounter Counter(8);
     Counter.Start();
@@ -340,7 +350,9 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
     CounterSdw.Start();
     t0=usecond();
     for(int i=0;i<ncall;i++){
+      __SSC_START;
       sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
+      __SSC_STOP;
     }
     t1=usecond();
     CounterSdw.Stop();
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
index cdbe2c8e..33c464ac 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -67,9 +67,12 @@ int setupSigns(void ){
 }
 static int signInit = setupSigns();
 
+#define label(A)  ilabel(A)
+#define ilabel(A) ".globl\n"  #A ":\n" 
+
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
-
+#define FX(A) WILSONASM_ ##A
 template<>
 void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 						     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
@@ -80,6 +83,8 @@ void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrd
 #undef VMOVRDUP
 #undef MAYBEPERM
 #undef MULT_2SPIN
+#undef FX 
+#define FX(A) DWFASM_ ## A
 #define MAYBEPERM(A,B) 
 #define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
 #define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
index bd96b7d5..d3e86276 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
+++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
@@ -1,8 +1,7 @@
 {
   int locala,perma, ptypea;
   int localb,permb, ptypeb;
-  int localc,permc, ptypec;
-  uint64_t basea, baseb, basec;
+  uint64_t basea, baseb;
   uint64_t basex;
   const uint64_t plocal =(uint64_t) & in._odata[0];
 
@@ -12,22 +11,15 @@
   MASK_REGS;
 
   for(int site=0;site<Ns;site++) {
-  int sU=lo.Reorder(ssU);
-
+  int sU=lo.Reorder(ssU);  
   for(int s=0;s<Ls;s++) {
-  ss     =sU*Ls+s;
-
+  ss=sU*Ls+s;
   ////////////////////////////////
   // Xp
   ////////////////////////////////
   int ent=ss*8;// 2*Ndim
   basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basea);
   baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
-  PREFETCH_CHIMU(baseb);
-  basec = st.GetInfo(ptypec,localc,permc,Zp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basec);
-
   basex = basea;
 
   if ( locala ) {
@@ -47,7 +39,6 @@
   // Yp
   ////////////////////////////////
   basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basea);
   if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     YM_PROJMEM(baseb);
@@ -56,7 +47,7 @@
     LOAD_CHI(baseb);
   }
   {
-    MULT_2SPIN_DIR_PFYP(Yp,basec);
+    MULT_2SPIN_DIR_PFYP(Yp,basea);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   YM_RECON_ACCUM;
@@ -65,16 +56,15 @@
   // Zp
   ////////////////////////////////
   baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
-  PREFETCH_CHIMU(baseb);
-  if ( localc ) {
+  if ( locala ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    ZM_PROJMEM(basec);
-    MAYBEPERM(PERMUTE_DIR1,permc);
+    ZM_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR1,perma);
   } else { 
-    LOAD_CHI(basec);
+    LOAD_CHI(basea);
   }
   {
-    MULT_2SPIN_DIR_PFZP(Zp,basea);
+    MULT_2SPIN_DIR_PFZP(Zp,baseb);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   ZM_RECON_ACCUM;
@@ -82,17 +72,16 @@
   ////////////////////////////////
   // Tp
   ////////////////////////////////
-  basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basec);
-  if ( locala ) {
+  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    TM_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR0,perma);
+    TM_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR0,permb);
   } else { 
-    LOAD_CHI(basea);
+    LOAD_CHI(baseb);
   }
   {
-    MULT_2SPIN_DIR_PFTP(Tp,baseb);
+    MULT_2SPIN_DIR_PFTP(Tp,basea);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   TM_RECON_ACCUM;
@@ -100,17 +89,16 @@
   ////////////////////////////////
   // Xm
   ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basea);
-  if ( localb ) {
+  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  if ( locala ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    XP_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR3,permb);
+    XP_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR3,perma);
   } else { 
-    LOAD_CHI(baseb);
+    LOAD_CHI(basea);
   }
   {
-    MULT_2SPIN_DIR_PFXM(Xm,basec);
+    MULT_2SPIN_DIR_PFXM(Xm,baseb);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   XP_RECON_ACCUM;
@@ -118,14 +106,13 @@
   ////////////////////////////////
   // Ym
   ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++;
-  PREFETCH_CHIMU(baseb);
-  if ( localc ) {
+  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    YP_PROJMEM(basec);
-    MAYBEPERM(PERMUTE_DIR2,permc);
+    YP_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR2,permb);
   } else { 
-    LOAD_CHI(basec);
+    LOAD_CHI(baseb);
   }
   {
     MULT_2SPIN_DIR_PFYM(Ym,basea);
@@ -136,8 +123,7 @@
   ////////////////////////////////
   // Zm
   ////////////////////////////////
-  basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basec);
+  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
   if ( locala ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     ZP_PROJMEM(basea);
@@ -155,7 +141,6 @@
   // Tm
   ////////////////////////////////
   basea = (uint64_t)&out._odata[ss];
-  PREFETCH_CHIMU(basea);
   if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     TP_PROJMEM(baseb);
@@ -163,16 +148,16 @@
   } else { 
     LOAD_CHI(baseb);
   }
+  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal);
   {
-    MULT_2SPIN_DIR_PFTM(Tm,basec);
+    MULT_2SPIN_DIR_PFTM(Tm,basea);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   TP_RECON_ACCUM;
 
-  //  PREFETCH_CHIMU(basex);
-  SAVE_RESULT(&out._odata[ss]);
-  
-  }
+  SAVE_RESULT(&out._odata[ss],baseb);
+
+  } 
   ssU++;
   }
 }
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc
new file mode 100644
index 00000000..5a3e01f7
--- /dev/null
+++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc
@@ -0,0 +1,187 @@
+{
+  int locala,perma, ptypea;
+  int localb,permb, ptypeb;
+  int localc,permc, ptypec;
+  uint64_t basea, baseb, basec;
+  uint64_t basex;
+  const uint64_t plocal =(uint64_t) & in._odata[0];
+
+  //  vComplexF isigns[2] = { signs[0], signs[1] };
+  vComplexF *isigns = &signs[0];
+
+  MASK_REGS;
+
+  for(int site=0;site<Ns;site++) {
+  int sU=lo.Reorder(ssU);
+
+  for(int s=0;s<Ls;s++) {
+  ss     =sU*Ls+s;
+
+  ////////////////////////////////
+  // Xp
+  ////////////////////////////////
+  int ent=ss*8;// 2*Ndim
+  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basea);
+  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  PREFETCH_CHIMU(baseb);
+  basec = st.GetInfo(ptypec,localc,permc,Zp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basec);
+
+  basex = basea;
+
+  label(FX(XP) );
+  if ( locala ) {
+    LOAD64(%r10,isigns);
+    XM_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR3,perma);
+  } else { 
+    LOAD_CHI(basea);
+  }
+  {
+    MULT_2SPIN_DIR_PFXP(Xp,baseb);
+  }
+  LOAD64(%r10,isigns);
+  XM_RECON;
+
+  ////////////////////////////////
+  // Yp
+  ////////////////////////////////
+  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basea);
+  label(FX(YP) );
+  if ( localb ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    YM_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR2,permb);
+  } else { 
+    LOAD_CHI(baseb);
+  }
+  {
+    MULT_2SPIN_DIR_PFYP(Yp,basec);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  YM_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Zp
+  ////////////////////////////////
+  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  PREFETCH_CHIMU(baseb);
+  label(FX(ZP) );
+  if ( localc ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    ZM_PROJMEM(basec);
+    MAYBEPERM(PERMUTE_DIR1,permc);
+  } else { 
+    LOAD_CHI(basec);
+  }
+  {
+    MULT_2SPIN_DIR_PFZP(Zp,basea);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  ZM_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Tp
+  ////////////////////////////////
+  basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basec);
+  label(FX(TP) );
+  if ( locala ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    TM_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR0,perma);
+  } else { 
+    LOAD_CHI(basea);
+  }
+  {
+    MULT_2SPIN_DIR_PFTP(Tp,baseb);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  TM_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Xm
+  ////////////////////////////////
+  basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basea);
+  label(FX(XM) );
+  if ( localb ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    XP_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR3,permb);
+  } else { 
+    LOAD_CHI(baseb);
+  }
+  {
+    MULT_2SPIN_DIR_PFXM(Xm,basec);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  XP_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Ym
+  ////////////////////////////////
+  baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++;
+  PREFETCH_CHIMU(baseb);
+  label(FX(YM) );
+  if ( localc ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    YP_PROJMEM(basec);
+    MAYBEPERM(PERMUTE_DIR2,permc);
+  } else { 
+    LOAD_CHI(basec);
+  }
+  {
+    MULT_2SPIN_DIR_PFYM(Ym,basea);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  YP_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Zm
+  ////////////////////////////////
+  basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basec);
+  label(FX(ZM) );
+  if ( locala ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    ZP_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR1,perma);
+  } else { 
+    LOAD_CHI(basea);
+  }
+  {
+    MULT_2SPIN_DIR_PFZM(Zm,baseb);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  ZP_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Tm
+  ////////////////////////////////
+  basea = (uint64_t)&out._odata[ss];
+  PREFETCH_CHIMU(basea);
+  label(FX(TM) );
+  if ( localb ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    TP_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR0,permb);
+  } else { 
+    LOAD_CHI(baseb);
+  }
+  {
+    MULT_2SPIN_DIR_PFTM(Tm,basec);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  TP_RECON_ACCUM;
+
+  //  PREFETCH_CHIMU(basex);
+  label(FX(SAV) );
+  SAVE_RESULT(&out._odata[ss]);
+  
+  }
+  ssU++;
+  }
+}
diff --git a/lib/simd/Intel512common.h b/lib/simd/Intel512common.h
index a3cd980d..6878bcfb 100644
--- a/lib/simd/Intel512common.h
+++ b/lib/simd/Intel512common.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+   /*************************************************************************************
 
     Grid physics library, www.github.com/paboyle/Grid 
 
@@ -37,6 +37,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
            "mov     $0x5555, %%eax \n"\
            "kmovw    %%eax, %%k7 \n" : : : "%eax");
 
+//#define label(B) __asm__ ( __func__ __LINE__ #B ":\n" );
+
 #define VZEROf(A)       "vpxorq " #A ","  #A "," #A ";\n"
 #define VZEROd(A)       "vpxorq " #A ","  #A "," #A ";\n"
 

From 22e88eaf541f5d279df7c20bc97ab6c20d9409de Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sat, 25 Jun 2016 12:54:14 -0700
Subject: [PATCH 06/21] Prefetch during save

---
 lib/simd/Intel512wilson.h | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/lib/simd/Intel512wilson.h b/lib/simd/Intel512wilson.h
index 2bc0545d..207d9db8 100644
--- a/lib/simd/Intel512wilson.h
+++ b/lib/simd/Intel512wilson.h
@@ -104,7 +104,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define LOAD_CHI(PTR)	 LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi );
 #define SAVE_UCHI(PTR)	 SAVE_UCHIi(PTR)
 #define SAVE_CHI(PTR)	 SAVE_CHIi(PTR)
-#define SAVE_RESULT(PTR) SAVE_RESULTi(PTR)
+#define SAVE_RESULT(PT,R) SAVE_RESULTi(PT,R)
 
 #define LOAD_CHIMUi \
 	   LOAD_CHIMU01i	\
@@ -169,21 +169,22 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   VSTORE(5,%r8,Chi_12)				\
 						);
 
-#define SAVE_RESULTi(PTR)\
+#define SAVE_RESULTi(PTR,pf)			\
 	   LOAD64(%r8,PTR)			\
+	   LOAD64(%r9,pf)			\
   __asm__ (					\
-	   VSTORE(0,%r8,result_00)		\
-	   VSTORE(1,%r8,result_01)		\
-	   VSTORE(2,%r8,result_02)		\
-	   VSTORE(3,%r8,result_10)		\
-	   VSTORE(4,%r8,result_11)		\
-	   VSTORE(5,%r8,result_12)		\
-	   VSTORE(6,%r8,result_20)		\
-	   VSTORE(7,%r8,result_21)		\
-	   VSTORE(8,%r8,result_22)		\
-	   VSTORE(9,%r8,result_30)		\
-	   VSTORE(10,%r8,result_31)		\
-	   VSTORE(11,%r8,result_32) 		\
+	   VSTORE(0,%r8,result_00)	VPREFETCHG(0,%r9)	\
+	   VSTORE(1,%r8,result_01)	VPREFETCHG(1,%r9)	\
+	   VSTORE(2,%r8,result_02)	VPREFETCHG(2,%r9)	\
+	   VSTORE(3,%r8,result_10)	VPREFETCHG(3,%r9)	\
+	   VSTORE(4,%r8,result_11)	VPREFETCHG(4,%r9)	\
+	   VSTORE(5,%r8,result_12)	VPREFETCHG(5,%r9)	\
+	   VSTORE(6,%r8,result_20)	VPREFETCHG(6,%r9)	\
+	   VSTORE(7,%r8,result_21)	VPREFETCHG(7,%r9)	\
+	   VSTORE(8,%r8,result_22)	VPREFETCHG(8,%r9)	\
+	   VSTORE(9,%r8,result_30)	VPREFETCHG(9,%r9)	\
+	   VSTORE(10,%r8,result_31)	VPREFETCHG(10,%r9)	\
+	   VSTORE(11,%r8,result_32) 	VPREFETCHG(11,%r9)	\
 						);
 
 #define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p)

From db057cc2762a3b503f6e7c306621dbfc8aab5a1d Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sat, 25 Jun 2016 12:54:50 -0700
Subject: [PATCH 07/21] Prefetch change

---
 lib/qcd/action/fermion/WilsonKernelsAsmBody.h | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
index d3e86276..d50999f6 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
+++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
@@ -2,7 +2,6 @@
   int locala,perma, ptypea;
   int localb,permb, ptypeb;
   uint64_t basea, baseb;
-  uint64_t basex;
   const uint64_t plocal =(uint64_t) & in._odata[0];
 
   //  vComplexF isigns[2] = { signs[0], signs[1] };
@@ -20,7 +19,6 @@
   int ent=ss*8;// 2*Ndim
   basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
   baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
-  basex = basea;
 
   if ( locala ) {
     LOAD64(%r10,isigns);
@@ -38,7 +36,7 @@
   ////////////////////////////////
   // Yp
   ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  basea = st.GetInfo(ptypea,locala,perma,Zp,ent,plocal); ent++;
   if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     YM_PROJMEM(baseb);
@@ -55,7 +53,7 @@
   ////////////////////////////////
   // Zp
   ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  baseb = st.GetInfo(ptypeb,localb,permb,Tp,ent,plocal); ent++;
   if ( locala ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     ZM_PROJMEM(basea);
@@ -72,7 +70,7 @@
   ////////////////////////////////
   // Tp
   ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  basea = st.GetInfo(ptypea,locala,perma,Xm,ent,plocal); ent++;
   if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     TM_PROJMEM(baseb);
@@ -89,7 +87,7 @@
   ////////////////////////////////
   // Xm
   ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  baseb = st.GetInfo(ptypeb,localb,permb,Ym,ent,plocal); ent++;
   if ( locala ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     XP_PROJMEM(basea);
@@ -106,7 +104,7 @@
   ////////////////////////////////
   // Ym
   ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++;
   if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     YP_PROJMEM(baseb);
@@ -123,7 +121,7 @@
   ////////////////////////////////
   // Zm
   ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++;
   if ( locala ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     ZP_PROJMEM(basea);
@@ -148,7 +146,7 @@
   } else { 
     LOAD_CHI(baseb);
   }
-  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal);
+  baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal);
   {
     MULT_2SPIN_DIR_PFTM(Tm,basea);
   }

From b2933a0557d211fa3b3d86d9d9ee4d38ebb854b1 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sat, 25 Jun 2016 12:55:25 -0700
Subject: [PATCH 08/21] Control the prefetch strategy

---
 lib/simd/Intel512common.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/lib/simd/Intel512common.h b/lib/simd/Intel512common.h
index 6878bcfb..a05f978c 100644
--- a/lib/simd/Intel512common.h
+++ b/lib/simd/Intel512common.h
@@ -28,6 +28,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_ASM_INTEL_COMMON_512_H
 #define GRID_ASM_INTEL_COMMON_512_H
 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Peformance options
+////////////////////////////////////////////////////////////////////////////////////////////////////
+#define AVX512_PF_L1
+#undef  AVX512_PF_L2_LINEAR
+#undef  AVX512_PF_L2_TABLE
+#undef  AVX512_PF_L2_WRITE
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Opcodes common 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -88,10 +96,30 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define VMOVf(A,DEST)   "vmovaps  " #A ", " #DEST  ";\n"
 #define VMOVd(A,DEST)   "vmovapd  " #A ", " #DEST  ";\n"
 
+#ifdef AVX512_PF_L1
 #define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n" 
+#else
+#define VPREFETCHG(O,A) 
+#endif
+
+#ifdef AVX512_PF_L2_LINEAR
 #define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n" 
+#else
+#define VPREFETCH2(O,A) 
+#endif
+
+#ifdef AVX512_PF_L2_TABLE
 #define VPREFETCHP(O,A) "prefetcht1 "#O"*64("#A");\n" 
+#else
+#define VPREFETCHP(O,A) 
+#endif
+
+#ifdef AVX512_PF_L2_WRITE
 #define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n" 
+#else
+#define VPREFETCHW(O,A) 
+#endif
+
 #define VPREFETCHNTA(O,A) 
 #define VPREFETCH(O,A)    
 

From 4bc08ed9956b2a461ec227b4144dc93c2b134968 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 26 Jun 2016 12:54:14 -0700
Subject: [PATCH 09/21] Improved the prefetching when using cache blocking
 codes

---
 lib/Stencil.h                                 |   5 +-
 lib/qcd/action/fermion/WilsonKernelsAsmBody.h |  86 ++++---
 .../action/fermion/WilsonKernelsAsmBody.h.ab  |  18 +-
 lib/simd/Intel512common.h                     |  24 +-
 lib/simd/Intel512wilson.h                     | 237 +++++++++++-------
 5 files changed, 208 insertions(+), 162 deletions(-)

diff --git a/lib/Stencil.h b/lib/Stencil.h
index 8019e3f9..bc015370 100644
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
@@ -1,4 +1,4 @@
-     /*************************************************************************************
+   /*************************************************************************************
 
      Grid physics library, www.github.com/paboyle/Grid 
 
@@ -261,6 +261,9 @@
 	 }
        };
 
+       inline uint64_t Touch(int ent) {
+	 //	 _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0);
+       }
        inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
 	 _mm_prefetch((char *)&_entries[ent+1],_MM_HINT_T0);
 	 local = _entries[ent]._is_local;
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
index d50999f6..7373d2eb 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
+++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
@@ -1,7 +1,9 @@
 {
   int locala,perma, ptypea;
   int localb,permb, ptypeb;
-  uint64_t basea, baseb;
+  int localc,permc, ptypec;
+  uint64_t basea, baseb, basec;
+
   const uint64_t plocal =(uint64_t) & in._odata[0];
 
   //  vComplexF isigns[2] = { signs[0], signs[1] };
@@ -10,15 +12,22 @@
   MASK_REGS;
 
   for(int site=0;site<Ns;site++) {
-  int sU=lo.Reorder(ssU);  
+  int sU =lo.Reorder(ssU);
   for(int s=0;s<Ls;s++) {
-  ss=sU*Ls+s;
+  ss     =sU*Ls+s;
+
   ////////////////////////////////
   // Xp
   ////////////////////////////////
   int ent=ss*8;// 2*Ndim
   basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  PREFETCH1_CHIMU(basea);
+  PF_GAUGE(Xp); 
+
   baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  PREFETCH_CHIMU(baseb);
+  basec = st.GetInfo(ptypec,localc,permc,Zp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basec);
 
   if ( locala ) {
     LOAD64(%r10,isigns);
@@ -36,7 +45,8 @@
   ////////////////////////////////
   // Yp
   ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Zp,ent,plocal); ent++;
+  basea = st.GetInfo(ptypea,locala,perma,Tp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basea);
   if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     YM_PROJMEM(baseb);
@@ -45,7 +55,7 @@
     LOAD_CHI(baseb);
   }
   {
-    MULT_2SPIN_DIR_PFYP(Yp,basea);
+    MULT_2SPIN_DIR_PFYP(Yp,basec);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   YM_RECON_ACCUM;
@@ -53,16 +63,17 @@
   ////////////////////////////////
   // Zp
   ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Tp,ent,plocal); ent++;
-  if ( locala ) {
+  baseb = st.GetInfo(ptypeb,localb,permb,Xm,ent,plocal); ent++;
+  PREFETCH_CHIMU(baseb);
+  if ( localc ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    ZM_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR1,perma);
+    ZM_PROJMEM(basec);
+    MAYBEPERM(PERMUTE_DIR1,permc);
   } else { 
-    LOAD_CHI(basea);
+    LOAD_CHI(basec);
   }
   {
-    MULT_2SPIN_DIR_PFZP(Zp,baseb);
+    MULT_2SPIN_DIR_PFZP(Zp,basea);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   ZM_RECON_ACCUM;
@@ -70,16 +81,17 @@
   ////////////////////////////////
   // Tp
   ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Xm,ent,plocal); ent++;
-  if ( localb ) {
+  basec = st.GetInfo(ptypec,localc,permc,Ym,ent,plocal); ent++;
+  PREFETCH_CHIMU(basec);
+  if ( locala ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    TM_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR0,permb);
+    TM_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR0,perma);
   } else { 
-    LOAD_CHI(baseb);
+    LOAD_CHI(basea);
   }
   {
-    MULT_2SPIN_DIR_PFTP(Tp,basea);
+    MULT_2SPIN_DIR_PFTP(Tp,baseb);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   TM_RECON_ACCUM;
@@ -87,16 +99,17 @@
   ////////////////////////////////
   // Xm
   ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Ym,ent,plocal); ent++;
-  if ( locala ) {
+  basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++;
+  PREFETCH_CHIMU(basea);
+  if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    XP_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR3,perma);
+    XP_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR3,permb);
   } else { 
-    LOAD_CHI(basea);
+    LOAD_CHI(baseb);
   }
   {
-    MULT_2SPIN_DIR_PFXM(Xm,baseb);
+    MULT_2SPIN_DIR_PFXM(Xm,basec);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   XP_RECON_ACCUM;
@@ -104,13 +117,14 @@
   ////////////////////////////////
   // Ym
   ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++;
-  if ( localb ) {
+  baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++;
+  PREFETCH_CHIMU(baseb);
+  if ( localc ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    YP_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR2,permb);
+    YP_PROJMEM(basec);
+    MAYBEPERM(PERMUTE_DIR2,permc);
   } else { 
-    LOAD_CHI(baseb);
+    LOAD_CHI(basec);
   }
   {
     MULT_2SPIN_DIR_PFYM(Ym,basea);
@@ -121,7 +135,8 @@
   ////////////////////////////////
   // Zm
   ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++;
+  basec = (uint64_t)&out._odata[ss];
+  PREFETCH_CHIMU(basec);
   if ( locala ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     ZP_PROJMEM(basea);
@@ -138,7 +153,8 @@
   ////////////////////////////////
   // Tm
   ////////////////////////////////
-  basea = (uint64_t)&out._odata[ss];
+  //  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  //  PREFETCH_CHIMU(basea);
   if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     TP_PROJMEM(baseb);
@@ -146,16 +162,16 @@
   } else { 
     LOAD_CHI(baseb);
   }
-  baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal);
   {
-    MULT_2SPIN_DIR_PFTM(Tm,basea);
+    MULT_2SPIN_DIR_PFTM(Tm,basec);
   }
+  //  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   TP_RECON_ACCUM;
 
-  SAVE_RESULT(&out._odata[ss],baseb);
-
-  } 
+  SAVE_RESULT(&out._odata[ss],basec);
+  
+  }
   ssU++;
   }
 }
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab
index 3ba9eec6..d50999f6 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab
+++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab
@@ -2,7 +2,6 @@
   int locala,perma, ptypea;
   int localb,permb, ptypeb;
   uint64_t basea, baseb;
-  uint64_t basex;
   const uint64_t plocal =(uint64_t) & in._odata[0];
 
   //  vComplexF isigns[2] = { signs[0], signs[1] };
@@ -19,9 +18,7 @@
   ////////////////////////////////
   int ent=ss*8;// 2*Ndim
   basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basea);
   baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
-  basex = basea;
 
   if ( locala ) {
     LOAD64(%r10,isigns);
@@ -39,7 +36,7 @@
   ////////////////////////////////
   // Yp
   ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  basea = st.GetInfo(ptypea,locala,perma,Zp,ent,plocal); ent++;
   if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     YM_PROJMEM(baseb);
@@ -56,7 +53,7 @@
   ////////////////////////////////
   // Zp
   ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  baseb = st.GetInfo(ptypeb,localb,permb,Tp,ent,plocal); ent++;
   if ( locala ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     ZM_PROJMEM(basea);
@@ -73,7 +70,7 @@
   ////////////////////////////////
   // Tp
   ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  basea = st.GetInfo(ptypea,locala,perma,Xm,ent,plocal); ent++;
   if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     TM_PROJMEM(baseb);
@@ -90,7 +87,7 @@
   ////////////////////////////////
   // Xm
   ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  baseb = st.GetInfo(ptypeb,localb,permb,Ym,ent,plocal); ent++;
   if ( locala ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     XP_PROJMEM(basea);
@@ -107,7 +104,7 @@
   ////////////////////////////////
   // Ym
   ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++;
   if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     YP_PROJMEM(baseb);
@@ -124,7 +121,7 @@
   ////////////////////////////////
   // Zm
   ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++;
   if ( locala ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     ZP_PROJMEM(basea);
@@ -149,13 +146,14 @@
   } else { 
     LOAD_CHI(baseb);
   }
+  baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal);
   {
     MULT_2SPIN_DIR_PFTM(Tm,basea);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   TP_RECON_ACCUM;
 
-  SAVE_RESULT(&out._odata[ss]);
+  SAVE_RESULT(&out._odata[ss],baseb);
 
   } 
   ssU++;
diff --git a/lib/simd/Intel512common.h b/lib/simd/Intel512common.h
index a05f978c..dabbf6d8 100644
--- a/lib/simd/Intel512common.h
+++ b/lib/simd/Intel512common.h
@@ -31,9 +31,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Peformance options
 ////////////////////////////////////////////////////////////////////////////////////////////////////
-#define AVX512_PF_L1
-#undef  AVX512_PF_L2_LINEAR
-#undef  AVX512_PF_L2_TABLE
 #undef  AVX512_PF_L2_WRITE
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -45,7 +42,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
            "mov     $0x5555, %%eax \n"\
            "kmovw    %%eax, %%k7 \n" : : : "%eax");
 
-//#define label(B) __asm__ ( __func__ __LINE__ #B ":\n" );
+//#define label(B) __asm__ ( __func__ _LINE__ #B ":\n" );
 
 #define VZEROf(A)       "vpxorq " #A ","  #A "," #A ";\n"
 #define VZEROd(A)       "vpxorq " #A ","  #A "," #A ";\n"
@@ -96,30 +93,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define VMOVf(A,DEST)   "vmovaps  " #A ", " #DEST  ";\n"
 #define VMOVd(A,DEST)   "vmovapd  " #A ", " #DEST  ";\n"
 
-#ifdef AVX512_PF_L1
-#define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n" 
-#else
-#define VPREFETCHG(O,A) 
-#endif
-
-#ifdef AVX512_PF_L2_LINEAR
+#define VPREFETCH1(O,A) "prefetcht0 "#O"*64("#A");\n" 
 #define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n" 
-#else
-#define VPREFETCH2(O,A) 
-#endif
-
-#ifdef AVX512_PF_L2_TABLE
-#define VPREFETCHP(O,A) "prefetcht1 "#O"*64("#A");\n" 
-#else
-#define VPREFETCHP(O,A) 
-#endif
-
 #ifdef AVX512_PF_L2_WRITE
 #define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n" 
 #else
 #define VPREFETCHW(O,A) 
 #endif
-
 #define VPREFETCHNTA(O,A) 
 #define VPREFETCH(O,A)    
 
diff --git a/lib/simd/Intel512wilson.h b/lib/simd/Intel512wilson.h
index 207d9db8..9deffd80 100644
--- a/lib/simd/Intel512wilson.h
+++ b/lib/simd/Intel512wilson.h
@@ -169,23 +169,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   VSTORE(5,%r8,Chi_12)				\
 						);
 
-#define SAVE_RESULTi(PTR,pf)			\
-	   LOAD64(%r8,PTR)			\
-	   LOAD64(%r9,pf)			\
-  __asm__ (					\
-	   VSTORE(0,%r8,result_00)	VPREFETCHG(0,%r9)	\
-	   VSTORE(1,%r8,result_01)	VPREFETCHG(1,%r9)	\
-	   VSTORE(2,%r8,result_02)	VPREFETCHG(2,%r9)	\
-	   VSTORE(3,%r8,result_10)	VPREFETCHG(3,%r9)	\
-	   VSTORE(4,%r8,result_11)	VPREFETCHG(4,%r9)	\
-	   VSTORE(5,%r8,result_12)	VPREFETCHG(5,%r9)	\
-	   VSTORE(6,%r8,result_20)	VPREFETCHG(6,%r9)	\
-	   VSTORE(7,%r8,result_21)	VPREFETCHG(7,%r9)	\
-	   VSTORE(8,%r8,result_22)	VPREFETCHG(8,%r9)	\
-	   VSTORE(9,%r8,result_30)	VPREFETCHG(9,%r9)	\
-	   VSTORE(10,%r8,result_31)	VPREFETCHG(10,%r9)	\
-	   VSTORE(11,%r8,result_32) 	VPREFETCHG(11,%r9)	\
-						);
 
 #define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p)
 #define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN_PFYP(&U._odata[sU](A),p)
@@ -560,24 +543,89 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   VSUB(UChi_02,result_22,result_22)\
   VSUB(UChi_12,result_32,result_32) );
 
-#define PREFETCH_CHIMU(A) 
-/*
-  LOAD64(%r9,A)						\
-	   __asm__ (						\
-  VPREFETCHG(0,%r9)\
-  VPREFETCHG(1,%r9)\
-  VPREFETCHG(2,%r9)\
-  VPREFETCHG(3,%r9)\
-  VPREFETCHG(4,%r9)\
-  VPREFETCHG(5,%r9)\
-  VPREFETCHG(6,%r9)\
-  VPREFETCHG(7,%r9)\
-  VPREFETCHG(8,%r9)\
-  VPREFETCHG(9,%r9)\
-  VPREFETCHG(10,%r9)\
-  VPREFETCHG(11,%r9));
-*/
-#define PERMUTE_DIR0 __asm__ ( 	\
+#define AVX512_PF_L1
+#define AVX512_PF_L2_GAUGE
+#define AVX512_PF_L2_TABLE
+#undef  AVX512_PF_L2_LINEAR
+
+#ifdef AVX512_PF_L2_TABLE
+#define VPREFETCH_P1(A,B)  VPREFETCH1(A,B)
+#define VPREFETCH_P2(A,B) VPREFETCH1(A,B)
+#else
+#define VPREFETCH_P1(A,B)
+#define VPREFETCH_P2(A,B)
+#endif
+#ifdef AVX512_PF_L2_LINEAR
+#define VPREFETCH_M1(A,B) 
+#define VPREFETCH_M2(A,B) 
+#else 
+#define VPREFETCH_M1(A,B) VPREFETCH1(A,B)
+#define VPREFETCH_M2(A,B) VPREFETCH2(A,B)
+#endif
+#ifdef AVX512_PF_L2_GAUGE
+#define VPREFETCH_G1(A,B)  VPREFETCH1(A,B)
+#define VPREFETCH_G2(A,B)  VPREFETCH2(A,B)
+#else
+#endif
+
+#define PF_GAUGE(A) \
+  LOAD64(%r8,&U._odata[sU](A))						\
+  __asm__ (								\
+	   VPREFETCH_G1(0,%r8) VPREFETCH_G1(1,%r8)			\
+	   VPREFETCH_G1(2,%r8) VPREFETCH_G1(3,%r8)			\
+									);
+
+#define SAVE_RESULTi(PTR,pf)			\
+	   LOAD64(%r8,PTR)			\
+	   LOAD64(%r9,pf)			\
+  __asm__ (					\
+	   VSTORE(0,%r8,result_00)	VPREFETCH_M1(0,%r9)	\
+	   VSTORE(1,%r8,result_01)	VPREFETCH_M1(1,%r9)	\
+	   VSTORE(2,%r8,result_02)	VPREFETCH_M1(2,%r9)	\
+	   VSTORE(3,%r8,result_10)	VPREFETCH_M1(3,%r9)	\
+	   VSTORE(4,%r8,result_11)	VPREFETCH_M1(4,%r9)	\
+	   VSTORE(5,%r8,result_12)	VPREFETCH_M1(5,%r9)	\
+	   VSTORE(6,%r8,result_20)	VPREFETCH_M1(6,%r9)	\
+	   VSTORE(7,%r8,result_21)	VPREFETCH_M1(7,%r9)	\
+	   VSTORE(8,%r8,result_22)	VPREFETCH_M1(8,%r9)	\
+	   VSTORE(9,%r8,result_30)	VPREFETCH_M1(9,%r9)	\
+	   VSTORE(10,%r8,result_31)	VPREFETCH_M1(10,%r9)	\
+	   VSTORE(11,%r8,result_32) 	VPREFETCH_M1(11,%r9)	\
+						);
+
+#define PREFETCH_CHIMU(A) \
+  LOAD64(%r9,A)							\
+  __asm__ (							\
+	   VPREFETCH_P2(0,%r9)					\
+	   VPREFETCH_P2(1,%r9)					\
+	   VPREFETCH_P2(2,%r9)					\
+	   VPREFETCH_P2(3,%r9)					\
+	   VPREFETCH_P2(4,%r9)					\
+	   VPREFETCH_P2(5,%r9)					\
+	   VPREFETCH_P2(6,%r9)					\
+	   VPREFETCH_P2(7,%r9)					\
+	   VPREFETCH_P2(8,%r9)					\
+	   VPREFETCH_P2(9,%r9)					\
+	   VPREFETCH_P2(10,%r9)					\
+	   VPREFETCH_P2(11,%r9));
+
+#define PREFETCH1_CHIMU(A) \
+  LOAD64(%r9,A)							\
+  __asm__ (							\
+	   VPREFETCH_P1(0,%r9)					\
+	   VPREFETCH_P1(1,%r9)					\
+	   VPREFETCH_P1(2,%r9)					\
+	   VPREFETCH_P1(3,%r9)					\
+	   VPREFETCH_P1(4,%r9)					\
+	   VPREFETCH_P1(5,%r9)					\
+	   VPREFETCH_P1(6,%r9)					\
+	   VPREFETCH_P1(7,%r9)					\
+	   VPREFETCH_P1(8,%r9)					\
+	   VPREFETCH_P1(9,%r9)					\
+	   VPREFETCH_P1(10,%r9)					\
+	   VPREFETCH_P1(11,%r9));
+
+#define PERMUTE_DIR0 __asm__ (			\
   VPERM0(Chi_00,Chi_00)	\
   VPERM0(Chi_01,Chi_01)	\
   VPERM0(Chi_02,Chi_02)	\
@@ -614,14 +662,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   LOAD64(%r8,ptr)						\
   LOAD64(%r9,pf)						\
 	   __asm__ (						\
-	   VPREFETCH2(9,%r8)				   VPREFETCH2(10,%r8)					   \
-	   VPREFETCH2(11,%r8)					   \
-	   VPREFETCH2(12,%r8)					   \
-	   VPREFETCH2(13,%r8)					   \
-	   VPREFETCH2(14,%r8)					   \
-	   VPREFETCH2(15,%r8)					   \
-	   VPREFETCH2(16,%r8)					   \
-	   VPREFETCH2(17,%r8)					   \
+	   VPREFETCH_G2(9,%r8)				   \
+	   VPREFETCH_G2(10,%r8)					   \
+	   VPREFETCH_G2(11,%r8)					   \
+	   VPREFETCH_G2(12,%r8)					   \
+	   VPREFETCH_G2(13,%r8)					   \
+	   VPREFETCH_G2(14,%r8)					   \
+	   VPREFETCH_G2(15,%r8)					   \
+	   VPREFETCH_G2(16,%r8)					   \
+	   VPREFETCH_G2(17,%r8)					   \
 	   VSHUF(Chi_00,T1)				\
 	   VMOVIDUP(0,%r8,Z0 )					\
            VMOVIDUP(3,%r8,Z1 )					\
@@ -633,10 +682,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
            VMUL(Z1,T2,UChi_11)            VMOVIDUP(1,%r8,Z0 )	\
            VMUL(Z2,T1,UChi_02)            VMOVIDUP(4,%r8,Z1 )	\
            VMUL(Z2,T2,UChi_12)            VMOVIDUP(7,%r8,Z2 )	\
-	   VPREFETCHG(0,%r9)					   \
-	   VPREFETCHG(1,%r9)					   \
-	   VPREFETCHG(2,%r9)					   \
-	   VPREFETCHG(3,%r9)					   \
+	   VPREFETCH_M1(0,%r9)					   \
+	   VPREFETCH_M1(1,%r9)					   \
+	   VPREFETCH_M1(2,%r9)					   \
+	   VPREFETCH_M1(3,%r9)					   \
 	   /*18*/						\
            VMADDSUB(Z3,Chi_00,UChi_00)    VSHUF(Chi_01,T1)	\
            VMADDSUB(Z3,Chi_10,UChi_10)				\
@@ -644,10 +693,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
            VMADDSUB(Z4,Chi_10,UChi_11)    VSHUF(Chi_11,T2)	\
            VMADDSUB(Z5,Chi_00,UChi_02)    VMOVRDUP(4,%r8,Z4 )	\
            VMADDSUB(Z5,Chi_10,UChi_12)				\
-	   VPREFETCHG(4,%r9)					   \
-	   VPREFETCHG(5,%r9)					   \
-	   VPREFETCHG(6,%r9)					   \
-	   VPREFETCHG(7,%r9)					   \
+	   VPREFETCH_M1(4,%r9)					   \
+	   VPREFETCH_M1(5,%r9)					   \
+	   VPREFETCH_M1(6,%r9)					   \
+	   VPREFETCH_M1(7,%r9)					   \
 	   /*28*/						\
            VMADDSUB(Z0,T1,UChi_00)        VMOVRDUP(7,%r8,Z5 )	\
            VMADDSUB(Z0,T2,UChi_10)				\
@@ -674,15 +723,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
            VMADDSUB(Z4,Chi_11,UChi_11)    VSHUF(Chi_12,T2)	\
            VMADDSUB(Z5,Chi_01,UChi_02)    VMOVRDUP(5,%r8,Z4 )	\
            VMADDSUB(Z5,Chi_11,UChi_12)				\
-	   VPREFETCHG(9,%r8)				   \
-	   VPREFETCHG(10,%r8)					   \
-	   VPREFETCHG(11,%r8)					   \
-	   VPREFETCHG(12,%r8)					   \
-	   VPREFETCHG(13,%r8)					   \
-	   VPREFETCHG(14,%r8)					   \
-	   VPREFETCHG(15,%r8)					   \
-	   VPREFETCHG(16,%r8)					   \
-	   VPREFETCHG(17,%r8)					   \
+	   VPREFETCH_M1(9,%r8)				   \
+	   VPREFETCH_M1(10,%r8)					   \
+	   VPREFETCH_M1(11,%r8)					   \
+	   VPREFETCH_M1(12,%r8)					   \
+	   VPREFETCH_M1(13,%r8)					   \
+	   VPREFETCH_M1(14,%r8)					   \
+	   VPREFETCH_M1(15,%r8)					   \
+	   VPREFETCH_M1(16,%r8)					   \
+	   VPREFETCH_M1(17,%r8)					   \
 	   /*48*/						\
            VMADDSUB(Z0,T1,UChi_00)        VMOVRDUP(8,%r8,Z5 ) \
            VMADDSUB(Z0,T2,UChi_10)			      \
@@ -690,10 +739,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
            VMADDSUB(Z1,T2,UChi_11)			      \
            VMADDSUB(Z2,T1,UChi_02)			      \
            VMADDSUB(Z2,T2,UChi_12)			      \
-	   VPREFETCHG(8,%r9)					   \
-	   VPREFETCHG(9,%r9)					   \
-	   VPREFETCHG(10,%r9)					   \
-	   VPREFETCHG(11,%r9)					   \
+	   VPREFETCH_M1(8,%r9)					   \
+	   VPREFETCH_M1(9,%r9)					   \
+	   VPREFETCH_M1(10,%r9)					   \
+	   VPREFETCH_M1(11,%r9)					   \
 	   /*55*/					      \
            VMADDSUB(Z3,Chi_02,UChi_00)			      \
            VMADDSUB(Z3,Chi_12,UChi_10)			      \
@@ -712,56 +761,56 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
            VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10)   \
            VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11)   \
            VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12)   \
-	   VPREFETCHG(0,%r9)					   \
-	   VPREFETCHG(1,%r9)					   \
-	   VPREFETCHG(2,%r9)					   \
-	   VPREFETCHG(3,%r9)					   \
+	   VPREFETCH_M1(0,%r9)					   \
+	   VPREFETCH_M1(1,%r9)					   \
+	   VPREFETCH_M1(2,%r9)					   \
+	   VPREFETCH_M1(3,%r9)					   \
 	   /*8*/						   \
            VSHUF(Chi_01,T1)	  VSHUF(Chi_11,T2)	       	   \
            VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
            VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
            VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
-	   VPREFETCHG(4,%r9)					   \
-	   VPREFETCHG(5,%r9)					   \
-	   VPREFETCHG(6,%r9)					   \
-	   VPREFETCHG(7,%r9)					   \
+	   VPREFETCH_M1(4,%r9)					   \
+	   VPREFETCH_M1(5,%r9)					   \
+	   VPREFETCH_M1(6,%r9)					   \
+	   VPREFETCH_M1(7,%r9)					   \
 	   /*16*/					  	   \
            VMADDSUBIDUP(1,%r8,T1,UChi_00)     VMADDSUBIDUP(1,%r8,T2,UChi_10)	   \
            VMADDSUBIDUP(4,%r8,T1,UChi_01)     VMADDSUBIDUP(4,%r8,T2,UChi_11) \
            VMADDSUBIDUP(7,%r8,T1,UChi_02)     VMADDSUBIDUP(7,%r8,T2,UChi_12) \
-	   VPREFETCHG(8,%r9)					   \
-	   VPREFETCHG(9,%r9)					   \
-	   VPREFETCHG(10,%r9)					   \
-	   VPREFETCHG(11,%r9)					   \
+	   VPREFETCH_M1(8,%r9)					   \
+	   VPREFETCH_M1(9,%r9)					   \
+	   VPREFETCH_M1(10,%r9)					   \
+	   VPREFETCH_M1(11,%r9)					   \
            /*22*/						   \
            VSHUF(Chi_02,T1)    VSHUF(Chi_12,T2)	                   \
            VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
            VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
            VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
-	   VPREFETCH2(12,%r9)					   \
-	   VPREFETCH2(13,%r9)					   \
-	   VPREFETCH2(14,%r9)					   \
-	   VPREFETCH2(15,%r9)					   \
+	   VPREFETCH_M2(12,%r9)					   \
+	   VPREFETCH_M2(13,%r9)					   \
+	   VPREFETCH_M2(14,%r9)					   \
+	   VPREFETCH_M2(15,%r9)					   \
 	   /*30*/						   \
            VMADDSUBIDUP(2,%r8,T1,UChi_00)     VMADDSUBIDUP(2,%r8,T2,UChi_10)	   \
            VMADDSUBIDUP(5,%r8,T1,UChi_01)     VMADDSUBIDUP(5,%r8,T2,UChi_11)     \
-	   VPREFETCH2(16,%r9)					   \
-	   VPREFETCH2(17,%r9)					   \
-	   VPREFETCH2(18,%r9)					   \
-	   VPREFETCH2(19,%r9)					   \
+	   VPREFETCH_M2(16,%r9)					   \
+	   VPREFETCH_M2(17,%r9)					   \
+	   VPREFETCH_M2(18,%r9)					   \
+	   VPREFETCH_M2(19,%r9)					   \
            VMADDSUBIDUP(8,%r8,T1,UChi_02)     VMADDSUBIDUP(8,%r8,T2,UChi_12)     \
 	   /*36*/					           \
            VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
            VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
            VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
-	   VPREFETCH2(20,%r9)					   \
-	   VPREFETCH2(21,%r9)					   \
-	   VPREFETCH2(22,%r9)					   \
-	   VPREFETCH2(23,%r9)					   \
-	   VPREFETCHG(2,%r8)					   \
-	   VPREFETCHG(3,%r8)					   \
-	   VPREFETCH2(4,%r8)					   \
-	   VPREFETCH2(5,%r8)					   \
+	   VPREFETCH_M2(20,%r9)					   \
+	   VPREFETCH_M2(21,%r9)					   \
+	   VPREFETCH_M2(22,%r9)					   \
+	   VPREFETCH_M2(23,%r9)					   \
+	   VPREFETCH_G1(2,%r8)					   \
+	   VPREFETCH_G1(3,%r8)					   \
+	   VPREFETCH_G2(4,%r8)					   \
+	   VPREFETCH_G2(5,%r8)					   \
 	   /*42 insns*/						);
 
 #define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf)				   \
@@ -794,8 +843,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
            VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
            VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
            VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
-	   /*	   VPREFETCHG(2,%r8)*/				   \
-	   /*	   VPREFETCHG(3,%r8)*/				   \
+	   /*	   VPREFETCH1(2,%r8)*/				   \
+	   /*	   VPREFETCH1(3,%r8)*/				   \
 	   /*42 insns*/						);
 
 

From 661b0ab45de2e6765180fa60b21fff8fc9bbe2fc Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 30 Jun 2016 13:07:42 -0700
Subject: [PATCH 10/21] Updated to have perfect prefetching for the
 s-vectorised kernel with any cache blocking.

---
 lib/Stencil.h                                 |   5 +
 lib/qcd/action/fermion/WilsonKernelsAsmBody.h | 153 +++++++++---------
 lib/simd/Intel512wilson.h                     |  58 ++++---
 3 files changed, 119 insertions(+), 97 deletions(-)

diff --git a/lib/Stencil.h b/lib/Stencil.h
index bc015370..f5b6c288 100644
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
@@ -272,6 +272,11 @@
 	 if (local) return base + _entries[ent]._byte_offset;
 	 else       return _entries[ent]._byte_offset;
        }
+       inline uint64_t GetPFInfo(int ent,uint64_t base) {
+	 int local = _entries[ent]._is_local;
+	 if (local) return base + _entries[ent]._byte_offset;
+	 else       return        _entries[ent]._byte_offset;
+       }
 
        // Comms buffers
        std::vector<Vector<scalar_object> > u_simd_send_buf;
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
index 7373d2eb..4f3ef861 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
+++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
@@ -1,43 +1,44 @@
 {
-  int locala,perma, ptypea;
-  int localb,permb, ptypeb;
-  int localc,permc, ptypec;
-  uint64_t basea, baseb, basec;
-
+  int local,perm, ptype;
+  uint64_t base;
+  uint64_t basep;
   const uint64_t plocal =(uint64_t) & in._odata[0];
 
   //  vComplexF isigns[2] = { signs[0], signs[1] };
   vComplexF *isigns = &signs[0];
 
   MASK_REGS;
-
+  int nmax=U._grid->oSites();
   for(int site=0;site<Ns;site++) {
   int sU =lo.Reorder(ssU);
+  int ssn=ssU+1; 
+  if(ssn>=nmax) ssn=0;
+  int sUn=lo.Reorder(ssn);
   for(int s=0;s<Ls;s++) {
-  ss     =sU*Ls+s;
-
+  ss =sU*Ls+s;
+  ssn=sUn*Ls+s; 
   ////////////////////////////////
   // Xp
   ////////////////////////////////
-  int ent=ss*8;// 2*Ndim
-  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
-  PREFETCH1_CHIMU(basea);
+  int  ent=ss*8;// 2*Ndim
+  int nent=ssn*8;
+
   PF_GAUGE(Xp); 
+  base  = st.GetInfo(ptype,local,perm,Xp,ent,plocal); ent++;
+  PREFETCH1_CHIMU(base);
 
-  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
-  PREFETCH_CHIMU(baseb);
-  basec = st.GetInfo(ptypec,localc,permc,Zp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basec);
-
-  if ( locala ) {
+  basep = st.GetPFInfo(nent,plocal); nent++;
+  if ( local ) {
     LOAD64(%r10,isigns);
-    XM_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR3,perma);
+    XM_PROJMEM(base);
+    MAYBEPERM(PERMUTE_DIR3,perm);
   } else { 
-    LOAD_CHI(basea);
+    LOAD_CHI(base);
   }
+  base = st.GetInfo(ptype,local,perm,Yp,ent,plocal); ent++;
+  PREFETCH_CHIMU(base);
   {
-    MULT_2SPIN_DIR_PFXP(Xp,baseb);
+    MULT_2SPIN_DIR_PFXP(Xp,basep);
   }
   LOAD64(%r10,isigns);
   XM_RECON;
@@ -45,17 +46,18 @@
   ////////////////////////////////
   // Yp
   ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Tp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basea);
-  if ( localb ) {
+  basep = st.GetPFInfo(nent,plocal); nent++;
+  if ( local ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    YM_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR2,permb);
+    YM_PROJMEM(base);
+    MAYBEPERM(PERMUTE_DIR2,perm);
   } else { 
-    LOAD_CHI(baseb);
+    LOAD_CHI(base);
   }
+  base  = st.GetInfo(ptype,local,perm,Zp,ent,plocal); ent++;
+  PREFETCH_CHIMU(base);
   {
-    MULT_2SPIN_DIR_PFYP(Yp,basec);
+    MULT_2SPIN_DIR_PFYP(Yp,basep);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   YM_RECON_ACCUM;
@@ -63,17 +65,18 @@
   ////////////////////////////////
   // Zp
   ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Xm,ent,plocal); ent++;
-  PREFETCH_CHIMU(baseb);
-  if ( localc ) {
+  basep = st.GetPFInfo(nent,plocal); nent++;
+  if ( local ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    ZM_PROJMEM(basec);
-    MAYBEPERM(PERMUTE_DIR1,permc);
+    ZM_PROJMEM(base);
+    MAYBEPERM(PERMUTE_DIR1,perm);
   } else { 
-    LOAD_CHI(basec);
+    LOAD_CHI(base);
   }
+  base  = st.GetInfo(ptype,local,perm,Tp,ent,plocal); ent++;
+  PREFETCH_CHIMU(base);
   {
-    MULT_2SPIN_DIR_PFZP(Zp,basea);
+    MULT_2SPIN_DIR_PFZP(Zp,basep);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   ZM_RECON_ACCUM;
@@ -81,17 +84,18 @@
   ////////////////////////////////
   // Tp
   ////////////////////////////////
-  basec = st.GetInfo(ptypec,localc,permc,Ym,ent,plocal); ent++;
-  PREFETCH_CHIMU(basec);
-  if ( locala ) {
+  basep = st.GetPFInfo(nent,plocal); nent++;
+  if ( local ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    TM_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR0,perma);
+    TM_PROJMEM(base);
+    MAYBEPERM(PERMUTE_DIR0,perm);
   } else { 
-    LOAD_CHI(basea);
+    LOAD_CHI(base);
   }
+  base = st.GetInfo(ptype,local,perm,Xm,ent,plocal); ent++;
+  PREFETCH_CHIMU(base);
   {
-    MULT_2SPIN_DIR_PFTP(Tp,baseb);
+    MULT_2SPIN_DIR_PFTP(Tp,basep);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   TM_RECON_ACCUM;
@@ -99,17 +103,19 @@
   ////////////////////////////////
   // Xm
   ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++;
-  PREFETCH_CHIMU(basea);
-  if ( localb ) {
+  basep= (uint64_t) &out._odata[ss];
+  //  basep= st.GetPFInfo(nent,plocal); nent++;
+  if ( local ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    XP_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR3,permb);
+    XP_PROJMEM(base);
+    MAYBEPERM(PERMUTE_DIR3,perm);
   } else { 
-    LOAD_CHI(baseb);
+    LOAD_CHI(base);
   }
+  base = st.GetInfo(ptype,local,perm,Ym,ent,plocal); ent++;
+  PREFETCH_CHIMU(base);
   {
-    MULT_2SPIN_DIR_PFXM(Xm,basec);
+    MULT_2SPIN_DIR_PFXM(Xm,basep);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   XP_RECON_ACCUM;
@@ -117,17 +123,18 @@
   ////////////////////////////////
   // Ym
   ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++;
-  PREFETCH_CHIMU(baseb);
-  if ( localc ) {
+  basep= st.GetPFInfo(nent,plocal); nent++;
+  if ( local ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    YP_PROJMEM(basec);
-    MAYBEPERM(PERMUTE_DIR2,permc);
+    YP_PROJMEM(base);
+    MAYBEPERM(PERMUTE_DIR2,perm);
   } else { 
-    LOAD_CHI(basec);
+    LOAD_CHI(base);
   }
+  base = st.GetInfo(ptype,local,perm,Zm,ent,plocal); ent++;
+  PREFETCH_CHIMU(base);
   {
-    MULT_2SPIN_DIR_PFYM(Ym,basea);
+    MULT_2SPIN_DIR_PFYM(Ym,basep);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   YP_RECON_ACCUM;
@@ -135,17 +142,18 @@
   ////////////////////////////////
   // Zm
   ////////////////////////////////
-  basec = (uint64_t)&out._odata[ss];
-  PREFETCH_CHIMU(basec);
-  if ( locala ) {
+  basep= st.GetPFInfo(nent,plocal); nent++;
+  if ( local ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    ZP_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR1,perma);
+    ZP_PROJMEM(base);
+    MAYBEPERM(PERMUTE_DIR1,perm);
   } else { 
-    LOAD_CHI(basea);
+    LOAD_CHI(base);
   }
+  base = st.GetInfo(ptype,local,perm,Tm,ent,plocal); ent++;
+  PREFETCH_CHIMU(base);
   {
-    MULT_2SPIN_DIR_PFZM(Zm,baseb);
+    MULT_2SPIN_DIR_PFZM(Zm,basep);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   ZP_RECON_ACCUM;
@@ -153,23 +161,24 @@
   ////////////////////////////////
   // Tm
   ////////////////////////////////
-  //  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
-  //  PREFETCH_CHIMU(basea);
-  if ( localb ) {
+  basep= st.GetPFInfo(nent,plocal); nent++;
+  if ( local ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    TP_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR0,permb);
+    TP_PROJMEM(base);
+    MAYBEPERM(PERMUTE_DIR0,perm);
   } else { 
-    LOAD_CHI(baseb);
+    LOAD_CHI(base);
   }
+  base= (uint64_t) &out._odata[ss];
+  PREFETCH_CHIMU(base);
   {
-    MULT_2SPIN_DIR_PFTM(Tm,basec);
+    MULT_2SPIN_DIR_PFTM(Tm,basep);
   }
-  //  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   TP_RECON_ACCUM;
 
-  SAVE_RESULT(&out._odata[ss],basec);
+  basep= st.GetPFInfo(nent,plocal); nent++;
+  SAVE_RESULT(base,basep);
   
   }
   ssU++;
diff --git a/lib/simd/Intel512wilson.h b/lib/simd/Intel512wilson.h
index 9deffd80..660d07d6 100644
--- a/lib/simd/Intel512wilson.h
+++ b/lib/simd/Intel512wilson.h
@@ -261,8 +261,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define XM_PROJMEM(PTR) \
   LOAD64(%r8,PTR)\
   __asm__ (								\
-	   SHUF_CHIMU23i						\
 	   LOAD_CHIi \
+	   SHUF_CHIMU23i						\
 	   VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30)\
 	   VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31)\
 	   VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32)\
@@ -290,8 +290,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define ZM_PROJMEM(PTR) \
   LOAD64(%r8,PTR)							\
   __asm__ (								\
-	   SHUF_CHIMU23i						\
            LOAD_CHIi \
+	   SHUF_CHIMU23i						\
 	   VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20)\
 	   VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21)\
 	   VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22)\
@@ -548,24 +548,25 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define AVX512_PF_L2_TABLE
 #undef  AVX512_PF_L2_LINEAR
 
-#ifdef AVX512_PF_L2_TABLE
-#define VPREFETCH_P1(A,B)  VPREFETCH1(A,B)
-#define VPREFETCH_P2(A,B) VPREFETCH1(A,B)
-#else
-#define VPREFETCH_P1(A,B)
-#define VPREFETCH_P2(A,B)
-#endif
-#ifdef AVX512_PF_L2_LINEAR
-#define VPREFETCH_M1(A,B) 
+#ifdef AVX512_PF_L2_TABLE  
+// P1 Fetches the base pointer for next link into L1 with P1
+// M1 Fetches the next site pointer into L2
+#define VPREFETCH_P1(A,B) VPREFETCH1(A,B)
+#define VPREFETCH_P2(A,B) 
+#define VPREFETCH_M1(A,B) VPREFETCH2(A,B)
 #define VPREFETCH_M2(A,B) 
-#else 
+#endif
+
+#ifdef AVX512_PF_L2_LINEAR
 #define VPREFETCH_M1(A,B) VPREFETCH1(A,B)
 #define VPREFETCH_M2(A,B) VPREFETCH2(A,B)
+#define VPREFETCH_P1(A,B) 
+#define VPREFETCH_P2(A,B)
 #endif
+
 #ifdef AVX512_PF_L2_GAUGE
 #define VPREFETCH_G1(A,B)  VPREFETCH1(A,B)
 #define VPREFETCH_G2(A,B)  VPREFETCH2(A,B)
-#else
 #endif
 
 #define PF_GAUGE(A) \
@@ -593,21 +594,26 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 	   VSTORE(11,%r8,result_32) 	VPREFETCH_M1(11,%r9)	\
 						);
 
+#ifdef AVX512_PF_L2_TABLE
 #define PREFETCH_CHIMU(A) \
   LOAD64(%r9,A)							\
   __asm__ (							\
-	   VPREFETCH_P2(0,%r9)					\
-	   VPREFETCH_P2(1,%r9)					\
-	   VPREFETCH_P2(2,%r9)					\
-	   VPREFETCH_P2(3,%r9)					\
-	   VPREFETCH_P2(4,%r9)					\
-	   VPREFETCH_P2(5,%r9)					\
-	   VPREFETCH_P2(6,%r9)					\
-	   VPREFETCH_P2(7,%r9)					\
-	   VPREFETCH_P2(8,%r9)					\
-	   VPREFETCH_P2(9,%r9)					\
-	   VPREFETCH_P2(10,%r9)					\
-	   VPREFETCH_P2(11,%r9));
+	   VPREFETCH_P1(0,%r9)					\
+	   VPREFETCH_P1(1,%r9)					\
+	   VPREFETCH_P1(2,%r9)					\
+	   VPREFETCH_P1(3,%r9)					\
+	   VPREFETCH_P1(4,%r9)					\
+	   VPREFETCH_P1(5,%r9)					\
+	   VPREFETCH_P1(6,%r9)					\
+	   VPREFETCH_P1(7,%r9)					\
+	   VPREFETCH_P1(8,%r9)					\
+	   VPREFETCH_P1(9,%r9)					\
+	   VPREFETCH_P1(10,%r9)					\
+	   VPREFETCH_P1(11,%r9));
+
+#else
+#define PREFETCH_CHIMU(A)
+#endif
 
 #define PREFETCH1_CHIMU(A) \
   LOAD64(%r9,A)							\
@@ -811,6 +817,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 	   VPREFETCH_G1(3,%r8)					   \
 	   VPREFETCH_G2(4,%r8)					   \
 	   VPREFETCH_G2(5,%r8)					   \
+	   VPREFETCH_G2(6,%r8)					   \
+	   VPREFETCH_G2(7,%r8)					   \
 	   /*42 insns*/						);
 
 #define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf)				   \

From 532f41dd61c3105369f58fb6dbd9329d29eca90b Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 30 Jun 2016 14:00:34 -0700
Subject: [PATCH 11/21] Asm only for avx512

---
 lib/qcd/action/fermion/WilsonKernels.cc | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/lib/qcd/action/fermion/WilsonKernels.cc
index 672c23d6..4edd25f9 100644
--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
@@ -42,12 +42,15 @@ void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,LebesgueOrder &lo,Dou
 						  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 						  int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
 {
+#ifdef AVX512
   if ( AsmOpt ) {
 
     WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
 
   } else {
-
+#else
+  {  
+#endif
     for(int site=0;site<Ns;site++) {
       for(int s=0;s<Ls;s++) {
 	if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);

From 6d58cb2a68bf939e253acff4b3e55d40aa5f9f7b Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 19 Jun 2016 11:45:58 -0700
Subject: [PATCH 12/21] Enable reordering of the loops in the assembler for
 cache friendliness. This gets in the way of L2 prefetching, however. Do next
 next link in stencil prefetching.

---
 benchmarks/Benchmark_dwf_sweep.cc             | 358 ++++++++++++++++++
 lib/qcd/action/fermion/DomainWallFermion.h    |   2 +-
 lib/qcd/action/fermion/WilsonFermion.cc       |  14 +-
 lib/qcd/action/fermion/WilsonFermion.h        |   6 +-
 lib/qcd/action/fermion/WilsonFermion5D.cc     |   4 +-
 lib/qcd/action/fermion/WilsonKernels.cc       |  20 +-
 lib/qcd/action/fermion/WilsonKernels.h        |  14 +-
 lib/qcd/action/fermion/WilsonKernelsAsm.cc    |  36 +-
 lib/qcd/action/fermion/WilsonKernelsAsmBody.h |  76 ++--
 .../action/fermion/WilsonKernelsAsmBody.h.ab  | 163 ++++++++
 lib/qcd/action/fermion/WilsonKernelsHand.cc   |  36 +-
 lib/simd/Intel512common.h                     |   6 +-
 lib/simd/Intel512wilson.h                     |  32 +-
 lib/stencil/Lebesgue.cc                       |  18 +-
 lib/stencil/Lebesgue.h                        |   1 +
 15 files changed, 670 insertions(+), 116 deletions(-)
 create mode 100644 benchmarks/Benchmark_dwf_sweep.cc
 create mode 100644 lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab

diff --git a/benchmarks/Benchmark_dwf_sweep.cc b/benchmarks/Benchmark_dwf_sweep.cc
new file mode 100644
index 00000000..302059a4
--- /dev/null
+++ b/benchmarks/Benchmark_dwf_sweep.cc
@@ -0,0 +1,358 @@
+
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./benchmarks/Benchmark_dwf_sweep.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid.h>
+#include <PerfCount.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+// Minimal wrapper around one value of type d; nothing else in this file refers to it.
+template<class d>
+struct scal {
+  d internal;
+};
+// Gamma-matrix tags for directions mu=0..3 (X,Y,Z,T); in this file only the CHECK-guarded reference code in benchDw indexes it.
+  Gamma::GammaMatrix Gmu [] = {
+    Gamma::GammaX,
+    Gamma::GammaY,
+    Gamma::GammaZ,
+    Gamma::GammaT
+  };
+
+void benchDw(std::vector<int> & L, int Ls, int threads, int report =0 );
+void benchsDw(std::vector<int> & L, int Ls, int threads, int report=0 );
+// Driver: sweeps 4d lattice shapes, printing benchDw and benchsDw flop rates per shape, then prints counter reports for 16^4.
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  const int Ls=16; // passed as the Ls argument of every benchDw/benchsDw call below
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+  // Setting the ASMOPT environment variable (to any value) switches WilsonKernelsStatic::AsmOpt on.
+  if ( getenv("ASMOPT") )  {
+    QCD::WilsonKernelsStatic::AsmOpt=1;
+  } else { 
+    QCD::WilsonKernelsStatic::AsmOpt=0;
+  }
+
+  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "Volume \t\t\tProcs \t Dw \t eoDw \t sDw \t eosDw (Mflop/s)  "<<std::endl;
+  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
+  // L doubles from 8 while L<Lmax; Lmax defaults to 32 and can be overridden with the LMAX environment variable.
+  int Lmax=32;
+  if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX"));
+  for (int L=8;L<Lmax;L*=2){
+    std::vector<int> latt4(4,L);
+    for(int d=4;d>0;d--){ // d=4 benchmarks L^4; d=3,2,1 each double latt4[d] first (latt4[0] is never doubled)
+      if ( d<=3 ) latt4[d]*=2;
+      std::cout << GridLogMessage <<"\t";
+      for(int d=0;d<Nd;d++){ // this d shadows the outer loop variable
+	std::cout<<latt4[d]<<"x";
+      }
+      std::cout <<Ls<<"\t" ;
+      benchDw (latt4,Ls,threads,0);
+      benchsDw(latt4,Ls,threads,0);
+      std::cout<<std::endl;
+    }
+  }
+  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
+  {
+    std::vector<int> latt4(4,16);
+    std::cout<<GridLogMessage << "16^4 Dw miss rate"<<std::endl;
+    benchDw (latt4,Ls,threads,1); // report=1: counter report is printed instead of flop rates
+    std::cout<<GridLogMessage << "16^4 sDw miss rate"<<std::endl;
+    benchsDw(latt4,Ls,threads,1);
+  }
+
+  Grid_finalize();
+}
+
+#undef CHECK
+// Times Dhop and DhopEO of a DomainWallFermionR on 4d lattice latt4 with extent Ls; nonzero report prints the Dhop counter report instead of flop rates. threads is unused.
+void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
+{
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+  // NOTE(review): the four grid pointers above are never released in this function — confirm whether the make* helpers hand over ownership.
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+
+#ifdef CHECK 
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
+  LatticeFermion src   (FGrid); random(RNG5,src);
+  LatticeGaugeField Umu(UGrid); 
+  random(RNG4,Umu);
+#else 
+  LatticeFermion src   (FGrid); src=zero;
+  LatticeGaugeField Umu(UGrid); Umu=zero;
+#endif
+
+  LatticeFermion result(FGrid); result=zero;
+  LatticeFermion    ref(FGrid);    ref=zero;
+  LatticeFermion    tmp(FGrid);
+  LatticeFermion    err(FGrid);
+
+  ColourMatrix cm = Complex(1.0,0.0);
+
+
+  LatticeGaugeField Umu5d(FGrid); 
+
+  // replicate across fifth dimension
+  for(int ss=0;ss<Umu._grid->oSites();ss++){
+    for(int s=0;s<Ls;s++){
+      Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
+    }
+  }
+
+  ////////////////////////////////////
+  // Naive wilson implementation
+  ////////////////////////////////////
+  std::vector<LatticeColourMatrix> U(4,FGrid);
+  for(int mu=0;mu<Nd;mu++){
+    U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
+  }
+  // Reference result built from Cshift and Gamma operations; compiled only when CHECK is defined.
+#ifdef CHECK
+  if (1)
+  {
+    ref = zero;
+    for(int mu=0;mu<Nd;mu++){
+
+      tmp = U[mu]*Cshift(src,mu+1,1);
+      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
+
+      tmp =adj(U[mu])*src;
+      tmp =Cshift(tmp,mu+1,-1);
+      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
+    }
+    ref = -0.5*ref;
+  }
+#endif
+
+  RealD mass=0.1;
+  RealD M5  =1.8;
+  RealD NP = UGrid->_Nprocessors;
+
+  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+  
+  double t0=usecond();
+  Dw.Dhop(src,result,0);
+  double t1=usecond();
+  // Repeat count from the single timed call above: 1 + 5e6/(t1-t0); presumably targets ~5 s if usecond() returns microseconds — TODO confirm.
+  int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
+  // Fewer than 5 repeats: terminate the whole program with status 0 (Grid_finalize in main is not reached).
+  if (ncall < 5 ) exit(0);
+  // One more untimed call before the measured loop.
+  Dw.Dhop(src,result,0);
+
+  PerformanceCounter Counter(8);
+  Counter.Start();
+  t0=usecond();
+  for(int i=0;i<ncall;i++){
+    Dw.Dhop(src,result,0);
+  }
+  t1=usecond();
+  Counter.Stop();
+  if ( report ) {
+    Counter.Report();
+  }
+  // The flop rate is printed only when no counter report was requested.
+  if ( ! report ) 
+    {
+      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+      double flops=1344*volume*ncall; // 1344 flops counted per 5d site per call
+      std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
+    }
+  
+#ifdef CHECK
+    err = ref-result; 
+    RealD errd = norm2(err);
+    if ( errd> 1.0e-4 ) {
+      std::cout<<GridLogMessage << "oops !!! norm diff   "<< norm2(err)<<std::endl;
+      exit(-1);
+    }
+#endif
+    
+  LatticeFermion src_e (FrbGrid);
+  LatticeFermion src_o (FrbGrid);
+  LatticeFermion r_e   (FrbGrid);
+  LatticeFermion r_o   (FrbGrid);
+  LatticeFermion r_eo  (FGrid);
+  
+  pickCheckerboard(Even,src_e,src);
+  pickCheckerboard(Odd,src_o,src);
+  // Same measurement for DhopEO on the odd-checkerboard source: one untimed call, then ncall timed calls.
+  {
+    Dw.DhopEO(src_o,r_e,DaggerNo);
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+      Dw.DhopEO(src_o,r_e,DaggerNo);
+    }
+    double t1=usecond();
+    
+    if(!report){
+      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+      double flops=(1344.0*volume*ncall)/2; // half of the full-lattice flop count used for Dhop
+      std::cout<< flops/(t1-t0);
+    }
+  }
+  
+}
+// benchsDw: times Dhop and DhopEO of WilsonFermion5D<DomainWallRedBlack5dImplF> on the sUGrid/sFGrid/sFrbGrid layouts; nonzero report prints counter reports instead of flop rates. threads is unused.
+#undef CHECK_SDW
+void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
+{
+  // NOTE(review): none of the grid pointers created below is released in this function — confirm whether the make* helpers hand over ownership.
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
+  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
+  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+
+#ifdef CHECK_SDW
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
+  LatticeFermion src   (FGrid); random(RNG5,src);
+  LatticeGaugeField Umu(UGrid); 
+  random(RNG4,Umu);
+#else 
+  LatticeFermion src   (FGrid); src=zero;
+  LatticeGaugeField Umu(UGrid); Umu=zero;
+#endif
+
+  LatticeFermion result(FGrid); result=zero;
+  LatticeFermion    ref(FGrid);    ref=zero;
+  LatticeFermion    tmp(FGrid);
+  LatticeFermion    err(FGrid);
+
+  ColourMatrix cm = Complex(1.0,0.0);
+
+  LatticeGaugeField Umu5d(FGrid); 
+
+  // replicate across fifth dimension
+  for(int ss=0;ss<Umu._grid->oSites();ss++){
+    for(int s=0;s<Ls;s++){
+      Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
+    }
+  }
+
+
+  RealD mass=0.1;
+  RealD M5  =1.8;
+
+    typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
+    LatticeFermionF ssrc(sFGrid);
+    LatticeFermionF sref(sFGrid);
+    LatticeFermionF sresult(sFGrid);
+    WilsonFermion5DF sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5);
+    // Copy src into ssrc one site at a time; the coordinate vector is ordered {s,x,y,z,t}.
+    for(int x=0;x<latt4[0];x++){
+    for(int y=0;y<latt4[1];y++){
+    for(int z=0;z<latt4[2];z++){
+    for(int t=0;t<latt4[3];t++){
+    for(int s=0;s<Ls;s++){
+      std::vector<int> site({s,x,y,z,t});
+      SpinColourVectorF tmp;
+      peekSite(tmp,src,site);
+      pokeSite(tmp,ssrc,site);
+    }}}}}
+    // Single timed call, used only to size ncall.
+    double t0=usecond();
+    sDw.Dhop(ssrc,sresult,0);
+    double t1=usecond();
+
+    int ncall =1+(int) ((5.0*1000*1000)/(t1-t0)); // unlike benchDw, there is no early exit when ncall is small
+
+    PerformanceCounter Counter(8);
+    Counter.Start();
+    t0=usecond();
+    for(int i=0;i<ncall;i++){
+      sDw.Dhop(ssrc,sresult,0);
+    }
+    t1=usecond();
+    Counter.Stop();
+
+    if ( report ) {
+      Counter.Report();
+    } else { 
+
+      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+      double flops=1344*volume*ncall;
+      std::cout<<"\t"<< flops/(t1-t0);
+    }
+
+    // NOTE(review): the checkerboard fields below are LatticeFermion while ssrc/sr_eo are LatticeFermionF — presumably relies on a single-precision default build; confirm.
+    LatticeFermionF sr_eo(sFGrid);
+    LatticeFermionF serr(sFGrid);
+    
+    LatticeFermion ssrc_e (sFrbGrid);
+    LatticeFermion ssrc_o (sFrbGrid);
+    LatticeFermion sr_e   (sFrbGrid);
+    LatticeFermion sr_o   (sFrbGrid);
+      
+    pickCheckerboard(Even,ssrc_e,ssrc);
+    pickCheckerboard(Odd,ssrc_o,ssrc);
+
+    setCheckerboard(sr_eo,ssrc_o);
+    setCheckerboard(sr_eo,ssrc_e);
+    
+    sr_e = zero;
+    sr_o = zero;
+    // One untimed DhopEO call, then ncall timed calls under a second performance counter.
+    sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
+    PerformanceCounter CounterSdw(8);
+    CounterSdw.Start();
+    t0=usecond();
+    for(int i=0;i<ncall;i++){
+      sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
+    }
+    t1=usecond();
+    CounterSdw.Stop();
+
+    if ( report ) { 
+      CounterSdw.Report();
+    } else {
+
+      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+      double flops=(1344.0*volume*ncall)/2; // half of the flop count used for the Dhop rate above
+      std::cout<<"\t"<< flops/(t1-t0);
+    }
+}
+
+
diff --git a/lib/qcd/action/fermion/DomainWallFermion.h b/lib/qcd/action/fermion/DomainWallFermion.h
index b05733aa..8e41aa63 100644
--- a/lib/qcd/action/fermion/DomainWallFermion.h
+++ b/lib/qcd/action/fermion/DomainWallFermion.h
@@ -63,7 +63,7 @@ namespace Grid {
 	Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
 	assert(zdata->n==this->Ls);
 	
-	std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
+	//	std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
 	// Call base setter
 	this->SetCoefficientsTanh(zdata,1.0,0.0);
 
diff --git a/lib/qcd/action/fermion/WilsonFermion.cc b/lib/qcd/action/fermion/WilsonFermion.cc
index 2618286e..59632409 100644
--- a/lib/qcd/action/fermion/WilsonFermion.cc
+++ b/lib/qcd/action/fermion/WilsonFermion.cc
@@ -53,6 +53,8 @@ namespace QCD {
 	StencilEven(&Hgrid,npoint,Even,directions,displacements), // source is Even
 	StencilOdd (&Hgrid,npoint,Odd ,directions,displacements), // source is Odd
 	mass(_mass),
+	Lebesgue(_grid),
+	LebesgueEvenOdd(_cbgrid),
 	Umu(&Fgrid),
 	UmuEven(&Hgrid),
 	UmuOdd (&Hgrid) 
@@ -228,7 +230,7 @@ PARALLEL_FOR_LOOP
     
     out.checkerboard = in.checkerboard;
     
-    DhopInternal(Stencil,Umu,in,out,dag);
+    DhopInternal(Stencil,Lebesgue,Umu,in,out,dag);
   }
   
   template<class Impl>
@@ -239,7 +241,7 @@ PARALLEL_FOR_LOOP
     assert(in.checkerboard==Even);
     out.checkerboard = Odd;
     
-    DhopInternal(StencilEven,UmuOdd,in,out,dag);
+    DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag);
   }
   
   template<class Impl>
@@ -250,7 +252,7 @@ PARALLEL_FOR_LOOP
     assert(in.checkerboard==Odd);
     out.checkerboard = Even;
     
-    DhopInternal(StencilOdd,UmuEven,in,out,dag);
+    DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag);
   }
   
   template<class Impl>
@@ -285,7 +287,7 @@ PARALLEL_FOR_LOOP
   };
 
   template<class Impl>
-  void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
+  void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,LebesgueOrder& lo,DoubledGaugeField & U,
 					 const FermionField &in, FermionField &out,int dag) 
   {
     assert((dag==DaggerNo) ||(dag==DaggerYes));
@@ -296,12 +298,12 @@ PARALLEL_FOR_LOOP
     if ( dag == DaggerYes ) {
 PARALLEL_FOR_LOOP
       for(int sss=0;sss<in._grid->oSites();sss++){
-	Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,1,1,in,out);
+	Kernels::DiracOptDhopSiteDag(st,lo,U,st.comm_buf,sss,sss,1,1,in,out);
       }
     } else {
 PARALLEL_FOR_LOOP
       for(int sss=0;sss<in._grid->oSites();sss++){
-	Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,1,1,in,out);
+	Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sss,sss,1,1,in,out);
       }
     }
   };
diff --git a/lib/qcd/action/fermion/WilsonFermion.h b/lib/qcd/action/fermion/WilsonFermion.h
index c1f4d682..3de2cac4 100644
--- a/lib/qcd/action/fermion/WilsonFermion.h
+++ b/lib/qcd/action/fermion/WilsonFermion.h
@@ -111,7 +111,7 @@ namespace Grid {
 			 const FermionField &B,
 			 int dag);
 
-      void DhopInternal(StencilImpl & st,DoubledGaugeField & U,
+      void DhopInternal(StencilImpl & st,LebesgueOrder & lo,DoubledGaugeField & U,
 			const FermionField &in, FermionField &out,int dag) ;
 
       // Constructor
@@ -146,6 +146,10 @@ namespace Grid {
       DoubledGaugeField Umu;
       DoubledGaugeField UmuEven;
       DoubledGaugeField UmuOdd;
+
+      LebesgueOrder Lebesgue;
+      LebesgueOrder LebesgueEvenOdd;
+
       
     };
 
diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc
index 66047b72..08069bed 100644
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -321,14 +321,14 @@ PARALLEL_FOR_LOOP
     for(int ss=0;ss<U._grid->oSites();ss++){
 	int sU=ss;
 	int sF=LLs*sU;
-	Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
+	Kernels::DiracOptDhopSiteDag(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
     }
   } else {
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<U._grid->oSites();ss++){
       int sU=ss;
       int sF=LLs*sU;
-      Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
+      Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
     }
   }
 }
diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/lib/qcd/action/fermion/WilsonKernels.cc
index 63ba553d..672c23d6 100644
--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
@@ -38,20 +38,20 @@ template<class Impl>
 WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};
 
 template<class Impl> 
-void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 						  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 						  int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
 {
   if ( AsmOpt ) {
 
-    WilsonKernels<Impl>::DiracOptAsmDhopSite(st,U,buf,sF,sU,Ls,Ns,in,out);
+    WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
 
   } else {
 
     for(int site=0;site<Ns;site++) {
       for(int s=0;s<Ls;s++) {
-	if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,U,buf,sF,sU,in,out);
-	else         WilsonKernels<Impl>::DiracOptGenericDhopSite(st,U,buf,sF,sU,in,out);
+	if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);
+	else         WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out);
 	sF++;
       }
       sU++;
@@ -61,17 +61,17 @@ void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
 }
 
 template<class Impl> 
-void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 					   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					   int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
 {
   // No asm implementation yet.
-  //  if ( AsmOpt )     WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,U,buf,sF,sU,in,out);
+  //  if ( AsmOpt )     WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
   //  else
   for(int site=0;site<Ns;site++) {
     for(int s=0;s<Ls;s++) {
-      if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,U,buf,sF,sU,in,out);
-      else         WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,U,buf,sF,sU,in,out);
+      if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
+      else         WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
       sF++;
     }
     sU++;
@@ -84,7 +84,7 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField
   ////////////////////////////////////////////
 
 template<class Impl> 
-void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 					   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					   int sF,int sU,const FermionField &in, FermionField &out)
 {
@@ -262,7 +262,7 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaug
 
   // Need controls to do interior, exterior, or both
 template<class Impl> 
-void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 						  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 						  int sF,int sU,const FermionField &in, FermionField &out)
 {
diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h
index da6751dd..231fa293 100644
--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/lib/qcd/action/fermion/WilsonKernels.h
@@ -53,11 +53,11 @@ namespace Grid {
      
     public:
 
-     void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
+     void DiracOptDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 			   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			   int sF, int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
       
-     void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+     void DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,int Ls, int Ns, const FermionField &in,FermionField &out);
 
@@ -67,24 +67,24 @@ namespace Grid {
 
     private:
      // Specialised variants
-     void DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
+     void DiracOptGenericDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 			   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			   int sF,int sU, const FermionField &in, FermionField &out);
       
-     void DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+     void DiracOptGenericDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,const FermionField &in,FermionField &out);
 
-     void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
+     void DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
 
 
-     void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+     void DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,const FermionField &in, FermionField &out);
      
-     void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+     void DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 				 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 				 int sF,int sU,const FermionField &in, FermionField &out);
     public:
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
index bccf72c7..cdbe2c8e 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -39,9 +39,9 @@ namespace QCD {
   // Default to no assembler implementation
   ///////////////////////////////////////////////////////////
 template<class Impl>
-void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-					       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
+					       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 {
   assert(0);
 }
@@ -71,9 +71,9 @@ static int signInit = setupSigns();
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 
 template<>
-void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 						     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-						     int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
+						     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 
 #undef VMOVIDUP
@@ -85,31 +85,31 @@ void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaug
 #define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
 template<>
-void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								   int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
+								   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 
 #endif
 
-template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							      int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+							      int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 
-template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
+template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
+							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
+							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
+							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
+							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 }}
 
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
index aae049e2..bd96b7d5 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
+++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
@@ -1,7 +1,8 @@
 {
   int locala,perma, ptypea;
   int localb,permb, ptypeb;
-  uint64_t basea, baseb;
+  int localc,permc, ptypec;
+  uint64_t basea, baseb, basec;
   uint64_t basex;
   const uint64_t plocal =(uint64_t) & in._odata[0];
 
@@ -11,14 +12,22 @@
   MASK_REGS;
 
   for(int site=0;site<Ns;site++) {
+  int sU=lo.Reorder(ssU);
+
   for(int s=0;s<Ls;s++) {
+  ss     =sU*Ls+s;
 
   ////////////////////////////////
   // Xp
   ////////////////////////////////
   int ent=ss*8;// 2*Ndim
   basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basea);
   baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  PREFETCH_CHIMU(baseb);
+  basec = st.GetInfo(ptypec,localc,permc,Zp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basec);
+
   basex = basea;
 
   if ( locala ) {
@@ -38,6 +47,7 @@
   // Yp
   ////////////////////////////////
   basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basea);
   if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     YM_PROJMEM(baseb);
@@ -46,7 +56,7 @@
     LOAD_CHI(baseb);
   }
   {
-    MULT_2SPIN_DIR_PFYP(Yp,basea);
+    MULT_2SPIN_DIR_PFYP(Yp,basec);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   YM_RECON_ACCUM;
@@ -55,15 +65,16 @@
   // Zp
   ////////////////////////////////
   baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
-  if ( locala ) {
+  PREFETCH_CHIMU(baseb);
+  if ( localc ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    ZM_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR1,perma);
+    ZM_PROJMEM(basec);
+    MAYBEPERM(PERMUTE_DIR1,permc);
   } else { 
-    LOAD_CHI(basea);
+    LOAD_CHI(basec);
   }
   {
-    MULT_2SPIN_DIR_PFZP(Zp,baseb);
+    MULT_2SPIN_DIR_PFZP(Zp,basea);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   ZM_RECON_ACCUM;
@@ -71,16 +82,17 @@
   ////////////////////////////////
   // Tp
   ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
-  if ( localb ) {
+  basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basec);
+  if ( locala ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    TM_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR0,permb);
+    TM_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR0,perma);
   } else { 
-    LOAD_CHI(baseb);
+    LOAD_CHI(basea);
   }
   {
-    MULT_2SPIN_DIR_PFTP(Tp,basea);
+    MULT_2SPIN_DIR_PFTP(Tp,baseb);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   TM_RECON_ACCUM;
@@ -88,16 +100,17 @@
   ////////////////////////////////
   // Xm
   ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
-  if ( locala ) {
+  basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basea);
+  if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    XP_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR3,perma);
+    XP_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR3,permb);
   } else { 
-    LOAD_CHI(basea);
+    LOAD_CHI(baseb);
   }
   {
-    MULT_2SPIN_DIR_PFXM(Xm,baseb);
+    MULT_2SPIN_DIR_PFXM(Xm,basec);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   XP_RECON_ACCUM;
@@ -105,13 +118,14 @@
   ////////////////////////////////
   // Ym
   ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
-  if ( localb ) {
+  baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++;
+  PREFETCH_CHIMU(baseb);
+  if ( localc ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    YP_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR2,permb);
+    YP_PROJMEM(basec);
+    MAYBEPERM(PERMUTE_DIR2,permc);
   } else { 
-    LOAD_CHI(baseb);
+    LOAD_CHI(basec);
   }
   {
     MULT_2SPIN_DIR_PFYM(Ym,basea);
@@ -122,7 +136,8 @@
   ////////////////////////////////
   // Zm
   ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basec);
   if ( locala ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     ZP_PROJMEM(basea);
@@ -140,6 +155,7 @@
   // Tm
   ////////////////////////////////
   basea = (uint64_t)&out._odata[ss];
+  PREFETCH_CHIMU(basea);
   if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     TP_PROJMEM(baseb);
@@ -148,17 +164,15 @@
     LOAD_CHI(baseb);
   }
   {
-    MULT_2SPIN_DIR_PFTM(Tm,basea);
+    MULT_2SPIN_DIR_PFTM(Tm,basec);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   TP_RECON_ACCUM;
 
-  PREFETCH_CHIMU(basex);
+  //  PREFETCH_CHIMU(basex);
   SAVE_RESULT(&out._odata[ss]);
-
   
-  ss++;
-  } 
-  sU++;
+  }
+  ssU++;
   }
 }
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab
new file mode 100644
index 00000000..3ba9eec6
--- /dev/null
+++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab
@@ -0,0 +1,163 @@
+{ // Bare statement block; st, lo, in, out, ss, ssU, Ls, Ns and signs are taken from the surrounding scope.
+  int locala,perma, ptypea; // "a" set: filled by the st.GetInfo calls that assign basea
+  int localb,permb, ptypeb; // "b" set: filled by the st.GetInfo calls that assign baseb
+  uint64_t basea, baseb;
+  uint64_t basex; // assigned once below (basex = basea); no statement in this block reads it
+  const uint64_t plocal =(uint64_t) & in._odata[0]; // address of in._odata[0], handed to every GetInfo call
+
+  //  vComplexF isigns[2] = { signs[0], signs[1] };
+  vComplexF *isigns = &signs[0];
+
+  MASK_REGS;
+  // Outer loop: Ns iterations; each maps ssU through lo.Reorder() to sU, and ssU is incremented at the bottom.
+  for(int site=0;site<Ns;site++) {
+  int sU=lo.Reorder(ssU);
+  for(int s=0;s<Ls;s++) {
+  ss=sU*Ls+s; // ss is overwritten before any use, so its incoming value does not matter here
+  ////////////////////////////////
+  // Xp
+  ////////////////////////////////
+  int ent=ss*8;// 2*Ndim
+  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basea);
+  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; // fetched one section early: consumed by the Yp section
+  basex = basea;
+  // Pattern of every section: consume the set fetched earlier, pass the other set's base to MULT_2SPIN_DIR_PF*.
+  if ( locala ) {
+    LOAD64(%r10,isigns);
+    XM_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR3,perma);
+  } else { 
+    LOAD_CHI(basea);
+  }
+  {
+    MULT_2SPIN_DIR_PFXP(Xp,baseb);
+  }
+  LOAD64(%r10,isigns);
+  XM_RECON;
+  // The Xp section above ends in XM_RECON; every later section ends in a *_RECON_ACCUM macro.
+  ////////////////////////////////
+  // Yp
+  ////////////////////////////////
+  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; // NOTE(review): "a" calls always pass Xp and "b" calls Yp; the entry looks to be chosen by ent - confirm
+  if ( localb ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    YM_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR2,permb);
+  } else { 
+    LOAD_CHI(baseb);
+  }
+  {
+    MULT_2SPIN_DIR_PFYP(Yp,basea);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  YM_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Zp
+  ////////////////////////////////
+  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  if ( locala ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    ZM_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR1,perma);
+  } else { 
+    LOAD_CHI(basea);
+  }
+  {
+    MULT_2SPIN_DIR_PFZP(Zp,baseb);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  ZM_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Tp
+  ////////////////////////////////
+  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  if ( localb ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    TM_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR0,permb);
+  } else { 
+    LOAD_CHI(baseb);
+  }
+  {
+    MULT_2SPIN_DIR_PFTP(Tp,basea);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  TM_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Xm
+  ////////////////////////////////
+  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  if ( locala ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    XP_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR3,perma);
+  } else { 
+    LOAD_CHI(basea);
+  }
+  {
+    MULT_2SPIN_DIR_PFXM(Xm,baseb);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  XP_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Ym
+  ////////////////////////////////
+  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  if ( localb ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    YP_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR2,permb);
+  } else { 
+    LOAD_CHI(baseb);
+  }
+  {
+    MULT_2SPIN_DIR_PFYM(Ym,basea);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  YP_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Zm
+  ////////////////////////////////
+  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; // eighth and last GetInfo call for this ss (ent == ss*8+7 on entry)
+  if ( locala ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    ZP_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR1,perma);
+  } else { 
+    LOAD_CHI(basea);
+  }
+  {
+    MULT_2SPIN_DIR_PFZM(Zm,baseb);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  ZP_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Tm
+  ////////////////////////////////
+  basea = (uint64_t)&out._odata[ss]; // all eight GetInfo calls are done: the output element's address goes to MULT_2SPIN_DIR_PFTM instead
+  if ( localb ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    TP_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR0,permb);
+  } else { 
+    LOAD_CHI(baseb);
+  }
+  {
+    MULT_2SPIN_DIR_PFTM(Tm,basea);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  TP_RECON_ACCUM;
+
+  SAVE_RESULT(&out._odata[ss]);
+
+  } // end of the s loop (Ls iterations)
+  ssU++;
+  } // end of the site loop (Ns iterations)
+}
diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc
index 757778d3..cb6c01a1 100644
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -312,7 +312,7 @@ namespace QCD {
 
 
 template<class Impl>
-void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					       int ss,int sU,const FermionField &in, FermionField &out)
 {
@@ -555,7 +555,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel
 }
 
 template<class Impl>
-void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					       int ss,int sU,const FermionField &in, FermionField &out)
 {
@@ -803,7 +803,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
   // Specialise Gparity to simple implementation
   ////////////////////////////////////////////////
 template<>
-void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							     int sF,int sU,const FermionField &in, FermionField &out)
 {
@@ -811,7 +811,7 @@ void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,Dou
 }
 
 template<>
-void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								int sF,int sU,const FermionField &in, FermionField &out)
 {
@@ -819,7 +819,7 @@ void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,
 }
 
 template<>
-void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							     int sF,int sU,const FermionField &in, FermionField &out)
 {
@@ -827,7 +827,7 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,Dou
 }
 
 template<>
-void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								int sF,int sU,const FermionField &in, FermionField &out)
 {
@@ -839,44 +839,44 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,
 ////////////// Wilson ; uses this implementation /////////////////////
 // Need Nc=3 though //
 
-template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								  int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								  int ss,int sU,const FermionField &in, FermionField &out);
 
 
-template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);
 
 
-template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);
 
diff --git a/lib/simd/Intel512common.h b/lib/simd/Intel512common.h
index 2fea9235..a3cd980d 100644
--- a/lib/simd/Intel512common.h
+++ b/lib/simd/Intel512common.h
@@ -88,7 +88,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 
 #define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n" 
 #define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n" 
+#define VPREFETCHP(O,A) "prefetcht1 "#O"*64("#A");\n" 
 #define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n" 
+#define VPREFETCHNTA(O,A) 
+#define VPREFETCH(O,A)    
+
 #define VEVICT(O,A)   
 
 //"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
@@ -124,8 +128,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define ZLOADf(OFF,PTR,ri,ir)  VLOADf(OFF,PTR,ir)  VSHUFf(ir,ri)
 #define ZLOADd(OFF,PTR,ri,ir)  VLOADd(OFF,PTR,ir)  VSHUFd(ir,ri)
 
-#define VPREFETCHNTA(O,A) 
-#define VPREFETCH(O,A)    
 
 #define VSTOREf(OFF,PTR,SRC)   "vmovaps " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
 #define VSTOREd(OFF,PTR,SRC)   "vmovapd " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
diff --git a/lib/simd/Intel512wilson.h b/lib/simd/Intel512wilson.h
index 1955cc6d..2bc0545d 100644
--- a/lib/simd/Intel512wilson.h
+++ b/lib/simd/Intel512wilson.h
@@ -559,22 +559,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   VSUB(UChi_02,result_22,result_22)\
   VSUB(UChi_12,result_32,result_32) );
 
-#define PREFETCH_CHIMU(A) \
+#define PREFETCH_CHIMU(A) 
+/*
   LOAD64(%r9,A)						\
 	   __asm__ (						\
-  VPREFETCHG(12,%r9)\
-  VPREFETCHG(13,%r9)\
-  VPREFETCHG(14,%r9)\
-  VPREFETCHG(15,%r9)\
-  VPREFETCHG(16,%r9)\
-  VPREFETCHG(17,%r9)\
-  VPREFETCHG(18,%r9)\
-  VPREFETCHG(19,%r9)\
-  VPREFETCHG(20,%r9)\
-  VPREFETCHG(21,%r9)\
-  VPREFETCHG(22,%r9)\
-  VPREFETCHG(23,%r9));
-
+  VPREFETCHG(0,%r9)\
+  VPREFETCHG(1,%r9)\
+  VPREFETCHG(2,%r9)\
+  VPREFETCHG(3,%r9)\
+  VPREFETCHG(4,%r9)\
+  VPREFETCHG(5,%r9)\
+  VPREFETCHG(6,%r9)\
+  VPREFETCHG(7,%r9)\
+  VPREFETCHG(8,%r9)\
+  VPREFETCHG(9,%r9)\
+  VPREFETCHG(10,%r9)\
+  VPREFETCHG(11,%r9));
+*/
 #define PERMUTE_DIR0 __asm__ ( 	\
   VPERM0(Chi_00,Chi_00)	\
   VPERM0(Chi_01,Chi_01)	\
@@ -612,8 +613,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   LOAD64(%r8,ptr)						\
   LOAD64(%r9,pf)						\
 	   __asm__ (						\
-	   VPREFETCH2(9,%r8)				   \
-	   VPREFETCH2(10,%r8)					   \
+	   VPREFETCH2(9,%r8)				   VPREFETCH2(10,%r8)					   \
 	   VPREFETCH2(11,%r8)					   \
 	   VPREFETCH2(12,%r8)					   \
 	   VPREFETCH2(13,%r8)					   \
diff --git a/lib/stencil/Lebesgue.cc b/lib/stencil/Lebesgue.cc
index 7704e08f..c34b5c96 100644
--- a/lib/stencil/Lebesgue.cc
+++ b/lib/stencil/Lebesgue.cc
@@ -49,16 +49,25 @@ LebesgueOrder::LebesgueOrder(GridBase *_grid)
 {
   grid = _grid;
   if ( Block[0]==0) ZGraph();
+  else if ( Block[1]==0) NoBlocking();
   else CartesianBlocking();
 }
 
+void LebesgueOrder::NoBlocking(void) // Rebuilds _LebesgueReorder as 0,1,...,grid->oSites()-1 in ascending order.
+{ // Chosen by the constructor when Block[0]!=0 and Block[1]==0.
+  std::cout<<GridLogDebug<<"Lexicographic : no cache blocking"<<std::endl;
+  _LebesgueReorder.resize(0); // drop whatever the table held before
+  for ( int s = 0 ; s!= grid->oSites();s++){ // stops on equality, so oSites() is taken to be non-negative
+    _LebesgueReorder.push_back(s); // entry number s holds the value s
+  }
+}
 void LebesgueOrder::CartesianBlocking(void) 
 {
   _LebesgueReorder.resize(0);
 
-  std::cout << GridLogMessage << " CartesianBlocking ";
-  for(int d=0;d<Block.size();d++) std::cout <<Block[d]<<" ";
-  std::cout<<std::endl; 
+  std::cout << GridLogDebug << " CartesianBlocking ";
+  //    for(int d=0;d<Block.size();d++) std::cout <<Block[d]<<" ";
+  //    std::cout<<std::endl; 
 
   IndexInteger ND = grid->_ndimension;
 
@@ -116,7 +125,8 @@ void LebesgueOrder::IterateI(int ND,
 void LebesgueOrder::ZGraph(void) 
 {
   _LebesgueReorder.resize(0);
-  
+
+  std::cout << GridLogDebug << " Lebesgue order "<<std::endl;
   // Align up dimensions to power of two.
   const IndexInteger one=1;
 
diff --git a/lib/stencil/Lebesgue.h b/lib/stencil/Lebesgue.h
index f69b089d..1675d16c 100644
--- a/lib/stencil/Lebesgue.h
+++ b/lib/stencil/Lebesgue.h
@@ -59,6 +59,7 @@ namespace Grid {
     // Cartesian stencil blocking strategy
     /////////////////////////////////
     static std::vector<int> Block;
+    void NoBlocking(void);
     void CartesianBlocking(void);
     void IterateO(int ND,int dim,
 		  std::vector<IndexInteger> & xo,

From 51cb2d43289cec27a511c64825883c4bc7662279 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 19 Jun 2016 11:59:10 -0700
Subject: [PATCH 13/21] update file lists

---
 benchmarks/Make.inc | 6 +++++-
 lib/Make.inc        | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/benchmarks/Make.inc b/benchmarks/Make.inc
index b60ec835..8d0721a4 100644
--- a/benchmarks/Make.inc
+++ b/benchmarks/Make.inc
@@ -1,5 +1,5 @@
 
-bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_dwf_ntpf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm
+bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_dwf_ntpf Benchmark_dwf_sweep Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm
 
 
 Benchmark_comms_SOURCES=Benchmark_comms.cc
@@ -14,6 +14,10 @@ Benchmark_dwf_ntpf_SOURCES=Benchmark_dwf_ntpf.cc
 Benchmark_dwf_ntpf_LDADD=-lGrid
 
 
+Benchmark_dwf_sweep_SOURCES=Benchmark_dwf_sweep.cc
+Benchmark_dwf_sweep_LDADD=-lGrid
+
+
 Benchmark_memory_asynch_SOURCES=Benchmark_memory_asynch.cc
 Benchmark_memory_asynch_LDADD=-lGrid
 
diff --git a/lib/Make.inc b/lib/Make.inc
index 900da916..8763692a 100644
--- a/lib/Make.inc
+++ b/lib/Make.inc
@@ -1,4 +1,4 @@
 
-HFILES=./Algorithms.h ./AlignedAllocator.h ./Cartesian.h ./Communicator.h ./Cshift.h ./Grid.h ./Init.h ./Lattice.h ./Lexicographic.h ./Log.h ./Old/Tensor_peek.h ./Old/Tensor_poke.h ./PerfCount.h ./Simd.h ./Stencil.h ./Tensors.h ./Threads.h ./Timer.h ./algorithms/CoarsenedMatrix.h ./algorithms/LinearOperator.h ./algorithms/Preconditioner.h ./algorithms/SparseMatrix.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/iterative/AdefGeneric.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/DenseMatrix.h ./algorithms/iterative/EigenSort.h ./algorithms/iterative/Francis.h ./algorithms/iterative/Householder.h ./algorithms/iterative/ImplicitlyRestartedLanczos.h ./algorithms/iterative/Matrix.h ./algorithms/iterative/MatrixUtils.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/PrecConjugateResidual.h ./algorithms/iterative/PrecGeneralisedConjugateResidual.h ./algorithms/iterative/SchurRedBlack.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./communicator/Communicator_base.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./lattice/Lattice_ET.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./parallelIO/BinaryIO.h ./parallelIO/NerscIO.h ./pugixml/pugixml.h ./qcd/QCD.h 
./qcd/action/ActionBase.h ./qcd/action/ActionParams.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/FermionOperatorImpl.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/fermion/WilsonTMFermion.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/WilsonKernelsAsmBody.h ./qcd/action/gauge/GaugeImpl.h ./qcd/action/gauge/PlaqPlusRectangleAction.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/pseudofermion/EvenOddSchurDifferentiable.h ./qcd/action/pseudofermion/OneFlavourEvenOddRational.h ./qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h ./qcd/action/pseudofermion/OneFlavourRational.h ./qcd/action/pseudofermion/OneFlavourRationalRatio.h ./qcd/action/pseudofermion/TwoFlavour.h ./qcd/action/pseudofermion/TwoFlavourEvenOdd.h ./qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h ./qcd/action/pseudofermion/TwoFlavourRatio.h ./qcd/hmc/HMC.h ./qcd/hmc/HmcRunner.h ./qcd/hmc/NerscCheckpointer.h ./qcd/hmc/integrators/Integrator.h ./qcd/hmc/integrators/Integrator_algorithm.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SUn.h 
./qcd/utils/SpaceTimeGrid.h ./qcd/utils/WilsonLoops.h ./serialisation/BaseIO.h ./serialisation/BinaryIO.h ./serialisation/MacroMagic.h ./serialisation/Serialisation.h ./serialisation/TextIO.h ./serialisation/XmlIO.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_imci.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./simd/Intel512avx.h ./simd/Intel512common.h ./simd/Intel512double.h ./simd/Intel512imci.h ./simd/Intel512single.h ./simd/Intel512wilson.h ./stencil/Lebesgue.h ./tensors/Tensor_Ta.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_determinant.h ./tensors/Tensor_exp.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_index.h ./tensors/Tensor_inner.h ./tensors/Tensor_logical.h ./tensors/Tensor_outer.h ./tensors/Tensor_reality.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h
+HFILES=./Algorithms.h ./AlignedAllocator.h ./Cartesian.h ./Communicator.h ./Cshift.h ./Grid.h ./Init.h ./Lattice.h ./Lexicographic.h ./Log.h ./Old/Tensor_peek.h ./Old/Tensor_poke.h ./PerfCount.h ./Simd.h ./Stencil.h ./Tensors.h ./Threads.h ./Timer.h ./algorithms/CoarsenedMatrix.h ./algorithms/LinearOperator.h ./algorithms/Preconditioner.h ./algorithms/SparseMatrix.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/iterative/AdefGeneric.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/DenseMatrix.h ./algorithms/iterative/EigenSort.h ./algorithms/iterative/Francis.h ./algorithms/iterative/Householder.h ./algorithms/iterative/ImplicitlyRestartedLanczos.h ./algorithms/iterative/Matrix.h ./algorithms/iterative/MatrixUtils.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/PrecConjugateResidual.h ./algorithms/iterative/PrecGeneralisedConjugateResidual.h ./algorithms/iterative/SchurRedBlack.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./communicator/Communicator_base.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./lattice/Lattice_ET.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./parallelIO/BinaryIO.h ./parallelIO/NerscIO.h ./pugixml/pugixml.h ./qcd/QCD.h 
./qcd/action/ActionBase.h ./qcd/action/ActionParams.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/FermionOperatorImpl.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/fermion/WilsonTMFermion.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/WilsonKernelsAsmBody.h ./qcd/action/gauge/GaugeImpl.h ./qcd/action/gauge/PlaqPlusRectangleAction.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/pseudofermion/EvenOddSchurDifferentiable.h ./qcd/action/pseudofermion/OneFlavourEvenOddRational.h ./qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h ./qcd/action/pseudofermion/OneFlavourRational.h ./qcd/action/pseudofermion/OneFlavourRationalRatio.h ./qcd/action/pseudofermion/TwoFlavour.h ./qcd/action/pseudofermion/TwoFlavourEvenOdd.h ./qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h ./qcd/action/pseudofermion/TwoFlavourRatio.h ./qcd/hmc/HMC.h ./qcd/hmc/HmcRunner.h ./qcd/hmc/NerscCheckpointer.h ./qcd/hmc/integrators/Integrator.h ./qcd/hmc/integrators/Integrator_algorithm.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SUn.h 
./qcd/utils/SpaceTimeGrid.h ./qcd/utils/WilsonLoops.h ./serialisation/BaseIO.h ./serialisation/BinaryIO.h ./serialisation/MacroMagic.h ./serialisation/Serialisation.h ./serialisation/TextIO.h ./serialisation/XmlIO.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_imci.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./simd/Intel512avx.h ./simd/Intel512wilson.h ./simd/Intel512common.h ./simd/Intel512double.h ./simd/Intel512imci.h ./simd/Intel512single.h ./stencil/Lebesgue.h ./tensors/Tensor_Ta.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_determinant.h ./tensors/Tensor_exp.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_index.h ./tensors/Tensor_inner.h ./tensors/Tensor_logical.h ./tensors/Tensor_outer.h ./tensors/Tensor_reality.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h
 
 CCFILES=./Init.cc ./Log.cc ./PerfCount.cc ./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./pugixml/pugixml.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsAsm.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/action/fermion/WilsonTMFermion.cc ./qcd/hmc/HMC.cc ./qcd/spin/Dirac.cc ./qcd/utils/SpaceTimeGrid.cc ./serialisation/BinaryIO.cc ./serialisation/TextIO.cc ./serialisation/XmlIO.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc

From 2d8bb4c594b52233b204cf1ed6d318e231acfe14 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sat, 25 Jun 2016 11:08:05 -0700
Subject: [PATCH 14/21] Tweaks

---
 benchmarks/Benchmark_dwf_sweep.cc             |  16 +-
 lib/qcd/action/fermion/WilsonKernelsAsm.cc    |   7 +-
 lib/qcd/action/fermion/WilsonKernelsAsmBody.h |  79 +++-----
 .../action/fermion/WilsonKernelsAsmBody.h.abc | 187 ++++++++++++++++++
 lib/simd/Intel512common.h                     |   4 +-
 5 files changed, 242 insertions(+), 51 deletions(-)
 create mode 100644 lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc

diff --git a/benchmarks/Benchmark_dwf_sweep.cc b/benchmarks/Benchmark_dwf_sweep.cc
index 302059a4..94a00903 100644
--- a/benchmarks/Benchmark_dwf_sweep.cc
+++ b/benchmarks/Benchmark_dwf_sweep.cc
@@ -68,10 +68,12 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
 
   int Lmax=32;
+  int dmin=0;
   if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX"));
-  for (int L=8;L<Lmax;L*=2){
+  if ( getenv("DMIN") ) dmin=atoi(getenv("DMIN"));
+  for (int L=8;L<=Lmax;L*=2){
     std::vector<int> latt4(4,L);
-    for(int d=4;d>0;d--){
+    for(int d=4;d>dmin;d--){
       if ( d<=3 ) latt4[d]*=2;
       std::cout << GridLogMessage <<"\t";
       for(int d=0;d<Nd;d++){
@@ -170,7 +172,11 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
   Dw.Dhop(src,result,0);
   double t1=usecond();
 
+#ifdef TIMERS_OFF
+    int ncall =10;
+#else
   int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
+#endif
 
   if (ncall < 5 ) exit(0);
 
@@ -297,7 +303,11 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
     sDw.Dhop(ssrc,sresult,0);
     double t1=usecond();
 
+#ifdef TIMERS_OFF
+    int ncall =10;
+#else 
     int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
+#endif
 
     PerformanceCounter Counter(8);
     Counter.Start();
@@ -340,7 +350,9 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
     CounterSdw.Start();
     t0=usecond();
     for(int i=0;i<ncall;i++){
+      __SSC_START;
       sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
+      __SSC_STOP;
     }
     t1=usecond();
     CounterSdw.Stop();
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
index cdbe2c8e..33c464ac 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -67,9 +67,12 @@ int setupSigns(void ){
 }
 static int signInit = setupSigns();
 
+#define label(A)  ilabel(A)
+#define ilabel(A) ".globl\n"  #A ":\n" 
+
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
-
+#define FX(A) WILSONASM_ ##A
 template<>
 void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 						     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
@@ -80,6 +83,8 @@ void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrd
 #undef VMOVRDUP
 #undef MAYBEPERM
 #undef MULT_2SPIN
+#undef FX 
+#define FX(A) DWFASM_ ## A
 #define MAYBEPERM(A,B) 
 #define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
 #define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
index bd96b7d5..d3e86276 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
+++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
@@ -1,8 +1,7 @@
 {
   int locala,perma, ptypea;
   int localb,permb, ptypeb;
-  int localc,permc, ptypec;
-  uint64_t basea, baseb, basec;
+  uint64_t basea, baseb;
   uint64_t basex;
   const uint64_t plocal =(uint64_t) & in._odata[0];
 
@@ -12,22 +11,15 @@
   MASK_REGS;
 
   for(int site=0;site<Ns;site++) {
-  int sU=lo.Reorder(ssU);
-
+  int sU=lo.Reorder(ssU);  
   for(int s=0;s<Ls;s++) {
-  ss     =sU*Ls+s;
-
+  ss=sU*Ls+s;
   ////////////////////////////////
   // Xp
   ////////////////////////////////
   int ent=ss*8;// 2*Ndim
   basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basea);
   baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
-  PREFETCH_CHIMU(baseb);
-  basec = st.GetInfo(ptypec,localc,permc,Zp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basec);
-
   basex = basea;
 
   if ( locala ) {
@@ -47,7 +39,6 @@
   // Yp
   ////////////////////////////////
   basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basea);
   if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     YM_PROJMEM(baseb);
@@ -56,7 +47,7 @@
     LOAD_CHI(baseb);
   }
   {
-    MULT_2SPIN_DIR_PFYP(Yp,basec);
+    MULT_2SPIN_DIR_PFYP(Yp,basea);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   YM_RECON_ACCUM;
@@ -65,16 +56,15 @@
   // Zp
   ////////////////////////////////
   baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
-  PREFETCH_CHIMU(baseb);
-  if ( localc ) {
+  if ( locala ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    ZM_PROJMEM(basec);
-    MAYBEPERM(PERMUTE_DIR1,permc);
+    ZM_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR1,perma);
   } else { 
-    LOAD_CHI(basec);
+    LOAD_CHI(basea);
   }
   {
-    MULT_2SPIN_DIR_PFZP(Zp,basea);
+    MULT_2SPIN_DIR_PFZP(Zp,baseb);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   ZM_RECON_ACCUM;
@@ -82,17 +72,16 @@
   ////////////////////////////////
   // Tp
   ////////////////////////////////
-  basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basec);
-  if ( locala ) {
+  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    TM_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR0,perma);
+    TM_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR0,permb);
   } else { 
-    LOAD_CHI(basea);
+    LOAD_CHI(baseb);
   }
   {
-    MULT_2SPIN_DIR_PFTP(Tp,baseb);
+    MULT_2SPIN_DIR_PFTP(Tp,basea);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   TM_RECON_ACCUM;
@@ -100,17 +89,16 @@
   ////////////////////////////////
   // Xm
   ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basea);
-  if ( localb ) {
+  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  if ( locala ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    XP_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR3,permb);
+    XP_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR3,perma);
   } else { 
-    LOAD_CHI(baseb);
+    LOAD_CHI(basea);
   }
   {
-    MULT_2SPIN_DIR_PFXM(Xm,basec);
+    MULT_2SPIN_DIR_PFXM(Xm,baseb);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   XP_RECON_ACCUM;
@@ -118,14 +106,13 @@
   ////////////////////////////////
   // Ym
   ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++;
-  PREFETCH_CHIMU(baseb);
-  if ( localc ) {
+  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    YP_PROJMEM(basec);
-    MAYBEPERM(PERMUTE_DIR2,permc);
+    YP_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR2,permb);
   } else { 
-    LOAD_CHI(basec);
+    LOAD_CHI(baseb);
   }
   {
     MULT_2SPIN_DIR_PFYM(Ym,basea);
@@ -136,8 +123,7 @@
   ////////////////////////////////
   // Zm
   ////////////////////////////////
-  basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basec);
+  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
   if ( locala ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     ZP_PROJMEM(basea);
@@ -155,7 +141,6 @@
   // Tm
   ////////////////////////////////
   basea = (uint64_t)&out._odata[ss];
-  PREFETCH_CHIMU(basea);
   if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     TP_PROJMEM(baseb);
@@ -163,16 +148,16 @@
   } else { 
     LOAD_CHI(baseb);
   }
+  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal);
   {
-    MULT_2SPIN_DIR_PFTM(Tm,basec);
+    MULT_2SPIN_DIR_PFTM(Tm,basea);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   TP_RECON_ACCUM;
 
-  //  PREFETCH_CHIMU(basex);
-  SAVE_RESULT(&out._odata[ss]);
-  
-  }
+  SAVE_RESULT(&out._odata[ss],baseb);
+
+  } 
   ssU++;
   }
 }
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc
new file mode 100644
index 00000000..5a3e01f7
--- /dev/null
+++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc
@@ -0,0 +1,187 @@
+{
+  int locala,perma, ptypea;
+  int localb,permb, ptypeb;
+  int localc,permc, ptypec;
+  uint64_t basea, baseb, basec;
+  uint64_t basex;
+  const uint64_t plocal =(uint64_t) & in._odata[0];
+
+  //  vComplexF isigns[2] = { signs[0], signs[1] };
+  vComplexF *isigns = &signs[0];
+
+  MASK_REGS;
+
+  for(int site=0;site<Ns;site++) {
+  int sU=lo.Reorder(ssU);
+
+  for(int s=0;s<Ls;s++) {
+  ss     =sU*Ls+s;
+
+  ////////////////////////////////
+  // Xp
+  ////////////////////////////////
+  int ent=ss*8;// 2*Ndim
+  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basea);
+  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  PREFETCH_CHIMU(baseb);
+  basec = st.GetInfo(ptypec,localc,permc,Zp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basec);
+
+  basex = basea;
+
+  label(FX(XP) );
+  if ( locala ) {
+    LOAD64(%r10,isigns);
+    XM_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR3,perma);
+  } else { 
+    LOAD_CHI(basea);
+  }
+  {
+    MULT_2SPIN_DIR_PFXP(Xp,baseb);
+  }
+  LOAD64(%r10,isigns);
+  XM_RECON;
+
+  ////////////////////////////////
+  // Yp
+  ////////////////////////////////
+  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basea);
+  label(FX(YP) );
+  if ( localb ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    YM_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR2,permb);
+  } else { 
+    LOAD_CHI(baseb);
+  }
+  {
+    MULT_2SPIN_DIR_PFYP(Yp,basec);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  YM_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Zp
+  ////////////////////////////////
+  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  PREFETCH_CHIMU(baseb);
+  label(FX(ZP) );
+  if ( localc ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    ZM_PROJMEM(basec);
+    MAYBEPERM(PERMUTE_DIR1,permc);
+  } else { 
+    LOAD_CHI(basec);
+  }
+  {
+    MULT_2SPIN_DIR_PFZP(Zp,basea);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  ZM_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Tp
+  ////////////////////////////////
+  basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basec);
+  label(FX(TP) );
+  if ( locala ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    TM_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR0,perma);
+  } else { 
+    LOAD_CHI(basea);
+  }
+  {
+    MULT_2SPIN_DIR_PFTP(Tp,baseb);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  TM_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Xm
+  ////////////////////////////////
+  basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basea);
+  label(FX(XM) );
+  if ( localb ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    XP_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR3,permb);
+  } else { 
+    LOAD_CHI(baseb);
+  }
+  {
+    MULT_2SPIN_DIR_PFXM(Xm,basec);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  XP_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Ym
+  ////////////////////////////////
+  baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++;
+  PREFETCH_CHIMU(baseb);
+  label(FX(YM) );
+  if ( localc ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    YP_PROJMEM(basec);
+    MAYBEPERM(PERMUTE_DIR2,permc);
+  } else { 
+    LOAD_CHI(basec);
+  }
+  {
+    MULT_2SPIN_DIR_PFYM(Ym,basea);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  YP_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Zm
+  ////////////////////////////////
+  basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basec);
+  label(FX(ZM) );
+  if ( locala ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    ZP_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR1,perma);
+  } else { 
+    LOAD_CHI(basea);
+  }
+  {
+    MULT_2SPIN_DIR_PFZM(Zm,baseb);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  ZP_RECON_ACCUM;
+
+  ////////////////////////////////
+  // Tm
+  ////////////////////////////////
+  basea = (uint64_t)&out._odata[ss];
+  PREFETCH_CHIMU(basea);
+  label(FX(TM) );
+  if ( localb ) {
+    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+    TP_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR0,permb);
+  } else { 
+    LOAD_CHI(baseb);
+  }
+  {
+    MULT_2SPIN_DIR_PFTM(Tm,basec);
+  }
+  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+  TP_RECON_ACCUM;
+
+  //  PREFETCH_CHIMU(basex);
+  label(FX(SAV) );
+  SAVE_RESULT(&out._odata[ss]);
+  
+  }
+  ssU++;
+  }
+}
diff --git a/lib/simd/Intel512common.h b/lib/simd/Intel512common.h
index a3cd980d..6878bcfb 100644
--- a/lib/simd/Intel512common.h
+++ b/lib/simd/Intel512common.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+   /*************************************************************************************
 
     Grid physics library, www.github.com/paboyle/Grid 
 
@@ -37,6 +37,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
            "mov     $0x5555, %%eax \n"\
            "kmovw    %%eax, %%k7 \n" : : : "%eax");
 
+//#define label(B) __asm__ ( __func__ __LINE__ #B ":\n" );
+
 #define VZEROf(A)       "vpxorq " #A ","  #A "," #A ";\n"
 #define VZEROd(A)       "vpxorq " #A ","  #A "," #A ";\n"
 

From a25bec87d981a393d7f03b81254587543ae8d3d6 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sat, 25 Jun 2016 12:54:14 -0700
Subject: [PATCH 15/21] Prefetch during save

---
 lib/simd/Intel512wilson.h | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/lib/simd/Intel512wilson.h b/lib/simd/Intel512wilson.h
index 2bc0545d..207d9db8 100644
--- a/lib/simd/Intel512wilson.h
+++ b/lib/simd/Intel512wilson.h
@@ -104,7 +104,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define LOAD_CHI(PTR)	 LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi );
 #define SAVE_UCHI(PTR)	 SAVE_UCHIi(PTR)
 #define SAVE_CHI(PTR)	 SAVE_CHIi(PTR)
-#define SAVE_RESULT(PTR) SAVE_RESULTi(PTR)
+#define SAVE_RESULT(PT,R) SAVE_RESULTi(PT,R)
 
 #define LOAD_CHIMUi \
 	   LOAD_CHIMU01i	\
@@ -169,21 +169,22 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   VSTORE(5,%r8,Chi_12)				\
 						);
 
-#define SAVE_RESULTi(PTR)\
+#define SAVE_RESULTi(PTR,pf)			\
 	   LOAD64(%r8,PTR)			\
+	   LOAD64(%r9,pf)			\
   __asm__ (					\
-	   VSTORE(0,%r8,result_00)		\
-	   VSTORE(1,%r8,result_01)		\
-	   VSTORE(2,%r8,result_02)		\
-	   VSTORE(3,%r8,result_10)		\
-	   VSTORE(4,%r8,result_11)		\
-	   VSTORE(5,%r8,result_12)		\
-	   VSTORE(6,%r8,result_20)		\
-	   VSTORE(7,%r8,result_21)		\
-	   VSTORE(8,%r8,result_22)		\
-	   VSTORE(9,%r8,result_30)		\
-	   VSTORE(10,%r8,result_31)		\
-	   VSTORE(11,%r8,result_32) 		\
+	   VSTORE(0,%r8,result_00)	VPREFETCHG(0,%r9)	\
+	   VSTORE(1,%r8,result_01)	VPREFETCHG(1,%r9)	\
+	   VSTORE(2,%r8,result_02)	VPREFETCHG(2,%r9)	\
+	   VSTORE(3,%r8,result_10)	VPREFETCHG(3,%r9)	\
+	   VSTORE(4,%r8,result_11)	VPREFETCHG(4,%r9)	\
+	   VSTORE(5,%r8,result_12)	VPREFETCHG(5,%r9)	\
+	   VSTORE(6,%r8,result_20)	VPREFETCHG(6,%r9)	\
+	   VSTORE(7,%r8,result_21)	VPREFETCHG(7,%r9)	\
+	   VSTORE(8,%r8,result_22)	VPREFETCHG(8,%r9)	\
+	   VSTORE(9,%r8,result_30)	VPREFETCHG(9,%r9)	\
+	   VSTORE(10,%r8,result_31)	VPREFETCHG(10,%r9)	\
+	   VSTORE(11,%r8,result_32) 	VPREFETCHG(11,%r9)	\
 						);
 
 #define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p)

From 05c884a62a116033fb426354c5d07b7bdd6c5a82 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sat, 25 Jun 2016 12:54:50 -0700
Subject: [PATCH 16/21] Prefetch change

---
 lib/qcd/action/fermion/WilsonKernelsAsmBody.h | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
index d3e86276..d50999f6 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
+++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
@@ -2,7 +2,6 @@
   int locala,perma, ptypea;
   int localb,permb, ptypeb;
   uint64_t basea, baseb;
-  uint64_t basex;
   const uint64_t plocal =(uint64_t) & in._odata[0];
 
   //  vComplexF isigns[2] = { signs[0], signs[1] };
@@ -20,7 +19,6 @@
   int ent=ss*8;// 2*Ndim
   basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
   baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
-  basex = basea;
 
   if ( locala ) {
     LOAD64(%r10,isigns);
@@ -38,7 +36,7 @@
   ////////////////////////////////
   // Yp
   ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  basea = st.GetInfo(ptypea,locala,perma,Zp,ent,plocal); ent++;
   if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     YM_PROJMEM(baseb);
@@ -55,7 +53,7 @@
   ////////////////////////////////
   // Zp
   ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  baseb = st.GetInfo(ptypeb,localb,permb,Tp,ent,plocal); ent++;
   if ( locala ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     ZM_PROJMEM(basea);
@@ -72,7 +70,7 @@
   ////////////////////////////////
   // Tp
   ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  basea = st.GetInfo(ptypea,locala,perma,Xm,ent,plocal); ent++;
   if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     TM_PROJMEM(baseb);
@@ -89,7 +87,7 @@
   ////////////////////////////////
   // Xm
   ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  baseb = st.GetInfo(ptypeb,localb,permb,Ym,ent,plocal); ent++;
   if ( locala ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     XP_PROJMEM(basea);
@@ -106,7 +104,7 @@
   ////////////////////////////////
   // Ym
   ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++;
   if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     YP_PROJMEM(baseb);
@@ -123,7 +121,7 @@
   ////////////////////////////////
   // Zm
   ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++;
   if ( locala ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     ZP_PROJMEM(basea);
@@ -148,7 +146,7 @@
   } else { 
     LOAD_CHI(baseb);
   }
-  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal);
+  baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal);
   {
     MULT_2SPIN_DIR_PFTM(Tm,basea);
   }

From 1445189361272305c0101081ea5da69300deb3ff Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sat, 25 Jun 2016 12:55:25 -0700
Subject: [PATCH 17/21] Control the prefetch strategy

---
 lib/simd/Intel512common.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/lib/simd/Intel512common.h b/lib/simd/Intel512common.h
index 6878bcfb..a05f978c 100644
--- a/lib/simd/Intel512common.h
+++ b/lib/simd/Intel512common.h
@@ -28,6 +28,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_ASM_INTEL_COMMON_512_H
 #define GRID_ASM_INTEL_COMMON_512_H
 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Peformance options
+////////////////////////////////////////////////////////////////////////////////////////////////////
+#define AVX512_PF_L1
+#undef  AVX512_PF_L2_LINEAR
+#undef  AVX512_PF_L2_TABLE
+#undef  AVX512_PF_L2_WRITE
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Opcodes common 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -88,10 +96,30 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define VMOVf(A,DEST)   "vmovaps  " #A ", " #DEST  ";\n"
 #define VMOVd(A,DEST)   "vmovapd  " #A ", " #DEST  ";\n"
 
+#ifdef AVX512_PF_L1
 #define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n" 
+#else
+#define VPREFETCHG(O,A) 
+#endif
+
+#ifdef AVX512_PF_L2_LINEAR
 #define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n" 
+#else
+#define VPREFETCH2(O,A) 
+#endif
+
+#ifdef AVX512_PF_L2_TABLE
 #define VPREFETCHP(O,A) "prefetcht1 "#O"*64("#A");\n" 
+#else
+#define VPREFETCHP(O,A) 
+#endif
+
+#ifdef AVX512_PF_L2_WRITE
 #define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n" 
+#else
+#define VPREFETCHW(O,A) 
+#endif
+
 #define VPREFETCHNTA(O,A) 
 #define VPREFETCH(O,A)    
 

From 8fcefc021a604431a1de19c05f848c4af022e61a Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 26 Jun 2016 12:54:14 -0700
Subject: [PATCH 18/21] Improved the prefetching when using cache blocking
 codes

---
 lib/Stencil.h                                 |   5 +-
 lib/qcd/action/fermion/WilsonKernelsAsmBody.h |  86 ++++---
 .../action/fermion/WilsonKernelsAsmBody.h.ab  |  18 +-
 lib/simd/Intel512common.h                     |  24 +-
 lib/simd/Intel512wilson.h                     | 237 +++++++++++-------
 5 files changed, 208 insertions(+), 162 deletions(-)

diff --git a/lib/Stencil.h b/lib/Stencil.h
index 8019e3f9..bc015370 100644
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
@@ -1,4 +1,4 @@
-     /*************************************************************************************
+   /*************************************************************************************
 
      Grid physics library, www.github.com/paboyle/Grid 
 
@@ -261,6 +261,9 @@
 	 }
        };
 
+       inline uint64_t Touch(int ent) {
+	 //	 _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0);
+       }
        inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
 	 _mm_prefetch((char *)&_entries[ent+1],_MM_HINT_T0);
 	 local = _entries[ent]._is_local;
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
index d50999f6..7373d2eb 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
+++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
@@ -1,7 +1,9 @@
 {
   int locala,perma, ptypea;
   int localb,permb, ptypeb;
-  uint64_t basea, baseb;
+  int localc,permc, ptypec;
+  uint64_t basea, baseb, basec;
+
   const uint64_t plocal =(uint64_t) & in._odata[0];
 
   //  vComplexF isigns[2] = { signs[0], signs[1] };
@@ -10,15 +12,22 @@
   MASK_REGS;
 
   for(int site=0;site<Ns;site++) {
-  int sU=lo.Reorder(ssU);  
+  int sU =lo.Reorder(ssU);
   for(int s=0;s<Ls;s++) {
-  ss=sU*Ls+s;
+  ss     =sU*Ls+s;
+
   ////////////////////////////////
   // Xp
   ////////////////////////////////
   int ent=ss*8;// 2*Ndim
   basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  PREFETCH1_CHIMU(basea);
+  PF_GAUGE(Xp); 
+
   baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  PREFETCH_CHIMU(baseb);
+  basec = st.GetInfo(ptypec,localc,permc,Zp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basec);
 
   if ( locala ) {
     LOAD64(%r10,isigns);
@@ -36,7 +45,8 @@
   ////////////////////////////////
   // Yp
   ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Zp,ent,plocal); ent++;
+  basea = st.GetInfo(ptypea,locala,perma,Tp,ent,plocal); ent++;
+  PREFETCH_CHIMU(basea);
   if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     YM_PROJMEM(baseb);
@@ -45,7 +55,7 @@
     LOAD_CHI(baseb);
   }
   {
-    MULT_2SPIN_DIR_PFYP(Yp,basea);
+    MULT_2SPIN_DIR_PFYP(Yp,basec);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   YM_RECON_ACCUM;
@@ -53,16 +63,17 @@
   ////////////////////////////////
   // Zp
   ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Tp,ent,plocal); ent++;
-  if ( locala ) {
+  baseb = st.GetInfo(ptypeb,localb,permb,Xm,ent,plocal); ent++;
+  PREFETCH_CHIMU(baseb);
+  if ( localc ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    ZM_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR1,perma);
+    ZM_PROJMEM(basec);
+    MAYBEPERM(PERMUTE_DIR1,permc);
   } else { 
-    LOAD_CHI(basea);
+    LOAD_CHI(basec);
   }
   {
-    MULT_2SPIN_DIR_PFZP(Zp,baseb);
+    MULT_2SPIN_DIR_PFZP(Zp,basea);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   ZM_RECON_ACCUM;
@@ -70,16 +81,17 @@
   ////////////////////////////////
   // Tp
   ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Xm,ent,plocal); ent++;
-  if ( localb ) {
+  basec = st.GetInfo(ptypec,localc,permc,Ym,ent,plocal); ent++;
+  PREFETCH_CHIMU(basec);
+  if ( locala ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    TM_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR0,permb);
+    TM_PROJMEM(basea);
+    MAYBEPERM(PERMUTE_DIR0,perma);
   } else { 
-    LOAD_CHI(baseb);
+    LOAD_CHI(basea);
   }
   {
-    MULT_2SPIN_DIR_PFTP(Tp,basea);
+    MULT_2SPIN_DIR_PFTP(Tp,baseb);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   TM_RECON_ACCUM;
@@ -87,16 +99,17 @@
   ////////////////////////////////
   // Xm
   ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Ym,ent,plocal); ent++;
-  if ( locala ) {
+  basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++;
+  PREFETCH_CHIMU(basea);
+  if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    XP_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR3,perma);
+    XP_PROJMEM(baseb);
+    MAYBEPERM(PERMUTE_DIR3,permb);
   } else { 
-    LOAD_CHI(basea);
+    LOAD_CHI(baseb);
   }
   {
-    MULT_2SPIN_DIR_PFXM(Xm,baseb);
+    MULT_2SPIN_DIR_PFXM(Xm,basec);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   XP_RECON_ACCUM;
@@ -104,13 +117,14 @@
   ////////////////////////////////
   // Ym
   ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++;
-  if ( localb ) {
+  baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++;
+  PREFETCH_CHIMU(baseb);
+  if ( localc ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    YP_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR2,permb);
+    YP_PROJMEM(basec);
+    MAYBEPERM(PERMUTE_DIR2,permc);
   } else { 
-    LOAD_CHI(baseb);
+    LOAD_CHI(basec);
   }
   {
     MULT_2SPIN_DIR_PFYM(Ym,basea);
@@ -121,7 +135,8 @@
   ////////////////////////////////
   // Zm
   ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++;
+  basec = (uint64_t)&out._odata[ss];
+  PREFETCH_CHIMU(basec);
   if ( locala ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     ZP_PROJMEM(basea);
@@ -138,7 +153,8 @@
   ////////////////////////////////
   // Tm
   ////////////////////////////////
-  basea = (uint64_t)&out._odata[ss];
+  //  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  //  PREFETCH_CHIMU(basea);
   if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     TP_PROJMEM(baseb);
@@ -146,16 +162,16 @@
   } else { 
     LOAD_CHI(baseb);
   }
-  baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal);
   {
-    MULT_2SPIN_DIR_PFTM(Tm,basea);
+    MULT_2SPIN_DIR_PFTM(Tm,basec);
   }
+  //  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   TP_RECON_ACCUM;
 
-  SAVE_RESULT(&out._odata[ss],baseb);
-
-  } 
+  SAVE_RESULT(&out._odata[ss],basec);
+  
+  }
   ssU++;
   }
 }
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab
index 3ba9eec6..d50999f6 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab
+++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab
@@ -2,7 +2,6 @@
   int locala,perma, ptypea;
   int localb,permb, ptypeb;
   uint64_t basea, baseb;
-  uint64_t basex;
   const uint64_t plocal =(uint64_t) & in._odata[0];
 
   //  vComplexF isigns[2] = { signs[0], signs[1] };
@@ -19,9 +18,7 @@
   ////////////////////////////////
   int ent=ss*8;// 2*Ndim
   basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basea);
   baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
-  basex = basea;
 
   if ( locala ) {
     LOAD64(%r10,isigns);
@@ -39,7 +36,7 @@
   ////////////////////////////////
   // Yp
   ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  basea = st.GetInfo(ptypea,locala,perma,Zp,ent,plocal); ent++;
   if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     YM_PROJMEM(baseb);
@@ -56,7 +53,7 @@
   ////////////////////////////////
   // Zp
   ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  baseb = st.GetInfo(ptypeb,localb,permb,Tp,ent,plocal); ent++;
   if ( locala ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     ZM_PROJMEM(basea);
@@ -73,7 +70,7 @@
   ////////////////////////////////
   // Tp
   ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  basea = st.GetInfo(ptypea,locala,perma,Xm,ent,plocal); ent++;
   if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     TM_PROJMEM(baseb);
@@ -90,7 +87,7 @@
   ////////////////////////////////
   // Xm
   ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  baseb = st.GetInfo(ptypeb,localb,permb,Ym,ent,plocal); ent++;
   if ( locala ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     XP_PROJMEM(basea);
@@ -107,7 +104,7 @@
   ////////////////////////////////
   // Ym
   ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++;
   if ( localb ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     YP_PROJMEM(baseb);
@@ -124,7 +121,7 @@
   ////////////////////////////////
   // Zm
   ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++;
   if ( locala ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
     ZP_PROJMEM(basea);
@@ -149,13 +146,14 @@
   } else { 
     LOAD_CHI(baseb);
   }
+  baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal);
   {
     MULT_2SPIN_DIR_PFTM(Tm,basea);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   TP_RECON_ACCUM;
 
-  SAVE_RESULT(&out._odata[ss]);
+  SAVE_RESULT(&out._odata[ss],baseb);
 
   } 
   ssU++;
diff --git a/lib/simd/Intel512common.h b/lib/simd/Intel512common.h
index a05f978c..dabbf6d8 100644
--- a/lib/simd/Intel512common.h
+++ b/lib/simd/Intel512common.h
@@ -31,9 +31,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Peformance options
 ////////////////////////////////////////////////////////////////////////////////////////////////////
-#define AVX512_PF_L1
-#undef  AVX512_PF_L2_LINEAR
-#undef  AVX512_PF_L2_TABLE
 #undef  AVX512_PF_L2_WRITE
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -45,7 +42,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
            "mov     $0x5555, %%eax \n"\
            "kmovw    %%eax, %%k7 \n" : : : "%eax");
 
-//#define label(B) __asm__ ( __func__ __LINE__ #B ":\n" );
+//#define label(B) __asm__ ( __func__ _LINE__ #B ":\n" );
 
 #define VZEROf(A)       "vpxorq " #A ","  #A "," #A ";\n"
 #define VZEROd(A)       "vpxorq " #A ","  #A "," #A ";\n"
@@ -96,30 +93,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define VMOVf(A,DEST)   "vmovaps  " #A ", " #DEST  ";\n"
 #define VMOVd(A,DEST)   "vmovapd  " #A ", " #DEST  ";\n"
 
-#ifdef AVX512_PF_L1
-#define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n" 
-#else
-#define VPREFETCHG(O,A) 
-#endif
-
-#ifdef AVX512_PF_L2_LINEAR
+#define VPREFETCH1(O,A) "prefetcht0 "#O"*64("#A");\n" 
 #define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n" 
-#else
-#define VPREFETCH2(O,A) 
-#endif
-
-#ifdef AVX512_PF_L2_TABLE
-#define VPREFETCHP(O,A) "prefetcht1 "#O"*64("#A");\n" 
-#else
-#define VPREFETCHP(O,A) 
-#endif
-
 #ifdef AVX512_PF_L2_WRITE
 #define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n" 
 #else
 #define VPREFETCHW(O,A) 
 #endif
-
 #define VPREFETCHNTA(O,A) 
 #define VPREFETCH(O,A)    
 
diff --git a/lib/simd/Intel512wilson.h b/lib/simd/Intel512wilson.h
index 207d9db8..9deffd80 100644
--- a/lib/simd/Intel512wilson.h
+++ b/lib/simd/Intel512wilson.h
@@ -169,23 +169,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   VSTORE(5,%r8,Chi_12)				\
 						);
 
-#define SAVE_RESULTi(PTR,pf)			\
-	   LOAD64(%r8,PTR)			\
-	   LOAD64(%r9,pf)			\
-  __asm__ (					\
-	   VSTORE(0,%r8,result_00)	VPREFETCHG(0,%r9)	\
-	   VSTORE(1,%r8,result_01)	VPREFETCHG(1,%r9)	\
-	   VSTORE(2,%r8,result_02)	VPREFETCHG(2,%r9)	\
-	   VSTORE(3,%r8,result_10)	VPREFETCHG(3,%r9)	\
-	   VSTORE(4,%r8,result_11)	VPREFETCHG(4,%r9)	\
-	   VSTORE(5,%r8,result_12)	VPREFETCHG(5,%r9)	\
-	   VSTORE(6,%r8,result_20)	VPREFETCHG(6,%r9)	\
-	   VSTORE(7,%r8,result_21)	VPREFETCHG(7,%r9)	\
-	   VSTORE(8,%r8,result_22)	VPREFETCHG(8,%r9)	\
-	   VSTORE(9,%r8,result_30)	VPREFETCHG(9,%r9)	\
-	   VSTORE(10,%r8,result_31)	VPREFETCHG(10,%r9)	\
-	   VSTORE(11,%r8,result_32) 	VPREFETCHG(11,%r9)	\
-						);
 
 #define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p)
 #define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN_PFYP(&U._odata[sU](A),p)
@@ -560,24 +543,89 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   VSUB(UChi_02,result_22,result_22)\
   VSUB(UChi_12,result_32,result_32) );
 
-#define PREFETCH_CHIMU(A) 
-/*
-  LOAD64(%r9,A)						\
-	   __asm__ (						\
-  VPREFETCHG(0,%r9)\
-  VPREFETCHG(1,%r9)\
-  VPREFETCHG(2,%r9)\
-  VPREFETCHG(3,%r9)\
-  VPREFETCHG(4,%r9)\
-  VPREFETCHG(5,%r9)\
-  VPREFETCHG(6,%r9)\
-  VPREFETCHG(7,%r9)\
-  VPREFETCHG(8,%r9)\
-  VPREFETCHG(9,%r9)\
-  VPREFETCHG(10,%r9)\
-  VPREFETCHG(11,%r9));
-*/
-#define PERMUTE_DIR0 __asm__ ( 	\
+#define AVX512_PF_L1
+#define AVX512_PF_L2_GAUGE
+#define AVX512_PF_L2_TABLE
+#undef  AVX512_PF_L2_LINEAR
+
+#ifdef AVX512_PF_L2_TABLE
+#define VPREFETCH_P1(A,B)  VPREFETCH1(A,B)
+#define VPREFETCH_P2(A,B) VPREFETCH1(A,B)
+#else
+#define VPREFETCH_P1(A,B)
+#define VPREFETCH_P2(A,B)
+#endif
+#ifdef AVX512_PF_L2_LINEAR
+#define VPREFETCH_M1(A,B) 
+#define VPREFETCH_M2(A,B) 
+#else 
+#define VPREFETCH_M1(A,B) VPREFETCH1(A,B)
+#define VPREFETCH_M2(A,B) VPREFETCH2(A,B)
+#endif
+#ifdef AVX512_PF_L2_GAUGE
+#define VPREFETCH_G1(A,B)  VPREFETCH1(A,B)
+#define VPREFETCH_G2(A,B)  VPREFETCH2(A,B)
+#else
+#endif
+
+#define PF_GAUGE(A) \
+  LOAD64(%r8,&U._odata[sU](A))						\
+  __asm__ (								\
+	   VPREFETCH_G1(0,%r8) VPREFETCH_G1(1,%r8)			\
+	   VPREFETCH_G1(2,%r8) VPREFETCH_G1(3,%r8)			\
+									);
+
+#define SAVE_RESULTi(PTR,pf)			\
+	   LOAD64(%r8,PTR)			\
+	   LOAD64(%r9,pf)			\
+  __asm__ (					\
+	   VSTORE(0,%r8,result_00)	VPREFETCH_M1(0,%r9)	\
+	   VSTORE(1,%r8,result_01)	VPREFETCH_M1(1,%r9)	\
+	   VSTORE(2,%r8,result_02)	VPREFETCH_M1(2,%r9)	\
+	   VSTORE(3,%r8,result_10)	VPREFETCH_M1(3,%r9)	\
+	   VSTORE(4,%r8,result_11)	VPREFETCH_M1(4,%r9)	\
+	   VSTORE(5,%r8,result_12)	VPREFETCH_M1(5,%r9)	\
+	   VSTORE(6,%r8,result_20)	VPREFETCH_M1(6,%r9)	\
+	   VSTORE(7,%r8,result_21)	VPREFETCH_M1(7,%r9)	\
+	   VSTORE(8,%r8,result_22)	VPREFETCH_M1(8,%r9)	\
+	   VSTORE(9,%r8,result_30)	VPREFETCH_M1(9,%r9)	\
+	   VSTORE(10,%r8,result_31)	VPREFETCH_M1(10,%r9)	\
+	   VSTORE(11,%r8,result_32) 	VPREFETCH_M1(11,%r9)	\
+						);
+
+#define PREFETCH_CHIMU(A) \
+  LOAD64(%r9,A)							\
+  __asm__ (							\
+	   VPREFETCH_P2(0,%r9)					\
+	   VPREFETCH_P2(1,%r9)					\
+	   VPREFETCH_P2(2,%r9)					\
+	   VPREFETCH_P2(3,%r9)					\
+	   VPREFETCH_P2(4,%r9)					\
+	   VPREFETCH_P2(5,%r9)					\
+	   VPREFETCH_P2(6,%r9)					\
+	   VPREFETCH_P2(7,%r9)					\
+	   VPREFETCH_P2(8,%r9)					\
+	   VPREFETCH_P2(9,%r9)					\
+	   VPREFETCH_P2(10,%r9)					\
+	   VPREFETCH_P2(11,%r9));
+
+#define PREFETCH1_CHIMU(A) \
+  LOAD64(%r9,A)							\
+  __asm__ (							\
+	   VPREFETCH_P1(0,%r9)					\
+	   VPREFETCH_P1(1,%r9)					\
+	   VPREFETCH_P1(2,%r9)					\
+	   VPREFETCH_P1(3,%r9)					\
+	   VPREFETCH_P1(4,%r9)					\
+	   VPREFETCH_P1(5,%r9)					\
+	   VPREFETCH_P1(6,%r9)					\
+	   VPREFETCH_P1(7,%r9)					\
+	   VPREFETCH_P1(8,%r9)					\
+	   VPREFETCH_P1(9,%r9)					\
+	   VPREFETCH_P1(10,%r9)					\
+	   VPREFETCH_P1(11,%r9));
+
+#define PERMUTE_DIR0 __asm__ (			\
   VPERM0(Chi_00,Chi_00)	\
   VPERM0(Chi_01,Chi_01)	\
   VPERM0(Chi_02,Chi_02)	\
@@ -614,14 +662,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   LOAD64(%r8,ptr)						\
   LOAD64(%r9,pf)						\
 	   __asm__ (						\
-	   VPREFETCH2(9,%r8)				   VPREFETCH2(10,%r8)					   \
-	   VPREFETCH2(11,%r8)					   \
-	   VPREFETCH2(12,%r8)					   \
-	   VPREFETCH2(13,%r8)					   \
-	   VPREFETCH2(14,%r8)					   \
-	   VPREFETCH2(15,%r8)					   \
-	   VPREFETCH2(16,%r8)					   \
-	   VPREFETCH2(17,%r8)					   \
+	   VPREFETCH_G2(9,%r8)				   \
+	   VPREFETCH_G2(10,%r8)					   \
+	   VPREFETCH_G2(11,%r8)					   \
+	   VPREFETCH_G2(12,%r8)					   \
+	   VPREFETCH_G2(13,%r8)					   \
+	   VPREFETCH_G2(14,%r8)					   \
+	   VPREFETCH_G2(15,%r8)					   \
+	   VPREFETCH_G2(16,%r8)					   \
+	   VPREFETCH_G2(17,%r8)					   \
 	   VSHUF(Chi_00,T1)				\
 	   VMOVIDUP(0,%r8,Z0 )					\
            VMOVIDUP(3,%r8,Z1 )					\
@@ -633,10 +682,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
            VMUL(Z1,T2,UChi_11)            VMOVIDUP(1,%r8,Z0 )	\
            VMUL(Z2,T1,UChi_02)            VMOVIDUP(4,%r8,Z1 )	\
            VMUL(Z2,T2,UChi_12)            VMOVIDUP(7,%r8,Z2 )	\
-	   VPREFETCHG(0,%r9)					   \
-	   VPREFETCHG(1,%r9)					   \
-	   VPREFETCHG(2,%r9)					   \
-	   VPREFETCHG(3,%r9)					   \
+	   VPREFETCH_M1(0,%r9)					   \
+	   VPREFETCH_M1(1,%r9)					   \
+	   VPREFETCH_M1(2,%r9)					   \
+	   VPREFETCH_M1(3,%r9)					   \
 	   /*18*/						\
            VMADDSUB(Z3,Chi_00,UChi_00)    VSHUF(Chi_01,T1)	\
            VMADDSUB(Z3,Chi_10,UChi_10)				\
@@ -644,10 +693,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
            VMADDSUB(Z4,Chi_10,UChi_11)    VSHUF(Chi_11,T2)	\
            VMADDSUB(Z5,Chi_00,UChi_02)    VMOVRDUP(4,%r8,Z4 )	\
            VMADDSUB(Z5,Chi_10,UChi_12)				\
-	   VPREFETCHG(4,%r9)					   \
-	   VPREFETCHG(5,%r9)					   \
-	   VPREFETCHG(6,%r9)					   \
-	   VPREFETCHG(7,%r9)					   \
+	   VPREFETCH_M1(4,%r9)					   \
+	   VPREFETCH_M1(5,%r9)					   \
+	   VPREFETCH_M1(6,%r9)					   \
+	   VPREFETCH_M1(7,%r9)					   \
 	   /*28*/						\
            VMADDSUB(Z0,T1,UChi_00)        VMOVRDUP(7,%r8,Z5 )	\
            VMADDSUB(Z0,T2,UChi_10)				\
@@ -674,15 +723,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
            VMADDSUB(Z4,Chi_11,UChi_11)    VSHUF(Chi_12,T2)	\
            VMADDSUB(Z5,Chi_01,UChi_02)    VMOVRDUP(5,%r8,Z4 )	\
            VMADDSUB(Z5,Chi_11,UChi_12)				\
-	   VPREFETCHG(9,%r8)				   \
-	   VPREFETCHG(10,%r8)					   \
-	   VPREFETCHG(11,%r8)					   \
-	   VPREFETCHG(12,%r8)					   \
-	   VPREFETCHG(13,%r8)					   \
-	   VPREFETCHG(14,%r8)					   \
-	   VPREFETCHG(15,%r8)					   \
-	   VPREFETCHG(16,%r8)					   \
-	   VPREFETCHG(17,%r8)					   \
+	   VPREFETCH_M1(9,%r8)				   \
+	   VPREFETCH_M1(10,%r8)					   \
+	   VPREFETCH_M1(11,%r8)					   \
+	   VPREFETCH_M1(12,%r8)					   \
+	   VPREFETCH_M1(13,%r8)					   \
+	   VPREFETCH_M1(14,%r8)					   \
+	   VPREFETCH_M1(15,%r8)					   \
+	   VPREFETCH_M1(16,%r8)					   \
+	   VPREFETCH_M1(17,%r8)					   \
 	   /*48*/						\
            VMADDSUB(Z0,T1,UChi_00)        VMOVRDUP(8,%r8,Z5 ) \
            VMADDSUB(Z0,T2,UChi_10)			      \
@@ -690,10 +739,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
            VMADDSUB(Z1,T2,UChi_11)			      \
            VMADDSUB(Z2,T1,UChi_02)			      \
            VMADDSUB(Z2,T2,UChi_12)			      \
-	   VPREFETCHG(8,%r9)					   \
-	   VPREFETCHG(9,%r9)					   \
-	   VPREFETCHG(10,%r9)					   \
-	   VPREFETCHG(11,%r9)					   \
+	   VPREFETCH_M1(8,%r9)					   \
+	   VPREFETCH_M1(9,%r9)					   \
+	   VPREFETCH_M1(10,%r9)					   \
+	   VPREFETCH_M1(11,%r9)					   \
 	   /*55*/					      \
            VMADDSUB(Z3,Chi_02,UChi_00)			      \
            VMADDSUB(Z3,Chi_12,UChi_10)			      \
@@ -712,56 +761,56 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
            VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10)   \
            VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11)   \
            VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12)   \
-	   VPREFETCHG(0,%r9)					   \
-	   VPREFETCHG(1,%r9)					   \
-	   VPREFETCHG(2,%r9)					   \
-	   VPREFETCHG(3,%r9)					   \
+	   VPREFETCH_M1(0,%r9)					   \
+	   VPREFETCH_M1(1,%r9)					   \
+	   VPREFETCH_M1(2,%r9)					   \
+	   VPREFETCH_M1(3,%r9)					   \
 	   /*8*/						   \
            VSHUF(Chi_01,T1)	  VSHUF(Chi_11,T2)	       	   \
            VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
            VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
            VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
-	   VPREFETCHG(4,%r9)					   \
-	   VPREFETCHG(5,%r9)					   \
-	   VPREFETCHG(6,%r9)					   \
-	   VPREFETCHG(7,%r9)					   \
+	   VPREFETCH_M1(4,%r9)					   \
+	   VPREFETCH_M1(5,%r9)					   \
+	   VPREFETCH_M1(6,%r9)					   \
+	   VPREFETCH_M1(7,%r9)					   \
 	   /*16*/					  	   \
            VMADDSUBIDUP(1,%r8,T1,UChi_00)     VMADDSUBIDUP(1,%r8,T2,UChi_10)	   \
            VMADDSUBIDUP(4,%r8,T1,UChi_01)     VMADDSUBIDUP(4,%r8,T2,UChi_11) \
            VMADDSUBIDUP(7,%r8,T1,UChi_02)     VMADDSUBIDUP(7,%r8,T2,UChi_12) \
-	   VPREFETCHG(8,%r9)					   \
-	   VPREFETCHG(9,%r9)					   \
-	   VPREFETCHG(10,%r9)					   \
-	   VPREFETCHG(11,%r9)					   \
+	   VPREFETCH_M1(8,%r9)					   \
+	   VPREFETCH_M1(9,%r9)					   \
+	   VPREFETCH_M1(10,%r9)					   \
+	   VPREFETCH_M1(11,%r9)					   \
            /*22*/						   \
            VSHUF(Chi_02,T1)    VSHUF(Chi_12,T2)	                   \
            VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
            VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
            VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
-	   VPREFETCH2(12,%r9)					   \
-	   VPREFETCH2(13,%r9)					   \
-	   VPREFETCH2(14,%r9)					   \
-	   VPREFETCH2(15,%r9)					   \
+	   VPREFETCH_M2(12,%r9)					   \
+	   VPREFETCH_M2(13,%r9)					   \
+	   VPREFETCH_M2(14,%r9)					   \
+	   VPREFETCH_M2(15,%r9)					   \
 	   /*30*/						   \
            VMADDSUBIDUP(2,%r8,T1,UChi_00)     VMADDSUBIDUP(2,%r8,T2,UChi_10)	   \
            VMADDSUBIDUP(5,%r8,T1,UChi_01)     VMADDSUBIDUP(5,%r8,T2,UChi_11)     \
-	   VPREFETCH2(16,%r9)					   \
-	   VPREFETCH2(17,%r9)					   \
-	   VPREFETCH2(18,%r9)					   \
-	   VPREFETCH2(19,%r9)					   \
+	   VPREFETCH_M2(16,%r9)					   \
+	   VPREFETCH_M2(17,%r9)					   \
+	   VPREFETCH_M2(18,%r9)					   \
+	   VPREFETCH_M2(19,%r9)					   \
            VMADDSUBIDUP(8,%r8,T1,UChi_02)     VMADDSUBIDUP(8,%r8,T2,UChi_12)     \
 	   /*36*/					           \
            VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
            VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
            VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
-	   VPREFETCH2(20,%r9)					   \
-	   VPREFETCH2(21,%r9)					   \
-	   VPREFETCH2(22,%r9)					   \
-	   VPREFETCH2(23,%r9)					   \
-	   VPREFETCHG(2,%r8)					   \
-	   VPREFETCHG(3,%r8)					   \
-	   VPREFETCH2(4,%r8)					   \
-	   VPREFETCH2(5,%r8)					   \
+	   VPREFETCH_M2(20,%r9)					   \
+	   VPREFETCH_M2(21,%r9)					   \
+	   VPREFETCH_M2(22,%r9)					   \
+	   VPREFETCH_M2(23,%r9)					   \
+	   VPREFETCH_G1(2,%r8)					   \
+	   VPREFETCH_G1(3,%r8)					   \
+	   VPREFETCH_G2(4,%r8)					   \
+	   VPREFETCH_G2(5,%r8)					   \
 	   /*42 insns*/						);
 
 #define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf)				   \
@@ -794,8 +843,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
            VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
            VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
            VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
-	   /*	   VPREFETCHG(2,%r8)*/				   \
-	   /*	   VPREFETCHG(3,%r8)*/				   \
+	   /*	   VPREFETCH1(2,%r8)*/				   \
+	   /*	   VPREFETCH1(3,%r8)*/				   \
 	   /*42 insns*/						);
 
 

From bdaa5b17670b728cbd65e7ea0a802dc1d9e1bc65 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 30 Jun 2016 13:07:42 -0700
Subject: [PATCH 19/21] Updated to have perfect prefetching for the
 s-vectorised kernel with any cache blocking.

---
 lib/Stencil.h                                 |   5 +
 lib/qcd/action/fermion/WilsonKernelsAsmBody.h | 153 +++++++++---------
 lib/simd/Intel512wilson.h                     |  58 ++++---
 3 files changed, 119 insertions(+), 97 deletions(-)

diff --git a/lib/Stencil.h b/lib/Stencil.h
index bc015370..f5b6c288 100644
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
@@ -272,6 +272,11 @@
 	 if (local) return base + _entries[ent]._byte_offset;
 	 else       return _entries[ent]._byte_offset;
        }
+       inline uint64_t GetPFInfo(int ent,uint64_t base) {
+	 int local = _entries[ent]._is_local;
+	 if (local) return base + _entries[ent]._byte_offset;
+	 else       return        _entries[ent]._byte_offset;
+       }
 
        // Comms buffers
        std::vector<Vector<scalar_object> > u_simd_send_buf;
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
index 7373d2eb..4f3ef861 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
+++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
@@ -1,43 +1,44 @@
 {
-  int locala,perma, ptypea;
-  int localb,permb, ptypeb;
-  int localc,permc, ptypec;
-  uint64_t basea, baseb, basec;
-
+  int local,perm, ptype;
+  uint64_t base;
+  uint64_t basep;
   const uint64_t plocal =(uint64_t) & in._odata[0];
 
   //  vComplexF isigns[2] = { signs[0], signs[1] };
   vComplexF *isigns = &signs[0];
 
   MASK_REGS;
-
+  int nmax=U._grid->oSites();
   for(int site=0;site<Ns;site++) {
   int sU =lo.Reorder(ssU);
+  int ssn=ssU+1; 
+  if(ssn>=nmax) ssn=0;
+  int sUn=lo.Reorder(ssn);
   for(int s=0;s<Ls;s++) {
-  ss     =sU*Ls+s;
-
+  ss =sU*Ls+s;
+  ssn=sUn*Ls+s; 
   ////////////////////////////////
   // Xp
   ////////////////////////////////
-  int ent=ss*8;// 2*Ndim
-  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
-  PREFETCH1_CHIMU(basea);
+  int  ent=ss*8;// 2*Ndim
+  int nent=ssn*8;
+
   PF_GAUGE(Xp); 
+  base  = st.GetInfo(ptype,local,perm,Xp,ent,plocal); ent++;
+  PREFETCH1_CHIMU(base);
 
-  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
-  PREFETCH_CHIMU(baseb);
-  basec = st.GetInfo(ptypec,localc,permc,Zp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basec);
-
-  if ( locala ) {
+  basep = st.GetPFInfo(nent,plocal); nent++;
+  if ( local ) {
     LOAD64(%r10,isigns);
-    XM_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR3,perma);
+    XM_PROJMEM(base);
+    MAYBEPERM(PERMUTE_DIR3,perm);
   } else { 
-    LOAD_CHI(basea);
+    LOAD_CHI(base);
   }
+  base = st.GetInfo(ptype,local,perm,Yp,ent,plocal); ent++;
+  PREFETCH_CHIMU(base);
   {
-    MULT_2SPIN_DIR_PFXP(Xp,baseb);
+    MULT_2SPIN_DIR_PFXP(Xp,basep);
   }
   LOAD64(%r10,isigns);
   XM_RECON;
@@ -45,17 +46,18 @@
   ////////////////////////////////
   // Yp
   ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Tp,ent,plocal); ent++;
-  PREFETCH_CHIMU(basea);
-  if ( localb ) {
+  basep = st.GetPFInfo(nent,plocal); nent++;
+  if ( local ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    YM_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR2,permb);
+    YM_PROJMEM(base);
+    MAYBEPERM(PERMUTE_DIR2,perm);
   } else { 
-    LOAD_CHI(baseb);
+    LOAD_CHI(base);
   }
+  base  = st.GetInfo(ptype,local,perm,Zp,ent,plocal); ent++;
+  PREFETCH_CHIMU(base);
   {
-    MULT_2SPIN_DIR_PFYP(Yp,basec);
+    MULT_2SPIN_DIR_PFYP(Yp,basep);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   YM_RECON_ACCUM;
@@ -63,17 +65,18 @@
   ////////////////////////////////
   // Zp
   ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Xm,ent,plocal); ent++;
-  PREFETCH_CHIMU(baseb);
-  if ( localc ) {
+  basep = st.GetPFInfo(nent,plocal); nent++;
+  if ( local ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    ZM_PROJMEM(basec);
-    MAYBEPERM(PERMUTE_DIR1,permc);
+    ZM_PROJMEM(base);
+    MAYBEPERM(PERMUTE_DIR1,perm);
   } else { 
-    LOAD_CHI(basec);
+    LOAD_CHI(base);
   }
+  base  = st.GetInfo(ptype,local,perm,Tp,ent,plocal); ent++;
+  PREFETCH_CHIMU(base);
   {
-    MULT_2SPIN_DIR_PFZP(Zp,basea);
+    MULT_2SPIN_DIR_PFZP(Zp,basep);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   ZM_RECON_ACCUM;
@@ -81,17 +84,18 @@
   ////////////////////////////////
   // Tp
   ////////////////////////////////
-  basec = st.GetInfo(ptypec,localc,permc,Ym,ent,plocal); ent++;
-  PREFETCH_CHIMU(basec);
-  if ( locala ) {
+  basep = st.GetPFInfo(nent,plocal); nent++;
+  if ( local ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    TM_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR0,perma);
+    TM_PROJMEM(base);
+    MAYBEPERM(PERMUTE_DIR0,perm);
   } else { 
-    LOAD_CHI(basea);
+    LOAD_CHI(base);
   }
+  base = st.GetInfo(ptype,local,perm,Xm,ent,plocal); ent++;
+  PREFETCH_CHIMU(base);
   {
-    MULT_2SPIN_DIR_PFTP(Tp,baseb);
+    MULT_2SPIN_DIR_PFTP(Tp,basep);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   TM_RECON_ACCUM;
@@ -99,17 +103,19 @@
   ////////////////////////////////
   // Xm
   ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++;
-  PREFETCH_CHIMU(basea);
-  if ( localb ) {
+  basep= (uint64_t) &out._odata[ss];
+  //  basep= st.GetPFInfo(nent,plocal); nent++;
+  if ( local ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    XP_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR3,permb);
+    XP_PROJMEM(base);
+    MAYBEPERM(PERMUTE_DIR3,perm);
   } else { 
-    LOAD_CHI(baseb);
+    LOAD_CHI(base);
   }
+  base = st.GetInfo(ptype,local,perm,Ym,ent,plocal); ent++;
+  PREFETCH_CHIMU(base);
   {
-    MULT_2SPIN_DIR_PFXM(Xm,basec);
+    MULT_2SPIN_DIR_PFXM(Xm,basep);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   XP_RECON_ACCUM;
@@ -117,17 +123,18 @@
   ////////////////////////////////
   // Ym
   ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++;
-  PREFETCH_CHIMU(baseb);
-  if ( localc ) {
+  basep= st.GetPFInfo(nent,plocal); nent++;
+  if ( local ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    YP_PROJMEM(basec);
-    MAYBEPERM(PERMUTE_DIR2,permc);
+    YP_PROJMEM(base);
+    MAYBEPERM(PERMUTE_DIR2,perm);
   } else { 
-    LOAD_CHI(basec);
+    LOAD_CHI(base);
   }
+  base = st.GetInfo(ptype,local,perm,Zm,ent,plocal); ent++;
+  PREFETCH_CHIMU(base);
   {
-    MULT_2SPIN_DIR_PFYM(Ym,basea);
+    MULT_2SPIN_DIR_PFYM(Ym,basep);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   YP_RECON_ACCUM;
@@ -135,17 +142,18 @@
   ////////////////////////////////
   // Zm
   ////////////////////////////////
-  basec = (uint64_t)&out._odata[ss];
-  PREFETCH_CHIMU(basec);
-  if ( locala ) {
+  basep= st.GetPFInfo(nent,plocal); nent++;
+  if ( local ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    ZP_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR1,perma);
+    ZP_PROJMEM(base);
+    MAYBEPERM(PERMUTE_DIR1,perm);
   } else { 
-    LOAD_CHI(basea);
+    LOAD_CHI(base);
   }
+  base = st.GetInfo(ptype,local,perm,Tm,ent,plocal); ent++;
+  PREFETCH_CHIMU(base);
   {
-    MULT_2SPIN_DIR_PFZM(Zm,baseb);
+    MULT_2SPIN_DIR_PFZM(Zm,basep);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   ZP_RECON_ACCUM;
@@ -153,23 +161,24 @@
   ////////////////////////////////
   // Tm
   ////////////////////////////////
-  //  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
-  //  PREFETCH_CHIMU(basea);
-  if ( localb ) {
+  basep= st.GetPFInfo(nent,plocal); nent++;
+  if ( local ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    TP_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR0,permb);
+    TP_PROJMEM(base);
+    MAYBEPERM(PERMUTE_DIR0,perm);
   } else { 
-    LOAD_CHI(baseb);
+    LOAD_CHI(base);
   }
+  base= (uint64_t) &out._odata[ss];
+  PREFETCH_CHIMU(base);
   {
-    MULT_2SPIN_DIR_PFTM(Tm,basec);
+    MULT_2SPIN_DIR_PFTM(Tm,basep);
   }
-  //  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
   TP_RECON_ACCUM;
 
-  SAVE_RESULT(&out._odata[ss],basec);
+  basep= st.GetPFInfo(nent,plocal); nent++;
+  SAVE_RESULT(base,basep);
   
   }
   ssU++;
diff --git a/lib/simd/Intel512wilson.h b/lib/simd/Intel512wilson.h
index 9deffd80..660d07d6 100644
--- a/lib/simd/Intel512wilson.h
+++ b/lib/simd/Intel512wilson.h
@@ -261,8 +261,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define XM_PROJMEM(PTR) \
   LOAD64(%r8,PTR)\
   __asm__ (								\
-	   SHUF_CHIMU23i						\
 	   LOAD_CHIi \
+	   SHUF_CHIMU23i						\
 	   VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30)\
 	   VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31)\
 	   VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32)\
@@ -290,8 +290,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define ZM_PROJMEM(PTR) \
   LOAD64(%r8,PTR)							\
   __asm__ (								\
-	   SHUF_CHIMU23i						\
            LOAD_CHIi \
+	   SHUF_CHIMU23i						\
 	   VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20)\
 	   VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21)\
 	   VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22)\
@@ -548,24 +548,25 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define AVX512_PF_L2_TABLE
 #undef  AVX512_PF_L2_LINEAR
 
-#ifdef AVX512_PF_L2_TABLE
-#define VPREFETCH_P1(A,B)  VPREFETCH1(A,B)
-#define VPREFETCH_P2(A,B) VPREFETCH1(A,B)
-#else
-#define VPREFETCH_P1(A,B)
-#define VPREFETCH_P2(A,B)
-#endif
-#ifdef AVX512_PF_L2_LINEAR
-#define VPREFETCH_M1(A,B) 
+#ifdef AVX512_PF_L2_TABLE  
+// VPREFETCH_P1 fetches the base pointer for the next link into L1
+// VPREFETCH_M1 fetches the next site pointer into L2
+#define VPREFETCH_P1(A,B) VPREFETCH1(A,B)
+#define VPREFETCH_P2(A,B) 
+#define VPREFETCH_M1(A,B) VPREFETCH2(A,B)
 #define VPREFETCH_M2(A,B) 
-#else 
+#endif
+
+#ifdef AVX512_PF_L2_LINEAR
 #define VPREFETCH_M1(A,B) VPREFETCH1(A,B)
 #define VPREFETCH_M2(A,B) VPREFETCH2(A,B)
+#define VPREFETCH_P1(A,B) 
+#define VPREFETCH_P2(A,B)
 #endif
+
 #ifdef AVX512_PF_L2_GAUGE
 #define VPREFETCH_G1(A,B)  VPREFETCH1(A,B)
 #define VPREFETCH_G2(A,B)  VPREFETCH2(A,B)
-#else
 #endif
 
 #define PF_GAUGE(A) \
@@ -593,21 +594,26 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 	   VSTORE(11,%r8,result_32) 	VPREFETCH_M1(11,%r9)	\
 						);
 
+#ifdef AVX512_PF_L2_TABLE
 #define PREFETCH_CHIMU(A) \
   LOAD64(%r9,A)							\
   __asm__ (							\
-	   VPREFETCH_P2(0,%r9)					\
-	   VPREFETCH_P2(1,%r9)					\
-	   VPREFETCH_P2(2,%r9)					\
-	   VPREFETCH_P2(3,%r9)					\
-	   VPREFETCH_P2(4,%r9)					\
-	   VPREFETCH_P2(5,%r9)					\
-	   VPREFETCH_P2(6,%r9)					\
-	   VPREFETCH_P2(7,%r9)					\
-	   VPREFETCH_P2(8,%r9)					\
-	   VPREFETCH_P2(9,%r9)					\
-	   VPREFETCH_P2(10,%r9)					\
-	   VPREFETCH_P2(11,%r9));
+	   VPREFETCH_P1(0,%r9)					\
+	   VPREFETCH_P1(1,%r9)					\
+	   VPREFETCH_P1(2,%r9)					\
+	   VPREFETCH_P1(3,%r9)					\
+	   VPREFETCH_P1(4,%r9)					\
+	   VPREFETCH_P1(5,%r9)					\
+	   VPREFETCH_P1(6,%r9)					\
+	   VPREFETCH_P1(7,%r9)					\
+	   VPREFETCH_P1(8,%r9)					\
+	   VPREFETCH_P1(9,%r9)					\
+	   VPREFETCH_P1(10,%r9)					\
+	   VPREFETCH_P1(11,%r9));
+
+#else
+#define PREFETCH_CHIMU(A)
+#endif
 
 #define PREFETCH1_CHIMU(A) \
   LOAD64(%r9,A)							\
@@ -811,6 +817,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 	   VPREFETCH_G1(3,%r8)					   \
 	   VPREFETCH_G2(4,%r8)					   \
 	   VPREFETCH_G2(5,%r8)					   \
+	   VPREFETCH_G2(6,%r8)					   \
+	   VPREFETCH_G2(7,%r8)					   \
 	   /*42 insns*/						);
 
 #define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf)				   \

From 712b9a348979f41e50f9f8bb7c7838807dbebe14 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 30 Jun 2016 14:00:34 -0700
Subject: [PATCH 20/21] Compile the asm Dhop kernel path only for AVX512 builds

---
 lib/qcd/action/fermion/WilsonKernels.cc | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/lib/qcd/action/fermion/WilsonKernels.cc
index 672c23d6..4edd25f9 100644
--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
@@ -42,12 +42,15 @@ void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,LebesgueOrder &lo,Dou
 						  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 						  int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
 {
+#ifdef AVX512
   if ( AsmOpt ) {
 
     WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
 
   } else {
-
+#else
+  {  
+#endif
     for(int site=0;site<Ns;site++) {
       for(int s=0;s<Ls;s++) {
 	if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);

From 3fc6e03ad11881367ca62b2ca85abf0076f47897 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 30 Jun 2016 14:44:09 -0700
Subject: [PATCH 21/21] Version file

---
 VERSION | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 VERSION

diff --git a/VERSION b/VERSION
new file mode 100644
index 00000000..c12f9497
--- /dev/null
+++ b/VERSION
@@ -0,0 +1,4 @@
+Version : 0.5.0
+
+- AVX512, AVX2, AVX, SSE good
+- Clang 3.5 and above, ICPC v16 and above, GCC 4.9 and above