Enable reordering of the loops in the assembler for cache friendly.

This gets in the way of L2 prefetching however. Do next next link in stencil prefetching.
2026-05-20 17:14:30 +01:00 · 2016-06-19 11:45:58 -07:00
parent d6737e4bd8
commit 6d58cb2a68
15 changed files with 670 additions and 116 deletions
@@ -0,0 +1,358 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./benchmarks/Benchmark_dwf.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid.h>
 #include <PerfCount.h>
 using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 template<class d>
 struct scal {
  d internal;
 };
  Gamma::GammaMatrix Gmu [] = {
    Gamma::GammaX,
    Gamma::GammaY,
    Gamma::GammaZ,
    Gamma::GammaT
  };
 void benchDw(std::vector<int> & L, int Ls, int threads, int report =0 );
 void benchsDw(std::vector<int> & L, int Ls, int threads, int report=0 );
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  const int Ls=16;
  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
  if ( getenv("ASMOPT") )  {
    QCD::WilsonKernelsStatic::AsmOpt=1;
  } else { 
    QCD::WilsonKernelsStatic::AsmOpt=0;
  }
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
  std::cout<<GridLogMessage << "Volume \t\t\tProcs \t Dw \t eoDw \t sDw \t eosDw (Mflop/s)  "<<std::endl;
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
  int Lmax=32;
  if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX"));
  for (int L=8;L<Lmax;L*=2){
    std::vector<int> latt4(4,L);
    for(int d=4;d>0;d--){
      if ( d<=3 ) latt4[d]*=2;
      std::cout << GridLogMessage <<"\t";
      for(int d=0;d<Nd;d++){
 	std::cout<<latt4[d]<<"x";
      }
      std::cout <<Ls<<"\t" ;
      benchDw (latt4,Ls,threads,0);
      benchsDw(latt4,Ls,threads,0);
      std::cout<<std::endl;
    }
  }
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
  {
    std::vector<int> latt4(4,16);
    std::cout<<GridLogMessage << "16^4 Dw miss rate"<<std::endl;
    benchDw (latt4,Ls,threads,1);
    std::cout<<GridLogMessage << "16^4 sDw miss rate"<<std::endl;
    benchsDw(latt4,Ls,threads,1);
  }
  Grid_finalize();
 }
 #undef CHECK
 void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
 {
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
 #ifdef CHECK 
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
  LatticeFermion src   (FGrid); random(RNG5,src);
  LatticeGaugeField Umu(UGrid); 
  random(RNG4,Umu);
 #else 
  LatticeFermion src   (FGrid); src=zero;
  LatticeGaugeField Umu(UGrid); Umu=zero;
 #endif
  LatticeFermion result(FGrid); result=zero;
  LatticeFermion    ref(FGrid);    ref=zero;
  LatticeFermion    tmp(FGrid);
  LatticeFermion    err(FGrid);
  ColourMatrix cm = Complex(1.0,0.0);
  LatticeGaugeField Umu5d(FGrid); 
  // replicate across fifth dimension
  for(int ss=0;ss<Umu._grid->oSites();ss++){
    for(int s=0;s<Ls;s++){
      Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
    }
  }
  ////////////////////////////////////
  // Naive wilson implementation
  ////////////////////////////////////
  std::vector<LatticeColourMatrix> U(4,FGrid);
  for(int mu=0;mu<Nd;mu++){
    U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
  }
 #ifdef CHECK
  if (1)
  {
    ref = zero;
    for(int mu=0;mu<Nd;mu++){
      tmp = U[mu]*Cshift(src,mu+1,1);
      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
      tmp =adj(U[mu])*src;
      tmp =Cshift(tmp,mu+1,-1);
      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
    }
    ref = -0.5*ref;
  }
 #endif
  RealD mass=0.1;
  RealD M5  =1.8;
  RealD NP = UGrid->_Nprocessors;
  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  double t0=usecond();
  Dw.Dhop(src,result,0);
  double t1=usecond();
  int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
  if (ncall < 5 ) exit(0);
  Dw.Dhop(src,result,0);
  PerformanceCounter Counter(8);
  Counter.Start();
  t0=usecond();
  for(int i=0;i<ncall;i++){
    Dw.Dhop(src,result,0);
  }
  t1=usecond();
  Counter.Stop();
  if ( report ) {
    Counter.Report();
  }
  if ( ! report ) 
    {
      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
      double flops=1344*volume*ncall;
      std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
    }
 #ifdef CHECK
    err = ref-result; 
    RealD errd = norm2(err);
    if ( errd> 1.0e-4 ) {
      std::cout<<GridLogMessage << "oops !!! norm diff   "<< norm2(err)<<std::endl;
      exit(-1);
    }
 #endif
  LatticeFermion src_e (FrbGrid);
  LatticeFermion src_o (FrbGrid);
  LatticeFermion r_e   (FrbGrid);
  LatticeFermion r_o   (FrbGrid);
  LatticeFermion r_eo  (FGrid);
  pickCheckerboard(Even,src_e,src);
  pickCheckerboard(Odd,src_o,src);
  {
    Dw.DhopEO(src_o,r_e,DaggerNo);
    double t0=usecond();
    for(int i=0;i<ncall;i++){
      Dw.DhopEO(src_o,r_e,DaggerNo);
    }
    double t1=usecond();
    if(!report){
      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
      double flops=(1344.0*volume*ncall)/2;
      std::cout<< flops/(t1-t0);
    }
  }
 }
 #undef CHECK_SDW
 void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
 {
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
 #ifdef CHECK_SDW
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
  LatticeFermion src   (FGrid); random(RNG5,src);
  LatticeGaugeField Umu(UGrid); 
  random(RNG4,Umu);
 #else 
  LatticeFermion src   (FGrid); src=zero;
  LatticeGaugeField Umu(UGrid); Umu=zero;
 #endif
  LatticeFermion result(FGrid); result=zero;
  LatticeFermion    ref(FGrid);    ref=zero;
  LatticeFermion    tmp(FGrid);
  LatticeFermion    err(FGrid);
  ColourMatrix cm = Complex(1.0,0.0);
  LatticeGaugeField Umu5d(FGrid); 
  // replicate across fifth dimension
  for(int ss=0;ss<Umu._grid->oSites();ss++){
    for(int s=0;s<Ls;s++){
      Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
    }
  }
  RealD mass=0.1;
  RealD M5  =1.8;
    typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
    LatticeFermionF ssrc(sFGrid);
    LatticeFermionF sref(sFGrid);
    LatticeFermionF sresult(sFGrid);
    WilsonFermion5DF sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5);
    for(int x=0;x<latt4[0];x++){
    for(int y=0;y<latt4[1];y++){
    for(int z=0;z<latt4[2];z++){
    for(int t=0;t<latt4[3];t++){
    for(int s=0;s<Ls;s++){
      std::vector<int> site({s,x,y,z,t});
      SpinColourVectorF tmp;
      peekSite(tmp,src,site);
      pokeSite(tmp,ssrc,site);
    }}}}}
    double t0=usecond();
    sDw.Dhop(ssrc,sresult,0);
    double t1=usecond();
    int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
    PerformanceCounter Counter(8);
    Counter.Start();
    t0=usecond();
    for(int i=0;i<ncall;i++){
      sDw.Dhop(ssrc,sresult,0);
    }
    t1=usecond();
    Counter.Stop();
    if ( report ) {
      Counter.Report();
    } else { 
      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
      double flops=1344*volume*ncall;
      std::cout<<"\t"<< flops/(t1-t0);
    }
    LatticeFermionF sr_eo(sFGrid);
    LatticeFermionF serr(sFGrid);
    LatticeFermion ssrc_e (sFrbGrid);
    LatticeFermion ssrc_o (sFrbGrid);
    LatticeFermion sr_e   (sFrbGrid);
    LatticeFermion sr_o   (sFrbGrid);
    pickCheckerboard(Even,ssrc_e,ssrc);
    pickCheckerboard(Odd,ssrc_o,ssrc);
    setCheckerboard(sr_eo,ssrc_o);
    setCheckerboard(sr_eo,ssrc_e);
    sr_e = zero;
    sr_o = zero;
    sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
    PerformanceCounter CounterSdw(8);
    CounterSdw.Start();
    t0=usecond();
    for(int i=0;i<ncall;i++){
      sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
    }
    t1=usecond();
    CounterSdw.Stop();
    if ( report ) { 
      CounterSdw.Report();
    } else {
      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
      double flops=(1344.0*volume*ncall)/2;
      std::cout<<"\t"<< flops/(t1-t0);
    }
 }
@@ -63,7 +63,7 @@ namespace Grid {
 	Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
 	assert(zdata->n==this->Ls);
-	std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
+	//	std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
 	// Call base setter
 	this->SetCoefficientsTanh(zdata,1.0,0.0);
@@ -53,6 +53,8 @@ namespace QCD {
 	StencilEven(&Hgrid,npoint,Even,directions,displacements), // source is Even
 	StencilOdd (&Hgrid,npoint,Odd ,directions,displacements), // source is Odd
 	mass(_mass),
 	Lebesgue(_grid),
 	LebesgueEvenOdd(_cbgrid),
 	Umu(&Fgrid),
 	UmuEven(&Hgrid),
 	UmuOdd (&Hgrid) 
@@ -228,7 +230,7 @@ PARALLEL_FOR_LOOP
    out.checkerboard = in.checkerboard;
-    DhopInternal(Stencil,Umu,in,out,dag);
+    DhopInternal(Stencil,Lebesgue,Umu,in,out,dag);
  }
  template<class Impl>
@@ -239,7 +241,7 @@ PARALLEL_FOR_LOOP
    assert(in.checkerboard==Even);
    out.checkerboard = Odd;
-    DhopInternal(StencilEven,UmuOdd,in,out,dag);
+    DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag);
  }
  template<class Impl>
@@ -250,7 +252,7 @@ PARALLEL_FOR_LOOP
    assert(in.checkerboard==Odd);
    out.checkerboard = Even;
-    DhopInternal(StencilOdd,UmuEven,in,out,dag);
+    DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag);
  }
  template<class Impl>
@@ -285,7 +287,7 @@ PARALLEL_FOR_LOOP
  };
  template<class Impl>
-  void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
+  void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,LebesgueOrder& lo,DoubledGaugeField & U,
 					 const FermionField &in, FermionField &out,int dag) 
  {
    assert((dag==DaggerNo) ||(dag==DaggerYes));
@@ -296,12 +298,12 @@ PARALLEL_FOR_LOOP
    if ( dag == DaggerYes ) {
 PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
-	Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,1,1,in,out);
+	Kernels::DiracOptDhopSiteDag(st,lo,U,st.comm_buf,sss,sss,1,1,in,out);
      }
    } else {
 PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
-	Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,1,1,in,out);
+	Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sss,sss,1,1,in,out);
      }
    }
  };
@@ -111,7 +111,7 @@ namespace Grid {
 			 const FermionField &B,
 			 int dag);
-      void DhopInternal(StencilImpl & st,DoubledGaugeField & U,
+      void DhopInternal(StencilImpl & st,LebesgueOrder & lo,DoubledGaugeField & U,
 			const FermionField &in, FermionField &out,int dag) ;
      // Constructor
@@ -146,6 +146,10 @@ namespace Grid {
      DoubledGaugeField Umu;
      DoubledGaugeField UmuEven;
      DoubledGaugeField UmuOdd;
      LebesgueOrder Lebesgue;
      LebesgueOrder LebesgueEvenOdd;
    };
@@ -321,14 +321,14 @@ PARALLEL_FOR_LOOP
    for(int ss=0;ss<U._grid->oSites();ss++){
 	int sU=ss;
 	int sF=LLs*sU;
-	Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
+	Kernels::DiracOptDhopSiteDag(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
    }
  } else {
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<U._grid->oSites();ss++){
      int sU=ss;
      int sF=LLs*sU;
-      Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
+      Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
    }
  }
 }
@@ -38,20 +38,20 @@ template<class Impl>
 WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};
 template<class Impl> 
-void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 						  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 						  int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
 {
  if ( AsmOpt ) {
-    WilsonKernels<Impl>::DiracOptAsmDhopSite(st,U,buf,sF,sU,Ls,Ns,in,out);
+    WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
  } else {
    for(int site=0;site<Ns;site++) {
      for(int s=0;s<Ls;s++) {
-	if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,U,buf,sF,sU,in,out);
+	if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);
-	else         WilsonKernels<Impl>::DiracOptGenericDhopSite(st,U,buf,sF,sU,in,out);
+	else         WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out);
 	sF++;
      }
      sU++;
@@ -61,17 +61,17 @@ void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
 }
 template<class Impl> 
-void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 					   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					   int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
 {
  // No asm implementation yet.
-  //  if ( AsmOpt )     WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,U,buf,sF,sU,in,out);
+  //  if ( AsmOpt )     WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
  //  else
  for(int site=0;site<Ns;site++) {
    for(int s=0;s<Ls;s++) {
-      if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,U,buf,sF,sU,in,out);
+      if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
-      else         WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,U,buf,sF,sU,in,out);
+      else         WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
      sF++;
    }
    sU++;
@@ -84,7 +84,7 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField
  ////////////////////////////////////////////
 template<class Impl> 
-void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 					   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					   int sF,int sU,const FermionField &in, FermionField &out)
 {
@@ -262,7 +262,7 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaug
  // Need controls to do interior, exterior, or both
 template<class Impl> 
-void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 						  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 						  int sF,int sU,const FermionField &in, FermionField &out)
 {
@@ -53,11 +53,11 @@ namespace Grid {
    public:
-     void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
+     void DiracOptDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 			   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			   int sF, int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
-     void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+     void DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,int Ls, int Ns, const FermionField &in,FermionField &out);
@@ -67,24 +67,24 @@ namespace Grid {
    private:
     // Specialised variants
-     void DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
+     void DiracOptGenericDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 			   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			   int sF,int sU, const FermionField &in, FermionField &out);
-     void DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+     void DiracOptGenericDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,const FermionField &in,FermionField &out);
-     void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
+     void DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
-     void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+     void DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,const FermionField &in, FermionField &out);
-     void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+     void DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 				 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 				 int sF,int sU,const FermionField &in, FermionField &out);
    public:
@@ -39,9 +39,9 @@ namespace QCD {
  // Default to no assembler implementation
  ///////////////////////////////////////////////////////////
 template<class Impl>
-void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-					       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
+					       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 {
  assert(0);
 }
@@ -71,9 +71,9 @@ static int signInit = setupSigns();
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 template<>
-void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 						     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-						     int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
+						     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 #undef VMOVIDUP
@@ -85,31 +85,31 @@ void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaug
 #define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
 template<>
-void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								   int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
+								   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 #endif
-template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							      int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+							      int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
+template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
+template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
+template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
+template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
+template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 }}
@@ -1,7 +1,8 @@
 {
  int locala,perma, ptypea;
  int localb,permb, ptypeb;
-  uint64_t basea, baseb;
+  int localc,permc, ptypec;
  uint64_t basea, baseb, basec;
  uint64_t basex;
  const uint64_t plocal =(uint64_t) & in._odata[0];
@@ -11,14 +12,22 @@
  MASK_REGS;
  for(int site=0;site<Ns;site++) {
  int sU=lo.Reorder(ssU);
  for(int s=0;s<Ls;s++) {
  ss     =sU*Ls+s;
  ////////////////////////////////
  // Xp
  ////////////////////////////////
  int ent=ss*8;// 2*Ndim
  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
  PREFETCH_CHIMU(basea);
  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
  PREFETCH_CHIMU(baseb);
  basec = st.GetInfo(ptypec,localc,permc,Zp,ent,plocal); ent++;
  PREFETCH_CHIMU(basec);
  basex = basea;
  if ( locala ) {
@@ -38,6 +47,7 @@
  // Yp
  ////////////////////////////////
  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
  PREFETCH_CHIMU(basea);
  if ( localb ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
    YM_PROJMEM(baseb);
@@ -46,7 +56,7 @@
    LOAD_CHI(baseb);
  }
  {
-    MULT_2SPIN_DIR_PFYP(Yp,basea);
+    MULT_2SPIN_DIR_PFYP(Yp,basec);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
  YM_RECON_ACCUM;
@@ -55,15 +65,16 @@
  // Zp
  ////////////////////////////////
  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
-  if ( locala ) {
+  PREFETCH_CHIMU(baseb);
  if ( localc ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    ZM_PROJMEM(basea);
+    ZM_PROJMEM(basec);
-    MAYBEPERM(PERMUTE_DIR1,perma);
+    MAYBEPERM(PERMUTE_DIR1,permc);
  } else { 
-    LOAD_CHI(basea);
+    LOAD_CHI(basec);
  }
  {
-    MULT_2SPIN_DIR_PFZP(Zp,baseb);
+    MULT_2SPIN_DIR_PFZP(Zp,basea);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
  ZM_RECON_ACCUM;
@@ -71,16 +82,17 @@
  ////////////////////////////////
  // Tp
  ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++;
-  if ( localb ) {
+  PREFETCH_CHIMU(basec);
  if ( locala ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    TM_PROJMEM(baseb);
+    TM_PROJMEM(basea);
-    MAYBEPERM(PERMUTE_DIR0,permb);
+    MAYBEPERM(PERMUTE_DIR0,perma);
  } else { 
-    LOAD_CHI(baseb);
+    LOAD_CHI(basea);
  }
  {
-    MULT_2SPIN_DIR_PFTP(Tp,basea);
+    MULT_2SPIN_DIR_PFTP(Tp,baseb);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
  TM_RECON_ACCUM;
@@ -88,16 +100,17 @@
  ////////////////////////////////
  // Xm
  ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++;
-  if ( locala ) {
+  PREFETCH_CHIMU(basea);
  if ( localb ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    XP_PROJMEM(basea);
+    XP_PROJMEM(baseb);
-    MAYBEPERM(PERMUTE_DIR3,perma);
+    MAYBEPERM(PERMUTE_DIR3,permb);
  } else { 
-    LOAD_CHI(basea);
+    LOAD_CHI(baseb);
  }
  {
-    MULT_2SPIN_DIR_PFXM(Xm,baseb);
+    MULT_2SPIN_DIR_PFXM(Xm,basec);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
  XP_RECON_ACCUM;
@@ -105,13 +118,14 @@
  ////////////////////////////////
  // Ym
  ////////////////////////////////
-  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
+  baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++;
-  if ( localb ) {
+  PREFETCH_CHIMU(baseb);
  if ( localc ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
-    YP_PROJMEM(baseb);
+    YP_PROJMEM(basec);
-    MAYBEPERM(PERMUTE_DIR2,permb);
+    MAYBEPERM(PERMUTE_DIR2,permc);
  } else { 
-    LOAD_CHI(baseb);
+    LOAD_CHI(basec);
  }
  {
    MULT_2SPIN_DIR_PFYM(Ym,basea);
@@ -122,7 +136,8 @@
  ////////////////////////////////
  // Zm
  ////////////////////////////////
-  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
+  basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++;
  PREFETCH_CHIMU(basec);
  if ( locala ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
    ZP_PROJMEM(basea);
@@ -140,6 +155,7 @@
  // Tm
  ////////////////////////////////
  basea = (uint64_t)&out._odata[ss];
  PREFETCH_CHIMU(basea);
  if ( localb ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
    TP_PROJMEM(baseb);
@@ -148,17 +164,15 @@
    LOAD_CHI(baseb);
  }
  {
-    MULT_2SPIN_DIR_PFTM(Tm,basea);
+    MULT_2SPIN_DIR_PFTM(Tm,basec);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
  TP_RECON_ACCUM;
-  PREFETCH_CHIMU(basex);
+  //  PREFETCH_CHIMU(basex);
  SAVE_RESULT(&out._odata[ss]);
-  ss++;
+  }
-  } 
+  ssU++;
  sU++;
  }
 }
@@ -0,0 +1,163 @@
 {
  int locala,perma, ptypea;
  int localb,permb, ptypeb;
  uint64_t basea, baseb;
  uint64_t basex;
  const uint64_t plocal =(uint64_t) & in._odata[0];
  //  vComplexF isigns[2] = { signs[0], signs[1] };
  vComplexF *isigns = &signs[0];
  MASK_REGS;
  for(int site=0;site<Ns;site++) {
  int sU=lo.Reorder(ssU);  
  for(int s=0;s<Ls;s++) {
  ss=sU*Ls+s;
  ////////////////////////////////
  // Xp
  ////////////////////////////////
  int ent=ss*8;// 2*Ndim
  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
  PREFETCH_CHIMU(basea);
  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
  basex = basea;
  if ( locala ) {
    LOAD64(%r10,isigns);
    XM_PROJMEM(basea);
    MAYBEPERM(PERMUTE_DIR3,perma);
  } else { 
    LOAD_CHI(basea);
  }
  {
    MULT_2SPIN_DIR_PFXP(Xp,baseb);
  }
  LOAD64(%r10,isigns);
  XM_RECON;
  ////////////////////////////////
  // Yp
  ////////////////////////////////
  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
  if ( localb ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
    YM_PROJMEM(baseb);
    MAYBEPERM(PERMUTE_DIR2,permb);
  } else { 
    LOAD_CHI(baseb);
  }
  {
    MULT_2SPIN_DIR_PFYP(Yp,basea);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
  YM_RECON_ACCUM;
  ////////////////////////////////
  // Zp
  ////////////////////////////////
  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
  if ( locala ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
    ZM_PROJMEM(basea);
    MAYBEPERM(PERMUTE_DIR1,perma);
  } else { 
    LOAD_CHI(basea);
  }
  {
    MULT_2SPIN_DIR_PFZP(Zp,baseb);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
  ZM_RECON_ACCUM;
  ////////////////////////////////
  // Tp
  ////////////////////////////////
  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
  if ( localb ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
    TM_PROJMEM(baseb);
    MAYBEPERM(PERMUTE_DIR0,permb);
  } else { 
    LOAD_CHI(baseb);
  }
  {
    MULT_2SPIN_DIR_PFTP(Tp,basea);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
  TM_RECON_ACCUM;
  ////////////////////////////////
  // Xm
  ////////////////////////////////
  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
  if ( locala ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
    XP_PROJMEM(basea);
    MAYBEPERM(PERMUTE_DIR3,perma);
  } else { 
    LOAD_CHI(basea);
  }
  {
    MULT_2SPIN_DIR_PFXM(Xm,baseb);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
  XP_RECON_ACCUM;
  ////////////////////////////////
  // Ym
  ////////////////////////////////
  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
  if ( localb ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
    YP_PROJMEM(baseb);
    MAYBEPERM(PERMUTE_DIR2,permb);
  } else { 
    LOAD_CHI(baseb);
  }
  {
    MULT_2SPIN_DIR_PFYM(Ym,basea);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
  YP_RECON_ACCUM;
  ////////////////////////////////
  // Zm
  ////////////////////////////////
  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
  if ( locala ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
    ZP_PROJMEM(basea);
    MAYBEPERM(PERMUTE_DIR1,perma);
  } else { 
    LOAD_CHI(basea);
  }
  {
    MULT_2SPIN_DIR_PFZM(Zm,baseb);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
  ZP_RECON_ACCUM;
  ////////////////////////////////
  // Tm
  ////////////////////////////////
  basea = (uint64_t)&out._odata[ss];
  if ( localb ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
    TP_PROJMEM(baseb);
    MAYBEPERM(PERMUTE_DIR0,permb);
  } else { 
    LOAD_CHI(baseb);
  }
  {
    MULT_2SPIN_DIR_PFTM(Tm,basea);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
  TP_RECON_ACCUM;
  SAVE_RESULT(&out._odata[ss]);
  } 
  ssU++;
  }
 }
@@ -312,7 +312,7 @@ namespace QCD {
 template<class Impl>
-void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					       int ss,int sU,const FermionField &in, FermionField &out)
 {
@@ -555,7 +555,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel
 }
 template<class Impl>
-void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					       int ss,int sU,const FermionField &in, FermionField &out)
 {
@@ -803,7 +803,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
  // Specialise Gparity to simple implementation
  ////////////////////////////////////////////////
 template<>
-void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							     int sF,int sU,const FermionField &in, FermionField &out)
 {
@@ -811,7 +811,7 @@ void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,Dou
 }
 template<>
-void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								int sF,int sU,const FermionField &in, FermionField &out)
 {
@@ -819,7 +819,7 @@ void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,
 }
 template<>
-void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							     int sF,int sU,const FermionField &in, FermionField &out)
 {
@@ -827,7 +827,7 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,Dou
 }
 template<>
-void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								int sF,int sU,const FermionField &in, FermionField &out)
 {
@@ -839,44 +839,44 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,
 ////////////// Wilson ; uses this implementation /////////////////////
 // Need Nc=3 though //
-template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								  int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								  int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);
@@ -88,7 +88,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n" 
 #define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n" 
 #define VPREFETCHP(O,A) "prefetcht1 "#O"*64("#A");\n" 
 #define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n" 
 #define VPREFETCHNTA(O,A) 
 #define VPREFETCH(O,A)    
 #define VEVICT(O,A)   
 //"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
@@ -124,8 +128,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define ZLOADf(OFF,PTR,ri,ir)  VLOADf(OFF,PTR,ir)  VSHUFf(ir,ri)
 #define ZLOADd(OFF,PTR,ri,ir)  VLOADd(OFF,PTR,ir)  VSHUFd(ir,ri)
 #define VPREFETCHNTA(O,A) 
 #define VPREFETCH(O,A)    
 #define VSTOREf(OFF,PTR,SRC)   "vmovaps " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
 #define VSTOREd(OFF,PTR,SRC)   "vmovapd " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
@@ -559,22 +559,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  VSUB(UChi_02,result_22,result_22)\
  VSUB(UChi_12,result_32,result_32) );
-#define PREFETCH_CHIMU(A) \
+#define PREFETCH_CHIMU(A) 
 /*
  LOAD64(%r9,A)						\
 	   __asm__ (						\
-  VPREFETCHG(12,%r9)\
+  VPREFETCHG(0,%r9)\
-  VPREFETCHG(13,%r9)\
+  VPREFETCHG(1,%r9)\
-  VPREFETCHG(14,%r9)\
+  VPREFETCHG(2,%r9)\
-  VPREFETCHG(15,%r9)\
+  VPREFETCHG(3,%r9)\
-  VPREFETCHG(16,%r9)\
+  VPREFETCHG(4,%r9)\
-  VPREFETCHG(17,%r9)\
+  VPREFETCHG(5,%r9)\
-  VPREFETCHG(18,%r9)\
+  VPREFETCHG(6,%r9)\
-  VPREFETCHG(19,%r9)\
+  VPREFETCHG(7,%r9)\
-  VPREFETCHG(20,%r9)\
+  VPREFETCHG(8,%r9)\
-  VPREFETCHG(21,%r9)\
+  VPREFETCHG(9,%r9)\
-  VPREFETCHG(22,%r9)\
+  VPREFETCHG(10,%r9)\
-  VPREFETCHG(23,%r9));
+  VPREFETCHG(11,%r9));
-
+*/
 #define PERMUTE_DIR0 __asm__ ( 	\
  VPERM0(Chi_00,Chi_00)	\
  VPERM0(Chi_01,Chi_01)	\
@@ -612,8 +613,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  LOAD64(%r8,ptr)						\
  LOAD64(%r9,pf)						\
 	   __asm__ (						\
-	   VPREFETCH2(9,%r8)				   \
+	   VPREFETCH2(9,%r8)				   VPREFETCH2(10,%r8)					   \
 	   VPREFETCH2(10,%r8)					   \
 	   VPREFETCH2(11,%r8)					   \
 	   VPREFETCH2(12,%r8)					   \
 	   VPREFETCH2(13,%r8)					   \
@@ -49,16 +49,25 @@ LebesgueOrder::LebesgueOrder(GridBase *_grid)
 {
  grid = _grid;
  if ( Block[0]==0) ZGraph();
  else if ( Block[1]==0) NoBlocking();
  else CartesianBlocking();
 }
 void LebesgueOrder::NoBlocking(void) 
 {
  std::cout<<GridLogDebug<<"Lexicographic : no cache blocking"<<std::endl;
  _LebesgueReorder.resize(0);
  for ( int s = 0 ; s!= grid->oSites();s++){
    _LebesgueReorder.push_back(s);
  }
 }
 void LebesgueOrder::CartesianBlocking(void) 
 {
  _LebesgueReorder.resize(0);
-  std::cout << GridLogMessage << " CartesianBlocking ";
+  std::cout << GridLogDebug << " CartesianBlocking ";
-  for(int d=0;d<Block.size();d++) std::cout <<Block[d]<<" ";
+  //    for(int d=0;d<Block.size();d++) std::cout <<Block[d]<<" ";
-  std::cout<<std::endl; 
+  //    std::cout<<std::endl; 
  IndexInteger ND = grid->_ndimension;
@@ -116,7 +125,8 @@ void LebesgueOrder::IterateI(int ND,
 void LebesgueOrder::ZGraph(void) 
 {
  _LebesgueReorder.resize(0);
-  
+
  std::cout << GridLogDebug << " Lebesgue order "<<std::endl;
  // Align up dimensions to power of two.
  const IndexInteger one=1;
@@ -59,6 +59,7 @@ namespace Grid {
    // Cartesian stencil blocking strategy
    /////////////////////////////////
    static std::vector<int> Block;
    void NoBlocking(void);
    void CartesianBlocking(void);
    void IterateO(int ND,int dim,
 		  std::vector<IndexInteger> & xo,