diff --git a/benchmarks/Benchmark_zmm.cc b/benchmarks/Benchmark_zmm.cc
new file mode 100644
index 00000000..f7bc8e8e
--- /dev/null
+++ b/benchmarks/Benchmark_zmm.cc
@@ -0,0 +1,174 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./benchmarks/Benchmark_zmm.cc
+
+    Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid.h>
+#include <PerfCount.h>
+#include <simd/Intel512wilson.h>
+
+
+using namespace Grid;
+using namespace Grid::QCD;
+
+void ZmulF(void *ptr1,void *ptr2,void *ptr3);
+void Zmul(void *ptr1,void *ptr2,void *ptr3);
+void WilsonDslashAvx512(void *ptr1,void *ptr2,void *ptr3);
+void WilsonDslashAvx512F(void *ptr1,void *ptr2,void *ptr3);
+void TimesIAvx512F(void *ptr1,void *ptr3);
+void TimesIAvx512(void *ptr1,void *ptr3);
+void TimesMinusIAvx512F(void *ptr1,void *ptr3);
+void TimesMinusIAvx512(void *ptr1,void *ptr3);
+// NOTE(review): none of the eight kernels declared above is called in this file; presumably defined elsewhere -- confirm or drop.
+// Forward declaration of the per-lattice timing routine defined after main().
+int bench(std::ofstream &os, std::vector<int> &latt4,int Ls);
+
+// Sweeps L=4..30 (step 2), m in {1,2} and Ls in {8,16}; bench() appends one row per case to zmm.dat.
+int main(int argc,char **argv) {
+  Grid_init(&argc,&argv);
+  std::ofstream os("zmm.dat");
+  os << "#V Ls Lxy Lzt C++ Asm OMP L1 " <<std::endl;  // column names for the rows bench() writes
+  for(int L=4;L<32;L+=2){
+    for(int m=1;m<=2;m++){
+      for(int Ls=8;Ls<=16;Ls+=8){
+	std::vector<int> latt4({L,L,m*L,m*L});  // lattice extents: L in x,y and m*L in z,t
+	bench(os,latt4,Ls);
+      }
+    }
+  }
+  Grid_finalize();  // pairs with Grid_init above
+}
+
+// Times the even-odd domain-wall Dhop for one lattice and appends a result row to os.
+//   os    - receives "volume Ls latt4[0] latt4[2] mfc mfa mfo mfl1" on one line
+//   latt4 - the four 4d lattice extents;  Ls - extent of the fifth dimension
+// Always returns 0.
+int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
+{
+  // NOTE(review): these grid pointers are never released in this function -- confirm ownership.
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+
+  LatticeFermion src (FGrid);
+  LatticeFermion srce(FrbGrid);
+
+  LatticeFermion resulto(FrbGrid); resulto=zero;
+  LatticeFermion resulta(FrbGrid); resulta=zero;
+  LatticeFermion junk(FrbGrid); junk=zero;
+  LatticeFermion diff(FrbGrid); 
+  LatticeGaugeField Umu(UGrid);
+
+  double mfc, mfa, mfo, mfl1;
+
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
+  random(RNG5,src);
+#if 1
+  random(RNG4,Umu);
+#else
+  int mmu=2;
+  std::vector<LatticeColourMatrix> U(4,UGrid);
+  for(int mu=0;mu<Nd;mu++){
+    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
+    if ( mu!=mmu ) U[mu] = zero;
+    if ( mu==mmu ) U[mu] = 1.0;
+    PokeIndex<LorentzIndex>(Umu,U[mu],mu);
+  }
+#endif
+  pickCheckerboard(Even,srce,src);
+
+  RealD mass=0.1;
+  RealD M5  =1.8;
+  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+
+  // AsmOptDslash is a static flag that the asm timing below leaves at 1; clear it so repeat calls of bench() time the non-asm path here.
+  QCD::WilsonFermion5DStatic::AsmOptDslash=0;
+  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
+  int ncall=50;
+  double t0=usecond();
+  for(int i=0;i<ncall;i++){
+    Dw.DhopOE(srce,resulto,0);
+  }
+  double t1=usecond();
+
+  double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+  double flops=1344*volume/2;
+
+  mfc = flops*ncall/(t1-t0);
+  std::cout<<GridLogMessage << "Called C++ Dw"<< " mflop/s =   "<< mfc<<std::endl;
+
+  QCD::WilsonFermion5DStatic::AsmOptDslash=1;
+  t0=usecond();
+  for(int i=0;i<ncall;i++){
+    Dw.DhopOE(srce,resulta,0);
+  }
+  t1=usecond();
+  mfa = flops*ncall/(t1-t0);
+  std::cout<<GridLogMessage << "Called ASM Dw"<< " mflop/s =   "<< mfa<<std::endl;
+
+  // The two *bench kernels repeat their sweep internally; nbench must equal that repeat count (100).
+  const int nbench=100;
+  t0=usecond();
+  Dw.DhopInternalOMPbench(Dw.StencilOdd,Dw.LebesgueEvenOdd,Dw.UmuEven,srce,junk,DaggerNo);
+  t1=usecond();
+  mfo = flops*nbench/(t1-t0);
+  std::cout<<GridLogMessage << "Called ASM-OMP Dw"<< " mflop/s =   "<< mfo<<std::endl;
+
+  t0=usecond();
+  Dw.DhopInternalL1bench(Dw.StencilOdd,Dw.LebesgueEvenOdd,Dw.UmuEven,srce,junk,DaggerNo);
+  t1=usecond();
+  mfl1= flops*nbench/(t1-t0);
+  std::cout<<GridLogMessage << "Called ASM-L1 Dw"<< " mflop/s =   "<< mfl1<<std::endl;
+
+  os << latt4[0]*latt4[1]*latt4[2]*latt4[3]<< " "<<Ls<<" "<< latt4[0] <<" " <<latt4[2]<< " "
+     << mfc<<" "
+     << mfa<<" "
+     << mfo<<" "
+     << mfl1<<std::endl;
+
+  // For each counter type: one uncounted DhopOE call, then one bracketed by Start/Stop and reported.
+  for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
+    Dw.DhopOE(srce,resulta,0);
+    PerformanceCounter Counter(i);
+    Counter.Start();
+    Dw.DhopOE(srce,resulta,0);
+    Counter.Stop();
+    Counter.Report();
+  }
+  //resulta = (-0.5) * resulta;
+
+  // Report norm2 of the difference between the non-asm result and the asm result.
+  diff = resulto-resulta;
+  std::cout<<GridLogMessage << "diff "<< norm2(diff)<<std::endl;
+  std::cout<<std::endl;
+  return 0;
+}
+
+
diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc
index 581a3fc5..9874031d 100644
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -450,7 +450,7 @@ void WilsonFermion5D<Impl>::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder
 
 #pragma omp parallel 
   {
-  for(int jjj=0;jjj<1000;jjj++){
+  for(int jjj=0;jjj<100;jjj++){
 #pragma omp barrier
   dslashtime -=usecond();
   if ( dag == DaggerYes ) {
@@ -538,6 +538,124 @@ void WilsonFermion5D<Impl>::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder
   alltime+=usecond();
 }
 
+
+// Benchmark variant of the Dhop kernel dispatch: after one halo exchange it repeats the
+// site loops 100 times, always passing sU=0 and sF=s, so only the fifth-dimension index
+// varies between kernel calls.  Kernel choice follows dag, HandOptDslash and AsmOptDslash.
+// Accumulates into alltime, commtime, jointime and dslashtime; the `lo` argument is unused.
+template<class Impl>
+void WilsonFermion5D<Impl>::DhopInternalL1bench(StencilImpl & st, LebesgueOrder &lo,
+						DoubledGaugeField & U,
+						const FermionField &in, FermionField &out,int dag)
+{
+  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
+  alltime-=usecond();
+  Compressor compressor(dag);
+
+  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
+  int threads = GridThread::GetThreads();
+  int HT      = GridThread::GetHyperThreads();
+  int cores   = GridThread::GetCores();
+  int nwork = U._grid->oSites();
+  
+  commtime -=usecond();
+  auto handle = st.HaloExchangeBegin(in,compressor);
+  st.HaloExchangeComplete(handle);
+  commtime +=usecond();
+
+  jointime -=usecond();
+  jointime +=usecond();
+  
+  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
+  // Note loop ordering and data layout.
+  // Designed to create 
+  // - per thread reuse in L1 cache for U
+  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
+
+  // Start the dslash timer once, outside the parallel region: doing it per thread and per
+  // repetition wrote the shared member unsynchronised and never balanced the single stop below.
+  dslashtime -=usecond();
+#pragma omp parallel 
+  {
+  for(int jjj=0;jjj<100;jjj++){
+#pragma omp barrier
+  if ( dag == DaggerYes ) {
+    if( this->HandOptDslash ) {
+#pragma omp for
+      for(int ss=0;ss<U._grid->oSites();ss++){
+	int sU=0;  // ss only sets the repeat count; sU stays 0
+	for(int s=0;s<Ls;s++){
+	  int sF = s+Ls*sU;
+	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
+	}
+      }
+    } else { 
+#pragma omp for
+      for(int ss=0;ss<U._grid->oSites();ss++){
+	int sU=0;
+	for(int s=0;s<Ls;s++){
+	  int sF = s+Ls*sU;
+	  Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
+	}
+      }
+    }
+  } else {
+    if( this->AsmOptDslash ) {
+      //      for(int i=0;i<1;i++){
+      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
+      //	PerformanceCounter Counter(i);
+      //	Counter.Start();
+
+#pragma omp for
+      for(int t=0;t<threads;t++){
+	// Split the flat thread index into a core index and a hyperthread index.
+	int hyperthread = t%HT;
+	int core        = t/HT;
+
+        int sswork, swork,soff,ssoff,  sU,sF;
+	// GetWork presumably yields this worker's (count, offset) share -- confirm; ssoff is unused since sU stays 0.
+	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
+	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
+
+	for(int ss=0;ss<sswork;ss++){
+	  for(int s=soff;s<soff+swork;s++){
+
+	    sU=0;
+	    sF = s+Ls*sU;
+	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
+	  }
+	}
+      }
+      //      Counter.Stop();
+      //      Counter.Report();
+      //      }
+    } else if( this->HandOptDslash ) {
+#pragma omp for
+      for(int ss=0;ss<U._grid->oSites();ss++){
+	int sU=0;
+	for(int s=0;s<Ls;s++){
+	  int sF = s+Ls*sU;
+	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
+	}
+      }
+    } else { 
+#pragma omp for
+      for(int ss=0;ss<U._grid->oSites();ss++){
+	int sU=0;
+	for(int s=0;s<Ls;s++){
+	  int sF = s+Ls*sU; 
+	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
+	}
+      }
+    }
+  }
+  }
+  }
+  dslashtime +=usecond();
+  alltime+=usecond();
+}
+
+
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo,
 						     DoubledGaugeField & U,