From 805255627593de52c8e68fe994e0a7ee9481faaa Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 31 Mar 2016 14:51:32 +0100
Subject: [PATCH 1/9] Cleaning up the single/double kernel implementation
 switch

---
 lib/qcd/action/fermion/WilsonKernelsAsm.cc |  97 +--------------
 lib/simd/Intel512double.h                  | 135 +++++++++++++++++++++
 lib/simd/Intel512single.h                  | 135 +++++++++++++++++++++
 lib/simd/Intel512wilson.h                  |  17 +--
 4 files changed, 281 insertions(+), 103 deletions(-)
 create mode 100644 lib/simd/Intel512double.h
 create mode 100644 lib/simd/Intel512single.h
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
index b5f016f5..bdda199f 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -32,81 +32,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 
 #include <simd/Intel512wilson.h>
 
-#undef VLOAD
-#undef VSTORE
-#undef VMUL
-#undef VMADD
-#undef ZEND
-#undef ZLOAD
-#undef ZMUL
-#undef ZMADD
-#undef VZERO
-#undef VTIMESI
-#undef VTIMESMINUSI
-#undef VMOVIDUP 
-#undef VMOVRDUP 
-#undef VMADDSUB
-#undef VSHUF
+#include <simd/Intel512single.h>
 
-#define VZERO(A)                  VZEROf(A)
-#define VMOV(A,B)                 VMOVf(A,B)
-#define VLOAD(OFF,PTR,DEST)       VLOADf(OFF,PTR,DEST)
-#define VSTORE(OFF,PTR,SRC)       VSTOREf(OFF,PTR,SRC)
-
-#define VADD(A,B,C)               VADDf(A,B,C)
-#define VSUB(A,B,C)               VSUBf(A,B,C)
-#define VMUL(Uri,Uir,Chi)  VMULf(Uri,Uir,Chi)
-#define VMADD(Uri,Uir,Chi) VMADDf(Uri,Uir,Chi)
-
-#define VTIMESI(A,B,C)            VTIMESIf(A,B,C)
-#define VTIMESMINUSI(A,B,C)       VTIMESMINUSIf(A,B,C)
-#define VACCTIMESI(A,B,C)         VACCTIMESIf(A,B,C)
-#define VACCTIMESMINUSI(A,B,C)    VACCTIMESMINUSIf(A,B,C)
-
-#define VTIMESI0(A,B,C)            VTIMESI0f(A,B,C)
-#define VTIMESMINUSI0(A,B,C)       VTIMESMINUSI0f(A,B,C)
-#define VACCTIMESI0(A,B,C)         VACCTIMESI0f(A,B,C)
-#define VACCTIMESMINUSI0(A,B,C)    VACCTIMESMINUSI0f(A,B,C)
-
-#define VTIMESI1(A,B,C)            VTIMESI1f(A,B,C)
-#define VTIMESMINUSI1(A,B,C)       VTIMESMINUSI1f(A,B,C)
-#define VACCTIMESI1(A,B,C)         VACCTIMESI1f(A,B,C)
-#define VACCTIMESMINUSI1(A,B,C)    VACCTIMESMINUSI1f(A,B,C)
-
-#define VTIMESI2(A,B,C)            VTIMESI2f(A,B,C)
-#define VTIMESMINUSI2(A,B,C)       VTIMESMINUSI2f(A,B,C)
-#define VACCTIMESI2(A,B,C)         VACCTIMESI2f(A,B,C)
-#define VACCTIMESMINUSI2(A,B,C)    VACCTIMESMINUSI2f(A,B,C)
-
-#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMf(A,ACC,O,P)
-#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMf(A,ACC,O,P)
-#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
-#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
-
-#define VPERM0(A,B)               VPERM0f(A,B)
-#define VPERM1(A,B)               VPERM1f(A,B)
-#define VPERM2(A,B)               VPERM2f(A,B)
-#define VPERM3(A,B)               VPERM3f(A,B)
-#define VSHUFMEM(OFF,A,DEST)      VSHUFMEMf(OFF,A,DEST)
-
-#define ZEND1(A,B,C)               ZEND1f(A,B,C)
-#define ZEND2(A,B,C)               ZEND2f(A,B,C)
-#define ZLOAD(A,B,C,D)            ZLOADf(A,B,C,D)
-#define ZMUL(A,B,C,D,E)           ZMULf(A,B,C,D,E)
-#define ZMADD(A,B,C,D,E)          ZMADDf(A,B,C,D,E)
-
-#define ZMUL(A,B,C,D,E)           ZMULf(A,B,C,D,E)
-#define ZMADD(A,B,C,D,E)          ZMADDf(A,B,C,D,E)
-
-#define VADDMEM(O,A,B,C)            VADDMEMf(O,A,B,C)
-#define VSUBMEM(O,A,B,C)            VSUBMEMf(O,A,B,C)
-#define VMOVIDUP(A,B,C) VMOVIDUPf(A,B,C)
-#define VMOVRDUP(A,B,C) VMOVRDUPf(A,B,C)
-#define VMADDSUB(A,B,accum) VMADDSUBf(A,B,accum) 
-#define VSHUF(A,B) VSHUFf(A,B)
-
-#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
-#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 
 namespace Grid {
 namespace QCD {
@@ -136,26 +63,6 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
 
   SE=st.GetEntry(ptype,Xm,ss);
 
-#if 0
-  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
-  else               pf=(void *)&pbuf[SE->_offset];
-
-  LOAD64(%r9,pf);
-  __asm__( 
-	  VPREFETCH(0,%r9)
-	  VPREFETCH(1,%r9)
-	  VPREFETCH(2,%r9)
-	  VPREFETCH(3,%r9)
-	  VPREFETCH(4,%r9)
-	  VPREFETCH(5,%r9)
-	  VPREFETCH(6,%r9)
-	  VPREFETCH(7,%r9)
-	  VPREFETCH(8,%r9)
-	  VPREFETCH(9,%r9)
-	  VPREFETCH(10,%r9)
-	  VPREFETCH(11,%r9) );
-#endif
-
   // Xm
   offset = SE->_offset;
   local  = SE->_is_local;
@@ -322,8 +229,6 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
   offset = SE->_offset;
   local  = SE->_is_local;
     
-  //  PREFETCH_R(A);
-
   // Prefetch
   SE=st.GetEntry(ptype,Xm,(ss+1)%osites);
   if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
diff --git a/lib/simd/Intel512double.h b/lib/simd/Intel512double.h
new file mode 100644
index 00000000..a2e9d38f
--- /dev/null
+++ b/lib/simd/Intel512double.h
@@ -0,0 +1,135 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/simd/Intel512double.h
+
+    Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+// No include guard on purpose: this header may be included repeatedly, because the macros are #undef'd before being redefined.
+#undef VZERO
+#undef VMOV
+#undef VLOAD
+#undef VSTORE
+#define VZERO(A)                  VZEROd(A)
+#define VMOV(A,B)                 VMOVd(A,B)
+#define VLOAD(OFF,PTR,DEST)       VLOADd(OFF,PTR,DEST)
+#define VSTORE(OFF,PTR,SRC)       VSTOREd(OFF,PTR,SRC)
+
+#undef VADD
+#undef VSUB
+#undef VMUL
+#undef VMADD
+#define VADD(A,B,C)               VADDd(A,B,C)
+#define VSUB(A,B,C)               VSUBd(A,B,C)
+#define VMUL(Uri,Uir,Chi)         VMULd(Uri,Uir,Chi)
+#define VMADD(Uri,Uir,Chi)        VMADDd(Uri,Uir,Chi)
+
+
+#undef VTIMESI
+#undef VTIMESI0 
+#undef VTIMESI1
+#undef VTIMESI2 
+#define VTIMESI(A,B,C)                 VTIMESId(A,B,C)
+#define VTIMESI0(A,B,C)                VTIMESI0d(A,B,C)
+#define VTIMESI1(A,B,C)                VTIMESI1d(A,B,C)
+#define VTIMESI2(A,B,C)                VTIMESI2d(A,B,C)
+
+#undef VTIMESMINUSI
+#undef VTIMESMINUSI0
+#undef VTIMESMINUSI1
+#undef VTIMESMINUSI2
+#define VTIMESMINUSI(A,B,C)            VTIMESMINUSId(A,B,C)
+#define VTIMESMINUSI0(A,B,C)           VTIMESMINUSI0d(A,B,C)
+#define VTIMESMINUSI1(A,B,C)           VTIMESMINUSI1d(A,B,C)
+#define VTIMESMINUSI2(A,B,C)           VTIMESMINUSI2d(A,B,C)
+
+#undef VACCTIMESI
+#undef VACCTIMESI0
+#undef VACCTIMESI1
+#undef VACCTIMESI2
+#define VACCTIMESI(A,B,C)         VACCTIMESId(A,B,C)
+#define VACCTIMESI0(A,B,C)             VACCTIMESI0d(A,B,C)
+#define VACCTIMESI1(A,B,C)             VACCTIMESI1d(A,B,C)
+#define VACCTIMESI2(A,B,C)             VACCTIMESI2d(A,B,C)
+
+#undef VACCTIMESMINUSI
+#undef VACCTIMESMINUSI0
+#undef VACCTIMESMINUSI1
+#undef VACCTIMESMINUSI2
+#define VACCTIMESMINUSI(A,B,C)    VACCTIMESMINUSId(A,B,C)
+#define VACCTIMESMINUSI0(A,B,C)        VACCTIMESMINUSI0d(A,B,C)
+#define VACCTIMESMINUSI1(A,B,C)        VACCTIMESMINUSI1d(A,B,C)
+#define VACCTIMESMINUSI2(A,B,C)        VACCTIMESMINUSI2d(A,B,C)
+
+#undef VACCTIMESI1MEM
+#undef VACCTIMESI2MEM
+#define VACCTIMESI1MEM(A,ACC,O,P)      VACCTIMESI1MEMd(A,ACC,O,P)
+#define VACCTIMESI2MEM(A,ACC,O,P)      VACCTIMESI2MEMd(A,ACC,O,P)
+
+#undef VACCTIMESMINUSI1MEM
+#undef VACCTIMESMINUSI2MEM
+#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMd(A,ACC,O,P)
+#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMd(A,ACC,O,P)
+
+#undef VPERM0
+#undef VPERM1
+#undef VPERM2
+#undef VPERM3
+#define VPERM0(A,B)               VPERM0d(A,B)
+#define VPERM1(A,B)               VPERM1d(A,B)
+#define VPERM2(A,B)               VPERM2d(A,B)
+#define VPERM3(A,B)               VPERM3d(A,B)
+
+#undef VSHUFMEM
+#undef VADDMEM
+#undef VSUBMEM
+#define VSHUFMEM(OFF,A,DEST)      VSHUFMEMd(OFF,A,DEST)
+#define VADDMEM(O,A,B,C)                                 VADDMEMd(O,A,B,C)
+#define VSUBMEM(O,A,B,C)                                 VSUBMEMd(O,A,B,C)
+
+#undef VMOVIDUP
+#undef VMOVRDUP
+#undef VMADDSBUB
+#undef VSHUF
+#define VMOVIDUP(A,B,C)                                  VMOVIDUPd(A,B,C)
+#define VMOVRDUP(A,B,C)                                  VMOVRDUPd(A,B,C)
+#define VMADDSUB(A,B,accum)                              VMADDSUBd(A,B,accum) 
+#define VSHUD(A,B)                                       VSHUFd(A,B)
+
+
+#undef ZEND1
+#undef ZEND2
+#undef ZLOAD
+#undef ZMUL
+#undef ZMADD
+#undef ZMULMEM2SP
+#undef ZMADDMEM2SP
+
+#define ZEND1(A,B,C)                                     ZEND1d(A,B,C)
+#define ZEND2(A,B,C)                                     ZEND2d(A,B,C)
+#define ZLOAD(A,B,C,D)                                   ZLOADd(A,B,C,D)
+#define ZMUL(A,B,C,D,E)                                  ZMULd(A,B,C,D,E)
+#define ZMADD(A,B,C,D,E)                                 ZMADDd(A,B,C,D,E)
+#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
+#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
+
diff --git a/lib/simd/Intel512single.h b/lib/simd/Intel512single.h
new file mode 100644
index 00000000..c94a7852
--- /dev/null
+++ b/lib/simd/Intel512single.h
@@ -0,0 +1,135 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/simd/Intel512single.h
+
+    Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+// No include guard on purpose: this header may be included repeatedly, because the macros are #undef'd before being redefined.
+#undef VZERO
+#undef VMOV
+#undef VLOAD
+#undef VSTORE
+#define VZERO(A)                  VZEROf(A)
+#define VMOV(A,B)                 VMOVf(A,B)
+#define VLOAD(OFF,PTR,DEST)       VLOADf(OFF,PTR,DEST)
+#define VSTORE(OFF,PTR,SRC)       VSTOREf(OFF,PTR,SRC)
+
+#undef VADD
+#undef VSUB
+#undef VMUL
+#undef VMADD
+#define VADD(A,B,C)               VADDf(A,B,C)
+#define VSUB(A,B,C)               VSUBf(A,B,C)
+#define VMUL(Uri,Uir,Chi)         VMULf(Uri,Uir,Chi)
+#define VMADD(Uri,Uir,Chi)        VMADDf(Uri,Uir,Chi)
+
+
+#undef VTIMESI
+#undef VTIMESI0 
+#undef VTIMESI1
+#undef VTIMESI2 
+#define VTIMESI(A,B,C)                 VTIMESIf(A,B,C)
+#define VTIMESI0(A,B,C)                VTIMESI0f(A,B,C)
+#define VTIMESI1(A,B,C)                VTIMESI1f(A,B,C)
+#define VTIMESI2(A,B,C)                VTIMESI2f(A,B,C)
+
+#undef VTIMESMINUSI
+#undef VTIMESMINUSI0
+#undef VTIMESMINUSI1
+#undef VTIMESMINUSI2
+#define VTIMESMINUSI(A,B,C)            VTIMESMINUSIf(A,B,C)
+#define VTIMESMINUSI0(A,B,C)           VTIMESMINUSI0f(A,B,C)
+#define VTIMESMINUSI1(A,B,C)           VTIMESMINUSI1f(A,B,C)
+#define VTIMESMINUSI2(A,B,C)           VTIMESMINUSI2f(A,B,C)
+
+#undef VACCTIMESI
+#undef VACCTIMESI0
+#undef VACCTIMESI1
+#undef VACCTIMESI2
+#define VACCTIMESI(A,B,C)         VACCTIMESIf(A,B,C)
+#define VACCTIMESI0(A,B,C)             VACCTIMESI0f(A,B,C)
+#define VACCTIMESI1(A,B,C)             VACCTIMESI1f(A,B,C)
+#define VACCTIMESI2(A,B,C)             VACCTIMESI2f(A,B,C)
+
+#undef VACCTIMESMINUSI
+#undef VACCTIMESMINUSI0
+#undef VACCTIMESMINUSI1
+#undef VACCTIMESMINUSI2
+#define VACCTIMESMINUSI(A,B,C)    VACCTIMESMINUSIf(A,B,C)
+#define VACCTIMESMINUSI0(A,B,C)        VACCTIMESMINUSI0f(A,B,C)
+#define VACCTIMESMINUSI1(A,B,C)        VACCTIMESMINUSI1f(A,B,C)
+#define VACCTIMESMINUSI2(A,B,C)        VACCTIMESMINUSI2f(A,B,C)
+
+#undef VACCTIMESI1MEM
+#undef VACCTIMESI2MEM
+#define VACCTIMESI1MEM(A,ACC,O,P)      VACCTIMESI1MEMf(A,ACC,O,P)
+#define VACCTIMESI2MEM(A,ACC,O,P)      VACCTIMESI2MEMf(A,ACC,O,P)
+
+#undef VACCTIMESMINUSI1MEM
+#undef VACCTIMESMINUSI2MEM
+#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
+#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
+
+#undef VPERM0
+#undef VPERM1
+#undef VPERM2
+#undef VPERM3
+#define VPERM0(A,B)               VPERM0f(A,B)
+#define VPERM1(A,B)               VPERM1f(A,B)
+#define VPERM2(A,B)               VPERM2f(A,B)
+#define VPERM3(A,B)               VPERM3f(A,B)
+
+#undef VSHUFMEM
+#undef VADDMEM
+#undef VSUBMEM
+#define VSHUFMEM(OFF,A,DEST)      VSHUFMEMf(OFF,A,DEST)
+#define VADDMEM(O,A,B,C)                                 VADDMEMf(O,A,B,C)
+#define VSUBMEM(O,A,B,C)                                 VSUBMEMf(O,A,B,C)
+
+#undef VMOVIDUP
+#undef VMOVRDUP
+#undef VMADDSBUB
+#undef VSHUF
+#define VMOVIDUP(A,B,C)                                  VMOVIDUPf(A,B,C)
+#define VMOVRDUP(A,B,C)                                  VMOVRDUPf(A,B,C)
+#define VMADDSUB(A,B,accum)                              VMADDSUBf(A,B,accum) 
+#define VSHUF(A,B)                                       VSHUFf(A,B)
+
+
+#undef ZEND1
+#undef ZEND2
+#undef ZLOAD
+#undef ZMUL
+#undef ZMADD
+#undef ZMULMEM2SP
+#undef ZMADDMEM2SP
+
+#define ZEND1(A,B,C)                                     ZEND1f(A,B,C)
+#define ZEND2(A,B,C)                                     ZEND2f(A,B,C)
+#define ZLOAD(A,B,C,D)                                   ZLOADf(A,B,C,D)
+#define ZMUL(A,B,C,D,E)                                  ZMULf(A,B,C,D,E)
+#define ZMADD(A,B,C,D,E)                                 ZMADDf(A,B,C,D,E)
+#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
+#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
+
diff --git a/lib/simd/Intel512wilson.h b/lib/simd/Intel512wilson.h
index 2d0e1e35..64087ea6 100644
--- a/lib/simd/Intel512wilson.h
+++ b/lib/simd/Intel512wilson.h
@@ -201,7 +201,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 // Enables to lift ALL loads earlier by a few cycles and alleviate OoO pressure if needed.
 // KNL is DUAL issue for FP, and lifting these loads is potentially important.
 // Need detailed profile data to be sure.
-
+#if 0
 #define PREFETCH_U(A) \
   LOAD64(%r8,&U._odata[sU](A)) \
   __asm__ (		       \
@@ -230,7 +230,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   VPREFETCHW(9,%r8)	       \
   VPREFETCHW(10,%r8)	       \
   VPREFETCHW(11,%r8)	       );
-
+#endif
  
 #define MULT_2SPIN_DIR(A) MULT_2SPIN(&U._odata[sU](A))
 
@@ -244,6 +244,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN_PFZM(&U._odata[sU](A),p)
 #define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN_PFTM(&U._odata[sU](A),p)
 
+#if 0
 #define MULT_2SPIN_UNOPT(ptr)				\
 	   LOAD64(%r8,ptr)			\
   __asm__ (					\
@@ -289,6 +290,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 	   ZEND2(UChi_11,Z3,Chi_10)			\
 	   ZEND2(UChi_02,Z4,Chi_02)			\
 	   ZEND2(UChi_12,Z5,Chi_12)	     );
+#endif
 
 #define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr)
 #define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr)
@@ -299,10 +301,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr)
 #define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr)
 
-#define MULT_2SPINa(ptr)        MULT_2SPIN_PF(ptr,ptr,VPREFETCHG);
+// MULT_2SPINa(ptr)        MULT_2SPIN_PF(ptr,ptr,VPREFETCHG);
 #define MULT_2SPIN(ptr) MULT_ADDSUB_2SPIN(ptr);
 
-
+#if 0
 #define MULT_2SPIN_PF(ptr,pf,VPF)			\
 	   LOAD64(%r8,ptr)			\
 	   LOAD64(%r9,pf)			\
@@ -343,8 +345,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 	   ZEND2(UChi_02,Z4,Chi_02)			\
 	   VPF(11,%r9)						\
 	   ZEND2(UChi_12,Z5,Chi_12)	     );
+#endif
 
-
+#if 0 
 #define MULT_2SPIN_PFNONE(ptr,pf,VPF)			\
 	   LOAD64(%r8,ptr)			\
 	   LOAD64(%r9,pf)			\
@@ -364,7 +367,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 	   VPF(9,%r9)						\
 	   VPF(10,%r9)						\
 	   VPF(11,%r9)						);
-
+#endif
 
 // Pretty much Perfectly Pipelined
 
@@ -720,7 +723,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   VSUB(UChi_11,result_31,result_31)\
   VSUB(UChi_12,result_32,result_32) );
 
-#define PREFETCH_CHIMU(A) 
+//define PREFETCH_CHIMU(A) 
 
 #define PERMUTE_DIR0 __asm__ ( 	\
   VPERM0(Chi_00,Chi_00)	\

From f7b1060aedb8a5a264897cb2a6d9119a3e96206a Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 31 Mar 2016 14:52:37 +0100
Subject: [PATCH 2/9] Use headers to clear macros and sub precision

---
 tests/Test_zmm.cc | 78 ++---------------------------------------------
 1 file changed, 2 insertions(+), 76 deletions(-)

diff --git a/tests/Test_zmm.cc b/tests/Test_zmm.cc
index 0417ce8d..90d92c46 100644
--- a/tests/Test_zmm.cc
+++ b/tests/Test_zmm.cc
@@ -252,39 +252,7 @@ int main(int argc,char **argv)
 #endif
 }
 
-#undef VLOAD
-#undef VSTORE
-#undef VMUL
-#undef VMADD
-#undef ZEND1
-#undef ZEND2
-#undef ZLOAD
-#undef ZMUL
-#undef ZMADD
-#undef VMOVIDUP 
-#undef VMOVRDUP 
-#undef VMADDSUB
-#undef VSHUF
-
-#define VZERO(A) VZEROd(A)
-#define VTIMESI(A,B,C) VTIMESId(A,B,C)
-#define VTIMESMINUSI(A,B,C) VTIMESMINUSId(A,B,C)
-
-#define VLOAD(OFF,PTR,DEST)       VLOADd(OFF,PTR,DEST)
-#define VSTORE(OFF,PTR,SRC)       VSTOREd(OFF,PTR,SRC)
-#define VMUL(Uri,Uir,Chi)         VMULd(Uri,Uir,Chi)
-#define VMADD(Uri,Uir,Chi)        VMADDd(Uri,Uir,Chi)
-#define ZEND1(A,B,C)              ZEND1d(A,B,C)
-#define ZEND2(A,B,C)              ZEND2d(A,B,C)
-#define ZLOAD(A,B,C,D)            ZLOADd(A,B,C,D)
-#define ZMUL(A,B,C,D,E)           ZMULd(A,B,C,D,E)
-#define ZMADD(A,B,C,D,E)          ZMADDd(A,B,C,D,E)
-#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
-#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
-#define VMOVIDUP(A,B,C) VMOVIDUPd(A,B,C)
-#define VMOVRDUP(A,B,C) VMOVRDUPd(A,B,C)
-#define VMADDSUB(A,B,accum) VMADDSUBd(A,B,accum) 
-#define VSHUF(A,B) VSHUFd(A,B)
+#include <simd/Intel512double.h>
 
 #define zz Z0
 
@@ -415,49 +383,7 @@ void WilsonDslashAvx512(void *ptr1,void *ptr2,void *ptr3)
 
 }
 
-#undef VLOAD
-#undef VSTORE
-#undef VMUL
-#undef VMADD
-#undef ZEND1
-#undef ZEND2
-#undef ZLOAD
-#undef ZMUL
-#undef ZMADD
-#undef VZERO
-#undef VTIMESI
-#undef VTIMESI0
-#undef VTIMESI1
-#undef VTIMESI2
-#undef VTIMESMINUSI
-#undef ZMULMEM2SP
-#undef ZMADDMEM2SP
-#undef VMOVIDUP 
-#undef VMOVRDUP 
-#undef VMADDSUB
-#undef VSHUF
-
-#define VZERO(A) VZEROf(A)
-#define VMOV(A,B) VMOVf(A,B)
-#define VADD(A,B,C) VADDf(A,B,C)
-#define VSUB(A,B,C) VSUBf(A,B,C)
-#define VTIMESI(A,B,C) VTIMESIf(A,B,C)
-#define VTIMESMINUSI(A,B,C) VTIMESMINUSIf(A,B,C)
-#define VLOAD(OFF,PTR,DEST)       VLOADf(OFF,PTR,DEST)
-#define VSTORE(OFF,PTR,SRC)       VSTOREf(OFF,PTR,SRC)
-#define VMUL(Uri,Uir,Chi)  VMULf(Uri,Uir,Chi)
-#define VMADD(Uri,Uir,Chi) VMADDf(Uri,Uir,Chi)
-#define ZEND1(A,B,C)               ZEND1f(A,B,C)
-#define ZEND2(A,B,C)               ZEND2f(A,B,C)
-#define ZLOAD(A,B,C,D)            ZLOADf(A,B,C,D)
-#define ZMUL(A,B,C,D,E)           ZMULf(A,B,C,D,E)
-#define ZMADD(A,B,C,D,E)          ZMADDf(A,B,C,D,E)
-#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
-#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
-#define VMOVIDUP(A,B,C) VMOVIDUPf(A,B,C)
-#define VMOVRDUP(A,B,C) VMOVRDUPf(A,B,C)
-#define VMADDSUB(A,B,accum) VMADDSUBf(A,B,accum) 
-#define VSHUF(A,B) VSHUFf(A,B)
+#include <simd/Intel512single.h>
 
 void ZmulF(void *ptr1,void *ptr2,void *ptr3)
 {

From f473ef75910c4c1950c15d1cc0c77b354b53031f Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 31 Mar 2016 07:47:42 -0700
Subject: [PATCH 3/9] Fixing the compile

---
 lib/simd/Intel512double.h | 4 ++--
 lib/simd/Intel512single.h | 2 +-
 lib/simd/Intel512wilson.h | 3 ++-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/lib/simd/Intel512double.h b/lib/simd/Intel512double.h
index a2e9d38f..2b2b9099 100644
--- a/lib/simd/Intel512double.h
+++ b/lib/simd/Intel512double.h
@@ -109,12 +109,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 
 #undef VMOVIDUP
 #undef VMOVRDUP
-#undef VMADDSBUB
+#undef VMADDSUB
 #undef VSHUF
 #define VMOVIDUP(A,B,C)                                  VMOVIDUPd(A,B,C)
 #define VMOVRDUP(A,B,C)                                  VMOVRDUPd(A,B,C)
 #define VMADDSUB(A,B,accum)                              VMADDSUBd(A,B,accum) 
-#define VSHUD(A,B)                                       VSHUFd(A,B)
+#define VSHUF(A,B)                                       VSHUFd(A,B)
 
 
 #undef ZEND1
diff --git a/lib/simd/Intel512single.h b/lib/simd/Intel512single.h
index c94a7852..3a39c6a4 100644
--- a/lib/simd/Intel512single.h
+++ b/lib/simd/Intel512single.h
@@ -109,7 +109,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 
 #undef VMOVIDUP
 #undef VMOVRDUP
-#undef VMADDSBUB
+#undef VMADDSUB
 #undef VSHUF
 #define VMOVIDUP(A,B,C)                                  VMOVIDUPf(A,B,C)
 #define VMOVRDUP(A,B,C)                                  VMOVRDUPf(A,B,C)
diff --git a/lib/simd/Intel512wilson.h b/lib/simd/Intel512wilson.h
index 64087ea6..243b89ed 100644
--- a/lib/simd/Intel512wilson.h
+++ b/lib/simd/Intel512wilson.h
@@ -302,7 +302,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr)
 
 // MULT_2SPINa(ptr)        MULT_2SPIN_PF(ptr,ptr,VPREFETCHG);
-#define MULT_2SPIN(ptr) MULT_ADDSUB_2SPIN(ptr);
 
 #if 0
 #define MULT_2SPIN_PF(ptr,pf,VPF)			\
@@ -816,4 +815,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
            VMADDSUB(Z5,Chi_12,UChi_12)\
                                                 );
 
+#define MULT_2SPIN(ptr) MULT_ADDSUB_2SPIN(ptr)
+
 #endif

From e67fc2be183bc7da011ee087e1e0219e7843d11d Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 31 Mar 2016 16:00:37 +0100
Subject: [PATCH 4/9] Adding a trial for openmp overhead minimisation

---
 lib/qcd/action/fermion/WilsonFermion5D.cc | 120 ++++++++++++++++++++++
 lib/qcd/action/fermion/WilsonFermion5D.h  |   7 ++
 2 files changed, 127 insertions(+)

diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc
index b78f030e..581a3fc5 100644
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -418,6 +418,126 @@ PARALLEL_FOR_LOOP
   alltime+=usecond();
 }
 
+template<class Impl>
+void WilsonFermion5D<Impl>::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder &lo,
+						 DoubledGaugeField & U,
+						 const FermionField &in, FermionField &out,int dag)
+{
+  // Benchmark variant of Dhop: one halo exchange, then the site loops are repeated (jjj loop) inside a single omp parallel region. dag is not range-checked: assert((dag==DaggerNo) ||(dag==DaggerYes)) is disabled.
+  alltime-=usecond();
+  Compressor compressor(dag);
+
+  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
+
+  int threads = GridThread::GetThreads();   // threads, HT, cores and nwork are read only by the AsmOptDslash branch below
+  int HT      = GridThread::GetHyperThreads();
+  int cores   = GridThread::GetCores();
+  int nwork = U._grid->oSites();
+  
+  commtime -=usecond();
+  auto handle = st.HaloExchangeBegin(in,compressor);
+  st.HaloExchangeComplete(handle);   // completed straight away: communication is not overlapped with the compute below
+  commtime +=usecond();
+
+  jointime -=usecond();
+  jointime +=usecond();   // no work sits between the two calls, so nothing is actually timed here
+  
+  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
+  // Note loop ordering and data layout.
+  // Designed to create 
+  // - per thread reuse in L1 cache for U
+  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
+
+#pragma omp parallel 
+  {
+  for(int jjj=0;jjj<1000;jjj++){
+#pragma omp barrier
+  dslashtime -=usecond();
+  if ( dag == DaggerYes ) {
+    if( this->HandOptDslash ) {
+#pragma omp for
+      for(int ss=0;ss<U._grid->oSites();ss++){
+	int sU=ss;
+	for(int s=0;s<Ls;s++){
+	  int sF = s+Ls*sU;   // 5d fermion index: s runs fastest within each 4d site sU
+	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
+	  }
+      }
+    } else { 
+      // HandOptDslash not set: same site loop, calling DiracOptDhopSiteDag instead
+#pragma omp for
+      for(int ss=0;ss<U._grid->oSites();ss++){
+	{
+	  int sd;
+	  for(sd=0;sd<Ls;sd++){
+	    int sU=ss;
+	    int sF = sd+Ls*sU;
+	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
+	  }
+	}
+      }
+    }
+  } else {
+    if( this->AsmOptDslash ) {
+      //      for(int i=0;i<1;i++){
+      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
+      //	PerformanceCounter Counter(i);
+      //	Counter.Start();
+      // One omp-for iteration per thread t, split into (core, hyperthread). GetWork fills the count/offset pairs bounding the two loops below (presumably an even split of nwork over cores and of Ls over HT - confirm in GridThread.h).
+#pragma omp for
+      for(int t=0;t<threads;t++){
+
+	int hyperthread = t%HT;
+	int core        = t/HT;
+
+        int sswork, swork,soff,ssoff,  sU,sF;
+	
+	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
+	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
+
+	for(int ss=0;ss<sswork;ss++){
+	  for(int s=soff;s<soff+swork;s++){
+
+	    sU=ss+ ssoff;
+
+	    if ( LebesgueOrder::UseLebesgueOrder ) {
+	      sU = lo.Reorder(sU);
+	    }
+	    sF = s+Ls*sU;
+	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
+	  }
+	}
+      }
+      //      Counter.Stop();
+      //      Counter.Report();
+      //      }
+    } else if( this->HandOptDslash ) {
+#pragma omp for
+
+      for(int ss=0;ss<U._grid->oSites();ss++){
+	int sU=ss;
+	for(int s=0;s<Ls;s++){
+	  int sF = s+Ls*sU;
+	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
+	}
+      }
+    } else { 
+#pragma omp for
+      for(int ss=0;ss<U._grid->oSites();ss++){
+	int sU=ss;
+	for(int s=0;s<Ls;s++){
+	  int sF = s+Ls*sU; 
+	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
+	}
+      }
+    }
+  }
+  }
+  }
+  dslashtime +=usecond();   // NOTE(review): this is the only += for dslashtime, but the matching -= sits inside the parallel region's repeat loop, so every thread executes it on every repetition with no synchronisation; the accumulated value is not a usable timing.
+  alltime+=usecond();
+}
+
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo,
 						     DoubledGaugeField & U,
diff --git a/lib/qcd/action/fermion/WilsonFermion5D.h b/lib/qcd/action/fermion/WilsonFermion5D.h
index 30e663e8..164a3c1a 100644
--- a/lib/qcd/action/fermion/WilsonFermion5D.h
+++ b/lib/qcd/action/fermion/WilsonFermion5D.h
@@ -120,6 +120,13 @@ namespace Grid {
 			FermionField &out,
 			int dag);
 
+      void DhopInternalOMPbench(StencilImpl & st,
+				LebesgueOrder &lo,
+				DoubledGaugeField &U,
+				const FermionField &in, 
+				FermionField &out,
+				int dag);
+
       void DhopInternalCommsThenCompute(StencilImpl & st,
 			LebesgueOrder &lo,
 			DoubledGaugeField &U,

From e8dddb1596f271c15fa2161dca47f6ea672c8918 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Wed, 6 Apr 2016 10:32:54 +0100
Subject: [PATCH 5/9] Adding extra benchmark

---
 benchmarks/Benchmark_zmm.cc               | 174 ++++++++++++++++++++++
 lib/qcd/action/fermion/WilsonFermion5D.cc | 120 ++++++++++++++-
 2 files changed, 293 insertions(+), 1 deletion(-)
 create mode 100644 benchmarks/Benchmark_zmm.cc

diff --git a/benchmarks/Benchmark_zmm.cc b/benchmarks/Benchmark_zmm.cc
new file mode 100644
index 00000000..f7bc8e8e
--- /dev/null
+++ b/benchmarks/Benchmark_zmm.cc
@@ -0,0 +1,174 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./benchmarks/Benchmark_zmm.cc
+
+    Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid.h>
+#include <PerfCount.h>
+#include <simd/Intel512wilson.h>
+
+
+using namespace Grid;
+using namespace Grid::QCD;
+
+void ZmulF(void *ptr1,void *ptr2,void *ptr3);
+void Zmul(void *ptr1,void *ptr2,void *ptr3);
+void WilsonDslashAvx512(void *ptr1,void *ptr2,void *ptr3);
+void WilsonDslashAvx512F(void *ptr1,void *ptr2,void *ptr3);
+void TimesIAvx512F(void *ptr1,void *ptr3);
+void TimesIAvx512(void *ptr1,void *ptr3);
+void TimesMinusIAvx512F(void *ptr1,void *ptr3);
+void TimesMinusIAvx512(void *ptr1,void *ptr3);
+
+
+int bench(std::ofstream &os, std::vector<int> &latt4,int Ls);
+
+int main(int argc,char **argv)
+{
+  Grid_init(&argc,&argv);
+  std::ofstream os("zmm.dat");
+  // Sweep lattices L x L x mL x mL (L=4..30 step 2, m=1,2) at Ls=8 and 16; bench() appends one row to zmm.dat per case.
+  os << "#V Ls Lxy Lzt C++ Asm OMP L1 " <<std::endl;
+  for(int L=4;L<32;L+=2){
+    for(int m=1;m<=2;m++){
+      for(int Ls=8;Ls<=16;Ls+=8){
+	std::vector<int> grid({L,L,m*L,m*L});
+	bench(os,grid,Ls); // pass the extents built on the line above (no other lattice vector exists in main)
+      }
+    }
+  }
+}
+
+int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
+{
+  // Times DhopOE (C++ path, then ASM path) and the OMP / L1 benchmark kernels on a latt4 x Ls lattice, logs Mflop/s, appends one row to os and returns 0.
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+  // NOTE(review): these four pointers are never released here; if the make* factories allocate, each bench() call leaks them - confirm ownership.
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+  int threads = GridThread::GetThreads();
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+
+  GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4);
+
+  LatticeFermion src (FGrid);
+  LatticeFermion tmp (FGrid);
+  LatticeFermion srce(FrbGrid);
+
+  LatticeFermion resulto(FrbGrid); resulto=zero;
+  LatticeFermion resulta(FrbGrid); resulta=zero;
+  LatticeFermion junk(FrbGrid); junk=zero;
+  LatticeFermion diff(FrbGrid);
+  LatticeGaugeField Umu(UGrid);
+
+  double mfc, mfa, mfo, mfl1;
+
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
+  random(RNG5,src);
+#if 1
+  random(RNG4,Umu);
+#else
+  int mmu=2;
+  std::vector<LatticeColourMatrix> U(4,UGrid);
+  for(int mu=0;mu<Nd;mu++){
+    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
+    if ( mu!=mmu ) U[mu] = zero;
+    if ( mu==mmu ) U[mu] = 1.0;
+    PokeIndex<LorentzIndex>(Umu,U[mu],mu);
+  }
+#endif
+  pickCheckerboard(Even,srce,src);
+
+  RealD mass=0.1;
+  RealD M5  =1.8;
+  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+  QCD::WilsonFermion5DStatic::AsmOptDslash=0; // bench() is called repeatedly and the flag is set to 1 further down: clear it so this timing (and resulto) is not taken with the flag still set from the previous call
+  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
+  int ncall=50;
+  double t0=usecond();
+  for(int i=0;i<ncall;i++){
+    Dw.DhopOE(srce,resulto,0);
+  }
+  double t1=usecond();
+
+  double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+  double flops=1344*volume/2; // presumably 1344 flops per 5d site over half the volume (one checkerboard) - confirm
+
+  mfc = flops*ncall/(t1-t0);
+  std::cout<<GridLogMessage << "Called C++ Dw"<< " mflop/s =   "<< mfc<<std::endl;
+
+  QCD::WilsonFermion5DStatic::AsmOptDslash=1;
+  t0=usecond();
+  for(int i=0;i<ncall;i++){
+    Dw.DhopOE(srce,resulta,0);
+  }
+  t1=usecond();
+  mfa = flops*ncall/(t1-t0);
+  std::cout<<GridLogMessage << "Called ASM Dw"<< " mflop/s =   "<< mfa<<std::endl;
+  const int dag = 0; // dagger flag for the two kernel benchmarks; 0 matches the literal passed to DhopOE above (presumably DaggerNo - confirm)
+  t0=usecond();
+  for(int i=0;i<1;i++){
+    Dw.DhopInternalOMPbench(Dw.StencilOdd,Dw.LebesgueEvenOdd,Dw.UmuEven,srce,junk,dag);
+  }
+  t1=usecond();
+  mfo = flops*100/(t1-t0); // 100 = repeat count of the jjj loop inside DhopInternalOMPbench
+  std::cout<<GridLogMessage << "Called ASM-OMP Dw"<< " mflop/s =   "<< mfo<<std::endl;
+
+  t0=usecond();
+  for(int i=0;i<1;i++){
+    Dw.DhopInternalL1bench(Dw.StencilOdd,Dw.LebesgueEvenOdd,Dw.UmuEven,srce,junk,dag); // NOTE(review): confirm DhopInternalL1bench is declared (and accessible) in WilsonFermion5D.h
+  }
+  t1=usecond();
+  mfl1= flops*100/(t1-t0); // 100 = repeat count of the jjj loop inside DhopInternalL1bench
+  std::cout<<GridLogMessage << "Called ASM-L1 Dw"<< " mflop/s =   "<< mfl1<<std::endl;
+
+  os << latt4[0]*latt4[1]*latt4[2]*latt4[3]<< " "<<Ls<<" "<< latt4[0] <<" " <<latt4[2]<< " "
+     << mfc<<" "
+     << mfa<<" "
+     << mfo<<" "
+     << mfl1<<std::endl;
+
+  for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
+    Dw.DhopOE(srce,resulta,0);
+    PerformanceCounter Counter(i);
+    Counter.Start();
+    Dw.DhopOE(srce,resulta,0);
+    Counter.Stop();
+    Counter.Report();
+  }
+  //resulta = (-0.5) * resulta;
+  diff = resulto-resulta;
+  std::cout<<GridLogMessage << "diff "<< norm2(diff)<<std::endl;
+  std::cout<<std::endl;
+  return 0; // the function is declared int; flowing off the end of a value-returning function is undefined behaviour
+}
+
+
diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc
index 581a3fc5..9874031d 100644
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -450,7 +450,7 @@ void WilsonFermion5D<Impl>::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder
 
 #pragma omp parallel 
   {
-  for(int jjj=0;jjj<1000;jjj++){
+  for(int jjj=0;jjj<100;jjj++){
 #pragma omp barrier
   dslashtime -=usecond();
   if ( dag == DaggerYes ) {
@@ -538,6 +538,124 @@ void WilsonFermion5D<Impl>::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder
   alltime+=usecond();
 }
 
+
+template<class Impl>
+void WilsonFermion5D<Impl>::DhopInternalL1bench(StencilImpl & st, LebesgueOrder &lo,
+						DoubledGaugeField & U,
+						const FermionField &in, FermionField &out,int dag)
+{
+  // Benchmark variant of the Dhop dispatch: one halo exchange of `in`, then 100 passes of
+  // the site kernels with the gauge site fixed at sU=0 (so sF = s). `lo` is not used.
+  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
+  alltime-=usecond();
+  Compressor compressor(dag);
+  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
+  int threads = GridThread::GetThreads();
+  int HT      = GridThread::GetHyperThreads();
+  int cores   = GridThread::GetCores();
+  int nwork = U._grid->oSites();
+
+  commtime -=usecond();
+  auto handle = st.HaloExchangeBegin(in,compressor);
+  st.HaloExchangeComplete(handle);
+  commtime +=usecond();
+  // No work is bracketed by jointime in this variant.
+  jointime -=usecond();
+  jointime +=usecond();
+
+  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
+  // Note loop ordering and data layout, designed to create
+  // - per thread reuse in L1 cache for U
+  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
+  // Start the dslash timer once, before the parallel region, so it pairs with the single stop after it and is not updated concurrently by every thread.
+  dslashtime -=usecond();
+#pragma omp parallel 
+  {
+  for(int jjj=0;jjj<100;jjj++){
+#pragma omp barrier
+  // All threads line up at the barrier above before each of the 100 passes.
+  if ( dag == DaggerYes ) {
+    if( this->HandOptDslash ) {
+#pragma omp for
+      for(int ss=0;ss<U._grid->oSites();ss++){
+	int sU=0;
+	for(int s=0;s<Ls;s++){
+	  int sF = s+Ls*sU;
+	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
+	  }
+      }
+    } else { 
+
+#pragma omp for
+      for(int ss=0;ss<U._grid->oSites();ss++){
+	{
+	  int sd;
+	  for(sd=0;sd<Ls;sd++){
+	    int sU=0;
+	    int sF = sd+Ls*sU;
+	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
+	  }
+	}
+      }
+    }
+  } else {
+    if( this->AsmOptDslash ) {
+      //      for(int i=0;i<1;i++){
+      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
+      //	PerformanceCounter Counter(i);
+      //	Counter.Start();
+
+#pragma omp for
+      for(int t=0;t<threads;t++){
+
+	int hyperthread = t%HT;
+	int core        = t/HT;
+
+        int sswork, swork,soff,ssoff,  sU,sF;
+	// NOTE(review): presumably GetWork splits nwork across cores and Ls across hyperthreads - confirm in GridThread.
+	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
+	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
+	// ssoff is never read: each of the sswork iterations re-runs the kernel on sU=0.
+	for(int ss=0;ss<sswork;ss++){
+	  for(int s=soff;s<soff+swork;s++){
+
+	    sU=0;
+	    sF = s+Ls*sU;
+	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
+	  }
+	}
+      }
+      //      Counter.Stop();
+      //      Counter.Report();
+      //      }
+    } else if( this->HandOptDslash ) {
+#pragma omp for
+
+      for(int ss=0;ss<U._grid->oSites();ss++){
+	int sU=0;
+	for(int s=0;s<Ls;s++){
+	  int sF = s+Ls*sU;
+	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
+	}
+      }
+    } else { 
+#pragma omp for
+      for(int ss=0;ss<U._grid->oSites();ss++){
+	int sU=0;
+	for(int s=0;s<Ls;s++){
+	  int sF = s+Ls*sU; 
+	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
+	}
+      }
+    }
+  }
+  } // for jjj
+  } // omp parallel
+  dslashtime +=usecond();
+  alltime+=usecond();
+}
+
+
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo,
 						     DoubledGaugeField & U,

From b1192a89085c1fbafe90fc1b03c1bc486c80353b Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Wed, 6 Apr 2016 03:00:07 -0700
Subject: [PATCH 6/9] Benchmark_zmm added

---
 benchmarks/Benchmark_zmm.cc              | 13 ++-----------
 benchmarks/Make.inc                      |  6 +++++-
 lib/Make.inc                             |  4 ++--
 lib/qcd/action/fermion/WilsonFermion5D.h | 10 +++++++++-
 tests/Make.inc                           | 22 +++++++++++-----------
 5 files changed, 29 insertions(+), 26 deletions(-)

diff --git a/benchmarks/Benchmark_zmm.cc b/benchmarks/Benchmark_zmm.cc
index f7bc8e8e..a82fb23c 100644
--- a/benchmarks/Benchmark_zmm.cc
+++ b/benchmarks/Benchmark_zmm.cc
@@ -27,21 +27,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
     /*  END LEGAL */
 #include <Grid.h>
 #include <PerfCount.h>
-#include <simd/Intel512wilson.h>
 
 
 using namespace Grid;
 using namespace Grid::QCD;
 
-void ZmulF(void *ptr1,void *ptr2,void *ptr3);
-void Zmul(void *ptr1,void *ptr2,void *ptr3);
-void WilsonDslashAvx512(void *ptr1,void *ptr2,void *ptr3);
-void WilsonDslashAvx512F(void *ptr1,void *ptr2,void *ptr3);
-void TimesIAvx512F(void *ptr1,void *ptr3);
-void TimesIAvx512(void *ptr1,void *ptr3);
-void TimesMinusIAvx512F(void *ptr1,void *ptr3);
-void TimesMinusIAvx512(void *ptr1,void *ptr3);
-
 
 int bench(std::ofstream &os, std::vector<int> &latt4,int Ls);
 
@@ -55,7 +45,7 @@ int main(int argc,char **argv)
     for(int m=1;m<=2;m++){
       for(int Ls=8;Ls<=16;Ls+=8){
 	std::vector<int> grid({L,L,m*L,m*L});
-	bench(os,latt4,Ls);
+	bench(os,grid,Ls);
       }
     }
   }
@@ -134,6 +124,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
   mfa = flops*ncall/(t1-t0);
   std::cout<<GridLogMessage << "Called ASM Dw"<< " mflop/s =   "<< mfa<<std::endl;
 
+  int dag=DaggerNo;
   t0=usecond();
   for(int i=0;i<1;i++){
     Dw.DhopInternalOMPbench(Dw.StencilOdd,Dw.LebesgueEvenOdd,Dw.UmuEven,srce,junk,dag);
diff --git a/benchmarks/Make.inc b/benchmarks/Make.inc
index 484306ff..18f08a3d 100644
--- a/benchmarks/Make.inc
+++ b/benchmarks/Make.inc
@@ -1,5 +1,5 @@
 
-bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson
+bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm
 
 
 Benchmark_comms_SOURCES=Benchmark_comms.cc
@@ -25,3 +25,7 @@ Benchmark_su3_LDADD=-lGrid
 Benchmark_wilson_SOURCES=Benchmark_wilson.cc
 Benchmark_wilson_LDADD=-lGrid
 
+
+Benchmark_zmm_SOURCES=Benchmark_zmm.cc
+Benchmark_zmm_LDADD=-lGrid
+
diff --git a/lib/Make.inc b/lib/Make.inc
index b3d147f0..06b9da14 100644
--- a/lib/Make.inc
+++ b/lib/Make.inc
@@ -1,4 +1,4 @@
 
-HFILES=./Lexicographic.h ./Cshift.h ./cshift/Cshift_none.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./pugixml/pugixml.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_outer.h ./tensors/Tensor_exp.h ./tensors/Tensor_Ta.h ./tensors/Tensor_trace.h ./tensors/Tensor_logical.h ./tensors/Tensor_class.h ./tensors/Tensor_determinant.h ./tensors/Tensor_unary.h ./tensors/Tensor_inner.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith.h ./tensors/Tensor_transpose.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_index.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_traits.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_reality.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./cartesian/Cartesian_base.h ./Communicator.h ./Threads.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/QCD.h ./qcd/action/ActionParams.h ./qcd/action/ActionBase.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/WilsonTMFermion.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/FermionOperatorImpl.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/pseudofermion/OneFlavourRationalRatio.h 
./qcd/action/pseudofermion/TwoFlavour.h ./qcd/action/pseudofermion/OneFlavourEvenOddRational.h ./qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h ./qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h ./qcd/action/pseudofermion/TwoFlavourEvenOdd.h ./qcd/action/pseudofermion/OneFlavourRational.h ./qcd/action/pseudofermion/TwoFlavourRatio.h ./qcd/action/pseudofermion/EvenOddSchurDifferentiable.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/gauge/PlaqPlusRectangleAction.h ./qcd/action/gauge/GaugeImpl.h ./qcd/action/Actions.h ./qcd/utils/SUn.h ./qcd/utils/CovariantCshift.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/LinalgUtils.h ./qcd/utils/WilsonLoops.h ./qcd/hmc/integrators/Integrator.h ./qcd/hmc/integrators/Integrator_algorithm.h ./qcd/hmc/HmcRunner.h ./qcd/hmc/HMC.h ./qcd/hmc/NerscCheckpointer.h ./stencil/Lebesgue.h ./Cartesian.h ./Grid.h ./Simd.h ./Tensors.h ./PerfCount.h ./Old/Tensor_poke.h ./Old/Tensor_peek.h ./AlignedAllocator.h ./parallelIO/NerscIO.h ./parallelIO/BinaryIO.h ./algorithms/LinearOperator.h ./algorithms/SparseMatrix.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/MatrixUtils.h ./algorithms/iterative/PrecGeneralisedConjugateResidual.h ./algorithms/iterative/ImplicitlyRestartedLanczos.h ./algorithms/iterative/AdefGeneric.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/iterative/DenseMatrix.h ./algorithms/iterative/PrecConjugateResidual.h ./algorithms/iterative/Francis.h ./algorithms/iterative/EigenSort.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/Householder.h ./algorithms/iterative/Matrix.h ./algorithms/approx/Zolotarev.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/Remez.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/bigfloat.h ./algorithms/CoarsenedMatrix.h ./algorithms/Preconditioner.h ./Lattice.h ./simd/Grid_neon.h 
./simd/Grid_avx512.h ./simd/Grid_sse4.h ./simd/Intel512imci.h ./simd/Intel512avx.h ./simd/Grid_qpx.h ./simd/Intel512avxAddsub.h ./simd/Grid_vector_unops.h ./simd/Grid_empty.h ./simd/Intel512wilson.h ./simd/Grid_avx.h ./simd/Intel512common.h ./simd/Grid_vector_types.h ./simd/Grid_imci.h ./lattice/Lattice_reduction.h ./lattice/Lattice_where.h ./lattice/Lattice_rng.h ./lattice/Lattice_local.h ./lattice/Lattice_transfer.h ./lattice/Lattice_ET.h ./lattice/Lattice_transpose.h ./lattice/Lattice_reality.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_unary.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_conformable.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_arith.h ./lattice/Lattice_trace.h ./Stencil.h ./Init.h ./Log.h ./serialisation/BaseIO.h ./serialisation/BinaryIO.h ./serialisation/XmlIO.h ./serialisation/TextIO.h ./serialisation/Serialisation.h ./serialisation/MacroMagic.h ./Algorithms.h ./communicator/Communicator_base.h ./Timer.h
+HFILES=./Algorithms.h ./AlignedAllocator.h ./Cartesian.h ./Communicator.h ./Cshift.h ./Grid.h ./Init.h ./Lattice.h ./Lexicographic.h ./Log.h ./Old/Tensor_peek.h ./Old/Tensor_poke.h ./PerfCount.h ./Simd.h ./Stencil.h ./Tensors.h ./Threads.h ./Timer.h ./algorithms/CoarsenedMatrix.h ./algorithms/LinearOperator.h ./algorithms/Preconditioner.h ./algorithms/SparseMatrix.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/iterative/AdefGeneric.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/DenseMatrix.h ./algorithms/iterative/EigenSort.h ./algorithms/iterative/Francis.h ./algorithms/iterative/Householder.h ./algorithms/iterative/ImplicitlyRestartedLanczos.h ./algorithms/iterative/Matrix.h ./algorithms/iterative/MatrixUtils.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/PrecConjugateResidual.h ./algorithms/iterative/PrecGeneralisedConjugateResidual.h ./algorithms/iterative/SchurRedBlack.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./communicator/Communicator_base.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./lattice/Lattice_ET.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./parallelIO/BinaryIO.h ./parallelIO/NerscIO.h ./pugixml/pugixml.h ./qcd/QCD.h 
./qcd/action/ActionBase.h ./qcd/action/ActionParams.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/FermionOperatorImpl.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/fermion/WilsonTMFermion.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/gauge/GaugeImpl.h ./qcd/action/gauge/PlaqPlusRectangleAction.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/pseudofermion/EvenOddSchurDifferentiable.h ./qcd/action/pseudofermion/OneFlavourEvenOddRational.h ./qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h ./qcd/action/pseudofermion/OneFlavourRational.h ./qcd/action/pseudofermion/OneFlavourRationalRatio.h ./qcd/action/pseudofermion/TwoFlavour.h ./qcd/action/pseudofermion/TwoFlavourEvenOdd.h ./qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h ./qcd/action/pseudofermion/TwoFlavourRatio.h ./qcd/hmc/HMC.h ./qcd/hmc/HmcRunner.h ./qcd/hmc/NerscCheckpointer.h ./qcd/hmc/integrators/Integrator.h ./qcd/hmc/integrators/Integrator_algorithm.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SUn.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/WilsonLoops.h 
./serialisation/BaseIO.h ./serialisation/BinaryIO.h ./serialisation/MacroMagic.h ./serialisation/Serialisation.h ./serialisation/TextIO.h ./serialisation/XmlIO.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_imci.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./simd/Intel512avx.h ./simd/Intel512avxAddsub.h ./simd/Intel512common.h ./simd/Intel512double.h ./simd/Intel512imci.h ./simd/Intel512single.h ./simd/Intel512wilson.h ./stencil/Lebesgue.h ./tensors/Tensor_Ta.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_determinant.h ./tensors/Tensor_exp.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_index.h ./tensors/Tensor_inner.h ./tensors/Tensor_logical.h ./tensors/Tensor_outer.h ./tensors/Tensor_reality.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h
 
-CCFILES=./pugixml/pugixml.cc ./PerfCount.cc ./Init.cc ./qcd/spin/Dirac.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsAsm.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonTMFermion.cc ./qcd/utils/SpaceTimeGrid.cc ./qcd/hmc/HMC.cc ./stencil/Stencil_common.cc ./stencil/Lebesgue.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./algorithms/approx/MultiShiftFunction.cc ./Log.cc ./serialisation/TextIO.cc ./serialisation/BinaryIO.cc ./serialisation/XmlIO.cc
+CCFILES=./Init.cc ./Log.cc ./PerfCount.cc ./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./pugixml/pugixml.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsAsm.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/action/fermion/WilsonTMFermion.cc ./qcd/hmc/HMC.cc ./qcd/spin/Dirac.cc ./qcd/utils/SpaceTimeGrid.cc ./serialisation/BinaryIO.cc ./serialisation/TextIO.cc ./serialisation/XmlIO.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc
diff --git a/lib/qcd/action/fermion/WilsonFermion5D.h b/lib/qcd/action/fermion/WilsonFermion5D.h
index 164a3c1a..840c1a46 100644
--- a/lib/qcd/action/fermion/WilsonFermion5D.h
+++ b/lib/qcd/action/fermion/WilsonFermion5D.h
@@ -1,3 +1,4 @@
+
     /*************************************************************************************
 
     Grid physics library, www.github.com/paboyle/Grid 
@@ -127,6 +128,13 @@ namespace Grid {
 				FermionField &out,
 				int dag);
 
+      void DhopInternalL1bench(StencilImpl & st,
+				LebesgueOrder &lo,
+				DoubledGaugeField &U,
+				const FermionField &in, 
+				FermionField &out,
+				int dag);
+
       void DhopInternalCommsThenCompute(StencilImpl & st,
 			LebesgueOrder &lo,
 			DoubledGaugeField &U,
@@ -155,7 +163,7 @@ namespace Grid {
       ///////////////////////////////////////////////////////////////
       // Data members require to support the functionality
       ///////////////////////////////////////////////////////////////
-    protected:
+    public:
 
       // Add these to the support from Wilson
       GridBase *_FourDimGrid;
diff --git a/tests/Make.inc b/tests/Make.inc
index ef9681cd..f7c83671 100644
--- a/tests/Make.inc
+++ b/tests/Make.inc
@@ -1,5 +1,13 @@
 
-bin_PROGRAMS = Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_gamma Test_GaugeAction Test_gparity Test_gpdwf_force Test_gp_rect_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_RectPlaq Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd 
+bin_PROGRAMS = Test_GaugeAction Test_RectPlaq Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_gamma Test_gp_rect_force Test_gparity Test_gpdwf_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd 
+
+
+Test_GaugeAction_SOURCES=Test_GaugeAction.cc
+Test_GaugeAction_LDADD=-lGrid
+
+
+Test_RectPlaq_SOURCES=Test_RectPlaq.cc
+Test_RectPlaq_LDADD=-lGrid
 
 
 Test_cayley_cg_SOURCES=Test_cayley_cg.cc
@@ -94,8 +102,8 @@ Test_gamma_SOURCES=Test_gamma.cc
 Test_gamma_LDADD=-lGrid
 
 
-Test_GaugeAction_SOURCES=Test_GaugeAction.cc
-Test_GaugeAction_LDADD=-lGrid
+Test_gp_rect_force_SOURCES=Test_gp_rect_force.cc
+Test_gp_rect_force_LDADD=-lGrid
 
 
 Test_gparity_SOURCES=Test_gparity.cc
@@ -106,10 +114,6 @@ Test_gpdwf_force_SOURCES=Test_gpdwf_force.cc
 Test_gpdwf_force_LDADD=-lGrid
 
 
-Test_gp_rect_force_SOURCES=Test_gp_rect_force.cc
-Test_gp_rect_force_LDADD=-lGrid
-
-
 Test_gpwilson_even_odd_SOURCES=Test_gpwilson_even_odd.cc
 Test_gpwilson_even_odd_LDADD=-lGrid
 
@@ -186,10 +190,6 @@ Test_rect_force_SOURCES=Test_rect_force.cc
 Test_rect_force_LDADD=-lGrid
 
 
-Test_RectPlaq_SOURCES=Test_RectPlaq.cc
-Test_RectPlaq_LDADD=-lGrid
-
-
 Test_remez_SOURCES=Test_remez.cc
 Test_remez_LDADD=-lGrid
 

From 23a7176b712b70e9d7985c79022d74e0562b18c9 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Wed, 6 Apr 2016 03:22:11 -0700
Subject: [PATCH 7/9] Loop over volumes

---
 benchmarks/Benchmark_zmm.cc | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/benchmarks/Benchmark_zmm.cc b/benchmarks/Benchmark_zmm.cc
index a82fb23c..12543c6d 100644
--- a/benchmarks/Benchmark_zmm.cc
+++ b/benchmarks/Benchmark_zmm.cc
@@ -41,7 +41,7 @@ int main(int argc,char **argv)
   std::ofstream os("zmm.dat");
 
   os << "#V Ls Lxy Lzt C++ Asm OMP L1 " <<std::endl;
-  for(int L=4;L<32;L+=2){
+  for(int L=8;L<32;L+=2){
     for(int m=1;m<=2;m++){
       for(int Ls=8;Ls<=16;Ls+=8){
 	std::vector<int> grid({L,L,m*L,m*L});
@@ -127,7 +127,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
   int dag=DaggerNo;
   t0=usecond();
   for(int i=0;i<1;i++){
-    Dw.DhopInternalOMPbench(Dw.StencilOdd,Dw.LebesgueEvenOdd,Dw.UmuEven,srce,junk,dag);
+    Dw.DhopInternalOMPbench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
   }
   t1=usecond();
   mfo = flops*100/(t1-t0);
@@ -135,7 +135,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
 
   t0=usecond();
   for(int i=0;i<1;i++){
-    Dw.DhopInternalL1bench(Dw.StencilOdd,Dw.LebesgueEvenOdd,Dw.UmuEven,srce,junk,dag);
+    Dw.DhopInternalL1bench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
   }
   t1=usecond();
   mfl1= flops*100/(t1-t0);
@@ -147,6 +147,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
      << mfo<<" "
      << mfl1<<std::endl;
 
+#if 0
   for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
     Dw.DhopOE(srce,resulta,0);
     PerformanceCounter Counter(i);
@@ -155,6 +156,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
     Counter.Stop();
     Counter.Report();
   }
+#endif
   //resulta = (-0.5) * resulta;
 
   diff = resulto-resulta;

From a524ca2a4baefca48536cdd936776bbdf4d3c423 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Wed, 6 Apr 2016 03:35:56 -0700
Subject: [PATCH 8/9] New benchmark update

---
 benchmarks/Benchmark_zmm.cc | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/benchmarks/Benchmark_zmm.cc b/benchmarks/Benchmark_zmm.cc
index 12543c6d..5c84ad79 100644
--- a/benchmarks/Benchmark_zmm.cc
+++ b/benchmarks/Benchmark_zmm.cc
@@ -41,10 +41,14 @@ int main(int argc,char **argv)
   std::ofstream os("zmm.dat");
 
   os << "#V Ls Lxy Lzt C++ Asm OMP L1 " <<std::endl;
-  for(int L=8;L<32;L+=2){
+  for(int L=8;L<=32;L+=4){
     for(int m=1;m<=2;m++){
       for(int Ls=8;Ls<=16;Ls+=8){
 	std::vector<int> grid({L,L,m*L,m*L});
+	for(int i=0;i<4;i++) { 
+	  std::cout << grid[i]<<"x";
+	}
+	std::cout << Ls<<std::endl;
 	bench(os,grid,Ls);
       }
     }

From 650e02b34486f970ad164e28790207fb2a16d238 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Wed, 6 Apr 2016 06:52:09 -0700
Subject: [PATCH 9/9] Smaller vols too

---
 benchmarks/Benchmark_zmm.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/Benchmark_zmm.cc b/benchmarks/Benchmark_zmm.cc
index 5c84ad79..ebe7282e 100644
--- a/benchmarks/Benchmark_zmm.cc
+++ b/benchmarks/Benchmark_zmm.cc
@@ -41,7 +41,7 @@ int main(int argc,char **argv)
   std::ofstream os("zmm.dat");
 
   os << "#V Ls Lxy Lzt C++ Asm OMP L1 " <<std::endl;
-  for(int L=8;L<=32;L+=4){
+  for(int L=4;L<=32;L+=4){
     for(int m=1;m<=2;m++){
       for(int Ls=8;Ls<=16;Ls+=8){
 	std::vector<int> grid({L,L,m*L,m*L});