Merge remote-tracking branch 'origin/master' into ckelly-dec12-2015

2026-03-01 18:16:13 +00:00 · 2016-04-06 13:57:28 -04:00
parent af9c8d1372 650e02b344
commit a646260e82
11 changed files with 728 additions and 195 deletions
--- a/lib/Make.inc
+++ b/lib/Make.inc
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -418,6 +418,244 @@ PARALLEL_FOR_LOOP
  alltime+=usecond();
 }

+// Benchmark variant of the Dhop dispatch.
+//
+// Performs a single halo exchange on `in`, then applies the selected Wilson
+// kernel to every (4d site, 5th-dimension slice) pair `nrepeat` times inside
+// one OpenMP parallel region, so the cost of the threaded loop can be measured
+// without re-entering the parallel region on every pass.
+//
+//   st  : stencil; supplies HaloExchangeBegin/Complete and comm_buf
+//   lo  : Lebesgue reordering, consulted only on the AsmOptDslash path
+//   U   : doubled gauge field; U._grid->oSites() gives the outer site count
+//   in  : source fermion field
+//   out : fermion field handed to the kernels as their output on every pass
+//   dag : DaggerYes selects the daggered kernels, anything else the plain ones
+//
+// Timers: alltime brackets the whole call, commtime the halo exchange, and
+// dslashtime the complete repeated compute region (all `nrepeat` passes).
+template<class Impl>
+void WilsonFermion5D<Impl>::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder &lo,
+                                                 DoubledGaugeField & U,
+                                                 const FermionField &in, FermionField &out,int dag)
+{
+  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
+  alltime-=usecond();
+  Compressor compressor(dag);
+
+  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
+  int threads = GridThread::GetThreads();
+  int HT      = GridThread::GetHyperThreads();
+  int cores   = GridThread::GetCores();
+  int nwork   = U._grid->oSites();
+
+  // Number of times the compute phase is repeated.
+  const int nrepeat = 100;
+
+  // Communication happens once, up front; only the compute is repeated.
+  commtime -=usecond();
+  auto handle = st.HaloExchangeBegin(in,compressor);
+  st.HaloExchangeComplete(handle);
+  commtime +=usecond();
+
+  // No work happens between these two calls; the pair is retained as-is.
+  jointime -=usecond();
+  jointime +=usecond();
+
+  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
+  // Note loop ordering and data layout.
+  // Designed to create
+  // - per thread reuse in L1 cache for U
+  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
+
+  // Start the compute timer exactly once, before the team is created.
+  // Starting it inside the parallel region made every thread update the shared
+  // counter on every pass (a data race) while it was stopped only once below.
+  dslashtime -=usecond();
+
+#pragma omp parallel
+  {
+    for(int jjj=0;jjj<nrepeat;jjj++){
+      // Line all threads up before each pass.
+#pragma omp barrier
+      if ( dag == DaggerYes ) {
+        if( this->HandOptDslash ) {
+#pragma omp for
+          for(int ss=0;ss<U._grid->oSites();ss++){
+            int sU=ss;
+            for(int s=0;s<Ls;s++){
+              int sF = s+Ls*sU;
+              Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
+            }
+          }
+        } else {
+#pragma omp for
+          for(int ss=0;ss<U._grid->oSites();ss++){
+            for(int sd=0;sd<Ls;sd++){
+              int sU=ss;
+              int sF = sd+Ls*sU;
+              Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
+            }
+          }
+        }
+      } else {
+        if( this->AsmOptDslash ) {
+          //      for(int i=0;i<1;i++){
+          //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
+          //	PerformanceCounter Counter(i);
+          //	Counter.Start();
+
+          // One loop iteration per thread slot: the slot index is split into a
+          // core number and a hyperthread number; the outer sites are divided
+          // between cores and the Ls slices between hyperthreads.
+#pragma omp for
+          for(int t=0;t<threads;t++){
+
+            int hyperthread = t%HT;
+            int core        = t/HT;
+
+            int sswork, swork, soff, ssoff;
+
+            GridThread::GetWork(nwork,core,sswork,ssoff,cores);
+            GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
+
+            for(int ss=0;ss<sswork;ss++){
+              for(int s=soff;s<soff+swork;s++){
+
+                int sU=ss+ ssoff;
+
+                if ( LebesgueOrder::UseLebesgueOrder ) {
+                  sU = lo.Reorder(sU);
+                }
+                int sF = s+Ls*sU;
+                Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
+              }
+            }
+          }
+          //      Counter.Stop();
+          //      Counter.Report();
+          //      }
+        } else if( this->HandOptDslash ) {
+#pragma omp for
+          for(int ss=0;ss<U._grid->oSites();ss++){
+            int sU=ss;
+            for(int s=0;s<Ls;s++){
+              int sF = s+Ls*sU;
+              Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
+            }
+          }
+        } else {
+#pragma omp for
+          for(int ss=0;ss<U._grid->oSites();ss++){
+            int sU=ss;
+            for(int s=0;s<Ls;s++){
+              int sF = s+Ls*sU;
+              Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
+            }
+          }
+        }
+      }
+    }
+  }
+  // The parallel region has joined: stop the compute timer on the calling thread.
+  dslashtime +=usecond();
+  alltime+=usecond();
+}
+
+
+// Benchmark variant of the Dhop dispatch with the 4d site index pinned to 0.
+//
+// Same structure as a normal Dhop pass (one halo exchange, then the selected
+// kernel run over the full loop nest `nrepeat` times inside one OpenMP
+// parallel region), except that every kernel call is made with sU = 0, so
+// sF only ranges over the Ls slices of that single site. All iterations
+// therefore write the same `out` entries: the result is not a valid Dhop.
+// NOTE(review): presumably this keeps the working set cache-resident so the
+// kernel can be timed without memory traffic - confirm with the author.
+//
+//   st  : stencil; supplies HaloExchangeBegin/Complete and comm_buf
+//   lo  : unused in this variant (no reordering is applied since sU is fixed)
+//   U   : doubled gauge field; U._grid->oSites() gives the loop trip count
+//   in  : source fermion field
+//   out : fermion field handed to the kernels as their output on every pass
+//   dag : DaggerYes selects the daggered kernels, anything else the plain ones
+//
+// Timers: alltime brackets the whole call, commtime the halo exchange, and
+// dslashtime the complete repeated compute region (all `nrepeat` passes).
+template<class Impl>
+void WilsonFermion5D<Impl>::DhopInternalL1bench(StencilImpl & st, LebesgueOrder &lo,
+                                                DoubledGaugeField & U,
+                                                const FermionField &in, FermionField &out,int dag)
+{
+  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
+  alltime-=usecond();
+  Compressor compressor(dag);
+
+  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
+  int threads = GridThread::GetThreads();
+  int HT      = GridThread::GetHyperThreads();
+  int cores   = GridThread::GetCores();
+  int nwork   = U._grid->oSites();
+
+  // Number of times the compute phase is repeated.
+  const int nrepeat = 100;
+
+  // Communication happens once, up front; only the compute is repeated.
+  commtime -=usecond();
+  auto handle = st.HaloExchangeBegin(in,compressor);
+  st.HaloExchangeComplete(handle);
+  commtime +=usecond();
+
+  // No work happens between these two calls; the pair is retained as-is.
+  jointime -=usecond();
+  jointime +=usecond();
+
+  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
+  // Note loop ordering and data layout.
+  // Designed to create
+  // - per thread reuse in L1 cache for U
+  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
+
+  // Start the compute timer exactly once, before the team is created.
+  // Starting it inside the parallel region made every thread update the shared
+  // counter on every pass (a data race) while it was stopped only once below.
+  dslashtime -=usecond();
+
+#pragma omp parallel
+  {
+    for(int jjj=0;jjj<nrepeat;jjj++){
+      // Line all threads up before each pass.
+#pragma omp barrier
+      if ( dag == DaggerYes ) {
+        if( this->HandOptDslash ) {
+#pragma omp for
+          for(int ss=0;ss<U._grid->oSites();ss++){
+            int sU=0;   // site index deliberately fixed; ss only sets the trip count
+            for(int s=0;s<Ls;s++){
+              int sF = s+Ls*sU;
+              Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
+            }
+          }
+        } else {
+#pragma omp for
+          for(int ss=0;ss<U._grid->oSites();ss++){
+            for(int sd=0;sd<Ls;sd++){
+              int sU=0;
+              int sF = sd+Ls*sU;
+              Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
+            }
+          }
+        }
+      } else {
+        if( this->AsmOptDslash ) {
+          //      for(int i=0;i<1;i++){
+          //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
+          //	PerformanceCounter Counter(i);
+          //	Counter.Start();
+
+          // One loop iteration per thread slot: the slot index is split into a
+          // core number and a hyperthread number. The per-core site share only
+          // determines how many kernel calls are made; the site itself stays 0.
+#pragma omp for
+          for(int t=0;t<threads;t++){
+
+            int hyperthread = t%HT;
+            int core        = t/HT;
+
+            int sswork, swork, soff, ssoff;
+
+            GridThread::GetWork(nwork,core,sswork,ssoff,cores);
+            GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
+
+            for(int ss=0;ss<sswork;ss++){
+              for(int s=soff;s<soff+swork;s++){
+
+                int sU=0;
+                int sF = s+Ls*sU;
+                Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
+              }
+            }
+          }
+          //      Counter.Stop();
+          //      Counter.Report();
+          //      }
+        } else if( this->HandOptDslash ) {
+#pragma omp for
+          for(int ss=0;ss<U._grid->oSites();ss++){
+            int sU=0;
+            for(int s=0;s<Ls;s++){
+              int sF = s+Ls*sU;
+              Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
+            }
+          }
+        } else {
+#pragma omp for
+          for(int ss=0;ss<U._grid->oSites();ss++){
+            int sU=0;
+            for(int s=0;s<Ls;s++){
+              int sF = s+Ls*sU;
+              Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
+            }
+          }
+        }
+      }
+    }
+  }
+  // The parallel region has joined: stop the compute timer on the calling thread.
+  dslashtime +=usecond();
+  alltime+=usecond();
+}
+
+
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo,
 						     DoubledGaugeField & U,
--- a/lib/qcd/action/fermion/WilsonFermion5D.h
+++ b/lib/qcd/action/fermion/WilsonFermion5D.h
@@ -1,3 +1,4 @@
+
    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 
@@ -120,6 +121,20 @@ namespace Grid {
 			FermionField &out,
 			int dag);

+      void DhopInternalOMPbench(StencilImpl & st,
+				LebesgueOrder &lo,
+				DoubledGaugeField &U,
+				const FermionField &in, 
+				FermionField &out,
+				int dag);
+
+      void DhopInternalL1bench(StencilImpl & st,
+				LebesgueOrder &lo,
+				DoubledGaugeField &U,
+				const FermionField &in, 
+				FermionField &out,
+				int dag);
+
      void DhopInternalCommsThenCompute(StencilImpl & st,
 			LebesgueOrder &lo,
 			DoubledGaugeField &U,
@@ -148,7 +163,7 @@ namespace Grid {
      ///////////////////////////////////////////////////////////////
      // Data members require to support the functionality
      ///////////////////////////////////////////////////////////////
-    protected:
+    public:

      // Add these to the support from Wilson
      GridBase *_FourDimGrid;
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -32,81 +32,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 #include <simd/Intel512wilson.h>

-#undef VLOAD
-#undef VSTORE
-#undef VMUL
-#undef VMADD
-#undef ZEND
-#undef ZLOAD
-#undef ZMUL
-#undef ZMADD
-#undef VZERO
-#undef VTIMESI
-#undef VTIMESMINUSI
-#undef VMOVIDUP 
-#undef VMOVRDUP 
-#undef VMADDSUB
-#undef VSHUF
+#include <simd/Intel512single.h>

-#define VZERO(A)                  VZEROf(A)
-#define VMOV(A,B)                 VMOVf(A,B)
-#define VLOAD(OFF,PTR,DEST)       VLOADf(OFF,PTR,DEST)
-#define VSTORE(OFF,PTR,SRC)       VSTOREf(OFF,PTR,SRC)
-
-#define VADD(A,B,C)               VADDf(A,B,C)
-#define VSUB(A,B,C)               VSUBf(A,B,C)
-#define VMUL(Uri,Uir,Chi)  VMULf(Uri,Uir,Chi)
-#define VMADD(Uri,Uir,Chi) VMADDf(Uri,Uir,Chi)
-
-#define VTIMESI(A,B,C)            VTIMESIf(A,B,C)
-#define VTIMESMINUSI(A,B,C)       VTIMESMINUSIf(A,B,C)
-#define VACCTIMESI(A,B,C)         VACCTIMESIf(A,B,C)
-#define VACCTIMESMINUSI(A,B,C)    VACCTIMESMINUSIf(A,B,C)
-
-#define VTIMESI0(A,B,C)            VTIMESI0f(A,B,C)
-#define VTIMESMINUSI0(A,B,C)       VTIMESMINUSI0f(A,B,C)
-#define VACCTIMESI0(A,B,C)         VACCTIMESI0f(A,B,C)
-#define VACCTIMESMINUSI0(A,B,C)    VACCTIMESMINUSI0f(A,B,C)
-
-#define VTIMESI1(A,B,C)            VTIMESI1f(A,B,C)
-#define VTIMESMINUSI1(A,B,C)       VTIMESMINUSI1f(A,B,C)
-#define VACCTIMESI1(A,B,C)         VACCTIMESI1f(A,B,C)
-#define VACCTIMESMINUSI1(A,B,C)    VACCTIMESMINUSI1f(A,B,C)
-
-#define VTIMESI2(A,B,C)            VTIMESI2f(A,B,C)
-#define VTIMESMINUSI2(A,B,C)       VTIMESMINUSI2f(A,B,C)
-#define VACCTIMESI2(A,B,C)         VACCTIMESI2f(A,B,C)
-#define VACCTIMESMINUSI2(A,B,C)    VACCTIMESMINUSI2f(A,B,C)
-
-#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMf(A,ACC,O,P)
-#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMf(A,ACC,O,P)
-#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
-#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
-
-#define VPERM0(A,B)               VPERM0f(A,B)
-#define VPERM1(A,B)               VPERM1f(A,B)
-#define VPERM2(A,B)               VPERM2f(A,B)
-#define VPERM3(A,B)               VPERM3f(A,B)
-#define VSHUFMEM(OFF,A,DEST)      VSHUFMEMf(OFF,A,DEST)
-
-#define ZEND1(A,B,C)               ZEND1f(A,B,C)
-#define ZEND2(A,B,C)               ZEND2f(A,B,C)
-#define ZLOAD(A,B,C,D)            ZLOADf(A,B,C,D)
-#define ZMUL(A,B,C,D,E)           ZMULf(A,B,C,D,E)
-#define ZMADD(A,B,C,D,E)          ZMADDf(A,B,C,D,E)
-
-#define ZMUL(A,B,C,D,E)           ZMULf(A,B,C,D,E)
-#define ZMADD(A,B,C,D,E)          ZMADDf(A,B,C,D,E)
-
-#define VADDMEM(O,A,B,C)            VADDMEMf(O,A,B,C)
-#define VSUBMEM(O,A,B,C)            VSUBMEMf(O,A,B,C)
-#define VMOVIDUP(A,B,C) VMOVIDUPf(A,B,C)
-#define VMOVRDUP(A,B,C) VMOVRDUPf(A,B,C)
-#define VMADDSUB(A,B,accum) VMADDSUBf(A,B,accum) 
-#define VSHUF(A,B) VSHUFf(A,B)
-
-#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
-#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 

 namespace Grid {
 namespace QCD {
@@ -136,26 +63,6 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField

  SE=st.GetEntry(ptype,Xm,ss);

-#if 0
-  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
-  else               pf=(void *)&pbuf[SE->_offset];
-
-  LOAD64(%r9,pf);
-  __asm__( 
-	  VPREFETCH(0,%r9)
-	  VPREFETCH(1,%r9)
-	  VPREFETCH(2,%r9)
-	  VPREFETCH(3,%r9)
-	  VPREFETCH(4,%r9)
-	  VPREFETCH(5,%r9)
-	  VPREFETCH(6,%r9)
-	  VPREFETCH(7,%r9)
-	  VPREFETCH(8,%r9)
-	  VPREFETCH(9,%r9)
-	  VPREFETCH(10,%r9)
-	  VPREFETCH(11,%r9) );
-#endif
-
  // Xm
  offset = SE->_offset;
  local  = SE->_is_local;
@@ -322,8 +229,6 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
  offset = SE->_offset;
  local  = SE->_is_local;
    
-  //  PREFETCH_R(A);
-
  // Prefetch
  SE=st.GetEntry(ptype,Xm,(ss+1)%osites);
  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
--- a/lib/simd/Intel512double.h
+++ b/lib/simd/Intel512double.h
@@ -0,0 +1,135 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/simd/Intel512double.h
+
+    Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
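+// No include guard on purpose: this header may be included repeatedly; every macro is #undef'd before it is redefined (here onto the d-suffixed variants).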
+#undef VZERO
+#undef VMOV
+#undef VLOAD
+#undef VSTORE
+#define VZERO(A)                  VZEROd(A)
+#define VMOV(A,B)                 VMOVd(A,B)
+#define VLOAD(OFF,PTR,DEST)       VLOADd(OFF,PTR,DEST)
+#define VSTORE(OFF,PTR,SRC)       VSTOREd(OFF,PTR,SRC)
+
+#undef VADD
+#undef VSUB
+#undef VMUL
+#undef VMADD
+#define VADD(A,B,C)               VADDd(A,B,C)
+#define VSUB(A,B,C)               VSUBd(A,B,C)
+#define VMUL(Uri,Uir,Chi)         VMULd(Uri,Uir,Chi)
+#define VMADD(Uri,Uir,Chi)        VMADDd(Uri,Uir,Chi)
+
+
+#undef VTIMESI
+#undef VTIMESI0 
+#undef VTIMESI1
+#undef VTIMESI2 
+#define VTIMESI(A,B,C)                 VTIMESId(A,B,C)
+#define VTIMESI0(A,B,C)                VTIMESI0d(A,B,C)
+#define VTIMESI1(A,B,C)                VTIMESI1d(A,B,C)
+#define VTIMESI2(A,B,C)                VTIMESI2d(A,B,C)
+
+#undef VTIMESMINUSI
+#undef VTIMESMINUSI0
+#undef VTIMESMINUSI1
+#undef VTIMESMINUSI2
+#define VTIMESMINUSI(A,B,C)            VTIMESMINUSId(A,B,C)
+#define VTIMESMINUSI0(A,B,C)           VTIMESMINUSI0d(A,B,C)
+#define VTIMESMINUSI1(A,B,C)           VTIMESMINUSI1d(A,B,C)
+#define VTIMESMINUSI2(A,B,C)           VTIMESMINUSI2d(A,B,C)
+
+#undef VACCTIMESI
+#undef VACCTIMESI0
+#undef VACCTIMESI1
+#undef VACCTIMESI2
+#define VACCTIMESI(A,B,C)         VACCTIMESId(A,B,C)
+#define VACCTIMESI0(A,B,C)             VACCTIMESI0d(A,B,C)
+#define VACCTIMESI1(A,B,C)             VACCTIMESI1d(A,B,C)
+#define VACCTIMESI2(A,B,C)             VACCTIMESI2d(A,B,C)
+
+#undef VACCTIMESMINUSI
+#undef VACCTIMESMINUSI0
+#undef VACCTIMESMINUSI1
+#undef VACCTIMESMINUSI2
+#define VACCTIMESMINUSI(A,B,C)    VACCTIMESMINUSId(A,B,C)
+#define VACCTIMESMINUSI0(A,B,C)        VACCTIMESMINUSI0d(A,B,C)
+#define VACCTIMESMINUSI1(A,B,C)        VACCTIMESMINUSI1d(A,B,C)
+#define VACCTIMESMINUSI2(A,B,C)        VACCTIMESMINUSI2d(A,B,C)
+
+#undef VACCTIMESI1MEM
+#undef VACCTIMESI2MEM
+#define VACCTIMESI1MEM(A,ACC,O,P)      VACCTIMESI1MEMd(A,ACC,O,P)
+#define VACCTIMESI2MEM(A,ACC,O,P)      VACCTIMESI2MEMd(A,ACC,O,P)
+
+#undef VACCTIMESMINUSI1MEM
+#undef VACCTIMESMINUSI2MEM
+#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMd(A,ACC,O,P)
+#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMd(A,ACC,O,P)
+
+#undef VPERM0
+#undef VPERM1
+#undef VPERM2
+#undef VPERM3
+#define VPERM0(A,B)               VPERM0d(A,B)
+#define VPERM1(A,B)               VPERM1d(A,B)
+#define VPERM2(A,B)               VPERM2d(A,B)
+#define VPERM3(A,B)               VPERM3d(A,B)
+
+#undef VSHUFMEM
+#undef VADDMEM
+#undef VSUBMEM
+#define VSHUFMEM(OFF,A,DEST)      VSHUFMEMd(OFF,A,DEST)
+#define VADDMEM(O,A,B,C)                                 VADDMEMd(O,A,B,C)
+#define VSUBMEM(O,A,B,C)                                 VSUBMEMd(O,A,B,C)
+
+#undef VMOVIDUP
+#undef VMOVRDUP
+#undef VMADDSUB
+#undef VSHUF
+#define VMOVIDUP(A,B,C)                                  VMOVIDUPd(A,B,C)
+#define VMOVRDUP(A,B,C)                                  VMOVRDUPd(A,B,C)
+#define VMADDSUB(A,B,accum)                              VMADDSUBd(A,B,accum) 
+#define VSHUF(A,B)                                       VSHUFd(A,B)
+
+
+#undef ZEND1
+#undef ZEND2
+#undef ZLOAD
+#undef ZMUL
+#undef ZMADD
+#undef ZMULMEM2SP
+#undef ZMADDMEM2SP
+
+#define ZEND1(A,B,C)                                     ZEND1d(A,B,C)
+#define ZEND2(A,B,C)                                     ZEND2d(A,B,C)
+#define ZLOAD(A,B,C,D)                                   ZLOADd(A,B,C,D)
+#define ZMUL(A,B,C,D,E)                                  ZMULd(A,B,C,D,E)
+#define ZMADD(A,B,C,D,E)                                 ZMADDd(A,B,C,D,E)
+#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
+#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
+
--- a/lib/simd/Intel512single.h
+++ b/lib/simd/Intel512single.h
@@ -0,0 +1,135 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/simd/Intel512single.h
+
+    Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+// No include guard on purpose: this header may be included repeatedly; every macro is #undef'd before it is redefined (here onto the f-suffixed variants).
+#undef VZERO
+#undef VMOV
+#undef VLOAD
+#undef VSTORE
+#define VZERO(A)                  VZEROf(A)
+#define VMOV(A,B)                 VMOVf(A,B)
+#define VLOAD(OFF,PTR,DEST)       VLOADf(OFF,PTR,DEST)
+#define VSTORE(OFF,PTR,SRC)       VSTOREf(OFF,PTR,SRC)
+
+#undef VADD
+#undef VSUB
+#undef VMUL
+#undef VMADD
+#define VADD(A,B,C)               VADDf(A,B,C)
+#define VSUB(A,B,C)               VSUBf(A,B,C)
+#define VMUL(Uri,Uir,Chi)         VMULf(Uri,Uir,Chi)
+#define VMADD(Uri,Uir,Chi)        VMADDf(Uri,Uir,Chi)
+
+
+#undef VTIMESI
+#undef VTIMESI0 
+#undef VTIMESI1
+#undef VTIMESI2 
+#define VTIMESI(A,B,C)                 VTIMESIf(A,B,C)
+#define VTIMESI0(A,B,C)                VTIMESI0f(A,B,C)
+#define VTIMESI1(A,B,C)                VTIMESI1f(A,B,C)
+#define VTIMESI2(A,B,C)                VTIMESI2f(A,B,C)
+
+#undef VTIMESMINUSI
+#undef VTIMESMINUSI0
+#undef VTIMESMINUSI1
+#undef VTIMESMINUSI2
+#define VTIMESMINUSI(A,B,C)            VTIMESMINUSIf(A,B,C)
+#define VTIMESMINUSI0(A,B,C)           VTIMESMINUSI0f(A,B,C)
+#define VTIMESMINUSI1(A,B,C)           VTIMESMINUSI1f(A,B,C)
+#define VTIMESMINUSI2(A,B,C)           VTIMESMINUSI2f(A,B,C)
+
+#undef VACCTIMESI
+#undef VACCTIMESI0
+#undef VACCTIMESI1
+#undef VACCTIMESI2
+#define VACCTIMESI(A,B,C)         VACCTIMESIf(A,B,C)
+#define VACCTIMESI0(A,B,C)             VACCTIMESI0f(A,B,C)
+#define VACCTIMESI1(A,B,C)             VACCTIMESI1f(A,B,C)
+#define VACCTIMESI2(A,B,C)             VACCTIMESI2f(A,B,C)
+
+#undef VACCTIMESMINUSI
+#undef VACCTIMESMINUSI0
+#undef VACCTIMESMINUSI1
+#undef VACCTIMESMINUSI2
+#define VACCTIMESMINUSI(A,B,C)    VACCTIMESMINUSIf(A,B,C)
+#define VACCTIMESMINUSI0(A,B,C)        VACCTIMESMINUSI0f(A,B,C)
+#define VACCTIMESMINUSI1(A,B,C)        VACCTIMESMINUSI1f(A,B,C)
+#define VACCTIMESMINUSI2(A,B,C)        VACCTIMESMINUSI2f(A,B,C)
+
+#undef VACCTIMESI1MEM
+#undef VACCTIMESI2MEM
+#define VACCTIMESI1MEM(A,ACC,O,P)      VACCTIMESI1MEMf(A,ACC,O,P)
+#define VACCTIMESI2MEM(A,ACC,O,P)      VACCTIMESI2MEMf(A,ACC,O,P)
+
+#undef VACCTIMESMINUSI1MEM
+#undef VACCTIMESMINUSI2MEM
+#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
+#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
+
+#undef VPERM0
+#undef VPERM1
+#undef VPERM2
+#undef VPERM3
+#define VPERM0(A,B)               VPERM0f(A,B)
+#define VPERM1(A,B)               VPERM1f(A,B)
+#define VPERM2(A,B)               VPERM2f(A,B)
+#define VPERM3(A,B)               VPERM3f(A,B)
+
+#undef VSHUFMEM
+#undef VADDMEM
+#undef VSUBMEM
+#define VSHUFMEM(OFF,A,DEST)      VSHUFMEMf(OFF,A,DEST)
+#define VADDMEM(O,A,B,C)                                 VADDMEMf(O,A,B,C)
+#define VSUBMEM(O,A,B,C)                                 VSUBMEMf(O,A,B,C)
+
+#undef VMOVIDUP
+#undef VMOVRDUP
+#undef VMADDSUB
+#undef VSHUF
+#define VMOVIDUP(A,B,C)                                  VMOVIDUPf(A,B,C)
+#define VMOVRDUP(A,B,C)                                  VMOVRDUPf(A,B,C)
+#define VMADDSUB(A,B,accum)                              VMADDSUBf(A,B,accum) 
+#define VSHUF(A,B)                                       VSHUFf(A,B)
+
+
+#undef ZEND1
+#undef ZEND2
+#undef ZLOAD
+#undef ZMUL
+#undef ZMADD
+#undef ZMULMEM2SP
+#undef ZMADDMEM2SP
+
+#define ZEND1(A,B,C)                                     ZEND1f(A,B,C)
+#define ZEND2(A,B,C)                                     ZEND2f(A,B,C)
+#define ZLOAD(A,B,C,D)                                   ZLOADf(A,B,C,D)
+#define ZMUL(A,B,C,D,E)                                  ZMULf(A,B,C,D,E)
+#define ZMADD(A,B,C,D,E)                                 ZMADDf(A,B,C,D,E)
+#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
+#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
+
--- a/lib/simd/Intel512wilson.h
+++ b/lib/simd/Intel512wilson.h
@@ -201,7 +201,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 // Enables to lift ALL loads earlier by a few cycles and alleviate OoO pressure if needed.
 // KNL is DUAL issue for FP, and lifting these loads is potentially important.
 // Need detailed profile data to be sure.
-
+#if 0
 #define PREFETCH_U(A) \
  LOAD64(%r8,&U._odata[sU](A)) \
  __asm__ (		       \
@@ -230,7 +230,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  VPREFETCHW(9,%r8)	       \
  VPREFETCHW(10,%r8)	       \
  VPREFETCHW(11,%r8)	       );
-
+#endif
 
 #define MULT_2SPIN_DIR(A) MULT_2SPIN(&U._odata[sU](A))

@@ -244,6 +244,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN_PFZM(&U._odata[sU](A),p)
 #define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN_PFTM(&U._odata[sU](A),p)

+#if 0
 #define MULT_2SPIN_UNOPT(ptr)				\
 	   LOAD64(%r8,ptr)			\
  __asm__ (					\
@@ -289,6 +290,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 	   ZEND2(UChi_11,Z3,Chi_10)			\
 	   ZEND2(UChi_02,Z4,Chi_02)			\
 	   ZEND2(UChi_12,Z5,Chi_12)	     );
+#endif

 #define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr)
 #define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr)
@@ -299,10 +301,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr)
 #define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr)

-#define MULT_2SPINa(ptr)        MULT_2SPIN_PF(ptr,ptr,VPREFETCHG);
-#define MULT_2SPIN(ptr) MULT_ADDSUB_2SPIN(ptr);
-
+// MULT_2SPINa(ptr)        MULT_2SPIN_PF(ptr,ptr,VPREFETCHG);

+#if 0
 #define MULT_2SPIN_PF(ptr,pf,VPF)			\
 	   LOAD64(%r8,ptr)			\
 	   LOAD64(%r9,pf)			\
@@ -343,8 +344,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 	   ZEND2(UChi_02,Z4,Chi_02)			\
 	   VPF(11,%r9)						\
 	   ZEND2(UChi_12,Z5,Chi_12)	     );
+#endif

-
+#if 0 
 #define MULT_2SPIN_PFNONE(ptr,pf,VPF)			\
 	   LOAD64(%r8,ptr)			\
 	   LOAD64(%r9,pf)			\
@@ -364,7 +366,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 	   VPF(9,%r9)						\
 	   VPF(10,%r9)						\
 	   VPF(11,%r9)						);
-
+#endif

 // Pretty much Perfectly Pipelined

@@ -720,7 +722,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  VSUB(UChi_11,result_31,result_31)\
  VSUB(UChi_12,result_32,result_32) );

-#define PREFETCH_CHIMU(A) 
+//define PREFETCH_CHIMU(A) 

 #define PERMUTE_DIR0 __asm__ ( 	\
  VPERM0(Chi_00,Chi_00)	\
@@ -813,4 +815,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
           VMADDSUB(Z5,Chi_12,UChi_12)\
                                                );

+#define MULT_2SPIN(ptr) MULT_ADDSUB_2SPIN(ptr)
+
 #endif