Merge remote-tracking branch 'origin/master' into ckelly-dec12-2015

2026-07-17 15:43:27 +01:00 · 2016-04-06 13:57:28 -04:00
parent af9c8d1372 650e02b344
commit a646260e82
11 changed files with 728 additions and 195 deletions
@@ -0,0 +1,171 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_zmm.cc
    Copyright (C) 2015
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid.h>
 #include <PerfCount.h>
 using namespace Grid;
 using namespace Grid::QCD;
 int bench(std::ofstream &os, std::vector<int> &latt4,int Ls);
 int main(int argc,char **argv)
 {
  Grid_init(&argc,&argv);
  std::ofstream os("zmm.dat");
  os << "#V Ls Lxy Lzt C++ Asm OMP L1 " <<std::endl;
  for(int L=4;L<=32;L+=4){
    for(int m=1;m<=2;m++){
      for(int Ls=8;Ls<=16;Ls+=8){
 	std::vector<int> grid({L,L,m*L,m*L});
 	for(int i=0;i<4;i++) { 
 	  std::cout << grid[i]<<"x";
 	}
 	std::cout << Ls<<std::endl;
 	bench(os,grid,Ls);
      }
    }
  }
 }
 int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
 {
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  int threads = GridThread::GetThreads();
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
  GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4);
  LatticeFermion src (FGrid);
  LatticeFermion tmp (FGrid);
  LatticeFermion srce(FrbGrid);
  LatticeFermion resulto(FrbGrid); resulto=zero;
  LatticeFermion resulta(FrbGrid); resulta=zero;
  LatticeFermion junk(FrbGrid); junk=zero;
  LatticeFermion diff(FrbGrid); 
  LatticeGaugeField Umu(UGrid);
  double mfc, mfa, mfo, mfl1;
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
  random(RNG5,src);
 #if 1
  random(RNG4,Umu);
 #else
  int mmu=2;
  std::vector<LatticeColourMatrix> U(4,UGrid);
  for(int mu=0;mu<Nd;mu++){
    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
    if ( mu!=mmu ) U[mu] = zero;
    if ( mu==mmu ) U[mu] = 1.0;
    PokeIndex<LorentzIndex>(Umu,U[mu],mu);
  }
 #endif
 pickCheckerboard(Even,srce,src);
  RealD mass=0.1;
  RealD M5  =1.8;
  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
  int ncall=50;
  double t0=usecond();
  for(int i=0;i<ncall;i++){
    Dw.DhopOE(srce,resulto,0);
  }
  double t1=usecond();
  double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
  double flops=1344*volume/2;
  mfc = flops*ncall/(t1-t0);
  std::cout<<GridLogMessage << "Called C++ Dw"<< " mflop/s =   "<< mfc<<std::endl;
  QCD::WilsonFermion5DStatic::AsmOptDslash=1;
  t0=usecond();
  for(int i=0;i<ncall;i++){
    Dw.DhopOE(srce,resulta,0);
  }
  t1=usecond();
  mfa = flops*ncall/(t1-t0);
  std::cout<<GridLogMessage << "Called ASM Dw"<< " mflop/s =   "<< mfa<<std::endl;
  int dag=DaggerNo;
  t0=usecond();
  for(int i=0;i<1;i++){
    Dw.DhopInternalOMPbench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
  }
  t1=usecond();
  mfo = flops*100/(t1-t0);
  std::cout<<GridLogMessage << "Called ASM-OMP Dw"<< " mflop/s =   "<< mfo<<std::endl;
  t0=usecond();
  for(int i=0;i<1;i++){
    Dw.DhopInternalL1bench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
  }
  t1=usecond();
  mfl1= flops*100/(t1-t0);
  std::cout<<GridLogMessage << "Called ASM-L1 Dw"<< " mflop/s =   "<< mfl1<<std::endl;
  os << latt4[0]*latt4[1]*latt4[2]*latt4[3]<< " "<<Ls<<" "<< latt4[0] <<" " <<latt4[2]<< " "
     << mfc<<" "
     << mfa<<" "
     << mfo<<" "
     << mfl1<<std::endl;
 #if 0
  for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
    Dw.DhopOE(srce,resulta,0);
    PerformanceCounter Counter(i);
    Counter.Start();
    Dw.DhopOE(srce,resulta,0);
    Counter.Stop();
    Counter.Report();
  }
 #endif
  //resulta = (-0.5) * resulta;
  diff = resulto-resulta;
  std::cout<<GridLogMessage << "diff "<< norm2(diff)<<std::endl;
  std::cout<<std::endl;
 }
@@ -1,5 +1,5 @@
-bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson
+bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm
 Benchmark_comms_SOURCES=Benchmark_comms.cc
@@ -25,3 +25,7 @@ Benchmark_su3_LDADD=-lGrid
 Benchmark_wilson_SOURCES=Benchmark_wilson.cc
 Benchmark_wilson_LDADD=-lGrid
 Benchmark_zmm_SOURCES=Benchmark_zmm.cc
 Benchmark_zmm_LDADD=-lGrid
@@ -418,6 +418,244 @@ PARALLEL_FOR_LOOP
  alltime+=usecond();
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder &lo,
 						 DoubledGaugeField & U,
 						 const FermionField &in, FermionField &out,int dag)
 {
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
  alltime-=usecond();
  Compressor compressor(dag);
  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
  int threads = GridThread::GetThreads();
  int HT      = GridThread::GetHyperThreads();
  int cores   = GridThread::GetCores();
  int nwork = U._grid->oSites();
  commtime -=usecond();
  auto handle = st.HaloExchangeBegin(in,compressor);
  st.HaloExchangeComplete(handle);
  commtime +=usecond();
  jointime -=usecond();
  jointime +=usecond();
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
  // Not loop ordering and data layout.
  // Designed to create 
  // - per thread reuse in L1 cache for U
  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
 #pragma omp parallel 
  {
  for(int jjj=0;jjj<100;jjj++){
 #pragma omp barrier
  dslashtime -=usecond();
  if ( dag == DaggerYes ) {
    if( this->HandOptDslash ) {
 #pragma omp for
      for(int ss=0;ss<U._grid->oSites();ss++){
 	int sU=ss;
 	for(int s=0;s<Ls;s++){
 	  int sF = s+Ls*sU;
 	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 	  }
      }
    } else { 
 #pragma omp for
      for(int ss=0;ss<U._grid->oSites();ss++){
 	{
 	  int sd;
 	  for(sd=0;sd<Ls;sd++){
 	    int sU=ss;
 	    int sF = sd+Ls*sU;
 	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 	  }
 	}
      }
    }
  } else {
    if( this->AsmOptDslash ) {
      //      for(int i=0;i<1;i++){
      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
      //	PerformanceCounter Counter(i);
      //	Counter.Start();
 #pragma omp for
      for(int t=0;t<threads;t++){
 	int hyperthread = t%HT;
 	int core        = t/HT;
        int sswork, swork,soff,ssoff,  sU,sF;
 	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
 	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
 	for(int ss=0;ss<sswork;ss++){
 	  for(int s=soff;s<soff+swork;s++){
 	    sU=ss+ ssoff;
 	    if ( LebesgueOrder::UseLebesgueOrder ) {
 	      sU = lo.Reorder(sU);
 	    }
 	    sF = s+Ls*sU;
 	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 	  }
 	}
      }
      //      Counter.Stop();
      //      Counter.Report();
      //      }
    } else if( this->HandOptDslash ) {
 #pragma omp for
      for(int ss=0;ss<U._grid->oSites();ss++){
 	int sU=ss;
 	for(int s=0;s<Ls;s++){
 	  int sF = s+Ls*sU;
 	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 	}
      }
    } else { 
 #pragma omp for
      for(int ss=0;ss<U._grid->oSites();ss++){
 	int sU=ss;
 	for(int s=0;s<Ls;s++){
 	  int sF = s+Ls*sU; 
 	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 	}
      }
    }
  }
  }
  }
  dslashtime +=usecond();
  alltime+=usecond();
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalL1bench(StencilImpl & st, LebesgueOrder &lo,
 						DoubledGaugeField & U,
 						const FermionField &in, FermionField &out,int dag)
 {
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
  alltime-=usecond();
  Compressor compressor(dag);
  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
  int threads = GridThread::GetThreads();
  int HT      = GridThread::GetHyperThreads();
  int cores   = GridThread::GetCores();
  int nwork = U._grid->oSites();
  commtime -=usecond();
  auto handle = st.HaloExchangeBegin(in,compressor);
  st.HaloExchangeComplete(handle);
  commtime +=usecond();
  jointime -=usecond();
  jointime +=usecond();
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
  // Not loop ordering and data layout.
  // Designed to create 
  // - per thread reuse in L1 cache for U
  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
 #pragma omp parallel 
  {
  for(int jjj=0;jjj<100;jjj++){
 #pragma omp barrier
  dslashtime -=usecond();
  if ( dag == DaggerYes ) {
    if( this->HandOptDslash ) {
 #pragma omp for
      for(int ss=0;ss<U._grid->oSites();ss++){
 	int sU=0;
 	for(int s=0;s<Ls;s++){
 	  int sF = s+Ls*sU;
 	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 	  }
      }
    } else { 
 #pragma omp for
      for(int ss=0;ss<U._grid->oSites();ss++){
 	{
 	  int sd;
 	  for(sd=0;sd<Ls;sd++){
 	    int sU=0;
 	    int sF = sd+Ls*sU;
 	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 	  }
 	}
      }
    }
  } else {
    if( this->AsmOptDslash ) {
      //      for(int i=0;i<1;i++){
      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
      //	PerformanceCounter Counter(i);
      //	Counter.Start();
 #pragma omp for
      for(int t=0;t<threads;t++){
 	int hyperthread = t%HT;
 	int core        = t/HT;
        int sswork, swork,soff,ssoff,  sU,sF;
 	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
 	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
 	for(int ss=0;ss<sswork;ss++){
 	  for(int s=soff;s<soff+swork;s++){
 	    sU=0;
 	    sF = s+Ls*sU;
 	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 	  }
 	}
      }
      //      Counter.Stop();
      //      Counter.Report();
      //      }
    } else if( this->HandOptDslash ) {
 #pragma omp for
      for(int ss=0;ss<U._grid->oSites();ss++){
 	int sU=0;
 	for(int s=0;s<Ls;s++){
 	  int sF = s+Ls*sU;
 	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 	}
      }
    } else { 
 #pragma omp for
      for(int ss=0;ss<U._grid->oSites();ss++){
 	int sU=0;
 	for(int s=0;s<Ls;s++){
 	  int sF = s+Ls*sU; 
 	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 	}
      }
    }
  }
  }
  }
  dslashtime +=usecond();
  alltime+=usecond();
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo,
 						     DoubledGaugeField & U,
@@ -1,3 +1,4 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
@@ -120,6 +121,20 @@ namespace Grid {
 			FermionField &out,
 			int dag);
      void DhopInternalOMPbench(StencilImpl & st,
 				LebesgueOrder &lo,
 				DoubledGaugeField &U,
 				const FermionField &in, 
 				FermionField &out,
 				int dag);
      void DhopInternalL1bench(StencilImpl & st,
 				LebesgueOrder &lo,
 				DoubledGaugeField &U,
 				const FermionField &in, 
 				FermionField &out,
 				int dag);
      void DhopInternalCommsThenCompute(StencilImpl & st,
 			LebesgueOrder &lo,
 			DoubledGaugeField &U,
@@ -148,7 +163,7 @@ namespace Grid {
      ///////////////////////////////////////////////////////////////
      // Data members require to support the functionality
      ///////////////////////////////////////////////////////////////
-    protected:
+    public:
      // Add these to the support from Wilson
      GridBase *_FourDimGrid;
@@ -32,81 +32,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <simd/Intel512wilson.h>
-#undef VLOAD
+#include <simd/Intel512single.h>
 #undef VSTORE
 #undef VMUL
 #undef VMADD
 #undef ZEND
 #undef ZLOAD
 #undef ZMUL
 #undef ZMADD
 #undef VZERO
 #undef VTIMESI
 #undef VTIMESMINUSI
 #undef VMOVIDUP 
 #undef VMOVRDUP 
 #undef VMADDSUB
 #undef VSHUF
 #define VZERO(A)                  VZEROf(A)
 #define VMOV(A,B)                 VMOVf(A,B)
 #define VLOAD(OFF,PTR,DEST)       VLOADf(OFF,PTR,DEST)
 #define VSTORE(OFF,PTR,SRC)       VSTOREf(OFF,PTR,SRC)
 #define VADD(A,B,C)               VADDf(A,B,C)
 #define VSUB(A,B,C)               VSUBf(A,B,C)
 #define VMUL(Uri,Uir,Chi)  VMULf(Uri,Uir,Chi)
 #define VMADD(Uri,Uir,Chi) VMADDf(Uri,Uir,Chi)
 #define VTIMESI(A,B,C)            VTIMESIf(A,B,C)
 #define VTIMESMINUSI(A,B,C)       VTIMESMINUSIf(A,B,C)
 #define VACCTIMESI(A,B,C)         VACCTIMESIf(A,B,C)
 #define VACCTIMESMINUSI(A,B,C)    VACCTIMESMINUSIf(A,B,C)
 #define VTIMESI0(A,B,C)            VTIMESI0f(A,B,C)
 #define VTIMESMINUSI0(A,B,C)       VTIMESMINUSI0f(A,B,C)
 #define VACCTIMESI0(A,B,C)         VACCTIMESI0f(A,B,C)
 #define VACCTIMESMINUSI0(A,B,C)    VACCTIMESMINUSI0f(A,B,C)
 #define VTIMESI1(A,B,C)            VTIMESI1f(A,B,C)
 #define VTIMESMINUSI1(A,B,C)       VTIMESMINUSI1f(A,B,C)
 #define VACCTIMESI1(A,B,C)         VACCTIMESI1f(A,B,C)
 #define VACCTIMESMINUSI1(A,B,C)    VACCTIMESMINUSI1f(A,B,C)
 #define VTIMESI2(A,B,C)            VTIMESI2f(A,B,C)
 #define VTIMESMINUSI2(A,B,C)       VTIMESMINUSI2f(A,B,C)
 #define VACCTIMESI2(A,B,C)         VACCTIMESI2f(A,B,C)
 #define VACCTIMESMINUSI2(A,B,C)    VACCTIMESMINUSI2f(A,B,C)
 #define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMf(A,ACC,O,P)
 #define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMf(A,ACC,O,P)
 #define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
 #define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
 #define VPERM0(A,B)               VPERM0f(A,B)
 #define VPERM1(A,B)               VPERM1f(A,B)
 #define VPERM2(A,B)               VPERM2f(A,B)
 #define VPERM3(A,B)               VPERM3f(A,B)
 #define VSHUFMEM(OFF,A,DEST)      VSHUFMEMf(OFF,A,DEST)
 #define ZEND1(A,B,C)               ZEND1f(A,B,C)
 #define ZEND2(A,B,C)               ZEND2f(A,B,C)
 #define ZLOAD(A,B,C,D)            ZLOADf(A,B,C,D)
 #define ZMUL(A,B,C,D,E)           ZMULf(A,B,C,D,E)
 #define ZMADD(A,B,C,D,E)          ZMADDf(A,B,C,D,E)
 #define ZMUL(A,B,C,D,E)           ZMULf(A,B,C,D,E)
 #define ZMADD(A,B,C,D,E)          ZMADDf(A,B,C,D,E)
 #define VADDMEM(O,A,B,C)            VADDMEMf(O,A,B,C)
 #define VSUBMEM(O,A,B,C)            VSUBMEMf(O,A,B,C)
 #define VMOVIDUP(A,B,C) VMOVIDUPf(A,B,C)
 #define VMOVRDUP(A,B,C) VMOVRDUPf(A,B,C)
 #define VMADDSUB(A,B,accum) VMADDSUBf(A,B,accum) 
 #define VSHUF(A,B) VSHUFf(A,B)
 #define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 #define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 namespace Grid {
 namespace QCD {
@@ -136,26 +63,6 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
  SE=st.GetEntry(ptype,Xm,ss);
 #if 0
  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
  else               pf=(void *)&pbuf[SE->_offset];
  LOAD64(%r9,pf);
  __asm__( 
 	  VPREFETCH(0,%r9)
 	  VPREFETCH(1,%r9)
 	  VPREFETCH(2,%r9)
 	  VPREFETCH(3,%r9)
 	  VPREFETCH(4,%r9)
 	  VPREFETCH(5,%r9)
 	  VPREFETCH(6,%r9)
 	  VPREFETCH(7,%r9)
 	  VPREFETCH(8,%r9)
 	  VPREFETCH(9,%r9)
 	  VPREFETCH(10,%r9)
 	  VPREFETCH(11,%r9) );
 #endif
  // Xm
  offset = SE->_offset;
  local  = SE->_is_local;
@@ -322,8 +229,6 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
  offset = SE->_offset;
  local  = SE->_is_local;
  //  PREFETCH_R(A);
  // Prefetch
  SE=st.GetEntry(ptype,Xm,(ss+1)%osites);
  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
@@ -0,0 +1,135 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/simd/Avx512Asm.h
    Copyright (C) 2015
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 // No guard can be multiply included as undef clearage
 #undef VZERO
 #undef VMOV
 #undef VLOAD
 #undef VSTORE
 #define VZERO(A)                  VZEROd(A)
 #define VMOV(A,B)                 VMOVd(A,B)
 #define VLOAD(OFF,PTR,DEST)       VLOADd(OFF,PTR,DEST)
 #define VSTORE(OFF,PTR,SRC)       VSTOREd(OFF,PTR,SRC)
 #undef VADD
 #undef VSUB
 #undef VMUL
 #undef VMADD
 #define VADD(A,B,C)               VADDd(A,B,C)
 #define VSUB(A,B,C)               VSUBd(A,B,C)
 #define VMUL(Uri,Uir,Chi)         VMULd(Uri,Uir,Chi)
 #define VMADD(Uri,Uir,Chi)        VMADDd(Uri,Uir,Chi)
 #undef VTIMESI
 #undef VTIMESI0 
 #undef VTIMESI1
 #undef VTIMESI2 
 #define VTIMESI(A,B,C)                 VTIMESId(A,B,C)
 #define VTIMESI0(A,B,C)                VTIMESI0d(A,B,C)
 #define VTIMESI1(A,B,C)                VTIMESI1d(A,B,C)
 #define VTIMESI2(A,B,C)                VTIMESI2d(A,B,C)
 #undef VTIMESMINUSI
 #undef VTIMESMINUSI0
 #undef VTIMESMINUSI1
 #undef VTIMESMINUSI2
 #define VTIMESMINUSI(A,B,C)            VTIMESMINUSId(A,B,C)
 #define VTIMESMINUSI0(A,B,C)           VTIMESMINUSI0d(A,B,C)
 #define VTIMESMINUSI1(A,B,C)           VTIMESMINUSI1d(A,B,C)
 #define VTIMESMINUSI2(A,B,C)           VTIMESMINUSI2d(A,B,C)
 #undef VACCTIMESI
 #undef VACCTIMESI0
 #undef VACCTIMESI1
 #undef VACCTIMESI2
 #define VACCTIMESI(A,B,C)         VACCTIMESId(A,B,C)
 #define VACCTIMESI0(A,B,C)             VACCTIMESI0d(A,B,C)
 #define VACCTIMESI1(A,B,C)             VACCTIMESI1d(A,B,C)
 #define VACCTIMESI2(A,B,C)             VACCTIMESI2d(A,B,C)
 #undef VACCTIMESMINUSI
 #undef VACCTIMESMINUSI0
 #undef VACCTIMESMINUSI1
 #undef VACCTIMESMINUSI2
 #define VACCTIMESMINUSI(A,B,C)    VACCTIMESMINUSId(A,B,C)
 #define VACCTIMESMINUSI0(A,B,C)        VACCTIMESMINUSI0d(A,B,C)
 #define VACCTIMESMINUSI1(A,B,C)        VACCTIMESMINUSI1d(A,B,C)
 #define VACCTIMESMINUSI2(A,B,C)        VACCTIMESMINUSI2d(A,B,C)
 #undef VACCTIMESI1MEM
 #undef VACCTIMESI2MEM
 #define VACCTIMESI1MEM(A,ACC,O,P)      VACCTIMESI1MEMd(A,ACC,O,P)
 #define VACCTIMESI2MEM(A,ACC,O,P)      VACCTIMESI2MEMd(A,ACC,O,P)
 #undef VACCTIMESMINUSI1MEM
 #undef VACCTIMESMINUSI2MEM
 #define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMd(A,ACC,O,P)
 #define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMd(A,ACC,O,P)
 #undef VPERM0
 #undef VPERM1
 #undef VPERM2
 #undef VPERM3
 #define VPERM0(A,B)               VPERM0d(A,B)
 #define VPERM1(A,B)               VPERM1d(A,B)
 #define VPERM2(A,B)               VPERM2d(A,B)
 #define VPERM3(A,B)               VPERM3d(A,B)
 #undef VSHUFMEM
 #undef VADDMEM
 #undef VSUBMEM
 #define VSHUFMEM(OFF,A,DEST)      VSHUFMEMd(OFF,A,DEST)
 #define VADDMEM(O,A,B,C)                                 VADDMEMd(O,A,B,C)
 #define VSUBMEM(O,A,B,C)                                 VSUBMEMd(O,A,B,C)
 #undef VMOVIDUP
 #undef VMOVRDUP
 #undef VMADDSUB
 #undef VSHUF
 #define VMOVIDUP(A,B,C)                                  VMOVIDUPd(A,B,C)
 #define VMOVRDUP(A,B,C)                                  VMOVRDUPd(A,B,C)
 #define VMADDSUB(A,B,accum)                              VMADDSUBd(A,B,accum) 
 #define VSHUF(A,B)                                       VSHUFd(A,B)
 #undef ZEND1
 #undef ZEND2
 #undef ZLOAD
 #undef ZMUL
 #undef ZMADD
 #undef ZMULMEM2SP
 #undef ZMADDMEM2SP
 #define ZEND1(A,B,C)                                     ZEND1d(A,B,C)
 #define ZEND2(A,B,C)                                     ZEND2d(A,B,C)
 #define ZLOAD(A,B,C,D)                                   ZLOADd(A,B,C,D)
 #define ZMUL(A,B,C,D,E)                                  ZMULd(A,B,C,D,E)
 #define ZMADD(A,B,C,D,E)                                 ZMADDd(A,B,C,D,E)
 #define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 #define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
@@ -0,0 +1,135 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/simd/Avx512Asm.h
    Copyright (C) 2015
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 // No guard can be multiply included as undef clearge of macros
 #undef VZERO
 #undef VMOV
 #undef VLOAD
 #undef VSTORE
 #define VZERO(A)                  VZEROf(A)
 #define VMOV(A,B)                 VMOVf(A,B)
 #define VLOAD(OFF,PTR,DEST)       VLOADf(OFF,PTR,DEST)
 #define VSTORE(OFF,PTR,SRC)       VSTOREf(OFF,PTR,SRC)
 #undef VADD
 #undef VSUB
 #undef VMUL
 #undef VMADD
 #define VADD(A,B,C)               VADDf(A,B,C)
 #define VSUB(A,B,C)               VSUBf(A,B,C)
 #define VMUL(Uri,Uir,Chi)         VMULf(Uri,Uir,Chi)
 #define VMADD(Uri,Uir,Chi)        VMADDf(Uri,Uir,Chi)
 #undef VTIMESI
 #undef VTIMESI0 
 #undef VTIMESI1
 #undef VTIMESI2 
 #define VTIMESI(A,B,C)                 VTIMESIf(A,B,C)
 #define VTIMESI0(A,B,C)                VTIMESI0f(A,B,C)
 #define VTIMESI1(A,B,C)                VTIMESI1f(A,B,C)
 #define VTIMESI2(A,B,C)                VTIMESI2f(A,B,C)
 #undef VTIMESMINUSI
 #undef VTIMESMINUSI0
 #undef VTIMESMINUSI1
 #undef VTIMESMINUSI2
 #define VTIMESMINUSI(A,B,C)            VTIMESMINUSIf(A,B,C)
 #define VTIMESMINUSI0(A,B,C)           VTIMESMINUSI0f(A,B,C)
 #define VTIMESMINUSI1(A,B,C)           VTIMESMINUSI1f(A,B,C)
 #define VTIMESMINUSI2(A,B,C)           VTIMESMINUSI2f(A,B,C)
 #undef VACCTIMESI
 #undef VACCTIMESI0
 #undef VACCTIMESI1
 #undef VACCTIMESI2
 #define VACCTIMESI(A,B,C)         VACCTIMESIf(A,B,C)
 #define VACCTIMESI0(A,B,C)             VACCTIMESI0f(A,B,C)
 #define VACCTIMESI1(A,B,C)             VACCTIMESI1f(A,B,C)
 #define VACCTIMESI2(A,B,C)             VACCTIMESI2f(A,B,C)
 #undef VACCTIMESMINUSI
 #undef VACCTIMESMINUSI0
 #undef VACCTIMESMINUSI1
 #undef VACCTIMESMINUSI2
 #define VACCTIMESMINUSI(A,B,C)    VACCTIMESMINUSIf(A,B,C)
 #define VACCTIMESMINUSI0(A,B,C)        VACCTIMESMINUSI0f(A,B,C)
 #define VACCTIMESMINUSI1(A,B,C)        VACCTIMESMINUSI1f(A,B,C)
 #define VACCTIMESMINUSI2(A,B,C)        VACCTIMESMINUSI2f(A,B,C)
 #undef VACCTIMESI1MEM
 #undef VACCTIMESI2MEM
 #define VACCTIMESI1MEM(A,ACC,O,P)      VACCTIMESI1MEMf(A,ACC,O,P)
 #define VACCTIMESI2MEM(A,ACC,O,P)      VACCTIMESI2MEMf(A,ACC,O,P)
 #undef VACCTIMESMINUSI1MEM
 #undef VACCTIMESMINUSI2MEM
 #define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
 #define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
 #undef VPERM0
 #undef VPERM1
 #undef VPERM2
 #undef VPERM3
 #define VPERM0(A,B)               VPERM0f(A,B)
 #define VPERM1(A,B)               VPERM1f(A,B)
 #define VPERM2(A,B)               VPERM2f(A,B)
 #define VPERM3(A,B)               VPERM3f(A,B)
 #undef VSHUFMEM
 #undef VADDMEM
 #undef VSUBMEM
 #define VSHUFMEM(OFF,A,DEST)      VSHUFMEMf(OFF,A,DEST)
 #define VADDMEM(O,A,B,C)                                 VADDMEMf(O,A,B,C)
 #define VSUBMEM(O,A,B,C)                                 VSUBMEMf(O,A,B,C)
 #undef VMOVIDUP
 #undef VMOVRDUP
 #undef VMADDSUB
 #undef VSHUF
 #define VMOVIDUP(A,B,C)                                  VMOVIDUPf(A,B,C)
 #define VMOVRDUP(A,B,C)                                  VMOVRDUPf(A,B,C)
 #define VMADDSUB(A,B,accum)                              VMADDSUBf(A,B,accum) 
 #define VSHUF(A,B)                                       VSHUFf(A,B)
 #undef ZEND1
 #undef ZEND2
 #undef ZLOAD
 #undef ZMUL
 #undef ZMADD
 #undef ZMULMEM2SP
 #undef ZMADDMEM2SP
 #define ZEND1(A,B,C)                                     ZEND1f(A,B,C)
 #define ZEND2(A,B,C)                                     ZEND2f(A,B,C)
 #define ZLOAD(A,B,C,D)                                   ZLOADf(A,B,C,D)
 #define ZMUL(A,B,C,D,E)                                  ZMULf(A,B,C,D,E)
 #define ZMADD(A,B,C,D,E)                                 ZMADDf(A,B,C,D,E)
 #define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 #define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
@@ -201,7 +201,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 // Enables to lift ALL loads earlier by a few cycles and alleviate OoO pressure if needed.
 // KNL is DUAL issue for FP, and lifting these loads is potentially important.
 // Need detailed profile data to be sure.
-
+#if 0
 #define PREFETCH_U(A) \
  LOAD64(%r8,&U._odata[sU](A)) \
  __asm__ (		       \
@@ -230,7 +230,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  VPREFETCHW(9,%r8)	       \
  VPREFETCHW(10,%r8)	       \
  VPREFETCHW(11,%r8)	       );
-
+#endif
 #define MULT_2SPIN_DIR(A) MULT_2SPIN(&U._odata[sU](A))
@@ -244,6 +244,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN_PFZM(&U._odata[sU](A),p)
 #define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN_PFTM(&U._odata[sU](A),p)
 #if 0
 #define MULT_2SPIN_UNOPT(ptr)				\
 	   LOAD64(%r8,ptr)			\
  __asm__ (					\
@@ -289,6 +290,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 	   ZEND2(UChi_11,Z3,Chi_10)			\
 	   ZEND2(UChi_02,Z4,Chi_02)			\
 	   ZEND2(UChi_12,Z5,Chi_12)	     );
 #endif
 #define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr)
 #define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr)
@@ -299,10 +301,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr)
 #define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr)
-#define MULT_2SPINa(ptr)        MULT_2SPIN_PF(ptr,ptr,VPREFETCHG);
+// MULT_2SPINa(ptr)        MULT_2SPIN_PF(ptr,ptr,VPREFETCHG);
 #define MULT_2SPIN(ptr) MULT_ADDSUB_2SPIN(ptr);
 #if 0
 #define MULT_2SPIN_PF(ptr,pf,VPF)			\
 	   LOAD64(%r8,ptr)			\
 	   LOAD64(%r9,pf)			\
@@ -343,8 +344,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 	   ZEND2(UChi_02,Z4,Chi_02)			\
 	   VPF(11,%r9)						\
 	   ZEND2(UChi_12,Z5,Chi_12)	     );
 #endif
-
+#if 0 
 #define MULT_2SPIN_PFNONE(ptr,pf,VPF)			\
 	   LOAD64(%r8,ptr)			\
 	   LOAD64(%r9,pf)			\
@@ -364,7 +366,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 	   VPF(9,%r9)						\
 	   VPF(10,%r9)						\
 	   VPF(11,%r9)						);
-
+#endif
 // Pretty much Perfectly Pipelined
@@ -720,7 +722,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  VSUB(UChi_11,result_31,result_31)\
  VSUB(UChi_12,result_32,result_32) );
-#define PREFETCH_CHIMU(A) 
+//define PREFETCH_CHIMU(A) 
 #define PERMUTE_DIR0 __asm__ ( 	\
  VPERM0(Chi_00,Chi_00)	\
@@ -813,4 +815,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
           VMADDSUB(Z5,Chi_12,UChi_12)\
                                                );
 #define MULT_2SPIN(ptr) MULT_ADDSUB_2SPIN(ptr)
 #endif
@@ -1,5 +1,13 @@
-bin_PROGRAMS = Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_gamma Test_GaugeAction Test_gparity Test_gpdwf_force Test_gp_rect_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_RectPlaq Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd 
+bin_PROGRAMS = Test_GaugeAction Test_RectPlaq Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_gamma Test_gp_rect_force Test_gparity Test_gpdwf_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd 
 Test_GaugeAction_SOURCES=Test_GaugeAction.cc
 Test_GaugeAction_LDADD=-lGrid
 Test_RectPlaq_SOURCES=Test_RectPlaq.cc
 Test_RectPlaq_LDADD=-lGrid
 Test_cayley_cg_SOURCES=Test_cayley_cg.cc
@@ -94,8 +102,8 @@ Test_gamma_SOURCES=Test_gamma.cc
 Test_gamma_LDADD=-lGrid
-Test_GaugeAction_SOURCES=Test_GaugeAction.cc
+Test_gp_rect_force_SOURCES=Test_gp_rect_force.cc
-Test_GaugeAction_LDADD=-lGrid
+Test_gp_rect_force_LDADD=-lGrid
 Test_gparity_SOURCES=Test_gparity.cc
@@ -106,10 +114,6 @@ Test_gpdwf_force_SOURCES=Test_gpdwf_force.cc
 Test_gpdwf_force_LDADD=-lGrid
 Test_gp_rect_force_SOURCES=Test_gp_rect_force.cc
 Test_gp_rect_force_LDADD=-lGrid
 Test_gpwilson_even_odd_SOURCES=Test_gpwilson_even_odd.cc
 Test_gpwilson_even_odd_LDADD=-lGrid
@@ -186,10 +190,6 @@ Test_rect_force_SOURCES=Test_rect_force.cc
 Test_rect_force_LDADD=-lGrid
 Test_RectPlaq_SOURCES=Test_RectPlaq.cc
 Test_RectPlaq_LDADD=-lGrid
 Test_remez_SOURCES=Test_remez.cc
 Test_remez_LDADD=-lGrid
@@ -252,39 +252,7 @@ int main(int argc,char **argv)
 #endif
 }
-#undef VLOAD
+#include <simd/Intel512double.h>
 #undef VSTORE
 #undef VMUL
 #undef VMADD
 #undef ZEND1
 #undef ZEND2
 #undef ZLOAD
 #undef ZMUL
 #undef ZMADD
 #undef VMOVIDUP 
 #undef VMOVRDUP 
 #undef VMADDSUB
 #undef VSHUF
 #define VZERO(A) VZEROd(A)
 #define VTIMESI(A,B,C) VTIMESId(A,B,C)
 #define VTIMESMINUSI(A,B,C) VTIMESMINUSId(A,B,C)
 #define VLOAD(OFF,PTR,DEST)       VLOADd(OFF,PTR,DEST)
 #define VSTORE(OFF,PTR,SRC)       VSTOREd(OFF,PTR,SRC)
 #define VMUL(Uri,Uir,Chi)         VMULd(Uri,Uir,Chi)
 #define VMADD(Uri,Uir,Chi)        VMADDd(Uri,Uir,Chi)
 #define ZEND1(A,B,C)              ZEND1d(A,B,C)
 #define ZEND2(A,B,C)              ZEND2d(A,B,C)
 #define ZLOAD(A,B,C,D)            ZLOADd(A,B,C,D)
 #define ZMUL(A,B,C,D,E)           ZMULd(A,B,C,D,E)
 #define ZMADD(A,B,C,D,E)          ZMADDd(A,B,C,D,E)
 #define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 #define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 #define VMOVIDUP(A,B,C) VMOVIDUPd(A,B,C)
 #define VMOVRDUP(A,B,C) VMOVRDUPd(A,B,C)
 #define VMADDSUB(A,B,accum) VMADDSUBd(A,B,accum) 
 #define VSHUF(A,B) VSHUFd(A,B)
 #define zz Z0
@@ -415,49 +383,7 @@ void WilsonDslashAvx512(void *ptr1,void *ptr2,void *ptr3)
 }
-#undef VLOAD
+#include <simd/Intel512single.h>
 #undef VSTORE
 #undef VMUL
 #undef VMADD
 #undef ZEND1
 #undef ZEND2
 #undef ZLOAD
 #undef ZMUL
 #undef ZMADD
 #undef VZERO
 #undef VTIMESI
 #undef VTIMESI0
 #undef VTIMESI1
 #undef VTIMESI2
 #undef VTIMESMINUSI
 #undef ZMULMEM2SP
 #undef ZMADDMEM2SP
 #undef VMOVIDUP 
 #undef VMOVRDUP 
 #undef VMADDSUB
 #undef VSHUF
 #define VZERO(A) VZEROf(A)
 #define VMOV(A,B) VMOVf(A,B)
 #define VADD(A,B,C) VADDf(A,B,C)
 #define VSUB(A,B,C) VSUBf(A,B,C)
 #define VTIMESI(A,B,C) VTIMESIf(A,B,C)
 #define VTIMESMINUSI(A,B,C) VTIMESMINUSIf(A,B,C)
 #define VLOAD(OFF,PTR,DEST)       VLOADf(OFF,PTR,DEST)
 #define VSTORE(OFF,PTR,SRC)       VSTOREf(OFF,PTR,SRC)
 #define VMUL(Uri,Uir,Chi)  VMULf(Uri,Uir,Chi)
 #define VMADD(Uri,Uir,Chi) VMADDf(Uri,Uir,Chi)
 #define ZEND1(A,B,C)               ZEND1f(A,B,C)
 #define ZEND2(A,B,C)               ZEND2f(A,B,C)
 #define ZLOAD(A,B,C,D)            ZLOADf(A,B,C,D)
 #define ZMUL(A,B,C,D,E)           ZMULf(A,B,C,D,E)
 #define ZMADD(A,B,C,D,E)          ZMADDf(A,B,C,D,E)
 #define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 #define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 #define VMOVIDUP(A,B,C) VMOVIDUPf(A,B,C)
 #define VMOVRDUP(A,B,C) VMOVRDUPf(A,B,C)
 #define VMADDSUB(A,B,accum) VMADDSUBf(A,B,accum) 
 #define VSHUF(A,B) VSHUFf(A,B)
 void ZmulF(void *ptr1,void *ptr2,void *ptr3)
 {