Merge branch 'master' into hadrons

2025-07-28 02:07:07 +01:00 · 2016-04-14 15:15:45 +01:00
parent 179e82b5ca cf2f69812b
commit 3834d81181
44 changed files with 1852 additions and 10008 deletions
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -58,7 +58,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  std::vector<int> latt4 = GridDefaultLatt();
-  const int Ls=16;
+  const int Ls=8;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
@@ -143,6 +143,7 @@ int main (int argc, char ** argv)
    Dw.Report();
  }

+  exit(0);

  if (1)
  { // Naive wilson dag implementation
@@ -197,7 +198,6 @@ int main (int argc, char ** argv)
    std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
    std::cout<<GridLogMessage << "Deo mflop/s per node   "<< flops/(t1-t0)/NP<<std::endl;
  }
-
  Dw.DhopEO(src_o,r_e,DaggerNo);
  Dw.DhopOE(src_e,r_o,DaggerNo);
  Dw.Dhop  (src  ,result,DaggerNo);
--- a/benchmarks/Benchmark_zmm.cc
+++ b/benchmarks/Benchmark_zmm.cc
@@ -0,0 +1,171 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_zmm.cc
+
+    Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid.h>
+#include <PerfCount.h>
+
+
+using namespace Grid;
+using namespace Grid::QCD;
+
+
+int bench(std::ofstream &os, std::vector<int> &latt4,int Ls);
+
+int main(int argc,char **argv)
+{
+  Grid_init(&argc,&argv);
+  std::ofstream os("zmm.dat");
+
+  os << "#V Ls Lxy Lzt C++ Asm OMP L1 " <<std::endl;
+  for(int L=4;L<=32;L+=4){
+    for(int m=1;m<=2;m++){
+      for(int Ls=8;Ls<=16;Ls+=8){
+	std::vector<int> grid({L,L,m*L,m*L});
+	for(int i=0;i<4;i++) { 
+	  std::cout << grid[i]<<"x";
+	}
+	std::cout << Ls<<std::endl;
+	bench(os,grid,Ls);
+      }
+    }
+  }
+}
+
+int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
+{
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+  int threads = GridThread::GetThreads();
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+
+  GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4);
+
+  LatticeFermion src (FGrid);
+  LatticeFermion tmp (FGrid);
+  LatticeFermion srce(FrbGrid);
+
+  LatticeFermion resulto(FrbGrid); resulto=zero;
+  LatticeFermion resulta(FrbGrid); resulta=zero;
+  LatticeFermion junk(FrbGrid); junk=zero;
+  LatticeFermion diff(FrbGrid); 
+  LatticeGaugeField Umu(UGrid);
+
+  double mfc, mfa, mfo, mfl1;
+
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
+  random(RNG5,src);
+#if 1
+  random(RNG4,Umu);
+#else
+  int mmu=2;
+  std::vector<LatticeColourMatrix> U(4,UGrid);
+  for(int mu=0;mu<Nd;mu++){
+    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
+    if ( mu!=mmu ) U[mu] = zero;
+    if ( mu==mmu ) U[mu] = 1.0;
+    PokeIndex<LorentzIndex>(Umu,U[mu],mu);
+  }
+#endif
+ pickCheckerboard(Even,srce,src);
+
+  RealD mass=0.1;
+  RealD M5  =1.8;
+  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+
+  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
+  int ncall=50;
+  double t0=usecond();
+  for(int i=0;i<ncall;i++){
+    Dw.DhopOE(srce,resulto,0);
+  }
+  double t1=usecond();
+
+  double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+  double flops=1344*volume/2;
+
+  mfc = flops*ncall/(t1-t0);
+  std::cout<<GridLogMessage << "Called C++ Dw"<< " mflop/s =   "<< mfc<<std::endl;
+
+  QCD::WilsonFermion5DStatic::AsmOptDslash=1;
+  t0=usecond();
+  for(int i=0;i<ncall;i++){
+    Dw.DhopOE(srce,resulta,0);
+  }
+  t1=usecond();
+  mfa = flops*ncall/(t1-t0);
+  std::cout<<GridLogMessage << "Called ASM Dw"<< " mflop/s =   "<< mfa<<std::endl;
+
+  int dag=DaggerNo;
+  t0=usecond();
+  for(int i=0;i<1;i++){
+    Dw.DhopInternalOMPbench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
+  }
+  t1=usecond();
+  mfo = flops*100/(t1-t0);
+  std::cout<<GridLogMessage << "Called ASM-OMP Dw"<< " mflop/s =   "<< mfo<<std::endl;
+
+  t0=usecond();
+  for(int i=0;i<1;i++){
+    Dw.DhopInternalL1bench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
+  }
+  t1=usecond();
+  mfl1= flops*100/(t1-t0);
+  std::cout<<GridLogMessage << "Called ASM-L1 Dw"<< " mflop/s =   "<< mfl1<<std::endl;
+
+  os << latt4[0]*latt4[1]*latt4[2]*latt4[3]<< " "<<Ls<<" "<< latt4[0] <<" " <<latt4[2]<< " "
+     << mfc<<" "
+     << mfa<<" "
+     << mfo<<" "
+     << mfl1<<std::endl;
+
+#if 0
+  for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
+    Dw.DhopOE(srce,resulta,0);
+    PerformanceCounter Counter(i);
+    Counter.Start();
+    Dw.DhopOE(srce,resulta,0);
+    Counter.Stop();
+    Counter.Report();
+  }
+#endif
+  //resulta = (-0.5) * resulta;
+
+  diff = resulto-resulta;
+  std::cout<<GridLogMessage << "diff "<< norm2(diff)<<std::endl;
+  std::cout<<std::endl;
+}
+
+
--- a/benchmarks/Make.inc
+++ b/benchmarks/Make.inc
@@ -1,5 +1,5 @@

-bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson
+bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm


 Benchmark_comms_SOURCES=Benchmark_comms.cc
@@ -25,3 +25,7 @@ Benchmark_su3_LDADD=-lGrid
 Benchmark_wilson_SOURCES=Benchmark_wilson.cc
 Benchmark_wilson_LDADD=-lGrid

+
+Benchmark_zmm_SOURCES=Benchmark_zmm.cc
+Benchmark_zmm_LDADD=-lGrid
+
--- a/8235
+++ b/8235
--- a/lib/Config.h.in
+++ b/lib/Config.h.in
@@ -1,183 +0,0 @@
-/* lib/Config.h.in.  Generated from configure.ac by autoheader.  */
-
-/* AVX Intrinsics */
-#undef AVX1
-
-/* AVX2 Intrinsics */
-#undef AVX2
-
-/* AVX512 Intrinsics for Knights Landing */
-#undef AVX512
-
-/* AVX Intrinsics with FMA4 */
-#undef AVXFMA4
-
-/* EMPTY_SIMD only for DEBUGGING */
-#undef EMPTY_SIMD
-
-/* GRID_COMMS_MPI */
-#undef GRID_COMMS_MPI
-
-/* GRID_COMMS_NONE */
-#undef GRID_COMMS_NONE
-
-/* GRID_COMMS_SHMEM */
-#undef GRID_COMMS_SHMEM
-
-/* GRID_DEFAULT_PRECISION is DOUBLE */
-#undef GRID_DEFAULT_PRECISION_DOUBLE
-
-/* GRID_DEFAULT_PRECISION is SINGLE */
-#undef GRID_DEFAULT_PRECISION_SINGLE
-
-/* Support Altivec instructions */
-#undef HAVE_ALTIVEC
-
-/* Support AVX (Advanced Vector Extensions) instructions */
-#undef HAVE_AVX
-
-/* Support AVX2 (Advanced Vector Extensions 2) instructions */
-#undef HAVE_AVX2
-
-/* Define to 1 if you have the declaration of `be64toh', and to 0 if you
-   don't. */
-#undef HAVE_DECL_BE64TOH
-
-/* Define to 1 if you have the declaration of `ntohll', and to 0 if you don't.
-   */
-#undef HAVE_DECL_NTOHLL
-
-/* Define to 1 if you have the <endian.h> header file. */
-#undef HAVE_ENDIAN_H
-
-/* Define to 1 if you have the <execinfo.h> header file. */
-#undef HAVE_EXECINFO_H
-
-/* Support FMA3 (Fused Multiply-Add) instructions */
-#undef HAVE_FMA
-
-/* Define to 1 if you have the `gettimeofday' function. */
-#undef HAVE_GETTIMEOFDAY
-
-/* Define to 1 if you have the <gmp.h> header file. */
-#undef HAVE_GMP_H
-
-/* Define to 1 if you have the <inttypes.h> header file. */
-#undef HAVE_INTTYPES_H
-
-/* Define to 1 if you have the <malloc.h> header file. */
-#undef HAVE_MALLOC_H
-
-/* Define to 1 if you have the <malloc/malloc.h> header file. */
-#undef HAVE_MALLOC_MALLOC_H
-
-/* Define to 1 if you have the <memory.h> header file. */
-#undef HAVE_MEMORY_H
-
-/* Support mmx instructions */
-#undef HAVE_MMX
-
-/* Define to 1 if you have the <mm_malloc.h> header file. */
-#undef HAVE_MM_MALLOC_H
-
-/* Support SSE (Streaming SIMD Extensions) instructions */
-#undef HAVE_SSE
-
-/* Support SSE2 (Streaming SIMD Extensions 2) instructions */
-#undef HAVE_SSE2
-
-/* Support SSE3 (Streaming SIMD Extensions 3) instructions */
-#undef HAVE_SSE3
-
-/* Support SSSE4.1 (Streaming SIMD Extensions 4.1) instructions */
-#undef HAVE_SSE4_1
-
-/* Support SSSE4.2 (Streaming SIMD Extensions 4.2) instructions */
-#undef HAVE_SSE4_2
-
-/* Support SSSE3 (Supplemental Streaming SIMD Extensions 3) instructions */
-#undef HAVE_SSSE3
-
-/* Define to 1 if you have the <stdint.h> header file. */
-#undef HAVE_STDINT_H
-
-/* Define to 1 if you have the <stdlib.h> header file. */
-#undef HAVE_STDLIB_H
-
-/* Define to 1 if you have the <strings.h> header file. */
-#undef HAVE_STRINGS_H
-
-/* Define to 1 if you have the <string.h> header file. */
-#undef HAVE_STRING_H
-
-/* Define to 1 if you have the <sys/stat.h> header file. */
-#undef HAVE_SYS_STAT_H
-
-/* Define to 1 if you have the <sys/types.h> header file. */
-#undef HAVE_SYS_TYPES_H
-
-/* Define to 1 if you have the <unistd.h> header file. */
-#undef HAVE_UNISTD_H
-
-/* IMCI Intrinsics for Knights Corner */
-#undef IMCI
-
-/* NEON ARMv8 Experimental support */
-#undef NEONv8
-
-/* Name of package */
-#undef PACKAGE
-
-/* Define to the address where bug reports for this package should be sent. */
-#undef PACKAGE_BUGREPORT
-
-/* Define to the full name of this package. */
-#undef PACKAGE_NAME
-
-/* Define to the full name and version of this package. */
-#undef PACKAGE_STRING
-
-/* Define to the one symbol short name of this package. */
-#undef PACKAGE_TARNAME
-
-/* Define to the home page for this package. */
-#undef PACKAGE_URL
-
-/* Define to the version of this package. */
-#undef PACKAGE_VERSION
-
-/* RNG_MT19937 */
-#undef RNG_MT19937
-
-/* RNG_RANLUX */
-#undef RNG_RANLUX
-
-/* SSE4 Intrinsics */
-#undef SSE4
-
-/* Define to 1 if you have the ANSI C header files. */
-#undef STDC_HEADERS
-
-/* Version number of package */
-#undef VERSION
-
-/* Define for Solaris 2.5.1 so the uint32_t typedef from <sys/synch.h>,
-   <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
-   #define below would cause a syntax error. */
-#undef _UINT32_T
-
-/* Define for Solaris 2.5.1 so the uint64_t typedef from <sys/synch.h>,
-   <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
-   #define below would cause a syntax error. */
-#undef _UINT64_T
-
-/* Define to `unsigned int' if <sys/types.h> does not define. */
-#undef size_t
-
-/* Define to the type of an unsigned integer type of width exactly 32 bits if
-   such a type exists and the standard includes do not define it. */
-#undef uint32_t
-
-/* Define to the type of an unsigned integer type of width exactly 64 bits if
-   such a type exists and the standard includes do not define it. */
-#undef uint64_t
--- a/lib/Grid.h
+++ b/lib/Grid.h
@@ -62,6 +62,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <serialisation/Serialisation.h>
 #include <Config.h>
 #include <Timer.h>
+#include <PerfCount.h>
 #include <Log.h>
 #include <AlignedAllocator.h>
 #include <Simd.h>
--- a/lib/Init.cc
+++ b/lib/Init.cc
@@ -318,7 +318,10 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
  exit(0);
  return;
 };
-
+#ifdef GRID_FPE
+#define _GNU_SOURCE
+#include <fenv.h>
+#endif
 void Grid_debug_handler_init(void)
 {
  struct sigaction sa,osa;
@@ -327,5 +330,9 @@ void Grid_debug_handler_init(void)
  sa.sa_flags    = SA_SIGINFO;
  sigaction(SIGSEGV,&sa,NULL);
  sigaction(SIGTRAP,&sa,NULL);
+#ifdef GRID_FPE
+  feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
+  sigaction(SIGFPE,&sa,NULL);
+#endif
 }
 }
--- a/lib/Log.cc
+++ b/lib/Log.cc
@@ -102,7 +102,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams)
 ////////////////////////////////////////////////////////////
 void Grid_quiesce_nodes(void)
 {
-  int me;
+  int me=0;
 #ifdef GRID_COMMS_MPI
  MPI_Comm_rank(MPI_COMM_WORLD,&me);
 #endif
--- a/lib/Log.h
+++ b/lib/Log.h
@@ -71,7 +71,7 @@ public:
            StopWatch.Start();
            stream << BLACK<< log.topName << BLACK<< " : ";
            stream << log.COLOUR <<std::setw(10) << std::left << log.name << BLACK << " : ";
-	    //	    stream << YELLOW<< now <<BLACK << " : " ;
+	    stream << YELLOW<< now <<BLACK << " : " ;
 	    stream << log.COLOUR;
            return stream;
        } else { 
--- a/lib/Make.inc
+++ b/lib/Make.inc
--- a/lib/PerfCount.h
+++ b/lib/PerfCount.h
@@ -34,7 +34,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <ctime>
 #include <chrono>
 #include <string.h>
-
+#include <unistd.h>
 #include <sys/ioctl.h>

 #ifdef __linux__
@@ -163,8 +163,8 @@ public:
  {
 #ifdef __linux__
    if ( fd!= -1) {
-      ioctl(fd, PERF_EVENT_IOC_RESET, 0);
-      ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
+      ::ioctl(fd, PERF_EVENT_IOC_RESET, 0);
+      ::ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
    }
    begin  =cyclecount();
 #else
@@ -176,7 +176,7 @@ public:
    count=0;
 #ifdef __linux__
    if ( fd!= -1) {
-      ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
+      ::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
      ::read(fd, &count, sizeof(long long));
    }
    elapsed = cyclecount() - begin;
@@ -187,16 +187,16 @@ public:
  }
  void Report(void) {
 #ifdef __linux__
-    printf("%llu cycles %s = %20llu\n", elapsed , PerformanceCounterConfigs[PCT].name, count);
+    std::printf("%llu cycles %s = %20llu\n", elapsed , PerformanceCounterConfigs[PCT].name, count);
 #else
-    printf("%llu cycles \n", elapsed );
+    std::printf("%llu cycles \n", elapsed );
 #endif
  }

  ~PerformanceCounter()
  {
 #ifdef __linux__
-    close(fd);
+    ::close(fd);
 #endif
  }

--- a/lib/Simd.h
+++ b/lib/Simd.h
@@ -42,6 +42,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>


 #define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)|(B<<4)|(C<<2)|(D))
+#define _MM_SELECT_FOUR_FOUR_STRING(A,B,C,D) "((" #A "<<6)|(" #B "<<4)|(" #C "<<2)|(" #D "))"
 #define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H) ((A<<7)|(B<<6)|(C<<5)|(D<<4)|(E<<3)|(F<<2)|(G<<4)|(H))
 #define _MM_SELECT_FOUR_TWO (A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)
 #define _MM_SELECT_TWO_TWO  (A,B)     _MM_SELECT_FOUR_TWO(0,0,A,B)
--- a/lib/Timer.h
+++ b/lib/Timer.h
@@ -44,6 +44,7 @@ double usecond(void);
 typedef  std::chrono::system_clock          GridClock;
 typedef  std::chrono::time_point<GridClock> GridTimePoint;
 typedef  std::chrono::milliseconds          GridTime;
+typedef  std::chrono::microseconds          GridUsecs;

 inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)
 {
@@ -55,7 +56,7 @@ class GridStopWatch {
 private:
  bool running;
  GridTimePoint start;
-  GridTime accumulator;
+  GridUsecs accumulator;
 public:
  GridStopWatch () { 
    Reset();
@@ -67,17 +68,21 @@ public:
  }
  void     Stop(void)  { 
    assert(running == true);
-    accumulator+= std::chrono::duration_cast<GridTime>(GridClock::now()-start); 
+    accumulator+= std::chrono::duration_cast<GridUsecs>(GridClock::now()-start); 
    running = false; 
  };
  void     Reset(void){
    running = false;
    start = GridClock::now();
-    accumulator = std::chrono::duration_cast<GridTime>(start-start); 
+    accumulator = std::chrono::duration_cast<GridUsecs>(start-start); 
  }
  GridTime Elapsed(void) {
    assert(running == false);
-    return accumulator;
+    return std::chrono::duration_cast<GridTime>( accumulator );
+  }
+  uint64_t useconds(void){
+    assert(running == false);
+    return (uint64_t) accumulator.count();
  }
 };

--- a/lib/algorithms/CoarsenedMatrix.h
+++ b/lib/algorithms/CoarsenedMatrix.h
@@ -147,6 +147,56 @@ namespace Grid {
      }
      Orthogonalise();
    }
+
+    virtual void CreateSubspaceLanczos(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) 
+    {
+      // Run a Lanczos with sloppy convergence
+	const int Nstop = nn;
+	const int Nk = nn+20;
+	const int Np = nn+20;
+	const int Nm = Nk+Np;
+	const int MaxIt= 10000;
+	RealD resid = 1.0e-3;
+
+	Chebyshev<FineField> Cheb(0.5,64.0,21);
+	ImplicitlyRestartedLanczos<FineField> IRL(hermop,Cheb,Nstop,Nk,Nm,resid,MaxIt);
+	//	IRL.lock = 1;
+
+	FineField noise(FineGrid); gaussian(RNG,noise);
+	FineField tmp(FineGrid); 
+	std::vector<RealD>     eval(Nm);
+	std::vector<FineField> evec(Nm,FineGrid);
+
+	int Nconv;
+	IRL.calc(eval,evec,
+		 noise,
+		 Nconv);
+
+    	// pull back nn vectors
+	for(int b=0;b<nn;b++){
+
+	  subspace[b]   = evec[b];
+
+	  std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
+
+	  hermop.Op(subspace[b],tmp); 
+	  std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(tmp)<<std::endl;
+
+	  noise = tmp -  sqrt(eval[b])*subspace[b] ;
+
+	  std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<"  ;  [ M - Lambda ]_"<<b<<" vec_"<<b<<"  = " <<norm2(noise)<<std::endl;
+
+	  noise = tmp +  eval[b]*subspace[b] ;
+
+	  std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<"  ;  [ M - Lambda ]_"<<b<<" vec_"<<b<<"  = " <<norm2(noise)<<std::endl;
+
+	}
+	Orthogonalise();
+	for(int b=0;b<nn;b++){
+	  std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
+	}
+    }
+
    virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {

      RealD scale;
--- a/lib/algorithms/iterative/EigenSort.h
+++ b/lib/algorithms/iterative/EigenSort.h
@@ -39,42 +39,33 @@ class SortEigen {
 private:
  
 //hacking for testing for now
-#if 0
-  static bool less_lmd(RealD left,RealD right){
-    return fabs(left) < fabs(right);
-  }  
-  static bool less_pair(std::pair<RealD,Field>& left,
-		 std::pair<RealD,Field>& right){
-    return fabs(left.first) < fabs(right.first);
-  }  
-#else
+ private:
  static bool less_lmd(RealD left,RealD right){
    return left > right;
  }  
-  static bool less_pair(std::pair<RealD,Field>& left,
-		 std::pair<RealD,Field>& right){
+  static bool less_pair(std::pair<RealD,Field const*>& left,
+                        std::pair<RealD,Field const*>& right){
    return left.first > (right.first);
  }  
-#endif
+  
  
 public:

  void push(DenseVector<RealD>& lmd,
-	    DenseVector<Field>& evec,int N) {
-
-    DenseVector<std::pair<RealD, Field> > emod;
-    typename DenseVector<std::pair<RealD, Field> >::iterator it;
+            DenseVector<Field>& evec,int N) {
+    DenseVector<Field> cpy(lmd.size(),evec[0]._grid);
+    for(int i=0;i<lmd.size();i++) cpy[i] = evec[i];
    
-    for(int i=0;i<lmd.size();++i){
-      emod.push_back(std::pair<RealD,Field>(lmd[i],evec[i]));
-    }
+    DenseVector<std::pair<RealD, Field const*> > emod(lmd.size());    
+    for(int i=0;i<lmd.size();++i)
+      emod[i] = std::pair<RealD,Field const*>(lmd[i],&cpy[i]);

    partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair);

-    it=emod.begin();
+    typename DenseVector<std::pair<RealD, Field const*> >::iterator it = emod.begin();
    for(int i=0;i<N;++i){
      lmd[i]=it->first;
-      evec[i]=it->second;
+      evec[i]=*(it->second);
      ++it;
    }
  }
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -637,21 +637,20 @@ until convergence
 	abort();
 	
      converged:
-	// Sorting
-	
-	eval.clear();
-	evec.clear();
-	for(int i=0; i<Nconv; ++i){
-	  eval.push_back(eval2[Iconv[i]]);
-	  evec.push_back(B[Iconv[i]]);
-	}
-	_sort.push(eval,evec,Nconv);
-	
-	std::cout << "\n Converged\n Summary :\n";
-	std::cout << " -- Iterations  = "<< Nconv  << "\n";
-	std::cout << " -- beta(k)     = "<< beta_k << "\n";
-	std::cout << " -- Nconv       = "<< Nconv  << "\n";
-      }
+       // Sorting
+       eval.resize(Nconv);
+       evec.resize(Nconv,grid);
+       for(int i=0; i<Nconv; ++i){
+         eval[i] = eval2[Iconv[i]];
+         evec[i] = B[Iconv[i]];
+       }
+      _sort.push(eval,evec,Nconv);
+
+      std::cout << "\n Converged\n Summary :\n";
+      std::cout << " -- Iterations  = "<< Nconv  << "\n";
+      std::cout << " -- beta(k)     = "<< beta_k << "\n";
+      std::cout << " -- Nconv       = "<< Nconv  << "\n";
+     }

    /////////////////////////////////////////////////
    // Adapted from Rudy's lanczos factor routine
--- a/lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
+++ b/lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
@@ -192,7 +192,7 @@ namespace Grid {
 	  return cp;
 	}

-	std::cout<<GridLogMessage<< " VPGCR_step     resid " <<sqrt(cp/rsq)<<std::endl; 
+	std::cout<<GridLogMessage<< " VPGCR_step["<<steps<<"]  resid " <<sqrt(cp/rsq)<<std::endl; 

 	PrecTimer.Start();
 	Preconditioner(r,z);// solve Az = r
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@@ -86,6 +86,7 @@ class CartesianCommunicator {
    void GlobalSumVector(RealD *,int N);

    void GlobalSum(uint32_t &);
+    void GlobalSum(uint64_t &);

    void GlobalSum(ComplexF &c)
    {
--- a/lib/communicator/Communicator_mpi.cc
+++ b/lib/communicator/Communicator_mpi.cc
@@ -73,6 +73,10 @@ void CartesianCommunicator::GlobalSum(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
  assert(ierr==0);
 }
+void CartesianCommunicator::GlobalSum(uint64_t &u){
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
+  assert(ierr==0);
+}
 void CartesianCommunicator::GlobalSum(float &f){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
--- a/lib/communicator/Communicator_none.cc
+++ b/lib/communicator/Communicator_none.cc
@@ -53,6 +53,7 @@ void CartesianCommunicator::GlobalSum(float &){}
 void CartesianCommunicator::GlobalSumVector(float *,int N){}
 void CartesianCommunicator::GlobalSum(double &){}
 void CartesianCommunicator::GlobalSum(uint32_t &){}
+void CartesianCommunicator::GlobalSum(uint64_t &){}
 void CartesianCommunicator::GlobalSumVector(double *,int N){}

 void CartesianCommunicator::SendRecvPacket(void *xmit,
--- a/lib/communicator/Communicator_shmem.cc
+++ b/lib/communicator/Communicator_shmem.cc
@@ -101,6 +101,22 @@ void CartesianCommunicator::GlobalSum(uint32_t &u){
  shmem_barrier_all(); // necessary?
  u = dest;
 }
+void CartesianCommunicator::GlobalSum(uint64_t &u){
+  static long long source ;
+  static long long dest   ;
+  static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
+  static long      psync[_SHMEM_REDUCE_SYNC_SIZE];
+
+  //  int nreduce=1;
+  //  int pestart=0;
+  //  int logStride=0;
+
+  source = u;
+  dest   = 0;
+  shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
+  shmem_barrier_all(); // necessary?
+  u = dest;
+}
 void CartesianCommunicator::GlobalSum(float &f){
  static float source ;
  static float dest   ;
--- a/lib/parallelIO/BinaryIO.h
+++ b/lib/parallelIO/BinaryIO.h
@@ -168,6 +168,7 @@ class BinaryIO {
    GridBase *grid = Umu._grid;

    std::cout<< GridLogMessage<< "Serial read I/O "<< file<< std::endl;
+    GridStopWatch timer; timer.Start();

    int ieee32big = (format == std::string("IEEE32BIG"));
    int ieee32    = (format == std::string("IEEE32"));
@@ -182,6 +183,7 @@ class BinaryIO {

    Umu = zero;
    uint32_t csum=0;
+    uint64_t bytes=0;
    fobj file_object;
    sobj munged;
    
@@ -194,7 +196,7 @@ class BinaryIO {

      if ( grid->IsBoss() ) {
 	fin.read((char *)&file_object,sizeof(file_object));
-	
+	bytes += sizeof(file_object);
 	if(ieee32big) be32toh_v((void *)&file_object,sizeof(file_object));
 	if(ieee32)    le32toh_v((void *)&file_object,sizeof(file_object));
 	if(ieee64big) be64toh_v((void *)&file_object,sizeof(file_object));
@@ -205,6 +207,10 @@ class BinaryIO {
      // The boss who read the file has their value poked
      pokeSite(munged,Umu,site);
    }}}}
+    timer.Stop();
+    std::cout<<GridLogPerformance<<"readObjectSerial: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
+	     << (double)bytes/ (double)timer.useconds() <<" MB/s "  <<std::endl;
+
    return csum;
  }

@@ -224,13 +230,14 @@ class BinaryIO {
    // Serialise through node zero
    //////////////////////////////////////////////////
    std::cout<< GridLogMessage<< "Serial write I/O "<< file<<std::endl;
+    GridStopWatch timer; timer.Start();

    std::ofstream fout;
    if ( grid->IsBoss() ) {
      fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
      fout.seekp(offset);
    }
-    
+    uint64_t bytes=0;
    uint32_t csum=0;
    fobj file_object;
    sobj unmunged;
@@ -252,10 +259,15 @@ class BinaryIO {
 	if(ieee32)    htole32_v((void *)&file_object,sizeof(file_object));
 	if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
 	if(ieee64)    htole64_v((void *)&file_object,sizeof(file_object));
-	
+
+	// NB could gather an xstrip as an optimisation.
 	fout.write((char *)&file_object,sizeof(file_object));
+	bytes+=sizeof(file_object);
      }
    }}}}
+    timer.Stop();
+    std::cout<<GridLogPerformance<<"writeObjectSerial: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
+	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;

    return csum;
  }
@@ -429,6 +441,9 @@ class BinaryIO {
      std::cout << std::endl;
    }

+    GridStopWatch timer; timer.Start();
+    uint64_t bytes=0;
+
    int myrank = grid->ThisRank();
    int iorank = grid->RankFromProcessorCoor(ioproc);

@@ -475,6 +490,7 @@ class BinaryIO {
 	
 	fin.seekg(offset+g_idx*sizeof(fileObj));
 	fin.read((char *)&fileObj,sizeof(fileObj));
+	bytes+=sizeof(fileObj);
 	
 	if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
 	if(ieee32)    le32toh_v((void *)&fileObj,sizeof(fileObj));
@@ -499,7 +515,12 @@ class BinaryIO {
    }

    grid->GlobalSum(csum);
+    grid->GlobalSum(bytes);
    grid->Barrier();
+
+    timer.Stop();
+    std::cout<<GridLogPerformance<<"readObjectParallel: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
+	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
    
    return csum;
  }
@@ -563,6 +584,9 @@ class BinaryIO {
      std::cout << std::endl;
    }

+    GridStopWatch timer; timer.Start();
+    uint64_t bytes=0;
+
    int myrank = grid->ThisRank();
    int iorank = grid->RankFromProcessorCoor(ioproc);

@@ -635,11 +659,16 @@ class BinaryIO {
 	
 	fout.seekp(offset+g_idx*sizeof(fileObj));
 	fout.write((char *)&fileObj,sizeof(fileObj));
-
+	bytes+=sizeof(fileObj);
      }
    }

    grid->GlobalSum(csum);
+    grid->GlobalSum(bytes);
+
+    timer.Stop();
+    std::cout<<GridLogPerformance<<"writeObjectParallel: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
+	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;

    return csum;
  }
--- a/lib/parallelIO/NerscIO.h
+++ b/lib/parallelIO/NerscIO.h
@@ -353,7 +353,7 @@ static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
      csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>, LorentzColour2x3D> 
      	(Umu,file,Nersc3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format);
    }
-  } else if ( header.data_type == std::string("4D_SU3_GAUGE_3X3") ) {
+  } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
    if ( ieee32 || ieee32big ) {
      //csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
      csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
@@ -431,7 +431,7 @@ static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu

  } else { 
    header.floating_point = std::string("IEEE64BIG");
-    header.data_type      = std::string("4D_SU3_GAUGE_3X3");
+    header.data_type      = std::string("4D_SU3_GAUGE_3x3");
    NerscSimpleUnmunger<fobj3D,sobj> munge;
    BinaryIO::Uint32Checksum<vobj,fobj3D>(Umu, munge,header.checksum);
    offset = writeHeader(header,file);
--- a/lib/qcd/action/fermion/WilsonFermion.cc
+++ b/lib/qcd/action/fermion/WilsonFermion.cc
@@ -335,69 +335,7 @@ PARALLEL_FOR_LOOP
  void WilsonFermion<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
 						     const FermionField &in, FermionField &out,int dag) {

-    assert((dag==DaggerNo) ||(dag==DaggerYes));
-
-    Compressor compressor(dag);
-
-    auto handle = st.HaloExchangeBegin(in,compressor);
-
-    bool local    = true;
-    bool nonlocal = false;
-    if ( dag == DaggerYes ) {
-      if( HandOptDslash ) {
-PARALLEL_FOR_LOOP
-        for(int sss=0;sss<in._grid->oSites();sss++){
-	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
-	}
-      } else { 
-PARALLEL_FOR_LOOP
-        for(int sss=0;sss<in._grid->oSites();sss++){
-	  Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
-	}
-      }
-    } else {
-      if( HandOptDslash ) {
-PARALLEL_FOR_LOOP
-        for(int sss=0;sss<in._grid->oSites();sss++){
-	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
-	}
-      } else { 
-PARALLEL_FOR_LOOP
-        for(int sss=0;sss<in._grid->oSites();sss++){
-	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
-	}
-      }
-    }
-
-    st.HaloExchangeComplete(handle);
-
-    local    = false;
-    nonlocal = true;
-    if ( dag == DaggerYes ) {
-      if( HandOptDslash ) {
-PARALLEL_FOR_LOOP
-        for(int sss=0;sss<in._grid->oSites();sss++){
-	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
-	}
-      } else { 
-PARALLEL_FOR_LOOP
-        for(int sss=0;sss<in._grid->oSites();sss++){
-	  Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
-	}
-      }
-    } else {
-      if( HandOptDslash ) {
-PARALLEL_FOR_LOOP
-        for(int sss=0;sss<in._grid->oSites();sss++){
-	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
-	}
-      } else { 
-PARALLEL_FOR_LOOP
-        for(int sss=0;sss<in._grid->oSites();sss++){
-	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
-	}
-      }
-    }
+    assert(0);

  };

--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -281,11 +281,7 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
 					 DoubledGaugeField & U,
 					 const FermionField &in, FermionField &out,int dag)
 {
-  //  if ( Impl::overlapCommsCompute () ) { 
-  //    DhopInternalCommsOverlapCompute(st,lo,U,in,out,dag);
-  //  } else { 
    DhopInternalCommsThenCompute(st,lo,U,in,out,dag);
-    //  }
 }

 template<class Impl>
@@ -368,7 +364,7 @@ PARALLEL_FOR_LOOP
 	      sU = lo.Reorder(sU);
 	    }
 	    sF = s+Ls*sU;
-	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0]
+	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 	  }
 	}
      }
@@ -423,14 +419,12 @@ PARALLEL_FOR_LOOP
 }

 template<class Impl>
-void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo,
-						     DoubledGaugeField & U,
-						     const FermionField &in, FermionField &out,int dag)
+void WilsonFermion5D<Impl>::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder &lo,
+						 DoubledGaugeField & U,
+						 const FermionField &in, FermionField &out,int dag)
 {
-  assert(0);
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
  alltime-=usecond();
-
  Compressor compressor(dag);

  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
@@ -442,116 +436,232 @@ void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, Le
  
  commtime -=usecond();
  auto handle = st.HaloExchangeBegin(in,compressor);
+  st.HaloExchangeComplete(handle);
  commtime +=usecond();
+
+  jointime -=usecond();
+  jointime +=usecond();
  
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
  // Not loop ordering and data layout.
  // Designed to create 
  // - per thread reuse in L1 cache for U
  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
-  bool local    = true;
-  bool nonlocal = false;
+
+#pragma omp parallel 
+  {
+  for(int jjj=0;jjj<100;jjj++){
+#pragma omp barrier
  dslashtime -=usecond();
  if ( dag == DaggerYes ) {
    if( this->HandOptDslash ) {
-PARALLEL_FOR_LOOP
+#pragma omp for
      for(int ss=0;ss<U._grid->oSites();ss++){
 	int sU=ss;
 	for(int s=0;s<Ls;s++){
 	  int sF = s+Ls*sU;
-	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
+	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 	  }
      }
    } else { 
-PARALLEL_FOR_LOOP
+
+#pragma omp for
      for(int ss=0;ss<U._grid->oSites();ss++){
 	{
 	  int sd;
 	  for(sd=0;sd<Ls;sd++){
 	    int sU=ss;
 	    int sF = sd+Ls*sU;
-	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
+	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 	  }
 	}
      }
    }
  } else {
-    if( this->HandOptDslash ) {
-PARALLEL_FOR_LOOP
+    if( this->AsmOptDslash ) {
+      //      for(int i=0;i<1;i++){
+      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
+      //	PerformanceCounter Counter(i);
+      //	Counter.Start();
+
+#pragma omp for
+      for(int t=0;t<threads;t++){
+
+	int hyperthread = t%HT;
+	int core        = t/HT;
+
+        int sswork, swork,soff,ssoff,  sU,sF;
+	
+	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
+	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
+
+	for(int ss=0;ss<sswork;ss++){
+	  for(int s=soff;s<soff+swork;s++){
+
+	    sU=ss+ ssoff;
+
+	    if ( LebesgueOrder::UseLebesgueOrder ) {
+	      sU = lo.Reorder(sU);
+	    }
+	    sF = s+Ls*sU;
+	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
+	  }
+	}
+      }
+      //      Counter.Stop();
+      //      Counter.Report();
+      //      }
+    } else if( this->HandOptDslash ) {
+#pragma omp for
+
      for(int ss=0;ss<U._grid->oSites();ss++){
 	int sU=ss;
 	for(int s=0;s<Ls;s++){
 	  int sF = s+Ls*sU;
-	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
+	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 	}
      }
    } else { 
-PARALLEL_FOR_LOOP
+#pragma omp for
      for(int ss=0;ss<U._grid->oSites();ss++){
 	int sU=ss;
 	for(int s=0;s<Ls;s++){
 	  int sF = s+Ls*sU; 
-	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
+	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 	}
      }
    }
  }
+  }
+  }
  dslashtime +=usecond();
+  alltime+=usecond();
+}
+
+
+template<class Impl>
+void WilsonFermion5D<Impl>::DhopInternalL1bench(StencilImpl & st, LebesgueOrder &lo,
+						DoubledGaugeField & U,
+						const FermionField &in, FermionField &out,int dag)
+{
+  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
+  alltime-=usecond();
+  Compressor compressor(dag);
+
+  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
+
+  int threads = GridThread::GetThreads();
+  int HT      = GridThread::GetHyperThreads();
+  int cores   = GridThread::GetCores();
+  int nwork = U._grid->oSites();
+  
+  commtime -=usecond();
+  auto handle = st.HaloExchangeBegin(in,compressor);
+  st.HaloExchangeComplete(handle);
+  commtime +=usecond();

  jointime -=usecond();
-  st.HaloExchangeComplete(handle);
  jointime +=usecond();
+  
+  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
+  // Not loop ordering and data layout.
+  // Designed to create 
+  // - per thread reuse in L1 cache for U
+  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.

-  local    = false;
-  nonlocal = true;
-  dslash1time -=usecond();
+#pragma omp parallel 
+  {
+  for(int jjj=0;jjj<100;jjj++){
+#pragma omp barrier
+  dslashtime -=usecond();
  if ( dag == DaggerYes ) {
    if( this->HandOptDslash ) {
-PARALLEL_FOR_LOOP
+#pragma omp for
      for(int ss=0;ss<U._grid->oSites();ss++){
-	int sU=ss;
+	int sU=0;
 	for(int s=0;s<Ls;s++){
 	  int sF = s+Ls*sU;
-	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
+	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 	  }
      }
    } else { 
-PARALLEL_FOR_LOOP
+
+#pragma omp for
      for(int ss=0;ss<U._grid->oSites();ss++){
 	{
 	  int sd;
 	  for(sd=0;sd<Ls;sd++){
-	    int sU=ss;
+	    int sU=0;
 	    int sF = sd+Ls*sU;
-	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
+	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 	  }
 	}
      }
    }
  } else {
-    if( this->HandOptDslash ) {
-PARALLEL_FOR_LOOP
+    if( this->AsmOptDslash ) {
+      //      for(int i=0;i<1;i++){
+      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
+      //	PerformanceCounter Counter(i);
+      //	Counter.Start();
+
+#pragma omp for
+      for(int t=0;t<threads;t++){
+
+	int hyperthread = t%HT;
+	int core        = t/HT;
+
+        int sswork, swork,soff,ssoff,  sU,sF;
+	
+	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
+	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
+
+	for(int ss=0;ss<sswork;ss++){
+	  for(int s=soff;s<soff+swork;s++){
+
+	    sU=0;
+	    sF = s+Ls*sU;
+	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
+	  }
+	}
+      }
+      //      Counter.Stop();
+      //      Counter.Report();
+      //      }
+    } else if( this->HandOptDslash ) {
+#pragma omp for
+
      for(int ss=0;ss<U._grid->oSites();ss++){
-	int sU=ss;
+	int sU=0;
 	for(int s=0;s<Ls;s++){
 	  int sF = s+Ls*sU;
-	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
+	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 	}
      }
    } else { 
-PARALLEL_FOR_LOOP
+#pragma omp for
      for(int ss=0;ss<U._grid->oSites();ss++){
-	int sU=ss;
+	int sU=0;
 	for(int s=0;s<Ls;s++){
 	  int sF = s+Ls*sU; 
-	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
+	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 	}
      }
    }
  }
-  dslash1time +=usecond();
+  }
+  }
+  dslashtime +=usecond();
  alltime+=usecond();
+}

+
+template<class Impl>
+void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo,
+						     DoubledGaugeField & U,
+						     const FermionField &in, FermionField &out,int dag)
+{
+  assert(0);
 }

 template<class Impl>
--- a/lib/qcd/action/fermion/WilsonFermion5D.h
+++ b/lib/qcd/action/fermion/WilsonFermion5D.h
@@ -1,3 +1,4 @@
+
    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 
@@ -120,6 +121,20 @@ namespace Grid {
 			FermionField &out,
 			int dag);

+      void DhopInternalOMPbench(StencilImpl & st,
+				LebesgueOrder &lo,
+				DoubledGaugeField &U,
+				const FermionField &in, 
+				FermionField &out,
+				int dag);
+
+      void DhopInternalL1bench(StencilImpl & st,
+				LebesgueOrder &lo,
+				DoubledGaugeField &U,
+				const FermionField &in, 
+				FermionField &out,
+				int dag);
+
      void DhopInternalCommsThenCompute(StencilImpl & st,
 			LebesgueOrder &lo,
 			DoubledGaugeField &U,
@@ -148,7 +163,7 @@ namespace Grid {
      ///////////////////////////////////////////////////////////////
      // Data members require to support the functionality
      ///////////////////////////////////////////////////////////////
-    protected:
+    public:

      // Add these to the support from Wilson
      GridBase *_FourDimGrid;
--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
@@ -38,216 +38,177 @@ WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};
 template<class Impl> 
 void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 					   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-					   int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
+					   int sF,int sU,const FermionField &in, FermionField &out)
 {
  SiteHalfSpinor  tmp;    
  SiteHalfSpinor  chi;    
+  SiteHalfSpinor *chi_p;
  SiteHalfSpinor Uchi;
  SiteSpinor result;
  StencilEntry *SE;
  int ptype;

-  int num = 0;
-
-  result=zero;
-
  ///////////////////////////
  // Xp
  ///////////////////////////
  SE=st.GetEntry(ptype,Xp,sF);

-  if (local && SE->_is_local ) { 
+  if (SE->_is_local ) { 
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjXp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjXp(chi,in._odata[SE->_offset]);
    }
-  }
-
-  if ( nonlocal && (!SE->_is_local) ) { 
-    chi=buf[SE->_offset];
+  } else { 
+    chi_p=&buf[SE->_offset];
  }
  
-  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Xp,SE,st);
-    accumReconXp(result,Uchi);
-    num++;
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Xp,SE,st);
+  spReconXp(result,Uchi);
    
  ///////////////////////////
  // Yp
  ///////////////////////////
  SE=st.GetEntry(ptype,Yp,sF);

-  if (local && SE->_is_local ) { 
+  if ( SE->_is_local ) { 
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjYp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjYp(chi,in._odata[SE->_offset]);
    }
+  } else { 
+    chi_p=&buf[SE->_offset];
  }

-  if ( nonlocal && (!SE->_is_local) ) { 
-    chi=buf[SE->_offset];
-  }
-
-  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Yp,SE,st);
-    accumReconYp(result,Uchi);
-    num++;
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Yp,SE,st);
+  accumReconYp(result,Uchi);

  ///////////////////////////
  // Zp
  ///////////////////////////
  SE=st.GetEntry(ptype,Zp,sF);

-  if (local && SE->_is_local ) { 
+  if ( SE->_is_local ) { 
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjZp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjZp(chi,in._odata[SE->_offset]);
    }
+  } else { 
+    chi_p=&buf[SE->_offset];
  }

-  if ( nonlocal && (!SE->_is_local) ) { 
-    chi=buf[SE->_offset];
-  }
-
-  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Zp,SE,st);
-    accumReconZp(result,Uchi);
-    num++;
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Zp,SE,st);
+  accumReconZp(result,Uchi);

  ///////////////////////////
  // Tp
  ///////////////////////////
  SE=st.GetEntry(ptype,Tp,sF);

-  if (local && SE->_is_local ) {
+  if ( SE->_is_local ) {
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjTp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjTp(chi,in._odata[SE->_offset]);
    }
+  } else {
+    chi_p=&buf[SE->_offset];
  }

-  if ( nonlocal && (!SE->_is_local) ) {
-    chi=buf[SE->_offset];
-  }
-
-  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Tp,SE,st);
-    accumReconTp(result,Uchi);
-    num++;
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Tp,SE,st);
+  accumReconTp(result,Uchi);

  ///////////////////////////
  // Xm
  ///////////////////////////
  SE=st.GetEntry(ptype,Xm,sF);

-  if (local && SE->_is_local ) {
+  if ( SE->_is_local ) {
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjXm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjXm(chi,in._odata[SE->_offset]);
    }
+  } else {
+    chi_p=&buf[SE->_offset];
  }

-  if ( nonlocal && (!SE->_is_local) ) {
-    chi=buf[SE->_offset];
-  }
-
-  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
-    accumReconXm(result,Uchi);
-    num++;
-  }
-
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Xm,SE,st);
+  accumReconXm(result,Uchi);
+  
  ///////////////////////////
  // Ym
  ///////////////////////////
  SE=st.GetEntry(ptype,Ym,sF);

-  if (local && SE->_is_local ) {
+  if ( SE->_is_local ) {
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjYm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjYm(chi,in._odata[SE->_offset]);
    }
+  } else {
+    chi_p=&buf[SE->_offset];
  }

-  if ( nonlocal && (!SE->_is_local) ) {
-    chi=buf[SE->_offset];
-  }
-
-  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Ym,SE,st);
-    accumReconYm(result,Uchi);
-    num++;
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Ym,SE,st);
+  accumReconYm(result,Uchi);
  
  ///////////////////////////
  // Zm
  ///////////////////////////
  SE=st.GetEntry(ptype,Zm,sF);

-  if (local && SE->_is_local ) {
+  if ( SE->_is_local ) {
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjZm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjZm(chi,in._odata[SE->_offset]);
    }
+  } else {
+    chi_p=&buf[SE->_offset];
  }

-  if ( nonlocal && (!SE->_is_local) ) {
-    chi=buf[SE->_offset];
-  }
-
-  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Zm,SE,st);
-    accumReconZm(result,Uchi);
-    num++;
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Zm,SE,st);
+  accumReconZm(result,Uchi);

  ///////////////////////////
  // Tm
  ///////////////////////////
  SE=st.GetEntry(ptype,Tm,sF);

-  if (local && SE->_is_local ) {
+  if ( SE->_is_local ) {
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjTm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else { 
      spProjTm(chi,in._odata[SE->_offset]);
    }
+  } else {
+    chi_p=&buf[SE->_offset];
  }

-  if ( nonlocal && (!SE->_is_local) ) {
-    chi=buf[SE->_offset];
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Tm,SE,st);
+  accumReconTm(result,Uchi);

-  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Tm,SE,st);
-    accumReconTm(result,Uchi);
-    num++;
-  }
-
-  if ( local ) {
-    vstream(out._odata[sF],result);
-  } else if ( num ) { 
-    vstream(out._odata[sF],out._odata[sF]+result);
-  }
+  vstream(out._odata[sF],result);
 };


@@ -255,216 +216,177 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField
 template<class Impl> 
 void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
 					   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-					   int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
+					   int sF,int sU,const FermionField &in, FermionField &out)
 {
  SiteHalfSpinor  tmp;    
  SiteHalfSpinor  chi;    
+  SiteHalfSpinor *chi_p;    
  SiteHalfSpinor Uchi;
  SiteSpinor result;
  StencilEntry *SE;
  int ptype;

-  int num = 0;
-
-  result=zero;
-
  ///////////////////////////
  // Xp
  ///////////////////////////
  SE=st.GetEntry(ptype,Xm,sF);

-  if (local && SE->_is_local ) { 
+  if ( SE->_is_local ) { 
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjXp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjXp(chi,in._odata[SE->_offset]);
    }
-  }
-
-  if ( nonlocal && (!SE->_is_local) ) { 
-    chi=buf[SE->_offset];
+  } else { 
+    chi_p=&buf[SE->_offset];
  }
  
-  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
-    accumReconXp(result,Uchi);
-    num++;
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Xm,SE,st);
+  spReconXp(result,Uchi);
    
  ///////////////////////////
  // Yp
  ///////////////////////////
  SE=st.GetEntry(ptype,Ym,sF);

-  if (local && SE->_is_local ) { 
+  if ( SE->_is_local ) { 
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjYp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjYp(chi,in._odata[SE->_offset]);
    }
+  } else { 
+    chi_p=&buf[SE->_offset];
  }

-  if ( nonlocal && (!SE->_is_local) ) { 
-    chi=buf[SE->_offset];
-  }
-
-  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Ym,SE,st);
-    accumReconYp(result,Uchi);
-    num++;
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Ym,SE,st);
+  accumReconYp(result,Uchi);

  ///////////////////////////
  // Zp
  ///////////////////////////
  SE=st.GetEntry(ptype,Zm,sF);

-  if (local && SE->_is_local ) { 
+  if ( SE->_is_local ) { 
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjZp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjZp(chi,in._odata[SE->_offset]);
    }
+  } else { 
+    chi_p=&buf[SE->_offset];
  }

-  if ( nonlocal && (!SE->_is_local) ) { 
-    chi=buf[SE->_offset];
-  }
-
-  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Zm,SE,st);
-    accumReconZp(result,Uchi);
-    num++;
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Zm,SE,st);
+  accumReconZp(result,Uchi);

  ///////////////////////////
  // Tp
  ///////////////////////////
  SE=st.GetEntry(ptype,Tm,sF);

-  if (local && SE->_is_local ) {
+  if ( SE->_is_local ) {
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjTp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjTp(chi,in._odata[SE->_offset]);
    }
+  } else {
+    chi_p=&buf[SE->_offset];
  }

-  if ( nonlocal && (!SE->_is_local) ) {
-    chi=buf[SE->_offset];
-  }
-
-  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Tm,SE,st);
-    accumReconTp(result,Uchi);
-    num++;
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Tm,SE,st);
+  accumReconTp(result,Uchi);

  ///////////////////////////
  // Xm
  ///////////////////////////
  SE=st.GetEntry(ptype,Xp,sF);

-  if (local && SE->_is_local ) {
+  if ( SE->_is_local ) {
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjXm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjXm(chi,in._odata[SE->_offset]);
    }
+  } else {
+    chi_p=&buf[SE->_offset];
  }

-  if ( nonlocal && (!SE->_is_local) ) {
-    chi=buf[SE->_offset];
-  }
-
-  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Xp,SE,st);
-    accumReconXm(result,Uchi);
-    num++;
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Xp,SE,st);
+  accumReconXm(result,Uchi);

  ///////////////////////////
  // Ym
  ///////////////////////////
  SE=st.GetEntry(ptype,Yp,sF);

-  if (local && SE->_is_local ) {
+  if ( SE->_is_local ) {
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjYm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjYm(chi,in._odata[SE->_offset]);
    }
+  } else {
+    chi_p=&buf[SE->_offset];
  }

-  if ( nonlocal && (!SE->_is_local) ) {
-    chi=buf[SE->_offset];
-  }
-
-  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Yp,SE,st);
-    accumReconYm(result,Uchi);
-    num++;
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Yp,SE,st);
+  accumReconYm(result,Uchi);
  
  ///////////////////////////
  // Zm
  ///////////////////////////
  SE=st.GetEntry(ptype,Zp,sF);

-  if (local && SE->_is_local ) {
+  if ( SE->_is_local ) {
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjZm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjZm(chi,in._odata[SE->_offset]);
    }
+  } else {
+    chi_p=&buf[SE->_offset];
  }

-  if ( nonlocal && (!SE->_is_local) ) {
-    chi=buf[SE->_offset];
-  }
-
-  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Zp,SE,st);
-    accumReconZm(result,Uchi);
-    num++;
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Zp,SE,st);
+  accumReconZm(result,Uchi);

  ///////////////////////////
  // Tm
  ///////////////////////////
  SE=st.GetEntry(ptype,Tp,sF);

-  if (local && SE->_is_local ) {
+  if ( SE->_is_local ) {
+    chi_p = &chi;
    if ( SE->_permute ) {
      spProjTm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else { 
      spProjTm(chi,in._odata[SE->_offset]);
    }
+  } else {
+    chi_p=&buf[SE->_offset];
  }

-  if ( nonlocal && (!SE->_is_local) ) {
-    chi=buf[SE->_offset];
-  }
+  Impl::multLink(Uchi,U._odata[sU],*chi_p,Tp,SE,st);
+  accumReconTm(result,Uchi);

-  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
-    Impl::multLink(Uchi,U._odata[sU],chi,Tp,SE,st);
-    accumReconTm(result,Uchi);
-    num++;
-  }
-
-  if ( local ) {
-    vstream(out._odata[sF],result);
-  } else if ( num ) { 
-    vstream(out._odata[sF],out._odata[sF]+result);
-  }
+  vstream(out._odata[sF],result);
 };

 template<class Impl> 
@@ -596,11 +518,11 @@ void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
  vstream(out._odata[sF],result);
 }

-#if ( ! defined(AVX512) ) && ( ! defined(IMCI) )
+#if ( ! defined(IMCI) && ! defined(AVX512) )
 template<class Impl> 
 void WilsonKernels<Impl>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 					      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-					      int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
+					      int sF,int sU,const FermionField &in, FermionField &out)
 {
  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
 }
--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/lib/qcd/action/fermion/WilsonKernels.h
@@ -48,11 +48,11 @@ namespace Grid {
    public:
     void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
 			   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-			   int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
+			   int sF,int sU,const FermionField &in, FermionField &out);
      
     void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-			      int sF,int sU,const FermionField &in,FermionField &out,bool local= true, bool nonlocal=true);
+			      int sF,int sU,const FermionField &in,FermionField &out);

     void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
 			  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
@@ -60,15 +60,15 @@ namespace Grid {

     void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-			      int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
+			      int sF,int sU,const FermionField &in, FermionField &out);

     int DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-			      int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
+			      int sF,int sU,const FermionField &in, FermionField &out);
     
     int DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 				 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-				 int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
+				 int sF,int sU,const FermionField &in, FermionField &out);

     WilsonKernels(const ImplParams &p= ImplParams());
     
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -28,76 +28,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    /*  END LEGAL */
 #include <Grid.h>
 #if defined(AVX512) || defined (IMCI)
+//#if defined (IMCI)

-#include <simd/Avx512Asm.h>
+#include <simd/Intel512wilson.h>

-#undef VLOAD
-#undef VSTORE
-#undef VMUL
-#undef VMADD
-#undef ZEND
-#undef ZLOAD
-#undef ZMUL
-#undef ZMADD
-#undef VZERO
-#undef VTIMESI
-#undef VTIMESMINUSI
+#include <simd/Intel512single.h>

-#define VZERO(A)                  VZEROf(A)
-#define VMOV(A,B)                 VMOVf(A,B)
-#define VLOAD(OFF,PTR,DEST)       VLOADf(OFF,PTR,DEST)
-#define VSTORE(OFF,PTR,SRC)       VSTOREf(OFF,PTR,SRC)
-
-#define VADD(A,B,C)               VADDf(A,B,C)
-#define VSUB(A,B,C)               VSUBf(A,B,C)
-#define VMUL(Uri,Uir,Chi,UChi,Z)  VMULf(Uri,Uir,Chi,UChi,Z)
-#define VMADD(Uri,Uir,Chi,UChi,Z) VMADDf(Uri,Uir,Chi,UChi,Z)
-
-#define VTIMESI(A,B,C)            VTIMESIf(A,B,C)
-#define VTIMESMINUSI(A,B,C)       VTIMESMINUSIf(A,B,C)
-#define VACCTIMESI(A,B,C)         VACCTIMESIf(A,B,C)
-#define VACCTIMESMINUSI(A,B,C)    VACCTIMESMINUSIf(A,B,C)
-
-#define VTIMESI0(A,B,C)            VTIMESI0f(A,B,C)
-#define VTIMESMINUSI0(A,B,C)       VTIMESMINUSI0f(A,B,C)
-#define VACCTIMESI0(A,B,C)         VACCTIMESI0f(A,B,C)
-#define VACCTIMESMINUSI0(A,B,C)    VACCTIMESMINUSI0f(A,B,C)
-
-#define VTIMESI1(A,B,C)            VTIMESI1f(A,B,C)
-#define VTIMESMINUSI1(A,B,C)       VTIMESMINUSI1f(A,B,C)
-#define VACCTIMESI1(A,B,C)         VACCTIMESI1f(A,B,C)
-#define VACCTIMESMINUSI1(A,B,C)    VACCTIMESMINUSI1f(A,B,C)
-
-#define VTIMESI2(A,B,C)            VTIMESI2f(A,B,C)
-#define VTIMESMINUSI2(A,B,C)       VTIMESMINUSI2f(A,B,C)
-#define VACCTIMESI2(A,B,C)         VACCTIMESI2f(A,B,C)
-#define VACCTIMESMINUSI2(A,B,C)    VACCTIMESMINUSI2f(A,B,C)
-
-#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMf(A,ACC,O,P)
-#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMf(A,ACC,O,P)
-#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
-#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
-
-#define VPERM0(A,B)               VPERM0f(A,B)
-#define VPERM1(A,B)               VPERM1f(A,B)
-#define VPERM2(A,B)               VPERM2f(A,B)
-#define VPERM3(A,B)               VPERM3f(A,B)
-#define VSHUFMEM(OFF,A,DEST)      VSHUFMEMf(OFF,A,DEST)
-
-#define ZEND1(A,B,C)               ZEND1f(A,B,C)
-#define ZEND2(A,B,C)               ZEND2f(A,B,C)
-#define ZLOAD(A,B,C,D)            ZLOADf(A,B,C,D)
-#define ZMUL(A,B,C,D,E)           ZMULf(A,B,C,D,E)
-#define ZMADD(A,B,C,D,E)          ZMADDf(A,B,C,D,E)
-
-#define ZMUL(A,B,C,D,E)           ZMULf(A,B,C,D,E)
-#define ZMADD(A,B,C,D,E)          ZMADDf(A,B,C,D,E)
-
-#define VADDMEM(O,A,B,C)            VADDMEMf(O,A,B,C)
-#define VSUBMEM(O,A,B,C)            VSUBMEMf(O,A,B,C)
-
-#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
-#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 

 namespace Grid {
 namespace QCD {
@@ -105,7 +41,7 @@ namespace QCD {
 template<class Impl>
 void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 						   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-					       int ss,int sU,const FermionField &in, FermionField &out,uint64_t *timers)
+					       int ss,int sU,const FermionField &in, FermionField &out)
 {
  uint64_t  now;
  uint64_t first ;
@@ -127,26 +63,6 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField

  SE=st.GetEntry(ptype,Xm,ss);

-#if 0
-  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
-  else               pf=(void *)&pbuf[SE->_offset];
-
-  LOAD64(%r9,pf);
-  __asm__( 
-	  VPREFETCH(0,%r9)
-	  VPREFETCH(1,%r9)
-	  VPREFETCH(2,%r9)
-	  VPREFETCH(3,%r9)
-	  VPREFETCH(4,%r9)
-	  VPREFETCH(5,%r9)
-	  VPREFETCH(6,%r9)
-	  VPREFETCH(7,%r9)
-	  VPREFETCH(8,%r9)
-	  VPREFETCH(9,%r9)
-	  VPREFETCH(10,%r9)
-	  VPREFETCH(11,%r9) );
-#endif
-
  // Xm
  offset = SE->_offset;
  local  = SE->_is_local;
@@ -158,7 +74,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
  else               pf=(void *)&pbuf[SE->_offset];
  
  if ( local ) {
-    XM_PROJMEM(&plocal[offset]);
+    XP_PROJMEM(&plocal[offset]);
    if ( perm) {
      PERMUTE_DIR3; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
    }
@@ -168,7 +84,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
  {
    MULT_2SPIN_DIR_PFXM(Xm,pf);
  }
-  XM_RECON;
+  XP_RECON;

  // Ym
  offset = SE->_offset;
@@ -181,7 +97,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
  else               pf=(void *)&pbuf[SE->_offset];
  
  if ( local ) {
-    YM_PROJMEM(&plocal[offset]);
+    YP_PROJMEM(&plocal[offset]);
    if ( perm) {
      PERMUTE_DIR2; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
    }
@@ -191,7 +107,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
  {
    MULT_2SPIN_DIR_PFYM(Ym,pf);
  }
-  YM_RECON_ACCUM;
+  YP_RECON_ACCUM;

  // Zm
  offset = SE->_offset;
@@ -204,7 +120,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
  else               pf=(void *)&pbuf[SE->_offset];

  if ( local ) {
-    ZM_PROJMEM(&plocal[offset]);
+    ZP_PROJMEM(&plocal[offset]);
    if ( perm) {
      PERMUTE_DIR1; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
    }
@@ -214,7 +130,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
  {
    MULT_2SPIN_DIR_PFZM(Zm,pf);
  }
-  ZM_RECON_ACCUM;
+  ZP_RECON_ACCUM;

  // Tm
  offset = SE->_offset;
@@ -227,7 +143,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField


  if ( local ) {
-    TM_PROJMEM(&plocal[offset]);
+    TP_PROJMEM(&plocal[offset]);
    if ( perm) {
      PERMUTE_DIR0; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
    }
@@ -237,7 +153,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
  {
    MULT_2SPIN_DIR_PFTM(Tm,pf);
  }
-  TM_RECON_ACCUM;
+  TP_RECON_ACCUM;

  // Tp
  offset = SE->_offset;
@@ -250,7 +166,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
  else               pf=(void *)&pbuf[SE->_offset];
  
  if ( local ) {
-    TP_PROJMEM(&plocal[offset]);
+    TM_PROJMEM(&plocal[offset]);
    if ( perm) {
      PERMUTE_DIR0; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
    }
@@ -260,7 +176,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
  {
    MULT_2SPIN_DIR_PFTP(Tp,pf);
  }
-  TP_RECON_ACCUM;
+  TM_RECON_ACCUM;

  // Zp
  offset = SE->_offset;
@@ -273,7 +189,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
  else               pf=(void *)&pbuf[SE->_offset];

  if ( local ) {
-    ZP_PROJMEM(&plocal[offset]);
+    ZM_PROJMEM(&plocal[offset]);
    if ( perm) {
      PERMUTE_DIR1; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
    }
@@ -283,7 +199,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
  {
    MULT_2SPIN_DIR_PFZP(Zp,pf);
  }
-  ZP_RECON_ACCUM;
+  ZM_RECON_ACCUM;


  offset = SE->_offset;
@@ -296,7 +212,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
  else               pf=(void *)&pbuf[SE->_offset];
  
  if ( local ) {
-    YP_PROJMEM(&plocal[offset]);
+    YM_PROJMEM(&plocal[offset]);
    if ( perm) {
      PERMUTE_DIR2; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
    }
@@ -306,22 +222,20 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
  {
    MULT_2SPIN_DIR_PFYP(Yp,pf);
  }
-  YP_RECON_ACCUM;
+  YM_RECON_ACCUM;

  // Xp
  perm   = SE->_permute;
  offset = SE->_offset;
  local  = SE->_is_local;
    
-  //  PREFETCH_R(A);
-
  // Prefetch
  SE=st.GetEntry(ptype,Xm,(ss+1)%osites);
  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
  else               pf=(void *)&pbuf[SE->_offset];

  if ( local ) {
-    XP_PROJMEM(&plocal[offset]);
+    XM_PROJMEM(&plocal[offset]);
    if ( perm) {
      PERMUTE_DIR3; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
    }
@@ -331,7 +245,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
  {
    MULT_2SPIN_DIR_PFXP(Xp,pf);
  }
-  XP_RECON_ACCUM;
+  XM_RECON_ACCUM;

 debug:
  SAVE_RESULT(&out._odata[ss]);
@@ -340,6 +254,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField

  template class WilsonKernels<WilsonImplF>;		
  template class WilsonKernels<WilsonImplD>; 
-
+  template class WilsonKernels<GparityWilsonImplF>;
+  template class WilsonKernels<GparityWilsonImplD>;
 }}
 #endif
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -308,548 +308,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 namespace Grid {
 namespace QCD {

-#if 0
-template<class Impl>
-int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
-						   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-						   int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
-{
-  //  std::cout << "Hand op Dhop "<<std::endl;
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  REGISTER Simd result_00 ; zeroit(result_00); // 12 regs on knc
-  REGISTER Simd result_01 ; zeroit(result_01); // 12 regs on knc
-  REGISTER Simd result_02 ; zeroit(result_02); // 12 regs on knc
-  
-  REGISTER Simd result_10 ; zeroit(result_10); // 12 regs on knc
-  REGISTER Simd result_11 ; zeroit(result_11); // 12 regs on knc
-  REGISTER Simd result_12 ; zeroit(result_12); // 12 regs on knc
-
-  REGISTER Simd result_20 ; zeroit(result_20); // 12 regs on knc
-  REGISTER Simd result_21 ; zeroit(result_21); // 12 regs on knc
-  REGISTER Simd result_22 ; zeroit(result_22); // 12 regs on knc
-
-  REGISTER Simd result_30 ; zeroit(result_30); // 12 regs on knc
-  REGISTER Simd result_31 ; zeroit(result_31); // 12 regs on knc
-  REGISTER Simd result_32 ; zeroit(result_32); // 12 regs on knc
-
-  REGISTER Simd Chi_00;    // two spinor; 6 regs
-  REGISTER Simd Chi_01;
-  REGISTER Simd Chi_02;
-
-  REGISTER Simd Chi_10;
-  REGISTER Simd Chi_11;
-  REGISTER Simd Chi_12;   // 14 left
-
-  REGISTER Simd UChi_00;  // two spinor; 6 regs
-  REGISTER Simd UChi_01;
-  REGISTER Simd UChi_02;
-
-  REGISTER Simd UChi_10;
-  REGISTER Simd UChi_11;
-  REGISTER Simd UChi_12;  // 8 left
-
-  REGISTER Simd U_00;  // two rows of U matrix
-  REGISTER Simd U_10;
-  REGISTER Simd U_20;  
-  REGISTER Simd U_01;
-  REGISTER Simd U_11;
-  REGISTER Simd U_21;  // 2 reg left.
-
-#define Chimu_00 Chi_00
-#define Chimu_01 Chi_01
-#define Chimu_02 Chi_02
-#define Chimu_10 Chi_10
-#define Chimu_11 Chi_11
-#define Chimu_12 Chi_12
-#define Chimu_20 UChi_00
-#define Chimu_21 UChi_01
-#define Chimu_22 UChi_02
-#define Chimu_30 UChi_10
-#define Chimu_31 UChi_11
-#define Chimu_32 UChi_12
-
-
-  StencilEntry *SE;
-  int offset, ptype;
-  int num = 0;
-
-  // Xp
-  SE=st.GetEntry(ptype,Xp,ss);
-  offset = SE->_offset;
-  
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    XP_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-
-  }
-
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Xp);
-    XP_RECON_ACCUM;
-    num++;  
-  }
-
-  // Yp
-  SE=st.GetEntry(ptype,Yp,ss);
-  offset = SE->_offset;
-  
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    YP_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }
-
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Yp);
-    YP_RECON_ACCUM;
-    num++;  
-  }
-
-
-  // Zp
-  SE=st.GetEntry(ptype,Zp,ss);
-  offset = SE->_offset;
-  
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    ZP_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }  
-
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Zp);
-    ZP_RECON_ACCUM;
-    num++;  
-  }
-
-  // Tp
-  SE=st.GetEntry(ptype,Tp,ss);
-  offset = SE->_offset;
-  
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    TP_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Tp);
-    TP_RECON_ACCUM;
-    num++;  
-  }
-  
-  // Xm
-  SE=st.GetEntry(ptype,Xm,ss);
-  offset = SE->_offset;
-  
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    XM_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Xm);
-    XM_RECON_ACCUM;
-    num++;  
-  }
-  
-  // Ym
-  SE=st.GetEntry(ptype,Ym,ss);
-  offset = SE->_offset;
-  
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    YM_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Ym);
-    YM_RECON_ACCUM;
-    num++;  
-  }
-
-  // Zm
-  SE=st.GetEntry(ptype,Zm,ss);
-  offset = SE->_offset;
-
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    ZM_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Zm);
-    ZM_RECON_ACCUM;
-    num++;  
-  }
-
-  // Tm
-  SE=st.GetEntry(ptype,Tm,ss);
-  offset = SE->_offset;
-
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    TM_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Tm);
-    TM_RECON_ACCUM;
-    num++;  
-  }
-
-  SiteSpinor & ref (out._odata[ss]);
-  if ( Local ) {
-    vstream(ref()(0)(0),result_00);
-    vstream(ref()(0)(1),result_01);
-    vstream(ref()(0)(2),result_02);
-    vstream(ref()(1)(0),result_10);
-    vstream(ref()(1)(1),result_11);
-    vstream(ref()(1)(2),result_12);
-    vstream(ref()(2)(0),result_20);
-    vstream(ref()(2)(1),result_21);
-    vstream(ref()(2)(2),result_22);
-    vstream(ref()(3)(0),result_30);
-    vstream(ref()(3)(1),result_31);
-    vstream(ref()(3)(2),result_32);
-    return 1;
-  } else if ( num ) { 
-    vstream(ref()(0)(0),ref()(0)(0)+result_00);
-    vstream(ref()(0)(1),ref()(0)(1)+result_01);
-    vstream(ref()(0)(2),ref()(0)(2)+result_02);
-    vstream(ref()(1)(0),ref()(1)(0)+result_10);
-    vstream(ref()(1)(1),ref()(1)(1)+result_11);
-    vstream(ref()(1)(2),ref()(1)(2)+result_12);
-    vstream(ref()(2)(0),ref()(2)(0)+result_20);
-    vstream(ref()(2)(1),ref()(2)(1)+result_21);
-    vstream(ref()(2)(2),ref()(2)(2)+result_22);
-    vstream(ref()(3)(0),ref()(3)(0)+result_30);
-    vstream(ref()(3)(1),ref()(3)(1)+result_31);
-    vstream(ref()(3)(2),ref()(3)(2)+result_32);
-    return 1;
-  }
-  return 0;
-}
-
-
-
-
-template<class Impl>
-int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
-						std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-						int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
-{
-  //  std::cout << "Hand op Dhop "<<std::endl;
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  REGISTER Simd result_00 ; zeroit(result_00); // 12 regs on knc
-  REGISTER Simd result_01 ; zeroit(result_01); // 12 regs on knc
-  REGISTER Simd result_02 ; zeroit(result_02); // 12 regs on knc
-  
-  REGISTER Simd result_10 ; zeroit(result_10); // 12 regs on knc
-  REGISTER Simd result_11 ; zeroit(result_11); // 12 regs on knc
-  REGISTER Simd result_12 ; zeroit(result_12); // 12 regs on knc
-
-  REGISTER Simd result_20 ; zeroit(result_20); // 12 regs on knc
-  REGISTER Simd result_21 ; zeroit(result_21); // 12 regs on knc
-  REGISTER Simd result_22 ; zeroit(result_22); // 12 regs on knc
-
-  REGISTER Simd result_30 ; zeroit(result_30); // 12 regs on knc
-  REGISTER Simd result_31 ; zeroit(result_31); // 12 regs on knc
-  REGISTER Simd result_32 ; zeroit(result_32); // 12 regs on knc
-
-  REGISTER Simd Chi_00;    // two spinor; 6 regs
-  REGISTER Simd Chi_01;
-  REGISTER Simd Chi_02;
-
-  REGISTER Simd Chi_10;
-  REGISTER Simd Chi_11;
-  REGISTER Simd Chi_12;   // 14 left
-
-  REGISTER Simd UChi_00;  // two spinor; 6 regs
-  REGISTER Simd UChi_01;
-  REGISTER Simd UChi_02;
-
-  REGISTER Simd UChi_10;
-  REGISTER Simd UChi_11;
-  REGISTER Simd UChi_12;  // 8 left
-
-  REGISTER Simd U_00;  // two rows of U matrix
-  REGISTER Simd U_10;
-  REGISTER Simd U_20;  
-  REGISTER Simd U_01;
-  REGISTER Simd U_11;
-  REGISTER Simd U_21;  // 2 reg left.
-
-#define Chimu_00 Chi_00
-#define Chimu_01 Chi_01
-#define Chimu_02 Chi_02
-#define Chimu_10 Chi_10
-#define Chimu_11 Chi_11
-#define Chimu_12 Chi_12
-#define Chimu_20 UChi_00
-#define Chimu_21 UChi_01
-#define Chimu_22 UChi_02
-#define Chimu_30 UChi_10
-#define Chimu_31 UChi_11
-#define Chimu_32 UChi_12
-
-
-  StencilEntry *SE;
-  int offset, ptype;
-  int num = 0;
-
-  // Xp
-  SE=st.GetEntry(ptype,Xp,ss);
-  offset = SE->_offset;
-  
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    XM_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }
-
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Xp);
-    XM_RECON_ACCUM;
-    num++;  
-  }
-
-
-  // Yp
-  SE=st.GetEntry(ptype,Yp,ss);
-  offset = SE->_offset;
-  
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    YM_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }
-
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Yp);
-    YM_RECON_ACCUM;
-    num++;  
-  }
-
-
-  // Zp
-  SE=st.GetEntry(ptype,Zp,ss);
-  offset = SE->_offset;
-  
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    ZM_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }  
-
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Zp);
-    ZM_RECON_ACCUM;
-    num++;  
-  }
-
-  // Tp
-  SE=st.GetEntry(ptype,Tp,ss);
-  offset = SE->_offset;
-  
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    TM_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Tp);
-    TM_RECON_ACCUM;
-    num++;  
-  }
-  
-  // Xm
-  SE=st.GetEntry(ptype,Xm,ss);
-  offset = SE->_offset;
-  
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    XP_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Xm);
-    XP_RECON_ACCUM;
-    num++;  
-  }
-  
-  // Ym
-  SE=st.GetEntry(ptype,Ym,ss);
-  offset = SE->_offset;
-  
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    YP_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Ym);
-    YP_RECON_ACCUM;
-    num++;  
-  }
-
-  // Zm
-  SE=st.GetEntry(ptype,Zm,ss);
-  offset = SE->_offset;
-
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    ZP_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Zm);
-    ZP_RECON_ACCUM;
-    num++;  
-  }
-
-  // Tm
-  SE=st.GetEntry(ptype,Tm,ss);
-  offset = SE->_offset;
-
-  if (Local && SE->_is_local ) { 
-    LOAD_CHIMU;
-    TP_PROJ;
-    if ( SE->_permute ) {
-      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  }
-  if ( Nonlocal && (!SE->_is_local) ) { 
-    LOAD_CHI;
-  }
-  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
-    MULT_2SPIN(Tm);
-    TP_RECON_ACCUM;
-    num++;  
-  }
-
-  SiteSpinor & ref (out._odata[ss]);
-  if ( Local ) {
-    vstream(ref()(0)(0),result_00);
-    vstream(ref()(0)(1),result_01);
-    vstream(ref()(0)(2),result_02);
-    vstream(ref()(1)(0),result_10);
-    vstream(ref()(1)(1),result_11);
-    vstream(ref()(1)(2),result_12);
-    vstream(ref()(2)(0),result_20);
-    vstream(ref()(2)(1),result_21);
-    vstream(ref()(2)(2),result_22);
-    vstream(ref()(3)(0),result_30);
-    vstream(ref()(3)(1),result_31);
-    vstream(ref()(3)(2),result_32);
-    return 1;
-  } else if ( num ) { 
-    vstream(ref()(0)(0),ref()(0)(0)+result_00);
-    vstream(ref()(0)(1),ref()(0)(1)+result_01);
-    vstream(ref()(0)(2),ref()(0)(2)+result_02);
-    vstream(ref()(1)(0),ref()(1)(0)+result_10);
-    vstream(ref()(1)(1),ref()(1)(1)+result_11);
-    vstream(ref()(1)(2),ref()(1)(2)+result_12);
-    vstream(ref()(2)(0),ref()(2)(0)+result_20);
-    vstream(ref()(2)(1),ref()(2)(1)+result_21);
-    vstream(ref()(2)(2),ref()(2)(2)+result_22);
-    vstream(ref()(3)(0),ref()(3)(0)+result_30);
-    vstream(ref()(3)(1),ref()(3)(1)+result_31);
-    vstream(ref()(3)(2),ref()(3)(2)+result_32);
-    return 1;
-  }
-  return 0;
-}
-
-#else 

 template<class Impl>
 int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-					       int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
+					       int ss,int sU,const FermionField &in, FermionField &out)
 {
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@@ -1094,7 +557,7 @@ int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField
 template<class Impl>
 int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-					       int ss,int sU,const FermionField &in, FermionField &out,bool l, bool nl)
+					       int ss,int sU,const FermionField &in, FermionField &out)
 {
  //  std::cout << "Hand op Dhop "<<std::endl;
  typedef typename Simd::scalar_type S;
@@ -1337,14 +800,13 @@ int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeFi
 }


-#endif
  ////////////////////////////////////////////////
  // Specialise Gparity to simple implementation
  ////////////////////////////////////////////////
 template<>
 int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							     int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
+							     int sF,int sU,const FermionField &in, FermionField &out)
 {
  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
  //check consistency of return types between these functions and the ones in WilsonKernels.cc
@@ -1355,7 +817,7 @@ int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,Doub
 template<>
 int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
+								int sF,int sU,const FermionField &in, FermionField &out)
 {
  DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
  return 0;
@@ -1364,7 +826,7 @@ int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,D
 template<>
 int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							     int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
+							     int sF,int sU,const FermionField &in, FermionField &out)
 {
  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
  return 0;
@@ -1373,7 +835,7 @@ int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,Doub
 template<>
 int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
+								int sF,int sU,const FermionField &in, FermionField &out)
 {
  DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
  return 0;
@@ -1383,29 +845,29 @@ int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,D

 template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int sU,const FermionField &in, FermionField &out,bool l,bool n);
+							       int ss,int sU,const FermionField &in, FermionField &out);
 template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
+							       int ss,int sU,const FermionField &in, FermionField &out);
 template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								  int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
+								  int ss,int sU,const FermionField &in, FermionField &out);
 template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								  int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
+								  int ss,int sU,const FermionField &in, FermionField &out);


 template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								      int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
+								      int ss,int sU,const FermionField &in, FermionField &out);
 template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								      int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
+								      int ss,int sU,const FermionField &in, FermionField &out);
 template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-									 int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
+									 int ss,int sU,const FermionField &in, FermionField &out);
 template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-									 int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
+									 int ss,int sU,const FermionField &in, FermionField &out);

 }}
--- a/lib/qcd/utils/SUn.h
+++ b/lib/qcd/utils/SUn.h
@@ -608,14 +608,14 @@ Note that in step D setting B ~ X - A and using B in place of A in step E will g
    LatticeMatrix Umu(out._grid);
    for(int mu=0;mu<Nd;mu++){
      LieRandomize(pRNG,Umu,0.01);
-      pokeLorentz(out,Umu,mu);
+      PokeIndex<LorentzIndex>(out,Umu,mu);
    }
  }
  static void ColdConfiguration(GridParallelRNG &pRNG,LatticeGaugeField &out){
    LatticeMatrix Umu(out._grid);
    Umu=1.0;
    for(int mu=0;mu<Nd;mu++){
-      pokeLorentz(out,Umu,mu);
+      PokeIndex<LorentzIndex>(out,Umu,mu);
    }
  }

--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -39,7 +39,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <immintrin.h>


-
+namespace Grid{
 namespace Optimization {
  
  struct Vsplat{
@@ -246,26 +246,30 @@ namespace Optimization {
  struct TimesMinusI{
    //Complex single
    inline __m512 operator()(__m512 in, __m512 ret){
-      __m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag 
-      return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2));   // 0x4E??
+      //__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag 
+      //return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,1,0));   // 0x4E??
+      __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
+      return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
    }
    //Complex double
    inline __m512d operator()(__m512d in, __m512d ret){
-      __m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag 
-      return _mm512_shuffle_pd(tmp,tmp,0x55);
+      //__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag 
+      //return _mm512_shuffle_pd(tmp,tmp,0x55);
+      __m512d tmp = _mm512_shuffle_pd(in,in,0x55);
+      return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
    } 
  };

  struct TimesI{
    //Complex single
    inline __m512 operator()(__m512 in, __m512 ret){
-      __m512 tmp = _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2));
-      return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp); 
+      __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
+      return _mm512_mask_sub_ps(tmp,0x5555,_mm512_setzero_ps(),tmp); 
    }
    //Complex double
    inline __m512d operator()(__m512d in, __m512d ret){
-      __m512d tmp = _mm512_shuffle_pd(tmp,tmp,0x55);
-      return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp); 
+      __m512d tmp = _mm512_shuffle_pd(in,in,0x55);
+      return _mm512_mask_sub_pd(tmp,0x55,_mm512_setzero_pd(),tmp); 
    }


@@ -345,7 +349,7 @@ namespace Optimization {

 //////////////////////////////////////////////////////////////////////////////////////
 // Here assign types 
-namespace Grid {
+
  typedef __m512 SIMD_Ftype;  // Single precision type
  typedef __m512d SIMD_Dtype; // Double precision type
  typedef __m512i SIMD_Itype; // Integer type
--- a/lib/simd/Intel512avx.h
+++ b/lib/simd/Intel512avx.h
@@ -0,0 +1,141 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/simd/Avx512Asm.h
+
+    Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_ASM_AV512_H
+#define GRID_ASM_AV512_H
+
+////////////////////////////////////////////////////////////	  
+// Knights Landing specials
+////////////////////////////////////////////////////////////	  
+
+#define ZLOADf(OFF,PTR,ri,ir)  VLOADf(OFF,PTR,ir)  VSHUFf(ir,ri)
+#define ZLOADd(OFF,PTR,ri,ir)  VLOADd(OFF,PTR,ir)  VSHUFd(ir,ri)
+
+#define ZMULf(Ari,Air,B,Criir,Ciirr)  VMULf(Ari,B,Criir)  VMULf(Air,B,Ciirr)
+#define ZMULd(Ari,Air,B,Criir,Ciirr)  VMULd(Ari,B,Criir)  VMULd(Air,B,Ciirr)
+
+#define ZMADDf(Ari,Air,B,Criir,Ciirr) VMADDf(Ari,B,Criir) VMADDf(Air,B,Ciirr)
+#define ZMADDd(Ari,Air,B,Criir,Ciirr) VMADDd(Ari,B,Criir) VMADDd(Air,B,Ciirr)
+
+#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
+#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)
+
+#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
+  VSHUFMEMf(O,P,tmp) \
+  VMULMEMf(O,P,B,Biirr) \
+  VMULMEMf(O,P,C,Ciirr) \
+  VMULf(tmp,B,Briir) \
+  VMULf(tmp,C,Criir)
+
+#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
+  VSHUFMEMd(O,P,tmp)  \
+  VMULMEMd(O,P,B,Biirr)  \ 
+  VMULMEMd(O,P,C,Ciirr)  \
+  VMULd(tmp,B,Briir)  \
+  VMULd(tmp,C,Criir) 
+
+#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
+  VSHUFMEMf(O,P,tmp) \
+  VMADDMEMf(O,P,B,Biirr) \
+  VMADDMEMf(O,P,C,Ciirr) \
+  VMADDf(tmp,B,Briir) \
+  VMADDf(tmp,C,Criir)
+
+#define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)	\
+  VSHUFMEMd(O,P,tmp) \
+  VMADDMEMd(O,P,B,Biirr) \
+  VMADDMEMd(O,P,C,Ciirr) \
+  VMADDd(tmp,B,Briir) \
+  VMADDd(tmp,C,Criir)
+
+// Merges accumulation for complex dot chain; less efficient under avx512
+#define ZEND1f(Criir,Ciirr, tmp)  "vshufps $0xb1," #Criir "," #Criir "," #tmp   ";\n"\
+                                  "vaddps  " #tmp "," #Criir "," #Criir"{%k6}"  ";\n"
+
+#define ZEND2f(Criir,Ciirr, tmp)  "vshufps $0xb1," #Ciirr "," #Ciirr "," #tmp   ";\n"\
+                                  "vsubps  " #tmp "," #Ciirr "," #Criir"{%k7}"  ";\n"
+
+#define ZEND1d(Criir,Ciirr, tmp)  "vshufpd $0x55," #Criir "," #Criir "," #tmp  ";\n"\ 
+                                  "vaddps  " #tmp "," #Criir "," #Criir"{%k6}"  ";\n"
+
+#define ZEND2d(Criir,Ciirr, tmp)  "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp   ";\n"\
+                         	  "vsubpd  " #tmp "," #Ciirr "," #Criir"{%k7};\n" // ri+ir ; ri+ir,rr-ii
+
+#define VMOVRDUPd(OFF,A,DEST)       "vpshufd  $0x44," #OFF "*64(" #A ")," #DEST  ";\n" // 32 bit level: 1,0,3,2
+#define VMOVIDUPd(OFF,A,DEST)       "vpshufd  $0xee," #OFF "*64(" #A ")," #DEST  ";\n" // 32 bit level: 3,2,3,2
+
+#define VMOVRDUPf(OFF,PTR,DEST)         "vmovsldup " #OFF "*64(" #PTR "), " #DEST  ";\n"
+#define VMOVIDUPf(OFF,PTR,DEST)         "vmovshdup " #OFF "*64(" #PTR "), " #DEST  ";\n"
+
+#define VMADDSUBf(A,B,accum) "vfmaddsub231ps   " #A "," #B "," #accum  ";\n"
+#define VMADDSUBd(A,B,accum) "vfmaddsub231pd   " #A "," #B "," #accum  ";\n"
+
+                                  
+#define VTIMESI0f(A,DEST, Z)   VSHUFf(A,DEST)	  
+#define VTIMESI1f(A,DEST, Z)   "vaddps  " #DEST "," #Z "," #DEST"{%k6}"  ";\n"
+#define VTIMESI2f(A,DEST, Z)   "vsubps  " #DEST "," #Z "," #DEST"{%k7}"  ";\n"
+
+#define VTIMESI0d(A,DEST, Z)   VSHUFd(A,DEST)	 
+#define VTIMESI1d(A,DEST, Z)   "vaddpd  " #DEST "," #Z "," #DEST"{%k6}"  ";\n"
+#define VTIMESI2d(A,DEST, Z)   "vsubpd  " #DEST "," #Z "," #DEST"{%k7}"  ";\n"
+
+#define VTIMESMINUSI0f(A,DEST,Z)  VSHUFf(A,DEST)					
+#define VTIMESMINUSI1f(A,DEST,Z)  "vsubps  " #DEST "," #Z "," #DEST"{%k6}"  ";\n"
+#define VTIMESMINUSI2f(A,DEST,Z)  "vaddps  " #DEST "," #Z "," #DEST"{%k7}"  ";\n"
+
+#define VTIMESMINUSI0d(A,DEST,Z)  VSHUFd(A,DEST)					
+#define VTIMESMINUSI1d(A,DEST,Z)  "vsubpd  " #DEST "," #Z "," #DEST"{%k6}"  ";\n"
+#define VTIMESMINUSI2d(A,DEST,Z)  "vaddpd  " #DEST "," #Z "," #DEST"{%k7}"  ";\n"
+
+#define VACCTIMESMINUSI0f(A,ACC,tmp)  VSHUFf(A,tmp)					
+#define VACCTIMESMINUSI1f(A,ACC,tmp)  "vsubps  " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
+#define VACCTIMESMINUSI2f(A,ACC,tmp)  "vaddps  " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
+
+#define VACCTIMESMINUSI0d(A,ACC,tmp)  VSHUFd(A,tmp)					
+#define VACCTIMESMINUSI1d(A,ACC,tmp)  "vsubpd  " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
+#define VACCTIMESMINUSI2d(A,ACC,tmp)  "vaddpd  " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
+
+#define  VACCTIMESI0f(A,ACC,tmp)  VSHUFf(A,tmp)	
+#define  VACCTIMESI1f(A,ACC,tmp)  "vaddps  " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
+#define  VACCTIMESI2f(A,ACC,tmp)  "vsubps  " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
+
+#define  VACCTIMESI0d(A,ACC,tmp)  VSHUFd(A,tmp)	
+#define  VACCTIMESI1d(A,ACC,tmp)  "vaddpd  " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
+#define  VACCTIMESI2d(A,ACC,tmp)  "vsubpd  " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
+
+#define VPERM0f(A,B) "vshuff32x4  $0x4e," #A "," #B "," #B ";\n"
+#define VPERM1f(A,B) "vshuff32x4  $0xb1," #A "," #B "," #B ";\n"
+#define VPERM2f(A,B) "vshufps     $0x4e," #A "," #B "," #B ";\n"
+#define VPERM3f(A,B) "vshufps     $0xb1," #A "," #B "," #B ";\n"
+
+#define VPERM0d(A,B) "vshuff64x2  $0x4e," #A "," #B "," #B ";\n"
+#define VPERM1d(A,B) "vshuff64x2  $0xb1," #A "," #B "," #B ";\n"
+#define VPERM2d(A,B) "vshufpd     $0x55," #A "," #B "," #B ";\n"
+#define VPERM3d(A,B) VMOVd(A,B)
+
+
+#endif
--- a/lib/simd/Intel512avxAddsub.h
+++ b/lib/simd/Intel512avxAddsub.h
@@ -0,0 +1,92 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/simd/Avx512Asm.h
+
+    Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_ASM_AV512_ADDSUB_H
+#define GRID_ASM_AV512_ADDSUB_H
+
+
+////////////////////////////////////////////////////////////////
+// Building blocks for SU3 x 2spinor
+// Load columns of U
+// 18   U DUP's  rr/ii
+//  6 Chi shuffles ir,ri
+// 6muls, 30 fmaddsubs
+////////////////////////////////////////////////////////////////
+#define MULT_ADDSUB_2SPIN(ptr)	\
+	   LOAD64(%r8,ptr)			\
+  __asm__ (					\
+	   VMOVIDUPf(0,%r8,Z0 ) \
+	   VMOVIDUPf(3,%r8,Z1 )\
+	   VMOVIDUPf(6,%r8,Z2 )\
+           VSHUFf(Chi_00,T1)    \
+           VSHUFf(Chi_10,T2)    \
+				\
+	   VMULf(Z0,T1,UChi_00)	           VMOVRDUPf(0,%r8,Z3 ) \
+	   VMULf(Z0,T2,UChi_10)	           VMOVRDUPf(3,%r8,Z4 ) \
+	   VMULf(Z1,T1,UChi_01)	           VMOVRDUPf(6,%r8,Z5 ) \
+	   VMULf(Z1,T2,UChi_11)	           VMOVIDUPf(1,%r8,Z0 ) \
+	   VMULf(Z2,T1,UChi_02)            VMOVIDUPf(4,%r8,Z1 ) \
+	   VMULf(Z2,T2,UChi_12)	           VMOVIDUPf(7,%r8,Z2 ) \
+	   			\
+	   VMADDSUBf(Z3,Chi_00,UChi_00)    VSHUFf(Chi_01,T1)    \
+	   VMADDSUBf(Z3,Chi_10,UChi_10)    VSHUFf(Chi_11,T2)    \
+	   VMADDSUBf(Z4,Chi_00,UChi_01)    VMOVRDUPf(1,%r8,Z3 ) \
+	   VMADDSUBf(Z4,Chi_10,UChi_11)\
+	   VMADDSUBf(Z5,Chi_00,UChi_02)    VMOVRDUPf(4,%r8,Z4 ) \
+	   VMADDSUBf(Z5,Chi_10,UChi_12)\
+	   			       \
+	   VMADDSUBf(Z0,T1,UChi_00) 	   VMOVRDUPf(7,%r8,Z5 ) \ 
+	   VMADDSUBf(Z0,T2,UChi_10)\
+	   VMADDSUBf(Z1,T1,UChi_01) 	   VMOVIDUPf(2,%r8,Z0 ) \
+	   VMADDSUBf(Z1,T2,UChi_11)\
+	   VMADDSUBf(Z2,T1,UChi_02)        VMOVIDUPf(5,%r8,Z1 ) \
+	   VMADDSUBf(Z2,T2,UChi_12)        VMOVIDUPf(8,%r8,Z2 ) \
+					   			\
+	   VMADDSUBf(Z3,Chi_01,UChi_00)    VSHUFf(Chi_02,T1)    \
+	   VMADDSUBf(Z3,Chi_11,UChi_10)    VSHUFf(Chi_12,T2)    \
+	   VMADDSUBf(Z4,Chi_01,UChi_01)	   VMOVRDUPf(2,%r8,Z3 ) \
+	   VMADDSUBf(Z4,Chi_11,UChi_11)\
+	   VMADDSUBf(Z5,Chi_01,UChi_02)	   VMOVRDUPf(5,%r8,Z4 ) \
+	   VMADDSUBf(Z5,Chi_11,UChi_12)\
+	   			\
+	   VMADDSUBf(Z0,T1,UChi_00) 	   VMOVRDUPf(8,%r8,Z5 ) \
+	   VMADDSUBf(Z0,T2,UChi_10)\
+	   VMADDSUBf(Z1,T1,UChi_01)\
+	   VMADDSUBf(Z1,T2,UChi_11)\
+	   VMADDSUBf(Z2,T1,UChi_02)\
+	   VMADDSUBf(Z2,T2,UChi_12)\
+		   		   \
+	   VMADDSUBf(Z3,Chi_02,UChi_00)\
+	   VMADDSUBf(Z3,Chi_12,UChi_10)\
+	   VMADDSUBf(Z4,Chi_02,UChi_01)\
+	   VMADDSUBf(Z4,Chi_12,UChi_11)\
+	   VMADDSUBf(Z5,Chi_02,UChi_02)\
+	   VMADDSUBf(Z5,Chi_12,UChi_12)\
+						);
+
+
+#endif
--- a/lib/simd/Intel512common.h
+++ b/lib/simd/Intel512common.h
@@ -0,0 +1,140 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/simd/Avx512Asm.h
+
+    Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_ASM_INTEL_COMMON_512_H
+#define GRID_ASM_INTEL_COMMON_512_H
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Opcodes common 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+#define MASK_REGS \
+  __asm__ ("mov     $0xAAAA, %%eax \n"\ 
+           "kmovw    %%eax, %%k6 \n"\
+           "mov     $0x5555, %%eax \n"\
+           "kmovw    %%eax, %%k7 \n" : : : "%eax");
+
+#define VZEROf(A)       "vpxorq " #A ","  #A "," #A ";\n"
+#define VZEROd(A)       "vpxorq " #A ","  #A "," #A ";\n"
+
+#define VTIMESIf(A,DEST, Z) \
+  VTIMESI0f(A,DEST, Z) \
+  VTIMESI1f(A,DEST, Z) \
+  VTIMESI2f(A,DEST, Z) 
+
+#define VTIMESId(A,DEST, Z) \
+  VTIMESI0d(A,DEST, Z) \
+  VTIMESI1d(A,DEST, Z) \
+  VTIMESI2d(A,DEST, Z) 
+
+#define VTIMESMINUSIf(A,DEST, Z) \
+        VTIMESMINUSI0f(A,DEST, Z) \
+        VTIMESMINUSI1f(A,DEST, Z) \
+        VTIMESMINUSI2f(A,DEST, Z) 
+
+#define VTIMESMINUSId(A,DEST, Z) \
+        VTIMESMINUSI0d(A,DEST, Z) \
+        VTIMESMINUSI1d(A,DEST, Z) \
+        VTIMESMINUSI2d(A,DEST, Z) 
+
+#define VACCTIMESIf(A,ACC,tmp)			\
+ VACCTIMESI0f(A,ACC,tmp)			\
+ VACCTIMESI1f(A,ACC,tmp)			\
+ VACCTIMESI2f(A,ACC,tmp)			
+
+#define VACCTIMESId(A,ACC,tmp)			\
+ VACCTIMESI0d(A,ACC,tmp)			\
+ VACCTIMESI1d(A,ACC,tmp)			\
+ VACCTIMESI2d(A,ACC,tmp)			
+
+#define VACCTIMESMINUSIf(A,ACC,tmp)			\
+  VACCTIMESMINUSI0f(A,ACC,tmp)				\
+  VACCTIMESMINUSI1f(A,ACC,tmp)				\
+  VACCTIMESMINUSI2f(A,ACC,tmp)			
+
+#define VACCTIMESMINUSId(A,ACC,tmp)			\
+  VACCTIMESMINUSI0d(A,ACC,tmp)				\
+  VACCTIMESMINUSI1d(A,ACC,tmp)				\
+  VACCTIMESMINUSI2d(A,ACC,tmp)			
+
+#define LOAD64i(A,ptr)  __asm__ ( "movq %0, %" #A :  : "r"(ptr)  : #A  );
+#define LOAD64(A,ptr)  LOAD64i(A,ptr)
+
+#define VMOVf(A,DEST)   "vmovaps  " #A ", " #DEST  ";\n"
+#define VMOVd(A,DEST)   "vmovapd  " #A ", " #DEST  ";\n"
+
+#define VPREFETCHG(O,A) 
+#define VPREFETCHW(O,A) 
+#define VEVICT(O,A)   
+
+//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
+//  "clevict0 "#O"*64("#A");\n" 
+
+#define VLOADf(OFF,PTR,DEST)   "vmovaps  " #OFF "*64(" #PTR "), " #DEST  ";\n"
+#define VLOADd(OFF,PTR,DEST)   "vmovapd  " #OFF "*64(" #PTR "), " #DEST  ";\n"
+
+#define VADDf(A,B,DEST)        "vaddps   " #A "," #B "," #DEST  ";\n"
+#define VADDd(A,B,DEST)        "vaddpd   " #A "," #B "," #DEST  ";\n"
+
+#define VSUBf(A,B,DEST)        "vsubps   " #A "," #B "," #DEST  ";\n"
+#define VSUBd(A,B,DEST)        "vsubpd   " #A "," #B "," #DEST  ";\n"
+
+#define VADDMEMf(O,A,B,DEST)        "vaddps   "#O"*64("#A ")," #B "," #DEST  ";\n"
+#define VADDMEMd(O,A,B,DEST)        "vaddpd   "#O"*64("#A ")," #B "," #DEST  ";\n"
+
+#define VSUBMEMf(O,A,B,DEST)        "vsubps   "#O"*64("#A ")," #B "," #DEST  ";\n"
+#define VSUBMEMd(O,A,B,DEST)        "vsubpd   "#O"*64("#A ")," #B "," #DEST  ";\n"
+
+#define VMULf(A,B,DEST)        "vmulps   " #A "," #B "," #DEST  ";\n"
+#define VMULd(A,B,DEST)        "vmulpd   " #A "," #B "," #DEST  ";\n"
+
+#define VMADDf(A,B,DEST)       "vfmadd231ps   " #A "," #B "," #DEST  ";\n"
+#define VMADDd(A,B,DEST)       "vfmadd231pd   " #A "," #B "," #DEST  ";\n"
+
+#define VMULMEMf(O,A,B,DEST)   "vmulps   " #O"*64("#A ")," #B "," #DEST  ";\n"
+#define VMULMEMd(O,A,B,DEST)   "vmulpd   " #O"*64("#A ")," #B "," #DEST  ";\n"
+
+#define VMADDMEMf(O,A,B,DEST)       "vfmadd231ps   " #O"*64("#A "),"#B "," #DEST  ";\n"
+#define VMADDMEMd(O,A,B,DEST)       "vfmadd231pd   " #O"*64("#A "),"#B "," #DEST  ";\n"
+
+#define ZLOADf(OFF,PTR,ri,ir)  VLOADf(OFF,PTR,ir)  VSHUFf(ir,ri)
+#define ZLOADd(OFF,PTR,ri,ir)  VLOADd(OFF,PTR,ir)  VSHUFd(ir,ri)
+
+#define VPREFETCHNTA(O,A) 
+#define VPREFETCH(O,A)    
+
+#define VSTOREf(OFF,PTR,SRC)   "vmovaps " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
+#define VSTOREd(OFF,PTR,SRC)   "vmovapd " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
+
+// Swaps Re/Im ; could unify this with IMCI
+#define VSHUFd(A,DEST)         "vpshufd  $0x4e," #A "," #DEST  ";\n"    
+#define VSHUFf(A,DEST)         "vpshufd  $0xb1," #A "," #DEST  ";\n"    
+#define VSHUFMEMd(OFF,A,DEST)  "vpshufd  $0x4e, " #OFF"*64("#A ")," #DEST  ";\n" // 32 bit level: 1,0,3,2
+#define VSHUFMEMf(OFF,A,DEST)  "vpshufd  $0xb1, " #OFF"*64("#A ")," #DEST  ";\n" // 32 bit level: 2,3,0,1
+
+#define TRAP " int3 ;\n"
+
+#endif
--- a/lib/simd/Intel512double.h
+++ b/lib/simd/Intel512double.h
@@ -0,0 +1,135 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/simd/Avx512Asm.h
+
+    Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+// No guard can be multiply included as undef clearage
+#undef VZERO
+#undef VMOV
+#undef VLOAD
+#undef VSTORE
+#define VZERO(A)                  VZEROd(A)
+#define VMOV(A,B)                 VMOVd(A,B)
+#define VLOAD(OFF,PTR,DEST)       VLOADd(OFF,PTR,DEST)
+#define VSTORE(OFF,PTR,SRC)       VSTOREd(OFF,PTR,SRC)
+
+#undef VADD
+#undef VSUB
+#undef VMUL
+#undef VMADD
+#define VADD(A,B,C)               VADDd(A,B,C)
+#define VSUB(A,B,C)               VSUBd(A,B,C)
+#define VMUL(Uri,Uir,Chi)         VMULd(Uri,Uir,Chi)
+#define VMADD(Uri,Uir,Chi)        VMADDd(Uri,Uir,Chi)
+
+
+#undef VTIMESI
+#undef VTIMESI0 
+#undef VTIMESI1
+#undef VTIMESI2 
+#define VTIMESI(A,B,C)                 VTIMESId(A,B,C)
+#define VTIMESI0(A,B,C)                VTIMESI0d(A,B,C)
+#define VTIMESI1(A,B,C)                VTIMESI1d(A,B,C)
+#define VTIMESI2(A,B,C)                VTIMESI2d(A,B,C)
+
+#undef VTIMESMINUSI
+#undef VTIMESMINUSI0
+#undef VTIMESMINUSI1
+#undef VTIMESMINUSI2
+#define VTIMESMINUSI(A,B,C)            VTIMESMINUSId(A,B,C)
+#define VTIMESMINUSI0(A,B,C)           VTIMESMINUSI0d(A,B,C)
+#define VTIMESMINUSI1(A,B,C)           VTIMESMINUSI1d(A,B,C)
+#define VTIMESMINUSI2(A,B,C)           VTIMESMINUSI2d(A,B,C)
+
+#undef VACCTIMESI
+#undef VACCTIMESI0
+#undef VACCTIMESI1
+#undef VACCTIMESI2
+#define VACCTIMESI(A,B,C)         VACCTIMESId(A,B,C)
+#define VACCTIMESI0(A,B,C)             VACCTIMESI0d(A,B,C)
+#define VACCTIMESI1(A,B,C)             VACCTIMESI1d(A,B,C)
+#define VACCTIMESI2(A,B,C)             VACCTIMESI2d(A,B,C)
+
+#undef VACCTIMESMINUSI
+#undef VACCTIMESMINUSI0
+#undef VACCTIMESMINUSI1
+#undef VACCTIMESMINUSI2
+#define VACCTIMESMINUSI(A,B,C)    VACCTIMESMINUSId(A,B,C)
+#define VACCTIMESMINUSI0(A,B,C)        VACCTIMESMINUSI0d(A,B,C)
+#define VACCTIMESMINUSI1(A,B,C)        VACCTIMESMINUSI1d(A,B,C)
+#define VACCTIMESMINUSI2(A,B,C)        VACCTIMESMINUSI2d(A,B,C)
+
+#undef VACCTIMESI1MEM
+#undef VACCTIMESI2MEM
+#define VACCTIMESI1MEM(A,ACC,O,P)      VACCTIMESI1MEMd(A,ACC,O,P)
+#define VACCTIMESI2MEM(A,ACC,O,P)      VACCTIMESI2MEMd(A,ACC,O,P)
+
+#undef VACCTIMESMINUSI1MEM
+#undef VACCTIMESMINUSI2MEM
+#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMd(A,ACC,O,P)
+#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMd(A,ACC,O,P)
+
+#undef VPERM0
+#undef VPERM1
+#undef VPERM2
+#undef VPERM3
+#define VPERM0(A,B)               VPERM0d(A,B)
+#define VPERM1(A,B)               VPERM1d(A,B)
+#define VPERM2(A,B)               VPERM2d(A,B)
+#define VPERM3(A,B)               VPERM3d(A,B)
+
+#undef VSHUFMEM
+#undef VADDMEM
+#undef VSUBMEM
+#define VSHUFMEM(OFF,A,DEST)      VSHUFMEMd(OFF,A,DEST)
+#define VADDMEM(O,A,B,C)                                 VADDMEMd(O,A,B,C)
+#define VSUBMEM(O,A,B,C)                                 VSUBMEMd(O,A,B,C)
+
+#undef VMOVIDUP
+#undef VMOVRDUP
+#undef VMADDSUB
+#undef VSHUF
+#define VMOVIDUP(A,B,C)                                  VMOVIDUPd(A,B,C)
+#define VMOVRDUP(A,B,C)                                  VMOVRDUPd(A,B,C)
+#define VMADDSUB(A,B,accum)                              VMADDSUBd(A,B,accum) 
+#define VSHUF(A,B)                                       VSHUFd(A,B)
+
+
+#undef ZEND1
+#undef ZEND2
+#undef ZLOAD
+#undef ZMUL
+#undef ZMADD
+#undef ZMULMEM2SP
+#undef ZMADDMEM2SP
+
+#define ZEND1(A,B,C)                                     ZEND1d(A,B,C)
+#define ZEND2(A,B,C)                                     ZEND2d(A,B,C)
+#define ZLOAD(A,B,C,D)                                   ZLOADd(A,B,C,D)
+#define ZMUL(A,B,C,D,E)                                  ZMULd(A,B,C,D,E)
+#define ZMADD(A,B,C,D,E)                                 ZMADDd(A,B,C,D,E)
+#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
+#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
+
--- a/lib/simd/Intel512imci.h
+++ b/lib/simd/Intel512imci.h
@@ -0,0 +1,127 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/simd/Avx512Asm.h
+
+    Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_ASM_AV512_H
+#define GRID_ASM_AV512_H
+
+////////////////////////////////////////////////////////////	  
+// Knights Corner specials
+////////////////////////////////////////////////////////////	  
+
+#define ZLOADf(OFF,PTR,ri,ir)  VLOADf(OFF,PTR,ir)  VSHUFf(ir,ri)
+#define ZLOADd(OFF,PTR,ri,ir)  VLOADd(OFF,PTR,ir)  VSHUFd(ir,ri)
+
+#define ZMULf(Ari,Air,B,Criir,Ciirr)  VMULf(Ari,B,Criir)  VMULf(Air,B,Ciirr)
+#define ZMULd(Ari,Air,B,Criir,Ciirr)  VMULd(Ari,B,Criir)  VMULd(Air,B,Ciirr)
+
+#define ZMADDf(Ari,Air,B,Criir,Ciirr) VMADDf(Ari,B,Criir) VMADDf(Air,B,Ciirr)
+#define ZMADDd(Ari,Air,B,Criir,Ciirr) VMADDd(Ari,B,Criir) VMADDd(Air,B,Ciirr)
+
+#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
+#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)
+
+#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
+  VSHUFMEMf(O,P,tmp) \
+  VMULMEMf(O,P,B,Biirr) \
+  VMULMEMf(O,P,C,Ciirr) \
+  VMULf(tmp,B,Briir) \
+  VMULf(tmp,C,Criir)
+
+#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
+  VSHUFMEMd(O,P,tmp)  \
+  VMULMEMd(O,P,B,Biirr)  \ 
+  VMULMEMd(O,P,C,Ciirr)  \
+  VMULd(tmp,B,Briir)  \
+  VMULd(tmp,C,Criir) 
+
+#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
+  VSHUFMEMf(O,P,tmp) \
+  VMADDMEMf(O,P,B,Biirr) \
+  VMADDMEMf(O,P,C,Ciirr) \
+  VMADDf(tmp,B,Briir) \
+  VMADDf(tmp,C,Criir)
+
+#define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)	\
+  VSHUFMEMd(O,P,tmp) \
+  VMADDMEMd(O,P,B,Biirr) \
+  VMADDMEMd(O,P,C,Ciirr) \
+  VMADDd(tmp,B,Briir) \
+  VMADDd(tmp,C,Criir)
+
+#define ZEND1d(Criir,Ciirr, tmp) "vaddpd  " #Criir "{cdab} ," #Criir "," #Criir"{%k6}"  ";\n"
+#define ZEND2d(Criir,Ciirr, tmp) "vsubpd  " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}"  ";\n"
+
+#define ZEND1f(Criir,Ciirr, tmp) "vaddps  " #Criir "{cdab} ," #Criir "," #Criir"{%k6}"  ";\n"
+#define ZEND2f(Criir,Ciirr, tmp) "vsubps  " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}"  ";\n"
+
+#define VTIMESI0f(A,DEST, Z)   
+#define VTIMESI1f(A,DEST, Z)   "vaddps  " #A "{cdab}," #Z "," #DEST"{%k7}"  ";\n"
+#define VTIMESI2f(A,DEST, Z)   "vsubps  " #A "{cdab}," #Z "," #DEST"{%k6}"  ";\n"
+
+#define VTIMESI0d(A,DEST, Z)   
+#define VTIMESI1d(A,DEST, Z)   "vaddpd  " #A "{cdab}," #Z "," #DEST"{%k7}"  ";\n"
+#define VTIMESI2d(A,DEST, Z)   "vsubpd  " #A "{cdab}," #Z "," #DEST"{%k6}"  ";\n"
+
+#define VTIMESMINUSI0f(A,DEST,Z)  
+#define VTIMESMINUSI1f(A,DEST,Z)  "vsubps  " #A "{cdab}," #Z "," #DEST"{%k7}"  ";\n"
+#define VTIMESMINUSI2f(A,DEST,Z)  "vaddps  " #A "{cdab}," #Z "," #DEST"{%k6}"  ";\n"
+
+#define VTIMESMINUSI0d(A,DEST,Z)  
+#define VTIMESMINUSI1d(A,DEST,Z)  "vsubpd  " #A "{cdab}," #Z "," #DEST"{%k7}"  ";\n"
+#define VTIMESMINUSI2d(A,DEST,Z)  "vaddpd  " #A "{cdab}," #Z "," #DEST"{%k6}"  ";\n"
+
+#define  VACCTIMESI0f(A,ACC,tmp)
+#define  VACCTIMESI1f(A,ACC,tmp)  "vaddps  " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
+#define  VACCTIMESI2f(A,ACC,tmp)  "vsubps  " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
+
+#define  VACCTIMESI0d(A,ACC,tmp)
+#define  VACCTIMESI1d(A,ACC,tmp)  "vaddpd  " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
+#define  VACCTIMESI2d(A,ACC,tmp)  "vsubpd  " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
+
+#define VACCTIMESMINUSI0f(A,ACC,tmp)  
+#define VACCTIMESMINUSI1f(A,ACC,tmp)  "vsubps  " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
+#define VACCTIMESMINUSI2f(A,ACC,tmp)  "vaddps  " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
+
+	   // Acc = Acc - i A
+#define VACCTIMESMINUSI0d(A,ACC,tmp)  
+#define VACCTIMESMINUSI1d(A,ACC,tmp)  "vsubpd  " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
+#define VACCTIMESMINUSI2d(A,ACC,tmp)  "vaddpd  " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
+
+//((1<<6)|(0<<4)|(3<<2)|(2)) == 0100,1110 = 0x4e
+//((2<<6)|(3<<4)|(0<<2)|(1)) == 1011,0001 = 0xb1
+
+#define VPERM0f(A,B) "vpermf32x4  $0x4e," #A "," #B ";\n"
+#define VPERM1f(A,B) "vpermf32x4  $0xb1," #A "," #B ";\n"
+#define VPERM2f(A,B) "vmovaps     " #A "{badc}," #B ";\n"
+#define VPERM3f(A,B) "vmovaps     " #A "{cdab}," #B ";\n"
+
+#define VPERM0d(A,B) "vpermf32x4  $0x4e," #A "," #B ";\n"
+#define VPERM1d(A,B) "vmovapd     " #A "{badc}," #B ";\n"
+#define VPERM2d(A,B) "vmovapd     " #A "{cdab}," #B ";\n"
+#define VPERM3d(A,B) VMOVd(A,B)
+
+#endif
--- a/lib/simd/Intel512single.h
+++ b/lib/simd/Intel512single.h
@@ -0,0 +1,135 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/simd/Avx512Asm.h
+
+    Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+// No guard can be multiply included as undef clearge of macros
+#undef VZERO
+#undef VMOV
+#undef VLOAD
+#undef VSTORE
+#define VZERO(A)                  VZEROf(A)
+#define VMOV(A,B)                 VMOVf(A,B)
+#define VLOAD(OFF,PTR,DEST)       VLOADf(OFF,PTR,DEST)
+#define VSTORE(OFF,PTR,SRC)       VSTOREf(OFF,PTR,SRC)
+
+#undef VADD
+#undef VSUB
+#undef VMUL
+#undef VMADD
+#define VADD(A,B,C)               VADDf(A,B,C)
+#define VSUB(A,B,C)               VSUBf(A,B,C)
+#define VMUL(Uri,Uir,Chi)         VMULf(Uri,Uir,Chi)
+#define VMADD(Uri,Uir,Chi)        VMADDf(Uri,Uir,Chi)
+
+
+#undef VTIMESI
+#undef VTIMESI0 
+#undef VTIMESI1
+#undef VTIMESI2 
+#define VTIMESI(A,B,C)                 VTIMESIf(A,B,C)
+#define VTIMESI0(A,B,C)                VTIMESI0f(A,B,C)
+#define VTIMESI1(A,B,C)                VTIMESI1f(A,B,C)
+#define VTIMESI2(A,B,C)                VTIMESI2f(A,B,C)
+
+#undef VTIMESMINUSI
+#undef VTIMESMINUSI0
+#undef VTIMESMINUSI1
+#undef VTIMESMINUSI2
+#define VTIMESMINUSI(A,B,C)            VTIMESMINUSIf(A,B,C)
+#define VTIMESMINUSI0(A,B,C)           VTIMESMINUSI0f(A,B,C)
+#define VTIMESMINUSI1(A,B,C)           VTIMESMINUSI1f(A,B,C)
+#define VTIMESMINUSI2(A,B,C)           VTIMESMINUSI2f(A,B,C)
+
+#undef VACCTIMESI
+#undef VACCTIMESI0
+#undef VACCTIMESI1
+#undef VACCTIMESI2
+#define VACCTIMESI(A,B,C)         VACCTIMESIf(A,B,C)
+#define VACCTIMESI0(A,B,C)             VACCTIMESI0f(A,B,C)
+#define VACCTIMESI1(A,B,C)             VACCTIMESI1f(A,B,C)
+#define VACCTIMESI2(A,B,C)             VACCTIMESI2f(A,B,C)
+
+#undef VACCTIMESMINUSI
+#undef VACCTIMESMINUSI0
+#undef VACCTIMESMINUSI1
+#undef VACCTIMESMINUSI2
+#define VACCTIMESMINUSI(A,B,C)    VACCTIMESMINUSIf(A,B,C)
+#define VACCTIMESMINUSI0(A,B,C)        VACCTIMESMINUSI0f(A,B,C)
+#define VACCTIMESMINUSI1(A,B,C)        VACCTIMESMINUSI1f(A,B,C)
+#define VACCTIMESMINUSI2(A,B,C)        VACCTIMESMINUSI2f(A,B,C)
+
+#undef VACCTIMESI1MEM
+#undef VACCTIMESI2MEM
+#define VACCTIMESI1MEM(A,ACC,O,P)      VACCTIMESI1MEMf(A,ACC,O,P)
+#define VACCTIMESI2MEM(A,ACC,O,P)      VACCTIMESI2MEMf(A,ACC,O,P)
+
+#undef VACCTIMESMINUSI1MEM
+#undef VACCTIMESMINUSI2MEM
+#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
+#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
+
+#undef VPERM0
+#undef VPERM1
+#undef VPERM2
+#undef VPERM3
+#define VPERM0(A,B)               VPERM0f(A,B)
+#define VPERM1(A,B)               VPERM1f(A,B)
+#define VPERM2(A,B)               VPERM2f(A,B)
+#define VPERM3(A,B)               VPERM3f(A,B)
+
+#undef VSHUFMEM
+#undef VADDMEM
+#undef VSUBMEM
+#define VSHUFMEM(OFF,A,DEST)      VSHUFMEMf(OFF,A,DEST)
+#define VADDMEM(O,A,B,C)                                 VADDMEMf(O,A,B,C)
+#define VSUBMEM(O,A,B,C)                                 VSUBMEMf(O,A,B,C)
+
+#undef VMOVIDUP
+#undef VMOVRDUP
+#undef VMADDSUB
+#undef VSHUF
+#define VMOVIDUP(A,B,C)                                  VMOVIDUPf(A,B,C)
+#define VMOVRDUP(A,B,C)                                  VMOVRDUPf(A,B,C)
+#define VMADDSUB(A,B,accum)                              VMADDSUBf(A,B,accum) 
+#define VSHUF(A,B)                                       VSHUFf(A,B)
+
+
+#undef ZEND1
+#undef ZEND2
+#undef ZLOAD
+#undef ZMUL
+#undef ZMADD
+#undef ZMULMEM2SP
+#undef ZMADDMEM2SP
+
+#define ZEND1(A,B,C)                                     ZEND1f(A,B,C)
+#define ZEND2(A,B,C)                                     ZEND2f(A,B,C)
+#define ZLOAD(A,B,C,D)                                   ZLOADf(A,B,C,D)
+#define ZMUL(A,B,C,D,E)                                  ZMULf(A,B,C,D,E)
+#define ZMADD(A,B,C,D,E)                                 ZMADDf(A,B,C,D,E)
+#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
+#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
+
--- a/lib/simd/Intel512wilson.h
+++ b/lib/simd/Intel512wilson.h
@@ -25,13 +25,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#ifndef GRID_ASM_AV512_H
-#define GRID_ASM_AV512_H
-
-// Serialisation elimination: 
-// i)  ZEND -> ZEND1, ZEND2, 6 fold round robin.
-// ii) TimesI -> TimesI_1, TimesI_2, 6 fold round robin
-//
+#ifndef GRID_ASM_INTEL_512_QCD_H
+#define GRID_ASM_INTEL_512_QCD_H

 //////////////////////////////////////////////////////////////////////////////////////////
 // Register allocations for Wilson Kernel are precision and IMCI/AVX512 indept
@@ -69,7 +64,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define UChi_12 %zmm23 

 #define Uir %zmm24 
+//#define ONE %zmm24 
 #define Uri %zmm25  
+#define T1 %zmm24
+#define T2 %zmm25

 #define Z0 %zmm26
 #define Z1 %zmm27
@@ -93,319 +91,29 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define Chimu_31 UChi_11
 #define Chimu_32 UChi_12

-//////////////////////////////////////////////////////////////////////////////////////////
-// CONFIG IMCI/AVX512
-//////////////////////////////////////////////////////////////////////////////////////////
-
-#define ASM_IMCI
-#undef  ASM_AVX512
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Opcodes common to AVX512 and IMCI
-////////////////////////////////////////////////////////////////////////////////////////////////////
-#define MASK_REGS \
-  __asm__ ("mov     $0xAAAA, %%eax \n"\
-	   "kmov    %%eax, %%k6 \n"\
-	   "knot     %%k6, %%k7 \n" : : : "%eax");
-
-#define VZEROf(A)       "vpxorq " #A ","  #A "," #A ";\n"
-#define VZEROd(A)       "vpxorq " #A ","  #A "," #A ";\n"
-
-#define VTIMESIf(A,DEST, Z) \
-  VTIMESI0f(A,DEST, Z) \
-  VTIMESI1f(A,DEST, Z) \
-  VTIMESI2f(A,DEST, Z) 
-
-#define VTIMESId(A,DEST, Z) \
-  VTIMESI0d(A,DEST, Z) \
-  VTIMESI1d(A,DEST, Z) \
-  VTIMESI2d(A,DEST, Z) 
-
-#define VTIMESMINUSIf(A,DEST, Z) \
-        VTIMESMINUSI0f(A,DEST, Z) \
-        VTIMESMINUSI1f(A,DEST, Z) \
-        VTIMESMINUSI2f(A,DEST, Z) 
-
-#define VTIMESMINUSId(A,DEST, Z) \
-        VTIMESMINUSI0d(A,DEST, Z) \
-        VTIMESMINUSI1d(A,DEST, Z) \
-        VTIMESMINUSI2d(A,DEST, Z) 
-
-#define VACCTIMESIf(A,ACC,tmp)			\
- VACCTIMESI0f(A,ACC,tmp)			\
- VACCTIMESI1f(A,ACC,tmp)			\
- VACCTIMESI2f(A,ACC,tmp)			
-
-#define  VACCTIMESI1MEMf(A,ACC,O,P)  "vaddps  " #O"*64("#P"),"#A "," #ACC"{%k7}" ";\n"
-#define  VACCTIMESI2MEMf(A,ACC,O,P)  "vsubrps  " #O"*64("#P"),"#A "," #ACC"{%k6}" ";\n"
-#define  VACCTIMESMINUSI1MEMf(A,ACC,O,P)  "vsubrps  " #O"*64("#P"),"#A "," #ACC"{%k7}" ";\n"
-#define  VACCTIMESMINUSI2MEMf(A,ACC,O,P)  "vaddps  " #O"*64("#P"),"#A "," #ACC"{%k6}" ";\n"
-
-#define VACCTIMESId(A,ACC,tmp)			\
- VACCTIMESI0d(A,ACC,tmp)			\
- VACCTIMESI1d(A,ACC,tmp)			\
- VACCTIMESI2d(A,ACC,tmp)			
-
-#define VACCTIMESMINUSIf(A,ACC,tmp)			\
-  VACCTIMESMINUSI0f(A,ACC,tmp)				\
-  VACCTIMESMINUSI1f(A,ACC,tmp)				\
-  VACCTIMESMINUSI2f(A,ACC,tmp)			
-
-#define VACCTIMESMINUSId(A,ACC,tmp)			\
-  VACCTIMESMINUSI0d(A,ACC,tmp)				\
-  VACCTIMESMINUSI1d(A,ACC,tmp)				\
-  VACCTIMESMINUSI2d(A,ACC,tmp)			
-
-#define LOAD64i(A,ptr)  __asm__ ( "movq %0, %" #A :  : "r"(ptr)  : #A  );
-#define LOAD64(A,ptr)          LOAD64i(A,ptr)
-
-#define VMOVf(A,DEST)   "vmovaps  " #A ", " #DEST  ";\n"
-#define VMOVd(A,DEST)   "vmovapd  " #A ", " #DEST  ";\n"
-
-// Field prefetch
-#define VPREFETCHNTA(O,A) "vprefetchnta "#O"*64("#A");\n" "vprefetch1 ("#O"+12)*64("#A");\n"
-#define VPREFETCH(O,A)    "vprefetch0 "#O"*64("#A");\n" "vprefetch1 ("#O"+12)*64("#A");\n"
-#define VPREFETCHG(O,A) 
-#define VPREFETCHW(O,A) 
-//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
-#define VEVICT(O,A)   
-//  "clevict0 "#O"*64("#A");\n" 
-
-#define VLOADf(OFF,PTR,DEST)   "vmovaps  " #OFF "*64(" #PTR "), " #DEST  ";\n"
-#define VLOADd(OFF,PTR,DEST)   "vmovapd  " #OFF "*64(" #PTR "), " #DEST  ";\n"
-
-#define VADDf(A,B,DEST)        "vaddps   " #A "," #B "," #DEST  ";\n"
-#define VADDd(A,B,DEST)        "vaddpd   " #A "," #B "," #DEST  ";\n"
-
-#define VSUBf(A,B,DEST)        "vsubps   " #A "," #B "," #DEST  ";\n"
-#define VSUBd(A,B,DEST)        "vsubpd   " #A "," #B "," #DEST  ";\n"
-
-#define VADDMEMf(O,A,B,DEST)        "vaddps   "#O"*64("#A ")," #B "," #DEST  ";\n"
-#define VADDMEMd(O,A,B,DEST)        "vaddpd   "#O"*64("#A ")," #B "," #DEST  ";\n"
-
-#define VSUBMEMf(O,A,B,DEST)        "vsubps   "#O"*64("#A ")," #B "," #DEST  ";\n"
-#define VSUBMEMd(O,A,B,DEST)        "vsubpd   "#O"*64("#A ")," #B "," #DEST  ";\n"
-
-#define VMULf(A,B,DEST)        "vmulps   " #A "," #B "," #DEST  ";\n"
-#define VMULd(A,B,DEST)        "vmulpd   " #A "," #B "," #DEST  ";\n"
-
-#define VMADDf(A,B,DEST)       "vfmadd231ps   " #A "," #B "," #DEST  ";\n"
-#define VMADDd(A,B,DEST)       "vfmadd231pd   " #A "," #B "," #DEST  ";\n"
-
-#define VMULMEMf(O,A,B,DEST)   "vmulps   " #O"*64("#A ")," #B "," #DEST  ";\n"
-#define VMULMEMd(O,A,B,DEST)   "vmulpd   " #O"*64("#A ")," #B "," #DEST  ";\n"
-
-#define VMADDMEMf(O,A,B,DEST)       "vfmadd231ps   " #O"*64("#A "),"#B "," #DEST  ";\n"
-#define VMADDMEMd(O,A,B,DEST)       "vfmadd231pd   " #O"*64("#A "),"#B "," #DEST  ";\n"
-
-#define ZLOADf(OFF,PTR,ri,ir)  VLOADf(OFF,PTR,ir)  VSHUFf(ir,ri)
-#define ZLOADd(OFF,PTR,ri,ir)  VLOADd(OFF,PTR,ir)  VSHUFd(ir,ri)
-
-#define ZMULf(Ari,Air,B,Criir,Ciirr)  VMULf(Ari,B,Criir)  VMULf(Air,B,Ciirr)
-#define ZMULd(Ari,Air,B,Criir,Ciirr)  VMULd(Ari,B,Criir)  VMULd(Air,B,Ciirr)
-
-#define ZMADDf(Ari,Air,B,Criir,Ciirr) VMADDf(Ari,B,Criir) VMADDf(Air,B,Ciirr)
-#define ZMADDd(Ari,Air,B,Criir,Ciirr) VMADDd(Ari,B,Criir) VMADDd(Air,B,Ciirr)
-
-#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
-#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)
-
-// Need VSHUFMULMEMf,d for KNC
-// AVX512 friendly
-#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
-  VSHUFMEMf(O,P,tmp) \
-  VMULMEMf(O,P,B,Biirr) \
-  VMULMEMf(O,P,C,Ciirr) \
-  VMULf(tmp,B,Briir) \
-  VMULf(tmp,C,Criir)
-
-#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
-  VSHUFMEMd(O,P,tmp)  \
-  VMULMEMd(O,P,B,Biirr)  \ 
-  VMULMEMd(O,P,C,Ciirr)  \
-  VMULd(tmp,B,Briir)  \
-  VMULd(tmp,C,Criir) 
-
-#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
-  VSHUFMEMf(O,P,tmp) \
-  VMADDMEMf(O,P,B,Biirr) \
-  VMADDMEMf(O,P,C,Ciirr) \
-  VMADDf(tmp,B,Briir) \
-  VMADDf(tmp,C,Criir)
-
-#define TRAP " int3 ;\n"
-
-#define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)	\
-  VSHUFMEMd(O,P,tmp) \
-  VMADDMEMd(O,P,B,Biirr) \
-  VMADDMEMd(O,P,C,Ciirr) \
-  VMADDd(tmp,B,Briir) \
-  VMADDd(tmp,C,Criir)
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Lane swizzling changed between AVX512 and IMCI and requires arch dependent complex support
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// AVX512 special (Knights Landing)
-#ifdef ASM_AVX512
-#define VSTOREf(OFF,PTR,SRC)   "vmovntps " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
-#define VSTOREd(OFF,PTR,SRC)   "vmovntpd " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
-// Swaps Re/Im
-#define VSHUFd(A,DEST)         "vshufpd  $0x5, " #A "," #A "," #DEST  ";\n"
-#define VSHUFf(A,DEST)         "vshufps  $0x55," #A "," #A "," #DEST  ";\n"
-// Memops are useful for optimisation
-#define VSHUFMEMd(OFF,A,DEST)       "vpshufpd  $0x4e, " #OFF"("#A ")," #DEST  ";\n"
-#define VSHUFMEMf(OFF,A,DEST)       "vpshufps  $0xb1, " #OFF"("#A ")," #DEST  ";\n"
-
-
-// Merges accumulation for complex dot chain
-// TODO:  12 operation saving:
-//	# could SWIZ op 18{cdab} and eliminate temporary // 12cycles
-//	# no use KNL though. Fingour something else there.
-//	# All swizzles become perms ops, but gain addsub; subadd must use this
-//	# uint32_t (0x7F << 23 )
-//	# uint64_t (0x3FF<< 52 ) ; vpbroadcast
-#define ZEND1f(Criir,Ciirr, tmp)	\
-  "vshufps $0xb1," #Ciirr "," #Criir "," #tmp   ";\n"\
-  "vaddps  " #Criir "," #tmp "," #Criir"{%k6}"  ";\n"
-
-#define ZEND2f(Criir,Ciirr, tmp) "vsubps  " #Ciirr "," #tmp "," #Criir"{%k7}"  ";\n"
-
-#define ZEND2d(Criir,Ciirr, tmp)	\
-  "vshufpd $0x33," #Ciirr "," #Criir "," #tmp  ";\n"\
-  "vaddpd  " #Criir "," #tmp "," #Criir"{%k6}" ";\n"
-#define ZEND2d(Criir,Ciirr, tmp) "vsubpd  " #Ciirr "," #tmp "," #Criir"{%k7}" ";\n"
-
-// Further opt possible: KNC -- use swizzle operand ; no addsub.
-//                       KNL -- addsub. Saves 6 ops, 12 cycles; KNL cost of loading "1" as only fmaddsub
-//                              no swizzle on KNL.
-#define VTIMESI0f(A,DEST, Z)   VSHUFf(A,DEST)	 
-#define VTIMESI1f(A,DEST, Z)   "vaddps  " #DEST "," #Z "," #DEST"{%k7}"  ";\n"
-#define VTIMESI2f(A,DEST, Z)   "vsubps  " #DEST "," #Z "," #DEST"{%k6}"  ";\n"
-
-#define VTIMESI0d(A,DEST, Z)   VSHUFd(A,DEST)	 
-#define VTIMESI1d(A,DEST, Z)   "vaddpd  " #DEST "," #Z "," #DEST"{%k7}"  ";\n"
-#define VTIMESI2d(A,DEST, Z)   "vsubpd  " #DEST "," #Z "," #DEST"{%k6}"  ";\n"
-
-#define VTIMESMINUSI0f(A,DEST,Z)  VSHUFf(A,DEST)					
-#define VTIMESMINUSI1f(A,DEST,Z)  "vsubps  " #DEST "," #Z "," #DEST"{%k7}"  ";\n"
-#define VTIMESMINUSI2f(A,DEST,Z)  "vaddps  " #DEST "," #Z "," #DEST"{%k6}"  ";\n"
-
-#define VTIMESMINUSI0d(A,DEST,Z)  VSHUFd(A,DEST)					
-#define VTIMESMINUSI1d(A,DEST,Z)  "vsubpd  " #DEST "," #Z "," #DEST"{%k7}"  ";\n"
-#define VTIMESMINUSI2d(A,DEST,Z)  "vaddpd  " #DEST "," #Z "," #DEST"{%k6}"  ";\n"
-
-#define VACCTIMESMINUSI0f(A,ACC,tmp)  VSHUFf(A,tmp)					
-#define VACCTIMESMINUSI1f(A,ACC,tmp)  "vsubps  " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
-#define VACCTIMESMINUSI2f(A,ACC,tmp)  "vaddps  " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
-
-#define VACCTIMESMINUSI0d(A,ACC,tmp)  VSHUFd(A,tmp)					
-#define VACCTIMESMINUSI1d(A,ACC,tmp)  "vsubpd  " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
-#define VACCTIMESMINUSI2d(A,ACC,tmp)  "vaddpd  " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
-
-#define  VACCTIMESI0f(A,ACC,tmp)	VSHUFf(A,tmp)	
-#define  VACCTIMESI1f(A,ACC,tmp)  "vaddps  " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
-#define  VACCTIMESI2f(A,ACC,tmp)  "vsubps  " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
-
-#define  VACCTIMESI0d(A,ACC,tmp)	VSHUFd(A,tmp)	
-#define  VACCTIMESI1d(A,ACC,tmp)  "vaddpd  " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
-#define  VACCTIMESI2d(A,ACC,tmp)  "vsubpd  " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
-
-#define VPERM0f(A,B) "vshuff32x4 " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(1,0,3,2) ";\n"
-#define VPERM1f(A,B) "vshuff32x4 " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(2,3,0,1) ";\n"
-#define VPERM2f(A,B) "vshufps    " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(1,0,3,2) ";\n"
-#define VPERM3f(A,B) "vshufps    " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(2,3,0,1) ";\n"
-
-#define VPERM0d(A,B) "vshuff64x2 " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(1,0,3,2) ";\n"
-#define VPERM1d(A,B) "vshuff64x2 " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(2,3,0,1) ";\n"
-#define VPERM2d(A,B) "vshufpd    " #A "," #B "," "#B" ", " 0x55 ";\n"
-#define VPERM3d(A,B) VMOVd(A,B)
-
+#include <simd/Intel512common.h>
+#ifdef AVX512
+#include <simd/Intel512avx.h>
+//#include <simd/Intel512avxAddsub.h> // Alternate implementation
+#endif
+#ifdef IMCI
+#include <simd/Intel512imci.h>
 #endif

-// Knights Corner specials
-#ifdef ASM_IMCI
-#define VSTOREf(OFF,PTR,SRC)   "vmovnrngoaps " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
-#define VSTOREd(OFF,PTR,SRC)   "vmovnrngoapd " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
-  //#define VSTOREf(OFF,PTR,SRC)   "vmovaps " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
-  //#define VSTOREd(OFF,PTR,SRC)   "vmovapd " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
+//////////////////////////////////////////////////////////////////
+// Macros used to build wilson kernel -- can rationalise and simplify
+// a little as some duplication developed during trying different
+// variants during optimisation. Could cut back to only those used.
+//////////////////////////////////////////////////////////////////

-#define VSHUFf(A,DEST)         "vmovaps " #A "{cdab} , " #DEST  ";\n"
-#define VSHUFd(A,DEST)         "vmovapd " #A "{cdab} , " #DEST  ";\n"
-
-// Memops are useful for optimisation
-#define VSHUFMEMd(OFF,A,DEST)  "vpshufd  $0x4e, " #OFF"*64("#A ")," #DEST  ";\n"
-#define VSHUFMEMf(OFF,A,DEST)  "vpshufd  $0xb1, " #OFF"*64("#A ")," #DEST  ";\n"
-
-#define ZEND1d(Criir,Ciirr, tmp) "vaddpd  " #Criir "{cdab} ," #Criir "," #Criir"{%k6}"  ";\n"
-#define ZEND2d(Criir,Ciirr, tmp) "vsubpd  " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}"  ";\n"
-
-#define ZEND1f(Criir,Ciirr, tmp) "vaddps  " #Criir "{cdab} ," #Criir "," #Criir"{%k6}"  ";\n"
-#define ZEND2f(Criir,Ciirr, tmp) "vsubps  " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}"  ";\n"
-
-// Further opt possible: KNC -- use swizzle operand ; no addsub.
-//                       KNL -- addsub. Saves 6 ops, 12 cycles; KNL cost of loading "1" as only fmaddsub
-//                              no swizzle on KNL.
-#define VTIMESI0f(A,DEST, Z)   
-#define VTIMESI1f(A,DEST, Z)   "vaddps  " #A "{cdab}," #Z "," #DEST"{%k7}"  ";\n"
-#define VTIMESI2f(A,DEST, Z)   "vsubps  " #A "{cdab}," #Z "," #DEST"{%k6}"  ";\n"
-
-#define VTIMESI0d(A,DEST, Z)   
-#define VTIMESI1d(A,DEST, Z)   "vaddpd  " #A "{cdab}," #Z "," #DEST"{%k7}"  ";\n"
-#define VTIMESI2d(A,DEST, Z)   "vsubpd  " #A "{cdab}," #Z "," #DEST"{%k6}"  ";\n"
-
-#define VTIMESMINUSI0f(A,DEST,Z)  
-#define VTIMESMINUSI1f(A,DEST,Z)  "vsubps  " #A "{cdab}," #Z "," #DEST"{%k7}"  ";\n"
-#define VTIMESMINUSI2f(A,DEST,Z)  "vaddps  " #A "{cdab}," #Z "," #DEST"{%k6}"  ";\n"
-
-#define VTIMESMINUSI0d(A,DEST,Z)  
-#define VTIMESMINUSI1d(A,DEST,Z)  "vsubpd  " #A "{cdab}," #Z "," #DEST"{%k7}"  ";\n"
-#define VTIMESMINUSI2d(A,DEST,Z)  "vaddpd  " #A "{cdab}," #Z "," #DEST"{%k6}"  ";\n"
-
-#define  VACCTIMESI0f(A,ACC,tmp)
-#define  VACCTIMESI1f(A,ACC,tmp)  "vaddps  " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
-#define  VACCTIMESI2f(A,ACC,tmp)  "vsubps  " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
-
-#define  VACCTIMESI0d(A,ACC,tmp)
-#define  VACCTIMESI1d(A,ACC,tmp)  "vaddpd  " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
-#define  VACCTIMESI2d(A,ACC,tmp)  "vsubpd  " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
-
-#define VACCTIMESMINUSI0f(A,ACC,tmp)  
-#define VACCTIMESMINUSI1f(A,ACC,tmp)  "vsubps  " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
-#define VACCTIMESMINUSI2f(A,ACC,tmp)  "vaddps  " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
-
-#define VACCTIMESMINUSI0d(A,ACC,tmp)  
-#define VACCTIMESMINUSI1d(A,ACC,tmp)  "vsubpd  " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
-#define VACCTIMESMINUSI2d(A,ACC,tmp)  "vaddpd  " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
-
-//#define ZENDf(Criir,Ciirr, tmp)	
-
-//((1<<6)|(0<<4)|(3<<2)|(2)) == 0100,1110 = 0x4e
-//((2<<6)|(3<<4)|(0<<2)|(1)) == 1011,0001 = 0xb1
-
-#define VPERM0f(A,B) "vpermf32x4  $0x4e," #A "," #B ";\n"
-#define VPERM1f(A,B) "vpermf32x4  $0xb1," #A "," #B ";\n"
-#define VPERM2f(A,B) "vmovaps     " #A "{badc}," #B ";\n"
-#define VPERM3f(A,B) "vmovaps     " #A "{cdab}," #B ";\n"
-
-#define VPERM0d(A,B) "vpermf32x4  $0x4e," #A "," #B ";\n"
-#define VPERM1d(A,B) "vmovapd     " #A "{badc}," #B ";\n"
-#define VPERM2d(A,B) "vmovapd     " #A "{cdab}," #B ";\n"
-#define VPERM3d(A,B) VMOVd(A,B)
-
-#endif
-
-                    
 //  const SiteSpinor * ptr = & in._odata[offset];	
 #define LOAD_CHIMU(PTR)	 LOAD_CHIMUi(PTR)
-#define LOAD_CHI(PTR)	 LOAD_CHIi(PTR) 
+#define LOAD_CHI(PTR)	 LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi );
 #define SAVE_UCHI(PTR)	 SAVE_UCHIi(PTR)
 #define SAVE_CHI(PTR)	 SAVE_CHIi(PTR)
 #define SAVE_RESULT(PTR) SAVE_RESULTi(PTR)

-#define LOAD_CHIMUi(PTR) \
-	   LOAD64(%r8,PTR)	\
-	   __asm__ (\
+#define LOAD_CHIMUi \
 	   LOAD_CHIMU01i	\
 	   LOAD_CHIMU23i	);

@@ -437,16 +145,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 //  const SiteHalfSpinor *ptr = &buf[offset];	

-#define LOAD_CHIi(PTR)				\
-  LOAD64(%r8,PTR)					\
-  __asm__ (						\
+#define LOAD_CHIi				\
  VLOAD(0,%r8,Chi_00)					\
  VLOAD(1,%r8,Chi_01)					\
  VLOAD(2,%r8,Chi_02)					\
  VLOAD(3,%r8,Chi_10)					\
  VLOAD(4,%r8,Chi_11)					\
-  VLOAD(5,%r8,Chi_12)					\
-							);
+  VLOAD(5,%r8,Chi_12)	
+	

 #define SAVE_UCHIi(PTR)				\
  LOAD64(%r8,PTR)				\
@@ -495,7 +201,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 // Enables to lift ALL loads earlier by a few cycles and alleviate OoO pressure if needed.
 // KNL is DUAL issue for FP, and lifting these loads is potentially important.
 // Need detailed profile data to be sure.
-
+#if 0
 #define PREFETCH_U(A) \
  LOAD64(%r8,&U._odata[sU](A)) \
  __asm__ (		       \
@@ -524,7 +230,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  VPREFETCHW(9,%r8)	       \
  VPREFETCHW(10,%r8)	       \
  VPREFETCHW(11,%r8)	       );
-
+#endif
 
 #define MULT_2SPIN_DIR(A) MULT_2SPIN(&U._odata[sU](A))

@@ -538,6 +244,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN_PFZM(&U._odata[sU](A),p)
 #define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN_PFTM(&U._odata[sU](A),p)

+#if 0
 #define MULT_2SPIN_UNOPT(ptr)				\
 	   LOAD64(%r8,ptr)			\
  __asm__ (					\
@@ -583,19 +290,20 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 	   ZEND2(UChi_11,Z3,Chi_10)			\
 	   ZEND2(UChi_02,Z4,Chi_02)			\
 	   ZEND2(UChi_12,Z5,Chi_12)	     );
+#endif

-#define MULT_2SPIN(ptr)	MULT_2SPIN_PF(ptr,ptr,VPREFETCHG);
-
-#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA)
-#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA)
-#define MULT_2SPIN_PFZM(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA)
-#define MULT_2SPIN_PFTM(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA)
-#define MULT_2SPIN_PFTP(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA)
-#define MULT_2SPIN_PFZP(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA)
-#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCH)
-#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA)
+#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr)
+#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr)
+#define MULT_2SPIN_PFZM(ptr,pf) MULT_2SPIN(ptr)
+#define MULT_2SPIN_PFTM(ptr,pf) MULT_2SPIN(ptr)
+#define MULT_2SPIN_PFTP(ptr,pf) MULT_2SPIN(ptr)
+#define MULT_2SPIN_PFZP(ptr,pf) MULT_2SPIN(ptr)
+#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr)
+#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr)

+// MULT_2SPINa(ptr)        MULT_2SPIN_PF(ptr,ptr,VPREFETCHG);

+#if 0
 #define MULT_2SPIN_PF(ptr,pf,VPF)			\
 	   LOAD64(%r8,ptr)			\
 	   LOAD64(%r9,pf)			\
@@ -636,8 +344,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 	   ZEND2(UChi_02,Z4,Chi_02)			\
 	   VPF(11,%r9)						\
 	   ZEND2(UChi_12,Z5,Chi_12)	     );
+#endif

-
+#if 0 
 #define MULT_2SPIN_PFNONE(ptr,pf,VPF)			\
 	   LOAD64(%r8,ptr)			\
 	   LOAD64(%r9,pf)			\
@@ -657,7 +366,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 	   VPF(9,%r9)						\
 	   VPF(10,%r9)						\
 	   VPF(11,%r9)						);
-
+#endif

 // Pretty much Perfectly Pipelined

@@ -667,56 +376,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 //      hspin(0)=fspin(0)+timesI(fspin(3));
 //      hspin(1)=fspin(1)+timesI(fspin(2));
-//define VTIMESIf(A,DEST, Z)		
-// These don't work if DEST==Z. FIXME.	
-#define XP_PROJ __asm__ (		\
-			 VACCTIMESI(Chimu_30,Chi_00,Z0)	\
-			 VACCTIMESI(Chimu_31,Chi_01,Z1)	\
-			 VACCTIMESI(Chimu_32,Chi_02,Z2)	\
-			 VACCTIMESI(Chimu_20,Chi_10,Z3)	\
-			 VACCTIMESI(Chimu_21,Chi_11,Z4)	\
-			 VACCTIMESI(Chimu_22,Chi_12,Z5)	);
-
 #define XP_PROJMEM(PTR) \
  LOAD64(%r8,PTR)							\
  __asm__ (								\
+	   LOAD_CHIi						\
 	   SHUF_CHIMU23i						\
-	   VACCTIMESI1MEM(Chimu_30,Chi_00,0,%r8)			\
-	   VACCTIMESI1MEM(Chimu_31,Chi_01,1,%r8)			\
-	   VACCTIMESI1MEM(Chimu_32,Chi_02,2,%r8)			\
-	   VACCTIMESI1MEM(Chimu_20,Chi_10,3,%r8)			\
-	   VACCTIMESI1MEM(Chimu_21,Chi_11,4,%r8)			\
-	   VACCTIMESI1MEM(Chimu_22,Chi_12,5,%r8)			\
-	   VACCTIMESI2MEM(Chimu_30,Chi_00,0,%r8)			\
-	   VACCTIMESI2MEM(Chimu_31,Chi_01,1,%r8)			\
-	   VACCTIMESI2MEM(Chimu_32,Chi_02,2,%r8)			\
-	   VACCTIMESI2MEM(Chimu_20,Chi_10,3,%r8)			\
-	   VACCTIMESI2MEM(Chimu_21,Chi_11,4,%r8)			\
-	   VACCTIMESI2MEM(Chimu_22,Chi_12,5,%r8)			);
-
-
-#define YP_PROJ __asm__ ( 			\
-  VSUB(Chimu_30,Chimu_00,Chi_00)\
-  VSUB(Chimu_31,Chimu_01,Chi_01)\
-  VSUB(Chimu_32,Chimu_02,Chi_02)\
-  VADD(Chimu_10,Chimu_20,Chi_10)\
-  VADD(Chimu_11,Chimu_21,Chi_11)\
-  VADD(Chimu_12,Chimu_22,Chi_12) );
-
-#define EVICT_SPINOR(reg) \
-  VEVICT(0,reg)		  \
-  VEVICT(1,reg)		  \
-  VEVICT(2,reg)		  \
-  VEVICT(3,reg)		  \
-  VEVICT(4,reg)		  \
-  VEVICT(5,reg)		  \
-  VEVICT(6,reg)		  \
-  VEVICT(7,reg)		  \
-  VEVICT(8,reg)		  \
-  VEVICT(9,reg)		  \
-  VEVICT(9,reg)		  \
-  VEVICT(10,reg)	  \
-  VEVICT(11,reg)	  
+	   VACCTIMESI1(Chi_00,Chi_00,Chimu_30)		\
+	   VACCTIMESI1(Chi_01,Chi_01,Chimu_31)		\
+	   VACCTIMESI1(Chi_02,Chi_02,Chimu_32)		\
+	   VACCTIMESI1(Chi_10,Chi_10,Chimu_20)		\
+	   VACCTIMESI1(Chi_11,Chi_11,Chimu_21)		\
+	   VACCTIMESI1(Chi_12,Chi_12,Chimu_22)		\
+	   VACCTIMESI2(Chi_00,Chi_00,Chimu_30)		\
+	   VACCTIMESI2(Chi_01,Chi_01,Chimu_31)		\
+	   VACCTIMESI2(Chi_02,Chi_02,Chimu_32)		\
+	   VACCTIMESI2(Chi_10,Chi_10,Chimu_20)		\
+	   VACCTIMESI2(Chi_11,Chi_11,Chimu_21)		\
+	   VACCTIMESI2(Chi_12,Chi_12,Chimu_22)		);


 #define YP_PROJMEM(ptr) \
@@ -729,43 +405,24 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  VADDMEM(6,%r8,Chimu_10,Chi_10)		\
  VADDMEM(7,%r8,Chimu_11,Chi_11)		\
  VADDMEM(8,%r8,Chimu_12,Chi_12)		);
-  //  EVICT_SPINOR(%r8) 			);
-  
-#define ZP_PROJ __asm__ ( \
-			 VACCTIMESI(Chimu_20,Chi_00,Z0)	\
-			 VACCTIMESI(Chimu_21,Chi_01,Z1)	\
-			 VACCTIMESI(Chimu_22,Chi_02,Z2)	\
-			 VACCTIMESMINUSI(Chimu_30,Chi_10,Z3)	\
-			 VACCTIMESMINUSI(Chimu_31,Chi_11,Z4)	\
-			 VACCTIMESMINUSI(Chimu_32,Chi_12,Z5) );

 #define ZP_PROJMEM(PTR) \
  LOAD64(%r8,PTR)							\
  __asm__ (								\
+	   LOAD_CHIi						\
 	   SHUF_CHIMU23i						\
-	   VACCTIMESI1MEM(Chimu_20,Chi_00,0,%r8)			\
-	   VACCTIMESI1MEM(Chimu_21,Chi_01,1,%r8)			\
-	   VACCTIMESI1MEM(Chimu_22,Chi_02,2,%r8)			\
-	   VACCTIMESMINUSI1MEM(Chimu_30,Chi_10,3,%r8)			\
-	   VACCTIMESMINUSI1MEM(Chimu_31,Chi_11,4,%r8)			\
-	   VACCTIMESMINUSI1MEM(Chimu_32,Chi_12,5,%r8)			\
-	   VACCTIMESI2MEM(Chimu_20,Chi_00,0,%r8)			\
-	   VACCTIMESI2MEM(Chimu_21,Chi_01,1,%r8)			\
-	   VACCTIMESI2MEM(Chimu_22,Chi_02,2,%r8)			\
-	   VACCTIMESMINUSI2MEM(Chimu_30,Chi_10,3,%r8)			\
-	   VACCTIMESMINUSI2MEM(Chimu_31,Chi_11,4,%r8)			\
-	   VACCTIMESMINUSI2MEM(Chimu_32,Chi_12,5,%r8)			\
-	   EVICT_SPINOR(%r8) 			);
-
-
-
-#define TP_PROJ __asm__ (			\
-			 VADD(Chimu_00,Chimu_20,Chi_00)	\
-			 VADD(Chimu_01,Chimu_21,Chi_01)	\
-			 VADD(Chimu_02,Chimu_22,Chi_02)	\
-			 VADD(Chimu_10,Chimu_30,Chi_10)	\
-			 VADD(Chimu_11,Chimu_31,Chi_11)	\
-			 VADD(Chimu_12,Chimu_32,Chi_12) );
+	   VACCTIMESI1(Chi_00,Chi_00,Chimu_20)				\
+	   VACCTIMESI1(Chi_01,Chi_01,Chimu_21)		   	        \
+	   VACCTIMESI1(Chi_02,Chi_02,Chimu_22)				\
+	   VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_30)			\
+	   VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_31)			\
+	   VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_32)			\
+	   VACCTIMESI2(Chi_00,Chi_00,Chimu_20)				\
+	   VACCTIMESI2(Chi_01,Chi_01,Chimu_21)				\
+	   VACCTIMESI2(Chi_02,Chi_02,Chimu_22)				\
+	   VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_30)		\
+	   VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_31)		\
+	   VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_32)	);


 #define TP_PROJMEM(ptr)				\
@@ -777,44 +434,28 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 	   VADDMEM(8,%r8,Chimu_02,Chi_02)	\
 	   VADDMEM(9,%r8,Chimu_10,Chi_10)	\
 	   VADDMEM(10,%r8,Chimu_11,Chi_11)	\
-	   VADDMEM(11,%r8,Chimu_12,Chi_12)	\
-	   EVICT_SPINOR(%r8) 			);
-
+	   VADDMEM(11,%r8,Chimu_12,Chi_12)	);

 //      hspin(0)=fspin(0)-timesI(fspin(3))
 //      hspin(1)=fspin(1)-timesI(fspin(2))
-#define XM_PROJ __asm__ ( \
- VACCTIMESMINUSI(Chimu_30,Chi_00,Z0)		\
- VACCTIMESMINUSI(Chimu_31,Chi_01,Z1)		\
- VACCTIMESMINUSI(Chimu_32,Chi_02,Z2)		\
- VACCTIMESMINUSI(Chimu_20,Chi_10,Z3)		\
- VACCTIMESMINUSI(Chimu_21,Chi_11,Z4)		\
- VACCTIMESMINUSI(Chimu_22,Chi_12,Z5)		);

 #define XM_PROJMEM(PTR) \
-  LOAD64(%r8,PTR)							\
+  LOAD64(%r8,PTR)\
  __asm__ (								\
 	   SHUF_CHIMU23i						\
-	   VACCTIMESMINUSI1MEM(Chimu_30,Chi_00,0,%r8)			\
-	   VACCTIMESMINUSI1MEM(Chimu_31,Chi_01,1,%r8)			\
-	   VACCTIMESMINUSI1MEM(Chimu_32,Chi_02,2,%r8)			\
-	   VACCTIMESMINUSI1MEM(Chimu_20,Chi_10,3,%r8)			\
-	   VACCTIMESMINUSI1MEM(Chimu_21,Chi_11,4,%r8)			\
-	   VACCTIMESMINUSI1MEM(Chimu_22,Chi_12,5,%r8)			\
-	   VACCTIMESMINUSI2MEM(Chimu_30,Chi_00,0,%r8)			\
-	   VACCTIMESMINUSI2MEM(Chimu_31,Chi_01,1,%r8)			\
-	   VACCTIMESMINUSI2MEM(Chimu_32,Chi_02,2,%r8)			\
-	   VACCTIMESMINUSI2MEM(Chimu_20,Chi_10,3,%r8)			\
-	   VACCTIMESMINUSI2MEM(Chimu_21,Chi_11,4,%r8)			\
-	   VACCTIMESMINUSI2MEM(Chimu_22,Chi_12,5,%r8)			);
-
-#define YM_PROJ __asm__ ( \
-  VADD(Chimu_00,Chimu_30,Chi_00)\
-  VADD(Chimu_01,Chimu_31,Chi_01)\
-  VADD(Chimu_02,Chimu_32,Chi_02)\
-  VSUB(Chimu_20,Chimu_10,Chi_10)\
-  VSUB(Chimu_21,Chimu_11,Chi_11)\
-  VSUB(Chimu_22,Chimu_12,Chi_12) );
+	   LOAD_CHIi \
+	   VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30)\
+	   VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31)\
+	   VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32)\
+	   VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_20)\
+	   VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_21)\
+	   VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_22)\
+	   VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_30)\
+	   VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_31)\
+	   VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_32)\
+	   VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_20)\
+	   VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_21)\
+	   VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_22) );

 #define YM_PROJMEM(ptr)				\
  LOAD64(%r8,ptr)				\
@@ -825,45 +466,25 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  VADDMEM(11,%r8,Chimu_02,Chi_02)		\
  VSUBMEM(6,%r8,Chimu_10,Chi_10)		\
  VSUBMEM(7,%r8,Chimu_11,Chi_11)		\
-  VSUBMEM(8,%r8,Chimu_12,Chi_12)			\
-  EVICT_SPINOR(%r8) 			);
-
-
-#define ZM_PROJ __asm__ ( \
-  VACCTIMESMINUSI(Chimu_20,Chi_00,Z0)\
-  VACCTIMESMINUSI(Chimu_21,Chi_01,Z1)\
-  VACCTIMESMINUSI(Chimu_22,Chi_02,Z2)\
-  VACCTIMESI(Chimu_30,Chi_10,Z3)\
-  VACCTIMESI(Chimu_31,Chi_11,Z4)\
-  VACCTIMESI(Chimu_32,Chi_12,Z5));
+  VSUBMEM(8,%r8,Chimu_12,Chi_12)			);

 #define ZM_PROJMEM(PTR) \
  LOAD64(%r8,PTR)							\
  __asm__ (								\
 	   SHUF_CHIMU23i						\
-	   VACCTIMESMINUSI1MEM(Chimu_20,Chi_00,0,%r8)			\
-	   VACCTIMESMINUSI1MEM(Chimu_21,Chi_01,1,%r8)			\
-	   VACCTIMESMINUSI1MEM(Chimu_22,Chi_02,2,%r8)			\
-	   VACCTIMESI1MEM(Chimu_30,Chi_10,3,%r8)			\
-	   VACCTIMESI1MEM(Chimu_31,Chi_11,4,%r8)			\
-	   VACCTIMESI1MEM(Chimu_32,Chi_12,5,%r8)			\
-	   VACCTIMESMINUSI2MEM(Chimu_20,Chi_00,0,%r8)			\
-	   VACCTIMESMINUSI2MEM(Chimu_21,Chi_01,1,%r8)			\
-	   VACCTIMESMINUSI2MEM(Chimu_22,Chi_02,2,%r8)			\
-	   VACCTIMESI2MEM(Chimu_30,Chi_10,3,%r8)			\
-	   VACCTIMESI2MEM(Chimu_31,Chi_11,4,%r8)			\
-	   VACCTIMESI2MEM(Chimu_32,Chi_12,5,%r8)			\
-	   EVICT_SPINOR(%r8) 			);
-
-
-#define TM_PROJ __asm__ ( \
-  VSUB(Chimu_20,Chimu_00,Chi_00)\
-  VSUB(Chimu_21,Chimu_01,Chi_01)\
-  VSUB(Chimu_22,Chimu_02,Chi_02)\
-  VSUB(Chimu_30,Chimu_10,Chi_10)\
-  VSUB(Chimu_31,Chimu_11,Chi_11)\
-  VSUB(Chimu_32,Chimu_12,Chi_12) );
-
+           LOAD_CHIi \
+	   VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20)\
+	   VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21)\
+	   VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22)\
+	   VACCTIMESI1(Chi_10,Chi_10,Chimu_30)\
+	   VACCTIMESI1(Chi_11,Chi_11,Chimu_31)\
+	   VACCTIMESI1(Chi_12,Chi_12,Chimu_32)\
+	   VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_20)\
+	   VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_21)\
+	   VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_22)\
+	   VACCTIMESI2(Chi_10,Chi_10,Chimu_30)\
+	   VACCTIMESI2(Chi_11,Chi_11,Chimu_31)\
+	   VACCTIMESI2(Chi_12,Chi_12,Chimu_32) );

 #define TM_PROJMEM(ptr)				\
  LOAD64(%r8,ptr)				\
@@ -874,8 +495,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  VSUBMEM(8,%r8,Chimu_02,Chi_02)		\
  VSUBMEM(9,%r8,Chimu_10,Chi_10)		\
  VSUBMEM(10,%r8,Chimu_11,Chi_11)		\
-  VSUBMEM(11,%r8,Chimu_12,Chi_12)		\
-  EVICT_SPINOR(%r8) );
+  VSUBMEM(11,%r8,Chimu_12,Chi_12)		);

 //      fspin(0)=hspin(0)
 //      fspin(1)=hspin(1)
@@ -1102,7 +722,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  VSUB(UChi_11,result_31,result_31)\
  VSUB(UChi_12,result_32,result_32) );

-#define PREFETCH_CHIMU(A) 
+//define PREFETCH_CHIMU(A) 

 #define PERMUTE_DIR0 __asm__ ( 	\
  VPERM0(Chi_00,Chi_00)	\
@@ -1136,4 +756,65 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  VPERM3(Chi_11,Chi_11)	\
  VPERM3(Chi_12,Chi_12) );

+#define MULT_ADDSUB_2SPIN1(ptr)  \
+           LOAD64(%r8,ptr)                      
+/*
+ * __asm__ (                                     \
+);
+  VMUL(Z0,%zmm2,%zmm3) \
+*/
+#define MULT_ADDSUB_2SPIN(ptr)  \
+           LOAD64(%r8,ptr)                      \
+  __asm__ (                                     \
+           VMOVIDUP(0,%r8,Z0 ) \
+           VMOVIDUP(3,%r8,Z1 )\
+           VMOVIDUP(6,%r8,Z2 )\
+           VSHUF(Chi_00,T1)    \
+           VSHUF(Chi_10,T2)    \
+                                \
+           VMUL(Z0,T1,UChi_00)            VMOVRDUP(0,%r8,Z3 ) \
+           VMUL(Z0,T2,UChi_10)            VMOVRDUP(3,%r8,Z4 ) \
+           VMUL(Z1,T1,UChi_01)            VMOVRDUP(6,%r8,Z5 ) \
+           VMUL(Z1,T2,UChi_11)            VMOVIDUP(1,%r8,Z0 ) \
+           VMUL(Z2,T1,UChi_02)            VMOVIDUP(4,%r8,Z1 ) \
+           VMUL(Z2,T2,UChi_12)            VMOVIDUP(7,%r8,Z2 ) \
+                                \
+           VMADDSUB(Z3,Chi_00,UChi_00)    VSHUF(Chi_01,T1)    \
+           VMADDSUB(Z3,Chi_10,UChi_10)    VSHUF(Chi_11,T2)    \
+           VMADDSUB(Z4,Chi_00,UChi_01)    VMOVRDUP(1,%r8,Z3 ) \
+           VMADDSUB(Z4,Chi_10,UChi_11)\
+           VMADDSUB(Z5,Chi_00,UChi_02)    VMOVRDUP(4,%r8,Z4 ) \
+           VMADDSUB(Z5,Chi_10,UChi_12)\
+                                       \
+           VMADDSUB(Z0,T1,UChi_00)        VMOVRDUP(7,%r8,Z5 ) \
+           VMADDSUB(Z0,T2,UChi_10)\
+           VMADDSUB(Z1,T1,UChi_01)        VMOVIDUP(2,%r8,Z0 ) \
+           VMADDSUB(Z1,T2,UChi_11)\
+           VMADDSUB(Z2,T1,UChi_02)        VMOVIDUP(5,%r8,Z1 ) \
+           VMADDSUB(Z2,T2,UChi_12)        VMOVIDUP(8,%r8,Z2 ) \
+                                                                \
+           VMADDSUB(Z3,Chi_01,UChi_00)    VSHUF(Chi_02,T1)    \
+           VMADDSUB(Z3,Chi_11,UChi_10)    VSHUF(Chi_12,T2)    \
+           VMADDSUB(Z4,Chi_01,UChi_01)    VMOVRDUP(2,%r8,Z3 ) \
+           VMADDSUB(Z4,Chi_11,UChi_11)\
+           VMADDSUB(Z5,Chi_01,UChi_02)    VMOVRDUP(5,%r8,Z4 ) \
+           VMADDSUB(Z5,Chi_11,UChi_12)\
+                                \
+           VMADDSUB(Z0,T1,UChi_00)        VMOVRDUP(8,%r8,Z5 ) \
+           VMADDSUB(Z0,T2,UChi_10)\
+           VMADDSUB(Z1,T1,UChi_01)\
+           VMADDSUB(Z1,T2,UChi_11)\
+           VMADDSUB(Z2,T1,UChi_02)\
+           VMADDSUB(Z2,T2,UChi_12)\
+                                   \
+           VMADDSUB(Z3,Chi_02,UChi_00)\
+           VMADDSUB(Z3,Chi_12,UChi_10)\
+           VMADDSUB(Z4,Chi_02,UChi_01)\
+           VMADDSUB(Z4,Chi_12,UChi_11)\
+           VMADDSUB(Z5,Chi_02,UChi_02)\
+           VMADDSUB(Z5,Chi_12,UChi_12)\
+                                                );
+
+#define MULT_2SPIN(ptr) MULT_ADDSUB_2SPIN(ptr)
+
 #endif
--- a/tests/Make.inc
+++ b/tests/Make.inc
@@ -1,5 +1,5 @@

-bin_PROGRAMS = Test_GaugeAction Test_RectPlaq Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_gamma Test_gp_rect_force Test_gparity Test_gpdwf_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd 
+bin_PROGRAMS += Test_GaugeAction Test_RectPlaq Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_gamma Test_gp_rect_force Test_gparity Test_gpdwf_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd 


 Test_GaugeAction_SOURCES=Test_GaugeAction.cc
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -20,6 +20,8 @@ endif

 if BUILD_ZMM
  bin_PROGRAMS=Test_zmm
+else
+  bin_PROGRAMS=
 endif

 include Make.inc
--- a/tests/Test_dwf_hdcr.cc
+++ b/tests/Test_dwf_hdcr.cc
@@ -42,6 +42,8 @@ public:
 			  int, domaindecompose,
 			  int, domainsize,
 			  int, order,
+			  int, Ls,
+			  double, mq,
 			  double, lo,
 			  double, hi,
 			  int, steps);
@@ -327,7 +329,7 @@ public:
    CoarseVector Ctmp(_CoarseOperator.Grid());
    CoarseVector Csol(_CoarseOperator.Grid()); Csol=zero;

-    ConjugateGradient<CoarseVector>  CG(1.0e-2,100000);
+    ConjugateGradient<CoarseVector>  CG(3.0e-3,100000);
    //    ConjugateGradient<FineField>    fCG(3.0e-2,1000);

    HermitianLinearOperator<CoarseOperator,CoarseVector>  HermOp(_CoarseOperator);
@@ -474,7 +476,7 @@ int main (int argc, char ** argv)
  read(RD,"params",params);
  std::cout<<"Params: Order "<<params.order<<"["<<params.lo<<","<<params.hi<<"]"<< " steps "<<params.steps<<std::endl;

-  const int Ls=16;
+  const int Ls=params.Ls;

  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@@ -536,7 +538,7 @@ int main (int argc, char ** argv)
  //  SU3::HotConfiguration(RNG4,Umu);
  //  Umu=zero;

-  RealD mass=0.001;
+  RealD mass=params.mq;
  RealD M5=1.8;

  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
@@ -558,7 +560,8 @@ int main (int argc, char ** argv)
  assert ( (nbasis & 0x1)==0);
  int nb=nbasis/2;
  std::cout<<GridLogMessage << " nbasis/2 = "<<nb<<std::endl;
-  Aggregates.CreateSubspace(RNG5,HermDefOp,nb);
+  //  Aggregates.CreateSubspace(RNG5,HermDefOp,nb);
+  Aggregates.CreateSubspaceLanczos(RNG5,HermDefOp,nb);
  for(int n=0;n<nb;n++){
    G5R5(Aggregates.subspace[n+nb],Aggregates.subspace[n]);
    std::cout<<GridLogMessage<<n<<" subspace "<<norm2(Aggregates.subspace[n+nb])<<" "<<norm2(Aggregates.subspace[n]) <<std::endl;
@@ -594,7 +597,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  MdagMLinearOperator<CoarseOperator,CoarseVector> PosdefLdop(LDOp);
  ConjugateGradient<CoarseVector> CG(1.0e-6,100000);
-  CG(PosdefLdop,c_src,c_res);
+  //  CG(PosdefLdop,c_src,c_res);

  //  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  //  std::cout<<GridLogMessage << "Solving indef-MCR on coarse space "<< std::endl;
@@ -619,17 +622,17 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << "Testing smoother efficacy"<< std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
-  Precon.SmootherTest(src);
+  //  Precon.SmootherTest(src);

  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << "Testing DD smoother efficacy"<< std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
-  PreconDD.SmootherTest(src);
+  //  PreconDD.SmootherTest(src);

  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << "Testing SAP smoother efficacy"<< std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
-  PreconDD.SAP(src,result);
+  //  PreconDD.SAP(src,result);

  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << "Unprec CG "<< std::endl;
@@ -657,18 +660,18 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << "Building a two level DDPGCR "<< std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
-  PrecGeneralisedConjugateResidual<LatticeFermion> PGCRDD(1.0e-8,100000,PreconDD,8,128);
-  result=zero;
-  std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
-  PGCRDD(HermIndefOp,src,result);
+  //  PrecGeneralisedConjugateResidual<LatticeFermion> PGCRDD(1.0e-8,100000,PreconDD,8,128);
+  //  result=zero;
+  //  std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
+  //  PGCRDD(HermIndefOp,src,result);

  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << "Building a two level PGCR "<< std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
-  //  PrecGeneralisedConjugateResidual<LatticeFermion> PGCR(1.0e-8,100000,Precon,8,128);
-  //  std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
-  //  result=zero;
-  //  PGCR(HermIndefOp,src,result);
+  PrecGeneralisedConjugateResidual<LatticeFermion> PGCR(1.0e-8,100000,Precon,8,8);
+  std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
+  result=zero;
+  PGCR(HermIndefOp,src,result);

  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << "Red Black Prec CG "<< std::endl;
--- a/tests/Test_simd.cc
+++ b/tests/Test_simd.cc
@@ -145,7 +145,7 @@ void Tester(const functor &func)

  int ok=0;
  for(int i=0;i<Nsimd;i++){
-    if ( abs(reference[i]-result[i])>0){
+    if ( abs(reference[i]-result[i])>1.0e-7){
      std::cout<<GridLogMessage<< "*****" << std::endl;
      std::cout<<GridLogMessage<< "["<<i<<"] "<< abs(reference[i]-result[i]) << " " <<reference[i]<< " " << result[i]<<std::endl;
      ok++;
--- a/tests/Test_zmm.cc
+++ b/tests/Test_zmm.cc
@@ -27,15 +27,20 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    /*  END LEGAL */
 #include <Grid.h>
 #include <PerfCount.h>
-#include <simd/Avx512Asm.h>
+#include <simd/Intel512wilson.h>


 using namespace Grid;
 using namespace Grid::QCD;
+
+void ZmulF(void *ptr1,void *ptr2,void *ptr3);
+void Zmul(void *ptr1,void *ptr2,void *ptr3);
 void WilsonDslashAvx512(void *ptr1,void *ptr2,void *ptr3);
 void WilsonDslashAvx512F(void *ptr1,void *ptr2,void *ptr3);
 void TimesIAvx512F(void *ptr1,void *ptr3);
 void TimesIAvx512(void *ptr1,void *ptr3);
+void TimesMinusIAvx512F(void *ptr1,void *ptr3);
+void TimesMinusIAvx512(void *ptr1,void *ptr3);



@@ -63,50 +68,106 @@ int main(int argc,char **argv)

  vColourMatrixD mat;
  vHalfSpinColourVectorD vec;
+  vHalfSpinColourVectorD vec1;
+  vHalfSpinColourVectorD vec2;
+  vHalfSpinColourVectorD vec3;
+ 
  vHalfSpinColourVectorD matvec;
  vHalfSpinColourVectorD ref;
  vComplexD err;

+  random(sRNG,vec1); 
+  vec1 = std::complex<double>(0.1,3.0);
+  random(sRNG,vec2);
+  vec2=2.0;
+  random(sRNG,vec3);
+
+  //std::cout << "Zmul  vec1"<<vec1<<" &vec1 "<<& vec1<<std::endl;
+  //std::cout << "Zmul  vec2"<<vec2<<" &vec2 "<<& vec2<<std::endl;
+  //std::cout << "Zmul  vec3"<<vec3<<" &vec3 "<<& vec3<<std::endl;
+  for(int sp=0;sp<2;sp++){
+  for(int co=0;co<3;co++){
+  ref()(sp)(co) = vec1()(sp)(co)*vec2()(sp)(co);
+  }}
+
+  Zmul((void *)&vec1,(void *)&vec2,(void *)&vec3);
+  //std::cout << "Zmul  vec3"<<vec3<<" &vec3 "<<& vec3<<std::endl;
+  //std::cout << "Zmul \n\t ref "<<ref<<"\n\t vec3"<<vec3 <<std::endl;
+  ref = ref - vec3;
+  err = TensorRemove(innerProduct(ref,ref));
+  std::cout <<"Zmul diff   "<< Reduce(err)<<std::endl;
+
  random(sRNG,mat);
+  mat = zero;
+  mat()()(0,0) = 1.0;
  random(sRNG,vec);

  ref = mat*vec;
  
  WilsonDslashAvx512((void *)&vec, (void *)&mat,(void *)&matvec);

+  //std::cout << ref   <<std::endl;
+  //std::cout << matvec<<std::endl;
  ref = ref - matvec;
  err = TensorRemove(innerProduct(ref,ref));
  std::cout <<"Double SU3 x 2spin diff   "<< Reduce(err)<<std::endl;
-
  vColourMatrixF matF;
+  vHalfSpinColourVectorF vec1F;
+  vHalfSpinColourVectorF vec2F;
+  vHalfSpinColourVectorF vec3F;
  vHalfSpinColourVectorF vecF;
  vHalfSpinColourVectorF matvecF;
  vHalfSpinColourVectorF refF;
  vComplexF errF;

  random(sRNG,matF);
+  matF = zero;
+  matF()()(0,0)=1.0;
  random(sRNG,vecF);

  refF = matF*vecF;

  WilsonDslashAvx512F((void *)&vecF, (void *)&matF,(void *)&matvecF);
-  
+  //std::cout << refF   <<std::endl;
+  //std::cout << matvecF<<std::endl;
+ 
  refF = refF-matvecF;
  errF = TensorRemove(innerProduct(refF,refF));
  std::cout <<"Single SU3 x 2spin diff   "<< Reduce(errF)<<std::endl;

  TimesIAvx512F((void *)&vecF,(void *)&matvecF);
+  //std::cout << timesI(vecF)<<std::endl;
+  //std::cout << matvecF<<std::endl;
  refF = timesI(vecF)-matvecF;
  errF = TensorRemove(innerProduct(refF,refF));
  std::cout <<" timesI single diff  "<< Reduce(errF)<<std::endl;

  TimesIAvx512((void *)&vec,(void *)&matvec);
-  
+  //std::cout << timesI(vec)<<std::endl;
+  //std::cout << matvec<<std::endl;
+ 
  ref = timesI(vec)-matvec;
  err = TensorRemove(innerProduct(ref,ref));
  std::cout <<" timesI double diff  "<< Reduce(err)<<std::endl;

+  TimesMinusIAvx512F((void *)&vecF,(void *)&matvecF);
+  //std::cout << timesMinusI(vecF)<<std::endl;
+  //std::cout << matvecF<<std::endl;
+  refF = timesMinusI(vecF)-matvecF;
+  errF = TensorRemove(innerProduct(refF,refF));
+  std::cout <<" timesMinusI single diff  "<< Reduce(errF)<<std::endl;
+
+  TimesMinusIAvx512((void *)&vec,(void *)&matvec);
+  //std::cout << timesMinusI(vec)<<std::endl;
+  //std::cout << matvec<<std::endl;
+
+  ref = timesMinusI(vec)-matvec;
+  err = TensorRemove(innerProduct(ref,ref));
+  std::cout <<" timesMinusI double diff  "<< Reduce(err)<<std::endl;
+
+
  LatticeFermion src (FGrid);
+  LatticeFermion tmp (FGrid);
  LatticeFermion srce(FrbGrid);

  LatticeFermion resulto(FrbGrid); resulto=zero;
@@ -114,13 +175,14 @@ int main(int argc,char **argv)
  LatticeFermion diff(FrbGrid); 
  LatticeGaugeField Umu(UGrid);

-#if 1
+
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
  random(RNG5,src);
+#if 1
  random(RNG4,Umu);
 #else
-  int mmu=3;
+  int mmu=2;
  std::vector<LatticeColourMatrix> U(4,UGrid);
  for(int mu=0;mu<Nd;mu++){
    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
@@ -157,7 +219,7 @@ int main(int argc,char **argv)
  }
  t1=usecond();

-
+#if 1
  for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
    Dw.DhopOE(srce,resulta,0);
    PerformanceCounter Counter(i);
@@ -166,50 +228,119 @@ int main(int argc,char **argv)
    Counter.Stop();
    Counter.Report();
  }
-  resulta = (-0.5) * resulta;
+#endif
+  //resulta = (-0.5) * resulta;

  std::cout<<GridLogMessage << "Called Asm Dw"<<std::endl;
  std::cout<<GridLogMessage << "norm result "<< norm2(resulta)<<std::endl;
  std::cout<<GridLogMessage << "mflop/s =   "<< flops*ncall/(t1-t0)<<std::endl;
  diff = resulto-resulta;
  std::cout<<GridLogMessage << "diff "<< norm2(diff)<<std::endl;
-
+  std::cout<<std::endl;
+#if 0
+  std::cout<<"=========== result Grid ============="<<std::endl;
+  std::cout<<std::endl;
+  tmp = zero;
+  setCheckerboard(tmp,resulto);
+  std::cout<<tmp<<std::endl;
+  std::cout<<std::endl;
+  std::cout<<"=========== result ASM ============="<<std::endl;
+  std::cout<<std::endl;
+  tmp = zero;
+  setCheckerboard(tmp,resulta);
+  std::cout<<tmp<<std::endl;
+#endif
 }

-#undef VLOAD
-#undef VSTORE
-#undef VMUL
-#undef VMADD
-#undef ZEND1
-#undef ZEND2
-#undef ZLOAD
-#undef ZMUL
-#undef ZMADD
-
-#define VZERO(A) VZEROd(A)
-#define VTIMESI(A,B,C) VTIMESId(A,B,C)
-#define VTIMESMINUSI(A,B,C) VTIMESMINUSId(A,B,C)
-
-#define VLOAD(OFF,PTR,DEST)       VLOADd(OFF,PTR,DEST)
-#define VSTORE(OFF,PTR,SRC)       VSTOREd(OFF,PTR,SRC)
-#define VMUL(Uri,Uir,Chi,UChi,Z)  VMULd(Uri,Uir,Chi,UChi,Z)
-#define VMADD(Uri,Uir,Chi,UChi,Z) VMADDd(Uri,Uir,Chi,UChi,Z)
-#define ZEND1(A,B,C)               ZEND1d(A,B,C)
-#define ZEND2(A,B,C)               ZEND2d(A,B,C)
-#define ZLOAD(A,B,C,D)            ZLOADd(A,B,C,D)
-#define ZMUL(A,B,C,D,E)           ZMULd(A,B,C,D,E)
-#define ZMADD(A,B,C,D,E)          ZMADDd(A,B,C,D,E)
-#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
-#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
+#include <simd/Intel512double.h>

 #define zz Z0

+
+void Zmul(void *ptr1,void *ptr2,void *ptr3)
+{
+  __asm__ ("mov     $0xAAAA, %%eax "  : : :"%eax");
+  __asm__ ("kmovw    %%eax, %%k6 " : : :);
+  __asm__ ("mov     $0x5555, %%eax "  : : :"%eax");
+  __asm__ ("kmovw    %%eax, %%k7 " : : :);
+
+#define CC result_00
+  LOAD64(%r9,ptr1);
+  LOAD64(%r8,ptr2);
+  LOAD64(%r10,ptr3)
+  __asm__ (
+  VLOAD(0,%r8,CC)
+  ZLOAD(0,%r9,Chi_00,Z0) 
+  ZMUL(Chi_00,Z0,CC,UChi_00,Z1)
+  //VSTORE(0,%r10,UChi_00)
+  //VSTORE(1,%r10,Z1)
+  ZEND1(UChi_00,Z1,Z0)
+  //VSTORE(2,%r10,UChi_00)
+  ZEND2(UChi_00,Z1,Z0)
+  //VSTORE(3,%r10,UChi_00)
+  VSTORE(0,%r10,UChi_00)
+  VLOAD(1,%r8,CC)
+  ZLOAD(1,%r9,Chi_01,Z0) 
+  ZMUL(Chi_01,Z0,CC,UChi_01,Z1)
+  ZEND1(UChi_01,Z1,Z0)
+  ZEND2(UChi_01,Z1,Z0)
+  VSTORE(1,%r10,UChi_01)
+  VLOAD(2,%r8,CC)
+  ZLOAD(2,%r9,Chi_02,Z0) 
+  ZMUL(Chi_02,Z0,CC,UChi_02,Z1)
+  ZEND1(UChi_02,Z1,Z0)
+  ZEND2(UChi_02,Z1,Z0)
+  VSTORE(2,%r10,UChi_02)
+  VLOAD(3,%r8,CC)
+  ZLOAD(3,%r9,Chi_10,Z0) 
+  ZMUL(Chi_10,Z0,CC,UChi_10,Z1)
+  ZEND1(UChi_10,Z1,Z0)
+  ZEND2(UChi_10,Z1,Z0)
+  VSTORE(3,%r10,UChi_10)
+  VLOAD(4,%r8,CC)
+  ZLOAD(4,%r9,Chi_11,Z0) 
+  ZMUL(Chi_11,Z0,CC,UChi_11,Z1)
+  ZEND1(UChi_11,Z1,Z0)
+  ZEND2(UChi_11,Z1,Z0)
+  VSTORE(4,%r10,UChi_11)
+  VLOAD(5,%r8,CC)
+  ZLOAD(5,%r9,Chi_12,Z0) 
+  ZMUL(Chi_12,Z0,CC,UChi_12,Z1)
+  ZEND1(UChi_12,Z1,Z0)
+  ZEND2(UChi_12,Z1,Z0)
+  VSTORE(5,%r10,UChi_12)
+  );
+}
+void TimesMinusIAvx512(void *ptr1,void *ptr3)
+{
+  __asm__ ("mov     $0xAAAA, %%eax "  : : :"%eax");
+  __asm__ ("kmovw    %%eax, %%k6 " : : :);
+  __asm__ ("mov     $0x5555, %%eax "  : : :"%eax");
+  __asm__ ("kmovw    %%eax, %%k7 " : : :);
+
+  MASK_REGS;
+
+  LOAD_CHI(ptr1);
+
+  __asm__ (
+  VZERO(zz)
+  VTIMESMINUSI(Chi_00,UChi_00,zz)
+  VTIMESMINUSI(Chi_01,UChi_01,zz)
+  VTIMESMINUSI(Chi_02,UChi_02,zz)
+  VTIMESMINUSI(Chi_10,UChi_10,zz)
+  VTIMESMINUSI(Chi_11,UChi_11,zz)
+  VTIMESMINUSI(Chi_12,UChi_12,zz)
+  );
+
+  SAVE_UCHI(ptr3);
+}
+
 void TimesIAvx512(void *ptr1,void *ptr3)
 {
  __asm__ ("mov     $0xAAAA, %%eax "  : : :"%eax");
-  __asm__ ("kmov    %%eax, %%k6 " : : :);
-  __asm__ ("knot     %%k6, %%k7 " : : :);
-
+  __asm__ ("kmovw    %%eax, %%k6 " : : :);
+  __asm__ ("mov     $0x5555, %%eax "  : : :"%eax");
+  __asm__ ("kmovw    %%eax, %%k7 " : : :);

  MASK_REGS;
  
@@ -252,41 +383,69 @@ void WilsonDslashAvx512(void *ptr1,void *ptr2,void *ptr3)

 }

-#undef VLOAD
-#undef VSTORE
-#undef VMUL
-#undef VMADD
-#undef ZEND1
-#undef ZEND2
-#undef ZLOAD
-#undef ZMUL
-#undef ZMADD
-#undef VZERO
-#undef VTIMESI
-#undef VTIMESI0
-#undef VTIMESI1
-#undef VTIMESI2
-#undef VTIMESMINUSI
-#undef ZMULMEM2SP
-#undef ZMADDMEM2SP
-#define VZERO(A) VZEROf(A)
-#define VMOV(A,B) VMOVf(A,B)
-#define VADD(A,B,C) VADDf(A,B,C)
-#define VSUB(A,B,C) VSUBf(A,B,C)
-#define VTIMESI(A,B,C) VTIMESIf(A,B,C)
-#define VTIMESMINUSI(A,B,C) VTIMESMINUSIf(A,B,C)
+#include <simd/Intel512single.h>

-#define VLOAD(OFF,PTR,DEST)       VLOADf(OFF,PTR,DEST)
-#define VSTORE(OFF,PTR,SRC)       VSTOREf(OFF,PTR,SRC)
-#define VMUL(Uri,Uir,Chi,UChi,Z)  VMULf(Uri,Uir,Chi,UChi,Z)
-#define VMADD(Uri,Uir,Chi,UChi,Z) VMADDf(Uri,Uir,Chi,UChi,Z)
-#define ZEND1(A,B,C)               ZEND1f(A,B,C)
-#define ZEND2(A,B,C)               ZEND2f(A,B,C)
-#define ZLOAD(A,B,C,D)            ZLOADf(A,B,C,D)
-#define ZMUL(A,B,C,D,E)           ZMULf(A,B,C,D,E)
-#define ZMADD(A,B,C,D,E)          ZMADDf(A,B,C,D,E)
-#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
-#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
+void ZmulF(void *ptr1,void *ptr2,void *ptr3)
+{
+  __asm__ ("mov     $0xAAAA, %%eax "  : : :"%eax");
+  __asm__ ("kmovw    %%eax, %%k6 " : : :);
+  __asm__ ("mov     $0x5555, %%eax "  : : :"%eax");
+  __asm__ ("kmovw    %%eax, %%k7 " : : :);
+  MASK_REGS;
+  ZLOAD(0,ptr1,Chi_00,Z0);
+  ZLOAD(1,ptr1,Chi_01,Z1);
+  ZLOAD(2,ptr1,Chi_02,Z2);
+  ZLOAD(3,ptr1,Chi_10,Z3);
+  ZLOAD(4,ptr1,Chi_11,Z4);
+  ZLOAD(5,ptr1,Chi_12,Z5);
+
+  VLOAD(0,ptr2,Chi_20);
+  VLOAD(1,ptr2,Chi_21);
+  VLOAD(2,ptr2,Chi_22);
+  VLOAD(3,ptr2,Chi_30);  
+  VLOAD(4,ptr2,Chi_31);  
+  VLOAD(5,ptr2,Chi_32);  
+
+  ZMUL(Chi_00,Z0,Chi_20,UChi_00,UChi_20);
+  ZMUL(Chi_01,Z1,Chi_21,UChi_01,UChi_21);
+  ZMUL(Chi_02,Z2,Chi_22,UChi_02,UChi_22);
+  ZMUL(Chi_10,Z3,Chi_23,UChi_10,UChi_30);
+  ZMUL(Chi_11,Z4,Chi_24,UChi_11,UChi_31);
+  ZMUL(Chi_12,Z5,Chi_25,UChi_12,UChi_32);
+  
+  ZEND1(UChi_00,UChi_20,Z0);
+  ZEND1(UChi_01,UChi_21,Z1);
+  ZEND1(UChi_02,UChi_22,Z2);
+  ZEND1(UChi_10,UChi_30,Z3);
+  ZEND1(UChi_11,UChi_31,Z4);
+  ZEND1(UChi_12,UChi_32,Z5);
+
+  ZEND2(UChi_00,UChi_20,Z0);
+  ZEND2(UChi_01,UChi_21,Z1);
+  ZEND2(UChi_02,UChi_22,Z2);
+  ZEND2(UChi_10,UChi_30,Z3);
+  ZEND2(UChi_11,UChi_31,Z4);
+  ZEND2(UChi_12,UChi_32,Z5);
+
+  SAVE_UCHI(ptr3); 
+}
+
+void TimesMinusIAvx512F(void *ptr1,void *ptr3)
+{
+  MASK_REGS;
+
+  LOAD_CHI(ptr1);
+  __asm__ (
+  VZERO(zz)
+  VTIMESMINUSI(Chi_00,UChi_00,zz)
+  VTIMESMINUSI(Chi_01,UChi_01,zz)
+  VTIMESMINUSI(Chi_02,UChi_02,zz)
+  VTIMESMINUSI(Chi_10,UChi_10,zz)
+  VTIMESMINUSI(Chi_11,UChi_11,zz)
+  VTIMESMINUSI(Chi_12,UChi_12,zz)
+           );
+  SAVE_UCHI(ptr3);
+}

 void TimesIAvx512F(void *ptr1,void *ptr3)
 {
@@ -311,7 +470,8 @@ void WilsonDslashAvx512F(void *ptr1,void *ptr2,void *ptr3)

  LOAD_CHI(ptr1);

-  MULT_2SPIN(ptr2);
+  MULT_ADDSUB_2SPIN(ptr2);
+  //MULT_2SPIN(ptr2);

  SAVE_UCHI(ptr3);