mirror of https://github.com/paboyle/Grid.git synced 2024-11-09 23:45:36 +00:00

Large change with KNL preparation

paboyle 2016-06-03 03:24:26 -07:00
parent 1c0e922585
commit 139cc5f1ae
26 changed files with 1810 additions and 1705 deletions

View File

@@ -27,6 +27,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
#include <PerfCount.h>
using namespace std;
using namespace Grid;
@@ -45,6 +46,10 @@ struct scal {
};
bool overlapComms = false;
typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
typedef WilsonFermion5D<DomainWallRedBlack5dImplD> WilsonFermion5DD;
int main (int argc, char ** argv)
{
@@ -58,12 +63,17 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::vector<int> latt4 = GridDefaultLatt();
const int Ls=8;
const int Ls=16;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
@@ -78,7 +88,9 @@ int main (int argc, char ** argv)
ColourMatrix cm = Complex(1.0,0.0);
LatticeGaugeField Umu(UGrid); random(RNG4,Umu);
LatticeGaugeField Umu(UGrid);
random(RNG4,Umu);
LatticeGaugeField Umu5d(FGrid);
// replicate across fifth dimension
@@ -119,11 +131,16 @@ int main (int argc, char ** argv)
RealD NP = UGrid->_Nprocessors;
for(int doasm=1;doasm<2;doasm++){
QCD::WilsonKernelsStatic::AsmOpt=doasm;
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
int ncall=100;
{
int ncall =50;
if (0) {
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.Dhop(src,result,0);
@@ -140,10 +157,83 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
Dw.Report();
// Dw.Report();
}
exit(0);
if (1)
{
typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
LatticeFermionF ssrc(sFGrid);
LatticeFermionF sref(sFGrid);
LatticeFermionF sresult(sFGrid);
WilsonFermion5DF sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5,params);
for(int x=0;x<latt4[0];x++){
for(int y=0;y<latt4[1];y++){
for(int z=0;z<latt4[2];z++){
for(int t=0;t<latt4[3];t++){
for(int s=0;s<Ls;s++){
std::vector<int> site({s,x,y,z,t});
SpinColourVectorF tmp;
peekSite(tmp,src,site);
pokeSite(tmp,ssrc,site);
}}}}}
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
sDw.Dhop(ssrc,sresult,0);
__SSC_STOP;
}
double t1=usecond();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall;
std::cout<<GridLogMessage << "Called Dw sinner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl;
// sDw.Report();
if(1){
for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
sDw.Dhop(ssrc,sresult,0);
PerformanceCounter Counter(i);
Counter.Start();
sDw.Dhop(ssrc,sresult,0);
Counter.Stop();
Counter.Report();
}
}
if(0){
std::cout<<GridLogMessage << " Cycle reporting "<<std::endl;
QCD::WilsonFermion5DStatic::CycleReport=1;
for(int i=0;i<ncall;i++){
sDw.Dhop(ssrc,sresult,0);
}
QCD::WilsonFermion5DStatic::CycleReport=0;
}
RealF sum=0;
for(int x=0;x<latt4[0];x++){
for(int y=0;y<latt4[1];y++){
for(int z=0;z<latt4[2];z++){
for(int t=0;t<latt4[3];t++){
for(int s=0;s<Ls;s++){
std::vector<int> site({s,x,y,z,t});
SpinColourVectorF normal, simd;
peekSite(normal,result,site);
peekSite(simd,sresult,site);
sum=sum+norm2(normal-simd);
// std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl;
// std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<normal<<std::endl;
// std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<simd<<std::endl;
}}}}}
std::cout<<" difference between normal and simd is "<<sum<<std::endl;
}
if (1)
{ // Naive wilson dag implementation
@@ -217,5 +307,8 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
}
Grid_finalize();
}
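The timing pattern above recurs throughout this benchmark: bracket ncall kernel invocations with usecond(), then convert to Mflop/s via the 1344 flops/site cost used for Dhop. A minimal standalone sketch of that arithmetic (the lattice extents and the commented-out kernel call are placeholders, not Grid code):

#include <sys/time.h>
#include <cstdio>

// usecond() as used above: wall-clock microseconds.
static double usecond(void) {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  return 1.0 * tv.tv_usec + 1.0e6 * tv.tv_sec;
}

int main(void) {
  const int Ls = 16, ncall = 100, Nd = 4;
  const int latt4[4] = {16, 16, 16, 16};           // placeholder local lattice
  double volume = Ls;
  for (int mu = 0; mu < Nd; mu++) volume *= latt4[mu];

  double t0 = usecond();
  for (int i = 0; i < ncall; i++) { /* Dw.Dhop(src,result,0); */ }
  double t1 = usecond();

  double flops = 1344.0 * volume * ncall;           // Dhop flop count per site
  std::printf("mflop/s = %f\n", flops / (t1 - t0)); // flops per us == Mflop/s
  return 0;
}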

View File

@@ -119,7 +119,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
mfc = flops*ncall/(t1-t0);
std::cout<<GridLogMessage << "Called C++ Dw"<< " mflop/s = "<< mfc<<std::endl;
QCD::WilsonFermion5DStatic::AsmOptDslash=1;
QCD::WilsonKernelsStatic::AsmOpt=1;
t0=usecond();
for(int i=0;i<ncall;i++){
Dw.DhopOE(srce,resulta,0);

View File

@@ -55,6 +55,15 @@ echo :::::::::::::::::::::::::::::::::::::::::::
AC_CHECK_FUNCS([gettimeofday])
#AC_CHECK_LIB([gmp],[__gmpf_init],,
# [AC_MSG_ERROR(GNU Multiple Precision GMP library was not found in your system.
#Please install or provide the correct path to your installation
#Info at: http://www.gmplib.org)])
#AC_CHECK_LIB([mpfr],[mpfr_init],,
# [AC_MSG_ERROR(GNU Multiple Precision MPFR library was not found in your system.
#Please install or provide the correct path to your installation
#Info at: http://www.mpfr.org/)])
#
# SIMD instructions selection
@@ -199,6 +208,25 @@ case ${ac_RNG} in
AC_MSG_ERROR([${ac_RNG} unsupported --enable-rng option]);
;;
esac
#
# SDE timing mode
#
AC_ARG_ENABLE([timers],[AC_HELP_STRING([--enable-timers=yes|mp],\
[Enable system dependent high res timers])],\
[ac_TIMERS=${enable_timers}],[ac_TIMERS=yes])
case ${ac_TIMERS} in
yes)
AC_DEFINE([TIMERS_ON],[1],[TIMERS_ON] )
;;
no)
AC_DEFINE([TIMERS_OFF],[1],[TIMERS_OFF] )
;;
*)
AC_MSG_ERROR([${ac_TIMERS} unsupported --enable-timers option]);
;;
esac
#
# Chroma regression tests
#

View File

@@ -211,8 +211,7 @@ void Grid_init(int *argc,char ***argv)
Grid_quiesce_nodes();
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
QCD::WilsonFermionStatic::HandOptDslash=1;
QCD::WilsonFermion5DStatic::HandOptDslash=1;
QCD::WilsonKernelsStatic::HandOpt=1;
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
LebesgueOrder::UseLebesgueOrder=1;
@@ -276,11 +275,6 @@ void Grid_finalize(void)
Grid_unquiesce_nodes();
#endif
}
double usecond(void) {
struct timeval tv;
gettimeofday(&tv,NULL);
return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
}
void * Grid_backtrace_buffer[_NBACKTRACE];
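The flag handling above relies on GridCmdOptionExists; presumably it is the usual argv scan, along these lines (standalone sketch, helper name hypothetical):

#include <algorithm>
#include <string>

// Hypothetical standalone equivalent of the GridCmdOptionExists test above:
// scan argv for an exact flag match.
static bool cmdOptionExists(char **begin, char **end, const std::string &option) {
  return std::find(begin, end, option) != end;
}

// usage inside main(): if (cmdOptionExists(argv, argv + argc, "--dslash-opt")) ...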

View File

@@ -32,28 +32,44 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
namespace Grid {
#define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
#define RawConfig(A,B) (A<<8|B)
const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = {
#ifdef __linux__
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." },
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." },
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES , "CACHE_REFERENCES..." },
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES , "CACHE_MISSES......." },
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS) , "L1D_READ_MISS......"},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS) , "L1D_READ_ACCESS...."},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,MISS) , "L1D_WRITE_MISS....."},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,ACCESS) , "L1D_WRITE_ACCESS..."},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS) , "L1D_PREFETCH_MISS.."},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS"},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS) , "LL_READ_MISS......."},
// { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS) , "LL_READ_ACCESS....."},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS) , "LL_WRITE_MISS......"},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS) , "LL_WRITE_ACCESS...."},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS) , "LL_PREFETCH_MISS..."},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS) , "LL_PREFETCH_ACCESS."},
{ PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS) , "L1I_READ_MISS......"},
{ PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS) , "L1I_READ_ACCESS...."}
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES , "CACHE_REFERENCES..." , INSTRUCTIONS},
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES , "CACHE_MISSES......." , CACHE_REFERENCES},
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." , INSTRUCTIONS},
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." , CPUCYCLES },
// 4
#ifdef AVX512
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES },
{ PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x02,0x04), "L2_HIT_LOADS.......", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x04,0x04), "L2_MISS_LOADS......", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x10,0x04), "UTLB_MISS_LOADS....", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x08,0x04), "DTLB_MISS_LOADS....", L1D_READ_ACCESS },
// 11
#else
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS) , "L1D_READ_ACCESS....",INSTRUCTIONS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS) , "L1D_READ_MISS......",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,MISS) , "L1D_WRITE_MISS.....",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,ACCESS) , "L1D_WRITE_ACCESS...",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS) , "L1D_PREFETCH_MISS..",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS},
// 11
#endif
{ PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS) , "LL_READ_MISS.......",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS) , "LL_READ_ACCESS.....",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS) , "LL_WRITE_MISS......",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS) , "LL_WRITE_ACCESS....",L1D_READ_ACCESS},
//15
{ PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS) , "LL_PREFETCH_MISS...",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS) , "LL_PREFETCH_ACCESS.",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS) , "L1I_READ_MISS......",INSTRUCTIONS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS) , "L1I_READ_ACCESS....",INSTRUCTIONS}
//19
// { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" },
#endif
};
}
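The CacheControl macro above packs the config word exactly as perf_event_open(2) specifies for PERF_TYPE_HW_CACHE events: cache id in bits 0-7, operation in bits 8-15, result in bits 16-23. A quick standalone check of one entry (Linux only):

#include <linux/perf_event.h>
#include <cstdio>

int main(void) {
  // CacheControl(L1D,READ,MISS) expands to:
  unsigned long long config =
      PERF_COUNT_HW_CACHE_L1D |
      (PERF_COUNT_HW_CACHE_OP_READ << 8) |
      (PERF_COUNT_HW_CACHE_RESULT_MISS << 16);
  std::printf("L1D_READ_MISS config = 0x%llx\n", config); // 0x10000
  return 0;
}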

View File

@@ -58,6 +58,28 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
}
#endif
#ifdef TIMERS_OFF
#warning PerfCount: Disabling cycle timers
inline uint64_t cyclecount(void){
return 0;
}
#define __SSC_MARK(mark) __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(mark):"%ebx")
#define __SSC_STOP __SSC_MARK(0x110)
#define __SSC_START __SSC_MARK(0x111)
#else
#define __SSC_MARK(mark)
#define __SSC_STOP
#define __SSC_START
/*
* cycle counters arch dependent
*/
#ifdef __bgq__
inline uint64_t cyclecount(void){
uint64_t tmp;
@@ -65,18 +87,21 @@ inline uint64_t cyclecount(void){
return tmp;
}
#elif defined __x86_64__
#include <immintrin.h>
#ifndef __INTEL_COMPILER
#include <x86intrin.h>
#endif
inline uint64_t cyclecount(void){
return __rdtsc();
inline uint64_t cyclecount(void){
return __rdtsc();
// unsigned int dummy;
// return __rdtscp(&dummy);
}
#else
#warning No cycle counter implemented for this architecture
inline uint64_t cyclecount(void){
return 0;
}
#endif
#endif
class PerformanceCounter {
@@ -87,6 +112,7 @@ private:
uint32_t type;
uint64_t config;
const char *name;
int normalisation;
} PerformanceCounterConfig;
static const PerformanceCounterConfig PerformanceCounterConfigs [];
@@ -94,26 +120,12 @@ private:
public:
enum PerformanceCounterType {
CPUCYCLES=0,
INSTRUCTIONS,
// STALL_CYCLES,
CACHE_REFERENCES,
CACHE_MISSES,
L1D_READ_MISS,
L1D_READ_ACCESS,
L1D_WRITE_MISS,
L1D_WRITE_ACCESS,
L1D_PREFETCH_MISS,
L1D_PREFETCH_ACCESS,
LL_READ_MISS,
// LL_READ_ACCESS,
LL_WRITE_MISS,
LL_WRITE_ACCESS,
LL_PREFETCH_MISS,
LL_PREFETCH_ACCESS,
L1I_READ_MISS,
L1I_READ_ACCESS,
PERFORMANCE_COUNTER_NUM_TYPES
CACHE_REFERENCES=0,
CACHE_MISSES=1,
CPUCYCLES=2,
INSTRUCTIONS=3,
L1D_READ_ACCESS=4,
PERFORMANCE_COUNTER_NUM_TYPES=19
};
public:
@@ -121,7 +133,9 @@ public:
int PCT;
long long count;
long long cycles;
int fd;
int cyclefd;
unsigned long long elapsed;
uint64_t begin;
@@ -134,7 +148,9 @@ public:
assert(_pct>=0);
assert(_pct<PERFORMANCE_COUNTER_NUM_TYPES);
fd=-1;
cyclefd=-1;
count=0;
cycles=0;
PCT =_pct;
Open();
#endif
@@ -159,6 +175,15 @@ public:
fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
perror("Error is");
}
int norm = PerformanceCounterConfigs[PCT].normalisation;
pe.type = PerformanceCounterConfigs[norm].type;
pe.config= PerformanceCounterConfigs[norm].config;
name = PerformanceCounterConfigs[norm].name;
cyclefd = perf_event_open(&pe, 0, -1, -1, 0); // pid 0, cpu -1 current process any cpu. group -1
if (cyclefd == -1) {
fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
perror("Error is");
}
#endif
}
@@ -168,6 +193,8 @@ public:
if ( fd!= -1) {
::ioctl(fd, PERF_EVENT_IOC_RESET, 0);
::ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
::ioctl(cyclefd, PERF_EVENT_IOC_RESET, 0);
::ioctl(cyclefd, PERF_EVENT_IOC_ENABLE, 0);
}
begin =cyclecount();
#else
@@ -177,10 +204,13 @@ public:
void Stop(void) {
count=0;
cycles=0;
#ifdef __linux__
if ( fd!= -1) {
::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0);
::read(fd, &count, sizeof(long long));
::read(cyclefd, &cycles, sizeof(long long));
}
elapsed = cyclecount() - begin;
#else
@@ -190,7 +220,11 @@ public:
}
void Report(void) {
#ifdef __linux__
std::printf("%llu cycles %s = %20llu\n", elapsed , PerformanceCounterConfigs[PCT].name, count);
int N = PerformanceCounterConfigs[PCT].normalisation;
const char * sn = PerformanceCounterConfigs[N].name ;
const char * sc = PerformanceCounterConfigs[PCT].name;
std::printf("tsc = %llu %s = %llu %s = %20llu\n (%s/%s) rate = %lf\n", elapsed,sn ,cycles,
sc, count, sc,sn, (double)count/(double)cycles);
#else
std::printf("%llu cycles \n", elapsed );
#endif
@@ -199,7 +233,7 @@ public:
~PerformanceCounter()
{
#ifdef __linux__
::close(fd);
::close(fd); ::close(cyclefd);
#endif
}
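Stripped of the Grid scaffolding, the fd/cyclefd lifecycle above is the standard perf_event sequence: open, reset and enable via ioctl, run the region of interest, disable, read. A minimal single-counter sketch (Linux only, error handling mostly elided):

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <cstring>
#include <cstdio>

int main(void) {
  struct perf_event_attr pe;
  memset(&pe, 0, sizeof(pe));
  pe.type = PERF_TYPE_HARDWARE;
  pe.size = sizeof(pe);
  pe.config = PERF_COUNT_HW_INSTRUCTIONS;
  pe.disabled = 1;
  pe.exclude_kernel = 1;

  // pid 0, cpu -1: this process, any cpu; no group, no flags (as above).
  int fd = (int)syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0);
  if (fd == -1) { perror("perf_event_open"); return 1; }

  ioctl(fd, PERF_EVENT_IOC_RESET, 0);
  ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
  volatile double x = 0;
  for (int i = 0; i < 1000000; i++) x += i;        // region of interest
  ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

  long long count = 0;
  read(fd, &count, sizeof(count));
  std::printf("instructions = %lld\n", count);
  close(fd);
  return 0;
}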

File diff suppressed because it is too large

View File

@@ -35,11 +35,20 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
namespace Grid {
#ifdef TIMERS_OFF
#warning Timer.h Disabling timers
#endif
// Dress the output; use std::chrono
// C++11 time facilities better?
double usecond(void);
inline double usecond(void) {
struct timeval tv;
#ifdef TIMERS_ON
gettimeofday(&tv,NULL);
#endif
return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
}
typedef std::chrono::system_clock GridClock;
typedef std::chrono::time_point<GridClock> GridTimePoint;
@@ -63,17 +72,23 @@ public:
}
void Start(void) {
assert(running == false);
#ifdef TIMERS_ON
start = GridClock::now();
#endif
running = true;
}
void Stop(void) {
assert(running == true);
#ifdef TIMERS_ON
accumulator+= std::chrono::duration_cast<GridUsecs>(GridClock::now()-start);
#endif
running = false;
};
void Reset(void){
running = false;
#ifdef TIMERS_ON
start = GridClock::now();
#endif
accumulator = std::chrono::duration_cast<GridUsecs>(start-start);
}
GridTime Elapsed(void) {
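GridStopWatch above accumulates a chrono duration across Start/Stop pairs, and with TIMERS_OFF the clock reads compile away so the accumulator stays at zero. A minimal standalone version of the same accumulate-on-Stop pattern (illustrative, not Grid's actual class):

#include <chrono>
#include <cstdio>

struct StopWatch {                                 // cf. GridStopWatch above
  using Clock = std::chrono::system_clock;         // GridClock
  Clock::time_point start;
  std::chrono::microseconds accum{0};
  void Start(void) { start = Clock::now(); }
  void Stop(void) {
    accum += std::chrono::duration_cast<std::chrono::microseconds>(Clock::now() - start);
  }
};

int main(void) {
  StopWatch sw;
  sw.Start();
  volatile double x = 0;
  for (int i = 0; i < 1000000; i++) x += i;        // timed work
  sw.Stop();
  std::printf("elapsed = %lld us\n", (long long)sw.accum.count());
  return 0;
}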

View File

@@ -172,6 +172,9 @@ public:
_rdimensions[d]= _ldimensions[d]/_simd_layout[d];
// all elements of a simd vector must have same checkerboard.
//
// If Ls is vectorised, this must still be the case; e.g.
// simd_layout == 8 requires _rdimensions[d] >= 2 (and even).
if ( simd_layout[d]>1 ) assert((_rdimensions[d]&0x1)==0);
_osites *= _rdimensions[d];
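To make the new assertion concrete: when a direction is SIMD-split, lane L of a vector sits at coordinate inner + L*_rdimensions[d], so all lanes share a checkerboard only if _rdimensions[d] is even. Worked numbers (hypothetical extents):

// _ldimensions[d]=16, _simd_layout[d]=8  ->  _rdimensions[d]=2 (even): lane
//   offsets 0,2,4,... contribute the same parity along d. OK.
// _ldimensions[d]=8,  _simd_layout[d]=8  ->  _rdimensions[d]=1 (odd): lane
//   offsets 0,1,2,... alternate parity, mixing checkerboards. Assert fires.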

View File

@@ -288,40 +288,20 @@ PARALLEL_FOR_LOOP
void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
DhopInternalCommsThenCompute(st,U,in,out,dag);
}
template<class Impl>
void WilsonFermion<Impl>::DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) {
assert((dag==DaggerNo) ||(dag==DaggerYes));
Compressor compressor(dag);
st.HaloExchange(in,compressor);
if ( dag == DaggerYes ) {
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
}
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
}
} else {
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out);
}
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out);
}
}
};

View File

@@ -114,9 +114,6 @@ namespace Grid {
void DhopInternal(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) ;
void DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) ;
// Constructor
WilsonFermion(GaugeField &_Umu,
GridCartesian &Fgrid,

View File

@@ -1,5 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -39,8 +38,6 @@ namespace QCD {
// S-direction is INNERMOST and takes no part in the parity.
const std::vector<int> WilsonFermion5DStatic::directions ({1,2,3,4, 1, 2, 3, 4});
const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1});
int WilsonFermion5DStatic::HandOptDslash;
int WilsonFermion5DStatic::AsmOptDslash;
// 5d lattice for DWF.
template<class Impl>
@@ -300,6 +297,8 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
template<class Impl>
void WilsonFermion5D<Impl>::Report(void)
{
return;
#if 0
std::cout<<GridLogMessage << "******************** WilsonFermion"<<std::endl;
std::cout<<GridLogMessage << "Wilson5d time "<<alltime <<" us"<<std::endl;
std::cout<<GridLogMessage << "HaloBegin time "<<commtime <<" us"<<std::endl;
@@ -320,6 +319,7 @@ void WilsonFermion5D<Impl>::Report(void)
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil join time "<<Stencil.jointime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
#endif
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
@@ -342,25 +342,14 @@ template<class Impl>
void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
DhopInternalCommsThenCompute(st,lo,U,in,out,dag);
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
// assert((dag==DaggerNo) ||(dag==DaggerYes));
alltime-=usecond();
Compressor compressor(dag);
// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
int LLs = in._grid->_rdimensions[0];
commtime -=usecond();
// auto handle = st.HaloExchangeBegin(in,compressor);
// st.HaloExchangeComplete(handle);
st.HaloExchange(in,compressor);
commtime +=usecond();
@@ -368,59 +357,24 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
jointime +=usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
// Note loop ordering and data layout.
// Designed to create
// - per-thread reuse in L1 cache for U
// - 8 linear, unit-stride access streams per thread for the Fermion, which the hardware can prefetch.
dslashtime -=usecond();
if ( dag == DaggerYes ) {
if( this->HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
for(int s=0;s<LLs;s++){
int sU=ss;
int sF = s+LLs*sU;
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
}
}
} else {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
for(int s=0;s<LLs;s++){
int sU=ss;
int sF = s+LLs*sU;
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
}
for(int ss=0;ss<U._grid->oSites();ss++){
for(int s=0;s<LLs;s++){
int sU=ss;
int sF=s+LLs*sU;
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
}
}
} else {
if( this->AsmOptDslash ) {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
for(int s=0;s<LLs;s++){
int sU=ss;
int sF = s+LLs*sU;
Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
}
}
} else if( this->HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
for(int s=0;s<LLs;s++){
int sU=ss;
int sF = s+LLs*sU;
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
}
}
} else {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
for(int s=0;s<LLs;s++){
int sU=ss;
int sF = s+LLs*sU;
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
}
#pragma omp parallel for schedule(runtime)
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=lo.Reorder(ss);
int sF=LLs*sU;
for(int s=0;s<LLs;s++){
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
sF++;
}
}
}
@@ -473,7 +427,7 @@ FermOpTemplateInstantiate(WilsonFermion5D);
GparityFermOpTemplateInstantiate(WilsonFermion5D);
template class WilsonFermion5D<DomainWallRedBlack5dImplF>;
template class WilsonFermion5D<DomainWallRedBlack5dImplD>;
}}
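The rewritten compute loop above makes the fifth dimension the innermost, unit-stride index: for each 4d site sU (optionally Lebesgue-reordered) the 5d fermion index runs sF = LLs*sU, LLs*sU+1, ... A scalar sketch of the index walk (sizes are placeholders):

#include <cstdio>

int main(void) {
  const int LLs = 4, oSites4d = 3;   // placeholder sizes
  for (int ss = 0; ss < oSites4d; ss++) {
    int sU = ss;                     // or lo.Reorder(ss), as above
    int sF = LLs * sU;               // fifth dimension is innermost
    for (int s = 0; s < LLs; s++) {
      std::printf("sU=%d s=%d -> sF=%d\n", sU, s, sF);
      sF++;                          // unit-stride walk through the 5d field
    }
  }
  return 0;
}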

View File

@@ -49,8 +49,6 @@ namespace Grid {
class WilsonFermion5DStatic {
public:
// S-direction is INNERMOST and takes no part in the parity.
static int AsmOptDslash; // these are a temporary hack
static int HandOptDslash; // these are a temporary hack
static const std::vector<int> directions;
static const std::vector<int> displacements;
const int npoint = 8;
@@ -122,13 +120,6 @@ namespace Grid {
FermionField &out,
int dag);
void DhopInternalCommsThenCompute(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out,
int dag);
// Constructors
WilsonFermion5D(GaugeField &_Umu,
GridCartesian &FiveDimGrid,

View File

@@ -31,14 +31,43 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
namespace Grid {
namespace QCD {
int WilsonKernelsStatic::HandOpt;
int WilsonKernelsStatic::AsmOpt;
template<class Impl>
WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};
// Need controls to do interior, exterior, or both
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
if ( AsmOpt ) WilsonKernels<Impl>::DiracOptAsmDhopSite(st,U,buf,sF,sU,in,out);
else if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,U,buf,sF,sU,in,out);
else WilsonKernels<Impl>::DiracOptGenericDhopSite(st,U,buf,sF,sU,in,out);
}
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
// No asm implementation yet.
// if ( AsmOpt ) WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,U,buf,sF,sU,in,out);
// else
if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,U,buf,sF,sU,in,out);
else WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,U,buf,sF,sU,in,out);
}
////////////////////////////////////////////
// Generic implementation; move to different file?
////////////////////////////////////////////
template<class Impl>
void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
SiteHalfSpinor tmp;
SiteHalfSpinor chi;
@@ -214,9 +243,9 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField
// Need controls to do interior, exterior, or both
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
SiteHalfSpinor tmp;
SiteHalfSpinor chi;
@@ -518,17 +547,9 @@ void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
vstream(out._odata[sF],result);
}
#if ( ! defined(AVX512) )
template<class Impl>
void WilsonKernels<Impl>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
}
#endif
FermOpTemplateInstantiate(WilsonKernels);
template class WilsonKernels<DomainWallRedBlack5dImplF>;
template class WilsonKernels<DomainWallRedBlack5dImplD>;
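This is the heart of the refactor: the old per-call HandOptDslash/AsmOptDslash branching in the fermion operators collapses into DiracOptDhopSite, steered by the two WilsonKernelsStatic flags (the benchmark toggles AsmOpt directly; --dslash-opt sets HandOpt, see Init.cc above; the dagger variant has no asm path yet). A self-contained sketch of the same static-flag dispatch:

#include <cstdio>

// Standalone sketch of the static-flag kernel dispatch introduced above.
struct KernelsStatic {
  static int AsmOpt;                 // prefer the assembler kernel
  static int HandOpt;                // else prefer the hand-unrolled kernel
};
int KernelsStatic::AsmOpt = 0;
int KernelsStatic::HandOpt = 0;

static void DhopSiteAsm(void)     { std::printf("asm kernel\n"); }
static void DhopSiteHand(void)    { std::printf("hand-unrolled kernel\n"); }
static void DhopSiteGeneric(void) { std::printf("generic kernel\n"); }

static void DhopSite(void) {         // mirrors DiracOptDhopSite's selection
  if      (KernelsStatic::AsmOpt)  DhopSiteAsm();
  else if (KernelsStatic::HandOpt) DhopSiteHand();
  else                             DhopSiteGeneric();
}

int main(void) {
  KernelsStatic::AsmOpt = 1;         // what the benchmark loop toggles
  DhopSite();                        // -> asm kernel
  return 0;
}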

View File

@ -38,14 +38,21 @@ namespace Grid {
// Helper routines that implement Wilson stencil for a single site.
// Common to both the WilsonFermion and WilsonFermion5D
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
class WilsonKernelsStatic {
public:
// S-direction is INNERMOST and takes no part in the parity.
static int AsmOpt; // these are a temporary hack
static int HandOpt; // these are a temporary hack
};
template<class Impl> class WilsonKernels : public FermionOperator<Impl> {
template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic {
public:
INHERIT_IMPL_TYPES(Impl);
typedef FermionOperator<Impl> Base;
public:
void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out);
@@ -58,15 +65,26 @@ namespace Grid {
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
private:
// Specialised variants
void DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out);
void DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in,FermionField &out);
void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out);
int DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out);
int DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out);

View File

@@ -2,6 +2,8 @@
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernelsAsm.cc
Copyright (C) 2015
@@ -26,237 +28,75 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
#if defined(AVX512)
//#if defined (IMCI)
#include <simd/Intel512wilson.h>
#include <simd/Intel512single.h>
namespace Grid {
namespace QCD {
///////////////////////////////////////////////////////////
// Default to no assembler implementation
///////////////////////////////////////////////////////////
template<class Impl>
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
uint64_t now;
uint64_t first ;
int offset,local,perm, ptype;
const SiteHalfSpinor *pbuf = & buf[0];
const SiteSpinor *plocal = & in._odata[0];
void *pf;
int osites = in._grid->oSites();
StencilEntry *SE;
//#define STAMP(i) timers[i] = cyclecount() ;
#define STAMP(i) //timers[i] = cyclecount() ;
MASK_REGS;
first = cyclecount();
SE=st.GetEntry(ptype,Xm,ss);
// Xm
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Ym,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
XP_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR3; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFXM(Xm,pf);
}
XP_RECON;
// Ym
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Zm,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
YP_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR2; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFYM(Ym,pf);
}
YP_RECON_ACCUM;
// Zm
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Tm,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
ZP_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR1; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFZM(Zm,pf);
}
ZP_RECON_ACCUM;
// Tm
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
SE=st.GetEntry(ptype,Tp,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
TP_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR0; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFTM(Tm,pf);
}
TP_RECON_ACCUM;
// Tp
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Zp,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
TM_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR0; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFTP(Tp,pf);
}
TM_RECON_ACCUM;
// Zp
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Yp,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
ZM_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR1; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFZP(Zp,pf);
}
ZM_RECON_ACCUM;
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Xp,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
YM_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR2; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFYP(Yp,pf);
}
YM_RECON_ACCUM;
// Xp
perm = SE->_permute;
offset = SE->_offset;
local = SE->_is_local;
// Prefetch
SE=st.GetEntry(ptype,Xm,(ss+1)%osites);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
XM_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR3; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFXP(Xp,pf);
}
XM_RECON_ACCUM;
debug:
SAVE_RESULT(&out._odata[ss]);
assert(0);
}
template class WilsonKernels<WilsonImplF>;
template class WilsonKernels<WilsonImplD>;
template class WilsonKernels<GparityWilsonImplF>;
template class WilsonKernels<GparityWilsonImplD>;
template class WilsonKernels<DomainWallRedBlack5dImplF>;
template class WilsonKernels<DomainWallRedBlack5dImplD>;
}}
#if defined(AVX512)
///////////////////////////////////////////////////////////
// If we are AVX512 specialise the single precision routine
///////////////////////////////////////////////////////////
#include <simd/Intel512wilson.h>
#include <simd/Intel512single.h>
static Vector<vComplexF> signs;
int setupSigns(void ){
Vector<vComplexF> bother(2);
signs = bother;
vrsign(signs[0]);
visign(signs[1]);
return 1;
}
static int signInit = setupSigns();
#define MAYBEPERM(A,perm) if (perm) { A ; }
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr)
template<>
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef VMOVIDUP
#undef VMOVRDUP
#undef MAYBEPERM
#undef MULT_2SPIN
#define MAYBEPERM(A,B)
#define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C)
#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C)
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
template<>
void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#endif
template class WilsonKernels<WilsonImplF>;
template class WilsonKernels<WilsonImplD>;
template class WilsonKernels<GparityWilsonImplF>;
template class WilsonKernels<GparityWilsonImplD>;
template class WilsonKernels<DomainWallRedBlack5dImplF>;
template class WilsonKernels<DomainWallRedBlack5dImplD>;
}}
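The tail of this file shows the instantiation trick: bind MAYBEPERM, MULT_2SPIN and the VMOV*DUP macros one way, #include WilsonKernelsAsmBody.h as the function body for WilsonImplF, then rebind them and include it again for the Ls-vectorised DomainWallRedBlack5dImplF. A toy version of the same macro-rebinding pattern (bodies inlined here rather than included from a header):

#include <cstdio>

// First binding: the "Wilson" flavour of a shared body.
#define COMBINE(a, b) ((a) + (b))
static int body_wilson(int a, int b) { return COMBINE(a, b); }
#undef COMBINE

// Rebind the same name and instantiate the body again, differently.
#define COMBINE(a, b) ((a) * (b))
static int body_dwf5d(int a, int b) { return COMBINE(a, b); }
#undef COMBINE

int main(void) {
  std::printf("%d %d\n", body_wilson(2, 3), body_dwf5d(2, 3)); // 5 6
  return 0;
}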

View File

@@ -0,0 +1,154 @@
{
int locala,perma, ptypea;
int localb,permb, ptypeb;
uint64_t basea, baseb;
const uint64_t plocal =(uint64_t) & in._odata[0];
// vComplexF isigns[2] = { signs[0], signs[1] };
vComplexF *isigns = &signs[0];
MASK_REGS;
////////////////////////////////
// Xp
////////////////////////////////
int ent=ss*8;// 2*Ndim
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
if ( locala ) {
LOAD64(%r10,isigns);
XM_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR3,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFXP(Xp,baseb);
}
LOAD64(%r10,isigns);
XM_RECON;
////////////////////////////////
// Yp
////////////////////////////////
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YM_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR2,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFYP(Yp,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YM_RECON_ACCUM;
////////////////////////////////
// Zp
////////////////////////////////
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZM_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR1,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFZP(Zp,baseb);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZM_RECON_ACCUM;
////////////////////////////////
// Tp
////////////////////////////////
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TM_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR0,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFTP(Tp,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TM_RECON_ACCUM;
////////////////////////////////
// Xm
////////////////////////////////
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
XP_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR3,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFXM(Xm,baseb);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
XP_RECON_ACCUM;
////////////////////////////////
// Ym
////////////////////////////////
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YP_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR2,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFYM(Ym,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YP_RECON_ACCUM;
////////////////////////////////
// Zm
////////////////////////////////
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZP_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR1,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFZM(Zm,baseb);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZP_RECON_ACCUM;
////////////////////////////////
// Tm
////////////////////////////////
basea = (uint64_t)&out._odata[ss];
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TP_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR0,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFTM(Tm,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TP_RECON_ACCUM;
SAVE_RESULT(&out._odata[ss]);
}
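Note the basea/baseb ping-pong running through this body: while direction N is projected and multiplied, the stencil entry (address and prefetch target) for direction N+1 is already being resolved. A scalar sketch of the one-ahead pattern (lookup/compute are stand-ins for st.GetInfo and the project/multiply macros):

#include <cstdio>

struct Entry { int addr; };                                // stand-in for StencilEntry
static Entry lookup(int dir) { return Entry{1000 + dir}; } // cf. st.GetInfo
static void compute(const Entry &e) { std::printf("compute %d\n", e.addr); }

int main(void) {
  const int Ndirs = 8;                                     // Xp,Yp,Zp,Tp,Xm,Ym,Zm,Tm
  Entry curr = lookup(0);
  for (int d = 0; d < Ndirs; d++) {
    Entry next = lookup((d + 1) % Ndirs);                  // resolve next direction early
    compute(curr);                                         // project/multiply current one
    curr = next;
  }
  return 0;
}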

View File

@@ -312,7 +312,7 @@ namespace QCD {
template<class Impl>
int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
@@ -552,12 +552,10 @@ int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField
vstream(ref()(3)(1),result_31);
vstream(ref()(3)(2),result_32);
}
return 0;
}
template<class Impl>
int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
@@ -798,7 +796,6 @@ int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeFi
vstream(ref()(3)(1),result_31);
vstream(ref()(3)(2),result_32);
}
return 0;
}
@@ -806,125 +803,80 @@ int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeFi
// Specialise Gparity to simple implementation
////////////////////////////////////////////////
template<>
int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
return 0;
}
template<>
int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
return 0;
}
template<>
int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
return 0;
}
template<>
int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
return 0;
}
//////////////
/*
template<>
int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
return 0;
}
////////////// Wilson ; uses this implementation /////////////////////
// Need Nc=3 though //
template<>
int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
return 0;
}
template<>
int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
return 0;
}
template<>
int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
return 0;
}
*/
template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);

View File

@@ -367,6 +367,9 @@ namespace Grid {
template <class S,class V, IfComplex<S> = 0 > inline void vzero(Grid_simd<S,V> &ret) { vsplat(ret,S(0.0,0.0)); }// use xor?
template <class S,class V, IfComplex<S> = 0 > inline void vcomplex_i(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0,1.0));}
template <class S,class V, IfComplex<S> = 0 > inline void visign(Grid_simd<S,V> &ret){ vsplat(ret,S(1.0,-1.0));}
template <class S,class V, IfComplex<S> = 0 > inline void vrsign(Grid_simd<S,V> &ret){ vsplat(ret,S(-1.0,1.0));}
// if not complex overload here
template <class S,class V, IfReal<S> = 0 > inline void vone (Grid_simd<S,V> &ret){ vsplat(ret,S(1.0)); }
template <class S,class V, IfReal<S> = 0 > inline void vzero(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0)); }

View File

@@ -87,14 +87,39 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define VMOVRDUPd(OFF,A,DEST) "vpshufd $0x44," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: 1,0,3,2
#define VMOVIDUPd(OFF,A,DEST) "vpshufd $0xee," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: 3,2,3,2
#define VMOVRDUPf(OFF,PTR,DEST) "vmovsldup " #OFF "*64(" #PTR "), " #DEST ";\n"
#define VMOVIDUPf(OFF,PTR,DEST) "vmovshdup " #OFF "*64(" #PTR "), " #DEST ";\n"
#define VRDUPd(SRC,DEST) "vpshufd $0x44," #SRC"," #DEST ";\n" // 32 bit level: 1,0,3,2
#define VRDUPf(SRC,DEST) "vmovsldup " #SRC ", " #DEST ";\n"
#define VIDUPd(SRC,DEST) "vpshufd $0xee," #SRC"," #DEST ";\n" // 32 bit level: 3,2,3,2
#define VIDUPf(SRC,DEST) "vmovshdup " #SRC ", " #DEST ";\n"
#define VBCASTRDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST ";\n"
#define VBCASTIDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST ";\n"
#define VBCASTRDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST ";\n"
#define VBCASTIDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST ";\n"
#define VMADDSUBf(A,B,accum) "vfmaddsub231ps " #A "," #B "," #accum ";\n"
#define VMADDSUBd(A,B,accum) "vfmaddsub231pd " #A "," #B "," #accum ";\n"
#define VMADDSUBMEMf(O,P,B,accum) "vfmaddsub231ps " #O"*64("#P "),"#B "," #accum ";\n"
#define VMADDSUBMEMd(O,P,B,accum) "vfmaddsub231pd " #O"*64("#P "),"#B "," #accum ";\n"
#define VMADDSUBRDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
#define VMADDSUBIDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
#define VMULRDUPf(O,P,B,accum) "vmulps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
#define VMULIDUPf(O,P,B,accum) "vmulps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
#define VMADDSUBRDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
#define VMADDSUBIDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
#define VMULRDUPd(O,P,B,accum) "vmulpd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
#define VMULIDUPd(O,P,B,accum) "vmulpd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
/*
* TimesI is used only in the XP recon
* Could zero the regs and use RECON_ACCUM
*/
#define VTIMESI0f(A,DEST, Z) VSHUFf(A,DEST)
#define VTIMESI1f(A,DEST, Z) "vaddps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESI2f(A,DEST, Z) "vsubps " #DEST "," #Z "," #DEST"{%k7}" ";\n"
@@ -111,6 +136,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define VTIMESMINUSI1d(A,DEST,Z) "vsubpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESMINUSI2d(A,DEST,Z) "vaddpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"
#if 0
#define VACCTIMESMINUSI0f(A,ACC,tmp) VSHUFf(A,tmp)
#define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
@@ -127,6 +154,35 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define VACCTIMESI1d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESI2d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
#else
// o_p must point to floating 1.0f/d
//
// Ai, Ar -> tmp (r i)
// tmp *1.0
// ACC i - Ar ; ACC r + Ai
#define VACCTIMESMINUSI0f(A,ACC,tmp) VSHUFf(A,tmp)
#define VACCTIMESMINUSI1f(A,ACC,tmp) VMADDMEMf(1,%r10,tmp,ACC)
#define VACCTIMESMINUSI2f(A,ACC,tmp)
#define VACCTIMESMINUSI0d(A,ACC,tmp) VSHUFd(A,tmp)
#define VACCTIMESMINUSI1d(A,ACC,tmp) VMADDMEMd(1,%r10,tmp,ACC)
#define VACCTIMESMINUSI2d(A,ACC,tmp)
// Ai, Ar -> tmp (r i)
// tmp *1.0
// ACC i + Ar ; ACC r - Ai
#define VACCTIMESI0f(A,ACC,tmp) VSHUFf(A,tmp)
#define VACCTIMESI1f(A,ACC,tmp) VMADDMEMf(0,%r10,tmp,ACC)
#define VACCTIMESI2f(A,ACC,tmp)
#define VACCTIMESI0d(A,ACC,tmp) VSHUFd(A,tmp)
#define VACCTIMESI1d(A,ACC,tmp) VMADDMEMd(0,%r10,tmp,ACC)
#define VACCTIMESI2d(A,ACC,tmp)
#endif
#define VPERM0f(A,B) "vshuff32x4 $0x4e," #A "," #B "," #B ";\n"
#define VPERM1f(A,B) "vshuff32x4 $0xb1," #A "," #B "," #B ";\n"
#define VPERM2f(A,B) "vshufps $0x4e," #A "," #B "," #B ";\n"
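The #else branch above trades the masked add/sub pair (three instructions using %k6/%k7) for a shuffle plus one fused multiply-add against a per-lane (-1,+1) or (1,-1) constant pair held in memory at %r10 (the signs vector set up in WilsonKernelsAsm.cc). The scalar identity being exploited, with a quick check:

#include <complex>
#include <cstdio>

int main(void) {
  // ACC += i*A: since i*(ar + i*ai) = -ai + i*ar, shuffling A to (ai,ar) and
  // fused-multiply-adding against the per-lane constants (-1,+1) gives
  //   ACC.re += -ai ;  ACC.im += +ar
  std::complex<double> A(2.0, 3.0), ACC(10.0, 20.0);
  std::complex<double> expect = ACC + std::complex<double>(0, 1) * A;
  std::complex<double> trick(ACC.real() - A.imag(), ACC.imag() + A.real());
  std::printf("(%g,%g) vs (%g,%g)\n",
              expect.real(), expect.imag(), trick.real(), trick.imag()); // equal
  return 0;
}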

View File

@@ -1,92 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Avx512Asm.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_AV512_ADDSUB_H
#define GRID_ASM_AV512_ADDSUB_H
////////////////////////////////////////////////////////////////
// Building blocks for SU3 x 2spinor
// Load columns of U
// 18 U DUP's rr/ii
// 6 Chi shuffles ir,ri
// 6muls, 30 fmaddsubs
////////////////////////////////////////////////////////////////
#define MULT_ADDSUB_2SPIN(ptr) \
LOAD64(%r8,ptr) \
__asm__ ( \
VMOVIDUPf(0,%r8,Z0 ) \
VMOVIDUPf(3,%r8,Z1 )\
VMOVIDUPf(6,%r8,Z2 )\
VSHUFf(Chi_00,T1) \
VSHUFf(Chi_10,T2) \
\
VMULf(Z0,T1,UChi_00) VMOVRDUPf(0,%r8,Z3 ) \
VMULf(Z0,T2,UChi_10) VMOVRDUPf(3,%r8,Z4 ) \
VMULf(Z1,T1,UChi_01) VMOVRDUPf(6,%r8,Z5 ) \
VMULf(Z1,T2,UChi_11) VMOVIDUPf(1,%r8,Z0 ) \
VMULf(Z2,T1,UChi_02) VMOVIDUPf(4,%r8,Z1 ) \
VMULf(Z2,T2,UChi_12) VMOVIDUPf(7,%r8,Z2 ) \
\
VMADDSUBf(Z3,Chi_00,UChi_00) VSHUFf(Chi_01,T1) \
VMADDSUBf(Z3,Chi_10,UChi_10) VSHUFf(Chi_11,T2) \
VMADDSUBf(Z4,Chi_00,UChi_01) VMOVRDUPf(1,%r8,Z3 ) \
VMADDSUBf(Z4,Chi_10,UChi_11)\
VMADDSUBf(Z5,Chi_00,UChi_02) VMOVRDUPf(4,%r8,Z4 ) \
VMADDSUBf(Z5,Chi_10,UChi_12)\
\
VMADDSUBf(Z0,T1,UChi_00) VMOVRDUPf(7,%r8,Z5 ) \
VMADDSUBf(Z0,T2,UChi_10)\
VMADDSUBf(Z1,T1,UChi_01) VMOVIDUPf(2,%r8,Z0 ) \
VMADDSUBf(Z1,T2,UChi_11)\
VMADDSUBf(Z2,T1,UChi_02) VMOVIDUPf(5,%r8,Z1 ) \
VMADDSUBf(Z2,T2,UChi_12) VMOVIDUPf(8,%r8,Z2 ) \
\
VMADDSUBf(Z3,Chi_01,UChi_00) VSHUFf(Chi_02,T1) \
VMADDSUBf(Z3,Chi_11,UChi_10) VSHUFf(Chi_12,T2) \
VMADDSUBf(Z4,Chi_01,UChi_01) VMOVRDUPf(2,%r8,Z3 ) \
VMADDSUBf(Z4,Chi_11,UChi_11)\
VMADDSUBf(Z5,Chi_01,UChi_02) VMOVRDUPf(5,%r8,Z4 ) \
VMADDSUBf(Z5,Chi_11,UChi_12)\
\
VMADDSUBf(Z0,T1,UChi_00) VMOVRDUPf(8,%r8,Z5 ) \
VMADDSUBf(Z0,T2,UChi_10)\
VMADDSUBf(Z1,T1,UChi_01)\
VMADDSUBf(Z1,T2,UChi_11)\
VMADDSUBf(Z2,T1,UChi_02)\
VMADDSUBf(Z2,T2,UChi_12)\
\
VMADDSUBf(Z3,Chi_02,UChi_00)\
VMADDSUBf(Z3,Chi_12,UChi_10)\
VMADDSUBf(Z4,Chi_02,UChi_01)\
VMADDSUBf(Z4,Chi_12,UChi_11)\
VMADDSUBf(Z5,Chi_02,UChi_02)\
VMADDSUBf(Z5,Chi_12,UChi_12)\
);
#endif
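The deleted building block above (its successors live in Intel512single.h/Intel512double.h) evaluates each complex multiply as one shuffle plus two multiply/add-sub steps against the imaginary- and real-part broadcasts of U, which is where the "18 U DUP's, 6 Chi shuffles, 6 muls, 30 fmaddsubs" budget comes from. For a single scalar element the decomposition is:

#include <complex>
#include <cstdio>

int main(void) {
  // One complex multiply u*chi, decomposed as the macro schedules it:
  //   tmp = im(u) * swap(chi)        (VMOVIDUP + VSHUF + VMUL)
  //   res = re(u)*chi addsub tmp     (VMOVRDUP + VMADDSUB: even lane subtracts,
  //                                   odd lane adds)
  std::complex<float> u(1.5f, -0.5f), chi(2.0f, 3.0f);
  float tmp_re = u.imag() * chi.imag();          // swap puts chi.im in the re lane
  float tmp_im = u.imag() * chi.real();
  float res_re = u.real() * chi.real() - tmp_re; // addsub: subtract (even lane)
  float res_im = u.real() * chi.imag() + tmp_im; // addsub: add (odd lane)
  std::complex<float> direct = u * chi;
  std::printf("(%g,%g) vs (%g,%g)\n",
              res_re, res_im, direct.real(), direct.imag()); // equal
  return 0;
}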

View File

@@ -86,8 +86,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n"
#define VMOVd(A,DEST) "vmovapd " #A ", " #DEST ";\n"
#define VPREFETCHG(O,A)
#define VPREFETCHW(O,A)
#define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n"
#define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n"
#define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n"
#define VEVICT(O,A)
//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"

View File

@@ -133,3 +133,22 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#undef VRDUP
#undef VIDUP
#undef VMADDSUBMEM
#undef VMADDMEM
#undef VMULMEM
#define VRDUP(SRC,DEST) VRDUPd(SRC,DEST)
#define VIDUP(SRC,DEST) VIDUPd(SRC,DEST)
#define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMd(O,P,B,accum)
#define VMADDMEM(O,P,B,accum) VMADDMEMd(O,P,B,accum)
#define VMULMEM(O,P,B,accum) VMULMEMd(O,P,B,accum)
#undef VMADDSUBRDUP
#undef VMADDSUBIDUP
#undef VMULRDUP
#undef VMULIDUP
#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPd(O,P,B,accum)
#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPd(O,P,B,accum)
#define VMULRDUP(O,P,B,accum) VMULRDUPd(O,P,B,accum)
#define VMULIDUP(O,P,B,accum) VMULIDUPd(O,P,B,accum)

View File

@@ -116,7 +116,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define VMADDSUB(A,B,accum) VMADDSUBf(A,B,accum)
#define VSHUF(A,B) VSHUFf(A,B)
#undef ZEND1
#undef ZEND2
#undef ZLOAD
@@ -133,3 +132,24 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#undef VRDUP
#undef VIDUP
#undef VMADDSUBMEM
#undef VMADDMEM
#undef VMULMEM
#define VRDUP(SRC,DEST) VRDUPf(SRC,DEST)
#define VIDUP(SRC,DEST) VIDUPf(SRC,DEST)
#define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMf(O,P,B,accum)
#define VMADDMEM(O,P,B,accum) VMADDMEMf(O,P,B,accum)
#define VMULMEM(O,P,B,accum) VMULMEMf(O,P,B,accum)
#undef VMADDSUBRDUP
#undef VMADDSUBIDUP
#undef VMULRDUP
#undef VMULIDUP
#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPf(O,P,B,accum)
#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPf(O,P,B,accum)
#define VMULRDUP(O,P,B,accum) VMULRDUPf(O,P,B,accum)
#define VMULIDUP(O,P,B,accum) VMULIDUPf(O,P,B,accum)

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -27,9 +27,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#ifndef GRID_ASM_INTEL_512_QCD_H
#define GRID_ASM_INTEL_512_QCD_H
//////////////////////////////////////////////////////////////////////////////////////////
// Register allocations for Wilson Kernel are precision and IMCI/AVX512 independent
// Register allocations for Wilson Kernel are precision independent
//////////////////////////////////////////////////////////////////////////////////////////
#define result_00 %zmm0
#define result_01 %zmm1
@@ -64,7 +64,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define UChi_12 %zmm23
#define Uir %zmm24
//#define ONE %zmm24
#define Uri %zmm25
#define T1 %zmm24
#define T2 %zmm25
@@ -92,13 +91,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define Chimu_32 UChi_12
#include <simd/Intel512common.h>
#ifdef AVX512
#include <simd/Intel512avx.h>
//#include <simd/Intel512avxAddsub.h> // Alternate implementation
#endif
#ifdef IMCI
#include <simd/Intel512imci.h>
#endif
//////////////////////////////////////////////////////////////////
// Macros used to build wilson kernel -- can rationalise and simplify
@@ -193,47 +186,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
VSTORE(11,%r8,result_32) \
);
// auto ptr = &U._odata[sU](A);
// A plan for lifting loads:
// can use Z2/3/4/5 and U/U for the U field in the first step;
// can use Chi_00, Chi_10 and U/U for the U field in the second step;
// can use Chi_00, Chi_10, Chi_01, Chi_11 and U/U for the U field in the third step.
// This enables lifting ALL loads a few cycles earlier, alleviating OoO pressure if needed.
// KNL is DUAL issue for FP, so lifting these loads is potentially important.
// Need detailed profile data to be sure.
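
// Sketch of the lifting idea (illustrative; names hypothetical, and
// cmadd_rrii is the helper sketched earlier): issue all nine U loads
// before the FMA chain so they retire while earlier arithmetic executes.
#include <immintrin.h>
__m512 cmadd_rrii(__m512 u, __m512 chi, __m512 accum); // earlier sketch

inline void mult_su3_hoisted(const float *Uptr, const __m512 chi[3],
                             __m512 uchi[3])
{
  __m512 u[9];
  for (int c = 0; c < 9; ++c)
    u[c] = _mm512_loadu_ps(Uptr + 16 * c);   // lift ALL loads up front
  for (int r = 0; r < 3; ++r) {
    uchi[r] = _mm512_setzero_ps();
    for (int c = 0; c < 3; ++c)              // then pure FMA traffic
      uchi[r] = cmadd_rrii(u[3 * r + c], chi[c], uchi[r]);
  }
}
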
#if 0
#define PREFETCH_U(A) \
LOAD64(%r8,&U._odata[sU](A)) \
__asm__ ( \
VPREFETCHG(0,%r8) \
VPREFETCHG(1,%r8) \
VPREFETCHG(2,%r8) \
VPREFETCHG(3,%r8) \
VPREFETCHG(4,%r8) \
VPREFETCHG(5,%r8) \
VPREFETCHG(6,%r8) \
VPREFETCHG(7,%r8) \
VPREFETCHG(8,%r8) );
#define PREFETCH_R(A) \
LOAD64(%r8,&out._odata[ss]) \
__asm__ ( \
VPREFETCHW(0,%r8) \
VPREFETCHW(1,%r8) \
VPREFETCHW(2,%r8) \
VPREFETCHW(3,%r8) \
VPREFETCHW(4,%r8) \
VPREFETCHW(5,%r8) \
VPREFETCHW(6,%r8) \
VPREFETCHW(7,%r8) \
VPREFETCHW(8,%r8) \
VPREFETCHW(9,%r8) \
VPREFETCHW(10,%r8) \
VPREFETCHW(11,%r8) );
#endif
#define MULT_2SPIN_DIR(A) MULT_2SPIN(&U._odata[sU](A))
#define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN_PFYP(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFZP(A,p) MULT_2SPIN_PFZP(&U._odata[sU](A),p)
@@ -244,131 +196,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN_PFZM(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN_PFTM(&U._odata[sU](A),p)
#if 0
#define MULT_2SPIN_UNOPT(ptr) \
LOAD64(%r8,ptr) \
__asm__ ( \
ZLOAD (0,%r8,UChi_01,UChi_11) \
ZLOAD (3,%r8,UChi_02,UChi_12) \
ZLOAD (6,%r8,Uri,Uir) \
ZMUL (UChi_01,UChi_11,Chi_00,UChi_00,Z0) \
ZMUL (UChi_01,UChi_11,Chi_10,UChi_10,Z1) \
ZMUL (UChi_02,UChi_12,Chi_00,UChi_01,Z2) \
ZMUL (UChi_02,UChi_12,Chi_10,UChi_11,Z3) \
ZMUL (Uri,Uir, Chi_00,UChi_02,Z4) \
ZMUL (Uri,Uir, Chi_10,UChi_12,Z5) \
\
ZLOAD (1,%r8,Uri,Uir) \
ZLOAD (4,%r8,Chi_00, Chi_10) \
ZMADD (Uri,Uir, Chi_01,UChi_00,Z0) \
ZMADD (Uri,Uir, Chi_11,UChi_10,Z1) \
ZLOAD (7,%r8,Uri,Uir) \
ZMADD (Chi_00, Chi_10,Chi_01,UChi_01,Z2) \
ZMADD (Chi_00, Chi_10,Chi_11,UChi_11,Z3) \
ZLOAD (2,%r8,Chi_00,Chi_10) \
ZMADD(Uri,Uir, Chi_01,UChi_02,Z4) \
ZMADD(Uri,Uir, Chi_11,UChi_12,Z5) \
\
ZLOAD (5,%r8,Uri,Uir) \
ZMADD (Chi_00,Chi_10, Chi_02,UChi_00,Z0) \
ZMADD (Chi_00,Chi_10, Chi_12,UChi_10,Z1) \
ZLOAD (8,%r8,Chi_00,Chi_10) \
ZMADD (Uri,Uir, Chi_02,UChi_01,Z2) \
ZMADD (Uri,Uir, Chi_12,UChi_11,Z3) \
ZMADD(Chi_00,Chi_10, Chi_02,UChi_02,Z4) \
ZMADD(Chi_00,Chi_10, Chi_12,UChi_12,Z5) \
\
ZEND1(UChi_00,Z0,Chi_01) \
ZEND1(UChi_10,Z1,Chi_11) \
ZEND1(UChi_01,Z2,Chi_00) \
ZEND1(UChi_11,Z3,Chi_10) \
ZEND1(UChi_02,Z4,Chi_02) \
ZEND1(UChi_12,Z5,Chi_12) \
ZEND2(UChi_00,Z0,Chi_01) \
ZEND2(UChi_10,Z1,Chi_11) \
ZEND2(UChi_01,Z2,Chi_00) \
ZEND2(UChi_11,Z3,Chi_10) \
ZEND2(UChi_02,Z4,Chi_02) \
ZEND2(UChi_12,Z5,Chi_12) );
#endif
#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr)
#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr)
#define MULT_2SPIN_PFZM(ptr,pf) MULT_2SPIN(ptr)
#define MULT_2SPIN_PFTM(ptr,pf) MULT_2SPIN(ptr)
#define MULT_2SPIN_PFTP(ptr,pf) MULT_2SPIN(ptr)
#define MULT_2SPIN_PFZP(ptr,pf) MULT_2SPIN(ptr)
#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr)
#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr)
// MULT_2SPINa(ptr) MULT_2SPIN_PF(ptr,ptr,VPREFETCHG);
#if 0
#define MULT_2SPIN_PF(ptr,pf,VPF) \
LOAD64(%r8,ptr) \
LOAD64(%r9,pf) \
__asm__ ( \
ZMULMEM2SP(0,%r8,Uri,Chi_00,Chi_10,UChi_00,Z0,UChi_10,Z1) \
VPF(0,%r9) \
ZMULMEM2SP(3,%r8,Uri,Chi_00,Chi_10,UChi_01,Z2,UChi_11,Z3) \
VPF(1,%r9) \
ZMULMEM2SP(6,%r8,Uri,Chi_00,Chi_10,UChi_02,Z4,UChi_12,Z5) \
VPF(2,%r9) \
\
ZMADDMEM2SP(1,%r8,Uri,Chi_01,Chi_11,UChi_00,Z0,UChi_10,Z1) \
VPF(3,%r9) \
ZMADDMEM2SP(4,%r8,Uri,Chi_01,Chi_11,UChi_01,Z2,UChi_11,Z3) \
VPF(4,%r9) \
ZMADDMEM2SP(7,%r8,Uri,Chi_01,Chi_11,UChi_02,Z4,UChi_12,Z5) \
VPF(5,%r9) \
\
ZMADDMEM2SP(2,%r8,Uri,Chi_02,Chi_12,UChi_00,Z0,UChi_10,Z1) \
VPF(6,%r9) \
ZMADDMEM2SP(5,%r8,Uri,Chi_02,Chi_12,UChi_01,Z2,UChi_11,Z3) \
VPF(7,%r9) \
ZMADDMEM2SP(8,%r8,Uri,Chi_02,Chi_12,UChi_02,Z4,UChi_12,Z5) \
VPF(8,%r9) \
\
ZEND1(UChi_00,Z0,Chi_01) \
ZEND1(UChi_10,Z1,Chi_11) \
ZEND1(UChi_01,Z2,Chi_00) \
ZEND1(UChi_11,Z3,Chi_10) \
VPF(9,%r9) \
ZEND1(UChi_02,Z4,Chi_02) \
ZEND1(UChi_12,Z5,Chi_12) \
ZEND2(UChi_00,Z0,Chi_01) \
ZEND2(UChi_10,Z1,Chi_11) \
VPF(10,%r9) \
ZEND2(UChi_01,Z2,Chi_00) \
ZEND2(UChi_11,Z3,Chi_10) \
ZEND2(UChi_02,Z4,Chi_02) \
VPF(11,%r9) \
ZEND2(UChi_12,Z5,Chi_12) );
#endif
#if 0
#define MULT_2SPIN_PFNONE(ptr,pf,VPF) \
LOAD64(%r8,ptr) \
LOAD64(%r9,pf) \
__asm__ ( \
VPF(0,%r9) \
VPF(1,%r9) \
VPF(2,%r9) \
\
VPF(3,%r9) \
VPF(4,%r9) \
VPF(5,%r9) \
\
VPF(6,%r9) \
VPF(7,%r9) \
VPF(8,%r9) \
\
VPF(9,%r9) \
VPF(10,%r9) \
VPF(11,%r9) );
#endif
// Pretty much Perfectly Pipelined
#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFZM(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFTM(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFTP(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFZP(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr,pf)
//////////////////////////////////////////////////////////////////
// Dirac algebra
@@ -490,7 +325,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
LOAD64(%r8,ptr) \
__asm__ ( \
LOAD_CHIMU01i \
VSUBMEM(6,%r8 ,Chimu_00,Chi_00) \
VSUBMEM(6,%r8,Chimu_00,Chi_00) \
VSUBMEM(7,%r8,Chimu_01,Chi_01) \
VSUBMEM(8,%r8,Chimu_02,Chi_02) \
VSUBMEM(9,%r8,Chimu_10,Chi_10) \
@@ -503,18 +338,18 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
// fspin(3)=timesMinusI(hspin(0))
#define XP_RECON __asm__ ( \
VZERO(TMP) \
VMOV(UChi_00,result_00) \
VMOV(UChi_01,result_01) \
VMOV(UChi_02,result_02) \
VMOV(UChi_10,result_10) \
VMOV(UChi_11,result_11) \
VMOV(UChi_12,result_12) \
VTIMESMINUSI0(UChi_10,result_20,TMP) \
VTIMESMINUSI0(UChi_11,result_21,TMP) \
VTIMESMINUSI0(UChi_12,result_22,TMP) \
VTIMESMINUSI0(UChi_00,result_30,TMP) \
VTIMESMINUSI0(UChi_10,result_20,TMP) \
VTIMESMINUSI0(UChi_01,result_31,TMP) \
VTIMESMINUSI0(UChi_11,result_21,TMP) \
VTIMESMINUSI0(UChi_02,result_32,TMP) \
VTIMESMINUSI0(UChi_12,result_22,TMP) \
VMOV(UChi_00,result_00) \
VMOV(UChi_10,result_10) \
VMOV(UChi_01,result_01) \
VMOV(UChi_11,result_11) \
VMOV(UChi_02,result_02) \
VMOV(UChi_12,result_12) \
VTIMESMINUSI1(UChi_10,result_20,TMP) \
VTIMESMINUSI1(UChi_11,result_21,TMP) \
VTIMESMINUSI1(UChi_12,result_22,TMP) \
@@ -531,24 +366,24 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
// NB could save 6 ops using addsub => 12 cycles
#define XP_RECON_ACCUM __asm__ ( \
VZERO(TMP)\
VADD(UChi_00,result_00,result_00)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_12,result_12,result_12)\
VACCTIMESMINUSI0(UChi_10,result_20,Z0)\
VACCTIMESMINUSI0(UChi_11,result_21,Z1)\
VACCTIMESMINUSI0(UChi_12,result_22,Z2)\
VACCTIMESMINUSI0(UChi_00,result_30,Z3)\
VACCTIMESMINUSI0(UChi_10,result_20,Z0)\
VACCTIMESMINUSI0(UChi_01,result_31,Z4)\
VACCTIMESMINUSI0(UChi_11,result_21,Z1)\
VACCTIMESMINUSI0(UChi_02,result_32,Z5)\
VACCTIMESMINUSI1(UChi_10,result_20,Z0)\
VACCTIMESMINUSI1(UChi_11,result_21,Z1)\
VACCTIMESMINUSI1(UChi_12,result_22,Z2)\
VACCTIMESMINUSI0(UChi_12,result_22,Z2)\
VADD(UChi_00,result_00,result_00)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_12,result_12,result_12)\
VACCTIMESMINUSI1(UChi_00,result_30,Z3)\
VACCTIMESMINUSI1(UChi_10,result_20,Z0)\
VACCTIMESMINUSI1(UChi_01,result_31,Z4)\
VACCTIMESMINUSI1(UChi_11,result_21,Z1)\
VACCTIMESMINUSI1(UChi_02,result_32,Z5)\
VACCTIMESMINUSI1(UChi_12,result_22,Z2)\
VACCTIMESMINUSI2(UChi_10,result_20,Z0)\
VACCTIMESMINUSI2(UChi_11,result_21,Z1)\
VACCTIMESMINUSI2(UChi_12,result_22,Z2)\
@@ -559,24 +394,24 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define XM_RECON __asm__ ( \
VZERO(TMP)\
VMOV(UChi_00,result_00)\
VMOV(UChi_01,result_01)\
VMOV(UChi_02,result_02)\
VMOV(UChi_10,result_10)\
VMOV(UChi_11,result_11)\
VMOV(UChi_12,result_12)\
VTIMESI0(UChi_10,result_20,TMP)\
VTIMESI0(UChi_11,result_21,TMP)\
VTIMESI0(UChi_12,result_22,TMP)\
VTIMESI0(UChi_00,result_30,TMP)\
VTIMESI0(UChi_10,result_20,TMP)\
VTIMESI0(UChi_01,result_31,TMP)\
VTIMESI0(UChi_11,result_21,TMP)\
VTIMESI0(UChi_02,result_32,TMP)\
VTIMESI1(UChi_10,result_20,TMP)\
VTIMESI1(UChi_11,result_21,TMP)\
VTIMESI1(UChi_12,result_22,TMP)\
VTIMESI0(UChi_12,result_22,TMP)\
VMOV(UChi_00,result_00)\
VMOV(UChi_10,result_10)\
VMOV(UChi_01,result_01)\
VMOV(UChi_11,result_11)\
VMOV(UChi_02,result_02)\
VMOV(UChi_12,result_12)\
VTIMESI1(UChi_00,result_30,TMP)\
VTIMESI1(UChi_10,result_20,TMP)\
VTIMESI1(UChi_01,result_31,TMP)\
VTIMESI1(UChi_11,result_21,TMP)\
VTIMESI1(UChi_02,result_32,TMP)\
VTIMESI1(UChi_12,result_22,TMP)\
VTIMESI2(UChi_10,result_20,TMP)\
VTIMESI2(UChi_11,result_21,TMP)\
VTIMESI2(UChi_12,result_22,TMP)\
@@ -586,23 +421,25 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
);
#define XM_RECON_ACCUM __asm__ ( \
VADD(UChi_00,result_00,result_00)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_12,result_12,result_12)\
VACCTIMESI0(UChi_10,result_20,Z0)\
VACCTIMESI0(UChi_11,result_21,Z1)\
VACCTIMESI0(UChi_12,result_22,Z2)\
VACCTIMESI0(UChi_00,result_30,Z3)\
VACCTIMESI0(UChi_11,result_21,Z1)\
VACCTIMESI0(UChi_01,result_31,Z4)\
VACCTIMESI0(UChi_12,result_22,Z2)\
VACCTIMESI0(UChi_02,result_32,Z5)\
\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_00,result_00,result_00)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_12,result_12,result_12)\
VADD(UChi_02,result_02,result_02)\
\
VACCTIMESI1(UChi_10,result_20,Z0)\
VACCTIMESI1(UChi_11,result_21,Z1)\
VACCTIMESI1(UChi_12,result_22,Z2)\
VACCTIMESI1(UChi_00,result_30,Z3)\
VACCTIMESI1(UChi_11,result_21,Z1)\
VACCTIMESI1(UChi_01,result_31,Z4)\
VACCTIMESI1(UChi_12,result_22,Z2)\
VACCTIMESI1(UChi_02,result_32,Z5)\
VACCTIMESI2(UChi_10,result_20,Z0)\
VACCTIMESI2(UChi_11,result_21,Z1)\
@@ -614,10 +451,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define YP_RECON_ACCUM __asm__ ( \
VADD(UChi_00,result_00,result_00)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_12,result_12,result_12)\
VADD(UChi_10,result_20,result_20)\
VADD(UChi_11,result_21,result_21)\
@@ -628,10 +465,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define YM_RECON_ACCUM __asm__ ( \
VADD(UChi_00,result_00,result_00)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_12,result_12,result_12)\
VSUB(UChi_10,result_20,result_20)\
VSUB(UChi_11,result_21,result_21)\
@@ -641,23 +478,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
VADD(UChi_02,result_32,result_32) );
#define ZP_RECON_ACCUM __asm__ ( \
VADD(UChi_00,result_00,result_00)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_12,result_12,result_12)\
VACCTIMESMINUSI0(UChi_00,result_20,Z0)\
VACCTIMESMINUSI0(UChi_01,result_21,Z1)\
VACCTIMESMINUSI0(UChi_02,result_22,Z2)\
VACCTIMESI0(UChi_10,result_30,Z3)\
VACCTIMESMINUSI0(UChi_01,result_21,Z1)\
VACCTIMESI0(UChi_11,result_31,Z4)\
VACCTIMESMINUSI0(UChi_02,result_22,Z2)\
VACCTIMESI0(UChi_12,result_32,Z5)\
VADD(UChi_00,result_00,result_00)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_12,result_12,result_12)\
VACCTIMESMINUSI1(UChi_00,result_20,Z0)\
VACCTIMESMINUSI1(UChi_01,result_21,Z1)\
VACCTIMESMINUSI1(UChi_02,result_22,Z2)\
VACCTIMESI1(UChi_10,result_30,Z3)\
VACCTIMESMINUSI1(UChi_01,result_21,Z1)\
VACCTIMESI1(UChi_11,result_31,Z4)\
VACCTIMESMINUSI1(UChi_02,result_22,Z2)\
VACCTIMESI1(UChi_12,result_32,Z5)\
VACCTIMESMINUSI2(UChi_00,result_20,Z0)\
VACCTIMESMINUSI2(UChi_01,result_21,Z1)\
@@ -668,23 +505,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
);
#define ZM_RECON_ACCUM __asm__ ( \
VADD(UChi_00,result_00,result_00)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_12,result_12,result_12)\
VACCTIMESI0(UChi_00,result_20,Z0)\
VACCTIMESI0(UChi_01,result_21,Z1)\
VACCTIMESI0(UChi_02,result_22,Z2)\
VACCTIMESMINUSI0(UChi_10,result_30,Z3)\
VACCTIMESI0(UChi_01,result_21,Z1)\
VACCTIMESMINUSI0(UChi_11,result_31,Z4)\
VACCTIMESI0(UChi_02,result_22,Z2)\
VACCTIMESMINUSI0(UChi_12,result_32,Z5)\
VADD(UChi_00,result_00,result_00)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_12,result_12,result_12)\
VACCTIMESI1(UChi_00,result_20,Z0)\
VACCTIMESI1(UChi_01,result_21,Z1)\
VACCTIMESI1(UChi_02,result_22,Z2)\
VACCTIMESMINUSI1(UChi_10,result_30,Z3)\
VACCTIMESI1(UChi_01,result_21,Z1)\
VACCTIMESMINUSI1(UChi_11,result_31,Z4)\
VACCTIMESI1(UChi_02,result_22,Z2)\
VACCTIMESMINUSI1(UChi_12,result_32,Z5)\
VACCTIMESI2(UChi_00,result_20,Z0)\
VACCTIMESI2(UChi_01,result_21,Z1)\
@@ -696,30 +533,30 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define TP_RECON_ACCUM __asm__ ( \
VADD(UChi_00,result_00,result_00)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_12,result_12,result_12)\
VADD(UChi_00,result_20,result_20)\
VADD(UChi_01,result_21,result_21)\
VADD(UChi_02,result_22,result_22)\
VADD(UChi_10,result_30,result_30)\
VADD(UChi_01,result_21,result_21)\
VADD(UChi_11,result_31,result_31)\
VADD(UChi_02,result_22,result_22)\
VADD(UChi_12,result_32,result_32) );
#define TM_RECON_ACCUM __asm__ ( \
VADD(UChi_00,result_00,result_00)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_12,result_12,result_12)\
VSUB(UChi_00,result_20,result_20)\
VSUB(UChi_01,result_21,result_21)\
VSUB(UChi_02,result_22,result_22)\
VSUB(UChi_10,result_30,result_30)\
VSUB(UChi_01,result_21,result_21)\
VSUB(UChi_11,result_31,result_31)\
VSUB(UChi_02,result_22,result_22)\
VSUB(UChi_12,result_32,result_32) );
//#define PREFETCH_CHIMU(A)
@@ -758,63 +595,200 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define MULT_ADDSUB_2SPIN1(ptr) \
LOAD64(%r8,ptr)
/*
 * __asm__ ( \
 *   VMUL(Z0,%zmm2,%zmm3) \
 * );
 */
#define MULT_ADDSUB_2SPIN(ptr) \
LOAD64(%r8,ptr) \
__asm__ ( \
VMOVIDUP(0,%r8,Z0 ) \
VMOVIDUP(3,%r8,Z1 )\
VMOVIDUP(6,%r8,Z2 )\
VSHUF(Chi_00,T1) \
VSHUF(Chi_10,T2) \
\
VMUL(Z0,T1,UChi_00) VMOVRDUP(0,%r8,Z3 ) \
VMUL(Z0,T2,UChi_10) VMOVRDUP(3,%r8,Z4 ) \
VMUL(Z1,T1,UChi_01) VMOVRDUP(6,%r8,Z5 ) \
VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0 ) \
VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1 ) \
VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2 ) \
\
VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \
VMADDSUB(Z3,Chi_10,UChi_10) VSHUF(Chi_11,T2) \
VMADDSUB(Z4,Chi_00,UChi_01) VMOVRDUP(1,%r8,Z3 ) \
VMADDSUB(Z4,Chi_10,UChi_11)\
VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 ) \
VMADDSUB(Z5,Chi_10,UChi_12)\
\
VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 ) \
VMADDSUB(Z0,T2,UChi_10)\
VMADDSUB(Z1,T1,UChi_01) VMOVIDUP(2,%r8,Z0 ) \
VMADDSUB(Z1,T2,UChi_11)\
VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1 ) \
VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2 ) \
\
VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1) \
VMADDSUB(Z3,Chi_11,UChi_10) VSHUF(Chi_12,T2) \
VMADDSUB(Z4,Chi_01,UChi_01) VMOVRDUP(2,%r8,Z3 ) \
VMADDSUB(Z4,Chi_11,UChi_11)\
VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \
VMADDSUB(Z5,Chi_11,UChi_12)\
\
VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \
VMADDSUB(Z0,T2,UChi_10)\
VMADDSUB(Z1,T1,UChi_01)\
VMADDSUB(Z1,T2,UChi_11)\
VMADDSUB(Z2,T1,UChi_02)\
VMADDSUB(Z2,T2,UChi_12)\
\
VMADDSUB(Z3,Chi_02,UChi_00)\
VMADDSUB(Z3,Chi_12,UChi_10)\
VMADDSUB(Z4,Chi_02,UChi_01)\
VMADDSUB(Z4,Chi_12,UChi_11)\
VMADDSUB(Z5,Chi_02,UChi_02)\
VMADDSUB(Z5,Chi_12,UChi_12)\
);
#define MULT_2SPIN(ptr) MULT_ADDSUB_2SPIN(ptr)
#define MULT_ADDSUB_2SPIN(ptr) \
LOAD64(%r8,ptr) \
__asm__ ( \
VSHUF(Chi_00,T1) \
VMOVIDUP(0,%r8,Z0 ) \
VMOVIDUP(3,%r8,Z1 ) \
VMOVIDUP(6,%r8,Z2 ) VSHUF(Chi_10,T2) \
/*6*/ \
VMUL(Z0,T1,UChi_00) VMOVRDUP(0,%r8,Z3 ) \
VMUL(Z0,T2,UChi_10) VMOVRDUP(3,%r8,Z4 ) \
VMUL(Z1,T1,UChi_01) VMOVRDUP(6,%r8,Z5 ) \
VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0 ) \
VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1 ) \
VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2 ) \
/*18*/ \
VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \
VMADDSUB(Z3,Chi_10,UChi_10) \
VMADDSUB(Z4,Chi_00,UChi_01) VMOVRDUP(1,%r8,Z3 ) \
VMADDSUB(Z4,Chi_10,UChi_11) VSHUF(Chi_11,T2) \
VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 ) \
VMADDSUB(Z5,Chi_10,UChi_12) \
/*28*/ \
VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 ) \
VMADDSUB(Z0,T2,UChi_10) \
VMADDSUB(Z1,T1,UChi_01) VMOVIDUP(2,%r8,Z0 ) \
VMADDSUB(Z1,T2,UChi_11) \
VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1 ) \
VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2 ) \
/*38*/ \
VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1) \
VMADDSUB(Z3,Chi_11,UChi_10) \
VMADDSUB(Z4,Chi_01,UChi_01) VMOVRDUP(2,%r8,Z3 ) \
VMADDSUB(Z4,Chi_11,UChi_11) VSHUF(Chi_12,T2) \
VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \
VMADDSUB(Z5,Chi_11,UChi_12) \
/*48*/ \
VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \
VMADDSUB(Z0,T2,UChi_10) \
VMADDSUB(Z1,T1,UChi_01) \
VMADDSUB(Z1,T2,UChi_11) \
VMADDSUB(Z2,T1,UChi_02) \
VMADDSUB(Z2,T2,UChi_12) \
/*55*/ \
VMADDSUB(Z3,Chi_02,UChi_00) \
VMADDSUB(Z3,Chi_12,UChi_10) \
VMADDSUB(Z4,Chi_02,UChi_01) \
VMADDSUB(Z4,Chi_12,UChi_11) \
VMADDSUB(Z5,Chi_02,UChi_02) \
VMADDSUB(Z5,Chi_12,UChi_12) \
/*61 insns*/ );
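
// Note on the running /*N*/ tallies above: read as issued-instruction
// counts, the 61-instruction variant on KNL's dual-issue FP pipe has an
// idealised lower bound of ceil(61/2) = 31 cycles per 2-spin multiply,
// assuming perfect pairing and no memory stalls -- which is why the
// VMOV*DUP loads are interleaved with the VMADDSUBs instead of batched.
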
#define MULT_ADDSUB_2SPIN_LS(ptr,pf) \
LOAD64(%r8,ptr) \
LOAD64(%r9,pf) \
__asm__ ( \
VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \
VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \
VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \
VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \
VPREFETCHG(0,%r9) \
VPREFETCHG(1,%r9) \
VPREFETCHG(2,%r9) \
VPREFETCHG(3,%r9) \
/*8*/ \
VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \
VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
VPREFETCHG(4,%r9) \
VPREFETCHG(5,%r9) \
VPREFETCHG(6,%r9) \
VPREFETCHG(7,%r9) \
/*16*/ \
VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \
VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \
VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \
VPREFETCHG(8,%r9) \
VPREFETCHG(9,%r9) \
VPREFETCHG(10,%r9) \
VPREFETCHG(11,%r9) \
/*22*/ \
VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \
VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
VPREFETCH2(12,%r9) \
VPREFETCH2(13,%r9) \
VPREFETCH2(14,%r9) \
VPREFETCH2(15,%r9) \
/*30*/ \
VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \
VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \
VPREFETCH2(16,%r9) \
VPREFETCH2(17,%r9) \
VPREFETCH2(18,%r9) \
VPREFETCH2(19,%r9) \
VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \
/*36*/ \
VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
VPREFETCH2(20,%r9) \
VPREFETCH2(21,%r9) \
VPREFETCH2(22,%r9) \
VPREFETCH2(23,%r9) \
VPREFETCHG(2,%r8) \
VPREFETCHG(3,%r8) \
VPREFETCH2(4,%r8) \
VPREFETCH2(5,%r8) \
/*42 insns*/ );
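
// Sketch (illustrative; names hypothetical, mult_su3_hoisted from the
// earlier sketch) of the _LS variant's idea in C++: touch the next
// site's spinor (12 cache lines) while this site's arithmetic runs.
// The asm above goes further, interleaving VPREFETCHG/VPREFETCH2
// between the FMA groups instead of batching them.
#include <immintrin.h>
#include <xmmintrin.h>
void mult_su3_hoisted(const float *Uptr, const __m512 chi[3], __m512 uchi[3]);

inline void mult_with_prefetch(const float *Uptr, const float *next_chi,
                               const __m512 chi[3], __m512 uchi[3])
{
  for (int line = 0; line < 12; ++line)  // full spinor: 12 lines per block
    _mm_prefetch((const char *)next_chi + 64 * line, _MM_HINT_T0);
  mult_su3_hoisted(Uptr, chi, uchi);     // compute overlaps the prefetches
}
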
#define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) \
LOAD64(%r8,ptr) \
LOAD64(%r9,pf) \
__asm__ ( \
VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \
VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \
VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \
VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \
/*8*/ \
VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \
VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
/*16*/ \
VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \
VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \
VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \
/*22*/ \
VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \
VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
/*30*/ \
VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \
VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \
VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \
/*36*/ \
VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
/* VPREFETCHG(2,%r8)*/ \
/* VPREFETCHG(3,%r8)*/ \
/*42 insns*/ );
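
// Sketch (illustrative, not Grid's API): A/B the _LS and _LSNOPF
// variants by timing many calls of each, packaged as callables.
#include <chrono>

template <class Kernel>
double time_usec(Kernel k, int ncall)
{
  auto t0 = std::chrono::high_resolution_clock::now();
  for (int i = 0; i < ncall; ++i) k();   // repeat to amortise timer noise
  auto t1 = std::chrono::high_resolution_clock::now();
  return std::chrono::duration<double, std::micro>(t1 - t0).count();
}
// usage: double us = time_usec([]{ /* invoke one variant */ }, 100);
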
#define Z6 Chi_00
#define MULT_ADDSUB_2SPIN_NEW(ptr) \
LOAD64(%r8,ptr) \
__asm__ ( \
VSHUFMEM(0,%r8,Z0) \
VRDUP(Chi_00,T1) VIDUP(Chi_00,Chi_00) \
VRDUP(Chi_10,T2) VIDUP(Chi_10,Chi_10) \
VMUL(Z0,Chi_00,Z1) VMUL(Z0,Chi_10,Z2) \
VSHUFMEM(3,%r8,Z0) \
VMUL(Z0,Chi_00,Z3) VMUL(Z0,Chi_10,Z4) \
VSHUFMEM(6,%r8,Z0) \
VMUL(Z0,Chi_00,Z5) VMUL(Z0,Chi_10,Z6) \
VMULMEM(0,%r8,T1,UChi_00) VMULMEM(0,%r8,T2,UChi_10) \
VMULMEM(3,%r8,T1,UChi_01) VMULMEM(3,%r8,T2,UChi_11) \
VMULMEM(6,%r8,T1,UChi_02) VMULMEM(6,%r8,T2,UChi_12) \
/*11 cycles*/ \
VSHUFMEM(1,%r8,Z0) \
VRDUP(Chi_01,T1) VIDUP(Chi_01,Chi_01) \
VRDUP(Chi_11,T2) VIDUP(Chi_11,Chi_11) \
VMADD(Z0,Chi_01,Z1) VMADD(Z0,Chi_11,Z2) \
VSHUFMEM(4,%r8,Z0) \
VMADD(Z0,Chi_01,Z3) VMADD(Z0,Chi_11,Z4) \
VSHUFMEM(7,%r8,Z0) \
VMADD(Z0,Chi_01,Z5) VMADD(Z0,Chi_11,Z6) \
VMADDMEM(1,%r8,T1,UChi_00) VMADDMEM(1,%r8,T2,UChi_10) \
VMADDMEM(4,%r8,T1,UChi_01) VMADDMEM(4,%r8,T2,UChi_11) \
VMADDMEM(7,%r8,T1,UChi_02) VMADDMEM(7,%r8,T2,UChi_12) \
/*22 cycles*/ \
VSHUFMEM(2,%r8,Z0) \
VRDUP(Chi_02,T1) VIDUP(Chi_02,Chi_02) \
VRDUP(Chi_12,T2) VIDUP(Chi_12,Chi_12) \
VMADD(Z0,Chi_02,Z1) VMADD(Z0,Chi_12,Z2) \
VSHUFMEM(5,%r8,Z0) \
VMADD(Z0,Chi_02,Z3) VMADD(Z0,Chi_12,Z4) \
VSHUFMEM(8,%r8,Z0) \
VMADD(Z0,Chi_02,Z5) VMADD(Z0,Chi_12,Z6) \
/*33 cycles*/ \
VMADDSUBMEM(2,%r8,T1,Z1) VMADDSUBMEM(2,%r8,T2,Z2) \
VMADDSUBMEM(5,%r8,T1,Z3) VMADDSUBMEM(5,%r8,T2,Z4) \
VMADDSUBMEM(8,%r8,T1,Z5) VMADDSUBMEM(8,%r8,T2,Z6) \
/*stall*/ \
/*stall*/ \
/*stall*/ \
VADD(Z1,UChi_00,UChi_00) VADD(Z2,UChi_10,UChi_10) \
VADD(Z3,UChi_01,UChi_01) VADD(Z4,UChi_11,UChi_11) \
VADD(Z5,UChi_02,UChi_02) VADD(Z6,UChi_12,UChi_12) )
#endif
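
// Scalar model (illustrative, not original source) of the split
// accumulation in MULT_ADDSUB_2SPIN_NEW above: direct products
// (u * re(c)) and cross products (swapped-u * im(c)) accumulate in
// separate sums -- the UChi and Z1..Z6 registers -- and one
// addsub-style combine fuses them at the end (the VADD tail).
#include <complex>
#include <cstdio>

int main()
{
  std::complex<float> u[3] = {{1, 2}, {3, 4}, {5, 6}};
  std::complex<float> c[3] = {{1, -1}, {2, 0}, {0, 3}};
  float dir_re = 0, dir_im = 0, cro_re = 0, cro_im = 0;
  for (int k = 0; k < 3; ++k) {
    dir_re += u[k].real() * c[k].real(); // even lanes, VMULMEM/VMADDMEM
    dir_im += u[k].imag() * c[k].real(); // odd  lanes, VMULMEM/VMADDMEM
    cro_re += u[k].imag() * c[k].imag(); // even lanes, VMUL on VSHUFMEM
    cro_im += u[k].real() * c[k].imag(); // odd  lanes, VMUL on VSHUFMEM
  }
  std::complex<float> r(dir_re - cro_re, dir_im + cro_im); // addsub combine
  std::complex<float> chk = u[0] * c[0] + u[1] * c[1] + u[2] * c[2];
  std::printf("%g+%gi vs %g+%gi\n", r.real(), r.imag(), chk.real(), chk.imag());
  return 0;
}
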

View File

@@ -103,7 +103,9 @@ void LebesgueOrder::IterateI(int ND,
} else {
for(int d=0;d<ND;d++){
x[d]=xi[d]+xo[d];
std::cout << x[d]<<" ";
}
std::cout << "\n";
IndexInteger index;
Lexicographic::IndexFromCoor(x,index,grid->_rdimensions);
_LebesgueReorder.push_back(index);
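
// Sketch of what Lexicographic::IndexFromCoor computes (an assumption
// from context: Grid's usual dimension-0-fastest convention):
#include <vector>
inline void index_from_coor(const std::vector<int> &coor, int &index,
                            const std::vector<int> &dims)
{
  index = 0;
  for (int d = (int)dims.size() - 1; d >= 0; --d)
    index = index * dims[d] + coor[d];   // coor[0] varies fastest
}
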
@@ -188,6 +190,7 @@ void LebesgueOrder::ZGraph(void)
}
assert( _LebesgueReorder.size() == vol );
/*
std::vector<int> coor(4);
for(IndexInteger asite=0;asite<vol;asite++){
grid->oCoorFromOindex (coor,_LebesgueReorder[asite]);
@@ -198,5 +201,6 @@ void LebesgueOrder::ZGraph(void)
<< coor[3]<<"]"
<<std::endl;
}
*/
}
}
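
// Sketch: the core 2D Morton (Z-curve) bit interleave underlying the
// Lebesgue/Z-graph ordering idea; Grid's ZGraph handles 4D and
// non-power-of-two local volumes on top of this.
#include <cstdint>

inline uint32_t part1by1(uint32_t x)  // spread bits: b2 b1 b0 -> b2 0 b1 0 b0
{
  x &= 0x0000ffff;
  x = (x | (x << 8)) & 0x00ff00ff;
  x = (x | (x << 4)) & 0x0f0f0f0f;
  x = (x | (x << 2)) & 0x33333333;
  x = (x | (x << 1)) & 0x55555555;
  return x;
}
inline uint32_t morton2d(uint32_t x, uint32_t y)
{ return part1by1(x) | (part1by1(y) << 1); }
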