Merge branch 'develop' into feature/qed-fvol

2025-07-02 06:27:05 +01:00 · 2016-12-20 12:33:02 +01:00
parent 2e3c5890b6 f8d11ff673
commit 6f1ea96293
26 changed files with 1067 additions and 284 deletions
--- a/benchmarks/Benchmark_mooee.cc
+++ b/benchmarks/Benchmark_mooee.cc
@ -113,6 +113,36 @@ int main (int argc, char ** argv)
    std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
    std::cout<<GridLogMessage << "******************"<<std::endl;
 #define BENCH_ZDW(A,in,out)			\
    zDw.CayleyZeroCounters();			\
    zDw. A (in,out);				\
    FGrid->Barrier();				\
    t0=usecond();				\
    for(int i=0;i<ncall;i++){			\
      zDw. A (in,out);				\
    }						\
    t1=usecond();				\
    FGrid->Barrier();				\
    zDw.CayleyReport();							\
    std::cout<<GridLogMessage << "Called ZDw " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
    std::cout<<GridLogMessage << "******************"<<std::endl;
 #define BENCH_DW_SSC(A,in,out)			\
    Dw.CayleyZeroCounters();			\
    Dw. A (in,out);				\
    FGrid->Barrier();				\
    t0=usecond();				\
    for(int i=0;i<ncall;i++){			\
      __SSC_START ;				\
      Dw. A (in,out);				\
      __SSC_STOP ;				\
    }						\
    t1=usecond();				\
    FGrid->Barrier();				\
    Dw.CayleyReport();					\
    std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
    std::cout<<GridLogMessage << "******************"<<std::endl;
 #define BENCH_DW_MEO(A,in,out)			\
    Dw.CayleyZeroCounters();			\
    Dw. A (in,out,0);				\
@ -148,9 +178,15 @@ int main (int argc, char ** argv)
    LatticeFermion sref(sFGrid);
    LatticeFermion result(sFGrid);
    std::cout<<GridLogMessage << "Constructing Vec5D Dw "<<std::endl;
    DomainWallFermionVec5dR Dw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,M5);
    RealD b=1.5;// Scale factor b+c=2, b-c=1
    RealD c=0.5;
    std::vector<ComplexD> gamma(Ls,std::complex<double>(1.0,0.0));
    ZMobiusFermionVec5dR zDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,M5,gamma,b,c);
    std::cout<<GridLogMessage << "Calling Dhop "<<std::endl;
    FGrid->Barrier();
@ -173,10 +209,13 @@ int main (int argc, char ** argv)
    BENCH_DW_MEO(Dhop    ,src,result);
    BENCH_DW_MEO(DhopEO  ,src_o,r_e);
-    BENCH_DW(Meooe   ,src_o,r_e);
+    BENCH_DW_SSC(Meooe   ,src_o,r_e);
    BENCH_DW(Mooee   ,src_o,r_o);
    BENCH_DW(MooeeInv,src_o,r_o);
    BENCH_ZDW(Mooee   ,src_o,r_o);
    BENCH_ZDW(MooeeInv,src_o,r_o);
  }
  Grid_finalize();
--- a/lib/AlignedAllocator.cc
+++ b/lib/AlignedAllocator.cc
@ -0,0 +1,65 @@
 #include <Grid/Grid.h>
 namespace Grid {
 int PointerCache::victim;
  PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
 void *PointerCache::Insert(void *ptr,size_t bytes) {
  if (bytes < 4096 ) return NULL;
 #ifdef _OPENMP
  assert(omp_in_parallel()==0);
 #endif 
  void * ret = NULL;
  int v = -1;
  for(int e=0;e<Ncache;e++) {
    if ( Entries[e].valid==0 ) {
      v=e; 
      break;
    }
  }
  if ( v==-1 ) {
    v=victim;
    victim = (victim+1)%Ncache;
  }
  if ( Entries[v].valid ) {
    ret = Entries[v].address;
    Entries[v].valid = 0;
    Entries[v].address = NULL;
    Entries[v].bytes = 0;
  }
  Entries[v].address=ptr;
  Entries[v].bytes  =bytes;
  Entries[v].valid  =1;
  return ret;
 }
 void *PointerCache::Lookup(size_t bytes) {
 if (bytes < 4096 ) return NULL;
 #ifdef _OPENMP
  assert(omp_in_parallel()==0);
 #endif 
  for(int e=0;e<Ncache;e++){
    if ( Entries[e].valid && ( Entries[e].bytes == bytes ) ) {
      Entries[e].valid = 0;
      return Entries[e].address;
    }
  }
  return NULL;
 }
 }
--- a/lib/AlignedAllocator.h
+++ b/lib/AlignedAllocator.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
@ -42,9 +42,32 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 namespace Grid {
  class PointerCache {
  private:
    static const int Ncache=8;
    static int victim;
    typedef struct { 
      void *address;
      size_t bytes;
      int valid;
    } PointerCacheEntry;
    static PointerCacheEntry Entries[Ncache];
  public:
    static void *Insert(void *ptr,size_t bytes) ;
    static void *Lookup(size_t bytes) ;
  };
 ////////////////////////////////////////////////////////////////////
 // A lattice of something, but assume the something is SIMDized.
 ////////////////////////////////////////////////////////////////////
 template<typename _Tp>
 class alignedAllocator {
 public: 
@ -66,27 +89,27 @@ public:
  pointer allocate(size_type __n, const void* _p= 0)
  { 
    size_type bytes = __n*sizeof(_Tp);
    _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
 #ifdef HAVE_MM_MALLOC_H
-    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
+    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,128);
 #else
-    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
+    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes);
 #endif
    _Tp tmp;
 #ifdef GRID_NUMA
 #pragma omp parallel for schedule(static)
  for(int i=0;i<__n;i++){
    ptr[i]=tmp;
  }
 #endif 
    return ptr;
  }
-  void deallocate(pointer __p, size_type) { 
+  void deallocate(pointer __p, size_type __n) { 
    size_type bytes = __n * sizeof(_Tp);
    pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
 #ifdef HAVE_MM_MALLOC_H
-    _mm_free((void *)__p); 
+    if ( __freeme ) _mm_free((void *)__freeme); 
 #else
-    free((void *)__p);
+    if ( __freeme ) free((void *)__freeme);
 #endif
  }
  void construct(pointer __p, const _Tp& __val) { };
--- a/lib/PerfCount.h
+++ b/lib/PerfCount.h
@ -205,12 +205,13 @@ public:
  void Stop(void) {
    count=0;
    cycles=0;
    size_t ign;
 #ifdef __linux__
    if ( fd!= -1) {
      ::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
      ::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0);
-      ::read(fd, &count, sizeof(long long));
+      ign=::read(fd, &count, sizeof(long long));
-      ::read(cyclefd, &cycles, sizeof(long long));
+      ign=::read(cyclefd, &cycles, sizeof(long long));
    }
    elapsed = cyclecount() - begin;
 #else
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
@ -113,7 +113,7 @@ Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice
 {
 PARALLEL_FOR_LOOP     
     for(int i=0;i<table.size();i++){
-       buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
+       vstream(buffer[off+table[i].first],compress(rhs._odata[so+table[i].second]));
     }
 }
--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@ -29,6 +29,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Eigen/Dense>
 #include <Grid.h>
@ -48,18 +49,18 @@ namespace QCD {
 		   FourDimGrid,
 	 	   FourDimRedBlackGrid,_M5,p),
   mass(_mass)
- { }
+ { 
 }
 template<class Impl>  
 void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
  FermionField tmp(psi._grid);
-  this->DW(psi,tmp,DaggerNo);
+  this->DW(psi,this->tmp(),DaggerNo);
  for(int s=0;s<Ls;s++){
-    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s);// chi = (1-c[s] D_W) psi
+    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],this->tmp(),s,s);// chi = (1-c[s] D_W) psi
  }
 }
@ -87,8 +88,8 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
    std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls     : " << MooeeInvCalls   << std::endl;
    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls            : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
-    // Flops = 9*12*Ls*vol/2
+    // Flops = MADD * Ls *Ls *4dvol * spin/colour/complex
-    RealD mflops = 9.0*12*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
+    RealD mflops = 2.0*24*this->Ls*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
  }
@ -110,12 +111,11 @@ template<class Impl>
 void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
  FermionField tmp(psi._grid);
-  this->DW(psi,tmp,DaggerYes);
+  this->DW(psi,this->tmp(),DaggerYes);
  for(int s=0;s<Ls;s++){
-    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s);// chi = (1-c[s] D_W) psi
+    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],this->tmp(),s,s);// chi = (1-c[s] D_W) psi
  }
 }
 template<class Impl>  
@ -138,6 +138,7 @@ void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &D
  lower[0]   =-mass*lower[0];
  M5D(psi,psi,Din,lower,diag,upper);
 }
 // FIXME Redunant with the above routine; check this and eliminate
 template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
@ -259,36 +260,33 @@ template<class Impl>
 void CayleyFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
  FermionField tmp(psi._grid);
-  Meooe5D(psi,tmp); 
+  Meooe5D(psi,this->tmp()); 
  if ( psi.checkerboard == Odd ) {
-    this->DhopEO(tmp,chi,DaggerNo);
+    this->DhopEO(this->tmp(),chi,DaggerNo);
  } else {
-    this->DhopOE(tmp,chi,DaggerNo);
+    this->DhopOE(this->tmp(),chi,DaggerNo);
  }
 }
 template<class Impl>
 void CayleyFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
 {
  FermionField tmp(psi._grid);
  // Apply 4d dslash
  if ( psi.checkerboard == Odd ) {
-    this->DhopEO(psi,tmp,DaggerYes);
+    this->DhopEO(psi,this->tmp(),DaggerYes);
  } else {
-    this->DhopOE(psi,tmp,DaggerYes);
+    this->DhopOE(psi,this->tmp(),DaggerYes);
  }
-  MeooeDag5D(tmp,chi); 
+  MeooeDag5D(this->tmp(),chi); 
 }
 template<class Impl>
 void  CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
-  FermionField tmp(psi._grid);
+  Meo5D(psi,this->tmp());
  Meo5D(psi,tmp);
  // Apply 4d dslash fragment
-  this->DhopDir(tmp,chi,dir,disp);
+  this->DhopDir(this->tmp(),chi,dir,disp);
 }
 // force terms; five routines; default to Dhop on diagonal
 template<class Impl>
@ -459,9 +457,91 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
    for(int j=0;j<Ls-1;j++) delta_d *= cee[j]/bee[j];
    dee[Ls-1] += delta_d;
  }  
  int inv=1;
  this->MooeeInternalCompute(0,inv,MatpInv,MatmInv);
  this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag);
 }
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInternalCompute(int dag, int inv,
 						 Vector<iSinglet<Simd> > & Matp,
 						 Vector<iSinglet<Simd> > & Matm)
 {
  int Ls=this->Ls;
  GridBase *grid = this->FermionRedBlackGrid();
  int LLs = grid->_rdimensions[0];
  if ( LLs == Ls ) return; // Not vectorised in 5th direction
  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
  for(int s=0;s<Ls;s++){
    Pplus(s,s) = bee[s];
    Pminus(s,s)= bee[s];
  }
  for(int s=0;s<Ls-1;s++){
    Pminus(s,s+1) = -cee[s];
  }
  for(int s=0;s<Ls-1;s++){
    Pplus(s+1,s) = -cee[s+1];
  }
  Pplus (0,Ls-1) = mass*cee[0];
  Pminus(Ls-1,0) = mass*cee[Ls-1];
  Eigen::MatrixXcd PplusMat ;
  Eigen::MatrixXcd PminusMat;
  if ( inv ) {
    PplusMat =Pplus.inverse();
    PminusMat=Pminus.inverse();
  } else { 
    PplusMat =Pplus;
    PminusMat=Pminus;
  }
  if(dag){
    PplusMat.adjointInPlace();
    PminusMat.adjointInPlace();
  }
  typedef typename SiteHalfSpinor::scalar_type scalar_type;
  const int Nsimd=Simd::Nsimd();
  Matp.resize(Ls*LLs);
  Matm.resize(Ls*LLs);
  for(int s2=0;s2<Ls;s2++){
  for(int s1=0;s1<LLs;s1++){
    int istride = LLs;
    int ostride = 1;
    Simd Vp;
    Simd Vm;
    scalar_type *sp = (scalar_type *)&Vp;
    scalar_type *sm = (scalar_type *)&Vm;
    for(int l=0;l<Nsimd;l++){
      if ( switcheroo<Coeff_t>::iscomplex() ) {
 	sp[l] = PplusMat (l*istride+s1*ostride,s2);
 	sm[l] = PminusMat(l*istride+s1*ostride,s2);
      } else { 
      // if real
 	scalar_type tmp;
 	tmp = PplusMat (l*istride+s1*ostride,s2);
 	sp[l] = scalar_type(tmp.real(),tmp.real());
 	tmp = PminusMat(l*istride+s1*ostride,s2);
 	sm[l] = scalar_type(tmp.real(),tmp.real());
      }
    }
    Matp[LLs*s2+s1] = Vp;
    Matm[LLs*s2+s1] = Vm;
  }}
 }
  FermOpTemplateInstantiate(CayleyFermion5D);
  GparityFermOpTemplateInstantiate(CayleyFermion5D);
--- a/lib/qcd/action/fermion/CayleyFermion5D.h
+++ b/lib/qcd/action/fermion/CayleyFermion5D.h
@ -33,6 +33,31 @@ namespace Grid {
  namespace QCD {
     template<typename T> struct switcheroo   {  
       static inline int iscomplex()  { return 0; } 
       template<class vec>
       static inline vec mult(vec a, vec b) {
 	 return real_mult(a,b);
       }
     };
     template<> struct switcheroo<ComplexD> {  
       static inline int iscomplex()  { return 1; } 
       template<class vec>
       static inline vec mult(vec a, vec b) {
 	 return a*b;
       }
     };
     template<> struct switcheroo<ComplexF> {  
       static inline int iscomplex()  { return 1; } 
       template<class vec>
       static inline vec mult(vec a, vec b) {
 	 return a*b;
       }
     };
    template<class Impl>
    class CayleyFermion5D : public WilsonFermion5D<Impl>
    {
@ -75,7 +100,19 @@ namespace Grid {
 		  std::vector<Coeff_t> &lower,
 		  std::vector<Coeff_t> &diag,
 		  std::vector<Coeff_t> &upper);
      void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv);
      void MooeeInternalCompute(int dag, int inv, Vector<iSinglet<Simd> > & Matp, Vector<iSinglet<Simd> > & Matm);
      void MooeeInternalAsm(const FermionField &in, FermionField &out,
 			    int LLs, int site,
 			    Vector<iSinglet<Simd> > &Matp,
 			    Vector<iSinglet<Simd> > &Matm);
      void MooeeInternalZAsm(const FermionField &in, FermionField &out,
 			    int LLs, int site,
 			    Vector<iSinglet<Simd> > &Matp,
 			    Vector<iSinglet<Simd> > &Matm);
      virtual void   Instantiatable(void)=0;
@ -112,6 +149,12 @@ namespace Grid {
      std::vector<Coeff_t> ueem;    
      std::vector<Coeff_t> dee;    
      // Matrices of 5d ee inverse params
      Vector<iSinglet<Simd> >  MatpInv;
      Vector<iSinglet<Simd> >  MatmInv;
      Vector<iSinglet<Simd> >  MatpInvDag;
      Vector<iSinglet<Simd> >  MatmInvDag;
      // Constructors
      CayleyFermion5D(GaugeField &_Umu,
 		      GridCartesian         &FiveDimGrid,
--- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
@ -29,13 +29,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid/Eigen/Dense>
+
 #include <Grid.h>
 namespace Grid {
-namespace QCD {
+namespace QCD {  /*
  /*
   * Dense matrix versions of routines
   */
 template<class Impl>
@ -126,7 +125,6 @@ PARALLEL_FOR_LOOP
      for(int v=0;v<LLs;v++){
 	vprefetch(psi[ss+v+LLs]);
 	//	vprefetch(phi[ss+v+LLs]);
 	int vp= (v==LLs-1) ? 0     : v+1;
 	int vm= (v==0    ) ? LLs-1 : v-1;
@ -145,9 +143,6 @@ PARALLEL_FOR_LOOP
 	Simd hm_11 = psi[ss+vm]()(1)(1); 
 	Simd hm_12 = psi[ss+vm]()(1)(2); 
 	//	if ( ss==0) std::cout << " hp_00 " <<hp_00<<std::endl;
 	//	if ( ss==0) std::cout << " hm_00 " <<hm_00<<std::endl;
 	if ( vp<=v ) {
 	  hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
 	  hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
@ -165,42 +160,20 @@ PARALLEL_FOR_LOOP
 	  hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
 	}
-	/*
+	// Can force these to real arithmetic and save 2x.
-	if ( ss==0) std::cout << " dphi_00 " <<d[v]()()() * phi[ss+v]()(0)(0) <<std::endl;
+	Simd p_00  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00); 
-	if ( ss==0) std::cout << " dphi_10 " <<d[v]()()() * phi[ss+v]()(1)(0) <<std::endl;
+	Simd p_01  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01); 
-	if ( ss==0) std::cout << " dphi_20 " <<d[v]()()() * phi[ss+v]()(2)(0) <<std::endl;
+	Simd p_02  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02); 
-	if ( ss==0) std::cout << " dphi_30 " <<d[v]()()() * phi[ss+v]()(3)(0) <<std::endl;
+	Simd p_10  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10); 
-	*/	
+	Simd p_11  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11); 
-	Simd p_00  = d[v]()()() * phi[ss+v]()(0)(0)  + l[v]()()()*hm_00; 
+	Simd p_12  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12); 
-	Simd p_01  = d[v]()()() * phi[ss+v]()(0)(1)  + l[v]()()()*hm_01; 
+	Simd p_20  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00); 
-	Simd p_02  = d[v]()()() * phi[ss+v]()(0)(2)  + l[v]()()()*hm_02; 
+	Simd p_21  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01); 
-	Simd p_10  = d[v]()()() * phi[ss+v]()(1)(0)  + l[v]()()()*hm_10; 
+	Simd p_22  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02);  
-	Simd p_11  = d[v]()()() * phi[ss+v]()(1)(1)  + l[v]()()()*hm_11; 
+	Simd p_30  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10); 
-	Simd p_12  = d[v]()()() * phi[ss+v]()(1)(2)  + l[v]()()()*hm_12; 
+	Simd p_31  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11); 
-	Simd p_20  = d[v]()()() * phi[ss+v]()(2)(0)  + u[v]()()()*hp_00; 
+	Simd p_32  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12); 
 	Simd p_21  = d[v]()()() * phi[ss+v]()(2)(1)  + u[v]()()()*hp_01; 
 	Simd p_22  = d[v]()()() * phi[ss+v]()(2)(2)  + u[v]()()()*hp_02;  
 	Simd p_30  = d[v]()()() * phi[ss+v]()(3)(0)  + u[v]()()()*hp_10; 
 	Simd p_31  = d[v]()()() * phi[ss+v]()(3)(1)  + u[v]()()()*hp_11; 
 	Simd p_32  = d[v]()()() * phi[ss+v]()(3)(2)  + u[v]()()()*hp_12; 
 	//	if ( ss==0){
 	/*
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(0) << " bad "<<p_00<<" diff "<<chi[ss+v]()(0)(0)-p_00<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(1) << " bad "<<p_01<<" diff "<<chi[ss+v]()(0)(1)-p_01<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(2) << " bad "<<p_02<<" diff "<<chi[ss+v]()(0)(2)-p_02<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(0) << " bad "<<p_10<<" diff "<<chi[ss+v]()(1)(0)-p_10<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(1) << " bad "<<p_11<<" diff "<<chi[ss+v]()(1)(1)-p_11<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(2) << " bad "<<p_12<<" diff "<<chi[ss+v]()(1)(2)-p_12<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(0) << " bad "<<p_20<<" diff "<<chi[ss+v]()(2)(0)-p_20<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(1) << " bad "<<p_21<<" diff "<<chi[ss+v]()(2)(1)-p_21<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(2) << " bad "<<p_22<<" diff "<<chi[ss+v]()(2)(2)-p_22<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(0) << " bad "<<p_30<<" diff "<<chi[ss+v]()(3)(0)-p_30<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(1) << " bad "<<p_31<<" diff "<<chi[ss+v]()(3)(1)-p_31<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(2) << " bad "<<p_32<<" diff "<<chi[ss+v]()(3)(2)-p_32<<std::endl;
 	}
 	*/
 	vstream(chi[ss+v]()(0)(0),p_00);
 	vstream(chi[ss+v]()(0)(1),p_01);
 	vstream(chi[ss+v]()(0)(2),p_02);
@ -261,7 +234,7 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
  M5Dtime-=usecond();
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
-
+#if 0
    alignas(64) SiteHalfSpinor hp;
    alignas(64) SiteHalfSpinor hm;
    alignas(64) SiteSpinor fp;
@ -287,9 +260,504 @@ PARALLEL_FOR_LOOP
      chi[ss+v] = chi[ss+v]     +l[v]*fm;
    }
 #else
      for(int v=0;v<LLs;v++){
 	vprefetch(psi[ss+v+LLs]);
 	int vp= (v==LLs-1) ? 0     : v+1;
 	int vm= (v==0    ) ? LLs-1 : v-1;
 	Simd hp_00 = psi[ss+vp]()(0)(0); 
 	Simd hp_01 = psi[ss+vp]()(0)(1); 
 	Simd hp_02 = psi[ss+vp]()(0)(2); 
 	Simd hp_10 = psi[ss+vp]()(1)(0); 
 	Simd hp_11 = psi[ss+vp]()(1)(1); 
 	Simd hp_12 = psi[ss+vp]()(1)(2); 
 	Simd hm_00 = psi[ss+vm]()(2)(0); 
 	Simd hm_01 = psi[ss+vm]()(2)(1); 
 	Simd hm_02 = psi[ss+vm]()(2)(2); 
 	Simd hm_10 = psi[ss+vm]()(3)(0); 
 	Simd hm_11 = psi[ss+vm]()(3)(1); 
 	Simd hm_12 = psi[ss+vm]()(3)(2); 
 	if ( vp<=v ) {
 	  hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
 	  hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
 	  hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
 	  hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
 	  hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
 	  hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
 	}
 	if ( vm>=v ) {
 	  hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
 	  hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
 	  hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
 	  hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
 	  hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
 	  hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
 	}
 	Simd p_00  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00); 
 	Simd p_01  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01); 
 	Simd p_02  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02); 
 	Simd p_10  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10); 
 	Simd p_11  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11); 
 	Simd p_12  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12); 
 	Simd p_20  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00); 
 	Simd p_21  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01); 
 	Simd p_22  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02);  
 	Simd p_30  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10); 
 	Simd p_31  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11); 
 	Simd p_32  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12); 
 	vstream(chi[ss+v]()(0)(0),p_00);
 	vstream(chi[ss+v]()(0)(1),p_01);
 	vstream(chi[ss+v]()(0)(2),p_02);
 	vstream(chi[ss+v]()(1)(0),p_10);
 	vstream(chi[ss+v]()(1)(1),p_11);
 	vstream(chi[ss+v]()(1)(2),p_12);
 	vstream(chi[ss+v]()(2)(0),p_20);
 	vstream(chi[ss+v]()(2)(1),p_21);
 	vstream(chi[ss+v]()(2)(2),p_22);
 	vstream(chi[ss+v]()(3)(0),p_30);
 	vstream(chi[ss+v]()(3)(1),p_31);
 	vstream(chi[ss+v]()(3)(2),p_32);
      }
 #endif
  }
  M5Dtime+=usecond();
 }
 #ifdef AVX512 
 #include <simd/Intel512common.h>
 #include <simd/Intel512avx.h>
 #include <simd/Intel512single.h>
 #endif 
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi, FermionField &chi,
 					     int LLs, int site,
 					     Vector<iSinglet<Simd> > &Matp,
 					     Vector<iSinglet<Simd> > &Matm)
 {
 #ifndef AVX512
  {
  SiteHalfSpinor BcastP;
  SiteHalfSpinor BcastM;
  SiteHalfSpinor SiteChiP;
  SiteHalfSpinor SiteChiM;
  // Ls*Ls * 2 * 12 * vol flops
  for(int s1=0;s1<LLs;s1++){ 
    for(int s2=0;s2<LLs;s2++){ 
      for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
        int s=s2+l*LLs;
 	int lex=s2+LLs*site;
 	if ( s2==0 && l==0) {
 	  SiteChiP=zero;
 	  SiteChiM=zero;
 	}
 	for(int sp=0;sp<2;sp++){
        for(int co=0;co<Nc;co++){
 	  vbroadcast(BcastP()(sp  )(co),psi[lex]()(sp)(co),l);
 	}}
 	for(int sp=0;sp<2;sp++){
        for(int co=0;co<Nc;co++){
 	  vbroadcast(BcastM()(sp  )(co),psi[lex]()(sp+2)(co),l);
 	}}
 	for(int sp=0;sp<2;sp++){
        for(int co=0;co<Nc;co++){
 	  SiteChiP()(sp)(co)=real_madd(Matp[LLs*s+s1]()()(),BcastP()(sp)(co),SiteChiP()(sp)(co)); // 1100 us.
 	  SiteChiM()(sp)(co)=real_madd(Matm[LLs*s+s1]()()(),BcastM()(sp)(co),SiteChiM()(sp)(co)); // each found by commenting out
 	}}
    }}
    {
      int lex = s1+LLs*site;
      for(int sp=0;sp<2;sp++){
      for(int co=0;co<Nc;co++){
 	vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
 	vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
      }}
    }
  }
  }
 #else
  {
  // pointers
    //  MASK_REGS;
 #define Chi_00 %%zmm1
 #define Chi_01 %%zmm2
 #define Chi_02 %%zmm3
 #define Chi_10 %%zmm4
 #define Chi_11 %%zmm5
 #define Chi_12 %%zmm6
 #define Chi_20 %%zmm7
 #define Chi_21 %%zmm8
 #define Chi_22 %%zmm9
 #define Chi_30 %%zmm10
 #define Chi_31 %%zmm11
 #define Chi_32 %%zmm12
 #define BCAST0   %%zmm13
 #define BCAST1   %%zmm14
 #define BCAST2   %%zmm15
 #define BCAST3   %%zmm16
 #define BCAST4   %%zmm17
 #define BCAST5   %%zmm18
 #define BCAST6   %%zmm19
 #define BCAST7   %%zmm20
 #define BCAST8   %%zmm21
 #define BCAST9   %%zmm22
 #define BCAST10  %%zmm23
 #define BCAST11  %%zmm24
  int incr=LLs*LLs*sizeof(iSinglet<Simd>);
  for(int s1=0;s1<LLs;s1++){ 
    for(int s2=0;s2<LLs;s2++){ 
      int lex=s2+LLs*site;
      uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
      uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
      uint64_t a2 = (uint64_t)&psi[lex];
      for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
 	if ( (s2+l)==0 ) {
 	  asm (
  	           VPREFETCH1(0,%2)  	     VPREFETCH1(0,%1)
  	           VPREFETCH1(12,%2)  	     VPREFETCH1(13,%2)
  	           VPREFETCH1(14,%2)  	     VPREFETCH1(15,%2)         
 		   VBCASTCDUP(0,%2,BCAST0)   
 		   VBCASTCDUP(1,%2,BCAST1)   
 		   VBCASTCDUP(2,%2,BCAST2)   
 		   VBCASTCDUP(3,%2,BCAST3)   
 		   VBCASTCDUP(4,%2,BCAST4)     VMULMEM (0,%0,BCAST0,Chi_00)
 		   VBCASTCDUP(5,%2,BCAST5)     VMULMEM (0,%0,BCAST1,Chi_01)
 		   VBCASTCDUP(6,%2,BCAST6)     VMULMEM (0,%0,BCAST2,Chi_02)
 		   VBCASTCDUP(7,%2,BCAST7)     VMULMEM (0,%0,BCAST3,Chi_10)
 		   VBCASTCDUP(8,%2,BCAST8)     VMULMEM (0,%0,BCAST4,Chi_11)
 		   VBCASTCDUP(9,%2,BCAST9)     VMULMEM (0,%0,BCAST5,Chi_12)
 		   VBCASTCDUP(10,%2,BCAST10)   VMULMEM (0,%1,BCAST6,Chi_20)
 		   VBCASTCDUP(11,%2,BCAST11)   VMULMEM (0,%1,BCAST7,Chi_21)
 		   VMULMEM (0,%1,BCAST8,Chi_22)         
 		   VMULMEM (0,%1,BCAST9,Chi_30)
 		   VMULMEM (0,%1,BCAST10,Chi_31)       
 		   VMULMEM (0,%1,BCAST11,Chi_32)
 		   : : "r" (a0), "r" (a1), "r" (a2)  );
 	} else { 
 	  asm (
 		   VBCASTCDUP(0,%2,BCAST0)   VMADDMEM (0,%0,BCAST0,Chi_00)
 		   VBCASTCDUP(1,%2,BCAST1)   VMADDMEM (0,%0,BCAST1,Chi_01)
 		   VBCASTCDUP(2,%2,BCAST2)   VMADDMEM (0,%0,BCAST2,Chi_02)
 		   VBCASTCDUP(3,%2,BCAST3)   VMADDMEM (0,%0,BCAST3,Chi_10)
 		   VBCASTCDUP(4,%2,BCAST4)   VMADDMEM (0,%0,BCAST4,Chi_11)
 		   VBCASTCDUP(5,%2,BCAST5)   VMADDMEM (0,%0,BCAST5,Chi_12)
 		   VBCASTCDUP(6,%2,BCAST6)   VMADDMEM (0,%1,BCAST6,Chi_20)
 		   VBCASTCDUP(7,%2,BCAST7)   VMADDMEM (0,%1,BCAST7,Chi_21)
 		   VBCASTCDUP(8,%2,BCAST8)   VMADDMEM (0,%1,BCAST8,Chi_22)
 		   VBCASTCDUP(9,%2,BCAST9)   VMADDMEM (0,%1,BCAST9,Chi_30)
 		   VBCASTCDUP(10,%2,BCAST10)  VMADDMEM (0,%1,BCAST10,Chi_31)
 		   VBCASTCDUP(11,%2,BCAST11)  VMADDMEM (0,%1,BCAST11,Chi_32) 
 		   : : "r" (a0), "r" (a1), "r" (a2)  );
 	}
 	a0 = a0+incr;
 	a1 = a1+incr;
 	a2 = a2+sizeof(Simd::scalar_type);
      }}
    {
      int lexa = s1+LLs*site;
      asm (
 	       VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)		
 	       VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)		
 	       VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)		
 	       VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)		
 	       : : "r" ((uint64_t)&chi[lexa]) : "memory" );
    }
  }
  }
 #undef Chi_00
 #undef Chi_01
 #undef Chi_02
 #undef Chi_10
 #undef Chi_11
 #undef Chi_12
 #undef Chi_20
 #undef Chi_21
 #undef Chi_22
 #undef Chi_30
 #undef Chi_31
 #undef Chi_32
 #undef BCAST0
 #undef BCAST1
 #undef BCAST2
 #undef BCAST3
 #undef BCAST4
 #undef BCAST5
 #undef BCAST6
 #undef BCAST7
 #undef BCAST8
 #undef BCAST9
 #undef BCAST10
 #undef BCAST11
 #endif
 };
  // Z-mobius version
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi, FermionField &chi,
 					     int LLs, int site, Vector<iSinglet<Simd> > &Matp, Vector<iSinglet<Simd> > &Matm)
 {
 #ifndef AVX512
  {
  SiteHalfSpinor BcastP;
  SiteHalfSpinor BcastM;
  SiteHalfSpinor SiteChiP;
  SiteHalfSpinor SiteChiM;
  // Ls*Ls * 2 * 12 * vol flops
  for(int s1=0;s1<LLs;s1++){ 
    for(int s2=0;s2<LLs;s2++){ 
      for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
        int s=s2+l*LLs;
 	int lex=s2+LLs*site;
 	if ( s2==0 && l==0) {
 	  SiteChiP=zero;
 	  SiteChiM=zero;
 	}
 	for(int sp=0;sp<2;sp++){
        for(int co=0;co<Nc;co++){
 	  vbroadcast(BcastP()(sp  )(co),psi[lex]()(sp)(co),l);
 	}}
 	for(int sp=0;sp<2;sp++){
        for(int co=0;co<Nc;co++){
 	  vbroadcast(BcastM()(sp  )(co),psi[lex]()(sp+2)(co),l);
 	}}
 	for(int sp=0;sp<2;sp++){
        for(int co=0;co<Nc;co++){
 	  SiteChiP()(sp)(co)=SiteChiP()(sp)(co)+ Matp[LLs*s+s1]()()()*BcastP()(sp)(co); 
 	  SiteChiM()(sp)(co)=SiteChiM()(sp)(co)+ Matm[LLs*s+s1]()()()*BcastM()(sp)(co); 
 	}}
    }}
    {
      int lex = s1+LLs*site;
      for(int sp=0;sp<2;sp++){
      for(int co=0;co<Nc;co++){
 	vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
 	vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
      }}
    }
  }
  }
 #else
  {
  // pointers
  //  MASK_REGS;
 #define Chi_00 %zmm0
 #define Chi_01 %zmm1
 #define Chi_02 %zmm2
 #define Chi_10 %zmm3
 #define Chi_11 %zmm4
 #define Chi_12 %zmm5
 #define Chi_20 %zmm6
 #define Chi_21 %zmm7
 #define Chi_22 %zmm8
 #define Chi_30 %zmm9
 #define Chi_31 %zmm10
 #define Chi_32 %zmm11
 #define pChi_00 %%zmm0
 #define pChi_01 %%zmm1
 #define pChi_02 %%zmm2
 #define pChi_10 %%zmm3
 #define pChi_11 %%zmm4
 #define pChi_12 %%zmm5
 #define pChi_20 %%zmm6
 #define pChi_21 %%zmm7
 #define pChi_22 %%zmm8
 #define pChi_30 %%zmm9
 #define pChi_31 %%zmm10
 #define pChi_32 %%zmm11
 #define BCAST_00   %zmm12
 #define  SHUF_00   %zmm13
 #define BCAST_01   %zmm14
 #define  SHUF_01   %zmm15
 #define BCAST_02   %zmm16
 #define  SHUF_02   %zmm17
 #define BCAST_10   %zmm18
 #define  SHUF_10   %zmm19
 #define BCAST_11   %zmm20
 #define  SHUF_11   %zmm21
 #define BCAST_12   %zmm22
 #define  SHUF_12   %zmm23
 #define Mp  %zmm24
 #define Mps %zmm25
 #define Mm  %zmm26
 #define Mms %zmm27
 #define N 8
  int incr=LLs*LLs*sizeof(iSinglet<Simd>);
  for(int s1=0;s1<LLs;s1++){ 
    for(int s2=0;s2<LLs;s2++){ 
      int lex=s2+LLs*site;
      uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
      uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
      uint64_t a2 = (uint64_t)&psi[lex];
      for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
 	if ( (s2+l)==0 ) {
 	  LOAD64(%r8,a0);
 	  LOAD64(%r9,a1);
 	  LOAD64(%r10,a2);
 	  asm (
 	       VLOAD(0,%r8,Mp)// i r
 	       VLOAD(0,%r9,Mm)
 	       VSHUF(Mp,Mps)  // r i 
 	       VSHUF(Mm,Mms)
 	       VPREFETCH1(12,%r10)  	     VPREFETCH1(13,%r10)
 	       VPREFETCH1(14,%r10)  	     VPREFETCH1(15,%r10)         
 	       VMULIDUP(0*N,%r10,Mps,Chi_00)
 	       VMULIDUP(1*N,%r10,Mps,Chi_01)
 	       VMULIDUP(2*N,%r10,Mps,Chi_02)
 	       VMULIDUP(3*N,%r10,Mps,Chi_10)
 	       VMULIDUP(4*N,%r10,Mps,Chi_11)
 	       VMULIDUP(5*N,%r10,Mps,Chi_12)
 	       VMULIDUP(6*N ,%r10,Mms,Chi_20)
 	       VMULIDUP(7*N ,%r10,Mms,Chi_21)
 	       VMULIDUP(8*N ,%r10,Mms,Chi_22)
 	       VMULIDUP(9*N ,%r10,Mms,Chi_30)
 	       VMULIDUP(10*N,%r10,Mms,Chi_31)
 	       VMULIDUP(11*N,%r10,Mms,Chi_32)
 	       VMADDSUBRDUP(0*N,%r10,Mp,Chi_00)
 	       VMADDSUBRDUP(1*N,%r10,Mp,Chi_01)
 	       VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
 	       VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
 	       VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
 	       VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
 	       VMADDSUBRDUP(6*N ,%r10,Mm,Chi_20)
 	       VMADDSUBRDUP(7*N ,%r10,Mm,Chi_21)
 	       VMADDSUBRDUP(8*N ,%r10,Mm,Chi_22)
 	       VMADDSUBRDUP(9*N ,%r10,Mm,Chi_30)
 	       VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
 	       VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
 	       );
 	} else { 
 	  LOAD64(%r8,a0);
 	  LOAD64(%r9,a1);
 	  LOAD64(%r10,a2);
 	  asm (
 	       VLOAD(0,%r8,Mp)
 	       VSHUF(Mp,Mps)
 	       VLOAD(0,%r9,Mm)
 	       VSHUF(Mm,Mms)
 	       VMADDSUBIDUP(0*N,%r10,Mps,Chi_00) //  Mri * Pii +- Cir
 	       VMADDSUBIDUP(1*N,%r10,Mps,Chi_01)
 	       VMADDSUBIDUP(2*N,%r10,Mps,Chi_02)
 	       VMADDSUBIDUP(3*N,%r10,Mps,Chi_10)
 	       VMADDSUBIDUP(4*N,%r10,Mps,Chi_11)
 	       VMADDSUBIDUP(5*N,%r10,Mps,Chi_12)
 	       VMADDSUBIDUP(6 *N,%r10,Mms,Chi_20)
 	       VMADDSUBIDUP(7 *N,%r10,Mms,Chi_21)
 	       VMADDSUBIDUP(8 *N,%r10,Mms,Chi_22)
 	       VMADDSUBIDUP(9 *N,%r10,Mms,Chi_30)
 	       VMADDSUBIDUP(10*N,%r10,Mms,Chi_31)
 	       VMADDSUBIDUP(11*N,%r10,Mms,Chi_32)
 	       VMADDSUBRDUP(0*N,%r10,Mp,Chi_00) //  Cir = Mir * Prr +- ( Mri * Pii +- Cir) 
 	       VMADDSUBRDUP(1*N,%r10,Mp,Chi_01) //  Ci = MiPr + Ci + MrPi ;    Cr = MrPr - ( MiPi - Cr)
 	       VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
 	       VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
 	       VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
 	       VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
 	       VMADDSUBRDUP(6 *N,%r10,Mm,Chi_20)
 	       VMADDSUBRDUP(7 *N,%r10,Mm,Chi_21)
 	       VMADDSUBRDUP(8 *N,%r10,Mm,Chi_22)
 	       VMADDSUBRDUP(9 *N,%r10,Mm,Chi_30)
 	       VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
 	       VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
 	       );
 	}
 	a0 = a0+incr;
 	a1 = a1+incr;
 	a2 = a2+sizeof(Simd::scalar_type);
      }}
    {
      int lexa = s1+LLs*site;
      /*
      SiteSpinor tmp;
      asm (
 	       VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01)  VSTORE(2 ,%0,pChi_02)		
 	       VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11)  VSTORE(5 ,%0,pChi_12)		
 	       VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21)  VSTORE(8 ,%0,pChi_22)		
 	       VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31)  VSTORE(11,%0,pChi_32)		
 	       : : "r" ((uint64_t)&tmp) : "memory" );
      */
      asm (
 	       VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01)  VSTORE(2 ,%0,pChi_02)		
 	       VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11)  VSTORE(5 ,%0,pChi_12)		
 	       VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21)  VSTORE(8 ,%0,pChi_22)		
 	       VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31)  VSTORE(11,%0,pChi_32)		
 	       : : "r" ((uint64_t)&chi[lexa]) : "memory" );
      //      if ( 1 || (site==0) ) { 
      //	std::cout<<site << " s1 "<<s1<<"\n\t"<<tmp << "\n't" << chi[lexa] <<"\n\t"<<tmp-chi[lexa]<<std::endl;
      //      }
    }
  }
  }
 #undef Chi_00
 #undef Chi_01
 #undef Chi_02
 #undef Chi_10
 #undef Chi_11
 #undef Chi_12
 #undef Chi_20
 #undef Chi_21
 #undef Chi_22
 #undef Chi_30
 #undef Chi_31
 #undef Chi_32
 #undef BCAST0
 #undef BCAST1
 #undef BCAST2
 #undef BCAST3
 #undef BCAST4
 #undef BCAST5
 #undef BCAST6
 #undef BCAST7
 #undef BCAST8
 #undef BCAST9
 #undef BCAST10
 #undef BCAST11
 #endif
 };
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
 {
@ -299,108 +767,41 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
  chi.checkerboard=psi.checkerboard;
-  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
+  Vector<iSinglet<Simd> >  Matp;
-  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
+  Vector<iSinglet<Simd> >  Matm;
  Vector<iSinglet<Simd> >  *_Matp;
  Vector<iSinglet<Simd> >  *_Matm;
-  for(int s=0;s<Ls;s++){
+  //  MooeeInternalCompute(dag,inv,Matp,Matm);
-    Pplus(s,s) = bee[s];
+  if ( inv && dag ) { 
-    Pminus(s,s)= bee[s];
+    _Matp = &MatpInvDag;
    _Matm = &MatmInvDag;
  }
-  
+  if ( inv && (!dag) ) { 
-  for(int s=0;s<Ls-1;s++){
+    _Matp = &MatpInv;
-    Pminus(s,s+1) = -cee[s];
+    _Matm = &MatmInv;
  } 
  if ( !inv ) {
    MooeeInternalCompute(dag,inv,Matp,Matm);
    _Matp = &Matp;
    _Matm = &Matm;
  }
-  
+  assert(_Matp->size()==Ls*LLs);
  for(int s=0;s<Ls-1;s++){
    Pplus(s+1,s) = -cee[s+1];
  }
  Pplus (0,Ls-1) = mass*cee[0];
  Pminus(Ls-1,0) = mass*cee[Ls-1];
  Eigen::MatrixXcd PplusMat ;
  Eigen::MatrixXcd PminusMat;
  if ( inv ) {
    PplusMat =Pplus.inverse();
    PminusMat=Pminus.inverse();
  } else { 
    PplusMat =Pplus;
    PminusMat=Pminus;
  }
  if(dag){
    PplusMat.adjointInPlace();
    PminusMat.adjointInPlace();
  }
  typedef typename SiteHalfSpinor::scalar_type scalar_type;
  const int Nsimd=Simd::Nsimd();
  Vector<iSinglet<Simd> > Matp(Ls*LLs);
  Vector<iSinglet<Simd> > Matm(Ls*LLs);
  for(int s2=0;s2<Ls;s2++){
  for(int s1=0;s1<LLs;s1++){
    int istride = LLs;
    int ostride = 1;
      Simd Vp;
      Simd Vm;
      scalar_type *sp = (scalar_type *)&Vp;
      scalar_type *sm = (scalar_type *)&Vm;
      for(int l=0;l<Nsimd;l++){
 	sp[l] = PplusMat (l*istride+s1*ostride ,s2);
 	sm[l] = PminusMat(l*istride+s1*ostride,s2);
      }
      Matp[LLs*s2+s1] = Vp;
      Matm[LLs*s2+s1] = Vm;
    }
  }
  MooeeInvCalls++;
  MooeeInvTime-=usecond();
  // Dynamic allocate on stack to get per thread without serialised heap acces
 #pragma omp parallel  
  {
-    Vector<SiteHalfSpinor> SitePplus(LLs);
+  if ( switcheroo<Coeff_t>::iscomplex() ) {
-    Vector<SiteHalfSpinor> SitePminus(LLs);
+  PARALLEL_FOR_LOOP
-    Vector<SiteHalfSpinor> SiteChiP(LLs);
+    for(auto site=0;site<vol;site++){
-    Vector<SiteHalfSpinor> SiteChiM(LLs);
+      MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm);
    Vector<SiteSpinor>     SiteChi(LLs);
    SiteHalfSpinor BcastP;
    SiteHalfSpinor BcastM;
 #pragma omp for 
  for(auto site=0;site<vol;site++){
    for(int s=0;s<LLs;s++){
      int lex = s+LLs*site;
      spProj5p(SitePplus[s] ,psi[lex]);
      spProj5m(SitePminus[s],psi[lex]);
      SiteChiP[s]=zero;
      SiteChiM[s]=zero;
    }
-      
+  } else { 
-    int s=0;
+  PARALLEL_FOR_LOOP
-    for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
+    for(auto site=0;site<vol;site++){
-      for(int s2=0;s2<LLs;s2++){ // Column loop of right hand side
+      MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm);
 	vbroadcast(BcastP,SitePplus [s2],l);
 	vbroadcast(BcastM,SitePminus[s2],l);
 	for(int s1=0;s1<LLs;s1++){ // Column loop of reduction variables
 	  SiteChiP[s1]=SiteChiP[s1]+Matp[LLs*s+s1]*BcastP;
 	  SiteChiM[s1]=SiteChiM[s1]+Matm[LLs*s+s1]*BcastM;
 	}
      s++;
    }}
    for(int s=0;s<LLs;s++){
      int lex = s+LLs*site;
      spRecon5p(SiteChi[s],SiteChiP[s]);
      accumRecon5m(SiteChi[s],SiteChiM[s]);
      chi[lex] = SiteChi[s]*0.5;
    }
  }
  }
  MooeeInvTime+=usecond();
 }
@ -414,4 +815,5 @@ template void CayleyFermion5D<DomainWallVec5dImplD>::MooeeInternal(const Fermion
 template void CayleyFermion5D<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 template void CayleyFermion5D<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 }}
--- a/lib/qcd/action/fermion/FermionOperator.h
+++ b/lib/qcd/action/fermion/FermionOperator.h
@ -48,6 +48,8 @@ namespace Grid {
      FermionOperator(const ImplParams &p= ImplParams()) : Impl(p) {};
      virtual FermionField &tmp(void) = 0;
      GridBase * Grid(void)   { return FermionGrid(); };   // this is all the linalg routines need to know
      GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); };
--- a/lib/qcd/action/fermion/WilsonFermion.cc
+++ b/lib/qcd/action/fermion/WilsonFermion.cc
@ -61,7 +61,9 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
      LebesgueEvenOdd(_cbgrid),
      Umu(&Fgrid),
      UmuEven(&Hgrid),
-      UmuOdd(&Hgrid) {
+      UmuOdd(&Hgrid),
      _tmp(&Hgrid)
 {
  // Allocate the required comms buffer
  ImportGauge(_Umu);
 }
--- a/lib/qcd/action/fermion/WilsonFermion.h
+++ b/lib/qcd/action/fermion/WilsonFermion.h
@ -58,6 +58,9 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
  GridBase *FermionGrid(void) { return _grid; }
  GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }
  //////////////////////////////////////////////////////////////////
  // override multiply; cut number routines if pass dagger argument
  // and also make interface more uniformly consistent
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@ -60,7 +60,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  UmuEven(_FourDimRedBlackGrid),
  UmuOdd (_FourDimRedBlackGrid),
  Lebesgue(_FourDimGrid),
-  LebesgueEvenOdd(_FourDimRedBlackGrid)
+  LebesgueEvenOdd(_FourDimRedBlackGrid),
  _tmp(&FiveDimRedBlackGrid)
 {
  if (Impl::LsVectorised) { 
--- a/lib/qcd/action/fermion/WilsonFermion5D.h
+++ b/lib/qcd/action/fermion/WilsonFermion5D.h
@ -74,6 +74,9 @@ namespace QCD {
     typedef WilsonKernels<Impl> Kernels;
     PmuStat stat;
     FermionField _tmp;
     FermionField &tmp(void) { return _tmp; }
     void Report(void);
     void ZeroCounters(void);
     double DhopCalls;
--- a/lib/serialisation/BaseIO.h
+++ b/lib/serialisation/BaseIO.h
@ -83,12 +83,7 @@ namespace Grid {
    typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
    write(const std::string& s, const U &output);
    template <typename U>
-    typename std::enable_if<std::is_enum<U>::value, void>::type
+    typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
    write(const std::string& s, const U &output);
    template <typename U>
    typename std::enable_if<
      !(std::is_base_of<Serializable, U>::value or std::is_enum<U>::value),
      void>::type
    write(const std::string& s, const U &output);
  private:
    T *upcast;
@ -107,12 +102,7 @@ namespace Grid {
    typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
    read(const std::string& s, U &output);
    template <typename U>
-    typename std::enable_if<std::is_enum<U>::value, void>::type
+    typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
    read(const std::string& s, U &output);
    template <typename U>
    typename std::enable_if<
      !(std::is_base_of<Serializable, U>::value or std::is_enum<U>::value),
      void>::type
    read(const std::string& s, U &output);
  protected:
    template <typename U>
@ -221,17 +211,7 @@ namespace Grid {
  template <typename T>
  template <typename U>
-  typename std::enable_if<std::is_enum<U>::value, void>::type
+  typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
  Writer<T>::write(const std::string &s, const U &output)
  {
    EnumIO<U>::write(*this, s, output);
  }
  template <typename T>
  template <typename U>
  typename std::enable_if<
    !(std::is_base_of<Serializable, U>::value or std::is_enum<U>::value),
    void>::type
  Writer<T>::write(const std::string &s, const U &output)
  {
    upcast->writeDefault(s, output);
@ -266,17 +246,7 @@ namespace Grid {
  template <typename T>
  template <typename U>
-  typename std::enable_if<std::is_enum<U>::value, void>::type
+  typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
  Reader<T>::read(const std::string &s, U &output)
  {
    EnumIO<U>::read(*this, s, output);
  }
  template <typename T>
  template <typename U>
  typename std::enable_if<
    !(std::is_base_of<Serializable, U>::value or std::is_enum<U>::value),
    void>::type
  Reader<T>::read(const std::string &s, U &output)
  {
    upcast->readDefault(s, output);
@ -300,7 +270,6 @@ namespace Grid {
      abort();
    }
  }
 }
 #endif
--- a/lib/serialisation/MacroMagic.h
+++ b/lib/serialisation/MacroMagic.h
@ -114,35 +114,33 @@ THE SOFTWARE.
 #define GRID_MACRO_WRITE_MEMBER(A,B) Grid::write(WR,#B,obj. B);
 #define GRID_SERIALIZABLE_CLASS_MEMBERS(cname,...)		\
-  \
+\
-  \
+\
-  GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__))		\
+GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__))		\
-  \
+\
-  \
+\
-  template <typename T>\
+template <typename T>\
-  static inline void write(Writer<T> &WR,const std::string &s, const cname &obj){ \
+static inline void write(Writer<T> &WR,const std::string &s, const cname &obj){ \
-    push(WR,s);\
+  push(WR,s);\
-    GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__))	\
+  GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__))	\
-    pop(WR);\
+  pop(WR);\
-  } \
+} \
-  \
+\
-  \
+\
-  template <typename T>\
+template <typename T>\
-  static inline void read(Reader<T> &RD,const std::string &s, cname &obj){	\
+static inline void read(Reader<T> &RD,const std::string &s, cname &obj){	\
-    push(RD,s);\
+  push(RD,s);\
-    GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__))	\
+  GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__))	\
-    pop(RD);\
+  pop(RD);\
-  } \
+} \
-  \
+\
-  \
+\
-  friend inline std::ostream & operator << (std::ostream &os, const cname &obj ) { \
+friend inline std::ostream & operator << (std::ostream &os, const cname &obj ) { \
-    os<<"class "<<#cname<<" {"<<std::endl;\
+  os<<"class "<<#cname<<" {"<<std::endl;\
-    GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_OS_WRITE_MEMBER,__VA_ARGS__))	\
+  GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_OS_WRITE_MEMBER,__VA_ARGS__))	\
-      os<<"}";								\
+    os<<"}";								\
-    return os;\
+  return os;\
-  };  
+};  
 #define GRID_ENUM_TYPE(obj) std::remove_reference<decltype(obj)>::type
 #define GRID_MACRO_ENUMVAL(A,B) A = B,
@ -150,44 +148,52 @@ THE SOFTWARE.
 #define GRID_MACRO_ENUMTEST(A,B) else if (buf == #A) {obj = GRID_ENUM_TYPE(obj)::A;}
 #define GRID_MACRO_ENUMCASEIO(A,B) case GRID_ENUM_TYPE(obj)::A: os << #A; break;
 namespace Grid {
  template <typename U>
  class EnumIO {};
 }
 #define GRID_SERIALIZABLE_ENUM(name,undefname,...)\
-  enum class name {\
+class name: public Serializable\
-      GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMVAL,__VA_ARGS__))\
+{\
-      undefname = -1\
+public:\
  enum EnumType\
  {\
    GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMVAL,__VA_ARGS__))\
    undefname = -1\
  };\
 public:\
  name(void): value_(undefname) {};\
  name(EnumType value): value_(value) {};\
  template <typename T>\
  static inline void write(Writer<T> &WR,const std::string &s, const name &obj)\
  {\
    switch (obj.value_)\
    {\
      GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASE,__VA_ARGS__))\
      default: Grid::write(WR,s,#undefname); break;\
    }\
  }\
  \
-  template<>\
+  template <typename T>\
-  class EnumIO<name> {\
+  static inline void read(Reader<T> &RD,const std::string &s, name &obj)\
-    public:\
+  {\
-      template <typename T>\
+    std::string buf;\
-      static inline void write(Writer<T> &WR,const std::string &s, const name &obj){ \
+    Grid::read(RD, s, buf);\
-        switch (obj) {\
+    if (buf == #undefname) {obj = name::undefname;}\
-          GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASE,__VA_ARGS__))\
+    GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMTEST,__VA_ARGS__))\
-          default: Grid::write(WR,s,#undefname); break;\
+    else {obj = name::undefname;}\
-        }\
+  }\
-      }\
+  inline operator EnumType(void) const\
-      \
+  {\
-      template <typename T>\
+    return value_;\
-      static inline void read(Reader<T> &RD,const std::string &s, name &obj){ \
+  }\
-        std::string buf;\
+  inline friend std::ostream & operator<<(std::ostream &os, const name &obj)\
-        Grid::read(RD, s, buf);\
+  {\
        if (buf == #undefname) {obj = name::undefname;}\
        GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMTEST,__VA_ARGS__))\
        else {obj = name::undefname;}\
      }\
  };\
  \
  inline std::ostream & operator << (std::ostream &os, const name &obj ) { \
    switch (obj) {\
-        GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASEIO,__VA_ARGS__))\
+      GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASEIO,__VA_ARGS__))\
-        default: os << #undefname; break;\
+      default: os << #undefname; break;\
    }\
    return os;\
-  };
+  }\
 private:\
  EnumType value_;\
 };
 #endif
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@ -213,6 +213,29 @@ namespace Optimization {
    }
  };
  struct MultRealPart{
    inline __m256 operator()(__m256 a, __m256 b){
      __m256 ymm0;
      ymm0  = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
      return  _mm256_mul_ps(ymm0,b);                       // ymm0 <- ar bi, ar br
    }
    inline __m256d operator()(__m256d a, __m256d b){
      __m256d ymm0;
      ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
      return _mm256_mul_pd(ymm0,b);      // ymm0 <- ar bi, ar br
    }
  };
  struct MaddRealPart{
    inline __m256 operator()(__m256 a, __m256 b, __m256 c){
      __m256 ymm0 =  _mm256_moveldup_ps(a); // ymm0 <- ar ar,
      return _mm256_add_ps(_mm256_mul_ps( ymm0, b),c);                         
    }
    inline __m256d operator()(__m256d a, __m256d b, __m256d c){
      __m256d ymm0 = _mm256_shuffle_pd( a, a, 0x0 );
      return _mm256_add_pd(_mm256_mul_pd( ymm0, b),c);                         
    }
  };
  struct MultComplex{
    // Complex float
    inline __m256 operator()(__m256 a, __m256 b){
@ -627,7 +650,9 @@ namespace Optimization {
  typedef Optimization::Sub         SubSIMD;
  typedef Optimization::Div         DivSIMD;
  typedef Optimization::Mult        MultSIMD;
-  typedef Optimization::MultComplex MultComplexSIMD;
+  typedef Optimization::MultComplex  MultComplexSIMD;
  typedef Optimization::MultRealPart MultRealPartSIMD;
  typedef Optimization::MaddRealPart MaddRealPartSIMD;
  typedef Optimization::Conj        ConjSIMD;
  typedef Optimization::TimesMinusI TimesMinusISIMD;
  typedef Optimization::TimesI      TimesISIMD;
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@ -189,6 +189,29 @@ namespace Optimization {
  //  2mul,4 mac +add+sub = 8 flop type insns
  //  3shuf + 2 (+shuf)   = 5/6 simd perm and 1/2 the load.
  struct MultRealPart{
    inline __m512 operator()(__m512 a, __m512 b){
      __m512 ymm0;
      ymm0 = _mm512_moveldup_ps(a); // ymm0 <- ar ar,
      return _mm512_mul_ps(ymm0,b);                       // ymm0 <- ar bi, ar br
    }
    inline __m512d operator()(__m512d a, __m512d b){
      __m512d ymm0;
      ymm0 = _mm512_shuffle_pd(a,a,0x00); // ymm0 <- ar ar, ar,ar b'00,00
      return _mm512_mul_pd(ymm0,b);      // ymm0 <- ar bi, ar br
    }
  };
  struct MaddRealPart{
    inline __m512 operator()(__m512 a, __m512 b, __m512 c){
      __m512 ymm0 =  _mm512_moveldup_ps(a); // ymm0 <- ar ar,
      return _mm512_fmadd_ps( ymm0, b, c);                         
    }
    inline __m512d operator()(__m512d a, __m512d b, __m512d c){
      __m512d ymm0 = _mm512_shuffle_pd( a, a, 0x00 );
      return _mm512_fmadd_pd( ymm0, b, c);                         
    }
  };
  struct MultComplex{
    // Complex float
    inline __m512 operator()(__m512 a, __m512 b){
@ -501,6 +524,8 @@ namespace Optimization {
  typedef Optimization::Mult        MultSIMD;
  typedef Optimization::Div         DivSIMD;
  typedef Optimization::MultComplex MultComplexSIMD;
  typedef Optimization::MultRealPart MultRealPartSIMD;
  typedef Optimization::MaddRealPart MaddRealPartSIMD;
  typedef Optimization::Conj        ConjSIMD;
  typedef Optimization::TimesMinusI TimesMinusISIMD;
  typedef Optimization::TimesI      TimesISIMD;
--- a/lib/simd/Grid_generic.h
+++ b/lib/simd/Grid_generic.h
@ -224,6 +224,21 @@ namespace Optimization {
  #define cmul(a, b, c, i)\
  c[i]   = a[i]*b[i]   - a[i+1]*b[i+1];\
  c[i+1] = a[i]*b[i+1] + a[i+1]*b[i];
  struct MultRealPart{
    template <typename T>
    inline vec<T> operator()(vec<T> a, vec<T> b){
      vec<T> out;
      VECTOR_FOR(i, W<T>::c, 1)
      {
         out.v[2*i]   = a[2*i]*b[2*i];
         out.v[2*i+1] = a[2*i]*b[2*i+1];
      }      
      return out;
    };
  };
  struct MultComplex{
    // Complex
@ -456,6 +471,7 @@ namespace Optimization {
  typedef Optimization::Div         DivSIMD;
  typedef Optimization::Mult        MultSIMD;
  typedef Optimization::MultComplex MultComplexSIMD;
  typedef Optimization::MultRealPart MultRealPartSIMD;
  typedef Optimization::Conj        ConjSIMD;
  typedef Optimization::TimesMinusI TimesMinusISIMD;
  typedef Optimization::TimesI      TimesISIMD;
--- a/lib/simd/Grid_qpx.h
+++ b/lib/simd/Grid_qpx.h
@ -220,6 +220,14 @@ namespace Optimization {
    }
  };
  struct MultRealPart{
    // Complex double
    inline vector4double operator()(vector4double a, vector4double b){
  //      return vec_xmul(b, a);
        return vec_xmul(a, b);
    }
    FLOAT_WRAP_2(operator(), inline)
  };
  struct MultComplex{
    // Complex double
    inline vector4double operator()(vector4double a, vector4double b){
@ -430,6 +438,7 @@ typedef Optimization::Sub         SubSIMD;
 typedef Optimization::Mult        MultSIMD;
 typedef Optimization::Div         DivSIMD;
 typedef Optimization::MultComplex MultComplexSIMD;
 typedef Optimization::MultRealPart MultRealPartSIMD;
 typedef Optimization::Conj        ConjSIMD;
 typedef Optimization::TimesMinusI TimesMinusISIMD;
 typedef Optimization::TimesI      TimesISIMD;
--- a/lib/simd/Grid_sse4.h
+++ b/lib/simd/Grid_sse4.h
@ -177,6 +177,29 @@ namespace Optimization {
    }
  };
  struct MultRealPart{
    inline __m128 operator()(__m128 a, __m128 b){
      __m128 ymm0;
      ymm0  = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
      return  _mm_mul_ps(ymm0,b);                       // ymm0 <- ar bi, ar br
    }
    inline __m128d operator()(__m128d a, __m128d b){
      __m128d ymm0;
      ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
      return _mm_mul_pd(ymm0,b);      // ymm0 <- ar bi, ar br
    }
  };
  struct MaddRealPart{
    inline __m128 operator()(__m128 a, __m128 b, __m128 c){
      __m128 ymm0 =  _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
      return _mm_add_ps(_mm_mul_ps( ymm0, b),c);                         
    }
    inline __m128d operator()(__m128d a, __m128d b, __m128d c){
      __m128d ymm0 = _mm_shuffle_pd( a, a, 0x0 );
      return _mm_add_pd(_mm_mul_pd( ymm0, b),c);                         
    }
  };
  struct MultComplex{
    // Complex float
    inline __m128 operator()(__m128 a, __m128 b){
@ -325,9 +348,11 @@ namespace Optimization {
      }
    }
 #ifndef _mm_alignr_epi64
 #define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16)
 #define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16)
-    
+#endif 
    template<int n> static inline __m128  tRotate(__m128  in){ return (__m128)_mm_alignr_epi32((__m128i)in,(__m128i)in,n); };
    template<int n> static inline __m128d tRotate(__m128d in){ return (__m128d)_mm_alignr_epi64((__m128i)in,(__m128i)in,n); };
@ -415,6 +440,8 @@ namespace Optimization {
  typedef Optimization::Div         DivSIMD;
  typedef Optimization::Mult        MultSIMD;
  typedef Optimization::MultComplex MultComplexSIMD;
  typedef Optimization::MultRealPart MultRealPartSIMD;
  typedef Optimization::MaddRealPart MaddRealPartSIMD;
  typedef Optimization::Conj        ConjSIMD;
  typedef Optimization::TimesMinusI TimesMinusISIMD;
  typedef Optimization::TimesI      TimesISIMD;
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@ -101,6 +101,11 @@ template <typename T> using IfNotInteger = Invoke<std::enable_if<!std::is_integr
 // general forms to allow for vsplat syntax
 // need explicit declaration of types when used since
 // clang cannot automatically determine the output type sometimes
 template <class Out, class Input1, class Input2, class Input3, class Operation>
 Out trinary(Input1 src_1, Input2 src_2, Input3 src_3, Operation op) {
  return op(src_1, src_2, src_3);
 }
 template <class Out, class Input1, class Input2, class Operation>
 Out binary(Input1 src_1, Input2 src_2, Operation op) {
  return op(src_1, src_2);
@ -178,6 +183,7 @@ class Grid_simd {
                          const Grid_simd *__restrict__ r) {
    *y = (*l) * (*r);
  }
  friend inline void sub(Grid_simd *__restrict__ y,
                         const Grid_simd *__restrict__ l,
                         const Grid_simd *__restrict__ r) {
@ -188,7 +194,6 @@ class Grid_simd {
                         const Grid_simd *__restrict__ r) {
    *y = (*l) + (*r);
  }
  friend inline void mac(Grid_simd *__restrict__ y,
                         const Scalar_type *__restrict__ a,
                         const Grid_simd *__restrict__ x) {
@ -260,7 +265,7 @@ class Grid_simd {
  }
  ////////////////////////////
-  // opreator scalar * simd
+  // operator scalar * simd
  ////////////////////////////
  friend inline Grid_simd operator*(const Scalar_type &a, Grid_simd b) {
    Grid_simd va;
@ -433,6 +438,11 @@ inline void vbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
  S* typepun =(S*) &src;
  vsplat(ret,typepun[lane]);
 }    
 template <class S, class V, IfComplex<S> =0> 
 inline void rbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
  S* typepun =(S*) &src;
  ret.v = unary<V>(real(typepun[lane]), VsplatSIMD());
 }    
 ///////////////////////
 // Splat
@ -449,6 +459,10 @@ template <class S, class V>
 inline void vsplat(Grid_simd<S, V> &ret, EnableIf<is_complex<S>, S> c) {
  vsplat(ret, real(c), imag(c));
 }
 template <class S, class V>
 inline void rsplat(Grid_simd<S, V> &ret, EnableIf<is_complex<S>, S> c) {
  vsplat(ret, real(c), real(c));
 }
 // if real fill with a, if complex fill with a in the real part (first function
 // above)
@ -550,6 +564,21 @@ inline Grid_simd<S, V> operator-(Grid_simd<S, V> a, Grid_simd<S, V> b) {
  return ret;
 };
 // Distinguish between complex types and others
 template <class S, class V, IfComplex<S> = 0>
 inline Grid_simd<S, V> real_mult(Grid_simd<S, V> a, Grid_simd<S, V> b) {
  Grid_simd<S, V> ret;
  ret.v = binary<V>(a.v, b.v, MultRealPartSIMD());
  return ret;
 };
 template <class S, class V, IfComplex<S> = 0>
 inline Grid_simd<S, V> real_madd(Grid_simd<S, V> a, Grid_simd<S, V> b, Grid_simd<S,V> c) {
  Grid_simd<S, V> ret;
  ret.v = trinary<V>(a.v, b.v, c.v, MaddRealPartSIMD());
  return ret;
 };
 // Distinguish between complex types and others
 template <class S, class V, IfComplex<S> = 0>
 inline Grid_simd<S, V> operator*(Grid_simd<S, V> a, Grid_simd<S, V> b) {
--- a/lib/simd/Intel512avx.h
+++ b/lib/simd/Intel512avx.h
@ -95,10 +95,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define VIDUPd(SRC,DEST)       "vpshufd  $0xee," #SRC"," #DEST  ";\n" // 32 bit level: 3,2,3,2
 #define VIDUPf(SRC,DEST)         "vmovshdup " #SRC ", " #DEST  ";\n"
-#define VBCASTRDUPd(OFF,A,DEST)           "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST  ";\n" 
+#define VBCASTRDUPd(OFF,A,DEST)           "vbroadcastsd    (" #OFF "*16+0)(" #A ")," #DEST  ";\n" 
-#define VBCASTIDUPd(OFF,A,DEST)           "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST  ";\n" 
+#define VBCASTIDUPd(OFF,A,DEST)           "vbroadcastsd    (" #OFF "*16+8)(" #A ")," #DEST  ";\n" 
-#define VBCASTRDUPf(OFF,PTR,DEST)         "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST  ";\n"
+#define VBCASTRDUPf(OFF,PTR,DEST)         "vbroadcastss    (" #OFF "*8 +0)(" #PTR "), " #DEST  ";\n"
-#define VBCASTIDUPf(OFF,PTR,DEST)         "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST  ";\n"
+#define VBCASTIDUPf(OFF,PTR,DEST)         "vbroadcastss    (" #OFF "*8 +4)(" #PTR "), " #DEST  ";\n"
 #define VBCASTCDUPf(OFF,A,DEST)           "vbroadcastsd    (" #OFF "*64  )(" #A ")," #DEST  ";\n" 
 #define VBCASTZDUPf(OFF,A,DEST)           "vbroadcastf32x4 (" #OFF "*64  )(" #A ")," #DEST  ";\n" 
 #define VBCASTCDUP(OFF,A,DEST) VBCASTCDUPf(OFF,A,DEST) 
 #define VBCASTZDUP(OFF,A,DEST) VBCASTZDUPf(OFF,A,DEST) 
 #define VMADDSUBf(A,B,accum) "vfmaddsub231ps   " #A "," #B "," #accum  ";\n"
 #define VMADDSUBd(A,B,accum) "vfmaddsub231pd   " #A "," #B "," #accum  ";\n"
@ -106,11 +110,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define VMADDSUBMEMd(O,P,B,accum) "vfmaddsub231pd   " #O"*64("#P "),"#B "," #accum  ";\n"
 #define VMADDRDUPf(O,P,B,accum) "vfmadd231ps   (" #O"*8+0)("#P "){1to16},"#B "," #accum  ";\n"
 #define VMADDIDUPf(O,P,B,accum) "vfmadd231ps   (" #O"*8+4)("#P "){1to16},"#B "," #accum  ";\n"
 #define VMADDSUBRDUPf(O,P,B,accum) "vfmaddsub231ps   (" #O"*8+0)("#P "){1to16},"#B "," #accum  ";\n"
 #define VMADDSUBIDUPf(O,P,B,accum) "vfmaddsub231ps   (" #O"*8+4)("#P "){1to16},"#B "," #accum  ";\n"
 #define VMULRDUPf(O,P,B,accum) "vmulps   (" #O"*8+0)("#P "){1to16},"#B "," #accum  ";\n"
 #define VMULIDUPf(O,P,B,accum) "vmulps   (" #O"*8+4)("#P "){1to16},"#B "," #accum  ";\n"
 #define VMADDRDUPd(O,P,B,accum) "vfmadd231pd   (" #O"*16+0)("#P "){1to8},"#B "," #accum  ";\n"
 #define VMADDIDUPd(O,P,B,accum) "vfmadd231pd   (" #O"*16+8)("#P "){1to8},"#B "," #accum  ";\n"
 #define VMADDSUBRDUPd(O,P,B,accum) "vfmaddsub231pd   (" #O"*16+0)("#P "){1to8},"#B "," #accum  ";\n"
 #define VMADDSUBIDUPd(O,P,B,accum) "vfmaddsub231pd   (" #O"*16+8)("#P "){1to8},"#B "," #accum  ";\n"
 #define VMULRDUPd(O,P,B,accum) "vmulpd   (" #O"*16+0)("#P "){1to8},"#B "," #accum  ";\n"
--- a/lib/simd/Intel512common.h
+++ b/lib/simd/Intel512common.h
@ -87,7 +87,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  VACCTIMESMINUSI1d(A,ACC,tmp)				\
  VACCTIMESMINUSI2d(A,ACC,tmp)			
-#define LOAD64i(A,ptr)  __asm__ ( "movq %0, %" #A :  : "r"(ptr)  : #A  );
+#define LOAD64a(A,ptr)  "movq %0, %" #A :  : "r"(ptr)  : #A  
 #define LOAD64i(A,ptr)  __asm__ ( LOAD64a(A,ptr));
 #define LOAD64(A,ptr)  LOAD64i(A,ptr)
 #define VMOVf(A,DEST)   "vmovaps  " #A ", " #DEST  ";\n"
@ -108,8 +109,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 //"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
 //  "clevict0 "#O"*64("#A");\n" 
-#define VLOADf(OFF,PTR,DEST)   "vmovaps  " #OFF "*64(" #PTR "), " #DEST  ";\n"
+#define VLOADf(OFF,PTR,DEST)   "vmovups  " #OFF "*64(" #PTR "), " #DEST  ";\n"
-#define VLOADd(OFF,PTR,DEST)   "vmovapd  " #OFF "*64(" #PTR "), " #DEST  ";\n"
+#define VLOADd(OFF,PTR,DEST)   "vmovupd  " #OFF "*64(" #PTR "), " #DEST  ";\n"
 #define VADDf(A,B,DEST)        "vaddps   " #A "," #B "," #DEST  ";\n"
 #define VADDd(A,B,DEST)        "vaddpd   " #A "," #B "," #DEST  ";\n"
@ -143,8 +144,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define VSTOREf(OFF,PTR,SRC)   "vmovntps " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
 #define VSTOREd(OFF,PTR,SRC)   "vmovntpd " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
 #else
-#define VSTOREf(OFF,PTR,SRC)   "vmovaps " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
+#define VSTOREf(OFF,PTR,SRC)   "vmovups " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
-#define VSTOREd(OFF,PTR,SRC)   "vmovapd " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
+#define VSTOREd(OFF,PTR,SRC)   "vmovupd " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
 #endif
 // Swaps Re/Im ; could unify this with IMCI
--- a/lib/simd/Intel512double.h
+++ b/lib/simd/Intel512double.h
@ -144,10 +144,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMd(O,P,B,accum)
 #define VMADDMEM(O,P,B,accum)    VMADDMEMd(O,P,B,accum)
 #define VMULMEM(O,P,B,accum)     VMULMEMd(O,P,B,accum)
 #undef VMADDRDUP   
 #undef VMADDSUBRDUP   
 #undef VMADDSUBIDUP   
 #undef VMULRDUP   
 #undef VMULIDUP   
 #define VMADDRDUP(O,P,B,accum)    VMADDRDUPd(O,P,B,accum) 
 #define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPd(O,P,B,accum) 
 #define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPd(O,P,B,accum) 
 #define VMULRDUP(O,P,B,accum)     VMULRDUPd(O,P,B,accum)      
--- a/lib/simd/Intel512single.h
+++ b/lib/simd/Intel512single.h
@ -144,10 +144,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define VMADDMEM(O,P,B,accum) VMADDMEMf(O,P,B,accum)
 #define VMULMEM(O,P,B,accum) VMULMEMf(O,P,B,accum)
 #undef VMADDRDUP   
 #undef VMADDSUBRDUP   
 #undef VMADDSUBIDUP   
 #undef VMULRDUP   
 #undef VMULIDUP   
 #define VMADDRDUP(O,P,B,accum)    VMADDRDUPf(O,P,B,accum) 
 #define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPf(O,P,B,accum) 
 #define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPf(O,P,B,accum) 
 #define VMULRDUP(O,P,B,accum)     VMULRDUPf(O,P,B,accum)      
--- a/tests/debug/Test_cayley_even_odd_vec.cc
+++ b/tests/debug/Test_cayley_even_odd_vec.cc