diff --git a/lib/Algorithms.h b/lib/Algorithms.h
index a77e50bc..02d72cb1 100644
--- a/lib/Algorithms.h
+++ b/lib/Algorithms.h
@@ -18,8 +18,8 @@
 #include
 
 // Lanczos support
-#include
-#include
+//#include
+//#include
 
 #include
diff --git a/lib/Init.cc b/lib/Init.cc
index 6457b4b2..a6f016b1 100644
--- a/lib/Init.cc
+++ b/lib/Init.cc
@@ -178,6 +178,7 @@ void Grid_init(int *argc,char ***argv)
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
     QCD::WilsonFermionStatic::HandOptDslash=1;
+    QCD::WilsonFermion5DStatic::HandOptDslash=1;
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
     LebesgueOrder::UseLebesgueOrder=1;
diff --git a/lib/Simd.h b/lib/Simd.h
index 7da0d7ad..42d5e74c 100644
--- a/lib/Simd.h
+++ b/lib/Simd.h
@@ -13,6 +13,11 @@
 
 typedef uint32_t Integer;
 
+#define _MM_SELECT_FOUR_FOUR(A,B,C,D)          ((A<<6)|(B<<4)|(C<<2)|(D))
+#define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H)  ((A<<7)|(B<<6)|(C<<5)|(D<<4)|(E<<3)|(F<<2)|(G<<1)|(H))
+#define _MM_SELECT_FOUR_TWO(A,B,C,D)           _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)
+#define _MM_SELECT_TWO_TWO(A,B)                _MM_SELECT_FOUR_TWO(0,0,A,B)
+
 namespace Grid {
 
   typedef float RealF;
diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc
index b923fe4f..fee1b0fb 100644
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -56,13 +56,13 @@
     UChi_02+= U_20*Chi_02;\
     UChi_12+= U_20*Chi_12;
 
-#define PERMUTE\
-  permute(Chi_00,Chi_00,ptype);\
-  permute(Chi_01,Chi_01,ptype);\
-  permute(Chi_02,Chi_02,ptype);\
-  permute(Chi_10,Chi_10,ptype);\
-  permute(Chi_11,Chi_11,ptype);\
-  permute(Chi_12,Chi_12,ptype);
+#define PERMUTE_DIR(dir)      \
+  permute##dir(Chi_00,Chi_00);\
+  permute##dir(Chi_01,Chi_01);\
+  permute##dir(Chi_02,Chi_02);\
+  permute##dir(Chi_10,Chi_10);\
+  permute##dir(Chi_11,Chi_11);\
+  permute##dir(Chi_12,Chi_12);
 
 //      hspin(0)=fspin(0)+timesI(fspin(3));
 //      hspin(1)=fspin(1)+timesI(fspin(2));
@@ -286,6 +286,10 @@ void WilsonKernels::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
                                          std::vector > &buf,
                                          int ss,int sU,const FermionField &in, FermionField &out)
 {
+  //  std::cout << "Hand op Dhop "<
@@ ... @@ void WilsonKernels::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
       LOAD_CHIMU;
       XP_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -373,7 +377,7 @@ void WilsonKernels::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
       LOAD_CHIMU;
       YP_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -394,7 +398,7 @@ void WilsonKernels::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
       LOAD_CHIMU;
      ZP_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -414,7 +418,7 @@ void WilsonKernels::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
       LOAD_CHIMU;
       TP_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -434,7 +438,7 @@ void WilsonKernels::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
       LOAD_CHIMU;
       XM_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -454,7 +458,7 @@ void WilsonKernels::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
       LOAD_CHIMU;
       YM_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -474,7 +478,7 @@ void WilsonKernels::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
       LOAD_CHIMU;
       ZM_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -494,7 +498,7 @@ void WilsonKernels::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
       LOAD_CHIMU;
       TM_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -526,6 +530,9 @@ void WilsonKernels::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
                                          std::vector > &buf,
                                          int ss,int sU,const FermionField &in, FermionField &out)
 {
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+
   REGISTER Simd result_00; // 12 regs on knc
   REGISTER Simd result_01;
   REGISTER Simd result_02;
@@ -592,7 +599,7 @@ void WilsonKernels::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
       LOAD_CHIMU;
       XM_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -612,7 +619,7 @@ void WilsonKernels::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
       LOAD_CHIMU;
       YM_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -633,7 +640,7 @@ void WilsonKernels::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
       LOAD_CHIMU;
       ZM_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -653,7 +660,7 @@ void WilsonKernels::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
       LOAD_CHIMU;
       TM_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -673,7 +680,7 @@ void WilsonKernels::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
       LOAD_CHIMU;
       XP_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -694,7 +701,7 @@ void WilsonKernels::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
       LOAD_CHIMU;
       YP_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -714,7 +721,7 @@ void WilsonKernels::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
       LOAD_CHIMU;
       ZP_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -734,7 +741,7 @@ void WilsonKernels::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
       LOAD_CHIMU;
       TP_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h
index a00e29eb..217ea947 100644
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -183,11 +183,11 @@ namespace Optimization {
     // Complex float
     inline __m256 operator()(__m256 a, __m256 b){
       __m256 ymm0,ymm1,ymm2;
-      ymm0 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
+      ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
       ymm0 = _mm256_mul_ps(ymm0,b);                       // ymm0 <- ar bi, ar br
       // FIXME AVX2 could MAC
-      ymm1 = _mm256_shuffle_ps(b,b,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
-      ymm2 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
+      ymm1 = _mm256_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi
+      ymm2 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
       ymm1 = _mm256_mul_ps(ymm1,ymm2);                    // ymm1 <- br ai, ai bi
       return _mm256_addsub_ps(ymm0,ymm1);
     }
@@ -270,7 +270,7 @@
     //Complex single
     inline __m256 operator()(__m256 in, __m256 ret){
       __m256 tmp =_mm256_addsub_ps(_mm256_setzero_ps(),in); // r,-i
-      return _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1)); //-i,r
+      return _mm256_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //-i,r
     }
     //Complex double
     inline __m256d operator()(__m256d in, __m256d ret){
@@ -282,7 +282,7 @@
   struct TimesI{
     //Complex single
     inline __m256 operator()(__m256 in, __m256 ret){
-      __m256 tmp =_mm256_shuffle_ps(in,in,_MM_SHUFFLE(2,3,0,1)); // i,r
+      __m256 tmp =_mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // i,r
       return _mm256_addsub_ps(_mm256_setzero_ps(),tmp);          // i,-r
     }
     //Complex double
@@ -296,27 +296,44 @@ namespace Optimization {
   //////////////////////////////////////////////
   // Some Template specialization
   //////////////////////////////////////////////
-  template < typename vtype >
-    void permute(vtype &a,vtype b, int perm) {
-      uconv conv;
-      conv.v = b;
-      switch (perm){
-        // 8x32 bits=>3 permutes
-      case 2: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
-      case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
-      case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break;
-      default: assert(0); break;
-      }
-      a = conv.v;
-    }
+  struct Permute{
+
+    static inline __m256 Permute0(__m256 in){
+      return _mm256_permute2f128_ps(in,in,0x01);
+    };
+    static inline __m256 Permute1(__m256 in){
+      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
+    };
+    static inline __m256 Permute2(__m256 in){
+      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
+    };
+    static inline __m256 Permute3(__m256 in){
+      return in;
+    };
+
+    static inline __m256d Permute0(__m256d in){
+      return _mm256_permute2f128_pd(in,in,0x01);
+    };
+    static inline __m256d Permute1(__m256d in){
+      return _mm256_shuffle_pd(in,in,0x5);
+    };
+    static inline __m256d Permute2(__m256d in){
+      return in;
+    };
+    static inline __m256d Permute3(__m256d in){
+      return in;
+    };
+
+  };
 
   //Complex float Reduce
   template<>
   inline Grid::ComplexF Reduce::operator()(__m256 in){
     __m256 v1,v2;
-    Optimization::permute(v1,in,0); // avx 256; quad complex single
-    v1 = _mm256_add_ps(v1,in);
-    Optimization::permute(v2,v1,1);
+    v1=Optimization::Permute::Permute0(in); // avx 256; quad complex single
+    v1= _mm256_add_ps(v1,in);
+    v2=Optimization::Permute::Permute1(v1);
     v1 = _mm256_add_ps(v1,v2);
     u256f conv; conv.v = v1;
     return Grid::ComplexF(conv.f[0],conv.f[1]);
@@ -326,11 +343,11 @@
   template<>
   inline Grid::RealF Reduce::operator()(__m256 in){
     __m256 v1,v2;
-    Optimization::permute(v1,in,0); // avx 256; octo-double
+    v1 = Optimization::Permute::Permute0(in); // avx 256; octo-float
     v1 = _mm256_add_ps(v1,in);
-    Optimization::permute(v2,v1,1);
+    v2 = Optimization::Permute::Permute1(v1);
     v1 = _mm256_add_ps(v1,v2);
-    Optimization::permute(v2,v1,2);
+    v2 = Optimization::Permute::Permute2(v1);
     v1 = _mm256_add_ps(v1,v2);
     u256f conv; conv.v=v1;
     return conv.f[0];
@@ -341,7 +358,7 @@
   template<>
   inline Grid::ComplexD Reduce::operator()(__m256d in){
     __m256d v1;
-    Optimization::permute(v1,in,0); // sse 128; paired complex single
+    v1 = Optimization::Permute::Permute0(in); // avx 256; pair of complex double
     v1 = _mm256_add_pd(v1,in);
     u256d conv; conv.v = v1;
     return Grid::ComplexD(conv.f[0],conv.f[1]);
@@ -351,9 +368,9 @@
   template<>
   inline Grid::RealD Reduce::operator()(__m256d in){
     __m256d v1,v2;
-    Optimization::permute(v1,in,0); // avx 256; quad double
+    v1 = Optimization::Permute::Permute0(in); // avx 256; quad double
     v1 = _mm256_add_pd(v1,in);
-    Optimization::permute(v2,v1,1);
+    v2 = Optimization::Permute::Permute1(v1);
     v1 = _mm256_add_pd(v1,v2);
     u256d conv; conv.v = v1;
     return conv.f[0];
@@ -387,13 +404,6 @@ namespace Grid {
     _mm_prefetch(ptr,_MM_HINT_T0);
   }
-
-
-  template < typename VectorSIMD >
-    inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
-    Optimization::permute(y.v,b.v,perm);
-  };
-
   // Function name aliases
   typedef Optimization::Vsplat   VsplatSIMD;
   typedef Optimization::Vstore   VstoreSIMD;
diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h
index 7c1800db..7da71899 100644
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -174,7 +174,7 @@ namespace Optimization {
     inline __m512d operator()(__m512d a, __m512d b){
       __m512d a_real = _mm512_shuffle_pd( a, a, 0x00 );
       __m512d a_imag = _mm512_shuffle_pd( a, a, 0xFF );
-      a_imag = _mm512_mul_pd( a_imag, _mm512_permute_pd( b, 0x55 ) );
+      a_imag = _mm512_mul_pd( a_imag, _mm512_permute_pd( b, 0x55 ) );
       return _mm512_fmaddsub_pd( a_real, b, a_imag );
     }
   };
@@ -211,26 +211,24 @@
     //Complex single
     inline __m512 operator()(__m512 in, __m512 ret){
       __m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
-      return _mm512_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2));
+      return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2)); // 0x4E??
     }
     //Complex double
     inline __m512d operator()(__m512d in, __m512d ret){
       __m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
-      return _mm512_shuffle_pd(tmp,tmp,_MM_SHUFFLE(1,0,3,2));
-    }
-
-
+      return _mm512_shuffle_pd(tmp,tmp,0x55);
+    }
   };
 
   struct TimesI{
     //Complex single
     inline __m512 operator()(__m512 in, __m512 ret){
-      __m512 tmp = _mm512_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2));
+      __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
       return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
     }
     //Complex double
     inline __m512d operator()(__m512d in, __m512d ret){
-      __m512d tmp = _mm512_shuffle_pd(tmp,tmp,_MM_SHUFFLE(1,0,3,2));
+      __m512d tmp = _mm512_shuffle_pd(in,in,0x55);
       return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
     }
 
@@ -239,6 +237,36 @@
 
+  // Gpermute utilities consider coalescing into 1 Gpermute
+  struct Permute{
+
+    static inline __m512 Permute0(__m512 in){
+      return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
+    };
+    static inline __m512 Permute1(__m512 in){
+      return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
+    };
+    static inline __m512 Permute2(__m512 in){
+      return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
+    };
+    static inline __m512 Permute3(__m512 in){
+      return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
+    };
+
+    static inline __m512d Permute0(__m512d in){
+      return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
+    };
+    static inline __m512d Permute1(__m512d in){
+      return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
+    };
+    static inline __m512d Permute2(__m512d in){
+      return _mm512_shuffle_pd(in,in,0x55);
+    };
+    static inline __m512d Permute3(__m512d in){
+      return in;
+    };
+
+  };
 
   //////////////////////////////////////////////
@@ -298,25 +326,6 @@
   }
-
-
-  // Gpermute utilities consider coalescing into 1 Gpermute
-  template < typename VectorSIMD >
-    inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
-    union {
-      __m512 f;
-      decltype(VectorSIMD::v) v;
-    } conv;
-    conv.v = b.v;
-    switch(perm){
-    case 3 : conv.f = _mm512_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
-    case 2 : conv.f = _mm512_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
-    case 1 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
-    case 0 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
-    default: assert(0); break;
-    }
-    y.v=conv.v;
-  };
 
   // Function name aliases
   typedef Optimization::Vsplat   VsplatSIMD;
diff --git a/lib/simd/Grid_imci.h b/lib/simd/Grid_imci.h
index c89e8830..63765a47 100644
--- a/lib/simd/Grid_imci.h
+++ b/lib/simd/Grid_imci.h
@@ -255,7 +255,36 @@ namespace Optimization {
   };
 
-
+  struct Permute{
+
+    static inline __m512 Permute0(__m512 in){
+      return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2));
+    };
+    static inline __m512 Permute1(__m512 in){
+      return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(2,3,0,1));
+    };
+    static inline __m512 Permute2(__m512 in){
+      return _mm512_swizzle_ps(in,_MM_SWIZ_REG_BADC);
+    };
+    static inline __m512 Permute3(__m512 in){
+      return _mm512_swizzle_ps(in,_MM_SWIZ_REG_CDAB);
+    };
+
+    static inline __m512d Permute0(__m512d in){// Hack no intrinsic for 256 swaps of __m512d
+      return (__m512d)_mm512_permute4f128_ps((__m512)in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2));
+    };
+    static inline __m512d Permute1(__m512d in){
+      return _mm512_swizzle_pd(in,_MM_SWIZ_REG_BADC);
+    };
+    static inline __m512d Permute2(__m512d in){
+      return _mm512_swizzle_pd(in,_MM_SWIZ_REG_CDAB);
+    };
+    static inline __m512d Permute3(__m512d in){
+      return in;
+    };
+
+  };
+
   //////////////////////////////////////////////
@@ -315,25 +344,6 @@
   }
-
-
-  // Gpermute utilities consider coalescing into 1 Gpermute
-  template < typename VectorSIMD >
-    inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
-    union {
-      __m512 f;
-      decltype(VectorSIMD::v) v;
-    } conv;
-    conv.v = b.v;
-    switch(perm){
-    case 3: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); break;
-    case 2: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); break;
-    case 1 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
-    case 0 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
-    default: assert(0); break;
-    }
-    y.v=conv.v;
-  };
 
   // Function name aliases
   typedef Optimization::Vsplat   VsplatSIMD;
diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h
index 62516201..ecac68c4 100644
--- a/lib/simd/Grid_sse4.h
+++ b/lib/simd/Grid_sse4.h
@@ -151,10 +151,10 @@ namespace Optimization {
     // Complex float
     inline __m128 operator()(__m128 a, __m128 b){
       __m128 ymm0,ymm1,ymm2;
-      ymm0 = _mm_shuffle_ps(a,a,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
+      ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
       ymm0 = _mm_mul_ps(ymm0,b);                       // ymm0 <- ar bi, ar br
-      ymm1 = _mm_shuffle_ps(b,b,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
-      ymm2 = _mm_shuffle_ps(a,a,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
+      ymm1 = _mm_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi
+      ymm2 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
       ymm1 = _mm_mul_ps(ymm1,ymm2);                    // ymm1 <- br ai, ai bi
       return _mm_addsub_ps(ymm0,ymm1);
     }
@@ -201,7 +201,7 @@
     //Complex single
     inline __m128 operator()(__m128 in, __m128 ret){
       __m128 tmp =_mm_addsub_ps(_mm_setzero_ps(),in); // r,-i
-      return _mm_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
+      return _mm_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1));
     }
     //Complex double
     inline __m128d operator()(__m128d in, __m128d ret){
@@ -215,7 +215,7 @@
   struct TimesI{
     //Complex single
     inline __m128 operator()(__m128 in, __m128 ret){
-      __m128 tmp =_mm_shuffle_ps(in,in,_MM_SHUFFLE(2,3,0,1));
+      __m128 tmp =_mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
       return _mm_addsub_ps(_mm_setzero_ps(),tmp); // r,-i
     }
     //Complex double
@@ -225,27 +225,45 @@
     }
   };
 
+  struct Permute{
+
+    static inline __m128 Permute0(__m128 in){
+      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
+    };
+    static inline __m128 Permute1(__m128 in){
+      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
+    };
+    static inline __m128 Permute2(__m128 in){
+      return in;
+    };
+    static inline __m128 Permute3(__m128 in){
+      return in;
+    };
+
+    static inline __m128d Permute0(__m128d in){
+      return _mm_shuffle_pd(in,in,0x1);
+    };
+    static inline __m128d Permute1(__m128d in){
+      return in;
+    };
+    static inline __m128d Permute2(__m128d in){
+      return in;
+    };
+    static inline __m128d Permute3(__m128d in){
+      return in;
+    };
+
+  };
+
   //////////////////////////////////////////////
   // Some Template specialization
-  template < typename vtype >
-    void permute(vtype &a, vtype b, int perm) {
-      uconv conv;
-      conv.v = b;
-      switch(perm){
-      case 3: break; //empty for SSE4
-      case 2: break; //empty for SSE4
-      case 1: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
-      case 0: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
-      default: assert(0); break;
-      }
-      a=conv.v;
-    };
+
   //Complex float Reduce
   template<>
   inline Grid::ComplexF Reduce::operator()(__m128 in){
     __m128 v1; // two complex
-    Optimization::permute(v1,in,0);
+    v1= Optimization::Permute::Permute0(in);
     v1= _mm_add_ps(v1,in);
     u128f conv; conv.v=v1;
     return Grid::ComplexF(conv.f[0],conv.f[1]);
@@ -254,9 +272,9 @@
   template<>
   inline Grid::RealF Reduce::operator()(__m128 in){
     __m128 v1,v2; // quad single
-    Optimization::permute(v1,in,0);
+    v1= Optimization::Permute::Permute0(in);
     v1= _mm_add_ps(v1,in);
-    Optimization::permute(v2,v1,1);
+    v2= Optimization::Permute::Permute1(v1);
     v1 = _mm_add_ps(v1,v2);
     u128f conv; conv.v=v1;
     return conv.f[0];
@@ -274,7 +292,7 @@
   template<>
   inline Grid::RealD Reduce::operator()(__m128d in){
     __m128d v1;
-    Optimization::permute(v1,in,0); // avx 256; quad double
+    v1 = Optimization::Permute::Permute0(in);
     v1 = _mm_add_pd(v1,in);
     u128d conv; conv.v = v1;
     return conv.f[0];
@@ -302,14 +320,6 @@ namespace Grid {
   inline void prefetch_HINT_T0(const char *ptr){
     _mm_prefetch(ptr,_MM_HINT_T0);
   }
-
-
-  // Gpermute function
-  template < typename VectorSIMD >
-    inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
-    Optimization::permute(y.v,b.v,perm);
-  }
-
   // Function name aliases
   typedef Optimization::Vsplat   VsplatSIMD;
diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index 16308f53..6d8d79b8 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -251,15 +251,30 @@ namespace Grid {
     // all subtypes; may not be a good assumption, but could
     // add the vector width as a template param for BG/Q for example
     ////////////////////////////////////////////////////////////////////
+    friend inline void permute0(Grid_simd &y,Grid_simd b){
+      y.v = Optimization::Permute::Permute0(b.v);
+    }
+    friend inline void permute1(Grid_simd &y,Grid_simd b){
+      y.v = Optimization::Permute::Permute1(b.v);
+    }
+    friend inline void permute2(Grid_simd &y,Grid_simd b){
+      y.v = Optimization::Permute::Permute2(b.v);
+    }
+    friend inline void permute3(Grid_simd &y,Grid_simd b){
+      y.v = Optimization::Permute::Permute3(b.v);
+    }
     friend inline void permute(Grid_simd &y,Grid_simd b,int perm) {
-      Gpermute(y,b,perm);
+      if      (perm==3) permute3(y,b);
+      else if (perm==2) permute2(y,b);
+      else if (perm==1) permute1(y,b);
+      else if (perm==0) permute0(y,b);
     }
+
   };// end of Grid_simd class definition
-
   ///////////////////////
   // Splat
   ///////////////////////
diff --git a/lib/tensors/Tensor_class.h b/lib/tensors/Tensor_class.h
index 4f741f1d..bed4b11b 100644
--- a/lib/tensors/Tensor_class.h
+++ b/lib/tensors/Tensor_class.h
@@ -177,6 +177,7 @@ public:
       permute(out._internal[i],in._internal[i],permutetype);
     }
   }
+
   // Unary negation
   friend strong_inline iVector operator -(const iVector &r) {
     iVector ret;
@@ -290,12 +291,15 @@ public:
       vstream(out._internal[i][j],in._internal[i][j]);
     }}
   }
+
   friend strong_inline void permute(iMatrix &out,const iMatrix &in,int permutetype){
     for(int i=0;i
 operator -(const iMatrix &r) {
     iMatrix ret;
diff --git a/scripts/configure-commands b/scripts/configure-commands
index 0dbd5438..07506d27 100755
--- a/scripts/configure-commands
+++ b/scripts/configure-commands
@@ -35,10 +35,10 @@ icpc-avx-openmp-mpi)
 CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
 ;;
 icpc-avx-openmp)
-CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LIBS="-fopenmp -lgmp -lmpfr" --enable-comms=mpi
+CXX=icpc ../../configure --enable-precision=single --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LIBS="-fopenmp -lgmp -lmpfr" --enable-comms=mpi
 ;;
 icpc-avx2)
- CXX=icpc ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
+ CXX=icpc ../../configure --enable-simd=AVX2 CXXFLAGS="-march=core-avx2 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
 ;;
 icpc-avx512)
 CXX=icpc ../../configure --enable-simd=AVX512 CXXFLAGS="-xCOMMON-AVX512 -O3 -std=c++11" --host=none LIBS="-lgmp -lmpfr" --enable-comms=none
@@ -50,7 +50,7 @@ icpc-mic-avx512)
 CXX=icpc ../../configure --host=none --enable-simd=IMCI CXXFLAGS="-xCOMMON_AVX512 -O3 -std=c++11" LDFLAGS=-xCOMMON_AVX512 LIBS="-lgmp -lmpfr" --enable-comms=none
 ;;
 clang-sse)
-CXX=clang++ ../../configure --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
+CXX=clang++ ../../configure --enable-precision=single --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
 ;;
 clang-avx)
 CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
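
Note (editorial, not part of the patch): the change replaces the run-time switch in Gpermute/permute(y,b,perm) with four compile-time entry points Optimization::Permute::Permute0..Permute3, reached from the hand-unrolled Wilson kernels through token pasting in PERMUTE_DIR(dir), and replaces _MM_SHUFFLE with the locally defined _MM_SELECT_FOUR_FOUR selector macro. The sketch below is a minimal, scalar stand-in for that dispatch pattern; the toyvec type, the permuteN helpers taking explicit operands, and main() are invented for illustration and do not appear in Grid.

// Illustrative sketch only: mimics the new dispatch with a plain array
// instead of a __m128/__m256/__m512 register.
#include <array>
#include <cstdio>

#define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)|(B<<4)|(C<<2)|(D))

using toyvec = std::array<float,4>;  // stand-in for a 4-wide SIMD register

// Stand-ins for Optimization::Permute::Permute0..3: element exchanges of
// decreasing stride; Permute2/3 are identities, as in the SSE4 case.
static inline toyvec Permute0(toyvec in){ return {in[2],in[3],in[0],in[1]}; }
static inline toyvec Permute1(toyvec in){ return {in[1],in[0],in[3],in[2]}; }
static inline toyvec Permute2(toyvec in){ return in; }
static inline toyvec Permute3(toyvec in){ return in; }

// Same shape as the new Grid_simd friends: permuteN(y,b) writes permuted b into y.
static inline void permute0(toyvec &y, toyvec b){ y = Permute0(b); }
static inline void permute1(toyvec &y, toyvec b){ y = Permute1(b); }
static inline void permute2(toyvec &y, toyvec b){ y = Permute2(b); }
static inline void permute3(toyvec &y, toyvec b){ y = Permute3(b); }

// Token-pasting dispatch as in the new PERMUTE_DIR(dir) kernel macro; the
// direction index (T==0, Z==1, Y==2, X==3) is a literal, so no run-time
// switch survives in the kernel. (The real macro takes no operands and
// acts on the fixed Chi_xx registers.)
#define PERMUTE_DIR(dir,y,b) permute##dir(y,b)

int main(){
  toyvec chi = {0.f,1.f,2.f,3.f}, out;
  PERMUTE_DIR(0,out,chi);   // expands to permute0(out,chi): prints 2 3 0 1
  std::printf("%g %g %g %g\n", out[0],out[1],out[2],out[3]);
  // Selector packing: _MM_SELECT_FOUR_FOUR(2,3,0,1) == 0xB1, the immediate
  // used for the re/im pair swap in the complex-single TimesI paths.
  std::printf("0x%X\n", _MM_SELECT_FOUR_FOUR(2,3,0,1));
  return 0;
}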