Updating to modify non-inlining permute routines and hopefully get better reg use and

enhance performance.
2025-10-23 09:14:48 +01:00 · 2015-09-25 08:55:04 -07:00
parent 5ef42add2d
commit 64d64d1ab6
11 changed files with 212 additions and 141 deletions
--- a/lib/Algorithms.h
+++ b/lib/Algorithms.h
@@ -18,8 +18,8 @@
 #include <algorithms/iterative/ConjugateGradientMultiShift.h>

 // Lanczos support
-#include <algorithms/iterative/MatrixUtils.h>
-#include <algorithms/iterative/ImplicitlyRestartedLanczos.h>
+//#include <algorithms/iterative/MatrixUtils.h>
+//#include <algorithms/iterative/ImplicitlyRestartedLanczos.h>

 #include <algorithms/CoarsenedMatrix.h>

--- a/lib/Init.cc
+++ b/lib/Init.cc
@@ -178,6 +178,7 @@ void Grid_init(int *argc,char ***argv)
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
    QCD::WilsonFermionStatic::HandOptDslash=1;
+    QCD::WilsonFermion5DStatic::HandOptDslash=1;
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
    LebesgueOrder::UseLebesgueOrder=1;
--- a/lib/Simd.h
+++ b/lib/Simd.h
@@ -13,6 +13,11 @@

 typedef uint32_t Integer;

+#define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)|(B<<4)|(C<<2)|(D))
+#define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H) ((A<<7)|(B<<6)|(C<<5)|(D<<4)|(E<<3)|(F<<2)|(G<<4)|(H))
+#define _MM_SELECT_FOUR_TWO (A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)
+#define _MM_SELECT_TWO_TWO  (A,B)     _MM_SELECT_FOUR_TWO(0,0,A,B)
+
 namespace Grid {

  typedef  float  RealF;
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -56,13 +56,13 @@
    UChi_02+= U_20*Chi_02;\
    UChi_12+= U_20*Chi_12;

-#define PERMUTE\
-      permute(Chi_00,Chi_00,ptype);\
-      permute(Chi_01,Chi_01,ptype);\
-      permute(Chi_02,Chi_02,ptype);\
-      permute(Chi_10,Chi_10,ptype);\
-      permute(Chi_11,Chi_11,ptype);\
-      permute(Chi_12,Chi_12,ptype);
+#define PERMUTE_DIR(dir)			\
+      permute##dir(Chi_00,Chi_00);\
+      permute##dir(Chi_01,Chi_01);\
+      permute##dir(Chi_02,Chi_02);\
+      permute##dir(Chi_10,Chi_10);\
+      permute##dir(Chi_11,Chi_11);\
+      permute##dir(Chi_12,Chi_12);

 //      hspin(0)=fspin(0)+timesI(fspin(3));
 //      hspin(1)=fspin(1)+timesI(fspin(2));
@@ -286,6 +286,10 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
 					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					       int ss,int sU,const FermionField &in, FermionField &out)
 {
+  //  std::cout << "Hand op Dhop "<<std::endl;
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+
  REGISTER Simd result_00; // 12 regs on knc
  REGISTER Simd result_01;
  REGISTER Simd result_02;
@@ -352,7 +356,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
    LOAD_CHIMU;
    XP_PROJ;
    if ( perm) {
-      PERMUTE;
+      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
    }
  } else { 
    LOAD_CHI;
@@ -373,7 +377,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
    LOAD_CHIMU;
    YP_PROJ;
    if ( perm) {
-      PERMUTE;
+      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
    }
  } else { 
    LOAD_CHI;
@@ -394,7 +398,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
    LOAD_CHIMU;
    ZP_PROJ;
    if ( perm) {
-      PERMUTE;
+      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
    }
  } else { 
    LOAD_CHI;
@@ -414,7 +418,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
    LOAD_CHIMU;
    TP_PROJ;
    if ( perm) {
-      PERMUTE;
+      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
    }
  } else { 
    LOAD_CHI;
@@ -434,7 +438,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
    LOAD_CHIMU;
    XM_PROJ;
    if ( perm) {
-      PERMUTE;
+      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
    }
  } else { 
    LOAD_CHI;
@@ -454,7 +458,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
    LOAD_CHIMU;
    YM_PROJ;
    if ( perm) {
-      PERMUTE;
+      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
    }
  } else { 
    LOAD_CHI;
@@ -474,7 +478,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
    LOAD_CHIMU;
    ZM_PROJ;
    if ( perm) {
-      PERMUTE;
+      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
    }
  } else { 
    LOAD_CHI;
@@ -494,7 +498,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
    LOAD_CHIMU;
    TM_PROJ;
    if ( perm) {
-      PERMUTE;
+      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
    }
  } else { 
    LOAD_CHI;
@@ -526,6 +530,9 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
 						   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 						   int ss,int sU,const FermionField &in, FermionField &out)
 {
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+
  REGISTER Simd result_00; // 12 regs on knc
  REGISTER Simd result_01;
  REGISTER Simd result_02;
@@ -592,7 +599,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
    LOAD_CHIMU;
    XM_PROJ;
    if ( perm) {
-      PERMUTE;
+      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
    }
  } else { 
    LOAD_CHI;
@@ -612,7 +619,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
    LOAD_CHIMU;
    YM_PROJ;
    if ( perm) {
-      PERMUTE;
+      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
    }
  } else { 
    LOAD_CHI;
@@ -633,7 +640,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
    LOAD_CHIMU;
    ZM_PROJ;
    if ( perm) {
-      PERMUTE;
+      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
    }
  } else { 
    LOAD_CHI;
@@ -653,7 +660,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
    LOAD_CHIMU;
    TM_PROJ;
    if ( perm) {
-      PERMUTE;
+      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
    }
  } else { 
    LOAD_CHI;
@@ -673,7 +680,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
    LOAD_CHIMU;
    XP_PROJ;
    if ( perm) {
-      PERMUTE;
+      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
    }
  } else { 
    LOAD_CHI;
@@ -694,7 +701,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
    LOAD_CHIMU;
    YP_PROJ;
    if ( perm) {
-      PERMUTE;
+      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
    }
  } else { 
    LOAD_CHI;
@@ -714,7 +721,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
    LOAD_CHIMU;
    ZP_PROJ;
    if ( perm) {
-      PERMUTE;
+      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
    }
  } else { 
    LOAD_CHI;
@@ -734,7 +741,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
    LOAD_CHIMU;
    TP_PROJ;
    if ( perm) {
-      PERMUTE;
+      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
    }
  } else { 
    LOAD_CHI;
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -183,11 +183,11 @@ namespace Optimization {
    // Complex float
    inline __m256 operator()(__m256 a, __m256 b){
      __m256 ymm0,ymm1,ymm2;
-      ymm0 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
+      ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
      ymm0 = _mm256_mul_ps(ymm0,b);                       // ymm0 <- ar bi, ar br
      // FIXME AVX2 could MAC
-      ymm1 = _mm256_shuffle_ps(b,b,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
-      ymm2 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
+      ymm1 = _mm256_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi
+      ymm2 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
      ymm1 = _mm256_mul_ps(ymm1,ymm2);                    // ymm1 <- br ai, ai bi
      return _mm256_addsub_ps(ymm0,ymm1);  
    }
@@ -270,7 +270,7 @@ namespace Optimization {
    //Complex single
    inline __m256 operator()(__m256 in, __m256 ret){
      __m256 tmp =_mm256_addsub_ps(_mm256_setzero_ps(),in);   // r,-i
-      return _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1)); //-i,r
+      return _mm256_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //-i,r
    }
    //Complex double
    inline __m256d operator()(__m256d in, __m256d ret){
@@ -282,7 +282,7 @@ namespace Optimization {
  struct TimesI{
    //Complex single
    inline __m256 operator()(__m256 in, __m256 ret){
-      __m256 tmp =_mm256_shuffle_ps(in,in,_MM_SHUFFLE(2,3,0,1)); // i,r
+      __m256 tmp =_mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // i,r
      return _mm256_addsub_ps(_mm256_setzero_ps(),tmp);          // i,-r
    }
    //Complex double
@@ -296,27 +296,44 @@ namespace Optimization {
  // Some Template specialization
  //////////////////////////////////////////////

-  template < typename vtype > 
-    void permute(vtype &a,vtype b, int perm) {
-    uconv<vtype> conv;
-    conv.v = b;
-    switch (perm){
-      // 8x32 bits=>3 permutes
-    case 2: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
-    case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
-    case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break;
-    default: assert(0); break;
-    }
-    a = conv.v;
-  }
+  struct Permute{
+
+    static inline __m256 Permute0(__m256 in){
+      return _mm256_permute2f128_ps(in,in,0x01);
+    };
+    static inline __m256 Permute1(__m256 in){
+      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
+    };
+    static inline __m256 Permute2(__m256 in){
+      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
+    };
+    static inline __m256 Permute3(__m256 in){
+      return in;
+    };
+
+    static inline __m256d Permute0(__m256d in){
+      return _mm256_permute2f128_pd(in,in,0x01);
+    };
+    static inline __m256d Permute1(__m256d in){
+      return _mm256_shuffle_pd(in,in,0x5);
+    };
+    static inline __m256d Permute2(__m256d in){
+      return in;
+    };
+    static inline __m256d Permute3(__m256d in){
+      return in;
+    };
+
+  };
+

  //Complex float Reduce
  template<>
    inline Grid::ComplexF Reduce<Grid::ComplexF, __m256>::operator()(__m256 in){
    __m256 v1,v2;
-    Optimization::permute(v1,in,0); // avx 256; quad complex single
-    v1 = _mm256_add_ps(v1,in);
-    Optimization::permute(v2,v1,1); 
+    v1=Optimization::Permute::Permute0(in); // avx 256; quad complex single
+    v1= _mm256_add_ps(v1,in);
+    v2=Optimization::Permute::Permute1(v1); 
    v1 = _mm256_add_ps(v1,v2);
    u256f conv; conv.v = v1;
    return Grid::ComplexF(conv.f[0],conv.f[1]);
@@ -326,11 +343,11 @@ namespace Optimization {
  template<>
  inline Grid::RealF Reduce<Grid::RealF, __m256>::operator()(__m256 in){
    __m256 v1,v2;
-    Optimization::permute(v1,in,0); // avx 256; octo-double
+    v1 = Optimization::Permute::Permute0(in); // avx 256; octo-double
    v1 = _mm256_add_ps(v1,in);
-    Optimization::permute(v2,v1,1); 
+    v2 = Optimization::Permute::Permute1(v1); 
    v1 = _mm256_add_ps(v1,v2);
-    Optimization::permute(v2,v1,2); 
+    v2 = Optimization::Permute::Permute2(v1); 
    v1 = _mm256_add_ps(v1,v2);
    u256f conv; conv.v=v1;
    return conv.f[0];
@@ -341,7 +358,7 @@ namespace Optimization {
  template<>
  inline Grid::ComplexD Reduce<Grid::ComplexD, __m256d>::operator()(__m256d in){
    __m256d v1;
-    Optimization::permute(v1,in,0); // sse 128; paired complex single
+    v1 = Optimization::Permute::Permute0(in); // sse 128; paired complex single
    v1 = _mm256_add_pd(v1,in);
    u256d conv; conv.v = v1;
    return Grid::ComplexD(conv.f[0],conv.f[1]);
@@ -351,9 +368,9 @@ namespace Optimization {
  template<>
  inline Grid::RealD Reduce<Grid::RealD, __m256d>::operator()(__m256d in){
    __m256d v1,v2;
-    Optimization::permute(v1,in,0); // avx 256; quad double
+    v1 = Optimization::Permute::Permute0(in); // avx 256; quad double
    v1 = _mm256_add_pd(v1,in);
-    Optimization::permute(v2,v1,1); 
+    v2 = Optimization::Permute::Permute1(v1); 
    v1 = _mm256_add_pd(v1,v2);
    u256d conv; conv.v = v1;
    return conv.f[0];
@@ -387,13 +404,6 @@ namespace Grid {
    _mm_prefetch(ptr,_MM_HINT_T0);
  }

-
-
-  template < typename VectorSIMD > 
-    inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
-    Optimization::permute(y.v,b.v,perm);
-  };
-
  // Function name aliases
  typedef Optimization::Vsplat   VsplatSIMD;
  typedef Optimization::Vstore   VstoreSIMD;
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -174,7 +174,7 @@ namespace Optimization {
    inline __m512d operator()(__m512d a, __m512d b){
      __m512d a_real = _mm512_shuffle_pd( a, a, 0x00 );
      __m512d a_imag = _mm512_shuffle_pd( a, a, 0xFF );
-      a_imag = _mm512_mul_pd( a_imag, _mm512_permute_pd( b, 0x55 ) );
+      a_imag = _mm512_mul_pd( a_imag, _mm512_permute_pd( b, 0x55 ) ); 
      return _mm512_fmaddsub_pd( a_real, b, a_imag );
    }
  };
@@ -211,26 +211,24 @@ namespace Optimization {
    //Complex single
    inline __m512 operator()(__m512 in, __m512 ret){
      __m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag 
-      return _mm512_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2));
+      return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2));   // 0x4E??
    }
    //Complex double
    inline __m512d operator()(__m512d in, __m512d ret){
      __m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag 
-      return _mm512_shuffle_pd(tmp,tmp,_MM_SHUFFLE(1,0,3,2));
-    }
-
-
+      return _mm512_shuffle_pd(tmp,tmp,0x55);
+    } 
  };

  struct TimesI{
    //Complex single
    inline __m512 operator()(__m512 in, __m512 ret){
-      __m512 tmp = _mm512_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2));
+      __m512 tmp = _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2));
      return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp); 
    }
    //Complex double
    inline __m512d operator()(__m512d in, __m512d ret){
-      __m512d tmp = _mm512_shuffle_pd(tmp,tmp,_MM_SHUFFLE(1,0,3,2));
+      __m512d tmp = _mm512_shuffle_pd(tmp,tmp,0x55);
      return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp); 
    }

@@ -239,6 +237,36 @@ namespace Optimization {


  
+  // Gpermute utilities consider coalescing into 1 Gpermute
+  struct Permute{
+    
+    static inline __m512 Permute0(__m512 in){
+      return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
+    };
+    static inline __m512 Permute1(__m512 in){
+      return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
+    };
+    static inline __m512 Permute2(__m512 in){
+      return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
+    };
+    static inline __m512 Permute3(__m512 in){
+      return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
+    };
+
+    static inline __m512d Permute0(__m512d in){
+      return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
+    };
+    static inline __m512d Permute1(__m512d in){
+      return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
+    };
+    static inline __m512d Permute2(__m512d in){
+      return _mm512_shuffle_pd(in,in,0x55);
+    };
+    static inline __m512d Permute3(__m512d in){
+      return in;
+    };
+
+  };


  //////////////////////////////////////////////
@@ -298,25 +326,6 @@ namespace Grid {
  }


-
-
-  // Gpermute utilities consider coalescing into 1 Gpermute
-  template < typename VectorSIMD > 
-    inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
-    union { 
-      __m512 f;
-      decltype(VectorSIMD::v) v;
-    } conv;
-    conv.v = b.v;
-    switch(perm){
-    case 3 : conv.f = _mm512_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
-    case 2 : conv.f = _mm512_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
-    case 1 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
-    case 0 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
-    default: assert(0); break;
-    }
-    y.v=conv.v;
-  };
  
  // Function name aliases
  typedef Optimization::Vsplat   VsplatSIMD;
--- a/lib/simd/Grid_imci.h
+++ b/lib/simd/Grid_imci.h
@@ -255,7 +255,36 @@ namespace Optimization {
  };


-  
+   struct Permute{
+    
+    static inline __m512 Permute0(__m512 in){
+      return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2));
+    };
+    static inline __m512 Permute1(__m512 in){
+      return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(2,3,0,1));
+    };
+    static inline __m512 Permute2(__m512 in){
+      return _mm512_swizzle_ps(in,_MM_SWIZ_REG_BADC);
+    };
+    static inline __m512 Permute3(__m512 in){
+      return _mm512_swizzle_ps(in,_MM_SWIZ_REG_CDAB); 
+    };
+
+    static inline __m512d Permute0(__m512d in){// Hack no intrinsic for 256 swaps of __m512d
+      return (__m512d)_mm512_permute4f128_ps((__m512)in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2));
+    };
+    static inline __m512d Permute1(__m512d in){
+      return _mm512_swizzle_pd(in,_MM_SWIZ_REG_BADC);
+    };
+    static inline __m512d Permute2(__m512d in){
+      return _mm512_swizzle_pd(in,_MM_SWIZ_REG_CDAB);
+    };
+    static inline __m512d Permute3(__m512d in){
+      return in;
+    };
+
+  };
+ 


  //////////////////////////////////////////////
@@ -315,25 +344,6 @@ namespace Grid {
  }


-
-
-  // Gpermute utilities consider coalescing into 1 Gpermute
-  template < typename VectorSIMD > 
-    inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
-    union { 
-      __m512 f;
-      decltype(VectorSIMD::v) v;
-    } conv;
-    conv.v = b.v;
-    switch(perm){
-    case 3:  conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); break;
-    case 2:  conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); break; 
-    case 1 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
-    case 0 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
-    default: assert(0); break;
-    }
-    y.v=conv.v;
-  };
  
  // Function name aliases
  typedef Optimization::Vsplat   VsplatSIMD;
--- a/lib/simd/Grid_sse4.h
+++ b/lib/simd/Grid_sse4.h
@@ -151,10 +151,10 @@ namespace Optimization {
    // Complex float
    inline __m128 operator()(__m128 a, __m128 b){
      __m128 ymm0,ymm1,ymm2;
-      ymm0 = _mm_shuffle_ps(a,a,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
+      ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
      ymm0 = _mm_mul_ps(ymm0,b);                       // ymm0 <- ar bi, ar br
-      ymm1 = _mm_shuffle_ps(b,b,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
-      ymm2 = _mm_shuffle_ps(a,a,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
+      ymm1 = _mm_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi
+      ymm2 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
      ymm1 = _mm_mul_ps(ymm1,ymm2);                    // ymm1 <- br ai, ai bi
      return _mm_addsub_ps(ymm0,ymm1);    
    }
@@ -201,7 +201,7 @@ namespace Optimization {
    //Complex single
    inline __m128 operator()(__m128 in, __m128 ret){
      __m128 tmp =_mm_addsub_ps(_mm_setzero_ps(),in); // r,-i
-      return _mm_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
+      return _mm_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1));
    }
    //Complex double
    inline __m128d operator()(__m128d in, __m128d ret){
@@ -215,7 +215,7 @@ namespace Optimization {
  struct TimesI{
    //Complex single
    inline __m128 operator()(__m128 in, __m128 ret){
-      __m128 tmp =_mm_shuffle_ps(in,in,_MM_SHUFFLE(2,3,0,1));
+      __m128 tmp =_mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
      return _mm_addsub_ps(_mm_setzero_ps(),tmp); // r,-i
    }
    //Complex double
@@ -225,27 +225,45 @@ namespace Optimization {
    }
  };

+  struct Permute{
+
+    static inline __m128 Permute0(__m128 in){
+      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
+    };
+    static inline __m128 Permute1(__m128 in){
+      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
+    };
+    static inline __m128 Permute2(__m128 in){
+      return in;
+    };
+    static inline __m128 Permute3(__m128 in){
+      return in;
+    };
+
+    static inline __m128d Permute0(__m128d in){
+      return _mm_shuffle_pd(in,in,0x1);
+    };
+    static inline __m128d Permute1(__m128d in){
+      return in;
+    };
+    static inline __m128d Permute2(__m128d in){
+      return in;
+    };
+    static inline __m128d Permute3(__m128d in){
+      return in;
+    };
+
+  };
+
  //////////////////////////////////////////////
  // Some Template specialization
-  template < typename vtype > 
-    void permute(vtype &a, vtype b, int perm) {
-    uconv<vtype> conv; 
-    conv.v = b;
-    switch(perm){
-    case 3: break; //empty for SSE4
-    case 2: break; //empty for SSE4
-    case 1: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
-    case 0: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
-    default: assert(0); break;
-    }
-    a=conv.v;
-  }; 
+

  //Complex float Reduce
  template<>
  inline Grid::ComplexF Reduce<Grid::ComplexF, __m128>::operator()(__m128 in){
    __m128 v1; // two complex
-    Optimization::permute(v1,in,0); 
+    v1= Optimization::Permute::Permute0(in); 
    v1= _mm_add_ps(v1,in);
    u128f conv;    conv.v=v1;
    return Grid::ComplexF(conv.f[0],conv.f[1]);
@@ -254,9 +272,9 @@ namespace Optimization {
  template<>
  inline Grid::RealF Reduce<Grid::RealF, __m128>::operator()(__m128 in){
    __m128 v1,v2; // quad single
-    Optimization::permute(v1,in,0); 
+    v1= Optimization::Permute::Permute0(in); 
    v1= _mm_add_ps(v1,in);
-    Optimization::permute(v2,v1,1); 
+    v2= Optimization::Permute::Permute1(v1); 
    v1 = _mm_add_ps(v1,v2);
    u128f conv; conv.v=v1;
    return conv.f[0];
@@ -274,7 +292,7 @@ namespace Optimization {
  template<>
  inline Grid::RealD Reduce<Grid::RealD, __m128d>::operator()(__m128d in){
    __m128d v1;
-    Optimization::permute(v1,in,0); // avx 256; quad double
+    v1 = Optimization::Permute::Permute0(in); 
    v1 = _mm_add_pd(v1,in);
    u128d conv; conv.v = v1;
    return conv.f[0];
@@ -302,14 +320,6 @@ namespace Grid {
  inline void prefetch_HINT_T0(const char *ptr){
    _mm_prefetch(ptr,_MM_HINT_T0);
  }
-  
-
-  // Gpermute function
-  template < typename VectorSIMD > 
-    inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
-    Optimization::permute(y.v,b.v,perm);
-  }
-

  // Function name aliases
  typedef Optimization::Vsplat   VsplatSIMD;
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -251,15 +251,30 @@ namespace Grid {
    // all subtypes; may not be a good assumption, but could
    // add the vector width as a template param for BG/Q for example
    ////////////////////////////////////////////////////////////////////
+    friend inline void permute0(Grid_simd &y,Grid_simd b){
+      y.v = Optimization::Permute::Permute0(b.v);
+    }
+    friend inline void permute1(Grid_simd &y,Grid_simd b){
+      y.v = Optimization::Permute::Permute1(b.v);
+    }
+    friend inline void permute2(Grid_simd &y,Grid_simd b){
+      y.v = Optimization::Permute::Permute2(b.v);
+    }
+    friend inline void permute3(Grid_simd &y,Grid_simd b){
+      y.v = Optimization::Permute::Permute3(b.v);
+    }
    friend inline void permute(Grid_simd &y,Grid_simd b,int perm)
    {
-      Gpermute<Grid_simd>(y,b,perm);
+      if      (perm==3) permute3(y,b);
+      else if (perm==2) permute2(y,b);
+      else if (perm==1) permute1(y,b);
+      else if (perm==0) permute0(y,b);
    }

+
    
  };// end of Grid_simd class definition 

-
  ///////////////////////
  // Splat
  ///////////////////////
--- a/lib/tensors/Tensor_class.h
+++ b/lib/tensors/Tensor_class.h
@@ -177,6 +177,7 @@ public:
      permute(out._internal[i],in._internal[i],permutetype);
    }
  }
+
  // Unary negation
  friend strong_inline iVector<vtype,N> operator -(const iVector<vtype,N> &r) {
    iVector<vtype,N> ret;
@@ -290,12 +291,15 @@ public:
 	vstream(out._internal[i][j],in._internal[i][j]);
      }}
    }
+
  friend strong_inline void permute(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in,int permutetype){
    for(int i=0;i<N;i++){
      for(int j=0;j<N;j++){
 	permute(out._internal[i][j],in._internal[i][j],permutetype);
    }}
  }
+
+
  // Unary negation
  friend strong_inline iMatrix<vtype,N> operator -(const iMatrix<vtype,N> &r) {
    iMatrix<vtype,N> ret;
--- a/scripts/configure-commands
+++ b/scripts/configure-commands
@@ -35,10 +35,10 @@ icpc-avx-openmp-mpi)
 CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
 ;;
 icpc-avx-openmp)
-CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LIBS="-fopenmp -lgmp -lmpfr" --enable-comms=mpi
+CXX=icpc ../../configure --enable-precision=single --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LIBS="-fopenmp -lgmp -lmpfr" --enable-comms=mpi
 ;;
 icpc-avx2)
-  CXX=icpc ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
+  CXX=icpc ../../configure --enable-simd=AVX2 CXXFLAGS="-march=core-avx2 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
  ;;
 icpc-avx512)
  CXX=icpc ../../configure --enable-simd=AVX512 CXXFLAGS="-xCOMMON-AVX512 -O3  -std=c++11" --host=none  LIBS="-lgmp -lmpfr" --enable-comms=none
@@ -50,7 +50,7 @@ icpc-mic-avx512)
  CXX=icpc ../../configure --host=none --enable-simd=IMCI CXXFLAGS="-xCOMMON_AVX512 -O3  -std=c++11" LDFLAGS=-xCOMMON_AVX512 LIBS="-lgmp -lmpfr" --enable-comms=none
  ;;
 clang-sse)
-CXX=clang++ ../../configure --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11"  LIBS="-lgmp -lmpfr" --enable-comms=none
+CXX=clang++ ../../configure --enable-precision=single --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11"  LIBS="-lgmp -lmpfr" --enable-comms=none
  ;;
 clang-avx)
 CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11"  LIBS="-lgmp -lmpfr" --enable-comms=none