diff --git a/lib/Grid_simd.h b/lib/Grid_simd.h
index cccc82e0..26edb4c9 100644
--- a/lib/Grid_simd.h
+++ b/lib/Grid_simd.h
@@ -95,6 +95,20 @@ namespace Grid {
   template<>            inline void zeroit(RealF &arg){ arg=0; };
   template<>            inline void zeroit(RealD &arg){ arg=0; };
   
+
+  //////////////////////////////////////////////////////////
+  // Permute
+  // Permute 0 every ABCDEFGH -> EFGH ABCD    (8 x 32-bit AVX case shown; perm 0 swaps the vector halves)
+  // Permute 1 every ABCDEFGH -> CD AB GH EF
+  // Permute 2 every ABCDEFGH -> BA DC FE HG  (larger perm values swap progressively finer blocks)
+  // Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
+  // Permute 4 possible on half precision @512bit vectors.
+  //
+  // Defined inside SIMD specialization files
+  //////////////////////////////////////////////////////////
+  template<class VectorSIMD>
+    inline void Gpermute(VectorSIMD &y,const VectorSIMD &b,int perm);
+
 };
 
 #include <simd/Grid_vector_types.h>
diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h
index f9fa1f85..ec2df44b 100644
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -4,7 +4,7 @@
 
   Using intrinsics
 */
-// Time-stamp: <2015-05-22 18:58:27 neo>
+// Time-stamp: <2015-05-27 12:07:15 neo>
 //----------------------------------------------------------------------
 
 #include <immintrin.h>
@@ -383,6 +383,30 @@ namespace Grid {
   typedef __m256d SIMD_Dtype; // Double precision type
   typedef __m256i SIMD_Itype; // Integer type
 
+  // Prefetching: for each 64-byte step i in [0,size), issue a T1 hint at ptr+i+4096 and a T0 hint at ptr+i+512.
+  inline void v_prefetch0(int size, const char *ptr){
+    for(int i=0;i<size;i+=64){ //  64-byte stride; TODO: define the L1 line size as a named constant above
+      _mm_prefetch(ptr+i+4096,_MM_HINT_T1); // look-ahead of 4096 bytes past the current offset
+      _mm_prefetch(ptr+i+512,_MM_HINT_T0); // look-ahead of 512 bytes past the current offset
+    }
+  }
+  
+  template < typename VectorSIMD > // AVX Gpermute: y.v = b.v with its 32-bit lanes rearranged as selected by perm
+    inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
+    union { // view the payload as __m256 so the single-precision shuffles can be applied to VectorSIMD::v
+      __m256 f;
+      decltype(VectorSIMD::v) v; // assumes VectorSIMD::v is a 256-bit register type -- TODO confirm
+    } conv;
+    conv.v = b.v;
+    switch(perm){
+    case 3: break; //empty for AVX1/2: b.v is copied to y.v unchanged
+    case 2: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break; // ABCD EFGH -> BADC FEHG
+    case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2));  break; // ABCD EFGH -> CDAB GHEF
+    case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break; // ABCD EFGH -> EFGH ABCD (swap 128-bit halves)
+    default: assert(0); break; // any other perm value trips the assert
+    }
+    y.v=conv.v;
+  };
 
   // Function name aliases
   typedef Optimization::Vsplat   VsplatSIMD;
diff --git a/lib/simd/Grid_knc.h b/lib/simd/Grid_knc.h
index daec3973..bb914270 100644
--- a/lib/simd/Grid_knc.h
+++ b/lib/simd/Grid_knc.h
@@ -4,7 +4,7 @@
 
   Using intrinsics
 */
-// Time-stamp: <2015-05-22 17:12:44 neo>
+// Time-stamp: <2015-05-27 12:08:50 neo>
 //----------------------------------------------------------------------
 
 #include <immintrin.h>
@@ -302,7 +302,32 @@ namespace Grid {
   typedef __m512d SIMD_Dtype; // Double precision type
   typedef __m512i SIMD_Itype; // Integer type
 
+  // Prefetch: for each 64-byte step i in [0,size), issue a T1 hint at ptr+i+4096 and a T0 hint at ptr+i+512.
+  inline void v_prefetch0(int size, const char *ptr){
+    for(int i=0;i<size;i+=64){ //  64-byte stride; TODO: define the L1 line size as a named constant above
+      _mm_prefetch(ptr+i+4096,_MM_HINT_T1); // look-ahead of 4096 bytes past the current offset
+      _mm_prefetch(ptr+i+512,_MM_HINT_T0); // look-ahead of 512 bytes past the current offset
+    }
+  }
 
+  // KNC Gpermute: y.v = b.v with its 32-bit lanes rearranged as selected by perm (0..3). Consider coalescing the per-arch versions into 1 Gpermute.
+  template < typename VectorSIMD > 
+    inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
+    union { // view the payload as __m512 so the single-precision permutes can be applied to VectorSIMD::v
+      __m512 f;
+      decltype(VectorSIMD::v) v; // assumes VectorSIMD::v is a 512-bit register type -- TODO confirm
+    } conv;
+    conv.v = b.v;
+    switch(perm){
+    case 3:  conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); break; // swap adjacent 32-bit elements within each group of four
+    case 2:  conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); break; // swap the two element pairs within each group of four
+    case 1 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break; // 128-bit lanes 0123 -> 1032
+    case 0 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break; // 128-bit lanes 0123 -> 2301 (swap 256-bit halves)
+    default: assert(0); break; // any other perm value trips the assert
+    }
+    y.v=conv.v; // store into the payload member, mirroring conv.v = b.v and the AVX/SSE4 versions (was y=conv.v)
+  };
+  
   // Function name aliases
   typedef Optimization::Vsplat   VsplatSIMD;
   typedef Optimization::Vstore   VstoreSIMD;
diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h
index dc0251f8..2bd5a20e 100644
--- a/lib/simd/Grid_qpx.h
+++ b/lib/simd/Grid_qpx.h
@@ -4,7 +4,7 @@
 
   Using intrinsics
 */
-// Time-stamp: <2015-05-22 17:29:26 neo>
+// Time-stamp: <2015-05-27 11:30:21 neo>
 //----------------------------------------------------------------------
 
 // lot of undefined functions
@@ -251,6 +251,7 @@ namespace Grid {
   typedef vector4double SIMD_Dtype; // Double precision type
   typedef int SIMD_Itype;           // Integer type
 
+  inline void v_prefetch0(int size, const char *ptr){}; // QPX stub: empty body, both arguments are ignored
 
   // Function name aliases
   typedef Optimization::Vsplat   VsplatSIMD;
diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h
index eb930003..ce2737d5 100644
--- a/lib/simd/Grid_sse4.h
+++ b/lib/simd/Grid_sse4.h
@@ -4,7 +4,7 @@
 
   Using intrinsics
 */
-// Time-stamp: <2015-05-21 18:06:30 neo>
+// Time-stamp: <2015-05-27 12:02:07 neo>
 //----------------------------------------------------------------------
 
 #include <pmmintrin.h>
@@ -221,7 +221,7 @@ namespace Optimization {
   };
 
 
-  
+
 
 
   //////////////////////////////////////////////
@@ -277,6 +277,10 @@ namespace Optimization {
     assert(0);
   }
   
+
+
+
+
   
 }
 
@@ -289,6 +293,28 @@ namespace Grid {
   typedef __m128i SIMD_Itype; // Integer type
 
 
+  inline void v_prefetch0(int size, const char *ptr){};  // prefetch utilities: SSE4 stub, empty body, both arguments are ignored
+
+  // Gpermute function (SSE4): y.v = b.v with its four 32-bit lanes rearranged as selected by perm
+  template < typename VectorSIMD > 
+    inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
+    union { // view the payload as __m128 so the single-precision shuffles can be applied to VectorSIMD::v
+      __m128 f;
+      decltype(VectorSIMD::v) v; // assumes VectorSIMD::v is a 128-bit register type -- TODO confirm
+    } conv;
+    conv.v = b.v;
+    switch(perm){
+    case 3: break; //empty for SSE4: b.v is copied to y.v unchanged
+    case 2: break; //empty for SSE4: b.v is copied to y.v unchanged
+    case 1: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break; // ABCD -> BADC
+    case 0: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break; // ABCD -> CDAB
+    default: assert(0); break; // any other perm value trips the assert
+    }
+    y.v=conv.v;
+  }; 
+  
+
+
   // Function name aliases
   typedef Optimization::Vsplat   VsplatSIMD;
   typedef Optimization::Vstore   VstoreSIMD;
@@ -296,6 +322,8 @@ namespace Grid {
   typedef Optimization::Vstream  VstreamSIMD;
   template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
 
+ 
+
 
   // Arithmetic operations
   typedef Optimization::Sum         SumSIMD;
diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index 41706906..0bedf32f 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -2,7 +2,7 @@
 /*! @file Grid_vector_types.h
   @brief Defines templated class Grid_simd to deal with inner vector types
 */
-// Time-stamp: <2015-05-26 14:08:13 neo>
+// Time-stamp: <2015-05-27 12:04:06 neo>
 //---------------------------------------------------------------------------
 #ifndef GRID_VECTOR_TYPES
 #define GRID_VECTOR_TYPES
@@ -58,56 +58,7 @@ namespace Grid {
   ///////////////////////////////////////////////
 
 
-  // Move to the simd files
-//////////////////////////////////////////////////////////
-// Permute
-// Permute 0 every ABCDEFGH -> BA DC FE HG
-// Permute 1 every ABCDEFGH -> CD AB GH EF
-// Permute 2 every ABCDEFGH -> EFGH ABCD
-// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
-// Permute 4 possible on half precision @512bit vectors.
-//////////////////////////////////////////////////////////
-template<class vsimd>
-inline void Gpermute(vsimd &y,const vsimd &b,int perm){
-	union { 
-	  SIMD_Ftype f;
-	  decltype(vsimd::v) v;
-	} conv;
-	conv.v = b.v;
-      switch (perm){
-#if defined(AVX1)||defined(AVX2)
-      // 8x32 bits=>3 permutes
-      case 2: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
-      case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
-      case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break;
-#endif
-#ifdef SSE4
-      case 1: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
-      case 0: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2));break;
-#endif
-#ifdef AVX512
-	// 16 floats=> permutes
-        // Permute 0 every abcd efgh ijkl mnop -> badc fehg jilk nmpo 
-        // Permute 1 every abcd efgh ijkl mnop -> cdab ghef jkij opmn 
-        // Permute 2 every abcd efgh ijkl mnop -> efgh abcd mnop ijkl
-        // Permute 3 every abcd efgh ijkl mnop -> ijkl mnop abcd efgh
-      case 3: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); break;
-      case 2: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); break;
-      case 1: conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
-      case 0: conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
-#endif
-#ifdef QPX
-#error not implemented
-#endif
-      default: assert(0); break;
-      }
-      y.v=conv.v;
-
- };
-
-///////////////////////////////////////
-
-
+  
 
   /*
     @brief Grid_simd class for the SIMD vector type operations