From 933c54d9c419183c934b0b23b819c3a083d8af90 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 16 Apr 2015 14:47:28 +0100
Subject: [PATCH] Improving the trace support to support any index tracing and
 simplifying implmentation in some ways

---
 Grid_QCD.h              |  10 +-
 Grid_config.h           |   4 +-
 Grid_main.cc            |  21 +++-
 Grid_math_type_mapper.h |  41 +++++---
 Grid_math_types.h       | 224 ++++++++++++++++++++++++++++++++++++++--
 Grid_simd.h             |  24 +++--
 Grid_vComplexD.h        |  18 ++--
 Grid_vComplexF.h        |  20 ++--
 Grid_vInteger.h         |  14 +--
 Grid_vRealD.h           |  14 +--
 Grid_vRealF.h           |  14 +--
 test_Grid_stencil.cc    |   4 +-
 12 files changed, 321 insertions(+), 87 deletions(-)
diff --git a/Grid_QCD.h b/Grid_QCD.h
index 72a8ee67..2f411f16 100644
--- a/Grid_QCD.h
+++ b/Grid_QCD.h
@@ -7,12 +7,11 @@ namespace QCD {
     static const int Ns=4;
     static const int CbRed  =0;
     static const int CbBlack=1;
-
+    
     // QCD iMatrix types
     template<typename vtype> using iSinglet          = iScalar<iScalar<vtype> > ;
     template<typename vtype> using iSpinMatrix       = iMatrix<iScalar<vtype>, Ns>;
     template<typename vtype> using iSpinColourMatrix = iMatrix<iMatrix<vtype, Nc>, Ns>;
-
     template<typename vtype> using iColourMatrix     = iScalar<iMatrix<vtype, Nc>> ;
 
     template<typename vtype> using iSpinVector       = iVector<iScalar<vtype>, Ns>;
@@ -20,10 +19,8 @@ namespace QCD {
     template<typename vtype> using iSpinColourVector = iVector<iVector<vtype, Nc>, Ns>;
 
     typedef iSinglet<Complex >          TComplex;    // This is painful. Tensor singlet complex type.
-    typedef iSinglet<vComplex >         vTComplex;
-    typedef iSinglet<Real >             TReal;    // This is painful. Tensor singlet complex type.
-
-
+    typedef iSinglet<vComplex >         vTComplex;   // what if we don't know the tensor structure
+    typedef iSinglet<Real >             TReal;       // Shouldn't need these.
     typedef iSinglet<vInteger >         vTInteger;
 
     typedef iSpinMatrix<Complex >       SpinMatrix;
@@ -44,7 +41,6 @@ namespace QCD {
     typedef iSpinColourVector<vComplex > vSpinColourVector;
     
     typedef Lattice<vTComplex>            LatticeComplex;
-
     typedef Lattice<vInteger>            LatticeInteger; // Predicates for "where"
     
     typedef Lattice<vColourMatrix>     LatticeColourMatrix;
diff --git a/Grid_config.h b/Grid_config.h
index bb885708..5faa2157 100644
--- a/Grid_config.h
+++ b/Grid_config.h
@@ -73,8 +73,8 @@
 /* Define to the version of this package. */
 #define PACKAGE_VERSION "1.0"
 
-/* SSE2 */
-/* #undef SSE2 */
+/* SSE4 */
+/* #undef SSE4 */
 
 /* Define to 1 if you have the ANSI C header files. */
 #define STDC_HEADERS 1
diff --git a/Grid_main.cc b/Grid_main.cc
index 8cac55f5..e122fcbe 100644
--- a/Grid_main.cc
+++ b/Grid_main.cc
@@ -152,6 +152,8 @@ int main (int argc, char ** argv)
     cMat= adj(cMat);
     cm=adj(cm);
     scm=adj(scm);
+    scm=transpose(scm);
+    scm=transposeIndex<1>(scm);
     
 //    Foo = Foo+scalar; // LatticeColourMatrix+Scalar
 //    Foo = Foo*scalar; // LatticeColourMatrix*Scalar
@@ -162,6 +164,23 @@ int main (int argc, char ** argv)
     
     LatticeComplex trscMat(&Fine);
     trscMat = trace(scMat); // Trace
+
+    {
+      TComplex      c;
+      ColourMatrix c_m;   
+      SpinMatrix   s_m;   
+      SpinColourMatrix sc_m; 
+
+      s_m = traceIndex<1>(sc_m);
+      c_m = traceIndex<2>(sc_m);
+
+      c   = traceIndex<2>(s_m);
+      c   = traceIndex<1>(c_m);
+
+      printf("c. Level %d\n",c_m.TensorLevel);
+      printf("c. Level %d\n",c_m._internal.TensorLevel);
+
+    }
     
     FooBar = Bar;
  
@@ -203,7 +222,7 @@ int main (int argc, char ** argv)
         peekSite(bar,Bar,coor);
         for(int r=0;r<3;r++){
         for(int c=0;c<3;c++){
-	  cout<<"bar "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "<<bar._internal._internal[r][c]<<std::endl;
+	  //	  cout<<"bar "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "<<bar._internal._internal[r][c]<<std::endl;
 	}}
       }}}}
     } 
diff --git a/Grid_math_type_mapper.h b/Grid_math_type_mapper.h
index 4e7120b1..7f6d85f5 100644
--- a/Grid_math_type_mapper.h
+++ b/Grid_math_type_mapper.h
@@ -1,8 +1,22 @@
 #ifndef GRID_MATH_TYPE_MAPPER_H
 #define GRID_MATH_TYPE_MAPPER_H
+
 namespace Grid {
+
+
 //////////////////////////////////////////////////////////////////////////////////
 // Want to recurse: GridTypeMapper<Matrix<vComplexD> >::scalar_type == ComplexD.
+// Use of a helper class like this allows us to template specialise and "dress"
+// other classes such as RealD == double, ComplexD == std::complex<double> with these
+// traits.
+//
+// It is possible that we could do this more elegantly if I introduced a 
+// queryable trait in iScalar, iMatrix and iVector and used the query on vtype in 
+// place of the type mapper?
+//
+// Not sure how to do this, but probably could be done with a research effort
+// to study C++11's type_traits.h file. (std::enable_if<isGridTensorType<vtype> >)
+//
 //////////////////////////////////////////////////////////////////////////////////
   
   template <class T> class GridTypeMapper {
@@ -10,7 +24,7 @@ namespace Grid {
     typedef typename T::scalar_type scalar_type;
     typedef typename T::vector_type vector_type;
     typedef typename T::tensor_reduced tensor_reduced;
-    //    enum { TensorLevel = T::TensorLevel };
+    enum { TensorLevel = T::TensorLevel };
   };
 
 //////////////////////////////////////////////////////////////////////////////////
@@ -21,28 +35,28 @@ namespace Grid {
     typedef RealF scalar_type;
     typedef RealF vector_type;
     typedef RealF tensor_reduced ;
-    //    enum { TensorLevel = 0 };
+    enum { TensorLevel = 0 };
   };
   template<> class GridTypeMapper<RealD> {
   public:
     typedef RealD scalar_type;
     typedef RealD vector_type;
     typedef RealD tensor_reduced;
-    //    enum { TensorLevel = 0 };
+    enum { TensorLevel = 0 };
   };
   template<> class GridTypeMapper<ComplexF> {
   public:
     typedef ComplexF scalar_type;
     typedef ComplexF vector_type;
     typedef ComplexF tensor_reduced;
-    //    enum { TensorLevel = 0 };
+    enum { TensorLevel = 0 };
   };
   template<> class GridTypeMapper<ComplexD> {
   public:
     typedef ComplexD scalar_type;
     typedef ComplexD vector_type;
     typedef ComplexD tensor_reduced;
-    //    enum { TensorLevel = 0 };
+    enum { TensorLevel = 0 };
   };
 
   template<> class GridTypeMapper<vRealF> {
@@ -50,44 +64,37 @@ namespace Grid {
     typedef RealF  scalar_type;
     typedef vRealF vector_type;
     typedef vRealF tensor_reduced;
-    //    enum { TensorLevel = 0 };
+    enum { TensorLevel = 0 };
   };
   template<> class GridTypeMapper<vRealD> {
   public:
     typedef RealD  scalar_type;
     typedef vRealD vector_type;
     typedef vRealD tensor_reduced;
-    //enum { TensorLevel = 0 };
+    enum { TensorLevel = 0 };
   };
   template<> class GridTypeMapper<vComplexF> {
   public:
     typedef ComplexF  scalar_type;
     typedef vComplexF vector_type;
     typedef vComplexF tensor_reduced;
-    //enum { TensorLevel = 0 };
+    enum { TensorLevel = 0 };
   };
   template<> class GridTypeMapper<vComplexD> {
   public:
     typedef ComplexD  scalar_type;
     typedef vComplexD vector_type;
     typedef vComplexD tensor_reduced;
-    //enum { TensorLevel = 0 };
+    enum { TensorLevel = 0 };
   };
   template<> class GridTypeMapper<vInteger> {
   public:
     typedef Integer  scalar_type;
     typedef vInteger vector_type;
     typedef vInteger tensor_reduced;
-    //enum { TensorLevel = 0 };
+    enum { TensorLevel = 0 };
   };
 
-  // Again terminate the recursion.
-  inline vRealD    TensorRemove(vRealD    arg){ return arg;}
-  inline vRealF    TensorRemove(vRealF    arg){ return arg;}
-  inline vComplexF TensorRemove(vComplexF arg){ return arg;}
-  inline vComplexD TensorRemove(vComplexD arg){ return arg;}
-  inline vInteger  TensorRemove(vInteger  arg){ return arg;}
-
 }
 
 #endif
diff --git a/Grid_math_types.h b/Grid_math_types.h
index db90bda7..58de9c15 100644
--- a/Grid_math_types.h
+++ b/Grid_math_types.h
@@ -2,15 +2,76 @@
 #define GRID_MATH_TYPES_H
 
 #include <Grid_math_type_mapper.h>
+#include <type_traits>
 
 namespace Grid {
 
+  // First some of my own traits
+  template<typename T> struct isGridTensor {
+    static const bool value = true;
+    static const bool notvalue = false;
+  };
+
+  template<> struct isGridTensor<RealD > {
+    static const bool value = false;
+    static const bool notvalue = true;
+  };
+  template<> struct isGridTensor<RealF > {
+    static const bool value = false;
+    static const bool notvalue = true;
+  };
+  template<> struct isGridTensor<ComplexD > {
+    static const bool value = false;
+    static const bool notvalue = true;
+  };
+  template<> struct isGridTensor<ComplexF > {
+    static const bool value = false;
+    static const bool notvalue = true;
+  };
+  template<> struct isGridTensor<Integer > {
+    static const bool value = false;
+    static const bool notvalue = true;
+  };
+  template<> struct isGridTensor<vRealD > {
+    static const bool value = false;
+    static const bool notvalue = true;
+  };
+  template<> struct isGridTensor<vRealF > {
+    static const bool value = false;
+    static const bool notvalue = true;
+  };
+  template<> struct isGridTensor<vComplexD > {
+    static const bool value = false;
+    static const bool notvalue = true;
+  };
+  template<> struct isGridTensor<vComplexF > {
+    static const bool value = false;
+    static const bool notvalue = true;
+  };
+  template<> struct isGridTensor<vInteger > {
+    static const bool value = false;
+    static const bool notvalue = true;
+  };
+
+  // Match the index
+  template<typename T,int Level> struct matchGridTensorIndex {
+    static const bool value = (Level==T::TensorLevel);
+    static const bool notvalue = (Level!=T::TensorLevel);
+  };
+
 
 ///////////////////////////////////////////////////
 // Scalar, Vector, Matrix objects.
 // These can be composed to form tensor products of internal indices.
 ///////////////////////////////////////////////////
-    
+
+  // Terminates the recursion for temoval of all Grids tensors
+  inline vRealD    TensorRemove(vRealD    arg){ return arg;}
+  inline vRealF    TensorRemove(vRealF    arg){ return arg;}
+  inline vComplexF TensorRemove(vComplexF arg){ return arg;}
+  inline vComplexD TensorRemove(vComplexD arg){ return arg;}
+  inline vInteger  TensorRemove(vInteger  arg){ return arg;}
+
 template<class vtype> class iScalar
 {
 public:
@@ -19,9 +80,12 @@ public:
   typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
   typedef typename GridTypeMapper<vtype>::vector_type vector_type;
   typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
-  //enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
   typedef iScalar<tensor_reduced_v> tensor_reduced;
 
+  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
+
+  // Scalar no action
+  //  template<int Level> using tensor_reduce_level = typename iScalar<GridTypeMapper<vtype>::tensor_reduce_level<Level> >;
 
     iScalar(){};
     
@@ -79,9 +143,9 @@ public:
   typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
   typedef typename GridTypeMapper<vtype>::vector_type vector_type;
   typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
-  //  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
-  typedef iScalar<tensor_reduced_v> tensor_reduced;
 
+  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
+  typedef iScalar<tensor_reduced_v> tensor_reduced;
 
     iVector(Zero &z){ *this = zero; };
     iVector() {};
@@ -137,8 +201,10 @@ public:
 
   typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
   typedef typename GridTypeMapper<vtype>::vector_type vector_type;
+
   typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
-  //  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
+
+  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
   typedef iScalar<tensor_reduced_v> tensor_reduced;
 
   iMatrix(Zero &z){ *this = zero; };
@@ -1018,6 +1084,137 @@ template<class vtype,int N> inline iMatrix<vtype,N> adj(const iMatrix<vtype,N> &
     }}
     return ret;
 }
+/////////////////////////////////////////////////////////////////
+// Transpose
+/////////////////////////////////////////////////////////////////
+template<class vtype,int N>
+  inline typename std::enable_if<isGridTensor<vtype>::value, iMatrix<vtype,N> >::type 
+  transpose(iMatrix<vtype,N> arg)
+  {
+    iMatrix<vtype,N> ret;
+    for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	ret._internal[i][j] = transpose(arg._internal[j][i]); // NB recurses
+      }}
+    return ret;
+  }
+template<class vtype,int N>
+  inline typename std::enable_if<isGridTensor<vtype>::notvalue, iMatrix<vtype,N> >::type 
+  transpose(iMatrix<vtype,N> arg)
+  {
+    iMatrix<vtype,N> ret;
+    for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	ret._internal[i][j] = arg._internal[j][i]; // Stop recursion if not a tensor type
+      }}
+    return ret;
+  }
+
+template<class vtype,int N>
+  inline typename std::enable_if<isGridTensor<vtype>::value, iScalar<vtype> >::type 
+  transpose(iScalar<vtype> arg)
+  {
+    iScalar<vtype> ret;
+    ret._internal = transpose(arg._internal); // NB recurses
+    return ret;
+  }
+
+template<class vtype,int N>
+  inline typename std::enable_if<isGridTensor<vtype>::notvalue, iScalar<vtype> >::type 
+  transpose(iScalar<vtype> arg)
+  {
+    iScalar<vtype> ret;
+    ret._internal = arg._internal; // NB recursion stops
+    return ret;
+  }
+
+
+
+/*
+ *  No need to implement transpose on the primitive types
+ *  Not sure that this idiom is any more elegant that the trace idiom below however!
+inline ComplexD transpose(ComplexD &rhs){  return rhs;}
+inline ComplexF transpose(ComplexF &rhs){  return rhs;}
+inline RealD transpose(RealD &rhs){  return rhs;}
+inline RealF transpose(RealF &rhs){  return rhs;}
+ */
+
+////////////////////////////////////////////////////////////////////////////////////////////
+// Transpose a specific index
+////////////////////////////////////////////////////////////////////////////////////////////
+template<int Level,class vtype,int N> inline 
+  typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,Level>::value, iMatrix<vtype,N> >::type 
+transposeIndex (const iMatrix<vtype,N> &arg)
+{
+  iMatrix<vtype,N> ret;
+  for(int i=0;i<N;i++){
+    for(int j=0;j<N;j++){
+      ret._internal[i][j] = arg._internal[j][i]; 
+  }}
+  return ret;
+}
+// or not
+template<int Level,class vtype,int N> inline 
+typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,Level>::notvalue, iMatrix<vtype,N> >::type 
+transposeIndex (const iMatrix<vtype,N> &arg)
+{
+  iMatrix<vtype,N> ret;
+  for(int i=0;i<N;i++){
+    for(int j=0;j<N;j++){
+      ret._internal[i][j] = transposeIndex<Level>(arg._internal[i][j]); 
+  }}
+  return ret;
+}
+template<int Level,class vtype> inline 
+typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,Level>::notvalue, iScalar<vtype> >::type 
+transposeIndex (const iScalar<vtype> &arg)
+{
+  return transposeIndex<Level>(arg._internal);
+}
+////////////////////////////////
+// Trace a specific index 
+////////////////////////////////
+
+template<int Level> inline ComplexF traceIndex(const ComplexF arg) { return arg;}
+template<int Level> inline ComplexD traceIndex(const ComplexD arg) { return arg;}
+template<int Level> inline RealF traceIndex(const RealF arg) { return arg;}
+template<int Level> inline RealD traceIndex(const RealD arg) { return arg;}
+
+template<int Level,class vtype> inline 
+auto traceIndex(const iScalar<vtype> &arg) -> iScalar<decltype(traceIndex<Level>(arg._internal)) >
+{
+  iScalar<decltype(traceIndex<Level>(arg._internal))> ret;
+  ret._internal = traceIndex<Level>(arg._internal);
+  return ret;
+}
+
+// If we hit the right index, return scalar and trace it with no further recursion
+template<int Level,class vtype,int N> inline 
+auto traceIndex(const iMatrix<vtype,N> &arg) ->
+  typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,Level>::value,  // Index matches
+                                                    iScalar<vtype> >::type                              // return scalar
+{
+  iScalar<vtype> ret;
+  zeroit(ret._internal);
+  for(int i=0;i<N;i++){
+    ret._internal = ret._internal + arg._internal[i][i];
+  }
+  return ret;
+}
+
+// not this level, so recurse
+template<int Level,class vtype,int N> inline 
+auto traceIndex(const iMatrix<vtype,N> &arg) ->
+  typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,Level>::notvalue,// No index match
+         iMatrix<decltype(traceIndex<Level>(arg._internal[0][0])),N> >::type     // return matrix
+{
+  iMatrix<decltype(traceIndex<Level>(arg._internal[0][0])),N> ret;
+  for(int i=0;i<N;i++){
+  for(int j=0;j<N;j++){
+    ret._internal[i][j] = traceIndex<Level>(arg._internal[i][j]);
+  }}
+  return ret;
+}
 
 /////////////////////////////////////////////////////////////////
 // Can only take the real/imag part of scalar objects, since
@@ -1075,17 +1272,24 @@ template<class itype,int N> inline auto imag(const iVector<itype,N> &z) -> iVect
     // Trace of scalar and matrix
     /////////////////////////////////
 
-inline Complex trace( const Complex &arg){
+inline ComplexF trace( const ComplexF &arg){
     return arg;
 }
-//inline vComplex trace(const vComplex &arg){
-//    return arg;
-//}
+inline ComplexD trace( const ComplexD &arg){
+    return arg;
+}
+inline RealF trace( const RealF &arg){
+    return arg;
+}
+inline RealD trace( const RealD &arg){
+    return arg;
+}
+
 template<class vtype,int N>
 inline auto trace(const iMatrix<vtype,N> &arg) -> iScalar<decltype(trace(arg._internal[0][0]))>
 {
     iScalar<decltype( trace(arg._internal[0][0] )) > ret;
-    ZeroIt(ret._internal);
+    zeroit(ret._internal);
     for(int i=0;i<N;i++){
         ret._internal=ret._internal+trace(arg._internal[i][i]);
     }
diff --git a/Grid_simd.h b/Grid_simd.h
index f0cf6c08..ae05c3b2 100644
--- a/Grid_simd.h
+++ b/Grid_simd.h
@@ -12,7 +12,7 @@
 ////////////////////////////////////////////////////////////////////////
     
 
-#ifdef SSE2
+#ifdef SSE4
 #include <pmmintrin.h>
 #endif
 #if defined(AVX1) || defined (AVX2)
@@ -33,6 +33,12 @@ namespace Grid {
 
   inline RealF adj(const RealF  & r){ return r; }
   inline RealF conj(const RealF  & r){ return r; }
+  inline RealF real(const RealF  & r){ return r; }
+
+  inline RealD adj(const RealD  & r){ return r; }
+  inline RealD conj(const RealD  & r){ return r; }
+  inline RealD real(const RealD  & r){ return r; }
+
   inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
   inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; }
   inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; }
@@ -60,8 +66,6 @@ namespace Grid {
     inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);}
     inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);}
     inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);}
-    inline RealD adj(const RealD & r){ return r; }  // No-op for real
-    inline RealD conj(const RealD & r){ return r; }
     
     inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){  *y = (*a) * (*x)+(*y); }
     inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
@@ -72,14 +76,14 @@ namespace Grid {
 
   class Zero{};
   static Zero zero;
-  template<class itype> inline void ZeroIt(itype &arg){ arg=zero;};
-  template<>            inline void ZeroIt(ComplexF &arg){ arg=0; };
-  template<>            inline void ZeroIt(ComplexD &arg){ arg=0; };
-  template<>            inline void ZeroIt(RealF &arg){ arg=0; };
-  template<>            inline void ZeroIt(RealD &arg){ arg=0; };
+  template<class itype> inline void zeroit(itype &arg){ arg=zero;};
+  template<>            inline void zeroit(ComplexF &arg){ arg=0; };
+  template<>            inline void zeroit(ComplexD &arg){ arg=0; };
+  template<>            inline void zeroit(RealF &arg){ arg=0; };
+  template<>            inline void zeroit(RealD &arg){ arg=0; };
 
 
-#if defined (SSE2)
+#if defined (SSE4)
     typedef __m128 fvec;
     typedef __m128d dvec;
     typedef __m128 cvec;
@@ -202,7 +206,7 @@ inline void Gpermute(vsimd &y,const vsimd &b,int perm){
       case 1: y.v = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2)); break;
       case 0: y.v = _mm256_permute2f128_ps(b.v,b.v,0x01); break;
 #endif
-#ifdef SSE2
+#ifdef SSE4
       case 1: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); break;
       case 0: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2));break;
 #endif
diff --git a/Grid_vComplexD.h b/Grid_vComplexD.h
index 133e354b..ffcff3a9 100644
--- a/Grid_vComplexD.h
+++ b/Grid_vComplexD.h
@@ -49,7 +49,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
             ret.v = _mm256_add_pd(a.v,b.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
             ret.v = _mm_add_pd(a.v,b.v);
 #endif
 #ifdef AVX512
@@ -67,7 +67,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
             ret.v = _mm256_sub_pd(a.v,b.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
             ret.v = _mm_sub_pd(a.v,b.v);
 #endif
 #ifdef AVX512
@@ -114,7 +114,7 @@ namespace Grid {
             ymm1 = _mm256_mul_pd(ymm1,ymm2);       // ymm1 <- br ai, ai bi
             ret.v= _mm256_addsub_pd(ymm0,ymm1);
 #endif
-#ifdef SSE2
+#ifdef SSE4
             zvec ymm0,ymm1,ymm2;
             ymm0 = _mm_shuffle_pd(a.v,a.v,0x0); // ymm0 <- ar ar,
             ymm0 = _mm_mul_pd(ymm0,b.v);        // ymm0 <- ar bi, ar br
@@ -200,14 +200,14 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
             ret.v = _mm256_set_pd(ig,rl,ig,rl);
 #endif
-#ifdef SSE2
-            ret.v = _mm_set_pd(a,b);
+#ifdef SSE4
+            ret.v = _mm_set_pd(ig,rl);
 #endif
 #ifdef AVX512
             ret.v = _mm512_set_pd(ig,rl,ig,rl,ig,rl,ig,rl);
 #endif
 #ifdef QPX
-            ret.v = {a,b,a,b};
+            ret.v = {ig,rl,ig,rl};
 #endif
         }
 
@@ -215,7 +215,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
             ret.v = _mm256_set_pd(a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
 #endif
-#ifdef SSE2
+#ifdef SSE4
             ret.v = _mm_set_pd(a[0].imag(),a[0].real());
 #endif
 #ifdef AVX512
@@ -231,7 +231,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
 #if defined (AVX1)|| defined (AVX2)
        _mm256_store_pd((double *)a,ret.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
        _mm_store_pd((double *)a,ret.v);
 #endif
 #ifdef AVX512
@@ -257,7 +257,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
             __m256d tmp = _mm256_addsub_pd(ret.v,_mm256_shuffle_pd(in.v,in.v,0x5));
              ret.v=_mm256_shuffle_pd(tmp,tmp,0x5);
 #endif
-#ifdef SSE2
+#ifdef SSE4
             ret.v = _mm_addsub_pd(ret.v,in.v);
 #endif
 #ifdef AVX512
diff --git a/Grid_vComplexF.h b/Grid_vComplexF.h
index 75c47248..42fe5163 100644
--- a/Grid_vComplexF.h
+++ b/Grid_vComplexF.h
@@ -63,7 +63,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
             ret.v = _mm256_add_ps(a.v,b.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
             ret.v = _mm_add_ps(a.v,b.v);
 #endif
 #ifdef AVX512
@@ -81,7 +81,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
             ret.v = _mm256_sub_ps(a.v,b.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
             ret.v = _mm_sub_ps(a.v,b.v);
 #endif
 #ifdef AVX512
@@ -120,7 +120,7 @@ namespace Grid {
             ymm1 = _mm256_mul_ps(ymm1,ymm2);       // ymm1 <- br ai, ai bi
             ret.v= _mm256_addsub_ps(ymm0,ymm1);    
 #endif
-#ifdef SSE2
+#ifdef SSE4
             cvec ymm0,ymm1,ymm2;
             ymm0 = _mm_shuffle_ps(a.v,a.v,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
             ymm0 = _mm_mul_ps(ymm0,b.v);        // ymm0 <- ar bi, ar br
@@ -156,8 +156,8 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
             ret.v = _mm256_set_ps(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
 #endif
-#ifdef SSE2
-            ret.v = _mm_set_ps(a[1].imag, a[1].real(),a[0].imag(),a[0].real());
+#ifdef SSE4
+            ret.v = _mm_set_ps(a[1].imag(), a[1].real(),a[0].imag(),a[0].real());
 #endif
 #ifdef AVX512
             ret.v = _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(),a[5].imag(),a[5].real(),a[4].imag(),a[4].real(),a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
@@ -181,7 +181,7 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
 #if defined (AVX1)|| defined (AVX2)
         _mm256_store_ps((float *)a,ret.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
         _mm_store_ps((float *)a,ret.v);
 #endif
 #ifdef AVX512
@@ -201,7 +201,7 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
 #if defined (AVX1)|| defined (AVX2)
             ret.v = _mm256_set_ps(b,a,b,a,b,a,b,a);
 #endif
-#ifdef SSE2
+#ifdef SSE4
             ret.v = _mm_set_ps(a,b,a,b);
 #endif
 #ifdef AVX512
@@ -213,7 +213,11 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
         }
        friend inline ComplexF Reduce(const vComplexF & in)
        {
+#ifdef SSE4
+#error
+#endif
 #if defined (AVX1) || defined(AVX2)
+	 // FIXME this is inefficient and use 
          __attribute__ ((aligned(32))) float c_[8];
          _mm256_store_ps(c_,in.v);
          return ComplexF(c_[0]+c_[2]+c_[4]+c_[6],c_[1]+c_[3]+c_[5]+c_[7]);
@@ -305,7 +309,7 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
              tmp = _mm256_addsub_ps(ret.v,_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1))); // ymm1 <- br,bi
              ret.v=_mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
 #endif
-#ifdef SSE2
+#ifdef SSE4
             ret.v = _mm_addsub_ps(ret.v,in.v);
 #endif
 #ifdef AVX512
diff --git a/Grid_vInteger.h b/Grid_vInteger.h
index 82adbd8e..672d4938 100644
--- a/Grid_vInteger.h
+++ b/Grid_vInteger.h
@@ -48,7 +48,7 @@ namespace Grid {
 #if defined (AVX2)
             ret.v = _mm256_add_epi32(a.v,b.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
             ret.v = _mm_add_epi32(a.v,b.v);
 #endif
 #ifdef AVX512
@@ -78,7 +78,7 @@ namespace Grid {
 #if defined (AVX2)
             ret.v = _mm256_sub_epi32(a.v,b.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
             ret.v = _mm_sub_epi32(a.v,b.v);
 #endif
 #ifdef AVX512
@@ -108,7 +108,7 @@ namespace Grid {
 #if defined (AVX2)
             ret.v = _mm256_mul_epi32(a.v,b.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
             ret.v = _mm_mul_epi32(a.v,b.v);
 #endif
 #ifdef AVX512
@@ -147,7 +147,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
             ret.v = _mm256_set1_epi32(a);
 #endif
-#ifdef SSE2
+#ifdef SSE4
             ret.v = _mm_set1_epi32(a);
 #endif
 #ifdef AVX512
@@ -161,7 +161,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
 	  ret.v = _mm256_set_epi32(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
 #endif
-#ifdef SSE2
+#ifdef SSE4
 	  ret.v = _mm_set_epi32(a[0],a[1],a[2],a[3]);
 #endif
 #ifdef AVX512
@@ -177,8 +177,8 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
 	  _mm256_store_si256((__m256i*)a,ret.v);
 #endif
-#ifdef SSE2
-	  _mm_store_si128(a,ret.v);
+#ifdef SSE4
+	  _mm_store_si128((__m128i *)a,ret.v);
 #endif
 #ifdef AVX512
 	  _mm512_store_si512(a,ret.v);
diff --git a/Grid_vRealD.h b/Grid_vRealD.h
index 86f738f3..0f88f7da 100644
--- a/Grid_vRealD.h
+++ b/Grid_vRealD.h
@@ -24,7 +24,7 @@ namespace Grid {
         friend inline vRealD conj(const vRealD &in){ return in; }
         
         friend inline void mac (vRealD &y,const vRealD a,const vRealD x){
-#if defined (AVX1) || defined (SSE2)
+#if defined (AVX1) || defined (SSE4)
             y = a*x+y;
 #endif
 #ifdef AVX2     // AVX 2 introduced FMA support. FMA4 eliminates a copy, but AVX only has FMA3
@@ -55,7 +55,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
             ret.v = _mm256_add_pd(a.v,b.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
             ret.v = _mm_add_pd(a.v,b.v);
 #endif
 #ifdef AVX512
@@ -72,7 +72,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
             ret.v = _mm256_sub_pd(a.v,b.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
             ret.v = _mm_sub_pd(a.v,b.v);
 #endif
 #ifdef AVX512
@@ -90,7 +90,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
             ret.v = _mm256_mul_pd(a.v,b.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
             ret.v = _mm_mul_pd(a.v,b.v);
 #endif
 #ifdef AVX512
@@ -133,7 +133,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
             ret.v = _mm256_set_pd(a,a,a,a);
 #endif
-#ifdef SSE2
+#ifdef SSE4
             ret.v = _mm_set_pd(a,a);
 #endif
 #ifdef AVX512
@@ -147,7 +147,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
             ret.v = _mm256_set_pd(a[3],a[2],a[1],a[0]);
 #endif
-#ifdef SSE2
+#ifdef SSE4
             ret.v = _mm_set_pd(a[0],a[1]);
 #endif
 #ifdef AVX512
@@ -163,7 +163,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
             _mm256_store_pd(a,ret.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
             _mm_store_pd(a,ret.v);
 #endif
 #ifdef AVX512
diff --git a/Grid_vRealF.h b/Grid_vRealF.h
index b7c127ce..78c37177 100644
--- a/Grid_vRealF.h
+++ b/Grid_vRealF.h
@@ -26,7 +26,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
             ret.v = _mm256_add_ps(a.v,b.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
             ret.v = _mm_add_ps(a.v,b.v);
 #endif
 #ifdef AVX512
@@ -48,7 +48,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
             ret.v = _mm256_sub_ps(a.v,b.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
             ret.v = _mm_sub_ps(a.v,b.v);
 #endif
 #ifdef AVX512
@@ -70,7 +70,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
             ret.v = _mm256_mul_ps(a.v,b.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
             ret.v = _mm_mul_ps(a.v,b.v);
 #endif
 #ifdef AVX512
@@ -96,7 +96,7 @@ namespace Grid {
         friend inline vRealF conj(const vRealF &in){ return in; }
 
         friend inline void mac (vRealF &y,const vRealF a,const vRealF x){
-#if defined (AVX1) || defined (SSE2)
+#if defined (AVX1) || defined (SSE4)
             y = a*x+y;
 #endif
 #ifdef AVX2     // AVX 2 introduced FMA support. FMA4 eliminates a copy, but AVX only has FMA3
@@ -158,7 +158,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
             ret.v = _mm256_set_ps(a,a,a,a,a,a,a,a);
 #endif
-#ifdef SSE2
+#ifdef SSE4
             ret.v = _mm_set_ps(a,a,a,a);
 #endif
 #ifdef AVX512
@@ -175,7 +175,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
             ret.v = _mm256_set_ps(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
 #endif
-#ifdef SSE2
+#ifdef SSE4
             ret.v = _mm_set_ps(a[0],a[1],a[2],a[3]);
 #endif
 #ifdef AVX512
@@ -195,7 +195,7 @@ friend inline void vstore(const vRealF &ret, float *a){
 #if defined (AVX1)|| defined (AVX2)
 	_mm256_store_ps(a,ret.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
 	_mm_store_ps(a,ret.v);
 #endif
 #ifdef AVX512
diff --git a/test_Grid_stencil.cc b/test_Grid_stencil.cc
index 8077562f..fd2a9503 100644
--- a/test_Grid_stencil.cc
+++ b/test_Grid_stencil.cc
@@ -29,14 +29,14 @@ int main (int argc, char ** argv)
     
 #ifdef AVX512
     simd_layout[0] = 1;
-    simd_layout[1] = 1;
+    simd_layout[1] = 2;
     simd_layout[2] = 2;
     simd_layout[3] = 2;
 #endif
 #if defined (AVX1)|| defined (AVX2)
     simd_layout[0] = 1;
     simd_layout[1] = 1;
-    simd_layout[2] = 1;
+    simd_layout[2] = 2;
     simd_layout[3] = 2;
 #endif
 #if defined (SSE2)