Improving the trace support to support any index tracing and simplifying

implmentation in some ways
2025-11-04 14:04:32 +00:00 · 2015-04-16 14:47:28 +01:00
parent fddb904b4c
commit 933c54d9c4
12 changed files with 321 additions and 87 deletions
--- a/Grid_QCD.h
+++ b/Grid_QCD.h
@@ -7,12 +7,11 @@ namespace QCD {
    static const int Ns=4;
    static const int CbRed  =0;
    static const int CbBlack=1;
-
+    
    // QCD iMatrix types
    template<typename vtype> using iSinglet          = iScalar<iScalar<vtype> > ;
    template<typename vtype> using iSpinMatrix       = iMatrix<iScalar<vtype>, Ns>;
    template<typename vtype> using iSpinColourMatrix = iMatrix<iMatrix<vtype, Nc>, Ns>;
    template<typename vtype> using iColourMatrix     = iScalar<iMatrix<vtype, Nc>> ;
    template<typename vtype> using iSpinVector       = iVector<iScalar<vtype>, Ns>;
@@ -20,10 +19,8 @@ namespace QCD {
    template<typename vtype> using iSpinColourVector = iVector<iVector<vtype, Nc>, Ns>;
    typedef iSinglet<Complex >          TComplex;    // This is painful. Tensor singlet complex type.
-    typedef iSinglet<vComplex >         vTComplex;
+    typedef iSinglet<vComplex >         vTComplex;   // what if we don't know the tensor structure
-    typedef iSinglet<Real >             TReal;    // This is painful. Tensor singlet complex type.
+    typedef iSinglet<Real >             TReal;       // Shouldn't need these.
    typedef iSinglet<vInteger >         vTInteger;
    typedef iSpinMatrix<Complex >       SpinMatrix;
@@ -44,7 +41,6 @@ namespace QCD {
    typedef iSpinColourVector<vComplex > vSpinColourVector;
    typedef Lattice<vTComplex>            LatticeComplex;
    typedef Lattice<vInteger>            LatticeInteger; // Predicates for "where"
    typedef Lattice<vColourMatrix>     LatticeColourMatrix;
--- a/Grid_config.h
+++ b/Grid_config.h
@@ -73,8 +73,8 @@
 /* Define to the version of this package. */
 #define PACKAGE_VERSION "1.0"
-/* SSE2 */
+/* SSE4 */
-/* #undef SSE2 */
+/* #undef SSE4 */
 /* Define to 1 if you have the ANSI C header files. */
 #define STDC_HEADERS 1
--- a/Grid_main.cc
+++ b/Grid_main.cc
@@ -152,6 +152,8 @@ int main (int argc, char ** argv)
    cMat= adj(cMat);
    cm=adj(cm);
    scm=adj(scm);
    scm=transpose(scm);
    scm=transposeIndex<1>(scm);
 //    Foo = Foo+scalar; // LatticeColourMatrix+Scalar
 //    Foo = Foo*scalar; // LatticeColourMatrix*Scalar
@@ -162,6 +164,23 @@ int main (int argc, char ** argv)
    LatticeComplex trscMat(&Fine);
    trscMat = trace(scMat); // Trace
    {
      TComplex      c;
      ColourMatrix c_m;   
      SpinMatrix   s_m;   
      SpinColourMatrix sc_m; 
      s_m = traceIndex<1>(sc_m);
      c_m = traceIndex<2>(sc_m);
      c   = traceIndex<2>(s_m);
      c   = traceIndex<1>(c_m);
      printf("c. Level %d\n",c_m.TensorLevel);
      printf("c. Level %d\n",c_m._internal.TensorLevel);
    }
    FooBar = Bar;
@@ -203,7 +222,7 @@ int main (int argc, char ** argv)
        peekSite(bar,Bar,coor);
        for(int r=0;r<3;r++){
        for(int c=0;c<3;c++){
-	  cout<<"bar "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "<<bar._internal._internal[r][c]<<std::endl;
+	  //	  cout<<"bar "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "<<bar._internal._internal[r][c]<<std::endl;
 	}}
      }}}}
    } 
--- a/Grid_math_type_mapper.h
+++ b/Grid_math_type_mapper.h
@@ -1,8 +1,22 @@
 #ifndef GRID_MATH_TYPE_MAPPER_H
 #define GRID_MATH_TYPE_MAPPER_H
 namespace Grid {
 //////////////////////////////////////////////////////////////////////////////////
 // Want to recurse: GridTypeMapper<Matrix<vComplexD> >::scalar_type == ComplexD.
 // Use of a helper class like this allows us to template specialise and "dress"
 // other classes such as RealD == double, ComplexD == std::complex<double> with these
 // traits.
 //
 // It is possible that we could do this more elegantly if I introduced a 
 // queryable trait in iScalar, iMatrix and iVector and used the query on vtype in 
 // place of the type mapper?
 //
 // Not sure how to do this, but probably could be done with a research effort
 // to study C++11's type_traits.h file. (std::enable_if<isGridTensorType<vtype> >)
 //
 //////////////////////////////////////////////////////////////////////////////////
  template <class T> class GridTypeMapper {
@@ -10,7 +24,7 @@ namespace Grid {
    typedef typename T::scalar_type scalar_type;
    typedef typename T::vector_type vector_type;
    typedef typename T::tensor_reduced tensor_reduced;
-    //    enum { TensorLevel = T::TensorLevel };
+    enum { TensorLevel = T::TensorLevel };
  };
 //////////////////////////////////////////////////////////////////////////////////
@@ -21,28 +35,28 @@ namespace Grid {
    typedef RealF scalar_type;
    typedef RealF vector_type;
    typedef RealF tensor_reduced ;
-    //    enum { TensorLevel = 0 };
+    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<RealD> {
  public:
    typedef RealD scalar_type;
    typedef RealD vector_type;
    typedef RealD tensor_reduced;
-    //    enum { TensorLevel = 0 };
+    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<ComplexF> {
  public:
    typedef ComplexF scalar_type;
    typedef ComplexF vector_type;
    typedef ComplexF tensor_reduced;
-    //    enum { TensorLevel = 0 };
+    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<ComplexD> {
  public:
    typedef ComplexD scalar_type;
    typedef ComplexD vector_type;
    typedef ComplexD tensor_reduced;
-    //    enum { TensorLevel = 0 };
+    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<vRealF> {
@@ -50,44 +64,37 @@ namespace Grid {
    typedef RealF  scalar_type;
    typedef vRealF vector_type;
    typedef vRealF tensor_reduced;
-    //    enum { TensorLevel = 0 };
+    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<vRealD> {
  public:
    typedef RealD  scalar_type;
    typedef vRealD vector_type;
    typedef vRealD tensor_reduced;
-    //enum { TensorLevel = 0 };
+    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<vComplexF> {
  public:
    typedef ComplexF  scalar_type;
    typedef vComplexF vector_type;
    typedef vComplexF tensor_reduced;
-    //enum { TensorLevel = 0 };
+    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<vComplexD> {
  public:
    typedef ComplexD  scalar_type;
    typedef vComplexD vector_type;
    typedef vComplexD tensor_reduced;
-    //enum { TensorLevel = 0 };
+    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<vInteger> {
  public:
    typedef Integer  scalar_type;
    typedef vInteger vector_type;
    typedef vInteger tensor_reduced;
-    //enum { TensorLevel = 0 };
+    enum { TensorLevel = 0 };
  };
  // Again terminate the recursion.
  inline vRealD    TensorRemove(vRealD    arg){ return arg;}
  inline vRealF    TensorRemove(vRealF    arg){ return arg;}
  inline vComplexF TensorRemove(vComplexF arg){ return arg;}
  inline vComplexD TensorRemove(vComplexD arg){ return arg;}
  inline vInteger  TensorRemove(vInteger  arg){ return arg;}
 }
 #endif
--- a/Grid_math_types.h
+++ b/Grid_math_types.h
@@ -2,15 +2,76 @@
 #define GRID_MATH_TYPES_H
 #include <Grid_math_type_mapper.h>
 #include <type_traits>
 namespace Grid {
  // First some of my own traits
  template<typename T> struct isGridTensor {
    static const bool value = true;
    static const bool notvalue = false;
  };
  template<> struct isGridTensor<RealD > {
    static const bool value = false;
    static const bool notvalue = true;
  };
  template<> struct isGridTensor<RealF > {
    static const bool value = false;
    static const bool notvalue = true;
  };
  template<> struct isGridTensor<ComplexD > {
    static const bool value = false;
    static const bool notvalue = true;
  };
  template<> struct isGridTensor<ComplexF > {
    static const bool value = false;
    static const bool notvalue = true;
  };
  template<> struct isGridTensor<Integer > {
    static const bool value = false;
    static const bool notvalue = true;
  };
  template<> struct isGridTensor<vRealD > {
    static const bool value = false;
    static const bool notvalue = true;
  };
  template<> struct isGridTensor<vRealF > {
    static const bool value = false;
    static const bool notvalue = true;
  };
  template<> struct isGridTensor<vComplexD > {
    static const bool value = false;
    static const bool notvalue = true;
  };
  template<> struct isGridTensor<vComplexF > {
    static const bool value = false;
    static const bool notvalue = true;
  };
  template<> struct isGridTensor<vInteger > {
    static const bool value = false;
    static const bool notvalue = true;
  };
  // Match the index
  template<typename T,int Level> struct matchGridTensorIndex {
    static const bool value = (Level==T::TensorLevel);
    static const bool notvalue = (Level!=T::TensorLevel);
  };
 ///////////////////////////////////////////////////
 // Scalar, Vector, Matrix objects.
 // These can be composed to form tensor products of internal indices.
 ///////////////////////////////////////////////////
-    
+
  // Terminates the recursion for temoval of all Grids tensors
  inline vRealD    TensorRemove(vRealD    arg){ return arg;}
  inline vRealF    TensorRemove(vRealF    arg){ return arg;}
  inline vComplexF TensorRemove(vComplexF arg){ return arg;}
  inline vComplexD TensorRemove(vComplexD arg){ return arg;}
  inline vInteger  TensorRemove(vInteger  arg){ return arg;}
 template<class vtype> class iScalar
 {
 public:
@@ -19,9 +80,12 @@ public:
  typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
  typedef typename GridTypeMapper<vtype>::vector_type vector_type;
  typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
  //enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
  typedef iScalar<tensor_reduced_v> tensor_reduced;
  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
  // Scalar no action
  //  template<int Level> using tensor_reduce_level = typename iScalar<GridTypeMapper<vtype>::tensor_reduce_level<Level> >;
    iScalar(){};
@@ -79,9 +143,9 @@ public:
  typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
  typedef typename GridTypeMapper<vtype>::vector_type vector_type;
  typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
  //  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
  typedef iScalar<tensor_reduced_v> tensor_reduced;
  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
  typedef iScalar<tensor_reduced_v> tensor_reduced;
    iVector(Zero &z){ *this = zero; };
    iVector() {};
@@ -137,8 +201,10 @@ public:
  typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
  typedef typename GridTypeMapper<vtype>::vector_type vector_type;
  typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
-  //  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
+
  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
  typedef iScalar<tensor_reduced_v> tensor_reduced;
  iMatrix(Zero &z){ *this = zero; };
@@ -1018,6 +1084,137 @@ template<class vtype,int N> inline iMatrix<vtype,N> adj(const iMatrix<vtype,N> &
    }}
    return ret;
 }
 /////////////////////////////////////////////////////////////////
 // Transpose
 /////////////////////////////////////////////////////////////////
 template<class vtype,int N>
  inline typename std::enable_if<isGridTensor<vtype>::value, iMatrix<vtype,N> >::type 
  transpose(iMatrix<vtype,N> arg)
  {
    iMatrix<vtype,N> ret;
    for(int i=0;i<N;i++){
      for(int j=0;j<N;j++){
 	ret._internal[i][j] = transpose(arg._internal[j][i]); // NB recurses
      }}
    return ret;
  }
 template<class vtype,int N>
  inline typename std::enable_if<isGridTensor<vtype>::notvalue, iMatrix<vtype,N> >::type 
  transpose(iMatrix<vtype,N> arg)
  {
    iMatrix<vtype,N> ret;
    for(int i=0;i<N;i++){
      for(int j=0;j<N;j++){
 	ret._internal[i][j] = arg._internal[j][i]; // Stop recursion if not a tensor type
      }}
    return ret;
  }
 template<class vtype,int N>
  inline typename std::enable_if<isGridTensor<vtype>::value, iScalar<vtype> >::type 
  transpose(iScalar<vtype> arg)
  {
    iScalar<vtype> ret;
    ret._internal = transpose(arg._internal); // NB recurses
    return ret;
  }
 template<class vtype,int N>
  inline typename std::enable_if<isGridTensor<vtype>::notvalue, iScalar<vtype> >::type 
  transpose(iScalar<vtype> arg)
  {
    iScalar<vtype> ret;
    ret._internal = arg._internal; // NB recursion stops
    return ret;
  }
 /*
 *  No need to implement transpose on the primitive types
 *  Not sure that this idiom is any more elegant that the trace idiom below however!
 inline ComplexD transpose(ComplexD &rhs){  return rhs;}
 inline ComplexF transpose(ComplexF &rhs){  return rhs;}
 inline RealD transpose(RealD &rhs){  return rhs;}
 inline RealF transpose(RealF &rhs){  return rhs;}
 */
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Transpose a specific index
 ////////////////////////////////////////////////////////////////////////////////////////////
 template<int Level,class vtype,int N> inline 
  typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,Level>::value, iMatrix<vtype,N> >::type 
 transposeIndex (const iMatrix<vtype,N> &arg)
 {
  iMatrix<vtype,N> ret;
  for(int i=0;i<N;i++){
    for(int j=0;j<N;j++){
      ret._internal[i][j] = arg._internal[j][i]; 
  }}
  return ret;
 }
 // or not
 template<int Level,class vtype,int N> inline 
 typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,Level>::notvalue, iMatrix<vtype,N> >::type 
 transposeIndex (const iMatrix<vtype,N> &arg)
 {
  iMatrix<vtype,N> ret;
  for(int i=0;i<N;i++){
    for(int j=0;j<N;j++){
      ret._internal[i][j] = transposeIndex<Level>(arg._internal[i][j]); 
  }}
  return ret;
 }
 template<int Level,class vtype> inline 
 typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,Level>::notvalue, iScalar<vtype> >::type 
 transposeIndex (const iScalar<vtype> &arg)
 {
  return transposeIndex<Level>(arg._internal);
 }
 ////////////////////////////////
 // Trace a specific index 
 ////////////////////////////////
 template<int Level> inline ComplexF traceIndex(const ComplexF arg) { return arg;}
 template<int Level> inline ComplexD traceIndex(const ComplexD arg) { return arg;}
 template<int Level> inline RealF traceIndex(const RealF arg) { return arg;}
 template<int Level> inline RealD traceIndex(const RealD arg) { return arg;}
 template<int Level,class vtype> inline 
 auto traceIndex(const iScalar<vtype> &arg) -> iScalar<decltype(traceIndex<Level>(arg._internal)) >
 {
  iScalar<decltype(traceIndex<Level>(arg._internal))> ret;
  ret._internal = traceIndex<Level>(arg._internal);
  return ret;
 }
 // If we hit the right index, return scalar and trace it with no further recursion
 template<int Level,class vtype,int N> inline 
 auto traceIndex(const iMatrix<vtype,N> &arg) ->
  typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,Level>::value,  // Index matches
                                                    iScalar<vtype> >::type                              // return scalar
 {
  iScalar<vtype> ret;
  zeroit(ret._internal);
  for(int i=0;i<N;i++){
    ret._internal = ret._internal + arg._internal[i][i];
  }
  return ret;
 }
 // not this level, so recurse
 template<int Level,class vtype,int N> inline 
 auto traceIndex(const iMatrix<vtype,N> &arg) ->
  typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,Level>::notvalue,// No index match
         iMatrix<decltype(traceIndex<Level>(arg._internal[0][0])),N> >::type     // return matrix
 {
  iMatrix<decltype(traceIndex<Level>(arg._internal[0][0])),N> ret;
  for(int i=0;i<N;i++){
  for(int j=0;j<N;j++){
    ret._internal[i][j] = traceIndex<Level>(arg._internal[i][j]);
  }}
  return ret;
 }
 /////////////////////////////////////////////////////////////////
 // Can only take the real/imag part of scalar objects, since
@@ -1075,17 +1272,24 @@ template<class itype,int N> inline auto imag(const iVector<itype,N> &z) -> iVect
    // Trace of scalar and matrix
    /////////////////////////////////
-inline Complex trace( const Complex &arg){
+inline ComplexF trace( const ComplexF &arg){
    return arg;
 }
-//inline vComplex trace(const vComplex &arg){
+inline ComplexD trace( const ComplexD &arg){
-//    return arg;
+    return arg;
-//}
+}
 inline RealF trace( const RealF &arg){
    return arg;
 }
 inline RealD trace( const RealD &arg){
    return arg;
 }
 template<class vtype,int N>
 inline auto trace(const iMatrix<vtype,N> &arg) -> iScalar<decltype(trace(arg._internal[0][0]))>
 {
    iScalar<decltype( trace(arg._internal[0][0] )) > ret;
-    ZeroIt(ret._internal);
+    zeroit(ret._internal);
    for(int i=0;i<N;i++){
        ret._internal=ret._internal+trace(arg._internal[i][i]);
    }
--- a/Grid_simd.h
+++ b/Grid_simd.h
@@ -12,7 +12,7 @@
 ////////////////////////////////////////////////////////////////////////
-#ifdef SSE2
+#ifdef SSE4
 #include <pmmintrin.h>
 #endif
 #if defined(AVX1) || defined (AVX2)
@@ -33,6 +33,12 @@ namespace Grid {
  inline RealF adj(const RealF  & r){ return r; }
  inline RealF conj(const RealF  & r){ return r; }
  inline RealF real(const RealF  & r){ return r; }
  inline RealD adj(const RealD  & r){ return r; }
  inline RealD conj(const RealD  & r){ return r; }
  inline RealD real(const RealD  & r){ return r; }
  inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
  inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; }
  inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; }
@@ -60,8 +66,6 @@ namespace Grid {
    inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);}
    inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);}
    inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);}
    inline RealD adj(const RealD & r){ return r; }  // No-op for real
    inline RealD conj(const RealD & r){ return r; }
    inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){  *y = (*a) * (*x)+(*y); }
    inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
@@ -72,14 +76,14 @@ namespace Grid {
  class Zero{};
  static Zero zero;
-  template<class itype> inline void ZeroIt(itype &arg){ arg=zero;};
+  template<class itype> inline void zeroit(itype &arg){ arg=zero;};
-  template<>            inline void ZeroIt(ComplexF &arg){ arg=0; };
+  template<>            inline void zeroit(ComplexF &arg){ arg=0; };
-  template<>            inline void ZeroIt(ComplexD &arg){ arg=0; };
+  template<>            inline void zeroit(ComplexD &arg){ arg=0; };
-  template<>            inline void ZeroIt(RealF &arg){ arg=0; };
+  template<>            inline void zeroit(RealF &arg){ arg=0; };
-  template<>            inline void ZeroIt(RealD &arg){ arg=0; };
+  template<>            inline void zeroit(RealD &arg){ arg=0; };
-#if defined (SSE2)
+#if defined (SSE4)
    typedef __m128 fvec;
    typedef __m128d dvec;
    typedef __m128 cvec;
@@ -202,7 +206,7 @@ inline void Gpermute(vsimd &y,const vsimd &b,int perm){
      case 1: y.v = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2)); break;
      case 0: y.v = _mm256_permute2f128_ps(b.v,b.v,0x01); break;
 #endif
-#ifdef SSE2
+#ifdef SSE4
      case 1: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); break;
      case 0: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2));break;
 #endif
--- a/Grid_vComplexD.h
+++ b/Grid_vComplexD.h
@@ -49,7 +49,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_add_pd(a.v,b.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
            ret.v = _mm_add_pd(a.v,b.v);
 #endif
 #ifdef AVX512
@@ -67,7 +67,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_sub_pd(a.v,b.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
            ret.v = _mm_sub_pd(a.v,b.v);
 #endif
 #ifdef AVX512
@@ -114,7 +114,7 @@ namespace Grid {
            ymm1 = _mm256_mul_pd(ymm1,ymm2);       // ymm1 <- br ai, ai bi
            ret.v= _mm256_addsub_pd(ymm0,ymm1);
 #endif
-#ifdef SSE2
+#ifdef SSE4
            zvec ymm0,ymm1,ymm2;
            ymm0 = _mm_shuffle_pd(a.v,a.v,0x0); // ymm0 <- ar ar,
            ymm0 = _mm_mul_pd(ymm0,b.v);        // ymm0 <- ar bi, ar br
@@ -200,14 +200,14 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_set_pd(ig,rl,ig,rl);
 #endif
-#ifdef SSE2
+#ifdef SSE4
-            ret.v = _mm_set_pd(a,b);
+            ret.v = _mm_set_pd(ig,rl);
 #endif
 #ifdef AVX512
            ret.v = _mm512_set_pd(ig,rl,ig,rl,ig,rl,ig,rl);
 #endif
 #ifdef QPX
-            ret.v = {a,b,a,b};
+            ret.v = {ig,rl,ig,rl};
 #endif
        }
@@ -215,7 +215,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_set_pd(a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
 #endif
-#ifdef SSE2
+#ifdef SSE4
            ret.v = _mm_set_pd(a[0].imag(),a[0].real());
 #endif
 #ifdef AVX512
@@ -231,7 +231,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
 #if defined (AVX1)|| defined (AVX2)
       _mm256_store_pd((double *)a,ret.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
       _mm_store_pd((double *)a,ret.v);
 #endif
 #ifdef AVX512
@@ -257,7 +257,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
            __m256d tmp = _mm256_addsub_pd(ret.v,_mm256_shuffle_pd(in.v,in.v,0x5));
             ret.v=_mm256_shuffle_pd(tmp,tmp,0x5);
 #endif
-#ifdef SSE2
+#ifdef SSE4
            ret.v = _mm_addsub_pd(ret.v,in.v);
 #endif
 #ifdef AVX512
--- a/Grid_vComplexF.h
+++ b/Grid_vComplexF.h
@@ -63,7 +63,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_add_ps(a.v,b.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
            ret.v = _mm_add_ps(a.v,b.v);
 #endif
 #ifdef AVX512
@@ -81,7 +81,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_sub_ps(a.v,b.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
            ret.v = _mm_sub_ps(a.v,b.v);
 #endif
 #ifdef AVX512
@@ -120,7 +120,7 @@ namespace Grid {
            ymm1 = _mm256_mul_ps(ymm1,ymm2);       // ymm1 <- br ai, ai bi
            ret.v= _mm256_addsub_ps(ymm0,ymm1);    
 #endif
-#ifdef SSE2
+#ifdef SSE4
            cvec ymm0,ymm1,ymm2;
            ymm0 = _mm_shuffle_ps(a.v,a.v,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
            ymm0 = _mm_mul_ps(ymm0,b.v);        // ymm0 <- ar bi, ar br
@@ -156,8 +156,8 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_set_ps(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
 #endif
-#ifdef SSE2
+#ifdef SSE4
-            ret.v = _mm_set_ps(a[1].imag, a[1].real(),a[0].imag(),a[0].real());
+            ret.v = _mm_set_ps(a[1].imag(), a[1].real(),a[0].imag(),a[0].real());
 #endif
 #ifdef AVX512
            ret.v = _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(),a[5].imag(),a[5].real(),a[4].imag(),a[4].real(),a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
@@ -181,7 +181,7 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
 #if defined (AVX1)|| defined (AVX2)
        _mm256_store_ps((float *)a,ret.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
        _mm_store_ps((float *)a,ret.v);
 #endif
 #ifdef AVX512
@@ -201,7 +201,7 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
 #if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_set_ps(b,a,b,a,b,a,b,a);
 #endif
-#ifdef SSE2
+#ifdef SSE4
            ret.v = _mm_set_ps(a,b,a,b);
 #endif
 #ifdef AVX512
@@ -213,7 +213,11 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
        }
       friend inline ComplexF Reduce(const vComplexF & in)
       {
 #ifdef SSE4
 #error
 #endif
 #if defined (AVX1) || defined(AVX2)
 	 // FIXME this is inefficient and use 
         __attribute__ ((aligned(32))) float c_[8];
         _mm256_store_ps(c_,in.v);
         return ComplexF(c_[0]+c_[2]+c_[4]+c_[6],c_[1]+c_[3]+c_[5]+c_[7]);
@@ -305,7 +309,7 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
             tmp = _mm256_addsub_ps(ret.v,_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1))); // ymm1 <- br,bi
             ret.v=_mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
 #endif
-#ifdef SSE2
+#ifdef SSE4
            ret.v = _mm_addsub_ps(ret.v,in.v);
 #endif
 #ifdef AVX512
--- a/Grid_vInteger.h
+++ b/Grid_vInteger.h
@@ -48,7 +48,7 @@ namespace Grid {
 #if defined (AVX2)
            ret.v = _mm256_add_epi32(a.v,b.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
            ret.v = _mm_add_epi32(a.v,b.v);
 #endif
 #ifdef AVX512
@@ -78,7 +78,7 @@ namespace Grid {
 #if defined (AVX2)
            ret.v = _mm256_sub_epi32(a.v,b.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
            ret.v = _mm_sub_epi32(a.v,b.v);
 #endif
 #ifdef AVX512
@@ -108,7 +108,7 @@ namespace Grid {
 #if defined (AVX2)
            ret.v = _mm256_mul_epi32(a.v,b.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
            ret.v = _mm_mul_epi32(a.v,b.v);
 #endif
 #ifdef AVX512
@@ -147,7 +147,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_set1_epi32(a);
 #endif
-#ifdef SSE2
+#ifdef SSE4
            ret.v = _mm_set1_epi32(a);
 #endif
 #ifdef AVX512
@@ -161,7 +161,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
 	  ret.v = _mm256_set_epi32(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
 #endif
-#ifdef SSE2
+#ifdef SSE4
 	  ret.v = _mm_set_epi32(a[0],a[1],a[2],a[3]);
 #endif
 #ifdef AVX512
@@ -177,8 +177,8 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
 	  _mm256_store_si256((__m256i*)a,ret.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
-	  _mm_store_si128(a,ret.v);
+	  _mm_store_si128((__m128i *)a,ret.v);
 #endif
 #ifdef AVX512
 	  _mm512_store_si512(a,ret.v);
--- a/Grid_vRealD.h
+++ b/Grid_vRealD.h
@@ -24,7 +24,7 @@ namespace Grid {
        friend inline vRealD conj(const vRealD &in){ return in; }
        friend inline void mac (vRealD &y,const vRealD a,const vRealD x){
-#if defined (AVX1) || defined (SSE2)
+#if defined (AVX1) || defined (SSE4)
            y = a*x+y;
 #endif
 #ifdef AVX2     // AVX 2 introduced FMA support. FMA4 eliminates a copy, but AVX only has FMA3
@@ -55,7 +55,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_add_pd(a.v,b.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
            ret.v = _mm_add_pd(a.v,b.v);
 #endif
 #ifdef AVX512
@@ -72,7 +72,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_sub_pd(a.v,b.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
            ret.v = _mm_sub_pd(a.v,b.v);
 #endif
 #ifdef AVX512
@@ -90,7 +90,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_mul_pd(a.v,b.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
            ret.v = _mm_mul_pd(a.v,b.v);
 #endif
 #ifdef AVX512
@@ -133,7 +133,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_set_pd(a,a,a,a);
 #endif
-#ifdef SSE2
+#ifdef SSE4
            ret.v = _mm_set_pd(a,a);
 #endif
 #ifdef AVX512
@@ -147,7 +147,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_set_pd(a[3],a[2],a[1],a[0]);
 #endif
-#ifdef SSE2
+#ifdef SSE4
            ret.v = _mm_set_pd(a[0],a[1]);
 #endif
 #ifdef AVX512
@@ -163,7 +163,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
            _mm256_store_pd(a,ret.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
            _mm_store_pd(a,ret.v);
 #endif
 #ifdef AVX512
--- a/Grid_vRealF.h
+++ b/Grid_vRealF.h
@@ -26,7 +26,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_add_ps(a.v,b.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
            ret.v = _mm_add_ps(a.v,b.v);
 #endif
 #ifdef AVX512
@@ -48,7 +48,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_sub_ps(a.v,b.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
            ret.v = _mm_sub_ps(a.v,b.v);
 #endif
 #ifdef AVX512
@@ -70,7 +70,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_mul_ps(a.v,b.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
            ret.v = _mm_mul_ps(a.v,b.v);
 #endif
 #ifdef AVX512
@@ -96,7 +96,7 @@ namespace Grid {
        friend inline vRealF conj(const vRealF &in){ return in; }
        friend inline void mac (vRealF &y,const vRealF a,const vRealF x){
-#if defined (AVX1) || defined (SSE2)
+#if defined (AVX1) || defined (SSE4)
            y = a*x+y;
 #endif
 #ifdef AVX2     // AVX 2 introduced FMA support. FMA4 eliminates a copy, but AVX only has FMA3
@@ -158,7 +158,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_set_ps(a,a,a,a,a,a,a,a);
 #endif
-#ifdef SSE2
+#ifdef SSE4
            ret.v = _mm_set_ps(a,a,a,a);
 #endif
 #ifdef AVX512
@@ -175,7 +175,7 @@ namespace Grid {
 #if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_set_ps(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
 #endif
-#ifdef SSE2
+#ifdef SSE4
            ret.v = _mm_set_ps(a[0],a[1],a[2],a[3]);
 #endif
 #ifdef AVX512
@@ -195,7 +195,7 @@ friend inline void vstore(const vRealF &ret, float *a){
 #if defined (AVX1)|| defined (AVX2)
 	_mm256_store_ps(a,ret.v);
 #endif
-#ifdef SSE2
+#ifdef SSE4
 	_mm_store_ps(a,ret.v);
 #endif
 #ifdef AVX512
--- a/test_Grid_stencil.cc
+++ b/test_Grid_stencil.cc
@@ -29,14 +29,14 @@ int main (int argc, char ** argv)
 #ifdef AVX512
    simd_layout[0] = 1;
-    simd_layout[1] = 1;
+    simd_layout[1] = 2;
    simd_layout[2] = 2;
    simd_layout[3] = 2;
 #endif
 #if defined (AVX1)|| defined (AVX2)
    simd_layout[0] = 1;
    simd_layout[1] = 1;
-    simd_layout[2] = 1;
+    simd_layout[2] = 2;
    simd_layout[3] = 2;
 #endif
 #if defined (SSE2)