Corrected bug in integer multiplications for SSE4 and AVX2

Merge remote-tracking branch 'upstream/master' Conflicts: tests/Make.inc
2025-06-14 13:57:07 +01:00 · 2015-06-16 23:34:45 +09:00
parent 1f2cf5cff4 ae0873bc77
commit c9018d74ac
37 changed files with 1341 additions and 515 deletions
--- a/lib/tensors/Tensor_Ta.h
+++ b/lib/tensors/Tensor_Ta.h
@ -102,10 +102,10 @@ namespace Grid {
    }

  template<class vtype,int N, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0 >::type * =nullptr> 
-    inline auto Determinant(const iMatrix<vtype,N> &arg)-> iScalar<decltype(Determinant(arg._internal[0][0]))>
+    inline iScalar<vtype> Determinant(const iMatrix<vtype,N> &arg)
    {
      iMatrix<vtype,N> ret(arg);
-      iScalar<decltype(Determinant(arg._internal[0][0]))> det = 1.0;
+      iScalar<vtype> det = vtype(1.0);
      /* Conversion of matrix to upper triangular */
      for(int i = 0; i < N; i++){
        for(int j = 0; j < N; j++){
--- a/lib/tensors/Tensor_arith_scalar.h
+++ b/lib/tensors/Tensor_arith_scalar.h
@ -9,12 +9,12 @@ namespace Grid {
 //////////////////////////////////////////////////////////////////////////////////////////

 // multiplication by fundamental scalar type
-template<class l,int N> strong_inline iScalar<l> operator * (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs) 
+template<class l> strong_inline iScalar<l> operator * (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs) 
 {
  typename iScalar<l>::tensor_reduced srhs; srhs=rhs;
  return lhs*srhs;
 }
-template<class l,int N> strong_inline iScalar<l> operator * (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) {  return rhs*lhs; }
+template<class l> strong_inline iScalar<l> operator * (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) {  return rhs*lhs; }

 template<class l,int N> strong_inline iVector<l,N> operator * (const iVector<l,N>& lhs,const typename iScalar<l>::scalar_type rhs) 
 {
@ -118,12 +118,12 @@ template<class l,int N> strong_inline iMatrix<l,N> operator * (Integer lhs,const
 ///////////////////////////////////////////////////////////////////////////////////////////////
 // addition by fundamental scalar type applies to matrix(down diag) and scalar
 ///////////////////////////////////////////////////////////////////////////////////////////////
-template<class l,int N> strong_inline iScalar<l> operator + (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs) 
+template<class l> strong_inline iScalar<l> operator + (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs) 
 {
  typename iScalar<l>::tensor_reduced srhs; srhs=rhs;
  return lhs+srhs;
 }
-template<class l,int N> strong_inline iScalar<l> operator + (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) {  return rhs+lhs; }
+template<class l> strong_inline iScalar<l> operator + (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) {  return rhs+lhs; }

 template<class l,int N> strong_inline iMatrix<l,N> operator + (const iMatrix<l,N>& lhs,const typename iScalar<l>::scalar_type rhs) 
 {
@ -176,12 +176,12 @@ template<class l,int N> strong_inline iMatrix<l,N> operator + (Integer lhs,const
 ///////////////////////////////////////////////////////////////////////////////////////////////
 // subtraction of fundamental scalar type applies to matrix(down diag) and scalar
 ///////////////////////////////////////////////////////////////////////////////////////////////
-template<class l,int N> strong_inline iScalar<l> operator - (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs) 
+template<class l> strong_inline iScalar<l> operator - (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs) 
 {
  typename iScalar<l>::tensor_reduced srhs; srhs=rhs;
  return lhs-srhs;
 }
-template<class l,int N> strong_inline iScalar<l> operator - (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) 
+template<class l> strong_inline iScalar<l> operator - (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) 
 {
  typename iScalar<l>::tensor_reduced slhs;slhs=lhs;
  return slhs-rhs;
--- a/lib/tensors/Tensor_class.h
+++ b/lib/tensors/Tensor_class.h
@ -23,13 +23,17 @@ template<class vtype> class iScalar
 public:
  vtype _internal;

-  typedef typename GridTypeMapper<vtype>::scalar_type   scalar_type;
+  typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
  typedef typename GridTypeMapper<vtype>::vector_type vector_type;
  typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
  typedef iScalar<tensor_reduced_v> tensor_reduced;
  typedef typename GridTypeMapper<vtype>::scalar_object recurse_scalar_object;
  typedef iScalar<recurse_scalar_object> scalar_object;

+  // substitutes a real or complex version with same tensor structure
+  typedef iScalar<typename GridTypeMapper<vtype>::Complexified > Complexified;
+  typedef iScalar<typename GridTypeMapper<vtype>::Realified >    Realified;
+
  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};

  // Scalar no action
@ -86,9 +90,19 @@ public:
  strong_inline const vtype & operator ()(void) const {
    return _internal;
  }
-  
-  operator ComplexD () const { return(TensorRemove(_internal)); };
-  operator RealD () const { return(real(TensorRemove(_internal))); }
+
+  // Type casts, enabled per scalar type via template metaprogramming (IfComplex/IfReal/IfInteger, IfNotSimd)
+  template<class U=vtype,class V=scalar_type,IfComplex<V> = 0,IfNotSimd<U> = 0>  
+    operator ComplexF () const { return(TensorRemove(_internal)); };
+  template<class U=vtype,class V=scalar_type,IfComplex<V> = 0,IfNotSimd<U> = 0>  
+    operator ComplexD () const { return(TensorRemove(_internal)); };
+  template<class U=vtype,class V=scalar_type,IfComplex<V> = 0,IfNotSimd<U> = 0>  
+    operator RealD () const { return(real(TensorRemove(_internal))); }
+  template<class U=vtype,class V=scalar_type,IfReal<V>    = 0,IfNotSimd<U> = 0>  
+    operator RealD    () const { return TensorRemove(_internal); }
+  template<class U=vtype,class V=scalar_type,IfInteger<V> = 0,IfNotSimd<U> = 0>  
+    operator Integer  () const { return Integer(TensorRemove(_internal)); }
+
  
  // convert from a something to a scalar via constructor of something arg
  template<class T,typename std::enable_if<!isGridTensor<T>::value, T>::type* = nullptr > strong_inline iScalar<vtype> operator = (T arg)
@ -123,6 +137,10 @@ public:
  typedef iScalar<tensor_reduced_v> tensor_reduced;
  typedef iVector<recurse_scalar_object,N> scalar_object;

+  // substitutes a real or complex version with same tensor structure
+  typedef iVector<typename GridTypeMapper<vtype>::Complexified,N > Complexified;
+  typedef iVector<typename GridTypeMapper<vtype>::Realified,N >    Realified;
+
  template<class T,typename std::enable_if<!isGridTensor<T>::value, T>::type* = nullptr > strong_inline auto operator = (T arg) -> iVector<vtype,N>
    { 
      zeroit(*this);
@ -211,6 +229,12 @@ public:
  typedef typename GridTypeMapper<vtype>::vector_type vector_type;
  typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
  typedef typename GridTypeMapper<vtype>::scalar_object recurse_scalar_object;
+
+  // substitutes a real or complex version with same tensor structure
+  typedef iMatrix<typename GridTypeMapper<vtype>::Complexified,N > Complexified;
+  typedef iMatrix<typename GridTypeMapper<vtype>::Realified,N >    Realified;
+
+  // Tensor removal
  typedef iScalar<tensor_reduced_v> tensor_reduced;
  typedef iMatrix<recurse_scalar_object,N> scalar_object;

--- a/lib/tensors/Tensor_extract_merge.h
+++ b/lib/tensors/Tensor_extract_merge.h
@ -31,18 +31,17 @@ inline void merge(typename std::enable_if<!isGridTensor<vsimd>::value, vsimd >::
 		  std::vector<scalar *> &extracted,int offset){
  int Nextr=extracted.size();
  int Nsimd=vsimd::Nsimd();
-  int s=Nsimd/Nextr;
-
+  int s=Nsimd/Nextr; // can have sparse occupation of simd vector if simd_layout does not fill it
+                     // replicate n-fold. Used to allow Integer masks to
+                     // predicate floating-point assignments of various widths and remain conformable.
  scalar *buf =(scalar *) y;
  for(int i=0;i<Nextr;i++){
    for(int ii=0;ii<s;ii++){
      buf[i*s+ii]=extracted[i][offset];
    }
  }
-
 };

-
 ////////////////////////////////////////////////////////////////////////////////////////////////
 // Extract a fundamental vector type to scalar array 
 ////////////////////////////////////////////////////////////////////////////////////////////////
@ -55,8 +54,17 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v

  scalar *buf = (scalar *)&y;
  for(int i=0;i<Nextr;i++){
-    for(int ii=0;ii<s;ii++){
-      extracted[i]=buf[i*s+ii];
+    extracted[i]=buf[i*s];
+    for(int ii=1;ii<s;ii++){
+      if ( buf[i*s]!=buf[i*s+ii] ){
+	std::cout << " SIMD extract failure splat = "<<s<<" ii "<<ii<<" " <<Nextr<<" "<< Nsimd<<" "<<std::endl;
+	for(int vv=0;vv<Nsimd;vv++) {
+	  std::cout<< buf[vv]<<" ";
+	}
+	std::cout<<std::endl;
+	assert(0);
+      }
+      assert(buf[i*s]==buf[i*s+ii]);
    }
  }

@ -74,21 +82,7 @@ inline void merge(typename std::enable_if<!isGridTensor<vsimd>::value, vsimd >::

  for(int i=0;i<Nextr;i++){
    for(int ii=0;ii<s;ii++){
-      buf[i*s+ii]=extracted[i];
-    }
-  }
-
-};
-template<class vsimd,class scalar>
-inline void AmergeA(typename std::enable_if<!isGridTensor<vsimd>::value, vsimd >::type  &y,std::vector<scalar> &extracted){
-  int Nextr=extracted.size();
-  int Nsimd=vsimd::Nsimd();
-  int s=Nsimd/Nextr;
-
-  scalar *buf = (scalar *)&y;
-  for(int i=0;i<Nextr;i++){
-    for(int ii=0;ii<s;ii++){
-      buf[i*s+ii]=extracted[i];
+      buf[i*s+ii]=extracted[i]; // replicates value
    }
  }
 };
@ -102,12 +96,12 @@ template<class vobj> inline void extract(const vobj &vec,std::vector<typename vo
  typedef typename vobj::vector_type vector_type ;

  const int Nsimd=vobj::vector_type::Nsimd();
+  int Nextr=extracted.size();
  const int words=sizeof(vobj)/sizeof(vector_type);
+  int s=Nsimd/Nextr;

-  extracted.resize(Nsimd);
-
-  std::vector<scalar_type *> pointers(Nsimd);
-  for(int i=0;i<Nsimd;i++) 
+  std::vector<scalar_type *> pointers(Nextr);
+  for(int i=0;i<Nextr;i++) 
    pointers[i] =(scalar_type *)& extracted[i];

  vector_type *vp = (vector_type *)&vec;
@ -127,11 +121,11 @@ void extract(const vobj &vec,std::vector<typename vobj::scalar_object *> &extrac

  const int words=sizeof(vobj)/sizeof(vector_type);
  const int Nsimd=vobj::vector_type::Nsimd();
-
-  assert(extracted.size()==Nsimd);
+  int Nextr=extracted.size();
+  int s = Nsimd/Nextr;

  std::vector<scalar_type *> pointers(Nsimd);
-  for(int i=0;i<Nsimd;i++) {
+  for(int i=0;i<Nextr;i++) {
    pointers[i] =(scalar_type *)& extracted[i][offset];
  }

@ -153,10 +147,11 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object> &extracted)
  const int Nsimd=vobj::vector_type::Nsimd();
  const int words=sizeof(vobj)/sizeof(vector_type);

-  assert(extracted.size()==Nsimd);
+  int Nextr = extracted.size();
+  int splat=Nsimd/Nextr;

-  std::vector<scalar_type *> pointers(Nsimd);
-  for(int i=0;i<Nsimd;i++) 
+  std::vector<scalar_type *> pointers(Nextr);
+  for(int i=0;i<Nextr;i++) 
    pointers[i] =(scalar_type *)& extracted[i];
  
  vector_type *vp = (vector_type *)&vec;
@ -177,14 +172,14 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
  const int Nsimd=vobj::vector_type::Nsimd();
  const int words=sizeof(vobj)/sizeof(vector_type);

-  assert(extracted.size()==Nsimd);
+  int Nextr=extracted.size();

-  std::vector<scalar_type *> pointers(Nsimd);
-  for(int i=0;i<Nsimd;i++) 
+  std::vector<scalar_type *> pointers(Nextr);
+  for(int i=0;i<Nextr;i++) 
    pointers[i] =(scalar_type *)& extracted[i][offset];
-  
+
  vector_type *vp = (vector_type *)&vec;
-  assert((void *)vp!=NULL);
+
  for(int w=0;w<words;w++){
    merge<vector_type,scalar_type>(&vp[w],pointers,w);
  }
--- a/lib/tensors/Tensor_inner.h
+++ b/lib/tensors/Tensor_inner.h
@ -10,7 +10,8 @@ namespace Grid {
      typedef typename sobj::scalar_type scalar;
      decltype(innerProduct(arg,arg)) nrm;
      nrm = innerProduct(arg,arg);
-      return real(nrm);
+      RealD ret = real(nrm);
+      return ret;
    }

    template<class l,class r,int N> inline
--- a/lib/tensors/Tensor_logical.h
+++ b/lib/tensors/Tensor_logical.h
@ -0,0 +1,32 @@
+#ifndef GRID_TENSOR_LOGICAL_H
+#define GRID_TENSOR_LOGICAL_H
+
+namespace Grid {
+// LOGICAL_BINOP(Op): defines operator Op for (iScalar,iScalar), (iScalar,Integer) and (Integer,iScalar); an Integer operand is converted to scalar_type, then to tensor_reduced, before Op is applied.
+#define LOGICAL_BINOP(Op)\
+template<class v> strong_inline iScalar<v> operator Op (const iScalar<v>& lhs,const iScalar<v>& rhs) \
+{\
+  iScalar<v> ret;\
+  ret._internal = lhs._internal Op rhs._internal ;\
+  return ret;\
+}\
+template<class l> strong_inline iScalar<l> operator Op (const iScalar<l>& lhs,Integer rhs) \
+{\
+  typename iScalar<l>::scalar_type t; t=rhs;\
+  typename iScalar<l>::tensor_reduced srhs; srhs=t;\
+  return lhs Op srhs;\
+}\
+template<class l> strong_inline iScalar<l> operator Op (Integer lhs,const iScalar<l>& rhs) \
+{\
+  typename iScalar<l>::scalar_type t;t=lhs;\
+  typename iScalar<l>::tensor_reduced slhs;slhs=t;\
+  return slhs Op rhs;\
+}
+// Instantiate the overloads for |, &, || and &&. Overloaded || and && do not short-circuit: both operands are always evaluated.
+LOGICAL_BINOP(|);
+LOGICAL_BINOP(&);
+LOGICAL_BINOP(||);
+LOGICAL_BINOP(&&);
+
+}
+#endif
--- a/lib/tensors/Tensor_traits.h
+++ b/lib/tensors/Tensor_traits.h
@ -26,6 +26,8 @@ namespace Grid {
    typedef typename T::vector_type vector_type;
    typedef typename T::tensor_reduced tensor_reduced;
    typedef typename T::scalar_object scalar_object;
+    typedef typename T::Complexified Complexified;
+    typedef typename T::Realified Realified;
    enum { TensorLevel = T::TensorLevel };
  };

@ -38,6 +40,8 @@ namespace Grid {
    typedef RealF vector_type;
    typedef RealF tensor_reduced ;
    typedef RealF scalar_object;
+    typedef ComplexF Complexified;
+    typedef RealF Realified;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<RealD> {
@ -46,6 +50,8 @@ namespace Grid {
    typedef RealD vector_type;
    typedef RealD tensor_reduced;
    typedef RealD scalar_object;
+    typedef ComplexD Complexified;
+    typedef RealD Realified;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<ComplexF> {
@ -54,6 +60,8 @@ namespace Grid {
    typedef ComplexF vector_type;
    typedef ComplexF tensor_reduced;
    typedef ComplexF scalar_object;
+    typedef ComplexF Complexified;
+    typedef RealF Realified;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<ComplexD> {
@ -62,6 +70,8 @@ namespace Grid {
    typedef ComplexD vector_type;
    typedef ComplexD tensor_reduced;
    typedef ComplexD scalar_object;
+    typedef ComplexD Complexified;
+    typedef RealD Realified;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<Integer> {
@ -70,6 +80,8 @@ namespace Grid {
    typedef Integer vector_type;
    typedef Integer tensor_reduced;
    typedef Integer scalar_object;
+    typedef void Complexified;
+    typedef void Realified;
    enum { TensorLevel = 0 };
  };

@ -79,6 +91,8 @@ namespace Grid {
    typedef vRealF vector_type;
    typedef vRealF tensor_reduced;
    typedef RealF  scalar_object;
+    typedef vComplexF Complexified;
+    typedef vRealF Realified;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<vRealD> {
@ -87,6 +101,8 @@ namespace Grid {
    typedef vRealD vector_type;
    typedef vRealD tensor_reduced;
    typedef RealD  scalar_object;
+    typedef vComplexD Complexified;
+    typedef vRealD Realified;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<vComplexF> {
@ -95,6 +111,8 @@ namespace Grid {
    typedef vComplexF vector_type;
    typedef vComplexF tensor_reduced;
    typedef ComplexF  scalar_object;
+    typedef vComplexF Complexified;
+    typedef vRealF Realified;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<vComplexD> {
@ -103,6 +121,8 @@ namespace Grid {
    typedef vComplexD vector_type;
    typedef vComplexD tensor_reduced;
    typedef ComplexD  scalar_object;
+    typedef vComplexD Complexified;
+    typedef vRealD Realified;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<vInteger> {
@ -111,6 +131,8 @@ namespace Grid {
    typedef vInteger vector_type;
    typedef vInteger tensor_reduced;
    typedef  Integer scalar_object;
+    typedef void Complexified;
+    typedef void Realified;
    enum { TensorLevel = 0 };
  };

--- a/lib/tensors/Tensor_unary.h
+++ b/lib/tensors/Tensor_unary.h
@ -2,7 +2,7 @@
 #define GRID_TENSOR_UNARY_H
 namespace Grid {

-#define UNARY_REAL(func)\
+#define UNARY(func)\
 template<class obj> inline auto func(const iScalar<obj> &z) -> iScalar<obj>\
 {\
    iScalar<obj> ret;\
@ -53,14 +53,71 @@ template<class obj> inline iScalar<obj> func(const iScalar<obj> &z,scal y)	\
    return ret;\
 }

-UNARY_REAL(sqrt);
-UNARY_REAL(rsqrt);
-UNARY_REAL(sin);
-UNARY_REAL(cos);
+UNARY(sqrt);
+UNARY(rsqrt);
+UNARY(sin);
+UNARY(cos);
+UNARY(log);
+UNARY(exp);
+UNARY(abs);
+UNARY(Not);
+
+// toReal on iScalar: applies toReal to the wrapped value and returns the result in the Realified scalar type.
+template<class obj> inline auto toReal(const iScalar<obj> &z) -> typename iScalar<obj>::Realified
+{
+  typename iScalar<obj>::Realified ret;
+  ret._internal = toReal(z._internal);
+  return ret;
+}
+ template<class obj,int N> inline auto toReal(const iVector<obj,N> &z) -> typename iVector<obj,N>::Realified
+{ // iVector overload: converts the N components one by one into the Realified vector type
+  typename iVector<obj,N>::Realified ret;
+  for(int c1=0;c1<N;c1++){  
+    ret._internal[c1] = toReal(z._internal[c1]); // dispatches to the toReal overload for element type obj
+  }
+  return ret;
+}
+template<class obj,int N> inline auto toReal(const iMatrix<obj,N> &z) -> typename iMatrix<obj,N>::Realified
+{ // iMatrix overload: converts all N x N entries into the Realified matrix type
+  typename iMatrix<obj,N>::Realified ret;
+  for(int c1=0;c1<N;c1++){
+  for(int c2=0;c2<N;c2++){
+    ret._internal[c1][c2] = toReal(z._internal[c1][c2]); // dispatches to the toReal overload for element type obj
+  }}
+  return ret;
+}
+// toComplex on iScalar: applies toComplex to the wrapped value and returns the result in the Complexified scalar type.
+template<class obj> inline auto toComplex(const iScalar<obj> &z) -> typename iScalar<obj>::Complexified
+{
+  typename iScalar<obj>::Complexified ret;
+  ret._internal = toComplex(z._internal);
+  return ret;
+}
+ template<class obj,int N> inline auto toComplex(const iVector<obj,N> &z) -> typename iVector<obj,N>::Complexified
+{ // iVector overload: converts the N components one by one into the Complexified vector type
+  typename iVector<obj,N>::Complexified ret;
+  for(int c1=0;c1<N;c1++){  
+    ret._internal[c1] = toComplex(z._internal[c1]); // dispatches to the toComplex overload for element type obj
+  }
+  return ret;
+}
+template<class obj,int N> inline auto toComplex(const iMatrix<obj,N> &z) -> typename iMatrix<obj,N>::Complexified
+{ // iMatrix overload: converts all N x N entries into the Complexified matrix type
+  typename iMatrix<obj,N>::Complexified ret;
+  for(int c1=0;c1<N;c1++){
+  for(int c2=0;c2<N;c2++){
+    ret._internal[c1][c2] = toComplex(z._internal[c1][c2]); // dispatches to the toComplex overload for element type obj
+  }}
+  return ret;
+}
+

 BINARY_RSCALAR(mod,Integer);
 BINARY_RSCALAR(pow,RealD);

+#undef UNARY
+#undef BINARY_RSCALAR
+

 }
 #endif