Accelerator markup of entire tensor space for offload

2025-12-21 21:24:30 +00:00 · 2018-01-24 13:27:30 +00:00
parent 69327db9a9
commit 8e99264f40
18 changed files with 614 additions and 661 deletions
--- a/lib/tensors/Tensor_class.h
+++ b/lib/tensors/Tensor_class.h
@@ -65,126 +65,111 @@ public:
  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1 };

  // Scalar no action
-  //  template<int Level> using tensor_reduce_level = typename
-  //  iScalar<GridTypeMapper<vtype>::tensor_reduce_level<Level> >;
-  iScalar() = default;
-  /*
-    iScalar(const iScalar<vtype> &copyme)=default;
-    iScalar(iScalar<vtype> &&copyme)=default;
-    iScalar<vtype> & operator= (const iScalar<vtype> &copyme) = default;
-    iScalar<vtype> & operator= (iScalar<vtype> &&copyme) = default;
-  */
+  accelerator iScalar() = default;

-  //  template<int N=0>
-  //  iScalar(EnableIf<isSIMDvectorized<vector_type>, vector_type> s) : _internal(s){};  // recurse down and hit the constructor for vector_type
-
-  iScalar(scalar_type s) : _internal(s){};  // recurse down and hit the constructor for vector_type
-
-  iScalar(const Zero &z) { *this = zero; };
-
-  iScalar<vtype> &operator=(const Zero &hero) {
-    zeroit(*this);
-    return *this;
-  }
-  friend strong_inline void vstream(iScalar<vtype> &out,
-                                    const iScalar<vtype> &in) {
-    vstream(out._internal, in._internal);
-  }
-  friend strong_inline void vbroadcast(iScalar<vtype> &out,const iScalar<vtype> &in,int lane){
-    vbroadcast(out._internal,in._internal,lane);
-  }
-  friend strong_inline void zeroit(iScalar<vtype> &that){
+  friend accelerator_inline void zeroit(iScalar<vtype> &that){
    zeroit(that._internal);
  }
-  friend strong_inline void prefetch(iScalar<vtype> &that) {
+
+  accelerator_inline iScalar(scalar_type s) : _internal(s){};  // recurse down and hit the constructor for vector_type
+
+  accelerator_inline iScalar(const Zero &z) { zeroit(*this); };
+
+  accelerator_inline iScalar<vtype> &operator=(const Zero &hero) {
+    zeroit(*this);  return *this;
+  }
+  friend accelerator_inline void vstream(iScalar<vtype> &out, const iScalar<vtype> &in) {
+    vstream(out._internal, in._internal);
+  }
+  friend accelerator_inline void vbroadcast(iScalar<vtype> &out,const iScalar<vtype> &in,int lane){
+    vbroadcast(out._internal,in._internal,lane);
+  }
+  friend accelerator_inline void prefetch(iScalar<vtype> &that) {
    prefetch(that._internal);
  }
-  friend strong_inline void permute(iScalar<vtype> &out,
-                                    const iScalar<vtype> &in, int permutetype) {
+  friend accelerator_inline void permute(iScalar<vtype> &out, const iScalar<vtype> &in, int permutetype) {
    permute(out._internal, in._internal, permutetype);
  }
-  friend strong_inline void rotate(iScalar<vtype> &out,const iScalar<vtype> &in,int rot){
+  friend accelerator_inline void rotate(iScalar<vtype> &out,const iScalar<vtype> &in,int rot){
    rotate(out._internal,in._internal,rot);
  }
-  friend strong_inline void exchange(iScalar<vtype> &out1,iScalar<vtype> &out2,
-				     const iScalar<vtype> &in1,const iScalar<vtype> &in2,int type){
-    exchange(out1._internal,out2._internal,
-	     in1._internal, in2._internal,type);
+  friend accelerator_inline void exchange(iScalar<vtype> &out1,iScalar<vtype> &out2,
+				     const iScalar<vtype> &in1,const iScalar<vtype> &in2,int type)
+  {
+    exchange(out1._internal,out2._internal,in1._internal, in2._internal,type);
  }

  // Unary negation
-  friend strong_inline iScalar<vtype> operator-(const iScalar<vtype> &r) {
+  friend accelerator_inline iScalar<vtype> operator-(const iScalar<vtype> &r) {
    iScalar<vtype> ret;
    ret._internal = -r._internal;
    return ret;
  }
  // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour
-  strong_inline iScalar<vtype> &operator*=(const iScalar<vtype> &r) {
+  accelerator_inline iScalar<vtype> &operator*=(const iScalar<vtype> &r) {
    *this = (*this) * r;
    return *this;
  }
-  strong_inline iScalar<vtype> &operator-=(const iScalar<vtype> &r) {
+  accelerator_inline iScalar<vtype> &operator-=(const iScalar<vtype> &r) {
    *this = (*this) - r;
    return *this;
  }
-  strong_inline iScalar<vtype> &operator+=(const iScalar<vtype> &r) {
+  accelerator_inline iScalar<vtype> &operator+=(const iScalar<vtype> &r) {
    *this = (*this) + r;
    return *this;
  }
-  strong_inline vtype &operator()(void) { return _internal; }
-  strong_inline const vtype &operator()(void) const { return _internal; }
+  accelerator_inline vtype &operator()(void) { return _internal; }
+  accelerator_inline const vtype &operator()(void) const { return _internal; }

  // Type casts meta programmed, must be pure scalar to match TensorRemove
-  template <class U = vtype, class V = scalar_type, IfComplex<V> = 0, IfNotSimd<U> = 0>
+  template <class U = vtype, class V = scalar_type, IfComplex<V> = 0, IfNotSimd<U> = 0> accelerator_inline
  operator ComplexF() const {
    return (TensorRemove(_internal));
-  };
-  template <class U = vtype, class V = scalar_type, IfComplex<V> = 0, IfNotSimd<U> = 0>
+  }
+  template <class U = vtype, class V = scalar_type, IfComplex<V> = 0, IfNotSimd<U> = 0> accelerator_inline
  operator ComplexD() const {
    return (TensorRemove(_internal));
-  };
-  //  template<class U=vtype,class V=scalar_type,IfComplex<V> = 0,IfNotSimd<U> =
-  //  0> operator RealD    () const { return(real(TensorRemove(_internal))); }
-  template <class U = vtype, class V = scalar_type, IfReal<V> = 0,IfNotSimd<U> = 0>
+  }
+  template <class U = vtype, class V = scalar_type, IfReal<V> = 0,IfNotSimd<U> = 0> accelerator_inline
  operator RealD() const {
    return TensorRemove(_internal);
  }
-  template <class U = vtype, class V = scalar_type, IfInteger<V> = 0, IfNotSimd<U> = 0>
+  template <class U = vtype, class V = scalar_type, IfInteger<V> = 0, IfNotSimd<U> = 0> accelerator_inline
  operator Integer() const {
    return Integer(TensorRemove(_internal));
  }

  // convert from a something to a scalar via constructor of something arg
  template <class T, typename std::enable_if<!isGridTensor<T>::value, T>::type * = nullptr>
-  strong_inline iScalar<vtype> operator=(T arg) {
+  accelerator_inline iScalar<vtype> operator=(T arg) {
    _internal = arg;
    return *this;
  }

  // Convert elements
  template <class ttype>
-  strong_inline iScalar<vtype> operator=(iScalar<ttype> &&arg) {
+  accelerator_inline iScalar<vtype> operator=(iScalar<ttype> &&arg) {
    _internal = arg._internal;
    return *this;
  }

+  // Host only
  friend std::ostream &operator<<(std::ostream &stream,const iScalar<vtype> &o) {
    stream << "S {" << o._internal << "}";
    return stream;
  };
-
-
 };
+
 ///////////////////////////////////////////////////////////
 // Allows to turn scalar<scalar<scalar<double>>>> back to double.
 ///////////////////////////////////////////////////////////
 template <class T>
-strong_inline typename std::enable_if<!isGridTensor<T>::value, T>::type
+accelerator_inline typename std::enable_if<!isGridTensor<T>::value, T>::type
 TensorRemove(T arg) {
  return arg;
 }
 template <class vtype>
-strong_inline auto TensorRemove(iScalar<vtype> arg)
+accelerator_inline auto TensorRemove(iScalar<vtype> arg)
  -> decltype(TensorRemove(arg._internal)) {
  return TensorRemove(arg._internal);
 }
@@ -210,88 +195,80 @@ public:
  // get double precision version
  typedef iVector<typename GridTypeMapper<vtype>::DoublePrecision, N> DoublePrecision;
  
-  template <class T, typename std::enable_if<!isGridTensor<T>::value, T>::type
-	    * = nullptr>
-  strong_inline auto operator=(T arg) -> iVector<vtype, N> {
+  template <class T, typename std::enable_if<!isGridTensor<T>::value, T>::type * = nullptr>
+  accelerator_inline auto operator=(T arg) -> iVector<vtype, N> {
    zeroit(*this);
    for (int i = 0; i < N; i++) _internal[i] = arg;
    return *this;
  }

  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1 };
-  iVector(const Zero &z) { *this = zero; };
-  iVector() = default;
-  /*
-    iVector(const iVector<vtype,N> &copyme)=default;
-    iVector(iVector<vtype,N> &&copyme)=default;
-    iVector<vtype,N> & operator= (const iVector<vtype,N> &copyme) = default;
-    iVector<vtype,N> & operator= (iVector<vtype,N> &&copyme) = default;
-  */
+  accelerator_inline iVector(const Zero &z) { zeroit(*this); };
+  accelerator iVector() = default;

-  iVector<vtype, N> &operator=(const Zero &hero) {
+  accelerator_inline iVector<vtype, N> &operator=(const Zero &hero) {
    zeroit(*this);
    return *this;
  }
-  friend strong_inline void zeroit(iVector<vtype, N> &that) {
+  friend accelerator_inline void zeroit(iVector<vtype, N> &that) {
    for (int i = 0; i < N; i++) {
      zeroit(that._internal[i]);
    }
  }
-  friend strong_inline void prefetch(iVector<vtype, N> &that) {
+  friend accelerator_inline void prefetch(iVector<vtype, N> &that) {
    for (int i = 0; i < N; i++) prefetch(that._internal[i]);
  }
-  friend strong_inline void vstream(iVector<vtype, N> &out,
-                                    const iVector<vtype, N> &in) {
+  friend accelerator_inline void vstream(iVector<vtype, N> &out, const iVector<vtype, N> &in) {
    for (int i = 0; i < N; i++) {
      vstream(out._internal[i], in._internal[i]);
    }
  }
-  friend strong_inline void vbroadcast(iVector<vtype,N> &out,const iVector<vtype,N> &in,int lane){
+  friend accelerator_inline void vbroadcast(iVector<vtype,N> &out,const iVector<vtype,N> &in,int lane){
    for(int i=0;i<N;i++){
      vbroadcast(out._internal[i],in._internal[i],lane);
    }
  }
-  friend strong_inline void permute(iVector<vtype,N> &out,const iVector<vtype,N> &in,int permutetype){
+  friend accelerator_inline void permute(iVector<vtype,N> &out,const iVector<vtype,N> &in,int permutetype){
    for(int i=0;i<N;i++){
      permute(out._internal[i],in._internal[i],permutetype);
    }
  }
-  friend strong_inline void rotate(iVector<vtype,N> &out,const iVector<vtype,N> &in,int rot){
+  friend accelerator_inline void rotate(iVector<vtype,N> &out,const iVector<vtype,N> &in,int rot){
    for(int i=0;i<N;i++){
      rotate(out._internal[i],in._internal[i],rot);
    }
  }
-  friend strong_inline void exchange(iVector<vtype,N> &out1,iVector<vtype,N> &out2,
+  friend accelerator_inline void exchange(iVector<vtype,N> &out1,iVector<vtype,N> &out2,
 				     const iVector<vtype,N> &in1,const iVector<vtype,N> &in2,int type){
    for(int i=0;i<N;i++){
-      exchange(out1._internal[i],out2._internal[i],
-	       in1._internal[i], in2._internal[i],type);
+      exchange(out1._internal[i],out2._internal[i],in1._internal[i], in2._internal[i],type);
    }
  }

  // Unary negation
-  friend strong_inline iVector<vtype, N> operator-(const iVector<vtype, N> &r) {
+  friend accelerator_inline iVector<vtype, N> operator-(const iVector<vtype, N> &r) {
    iVector<vtype, N> ret;
    for (int i = 0; i < N; i++) ret._internal[i] = -r._internal[i];
    return ret;
  }
  // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour
-  strong_inline iVector<vtype, N> &operator*=(const iScalar<vtype> &r) {
+  accelerator_inline iVector<vtype, N> &operator*=(const iScalar<vtype> &r) {
    *this = (*this) * r;
    return *this;
  }
-  strong_inline iVector<vtype, N> &operator-=(const iVector<vtype, N> &r) {
+  accelerator_inline iVector<vtype, N> &operator-=(const iVector<vtype, N> &r) {
    *this = (*this) - r;
    return *this;
  }
-  strong_inline iVector<vtype, N> &operator+=(const iVector<vtype, N> &r) {
+  accelerator_inline iVector<vtype, N> &operator+=(const iVector<vtype, N> &r) {
    *this = (*this) + r;
    return *this;
  }
-  strong_inline vtype &operator()(int i) { return _internal[i]; }
-  strong_inline const vtype &operator()(int i) const { return _internal[i]; }
-  friend std::ostream &operator<<(std::ostream &stream,
-                                  const iVector<vtype, N> &o) {
+  accelerator_inline vtype &operator()(int i) { return _internal[i]; }
+  accelerator_inline const vtype &operator()(int i) const { return _internal[i]; }
+
+  // Host
+  friend std::ostream &operator<<(std::ostream &stream, const iVector<vtype, N> &o) {
    stream << "V<" << N << ">{";
    for (int i = 0; i < N; i++) {
      stream << o._internal[i];
@@ -300,9 +277,6 @@ public:
    stream << "}";
    return stream;
  };
-  //    strong_inline vtype && operator ()(int i) {
-  //      return _internal[i];
-  //    }
 };

 template <class vtype, int N>
@@ -330,147 +304,137 @@ public:

  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1 };

-  iMatrix(const Zero &z) { *this = zero; };
-  iMatrix() = default;
+  accelerator_inline iMatrix(const Zero &z) { zeroit(*this); };
+  accelerator iMatrix() = default;

-  iMatrix &operator=(const iMatrix &rhs) {
+  accelerator_inline iMatrix &operator=(const iMatrix &rhs) {
    for (int i = 0; i < N; i++)
-      for (int j = 0; j < N; j++) vstream(_internal[i][j], rhs._internal[i][j]);
+      for (int j = 0; j < N; j++) 
+	vstream(_internal[i][j], rhs._internal[i][j]);
    return *this;
  };

-  iMatrix(scalar_type s) {
+  accelerator_inline iMatrix(scalar_type s) {
    (*this) = s;
  };  // recurse down and hit the constructor for vector_type

-  /*
-    iMatrix(const iMatrix<vtype,N> &copyme)=default;
-    iMatrix(iMatrix<vtype,N> &&copyme)=default;
-    iMatrix<vtype,N> & operator= (const iMatrix<vtype,N> &copyme) = default;
-    iMatrix<vtype,N> & operator= (iMatrix<vtype,N> &&copyme) = default;
-  */
-
-  iMatrix<vtype, N> &operator=(const Zero &hero) {
+  accelerator_inline iMatrix<vtype, N> &operator=(const Zero &hero) {
    zeroit(*this);
    return *this;
  }
-  template <class T, typename std::enable_if<!isGridTensor<T>::value, T>::type
-	    * = nullptr>
-  strong_inline auto operator=(T arg) -> iMatrix<vtype, N> {
+  template <class T, typename std::enable_if<!isGridTensor<T>::value, T>::type * = nullptr>
+  accelerator_inline auto operator=(T arg) -> iMatrix<vtype, N> {
    zeroit(*this);
    for (int i = 0; i < N; i++) _internal[i][i] = arg;
    return *this;
  }

-friend strong_inline void zeroit(iMatrix<vtype,N> &that){
-  for(int i=0;i<N;i++){
-    for(int j=0;j<N;j++){
-      zeroit(that._internal[i][j]);
+  friend accelerator_inline void zeroit(iMatrix<vtype,N> &that){
+    for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	zeroit(that._internal[i][j]);
    }}
-}
-friend strong_inline void prefetch(iMatrix<vtype,N> &that){
-  for(int i=0;i<N;i++) 
-    for(int j=0;j<N;j++) 
-      prefetch(that._internal[i][j]);
-}
-friend strong_inline void vstream(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in){
-  for(int i=0;i<N;i++){
-    for(int j=0;j<N;j++){
-      vstream(out._internal[i][j],in._internal[i][j]);
+  }
+  friend accelerator_inline void prefetch(iMatrix<vtype,N> &that){
+    for(int i=0;i<N;i++) {
+      for(int j=0;j<N;j++) { 
+	prefetch(that._internal[i][j]);
    }}
-}
-friend strong_inline void vbroadcast(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in,int lane){
-  for(int i=0;i<N;i++){
-    for(int j=0;j<N;j++){
-      vbroadcast(out._internal[i][j],in._internal[i][j],lane);
+  }
+  friend accelerator_inline void vstream(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in){
+    for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	vstream(out._internal[i][j],in._internal[i][j]);
    }}
-}
+  }
+  friend accelerator_inline void vbroadcast(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in,int lane){
+    for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	vbroadcast(out._internal[i][j],in._internal[i][j],lane);
+    }}
+  }

-friend strong_inline void permute(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in,int permutetype){
-  for(int i=0;i<N;i++){
-    for(int j=0;j<N;j++){
-      permute(out._internal[i][j],in._internal[i][j],permutetype);
+  friend accelerator_inline void permute(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in,int permutetype){
+    for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	permute(out._internal[i][j],in._internal[i][j],permutetype);
    }}
-}
-friend strong_inline void rotate(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in,int rot){
-  for(int i=0;i<N;i++){
-    for(int j=0;j<N;j++){
+  }
+  friend accelerator_inline void rotate(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in,int rot){
+    for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
      rotate(out._internal[i][j],in._internal[i][j],rot);
    }}
-}
-friend strong_inline void exchange(iMatrix<vtype,N> &out1,iMatrix<vtype,N> &out2,
-				   const iMatrix<vtype,N> &in1,const iMatrix<vtype,N> &in2,int type){
-  for(int i=0;i<N;i++){
-    for(int j=0;j<N;j++){
-      exchange(out1._internal[i][j],out2._internal[i][j],
-	       in1._internal[i][j], in2._internal[i][j],type);
-    }}
-}
-
-// Unary negation
-friend strong_inline iMatrix<vtype, N> operator-(const iMatrix<vtype, N> &r) {
-  iMatrix<vtype, N> ret;
-  for (int i = 0; i < N; i++) {
-    for (int j = 0; j < N; j++) {
-      ret._internal[i][j] = -r._internal[i][j];
-    }
  }
-  return ret;
-}
-// *=,+=,-= operators inherit from corresponding "*,-,+" behaviour
-template <class T>
-strong_inline iMatrix<vtype, N> &operator*=(const T &r) {
-  *this = (*this) * r;
-  return *this;
-}
-template <class T>
-strong_inline iMatrix<vtype, N> &operator-=(const T &r) {
-  *this = (*this) - r;
-  return *this;
-}
-template <class T>
-strong_inline iMatrix<vtype, N> &operator+=(const T &r) {
-  *this = (*this) + r;
-  return *this;
-}
+  friend accelerator_inline void exchange(iMatrix<vtype,N> &out1,iMatrix<vtype,N> &out2,
+					  const iMatrix<vtype,N> &in1,const iMatrix<vtype,N> &in2,int type){
+    for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	exchange(out1._internal[i][j],out2._internal[i][j],in1._internal[i][j], in2._internal[i][j],type);
+    }}
+  }
+  
+  // Unary negation
+  friend accelerator_inline iMatrix<vtype, N> operator-(const iMatrix<vtype, N> &r) {
+    iMatrix<vtype, N> ret;
+    for (int i = 0; i < N; i++) {
+      for (int j = 0; j < N; j++) {
+	ret._internal[i][j] = -r._internal[i][j];
+    }}
+    return ret;
+  }
+  // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour
+  template <class T>
+  accelerator_inline iMatrix<vtype, N> &operator*=(const T &r) {
+    *this = (*this) * r;
+    return *this;
+  }
+  template <class T>
+  accelerator_inline iMatrix<vtype, N> &operator-=(const T &r) {
+    *this = (*this) - r;
+    return *this;
+  }
+  template <class T>
+  accelerator_inline iMatrix<vtype, N> &operator+=(const T &r) {
+    *this = (*this) + r;
+    return *this;
+  }

-// returns an lvalue reference
-strong_inline vtype &operator()(int i, int j) { return _internal[i][j]; }
-strong_inline const vtype &operator()(int i, int j) const {
-  return _internal[i][j];
-}
-friend std::ostream &operator<<(std::ostream &stream,
-				const iMatrix<vtype, N> &o) {
-  stream << "M<" << N << ">{";
-  for (int i = 0; i < N; i++) {
-    stream << "{";
-    for (int j = 0; j < N; j++) {
-      stream << o._internal[i][j];
-      if (i < N - 1) stream << ",";
+  // returns an lvalue reference
+  accelerator_inline vtype &operator()(int i, int j) { return _internal[i][j]; }
+  accelerator_inline const vtype &operator()(int i, int j) const {
+    return _internal[i][j];
+  }
+  
+  // Host function only (no accelerator markup): writes "M<N>{", each row as "{e0,e1,...}", rows separated by "\n\t\t", then "}"
+  friend std::ostream &operator<<(std::ostream &stream, const iMatrix<vtype, N> &o) {
+    stream << "M<" << N << ">{";
+    for (int i = 0; i < N; i++) {
+      stream << "{";
+      for (int j = 0; j < N; j++) {
+	stream << o._internal[i][j];
+	if (j < N - 1) stream << ",";
+      }
+      stream << "}";
+      if (i != N - 1) stream << "\n\t\t";
    }
    stream << "}";
-    if (i != N - 1) stream << "\n\t\t";
-  }
-  stream << "}";
-  return stream;
+    return stream;
+  };
+
 };

-//  strong_inline vtype && operator ()(int i,int j) {
-//    return _internal[i][j];
-//  }
-};
-
-template <class v>
+template <class v> accelerator_inline
 void vprefetch(const iScalar<v> &vv) {
  vprefetch(vv._internal);
 }
-template <class v, int N>
+template <class v, int N> accelerator_inline
 void vprefetch(const iVector<v, N> &vv) {
  for (int i = 0; i < N; i++) {
    vprefetch(vv._internal[i]);
  }
 }
-template <class v, int N>
+template <class v, int N> accelerator_inline
 void vprefetch(const iMatrix<v, N> &vv) {
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
@@ -482,6 +446,3 @@ void vprefetch(const iMatrix<v, N> &vv) {
 NAMESPACE_END(Grid);

 #endif
-
-
-