Optimisation and syntax pretty

2025-08-02 20:57:06 +01:00 · 2015-05-16 04:36:22 +01:00
parent 56667e9d32
commit 9f0e990b40
4 changed files with 26 additions and 18 deletions
--- a/lib/lattice/Grid_lattice_base.h
+++ b/lib/lattice/Grid_lattice_base.h
@@ -62,7 +62,7 @@ public:
  ////////////////////////////////////////////////////////////////////////////////
  // Expression Template closure support
  ////////////////////////////////////////////////////////////////////////////////
-  template <typename Op, typename T1>                         inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
+  template <typename Op, typename T1>                         strong_inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
  {
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<_grid->oSites();ss++){
@@ -71,7 +71,7 @@ PARALLEL_FOR_LOOP
    }
    return *this;
  }
-  template <typename Op, typename T1,typename T2>             inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
+  template <typename Op, typename T1,typename T2>             strong_inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
  {
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<_grid->oSites();ss++){
@@ -80,7 +80,7 @@ PARALLEL_FOR_LOOP
    }
    return *this;
  }
-  template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
+  template <typename Op, typename T1,typename T2,typename T3> strong_inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
  {
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<_grid->oSites();ss++){
@@ -132,14 +132,14 @@ PARALLEL_FOR_LOOP
        checkerboard=0;
    }

-    template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
+    template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
 PARALLEL_FOR_LOOP
        for(int ss=0;ss<_grid->oSites();ss++){
            this->_odata[ss]=r;
        }
        return *this;
    }
-    template<class robj> inline Lattice<vobj> & operator = (const Lattice<robj> & r){
+    template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
      conformable(*this,r);
      std::cout<<"Lattice operator ="<<std::endl;
 PARALLEL_FOR_LOOP
@@ -150,21 +150,21 @@ PARALLEL_FOR_LOOP
    }

    // *=,+=,-= operators inherit behaviour from the corresponding */+/- operations
-    template<class T> inline Lattice<vobj> &operator *=(const T &r) {
+    template<class T> strong_inline Lattice<vobj> &operator *=(const T &r) {
        *this = (*this)*r;
        return *this;
    }

-    template<class T> inline Lattice<vobj> &operator -=(const T &r) {
+    template<class T> strong_inline Lattice<vobj> &operator -=(const T &r) {
        *this = (*this)-r;
        return *this;
    }
-    template<class T> inline Lattice<vobj> &operator +=(const T &r) {
+    template<class T> strong_inline Lattice<vobj> &operator +=(const T &r) {
        *this = (*this)+r;
        return *this;
    }
    
-    inline friend Lattice<vobj> operator / (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
+    strong_inline friend Lattice<vobj> operator / (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
        conformable(lhs,rhs);
        Lattice<vobj> ret(lhs._grid);
 PARALLEL_FOR_LOOP
@@ -176,7 +176,7 @@ PARALLEL_FOR_LOOP

 }; // class Lattice

-  template<class vobj> inline std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
+  template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
    std::vector<int> gcoor;
    typedef typename vobj::scalar_object sobj;
    sobj ss;
--- a/lib/lattice/Grid_lattice_trace.h
+++ b/lib/lattice/Grid_lattice_trace.h
@@ -26,7 +26,7 @@ PARALLEL_FOR_LOOP
    // Trace Index level dependent operation
    ////////////////////////////////////////////////////////////////////////////////////////////////////
    template<int Index,class vobj>
-    inline auto traceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))>
+    inline auto latTraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))>
    {
      Lattice<decltype(traceIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
 PARALLEL_FOR_LOOP
--- a/lib/math/Grid_math_arith_mac.h
+++ b/lib/math/Grid_math_arith_mac.h
@@ -27,13 +27,14 @@ strong_inline  void mac(iScalar<rtype> * __restrict__ ret,const iScalar<vtype> *
 }
 template<class rrtype,class ltype,class rtype,int N>
 strong_inline void mac(iMatrix<rrtype,N> * __restrict__ ret,const iMatrix<ltype,N> * __restrict__ lhs,const iMatrix<rtype,N> * __restrict__ rhs){
-    for(int c2=0;c2<N;c2++){
-    for(int c1=0;c1<N;c1++){
    for(int c3=0;c3<N;c3++){
+    for(int c1=0;c1<N;c1++){
+    for(int c2=0;c2<N;c2++){
        mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]);
    }}}
    return;
 }
+
 template<class rrtype,class ltype,class rtype,int N>
 strong_inline void mac(iMatrix<rrtype,N> * __restrict__ ret,const iMatrix<ltype,N> * __restrict__ lhs,const iScalar<rtype> * __restrict__ rhs){
    for(int c1=0;c1<N;c1++){
--- a/lib/math/Grid_math_arith_mul.h
+++ b/lib/math/Grid_math_arith_mul.h
@@ -15,15 +15,22 @@ strong_inline void mult(iScalar<rtype> * __restrict__ ret,const iScalar<mtype> *

 template<class rrtype,class ltype,class rtype,int N>
 strong_inline void mult(iMatrix<rrtype,N> * __restrict__ ret,const iMatrix<ltype,N> * __restrict__ lhs,const iMatrix<rtype,N> * __restrict__ rhs){
-    for(int c2=0;c2<N;c2++){
  for(int c1=0;c1<N;c1++){
-        mult(&ret->_internal[c1][c2],&lhs->_internal[c1][0],&rhs->_internal[0][c2]);
+    int c3=0;
+    for(int c2=0;c2<N;c2++){
+      mult(&ret->_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]);
+    }
+  }
  for(int c3=1;c3<N;c3++){
+    for(int c1=0;c1<N;c1++){
+      for(int c2=0;c2<N;c2++){
 	mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]);
      }
-    }}
+    }
+  }
    return;
 }
+
 template<class rrtype,class ltype,class rtype,int N>
 strong_inline void mult(iMatrix<rrtype,N> * __restrict__ ret,const iMatrix<ltype,N> * __restrict__ lhs,const iScalar<rtype> * __restrict__ rhs){
    for(int c2=0;c2<N;c2++){