From 9f0e990b4054ee825b4eb4ab171748e5ad2f362e Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Sat, 16 May 2015 04:36:22 +0100
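Subject: [PATCH] Optimisation and syntax clean-up

Switch the Lattice assignment and arithmetic operators from inline to
strong_inline, reorder the iMatrix*iMatrix mac/mult loops so that the
summed index c3 is outermost, and rename the Lattice overload of
traceIndex to latTraceIndex.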

---
 lib/lattice/Grid_lattice_base.h  | 20 ++++++++++----------
 lib/lattice/Grid_lattice_trace.h |  2 +-
 lib/math/Grid_math_arith_mac.h   |  5 +++--
 lib/math/Grid_math_arith_mul.h   | 17 ++++++++++++-----
 4 files changed, 26 insertions(+), 18 deletions(-)
diff --git a/lib/lattice/Grid_lattice_base.h b/lib/lattice/Grid_lattice_base.h
index 64103c5d..ae606ae7 100644
--- a/lib/lattice/Grid_lattice_base.h
+++ b/lib/lattice/Grid_lattice_base.h
@@ -62,7 +62,7 @@ public:
   ////////////////////////////////////////////////////////////////////////////////
   // Expression Template closure support
   ////////////////////////////////////////////////////////////////////////////////
-  template <typename Op, typename T1>                         inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
+  template <typename Op, typename T1>                         strong_inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
   {
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<_grid->oSites();ss++){
@@ -71,7 +71,7 @@ PARALLEL_FOR_LOOP
     }
     return *this;
   }
-  template <typename Op, typename T1,typename T2>             inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
+  template <typename Op, typename T1,typename T2>             strong_inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
   {
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<_grid->oSites();ss++){
@@ -80,7 +80,7 @@ PARALLEL_FOR_LOOP
     }
     return *this;
   }
-  template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
+  template <typename Op, typename T1,typename T2,typename T3> strong_inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
   {
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<_grid->oSites();ss++){
@@ -132,14 +132,14 @@ PARALLEL_FOR_LOOP
         checkerboard=0;
     }
 
-    template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
+    template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){ // assigns r to _odata[ss] for every ss below _grid->oSites(), then returns *this
 PARALLEL_FOR_LOOP
         for(int ss=0;ss<_grid->oSites();ss++){
             this->_odata[ss]=r;
         }
         return *this;
     }
-    template<class robj> inline Lattice<vobj> & operator = (const Lattice<robj> & r){
+    template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
       conformable(*this,r);
       std::cout<<"Lattice operator ="<<std::endl;
 PARALLEL_FOR_LOOP
@@ -150,21 +150,21 @@ PARALLEL_FOR_LOOP
     }
 
     // *=,+=,-= operators inherit behvour from correspond */+/- operation
-    template<class T> inline Lattice<vobj> &operator *=(const T &r) {
+    template<class T> strong_inline Lattice<vobj> &operator *=(const T &r) { // evaluates (*this)*r and assigns the result back to *this
         *this = (*this)*r;
         return *this;
     }
 
-    template<class T> inline Lattice<vobj> &operator -=(const T &r) {
+    template<class T> strong_inline Lattice<vobj> &operator -=(const T &r) { // evaluates (*this)-r and assigns the result back to *this
         *this = (*this)-r;
         return *this;
     }
-    template<class T> inline Lattice<vobj> &operator +=(const T &r) {
+    template<class T> strong_inline Lattice<vobj> &operator +=(const T &r) { // evaluates (*this)+r and assigns the result back to *this
         *this = (*this)+r;
         return *this;
     }
     
-    inline friend Lattice<vobj> operator / (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
+    strong_inline friend Lattice<vobj> operator / (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
         conformable(lhs,rhs);
         Lattice<vobj> ret(lhs._grid);
 PARALLEL_FOR_LOOP
@@ -176,7 +176,7 @@ PARALLEL_FOR_LOOP
 
  }; // class Lattice
 
-  template<class vobj> inline std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
+  template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
     std::vector<int> gcoor;
     typedef typename vobj::scalar_object sobj;
     sobj ss;
diff --git a/lib/lattice/Grid_lattice_trace.h b/lib/lattice/Grid_lattice_trace.h
index 75cc5b87..4ce26170 100644
--- a/lib/lattice/Grid_lattice_trace.h
+++ b/lib/lattice/Grid_lattice_trace.h
@@ -26,7 +26,7 @@ PARALLEL_FOR_LOOP
     // Trace Index level dependent operation
     ////////////////////////////////////////////////////////////////////////////////////////////////////
     template<int Index,class vobj>
-    inline auto traceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))>
+    inline auto latTraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))>
     {
       Lattice<decltype(traceIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
 PARALLEL_FOR_LOOP
diff --git a/lib/math/Grid_math_arith_mac.h b/lib/math/Grid_math_arith_mac.h
index 68b0acf1..06b1661d 100644
--- a/lib/math/Grid_math_arith_mac.h
+++ b/lib/math/Grid_math_arith_mac.h
@@ -27,13 +27,14 @@ strong_inline  void mac(iScalar<rtype> * __restrict__ ret,const iScalar<vtype> *
 }
 template<class rrtype,class ltype,class rtype,int N>
 strong_inline void mac(iMatrix<rrtype,N> * __restrict__ ret,const iMatrix<ltype,N> * __restrict__ lhs,const iMatrix<rtype,N> * __restrict__ rhs){
-    for(int c2=0;c2<N;c2++){
-    for(int c1=0;c1<N;c1++){
     for(int c3=0;c3<N;c3++){
+    for(int c1=0;c1<N;c1++){ // c1 is the first index of ret and lhs; the enclosing loop runs over c3, the index shared by lhs and rhs
+    for(int c2=0;c2<N;c2++){ // c2 is the second index of ret and rhs and is the innermost loop
         mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]);
     }}}
     return;
 }
+
 template<class rrtype,class ltype,class rtype,int N>
 strong_inline void mac(iMatrix<rrtype,N> * __restrict__ ret,const iMatrix<ltype,N> * __restrict__ lhs,const iScalar<rtype> * __restrict__ rhs){
     for(int c1=0;c1<N;c1++){
diff --git a/lib/math/Grid_math_arith_mul.h b/lib/math/Grid_math_arith_mul.h
index 7b883cf4..04b05bb4 100644
--- a/lib/math/Grid_math_arith_mul.h
+++ b/lib/math/Grid_math_arith_mul.h
@@ -15,15 +15,22 @@ strong_inline void mult(iScalar<rtype> * __restrict__ ret,const iScalar<mtype> *
 
 template<class rrtype,class ltype,class rtype,int N>
 strong_inline void mult(iMatrix<rrtype,N> * __restrict__ ret,const iMatrix<ltype,N> * __restrict__ lhs,const iMatrix<rtype,N> * __restrict__ rhs){
+  for(int c1=0;c1<N;c1++){ // pass 1: every ret[c1][c2] is written with the element-level mult using the c3=0 term only
+    int c3=0; // fixed at the first term of the c3 sum for this pass
     for(int c2=0;c2<N;c2++){
+      mult(&ret->_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]);
+    }
+  }
+  for(int c3=1;c3<N;c3++){ // pass 2: terms c3=1..N-1 are applied with mac (presumably accumulating into ret - confirm against the element-level mac), c3 outermost
     for(int c1=0;c1<N;c1++){
-        mult(&ret->_internal[c1][c2],&lhs->_internal[c1][0],&rhs->_internal[0][c2]);
-        for(int c3=1;c3<N;c3++){
-            mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]);
-        }
-    }}
+      for(int c2=0;c2<N;c2++){
+	mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]);
+      }
+    }
+  }
     return;
 }
+
 template<class rrtype,class ltype,class rtype,int N>
 strong_inline void mult(iMatrix<rrtype,N> * __restrict__ ret,const iMatrix<ltype,N> * __restrict__ lhs,const iScalar<rtype> * __restrict__ rhs){
     for(int c2=0;c2<N;c2++){