From 1b9ecbac3bc0bb6df2400a4e2f79f17d34b42e45 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 21 May 2015 06:39:00 +0100
Subject: [PATCH] Compile-time select whether we do the streaming store copy.
 Relies on Clang++ eliminating object copies, and other compilers do not
 necessarily cope.

---
 lib/lattice/Grid_lattice_arith.h | 61 ++++++++++++++++++++++++++------
 1 file changed, 50 insertions(+), 11 deletions(-)
diff --git a/lib/lattice/Grid_lattice_arith.h b/lib/lattice/Grid_lattice_arith.h
index f1e566a2..ff966578 100644
--- a/lib/lattice/Grid_lattice_arith.h
+++ b/lib/lattice/Grid_lattice_arith.h
@@ -12,10 +12,13 @@ namespace Grid {
     conformable(lhs,rhs);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<lhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
       obj1 tmp;
       mult(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
       vstream(ret._odata[ss],tmp);
-      //      mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
+#else
+      mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
+#endif
     }
   }
   
@@ -24,9 +27,13 @@ PARALLEL_FOR_LOOP
     conformable(lhs,rhs);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<lhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
       obj1 tmp;
       mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
       vstream(ret._odata[ss],tmp);
+#else
+      mac(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
+#endif
     }
   }
   
@@ -35,9 +42,13 @@ PARALLEL_FOR_LOOP
     conformable(lhs,rhs);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<lhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
       obj1 tmp;
       sub(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
       vstream(ret._odata[ss],tmp);
+#else
+      sub(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
+#endif
     }
   }
   template<class obj1,class obj2,class obj3> strong_inline
@@ -45,9 +56,13 @@ PARALLEL_FOR_LOOP
     conformable(lhs,rhs);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<lhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
       obj1 tmp;
       add(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
       vstream(ret._odata[ss],tmp);
+#else
+      add(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
+#endif
     }
   }
   
@@ -81,9 +96,13 @@ PARALLEL_FOR_LOOP
     conformable(lhs,ret);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<lhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
       obj1 tmp;
       sub(&tmp,&lhs._odata[ss],&rhs);
       vstream(ret._odata[ss],tmp);
+#else 
+      sub(&ret._odata[ss],&lhs._odata[ss],&rhs);
+#endif
     }
   }
   template<class obj1,class obj2,class obj3> strong_inline
@@ -91,9 +110,13 @@ PARALLEL_FOR_LOOP
     conformable(lhs,ret);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<lhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
       obj1 tmp;
       add(&tmp,&lhs._odata[ss],&rhs);
       vstream(ret._odata[ss],tmp);
+#else 
+      add(&ret._odata[ss],&lhs._odata[ss],&rhs);
+#endif
     }
   }
 
@@ -105,9 +128,13 @@ PARALLEL_FOR_LOOP
     conformable(ret,rhs);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<rhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
       obj1 tmp;
       mult(&tmp,&lhs,&rhs._odata[ss]);
       vstream(ret._odata[ss],tmp);
+#else 
+      mult(&ret._odata[ss],&lhs,&rhs._odata[ss]);
+#endif
     }
   }
   
@@ -116,9 +143,13 @@ PARALLEL_FOR_LOOP
     conformable(ret,rhs);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<rhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
       obj1 tmp;
       mac(&tmp,&lhs,&rhs._odata[ss]);
       vstream(ret._odata[ss],tmp);
+#else 
+      mac(&ret._odata[ss],&lhs,&rhs._odata[ss]);
+#endif
     }
   }
   
@@ -127,9 +158,13 @@ PARALLEL_FOR_LOOP
     conformable(ret,rhs);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<rhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
       obj1 tmp;
       sub(&tmp,&lhs,&rhs._odata[ss]);
       vstream(ret._odata[ss],tmp);
+#else 
+      sub(&ret._odata[ss],&lhs,&rhs._odata[ss]);
+#endif
     }
   }
   template<class obj1,class obj2,class obj3> strong_inline
@@ -137,9 +172,13 @@ PARALLEL_FOR_LOOP
     conformable(ret,rhs);
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<rhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
       obj1 tmp;
       add(&tmp,&lhs,&rhs._odata[ss]);
       vstream(ret._odata[ss],tmp);
+#else 
+      add(&ret._odata[ss],&lhs,&rhs._odata[ss]);
+#endif
     }
   }
   
@@ -148,8 +187,12 @@ PARALLEL_FOR_LOOP
     conformable(x,y);
 #pragma omp parallel for
     for(int ss=0;ss<x._grid->oSites();ss++){
+#ifdef STREAMING_STORES
       vobj tmp = a*x._odata[ss]+y._odata[ss];
       vstream(ret._odata[ss],tmp);
+#else
+      ret._odata[ss]=a*x._odata[ss]+y._odata[ss];
+#endif
     }
   }
   template<class sobj,class vobj> strong_inline
@@ -157,29 +200,25 @@ PARALLEL_FOR_LOOP
     conformable(x,y);
 #pragma omp parallel for
     for(int ss=0;ss<x._grid->oSites();ss++){
+#ifdef STREAMING_STORES
       vobj tmp = a*x._odata[ss]+b*y._odata[ss];
       vstream(ret._odata[ss],tmp);
+#else
+      ret._odata[ss]=a*x._odata[ss]+b*y._odata[ss];
+#endif
     }
   }
 
   template<class sobj,class vobj> strong_inline
   RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
     conformable(x,y);
-#pragma omp parallel for
-    for(int ss=0;ss<x._grid->oSites();ss++){
-      vobj tmp = a*x._odata[ss]+y._odata[ss];
-      vstream(ret._odata[ss],tmp);
-    }
+    axpy(ret,a,x,y);
     return norm2(ret);
   }
   template<class sobj,class vobj> strong_inline
   RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
     conformable(x,y);
-#pragma omp parallel for
-    for(int ss=0;ss<x._grid->oSites();ss++){
-      vobj tmp = a*x._odata[ss]+b*y._odata[ss];
-      vstream(ret._odata[ss],tmp);
-    }
+    axpby(ret,a,b,x,y);
     return norm2(ret); // FIXME implement parallel norm in ss loop
   }