Select at compile time whether we do the streaming store copy. This relies on Clang++ eliminating object copies,

and other compilers do not necessarily cope.
2026-03-15 16:56:09 +00:00 · 2015-05-21 06:39:00 +01:00
parent ac0941be9a
commit 1b9ecbac3b
1 changed files with 50 additions and 11 deletions
--- a/lib/lattice/Grid_lattice_arith.h
+++ b/lib/lattice/Grid_lattice_arith.h
@@ -12,10 +12,13 @@ namespace Grid {
    conformable(lhs,rhs);
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      mult(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
      vstream(ret._odata[ss],tmp);
-      //      mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
+#else
      mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
 #endif
    }
  }
@@ -24,9 +27,13 @@ PARALLEL_FOR_LOOP
    conformable(lhs,rhs);
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
      vstream(ret._odata[ss],tmp);
 #else
      mac(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
 #endif
    }
  }
@@ -35,9 +42,13 @@ PARALLEL_FOR_LOOP
    conformable(lhs,rhs);
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      sub(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
      vstream(ret._odata[ss],tmp);
 #else
      sub(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
 #endif
    }
  }
  template<class obj1,class obj2,class obj3> strong_inline
@@ -45,9 +56,13 @@ PARALLEL_FOR_LOOP
    conformable(lhs,rhs);
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      add(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
      vstream(ret._odata[ss],tmp);
 #else
      add(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
 #endif
    }
  }
@@ -81,9 +96,13 @@ PARALLEL_FOR_LOOP
    conformable(lhs,ret);
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      sub(&tmp,&lhs._odata[ss],&rhs);
      vstream(ret._odata[ss],tmp);
 #else 
      sub(&ret._odata[ss],&lhs._odata[ss],&rhs);
 #endif
    }
  }
  template<class obj1,class obj2,class obj3> strong_inline
@@ -91,9 +110,13 @@ PARALLEL_FOR_LOOP
    conformable(lhs,ret);
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      add(&tmp,&lhs._odata[ss],&rhs);
      vstream(ret._odata[ss],tmp);
 #else 
      add(&ret._odata[ss],&lhs._odata[ss],&rhs);
 #endif
    }
  }
@@ -105,9 +128,13 @@ PARALLEL_FOR_LOOP
    conformable(ret,rhs);
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<rhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      mult(&tmp,&lhs,&rhs._odata[ss]);
      vstream(ret._odata[ss],tmp);
 #else 
      mult(&ret._odata[ss],&lhs,&rhs._odata[ss]);
 #endif
    }
  }
@@ -116,9 +143,13 @@ PARALLEL_FOR_LOOP
    conformable(ret,rhs);
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<rhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      mac(&tmp,&lhs,&rhs._odata[ss]);
      vstream(ret._odata[ss],tmp);
 #else 
      mac(&ret._odata[ss],&lhs,&rhs._odata[ss]);
 #endif
    }
  }
@@ -127,9 +158,13 @@ PARALLEL_FOR_LOOP
    conformable(ret,rhs);
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<rhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      sub(&tmp,&lhs,&rhs._odata[ss]);
      vstream(ret._odata[ss],tmp);
 #else 
      sub(&ret._odata[ss],&lhs,&rhs._odata[ss]);
 #endif
    }
  }
  template<class obj1,class obj2,class obj3> strong_inline
@@ -137,9 +172,13 @@ PARALLEL_FOR_LOOP
    conformable(ret,rhs);
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<rhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      add(&tmp,&lhs,&rhs._odata[ss]);
      vstream(ret._odata[ss],tmp);
 #else 
      add(&ret._odata[ss],&lhs,&rhs._odata[ss]);
 #endif
    }
  }
@@ -148,8 +187,12 @@ PARALLEL_FOR_LOOP
    conformable(x,y);
 #pragma omp parallel for
    for(int ss=0;ss<x._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      vobj tmp = a*x._odata[ss]+y._odata[ss];
      vstream(ret._odata[ss],tmp);
 #else
      ret._odata[ss]=a*x._odata[ss]+y._odata[ss];
 #endif
    }
  }
  template<class sobj,class vobj> strong_inline
@@ -157,29 +200,25 @@ PARALLEL_FOR_LOOP
    conformable(x,y);
 #pragma omp parallel for
    for(int ss=0;ss<x._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      vobj tmp = a*x._odata[ss]+b*y._odata[ss];
      vstream(ret._odata[ss],tmp);
 #else
      ret._odata[ss]=a*x._odata[ss]+b*y._odata[ss];
 #endif
    }
  }
  template<class sobj,class vobj> strong_inline
  RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
    conformable(x,y);
-#pragma omp parallel for
+    axpy(ret,a,x,y);
    for(int ss=0;ss<x._grid->oSites();ss++){
      vobj tmp = a*x._odata[ss]+y._odata[ss];
      vstream(ret._odata[ss],tmp);
    }
    return norm2(ret);
  }
  template<class sobj,class vobj> strong_inline
  RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
    conformable(x,y);
-#pragma omp parallel for
+    axpby(ret,a,b,x,y);
    for(int ss=0;ss<x._grid->oSites();ss++){
      vobj tmp = a*x._odata[ss]+b*y._odata[ss];
      vstream(ret._odata[ss],tmp);
    }
    return norm2(ret); // FIXME implement parallel norm in ss loop
  }