From 1b9ecbac3bc0bb6df2400a4e2f79f17d34b42e45 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 21 May 2015 06:39:00 +0100 Subject: [PATCH] Compile time select if we do the streaming store copy. Relies on Clang++ eliminating object copies, and other compilers do not necessarily cope. --- lib/lattice/Grid_lattice_arith.h | 61 ++++++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/lib/lattice/Grid_lattice_arith.h b/lib/lattice/Grid_lattice_arith.h index f1e566a2..ff966578 100644 --- a/lib/lattice/Grid_lattice_arith.h +++ b/lib/lattice/Grid_lattice_arith.h @@ -12,10 +12,13 @@ namespace Grid { conformable(lhs,rhs); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ +#ifdef STREAMING_STORES obj1 tmp; mult(&tmp,&lhs._odata[ss],&rhs._odata[ss]); vstream(ret._odata[ss],tmp); - // mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]); +#else + mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]); +#endif } } @@ -24,9 +27,13 @@ PARALLEL_FOR_LOOP conformable(lhs,rhs); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ +#ifdef STREAMING_STORES obj1 tmp; mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]); vstream(ret._odata[ss],tmp); +#else + mac(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]); +#endif } } @@ -35,9 +42,13 @@ PARALLEL_FOR_LOOP conformable(lhs,rhs); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ +#ifdef STREAMING_STORES obj1 tmp; sub(&tmp,&lhs._odata[ss],&rhs._odata[ss]); vstream(ret._odata[ss],tmp); +#else + sub(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]); +#endif } } template strong_inline @@ -45,9 +56,13 @@ PARALLEL_FOR_LOOP conformable(lhs,rhs); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ +#ifdef STREAMING_STORES obj1 tmp; add(&tmp,&lhs._odata[ss],&rhs._odata[ss]); vstream(ret._odata[ss],tmp); +#else + add(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]); +#endif } } @@ -81,9 +96,13 @@ PARALLEL_FOR_LOOP conformable(lhs,ret); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ +#ifdef STREAMING_STORES obj1 tmp; 
sub(&tmp,&lhs._odata[ss],&rhs); vstream(ret._odata[ss],tmp); +#else + sub(&ret._odata[ss],&lhs._odata[ss],&rhs); +#endif } } template strong_inline @@ -91,9 +110,13 @@ PARALLEL_FOR_LOOP conformable(lhs,ret); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ +#ifdef STREAMING_STORES obj1 tmp; add(&tmp,&lhs._odata[ss],&rhs); vstream(ret._odata[ss],tmp); +#else + add(&ret._odata[ss],&lhs._odata[ss],&rhs); +#endif } } @@ -105,9 +128,13 @@ PARALLEL_FOR_LOOP conformable(ret,rhs); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ +#ifdef STREAMING_STORES obj1 tmp; mult(&tmp,&lhs,&rhs._odata[ss]); vstream(ret._odata[ss],tmp); +#else + mult(&ret._odata[ss],&lhs,&rhs._odata[ss]); +#endif } } @@ -116,9 +143,13 @@ PARALLEL_FOR_LOOP conformable(ret,rhs); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ +#ifdef STREAMING_STORES obj1 tmp; mac(&tmp,&lhs,&rhs._odata[ss]); vstream(ret._odata[ss],tmp); +#else + mac(&ret._odata[ss],&lhs,&rhs._odata[ss]); +#endif } } @@ -127,9 +158,13 @@ PARALLEL_FOR_LOOP conformable(ret,rhs); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ +#ifdef STREAMING_STORES obj1 tmp; sub(&tmp,&lhs,&rhs._odata[ss]); vstream(ret._odata[ss],tmp); +#else + sub(&ret._odata[ss],&lhs,&rhs._odata[ss]); +#endif } } template strong_inline @@ -137,9 +172,13 @@ PARALLEL_FOR_LOOP conformable(ret,rhs); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ +#ifdef STREAMING_STORES obj1 tmp; add(&tmp,&lhs,&rhs._odata[ss]); vstream(ret._odata[ss],tmp); +#else + add(&ret._odata[ss],&lhs,&rhs._odata[ss]); +#endif } } @@ -148,8 +187,12 @@ PARALLEL_FOR_LOOP conformable(x,y); #pragma omp parallel for for(int ss=0;ssoSites();ss++){ +#ifdef STREAMING_STORES vobj tmp = a*x._odata[ss]+y._odata[ss]; vstream(ret._odata[ss],tmp); +#else + ret._odata[ss]=a*x._odata[ss]+y._odata[ss]; +#endif } } template strong_inline @@ -157,29 +200,25 @@ PARALLEL_FOR_LOOP conformable(x,y); #pragma omp parallel for for(int ss=0;ssoSites();ss++){ +#ifdef STREAMING_STORES vobj tmp = a*x._odata[ss]+b*y._odata[ss]; 
vstream(ret._odata[ss],tmp); +#else + ret._odata[ss]=a*x._odata[ss]+b*y._odata[ss]; +#endif } } template strong_inline RealD axpy_norm(Lattice &ret,sobj a,const Lattice &x,const Lattice &y){ conformable(x,y); -#pragma omp parallel for - for(int ss=0;ssoSites();ss++){ - vobj tmp = a*x._odata[ss]+y._odata[ss]; - vstream(ret._odata[ss],tmp); - } + axpy(ret,a,x,y); return norm2(ret); } template strong_inline RealD axpby_norm(Lattice &ret,sobj a,sobj b,const Lattice &x,const Lattice &y){ conformable(x,y); -#pragma omp parallel for - for(int ss=0;ssoSites();ss++){ - vobj tmp = a*x._odata[ss]+b*y._odata[ss]; - vstream(ret._odata[ss],tmp); - } + axpby(ret,a,b,x,y); return norm2(ret); // FIXME implement parallel norm in ss loop }