1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 15:55:37 +00:00

Compile-time selection of whether we do the streaming store copy. Relies on Clang++ eliminating object copies,

and other compilers do not necessarily cope.
This commit is contained in:
Peter Boyle 2015-05-21 06:39:00 +01:00
parent ac0941be9a
commit 1b9ecbac3b

View File

@ -12,10 +12,13 @@ namespace Grid {
conformable(lhs,rhs); conformable(lhs,rhs);
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int ss=0;ss<lhs._grid->oSites();ss++){ for(int ss=0;ss<lhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp; obj1 tmp;
mult(&tmp,&lhs._odata[ss],&rhs._odata[ss]); mult(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
vstream(ret._odata[ss],tmp); vstream(ret._odata[ss],tmp);
// mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]); #else
mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
#endif
} }
} }
@ -24,9 +27,13 @@ PARALLEL_FOR_LOOP
conformable(lhs,rhs); conformable(lhs,rhs);
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int ss=0;ss<lhs._grid->oSites();ss++){ for(int ss=0;ss<lhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp; obj1 tmp;
mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]); mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
vstream(ret._odata[ss],tmp); vstream(ret._odata[ss],tmp);
#else
mac(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
#endif
} }
} }
@ -35,9 +42,13 @@ PARALLEL_FOR_LOOP
conformable(lhs,rhs); conformable(lhs,rhs);
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int ss=0;ss<lhs._grid->oSites();ss++){ for(int ss=0;ss<lhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp; obj1 tmp;
sub(&tmp,&lhs._odata[ss],&rhs._odata[ss]); sub(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
vstream(ret._odata[ss],tmp); vstream(ret._odata[ss],tmp);
#else
sub(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
#endif
} }
} }
template<class obj1,class obj2,class obj3> strong_inline template<class obj1,class obj2,class obj3> strong_inline
@ -45,9 +56,13 @@ PARALLEL_FOR_LOOP
conformable(lhs,rhs); conformable(lhs,rhs);
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int ss=0;ss<lhs._grid->oSites();ss++){ for(int ss=0;ss<lhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp; obj1 tmp;
add(&tmp,&lhs._odata[ss],&rhs._odata[ss]); add(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
vstream(ret._odata[ss],tmp); vstream(ret._odata[ss],tmp);
#else
add(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
#endif
} }
} }
@ -81,9 +96,13 @@ PARALLEL_FOR_LOOP
conformable(lhs,ret); conformable(lhs,ret);
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int ss=0;ss<lhs._grid->oSites();ss++){ for(int ss=0;ss<lhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp; obj1 tmp;
sub(&tmp,&lhs._odata[ss],&rhs); sub(&tmp,&lhs._odata[ss],&rhs);
vstream(ret._odata[ss],tmp); vstream(ret._odata[ss],tmp);
#else
sub(&ret._odata[ss],&lhs._odata[ss],&rhs);
#endif
} }
} }
template<class obj1,class obj2,class obj3> strong_inline template<class obj1,class obj2,class obj3> strong_inline
@ -91,9 +110,13 @@ PARALLEL_FOR_LOOP
conformable(lhs,ret); conformable(lhs,ret);
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int ss=0;ss<lhs._grid->oSites();ss++){ for(int ss=0;ss<lhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp; obj1 tmp;
add(&tmp,&lhs._odata[ss],&rhs); add(&tmp,&lhs._odata[ss],&rhs);
vstream(ret._odata[ss],tmp); vstream(ret._odata[ss],tmp);
#else
add(&ret._odata[ss],&lhs._odata[ss],&rhs);
#endif
} }
} }
@ -105,9 +128,13 @@ PARALLEL_FOR_LOOP
conformable(ret,rhs); conformable(ret,rhs);
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int ss=0;ss<rhs._grid->oSites();ss++){ for(int ss=0;ss<rhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp; obj1 tmp;
mult(&tmp,&lhs,&rhs._odata[ss]); mult(&tmp,&lhs,&rhs._odata[ss]);
vstream(ret._odata[ss],tmp); vstream(ret._odata[ss],tmp);
#else
mult(&ret._odata[ss],&lhs,&rhs._odata[ss]);
#endif
} }
} }
@ -116,9 +143,13 @@ PARALLEL_FOR_LOOP
conformable(ret,rhs); conformable(ret,rhs);
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int ss=0;ss<rhs._grid->oSites();ss++){ for(int ss=0;ss<rhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp; obj1 tmp;
mac(&tmp,&lhs,&rhs._odata[ss]); mac(&tmp,&lhs,&rhs._odata[ss]);
vstream(ret._odata[ss],tmp); vstream(ret._odata[ss],tmp);
#else
mac(&ret._odata[ss],&lhs,&rhs._odata[ss]);
#endif
} }
} }
@ -127,9 +158,13 @@ PARALLEL_FOR_LOOP
conformable(ret,rhs); conformable(ret,rhs);
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int ss=0;ss<rhs._grid->oSites();ss++){ for(int ss=0;ss<rhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp; obj1 tmp;
sub(&tmp,&lhs,&rhs._odata[ss]); sub(&tmp,&lhs,&rhs._odata[ss]);
vstream(ret._odata[ss],tmp); vstream(ret._odata[ss],tmp);
#else
sub(&ret._odata[ss],&lhs,&rhs._odata[ss]);
#endif
} }
} }
template<class obj1,class obj2,class obj3> strong_inline template<class obj1,class obj2,class obj3> strong_inline
@ -137,9 +172,13 @@ PARALLEL_FOR_LOOP
conformable(ret,rhs); conformable(ret,rhs);
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int ss=0;ss<rhs._grid->oSites();ss++){ for(int ss=0;ss<rhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp; obj1 tmp;
add(&tmp,&lhs,&rhs._odata[ss]); add(&tmp,&lhs,&rhs._odata[ss]);
vstream(ret._odata[ss],tmp); vstream(ret._odata[ss],tmp);
#else
add(&ret._odata[ss],&lhs,&rhs._odata[ss]);
#endif
} }
} }
@ -148,8 +187,12 @@ PARALLEL_FOR_LOOP
conformable(x,y); conformable(x,y);
#pragma omp parallel for #pragma omp parallel for
for(int ss=0;ss<x._grid->oSites();ss++){ for(int ss=0;ss<x._grid->oSites();ss++){
#ifdef STREAMING_STORES
vobj tmp = a*x._odata[ss]+y._odata[ss]; vobj tmp = a*x._odata[ss]+y._odata[ss];
vstream(ret._odata[ss],tmp); vstream(ret._odata[ss],tmp);
#else
ret._odata[ss]=a*x._odata[ss]+y._odata[ss];
#endif
} }
} }
template<class sobj,class vobj> strong_inline template<class sobj,class vobj> strong_inline
@ -157,29 +200,25 @@ PARALLEL_FOR_LOOP
conformable(x,y); conformable(x,y);
#pragma omp parallel for #pragma omp parallel for
for(int ss=0;ss<x._grid->oSites();ss++){ for(int ss=0;ss<x._grid->oSites();ss++){
#ifdef STREAMING_STORES
vobj tmp = a*x._odata[ss]+b*y._odata[ss]; vobj tmp = a*x._odata[ss]+b*y._odata[ss];
vstream(ret._odata[ss],tmp); vstream(ret._odata[ss],tmp);
#else
ret._odata[ss]=a*x._odata[ss]+b*y._odata[ss];
#endif
} }
} }
template<class sobj,class vobj> strong_inline template<class sobj,class vobj> strong_inline
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){ RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
conformable(x,y); conformable(x,y);
#pragma omp parallel for axpy(ret,a,x,y);
for(int ss=0;ss<x._grid->oSites();ss++){
vobj tmp = a*x._odata[ss]+y._odata[ss];
vstream(ret._odata[ss],tmp);
}
return norm2(ret); return norm2(ret);
} }
template<class sobj,class vobj> strong_inline template<class sobj,class vobj> strong_inline
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){ RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
conformable(x,y); conformable(x,y);
#pragma omp parallel for axpby(ret,a,b,x,y);
for(int ss=0;ss<x._grid->oSites();ss++){
vobj tmp = a*x._odata[ss]+b*y._odata[ss];
vstream(ret._odata[ss],tmp);
}
return norm2(ret); // FIXME implement parallel norm in ss loop return norm2(ret); // FIXME implement parallel norm in ss loop
} }