mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-13 01:05:36 +00:00
Accelerated coalesced loops in most cases
This commit is contained in:
parent
68541606ab
commit
7b8ccff4f4
@ -41,17 +41,13 @@ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
|||||||
auto rhs_v = rhs.View();
|
auto rhs_v = rhs.View();
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
conformable(lhs,rhs);
|
conformable(lhs,rhs);
|
||||||
#ifdef STREAMING_STORES
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
accelerator_loop(ss,lhs_v,{
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
obj1 tmp;
|
auto lhs_t = lhs_v(ss);
|
||||||
mult(&tmp,&lhs_v[ss],&rhs_v[ss]);
|
auto rhs_t = rhs_v(ss);
|
||||||
vstream(ret_v[ss],tmp);
|
mult(&tmp,&lhs_t,&rhs_t);
|
||||||
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
});
|
});
|
||||||
#else
|
|
||||||
accelerator_loop(ss,lhs_v,{
|
|
||||||
mult(&ret_v[ss],&lhs_v[ss],&rhs_v[ss]);
|
|
||||||
});
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class obj1,class obj2,class obj3> inline
|
template<class obj1,class obj2,class obj3> inline
|
||||||
@ -62,17 +58,13 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
|||||||
auto ret_v = ret.View();
|
auto ret_v = ret.View();
|
||||||
auto lhs_v = lhs.View();
|
auto lhs_v = lhs.View();
|
||||||
auto rhs_v = rhs.View();
|
auto rhs_v = rhs.View();
|
||||||
#ifdef STREAMING_STORES
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
accelerator_loop(ss,lhs_v,{
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
obj1 tmp;
|
auto lhs_t=lhs_v(ss);
|
||||||
mac(&tmp,&lhs_v[ss],&rhs_v[ss]);
|
auto rhs_t=rhs_v(ss);
|
||||||
vstream(ret_v[ss],tmp);
|
mac(&tmp,&lhs_t,&rhs_t);
|
||||||
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
});
|
});
|
||||||
#else
|
|
||||||
accelerator_loop(ss,lhs_v,{
|
|
||||||
mac(&ret_v[ss],&lhs_v[ss],&rhs_v[ss]);
|
|
||||||
});
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class obj1,class obj2,class obj3> inline
|
template<class obj1,class obj2,class obj3> inline
|
||||||
@ -83,17 +75,13 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
|||||||
auto ret_v = ret.View();
|
auto ret_v = ret.View();
|
||||||
auto lhs_v = lhs.View();
|
auto lhs_v = lhs.View();
|
||||||
auto rhs_v = rhs.View();
|
auto rhs_v = rhs.View();
|
||||||
#ifdef STREAMING_STORES
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
accelerator_loop(ss,lhs_v,{
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
obj1 tmp;
|
auto lhs_t=lhs_v(ss);
|
||||||
sub(&tmp,&lhs_v[ss],&rhs_v[ss]);
|
auto rhs_t=rhs_v(ss);
|
||||||
vstream(ret_v[ss],tmp);
|
sub(&tmp,&lhs_t,&rhs_t);
|
||||||
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
});
|
});
|
||||||
#else
|
|
||||||
accelerator_loop(ss,lhs_v,{
|
|
||||||
sub(&ret[ss],&lhs_v[ss],&rhs_v[ss]);
|
|
||||||
});
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
template<class obj1,class obj2,class obj3> inline
|
template<class obj1,class obj2,class obj3> inline
|
||||||
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
||||||
@ -103,17 +91,13 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
|||||||
auto ret_v = ret.View();
|
auto ret_v = ret.View();
|
||||||
auto lhs_v = lhs.View();
|
auto lhs_v = lhs.View();
|
||||||
auto rhs_v = rhs.View();
|
auto rhs_v = rhs.View();
|
||||||
#ifdef STREAMING_STORES
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
accelerator_loop(ss,lhs_v,{
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
obj1 tmp;
|
auto lhs_t=lhs_v(ss);
|
||||||
add(&tmp,&lhs_v[ss],&rhs_v[ss]);
|
auto rhs_t=rhs_v(ss);
|
||||||
vstream(ret_v[ss],tmp);
|
add(&tmp,&lhs_t,&rhs_t);
|
||||||
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
});
|
});
|
||||||
#else
|
|
||||||
accelerator_loop(ss,lhs_v,{
|
|
||||||
add(&ret_v[ss],&lhs_v[ss],&rhs_v[ss]);
|
|
||||||
});
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
@ -125,10 +109,10 @@ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
|||||||
conformable(lhs,ret);
|
conformable(lhs,ret);
|
||||||
auto ret_v = ret.View();
|
auto ret_v = ret.View();
|
||||||
auto lhs_v = lhs.View();
|
auto lhs_v = lhs.View();
|
||||||
accelerator_loop(ss,lhs_v,{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
obj1 tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
mult(&tmp,&lhs_v[ss],&rhs);
|
mult(&tmp,&lhs_v(ss),&rhs);
|
||||||
vstream(ret_v[ss],tmp);
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -138,10 +122,11 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
|||||||
conformable(ret,lhs);
|
conformable(ret,lhs);
|
||||||
auto ret_v = ret.View();
|
auto ret_v = ret.View();
|
||||||
auto lhs_v = lhs.View();
|
auto lhs_v = lhs.View();
|
||||||
accelerator_loop(ss,lhs_v,{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
obj1 tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
mac(&tmp,&lhs_v[ss],&rhs);
|
auto lhs_t=lhs_v(ss);
|
||||||
vstream(ret_v[ss],tmp);
|
mac(&tmp,&lhs_t,&rhs);
|
||||||
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -151,17 +136,12 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
|||||||
conformable(ret,lhs);
|
conformable(ret,lhs);
|
||||||
auto ret_v = ret.View();
|
auto ret_v = ret.View();
|
||||||
auto lhs_v = lhs.View();
|
auto lhs_v = lhs.View();
|
||||||
#ifdef STREAMING_STORES
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
accelerator_loop(ss,lhs_v,{
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
obj1 tmp;
|
auto lhs_t=lhs_v(ss);
|
||||||
sub(&tmp,&lhs_v[ss],&rhs);
|
sub(&tmp,&lhs_t,&rhs);
|
||||||
vstream(ret_v[ss],tmp);
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
});
|
});
|
||||||
#else
|
|
||||||
accelerator_loop(ss,lhs_v,{
|
|
||||||
sub(&ret_v[ss],&lhs_v[ss],&rhs);
|
|
||||||
});
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
template<class obj1,class obj2,class obj3> inline
|
template<class obj1,class obj2,class obj3> inline
|
||||||
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
||||||
@ -169,17 +149,12 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
|||||||
conformable(lhs,ret);
|
conformable(lhs,ret);
|
||||||
auto ret_v = ret.View();
|
auto ret_v = ret.View();
|
||||||
auto lhs_v = lhs.View();
|
auto lhs_v = lhs.View();
|
||||||
#ifdef STREAMING_STORES
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
accelerator_loop(ss,lhs_v,{
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
obj1 tmp;
|
auto lhs_t=lhs_v(ss);
|
||||||
add(&tmp,&lhs_v[ss],&rhs);
|
add(&tmp,&lhs_t,&rhs);
|
||||||
vstream(ret_v[ss],tmp);
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
});
|
});
|
||||||
#else
|
|
||||||
accelerator_loop(ss,lhs_v,{
|
|
||||||
add(&ret_v[ss],&lhs_v[ss],&rhs);
|
|
||||||
});
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
@ -191,17 +166,12 @@ void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
|||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
auto ret_v = ret.View();
|
auto ret_v = ret.View();
|
||||||
auto rhs_v = lhs.View();
|
auto rhs_v = lhs.View();
|
||||||
#ifdef STREAMING_STORES
|
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
||||||
accelerator_loop(ss,rhs_v,{
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
obj1 tmp;
|
auto rhs_t=rhs_v(ss);
|
||||||
mult(&tmp,&lhs,&rhs_v[ss]);
|
mult(&tmp,&lhs,&rhs_t);
|
||||||
vstream(ret_v[ss],tmp);
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
});
|
});
|
||||||
#else
|
|
||||||
accelerator_loop(ss,rhs_v,{
|
|
||||||
mult(&ret_v[ss],&lhs,&rhs_v[ss]);
|
|
||||||
});
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class obj1,class obj2,class obj3> inline
|
template<class obj1,class obj2,class obj3> inline
|
||||||
@ -210,17 +180,12 @@ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
|||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
auto ret_v = ret.View();
|
auto ret_v = ret.View();
|
||||||
auto rhs_v = lhs.View();
|
auto rhs_v = lhs.View();
|
||||||
#ifdef STREAMING_STORES
|
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
||||||
accelerator_loop(ss,rhs_v,{
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
obj1 tmp;
|
auto rhs_t=rhs_v(ss);
|
||||||
mac(&tmp,&lhs,&rhs_v[ss]);
|
mac(&tmp,&lhs,&rhs_t);
|
||||||
vstream(ret_v[ss],tmp);
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
});
|
});
|
||||||
#else
|
|
||||||
accelerator_loop(ss,rhs_v,{
|
|
||||||
mac(&ret_v[ss],&lhs,&rhs_v[ss]);
|
|
||||||
});
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class obj1,class obj2,class obj3> inline
|
template<class obj1,class obj2,class obj3> inline
|
||||||
@ -229,17 +194,12 @@ void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
|||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
auto ret_v = ret.View();
|
auto ret_v = ret.View();
|
||||||
auto rhs_v = lhs.View();
|
auto rhs_v = lhs.View();
|
||||||
#ifdef STREAMING_STORES
|
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
||||||
accelerator_loop(ss,rhs_v,{
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
obj1 tmp;
|
auto rhs_t=rhs_v(ss);
|
||||||
sub(&tmp,&lhs,&rhs_v[ss]);
|
sub(&tmp,&lhs,&rhs_t);
|
||||||
vstream(ret_v[ss],tmp);
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
});
|
});
|
||||||
#else
|
|
||||||
accelerator_loop(ss,rhs_v,{
|
|
||||||
sub(&ret_v[ss],&lhs,&rhs_v[ss]);
|
|
||||||
});
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
template<class obj1,class obj2,class obj3> inline
|
template<class obj1,class obj2,class obj3> inline
|
||||||
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
||||||
@ -247,17 +207,12 @@ void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
|||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
auto ret_v = ret.View();
|
auto ret_v = ret.View();
|
||||||
auto rhs_v = lhs.View();
|
auto rhs_v = lhs.View();
|
||||||
#ifdef STREAMING_STORES
|
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
||||||
accelerator_loop(ss,rhs_v,{
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
obj1 tmp;
|
auto rhs_t=rhs_v(ss);
|
||||||
add(&tmp,&lhs,&rhs_v[ss]);
|
add(&tmp,&lhs,&rhs_t);
|
||||||
vstream(ret_v[ss],tmp);
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
});
|
});
|
||||||
#else
|
|
||||||
accelerator_loop(ss,rhs_v,{
|
|
||||||
add(&ret_v[ss],&lhs,&rhs_v[ss]);
|
|
||||||
});
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class sobj,class vobj> inline
|
template<class sobj,class vobj> inline
|
||||||
@ -268,16 +223,10 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
|
|||||||
auto ret_v = ret.View();
|
auto ret_v = ret.View();
|
||||||
auto x_v = x.View();
|
auto x_v = x.View();
|
||||||
auto y_v = y.View();
|
auto y_v = y.View();
|
||||||
#ifdef STREAMING_STORES
|
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
|
||||||
accelerator_loop(ss,x_v,{
|
auto tmp = a*x_v(ss)+y_v(ss);
|
||||||
vobj tmp = a*x_v[ss]+y_v[ss];
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
vstream(ret_v[ss],tmp);
|
|
||||||
});
|
});
|
||||||
#else
|
|
||||||
accelerator_loop(ss,x_v,{
|
|
||||||
ret_v[ss]=a*x_v[ss]+y_v[ss];
|
|
||||||
});
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
template<class sobj,class vobj> inline
|
template<class sobj,class vobj> inline
|
||||||
void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
|
void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
|
||||||
@ -287,16 +236,10 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
|
|||||||
auto ret_v = ret.View();
|
auto ret_v = ret.View();
|
||||||
auto x_v = x.View();
|
auto x_v = x.View();
|
||||||
auto y_v = y.View();
|
auto y_v = y.View();
|
||||||
#ifdef STREAMING_STORES
|
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
|
||||||
accelerator_loop(ss,x_v,{
|
auto tmp = a*x_v(ss)+b*y_v(ss);
|
||||||
vobj tmp = a*x_v[ss]+b*y_v[ss];
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
vstream(ret_v[ss],tmp);
|
|
||||||
});
|
});
|
||||||
#else
|
|
||||||
accelerator_loop(ss,x_v,{
|
|
||||||
ret_v[ss]=a*x_v[ss]+b*y_v[ss];
|
|
||||||
});
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class sobj,class vobj> inline
|
template<class sobj,class vobj> inline
|
||||||
|
Loading…
Reference in New Issue
Block a user