diff --git a/Grid/lattice/Lattice_arith.h b/Grid/lattice/Lattice_arith.h
index 203c3826..3543d6aa 100644
--- a/Grid/lattice/Lattice_arith.h
+++ b/Grid/lattice/Lattice_arith.h
@@ -41,17 +41,13 @@ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
   auto rhs_v = rhs.View();
   conformable(ret,rhs);
   conformable(lhs,rhs);
-#ifdef STREAMING_STORES
-  accelerator_loop(ss,lhs_v,{
-    obj1 tmp;
-    mult(&tmp,&lhs_v[ss],&rhs_v[ss]);
-    vstream(ret_v[ss],tmp);
+  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
+    decltype(coalescedRead(obj1())) tmp;
+    auto lhs_t = lhs_v(ss);
+    auto rhs_t = rhs_v(ss);
+    mult(&tmp,&lhs_t,&rhs_t);
+    coalescedWrite(ret_v[ss],tmp);
   });
-#else
-  accelerator_loop(ss,lhs_v,{
-    mult(&ret_v[ss],&lhs_v[ss],&rhs_v[ss]);
-  });
-#endif
 }
 
 template<class obj1,class obj2,class obj3> inline
@@ -62,17 +58,13 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
   auto ret_v = ret.View();
   auto lhs_v = lhs.View();
   auto rhs_v = rhs.View();
-#ifdef STREAMING_STORES
-  accelerator_loop(ss,lhs_v,{
-    obj1 tmp;
-    mac(&tmp,&lhs_v[ss],&rhs_v[ss]);
-    vstream(ret_v[ss],tmp);
+  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
+    decltype(coalescedRead(obj1())) tmp;
+    auto lhs_t=lhs_v(ss);
+    auto rhs_t=rhs_v(ss);
+    mac(&tmp,&lhs_t,&rhs_t);
+    coalescedWrite(ret_v[ss],tmp);
   });
-#else
-  accelerator_loop(ss,lhs_v,{
-    mac(&ret_v[ss],&lhs_v[ss],&rhs_v[ss]);
-  });
-#endif
 }
 
 template<class obj1,class obj2,class obj3> inline
@@ -83,17 +75,13 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
   auto ret_v = ret.View();
   auto lhs_v = lhs.View();
   auto rhs_v = rhs.View();
-#ifdef STREAMING_STORES
-  accelerator_loop(ss,lhs_v,{
-    obj1 tmp;
-    sub(&tmp,&lhs_v[ss],&rhs_v[ss]);
-    vstream(ret_v[ss],tmp);
+  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
+    decltype(coalescedRead(obj1())) tmp;
+    auto lhs_t=lhs_v(ss);
+    auto rhs_t=rhs_v(ss);
+    sub(&tmp,&lhs_t,&rhs_t);
+    coalescedWrite(ret_v[ss],tmp);
   });
-#else
-  accelerator_loop(ss,lhs_v,{
-    sub(&ret[ss],&lhs_v[ss],&rhs_v[ss]);
-  });
-#endif
 }
 template<class obj1,class obj2,class obj3> inline
 void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
@@ -103,17 +91,13 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
   auto ret_v = ret.View();
   auto lhs_v = lhs.View();
   auto rhs_v = rhs.View();
-#ifdef STREAMING_STORES
-  accelerator_loop(ss,lhs_v,{
-    obj1 tmp;
-    add(&tmp,&lhs_v[ss],&rhs_v[ss]);
-    vstream(ret_v[ss],tmp);
+  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
+    decltype(coalescedRead(obj1())) tmp;
+    auto lhs_t=lhs_v(ss);
+    auto rhs_t=rhs_v(ss);
+    add(&tmp,&lhs_t,&rhs_t);
+    coalescedWrite(ret_v[ss],tmp);
   });
-#else
-  accelerator_loop(ss,lhs_v,{
-    add(&ret_v[ss],&lhs_v[ss],&rhs_v[ss]);
-  });
-#endif
 }
 
 //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -125,10 +109,10 @@ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
   conformable(lhs,ret);
   auto ret_v = ret.View();
   auto lhs_v = lhs.View();
-  accelerator_loop(ss,lhs_v,{
-    obj1 tmp;
-    mult(&tmp,&lhs_v[ss],&rhs);
-    vstream(ret_v[ss],tmp);
+  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
+    decltype(coalescedRead(obj1())) tmp;
+    mult(&tmp,&lhs_v(ss),&rhs);
+    coalescedWrite(ret_v[ss],tmp);
   });
 }
 
@@ -138,10 +122,11 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
   conformable(ret,lhs);
   auto ret_v = ret.View();
   auto lhs_v = lhs.View();
-  accelerator_loop(ss,lhs_v,{
-    obj1 tmp;
-    mac(&tmp,&lhs_v[ss],&rhs);
-    vstream(ret_v[ss],tmp);
+  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
+    decltype(coalescedRead(obj1())) tmp;
+    auto lhs_t=lhs_v(ss);
+    mac(&tmp,&lhs_t,&rhs);
+    coalescedWrite(ret_v[ss],tmp);
   });
 }
 
@@ -151,17 +136,12 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
   conformable(ret,lhs);
   auto ret_v = ret.View();
   auto lhs_v = lhs.View();
-#ifdef STREAMING_STORES
-  accelerator_loop(ss,lhs_v,{
-    obj1 tmp;
-    sub(&tmp,&lhs_v[ss],&rhs);
-    vstream(ret_v[ss],tmp);
+  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
+    decltype(coalescedRead(obj1())) tmp;
+    auto lhs_t=lhs_v(ss);
+    sub(&tmp,&lhs_t,&rhs);
+    coalescedWrite(ret_v[ss],tmp);
   });
-#else
-  accelerator_loop(ss,lhs_v,{
-    sub(&ret_v[ss],&lhs_v[ss],&rhs);
-  });
-#endif
 }
 template<class obj1,class obj2,class obj3> inline
 void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
@@ -169,17 +149,12 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
   conformable(lhs,ret);
   auto ret_v = ret.View();
   auto lhs_v = lhs.View();
-#ifdef STREAMING_STORES
-  accelerator_loop(ss,lhs_v,{
-    obj1 tmp;
-    add(&tmp,&lhs_v[ss],&rhs);
-    vstream(ret_v[ss],tmp);
+  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
+    decltype(coalescedRead(obj1())) tmp;
+    auto lhs_t=lhs_v(ss);
+    add(&tmp,&lhs_t,&rhs);
+    coalescedWrite(ret_v[ss],tmp);
   });
-#else
-  accelerator_loop(ss,lhs_v,{
-    add(&ret_v[ss],&lhs_v[ss],&rhs);
-  });
-#endif
 }
 
 //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -191,17 +166,12 @@ void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
   conformable(ret,rhs);
   auto ret_v = ret.View();
   auto rhs_v = lhs.View();
-#ifdef STREAMING_STORES
-  accelerator_loop(ss,rhs_v,{
-    obj1 tmp;
-    mult(&tmp,&lhs,&rhs_v[ss]);
-    vstream(ret_v[ss],tmp);
+  accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
+    decltype(coalescedRead(obj1())) tmp;
+    auto rhs_t=rhs_v(ss);
+    mult(&tmp,&lhs,&rhs_t);
+    coalescedWrite(ret_v[ss],tmp);
   });
-#else
-  accelerator_loop(ss,rhs_v,{
-    mult(&ret_v[ss],&lhs,&rhs_v[ss]);
-  });
-#endif
 }
 
 template<class obj1,class obj2,class obj3> inline
@@ -210,17 +180,12 @@ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
   conformable(ret,rhs);
   auto ret_v = ret.View();
   auto rhs_v = lhs.View();
-#ifdef STREAMING_STORES
-  accelerator_loop(ss,rhs_v,{
-    obj1 tmp;
-    mac(&tmp,&lhs,&rhs_v[ss]);
-    vstream(ret_v[ss],tmp);
+  accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
+    decltype(coalescedRead(obj1())) tmp;
+    auto rhs_t=rhs_v(ss);
+    mac(&tmp,&lhs,&rhs_t);
+    coalescedWrite(ret_v[ss],tmp);
   });
-#else
-  accelerator_loop(ss,rhs_v,{
-    mac(&ret_v[ss],&lhs,&rhs_v[ss]);
-  });
-#endif
 }
 
 template<class obj1,class obj2,class obj3> inline
@@ -229,17 +194,12 @@ void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
   conformable(ret,rhs);
   auto ret_v = ret.View();
   auto rhs_v = lhs.View();
-#ifdef STREAMING_STORES
-  accelerator_loop(ss,rhs_v,{
-    obj1 tmp;
-    sub(&tmp,&lhs,&rhs_v[ss]);
-    vstream(ret_v[ss],tmp);
+  accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
+    decltype(coalescedRead(obj1())) tmp;
+    auto rhs_t=rhs_v(ss);
+    sub(&tmp,&lhs,&rhs_t);
+    coalescedWrite(ret_v[ss],tmp);
   });
-#else
-  accelerator_loop(ss,rhs_v,{
-    sub(&ret_v[ss],&lhs,&rhs_v[ss]);
-  });
-#endif
 }
 template<class obj1,class obj2,class obj3> inline
 void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
@@ -247,17 +207,12 @@ void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
   conformable(ret,rhs);
   auto ret_v = ret.View();
   auto rhs_v = lhs.View();
-#ifdef STREAMING_STORES
-  accelerator_loop(ss,rhs_v,{
-    obj1 tmp;
-    add(&tmp,&lhs,&rhs_v[ss]);
-    vstream(ret_v[ss],tmp);
+  accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
+    decltype(coalescedRead(obj1())) tmp;
+    auto rhs_t=rhs_v(ss);
+    add(&tmp,&lhs,&rhs_t);
+    coalescedWrite(ret_v[ss],tmp);
   });
-#else
-  accelerator_loop(ss,rhs_v,{
-    add(&ret_v[ss],&lhs,&rhs_v[ss]);
-  });
-#endif
 }
 
 template<class sobj,class vobj> inline
@@ -268,16 +223,10 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
   auto ret_v = ret.View();
   auto x_v = x.View();
   auto y_v = y.View();
-#ifdef STREAMING_STORES
-  accelerator_loop(ss,x_v,{
-    vobj tmp = a*x_v[ss]+y_v[ss];
-    vstream(ret_v[ss],tmp);
+  accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
+    auto tmp = a*x_v(ss)+y_v(ss);
+    coalescedWrite(ret_v[ss],tmp);
   });
-#else
-  accelerator_loop(ss,x_v,{
-    ret_v[ss]=a*x_v[ss]+y_v[ss];
-  });
-#endif
 }
 template<class sobj,class vobj> inline
 void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
@@ -287,16 +236,10 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj>
   auto ret_v = ret.View();
   auto x_v = x.View();
   auto y_v = y.View();
-#ifdef STREAMING_STORES
-  accelerator_loop(ss,x_v,{
-    vobj tmp = a*x_v[ss]+b*y_v[ss];
-    vstream(ret_v[ss],tmp);
+  accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
+    auto tmp = a*x_v(ss)+b*y_v(ss);
+    coalescedWrite(ret_v[ss],tmp);
   });
-#else
-  accelerator_loop(ss,x_v,{
-    ret_v[ss]=a*x_v[ss]+b*y_v[ss];
-  });
-#endif
 }
 
 template<class sobj,class vobj> inline
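Every hunk above makes the same transformation: the STREAMING_STORES / vstream variant of each kernel is dropped, and the body becomes a single accelerator_for loop that reads its operands into per-thread temporaries (lhs_v(ss) / coalescedRead), applies the scalar mult/mac/sub/add, and writes the result back with coalescedWrite. Below is a minimal CPU-only sketch of that read/operate/write-back shape; SimdVector, the coalescedRead/coalescedWrite helpers, and the accelerator_for stand-in are simplified, hypothetical illustrations, not Grid's actual types or macros.

// Minimal CPU-only sketch of the coalesced read / operate / coalescedWrite
// pattern used by the new accelerator_for bodies. All names below are
// simplified stand-ins for illustration, not Grid's real implementations.
#include <cstddef>
#include <vector>

struct SimdVector {
  double v[4];
  static constexpr int Nsimd() { return 4; }
};

// On a scalar host build a "coalesced" read/write degenerates to a plain copy;
// on a GPU build each SIMD lane would instead be serviced by its own thread.
inline SimdVector coalescedRead(const SimdVector &in) { return in; }
inline void coalescedWrite(SimdVector &out, const SimdVector &in) { out = in; }

// Host stand-in for the accelerator_for macro: a serial loop over sites.
template <class Lambda>
void accelerator_for(std::size_t sites, int /*nsimd*/, Lambda body) {
  for (std::size_t ss = 0; ss < sites; ss++) body(ss);
}

// Same shape as the new mult/mac/sub/add bodies: read both operands into
// temporaries, operate, then write back via coalescedWrite (which replaces
// the old vstream streaming store).
void mult(std::vector<SimdVector> &ret,
          const std::vector<SimdVector> &lhs,
          const std::vector<SimdVector> &rhs) {
  accelerator_for(lhs.size(), SimdVector::Nsimd(), [&](std::size_t ss) {
    SimdVector lhs_t = coalescedRead(lhs[ss]);     // auto lhs_t = lhs_v(ss);
    SimdVector rhs_t = coalescedRead(rhs[ss]);     // auto rhs_t = rhs_v(ss);
    SimdVector tmp;
    for (int l = 0; l < SimdVector::Nsimd(); l++)  // mult(&tmp,&lhs_t,&rhs_t);
      tmp.v[l] = lhs_t.v[l] * rhs_t.v[l];
    coalescedWrite(ret[ss], tmp);                  // coalescedWrite(ret_v[ss],tmp);
  });
}

The Nsimd() argument to accelerator_for suggests the intent: on an accelerator target the site index and the SIMD lanes can be mapped onto separate threads, which is why results flow through per-thread temporaries rather than whole-vector streaming stores.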