Re-indent lib/lattice/Lattice_arith.h to column 0 and replace the explicit
`namespace Grid { ... }` wrapper with NAMESPACE_BEGIN(Grid)/NAMESPACE_END(Grid).
Every statement on the removed side reappears unchanged on the added side.

NOTE(review): this patch was restored from a copy in which line breaks were
collapsed and all text between '<' and '>' was missing. Template parameter
lists and Lattice<...> arguments are inferred from the identifiers used in the
bodies; the loop bounds `ss<X._grid->oSites()` and all leading whitespace are
best-effort assumptions. Confirm against the upstream commit before applying.

diff --git a/lib/lattice/Lattice_arith.h b/lib/lattice/Lattice_arith.h
index c3093167..42300aa7 100644
--- a/lib/lattice/Lattice_arith.h
+++ b/lib/lattice/Lattice_arith.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************
 
     Grid physics library, www.github.com/paboyle/Grid
 
@@ -23,241 +23,240 @@ Author: Peter Boyle
     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
     See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /* END LEGAL */
+*************************************************************************************/
+/* END LEGAL */
 #ifndef GRID_LATTICE_ARITH_H
 #define GRID_LATTICE_ARITH_H
 
-namespace Grid {
+NAMESPACE_BEGIN(Grid);
 
-
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  // avoid copy back routines for mult, mac, sub, add
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  template<class obj1,class obj2,class obj3> strong_inline
-  void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
-    ret.checkerboard = lhs.checkerboard;
-    conformable(ret,rhs);
-    conformable(lhs,rhs);
-    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+// avoid copy back routines for mult, mac, sub, add
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+template<class obj1,class obj2,class obj3> strong_inline
+void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+  ret.checkerboard = lhs.checkerboard;
+  conformable(ret,rhs);
+  conformable(lhs,rhs);
+  parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
-      obj1 tmp;
-      mult(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
-      vstream(ret._odata[ss],tmp);
+    obj1 tmp;
+    mult(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
+    vstream(ret._odata[ss],tmp);
 #else
-      mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
+    mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
 #endif
-    }
   }
-
-  template<class obj1,class obj2,class obj3> strong_inline
-  void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
-    ret.checkerboard = lhs.checkerboard;
-    conformable(ret,rhs);
-    conformable(lhs,rhs);
-    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
-#ifdef STREAMING_STORES
-      obj1 tmp;
-      mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
-      vstream(ret._odata[ss],tmp);
-#else
-      mac(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
-#endif
-    }
-  }
-
-  template<class obj1,class obj2,class obj3> strong_inline
-  void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
-    ret.checkerboard = lhs.checkerboard;
-    conformable(ret,rhs);
-    conformable(lhs,rhs);
-    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
-#ifdef STREAMING_STORES
-      obj1 tmp;
-      sub(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
-      vstream(ret._odata[ss],tmp);
-#else
-      sub(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
-#endif
-    }
-  }
-  template<class obj1,class obj2,class obj3> strong_inline
-  void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
-    ret.checkerboard = lhs.checkerboard;
-    conformable(ret,rhs);
-    conformable(lhs,rhs);
-    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
-#ifdef STREAMING_STORES
-      obj1 tmp;
-      add(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
-      vstream(ret._odata[ss],tmp);
-#else
-      add(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
-#endif
-    }
-  }
-
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  // avoid copy back routines for mult, mac, sub, add
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  template<class obj1,class obj2,class obj3> strong_inline
-  void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
-    ret.checkerboard = lhs.checkerboard;
-    conformable(lhs,ret);
-    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
-      obj1 tmp;
-      mult(&tmp,&lhs._odata[ss],&rhs);
-      vstream(ret._odata[ss],tmp);
-    }
-  }
-
-  template<class obj1,class obj2,class obj3> strong_inline
-  void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
-    ret.checkerboard = lhs.checkerboard;
-    conformable(ret,lhs);
-    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
-      obj1 tmp;
-      mac(&tmp,&lhs._odata[ss],&rhs);
-      vstream(ret._odata[ss],tmp);
-    }
-  }
-
-  template<class obj1,class obj2,class obj3> strong_inline
-  void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
-    ret.checkerboard = lhs.checkerboard;
-    conformable(ret,lhs);
-    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
-#ifdef STREAMING_STORES
-      obj1 tmp;
-      sub(&tmp,&lhs._odata[ss],&rhs);
-      vstream(ret._odata[ss],tmp);
-#else
-      sub(&ret._odata[ss],&lhs._odata[ss],&rhs);
-#endif
-    }
-  }
-  template<class obj1,class obj2,class obj3> strong_inline
-  void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
-    ret.checkerboard = lhs.checkerboard;
-    conformable(lhs,ret);
-    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
-#ifdef STREAMING_STORES
-      obj1 tmp;
-      add(&tmp,&lhs._odata[ss],&rhs);
-      vstream(ret._odata[ss],tmp);
-#else
-      add(&ret._odata[ss],&lhs._odata[ss],&rhs);
-#endif
-    }
-  }
-
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  // avoid copy back routines for mult, mac, sub, add
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  template<class obj1,class obj2,class obj3> strong_inline
-  void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
-    ret.checkerboard = rhs.checkerboard;
-    conformable(ret,rhs);
-    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
-#ifdef STREAMING_STORES
-      obj1 tmp;
-      mult(&tmp,&lhs,&rhs._odata[ss]);
-      vstream(ret._odata[ss],tmp);
-#else
-      mult(&ret._odata[ss],&lhs,&rhs._odata[ss]);
-#endif
-    }
-  }
-
-  template<class obj1,class obj2,class obj3> strong_inline
-  void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
-    ret.checkerboard = rhs.checkerboard;
-    conformable(ret,rhs);
-    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
-#ifdef STREAMING_STORES
-      obj1 tmp;
-      mac(&tmp,&lhs,&rhs._odata[ss]);
-      vstream(ret._odata[ss],tmp);
-#else
-      mac(&ret._odata[ss],&lhs,&rhs._odata[ss]);
-#endif
-    }
-  }
-
-  template<class obj1,class obj2,class obj3> strong_inline
-  void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
-    ret.checkerboard = rhs.checkerboard;
-    conformable(ret,rhs);
-    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
-#ifdef STREAMING_STORES
-      obj1 tmp;
-      sub(&tmp,&lhs,&rhs._odata[ss]);
-      vstream(ret._odata[ss],tmp);
-#else
-      sub(&ret._odata[ss],&lhs,&rhs._odata[ss]);
-#endif
-    }
-  }
-  template<class obj1,class obj2,class obj3> strong_inline
-  void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
-    ret.checkerboard = rhs.checkerboard;
-    conformable(ret,rhs);
-    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
-#ifdef STREAMING_STORES
-      obj1 tmp;
-      add(&tmp,&lhs,&rhs._odata[ss]);
-      vstream(ret._odata[ss],tmp);
-#else
-      add(&ret._odata[ss],&lhs,&rhs._odata[ss]);
-#endif
-    }
-  }
-
-  template<class sobj,class vobj> strong_inline
-  void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
-    ret.checkerboard = x.checkerboard;
-    conformable(ret,x);
-    conformable(x,y);
-    parallel_for(int ss=0;ss<x._grid->oSites();ss++){
-#ifdef STREAMING_STORES
-      vobj tmp = a*x._odata[ss]+y._odata[ss];
-      vstream(ret._odata[ss],tmp);
-#else
-      ret._odata[ss]=a*x._odata[ss]+y._odata[ss];
-#endif
-    }
-  }
-  template<class sobj,class vobj> strong_inline
-  void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
-    ret.checkerboard = x.checkerboard;
-    conformable(ret,x);
-    conformable(x,y);
-    parallel_for(int ss=0;ss<x._grid->oSites();ss++){
-#ifdef STREAMING_STORES
-      vobj tmp = a*x._odata[ss]+b*y._odata[ss];
-      vstream(ret._odata[ss],tmp);
-#else
-      ret._odata[ss]=a*x._odata[ss]+b*y._odata[ss];
-#endif
-    }
-  }
-
-  template<class sobj,class vobj> strong_inline
-  RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
-    ret.checkerboard = x.checkerboard;
-    conformable(ret,x);
-    conformable(x,y);
-    axpy(ret,a,x,y);
-    return norm2(ret);
-  }
-  template<class sobj,class vobj> strong_inline
-  RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
-    ret.checkerboard = x.checkerboard;
-    conformable(ret,x);
-    conformable(x,y);
-    axpby(ret,a,b,x,y);
-    return norm2(ret); // FIXME implement parallel norm in ss loop
-  }
-
 }
+
+template<class obj1,class obj2,class obj3> strong_inline
+void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+  ret.checkerboard = lhs.checkerboard;
+  conformable(ret,rhs);
+  conformable(lhs,rhs);
+  parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
+    obj1 tmp;
+    mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
+    vstream(ret._odata[ss],tmp);
+#else
+    mac(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
+#endif
+  }
+}
+
+template<class obj1,class obj2,class obj3> strong_inline
+void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+  ret.checkerboard = lhs.checkerboard;
+  conformable(ret,rhs);
+  conformable(lhs,rhs);
+  parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
+    obj1 tmp;
+    sub(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
+    vstream(ret._odata[ss],tmp);
+#else
+    sub(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
+#endif
+  }
+}
+template<class obj1,class obj2,class obj3> strong_inline
+void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+  ret.checkerboard = lhs.checkerboard;
+  conformable(ret,rhs);
+  conformable(lhs,rhs);
+  parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
+    obj1 tmp;
+    add(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
+    vstream(ret._odata[ss],tmp);
+#else
+    add(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
+#endif
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+// avoid copy back routines for mult, mac, sub, add
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+template<class obj1,class obj2,class obj3> strong_inline
+void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+  ret.checkerboard = lhs.checkerboard;
+  conformable(lhs,ret);
+  parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+    obj1 tmp;
+    mult(&tmp,&lhs._odata[ss],&rhs);
+    vstream(ret._odata[ss],tmp);
+  }
+}
+
+template<class obj1,class obj2,class obj3> strong_inline
+void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+  ret.checkerboard = lhs.checkerboard;
+  conformable(ret,lhs);
+  parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+    obj1 tmp;
+    mac(&tmp,&lhs._odata[ss],&rhs);
+    vstream(ret._odata[ss],tmp);
+  }
+}
+
+template<class obj1,class obj2,class obj3> strong_inline
+void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+  ret.checkerboard = lhs.checkerboard;
+  conformable(ret,lhs);
+  parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
+    obj1 tmp;
+    sub(&tmp,&lhs._odata[ss],&rhs);
+    vstream(ret._odata[ss],tmp);
+#else
+    sub(&ret._odata[ss],&lhs._odata[ss],&rhs);
+#endif
+  }
+}
+template<class obj1,class obj2,class obj3> strong_inline
+void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+  ret.checkerboard = lhs.checkerboard;
+  conformable(lhs,ret);
+  parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
+    obj1 tmp;
+    add(&tmp,&lhs._odata[ss],&rhs);
+    vstream(ret._odata[ss],tmp);
+#else
+    add(&ret._odata[ss],&lhs._odata[ss],&rhs);
+#endif
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+// avoid copy back routines for mult, mac, sub, add
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+template<class obj1,class obj2,class obj3> strong_inline
+void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+  ret.checkerboard = rhs.checkerboard;
+  conformable(ret,rhs);
+  parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
+    obj1 tmp;
+    mult(&tmp,&lhs,&rhs._odata[ss]);
+    vstream(ret._odata[ss],tmp);
+#else
+    mult(&ret._odata[ss],&lhs,&rhs._odata[ss]);
+#endif
+  }
+}
+
+template<class obj1,class obj2,class obj3> strong_inline
+void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+  ret.checkerboard = rhs.checkerboard;
+  conformable(ret,rhs);
+  parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
+    obj1 tmp;
+    mac(&tmp,&lhs,&rhs._odata[ss]);
+    vstream(ret._odata[ss],tmp);
+#else
+    mac(&ret._odata[ss],&lhs,&rhs._odata[ss]);
+#endif
+  }
+}
+
+template<class obj1,class obj2,class obj3> strong_inline
+void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+  ret.checkerboard = rhs.checkerboard;
+  conformable(ret,rhs);
+  parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
+    obj1 tmp;
+    sub(&tmp,&lhs,&rhs._odata[ss]);
+    vstream(ret._odata[ss],tmp);
+#else
+    sub(&ret._odata[ss],&lhs,&rhs._odata[ss]);
+#endif
+  }
+}
+template<class obj1,class obj2,class obj3> strong_inline
+void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+  ret.checkerboard = rhs.checkerboard;
+  conformable(ret,rhs);
+  parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
+    obj1 tmp;
+    add(&tmp,&lhs,&rhs._odata[ss]);
+    vstream(ret._odata[ss],tmp);
+#else
+    add(&ret._odata[ss],&lhs,&rhs._odata[ss]);
+#endif
+  }
+}
+
+template<class sobj,class vobj> strong_inline
+void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
+  ret.checkerboard = x.checkerboard;
+  conformable(ret,x);
+  conformable(x,y);
+  parallel_for(int ss=0;ss<x._grid->oSites();ss++){
+#ifdef STREAMING_STORES
+    vobj tmp = a*x._odata[ss]+y._odata[ss];
+    vstream(ret._odata[ss],tmp);
+#else
+    ret._odata[ss]=a*x._odata[ss]+y._odata[ss];
+#endif
+  }
+}
+template<class sobj,class vobj> strong_inline
+void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
+  ret.checkerboard = x.checkerboard;
+  conformable(ret,x);
+  conformable(x,y);
+  parallel_for(int ss=0;ss<x._grid->oSites();ss++){
+#ifdef STREAMING_STORES
+    vobj tmp = a*x._odata[ss]+b*y._odata[ss];
+    vstream(ret._odata[ss],tmp);
+#else
+    ret._odata[ss]=a*x._odata[ss]+b*y._odata[ss];
+#endif
+  }
+}
+
+template<class sobj,class vobj> strong_inline
+RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
+  ret.checkerboard = x.checkerboard;
+  conformable(ret,x);
+  conformable(x,y);
+  axpy(ret,a,x,y);
+  return norm2(ret);
+}
+template<class sobj,class vobj> strong_inline
+RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
+  ret.checkerboard = x.checkerboard;
+  conformable(ret,x);
+  conformable(x,y);
+  axpby(ret,a,b,x,y);
+  return norm2(ret); // FIXME implement parallel norm in ss loop
+}
+
+NAMESPACE_END(Grid);
 #endif