From 549a143e782e8de67b3187e214ae3ccbd959317b Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 24 Jan 2018 13:34:46 +0000 Subject: [PATCH] Accelerator related --- lib/lattice/Lattice_arith.h | 112 ++++++++++++++++++++++-------------- 1 file changed, 68 insertions(+), 44 deletions(-) diff --git a/lib/lattice/Lattice_arith.h b/lib/lattice/Lattice_arith.h index 42300aa7..05e85bc7 100644 --- a/lib/lattice/Lattice_arith.h +++ b/lib/lattice/Lattice_arith.h @@ -33,215 +33,239 @@ NAMESPACE_BEGIN(Grid); ////////////////////////////////////////////////////////////////////////////////////////////////////// // avoid copy back routines for mult, mac, sub, add ////////////////////////////////////////////////////////////////////////////////////////////////////// -template strong_inline +template inline void mult(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.checkerboard = lhs.checkerboard; conformable(ret,rhs); conformable(lhs,rhs); - parallel_for(int ss=0;ssoSites();ss++){ #ifdef STREAMING_STORES + accelerator_loop(ss,lhs,{ obj1 tmp; mult(&tmp,&lhs._odata[ss],&rhs._odata[ss]); vstream(ret._odata[ss],tmp); + }); #else + accelerator_loop(ss,lhs,{ mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]); + }); #endif - } } -template strong_inline +template inline void mac(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.checkerboard = lhs.checkerboard; conformable(ret,rhs); conformable(lhs,rhs); - parallel_for(int ss=0;ssoSites();ss++){ #ifdef STREAMING_STORES + accelerator_loop(ss,lhs,{ obj1 tmp; mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]); vstream(ret._odata[ss],tmp); + }); #else + accelerator_loop(ss,lhs,{ mac(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]); + }); #endif - } } -template strong_inline +template inline void sub(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.checkerboard = lhs.checkerboard; conformable(ret,rhs); conformable(lhs,rhs); - parallel_for(int ss=0;ssoSites();ss++){ #ifdef STREAMING_STORES + accelerator_loop(ss,lhs,{ obj1 tmp; sub(&tmp,&lhs._odata[ss],&rhs._odata[ss]); vstream(ret._odata[ss],tmp); + }); #else + accelerator_loop(ss,lhs,{ sub(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]); + }); #endif - } } -template strong_inline +template inline void add(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.checkerboard = lhs.checkerboard; conformable(ret,rhs); conformable(lhs,rhs); - parallel_for(int ss=0;ssoSites();ss++){ #ifdef STREAMING_STORES + accelerator_loop(ss,lhs,{ obj1 tmp; add(&tmp,&lhs._odata[ss],&rhs._odata[ss]); vstream(ret._odata[ss],tmp); + }); #else + accelerator_loop(ss,lhs,{ add(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]); + }); #endif - } } ////////////////////////////////////////////////////////////////////////////////////////////////////// // avoid copy back routines for mult, mac, sub, add ////////////////////////////////////////////////////////////////////////////////////////////////////// -template strong_inline +template inline void mult(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.checkerboard = lhs.checkerboard; conformable(lhs,ret); - parallel_for(int ss=0;ssoSites();ss++){ + accelerator_loop(ss,lhs,{ obj1 tmp; mult(&tmp,&lhs._odata[ss],&rhs); vstream(ret._odata[ss],tmp); - } + }); } -template strong_inline +template inline void mac(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.checkerboard = lhs.checkerboard; conformable(ret,lhs); - parallel_for(int ss=0;ssoSites();ss++){ + accelerator_loop(ss,lhs,{ obj1 tmp; mac(&tmp,&lhs._odata[ss],&rhs); vstream(ret._odata[ss],tmp); - } + }); } -template strong_inline +template inline void sub(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.checkerboard = lhs.checkerboard; conformable(ret,lhs); - parallel_for(int ss=0;ssoSites();ss++){ #ifdef STREAMING_STORES + accelerator_loop(ss,lhs,{ obj1 tmp; sub(&tmp,&lhs._odata[ss],&rhs); vstream(ret._odata[ss],tmp); + }); #else + accelerator_loop(ss,lhs,{ sub(&ret._odata[ss],&lhs._odata[ss],&rhs); + }); #endif - } } -template strong_inline +template inline void add(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.checkerboard = lhs.checkerboard; conformable(lhs,ret); - parallel_for(int ss=0;ssoSites();ss++){ #ifdef STREAMING_STORES + accelerator_loop(ss,lhs,{ obj1 tmp; add(&tmp,&lhs._odata[ss],&rhs); vstream(ret._odata[ss],tmp); + }); #else + accelerator_loop(ss,lhs,{ add(&ret._odata[ss],&lhs._odata[ss],&rhs); + }); #endif - } } ////////////////////////////////////////////////////////////////////////////////////////////////////// // avoid copy back routines for mult, mac, sub, add ////////////////////////////////////////////////////////////////////////////////////////////////////// -template strong_inline +template inline void mult(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.checkerboard = rhs.checkerboard; conformable(ret,rhs); - parallel_for(int ss=0;ssoSites();ss++){ #ifdef STREAMING_STORES + accelerator_loop(ss,rhs,{ obj1 tmp; mult(&tmp,&lhs,&rhs._odata[ss]); vstream(ret._odata[ss],tmp); + }); #else + accelerator_loop(ss,rhs,{ mult(&ret._odata[ss],&lhs,&rhs._odata[ss]); + }); #endif - } } -template strong_inline +template inline void mac(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.checkerboard = rhs.checkerboard; conformable(ret,rhs); - parallel_for(int ss=0;ssoSites();ss++){ #ifdef STREAMING_STORES + accelerator_loop(ss,rhs,{ obj1 tmp; mac(&tmp,&lhs,&rhs._odata[ss]); vstream(ret._odata[ss],tmp); + }); #else + accelerator_loop(ss,rhs,{ mac(&ret._odata[ss],&lhs,&rhs._odata[ss]); + }); #endif - } } -template strong_inline +template inline void sub(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.checkerboard = rhs.checkerboard; conformable(ret,rhs); - parallel_for(int ss=0;ssoSites();ss++){ #ifdef STREAMING_STORES + accelerator_loop(ss,rhs,{ obj1 tmp; sub(&tmp,&lhs,&rhs._odata[ss]); vstream(ret._odata[ss],tmp); + }); #else + accelerator_loop(ss,rhs,{ sub(&ret._odata[ss],&lhs,&rhs._odata[ss]); + }); #endif - } } -template strong_inline +template inline void add(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.checkerboard = rhs.checkerboard; conformable(ret,rhs); - parallel_for(int ss=0;ssoSites();ss++){ #ifdef STREAMING_STORES + accelerator_loop(ss,rhs,{ obj1 tmp; add(&tmp,&lhs,&rhs._odata[ss]); vstream(ret._odata[ss],tmp); + }); #else + accelerator_loop(ss,rhs,{ add(&ret._odata[ss],&lhs,&rhs._odata[ss]); + }); #endif - } } -template strong_inline +template inline void axpy(Lattice &ret,sobj a,const Lattice &x,const Lattice &y){ ret.checkerboard = x.checkerboard; conformable(ret,x); conformable(x,y); - parallel_for(int ss=0;ssoSites();ss++){ #ifdef STREAMING_STORES + accelerator_loop(ss,x,{ vobj tmp = a*x._odata[ss]+y._odata[ss]; vstream(ret._odata[ss],tmp); + }); #else + accelerator_loop(ss,x,{ ret._odata[ss]=a*x._odata[ss]+y._odata[ss]; + }); #endif - } } -template strong_inline +template inline void axpby(Lattice &ret,sobj a,sobj b,const Lattice &x,const Lattice &y){ ret.checkerboard = x.checkerboard; conformable(ret,x); conformable(x,y); - parallel_for(int ss=0;ssoSites();ss++){ #ifdef STREAMING_STORES + accelerator_loop(ss,x,{ vobj tmp = a*x._odata[ss]+b*y._odata[ss]; vstream(ret._odata[ss],tmp); + }); #else + accelerator_loop(ss,x,{ ret._odata[ss]=a*x._odata[ss]+b*y._odata[ss]; + }); #endif - } } -template strong_inline +template inline RealD axpy_norm(Lattice &ret,sobj a,const Lattice &x,const Lattice &y){ ret.checkerboard = x.checkerboard; conformable(ret,x); @@ -249,7 +273,7 @@ RealD axpy_norm(Lattice &ret,sobj a,const Lattice &x,const Lattice strong_inline +template inline RealD axpby_norm(Lattice &ret,sobj a,sobj b,const Lattice &x,const Lattice &y){ ret.checkerboard = x.checkerboard; conformable(ret,x);